diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,52500 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 15.0, + "global_step": 7494, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013344008540165466, + "grad_norm": 4.080643084961509, + "learning_rate": 8.88888888888889e-08, + "loss": 1.8736, + "step": 1 + }, + { + "epoch": 0.0002668801708033093, + "grad_norm": 5.08292352366484, + "learning_rate": 1.777777777777778e-07, + "loss": 1.9167, + "step": 2 + }, + { + "epoch": 0.00040032025620496394, + "grad_norm": 4.314269479207965, + "learning_rate": 2.666666666666667e-07, + "loss": 1.868, + "step": 3 + }, + { + "epoch": 0.0005337603416066186, + "grad_norm": 4.5997627346122325, + "learning_rate": 3.555555555555556e-07, + "loss": 1.8812, + "step": 4 + }, + { + "epoch": 0.0006672004270082733, + "grad_norm": 4.366746923746356, + "learning_rate": 4.444444444444445e-07, + "loss": 1.8816, + "step": 5 + }, + { + "epoch": 0.0008006405124099279, + "grad_norm": 6.417272891063272, + "learning_rate": 5.333333333333335e-07, + "loss": 1.9164, + "step": 6 + }, + { + "epoch": 0.0009340805978115826, + "grad_norm": 4.276397654879367, + "learning_rate": 6.222222222222223e-07, + "loss": 1.8699, + "step": 7 + }, + { + "epoch": 0.0010675206832132373, + "grad_norm": 40.21908327926398, + "learning_rate": 7.111111111111112e-07, + "loss": 1.9019, + "step": 8 + }, + { + "epoch": 0.0012009607686148918, + "grad_norm": 4.191772462937108, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8959, + "step": 9 + }, + { + "epoch": 0.0013344008540165466, + "grad_norm": 4.39886202719947, + "learning_rate": 8.88888888888889e-07, + "loss": 1.8733, + "step": 10 + }, + { + "epoch": 0.0014678409394182012, + "grad_norm": 3.872194087105484, + "learning_rate": 9.77777777777778e-07, + "loss": 1.8535, + "step": 11 + }, + { + "epoch": 0.0016012810248198558, + "grad_norm": 3.738315413208318, + "learning_rate": 1.066666666666667e-06, + "loss": 1.8182, + "step": 12 + }, + { + "epoch": 0.0017347211102215106, + "grad_norm": 5.806590893863183, + "learning_rate": 1.1555555555555556e-06, + "loss": 1.841, + "step": 13 + }, + { + "epoch": 0.0018681611956231651, + "grad_norm": 3.9977833987634748, + "learning_rate": 1.2444444444444445e-06, + "loss": 1.8551, + "step": 14 + }, + { + "epoch": 0.0020016012810248197, + "grad_norm": 3.6521269631944073, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.8873, + "step": 15 + }, + { + "epoch": 0.0021350413664264745, + "grad_norm": 2.9986950145773097, + "learning_rate": 1.4222222222222223e-06, + "loss": 1.8, + "step": 16 + }, + { + "epoch": 0.0022684814518281293, + "grad_norm": 3.1157009896387544, + "learning_rate": 1.5111111111111112e-06, + "loss": 1.8309, + "step": 17 + }, + { + "epoch": 0.0024019215372297837, + "grad_norm": 3.7767942592540855, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.8749, + "step": 18 + }, + { + "epoch": 0.0025353616226314385, + "grad_norm": 3.480294979458449, + "learning_rate": 1.688888888888889e-06, + "loss": 1.8126, + "step": 19 + }, + { + "epoch": 0.0026688017080330933, + "grad_norm": 2.9012896594189366, + "learning_rate": 1.777777777777778e-06, + "loss": 1.7983, + "step": 20 + }, + { + "epoch": 0.0028022417934347476, + "grad_norm": 4.745562235186556, + "learning_rate": 1.8666666666666669e-06, + "loss": 1.7862, + "step": 21 + }, + { + "epoch": 
0.0029356818788364024, + "grad_norm": 4.88191672119221, + "learning_rate": 1.955555555555556e-06, + "loss": 1.7786, + "step": 22 + }, + { + "epoch": 0.003069121964238057, + "grad_norm": 41.74795779070356, + "learning_rate": 2.0444444444444447e-06, + "loss": 1.7671, + "step": 23 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 3.02827351605321, + "learning_rate": 2.133333333333334e-06, + "loss": 1.821, + "step": 24 + }, + { + "epoch": 0.0033360021350413663, + "grad_norm": 2.7722420080290804, + "learning_rate": 2.222222222222222e-06, + "loss": 1.7749, + "step": 25 + }, + { + "epoch": 0.003469442220443021, + "grad_norm": 3.669445898983361, + "learning_rate": 2.311111111111111e-06, + "loss": 1.8066, + "step": 26 + }, + { + "epoch": 0.003602882305844676, + "grad_norm": 2.928887643981388, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.7647, + "step": 27 + }, + { + "epoch": 0.0037363223912463303, + "grad_norm": 3.7543465480647957, + "learning_rate": 2.488888888888889e-06, + "loss": 1.8005, + "step": 28 + }, + { + "epoch": 0.003869762476647985, + "grad_norm": 3.251679830837901, + "learning_rate": 2.577777777777778e-06, + "loss": 1.7779, + "step": 29 + }, + { + "epoch": 0.0040032025620496394, + "grad_norm": 21.468244588308337, + "learning_rate": 2.666666666666667e-06, + "loss": 1.7694, + "step": 30 + }, + { + "epoch": 0.004136642647451294, + "grad_norm": 4.000265636508565, + "learning_rate": 2.755555555555556e-06, + "loss": 1.7384, + "step": 31 + }, + { + "epoch": 0.004270082732852949, + "grad_norm": 3.0634285680156577, + "learning_rate": 2.8444444444444446e-06, + "loss": 1.7271, + "step": 32 + }, + { + "epoch": 0.004403522818254604, + "grad_norm": 2.406246700648424, + "learning_rate": 2.9333333333333338e-06, + "loss": 1.7338, + "step": 33 + }, + { + "epoch": 0.004536962903656259, + "grad_norm": 2.542980268594405, + "learning_rate": 3.0222222222222225e-06, + "loss": 1.7457, + "step": 34 + }, + { + "epoch": 0.004670402989057913, + "grad_norm": 2.7984146843006226, + "learning_rate": 3.1111111111111116e-06, + "loss": 1.7539, + "step": 35 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 3.013085830269592, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6781, + "step": 36 + }, + { + "epoch": 0.004937283159861222, + "grad_norm": 2.153019671959159, + "learning_rate": 3.2888888888888894e-06, + "loss": 1.744, + "step": 37 + }, + { + "epoch": 0.005070723245262877, + "grad_norm": 2.419384656819244, + "learning_rate": 3.377777777777778e-06, + "loss": 1.7563, + "step": 38 + }, + { + "epoch": 0.005204163330664532, + "grad_norm": 2.431242306767295, + "learning_rate": 3.4666666666666672e-06, + "loss": 1.7474, + "step": 39 + }, + { + "epoch": 0.0053376034160661865, + "grad_norm": 2.58042762585277, + "learning_rate": 3.555555555555556e-06, + "loss": 1.7052, + "step": 40 + }, + { + "epoch": 0.005471043501467841, + "grad_norm": 2.903485309507233, + "learning_rate": 3.644444444444445e-06, + "loss": 1.7613, + "step": 41 + }, + { + "epoch": 0.005604483586869495, + "grad_norm": 3.1612531609911123, + "learning_rate": 3.7333333333333337e-06, + "loss": 1.7141, + "step": 42 + }, + { + "epoch": 0.00573792367227115, + "grad_norm": 2.060069555889511, + "learning_rate": 3.8222222222222224e-06, + "loss": 1.6964, + "step": 43 + }, + { + "epoch": 0.005871363757672805, + "grad_norm": 2.0784087729038885, + "learning_rate": 3.911111111111112e-06, + "loss": 1.7272, + "step": 44 + }, + { + "epoch": 0.00600480384307446, + "grad_norm": 2.046333038993166, + "learning_rate": 4.000000000000001e-06, + "loss": 
1.7155, + "step": 45 + }, + { + "epoch": 0.006138243928476114, + "grad_norm": 4.060174091987868, + "learning_rate": 4.088888888888889e-06, + "loss": 1.7394, + "step": 46 + }, + { + "epoch": 0.006271684013877769, + "grad_norm": 2.194133473755285, + "learning_rate": 4.177777777777778e-06, + "loss": 1.7473, + "step": 47 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 2.086936433450861, + "learning_rate": 4.266666666666668e-06, + "loss": 1.7482, + "step": 48 + }, + { + "epoch": 0.006538564184681078, + "grad_norm": 2.055176025616284, + "learning_rate": 4.3555555555555555e-06, + "loss": 1.6989, + "step": 49 + }, + { + "epoch": 0.006672004270082733, + "grad_norm": 4.2705916173620935, + "learning_rate": 4.444444444444444e-06, + "loss": 1.7113, + "step": 50 + }, + { + "epoch": 0.0068054443554843875, + "grad_norm": 3.062440642397753, + "learning_rate": 4.533333333333334e-06, + "loss": 1.6994, + "step": 51 + }, + { + "epoch": 0.006938884440886042, + "grad_norm": 3.6513027631899724, + "learning_rate": 4.622222222222222e-06, + "loss": 1.7435, + "step": 52 + }, + { + "epoch": 0.007072324526287697, + "grad_norm": 4.339915391562868, + "learning_rate": 4.711111111111111e-06, + "loss": 1.7093, + "step": 53 + }, + { + "epoch": 0.007205764611689352, + "grad_norm": 2.300952067433821, + "learning_rate": 4.800000000000001e-06, + "loss": 1.6687, + "step": 54 + }, + { + "epoch": 0.007339204697091006, + "grad_norm": 2.3827551772304814, + "learning_rate": 4.888888888888889e-06, + "loss": 1.748, + "step": 55 + }, + { + "epoch": 0.007472644782492661, + "grad_norm": 1.9554846454820922, + "learning_rate": 4.977777777777778e-06, + "loss": 1.6707, + "step": 56 + }, + { + "epoch": 0.007606084867894315, + "grad_norm": 2.208610815386933, + "learning_rate": 5.0666666666666676e-06, + "loss": 1.6908, + "step": 57 + }, + { + "epoch": 0.00773952495329597, + "grad_norm": 2.431678252426931, + "learning_rate": 5.155555555555556e-06, + "loss": 1.7041, + "step": 58 + }, + { + "epoch": 0.007872965038697624, + "grad_norm": 1.926978176003599, + "learning_rate": 5.244444444444445e-06, + "loss": 1.6928, + "step": 59 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 2.75648074491875, + "learning_rate": 5.333333333333334e-06, + "loss": 1.7009, + "step": 60 + }, + { + "epoch": 0.008139845209500934, + "grad_norm": 1.9054991774370111, + "learning_rate": 5.422222222222223e-06, + "loss": 1.6937, + "step": 61 + }, + { + "epoch": 0.008273285294902588, + "grad_norm": 2.7419834376889174, + "learning_rate": 5.511111111111112e-06, + "loss": 1.7218, + "step": 62 + }, + { + "epoch": 0.008406725380304243, + "grad_norm": 1.8897989454015407, + "learning_rate": 5.600000000000001e-06, + "loss": 1.7069, + "step": 63 + }, + { + "epoch": 0.008540165465705898, + "grad_norm": 2.159740439666748, + "learning_rate": 5.688888888888889e-06, + "loss": 1.7143, + "step": 64 + }, + { + "epoch": 0.008673605551107553, + "grad_norm": 2.1199099802628396, + "learning_rate": 5.777777777777778e-06, + "loss": 1.6872, + "step": 65 + }, + { + "epoch": 0.008807045636509208, + "grad_norm": 2.275963855081617, + "learning_rate": 5.8666666666666675e-06, + "loss": 1.8008, + "step": 66 + }, + { + "epoch": 0.008940485721910862, + "grad_norm": 2.208827668777784, + "learning_rate": 5.955555555555555e-06, + "loss": 1.686, + "step": 67 + }, + { + "epoch": 0.009073925807312517, + "grad_norm": 3.117561539877217, + "learning_rate": 6.044444444444445e-06, + "loss": 1.6589, + "step": 68 + }, + { + "epoch": 0.009207365892714172, + "grad_norm": 2.3085392935053766, + "learning_rate": 
6.133333333333334e-06, + "loss": 1.6787, + "step": 69 + }, + { + "epoch": 0.009340805978115827, + "grad_norm": 2.1582405446873554, + "learning_rate": 6.222222222222223e-06, + "loss": 1.7265, + "step": 70 + }, + { + "epoch": 0.00947424606351748, + "grad_norm": 2.0969226907995053, + "learning_rate": 6.311111111111111e-06, + "loss": 1.6737, + "step": 71 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 4.200722941416422, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.7188, + "step": 72 + }, + { + "epoch": 0.00974112623432079, + "grad_norm": 2.2485101560203105, + "learning_rate": 6.488888888888889e-06, + "loss": 1.6912, + "step": 73 + }, + { + "epoch": 0.009874566319722444, + "grad_norm": 2.763908223216233, + "learning_rate": 6.577777777777779e-06, + "loss": 1.6571, + "step": 74 + }, + { + "epoch": 0.010008006405124099, + "grad_norm": 1.9040632627929543, + "learning_rate": 6.666666666666667e-06, + "loss": 1.6861, + "step": 75 + }, + { + "epoch": 0.010141446490525754, + "grad_norm": 28.864482563980474, + "learning_rate": 6.755555555555556e-06, + "loss": 1.7206, + "step": 76 + }, + { + "epoch": 0.010274886575927409, + "grad_norm": 2.8893679461430732, + "learning_rate": 6.844444444444445e-06, + "loss": 1.651, + "step": 77 + }, + { + "epoch": 0.010408326661329063, + "grad_norm": 2.6807260850078705, + "learning_rate": 6.9333333333333344e-06, + "loss": 1.7404, + "step": 78 + }, + { + "epoch": 0.010541766746730718, + "grad_norm": 2.616916412887755, + "learning_rate": 7.022222222222222e-06, + "loss": 1.7453, + "step": 79 + }, + { + "epoch": 0.010675206832132373, + "grad_norm": 2.030396506610222, + "learning_rate": 7.111111111111112e-06, + "loss": 1.6951, + "step": 80 + }, + { + "epoch": 0.010808646917534028, + "grad_norm": 2.276338612685183, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.6665, + "step": 81 + }, + { + "epoch": 0.010942087002935683, + "grad_norm": 4.873556416423791, + "learning_rate": 7.28888888888889e-06, + "loss": 1.6902, + "step": 82 + }, + { + "epoch": 0.011075527088337337, + "grad_norm": 2.480438858137466, + "learning_rate": 7.377777777777778e-06, + "loss": 1.6992, + "step": 83 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 2.0278767294473234, + "learning_rate": 7.4666666666666675e-06, + "loss": 1.6769, + "step": 84 + }, + { + "epoch": 0.011342407259140645, + "grad_norm": 2.7060597428983937, + "learning_rate": 7.555555555555556e-06, + "loss": 1.6538, + "step": 85 + }, + { + "epoch": 0.0114758473445423, + "grad_norm": 2.7423909405968963, + "learning_rate": 7.644444444444445e-06, + "loss": 1.7279, + "step": 86 + }, + { + "epoch": 0.011609287429943955, + "grad_norm": 2.2930045021509446, + "learning_rate": 7.733333333333334e-06, + "loss": 1.6586, + "step": 87 + }, + { + "epoch": 0.01174272751534561, + "grad_norm": 1.9855479277235186, + "learning_rate": 7.822222222222224e-06, + "loss": 1.7009, + "step": 88 + }, + { + "epoch": 0.011876167600747264, + "grad_norm": 2.7213236928751554, + "learning_rate": 7.911111111111112e-06, + "loss": 1.6854, + "step": 89 + }, + { + "epoch": 0.01200960768614892, + "grad_norm": 2.7306900665495757, + "learning_rate": 8.000000000000001e-06, + "loss": 1.7225, + "step": 90 + }, + { + "epoch": 0.012143047771550574, + "grad_norm": 2.107447235822766, + "learning_rate": 8.08888888888889e-06, + "loss": 1.7019, + "step": 91 + }, + { + "epoch": 0.012276487856952229, + "grad_norm": 2.570127439213928, + "learning_rate": 8.177777777777779e-06, + "loss": 1.6494, + "step": 92 + }, + { + "epoch": 0.012409927942353884, + "grad_norm": 
1.8760712243264193, + "learning_rate": 8.266666666666667e-06, + "loss": 1.6933, + "step": 93 + }, + { + "epoch": 0.012543368027755538, + "grad_norm": 2.1290912423121253, + "learning_rate": 8.355555555555556e-06, + "loss": 1.6705, + "step": 94 + }, + { + "epoch": 0.012676808113157193, + "grad_norm": 2.488673234475764, + "learning_rate": 8.444444444444446e-06, + "loss": 1.7204, + "step": 95 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 2.1122071965215015, + "learning_rate": 8.533333333333335e-06, + "loss": 1.7021, + "step": 96 + }, + { + "epoch": 0.012943688283960501, + "grad_norm": 2.200704929120579, + "learning_rate": 8.622222222222223e-06, + "loss": 1.6436, + "step": 97 + }, + { + "epoch": 0.013077128369362156, + "grad_norm": 2.2007715935735606, + "learning_rate": 8.711111111111111e-06, + "loss": 1.6646, + "step": 98 + }, + { + "epoch": 0.01321056845476381, + "grad_norm": 2.2699598373044383, + "learning_rate": 8.8e-06, + "loss": 1.6495, + "step": 99 + }, + { + "epoch": 0.013344008540165465, + "grad_norm": 2.328713098955483, + "learning_rate": 8.888888888888888e-06, + "loss": 1.7009, + "step": 100 + }, + { + "epoch": 0.01347744862556712, + "grad_norm": 2.218972713645272, + "learning_rate": 8.977777777777778e-06, + "loss": 1.7082, + "step": 101 + }, + { + "epoch": 0.013610888710968775, + "grad_norm": 2.618934756244986, + "learning_rate": 9.066666666666667e-06, + "loss": 1.6985, + "step": 102 + }, + { + "epoch": 0.01374432879637043, + "grad_norm": 3.2314521781408074, + "learning_rate": 9.155555555555557e-06, + "loss": 1.7344, + "step": 103 + }, + { + "epoch": 0.013877768881772085, + "grad_norm": 3.697972156481625, + "learning_rate": 9.244444444444445e-06, + "loss": 1.683, + "step": 104 + }, + { + "epoch": 0.01401120896717374, + "grad_norm": 2.8531262360061698, + "learning_rate": 9.333333333333334e-06, + "loss": 1.6776, + "step": 105 + }, + { + "epoch": 0.014144649052575394, + "grad_norm": 39.794103633027675, + "learning_rate": 9.422222222222222e-06, + "loss": 1.7134, + "step": 106 + }, + { + "epoch": 0.014278089137977049, + "grad_norm": 2.9264673716827954, + "learning_rate": 9.511111111111112e-06, + "loss": 1.6754, + "step": 107 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 2.414594446024647, + "learning_rate": 9.600000000000001e-06, + "loss": 1.6917, + "step": 108 + }, + { + "epoch": 0.014544969308780357, + "grad_norm": 2.792320442984017, + "learning_rate": 9.688888888888889e-06, + "loss": 1.6453, + "step": 109 + }, + { + "epoch": 0.014678409394182012, + "grad_norm": 3.358788179969882, + "learning_rate": 9.777777777777779e-06, + "loss": 1.713, + "step": 110 + }, + { + "epoch": 0.014811849479583666, + "grad_norm": 2.044760856305306, + "learning_rate": 9.866666666666668e-06, + "loss": 1.6851, + "step": 111 + }, + { + "epoch": 0.014945289564985321, + "grad_norm": 2.4828242965545386, + "learning_rate": 9.955555555555556e-06, + "loss": 1.7159, + "step": 112 + }, + { + "epoch": 0.015078729650386976, + "grad_norm": 46.56955438722204, + "learning_rate": 1.0044444444444446e-05, + "loss": 1.7581, + "step": 113 + }, + { + "epoch": 0.01521216973578863, + "grad_norm": 4.705272454001238, + "learning_rate": 1.0133333333333335e-05, + "loss": 1.6542, + "step": 114 + }, + { + "epoch": 0.015345609821190286, + "grad_norm": 7.73608658858232, + "learning_rate": 1.0222222222222223e-05, + "loss": 1.6793, + "step": 115 + }, + { + "epoch": 0.01547904990659194, + "grad_norm": 5.229243538124936, + "learning_rate": 1.0311111111111113e-05, + "loss": 1.6866, + "step": 116 + }, + { + "epoch": 
0.015612489991993595, + "grad_norm": 4.340274190286637, + "learning_rate": 1.04e-05, + "loss": 1.7025, + "step": 117 + }, + { + "epoch": 0.015745930077395248, + "grad_norm": 2.211270582628492, + "learning_rate": 1.048888888888889e-05, + "loss": 1.6903, + "step": 118 + }, + { + "epoch": 0.015879370162796905, + "grad_norm": 2.2975450110438334, + "learning_rate": 1.0577777777777778e-05, + "loss": 1.7088, + "step": 119 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 17.328579385026632, + "learning_rate": 1.0666666666666667e-05, + "loss": 1.7141, + "step": 120 + }, + { + "epoch": 0.016146250333600214, + "grad_norm": 3.877950326292574, + "learning_rate": 1.0755555555555557e-05, + "loss": 1.6759, + "step": 121 + }, + { + "epoch": 0.016279690419001867, + "grad_norm": 4.011366710334245, + "learning_rate": 1.0844444444444446e-05, + "loss": 1.699, + "step": 122 + }, + { + "epoch": 0.016413130504403524, + "grad_norm": 3.2602156984936417, + "learning_rate": 1.0933333333333334e-05, + "loss": 1.7003, + "step": 123 + }, + { + "epoch": 0.016546570589805177, + "grad_norm": 2.630659747532619, + "learning_rate": 1.1022222222222224e-05, + "loss": 1.6344, + "step": 124 + }, + { + "epoch": 0.016680010675206833, + "grad_norm": 2.458919675792434, + "learning_rate": 1.1111111111111113e-05, + "loss": 1.6945, + "step": 125 + }, + { + "epoch": 0.016813450760608487, + "grad_norm": 3.984975390451759, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.7746, + "step": 126 + }, + { + "epoch": 0.016946890846010143, + "grad_norm": 3.8315471470735787, + "learning_rate": 1.1288888888888889e-05, + "loss": 1.6974, + "step": 127 + }, + { + "epoch": 0.017080330931411796, + "grad_norm": 2.499188895634502, + "learning_rate": 1.1377777777777779e-05, + "loss": 1.6892, + "step": 128 + }, + { + "epoch": 0.01721377101681345, + "grad_norm": 2.0452097919611005, + "learning_rate": 1.1466666666666668e-05, + "loss": 1.7397, + "step": 129 + }, + { + "epoch": 0.017347211102215106, + "grad_norm": 2.298966367562689, + "learning_rate": 1.1555555555555556e-05, + "loss": 1.6836, + "step": 130 + }, + { + "epoch": 0.01748065118761676, + "grad_norm": 2.5420025536951942, + "learning_rate": 1.1644444444444446e-05, + "loss": 1.7454, + "step": 131 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 13.604138953612503, + "learning_rate": 1.1733333333333335e-05, + "loss": 1.7288, + "step": 132 + }, + { + "epoch": 0.01774753135842007, + "grad_norm": 2.3801527003336487, + "learning_rate": 1.1822222222222225e-05, + "loss": 1.6876, + "step": 133 + }, + { + "epoch": 0.017880971443821725, + "grad_norm": 2.460244538839727, + "learning_rate": 1.191111111111111e-05, + "loss": 1.7061, + "step": 134 + }, + { + "epoch": 0.018014411529223378, + "grad_norm": 2.4874810432246095, + "learning_rate": 1.2e-05, + "loss": 1.6963, + "step": 135 + }, + { + "epoch": 0.018147851614625034, + "grad_norm": 2.409965122688259, + "learning_rate": 1.208888888888889e-05, + "loss": 1.7093, + "step": 136 + }, + { + "epoch": 0.018281291700026688, + "grad_norm": 2.5548220927651393, + "learning_rate": 1.217777777777778e-05, + "loss": 1.6531, + "step": 137 + }, + { + "epoch": 0.018414731785428344, + "grad_norm": 2.7637181173708676, + "learning_rate": 1.2266666666666667e-05, + "loss": 1.6484, + "step": 138 + }, + { + "epoch": 0.018548171870829997, + "grad_norm": 2.108441938061504, + "learning_rate": 1.2355555555555557e-05, + "loss": 1.6831, + "step": 139 + }, + { + "epoch": 0.018681611956231654, + "grad_norm": 1.843873869065762, + "learning_rate": 1.2444444444444446e-05, + 
"loss": 1.7209, + "step": 140 + }, + { + "epoch": 0.018815052041633307, + "grad_norm": 1.852059082824261, + "learning_rate": 1.2533333333333336e-05, + "loss": 1.6356, + "step": 141 + }, + { + "epoch": 0.01894849212703496, + "grad_norm": 1.8903070568688063, + "learning_rate": 1.2622222222222222e-05, + "loss": 1.666, + "step": 142 + }, + { + "epoch": 0.019081932212436616, + "grad_norm": 2.7546690000389145, + "learning_rate": 1.2711111111111112e-05, + "loss": 1.6859, + "step": 143 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 1.9120514050425437, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.616, + "step": 144 + }, + { + "epoch": 0.019348812383239926, + "grad_norm": 1.9059883225343819, + "learning_rate": 1.288888888888889e-05, + "loss": 1.7452, + "step": 145 + }, + { + "epoch": 0.01948225246864158, + "grad_norm": 1.8505183157858194, + "learning_rate": 1.2977777777777779e-05, + "loss": 1.6603, + "step": 146 + }, + { + "epoch": 0.019615692554043235, + "grad_norm": 3.2853784344984027, + "learning_rate": 1.3066666666666668e-05, + "loss": 1.6544, + "step": 147 + }, + { + "epoch": 0.01974913263944489, + "grad_norm": 2.09698441082926, + "learning_rate": 1.3155555555555558e-05, + "loss": 1.725, + "step": 148 + }, + { + "epoch": 0.019882572724846545, + "grad_norm": 2.244317832313048, + "learning_rate": 1.3244444444444447e-05, + "loss": 1.6843, + "step": 149 + }, + { + "epoch": 0.020016012810248198, + "grad_norm": 1.906767252092642, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.6561, + "step": 150 + }, + { + "epoch": 0.020149452895649855, + "grad_norm": 13.009656729569588, + "learning_rate": 1.3422222222222223e-05, + "loss": 1.717, + "step": 151 + }, + { + "epoch": 0.020282892981051508, + "grad_norm": 1.939458338988826, + "learning_rate": 1.3511111111111112e-05, + "loss": 1.6941, + "step": 152 + }, + { + "epoch": 0.020416333066453164, + "grad_norm": 1.9081248684144936, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.7335, + "step": 153 + }, + { + "epoch": 0.020549773151854817, + "grad_norm": 3.46048722855863, + "learning_rate": 1.368888888888889e-05, + "loss": 1.6904, + "step": 154 + }, + { + "epoch": 0.02068321323725647, + "grad_norm": 3.2150826776743093, + "learning_rate": 1.377777777777778e-05, + "loss": 1.7326, + "step": 155 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 2.682239347552553, + "learning_rate": 1.3866666666666669e-05, + "loss": 1.6991, + "step": 156 + }, + { + "epoch": 0.02095009340805978, + "grad_norm": 2.9892152568563337, + "learning_rate": 1.3955555555555558e-05, + "loss": 1.6674, + "step": 157 + }, + { + "epoch": 0.021083533493461436, + "grad_norm": 1.9370449545076165, + "learning_rate": 1.4044444444444445e-05, + "loss": 1.7093, + "step": 158 + }, + { + "epoch": 0.02121697357886309, + "grad_norm": 2.8348396699033085, + "learning_rate": 1.4133333333333334e-05, + "loss": 1.6561, + "step": 159 + }, + { + "epoch": 0.021350413664264746, + "grad_norm": 5.362105984011789, + "learning_rate": 1.4222222222222224e-05, + "loss": 1.7225, + "step": 160 + }, + { + "epoch": 0.0214838537496664, + "grad_norm": 2.2489346309856972, + "learning_rate": 1.4311111111111111e-05, + "loss": 1.7205, + "step": 161 + }, + { + "epoch": 0.021617293835068056, + "grad_norm": 2.5792958960212813, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.7106, + "step": 162 + }, + { + "epoch": 0.02175073392046971, + "grad_norm": 2.799373167012452, + "learning_rate": 1.448888888888889e-05, + "loss": 1.7011, + "step": 163 + }, + { + "epoch": 0.021884174005871365, + "grad_norm": 
2.7270269575589587, + "learning_rate": 1.457777777777778e-05, + "loss": 1.7108, + "step": 164 + }, + { + "epoch": 0.022017614091273018, + "grad_norm": 2.14261161418117, + "learning_rate": 1.4666666666666666e-05, + "loss": 1.6779, + "step": 165 + }, + { + "epoch": 0.022151054176674675, + "grad_norm": 1.9467025797614437, + "learning_rate": 1.4755555555555556e-05, + "loss": 1.7194, + "step": 166 + }, + { + "epoch": 0.022284494262076328, + "grad_norm": 2.0951052041172034, + "learning_rate": 1.4844444444444445e-05, + "loss": 1.7088, + "step": 167 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 1.792897219038739, + "learning_rate": 1.4933333333333335e-05, + "loss": 1.6647, + "step": 168 + }, + { + "epoch": 0.022551374432879637, + "grad_norm": 2.057396723030578, + "learning_rate": 1.5022222222222223e-05, + "loss": 1.6376, + "step": 169 + }, + { + "epoch": 0.02268481451828129, + "grad_norm": 1.7777783661568858, + "learning_rate": 1.5111111111111112e-05, + "loss": 1.7434, + "step": 170 + }, + { + "epoch": 0.022818254603682947, + "grad_norm": 2.583723235252364, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.6701, + "step": 171 + }, + { + "epoch": 0.0229516946890846, + "grad_norm": 2.708960473066722, + "learning_rate": 1.528888888888889e-05, + "loss": 1.7109, + "step": 172 + }, + { + "epoch": 0.023085134774486257, + "grad_norm": 2.179043118704419, + "learning_rate": 1.537777777777778e-05, + "loss": 1.6601, + "step": 173 + }, + { + "epoch": 0.02321857485988791, + "grad_norm": 30.22918083463127, + "learning_rate": 1.546666666666667e-05, + "loss": 1.8007, + "step": 174 + }, + { + "epoch": 0.023352014945289566, + "grad_norm": 2.230452217009043, + "learning_rate": 1.555555555555556e-05, + "loss": 1.7362, + "step": 175 + }, + { + "epoch": 0.02348545503069122, + "grad_norm": 2.9922017986999276, + "learning_rate": 1.5644444444444448e-05, + "loss": 1.7398, + "step": 176 + }, + { + "epoch": 0.023618895116092876, + "grad_norm": 3.914186394323879, + "learning_rate": 1.5733333333333334e-05, + "loss": 1.7256, + "step": 177 + }, + { + "epoch": 0.02375233520149453, + "grad_norm": 1.9015888556101328, + "learning_rate": 1.5822222222222224e-05, + "loss": 1.6762, + "step": 178 + }, + { + "epoch": 0.023885775286896185, + "grad_norm": 1.7266594765283543, + "learning_rate": 1.5911111111111113e-05, + "loss": 1.672, + "step": 179 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 2.070151343179939, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.7933, + "step": 180 + }, + { + "epoch": 0.02415265545769949, + "grad_norm": 2.51353756811448, + "learning_rate": 1.608888888888889e-05, + "loss": 1.6875, + "step": 181 + }, + { + "epoch": 0.024286095543101148, + "grad_norm": 11.935615730367939, + "learning_rate": 1.617777777777778e-05, + "loss": 1.7512, + "step": 182 + }, + { + "epoch": 0.0244195356285028, + "grad_norm": 3.9274893534399635, + "learning_rate": 1.6266666666666668e-05, + "loss": 1.6877, + "step": 183 + }, + { + "epoch": 0.024552975713904458, + "grad_norm": 2.8888619941000377, + "learning_rate": 1.6355555555555557e-05, + "loss": 1.6988, + "step": 184 + }, + { + "epoch": 0.02468641579930611, + "grad_norm": 2.335255800684397, + "learning_rate": 1.6444444444444444e-05, + "loss": 1.7197, + "step": 185 + }, + { + "epoch": 0.024819855884707767, + "grad_norm": 2.132564305791294, + "learning_rate": 1.6533333333333333e-05, + "loss": 1.7189, + "step": 186 + }, + { + "epoch": 0.02495329597010942, + "grad_norm": 2.344308980206617, + "learning_rate": 1.6622222222222223e-05, + "loss": 1.6521, + "step": 187 
+ }, + { + "epoch": 0.025086736055511077, + "grad_norm": 1.7158444729386582, + "learning_rate": 1.6711111111111112e-05, + "loss": 1.7004, + "step": 188 + }, + { + "epoch": 0.02522017614091273, + "grad_norm": 3.0317644581837344, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.7271, + "step": 189 + }, + { + "epoch": 0.025353616226314386, + "grad_norm": 1.9353267422280926, + "learning_rate": 1.688888888888889e-05, + "loss": 1.6795, + "step": 190 + }, + { + "epoch": 0.02548705631171604, + "grad_norm": 1.7304826902324242, + "learning_rate": 1.697777777777778e-05, + "loss": 1.6588, + "step": 191 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 1.9159549442018844, + "learning_rate": 1.706666666666667e-05, + "loss": 1.7432, + "step": 192 + }, + { + "epoch": 0.02575393648251935, + "grad_norm": 1.8910667470096478, + "learning_rate": 1.7155555555555557e-05, + "loss": 1.7513, + "step": 193 + }, + { + "epoch": 0.025887376567921002, + "grad_norm": 2.569829238717817, + "learning_rate": 1.7244444444444446e-05, + "loss": 1.6466, + "step": 194 + }, + { + "epoch": 0.02602081665332266, + "grad_norm": 2.078018415198727, + "learning_rate": 1.7333333333333336e-05, + "loss": 1.7358, + "step": 195 + }, + { + "epoch": 0.02615425673872431, + "grad_norm": 1.8817118359807499, + "learning_rate": 1.7422222222222222e-05, + "loss": 1.7516, + "step": 196 + }, + { + "epoch": 0.026287696824125968, + "grad_norm": 1.77364702683218, + "learning_rate": 1.751111111111111e-05, + "loss": 1.689, + "step": 197 + }, + { + "epoch": 0.02642113690952762, + "grad_norm": 2.2161414310718466, + "learning_rate": 1.76e-05, + "loss": 1.7242, + "step": 198 + }, + { + "epoch": 0.026554576994929278, + "grad_norm": 2.0643613848156392, + "learning_rate": 1.768888888888889e-05, + "loss": 1.6685, + "step": 199 + }, + { + "epoch": 0.02668801708033093, + "grad_norm": 14.87718153231451, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.6533, + "step": 200 + }, + { + "epoch": 0.026821457165732587, + "grad_norm": 2.272950745857619, + "learning_rate": 1.7866666666666666e-05, + "loss": 1.7366, + "step": 201 + }, + { + "epoch": 0.02695489725113424, + "grad_norm": 2.8727770433260242, + "learning_rate": 1.7955555555555556e-05, + "loss": 1.7315, + "step": 202 + }, + { + "epoch": 0.027088337336535897, + "grad_norm": 2.3209623388005958, + "learning_rate": 1.8044444444444445e-05, + "loss": 1.7321, + "step": 203 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 1.897431656822105, + "learning_rate": 1.8133333333333335e-05, + "loss": 1.7697, + "step": 204 + }, + { + "epoch": 0.027355217507339203, + "grad_norm": 2.083186577685706, + "learning_rate": 1.8222222222222224e-05, + "loss": 1.7103, + "step": 205 + }, + { + "epoch": 0.02748865759274086, + "grad_norm": 2.4252062792709395, + "learning_rate": 1.8311111111111114e-05, + "loss": 1.7401, + "step": 206 + }, + { + "epoch": 0.027622097678142513, + "grad_norm": 5.5398864573778726, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.7618, + "step": 207 + }, + { + "epoch": 0.02775553776354417, + "grad_norm": 2.12561692151565, + "learning_rate": 1.848888888888889e-05, + "loss": 1.698, + "step": 208 + }, + { + "epoch": 0.027888977848945822, + "grad_norm": 2.122353896017312, + "learning_rate": 1.857777777777778e-05, + "loss": 1.7519, + "step": 209 + }, + { + "epoch": 0.02802241793434748, + "grad_norm": 1.946448922291358, + "learning_rate": 1.866666666666667e-05, + "loss": 1.6719, + "step": 210 + }, + { + "epoch": 0.028155858019749132, + "grad_norm": 2.4301306367972813, + "learning_rate": 
1.8755555555555558e-05, + "loss": 1.7039, + "step": 211 + }, + { + "epoch": 0.02828929810515079, + "grad_norm": 1.841018211290893, + "learning_rate": 1.8844444444444444e-05, + "loss": 1.7406, + "step": 212 + }, + { + "epoch": 0.02842273819055244, + "grad_norm": 1.9703359299596948, + "learning_rate": 1.8933333333333334e-05, + "loss": 1.6828, + "step": 213 + }, + { + "epoch": 0.028556178275954098, + "grad_norm": 2.0318091145704646, + "learning_rate": 1.9022222222222223e-05, + "loss": 1.6784, + "step": 214 + }, + { + "epoch": 0.02868961836135575, + "grad_norm": 1.8820707489427722, + "learning_rate": 1.9111111111111113e-05, + "loss": 1.7024, + "step": 215 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 2.4241888613922145, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.685, + "step": 216 + }, + { + "epoch": 0.02895649853215906, + "grad_norm": 2.2092388050397456, + "learning_rate": 1.928888888888889e-05, + "loss": 1.7553, + "step": 217 + }, + { + "epoch": 0.029089938617560714, + "grad_norm": 2.165125467884484, + "learning_rate": 1.9377777777777778e-05, + "loss": 1.6919, + "step": 218 + }, + { + "epoch": 0.02922337870296237, + "grad_norm": 2.4113038080825624, + "learning_rate": 1.9466666666666668e-05, + "loss": 1.6618, + "step": 219 + }, + { + "epoch": 0.029356818788364023, + "grad_norm": 2.3316936837660576, + "learning_rate": 1.9555555555555557e-05, + "loss": 1.7472, + "step": 220 + }, + { + "epoch": 0.02949025887376568, + "grad_norm": 3.2727415100596855, + "learning_rate": 1.9644444444444447e-05, + "loss": 1.6679, + "step": 221 + }, + { + "epoch": 0.029623698959167333, + "grad_norm": 2.0492799095337784, + "learning_rate": 1.9733333333333336e-05, + "loss": 1.7506, + "step": 222 + }, + { + "epoch": 0.02975713904456899, + "grad_norm": 2.576771291880995, + "learning_rate": 1.9822222222222226e-05, + "loss": 1.737, + "step": 223 + }, + { + "epoch": 0.029890579129970642, + "grad_norm": 3.2741913701257257, + "learning_rate": 1.9911111111111112e-05, + "loss": 1.7066, + "step": 224 + }, + { + "epoch": 0.0300240192153723, + "grad_norm": 2.001886242005137, + "learning_rate": 2e-05, + "loss": 1.7444, + "step": 225 + }, + { + "epoch": 0.030157459300773952, + "grad_norm": 2.6698368689644334, + "learning_rate": 1.9999999066056927e-05, + "loss": 1.6765, + "step": 226 + }, + { + "epoch": 0.03029089938617561, + "grad_norm": 3.3466160854176628, + "learning_rate": 1.9999996264227883e-05, + "loss": 1.7286, + "step": 227 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 2.393225998646352, + "learning_rate": 1.999999159451339e-05, + "loss": 1.7539, + "step": 228 + }, + { + "epoch": 0.030557779556978918, + "grad_norm": 2.5849019495304777, + "learning_rate": 1.9999985056914326e-05, + "loss": 1.7202, + "step": 229 + }, + { + "epoch": 0.03069121964238057, + "grad_norm": 2.761192723900784, + "learning_rate": 1.9999976651431904e-05, + "loss": 1.7445, + "step": 230 + }, + { + "epoch": 0.030824659727782224, + "grad_norm": 2.440029715237141, + "learning_rate": 1.9999966378067696e-05, + "loss": 1.7564, + "step": 231 + }, + { + "epoch": 0.03095809981318388, + "grad_norm": 2.0847710036036307, + "learning_rate": 1.9999954236823625e-05, + "loss": 1.7053, + "step": 232 + }, + { + "epoch": 0.031091539898585534, + "grad_norm": 2.097727759259449, + "learning_rate": 1.9999940227701952e-05, + "loss": 1.7045, + "step": 233 + }, + { + "epoch": 0.03122497998398719, + "grad_norm": 15.81909104706428, + "learning_rate": 1.99999243507053e-05, + "loss": 1.8102, + "step": 234 + }, + { + "epoch": 0.03135842006938885, + 
"grad_norm": 2.223134069922936, + "learning_rate": 1.9999906605836637e-05, + "loss": 1.738, + "step": 235 + }, + { + "epoch": 0.031491860154790496, + "grad_norm": 2.3651382675222528, + "learning_rate": 1.9999886993099273e-05, + "loss": 1.7467, + "step": 236 + }, + { + "epoch": 0.03162530024019215, + "grad_norm": 1.8263105543794043, + "learning_rate": 1.9999865512496867e-05, + "loss": 1.7548, + "step": 237 + }, + { + "epoch": 0.03175874032559381, + "grad_norm": 3.9716970412217805, + "learning_rate": 1.9999842164033435e-05, + "loss": 1.7206, + "step": 238 + }, + { + "epoch": 0.031892180410995466, + "grad_norm": 2.0120218395868634, + "learning_rate": 1.9999816947713345e-05, + "loss": 1.7401, + "step": 239 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 2.3384151798445036, + "learning_rate": 1.9999789863541295e-05, + "loss": 1.7301, + "step": 240 + }, + { + "epoch": 0.03215906058179877, + "grad_norm": 3.940580388558925, + "learning_rate": 1.9999760911522355e-05, + "loss": 1.6965, + "step": 241 + }, + { + "epoch": 0.03229250066720043, + "grad_norm": 1.8224155291794617, + "learning_rate": 1.9999730091661928e-05, + "loss": 1.6705, + "step": 242 + }, + { + "epoch": 0.03242594075260208, + "grad_norm": 1.9045524865773107, + "learning_rate": 1.999969740396577e-05, + "loss": 1.7543, + "step": 243 + }, + { + "epoch": 0.032559380838003735, + "grad_norm": 1.6984600198344175, + "learning_rate": 1.9999662848439988e-05, + "loss": 1.7126, + "step": 244 + }, + { + "epoch": 0.03269282092340539, + "grad_norm": 1.9269958388703856, + "learning_rate": 1.9999626425091035e-05, + "loss": 1.7016, + "step": 245 + }, + { + "epoch": 0.03282626100880705, + "grad_norm": 2.4719605600430095, + "learning_rate": 1.9999588133925715e-05, + "loss": 1.733, + "step": 246 + }, + { + "epoch": 0.0329597010942087, + "grad_norm": 2.129823068185752, + "learning_rate": 1.9999547974951184e-05, + "loss": 1.6882, + "step": 247 + }, + { + "epoch": 0.033093141179610354, + "grad_norm": 2.0555832511145145, + "learning_rate": 1.9999505948174943e-05, + "loss": 1.7154, + "step": 248 + }, + { + "epoch": 0.03322658126501201, + "grad_norm": 1.8053396394120758, + "learning_rate": 1.9999462053604836e-05, + "loss": 1.6796, + "step": 249 + }, + { + "epoch": 0.03336002135041367, + "grad_norm": 5.575233121908451, + "learning_rate": 1.9999416291249064e-05, + "loss": 1.7884, + "step": 250 + }, + { + "epoch": 0.03349346143581532, + "grad_norm": 2.2350909040289673, + "learning_rate": 1.9999368661116177e-05, + "loss": 1.7342, + "step": 251 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 1.6497870642338468, + "learning_rate": 1.9999319163215075e-05, + "loss": 1.71, + "step": 252 + }, + { + "epoch": 0.03376034160661863, + "grad_norm": 1.993739573845127, + "learning_rate": 1.9999267797554995e-05, + "loss": 1.7162, + "step": 253 + }, + { + "epoch": 0.033893781692020286, + "grad_norm": 1.8123408280559632, + "learning_rate": 1.999921456414554e-05, + "loss": 1.7696, + "step": 254 + }, + { + "epoch": 0.034027221777421936, + "grad_norm": 1.6594530678698969, + "learning_rate": 1.9999159462996646e-05, + "loss": 1.6971, + "step": 255 + }, + { + "epoch": 0.03416066186282359, + "grad_norm": 1.7999533008081343, + "learning_rate": 1.9999102494118613e-05, + "loss": 1.7056, + "step": 256 + }, + { + "epoch": 0.03429410194822525, + "grad_norm": 1.5795266580620457, + "learning_rate": 1.999904365752208e-05, + "loss": 1.7091, + "step": 257 + }, + { + "epoch": 0.0344275420336269, + "grad_norm": 2.9298656896602355, + "learning_rate": 1.9998982953218028e-05, + "loss": 
1.7193, + "step": 258 + }, + { + "epoch": 0.034560982119028555, + "grad_norm": 1.764460603536837, + "learning_rate": 1.9998920381217807e-05, + "loss": 1.7333, + "step": 259 + }, + { + "epoch": 0.03469442220443021, + "grad_norm": 2.3109901205414314, + "learning_rate": 1.99988559415331e-05, + "loss": 1.7506, + "step": 260 + }, + { + "epoch": 0.03482786228983187, + "grad_norm": 10.94371168313979, + "learning_rate": 1.9998789634175945e-05, + "loss": 1.7554, + "step": 261 + }, + { + "epoch": 0.03496130237523352, + "grad_norm": 1.8407021375815145, + "learning_rate": 1.999872145915873e-05, + "loss": 1.7242, + "step": 262 + }, + { + "epoch": 0.035094742460635174, + "grad_norm": 2.974143978244417, + "learning_rate": 1.999865141649418e-05, + "loss": 1.7209, + "step": 263 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 2.2722279313309475, + "learning_rate": 1.9998579506195384e-05, + "loss": 1.7143, + "step": 264 + }, + { + "epoch": 0.03536162263143849, + "grad_norm": 1.6990878341793165, + "learning_rate": 1.999850572827578e-05, + "loss": 1.7197, + "step": 265 + }, + { + "epoch": 0.03549506271684014, + "grad_norm": 1.6642794979330895, + "learning_rate": 1.999843008274914e-05, + "loss": 1.6542, + "step": 266 + }, + { + "epoch": 0.03562850280224179, + "grad_norm": 2.007102879639598, + "learning_rate": 1.9998352569629596e-05, + "loss": 1.7283, + "step": 267 + }, + { + "epoch": 0.03576194288764345, + "grad_norm": 1.603072629822543, + "learning_rate": 1.9998273188931628e-05, + "loss": 1.6958, + "step": 268 + }, + { + "epoch": 0.0358953829730451, + "grad_norm": 1.83795235651287, + "learning_rate": 1.9998191940670068e-05, + "loss": 1.7116, + "step": 269 + }, + { + "epoch": 0.036028823058446756, + "grad_norm": 1.9425709100774724, + "learning_rate": 1.999810882486008e-05, + "loss": 1.7268, + "step": 270 + }, + { + "epoch": 0.03616226314384841, + "grad_norm": 1.6308935939887939, + "learning_rate": 1.99980238415172e-05, + "loss": 1.7157, + "step": 271 + }, + { + "epoch": 0.03629570322925007, + "grad_norm": 1.668857008376373, + "learning_rate": 1.99979369906573e-05, + "loss": 1.7032, + "step": 272 + }, + { + "epoch": 0.03642914331465172, + "grad_norm": 2.1917958281972925, + "learning_rate": 1.9997848272296594e-05, + "loss": 1.666, + "step": 273 + }, + { + "epoch": 0.036562583400053375, + "grad_norm": 2.7045086446698567, + "learning_rate": 1.9997757686451668e-05, + "loss": 1.7166, + "step": 274 + }, + { + "epoch": 0.03669602348545503, + "grad_norm": 1.776827654276239, + "learning_rate": 1.9997665233139433e-05, + "loss": 1.7162, + "step": 275 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 1.8454111862717217, + "learning_rate": 1.999757091237716e-05, + "loss": 1.7304, + "step": 276 + }, + { + "epoch": 0.03696290365625834, + "grad_norm": 1.7745973768422731, + "learning_rate": 1.9997474724182465e-05, + "loss": 1.7309, + "step": 277 + }, + { + "epoch": 0.037096343741659994, + "grad_norm": 1.768404847121436, + "learning_rate": 1.9997376668573317e-05, + "loss": 1.7116, + "step": 278 + }, + { + "epoch": 0.03722978382706165, + "grad_norm": 1.6641524050216268, + "learning_rate": 1.9997276745568036e-05, + "loss": 1.718, + "step": 279 + }, + { + "epoch": 0.03736322391246331, + "grad_norm": 2.2160286554301094, + "learning_rate": 1.9997174955185276e-05, + "loss": 1.6983, + "step": 280 + }, + { + "epoch": 0.03749666399786496, + "grad_norm": 1.830594152900142, + "learning_rate": 1.9997071297444062e-05, + "loss": 1.7126, + "step": 281 + }, + { + "epoch": 0.03763010408326661, + "grad_norm": 1.9590317837706277, + 
"learning_rate": 1.9996965772363747e-05, + "loss": 1.7766, + "step": 282 + }, + { + "epoch": 0.03776354416866827, + "grad_norm": 17.291654078400484, + "learning_rate": 1.999685837996405e-05, + "loss": 1.7718, + "step": 283 + }, + { + "epoch": 0.03789698425406992, + "grad_norm": 1.9837536918624412, + "learning_rate": 1.9996749120265023e-05, + "loss": 1.7488, + "step": 284 + }, + { + "epoch": 0.038030424339471576, + "grad_norm": 2.143999288457606, + "learning_rate": 1.9996637993287072e-05, + "loss": 1.756, + "step": 285 + }, + { + "epoch": 0.03816386442487323, + "grad_norm": 2.0362899269691144, + "learning_rate": 1.9996524999050966e-05, + "loss": 1.7582, + "step": 286 + }, + { + "epoch": 0.03829730451027489, + "grad_norm": 1.90769408022856, + "learning_rate": 1.9996410137577806e-05, + "loss": 1.7833, + "step": 287 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 1.8427410428015334, + "learning_rate": 1.9996293408889046e-05, + "loss": 1.746, + "step": 288 + }, + { + "epoch": 0.038564184681078195, + "grad_norm": 1.6678979095618587, + "learning_rate": 1.9996174813006488e-05, + "loss": 1.7362, + "step": 289 + }, + { + "epoch": 0.03869762476647985, + "grad_norm": 1.8342284170270304, + "learning_rate": 1.9996054349952283e-05, + "loss": 1.7476, + "step": 290 + }, + { + "epoch": 0.03883106485188151, + "grad_norm": 1.71509522487389, + "learning_rate": 1.999593201974894e-05, + "loss": 1.7033, + "step": 291 + }, + { + "epoch": 0.03896450493728316, + "grad_norm": 1.6721230982503732, + "learning_rate": 1.9995807822419296e-05, + "loss": 1.7231, + "step": 292 + }, + { + "epoch": 0.039097945022684814, + "grad_norm": 1.6080547725886836, + "learning_rate": 1.9995681757986563e-05, + "loss": 1.7517, + "step": 293 + }, + { + "epoch": 0.03923138510808647, + "grad_norm": 3.219421965283889, + "learning_rate": 1.9995553826474282e-05, + "loss": 1.771, + "step": 294 + }, + { + "epoch": 0.03936482519348812, + "grad_norm": 1.6715306645770436, + "learning_rate": 1.9995424027906348e-05, + "loss": 1.7727, + "step": 295 + }, + { + "epoch": 0.03949826527888978, + "grad_norm": 2.0944536026816047, + "learning_rate": 1.999529236230701e-05, + "loss": 1.6944, + "step": 296 + }, + { + "epoch": 0.039631705364291434, + "grad_norm": 1.8183245740053557, + "learning_rate": 1.9995158829700857e-05, + "loss": 1.7002, + "step": 297 + }, + { + "epoch": 0.03976514544969309, + "grad_norm": 1.5947407157115723, + "learning_rate": 1.9995023430112838e-05, + "loss": 1.694, + "step": 298 + }, + { + "epoch": 0.03989858553509474, + "grad_norm": 1.732145875042339, + "learning_rate": 1.9994886163568234e-05, + "loss": 1.7074, + "step": 299 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 1.9283377201797984, + "learning_rate": 1.9994747030092694e-05, + "loss": 1.6803, + "step": 300 + }, + { + "epoch": 0.04016546570589805, + "grad_norm": 2.0214640496563603, + "learning_rate": 1.9994606029712204e-05, + "loss": 1.6983, + "step": 301 + }, + { + "epoch": 0.04029890579129971, + "grad_norm": 1.6234729327561015, + "learning_rate": 1.9994463162453098e-05, + "loss": 1.7016, + "step": 302 + }, + { + "epoch": 0.04043234587670136, + "grad_norm": 1.9270631511486271, + "learning_rate": 1.9994318428342066e-05, + "loss": 1.7469, + "step": 303 + }, + { + "epoch": 0.040565785962103015, + "grad_norm": 1.6310686042908182, + "learning_rate": 1.9994171827406143e-05, + "loss": 1.7147, + "step": 304 + }, + { + "epoch": 0.04069922604750467, + "grad_norm": 1.6867380171369333, + "learning_rate": 1.999402335967271e-05, + "loss": 1.7216, + "step": 305 + }, + { + 
"epoch": 0.04083266613290633, + "grad_norm": 2.1857500165121295, + "learning_rate": 1.99938730251695e-05, + "loss": 1.7171, + "step": 306 + }, + { + "epoch": 0.04096610621830798, + "grad_norm": 2.3135110454799332, + "learning_rate": 1.999372082392459e-05, + "loss": 1.7334, + "step": 307 + }, + { + "epoch": 0.041099546303709635, + "grad_norm": 1.589398670927812, + "learning_rate": 1.9993566755966414e-05, + "loss": 1.6869, + "step": 308 + }, + { + "epoch": 0.04123298638911129, + "grad_norm": 2.4365266348869428, + "learning_rate": 1.999341082132375e-05, + "loss": 1.7227, + "step": 309 + }, + { + "epoch": 0.04136642647451294, + "grad_norm": 1.6296732392377482, + "learning_rate": 1.999325302002572e-05, + "loss": 1.7589, + "step": 310 + }, + { + "epoch": 0.0414998665599146, + "grad_norm": 2.305895115990403, + "learning_rate": 1.999309335210181e-05, + "loss": 1.6795, + "step": 311 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 2.2267067402414136, + "learning_rate": 1.9992931817581836e-05, + "loss": 1.7257, + "step": 312 + }, + { + "epoch": 0.04176674673071791, + "grad_norm": 2.64734565806814, + "learning_rate": 1.999276841649597e-05, + "loss": 1.6765, + "step": 313 + }, + { + "epoch": 0.04190018681611956, + "grad_norm": 2.871860278561938, + "learning_rate": 1.9992603148874735e-05, + "loss": 1.7917, + "step": 314 + }, + { + "epoch": 0.042033626901521216, + "grad_norm": 1.680479043931703, + "learning_rate": 1.9992436014749002e-05, + "loss": 1.7058, + "step": 315 + }, + { + "epoch": 0.04216706698692287, + "grad_norm": 2.3517294932157986, + "learning_rate": 1.9992267014149992e-05, + "loss": 1.7037, + "step": 316 + }, + { + "epoch": 0.04230050707232453, + "grad_norm": 2.3384834771184315, + "learning_rate": 1.999209614710927e-05, + "loss": 1.7229, + "step": 317 + }, + { + "epoch": 0.04243394715772618, + "grad_norm": 3.0716993591543162, + "learning_rate": 1.9991923413658752e-05, + "loss": 1.7066, + "step": 318 + }, + { + "epoch": 0.042567387243127836, + "grad_norm": 22.39020312724416, + "learning_rate": 1.99917488138307e-05, + "loss": 1.7471, + "step": 319 + }, + { + "epoch": 0.04270082732852949, + "grad_norm": 4.513453647912413, + "learning_rate": 1.999157234765773e-05, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.04283426741393114, + "grad_norm": 2.8952574314494406, + "learning_rate": 1.9991394015172806e-05, + "loss": 1.7865, + "step": 321 + }, + { + "epoch": 0.0429677074993328, + "grad_norm": 2.3072174733434263, + "learning_rate": 1.9991213816409235e-05, + "loss": 1.6652, + "step": 322 + }, + { + "epoch": 0.043101147584734455, + "grad_norm": 3.501051033962392, + "learning_rate": 1.9991031751400678e-05, + "loss": 1.7621, + "step": 323 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 2.757581328302007, + "learning_rate": 1.9990847820181143e-05, + "loss": 1.7351, + "step": 324 + }, + { + "epoch": 0.04336802775553776, + "grad_norm": 2.208520128826526, + "learning_rate": 1.9990662022784984e-05, + "loss": 1.7927, + "step": 325 + }, + { + "epoch": 0.04350146784093942, + "grad_norm": 2.9115362978660366, + "learning_rate": 1.9990474359246907e-05, + "loss": 1.7272, + "step": 326 + }, + { + "epoch": 0.043634907926341074, + "grad_norm": 2.2116484952074638, + "learning_rate": 1.9990284829601965e-05, + "loss": 1.7165, + "step": 327 + }, + { + "epoch": 0.04376834801174273, + "grad_norm": 2.127585292605601, + "learning_rate": 1.999009343388556e-05, + "loss": 1.7696, + "step": 328 + }, + { + "epoch": 0.04390178809714438, + "grad_norm": 2.1126461502594407, + "learning_rate": 
1.9989900172133446e-05, + "loss": 1.7299, + "step": 329 + }, + { + "epoch": 0.044035228182546036, + "grad_norm": 3.589659926709359, + "learning_rate": 1.9989705044381717e-05, + "loss": 1.7291, + "step": 330 + }, + { + "epoch": 0.04416866826794769, + "grad_norm": 2.5882477625445977, + "learning_rate": 1.998950805066682e-05, + "loss": 1.74, + "step": 331 + }, + { + "epoch": 0.04430210835334935, + "grad_norm": 2.102193237422039, + "learning_rate": 1.998930919102556e-05, + "loss": 1.7445, + "step": 332 + }, + { + "epoch": 0.044435548438751, + "grad_norm": 1.5603391863630198, + "learning_rate": 1.9989108465495074e-05, + "loss": 1.7013, + "step": 333 + }, + { + "epoch": 0.044568988524152656, + "grad_norm": 2.0093620838637096, + "learning_rate": 1.9988905874112853e-05, + "loss": 1.6947, + "step": 334 + }, + { + "epoch": 0.04470242860955431, + "grad_norm": 2.0641135398132873, + "learning_rate": 1.998870141691675e-05, + "loss": 1.6937, + "step": 335 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 1.9939169534113899, + "learning_rate": 1.9988495093944942e-05, + "loss": 1.7692, + "step": 336 + }, + { + "epoch": 0.04496930878035762, + "grad_norm": 1.5839856511709431, + "learning_rate": 1.998828690523597e-05, + "loss": 1.6663, + "step": 337 + }, + { + "epoch": 0.045102748865759275, + "grad_norm": 1.7578902574401223, + "learning_rate": 1.9988076850828736e-05, + "loss": 1.7076, + "step": 338 + }, + { + "epoch": 0.04523618895116093, + "grad_norm": 1.8132930759338493, + "learning_rate": 1.9987864930762456e-05, + "loss": 1.673, + "step": 339 + }, + { + "epoch": 0.04536962903656258, + "grad_norm": 1.4712573812370557, + "learning_rate": 1.9987651145076724e-05, + "loss": 1.7153, + "step": 340 + }, + { + "epoch": 0.04550306912196424, + "grad_norm": 1.7206193008312878, + "learning_rate": 1.9987435493811477e-05, + "loss": 1.7383, + "step": 341 + }, + { + "epoch": 0.045636509207365894, + "grad_norm": 1.7893043936529118, + "learning_rate": 1.998721797700699e-05, + "loss": 1.7391, + "step": 342 + }, + { + "epoch": 0.04576994929276755, + "grad_norm": 1.6899544517809348, + "learning_rate": 1.9986998594703887e-05, + "loss": 1.7279, + "step": 343 + }, + { + "epoch": 0.0459033893781692, + "grad_norm": 1.6139144085071184, + "learning_rate": 1.9986777346943157e-05, + "loss": 1.6811, + "step": 344 + }, + { + "epoch": 0.04603682946357086, + "grad_norm": 1.7986249828416747, + "learning_rate": 1.9986554233766125e-05, + "loss": 1.7152, + "step": 345 + }, + { + "epoch": 0.04617026954897251, + "grad_norm": 1.7071224522178132, + "learning_rate": 1.9986329255214457e-05, + "loss": 1.6777, + "step": 346 + }, + { + "epoch": 0.04630370963437416, + "grad_norm": 1.726135209124258, + "learning_rate": 1.9986102411330187e-05, + "loss": 1.6931, + "step": 347 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 1.640238498392757, + "learning_rate": 1.9985873702155684e-05, + "loss": 1.7056, + "step": 348 + }, + { + "epoch": 0.046570589805177476, + "grad_norm": 1.4745370494240224, + "learning_rate": 1.9985643127733664e-05, + "loss": 1.7318, + "step": 349 + }, + { + "epoch": 0.04670402989057913, + "grad_norm": 1.665538578112985, + "learning_rate": 1.99854106881072e-05, + "loss": 1.6866, + "step": 350 + }, + { + "epoch": 0.04683746997598078, + "grad_norm": 1.7265148941792416, + "learning_rate": 1.9985176383319706e-05, + "loss": 1.7242, + "step": 351 + }, + { + "epoch": 0.04697091006138244, + "grad_norm": 1.7372398018580704, + "learning_rate": 1.998494021341495e-05, + "loss": 1.6928, + "step": 352 + }, + { + "epoch": 
0.047104350146784095, + "grad_norm": 1.9848537709359042, + "learning_rate": 1.998470217843705e-05, + "loss": 1.7095, + "step": 353 + }, + { + "epoch": 0.04723779023218575, + "grad_norm": 1.6273684847266405, + "learning_rate": 1.9984462278430454e-05, + "loss": 1.7052, + "step": 354 + }, + { + "epoch": 0.0473712303175874, + "grad_norm": 2.691629432371725, + "learning_rate": 1.9984220513439987e-05, + "loss": 1.7216, + "step": 355 + }, + { + "epoch": 0.04750467040298906, + "grad_norm": 3.0017537794303872, + "learning_rate": 1.9983976883510802e-05, + "loss": 1.6843, + "step": 356 + }, + { + "epoch": 0.047638110488390714, + "grad_norm": 1.6968617974676807, + "learning_rate": 1.9983731388688405e-05, + "loss": 1.7386, + "step": 357 + }, + { + "epoch": 0.04777155057379237, + "grad_norm": 1.7186160692409527, + "learning_rate": 1.998348402901866e-05, + "loss": 1.7086, + "step": 358 + }, + { + "epoch": 0.04790499065919402, + "grad_norm": 1.6381474039682424, + "learning_rate": 1.998323480454776e-05, + "loss": 1.7258, + "step": 359 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 1.5835971815447754, + "learning_rate": 1.998298371532226e-05, + "loss": 1.7738, + "step": 360 + }, + { + "epoch": 0.04817187082999733, + "grad_norm": 1.5878715756664898, + "learning_rate": 1.9982730761389063e-05, + "loss": 1.7094, + "step": 361 + }, + { + "epoch": 0.04830531091539898, + "grad_norm": 1.519758329640418, + "learning_rate": 1.9982475942795418e-05, + "loss": 1.7268, + "step": 362 + }, + { + "epoch": 0.04843875100080064, + "grad_norm": 1.514307211931559, + "learning_rate": 1.9982219259588925e-05, + "loss": 1.6696, + "step": 363 + }, + { + "epoch": 0.048572191086202296, + "grad_norm": 2.1724863692721352, + "learning_rate": 1.9981960711817524e-05, + "loss": 1.6908, + "step": 364 + }, + { + "epoch": 0.04870563117160395, + "grad_norm": 1.56228704416919, + "learning_rate": 1.998170029952951e-05, + "loss": 1.7315, + "step": 365 + }, + { + "epoch": 0.0488390712570056, + "grad_norm": 1.85473949983945, + "learning_rate": 1.9981438022773526e-05, + "loss": 1.7355, + "step": 366 + }, + { + "epoch": 0.04897251134240726, + "grad_norm": 11.783330160673703, + "learning_rate": 1.9981173881598563e-05, + "loss": 1.6998, + "step": 367 + }, + { + "epoch": 0.049105951427808915, + "grad_norm": 2.0242537102814415, + "learning_rate": 1.998090787605396e-05, + "loss": 1.6909, + "step": 368 + }, + { + "epoch": 0.04923939151321057, + "grad_norm": 1.6717583123155606, + "learning_rate": 1.99806400061894e-05, + "loss": 1.7593, + "step": 369 + }, + { + "epoch": 0.04937283159861222, + "grad_norm": 1.5943321940814934, + "learning_rate": 1.998037027205492e-05, + "loss": 1.7278, + "step": 370 + }, + { + "epoch": 0.04950627168401388, + "grad_norm": 1.8339002788527259, + "learning_rate": 1.998009867370091e-05, + "loss": 1.7573, + "step": 371 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 1.6495978655725259, + "learning_rate": 1.997982521117809e-05, + "loss": 1.7594, + "step": 372 + }, + { + "epoch": 0.049773151854817184, + "grad_norm": 1.9817095312631359, + "learning_rate": 1.9979549884537545e-05, + "loss": 1.699, + "step": 373 + }, + { + "epoch": 0.04990659194021884, + "grad_norm": 1.803977303801047, + "learning_rate": 1.997927269383071e-05, + "loss": 1.733, + "step": 374 + }, + { + "epoch": 0.0500400320256205, + "grad_norm": 1.6078905730093185, + "learning_rate": 1.9978993639109344e-05, + "loss": 1.741, + "step": 375 + }, + { + "epoch": 0.050173472111022153, + "grad_norm": 1.5995695562728156, + "learning_rate": 1.997871272042559e-05, + 
"loss": 1.7242, + "step": 376 + }, + { + "epoch": 0.0503069121964238, + "grad_norm": 1.6340423898197851, + "learning_rate": 1.9978429937831905e-05, + "loss": 1.7679, + "step": 377 + }, + { + "epoch": 0.05044035228182546, + "grad_norm": 1.6634270572428624, + "learning_rate": 1.997814529138112e-05, + "loss": 1.7056, + "step": 378 + }, + { + "epoch": 0.050573792367227116, + "grad_norm": 1.7876625580664276, + "learning_rate": 1.99778587811264e-05, + "loss": 1.7285, + "step": 379 + }, + { + "epoch": 0.05070723245262877, + "grad_norm": 1.5691636161666582, + "learning_rate": 1.9977570407121258e-05, + "loss": 1.7767, + "step": 380 + }, + { + "epoch": 0.05084067253803042, + "grad_norm": 2.0869743704839894, + "learning_rate": 1.9977280169419567e-05, + "loss": 1.7575, + "step": 381 + }, + { + "epoch": 0.05097411262343208, + "grad_norm": 1.5252100639836486, + "learning_rate": 1.9976988068075536e-05, + "loss": 1.8007, + "step": 382 + }, + { + "epoch": 0.051107552708833735, + "grad_norm": 1.4730478361850414, + "learning_rate": 1.9976694103143725e-05, + "loss": 1.7411, + "step": 383 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 2.0844587485090043, + "learning_rate": 1.9976398274679044e-05, + "loss": 1.7389, + "step": 384 + }, + { + "epoch": 0.05137443287963704, + "grad_norm": 1.6654951502904705, + "learning_rate": 1.9976100582736753e-05, + "loss": 1.7117, + "step": 385 + }, + { + "epoch": 0.0515078729650387, + "grad_norm": 1.4065668432108118, + "learning_rate": 1.997580102737245e-05, + "loss": 1.7272, + "step": 386 + }, + { + "epoch": 0.051641313050440354, + "grad_norm": 1.6267242440943979, + "learning_rate": 1.99754996086421e-05, + "loss": 1.7481, + "step": 387 + }, + { + "epoch": 0.051774753135842004, + "grad_norm": 2.2148633010781773, + "learning_rate": 1.9975196326601997e-05, + "loss": 1.7006, + "step": 388 + }, + { + "epoch": 0.05190819322124366, + "grad_norm": 6.191778325835291, + "learning_rate": 1.9974891181308788e-05, + "loss": 1.7091, + "step": 389 + }, + { + "epoch": 0.05204163330664532, + "grad_norm": 13.595285838337412, + "learning_rate": 1.9974584172819478e-05, + "loss": 1.7493, + "step": 390 + }, + { + "epoch": 0.052175073392046974, + "grad_norm": 2.062696345964478, + "learning_rate": 1.997427530119141e-05, + "loss": 1.7039, + "step": 391 + }, + { + "epoch": 0.05230851347744862, + "grad_norm": 2.0114191550940075, + "learning_rate": 1.9973964566482276e-05, + "loss": 1.7469, + "step": 392 + }, + { + "epoch": 0.05244195356285028, + "grad_norm": 1.7826740746863319, + "learning_rate": 1.997365196875012e-05, + "loss": 1.6988, + "step": 393 + }, + { + "epoch": 0.052575393648251936, + "grad_norm": 1.8065043559103795, + "learning_rate": 1.9973337508053328e-05, + "loss": 1.6763, + "step": 394 + }, + { + "epoch": 0.05270883373365359, + "grad_norm": 1.618657676406062, + "learning_rate": 1.9973021184450644e-05, + "loss": 1.664, + "step": 395 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 1.669337246084699, + "learning_rate": 1.9972702998001145e-05, + "loss": 1.6838, + "step": 396 + }, + { + "epoch": 0.0529757139044569, + "grad_norm": 1.6318231642923446, + "learning_rate": 1.9972382948764274e-05, + "loss": 1.7319, + "step": 397 + }, + { + "epoch": 0.053109153989858555, + "grad_norm": 1.68424778338556, + "learning_rate": 1.9972061036799805e-05, + "loss": 1.7526, + "step": 398 + }, + { + "epoch": 0.053242594075260205, + "grad_norm": 10.099486327210181, + "learning_rate": 1.9971737262167873e-05, + "loss": 1.7284, + "step": 399 + }, + { + "epoch": 0.05337603416066186, + "grad_norm": 
2.4830108610902792, + "learning_rate": 1.9971411624928952e-05, + "loss": 1.6935, + "step": 400 + }, + { + "epoch": 0.05350947424606352, + "grad_norm": 2.157249041077982, + "learning_rate": 1.997108412514387e-05, + "loss": 1.7532, + "step": 401 + }, + { + "epoch": 0.053642914331465175, + "grad_norm": 17.029332948986212, + "learning_rate": 1.9970754762873793e-05, + "loss": 1.7565, + "step": 402 + }, + { + "epoch": 0.053776354416866824, + "grad_norm": 2.2002999089958517, + "learning_rate": 1.9970423538180253e-05, + "loss": 1.7299, + "step": 403 + }, + { + "epoch": 0.05390979450226848, + "grad_norm": 1.9526294546905574, + "learning_rate": 1.997009045112511e-05, + "loss": 1.7624, + "step": 404 + }, + { + "epoch": 0.05404323458767014, + "grad_norm": 3.451322381710603, + "learning_rate": 1.9969755501770588e-05, + "loss": 1.7655, + "step": 405 + }, + { + "epoch": 0.054176674673071794, + "grad_norm": 3.4396767676243116, + "learning_rate": 1.9969418690179245e-05, + "loss": 1.7311, + "step": 406 + }, + { + "epoch": 0.05431011475847344, + "grad_norm": 2.047158976072317, + "learning_rate": 1.9969080016413998e-05, + "loss": 1.823, + "step": 407 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 1.7003670198468013, + "learning_rate": 1.9968739480538106e-05, + "loss": 1.6803, + "step": 408 + }, + { + "epoch": 0.054576994929276756, + "grad_norm": 1.5275767558995044, + "learning_rate": 1.9968397082615177e-05, + "loss": 1.6911, + "step": 409 + }, + { + "epoch": 0.054710435014678406, + "grad_norm": 1.5021865116802673, + "learning_rate": 1.9968052822709168e-05, + "loss": 1.6727, + "step": 410 + }, + { + "epoch": 0.05484387510008006, + "grad_norm": 1.6346663207881, + "learning_rate": 1.9967706700884383e-05, + "loss": 1.7605, + "step": 411 + }, + { + "epoch": 0.05497731518548172, + "grad_norm": 1.4986173436934926, + "learning_rate": 1.9967358717205473e-05, + "loss": 1.7654, + "step": 412 + }, + { + "epoch": 0.055110755270883376, + "grad_norm": 9.343403487653784, + "learning_rate": 1.9967008871737435e-05, + "loss": 1.7153, + "step": 413 + }, + { + "epoch": 0.055244195356285025, + "grad_norm": 2.0824830761991167, + "learning_rate": 1.996665716454562e-05, + "loss": 1.738, + "step": 414 + }, + { + "epoch": 0.05537763544168668, + "grad_norm": 2.05226788462729, + "learning_rate": 1.996630359569572e-05, + "loss": 1.6989, + "step": 415 + }, + { + "epoch": 0.05551107552708834, + "grad_norm": 2.801402642121381, + "learning_rate": 1.9965948165253783e-05, + "loss": 1.7352, + "step": 416 + }, + { + "epoch": 0.055644515612489995, + "grad_norm": 1.5863635183628055, + "learning_rate": 1.996559087328619e-05, + "loss": 1.7482, + "step": 417 + }, + { + "epoch": 0.055777955697891644, + "grad_norm": 1.5347479681973688, + "learning_rate": 1.9965231719859686e-05, + "loss": 1.7804, + "step": 418 + }, + { + "epoch": 0.0559113957832933, + "grad_norm": 5.941069172684277, + "learning_rate": 1.9964870705041356e-05, + "loss": 1.6768, + "step": 419 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 1.9058886708572287, + "learning_rate": 1.996450782889863e-05, + "loss": 1.8138, + "step": 420 + }, + { + "epoch": 0.056178275954096614, + "grad_norm": 1.83905231786784, + "learning_rate": 1.9964143091499296e-05, + "loss": 1.696, + "step": 421 + }, + { + "epoch": 0.056311716039498264, + "grad_norm": 21.111853586038833, + "learning_rate": 1.996377649291148e-05, + "loss": 1.8305, + "step": 422 + }, + { + "epoch": 0.05644515612489992, + "grad_norm": 2.0473455232249957, + "learning_rate": 1.9963408033203652e-05, + "loss": 1.6751, + "step": 423 
+ }, + { + "epoch": 0.05657859621030158, + "grad_norm": 2.3640946700664305, + "learning_rate": 1.9963037712444643e-05, + "loss": 1.6767, + "step": 424 + }, + { + "epoch": 0.056712036295703226, + "grad_norm": 2.639814143002119, + "learning_rate": 1.9962665530703623e-05, + "loss": 1.7202, + "step": 425 + }, + { + "epoch": 0.05684547638110488, + "grad_norm": 2.035709803937704, + "learning_rate": 1.996229148805011e-05, + "loss": 1.7528, + "step": 426 + }, + { + "epoch": 0.05697891646650654, + "grad_norm": 4.8723906405798605, + "learning_rate": 1.996191558455397e-05, + "loss": 1.7948, + "step": 427 + }, + { + "epoch": 0.057112356551908196, + "grad_norm": 1.9755270680940058, + "learning_rate": 1.9961537820285425e-05, + "loss": 1.739, + "step": 428 + }, + { + "epoch": 0.057245796637309845, + "grad_norm": 1.36715317359475, + "learning_rate": 1.996115819531503e-05, + "loss": 1.7144, + "step": 429 + }, + { + "epoch": 0.0573792367227115, + "grad_norm": 2.099518986464087, + "learning_rate": 1.9960776709713695e-05, + "loss": 1.7191, + "step": 430 + }, + { + "epoch": 0.05751267680811316, + "grad_norm": 1.5524531892573008, + "learning_rate": 1.9960393363552677e-05, + "loss": 1.6912, + "step": 431 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 1.979055446110802, + "learning_rate": 1.9960008156903584e-05, + "loss": 1.6863, + "step": 432 + }, + { + "epoch": 0.057779556978916465, + "grad_norm": 1.700547443693333, + "learning_rate": 1.9959621089838363e-05, + "loss": 1.6982, + "step": 433 + }, + { + "epoch": 0.05791299706431812, + "grad_norm": 1.5225244263250537, + "learning_rate": 1.995923216242932e-05, + "loss": 1.7289, + "step": 434 + }, + { + "epoch": 0.05804643714971978, + "grad_norm": 1.8075354741575524, + "learning_rate": 1.9958841374749102e-05, + "loss": 1.7704, + "step": 435 + }, + { + "epoch": 0.05817987723512143, + "grad_norm": 1.5109107308638536, + "learning_rate": 1.9958448726870695e-05, + "loss": 1.7511, + "step": 436 + }, + { + "epoch": 0.058313317320523084, + "grad_norm": 1.722880833771426, + "learning_rate": 1.995805421886745e-05, + "loss": 1.7224, + "step": 437 + }, + { + "epoch": 0.05844675740592474, + "grad_norm": 1.507934531634378, + "learning_rate": 1.995765785081305e-05, + "loss": 1.7202, + "step": 438 + }, + { + "epoch": 0.0585801974913264, + "grad_norm": 1.4464233282667935, + "learning_rate": 1.995725962278154e-05, + "loss": 1.6983, + "step": 439 + }, + { + "epoch": 0.058713637576728046, + "grad_norm": 3.165760428882322, + "learning_rate": 1.99568595348473e-05, + "loss": 1.6845, + "step": 440 + }, + { + "epoch": 0.0588470776621297, + "grad_norm": 10.453161040701552, + "learning_rate": 1.995645758708506e-05, + "loss": 1.7234, + "step": 441 + }, + { + "epoch": 0.05898051774753136, + "grad_norm": 1.8410106428998834, + "learning_rate": 1.9956053779569905e-05, + "loss": 1.7281, + "step": 442 + }, + { + "epoch": 0.059113957832933016, + "grad_norm": 1.442847521907269, + "learning_rate": 1.9955648112377254e-05, + "loss": 1.7504, + "step": 443 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 1.4761935818962095, + "learning_rate": 1.9955240585582887e-05, + "loss": 1.6651, + "step": 444 + }, + { + "epoch": 0.05938083800373632, + "grad_norm": 1.4515253586484216, + "learning_rate": 1.995483119926292e-05, + "loss": 1.7229, + "step": 445 + }, + { + "epoch": 0.05951427808913798, + "grad_norm": 1.5540039321519692, + "learning_rate": 1.9954419953493827e-05, + "loss": 1.7037, + "step": 446 + }, + { + "epoch": 0.059647718174539635, + "grad_norm": 1.5484860948366919, + "learning_rate": 
1.9954006848352423e-05, + "loss": 1.7149, + "step": 447 + }, + { + "epoch": 0.059781158259941285, + "grad_norm": 1.9290588477973933, + "learning_rate": 1.995359188391587e-05, + "loss": 1.7327, + "step": 448 + }, + { + "epoch": 0.05991459834534294, + "grad_norm": 1.5388782815766935, + "learning_rate": 1.9953175060261677e-05, + "loss": 1.7035, + "step": 449 + }, + { + "epoch": 0.0600480384307446, + "grad_norm": 1.4965138673427807, + "learning_rate": 1.9952756377467706e-05, + "loss": 1.7023, + "step": 450 + }, + { + "epoch": 0.06018147851614625, + "grad_norm": 1.7236071176748005, + "learning_rate": 1.995233583561216e-05, + "loss": 1.7184, + "step": 451 + }, + { + "epoch": 0.060314918601547904, + "grad_norm": 1.431141636104607, + "learning_rate": 1.9951913434773592e-05, + "loss": 1.7301, + "step": 452 + }, + { + "epoch": 0.06044835868694956, + "grad_norm": 1.6645336690164663, + "learning_rate": 1.9951489175030902e-05, + "loss": 1.7039, + "step": 453 + }, + { + "epoch": 0.06058179877235122, + "grad_norm": 2.517722128154126, + "learning_rate": 1.995106305646333e-05, + "loss": 1.7634, + "step": 454 + }, + { + "epoch": 0.060715238857752866, + "grad_norm": 1.6794801287390608, + "learning_rate": 1.9950635079150483e-05, + "loss": 1.7285, + "step": 455 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 1.7417705396361953, + "learning_rate": 1.995020524317229e-05, + "loss": 1.7344, + "step": 456 + }, + { + "epoch": 0.06098211902855618, + "grad_norm": 1.6141113551164898, + "learning_rate": 1.994977354860905e-05, + "loss": 1.7246, + "step": 457 + }, + { + "epoch": 0.061115559113957836, + "grad_norm": 1.468263394815004, + "learning_rate": 1.9949339995541393e-05, + "loss": 1.7539, + "step": 458 + }, + { + "epoch": 0.061248999199359486, + "grad_norm": 1.665408795940273, + "learning_rate": 1.99489045840503e-05, + "loss": 1.6866, + "step": 459 + }, + { + "epoch": 0.06138243928476114, + "grad_norm": 1.9816359801419392, + "learning_rate": 1.9948467314217104e-05, + "loss": 1.6644, + "step": 460 + }, + { + "epoch": 0.0615158793701628, + "grad_norm": 1.5840211934856465, + "learning_rate": 1.9948028186123482e-05, + "loss": 1.6853, + "step": 461 + }, + { + "epoch": 0.06164931945556445, + "grad_norm": 1.5472589180288636, + "learning_rate": 1.9947587199851454e-05, + "loss": 1.7171, + "step": 462 + }, + { + "epoch": 0.061782759540966105, + "grad_norm": 1.531878299898292, + "learning_rate": 1.99471443554834e-05, + "loss": 1.7135, + "step": 463 + }, + { + "epoch": 0.06191619962636776, + "grad_norm": 1.5698651966216552, + "learning_rate": 1.9946699653102027e-05, + "loss": 1.7386, + "step": 464 + }, + { + "epoch": 0.06204963971176942, + "grad_norm": 1.3931585707012093, + "learning_rate": 1.994625309279041e-05, + "loss": 1.7277, + "step": 465 + }, + { + "epoch": 0.06218307979717107, + "grad_norm": 1.360573940164653, + "learning_rate": 1.994580467463196e-05, + "loss": 1.7149, + "step": 466 + }, + { + "epoch": 0.062316519882572724, + "grad_norm": 2.827001353046576, + "learning_rate": 1.9945354398710427e-05, + "loss": 1.7028, + "step": 467 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 1.5514492612805253, + "learning_rate": 1.994490226510993e-05, + "loss": 1.75, + "step": 468 + }, + { + "epoch": 0.06258340005337604, + "grad_norm": 1.3811202391730824, + "learning_rate": 1.9944448273914917e-05, + "loss": 1.6524, + "step": 469 + }, + { + "epoch": 0.0627168401387777, + "grad_norm": 1.460693904274723, + "learning_rate": 1.9943992425210186e-05, + "loss": 1.7228, + "step": 470 + }, + { + "epoch": 0.06285028022417935, + 
"grad_norm": 1.5719369846193096, + "learning_rate": 1.9943534719080885e-05, + "loss": 1.6843, + "step": 471 + }, + { + "epoch": 0.06298372030958099, + "grad_norm": 1.4562562982659377, + "learning_rate": 1.9943075155612513e-05, + "loss": 1.7289, + "step": 472 + }, + { + "epoch": 0.06311716039498265, + "grad_norm": 1.426871294088278, + "learning_rate": 1.9942613734890908e-05, + "loss": 1.7302, + "step": 473 + }, + { + "epoch": 0.0632506004803843, + "grad_norm": 1.4065664674136416, + "learning_rate": 1.9942150457002258e-05, + "loss": 1.6727, + "step": 474 + }, + { + "epoch": 0.06338404056578596, + "grad_norm": 1.4239048483373207, + "learning_rate": 1.9941685322033095e-05, + "loss": 1.7427, + "step": 475 + }, + { + "epoch": 0.06351748065118762, + "grad_norm": 1.4154094770828864, + "learning_rate": 1.9941218330070305e-05, + "loss": 1.7506, + "step": 476 + }, + { + "epoch": 0.06365092073658928, + "grad_norm": 1.3948618761716254, + "learning_rate": 1.994074948120112e-05, + "loss": 1.6968, + "step": 477 + }, + { + "epoch": 0.06378436082199093, + "grad_norm": 1.4269960440031193, + "learning_rate": 1.994027877551311e-05, + "loss": 1.7036, + "step": 478 + }, + { + "epoch": 0.06391780090739257, + "grad_norm": 1.561320058873614, + "learning_rate": 1.9939806213094196e-05, + "loss": 1.7169, + "step": 479 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 1.3666224092243797, + "learning_rate": 1.9939331794032655e-05, + "loss": 1.6997, + "step": 480 + }, + { + "epoch": 0.06418468107819589, + "grad_norm": 1.5591971808566425, + "learning_rate": 1.9938855518417096e-05, + "loss": 1.7198, + "step": 481 + }, + { + "epoch": 0.06431812116359754, + "grad_norm": 1.436666270843483, + "learning_rate": 1.9938377386336483e-05, + "loss": 1.7764, + "step": 482 + }, + { + "epoch": 0.0644515612489992, + "grad_norm": 2.5574706627508386, + "learning_rate": 1.993789739788013e-05, + "loss": 1.7151, + "step": 483 + }, + { + "epoch": 0.06458500133440086, + "grad_norm": 1.682482407939873, + "learning_rate": 1.9937415553137686e-05, + "loss": 1.7123, + "step": 484 + }, + { + "epoch": 0.06471844141980251, + "grad_norm": 1.4561970319846727, + "learning_rate": 1.993693185219916e-05, + "loss": 1.7404, + "step": 485 + }, + { + "epoch": 0.06485188150520416, + "grad_norm": 1.503219984664679, + "learning_rate": 1.9936446295154902e-05, + "loss": 1.7044, + "step": 486 + }, + { + "epoch": 0.06498532159060581, + "grad_norm": 1.4575867517583754, + "learning_rate": 1.9935958882095607e-05, + "loss": 1.6829, + "step": 487 + }, + { + "epoch": 0.06511876167600747, + "grad_norm": 1.5458704457306156, + "learning_rate": 1.9935469613112318e-05, + "loss": 1.7147, + "step": 488 + }, + { + "epoch": 0.06525220176140913, + "grad_norm": 1.4324636862239977, + "learning_rate": 1.9934978488296423e-05, + "loss": 1.699, + "step": 489 + }, + { + "epoch": 0.06538564184681078, + "grad_norm": 1.609515083336598, + "learning_rate": 1.993448550773966e-05, + "loss": 1.7071, + "step": 490 + }, + { + "epoch": 0.06551908193221244, + "grad_norm": 1.63784929721931, + "learning_rate": 1.9933990671534116e-05, + "loss": 1.7445, + "step": 491 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 1.7557428701106073, + "learning_rate": 1.9933493979772215e-05, + "loss": 1.7638, + "step": 492 + }, + { + "epoch": 0.06578596210301575, + "grad_norm": 1.6719589515914446, + "learning_rate": 1.9932995432546733e-05, + "loss": 1.6957, + "step": 493 + }, + { + "epoch": 0.0659194021884174, + "grad_norm": 1.3831309866724475, + "learning_rate": 1.99324950299508e-05, + "loss": 1.7108, + 
"step": 494 + }, + { + "epoch": 0.06605284227381905, + "grad_norm": 1.6438709703821648, + "learning_rate": 1.9931992772077877e-05, + "loss": 1.7459, + "step": 495 + }, + { + "epoch": 0.06618628235922071, + "grad_norm": 1.556799980817237, + "learning_rate": 1.993148865902179e-05, + "loss": 1.7165, + "step": 496 + }, + { + "epoch": 0.06631972244462236, + "grad_norm": 1.6005504681612104, + "learning_rate": 1.993098269087669e-05, + "loss": 1.6964, + "step": 497 + }, + { + "epoch": 0.06645316253002402, + "grad_norm": 1.6767116156256117, + "learning_rate": 1.9930474867737093e-05, + "loss": 1.7101, + "step": 498 + }, + { + "epoch": 0.06658660261542568, + "grad_norm": 1.437357072942104, + "learning_rate": 1.9929965189697855e-05, + "loss": 1.7503, + "step": 499 + }, + { + "epoch": 0.06672004270082733, + "grad_norm": 1.2785056943422897, + "learning_rate": 1.9929453656854177e-05, + "loss": 1.7404, + "step": 500 + }, + { + "epoch": 0.06685348278622898, + "grad_norm": 1.4081248663083363, + "learning_rate": 1.9928940269301607e-05, + "loss": 1.6389, + "step": 501 + }, + { + "epoch": 0.06698692287163063, + "grad_norm": 1.2600630608207999, + "learning_rate": 1.9928425027136038e-05, + "loss": 1.6954, + "step": 502 + }, + { + "epoch": 0.06712036295703229, + "grad_norm": 1.323168623645999, + "learning_rate": 1.992790793045371e-05, + "loss": 1.7456, + "step": 503 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 1.337670730789513, + "learning_rate": 1.9927388979351222e-05, + "loss": 1.7088, + "step": 504 + }, + { + "epoch": 0.0673872431278356, + "grad_norm": 1.3450109564421233, + "learning_rate": 1.9926868173925496e-05, + "loss": 1.7313, + "step": 505 + }, + { + "epoch": 0.06752068321323726, + "grad_norm": 6.8172313878010415, + "learning_rate": 1.9926345514273817e-05, + "loss": 1.7266, + "step": 506 + }, + { + "epoch": 0.06765412329863892, + "grad_norm": 1.4477937313346818, + "learning_rate": 1.992582100049381e-05, + "loss": 1.7154, + "step": 507 + }, + { + "epoch": 0.06778756338404057, + "grad_norm": 1.4944076158515036, + "learning_rate": 1.9925294632683454e-05, + "loss": 1.681, + "step": 508 + }, + { + "epoch": 0.06792100346944221, + "grad_norm": 1.4995222349055868, + "learning_rate": 1.992476641094106e-05, + "loss": 1.7149, + "step": 509 + }, + { + "epoch": 0.06805444355484387, + "grad_norm": 1.489364590881723, + "learning_rate": 1.99242363353653e-05, + "loss": 1.7353, + "step": 510 + }, + { + "epoch": 0.06818788364024553, + "grad_norm": 1.6809529621977222, + "learning_rate": 1.9923704406055185e-05, + "loss": 1.7274, + "step": 511 + }, + { + "epoch": 0.06832132372564718, + "grad_norm": 3.583760089378779, + "learning_rate": 1.992317062311007e-05, + "loss": 1.7217, + "step": 512 + }, + { + "epoch": 0.06845476381104884, + "grad_norm": 1.3874294319409661, + "learning_rate": 1.9922634986629667e-05, + "loss": 1.7285, + "step": 513 + }, + { + "epoch": 0.0685882038964505, + "grad_norm": 1.3167043130129317, + "learning_rate": 1.992209749671402e-05, + "loss": 1.7086, + "step": 514 + }, + { + "epoch": 0.06872164398185215, + "grad_norm": 1.6269952933086236, + "learning_rate": 1.9921558153463526e-05, + "loss": 1.7474, + "step": 515 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 2.1627161830488304, + "learning_rate": 1.992101695697893e-05, + "loss": 1.7087, + "step": 516 + }, + { + "epoch": 0.06898852415265545, + "grad_norm": 1.6239043871414123, + "learning_rate": 1.9920473907361324e-05, + "loss": 1.7406, + "step": 517 + }, + { + "epoch": 0.06912196423805711, + "grad_norm": 1.3153898428508883, + 
"learning_rate": 1.9919929004712137e-05, + "loss": 1.7223, + "step": 518 + }, + { + "epoch": 0.06925540432345877, + "grad_norm": 1.368634326229657, + "learning_rate": 1.9919382249133158e-05, + "loss": 1.6969, + "step": 519 + }, + { + "epoch": 0.06938884440886042, + "grad_norm": 1.6237538448200741, + "learning_rate": 1.991883364072651e-05, + "loss": 1.724, + "step": 520 + }, + { + "epoch": 0.06952228449426208, + "grad_norm": 1.3169440768502187, + "learning_rate": 1.991828317959467e-05, + "loss": 1.7785, + "step": 521 + }, + { + "epoch": 0.06965572457966374, + "grad_norm": 1.3815810002503197, + "learning_rate": 1.9917730865840453e-05, + "loss": 1.737, + "step": 522 + }, + { + "epoch": 0.06978916466506539, + "grad_norm": 19.153199421244327, + "learning_rate": 1.991717669956703e-05, + "loss": 1.6833, + "step": 523 + }, + { + "epoch": 0.06992260475046704, + "grad_norm": 1.6039439835379967, + "learning_rate": 1.991662068087791e-05, + "loss": 1.7322, + "step": 524 + }, + { + "epoch": 0.07005604483586869, + "grad_norm": 1.6305057820452133, + "learning_rate": 1.991606280987695e-05, + "loss": 1.7338, + "step": 525 + }, + { + "epoch": 0.07018948492127035, + "grad_norm": 1.6126054306261002, + "learning_rate": 1.9915503086668358e-05, + "loss": 1.7227, + "step": 526 + }, + { + "epoch": 0.070322925006672, + "grad_norm": 1.396267469940587, + "learning_rate": 1.991494151135668e-05, + "loss": 1.7336, + "step": 527 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 1.3519617857019466, + "learning_rate": 1.9914378084046814e-05, + "loss": 1.685, + "step": 528 + }, + { + "epoch": 0.07058980517747532, + "grad_norm": 1.369805017409349, + "learning_rate": 1.9913812804844003e-05, + "loss": 1.6849, + "step": 529 + }, + { + "epoch": 0.07072324526287697, + "grad_norm": 1.3456126243717488, + "learning_rate": 1.9913245673853833e-05, + "loss": 1.6779, + "step": 530 + }, + { + "epoch": 0.07085668534827862, + "grad_norm": 1.4115469932734985, + "learning_rate": 1.9912676691182237e-05, + "loss": 1.6661, + "step": 531 + }, + { + "epoch": 0.07099012543368027, + "grad_norm": 1.452789685185474, + "learning_rate": 1.9912105856935497e-05, + "loss": 1.722, + "step": 532 + }, + { + "epoch": 0.07112356551908193, + "grad_norm": 1.4137246122718905, + "learning_rate": 1.9911533171220234e-05, + "loss": 1.6956, + "step": 533 + }, + { + "epoch": 0.07125700560448359, + "grad_norm": 1.657715031131543, + "learning_rate": 1.991095863414342e-05, + "loss": 1.7233, + "step": 534 + }, + { + "epoch": 0.07139044568988524, + "grad_norm": 1.5491300325617336, + "learning_rate": 1.9910382245812376e-05, + "loss": 1.7357, + "step": 535 + }, + { + "epoch": 0.0715238857752869, + "grad_norm": 1.3876706174173905, + "learning_rate": 1.990980400633476e-05, + "loss": 1.6843, + "step": 536 + }, + { + "epoch": 0.07165732586068856, + "grad_norm": 1.5151426749768526, + "learning_rate": 1.9909223915818584e-05, + "loss": 1.7651, + "step": 537 + }, + { + "epoch": 0.0717907659460902, + "grad_norm": 1.3575479856663701, + "learning_rate": 1.99086419743722e-05, + "loss": 1.6813, + "step": 538 + }, + { + "epoch": 0.07192420603149186, + "grad_norm": 1.2693638194156773, + "learning_rate": 1.9908058182104317e-05, + "loss": 1.7248, + "step": 539 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 1.6457462336156898, + "learning_rate": 1.9907472539123968e-05, + "loss": 1.752, + "step": 540 + }, + { + "epoch": 0.07219108620229517, + "grad_norm": 1.678698655916284, + "learning_rate": 1.9906885045540547e-05, + "loss": 1.7071, + "step": 541 + }, + { + "epoch": 
0.07232452628769682, + "grad_norm": 1.650324255621724, + "learning_rate": 1.99062957014638e-05, + "loss": 1.7447, + "step": 542 + }, + { + "epoch": 0.07245796637309848, + "grad_norm": 1.417893991120254, + "learning_rate": 1.9905704507003794e-05, + "loss": 1.6948, + "step": 543 + }, + { + "epoch": 0.07259140645850014, + "grad_norm": 1.4999406881312796, + "learning_rate": 1.9905111462270976e-05, + "loss": 1.648, + "step": 544 + }, + { + "epoch": 0.0727248465439018, + "grad_norm": 1.4453828984993826, + "learning_rate": 1.9904516567376105e-05, + "loss": 1.7441, + "step": 545 + }, + { + "epoch": 0.07285828662930344, + "grad_norm": 1.466543506749187, + "learning_rate": 1.990391982243031e-05, + "loss": 1.736, + "step": 546 + }, + { + "epoch": 0.0729917267147051, + "grad_norm": 1.2757799214271062, + "learning_rate": 1.990332122754505e-05, + "loss": 1.7273, + "step": 547 + }, + { + "epoch": 0.07312516680010675, + "grad_norm": 1.786649049168531, + "learning_rate": 1.9902720782832136e-05, + "loss": 1.7035, + "step": 548 + }, + { + "epoch": 0.0732586068855084, + "grad_norm": 1.6705192239448357, + "learning_rate": 1.990211848840373e-05, + "loss": 1.7395, + "step": 549 + }, + { + "epoch": 0.07339204697091006, + "grad_norm": 1.4329159010944417, + "learning_rate": 1.990151434437233e-05, + "loss": 1.7018, + "step": 550 + }, + { + "epoch": 0.07352548705631172, + "grad_norm": 1.3424218662116187, + "learning_rate": 1.9900908350850784e-05, + "loss": 1.6933, + "step": 551 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 1.4293148650020853, + "learning_rate": 1.9900300507952282e-05, + "loss": 1.7374, + "step": 552 + }, + { + "epoch": 0.07379236722711502, + "grad_norm": 1.3488966110143432, + "learning_rate": 1.9899690815790365e-05, + "loss": 1.72, + "step": 553 + }, + { + "epoch": 0.07392580731251668, + "grad_norm": 1.2512462922247582, + "learning_rate": 1.9899079274478916e-05, + "loss": 1.6908, + "step": 554 + }, + { + "epoch": 0.07405924739791833, + "grad_norm": 1.4908815449799042, + "learning_rate": 1.9898465884132164e-05, + "loss": 1.6776, + "step": 555 + }, + { + "epoch": 0.07419268748331999, + "grad_norm": 1.340382784697227, + "learning_rate": 1.9897850644864683e-05, + "loss": 1.7624, + "step": 556 + }, + { + "epoch": 0.07432612756872164, + "grad_norm": 1.5972162605012559, + "learning_rate": 1.9897233556791392e-05, + "loss": 1.7383, + "step": 557 + }, + { + "epoch": 0.0744595676541233, + "grad_norm": 1.5896167899937395, + "learning_rate": 1.989661462002756e-05, + "loss": 1.7443, + "step": 558 + }, + { + "epoch": 0.07459300773952496, + "grad_norm": 1.257607085158065, + "learning_rate": 1.9895993834688792e-05, + "loss": 1.6755, + "step": 559 + }, + { + "epoch": 0.07472644782492661, + "grad_norm": 1.3197820287481992, + "learning_rate": 1.9895371200891045e-05, + "loss": 1.7056, + "step": 560 + }, + { + "epoch": 0.07485988791032826, + "grad_norm": 1.2908766480278888, + "learning_rate": 1.989474671875062e-05, + "loss": 1.6912, + "step": 561 + }, + { + "epoch": 0.07499332799572991, + "grad_norm": 1.3304422687884063, + "learning_rate": 1.9894120388384167e-05, + "loss": 1.6982, + "step": 562 + }, + { + "epoch": 0.07512676808113157, + "grad_norm": 1.270386568576679, + "learning_rate": 1.9893492209908673e-05, + "loss": 1.6924, + "step": 563 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 1.3181304828307967, + "learning_rate": 1.9892862183441475e-05, + "loss": 1.7527, + "step": 564 + }, + { + "epoch": 0.07539364825193488, + "grad_norm": 1.2811143921204557, + "learning_rate": 1.9892230309100257e-05, + 
"loss": 1.6951, + "step": 565 + }, + { + "epoch": 0.07552708833733654, + "grad_norm": 1.5238424880045898, + "learning_rate": 1.989159658700304e-05, + "loss": 1.7398, + "step": 566 + }, + { + "epoch": 0.0756605284227382, + "grad_norm": 1.4366895400044768, + "learning_rate": 1.9890961017268206e-05, + "loss": 1.7202, + "step": 567 + }, + { + "epoch": 0.07579396850813984, + "grad_norm": 1.3526495950339494, + "learning_rate": 1.9890323600014465e-05, + "loss": 1.732, + "step": 568 + }, + { + "epoch": 0.0759274085935415, + "grad_norm": 1.8603030771660538, + "learning_rate": 1.9889684335360883e-05, + "loss": 1.7233, + "step": 569 + }, + { + "epoch": 0.07606084867894315, + "grad_norm": 1.457863107278941, + "learning_rate": 1.9889043223426864e-05, + "loss": 1.6965, + "step": 570 + }, + { + "epoch": 0.07619428876434481, + "grad_norm": 1.391003183808799, + "learning_rate": 1.988840026433216e-05, + "loss": 1.6628, + "step": 571 + }, + { + "epoch": 0.07632772884974647, + "grad_norm": 1.5192166538443774, + "learning_rate": 1.988775545819687e-05, + "loss": 1.7319, + "step": 572 + }, + { + "epoch": 0.07646116893514812, + "grad_norm": 1.349717213694597, + "learning_rate": 1.988710880514144e-05, + "loss": 1.7236, + "step": 573 + }, + { + "epoch": 0.07659460902054978, + "grad_norm": 1.4799233730457397, + "learning_rate": 1.9886460305286653e-05, + "loss": 1.6907, + "step": 574 + }, + { + "epoch": 0.07672804910595143, + "grad_norm": 1.3506939293859863, + "learning_rate": 1.988580995875364e-05, + "loss": 1.7699, + "step": 575 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 1.407704376614465, + "learning_rate": 1.9885157765663882e-05, + "loss": 1.6905, + "step": 576 + }, + { + "epoch": 0.07699492927675473, + "grad_norm": 1.3193501358588269, + "learning_rate": 1.98845037261392e-05, + "loss": 1.6843, + "step": 577 + }, + { + "epoch": 0.07712836936215639, + "grad_norm": 1.4235546422730225, + "learning_rate": 1.9883847840301766e-05, + "loss": 1.6729, + "step": 578 + }, + { + "epoch": 0.07726180944755805, + "grad_norm": 1.416179755704021, + "learning_rate": 1.9883190108274083e-05, + "loss": 1.6728, + "step": 579 + }, + { + "epoch": 0.0773952495329597, + "grad_norm": 1.4443285178109353, + "learning_rate": 1.9882530530179013e-05, + "loss": 1.7048, + "step": 580 + }, + { + "epoch": 0.07752868961836136, + "grad_norm": 1.5186108478360427, + "learning_rate": 1.9881869106139756e-05, + "loss": 1.7514, + "step": 581 + }, + { + "epoch": 0.07766212970376302, + "grad_norm": 1.5545405165342625, + "learning_rate": 1.988120583627986e-05, + "loss": 1.7108, + "step": 582 + }, + { + "epoch": 0.07779556978916466, + "grad_norm": 1.3900902877777435, + "learning_rate": 1.9880540720723214e-05, + "loss": 1.7202, + "step": 583 + }, + { + "epoch": 0.07792900987456632, + "grad_norm": 1.5816584989791276, + "learning_rate": 1.9879873759594057e-05, + "loss": 1.6767, + "step": 584 + }, + { + "epoch": 0.07806244995996797, + "grad_norm": 1.6976660554047953, + "learning_rate": 1.9879204953016968e-05, + "loss": 1.7432, + "step": 585 + }, + { + "epoch": 0.07819589004536963, + "grad_norm": 1.3644155406532983, + "learning_rate": 1.987853430111687e-05, + "loss": 1.7208, + "step": 586 + }, + { + "epoch": 0.07832933013077129, + "grad_norm": 1.7167546825301851, + "learning_rate": 1.987786180401904e-05, + "loss": 1.7147, + "step": 587 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 1.3847591281625475, + "learning_rate": 1.9877187461849086e-05, + "loss": 1.7112, + "step": 588 + }, + { + "epoch": 0.0785962103015746, + "grad_norm": 
1.5571722725576893, + "learning_rate": 1.9876511274732973e-05, + "loss": 1.7638, + "step": 589 + }, + { + "epoch": 0.07872965038697624, + "grad_norm": 1.394231398970201, + "learning_rate": 1.9875833242797e-05, + "loss": 1.7043, + "step": 590 + }, + { + "epoch": 0.0788630904723779, + "grad_norm": 1.3846937701988455, + "learning_rate": 1.987515336616782e-05, + "loss": 1.7134, + "step": 591 + }, + { + "epoch": 0.07899653055777955, + "grad_norm": 1.3032703638813365, + "learning_rate": 1.9874471644972423e-05, + "loss": 1.7126, + "step": 592 + }, + { + "epoch": 0.07912997064318121, + "grad_norm": 1.4886448086335287, + "learning_rate": 1.9873788079338147e-05, + "loss": 1.7153, + "step": 593 + }, + { + "epoch": 0.07926341072858287, + "grad_norm": 1.611970344713762, + "learning_rate": 1.9873102669392676e-05, + "loss": 1.7026, + "step": 594 + }, + { + "epoch": 0.07939685081398452, + "grad_norm": 1.341383594822625, + "learning_rate": 1.9872415415264036e-05, + "loss": 1.6898, + "step": 595 + }, + { + "epoch": 0.07953029089938618, + "grad_norm": 1.3360746005534239, + "learning_rate": 1.9871726317080596e-05, + "loss": 1.6745, + "step": 596 + }, + { + "epoch": 0.07966373098478784, + "grad_norm": 8.410282418991848, + "learning_rate": 1.9871035374971076e-05, + "loss": 1.7583, + "step": 597 + }, + { + "epoch": 0.07979717107018948, + "grad_norm": 1.6794637542120652, + "learning_rate": 1.9870342589064533e-05, + "loss": 1.6641, + "step": 598 + }, + { + "epoch": 0.07993061115559114, + "grad_norm": 1.3453314375845957, + "learning_rate": 1.9869647959490373e-05, + "loss": 1.6974, + "step": 599 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 1.6191652794215983, + "learning_rate": 1.9868951486378344e-05, + "loss": 1.7029, + "step": 600 + }, + { + "epoch": 0.08019749132639445, + "grad_norm": 7.014227307878178, + "learning_rate": 1.986825316985854e-05, + "loss": 1.6979, + "step": 601 + }, + { + "epoch": 0.0803309314117961, + "grad_norm": 1.666603092412851, + "learning_rate": 1.9867553010061397e-05, + "loss": 1.7212, + "step": 602 + }, + { + "epoch": 0.08046437149719776, + "grad_norm": 6.777435475671393, + "learning_rate": 1.98668510071177e-05, + "loss": 1.6779, + "step": 603 + }, + { + "epoch": 0.08059781158259942, + "grad_norm": 9.166357022667544, + "learning_rate": 1.9866147161158574e-05, + "loss": 1.7199, + "step": 604 + }, + { + "epoch": 0.08073125166800106, + "grad_norm": 1.8917080711861438, + "learning_rate": 1.9865441472315482e-05, + "loss": 1.6838, + "step": 605 + }, + { + "epoch": 0.08086469175340272, + "grad_norm": 1.5977719491010742, + "learning_rate": 1.986473394072025e-05, + "loss": 1.7179, + "step": 606 + }, + { + "epoch": 0.08099813183880437, + "grad_norm": 1.658856522654762, + "learning_rate": 1.9864024566505034e-05, + "loss": 1.6835, + "step": 607 + }, + { + "epoch": 0.08113157192420603, + "grad_norm": 1.499498737885197, + "learning_rate": 1.9863313349802333e-05, + "loss": 1.7349, + "step": 608 + }, + { + "epoch": 0.08126501200960769, + "grad_norm": 13.15546017312421, + "learning_rate": 1.9862600290744996e-05, + "loss": 1.7886, + "step": 609 + }, + { + "epoch": 0.08139845209500934, + "grad_norm": 1.9828316081894657, + "learning_rate": 1.9861885389466216e-05, + "loss": 1.731, + "step": 610 + }, + { + "epoch": 0.081531892180411, + "grad_norm": 1.7448728734068772, + "learning_rate": 1.9861168646099525e-05, + "loss": 1.6969, + "step": 611 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 1.6273856601272179, + "learning_rate": 1.9860450060778806e-05, + "loss": 1.7009, + "step": 612 + }, + { + 
"epoch": 0.0817987723512143, + "grad_norm": 1.624049525128813, + "learning_rate": 1.9859729633638278e-05, + "loss": 1.762, + "step": 613 + }, + { + "epoch": 0.08193221243661596, + "grad_norm": 2.6801910069846797, + "learning_rate": 1.9859007364812516e-05, + "loss": 1.64, + "step": 614 + }, + { + "epoch": 0.08206565252201761, + "grad_norm": 1.905005831673537, + "learning_rate": 1.9858283254436425e-05, + "loss": 1.6968, + "step": 615 + }, + { + "epoch": 0.08219909260741927, + "grad_norm": 1.7290318164845841, + "learning_rate": 1.9857557302645265e-05, + "loss": 1.669, + "step": 616 + }, + { + "epoch": 0.08233253269282093, + "grad_norm": 1.7447056074174758, + "learning_rate": 1.985682950957463e-05, + "loss": 1.6803, + "step": 617 + }, + { + "epoch": 0.08246597277822258, + "grad_norm": 1.7209313384190483, + "learning_rate": 1.9856099875360472e-05, + "loss": 1.7356, + "step": 618 + }, + { + "epoch": 0.08259941286362424, + "grad_norm": 1.6574968534579264, + "learning_rate": 1.9855368400139068e-05, + "loss": 1.714, + "step": 619 + }, + { + "epoch": 0.08273285294902588, + "grad_norm": 2.0519868971563344, + "learning_rate": 1.9854635084047055e-05, + "loss": 1.6875, + "step": 620 + }, + { + "epoch": 0.08286629303442754, + "grad_norm": 1.5958780040536966, + "learning_rate": 1.9853899927221408e-05, + "loss": 1.7272, + "step": 621 + }, + { + "epoch": 0.0829997331198292, + "grad_norm": 1.596271866444752, + "learning_rate": 1.9853162929799445e-05, + "loss": 1.6827, + "step": 622 + }, + { + "epoch": 0.08313317320523085, + "grad_norm": 1.7231315613385811, + "learning_rate": 1.9852424091918832e-05, + "loss": 1.7274, + "step": 623 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 1.4156414113003977, + "learning_rate": 1.985168341371757e-05, + "loss": 1.7028, + "step": 624 + }, + { + "epoch": 0.08340005337603416, + "grad_norm": 1.6053274294337398, + "learning_rate": 1.9850940895334014e-05, + "loss": 1.7042, + "step": 625 + }, + { + "epoch": 0.08353349346143582, + "grad_norm": 1.4585415436887006, + "learning_rate": 1.9850196536906857e-05, + "loss": 1.7034, + "step": 626 + }, + { + "epoch": 0.08366693354683746, + "grad_norm": 1.7610381200893763, + "learning_rate": 1.9849450338575132e-05, + "loss": 1.6512, + "step": 627 + }, + { + "epoch": 0.08380037363223912, + "grad_norm": 1.3605669698961698, + "learning_rate": 1.9848702300478227e-05, + "loss": 1.7041, + "step": 628 + }, + { + "epoch": 0.08393381371764078, + "grad_norm": 1.3831476932394524, + "learning_rate": 1.984795242275586e-05, + "loss": 1.6914, + "step": 629 + }, + { + "epoch": 0.08406725380304243, + "grad_norm": 1.3145903110830626, + "learning_rate": 1.9847200705548106e-05, + "loss": 1.7033, + "step": 630 + }, + { + "epoch": 0.08420069388844409, + "grad_norm": 1.509116622712505, + "learning_rate": 1.9846447148995374e-05, + "loss": 1.7358, + "step": 631 + }, + { + "epoch": 0.08433413397384575, + "grad_norm": 1.3781652631062695, + "learning_rate": 1.984569175323842e-05, + "loss": 1.6917, + "step": 632 + }, + { + "epoch": 0.0844675740592474, + "grad_norm": 1.264221299881922, + "learning_rate": 1.984493451841835e-05, + "loss": 1.6757, + "step": 633 + }, + { + "epoch": 0.08460101414464906, + "grad_norm": 1.4764352016979319, + "learning_rate": 1.9844175444676594e-05, + "loss": 1.6922, + "step": 634 + }, + { + "epoch": 0.0847344542300507, + "grad_norm": 1.2723999017947936, + "learning_rate": 1.9843414532154946e-05, + "loss": 1.6704, + "step": 635 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 1.3038264727029132, + "learning_rate": 
1.9842651780995536e-05, + "loss": 1.7411, + "step": 636 + }, + { + "epoch": 0.08500133440085401, + "grad_norm": 1.2103071746870917, + "learning_rate": 1.9841887191340835e-05, + "loss": 1.7046, + "step": 637 + }, + { + "epoch": 0.08513477448625567, + "grad_norm": 1.4950179854172028, + "learning_rate": 1.9841120763333665e-05, + "loss": 1.6348, + "step": 638 + }, + { + "epoch": 0.08526821457165733, + "grad_norm": 1.3121278424473823, + "learning_rate": 1.9840352497117178e-05, + "loss": 1.6354, + "step": 639 + }, + { + "epoch": 0.08540165465705898, + "grad_norm": 1.2723164770543802, + "learning_rate": 1.9839582392834883e-05, + "loss": 1.7162, + "step": 640 + }, + { + "epoch": 0.08553509474246064, + "grad_norm": 1.2860827518610605, + "learning_rate": 1.983881045063062e-05, + "loss": 1.7034, + "step": 641 + }, + { + "epoch": 0.08566853482786228, + "grad_norm": 1.2951379952738893, + "learning_rate": 1.983803667064859e-05, + "loss": 1.6595, + "step": 642 + }, + { + "epoch": 0.08580197491326394, + "grad_norm": 1.3206243247305416, + "learning_rate": 1.9837261053033316e-05, + "loss": 1.6181, + "step": 643 + }, + { + "epoch": 0.0859354149986656, + "grad_norm": 1.2669833972811808, + "learning_rate": 1.983648359792968e-05, + "loss": 1.7101, + "step": 644 + }, + { + "epoch": 0.08606885508406725, + "grad_norm": 1.29975526714487, + "learning_rate": 1.9835704305482905e-05, + "loss": 1.7718, + "step": 645 + }, + { + "epoch": 0.08620229516946891, + "grad_norm": 1.292410669128173, + "learning_rate": 1.9834923175838543e-05, + "loss": 1.6673, + "step": 646 + }, + { + "epoch": 0.08633573525487057, + "grad_norm": 1.4355885884004973, + "learning_rate": 1.9834140209142507e-05, + "loss": 1.7111, + "step": 647 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 1.3556914239382971, + "learning_rate": 1.983335540554105e-05, + "loss": 1.7227, + "step": 648 + }, + { + "epoch": 0.08660261542567388, + "grad_norm": 1.3219454699436908, + "learning_rate": 1.9832568765180758e-05, + "loss": 1.6788, + "step": 649 + }, + { + "epoch": 0.08673605551107552, + "grad_norm": 1.3046640649028514, + "learning_rate": 1.9831780288208565e-05, + "loss": 1.7202, + "step": 650 + }, + { + "epoch": 0.08686949559647718, + "grad_norm": 1.2956980030658956, + "learning_rate": 1.983098997477176e-05, + "loss": 1.6821, + "step": 651 + }, + { + "epoch": 0.08700293568187883, + "grad_norm": 1.4884991397141505, + "learning_rate": 1.9830197825017955e-05, + "loss": 1.6731, + "step": 652 + }, + { + "epoch": 0.08713637576728049, + "grad_norm": 1.2808830107084233, + "learning_rate": 1.9829403839095115e-05, + "loss": 1.6643, + "step": 653 + }, + { + "epoch": 0.08726981585268215, + "grad_norm": 1.3169596863329627, + "learning_rate": 1.9828608017151554e-05, + "loss": 1.6755, + "step": 654 + }, + { + "epoch": 0.0874032559380838, + "grad_norm": 1.4063600492133352, + "learning_rate": 1.9827810359335914e-05, + "loss": 1.6977, + "step": 655 + }, + { + "epoch": 0.08753669602348546, + "grad_norm": 2.4592773035398823, + "learning_rate": 1.9827010865797196e-05, + "loss": 1.6971, + "step": 656 + }, + { + "epoch": 0.0876701361088871, + "grad_norm": 1.5217703143835495, + "learning_rate": 1.9826209536684732e-05, + "loss": 1.747, + "step": 657 + }, + { + "epoch": 0.08780357619428876, + "grad_norm": 1.2927816468462434, + "learning_rate": 1.98254063721482e-05, + "loss": 1.671, + "step": 658 + }, + { + "epoch": 0.08793701627969042, + "grad_norm": 1.4773678075247743, + "learning_rate": 1.9824601372337628e-05, + "loss": 1.7076, + "step": 659 + }, + { + "epoch": 0.08807045636509207, 
+ "grad_norm": 1.4581860403342026, + "learning_rate": 1.9823794537403372e-05, + "loss": 1.7272, + "step": 660 + }, + { + "epoch": 0.08820389645049373, + "grad_norm": 1.3752337806151618, + "learning_rate": 1.982298586749615e-05, + "loss": 1.6682, + "step": 661 + }, + { + "epoch": 0.08833733653589539, + "grad_norm": 1.353951137249054, + "learning_rate": 1.9822175362767006e-05, + "loss": 1.6528, + "step": 662 + }, + { + "epoch": 0.08847077662129704, + "grad_norm": 1.3149849437010714, + "learning_rate": 1.9821363023367327e-05, + "loss": 1.7095, + "step": 663 + }, + { + "epoch": 0.0886042167066987, + "grad_norm": 1.3521374496959613, + "learning_rate": 1.982054884944886e-05, + "loss": 1.7091, + "step": 664 + }, + { + "epoch": 0.08873765679210034, + "grad_norm": 1.2635125106435583, + "learning_rate": 1.9819732841163683e-05, + "loss": 1.6917, + "step": 665 + }, + { + "epoch": 0.088871096877502, + "grad_norm": 1.4576045027521718, + "learning_rate": 1.981891499866421e-05, + "loss": 1.7307, + "step": 666 + }, + { + "epoch": 0.08900453696290365, + "grad_norm": 2.0342924247731333, + "learning_rate": 1.9818095322103207e-05, + "loss": 1.7208, + "step": 667 + }, + { + "epoch": 0.08913797704830531, + "grad_norm": 1.3227265586786, + "learning_rate": 1.981727381163378e-05, + "loss": 1.7141, + "step": 668 + }, + { + "epoch": 0.08927141713370697, + "grad_norm": 1.3798866135669063, + "learning_rate": 1.981645046740938e-05, + "loss": 1.7157, + "step": 669 + }, + { + "epoch": 0.08940485721910862, + "grad_norm": 1.399678731841314, + "learning_rate": 1.9815625289583802e-05, + "loss": 1.7082, + "step": 670 + }, + { + "epoch": 0.08953829730451028, + "grad_norm": 1.4295883763194557, + "learning_rate": 1.981479827831117e-05, + "loss": 1.7054, + "step": 671 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 1.2635430082079968, + "learning_rate": 1.9813969433745964e-05, + "loss": 1.7389, + "step": 672 + }, + { + "epoch": 0.08980517747531358, + "grad_norm": 1.2907473612580835, + "learning_rate": 1.981313875604301e-05, + "loss": 1.7277, + "step": 673 + }, + { + "epoch": 0.08993861756071524, + "grad_norm": 1.237003971445107, + "learning_rate": 1.981230624535746e-05, + "loss": 1.7385, + "step": 674 + }, + { + "epoch": 0.0900720576461169, + "grad_norm": 1.3068726757473064, + "learning_rate": 1.9811471901844818e-05, + "loss": 1.7012, + "step": 675 + }, + { + "epoch": 0.09020549773151855, + "grad_norm": 1.3009066481680596, + "learning_rate": 1.9810635725660934e-05, + "loss": 1.7133, + "step": 676 + }, + { + "epoch": 0.0903389378169202, + "grad_norm": 1.4485758632883985, + "learning_rate": 1.9809797716961995e-05, + "loss": 1.704, + "step": 677 + }, + { + "epoch": 0.09047237790232186, + "grad_norm": 1.2649229686508479, + "learning_rate": 1.980895787590453e-05, + "loss": 1.7537, + "step": 678 + }, + { + "epoch": 0.0906058179877235, + "grad_norm": 1.2115856539500358, + "learning_rate": 1.9808116202645414e-05, + "loss": 1.7245, + "step": 679 + }, + { + "epoch": 0.09073925807312516, + "grad_norm": 1.8074140919241495, + "learning_rate": 1.9807272697341862e-05, + "loss": 1.7238, + "step": 680 + }, + { + "epoch": 0.09087269815852682, + "grad_norm": 1.3795215862127743, + "learning_rate": 1.980642736015143e-05, + "loss": 1.7037, + "step": 681 + }, + { + "epoch": 0.09100613824392847, + "grad_norm": 1.2851711329581033, + "learning_rate": 1.9805580191232017e-05, + "loss": 1.6901, + "step": 682 + }, + { + "epoch": 0.09113957832933013, + "grad_norm": 1.2298242189849842, + "learning_rate": 1.9804731190741867e-05, + "loss": 1.6914, + 
"step": 683 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 1.2359913396040505, + "learning_rate": 1.980388035883956e-05, + "loss": 1.6764, + "step": 684 + }, + { + "epoch": 0.09140645850013344, + "grad_norm": 1.4724399232032126, + "learning_rate": 1.980302769568402e-05, + "loss": 1.6784, + "step": 685 + }, + { + "epoch": 0.0915398985855351, + "grad_norm": 1.4540529296606115, + "learning_rate": 1.9802173201434522e-05, + "loss": 1.726, + "step": 686 + }, + { + "epoch": 0.09167333867093674, + "grad_norm": 1.2409303185307805, + "learning_rate": 1.980131687625067e-05, + "loss": 1.7201, + "step": 687 + }, + { + "epoch": 0.0918067787563384, + "grad_norm": 1.2194142955503393, + "learning_rate": 1.980045872029242e-05, + "loss": 1.7108, + "step": 688 + }, + { + "epoch": 0.09194021884174006, + "grad_norm": 1.39809489648676, + "learning_rate": 1.979959873372006e-05, + "loss": 1.6765, + "step": 689 + }, + { + "epoch": 0.09207365892714171, + "grad_norm": 1.253703018642756, + "learning_rate": 1.9798736916694234e-05, + "loss": 1.7405, + "step": 690 + }, + { + "epoch": 0.09220709901254337, + "grad_norm": 1.4561652380463412, + "learning_rate": 1.979787326937591e-05, + "loss": 1.7281, + "step": 691 + }, + { + "epoch": 0.09234053909794503, + "grad_norm": 1.2475596641603122, + "learning_rate": 1.9797007791926416e-05, + "loss": 1.7424, + "step": 692 + }, + { + "epoch": 0.09247397918334668, + "grad_norm": 1.2389551154520175, + "learning_rate": 1.9796140484507407e-05, + "loss": 1.64, + "step": 693 + }, + { + "epoch": 0.09260741926874833, + "grad_norm": 1.2409009102177742, + "learning_rate": 1.979527134728089e-05, + "loss": 1.7022, + "step": 694 + }, + { + "epoch": 0.09274085935414998, + "grad_norm": 3.070113056380779, + "learning_rate": 1.979440038040921e-05, + "loss": 1.7089, + "step": 695 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 1.2847037546220312, + "learning_rate": 1.9793527584055048e-05, + "loss": 1.6702, + "step": 696 + }, + { + "epoch": 0.0930077395249533, + "grad_norm": 1.342430254516785, + "learning_rate": 1.9792652958381442e-05, + "loss": 1.6981, + "step": 697 + }, + { + "epoch": 0.09314117961035495, + "grad_norm": 1.285114160550707, + "learning_rate": 1.9791776503551753e-05, + "loss": 1.7194, + "step": 698 + }, + { + "epoch": 0.09327461969575661, + "grad_norm": 1.384740422514496, + "learning_rate": 1.97908982197297e-05, + "loss": 1.6947, + "step": 699 + }, + { + "epoch": 0.09340805978115826, + "grad_norm": 1.317742692774045, + "learning_rate": 1.9790018107079328e-05, + "loss": 1.7059, + "step": 700 + }, + { + "epoch": 0.09354149986655992, + "grad_norm": 1.3948823280878986, + "learning_rate": 1.978913616576504e-05, + "loss": 1.6838, + "step": 701 + }, + { + "epoch": 0.09367493995196156, + "grad_norm": 1.5694970220793347, + "learning_rate": 1.9788252395951572e-05, + "loss": 1.7468, + "step": 702 + }, + { + "epoch": 0.09380838003736322, + "grad_norm": 1.3817257421047109, + "learning_rate": 1.9787366797804e-05, + "loss": 1.7176, + "step": 703 + }, + { + "epoch": 0.09394182012276488, + "grad_norm": 1.6513313977834363, + "learning_rate": 1.978647937148774e-05, + "loss": 1.6944, + "step": 704 + }, + { + "epoch": 0.09407526020816653, + "grad_norm": 1.405768726367186, + "learning_rate": 1.9785590117168558e-05, + "loss": 1.6983, + "step": 705 + }, + { + "epoch": 0.09420870029356819, + "grad_norm": 1.3377994406213816, + "learning_rate": 1.9784699035012552e-05, + "loss": 1.668, + "step": 706 + }, + { + "epoch": 0.09434214037896985, + "grad_norm": 1.4374634387262166, + "learning_rate": 
1.9783806125186176e-05, + "loss": 1.725, + "step": 707 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 1.2800462634526275, + "learning_rate": 1.9782911387856204e-05, + "loss": 1.6953, + "step": 708 + }, + { + "epoch": 0.09460902054977315, + "grad_norm": 1.364118854409637, + "learning_rate": 1.978201482318977e-05, + "loss": 1.7155, + "step": 709 + }, + { + "epoch": 0.0947424606351748, + "grad_norm": 1.2401749198659822, + "learning_rate": 1.9781116431354337e-05, + "loss": 1.7057, + "step": 710 + }, + { + "epoch": 0.09487590072057646, + "grad_norm": 1.8122777264311412, + "learning_rate": 1.9780216212517718e-05, + "loss": 1.7083, + "step": 711 + }, + { + "epoch": 0.09500934080597812, + "grad_norm": 1.3875021914221024, + "learning_rate": 1.9779314166848063e-05, + "loss": 1.6533, + "step": 712 + }, + { + "epoch": 0.09514278089137977, + "grad_norm": 1.2772834218394613, + "learning_rate": 1.9778410294513865e-05, + "loss": 1.6915, + "step": 713 + }, + { + "epoch": 0.09527622097678143, + "grad_norm": 1.4645542214685092, + "learning_rate": 1.977750459568395e-05, + "loss": 1.7061, + "step": 714 + }, + { + "epoch": 0.09540966106218308, + "grad_norm": 1.38108304312693, + "learning_rate": 1.9776597070527502e-05, + "loss": 1.721, + "step": 715 + }, + { + "epoch": 0.09554310114758474, + "grad_norm": 1.382167340023687, + "learning_rate": 1.977568771921403e-05, + "loss": 1.7469, + "step": 716 + }, + { + "epoch": 0.09567654123298638, + "grad_norm": 1.4198404214818579, + "learning_rate": 1.9774776541913394e-05, + "loss": 1.6709, + "step": 717 + }, + { + "epoch": 0.09580998131838804, + "grad_norm": 1.3599396711566136, + "learning_rate": 1.9773863538795787e-05, + "loss": 1.7425, + "step": 718 + }, + { + "epoch": 0.0959434214037897, + "grad_norm": 1.2499141359082184, + "learning_rate": 1.9772948710031754e-05, + "loss": 1.6758, + "step": 719 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 1.3452952575533974, + "learning_rate": 1.977203205579217e-05, + "loss": 1.6976, + "step": 720 + }, + { + "epoch": 0.09621030157459301, + "grad_norm": 1.5618388812813502, + "learning_rate": 1.9771113576248257e-05, + "loss": 1.7143, + "step": 721 + }, + { + "epoch": 0.09634374165999467, + "grad_norm": 1.4099658140052458, + "learning_rate": 1.9770193271571573e-05, + "loss": 1.704, + "step": 722 + }, + { + "epoch": 0.09647718174539632, + "grad_norm": 1.4101573255249529, + "learning_rate": 1.976927114193403e-05, + "loss": 1.7099, + "step": 723 + }, + { + "epoch": 0.09661062183079797, + "grad_norm": 2.4533761018966898, + "learning_rate": 1.976834718750786e-05, + "loss": 1.7659, + "step": 724 + }, + { + "epoch": 0.09674406191619962, + "grad_norm": 1.450517529596779, + "learning_rate": 1.9767421408465654e-05, + "loss": 1.7009, + "step": 725 + }, + { + "epoch": 0.09687750200160128, + "grad_norm": 1.268810752392761, + "learning_rate": 1.9766493804980335e-05, + "loss": 1.7101, + "step": 726 + }, + { + "epoch": 0.09701094208700294, + "grad_norm": 1.3332261405385217, + "learning_rate": 1.976556437722517e-05, + "loss": 1.7127, + "step": 727 + }, + { + "epoch": 0.09714438217240459, + "grad_norm": 1.2579594665184823, + "learning_rate": 1.976463312537376e-05, + "loss": 1.7202, + "step": 728 + }, + { + "epoch": 0.09727782225780625, + "grad_norm": 1.5697939160123118, + "learning_rate": 1.976370004960006e-05, + "loss": 1.7365, + "step": 729 + }, + { + "epoch": 0.0974112623432079, + "grad_norm": 1.437009988573765, + "learning_rate": 1.9762765150078356e-05, + "loss": 1.7322, + "step": 730 + }, + { + "epoch": 0.09754470242860955, + 
"grad_norm": 1.3169792642285323, + "learning_rate": 1.9761828426983275e-05, + "loss": 1.7273, + "step": 731 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 1.5676941938167916, + "learning_rate": 1.9760889880489785e-05, + "loss": 1.6898, + "step": 732 + }, + { + "epoch": 0.09781158259941286, + "grad_norm": 1.4697881996182383, + "learning_rate": 1.9759949510773198e-05, + "loss": 1.7192, + "step": 733 + }, + { + "epoch": 0.09794502268481452, + "grad_norm": 1.2684891400607772, + "learning_rate": 1.9759007318009163e-05, + "loss": 1.7186, + "step": 734 + }, + { + "epoch": 0.09807846277021617, + "grad_norm": 1.2364993695259308, + "learning_rate": 1.9758063302373668e-05, + "loss": 1.7161, + "step": 735 + }, + { + "epoch": 0.09821190285561783, + "grad_norm": 1.2322024949845318, + "learning_rate": 1.9757117464043054e-05, + "loss": 1.6399, + "step": 736 + }, + { + "epoch": 0.09834534294101949, + "grad_norm": 1.2787037328868474, + "learning_rate": 1.9756169803193982e-05, + "loss": 1.6938, + "step": 737 + }, + { + "epoch": 0.09847878302642114, + "grad_norm": 1.4067488547142601, + "learning_rate": 1.9755220320003472e-05, + "loss": 1.6728, + "step": 738 + }, + { + "epoch": 0.09861222311182279, + "grad_norm": 1.2315172135495314, + "learning_rate": 1.9754269014648876e-05, + "loss": 1.6648, + "step": 739 + }, + { + "epoch": 0.09874566319722444, + "grad_norm": 1.267447733362674, + "learning_rate": 1.975331588730788e-05, + "loss": 1.706, + "step": 740 + }, + { + "epoch": 0.0988791032826261, + "grad_norm": 1.2625669177056562, + "learning_rate": 1.9752360938158522e-05, + "loss": 1.6826, + "step": 741 + }, + { + "epoch": 0.09901254336802776, + "grad_norm": 1.3204723066933017, + "learning_rate": 1.9751404167379175e-05, + "loss": 1.7471, + "step": 742 + }, + { + "epoch": 0.09914598345342941, + "grad_norm": 1.2156415623133245, + "learning_rate": 1.9750445575148557e-05, + "loss": 1.7028, + "step": 743 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 1.3803504882683986, + "learning_rate": 1.9749485161645715e-05, + "loss": 1.5917, + "step": 744 + }, + { + "epoch": 0.09941286362423273, + "grad_norm": 1.3249632567906648, + "learning_rate": 1.974852292705005e-05, + "loss": 1.7628, + "step": 745 + }, + { + "epoch": 0.09954630370963437, + "grad_norm": 1.2675581593990985, + "learning_rate": 1.974755887154129e-05, + "loss": 1.7161, + "step": 746 + }, + { + "epoch": 0.09967974379503602, + "grad_norm": 1.1665077868327602, + "learning_rate": 1.9746592995299515e-05, + "loss": 1.6748, + "step": 747 + }, + { + "epoch": 0.09981318388043768, + "grad_norm": 1.8639201171370587, + "learning_rate": 1.974562529850514e-05, + "loss": 1.697, + "step": 748 + }, + { + "epoch": 0.09994662396583934, + "grad_norm": 1.3196170477492044, + "learning_rate": 1.9744655781338913e-05, + "loss": 1.7133, + "step": 749 + }, + { + "epoch": 0.100080064051241, + "grad_norm": 1.2684243723887993, + "learning_rate": 1.9743684443981933e-05, + "loss": 1.7135, + "step": 750 + }, + { + "epoch": 0.10021350413664265, + "grad_norm": 1.1693884140497548, + "learning_rate": 1.9742711286615637e-05, + "loss": 1.7565, + "step": 751 + }, + { + "epoch": 0.10034694422204431, + "grad_norm": 1.2151625452030927, + "learning_rate": 1.97417363094218e-05, + "loss": 1.6717, + "step": 752 + }, + { + "epoch": 0.10048038430744596, + "grad_norm": 1.205035641499454, + "learning_rate": 1.974075951258253e-05, + "loss": 1.699, + "step": 753 + }, + { + "epoch": 0.1006138243928476, + "grad_norm": 1.547680875790279, + "learning_rate": 1.973978089628029e-05, + "loss": 1.7031, + 
"step": 754 + }, + { + "epoch": 0.10074726447824926, + "grad_norm": 1.5859192028100046, + "learning_rate": 1.9738800460697864e-05, + "loss": 1.7173, + "step": 755 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 1.5322336200687723, + "learning_rate": 1.9737818206018398e-05, + "loss": 1.6959, + "step": 756 + }, + { + "epoch": 0.10101414464905258, + "grad_norm": 1.2626433527759493, + "learning_rate": 1.9736834132425358e-05, + "loss": 1.7239, + "step": 757 + }, + { + "epoch": 0.10114758473445423, + "grad_norm": 1.505323027317607, + "learning_rate": 1.973584824010256e-05, + "loss": 1.6621, + "step": 758 + }, + { + "epoch": 0.10128102481985589, + "grad_norm": 1.286494859170549, + "learning_rate": 1.9734860529234158e-05, + "loss": 1.7242, + "step": 759 + }, + { + "epoch": 0.10141446490525755, + "grad_norm": 1.2306017189093845, + "learning_rate": 1.9733871000004643e-05, + "loss": 1.7357, + "step": 760 + }, + { + "epoch": 0.10154790499065919, + "grad_norm": 1.2061936583163555, + "learning_rate": 1.973287965259885e-05, + "loss": 1.7067, + "step": 761 + }, + { + "epoch": 0.10168134507606084, + "grad_norm": 1.2179611482678936, + "learning_rate": 1.9731886487201955e-05, + "loss": 1.685, + "step": 762 + }, + { + "epoch": 0.1018147851614625, + "grad_norm": 1.2683345257288643, + "learning_rate": 1.973089150399946e-05, + "loss": 1.6885, + "step": 763 + }, + { + "epoch": 0.10194822524686416, + "grad_norm": 1.2219892409057462, + "learning_rate": 1.9729894703177224e-05, + "loss": 1.6881, + "step": 764 + }, + { + "epoch": 0.10208166533226581, + "grad_norm": 1.458449697810168, + "learning_rate": 1.972889608492144e-05, + "loss": 1.7057, + "step": 765 + }, + { + "epoch": 0.10221510541766747, + "grad_norm": 1.3400740587143003, + "learning_rate": 1.972789564941863e-05, + "loss": 1.7171, + "step": 766 + }, + { + "epoch": 0.10234854550306913, + "grad_norm": 1.2400559343919726, + "learning_rate": 1.9726893396855675e-05, + "loss": 1.6838, + "step": 767 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 1.2051072425315403, + "learning_rate": 1.9725889327419773e-05, + "loss": 1.7223, + "step": 768 + }, + { + "epoch": 0.10261542567387243, + "grad_norm": 1.2802866462485152, + "learning_rate": 1.9724883441298482e-05, + "loss": 1.6311, + "step": 769 + }, + { + "epoch": 0.10274886575927408, + "grad_norm": 1.2328867984674214, + "learning_rate": 1.9723875738679684e-05, + "loss": 1.7446, + "step": 770 + }, + { + "epoch": 0.10288230584467574, + "grad_norm": 1.2341749669672337, + "learning_rate": 1.9722866219751606e-05, + "loss": 1.7008, + "step": 771 + }, + { + "epoch": 0.1030157459300774, + "grad_norm": 1.236991722122687, + "learning_rate": 1.972185488470282e-05, + "loss": 1.7141, + "step": 772 + }, + { + "epoch": 0.10314918601547905, + "grad_norm": 1.577844605519268, + "learning_rate": 1.9720841733722228e-05, + "loss": 1.7455, + "step": 773 + }, + { + "epoch": 0.10328262610088071, + "grad_norm": 1.3705708996585753, + "learning_rate": 1.9719826766999076e-05, + "loss": 1.7616, + "step": 774 + }, + { + "epoch": 0.10341606618628237, + "grad_norm": 1.221952982391811, + "learning_rate": 1.971880998472295e-05, + "loss": 1.7413, + "step": 775 + }, + { + "epoch": 0.10354950627168401, + "grad_norm": 1.537181900251082, + "learning_rate": 1.9717791387083772e-05, + "loss": 1.7011, + "step": 776 + }, + { + "epoch": 0.10368294635708566, + "grad_norm": 1.2027484662414312, + "learning_rate": 1.9716770974271803e-05, + "loss": 1.6332, + "step": 777 + }, + { + "epoch": 0.10381638644248732, + "grad_norm": 1.253964108899184, + 
"learning_rate": 1.9715748746477644e-05, + "loss": 1.7075, + "step": 778 + }, + { + "epoch": 0.10394982652788898, + "grad_norm": 1.309242214190543, + "learning_rate": 1.9714724703892238e-05, + "loss": 1.7086, + "step": 779 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 1.1977182401997006, + "learning_rate": 1.9713698846706865e-05, + "loss": 1.6517, + "step": 780 + }, + { + "epoch": 0.10421670669869229, + "grad_norm": 6.937492769576936, + "learning_rate": 1.971267117511314e-05, + "loss": 1.7718, + "step": 781 + }, + { + "epoch": 0.10435014678409395, + "grad_norm": 1.7070387484387968, + "learning_rate": 1.9711641689303024e-05, + "loss": 1.6522, + "step": 782 + }, + { + "epoch": 0.10448358686949559, + "grad_norm": 1.407052136214357, + "learning_rate": 1.971061038946881e-05, + "loss": 1.6851, + "step": 783 + }, + { + "epoch": 0.10461702695489725, + "grad_norm": 1.5040283593232824, + "learning_rate": 1.970957727580314e-05, + "loss": 1.64, + "step": 784 + }, + { + "epoch": 0.1047504670402989, + "grad_norm": 1.3258063153724626, + "learning_rate": 1.9708542348498975e-05, + "loss": 1.6948, + "step": 785 + }, + { + "epoch": 0.10488390712570056, + "grad_norm": 1.288600009166478, + "learning_rate": 1.970750560774964e-05, + "loss": 1.7193, + "step": 786 + }, + { + "epoch": 0.10501734721110222, + "grad_norm": 1.4250574606731519, + "learning_rate": 1.9706467053748782e-05, + "loss": 1.7096, + "step": 787 + }, + { + "epoch": 0.10515078729650387, + "grad_norm": 1.5215309159514565, + "learning_rate": 1.9705426686690387e-05, + "loss": 1.6971, + "step": 788 + }, + { + "epoch": 0.10528422738190553, + "grad_norm": 1.3671118437746754, + "learning_rate": 1.9704384506768788e-05, + "loss": 1.7509, + "step": 789 + }, + { + "epoch": 0.10541766746730719, + "grad_norm": 1.2902319309239332, + "learning_rate": 1.9703340514178656e-05, + "loss": 1.7539, + "step": 790 + }, + { + "epoch": 0.10555110755270883, + "grad_norm": 1.2681335491850911, + "learning_rate": 1.9702294709114987e-05, + "loss": 1.7117, + "step": 791 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 1.2418617832342838, + "learning_rate": 1.9701247091773135e-05, + "loss": 1.7345, + "step": 792 + }, + { + "epoch": 0.10581798772351214, + "grad_norm": 1.2895790697871072, + "learning_rate": 1.9700197662348777e-05, + "loss": 1.6623, + "step": 793 + }, + { + "epoch": 0.1059514278089138, + "grad_norm": 1.2684357393175132, + "learning_rate": 1.9699146421037935e-05, + "loss": 1.6487, + "step": 794 + }, + { + "epoch": 0.10608486789431545, + "grad_norm": 1.2423605419736183, + "learning_rate": 1.9698093368036976e-05, + "loss": 1.7129, + "step": 795 + }, + { + "epoch": 0.10621830797971711, + "grad_norm": 1.4752327188086216, + "learning_rate": 1.969703850354259e-05, + "loss": 1.6892, + "step": 796 + }, + { + "epoch": 0.10635174806511877, + "grad_norm": 1.4526839181300373, + "learning_rate": 1.9695981827751815e-05, + "loss": 1.7072, + "step": 797 + }, + { + "epoch": 0.10648518815052041, + "grad_norm": 1.285791119441606, + "learning_rate": 1.969492334086203e-05, + "loss": 1.7122, + "step": 798 + }, + { + "epoch": 0.10661862823592207, + "grad_norm": 1.2256787041306045, + "learning_rate": 1.9693863043070944e-05, + "loss": 1.6888, + "step": 799 + }, + { + "epoch": 0.10675206832132372, + "grad_norm": 1.228951068308044, + "learning_rate": 1.9692800934576607e-05, + "loss": 1.727, + "step": 800 + }, + { + "epoch": 0.10688550840672538, + "grad_norm": 1.352118087145811, + "learning_rate": 1.9691737015577418e-05, + "loss": 1.7125, + "step": 801 + }, + { + "epoch": 
0.10701894849212704, + "grad_norm": 1.21157400011724, + "learning_rate": 1.96906712862721e-05, + "loss": 1.6923, + "step": 802 + }, + { + "epoch": 0.10715238857752869, + "grad_norm": 1.3003550717703736, + "learning_rate": 1.9689603746859714e-05, + "loss": 1.6511, + "step": 803 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 1.1564673283657656, + "learning_rate": 1.9688534397539666e-05, + "loss": 1.6726, + "step": 804 + }, + { + "epoch": 0.107419268748332, + "grad_norm": 1.2730332287589288, + "learning_rate": 1.9687463238511704e-05, + "loss": 1.6618, + "step": 805 + }, + { + "epoch": 0.10755270883373365, + "grad_norm": 1.3800666144026246, + "learning_rate": 1.9686390269975907e-05, + "loss": 1.7154, + "step": 806 + }, + { + "epoch": 0.1076861489191353, + "grad_norm": 1.2048204552645487, + "learning_rate": 1.968531549213269e-05, + "loss": 1.6448, + "step": 807 + }, + { + "epoch": 0.10781958900453696, + "grad_norm": 1.3937316868921632, + "learning_rate": 1.9684238905182807e-05, + "loss": 1.729, + "step": 808 + }, + { + "epoch": 0.10795302908993862, + "grad_norm": 1.3900561665275548, + "learning_rate": 1.968316050932736e-05, + "loss": 1.6828, + "step": 809 + }, + { + "epoch": 0.10808646917534027, + "grad_norm": 1.6114246988977743, + "learning_rate": 1.9682080304767775e-05, + "loss": 1.6792, + "step": 810 + }, + { + "epoch": 0.10821990926074193, + "grad_norm": 1.197150037360335, + "learning_rate": 1.9680998291705822e-05, + "loss": 1.6617, + "step": 811 + }, + { + "epoch": 0.10835334934614359, + "grad_norm": 1.2177958885989686, + "learning_rate": 1.9679914470343615e-05, + "loss": 1.7133, + "step": 812 + }, + { + "epoch": 0.10848678943154523, + "grad_norm": 1.494320786513006, + "learning_rate": 1.9678828840883592e-05, + "loss": 1.7004, + "step": 813 + }, + { + "epoch": 0.10862022951694689, + "grad_norm": 1.1694521285820114, + "learning_rate": 1.9677741403528538e-05, + "loss": 1.6699, + "step": 814 + }, + { + "epoch": 0.10875366960234854, + "grad_norm": 1.1962398552992284, + "learning_rate": 1.967665215848158e-05, + "loss": 1.6874, + "step": 815 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 1.5912303268847496, + "learning_rate": 1.9675561105946165e-05, + "loss": 1.6643, + "step": 816 + }, + { + "epoch": 0.10902054977315186, + "grad_norm": 1.2975078209899873, + "learning_rate": 1.96744682461261e-05, + "loss": 1.6789, + "step": 817 + }, + { + "epoch": 0.10915398985855351, + "grad_norm": 1.2145763409844237, + "learning_rate": 1.9673373579225514e-05, + "loss": 1.6771, + "step": 818 + }, + { + "epoch": 0.10928742994395517, + "grad_norm": 1.493883539659001, + "learning_rate": 1.9672277105448878e-05, + "loss": 1.6724, + "step": 819 + }, + { + "epoch": 0.10942087002935681, + "grad_norm": 1.644683081505485, + "learning_rate": 1.9671178825001002e-05, + "loss": 1.6167, + "step": 820 + }, + { + "epoch": 0.10955431011475847, + "grad_norm": 1.2719900635243897, + "learning_rate": 1.967007873808703e-05, + "loss": 1.6765, + "step": 821 + }, + { + "epoch": 0.10968775020016013, + "grad_norm": 1.2528047543957326, + "learning_rate": 1.966897684491245e-05, + "loss": 1.7102, + "step": 822 + }, + { + "epoch": 0.10982119028556178, + "grad_norm": 1.2419647024115608, + "learning_rate": 1.9667873145683082e-05, + "loss": 1.7077, + "step": 823 + }, + { + "epoch": 0.10995463037096344, + "grad_norm": 1.4501286698451352, + "learning_rate": 1.966676764060508e-05, + "loss": 1.6994, + "step": 824 + }, + { + "epoch": 0.1100880704563651, + "grad_norm": 1.3630120265958166, + "learning_rate": 1.9665660329884944e-05, + 
"loss": 1.6926, + "step": 825 + }, + { + "epoch": 0.11022151054176675, + "grad_norm": 1.460903039980999, + "learning_rate": 1.966455121372951e-05, + "loss": 1.7103, + "step": 826 + }, + { + "epoch": 0.11035495062716841, + "grad_norm": 1.5274078789068972, + "learning_rate": 1.9663440292345942e-05, + "loss": 1.6988, + "step": 827 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 1.2090669145048123, + "learning_rate": 1.9662327565941747e-05, + "loss": 1.6496, + "step": 828 + }, + { + "epoch": 0.1106218307979717, + "grad_norm": 1.2269483421344651, + "learning_rate": 1.9661213034724776e-05, + "loss": 1.7122, + "step": 829 + }, + { + "epoch": 0.11075527088337336, + "grad_norm": 1.4069578940447711, + "learning_rate": 1.9660096698903203e-05, + "loss": 1.7269, + "step": 830 + }, + { + "epoch": 0.11088871096877502, + "grad_norm": 1.2037399973430059, + "learning_rate": 1.9658978558685557e-05, + "loss": 1.7083, + "step": 831 + }, + { + "epoch": 0.11102215105417668, + "grad_norm": 1.2631008462452196, + "learning_rate": 1.9657858614280682e-05, + "loss": 1.6926, + "step": 832 + }, + { + "epoch": 0.11115559113957833, + "grad_norm": 1.3642794324735983, + "learning_rate": 1.965673686589778e-05, + "loss": 1.7298, + "step": 833 + }, + { + "epoch": 0.11128903122497999, + "grad_norm": 1.7518036471850427, + "learning_rate": 1.9655613313746378e-05, + "loss": 1.669, + "step": 834 + }, + { + "epoch": 0.11142247131038163, + "grad_norm": 1.3194015458910031, + "learning_rate": 1.965448795803634e-05, + "loss": 1.6799, + "step": 835 + }, + { + "epoch": 0.11155591139578329, + "grad_norm": 1.2408206570528904, + "learning_rate": 1.9653360798977872e-05, + "loss": 1.7045, + "step": 836 + }, + { + "epoch": 0.11168935148118495, + "grad_norm": 1.255528423044181, + "learning_rate": 1.9652231836781514e-05, + "loss": 1.6847, + "step": 837 + }, + { + "epoch": 0.1118227915665866, + "grad_norm": 1.5892490809881583, + "learning_rate": 1.965110107165815e-05, + "loss": 1.6491, + "step": 838 + }, + { + "epoch": 0.11195623165198826, + "grad_norm": 1.117194657692538, + "learning_rate": 1.964996850381898e-05, + "loss": 1.6832, + "step": 839 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 1.3292775964235752, + "learning_rate": 1.9648834133475564e-05, + "loss": 1.7325, + "step": 840 + }, + { + "epoch": 0.11222311182279157, + "grad_norm": 1.2706968194762298, + "learning_rate": 1.964769796083979e-05, + "loss": 1.7105, + "step": 841 + }, + { + "epoch": 0.11235655190819323, + "grad_norm": 1.1622186343437864, + "learning_rate": 1.964655998612388e-05, + "loss": 1.6583, + "step": 842 + }, + { + "epoch": 0.11248999199359487, + "grad_norm": 1.2930179210511994, + "learning_rate": 1.9645420209540394e-05, + "loss": 1.728, + "step": 843 + }, + { + "epoch": 0.11262343207899653, + "grad_norm": 1.3384278372678646, + "learning_rate": 1.964427863130223e-05, + "loss": 1.7889, + "step": 844 + }, + { + "epoch": 0.11275687216439818, + "grad_norm": 1.3828255437162351, + "learning_rate": 1.964313525162262e-05, + "loss": 1.6834, + "step": 845 + }, + { + "epoch": 0.11289031224979984, + "grad_norm": 1.151410588671105, + "learning_rate": 1.964199007071514e-05, + "loss": 1.7299, + "step": 846 + }, + { + "epoch": 0.1130237523352015, + "grad_norm": 1.1440279928005437, + "learning_rate": 1.9640843088793692e-05, + "loss": 1.7109, + "step": 847 + }, + { + "epoch": 0.11315719242060315, + "grad_norm": 1.4236141452119535, + "learning_rate": 1.9639694306072518e-05, + "loss": 1.7074, + "step": 848 + }, + { + "epoch": 0.11329063250600481, + "grad_norm": 
1.2106461953286103, + "learning_rate": 1.96385437227662e-05, + "loss": 1.7274, + "step": 849 + }, + { + "epoch": 0.11342407259140645, + "grad_norm": 1.1556803406122926, + "learning_rate": 1.9637391339089655e-05, + "loss": 1.6884, + "step": 850 + }, + { + "epoch": 0.11355751267680811, + "grad_norm": 4.314358844478156, + "learning_rate": 1.9636237155258132e-05, + "loss": 1.7116, + "step": 851 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 1.3854953599475233, + "learning_rate": 1.9635081171487223e-05, + "loss": 1.7502, + "step": 852 + }, + { + "epoch": 0.11382439284761142, + "grad_norm": 1.3815136446017762, + "learning_rate": 1.9633923387992852e-05, + "loss": 1.6506, + "step": 853 + }, + { + "epoch": 0.11395783293301308, + "grad_norm": 1.2593646207844973, + "learning_rate": 1.9632763804991275e-05, + "loss": 1.6633, + "step": 854 + }, + { + "epoch": 0.11409127301841474, + "grad_norm": 1.1318090116685733, + "learning_rate": 1.9631602422699093e-05, + "loss": 1.6382, + "step": 855 + }, + { + "epoch": 0.11422471310381639, + "grad_norm": 1.523834563895417, + "learning_rate": 1.963043924133324e-05, + "loss": 1.7173, + "step": 856 + }, + { + "epoch": 0.11435815318921803, + "grad_norm": 1.369606548099637, + "learning_rate": 1.962927426111098e-05, + "loss": 1.7182, + "step": 857 + }, + { + "epoch": 0.11449159327461969, + "grad_norm": 1.2489405584762354, + "learning_rate": 1.9628107482249926e-05, + "loss": 1.6701, + "step": 858 + }, + { + "epoch": 0.11462503336002135, + "grad_norm": 1.3750560913985916, + "learning_rate": 1.9626938904968013e-05, + "loss": 1.725, + "step": 859 + }, + { + "epoch": 0.114758473445423, + "grad_norm": 1.3689472702896512, + "learning_rate": 1.962576852948352e-05, + "loss": 1.6338, + "step": 860 + }, + { + "epoch": 0.11489191353082466, + "grad_norm": 1.5090068885201111, + "learning_rate": 1.9624596356015057e-05, + "loss": 1.7569, + "step": 861 + }, + { + "epoch": 0.11502535361622632, + "grad_norm": 1.1857397963469596, + "learning_rate": 1.9623422384781575e-05, + "loss": 1.689, + "step": 862 + }, + { + "epoch": 0.11515879370162797, + "grad_norm": 4.383590101850042, + "learning_rate": 1.9622246616002362e-05, + "loss": 1.7869, + "step": 863 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 1.6301360882052789, + "learning_rate": 1.9621069049897026e-05, + "loss": 1.7169, + "step": 864 + }, + { + "epoch": 0.11542567387243127, + "grad_norm": 1.7846551885225532, + "learning_rate": 1.961988968668554e-05, + "loss": 1.731, + "step": 865 + }, + { + "epoch": 0.11555911395783293, + "grad_norm": 1.576969047359226, + "learning_rate": 1.9618708526588187e-05, + "loss": 1.7073, + "step": 866 + }, + { + "epoch": 0.11569255404323459, + "grad_norm": 1.4423014187494878, + "learning_rate": 1.961752556982559e-05, + "loss": 1.6984, + "step": 867 + }, + { + "epoch": 0.11582599412863624, + "grad_norm": 1.411158314477759, + "learning_rate": 1.9616340816618718e-05, + "loss": 1.6689, + "step": 868 + }, + { + "epoch": 0.1159594342140379, + "grad_norm": 1.2615885604648809, + "learning_rate": 1.9615154267188865e-05, + "loss": 1.7033, + "step": 869 + }, + { + "epoch": 0.11609287429943956, + "grad_norm": 1.3497153430677546, + "learning_rate": 1.9613965921757672e-05, + "loss": 1.7052, + "step": 870 + }, + { + "epoch": 0.11622631438484121, + "grad_norm": 1.3238657405928431, + "learning_rate": 1.96127757805471e-05, + "loss": 1.6803, + "step": 871 + }, + { + "epoch": 0.11635975447024285, + "grad_norm": 1.4113872862910883, + "learning_rate": 1.961158384377946e-05, + "loss": 1.6832, + "step": 872 + }, + { 
+ "epoch": 0.11649319455564451, + "grad_norm": 1.1657735990335347, + "learning_rate": 1.9610390111677388e-05, + "loss": 1.6761, + "step": 873 + }, + { + "epoch": 0.11662663464104617, + "grad_norm": 1.4288925931679666, + "learning_rate": 1.9609194584463866e-05, + "loss": 1.7142, + "step": 874 + }, + { + "epoch": 0.11676007472644782, + "grad_norm": 1.3482735605608565, + "learning_rate": 1.9607997262362195e-05, + "loss": 1.7664, + "step": 875 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 1.2758965299420557, + "learning_rate": 1.960679814559603e-05, + "loss": 1.7643, + "step": 876 + }, + { + "epoch": 0.11702695489725114, + "grad_norm": 2.091988370893863, + "learning_rate": 1.9605597234389347e-05, + "loss": 1.6968, + "step": 877 + }, + { + "epoch": 0.1171603949826528, + "grad_norm": 1.7213048905302604, + "learning_rate": 1.9604394528966467e-05, + "loss": 1.6905, + "step": 878 + }, + { + "epoch": 0.11729383506805445, + "grad_norm": 1.3791112926314404, + "learning_rate": 1.9603190029552036e-05, + "loss": 1.6883, + "step": 879 + }, + { + "epoch": 0.11742727515345609, + "grad_norm": 1.3014168835710302, + "learning_rate": 1.9601983736371047e-05, + "loss": 1.747, + "step": 880 + }, + { + "epoch": 0.11756071523885775, + "grad_norm": 1.2352469046555101, + "learning_rate": 1.9600775649648818e-05, + "loss": 1.663, + "step": 881 + }, + { + "epoch": 0.1176941553242594, + "grad_norm": 1.3042902670559662, + "learning_rate": 1.9599565769611004e-05, + "loss": 1.6927, + "step": 882 + }, + { + "epoch": 0.11782759540966106, + "grad_norm": 1.3814487920433733, + "learning_rate": 1.95983540964836e-05, + "loss": 1.6648, + "step": 883 + }, + { + "epoch": 0.11796103549506272, + "grad_norm": 1.192223516720673, + "learning_rate": 1.9597140630492934e-05, + "loss": 1.6921, + "step": 884 + }, + { + "epoch": 0.11809447558046438, + "grad_norm": 1.2034614588136634, + "learning_rate": 1.9595925371865667e-05, + "loss": 1.6936, + "step": 885 + }, + { + "epoch": 0.11822791566586603, + "grad_norm": 1.1538622984795832, + "learning_rate": 1.959470832082879e-05, + "loss": 1.6821, + "step": 886 + }, + { + "epoch": 0.11836135575126767, + "grad_norm": 1.2580439984544938, + "learning_rate": 1.9593489477609646e-05, + "loss": 1.7528, + "step": 887 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 1.4294495361334751, + "learning_rate": 1.9592268842435884e-05, + "loss": 1.7465, + "step": 888 + }, + { + "epoch": 0.11862823592207099, + "grad_norm": 1.257681366416578, + "learning_rate": 1.959104641553552e-05, + "loss": 1.6859, + "step": 889 + }, + { + "epoch": 0.11876167600747264, + "grad_norm": 1.2143693832991094, + "learning_rate": 1.958982219713688e-05, + "loss": 1.7008, + "step": 890 + }, + { + "epoch": 0.1188951160928743, + "grad_norm": 1.2891059442807018, + "learning_rate": 1.9588596187468642e-05, + "loss": 1.7103, + "step": 891 + }, + { + "epoch": 0.11902855617827596, + "grad_norm": 1.2311129858964251, + "learning_rate": 1.9587368386759805e-05, + "loss": 1.714, + "step": 892 + }, + { + "epoch": 0.11916199626367761, + "grad_norm": 1.2118946144194225, + "learning_rate": 1.9586138795239708e-05, + "loss": 1.6704, + "step": 893 + }, + { + "epoch": 0.11929543634907927, + "grad_norm": 1.159008481780571, + "learning_rate": 1.9584907413138028e-05, + "loss": 1.6918, + "step": 894 + }, + { + "epoch": 0.11942887643448091, + "grad_norm": 1.2289280001259455, + "learning_rate": 1.9583674240684774e-05, + "loss": 1.7155, + "step": 895 + }, + { + "epoch": 0.11956231651988257, + "grad_norm": 1.2482207680648996, + "learning_rate": 
1.9582439278110282e-05, + "loss": 1.6881, + "step": 896 + }, + { + "epoch": 0.11969575660528423, + "grad_norm": 1.435525566789562, + "learning_rate": 1.9581202525645232e-05, + "loss": 1.6873, + "step": 897 + }, + { + "epoch": 0.11982919669068588, + "grad_norm": 1.3060438169865851, + "learning_rate": 1.957996398352064e-05, + "loss": 1.6797, + "step": 898 + }, + { + "epoch": 0.11996263677608754, + "grad_norm": 1.2737667444400789, + "learning_rate": 1.9578723651967845e-05, + "loss": 1.6942, + "step": 899 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 1.6807278602860825, + "learning_rate": 1.957748153121853e-05, + "loss": 1.693, + "step": 900 + }, + { + "epoch": 0.12022951694689085, + "grad_norm": 1.2626260309636503, + "learning_rate": 1.957623762150471e-05, + "loss": 1.6766, + "step": 901 + }, + { + "epoch": 0.1203629570322925, + "grad_norm": 1.6130682344451919, + "learning_rate": 1.9574991923058735e-05, + "loss": 1.7133, + "step": 902 + }, + { + "epoch": 0.12049639711769415, + "grad_norm": 1.1868637434354896, + "learning_rate": 1.957374443611328e-05, + "loss": 1.719, + "step": 903 + }, + { + "epoch": 0.12062983720309581, + "grad_norm": 1.1629455637439938, + "learning_rate": 1.9572495160901365e-05, + "loss": 1.6718, + "step": 904 + }, + { + "epoch": 0.12076327728849746, + "grad_norm": 1.2126155193246908, + "learning_rate": 1.9571244097656343e-05, + "loss": 1.6731, + "step": 905 + }, + { + "epoch": 0.12089671737389912, + "grad_norm": 1.3426037774841502, + "learning_rate": 1.9569991246611897e-05, + "loss": 1.7082, + "step": 906 + }, + { + "epoch": 0.12103015745930078, + "grad_norm": 1.237669168766994, + "learning_rate": 1.9568736608002045e-05, + "loss": 1.698, + "step": 907 + }, + { + "epoch": 0.12116359754470243, + "grad_norm": 1.2019255673482718, + "learning_rate": 1.9567480182061134e-05, + "loss": 1.7086, + "step": 908 + }, + { + "epoch": 0.12129703763010408, + "grad_norm": 1.4253857823504195, + "learning_rate": 1.956622196902386e-05, + "loss": 1.7049, + "step": 909 + }, + { + "epoch": 0.12143047771550573, + "grad_norm": 1.681519114291726, + "learning_rate": 1.9564961969125235e-05, + "loss": 1.6735, + "step": 910 + }, + { + "epoch": 0.12156391780090739, + "grad_norm": 1.2989313363354473, + "learning_rate": 1.9563700182600612e-05, + "loss": 1.7477, + "step": 911 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 1.1509458671490507, + "learning_rate": 1.9562436609685685e-05, + "loss": 1.7121, + "step": 912 + }, + { + "epoch": 0.1218307979717107, + "grad_norm": 1.234465309616413, + "learning_rate": 1.9561171250616472e-05, + "loss": 1.7032, + "step": 913 + }, + { + "epoch": 0.12196423805711236, + "grad_norm": 1.286630945624901, + "learning_rate": 1.955990410562933e-05, + "loss": 1.6834, + "step": 914 + }, + { + "epoch": 0.12209767814251402, + "grad_norm": 1.2427074764646417, + "learning_rate": 1.9558635174960942e-05, + "loss": 1.6784, + "step": 915 + }, + { + "epoch": 0.12223111822791567, + "grad_norm": 1.2023142218836853, + "learning_rate": 1.9557364458848334e-05, + "loss": 1.6907, + "step": 916 + }, + { + "epoch": 0.12236455831331731, + "grad_norm": 1.1692669558611244, + "learning_rate": 1.955609195752886e-05, + "loss": 1.6604, + "step": 917 + }, + { + "epoch": 0.12249799839871897, + "grad_norm": 1.441083824408625, + "learning_rate": 1.9554817671240205e-05, + "loss": 1.7268, + "step": 918 + }, + { + "epoch": 0.12263143848412063, + "grad_norm": 1.1637128804063606, + "learning_rate": 1.9553541600220395e-05, + "loss": 1.7105, + "step": 919 + }, + { + "epoch": 0.12276487856952228, + 
"grad_norm": 1.1948097801622213, + "learning_rate": 1.955226374470779e-05, + "loss": 1.6994, + "step": 920 + }, + { + "epoch": 0.12289831865492394, + "grad_norm": 1.19037198249763, + "learning_rate": 1.955098410494107e-05, + "loss": 1.6805, + "step": 921 + }, + { + "epoch": 0.1230317587403256, + "grad_norm": 1.1965278157578167, + "learning_rate": 1.954970268115926e-05, + "loss": 1.7126, + "step": 922 + }, + { + "epoch": 0.12316519882572725, + "grad_norm": 12.043177426547418, + "learning_rate": 1.954841947360172e-05, + "loss": 1.6847, + "step": 923 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 1.4171849613590173, + "learning_rate": 1.9547134482508135e-05, + "loss": 1.6773, + "step": 924 + }, + { + "epoch": 0.12343207899653055, + "grad_norm": 1.277439153377438, + "learning_rate": 1.9545847708118524e-05, + "loss": 1.654, + "step": 925 + }, + { + "epoch": 0.12356551908193221, + "grad_norm": 1.5344343949821269, + "learning_rate": 1.954455915067325e-05, + "loss": 1.714, + "step": 926 + }, + { + "epoch": 0.12369895916733387, + "grad_norm": 1.3273336556361204, + "learning_rate": 1.954326881041299e-05, + "loss": 1.7272, + "step": 927 + }, + { + "epoch": 0.12383239925273552, + "grad_norm": 1.2408609710996044, + "learning_rate": 1.9541976687578773e-05, + "loss": 1.7073, + "step": 928 + }, + { + "epoch": 0.12396583933813718, + "grad_norm": 1.3534841024628121, + "learning_rate": 1.954068278241195e-05, + "loss": 1.7464, + "step": 929 + }, + { + "epoch": 0.12409927942353884, + "grad_norm": 1.8710842856544097, + "learning_rate": 1.9539387095154207e-05, + "loss": 1.6445, + "step": 930 + }, + { + "epoch": 0.12423271950894049, + "grad_norm": 1.3643772964473504, + "learning_rate": 1.9538089626047566e-05, + "loss": 1.7005, + "step": 931 + }, + { + "epoch": 0.12436615959434213, + "grad_norm": 1.2248623778543628, + "learning_rate": 1.953679037533438e-05, + "loss": 1.6574, + "step": 932 + }, + { + "epoch": 0.12449959967974379, + "grad_norm": 1.3022000288924167, + "learning_rate": 1.9535489343257326e-05, + "loss": 1.7176, + "step": 933 + }, + { + "epoch": 0.12463303976514545, + "grad_norm": 1.4725976531996385, + "learning_rate": 1.9534186530059434e-05, + "loss": 1.6851, + "step": 934 + }, + { + "epoch": 0.1247664798505471, + "grad_norm": 1.2400512724415127, + "learning_rate": 1.9532881935984046e-05, + "loss": 1.688, + "step": 935 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 1.4145930106236353, + "learning_rate": 1.9531575561274852e-05, + "loss": 1.6835, + "step": 936 + }, + { + "epoch": 0.1250333600213504, + "grad_norm": 1.3793980244786415, + "learning_rate": 1.953026740617586e-05, + "loss": 1.7196, + "step": 937 + }, + { + "epoch": 0.12516680010675207, + "grad_norm": 13.216726991500538, + "learning_rate": 1.9528957470931424e-05, + "loss": 1.7525, + "step": 938 + }, + { + "epoch": 0.12530024019215372, + "grad_norm": 1.3237742593449273, + "learning_rate": 1.9527645755786224e-05, + "loss": 1.6883, + "step": 939 + }, + { + "epoch": 0.1254336802775554, + "grad_norm": 1.445191972771836, + "learning_rate": 1.9526332260985275e-05, + "loss": 1.6686, + "step": 940 + }, + { + "epoch": 0.12556712036295703, + "grad_norm": 1.5185282218871532, + "learning_rate": 1.952501698677392e-05, + "loss": 1.6581, + "step": 941 + }, + { + "epoch": 0.1257005604483587, + "grad_norm": 1.3124983698296924, + "learning_rate": 1.9523699933397834e-05, + "loss": 1.6544, + "step": 942 + }, + { + "epoch": 0.12583400053376034, + "grad_norm": 1.183777382042402, + "learning_rate": 1.9522381101103038e-05, + "loss": 1.7612, + "step": 
943 + }, + { + "epoch": 0.12596744061916199, + "grad_norm": 1.2415393099346195, + "learning_rate": 1.9521060490135865e-05, + "loss": 1.7291, + "step": 944 + }, + { + "epoch": 0.12610088070456366, + "grad_norm": 1.176856792014906, + "learning_rate": 1.9519738100742995e-05, + "loss": 1.6638, + "step": 945 + }, + { + "epoch": 0.1262343207899653, + "grad_norm": 1.1487711372941813, + "learning_rate": 1.9518413933171432e-05, + "loss": 1.6464, + "step": 946 + }, + { + "epoch": 0.12636776087536697, + "grad_norm": 1.4297554387238383, + "learning_rate": 1.9517087987668522e-05, + "loss": 1.6069, + "step": 947 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 1.31215843034171, + "learning_rate": 1.951576026448193e-05, + "loss": 1.689, + "step": 948 + }, + { + "epoch": 0.12663464104617028, + "grad_norm": 1.1689420027444986, + "learning_rate": 1.951443076385966e-05, + "loss": 1.6715, + "step": 949 + }, + { + "epoch": 0.12676808113157192, + "grad_norm": 1.680542791799617, + "learning_rate": 1.951309948605005e-05, + "loss": 1.6546, + "step": 950 + }, + { + "epoch": 0.12690152121697357, + "grad_norm": 1.1797862555326082, + "learning_rate": 1.9511766431301766e-05, + "loss": 1.7215, + "step": 951 + }, + { + "epoch": 0.12703496130237524, + "grad_norm": 1.5427071574060558, + "learning_rate": 1.951043159986381e-05, + "loss": 1.6322, + "step": 952 + }, + { + "epoch": 0.12716840138777688, + "grad_norm": 1.1779500188493681, + "learning_rate": 1.950909499198551e-05, + "loss": 1.6812, + "step": 953 + }, + { + "epoch": 0.12730184147317855, + "grad_norm": 1.1913258200020012, + "learning_rate": 1.950775660791653e-05, + "loss": 1.6755, + "step": 954 + }, + { + "epoch": 0.1274352815585802, + "grad_norm": 1.232490173323732, + "learning_rate": 1.950641644790687e-05, + "loss": 1.7031, + "step": 955 + }, + { + "epoch": 0.12756872164398186, + "grad_norm": 1.327617648590312, + "learning_rate": 1.9505074512206847e-05, + "loss": 1.6455, + "step": 956 + }, + { + "epoch": 0.1277021617293835, + "grad_norm": 1.1740821011806148, + "learning_rate": 1.9503730801067125e-05, + "loss": 1.6913, + "step": 957 + }, + { + "epoch": 0.12783560181478515, + "grad_norm": 1.3812646322593256, + "learning_rate": 1.9502385314738697e-05, + "loss": 1.6215, + "step": 958 + }, + { + "epoch": 0.12796904190018682, + "grad_norm": 1.3155996801138305, + "learning_rate": 1.9501038053472877e-05, + "loss": 1.6518, + "step": 959 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 1.3349048294702728, + "learning_rate": 1.9499689017521323e-05, + "loss": 1.6609, + "step": 960 + }, + { + "epoch": 0.12823592207099013, + "grad_norm": 1.2671901915804749, + "learning_rate": 1.9498338207136022e-05, + "loss": 1.725, + "step": 961 + }, + { + "epoch": 0.12836936215639178, + "grad_norm": 1.1359327570816697, + "learning_rate": 1.949698562256928e-05, + "loss": 1.6597, + "step": 962 + }, + { + "epoch": 0.12850280224179345, + "grad_norm": 1.2333203786654017, + "learning_rate": 1.9495631264073758e-05, + "loss": 1.6659, + "step": 963 + }, + { + "epoch": 0.1286362423271951, + "grad_norm": 1.2554849794053347, + "learning_rate": 1.9494275131902423e-05, + "loss": 1.6749, + "step": 964 + }, + { + "epoch": 0.12876968241259673, + "grad_norm": 1.3230800031319025, + "learning_rate": 1.949291722630859e-05, + "loss": 1.6923, + "step": 965 + }, + { + "epoch": 0.1289031224979984, + "grad_norm": 2.1623618851588327, + "learning_rate": 1.9491557547545903e-05, + "loss": 1.6905, + "step": 966 + }, + { + "epoch": 0.12903656258340004, + "grad_norm": 1.3456119660809565, + "learning_rate": 
1.9490196095868328e-05, + "loss": 1.7063, + "step": 967 + }, + { + "epoch": 0.12917000266880171, + "grad_norm": 1.600288717290206, + "learning_rate": 1.9488832871530173e-05, + "loss": 1.7006, + "step": 968 + }, + { + "epoch": 0.12930344275420336, + "grad_norm": 1.3672986818133261, + "learning_rate": 1.9487467874786076e-05, + "loss": 1.7197, + "step": 969 + }, + { + "epoch": 0.12943688283960503, + "grad_norm": 1.2760213827762075, + "learning_rate": 1.9486101105890993e-05, + "loss": 1.6708, + "step": 970 + }, + { + "epoch": 0.12957032292500667, + "grad_norm": 1.393135659234334, + "learning_rate": 1.948473256510023e-05, + "loss": 1.6924, + "step": 971 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 1.367984927030445, + "learning_rate": 1.948336225266941e-05, + "loss": 1.7087, + "step": 972 + }, + { + "epoch": 0.12983720309580998, + "grad_norm": 1.2953603648125673, + "learning_rate": 1.9481990168854494e-05, + "loss": 1.7151, + "step": 973 + }, + { + "epoch": 0.12997064318121163, + "grad_norm": 1.167269554903983, + "learning_rate": 1.9480616313911774e-05, + "loss": 1.6787, + "step": 974 + }, + { + "epoch": 0.1301040832666133, + "grad_norm": 1.246260379852551, + "learning_rate": 1.9479240688097864e-05, + "loss": 1.7456, + "step": 975 + }, + { + "epoch": 0.13023752335201494, + "grad_norm": 1.2725941500644802, + "learning_rate": 1.9477863291669718e-05, + "loss": 1.6552, + "step": 976 + }, + { + "epoch": 0.1303709634374166, + "grad_norm": 1.2084106713506604, + "learning_rate": 1.9476484124884622e-05, + "loss": 1.6639, + "step": 977 + }, + { + "epoch": 0.13050440352281825, + "grad_norm": 1.194547259427426, + "learning_rate": 1.9475103188000183e-05, + "loss": 1.6877, + "step": 978 + }, + { + "epoch": 0.13063784360821992, + "grad_norm": 1.2780352392355023, + "learning_rate": 1.9473720481274347e-05, + "loss": 1.6605, + "step": 979 + }, + { + "epoch": 0.13077128369362157, + "grad_norm": 1.5045389202002308, + "learning_rate": 1.9472336004965387e-05, + "loss": 1.7074, + "step": 980 + }, + { + "epoch": 0.1309047237790232, + "grad_norm": 1.2421804342522536, + "learning_rate": 1.947094975933191e-05, + "loss": 1.686, + "step": 981 + }, + { + "epoch": 0.13103816386442488, + "grad_norm": 1.1426071394085213, + "learning_rate": 1.9469561744632845e-05, + "loss": 1.6592, + "step": 982 + }, + { + "epoch": 0.13117160394982652, + "grad_norm": 1.6999618031943038, + "learning_rate": 1.9468171961127464e-05, + "loss": 1.7108, + "step": 983 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 1.2427085932088913, + "learning_rate": 1.946678040907536e-05, + "loss": 1.657, + "step": 984 + }, + { + "epoch": 0.13143848412062983, + "grad_norm": 8.676443998708113, + "learning_rate": 1.9465387088736455e-05, + "loss": 1.6552, + "step": 985 + }, + { + "epoch": 0.1315719242060315, + "grad_norm": 1.6685163662088924, + "learning_rate": 1.9463992000371014e-05, + "loss": 1.6877, + "step": 986 + }, + { + "epoch": 0.13170536429143315, + "grad_norm": 1.4363009160454192, + "learning_rate": 1.9462595144239616e-05, + "loss": 1.7549, + "step": 987 + }, + { + "epoch": 0.1318388043768348, + "grad_norm": 1.4849169479024855, + "learning_rate": 1.946119652060318e-05, + "loss": 1.6377, + "step": 988 + }, + { + "epoch": 0.13197224446223646, + "grad_norm": 1.384439488826471, + "learning_rate": 1.9459796129722962e-05, + "loss": 1.69, + "step": 989 + }, + { + "epoch": 0.1321056845476381, + "grad_norm": 1.486350400208871, + "learning_rate": 1.9458393971860522e-05, + "loss": 1.6643, + "step": 990 + }, + { + "epoch": 0.13223912463303977, + 
"grad_norm": 1.4827478161059948, + "learning_rate": 1.9456990047277777e-05, + "loss": 1.6915, + "step": 991 + }, + { + "epoch": 0.13237256471844142, + "grad_norm": 1.2049573199404926, + "learning_rate": 1.9455584356236965e-05, + "loss": 1.6686, + "step": 992 + }, + { + "epoch": 0.13250600480384309, + "grad_norm": 1.490030676305235, + "learning_rate": 1.9454176899000653e-05, + "loss": 1.7358, + "step": 993 + }, + { + "epoch": 0.13263944488924473, + "grad_norm": 1.1805251966858894, + "learning_rate": 1.9452767675831733e-05, + "loss": 1.6745, + "step": 994 + }, + { + "epoch": 0.13277288497464637, + "grad_norm": 1.4646671341281234, + "learning_rate": 1.9451356686993435e-05, + "loss": 1.671, + "step": 995 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 1.2502496308048068, + "learning_rate": 1.9449943932749316e-05, + "loss": 1.6041, + "step": 996 + }, + { + "epoch": 0.13303976514544968, + "grad_norm": 1.3302442965706283, + "learning_rate": 1.9448529413363264e-05, + "loss": 1.692, + "step": 997 + }, + { + "epoch": 0.13317320523085135, + "grad_norm": 1.5733523617366565, + "learning_rate": 1.944711312909949e-05, + "loss": 1.6779, + "step": 998 + }, + { + "epoch": 0.133306645316253, + "grad_norm": 1.2021852120833718, + "learning_rate": 1.9445695080222543e-05, + "loss": 1.688, + "step": 999 + }, + { + "epoch": 0.13344008540165467, + "grad_norm": 1.3704807104442995, + "learning_rate": 1.9444275266997302e-05, + "loss": 1.6618, + "step": 1000 + }, + { + "epoch": 0.1335735254870563, + "grad_norm": 1.1311995894978009, + "learning_rate": 1.9442853689688965e-05, + "loss": 1.6144, + "step": 1001 + }, + { + "epoch": 0.13370696557245795, + "grad_norm": 1.5372526742454895, + "learning_rate": 1.9441430348563072e-05, + "loss": 1.7336, + "step": 1002 + }, + { + "epoch": 0.13384040565785962, + "grad_norm": 1.3090351266009799, + "learning_rate": 1.9440005243885482e-05, + "loss": 1.6752, + "step": 1003 + }, + { + "epoch": 0.13397384574326127, + "grad_norm": 1.313411350120398, + "learning_rate": 1.943857837592239e-05, + "loss": 1.6838, + "step": 1004 + }, + { + "epoch": 0.13410728582866294, + "grad_norm": 1.3955032805375227, + "learning_rate": 1.943714974494032e-05, + "loss": 1.7377, + "step": 1005 + }, + { + "epoch": 0.13424072591406458, + "grad_norm": 1.1613148391890242, + "learning_rate": 1.943571935120613e-05, + "loss": 1.6595, + "step": 1006 + }, + { + "epoch": 0.13437416599946625, + "grad_norm": 1.5018393485227763, + "learning_rate": 1.943428719498699e-05, + "loss": 1.7533, + "step": 1007 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 1.1758602862709078, + "learning_rate": 1.9432853276550412e-05, + "loss": 1.6804, + "step": 1008 + }, + { + "epoch": 0.13464104617026956, + "grad_norm": 2.1854421038226075, + "learning_rate": 1.9431417596164246e-05, + "loss": 1.7315, + "step": 1009 + }, + { + "epoch": 0.1347744862556712, + "grad_norm": 1.3381407737075066, + "learning_rate": 1.942998015409665e-05, + "loss": 1.6721, + "step": 1010 + }, + { + "epoch": 0.13490792634107285, + "grad_norm": 1.2547528294212573, + "learning_rate": 1.9428540950616127e-05, + "loss": 1.6782, + "step": 1011 + }, + { + "epoch": 0.13504136642647452, + "grad_norm": 5.939952891644821, + "learning_rate": 1.94270999859915e-05, + "loss": 1.6811, + "step": 1012 + }, + { + "epoch": 0.13517480651187616, + "grad_norm": 1.3380178004751162, + "learning_rate": 1.942565726049193e-05, + "loss": 1.6712, + "step": 1013 + }, + { + "epoch": 0.13530824659727783, + "grad_norm": 1.2408694717484239, + "learning_rate": 1.9424212774386894e-05, + "loss": 
1.7259, + "step": 1014 + }, + { + "epoch": 0.13544168668267947, + "grad_norm": 1.2756143544585534, + "learning_rate": 1.9422766527946217e-05, + "loss": 1.6631, + "step": 1015 + }, + { + "epoch": 0.13557512676808114, + "grad_norm": 1.4810902782091733, + "learning_rate": 1.942131852144003e-05, + "loss": 1.6927, + "step": 1016 + }, + { + "epoch": 0.1357085668534828, + "grad_norm": 1.2419032087935538, + "learning_rate": 1.941986875513881e-05, + "loss": 1.6898, + "step": 1017 + }, + { + "epoch": 0.13584200693888443, + "grad_norm": 1.2663899524703148, + "learning_rate": 1.9418417229313357e-05, + "loss": 1.679, + "step": 1018 + }, + { + "epoch": 0.1359754470242861, + "grad_norm": 1.6857267169996077, + "learning_rate": 1.9416963944234795e-05, + "loss": 1.6354, + "step": 1019 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 1.4554351634221496, + "learning_rate": 1.9415508900174587e-05, + "loss": 1.7462, + "step": 1020 + }, + { + "epoch": 0.1362423271950894, + "grad_norm": 1.2229508649241836, + "learning_rate": 1.941405209740452e-05, + "loss": 1.6975, + "step": 1021 + }, + { + "epoch": 0.13637576728049106, + "grad_norm": 1.8919520969087211, + "learning_rate": 1.94125935361967e-05, + "loss": 1.7258, + "step": 1022 + }, + { + "epoch": 0.13650920736589273, + "grad_norm": 1.2669785367995678, + "learning_rate": 1.9411133216823573e-05, + "loss": 1.7269, + "step": 1023 + }, + { + "epoch": 0.13664264745129437, + "grad_norm": 1.394093842588581, + "learning_rate": 1.9409671139557913e-05, + "loss": 1.6975, + "step": 1024 + }, + { + "epoch": 0.136776087536696, + "grad_norm": 1.382684928389381, + "learning_rate": 1.9408207304672815e-05, + "loss": 1.667, + "step": 1025 + }, + { + "epoch": 0.13690952762209768, + "grad_norm": 1.1652111588255187, + "learning_rate": 1.940674171244171e-05, + "loss": 1.6379, + "step": 1026 + }, + { + "epoch": 0.13704296770749932, + "grad_norm": 1.3038078568540548, + "learning_rate": 1.9405274363138353e-05, + "loss": 1.6447, + "step": 1027 + }, + { + "epoch": 0.137176407792901, + "grad_norm": 1.1387426323073633, + "learning_rate": 1.940380525703683e-05, + "loss": 1.6675, + "step": 1028 + }, + { + "epoch": 0.13730984787830264, + "grad_norm": 1.474771378157035, + "learning_rate": 1.940233439441155e-05, + "loss": 1.7026, + "step": 1029 + }, + { + "epoch": 0.1374432879637043, + "grad_norm": 1.3414858835240848, + "learning_rate": 1.9400861775537253e-05, + "loss": 1.6212, + "step": 1030 + }, + { + "epoch": 0.13757672804910595, + "grad_norm": 1.4550778747404767, + "learning_rate": 1.939938740068901e-05, + "loss": 1.6785, + "step": 1031 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 1.3556680615235186, + "learning_rate": 1.939791127014222e-05, + "loss": 1.6597, + "step": 1032 + }, + { + "epoch": 0.13784360821990926, + "grad_norm": 1.4224278910538175, + "learning_rate": 1.93964333841726e-05, + "loss": 1.6917, + "step": 1033 + }, + { + "epoch": 0.1379770483053109, + "grad_norm": 2.2986502944753275, + "learning_rate": 1.9394953743056205e-05, + "loss": 1.6871, + "step": 1034 + }, + { + "epoch": 0.13811048839071258, + "grad_norm": 2.0365277465356924, + "learning_rate": 1.939347234706942e-05, + "loss": 1.7383, + "step": 1035 + }, + { + "epoch": 0.13824392847611422, + "grad_norm": 1.4908556334356526, + "learning_rate": 1.9391989196488947e-05, + "loss": 1.6577, + "step": 1036 + }, + { + "epoch": 0.1383773685615159, + "grad_norm": 1.2426015406392708, + "learning_rate": 1.9390504291591825e-05, + "loss": 1.6762, + "step": 1037 + }, + { + "epoch": 0.13851080864691753, + "grad_norm": 
1.1622345855922083, + "learning_rate": 1.9389017632655417e-05, + "loss": 1.6452, + "step": 1038 + }, + { + "epoch": 0.13864424873231918, + "grad_norm": 1.2036449724168792, + "learning_rate": 1.938752921995741e-05, + "loss": 1.6652, + "step": 1039 + }, + { + "epoch": 0.13877768881772085, + "grad_norm": 1.3193413331431836, + "learning_rate": 1.938603905377583e-05, + "loss": 1.7153, + "step": 1040 + }, + { + "epoch": 0.1389111289031225, + "grad_norm": 3.9652856357976005, + "learning_rate": 1.9384547134389017e-05, + "loss": 1.6998, + "step": 1041 + }, + { + "epoch": 0.13904456898852416, + "grad_norm": 1.2832120473397637, + "learning_rate": 1.9383053462075645e-05, + "loss": 1.6865, + "step": 1042 + }, + { + "epoch": 0.1391780090739258, + "grad_norm": 1.9488157590841544, + "learning_rate": 1.938155803711472e-05, + "loss": 1.7429, + "step": 1043 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 1.334661146454789, + "learning_rate": 1.9380060859785562e-05, + "loss": 1.687, + "step": 1044 + }, + { + "epoch": 0.13944488924472911, + "grad_norm": 1.254965676826134, + "learning_rate": 1.937856193036783e-05, + "loss": 1.647, + "step": 1045 + }, + { + "epoch": 0.13957832933013078, + "grad_norm": 1.3924358256427258, + "learning_rate": 1.9377061249141515e-05, + "loss": 1.7171, + "step": 1046 + }, + { + "epoch": 0.13971176941553243, + "grad_norm": 1.3956910901280772, + "learning_rate": 1.9375558816386915e-05, + "loss": 1.743, + "step": 1047 + }, + { + "epoch": 0.13984520950093407, + "grad_norm": 1.2701133772552229, + "learning_rate": 1.9374054632384677e-05, + "loss": 1.7041, + "step": 1048 + }, + { + "epoch": 0.13997864958633574, + "grad_norm": 1.3350393841513681, + "learning_rate": 1.937254869741576e-05, + "loss": 1.6846, + "step": 1049 + }, + { + "epoch": 0.14011208967173738, + "grad_norm": 1.4627388869419586, + "learning_rate": 1.9371041011761456e-05, + "loss": 1.6207, + "step": 1050 + }, + { + "epoch": 0.14024552975713905, + "grad_norm": 1.4866992503394356, + "learning_rate": 1.9369531575703388e-05, + "loss": 1.7129, + "step": 1051 + }, + { + "epoch": 0.1403789698425407, + "grad_norm": 1.1685028747662984, + "learning_rate": 1.9368020389523493e-05, + "loss": 1.668, + "step": 1052 + }, + { + "epoch": 0.14051240992794237, + "grad_norm": 1.3215015297643653, + "learning_rate": 1.936650745350405e-05, + "loss": 1.7145, + "step": 1053 + }, + { + "epoch": 0.140645850013344, + "grad_norm": 1.3693292355410813, + "learning_rate": 1.9364992767927656e-05, + "loss": 1.701, + "step": 1054 + }, + { + "epoch": 0.14077929009874565, + "grad_norm": 1.2258997785581536, + "learning_rate": 1.9363476333077237e-05, + "loss": 1.7085, + "step": 1055 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 1.2455180226057498, + "learning_rate": 1.9361958149236045e-05, + "loss": 1.7652, + "step": 1056 + }, + { + "epoch": 0.14104617026954896, + "grad_norm": 1.1398881313468456, + "learning_rate": 1.9360438216687663e-05, + "loss": 1.6521, + "step": 1057 + }, + { + "epoch": 0.14117961035495064, + "grad_norm": 1.4032373423726727, + "learning_rate": 1.9358916535715995e-05, + "loss": 1.6822, + "step": 1058 + }, + { + "epoch": 0.14131305044035228, + "grad_norm": 1.2386258022282814, + "learning_rate": 1.9357393106605273e-05, + "loss": 1.6565, + "step": 1059 + }, + { + "epoch": 0.14144649052575395, + "grad_norm": 1.2563917204753323, + "learning_rate": 1.9355867929640054e-05, + "loss": 1.7224, + "step": 1060 + }, + { + "epoch": 0.1415799306111556, + "grad_norm": 1.1600878491451077, + "learning_rate": 1.9354341005105228e-05, + "loss": 
1.6831, + "step": 1061 + }, + { + "epoch": 0.14171337069655723, + "grad_norm": 1.1898921958138153, + "learning_rate": 1.9352812333286006e-05, + "loss": 1.7034, + "step": 1062 + }, + { + "epoch": 0.1418468107819589, + "grad_norm": 1.3776077175515267, + "learning_rate": 1.9351281914467924e-05, + "loss": 1.6558, + "step": 1063 + }, + { + "epoch": 0.14198025086736055, + "grad_norm": 1.4565706718782248, + "learning_rate": 1.934974974893685e-05, + "loss": 1.7167, + "step": 1064 + }, + { + "epoch": 0.14211369095276222, + "grad_norm": 1.2442621324452396, + "learning_rate": 1.9348215836978978e-05, + "loss": 1.6459, + "step": 1065 + }, + { + "epoch": 0.14224713103816386, + "grad_norm": 1.1267333234718482, + "learning_rate": 1.9346680178880813e-05, + "loss": 1.7133, + "step": 1066 + }, + { + "epoch": 0.14238057112356553, + "grad_norm": 1.1628045406596315, + "learning_rate": 1.9345142774929214e-05, + "loss": 1.6475, + "step": 1067 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 1.1018713280842254, + "learning_rate": 1.9343603625411338e-05, + "loss": 1.5936, + "step": 1068 + }, + { + "epoch": 0.14264745129436882, + "grad_norm": 1.193003889508144, + "learning_rate": 1.9342062730614688e-05, + "loss": 1.6788, + "step": 1069 + }, + { + "epoch": 0.14278089137977049, + "grad_norm": 1.1343045579344455, + "learning_rate": 1.9340520090827086e-05, + "loss": 1.6561, + "step": 1070 + }, + { + "epoch": 0.14291433146517213, + "grad_norm": 1.1710425209498292, + "learning_rate": 1.9338975706336672e-05, + "loss": 1.5907, + "step": 1071 + }, + { + "epoch": 0.1430477715505738, + "grad_norm": 1.3817597541347761, + "learning_rate": 1.9337429577431926e-05, + "loss": 1.7147, + "step": 1072 + }, + { + "epoch": 0.14318121163597544, + "grad_norm": 1.1573508951337832, + "learning_rate": 1.9335881704401642e-05, + "loss": 1.698, + "step": 1073 + }, + { + "epoch": 0.1433146517213771, + "grad_norm": 1.0912396515600804, + "learning_rate": 1.9334332087534956e-05, + "loss": 1.6756, + "step": 1074 + }, + { + "epoch": 0.14344809180677875, + "grad_norm": 1.1837140980523393, + "learning_rate": 1.9332780727121306e-05, + "loss": 1.6908, + "step": 1075 + }, + { + "epoch": 0.1435815318921804, + "grad_norm": 9.673515773086278, + "learning_rate": 1.9331227623450475e-05, + "loss": 1.71, + "step": 1076 + }, + { + "epoch": 0.14371497197758207, + "grad_norm": 1.3759562691897052, + "learning_rate": 1.9329672776812563e-05, + "loss": 1.7094, + "step": 1077 + }, + { + "epoch": 0.1438484120629837, + "grad_norm": 1.1597684961716581, + "learning_rate": 1.9328116187498e-05, + "loss": 1.6868, + "step": 1078 + }, + { + "epoch": 0.14398185214838538, + "grad_norm": 1.861180144739059, + "learning_rate": 1.9326557855797537e-05, + "loss": 1.695, + "step": 1079 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 1.3971061309442314, + "learning_rate": 1.9324997782002258e-05, + "loss": 1.6394, + "step": 1080 + }, + { + "epoch": 0.1442487323191887, + "grad_norm": 1.1696642529732444, + "learning_rate": 1.9323435966403557e-05, + "loss": 1.6628, + "step": 1081 + }, + { + "epoch": 0.14438217240459034, + "grad_norm": 1.4363997803769488, + "learning_rate": 1.932187240929317e-05, + "loss": 1.6358, + "step": 1082 + }, + { + "epoch": 0.144515612489992, + "grad_norm": 1.2009780630510838, + "learning_rate": 1.932030711096315e-05, + "loss": 1.6941, + "step": 1083 + }, + { + "epoch": 0.14464905257539365, + "grad_norm": 1.36326083414618, + "learning_rate": 1.931874007170588e-05, + "loss": 1.7041, + "step": 1084 + }, + { + "epoch": 0.1447824926607953, + "grad_norm": 
1.1982580377129664, + "learning_rate": 1.9317171291814058e-05, + "loss": 1.7533, + "step": 1085 + }, + { + "epoch": 0.14491593274619696, + "grad_norm": 1.2862961234292616, + "learning_rate": 1.931560077158072e-05, + "loss": 1.6702, + "step": 1086 + }, + { + "epoch": 0.1450493728315986, + "grad_norm": 1.1631924404688276, + "learning_rate": 1.9314028511299223e-05, + "loss": 1.6285, + "step": 1087 + }, + { + "epoch": 0.14518281291700028, + "grad_norm": 1.3994096815725487, + "learning_rate": 1.9312454511263242e-05, + "loss": 1.6882, + "step": 1088 + }, + { + "epoch": 0.14531625300240192, + "grad_norm": 1.2308714130213054, + "learning_rate": 1.9310878771766787e-05, + "loss": 1.662, + "step": 1089 + }, + { + "epoch": 0.1454496930878036, + "grad_norm": 1.2400995841145075, + "learning_rate": 1.9309301293104182e-05, + "loss": 1.6559, + "step": 1090 + }, + { + "epoch": 0.14558313317320523, + "grad_norm": 1.1648127201282061, + "learning_rate": 1.930772207557009e-05, + "loss": 1.697, + "step": 1091 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 1.1588422983053825, + "learning_rate": 1.9306141119459483e-05, + "loss": 1.7384, + "step": 1092 + }, + { + "epoch": 0.14585001334400854, + "grad_norm": 1.504230986251467, + "learning_rate": 1.930455842506767e-05, + "loss": 1.6645, + "step": 1093 + }, + { + "epoch": 0.1459834534294102, + "grad_norm": 1.4254729141660931, + "learning_rate": 1.9302973992690282e-05, + "loss": 1.7063, + "step": 1094 + }, + { + "epoch": 0.14611689351481186, + "grad_norm": 3.5366147232314296, + "learning_rate": 1.9301387822623266e-05, + "loss": 1.7281, + "step": 1095 + }, + { + "epoch": 0.1462503336002135, + "grad_norm": 1.2361609720127364, + "learning_rate": 1.929979991516291e-05, + "loss": 1.7209, + "step": 1096 + }, + { + "epoch": 0.14638377368561517, + "grad_norm": 1.095253097518337, + "learning_rate": 1.929821027060581e-05, + "loss": 1.63, + "step": 1097 + }, + { + "epoch": 0.1465172137710168, + "grad_norm": 1.2870911756015357, + "learning_rate": 1.9296618889248893e-05, + "loss": 1.6939, + "step": 1098 + }, + { + "epoch": 0.14665065385641846, + "grad_norm": 1.3716292424936098, + "learning_rate": 1.9295025771389414e-05, + "loss": 1.7245, + "step": 1099 + }, + { + "epoch": 0.14678409394182013, + "grad_norm": 1.2427470679061952, + "learning_rate": 1.9293430917324955e-05, + "loss": 1.666, + "step": 1100 + }, + { + "epoch": 0.14691753402722177, + "grad_norm": 1.460158973358874, + "learning_rate": 1.9291834327353403e-05, + "loss": 1.7147, + "step": 1101 + }, + { + "epoch": 0.14705097411262344, + "grad_norm": 1.184155803132356, + "learning_rate": 1.9290236001772995e-05, + "loss": 1.6566, + "step": 1102 + }, + { + "epoch": 0.14718441419802508, + "grad_norm": 1.2799912701156315, + "learning_rate": 1.9288635940882273e-05, + "loss": 1.6627, + "step": 1103 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 1.4406914311548737, + "learning_rate": 1.9287034144980114e-05, + "loss": 1.6758, + "step": 1104 + }, + { + "epoch": 0.1474512943688284, + "grad_norm": 1.2219976061823485, + "learning_rate": 1.928543061436571e-05, + "loss": 1.71, + "step": 1105 + }, + { + "epoch": 0.14758473445423004, + "grad_norm": 1.2519035119374464, + "learning_rate": 1.9283825349338588e-05, + "loss": 1.6538, + "step": 1106 + }, + { + "epoch": 0.1477181745396317, + "grad_norm": 1.3524232905207045, + "learning_rate": 1.9282218350198588e-05, + "loss": 1.6776, + "step": 1107 + }, + { + "epoch": 0.14785161462503335, + "grad_norm": 1.30724045787273, + "learning_rate": 1.928060961724589e-05, + "loss": 1.6482, + 
"step": 1108 + }, + { + "epoch": 0.14798505471043502, + "grad_norm": 1.1368808570323825, + "learning_rate": 1.9278999150780972e-05, + "loss": 1.6194, + "step": 1109 + }, + { + "epoch": 0.14811849479583666, + "grad_norm": 1.300916828823837, + "learning_rate": 1.927738695110466e-05, + "loss": 1.6947, + "step": 1110 + }, + { + "epoch": 0.14825193488123833, + "grad_norm": 1.1726520891396903, + "learning_rate": 1.9275773018518094e-05, + "loss": 1.6945, + "step": 1111 + }, + { + "epoch": 0.14838537496663998, + "grad_norm": 1.4312765994187449, + "learning_rate": 1.9274157353322736e-05, + "loss": 1.7079, + "step": 1112 + }, + { + "epoch": 0.14851881505204162, + "grad_norm": 1.3042905231751305, + "learning_rate": 1.9272539955820372e-05, + "loss": 1.6697, + "step": 1113 + }, + { + "epoch": 0.1486522551374433, + "grad_norm": 1.2109176431266062, + "learning_rate": 1.9270920826313122e-05, + "loss": 1.7066, + "step": 1114 + }, + { + "epoch": 0.14878569522284493, + "grad_norm": 1.269814239652846, + "learning_rate": 1.926929996510341e-05, + "loss": 1.6679, + "step": 1115 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 1.26285461824445, + "learning_rate": 1.9267677372494003e-05, + "loss": 1.6651, + "step": 1116 + }, + { + "epoch": 0.14905257539364825, + "grad_norm": 1.4162229382915708, + "learning_rate": 1.9266053048787973e-05, + "loss": 1.7093, + "step": 1117 + }, + { + "epoch": 0.14918601547904992, + "grad_norm": 1.1695719882927538, + "learning_rate": 1.926442699428874e-05, + "loss": 1.6775, + "step": 1118 + }, + { + "epoch": 0.14931945556445156, + "grad_norm": 1.5241355299411088, + "learning_rate": 1.9262799209300017e-05, + "loss": 1.7078, + "step": 1119 + }, + { + "epoch": 0.14945289564985323, + "grad_norm": 1.3679819271720925, + "learning_rate": 1.9261169694125868e-05, + "loss": 1.6512, + "step": 1120 + }, + { + "epoch": 0.14958633573525487, + "grad_norm": 1.0852517422618726, + "learning_rate": 1.925953844907066e-05, + "loss": 1.6732, + "step": 1121 + }, + { + "epoch": 0.14971977582065651, + "grad_norm": 1.1505718677390935, + "learning_rate": 1.9257905474439093e-05, + "loss": 1.6627, + "step": 1122 + }, + { + "epoch": 0.14985321590605818, + "grad_norm": 1.366145668087986, + "learning_rate": 1.9256270770536187e-05, + "loss": 1.6224, + "step": 1123 + }, + { + "epoch": 0.14998665599145983, + "grad_norm": 1.2009763385012844, + "learning_rate": 1.925463433766729e-05, + "loss": 1.667, + "step": 1124 + }, + { + "epoch": 0.1501200960768615, + "grad_norm": 1.1495050946339627, + "learning_rate": 1.9252996176138065e-05, + "loss": 1.7039, + "step": 1125 + }, + { + "epoch": 0.15025353616226314, + "grad_norm": 1.202004713006955, + "learning_rate": 1.9251356286254506e-05, + "loss": 1.6953, + "step": 1126 + }, + { + "epoch": 0.1503869762476648, + "grad_norm": 1.2049847095982797, + "learning_rate": 1.9249714668322922e-05, + "loss": 1.7301, + "step": 1127 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 1.3586596709801637, + "learning_rate": 1.924807132264995e-05, + "loss": 1.6488, + "step": 1128 + }, + { + "epoch": 0.1506538564184681, + "grad_norm": 1.304495283882212, + "learning_rate": 1.924642624954255e-05, + "loss": 1.6784, + "step": 1129 + }, + { + "epoch": 0.15078729650386977, + "grad_norm": 1.2254989980261268, + "learning_rate": 1.9244779449308e-05, + "loss": 1.6712, + "step": 1130 + }, + { + "epoch": 0.1509207365892714, + "grad_norm": 1.131251283705494, + "learning_rate": 1.9243130922253902e-05, + "loss": 1.6639, + "step": 1131 + }, + { + "epoch": 0.15105417667467308, + "grad_norm": 
1.0999764886521537, + "learning_rate": 1.924148066868819e-05, + "loss": 1.7247, + "step": 1132 + }, + { + "epoch": 0.15118761676007472, + "grad_norm": 1.1537923655335296, + "learning_rate": 1.92398286889191e-05, + "loss": 1.6669, + "step": 1133 + }, + { + "epoch": 0.1513210568454764, + "grad_norm": 1.2811987513376073, + "learning_rate": 1.9238174983255216e-05, + "loss": 1.6873, + "step": 1134 + }, + { + "epoch": 0.15145449693087804, + "grad_norm": 1.189456464225642, + "learning_rate": 1.9236519552005425e-05, + "loss": 1.7032, + "step": 1135 + }, + { + "epoch": 0.15158793701627968, + "grad_norm": 1.226791373769935, + "learning_rate": 1.923486239547894e-05, + "loss": 1.7184, + "step": 1136 + }, + { + "epoch": 0.15172137710168135, + "grad_norm": 1.103146464311287, + "learning_rate": 1.9233203513985307e-05, + "loss": 1.6491, + "step": 1137 + }, + { + "epoch": 0.151854817187083, + "grad_norm": 1.1779372105249897, + "learning_rate": 1.923154290783438e-05, + "loss": 1.6474, + "step": 1138 + }, + { + "epoch": 0.15198825727248466, + "grad_norm": 1.2505895966166218, + "learning_rate": 1.922988057733634e-05, + "loss": 1.7153, + "step": 1139 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 1.2586045407401536, + "learning_rate": 1.92282165228017e-05, + "loss": 1.6305, + "step": 1140 + }, + { + "epoch": 0.15225513744328797, + "grad_norm": 1.1801506753191604, + "learning_rate": 1.9226550744541276e-05, + "loss": 1.6803, + "step": 1141 + }, + { + "epoch": 0.15238857752868962, + "grad_norm": 1.2155598704280677, + "learning_rate": 1.9224883242866223e-05, + "loss": 1.6921, + "step": 1142 + }, + { + "epoch": 0.15252201761409126, + "grad_norm": 1.1553853036033908, + "learning_rate": 1.9223214018088007e-05, + "loss": 1.6683, + "step": 1143 + }, + { + "epoch": 0.15265545769949293, + "grad_norm": 1.7355840451920468, + "learning_rate": 1.9221543070518427e-05, + "loss": 1.6839, + "step": 1144 + }, + { + "epoch": 0.15278889778489457, + "grad_norm": 10.109110380656796, + "learning_rate": 1.9219870400469588e-05, + "loss": 1.762, + "step": 1145 + }, + { + "epoch": 0.15292233787029624, + "grad_norm": 1.7615641665674961, + "learning_rate": 1.9218196008253934e-05, + "loss": 1.7248, + "step": 1146 + }, + { + "epoch": 0.15305577795569789, + "grad_norm": 15.239890369804119, + "learning_rate": 1.921651989418422e-05, + "loss": 1.7259, + "step": 1147 + }, + { + "epoch": 0.15318921804109956, + "grad_norm": 1.2991891405839884, + "learning_rate": 1.9214842058573517e-05, + "loss": 1.6952, + "step": 1148 + }, + { + "epoch": 0.1533226581265012, + "grad_norm": 1.2912291596013608, + "learning_rate": 1.921316250173524e-05, + "loss": 1.6993, + "step": 1149 + }, + { + "epoch": 0.15345609821190287, + "grad_norm": 1.1663630774969274, + "learning_rate": 1.92114812239831e-05, + "loss": 1.6719, + "step": 1150 + }, + { + "epoch": 0.1535895382973045, + "grad_norm": 1.2285813852625158, + "learning_rate": 1.9209798225631144e-05, + "loss": 1.7126, + "step": 1151 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 1.1467685561512206, + "learning_rate": 1.9208113506993734e-05, + "loss": 1.6242, + "step": 1152 + }, + { + "epoch": 0.15385641846810783, + "grad_norm": 1.1465340851099293, + "learning_rate": 1.920642706838556e-05, + "loss": 1.7279, + "step": 1153 + }, + { + "epoch": 0.15398985855350947, + "grad_norm": 1.1578599709330035, + "learning_rate": 1.9204738910121635e-05, + "loss": 1.6549, + "step": 1154 + }, + { + "epoch": 0.15412329863891114, + "grad_norm": 1.1683154746708295, + "learning_rate": 1.9203049032517276e-05, + "loss": 1.6429, 
+ "step": 1155 + }, + { + "epoch": 0.15425673872431278, + "grad_norm": 1.20845397374898, + "learning_rate": 1.920135743588814e-05, + "loss": 1.681, + "step": 1156 + }, + { + "epoch": 0.15439017880971445, + "grad_norm": 1.3494049767476677, + "learning_rate": 1.9199664120550197e-05, + "loss": 1.7222, + "step": 1157 + }, + { + "epoch": 0.1545236188951161, + "grad_norm": 1.1544440730765555, + "learning_rate": 1.9197969086819735e-05, + "loss": 1.6895, + "step": 1158 + }, + { + "epoch": 0.15465705898051774, + "grad_norm": 1.1304557747461206, + "learning_rate": 1.9196272335013373e-05, + "loss": 1.6369, + "step": 1159 + }, + { + "epoch": 0.1547904990659194, + "grad_norm": 1.119564016408627, + "learning_rate": 1.9194573865448044e-05, + "loss": 1.6653, + "step": 1160 + }, + { + "epoch": 0.15492393915132105, + "grad_norm": 1.1506348346262587, + "learning_rate": 1.9192873678441e-05, + "loss": 1.6596, + "step": 1161 + }, + { + "epoch": 0.15505737923672272, + "grad_norm": 1.197324233503071, + "learning_rate": 1.9191171774309816e-05, + "loss": 1.7255, + "step": 1162 + }, + { + "epoch": 0.15519081932212436, + "grad_norm": 1.1674782781588378, + "learning_rate": 1.9189468153372394e-05, + "loss": 1.6745, + "step": 1163 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 1.308070450830591, + "learning_rate": 1.9187762815946942e-05, + "loss": 1.6307, + "step": 1164 + }, + { + "epoch": 0.15545769949292768, + "grad_norm": 1.1249631316465178, + "learning_rate": 1.9186055762352003e-05, + "loss": 1.6749, + "step": 1165 + }, + { + "epoch": 0.15559113957832932, + "grad_norm": 1.22808803451986, + "learning_rate": 1.918434699290644e-05, + "loss": 1.7167, + "step": 1166 + }, + { + "epoch": 0.155724579663731, + "grad_norm": 1.142395151395811, + "learning_rate": 1.918263650792942e-05, + "loss": 1.7143, + "step": 1167 + }, + { + "epoch": 0.15585801974913263, + "grad_norm": 1.1288423274679555, + "learning_rate": 1.9180924307740453e-05, + "loss": 1.6465, + "step": 1168 + }, + { + "epoch": 0.1559914598345343, + "grad_norm": 1.4078641204542017, + "learning_rate": 1.9179210392659353e-05, + "loss": 1.7182, + "step": 1169 + }, + { + "epoch": 0.15612489991993594, + "grad_norm": 1.1847205713407944, + "learning_rate": 1.917749476300626e-05, + "loss": 1.6681, + "step": 1170 + }, + { + "epoch": 0.15625834000533761, + "grad_norm": 1.096423039092125, + "learning_rate": 1.9175777419101634e-05, + "loss": 1.644, + "step": 1171 + }, + { + "epoch": 0.15639178009073926, + "grad_norm": 1.115302048331846, + "learning_rate": 1.917405836126626e-05, + "loss": 1.7129, + "step": 1172 + }, + { + "epoch": 0.1565252201761409, + "grad_norm": 1.1932528993802167, + "learning_rate": 1.917233758982123e-05, + "loss": 1.6633, + "step": 1173 + }, + { + "epoch": 0.15665866026154257, + "grad_norm": 1.4039420662328945, + "learning_rate": 1.917061510508797e-05, + "loss": 1.7039, + "step": 1174 + }, + { + "epoch": 0.1567921003469442, + "grad_norm": 1.3133488883749793, + "learning_rate": 1.9168890907388224e-05, + "loss": 1.639, + "step": 1175 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 1.1512042689466013, + "learning_rate": 1.9167164997044044e-05, + "loss": 1.6891, + "step": 1176 + }, + { + "epoch": 0.15705898051774753, + "grad_norm": 1.4318667585471574, + "learning_rate": 1.9165437374377815e-05, + "loss": 1.653, + "step": 1177 + }, + { + "epoch": 0.1571924206031492, + "grad_norm": 1.5436421619414216, + "learning_rate": 1.9163708039712238e-05, + "loss": 1.6979, + "step": 1178 + }, + { + "epoch": 0.15732586068855084, + "grad_norm": 1.163504109244437, 
+ "learning_rate": 1.916197699337033e-05, + "loss": 1.6458, + "step": 1179 + }, + { + "epoch": 0.15745930077395248, + "grad_norm": 1.139437842953167, + "learning_rate": 1.9160244235675436e-05, + "loss": 1.7182, + "step": 1180 + }, + { + "epoch": 0.15759274085935415, + "grad_norm": 1.206983035678986, + "learning_rate": 1.915850976695121e-05, + "loss": 1.6761, + "step": 1181 + }, + { + "epoch": 0.1577261809447558, + "grad_norm": 1.1218445645631256, + "learning_rate": 1.9156773587521634e-05, + "loss": 1.691, + "step": 1182 + }, + { + "epoch": 0.15785962103015747, + "grad_norm": 15.435206717839069, + "learning_rate": 1.9155035697711008e-05, + "loss": 1.6704, + "step": 1183 + }, + { + "epoch": 0.1579930611155591, + "grad_norm": 1.253164673663483, + "learning_rate": 1.9153296097843944e-05, + "loss": 1.6793, + "step": 1184 + }, + { + "epoch": 0.15812650120096078, + "grad_norm": 1.2264750200407253, + "learning_rate": 1.9151554788245383e-05, + "loss": 1.6929, + "step": 1185 + }, + { + "epoch": 0.15825994128636242, + "grad_norm": 1.4772217364726286, + "learning_rate": 1.9149811769240585e-05, + "loss": 1.6452, + "step": 1186 + }, + { + "epoch": 0.1583933813717641, + "grad_norm": 1.1489987221515146, + "learning_rate": 1.9148067041155122e-05, + "loss": 1.6701, + "step": 1187 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 1.2914670100647596, + "learning_rate": 1.914632060431489e-05, + "loss": 1.7303, + "step": 1188 + }, + { + "epoch": 0.15866026154256738, + "grad_norm": 1.299350514801071, + "learning_rate": 1.9144572459046104e-05, + "loss": 1.681, + "step": 1189 + }, + { + "epoch": 0.15879370162796905, + "grad_norm": 1.3119727658820086, + "learning_rate": 1.9142822605675296e-05, + "loss": 1.696, + "step": 1190 + }, + { + "epoch": 0.1589271417133707, + "grad_norm": 1.1081403990699985, + "learning_rate": 1.9141071044529318e-05, + "loss": 1.6713, + "step": 1191 + }, + { + "epoch": 0.15906058179877236, + "grad_norm": 1.1284776489853792, + "learning_rate": 1.9139317775935348e-05, + "loss": 1.6738, + "step": 1192 + }, + { + "epoch": 0.159194021884174, + "grad_norm": 1.1261514805847666, + "learning_rate": 1.9137562800220872e-05, + "loss": 1.7054, + "step": 1193 + }, + { + "epoch": 0.15932746196957567, + "grad_norm": 1.313484879072088, + "learning_rate": 1.9135806117713697e-05, + "loss": 1.6716, + "step": 1194 + }, + { + "epoch": 0.15946090205497732, + "grad_norm": 1.278083810674143, + "learning_rate": 1.913404772874196e-05, + "loss": 1.6833, + "step": 1195 + }, + { + "epoch": 0.15959434214037896, + "grad_norm": 1.4428806272343444, + "learning_rate": 1.9132287633634097e-05, + "loss": 1.6512, + "step": 1196 + }, + { + "epoch": 0.15972778222578063, + "grad_norm": 1.209556177929987, + "learning_rate": 1.9130525832718884e-05, + "loss": 1.6942, + "step": 1197 + }, + { + "epoch": 0.15986122231118227, + "grad_norm": 1.1106358013243507, + "learning_rate": 1.9128762326325394e-05, + "loss": 1.6982, + "step": 1198 + }, + { + "epoch": 0.15999466239658394, + "grad_norm": 1.0994258199335178, + "learning_rate": 1.912699711478304e-05, + "loss": 1.6675, + "step": 1199 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 1.1185448399034177, + "learning_rate": 1.912523019842154e-05, + "loss": 1.6926, + "step": 1200 + }, + { + "epoch": 0.16026154256738726, + "grad_norm": 1.1152656846403104, + "learning_rate": 1.912346157757093e-05, + "loss": 1.7078, + "step": 1201 + }, + { + "epoch": 0.1603949826527889, + "grad_norm": 1.2924058078691643, + "learning_rate": 1.9121691252561578e-05, + "loss": 1.7084, + "step": 1202 + }, + 
{ + "epoch": 0.16052842273819054, + "grad_norm": 1.0611065471561838, + "learning_rate": 1.911991922372415e-05, + "loss": 1.6611, + "step": 1203 + }, + { + "epoch": 0.1606618628235922, + "grad_norm": 1.3699736830557303, + "learning_rate": 1.911814549138965e-05, + "loss": 1.6897, + "step": 1204 + }, + { + "epoch": 0.16079530290899385, + "grad_norm": 1.4751343067321943, + "learning_rate": 1.9116370055889382e-05, + "loss": 1.6932, + "step": 1205 + }, + { + "epoch": 0.16092874299439552, + "grad_norm": 1.1716253181544283, + "learning_rate": 1.911459291755498e-05, + "loss": 1.7116, + "step": 1206 + }, + { + "epoch": 0.16106218307979717, + "grad_norm": 1.1114747666351001, + "learning_rate": 1.91128140767184e-05, + "loss": 1.6428, + "step": 1207 + }, + { + "epoch": 0.16119562316519884, + "grad_norm": 1.0879838952081482, + "learning_rate": 1.9111033533711897e-05, + "loss": 1.6373, + "step": 1208 + }, + { + "epoch": 0.16132906325060048, + "grad_norm": 1.1094725515932415, + "learning_rate": 1.9109251288868064e-05, + "loss": 1.6457, + "step": 1209 + }, + { + "epoch": 0.16146250333600212, + "grad_norm": 1.1303873737596453, + "learning_rate": 1.9107467342519808e-05, + "loss": 1.6839, + "step": 1210 + }, + { + "epoch": 0.1615959434214038, + "grad_norm": 1.4781044833796941, + "learning_rate": 1.9105681695000342e-05, + "loss": 1.6406, + "step": 1211 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 1.1657729793459108, + "learning_rate": 1.9103894346643204e-05, + "loss": 1.6726, + "step": 1212 + }, + { + "epoch": 0.1618628235922071, + "grad_norm": 1.2198288103403852, + "learning_rate": 1.910210529778226e-05, + "loss": 1.6929, + "step": 1213 + }, + { + "epoch": 0.16199626367760875, + "grad_norm": 1.1871995635063328, + "learning_rate": 1.9100314548751676e-05, + "loss": 1.6907, + "step": 1214 + }, + { + "epoch": 0.16212970376301042, + "grad_norm": 1.1864807109439603, + "learning_rate": 1.9098522099885944e-05, + "loss": 1.653, + "step": 1215 + }, + { + "epoch": 0.16226314384841206, + "grad_norm": 1.2368576392699981, + "learning_rate": 1.9096727951519872e-05, + "loss": 1.7201, + "step": 1216 + }, + { + "epoch": 0.1623965839338137, + "grad_norm": 1.1260231051664356, + "learning_rate": 1.909493210398859e-05, + "loss": 1.6123, + "step": 1217 + }, + { + "epoch": 0.16253002401921537, + "grad_norm": 1.4256559162950146, + "learning_rate": 1.9093134557627544e-05, + "loss": 1.666, + "step": 1218 + }, + { + "epoch": 0.16266346410461702, + "grad_norm": 1.4887124369987752, + "learning_rate": 1.9091335312772493e-05, + "loss": 1.7597, + "step": 1219 + }, + { + "epoch": 0.1627969041900187, + "grad_norm": 1.1294991712339444, + "learning_rate": 1.9089534369759508e-05, + "loss": 1.688, + "step": 1220 + }, + { + "epoch": 0.16293034427542033, + "grad_norm": 1.177420926926422, + "learning_rate": 1.9087731728924996e-05, + "loss": 1.7286, + "step": 1221 + }, + { + "epoch": 0.163063784360822, + "grad_norm": 1.6913536181421298, + "learning_rate": 1.9085927390605663e-05, + "loss": 1.6913, + "step": 1222 + }, + { + "epoch": 0.16319722444622364, + "grad_norm": 1.2163160869711371, + "learning_rate": 1.908412135513854e-05, + "loss": 1.6669, + "step": 1223 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 1.3781638643041434, + "learning_rate": 1.9082313622860976e-05, + "loss": 1.657, + "step": 1224 + }, + { + "epoch": 0.16346410461702696, + "grad_norm": 1.121301734458933, + "learning_rate": 1.9080504194110633e-05, + "loss": 1.7017, + "step": 1225 + }, + { + "epoch": 0.1635975447024286, + "grad_norm": 1.2118234782577832, + 
"learning_rate": 1.907869306922549e-05, + "loss": 1.7203, + "step": 1226 + }, + { + "epoch": 0.16373098478783027, + "grad_norm": 1.2074642428560403, + "learning_rate": 1.9076880248543847e-05, + "loss": 1.6558, + "step": 1227 + }, + { + "epoch": 0.1638644248732319, + "grad_norm": 1.119322476739159, + "learning_rate": 1.9075065732404318e-05, + "loss": 1.6947, + "step": 1228 + }, + { + "epoch": 0.16399786495863358, + "grad_norm": 1.1009776042368575, + "learning_rate": 1.9073249521145833e-05, + "loss": 1.6984, + "step": 1229 + }, + { + "epoch": 0.16413130504403523, + "grad_norm": 1.3636116015946005, + "learning_rate": 1.9071431615107642e-05, + "loss": 1.6266, + "step": 1230 + }, + { + "epoch": 0.1642647451294369, + "grad_norm": 1.2193076150679258, + "learning_rate": 1.9069612014629307e-05, + "loss": 1.6548, + "step": 1231 + }, + { + "epoch": 0.16439818521483854, + "grad_norm": 1.108647716162107, + "learning_rate": 1.9067790720050708e-05, + "loss": 1.6949, + "step": 1232 + }, + { + "epoch": 0.16453162530024018, + "grad_norm": 1.3428440762492813, + "learning_rate": 1.9065967731712044e-05, + "loss": 1.628, + "step": 1233 + }, + { + "epoch": 0.16466506538564185, + "grad_norm": 1.1264569960102961, + "learning_rate": 1.9064143049953826e-05, + "loss": 1.69, + "step": 1234 + }, + { + "epoch": 0.1647985054710435, + "grad_norm": 1.2129394646481007, + "learning_rate": 1.9062316675116886e-05, + "loss": 1.7039, + "step": 1235 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 1.3168167400509623, + "learning_rate": 1.9060488607542373e-05, + "loss": 1.6854, + "step": 1236 + }, + { + "epoch": 0.1650653856418468, + "grad_norm": 1.172892361541946, + "learning_rate": 1.9058658847571744e-05, + "loss": 1.6908, + "step": 1237 + }, + { + "epoch": 0.16519882572724848, + "grad_norm": 1.8095333564387064, + "learning_rate": 1.905682739554678e-05, + "loss": 1.6951, + "step": 1238 + }, + { + "epoch": 0.16533226581265012, + "grad_norm": 1.3485139476585741, + "learning_rate": 1.905499425180957e-05, + "loss": 1.6748, + "step": 1239 + }, + { + "epoch": 0.16546570589805176, + "grad_norm": 1.2787725949509623, + "learning_rate": 1.905315941670253e-05, + "loss": 1.6657, + "step": 1240 + }, + { + "epoch": 0.16559914598345343, + "grad_norm": 1.2224431058019596, + "learning_rate": 1.9051322890568386e-05, + "loss": 1.6656, + "step": 1241 + }, + { + "epoch": 0.16573258606885508, + "grad_norm": 1.137537755181391, + "learning_rate": 1.904948467375018e-05, + "loss": 1.6577, + "step": 1242 + }, + { + "epoch": 0.16586602615425675, + "grad_norm": 1.0868355979985056, + "learning_rate": 1.9047644766591273e-05, + "loss": 1.7073, + "step": 1243 + }, + { + "epoch": 0.1659994662396584, + "grad_norm": 1.114462768904141, + "learning_rate": 1.9045803169435326e-05, + "loss": 1.6305, + "step": 1244 + }, + { + "epoch": 0.16613290632506006, + "grad_norm": 1.3561430912216557, + "learning_rate": 1.9043959882626343e-05, + "loss": 1.6719, + "step": 1245 + }, + { + "epoch": 0.1662663464104617, + "grad_norm": 1.3197220598884711, + "learning_rate": 1.9042114906508623e-05, + "loss": 1.6904, + "step": 1246 + }, + { + "epoch": 0.16639978649586334, + "grad_norm": 1.1663473878740998, + "learning_rate": 1.9040268241426786e-05, + "loss": 1.6815, + "step": 1247 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 1.1208171885233782, + "learning_rate": 1.9038419887725768e-05, + "loss": 1.6888, + "step": 1248 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 1.2317096593237165, + "learning_rate": 1.9036569845750822e-05, + "loss": 1.6998, + "step": 1249 + 
}, + { + "epoch": 0.16680010675206833, + "grad_norm": 1.2118086361733955, + "learning_rate": 1.9034718115847516e-05, + "loss": 1.7239, + "step": 1250 + }, + { + "epoch": 0.16693354683746997, + "grad_norm": 1.1328099587839497, + "learning_rate": 1.9032864698361728e-05, + "loss": 1.6664, + "step": 1251 + }, + { + "epoch": 0.16706698692287164, + "grad_norm": 1.2351889795437343, + "learning_rate": 1.9031009593639656e-05, + "loss": 1.7225, + "step": 1252 + }, + { + "epoch": 0.16720042700827328, + "grad_norm": 1.1680475805874204, + "learning_rate": 1.9029152802027816e-05, + "loss": 1.7055, + "step": 1253 + }, + { + "epoch": 0.16733386709367493, + "grad_norm": 1.1396480995870708, + "learning_rate": 1.9027294323873032e-05, + "loss": 1.6754, + "step": 1254 + }, + { + "epoch": 0.1674673071790766, + "grad_norm": 1.099951848229152, + "learning_rate": 1.902543415952245e-05, + "loss": 1.6574, + "step": 1255 + }, + { + "epoch": 0.16760074726447824, + "grad_norm": 1.091862726736696, + "learning_rate": 1.9023572309323522e-05, + "loss": 1.6254, + "step": 1256 + }, + { + "epoch": 0.1677341873498799, + "grad_norm": 1.2660462090997906, + "learning_rate": 1.9021708773624027e-05, + "loss": 1.6797, + "step": 1257 + }, + { + "epoch": 0.16786762743528155, + "grad_norm": 1.1200461208933035, + "learning_rate": 1.901984355277205e-05, + "loss": 1.6885, + "step": 1258 + }, + { + "epoch": 0.16800106752068322, + "grad_norm": 1.2050844123558844, + "learning_rate": 1.9017976647115988e-05, + "loss": 1.6851, + "step": 1259 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 1.142685776004093, + "learning_rate": 1.9016108057004566e-05, + "loss": 1.625, + "step": 1260 + }, + { + "epoch": 0.16826794769148654, + "grad_norm": 1.1684071493919217, + "learning_rate": 1.9014237782786806e-05, + "loss": 1.6768, + "step": 1261 + }, + { + "epoch": 0.16840138777688818, + "grad_norm": 1.1496482446886893, + "learning_rate": 1.9012365824812063e-05, + "loss": 1.6409, + "step": 1262 + }, + { + "epoch": 0.16853482786228982, + "grad_norm": 1.0944121754604215, + "learning_rate": 1.9010492183429994e-05, + "loss": 1.63, + "step": 1263 + }, + { + "epoch": 0.1686682679476915, + "grad_norm": 1.0842462969633766, + "learning_rate": 1.9008616858990572e-05, + "loss": 1.646, + "step": 1264 + }, + { + "epoch": 0.16880170803309313, + "grad_norm": 1.4767767757650743, + "learning_rate": 1.9006739851844086e-05, + "loss": 1.6454, + "step": 1265 + }, + { + "epoch": 0.1689351481184948, + "grad_norm": 1.18032002297524, + "learning_rate": 1.9004861162341144e-05, + "loss": 1.6861, + "step": 1266 + }, + { + "epoch": 0.16906858820389645, + "grad_norm": 1.3372419431896025, + "learning_rate": 1.9002980790832663e-05, + "loss": 1.6586, + "step": 1267 + }, + { + "epoch": 0.16920202828929812, + "grad_norm": 1.172100506483245, + "learning_rate": 1.900109873766987e-05, + "loss": 1.6657, + "step": 1268 + }, + { + "epoch": 0.16933546837469976, + "grad_norm": 1.1145469314883394, + "learning_rate": 1.8999215003204316e-05, + "loss": 1.7082, + "step": 1269 + }, + { + "epoch": 0.1694689084601014, + "grad_norm": 1.1206009643179509, + "learning_rate": 1.8997329587787856e-05, + "loss": 1.6578, + "step": 1270 + }, + { + "epoch": 0.16960234854550307, + "grad_norm": 1.399049311820928, + "learning_rate": 1.8995442491772668e-05, + "loss": 1.6955, + "step": 1271 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 1.1958580398149483, + "learning_rate": 1.899355371551124e-05, + "loss": 1.7101, + "step": 1272 + }, + { + "epoch": 0.1698692287163064, + "grad_norm": 1.3556638628411706, + 
"learning_rate": 1.8991663259356374e-05, + "loss": 1.7108, + "step": 1273 + }, + { + "epoch": 0.17000266880170803, + "grad_norm": 1.3451150415787545, + "learning_rate": 1.8989771123661186e-05, + "loss": 1.7315, + "step": 1274 + }, + { + "epoch": 0.1701361088871097, + "grad_norm": 1.3135350609625422, + "learning_rate": 1.8987877308779104e-05, + "loss": 1.6609, + "step": 1275 + }, + { + "epoch": 0.17026954897251134, + "grad_norm": 1.1333530911950962, + "learning_rate": 1.8985981815063873e-05, + "loss": 1.6684, + "step": 1276 + }, + { + "epoch": 0.17040298905791298, + "grad_norm": 1.2791530568330647, + "learning_rate": 1.8984084642869546e-05, + "loss": 1.6651, + "step": 1277 + }, + { + "epoch": 0.17053642914331466, + "grad_norm": 1.3692883154733826, + "learning_rate": 1.8982185792550495e-05, + "loss": 1.7462, + "step": 1278 + }, + { + "epoch": 0.1706698692287163, + "grad_norm": 1.335470306118413, + "learning_rate": 1.8980285264461404e-05, + "loss": 1.6328, + "step": 1279 + }, + { + "epoch": 0.17080330931411797, + "grad_norm": 1.609394077858067, + "learning_rate": 1.8978383058957272e-05, + "loss": 1.7349, + "step": 1280 + }, + { + "epoch": 0.1709367493995196, + "grad_norm": 1.0926180749124967, + "learning_rate": 1.8976479176393405e-05, + "loss": 1.6251, + "step": 1281 + }, + { + "epoch": 0.17107018948492128, + "grad_norm": 1.5796826003726887, + "learning_rate": 1.8974573617125433e-05, + "loss": 1.7046, + "step": 1282 + }, + { + "epoch": 0.17120362957032292, + "grad_norm": 1.2777834800327206, + "learning_rate": 1.8972666381509283e-05, + "loss": 1.6877, + "step": 1283 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 1.1654445244110958, + "learning_rate": 1.8970757469901215e-05, + "loss": 1.6931, + "step": 1284 + }, + { + "epoch": 0.17147050974112624, + "grad_norm": 1.2695593578179767, + "learning_rate": 1.8968846882657784e-05, + "loss": 1.6171, + "step": 1285 + }, + { + "epoch": 0.17160394982652788, + "grad_norm": 1.1260255267285963, + "learning_rate": 1.896693462013587e-05, + "loss": 1.6733, + "step": 1286 + }, + { + "epoch": 0.17173738991192955, + "grad_norm": 1.1761366558123016, + "learning_rate": 1.8965020682692662e-05, + "loss": 1.6513, + "step": 1287 + }, + { + "epoch": 0.1718708299973312, + "grad_norm": 1.3298600101322868, + "learning_rate": 1.8963105070685667e-05, + "loss": 1.6684, + "step": 1288 + }, + { + "epoch": 0.17200427008273286, + "grad_norm": 1.1292417955776037, + "learning_rate": 1.8961187784472685e-05, + "loss": 1.6489, + "step": 1289 + }, + { + "epoch": 0.1721377101681345, + "grad_norm": 1.2976031015347522, + "learning_rate": 1.8959268824411857e-05, + "loss": 1.7234, + "step": 1290 + }, + { + "epoch": 0.17227115025353618, + "grad_norm": 5.468850373981861, + "learning_rate": 1.8957348190861612e-05, + "loss": 1.6717, + "step": 1291 + }, + { + "epoch": 0.17240459033893782, + "grad_norm": 1.2061510206612853, + "learning_rate": 1.8955425884180715e-05, + "loss": 1.6682, + "step": 1292 + }, + { + "epoch": 0.17253803042433946, + "grad_norm": 1.2340106256554149, + "learning_rate": 1.895350190472822e-05, + "loss": 1.6452, + "step": 1293 + }, + { + "epoch": 0.17267147050974113, + "grad_norm": 1.1295754093637331, + "learning_rate": 1.895157625286351e-05, + "loss": 1.6993, + "step": 1294 + }, + { + "epoch": 0.17280491059514277, + "grad_norm": 1.1791235092887307, + "learning_rate": 1.8949648928946275e-05, + "loss": 1.6347, + "step": 1295 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 1.0917941158292654, + "learning_rate": 1.894771993333651e-05, + "loss": 1.6361, + "step": 
1296 + }, + { + "epoch": 0.1730717907659461, + "grad_norm": 1.243225904269034, + "learning_rate": 1.8945789266394543e-05, + "loss": 1.6859, + "step": 1297 + }, + { + "epoch": 0.17320523085134776, + "grad_norm": 1.4426394116019783, + "learning_rate": 1.894385692848099e-05, + "loss": 1.6686, + "step": 1298 + }, + { + "epoch": 0.1733386709367494, + "grad_norm": 1.1402525280881741, + "learning_rate": 1.8941922919956788e-05, + "loss": 1.6656, + "step": 1299 + }, + { + "epoch": 0.17347211102215104, + "grad_norm": 1.1712942843243668, + "learning_rate": 1.8939987241183195e-05, + "loss": 1.6622, + "step": 1300 + }, + { + "epoch": 0.1736055511075527, + "grad_norm": 1.1420488961794493, + "learning_rate": 1.8938049892521772e-05, + "loss": 1.6048, + "step": 1301 + }, + { + "epoch": 0.17373899119295436, + "grad_norm": 1.4695327916997878, + "learning_rate": 1.8936110874334392e-05, + "loss": 1.6625, + "step": 1302 + }, + { + "epoch": 0.17387243127835603, + "grad_norm": 1.0900641908664435, + "learning_rate": 1.8934170186983242e-05, + "loss": 1.6396, + "step": 1303 + }, + { + "epoch": 0.17400587136375767, + "grad_norm": 1.1307482898544658, + "learning_rate": 1.8932227830830822e-05, + "loss": 1.6878, + "step": 1304 + }, + { + "epoch": 0.17413931144915934, + "grad_norm": 1.0752115709650287, + "learning_rate": 1.8930283806239936e-05, + "loss": 1.6868, + "step": 1305 + }, + { + "epoch": 0.17427275153456098, + "grad_norm": 1.3371575151194106, + "learning_rate": 1.8928338113573715e-05, + "loss": 1.6374, + "step": 1306 + }, + { + "epoch": 0.17440619161996262, + "grad_norm": 1.2309293597249826, + "learning_rate": 1.8926390753195583e-05, + "loss": 1.6629, + "step": 1307 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 1.1999413493774056, + "learning_rate": 1.8924441725469293e-05, + "loss": 1.6837, + "step": 1308 + }, + { + "epoch": 0.17467307179076594, + "grad_norm": 4.1911788978313735, + "learning_rate": 1.8922491030758893e-05, + "loss": 1.7471, + "step": 1309 + }, + { + "epoch": 0.1748065118761676, + "grad_norm": 1.1779561572349584, + "learning_rate": 1.892053866942876e-05, + "loss": 1.7246, + "step": 1310 + }, + { + "epoch": 0.17493995196156925, + "grad_norm": 1.1617629781133412, + "learning_rate": 1.8918584641843564e-05, + "loss": 1.6462, + "step": 1311 + }, + { + "epoch": 0.17507339204697092, + "grad_norm": 1.2004531219911387, + "learning_rate": 1.89166289483683e-05, + "loss": 1.6432, + "step": 1312 + }, + { + "epoch": 0.17520683213237256, + "grad_norm": 1.1057648722865505, + "learning_rate": 1.891467158936827e-05, + "loss": 1.6363, + "step": 1313 + }, + { + "epoch": 0.1753402722177742, + "grad_norm": 1.0830292426032284, + "learning_rate": 1.8912712565209082e-05, + "loss": 1.6543, + "step": 1314 + }, + { + "epoch": 0.17547371230317588, + "grad_norm": 1.390700802694287, + "learning_rate": 1.8910751876256663e-05, + "loss": 1.5831, + "step": 1315 + }, + { + "epoch": 0.17560715238857752, + "grad_norm": 1.1442545355007538, + "learning_rate": 1.890878952287724e-05, + "loss": 1.6725, + "step": 1316 + }, + { + "epoch": 0.1757405924739792, + "grad_norm": 4.524949596846156, + "learning_rate": 1.8906825505437375e-05, + "loss": 1.7064, + "step": 1317 + }, + { + "epoch": 0.17587403255938083, + "grad_norm": 1.5116207526291794, + "learning_rate": 1.8904859824303906e-05, + "loss": 1.6625, + "step": 1318 + }, + { + "epoch": 0.1760074726447825, + "grad_norm": 1.3022482630711962, + "learning_rate": 1.8902892479844012e-05, + "loss": 1.6899, + "step": 1319 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 
1.3036609447195004, + "learning_rate": 1.8900923472425162e-05, + "loss": 1.7094, + "step": 1320 + }, + { + "epoch": 0.1762743528155858, + "grad_norm": 1.201826945742321, + "learning_rate": 1.889895280241515e-05, + "loss": 1.6874, + "step": 1321 + }, + { + "epoch": 0.17640779290098746, + "grad_norm": 1.331895493241366, + "learning_rate": 1.8896980470182074e-05, + "loss": 1.6567, + "step": 1322 + }, + { + "epoch": 0.1765412329863891, + "grad_norm": 1.2443633859323437, + "learning_rate": 1.889500647609434e-05, + "loss": 1.667, + "step": 1323 + }, + { + "epoch": 0.17667467307179077, + "grad_norm": 1.2258655395860159, + "learning_rate": 1.8893030820520676e-05, + "loss": 1.6747, + "step": 1324 + }, + { + "epoch": 0.17680811315719241, + "grad_norm": 1.1121857279555136, + "learning_rate": 1.8891053503830096e-05, + "loss": 1.6623, + "step": 1325 + }, + { + "epoch": 0.17694155324259409, + "grad_norm": 1.1496733774130343, + "learning_rate": 1.8889074526391956e-05, + "loss": 1.6505, + "step": 1326 + }, + { + "epoch": 0.17707499332799573, + "grad_norm": 1.190622161497496, + "learning_rate": 1.8887093888575897e-05, + "loss": 1.6946, + "step": 1327 + }, + { + "epoch": 0.1772084334133974, + "grad_norm": 1.234769822108329, + "learning_rate": 1.8885111590751887e-05, + "loss": 1.6822, + "step": 1328 + }, + { + "epoch": 0.17734187349879904, + "grad_norm": 1.1961580846084812, + "learning_rate": 1.8883127633290193e-05, + "loss": 1.6098, + "step": 1329 + }, + { + "epoch": 0.17747531358420068, + "grad_norm": 1.1500701811676424, + "learning_rate": 1.888114201656139e-05, + "loss": 1.6979, + "step": 1330 + }, + { + "epoch": 0.17760875366960235, + "grad_norm": 1.068835069330444, + "learning_rate": 1.887915474093638e-05, + "loss": 1.6443, + "step": 1331 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 1.0715312154596754, + "learning_rate": 1.8877165806786353e-05, + "loss": 1.6178, + "step": 1332 + }, + { + "epoch": 0.17787563384040567, + "grad_norm": 1.3293068311768508, + "learning_rate": 1.8875175214482823e-05, + "loss": 1.6494, + "step": 1333 + }, + { + "epoch": 0.1780090739258073, + "grad_norm": 1.2954287617747784, + "learning_rate": 1.8873182964397614e-05, + "loss": 1.6546, + "step": 1334 + }, + { + "epoch": 0.17814251401120898, + "grad_norm": 1.125061002872152, + "learning_rate": 1.887118905690285e-05, + "loss": 1.6897, + "step": 1335 + }, + { + "epoch": 0.17827595409661062, + "grad_norm": 1.0654922527296347, + "learning_rate": 1.8869193492370975e-05, + "loss": 1.6899, + "step": 1336 + }, + { + "epoch": 0.17840939418201227, + "grad_norm": 1.2859684785579713, + "learning_rate": 1.8867196271174732e-05, + "loss": 1.6489, + "step": 1337 + }, + { + "epoch": 0.17854283426741394, + "grad_norm": 1.2913796594054934, + "learning_rate": 1.8865197393687184e-05, + "loss": 1.6531, + "step": 1338 + }, + { + "epoch": 0.17867627435281558, + "grad_norm": 1.1287917089144135, + "learning_rate": 1.8863196860281696e-05, + "loss": 1.7184, + "step": 1339 + }, + { + "epoch": 0.17880971443821725, + "grad_norm": 1.066588369186229, + "learning_rate": 1.8861194671331946e-05, + "loss": 1.7052, + "step": 1340 + }, + { + "epoch": 0.1789431545236189, + "grad_norm": 1.3155491787656075, + "learning_rate": 1.8859190827211922e-05, + "loss": 1.739, + "step": 1341 + }, + { + "epoch": 0.17907659460902056, + "grad_norm": 1.351336369334145, + "learning_rate": 1.8857185328295915e-05, + "loss": 1.6433, + "step": 1342 + }, + { + "epoch": 0.1792100346944222, + "grad_norm": 1.144045347845426, + "learning_rate": 1.885517817495853e-05, + "loss": 1.6588, + 
"step": 1343 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 1.0634778809440733, + "learning_rate": 1.8853169367574684e-05, + "loss": 1.6542, + "step": 1344 + }, + { + "epoch": 0.17947691486522552, + "grad_norm": 1.177888372263127, + "learning_rate": 1.88511589065196e-05, + "loss": 1.6408, + "step": 1345 + }, + { + "epoch": 0.17961035495062716, + "grad_norm": 1.2708302318930562, + "learning_rate": 1.88491467921688e-05, + "loss": 1.6817, + "step": 1346 + }, + { + "epoch": 0.17974379503602883, + "grad_norm": 1.170081033949634, + "learning_rate": 1.8847133024898135e-05, + "loss": 1.6806, + "step": 1347 + }, + { + "epoch": 0.17987723512143047, + "grad_norm": 1.0804261336260308, + "learning_rate": 1.884511760508375e-05, + "loss": 1.6134, + "step": 1348 + }, + { + "epoch": 0.18001067520683214, + "grad_norm": 1.087811387530533, + "learning_rate": 1.88431005331021e-05, + "loss": 1.6393, + "step": 1349 + }, + { + "epoch": 0.1801441152922338, + "grad_norm": 1.3853627907695811, + "learning_rate": 1.8841081809329952e-05, + "loss": 1.7002, + "step": 1350 + }, + { + "epoch": 0.18027755537763543, + "grad_norm": 1.1966968339164359, + "learning_rate": 1.8839061434144383e-05, + "loss": 1.6703, + "step": 1351 + }, + { + "epoch": 0.1804109954630371, + "grad_norm": 1.1737840062211229, + "learning_rate": 1.8837039407922776e-05, + "loss": 1.7185, + "step": 1352 + }, + { + "epoch": 0.18054443554843874, + "grad_norm": 1.0896871608203742, + "learning_rate": 1.883501573104282e-05, + "loss": 1.6583, + "step": 1353 + }, + { + "epoch": 0.1806778756338404, + "grad_norm": 1.3179615694791937, + "learning_rate": 1.883299040388251e-05, + "loss": 1.6311, + "step": 1354 + }, + { + "epoch": 0.18081131571924206, + "grad_norm": 1.1110191440501396, + "learning_rate": 1.883096342682017e-05, + "loss": 1.6461, + "step": 1355 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 1.327469108458132, + "learning_rate": 1.88289348002344e-05, + "loss": 1.624, + "step": 1356 + }, + { + "epoch": 0.18107819589004537, + "grad_norm": 1.1046636131234442, + "learning_rate": 1.882690452450413e-05, + "loss": 1.6798, + "step": 1357 + }, + { + "epoch": 0.181211635975447, + "grad_norm": 1.0969053723393494, + "learning_rate": 1.8824872600008595e-05, + "loss": 1.6196, + "step": 1358 + }, + { + "epoch": 0.18134507606084868, + "grad_norm": 4.9414750848280455, + "learning_rate": 1.8822839027127333e-05, + "loss": 1.6521, + "step": 1359 + }, + { + "epoch": 0.18147851614625032, + "grad_norm": 1.1053740384798774, + "learning_rate": 1.8820803806240192e-05, + "loss": 1.6409, + "step": 1360 + }, + { + "epoch": 0.181611956231652, + "grad_norm": 1.1800356925014113, + "learning_rate": 1.881876693772733e-05, + "loss": 1.6952, + "step": 1361 + }, + { + "epoch": 0.18174539631705364, + "grad_norm": 1.339932629654733, + "learning_rate": 1.8816728421969208e-05, + "loss": 1.6583, + "step": 1362 + }, + { + "epoch": 0.1818788364024553, + "grad_norm": 1.1809439235732044, + "learning_rate": 1.88146882593466e-05, + "loss": 1.657, + "step": 1363 + }, + { + "epoch": 0.18201227648785695, + "grad_norm": 1.1124814368658855, + "learning_rate": 1.8812646450240586e-05, + "loss": 1.6248, + "step": 1364 + }, + { + "epoch": 0.18214571657325862, + "grad_norm": 1.3035826545815201, + "learning_rate": 1.881060299503255e-05, + "loss": 1.6932, + "step": 1365 + }, + { + "epoch": 0.18227915665866026, + "grad_norm": 1.2040318351320083, + "learning_rate": 1.8808557894104185e-05, + "loss": 1.6214, + "step": 1366 + }, + { + "epoch": 0.1824125967440619, + "grad_norm": 1.2476963311275309, + 
"learning_rate": 1.8806511147837492e-05, + "loss": 1.6661, + "step": 1367 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 1.1447416720262047, + "learning_rate": 1.8804462756614788e-05, + "loss": 1.6795, + "step": 1368 + }, + { + "epoch": 0.18267947691486522, + "grad_norm": 1.2146315931776808, + "learning_rate": 1.8802412720818683e-05, + "loss": 1.6683, + "step": 1369 + }, + { + "epoch": 0.1828129170002669, + "grad_norm": 1.0961663105222377, + "learning_rate": 1.8800361040832098e-05, + "loss": 1.6522, + "step": 1370 + }, + { + "epoch": 0.18294635708566853, + "grad_norm": 1.4720443132763514, + "learning_rate": 1.8798307717038267e-05, + "loss": 1.6972, + "step": 1371 + }, + { + "epoch": 0.1830797971710702, + "grad_norm": 1.077369472777587, + "learning_rate": 1.8796252749820728e-05, + "loss": 1.6867, + "step": 1372 + }, + { + "epoch": 0.18321323725647184, + "grad_norm": 1.082352978247377, + "learning_rate": 1.8794196139563324e-05, + "loss": 1.6186, + "step": 1373 + }, + { + "epoch": 0.1833466773418735, + "grad_norm": 1.3376025342187408, + "learning_rate": 1.8792137886650207e-05, + "loss": 1.6242, + "step": 1374 + }, + { + "epoch": 0.18348011742727516, + "grad_norm": 1.060000822920305, + "learning_rate": 1.8790077991465833e-05, + "loss": 1.6694, + "step": 1375 + }, + { + "epoch": 0.1836135575126768, + "grad_norm": 1.1946549594821052, + "learning_rate": 1.8788016454394972e-05, + "loss": 1.7262, + "step": 1376 + }, + { + "epoch": 0.18374699759807847, + "grad_norm": 1.1956085836847756, + "learning_rate": 1.8785953275822694e-05, + "loss": 1.6692, + "step": 1377 + }, + { + "epoch": 0.1838804376834801, + "grad_norm": 1.398886222221775, + "learning_rate": 1.8783888456134373e-05, + "loss": 1.6827, + "step": 1378 + }, + { + "epoch": 0.18401387776888178, + "grad_norm": 1.0879716586164903, + "learning_rate": 1.8781821995715694e-05, + "loss": 1.6799, + "step": 1379 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 1.0905252446482103, + "learning_rate": 1.8779753894952658e-05, + "loss": 1.6059, + "step": 1380 + }, + { + "epoch": 0.18428075793968507, + "grad_norm": 11.800697000575934, + "learning_rate": 1.877768415423155e-05, + "loss": 1.7131, + "step": 1381 + }, + { + "epoch": 0.18441419802508674, + "grad_norm": 1.4694858089280596, + "learning_rate": 1.8775612773938984e-05, + "loss": 1.6846, + "step": 1382 + }, + { + "epoch": 0.18454763811048838, + "grad_norm": 1.4328576424428283, + "learning_rate": 1.877353975446186e-05, + "loss": 1.6188, + "step": 1383 + }, + { + "epoch": 0.18468107819589005, + "grad_norm": 1.3413019372811694, + "learning_rate": 1.8771465096187404e-05, + "loss": 1.6835, + "step": 1384 + }, + { + "epoch": 0.1848145182812917, + "grad_norm": 1.1291865647681716, + "learning_rate": 1.8769388799503138e-05, + "loss": 1.6709, + "step": 1385 + }, + { + "epoch": 0.18494795836669337, + "grad_norm": 1.2530526469172218, + "learning_rate": 1.876731086479688e-05, + "loss": 1.6951, + "step": 1386 + }, + { + "epoch": 0.185081398452095, + "grad_norm": 1.1547439851135595, + "learning_rate": 1.876523129245678e-05, + "loss": 1.6539, + "step": 1387 + }, + { + "epoch": 0.18521483853749665, + "grad_norm": 1.083610486699219, + "learning_rate": 1.8763150082871264e-05, + "loss": 1.6559, + "step": 1388 + }, + { + "epoch": 0.18534827862289832, + "grad_norm": 1.3139745218385874, + "learning_rate": 1.876106723642909e-05, + "loss": 1.6658, + "step": 1389 + }, + { + "epoch": 0.18548171870829996, + "grad_norm": 1.2845524278681613, + "learning_rate": 1.8758982753519302e-05, + "loss": 1.6613, + "step": 1390 + 
}, + { + "epoch": 0.18561515879370163, + "grad_norm": 1.3886592309613566, + "learning_rate": 1.875689663453126e-05, + "loss": 1.6973, + "step": 1391 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 1.2127883048481998, + "learning_rate": 1.8754808879854628e-05, + "loss": 1.6643, + "step": 1392 + }, + { + "epoch": 0.18588203896450495, + "grad_norm": 1.1707525451232437, + "learning_rate": 1.8752719489879373e-05, + "loss": 1.6844, + "step": 1393 + }, + { + "epoch": 0.1860154790499066, + "grad_norm": 1.2601795453565978, + "learning_rate": 1.8750628464995775e-05, + "loss": 1.7148, + "step": 1394 + }, + { + "epoch": 0.18614891913530823, + "grad_norm": 1.1313620285698516, + "learning_rate": 1.8748535805594406e-05, + "loss": 1.6856, + "step": 1395 + }, + { + "epoch": 0.1862823592207099, + "grad_norm": 1.2540203774883083, + "learning_rate": 1.8746441512066155e-05, + "loss": 1.6585, + "step": 1396 + }, + { + "epoch": 0.18641579930611155, + "grad_norm": 1.218035613217223, + "learning_rate": 1.8744345584802212e-05, + "loss": 1.6955, + "step": 1397 + }, + { + "epoch": 0.18654923939151322, + "grad_norm": 1.106574827128144, + "learning_rate": 1.8742248024194073e-05, + "loss": 1.6404, + "step": 1398 + }, + { + "epoch": 0.18668267947691486, + "grad_norm": 1.1254058920479488, + "learning_rate": 1.8740148830633534e-05, + "loss": 1.6456, + "step": 1399 + }, + { + "epoch": 0.18681611956231653, + "grad_norm": 1.1988842778577748, + "learning_rate": 1.8738048004512707e-05, + "loss": 1.6735, + "step": 1400 + }, + { + "epoch": 0.18694955964771817, + "grad_norm": 1.1597783939590798, + "learning_rate": 1.8735945546224e-05, + "loss": 1.6463, + "step": 1401 + }, + { + "epoch": 0.18708299973311984, + "grad_norm": 1.1108971920248814, + "learning_rate": 1.8733841456160123e-05, + "loss": 1.7286, + "step": 1402 + }, + { + "epoch": 0.18721643981852149, + "grad_norm": 1.195745553995662, + "learning_rate": 1.8731735734714106e-05, + "loss": 1.655, + "step": 1403 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 1.1437291616721488, + "learning_rate": 1.8729628382279265e-05, + "loss": 1.7116, + "step": 1404 + }, + { + "epoch": 0.1874833199893248, + "grad_norm": 1.2582582933611897, + "learning_rate": 1.872751939924923e-05, + "loss": 1.6831, + "step": 1405 + }, + { + "epoch": 0.18761676007472644, + "grad_norm": 1.126051643445675, + "learning_rate": 1.8725408786017942e-05, + "loss": 1.6487, + "step": 1406 + }, + { + "epoch": 0.1877502001601281, + "grad_norm": 1.2320240819480008, + "learning_rate": 1.8723296542979635e-05, + "loss": 1.7051, + "step": 1407 + }, + { + "epoch": 0.18788364024552975, + "grad_norm": 1.1614213897490868, + "learning_rate": 1.872118267052885e-05, + "loss": 1.6758, + "step": 1408 + }, + { + "epoch": 0.18801708033093142, + "grad_norm": 1.2384327956894376, + "learning_rate": 1.8719067169060436e-05, + "loss": 1.6544, + "step": 1409 + }, + { + "epoch": 0.18815052041633307, + "grad_norm": 1.4000168772794017, + "learning_rate": 1.8716950038969547e-05, + "loss": 1.6822, + "step": 1410 + }, + { + "epoch": 0.1882839605017347, + "grad_norm": 1.1742590284506205, + "learning_rate": 1.871483128065164e-05, + "loss": 1.6987, + "step": 1411 + }, + { + "epoch": 0.18841740058713638, + "grad_norm": 1.206314558147808, + "learning_rate": 1.8712710894502465e-05, + "loss": 1.6341, + "step": 1412 + }, + { + "epoch": 0.18855084067253802, + "grad_norm": 1.199751013189083, + "learning_rate": 1.8710588880918098e-05, + "loss": 1.6946, + "step": 1413 + }, + { + "epoch": 0.1886842807579397, + "grad_norm": 1.2017855345339992, + 
"learning_rate": 1.8708465240294902e-05, + "loss": 1.6726, + "step": 1414 + }, + { + "epoch": 0.18881772084334134, + "grad_norm": 1.2227953100114268, + "learning_rate": 1.870633997302955e-05, + "loss": 1.6407, + "step": 1415 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 1.2964391199169882, + "learning_rate": 1.8704213079519008e-05, + "loss": 1.6367, + "step": 1416 + }, + { + "epoch": 0.18908460101414465, + "grad_norm": 1.2207494370258463, + "learning_rate": 1.8702084560160572e-05, + "loss": 1.7262, + "step": 1417 + }, + { + "epoch": 0.1892180410995463, + "grad_norm": 5.905607034298311, + "learning_rate": 1.8699954415351813e-05, + "loss": 1.5948, + "step": 1418 + }, + { + "epoch": 0.18935148118494796, + "grad_norm": 1.1469661893942433, + "learning_rate": 1.8697822645490625e-05, + "loss": 1.6524, + "step": 1419 + }, + { + "epoch": 0.1894849212703496, + "grad_norm": 1.1259620274025783, + "learning_rate": 1.8695689250975193e-05, + "loss": 1.6331, + "step": 1420 + }, + { + "epoch": 0.18961836135575127, + "grad_norm": 1.0763567309071966, + "learning_rate": 1.8693554232204014e-05, + "loss": 1.7, + "step": 1421 + }, + { + "epoch": 0.18975180144115292, + "grad_norm": 1.1307077011187483, + "learning_rate": 1.8691417589575885e-05, + "loss": 1.6599, + "step": 1422 + }, + { + "epoch": 0.1898852415265546, + "grad_norm": 1.2777295336158558, + "learning_rate": 1.8689279323489902e-05, + "loss": 1.6505, + "step": 1423 + }, + { + "epoch": 0.19001868161195623, + "grad_norm": 1.2379608805979987, + "learning_rate": 1.8687139434345478e-05, + "loss": 1.6474, + "step": 1424 + }, + { + "epoch": 0.19015212169735787, + "grad_norm": 1.080970192917941, + "learning_rate": 1.8684997922542313e-05, + "loss": 1.6836, + "step": 1425 + }, + { + "epoch": 0.19028556178275954, + "grad_norm": 1.0712881958144271, + "learning_rate": 1.8682854788480417e-05, + "loss": 1.6071, + "step": 1426 + }, + { + "epoch": 0.1904190018681612, + "grad_norm": 1.26548294983667, + "learning_rate": 1.8680710032560102e-05, + "loss": 1.6904, + "step": 1427 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 1.0469874519952864, + "learning_rate": 1.867856365518199e-05, + "loss": 1.6422, + "step": 1428 + }, + { + "epoch": 0.1906858820389645, + "grad_norm": 1.0942346487916967, + "learning_rate": 1.8676415656746996e-05, + "loss": 1.6864, + "step": 1429 + }, + { + "epoch": 0.19081932212436617, + "grad_norm": 1.103259467200675, + "learning_rate": 1.8674266037656343e-05, + "loss": 1.6855, + "step": 1430 + }, + { + "epoch": 0.1909527622097678, + "grad_norm": 1.0683367237417463, + "learning_rate": 1.867211479831155e-05, + "loss": 1.6525, + "step": 1431 + }, + { + "epoch": 0.19108620229516948, + "grad_norm": 1.1121615299096883, + "learning_rate": 1.8669961939114457e-05, + "loss": 1.6904, + "step": 1432 + }, + { + "epoch": 0.19121964238057113, + "grad_norm": 1.1617841181260873, + "learning_rate": 1.8667807460467178e-05, + "loss": 1.6352, + "step": 1433 + }, + { + "epoch": 0.19135308246597277, + "grad_norm": 1.084146580072923, + "learning_rate": 1.8665651362772152e-05, + "loss": 1.6905, + "step": 1434 + }, + { + "epoch": 0.19148652255137444, + "grad_norm": 1.1047653403081839, + "learning_rate": 1.8663493646432116e-05, + "loss": 1.7152, + "step": 1435 + }, + { + "epoch": 0.19161996263677608, + "grad_norm": 1.0986414741598525, + "learning_rate": 1.8661334311850104e-05, + "loss": 1.6874, + "step": 1436 + }, + { + "epoch": 0.19175340272217775, + "grad_norm": 1.3689551704009346, + "learning_rate": 1.8659173359429455e-05, + "loss": 1.6464, + "step": 1437 + }, 
+ { + "epoch": 0.1918868428075794, + "grad_norm": 1.2060959441159418, + "learning_rate": 1.8657010789573812e-05, + "loss": 1.7476, + "step": 1438 + }, + { + "epoch": 0.19202028289298106, + "grad_norm": 1.094136327662655, + "learning_rate": 1.8654846602687112e-05, + "loss": 1.6862, + "step": 1439 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 1.1003792674170316, + "learning_rate": 1.865268079917361e-05, + "loss": 1.7001, + "step": 1440 + }, + { + "epoch": 0.19228716306378435, + "grad_norm": 1.397260975809977, + "learning_rate": 1.8650513379437854e-05, + "loss": 1.6541, + "step": 1441 + }, + { + "epoch": 0.19242060314918602, + "grad_norm": 1.1317586552025098, + "learning_rate": 1.864834434388468e-05, + "loss": 1.654, + "step": 1442 + }, + { + "epoch": 0.19255404323458766, + "grad_norm": 1.2801188417922258, + "learning_rate": 1.8646173692919254e-05, + "loss": 1.72, + "step": 1443 + }, + { + "epoch": 0.19268748331998933, + "grad_norm": 1.1044353428107876, + "learning_rate": 1.8644001426947017e-05, + "loss": 1.6633, + "step": 1444 + }, + { + "epoch": 0.19282092340539098, + "grad_norm": 1.7385377002254025, + "learning_rate": 1.8641827546373736e-05, + "loss": 1.6934, + "step": 1445 + }, + { + "epoch": 0.19295436349079265, + "grad_norm": 1.1251578855313504, + "learning_rate": 1.8639652051605454e-05, + "loss": 1.6855, + "step": 1446 + }, + { + "epoch": 0.1930878035761943, + "grad_norm": 1.0823798317472224, + "learning_rate": 1.8637474943048538e-05, + "loss": 1.6851, + "step": 1447 + }, + { + "epoch": 0.19322124366159593, + "grad_norm": 1.2414924209927352, + "learning_rate": 1.8635296221109643e-05, + "loss": 1.6342, + "step": 1448 + }, + { + "epoch": 0.1933546837469976, + "grad_norm": 1.081073074757343, + "learning_rate": 1.8633115886195733e-05, + "loss": 1.6326, + "step": 1449 + }, + { + "epoch": 0.19348812383239924, + "grad_norm": 1.3276067715392756, + "learning_rate": 1.863093393871406e-05, + "loss": 1.682, + "step": 1450 + }, + { + "epoch": 0.19362156391780092, + "grad_norm": 1.0512206782809177, + "learning_rate": 1.86287503790722e-05, + "loss": 1.6339, + "step": 1451 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 1.1118643544696887, + "learning_rate": 1.8626565207678014e-05, + "loss": 1.6412, + "step": 1452 + }, + { + "epoch": 0.19388844408860423, + "grad_norm": 1.1881349320604198, + "learning_rate": 1.862437842493966e-05, + "loss": 1.6535, + "step": 1453 + }, + { + "epoch": 0.19402188417400587, + "grad_norm": 1.0650617509844336, + "learning_rate": 1.8622190031265608e-05, + "loss": 1.6608, + "step": 1454 + }, + { + "epoch": 0.1941553242594075, + "grad_norm": 1.2022764202107716, + "learning_rate": 1.8620000027064625e-05, + "loss": 1.6818, + "step": 1455 + }, + { + "epoch": 0.19428876434480918, + "grad_norm": 1.0945586894168828, + "learning_rate": 1.861780841274578e-05, + "loss": 1.6518, + "step": 1456 + }, + { + "epoch": 0.19442220443021083, + "grad_norm": 1.181691965256765, + "learning_rate": 1.8615615188718442e-05, + "loss": 1.6526, + "step": 1457 + }, + { + "epoch": 0.1945556445156125, + "grad_norm": 1.21873002721777, + "learning_rate": 1.861342035539228e-05, + "loss": 1.6685, + "step": 1458 + }, + { + "epoch": 0.19468908460101414, + "grad_norm": 1.1888383727594938, + "learning_rate": 1.8611223913177264e-05, + "loss": 1.6939, + "step": 1459 + }, + { + "epoch": 0.1948225246864158, + "grad_norm": 1.1092188067858701, + "learning_rate": 1.8609025862483658e-05, + "loss": 1.7001, + "step": 1460 + }, + { + "epoch": 0.19495596477181745, + "grad_norm": 1.4312758954493943, + 
"learning_rate": 1.8606826203722043e-05, + "loss": 1.6334, + "step": 1461 + }, + { + "epoch": 0.1950894048572191, + "grad_norm": 1.2022595063751074, + "learning_rate": 1.8604624937303285e-05, + "loss": 1.6806, + "step": 1462 + }, + { + "epoch": 0.19522284494262077, + "grad_norm": 1.0171052837252916, + "learning_rate": 1.8602422063638554e-05, + "loss": 1.6488, + "step": 1463 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 1.3510050011424308, + "learning_rate": 1.860021758313933e-05, + "loss": 1.6833, + "step": 1464 + }, + { + "epoch": 0.19548972511342408, + "grad_norm": 1.257684566145725, + "learning_rate": 1.859801149621737e-05, + "loss": 1.6647, + "step": 1465 + }, + { + "epoch": 0.19562316519882572, + "grad_norm": 1.1217536119317055, + "learning_rate": 1.8595803803284757e-05, + "loss": 1.6662, + "step": 1466 + }, + { + "epoch": 0.1957566052842274, + "grad_norm": 1.1668773044915945, + "learning_rate": 1.8593594504753863e-05, + "loss": 1.711, + "step": 1467 + }, + { + "epoch": 0.19589004536962903, + "grad_norm": 1.1958090413202758, + "learning_rate": 1.8591383601037357e-05, + "loss": 1.709, + "step": 1468 + }, + { + "epoch": 0.1960234854550307, + "grad_norm": 1.3120713864562594, + "learning_rate": 1.8589171092548208e-05, + "loss": 1.6493, + "step": 1469 + }, + { + "epoch": 0.19615692554043235, + "grad_norm": 1.1266277557758897, + "learning_rate": 1.8586956979699692e-05, + "loss": 1.6786, + "step": 1470 + }, + { + "epoch": 0.196290365625834, + "grad_norm": 2.464652796809538, + "learning_rate": 1.858474126290538e-05, + "loss": 1.6538, + "step": 1471 + }, + { + "epoch": 0.19642380571123566, + "grad_norm": 1.358175538034263, + "learning_rate": 1.8582523942579138e-05, + "loss": 1.6298, + "step": 1472 + }, + { + "epoch": 0.1965572457966373, + "grad_norm": 1.1218515326836789, + "learning_rate": 1.8580305019135144e-05, + "loss": 1.6807, + "step": 1473 + }, + { + "epoch": 0.19669068588203897, + "grad_norm": 1.1314598132036782, + "learning_rate": 1.8578084492987857e-05, + "loss": 1.6085, + "step": 1474 + }, + { + "epoch": 0.19682412596744062, + "grad_norm": 1.0806632798443008, + "learning_rate": 1.8575862364552056e-05, + "loss": 1.6943, + "step": 1475 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 1.0456494280144883, + "learning_rate": 1.8573638634242806e-05, + "loss": 1.6425, + "step": 1476 + }, + { + "epoch": 0.19709100613824393, + "grad_norm": 1.4643633002109615, + "learning_rate": 1.857141330247547e-05, + "loss": 1.717, + "step": 1477 + }, + { + "epoch": 0.19722444622364557, + "grad_norm": 1.3907399472017639, + "learning_rate": 1.856918636966572e-05, + "loss": 1.6938, + "step": 1478 + }, + { + "epoch": 0.19735788630904724, + "grad_norm": 1.074837673820265, + "learning_rate": 1.856695783622952e-05, + "loss": 1.6547, + "step": 1479 + }, + { + "epoch": 0.19749132639444889, + "grad_norm": 1.1065228121324435, + "learning_rate": 1.8564727702583132e-05, + "loss": 1.6468, + "step": 1480 + }, + { + "epoch": 0.19762476647985056, + "grad_norm": 1.1931788828028584, + "learning_rate": 1.8562495969143125e-05, + "loss": 1.658, + "step": 1481 + }, + { + "epoch": 0.1977582065652522, + "grad_norm": 1.213866762163482, + "learning_rate": 1.8560262636326358e-05, + "loss": 1.631, + "step": 1482 + }, + { + "epoch": 0.19789164665065387, + "grad_norm": 1.099947809835569, + "learning_rate": 1.8558027704549993e-05, + "loss": 1.6351, + "step": 1483 + }, + { + "epoch": 0.1980250867360555, + "grad_norm": 1.1088893853131983, + "learning_rate": 1.8555791174231494e-05, + "loss": 1.6723, + "step": 1484 + }, + { + 
"epoch": 0.19815852682145715, + "grad_norm": 1.1011144028074373, + "learning_rate": 1.855355304578861e-05, + "loss": 1.707, + "step": 1485 + }, + { + "epoch": 0.19829196690685882, + "grad_norm": 1.3818385213484043, + "learning_rate": 1.85513133196394e-05, + "loss": 1.652, + "step": 1486 + }, + { + "epoch": 0.19842540699226047, + "grad_norm": 1.1229293638925342, + "learning_rate": 1.8549071996202225e-05, + "loss": 1.656, + "step": 1487 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 1.080075119649869, + "learning_rate": 1.8546829075895737e-05, + "loss": 1.7099, + "step": 1488 + }, + { + "epoch": 0.19869228716306378, + "grad_norm": 1.2852126608442598, + "learning_rate": 1.8544584559138888e-05, + "loss": 1.7062, + "step": 1489 + }, + { + "epoch": 0.19882572724846545, + "grad_norm": 1.14699198879905, + "learning_rate": 1.8542338446350924e-05, + "loss": 1.6963, + "step": 1490 + }, + { + "epoch": 0.1989591673338671, + "grad_norm": 1.0866913345021045, + "learning_rate": 1.85400907379514e-05, + "loss": 1.692, + "step": 1491 + }, + { + "epoch": 0.19909260741926874, + "grad_norm": 1.1345754144326983, + "learning_rate": 1.853784143436016e-05, + "loss": 1.6911, + "step": 1492 + }, + { + "epoch": 0.1992260475046704, + "grad_norm": 1.0859201677475647, + "learning_rate": 1.853559053599734e-05, + "loss": 1.7136, + "step": 1493 + }, + { + "epoch": 0.19935948759007205, + "grad_norm": 1.114793968801779, + "learning_rate": 1.8533338043283392e-05, + "loss": 1.6262, + "step": 1494 + }, + { + "epoch": 0.19949292767547372, + "grad_norm": 1.0646044544826536, + "learning_rate": 1.8531083956639054e-05, + "loss": 1.6374, + "step": 1495 + }, + { + "epoch": 0.19962636776087536, + "grad_norm": 1.0943515393210304, + "learning_rate": 1.8528828276485365e-05, + "loss": 1.6565, + "step": 1496 + }, + { + "epoch": 0.19975980784627703, + "grad_norm": 1.153219518432924, + "learning_rate": 1.852657100324366e-05, + "loss": 1.6468, + "step": 1497 + }, + { + "epoch": 0.19989324793167867, + "grad_norm": 1.3658364399948308, + "learning_rate": 1.8524312137335565e-05, + "loss": 1.6546, + "step": 1498 + }, + { + "epoch": 0.20002668801708032, + "grad_norm": 1.0830597310395975, + "learning_rate": 1.8522051679183017e-05, + "loss": 1.6464, + "step": 1499 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 1.0897256468244192, + "learning_rate": 1.8519789629208245e-05, + "loss": 1.6335, + "step": 1500 + }, + { + "epoch": 0.20029356818788363, + "grad_norm": 1.1152974632184582, + "learning_rate": 1.851752598783377e-05, + "loss": 1.6539, + "step": 1501 + }, + { + "epoch": 0.2004270082732853, + "grad_norm": 1.070943336830069, + "learning_rate": 1.8515260755482414e-05, + "loss": 1.663, + "step": 1502 + }, + { + "epoch": 0.20056044835868694, + "grad_norm": 1.1301005431558968, + "learning_rate": 1.85129939325773e-05, + "loss": 1.6386, + "step": 1503 + }, + { + "epoch": 0.20069388844408861, + "grad_norm": 1.4376814743628707, + "learning_rate": 1.8510725519541845e-05, + "loss": 1.6777, + "step": 1504 + }, + { + "epoch": 0.20082732852949026, + "grad_norm": 1.1571460766809745, + "learning_rate": 1.8508455516799762e-05, + "loss": 1.6458, + "step": 1505 + }, + { + "epoch": 0.20096076861489193, + "grad_norm": 1.526537352021282, + "learning_rate": 1.850618392477506e-05, + "loss": 1.6673, + "step": 1506 + }, + { + "epoch": 0.20109420870029357, + "grad_norm": 1.171428989010074, + "learning_rate": 1.8503910743892044e-05, + "loss": 1.6548, + "step": 1507 + }, + { + "epoch": 0.2012276487856952, + "grad_norm": 1.0654992362564706, + "learning_rate": 
1.8501635974575328e-05, + "loss": 1.6856, + "step": 1508 + }, + { + "epoch": 0.20136108887109688, + "grad_norm": 1.2869430405209437, + "learning_rate": 1.84993596172498e-05, + "loss": 1.6663, + "step": 1509 + }, + { + "epoch": 0.20149452895649853, + "grad_norm": 1.3280855964940679, + "learning_rate": 1.849708167234067e-05, + "loss": 1.6049, + "step": 1510 + }, + { + "epoch": 0.2016279690419002, + "grad_norm": 1.2170509648131613, + "learning_rate": 1.8494802140273423e-05, + "loss": 1.6645, + "step": 1511 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 1.1693660112296544, + "learning_rate": 1.8492521021473854e-05, + "loss": 1.6448, + "step": 1512 + }, + { + "epoch": 0.2018948492127035, + "grad_norm": 1.1495960407236363, + "learning_rate": 1.8490238316368048e-05, + "loss": 1.6717, + "step": 1513 + }, + { + "epoch": 0.20202828929810515, + "grad_norm": 1.0607525124108388, + "learning_rate": 1.848795402538239e-05, + "loss": 1.685, + "step": 1514 + }, + { + "epoch": 0.2021617293835068, + "grad_norm": 1.1588629194975635, + "learning_rate": 1.848566814894356e-05, + "loss": 1.6733, + "step": 1515 + }, + { + "epoch": 0.20229516946890846, + "grad_norm": 1.0477044055470504, + "learning_rate": 1.8483380687478535e-05, + "loss": 1.6364, + "step": 1516 + }, + { + "epoch": 0.2024286095543101, + "grad_norm": 1.3539461437967024, + "learning_rate": 1.848109164141458e-05, + "loss": 1.6329, + "step": 1517 + }, + { + "epoch": 0.20256204963971178, + "grad_norm": 1.316562445670924, + "learning_rate": 1.847880101117927e-05, + "loss": 1.7161, + "step": 1518 + }, + { + "epoch": 0.20269548972511342, + "grad_norm": 1.1325223006869773, + "learning_rate": 1.8476508797200464e-05, + "loss": 1.6976, + "step": 1519 + }, + { + "epoch": 0.2028289298105151, + "grad_norm": 1.1271240323882026, + "learning_rate": 1.847421499990633e-05, + "loss": 1.6724, + "step": 1520 + }, + { + "epoch": 0.20296236989591673, + "grad_norm": 1.1553839820996277, + "learning_rate": 1.8471919619725307e-05, + "loss": 1.6322, + "step": 1521 + }, + { + "epoch": 0.20309580998131838, + "grad_norm": 1.1027336658945248, + "learning_rate": 1.8469622657086158e-05, + "loss": 1.6447, + "step": 1522 + }, + { + "epoch": 0.20322925006672005, + "grad_norm": 1.183883565807101, + "learning_rate": 1.846732411241793e-05, + "loss": 1.6999, + "step": 1523 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 1.1642046420989085, + "learning_rate": 1.846502398614996e-05, + "loss": 1.7149, + "step": 1524 + }, + { + "epoch": 0.20349613023752336, + "grad_norm": 1.0755063096784014, + "learning_rate": 1.8462722278711887e-05, + "loss": 1.5943, + "step": 1525 + }, + { + "epoch": 0.203629570322925, + "grad_norm": 1.1419970032576714, + "learning_rate": 1.8460418990533647e-05, + "loss": 1.7073, + "step": 1526 + }, + { + "epoch": 0.20376301040832667, + "grad_norm": 1.302454031541249, + "learning_rate": 1.845811412204546e-05, + "loss": 1.6758, + "step": 1527 + }, + { + "epoch": 0.20389645049372832, + "grad_norm": 1.1668477615464077, + "learning_rate": 1.845580767367786e-05, + "loss": 1.694, + "step": 1528 + }, + { + "epoch": 0.20402989057912996, + "grad_norm": 1.1087107394603948, + "learning_rate": 1.8453499645861656e-05, + "loss": 1.691, + "step": 1529 + }, + { + "epoch": 0.20416333066453163, + "grad_norm": 1.0455825217353827, + "learning_rate": 1.8451190039027964e-05, + "loss": 1.6684, + "step": 1530 + }, + { + "epoch": 0.20429677074993327, + "grad_norm": 1.219445656841785, + "learning_rate": 1.8448878853608195e-05, + "loss": 1.7155, + "step": 1531 + }, + { + "epoch": 
0.20443021083533494, + "grad_norm": 1.335960910259822, + "learning_rate": 1.8446566090034053e-05, + "loss": 1.7326, + "step": 1532 + }, + { + "epoch": 0.20456365092073658, + "grad_norm": 1.271132783595969, + "learning_rate": 1.8444251748737526e-05, + "loss": 1.6686, + "step": 1533 + }, + { + "epoch": 0.20469709100613825, + "grad_norm": 1.0514626639526814, + "learning_rate": 1.844193583015092e-05, + "loss": 1.6366, + "step": 1534 + }, + { + "epoch": 0.2048305310915399, + "grad_norm": 1.0454900741987578, + "learning_rate": 1.8439618334706812e-05, + "loss": 1.6237, + "step": 1535 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 1.073823751577798, + "learning_rate": 1.8437299262838092e-05, + "loss": 1.6749, + "step": 1536 + }, + { + "epoch": 0.2050974112623432, + "grad_norm": 1.2798723643570793, + "learning_rate": 1.843497861497793e-05, + "loss": 1.6411, + "step": 1537 + }, + { + "epoch": 0.20523085134774485, + "grad_norm": 1.1135912832633508, + "learning_rate": 1.8432656391559794e-05, + "loss": 1.63, + "step": 1538 + }, + { + "epoch": 0.20536429143314652, + "grad_norm": 1.0600593382175174, + "learning_rate": 1.8430332593017463e-05, + "loss": 1.6932, + "step": 1539 + }, + { + "epoch": 0.20549773151854817, + "grad_norm": 1.243565535810705, + "learning_rate": 1.8428007219784978e-05, + "loss": 1.6734, + "step": 1540 + }, + { + "epoch": 0.20563117160394984, + "grad_norm": 1.1107029670183746, + "learning_rate": 1.8425680272296706e-05, + "loss": 1.6954, + "step": 1541 + }, + { + "epoch": 0.20576461168935148, + "grad_norm": 1.1611386236626087, + "learning_rate": 1.8423351750987285e-05, + "loss": 1.6676, + "step": 1542 + }, + { + "epoch": 0.20589805177475315, + "grad_norm": 1.0922966177873632, + "learning_rate": 1.842102165629166e-05, + "loss": 1.6751, + "step": 1543 + }, + { + "epoch": 0.2060314918601548, + "grad_norm": 1.1728266705222763, + "learning_rate": 1.8418689988645072e-05, + "loss": 1.6662, + "step": 1544 + }, + { + "epoch": 0.20616493194555643, + "grad_norm": 1.281999128501973, + "learning_rate": 1.841635674848304e-05, + "loss": 1.6855, + "step": 1545 + }, + { + "epoch": 0.2062983720309581, + "grad_norm": 1.1950922902237124, + "learning_rate": 1.8414021936241394e-05, + "loss": 1.6543, + "step": 1546 + }, + { + "epoch": 0.20643181211635975, + "grad_norm": 1.1228445170626977, + "learning_rate": 1.8411685552356244e-05, + "loss": 1.6518, + "step": 1547 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 1.2061203106698866, + "learning_rate": 1.840934759726401e-05, + "loss": 1.6469, + "step": 1548 + }, + { + "epoch": 0.20669869228716306, + "grad_norm": 1.1059566734641804, + "learning_rate": 1.840700807140138e-05, + "loss": 1.6259, + "step": 1549 + }, + { + "epoch": 0.20683213237256473, + "grad_norm": 1.1551811421928873, + "learning_rate": 1.8404666975205367e-05, + "loss": 1.6333, + "step": 1550 + }, + { + "epoch": 0.20696557245796637, + "grad_norm": 1.1528270603987256, + "learning_rate": 1.840232430911325e-05, + "loss": 1.7053, + "step": 1551 + }, + { + "epoch": 0.20709901254336802, + "grad_norm": 1.0615331899156355, + "learning_rate": 1.8399980073562615e-05, + "loss": 1.6093, + "step": 1552 + }, + { + "epoch": 0.2072324526287697, + "grad_norm": 1.2018501248996023, + "learning_rate": 1.8397634268991343e-05, + "loss": 1.6831, + "step": 1553 + }, + { + "epoch": 0.20736589271417133, + "grad_norm": 1.129260522162621, + "learning_rate": 1.8395286895837598e-05, + "loss": 1.6725, + "step": 1554 + }, + { + "epoch": 0.207499332799573, + "grad_norm": 1.2931052464354142, + "learning_rate": 
1.8392937954539845e-05, + "loss": 1.6261, + "step": 1555 + }, + { + "epoch": 0.20763277288497464, + "grad_norm": 1.158760216852382, + "learning_rate": 1.8390587445536844e-05, + "loss": 1.6595, + "step": 1556 + }, + { + "epoch": 0.2077662129703763, + "grad_norm": 1.16264835183651, + "learning_rate": 1.8388235369267632e-05, + "loss": 1.6538, + "step": 1557 + }, + { + "epoch": 0.20789965305577796, + "grad_norm": 1.1390190977362333, + "learning_rate": 1.838588172617156e-05, + "loss": 1.6174, + "step": 1558 + }, + { + "epoch": 0.2080330931411796, + "grad_norm": 1.0281490566989566, + "learning_rate": 1.8383526516688257e-05, + "loss": 1.6256, + "step": 1559 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 1.3060131907983772, + "learning_rate": 1.8381169741257653e-05, + "loss": 1.6876, + "step": 1560 + }, + { + "epoch": 0.2082999733119829, + "grad_norm": 1.0851729874752505, + "learning_rate": 1.8378811400319962e-05, + "loss": 1.6448, + "step": 1561 + }, + { + "epoch": 0.20843341339738458, + "grad_norm": 1.1173227297527768, + "learning_rate": 1.8376451494315697e-05, + "loss": 1.6397, + "step": 1562 + }, + { + "epoch": 0.20856685348278622, + "grad_norm": 1.1250418404408145, + "learning_rate": 1.8374090023685664e-05, + "loss": 1.7092, + "step": 1563 + }, + { + "epoch": 0.2087002935681879, + "grad_norm": 1.1313533969061775, + "learning_rate": 1.8371726988870956e-05, + "loss": 1.6625, + "step": 1564 + }, + { + "epoch": 0.20883373365358954, + "grad_norm": 1.1578578516746643, + "learning_rate": 1.8369362390312965e-05, + "loss": 1.6829, + "step": 1565 + }, + { + "epoch": 0.20896717373899118, + "grad_norm": 1.0419177694211474, + "learning_rate": 1.8366996228453366e-05, + "loss": 1.6711, + "step": 1566 + }, + { + "epoch": 0.20910061382439285, + "grad_norm": 1.110808392383394, + "learning_rate": 1.8364628503734133e-05, + "loss": 1.6313, + "step": 1567 + }, + { + "epoch": 0.2092340539097945, + "grad_norm": 1.0553327445995773, + "learning_rate": 1.8362259216597532e-05, + "loss": 1.6769, + "step": 1568 + }, + { + "epoch": 0.20936749399519616, + "grad_norm": 1.1605648342199986, + "learning_rate": 1.8359888367486116e-05, + "loss": 1.6423, + "step": 1569 + }, + { + "epoch": 0.2095009340805978, + "grad_norm": 1.3528845887090233, + "learning_rate": 1.8357515956842735e-05, + "loss": 1.6641, + "step": 1570 + }, + { + "epoch": 0.20963437416599948, + "grad_norm": 1.0381834374569774, + "learning_rate": 1.8355141985110524e-05, + "loss": 1.6645, + "step": 1571 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 1.1268678857914454, + "learning_rate": 1.8352766452732922e-05, + "loss": 1.6252, + "step": 1572 + }, + { + "epoch": 0.20990125433680276, + "grad_norm": 1.07645387262839, + "learning_rate": 1.8350389360153645e-05, + "loss": 1.6234, + "step": 1573 + }, + { + "epoch": 0.21003469442220443, + "grad_norm": 1.1601184907461661, + "learning_rate": 1.8348010707816704e-05, + "loss": 1.6625, + "step": 1574 + }, + { + "epoch": 0.21016813450760607, + "grad_norm": 1.1598070223911565, + "learning_rate": 1.8345630496166413e-05, + "loss": 1.6072, + "step": 1575 + }, + { + "epoch": 0.21030157459300775, + "grad_norm": 1.0478963594602182, + "learning_rate": 1.834324872564736e-05, + "loss": 1.6713, + "step": 1576 + }, + { + "epoch": 0.2104350146784094, + "grad_norm": 1.1864899396092392, + "learning_rate": 1.8340865396704442e-05, + "loss": 1.6994, + "step": 1577 + }, + { + "epoch": 0.21056845476381106, + "grad_norm": 1.2082345518638513, + "learning_rate": 1.833848050978283e-05, + "loss": 1.6325, + "step": 1578 + }, + { + 
"epoch": 0.2107018948492127, + "grad_norm": 1.0705831509203583, + "learning_rate": 1.8336094065327994e-05, + "loss": 1.6455, + "step": 1579 + }, + { + "epoch": 0.21083533493461437, + "grad_norm": 1.0714080597971536, + "learning_rate": 1.8333706063785698e-05, + "loss": 1.6415, + "step": 1580 + }, + { + "epoch": 0.21096877502001601, + "grad_norm": 1.4855074921163713, + "learning_rate": 1.8331316505601995e-05, + "loss": 1.6629, + "step": 1581 + }, + { + "epoch": 0.21110221510541766, + "grad_norm": 1.0376544838658421, + "learning_rate": 1.832892539122322e-05, + "loss": 1.6218, + "step": 1582 + }, + { + "epoch": 0.21123565519081933, + "grad_norm": 1.0937886957684666, + "learning_rate": 1.832653272109601e-05, + "loss": 1.6446, + "step": 1583 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 0.9935625042710026, + "learning_rate": 1.8324138495667293e-05, + "loss": 1.5577, + "step": 1584 + }, + { + "epoch": 0.21150253536162264, + "grad_norm": 1.0414931198212998, + "learning_rate": 1.8321742715384277e-05, + "loss": 1.7021, + "step": 1585 + }, + { + "epoch": 0.21163597544702428, + "grad_norm": 1.0888067291778296, + "learning_rate": 1.8319345380694464e-05, + "loss": 1.6425, + "step": 1586 + }, + { + "epoch": 0.21176941553242595, + "grad_norm": 1.1889065641556036, + "learning_rate": 1.831694649204566e-05, + "loss": 1.6804, + "step": 1587 + }, + { + "epoch": 0.2119028556178276, + "grad_norm": 1.2027320435869355, + "learning_rate": 1.831454604988594e-05, + "loss": 1.6518, + "step": 1588 + }, + { + "epoch": 0.21203629570322924, + "grad_norm": 1.1633662750060685, + "learning_rate": 1.8312144054663682e-05, + "loss": 1.6471, + "step": 1589 + }, + { + "epoch": 0.2121697357886309, + "grad_norm": 1.62970080028646, + "learning_rate": 1.8309740506827552e-05, + "loss": 1.6254, + "step": 1590 + }, + { + "epoch": 0.21230317587403255, + "grad_norm": 1.0879307197813337, + "learning_rate": 1.8307335406826505e-05, + "loss": 1.6575, + "step": 1591 + }, + { + "epoch": 0.21243661595943422, + "grad_norm": 1.269123197918674, + "learning_rate": 1.830492875510979e-05, + "loss": 1.6364, + "step": 1592 + }, + { + "epoch": 0.21257005604483586, + "grad_norm": 1.0373985047234797, + "learning_rate": 1.8302520552126935e-05, + "loss": 1.6571, + "step": 1593 + }, + { + "epoch": 0.21270349613023753, + "grad_norm": 1.0542865827845302, + "learning_rate": 1.830011079832777e-05, + "loss": 1.6695, + "step": 1594 + }, + { + "epoch": 0.21283693621563918, + "grad_norm": 1.0402881306171436, + "learning_rate": 1.8297699494162406e-05, + "loss": 1.6562, + "step": 1595 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 1.1294113481127595, + "learning_rate": 1.8295286640081252e-05, + "loss": 1.6583, + "step": 1596 + }, + { + "epoch": 0.2131038163864425, + "grad_norm": 1.1895908197616782, + "learning_rate": 1.8292872236534996e-05, + "loss": 1.6487, + "step": 1597 + }, + { + "epoch": 0.21323725647184413, + "grad_norm": 1.0379115957592848, + "learning_rate": 1.829045628397463e-05, + "loss": 1.6322, + "step": 1598 + }, + { + "epoch": 0.2133706965572458, + "grad_norm": 1.222610128560367, + "learning_rate": 1.8288038782851417e-05, + "loss": 1.6657, + "step": 1599 + }, + { + "epoch": 0.21350413664264745, + "grad_norm": 1.130257915264502, + "learning_rate": 1.828561973361692e-05, + "loss": 1.6602, + "step": 1600 + }, + { + "epoch": 0.21363757672804912, + "grad_norm": 1.126575657802467, + "learning_rate": 1.8283199136722995e-05, + "loss": 1.6918, + "step": 1601 + }, + { + "epoch": 0.21377101681345076, + "grad_norm": 1.3153982419349068, + 
"learning_rate": 1.828077699262178e-05, + "loss": 1.6693, + "step": 1602 + }, + { + "epoch": 0.2139044568988524, + "grad_norm": 1.262078471065165, + "learning_rate": 1.8278353301765702e-05, + "loss": 1.6933, + "step": 1603 + }, + { + "epoch": 0.21403789698425407, + "grad_norm": 1.1173878288546095, + "learning_rate": 1.8275928064607478e-05, + "loss": 1.5781, + "step": 1604 + }, + { + "epoch": 0.21417133706965572, + "grad_norm": 2.31232037905848, + "learning_rate": 1.8273501281600118e-05, + "loss": 1.66, + "step": 1605 + }, + { + "epoch": 0.21430477715505739, + "grad_norm": 1.1674269299374407, + "learning_rate": 1.8271072953196915e-05, + "loss": 1.7023, + "step": 1606 + }, + { + "epoch": 0.21443821724045903, + "grad_norm": 1.1591034820385062, + "learning_rate": 1.8268643079851457e-05, + "loss": 1.706, + "step": 1607 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 1.2450171637095553, + "learning_rate": 1.826621166201761e-05, + "loss": 1.6084, + "step": 1608 + }, + { + "epoch": 0.21470509741126234, + "grad_norm": 1.1159214690409671, + "learning_rate": 1.826377870014954e-05, + "loss": 1.6834, + "step": 1609 + }, + { + "epoch": 0.214838537496664, + "grad_norm": 1.099652892199378, + "learning_rate": 1.8261344194701695e-05, + "loss": 1.6745, + "step": 1610 + }, + { + "epoch": 0.21497197758206565, + "grad_norm": 1.1872360367368382, + "learning_rate": 1.8258908146128814e-05, + "loss": 1.6332, + "step": 1611 + }, + { + "epoch": 0.2151054176674673, + "grad_norm": 1.0948685127381836, + "learning_rate": 1.8256470554885922e-05, + "loss": 1.631, + "step": 1612 + }, + { + "epoch": 0.21523885775286897, + "grad_norm": 1.1075853907414162, + "learning_rate": 1.8254031421428334e-05, + "loss": 1.7092, + "step": 1613 + }, + { + "epoch": 0.2153722978382706, + "grad_norm": 1.1518564227271244, + "learning_rate": 1.8251590746211655e-05, + "loss": 1.7042, + "step": 1614 + }, + { + "epoch": 0.21550573792367228, + "grad_norm": 1.1038926708394592, + "learning_rate": 1.8249148529691768e-05, + "loss": 1.6277, + "step": 1615 + }, + { + "epoch": 0.21563917800907392, + "grad_norm": 1.1063850648205573, + "learning_rate": 1.824670477232486e-05, + "loss": 1.6506, + "step": 1616 + }, + { + "epoch": 0.2157726180944756, + "grad_norm": 1.0603627332408834, + "learning_rate": 1.824425947456739e-05, + "loss": 1.6636, + "step": 1617 + }, + { + "epoch": 0.21590605817987724, + "grad_norm": 1.2546873356938104, + "learning_rate": 1.8241812636876113e-05, + "loss": 1.6793, + "step": 1618 + }, + { + "epoch": 0.21603949826527888, + "grad_norm": 1.181239728860882, + "learning_rate": 1.8239364259708076e-05, + "loss": 1.6664, + "step": 1619 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 1.180838315282554, + "learning_rate": 1.8236914343520598e-05, + "loss": 1.6998, + "step": 1620 + }, + { + "epoch": 0.2163063784360822, + "grad_norm": 1.1308148567631164, + "learning_rate": 1.8234462888771308e-05, + "loss": 1.7154, + "step": 1621 + }, + { + "epoch": 0.21643981852148386, + "grad_norm": 1.1326281559924998, + "learning_rate": 1.82320098959181e-05, + "loss": 1.6256, + "step": 1622 + }, + { + "epoch": 0.2165732586068855, + "grad_norm": 1.0865149229961686, + "learning_rate": 1.822955536541917e-05, + "loss": 1.6228, + "step": 1623 + }, + { + "epoch": 0.21670669869228718, + "grad_norm": 1.2423233898451607, + "learning_rate": 1.8227099297732997e-05, + "loss": 1.648, + "step": 1624 + }, + { + "epoch": 0.21684013877768882, + "grad_norm": 1.1502479506643792, + "learning_rate": 1.8224641693318338e-05, + "loss": 1.6578, + "step": 1625 + }, + { + 
"epoch": 0.21697357886309046, + "grad_norm": 1.440887259770447, + "learning_rate": 1.8222182552634257e-05, + "loss": 1.6826, + "step": 1626 + }, + { + "epoch": 0.21710701894849213, + "grad_norm": 1.121187218751373, + "learning_rate": 1.8219721876140084e-05, + "loss": 1.6264, + "step": 1627 + }, + { + "epoch": 0.21724045903389377, + "grad_norm": 4.7724358351335905, + "learning_rate": 1.8217259664295452e-05, + "loss": 1.6344, + "step": 1628 + }, + { + "epoch": 0.21737389911929544, + "grad_norm": 1.2326061740840186, + "learning_rate": 1.821479591756027e-05, + "loss": 1.6977, + "step": 1629 + }, + { + "epoch": 0.2175073392046971, + "grad_norm": 1.1773999953923238, + "learning_rate": 1.8212330636394743e-05, + "loss": 1.6613, + "step": 1630 + }, + { + "epoch": 0.21764077929009876, + "grad_norm": 1.0745824244836635, + "learning_rate": 1.820986382125935e-05, + "loss": 1.6746, + "step": 1631 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 1.236415394795963, + "learning_rate": 1.820739547261487e-05, + "loss": 1.6253, + "step": 1632 + }, + { + "epoch": 0.21790765946090204, + "grad_norm": 1.1065194060629115, + "learning_rate": 1.8204925590922362e-05, + "loss": 1.604, + "step": 1633 + }, + { + "epoch": 0.2180410995463037, + "grad_norm": 1.0940043580451333, + "learning_rate": 1.820245417664317e-05, + "loss": 1.6449, + "step": 1634 + }, + { + "epoch": 0.21817453963170536, + "grad_norm": 1.1869832191003882, + "learning_rate": 1.8199981230238924e-05, + "loss": 1.6705, + "step": 1635 + }, + { + "epoch": 0.21830797971710703, + "grad_norm": 1.1189817227152579, + "learning_rate": 1.8197506752171545e-05, + "loss": 1.6218, + "step": 1636 + }, + { + "epoch": 0.21844141980250867, + "grad_norm": 1.1473359068259987, + "learning_rate": 1.8195030742903236e-05, + "loss": 1.6832, + "step": 1637 + }, + { + "epoch": 0.21857485988791034, + "grad_norm": 1.2362438437725367, + "learning_rate": 1.819255320289649e-05, + "loss": 1.6471, + "step": 1638 + }, + { + "epoch": 0.21870829997331198, + "grad_norm": 1.1116111042127557, + "learning_rate": 1.8190074132614083e-05, + "loss": 1.6397, + "step": 1639 + }, + { + "epoch": 0.21884174005871362, + "grad_norm": 1.0931240118741188, + "learning_rate": 1.818759353251907e-05, + "loss": 1.7028, + "step": 1640 + }, + { + "epoch": 0.2189751801441153, + "grad_norm": 1.1134694456669874, + "learning_rate": 1.818511140307481e-05, + "loss": 1.7135, + "step": 1641 + }, + { + "epoch": 0.21910862022951694, + "grad_norm": 1.1225439283708083, + "learning_rate": 1.8182627744744928e-05, + "loss": 1.6863, + "step": 1642 + }, + { + "epoch": 0.2192420603149186, + "grad_norm": 1.1226470273814733, + "learning_rate": 1.8180142557993346e-05, + "loss": 1.6654, + "step": 1643 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 12.702656527886313, + "learning_rate": 1.8177655843284268e-05, + "loss": 1.6785, + "step": 1644 + }, + { + "epoch": 0.21950894048572192, + "grad_norm": 1.1646728669056514, + "learning_rate": 1.8175167601082187e-05, + "loss": 1.6678, + "step": 1645 + }, + { + "epoch": 0.21964238057112356, + "grad_norm": 1.1370827214131438, + "learning_rate": 1.8172677831851874e-05, + "loss": 1.6678, + "step": 1646 + }, + { + "epoch": 0.21977582065652523, + "grad_norm": 1.0805021664243795, + "learning_rate": 1.8170186536058393e-05, + "loss": 1.6445, + "step": 1647 + }, + { + "epoch": 0.21990926074192688, + "grad_norm": 1.068690271627419, + "learning_rate": 1.8167693714167088e-05, + "loss": 1.6944, + "step": 1648 + }, + { + "epoch": 0.22004270082732852, + "grad_norm": 1.2765183302069458, + 
"learning_rate": 1.816519936664359e-05, + "loss": 1.6567, + "step": 1649 + }, + { + "epoch": 0.2201761409127302, + "grad_norm": 1.1215664466938502, + "learning_rate": 1.8162703493953812e-05, + "loss": 1.6553, + "step": 1650 + }, + { + "epoch": 0.22030958099813183, + "grad_norm": 1.057949579182957, + "learning_rate": 1.8160206096563957e-05, + "loss": 1.6541, + "step": 1651 + }, + { + "epoch": 0.2204430210835335, + "grad_norm": 1.3213535546275152, + "learning_rate": 1.8157707174940516e-05, + "loss": 1.6824, + "step": 1652 + }, + { + "epoch": 0.22057646116893515, + "grad_norm": 1.1669724021565158, + "learning_rate": 1.8155206729550248e-05, + "loss": 1.6525, + "step": 1653 + }, + { + "epoch": 0.22070990125433682, + "grad_norm": 1.3110532840171654, + "learning_rate": 1.8152704760860217e-05, + "loss": 1.678, + "step": 1654 + }, + { + "epoch": 0.22084334133973846, + "grad_norm": 1.1672424586728913, + "learning_rate": 1.815020126933776e-05, + "loss": 1.6696, + "step": 1655 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 1.3061093334442921, + "learning_rate": 1.8147696255450495e-05, + "loss": 1.6654, + "step": 1656 + }, + { + "epoch": 0.22111022151054177, + "grad_norm": 1.0702194871522392, + "learning_rate": 1.8145189719666335e-05, + "loss": 1.7042, + "step": 1657 + }, + { + "epoch": 0.2212436615959434, + "grad_norm": 1.2742123644262122, + "learning_rate": 1.8142681662453473e-05, + "loss": 1.6924, + "step": 1658 + }, + { + "epoch": 0.22137710168134508, + "grad_norm": 1.090573827396432, + "learning_rate": 1.8140172084280384e-05, + "loss": 1.7083, + "step": 1659 + }, + { + "epoch": 0.22151054176674673, + "grad_norm": 1.0477634184103604, + "learning_rate": 1.813766098561583e-05, + "loss": 1.6293, + "step": 1660 + }, + { + "epoch": 0.2216439818521484, + "grad_norm": 1.1079746682957217, + "learning_rate": 1.8135148366928855e-05, + "loss": 1.6785, + "step": 1661 + }, + { + "epoch": 0.22177742193755004, + "grad_norm": 1.1056303875592357, + "learning_rate": 1.8132634228688785e-05, + "loss": 1.5936, + "step": 1662 + }, + { + "epoch": 0.22191086202295168, + "grad_norm": 1.0872810931934485, + "learning_rate": 1.8130118571365235e-05, + "loss": 1.6854, + "step": 1663 + }, + { + "epoch": 0.22204430210835335, + "grad_norm": 1.180998755115771, + "learning_rate": 1.8127601395428104e-05, + "loss": 1.5947, + "step": 1664 + }, + { + "epoch": 0.222177742193755, + "grad_norm": 1.076365662325189, + "learning_rate": 1.8125082701347564e-05, + "loss": 1.6827, + "step": 1665 + }, + { + "epoch": 0.22231118227915667, + "grad_norm": 1.286948665055857, + "learning_rate": 1.8122562489594084e-05, + "loss": 1.684, + "step": 1666 + }, + { + "epoch": 0.2224446223645583, + "grad_norm": 1.1501953176983266, + "learning_rate": 1.812004076063841e-05, + "loss": 1.6775, + "step": 1667 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 1.1268435794579326, + "learning_rate": 1.8117517514951573e-05, + "loss": 1.7057, + "step": 1668 + }, + { + "epoch": 0.22271150253536162, + "grad_norm": 1.1620900257687603, + "learning_rate": 1.8114992753004887e-05, + "loss": 1.7043, + "step": 1669 + }, + { + "epoch": 0.22284494262076326, + "grad_norm": 1.1338951625822513, + "learning_rate": 1.8112466475269946e-05, + "loss": 1.6707, + "step": 1670 + }, + { + "epoch": 0.22297838270616493, + "grad_norm": 1.242098630419242, + "learning_rate": 1.8109938682218633e-05, + "loss": 1.6795, + "step": 1671 + }, + { + "epoch": 0.22311182279156658, + "grad_norm": 1.1679404879363304, + "learning_rate": 1.8107409374323107e-05, + "loss": 1.6397, + "step": 1672 + }, 
+ { + "epoch": 0.22324526287696825, + "grad_norm": 1.176042698983641, + "learning_rate": 1.8104878552055817e-05, + "loss": 1.6591, + "step": 1673 + }, + { + "epoch": 0.2233787029623699, + "grad_norm": 1.312264219479212, + "learning_rate": 1.810234621588949e-05, + "loss": 1.6572, + "step": 1674 + }, + { + "epoch": 0.22351214304777156, + "grad_norm": 1.1369656057368374, + "learning_rate": 1.809981236629714e-05, + "loss": 1.7209, + "step": 1675 + }, + { + "epoch": 0.2236455831331732, + "grad_norm": 1.2223686585190343, + "learning_rate": 1.809727700375206e-05, + "loss": 1.6587, + "step": 1676 + }, + { + "epoch": 0.22377902321857485, + "grad_norm": 1.0737175824141045, + "learning_rate": 1.8094740128727823e-05, + "loss": 1.6123, + "step": 1677 + }, + { + "epoch": 0.22391246330397652, + "grad_norm": 1.0487717384337922, + "learning_rate": 1.8092201741698297e-05, + "loss": 1.603, + "step": 1678 + }, + { + "epoch": 0.22404590338937816, + "grad_norm": 1.0154503615440715, + "learning_rate": 1.8089661843137616e-05, + "loss": 1.6558, + "step": 1679 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 1.0612689044451018, + "learning_rate": 1.8087120433520207e-05, + "loss": 1.6634, + "step": 1680 + }, + { + "epoch": 0.22431278356018147, + "grad_norm": 1.0783694345863775, + "learning_rate": 1.8084577513320777e-05, + "loss": 1.6973, + "step": 1681 + }, + { + "epoch": 0.22444622364558314, + "grad_norm": 1.4059689465691732, + "learning_rate": 1.8082033083014315e-05, + "loss": 1.6692, + "step": 1682 + }, + { + "epoch": 0.22457966373098479, + "grad_norm": 1.0174187245468838, + "learning_rate": 1.807948714307609e-05, + "loss": 1.6965, + "step": 1683 + }, + { + "epoch": 0.22471310381638646, + "grad_norm": 1.2508321455102909, + "learning_rate": 1.8076939693981652e-05, + "loss": 1.6924, + "step": 1684 + }, + { + "epoch": 0.2248465439017881, + "grad_norm": 1.240543795410627, + "learning_rate": 1.807439073620684e-05, + "loss": 1.6354, + "step": 1685 + }, + { + "epoch": 0.22497998398718974, + "grad_norm": 1.5809329681051474, + "learning_rate": 1.807184027022777e-05, + "loss": 1.6596, + "step": 1686 + }, + { + "epoch": 0.2251134240725914, + "grad_norm": 1.1469707941171723, + "learning_rate": 1.806928829652084e-05, + "loss": 1.7269, + "step": 1687 + }, + { + "epoch": 0.22524686415799305, + "grad_norm": 1.0758984989477276, + "learning_rate": 1.8066734815562726e-05, + "loss": 1.7109, + "step": 1688 + }, + { + "epoch": 0.22538030424339472, + "grad_norm": 1.299428663534772, + "learning_rate": 1.8064179827830393e-05, + "loss": 1.6851, + "step": 1689 + }, + { + "epoch": 0.22551374432879637, + "grad_norm": 1.043487684350109, + "learning_rate": 1.8061623333801085e-05, + "loss": 1.6506, + "step": 1690 + }, + { + "epoch": 0.22564718441419804, + "grad_norm": 1.1237380692317867, + "learning_rate": 1.805906533395232e-05, + "loss": 1.7187, + "step": 1691 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 1.148118031537372, + "learning_rate": 1.8056505828761904e-05, + "loss": 1.6431, + "step": 1692 + }, + { + "epoch": 0.22591406458500132, + "grad_norm": 1.2374398808076512, + "learning_rate": 1.8053944818707932e-05, + "loss": 1.6198, + "step": 1693 + }, + { + "epoch": 0.226047504670403, + "grad_norm": 1.0903178987983715, + "learning_rate": 1.8051382304268762e-05, + "loss": 1.6473, + "step": 1694 + }, + { + "epoch": 0.22618094475580464, + "grad_norm": 1.1205806647882046, + "learning_rate": 1.8048818285923047e-05, + "loss": 1.6681, + "step": 1695 + }, + { + "epoch": 0.2263143848412063, + "grad_norm": 1.150148641527194, + 
"learning_rate": 1.8046252764149715e-05, + "loss": 1.6546, + "step": 1696 + }, + { + "epoch": 0.22644782492660795, + "grad_norm": 1.1208283417789737, + "learning_rate": 1.8043685739427976e-05, + "loss": 1.6365, + "step": 1697 + }, + { + "epoch": 0.22658126501200962, + "grad_norm": 1.9233641539386115, + "learning_rate": 1.804111721223732e-05, + "loss": 1.6183, + "step": 1698 + }, + { + "epoch": 0.22671470509741126, + "grad_norm": 1.1665972627988115, + "learning_rate": 1.8038547183057524e-05, + "loss": 1.6547, + "step": 1699 + }, + { + "epoch": 0.2268481451828129, + "grad_norm": 1.0590609907387925, + "learning_rate": 1.8035975652368635e-05, + "loss": 1.6456, + "step": 1700 + }, + { + "epoch": 0.22698158526821458, + "grad_norm": 1.0642643971203603, + "learning_rate": 1.8033402620650986e-05, + "loss": 1.7051, + "step": 1701 + }, + { + "epoch": 0.22711502535361622, + "grad_norm": 1.364226790660327, + "learning_rate": 1.803082808838519e-05, + "loss": 1.6045, + "step": 1702 + }, + { + "epoch": 0.2272484654390179, + "grad_norm": 1.1029813182150021, + "learning_rate": 1.8028252056052148e-05, + "loss": 1.682, + "step": 1703 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 1.1377601146917016, + "learning_rate": 1.8025674524133022e-05, + "loss": 1.7134, + "step": 1704 + }, + { + "epoch": 0.2275153456098212, + "grad_norm": 1.0987200088336204, + "learning_rate": 1.8023095493109273e-05, + "loss": 1.6286, + "step": 1705 + }, + { + "epoch": 0.22764878569522284, + "grad_norm": 1.0648434423713318, + "learning_rate": 1.802051496346263e-05, + "loss": 1.6687, + "step": 1706 + }, + { + "epoch": 0.2277822257806245, + "grad_norm": 1.1058007563564674, + "learning_rate": 1.8017932935675108e-05, + "loss": 1.6537, + "step": 1707 + }, + { + "epoch": 0.22791566586602616, + "grad_norm": 1.1956539306049252, + "learning_rate": 1.8015349410229004e-05, + "loss": 1.6273, + "step": 1708 + }, + { + "epoch": 0.2280491059514278, + "grad_norm": 1.097833315258532, + "learning_rate": 1.8012764387606887e-05, + "loss": 1.6283, + "step": 1709 + }, + { + "epoch": 0.22818254603682947, + "grad_norm": 1.032052358075353, + "learning_rate": 1.801017786829161e-05, + "loss": 1.6069, + "step": 1710 + }, + { + "epoch": 0.2283159861222311, + "grad_norm": 1.0217813572157182, + "learning_rate": 1.800758985276631e-05, + "loss": 1.6272, + "step": 1711 + }, + { + "epoch": 0.22844942620763278, + "grad_norm": 1.1320615088828117, + "learning_rate": 1.8005000341514392e-05, + "loss": 1.6433, + "step": 1712 + }, + { + "epoch": 0.22858286629303443, + "grad_norm": 1.197563433343801, + "learning_rate": 1.8002409335019552e-05, + "loss": 1.6391, + "step": 1713 + }, + { + "epoch": 0.22871630637843607, + "grad_norm": 1.0876681965716914, + "learning_rate": 1.799981683376576e-05, + "loss": 1.6864, + "step": 1714 + }, + { + "epoch": 0.22884974646383774, + "grad_norm": 1.0354932640447034, + "learning_rate": 1.7997222838237264e-05, + "loss": 1.6708, + "step": 1715 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 2.734360348457865, + "learning_rate": 1.7994627348918593e-05, + "loss": 1.6584, + "step": 1716 + }, + { + "epoch": 0.22911662663464105, + "grad_norm": 1.1394911063346953, + "learning_rate": 1.7992030366294555e-05, + "loss": 1.5976, + "step": 1717 + }, + { + "epoch": 0.2292500667200427, + "grad_norm": 1.1531653881087183, + "learning_rate": 1.798943189085024e-05, + "loss": 1.705, + "step": 1718 + }, + { + "epoch": 0.22938350680544436, + "grad_norm": 1.21758738022185, + "learning_rate": 1.7986831923071007e-05, + "loss": 1.6886, + "step": 1719 + }, + 
{ + "epoch": 0.229516946890846, + "grad_norm": 1.0529056168835895, + "learning_rate": 1.7984230463442505e-05, + "loss": 1.6642, + "step": 1720 + }, + { + "epoch": 0.22965038697624768, + "grad_norm": 1.2125231023635614, + "learning_rate": 1.798162751245066e-05, + "loss": 1.6182, + "step": 1721 + }, + { + "epoch": 0.22978382706164932, + "grad_norm": 1.1933594423844636, + "learning_rate": 1.7979023070581664e-05, + "loss": 1.609, + "step": 1722 + }, + { + "epoch": 0.22991726714705096, + "grad_norm": 1.0700311747867306, + "learning_rate": 1.7976417138322008e-05, + "loss": 1.6493, + "step": 1723 + }, + { + "epoch": 0.23005070723245263, + "grad_norm": 1.2988519356273156, + "learning_rate": 1.7973809716158444e-05, + "loss": 1.6935, + "step": 1724 + }, + { + "epoch": 0.23018414731785428, + "grad_norm": 1.2630821844608227, + "learning_rate": 1.7971200804578008e-05, + "loss": 1.6877, + "step": 1725 + }, + { + "epoch": 0.23031758740325595, + "grad_norm": 1.0760773885115775, + "learning_rate": 1.796859040406802e-05, + "loss": 1.6648, + "step": 1726 + }, + { + "epoch": 0.2304510274886576, + "grad_norm": 1.0659107902842577, + "learning_rate": 1.796597851511607e-05, + "loss": 1.6765, + "step": 1727 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 1.0640564215933035, + "learning_rate": 1.796336513821003e-05, + "loss": 1.6519, + "step": 1728 + }, + { + "epoch": 0.2307179076594609, + "grad_norm": 1.044766417697617, + "learning_rate": 1.7960750273838046e-05, + "loss": 1.7201, + "step": 1729 + }, + { + "epoch": 0.23085134774486255, + "grad_norm": 1.3620145766553122, + "learning_rate": 1.795813392248855e-05, + "loss": 1.6141, + "step": 1730 + }, + { + "epoch": 0.23098478783026422, + "grad_norm": 1.0460951962418277, + "learning_rate": 1.7955516084650245e-05, + "loss": 1.6431, + "step": 1731 + }, + { + "epoch": 0.23111822791566586, + "grad_norm": 1.0561334650845267, + "learning_rate": 1.7952896760812106e-05, + "loss": 1.6234, + "step": 1732 + }, + { + "epoch": 0.23125166800106753, + "grad_norm": 1.1267791689507525, + "learning_rate": 1.7950275951463404e-05, + "loss": 1.6478, + "step": 1733 + }, + { + "epoch": 0.23138510808646917, + "grad_norm": 1.1014450865423768, + "learning_rate": 1.7947653657093672e-05, + "loss": 1.6141, + "step": 1734 + }, + { + "epoch": 0.23151854817187084, + "grad_norm": 1.089444038239352, + "learning_rate": 1.7945029878192722e-05, + "loss": 1.6511, + "step": 1735 + }, + { + "epoch": 0.23165198825727248, + "grad_norm": 1.1341511933956758, + "learning_rate": 1.7942404615250652e-05, + "loss": 1.6546, + "step": 1736 + }, + { + "epoch": 0.23178542834267413, + "grad_norm": 1.1162152522317423, + "learning_rate": 1.7939777868757825e-05, + "loss": 1.6413, + "step": 1737 + }, + { + "epoch": 0.2319188684280758, + "grad_norm": 1.0922713243144162, + "learning_rate": 1.7937149639204888e-05, + "loss": 1.6675, + "step": 1738 + }, + { + "epoch": 0.23205230851347744, + "grad_norm": 1.1196061707852998, + "learning_rate": 1.7934519927082773e-05, + "loss": 1.6891, + "step": 1739 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 1.0373630063131618, + "learning_rate": 1.7931888732882665e-05, + "loss": 1.6752, + "step": 1740 + }, + { + "epoch": 0.23231918868428075, + "grad_norm": 1.1715453355241499, + "learning_rate": 1.7929256057096056e-05, + "loss": 1.6895, + "step": 1741 + }, + { + "epoch": 0.23245262876968242, + "grad_norm": 1.1754651897827133, + "learning_rate": 1.792662190021469e-05, + "loss": 1.6807, + "step": 1742 + }, + { + "epoch": 0.23258606885508407, + "grad_norm": 1.089104477056595, + 
"learning_rate": 1.79239862627306e-05, + "loss": 1.6262, + "step": 1743 + }, + { + "epoch": 0.2327195089404857, + "grad_norm": 1.047704791420425, + "learning_rate": 1.79213491451361e-05, + "loss": 1.674, + "step": 1744 + }, + { + "epoch": 0.23285294902588738, + "grad_norm": 1.0478486718373898, + "learning_rate": 1.7918710547923763e-05, + "loss": 1.6386, + "step": 1745 + }, + { + "epoch": 0.23298638911128902, + "grad_norm": 1.0374153527662222, + "learning_rate": 1.7916070471586455e-05, + "loss": 1.6535, + "step": 1746 + }, + { + "epoch": 0.2331198291966907, + "grad_norm": 1.1734013425233318, + "learning_rate": 1.7913428916617307e-05, + "loss": 1.6911, + "step": 1747 + }, + { + "epoch": 0.23325326928209233, + "grad_norm": 1.0828478675607198, + "learning_rate": 1.791078588350974e-05, + "loss": 1.6512, + "step": 1748 + }, + { + "epoch": 0.233386709367494, + "grad_norm": 1.114480950469954, + "learning_rate": 1.7908141372757436e-05, + "loss": 1.6491, + "step": 1749 + }, + { + "epoch": 0.23352014945289565, + "grad_norm": 1.1436925883505036, + "learning_rate": 1.790549538485436e-05, + "loss": 1.6854, + "step": 1750 + }, + { + "epoch": 0.23365358953829732, + "grad_norm": 10.687146402666658, + "learning_rate": 1.7902847920294754e-05, + "loss": 1.621, + "step": 1751 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 1.3707877310751082, + "learning_rate": 1.7900198979573135e-05, + "loss": 1.6246, + "step": 1752 + }, + { + "epoch": 0.2339204697091006, + "grad_norm": 1.1877922731508994, + "learning_rate": 1.7897548563184296e-05, + "loss": 1.6376, + "step": 1753 + }, + { + "epoch": 0.23405390979450227, + "grad_norm": 1.36255029255512, + "learning_rate": 1.7894896671623296e-05, + "loss": 1.6354, + "step": 1754 + }, + { + "epoch": 0.23418734987990392, + "grad_norm": 1.4838375564796251, + "learning_rate": 1.7892243305385487e-05, + "loss": 1.6155, + "step": 1755 + }, + { + "epoch": 0.2343207899653056, + "grad_norm": 1.1622308653549671, + "learning_rate": 1.7889588464966488e-05, + "loss": 1.6665, + "step": 1756 + }, + { + "epoch": 0.23445423005070723, + "grad_norm": 1.336261221880211, + "learning_rate": 1.788693215086219e-05, + "loss": 1.6409, + "step": 1757 + }, + { + "epoch": 0.2345876701361089, + "grad_norm": 1.2428755384173737, + "learning_rate": 1.788427436356876e-05, + "loss": 1.6523, + "step": 1758 + }, + { + "epoch": 0.23472111022151054, + "grad_norm": 1.1734745642911637, + "learning_rate": 1.7881615103582642e-05, + "loss": 1.6823, + "step": 1759 + }, + { + "epoch": 0.23485455030691219, + "grad_norm": 1.359796714523189, + "learning_rate": 1.7878954371400563e-05, + "loss": 1.6766, + "step": 1760 + }, + { + "epoch": 0.23498799039231386, + "grad_norm": 1.1543938773322537, + "learning_rate": 1.787629216751951e-05, + "loss": 1.6808, + "step": 1761 + }, + { + "epoch": 0.2351214304777155, + "grad_norm": 1.2231767398910764, + "learning_rate": 1.7873628492436757e-05, + "loss": 1.6149, + "step": 1762 + }, + { + "epoch": 0.23525487056311717, + "grad_norm": 1.3991330507525954, + "learning_rate": 1.7870963346649847e-05, + "loss": 1.6663, + "step": 1763 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 1.2111519435232232, + "learning_rate": 1.7868296730656596e-05, + "loss": 1.6832, + "step": 1764 + }, + { + "epoch": 0.23552175073392048, + "grad_norm": 1.1858904981204466, + "learning_rate": 1.7865628644955098e-05, + "loss": 1.6314, + "step": 1765 + }, + { + "epoch": 0.23565519081932212, + "grad_norm": 1.2029794734253174, + "learning_rate": 1.786295909004373e-05, + "loss": 1.5582, + "step": 1766 + }, + { + 
"epoch": 0.23578863090472377, + "grad_norm": 1.319501116381306, + "learning_rate": 1.7860288066421118e-05, + "loss": 1.6763, + "step": 1767 + }, + { + "epoch": 0.23592207099012544, + "grad_norm": 1.4049089786241191, + "learning_rate": 1.7857615574586193e-05, + "loss": 1.6361, + "step": 1768 + }, + { + "epoch": 0.23605551107552708, + "grad_norm": 1.1918917655522046, + "learning_rate": 1.785494161503814e-05, + "loss": 1.6137, + "step": 1769 + }, + { + "epoch": 0.23618895116092875, + "grad_norm": 1.144754220344139, + "learning_rate": 1.7852266188276422e-05, + "loss": 1.6175, + "step": 1770 + }, + { + "epoch": 0.2363223912463304, + "grad_norm": 1.3381800606475023, + "learning_rate": 1.7849589294800787e-05, + "loss": 1.6195, + "step": 1771 + }, + { + "epoch": 0.23645583133173206, + "grad_norm": 1.0458078567231872, + "learning_rate": 1.7846910935111242e-05, + "loss": 1.6848, + "step": 1772 + }, + { + "epoch": 0.2365892714171337, + "grad_norm": 1.4105954493444146, + "learning_rate": 1.7844231109708072e-05, + "loss": 1.654, + "step": 1773 + }, + { + "epoch": 0.23672271150253535, + "grad_norm": 1.289151142605529, + "learning_rate": 1.784154981909184e-05, + "loss": 1.6533, + "step": 1774 + }, + { + "epoch": 0.23685615158793702, + "grad_norm": 1.2707742721373032, + "learning_rate": 1.7838867063763383e-05, + "loss": 1.581, + "step": 1775 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 1.2053249887223572, + "learning_rate": 1.7836182844223808e-05, + "loss": 1.6464, + "step": 1776 + }, + { + "epoch": 0.23712303175874033, + "grad_norm": 1.06210419581067, + "learning_rate": 1.7833497160974495e-05, + "loss": 1.5838, + "step": 1777 + }, + { + "epoch": 0.23725647184414198, + "grad_norm": 1.3047023677050178, + "learning_rate": 1.78308100145171e-05, + "loss": 1.5983, + "step": 1778 + }, + { + "epoch": 0.23738991192954365, + "grad_norm": 1.29218147582307, + "learning_rate": 1.7828121405353553e-05, + "loss": 1.6862, + "step": 1779 + }, + { + "epoch": 0.2375233520149453, + "grad_norm": 1.0466696307900745, + "learning_rate": 1.7825431333986052e-05, + "loss": 1.6623, + "step": 1780 + }, + { + "epoch": 0.23765679210034693, + "grad_norm": 1.2349477939262232, + "learning_rate": 1.7822739800917073e-05, + "loss": 1.5709, + "step": 1781 + }, + { + "epoch": 0.2377902321857486, + "grad_norm": 1.2147084949319535, + "learning_rate": 1.782004680664937e-05, + "loss": 1.6726, + "step": 1782 + }, + { + "epoch": 0.23792367227115024, + "grad_norm": 1.0232408573291623, + "learning_rate": 1.7817352351685954e-05, + "loss": 1.6278, + "step": 1783 + }, + { + "epoch": 0.23805711235655191, + "grad_norm": 1.13584849589059, + "learning_rate": 1.781465643653012e-05, + "loss": 1.7017, + "step": 1784 + }, + { + "epoch": 0.23819055244195356, + "grad_norm": 1.0754152565333037, + "learning_rate": 1.7811959061685438e-05, + "loss": 1.6089, + "step": 1785 + }, + { + "epoch": 0.23832399252735523, + "grad_norm": 1.1362601818532363, + "learning_rate": 1.7809260227655747e-05, + "loss": 1.6161, + "step": 1786 + }, + { + "epoch": 0.23845743261275687, + "grad_norm": 1.0336376244273875, + "learning_rate": 1.7806559934945156e-05, + "loss": 1.6126, + "step": 1787 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 1.2284171891499982, + "learning_rate": 1.7803858184058053e-05, + "loss": 1.609, + "step": 1788 + }, + { + "epoch": 0.23872431278356018, + "grad_norm": 1.0993616922411698, + "learning_rate": 1.780115497549909e-05, + "loss": 1.6893, + "step": 1789 + }, + { + "epoch": 0.23885775286896183, + "grad_norm": 1.0345469608413957, + 
"learning_rate": 1.7798450309773195e-05, + "loss": 1.6798, + "step": 1790 + }, + { + "epoch": 0.2389911929543635, + "grad_norm": 1.2117978955967876, + "learning_rate": 1.7795744187385575e-05, + "loss": 1.6246, + "step": 1791 + }, + { + "epoch": 0.23912463303976514, + "grad_norm": 1.0463913434816907, + "learning_rate": 1.7793036608841694e-05, + "loss": 1.6595, + "step": 1792 + }, + { + "epoch": 0.2392580731251668, + "grad_norm": 1.1218163300477884, + "learning_rate": 1.7790327574647306e-05, + "loss": 1.6288, + "step": 1793 + }, + { + "epoch": 0.23939151321056845, + "grad_norm": 1.3920149154127004, + "learning_rate": 1.778761708530842e-05, + "loss": 1.638, + "step": 1794 + }, + { + "epoch": 0.23952495329597012, + "grad_norm": 1.0453245062677134, + "learning_rate": 1.7784905141331334e-05, + "loss": 1.7159, + "step": 1795 + }, + { + "epoch": 0.23965839338137176, + "grad_norm": 1.0905984393131543, + "learning_rate": 1.7782191743222594e-05, + "loss": 1.6959, + "step": 1796 + }, + { + "epoch": 0.2397918334667734, + "grad_norm": 1.0576313468931025, + "learning_rate": 1.7779476891489044e-05, + "loss": 1.6626, + "step": 1797 + }, + { + "epoch": 0.23992527355217508, + "grad_norm": 1.1004207621683646, + "learning_rate": 1.777676058663778e-05, + "loss": 1.6204, + "step": 1798 + }, + { + "epoch": 0.24005871363757672, + "grad_norm": 1.1873198699247158, + "learning_rate": 1.7774042829176186e-05, + "loss": 1.7395, + "step": 1799 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 1.0687609878005364, + "learning_rate": 1.77713236196119e-05, + "loss": 1.6988, + "step": 1800 + }, + { + "epoch": 0.24032559380838003, + "grad_norm": 1.0679027457183934, + "learning_rate": 1.7768602958452837e-05, + "loss": 1.6671, + "step": 1801 + }, + { + "epoch": 0.2404590338937817, + "grad_norm": 1.0417473997306737, + "learning_rate": 1.7765880846207193e-05, + "loss": 1.631, + "step": 1802 + }, + { + "epoch": 0.24059247397918335, + "grad_norm": 1.119803585337448, + "learning_rate": 1.7763157283383426e-05, + "loss": 1.6908, + "step": 1803 + }, + { + "epoch": 0.240725914064585, + "grad_norm": 1.0143233262923002, + "learning_rate": 1.7760432270490266e-05, + "loss": 1.6342, + "step": 1804 + }, + { + "epoch": 0.24085935414998666, + "grad_norm": 1.2812766744363617, + "learning_rate": 1.7757705808036714e-05, + "loss": 1.6356, + "step": 1805 + }, + { + "epoch": 0.2409927942353883, + "grad_norm": 1.0926304516768464, + "learning_rate": 1.775497789653204e-05, + "loss": 1.6898, + "step": 1806 + }, + { + "epoch": 0.24112623432078997, + "grad_norm": 1.013517851069127, + "learning_rate": 1.7752248536485787e-05, + "loss": 1.6464, + "step": 1807 + }, + { + "epoch": 0.24125967440619162, + "grad_norm": 1.0833079031525588, + "learning_rate": 1.7749517728407775e-05, + "loss": 1.6374, + "step": 1808 + }, + { + "epoch": 0.24139311449159329, + "grad_norm": 1.031007153005283, + "learning_rate": 1.774678547280808e-05, + "loss": 1.6838, + "step": 1809 + }, + { + "epoch": 0.24152655457699493, + "grad_norm": 1.1069793411968527, + "learning_rate": 1.7744051770197058e-05, + "loss": 1.6641, + "step": 1810 + }, + { + "epoch": 0.24165999466239657, + "grad_norm": 1.143899658709366, + "learning_rate": 1.7741316621085336e-05, + "loss": 1.7275, + "step": 1811 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 1.0641576337141183, + "learning_rate": 1.7738580025983808e-05, + "loss": 1.6649, + "step": 1812 + }, + { + "epoch": 0.24192687483319988, + "grad_norm": 1.2238048171560199, + "learning_rate": 1.773584198540364e-05, + "loss": 1.6442, + "step": 1813 + }, 
+ { + "epoch": 0.24206031491860155, + "grad_norm": 1.2197399885035782, + "learning_rate": 1.773310249985626e-05, + "loss": 1.648, + "step": 1814 + }, + { + "epoch": 0.2421937550040032, + "grad_norm": 1.0167788681079188, + "learning_rate": 1.773036156985338e-05, + "loss": 1.6676, + "step": 1815 + }, + { + "epoch": 0.24232719508940487, + "grad_norm": 1.2114816489016487, + "learning_rate": 1.7727619195906972e-05, + "loss": 1.6639, + "step": 1816 + }, + { + "epoch": 0.2424606351748065, + "grad_norm": 8.428467351350283, + "learning_rate": 1.7724875378529282e-05, + "loss": 1.6576, + "step": 1817 + }, + { + "epoch": 0.24259407526020815, + "grad_norm": 1.0465542814144335, + "learning_rate": 1.7722130118232823e-05, + "loss": 1.6399, + "step": 1818 + }, + { + "epoch": 0.24272751534560982, + "grad_norm": 1.1988097703131617, + "learning_rate": 1.7719383415530375e-05, + "loss": 1.626, + "step": 1819 + }, + { + "epoch": 0.24286095543101147, + "grad_norm": 1.1423995626336525, + "learning_rate": 1.7716635270934996e-05, + "loss": 1.6633, + "step": 1820 + }, + { + "epoch": 0.24299439551641314, + "grad_norm": 1.0587665476874228, + "learning_rate": 1.7713885684960002e-05, + "loss": 1.6313, + "step": 1821 + }, + { + "epoch": 0.24312783560181478, + "grad_norm": 1.2825083624500806, + "learning_rate": 1.771113465811899e-05, + "loss": 1.6683, + "step": 1822 + }, + { + "epoch": 0.24326127568721645, + "grad_norm": 1.167576975923983, + "learning_rate": 1.770838219092582e-05, + "loss": 1.6607, + "step": 1823 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 1.0672931246277235, + "learning_rate": 1.7705628283894617e-05, + "loss": 1.6341, + "step": 1824 + }, + { + "epoch": 0.24352815585801976, + "grad_norm": 1.134208618821163, + "learning_rate": 1.7702872937539783e-05, + "loss": 1.7071, + "step": 1825 + }, + { + "epoch": 0.2436615959434214, + "grad_norm": 1.2440133024491684, + "learning_rate": 1.7700116152375986e-05, + "loss": 1.7007, + "step": 1826 + }, + { + "epoch": 0.24379503602882305, + "grad_norm": 1.0256084895859983, + "learning_rate": 1.769735792891816e-05, + "loss": 1.6557, + "step": 1827 + }, + { + "epoch": 0.24392847611422472, + "grad_norm": 1.063245387715446, + "learning_rate": 1.769459826768151e-05, + "loss": 1.6766, + "step": 1828 + }, + { + "epoch": 0.24406191619962636, + "grad_norm": 1.079027783398279, + "learning_rate": 1.7691837169181513e-05, + "loss": 1.6567, + "step": 1829 + }, + { + "epoch": 0.24419535628502803, + "grad_norm": 1.332484756061736, + "learning_rate": 1.7689074633933903e-05, + "loss": 1.6403, + "step": 1830 + }, + { + "epoch": 0.24432879637042967, + "grad_norm": 1.114632167604677, + "learning_rate": 1.7686310662454698e-05, + "loss": 1.654, + "step": 1831 + }, + { + "epoch": 0.24446223645583134, + "grad_norm": 1.0194057684489455, + "learning_rate": 1.7683545255260173e-05, + "loss": 1.6437, + "step": 1832 + }, + { + "epoch": 0.244595676541233, + "grad_norm": 1.1549546103317554, + "learning_rate": 1.7680778412866876e-05, + "loss": 1.581, + "step": 1833 + }, + { + "epoch": 0.24472911662663463, + "grad_norm": 1.2807877506947511, + "learning_rate": 1.7678010135791616e-05, + "loss": 1.6518, + "step": 1834 + }, + { + "epoch": 0.2448625567120363, + "grad_norm": 1.246730596301354, + "learning_rate": 1.7675240424551483e-05, + "loss": 1.6254, + "step": 1835 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 1.0865214357163941, + "learning_rate": 1.7672469279663827e-05, + "loss": 1.6957, + "step": 1836 + }, + { + "epoch": 0.2451294368828396, + "grad_norm": 1.247069373038027, + 
"learning_rate": 1.766969670164626e-05, + "loss": 1.5999, + "step": 1837 + }, + { + "epoch": 0.24526287696824126, + "grad_norm": 1.0837926981903307, + "learning_rate": 1.7666922691016673e-05, + "loss": 1.6915, + "step": 1838 + }, + { + "epoch": 0.24539631705364293, + "grad_norm": 1.2095352189589086, + "learning_rate": 1.766414724829322e-05, + "loss": 1.6801, + "step": 1839 + }, + { + "epoch": 0.24552975713904457, + "grad_norm": 1.0527519692262675, + "learning_rate": 1.7661370373994318e-05, + "loss": 1.7169, + "step": 1840 + }, + { + "epoch": 0.2456631972244462, + "grad_norm": 1.06283226700497, + "learning_rate": 1.765859206863866e-05, + "loss": 1.653, + "step": 1841 + }, + { + "epoch": 0.24579663730984788, + "grad_norm": 1.1067902633010984, + "learning_rate": 1.7655812332745198e-05, + "loss": 1.6609, + "step": 1842 + }, + { + "epoch": 0.24593007739524952, + "grad_norm": 1.11251472312465, + "learning_rate": 1.7653031166833158e-05, + "loss": 1.6144, + "step": 1843 + }, + { + "epoch": 0.2460635174806512, + "grad_norm": 1.074234471124788, + "learning_rate": 1.765024857142203e-05, + "loss": 1.6302, + "step": 1844 + }, + { + "epoch": 0.24619695756605284, + "grad_norm": 1.2669376467474798, + "learning_rate": 1.764746454703157e-05, + "loss": 1.7182, + "step": 1845 + }, + { + "epoch": 0.2463303976514545, + "grad_norm": 1.0774090725880676, + "learning_rate": 1.7644679094181803e-05, + "loss": 1.6356, + "step": 1846 + }, + { + "epoch": 0.24646383773685615, + "grad_norm": 1.2670775121665758, + "learning_rate": 1.7641892213393017e-05, + "loss": 1.6567, + "step": 1847 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 1.2199961678910605, + "learning_rate": 1.7639103905185774e-05, + "loss": 1.6689, + "step": 1848 + }, + { + "epoch": 0.24673071790765946, + "grad_norm": 1.093950746093173, + "learning_rate": 1.7636314170080893e-05, + "loss": 1.6764, + "step": 1849 + }, + { + "epoch": 0.2468641579930611, + "grad_norm": 1.3555020553608637, + "learning_rate": 1.7633523008599468e-05, + "loss": 1.6779, + "step": 1850 + }, + { + "epoch": 0.24699759807846278, + "grad_norm": 1.2770301389385996, + "learning_rate": 1.7630730421262857e-05, + "loss": 1.6845, + "step": 1851 + }, + { + "epoch": 0.24713103816386442, + "grad_norm": 1.04055907190113, + "learning_rate": 1.7627936408592684e-05, + "loss": 1.71, + "step": 1852 + }, + { + "epoch": 0.2472644782492661, + "grad_norm": 1.2660383381787914, + "learning_rate": 1.7625140971110834e-05, + "loss": 1.6673, + "step": 1853 + }, + { + "epoch": 0.24739791833466773, + "grad_norm": 1.1241950492161565, + "learning_rate": 1.7622344109339468e-05, + "loss": 1.6426, + "step": 1854 + }, + { + "epoch": 0.24753135842006938, + "grad_norm": 1.1634703156362944, + "learning_rate": 1.7619545823801008e-05, + "loss": 1.6395, + "step": 1855 + }, + { + "epoch": 0.24766479850547105, + "grad_norm": 1.1887592831201785, + "learning_rate": 1.7616746115018136e-05, + "loss": 1.6759, + "step": 1856 + }, + { + "epoch": 0.2477982385908727, + "grad_norm": 1.04558208144064, + "learning_rate": 1.7613944983513812e-05, + "loss": 1.6663, + "step": 1857 + }, + { + "epoch": 0.24793167867627436, + "grad_norm": 1.2246725069546616, + "learning_rate": 1.7611142429811253e-05, + "loss": 1.6571, + "step": 1858 + }, + { + "epoch": 0.248065118761676, + "grad_norm": 1.064172850739474, + "learning_rate": 1.7608338454433945e-05, + "loss": 1.6479, + "step": 1859 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 1.1389852926930035, + "learning_rate": 1.7605533057905635e-05, + "loss": 1.6456, + "step": 1860 + }, + { + 
"epoch": 0.24833199893247931, + "grad_norm": 1.1395310795650293, + "learning_rate": 1.7602726240750346e-05, + "loss": 1.6769, + "step": 1861 + }, + { + "epoch": 0.24846543901788098, + "grad_norm": 1.1049036582128695, + "learning_rate": 1.7599918003492354e-05, + "loss": 1.6121, + "step": 1862 + }, + { + "epoch": 0.24859887910328263, + "grad_norm": 1.2866424724351742, + "learning_rate": 1.7597108346656206e-05, + "loss": 1.6422, + "step": 1863 + }, + { + "epoch": 0.24873231918868427, + "grad_norm": 1.0354908069373776, + "learning_rate": 1.7594297270766713e-05, + "loss": 1.6581, + "step": 1864 + }, + { + "epoch": 0.24886575927408594, + "grad_norm": 1.0312439084550453, + "learning_rate": 1.7591484776348958e-05, + "loss": 1.6143, + "step": 1865 + }, + { + "epoch": 0.24899919935948758, + "grad_norm": 1.0200041648342764, + "learning_rate": 1.7588670863928278e-05, + "loss": 1.6014, + "step": 1866 + }, + { + "epoch": 0.24913263944488925, + "grad_norm": 1.0618120625694842, + "learning_rate": 1.758585553403028e-05, + "loss": 1.6641, + "step": 1867 + }, + { + "epoch": 0.2492660795302909, + "grad_norm": 1.0713518559188207, + "learning_rate": 1.758303878718084e-05, + "loss": 1.6472, + "step": 1868 + }, + { + "epoch": 0.24939951961569257, + "grad_norm": 1.0046786864644603, + "learning_rate": 1.7580220623906088e-05, + "loss": 1.6321, + "step": 1869 + }, + { + "epoch": 0.2495329597010942, + "grad_norm": 1.1134637775554515, + "learning_rate": 1.7577401044732428e-05, + "loss": 1.6474, + "step": 1870 + }, + { + "epoch": 0.24966639978649585, + "grad_norm": 1.27550190258781, + "learning_rate": 1.7574580050186524e-05, + "loss": 1.6594, + "step": 1871 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 1.0247658171892224, + "learning_rate": 1.7571757640795308e-05, + "loss": 1.6159, + "step": 1872 + }, + { + "epoch": 0.24993327995729916, + "grad_norm": 1.0462382916284119, + "learning_rate": 1.7568933817085966e-05, + "loss": 1.5982, + "step": 1873 + }, + { + "epoch": 0.2500667200427008, + "grad_norm": 1.0804215813368583, + "learning_rate": 1.756610857958597e-05, + "loss": 1.7014, + "step": 1874 + }, + { + "epoch": 0.2502001601281025, + "grad_norm": 1.0738193355707433, + "learning_rate": 1.756328192882303e-05, + "loss": 1.6642, + "step": 1875 + }, + { + "epoch": 0.25033360021350415, + "grad_norm": 1.3455199376826739, + "learning_rate": 1.7560453865325143e-05, + "loss": 1.6508, + "step": 1876 + }, + { + "epoch": 0.2504670402989058, + "grad_norm": 1.084814244415092, + "learning_rate": 1.7557624389620548e-05, + "loss": 1.6794, + "step": 1877 + }, + { + "epoch": 0.25060048038430743, + "grad_norm": 1.061930007641208, + "learning_rate": 1.7554793502237765e-05, + "loss": 1.6879, + "step": 1878 + }, + { + "epoch": 0.2507339204697091, + "grad_norm": 1.0450629751248275, + "learning_rate": 1.7551961203705573e-05, + "loss": 1.6783, + "step": 1879 + }, + { + "epoch": 0.2508673605551108, + "grad_norm": 1.0478570277539474, + "learning_rate": 1.7549127494553005e-05, + "loss": 1.6616, + "step": 1880 + }, + { + "epoch": 0.2510008006405124, + "grad_norm": 1.055824286400764, + "learning_rate": 1.7546292375309375e-05, + "loss": 1.5953, + "step": 1881 + }, + { + "epoch": 0.25113424072591406, + "grad_norm": 1.2242477360612551, + "learning_rate": 1.7543455846504245e-05, + "loss": 1.6843, + "step": 1882 + }, + { + "epoch": 0.25126768081131573, + "grad_norm": 1.0389910946552412, + "learning_rate": 1.754061790866745e-05, + "loss": 1.6307, + "step": 1883 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 1.054285947670663, + 
"learning_rate": 1.7537778562329083e-05, + "loss": 1.6369, + "step": 1884 + }, + { + "epoch": 0.251534560982119, + "grad_norm": 1.2248259207278507, + "learning_rate": 1.7534937808019502e-05, + "loss": 1.6693, + "step": 1885 + }, + { + "epoch": 0.2516680010675207, + "grad_norm": 19.66684114165284, + "learning_rate": 1.7532095646269324e-05, + "loss": 1.7343, + "step": 1886 + }, + { + "epoch": 0.25180144115292236, + "grad_norm": 1.1148849102770775, + "learning_rate": 1.7529252077609435e-05, + "loss": 1.6762, + "step": 1887 + }, + { + "epoch": 0.25193488123832397, + "grad_norm": 1.3680236555548535, + "learning_rate": 1.7526407102570985e-05, + "loss": 1.6241, + "step": 1888 + }, + { + "epoch": 0.25206832132372564, + "grad_norm": 1.0884368102610955, + "learning_rate": 1.7523560721685376e-05, + "loss": 1.6284, + "step": 1889 + }, + { + "epoch": 0.2522017614091273, + "grad_norm": 1.0802430716006264, + "learning_rate": 1.7520712935484288e-05, + "loss": 1.6485, + "step": 1890 + }, + { + "epoch": 0.252335201494529, + "grad_norm": 1.0586064611090562, + "learning_rate": 1.7517863744499645e-05, + "loss": 1.5886, + "step": 1891 + }, + { + "epoch": 0.2524686415799306, + "grad_norm": 1.0549630121480713, + "learning_rate": 1.7515013149263654e-05, + "loss": 1.6656, + "step": 1892 + }, + { + "epoch": 0.25260208166533227, + "grad_norm": 1.021796454571563, + "learning_rate": 1.7512161150308763e-05, + "loss": 1.6442, + "step": 1893 + }, + { + "epoch": 0.25273552175073394, + "grad_norm": 1.0820683485752427, + "learning_rate": 1.7509307748167703e-05, + "loss": 1.7135, + "step": 1894 + }, + { + "epoch": 0.25286896183613555, + "grad_norm": 1.1110777101924227, + "learning_rate": 1.750645294337345e-05, + "loss": 1.6766, + "step": 1895 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 1.380759537151671, + "learning_rate": 1.750359673645925e-05, + "loss": 1.6784, + "step": 1896 + }, + { + "epoch": 0.2531358420069389, + "grad_norm": 1.2299813651145617, + "learning_rate": 1.750073912795861e-05, + "loss": 1.6679, + "step": 1897 + }, + { + "epoch": 0.25326928209234056, + "grad_norm": 13.108704611195062, + "learning_rate": 1.7497880118405305e-05, + "loss": 1.6712, + "step": 1898 + }, + { + "epoch": 0.2534027221777422, + "grad_norm": 8.088838037228543, + "learning_rate": 1.7495019708333362e-05, + "loss": 1.8226, + "step": 1899 + }, + { + "epoch": 0.25353616226314385, + "grad_norm": 1.284583284047351, + "learning_rate": 1.7492157898277066e-05, + "loss": 1.6856, + "step": 1900 + }, + { + "epoch": 0.2536696023485455, + "grad_norm": 1.1185025918470872, + "learning_rate": 1.7489294688770976e-05, + "loss": 1.6563, + "step": 1901 + }, + { + "epoch": 0.25380304243394713, + "grad_norm": 1.310393482796925, + "learning_rate": 1.748643008034991e-05, + "loss": 1.6655, + "step": 1902 + }, + { + "epoch": 0.2539364825193488, + "grad_norm": 1.2823242056261748, + "learning_rate": 1.748356407354894e-05, + "loss": 1.6294, + "step": 1903 + }, + { + "epoch": 0.2540699226047505, + "grad_norm": 1.115456828324836, + "learning_rate": 1.7480696668903405e-05, + "loss": 1.6311, + "step": 1904 + }, + { + "epoch": 0.25420336269015215, + "grad_norm": 1.098358515379306, + "learning_rate": 1.7477827866948902e-05, + "loss": 1.6474, + "step": 1905 + }, + { + "epoch": 0.25433680277555376, + "grad_norm": 1.2499952987679348, + "learning_rate": 1.7474957668221294e-05, + "loss": 1.6127, + "step": 1906 + }, + { + "epoch": 0.25447024286095543, + "grad_norm": 1.0653256471838468, + "learning_rate": 1.7472086073256695e-05, + "loss": 1.6237, + "step": 1907 + }, + { + 
"epoch": 0.2546036829463571, + "grad_norm": 1.100375109746571, + "learning_rate": 1.7469213082591493e-05, + "loss": 1.6623, + "step": 1908 + }, + { + "epoch": 0.2547371230317587, + "grad_norm": 1.0728865640963992, + "learning_rate": 1.746633869676233e-05, + "loss": 1.6604, + "step": 1909 + }, + { + "epoch": 0.2548705631171604, + "grad_norm": 1.4592548751795749, + "learning_rate": 1.7463462916306103e-05, + "loss": 1.7118, + "step": 1910 + }, + { + "epoch": 0.25500400320256206, + "grad_norm": 1.1932048673196494, + "learning_rate": 1.7460585741759978e-05, + "loss": 1.6134, + "step": 1911 + }, + { + "epoch": 0.2551374432879637, + "grad_norm": 1.2140073379811964, + "learning_rate": 1.7457707173661378e-05, + "loss": 1.7213, + "step": 1912 + }, + { + "epoch": 0.25527088337336534, + "grad_norm": 1.1053613025091942, + "learning_rate": 1.7454827212547988e-05, + "loss": 1.6205, + "step": 1913 + }, + { + "epoch": 0.255404323458767, + "grad_norm": 1.0688175948277936, + "learning_rate": 1.745194585895775e-05, + "loss": 1.6845, + "step": 1914 + }, + { + "epoch": 0.2555377635441687, + "grad_norm": 1.0696512783400127, + "learning_rate": 1.744906311342887e-05, + "loss": 1.6337, + "step": 1915 + }, + { + "epoch": 0.2556712036295703, + "grad_norm": 1.0723419328954982, + "learning_rate": 1.744617897649981e-05, + "loss": 1.631, + "step": 1916 + }, + { + "epoch": 0.25580464371497197, + "grad_norm": 1.0336966371989529, + "learning_rate": 1.7443293448709297e-05, + "loss": 1.6656, + "step": 1917 + }, + { + "epoch": 0.25593808380037364, + "grad_norm": 1.0461578537171803, + "learning_rate": 1.7440406530596312e-05, + "loss": 1.6317, + "step": 1918 + }, + { + "epoch": 0.2560715238857753, + "grad_norm": 1.1097399246146262, + "learning_rate": 1.74375182227001e-05, + "loss": 1.6383, + "step": 1919 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 1.0337650567788563, + "learning_rate": 1.7434628525560163e-05, + "loss": 1.6528, + "step": 1920 + }, + { + "epoch": 0.2563384040565786, + "grad_norm": 1.018254122534702, + "learning_rate": 1.7431737439716262e-05, + "loss": 1.6495, + "step": 1921 + }, + { + "epoch": 0.25647184414198027, + "grad_norm": 1.0402215429993655, + "learning_rate": 1.7428844965708425e-05, + "loss": 1.6646, + "step": 1922 + }, + { + "epoch": 0.2566052842273819, + "grad_norm": 1.2456549178005256, + "learning_rate": 1.7425951104076925e-05, + "loss": 1.6625, + "step": 1923 + }, + { + "epoch": 0.25673872431278355, + "grad_norm": 1.1766568400761854, + "learning_rate": 1.7423055855362306e-05, + "loss": 1.6389, + "step": 1924 + }, + { + "epoch": 0.2568721643981852, + "grad_norm": 0.9891969725565736, + "learning_rate": 1.742015922010537e-05, + "loss": 1.6642, + "step": 1925 + }, + { + "epoch": 0.2570056044835869, + "grad_norm": 1.0679382580870513, + "learning_rate": 1.7417261198847175e-05, + "loss": 1.6542, + "step": 1926 + }, + { + "epoch": 0.2571390445689885, + "grad_norm": 1.0468412544977608, + "learning_rate": 1.7414361792129034e-05, + "loss": 1.6376, + "step": 1927 + }, + { + "epoch": 0.2572724846543902, + "grad_norm": 1.0350649856988523, + "learning_rate": 1.7411461000492527e-05, + "loss": 1.6653, + "step": 1928 + }, + { + "epoch": 0.25740592473979185, + "grad_norm": 1.036385533715168, + "learning_rate": 1.7408558824479485e-05, + "loss": 1.6968, + "step": 1929 + }, + { + "epoch": 0.25753936482519346, + "grad_norm": 1.0569592050053869, + "learning_rate": 1.7405655264632007e-05, + "loss": 1.6024, + "step": 1930 + }, + { + "epoch": 0.25767280491059513, + "grad_norm": 1.004763067338033, + "learning_rate": 
1.7402750321492445e-05, + "loss": 1.6594, + "step": 1931 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 1.1608084994679726, + "learning_rate": 1.7399843995603404e-05, + "loss": 1.657, + "step": 1932 + }, + { + "epoch": 0.2579396850813985, + "grad_norm": 1.1836864291351534, + "learning_rate": 1.739693628750775e-05, + "loss": 1.6493, + "step": 1933 + }, + { + "epoch": 0.2580731251668001, + "grad_norm": 1.0463963871461142, + "learning_rate": 1.739402719774862e-05, + "loss": 1.6738, + "step": 1934 + }, + { + "epoch": 0.25820656525220176, + "grad_norm": 1.0417140646609107, + "learning_rate": 1.7391116726869395e-05, + "loss": 1.6386, + "step": 1935 + }, + { + "epoch": 0.25834000533760343, + "grad_norm": 1.0230471101003555, + "learning_rate": 1.7388204875413716e-05, + "loss": 1.6851, + "step": 1936 + }, + { + "epoch": 0.25847344542300504, + "grad_norm": 1.175831444603941, + "learning_rate": 1.7385291643925478e-05, + "loss": 1.631, + "step": 1937 + }, + { + "epoch": 0.2586068855084067, + "grad_norm": 1.0005993864292368, + "learning_rate": 1.738237703294885e-05, + "loss": 1.6293, + "step": 1938 + }, + { + "epoch": 0.2587403255938084, + "grad_norm": 1.071233770103943, + "learning_rate": 1.7379461043028242e-05, + "loss": 1.6477, + "step": 1939 + }, + { + "epoch": 0.25887376567921005, + "grad_norm": 1.27924501051401, + "learning_rate": 1.7376543674708332e-05, + "loss": 1.6601, + "step": 1940 + }, + { + "epoch": 0.25900720576461167, + "grad_norm": 1.0592769595491713, + "learning_rate": 1.737362492853405e-05, + "loss": 1.7318, + "step": 1941 + }, + { + "epoch": 0.25914064585001334, + "grad_norm": 1.0043569196961857, + "learning_rate": 1.737070480505058e-05, + "loss": 1.6029, + "step": 1942 + }, + { + "epoch": 0.259274085935415, + "grad_norm": 1.0651608229693987, + "learning_rate": 1.7367783304803373e-05, + "loss": 1.6302, + "step": 1943 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 1.0274924793106943, + "learning_rate": 1.736486042833813e-05, + "loss": 1.6225, + "step": 1944 + }, + { + "epoch": 0.2595409661062183, + "grad_norm": 1.0973325495792983, + "learning_rate": 1.7361936176200806e-05, + "loss": 1.637, + "step": 1945 + }, + { + "epoch": 0.25967440619161997, + "grad_norm": 1.1480559231420868, + "learning_rate": 1.735901054893763e-05, + "loss": 1.6579, + "step": 1946 + }, + { + "epoch": 0.25980784627702164, + "grad_norm": 9.742353418129637, + "learning_rate": 1.7356083547095065e-05, + "loss": 1.7036, + "step": 1947 + }, + { + "epoch": 0.25994128636242325, + "grad_norm": 1.1930642002916203, + "learning_rate": 1.7353155171219845e-05, + "loss": 1.686, + "step": 1948 + }, + { + "epoch": 0.2600747264478249, + "grad_norm": 1.1344495102081051, + "learning_rate": 1.7350225421858963e-05, + "loss": 1.6927, + "step": 1949 + }, + { + "epoch": 0.2602081665332266, + "grad_norm": 1.160268471890251, + "learning_rate": 1.7347294299559656e-05, + "loss": 1.6395, + "step": 1950 + }, + { + "epoch": 0.26034160661862826, + "grad_norm": 1.0571542473634332, + "learning_rate": 1.7344361804869422e-05, + "loss": 1.6482, + "step": 1951 + }, + { + "epoch": 0.2604750467040299, + "grad_norm": 1.0741618519583103, + "learning_rate": 1.734142793833602e-05, + "loss": 1.6322, + "step": 1952 + }, + { + "epoch": 0.26060848678943155, + "grad_norm": 1.0810620959227508, + "learning_rate": 1.733849270050747e-05, + "loss": 1.6614, + "step": 1953 + }, + { + "epoch": 0.2607419268748332, + "grad_norm": 1.650858745406716, + "learning_rate": 1.7335556091932033e-05, + "loss": 1.6594, + "step": 1954 + }, + { + "epoch": 
0.26087536696023483, + "grad_norm": 1.1357173618685, + "learning_rate": 1.7332618113158238e-05, + "loss": 1.6246, + "step": 1955 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 1.228866928090115, + "learning_rate": 1.7329678764734865e-05, + "loss": 1.6502, + "step": 1956 + }, + { + "epoch": 0.2611422471310382, + "grad_norm": 1.063500974508951, + "learning_rate": 1.732673804721095e-05, + "loss": 1.6553, + "step": 1957 + }, + { + "epoch": 0.26127568721643984, + "grad_norm": 1.1182247677924306, + "learning_rate": 1.732379596113578e-05, + "loss": 1.5929, + "step": 1958 + }, + { + "epoch": 0.26140912730184146, + "grad_norm": 1.0343485691283105, + "learning_rate": 1.7320852507058914e-05, + "loss": 1.6195, + "step": 1959 + }, + { + "epoch": 0.26154256738724313, + "grad_norm": 1.1899371182298315, + "learning_rate": 1.7317907685530152e-05, + "loss": 1.6208, + "step": 1960 + }, + { + "epoch": 0.2616760074726448, + "grad_norm": 1.2655530384071296, + "learning_rate": 1.7314961497099546e-05, + "loss": 1.6743, + "step": 1961 + }, + { + "epoch": 0.2618094475580464, + "grad_norm": 1.1142841192035378, + "learning_rate": 1.7312013942317423e-05, + "loss": 1.6915, + "step": 1962 + }, + { + "epoch": 0.2619428876434481, + "grad_norm": 1.2748126364938344, + "learning_rate": 1.7309065021734345e-05, + "loss": 1.6708, + "step": 1963 + }, + { + "epoch": 0.26207632772884976, + "grad_norm": 1.0763574665626, + "learning_rate": 1.7306114735901135e-05, + "loss": 1.6867, + "step": 1964 + }, + { + "epoch": 0.2622097678142514, + "grad_norm": 1.1840696437423681, + "learning_rate": 1.7303163085368876e-05, + "loss": 1.6526, + "step": 1965 + }, + { + "epoch": 0.26234320789965304, + "grad_norm": 1.0215098328750423, + "learning_rate": 1.7300210070688902e-05, + "loss": 1.6395, + "step": 1966 + }, + { + "epoch": 0.2624766479850547, + "grad_norm": 1.0619335145218514, + "learning_rate": 1.7297255692412807e-05, + "loss": 1.6554, + "step": 1967 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 1.0512215210347395, + "learning_rate": 1.7294299951092427e-05, + "loss": 1.6577, + "step": 1968 + }, + { + "epoch": 0.262743528155858, + "grad_norm": 1.1231704876083466, + "learning_rate": 1.7291342847279864e-05, + "loss": 1.61, + "step": 1969 + }, + { + "epoch": 0.26287696824125967, + "grad_norm": 1.051508193824374, + "learning_rate": 1.7288384381527473e-05, + "loss": 1.6351, + "step": 1970 + }, + { + "epoch": 0.26301040832666134, + "grad_norm": 1.0623240318595546, + "learning_rate": 1.7285424554387863e-05, + "loss": 1.6435, + "step": 1971 + }, + { + "epoch": 0.263143848412063, + "grad_norm": 1.0134535097893844, + "learning_rate": 1.7282463366413895e-05, + "loss": 1.6195, + "step": 1972 + }, + { + "epoch": 0.2632772884974646, + "grad_norm": 1.0944547998591982, + "learning_rate": 1.7279500818158678e-05, + "loss": 1.6385, + "step": 1973 + }, + { + "epoch": 0.2634107285828663, + "grad_norm": 1.014436566990623, + "learning_rate": 1.7276536910175596e-05, + "loss": 1.5849, + "step": 1974 + }, + { + "epoch": 0.26354416866826796, + "grad_norm": 1.0957069455341975, + "learning_rate": 1.727357164301826e-05, + "loss": 1.6463, + "step": 1975 + }, + { + "epoch": 0.2636776087536696, + "grad_norm": 1.2346921385867662, + "learning_rate": 1.7270605017240557e-05, + "loss": 1.653, + "step": 1976 + }, + { + "epoch": 0.26381104883907125, + "grad_norm": 1.0897637998274192, + "learning_rate": 1.7267637033396613e-05, + "loss": 1.6921, + "step": 1977 + }, + { + "epoch": 0.2639444889244729, + "grad_norm": 1.415961246967308, + "learning_rate": 
1.7264667692040816e-05, + "loss": 1.6308, + "step": 1978 + }, + { + "epoch": 0.2640779290098746, + "grad_norm": 14.277629558243925, + "learning_rate": 1.726169699372781e-05, + "loss": 1.6807, + "step": 1979 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 1.3259197439263068, + "learning_rate": 1.725872493901248e-05, + "loss": 1.6796, + "step": 1980 + }, + { + "epoch": 0.2643448091806779, + "grad_norm": 1.4714297960193359, + "learning_rate": 1.7255751528449972e-05, + "loss": 1.6084, + "step": 1981 + }, + { + "epoch": 0.26447824926607955, + "grad_norm": 1.3783447543972154, + "learning_rate": 1.7252776762595695e-05, + "loss": 1.674, + "step": 1982 + }, + { + "epoch": 0.26461168935148116, + "grad_norm": 1.345089903670124, + "learning_rate": 1.724980064200529e-05, + "loss": 1.6719, + "step": 1983 + }, + { + "epoch": 0.26474512943688283, + "grad_norm": 1.6788138951425957, + "learning_rate": 1.724682316723467e-05, + "loss": 1.6242, + "step": 1984 + }, + { + "epoch": 0.2648785695222845, + "grad_norm": 1.057556035989643, + "learning_rate": 1.724384433883999e-05, + "loss": 1.6688, + "step": 1985 + }, + { + "epoch": 0.26501200960768617, + "grad_norm": 1.935146990397484, + "learning_rate": 1.724086415737766e-05, + "loss": 1.6685, + "step": 1986 + }, + { + "epoch": 0.2651454496930878, + "grad_norm": 1.4029312343106273, + "learning_rate": 1.7237882623404347e-05, + "loss": 1.69, + "step": 1987 + }, + { + "epoch": 0.26527888977848946, + "grad_norm": 1.0640529672227876, + "learning_rate": 1.7234899737476968e-05, + "loss": 1.5853, + "step": 1988 + }, + { + "epoch": 0.2654123298638911, + "grad_norm": 1.0733639436628284, + "learning_rate": 1.7231915500152685e-05, + "loss": 1.677, + "step": 1989 + }, + { + "epoch": 0.26554576994929274, + "grad_norm": 1.0241005822898823, + "learning_rate": 1.722892991198893e-05, + "loss": 1.6374, + "step": 1990 + }, + { + "epoch": 0.2656792100346944, + "grad_norm": 1.0875108906486028, + "learning_rate": 1.7225942973543368e-05, + "loss": 1.6445, + "step": 1991 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 1.2697214356659474, + "learning_rate": 1.7222954685373933e-05, + "loss": 1.5832, + "step": 1992 + }, + { + "epoch": 0.26594609020549775, + "grad_norm": 1.0830834504640086, + "learning_rate": 1.7219965048038795e-05, + "loss": 1.6888, + "step": 1993 + }, + { + "epoch": 0.26607953029089937, + "grad_norm": 1.3072951866209532, + "learning_rate": 1.7216974062096392e-05, + "loss": 1.6289, + "step": 1994 + }, + { + "epoch": 0.26621297037630104, + "grad_norm": 1.0836421243679286, + "learning_rate": 1.72139817281054e-05, + "loss": 1.6658, + "step": 1995 + }, + { + "epoch": 0.2663464104617027, + "grad_norm": 1.8046043701192065, + "learning_rate": 1.7210988046624758e-05, + "loss": 1.622, + "step": 1996 + }, + { + "epoch": 0.2664798505471043, + "grad_norm": 1.0933300325897486, + "learning_rate": 1.720799301821365e-05, + "loss": 1.6297, + "step": 1997 + }, + { + "epoch": 0.266613290632506, + "grad_norm": 1.0459969375513192, + "learning_rate": 1.720499664343151e-05, + "loss": 1.6484, + "step": 1998 + }, + { + "epoch": 0.26674673071790767, + "grad_norm": 1.0751449290453639, + "learning_rate": 1.720199892283803e-05, + "loss": 1.625, + "step": 1999 + }, + { + "epoch": 0.26688017080330934, + "grad_norm": 1.0928368447753933, + "learning_rate": 1.719899985699315e-05, + "loss": 1.6806, + "step": 2000 + }, + { + "epoch": 0.26701361088871095, + "grad_norm": 1.0349222592130347, + "learning_rate": 1.7195999446457053e-05, + "loss": 1.595, + "step": 2001 + }, + { + "epoch": 
0.2671470509741126, + "grad_norm": 1.063465046316729, + "learning_rate": 1.7192997691790197e-05, + "loss": 1.6143, + "step": 2002 + }, + { + "epoch": 0.2672804910595143, + "grad_norm": 1.0284024134886187, + "learning_rate": 1.7189994593553266e-05, + "loss": 1.6447, + "step": 2003 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 1.307165702348938, + "learning_rate": 1.7186990152307203e-05, + "loss": 1.6691, + "step": 2004 + }, + { + "epoch": 0.2675473712303176, + "grad_norm": 1.119045388640825, + "learning_rate": 1.718398436861321e-05, + "loss": 1.6466, + "step": 2005 + }, + { + "epoch": 0.26768081131571925, + "grad_norm": 1.0226913583499544, + "learning_rate": 1.718097724303273e-05, + "loss": 1.6593, + "step": 2006 + }, + { + "epoch": 0.2678142514011209, + "grad_norm": 1.1645428303381167, + "learning_rate": 1.717796877612746e-05, + "loss": 1.6316, + "step": 2007 + }, + { + "epoch": 0.26794769148652253, + "grad_norm": 1.0290487690958936, + "learning_rate": 1.7174958968459344e-05, + "loss": 1.6416, + "step": 2008 + }, + { + "epoch": 0.2680811315719242, + "grad_norm": 1.113710094399956, + "learning_rate": 1.7171947820590584e-05, + "loss": 1.6459, + "step": 2009 + }, + { + "epoch": 0.2682145716573259, + "grad_norm": 1.0721033027517342, + "learning_rate": 1.7168935333083624e-05, + "loss": 1.6481, + "step": 2010 + }, + { + "epoch": 0.2683480117427275, + "grad_norm": 4.148877971329257, + "learning_rate": 1.7165921506501168e-05, + "loss": 1.6853, + "step": 2011 + }, + { + "epoch": 0.26848145182812916, + "grad_norm": 1.026325617891188, + "learning_rate": 1.716290634140616e-05, + "loss": 1.6395, + "step": 2012 + }, + { + "epoch": 0.26861489191353083, + "grad_norm": 1.0458614779045183, + "learning_rate": 1.71598898383618e-05, + "loss": 1.6385, + "step": 2013 + }, + { + "epoch": 0.2687483319989325, + "grad_norm": 1.3489378298749055, + "learning_rate": 1.715687199793154e-05, + "loss": 1.6306, + "step": 2014 + }, + { + "epoch": 0.2688817720843341, + "grad_norm": 1.066106782654306, + "learning_rate": 1.7153852820679073e-05, + "loss": 1.6143, + "step": 2015 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 1.0517288617879899, + "learning_rate": 1.715083230716835e-05, + "loss": 1.6918, + "step": 2016 + }, + { + "epoch": 0.26914865225513745, + "grad_norm": 1.0631774833951584, + "learning_rate": 1.7147810457963565e-05, + "loss": 1.6196, + "step": 2017 + }, + { + "epoch": 0.2692820923405391, + "grad_norm": 1.5500958673330705, + "learning_rate": 1.714478727362917e-05, + "loss": 1.605, + "step": 2018 + }, + { + "epoch": 0.26941553242594074, + "grad_norm": 1.2061760172934797, + "learning_rate": 1.7141762754729855e-05, + "loss": 1.627, + "step": 2019 + }, + { + "epoch": 0.2695489725113424, + "grad_norm": 1.063335069996398, + "learning_rate": 1.713873690183057e-05, + "loss": 1.6546, + "step": 2020 + }, + { + "epoch": 0.2696824125967441, + "grad_norm": 1.0345427555272797, + "learning_rate": 1.713570971549651e-05, + "loss": 1.6332, + "step": 2021 + }, + { + "epoch": 0.2698158526821457, + "grad_norm": 1.0387506796892585, + "learning_rate": 1.713268119629312e-05, + "loss": 1.6208, + "step": 2022 + }, + { + "epoch": 0.26994929276754737, + "grad_norm": 1.0451937535961915, + "learning_rate": 1.7129651344786088e-05, + "loss": 1.6419, + "step": 2023 + }, + { + "epoch": 0.27008273285294904, + "grad_norm": 1.0424161111119927, + "learning_rate": 1.7126620161541364e-05, + "loss": 1.6801, + "step": 2024 + }, + { + "epoch": 0.2702161729383507, + "grad_norm": 1.0453586363001601, + "learning_rate": 
1.712358764712513e-05, + "loss": 1.6202, + "step": 2025 + }, + { + "epoch": 0.2703496130237523, + "grad_norm": 1.1453862840223692, + "learning_rate": 1.7120553802103828e-05, + "loss": 1.6623, + "step": 2026 + }, + { + "epoch": 0.270483053109154, + "grad_norm": 1.325903344513902, + "learning_rate": 1.7117518627044148e-05, + "loss": 1.6376, + "step": 2027 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 1.080112760240277, + "learning_rate": 1.7114482122513024e-05, + "loss": 1.6184, + "step": 2028 + }, + { + "epoch": 0.2707499332799573, + "grad_norm": 1.0594663654156977, + "learning_rate": 1.711144428907764e-05, + "loss": 1.6887, + "step": 2029 + }, + { + "epoch": 0.27088337336535895, + "grad_norm": 1.0462695966057693, + "learning_rate": 1.710840512730543e-05, + "loss": 1.5665, + "step": 2030 + }, + { + "epoch": 0.2710168134507606, + "grad_norm": 1.1251591649106787, + "learning_rate": 1.7105364637764075e-05, + "loss": 1.6294, + "step": 2031 + }, + { + "epoch": 0.2711502535361623, + "grad_norm": 1.1329461782399421, + "learning_rate": 1.7102322821021505e-05, + "loss": 1.6344, + "step": 2032 + }, + { + "epoch": 0.2712836936215639, + "grad_norm": 1.0166935218662512, + "learning_rate": 1.709927967764589e-05, + "loss": 1.6171, + "step": 2033 + }, + { + "epoch": 0.2714171337069656, + "grad_norm": 1.2601520388342762, + "learning_rate": 1.7096235208205665e-05, + "loss": 1.634, + "step": 2034 + }, + { + "epoch": 0.27155057379236724, + "grad_norm": 1.076982069439433, + "learning_rate": 1.7093189413269497e-05, + "loss": 1.6637, + "step": 2035 + }, + { + "epoch": 0.27168401387776886, + "grad_norm": 1.0321358819200304, + "learning_rate": 1.70901422934063e-05, + "loss": 1.6222, + "step": 2036 + }, + { + "epoch": 0.27181745396317053, + "grad_norm": 1.0654015458142159, + "learning_rate": 1.708709384918525e-05, + "loss": 1.586, + "step": 2037 + }, + { + "epoch": 0.2719508940485722, + "grad_norm": 0.9875095341342335, + "learning_rate": 1.708404408117576e-05, + "loss": 1.6419, + "step": 2038 + }, + { + "epoch": 0.27208433413397387, + "grad_norm": 1.2163989586437813, + "learning_rate": 1.708099298994749e-05, + "loss": 1.6356, + "step": 2039 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 1.0424446221236658, + "learning_rate": 1.7077940576070347e-05, + "loss": 1.6426, + "step": 2040 + }, + { + "epoch": 0.27235121430477716, + "grad_norm": 1.1917437972972873, + "learning_rate": 1.707488684011449e-05, + "loss": 1.6128, + "step": 2041 + }, + { + "epoch": 0.2724846543901788, + "grad_norm": 1.0561747251809694, + "learning_rate": 1.7071831782650325e-05, + "loss": 1.6523, + "step": 2042 + }, + { + "epoch": 0.27261809447558044, + "grad_norm": 1.0385624086824277, + "learning_rate": 1.7068775404248497e-05, + "loss": 1.6634, + "step": 2043 + }, + { + "epoch": 0.2727515345609821, + "grad_norm": 1.303828973820152, + "learning_rate": 1.7065717705479906e-05, + "loss": 1.6508, + "step": 2044 + }, + { + "epoch": 0.2728849746463838, + "grad_norm": 1.1910010358615803, + "learning_rate": 1.706265868691569e-05, + "loss": 1.6699, + "step": 2045 + }, + { + "epoch": 0.27301841473178545, + "grad_norm": 1.1784688305684132, + "learning_rate": 1.7059598349127245e-05, + "loss": 1.645, + "step": 2046 + }, + { + "epoch": 0.27315185481718707, + "grad_norm": 0.985356851054121, + "learning_rate": 1.7056536692686204e-05, + "loss": 1.6146, + "step": 2047 + }, + { + "epoch": 0.27328529490258874, + "grad_norm": 1.0255343499954996, + "learning_rate": 1.7053473718164455e-05, + "loss": 1.6418, + "step": 2048 + }, + { + "epoch": 
0.2734187349879904, + "grad_norm": 1.0638983947233618, + "learning_rate": 1.7050409426134117e-05, + "loss": 1.7435, + "step": 2049 + }, + { + "epoch": 0.273552175073392, + "grad_norm": 1.0851770979360476, + "learning_rate": 1.704734381716757e-05, + "loss": 1.658, + "step": 2050 + }, + { + "epoch": 0.2736856151587937, + "grad_norm": 1.1763821324448986, + "learning_rate": 1.704427689183744e-05, + "loss": 1.6535, + "step": 2051 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 1.3104441981575137, + "learning_rate": 1.7041208650716586e-05, + "loss": 1.6589, + "step": 2052 + }, + { + "epoch": 0.27395249532959703, + "grad_norm": 1.0136315973372723, + "learning_rate": 1.703813909437812e-05, + "loss": 1.6133, + "step": 2053 + }, + { + "epoch": 0.27408593541499865, + "grad_norm": 0.9960414593425704, + "learning_rate": 1.7035068223395407e-05, + "loss": 1.621, + "step": 2054 + }, + { + "epoch": 0.2742193755004003, + "grad_norm": 1.2127828477430966, + "learning_rate": 1.7031996038342045e-05, + "loss": 1.6397, + "step": 2055 + }, + { + "epoch": 0.274352815585802, + "grad_norm": 0.9828639066951254, + "learning_rate": 1.702892253979189e-05, + "loss": 1.6125, + "step": 2056 + }, + { + "epoch": 0.2744862556712036, + "grad_norm": 0.9758056512184737, + "learning_rate": 1.7025847728319027e-05, + "loss": 1.6425, + "step": 2057 + }, + { + "epoch": 0.2746196957566053, + "grad_norm": 1.0754633573934569, + "learning_rate": 1.7022771604497802e-05, + "loss": 1.6622, + "step": 2058 + }, + { + "epoch": 0.27475313584200695, + "grad_norm": 1.0344089922720026, + "learning_rate": 1.70196941689028e-05, + "loss": 1.6888, + "step": 2059 + }, + { + "epoch": 0.2748865759274086, + "grad_norm": 1.0736302080536169, + "learning_rate": 1.7016615422108847e-05, + "loss": 1.658, + "step": 2060 + }, + { + "epoch": 0.27502001601281023, + "grad_norm": 1.20409158832475, + "learning_rate": 1.7013535364691023e-05, + "loss": 1.6427, + "step": 2061 + }, + { + "epoch": 0.2751534560982119, + "grad_norm": 1.069163552633532, + "learning_rate": 1.7010453997224642e-05, + "loss": 1.6237, + "step": 2062 + }, + { + "epoch": 0.27528689618361357, + "grad_norm": 1.1894978043506685, + "learning_rate": 1.7007371320285275e-05, + "loss": 1.6635, + "step": 2063 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 1.1903195019190183, + "learning_rate": 1.7004287334448723e-05, + "loss": 1.5996, + "step": 2064 + }, + { + "epoch": 0.27555377635441686, + "grad_norm": 1.0460875658896527, + "learning_rate": 1.7001202040291048e-05, + "loss": 1.7078, + "step": 2065 + }, + { + "epoch": 0.2756872164398185, + "grad_norm": 1.4369499128117869, + "learning_rate": 1.699811543838854e-05, + "loss": 1.6444, + "step": 2066 + }, + { + "epoch": 0.2758206565252202, + "grad_norm": 1.209341903013088, + "learning_rate": 1.6995027529317746e-05, + "loss": 1.6181, + "step": 2067 + }, + { + "epoch": 0.2759540966106218, + "grad_norm": 1.1036519149996684, + "learning_rate": 1.6991938313655453e-05, + "loss": 1.6224, + "step": 2068 + }, + { + "epoch": 0.2760875366960235, + "grad_norm": 1.146216102199816, + "learning_rate": 1.6988847791978687e-05, + "loss": 1.6165, + "step": 2069 + }, + { + "epoch": 0.27622097678142515, + "grad_norm": 1.1564719422933685, + "learning_rate": 1.6985755964864723e-05, + "loss": 1.7315, + "step": 2070 + }, + { + "epoch": 0.27635441686682677, + "grad_norm": 1.1449774555454721, + "learning_rate": 1.698266283289108e-05, + "loss": 1.6398, + "step": 2071 + }, + { + "epoch": 0.27648785695222844, + "grad_norm": 1.0564082538230344, + "learning_rate": 
1.6979568396635526e-05, + "loss": 1.5539, + "step": 2072 + }, + { + "epoch": 0.2766212970376301, + "grad_norm": 1.0646332200728448, + "learning_rate": 1.6976472656676058e-05, + "loss": 1.6822, + "step": 2073 + }, + { + "epoch": 0.2767547371230318, + "grad_norm": 1.2326207337266486, + "learning_rate": 1.697337561359093e-05, + "loss": 1.5862, + "step": 2074 + }, + { + "epoch": 0.2768881772084334, + "grad_norm": 1.2149498397188576, + "learning_rate": 1.6970277267958625e-05, + "loss": 1.6561, + "step": 2075 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 1.15730574634711, + "learning_rate": 1.6967177620357894e-05, + "loss": 1.6772, + "step": 2076 + }, + { + "epoch": 0.27715505737923674, + "grad_norm": 1.041366398730123, + "learning_rate": 1.6964076671367703e-05, + "loss": 1.6171, + "step": 2077 + }, + { + "epoch": 0.27728849746463835, + "grad_norm": 1.0099579426889422, + "learning_rate": 1.696097442156728e-05, + "loss": 1.6003, + "step": 2078 + }, + { + "epoch": 0.27742193755004, + "grad_norm": 1.1597495545248406, + "learning_rate": 1.6957870871536086e-05, + "loss": 1.629, + "step": 2079 + }, + { + "epoch": 0.2775553776354417, + "grad_norm": 1.2971905519568292, + "learning_rate": 1.6954766021853836e-05, + "loss": 1.6335, + "step": 2080 + }, + { + "epoch": 0.27768881772084336, + "grad_norm": 1.0081447437285673, + "learning_rate": 1.6951659873100474e-05, + "loss": 1.661, + "step": 2081 + }, + { + "epoch": 0.277822257806245, + "grad_norm": 1.155709221150846, + "learning_rate": 1.6948552425856197e-05, + "loss": 1.6418, + "step": 2082 + }, + { + "epoch": 0.27795569789164665, + "grad_norm": 1.3578902806881925, + "learning_rate": 1.694544368070144e-05, + "loss": 1.7022, + "step": 2083 + }, + { + "epoch": 0.2780891379770483, + "grad_norm": 1.0473984071647915, + "learning_rate": 1.694233363821688e-05, + "loss": 1.6471, + "step": 2084 + }, + { + "epoch": 0.27822257806244993, + "grad_norm": 1.0998361204044, + "learning_rate": 1.6939222298983432e-05, + "loss": 1.6853, + "step": 2085 + }, + { + "epoch": 0.2783560181478516, + "grad_norm": 1.0597277511694383, + "learning_rate": 1.693610966358227e-05, + "loss": 1.6518, + "step": 2086 + }, + { + "epoch": 0.2784894582332533, + "grad_norm": 1.0956187785249976, + "learning_rate": 1.693299573259479e-05, + "loss": 1.6657, + "step": 2087 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 1.136040109917539, + "learning_rate": 1.6929880506602644e-05, + "loss": 1.6538, + "step": 2088 + }, + { + "epoch": 0.27875633840405656, + "grad_norm": 1.13032282747865, + "learning_rate": 1.692676398618772e-05, + "loss": 1.6329, + "step": 2089 + }, + { + "epoch": 0.27888977848945823, + "grad_norm": 1.297996071275606, + "learning_rate": 1.6923646171932148e-05, + "loss": 1.6816, + "step": 2090 + }, + { + "epoch": 0.2790232185748599, + "grad_norm": 1.036055755996991, + "learning_rate": 1.6920527064418298e-05, + "loss": 1.6716, + "step": 2091 + }, + { + "epoch": 0.27915665866026157, + "grad_norm": 1.1008523939624144, + "learning_rate": 1.6917406664228785e-05, + "loss": 1.6202, + "step": 2092 + }, + { + "epoch": 0.2792900987456632, + "grad_norm": 1.1056127215942615, + "learning_rate": 1.6914284971946466e-05, + "loss": 1.6673, + "step": 2093 + }, + { + "epoch": 0.27942353883106485, + "grad_norm": 1.3280112976952287, + "learning_rate": 1.6911161988154435e-05, + "loss": 1.6588, + "step": 2094 + }, + { + "epoch": 0.2795569789164665, + "grad_norm": 1.2052642948323982, + "learning_rate": 1.6908037713436037e-05, + "loss": 1.6314, + "step": 2095 + }, + { + "epoch": 
0.27969041900186814, + "grad_norm": 0.9865645913607, + "learning_rate": 1.6904912148374837e-05, + "loss": 1.6214, + "step": 2096 + }, + { + "epoch": 0.2798238590872698, + "grad_norm": 1.059342517625569, + "learning_rate": 1.6901785293554667e-05, + "loss": 1.6119, + "step": 2097 + }, + { + "epoch": 0.2799572991726715, + "grad_norm": 1.0202929873543987, + "learning_rate": 1.6898657149559585e-05, + "loss": 1.681, + "step": 2098 + }, + { + "epoch": 0.28009073925807315, + "grad_norm": 1.0885504607819505, + "learning_rate": 1.689552771697389e-05, + "loss": 1.641, + "step": 2099 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 1.2387895966845563, + "learning_rate": 1.6892396996382125e-05, + "loss": 1.6229, + "step": 2100 + }, + { + "epoch": 0.28035761942887644, + "grad_norm": 1.091745358204505, + "learning_rate": 1.6889264988369074e-05, + "loss": 1.5917, + "step": 2101 + }, + { + "epoch": 0.2804910595142781, + "grad_norm": 1.003916358618551, + "learning_rate": 1.688613169351976e-05, + "loss": 1.6095, + "step": 2102 + }, + { + "epoch": 0.2806244995996797, + "grad_norm": 0.9972302973585249, + "learning_rate": 1.6882997112419452e-05, + "loss": 1.6731, + "step": 2103 + }, + { + "epoch": 0.2807579396850814, + "grad_norm": 1.0210763608341367, + "learning_rate": 1.6879861245653647e-05, + "loss": 1.7185, + "step": 2104 + }, + { + "epoch": 0.28089137977048306, + "grad_norm": 1.0637023781709813, + "learning_rate": 1.687672409380809e-05, + "loss": 1.6653, + "step": 2105 + }, + { + "epoch": 0.28102481985588473, + "grad_norm": 1.0336190974023656, + "learning_rate": 1.687358565746877e-05, + "loss": 1.6459, + "step": 2106 + }, + { + "epoch": 0.28115825994128635, + "grad_norm": 1.0180661487350322, + "learning_rate": 1.68704459372219e-05, + "loss": 1.6258, + "step": 2107 + }, + { + "epoch": 0.281291700026688, + "grad_norm": 1.1141096916948245, + "learning_rate": 1.686730493365396e-05, + "loss": 1.658, + "step": 2108 + }, + { + "epoch": 0.2814251401120897, + "grad_norm": 3.197208541082059, + "learning_rate": 1.686416264735164e-05, + "loss": 1.818, + "step": 2109 + }, + { + "epoch": 0.2815585801974913, + "grad_norm": 1.221770226887997, + "learning_rate": 1.6861019078901894e-05, + "loss": 1.558, + "step": 2110 + }, + { + "epoch": 0.281692020282893, + "grad_norm": 1.0374977596663437, + "learning_rate": 1.6857874228891896e-05, + "loss": 1.6517, + "step": 2111 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 1.0335222022664212, + "learning_rate": 1.6854728097909073e-05, + "loss": 1.6713, + "step": 2112 + }, + { + "epoch": 0.2819589004536963, + "grad_norm": 1.3195530821608707, + "learning_rate": 1.6851580686541087e-05, + "loss": 1.7385, + "step": 2113 + }, + { + "epoch": 0.28209234053909793, + "grad_norm": 1.195240974112386, + "learning_rate": 1.6848431995375834e-05, + "loss": 1.6521, + "step": 2114 + }, + { + "epoch": 0.2822257806244996, + "grad_norm": 1.0436378450311068, + "learning_rate": 1.684528202500146e-05, + "loss": 1.6722, + "step": 2115 + }, + { + "epoch": 0.28235922070990127, + "grad_norm": 1.02333315302456, + "learning_rate": 1.6842130776006332e-05, + "loss": 1.6158, + "step": 2116 + }, + { + "epoch": 0.2824926607953029, + "grad_norm": 1.107427692009098, + "learning_rate": 1.6838978248979083e-05, + "loss": 1.5904, + "step": 2117 + }, + { + "epoch": 0.28262610088070456, + "grad_norm": 1.1045697230959286, + "learning_rate": 1.683582444450856e-05, + "loss": 1.6672, + "step": 2118 + }, + { + "epoch": 0.2827595409661062, + "grad_norm": 1.0660148829983587, + "learning_rate": 1.6832669363183863e-05, + 
"loss": 1.5848, + "step": 2119 + }, + { + "epoch": 0.2828929810515079, + "grad_norm": 0.997746177166706, + "learning_rate": 1.6829513005594318e-05, + "loss": 1.6168, + "step": 2120 + }, + { + "epoch": 0.2830264211369095, + "grad_norm": 1.0851170598026787, + "learning_rate": 1.6826355372329502e-05, + "loss": 1.6698, + "step": 2121 + }, + { + "epoch": 0.2831598612223112, + "grad_norm": 1.157100041505963, + "learning_rate": 1.682319646397922e-05, + "loss": 1.6833, + "step": 2122 + }, + { + "epoch": 0.28329330130771285, + "grad_norm": 1.0494927317075036, + "learning_rate": 1.682003628113353e-05, + "loss": 1.6263, + "step": 2123 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 1.0386913852887472, + "learning_rate": 1.6816874824382704e-05, + "loss": 1.6658, + "step": 2124 + }, + { + "epoch": 0.28356018147851614, + "grad_norm": 0.9903946170282486, + "learning_rate": 1.6813712094317282e-05, + "loss": 1.5875, + "step": 2125 + }, + { + "epoch": 0.2836936215639178, + "grad_norm": 1.0742398505956183, + "learning_rate": 1.681054809152801e-05, + "loss": 1.713, + "step": 2126 + }, + { + "epoch": 0.2838270616493195, + "grad_norm": 1.0730348396285945, + "learning_rate": 1.6807382816605903e-05, + "loss": 1.6241, + "step": 2127 + }, + { + "epoch": 0.2839605017347211, + "grad_norm": 1.2030549884856265, + "learning_rate": 1.6804216270142183e-05, + "loss": 1.6708, + "step": 2128 + }, + { + "epoch": 0.28409394182012276, + "grad_norm": 1.0495436375707339, + "learning_rate": 1.6801048452728338e-05, + "loss": 1.6271, + "step": 2129 + }, + { + "epoch": 0.28422738190552443, + "grad_norm": 1.0625147413097187, + "learning_rate": 1.6797879364956075e-05, + "loss": 1.6423, + "step": 2130 + }, + { + "epoch": 0.28436082199092605, + "grad_norm": 1.1920231207118999, + "learning_rate": 1.679470900741734e-05, + "loss": 1.6252, + "step": 2131 + }, + { + "epoch": 0.2844942620763277, + "grad_norm": 1.0599528204014201, + "learning_rate": 1.6791537380704326e-05, + "loss": 1.6599, + "step": 2132 + }, + { + "epoch": 0.2846277021617294, + "grad_norm": 1.0231405998686767, + "learning_rate": 1.6788364485409454e-05, + "loss": 1.6155, + "step": 2133 + }, + { + "epoch": 0.28476114224713106, + "grad_norm": 0.9948628308935985, + "learning_rate": 1.6785190322125382e-05, + "loss": 1.6538, + "step": 2134 + }, + { + "epoch": 0.2848945823325327, + "grad_norm": 3.8142735734554662, + "learning_rate": 1.678201489144501e-05, + "loss": 1.6744, + "step": 2135 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 1.1704524649360155, + "learning_rate": 1.6778838193961475e-05, + "loss": 1.7045, + "step": 2136 + }, + { + "epoch": 0.285161462503336, + "grad_norm": 1.1934124423424233, + "learning_rate": 1.6775660230268146e-05, + "loss": 1.6685, + "step": 2137 + }, + { + "epoch": 0.28529490258873763, + "grad_norm": 1.0625696814614358, + "learning_rate": 1.6772481000958628e-05, + "loss": 1.6241, + "step": 2138 + }, + { + "epoch": 0.2854283426741393, + "grad_norm": 1.0424150836147343, + "learning_rate": 1.6769300506626766e-05, + "loss": 1.5597, + "step": 2139 + }, + { + "epoch": 0.28556178275954097, + "grad_norm": 1.3055292931541427, + "learning_rate": 1.676611874786664e-05, + "loss": 1.6378, + "step": 2140 + }, + { + "epoch": 0.28569522284494264, + "grad_norm": 1.207338132471859, + "learning_rate": 1.676293572527257e-05, + "loss": 1.6932, + "step": 2141 + }, + { + "epoch": 0.28582866293034426, + "grad_norm": 1.3576084701670912, + "learning_rate": 1.6759751439439105e-05, + "loss": 1.6737, + "step": 2142 + }, + { + "epoch": 0.2859621030157459, + 
"grad_norm": 1.1131581474723593, + "learning_rate": 1.675656589096103e-05, + "loss": 1.6575, + "step": 2143 + }, + { + "epoch": 0.2860955431011476, + "grad_norm": 1.0404856911010416, + "learning_rate": 1.6753379080433375e-05, + "loss": 1.6199, + "step": 2144 + }, + { + "epoch": 0.2862289831865492, + "grad_norm": 1.0326352373682337, + "learning_rate": 1.6750191008451403e-05, + "loss": 1.6281, + "step": 2145 + }, + { + "epoch": 0.2863624232719509, + "grad_norm": 1.0618014692593303, + "learning_rate": 1.6747001675610596e-05, + "loss": 1.7052, + "step": 2146 + }, + { + "epoch": 0.28649586335735255, + "grad_norm": 1.156021303055593, + "learning_rate": 1.67438110825067e-05, + "loss": 1.6295, + "step": 2147 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 1.0687356689670264, + "learning_rate": 1.674061922973567e-05, + "loss": 1.6377, + "step": 2148 + }, + { + "epoch": 0.28676274352815584, + "grad_norm": 1.0422058937640384, + "learning_rate": 1.6737426117893716e-05, + "loss": 1.5742, + "step": 2149 + }, + { + "epoch": 0.2868961836135575, + "grad_norm": 1.0058798981165387, + "learning_rate": 1.673423174757727e-05, + "loss": 1.6808, + "step": 2150 + }, + { + "epoch": 0.2870296236989592, + "grad_norm": 1.3968858626075848, + "learning_rate": 1.6731036119383007e-05, + "loss": 1.6662, + "step": 2151 + }, + { + "epoch": 0.2871630637843608, + "grad_norm": 1.0783132009504053, + "learning_rate": 1.6727839233907833e-05, + "loss": 1.62, + "step": 2152 + }, + { + "epoch": 0.28729650386976247, + "grad_norm": 1.045602618496203, + "learning_rate": 1.672464109174889e-05, + "loss": 1.6085, + "step": 2153 + }, + { + "epoch": 0.28742994395516414, + "grad_norm": 1.0204746262045936, + "learning_rate": 1.672144169350355e-05, + "loss": 1.682, + "step": 2154 + }, + { + "epoch": 0.2875633840405658, + "grad_norm": 0.9712644138016593, + "learning_rate": 1.671824103976943e-05, + "loss": 1.591, + "step": 2155 + }, + { + "epoch": 0.2876968241259674, + "grad_norm": 1.2273318949317467, + "learning_rate": 1.6715039131144375e-05, + "loss": 1.6639, + "step": 2156 + }, + { + "epoch": 0.2878302642113691, + "grad_norm": 1.241652981677857, + "learning_rate": 1.6711835968226463e-05, + "loss": 1.656, + "step": 2157 + }, + { + "epoch": 0.28796370429677076, + "grad_norm": 1.0646723228459751, + "learning_rate": 1.670863155161401e-05, + "loss": 1.6509, + "step": 2158 + }, + { + "epoch": 0.28809714438217243, + "grad_norm": 0.9640798880564916, + "learning_rate": 1.6705425881905564e-05, + "loss": 1.6043, + "step": 2159 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 1.0190920068560503, + "learning_rate": 1.6702218959699906e-05, + "loss": 1.6723, + "step": 2160 + }, + { + "epoch": 0.2883640245529757, + "grad_norm": 1.029425832894891, + "learning_rate": 1.6699010785596056e-05, + "loss": 1.6663, + "step": 2161 + }, + { + "epoch": 0.2884974646383774, + "grad_norm": 1.0069920792713327, + "learning_rate": 1.669580136019326e-05, + "loss": 1.6331, + "step": 2162 + }, + { + "epoch": 0.288630904723779, + "grad_norm": 1.1668045734582408, + "learning_rate": 1.6692590684091004e-05, + "loss": 1.6401, + "step": 2163 + }, + { + "epoch": 0.2887643448091807, + "grad_norm": 1.056267375662492, + "learning_rate": 1.6689378757889007e-05, + "loss": 1.6311, + "step": 2164 + }, + { + "epoch": 0.28889778489458234, + "grad_norm": 1.0648778315304113, + "learning_rate": 1.6686165582187223e-05, + "loss": 1.6636, + "step": 2165 + }, + { + "epoch": 0.289031224979984, + "grad_norm": 1.0124410015208933, + "learning_rate": 1.668295115758583e-05, + "loss": 1.6381, + 
"step": 2166 + }, + { + "epoch": 0.28916466506538563, + "grad_norm": 1.0022830955131297, + "learning_rate": 1.6679735484685247e-05, + "loss": 1.5945, + "step": 2167 + }, + { + "epoch": 0.2892981051507873, + "grad_norm": 12.676569029129164, + "learning_rate": 1.6676518564086133e-05, + "loss": 1.6591, + "step": 2168 + }, + { + "epoch": 0.28943154523618897, + "grad_norm": 1.1813552827187637, + "learning_rate": 1.667330039638936e-05, + "loss": 1.6071, + "step": 2169 + }, + { + "epoch": 0.2895649853215906, + "grad_norm": 1.059369062118976, + "learning_rate": 1.6670080982196055e-05, + "loss": 1.6634, + "step": 2170 + }, + { + "epoch": 0.28969842540699225, + "grad_norm": 12.044345487483524, + "learning_rate": 1.6666860322107563e-05, + "loss": 1.7567, + "step": 2171 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 1.1174156183857291, + "learning_rate": 1.6663638416725466e-05, + "loss": 1.6669, + "step": 2172 + }, + { + "epoch": 0.2899653055777956, + "grad_norm": 1.422061825117082, + "learning_rate": 1.6660415266651588e-05, + "loss": 1.6174, + "step": 2173 + }, + { + "epoch": 0.2900987456631972, + "grad_norm": 1.09465155740145, + "learning_rate": 1.6657190872487964e-05, + "loss": 1.6354, + "step": 2174 + }, + { + "epoch": 0.2902321857485989, + "grad_norm": 1.6546096916969328, + "learning_rate": 1.665396523483688e-05, + "loss": 1.5803, + "step": 2175 + }, + { + "epoch": 0.29036562583400055, + "grad_norm": 10.336323730521444, + "learning_rate": 1.6650738354300848e-05, + "loss": 1.6456, + "step": 2176 + }, + { + "epoch": 0.29049906591940217, + "grad_norm": 1.1978798944636024, + "learning_rate": 1.664751023148262e-05, + "loss": 1.6693, + "step": 2177 + }, + { + "epoch": 0.29063250600480384, + "grad_norm": 1.1345982910370151, + "learning_rate": 1.6644280866985155e-05, + "loss": 1.5768, + "step": 2178 + }, + { + "epoch": 0.2907659460902055, + "grad_norm": 1.3001746534071794, + "learning_rate": 1.6641050261411676e-05, + "loss": 1.6761, + "step": 2179 + }, + { + "epoch": 0.2908993861756072, + "grad_norm": 1.321671111629523, + "learning_rate": 1.6637818415365622e-05, + "loss": 1.6474, + "step": 2180 + }, + { + "epoch": 0.2910328262610088, + "grad_norm": 1.0570804882194134, + "learning_rate": 1.663458532945066e-05, + "loss": 1.6042, + "step": 2181 + }, + { + "epoch": 0.29116626634641046, + "grad_norm": 1.242601537641294, + "learning_rate": 1.66313510042707e-05, + "loss": 1.6398, + "step": 2182 + }, + { + "epoch": 0.29129970643181213, + "grad_norm": 1.3245077209972027, + "learning_rate": 1.6628115440429867e-05, + "loss": 1.643, + "step": 2183 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 1.0817940467899572, + "learning_rate": 1.662487863853254e-05, + "loss": 1.5973, + "step": 2184 + }, + { + "epoch": 0.2915665866026154, + "grad_norm": 1.0749074924656212, + "learning_rate": 1.6621640599183307e-05, + "loss": 1.6395, + "step": 2185 + }, + { + "epoch": 0.2917000266880171, + "grad_norm": 1.0699143617678875, + "learning_rate": 1.6618401322986998e-05, + "loss": 1.6294, + "step": 2186 + }, + { + "epoch": 0.29183346677341876, + "grad_norm": 1.2973422398546064, + "learning_rate": 1.6615160810548677e-05, + "loss": 1.6334, + "step": 2187 + }, + { + "epoch": 0.2919669068588204, + "grad_norm": 1.2752225836668032, + "learning_rate": 1.6611919062473635e-05, + "loss": 1.6568, + "step": 2188 + }, + { + "epoch": 0.29210034694422204, + "grad_norm": 1.0820454505520494, + "learning_rate": 1.660867607936739e-05, + "loss": 1.6779, + "step": 2189 + }, + { + "epoch": 0.2922337870296237, + "grad_norm": 
1.1086323014280555, + "learning_rate": 1.6605431861835695e-05, + "loss": 1.6372, + "step": 2190 + }, + { + "epoch": 0.29236722711502533, + "grad_norm": 1.1434647125560675, + "learning_rate": 1.6602186410484536e-05, + "loss": 1.5664, + "step": 2191 + }, + { + "epoch": 0.292500667200427, + "grad_norm": 1.6902321777443527, + "learning_rate": 1.6598939725920122e-05, + "loss": 1.6695, + "step": 2192 + }, + { + "epoch": 0.29263410728582867, + "grad_norm": 1.2716880193854676, + "learning_rate": 1.65956918087489e-05, + "loss": 1.7116, + "step": 2193 + }, + { + "epoch": 0.29276754737123034, + "grad_norm": 1.1161202972917799, + "learning_rate": 1.6592442659577545e-05, + "loss": 1.6309, + "step": 2194 + }, + { + "epoch": 0.29290098745663196, + "grad_norm": 1.0699166169760714, + "learning_rate": 1.6589192279012956e-05, + "loss": 1.6484, + "step": 2195 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 1.0222347107395644, + "learning_rate": 1.658594066766227e-05, + "loss": 1.6596, + "step": 2196 + }, + { + "epoch": 0.2931678676274353, + "grad_norm": 1.0807908428296922, + "learning_rate": 1.6582687826132854e-05, + "loss": 1.6678, + "step": 2197 + }, + { + "epoch": 0.2933013077128369, + "grad_norm": 1.081068785656472, + "learning_rate": 1.6579433755032297e-05, + "loss": 1.6052, + "step": 2198 + }, + { + "epoch": 0.2934347477982386, + "grad_norm": 1.0103124427107717, + "learning_rate": 1.6576178454968422e-05, + "loss": 1.6409, + "step": 2199 + }, + { + "epoch": 0.29356818788364025, + "grad_norm": 0.9811350388342566, + "learning_rate": 1.657292192654929e-05, + "loss": 1.5593, + "step": 2200 + }, + { + "epoch": 0.2937016279690419, + "grad_norm": 1.306039871025814, + "learning_rate": 1.656966417038317e-05, + "loss": 1.6302, + "step": 2201 + }, + { + "epoch": 0.29383506805444354, + "grad_norm": 1.021756773048609, + "learning_rate": 1.656640518707859e-05, + "loss": 1.6938, + "step": 2202 + }, + { + "epoch": 0.2939685081398452, + "grad_norm": 1.0718659276893743, + "learning_rate": 1.6563144977244277e-05, + "loss": 1.6301, + "step": 2203 + }, + { + "epoch": 0.2941019482252469, + "grad_norm": 1.2325944349438038, + "learning_rate": 1.6559883541489204e-05, + "loss": 1.6064, + "step": 2204 + }, + { + "epoch": 0.2942353883106485, + "grad_norm": 1.04367146349336, + "learning_rate": 1.6556620880422577e-05, + "loss": 1.6461, + "step": 2205 + }, + { + "epoch": 0.29436882839605016, + "grad_norm": 1.1064673095517306, + "learning_rate": 1.6553356994653818e-05, + "loss": 1.6265, + "step": 2206 + }, + { + "epoch": 0.29450226848145183, + "grad_norm": 1.0133174183375973, + "learning_rate": 1.6550091884792586e-05, + "loss": 1.647, + "step": 2207 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 1.0556625341442718, + "learning_rate": 1.6546825551448766e-05, + "loss": 1.6508, + "step": 2208 + }, + { + "epoch": 0.2947691486522551, + "grad_norm": 1.0201247011199586, + "learning_rate": 1.6543557995232472e-05, + "loss": 1.6311, + "step": 2209 + }, + { + "epoch": 0.2949025887376568, + "grad_norm": 1.0341845337571378, + "learning_rate": 1.6540289216754042e-05, + "loss": 1.6522, + "step": 2210 + }, + { + "epoch": 0.29503602882305846, + "grad_norm": 1.0728331970850973, + "learning_rate": 1.6537019216624055e-05, + "loss": 1.6073, + "step": 2211 + }, + { + "epoch": 0.2951694689084601, + "grad_norm": 1.087051064593804, + "learning_rate": 1.65337479954533e-05, + "loss": 1.6518, + "step": 2212 + }, + { + "epoch": 0.29530290899386175, + "grad_norm": 1.1504157623012197, + "learning_rate": 1.653047555385281e-05, + "loss": 1.6489, + 
"step": 2213 + }, + { + "epoch": 0.2954363490792634, + "grad_norm": 1.6929487032356967, + "learning_rate": 1.652720189243384e-05, + "loss": 1.6564, + "step": 2214 + }, + { + "epoch": 0.2955697891646651, + "grad_norm": 9.547350526694943, + "learning_rate": 1.6523927011807873e-05, + "loss": 1.6765, + "step": 2215 + }, + { + "epoch": 0.2957032292500667, + "grad_norm": 1.166505303552692, + "learning_rate": 1.652065091258662e-05, + "loss": 1.6389, + "step": 2216 + }, + { + "epoch": 0.29583666933546837, + "grad_norm": 1.1611750428510519, + "learning_rate": 1.651737359538201e-05, + "loss": 1.6382, + "step": 2217 + }, + { + "epoch": 0.29597010942087004, + "grad_norm": 1.161944383190572, + "learning_rate": 1.651409506080622e-05, + "loss": 1.6458, + "step": 2218 + }, + { + "epoch": 0.29610354950627166, + "grad_norm": 1.0328377880475517, + "learning_rate": 1.6510815309471638e-05, + "loss": 1.6698, + "step": 2219 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 1.2670883726284181, + "learning_rate": 1.6507534341990884e-05, + "loss": 1.6669, + "step": 2220 + }, + { + "epoch": 0.296370429677075, + "grad_norm": 1.2369384329163466, + "learning_rate": 1.6504252158976804e-05, + "loss": 1.6523, + "step": 2221 + }, + { + "epoch": 0.29650386976247667, + "grad_norm": 1.0226119191785725, + "learning_rate": 1.6500968761042477e-05, + "loss": 1.6438, + "step": 2222 + }, + { + "epoch": 0.2966373098478783, + "grad_norm": 0.9968700209514012, + "learning_rate": 1.6497684148801204e-05, + "loss": 1.7007, + "step": 2223 + }, + { + "epoch": 0.29677074993327995, + "grad_norm": 1.036350320781629, + "learning_rate": 1.6494398322866503e-05, + "loss": 1.6678, + "step": 2224 + }, + { + "epoch": 0.2969041900186816, + "grad_norm": 0.9847140373170601, + "learning_rate": 1.6491111283852147e-05, + "loss": 1.6433, + "step": 2225 + }, + { + "epoch": 0.29703763010408324, + "grad_norm": 0.9415756130495302, + "learning_rate": 1.64878230323721e-05, + "loss": 1.6222, + "step": 2226 + }, + { + "epoch": 0.2971710701894849, + "grad_norm": 1.4397026580916885, + "learning_rate": 1.648453356904058e-05, + "loss": 1.777, + "step": 2227 + }, + { + "epoch": 0.2973045102748866, + "grad_norm": 1.1618462036671973, + "learning_rate": 1.6481242894472016e-05, + "loss": 1.6368, + "step": 2228 + }, + { + "epoch": 0.29743795036028825, + "grad_norm": 1.0250435345601665, + "learning_rate": 1.6477951009281072e-05, + "loss": 1.6461, + "step": 2229 + }, + { + "epoch": 0.29757139044568987, + "grad_norm": 0.9923678067336387, + "learning_rate": 1.6474657914082638e-05, + "loss": 1.6785, + "step": 2230 + }, + { + "epoch": 0.29770483053109154, + "grad_norm": 1.0267804282384663, + "learning_rate": 1.647136360949182e-05, + "loss": 1.6343, + "step": 2231 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 1.023674327352044, + "learning_rate": 1.6468068096123957e-05, + "loss": 1.5966, + "step": 2232 + }, + { + "epoch": 0.2979717107018949, + "grad_norm": 1.1944718768952494, + "learning_rate": 1.6464771374594615e-05, + "loss": 1.6535, + "step": 2233 + }, + { + "epoch": 0.2981051507872965, + "grad_norm": 1.152852164366958, + "learning_rate": 1.6461473445519585e-05, + "loss": 1.6781, + "step": 2234 + }, + { + "epoch": 0.29823859087269816, + "grad_norm": 1.0678201700556582, + "learning_rate": 1.6458174309514882e-05, + "loss": 1.6313, + "step": 2235 + }, + { + "epoch": 0.29837203095809983, + "grad_norm": 1.0355313650728948, + "learning_rate": 1.6454873967196748e-05, + "loss": 1.6683, + "step": 2236 + }, + { + "epoch": 0.29850547104350145, + "grad_norm": 
1.0884233154304546, + "learning_rate": 1.645157241918165e-05, + "loss": 1.5859, + "step": 2237 + }, + { + "epoch": 0.2986389111289031, + "grad_norm": 1.0472190401927721, + "learning_rate": 1.6448269666086278e-05, + "loss": 1.6596, + "step": 2238 + }, + { + "epoch": 0.2987723512143048, + "grad_norm": 1.0311680152934846, + "learning_rate": 1.6444965708527546e-05, + "loss": 1.6525, + "step": 2239 + }, + { + "epoch": 0.29890579129970646, + "grad_norm": 2.10561403694868, + "learning_rate": 1.64416605471226e-05, + "loss": 1.6469, + "step": 2240 + }, + { + "epoch": 0.2990392313851081, + "grad_norm": 1.0497996204276114, + "learning_rate": 1.6438354182488802e-05, + "loss": 1.607, + "step": 2241 + }, + { + "epoch": 0.29917267147050974, + "grad_norm": 1.043507533207965, + "learning_rate": 1.6435046615243747e-05, + "loss": 1.6621, + "step": 2242 + }, + { + "epoch": 0.2993061115559114, + "grad_norm": 1.042723485887548, + "learning_rate": 1.6431737846005254e-05, + "loss": 1.6912, + "step": 2243 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 1.0051152191792925, + "learning_rate": 1.6428427875391353e-05, + "loss": 1.6349, + "step": 2244 + }, + { + "epoch": 0.2995729917267147, + "grad_norm": 1.128346274800114, + "learning_rate": 1.642511670402032e-05, + "loss": 1.7053, + "step": 2245 + }, + { + "epoch": 0.29970643181211637, + "grad_norm": 1.0722777257603044, + "learning_rate": 1.642180433251064e-05, + "loss": 1.6878, + "step": 2246 + }, + { + "epoch": 0.29983987189751804, + "grad_norm": 1.017261398089863, + "learning_rate": 1.6418490761481022e-05, + "loss": 1.6521, + "step": 2247 + }, + { + "epoch": 0.29997331198291965, + "grad_norm": 1.0292260761586245, + "learning_rate": 1.6415175991550412e-05, + "loss": 1.6755, + "step": 2248 + }, + { + "epoch": 0.3001067520683213, + "grad_norm": 1.0931207786222135, + "learning_rate": 1.6411860023337962e-05, + "loss": 1.6749, + "step": 2249 + }, + { + "epoch": 0.300240192153723, + "grad_norm": 1.0831524536934993, + "learning_rate": 1.6408542857463062e-05, + "loss": 1.587, + "step": 2250 + }, + { + "epoch": 0.3003736322391246, + "grad_norm": 1.0454308838172524, + "learning_rate": 1.6405224494545322e-05, + "loss": 1.6651, + "step": 2251 + }, + { + "epoch": 0.3005070723245263, + "grad_norm": 1.0806258557902406, + "learning_rate": 1.640190493520457e-05, + "loss": 1.6699, + "step": 2252 + }, + { + "epoch": 0.30064051240992795, + "grad_norm": 12.400419377894314, + "learning_rate": 1.6398584180060867e-05, + "loss": 1.6377, + "step": 2253 + }, + { + "epoch": 0.3007739524953296, + "grad_norm": 1.2948113785574655, + "learning_rate": 1.6395262229734486e-05, + "loss": 1.607, + "step": 2254 + }, + { + "epoch": 0.30090739258073124, + "grad_norm": 1.198607665528997, + "learning_rate": 1.639193908484594e-05, + "loss": 1.7028, + "step": 2255 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 1.1317395785557423, + "learning_rate": 1.6388614746015946e-05, + "loss": 1.6525, + "step": 2256 + }, + { + "epoch": 0.3011742727515346, + "grad_norm": 1.1476872660243271, + "learning_rate": 1.6385289213865452e-05, + "loss": 1.64, + "step": 2257 + }, + { + "epoch": 0.3013077128369362, + "grad_norm": 1.480875813813185, + "learning_rate": 1.6381962489015633e-05, + "loss": 1.6505, + "step": 2258 + }, + { + "epoch": 0.30144115292233786, + "grad_norm": 1.0975918871872568, + "learning_rate": 1.637863457208788e-05, + "loss": 1.6105, + "step": 2259 + }, + { + "epoch": 0.30157459300773953, + "grad_norm": 1.147793532979415, + "learning_rate": 1.6375305463703816e-05, + "loss": 1.6267, + "step": 
2260 + }, + { + "epoch": 0.3017080330931412, + "grad_norm": 1.2879635429689233, + "learning_rate": 1.6371975164485277e-05, + "loss": 1.7015, + "step": 2261 + }, + { + "epoch": 0.3018414731785428, + "grad_norm": 1.3283545115204671, + "learning_rate": 1.636864367505432e-05, + "loss": 1.6196, + "step": 2262 + }, + { + "epoch": 0.3019749132639445, + "grad_norm": 1.0479344997197932, + "learning_rate": 1.636531099603324e-05, + "loss": 1.6243, + "step": 2263 + }, + { + "epoch": 0.30210835334934616, + "grad_norm": 1.0765762306122926, + "learning_rate": 1.6361977128044535e-05, + "loss": 1.6196, + "step": 2264 + }, + { + "epoch": 0.3022417934347478, + "grad_norm": 1.0309118980744778, + "learning_rate": 1.6358642071710935e-05, + "loss": 1.6135, + "step": 2265 + }, + { + "epoch": 0.30237523352014944, + "grad_norm": 0.9953925772810231, + "learning_rate": 1.6355305827655398e-05, + "loss": 1.6429, + "step": 2266 + }, + { + "epoch": 0.3025086736055511, + "grad_norm": 1.2130624721172607, + "learning_rate": 1.6351968396501084e-05, + "loss": 1.6343, + "step": 2267 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 1.0691255501843615, + "learning_rate": 1.6348629778871393e-05, + "loss": 1.6658, + "step": 2268 + }, + { + "epoch": 0.3027755537763544, + "grad_norm": 1.0154122944841089, + "learning_rate": 1.6345289975389946e-05, + "loss": 1.6048, + "step": 2269 + }, + { + "epoch": 0.30290899386175607, + "grad_norm": 1.0303963960703182, + "learning_rate": 1.6341948986680574e-05, + "loss": 1.5902, + "step": 2270 + }, + { + "epoch": 0.30304243394715774, + "grad_norm": 1.103571288321129, + "learning_rate": 1.6338606813367334e-05, + "loss": 1.6284, + "step": 2271 + }, + { + "epoch": 0.30317587403255936, + "grad_norm": 1.0340663960820138, + "learning_rate": 1.633526345607451e-05, + "loss": 1.6625, + "step": 2272 + }, + { + "epoch": 0.303309314117961, + "grad_norm": 1.0868515192551496, + "learning_rate": 1.6331918915426607e-05, + "loss": 1.6508, + "step": 2273 + }, + { + "epoch": 0.3034427542033627, + "grad_norm": 1.0806584970772868, + "learning_rate": 1.6328573192048337e-05, + "loss": 1.6793, + "step": 2274 + }, + { + "epoch": 0.30357619428876437, + "grad_norm": 1.1131107600277885, + "learning_rate": 1.632522628656465e-05, + "loss": 1.6524, + "step": 2275 + }, + { + "epoch": 0.303709634374166, + "grad_norm": 0.970695670852881, + "learning_rate": 1.6321878199600705e-05, + "loss": 1.6254, + "step": 2276 + }, + { + "epoch": 0.30384307445956765, + "grad_norm": 1.0449698885856158, + "learning_rate": 1.6318528931781893e-05, + "loss": 1.6488, + "step": 2277 + }, + { + "epoch": 0.3039765145449693, + "grad_norm": 2.3976140688143754, + "learning_rate": 1.6315178483733817e-05, + "loss": 1.6349, + "step": 2278 + }, + { + "epoch": 0.30410995463037094, + "grad_norm": 1.0721815977781701, + "learning_rate": 1.6311826856082298e-05, + "loss": 1.6444, + "step": 2279 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 1.0877886525914027, + "learning_rate": 1.6308474049453385e-05, + "loss": 1.6218, + "step": 2280 + }, + { + "epoch": 0.3043768348011743, + "grad_norm": 1.2215891275833313, + "learning_rate": 1.6305120064473348e-05, + "loss": 1.6729, + "step": 2281 + }, + { + "epoch": 0.30451027488657595, + "grad_norm": 1.0066658286438341, + "learning_rate": 1.6301764901768665e-05, + "loss": 1.6514, + "step": 2282 + }, + { + "epoch": 0.30464371497197756, + "grad_norm": 5.18175722987443, + "learning_rate": 1.629840856196605e-05, + "loss": 1.7849, + "step": 2283 + }, + { + "epoch": 0.30477715505737923, + "grad_norm": 1.0661836522987367, + 
"learning_rate": 1.6295051045692422e-05, + "loss": 1.6515, + "step": 2284 + }, + { + "epoch": 0.3049105951427809, + "grad_norm": 0.9948954364094378, + "learning_rate": 1.6291692353574934e-05, + "loss": 1.6653, + "step": 2285 + }, + { + "epoch": 0.3050440352281825, + "grad_norm": 1.1776280843724025, + "learning_rate": 1.6288332486240946e-05, + "loss": 1.6249, + "step": 2286 + }, + { + "epoch": 0.3051774753135842, + "grad_norm": 0.9803581714490958, + "learning_rate": 1.6284971444318046e-05, + "loss": 1.6381, + "step": 2287 + }, + { + "epoch": 0.30531091539898586, + "grad_norm": 1.0334824917780492, + "learning_rate": 1.628160922843404e-05, + "loss": 1.7033, + "step": 2288 + }, + { + "epoch": 0.30544435548438753, + "grad_norm": 1.1825464876579848, + "learning_rate": 1.6278245839216947e-05, + "loss": 1.6364, + "step": 2289 + }, + { + "epoch": 0.30557779556978915, + "grad_norm": 1.089932123868756, + "learning_rate": 1.627488127729501e-05, + "loss": 1.6358, + "step": 2290 + }, + { + "epoch": 0.3057112356551908, + "grad_norm": 1.0063881587721086, + "learning_rate": 1.6271515543296693e-05, + "loss": 1.6245, + "step": 2291 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 0.9557576711140595, + "learning_rate": 1.6268148637850675e-05, + "loss": 1.6148, + "step": 2292 + }, + { + "epoch": 0.3059781158259941, + "grad_norm": 1.155386941840756, + "learning_rate": 1.626478056158586e-05, + "loss": 1.6679, + "step": 2293 + }, + { + "epoch": 0.30611155591139577, + "grad_norm": 1.002724278928917, + "learning_rate": 1.626141131513136e-05, + "loss": 1.6717, + "step": 2294 + }, + { + "epoch": 0.30624499599679744, + "grad_norm": 1.0181937079803456, + "learning_rate": 1.6258040899116517e-05, + "loss": 1.6276, + "step": 2295 + }, + { + "epoch": 0.3063784360821991, + "grad_norm": 1.0190676733905637, + "learning_rate": 1.6254669314170887e-05, + "loss": 1.6428, + "step": 2296 + }, + { + "epoch": 0.3065118761676007, + "grad_norm": 1.0126611742120983, + "learning_rate": 1.625129656092424e-05, + "loss": 1.6943, + "step": 2297 + }, + { + "epoch": 0.3066453162530024, + "grad_norm": 0.9702747048248926, + "learning_rate": 1.624792264000657e-05, + "loss": 1.6091, + "step": 2298 + }, + { + "epoch": 0.30677875633840407, + "grad_norm": 1.0176969387216936, + "learning_rate": 1.624454755204808e-05, + "loss": 1.6432, + "step": 2299 + }, + { + "epoch": 0.30691219642380574, + "grad_norm": 0.9905054594963236, + "learning_rate": 1.624117129767921e-05, + "loss": 1.6908, + "step": 2300 + }, + { + "epoch": 0.30704563650920735, + "grad_norm": 1.0990745253836063, + "learning_rate": 1.62377938775306e-05, + "loss": 1.6545, + "step": 2301 + }, + { + "epoch": 0.307179076594609, + "grad_norm": 0.9872989722764705, + "learning_rate": 1.623441529223311e-05, + "loss": 1.6699, + "step": 2302 + }, + { + "epoch": 0.3073125166800107, + "grad_norm": 1.2705211460028154, + "learning_rate": 1.6231035542417826e-05, + "loss": 1.6582, + "step": 2303 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 1.0150840493548405, + "learning_rate": 1.6227654628716044e-05, + "loss": 1.665, + "step": 2304 + }, + { + "epoch": 0.307579396850814, + "grad_norm": 1.1379930735351889, + "learning_rate": 1.6224272551759288e-05, + "loss": 1.676, + "step": 2305 + }, + { + "epoch": 0.30771283693621565, + "grad_norm": 1.2444071746179561, + "learning_rate": 1.622088931217928e-05, + "loss": 1.6459, + "step": 2306 + }, + { + "epoch": 0.3078462770216173, + "grad_norm": 0.9976784854166414, + "learning_rate": 1.621750491060798e-05, + "loss": 1.6636, + "step": 2307 + }, + { + 
"epoch": 0.30797971710701894, + "grad_norm": 1.2682449189830896, + "learning_rate": 1.6214119347677548e-05, + "loss": 1.6616, + "step": 2308 + }, + { + "epoch": 0.3081131571924206, + "grad_norm": 1.0829225349432583, + "learning_rate": 1.621073262402037e-05, + "loss": 1.644, + "step": 2309 + }, + { + "epoch": 0.3082465972778223, + "grad_norm": 1.0838885803573566, + "learning_rate": 1.6207344740269056e-05, + "loss": 1.6494, + "step": 2310 + }, + { + "epoch": 0.3083800373632239, + "grad_norm": 1.0708462365866025, + "learning_rate": 1.6203955697056416e-05, + "loss": 1.6671, + "step": 2311 + }, + { + "epoch": 0.30851347744862556, + "grad_norm": 0.9823970370572715, + "learning_rate": 1.6200565495015487e-05, + "loss": 1.6396, + "step": 2312 + }, + { + "epoch": 0.30864691753402723, + "grad_norm": 1.0031602532602553, + "learning_rate": 1.6197174134779517e-05, + "loss": 1.6102, + "step": 2313 + }, + { + "epoch": 0.3087803576194289, + "grad_norm": 1.2206609039872867, + "learning_rate": 1.619378161698198e-05, + "loss": 1.648, + "step": 2314 + }, + { + "epoch": 0.3089137977048305, + "grad_norm": 1.3166269143643101, + "learning_rate": 1.6190387942256548e-05, + "loss": 1.6009, + "step": 2315 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 1.2675618240810085, + "learning_rate": 1.6186993111237134e-05, + "loss": 1.6224, + "step": 2316 + }, + { + "epoch": 0.30918067787563386, + "grad_norm": 1.0640732745150545, + "learning_rate": 1.618359712455785e-05, + "loss": 1.6662, + "step": 2317 + }, + { + "epoch": 0.3093141179610355, + "grad_norm": 1.019032967258582, + "learning_rate": 1.6180199982853026e-05, + "loss": 1.6237, + "step": 2318 + }, + { + "epoch": 0.30944755804643714, + "grad_norm": 1.0944634109792915, + "learning_rate": 1.6176801686757207e-05, + "loss": 1.6415, + "step": 2319 + }, + { + "epoch": 0.3095809981318388, + "grad_norm": 1.24072168667135, + "learning_rate": 1.6173402236905156e-05, + "loss": 1.5915, + "step": 2320 + }, + { + "epoch": 0.3097144382172405, + "grad_norm": 0.9746140716316298, + "learning_rate": 1.6170001633931857e-05, + "loss": 1.6443, + "step": 2321 + }, + { + "epoch": 0.3098478783026421, + "grad_norm": 1.1844321844657195, + "learning_rate": 1.6166599878472502e-05, + "loss": 1.6174, + "step": 2322 + }, + { + "epoch": 0.30998131838804377, + "grad_norm": 1.001876907977257, + "learning_rate": 1.61631969711625e-05, + "loss": 1.6162, + "step": 2323 + }, + { + "epoch": 0.31011475847344544, + "grad_norm": 0.9886662045991245, + "learning_rate": 1.6159792912637467e-05, + "loss": 1.6354, + "step": 2324 + }, + { + "epoch": 0.31024819855884705, + "grad_norm": 0.9775352505641491, + "learning_rate": 1.6156387703533258e-05, + "loss": 1.6204, + "step": 2325 + }, + { + "epoch": 0.3103816386442487, + "grad_norm": 0.9826413139650488, + "learning_rate": 1.6152981344485915e-05, + "loss": 1.6398, + "step": 2326 + }, + { + "epoch": 0.3105150787296504, + "grad_norm": 1.0049022279607043, + "learning_rate": 1.6149573836131713e-05, + "loss": 1.6421, + "step": 2327 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 1.151819306544648, + "learning_rate": 1.6146165179107135e-05, + "loss": 1.5993, + "step": 2328 + }, + { + "epoch": 0.3107819589004537, + "grad_norm": 1.144986255564853, + "learning_rate": 1.6142755374048876e-05, + "loss": 1.6405, + "step": 2329 + }, + { + "epoch": 0.31091539898585535, + "grad_norm": 1.0389802092801697, + "learning_rate": 1.613934442159385e-05, + "loss": 1.6334, + "step": 2330 + }, + { + "epoch": 0.311048839071257, + "grad_norm": 1.016003258094143, + "learning_rate": 
1.6135932322379188e-05, + "loss": 1.6236, + "step": 2331 + }, + { + "epoch": 0.31118227915665864, + "grad_norm": 0.966534466876772, + "learning_rate": 1.6132519077042224e-05, + "loss": 1.6443, + "step": 2332 + }, + { + "epoch": 0.3113157192420603, + "grad_norm": 1.0190490159617054, + "learning_rate": 1.6129104686220522e-05, + "loss": 1.6327, + "step": 2333 + }, + { + "epoch": 0.311449159327462, + "grad_norm": 0.9508676466180186, + "learning_rate": 1.6125689150551846e-05, + "loss": 1.5877, + "step": 2334 + }, + { + "epoch": 0.31158259941286365, + "grad_norm": 1.0322824165183992, + "learning_rate": 1.612227247067418e-05, + "loss": 1.6722, + "step": 2335 + }, + { + "epoch": 0.31171603949826526, + "grad_norm": 1.1605340980731977, + "learning_rate": 1.6118854647225722e-05, + "loss": 1.6116, + "step": 2336 + }, + { + "epoch": 0.31184947958366693, + "grad_norm": 1.0040963152787474, + "learning_rate": 1.611543568084488e-05, + "loss": 1.6418, + "step": 2337 + }, + { + "epoch": 0.3119829196690686, + "grad_norm": 1.0426733287211412, + "learning_rate": 1.6112015572170283e-05, + "loss": 1.6439, + "step": 2338 + }, + { + "epoch": 0.3121163597544702, + "grad_norm": 1.0528462917004136, + "learning_rate": 1.6108594321840763e-05, + "loss": 1.6341, + "step": 2339 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 1.005308410868297, + "learning_rate": 1.6105171930495373e-05, + "loss": 1.6207, + "step": 2340 + }, + { + "epoch": 0.31238323992527356, + "grad_norm": 1.0757636731132663, + "learning_rate": 1.6101748398773374e-05, + "loss": 1.651, + "step": 2341 + }, + { + "epoch": 0.31251668001067523, + "grad_norm": 1.157330419763036, + "learning_rate": 1.609832372731425e-05, + "loss": 1.5569, + "step": 2342 + }, + { + "epoch": 0.31265012009607684, + "grad_norm": 1.1655530310462219, + "learning_rate": 1.609489791675768e-05, + "loss": 1.622, + "step": 2343 + }, + { + "epoch": 0.3127835601814785, + "grad_norm": 1.0163847934254429, + "learning_rate": 1.6091470967743577e-05, + "loss": 1.6283, + "step": 2344 + }, + { + "epoch": 0.3129170002668802, + "grad_norm": 0.9913652557421357, + "learning_rate": 1.608804288091205e-05, + "loss": 1.6029, + "step": 2345 + }, + { + "epoch": 0.3130504403522818, + "grad_norm": 1.0250978527017118, + "learning_rate": 1.6084613656903426e-05, + "loss": 1.6701, + "step": 2346 + }, + { + "epoch": 0.31318388043768347, + "grad_norm": 1.0036453857759162, + "learning_rate": 1.6081183296358246e-05, + "loss": 1.6656, + "step": 2347 + }, + { + "epoch": 0.31331732052308514, + "grad_norm": 1.0093725549829087, + "learning_rate": 1.607775179991726e-05, + "loss": 1.62, + "step": 2348 + }, + { + "epoch": 0.3134507606084868, + "grad_norm": 1.0549793980242639, + "learning_rate": 1.6074319168221446e-05, + "loss": 1.6474, + "step": 2349 + }, + { + "epoch": 0.3135842006938884, + "grad_norm": 1.0014461970290505, + "learning_rate": 1.6070885401911964e-05, + "loss": 1.6402, + "step": 2350 + }, + { + "epoch": 0.3137176407792901, + "grad_norm": 1.2302428695760383, + "learning_rate": 1.606745050163021e-05, + "loss": 1.6615, + "step": 2351 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 1.278062546442448, + "learning_rate": 1.6064014468017778e-05, + "loss": 1.6443, + "step": 2352 + }, + { + "epoch": 0.3139845209500934, + "grad_norm": 1.014494022872304, + "learning_rate": 1.606057730171649e-05, + "loss": 1.6771, + "step": 2353 + }, + { + "epoch": 0.31411796103549505, + "grad_norm": 0.9992975913870881, + "learning_rate": 1.6057139003368365e-05, + "loss": 1.6673, + "step": 2354 + }, + { + "epoch": 
0.3142514011208967, + "grad_norm": 1.0264389162628824, + "learning_rate": 1.6053699573615634e-05, + "loss": 1.645, + "step": 2355 + }, + { + "epoch": 0.3143848412062984, + "grad_norm": 1.2998311967872789, + "learning_rate": 1.605025901310075e-05, + "loss": 1.653, + "step": 2356 + }, + { + "epoch": 0.3145182812917, + "grad_norm": 1.1129013572248108, + "learning_rate": 1.604681732246636e-05, + "loss": 1.5694, + "step": 2357 + }, + { + "epoch": 0.3146517213771017, + "grad_norm": 0.998120908236157, + "learning_rate": 1.6043374502355348e-05, + "loss": 1.6577, + "step": 2358 + }, + { + "epoch": 0.31478516146250335, + "grad_norm": 1.1224918644835182, + "learning_rate": 1.603993055341078e-05, + "loss": 1.6469, + "step": 2359 + }, + { + "epoch": 0.31491860154790496, + "grad_norm": 1.0269545547781742, + "learning_rate": 1.6036485476275957e-05, + "loss": 1.6068, + "step": 2360 + }, + { + "epoch": 0.31505204163330663, + "grad_norm": 1.136514585808053, + "learning_rate": 1.603303927159437e-05, + "loss": 1.6382, + "step": 2361 + }, + { + "epoch": 0.3151854817187083, + "grad_norm": 1.2494184496056624, + "learning_rate": 1.602959194000974e-05, + "loss": 1.7152, + "step": 2362 + }, + { + "epoch": 0.31531892180411, + "grad_norm": 1.0140755418750587, + "learning_rate": 1.602614348216598e-05, + "loss": 1.591, + "step": 2363 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 1.0007366397635697, + "learning_rate": 1.602269389870723e-05, + "loss": 1.6349, + "step": 2364 + }, + { + "epoch": 0.31558580197491326, + "grad_norm": 1.0379624969316805, + "learning_rate": 1.6019243190277832e-05, + "loss": 1.583, + "step": 2365 + }, + { + "epoch": 0.31571924206031493, + "grad_norm": 1.122118459336519, + "learning_rate": 1.6015791357522336e-05, + "loss": 1.5789, + "step": 2366 + }, + { + "epoch": 0.31585268214571655, + "grad_norm": 1.218518313111331, + "learning_rate": 1.601233840108551e-05, + "loss": 1.6483, + "step": 2367 + }, + { + "epoch": 0.3159861222311182, + "grad_norm": 1.0325320279857855, + "learning_rate": 1.600888432161232e-05, + "loss": 1.6786, + "step": 2368 + }, + { + "epoch": 0.3161195623165199, + "grad_norm": 1.0105488940747966, + "learning_rate": 1.600542911974796e-05, + "loss": 1.6214, + "step": 2369 + }, + { + "epoch": 0.31625300240192156, + "grad_norm": 1.0174955025149985, + "learning_rate": 1.6001972796137804e-05, + "loss": 1.6188, + "step": 2370 + }, + { + "epoch": 0.31638644248732317, + "grad_norm": 0.9686366099778118, + "learning_rate": 1.5998515351427472e-05, + "loss": 1.6052, + "step": 2371 + }, + { + "epoch": 0.31651988257272484, + "grad_norm": 1.0943501073343973, + "learning_rate": 1.5995056786262763e-05, + "loss": 1.6652, + "step": 2372 + }, + { + "epoch": 0.3166533226581265, + "grad_norm": 1.1056669145759417, + "learning_rate": 1.59915971012897e-05, + "loss": 1.5992, + "step": 2373 + }, + { + "epoch": 0.3167867627435282, + "grad_norm": 1.0153669225410895, + "learning_rate": 1.5988136297154525e-05, + "loss": 1.6659, + "step": 2374 + }, + { + "epoch": 0.3169202028289298, + "grad_norm": 0.9485851246206498, + "learning_rate": 1.5984674374503663e-05, + "loss": 1.638, + "step": 2375 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 1.2485290662581918, + "learning_rate": 1.5981211333983765e-05, + "loss": 1.6167, + "step": 2376 + }, + { + "epoch": 0.31718708299973314, + "grad_norm": 1.0025032236221025, + "learning_rate": 1.5977747176241688e-05, + "loss": 1.6485, + "step": 2377 + }, + { + "epoch": 0.31732052308513475, + "grad_norm": 1.027037150167266, + "learning_rate": 1.59742819019245e-05, 
+ "loss": 1.635, + "step": 2378 + }, + { + "epoch": 0.3174539631705364, + "grad_norm": 1.0390794115599424, + "learning_rate": 1.5970815511679472e-05, + "loss": 1.6242, + "step": 2379 + }, + { + "epoch": 0.3175874032559381, + "grad_norm": 1.178314062565435, + "learning_rate": 1.5967348006154084e-05, + "loss": 1.6109, + "step": 2380 + }, + { + "epoch": 0.31772084334133976, + "grad_norm": 1.1670387061276462, + "learning_rate": 1.5963879385996032e-05, + "loss": 1.6196, + "step": 2381 + }, + { + "epoch": 0.3178542834267414, + "grad_norm": 1.0774835661150515, + "learning_rate": 1.596040965185321e-05, + "loss": 1.6497, + "step": 2382 + }, + { + "epoch": 0.31798772351214305, + "grad_norm": 1.1109464505151274, + "learning_rate": 1.5956938804373726e-05, + "loss": 1.6395, + "step": 2383 + }, + { + "epoch": 0.3181211635975447, + "grad_norm": 1.0759976671368048, + "learning_rate": 1.59534668442059e-05, + "loss": 1.6022, + "step": 2384 + }, + { + "epoch": 0.31825460368294634, + "grad_norm": 1.0363642572438576, + "learning_rate": 1.5949993771998248e-05, + "loss": 1.6947, + "step": 2385 + }, + { + "epoch": 0.318388043768348, + "grad_norm": 1.1364653119855215, + "learning_rate": 1.59465195883995e-05, + "loss": 1.6464, + "step": 2386 + }, + { + "epoch": 0.3185214838537497, + "grad_norm": 1.1363035834435125, + "learning_rate": 1.59430442940586e-05, + "loss": 1.6516, + "step": 2387 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 1.0660321057250608, + "learning_rate": 1.5939567889624693e-05, + "loss": 1.6402, + "step": 2388 + }, + { + "epoch": 0.31878836402455296, + "grad_norm": 1.0643000544064976, + "learning_rate": 1.5936090375747123e-05, + "loss": 1.6054, + "step": 2389 + }, + { + "epoch": 0.31892180410995463, + "grad_norm": 1.234549999729085, + "learning_rate": 1.593261175307546e-05, + "loss": 1.6499, + "step": 2390 + }, + { + "epoch": 0.3190552441953563, + "grad_norm": 1.1660252709285277, + "learning_rate": 1.592913202225946e-05, + "loss": 1.6534, + "step": 2391 + }, + { + "epoch": 0.3191886842807579, + "grad_norm": 1.0300136024667785, + "learning_rate": 1.592565118394911e-05, + "loss": 1.6732, + "step": 2392 + }, + { + "epoch": 0.3193221243661596, + "grad_norm": 1.0430395771115888, + "learning_rate": 1.592216923879458e-05, + "loss": 1.6707, + "step": 2393 + }, + { + "epoch": 0.31945556445156126, + "grad_norm": 1.056183541401937, + "learning_rate": 1.591868618744627e-05, + "loss": 1.6245, + "step": 2394 + }, + { + "epoch": 0.31958900453696293, + "grad_norm": 1.0460496279175218, + "learning_rate": 1.591520203055476e-05, + "loss": 1.6525, + "step": 2395 + }, + { + "epoch": 0.31972244462236454, + "grad_norm": 1.168050816657518, + "learning_rate": 1.5911716768770863e-05, + "loss": 1.692, + "step": 2396 + }, + { + "epoch": 0.3198558847077662, + "grad_norm": 1.014257361343356, + "learning_rate": 1.5908230402745576e-05, + "loss": 1.639, + "step": 2397 + }, + { + "epoch": 0.3199893247931679, + "grad_norm": 0.9882262303186528, + "learning_rate": 1.590474293313012e-05, + "loss": 1.6204, + "step": 2398 + }, + { + "epoch": 0.3201227648785695, + "grad_norm": 1.0647363395044251, + "learning_rate": 1.5901254360575912e-05, + "loss": 1.6276, + "step": 2399 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 1.205341934103895, + "learning_rate": 1.589776468573458e-05, + "loss": 1.6697, + "step": 2400 + }, + { + "epoch": 0.32038964504937284, + "grad_norm": 1.1796009355838823, + "learning_rate": 1.589427390925795e-05, + "loss": 1.6451, + "step": 2401 + }, + { + "epoch": 0.3205230851347745, + "grad_norm": 
1.238674085054047, + "learning_rate": 1.5890782031798065e-05, + "loss": 1.6161, + "step": 2402 + }, + { + "epoch": 0.3206565252201761, + "grad_norm": 1.0448493530439331, + "learning_rate": 1.5887289054007166e-05, + "loss": 1.6487, + "step": 2403 + }, + { + "epoch": 0.3207899653055778, + "grad_norm": 1.196197573510979, + "learning_rate": 1.5883794976537697e-05, + "loss": 1.6033, + "step": 2404 + }, + { + "epoch": 0.32092340539097947, + "grad_norm": 1.0194877794457862, + "learning_rate": 1.588029980004232e-05, + "loss": 1.6694, + "step": 2405 + }, + { + "epoch": 0.3210568454763811, + "grad_norm": 1.0318768025671652, + "learning_rate": 1.5876803525173883e-05, + "loss": 1.6659, + "step": 2406 + }, + { + "epoch": 0.32119028556178275, + "grad_norm": 0.9865820243660075, + "learning_rate": 1.5873306152585466e-05, + "loss": 1.6008, + "step": 2407 + }, + { + "epoch": 0.3213237256471844, + "grad_norm": 1.016542806768825, + "learning_rate": 1.5869807682930327e-05, + "loss": 1.5922, + "step": 2408 + }, + { + "epoch": 0.3214571657325861, + "grad_norm": 6.608911252971496, + "learning_rate": 1.5866308116861944e-05, + "loss": 1.6503, + "step": 2409 + }, + { + "epoch": 0.3215906058179877, + "grad_norm": 1.2676970243988273, + "learning_rate": 1.586280745503399e-05, + "loss": 1.6671, + "step": 2410 + }, + { + "epoch": 0.3217240459033894, + "grad_norm": 1.1773866986931305, + "learning_rate": 1.585930569810036e-05, + "loss": 1.6256, + "step": 2411 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 1.0205646117216987, + "learning_rate": 1.5855802846715134e-05, + "loss": 1.6661, + "step": 2412 + }, + { + "epoch": 0.32199092607419266, + "grad_norm": 1.1132522973470225, + "learning_rate": 1.5852298901532604e-05, + "loss": 1.615, + "step": 2413 + }, + { + "epoch": 0.32212436615959433, + "grad_norm": 1.0003679945512642, + "learning_rate": 1.5848793863207276e-05, + "loss": 1.6667, + "step": 2414 + }, + { + "epoch": 0.322257806244996, + "grad_norm": 1.0445124912690658, + "learning_rate": 1.5845287732393845e-05, + "loss": 1.6084, + "step": 2415 + }, + { + "epoch": 0.3223912463303977, + "grad_norm": 1.341548510183378, + "learning_rate": 1.5841780509747213e-05, + "loss": 1.5919, + "step": 2416 + }, + { + "epoch": 0.3225246864157993, + "grad_norm": 0.9783216242963252, + "learning_rate": 1.5838272195922492e-05, + "loss": 1.6474, + "step": 2417 + }, + { + "epoch": 0.32265812650120096, + "grad_norm": 1.0079878098801114, + "learning_rate": 1.5834762791575e-05, + "loss": 1.6694, + "step": 2418 + }, + { + "epoch": 0.32279156658660263, + "grad_norm": 0.9777003006073635, + "learning_rate": 1.5831252297360248e-05, + "loss": 1.6087, + "step": 2419 + }, + { + "epoch": 0.32292500667200424, + "grad_norm": 1.0210674211146504, + "learning_rate": 1.5827740713933958e-05, + "loss": 1.6225, + "step": 2420 + }, + { + "epoch": 0.3230584467574059, + "grad_norm": 0.9767096638361283, + "learning_rate": 1.5824228041952054e-05, + "loss": 1.6115, + "step": 2421 + }, + { + "epoch": 0.3231918868428076, + "grad_norm": 1.049229142974769, + "learning_rate": 1.5820714282070663e-05, + "loss": 1.586, + "step": 2422 + }, + { + "epoch": 0.32332532692820926, + "grad_norm": 0.9905778604640099, + "learning_rate": 1.5817199434946115e-05, + "loss": 1.5858, + "step": 2423 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 1.0227388841710876, + "learning_rate": 1.5813683501234946e-05, + "loss": 1.6233, + "step": 2424 + }, + { + "epoch": 0.32359220709901254, + "grad_norm": 1.0857008064049787, + "learning_rate": 1.581016648159389e-05, + "loss": 1.6423, + 
"step": 2425 + }, + { + "epoch": 0.3237256471844142, + "grad_norm": 1.1213206892388514, + "learning_rate": 1.5806648376679885e-05, + "loss": 1.6308, + "step": 2426 + }, + { + "epoch": 0.3238590872698158, + "grad_norm": 1.0371863646742951, + "learning_rate": 1.5803129187150074e-05, + "loss": 1.613, + "step": 2427 + }, + { + "epoch": 0.3239925273552175, + "grad_norm": 1.0285238928995435, + "learning_rate": 1.57996089136618e-05, + "loss": 1.632, + "step": 2428 + }, + { + "epoch": 0.32412596744061917, + "grad_norm": 1.0053527521421413, + "learning_rate": 1.5796087556872616e-05, + "loss": 1.6295, + "step": 2429 + }, + { + "epoch": 0.32425940752602084, + "grad_norm": 1.0593515172965136, + "learning_rate": 1.5792565117440267e-05, + "loss": 1.6714, + "step": 2430 + }, + { + "epoch": 0.32439284761142245, + "grad_norm": 0.9962509724536518, + "learning_rate": 1.5789041596022702e-05, + "loss": 1.6014, + "step": 2431 + }, + { + "epoch": 0.3245262876968241, + "grad_norm": 1.150755264452728, + "learning_rate": 1.5785516993278082e-05, + "loss": 1.568, + "step": 2432 + }, + { + "epoch": 0.3246597277822258, + "grad_norm": 1.1482871392650946, + "learning_rate": 1.578199130986476e-05, + "loss": 1.6891, + "step": 2433 + }, + { + "epoch": 0.3247931678676274, + "grad_norm": 1.0524081924283053, + "learning_rate": 1.5778464546441283e-05, + "loss": 1.6186, + "step": 2434 + }, + { + "epoch": 0.3249266079530291, + "grad_norm": 0.9963103718861039, + "learning_rate": 1.5774936703666423e-05, + "loss": 1.6579, + "step": 2435 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 1.034416639979626, + "learning_rate": 1.5771407782199137e-05, + "loss": 1.6743, + "step": 2436 + }, + { + "epoch": 0.3251934881238324, + "grad_norm": 1.0233115564518966, + "learning_rate": 1.5767877782698588e-05, + "loss": 1.6388, + "step": 2437 + }, + { + "epoch": 0.32532692820923403, + "grad_norm": 1.0203611371448205, + "learning_rate": 1.576434670582414e-05, + "loss": 1.6796, + "step": 2438 + }, + { + "epoch": 0.3254603682946357, + "grad_norm": 1.1316243969954793, + "learning_rate": 1.5760814552235353e-05, + "loss": 1.6287, + "step": 2439 + }, + { + "epoch": 0.3255938083800374, + "grad_norm": 2.773420421285832, + "learning_rate": 1.5757281322592e-05, + "loss": 1.7335, + "step": 2440 + }, + { + "epoch": 0.32572724846543905, + "grad_norm": 0.9752268956267421, + "learning_rate": 1.5753747017554043e-05, + "loss": 1.6287, + "step": 2441 + }, + { + "epoch": 0.32586068855084066, + "grad_norm": 1.1865730627498443, + "learning_rate": 1.5750211637781654e-05, + "loss": 1.6662, + "step": 2442 + }, + { + "epoch": 0.32599412863624233, + "grad_norm": 1.066883641739535, + "learning_rate": 1.57466751839352e-05, + "loss": 1.6201, + "step": 2443 + }, + { + "epoch": 0.326127568721644, + "grad_norm": 1.012400294015082, + "learning_rate": 1.5743137656675248e-05, + "loss": 1.6834, + "step": 2444 + }, + { + "epoch": 0.3262610088070456, + "grad_norm": 1.2919053431818395, + "learning_rate": 1.5739599056662572e-05, + "loss": 1.63, + "step": 2445 + }, + { + "epoch": 0.3263944488924473, + "grad_norm": 1.1008048203769303, + "learning_rate": 1.5736059384558136e-05, + "loss": 1.6535, + "step": 2446 + }, + { + "epoch": 0.32652788897784896, + "grad_norm": 1.0198928027043783, + "learning_rate": 1.573251864102312e-05, + "loss": 1.6043, + "step": 2447 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 1.255841958086851, + "learning_rate": 1.5728976826718887e-05, + "loss": 1.613, + "step": 2448 + }, + { + "epoch": 0.32679476914865224, + "grad_norm": 1.1079219569085927, + 
"learning_rate": 1.572543394230701e-05, + "loss": 1.5904, + "step": 2449 + }, + { + "epoch": 0.3269282092340539, + "grad_norm": 1.0252017128719715, + "learning_rate": 1.5721889988449258e-05, + "loss": 1.6363, + "step": 2450 + }, + { + "epoch": 0.3270616493194556, + "grad_norm": 1.051784185647374, + "learning_rate": 1.5718344965807602e-05, + "loss": 1.611, + "step": 2451 + }, + { + "epoch": 0.3271950894048572, + "grad_norm": 0.9834253613840619, + "learning_rate": 1.5714798875044214e-05, + "loss": 1.6072, + "step": 2452 + }, + { + "epoch": 0.32732852949025887, + "grad_norm": 1.0314033695212004, + "learning_rate": 1.5711251716821458e-05, + "loss": 1.6627, + "step": 2453 + }, + { + "epoch": 0.32746196957566054, + "grad_norm": 0.9745160928801103, + "learning_rate": 1.570770349180191e-05, + "loss": 1.6387, + "step": 2454 + }, + { + "epoch": 0.3275954096610622, + "grad_norm": 1.0004048445912241, + "learning_rate": 1.5704154200648328e-05, + "loss": 1.616, + "step": 2455 + }, + { + "epoch": 0.3277288497464638, + "grad_norm": 1.0107807331175838, + "learning_rate": 1.5700603844023695e-05, + "loss": 1.6356, + "step": 2456 + }, + { + "epoch": 0.3278622898318655, + "grad_norm": 0.9520529144617134, + "learning_rate": 1.5697052422591163e-05, + "loss": 1.5897, + "step": 2457 + }, + { + "epoch": 0.32799572991726716, + "grad_norm": 1.0058106372712747, + "learning_rate": 1.56934999370141e-05, + "loss": 1.6069, + "step": 2458 + }, + { + "epoch": 0.3281291700026688, + "grad_norm": 1.0663435525855554, + "learning_rate": 1.5689946387956075e-05, + "loss": 1.7066, + "step": 2459 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 0.9831844195678802, + "learning_rate": 1.5686391776080844e-05, + "loss": 1.5786, + "step": 2460 + }, + { + "epoch": 0.3283960501734721, + "grad_norm": 1.0642774090185225, + "learning_rate": 1.5682836102052376e-05, + "loss": 1.6452, + "step": 2461 + }, + { + "epoch": 0.3285294902588738, + "grad_norm": 1.037534479854548, + "learning_rate": 1.5679279366534822e-05, + "loss": 1.6082, + "step": 2462 + }, + { + "epoch": 0.3286629303442754, + "grad_norm": 1.173992093363848, + "learning_rate": 1.5675721570192545e-05, + "loss": 1.6464, + "step": 2463 + }, + { + "epoch": 0.3287963704296771, + "grad_norm": 1.0051707385512147, + "learning_rate": 1.5672162713690098e-05, + "loss": 1.6256, + "step": 2464 + }, + { + "epoch": 0.32892981051507875, + "grad_norm": 1.0045689191856597, + "learning_rate": 1.5668602797692238e-05, + "loss": 1.629, + "step": 2465 + }, + { + "epoch": 0.32906325060048036, + "grad_norm": 1.0240816684727387, + "learning_rate": 1.566504182286391e-05, + "loss": 1.6125, + "step": 2466 + }, + { + "epoch": 0.32919669068588203, + "grad_norm": 0.9893049585917324, + "learning_rate": 1.5661479789870277e-05, + "loss": 1.6334, + "step": 2467 + }, + { + "epoch": 0.3293301307712837, + "grad_norm": 1.0764320535215586, + "learning_rate": 1.5657916699376668e-05, + "loss": 1.6781, + "step": 2468 + }, + { + "epoch": 0.3294635708566854, + "grad_norm": 1.2439578576993158, + "learning_rate": 1.565435255204864e-05, + "loss": 1.6416, + "step": 2469 + }, + { + "epoch": 0.329597010942087, + "grad_norm": 1.006873672025213, + "learning_rate": 1.5650787348551934e-05, + "loss": 1.6524, + "step": 2470 + }, + { + "epoch": 0.32973045102748866, + "grad_norm": 1.0804959807305459, + "learning_rate": 1.564722108955249e-05, + "loss": 1.6437, + "step": 2471 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 1.1619045794374623, + "learning_rate": 1.564365377571644e-05, + "loss": 1.5983, + "step": 2472 + }, + { + 
"epoch": 0.32999733119829194, + "grad_norm": 1.1240948384895437, + "learning_rate": 1.5640085407710122e-05, + "loss": 1.6569, + "step": 2473 + }, + { + "epoch": 0.3301307712836936, + "grad_norm": 1.0470044546188255, + "learning_rate": 1.5636515986200064e-05, + "loss": 1.6384, + "step": 2474 + }, + { + "epoch": 0.3302642113690953, + "grad_norm": 0.9851348342236802, + "learning_rate": 1.563294551185299e-05, + "loss": 1.6211, + "step": 2475 + }, + { + "epoch": 0.33039765145449695, + "grad_norm": 1.0566689739549326, + "learning_rate": 1.5629373985335836e-05, + "loss": 1.6107, + "step": 2476 + }, + { + "epoch": 0.33053109153989857, + "grad_norm": 1.2307836242301233, + "learning_rate": 1.5625801407315706e-05, + "loss": 1.6671, + "step": 2477 + }, + { + "epoch": 0.33066453162530024, + "grad_norm": 1.1555471488035003, + "learning_rate": 1.562222777845993e-05, + "loss": 1.6066, + "step": 2478 + }, + { + "epoch": 0.3307979717107019, + "grad_norm": 1.132011537007146, + "learning_rate": 1.5618653099436015e-05, + "loss": 1.6749, + "step": 2479 + }, + { + "epoch": 0.3309314117961035, + "grad_norm": 1.0591256350110159, + "learning_rate": 1.561507737091167e-05, + "loss": 1.6399, + "step": 2480 + }, + { + "epoch": 0.3310648518815052, + "grad_norm": 1.006730799707782, + "learning_rate": 1.561150059355481e-05, + "loss": 1.641, + "step": 2481 + }, + { + "epoch": 0.33119829196690687, + "grad_norm": 1.114208556739606, + "learning_rate": 1.560792276803352e-05, + "loss": 1.645, + "step": 2482 + }, + { + "epoch": 0.33133173205230854, + "grad_norm": 1.0266741767972964, + "learning_rate": 1.5604343895016107e-05, + "loss": 1.6305, + "step": 2483 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 1.0172781397984123, + "learning_rate": 1.5600763975171065e-05, + "loss": 1.6619, + "step": 2484 + }, + { + "epoch": 0.3315986122231118, + "grad_norm": 1.0013978329841358, + "learning_rate": 1.5597183009167076e-05, + "loss": 1.6364, + "step": 2485 + }, + { + "epoch": 0.3317320523085135, + "grad_norm": 1.023294574063603, + "learning_rate": 1.559360099767303e-05, + "loss": 1.6665, + "step": 2486 + }, + { + "epoch": 0.3318654923939151, + "grad_norm": 1.0000206147873272, + "learning_rate": 1.5590017941358002e-05, + "loss": 1.5405, + "step": 2487 + }, + { + "epoch": 0.3319989324793168, + "grad_norm": 1.0226703188487969, + "learning_rate": 1.5586433840891268e-05, + "loss": 1.6402, + "step": 2488 + }, + { + "epoch": 0.33213237256471845, + "grad_norm": 1.1484976146272843, + "learning_rate": 1.5582848696942298e-05, + "loss": 1.6451, + "step": 2489 + }, + { + "epoch": 0.3322658126501201, + "grad_norm": 1.0151865246487197, + "learning_rate": 1.557926251018075e-05, + "loss": 1.6442, + "step": 2490 + }, + { + "epoch": 0.33239925273552173, + "grad_norm": 1.0315722760396437, + "learning_rate": 1.557567528127649e-05, + "loss": 1.6478, + "step": 2491 + }, + { + "epoch": 0.3325326928209234, + "grad_norm": 0.9943912244470389, + "learning_rate": 1.557208701089957e-05, + "loss": 1.622, + "step": 2492 + }, + { + "epoch": 0.3326661329063251, + "grad_norm": 1.27101225719953, + "learning_rate": 1.5568497699720237e-05, + "loss": 1.619, + "step": 2493 + }, + { + "epoch": 0.3327995729917267, + "grad_norm": 1.0729429345846575, + "learning_rate": 1.556490734840893e-05, + "loss": 1.6505, + "step": 2494 + }, + { + "epoch": 0.33293301307712836, + "grad_norm": 1.1996428230299965, + "learning_rate": 1.5561315957636294e-05, + "loss": 1.6437, + "step": 2495 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 1.019452040571454, + "learning_rate": 
1.555772352807315e-05, + "loss": 1.5922, + "step": 2496 + }, + { + "epoch": 0.3331998932479317, + "grad_norm": 0.9966197268098483, + "learning_rate": 1.5554130060390535e-05, + "loss": 1.6469, + "step": 2497 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.9924997296677041, + "learning_rate": 1.5550535555259654e-05, + "loss": 1.6905, + "step": 2498 + }, + { + "epoch": 0.333466773418735, + "grad_norm": 1.0222682047094578, + "learning_rate": 1.554694001335193e-05, + "loss": 1.6463, + "step": 2499 + }, + { + "epoch": 0.33360021350413666, + "grad_norm": 0.9803234389294245, + "learning_rate": 1.5543343435338965e-05, + "loss": 1.6299, + "step": 2500 + }, + { + "epoch": 0.33373365358953827, + "grad_norm": 1.1178595006838485, + "learning_rate": 1.5539745821892558e-05, + "loss": 1.5962, + "step": 2501 + }, + { + "epoch": 0.33386709367493994, + "grad_norm": 1.2903970617621783, + "learning_rate": 1.5536147173684706e-05, + "loss": 1.6377, + "step": 2502 + }, + { + "epoch": 0.3340005337603416, + "grad_norm": 0.9991404310713827, + "learning_rate": 1.553254749138759e-05, + "loss": 1.676, + "step": 2503 + }, + { + "epoch": 0.3341339738457433, + "grad_norm": 1.0017240659943722, + "learning_rate": 1.55289467756736e-05, + "loss": 1.6382, + "step": 2504 + }, + { + "epoch": 0.3342674139311449, + "grad_norm": 1.0389840759263576, + "learning_rate": 1.5525345027215296e-05, + "loss": 1.6566, + "step": 2505 + }, + { + "epoch": 0.33440085401654657, + "grad_norm": 1.1995901930820982, + "learning_rate": 1.552174224668545e-05, + "loss": 1.586, + "step": 2506 + }, + { + "epoch": 0.33453429410194824, + "grad_norm": 1.0567962769597756, + "learning_rate": 1.551813843475702e-05, + "loss": 1.6554, + "step": 2507 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 1.005841753947666, + "learning_rate": 1.5514533592103154e-05, + "loss": 1.625, + "step": 2508 + }, + { + "epoch": 0.3348011742727515, + "grad_norm": 0.9857509429122189, + "learning_rate": 1.5510927719397203e-05, + "loss": 1.6568, + "step": 2509 + }, + { + "epoch": 0.3349346143581532, + "grad_norm": 1.0001185366594785, + "learning_rate": 1.55073208173127e-05, + "loss": 1.6372, + "step": 2510 + }, + { + "epoch": 0.33506805444355486, + "grad_norm": 1.015290456272018, + "learning_rate": 1.5503712886523366e-05, + "loss": 1.6635, + "step": 2511 + }, + { + "epoch": 0.3352014945289565, + "grad_norm": 1.010515707552436, + "learning_rate": 1.5500103927703128e-05, + "loss": 1.5674, + "step": 2512 + }, + { + "epoch": 0.33533493461435815, + "grad_norm": 1.2221140435977949, + "learning_rate": 1.54964939415261e-05, + "loss": 1.6435, + "step": 2513 + }, + { + "epoch": 0.3354683746997598, + "grad_norm": 1.0752443571797692, + "learning_rate": 1.549288292866658e-05, + "loss": 1.6192, + "step": 2514 + }, + { + "epoch": 0.3356018147851615, + "grad_norm": 0.9988521065908124, + "learning_rate": 1.5489270889799074e-05, + "loss": 1.6454, + "step": 2515 + }, + { + "epoch": 0.3357352548705631, + "grad_norm": 1.386021157462588, + "learning_rate": 1.548565782559826e-05, + "loss": 1.6567, + "step": 2516 + }, + { + "epoch": 0.3358686949559648, + "grad_norm": 1.2499411923947341, + "learning_rate": 1.5482043736739018e-05, + "loss": 1.65, + "step": 2517 + }, + { + "epoch": 0.33600213504136645, + "grad_norm": 1.040398945045627, + "learning_rate": 1.5478428623896423e-05, + "loss": 1.6505, + "step": 2518 + }, + { + "epoch": 0.33613557512676806, + "grad_norm": 1.0467166314907603, + "learning_rate": 1.5474812487745738e-05, + "loss": 1.6145, + "step": 2519 + }, + { + "epoch": 
0.33626901521216973, + "grad_norm": 0.9735753640039505, + "learning_rate": 1.547119532896241e-05, + "loss": 1.5752, + "step": 2520 + }, + { + "epoch": 0.3364024552975714, + "grad_norm": 1.1331650065762788, + "learning_rate": 1.5467577148222087e-05, + "loss": 1.5787, + "step": 2521 + }, + { + "epoch": 0.33653589538297307, + "grad_norm": 1.0303627504262778, + "learning_rate": 1.5463957946200606e-05, + "loss": 1.5802, + "step": 2522 + }, + { + "epoch": 0.3366693354683747, + "grad_norm": 1.0155983478918824, + "learning_rate": 1.5460337723573985e-05, + "loss": 1.6243, + "step": 2523 + }, + { + "epoch": 0.33680277555377636, + "grad_norm": 1.0289284948725979, + "learning_rate": 1.545671648101845e-05, + "loss": 1.658, + "step": 2524 + }, + { + "epoch": 0.336936215639178, + "grad_norm": 1.2059550726829327, + "learning_rate": 1.5453094219210402e-05, + "loss": 1.6115, + "step": 2525 + }, + { + "epoch": 0.33706965572457964, + "grad_norm": 1.0560606404340085, + "learning_rate": 1.544947093882644e-05, + "loss": 1.6066, + "step": 2526 + }, + { + "epoch": 0.3372030958099813, + "grad_norm": 1.1249026083140772, + "learning_rate": 1.5445846640543353e-05, + "loss": 1.6824, + "step": 2527 + }, + { + "epoch": 0.337336535895383, + "grad_norm": 1.0524987361668996, + "learning_rate": 1.5442221325038113e-05, + "loss": 1.6226, + "step": 2528 + }, + { + "epoch": 0.33746997598078465, + "grad_norm": 1.1006613854032747, + "learning_rate": 1.5438594992987893e-05, + "loss": 1.6257, + "step": 2529 + }, + { + "epoch": 0.33760341606618627, + "grad_norm": 1.1312957489575692, + "learning_rate": 1.543496764507005e-05, + "loss": 1.6335, + "step": 2530 + }, + { + "epoch": 0.33773685615158794, + "grad_norm": 1.0553318381697188, + "learning_rate": 1.543133928196213e-05, + "loss": 1.6227, + "step": 2531 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 1.1640026808016215, + "learning_rate": 1.542770990434187e-05, + "loss": 1.615, + "step": 2532 + }, + { + "epoch": 0.3380037363223912, + "grad_norm": 1.1330496348678385, + "learning_rate": 1.5424079512887197e-05, + "loss": 1.6289, + "step": 2533 + }, + { + "epoch": 0.3381371764077929, + "grad_norm": 1.235500382588085, + "learning_rate": 1.5420448108276225e-05, + "loss": 1.6293, + "step": 2534 + }, + { + "epoch": 0.33827061649319456, + "grad_norm": 0.9892442737683309, + "learning_rate": 1.5416815691187263e-05, + "loss": 1.6451, + "step": 2535 + }, + { + "epoch": 0.33840405657859624, + "grad_norm": 1.0714941246892575, + "learning_rate": 1.54131822622988e-05, + "loss": 1.6383, + "step": 2536 + }, + { + "epoch": 0.33853749666399785, + "grad_norm": 0.9765304919004115, + "learning_rate": 1.5409547822289523e-05, + "loss": 1.6106, + "step": 2537 + }, + { + "epoch": 0.3386709367493995, + "grad_norm": 1.1397242313512768, + "learning_rate": 1.5405912371838303e-05, + "loss": 1.6759, + "step": 2538 + }, + { + "epoch": 0.3388043768348012, + "grad_norm": 1.04733183839702, + "learning_rate": 1.5402275911624202e-05, + "loss": 1.6065, + "step": 2539 + }, + { + "epoch": 0.3389378169202028, + "grad_norm": 1.0923000163302528, + "learning_rate": 1.5398638442326466e-05, + "loss": 1.5728, + "step": 2540 + }, + { + "epoch": 0.3390712570056045, + "grad_norm": 1.012586965117265, + "learning_rate": 1.5394999964624534e-05, + "loss": 1.6695, + "step": 2541 + }, + { + "epoch": 0.33920469709100615, + "grad_norm": 1.2400537139148242, + "learning_rate": 1.5391360479198035e-05, + "loss": 1.6388, + "step": 2542 + }, + { + "epoch": 0.3393381371764078, + "grad_norm": 1.2509378456829192, + "learning_rate": 
1.538771998672678e-05, + "loss": 1.596, + "step": 2543 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 1.075544011557188, + "learning_rate": 1.538407848789077e-05, + "loss": 1.5977, + "step": 2544 + }, + { + "epoch": 0.3396050173472111, + "grad_norm": 1.2176294488296542, + "learning_rate": 1.5380435983370207e-05, + "loss": 1.6392, + "step": 2545 + }, + { + "epoch": 0.3397384574326128, + "grad_norm": 1.3397975236283604, + "learning_rate": 1.5376792473845453e-05, + "loss": 1.6818, + "step": 2546 + }, + { + "epoch": 0.3398718975180144, + "grad_norm": 1.046773813890146, + "learning_rate": 1.5373147959997085e-05, + "loss": 1.6099, + "step": 2547 + }, + { + "epoch": 0.34000533760341606, + "grad_norm": 1.0648283430543095, + "learning_rate": 1.5369502442505852e-05, + "loss": 1.6449, + "step": 2548 + }, + { + "epoch": 0.34013877768881773, + "grad_norm": 1.1528358481625516, + "learning_rate": 1.53658559220527e-05, + "loss": 1.6178, + "step": 2549 + }, + { + "epoch": 0.3402722177742194, + "grad_norm": 1.230719491103403, + "learning_rate": 1.5362208399318753e-05, + "loss": 1.6431, + "step": 2550 + }, + { + "epoch": 0.340405657859621, + "grad_norm": 0.9911045985776983, + "learning_rate": 1.5358559874985326e-05, + "loss": 1.6593, + "step": 2551 + }, + { + "epoch": 0.3405390979450227, + "grad_norm": 1.0148399749816694, + "learning_rate": 1.5354910349733926e-05, + "loss": 1.6305, + "step": 2552 + }, + { + "epoch": 0.34067253803042435, + "grad_norm": 0.9757587532893943, + "learning_rate": 1.5351259824246244e-05, + "loss": 1.6463, + "step": 2553 + }, + { + "epoch": 0.34080597811582597, + "grad_norm": 1.0377383659545665, + "learning_rate": 1.5347608299204147e-05, + "loss": 1.5969, + "step": 2554 + }, + { + "epoch": 0.34093941820122764, + "grad_norm": 1.1327301101792895, + "learning_rate": 1.5343955775289708e-05, + "loss": 1.5886, + "step": 2555 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 1.1199829427800625, + "learning_rate": 1.534030225318517e-05, + "loss": 1.6568, + "step": 2556 + }, + { + "epoch": 0.341206298372031, + "grad_norm": 1.1255501040237739, + "learning_rate": 1.533664773357298e-05, + "loss": 1.6195, + "step": 2557 + }, + { + "epoch": 0.3413397384574326, + "grad_norm": 0.9751153526994987, + "learning_rate": 1.5332992217135746e-05, + "loss": 1.6075, + "step": 2558 + }, + { + "epoch": 0.34147317854283427, + "grad_norm": 1.0250661162211099, + "learning_rate": 1.5329335704556284e-05, + "loss": 1.6398, + "step": 2559 + }, + { + "epoch": 0.34160661862823594, + "grad_norm": 1.1963912340246656, + "learning_rate": 1.532567819651759e-05, + "loss": 1.6364, + "step": 2560 + }, + { + "epoch": 0.34174005871363755, + "grad_norm": 1.0270905478988817, + "learning_rate": 1.532201969370285e-05, + "loss": 1.6027, + "step": 2561 + }, + { + "epoch": 0.3418734987990392, + "grad_norm": 0.9716309113456719, + "learning_rate": 1.531836019679542e-05, + "loss": 1.6109, + "step": 2562 + }, + { + "epoch": 0.3420069388844409, + "grad_norm": 1.0974203593959186, + "learning_rate": 1.5314699706478856e-05, + "loss": 1.6531, + "step": 2563 + }, + { + "epoch": 0.34214037896984256, + "grad_norm": 1.1280911638633904, + "learning_rate": 1.53110382234369e-05, + "loss": 1.6526, + "step": 2564 + }, + { + "epoch": 0.3422738190552442, + "grad_norm": 1.0352666364430376, + "learning_rate": 1.530737574835347e-05, + "loss": 1.638, + "step": 2565 + }, + { + "epoch": 0.34240725914064585, + "grad_norm": 0.9852011126418923, + "learning_rate": 1.530371228191268e-05, + "loss": 1.629, + "step": 2566 + }, + { + "epoch": 
0.3425406992260475, + "grad_norm": 1.7874095738872313, + "learning_rate": 1.5300047824798815e-05, + "loss": 1.6322, + "step": 2567 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 1.068401710484991, + "learning_rate": 1.5296382377696364e-05, + "loss": 1.6951, + "step": 2568 + }, + { + "epoch": 0.3428075793968508, + "grad_norm": 0.9922432887897237, + "learning_rate": 1.5292715941289985e-05, + "loss": 1.6335, + "step": 2569 + }, + { + "epoch": 0.3429410194822525, + "grad_norm": 0.9469476131303167, + "learning_rate": 1.528904851626453e-05, + "loss": 1.5556, + "step": 2570 + }, + { + "epoch": 0.34307445956765414, + "grad_norm": 1.0252957676256522, + "learning_rate": 1.528538010330503e-05, + "loss": 1.6207, + "step": 2571 + }, + { + "epoch": 0.34320789965305576, + "grad_norm": 1.256901118189651, + "learning_rate": 1.52817107030967e-05, + "loss": 1.5499, + "step": 2572 + }, + { + "epoch": 0.34334133973845743, + "grad_norm": 1.1760660704565096, + "learning_rate": 1.5278040316324947e-05, + "loss": 1.5907, + "step": 2573 + }, + { + "epoch": 0.3434747798238591, + "grad_norm": 1.056335970772178, + "learning_rate": 1.5274368943675358e-05, + "loss": 1.6585, + "step": 2574 + }, + { + "epoch": 0.3436082199092607, + "grad_norm": 1.0440163675723362, + "learning_rate": 1.5270696585833697e-05, + "loss": 1.6956, + "step": 2575 + }, + { + "epoch": 0.3437416599946624, + "grad_norm": 1.1496273753007886, + "learning_rate": 1.5267023243485927e-05, + "loss": 1.626, + "step": 2576 + }, + { + "epoch": 0.34387510008006406, + "grad_norm": 0.989190837912653, + "learning_rate": 1.526334891731818e-05, + "loss": 1.5821, + "step": 2577 + }, + { + "epoch": 0.3440085401654657, + "grad_norm": 0.975382429954711, + "learning_rate": 1.5259673608016782e-05, + "loss": 1.6546, + "step": 2578 + }, + { + "epoch": 0.34414198025086734, + "grad_norm": 1.2380666712332904, + "learning_rate": 1.5255997316268234e-05, + "loss": 1.6596, + "step": 2579 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 1.0226269070252, + "learning_rate": 1.5252320042759232e-05, + "loss": 1.6045, + "step": 2580 + }, + { + "epoch": 0.3444088604216707, + "grad_norm": 0.9919122022483479, + "learning_rate": 1.5248641788176647e-05, + "loss": 1.5316, + "step": 2581 + }, + { + "epoch": 0.34454230050707235, + "grad_norm": 1.1347269775006643, + "learning_rate": 1.5244962553207531e-05, + "loss": 1.6384, + "step": 2582 + }, + { + "epoch": 0.34467574059247397, + "grad_norm": 0.9828544036878673, + "learning_rate": 1.524128233853913e-05, + "loss": 1.6399, + "step": 2583 + }, + { + "epoch": 0.34480918067787564, + "grad_norm": 1.3101691050309692, + "learning_rate": 1.523760114485886e-05, + "loss": 1.6285, + "step": 2584 + }, + { + "epoch": 0.3449426207632773, + "grad_norm": 1.0927621397780132, + "learning_rate": 1.5233918972854329e-05, + "loss": 1.5724, + "step": 2585 + }, + { + "epoch": 0.3450760608486789, + "grad_norm": 1.0744584089585294, + "learning_rate": 1.5230235823213325e-05, + "loss": 1.5861, + "step": 2586 + }, + { + "epoch": 0.3452095009340806, + "grad_norm": 1.1708529579248028, + "learning_rate": 1.5226551696623815e-05, + "loss": 1.6419, + "step": 2587 + }, + { + "epoch": 0.34534294101948226, + "grad_norm": 1.170484702302893, + "learning_rate": 1.5222866593773955e-05, + "loss": 1.6793, + "step": 2588 + }, + { + "epoch": 0.34547638110488393, + "grad_norm": 0.9940131617224933, + "learning_rate": 1.521918051535208e-05, + "loss": 1.6196, + "step": 2589 + }, + { + "epoch": 0.34560982119028555, + "grad_norm": 1.0291494485832964, + "learning_rate": 
1.5215493462046711e-05, + "loss": 1.6436, + "step": 2590 + }, + { + "epoch": 0.3457432612756872, + "grad_norm": 1.1772938675721651, + "learning_rate": 1.5211805434546541e-05, + "loss": 1.5946, + "step": 2591 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 1.3230747393201525, + "learning_rate": 1.5208116433540454e-05, + "loss": 1.6298, + "step": 2592 + }, + { + "epoch": 0.3460101414464905, + "grad_norm": 1.0683806522297445, + "learning_rate": 1.5204426459717516e-05, + "loss": 1.6174, + "step": 2593 + }, + { + "epoch": 0.3461435815318922, + "grad_norm": 1.1046905551827517, + "learning_rate": 1.520073551376697e-05, + "loss": 1.6434, + "step": 2594 + }, + { + "epoch": 0.34627702161729385, + "grad_norm": 1.070450277819012, + "learning_rate": 1.519704359637824e-05, + "loss": 1.6019, + "step": 2595 + }, + { + "epoch": 0.3464104617026955, + "grad_norm": 1.3058537795961895, + "learning_rate": 1.519335070824094e-05, + "loss": 1.6049, + "step": 2596 + }, + { + "epoch": 0.34654390178809713, + "grad_norm": 1.0080854625304823, + "learning_rate": 1.5189656850044853e-05, + "loss": 1.6254, + "step": 2597 + }, + { + "epoch": 0.3466773418734988, + "grad_norm": 1.2017908924230127, + "learning_rate": 1.5185962022479954e-05, + "loss": 1.6323, + "step": 2598 + }, + { + "epoch": 0.34681078195890047, + "grad_norm": 1.0124260951271844, + "learning_rate": 1.5182266226236394e-05, + "loss": 1.6621, + "step": 2599 + }, + { + "epoch": 0.3469442220443021, + "grad_norm": 1.2316810096284962, + "learning_rate": 1.5178569462004506e-05, + "loss": 1.5827, + "step": 2600 + }, + { + "epoch": 0.34707766212970376, + "grad_norm": 1.0242976500030199, + "learning_rate": 1.51748717304748e-05, + "loss": 1.6034, + "step": 2601 + }, + { + "epoch": 0.3472111022151054, + "grad_norm": 1.0030046362345064, + "learning_rate": 1.5171173032337976e-05, + "loss": 1.654, + "step": 2602 + }, + { + "epoch": 0.3473445423005071, + "grad_norm": 1.0713133664664767, + "learning_rate": 1.5167473368284902e-05, + "loss": 1.6352, + "step": 2603 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 1.0969747552844828, + "learning_rate": 1.516377273900664e-05, + "loss": 1.6475, + "step": 2604 + }, + { + "epoch": 0.3476114224713104, + "grad_norm": 0.9899374388035455, + "learning_rate": 1.5160071145194422e-05, + "loss": 1.6286, + "step": 2605 + }, + { + "epoch": 0.34774486255671205, + "grad_norm": 0.9679955249669937, + "learning_rate": 1.515636858753966e-05, + "loss": 1.6383, + "step": 2606 + }, + { + "epoch": 0.34787830264211367, + "grad_norm": 1.0249461107418476, + "learning_rate": 1.5152665066733957e-05, + "loss": 1.6083, + "step": 2607 + }, + { + "epoch": 0.34801174272751534, + "grad_norm": 1.303220194256495, + "learning_rate": 1.5148960583469085e-05, + "loss": 1.6688, + "step": 2608 + }, + { + "epoch": 0.348145182812917, + "grad_norm": 1.0188653868990143, + "learning_rate": 1.5145255138436999e-05, + "loss": 1.6288, + "step": 2609 + }, + { + "epoch": 0.3482786228983187, + "grad_norm": 1.2584157965822476, + "learning_rate": 1.5141548732329831e-05, + "loss": 1.6492, + "step": 2610 + }, + { + "epoch": 0.3484120629837203, + "grad_norm": 1.0306474809508248, + "learning_rate": 1.5137841365839901e-05, + "loss": 1.6854, + "step": 2611 + }, + { + "epoch": 0.34854550306912196, + "grad_norm": 1.056381467093589, + "learning_rate": 1.51341330396597e-05, + "loss": 1.6271, + "step": 2612 + }, + { + "epoch": 0.34867894315452364, + "grad_norm": 1.0295530745478738, + "learning_rate": 1.51304237544819e-05, + "loss": 1.6633, + "step": 2613 + }, + { + "epoch": 
0.34881238323992525, + "grad_norm": 1.0398915950232068, + "learning_rate": 1.5126713510999359e-05, + "loss": 1.627, + "step": 2614 + }, + { + "epoch": 0.3489458233253269, + "grad_norm": 1.0146998246386771, + "learning_rate": 1.51230023099051e-05, + "loss": 1.627, + "step": 2615 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 1.0172109821324007, + "learning_rate": 1.5119290151892339e-05, + "loss": 1.5836, + "step": 2616 + }, + { + "epoch": 0.34921270349613026, + "grad_norm": 1.0267520201741256, + "learning_rate": 1.5115577037654464e-05, + "loss": 1.6439, + "step": 2617 + }, + { + "epoch": 0.3493461435815319, + "grad_norm": 1.0457857231337446, + "learning_rate": 1.5111862967885036e-05, + "loss": 1.6157, + "step": 2618 + }, + { + "epoch": 0.34947958366693355, + "grad_norm": 1.0555824876624866, + "learning_rate": 1.5108147943277813e-05, + "loss": 1.5927, + "step": 2619 + }, + { + "epoch": 0.3496130237523352, + "grad_norm": 0.9824068117065244, + "learning_rate": 1.5104431964526707e-05, + "loss": 1.6322, + "step": 2620 + }, + { + "epoch": 0.34974646383773683, + "grad_norm": 1.0882477650288422, + "learning_rate": 1.510071503232583e-05, + "loss": 1.5678, + "step": 2621 + }, + { + "epoch": 0.3498799039231385, + "grad_norm": 0.966848710204604, + "learning_rate": 1.5096997147369457e-05, + "loss": 1.6656, + "step": 2622 + }, + { + "epoch": 0.3500133440085402, + "grad_norm": 0.9955616430245504, + "learning_rate": 1.509327831035205e-05, + "loss": 1.6429, + "step": 2623 + }, + { + "epoch": 0.35014678409394184, + "grad_norm": 1.5091711649173347, + "learning_rate": 1.5089558521968242e-05, + "loss": 1.7588, + "step": 2624 + }, + { + "epoch": 0.35028022417934346, + "grad_norm": 0.9707332773761684, + "learning_rate": 1.5085837782912851e-05, + "loss": 1.6116, + "step": 2625 + }, + { + "epoch": 0.35041366426474513, + "grad_norm": 0.9642328468057494, + "learning_rate": 1.5082116093880862e-05, + "loss": 1.6222, + "step": 2626 + }, + { + "epoch": 0.3505471043501468, + "grad_norm": 1.0194989516497073, + "learning_rate": 1.5078393455567452e-05, + "loss": 1.6335, + "step": 2627 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 0.9709628625829926, + "learning_rate": 1.5074669868667964e-05, + "loss": 1.6335, + "step": 2628 + }, + { + "epoch": 0.3508139845209501, + "grad_norm": 1.0095163214896312, + "learning_rate": 1.5070945333877923e-05, + "loss": 1.6442, + "step": 2629 + }, + { + "epoch": 0.35094742460635175, + "grad_norm": 1.1931846380684248, + "learning_rate": 1.506721985189303e-05, + "loss": 1.638, + "step": 2630 + }, + { + "epoch": 0.3510808646917534, + "grad_norm": 0.9839265090005194, + "learning_rate": 1.5063493423409154e-05, + "loss": 1.6196, + "step": 2631 + }, + { + "epoch": 0.35121430477715504, + "grad_norm": 1.1186506149798139, + "learning_rate": 1.5059766049122364e-05, + "loss": 1.6225, + "step": 2632 + }, + { + "epoch": 0.3513477448625567, + "grad_norm": 1.0864720980811629, + "learning_rate": 1.5056037729728878e-05, + "loss": 1.5935, + "step": 2633 + }, + { + "epoch": 0.3514811849479584, + "grad_norm": 1.2246349786822723, + "learning_rate": 1.5052308465925115e-05, + "loss": 1.5767, + "step": 2634 + }, + { + "epoch": 0.35161462503336, + "grad_norm": 1.0333745712542706, + "learning_rate": 1.5048578258407646e-05, + "loss": 1.6637, + "step": 2635 + }, + { + "epoch": 0.35174806511876167, + "grad_norm": 1.0018496493629654, + "learning_rate": 1.5044847107873246e-05, + "loss": 1.6278, + "step": 2636 + }, + { + "epoch": 0.35188150520416334, + "grad_norm": 1.1758224822257188, + "learning_rate": 
1.5041115015018841e-05, + "loss": 1.6303, + "step": 2637 + }, + { + "epoch": 0.352014945289565, + "grad_norm": 1.250987136440272, + "learning_rate": 1.5037381980541546e-05, + "loss": 1.6571, + "step": 2638 + }, + { + "epoch": 0.3521483853749666, + "grad_norm": 1.2181097297996704, + "learning_rate": 1.503364800513865e-05, + "loss": 1.704, + "step": 2639 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 1.0786935661772743, + "learning_rate": 1.5029913089507617e-05, + "loss": 1.5949, + "step": 2640 + }, + { + "epoch": 0.35241526554576996, + "grad_norm": 1.0531753164421735, + "learning_rate": 1.5026177234346087e-05, + "loss": 1.6373, + "step": 2641 + }, + { + "epoch": 0.3525487056311716, + "grad_norm": 1.3629803662802447, + "learning_rate": 1.5022440440351873e-05, + "loss": 1.6762, + "step": 2642 + }, + { + "epoch": 0.35268214571657325, + "grad_norm": 1.1620679753807144, + "learning_rate": 1.501870270822297e-05, + "loss": 1.6133, + "step": 2643 + }, + { + "epoch": 0.3528155858019749, + "grad_norm": 1.0348126543518374, + "learning_rate": 1.5014964038657538e-05, + "loss": 1.654, + "step": 2644 + }, + { + "epoch": 0.3529490258873766, + "grad_norm": 1.1304213137830432, + "learning_rate": 1.5011224432353924e-05, + "loss": 1.6386, + "step": 2645 + }, + { + "epoch": 0.3530824659727782, + "grad_norm": 1.1277308613925912, + "learning_rate": 1.5007483890010642e-05, + "loss": 1.6194, + "step": 2646 + }, + { + "epoch": 0.3532159060581799, + "grad_norm": 1.0163090940302686, + "learning_rate": 1.5003742412326377e-05, + "loss": 1.6575, + "step": 2647 + }, + { + "epoch": 0.35334934614358154, + "grad_norm": 1.0125879795963375, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.6453, + "step": 2648 + }, + { + "epoch": 0.35348278622898316, + "grad_norm": 1.0098185677548541, + "learning_rate": 1.4996256653730554e-05, + "loss": 1.6068, + "step": 2649 + }, + { + "epoch": 0.35361622631438483, + "grad_norm": 1.2735854212695745, + "learning_rate": 1.4992512374217247e-05, + "loss": 1.6487, + "step": 2650 + }, + { + "epoch": 0.3537496663997865, + "grad_norm": 1.0932430437817762, + "learning_rate": 1.498876716215947e-05, + "loss": 1.6284, + "step": 2651 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 0.9777781439946768, + "learning_rate": 1.4985021018256786e-05, + "loss": 1.625, + "step": 2652 + }, + { + "epoch": 0.3540165465705898, + "grad_norm": 1.1136916517793387, + "learning_rate": 1.4981273943208937e-05, + "loss": 1.6708, + "step": 2653 + }, + { + "epoch": 0.35414998665599146, + "grad_norm": 1.373624233594401, + "learning_rate": 1.4977525937715824e-05, + "loss": 1.6216, + "step": 2654 + }, + { + "epoch": 0.3542834267413931, + "grad_norm": 1.0595448685922348, + "learning_rate": 1.4973777002477538e-05, + "loss": 1.5722, + "step": 2655 + }, + { + "epoch": 0.3544168668267948, + "grad_norm": 1.2378751910850914, + "learning_rate": 1.4970027138194336e-05, + "loss": 1.6311, + "step": 2656 + }, + { + "epoch": 0.3545503069121964, + "grad_norm": 1.1786205067232547, + "learning_rate": 1.4966276345566654e-05, + "loss": 1.6456, + "step": 2657 + }, + { + "epoch": 0.3546837469975981, + "grad_norm": 1.0572423131869906, + "learning_rate": 1.4962524625295092e-05, + "loss": 1.6392, + "step": 2658 + }, + { + "epoch": 0.35481718708299975, + "grad_norm": 1.0148567443614354, + "learning_rate": 1.495877197808043e-05, + "loss": 1.6173, + "step": 2659 + }, + { + "epoch": 0.35495062716840137, + "grad_norm": 0.9578348594491616, + "learning_rate": 1.4955018404623623e-05, + "loss": 1.628, + "step": 2660 + }, + { + "epoch": 
0.35508406725380304, + "grad_norm": 1.1181766176664125, + "learning_rate": 1.4951263905625788e-05, + "loss": 1.5787, + "step": 2661 + }, + { + "epoch": 0.3552175073392047, + "grad_norm": 1.1415697297087068, + "learning_rate": 1.4947508481788231e-05, + "loss": 1.5782, + "step": 2662 + }, + { + "epoch": 0.3553509474246064, + "grad_norm": 0.9866478663748992, + "learning_rate": 1.494375213381242e-05, + "loss": 1.5652, + "step": 2663 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 1.2168790215873122, + "learning_rate": 1.4939994862399996e-05, + "loss": 1.6349, + "step": 2664 + }, + { + "epoch": 0.35561782759540966, + "grad_norm": 1.0419160584710712, + "learning_rate": 1.4936236668252772e-05, + "loss": 1.625, + "step": 2665 + }, + { + "epoch": 0.35575126768081133, + "grad_norm": 1.0033295412035523, + "learning_rate": 1.4932477552072745e-05, + "loss": 1.6232, + "step": 2666 + }, + { + "epoch": 0.35588470776621295, + "grad_norm": 0.9869871754447705, + "learning_rate": 1.4928717514562066e-05, + "loss": 1.5835, + "step": 2667 + }, + { + "epoch": 0.3560181478516146, + "grad_norm": 0.992408467300245, + "learning_rate": 1.4924956556423072e-05, + "loss": 1.603, + "step": 2668 + }, + { + "epoch": 0.3561515879370163, + "grad_norm": 0.9824615329438933, + "learning_rate": 1.4921194678358266e-05, + "loss": 1.5976, + "step": 2669 + }, + { + "epoch": 0.35628502802241796, + "grad_norm": 0.9934973089076394, + "learning_rate": 1.4917431881070323e-05, + "loss": 1.595, + "step": 2670 + }, + { + "epoch": 0.3564184681078196, + "grad_norm": 0.9868126452811748, + "learning_rate": 1.4913668165262095e-05, + "loss": 1.6287, + "step": 2671 + }, + { + "epoch": 0.35655190819322125, + "grad_norm": 1.0146369534155588, + "learning_rate": 1.4909903531636593e-05, + "loss": 1.5831, + "step": 2672 + }, + { + "epoch": 0.3566853482786229, + "grad_norm": 1.0128166092521045, + "learning_rate": 1.4906137980897017e-05, + "loss": 1.6241, + "step": 2673 + }, + { + "epoch": 0.35681878836402453, + "grad_norm": 1.0086684442228309, + "learning_rate": 1.4902371513746723e-05, + "loss": 1.6162, + "step": 2674 + }, + { + "epoch": 0.3569522284494262, + "grad_norm": 1.0430890218619906, + "learning_rate": 1.4898604130889243e-05, + "loss": 1.6444, + "step": 2675 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 0.9873271173986554, + "learning_rate": 1.489483583302829e-05, + "loss": 1.6058, + "step": 2676 + }, + { + "epoch": 0.35721910862022954, + "grad_norm": 0.9633834898755523, + "learning_rate": 1.4891066620867726e-05, + "loss": 1.5722, + "step": 2677 + }, + { + "epoch": 0.35735254870563116, + "grad_norm": 1.0004128851077128, + "learning_rate": 1.4887296495111611e-05, + "loss": 1.6337, + "step": 2678 + }, + { + "epoch": 0.3574859887910328, + "grad_norm": 1.1889045283167872, + "learning_rate": 1.4883525456464152e-05, + "loss": 1.5775, + "step": 2679 + }, + { + "epoch": 0.3576194288764345, + "grad_norm": 2.5683847190418736, + "learning_rate": 1.4879753505629739e-05, + "loss": 1.6822, + "step": 2680 + }, + { + "epoch": 0.3577528689618361, + "grad_norm": 1.0263603075923868, + "learning_rate": 1.4875980643312931e-05, + "loss": 1.6556, + "step": 2681 + }, + { + "epoch": 0.3578863090472378, + "grad_norm": 1.0275465262931753, + "learning_rate": 1.4872206870218451e-05, + "loss": 1.6586, + "step": 2682 + }, + { + "epoch": 0.35801974913263945, + "grad_norm": 1.0578558588998055, + "learning_rate": 1.4868432187051201e-05, + "loss": 1.6126, + "step": 2683 + }, + { + "epoch": 0.3581531892180411, + "grad_norm": 1.0209797685600523, + "learning_rate": 
1.4864656594516245e-05, + "loss": 1.6051, + "step": 2684 + }, + { + "epoch": 0.35828662930344274, + "grad_norm": 1.0698279049277513, + "learning_rate": 1.4860880093318827e-05, + "loss": 1.6162, + "step": 2685 + }, + { + "epoch": 0.3584200693888444, + "grad_norm": 1.0175812497772847, + "learning_rate": 1.4857102684164349e-05, + "loss": 1.6055, + "step": 2686 + }, + { + "epoch": 0.3585535094742461, + "grad_norm": 1.0104871621067544, + "learning_rate": 1.485332436775839e-05, + "loss": 1.6041, + "step": 2687 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 1.1287495893065134, + "learning_rate": 1.4849545144806697e-05, + "loss": 1.603, + "step": 2688 + }, + { + "epoch": 0.35882038964504936, + "grad_norm": 1.0368374324423373, + "learning_rate": 1.4845765016015183e-05, + "loss": 1.6295, + "step": 2689 + }, + { + "epoch": 0.35895382973045104, + "grad_norm": 1.0590887497081614, + "learning_rate": 1.4841983982089936e-05, + "loss": 1.6473, + "step": 2690 + }, + { + "epoch": 0.3590872698158527, + "grad_norm": 1.1066849585601135, + "learning_rate": 1.4838202043737209e-05, + "loss": 1.6567, + "step": 2691 + }, + { + "epoch": 0.3592207099012543, + "grad_norm": 1.018947820472309, + "learning_rate": 1.4834419201663429e-05, + "loss": 1.6562, + "step": 2692 + }, + { + "epoch": 0.359354149986656, + "grad_norm": 1.128393294375381, + "learning_rate": 1.483063545657518e-05, + "loss": 1.6634, + "step": 2693 + }, + { + "epoch": 0.35948759007205766, + "grad_norm": 1.0975481758375052, + "learning_rate": 1.4826850809179228e-05, + "loss": 1.6274, + "step": 2694 + }, + { + "epoch": 0.3596210301574593, + "grad_norm": 1.1035342439097737, + "learning_rate": 1.4823065260182499e-05, + "loss": 1.6199, + "step": 2695 + }, + { + "epoch": 0.35975447024286095, + "grad_norm": 1.0823826339805787, + "learning_rate": 1.4819278810292096e-05, + "loss": 1.5614, + "step": 2696 + }, + { + "epoch": 0.3598879103282626, + "grad_norm": 1.0810219041790743, + "learning_rate": 1.4815491460215277e-05, + "loss": 1.6457, + "step": 2697 + }, + { + "epoch": 0.3600213504136643, + "grad_norm": 1.008665774442881, + "learning_rate": 1.481170321065948e-05, + "loss": 1.6252, + "step": 2698 + }, + { + "epoch": 0.3601547904990659, + "grad_norm": 1.0280588719731665, + "learning_rate": 1.4807914062332307e-05, + "loss": 1.6583, + "step": 2699 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 1.0944712865857813, + "learning_rate": 1.4804124015941528e-05, + "loss": 1.6016, + "step": 2700 + }, + { + "epoch": 0.36042167066986924, + "grad_norm": 1.0990375701438984, + "learning_rate": 1.480033307219508e-05, + "loss": 1.5977, + "step": 2701 + }, + { + "epoch": 0.36055511075527086, + "grad_norm": 1.0149776394646597, + "learning_rate": 1.4796541231801068e-05, + "loss": 1.6027, + "step": 2702 + }, + { + "epoch": 0.36068855084067253, + "grad_norm": 1.1355704225231722, + "learning_rate": 1.4792748495467763e-05, + "loss": 1.5975, + "step": 2703 + }, + { + "epoch": 0.3608219909260742, + "grad_norm": 2.0682004186698784, + "learning_rate": 1.4788954863903608e-05, + "loss": 1.5899, + "step": 2704 + }, + { + "epoch": 0.36095543101147587, + "grad_norm": 1.0954874367416438, + "learning_rate": 1.478516033781721e-05, + "loss": 1.5893, + "step": 2705 + }, + { + "epoch": 0.3610888710968775, + "grad_norm": 1.09549046341114, + "learning_rate": 1.4781364917917339e-05, + "loss": 1.6175, + "step": 2706 + }, + { + "epoch": 0.36122231118227915, + "grad_norm": 1.0665598499326367, + "learning_rate": 1.477756860491294e-05, + "loss": 1.6201, + "step": 2707 + }, + { + "epoch": 
0.3613557512676808, + "grad_norm": 1.0431226101624222, + "learning_rate": 1.4773771399513122e-05, + "loss": 1.5921, + "step": 2708 + }, + { + "epoch": 0.36148919135308244, + "grad_norm": 1.0347916658313776, + "learning_rate": 1.4769973302427154e-05, + "loss": 1.5922, + "step": 2709 + }, + { + "epoch": 0.3616226314384841, + "grad_norm": 1.188173513439476, + "learning_rate": 1.4766174314364486e-05, + "loss": 1.6227, + "step": 2710 + }, + { + "epoch": 0.3617560715238858, + "grad_norm": 1.1816851578943308, + "learning_rate": 1.476237443603472e-05, + "loss": 1.6272, + "step": 2711 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 1.047966632364183, + "learning_rate": 1.475857366814763e-05, + "loss": 1.6298, + "step": 2712 + }, + { + "epoch": 0.36202295169468907, + "grad_norm": 1.134317205650707, + "learning_rate": 1.4754772011413154e-05, + "loss": 1.5883, + "step": 2713 + }, + { + "epoch": 0.36215639178009074, + "grad_norm": 1.0021173494097093, + "learning_rate": 1.4750969466541407e-05, + "loss": 1.6369, + "step": 2714 + }, + { + "epoch": 0.3622898318654924, + "grad_norm": 1.1709267393721203, + "learning_rate": 1.4747166034242652e-05, + "loss": 1.5887, + "step": 2715 + }, + { + "epoch": 0.362423271950894, + "grad_norm": 1.1258130657900838, + "learning_rate": 1.4743361715227331e-05, + "loss": 1.6778, + "step": 2716 + }, + { + "epoch": 0.3625567120362957, + "grad_norm": 0.9885486941912983, + "learning_rate": 1.4739556510206047e-05, + "loss": 1.6048, + "step": 2717 + }, + { + "epoch": 0.36269015212169736, + "grad_norm": 1.0268383631231532, + "learning_rate": 1.473575041988957e-05, + "loss": 1.594, + "step": 2718 + }, + { + "epoch": 0.36282359220709903, + "grad_norm": 1.190424029103695, + "learning_rate": 1.4731943444988831e-05, + "loss": 1.63, + "step": 2719 + }, + { + "epoch": 0.36295703229250065, + "grad_norm": 1.0167446687335664, + "learning_rate": 1.4728135586214933e-05, + "loss": 1.6216, + "step": 2720 + }, + { + "epoch": 0.3630904723779023, + "grad_norm": 1.0972474419145468, + "learning_rate": 1.472432684427914e-05, + "loss": 1.6385, + "step": 2721 + }, + { + "epoch": 0.363223912463304, + "grad_norm": 1.1518104547155832, + "learning_rate": 1.4720517219892881e-05, + "loss": 1.6232, + "step": 2722 + }, + { + "epoch": 0.36335735254870566, + "grad_norm": 7.034867431932981, + "learning_rate": 1.4716706713767748e-05, + "loss": 1.5632, + "step": 2723 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 1.166832787708479, + "learning_rate": 1.4712895326615505e-05, + "loss": 1.607, + "step": 2724 + }, + { + "epoch": 0.36362423271950894, + "grad_norm": 1.0638457063513032, + "learning_rate": 1.4709083059148073e-05, + "loss": 1.6328, + "step": 2725 + }, + { + "epoch": 0.3637576728049106, + "grad_norm": 1.0079889035962148, + "learning_rate": 1.470526991207754e-05, + "loss": 1.6063, + "step": 2726 + }, + { + "epoch": 0.36389111289031223, + "grad_norm": 1.0596287476120059, + "learning_rate": 1.4701455886116159e-05, + "loss": 1.6502, + "step": 2727 + }, + { + "epoch": 0.3640245529757139, + "grad_norm": 0.9990070809309942, + "learning_rate": 1.4697640981976347e-05, + "loss": 1.6093, + "step": 2728 + }, + { + "epoch": 0.36415799306111557, + "grad_norm": 1.1369821739015482, + "learning_rate": 1.4693825200370682e-05, + "loss": 1.6204, + "step": 2729 + }, + { + "epoch": 0.36429143314651724, + "grad_norm": 1.1486606111537232, + "learning_rate": 1.4690008542011912e-05, + "loss": 1.656, + "step": 2730 + }, + { + "epoch": 0.36442487323191886, + "grad_norm": 1.038173101624409, + "learning_rate": 
1.4686191007612945e-05, + "loss": 1.5957, + "step": 2731 + }, + { + "epoch": 0.3645583133173205, + "grad_norm": 1.3383555292824745, + "learning_rate": 1.4682372597886851e-05, + "loss": 1.6269, + "step": 2732 + }, + { + "epoch": 0.3646917534027222, + "grad_norm": 1.1725746335257135, + "learning_rate": 1.4678553313546869e-05, + "loss": 1.6646, + "step": 2733 + }, + { + "epoch": 0.3648251934881238, + "grad_norm": 1.0117764785870433, + "learning_rate": 1.4674733155306392e-05, + "loss": 1.5871, + "step": 2734 + }, + { + "epoch": 0.3649586335735255, + "grad_norm": 1.139180055289969, + "learning_rate": 1.4670912123878987e-05, + "loss": 1.666, + "step": 2735 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 1.160600503319934, + "learning_rate": 1.4667090219978376e-05, + "loss": 1.6344, + "step": 2736 + }, + { + "epoch": 0.3652255137443288, + "grad_norm": 0.9804628531594025, + "learning_rate": 1.4663267444318451e-05, + "loss": 1.6504, + "step": 2737 + }, + { + "epoch": 0.36535895382973044, + "grad_norm": 1.2244168997082292, + "learning_rate": 1.465944379761326e-05, + "loss": 1.6267, + "step": 2738 + }, + { + "epoch": 0.3654923939151321, + "grad_norm": 1.142784430348776, + "learning_rate": 1.4655619280577015e-05, + "loss": 1.6029, + "step": 2739 + }, + { + "epoch": 0.3656258340005338, + "grad_norm": 1.1856642918086469, + "learning_rate": 1.46517938939241e-05, + "loss": 1.6084, + "step": 2740 + }, + { + "epoch": 0.3657592740859354, + "grad_norm": 1.1684003107608707, + "learning_rate": 1.4647967638369043e-05, + "loss": 1.673, + "step": 2741 + }, + { + "epoch": 0.36589271417133706, + "grad_norm": 0.9808643566324687, + "learning_rate": 1.4644140514626553e-05, + "loss": 1.6068, + "step": 2742 + }, + { + "epoch": 0.36602615425673873, + "grad_norm": 1.0096442361014633, + "learning_rate": 1.4640312523411491e-05, + "loss": 1.6395, + "step": 2743 + }, + { + "epoch": 0.3661595943421404, + "grad_norm": 1.0898350041048794, + "learning_rate": 1.4636483665438878e-05, + "loss": 1.7007, + "step": 2744 + }, + { + "epoch": 0.366293034427542, + "grad_norm": 0.9501423520521707, + "learning_rate": 1.4632653941423911e-05, + "loss": 1.5732, + "step": 2745 + }, + { + "epoch": 0.3664264745129437, + "grad_norm": 1.001779649980241, + "learning_rate": 1.4628823352081929e-05, + "loss": 1.6178, + "step": 2746 + }, + { + "epoch": 0.36655991459834536, + "grad_norm": 1.0032387536870389, + "learning_rate": 1.4624991898128445e-05, + "loss": 1.6319, + "step": 2747 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 0.9476089503337908, + "learning_rate": 1.4621159580279129e-05, + "loss": 1.5986, + "step": 2748 + }, + { + "epoch": 0.36682679476914865, + "grad_norm": 0.9672551178709096, + "learning_rate": 1.4617326399249823e-05, + "loss": 1.6172, + "step": 2749 + }, + { + "epoch": 0.3669602348545503, + "grad_norm": 0.9999506532379091, + "learning_rate": 1.4613492355756514e-05, + "loss": 1.6411, + "step": 2750 + }, + { + "epoch": 0.367093674939952, + "grad_norm": 1.0229999120124786, + "learning_rate": 1.460965745051536e-05, + "loss": 1.6105, + "step": 2751 + }, + { + "epoch": 0.3672271150253536, + "grad_norm": 1.008641986491956, + "learning_rate": 1.4605821684242674e-05, + "loss": 1.6731, + "step": 2752 + }, + { + "epoch": 0.36736055511075527, + "grad_norm": 1.042758475874373, + "learning_rate": 1.460198505765494e-05, + "loss": 1.554, + "step": 2753 + }, + { + "epoch": 0.36749399519615694, + "grad_norm": 1.1472564831402514, + "learning_rate": 1.4598147571468791e-05, + "loss": 1.6019, + "step": 2754 + }, + { + "epoch": 
0.36762743528155856, + "grad_norm": 0.9737879253423936, + "learning_rate": 1.4594309226401027e-05, + "loss": 1.6319, + "step": 2755 + }, + { + "epoch": 0.3677608753669602, + "grad_norm": 0.9632772149963865, + "learning_rate": 1.4590470023168611e-05, + "loss": 1.5865, + "step": 2756 + }, + { + "epoch": 0.3678943154523619, + "grad_norm": 1.1241458546313505, + "learning_rate": 1.4586629962488654e-05, + "loss": 1.6882, + "step": 2757 + }, + { + "epoch": 0.36802775553776357, + "grad_norm": 1.0034203806903836, + "learning_rate": 1.4582789045078445e-05, + "loss": 1.6177, + "step": 2758 + }, + { + "epoch": 0.3681611956231652, + "grad_norm": 0.9619488800895231, + "learning_rate": 1.4578947271655417e-05, + "loss": 1.6104, + "step": 2759 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 1.0707252512115273, + "learning_rate": 1.4575104642937175e-05, + "loss": 1.5266, + "step": 2760 + }, + { + "epoch": 0.3684280757939685, + "grad_norm": 1.0221303660590428, + "learning_rate": 1.4571261159641472e-05, + "loss": 1.6157, + "step": 2761 + }, + { + "epoch": 0.36856151587937014, + "grad_norm": 1.1856740576654383, + "learning_rate": 1.4567416822486233e-05, + "loss": 1.5843, + "step": 2762 + }, + { + "epoch": 0.3686949559647718, + "grad_norm": 1.057753625247735, + "learning_rate": 1.456357163218953e-05, + "loss": 1.6437, + "step": 2763 + }, + { + "epoch": 0.3688283960501735, + "grad_norm": 1.167606648878092, + "learning_rate": 1.4559725589469609e-05, + "loss": 1.62, + "step": 2764 + }, + { + "epoch": 0.36896183613557515, + "grad_norm": 1.0332270116796345, + "learning_rate": 1.4555878695044859e-05, + "loss": 1.5791, + "step": 2765 + }, + { + "epoch": 0.36909527622097676, + "grad_norm": 1.0273288838594588, + "learning_rate": 1.4552030949633839e-05, + "loss": 1.6575, + "step": 2766 + }, + { + "epoch": 0.36922871630637843, + "grad_norm": 1.0275198082813934, + "learning_rate": 1.4548182353955267e-05, + "loss": 1.626, + "step": 2767 + }, + { + "epoch": 0.3693621563917801, + "grad_norm": 1.009577517124903, + "learning_rate": 1.4544332908728011e-05, + "loss": 1.6139, + "step": 2768 + }, + { + "epoch": 0.3694955964771817, + "grad_norm": 0.9669422230954393, + "learning_rate": 1.454048261467111e-05, + "loss": 1.6449, + "step": 2769 + }, + { + "epoch": 0.3696290365625834, + "grad_norm": 1.2874584717461675, + "learning_rate": 1.453663147250375e-05, + "loss": 1.5875, + "step": 2770 + }, + { + "epoch": 0.36976247664798506, + "grad_norm": 1.025068484975409, + "learning_rate": 1.4532779482945283e-05, + "loss": 1.6252, + "step": 2771 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 1.005806168229964, + "learning_rate": 1.4528926646715215e-05, + "loss": 1.6093, + "step": 2772 + }, + { + "epoch": 0.37002935681878835, + "grad_norm": 1.208605409648711, + "learning_rate": 1.4525072964533213e-05, + "loss": 1.6543, + "step": 2773 + }, + { + "epoch": 0.37016279690419, + "grad_norm": 1.0895321534730344, + "learning_rate": 1.4521218437119105e-05, + "loss": 1.6162, + "step": 2774 + }, + { + "epoch": 0.3702962369895917, + "grad_norm": 0.9832287315998608, + "learning_rate": 1.4517363065192865e-05, + "loss": 1.5716, + "step": 2775 + }, + { + "epoch": 0.3704296770749933, + "grad_norm": 1.2767029129562657, + "learning_rate": 1.4513506849474639e-05, + "loss": 1.6252, + "step": 2776 + }, + { + "epoch": 0.370563117160395, + "grad_norm": 1.0660572092116536, + "learning_rate": 1.450964979068472e-05, + "loss": 1.5639, + "step": 2777 + }, + { + "epoch": 0.37069655724579664, + "grad_norm": 1.0431072621084843, + "learning_rate": 
1.4505791889543565e-05, + "loss": 1.6989, + "step": 2778 + }, + { + "epoch": 0.3708299973311983, + "grad_norm": 1.1890314142284257, + "learning_rate": 1.4501933146771785e-05, + "loss": 1.6471, + "step": 2779 + }, + { + "epoch": 0.37096343741659993, + "grad_norm": 1.1319054159207953, + "learning_rate": 1.449807356309015e-05, + "loss": 1.5851, + "step": 2780 + }, + { + "epoch": 0.3710968775020016, + "grad_norm": 1.0145901459250226, + "learning_rate": 1.4494213139219587e-05, + "loss": 1.6279, + "step": 2781 + }, + { + "epoch": 0.37123031758740327, + "grad_norm": 0.9600581548472692, + "learning_rate": 1.4490351875881174e-05, + "loss": 1.6292, + "step": 2782 + }, + { + "epoch": 0.3713637576728049, + "grad_norm": 1.1450325048073626, + "learning_rate": 1.448648977379616e-05, + "loss": 1.5946, + "step": 2783 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 0.945594443926737, + "learning_rate": 1.4482626833685936e-05, + "loss": 1.6552, + "step": 2784 + }, + { + "epoch": 0.3716306378436082, + "grad_norm": 1.168278859108492, + "learning_rate": 1.4478763056272054e-05, + "loss": 1.6365, + "step": 2785 + }, + { + "epoch": 0.3717640779290099, + "grad_norm": 1.108937200609111, + "learning_rate": 1.4474898442276226e-05, + "loss": 1.5822, + "step": 2786 + }, + { + "epoch": 0.3718975180144115, + "grad_norm": 0.9571836543010801, + "learning_rate": 1.447103299242032e-05, + "loss": 1.5651, + "step": 2787 + }, + { + "epoch": 0.3720309580998132, + "grad_norm": 10.157439322526868, + "learning_rate": 1.4467166707426357e-05, + "loss": 1.6417, + "step": 2788 + }, + { + "epoch": 0.37216439818521485, + "grad_norm": 1.0795077239195907, + "learning_rate": 1.4463299588016508e-05, + "loss": 1.6007, + "step": 2789 + }, + { + "epoch": 0.37229783827061647, + "grad_norm": 1.02760083176124, + "learning_rate": 1.4459431634913118e-05, + "loss": 1.5834, + "step": 2790 + }, + { + "epoch": 0.37243127835601814, + "grad_norm": 1.052624541464461, + "learning_rate": 1.4455562848838666e-05, + "loss": 1.5611, + "step": 2791 + }, + { + "epoch": 0.3725647184414198, + "grad_norm": 1.0230420491562466, + "learning_rate": 1.4451693230515807e-05, + "loss": 1.6217, + "step": 2792 + }, + { + "epoch": 0.3726981585268215, + "grad_norm": 0.9764342262244122, + "learning_rate": 1.4447822780667332e-05, + "loss": 1.6139, + "step": 2793 + }, + { + "epoch": 0.3728315986122231, + "grad_norm": 1.128603754261654, + "learning_rate": 1.4443951500016207e-05, + "loss": 1.6407, + "step": 2794 + }, + { + "epoch": 0.37296503869762476, + "grad_norm": 1.1192050512936182, + "learning_rate": 1.4440079389285535e-05, + "loss": 1.6054, + "step": 2795 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 1.0094200838680423, + "learning_rate": 1.4436206449198584e-05, + "loss": 1.584, + "step": 2796 + }, + { + "epoch": 0.3732319188684281, + "grad_norm": 1.0820268405383773, + "learning_rate": 1.443233268047878e-05, + "loss": 1.6154, + "step": 2797 + }, + { + "epoch": 0.3733653589538297, + "grad_norm": 1.0454550915071419, + "learning_rate": 1.4428458083849693e-05, + "loss": 1.5913, + "step": 2798 + }, + { + "epoch": 0.3734987990392314, + "grad_norm": 1.0114425617387492, + "learning_rate": 1.4424582660035059e-05, + "loss": 1.6053, + "step": 2799 + }, + { + "epoch": 0.37363223912463306, + "grad_norm": 1.0867361007007368, + "learning_rate": 1.4420706409758753e-05, + "loss": 1.6393, + "step": 2800 + }, + { + "epoch": 0.3737656792100347, + "grad_norm": 1.1845819474365904, + "learning_rate": 1.4416829333744828e-05, + "loss": 1.6132, + "step": 2801 + }, + { + "epoch": 
0.37389911929543634, + "grad_norm": 1.0205258166513704, + "learning_rate": 1.4412951432717467e-05, + "loss": 1.6227, + "step": 2802 + }, + { + "epoch": 0.374032559380838, + "grad_norm": 1.0438780709981974, + "learning_rate": 1.4409072707401024e-05, + "loss": 1.636, + "step": 2803 + }, + { + "epoch": 0.3741659994662397, + "grad_norm": 1.0589699296179769, + "learning_rate": 1.4405193158519998e-05, + "loss": 1.6119, + "step": 2804 + }, + { + "epoch": 0.3742994395516413, + "grad_norm": 0.9619237433348188, + "learning_rate": 1.4401312786799044e-05, + "loss": 1.6066, + "step": 2805 + }, + { + "epoch": 0.37443287963704297, + "grad_norm": 1.0797086642870346, + "learning_rate": 1.4397431592962974e-05, + "loss": 1.5915, + "step": 2806 + }, + { + "epoch": 0.37456631972244464, + "grad_norm": 0.9662518764136966, + "learning_rate": 1.439354957773675e-05, + "loss": 1.6207, + "step": 2807 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 1.1237578053297503, + "learning_rate": 1.4389666741845485e-05, + "loss": 1.6194, + "step": 2808 + }, + { + "epoch": 0.3748331998932479, + "grad_norm": 0.9840735377034793, + "learning_rate": 1.4385783086014451e-05, + "loss": 1.6028, + "step": 2809 + }, + { + "epoch": 0.3749666399786496, + "grad_norm": 1.0118695314963424, + "learning_rate": 1.4381898610969071e-05, + "loss": 1.5789, + "step": 2810 + }, + { + "epoch": 0.37510008006405127, + "grad_norm": 1.2580488826742735, + "learning_rate": 1.437801331743492e-05, + "loss": 1.6479, + "step": 2811 + }, + { + "epoch": 0.3752335201494529, + "grad_norm": 1.0726283441727469, + "learning_rate": 1.4374127206137727e-05, + "loss": 1.6275, + "step": 2812 + }, + { + "epoch": 0.37536696023485455, + "grad_norm": 1.0383966783564913, + "learning_rate": 1.4370240277803374e-05, + "loss": 1.6389, + "step": 2813 + }, + { + "epoch": 0.3755004003202562, + "grad_norm": 1.323435758142834, + "learning_rate": 1.4366352533157893e-05, + "loss": 1.5959, + "step": 2814 + }, + { + "epoch": 0.37563384040565784, + "grad_norm": 1.1012025284439741, + "learning_rate": 1.4362463972927472e-05, + "loss": 1.5828, + "step": 2815 + }, + { + "epoch": 0.3757672804910595, + "grad_norm": 1.1171657662365868, + "learning_rate": 1.4358574597838448e-05, + "loss": 1.6421, + "step": 2816 + }, + { + "epoch": 0.3759007205764612, + "grad_norm": 1.0544176694187513, + "learning_rate": 1.4354684408617316e-05, + "loss": 1.6215, + "step": 2817 + }, + { + "epoch": 0.37603416066186285, + "grad_norm": 1.0414279460927407, + "learning_rate": 1.4350793405990716e-05, + "loss": 1.5807, + "step": 2818 + }, + { + "epoch": 0.37616760074726446, + "grad_norm": 1.1376881831993566, + "learning_rate": 1.434690159068544e-05, + "loss": 1.6808, + "step": 2819 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 1.0250101070023419, + "learning_rate": 1.4343008963428442e-05, + "loss": 1.6021, + "step": 2820 + }, + { + "epoch": 0.3764344809180678, + "grad_norm": 1.1002348920804779, + "learning_rate": 1.4339115524946815e-05, + "loss": 1.6074, + "step": 2821 + }, + { + "epoch": 0.3765679210034694, + "grad_norm": 1.0123930431013084, + "learning_rate": 1.4335221275967812e-05, + "loss": 1.6426, + "step": 2822 + }, + { + "epoch": 0.3767013610888711, + "grad_norm": 1.2209497008197363, + "learning_rate": 1.433132621721883e-05, + "loss": 1.6471, + "step": 2823 + }, + { + "epoch": 0.37683480117427276, + "grad_norm": 1.0189131304238113, + "learning_rate": 1.432743034942743e-05, + "loss": 1.6218, + "step": 2824 + }, + { + "epoch": 0.37696824125967443, + "grad_norm": 1.0178791123141788, + "learning_rate": 
1.4323533673321304e-05, + "loss": 1.6242, + "step": 2825 + }, + { + "epoch": 0.37710168134507605, + "grad_norm": 0.9991762214209088, + "learning_rate": 1.4319636189628316e-05, + "loss": 1.6042, + "step": 2826 + }, + { + "epoch": 0.3772351214304777, + "grad_norm": 1.016918051574137, + "learning_rate": 1.431573789907647e-05, + "loss": 1.6255, + "step": 2827 + }, + { + "epoch": 0.3773685615158794, + "grad_norm": 0.968894964593842, + "learning_rate": 1.4311838802393921e-05, + "loss": 1.5977, + "step": 2828 + }, + { + "epoch": 0.377502001601281, + "grad_norm": 0.9868572481236255, + "learning_rate": 1.4307938900308971e-05, + "loss": 1.6564, + "step": 2829 + }, + { + "epoch": 0.37763544168668267, + "grad_norm": 1.6711098134309899, + "learning_rate": 1.4304038193550087e-05, + "loss": 1.6036, + "step": 2830 + }, + { + "epoch": 0.37776888177208434, + "grad_norm": 1.2720681532176408, + "learning_rate": 1.4300136682845874e-05, + "loss": 1.6172, + "step": 2831 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 0.9972549450196833, + "learning_rate": 1.4296234368925082e-05, + "loss": 1.6582, + "step": 2832 + }, + { + "epoch": 0.3780357619428876, + "grad_norm": 1.033670572949725, + "learning_rate": 1.4292331252516627e-05, + "loss": 1.598, + "step": 2833 + }, + { + "epoch": 0.3781692020282893, + "grad_norm": 1.3209056443094034, + "learning_rate": 1.4288427334349562e-05, + "loss": 1.5872, + "step": 2834 + }, + { + "epoch": 0.37830264211369097, + "grad_norm": 1.00727971762687, + "learning_rate": 1.4284522615153098e-05, + "loss": 1.6444, + "step": 2835 + }, + { + "epoch": 0.3784360821990926, + "grad_norm": 1.1341291826363633, + "learning_rate": 1.4280617095656591e-05, + "loss": 1.6396, + "step": 2836 + }, + { + "epoch": 0.37856952228449425, + "grad_norm": 1.088875710596739, + "learning_rate": 1.4276710776589546e-05, + "loss": 1.5717, + "step": 2837 + }, + { + "epoch": 0.3787029623698959, + "grad_norm": 1.5266027482348503, + "learning_rate": 1.4272803658681622e-05, + "loss": 1.6019, + "step": 2838 + }, + { + "epoch": 0.3788364024552976, + "grad_norm": 1.1012146674109902, + "learning_rate": 1.4268895742662618e-05, + "loss": 1.6891, + "step": 2839 + }, + { + "epoch": 0.3789698425406992, + "grad_norm": 1.2571923095061812, + "learning_rate": 1.4264987029262497e-05, + "loss": 1.637, + "step": 2840 + }, + { + "epoch": 0.3791032826261009, + "grad_norm": 1.0068568438867096, + "learning_rate": 1.4261077519211358e-05, + "loss": 1.5796, + "step": 2841 + }, + { + "epoch": 0.37923672271150255, + "grad_norm": 1.008009465786665, + "learning_rate": 1.4257167213239451e-05, + "loss": 1.636, + "step": 2842 + }, + { + "epoch": 0.37937016279690416, + "grad_norm": 1.0065398200515008, + "learning_rate": 1.4253256112077176e-05, + "loss": 1.6736, + "step": 2843 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 0.9492387619872094, + "learning_rate": 1.4249344216455085e-05, + "loss": 1.632, + "step": 2844 + }, + { + "epoch": 0.3796370429677075, + "grad_norm": 0.9873644452435473, + "learning_rate": 1.4245431527103879e-05, + "loss": 1.5646, + "step": 2845 + }, + { + "epoch": 0.3797704830531092, + "grad_norm": 0.9940129192913516, + "learning_rate": 1.4241518044754397e-05, + "loss": 1.6068, + "step": 2846 + }, + { + "epoch": 0.3799039231385108, + "grad_norm": 0.9786673958035172, + "learning_rate": 1.4237603770137635e-05, + "loss": 1.6508, + "step": 2847 + }, + { + "epoch": 0.38003736322391246, + "grad_norm": 1.0669464495332486, + "learning_rate": 1.4233688703984737e-05, + "loss": 1.5973, + "step": 2848 + }, + { + "epoch": 
0.38017080330931413, + "grad_norm": 0.9699464474368997, + "learning_rate": 1.4229772847026993e-05, + "loss": 1.5928, + "step": 2849 + }, + { + "epoch": 0.38030424339471575, + "grad_norm": 0.9602245627811605, + "learning_rate": 1.4225856199995836e-05, + "loss": 1.6669, + "step": 2850 + }, + { + "epoch": 0.3804376834801174, + "grad_norm": 1.0000207901051725, + "learning_rate": 1.4221938763622856e-05, + "loss": 1.5871, + "step": 2851 + }, + { + "epoch": 0.3805711235655191, + "grad_norm": 1.1884986045536179, + "learning_rate": 1.4218020538639782e-05, + "loss": 1.6184, + "step": 2852 + }, + { + "epoch": 0.38070456365092076, + "grad_norm": 1.0365928654144692, + "learning_rate": 1.4214101525778495e-05, + "loss": 1.665, + "step": 2853 + }, + { + "epoch": 0.3808380037363224, + "grad_norm": 0.9983952327474825, + "learning_rate": 1.4210181725771026e-05, + "loss": 1.6284, + "step": 2854 + }, + { + "epoch": 0.38097144382172404, + "grad_norm": 1.0013533383795308, + "learning_rate": 1.420626113934954e-05, + "loss": 1.6141, + "step": 2855 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 1.1423554042414863, + "learning_rate": 1.4202339767246367e-05, + "loss": 1.6336, + "step": 2856 + }, + { + "epoch": 0.38123832399252733, + "grad_norm": 0.9703881563875031, + "learning_rate": 1.419841761019397e-05, + "loss": 1.6393, + "step": 2857 + }, + { + "epoch": 0.381371764077929, + "grad_norm": 0.9642652272603265, + "learning_rate": 1.4194494668924966e-05, + "loss": 1.6384, + "step": 2858 + }, + { + "epoch": 0.38150520416333067, + "grad_norm": 1.131226033110534, + "learning_rate": 1.419057094417211e-05, + "loss": 1.6435, + "step": 2859 + }, + { + "epoch": 0.38163864424873234, + "grad_norm": 0.9917335521919108, + "learning_rate": 1.4186646436668312e-05, + "loss": 1.6202, + "step": 2860 + }, + { + "epoch": 0.38177208433413395, + "grad_norm": 1.097375636507755, + "learning_rate": 1.4182721147146633e-05, + "loss": 1.7061, + "step": 2861 + }, + { + "epoch": 0.3819055244195356, + "grad_norm": 0.9975696907135774, + "learning_rate": 1.4178795076340259e-05, + "loss": 1.6118, + "step": 2862 + }, + { + "epoch": 0.3820389645049373, + "grad_norm": 0.9870041753998313, + "learning_rate": 1.4174868224982547e-05, + "loss": 1.6137, + "step": 2863 + }, + { + "epoch": 0.38217240459033897, + "grad_norm": 1.165292789080929, + "learning_rate": 1.417094059380698e-05, + "loss": 1.6004, + "step": 2864 + }, + { + "epoch": 0.3823058446757406, + "grad_norm": 1.145639325521145, + "learning_rate": 1.4167012183547203e-05, + "loss": 1.6066, + "step": 2865 + }, + { + "epoch": 0.38243928476114225, + "grad_norm": 1.1503077171125726, + "learning_rate": 1.4163082994936988e-05, + "loss": 1.615, + "step": 2866 + }, + { + "epoch": 0.3825727248465439, + "grad_norm": 1.0404542214973824, + "learning_rate": 1.4159153028710268e-05, + "loss": 1.6169, + "step": 2867 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 0.996004145579702, + "learning_rate": 1.415522228560112e-05, + "loss": 1.6029, + "step": 2868 + }, + { + "epoch": 0.3828396050173472, + "grad_norm": 1.2254062428716475, + "learning_rate": 1.4151290766343752e-05, + "loss": 1.6211, + "step": 2869 + }, + { + "epoch": 0.3829730451027489, + "grad_norm": 0.992077891417151, + "learning_rate": 1.4147358471672541e-05, + "loss": 1.5512, + "step": 2870 + }, + { + "epoch": 0.38310648518815055, + "grad_norm": 0.952569959666935, + "learning_rate": 1.4143425402321982e-05, + "loss": 1.6052, + "step": 2871 + }, + { + "epoch": 0.38323992527355216, + "grad_norm": 1.2306890442084908, + "learning_rate": 
1.4139491559026732e-05, + "loss": 1.5909, + "step": 2872 + }, + { + "epoch": 0.38337336535895383, + "grad_norm": 0.9759054239574055, + "learning_rate": 1.4135556942521594e-05, + "loss": 1.6061, + "step": 2873 + }, + { + "epoch": 0.3835068054443555, + "grad_norm": 0.9913047688419279, + "learning_rate": 1.4131621553541499e-05, + "loss": 1.6104, + "step": 2874 + }, + { + "epoch": 0.3836402455297571, + "grad_norm": 1.246242626432296, + "learning_rate": 1.412768539282154e-05, + "loss": 1.6091, + "step": 2875 + }, + { + "epoch": 0.3837736856151588, + "grad_norm": 0.9433807734418067, + "learning_rate": 1.4123748461096942e-05, + "loss": 1.6305, + "step": 2876 + }, + { + "epoch": 0.38390712570056046, + "grad_norm": 0.982212397205281, + "learning_rate": 1.4119810759103088e-05, + "loss": 1.6221, + "step": 2877 + }, + { + "epoch": 0.38404056578596213, + "grad_norm": 1.0575127644185438, + "learning_rate": 1.4115872287575485e-05, + "loss": 1.602, + "step": 2878 + }, + { + "epoch": 0.38417400587136374, + "grad_norm": 1.1374999128456018, + "learning_rate": 1.4111933047249802e-05, + "loss": 1.6081, + "step": 2879 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 9.571339510483169, + "learning_rate": 1.410799303886184e-05, + "loss": 1.6055, + "step": 2880 + }, + { + "epoch": 0.3844408860421671, + "grad_norm": 1.145304347974641, + "learning_rate": 1.4104052263147553e-05, + "loss": 1.6612, + "step": 2881 + }, + { + "epoch": 0.3845743261275687, + "grad_norm": 1.099466074032018, + "learning_rate": 1.4100110720843025e-05, + "loss": 1.5704, + "step": 2882 + }, + { + "epoch": 0.38470776621297037, + "grad_norm": 1.0509418929003724, + "learning_rate": 1.4096168412684497e-05, + "loss": 1.648, + "step": 2883 + }, + { + "epoch": 0.38484120629837204, + "grad_norm": 1.1230867470007317, + "learning_rate": 1.409222533940835e-05, + "loss": 1.6401, + "step": 2884 + }, + { + "epoch": 0.3849746463837737, + "grad_norm": 1.007807394086856, + "learning_rate": 1.4088281501751095e-05, + "loss": 1.6452, + "step": 2885 + }, + { + "epoch": 0.3851080864691753, + "grad_norm": 1.061247642675541, + "learning_rate": 1.408433690044941e-05, + "loss": 1.6049, + "step": 2886 + }, + { + "epoch": 0.385241526554577, + "grad_norm": 1.0601502148961404, + "learning_rate": 1.4080391536240088e-05, + "loss": 1.6375, + "step": 2887 + }, + { + "epoch": 0.38537496663997867, + "grad_norm": 0.9817836229390783, + "learning_rate": 1.4076445409860086e-05, + "loss": 1.6106, + "step": 2888 + }, + { + "epoch": 0.3855084067253803, + "grad_norm": 1.2826160394488593, + "learning_rate": 1.4072498522046494e-05, + "loss": 1.6531, + "step": 2889 + }, + { + "epoch": 0.38564184681078195, + "grad_norm": 1.201976231365931, + "learning_rate": 1.4068550873536543e-05, + "loss": 1.6206, + "step": 2890 + }, + { + "epoch": 0.3857752868961836, + "grad_norm": 1.026733475170239, + "learning_rate": 1.4064602465067613e-05, + "loss": 1.6505, + "step": 2891 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 1.236834192825807, + "learning_rate": 1.406065329737722e-05, + "loss": 1.6342, + "step": 2892 + }, + { + "epoch": 0.3860421670669869, + "grad_norm": 1.150715310952092, + "learning_rate": 1.4056703371203023e-05, + "loss": 1.6205, + "step": 2893 + }, + { + "epoch": 0.3861756071523886, + "grad_norm": 1.0642046173241948, + "learning_rate": 1.4052752687282824e-05, + "loss": 1.6052, + "step": 2894 + }, + { + "epoch": 0.38630904723779025, + "grad_norm": 1.0978042763206004, + "learning_rate": 1.4048801246354567e-05, + "loss": 1.6058, + "step": 2895 + }, + { + "epoch": 
0.38644248732319186, + "grad_norm": 1.177880101416564, + "learning_rate": 1.4044849049156328e-05, + "loss": 1.6127, + "step": 2896 + }, + { + "epoch": 0.38657592740859353, + "grad_norm": 1.017132075267852, + "learning_rate": 1.4040896096426346e-05, + "loss": 1.6143, + "step": 2897 + }, + { + "epoch": 0.3867093674939952, + "grad_norm": 1.0501232630820656, + "learning_rate": 1.4036942388902976e-05, + "loss": 1.5509, + "step": 2898 + }, + { + "epoch": 0.3868428075793969, + "grad_norm": 1.2571254734715342, + "learning_rate": 1.403298792732473e-05, + "loss": 1.6453, + "step": 2899 + }, + { + "epoch": 0.3869762476647985, + "grad_norm": 0.9875221484898435, + "learning_rate": 1.4029032712430262e-05, + "loss": 1.596, + "step": 2900 + }, + { + "epoch": 0.38710968775020016, + "grad_norm": 1.1285508086710352, + "learning_rate": 1.4025076744958348e-05, + "loss": 1.6236, + "step": 2901 + }, + { + "epoch": 0.38724312783560183, + "grad_norm": 1.2197035948818218, + "learning_rate": 1.4021120025647932e-05, + "loss": 1.5706, + "step": 2902 + }, + { + "epoch": 0.38737656792100345, + "grad_norm": 1.0890256309645825, + "learning_rate": 1.4017162555238072e-05, + "loss": 1.5571, + "step": 2903 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 1.080695243503158, + "learning_rate": 1.4013204334467987e-05, + "loss": 1.6454, + "step": 2904 + }, + { + "epoch": 0.3876434480918068, + "grad_norm": 0.9814101368260878, + "learning_rate": 1.4009245364077024e-05, + "loss": 1.6153, + "step": 2905 + }, + { + "epoch": 0.38777688817720846, + "grad_norm": 0.9904485921133632, + "learning_rate": 1.4005285644804673e-05, + "loss": 1.6276, + "step": 2906 + }, + { + "epoch": 0.38791032826261007, + "grad_norm": 0.9972089960040826, + "learning_rate": 1.4001325177390565e-05, + "loss": 1.6205, + "step": 2907 + }, + { + "epoch": 0.38804376834801174, + "grad_norm": 1.2730345239020744, + "learning_rate": 1.3997363962574473e-05, + "loss": 1.6412, + "step": 2908 + }, + { + "epoch": 0.3881772084334134, + "grad_norm": 0.9675278590077582, + "learning_rate": 1.3993402001096304e-05, + "loss": 1.6351, + "step": 2909 + }, + { + "epoch": 0.388310648518815, + "grad_norm": 1.0292534070839274, + "learning_rate": 1.3989439293696105e-05, + "loss": 1.6089, + "step": 2910 + }, + { + "epoch": 0.3884440886042167, + "grad_norm": 1.019759996442744, + "learning_rate": 1.3985475841114071e-05, + "loss": 1.63, + "step": 2911 + }, + { + "epoch": 0.38857752868961837, + "grad_norm": 0.9807744844897855, + "learning_rate": 1.3981511644090523e-05, + "loss": 1.6103, + "step": 2912 + }, + { + "epoch": 0.38871096877502004, + "grad_norm": 1.025209057661364, + "learning_rate": 1.3977546703365934e-05, + "loss": 1.5834, + "step": 2913 + }, + { + "epoch": 0.38884440886042165, + "grad_norm": 1.089409574851773, + "learning_rate": 1.3973581019680906e-05, + "loss": 1.707, + "step": 2914 + }, + { + "epoch": 0.3889778489458233, + "grad_norm": 0.9724916772280182, + "learning_rate": 1.3969614593776184e-05, + "loss": 1.6204, + "step": 2915 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 1.007702700819211, + "learning_rate": 1.3965647426392653e-05, + "loss": 1.6122, + "step": 2916 + }, + { + "epoch": 0.3892447291166266, + "grad_norm": 0.9419094811442577, + "learning_rate": 1.396167951827133e-05, + "loss": 1.5903, + "step": 2917 + }, + { + "epoch": 0.3893781692020283, + "grad_norm": 1.0374726700116226, + "learning_rate": 1.395771087015338e-05, + "loss": 1.6066, + "step": 2918 + }, + { + "epoch": 0.38951160928742995, + "grad_norm": 1.0328706825533176, + "learning_rate": 
1.3953741482780102e-05, + "loss": 1.5566, + "step": 2919 + }, + { + "epoch": 0.3896450493728316, + "grad_norm": 1.2728567621408549, + "learning_rate": 1.3949771356892927e-05, + "loss": 1.6384, + "step": 2920 + }, + { + "epoch": 0.38977848945823323, + "grad_norm": 1.0123347440431487, + "learning_rate": 1.3945800493233432e-05, + "loss": 1.5981, + "step": 2921 + }, + { + "epoch": 0.3899119295436349, + "grad_norm": 0.991927710941825, + "learning_rate": 1.3941828892543332e-05, + "loss": 1.6323, + "step": 2922 + }, + { + "epoch": 0.3900453696290366, + "grad_norm": 0.9860434334934471, + "learning_rate": 1.3937856555564472e-05, + "loss": 1.5993, + "step": 2923 + }, + { + "epoch": 0.3901788097144382, + "grad_norm": 1.1272050680874104, + "learning_rate": 1.3933883483038843e-05, + "loss": 1.6889, + "step": 2924 + }, + { + "epoch": 0.39031224979983986, + "grad_norm": 1.013248475393018, + "learning_rate": 1.392990967570857e-05, + "loss": 1.5832, + "step": 2925 + }, + { + "epoch": 0.39044568988524153, + "grad_norm": 0.9972907037837129, + "learning_rate": 1.392593513431591e-05, + "loss": 1.6312, + "step": 2926 + }, + { + "epoch": 0.3905791299706432, + "grad_norm": 1.0368488326806515, + "learning_rate": 1.392195985960327e-05, + "loss": 1.593, + "step": 2927 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 0.9818098293390851, + "learning_rate": 1.3917983852313174e-05, + "loss": 1.6771, + "step": 2928 + }, + { + "epoch": 0.3908460101414465, + "grad_norm": 0.9843368644401832, + "learning_rate": 1.391400711318831e-05, + "loss": 1.6287, + "step": 2929 + }, + { + "epoch": 0.39097945022684816, + "grad_norm": 1.1575289949746161, + "learning_rate": 1.3910029642971473e-05, + "loss": 1.5908, + "step": 2930 + }, + { + "epoch": 0.3911128903122498, + "grad_norm": 1.0086519987992577, + "learning_rate": 1.390605144240562e-05, + "loss": 1.5931, + "step": 2931 + }, + { + "epoch": 0.39124633039765144, + "grad_norm": 0.9903875800619729, + "learning_rate": 1.390207251223383e-05, + "loss": 1.5842, + "step": 2932 + }, + { + "epoch": 0.3913797704830531, + "grad_norm": 0.9577055345082642, + "learning_rate": 1.3898092853199318e-05, + "loss": 1.6194, + "step": 2933 + }, + { + "epoch": 0.3915132105684548, + "grad_norm": 1.2357946602483, + "learning_rate": 1.3894112466045448e-05, + "loss": 1.6087, + "step": 2934 + }, + { + "epoch": 0.3916466506538564, + "grad_norm": 1.0769210602839947, + "learning_rate": 1.3890131351515703e-05, + "loss": 1.6153, + "step": 2935 + }, + { + "epoch": 0.39178009073925807, + "grad_norm": 1.10534271606038, + "learning_rate": 1.388614951035371e-05, + "loss": 1.6172, + "step": 2936 + }, + { + "epoch": 0.39191353082465974, + "grad_norm": 1.0791728229209796, + "learning_rate": 1.3882166943303239e-05, + "loss": 1.5885, + "step": 2937 + }, + { + "epoch": 0.3920469709100614, + "grad_norm": 1.048943382922718, + "learning_rate": 1.387818365110818e-05, + "loss": 1.6466, + "step": 2938 + }, + { + "epoch": 0.392180410995463, + "grad_norm": 0.9775623637977353, + "learning_rate": 1.3874199634512568e-05, + "loss": 1.6842, + "step": 2939 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 1.2823226827585823, + "learning_rate": 1.3870214894260577e-05, + "loss": 1.6346, + "step": 2940 + }, + { + "epoch": 0.39244729116626637, + "grad_norm": 2.9554542149643392, + "learning_rate": 1.3866229431096506e-05, + "loss": 1.631, + "step": 2941 + }, + { + "epoch": 0.392580731251668, + "grad_norm": 1.4313180292149659, + "learning_rate": 1.3862243245764795e-05, + "loss": 1.5588, + "step": 2942 + }, + { + "epoch": 
0.39271417133706965, + "grad_norm": 1.0226581878161995, + "learning_rate": 1.3858256339010021e-05, + "loss": 1.5696, + "step": 2943 + }, + { + "epoch": 0.3928476114224713, + "grad_norm": 1.203432785432225, + "learning_rate": 1.385426871157689e-05, + "loss": 1.5932, + "step": 2944 + }, + { + "epoch": 0.392981051507873, + "grad_norm": 1.2211552689992726, + "learning_rate": 1.3850280364210246e-05, + "loss": 1.6112, + "step": 2945 + }, + { + "epoch": 0.3931144915932746, + "grad_norm": 1.0952032810522179, + "learning_rate": 1.3846291297655066e-05, + "loss": 1.6057, + "step": 2946 + }, + { + "epoch": 0.3932479316786763, + "grad_norm": 1.1552204218064228, + "learning_rate": 1.3842301512656465e-05, + "loss": 1.6422, + "step": 2947 + }, + { + "epoch": 0.39338137176407795, + "grad_norm": 1.0183412684546411, + "learning_rate": 1.3838311009959686e-05, + "loss": 1.5689, + "step": 2948 + }, + { + "epoch": 0.39351481184947956, + "grad_norm": 1.047132191195357, + "learning_rate": 1.3834319790310113e-05, + "loss": 1.6462, + "step": 2949 + }, + { + "epoch": 0.39364825193488123, + "grad_norm": 0.9755539464048916, + "learning_rate": 1.3830327854453258e-05, + "loss": 1.6675, + "step": 2950 + }, + { + "epoch": 0.3937816920202829, + "grad_norm": 1.035910398586731, + "learning_rate": 1.3826335203134768e-05, + "loss": 1.6491, + "step": 2951 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 1.0231501045177207, + "learning_rate": 1.382234183710043e-05, + "loss": 1.644, + "step": 2952 + }, + { + "epoch": 0.3940485721910862, + "grad_norm": 1.0110762267018458, + "learning_rate": 1.3818347757096152e-05, + "loss": 1.6056, + "step": 2953 + }, + { + "epoch": 0.39418201227648786, + "grad_norm": 1.9202250806838923, + "learning_rate": 1.381435296386799e-05, + "loss": 1.5915, + "step": 2954 + }, + { + "epoch": 0.39431545236188953, + "grad_norm": 1.06141099974795, + "learning_rate": 1.381035745816212e-05, + "loss": 1.6049, + "step": 2955 + }, + { + "epoch": 0.39444889244729114, + "grad_norm": 1.0770580734734054, + "learning_rate": 1.3806361240724862e-05, + "loss": 1.6341, + "step": 2956 + }, + { + "epoch": 0.3945823325326928, + "grad_norm": 1.0439072213146445, + "learning_rate": 1.3802364312302659e-05, + "loss": 1.6512, + "step": 2957 + }, + { + "epoch": 0.3947157726180945, + "grad_norm": 1.0133974448550223, + "learning_rate": 1.3798366673642095e-05, + "loss": 1.6098, + "step": 2958 + }, + { + "epoch": 0.39484921270349616, + "grad_norm": 1.0228026592925512, + "learning_rate": 1.3794368325489881e-05, + "loss": 1.6292, + "step": 2959 + }, + { + "epoch": 0.39498265278889777, + "grad_norm": 1.2383549797377225, + "learning_rate": 1.3790369268592865e-05, + "loss": 1.6379, + "step": 2960 + }, + { + "epoch": 0.39511609287429944, + "grad_norm": 1.0527485547831446, + "learning_rate": 1.3786369503698024e-05, + "loss": 1.6094, + "step": 2961 + }, + { + "epoch": 0.3952495329597011, + "grad_norm": 1.2613117991115332, + "learning_rate": 1.3782369031552469e-05, + "loss": 1.6241, + "step": 2962 + }, + { + "epoch": 0.3953829730451027, + "grad_norm": 1.0530326147165556, + "learning_rate": 1.3778367852903443e-05, + "loss": 1.6379, + "step": 2963 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 2.2697531891999043, + "learning_rate": 1.3774365968498323e-05, + "loss": 1.6122, + "step": 2964 + }, + { + "epoch": 0.39564985321590607, + "grad_norm": 1.0088488119923664, + "learning_rate": 1.377036337908461e-05, + "loss": 1.6069, + "step": 2965 + }, + { + "epoch": 0.39578329330130774, + "grad_norm": 1.1919777045176008, + "learning_rate": 
1.3766360085409947e-05, + "loss": 1.6282, + "step": 2966 + }, + { + "epoch": 0.39591673338670935, + "grad_norm": 1.0394509374153975, + "learning_rate": 1.37623560882221e-05, + "loss": 1.6261, + "step": 2967 + }, + { + "epoch": 0.396050173472111, + "grad_norm": 1.4778314816259879, + "learning_rate": 1.375835138826897e-05, + "loss": 1.5999, + "step": 2968 + }, + { + "epoch": 0.3961836135575127, + "grad_norm": 1.076372619162072, + "learning_rate": 1.3754345986298594e-05, + "loss": 1.63, + "step": 2969 + }, + { + "epoch": 0.3963170536429143, + "grad_norm": 1.0029057794554, + "learning_rate": 1.3750339883059132e-05, + "loss": 1.5545, + "step": 2970 + }, + { + "epoch": 0.396450493728316, + "grad_norm": 0.9763005134113368, + "learning_rate": 1.3746333079298882e-05, + "loss": 1.6233, + "step": 2971 + }, + { + "epoch": 0.39658393381371765, + "grad_norm": 0.9997766940306253, + "learning_rate": 1.374232557576626e-05, + "loss": 1.6042, + "step": 2972 + }, + { + "epoch": 0.3967173738991193, + "grad_norm": 1.081981802944108, + "learning_rate": 1.3738317373209833e-05, + "loss": 1.6644, + "step": 2973 + }, + { + "epoch": 0.39685081398452093, + "grad_norm": 0.9784210671800242, + "learning_rate": 1.3734308472378281e-05, + "loss": 1.6678, + "step": 2974 + }, + { + "epoch": 0.3969842540699226, + "grad_norm": 1.0089465304994234, + "learning_rate": 1.3730298874020424e-05, + "loss": 1.5872, + "step": 2975 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 0.9897744114028344, + "learning_rate": 1.372628857888521e-05, + "loss": 1.6752, + "step": 2976 + }, + { + "epoch": 0.3972511342407259, + "grad_norm": 1.0113015313370548, + "learning_rate": 1.3722277587721713e-05, + "loss": 1.6196, + "step": 2977 + }, + { + "epoch": 0.39738457432612756, + "grad_norm": 1.248353362601478, + "learning_rate": 1.3718265901279144e-05, + "loss": 1.6446, + "step": 2978 + }, + { + "epoch": 0.39751801441152923, + "grad_norm": 0.9976910628273125, + "learning_rate": 1.3714253520306835e-05, + "loss": 1.6559, + "step": 2979 + }, + { + "epoch": 0.3976514544969309, + "grad_norm": 0.9395020657372218, + "learning_rate": 1.371024044555426e-05, + "loss": 1.5507, + "step": 2980 + }, + { + "epoch": 0.3977848945823325, + "grad_norm": 1.059218486991219, + "learning_rate": 1.3706226677771011e-05, + "loss": 1.6096, + "step": 2981 + }, + { + "epoch": 0.3979183346677342, + "grad_norm": 1.1913088141870016, + "learning_rate": 1.3702212217706819e-05, + "loss": 1.605, + "step": 2982 + }, + { + "epoch": 0.39805177475313586, + "grad_norm": 1.0112436026820137, + "learning_rate": 1.3698197066111529e-05, + "loss": 1.6008, + "step": 2983 + }, + { + "epoch": 0.39818521483853747, + "grad_norm": 1.2284860321693414, + "learning_rate": 1.3694181223735138e-05, + "loss": 1.6154, + "step": 2984 + }, + { + "epoch": 0.39831865492393914, + "grad_norm": 1.0098640773316505, + "learning_rate": 1.3690164691327752e-05, + "loss": 1.6321, + "step": 2985 + }, + { + "epoch": 0.3984520950093408, + "grad_norm": 0.9717411694382783, + "learning_rate": 1.3686147469639616e-05, + "loss": 1.6847, + "step": 2986 + }, + { + "epoch": 0.3985855350947425, + "grad_norm": 0.9284579466010013, + "learning_rate": 1.3682129559421102e-05, + "loss": 1.5653, + "step": 2987 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 1.0645254671871938, + "learning_rate": 1.3678110961422708e-05, + "loss": 1.5928, + "step": 2988 + }, + { + "epoch": 0.39885241526554577, + "grad_norm": 1.1099047312189834, + "learning_rate": 1.3674091676395067e-05, + "loss": 1.5968, + "step": 2989 + }, + { + "epoch": 
0.39898585535094744, + "grad_norm": 1.0039809381615434, + "learning_rate": 1.3670071705088925e-05, + "loss": 1.5968, + "step": 2990 + }, + { + "epoch": 0.39911929543634905, + "grad_norm": 1.1183712241615666, + "learning_rate": 1.366605104825518e-05, + "loss": 1.5859, + "step": 2991 + }, + { + "epoch": 0.3992527355217507, + "grad_norm": 1.2317478828162558, + "learning_rate": 1.3662029706644834e-05, + "loss": 1.633, + "step": 2992 + }, + { + "epoch": 0.3993861756071524, + "grad_norm": 1.0317396851316578, + "learning_rate": 1.3658007681009038e-05, + "loss": 1.604, + "step": 2993 + }, + { + "epoch": 0.39951961569255406, + "grad_norm": 1.0355968618370723, + "learning_rate": 1.365398497209905e-05, + "loss": 1.6271, + "step": 2994 + }, + { + "epoch": 0.3996530557779557, + "grad_norm": 0.9930994963511364, + "learning_rate": 1.3649961580666274e-05, + "loss": 1.6196, + "step": 2995 + }, + { + "epoch": 0.39978649586335735, + "grad_norm": 1.2269862375772342, + "learning_rate": 1.364593750746223e-05, + "loss": 1.6219, + "step": 2996 + }, + { + "epoch": 0.399919935948759, + "grad_norm": 0.9700028478438667, + "learning_rate": 1.3641912753238572e-05, + "loss": 1.6564, + "step": 2997 + }, + { + "epoch": 0.40005337603416063, + "grad_norm": 0.9545800135062432, + "learning_rate": 1.3637887318747077e-05, + "loss": 1.6116, + "step": 2998 + }, + { + "epoch": 0.4001868161195623, + "grad_norm": 0.9570154971534128, + "learning_rate": 1.3633861204739647e-05, + "loss": 1.6712, + "step": 2999 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 1.2079133599715546, + "learning_rate": 1.3629834411968317e-05, + "loss": 1.6293, + "step": 3000 + }, + { + "epoch": 0.40045369629036565, + "grad_norm": 0.9836480600801262, + "learning_rate": 1.362580694118525e-05, + "loss": 1.5987, + "step": 3001 + }, + { + "epoch": 0.40058713637576726, + "grad_norm": 0.9910032459354412, + "learning_rate": 1.3621778793142721e-05, + "loss": 1.6064, + "step": 3002 + }, + { + "epoch": 0.40072057646116893, + "grad_norm": 1.0967755414857783, + "learning_rate": 1.3617749968593155e-05, + "loss": 1.6292, + "step": 3003 + }, + { + "epoch": 0.4008540165465706, + "grad_norm": 1.0003163865850948, + "learning_rate": 1.3613720468289083e-05, + "loss": 1.6362, + "step": 3004 + }, + { + "epoch": 0.40098745663197227, + "grad_norm": 1.0315705480717272, + "learning_rate": 1.3609690292983171e-05, + "loss": 1.5829, + "step": 3005 + }, + { + "epoch": 0.4011208967173739, + "grad_norm": 0.976139700381007, + "learning_rate": 1.3605659443428208e-05, + "loss": 1.6224, + "step": 3006 + }, + { + "epoch": 0.40125433680277556, + "grad_norm": 0.9736571196522579, + "learning_rate": 1.3601627920377114e-05, + "loss": 1.5872, + "step": 3007 + }, + { + "epoch": 0.40138777688817723, + "grad_norm": 1.0002994090288835, + "learning_rate": 1.359759572458293e-05, + "loss": 1.6186, + "step": 3008 + }, + { + "epoch": 0.40152121697357884, + "grad_norm": 1.045079222117039, + "learning_rate": 1.3593562856798828e-05, + "loss": 1.5786, + "step": 3009 + }, + { + "epoch": 0.4016546570589805, + "grad_norm": 0.9688502047059884, + "learning_rate": 1.3589529317778097e-05, + "loss": 1.6257, + "step": 3010 + }, + { + "epoch": 0.4017880971443822, + "grad_norm": 1.0897993308963512, + "learning_rate": 1.3585495108274155e-05, + "loss": 1.6415, + "step": 3011 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 1.024133361111547, + "learning_rate": 1.3581460229040552e-05, + "loss": 1.6256, + "step": 3012 + }, + { + "epoch": 0.40205497731518547, + "grad_norm": 1.0901551656265192, + "learning_rate": 
1.3577424680830953e-05, + "loss": 1.6085, + "step": 3013 + }, + { + "epoch": 0.40218841740058714, + "grad_norm": 1.0074753288531135, + "learning_rate": 1.3573388464399158e-05, + "loss": 1.6159, + "step": 3014 + }, + { + "epoch": 0.4023218574859888, + "grad_norm": 1.0808434730186502, + "learning_rate": 1.3569351580499077e-05, + "loss": 1.6081, + "step": 3015 + }, + { + "epoch": 0.4024552975713904, + "grad_norm": 1.3390539579090508, + "learning_rate": 1.3565314029884764e-05, + "loss": 1.6476, + "step": 3016 + }, + { + "epoch": 0.4025887376567921, + "grad_norm": 1.1325494027036593, + "learning_rate": 1.356127581331038e-05, + "loss": 1.6408, + "step": 3017 + }, + { + "epoch": 0.40272217774219377, + "grad_norm": 1.1561856323405204, + "learning_rate": 1.3557236931530223e-05, + "loss": 1.6112, + "step": 3018 + }, + { + "epoch": 0.40285561782759544, + "grad_norm": 1.0096598053584744, + "learning_rate": 1.3553197385298704e-05, + "loss": 1.6228, + "step": 3019 + }, + { + "epoch": 0.40298905791299705, + "grad_norm": 0.9619782469364923, + "learning_rate": 1.3549157175370374e-05, + "loss": 1.6378, + "step": 3020 + }, + { + "epoch": 0.4031224979983987, + "grad_norm": 1.0828538858146166, + "learning_rate": 1.3545116302499888e-05, + "loss": 1.6662, + "step": 3021 + }, + { + "epoch": 0.4032559380838004, + "grad_norm": 1.1612286498560225, + "learning_rate": 1.3541074767442039e-05, + "loss": 1.6038, + "step": 3022 + }, + { + "epoch": 0.403389378169202, + "grad_norm": 1.0469641391213618, + "learning_rate": 1.3537032570951742e-05, + "loss": 1.5947, + "step": 3023 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 1.1061755741793078, + "learning_rate": 1.353298971378403e-05, + "loss": 1.541, + "step": 3024 + }, + { + "epoch": 0.40365625834000535, + "grad_norm": 1.255991398258131, + "learning_rate": 1.3528946196694067e-05, + "loss": 1.6509, + "step": 3025 + }, + { + "epoch": 0.403789698425407, + "grad_norm": 1.2128774118932317, + "learning_rate": 1.352490202043713e-05, + "loss": 1.6228, + "step": 3026 + }, + { + "epoch": 0.40392313851080863, + "grad_norm": 0.9952887623145266, + "learning_rate": 1.3520857185768627e-05, + "loss": 1.647, + "step": 3027 + }, + { + "epoch": 0.4040565785962103, + "grad_norm": 0.9867845852620436, + "learning_rate": 1.3516811693444092e-05, + "loss": 1.5929, + "step": 3028 + }, + { + "epoch": 0.404190018681612, + "grad_norm": 1.0634687182480442, + "learning_rate": 1.3512765544219168e-05, + "loss": 1.6571, + "step": 3029 + }, + { + "epoch": 0.4043234587670136, + "grad_norm": 0.9923107764534385, + "learning_rate": 1.350871873884964e-05, + "loss": 1.6509, + "step": 3030 + }, + { + "epoch": 0.40445689885241526, + "grad_norm": 0.9418612013435159, + "learning_rate": 1.3504671278091396e-05, + "loss": 1.6459, + "step": 3031 + }, + { + "epoch": 0.40459033893781693, + "grad_norm": 0.9743708699323149, + "learning_rate": 1.3500623162700464e-05, + "loss": 1.5773, + "step": 3032 + }, + { + "epoch": 0.4047237790232186, + "grad_norm": 1.0021337250013098, + "learning_rate": 1.3496574393432978e-05, + "loss": 1.6072, + "step": 3033 + }, + { + "epoch": 0.4048572191086202, + "grad_norm": 0.9905407512325389, + "learning_rate": 1.3492524971045202e-05, + "loss": 1.5955, + "step": 3034 + }, + { + "epoch": 0.4049906591940219, + "grad_norm": 0.9829445518512612, + "learning_rate": 1.3488474896293531e-05, + "loss": 1.6473, + "step": 3035 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 1.071551407117045, + "learning_rate": 1.3484424169934465e-05, + "loss": 1.6365, + "step": 3036 + }, + { + "epoch": 
0.40525753936482517, + "grad_norm": 1.0858770844558168, + "learning_rate": 1.3480372792724636e-05, + "loss": 1.5908, + "step": 3037 + }, + { + "epoch": 0.40539097945022684, + "grad_norm": 1.097690642719979, + "learning_rate": 1.3476320765420794e-05, + "loss": 1.6341, + "step": 3038 + }, + { + "epoch": 0.4055244195356285, + "grad_norm": 0.9550194281013512, + "learning_rate": 1.3472268088779814e-05, + "loss": 1.6558, + "step": 3039 + }, + { + "epoch": 0.4056578596210302, + "grad_norm": 0.9831862531483857, + "learning_rate": 1.3468214763558686e-05, + "loss": 1.5536, + "step": 3040 + }, + { + "epoch": 0.4057912997064318, + "grad_norm": 1.1319350865381137, + "learning_rate": 1.3464160790514531e-05, + "loss": 1.6586, + "step": 3041 + }, + { + "epoch": 0.40592473979183347, + "grad_norm": 0.988139658682674, + "learning_rate": 1.3460106170404579e-05, + "loss": 1.5955, + "step": 3042 + }, + { + "epoch": 0.40605817987723514, + "grad_norm": 1.0084348562604524, + "learning_rate": 1.3456050903986189e-05, + "loss": 1.6381, + "step": 3043 + }, + { + "epoch": 0.40619161996263675, + "grad_norm": 1.0391432220005692, + "learning_rate": 1.3451994992016839e-05, + "loss": 1.5705, + "step": 3044 + }, + { + "epoch": 0.4063250600480384, + "grad_norm": 0.9843503035286154, + "learning_rate": 1.3447938435254127e-05, + "loss": 1.5664, + "step": 3045 + }, + { + "epoch": 0.4064585001334401, + "grad_norm": 0.9833405027126961, + "learning_rate": 1.3443881234455772e-05, + "loss": 1.6586, + "step": 3046 + }, + { + "epoch": 0.40659194021884176, + "grad_norm": 0.9898970019998985, + "learning_rate": 1.3439823390379609e-05, + "loss": 1.6347, + "step": 3047 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 1.0000597042602213, + "learning_rate": 1.3435764903783605e-05, + "loss": 1.6239, + "step": 3048 + }, + { + "epoch": 0.40685882038964505, + "grad_norm": 0.9928896355998765, + "learning_rate": 1.3431705775425835e-05, + "loss": 1.5948, + "step": 3049 + }, + { + "epoch": 0.4069922604750467, + "grad_norm": 1.2336188070965246, + "learning_rate": 1.3427646006064492e-05, + "loss": 1.5953, + "step": 3050 + }, + { + "epoch": 0.40712570056044833, + "grad_norm": 1.0034026633786906, + "learning_rate": 1.3423585596457906e-05, + "loss": 1.6316, + "step": 3051 + }, + { + "epoch": 0.40725914064585, + "grad_norm": 2.1481053784351336, + "learning_rate": 1.3419524547364506e-05, + "loss": 1.6295, + "step": 3052 + }, + { + "epoch": 0.4073925807312517, + "grad_norm": 1.1897086059631425, + "learning_rate": 1.3415462859542856e-05, + "loss": 1.5635, + "step": 3053 + }, + { + "epoch": 0.40752602081665334, + "grad_norm": 1.023418328395939, + "learning_rate": 1.3411400533751628e-05, + "loss": 1.6415, + "step": 3054 + }, + { + "epoch": 0.40765946090205496, + "grad_norm": 0.98213814874442, + "learning_rate": 1.3407337570749622e-05, + "loss": 1.6428, + "step": 3055 + }, + { + "epoch": 0.40779290098745663, + "grad_norm": 1.283169457369164, + "learning_rate": 1.3403273971295749e-05, + "loss": 1.6235, + "step": 3056 + }, + { + "epoch": 0.4079263410728583, + "grad_norm": 0.9925266060021142, + "learning_rate": 1.339920973614905e-05, + "loss": 1.621, + "step": 3057 + }, + { + "epoch": 0.4080597811582599, + "grad_norm": 0.9614672523532065, + "learning_rate": 1.3395144866068673e-05, + "loss": 1.5774, + "step": 3058 + }, + { + "epoch": 0.4081932212436616, + "grad_norm": 0.9815628673629643, + "learning_rate": 1.3391079361813888e-05, + "loss": 1.5849, + "step": 3059 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 1.0926187808935628, + "learning_rate": 
1.338701322414409e-05, + "loss": 1.6139, + "step": 3060 + }, + { + "epoch": 0.4084601014144649, + "grad_norm": 0.9687836831287908, + "learning_rate": 1.3382946453818782e-05, + "loss": 1.5821, + "step": 3061 + }, + { + "epoch": 0.40859354149986654, + "grad_norm": 1.2615662453906389, + "learning_rate": 1.3378879051597594e-05, + "loss": 1.6187, + "step": 3062 + }, + { + "epoch": 0.4087269815852682, + "grad_norm": 1.143934867223233, + "learning_rate": 1.3374811018240268e-05, + "loss": 1.6114, + "step": 3063 + }, + { + "epoch": 0.4088604216706699, + "grad_norm": 0.9474085642193837, + "learning_rate": 1.3370742354506667e-05, + "loss": 1.6128, + "step": 3064 + }, + { + "epoch": 0.4089938617560715, + "grad_norm": 0.9872797898071304, + "learning_rate": 1.3366673061156775e-05, + "loss": 1.6521, + "step": 3065 + }, + { + "epoch": 0.40912730184147317, + "grad_norm": 1.3660459817420605, + "learning_rate": 1.3362603138950681e-05, + "loss": 1.5566, + "step": 3066 + }, + { + "epoch": 0.40926074192687484, + "grad_norm": 1.0188587037661874, + "learning_rate": 1.335853258864861e-05, + "loss": 1.6181, + "step": 3067 + }, + { + "epoch": 0.4093941820122765, + "grad_norm": 0.9730036168336966, + "learning_rate": 1.3354461411010887e-05, + "loss": 1.6294, + "step": 3068 + }, + { + "epoch": 0.4095276220976781, + "grad_norm": 1.0088183488719165, + "learning_rate": 1.3350389606797966e-05, + "loss": 1.599, + "step": 3069 + }, + { + "epoch": 0.4096610621830798, + "grad_norm": 0.9662028481495397, + "learning_rate": 1.3346317176770409e-05, + "loss": 1.6385, + "step": 3070 + }, + { + "epoch": 0.40979450226848146, + "grad_norm": 0.9917563809796904, + "learning_rate": 1.3342244121688905e-05, + "loss": 1.6225, + "step": 3071 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 0.9830125253461315, + "learning_rate": 1.3338170442314254e-05, + "loss": 1.6107, + "step": 3072 + }, + { + "epoch": 0.41006138243928475, + "grad_norm": 0.9826012171089086, + "learning_rate": 1.3334096139407368e-05, + "loss": 1.5969, + "step": 3073 + }, + { + "epoch": 0.4101948225246864, + "grad_norm": 1.3174009396986306, + "learning_rate": 1.3330021213729283e-05, + "loss": 1.5944, + "step": 3074 + }, + { + "epoch": 0.4103282626100881, + "grad_norm": 0.971753779897976, + "learning_rate": 1.332594566604115e-05, + "loss": 1.6118, + "step": 3075 + }, + { + "epoch": 0.4104617026954897, + "grad_norm": 0.9913292187671285, + "learning_rate": 1.3321869497104233e-05, + "loss": 1.6234, + "step": 3076 + }, + { + "epoch": 0.4105951427808914, + "grad_norm": 1.1276395807046564, + "learning_rate": 1.3317792707679915e-05, + "loss": 1.6527, + "step": 3077 + }, + { + "epoch": 0.41072858286629305, + "grad_norm": 0.9668291003462994, + "learning_rate": 1.3313715298529697e-05, + "loss": 1.6186, + "step": 3078 + }, + { + "epoch": 0.4108620229516947, + "grad_norm": 0.995376867257694, + "learning_rate": 1.3309637270415185e-05, + "loss": 1.6667, + "step": 3079 + }, + { + "epoch": 0.41099546303709633, + "grad_norm": 1.0129402139207564, + "learning_rate": 1.3305558624098116e-05, + "loss": 1.6623, + "step": 3080 + }, + { + "epoch": 0.411128903122498, + "grad_norm": 1.1770451338395234, + "learning_rate": 1.3301479360340329e-05, + "loss": 1.6662, + "step": 3081 + }, + { + "epoch": 0.41126234320789967, + "grad_norm": 1.0658383068003052, + "learning_rate": 1.3297399479903787e-05, + "loss": 1.6057, + "step": 3082 + }, + { + "epoch": 0.4113957832933013, + "grad_norm": 1.1694647133067295, + "learning_rate": 1.3293318983550563e-05, + "loss": 1.6122, + "step": 3083 + }, + { + "epoch": 
0.41152922337870296, + "grad_norm": 0.942161713707869, + "learning_rate": 1.3289237872042851e-05, + "loss": 1.606, + "step": 3084 + }, + { + "epoch": 0.41166266346410463, + "grad_norm": 0.9757692839225567, + "learning_rate": 1.3285156146142954e-05, + "loss": 1.6343, + "step": 3085 + }, + { + "epoch": 0.4117961035495063, + "grad_norm": 0.9486089863227384, + "learning_rate": 1.3281073806613289e-05, + "loss": 1.5431, + "step": 3086 + }, + { + "epoch": 0.4119295436349079, + "grad_norm": 0.9732494030371487, + "learning_rate": 1.3276990854216396e-05, + "loss": 1.6314, + "step": 3087 + }, + { + "epoch": 0.4120629837203096, + "grad_norm": 1.1111387987861114, + "learning_rate": 1.3272907289714918e-05, + "loss": 1.6326, + "step": 3088 + }, + { + "epoch": 0.41219642380571125, + "grad_norm": 0.9652044439453622, + "learning_rate": 1.3268823113871627e-05, + "loss": 1.6681, + "step": 3089 + }, + { + "epoch": 0.41232986389111287, + "grad_norm": 0.9561565798940396, + "learning_rate": 1.3264738327449389e-05, + "loss": 1.5918, + "step": 3090 + }, + { + "epoch": 0.41246330397651454, + "grad_norm": 1.3152845877647086, + "learning_rate": 1.3260652931211207e-05, + "loss": 1.607, + "step": 3091 + }, + { + "epoch": 0.4125967440619162, + "grad_norm": 0.989537074682015, + "learning_rate": 1.325656692592018e-05, + "loss": 1.6011, + "step": 3092 + }, + { + "epoch": 0.4127301841473179, + "grad_norm": 0.9734140085573458, + "learning_rate": 1.3252480312339526e-05, + "loss": 1.648, + "step": 3093 + }, + { + "epoch": 0.4128636242327195, + "grad_norm": 0.9558460700959762, + "learning_rate": 1.3248393091232583e-05, + "loss": 1.5901, + "step": 3094 + }, + { + "epoch": 0.41299706431812117, + "grad_norm": 0.9913733635852103, + "learning_rate": 1.3244305263362796e-05, + "loss": 1.6253, + "step": 3095 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 1.1740036558110662, + "learning_rate": 1.3240216829493722e-05, + "loss": 1.5879, + "step": 3096 + }, + { + "epoch": 0.41326394448892445, + "grad_norm": 1.1257399618437767, + "learning_rate": 1.3236127790389036e-05, + "loss": 1.617, + "step": 3097 + }, + { + "epoch": 0.4133973845743261, + "grad_norm": 0.9756326130850466, + "learning_rate": 1.323203814681252e-05, + "loss": 1.596, + "step": 3098 + }, + { + "epoch": 0.4135308246597278, + "grad_norm": 0.91654844990653, + "learning_rate": 1.3227947899528081e-05, + "loss": 1.5737, + "step": 3099 + }, + { + "epoch": 0.41366426474512946, + "grad_norm": 1.0075079473682016, + "learning_rate": 1.3223857049299724e-05, + "loss": 1.5862, + "step": 3100 + }, + { + "epoch": 0.4137977048305311, + "grad_norm": 1.0128738725898727, + "learning_rate": 1.3219765596891576e-05, + "loss": 1.6742, + "step": 3101 + }, + { + "epoch": 0.41393114491593275, + "grad_norm": 4.5247192473377655, + "learning_rate": 1.321567354306787e-05, + "loss": 1.6274, + "step": 3102 + }, + { + "epoch": 0.4140645850013344, + "grad_norm": 1.2025566547291768, + "learning_rate": 1.3211580888592964e-05, + "loss": 1.6561, + "step": 3103 + }, + { + "epoch": 0.41419802508673603, + "grad_norm": 1.2057864440073822, + "learning_rate": 1.3207487634231308e-05, + "loss": 1.6046, + "step": 3104 + }, + { + "epoch": 0.4143314651721377, + "grad_norm": 1.032603380515401, + "learning_rate": 1.3203393780747482e-05, + "loss": 1.6494, + "step": 3105 + }, + { + "epoch": 0.4144649052575394, + "grad_norm": 0.984221055123714, + "learning_rate": 1.3199299328906173e-05, + "loss": 1.5627, + "step": 3106 + }, + { + "epoch": 0.41459834534294104, + "grad_norm": 1.0080633070266374, + "learning_rate": 
1.3195204279472171e-05, + "loss": 1.6455, + "step": 3107 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 1.0172031612333243, + "learning_rate": 1.319110863321039e-05, + "loss": 1.6523, + "step": 3108 + }, + { + "epoch": 0.41486522551374433, + "grad_norm": 0.9819246274039858, + "learning_rate": 1.3187012390885844e-05, + "loss": 1.6192, + "step": 3109 + }, + { + "epoch": 0.414998665599146, + "grad_norm": 1.0218273012540824, + "learning_rate": 1.3182915553263676e-05, + "loss": 1.5381, + "step": 3110 + }, + { + "epoch": 0.4151321056845476, + "grad_norm": 1.1870404877073484, + "learning_rate": 1.3178818121109116e-05, + "loss": 1.5987, + "step": 3111 + }, + { + "epoch": 0.4152655457699493, + "grad_norm": 0.9665697428225051, + "learning_rate": 1.3174720095187527e-05, + "loss": 1.5904, + "step": 3112 + }, + { + "epoch": 0.41539898585535096, + "grad_norm": 1.0024983274761543, + "learning_rate": 1.3170621476264368e-05, + "loss": 1.5948, + "step": 3113 + }, + { + "epoch": 0.4155324259407526, + "grad_norm": 1.3283967395783622, + "learning_rate": 1.3166522265105216e-05, + "loss": 1.6199, + "step": 3114 + }, + { + "epoch": 0.41566586602615424, + "grad_norm": 1.0606323988381896, + "learning_rate": 1.3162422462475757e-05, + "loss": 1.5634, + "step": 3115 + }, + { + "epoch": 0.4157993061115559, + "grad_norm": 1.0009105274760155, + "learning_rate": 1.3158322069141788e-05, + "loss": 1.6536, + "step": 3116 + }, + { + "epoch": 0.4159327461969576, + "grad_norm": 0.998531188875352, + "learning_rate": 1.3154221085869215e-05, + "loss": 1.569, + "step": 3117 + }, + { + "epoch": 0.4160661862823592, + "grad_norm": 1.0318215900037766, + "learning_rate": 1.3150119513424054e-05, + "loss": 1.5927, + "step": 3118 + }, + { + "epoch": 0.41619962636776087, + "grad_norm": 1.09023864908779, + "learning_rate": 1.3146017352572435e-05, + "loss": 1.588, + "step": 3119 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 1.1300901695201984, + "learning_rate": 1.3141914604080593e-05, + "loss": 1.5848, + "step": 3120 + }, + { + "epoch": 0.4164665065385642, + "grad_norm": 0.9765411604695532, + "learning_rate": 1.3137811268714875e-05, + "loss": 1.5884, + "step": 3121 + }, + { + "epoch": 0.4165999466239658, + "grad_norm": 0.9917987982620767, + "learning_rate": 1.3133707347241735e-05, + "loss": 1.6489, + "step": 3122 + }, + { + "epoch": 0.4167333867093675, + "grad_norm": 1.0549323113529163, + "learning_rate": 1.3129602840427741e-05, + "loss": 1.6262, + "step": 3123 + }, + { + "epoch": 0.41686682679476916, + "grad_norm": 1.0110367235245075, + "learning_rate": 1.3125497749039574e-05, + "loss": 1.6089, + "step": 3124 + }, + { + "epoch": 0.4170002668801708, + "grad_norm": 1.1439338907133554, + "learning_rate": 1.312139207384401e-05, + "loss": 1.5526, + "step": 3125 + }, + { + "epoch": 0.41713370696557245, + "grad_norm": 1.2855762643940758, + "learning_rate": 1.3117285815607943e-05, + "loss": 1.6033, + "step": 3126 + }, + { + "epoch": 0.4172671470509741, + "grad_norm": 1.1679452250656743, + "learning_rate": 1.311317897509838e-05, + "loss": 1.6454, + "step": 3127 + }, + { + "epoch": 0.4174005871363758, + "grad_norm": 1.0069767600727495, + "learning_rate": 1.3109071553082426e-05, + "loss": 1.6761, + "step": 3128 + }, + { + "epoch": 0.4175340272217774, + "grad_norm": 0.9913179933467611, + "learning_rate": 1.3104963550327307e-05, + "loss": 1.5818, + "step": 3129 + }, + { + "epoch": 0.4176674673071791, + "grad_norm": 1.008746692632273, + "learning_rate": 1.3100854967600346e-05, + "loss": 1.5777, + "step": 3130 + }, + { + "epoch": 
0.41780090739258074, + "grad_norm": 1.1253948107175336, + "learning_rate": 1.3096745805668985e-05, + "loss": 1.6108, + "step": 3131 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 1.0118014744129107, + "learning_rate": 1.309263606530076e-05, + "loss": 1.5923, + "step": 3132 + }, + { + "epoch": 0.41806778756338403, + "grad_norm": 1.0052085925079868, + "learning_rate": 1.3088525747263334e-05, + "loss": 1.617, + "step": 3133 + }, + { + "epoch": 0.4182012276487857, + "grad_norm": 1.028648913612512, + "learning_rate": 1.308441485232446e-05, + "loss": 1.5913, + "step": 3134 + }, + { + "epoch": 0.41833466773418737, + "grad_norm": 0.9714092683461435, + "learning_rate": 1.308030338125201e-05, + "loss": 1.5615, + "step": 3135 + }, + { + "epoch": 0.418468107819589, + "grad_norm": 0.9640330982395612, + "learning_rate": 1.307619133481396e-05, + "loss": 1.5793, + "step": 3136 + }, + { + "epoch": 0.41860154790499066, + "grad_norm": 0.9654481412303503, + "learning_rate": 1.3072078713778391e-05, + "loss": 1.5752, + "step": 3137 + }, + { + "epoch": 0.4187349879903923, + "grad_norm": 1.0550083483656565, + "learning_rate": 1.3067965518913495e-05, + "loss": 1.6224, + "step": 3138 + }, + { + "epoch": 0.41886842807579394, + "grad_norm": 1.0212917389559222, + "learning_rate": 1.3063851750987566e-05, + "loss": 1.657, + "step": 3139 + }, + { + "epoch": 0.4190018681611956, + "grad_norm": 7.254823593563437, + "learning_rate": 1.305973741076902e-05, + "loss": 1.6833, + "step": 3140 + }, + { + "epoch": 0.4191353082465973, + "grad_norm": 1.192322814644137, + "learning_rate": 1.3055622499026358e-05, + "loss": 1.6178, + "step": 3141 + }, + { + "epoch": 0.41926874833199895, + "grad_norm": 1.0727747759744528, + "learning_rate": 1.3051507016528206e-05, + "loss": 1.6121, + "step": 3142 + }, + { + "epoch": 0.41940218841740057, + "grad_norm": 1.090188535676899, + "learning_rate": 1.3047390964043282e-05, + "loss": 1.6112, + "step": 3143 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 1.0358843384129968, + "learning_rate": 1.3043274342340426e-05, + "loss": 1.6308, + "step": 3144 + }, + { + "epoch": 0.4196690685882039, + "grad_norm": 1.0321088750194614, + "learning_rate": 1.3039157152188569e-05, + "loss": 1.6028, + "step": 3145 + }, + { + "epoch": 0.4198025086736055, + "grad_norm": 0.9448257397915428, + "learning_rate": 1.3035039394356761e-05, + "loss": 1.5707, + "step": 3146 + }, + { + "epoch": 0.4199359487590072, + "grad_norm": 1.1440735516549028, + "learning_rate": 1.3030921069614145e-05, + "loss": 1.6633, + "step": 3147 + }, + { + "epoch": 0.42006938884440886, + "grad_norm": 1.690173809325389, + "learning_rate": 1.3026802178729985e-05, + "loss": 1.643, + "step": 3148 + }, + { + "epoch": 0.42020282892981053, + "grad_norm": 0.9819001583539148, + "learning_rate": 1.302268272247364e-05, + "loss": 1.613, + "step": 3149 + }, + { + "epoch": 0.42033626901521215, + "grad_norm": 1.2486781524480748, + "learning_rate": 1.3018562701614572e-05, + "loss": 1.6, + "step": 3150 + }, + { + "epoch": 0.4204697091006138, + "grad_norm": 1.075695894476628, + "learning_rate": 1.3014442116922363e-05, + "loss": 1.6246, + "step": 3151 + }, + { + "epoch": 0.4206031491860155, + "grad_norm": 1.0010043297382096, + "learning_rate": 1.3010320969166688e-05, + "loss": 1.6071, + "step": 3152 + }, + { + "epoch": 0.42073658927141716, + "grad_norm": 1.0686067267812616, + "learning_rate": 1.300619925911733e-05, + "loss": 1.5974, + "step": 3153 + }, + { + "epoch": 0.4208700293568188, + "grad_norm": 1.5326383589995518, + "learning_rate": 
1.3002076987544173e-05, + "loss": 1.616, + "step": 3154 + }, + { + "epoch": 0.42100346944222045, + "grad_norm": 1.1448204497310523, + "learning_rate": 1.2997954155217216e-05, + "loss": 1.6295, + "step": 3155 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 1.0093147440921253, + "learning_rate": 1.2993830762906558e-05, + "loss": 1.5872, + "step": 3156 + }, + { + "epoch": 0.42127034961302373, + "grad_norm": 1.1083677568054542, + "learning_rate": 1.29897068113824e-05, + "loss": 1.6149, + "step": 3157 + }, + { + "epoch": 0.4214037896984254, + "grad_norm": 1.0172339428080068, + "learning_rate": 1.2985582301415045e-05, + "loss": 1.6115, + "step": 3158 + }, + { + "epoch": 0.42153722978382707, + "grad_norm": 1.122382756050523, + "learning_rate": 1.298145723377491e-05, + "loss": 1.5786, + "step": 3159 + }, + { + "epoch": 0.42167066986922874, + "grad_norm": 1.0389932694756445, + "learning_rate": 1.2977331609232511e-05, + "loss": 1.634, + "step": 3160 + }, + { + "epoch": 0.42180410995463036, + "grad_norm": 1.0581673343897444, + "learning_rate": 1.2973205428558461e-05, + "loss": 1.5624, + "step": 3161 + }, + { + "epoch": 0.42193755004003203, + "grad_norm": 1.7234834631306422, + "learning_rate": 1.2969078692523491e-05, + "loss": 1.6275, + "step": 3162 + }, + { + "epoch": 0.4220709901254337, + "grad_norm": 0.9813583900737928, + "learning_rate": 1.2964951401898427e-05, + "loss": 1.6535, + "step": 3163 + }, + { + "epoch": 0.4222044302108353, + "grad_norm": 0.9898306998533509, + "learning_rate": 1.2960823557454196e-05, + "loss": 1.6149, + "step": 3164 + }, + { + "epoch": 0.422337870296237, + "grad_norm": 1.0720992088919126, + "learning_rate": 1.2956695159961835e-05, + "loss": 1.6335, + "step": 3165 + }, + { + "epoch": 0.42247131038163865, + "grad_norm": 0.9836128501217848, + "learning_rate": 1.2952566210192483e-05, + "loss": 1.6103, + "step": 3166 + }, + { + "epoch": 0.4226047504670403, + "grad_norm": 1.0276047259882335, + "learning_rate": 1.2948436708917377e-05, + "loss": 1.529, + "step": 3167 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 0.9856021354742732, + "learning_rate": 1.2944306656907863e-05, + "loss": 1.6331, + "step": 3168 + }, + { + "epoch": 0.4228716306378436, + "grad_norm": 1.102227555293798, + "learning_rate": 1.2940176054935392e-05, + "loss": 1.6351, + "step": 3169 + }, + { + "epoch": 0.4230050707232453, + "grad_norm": 1.163021632169652, + "learning_rate": 1.2936044903771507e-05, + "loss": 1.6315, + "step": 3170 + }, + { + "epoch": 0.4231385108086469, + "grad_norm": 0.9872219387832729, + "learning_rate": 1.293191320418786e-05, + "loss": 1.5893, + "step": 3171 + }, + { + "epoch": 0.42327195089404857, + "grad_norm": 0.9582985814406623, + "learning_rate": 1.2927780956956208e-05, + "loss": 1.5824, + "step": 3172 + }, + { + "epoch": 0.42340539097945024, + "grad_norm": 1.1512050311999436, + "learning_rate": 1.2923648162848407e-05, + "loss": 1.5659, + "step": 3173 + }, + { + "epoch": 0.4235388310648519, + "grad_norm": 0.9359070452550783, + "learning_rate": 1.2919514822636419e-05, + "loss": 1.5615, + "step": 3174 + }, + { + "epoch": 0.4236722711502535, + "grad_norm": 1.095401798316347, + "learning_rate": 1.29153809370923e-05, + "loss": 1.5976, + "step": 3175 + }, + { + "epoch": 0.4238057112356552, + "grad_norm": 1.148887324591379, + "learning_rate": 1.2911246506988215e-05, + "loss": 1.6529, + "step": 3176 + }, + { + "epoch": 0.42393915132105686, + "grad_norm": 1.045994466470146, + "learning_rate": 1.2907111533096429e-05, + "loss": 1.6055, + "step": 3177 + }, + { + "epoch": 
0.4240725914064585, + "grad_norm": 1.0087477247067256, + "learning_rate": 1.2902976016189304e-05, + "loss": 1.6546, + "step": 3178 + }, + { + "epoch": 0.42420603149186015, + "grad_norm": 1.016393015662281, + "learning_rate": 1.2898839957039313e-05, + "loss": 1.6213, + "step": 3179 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 1.0626699808769553, + "learning_rate": 1.2894703356419023e-05, + "loss": 1.5953, + "step": 3180 + }, + { + "epoch": 0.4244729116626635, + "grad_norm": 1.055001415948082, + "learning_rate": 1.2890566215101103e-05, + "loss": 1.6129, + "step": 3181 + }, + { + "epoch": 0.4246063517480651, + "grad_norm": 1.289178506303072, + "learning_rate": 1.2886428533858323e-05, + "loss": 1.5734, + "step": 3182 + }, + { + "epoch": 0.4247397918334668, + "grad_norm": 0.9875166066103552, + "learning_rate": 1.2882290313463561e-05, + "loss": 1.5684, + "step": 3183 + }, + { + "epoch": 0.42487323191886844, + "grad_norm": 0.9865955139060382, + "learning_rate": 1.2878151554689779e-05, + "loss": 1.611, + "step": 3184 + }, + { + "epoch": 0.42500667200427006, + "grad_norm": 0.9518489443348546, + "learning_rate": 1.287401225831006e-05, + "loss": 1.6425, + "step": 3185 + }, + { + "epoch": 0.42514011208967173, + "grad_norm": 0.9998924842921293, + "learning_rate": 1.286987242509757e-05, + "loss": 1.5908, + "step": 3186 + }, + { + "epoch": 0.4252735521750734, + "grad_norm": 0.9623933406546137, + "learning_rate": 1.2865732055825584e-05, + "loss": 1.6013, + "step": 3187 + }, + { + "epoch": 0.42540699226047507, + "grad_norm": 1.014360008466468, + "learning_rate": 1.2861591151267483e-05, + "loss": 1.6092, + "step": 3188 + }, + { + "epoch": 0.4255404323458767, + "grad_norm": 1.0008036520235628, + "learning_rate": 1.2857449712196733e-05, + "loss": 1.6417, + "step": 3189 + }, + { + "epoch": 0.42567387243127836, + "grad_norm": 1.0695928415515632, + "learning_rate": 1.2853307739386908e-05, + "loss": 1.6225, + "step": 3190 + }, + { + "epoch": 0.42580731251668, + "grad_norm": 0.9505901957555256, + "learning_rate": 1.2849165233611687e-05, + "loss": 1.5678, + "step": 3191 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 1.0097739379690966, + "learning_rate": 1.2845022195644838e-05, + "loss": 1.6279, + "step": 3192 + }, + { + "epoch": 0.4260741926874833, + "grad_norm": 0.9745534806645423, + "learning_rate": 1.2840878626260231e-05, + "loss": 1.6465, + "step": 3193 + }, + { + "epoch": 0.426207632772885, + "grad_norm": 0.9315201229811604, + "learning_rate": 1.2836734526231844e-05, + "loss": 1.518, + "step": 3194 + }, + { + "epoch": 0.42634107285828665, + "grad_norm": 0.9736368620203836, + "learning_rate": 1.2832589896333747e-05, + "loss": 1.5971, + "step": 3195 + }, + { + "epoch": 0.42647451294368827, + "grad_norm": 1.1002775590993574, + "learning_rate": 1.2828444737340105e-05, + "loss": 1.6011, + "step": 3196 + }, + { + "epoch": 0.42660795302908994, + "grad_norm": 1.0076905774463565, + "learning_rate": 1.282429905002519e-05, + "loss": 1.6084, + "step": 3197 + }, + { + "epoch": 0.4267413931144916, + "grad_norm": 0.9760408811448265, + "learning_rate": 1.2820152835163366e-05, + "loss": 1.5717, + "step": 3198 + }, + { + "epoch": 0.4268748331998932, + "grad_norm": 0.9853900063619548, + "learning_rate": 1.2816006093529106e-05, + "loss": 1.6631, + "step": 3199 + }, + { + "epoch": 0.4270082732852949, + "grad_norm": 1.2084382692384266, + "learning_rate": 1.2811858825896965e-05, + "loss": 1.6046, + "step": 3200 + }, + { + "epoch": 0.42714171337069656, + "grad_norm": 0.9676385504121742, + "learning_rate": 
1.2807711033041613e-05, + "loss": 1.6724, + "step": 3201 + }, + { + "epoch": 0.42727515345609823, + "grad_norm": 0.9778031832163321, + "learning_rate": 1.2803562715737802e-05, + "loss": 1.5844, + "step": 3202 + }, + { + "epoch": 0.42740859354149985, + "grad_norm": 0.954310934487862, + "learning_rate": 1.2799413874760398e-05, + "loss": 1.5669, + "step": 3203 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 1.102491655832009, + "learning_rate": 1.2795264510884357e-05, + "loss": 1.5134, + "step": 3204 + }, + { + "epoch": 0.4276754737123032, + "grad_norm": 1.1945300325637898, + "learning_rate": 1.2791114624884728e-05, + "loss": 1.6208, + "step": 3205 + }, + { + "epoch": 0.4278089137977048, + "grad_norm": 1.0204751169748503, + "learning_rate": 1.2786964217536666e-05, + "loss": 1.5952, + "step": 3206 + }, + { + "epoch": 0.4279423538831065, + "grad_norm": 1.2582481061790463, + "learning_rate": 1.278281328961542e-05, + "loss": 1.5961, + "step": 3207 + }, + { + "epoch": 0.42807579396850814, + "grad_norm": 1.0191124850439586, + "learning_rate": 1.2778661841896333e-05, + "loss": 1.6123, + "step": 3208 + }, + { + "epoch": 0.4282092340539098, + "grad_norm": 1.135529578537383, + "learning_rate": 1.277450987515485e-05, + "loss": 1.6442, + "step": 3209 + }, + { + "epoch": 0.42834267413931143, + "grad_norm": 1.099285147964999, + "learning_rate": 1.2770357390166513e-05, + "loss": 1.6706, + "step": 3210 + }, + { + "epoch": 0.4284761142247131, + "grad_norm": 1.017546212676164, + "learning_rate": 1.2766204387706955e-05, + "loss": 1.582, + "step": 3211 + }, + { + "epoch": 0.42860955431011477, + "grad_norm": 1.1530348114411721, + "learning_rate": 1.2762050868551913e-05, + "loss": 1.6239, + "step": 3212 + }, + { + "epoch": 0.4287429943955164, + "grad_norm": 0.9715880360042976, + "learning_rate": 1.275789683347722e-05, + "loss": 1.578, + "step": 3213 + }, + { + "epoch": 0.42887643448091806, + "grad_norm": 0.9866979071658567, + "learning_rate": 1.2753742283258793e-05, + "loss": 1.6167, + "step": 3214 + }, + { + "epoch": 0.4290098745663197, + "grad_norm": 0.9922836437350845, + "learning_rate": 1.2749587218672663e-05, + "loss": 1.6215, + "step": 3215 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 1.2015902362721134, + "learning_rate": 1.2745431640494944e-05, + "loss": 1.6423, + "step": 3216 + }, + { + "epoch": 0.429276754737123, + "grad_norm": 1.0161108583369185, + "learning_rate": 1.2741275549501853e-05, + "loss": 1.6025, + "step": 3217 + }, + { + "epoch": 0.4294101948225247, + "grad_norm": 0.9630893697181409, + "learning_rate": 1.2737118946469697e-05, + "loss": 1.5888, + "step": 3218 + }, + { + "epoch": 0.42954363490792635, + "grad_norm": 0.9590114370085036, + "learning_rate": 1.2732961832174888e-05, + "loss": 1.5735, + "step": 3219 + }, + { + "epoch": 0.429677074993328, + "grad_norm": 0.9698052625279515, + "learning_rate": 1.2728804207393925e-05, + "loss": 1.5758, + "step": 3220 + }, + { + "epoch": 0.42981051507872964, + "grad_norm": 1.1113879856734203, + "learning_rate": 1.2724646072903403e-05, + "loss": 1.5833, + "step": 3221 + }, + { + "epoch": 0.4299439551641313, + "grad_norm": 0.9741250433041043, + "learning_rate": 1.2720487429480017e-05, + "loss": 1.604, + "step": 3222 + }, + { + "epoch": 0.430077395249533, + "grad_norm": 1.0434937344586053, + "learning_rate": 1.2716328277900553e-05, + "loss": 1.6282, + "step": 3223 + }, + { + "epoch": 0.4302108353349346, + "grad_norm": 1.218598876665266, + "learning_rate": 1.2712168618941895e-05, + "loss": 1.6405, + "step": 3224 + }, + { + "epoch": 
0.43034427542033626, + "grad_norm": 0.9274834745968594, + "learning_rate": 1.2708008453381015e-05, + "loss": 1.5594, + "step": 3225 + }, + { + "epoch": 0.43047771550573793, + "grad_norm": 0.9572245478353469, + "learning_rate": 1.2703847781994988e-05, + "loss": 1.5985, + "step": 3226 + }, + { + "epoch": 0.4306111555911396, + "grad_norm": 0.9804393710597245, + "learning_rate": 1.2699686605560984e-05, + "loss": 1.6098, + "step": 3227 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 0.9465366010602443, + "learning_rate": 1.2695524924856252e-05, + "loss": 1.6262, + "step": 3228 + }, + { + "epoch": 0.4308780357619429, + "grad_norm": 1.1417073930345842, + "learning_rate": 1.2691362740658162e-05, + "loss": 1.6277, + "step": 3229 + }, + { + "epoch": 0.43101147584734456, + "grad_norm": 1.078579390525697, + "learning_rate": 1.2687200053744148e-05, + "loss": 1.5878, + "step": 3230 + }, + { + "epoch": 0.4311449159327462, + "grad_norm": 0.9835105604076124, + "learning_rate": 1.2683036864891762e-05, + "loss": 1.585, + "step": 3231 + }, + { + "epoch": 0.43127835601814785, + "grad_norm": 0.9942457861542611, + "learning_rate": 1.2678873174878637e-05, + "loss": 1.6226, + "step": 3232 + }, + { + "epoch": 0.4314117961035495, + "grad_norm": 1.077188400186942, + "learning_rate": 1.2674708984482503e-05, + "loss": 1.617, + "step": 3233 + }, + { + "epoch": 0.4315452361889512, + "grad_norm": 0.9450353505746655, + "learning_rate": 1.2670544294481184e-05, + "loss": 1.5642, + "step": 3234 + }, + { + "epoch": 0.4316786762743528, + "grad_norm": 1.0397917929273337, + "learning_rate": 1.2666379105652593e-05, + "loss": 1.5236, + "step": 3235 + }, + { + "epoch": 0.43181211635975447, + "grad_norm": 1.2967775820644207, + "learning_rate": 1.2662213418774747e-05, + "loss": 1.6222, + "step": 3236 + }, + { + "epoch": 0.43194555644515614, + "grad_norm": 0.9953395285218007, + "learning_rate": 1.2658047234625741e-05, + "loss": 1.6417, + "step": 3237 + }, + { + "epoch": 0.43207899653055776, + "grad_norm": 1.0148344019514997, + "learning_rate": 1.2653880553983777e-05, + "loss": 1.6523, + "step": 3238 + }, + { + "epoch": 0.4322124366159594, + "grad_norm": 0.919347172987927, + "learning_rate": 1.264971337762714e-05, + "loss": 1.5869, + "step": 3239 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 0.9647756161069322, + "learning_rate": 1.2645545706334213e-05, + "loss": 1.6183, + "step": 3240 + }, + { + "epoch": 0.43247931678676277, + "grad_norm": 0.9857220961118699, + "learning_rate": 1.2641377540883469e-05, + "loss": 1.5573, + "step": 3241 + }, + { + "epoch": 0.4326127568721644, + "grad_norm": 1.004715345407096, + "learning_rate": 1.2637208882053469e-05, + "loss": 1.5961, + "step": 3242 + }, + { + "epoch": 0.43274619695756605, + "grad_norm": 0.951945556321612, + "learning_rate": 1.2633039730622883e-05, + "loss": 1.5411, + "step": 3243 + }, + { + "epoch": 0.4328796370429677, + "grad_norm": 0.9907591493074744, + "learning_rate": 1.2628870087370446e-05, + "loss": 1.5953, + "step": 3244 + }, + { + "epoch": 0.43301307712836934, + "grad_norm": 0.9807352408063607, + "learning_rate": 1.2624699953075015e-05, + "loss": 1.5794, + "step": 3245 + }, + { + "epoch": 0.433146517213771, + "grad_norm": 1.3426423736461461, + "learning_rate": 1.262052932851551e-05, + "loss": 1.6409, + "step": 3246 + }, + { + "epoch": 0.4332799572991727, + "grad_norm": 0.9714093568492226, + "learning_rate": 1.2616358214470967e-05, + "loss": 1.5626, + "step": 3247 + }, + { + "epoch": 0.43341339738457435, + "grad_norm": 1.2115238526991772, + "learning_rate": 
1.2612186611720494e-05, + "loss": 1.5797, + "step": 3248 + }, + { + "epoch": 0.43354683746997597, + "grad_norm": 1.1075061582793597, + "learning_rate": 1.2608014521043305e-05, + "loss": 1.6192, + "step": 3249 + }, + { + "epoch": 0.43368027755537764, + "grad_norm": 0.9318065689517002, + "learning_rate": 1.2603841943218695e-05, + "loss": 1.5508, + "step": 3250 + }, + { + "epoch": 0.4338137176407793, + "grad_norm": 0.9342106979423349, + "learning_rate": 1.2599668879026057e-05, + "loss": 1.5932, + "step": 3251 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 0.9881294610347703, + "learning_rate": 1.259549532924487e-05, + "loss": 1.6374, + "step": 3252 + }, + { + "epoch": 0.4340805978115826, + "grad_norm": 0.9842232409667695, + "learning_rate": 1.259132129465471e-05, + "loss": 1.5709, + "step": 3253 + }, + { + "epoch": 0.43421403789698426, + "grad_norm": 1.0325015498789856, + "learning_rate": 1.2587146776035233e-05, + "loss": 1.6679, + "step": 3254 + }, + { + "epoch": 0.43434747798238593, + "grad_norm": 0.9483677954324838, + "learning_rate": 1.2582971774166195e-05, + "loss": 1.6129, + "step": 3255 + }, + { + "epoch": 0.43448091806778755, + "grad_norm": 1.0038948510108203, + "learning_rate": 1.2578796289827437e-05, + "loss": 1.639, + "step": 3256 + }, + { + "epoch": 0.4346143581531892, + "grad_norm": 1.0870982465609627, + "learning_rate": 1.2574620323798891e-05, + "loss": 1.559, + "step": 3257 + }, + { + "epoch": 0.4347477982385909, + "grad_norm": 1.1919615794250933, + "learning_rate": 1.257044387686058e-05, + "loss": 1.5857, + "step": 3258 + }, + { + "epoch": 0.4348812383239925, + "grad_norm": 2.641937471598561, + "learning_rate": 1.2566266949792625e-05, + "loss": 1.5931, + "step": 3259 + }, + { + "epoch": 0.4350146784093942, + "grad_norm": 0.9743834744800941, + "learning_rate": 1.2562089543375215e-05, + "loss": 1.6235, + "step": 3260 + }, + { + "epoch": 0.43514811849479584, + "grad_norm": 1.1785303574207608, + "learning_rate": 1.2557911658388655e-05, + "loss": 1.6243, + "step": 3261 + }, + { + "epoch": 0.4352815585801975, + "grad_norm": 1.3537174009151587, + "learning_rate": 1.2553733295613314e-05, + "loss": 1.5904, + "step": 3262 + }, + { + "epoch": 0.43541499866559913, + "grad_norm": 1.0054913667026213, + "learning_rate": 1.2549554455829676e-05, + "loss": 1.5623, + "step": 3263 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 1.1821789472170066, + "learning_rate": 1.2545375139818287e-05, + "loss": 1.5667, + "step": 3264 + }, + { + "epoch": 0.43568187883640247, + "grad_norm": 1.0060542893441342, + "learning_rate": 1.2541195348359805e-05, + "loss": 1.6021, + "step": 3265 + }, + { + "epoch": 0.4358153189218041, + "grad_norm": 0.9774430570620395, + "learning_rate": 1.2537015082234963e-05, + "loss": 1.6073, + "step": 3266 + }, + { + "epoch": 0.43594875900720576, + "grad_norm": 1.031179584529375, + "learning_rate": 1.253283434222459e-05, + "loss": 1.6927, + "step": 3267 + }, + { + "epoch": 0.4360821990926074, + "grad_norm": 1.66539793901367, + "learning_rate": 1.2528653129109597e-05, + "loss": 1.6507, + "step": 3268 + }, + { + "epoch": 0.4362156391780091, + "grad_norm": 1.0734574677292559, + "learning_rate": 1.2524471443670992e-05, + "loss": 1.6163, + "step": 3269 + }, + { + "epoch": 0.4363490792634107, + "grad_norm": 1.0252180431657283, + "learning_rate": 1.2520289286689864e-05, + "loss": 1.6268, + "step": 3270 + }, + { + "epoch": 0.4364825193488124, + "grad_norm": 1.0638123845106042, + "learning_rate": 1.2516106658947389e-05, + "loss": 1.5639, + "step": 3271 + }, + { + "epoch": 
0.43661595943421405, + "grad_norm": 0.9457091191118885, + "learning_rate": 1.251192356122484e-05, + "loss": 1.6036, + "step": 3272 + }, + { + "epoch": 0.43674939951961567, + "grad_norm": 0.9937359629999882, + "learning_rate": 1.2507739994303564e-05, + "loss": 1.6365, + "step": 3273 + }, + { + "epoch": 0.43688283960501734, + "grad_norm": 1.1942032412216412, + "learning_rate": 1.2503555958965014e-05, + "loss": 1.6593, + "step": 3274 + }, + { + "epoch": 0.437016279690419, + "grad_norm": 1.0002882913147546, + "learning_rate": 1.2499371455990714e-05, + "loss": 1.5772, + "step": 3275 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 0.9629918020750007, + "learning_rate": 1.2495186486162284e-05, + "loss": 1.5488, + "step": 3276 + }, + { + "epoch": 0.4372831598612223, + "grad_norm": 0.9857190069244037, + "learning_rate": 1.2491001050261425e-05, + "loss": 1.6248, + "step": 3277 + }, + { + "epoch": 0.43741659994662396, + "grad_norm": 1.099517425168157, + "learning_rate": 1.2486815149069928e-05, + "loss": 1.5877, + "step": 3278 + }, + { + "epoch": 0.43755004003202563, + "grad_norm": 1.1165501715980635, + "learning_rate": 1.248262878336968e-05, + "loss": 1.6246, + "step": 3279 + }, + { + "epoch": 0.43768348011742725, + "grad_norm": 1.2759947966412557, + "learning_rate": 1.2478441953942637e-05, + "loss": 1.579, + "step": 3280 + }, + { + "epoch": 0.4378169202028289, + "grad_norm": 1.042737881072034, + "learning_rate": 1.2474254661570858e-05, + "loss": 1.6238, + "step": 3281 + }, + { + "epoch": 0.4379503602882306, + "grad_norm": 0.9997308223160407, + "learning_rate": 1.2470066907036475e-05, + "loss": 1.6247, + "step": 3282 + }, + { + "epoch": 0.43808380037363226, + "grad_norm": 0.9873336853234015, + "learning_rate": 1.2465878691121717e-05, + "loss": 1.594, + "step": 3283 + }, + { + "epoch": 0.4382172404590339, + "grad_norm": 1.073937141396183, + "learning_rate": 1.2461690014608898e-05, + "loss": 1.5442, + "step": 3284 + }, + { + "epoch": 0.43835068054443554, + "grad_norm": 1.066440340364437, + "learning_rate": 1.2457500878280408e-05, + "loss": 1.5947, + "step": 3285 + }, + { + "epoch": 0.4384841206298372, + "grad_norm": 0.9823058406854864, + "learning_rate": 1.2453311282918738e-05, + "loss": 1.5988, + "step": 3286 + }, + { + "epoch": 0.43861756071523883, + "grad_norm": 0.9529342446435208, + "learning_rate": 1.2449121229306449e-05, + "loss": 1.5879, + "step": 3287 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 1.2995317730664497, + "learning_rate": 1.2444930718226201e-05, + "loss": 1.6078, + "step": 3288 + }, + { + "epoch": 0.43888444088604217, + "grad_norm": 1.0273219898248251, + "learning_rate": 1.2440739750460728e-05, + "loss": 1.5873, + "step": 3289 + }, + { + "epoch": 0.43901788097144384, + "grad_norm": 0.9613704608116244, + "learning_rate": 1.2436548326792858e-05, + "loss": 1.5971, + "step": 3290 + }, + { + "epoch": 0.43915132105684546, + "grad_norm": 0.9741621574671587, + "learning_rate": 1.2432356448005507e-05, + "loss": 1.5664, + "step": 3291 + }, + { + "epoch": 0.4392847611422471, + "grad_norm": 0.9421233395790419, + "learning_rate": 1.2428164114881663e-05, + "loss": 1.6159, + "step": 3292 + }, + { + "epoch": 0.4394182012276488, + "grad_norm": 0.9714375843061244, + "learning_rate": 1.2423971328204407e-05, + "loss": 1.5948, + "step": 3293 + }, + { + "epoch": 0.43955164131305047, + "grad_norm": 0.9853929138462288, + "learning_rate": 1.2419778088756904e-05, + "loss": 1.6295, + "step": 3294 + }, + { + "epoch": 0.4396850813984521, + "grad_norm": 0.9658869477722872, + "learning_rate": 
1.2415584397322406e-05, + "loss": 1.6026, + "step": 3295 + }, + { + "epoch": 0.43981852148385375, + "grad_norm": 0.9824798838337809, + "learning_rate": 1.2411390254684246e-05, + "loss": 1.6511, + "step": 3296 + }, + { + "epoch": 0.4399519615692554, + "grad_norm": 1.1148315528827775, + "learning_rate": 1.2407195661625838e-05, + "loss": 1.6555, + "step": 3297 + }, + { + "epoch": 0.44008540165465704, + "grad_norm": 0.9935944482329716, + "learning_rate": 1.240300061893069e-05, + "loss": 1.5694, + "step": 3298 + }, + { + "epoch": 0.4402188417400587, + "grad_norm": 1.0104396213439037, + "learning_rate": 1.2398805127382382e-05, + "loss": 1.6266, + "step": 3299 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 0.9406209717223495, + "learning_rate": 1.2394609187764593e-05, + "loss": 1.6403, + "step": 3300 + }, + { + "epoch": 0.44048572191086205, + "grad_norm": 0.9669078454658077, + "learning_rate": 1.2390412800861066e-05, + "loss": 1.614, + "step": 3301 + }, + { + "epoch": 0.44061916199626366, + "grad_norm": 0.9556156862945018, + "learning_rate": 1.2386215967455648e-05, + "loss": 1.5857, + "step": 3302 + }, + { + "epoch": 0.44075260208166533, + "grad_norm": 0.9777122703402282, + "learning_rate": 1.2382018688332251e-05, + "loss": 1.6382, + "step": 3303 + }, + { + "epoch": 0.440886042167067, + "grad_norm": 0.9515365211823855, + "learning_rate": 1.2377820964274887e-05, + "loss": 1.5879, + "step": 3304 + }, + { + "epoch": 0.4410194822524686, + "grad_norm": 0.9807886665234027, + "learning_rate": 1.2373622796067637e-05, + "loss": 1.6029, + "step": 3305 + }, + { + "epoch": 0.4411529223378703, + "grad_norm": 0.9635105279990112, + "learning_rate": 1.2369424184494673e-05, + "loss": 1.5992, + "step": 3306 + }, + { + "epoch": 0.44128636242327196, + "grad_norm": 0.9665429167972764, + "learning_rate": 1.236522513034025e-05, + "loss": 1.5908, + "step": 3307 + }, + { + "epoch": 0.44141980250867363, + "grad_norm": 1.0554959304610874, + "learning_rate": 1.2361025634388701e-05, + "loss": 1.6039, + "step": 3308 + }, + { + "epoch": 0.44155324259407525, + "grad_norm": 1.206338234294267, + "learning_rate": 1.2356825697424449e-05, + "loss": 1.6125, + "step": 3309 + }, + { + "epoch": 0.4416866826794769, + "grad_norm": 0.9701936211446273, + "learning_rate": 1.2352625320231984e-05, + "loss": 1.5516, + "step": 3310 + }, + { + "epoch": 0.4418201227648786, + "grad_norm": 1.011429585012499, + "learning_rate": 1.2348424503595898e-05, + "loss": 1.6186, + "step": 3311 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 1.1168012771237403, + "learning_rate": 1.234422324830085e-05, + "loss": 1.565, + "step": 3312 + }, + { + "epoch": 0.44208700293568187, + "grad_norm": 1.0738983437774423, + "learning_rate": 1.2340021555131592e-05, + "loss": 1.6279, + "step": 3313 + }, + { + "epoch": 0.44222044302108354, + "grad_norm": 0.9987855843220456, + "learning_rate": 1.2335819424872948e-05, + "loss": 1.5884, + "step": 3314 + }, + { + "epoch": 0.4423538831064852, + "grad_norm": 1.1047058977235424, + "learning_rate": 1.233161685830983e-05, + "loss": 1.6259, + "step": 3315 + }, + { + "epoch": 0.4424873231918868, + "grad_norm": 12.94394608036084, + "learning_rate": 1.2327413856227231e-05, + "loss": 1.6551, + "step": 3316 + }, + { + "epoch": 0.4426207632772885, + "grad_norm": 0.9922991075854602, + "learning_rate": 1.232321041941022e-05, + "loss": 1.6057, + "step": 3317 + }, + { + "epoch": 0.44275420336269017, + "grad_norm": 1.5832757455395445, + "learning_rate": 1.2319006548643955e-05, + "loss": 1.5688, + "step": 3318 + }, + { + "epoch": 
0.4428876434480918, + "grad_norm": 1.0990617479523308, + "learning_rate": 1.2314802244713671e-05, + "loss": 1.6046, + "step": 3319 + }, + { + "epoch": 0.44302108353349345, + "grad_norm": 1.1847275698751005, + "learning_rate": 1.2310597508404683e-05, + "loss": 1.5834, + "step": 3320 + }, + { + "epoch": 0.4431545236188951, + "grad_norm": 1.044830968363967, + "learning_rate": 1.2306392340502382e-05, + "loss": 1.5843, + "step": 3321 + }, + { + "epoch": 0.4432879637042968, + "grad_norm": 1.2604211119820607, + "learning_rate": 1.2302186741792255e-05, + "loss": 1.6076, + "step": 3322 + }, + { + "epoch": 0.4434214037896984, + "grad_norm": 1.041133538088134, + "learning_rate": 1.2297980713059857e-05, + "loss": 1.6237, + "step": 3323 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 1.4649823119870107, + "learning_rate": 1.229377425509082e-05, + "loss": 1.6073, + "step": 3324 + }, + { + "epoch": 0.44368828396050175, + "grad_norm": 0.9934245058649951, + "learning_rate": 1.2289567368670873e-05, + "loss": 1.5818, + "step": 3325 + }, + { + "epoch": 0.44382172404590337, + "grad_norm": 0.9664114510742975, + "learning_rate": 1.2285360054585807e-05, + "loss": 1.644, + "step": 3326 + }, + { + "epoch": 0.44395516413130504, + "grad_norm": 1.015993424077106, + "learning_rate": 1.2281152313621505e-05, + "loss": 1.5808, + "step": 3327 + }, + { + "epoch": 0.4440886042167067, + "grad_norm": 1.0461329594907725, + "learning_rate": 1.2276944146563918e-05, + "loss": 1.6381, + "step": 3328 + }, + { + "epoch": 0.4442220443021084, + "grad_norm": 1.0135143248090914, + "learning_rate": 1.2272735554199091e-05, + "loss": 1.577, + "step": 3329 + }, + { + "epoch": 0.44435548438751, + "grad_norm": 1.0070141228550389, + "learning_rate": 1.2268526537313142e-05, + "loss": 1.5906, + "step": 3330 + }, + { + "epoch": 0.44448892447291166, + "grad_norm": 0.9761989502121017, + "learning_rate": 1.2264317096692257e-05, + "loss": 1.5424, + "step": 3331 + }, + { + "epoch": 0.44462236455831333, + "grad_norm": 0.9755643448669162, + "learning_rate": 1.2260107233122724e-05, + "loss": 1.621, + "step": 3332 + }, + { + "epoch": 0.44475580464371495, + "grad_norm": 1.1020152124691254, + "learning_rate": 1.2255896947390891e-05, + "loss": 1.5605, + "step": 3333 + }, + { + "epoch": 0.4448892447291166, + "grad_norm": 1.3925175705055546, + "learning_rate": 1.2251686240283191e-05, + "loss": 1.6224, + "step": 3334 + }, + { + "epoch": 0.4450226848145183, + "grad_norm": 1.0243447818889067, + "learning_rate": 1.224747511258614e-05, + "loss": 1.6326, + "step": 3335 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 1.064152850059643, + "learning_rate": 1.2243263565086325e-05, + "loss": 1.5844, + "step": 3336 + }, + { + "epoch": 0.4452895649853216, + "grad_norm": 1.0995677570425262, + "learning_rate": 1.2239051598570417e-05, + "loss": 1.6008, + "step": 3337 + }, + { + "epoch": 0.44542300507072324, + "grad_norm": 0.9935950777647963, + "learning_rate": 1.2234839213825163e-05, + "loss": 1.6462, + "step": 3338 + }, + { + "epoch": 0.4455564451561249, + "grad_norm": 0.9919635923562781, + "learning_rate": 1.2230626411637388e-05, + "loss": 1.6029, + "step": 3339 + }, + { + "epoch": 0.44568988524152653, + "grad_norm": 1.3321534487303814, + "learning_rate": 1.2226413192793998e-05, + "loss": 1.6439, + "step": 3340 + }, + { + "epoch": 0.4458233253269282, + "grad_norm": 1.0388019535817785, + "learning_rate": 1.222219955808197e-05, + "loss": 1.6172, + "step": 3341 + }, + { + "epoch": 0.44595676541232987, + "grad_norm": 0.9948371593169276, + "learning_rate": 
1.2217985508288366e-05, + "loss": 1.5684, + "step": 3342 + }, + { + "epoch": 0.44609020549773154, + "grad_norm": 0.9566696947998643, + "learning_rate": 1.2213771044200323e-05, + "loss": 1.5894, + "step": 3343 + }, + { + "epoch": 0.44622364558313315, + "grad_norm": 1.0144760346896669, + "learning_rate": 1.220955616660505e-05, + "loss": 1.6187, + "step": 3344 + }, + { + "epoch": 0.4463570856685348, + "grad_norm": 0.9783013624017898, + "learning_rate": 1.2205340876289842e-05, + "loss": 1.5798, + "step": 3345 + }, + { + "epoch": 0.4464905257539365, + "grad_norm": 1.1775470120965925, + "learning_rate": 1.220112517404207e-05, + "loss": 1.6027, + "step": 3346 + }, + { + "epoch": 0.4466239658393381, + "grad_norm": 1.0832460341808767, + "learning_rate": 1.2196909060649173e-05, + "loss": 1.6384, + "step": 3347 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 1.5868021023209082, + "learning_rate": 1.219269253689868e-05, + "loss": 1.5758, + "step": 3348 + }, + { + "epoch": 0.44689084601014145, + "grad_norm": 1.1241637872673438, + "learning_rate": 1.2188475603578186e-05, + "loss": 1.5948, + "step": 3349 + }, + { + "epoch": 0.4470242860955431, + "grad_norm": 0.9779424708497318, + "learning_rate": 1.2184258261475364e-05, + "loss": 1.6002, + "step": 3350 + }, + { + "epoch": 0.44715772618094474, + "grad_norm": 0.986774564760355, + "learning_rate": 1.2180040511377966e-05, + "loss": 1.5651, + "step": 3351 + }, + { + "epoch": 0.4472911662663464, + "grad_norm": 0.9605234817512258, + "learning_rate": 1.2175822354073826e-05, + "loss": 1.5525, + "step": 3352 + }, + { + "epoch": 0.4474246063517481, + "grad_norm": 0.9492182807666106, + "learning_rate": 1.2171603790350836e-05, + "loss": 1.5657, + "step": 3353 + }, + { + "epoch": 0.4475580464371497, + "grad_norm": 1.004022400441443, + "learning_rate": 1.2167384820996988e-05, + "loss": 1.5807, + "step": 3354 + }, + { + "epoch": 0.44769148652255136, + "grad_norm": 0.9732333755288013, + "learning_rate": 1.2163165446800332e-05, + "loss": 1.6453, + "step": 3355 + }, + { + "epoch": 0.44782492660795303, + "grad_norm": 1.0036906910118235, + "learning_rate": 1.2158945668548997e-05, + "loss": 1.6128, + "step": 3356 + }, + { + "epoch": 0.4479583666933547, + "grad_norm": 1.0085064642711343, + "learning_rate": 1.215472548703119e-05, + "loss": 1.613, + "step": 3357 + }, + { + "epoch": 0.4480918067787563, + "grad_norm": 1.2764967944347796, + "learning_rate": 1.2150504903035196e-05, + "loss": 1.5606, + "step": 3358 + }, + { + "epoch": 0.448225246864158, + "grad_norm": 0.995504684910624, + "learning_rate": 1.2146283917349373e-05, + "loss": 1.6166, + "step": 3359 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 0.9568604455855414, + "learning_rate": 1.214206253076215e-05, + "loss": 1.5966, + "step": 3360 + }, + { + "epoch": 0.44849212703496133, + "grad_norm": 1.1039131846221917, + "learning_rate": 1.2137840744062032e-05, + "loss": 1.5601, + "step": 3361 + }, + { + "epoch": 0.44862556712036294, + "grad_norm": 1.005103856197314, + "learning_rate": 1.2133618558037607e-05, + "loss": 1.5586, + "step": 3362 + }, + { + "epoch": 0.4487590072057646, + "grad_norm": 0.9893281059878032, + "learning_rate": 1.2129395973477522e-05, + "loss": 1.664, + "step": 3363 + }, + { + "epoch": 0.4488924472911663, + "grad_norm": 1.3408090451344328, + "learning_rate": 1.212517299117052e-05, + "loss": 1.5822, + "step": 3364 + }, + { + "epoch": 0.4490258873765679, + "grad_norm": 1.1012723616514362, + "learning_rate": 1.2120949611905393e-05, + "loss": 1.5357, + "step": 3365 + }, + { + "epoch": 
0.44915932746196957, + "grad_norm": 1.1116159284883955, + "learning_rate": 1.2116725836471031e-05, + "loss": 1.5744, + "step": 3366 + }, + { + "epoch": 0.44929276754737124, + "grad_norm": 0.9861652273388498, + "learning_rate": 1.211250166565638e-05, + "loss": 1.6247, + "step": 3367 + }, + { + "epoch": 0.4494262076327729, + "grad_norm": 1.1947078047934485, + "learning_rate": 1.2108277100250472e-05, + "loss": 1.6308, + "step": 3368 + }, + { + "epoch": 0.4495596477181745, + "grad_norm": 0.946296734521657, + "learning_rate": 1.2104052141042402e-05, + "loss": 1.5815, + "step": 3369 + }, + { + "epoch": 0.4496930878035762, + "grad_norm": 0.9521803217831275, + "learning_rate": 1.2099826788821347e-05, + "loss": 1.6103, + "step": 3370 + }, + { + "epoch": 0.44982652788897787, + "grad_norm": 0.9822551188681323, + "learning_rate": 1.2095601044376558e-05, + "loss": 1.5977, + "step": 3371 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 0.9614883569465209, + "learning_rate": 1.2091374908497352e-05, + "loss": 1.5943, + "step": 3372 + }, + { + "epoch": 0.45009340805978115, + "grad_norm": 1.0694103742791419, + "learning_rate": 1.2087148381973126e-05, + "loss": 1.6066, + "step": 3373 + }, + { + "epoch": 0.4502268481451828, + "grad_norm": 1.1709430242560819, + "learning_rate": 1.2082921465593345e-05, + "loss": 1.6409, + "step": 3374 + }, + { + "epoch": 0.4503602882305845, + "grad_norm": 0.9918589860941452, + "learning_rate": 1.2078694160147549e-05, + "loss": 1.5869, + "step": 3375 + }, + { + "epoch": 0.4504937283159861, + "grad_norm": 1.189016080561096, + "learning_rate": 1.2074466466425348e-05, + "loss": 1.6317, + "step": 3376 + }, + { + "epoch": 0.4506271684013878, + "grad_norm": 1.0693280010712716, + "learning_rate": 1.2070238385216431e-05, + "loss": 1.5298, + "step": 3377 + }, + { + "epoch": 0.45076060848678945, + "grad_norm": 0.9802287723386353, + "learning_rate": 1.2066009917310557e-05, + "loss": 1.5671, + "step": 3378 + }, + { + "epoch": 0.45089404857219106, + "grad_norm": 1.0001980093487308, + "learning_rate": 1.2061781063497549e-05, + "loss": 1.57, + "step": 3379 + }, + { + "epoch": 0.45102748865759273, + "grad_norm": 1.1240347451070862, + "learning_rate": 1.2057551824567315e-05, + "loss": 1.6214, + "step": 3380 + }, + { + "epoch": 0.4511609287429944, + "grad_norm": 1.016259709659121, + "learning_rate": 1.2053322201309827e-05, + "loss": 1.5418, + "step": 3381 + }, + { + "epoch": 0.4512943688283961, + "grad_norm": 0.9810480790681763, + "learning_rate": 1.2049092194515129e-05, + "loss": 1.5886, + "step": 3382 + }, + { + "epoch": 0.4514278089137977, + "grad_norm": 1.0999439267701696, + "learning_rate": 1.2044861804973339e-05, + "loss": 1.6381, + "step": 3383 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 0.9599654396714106, + "learning_rate": 1.2040631033474645e-05, + "loss": 1.5533, + "step": 3384 + }, + { + "epoch": 0.45169468908460103, + "grad_norm": 0.9466851026826987, + "learning_rate": 1.2036399880809307e-05, + "loss": 1.5944, + "step": 3385 + }, + { + "epoch": 0.45182812917000265, + "grad_norm": 0.9678219133269332, + "learning_rate": 1.2032168347767656e-05, + "loss": 1.6415, + "step": 3386 + }, + { + "epoch": 0.4519615692554043, + "grad_norm": 1.0940832917339585, + "learning_rate": 1.2027936435140097e-05, + "loss": 1.5896, + "step": 3387 + }, + { + "epoch": 0.452095009340806, + "grad_norm": 1.352412592314921, + "learning_rate": 1.2023704143717099e-05, + "loss": 1.5859, + "step": 3388 + }, + { + "epoch": 0.45222844942620766, + "grad_norm": 1.0442749639629683, + "learning_rate": 
1.2019471474289209e-05, + "loss": 1.603, + "step": 3389 + }, + { + "epoch": 0.45236188951160927, + "grad_norm": 1.1089040085027504, + "learning_rate": 1.2015238427647039e-05, + "loss": 1.5855, + "step": 3390 + }, + { + "epoch": 0.45249532959701094, + "grad_norm": 1.172896894033313, + "learning_rate": 1.2011005004581275e-05, + "loss": 1.5967, + "step": 3391 + }, + { + "epoch": 0.4526287696824126, + "grad_norm": 1.0422201350270612, + "learning_rate": 1.2006771205882673e-05, + "loss": 1.588, + "step": 3392 + }, + { + "epoch": 0.4527622097678142, + "grad_norm": 0.9833998073404502, + "learning_rate": 1.2002537032342054e-05, + "loss": 1.6304, + "step": 3393 + }, + { + "epoch": 0.4528956498532159, + "grad_norm": 1.1092253938233356, + "learning_rate": 1.1998302484750322e-05, + "loss": 1.6304, + "step": 3394 + }, + { + "epoch": 0.45302908993861757, + "grad_norm": 1.0958444605435562, + "learning_rate": 1.1994067563898435e-05, + "loss": 1.5521, + "step": 3395 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 0.9941289195859265, + "learning_rate": 1.1989832270577432e-05, + "loss": 1.5917, + "step": 3396 + }, + { + "epoch": 0.45329597010942085, + "grad_norm": 1.0144543411061597, + "learning_rate": 1.1985596605578413e-05, + "loss": 1.5596, + "step": 3397 + }, + { + "epoch": 0.4534294101948225, + "grad_norm": 0.9660753780508365, + "learning_rate": 1.198136056969256e-05, + "loss": 1.5673, + "step": 3398 + }, + { + "epoch": 0.4535628502802242, + "grad_norm": 1.1410534595473867, + "learning_rate": 1.1977124163711108e-05, + "loss": 1.6167, + "step": 3399 + }, + { + "epoch": 0.4536962903656258, + "grad_norm": 0.9906940152019744, + "learning_rate": 1.1972887388425374e-05, + "loss": 1.5943, + "step": 3400 + }, + { + "epoch": 0.4538297304510275, + "grad_norm": 1.1280922031572023, + "learning_rate": 1.1968650244626733e-05, + "loss": 1.6018, + "step": 3401 + }, + { + "epoch": 0.45396317053642915, + "grad_norm": 1.2472440687519395, + "learning_rate": 1.1964412733106648e-05, + "loss": 1.5388, + "step": 3402 + }, + { + "epoch": 0.4540966106218308, + "grad_norm": 1.0569580420732267, + "learning_rate": 1.1960174854656623e-05, + "loss": 1.5699, + "step": 3403 + }, + { + "epoch": 0.45423005070723244, + "grad_norm": 1.0325493795551215, + "learning_rate": 1.1955936610068257e-05, + "loss": 1.6212, + "step": 3404 + }, + { + "epoch": 0.4543634907926341, + "grad_norm": 0.9811755437676621, + "learning_rate": 1.1951698000133203e-05, + "loss": 1.6113, + "step": 3405 + }, + { + "epoch": 0.4544969308780358, + "grad_norm": 1.2389436209658955, + "learning_rate": 1.1947459025643177e-05, + "loss": 1.6252, + "step": 3406 + }, + { + "epoch": 0.4546303709634374, + "grad_norm": 1.0656924772924845, + "learning_rate": 1.1943219687389984e-05, + "loss": 1.6018, + "step": 3407 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 1.1894263785585688, + "learning_rate": 1.1938979986165476e-05, + "loss": 1.6683, + "step": 3408 + }, + { + "epoch": 0.45489725113424073, + "grad_norm": 0.9518835451824228, + "learning_rate": 1.193473992276158e-05, + "loss": 1.6263, + "step": 3409 + }, + { + "epoch": 0.4550306912196424, + "grad_norm": 0.9790868312599547, + "learning_rate": 1.1930499497970296e-05, + "loss": 1.6023, + "step": 3410 + }, + { + "epoch": 0.455164131305044, + "grad_norm": 1.0311273154214786, + "learning_rate": 1.1926258712583685e-05, + "loss": 1.6393, + "step": 3411 + }, + { + "epoch": 0.4552975713904457, + "grad_norm": 0.9778706613876423, + "learning_rate": 1.192201756739388e-05, + "loss": 1.5959, + "step": 3412 + }, + { + "epoch": 
0.45543101147584736, + "grad_norm": 1.0265470549844626, + "learning_rate": 1.1917776063193073e-05, + "loss": 1.5936, + "step": 3413 + }, + { + "epoch": 0.455564451561249, + "grad_norm": 1.0845896709498193, + "learning_rate": 1.1913534200773536e-05, + "loss": 1.5618, + "step": 3414 + }, + { + "epoch": 0.45569789164665064, + "grad_norm": 0.9625869411364787, + "learning_rate": 1.1909291980927592e-05, + "loss": 1.5905, + "step": 3415 + }, + { + "epoch": 0.4558313317320523, + "grad_norm": 1.072697736476755, + "learning_rate": 1.1905049404447649e-05, + "loss": 1.5894, + "step": 3416 + }, + { + "epoch": 0.455964771817454, + "grad_norm": 1.142515736949019, + "learning_rate": 1.1900806472126162e-05, + "loss": 1.5963, + "step": 3417 + }, + { + "epoch": 0.4560982119028556, + "grad_norm": 0.9816650958835474, + "learning_rate": 1.189656318475567e-05, + "loss": 1.6364, + "step": 3418 + }, + { + "epoch": 0.45623165198825727, + "grad_norm": 0.9293454502463179, + "learning_rate": 1.189231954312877e-05, + "loss": 1.5843, + "step": 3419 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 1.2000199775285119, + "learning_rate": 1.188807554803812e-05, + "loss": 1.6047, + "step": 3420 + }, + { + "epoch": 0.45649853215906055, + "grad_norm": 0.9742674279366978, + "learning_rate": 1.1883831200276459e-05, + "loss": 1.6254, + "step": 3421 + }, + { + "epoch": 0.4566319722444622, + "grad_norm": 1.1378422279774119, + "learning_rate": 1.1879586500636574e-05, + "loss": 1.5892, + "step": 3422 + }, + { + "epoch": 0.4567654123298639, + "grad_norm": 1.1369928812081609, + "learning_rate": 1.1875341449911333e-05, + "loss": 1.5573, + "step": 3423 + }, + { + "epoch": 0.45689885241526557, + "grad_norm": 0.9878064111116921, + "learning_rate": 1.1871096048893662e-05, + "loss": 1.6357, + "step": 3424 + }, + { + "epoch": 0.4570322925006672, + "grad_norm": 1.0138596892355174, + "learning_rate": 1.1866850298376549e-05, + "loss": 1.6267, + "step": 3425 + }, + { + "epoch": 0.45716573258606885, + "grad_norm": 1.03109296011423, + "learning_rate": 1.1862604199153058e-05, + "loss": 1.6326, + "step": 3426 + }, + { + "epoch": 0.4572991726714705, + "grad_norm": 0.983321243406952, + "learning_rate": 1.1858357752016307e-05, + "loss": 1.6337, + "step": 3427 + }, + { + "epoch": 0.45743261275687214, + "grad_norm": 1.069437667120727, + "learning_rate": 1.1854110957759487e-05, + "loss": 1.5948, + "step": 3428 + }, + { + "epoch": 0.4575660528422738, + "grad_norm": 0.9630689249149076, + "learning_rate": 1.1849863817175848e-05, + "loss": 1.6009, + "step": 3429 + }, + { + "epoch": 0.4576994929276755, + "grad_norm": 1.195697896821968, + "learning_rate": 1.1845616331058714e-05, + "loss": 1.6401, + "step": 3430 + }, + { + "epoch": 0.45783293301307715, + "grad_norm": 1.0373880035420473, + "learning_rate": 1.1841368500201457e-05, + "loss": 1.5717, + "step": 3431 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 1.0152163282536957, + "learning_rate": 1.1837120325397533e-05, + "loss": 1.6412, + "step": 3432 + }, + { + "epoch": 0.45809981318388043, + "grad_norm": 0.9465550717350613, + "learning_rate": 1.1832871807440448e-05, + "loss": 1.6038, + "step": 3433 + }, + { + "epoch": 0.4582332532692821, + "grad_norm": 0.9393643634807096, + "learning_rate": 1.1828622947123774e-05, + "loss": 1.542, + "step": 3434 + }, + { + "epoch": 0.4583666933546838, + "grad_norm": 0.9998488486015676, + "learning_rate": 1.1824373745241159e-05, + "loss": 1.6714, + "step": 3435 + }, + { + "epoch": 0.4585001334400854, + "grad_norm": 1.0463357172364034, + "learning_rate": 
1.1820124202586294e-05, + "loss": 1.6897, + "step": 3436 + }, + { + "epoch": 0.45863357352548706, + "grad_norm": 0.9977756148556246, + "learning_rate": 1.1815874319952954e-05, + "loss": 1.5722, + "step": 3437 + }, + { + "epoch": 0.45876701361088873, + "grad_norm": 0.9781228945231666, + "learning_rate": 1.1811624098134963e-05, + "loss": 1.5516, + "step": 3438 + }, + { + "epoch": 0.45890045369629034, + "grad_norm": 0.9515422839609833, + "learning_rate": 1.180737353792622e-05, + "loss": 1.6011, + "step": 3439 + }, + { + "epoch": 0.459033893781692, + "grad_norm": 1.099348617644577, + "learning_rate": 1.1803122640120675e-05, + "loss": 1.5827, + "step": 3440 + }, + { + "epoch": 0.4591673338670937, + "grad_norm": 1.1211811802963583, + "learning_rate": 1.1798871405512352e-05, + "loss": 1.6251, + "step": 3441 + }, + { + "epoch": 0.45930077395249536, + "grad_norm": 0.9456734985827367, + "learning_rate": 1.1794619834895329e-05, + "loss": 1.5872, + "step": 3442 + }, + { + "epoch": 0.45943421403789697, + "grad_norm": 1.0345174270836286, + "learning_rate": 1.1790367929063756e-05, + "loss": 1.5859, + "step": 3443 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 1.1988418117321362, + "learning_rate": 1.1786115688811836e-05, + "loss": 1.6008, + "step": 3444 + }, + { + "epoch": 0.4597010942087003, + "grad_norm": 0.9823921150047465, + "learning_rate": 1.1781863114933845e-05, + "loss": 1.5628, + "step": 3445 + }, + { + "epoch": 0.4598345342941019, + "grad_norm": 0.9709054414773352, + "learning_rate": 1.1777610208224107e-05, + "loss": 1.592, + "step": 3446 + }, + { + "epoch": 0.4599679743795036, + "grad_norm": 0.913819442194726, + "learning_rate": 1.1773356969477023e-05, + "loss": 1.5924, + "step": 3447 + }, + { + "epoch": 0.46010141446490527, + "grad_norm": 0.9314527155118181, + "learning_rate": 1.1769103399487047e-05, + "loss": 1.5904, + "step": 3448 + }, + { + "epoch": 0.46023485455030694, + "grad_norm": 1.132160179087773, + "learning_rate": 1.1764849499048699e-05, + "loss": 1.6422, + "step": 3449 + }, + { + "epoch": 0.46036829463570855, + "grad_norm": 1.2291620905625844, + "learning_rate": 1.1760595268956556e-05, + "loss": 1.5999, + "step": 3450 + }, + { + "epoch": 0.4605017347211102, + "grad_norm": 0.990821816789439, + "learning_rate": 1.1756340710005264e-05, + "loss": 1.6038, + "step": 3451 + }, + { + "epoch": 0.4606351748065119, + "grad_norm": 1.1734233511272216, + "learning_rate": 1.175208582298952e-05, + "loss": 1.5486, + "step": 3452 + }, + { + "epoch": 0.4607686148919135, + "grad_norm": 1.0603222014465778, + "learning_rate": 1.1747830608704098e-05, + "loss": 1.5833, + "step": 3453 + }, + { + "epoch": 0.4609020549773152, + "grad_norm": 0.9788386077828661, + "learning_rate": 1.1743575067943813e-05, + "loss": 1.5973, + "step": 3454 + }, + { + "epoch": 0.46103549506271685, + "grad_norm": 1.1654930199086528, + "learning_rate": 1.1739319201503561e-05, + "loss": 1.6585, + "step": 3455 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 0.975258594253613, + "learning_rate": 1.1735063010178283e-05, + "loss": 1.5713, + "step": 3456 + }, + { + "epoch": 0.46130237523352013, + "grad_norm": 1.2641032198352828, + "learning_rate": 1.1730806494762987e-05, + "loss": 1.5982, + "step": 3457 + }, + { + "epoch": 0.4614358153189218, + "grad_norm": 1.242994584150048, + "learning_rate": 1.1726549656052748e-05, + "loss": 1.6169, + "step": 3458 + }, + { + "epoch": 0.4615692554043235, + "grad_norm": 0.9635164521389502, + "learning_rate": 1.1722292494842688e-05, + "loss": 1.598, + "step": 3459 + }, + { + "epoch": 
0.4617026954897251, + "grad_norm": 0.9513310854478186, + "learning_rate": 1.1718035011928002e-05, + "loss": 1.5811, + "step": 3460 + }, + { + "epoch": 0.46183613557512676, + "grad_norm": 0.946112789103914, + "learning_rate": 1.1713777208103933e-05, + "loss": 1.5948, + "step": 3461 + }, + { + "epoch": 0.46196957566052843, + "grad_norm": 0.9946176475506616, + "learning_rate": 1.1709519084165797e-05, + "loss": 1.6119, + "step": 3462 + }, + { + "epoch": 0.4621030157459301, + "grad_norm": 0.9874079907996892, + "learning_rate": 1.1705260640908955e-05, + "loss": 1.6237, + "step": 3463 + }, + { + "epoch": 0.4622364558313317, + "grad_norm": 1.7891192519617156, + "learning_rate": 1.1701001879128843e-05, + "loss": 1.5662, + "step": 3464 + }, + { + "epoch": 0.4623698959167334, + "grad_norm": 1.0421651256059734, + "learning_rate": 1.1696742799620946e-05, + "loss": 1.6307, + "step": 3465 + }, + { + "epoch": 0.46250333600213506, + "grad_norm": 0.979356707410168, + "learning_rate": 1.1692483403180814e-05, + "loss": 1.607, + "step": 3466 + }, + { + "epoch": 0.46263677608753667, + "grad_norm": 1.1226620423723173, + "learning_rate": 1.1688223690604052e-05, + "loss": 1.5689, + "step": 3467 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 1.0965605702509604, + "learning_rate": 1.1683963662686324e-05, + "loss": 1.5973, + "step": 3468 + }, + { + "epoch": 0.46290365625834, + "grad_norm": 1.0921547887155878, + "learning_rate": 1.167970332022336e-05, + "loss": 1.5853, + "step": 3469 + }, + { + "epoch": 0.4630370963437417, + "grad_norm": 1.1736803003382774, + "learning_rate": 1.1675442664010935e-05, + "loss": 1.5885, + "step": 3470 + }, + { + "epoch": 0.4631705364291433, + "grad_norm": 1.082987731836864, + "learning_rate": 1.1671181694844897e-05, + "loss": 1.6115, + "step": 3471 + }, + { + "epoch": 0.46330397651454497, + "grad_norm": 0.9708478904094577, + "learning_rate": 1.1666920413521146e-05, + "loss": 1.597, + "step": 3472 + }, + { + "epoch": 0.46343741659994664, + "grad_norm": 1.0126029044158875, + "learning_rate": 1.1662658820835639e-05, + "loss": 1.6249, + "step": 3473 + }, + { + "epoch": 0.46357085668534825, + "grad_norm": 1.0207349433202457, + "learning_rate": 1.1658396917584397e-05, + "loss": 1.5908, + "step": 3474 + }, + { + "epoch": 0.4637042967707499, + "grad_norm": 0.9372775427319033, + "learning_rate": 1.1654134704563492e-05, + "loss": 1.6199, + "step": 3475 + }, + { + "epoch": 0.4638377368561516, + "grad_norm": 1.1272128181343017, + "learning_rate": 1.1649872182569058e-05, + "loss": 1.6617, + "step": 3476 + }, + { + "epoch": 0.46397117694155326, + "grad_norm": 1.110304791632166, + "learning_rate": 1.1645609352397282e-05, + "loss": 1.6041, + "step": 3477 + }, + { + "epoch": 0.4641046170269549, + "grad_norm": 1.0017766303846791, + "learning_rate": 1.1641346214844417e-05, + "loss": 1.6007, + "step": 3478 + }, + { + "epoch": 0.46423805711235655, + "grad_norm": 0.9880107584390453, + "learning_rate": 1.1637082770706764e-05, + "loss": 1.6511, + "step": 3479 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 1.2922751760310007, + "learning_rate": 1.1632819020780693e-05, + "loss": 1.6072, + "step": 3480 + }, + { + "epoch": 0.46450493728315984, + "grad_norm": 0.9883475103271387, + "learning_rate": 1.1628554965862615e-05, + "loss": 1.6685, + "step": 3481 + }, + { + "epoch": 0.4646383773685615, + "grad_norm": 1.1928187505375785, + "learning_rate": 1.1624290606749012e-05, + "loss": 1.6114, + "step": 3482 + }, + { + "epoch": 0.4647718174539632, + "grad_norm": 1.0066345866358597, + "learning_rate": 
1.1620025944236418e-05, + "loss": 1.6761, + "step": 3483 + }, + { + "epoch": 0.46490525753936485, + "grad_norm": 1.0953896264697256, + "learning_rate": 1.161576097912142e-05, + "loss": 1.5629, + "step": 3484 + }, + { + "epoch": 0.46503869762476646, + "grad_norm": 0.984883775282697, + "learning_rate": 1.161149571220067e-05, + "loss": 1.5929, + "step": 3485 + }, + { + "epoch": 0.46517213771016813, + "grad_norm": 0.9445601434963037, + "learning_rate": 1.1607230144270866e-05, + "loss": 1.6208, + "step": 3486 + }, + { + "epoch": 0.4653055777955698, + "grad_norm": 0.9240480810034248, + "learning_rate": 1.1602964276128774e-05, + "loss": 1.5202, + "step": 3487 + }, + { + "epoch": 0.4654390178809714, + "grad_norm": 1.2349918808254974, + "learning_rate": 1.1598698108571205e-05, + "loss": 1.5377, + "step": 3488 + }, + { + "epoch": 0.4655724579663731, + "grad_norm": 1.0096021895282354, + "learning_rate": 1.1594431642395027e-05, + "loss": 1.593, + "step": 3489 + }, + { + "epoch": 0.46570589805177476, + "grad_norm": 1.0714154627752588, + "learning_rate": 1.1590164878397174e-05, + "loss": 1.5892, + "step": 3490 + }, + { + "epoch": 0.46583933813717643, + "grad_norm": 0.9628486041384523, + "learning_rate": 1.1585897817374628e-05, + "loss": 1.5884, + "step": 3491 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 0.9752330456513079, + "learning_rate": 1.1581630460124424e-05, + "loss": 1.576, + "step": 3492 + }, + { + "epoch": 0.4661062183079797, + "grad_norm": 0.9802951196533614, + "learning_rate": 1.1577362807443657e-05, + "loss": 1.5754, + "step": 3493 + }, + { + "epoch": 0.4662396583933814, + "grad_norm": 1.1032933740856454, + "learning_rate": 1.1573094860129479e-05, + "loss": 1.5705, + "step": 3494 + }, + { + "epoch": 0.466373098478783, + "grad_norm": 0.9835441668655777, + "learning_rate": 1.1568826618979087e-05, + "loss": 1.6197, + "step": 3495 + }, + { + "epoch": 0.46650653856418467, + "grad_norm": 0.9873490655801218, + "learning_rate": 1.1564558084789749e-05, + "loss": 1.6491, + "step": 3496 + }, + { + "epoch": 0.46663997864958634, + "grad_norm": 1.1431057876156487, + "learning_rate": 1.1560289258358773e-05, + "loss": 1.6006, + "step": 3497 + }, + { + "epoch": 0.466773418734988, + "grad_norm": 1.1765306955508206, + "learning_rate": 1.1556020140483523e-05, + "loss": 1.6422, + "step": 3498 + }, + { + "epoch": 0.4669068588203896, + "grad_norm": 0.900400853861777, + "learning_rate": 1.1551750731961433e-05, + "loss": 1.575, + "step": 3499 + }, + { + "epoch": 0.4670402989057913, + "grad_norm": 0.957183643780711, + "learning_rate": 1.1547481033589971e-05, + "loss": 1.5993, + "step": 3500 + }, + { + "epoch": 0.46717373899119297, + "grad_norm": 1.1014233549217831, + "learning_rate": 1.1543211046166672e-05, + "loss": 1.5655, + "step": 3501 + }, + { + "epoch": 0.46730717907659464, + "grad_norm": 0.9656596890342347, + "learning_rate": 1.1538940770489118e-05, + "loss": 1.5974, + "step": 3502 + }, + { + "epoch": 0.46744061916199625, + "grad_norm": 1.1217113813358535, + "learning_rate": 1.1534670207354952e-05, + "loss": 1.6228, + "step": 3503 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 1.0088247160067236, + "learning_rate": 1.1530399357561861e-05, + "loss": 1.6261, + "step": 3504 + }, + { + "epoch": 0.4677074993327996, + "grad_norm": 1.0056946912456848, + "learning_rate": 1.1526128221907595e-05, + "loss": 1.572, + "step": 3505 + }, + { + "epoch": 0.4678409394182012, + "grad_norm": 0.984689525859442, + "learning_rate": 1.1521856801189954e-05, + "loss": 1.5713, + "step": 3506 + }, + { + "epoch": 
0.4679743795036029, + "grad_norm": 0.9741585900265354, + "learning_rate": 1.1517585096206788e-05, + "loss": 1.587, + "step": 3507 + }, + { + "epoch": 0.46810781958900455, + "grad_norm": 0.9889481536506061, + "learning_rate": 1.1513313107756007e-05, + "loss": 1.5535, + "step": 3508 + }, + { + "epoch": 0.4682412596744062, + "grad_norm": 0.9996665079177771, + "learning_rate": 1.1509040836635568e-05, + "loss": 1.5746, + "step": 3509 + }, + { + "epoch": 0.46837469975980783, + "grad_norm": 1.1930390389598884, + "learning_rate": 1.1504768283643476e-05, + "loss": 1.5963, + "step": 3510 + }, + { + "epoch": 0.4685081398452095, + "grad_norm": 1.015228011873478, + "learning_rate": 1.1500495449577806e-05, + "loss": 1.5463, + "step": 3511 + }, + { + "epoch": 0.4686415799306112, + "grad_norm": 0.9872673249391366, + "learning_rate": 1.149622233523667e-05, + "loss": 1.5713, + "step": 3512 + }, + { + "epoch": 0.4687750200160128, + "grad_norm": 1.1412360147938814, + "learning_rate": 1.1491948941418234e-05, + "loss": 1.6021, + "step": 3513 + }, + { + "epoch": 0.46890846010141446, + "grad_norm": 1.0170872902758004, + "learning_rate": 1.1487675268920721e-05, + "loss": 1.6049, + "step": 3514 + }, + { + "epoch": 0.46904190018681613, + "grad_norm": 0.9653090890079009, + "learning_rate": 1.148340131854241e-05, + "loss": 1.5985, + "step": 3515 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 1.0182190080319995, + "learning_rate": 1.147912709108162e-05, + "loss": 1.6062, + "step": 3516 + }, + { + "epoch": 0.4693087803576194, + "grad_norm": 1.1931214012383156, + "learning_rate": 1.1474852587336731e-05, + "loss": 1.6188, + "step": 3517 + }, + { + "epoch": 0.4694422204430211, + "grad_norm": 0.9521461072306361, + "learning_rate": 1.147057780810617e-05, + "loss": 1.546, + "step": 3518 + }, + { + "epoch": 0.46957566052842276, + "grad_norm": 1.0407743299860555, + "learning_rate": 1.1466302754188417e-05, + "loss": 1.6001, + "step": 3519 + }, + { + "epoch": 0.46970910061382437, + "grad_norm": 1.0007245963172906, + "learning_rate": 1.1462027426382002e-05, + "loss": 1.5495, + "step": 3520 + }, + { + "epoch": 0.46984254069922604, + "grad_norm": 1.00485415407387, + "learning_rate": 1.145775182548551e-05, + "loss": 1.5786, + "step": 3521 + }, + { + "epoch": 0.4699759807846277, + "grad_norm": 0.9919189415028729, + "learning_rate": 1.1453475952297577e-05, + "loss": 1.6291, + "step": 3522 + }, + { + "epoch": 0.4701094208700294, + "grad_norm": 1.1020789294943216, + "learning_rate": 1.1449199807616882e-05, + "loss": 1.533, + "step": 3523 + }, + { + "epoch": 0.470242860955431, + "grad_norm": 1.3158857310226424, + "learning_rate": 1.1444923392242165e-05, + "loss": 1.6232, + "step": 3524 + }, + { + "epoch": 0.47037630104083267, + "grad_norm": 1.1748271233312169, + "learning_rate": 1.1440646706972207e-05, + "loss": 1.5839, + "step": 3525 + }, + { + "epoch": 0.47050974112623434, + "grad_norm": 0.9926376613394198, + "learning_rate": 1.143636975260585e-05, + "loss": 1.6421, + "step": 3526 + }, + { + "epoch": 0.47064318121163595, + "grad_norm": 0.9974971729341541, + "learning_rate": 1.1432092529941972e-05, + "loss": 1.6288, + "step": 3527 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 1.07546662467877, + "learning_rate": 1.142781503977952e-05, + "loss": 1.597, + "step": 3528 + }, + { + "epoch": 0.4709100613824393, + "grad_norm": 1.0889300149702186, + "learning_rate": 1.1423537282917469e-05, + "loss": 1.5977, + "step": 3529 + }, + { + "epoch": 0.47104350146784096, + "grad_norm": 1.081280310987519, + "learning_rate": 
1.1419259260154864e-05, + "loss": 1.5912, + "step": 3530 + }, + { + "epoch": 0.4711769415532426, + "grad_norm": 1.0372628884781476, + "learning_rate": 1.141498097229079e-05, + "loss": 1.6334, + "step": 3531 + }, + { + "epoch": 0.47131038163864425, + "grad_norm": 1.1883125180451641, + "learning_rate": 1.1410702420124377e-05, + "loss": 1.6159, + "step": 3532 + }, + { + "epoch": 0.4714438217240459, + "grad_norm": 0.9457223363765193, + "learning_rate": 1.1406423604454816e-05, + "loss": 1.6048, + "step": 3533 + }, + { + "epoch": 0.47157726180944753, + "grad_norm": 1.1775437736757077, + "learning_rate": 1.1402144526081338e-05, + "loss": 1.5696, + "step": 3534 + }, + { + "epoch": 0.4717107018948492, + "grad_norm": 0.9405003660186544, + "learning_rate": 1.1397865185803227e-05, + "loss": 1.6001, + "step": 3535 + }, + { + "epoch": 0.4718441419802509, + "grad_norm": 0.9593945256058269, + "learning_rate": 1.1393585584419812e-05, + "loss": 1.5745, + "step": 3536 + }, + { + "epoch": 0.47197758206565255, + "grad_norm": 1.037264876154079, + "learning_rate": 1.1389305722730478e-05, + "loss": 1.5845, + "step": 3537 + }, + { + "epoch": 0.47211102215105416, + "grad_norm": 1.0164422235337136, + "learning_rate": 1.1385025601534654e-05, + "loss": 1.5994, + "step": 3538 + }, + { + "epoch": 0.47224446223645583, + "grad_norm": 1.0015580982419405, + "learning_rate": 1.1380745221631813e-05, + "loss": 1.5474, + "step": 3539 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 0.9922512359099007, + "learning_rate": 1.137646458382149e-05, + "loss": 1.6085, + "step": 3540 + }, + { + "epoch": 0.4725113424072591, + "grad_norm": 0.9513030545535126, + "learning_rate": 1.137218368890325e-05, + "loss": 1.583, + "step": 3541 + }, + { + "epoch": 0.4726447824926608, + "grad_norm": 1.0137343494231201, + "learning_rate": 1.1367902537676722e-05, + "loss": 1.6333, + "step": 3542 + }, + { + "epoch": 0.47277822257806246, + "grad_norm": 1.053043019819065, + "learning_rate": 1.1363621130941573e-05, + "loss": 1.5458, + "step": 3543 + }, + { + "epoch": 0.4729116626634641, + "grad_norm": 0.991708813143814, + "learning_rate": 1.1359339469497525e-05, + "loss": 1.5617, + "step": 3544 + }, + { + "epoch": 0.47304510274886574, + "grad_norm": 0.9881479349845488, + "learning_rate": 1.1355057554144338e-05, + "loss": 1.5599, + "step": 3545 + }, + { + "epoch": 0.4731785428342674, + "grad_norm": 0.9677599502311968, + "learning_rate": 1.1350775385681827e-05, + "loss": 1.661, + "step": 3546 + }, + { + "epoch": 0.4733119829196691, + "grad_norm": 0.9637140530562013, + "learning_rate": 1.1346492964909856e-05, + "loss": 1.5783, + "step": 3547 + }, + { + "epoch": 0.4734454230050707, + "grad_norm": 1.0174979764772296, + "learning_rate": 1.1342210292628327e-05, + "loss": 1.6682, + "step": 3548 + }, + { + "epoch": 0.47357886309047237, + "grad_norm": 0.956767731266162, + "learning_rate": 1.1337927369637198e-05, + "loss": 1.6158, + "step": 3549 + }, + { + "epoch": 0.47371230317587404, + "grad_norm": 1.1908661498029431, + "learning_rate": 1.1333644196736468e-05, + "loss": 1.5445, + "step": 3550 + }, + { + "epoch": 0.4738457432612757, + "grad_norm": 1.0665948861651355, + "learning_rate": 1.132936077472619e-05, + "loss": 1.6007, + "step": 3551 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 0.9949979990909358, + "learning_rate": 1.1325077104406455e-05, + "loss": 1.6371, + "step": 3552 + }, + { + "epoch": 0.474112623432079, + "grad_norm": 1.1119040943652865, + "learning_rate": 1.1320793186577398e-05, + "loss": 1.5935, + "step": 3553 + }, + { + "epoch": 
0.47424606351748066, + "grad_norm": 1.2024374524083807, + "learning_rate": 1.1316509022039215e-05, + "loss": 1.5906, + "step": 3554 + }, + { + "epoch": 0.4743795036028823, + "grad_norm": 0.9908007603433848, + "learning_rate": 1.1312224611592132e-05, + "loss": 1.6032, + "step": 3555 + }, + { + "epoch": 0.47451294368828395, + "grad_norm": 1.0257647339374822, + "learning_rate": 1.1307939956036437e-05, + "loss": 1.6118, + "step": 3556 + }, + { + "epoch": 0.4746463837736856, + "grad_norm": 0.9890146650400216, + "learning_rate": 1.1303655056172447e-05, + "loss": 1.6011, + "step": 3557 + }, + { + "epoch": 0.4747798238590873, + "grad_norm": 8.167075213418878, + "learning_rate": 1.1299369912800537e-05, + "loss": 1.5942, + "step": 3558 + }, + { + "epoch": 0.4749132639444889, + "grad_norm": 0.9783430062632217, + "learning_rate": 1.1295084526721119e-05, + "loss": 1.5433, + "step": 3559 + }, + { + "epoch": 0.4750467040298906, + "grad_norm": 1.0096065716017133, + "learning_rate": 1.129079889873466e-05, + "loss": 1.6455, + "step": 3560 + }, + { + "epoch": 0.47518014411529225, + "grad_norm": 1.0537017716025912, + "learning_rate": 1.1286513029641657e-05, + "loss": 1.5721, + "step": 3561 + }, + { + "epoch": 0.47531358420069386, + "grad_norm": 1.1698259882179534, + "learning_rate": 1.1282226920242669e-05, + "loss": 1.5693, + "step": 3562 + }, + { + "epoch": 0.47544702428609553, + "grad_norm": 1.096536277070531, + "learning_rate": 1.1277940571338296e-05, + "loss": 1.651, + "step": 3563 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 0.9870886196000643, + "learning_rate": 1.1273653983729169e-05, + "loss": 1.5927, + "step": 3564 + }, + { + "epoch": 0.4757139044568989, + "grad_norm": 1.0227690927337232, + "learning_rate": 1.1269367158215982e-05, + "loss": 1.5799, + "step": 3565 + }, + { + "epoch": 0.4758473445423005, + "grad_norm": 0.981433243463809, + "learning_rate": 1.1265080095599459e-05, + "loss": 1.6033, + "step": 3566 + }, + { + "epoch": 0.47598078462770216, + "grad_norm": 0.9241540078980625, + "learning_rate": 1.126079279668038e-05, + "loss": 1.6059, + "step": 3567 + }, + { + "epoch": 0.47611422471310383, + "grad_norm": 0.9966040467022464, + "learning_rate": 1.1256505262259561e-05, + "loss": 1.6017, + "step": 3568 + }, + { + "epoch": 0.47624766479850544, + "grad_norm": 1.00788906341642, + "learning_rate": 1.1252217493137863e-05, + "loss": 1.6031, + "step": 3569 + }, + { + "epoch": 0.4763811048839071, + "grad_norm": 1.1866583647363547, + "learning_rate": 1.1247929490116198e-05, + "loss": 1.5722, + "step": 3570 + }, + { + "epoch": 0.4765145449693088, + "grad_norm": 1.0142892640611485, + "learning_rate": 1.124364125399551e-05, + "loss": 1.6128, + "step": 3571 + }, + { + "epoch": 0.47664798505471045, + "grad_norm": 0.9882028516457761, + "learning_rate": 1.1239352785576795e-05, + "loss": 1.6019, + "step": 3572 + }, + { + "epoch": 0.47678142514011207, + "grad_norm": 0.9485363827235442, + "learning_rate": 1.1235064085661094e-05, + "loss": 1.5451, + "step": 3573 + }, + { + "epoch": 0.47691486522551374, + "grad_norm": 1.0423832728014304, + "learning_rate": 1.1230775155049478e-05, + "loss": 1.6573, + "step": 3574 + }, + { + "epoch": 0.4770483053109154, + "grad_norm": 0.9522899518242469, + "learning_rate": 1.122648599454308e-05, + "loss": 1.5389, + "step": 3575 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 1.0795989027553348, + "learning_rate": 1.122219660494306e-05, + "loss": 1.5245, + "step": 3576 + }, + { + "epoch": 0.4773151854817187, + "grad_norm": 0.9703295676591189, + "learning_rate": 
1.121790698705063e-05, + "loss": 1.6262, + "step": 3577 + }, + { + "epoch": 0.47744862556712037, + "grad_norm": 0.9466306660145242, + "learning_rate": 1.1213617141667042e-05, + "loss": 1.5771, + "step": 3578 + }, + { + "epoch": 0.47758206565252204, + "grad_norm": 1.1652671591625094, + "learning_rate": 1.1209327069593587e-05, + "loss": 1.6108, + "step": 3579 + }, + { + "epoch": 0.47771550573792365, + "grad_norm": 0.9637891663914724, + "learning_rate": 1.1205036771631606e-05, + "loss": 1.6177, + "step": 3580 + }, + { + "epoch": 0.4778489458233253, + "grad_norm": 1.0591406701546173, + "learning_rate": 1.1200746248582478e-05, + "loss": 1.6277, + "step": 3581 + }, + { + "epoch": 0.477982385908727, + "grad_norm": 1.075013785861123, + "learning_rate": 1.1196455501247619e-05, + "loss": 1.6509, + "step": 3582 + }, + { + "epoch": 0.47811582599412866, + "grad_norm": 1.0099560088443318, + "learning_rate": 1.1192164530428495e-05, + "loss": 1.5818, + "step": 3583 + }, + { + "epoch": 0.4782492660795303, + "grad_norm": 1.2309318054357847, + "learning_rate": 1.1187873336926609e-05, + "loss": 1.5818, + "step": 3584 + }, + { + "epoch": 0.47838270616493195, + "grad_norm": 1.0494190297239068, + "learning_rate": 1.1183581921543507e-05, + "loss": 1.5966, + "step": 3585 + }, + { + "epoch": 0.4785161462503336, + "grad_norm": 0.9551043069869481, + "learning_rate": 1.1179290285080782e-05, + "loss": 1.5893, + "step": 3586 + }, + { + "epoch": 0.47864958633573523, + "grad_norm": 0.9368775120300123, + "learning_rate": 1.1174998428340055e-05, + "loss": 1.6137, + "step": 3587 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 1.1506412501949679, + "learning_rate": 1.1170706352123002e-05, + "loss": 1.6004, + "step": 3588 + }, + { + "epoch": 0.4789164665065386, + "grad_norm": 1.0842690119366971, + "learning_rate": 1.116641405723133e-05, + "loss": 1.6812, + "step": 3589 + }, + { + "epoch": 0.47904990659194024, + "grad_norm": 1.1775677070138726, + "learning_rate": 1.1162121544466794e-05, + "loss": 1.6091, + "step": 3590 + }, + { + "epoch": 0.47918334667734186, + "grad_norm": 0.9567709845629377, + "learning_rate": 1.1157828814631179e-05, + "loss": 1.596, + "step": 3591 + }, + { + "epoch": 0.47931678676274353, + "grad_norm": 0.970022267558387, + "learning_rate": 1.115353586852633e-05, + "loss": 1.657, + "step": 3592 + }, + { + "epoch": 0.4794502268481452, + "grad_norm": 0.9647653037200512, + "learning_rate": 1.1149242706954111e-05, + "loss": 1.5747, + "step": 3593 + }, + { + "epoch": 0.4795836669335468, + "grad_norm": 1.0040826902190914, + "learning_rate": 1.1144949330716441e-05, + "loss": 1.624, + "step": 3594 + }, + { + "epoch": 0.4797171070189485, + "grad_norm": 7.924396978950378, + "learning_rate": 1.1140655740615274e-05, + "loss": 1.6323, + "step": 3595 + }, + { + "epoch": 0.47985054710435016, + "grad_norm": 1.2546815828202986, + "learning_rate": 1.1136361937452595e-05, + "loss": 1.5693, + "step": 3596 + }, + { + "epoch": 0.4799839871897518, + "grad_norm": 1.0142710081731812, + "learning_rate": 1.113206792203045e-05, + "loss": 1.5772, + "step": 3597 + }, + { + "epoch": 0.48011742727515344, + "grad_norm": 0.9910079104514572, + "learning_rate": 1.1127773695150904e-05, + "loss": 1.6178, + "step": 3598 + }, + { + "epoch": 0.4802508673605551, + "grad_norm": 0.9866567619524242, + "learning_rate": 1.1123479257616072e-05, + "loss": 1.5508, + "step": 3599 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 1.0128213856045458, + "learning_rate": 1.1119184610228103e-05, + "loss": 1.5817, + "step": 3600 + }, + { + "epoch": 
0.4805177475313584, + "grad_norm": 1.07285721769306, + "learning_rate": 1.1114889753789193e-05, + "loss": 1.5987, + "step": 3601 + }, + { + "epoch": 0.48065118761676007, + "grad_norm": 1.1570972062497342, + "learning_rate": 1.1110594689101572e-05, + "loss": 1.5853, + "step": 3602 + }, + { + "epoch": 0.48078462770216174, + "grad_norm": 0.9546422737264942, + "learning_rate": 1.1106299416967508e-05, + "loss": 1.5883, + "step": 3603 + }, + { + "epoch": 0.4809180677875634, + "grad_norm": 1.2179319826085693, + "learning_rate": 1.1102003938189308e-05, + "loss": 1.6357, + "step": 3604 + }, + { + "epoch": 0.481051507872965, + "grad_norm": 1.016415996934519, + "learning_rate": 1.1097708253569317e-05, + "loss": 1.5237, + "step": 3605 + }, + { + "epoch": 0.4811849479583667, + "grad_norm": 1.0641665463344543, + "learning_rate": 1.1093412363909926e-05, + "loss": 1.5878, + "step": 3606 + }, + { + "epoch": 0.48131838804376836, + "grad_norm": 0.9711099512141003, + "learning_rate": 1.1089116270013552e-05, + "loss": 1.5518, + "step": 3607 + }, + { + "epoch": 0.48145182812917, + "grad_norm": 1.2233276218733482, + "learning_rate": 1.108481997268266e-05, + "loss": 1.5926, + "step": 3608 + }, + { + "epoch": 0.48158526821457165, + "grad_norm": 1.1743091714878038, + "learning_rate": 1.1080523472719745e-05, + "loss": 1.5618, + "step": 3609 + }, + { + "epoch": 0.4817187082999733, + "grad_norm": 1.226731129476479, + "learning_rate": 1.1076226770927349e-05, + "loss": 1.5961, + "step": 3610 + }, + { + "epoch": 0.481852148385375, + "grad_norm": 1.061342840791177, + "learning_rate": 1.1071929868108046e-05, + "loss": 1.6121, + "step": 3611 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 1.0043889707036777, + "learning_rate": 1.1067632765064449e-05, + "loss": 1.559, + "step": 3612 + }, + { + "epoch": 0.4821190285561783, + "grad_norm": 0.9334874178935394, + "learning_rate": 1.1063335462599208e-05, + "loss": 1.6363, + "step": 3613 + }, + { + "epoch": 0.48225246864157995, + "grad_norm": 0.9541127453212594, + "learning_rate": 1.1059037961515005e-05, + "loss": 1.5859, + "step": 3614 + }, + { + "epoch": 0.48238590872698156, + "grad_norm": 1.2659844329901926, + "learning_rate": 1.1054740262614571e-05, + "loss": 1.6083, + "step": 3615 + }, + { + "epoch": 0.48251934881238323, + "grad_norm": 1.009394331400396, + "learning_rate": 1.1050442366700666e-05, + "loss": 1.5752, + "step": 3616 + }, + { + "epoch": 0.4826527888977849, + "grad_norm": 0.9588149443609268, + "learning_rate": 1.1046144274576085e-05, + "loss": 1.5623, + "step": 3617 + }, + { + "epoch": 0.48278622898318657, + "grad_norm": 1.1186126571235782, + "learning_rate": 1.1041845987043664e-05, + "loss": 1.5748, + "step": 3618 + }, + { + "epoch": 0.4829196690685882, + "grad_norm": 1.1820677624653801, + "learning_rate": 1.1037547504906275e-05, + "loss": 1.5901, + "step": 3619 + }, + { + "epoch": 0.48305310915398986, + "grad_norm": 0.9674980847745737, + "learning_rate": 1.1033248828966825e-05, + "loss": 1.5353, + "step": 3620 + }, + { + "epoch": 0.4831865492393915, + "grad_norm": 1.037410201754664, + "learning_rate": 1.1028949960028257e-05, + "loss": 1.5884, + "step": 3621 + }, + { + "epoch": 0.48331998932479314, + "grad_norm": 1.0179633311651575, + "learning_rate": 1.1024650898893554e-05, + "loss": 1.6185, + "step": 3622 + }, + { + "epoch": 0.4834534294101948, + "grad_norm": 1.024857123528208, + "learning_rate": 1.1020351646365726e-05, + "loss": 1.6043, + "step": 3623 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 1.0553654696487733, + "learning_rate": 
1.1016052203247829e-05, + "loss": 1.5471, + "step": 3624 + }, + { + "epoch": 0.48372030958099815, + "grad_norm": 0.9703280830869259, + "learning_rate": 1.1011752570342949e-05, + "loss": 1.4886, + "step": 3625 + }, + { + "epoch": 0.48385374966639977, + "grad_norm": 0.9916678812963029, + "learning_rate": 1.1007452748454206e-05, + "loss": 1.6389, + "step": 3626 + }, + { + "epoch": 0.48398718975180144, + "grad_norm": 1.1877794657449894, + "learning_rate": 1.1003152738384762e-05, + "loss": 1.6355, + "step": 3627 + }, + { + "epoch": 0.4841206298372031, + "grad_norm": 1.1020608920211157, + "learning_rate": 1.0998852540937806e-05, + "loss": 1.6111, + "step": 3628 + }, + { + "epoch": 0.4842540699226047, + "grad_norm": 0.9894785991525039, + "learning_rate": 1.0994552156916569e-05, + "loss": 1.5638, + "step": 3629 + }, + { + "epoch": 0.4843875100080064, + "grad_norm": 1.3735777211891464, + "learning_rate": 1.0990251587124313e-05, + "loss": 1.5996, + "step": 3630 + }, + { + "epoch": 0.48452095009340806, + "grad_norm": 1.2295882874045658, + "learning_rate": 1.0985950832364333e-05, + "loss": 1.6332, + "step": 3631 + }, + { + "epoch": 0.48465439017880974, + "grad_norm": 1.0441878854812512, + "learning_rate": 1.0981649893439965e-05, + "loss": 1.6525, + "step": 3632 + }, + { + "epoch": 0.48478783026421135, + "grad_norm": 1.0469977037741003, + "learning_rate": 1.0977348771154572e-05, + "loss": 1.5817, + "step": 3633 + }, + { + "epoch": 0.484921270349613, + "grad_norm": 1.0948912787896075, + "learning_rate": 1.0973047466311556e-05, + "loss": 1.5934, + "step": 3634 + }, + { + "epoch": 0.4850547104350147, + "grad_norm": 1.2238531316763264, + "learning_rate": 1.0968745979714355e-05, + "loss": 1.6424, + "step": 3635 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 0.9811824398502238, + "learning_rate": 1.0964444312166432e-05, + "loss": 1.6459, + "step": 3636 + }, + { + "epoch": 0.485321590605818, + "grad_norm": 1.011203803623103, + "learning_rate": 1.0960142464471293e-05, + "loss": 1.5845, + "step": 3637 + }, + { + "epoch": 0.48545503069121965, + "grad_norm": 1.1695581236457981, + "learning_rate": 1.0955840437432472e-05, + "loss": 1.5702, + "step": 3638 + }, + { + "epoch": 0.4855884707766213, + "grad_norm": 1.0248849738040775, + "learning_rate": 1.095153823185354e-05, + "loss": 1.5914, + "step": 3639 + }, + { + "epoch": 0.48572191086202293, + "grad_norm": 1.028338773698459, + "learning_rate": 1.0947235848538103e-05, + "loss": 1.6658, + "step": 3640 + }, + { + "epoch": 0.4858553509474246, + "grad_norm": 0.9565872039882299, + "learning_rate": 1.094293328828979e-05, + "loss": 1.6082, + "step": 3641 + }, + { + "epoch": 0.4859887910328263, + "grad_norm": 0.9438603164321578, + "learning_rate": 1.0938630551912275e-05, + "loss": 1.6025, + "step": 3642 + }, + { + "epoch": 0.48612223111822794, + "grad_norm": 1.1670924197530739, + "learning_rate": 1.0934327640209264e-05, + "loss": 1.5918, + "step": 3643 + }, + { + "epoch": 0.48625567120362956, + "grad_norm": 0.990930041258616, + "learning_rate": 1.0930024553984482e-05, + "loss": 1.5648, + "step": 3644 + }, + { + "epoch": 0.48638911128903123, + "grad_norm": 0.9674206698473387, + "learning_rate": 1.0925721294041704e-05, + "loss": 1.5957, + "step": 3645 + }, + { + "epoch": 0.4865225513744329, + "grad_norm": 1.0275864571669882, + "learning_rate": 1.0921417861184728e-05, + "loss": 1.6259, + "step": 3646 + }, + { + "epoch": 0.4866559914598345, + "grad_norm": 1.0030357183260572, + "learning_rate": 1.091711425621739e-05, + "loss": 1.6057, + "step": 3647 + }, + { + "epoch": 
0.4867894315452362, + "grad_norm": 1.1702999599033863, + "learning_rate": 1.0912810479943546e-05, + "loss": 1.6031, + "step": 3648 + }, + { + "epoch": 0.48692287163063785, + "grad_norm": 1.09322465585248, + "learning_rate": 1.0908506533167096e-05, + "loss": 1.5765, + "step": 3649 + }, + { + "epoch": 0.4870563117160395, + "grad_norm": 4.935499852254243, + "learning_rate": 1.0904202416691973e-05, + "loss": 1.6665, + "step": 3650 + }, + { + "epoch": 0.48718975180144114, + "grad_norm": 1.0758505123866504, + "learning_rate": 1.0899898131322131e-05, + "loss": 1.5891, + "step": 3651 + }, + { + "epoch": 0.4873231918868428, + "grad_norm": 0.9638129734070673, + "learning_rate": 1.0895593677861564e-05, + "loss": 1.6171, + "step": 3652 + }, + { + "epoch": 0.4874566319722445, + "grad_norm": 0.9472603260400863, + "learning_rate": 1.0891289057114297e-05, + "loss": 1.5797, + "step": 3653 + }, + { + "epoch": 0.4875900720576461, + "grad_norm": 1.266655084226231, + "learning_rate": 1.088698426988438e-05, + "loss": 1.5955, + "step": 3654 + }, + { + "epoch": 0.48772351214304777, + "grad_norm": 1.0120153785951085, + "learning_rate": 1.08826793169759e-05, + "loss": 1.5924, + "step": 3655 + }, + { + "epoch": 0.48785695222844944, + "grad_norm": 1.2704389458734962, + "learning_rate": 1.0878374199192974e-05, + "loss": 1.5805, + "step": 3656 + }, + { + "epoch": 0.4879903923138511, + "grad_norm": 1.0968646982460992, + "learning_rate": 1.0874068917339749e-05, + "loss": 1.5939, + "step": 3657 + }, + { + "epoch": 0.4881238323992527, + "grad_norm": 0.9666879260201952, + "learning_rate": 1.08697634722204e-05, + "loss": 1.5857, + "step": 3658 + }, + { + "epoch": 0.4882572724846544, + "grad_norm": 1.0192846733338263, + "learning_rate": 1.0865457864639139e-05, + "loss": 1.5674, + "step": 3659 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 0.9524173836018979, + "learning_rate": 1.08611520954002e-05, + "loss": 1.5765, + "step": 3660 + }, + { + "epoch": 0.4885241526554577, + "grad_norm": 1.0159163718461748, + "learning_rate": 1.0856846165307858e-05, + "loss": 1.5984, + "step": 3661 + }, + { + "epoch": 0.48865759274085935, + "grad_norm": 1.0174232310565294, + "learning_rate": 1.0852540075166404e-05, + "loss": 1.6555, + "step": 3662 + }, + { + "epoch": 0.488791032826261, + "grad_norm": 1.1547851642140878, + "learning_rate": 1.0848233825780171e-05, + "loss": 1.6071, + "step": 3663 + }, + { + "epoch": 0.4889244729116627, + "grad_norm": 1.0165575340976178, + "learning_rate": 1.0843927417953517e-05, + "loss": 1.6243, + "step": 3664 + }, + { + "epoch": 0.4890579129970643, + "grad_norm": 1.0034096707629767, + "learning_rate": 1.0839620852490831e-05, + "loss": 1.5922, + "step": 3665 + }, + { + "epoch": 0.489191353082466, + "grad_norm": 0.9459155508614493, + "learning_rate": 1.083531413019653e-05, + "loss": 1.595, + "step": 3666 + }, + { + "epoch": 0.48932479316786764, + "grad_norm": 0.9796138002566875, + "learning_rate": 1.0831007251875056e-05, + "loss": 1.5732, + "step": 3667 + }, + { + "epoch": 0.48945823325326926, + "grad_norm": 0.9370636379889555, + "learning_rate": 1.0826700218330895e-05, + "loss": 1.6168, + "step": 3668 + }, + { + "epoch": 0.48959167333867093, + "grad_norm": 1.032334040488697, + "learning_rate": 1.082239303036854e-05, + "loss": 1.5999, + "step": 3669 + }, + { + "epoch": 0.4897251134240726, + "grad_norm": 1.2833243357988338, + "learning_rate": 1.0818085688792532e-05, + "loss": 1.5513, + "step": 3670 + }, + { + "epoch": 0.48985855350947427, + "grad_norm": 0.9893354575062938, + "learning_rate": 
1.0813778194407432e-05, + "loss": 1.6065, + "step": 3671 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 0.9731145363428029, + "learning_rate": 1.0809470548017828e-05, + "loss": 1.6603, + "step": 3672 + }, + { + "epoch": 0.49012543368027756, + "grad_norm": 0.966161570856358, + "learning_rate": 1.0805162750428345e-05, + "loss": 1.5408, + "step": 3673 + }, + { + "epoch": 0.4902588737656792, + "grad_norm": 1.1087789778186123, + "learning_rate": 1.0800854802443626e-05, + "loss": 1.5869, + "step": 3674 + }, + { + "epoch": 0.49039231385108084, + "grad_norm": 1.012321782174784, + "learning_rate": 1.0796546704868348e-05, + "loss": 1.6429, + "step": 3675 + }, + { + "epoch": 0.4905257539364825, + "grad_norm": 0.9858444455189594, + "learning_rate": 1.0792238458507215e-05, + "loss": 1.5603, + "step": 3676 + }, + { + "epoch": 0.4906591940218842, + "grad_norm": 1.130805582793479, + "learning_rate": 1.0787930064164959e-05, + "loss": 1.6041, + "step": 3677 + }, + { + "epoch": 0.49079263410728585, + "grad_norm": 0.9578865164171134, + "learning_rate": 1.0783621522646336e-05, + "loss": 1.5791, + "step": 3678 + }, + { + "epoch": 0.49092607419268747, + "grad_norm": 0.9913348610814515, + "learning_rate": 1.0779312834756134e-05, + "loss": 1.6555, + "step": 3679 + }, + { + "epoch": 0.49105951427808914, + "grad_norm": 0.9482971209808349, + "learning_rate": 1.0775004001299173e-05, + "loss": 1.5926, + "step": 3680 + }, + { + "epoch": 0.4911929543634908, + "grad_norm": 0.9705291068336154, + "learning_rate": 1.0770695023080282e-05, + "loss": 1.6281, + "step": 3681 + }, + { + "epoch": 0.4913263944488924, + "grad_norm": 1.1282798108079959, + "learning_rate": 1.0766385900904337e-05, + "loss": 1.5623, + "step": 3682 + }, + { + "epoch": 0.4914598345342941, + "grad_norm": 0.994148436807919, + "learning_rate": 1.0762076635576231e-05, + "loss": 1.6486, + "step": 3683 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 0.9583103245372037, + "learning_rate": 1.0757767227900888e-05, + "loss": 1.5759, + "step": 3684 + }, + { + "epoch": 0.49172671470509743, + "grad_norm": 0.9434367640143331, + "learning_rate": 1.075345767868325e-05, + "loss": 1.6, + "step": 3685 + }, + { + "epoch": 0.49186015479049905, + "grad_norm": 1.054046209894045, + "learning_rate": 1.0749147988728302e-05, + "loss": 1.6308, + "step": 3686 + }, + { + "epoch": 0.4919935948759007, + "grad_norm": 0.9921895864830362, + "learning_rate": 1.0744838158841034e-05, + "loss": 1.6343, + "step": 3687 + }, + { + "epoch": 0.4921270349613024, + "grad_norm": 1.2122795180169637, + "learning_rate": 1.074052818982648e-05, + "loss": 1.5849, + "step": 3688 + }, + { + "epoch": 0.492260475046704, + "grad_norm": 0.9788260702681275, + "learning_rate": 1.0736218082489691e-05, + "loss": 1.6075, + "step": 3689 + }, + { + "epoch": 0.4923939151321057, + "grad_norm": 1.3297573232605928, + "learning_rate": 1.0731907837635747e-05, + "loss": 1.6099, + "step": 3690 + }, + { + "epoch": 0.49252735521750735, + "grad_norm": 0.9550880821538821, + "learning_rate": 1.0727597456069755e-05, + "loss": 1.6125, + "step": 3691 + }, + { + "epoch": 0.492660795302909, + "grad_norm": 0.9619871054912161, + "learning_rate": 1.0723286938596836e-05, + "loss": 1.6079, + "step": 3692 + }, + { + "epoch": 0.49279423538831063, + "grad_norm": 1.1849634487362217, + "learning_rate": 1.0718976286022157e-05, + "loss": 1.6274, + "step": 3693 + }, + { + "epoch": 0.4929276754737123, + "grad_norm": 1.2585836751661978, + "learning_rate": 1.0714665499150888e-05, + "loss": 1.5792, + "step": 3694 + }, + { + "epoch": 
0.49306111555911397, + "grad_norm": 0.9682726552538469, + "learning_rate": 1.0710354578788247e-05, + "loss": 1.5719, + "step": 3695 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 1.0111852850310221, + "learning_rate": 1.0706043525739454e-05, + "loss": 1.5881, + "step": 3696 + }, + { + "epoch": 0.49332799572991726, + "grad_norm": 1.0158485435950086, + "learning_rate": 1.070173234080977e-05, + "loss": 1.557, + "step": 3697 + }, + { + "epoch": 0.4934614358153189, + "grad_norm": 0.9601365629223316, + "learning_rate": 1.0697421024804475e-05, + "loss": 1.5992, + "step": 3698 + }, + { + "epoch": 0.4935948759007206, + "grad_norm": 0.9711819226222418, + "learning_rate": 1.0693109578528875e-05, + "loss": 1.5762, + "step": 3699 + }, + { + "epoch": 0.4937283159861222, + "grad_norm": 0.9747571113243313, + "learning_rate": 1.0688798002788295e-05, + "loss": 1.627, + "step": 3700 + }, + { + "epoch": 0.4938617560715239, + "grad_norm": 0.9155209925749146, + "learning_rate": 1.068448629838809e-05, + "loss": 1.5424, + "step": 3701 + }, + { + "epoch": 0.49399519615692555, + "grad_norm": 0.9680580273820439, + "learning_rate": 1.0680174466133639e-05, + "loss": 1.5782, + "step": 3702 + }, + { + "epoch": 0.49412863624232717, + "grad_norm": 0.9685689662721528, + "learning_rate": 1.067586250683034e-05, + "loss": 1.5927, + "step": 3703 + }, + { + "epoch": 0.49426207632772884, + "grad_norm": 0.9349199396735328, + "learning_rate": 1.0671550421283618e-05, + "loss": 1.6169, + "step": 3704 + }, + { + "epoch": 0.4943955164131305, + "grad_norm": 0.9766752432473302, + "learning_rate": 1.0667238210298927e-05, + "loss": 1.6029, + "step": 3705 + }, + { + "epoch": 0.4945289564985322, + "grad_norm": 1.1337519998185137, + "learning_rate": 1.0662925874681733e-05, + "loss": 1.5993, + "step": 3706 + }, + { + "epoch": 0.4946623965839338, + "grad_norm": 0.9756943770828267, + "learning_rate": 1.0658613415237535e-05, + "loss": 1.6645, + "step": 3707 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 0.9602283668761221, + "learning_rate": 1.0654300832771847e-05, + "loss": 1.6323, + "step": 3708 + }, + { + "epoch": 0.49492927675473714, + "grad_norm": 1.2356580385421276, + "learning_rate": 1.0649988128090216e-05, + "loss": 1.5881, + "step": 3709 + }, + { + "epoch": 0.49506271684013875, + "grad_norm": 0.9804354142593109, + "learning_rate": 1.06456753019982e-05, + "loss": 1.5797, + "step": 3710 + }, + { + "epoch": 0.4951961569255404, + "grad_norm": 0.9413974484117791, + "learning_rate": 1.0641362355301392e-05, + "loss": 1.5959, + "step": 3711 + }, + { + "epoch": 0.4953295970109421, + "grad_norm": 1.0173430778833028, + "learning_rate": 1.0637049288805395e-05, + "loss": 1.6835, + "step": 3712 + }, + { + "epoch": 0.49546303709634376, + "grad_norm": 1.0230987750185572, + "learning_rate": 1.0632736103315843e-05, + "loss": 1.5686, + "step": 3713 + }, + { + "epoch": 0.4955964771817454, + "grad_norm": 0.9776487226730937, + "learning_rate": 1.0628422799638396e-05, + "loss": 1.5768, + "step": 3714 + }, + { + "epoch": 0.49572991726714705, + "grad_norm": 1.1209401946738562, + "learning_rate": 1.0624109378578721e-05, + "loss": 1.5674, + "step": 3715 + }, + { + "epoch": 0.4958633573525487, + "grad_norm": 1.1825505150445272, + "learning_rate": 1.0619795840942524e-05, + "loss": 1.5712, + "step": 3716 + }, + { + "epoch": 0.4959967974379504, + "grad_norm": 0.939437949764124, + "learning_rate": 1.0615482187535515e-05, + "loss": 1.5544, + "step": 3717 + }, + { + "epoch": 0.496130237523352, + "grad_norm": 1.0394749330983024, + "learning_rate": 
1.0611168419163444e-05, + "loss": 1.6918, + "step": 3718 + }, + { + "epoch": 0.4962636776087537, + "grad_norm": 1.0963483530806055, + "learning_rate": 1.060685453663207e-05, + "loss": 1.5935, + "step": 3719 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 0.9214028923141944, + "learning_rate": 1.0602540540747179e-05, + "loss": 1.605, + "step": 3720 + }, + { + "epoch": 0.49653055777955696, + "grad_norm": 0.9811475135324195, + "learning_rate": 1.0598226432314573e-05, + "loss": 1.5914, + "step": 3721 + }, + { + "epoch": 0.49666399786495863, + "grad_norm": 0.9866310168569355, + "learning_rate": 1.0593912212140086e-05, + "loss": 1.6091, + "step": 3722 + }, + { + "epoch": 0.4967974379503603, + "grad_norm": 0.9398392471927407, + "learning_rate": 1.0589597881029554e-05, + "loss": 1.5977, + "step": 3723 + }, + { + "epoch": 0.49693087803576197, + "grad_norm": 0.9946024082527848, + "learning_rate": 1.0585283439788851e-05, + "loss": 1.532, + "step": 3724 + }, + { + "epoch": 0.4970643181211636, + "grad_norm": 0.9747426365894848, + "learning_rate": 1.0580968889223868e-05, + "loss": 1.5502, + "step": 3725 + }, + { + "epoch": 0.49719775820656525, + "grad_norm": 1.0170841216880055, + "learning_rate": 1.0576654230140508e-05, + "loss": 1.5805, + "step": 3726 + }, + { + "epoch": 0.4973311982919669, + "grad_norm": 0.9642570769540154, + "learning_rate": 1.0572339463344707e-05, + "loss": 1.5801, + "step": 3727 + }, + { + "epoch": 0.49746463837736854, + "grad_norm": 0.9628913796090154, + "learning_rate": 1.0568024589642408e-05, + "loss": 1.605, + "step": 3728 + }, + { + "epoch": 0.4975980784627702, + "grad_norm": 0.9946599596096027, + "learning_rate": 1.0563709609839581e-05, + "loss": 1.5589, + "step": 3729 + }, + { + "epoch": 0.4977315185481719, + "grad_norm": 1.1095821154904126, + "learning_rate": 1.055939452474222e-05, + "loss": 1.5844, + "step": 3730 + }, + { + "epoch": 0.49786495863357355, + "grad_norm": 1.1727773862931867, + "learning_rate": 1.0555079335156328e-05, + "loss": 1.5687, + "step": 3731 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 1.0061224792693193, + "learning_rate": 1.055076404188794e-05, + "loss": 1.5969, + "step": 3732 + }, + { + "epoch": 0.49813183880437684, + "grad_norm": 0.9895293534993739, + "learning_rate": 1.0546448645743097e-05, + "loss": 1.6145, + "step": 3733 + }, + { + "epoch": 0.4982652788897785, + "grad_norm": 1.0574963661665566, + "learning_rate": 1.054213314752787e-05, + "loss": 1.6219, + "step": 3734 + }, + { + "epoch": 0.4983987189751801, + "grad_norm": 1.0560776793903364, + "learning_rate": 1.0537817548048341e-05, + "loss": 1.577, + "step": 3735 + }, + { + "epoch": 0.4985321590605818, + "grad_norm": 1.0083197939133182, + "learning_rate": 1.0533501848110617e-05, + "loss": 1.6201, + "step": 3736 + }, + { + "epoch": 0.49866559914598346, + "grad_norm": 0.9934023783176211, + "learning_rate": 1.0529186048520825e-05, + "loss": 1.5933, + "step": 3737 + }, + { + "epoch": 0.49879903923138513, + "grad_norm": 1.243448461460969, + "learning_rate": 1.0524870150085103e-05, + "loss": 1.6314, + "step": 3738 + }, + { + "epoch": 0.49893247931678675, + "grad_norm": 1.1745919731057533, + "learning_rate": 1.0520554153609613e-05, + "loss": 1.6081, + "step": 3739 + }, + { + "epoch": 0.4990659194021884, + "grad_norm": 0.9690359243672029, + "learning_rate": 1.0516238059900532e-05, + "loss": 1.6033, + "step": 3740 + }, + { + "epoch": 0.4991993594875901, + "grad_norm": 0.9910492376976103, + "learning_rate": 1.0511921869764062e-05, + "loss": 1.6147, + "step": 3741 + }, + { + "epoch": 
0.4993327995729917, + "grad_norm": 0.9841002474852324, + "learning_rate": 1.0507605584006413e-05, + "loss": 1.5855, + "step": 3742 + }, + { + "epoch": 0.4994662396583934, + "grad_norm": 0.9835960197152689, + "learning_rate": 1.0503289203433822e-05, + "loss": 1.5912, + "step": 3743 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 0.9331205335074193, + "learning_rate": 1.049897272885254e-05, + "loss": 1.5957, + "step": 3744 + }, + { + "epoch": 0.4997331198291967, + "grad_norm": 1.0670315368640506, + "learning_rate": 1.0494656161068828e-05, + "loss": 1.5724, + "step": 3745 + }, + { + "epoch": 0.49986655991459833, + "grad_norm": 0.9964186881122445, + "learning_rate": 1.049033950088898e-05, + "loss": 1.6113, + "step": 3746 + }, + { + "epoch": 0.5, + "grad_norm": 0.9622713227335437, + "learning_rate": 1.0486022749119294e-05, + "loss": 1.6103, + "step": 3747 + }, + { + "epoch": 0.5001334400854016, + "grad_norm": 1.5585035104556884, + "learning_rate": 1.0481705906566092e-05, + "loss": 1.5407, + "step": 3748 + }, + { + "epoch": 0.5002668801708033, + "grad_norm": 0.9525397308234141, + "learning_rate": 1.0477388974035713e-05, + "loss": 1.56, + "step": 3749 + }, + { + "epoch": 0.500400320256205, + "grad_norm": 1.2183987988540275, + "learning_rate": 1.0473071952334508e-05, + "loss": 1.5559, + "step": 3750 + }, + { + "epoch": 0.5005337603416066, + "grad_norm": 1.0009923680441615, + "learning_rate": 1.0468754842268849e-05, + "loss": 1.6096, + "step": 3751 + }, + { + "epoch": 0.5006672004270083, + "grad_norm": 0.9445861127024189, + "learning_rate": 1.046443764464512e-05, + "loss": 1.6056, + "step": 3752 + }, + { + "epoch": 0.5008006405124099, + "grad_norm": 0.9742127353900115, + "learning_rate": 1.046012036026973e-05, + "loss": 1.6246, + "step": 3753 + }, + { + "epoch": 0.5009340805978116, + "grad_norm": 0.9520930850005337, + "learning_rate": 1.0455802989949092e-05, + "loss": 1.5292, + "step": 3754 + }, + { + "epoch": 0.5010675206832133, + "grad_norm": 0.935444191000388, + "learning_rate": 1.0451485534489649e-05, + "loss": 1.5537, + "step": 3755 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 0.9491673736063649, + "learning_rate": 1.0447167994697846e-05, + "loss": 1.5746, + "step": 3756 + }, + { + "epoch": 0.5013344008540166, + "grad_norm": 0.9554087936976446, + "learning_rate": 1.0442850371380155e-05, + "loss": 1.6095, + "step": 3757 + }, + { + "epoch": 0.5014678409394182, + "grad_norm": 1.2103781704529109, + "learning_rate": 1.0438532665343053e-05, + "loss": 1.5771, + "step": 3758 + }, + { + "epoch": 0.5016012810248198, + "grad_norm": 1.2128718749710947, + "learning_rate": 1.0434214877393045e-05, + "loss": 1.5952, + "step": 3759 + }, + { + "epoch": 0.5017347211102215, + "grad_norm": 0.9603242533460603, + "learning_rate": 1.042989700833664e-05, + "loss": 1.5571, + "step": 3760 + }, + { + "epoch": 0.5018681611956232, + "grad_norm": 1.1013519805324967, + "learning_rate": 1.042557905898037e-05, + "loss": 1.5555, + "step": 3761 + }, + { + "epoch": 0.5020016012810248, + "grad_norm": 1.113312267406771, + "learning_rate": 1.0421261030130776e-05, + "loss": 1.5986, + "step": 3762 + }, + { + "epoch": 0.5021350413664265, + "grad_norm": 0.9630320933350655, + "learning_rate": 1.041694292259442e-05, + "loss": 1.5864, + "step": 3763 + }, + { + "epoch": 0.5022684814518281, + "grad_norm": 0.9790488821683675, + "learning_rate": 1.041262473717787e-05, + "loss": 1.6064, + "step": 3764 + }, + { + "epoch": 0.5024019215372297, + "grad_norm": 1.0221627458345286, + "learning_rate": 1.0408306474687719e-05, + "loss": 
1.6228, + "step": 3765 + }, + { + "epoch": 0.5025353616226315, + "grad_norm": 0.9477242589703042, + "learning_rate": 1.0403988135930568e-05, + "loss": 1.6004, + "step": 3766 + }, + { + "epoch": 0.5026688017080331, + "grad_norm": 0.9934183349840355, + "learning_rate": 1.039966972171303e-05, + "loss": 1.554, + "step": 3767 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 0.9582444414220985, + "learning_rate": 1.0395351232841739e-05, + "loss": 1.595, + "step": 3768 + }, + { + "epoch": 0.5029356818788364, + "grad_norm": 1.0891630480558472, + "learning_rate": 1.039103267012334e-05, + "loss": 1.555, + "step": 3769 + }, + { + "epoch": 0.503069121964238, + "grad_norm": 0.9957005065240585, + "learning_rate": 1.038671403436449e-05, + "loss": 1.6035, + "step": 3770 + }, + { + "epoch": 0.5032025620496398, + "grad_norm": 1.2830993460776894, + "learning_rate": 1.0382395326371861e-05, + "loss": 1.6037, + "step": 3771 + }, + { + "epoch": 0.5033360021350414, + "grad_norm": 0.9767721406060091, + "learning_rate": 1.0378076546952138e-05, + "loss": 1.5673, + "step": 3772 + }, + { + "epoch": 0.503469442220443, + "grad_norm": 0.9734968119255416, + "learning_rate": 1.0373757696912024e-05, + "loss": 1.5924, + "step": 3773 + }, + { + "epoch": 0.5036028823058447, + "grad_norm": 1.1161953113263952, + "learning_rate": 1.0369438777058226e-05, + "loss": 1.6249, + "step": 3774 + }, + { + "epoch": 0.5037363223912463, + "grad_norm": 0.9572612259574967, + "learning_rate": 1.0365119788197468e-05, + "loss": 1.5998, + "step": 3775 + }, + { + "epoch": 0.5038697624766479, + "grad_norm": 0.9432193177518614, + "learning_rate": 1.0360800731136493e-05, + "loss": 1.5321, + "step": 3776 + }, + { + "epoch": 0.5040032025620497, + "grad_norm": 1.0074347003449116, + "learning_rate": 1.0356481606682047e-05, + "loss": 1.6185, + "step": 3777 + }, + { + "epoch": 0.5041366426474513, + "grad_norm": 0.9454406764250083, + "learning_rate": 1.0352162415640898e-05, + "loss": 1.5624, + "step": 3778 + }, + { + "epoch": 0.5042700827328529, + "grad_norm": 1.004743047178849, + "learning_rate": 1.034784315881982e-05, + "loss": 1.6052, + "step": 3779 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 1.0081054951554982, + "learning_rate": 1.0343523837025598e-05, + "loss": 1.5967, + "step": 3780 + }, + { + "epoch": 0.5045369629036562, + "grad_norm": 1.1995648160845978, + "learning_rate": 1.0339204451065035e-05, + "loss": 1.5653, + "step": 3781 + }, + { + "epoch": 0.504670402989058, + "grad_norm": 1.0397558826608888, + "learning_rate": 1.0334885001744943e-05, + "loss": 1.6437, + "step": 3782 + }, + { + "epoch": 0.5048038430744596, + "grad_norm": 0.9903660282060232, + "learning_rate": 1.0330565489872144e-05, + "loss": 1.5705, + "step": 3783 + }, + { + "epoch": 0.5049372831598612, + "grad_norm": 0.9972288626686299, + "learning_rate": 1.0326245916253478e-05, + "loss": 1.5884, + "step": 3784 + }, + { + "epoch": 0.5050707232452629, + "grad_norm": 0.9476343942885561, + "learning_rate": 1.0321926281695787e-05, + "loss": 1.6023, + "step": 3785 + }, + { + "epoch": 0.5052041633306645, + "grad_norm": 0.9738449763592262, + "learning_rate": 1.0317606587005936e-05, + "loss": 1.5609, + "step": 3786 + }, + { + "epoch": 0.5053376034160662, + "grad_norm": 1.039829392353301, + "learning_rate": 1.0313286832990788e-05, + "loss": 1.6218, + "step": 3787 + }, + { + "epoch": 0.5054710435014679, + "grad_norm": 1.0381227642185753, + "learning_rate": 1.0308967020457223e-05, + "loss": 1.6251, + "step": 3788 + }, + { + "epoch": 0.5056044835868695, + "grad_norm": 
0.9867023264523529, + "learning_rate": 1.0304647150212142e-05, + "loss": 1.5399, + "step": 3789 + }, + { + "epoch": 0.5057379236722711, + "grad_norm": 0.9867055454700514, + "learning_rate": 1.0300327223062436e-05, + "loss": 1.6344, + "step": 3790 + }, + { + "epoch": 0.5058713637576728, + "grad_norm": 1.0315163736938113, + "learning_rate": 1.029600723981503e-05, + "loss": 1.6119, + "step": 3791 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 0.9670113161953792, + "learning_rate": 1.0291687201276837e-05, + "loss": 1.5639, + "step": 3792 + }, + { + "epoch": 0.5061382439284761, + "grad_norm": 0.9921317429814296, + "learning_rate": 1.0287367108254796e-05, + "loss": 1.5514, + "step": 3793 + }, + { + "epoch": 0.5062716840138778, + "grad_norm": 1.0082597213286801, + "learning_rate": 1.0283046961555854e-05, + "loss": 1.5689, + "step": 3794 + }, + { + "epoch": 0.5064051240992794, + "grad_norm": 1.394709462594669, + "learning_rate": 1.027872676198696e-05, + "loss": 1.6046, + "step": 3795 + }, + { + "epoch": 0.5065385641846811, + "grad_norm": 1.0174333180253856, + "learning_rate": 1.0274406510355082e-05, + "loss": 1.5815, + "step": 3796 + }, + { + "epoch": 0.5066720042700827, + "grad_norm": 1.0733270672727413, + "learning_rate": 1.0270086207467188e-05, + "loss": 1.5879, + "step": 3797 + }, + { + "epoch": 0.5068054443554844, + "grad_norm": 1.0234017065005216, + "learning_rate": 1.0265765854130272e-05, + "loss": 1.6075, + "step": 3798 + }, + { + "epoch": 0.5069388844408861, + "grad_norm": 1.08499646847553, + "learning_rate": 1.0261445451151314e-05, + "loss": 1.6305, + "step": 3799 + }, + { + "epoch": 0.5070723245262877, + "grad_norm": 0.9704549685581058, + "learning_rate": 1.0257124999337324e-05, + "loss": 1.5948, + "step": 3800 + }, + { + "epoch": 0.5072057646116893, + "grad_norm": 7.9206996649868, + "learning_rate": 1.0252804499495314e-05, + "loss": 1.6509, + "step": 3801 + }, + { + "epoch": 0.507339204697091, + "grad_norm": 1.0387910709315784, + "learning_rate": 1.02484839524323e-05, + "loss": 1.5677, + "step": 3802 + }, + { + "epoch": 0.5074726447824927, + "grad_norm": 1.050554110371522, + "learning_rate": 1.0244163358955315e-05, + "loss": 1.6384, + "step": 3803 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 1.1559737239306573, + "learning_rate": 1.023984271987139e-05, + "loss": 1.6467, + "step": 3804 + }, + { + "epoch": 0.507739524953296, + "grad_norm": 1.0907297235523432, + "learning_rate": 1.0235522035987581e-05, + "loss": 1.632, + "step": 3805 + }, + { + "epoch": 0.5078729650386976, + "grad_norm": 1.0831891794610449, + "learning_rate": 1.0231201308110936e-05, + "loss": 1.6218, + "step": 3806 + }, + { + "epoch": 0.5080064051240992, + "grad_norm": 0.9955232658741183, + "learning_rate": 1.0226880537048518e-05, + "loss": 1.6157, + "step": 3807 + }, + { + "epoch": 0.508139845209501, + "grad_norm": 1.0436607505823476, + "learning_rate": 1.02225597236074e-05, + "loss": 1.5852, + "step": 3808 + }, + { + "epoch": 0.5082732852949026, + "grad_norm": 1.0380692870921138, + "learning_rate": 1.0218238868594656e-05, + "loss": 1.4999, + "step": 3809 + }, + { + "epoch": 0.5084067253803043, + "grad_norm": 0.95584778107239, + "learning_rate": 1.021391797281738e-05, + "loss": 1.5926, + "step": 3810 + }, + { + "epoch": 0.5085401654657059, + "grad_norm": 1.13056350804676, + "learning_rate": 1.0209597037082658e-05, + "loss": 1.599, + "step": 3811 + }, + { + "epoch": 0.5086736055511075, + "grad_norm": 1.0358679977491891, + "learning_rate": 1.02052760621976e-05, + "loss": 1.6291, + "step": 3812 + }, + { + 
"epoch": 0.5088070456365092, + "grad_norm": 0.9752350063012711, + "learning_rate": 1.0200955048969307e-05, + "loss": 1.5868, + "step": 3813 + }, + { + "epoch": 0.5089404857219109, + "grad_norm": 2.6417596103398284, + "learning_rate": 1.0196633998204903e-05, + "loss": 1.5943, + "step": 3814 + }, + { + "epoch": 0.5090739258073125, + "grad_norm": 1.1501400580375445, + "learning_rate": 1.0192312910711504e-05, + "loss": 1.5856, + "step": 3815 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 1.08515119526117, + "learning_rate": 1.0187991787296243e-05, + "loss": 1.6207, + "step": 3816 + }, + { + "epoch": 0.5093408059781158, + "grad_norm": 0.9540437758466517, + "learning_rate": 1.0183670628766258e-05, + "loss": 1.643, + "step": 3817 + }, + { + "epoch": 0.5094742460635174, + "grad_norm": 1.0299232805390495, + "learning_rate": 1.017934943592869e-05, + "loss": 1.6673, + "step": 3818 + }, + { + "epoch": 0.5096076861489192, + "grad_norm": 1.0039257031904898, + "learning_rate": 1.017502820959069e-05, + "loss": 1.5761, + "step": 3819 + }, + { + "epoch": 0.5097411262343208, + "grad_norm": 0.9346765769936345, + "learning_rate": 1.017070695055941e-05, + "loss": 1.5594, + "step": 3820 + }, + { + "epoch": 0.5098745663197225, + "grad_norm": 0.9858372448777835, + "learning_rate": 1.0166385659642017e-05, + "loss": 1.5746, + "step": 3821 + }, + { + "epoch": 0.5100080064051241, + "grad_norm": 1.0006528357635736, + "learning_rate": 1.0162064337645678e-05, + "loss": 1.6062, + "step": 3822 + }, + { + "epoch": 0.5101414464905257, + "grad_norm": 0.9404463154241544, + "learning_rate": 1.0157742985377567e-05, + "loss": 1.6299, + "step": 3823 + }, + { + "epoch": 0.5102748865759275, + "grad_norm": 1.0401934215904935, + "learning_rate": 1.015342160364486e-05, + "loss": 1.61, + "step": 3824 + }, + { + "epoch": 0.5104083266613291, + "grad_norm": 0.9544524367700885, + "learning_rate": 1.0149100193254744e-05, + "loss": 1.5981, + "step": 3825 + }, + { + "epoch": 0.5105417667467307, + "grad_norm": 1.1258447776031195, + "learning_rate": 1.0144778755014411e-05, + "loss": 1.5924, + "step": 3826 + }, + { + "epoch": 0.5106752068321324, + "grad_norm": 1.048900303879943, + "learning_rate": 1.0140457289731056e-05, + "loss": 1.5822, + "step": 3827 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 0.9978329789351215, + "learning_rate": 1.0136135798211874e-05, + "loss": 1.5754, + "step": 3828 + }, + { + "epoch": 0.5109420870029356, + "grad_norm": 1.0026431461629155, + "learning_rate": 1.013181428126408e-05, + "loss": 1.5149, + "step": 3829 + }, + { + "epoch": 0.5110755270883374, + "grad_norm": 0.9910636307338526, + "learning_rate": 1.012749273969488e-05, + "loss": 1.6098, + "step": 3830 + }, + { + "epoch": 0.511208967173739, + "grad_norm": 0.9607665572703091, + "learning_rate": 1.0123171174311482e-05, + "loss": 1.596, + "step": 3831 + }, + { + "epoch": 0.5113424072591406, + "grad_norm": 0.9629876623056436, + "learning_rate": 1.0118849585921114e-05, + "loss": 1.5757, + "step": 3832 + }, + { + "epoch": 0.5114758473445423, + "grad_norm": 0.9574400509477552, + "learning_rate": 1.0114527975330997e-05, + "loss": 1.5883, + "step": 3833 + }, + { + "epoch": 0.5116092874299439, + "grad_norm": 1.0875584418211826, + "learning_rate": 1.0110206343348354e-05, + "loss": 1.5646, + "step": 3834 + }, + { + "epoch": 0.5117427275153457, + "grad_norm": 1.169489430147777, + "learning_rate": 1.0105884690780426e-05, + "loss": 1.5489, + "step": 3835 + }, + { + "epoch": 0.5118761676007473, + "grad_norm": 1.4633224716352873, + "learning_rate": 
1.0101563018434441e-05, + "loss": 1.612, + "step": 3836 + }, + { + "epoch": 0.5120096076861489, + "grad_norm": 0.9544832295384696, + "learning_rate": 1.0097241327117642e-05, + "loss": 1.5445, + "step": 3837 + }, + { + "epoch": 0.5121430477715506, + "grad_norm": 1.0099497190113178, + "learning_rate": 1.0092919617637267e-05, + "loss": 1.6371, + "step": 3838 + }, + { + "epoch": 0.5122764878569522, + "grad_norm": 0.9869374341847159, + "learning_rate": 1.0088597890800568e-05, + "loss": 1.5956, + "step": 3839 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 0.9740701564399505, + "learning_rate": 1.008427614741479e-05, + "loss": 1.6369, + "step": 3840 + }, + { + "epoch": 0.5125433680277556, + "grad_norm": 1.0435801092284192, + "learning_rate": 1.0079954388287187e-05, + "loss": 1.6168, + "step": 3841 + }, + { + "epoch": 0.5126768081131572, + "grad_norm": 1.0717965001586232, + "learning_rate": 1.007563261422502e-05, + "loss": 1.5815, + "step": 3842 + }, + { + "epoch": 0.5128102481985588, + "grad_norm": 0.951753888240989, + "learning_rate": 1.0071310826035536e-05, + "loss": 1.5753, + "step": 3843 + }, + { + "epoch": 0.5129436882839605, + "grad_norm": 1.0427073825832405, + "learning_rate": 1.0066989024526004e-05, + "loss": 1.571, + "step": 3844 + }, + { + "epoch": 0.5130771283693621, + "grad_norm": 0.9665144556917987, + "learning_rate": 1.0062667210503682e-05, + "loss": 1.5675, + "step": 3845 + }, + { + "epoch": 0.5132105684547638, + "grad_norm": 0.9841126172568884, + "learning_rate": 1.0058345384775843e-05, + "loss": 1.566, + "step": 3846 + }, + { + "epoch": 0.5133440085401655, + "grad_norm": 1.0231266781644845, + "learning_rate": 1.0054023548149747e-05, + "loss": 1.6019, + "step": 3847 + }, + { + "epoch": 0.5134774486255671, + "grad_norm": 1.013759415570729, + "learning_rate": 1.004970170143267e-05, + "loss": 1.5977, + "step": 3848 + }, + { + "epoch": 0.5136108887109688, + "grad_norm": 0.9596994251252524, + "learning_rate": 1.0045379845431877e-05, + "loss": 1.6183, + "step": 3849 + }, + { + "epoch": 0.5137443287963704, + "grad_norm": 0.9562275485239146, + "learning_rate": 1.004105798095465e-05, + "loss": 1.5953, + "step": 3850 + }, + { + "epoch": 0.5138777688817721, + "grad_norm": 1.0389139499008198, + "learning_rate": 1.0036736108808258e-05, + "loss": 1.6092, + "step": 3851 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 1.1170799172414176, + "learning_rate": 1.0032414229799978e-05, + "loss": 1.6152, + "step": 3852 + }, + { + "epoch": 0.5141446490525754, + "grad_norm": 1.05178593292583, + "learning_rate": 1.0028092344737093e-05, + "loss": 1.5869, + "step": 3853 + }, + { + "epoch": 0.514278089137977, + "grad_norm": 0.9406322853227639, + "learning_rate": 1.0023770454426873e-05, + "loss": 1.5934, + "step": 3854 + }, + { + "epoch": 0.5144115292233787, + "grad_norm": 1.0236251156656235, + "learning_rate": 1.0019448559676605e-05, + "loss": 1.5402, + "step": 3855 + }, + { + "epoch": 0.5145449693087804, + "grad_norm": 0.9301092670030722, + "learning_rate": 1.0015126661293566e-05, + "loss": 1.5849, + "step": 3856 + }, + { + "epoch": 0.514678409394182, + "grad_norm": 0.9761352830656781, + "learning_rate": 1.0010804760085037e-05, + "loss": 1.5598, + "step": 3857 + }, + { + "epoch": 0.5148118494795837, + "grad_norm": 0.9784181556059924, + "learning_rate": 1.0006482856858306e-05, + "loss": 1.6337, + "step": 3858 + }, + { + "epoch": 0.5149452895649853, + "grad_norm": 1.0310199190875282, + "learning_rate": 1.000216095242065e-05, + "loss": 1.5553, + "step": 3859 + }, + { + "epoch": 
0.5150787296503869, + "grad_norm": 0.9413946989014104, + "learning_rate": 9.997839047579351e-06, + "loss": 1.5955, + "step": 3860 + }, + { + "epoch": 0.5152121697357886, + "grad_norm": 1.0310793127717695, + "learning_rate": 9.993517143141695e-06, + "loss": 1.5891, + "step": 3861 + }, + { + "epoch": 0.5153456098211903, + "grad_norm": 1.1466677120724456, + "learning_rate": 9.989195239914964e-06, + "loss": 1.5896, + "step": 3862 + }, + { + "epoch": 0.515479049906592, + "grad_norm": 2.0571376714105436, + "learning_rate": 9.984873338706439e-06, + "loss": 1.6166, + "step": 3863 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 0.9772929955397268, + "learning_rate": 9.980551440323398e-06, + "loss": 1.6124, + "step": 3864 + }, + { + "epoch": 0.5157459300773952, + "grad_norm": 0.995449303499809, + "learning_rate": 9.97622954557313e-06, + "loss": 1.55, + "step": 3865 + }, + { + "epoch": 0.515879370162797, + "grad_norm": 0.9947089431138232, + "learning_rate": 9.971907655262914e-06, + "loss": 1.6228, + "step": 3866 + }, + { + "epoch": 0.5160128102481986, + "grad_norm": 1.0584016266307699, + "learning_rate": 9.967585770200023e-06, + "loss": 1.5816, + "step": 3867 + }, + { + "epoch": 0.5161462503336002, + "grad_norm": 3.1959955421300643, + "learning_rate": 9.963263891191743e-06, + "loss": 1.584, + "step": 3868 + }, + { + "epoch": 0.5162796904190019, + "grad_norm": 1.0146744361794484, + "learning_rate": 9.958942019045352e-06, + "loss": 1.5777, + "step": 3869 + }, + { + "epoch": 0.5164131305044035, + "grad_norm": 1.0815439584288364, + "learning_rate": 9.954620154568125e-06, + "loss": 1.5771, + "step": 3870 + }, + { + "epoch": 0.5165465705898051, + "grad_norm": 0.9947735001779133, + "learning_rate": 9.950298298567333e-06, + "loss": 1.6323, + "step": 3871 + }, + { + "epoch": 0.5166800106752069, + "grad_norm": 1.025350666289356, + "learning_rate": 9.945976451850256e-06, + "loss": 1.5726, + "step": 3872 + }, + { + "epoch": 0.5168134507606085, + "grad_norm": 1.0309153481695548, + "learning_rate": 9.941654615224162e-06, + "loss": 1.6411, + "step": 3873 + }, + { + "epoch": 0.5169468908460101, + "grad_norm": 0.9706291026832317, + "learning_rate": 9.937332789496318e-06, + "loss": 1.5344, + "step": 3874 + }, + { + "epoch": 0.5170803309314118, + "grad_norm": 1.0442149306572364, + "learning_rate": 9.933010975474e-06, + "loss": 1.5868, + "step": 3875 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 0.9801227554963917, + "learning_rate": 9.928689173964467e-06, + "loss": 1.584, + "step": 3876 + }, + { + "epoch": 0.5173472111022152, + "grad_norm": 0.9612770789559671, + "learning_rate": 9.924367385774987e-06, + "loss": 1.5624, + "step": 3877 + }, + { + "epoch": 0.5174806511876168, + "grad_norm": 12.372251835234461, + "learning_rate": 9.920045611712812e-06, + "loss": 1.6331, + "step": 3878 + }, + { + "epoch": 0.5176140912730184, + "grad_norm": 1.0323071031650972, + "learning_rate": 9.915723852585211e-06, + "loss": 1.5644, + "step": 3879 + }, + { + "epoch": 0.5177475313584201, + "grad_norm": 1.0939706773461586, + "learning_rate": 9.911402109199437e-06, + "loss": 1.5847, + "step": 3880 + }, + { + "epoch": 0.5178809714438217, + "grad_norm": 1.1539521887238684, + "learning_rate": 9.907080382362733e-06, + "loss": 1.6092, + "step": 3881 + }, + { + "epoch": 0.5180144115292233, + "grad_norm": 1.4844965472462381, + "learning_rate": 9.902758672882361e-06, + "loss": 1.6156, + "step": 3882 + }, + { + "epoch": 0.5181478516146251, + "grad_norm": 1.0477571393778307, + "learning_rate": 9.898436981565562e-06, + "loss": 1.5609, + 
"step": 3883 + }, + { + "epoch": 0.5182812917000267, + "grad_norm": 0.9919178224095909, + "learning_rate": 9.894115309219579e-06, + "loss": 1.6325, + "step": 3884 + }, + { + "epoch": 0.5184147317854283, + "grad_norm": 1.039487001315658, + "learning_rate": 9.889793656651646e-06, + "loss": 1.6016, + "step": 3885 + }, + { + "epoch": 0.51854817187083, + "grad_norm": 0.9999173249401138, + "learning_rate": 9.885472024669007e-06, + "loss": 1.5301, + "step": 3886 + }, + { + "epoch": 0.5186816119562316, + "grad_norm": 1.0016577339792758, + "learning_rate": 9.881150414078891e-06, + "loss": 1.6022, + "step": 3887 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 1.022731064253213, + "learning_rate": 9.87682882568852e-06, + "loss": 1.6181, + "step": 3888 + }, + { + "epoch": 0.518948492127035, + "grad_norm": 1.053953730841017, + "learning_rate": 9.872507260305124e-06, + "loss": 1.5722, + "step": 3889 + }, + { + "epoch": 0.5190819322124366, + "grad_norm": 0.9483103314838104, + "learning_rate": 9.868185718735922e-06, + "loss": 1.5721, + "step": 3890 + }, + { + "epoch": 0.5192153722978383, + "grad_norm": 0.9716784609015309, + "learning_rate": 9.863864201788127e-06, + "loss": 1.5958, + "step": 3891 + }, + { + "epoch": 0.5193488123832399, + "grad_norm": 7.41992607929927, + "learning_rate": 9.859542710268947e-06, + "loss": 1.601, + "step": 3892 + }, + { + "epoch": 0.5194822524686415, + "grad_norm": 1.0842848553275373, + "learning_rate": 9.85522124498559e-06, + "loss": 1.6163, + "step": 3893 + }, + { + "epoch": 0.5196156925540433, + "grad_norm": 1.1825348756660392, + "learning_rate": 9.850899806745258e-06, + "loss": 1.5681, + "step": 3894 + }, + { + "epoch": 0.5197491326394449, + "grad_norm": 1.219221323073541, + "learning_rate": 9.846578396355144e-06, + "loss": 1.6045, + "step": 3895 + }, + { + "epoch": 0.5198825727248465, + "grad_norm": 0.9727027071204921, + "learning_rate": 9.842257014622436e-06, + "loss": 1.6032, + "step": 3896 + }, + { + "epoch": 0.5200160128102482, + "grad_norm": 0.9822928747027722, + "learning_rate": 9.837935662354325e-06, + "loss": 1.5696, + "step": 3897 + }, + { + "epoch": 0.5201494528956498, + "grad_norm": 0.9616838656896538, + "learning_rate": 9.833614340357986e-06, + "loss": 1.5782, + "step": 3898 + }, + { + "epoch": 0.5202828929810515, + "grad_norm": 1.0479795740977809, + "learning_rate": 9.829293049440592e-06, + "loss": 1.5405, + "step": 3899 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 1.02813031490662, + "learning_rate": 9.824971790409314e-06, + "loss": 1.5542, + "step": 3900 + }, + { + "epoch": 0.5205497731518548, + "grad_norm": 1.1751181903794317, + "learning_rate": 9.820650564071314e-06, + "loss": 1.6051, + "step": 3901 + }, + { + "epoch": 0.5206832132372565, + "grad_norm": 1.01933272576621, + "learning_rate": 9.816329371233747e-06, + "loss": 1.6075, + "step": 3902 + }, + { + "epoch": 0.5208166533226581, + "grad_norm": 1.065928748207789, + "learning_rate": 9.812008212703758e-06, + "loss": 1.6226, + "step": 3903 + }, + { + "epoch": 0.5209500934080598, + "grad_norm": 1.0491540793462275, + "learning_rate": 9.8076870892885e-06, + "loss": 1.62, + "step": 3904 + }, + { + "epoch": 0.5210835334934615, + "grad_norm": 0.9904597628301255, + "learning_rate": 9.803366001795102e-06, + "loss": 1.5997, + "step": 3905 + }, + { + "epoch": 0.5212169735788631, + "grad_norm": 0.9273698585055911, + "learning_rate": 9.799044951030693e-06, + "loss": 1.5687, + "step": 3906 + }, + { + "epoch": 0.5213504136642647, + "grad_norm": 1.051717478475509, + "learning_rate": 
9.794723937802403e-06, + "loss": 1.5735, + "step": 3907 + }, + { + "epoch": 0.5214838537496664, + "grad_norm": 0.9428209123723238, + "learning_rate": 9.790402962917343e-06, + "loss": 1.5791, + "step": 3908 + }, + { + "epoch": 0.521617293835068, + "grad_norm": 0.9740214957204884, + "learning_rate": 9.786082027182626e-06, + "loss": 1.6194, + "step": 3909 + }, + { + "epoch": 0.5217507339204697, + "grad_norm": 0.9865488024149134, + "learning_rate": 9.781761131405346e-06, + "loss": 1.5704, + "step": 3910 + }, + { + "epoch": 0.5218841740058714, + "grad_norm": 0.9376345788702144, + "learning_rate": 9.777440276392604e-06, + "loss": 1.5978, + "step": 3911 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 0.9550562216569386, + "learning_rate": 9.773119462951487e-06, + "loss": 1.6584, + "step": 3912 + }, + { + "epoch": 0.5221510541766746, + "grad_norm": 0.9603118976190848, + "learning_rate": 9.768798691889066e-06, + "loss": 1.5987, + "step": 3913 + }, + { + "epoch": 0.5222844942620763, + "grad_norm": 1.1573687573433378, + "learning_rate": 9.76447796401242e-06, + "loss": 1.5802, + "step": 3914 + }, + { + "epoch": 0.522417934347478, + "grad_norm": 1.0874659242828772, + "learning_rate": 9.760157280128611e-06, + "loss": 1.629, + "step": 3915 + }, + { + "epoch": 0.5225513744328797, + "grad_norm": 1.2673180853706751, + "learning_rate": 9.755836641044686e-06, + "loss": 1.5725, + "step": 3916 + }, + { + "epoch": 0.5226848145182813, + "grad_norm": 0.933164844674267, + "learning_rate": 9.7515160475677e-06, + "loss": 1.6217, + "step": 3917 + }, + { + "epoch": 0.5228182546036829, + "grad_norm": 0.951797289455154, + "learning_rate": 9.747195500504687e-06, + "loss": 1.5448, + "step": 3918 + }, + { + "epoch": 0.5229516946890846, + "grad_norm": 0.9440107265148531, + "learning_rate": 9.742875000662679e-06, + "loss": 1.5796, + "step": 3919 + }, + { + "epoch": 0.5230851347744863, + "grad_norm": 0.9563610291259368, + "learning_rate": 9.738554548848686e-06, + "loss": 1.5944, + "step": 3920 + }, + { + "epoch": 0.5232185748598879, + "grad_norm": 0.9662588914444328, + "learning_rate": 9.734234145869731e-06, + "loss": 1.5565, + "step": 3921 + }, + { + "epoch": 0.5233520149452896, + "grad_norm": 0.9669810427348244, + "learning_rate": 9.729913792532813e-06, + "loss": 1.584, + "step": 3922 + }, + { + "epoch": 0.5234854550306912, + "grad_norm": 0.9499418557964407, + "learning_rate": 9.725593489644925e-06, + "loss": 1.5611, + "step": 3923 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 0.9825228300383785, + "learning_rate": 9.721273238013042e-06, + "loss": 1.5699, + "step": 3924 + }, + { + "epoch": 0.5237523352014946, + "grad_norm": 1.0180904933312087, + "learning_rate": 9.716953038444149e-06, + "loss": 1.6399, + "step": 3925 + }, + { + "epoch": 0.5238857752868962, + "grad_norm": 0.9427216935978336, + "learning_rate": 9.712632891745208e-06, + "loss": 1.6125, + "step": 3926 + }, + { + "epoch": 0.5240192153722978, + "grad_norm": 0.9985494748443655, + "learning_rate": 9.708312798723168e-06, + "loss": 1.607, + "step": 3927 + }, + { + "epoch": 0.5241526554576995, + "grad_norm": 0.9775720421880318, + "learning_rate": 9.703992760184976e-06, + "loss": 1.6487, + "step": 3928 + }, + { + "epoch": 0.5242860955431011, + "grad_norm": 1.0440323447007265, + "learning_rate": 9.699672776937567e-06, + "loss": 1.6203, + "step": 3929 + }, + { + "epoch": 0.5244195356285029, + "grad_norm": 1.073896330976579, + "learning_rate": 9.695352849787865e-06, + "loss": 1.6193, + "step": 3930 + }, + { + "epoch": 0.5245529757139045, + "grad_norm": 
0.9666654017851334, + "learning_rate": 9.69103297954278e-06, + "loss": 1.6498, + "step": 3931 + }, + { + "epoch": 0.5246864157993061, + "grad_norm": 1.1178863635267453, + "learning_rate": 9.686713167009216e-06, + "loss": 1.6103, + "step": 3932 + }, + { + "epoch": 0.5248198558847078, + "grad_norm": 1.0066761509572055, + "learning_rate": 9.68239341299407e-06, + "loss": 1.6333, + "step": 3933 + }, + { + "epoch": 0.5249532959701094, + "grad_norm": 0.9297439952534905, + "learning_rate": 9.678073718304215e-06, + "loss": 1.5461, + "step": 3934 + }, + { + "epoch": 0.525086736055511, + "grad_norm": 0.9488135408582773, + "learning_rate": 9.673754083746523e-06, + "loss": 1.6361, + "step": 3935 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 0.9845229518536815, + "learning_rate": 9.669434510127858e-06, + "loss": 1.6412, + "step": 3936 + }, + { + "epoch": 0.5253536162263144, + "grad_norm": 0.9802847942341244, + "learning_rate": 9.66511499825506e-06, + "loss": 1.5989, + "step": 3937 + }, + { + "epoch": 0.525487056311716, + "grad_norm": 0.9632382904131538, + "learning_rate": 9.660795548934965e-06, + "loss": 1.6071, + "step": 3938 + }, + { + "epoch": 0.5256204963971177, + "grad_norm": 1.0699178531684905, + "learning_rate": 9.656476162974403e-06, + "loss": 1.6323, + "step": 3939 + }, + { + "epoch": 0.5257539364825193, + "grad_norm": 0.9511806151181409, + "learning_rate": 9.652156841180185e-06, + "loss": 1.6131, + "step": 3940 + }, + { + "epoch": 0.525887376567921, + "grad_norm": 0.9838327792979288, + "learning_rate": 9.647837584359107e-06, + "loss": 1.5408, + "step": 3941 + }, + { + "epoch": 0.5260208166533227, + "grad_norm": 1.1196962403177173, + "learning_rate": 9.643518393317953e-06, + "loss": 1.583, + "step": 3942 + }, + { + "epoch": 0.5261542567387243, + "grad_norm": 0.9499526650383459, + "learning_rate": 9.63919926886351e-06, + "loss": 1.567, + "step": 3943 + }, + { + "epoch": 0.526287696824126, + "grad_norm": 0.9520997264212869, + "learning_rate": 9.634880211802535e-06, + "loss": 1.6393, + "step": 3944 + }, + { + "epoch": 0.5264211369095276, + "grad_norm": 1.2483925528165147, + "learning_rate": 9.630561222941776e-06, + "loss": 1.6007, + "step": 3945 + }, + { + "epoch": 0.5265545769949292, + "grad_norm": 0.9647842321723614, + "learning_rate": 9.62624230308798e-06, + "loss": 1.6036, + "step": 3946 + }, + { + "epoch": 0.526688017080331, + "grad_norm": 1.051538449954908, + "learning_rate": 9.621923453047864e-06, + "loss": 1.6024, + "step": 3947 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 1.0353802905237608, + "learning_rate": 9.617604673628142e-06, + "loss": 1.5987, + "step": 3948 + }, + { + "epoch": 0.5269548972511342, + "grad_norm": 0.9689586440798221, + "learning_rate": 9.61328596563551e-06, + "loss": 1.5775, + "step": 3949 + }, + { + "epoch": 0.5270883373365359, + "grad_norm": 1.110258725459353, + "learning_rate": 9.608967329876662e-06, + "loss": 1.5446, + "step": 3950 + }, + { + "epoch": 0.5272217774219375, + "grad_norm": 0.9779790351795254, + "learning_rate": 9.604648767158263e-06, + "loss": 1.5587, + "step": 3951 + }, + { + "epoch": 0.5273552175073392, + "grad_norm": 0.9136985660924678, + "learning_rate": 9.600330278286972e-06, + "loss": 1.5626, + "step": 3952 + }, + { + "epoch": 0.5274886575927409, + "grad_norm": 0.9427079281067329, + "learning_rate": 9.596011864069434e-06, + "loss": 1.597, + "step": 3953 + }, + { + "epoch": 0.5276220976781425, + "grad_norm": 0.9576255235969523, + "learning_rate": 9.591693525312283e-06, + "loss": 1.5782, + "step": 3954 + }, + { + "epoch": 
0.5277555377635441, + "grad_norm": 0.9802126130169267, + "learning_rate": 9.587375262822132e-06, + "loss": 1.572, + "step": 3955 + }, + { + "epoch": 0.5278889778489458, + "grad_norm": 0.967258469940652, + "learning_rate": 9.583057077405583e-06, + "loss": 1.5762, + "step": 3956 + }, + { + "epoch": 0.5280224179343475, + "grad_norm": 1.034695214577326, + "learning_rate": 9.578738969869227e-06, + "loss": 1.5677, + "step": 3957 + }, + { + "epoch": 0.5281558580197492, + "grad_norm": 0.9403505616977419, + "learning_rate": 9.574420941019634e-06, + "loss": 1.5619, + "step": 3958 + }, + { + "epoch": 0.5282892981051508, + "grad_norm": 0.9592411207184994, + "learning_rate": 9.570102991663361e-06, + "loss": 1.6072, + "step": 3959 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 0.9743736463148753, + "learning_rate": 9.565785122606957e-06, + "loss": 1.5505, + "step": 3960 + }, + { + "epoch": 0.5285561782759541, + "grad_norm": 1.0090249364331059, + "learning_rate": 9.561467334656949e-06, + "loss": 1.5881, + "step": 3961 + }, + { + "epoch": 0.5286896183613558, + "grad_norm": 0.9420075410877025, + "learning_rate": 9.55714962861985e-06, + "loss": 1.618, + "step": 3962 + }, + { + "epoch": 0.5288230584467574, + "grad_norm": 0.9692223836450211, + "learning_rate": 9.552832005302154e-06, + "loss": 1.5772, + "step": 3963 + }, + { + "epoch": 0.5289564985321591, + "grad_norm": 0.9489004175073099, + "learning_rate": 9.548514465510353e-06, + "loss": 1.6164, + "step": 3964 + }, + { + "epoch": 0.5290899386175607, + "grad_norm": 0.9421154885492968, + "learning_rate": 9.54419701005091e-06, + "loss": 1.5717, + "step": 3965 + }, + { + "epoch": 0.5292233787029623, + "grad_norm": 0.9424115466890982, + "learning_rate": 9.539879639730276e-06, + "loss": 1.5631, + "step": 3966 + }, + { + "epoch": 0.529356818788364, + "grad_norm": 2.7276724914362083, + "learning_rate": 9.53556235535488e-06, + "loss": 1.5696, + "step": 3967 + }, + { + "epoch": 0.5294902588737657, + "grad_norm": 0.9386033684279529, + "learning_rate": 9.531245157731154e-06, + "loss": 1.5285, + "step": 3968 + }, + { + "epoch": 0.5296236989591674, + "grad_norm": 0.8899585596068399, + "learning_rate": 9.526928047665495e-06, + "loss": 1.5118, + "step": 3969 + }, + { + "epoch": 0.529757139044569, + "grad_norm": 0.9826968773740292, + "learning_rate": 9.522611025964288e-06, + "loss": 1.5888, + "step": 3970 + }, + { + "epoch": 0.5298905791299706, + "grad_norm": 1.0400188151367677, + "learning_rate": 9.51829409343391e-06, + "loss": 1.6159, + "step": 3971 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 0.9561987813115096, + "learning_rate": 9.51397725088071e-06, + "loss": 1.5299, + "step": 3972 + }, + { + "epoch": 0.530157459300774, + "grad_norm": 0.9612234452056033, + "learning_rate": 9.509660499111027e-06, + "loss": 1.5819, + "step": 3973 + }, + { + "epoch": 0.5302908993861756, + "grad_norm": 0.9139976394076528, + "learning_rate": 9.505343838931177e-06, + "loss": 1.5485, + "step": 3974 + }, + { + "epoch": 0.5304243394715773, + "grad_norm": 1.0692106111452742, + "learning_rate": 9.501027271147466e-06, + "loss": 1.581, + "step": 3975 + }, + { + "epoch": 0.5305577795569789, + "grad_norm": 1.0131134517933047, + "learning_rate": 9.496710796566181e-06, + "loss": 1.6409, + "step": 3976 + }, + { + "epoch": 0.5306912196423805, + "grad_norm": 0.9508397877941868, + "learning_rate": 9.492394415993587e-06, + "loss": 1.5288, + "step": 3977 + }, + { + "epoch": 0.5308246597277823, + "grad_norm": 1.0043689220791059, + "learning_rate": 9.48807813023594e-06, + "loss": 1.5687, + 
"step": 3978 + }, + { + "epoch": 0.5309580998131839, + "grad_norm": 1.0338708219457289, + "learning_rate": 9.483761940099471e-06, + "loss": 1.6097, + "step": 3979 + }, + { + "epoch": 0.5310915398985855, + "grad_norm": 0.9661798704066481, + "learning_rate": 9.479445846390389e-06, + "loss": 1.6003, + "step": 3980 + }, + { + "epoch": 0.5312249799839872, + "grad_norm": 1.1436314427778058, + "learning_rate": 9.475129849914898e-06, + "loss": 1.6077, + "step": 3981 + }, + { + "epoch": 0.5313584200693888, + "grad_norm": 1.1001896581114239, + "learning_rate": 9.470813951479177e-06, + "loss": 1.5785, + "step": 3982 + }, + { + "epoch": 0.5314918601547906, + "grad_norm": 0.9585361064875515, + "learning_rate": 9.466498151889385e-06, + "loss": 1.5498, + "step": 3983 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 1.0620960267902282, + "learning_rate": 9.46218245195166e-06, + "loss": 1.5932, + "step": 3984 + }, + { + "epoch": 0.5317587403255938, + "grad_norm": 1.0499763780853832, + "learning_rate": 9.457866852472132e-06, + "loss": 1.6056, + "step": 3985 + }, + { + "epoch": 0.5318921804109955, + "grad_norm": 0.973771229695574, + "learning_rate": 9.453551354256907e-06, + "loss": 1.5651, + "step": 3986 + }, + { + "epoch": 0.5320256204963971, + "grad_norm": 0.9807116282877302, + "learning_rate": 9.449235958112065e-06, + "loss": 1.5977, + "step": 3987 + }, + { + "epoch": 0.5321590605817987, + "grad_norm": 0.9631392166085614, + "learning_rate": 9.444920664843671e-06, + "loss": 1.6167, + "step": 3988 + }, + { + "epoch": 0.5322925006672005, + "grad_norm": 0.993702665081086, + "learning_rate": 9.440605475257782e-06, + "loss": 1.5629, + "step": 3989 + }, + { + "epoch": 0.5324259407526021, + "grad_norm": 1.107669942509192, + "learning_rate": 9.436290390160422e-06, + "loss": 1.5701, + "step": 3990 + }, + { + "epoch": 0.5325593808380037, + "grad_norm": 1.0557051966228765, + "learning_rate": 9.431975410357597e-06, + "loss": 1.5419, + "step": 3991 + }, + { + "epoch": 0.5326928209234054, + "grad_norm": 1.0739759302965814, + "learning_rate": 9.427660536655296e-06, + "loss": 1.6363, + "step": 3992 + }, + { + "epoch": 0.532826261008807, + "grad_norm": 0.9274766595100078, + "learning_rate": 9.423345769859494e-06, + "loss": 1.5837, + "step": 3993 + }, + { + "epoch": 0.5329597010942086, + "grad_norm": 0.9774667853108802, + "learning_rate": 9.419031110776137e-06, + "loss": 1.6363, + "step": 3994 + }, + { + "epoch": 0.5330931411796104, + "grad_norm": 0.9665074812404877, + "learning_rate": 9.41471656021115e-06, + "loss": 1.5858, + "step": 3995 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 0.9516009165945285, + "learning_rate": 9.410402118970447e-06, + "loss": 1.6294, + "step": 3996 + }, + { + "epoch": 0.5333600213504137, + "grad_norm": 0.9668981552357112, + "learning_rate": 9.40608778785992e-06, + "loss": 1.5588, + "step": 3997 + }, + { + "epoch": 0.5334934614358153, + "grad_norm": 0.9784909518756532, + "learning_rate": 9.401773567685428e-06, + "loss": 1.5723, + "step": 3998 + }, + { + "epoch": 0.533626901521217, + "grad_norm": 0.9517428419308205, + "learning_rate": 9.397459459252823e-06, + "loss": 1.6369, + "step": 3999 + }, + { + "epoch": 0.5337603416066187, + "grad_norm": 0.9506059122541574, + "learning_rate": 9.393145463367932e-06, + "loss": 1.6134, + "step": 4000 + }, + { + "epoch": 0.5338937816920203, + "grad_norm": 0.9352905785580472, + "learning_rate": 9.388831580836559e-06, + "loss": 1.596, + "step": 4001 + }, + { + "epoch": 0.5340272217774219, + "grad_norm": 0.9230020903319824, + "learning_rate": 
9.384517812464485e-06, + "loss": 1.542, + "step": 4002 + }, + { + "epoch": 0.5341606618628236, + "grad_norm": 1.0665813163596254, + "learning_rate": 9.38020415905748e-06, + "loss": 1.5823, + "step": 4003 + }, + { + "epoch": 0.5342941019482252, + "grad_norm": 0.9983547945807545, + "learning_rate": 9.37589062142128e-06, + "loss": 1.6159, + "step": 4004 + }, + { + "epoch": 0.5344275420336269, + "grad_norm": 1.0317932796668363, + "learning_rate": 9.371577200361609e-06, + "loss": 1.5753, + "step": 4005 + }, + { + "epoch": 0.5345609821190286, + "grad_norm": 0.9463210756193553, + "learning_rate": 9.367263896684155e-06, + "loss": 1.5713, + "step": 4006 + }, + { + "epoch": 0.5346944222044302, + "grad_norm": 0.9578078666890666, + "learning_rate": 9.362950711194607e-06, + "loss": 1.5438, + "step": 4007 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 0.9984488850157227, + "learning_rate": 9.358637644698614e-06, + "loss": 1.5737, + "step": 4008 + }, + { + "epoch": 0.5349613023752335, + "grad_norm": 0.9572054111804951, + "learning_rate": 9.354324698001802e-06, + "loss": 1.5819, + "step": 4009 + }, + { + "epoch": 0.5350947424606352, + "grad_norm": 0.9538875645675433, + "learning_rate": 9.350011871909787e-06, + "loss": 1.5913, + "step": 4010 + }, + { + "epoch": 0.5352281825460369, + "grad_norm": 1.0188818733645557, + "learning_rate": 9.345699167228156e-06, + "loss": 1.5497, + "step": 4011 + }, + { + "epoch": 0.5353616226314385, + "grad_norm": 0.9520922440061501, + "learning_rate": 9.34138658476247e-06, + "loss": 1.6327, + "step": 4012 + }, + { + "epoch": 0.5354950627168401, + "grad_norm": 0.9539885539724596, + "learning_rate": 9.337074125318268e-06, + "loss": 1.579, + "step": 4013 + }, + { + "epoch": 0.5356285028022418, + "grad_norm": 1.2835828749577056, + "learning_rate": 9.332761789701076e-06, + "loss": 1.5306, + "step": 4014 + }, + { + "epoch": 0.5357619428876434, + "grad_norm": 1.023893323462547, + "learning_rate": 9.328449578716384e-06, + "loss": 1.5653, + "step": 4015 + }, + { + "epoch": 0.5358953829730451, + "grad_norm": 1.2706402832851262, + "learning_rate": 9.324137493169664e-06, + "loss": 1.561, + "step": 4016 + }, + { + "epoch": 0.5360288230584468, + "grad_norm": 0.9500733970079291, + "learning_rate": 9.319825533866364e-06, + "loss": 1.5684, + "step": 4017 + }, + { + "epoch": 0.5361622631438484, + "grad_norm": 0.9654037579664738, + "learning_rate": 9.315513701611913e-06, + "loss": 1.5545, + "step": 4018 + }, + { + "epoch": 0.53629570322925, + "grad_norm": 0.9637541303641998, + "learning_rate": 9.31120199721171e-06, + "loss": 1.6, + "step": 4019 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 0.9449747779438818, + "learning_rate": 9.306890421471128e-06, + "loss": 1.5729, + "step": 4020 + }, + { + "epoch": 0.5365625834000534, + "grad_norm": 1.015400427968874, + "learning_rate": 9.302578975195527e-06, + "loss": 1.5912, + "step": 4021 + }, + { + "epoch": 0.536696023485455, + "grad_norm": 1.2000776620053406, + "learning_rate": 9.298267659190234e-06, + "loss": 1.6051, + "step": 4022 + }, + { + "epoch": 0.5368294635708567, + "grad_norm": 0.9947410519922049, + "learning_rate": 9.293956474260548e-06, + "loss": 1.584, + "step": 4023 + }, + { + "epoch": 0.5369629036562583, + "grad_norm": 0.9569770795832728, + "learning_rate": 9.289645421211756e-06, + "loss": 1.5406, + "step": 4024 + }, + { + "epoch": 0.53709634374166, + "grad_norm": 1.0002749981588506, + "learning_rate": 9.285334500849113e-06, + "loss": 1.6485, + "step": 4025 + }, + { + "epoch": 0.5372297838270617, + "grad_norm": 
0.9254684190371253, + "learning_rate": 9.281023713977848e-06, + "loss": 1.6221, + "step": 4026 + }, + { + "epoch": 0.5373632239124633, + "grad_norm": 1.0069500001650689, + "learning_rate": 9.276713061403164e-06, + "loss": 1.6204, + "step": 4027 + }, + { + "epoch": 0.537496663997865, + "grad_norm": 1.0859626601584031, + "learning_rate": 9.27240254393025e-06, + "loss": 1.5635, + "step": 4028 + }, + { + "epoch": 0.5376301040832666, + "grad_norm": 0.9601269217825726, + "learning_rate": 9.268092162364256e-06, + "loss": 1.6201, + "step": 4029 + }, + { + "epoch": 0.5377635441686682, + "grad_norm": 1.005400728893139, + "learning_rate": 9.263781917510312e-06, + "loss": 1.559, + "step": 4030 + }, + { + "epoch": 0.53789698425407, + "grad_norm": 0.9879107408018795, + "learning_rate": 9.259471810173522e-06, + "loss": 1.5615, + "step": 4031 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 1.035330873394029, + "learning_rate": 9.255161841158968e-06, + "loss": 1.6013, + "step": 4032 + }, + { + "epoch": 0.5381638644248732, + "grad_norm": 1.092318326623765, + "learning_rate": 9.250852011271703e-06, + "loss": 1.6165, + "step": 4033 + }, + { + "epoch": 0.5382973045102749, + "grad_norm": 1.018361237141597, + "learning_rate": 9.24654232131675e-06, + "loss": 1.5758, + "step": 4034 + }, + { + "epoch": 0.5384307445956765, + "grad_norm": 0.9345797395155986, + "learning_rate": 9.242232772099116e-06, + "loss": 1.5766, + "step": 4035 + }, + { + "epoch": 0.5385641846810783, + "grad_norm": 0.9679231135179047, + "learning_rate": 9.237923364423772e-06, + "loss": 1.5835, + "step": 4036 + }, + { + "epoch": 0.5386976247664799, + "grad_norm": 1.0031021300962772, + "learning_rate": 9.233614099095668e-06, + "loss": 1.6144, + "step": 4037 + }, + { + "epoch": 0.5388310648518815, + "grad_norm": 0.9915425586784125, + "learning_rate": 9.229304976919721e-06, + "loss": 1.6167, + "step": 4038 + }, + { + "epoch": 0.5389645049372832, + "grad_norm": 0.9956070431123422, + "learning_rate": 9.224995998700832e-06, + "loss": 1.6112, + "step": 4039 + }, + { + "epoch": 0.5390979450226848, + "grad_norm": 0.9175317898321539, + "learning_rate": 9.220687165243867e-06, + "loss": 1.5324, + "step": 4040 + }, + { + "epoch": 0.5392313851080864, + "grad_norm": 0.9964567601709668, + "learning_rate": 9.216378477353664e-06, + "loss": 1.5438, + "step": 4041 + }, + { + "epoch": 0.5393648251934882, + "grad_norm": 1.2134149238548315, + "learning_rate": 9.212069935835044e-06, + "loss": 1.5954, + "step": 4042 + }, + { + "epoch": 0.5394982652788898, + "grad_norm": 0.9893376951501015, + "learning_rate": 9.207761541492788e-06, + "loss": 1.635, + "step": 4043 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 0.9752793339516473, + "learning_rate": 9.203453295131657e-06, + "loss": 1.5758, + "step": 4044 + }, + { + "epoch": 0.5397651454496931, + "grad_norm": 0.9849050003080004, + "learning_rate": 9.199145197556375e-06, + "loss": 1.59, + "step": 4045 + }, + { + "epoch": 0.5398985855350947, + "grad_norm": 1.3161223332489436, + "learning_rate": 9.194837249571658e-06, + "loss": 1.6444, + "step": 4046 + }, + { + "epoch": 0.5400320256204963, + "grad_norm": 1.0146112609602544, + "learning_rate": 9.190529451982173e-06, + "loss": 1.615, + "step": 4047 + }, + { + "epoch": 0.5401654657058981, + "grad_norm": 0.9544932036464441, + "learning_rate": 9.18622180559257e-06, + "loss": 1.6292, + "step": 4048 + }, + { + "epoch": 0.5402989057912997, + "grad_norm": 0.9541877435381779, + "learning_rate": 9.18191431120747e-06, + "loss": 1.5881, + "step": 4049 + }, + { + "epoch": 
0.5404323458767014, + "grad_norm": 0.9729000021252879, + "learning_rate": 9.177606969631464e-06, + "loss": 1.577, + "step": 4050 + }, + { + "epoch": 0.540565785962103, + "grad_norm": 0.9802692030861584, + "learning_rate": 9.173299781669112e-06, + "loss": 1.5656, + "step": 4051 + }, + { + "epoch": 0.5406992260475046, + "grad_norm": 0.9490128443326544, + "learning_rate": 9.168992748124943e-06, + "loss": 1.5705, + "step": 4052 + }, + { + "epoch": 0.5408326661329064, + "grad_norm": 0.9644978819486809, + "learning_rate": 9.164685869803474e-06, + "loss": 1.5745, + "step": 4053 + }, + { + "epoch": 0.540966106218308, + "grad_norm": 0.9346805372231108, + "learning_rate": 9.160379147509172e-06, + "loss": 1.5811, + "step": 4054 + }, + { + "epoch": 0.5410995463037096, + "grad_norm": 1.0592259248945302, + "learning_rate": 9.156072582046483e-06, + "loss": 1.5821, + "step": 4055 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 0.9912214570473821, + "learning_rate": 9.15176617421983e-06, + "loss": 1.61, + "step": 4056 + }, + { + "epoch": 0.5413664264745129, + "grad_norm": 0.9898284421098535, + "learning_rate": 9.147459924833599e-06, + "loss": 1.574, + "step": 4057 + }, + { + "epoch": 0.5414998665599146, + "grad_norm": 0.9503201276694209, + "learning_rate": 9.143153834692147e-06, + "loss": 1.5615, + "step": 4058 + }, + { + "epoch": 0.5416333066453163, + "grad_norm": 0.933992350962357, + "learning_rate": 9.138847904599802e-06, + "loss": 1.5795, + "step": 4059 + }, + { + "epoch": 0.5417667467307179, + "grad_norm": 1.188754739203571, + "learning_rate": 9.134542135360863e-06, + "loss": 1.5382, + "step": 4060 + }, + { + "epoch": 0.5419001868161195, + "grad_norm": 0.9697813847683896, + "learning_rate": 9.130236527779602e-06, + "loss": 1.5861, + "step": 4061 + }, + { + "epoch": 0.5420336269015212, + "grad_norm": 0.9800137082741205, + "learning_rate": 9.125931082660255e-06, + "loss": 1.5988, + "step": 4062 + }, + { + "epoch": 0.5421670669869229, + "grad_norm": 1.0320301308567623, + "learning_rate": 9.121625800807028e-06, + "loss": 1.5667, + "step": 4063 + }, + { + "epoch": 0.5423005070723246, + "grad_norm": 0.9333833148734103, + "learning_rate": 9.117320683024104e-06, + "loss": 1.5643, + "step": 4064 + }, + { + "epoch": 0.5424339471577262, + "grad_norm": 0.9278004209324023, + "learning_rate": 9.113015730115626e-06, + "loss": 1.5886, + "step": 4065 + }, + { + "epoch": 0.5425673872431278, + "grad_norm": 0.9639727825463563, + "learning_rate": 9.108710942885705e-06, + "loss": 1.5911, + "step": 4066 + }, + { + "epoch": 0.5427008273285295, + "grad_norm": 0.9307792098772837, + "learning_rate": 9.104406322138438e-06, + "loss": 1.5635, + "step": 4067 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 1.0667447561969865, + "learning_rate": 9.100101868677872e-06, + "loss": 1.5563, + "step": 4068 + }, + { + "epoch": 0.5429677074993328, + "grad_norm": 0.912096392390571, + "learning_rate": 9.095797583308033e-06, + "loss": 1.566, + "step": 4069 + }, + { + "epoch": 0.5431011475847345, + "grad_norm": 1.3022530608595129, + "learning_rate": 9.091493466832905e-06, + "loss": 1.5883, + "step": 4070 + }, + { + "epoch": 0.5432345876701361, + "grad_norm": 0.9697016564423682, + "learning_rate": 9.087189520056459e-06, + "loss": 1.6272, + "step": 4071 + }, + { + "epoch": 0.5433680277555377, + "grad_norm": 1.1713917719657274, + "learning_rate": 9.082885743782617e-06, + "loss": 1.539, + "step": 4072 + }, + { + "epoch": 0.5435014678409394, + "grad_norm": 0.9945992018009926, + "learning_rate": 9.078582138815272e-06, + "loss": 1.5641, + 
"step": 4073 + }, + { + "epoch": 0.5436349079263411, + "grad_norm": 1.09987126567883, + "learning_rate": 9.074278705958297e-06, + "loss": 1.5987, + "step": 4074 + }, + { + "epoch": 0.5437683480117427, + "grad_norm": 0.9528284537379281, + "learning_rate": 9.069975446015522e-06, + "loss": 1.5384, + "step": 4075 + }, + { + "epoch": 0.5439017880971444, + "grad_norm": 5.663201568713656, + "learning_rate": 9.065672359790743e-06, + "loss": 1.4918, + "step": 4076 + }, + { + "epoch": 0.544035228182546, + "grad_norm": 0.9679094263779862, + "learning_rate": 9.061369448087725e-06, + "loss": 1.5623, + "step": 4077 + }, + { + "epoch": 0.5441686682679477, + "grad_norm": 2.567953353801192, + "learning_rate": 9.057066711710212e-06, + "loss": 1.5437, + "step": 4078 + }, + { + "epoch": 0.5443021083533494, + "grad_norm": 1.1402343926159393, + "learning_rate": 9.052764151461902e-06, + "loss": 1.6053, + "step": 4079 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 1.0524535435440139, + "learning_rate": 9.048461768146461e-06, + "loss": 1.678, + "step": 4080 + }, + { + "epoch": 0.5445689885241527, + "grad_norm": 0.9636960807728361, + "learning_rate": 9.044159562567532e-06, + "loss": 1.6102, + "step": 4081 + }, + { + "epoch": 0.5447024286095543, + "grad_norm": 1.0002275536656045, + "learning_rate": 9.03985753552871e-06, + "loss": 1.6551, + "step": 4082 + }, + { + "epoch": 0.5448358686949559, + "grad_norm": 0.993667451230517, + "learning_rate": 9.035555687833571e-06, + "loss": 1.5973, + "step": 4083 + }, + { + "epoch": 0.5449693087803577, + "grad_norm": 1.4929672107006777, + "learning_rate": 9.031254020285648e-06, + "loss": 1.5943, + "step": 4084 + }, + { + "epoch": 0.5451027488657593, + "grad_norm": 0.9664973212133721, + "learning_rate": 9.026952533688445e-06, + "loss": 1.5989, + "step": 4085 + }, + { + "epoch": 0.5452361889511609, + "grad_norm": 0.9850092658759886, + "learning_rate": 9.022651228845431e-06, + "loss": 1.6171, + "step": 4086 + }, + { + "epoch": 0.5453696290365626, + "grad_norm": 0.971030931040022, + "learning_rate": 9.018350106560036e-06, + "loss": 1.5712, + "step": 4087 + }, + { + "epoch": 0.5455030691219642, + "grad_norm": 1.2898471611348323, + "learning_rate": 9.014049167635668e-06, + "loss": 1.5786, + "step": 4088 + }, + { + "epoch": 0.5456365092073658, + "grad_norm": 0.9722886044790409, + "learning_rate": 9.00974841287569e-06, + "loss": 1.5522, + "step": 4089 + }, + { + "epoch": 0.5457699492927676, + "grad_norm": 1.274819296685493, + "learning_rate": 9.005447843083436e-06, + "loss": 1.6179, + "step": 4090 + }, + { + "epoch": 0.5459033893781692, + "grad_norm": 0.9802333166916215, + "learning_rate": 9.001147459062196e-06, + "loss": 1.5479, + "step": 4091 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 0.9612343249635548, + "learning_rate": 8.996847261615242e-06, + "loss": 1.5777, + "step": 4092 + }, + { + "epoch": 0.5461702695489725, + "grad_norm": 1.0934912868937452, + "learning_rate": 8.992547251545798e-06, + "loss": 1.5733, + "step": 4093 + }, + { + "epoch": 0.5463037096343741, + "grad_norm": 1.051460553342782, + "learning_rate": 8.988247429657058e-06, + "loss": 1.5806, + "step": 4094 + }, + { + "epoch": 0.5464371497197759, + "grad_norm": 1.0365839786798572, + "learning_rate": 8.983947796752174e-06, + "loss": 1.5982, + "step": 4095 + }, + { + "epoch": 0.5465705898051775, + "grad_norm": 1.1028438221141459, + "learning_rate": 8.979648353634278e-06, + "loss": 1.6002, + "step": 4096 + }, + { + "epoch": 0.5467040298905791, + "grad_norm": 0.9267409999435865, + "learning_rate": 
8.975349101106451e-06, + "loss": 1.5865, + "step": 4097 + }, + { + "epoch": 0.5468374699759808, + "grad_norm": 0.9232565563556838, + "learning_rate": 8.971050039971743e-06, + "loss": 1.5637, + "step": 4098 + }, + { + "epoch": 0.5469709100613824, + "grad_norm": 1.0548309109185285, + "learning_rate": 8.966751171033177e-06, + "loss": 1.5598, + "step": 4099 + }, + { + "epoch": 0.547104350146784, + "grad_norm": 0.9446200469309572, + "learning_rate": 8.962452495093729e-06, + "loss": 1.5853, + "step": 4100 + }, + { + "epoch": 0.5472377902321858, + "grad_norm": 0.9503336907790122, + "learning_rate": 8.95815401295634e-06, + "loss": 1.5885, + "step": 4101 + }, + { + "epoch": 0.5473712303175874, + "grad_norm": 1.1631702714115804, + "learning_rate": 8.953855725423918e-06, + "loss": 1.5864, + "step": 4102 + }, + { + "epoch": 0.5475046704029891, + "grad_norm": 0.928539952699467, + "learning_rate": 8.949557633299335e-06, + "loss": 1.5443, + "step": 4103 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 0.9789123315297109, + "learning_rate": 8.945259737385432e-06, + "loss": 1.6024, + "step": 4104 + }, + { + "epoch": 0.5477715505737923, + "grad_norm": 0.9695913713026796, + "learning_rate": 8.940962038484997e-06, + "loss": 1.5721, + "step": 4105 + }, + { + "epoch": 0.5479049906591941, + "grad_norm": 1.2560495210786051, + "learning_rate": 8.936664537400797e-06, + "loss": 1.5756, + "step": 4106 + }, + { + "epoch": 0.5480384307445957, + "grad_norm": 1.0698213851452432, + "learning_rate": 8.932367234935554e-06, + "loss": 1.5682, + "step": 4107 + }, + { + "epoch": 0.5481718708299973, + "grad_norm": 1.0468767253083986, + "learning_rate": 8.928070131891959e-06, + "loss": 1.5945, + "step": 4108 + }, + { + "epoch": 0.548305310915399, + "grad_norm": 0.9589490681881839, + "learning_rate": 8.923773229072653e-06, + "loss": 1.6103, + "step": 4109 + }, + { + "epoch": 0.5484387510008006, + "grad_norm": 0.9736679031008811, + "learning_rate": 8.919476527280257e-06, + "loss": 1.543, + "step": 4110 + }, + { + "epoch": 0.5485721910862023, + "grad_norm": 0.9528406682287175, + "learning_rate": 8.915180027317345e-06, + "loss": 1.58, + "step": 4111 + }, + { + "epoch": 0.548705631171604, + "grad_norm": 1.0097173511452968, + "learning_rate": 8.91088372998645e-06, + "loss": 1.616, + "step": 4112 + }, + { + "epoch": 0.5488390712570056, + "grad_norm": 0.9881054764092163, + "learning_rate": 8.906587636090078e-06, + "loss": 1.5138, + "step": 4113 + }, + { + "epoch": 0.5489725113424072, + "grad_norm": 0.9333735783197412, + "learning_rate": 8.902291746430686e-06, + "loss": 1.5852, + "step": 4114 + }, + { + "epoch": 0.5491059514278089, + "grad_norm": 0.9489550127540991, + "learning_rate": 8.897996061810697e-06, + "loss": 1.563, + "step": 4115 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 0.962585687561712, + "learning_rate": 8.893700583032494e-06, + "loss": 1.5634, + "step": 4116 + }, + { + "epoch": 0.5493728315986123, + "grad_norm": 1.0345096599792367, + "learning_rate": 8.88940531089843e-06, + "loss": 1.5799, + "step": 4117 + }, + { + "epoch": 0.5495062716840139, + "grad_norm": 0.9472708442632688, + "learning_rate": 8.885110246210809e-06, + "loss": 1.5588, + "step": 4118 + }, + { + "epoch": 0.5496397117694155, + "grad_norm": 0.9677807823462035, + "learning_rate": 8.880815389771896e-06, + "loss": 1.5661, + "step": 4119 + }, + { + "epoch": 0.5497731518548172, + "grad_norm": 0.9496679955804133, + "learning_rate": 8.876520742383931e-06, + "loss": 1.5943, + "step": 4120 + }, + { + "epoch": 0.5499065919402188, + "grad_norm": 
0.9783924610253193, + "learning_rate": 8.8722263048491e-06, + "loss": 1.566, + "step": 4121 + }, + { + "epoch": 0.5500400320256205, + "grad_norm": 1.1878684772471404, + "learning_rate": 8.867932077969555e-06, + "loss": 1.6396, + "step": 4122 + }, + { + "epoch": 0.5501734721110222, + "grad_norm": 0.9611842305243232, + "learning_rate": 8.863638062547406e-06, + "loss": 1.5659, + "step": 4123 + }, + { + "epoch": 0.5503069121964238, + "grad_norm": 0.9306514165663081, + "learning_rate": 8.85934425938473e-06, + "loss": 1.606, + "step": 4124 + }, + { + "epoch": 0.5504403522818254, + "grad_norm": 0.9465699027935999, + "learning_rate": 8.85505066928356e-06, + "loss": 1.6102, + "step": 4125 + }, + { + "epoch": 0.5505737923672271, + "grad_norm": 0.9749768439700673, + "learning_rate": 8.85075729304589e-06, + "loss": 1.5877, + "step": 4126 + }, + { + "epoch": 0.5507072324526288, + "grad_norm": 0.9815904995653956, + "learning_rate": 8.846464131473671e-06, + "loss": 1.538, + "step": 4127 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 0.9804198563493697, + "learning_rate": 8.842171185368822e-06, + "loss": 1.5889, + "step": 4128 + }, + { + "epoch": 0.5509741126234321, + "grad_norm": 1.0046191555459012, + "learning_rate": 8.837878455533213e-06, + "loss": 1.563, + "step": 4129 + }, + { + "epoch": 0.5511075527088337, + "grad_norm": 0.9815357324359852, + "learning_rate": 8.833585942768672e-06, + "loss": 1.605, + "step": 4130 + }, + { + "epoch": 0.5512409927942354, + "grad_norm": 1.0030901091320008, + "learning_rate": 8.829293647877002e-06, + "loss": 1.5989, + "step": 4131 + }, + { + "epoch": 0.551374432879637, + "grad_norm": 1.0070184181295139, + "learning_rate": 8.825001571659948e-06, + "loss": 1.6026, + "step": 4132 + }, + { + "epoch": 0.5515078729650387, + "grad_norm": 0.9393916243147451, + "learning_rate": 8.820709714919223e-06, + "loss": 1.5979, + "step": 4133 + }, + { + "epoch": 0.5516413130504404, + "grad_norm": 0.9663628069756637, + "learning_rate": 8.816418078456493e-06, + "loss": 1.6353, + "step": 4134 + }, + { + "epoch": 0.551774753135842, + "grad_norm": 1.0337970146828375, + "learning_rate": 8.812126663073394e-06, + "loss": 1.591, + "step": 4135 + }, + { + "epoch": 0.5519081932212436, + "grad_norm": 0.944300804007319, + "learning_rate": 8.807835469571511e-06, + "loss": 1.5938, + "step": 4136 + }, + { + "epoch": 0.5520416333066454, + "grad_norm": 0.9431341497957093, + "learning_rate": 8.803544498752383e-06, + "loss": 1.5962, + "step": 4137 + }, + { + "epoch": 0.552175073392047, + "grad_norm": 0.932632959879075, + "learning_rate": 8.799253751417526e-06, + "loss": 1.5937, + "step": 4138 + }, + { + "epoch": 0.5523085134774486, + "grad_norm": 0.9584857084573524, + "learning_rate": 8.794963228368397e-06, + "loss": 1.5368, + "step": 4139 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 0.959115772237164, + "learning_rate": 8.790672930406416e-06, + "loss": 1.5858, + "step": 4140 + }, + { + "epoch": 0.5525753936482519, + "grad_norm": 1.1797548555444335, + "learning_rate": 8.78638285833296e-06, + "loss": 1.5726, + "step": 4141 + }, + { + "epoch": 0.5527088337336535, + "grad_norm": 0.9424673897338993, + "learning_rate": 8.782093012949373e-06, + "loss": 1.5949, + "step": 4142 + }, + { + "epoch": 0.5528422738190553, + "grad_norm": 0.9713375108175255, + "learning_rate": 8.777803395056945e-06, + "loss": 1.6092, + "step": 4143 + }, + { + "epoch": 0.5529757139044569, + "grad_norm": 0.9774551665234783, + "learning_rate": 8.773514005456923e-06, + "loss": 1.5954, + "step": 4144 + }, + { + "epoch": 
0.5531091539898586, + "grad_norm": 1.0793312067897565, + "learning_rate": 8.769224844950525e-06, + "loss": 1.5821, + "step": 4145 + }, + { + "epoch": 0.5532425940752602, + "grad_norm": 1.8746469240528907, + "learning_rate": 8.76493591433891e-06, + "loss": 1.5594, + "step": 4146 + }, + { + "epoch": 0.5533760341606618, + "grad_norm": 0.990387748915661, + "learning_rate": 8.760647214423206e-06, + "loss": 1.5758, + "step": 4147 + }, + { + "epoch": 0.5535094742460636, + "grad_norm": 0.9682888207800222, + "learning_rate": 8.756358746004492e-06, + "loss": 1.574, + "step": 4148 + }, + { + "epoch": 0.5536429143314652, + "grad_norm": 1.10384040970137, + "learning_rate": 8.752070509883805e-06, + "loss": 1.5835, + "step": 4149 + }, + { + "epoch": 0.5537763544168668, + "grad_norm": 0.9794947979779128, + "learning_rate": 8.747782506862139e-06, + "loss": 1.5744, + "step": 4150 + }, + { + "epoch": 0.5539097945022685, + "grad_norm": 1.0987054858543068, + "learning_rate": 8.74349473774044e-06, + "loss": 1.6253, + "step": 4151 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 0.9490644187289331, + "learning_rate": 8.739207203319622e-06, + "loss": 1.5796, + "step": 4152 + }, + { + "epoch": 0.5541766746730717, + "grad_norm": 1.0378405608665844, + "learning_rate": 8.734919904400544e-06, + "loss": 1.5961, + "step": 4153 + }, + { + "epoch": 0.5543101147584735, + "grad_norm": 0.9322492359210561, + "learning_rate": 8.730632841784023e-06, + "loss": 1.5715, + "step": 4154 + }, + { + "epoch": 0.5544435548438751, + "grad_norm": 0.9781596956125678, + "learning_rate": 8.726346016270833e-06, + "loss": 1.5597, + "step": 4155 + }, + { + "epoch": 0.5545769949292767, + "grad_norm": 0.9527266527507525, + "learning_rate": 8.72205942866171e-06, + "loss": 1.6075, + "step": 4156 + }, + { + "epoch": 0.5547104350146784, + "grad_norm": 0.9106864869517425, + "learning_rate": 8.717773079757333e-06, + "loss": 1.5677, + "step": 4157 + }, + { + "epoch": 0.55484387510008, + "grad_norm": 0.9851888382485386, + "learning_rate": 8.713486970358348e-06, + "loss": 1.5512, + "step": 4158 + }, + { + "epoch": 0.5549773151854818, + "grad_norm": 0.986066626003795, + "learning_rate": 8.709201101265346e-06, + "loss": 1.6065, + "step": 4159 + }, + { + "epoch": 0.5551107552708834, + "grad_norm": 0.9632922780457756, + "learning_rate": 8.704915473278885e-06, + "loss": 1.5635, + "step": 4160 + }, + { + "epoch": 0.555244195356285, + "grad_norm": 0.9616053211021358, + "learning_rate": 8.700630087199468e-06, + "loss": 1.5781, + "step": 4161 + }, + { + "epoch": 0.5553776354416867, + "grad_norm": 0.9536548628997336, + "learning_rate": 8.696344943827553e-06, + "loss": 1.5276, + "step": 4162 + }, + { + "epoch": 0.5555110755270883, + "grad_norm": 1.0768239441946357, + "learning_rate": 8.692060043963565e-06, + "loss": 1.6121, + "step": 4163 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 1.214532797811325, + "learning_rate": 8.68777538840787e-06, + "loss": 1.6292, + "step": 4164 + }, + { + "epoch": 0.5557779556978917, + "grad_norm": 0.9333872970846957, + "learning_rate": 8.68349097796079e-06, + "loss": 1.5566, + "step": 4165 + }, + { + "epoch": 0.5559113957832933, + "grad_norm": 0.9800519597605887, + "learning_rate": 8.679206813422605e-06, + "loss": 1.5816, + "step": 4166 + }, + { + "epoch": 0.5560448358686949, + "grad_norm": 1.0132442396753862, + "learning_rate": 8.67492289559355e-06, + "loss": 1.5314, + "step": 4167 + }, + { + "epoch": 0.5561782759540966, + "grad_norm": 0.961576475954957, + "learning_rate": 8.670639225273813e-06, + "loss": 1.6011, + 
"step": 4168 + }, + { + "epoch": 0.5563117160394982, + "grad_norm": 0.9788757658928525, + "learning_rate": 8.66635580326353e-06, + "loss": 1.5529, + "step": 4169 + }, + { + "epoch": 0.5564451561248999, + "grad_norm": 1.0085557507010017, + "learning_rate": 8.662072630362803e-06, + "loss": 1.5839, + "step": 4170 + }, + { + "epoch": 0.5565785962103016, + "grad_norm": 1.0390389689513875, + "learning_rate": 8.657789707371675e-06, + "loss": 1.6039, + "step": 4171 + }, + { + "epoch": 0.5567120362957032, + "grad_norm": 8.058031207442108, + "learning_rate": 8.65350703509015e-06, + "loss": 1.6436, + "step": 4172 + }, + { + "epoch": 0.5568454763811049, + "grad_norm": 0.9705636062848059, + "learning_rate": 8.649224614318174e-06, + "loss": 1.5705, + "step": 4173 + }, + { + "epoch": 0.5569789164665065, + "grad_norm": 0.9395606140307743, + "learning_rate": 8.644942445855666e-06, + "loss": 1.619, + "step": 4174 + }, + { + "epoch": 0.5571123565519082, + "grad_norm": 0.9746333291733118, + "learning_rate": 8.64066053050248e-06, + "loss": 1.5455, + "step": 4175 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 0.9673212477085391, + "learning_rate": 8.636378869058427e-06, + "loss": 1.5958, + "step": 4176 + }, + { + "epoch": 0.5573792367227115, + "grad_norm": 0.9991053615117632, + "learning_rate": 8.63209746232328e-06, + "loss": 1.6009, + "step": 4177 + }, + { + "epoch": 0.5575126768081131, + "grad_norm": 0.9645490800883675, + "learning_rate": 8.627816311096753e-06, + "loss": 1.5588, + "step": 4178 + }, + { + "epoch": 0.5576461168935148, + "grad_norm": 1.0862567601365494, + "learning_rate": 8.623535416178516e-06, + "loss": 1.6125, + "step": 4179 + }, + { + "epoch": 0.5577795569789165, + "grad_norm": 0.9565579659399656, + "learning_rate": 8.619254778368187e-06, + "loss": 1.5513, + "step": 4180 + }, + { + "epoch": 0.5579129970643181, + "grad_norm": 1.0678592227935304, + "learning_rate": 8.61497439846535e-06, + "loss": 1.6321, + "step": 4181 + }, + { + "epoch": 0.5580464371497198, + "grad_norm": 0.96056585056449, + "learning_rate": 8.610694277269526e-06, + "loss": 1.6026, + "step": 4182 + }, + { + "epoch": 0.5581798772351214, + "grad_norm": 1.011056257284224, + "learning_rate": 8.60641441558019e-06, + "loss": 1.5451, + "step": 4183 + }, + { + "epoch": 0.5583133173205231, + "grad_norm": 0.9413247204545455, + "learning_rate": 8.602134814196776e-06, + "loss": 1.5566, + "step": 4184 + }, + { + "epoch": 0.5584467574059248, + "grad_norm": 1.0445816035815227, + "learning_rate": 8.597855473918664e-06, + "loss": 1.5626, + "step": 4185 + }, + { + "epoch": 0.5585801974913264, + "grad_norm": 0.9575339480339196, + "learning_rate": 8.593576395545187e-06, + "loss": 1.6168, + "step": 4186 + }, + { + "epoch": 0.5587136375767281, + "grad_norm": 0.9282496732846511, + "learning_rate": 8.589297579875624e-06, + "loss": 1.5874, + "step": 4187 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 0.9840758991935276, + "learning_rate": 8.585019027709211e-06, + "loss": 1.6467, + "step": 4188 + }, + { + "epoch": 0.5589805177475313, + "grad_norm": 1.0328945704121253, + "learning_rate": 8.580740739845138e-06, + "loss": 1.5509, + "step": 4189 + }, + { + "epoch": 0.559113957832933, + "grad_norm": 1.0478866163329408, + "learning_rate": 8.576462717082532e-06, + "loss": 1.5587, + "step": 4190 + }, + { + "epoch": 0.5592473979183347, + "grad_norm": 1.1273918515526908, + "learning_rate": 8.572184960220485e-06, + "loss": 1.6093, + "step": 4191 + }, + { + "epoch": 0.5593808380037363, + "grad_norm": 0.9626145010766713, + "learning_rate": 
8.56790747005803e-06, + "loss": 1.5516, + "step": 4192 + }, + { + "epoch": 0.559514278089138, + "grad_norm": 2.3623694177009043, + "learning_rate": 8.563630247394157e-06, + "loss": 1.6187, + "step": 4193 + }, + { + "epoch": 0.5596477181745396, + "grad_norm": 1.056578864134312, + "learning_rate": 8.559353293027793e-06, + "loss": 1.5665, + "step": 4194 + }, + { + "epoch": 0.5597811582599412, + "grad_norm": 0.9458997799992117, + "learning_rate": 8.555076607757837e-06, + "loss": 1.5715, + "step": 4195 + }, + { + "epoch": 0.559914598345343, + "grad_norm": 0.9618299176959383, + "learning_rate": 8.55080019238312e-06, + "loss": 1.5743, + "step": 4196 + }, + { + "epoch": 0.5600480384307446, + "grad_norm": 0.9743576637342902, + "learning_rate": 8.546524047702428e-06, + "loss": 1.6132, + "step": 4197 + }, + { + "epoch": 0.5601814785161463, + "grad_norm": 0.954261768012623, + "learning_rate": 8.54224817451449e-06, + "loss": 1.5947, + "step": 4198 + }, + { + "epoch": 0.5603149186015479, + "grad_norm": 0.967928239432757, + "learning_rate": 8.537972573618e-06, + "loss": 1.5806, + "step": 4199 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 0.9509794485601105, + "learning_rate": 8.53369724581159e-06, + "loss": 1.5455, + "step": 4200 + }, + { + "epoch": 0.5605817987723513, + "grad_norm": 0.948976660250996, + "learning_rate": 8.529422191893832e-06, + "loss": 1.6024, + "step": 4201 + }, + { + "epoch": 0.5607152388577529, + "grad_norm": 0.9018937165977122, + "learning_rate": 8.525147412663272e-06, + "loss": 1.5771, + "step": 4202 + }, + { + "epoch": 0.5608486789431545, + "grad_norm": 0.9881572931305829, + "learning_rate": 8.520872908918382e-06, + "loss": 1.5898, + "step": 4203 + }, + { + "epoch": 0.5609821190285562, + "grad_norm": 1.124313244736089, + "learning_rate": 8.516598681457595e-06, + "loss": 1.6095, + "step": 4204 + }, + { + "epoch": 0.5611155591139578, + "grad_norm": 1.1089045566415832, + "learning_rate": 8.512324731079277e-06, + "loss": 1.5805, + "step": 4205 + }, + { + "epoch": 0.5612489991993594, + "grad_norm": 0.9797580686965339, + "learning_rate": 8.508051058581768e-06, + "loss": 1.5679, + "step": 4206 + }, + { + "epoch": 0.5613824392847612, + "grad_norm": 0.9722387996589369, + "learning_rate": 8.503777664763336e-06, + "loss": 1.5833, + "step": 4207 + }, + { + "epoch": 0.5615158793701628, + "grad_norm": 1.07631421997575, + "learning_rate": 8.499504550422195e-06, + "loss": 1.562, + "step": 4208 + }, + { + "epoch": 0.5616493194555644, + "grad_norm": 1.0147686908909404, + "learning_rate": 8.495231716356525e-06, + "loss": 1.5465, + "step": 4209 + }, + { + "epoch": 0.5617827595409661, + "grad_norm": 0.9427076649487832, + "learning_rate": 8.490959163364436e-06, + "loss": 1.5497, + "step": 4210 + }, + { + "epoch": 0.5619161996263677, + "grad_norm": 1.1230893036871608, + "learning_rate": 8.486686892243996e-06, + "loss": 1.5773, + "step": 4211 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 0.9289630374962218, + "learning_rate": 8.482414903793213e-06, + "loss": 1.6441, + "step": 4212 + }, + { + "epoch": 0.5621830797971711, + "grad_norm": 0.9520481957952125, + "learning_rate": 8.478143198810048e-06, + "loss": 1.6163, + "step": 4213 + }, + { + "epoch": 0.5623165198825727, + "grad_norm": 1.0545873114232873, + "learning_rate": 8.473871778092408e-06, + "loss": 1.5663, + "step": 4214 + }, + { + "epoch": 0.5624499599679744, + "grad_norm": 0.9532253546176354, + "learning_rate": 8.469600642438139e-06, + "loss": 1.5472, + "step": 4215 + }, + { + "epoch": 0.562583400053376, + "grad_norm": 
0.9979955697123374, + "learning_rate": 8.465329792645051e-06, + "loss": 1.5256, + "step": 4216 + }, + { + "epoch": 0.5627168401387777, + "grad_norm": 0.9702416076361973, + "learning_rate": 8.461059229510886e-06, + "loss": 1.541, + "step": 4217 + }, + { + "epoch": 0.5628502802241794, + "grad_norm": 0.998163150298418, + "learning_rate": 8.456788953833333e-06, + "loss": 1.586, + "step": 4218 + }, + { + "epoch": 0.562983720309581, + "grad_norm": 0.9618357742711305, + "learning_rate": 8.45251896641003e-06, + "loss": 1.5574, + "step": 4219 + }, + { + "epoch": 0.5631171603949826, + "grad_norm": 0.9617959238619088, + "learning_rate": 8.448249268038569e-06, + "loss": 1.5801, + "step": 4220 + }, + { + "epoch": 0.5632506004803843, + "grad_norm": 0.9388770256513779, + "learning_rate": 8.443979859516478e-06, + "loss": 1.5987, + "step": 4221 + }, + { + "epoch": 0.563384040565786, + "grad_norm": 1.0984016692407286, + "learning_rate": 8.439710741641234e-06, + "loss": 1.6375, + "step": 4222 + }, + { + "epoch": 0.5635174806511876, + "grad_norm": 0.9883405451834961, + "learning_rate": 8.435441915210254e-06, + "loss": 1.5521, + "step": 4223 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 1.216557539530969, + "learning_rate": 8.431173381020915e-06, + "loss": 1.5352, + "step": 4224 + }, + { + "epoch": 0.5637843608219909, + "grad_norm": 0.9774137933177777, + "learning_rate": 8.426905139870528e-06, + "loss": 1.555, + "step": 4225 + }, + { + "epoch": 0.5639178009073926, + "grad_norm": 1.2860621275048014, + "learning_rate": 8.422637192556345e-06, + "loss": 1.5603, + "step": 4226 + }, + { + "epoch": 0.5640512409927942, + "grad_norm": 1.0444121247666944, + "learning_rate": 8.418369539875579e-06, + "loss": 1.6277, + "step": 4227 + }, + { + "epoch": 0.5641846810781959, + "grad_norm": 0.9276944621654523, + "learning_rate": 8.414102182625377e-06, + "loss": 1.6169, + "step": 4228 + }, + { + "epoch": 0.5643181211635976, + "grad_norm": 0.9785924417239961, + "learning_rate": 8.40983512160283e-06, + "loss": 1.5704, + "step": 4229 + }, + { + "epoch": 0.5644515612489992, + "grad_norm": 0.9784553941398924, + "learning_rate": 8.405568357604975e-06, + "loss": 1.5979, + "step": 4230 + }, + { + "epoch": 0.5645850013344008, + "grad_norm": 1.0150051857133884, + "learning_rate": 8.4013018914288e-06, + "loss": 1.5686, + "step": 4231 + }, + { + "epoch": 0.5647184414198025, + "grad_norm": 1.0133584620768177, + "learning_rate": 8.39703572387123e-06, + "loss": 1.5938, + "step": 4232 + }, + { + "epoch": 0.5648518815052042, + "grad_norm": 0.9864373077024665, + "learning_rate": 8.392769855729134e-06, + "loss": 1.6306, + "step": 4233 + }, + { + "epoch": 0.5649853215906058, + "grad_norm": 0.9740231720507322, + "learning_rate": 8.388504287799333e-06, + "loss": 1.6303, + "step": 4234 + }, + { + "epoch": 0.5651187616760075, + "grad_norm": 1.0375657601783386, + "learning_rate": 8.384239020878583e-06, + "loss": 1.6057, + "step": 4235 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 0.9612076808860169, + "learning_rate": 8.379974055763587e-06, + "loss": 1.5814, + "step": 4236 + }, + { + "epoch": 0.5653856418468107, + "grad_norm": 0.9641347727662392, + "learning_rate": 8.37570939325099e-06, + "loss": 1.5954, + "step": 4237 + }, + { + "epoch": 0.5655190819322125, + "grad_norm": 1.1246728358254432, + "learning_rate": 8.371445034137388e-06, + "loss": 1.5883, + "step": 4238 + }, + { + "epoch": 0.5656525220176141, + "grad_norm": 0.9658697763988295, + "learning_rate": 8.367180979219314e-06, + "loss": 1.5931, + "step": 4239 + }, + { + "epoch": 
0.5657859621030158, + "grad_norm": 1.1394715701292135, + "learning_rate": 8.362917229293236e-06, + "loss": 1.5455, + "step": 4240 + }, + { + "epoch": 0.5659194021884174, + "grad_norm": 1.0026059848550364, + "learning_rate": 8.358653785155586e-06, + "loss": 1.5607, + "step": 4241 + }, + { + "epoch": 0.566052842273819, + "grad_norm": 0.9721001045323454, + "learning_rate": 8.354390647602721e-06, + "loss": 1.6145, + "step": 4242 + }, + { + "epoch": 0.5661862823592207, + "grad_norm": 0.9081483328670256, + "learning_rate": 8.350127817430949e-06, + "loss": 1.5278, + "step": 4243 + }, + { + "epoch": 0.5663197224446224, + "grad_norm": 1.0420600474407877, + "learning_rate": 8.34586529543651e-06, + "loss": 1.5287, + "step": 4244 + }, + { + "epoch": 0.566453162530024, + "grad_norm": 1.0148442225077963, + "learning_rate": 8.341603082415604e-06, + "loss": 1.5427, + "step": 4245 + }, + { + "epoch": 0.5665866026154257, + "grad_norm": 0.9893277215121336, + "learning_rate": 8.337341179164363e-06, + "loss": 1.6118, + "step": 4246 + }, + { + "epoch": 0.5667200427008273, + "grad_norm": 1.2148103117653153, + "learning_rate": 8.333079586478854e-06, + "loss": 1.582, + "step": 4247 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 0.9457405540546562, + "learning_rate": 8.328818305155105e-06, + "loss": 1.6044, + "step": 4248 + }, + { + "epoch": 0.5669869228716307, + "grad_norm": 0.990303038169507, + "learning_rate": 8.324557335989068e-06, + "loss": 1.5833, + "step": 4249 + }, + { + "epoch": 0.5671203629570323, + "grad_norm": 0.9371628277905556, + "learning_rate": 8.320296679776647e-06, + "loss": 1.5408, + "step": 4250 + }, + { + "epoch": 0.567253803042434, + "grad_norm": 0.989229363405912, + "learning_rate": 8.316036337313678e-06, + "loss": 1.5705, + "step": 4251 + }, + { + "epoch": 0.5673872431278356, + "grad_norm": 0.9163649237832772, + "learning_rate": 8.31177630939595e-06, + "loss": 1.577, + "step": 4252 + }, + { + "epoch": 0.5675206832132372, + "grad_norm": 0.9865296079255822, + "learning_rate": 8.307516596819188e-06, + "loss": 1.5335, + "step": 4253 + }, + { + "epoch": 0.567654123298639, + "grad_norm": 0.9327295789695722, + "learning_rate": 8.303257200379055e-06, + "loss": 1.6193, + "step": 4254 + }, + { + "epoch": 0.5677875633840406, + "grad_norm": 0.996608302154684, + "learning_rate": 8.298998120871159e-06, + "loss": 1.5586, + "step": 4255 + }, + { + "epoch": 0.5679210034694422, + "grad_norm": 1.0163058132224498, + "learning_rate": 8.294739359091048e-06, + "loss": 1.5789, + "step": 4256 + }, + { + "epoch": 0.5680544435548439, + "grad_norm": 0.9545226447240991, + "learning_rate": 8.29048091583421e-06, + "loss": 1.5417, + "step": 4257 + }, + { + "epoch": 0.5681878836402455, + "grad_norm": 0.9899206964927434, + "learning_rate": 8.286222791896068e-06, + "loss": 1.6199, + "step": 4258 + }, + { + "epoch": 0.5683213237256471, + "grad_norm": 1.0349824476929077, + "learning_rate": 8.281964988072001e-06, + "loss": 1.5948, + "step": 4259 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 0.9700116138822578, + "learning_rate": 8.277707505157313e-06, + "loss": 1.5548, + "step": 4260 + }, + { + "epoch": 0.5685882038964505, + "grad_norm": 0.9509425096012817, + "learning_rate": 8.273450343947255e-06, + "loss": 1.6276, + "step": 4261 + }, + { + "epoch": 0.5687216439818521, + "grad_norm": 1.2640017963359513, + "learning_rate": 8.269193505237013e-06, + "loss": 1.5947, + "step": 4262 + }, + { + "epoch": 0.5688550840672538, + "grad_norm": 0.9467480150131761, + "learning_rate": 8.26493698982172e-06, + "loss": 1.6069, + 
"step": 4263 + }, + { + "epoch": 0.5689885241526554, + "grad_norm": 0.9907646708465446, + "learning_rate": 8.260680798496444e-06, + "loss": 1.578, + "step": 4264 + }, + { + "epoch": 0.5691219642380572, + "grad_norm": 1.1061065525912817, + "learning_rate": 8.256424932056187e-06, + "loss": 1.579, + "step": 4265 + }, + { + "epoch": 0.5692554043234588, + "grad_norm": 1.325438533023951, + "learning_rate": 8.252169391295905e-06, + "loss": 1.5684, + "step": 4266 + }, + { + "epoch": 0.5693888444088604, + "grad_norm": 0.9261579791060407, + "learning_rate": 8.247914177010482e-06, + "loss": 1.5429, + "step": 4267 + }, + { + "epoch": 0.5695222844942621, + "grad_norm": 0.9433004236090312, + "learning_rate": 8.243659289994741e-06, + "loss": 1.5836, + "step": 4268 + }, + { + "epoch": 0.5696557245796637, + "grad_norm": 0.9493027808271886, + "learning_rate": 8.239404731043446e-06, + "loss": 1.5597, + "step": 4269 + }, + { + "epoch": 0.5697891646650654, + "grad_norm": 0.972218396115009, + "learning_rate": 8.235150500951304e-06, + "loss": 1.6362, + "step": 4270 + }, + { + "epoch": 0.5699226047504671, + "grad_norm": 0.9262069004868365, + "learning_rate": 8.230896600512956e-06, + "loss": 1.6098, + "step": 4271 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 1.0129980611981801, + "learning_rate": 8.226643030522979e-06, + "loss": 1.6419, + "step": 4272 + }, + { + "epoch": 0.5701894849212703, + "grad_norm": 0.9574543891512742, + "learning_rate": 8.222389791775895e-06, + "loss": 1.5302, + "step": 4273 + }, + { + "epoch": 0.570322925006672, + "grad_norm": 2.7284176094811077, + "learning_rate": 8.218136885066158e-06, + "loss": 1.6217, + "step": 4274 + }, + { + "epoch": 0.5704563650920736, + "grad_norm": 0.9819347041563397, + "learning_rate": 8.213884311188166e-06, + "loss": 1.5888, + "step": 4275 + }, + { + "epoch": 0.5705898051774753, + "grad_norm": 0.9895366406849194, + "learning_rate": 8.209632070936246e-06, + "loss": 1.5732, + "step": 4276 + }, + { + "epoch": 0.570723245262877, + "grad_norm": 0.9348464052451259, + "learning_rate": 8.205380165104673e-06, + "loss": 1.5906, + "step": 4277 + }, + { + "epoch": 0.5708566853482786, + "grad_norm": 0.9528816676643417, + "learning_rate": 8.201128594487653e-06, + "loss": 1.6113, + "step": 4278 + }, + { + "epoch": 0.5709901254336803, + "grad_norm": 0.9776229291280035, + "learning_rate": 8.196877359879327e-06, + "loss": 1.5856, + "step": 4279 + }, + { + "epoch": 0.5711235655190819, + "grad_norm": 0.9414417277905998, + "learning_rate": 8.192626462073782e-06, + "loss": 1.564, + "step": 4280 + }, + { + "epoch": 0.5712570056044836, + "grad_norm": 0.9808001957769379, + "learning_rate": 8.18837590186504e-06, + "loss": 1.5853, + "step": 4281 + }, + { + "epoch": 0.5713904456898853, + "grad_norm": 1.1219868857209958, + "learning_rate": 8.184125680047051e-06, + "loss": 1.5385, + "step": 4282 + }, + { + "epoch": 0.5715238857752869, + "grad_norm": 0.9792466209394581, + "learning_rate": 8.179875797413707e-06, + "loss": 1.6143, + "step": 4283 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 0.9655150327845816, + "learning_rate": 8.175626254758847e-06, + "loss": 1.5814, + "step": 4284 + }, + { + "epoch": 0.5717907659460902, + "grad_norm": 1.0677323984435168, + "learning_rate": 8.171377052876228e-06, + "loss": 1.5212, + "step": 4285 + }, + { + "epoch": 0.5719242060314919, + "grad_norm": 1.0478556969877482, + "learning_rate": 8.167128192559557e-06, + "loss": 1.5864, + "step": 4286 + }, + { + "epoch": 0.5720576461168935, + "grad_norm": 0.9806397349097133, + "learning_rate": 
8.162879674602469e-06, + "loss": 1.5688, + "step": 4287 + }, + { + "epoch": 0.5721910862022952, + "grad_norm": 0.957753049608993, + "learning_rate": 8.158631499798545e-06, + "loss": 1.5769, + "step": 4288 + }, + { + "epoch": 0.5723245262876968, + "grad_norm": 0.9714614263607235, + "learning_rate": 8.154383668941293e-06, + "loss": 1.5687, + "step": 4289 + }, + { + "epoch": 0.5724579663730984, + "grad_norm": 1.1716096470089616, + "learning_rate": 8.150136182824152e-06, + "loss": 1.5396, + "step": 4290 + }, + { + "epoch": 0.5725914064585002, + "grad_norm": 0.992260967601937, + "learning_rate": 8.145889042240516e-06, + "loss": 1.5828, + "step": 4291 + }, + { + "epoch": 0.5727248465439018, + "grad_norm": 0.965708034327343, + "learning_rate": 8.141642247983696e-06, + "loss": 1.5623, + "step": 4292 + }, + { + "epoch": 0.5728582866293035, + "grad_norm": 0.9907802430036895, + "learning_rate": 8.137395800846948e-06, + "loss": 1.6284, + "step": 4293 + }, + { + "epoch": 0.5729917267147051, + "grad_norm": 0.9396948795159468, + "learning_rate": 8.133149701623454e-06, + "loss": 1.6403, + "step": 4294 + }, + { + "epoch": 0.5731251668001067, + "grad_norm": 0.9597661967095336, + "learning_rate": 8.12890395110634e-06, + "loss": 1.5808, + "step": 4295 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 9.3684749649107, + "learning_rate": 8.124658550088668e-06, + "loss": 1.6139, + "step": 4296 + }, + { + "epoch": 0.5733920469709101, + "grad_norm": 0.9286997041501741, + "learning_rate": 8.120413499363427e-06, + "loss": 1.5668, + "step": 4297 + }, + { + "epoch": 0.5735254870563117, + "grad_norm": 0.9796234552599323, + "learning_rate": 8.116168799723544e-06, + "loss": 1.5597, + "step": 4298 + }, + { + "epoch": 0.5736589271417134, + "grad_norm": 1.0267380425578665, + "learning_rate": 8.111924451961883e-06, + "loss": 1.6226, + "step": 4299 + }, + { + "epoch": 0.573792367227115, + "grad_norm": 1.125467638614621, + "learning_rate": 8.107680456871236e-06, + "loss": 1.6001, + "step": 4300 + }, + { + "epoch": 0.5739258073125166, + "grad_norm": 0.9428006871087928, + "learning_rate": 8.10343681524433e-06, + "loss": 1.6035, + "step": 4301 + }, + { + "epoch": 0.5740592473979184, + "grad_norm": 0.9693180970207167, + "learning_rate": 8.09919352787384e-06, + "loss": 1.5463, + "step": 4302 + }, + { + "epoch": 0.57419268748332, + "grad_norm": 1.0176972242986655, + "learning_rate": 8.094950595552356e-06, + "loss": 1.6065, + "step": 4303 + }, + { + "epoch": 0.5743261275687216, + "grad_norm": 1.1248089412402027, + "learning_rate": 8.09070801907241e-06, + "loss": 1.6226, + "step": 4304 + }, + { + "epoch": 0.5744595676541233, + "grad_norm": 0.9608200644959385, + "learning_rate": 8.086465799226468e-06, + "loss": 1.5211, + "step": 4305 + }, + { + "epoch": 0.5745930077395249, + "grad_norm": 0.9756758699322823, + "learning_rate": 8.08222393680693e-06, + "loss": 1.5682, + "step": 4306 + }, + { + "epoch": 0.5747264478249267, + "grad_norm": 1.0688838840416501, + "learning_rate": 8.077982432606125e-06, + "loss": 1.5725, + "step": 4307 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 1.1309638787833816, + "learning_rate": 8.073741287416317e-06, + "loss": 1.6059, + "step": 4308 + }, + { + "epoch": 0.5749933279957299, + "grad_norm": 1.0556322400323577, + "learning_rate": 8.069500502029705e-06, + "loss": 1.5937, + "step": 4309 + }, + { + "epoch": 0.5751267680811316, + "grad_norm": 0.9506475357460462, + "learning_rate": 8.065260077238423e-06, + "loss": 1.5802, + "step": 4310 + }, + { + "epoch": 0.5752602081665332, + "grad_norm": 
1.077301912471358, + "learning_rate": 8.061020013834526e-06, + "loss": 1.594, + "step": 4311 + }, + { + "epoch": 0.5753936482519348, + "grad_norm": 0.9475246370591415, + "learning_rate": 8.056780312610019e-06, + "loss": 1.5518, + "step": 4312 + }, + { + "epoch": 0.5755270883373366, + "grad_norm": 1.1642637584365982, + "learning_rate": 8.052540974356825e-06, + "loss": 1.5499, + "step": 4313 + }, + { + "epoch": 0.5756605284227382, + "grad_norm": 0.962853058046866, + "learning_rate": 8.048301999866803e-06, + "loss": 1.625, + "step": 4314 + }, + { + "epoch": 0.5757939685081398, + "grad_norm": 0.951938089036065, + "learning_rate": 8.044063389931745e-06, + "loss": 1.6001, + "step": 4315 + }, + { + "epoch": 0.5759274085935415, + "grad_norm": 0.9850586486463077, + "learning_rate": 8.039825145343378e-06, + "loss": 1.5917, + "step": 4316 + }, + { + "epoch": 0.5760608486789431, + "grad_norm": 0.9996996255343955, + "learning_rate": 8.035587266893357e-06, + "loss": 1.59, + "step": 4317 + }, + { + "epoch": 0.5761942887643449, + "grad_norm": 0.9361786850612416, + "learning_rate": 8.031349755373269e-06, + "loss": 1.5217, + "step": 4318 + }, + { + "epoch": 0.5763277288497465, + "grad_norm": 0.9534574866115021, + "learning_rate": 8.02711261157463e-06, + "loss": 1.6341, + "step": 4319 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 0.9607054461212049, + "learning_rate": 8.022875836288896e-06, + "loss": 1.5407, + "step": 4320 + }, + { + "epoch": 0.5765946090205498, + "grad_norm": 1.035113379371202, + "learning_rate": 8.018639430307445e-06, + "loss": 1.546, + "step": 4321 + }, + { + "epoch": 0.5767280491059514, + "grad_norm": 0.9424964205669921, + "learning_rate": 8.014403394421585e-06, + "loss": 1.5562, + "step": 4322 + }, + { + "epoch": 0.576861489191353, + "grad_norm": 0.9423821513514596, + "learning_rate": 8.010167729422571e-06, + "loss": 1.5583, + "step": 4323 + }, + { + "epoch": 0.5769949292767548, + "grad_norm": 0.9340881718791657, + "learning_rate": 8.005932436101567e-06, + "loss": 1.5567, + "step": 4324 + }, + { + "epoch": 0.5771283693621564, + "grad_norm": 0.9680824040104852, + "learning_rate": 8.001697515249683e-06, + "loss": 1.5881, + "step": 4325 + }, + { + "epoch": 0.577261809447558, + "grad_norm": 0.9638573991235978, + "learning_rate": 7.997462967657946e-06, + "loss": 1.5619, + "step": 4326 + }, + { + "epoch": 0.5773952495329597, + "grad_norm": 0.9479429964037883, + "learning_rate": 7.993228794117332e-06, + "loss": 1.573, + "step": 4327 + }, + { + "epoch": 0.5775286896183613, + "grad_norm": 0.9744883318609773, + "learning_rate": 7.988994995418731e-06, + "loss": 1.5964, + "step": 4328 + }, + { + "epoch": 0.577662129703763, + "grad_norm": 0.9469778260011422, + "learning_rate": 7.984761572352963e-06, + "loss": 1.5858, + "step": 4329 + }, + { + "epoch": 0.5777955697891647, + "grad_norm": 0.9464430279636084, + "learning_rate": 7.980528525710795e-06, + "loss": 1.6171, + "step": 4330 + }, + { + "epoch": 0.5779290098745663, + "grad_norm": 7.978193421669048, + "learning_rate": 7.976295856282904e-06, + "loss": 1.5857, + "step": 4331 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 1.2761028754638115, + "learning_rate": 7.97206356485991e-06, + "loss": 1.6031, + "step": 4332 + }, + { + "epoch": 0.5781958900453696, + "grad_norm": 0.993124845092899, + "learning_rate": 7.967831652232346e-06, + "loss": 1.5551, + "step": 4333 + }, + { + "epoch": 0.5783293301307713, + "grad_norm": 0.962042815286027, + "learning_rate": 7.963600119190695e-06, + "loss": 1.5865, + "step": 4334 + }, + { + "epoch": 
0.578462770216173, + "grad_norm": 0.9541286812670964, + "learning_rate": 7.959368966525358e-06, + "loss": 1.5067, + "step": 4335 + }, + { + "epoch": 0.5785962103015746, + "grad_norm": 0.9197038368832774, + "learning_rate": 7.955138195026663e-06, + "loss": 1.5261, + "step": 4336 + }, + { + "epoch": 0.5787296503869762, + "grad_norm": 0.9334310052059513, + "learning_rate": 7.950907805484874e-06, + "loss": 1.5611, + "step": 4337 + }, + { + "epoch": 0.5788630904723779, + "grad_norm": 0.9637654944676048, + "learning_rate": 7.946677798690175e-06, + "loss": 1.5307, + "step": 4338 + }, + { + "epoch": 0.5789965305577796, + "grad_norm": 0.9357279064845219, + "learning_rate": 7.942448175432687e-06, + "loss": 1.5625, + "step": 4339 + }, + { + "epoch": 0.5791299706431812, + "grad_norm": 1.0168364623595538, + "learning_rate": 7.938218936502451e-06, + "loss": 1.6031, + "step": 4340 + }, + { + "epoch": 0.5792634107285829, + "grad_norm": 0.9662217980950242, + "learning_rate": 7.933990082689447e-06, + "loss": 1.5805, + "step": 4341 + }, + { + "epoch": 0.5793968508139845, + "grad_norm": 0.9669704491262874, + "learning_rate": 7.92976161478357e-06, + "loss": 1.5878, + "step": 4342 + }, + { + "epoch": 0.5795302908993861, + "grad_norm": 1.00799200737011, + "learning_rate": 7.925533533574652e-06, + "loss": 1.5348, + "step": 4343 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 0.9856175919162833, + "learning_rate": 7.921305839852454e-06, + "loss": 1.5661, + "step": 4344 + }, + { + "epoch": 0.5797971710701895, + "grad_norm": 1.0344494447784194, + "learning_rate": 7.91707853440666e-06, + "loss": 1.5692, + "step": 4345 + }, + { + "epoch": 0.5799306111555912, + "grad_norm": 0.9964855077609798, + "learning_rate": 7.912851618026878e-06, + "loss": 1.6038, + "step": 4346 + }, + { + "epoch": 0.5800640512409928, + "grad_norm": 1.0409910183629953, + "learning_rate": 7.90862509150265e-06, + "loss": 1.5509, + "step": 4347 + }, + { + "epoch": 0.5801974913263944, + "grad_norm": 0.9473445719334401, + "learning_rate": 7.904398955623443e-06, + "loss": 1.6044, + "step": 4348 + }, + { + "epoch": 0.5803309314117961, + "grad_norm": 1.0011241389147565, + "learning_rate": 7.900173211178655e-06, + "loss": 1.5819, + "step": 4349 + }, + { + "epoch": 0.5804643714971978, + "grad_norm": 0.9616239459082261, + "learning_rate": 7.895947858957603e-06, + "loss": 1.5791, + "step": 4350 + }, + { + "epoch": 0.5805978115825994, + "grad_norm": 0.9896389964933032, + "learning_rate": 7.891722899749531e-06, + "loss": 1.5909, + "step": 4351 + }, + { + "epoch": 0.5807312516680011, + "grad_norm": 1.1806253383689567, + "learning_rate": 7.887498334343625e-06, + "loss": 1.5656, + "step": 4352 + }, + { + "epoch": 0.5808646917534027, + "grad_norm": 0.9467806964207979, + "learning_rate": 7.883274163528974e-06, + "loss": 1.5911, + "step": 4353 + }, + { + "epoch": 0.5809981318388043, + "grad_norm": 0.9523674420210726, + "learning_rate": 7.879050388094606e-06, + "loss": 1.5564, + "step": 4354 + }, + { + "epoch": 0.5811315719242061, + "grad_norm": 0.9826286319469207, + "learning_rate": 7.874827008829485e-06, + "loss": 1.6093, + "step": 4355 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 0.9353553399457585, + "learning_rate": 7.87060402652248e-06, + "loss": 1.5576, + "step": 4356 + }, + { + "epoch": 0.5813984520950093, + "grad_norm": 0.9890860146426698, + "learning_rate": 7.866381441962398e-06, + "loss": 1.6012, + "step": 4357 + }, + { + "epoch": 0.581531892180411, + "grad_norm": 1.3238728327798368, + "learning_rate": 7.86215925593797e-06, + "loss": 
1.5197, + "step": 4358 + }, + { + "epoch": 0.5816653322658126, + "grad_norm": 0.9735487703890018, + "learning_rate": 7.857937469237853e-06, + "loss": 1.577, + "step": 4359 + }, + { + "epoch": 0.5817987723512144, + "grad_norm": 1.0582339375938654, + "learning_rate": 7.85371608265063e-06, + "loss": 1.5682, + "step": 4360 + }, + { + "epoch": 0.581932212436616, + "grad_norm": 1.1375161685312762, + "learning_rate": 7.849495096964803e-06, + "loss": 1.5383, + "step": 4361 + }, + { + "epoch": 0.5820656525220176, + "grad_norm": 1.026443022449838, + "learning_rate": 7.84527451296881e-06, + "loss": 1.5996, + "step": 4362 + }, + { + "epoch": 0.5821990926074193, + "grad_norm": 0.97828369672077, + "learning_rate": 7.841054331451008e-06, + "loss": 1.5338, + "step": 4363 + }, + { + "epoch": 0.5823325326928209, + "grad_norm": 1.0179518604332118, + "learning_rate": 7.836834553199675e-06, + "loss": 1.568, + "step": 4364 + }, + { + "epoch": 0.5824659727782225, + "grad_norm": 0.9804583013213366, + "learning_rate": 7.832615179003013e-06, + "loss": 1.5702, + "step": 4365 + }, + { + "epoch": 0.5825994128636243, + "grad_norm": 1.1561708940014976, + "learning_rate": 7.828396209649166e-06, + "loss": 1.5852, + "step": 4366 + }, + { + "epoch": 0.5827328529490259, + "grad_norm": 0.9703494752720414, + "learning_rate": 7.824177645926181e-06, + "loss": 1.6168, + "step": 4367 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 0.975607666820386, + "learning_rate": 7.819959488622034e-06, + "loss": 1.5722, + "step": 4368 + }, + { + "epoch": 0.5829997331198292, + "grad_norm": 1.330373136193926, + "learning_rate": 7.81574173852464e-06, + "loss": 1.5116, + "step": 4369 + }, + { + "epoch": 0.5831331732052308, + "grad_norm": 0.9750167029171607, + "learning_rate": 7.811524396421818e-06, + "loss": 1.5523, + "step": 4370 + }, + { + "epoch": 0.5832666132906325, + "grad_norm": 0.977713970035295, + "learning_rate": 7.807307463101323e-06, + "loss": 1.5085, + "step": 4371 + }, + { + "epoch": 0.5834000533760342, + "grad_norm": 0.9684749707247869, + "learning_rate": 7.803090939350825e-06, + "loss": 1.5986, + "step": 4372 + }, + { + "epoch": 0.5835334934614358, + "grad_norm": 0.9894691014054225, + "learning_rate": 7.798874825957932e-06, + "loss": 1.5637, + "step": 4373 + }, + { + "epoch": 0.5836669335468375, + "grad_norm": 0.9811750958682591, + "learning_rate": 7.79465912371016e-06, + "loss": 1.5513, + "step": 4374 + }, + { + "epoch": 0.5838003736322391, + "grad_norm": 0.9268597986006433, + "learning_rate": 7.790443833394951e-06, + "loss": 1.5789, + "step": 4375 + }, + { + "epoch": 0.5839338137176407, + "grad_norm": 0.9589691936010886, + "learning_rate": 7.786228955799682e-06, + "loss": 1.6262, + "step": 4376 + }, + { + "epoch": 0.5840672538030425, + "grad_norm": 1.0121372632908987, + "learning_rate": 7.782014491711638e-06, + "loss": 1.5929, + "step": 4377 + }, + { + "epoch": 0.5842006938884441, + "grad_norm": 0.9838055243486561, + "learning_rate": 7.777800441918036e-06, + "loss": 1.6404, + "step": 4378 + }, + { + "epoch": 0.5843341339738457, + "grad_norm": 0.9753282451165607, + "learning_rate": 7.773586807206007e-06, + "loss": 1.614, + "step": 4379 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 0.9364592889583468, + "learning_rate": 7.769373588362616e-06, + "loss": 1.6258, + "step": 4380 + }, + { + "epoch": 0.584601014144649, + "grad_norm": 0.9733374026966305, + "learning_rate": 7.765160786174838e-06, + "loss": 1.5896, + "step": 4381 + }, + { + "epoch": 0.5847344542300507, + "grad_norm": 0.9583014709288828, + "learning_rate": 
7.760948401429587e-06, + "loss": 1.569, + "step": 4382 + }, + { + "epoch": 0.5848678943154524, + "grad_norm": 1.1438908541726007, + "learning_rate": 7.756736434913678e-06, + "loss": 1.5556, + "step": 4383 + }, + { + "epoch": 0.585001334400854, + "grad_norm": 0.9585046094942674, + "learning_rate": 7.752524887413864e-06, + "loss": 1.5598, + "step": 4384 + }, + { + "epoch": 0.5851347744862557, + "grad_norm": 1.0484117367016692, + "learning_rate": 7.748313759716812e-06, + "loss": 1.6091, + "step": 4385 + }, + { + "epoch": 0.5852682145716573, + "grad_norm": 0.9196030055638242, + "learning_rate": 7.74410305260911e-06, + "loss": 1.5314, + "step": 4386 + }, + { + "epoch": 0.585401654657059, + "grad_norm": 0.9757449887475365, + "learning_rate": 7.739892766877278e-06, + "loss": 1.577, + "step": 4387 + }, + { + "epoch": 0.5855350947424607, + "grad_norm": 1.2029427959464503, + "learning_rate": 7.735682903307745e-06, + "loss": 1.6089, + "step": 4388 + }, + { + "epoch": 0.5856685348278623, + "grad_norm": 0.9931185472448057, + "learning_rate": 7.731473462686865e-06, + "loss": 1.566, + "step": 4389 + }, + { + "epoch": 0.5858019749132639, + "grad_norm": 0.9306099137325194, + "learning_rate": 7.727264445800909e-06, + "loss": 1.559, + "step": 4390 + }, + { + "epoch": 0.5859354149986656, + "grad_norm": 0.9714231921122422, + "learning_rate": 7.723055853436084e-06, + "loss": 1.6069, + "step": 4391 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 0.9857980626951369, + "learning_rate": 7.718847686378502e-06, + "loss": 1.5685, + "step": 4392 + }, + { + "epoch": 0.5862022951694689, + "grad_norm": 1.0373287455803648, + "learning_rate": 7.714639945414193e-06, + "loss": 1.6062, + "step": 4393 + }, + { + "epoch": 0.5863357352548706, + "grad_norm": 1.0138880235597887, + "learning_rate": 7.71043263132913e-06, + "loss": 1.5886, + "step": 4394 + }, + { + "epoch": 0.5864691753402722, + "grad_norm": 0.9451198980798459, + "learning_rate": 7.70622574490918e-06, + "loss": 1.6238, + "step": 4395 + }, + { + "epoch": 0.5866026154256738, + "grad_norm": 0.9760854352122434, + "learning_rate": 7.70201928694015e-06, + "loss": 1.5641, + "step": 4396 + }, + { + "epoch": 0.5867360555110755, + "grad_norm": 0.9771605508734201, + "learning_rate": 7.697813258207747e-06, + "loss": 1.5616, + "step": 4397 + }, + { + "epoch": 0.5868694955964772, + "grad_norm": 1.016442594503307, + "learning_rate": 7.693607659497621e-06, + "loss": 1.572, + "step": 4398 + }, + { + "epoch": 0.5870029356818789, + "grad_norm": 1.1109077879279698, + "learning_rate": 7.689402491595324e-06, + "loss": 1.5504, + "step": 4399 + }, + { + "epoch": 0.5871363757672805, + "grad_norm": 7.23681120527737, + "learning_rate": 7.685197755286332e-06, + "loss": 1.6183, + "step": 4400 + }, + { + "epoch": 0.5872698158526821, + "grad_norm": 1.0014151692392677, + "learning_rate": 7.680993451356047e-06, + "loss": 1.6214, + "step": 4401 + }, + { + "epoch": 0.5874032559380838, + "grad_norm": 1.0492817400153631, + "learning_rate": 7.676789580589781e-06, + "loss": 1.5989, + "step": 4402 + }, + { + "epoch": 0.5875366960234855, + "grad_norm": 0.9443703490938348, + "learning_rate": 7.672586143772772e-06, + "loss": 1.5646, + "step": 4403 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 0.9410293011189405, + "learning_rate": 7.66838314169017e-06, + "loss": 1.5367, + "step": 4404 + }, + { + "epoch": 0.5878035761942888, + "grad_norm": 0.9504920239170566, + "learning_rate": 7.664180575127054e-06, + "loss": 1.5748, + "step": 4405 + }, + { + "epoch": 0.5879370162796904, + "grad_norm": 
1.1257241646301057, + "learning_rate": 7.659978444868412e-06, + "loss": 1.5842, + "step": 4406 + }, + { + "epoch": 0.588070456365092, + "grad_norm": 1.0186274137996902, + "learning_rate": 7.65577675169915e-06, + "loss": 1.591, + "step": 4407 + }, + { + "epoch": 0.5882038964504938, + "grad_norm": 1.1327767010645338, + "learning_rate": 7.651575496404104e-06, + "loss": 1.654, + "step": 4408 + }, + { + "epoch": 0.5883373365358954, + "grad_norm": 0.9109735683849596, + "learning_rate": 7.64737467976802e-06, + "loss": 1.5817, + "step": 4409 + }, + { + "epoch": 0.588470776621297, + "grad_norm": 0.9553652260357995, + "learning_rate": 7.643174302575558e-06, + "loss": 1.5662, + "step": 4410 + }, + { + "epoch": 0.5886042167066987, + "grad_norm": 0.9727279713423016, + "learning_rate": 7.638974365611299e-06, + "loss": 1.6463, + "step": 4411 + }, + { + "epoch": 0.5887376567921003, + "grad_norm": 0.9442402335099999, + "learning_rate": 7.634774869659751e-06, + "loss": 1.5588, + "step": 4412 + }, + { + "epoch": 0.588871096877502, + "grad_norm": 0.9022310204474651, + "learning_rate": 7.630575815505328e-06, + "loss": 1.5363, + "step": 4413 + }, + { + "epoch": 0.5890045369629037, + "grad_norm": 0.9386827766287047, + "learning_rate": 7.626377203932368e-06, + "loss": 1.5746, + "step": 4414 + }, + { + "epoch": 0.5891379770483053, + "grad_norm": 1.086942823690136, + "learning_rate": 7.622179035725116e-06, + "loss": 1.6061, + "step": 4415 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 1.0205058353350165, + "learning_rate": 7.617981311667751e-06, + "loss": 1.5322, + "step": 4416 + }, + { + "epoch": 0.5894048572191086, + "grad_norm": 0.9823401514999283, + "learning_rate": 7.613784032544358e-06, + "loss": 1.5882, + "step": 4417 + }, + { + "epoch": 0.5895382973045102, + "grad_norm": 0.9433142149165429, + "learning_rate": 7.609587199138934e-06, + "loss": 1.5114, + "step": 4418 + }, + { + "epoch": 0.589671737389912, + "grad_norm": 0.9904727920933845, + "learning_rate": 7.605390812235412e-06, + "loss": 1.595, + "step": 4419 + }, + { + "epoch": 0.5898051774753136, + "grad_norm": 0.9453239515038033, + "learning_rate": 7.60119487261762e-06, + "loss": 1.5716, + "step": 4420 + }, + { + "epoch": 0.5899386175607152, + "grad_norm": 0.9710304464960416, + "learning_rate": 7.596999381069316e-06, + "loss": 1.6152, + "step": 4421 + }, + { + "epoch": 0.5900720576461169, + "grad_norm": 0.9149997314948785, + "learning_rate": 7.592804338374166e-06, + "loss": 1.5745, + "step": 4422 + }, + { + "epoch": 0.5902054977315185, + "grad_norm": 0.9611341617563518, + "learning_rate": 7.588609745315758e-06, + "loss": 1.5301, + "step": 4423 + }, + { + "epoch": 0.5903389378169202, + "grad_norm": 1.0320270411702828, + "learning_rate": 7.584415602677597e-06, + "loss": 1.6336, + "step": 4424 + }, + { + "epoch": 0.5904723779023219, + "grad_norm": 0.9578784490487312, + "learning_rate": 7.580221911243098e-06, + "loss": 1.612, + "step": 4425 + }, + { + "epoch": 0.5906058179877235, + "grad_norm": 1.0925180577920106, + "learning_rate": 7.576028671795596e-06, + "loss": 1.6268, + "step": 4426 + }, + { + "epoch": 0.5907392580731252, + "grad_norm": 1.0026716186226203, + "learning_rate": 7.571835885118341e-06, + "loss": 1.545, + "step": 4427 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 1.1376864232127364, + "learning_rate": 7.567643551994498e-06, + "loss": 1.579, + "step": 4428 + }, + { + "epoch": 0.5910061382439284, + "grad_norm": 0.99803761360832, + "learning_rate": 7.56345167320714e-06, + "loss": 1.5658, + "step": 4429 + }, + { + "epoch": 
0.5911395783293302, + "grad_norm": 0.9610248802222642, + "learning_rate": 7.5592602495392744e-06, + "loss": 1.6057, + "step": 4430 + }, + { + "epoch": 0.5912730184147318, + "grad_norm": 0.9418422477809483, + "learning_rate": 7.555069281773805e-06, + "loss": 1.5575, + "step": 4431 + }, + { + "epoch": 0.5914064585001334, + "grad_norm": 0.9421321981921353, + "learning_rate": 7.550878770693551e-06, + "loss": 1.5754, + "step": 4432 + }, + { + "epoch": 0.5915398985855351, + "grad_norm": 1.035596217537491, + "learning_rate": 7.546688717081265e-06, + "loss": 1.5615, + "step": 4433 + }, + { + "epoch": 0.5916733386709367, + "grad_norm": 0.9571740732982612, + "learning_rate": 7.542499121719593e-06, + "loss": 1.5694, + "step": 4434 + }, + { + "epoch": 0.5918067787563384, + "grad_norm": 0.9590760123078373, + "learning_rate": 7.538309985391107e-06, + "loss": 1.5727, + "step": 4435 + }, + { + "epoch": 0.5919402188417401, + "grad_norm": 0.9626015322928804, + "learning_rate": 7.534121308878283e-06, + "loss": 1.5456, + "step": 4436 + }, + { + "epoch": 0.5920736589271417, + "grad_norm": 0.9448338727741227, + "learning_rate": 7.529933092963527e-06, + "loss": 1.5902, + "step": 4437 + }, + { + "epoch": 0.5922070990125433, + "grad_norm": 1.0098216915844183, + "learning_rate": 7.525745338429148e-06, + "loss": 1.5554, + "step": 4438 + }, + { + "epoch": 0.592340539097945, + "grad_norm": 0.9387499498237363, + "learning_rate": 7.521558046057364e-06, + "loss": 1.5319, + "step": 4439 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 0.9367552163787548, + "learning_rate": 7.517371216630324e-06, + "loss": 1.5604, + "step": 4440 + }, + { + "epoch": 0.5926074192687484, + "grad_norm": 15.945356768463004, + "learning_rate": 7.513184850930075e-06, + "loss": 1.5926, + "step": 4441 + }, + { + "epoch": 0.59274085935415, + "grad_norm": 1.0201759619596091, + "learning_rate": 7.5089989497385805e-06, + "loss": 1.5564, + "step": 4442 + }, + { + "epoch": 0.5928742994395516, + "grad_norm": 1.0381537793869493, + "learning_rate": 7.504813513837721e-06, + "loss": 1.5926, + "step": 4443 + }, + { + "epoch": 0.5930077395249533, + "grad_norm": 1.066841128052637, + "learning_rate": 7.5006285440092894e-06, + "loss": 1.5384, + "step": 4444 + }, + { + "epoch": 0.593141179610355, + "grad_norm": 1.060786355395775, + "learning_rate": 7.496444041034987e-06, + "loss": 1.5944, + "step": 4445 + }, + { + "epoch": 0.5932746196957566, + "grad_norm": 0.9860687095888007, + "learning_rate": 7.492260005696435e-06, + "loss": 1.5632, + "step": 4446 + }, + { + "epoch": 0.5934080597811583, + "grad_norm": 0.9858498336031716, + "learning_rate": 7.488076438775164e-06, + "loss": 1.5638, + "step": 4447 + }, + { + "epoch": 0.5935414998665599, + "grad_norm": 0.9833114803634568, + "learning_rate": 7.483893341052613e-06, + "loss": 1.5688, + "step": 4448 + }, + { + "epoch": 0.5936749399519615, + "grad_norm": 0.9443909139265638, + "learning_rate": 7.479710713310142e-06, + "loss": 1.6111, + "step": 4449 + }, + { + "epoch": 0.5938083800373632, + "grad_norm": 1.754532656615733, + "learning_rate": 7.4755285563290084e-06, + "loss": 1.5721, + "step": 4450 + }, + { + "epoch": 0.5939418201227649, + "grad_norm": 0.9696978925421736, + "learning_rate": 7.4713468708904035e-06, + "loss": 1.583, + "step": 4451 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 0.9849603466349487, + "learning_rate": 7.4671656577754134e-06, + "loss": 1.5552, + "step": 4452 + }, + { + "epoch": 0.5942087002935682, + "grad_norm": 0.9804321553897487, + "learning_rate": 7.462984917765042e-06, + "loss": 
1.5638, + "step": 4453 + }, + { + "epoch": 0.5943421403789698, + "grad_norm": 0.987700482819893, + "learning_rate": 7.4588046516401965e-06, + "loss": 1.605, + "step": 4454 + }, + { + "epoch": 0.5944755804643715, + "grad_norm": 0.9960716798209621, + "learning_rate": 7.454624860181716e-06, + "loss": 1.5791, + "step": 4455 + }, + { + "epoch": 0.5946090205497732, + "grad_norm": 1.0747136053703605, + "learning_rate": 7.450445544170331e-06, + "loss": 1.5723, + "step": 4456 + }, + { + "epoch": 0.5947424606351748, + "grad_norm": 0.9519414871231925, + "learning_rate": 7.446266704386685e-06, + "loss": 1.5776, + "step": 4457 + }, + { + "epoch": 0.5948759007205765, + "grad_norm": 1.0093105843986834, + "learning_rate": 7.442088341611349e-06, + "loss": 1.5719, + "step": 4458 + }, + { + "epoch": 0.5950093408059781, + "grad_norm": 0.9633876822009539, + "learning_rate": 7.437910456624786e-06, + "loss": 1.5895, + "step": 4459 + }, + { + "epoch": 0.5951427808913797, + "grad_norm": 1.1771212337662196, + "learning_rate": 7.4337330502073815e-06, + "loss": 1.5678, + "step": 4460 + }, + { + "epoch": 0.5952762209767815, + "grad_norm": 1.027523600819857, + "learning_rate": 7.429556123139418e-06, + "loss": 1.5628, + "step": 4461 + }, + { + "epoch": 0.5954096610621831, + "grad_norm": 1.0233162890520189, + "learning_rate": 7.425379676201112e-06, + "loss": 1.6438, + "step": 4462 + }, + { + "epoch": 0.5955431011475847, + "grad_norm": 0.982674022132527, + "learning_rate": 7.421203710172569e-06, + "loss": 1.5377, + "step": 4463 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 0.928927279763224, + "learning_rate": 7.417028225833809e-06, + "loss": 1.5629, + "step": 4464 + }, + { + "epoch": 0.595809981318388, + "grad_norm": 1.003275613884901, + "learning_rate": 7.412853223964771e-06, + "loss": 1.5441, + "step": 4465 + }, + { + "epoch": 0.5959434214037898, + "grad_norm": 1.0353673874246174, + "learning_rate": 7.408678705345292e-06, + "loss": 1.6081, + "step": 4466 + }, + { + "epoch": 0.5960768614891914, + "grad_norm": 0.9778720926332395, + "learning_rate": 7.40450467075513e-06, + "loss": 1.5633, + "step": 4467 + }, + { + "epoch": 0.596210301574593, + "grad_norm": 1.0932199693315858, + "learning_rate": 7.400331120973943e-06, + "loss": 1.5514, + "step": 4468 + }, + { + "epoch": 0.5963437416599947, + "grad_norm": 0.9995117901765698, + "learning_rate": 7.3961580567813065e-06, + "loss": 1.6022, + "step": 4469 + }, + { + "epoch": 0.5964771817453963, + "grad_norm": 1.04545722147908, + "learning_rate": 7.391985478956699e-06, + "loss": 1.5811, + "step": 4470 + }, + { + "epoch": 0.5966106218307979, + "grad_norm": 0.9491415931893125, + "learning_rate": 7.387813388279507e-06, + "loss": 1.5659, + "step": 4471 + }, + { + "epoch": 0.5967440619161997, + "grad_norm": 0.9962391326790979, + "learning_rate": 7.383641785529037e-06, + "loss": 1.5627, + "step": 4472 + }, + { + "epoch": 0.5968775020016013, + "grad_norm": 0.9766553612308028, + "learning_rate": 7.379470671484492e-06, + "loss": 1.5631, + "step": 4473 + }, + { + "epoch": 0.5970109420870029, + "grad_norm": 0.9060488686225177, + "learning_rate": 7.375300046924991e-06, + "loss": 1.5714, + "step": 4474 + }, + { + "epoch": 0.5971443821724046, + "grad_norm": 0.9993988316021899, + "learning_rate": 7.3711299126295535e-06, + "loss": 1.6233, + "step": 4475 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 0.9685553543361992, + "learning_rate": 7.366960269377122e-06, + "loss": 1.5233, + "step": 4476 + }, + { + "epoch": 0.5974112623432078, + "grad_norm": 0.9992661533210414, + 
"learning_rate": 7.362791117946533e-06, + "loss": 1.5369, + "step": 4477 + }, + { + "epoch": 0.5975447024286096, + "grad_norm": 0.9445669078870059, + "learning_rate": 7.3586224591165335e-06, + "loss": 1.5614, + "step": 4478 + }, + { + "epoch": 0.5976781425140112, + "grad_norm": 1.1684515472112844, + "learning_rate": 7.354454293665789e-06, + "loss": 1.5789, + "step": 4479 + }, + { + "epoch": 0.5978115825994129, + "grad_norm": 0.9866975359267203, + "learning_rate": 7.350286622372863e-06, + "loss": 1.5559, + "step": 4480 + }, + { + "epoch": 0.5979450226848145, + "grad_norm": 0.948924170206914, + "learning_rate": 7.346119446016228e-06, + "loss": 1.6123, + "step": 4481 + }, + { + "epoch": 0.5980784627702161, + "grad_norm": 6.114143464113286, + "learning_rate": 7.34195276537426e-06, + "loss": 1.5768, + "step": 4482 + }, + { + "epoch": 0.5982119028556179, + "grad_norm": 1.0955825105939028, + "learning_rate": 7.337786581225257e-06, + "loss": 1.5832, + "step": 4483 + }, + { + "epoch": 0.5983453429410195, + "grad_norm": 1.1743034044346985, + "learning_rate": 7.333620894347408e-06, + "loss": 1.6208, + "step": 4484 + }, + { + "epoch": 0.5984787830264211, + "grad_norm": 0.9813455603327857, + "learning_rate": 7.329455705518822e-06, + "loss": 1.5635, + "step": 4485 + }, + { + "epoch": 0.5986122231118228, + "grad_norm": 0.97080471220344, + "learning_rate": 7.325291015517499e-06, + "loss": 1.5297, + "step": 4486 + }, + { + "epoch": 0.5987456631972244, + "grad_norm": 1.0737172814869553, + "learning_rate": 7.321126825121367e-06, + "loss": 1.5659, + "step": 4487 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 0.9965236541353721, + "learning_rate": 7.316963135108239e-06, + "loss": 1.5796, + "step": 4488 + }, + { + "epoch": 0.5990125433680278, + "grad_norm": 0.9817919653306049, + "learning_rate": 7.3127999462558515e-06, + "loss": 1.6373, + "step": 4489 + }, + { + "epoch": 0.5991459834534294, + "grad_norm": 1.050110329669737, + "learning_rate": 7.308637259341842e-06, + "loss": 1.5925, + "step": 4490 + }, + { + "epoch": 0.599279423538831, + "grad_norm": 0.967042532570296, + "learning_rate": 7.304475075143749e-06, + "loss": 1.6259, + "step": 4491 + }, + { + "epoch": 0.5994128636242327, + "grad_norm": 4.112038683707192, + "learning_rate": 7.3003133944390226e-06, + "loss": 1.5667, + "step": 4492 + }, + { + "epoch": 0.5995463037096344, + "grad_norm": 1.0543783444772516, + "learning_rate": 7.296152218005012e-06, + "loss": 1.547, + "step": 4493 + }, + { + "epoch": 0.5996797437950361, + "grad_norm": 1.0059093670666783, + "learning_rate": 7.291991546618987e-06, + "loss": 1.5244, + "step": 4494 + }, + { + "epoch": 0.5998131838804377, + "grad_norm": 0.9544888819843551, + "learning_rate": 7.28783138105811e-06, + "loss": 1.5661, + "step": 4495 + }, + { + "epoch": 0.5999466239658393, + "grad_norm": 0.99919602721375, + "learning_rate": 7.283671722099447e-06, + "loss": 1.5961, + "step": 4496 + }, + { + "epoch": 0.600080064051241, + "grad_norm": 0.9615117915366642, + "learning_rate": 7.279512570519984e-06, + "loss": 1.5151, + "step": 4497 + }, + { + "epoch": 0.6002135041366427, + "grad_norm": 0.9810089776515174, + "learning_rate": 7.275353927096599e-06, + "loss": 1.5632, + "step": 4498 + }, + { + "epoch": 0.6003469442220443, + "grad_norm": 0.930480896476526, + "learning_rate": 7.271195792606079e-06, + "loss": 1.5504, + "step": 4499 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 1.1160803987845584, + "learning_rate": 7.2670381678251135e-06, + "loss": 1.5948, + "step": 4500 + }, + { + "epoch": 0.6006138243928476, + 
"grad_norm": 0.9662586762512358, + "learning_rate": 7.262881053530304e-06, + "loss": 1.5188, + "step": 4501 + }, + { + "epoch": 0.6007472644782492, + "grad_norm": 0.9546482055535903, + "learning_rate": 7.258724450498153e-06, + "loss": 1.5879, + "step": 4502 + }, + { + "epoch": 0.600880704563651, + "grad_norm": 0.9262806341117151, + "learning_rate": 7.254568359505059e-06, + "loss": 1.5666, + "step": 4503 + }, + { + "epoch": 0.6010141446490526, + "grad_norm": 0.9344199271306789, + "learning_rate": 7.250412781327341e-06, + "loss": 1.584, + "step": 4504 + }, + { + "epoch": 0.6011475847344542, + "grad_norm": 1.006266539272583, + "learning_rate": 7.24625771674121e-06, + "loss": 1.6019, + "step": 4505 + }, + { + "epoch": 0.6012810248198559, + "grad_norm": 0.9707157824495067, + "learning_rate": 7.242103166522786e-06, + "loss": 1.5485, + "step": 4506 + }, + { + "epoch": 0.6014144649052575, + "grad_norm": 0.9330255414867166, + "learning_rate": 7.2379491314480874e-06, + "loss": 1.6067, + "step": 4507 + }, + { + "epoch": 0.6015479049906592, + "grad_norm": 1.1082901632889555, + "learning_rate": 7.233795612293048e-06, + "loss": 1.6019, + "step": 4508 + }, + { + "epoch": 0.6016813450760609, + "grad_norm": 0.9629851142489154, + "learning_rate": 7.229642609833489e-06, + "loss": 1.6062, + "step": 4509 + }, + { + "epoch": 0.6018147851614625, + "grad_norm": 0.9416715246155154, + "learning_rate": 7.2254901248451515e-06, + "loss": 1.5347, + "step": 4510 + }, + { + "epoch": 0.6019482252468642, + "grad_norm": 1.0187472160513316, + "learning_rate": 7.22133815810367e-06, + "loss": 1.5216, + "step": 4511 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 1.2896554547647145, + "learning_rate": 7.217186710384585e-06, + "loss": 1.588, + "step": 4512 + }, + { + "epoch": 0.6022151054176674, + "grad_norm": 0.9584433847605133, + "learning_rate": 7.213035782463339e-06, + "loss": 1.5431, + "step": 4513 + }, + { + "epoch": 0.6023485455030692, + "grad_norm": 0.948451999723668, + "learning_rate": 7.208885375115273e-06, + "loss": 1.5385, + "step": 4514 + }, + { + "epoch": 0.6024819855884708, + "grad_norm": 0.9144383102664776, + "learning_rate": 7.204735489115646e-06, + "loss": 1.6077, + "step": 4515 + }, + { + "epoch": 0.6026154256738724, + "grad_norm": 1.0021475280100742, + "learning_rate": 7.200586125239605e-06, + "loss": 1.5783, + "step": 4516 + }, + { + "epoch": 0.6027488657592741, + "grad_norm": 1.0074542834053597, + "learning_rate": 7.196437284262202e-06, + "loss": 1.6181, + "step": 4517 + }, + { + "epoch": 0.6028823058446757, + "grad_norm": 0.9555123773303952, + "learning_rate": 7.19228896695839e-06, + "loss": 1.5896, + "step": 4518 + }, + { + "epoch": 0.6030157459300773, + "grad_norm": 0.9474460595871118, + "learning_rate": 7.1881411741030385e-06, + "loss": 1.525, + "step": 4519 + }, + { + "epoch": 0.6031491860154791, + "grad_norm": 0.9503729257356935, + "learning_rate": 7.1839939064708985e-06, + "loss": 1.6017, + "step": 4520 + }, + { + "epoch": 0.6032826261008807, + "grad_norm": 0.9907523282551414, + "learning_rate": 7.179847164836633e-06, + "loss": 1.585, + "step": 4521 + }, + { + "epoch": 0.6034160661862824, + "grad_norm": 0.9768177889092633, + "learning_rate": 7.1757009499748135e-06, + "loss": 1.5499, + "step": 4522 + }, + { + "epoch": 0.603549506271684, + "grad_norm": 0.9674311604833397, + "learning_rate": 7.171555262659899e-06, + "loss": 1.5691, + "step": 4523 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 1.045115554264623, + "learning_rate": 7.167410103666258e-06, + "loss": 1.5396, + "step": 4524 + 
}, + { + "epoch": 0.6038163864424874, + "grad_norm": 1.008009383355456, + "learning_rate": 7.1632654737681565e-06, + "loss": 1.5004, + "step": 4525 + }, + { + "epoch": 0.603949826527889, + "grad_norm": 0.9418379068039596, + "learning_rate": 7.15912137373977e-06, + "loss": 1.5583, + "step": 4526 + }, + { + "epoch": 0.6040832666132906, + "grad_norm": 0.9601231973109439, + "learning_rate": 7.154977804355167e-06, + "loss": 1.5904, + "step": 4527 + }, + { + "epoch": 0.6042167066986923, + "grad_norm": 1.091493380346413, + "learning_rate": 7.1508347663883175e-06, + "loss": 1.5876, + "step": 4528 + }, + { + "epoch": 0.6043501467840939, + "grad_norm": 0.9773122566221465, + "learning_rate": 7.146692260613095e-06, + "loss": 1.5567, + "step": 4529 + }, + { + "epoch": 0.6044835868694955, + "grad_norm": 0.9744498878123181, + "learning_rate": 7.142550287803271e-06, + "loss": 1.6091, + "step": 4530 + }, + { + "epoch": 0.6046170269548973, + "grad_norm": 0.9623687799737277, + "learning_rate": 7.138408848732521e-06, + "loss": 1.5327, + "step": 4531 + }, + { + "epoch": 0.6047504670402989, + "grad_norm": 1.0233729179790898, + "learning_rate": 7.134267944174415e-06, + "loss": 1.5748, + "step": 4532 + }, + { + "epoch": 0.6048839071257006, + "grad_norm": 1.0743557870760962, + "learning_rate": 7.130127574902433e-06, + "loss": 1.5972, + "step": 4533 + }, + { + "epoch": 0.6050173472111022, + "grad_norm": 0.9535415958368197, + "learning_rate": 7.125987741689946e-06, + "loss": 1.6278, + "step": 4534 + }, + { + "epoch": 0.6051507872965038, + "grad_norm": 0.9506929714552727, + "learning_rate": 7.121848445310221e-06, + "loss": 1.5897, + "step": 4535 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 1.0206134352483975, + "learning_rate": 7.117709686536442e-06, + "loss": 1.5732, + "step": 4536 + }, + { + "epoch": 0.6054176674673072, + "grad_norm": 0.9419673451960351, + "learning_rate": 7.113571466141678e-06, + "loss": 1.5487, + "step": 4537 + }, + { + "epoch": 0.6055511075527088, + "grad_norm": 0.9968498706511424, + "learning_rate": 7.1094337848989e-06, + "loss": 1.6093, + "step": 4538 + }, + { + "epoch": 0.6056845476381105, + "grad_norm": 0.9476217156397696, + "learning_rate": 7.105296643580979e-06, + "loss": 1.564, + "step": 4539 + }, + { + "epoch": 0.6058179877235121, + "grad_norm": 0.936868206502605, + "learning_rate": 7.101160042960688e-06, + "loss": 1.5533, + "step": 4540 + }, + { + "epoch": 0.6059514278089138, + "grad_norm": 1.0845350953229937, + "learning_rate": 7.097023983810699e-06, + "loss": 1.553, + "step": 4541 + }, + { + "epoch": 0.6060848678943155, + "grad_norm": 1.6937000276166494, + "learning_rate": 7.092888466903574e-06, + "loss": 1.602, + "step": 4542 + }, + { + "epoch": 0.6062183079797171, + "grad_norm": 0.9553209112139722, + "learning_rate": 7.0887534930117885e-06, + "loss": 1.6083, + "step": 4543 + }, + { + "epoch": 0.6063517480651187, + "grad_norm": 0.9702968584345638, + "learning_rate": 7.084619062907704e-06, + "loss": 1.6188, + "step": 4544 + }, + { + "epoch": 0.6064851881505204, + "grad_norm": 0.9483887892908953, + "learning_rate": 7.080485177363585e-06, + "loss": 1.5665, + "step": 4545 + }, + { + "epoch": 0.606618628235922, + "grad_norm": 0.9832425329814938, + "learning_rate": 7.076351837151593e-06, + "loss": 1.582, + "step": 4546 + }, + { + "epoch": 0.6067520683213238, + "grad_norm": 0.9685038627751058, + "learning_rate": 7.072219043043793e-06, + "loss": 1.6042, + "step": 4547 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 0.93835156044391, + "learning_rate": 7.0680867958121434e-06, 
+ "loss": 1.5932, + "step": 4548 + }, + { + "epoch": 0.607018948492127, + "grad_norm": 0.9471054104804755, + "learning_rate": 7.063955096228498e-06, + "loss": 1.6032, + "step": 4549 + }, + { + "epoch": 0.6071523885775287, + "grad_norm": 0.9886055085335903, + "learning_rate": 7.059823945064611e-06, + "loss": 1.5914, + "step": 4550 + }, + { + "epoch": 0.6072858286629303, + "grad_norm": 1.1513593773945008, + "learning_rate": 7.055693343092138e-06, + "loss": 1.5919, + "step": 4551 + }, + { + "epoch": 0.607419268748332, + "grad_norm": 0.9399419338670711, + "learning_rate": 7.051563291082624e-06, + "loss": 1.6072, + "step": 4552 + }, + { + "epoch": 0.6075527088337337, + "grad_norm": 0.968419038546263, + "learning_rate": 7.047433789807518e-06, + "loss": 1.5867, + "step": 4553 + }, + { + "epoch": 0.6076861489191353, + "grad_norm": 1.159641461420781, + "learning_rate": 7.043304840038166e-06, + "loss": 1.6149, + "step": 4554 + }, + { + "epoch": 0.6078195890045369, + "grad_norm": 0.9281119870608585, + "learning_rate": 7.039176442545808e-06, + "loss": 1.5342, + "step": 4555 + }, + { + "epoch": 0.6079530290899386, + "grad_norm": 0.9235387184302374, + "learning_rate": 7.035048598101578e-06, + "loss": 1.5764, + "step": 4556 + }, + { + "epoch": 0.6080864691753403, + "grad_norm": 0.9721122200439459, + "learning_rate": 7.03092130747651e-06, + "loss": 1.549, + "step": 4557 + }, + { + "epoch": 0.6082199092607419, + "grad_norm": 0.9229902607898596, + "learning_rate": 7.02679457144154e-06, + "loss": 1.5277, + "step": 4558 + }, + { + "epoch": 0.6083533493461436, + "grad_norm": 1.0561032370889225, + "learning_rate": 7.022668390767495e-06, + "loss": 1.5849, + "step": 4559 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 0.910655450515761, + "learning_rate": 7.018542766225091e-06, + "loss": 1.5658, + "step": 4560 + }, + { + "epoch": 0.6086202295169469, + "grad_norm": 0.9741339058858085, + "learning_rate": 7.0144176985849565e-06, + "loss": 1.5601, + "step": 4561 + }, + { + "epoch": 0.6087536696023486, + "grad_norm": 0.9130348746535919, + "learning_rate": 7.0102931886176055e-06, + "loss": 1.5731, + "step": 4562 + }, + { + "epoch": 0.6088871096877502, + "grad_norm": 0.9215074638471374, + "learning_rate": 7.006169237093447e-06, + "loss": 1.5319, + "step": 4563 + }, + { + "epoch": 0.6090205497731519, + "grad_norm": 0.920657107314274, + "learning_rate": 7.002045844782785e-06, + "loss": 1.5355, + "step": 4564 + }, + { + "epoch": 0.6091539898585535, + "grad_norm": 0.9316084219094781, + "learning_rate": 6.9979230124558295e-06, + "loss": 1.5888, + "step": 4565 + }, + { + "epoch": 0.6092874299439551, + "grad_norm": 1.0136087188967267, + "learning_rate": 6.9938007408826765e-06, + "loss": 1.6226, + "step": 4566 + }, + { + "epoch": 0.6094208700293569, + "grad_norm": 0.9419684119712161, + "learning_rate": 6.989679030833314e-06, + "loss": 1.5506, + "step": 4567 + }, + { + "epoch": 0.6095543101147585, + "grad_norm": 0.9475350185524345, + "learning_rate": 6.9855578830776385e-06, + "loss": 1.5925, + "step": 4568 + }, + { + "epoch": 0.6096877502001601, + "grad_norm": 1.069989064607908, + "learning_rate": 6.98143729838543e-06, + "loss": 1.6082, + "step": 4569 + }, + { + "epoch": 0.6098211902855618, + "grad_norm": 0.9321140114017321, + "learning_rate": 6.977317277526366e-06, + "loss": 1.5179, + "step": 4570 + }, + { + "epoch": 0.6099546303709634, + "grad_norm": 1.0848876865520107, + "learning_rate": 6.973197821270018e-06, + "loss": 1.6046, + "step": 4571 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 0.9683930596028457, + 
"learning_rate": 6.969078930385858e-06, + "loss": 1.5473, + "step": 4572 + }, + { + "epoch": 0.6102215105417668, + "grad_norm": 0.9338234054464237, + "learning_rate": 6.964960605643243e-06, + "loss": 1.5351, + "step": 4573 + }, + { + "epoch": 0.6103549506271684, + "grad_norm": 0.9528182144070801, + "learning_rate": 6.960842847811432e-06, + "loss": 1.5674, + "step": 4574 + }, + { + "epoch": 0.6104883907125701, + "grad_norm": 0.950808534998642, + "learning_rate": 6.956725657659578e-06, + "loss": 1.5983, + "step": 4575 + }, + { + "epoch": 0.6106218307979717, + "grad_norm": 1.0116221503224754, + "learning_rate": 6.95260903595672e-06, + "loss": 1.5796, + "step": 4576 + }, + { + "epoch": 0.6107552708833733, + "grad_norm": 0.9762343267133087, + "learning_rate": 6.948492983471799e-06, + "loss": 1.6156, + "step": 4577 + }, + { + "epoch": 0.6108887109687751, + "grad_norm": 0.9069025915216041, + "learning_rate": 6.944377500973642e-06, + "loss": 1.5628, + "step": 4578 + }, + { + "epoch": 0.6110221510541767, + "grad_norm": 0.9272734873764399, + "learning_rate": 6.9402625892309825e-06, + "loss": 1.5324, + "step": 4579 + }, + { + "epoch": 0.6111555911395783, + "grad_norm": 0.9668585275369073, + "learning_rate": 6.936148249012436e-06, + "loss": 1.5459, + "step": 4580 + }, + { + "epoch": 0.61128903122498, + "grad_norm": 0.9728627382480686, + "learning_rate": 6.932034481086512e-06, + "loss": 1.647, + "step": 4581 + }, + { + "epoch": 0.6114224713103816, + "grad_norm": 0.950645414784866, + "learning_rate": 6.927921286221613e-06, + "loss": 1.545, + "step": 4582 + }, + { + "epoch": 0.6115559113957832, + "grad_norm": 1.0447314600826156, + "learning_rate": 6.923808665186045e-06, + "loss": 1.6108, + "step": 4583 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 1.0871835933095921, + "learning_rate": 6.919696618747994e-06, + "loss": 1.5817, + "step": 4584 + }, + { + "epoch": 0.6118227915665866, + "grad_norm": 1.041327828019737, + "learning_rate": 6.915585147675541e-06, + "loss": 1.5483, + "step": 4585 + }, + { + "epoch": 0.6119562316519882, + "grad_norm": 0.9762414215476425, + "learning_rate": 6.911474252736667e-06, + "loss": 1.507, + "step": 4586 + }, + { + "epoch": 0.6120896717373899, + "grad_norm": 1.030786578810606, + "learning_rate": 6.907363934699241e-06, + "loss": 1.6357, + "step": 4587 + }, + { + "epoch": 0.6122231118227915, + "grad_norm": 0.9814383825607843, + "learning_rate": 6.90325419433102e-06, + "loss": 1.5921, + "step": 4588 + }, + { + "epoch": 0.6123565519081933, + "grad_norm": 0.9363299777601847, + "learning_rate": 6.8991450323996535e-06, + "loss": 1.5827, + "step": 4589 + }, + { + "epoch": 0.6124899919935949, + "grad_norm": 0.9538862237104196, + "learning_rate": 6.895036449672694e-06, + "loss": 1.5372, + "step": 4590 + }, + { + "epoch": 0.6126234320789965, + "grad_norm": 0.97494009355015, + "learning_rate": 6.890928446917575e-06, + "loss": 1.561, + "step": 4591 + }, + { + "epoch": 0.6127568721643982, + "grad_norm": 0.9647040856357274, + "learning_rate": 6.886821024901622e-06, + "loss": 1.5882, + "step": 4592 + }, + { + "epoch": 0.6128903122497998, + "grad_norm": 1.091307209481251, + "learning_rate": 6.8827141843920585e-06, + "loss": 1.6223, + "step": 4593 + }, + { + "epoch": 0.6130237523352015, + "grad_norm": 0.9582456852774733, + "learning_rate": 6.878607926155992e-06, + "loss": 1.5471, + "step": 4594 + }, + { + "epoch": 0.6131571924206032, + "grad_norm": 0.9618755168259653, + "learning_rate": 6.874502250960429e-06, + "loss": 1.5851, + "step": 4595 + }, + { + "epoch": 0.6132906325060048, + 
"grad_norm": 0.9960213913651693, + "learning_rate": 6.870397159572257e-06, + "loss": 1.5711, + "step": 4596 + }, + { + "epoch": 0.6134240725914064, + "grad_norm": 0.9675989710680151, + "learning_rate": 6.866292652758266e-06, + "loss": 1.5615, + "step": 4597 + }, + { + "epoch": 0.6135575126768081, + "grad_norm": 1.1290180972157486, + "learning_rate": 6.862188731285131e-06, + "loss": 1.5547, + "step": 4598 + }, + { + "epoch": 0.6136909527622098, + "grad_norm": 1.0884500014392584, + "learning_rate": 6.8580853959194095e-06, + "loss": 1.5555, + "step": 4599 + }, + { + "epoch": 0.6138243928476115, + "grad_norm": 0.9752550116948083, + "learning_rate": 6.853982647427568e-06, + "loss": 1.5554, + "step": 4600 + }, + { + "epoch": 0.6139578329330131, + "grad_norm": 0.9556126452390394, + "learning_rate": 6.84988048657595e-06, + "loss": 1.6092, + "step": 4601 + }, + { + "epoch": 0.6140912730184147, + "grad_norm": 0.9563824103011194, + "learning_rate": 6.845778914130792e-06, + "loss": 1.5739, + "step": 4602 + }, + { + "epoch": 0.6142247131038164, + "grad_norm": 0.9841983764198419, + "learning_rate": 6.841677930858215e-06, + "loss": 1.5626, + "step": 4603 + }, + { + "epoch": 0.614358153189218, + "grad_norm": 0.945890058393798, + "learning_rate": 6.837577537524247e-06, + "loss": 1.5651, + "step": 4604 + }, + { + "epoch": 0.6144915932746197, + "grad_norm": 0.9637365996725469, + "learning_rate": 6.833477734894789e-06, + "loss": 1.5531, + "step": 4605 + }, + { + "epoch": 0.6146250333600214, + "grad_norm": 14.074871733099643, + "learning_rate": 6.829378523735635e-06, + "loss": 1.4761, + "step": 4606 + }, + { + "epoch": 0.614758473445423, + "grad_norm": 1.0353012877423016, + "learning_rate": 6.825279904812476e-06, + "loss": 1.5987, + "step": 4607 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 1.069043606434025, + "learning_rate": 6.821181878890886e-06, + "loss": 1.6326, + "step": 4608 + }, + { + "epoch": 0.6150253536162263, + "grad_norm": 0.9544980613861165, + "learning_rate": 6.817084446736329e-06, + "loss": 1.5769, + "step": 4609 + }, + { + "epoch": 0.615158793701628, + "grad_norm": 1.0198488235290817, + "learning_rate": 6.812987609114155e-06, + "loss": 1.5888, + "step": 4610 + }, + { + "epoch": 0.6152922337870296, + "grad_norm": 1.0114306343190909, + "learning_rate": 6.808891366789614e-06, + "loss": 1.5866, + "step": 4611 + }, + { + "epoch": 0.6154256738724313, + "grad_norm": 0.9683323450632398, + "learning_rate": 6.804795720527832e-06, + "loss": 1.6103, + "step": 4612 + }, + { + "epoch": 0.6155591139578329, + "grad_norm": 0.9651109935189197, + "learning_rate": 6.800700671093831e-06, + "loss": 1.5516, + "step": 4613 + }, + { + "epoch": 0.6156925540432346, + "grad_norm": 1.02345582166681, + "learning_rate": 6.796606219252519e-06, + "loss": 1.5824, + "step": 4614 + }, + { + "epoch": 0.6158259941286363, + "grad_norm": 0.9451195015480363, + "learning_rate": 6.7925123657686956e-06, + "loss": 1.5651, + "step": 4615 + }, + { + "epoch": 0.6159594342140379, + "grad_norm": 0.9485310120839627, + "learning_rate": 6.78841911140704e-06, + "loss": 1.5163, + "step": 4616 + }, + { + "epoch": 0.6160928742994396, + "grad_norm": 0.9543391087151242, + "learning_rate": 6.784326456932129e-06, + "loss": 1.6248, + "step": 4617 + }, + { + "epoch": 0.6162263143848412, + "grad_norm": 1.1822886103916135, + "learning_rate": 6.7802344031084264e-06, + "loss": 1.589, + "step": 4618 + }, + { + "epoch": 0.6163597544702428, + "grad_norm": 1.0774092766967276, + "learning_rate": 6.77614295070028e-06, + "loss": 1.5877, + "step": 4619 + }, 
+ { + "epoch": 0.6164931945556446, + "grad_norm": 1.0639094196701442, + "learning_rate": 6.772052100471924e-06, + "loss": 1.6265, + "step": 4620 + }, + { + "epoch": 0.6166266346410462, + "grad_norm": 0.9726691460017839, + "learning_rate": 6.76796185318748e-06, + "loss": 1.5791, + "step": 4621 + }, + { + "epoch": 0.6167600747264478, + "grad_norm": 0.8990169867198253, + "learning_rate": 6.763872209610969e-06, + "loss": 1.5604, + "step": 4622 + }, + { + "epoch": 0.6168935148118495, + "grad_norm": 0.9730961102585249, + "learning_rate": 6.759783170506283e-06, + "loss": 1.5419, + "step": 4623 + }, + { + "epoch": 0.6170269548972511, + "grad_norm": 1.2338560915834138, + "learning_rate": 6.755694736637206e-06, + "loss": 1.5565, + "step": 4624 + }, + { + "epoch": 0.6171603949826527, + "grad_norm": 0.9206856089021481, + "learning_rate": 6.7516069087674186e-06, + "loss": 1.5654, + "step": 4625 + }, + { + "epoch": 0.6172938350680545, + "grad_norm": 0.9814257642355421, + "learning_rate": 6.747519687660477e-06, + "loss": 1.5811, + "step": 4626 + }, + { + "epoch": 0.6174272751534561, + "grad_norm": 0.9803535643820207, + "learning_rate": 6.743433074079826e-06, + "loss": 1.6115, + "step": 4627 + }, + { + "epoch": 0.6175607152388578, + "grad_norm": 0.9340160005856276, + "learning_rate": 6.739347068788795e-06, + "loss": 1.5486, + "step": 4628 + }, + { + "epoch": 0.6176941553242594, + "grad_norm": 3.36171539060685, + "learning_rate": 6.7352616725506125e-06, + "loss": 1.5931, + "step": 4629 + }, + { + "epoch": 0.617827595409661, + "grad_norm": 1.4276244458599252, + "learning_rate": 6.731176886128379e-06, + "loss": 1.5847, + "step": 4630 + }, + { + "epoch": 0.6179610354950628, + "grad_norm": 1.0518515488300342, + "learning_rate": 6.727092710285081e-06, + "loss": 1.5419, + "step": 4631 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 0.9733143124272681, + "learning_rate": 6.723009145783607e-06, + "loss": 1.5984, + "step": 4632 + }, + { + "epoch": 0.618227915665866, + "grad_norm": 0.9431452106066329, + "learning_rate": 6.718926193386714e-06, + "loss": 1.5877, + "step": 4633 + }, + { + "epoch": 0.6183613557512677, + "grad_norm": 1.0629787651629599, + "learning_rate": 6.714843853857052e-06, + "loss": 1.5851, + "step": 4634 + }, + { + "epoch": 0.6184947958366693, + "grad_norm": 1.110188030750085, + "learning_rate": 6.710762127957152e-06, + "loss": 1.5875, + "step": 4635 + }, + { + "epoch": 0.618628235922071, + "grad_norm": 1.1946206918422089, + "learning_rate": 6.706681016449441e-06, + "loss": 1.6129, + "step": 4636 + }, + { + "epoch": 0.6187616760074727, + "grad_norm": 0.996759831026245, + "learning_rate": 6.702600520096216e-06, + "loss": 1.5783, + "step": 4637 + }, + { + "epoch": 0.6188951160928743, + "grad_norm": 0.945997143369553, + "learning_rate": 6.698520639659674e-06, + "loss": 1.5278, + "step": 4638 + }, + { + "epoch": 0.6190285561782759, + "grad_norm": 0.9733621613054791, + "learning_rate": 6.694441375901888e-06, + "loss": 1.5809, + "step": 4639 + }, + { + "epoch": 0.6191619962636776, + "grad_norm": 0.9735974397589957, + "learning_rate": 6.690362729584818e-06, + "loss": 1.6003, + "step": 4640 + }, + { + "epoch": 0.6192954363490792, + "grad_norm": 0.9864220968652933, + "learning_rate": 6.686284701470309e-06, + "loss": 1.5841, + "step": 4641 + }, + { + "epoch": 0.619428876434481, + "grad_norm": 0.9772535789032403, + "learning_rate": 6.682207292320084e-06, + "loss": 1.5611, + "step": 4642 + }, + { + "epoch": 0.6195623165198826, + "grad_norm": 1.0280279728242607, + "learning_rate": 6.678130502895769e-06, 
+ "loss": 1.5529, + "step": 4643 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 0.9248902395905066, + "learning_rate": 6.674054333958854e-06, + "loss": 1.5616, + "step": 4644 + }, + { + "epoch": 0.6198291966906859, + "grad_norm": 0.9523872687282976, + "learning_rate": 6.669978786270721e-06, + "loss": 1.5617, + "step": 4645 + }, + { + "epoch": 0.6199626367760875, + "grad_norm": 0.9776370256998808, + "learning_rate": 6.665903860592635e-06, + "loss": 1.5794, + "step": 4646 + }, + { + "epoch": 0.6200960768614892, + "grad_norm": 0.952043520590941, + "learning_rate": 6.661829557685751e-06, + "loss": 1.4796, + "step": 4647 + }, + { + "epoch": 0.6202295169468909, + "grad_norm": 1.1835704731983463, + "learning_rate": 6.6577558783110985e-06, + "loss": 1.5927, + "step": 4648 + }, + { + "epoch": 0.6203629570322925, + "grad_norm": 0.9658315781942995, + "learning_rate": 6.65368282322959e-06, + "loss": 1.5559, + "step": 4649 + }, + { + "epoch": 0.6204963971176941, + "grad_norm": 0.9972805039885417, + "learning_rate": 6.649610393202037e-06, + "loss": 1.6225, + "step": 4650 + }, + { + "epoch": 0.6206298372030958, + "grad_norm": 1.0977055810151135, + "learning_rate": 6.645538588989117e-06, + "loss": 1.5555, + "step": 4651 + }, + { + "epoch": 0.6207632772884975, + "grad_norm": 1.25686084150492, + "learning_rate": 6.641467411351395e-06, + "loss": 1.6138, + "step": 4652 + }, + { + "epoch": 0.6208967173738991, + "grad_norm": 0.9606097211322975, + "learning_rate": 6.637396861049319e-06, + "loss": 1.5605, + "step": 4653 + }, + { + "epoch": 0.6210301574593008, + "grad_norm": 0.9543190537739319, + "learning_rate": 6.6333269388432295e-06, + "loss": 1.5894, + "step": 4654 + }, + { + "epoch": 0.6211635975447024, + "grad_norm": 0.9663911647320823, + "learning_rate": 6.6292576454933355e-06, + "loss": 1.5667, + "step": 4655 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 1.0490534949737367, + "learning_rate": 6.625188981759734e-06, + "loss": 1.6118, + "step": 4656 + }, + { + "epoch": 0.6214304777155057, + "grad_norm": 1.053734926356376, + "learning_rate": 6.621120948402411e-06, + "loss": 1.595, + "step": 4657 + }, + { + "epoch": 0.6215639178009074, + "grad_norm": 0.9503515200342192, + "learning_rate": 6.617053546181222e-06, + "loss": 1.5537, + "step": 4658 + }, + { + "epoch": 0.6216973578863091, + "grad_norm": 0.9603504106356093, + "learning_rate": 6.612986775855914e-06, + "loss": 1.5798, + "step": 4659 + }, + { + "epoch": 0.6218307979717107, + "grad_norm": 0.9347443417349168, + "learning_rate": 6.6089206381861135e-06, + "loss": 1.5463, + "step": 4660 + }, + { + "epoch": 0.6219642380571123, + "grad_norm": 0.9559788324052123, + "learning_rate": 6.60485513393133e-06, + "loss": 1.5512, + "step": 4661 + }, + { + "epoch": 0.622097678142514, + "grad_norm": 0.9350187591640845, + "learning_rate": 6.600790263850953e-06, + "loss": 1.5178, + "step": 4662 + }, + { + "epoch": 0.6222311182279157, + "grad_norm": 0.9721750502110509, + "learning_rate": 6.59672602870425e-06, + "loss": 1.6067, + "step": 4663 + }, + { + "epoch": 0.6223645583133173, + "grad_norm": 0.9681735106486332, + "learning_rate": 6.592662429250381e-06, + "loss": 1.5639, + "step": 4664 + }, + { + "epoch": 0.622497998398719, + "grad_norm": 1.0145576572286186, + "learning_rate": 6.588599466248376e-06, + "loss": 1.5534, + "step": 4665 + }, + { + "epoch": 0.6226314384841206, + "grad_norm": 1.0483912253591907, + "learning_rate": 6.58453714045715e-06, + "loss": 1.5743, + "step": 4666 + }, + { + "epoch": 0.6227648785695223, + "grad_norm": 0.9862160876054513, + 
"learning_rate": 6.580475452635495e-06, + "loss": 1.5696, + "step": 4667 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 0.9554839586876005, + "learning_rate": 6.576414403542098e-06, + "loss": 1.5909, + "step": 4668 + }, + { + "epoch": 0.6230317587403256, + "grad_norm": 1.0820406864016012, + "learning_rate": 6.57235399393551e-06, + "loss": 1.5565, + "step": 4669 + }, + { + "epoch": 0.6231651988257273, + "grad_norm": 0.98515013329777, + "learning_rate": 6.568294224574168e-06, + "loss": 1.5059, + "step": 4670 + }, + { + "epoch": 0.6232986389111289, + "grad_norm": 1.0239649377684183, + "learning_rate": 6.564235096216397e-06, + "loss": 1.6116, + "step": 4671 + }, + { + "epoch": 0.6234320789965305, + "grad_norm": 0.9784224342622083, + "learning_rate": 6.560176609620392e-06, + "loss": 1.6186, + "step": 4672 + }, + { + "epoch": 0.6235655190819323, + "grad_norm": 0.9678732574412918, + "learning_rate": 6.556118765544233e-06, + "loss": 1.5708, + "step": 4673 + }, + { + "epoch": 0.6236989591673339, + "grad_norm": 0.9800954857490598, + "learning_rate": 6.5520615647458754e-06, + "loss": 1.5868, + "step": 4674 + }, + { + "epoch": 0.6238323992527355, + "grad_norm": 1.052059487698131, + "learning_rate": 6.548005007983163e-06, + "loss": 1.5298, + "step": 4675 + }, + { + "epoch": 0.6239658393381372, + "grad_norm": 0.9774410359007661, + "learning_rate": 6.543949096013814e-06, + "loss": 1.6378, + "step": 4676 + }, + { + "epoch": 0.6240992794235388, + "grad_norm": 1.0154666358789204, + "learning_rate": 6.539893829595425e-06, + "loss": 1.5616, + "step": 4677 + }, + { + "epoch": 0.6242327195089404, + "grad_norm": 1.0875488553183978, + "learning_rate": 6.535839209485473e-06, + "loss": 1.6052, + "step": 4678 + }, + { + "epoch": 0.6243661595943422, + "grad_norm": 0.996755457440969, + "learning_rate": 6.531785236441316e-06, + "loss": 1.5472, + "step": 4679 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 0.9597796664804172, + "learning_rate": 6.527731911220188e-06, + "loss": 1.5385, + "step": 4680 + }, + { + "epoch": 0.6246330397651455, + "grad_norm": 0.9868274657841508, + "learning_rate": 6.523679234579207e-06, + "loss": 1.5555, + "step": 4681 + }, + { + "epoch": 0.6247664798505471, + "grad_norm": 0.9610538520046898, + "learning_rate": 6.5196272072753665e-06, + "loss": 1.6318, + "step": 4682 + }, + { + "epoch": 0.6248999199359487, + "grad_norm": 0.9714178990062254, + "learning_rate": 6.515575830065538e-06, + "loss": 1.5617, + "step": 4683 + }, + { + "epoch": 0.6250333600213505, + "grad_norm": 1.108314947127384, + "learning_rate": 6.511525103706473e-06, + "loss": 1.6024, + "step": 4684 + }, + { + "epoch": 0.6251668001067521, + "grad_norm": 0.9704286966755745, + "learning_rate": 6.507475028954797e-06, + "loss": 1.5746, + "step": 4685 + }, + { + "epoch": 0.6253002401921537, + "grad_norm": 0.9303993833203598, + "learning_rate": 6.503425606567026e-06, + "loss": 1.5919, + "step": 4686 + }, + { + "epoch": 0.6254336802775554, + "grad_norm": 0.9605139751499199, + "learning_rate": 6.499376837299541e-06, + "loss": 1.5261, + "step": 4687 + }, + { + "epoch": 0.625567120362957, + "grad_norm": 0.9509770084954142, + "learning_rate": 6.4953287219086035e-06, + "loss": 1.5105, + "step": 4688 + }, + { + "epoch": 0.6257005604483586, + "grad_norm": 0.9729887663740894, + "learning_rate": 6.491281261150362e-06, + "loss": 1.6217, + "step": 4689 + }, + { + "epoch": 0.6258340005337604, + "grad_norm": 0.9413725926216433, + "learning_rate": 6.487234455780833e-06, + "loss": 1.5847, + "step": 4690 + }, + { + "epoch": 
0.625967440619162, + "grad_norm": 0.9874467867153237, + "learning_rate": 6.483188306555913e-06, + "loss": 1.5823, + "step": 4691 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 0.9489046613674973, + "learning_rate": 6.479142814231374e-06, + "loss": 1.5656, + "step": 4692 + }, + { + "epoch": 0.6262343207899653, + "grad_norm": 0.9603586164165097, + "learning_rate": 6.475097979562873e-06, + "loss": 1.5801, + "step": 4693 + }, + { + "epoch": 0.6263677608753669, + "grad_norm": 0.9391371736350217, + "learning_rate": 6.471053803305938e-06, + "loss": 1.6123, + "step": 4694 + }, + { + "epoch": 0.6265012009607687, + "grad_norm": 0.9897178331583285, + "learning_rate": 6.467010286215971e-06, + "loss": 1.5621, + "step": 4695 + }, + { + "epoch": 0.6266346410461703, + "grad_norm": 1.3397968048069753, + "learning_rate": 6.462967429048259e-06, + "loss": 1.5448, + "step": 4696 + }, + { + "epoch": 0.6267680811315719, + "grad_norm": 0.9969569105203754, + "learning_rate": 6.458925232557964e-06, + "loss": 1.5672, + "step": 4697 + }, + { + "epoch": 0.6269015212169736, + "grad_norm": 1.4282904976235595, + "learning_rate": 6.4548836975001165e-06, + "loss": 1.5755, + "step": 4698 + }, + { + "epoch": 0.6270349613023752, + "grad_norm": 0.9304292340727252, + "learning_rate": 6.4508428246296306e-06, + "loss": 1.5704, + "step": 4699 + }, + { + "epoch": 0.6271684013877769, + "grad_norm": 1.0676289094313383, + "learning_rate": 6.446802614701298e-06, + "loss": 1.6326, + "step": 4700 + }, + { + "epoch": 0.6273018414731786, + "grad_norm": 0.9536617515956951, + "learning_rate": 6.44276306846978e-06, + "loss": 1.5803, + "step": 4701 + }, + { + "epoch": 0.6274352815585802, + "grad_norm": 1.0650052391897409, + "learning_rate": 6.438724186689621e-06, + "loss": 1.5345, + "step": 4702 + }, + { + "epoch": 0.6275687216439818, + "grad_norm": 0.928912755262742, + "learning_rate": 6.43468597011524e-06, + "loss": 1.6111, + "step": 4703 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 1.0609739791096682, + "learning_rate": 6.4306484195009246e-06, + "loss": 1.5605, + "step": 4704 + }, + { + "epoch": 0.6278356018147851, + "grad_norm": 1.0242522017660989, + "learning_rate": 6.426611535600848e-06, + "loss": 1.5373, + "step": 4705 + }, + { + "epoch": 0.6279690419001868, + "grad_norm": 1.1731834357230497, + "learning_rate": 6.422575319169047e-06, + "loss": 1.5407, + "step": 4706 + }, + { + "epoch": 0.6281024819855885, + "grad_norm": 0.9787653325551281, + "learning_rate": 6.418539770959451e-06, + "loss": 1.6097, + "step": 4707 + }, + { + "epoch": 0.6282359220709901, + "grad_norm": 1.0177554635966886, + "learning_rate": 6.414504891725848e-06, + "loss": 1.5137, + "step": 4708 + }, + { + "epoch": 0.6283693621563918, + "grad_norm": 0.9446114640014235, + "learning_rate": 6.41047068222191e-06, + "loss": 1.6302, + "step": 4709 + }, + { + "epoch": 0.6285028022417934, + "grad_norm": 4.459094665047282, + "learning_rate": 6.406437143201174e-06, + "loss": 1.6297, + "step": 4710 + }, + { + "epoch": 0.6286362423271951, + "grad_norm": 0.9485026202281593, + "learning_rate": 6.402404275417071e-06, + "loss": 1.532, + "step": 4711 + }, + { + "epoch": 0.6287696824125968, + "grad_norm": 0.9321489717238166, + "learning_rate": 6.39837207962289e-06, + "loss": 1.5491, + "step": 4712 + }, + { + "epoch": 0.6289031224979984, + "grad_norm": 1.013553078508425, + "learning_rate": 6.394340556571794e-06, + "loss": 1.5433, + "step": 4713 + }, + { + "epoch": 0.6290365625834, + "grad_norm": 0.9238968592943705, + "learning_rate": 6.390309707016833e-06, + "loss": 
1.5176, + "step": 4714 + }, + { + "epoch": 0.6291700026688017, + "grad_norm": 0.9769175672259005, + "learning_rate": 6.386279531710921e-06, + "loss": 1.5942, + "step": 4715 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 0.9697292609419739, + "learning_rate": 6.382250031406851e-06, + "loss": 1.5822, + "step": 4716 + }, + { + "epoch": 0.629436882839605, + "grad_norm": 1.0258012834838501, + "learning_rate": 6.378221206857278e-06, + "loss": 1.5787, + "step": 4717 + }, + { + "epoch": 0.6295703229250067, + "grad_norm": 0.9633940936233738, + "learning_rate": 6.374193058814755e-06, + "loss": 1.557, + "step": 4718 + }, + { + "epoch": 0.6297037630104083, + "grad_norm": 0.9921666668432783, + "learning_rate": 6.370165588031686e-06, + "loss": 1.5687, + "step": 4719 + }, + { + "epoch": 0.6298372030958099, + "grad_norm": 0.9501732810691954, + "learning_rate": 6.366138795260356e-06, + "loss": 1.4943, + "step": 4720 + }, + { + "epoch": 0.6299706431812117, + "grad_norm": 0.952051603638449, + "learning_rate": 6.362112681252928e-06, + "loss": 1.5632, + "step": 4721 + }, + { + "epoch": 0.6301040832666133, + "grad_norm": 1.0075164901271165, + "learning_rate": 6.358087246761432e-06, + "loss": 1.5648, + "step": 4722 + }, + { + "epoch": 0.630237523352015, + "grad_norm": 0.9580671930174274, + "learning_rate": 6.354062492537772e-06, + "loss": 1.5348, + "step": 4723 + }, + { + "epoch": 0.6303709634374166, + "grad_norm": 0.9723415922337499, + "learning_rate": 6.3500384193337275e-06, + "loss": 1.5957, + "step": 4724 + }, + { + "epoch": 0.6305044035228182, + "grad_norm": 1.0243012844746644, + "learning_rate": 6.346015027900951e-06, + "loss": 1.6132, + "step": 4725 + }, + { + "epoch": 0.63063784360822, + "grad_norm": 0.923480627824626, + "learning_rate": 6.3419923189909674e-06, + "loss": 1.6316, + "step": 4726 + }, + { + "epoch": 0.6307712836936216, + "grad_norm": 0.9845476502020702, + "learning_rate": 6.337970293355164e-06, + "loss": 1.5867, + "step": 4727 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 1.0867072821365296, + "learning_rate": 6.333948951744823e-06, + "loss": 1.5594, + "step": 4728 + }, + { + "epoch": 0.6310381638644249, + "grad_norm": 0.9725956783048197, + "learning_rate": 6.329928294911076e-06, + "loss": 1.5799, + "step": 4729 + }, + { + "epoch": 0.6311716039498265, + "grad_norm": 0.92191760617083, + "learning_rate": 6.325908323604939e-06, + "loss": 1.5785, + "step": 4730 + }, + { + "epoch": 0.6313050440352281, + "grad_norm": 0.9502477068461954, + "learning_rate": 6.321889038577291e-06, + "loss": 1.5772, + "step": 4731 + }, + { + "epoch": 0.6314384841206299, + "grad_norm": 0.9378807993847438, + "learning_rate": 6.317870440578899e-06, + "loss": 1.5395, + "step": 4732 + }, + { + "epoch": 0.6315719242060315, + "grad_norm": 1.0944522310079952, + "learning_rate": 6.313852530360387e-06, + "loss": 1.597, + "step": 4733 + }, + { + "epoch": 0.6317053642914331, + "grad_norm": 0.9643616082201257, + "learning_rate": 6.309835308672248e-06, + "loss": 1.5474, + "step": 4734 + }, + { + "epoch": 0.6318388043768348, + "grad_norm": 1.1104130886728687, + "learning_rate": 6.305818776264864e-06, + "loss": 1.5716, + "step": 4735 + }, + { + "epoch": 0.6319722444622364, + "grad_norm": 1.0769075057648472, + "learning_rate": 6.301802933888472e-06, + "loss": 1.5691, + "step": 4736 + }, + { + "epoch": 0.6321056845476382, + "grad_norm": 0.9721124377884932, + "learning_rate": 6.297787782293188e-06, + "loss": 1.5656, + "step": 4737 + }, + { + "epoch": 0.6322391246330398, + "grad_norm": 0.945916645197926, + 
"learning_rate": 6.293773322228989e-06, + "loss": 1.5976, + "step": 4738 + }, + { + "epoch": 0.6323725647184414, + "grad_norm": 1.0327667218788614, + "learning_rate": 6.2897595544457426e-06, + "loss": 1.5513, + "step": 4739 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 0.9500434363342504, + "learning_rate": 6.285746479693166e-06, + "loss": 1.5667, + "step": 4740 + }, + { + "epoch": 0.6326394448892447, + "grad_norm": 1.1553660667606065, + "learning_rate": 6.281734098720862e-06, + "loss": 1.5331, + "step": 4741 + }, + { + "epoch": 0.6327728849746463, + "grad_norm": 1.0053138525266323, + "learning_rate": 6.27772241227829e-06, + "loss": 1.6188, + "step": 4742 + }, + { + "epoch": 0.6329063250600481, + "grad_norm": 1.117959784943353, + "learning_rate": 6.2737114211147946e-06, + "loss": 1.6097, + "step": 4743 + }, + { + "epoch": 0.6330397651454497, + "grad_norm": 0.91872923216644, + "learning_rate": 6.269701125979577e-06, + "loss": 1.5345, + "step": 4744 + }, + { + "epoch": 0.6331732052308513, + "grad_norm": 1.0580192994245377, + "learning_rate": 6.26569152762172e-06, + "loss": 1.5928, + "step": 4745 + }, + { + "epoch": 0.633306645316253, + "grad_norm": 0.9371703645600504, + "learning_rate": 6.261682626790169e-06, + "loss": 1.5538, + "step": 4746 + }, + { + "epoch": 0.6334400854016546, + "grad_norm": 0.9689827210946913, + "learning_rate": 6.257674424233742e-06, + "loss": 1.6359, + "step": 4747 + }, + { + "epoch": 0.6335735254870564, + "grad_norm": 0.9472008982891928, + "learning_rate": 6.253666920701125e-06, + "loss": 1.6057, + "step": 4748 + }, + { + "epoch": 0.633706965572458, + "grad_norm": 0.9192823024984135, + "learning_rate": 6.249660116940869e-06, + "loss": 1.5798, + "step": 4749 + }, + { + "epoch": 0.6338404056578596, + "grad_norm": 1.031749263396716, + "learning_rate": 6.245654013701408e-06, + "loss": 1.5517, + "step": 4750 + }, + { + "epoch": 0.6339738457432613, + "grad_norm": 0.9624574391068491, + "learning_rate": 6.2416486117310326e-06, + "loss": 1.6054, + "step": 4751 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 0.9912718125263276, + "learning_rate": 6.237643911777902e-06, + "loss": 1.5548, + "step": 4752 + }, + { + "epoch": 0.6342407259140646, + "grad_norm": 0.9308587514815552, + "learning_rate": 6.233639914590056e-06, + "loss": 1.5791, + "step": 4753 + }, + { + "epoch": 0.6343741659994663, + "grad_norm": 0.9574581547965888, + "learning_rate": 6.2296366209153935e-06, + "loss": 1.582, + "step": 4754 + }, + { + "epoch": 0.6345076060848679, + "grad_norm": 0.9471871838216913, + "learning_rate": 6.225634031501682e-06, + "loss": 1.5467, + "step": 4755 + }, + { + "epoch": 0.6346410461702695, + "grad_norm": 0.9623094686395599, + "learning_rate": 6.221632147096557e-06, + "loss": 1.5942, + "step": 4756 + }, + { + "epoch": 0.6347744862556712, + "grad_norm": 1.0311697985777823, + "learning_rate": 6.2176309684475325e-06, + "loss": 1.5744, + "step": 4757 + }, + { + "epoch": 0.6349079263410728, + "grad_norm": 1.000138982436376, + "learning_rate": 6.213630496301979e-06, + "loss": 1.6009, + "step": 4758 + }, + { + "epoch": 0.6350413664264745, + "grad_norm": 1.0996770869148338, + "learning_rate": 6.209630731407138e-06, + "loss": 1.5789, + "step": 4759 + }, + { + "epoch": 0.6351748065118762, + "grad_norm": 0.9621838976319239, + "learning_rate": 6.205631674510122e-06, + "loss": 1.5206, + "step": 4760 + }, + { + "epoch": 0.6353082465972778, + "grad_norm": 1.0190348806503502, + "learning_rate": 6.2016333263579096e-06, + "loss": 1.529, + "step": 4761 + }, + { + "epoch": 
0.6354416866826795, + "grad_norm": 0.9330752296166617, + "learning_rate": 6.197635687697345e-06, + "loss": 1.5315, + "step": 4762 + }, + { + "epoch": 0.6355751267680811, + "grad_norm": 0.9925347326400968, + "learning_rate": 6.1936387592751425e-06, + "loss": 1.5504, + "step": 4763 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 0.9627856019335367, + "learning_rate": 6.189642541837883e-06, + "loss": 1.5874, + "step": 4764 + }, + { + "epoch": 0.6358420069388845, + "grad_norm": 1.0100325648862956, + "learning_rate": 6.185647036132013e-06, + "loss": 1.6278, + "step": 4765 + }, + { + "epoch": 0.6359754470242861, + "grad_norm": 0.9333049284580013, + "learning_rate": 6.181652242903848e-06, + "loss": 1.5312, + "step": 4766 + }, + { + "epoch": 0.6361088871096877, + "grad_norm": 0.9491077426744001, + "learning_rate": 6.177658162899573e-06, + "loss": 1.5889, + "step": 4767 + }, + { + "epoch": 0.6362423271950894, + "grad_norm": 1.0817082893352956, + "learning_rate": 6.1736647968652345e-06, + "loss": 1.5957, + "step": 4768 + }, + { + "epoch": 0.6363757672804911, + "grad_norm": 1.0229373542272857, + "learning_rate": 6.169672145546747e-06, + "loss": 1.5911, + "step": 4769 + }, + { + "epoch": 0.6365092073658927, + "grad_norm": 1.0467028908683291, + "learning_rate": 6.165680209689889e-06, + "loss": 1.5631, + "step": 4770 + }, + { + "epoch": 0.6366426474512944, + "grad_norm": 0.9981899865143897, + "learning_rate": 6.161688990040315e-06, + "loss": 1.566, + "step": 4771 + }, + { + "epoch": 0.636776087536696, + "grad_norm": 0.9334836157550336, + "learning_rate": 6.15769848734354e-06, + "loss": 1.5353, + "step": 4772 + }, + { + "epoch": 0.6369095276220976, + "grad_norm": 0.9251044403398437, + "learning_rate": 6.1537087023449386e-06, + "loss": 1.5542, + "step": 4773 + }, + { + "epoch": 0.6370429677074994, + "grad_norm": 0.9282614735198017, + "learning_rate": 6.149719635789757e-06, + "loss": 1.5508, + "step": 4774 + }, + { + "epoch": 0.637176407792901, + "grad_norm": 0.9399449871354199, + "learning_rate": 6.145731288423114e-06, + "loss": 1.5636, + "step": 4775 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 0.9865252792900647, + "learning_rate": 6.141743660989984e-06, + "loss": 1.5661, + "step": 4776 + }, + { + "epoch": 0.6374432879637043, + "grad_norm": 0.9797903050975381, + "learning_rate": 6.137756754235205e-06, + "loss": 1.5981, + "step": 4777 + }, + { + "epoch": 0.6375767280491059, + "grad_norm": 0.9421928709402042, + "learning_rate": 6.133770568903497e-06, + "loss": 1.5313, + "step": 4778 + }, + { + "epoch": 0.6377101681345076, + "grad_norm": 1.0060524251777645, + "learning_rate": 6.129785105739427e-06, + "loss": 1.6234, + "step": 4779 + }, + { + "epoch": 0.6378436082199093, + "grad_norm": 0.9803594189527336, + "learning_rate": 6.1258003654874355e-06, + "loss": 1.58, + "step": 4780 + }, + { + "epoch": 0.6379770483053109, + "grad_norm": 0.9902714344055336, + "learning_rate": 6.121816348891822e-06, + "loss": 1.5152, + "step": 4781 + }, + { + "epoch": 0.6381104883907126, + "grad_norm": 1.057224007473837, + "learning_rate": 6.117833056696765e-06, + "loss": 1.5827, + "step": 4782 + }, + { + "epoch": 0.6382439284761142, + "grad_norm": 1.4743335061623923, + "learning_rate": 6.1138504896462915e-06, + "loss": 1.593, + "step": 4783 + }, + { + "epoch": 0.6383773685615158, + "grad_norm": 0.953837336988305, + "learning_rate": 6.1098686484843e-06, + "loss": 1.5946, + "step": 4784 + }, + { + "epoch": 0.6385108086469176, + "grad_norm": 1.0530003073440923, + "learning_rate": 6.105887533954555e-06, + "loss": 
1.5438, + "step": 4785 + }, + { + "epoch": 0.6386442487323192, + "grad_norm": 1.091118899015719, + "learning_rate": 6.101907146800683e-06, + "loss": 1.5471, + "step": 4786 + }, + { + "epoch": 0.6387776888177208, + "grad_norm": 0.9672697431062299, + "learning_rate": 6.0979274877661734e-06, + "loss": 1.5845, + "step": 4787 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 1.004597787760983, + "learning_rate": 6.09394855759438e-06, + "loss": 1.5889, + "step": 4788 + }, + { + "epoch": 0.6390445689885241, + "grad_norm": 0.9685979515200861, + "learning_rate": 6.089970357028528e-06, + "loss": 1.5362, + "step": 4789 + }, + { + "epoch": 0.6391780090739259, + "grad_norm": 0.9618619817586496, + "learning_rate": 6.085992886811696e-06, + "loss": 1.5808, + "step": 4790 + }, + { + "epoch": 0.6393114491593275, + "grad_norm": 0.928222428021561, + "learning_rate": 6.082016147686824e-06, + "loss": 1.5076, + "step": 4791 + }, + { + "epoch": 0.6394448892447291, + "grad_norm": 0.9491261783995589, + "learning_rate": 6.0780401403967346e-06, + "loss": 1.5641, + "step": 4792 + }, + { + "epoch": 0.6395783293301308, + "grad_norm": 1.0480229583034089, + "learning_rate": 6.0740648656840925e-06, + "loss": 1.54, + "step": 4793 + }, + { + "epoch": 0.6397117694155324, + "grad_norm": 0.9544547807714423, + "learning_rate": 6.070090324291436e-06, + "loss": 1.5695, + "step": 4794 + }, + { + "epoch": 0.639845209500934, + "grad_norm": 0.9172480413703332, + "learning_rate": 6.066116516961157e-06, + "loss": 1.525, + "step": 4795 + }, + { + "epoch": 0.6399786495863358, + "grad_norm": 0.9492147743557356, + "learning_rate": 6.062143444435529e-06, + "loss": 1.5423, + "step": 4796 + }, + { + "epoch": 0.6401120896717374, + "grad_norm": 0.97339963817088, + "learning_rate": 6.058171107456672e-06, + "loss": 1.5391, + "step": 4797 + }, + { + "epoch": 0.640245529757139, + "grad_norm": 1.2402769128852358, + "learning_rate": 6.0541995067665675e-06, + "loss": 1.5826, + "step": 4798 + }, + { + "epoch": 0.6403789698425407, + "grad_norm": 0.9756211394635729, + "learning_rate": 6.050228643107074e-06, + "loss": 1.5811, + "step": 4799 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 0.923769213113528, + "learning_rate": 6.046258517219902e-06, + "loss": 1.54, + "step": 4800 + }, + { + "epoch": 0.640645850013344, + "grad_norm": 1.1597847530387866, + "learning_rate": 6.042289129846623e-06, + "loss": 1.5603, + "step": 4801 + }, + { + "epoch": 0.6407792900987457, + "grad_norm": 1.1413316657225265, + "learning_rate": 6.03832048172867e-06, + "loss": 1.5174, + "step": 4802 + }, + { + "epoch": 0.6409127301841473, + "grad_norm": 0.9271474721968134, + "learning_rate": 6.0343525736073506e-06, + "loss": 1.5434, + "step": 4803 + }, + { + "epoch": 0.641046170269549, + "grad_norm": 0.9223447060910756, + "learning_rate": 6.03038540622382e-06, + "loss": 1.5687, + "step": 4804 + }, + { + "epoch": 0.6411796103549506, + "grad_norm": 0.9276917949899848, + "learning_rate": 6.026418980319098e-06, + "loss": 1.5586, + "step": 4805 + }, + { + "epoch": 0.6413130504403523, + "grad_norm": 0.9541595431849743, + "learning_rate": 6.022453296634069e-06, + "loss": 1.5539, + "step": 4806 + }, + { + "epoch": 0.641446490525754, + "grad_norm": 7.6768327775349, + "learning_rate": 6.01848835590948e-06, + "loss": 1.5641, + "step": 4807 + }, + { + "epoch": 0.6415799306111556, + "grad_norm": 1.2862124397586092, + "learning_rate": 6.0145241588859324e-06, + "loss": 1.5405, + "step": 4808 + }, + { + "epoch": 0.6417133706965572, + "grad_norm": 0.9605935993165724, + "learning_rate": 
6.010560706303896e-06, + "loss": 1.5141, + "step": 4809 + }, + { + "epoch": 0.6418468107819589, + "grad_norm": 0.9542783776622776, + "learning_rate": 6.006597998903699e-06, + "loss": 1.522, + "step": 4810 + }, + { + "epoch": 0.6419802508673605, + "grad_norm": 0.9681776765662814, + "learning_rate": 6.002636037425531e-06, + "loss": 1.5561, + "step": 4811 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 0.9642447674867372, + "learning_rate": 5.998674822609438e-06, + "loss": 1.5985, + "step": 4812 + }, + { + "epoch": 0.6422471310381639, + "grad_norm": 0.991926581119737, + "learning_rate": 5.9947143551953275e-06, + "loss": 1.536, + "step": 4813 + }, + { + "epoch": 0.6423805711235655, + "grad_norm": 0.9566617525797997, + "learning_rate": 5.99075463592298e-06, + "loss": 1.5697, + "step": 4814 + }, + { + "epoch": 0.6425140112089672, + "grad_norm": 0.9676450282612642, + "learning_rate": 5.986795665532017e-06, + "loss": 1.5289, + "step": 4815 + }, + { + "epoch": 0.6426474512943688, + "grad_norm": 0.9268671621535911, + "learning_rate": 5.982837444761929e-06, + "loss": 1.5541, + "step": 4816 + }, + { + "epoch": 0.6427808913797705, + "grad_norm": 0.9595643521431843, + "learning_rate": 5.978879974352072e-06, + "loss": 1.5536, + "step": 4817 + }, + { + "epoch": 0.6429143314651722, + "grad_norm": 0.9796809856469711, + "learning_rate": 5.974923255041653e-06, + "loss": 1.5676, + "step": 4818 + }, + { + "epoch": 0.6430477715505738, + "grad_norm": 0.9646690903498765, + "learning_rate": 5.9709672875697445e-06, + "loss": 1.6259, + "step": 4819 + }, + { + "epoch": 0.6431812116359754, + "grad_norm": 0.9534358718940747, + "learning_rate": 5.967012072675269e-06, + "loss": 1.5495, + "step": 4820 + }, + { + "epoch": 0.6433146517213771, + "grad_norm": 0.9361561726578995, + "learning_rate": 5.963057611097026e-06, + "loss": 1.5803, + "step": 4821 + }, + { + "epoch": 0.6434480918067788, + "grad_norm": 0.9139126102709447, + "learning_rate": 5.9591039035736595e-06, + "loss": 1.5603, + "step": 4822 + }, + { + "epoch": 0.6435815318921804, + "grad_norm": 1.0664502849150646, + "learning_rate": 5.955150950843673e-06, + "loss": 1.5331, + "step": 4823 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 1.021561984009056, + "learning_rate": 5.951198753645437e-06, + "loss": 1.5335, + "step": 4824 + }, + { + "epoch": 0.6438484120629837, + "grad_norm": 0.9503166506512712, + "learning_rate": 5.9472473127171795e-06, + "loss": 1.5493, + "step": 4825 + }, + { + "epoch": 0.6439818521483853, + "grad_norm": 0.9149759543673214, + "learning_rate": 5.9432966287969805e-06, + "loss": 1.5783, + "step": 4826 + }, + { + "epoch": 0.644115292233787, + "grad_norm": 0.9900352522766276, + "learning_rate": 5.939346702622782e-06, + "loss": 1.632, + "step": 4827 + }, + { + "epoch": 0.6442487323191887, + "grad_norm": 0.9616514543356586, + "learning_rate": 5.93539753493239e-06, + "loss": 1.5656, + "step": 4828 + }, + { + "epoch": 0.6443821724045904, + "grad_norm": 0.9438961654754505, + "learning_rate": 5.93144912646346e-06, + "loss": 1.5744, + "step": 4829 + }, + { + "epoch": 0.644515612489992, + "grad_norm": 0.914116676055478, + "learning_rate": 5.927501477953508e-06, + "loss": 1.5665, + "step": 4830 + }, + { + "epoch": 0.6446490525753936, + "grad_norm": 1.0053787424529315, + "learning_rate": 5.923554590139917e-06, + "loss": 1.5984, + "step": 4831 + }, + { + "epoch": 0.6447824926607953, + "grad_norm": 1.0553270414804121, + "learning_rate": 5.919608463759916e-06, + "loss": 1.6098, + "step": 4832 + }, + { + "epoch": 0.644915932746197, + "grad_norm": 
1.0814676717925145, + "learning_rate": 5.915663099550597e-06, + "loss": 1.5929, + "step": 4833 + }, + { + "epoch": 0.6450493728315986, + "grad_norm": 1.2260986664990163, + "learning_rate": 5.911718498248904e-06, + "loss": 1.5417, + "step": 4834 + }, + { + "epoch": 0.6451828129170003, + "grad_norm": 0.9760168090523013, + "learning_rate": 5.907774660591654e-06, + "loss": 1.5707, + "step": 4835 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 1.038465167882452, + "learning_rate": 5.903831587315505e-06, + "loss": 1.5461, + "step": 4836 + }, + { + "epoch": 0.6454496930878035, + "grad_norm": 0.9965870568346387, + "learning_rate": 5.899889279156976e-06, + "loss": 1.5951, + "step": 4837 + }, + { + "epoch": 0.6455831331732053, + "grad_norm": 0.921641688093775, + "learning_rate": 5.895947736852452e-06, + "loss": 1.5367, + "step": 4838 + }, + { + "epoch": 0.6457165732586069, + "grad_norm": 0.9488713580551892, + "learning_rate": 5.892006961138164e-06, + "loss": 1.5896, + "step": 4839 + }, + { + "epoch": 0.6458500133440085, + "grad_norm": 0.94133164082473, + "learning_rate": 5.8880669527502035e-06, + "loss": 1.5983, + "step": 4840 + }, + { + "epoch": 0.6459834534294102, + "grad_norm": 0.960447508576979, + "learning_rate": 5.884127712424517e-06, + "loss": 1.5464, + "step": 4841 + }, + { + "epoch": 0.6461168935148118, + "grad_norm": 1.0620853615582413, + "learning_rate": 5.880189240896916e-06, + "loss": 1.5659, + "step": 4842 + }, + { + "epoch": 0.6462503336002136, + "grad_norm": 0.9604672083966226, + "learning_rate": 5.876251538903059e-06, + "loss": 1.5793, + "step": 4843 + }, + { + "epoch": 0.6463837736856152, + "grad_norm": 0.96613702059847, + "learning_rate": 5.8723146071784654e-06, + "loss": 1.58, + "step": 4844 + }, + { + "epoch": 0.6465172137710168, + "grad_norm": 0.9248419108271216, + "learning_rate": 5.868378446458503e-06, + "loss": 1.5689, + "step": 4845 + }, + { + "epoch": 0.6466506538564185, + "grad_norm": 0.9190338150548885, + "learning_rate": 5.864443057478411e-06, + "loss": 1.5892, + "step": 4846 + }, + { + "epoch": 0.6467840939418201, + "grad_norm": 1.0016350457379213, + "learning_rate": 5.860508440973269e-06, + "loss": 1.5805, + "step": 4847 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 1.0549731633470492, + "learning_rate": 5.856574597678019e-06, + "loss": 1.5599, + "step": 4848 + }, + { + "epoch": 0.6470509741126235, + "grad_norm": 1.0001222985459557, + "learning_rate": 5.852641528327463e-06, + "loss": 1.5254, + "step": 4849 + }, + { + "epoch": 0.6471844141980251, + "grad_norm": 0.9739278595702441, + "learning_rate": 5.848709233656249e-06, + "loss": 1.6014, + "step": 4850 + }, + { + "epoch": 0.6473178542834267, + "grad_norm": 0.9808998489102456, + "learning_rate": 5.844777714398884e-06, + "loss": 1.5885, + "step": 4851 + }, + { + "epoch": 0.6474512943688284, + "grad_norm": 0.9562991254634166, + "learning_rate": 5.840846971289733e-06, + "loss": 1.5986, + "step": 4852 + }, + { + "epoch": 0.64758473445423, + "grad_norm": 0.9170189603539738, + "learning_rate": 5.836917005063016e-06, + "loss": 1.5917, + "step": 4853 + }, + { + "epoch": 0.6477181745396317, + "grad_norm": 1.0146263459940281, + "learning_rate": 5.832987816452804e-06, + "loss": 1.5877, + "step": 4854 + }, + { + "epoch": 0.6478516146250334, + "grad_norm": 1.0944373179261144, + "learning_rate": 5.82905940619302e-06, + "loss": 1.5499, + "step": 4855 + }, + { + "epoch": 0.647985054710435, + "grad_norm": 1.0119646752828362, + "learning_rate": 5.825131775017457e-06, + "loss": 1.5549, + "step": 4856 + }, + { + "epoch": 
0.6481184947958367, + "grad_norm": 0.9692668549051643, + "learning_rate": 5.8212049236597426e-06, + "loss": 1.5815, + "step": 4857 + }, + { + "epoch": 0.6482519348812383, + "grad_norm": 0.9836312870853201, + "learning_rate": 5.817278852853373e-06, + "loss": 1.5055, + "step": 4858 + }, + { + "epoch": 0.64838537496664, + "grad_norm": 0.9881357720367061, + "learning_rate": 5.813353563331687e-06, + "loss": 1.6121, + "step": 4859 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 0.9589370913572998, + "learning_rate": 5.809429055827893e-06, + "loss": 1.5853, + "step": 4860 + }, + { + "epoch": 0.6486522551374433, + "grad_norm": 1.0049866924367945, + "learning_rate": 5.805505331075041e-06, + "loss": 1.5434, + "step": 4861 + }, + { + "epoch": 0.6487856952228449, + "grad_norm": 0.9715555700327767, + "learning_rate": 5.801582389806031e-06, + "loss": 1.5507, + "step": 4862 + }, + { + "epoch": 0.6489191353082466, + "grad_norm": 1.048295893300858, + "learning_rate": 5.797660232753635e-06, + "loss": 1.6251, + "step": 4863 + }, + { + "epoch": 0.6490525753936482, + "grad_norm": 1.1124589689964615, + "learning_rate": 5.793738860650462e-06, + "loss": 1.5035, + "step": 4864 + }, + { + "epoch": 0.6491860154790499, + "grad_norm": 1.0962898619845716, + "learning_rate": 5.78981827422898e-06, + "loss": 1.5325, + "step": 4865 + }, + { + "epoch": 0.6493194555644516, + "grad_norm": 0.9341503031181468, + "learning_rate": 5.785898474221505e-06, + "loss": 1.4986, + "step": 4866 + }, + { + "epoch": 0.6494528956498532, + "grad_norm": 1.042466248631925, + "learning_rate": 5.781979461360221e-06, + "loss": 1.5031, + "step": 4867 + }, + { + "epoch": 0.6495863357352548, + "grad_norm": 0.9970733349938308, + "learning_rate": 5.778061236377148e-06, + "loss": 1.5859, + "step": 4868 + }, + { + "epoch": 0.6497197758206565, + "grad_norm": 0.9565435536603323, + "learning_rate": 5.774143800004164e-06, + "loss": 1.5419, + "step": 4869 + }, + { + "epoch": 0.6498532159060582, + "grad_norm": 0.9836979900956078, + "learning_rate": 5.770227152973009e-06, + "loss": 1.5494, + "step": 4870 + }, + { + "epoch": 0.6499866559914599, + "grad_norm": 1.0135189985173951, + "learning_rate": 5.766311296015263e-06, + "loss": 1.6009, + "step": 4871 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 0.9506176687117898, + "learning_rate": 5.762396229862367e-06, + "loss": 1.5465, + "step": 4872 + }, + { + "epoch": 0.6502535361622631, + "grad_norm": 0.971050453122327, + "learning_rate": 5.758481955245603e-06, + "loss": 1.5391, + "step": 4873 + }, + { + "epoch": 0.6503869762476648, + "grad_norm": 1.0432819879051074, + "learning_rate": 5.754568472896123e-06, + "loss": 1.5995, + "step": 4874 + }, + { + "epoch": 0.6505204163330665, + "grad_norm": 1.079857789585133, + "learning_rate": 5.750655783544914e-06, + "loss": 1.5867, + "step": 4875 + }, + { + "epoch": 0.6506538564184681, + "grad_norm": 0.9601148800734111, + "learning_rate": 5.7467438879228255e-06, + "loss": 1.6103, + "step": 4876 + }, + { + "epoch": 0.6507872965038698, + "grad_norm": 1.0295074482916855, + "learning_rate": 5.7428327867605505e-06, + "loss": 1.5321, + "step": 4877 + }, + { + "epoch": 0.6509207365892714, + "grad_norm": 0.9531639765490284, + "learning_rate": 5.738922480788645e-06, + "loss": 1.5331, + "step": 4878 + }, + { + "epoch": 0.651054176674673, + "grad_norm": 0.9435011459958462, + "learning_rate": 5.7350129707375035e-06, + "loss": 1.5342, + "step": 4879 + }, + { + "epoch": 0.6511876167600748, + "grad_norm": 0.9604110330797555, + "learning_rate": 5.731104257337383e-06, + "loss": 
1.671, + "step": 4880 + }, + { + "epoch": 0.6513210568454764, + "grad_norm": 1.1655933070828943, + "learning_rate": 5.727196341318383e-06, + "loss": 1.5843, + "step": 4881 + }, + { + "epoch": 0.6514544969308781, + "grad_norm": 0.915628984133999, + "learning_rate": 5.72328922341046e-06, + "loss": 1.5624, + "step": 4882 + }, + { + "epoch": 0.6515879370162797, + "grad_norm": 0.9121442794247614, + "learning_rate": 5.719382904343411e-06, + "loss": 1.5364, + "step": 4883 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 1.0971113356076188, + "learning_rate": 5.715477384846906e-06, + "loss": 1.606, + "step": 4884 + }, + { + "epoch": 0.651854817187083, + "grad_norm": 0.9787600524230616, + "learning_rate": 5.7115726656504425e-06, + "loss": 1.5761, + "step": 4885 + }, + { + "epoch": 0.6519882572724847, + "grad_norm": 0.9284424960740136, + "learning_rate": 5.7076687474833795e-06, + "loss": 1.5292, + "step": 4886 + }, + { + "epoch": 0.6521216973578863, + "grad_norm": 0.9590699643912758, + "learning_rate": 5.703765631074922e-06, + "loss": 1.5564, + "step": 4887 + }, + { + "epoch": 0.652255137443288, + "grad_norm": 0.9384956754557705, + "learning_rate": 5.699863317154133e-06, + "loss": 1.5851, + "step": 4888 + }, + { + "epoch": 0.6523885775286896, + "grad_norm": 0.9240054444337821, + "learning_rate": 5.695961806449917e-06, + "loss": 1.5768, + "step": 4889 + }, + { + "epoch": 0.6525220176140912, + "grad_norm": 1.021644427657407, + "learning_rate": 5.692061099691033e-06, + "loss": 1.5574, + "step": 4890 + }, + { + "epoch": 0.652655457699493, + "grad_norm": 1.017574465371194, + "learning_rate": 5.688161197606083e-06, + "loss": 1.6171, + "step": 4891 + }, + { + "epoch": 0.6527888977848946, + "grad_norm": 0.9327196434111883, + "learning_rate": 5.6842621009235345e-06, + "loss": 1.5752, + "step": 4892 + }, + { + "epoch": 0.6529223378702962, + "grad_norm": 0.9728469536906587, + "learning_rate": 5.680363810371687e-06, + "loss": 1.6167, + "step": 4893 + }, + { + "epoch": 0.6530557779556979, + "grad_norm": 1.1670661476372766, + "learning_rate": 5.676466326678697e-06, + "loss": 1.5304, + "step": 4894 + }, + { + "epoch": 0.6531892180410995, + "grad_norm": 1.0260789492682234, + "learning_rate": 5.672569650572574e-06, + "loss": 1.5498, + "step": 4895 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 0.942513307680248, + "learning_rate": 5.668673782781173e-06, + "loss": 1.5392, + "step": 4896 + }, + { + "epoch": 0.6534560982119029, + "grad_norm": 0.9337827595786473, + "learning_rate": 5.664778724032194e-06, + "loss": 1.5591, + "step": 4897 + }, + { + "epoch": 0.6535895382973045, + "grad_norm": 0.9280660756568939, + "learning_rate": 5.660884475053187e-06, + "loss": 1.5647, + "step": 4898 + }, + { + "epoch": 0.6537229783827062, + "grad_norm": 0.9775864120631028, + "learning_rate": 5.656991036571561e-06, + "loss": 1.5598, + "step": 4899 + }, + { + "epoch": 0.6538564184681078, + "grad_norm": 0.9090145991125382, + "learning_rate": 5.653098409314562e-06, + "loss": 1.5081, + "step": 4900 + }, + { + "epoch": 0.6539898585535094, + "grad_norm": 1.1498598455869353, + "learning_rate": 5.649206594009287e-06, + "loss": 1.5525, + "step": 4901 + }, + { + "epoch": 0.6541232986389112, + "grad_norm": 0.9452608782280502, + "learning_rate": 5.645315591382686e-06, + "loss": 1.484, + "step": 4902 + }, + { + "epoch": 0.6542567387243128, + "grad_norm": 1.0058072441790753, + "learning_rate": 5.641425402161553e-06, + "loss": 1.5849, + "step": 4903 + }, + { + "epoch": 0.6543901788097144, + "grad_norm": 0.9290237461844418, + 
"learning_rate": 5.6375360270725324e-06, + "loss": 1.5293, + "step": 4904 + }, + { + "epoch": 0.6545236188951161, + "grad_norm": 1.0552247300637112, + "learning_rate": 5.633647466842108e-06, + "loss": 1.5393, + "step": 4905 + }, + { + "epoch": 0.6546570589805177, + "grad_norm": 0.9311882795001768, + "learning_rate": 5.629759722196629e-06, + "loss": 1.5945, + "step": 4906 + }, + { + "epoch": 0.6547904990659194, + "grad_norm": 6.174325495147954, + "learning_rate": 5.625872793862276e-06, + "loss": 1.5381, + "step": 4907 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 0.9663081495612666, + "learning_rate": 5.621986682565084e-06, + "loss": 1.5634, + "step": 4908 + }, + { + "epoch": 0.6550573792367227, + "grad_norm": 1.0036068699063934, + "learning_rate": 5.61810138903093e-06, + "loss": 1.6021, + "step": 4909 + }, + { + "epoch": 0.6551908193221244, + "grad_norm": 1.0524583355071078, + "learning_rate": 5.614216913985551e-06, + "loss": 1.5163, + "step": 4910 + }, + { + "epoch": 0.655324259407526, + "grad_norm": 0.9691722893349417, + "learning_rate": 5.610333258154519e-06, + "loss": 1.5607, + "step": 4911 + }, + { + "epoch": 0.6554576994929276, + "grad_norm": 0.9220645447522142, + "learning_rate": 5.606450422263251e-06, + "loss": 1.5614, + "step": 4912 + }, + { + "epoch": 0.6555911395783294, + "grad_norm": 0.9449443848226934, + "learning_rate": 5.602568407037025e-06, + "loss": 1.6021, + "step": 4913 + }, + { + "epoch": 0.655724579663731, + "grad_norm": 0.991397619710688, + "learning_rate": 5.598687213200956e-06, + "loss": 1.5747, + "step": 4914 + }, + { + "epoch": 0.6558580197491326, + "grad_norm": 0.9384924843099448, + "learning_rate": 5.594806841480005e-06, + "loss": 1.5914, + "step": 4915 + }, + { + "epoch": 0.6559914598345343, + "grad_norm": 2.1381936866886178, + "learning_rate": 5.5909272925989756e-06, + "loss": 1.5951, + "step": 4916 + }, + { + "epoch": 0.6561248999199359, + "grad_norm": 0.9393256924385065, + "learning_rate": 5.587048567282533e-06, + "loss": 1.5611, + "step": 4917 + }, + { + "epoch": 0.6562583400053376, + "grad_norm": 1.0866095395539688, + "learning_rate": 5.583170666255174e-06, + "loss": 1.5516, + "step": 4918 + }, + { + "epoch": 0.6563917800907393, + "grad_norm": 0.9771179575753627, + "learning_rate": 5.5792935902412485e-06, + "loss": 1.6421, + "step": 4919 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 0.9901820279238563, + "learning_rate": 5.575417339964944e-06, + "loss": 1.573, + "step": 4920 + }, + { + "epoch": 0.6566586602615425, + "grad_norm": 0.9453752857587083, + "learning_rate": 5.5715419161503085e-06, + "loss": 1.5831, + "step": 4921 + }, + { + "epoch": 0.6567921003469442, + "grad_norm": 0.9600381275049558, + "learning_rate": 5.567667319521222e-06, + "loss": 1.5852, + "step": 4922 + }, + { + "epoch": 0.6569255404323459, + "grad_norm": 0.9722922529852711, + "learning_rate": 5.563793550801418e-06, + "loss": 1.5744, + "step": 4923 + }, + { + "epoch": 0.6570589805177476, + "grad_norm": 1.088044217413912, + "learning_rate": 5.559920610714471e-06, + "loss": 1.5541, + "step": 4924 + }, + { + "epoch": 0.6571924206031492, + "grad_norm": 1.0635606707693064, + "learning_rate": 5.5560484999838005e-06, + "loss": 1.5841, + "step": 4925 + }, + { + "epoch": 0.6573258606885508, + "grad_norm": 0.9473218174863391, + "learning_rate": 5.552177219332671e-06, + "loss": 1.5783, + "step": 4926 + }, + { + "epoch": 0.6574593007739525, + "grad_norm": 0.9671296980936441, + "learning_rate": 5.548306769484199e-06, + "loss": 1.5286, + "step": 4927 + }, + { + "epoch": 
0.6575927408593542, + "grad_norm": 0.9206430753905613, + "learning_rate": 5.544437151161339e-06, + "loss": 1.5968, + "step": 4928 + }, + { + "epoch": 0.6577261809447558, + "grad_norm": 0.954056329951995, + "learning_rate": 5.540568365086891e-06, + "loss": 1.5636, + "step": 4929 + }, + { + "epoch": 0.6578596210301575, + "grad_norm": 0.9276580594697565, + "learning_rate": 5.536700411983495e-06, + "loss": 1.6237, + "step": 4930 + }, + { + "epoch": 0.6579930611155591, + "grad_norm": 0.9240637357436527, + "learning_rate": 5.5328332925736495e-06, + "loss": 1.5337, + "step": 4931 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 0.9782170394984411, + "learning_rate": 5.528967007579684e-06, + "loss": 1.5287, + "step": 4932 + }, + { + "epoch": 0.6582599412863624, + "grad_norm": 0.9715908393003854, + "learning_rate": 5.525101557723773e-06, + "loss": 1.6012, + "step": 4933 + }, + { + "epoch": 0.6583933813717641, + "grad_norm": 1.0552772715223717, + "learning_rate": 5.521236943727948e-06, + "loss": 1.6308, + "step": 4934 + }, + { + "epoch": 0.6585268214571657, + "grad_norm": 0.9273214286380065, + "learning_rate": 5.517373166314068e-06, + "loss": 1.5141, + "step": 4935 + }, + { + "epoch": 0.6586602615425674, + "grad_norm": 1.167409050528772, + "learning_rate": 5.513510226203844e-06, + "loss": 1.5365, + "step": 4936 + }, + { + "epoch": 0.658793701627969, + "grad_norm": 0.9613274910119957, + "learning_rate": 5.5096481241188246e-06, + "loss": 1.5552, + "step": 4937 + }, + { + "epoch": 0.6589271417133707, + "grad_norm": 1.0120678736060127, + "learning_rate": 5.505786860780416e-06, + "loss": 1.5682, + "step": 4938 + }, + { + "epoch": 0.6590605817987724, + "grad_norm": 0.9499179626379158, + "learning_rate": 5.5019264369098535e-06, + "loss": 1.5632, + "step": 4939 + }, + { + "epoch": 0.659194021884174, + "grad_norm": 0.9642512741416759, + "learning_rate": 5.4980668532282185e-06, + "loss": 1.5566, + "step": 4940 + }, + { + "epoch": 0.6593274619695757, + "grad_norm": 0.934320051094941, + "learning_rate": 5.494208110456437e-06, + "loss": 1.5399, + "step": 4941 + }, + { + "epoch": 0.6594609020549773, + "grad_norm": 1.0972956472557598, + "learning_rate": 5.490350209315283e-06, + "loss": 1.5875, + "step": 4942 + }, + { + "epoch": 0.6595943421403789, + "grad_norm": 0.9405793372237963, + "learning_rate": 5.486493150525365e-06, + "loss": 1.5474, + "step": 4943 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 0.9802055587437492, + "learning_rate": 5.482636934807135e-06, + "loss": 1.5946, + "step": 4944 + }, + { + "epoch": 0.6598612223111823, + "grad_norm": 0.949526189030937, + "learning_rate": 5.478781562880897e-06, + "loss": 1.595, + "step": 4945 + }, + { + "epoch": 0.6599946623965839, + "grad_norm": 0.9485337756721357, + "learning_rate": 5.474927035466789e-06, + "loss": 1.6176, + "step": 4946 + }, + { + "epoch": 0.6601281024819856, + "grad_norm": 0.917365602362209, + "learning_rate": 5.471073353284788e-06, + "loss": 1.5559, + "step": 4947 + }, + { + "epoch": 0.6602615425673872, + "grad_norm": 0.9752592699248175, + "learning_rate": 5.467220517054719e-06, + "loss": 1.5717, + "step": 4948 + }, + { + "epoch": 0.660394982652789, + "grad_norm": 1.0396268387538175, + "learning_rate": 5.463368527496254e-06, + "loss": 1.5159, + "step": 4949 + }, + { + "epoch": 0.6605284227381906, + "grad_norm": 0.9639723430780847, + "learning_rate": 5.459517385328894e-06, + "loss": 1.5804, + "step": 4950 + }, + { + "epoch": 0.6606618628235922, + "grad_norm": 0.9221523093110253, + "learning_rate": 5.4556670912719885e-06, + "loss": 
1.5503, + "step": 4951 + }, + { + "epoch": 0.6607953029089939, + "grad_norm": 0.9723372937636856, + "learning_rate": 5.451817646044735e-06, + "loss": 1.6308, + "step": 4952 + }, + { + "epoch": 0.6609287429943955, + "grad_norm": 0.9472013736532353, + "learning_rate": 5.447969050366163e-06, + "loss": 1.5603, + "step": 4953 + }, + { + "epoch": 0.6610621830797971, + "grad_norm": 1.2387063918292023, + "learning_rate": 5.444121304955145e-06, + "loss": 1.6083, + "step": 4954 + }, + { + "epoch": 0.6611956231651989, + "grad_norm": 1.2148466148047536, + "learning_rate": 5.440274410530393e-06, + "loss": 1.5535, + "step": 4955 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 0.9349547994328788, + "learning_rate": 5.43642836781047e-06, + "loss": 1.6053, + "step": 4956 + }, + { + "epoch": 0.6614625033360021, + "grad_norm": 0.9546746784572933, + "learning_rate": 5.43258317751377e-06, + "loss": 1.5645, + "step": 4957 + }, + { + "epoch": 0.6615959434214038, + "grad_norm": 0.9571654338821786, + "learning_rate": 5.4287388403585275e-06, + "loss": 1.5331, + "step": 4958 + }, + { + "epoch": 0.6617293835068054, + "grad_norm": 1.0081431486737675, + "learning_rate": 5.424895357062826e-06, + "loss": 1.5648, + "step": 4959 + }, + { + "epoch": 0.661862823592207, + "grad_norm": 1.1640026927531668, + "learning_rate": 5.4210527283445824e-06, + "loss": 1.5421, + "step": 4960 + }, + { + "epoch": 0.6619962636776088, + "grad_norm": 0.9785271528986913, + "learning_rate": 5.417210954921557e-06, + "loss": 1.5635, + "step": 4961 + }, + { + "epoch": 0.6621297037630104, + "grad_norm": 0.9591488914375493, + "learning_rate": 5.413370037511347e-06, + "loss": 1.5291, + "step": 4962 + }, + { + "epoch": 0.6622631438484121, + "grad_norm": 0.9320181389081423, + "learning_rate": 5.409529976831392e-06, + "loss": 1.5519, + "step": 4963 + }, + { + "epoch": 0.6623965839338137, + "grad_norm": 0.9436255066913496, + "learning_rate": 5.4056907735989735e-06, + "loss": 1.587, + "step": 4964 + }, + { + "epoch": 0.6625300240192153, + "grad_norm": 0.9445224040253575, + "learning_rate": 5.401852428531212e-06, + "loss": 1.616, + "step": 4965 + }, + { + "epoch": 0.6626634641046171, + "grad_norm": 1.0814364292819985, + "learning_rate": 5.398014942345064e-06, + "loss": 1.5734, + "step": 4966 + }, + { + "epoch": 0.6627969041900187, + "grad_norm": 0.9584108100106682, + "learning_rate": 5.39417831575733e-06, + "loss": 1.5685, + "step": 4967 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 0.9665649530241158, + "learning_rate": 5.3903425494846485e-06, + "loss": 1.6028, + "step": 4968 + }, + { + "epoch": 0.663063784360822, + "grad_norm": 0.9365312188882325, + "learning_rate": 5.386507644243491e-06, + "loss": 1.5945, + "step": 4969 + }, + { + "epoch": 0.6631972244462236, + "grad_norm": 0.9729492213835405, + "learning_rate": 5.382673600750182e-06, + "loss": 1.5775, + "step": 4970 + }, + { + "epoch": 0.6633306645316253, + "grad_norm": 1.0805444576468706, + "learning_rate": 5.3788404197208744e-06, + "loss": 1.5617, + "step": 4971 + }, + { + "epoch": 0.663464104617027, + "grad_norm": 0.9617923471221628, + "learning_rate": 5.375008101871563e-06, + "loss": 1.5363, + "step": 4972 + }, + { + "epoch": 0.6635975447024286, + "grad_norm": 0.9481741920611069, + "learning_rate": 5.371176647918076e-06, + "loss": 1.5512, + "step": 4973 + }, + { + "epoch": 0.6637309847878302, + "grad_norm": 0.980896728929079, + "learning_rate": 5.367346058576095e-06, + "loss": 1.5675, + "step": 4974 + }, + { + "epoch": 0.6638644248732319, + "grad_norm": 1.2424778825915792, + 
"learning_rate": 5.363516334561125e-06, + "loss": 1.5574, + "step": 4975 + }, + { + "epoch": 0.6639978649586336, + "grad_norm": 1.3069446139543355, + "learning_rate": 5.359687476588511e-06, + "loss": 1.5679, + "step": 4976 + }, + { + "epoch": 0.6641313050440353, + "grad_norm": 0.9334235903224387, + "learning_rate": 5.355859485373449e-06, + "loss": 1.515, + "step": 4977 + }, + { + "epoch": 0.6642647451294369, + "grad_norm": 0.9796056460510058, + "learning_rate": 5.352032361630959e-06, + "loss": 1.5388, + "step": 4978 + }, + { + "epoch": 0.6643981852148385, + "grad_norm": 1.0622315589231197, + "learning_rate": 5.348206106075906e-06, + "loss": 1.6171, + "step": 4979 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 0.9535560005647347, + "learning_rate": 5.344380719422985e-06, + "loss": 1.5387, + "step": 4980 + }, + { + "epoch": 0.6646650653856419, + "grad_norm": 0.9332886144151455, + "learning_rate": 5.340556202386743e-06, + "loss": 1.5437, + "step": 4981 + }, + { + "epoch": 0.6647985054710435, + "grad_norm": 0.926508722116168, + "learning_rate": 5.336732555681552e-06, + "loss": 1.5249, + "step": 4982 + }, + { + "epoch": 0.6649319455564452, + "grad_norm": 0.9597733820765061, + "learning_rate": 5.3329097800216244e-06, + "loss": 1.6112, + "step": 4983 + }, + { + "epoch": 0.6650653856418468, + "grad_norm": 0.9263015315418155, + "learning_rate": 5.329087876121016e-06, + "loss": 1.5548, + "step": 4984 + }, + { + "epoch": 0.6651988257272484, + "grad_norm": 0.9197318999051447, + "learning_rate": 5.325266844693611e-06, + "loss": 1.5108, + "step": 4985 + }, + { + "epoch": 0.6653322658126501, + "grad_norm": 0.9346342407015626, + "learning_rate": 5.321446686453137e-06, + "loss": 1.5587, + "step": 4986 + }, + { + "epoch": 0.6654657058980518, + "grad_norm": 0.9609313535996501, + "learning_rate": 5.31762740211315e-06, + "loss": 1.5212, + "step": 4987 + }, + { + "epoch": 0.6655991459834534, + "grad_norm": 0.9379253903460791, + "learning_rate": 5.313808992387057e-06, + "loss": 1.5553, + "step": 4988 + }, + { + "epoch": 0.6657325860688551, + "grad_norm": 0.9452683100970093, + "learning_rate": 5.309991457988091e-06, + "loss": 1.5827, + "step": 4989 + }, + { + "epoch": 0.6658660261542567, + "grad_norm": 0.9871345858341555, + "learning_rate": 5.306174799629317e-06, + "loss": 1.5048, + "step": 4990 + }, + { + "epoch": 0.6659994662396584, + "grad_norm": 0.9745323174519458, + "learning_rate": 5.302359018023656e-06, + "loss": 1.5655, + "step": 4991 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 1.0874975316674835, + "learning_rate": 5.298544113883845e-06, + "loss": 1.5807, + "step": 4992 + }, + { + "epoch": 0.6662663464104617, + "grad_norm": 0.9359689028295645, + "learning_rate": 5.294730087922464e-06, + "loss": 1.555, + "step": 4993 + }, + { + "epoch": 0.6663997864958634, + "grad_norm": 1.065007282594557, + "learning_rate": 5.290916940851929e-06, + "loss": 1.5666, + "step": 4994 + }, + { + "epoch": 0.666533226581265, + "grad_norm": 1.1502681142848064, + "learning_rate": 5.287104673384498e-06, + "loss": 1.5713, + "step": 4995 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.9479228632028691, + "learning_rate": 5.283293286232254e-06, + "loss": 1.5684, + "step": 4996 + }, + { + "epoch": 0.6668001067520684, + "grad_norm": 0.9987657361216116, + "learning_rate": 5.279482780107119e-06, + "loss": 1.5635, + "step": 4997 + }, + { + "epoch": 0.66693354683747, + "grad_norm": 1.1782973127657173, + "learning_rate": 5.275673155720861e-06, + "loss": 1.5947, + "step": 4998 + }, + { + "epoch": 
0.6670669869228716, + "grad_norm": 0.9525996465669817, + "learning_rate": 5.271864413785068e-06, + "loss": 1.5657, + "step": 4999 + }, + { + "epoch": 0.6672004270082733, + "grad_norm": 0.9683386220692802, + "learning_rate": 5.26805655501117e-06, + "loss": 1.5778, + "step": 5000 + }, + { + "epoch": 0.6673338670936749, + "grad_norm": 1.0500879084213508, + "learning_rate": 5.26424958011043e-06, + "loss": 1.5636, + "step": 5001 + }, + { + "epoch": 0.6674673071790765, + "grad_norm": 0.9531705010447756, + "learning_rate": 5.260443489793953e-06, + "loss": 1.5954, + "step": 5002 + }, + { + "epoch": 0.6676007472644783, + "grad_norm": 0.9421433711002707, + "learning_rate": 5.25663828477267e-06, + "loss": 1.5517, + "step": 5003 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 0.9278026930537797, + "learning_rate": 5.252833965757351e-06, + "loss": 1.583, + "step": 5004 + }, + { + "epoch": 0.6678676274352816, + "grad_norm": 0.9463961992902421, + "learning_rate": 5.249030533458594e-06, + "loss": 1.5259, + "step": 5005 + }, + { + "epoch": 0.6680010675206832, + "grad_norm": 1.019719000874434, + "learning_rate": 5.245227988586845e-06, + "loss": 1.5647, + "step": 5006 + }, + { + "epoch": 0.6681345076060848, + "grad_norm": 0.9199080705366238, + "learning_rate": 5.241426331852373e-06, + "loss": 1.5497, + "step": 5007 + }, + { + "epoch": 0.6682679476914866, + "grad_norm": 0.9388293679619114, + "learning_rate": 5.237625563965285e-06, + "loss": 1.512, + "step": 5008 + }, + { + "epoch": 0.6684013877768882, + "grad_norm": 0.9772164529249927, + "learning_rate": 5.233825685635518e-06, + "loss": 1.5984, + "step": 5009 + }, + { + "epoch": 0.6685348278622898, + "grad_norm": 1.3066296901302348, + "learning_rate": 5.23002669757285e-06, + "loss": 1.5724, + "step": 5010 + }, + { + "epoch": 0.6686682679476915, + "grad_norm": 1.1126937385329367, + "learning_rate": 5.226228600486883e-06, + "loss": 1.5821, + "step": 5011 + }, + { + "epoch": 0.6688017080330931, + "grad_norm": 0.9120825110494507, + "learning_rate": 5.222431395087064e-06, + "loss": 1.5863, + "step": 5012 + }, + { + "epoch": 0.6689351481184947, + "grad_norm": 0.9636066977274008, + "learning_rate": 5.218635082082666e-06, + "loss": 1.592, + "step": 5013 + }, + { + "epoch": 0.6690685882038965, + "grad_norm": 0.9146764316842598, + "learning_rate": 5.214839662182798e-06, + "loss": 1.5786, + "step": 5014 + }, + { + "epoch": 0.6692020282892981, + "grad_norm": 0.9588946756171576, + "learning_rate": 5.211045136096394e-06, + "loss": 1.5927, + "step": 5015 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 0.944621271446456, + "learning_rate": 5.20725150453224e-06, + "loss": 1.51, + "step": 5016 + }, + { + "epoch": 0.6694689084601014, + "grad_norm": 0.9688157625406866, + "learning_rate": 5.2034587681989364e-06, + "loss": 1.5454, + "step": 5017 + }, + { + "epoch": 0.669602348545503, + "grad_norm": 0.9487031537844105, + "learning_rate": 5.199666927804925e-06, + "loss": 1.5289, + "step": 5018 + }, + { + "epoch": 0.6697357886309048, + "grad_norm": 0.9892335986551877, + "learning_rate": 5.195875984058474e-06, + "loss": 1.5918, + "step": 5019 + }, + { + "epoch": 0.6698692287163064, + "grad_norm": 0.9044754479784632, + "learning_rate": 5.192085937667696e-06, + "loss": 1.6101, + "step": 5020 + }, + { + "epoch": 0.670002668801708, + "grad_norm": 0.9397019351239638, + "learning_rate": 5.188296789340523e-06, + "loss": 1.5, + "step": 5021 + }, + { + "epoch": 0.6701361088871097, + "grad_norm": 0.9823103967616779, + "learning_rate": 5.184508539784725e-06, + "loss": 1.6176, + 
"step": 5022 + }, + { + "epoch": 0.6702695489725113, + "grad_norm": 0.9220689248773222, + "learning_rate": 5.180721189707908e-06, + "loss": 1.5349, + "step": 5023 + }, + { + "epoch": 0.670402989057913, + "grad_norm": 0.9481270673997315, + "learning_rate": 5.176934739817503e-06, + "loss": 1.5678, + "step": 5024 + }, + { + "epoch": 0.6705364291433147, + "grad_norm": 1.0396242680439707, + "learning_rate": 5.173149190820776e-06, + "loss": 1.6279, + "step": 5025 + }, + { + "epoch": 0.6706698692287163, + "grad_norm": 0.9646160121743006, + "learning_rate": 5.1693645434248216e-06, + "loss": 1.5928, + "step": 5026 + }, + { + "epoch": 0.6708033093141179, + "grad_norm": 0.9553746604638392, + "learning_rate": 5.165580798336575e-06, + "loss": 1.5881, + "step": 5027 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 0.9360288710325949, + "learning_rate": 5.161797956262793e-06, + "loss": 1.6201, + "step": 5028 + }, + { + "epoch": 0.6710701894849213, + "grad_norm": 1.0854046179543964, + "learning_rate": 5.158016017910064e-06, + "loss": 1.5416, + "step": 5029 + }, + { + "epoch": 0.671203629570323, + "grad_norm": 0.9591439987959153, + "learning_rate": 5.154234983984818e-06, + "loss": 1.5665, + "step": 5030 + }, + { + "epoch": 0.6713370696557246, + "grad_norm": 1.0881353005554923, + "learning_rate": 5.150454855193308e-06, + "loss": 1.5806, + "step": 5031 + }, + { + "epoch": 0.6714705097411262, + "grad_norm": 1.0095150308218221, + "learning_rate": 5.146675632241614e-06, + "loss": 1.5884, + "step": 5032 + }, + { + "epoch": 0.6716039498265279, + "grad_norm": 0.9853452337837284, + "learning_rate": 5.142897315835653e-06, + "loss": 1.5928, + "step": 5033 + }, + { + "epoch": 0.6717373899119295, + "grad_norm": 0.9114971278210366, + "learning_rate": 5.139119906681176e-06, + "loss": 1.5162, + "step": 5034 + }, + { + "epoch": 0.6718708299973312, + "grad_norm": 0.970853734583226, + "learning_rate": 5.135343405483757e-06, + "loss": 1.555, + "step": 5035 + }, + { + "epoch": 0.6720042700827329, + "grad_norm": 0.9830827365486369, + "learning_rate": 5.131567812948805e-06, + "loss": 1.5957, + "step": 5036 + }, + { + "epoch": 0.6721377101681345, + "grad_norm": 1.0109831996031944, + "learning_rate": 5.127793129781551e-06, + "loss": 1.6049, + "step": 5037 + }, + { + "epoch": 0.6722711502535361, + "grad_norm": 0.9273191953375031, + "learning_rate": 5.124019356687073e-06, + "loss": 1.5482, + "step": 5038 + }, + { + "epoch": 0.6724045903389378, + "grad_norm": 0.9530762057231332, + "learning_rate": 5.120246494370264e-06, + "loss": 1.557, + "step": 5039 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 1.009716414085073, + "learning_rate": 5.116474543535848e-06, + "loss": 1.5817, + "step": 5040 + }, + { + "epoch": 0.6726714705097411, + "grad_norm": 0.936535223545121, + "learning_rate": 5.11270350488839e-06, + "loss": 1.5498, + "step": 5041 + }, + { + "epoch": 0.6728049105951428, + "grad_norm": 0.9587135356454837, + "learning_rate": 5.108933379132272e-06, + "loss": 1.6064, + "step": 5042 + }, + { + "epoch": 0.6729383506805444, + "grad_norm": 0.9707574935271643, + "learning_rate": 5.105164166971714e-06, + "loss": 1.4911, + "step": 5043 + }, + { + "epoch": 0.6730717907659461, + "grad_norm": 0.9255255572906976, + "learning_rate": 5.101395869110755e-06, + "loss": 1.5974, + "step": 5044 + }, + { + "epoch": 0.6732052308513478, + "grad_norm": 0.9526351690279754, + "learning_rate": 5.097628486253278e-06, + "loss": 1.583, + "step": 5045 + }, + { + "epoch": 0.6733386709367494, + "grad_norm": 0.9231045207080708, + "learning_rate": 
5.0938620191029865e-06, + "loss": 1.571, + "step": 5046 + }, + { + "epoch": 0.6734721110221511, + "grad_norm": 0.9515238827519511, + "learning_rate": 5.090096468363409e-06, + "loss": 1.6318, + "step": 5047 + }, + { + "epoch": 0.6736055511075527, + "grad_norm": 0.9356804828343049, + "learning_rate": 5.086331834737908e-06, + "loss": 1.5706, + "step": 5048 + }, + { + "epoch": 0.6737389911929543, + "grad_norm": 1.1128834664193885, + "learning_rate": 5.082568118929678e-06, + "loss": 1.5895, + "step": 5049 + }, + { + "epoch": 0.673872431278356, + "grad_norm": 1.1955255342152211, + "learning_rate": 5.078805321641736e-06, + "loss": 1.5635, + "step": 5050 + }, + { + "epoch": 0.6740058713637577, + "grad_norm": 0.9185509120855007, + "learning_rate": 5.075043443576933e-06, + "loss": 1.5417, + "step": 5051 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 0.9576951089769443, + "learning_rate": 5.071282485437938e-06, + "loss": 1.5684, + "step": 5052 + }, + { + "epoch": 0.674272751534561, + "grad_norm": 0.9875819130260579, + "learning_rate": 5.067522447927262e-06, + "loss": 1.5387, + "step": 5053 + }, + { + "epoch": 0.6744061916199626, + "grad_norm": 1.0759454029563662, + "learning_rate": 5.06376333174723e-06, + "loss": 1.5579, + "step": 5054 + }, + { + "epoch": 0.6745396317053642, + "grad_norm": 0.9953780319051102, + "learning_rate": 5.06000513760001e-06, + "loss": 1.5767, + "step": 5055 + }, + { + "epoch": 0.674673071790766, + "grad_norm": 0.9538482284664277, + "learning_rate": 5.056247866187587e-06, + "loss": 1.6003, + "step": 5056 + }, + { + "epoch": 0.6748065118761676, + "grad_norm": 0.9578854545173783, + "learning_rate": 5.0524915182117754e-06, + "loss": 1.5625, + "step": 5057 + }, + { + "epoch": 0.6749399519615693, + "grad_norm": 0.9333496781315979, + "learning_rate": 5.0487360943742135e-06, + "loss": 1.5545, + "step": 5058 + }, + { + "epoch": 0.6750733920469709, + "grad_norm": 0.936433357559595, + "learning_rate": 5.044981595376383e-06, + "loss": 1.571, + "step": 5059 + }, + { + "epoch": 0.6752068321323725, + "grad_norm": 0.9366758168083295, + "learning_rate": 5.0412280219195746e-06, + "loss": 1.6242, + "step": 5060 + }, + { + "epoch": 0.6753402722177743, + "grad_norm": 1.0171664676564722, + "learning_rate": 5.03747537470491e-06, + "loss": 1.578, + "step": 5061 + }, + { + "epoch": 0.6754737123031759, + "grad_norm": 1.2233118469857216, + "learning_rate": 5.033723654433349e-06, + "loss": 1.577, + "step": 5062 + }, + { + "epoch": 0.6756071523885775, + "grad_norm": 0.9036663728010922, + "learning_rate": 5.029972861805665e-06, + "loss": 1.522, + "step": 5063 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 0.9771449490125018, + "learning_rate": 5.026222997522465e-06, + "loss": 1.5776, + "step": 5064 + }, + { + "epoch": 0.6758740325593808, + "grad_norm": 0.9516286727943922, + "learning_rate": 5.022474062284177e-06, + "loss": 1.5614, + "step": 5065 + }, + { + "epoch": 0.6760074726447824, + "grad_norm": 0.9177876450126852, + "learning_rate": 5.018726056791068e-06, + "loss": 1.5596, + "step": 5066 + }, + { + "epoch": 0.6761409127301842, + "grad_norm": 0.9415870237949052, + "learning_rate": 5.014978981743216e-06, + "loss": 1.5728, + "step": 5067 + }, + { + "epoch": 0.6762743528155858, + "grad_norm": 0.9378316877697337, + "learning_rate": 5.011232837840534e-06, + "loss": 1.5742, + "step": 5068 + }, + { + "epoch": 0.6764077929009874, + "grad_norm": 0.9740662444000914, + "learning_rate": 5.007487625782755e-06, + "loss": 1.5946, + "step": 5069 + }, + { + "epoch": 0.6765412329863891, + "grad_norm": 
0.955131305489453, + "learning_rate": 5.003743346269449e-06, + "loss": 1.5925, + "step": 5070 + }, + { + "epoch": 0.6766746730717907, + "grad_norm": 0.9317999097948068, + "learning_rate": 5.000000000000003e-06, + "loss": 1.5604, + "step": 5071 + }, + { + "epoch": 0.6768081131571925, + "grad_norm": 0.9365536795010426, + "learning_rate": 4.9962575876736245e-06, + "loss": 1.6174, + "step": 5072 + }, + { + "epoch": 0.6769415532425941, + "grad_norm": 1.0580187577949391, + "learning_rate": 4.992516109989362e-06, + "loss": 1.5542, + "step": 5073 + }, + { + "epoch": 0.6770749933279957, + "grad_norm": 0.9475023984254052, + "learning_rate": 4.988775567646079e-06, + "loss": 1.5899, + "step": 5074 + }, + { + "epoch": 0.6772084334133974, + "grad_norm": 1.0674090359857158, + "learning_rate": 4.985035961342466e-06, + "loss": 1.5677, + "step": 5075 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 0.9866266273983164, + "learning_rate": 4.981297291777032e-06, + "loss": 1.5378, + "step": 5076 + }, + { + "epoch": 0.6774753135842007, + "grad_norm": 0.9868827012284229, + "learning_rate": 4.977559559648128e-06, + "loss": 1.5811, + "step": 5077 + }, + { + "epoch": 0.6776087536696024, + "grad_norm": 1.1780046727927997, + "learning_rate": 4.973822765653917e-06, + "loss": 1.4914, + "step": 5078 + }, + { + "epoch": 0.677742193755004, + "grad_norm": 0.95257815072713, + "learning_rate": 4.970086910492385e-06, + "loss": 1.5842, + "step": 5079 + }, + { + "epoch": 0.6778756338404056, + "grad_norm": 1.0663204797517163, + "learning_rate": 4.966351994861352e-06, + "loss": 1.6202, + "step": 5080 + }, + { + "epoch": 0.6780090739258073, + "grad_norm": 0.935486976169621, + "learning_rate": 4.962618019458456e-06, + "loss": 1.5488, + "step": 5081 + }, + { + "epoch": 0.678142514011209, + "grad_norm": 0.9770535662216747, + "learning_rate": 4.958884984981163e-06, + "loss": 1.5611, + "step": 5082 + }, + { + "epoch": 0.6782759540966106, + "grad_norm": 0.918514304568767, + "learning_rate": 4.9551528921267545e-06, + "loss": 1.5666, + "step": 5083 + }, + { + "epoch": 0.6784093941820123, + "grad_norm": 0.9318149736553532, + "learning_rate": 4.951421741592353e-06, + "loss": 1.5857, + "step": 5084 + }, + { + "epoch": 0.6785428342674139, + "grad_norm": 13.688631168582868, + "learning_rate": 4.947691534074889e-06, + "loss": 1.6078, + "step": 5085 + }, + { + "epoch": 0.6786762743528156, + "grad_norm": 1.002752650809866, + "learning_rate": 4.94396227027112e-06, + "loss": 1.516, + "step": 5086 + }, + { + "epoch": 0.6788097144382172, + "grad_norm": 1.023591072044742, + "learning_rate": 4.940233950877637e-06, + "loss": 1.5348, + "step": 5087 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 1.1764691952827722, + "learning_rate": 4.936506576590846e-06, + "loss": 1.5717, + "step": 5088 + }, + { + "epoch": 0.6790765946090206, + "grad_norm": 1.029025311630692, + "learning_rate": 4.932780148106975e-06, + "loss": 1.5159, + "step": 5089 + }, + { + "epoch": 0.6792100346944222, + "grad_norm": 0.950650193469443, + "learning_rate": 4.929054666122081e-06, + "loss": 1.5447, + "step": 5090 + }, + { + "epoch": 0.6793434747798238, + "grad_norm": 1.1799569092706195, + "learning_rate": 4.9253301313320355e-06, + "loss": 1.5769, + "step": 5091 + }, + { + "epoch": 0.6794769148652255, + "grad_norm": 0.9432993878667583, + "learning_rate": 4.921606544432549e-06, + "loss": 1.5891, + "step": 5092 + }, + { + "epoch": 0.6796103549506272, + "grad_norm": 0.9465085033881826, + "learning_rate": 4.91788390611914e-06, + "loss": 1.5216, + "step": 5093 + }, + { + "epoch": 
0.6797437950360288, + "grad_norm": 1.0542649886569853, + "learning_rate": 4.914162217087155e-06, + "loss": 1.5736, + "step": 5094 + }, + { + "epoch": 0.6798772351214305, + "grad_norm": 1.0000819446908524, + "learning_rate": 4.910441478031763e-06, + "loss": 1.6619, + "step": 5095 + }, + { + "epoch": 0.6800106752068321, + "grad_norm": 0.9322943527809388, + "learning_rate": 4.906721689647957e-06, + "loss": 1.5867, + "step": 5096 + }, + { + "epoch": 0.6801441152922338, + "grad_norm": 0.9464543046875021, + "learning_rate": 4.903002852630546e-06, + "loss": 1.5638, + "step": 5097 + }, + { + "epoch": 0.6802775553776355, + "grad_norm": 0.9523084416197721, + "learning_rate": 4.899284967674174e-06, + "loss": 1.4874, + "step": 5098 + }, + { + "epoch": 0.6804109954630371, + "grad_norm": 0.9259089895023304, + "learning_rate": 4.895568035473297e-06, + "loss": 1.575, + "step": 5099 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 0.913018189144235, + "learning_rate": 4.891852056722195e-06, + "loss": 1.5759, + "step": 5100 + }, + { + "epoch": 0.6806778756338404, + "grad_norm": 0.9351223685450596, + "learning_rate": 4.888137032114966e-06, + "loss": 1.552, + "step": 5101 + }, + { + "epoch": 0.680811315719242, + "grad_norm": 1.1155154706231596, + "learning_rate": 4.884422962345542e-06, + "loss": 1.5482, + "step": 5102 + }, + { + "epoch": 0.6809447558046438, + "grad_norm": 0.9366870347444463, + "learning_rate": 4.880709848107666e-06, + "loss": 1.5683, + "step": 5103 + }, + { + "epoch": 0.6810781958900454, + "grad_norm": 0.972915796830404, + "learning_rate": 4.876997690094902e-06, + "loss": 1.5796, + "step": 5104 + }, + { + "epoch": 0.681211635975447, + "grad_norm": 0.9605649263345989, + "learning_rate": 4.873286489000645e-06, + "loss": 1.4809, + "step": 5105 + }, + { + "epoch": 0.6813450760608487, + "grad_norm": 1.0145828253220486, + "learning_rate": 4.869576245518101e-06, + "loss": 1.5529, + "step": 5106 + }, + { + "epoch": 0.6814785161462503, + "grad_norm": 1.0299976481166175, + "learning_rate": 4.865866960340304e-06, + "loss": 1.556, + "step": 5107 + }, + { + "epoch": 0.6816119562316519, + "grad_norm": 0.9634555693945733, + "learning_rate": 4.862158634160101e-06, + "loss": 1.5617, + "step": 5108 + }, + { + "epoch": 0.6817453963170537, + "grad_norm": 1.0182839375048582, + "learning_rate": 4.858451267670171e-06, + "loss": 1.5827, + "step": 5109 + }, + { + "epoch": 0.6818788364024553, + "grad_norm": 1.1453431192773607, + "learning_rate": 4.854744861563007e-06, + "loss": 1.5854, + "step": 5110 + }, + { + "epoch": 0.682012276487857, + "grad_norm": 0.9803243409985238, + "learning_rate": 4.851039416530917e-06, + "loss": 1.5329, + "step": 5111 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 1.016213296072417, + "learning_rate": 4.847334933266044e-06, + "loss": 1.6221, + "step": 5112 + }, + { + "epoch": 0.6822791566586602, + "grad_norm": 0.9505609706075121, + "learning_rate": 4.843631412460341e-06, + "loss": 1.5815, + "step": 5113 + }, + { + "epoch": 0.682412596744062, + "grad_norm": 1.0060585154404742, + "learning_rate": 4.839928854805583e-06, + "loss": 1.5788, + "step": 5114 + }, + { + "epoch": 0.6825460368294636, + "grad_norm": 3.7584015873744785, + "learning_rate": 4.836227260993361e-06, + "loss": 1.5487, + "step": 5115 + }, + { + "epoch": 0.6826794769148652, + "grad_norm": 0.9458041427982877, + "learning_rate": 4.8325266317150996e-06, + "loss": 1.5569, + "step": 5116 + }, + { + "epoch": 0.6828129170002669, + "grad_norm": 1.0741204494235426, + "learning_rate": 4.828826967662028e-06, + "loss": 
1.5389, + "step": 5117 + }, + { + "epoch": 0.6829463570856685, + "grad_norm": 1.0796007293741565, + "learning_rate": 4.8251282695252e-06, + "loss": 1.5467, + "step": 5118 + }, + { + "epoch": 0.6830797971710701, + "grad_norm": 0.984104078234117, + "learning_rate": 4.821430537995497e-06, + "loss": 1.5582, + "step": 5119 + }, + { + "epoch": 0.6832132372564719, + "grad_norm": 0.9768350794179331, + "learning_rate": 4.817733773763609e-06, + "loss": 1.5747, + "step": 5120 + }, + { + "epoch": 0.6833466773418735, + "grad_norm": 0.97701351831298, + "learning_rate": 4.8140379775200494e-06, + "loss": 1.562, + "step": 5121 + }, + { + "epoch": 0.6834801174272751, + "grad_norm": 1.1798523816680644, + "learning_rate": 4.8103431499551474e-06, + "loss": 1.5862, + "step": 5122 + }, + { + "epoch": 0.6836135575126768, + "grad_norm": 1.0067654037143134, + "learning_rate": 4.8066492917590635e-06, + "loss": 1.594, + "step": 5123 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 0.9823447933349997, + "learning_rate": 4.802956403621763e-06, + "loss": 1.5824, + "step": 5124 + }, + { + "epoch": 0.6838804376834802, + "grad_norm": 0.9800571652551026, + "learning_rate": 4.799264486233031e-06, + "loss": 1.5426, + "step": 5125 + }, + { + "epoch": 0.6840138777688818, + "grad_norm": 0.9373855164532967, + "learning_rate": 4.795573540282484e-06, + "loss": 1.5959, + "step": 5126 + }, + { + "epoch": 0.6841473178542834, + "grad_norm": 1.1258697616784983, + "learning_rate": 4.791883566459546e-06, + "loss": 1.5546, + "step": 5127 + }, + { + "epoch": 0.6842807579396851, + "grad_norm": 1.0460314104044222, + "learning_rate": 4.7881945654534615e-06, + "loss": 1.5461, + "step": 5128 + }, + { + "epoch": 0.6844141980250867, + "grad_norm": 0.9470018047113221, + "learning_rate": 4.78450653795329e-06, + "loss": 1.5871, + "step": 5129 + }, + { + "epoch": 0.6845476381104884, + "grad_norm": 0.9381571561346944, + "learning_rate": 4.780819484647918e-06, + "loss": 1.5455, + "step": 5130 + }, + { + "epoch": 0.6846810781958901, + "grad_norm": 1.1209989432461438, + "learning_rate": 4.777133406226046e-06, + "loss": 1.6104, + "step": 5131 + }, + { + "epoch": 0.6848145182812917, + "grad_norm": 0.9861145590915344, + "learning_rate": 4.773448303376188e-06, + "loss": 1.6122, + "step": 5132 + }, + { + "epoch": 0.6849479583666933, + "grad_norm": 0.9632711224553404, + "learning_rate": 4.769764176786681e-06, + "loss": 1.5319, + "step": 5133 + }, + { + "epoch": 0.685081398452095, + "grad_norm": 1.0114719533896925, + "learning_rate": 4.7660810271456735e-06, + "loss": 1.5546, + "step": 5134 + }, + { + "epoch": 0.6852148385374967, + "grad_norm": 0.9449667280372048, + "learning_rate": 4.762398855141143e-06, + "loss": 1.4989, + "step": 5135 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 0.925328878656421, + "learning_rate": 4.758717661460874e-06, + "loss": 1.5486, + "step": 5136 + }, + { + "epoch": 0.6854817187083, + "grad_norm": 0.9589142342169662, + "learning_rate": 4.755037446792472e-06, + "loss": 1.5787, + "step": 5137 + }, + { + "epoch": 0.6856151587937016, + "grad_norm": 0.9539174340381931, + "learning_rate": 4.751358211823359e-06, + "loss": 1.4947, + "step": 5138 + }, + { + "epoch": 0.6857485988791033, + "grad_norm": 0.9935890025844325, + "learning_rate": 4.747679957240774e-06, + "loss": 1.5835, + "step": 5139 + }, + { + "epoch": 0.685882038964505, + "grad_norm": 0.9561138767634939, + "learning_rate": 4.744002683731769e-06, + "loss": 1.5068, + "step": 5140 + }, + { + "epoch": 0.6860154790499066, + "grad_norm": 1.0881008249779185, + 
"learning_rate": 4.740326391983225e-06, + "loss": 1.5547, + "step": 5141 + }, + { + "epoch": 0.6861489191353083, + "grad_norm": 1.0331157860816333, + "learning_rate": 4.736651082681826e-06, + "loss": 1.5485, + "step": 5142 + }, + { + "epoch": 0.6862823592207099, + "grad_norm": 0.9726155072774376, + "learning_rate": 4.732976756514077e-06, + "loss": 1.5666, + "step": 5143 + }, + { + "epoch": 0.6864157993061115, + "grad_norm": 0.9698903915052955, + "learning_rate": 4.729303414166305e-06, + "loss": 1.5705, + "step": 5144 + }, + { + "epoch": 0.6865492393915132, + "grad_norm": 0.9739962559971576, + "learning_rate": 4.7256310563246475e-06, + "loss": 1.5984, + "step": 5145 + }, + { + "epoch": 0.6866826794769149, + "grad_norm": 1.0147943743702117, + "learning_rate": 4.721959683675057e-06, + "loss": 1.5452, + "step": 5146 + }, + { + "epoch": 0.6868161195623165, + "grad_norm": 0.9785035074553505, + "learning_rate": 4.718289296903301e-06, + "loss": 1.5578, + "step": 5147 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 0.9329927814076208, + "learning_rate": 4.714619896694974e-06, + "loss": 1.5801, + "step": 5148 + }, + { + "epoch": 0.6870829997331198, + "grad_norm": 0.9379344188241905, + "learning_rate": 4.710951483735474e-06, + "loss": 1.5543, + "step": 5149 + }, + { + "epoch": 0.6872164398185214, + "grad_norm": 1.2323367929741, + "learning_rate": 4.7072840587100146e-06, + "loss": 1.5424, + "step": 5150 + }, + { + "epoch": 0.6873498799039232, + "grad_norm": 0.9866412354494483, + "learning_rate": 4.7036176223036375e-06, + "loss": 1.5593, + "step": 5151 + }, + { + "epoch": 0.6874833199893248, + "grad_norm": 1.0200114322334428, + "learning_rate": 4.699952175201187e-06, + "loss": 1.5624, + "step": 5152 + }, + { + "epoch": 0.6876167600747265, + "grad_norm": 1.0952002464406705, + "learning_rate": 4.696287718087326e-06, + "loss": 1.5471, + "step": 5153 + }, + { + "epoch": 0.6877502001601281, + "grad_norm": 0.9626777049116638, + "learning_rate": 4.6926242516465315e-06, + "loss": 1.5889, + "step": 5154 + }, + { + "epoch": 0.6878836402455297, + "grad_norm": 0.9768971512913973, + "learning_rate": 4.688961776563102e-06, + "loss": 1.5778, + "step": 5155 + }, + { + "epoch": 0.6880170803309315, + "grad_norm": 1.0154475915685601, + "learning_rate": 4.685300293521146e-06, + "loss": 1.5428, + "step": 5156 + }, + { + "epoch": 0.6881505204163331, + "grad_norm": 0.9175407597697326, + "learning_rate": 4.68163980320458e-06, + "loss": 1.5406, + "step": 5157 + }, + { + "epoch": 0.6882839605017347, + "grad_norm": 0.9219686003018891, + "learning_rate": 4.677980306297153e-06, + "loss": 1.5645, + "step": 5158 + }, + { + "epoch": 0.6884174005871364, + "grad_norm": 0.9931694076801224, + "learning_rate": 4.674321803482409e-06, + "loss": 1.5528, + "step": 5159 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 0.9871448513395474, + "learning_rate": 4.670664295443718e-06, + "loss": 1.5606, + "step": 5160 + }, + { + "epoch": 0.6886842807579396, + "grad_norm": 0.9678884161793881, + "learning_rate": 4.667007782864256e-06, + "loss": 1.6019, + "step": 5161 + }, + { + "epoch": 0.6888177208433414, + "grad_norm": 0.9310915342747638, + "learning_rate": 4.663352266427026e-06, + "loss": 1.6019, + "step": 5162 + }, + { + "epoch": 0.688951160928743, + "grad_norm": 0.9779613924239206, + "learning_rate": 4.659697746814832e-06, + "loss": 1.5946, + "step": 5163 + }, + { + "epoch": 0.6890846010141447, + "grad_norm": 0.944018936409953, + "learning_rate": 4.656044224710297e-06, + "loss": 1.6024, + "step": 5164 + }, + { + "epoch": 
0.6892180410995463, + "grad_norm": 0.9489255197624183, + "learning_rate": 4.652391700795854e-06, + "loss": 1.6148, + "step": 5165 + }, + { + "epoch": 0.6893514811849479, + "grad_norm": 0.9554477607577454, + "learning_rate": 4.64874017575376e-06, + "loss": 1.6305, + "step": 5166 + }, + { + "epoch": 0.6894849212703497, + "grad_norm": 0.9578515697117463, + "learning_rate": 4.645089650266074e-06, + "loss": 1.5761, + "step": 5167 + }, + { + "epoch": 0.6896183613557513, + "grad_norm": 1.089494909741061, + "learning_rate": 4.641440125014672e-06, + "loss": 1.552, + "step": 5168 + }, + { + "epoch": 0.6897518014411529, + "grad_norm": 0.9764757676054326, + "learning_rate": 4.637791600681247e-06, + "loss": 1.6047, + "step": 5169 + }, + { + "epoch": 0.6898852415265546, + "grad_norm": 0.9704402753837931, + "learning_rate": 4.634144077947301e-06, + "loss": 1.5901, + "step": 5170 + }, + { + "epoch": 0.6900186816119562, + "grad_norm": 0.9450629354753408, + "learning_rate": 4.630497557494149e-06, + "loss": 1.5672, + "step": 5171 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 0.9265680379145714, + "learning_rate": 4.626852040002915e-06, + "loss": 1.5683, + "step": 5172 + }, + { + "epoch": 0.6902855617827596, + "grad_norm": 1.0282668336579435, + "learning_rate": 4.6232075261545476e-06, + "loss": 1.5511, + "step": 5173 + }, + { + "epoch": 0.6904190018681612, + "grad_norm": 0.9406742239628743, + "learning_rate": 4.6195640166297975e-06, + "loss": 1.5473, + "step": 5174 + }, + { + "epoch": 0.6905524419535628, + "grad_norm": 0.9331259548380249, + "learning_rate": 4.61592151210923e-06, + "loss": 1.5373, + "step": 5175 + }, + { + "epoch": 0.6906858820389645, + "grad_norm": 0.9424269744921971, + "learning_rate": 4.612280013273221e-06, + "loss": 1.5352, + "step": 5176 + }, + { + "epoch": 0.6908193221243661, + "grad_norm": 0.9992155998336348, + "learning_rate": 4.608639520801967e-06, + "loss": 1.5999, + "step": 5177 + }, + { + "epoch": 0.6909527622097679, + "grad_norm": 1.0114712743867609, + "learning_rate": 4.605000035375469e-06, + "loss": 1.5355, + "step": 5178 + }, + { + "epoch": 0.6910862022951695, + "grad_norm": 0.9838800177008744, + "learning_rate": 4.6013615576735385e-06, + "loss": 1.6047, + "step": 5179 + }, + { + "epoch": 0.6912196423805711, + "grad_norm": 0.9546619375864034, + "learning_rate": 4.597724088375803e-06, + "loss": 1.5293, + "step": 5180 + }, + { + "epoch": 0.6913530824659728, + "grad_norm": 1.3199845007698923, + "learning_rate": 4.594087628161702e-06, + "loss": 1.6278, + "step": 5181 + }, + { + "epoch": 0.6914865225513744, + "grad_norm": 0.971042571067602, + "learning_rate": 4.590452177710479e-06, + "loss": 1.5705, + "step": 5182 + }, + { + "epoch": 0.691619962636776, + "grad_norm": 0.9499493704833659, + "learning_rate": 4.586817737701204e-06, + "loss": 1.5513, + "step": 5183 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 1.0111784613671377, + "learning_rate": 4.583184308812742e-06, + "loss": 1.5944, + "step": 5184 + }, + { + "epoch": 0.6918868428075794, + "grad_norm": 0.9428606553786436, + "learning_rate": 4.579551891723781e-06, + "loss": 1.5875, + "step": 5185 + }, + { + "epoch": 0.692020282892981, + "grad_norm": 1.017385930406295, + "learning_rate": 4.575920487112806e-06, + "loss": 1.4778, + "step": 5186 + }, + { + "epoch": 0.6921537229783827, + "grad_norm": 0.9481626619930875, + "learning_rate": 4.572290095658134e-06, + "loss": 1.5244, + "step": 5187 + }, + { + "epoch": 0.6922871630637843, + "grad_norm": 1.0138210258708102, + "learning_rate": 4.568660718037875e-06, + "loss": 
1.5753, + "step": 5188 + }, + { + "epoch": 0.692420603149186, + "grad_norm": 1.0491600377755286, + "learning_rate": 4.565032354929951e-06, + "loss": 1.5974, + "step": 5189 + }, + { + "epoch": 0.6925540432345877, + "grad_norm": 1.2073763312887016, + "learning_rate": 4.561405007012109e-06, + "loss": 1.5599, + "step": 5190 + }, + { + "epoch": 0.6926874833199893, + "grad_norm": 1.0293910333928922, + "learning_rate": 4.55777867496189e-06, + "loss": 1.5523, + "step": 5191 + }, + { + "epoch": 0.692820923405391, + "grad_norm": 0.9189903325399789, + "learning_rate": 4.5541533594566535e-06, + "loss": 1.5528, + "step": 5192 + }, + { + "epoch": 0.6929543634907926, + "grad_norm": 0.9080583203789369, + "learning_rate": 4.550529061173562e-06, + "loss": 1.5131, + "step": 5193 + }, + { + "epoch": 0.6930878035761943, + "grad_norm": 0.952626918394523, + "learning_rate": 4.5469057807896e-06, + "loss": 1.5493, + "step": 5194 + }, + { + "epoch": 0.693221243661596, + "grad_norm": 0.9394419115149728, + "learning_rate": 4.5432835189815535e-06, + "loss": 1.5554, + "step": 5195 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 0.9546357659052805, + "learning_rate": 4.539662276426018e-06, + "loss": 1.5209, + "step": 5196 + }, + { + "epoch": 0.6934881238323992, + "grad_norm": 1.003350765525599, + "learning_rate": 4.536042053799398e-06, + "loss": 1.5391, + "step": 5197 + }, + { + "epoch": 0.6936215639178009, + "grad_norm": 0.9836741751211436, + "learning_rate": 4.532422851777916e-06, + "loss": 1.5573, + "step": 5198 + }, + { + "epoch": 0.6937550040032026, + "grad_norm": 0.9574473385195087, + "learning_rate": 4.528804671037594e-06, + "loss": 1.5952, + "step": 5199 + }, + { + "epoch": 0.6938884440886042, + "grad_norm": 0.9311362883331841, + "learning_rate": 4.525187512254266e-06, + "loss": 1.5085, + "step": 5200 + }, + { + "epoch": 0.6940218841740059, + "grad_norm": 0.9609583738468779, + "learning_rate": 4.521571376103578e-06, + "loss": 1.5997, + "step": 5201 + }, + { + "epoch": 0.6941553242594075, + "grad_norm": 0.9375491622362763, + "learning_rate": 4.517956263260985e-06, + "loss": 1.5734, + "step": 5202 + }, + { + "epoch": 0.6942887643448091, + "grad_norm": 0.9381515614250917, + "learning_rate": 4.514342174401747e-06, + "loss": 1.5768, + "step": 5203 + }, + { + "epoch": 0.6944222044302109, + "grad_norm": 0.9464641129012018, + "learning_rate": 4.510729110200929e-06, + "loss": 1.5765, + "step": 5204 + }, + { + "epoch": 0.6945556445156125, + "grad_norm": 0.9461797676889874, + "learning_rate": 4.50711707133342e-06, + "loss": 1.5685, + "step": 5205 + }, + { + "epoch": 0.6946890846010142, + "grad_norm": 0.9925712528011031, + "learning_rate": 4.503506058473903e-06, + "loss": 1.53, + "step": 5206 + }, + { + "epoch": 0.6948225246864158, + "grad_norm": 0.932476678094929, + "learning_rate": 4.499896072296871e-06, + "loss": 1.5794, + "step": 5207 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 1.0005881114895052, + "learning_rate": 4.496287113476635e-06, + "loss": 1.5897, + "step": 5208 + }, + { + "epoch": 0.6950894048572192, + "grad_norm": 0.945425425153403, + "learning_rate": 4.492679182687304e-06, + "loss": 1.5572, + "step": 5209 + }, + { + "epoch": 0.6952228449426208, + "grad_norm": 0.9279450617465351, + "learning_rate": 4.489072280602799e-06, + "loss": 1.5723, + "step": 5210 + }, + { + "epoch": 0.6953562850280224, + "grad_norm": 0.9947713480204747, + "learning_rate": 4.485466407896844e-06, + "loss": 1.5525, + "step": 5211 + }, + { + "epoch": 0.6954897251134241, + "grad_norm": 1.0073089670685307, + 
"learning_rate": 4.481861565242982e-06, + "loss": 1.538, + "step": 5212 + }, + { + "epoch": 0.6956231651988257, + "grad_norm": 0.9775029711915599, + "learning_rate": 4.478257753314554e-06, + "loss": 1.478, + "step": 5213 + }, + { + "epoch": 0.6957566052842273, + "grad_norm": 1.1051713607759426, + "learning_rate": 4.474654972784705e-06, + "loss": 1.578, + "step": 5214 + }, + { + "epoch": 0.6958900453696291, + "grad_norm": 0.9162253285174159, + "learning_rate": 4.471053224326404e-06, + "loss": 1.5747, + "step": 5215 + }, + { + "epoch": 0.6960234854550307, + "grad_norm": 0.9337462353761629, + "learning_rate": 4.46745250861241e-06, + "loss": 1.5675, + "step": 5216 + }, + { + "epoch": 0.6961569255404323, + "grad_norm": 1.0028329737784027, + "learning_rate": 4.463852826315298e-06, + "loss": 1.5358, + "step": 5217 + }, + { + "epoch": 0.696290365625834, + "grad_norm": 0.9098824085690614, + "learning_rate": 4.460254178107446e-06, + "loss": 1.5468, + "step": 5218 + }, + { + "epoch": 0.6964238057112356, + "grad_norm": 0.954411548951353, + "learning_rate": 4.456656564661037e-06, + "loss": 1.5395, + "step": 5219 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 0.9723768754629288, + "learning_rate": 4.453059986648073e-06, + "loss": 1.5383, + "step": 5220 + }, + { + "epoch": 0.696690685882039, + "grad_norm": 0.9714260510102523, + "learning_rate": 4.449464444740349e-06, + "loss": 1.5604, + "step": 5221 + }, + { + "epoch": 0.6968241259674406, + "grad_norm": 0.9812563625331769, + "learning_rate": 4.445869939609472e-06, + "loss": 1.5926, + "step": 5222 + }, + { + "epoch": 0.6969575660528423, + "grad_norm": 1.1501049604817195, + "learning_rate": 4.442276471926853e-06, + "loss": 1.5484, + "step": 5223 + }, + { + "epoch": 0.6970910061382439, + "grad_norm": 0.9626957186688674, + "learning_rate": 4.4386840423637124e-06, + "loss": 1.5799, + "step": 5224 + }, + { + "epoch": 0.6972244462236455, + "grad_norm": 1.0216025455702171, + "learning_rate": 4.435092651591072e-06, + "loss": 1.5409, + "step": 5225 + }, + { + "epoch": 0.6973578863090473, + "grad_norm": 0.9485116910427717, + "learning_rate": 4.4315023002797675e-06, + "loss": 1.5487, + "step": 5226 + }, + { + "epoch": 0.6974913263944489, + "grad_norm": 0.9715863993865531, + "learning_rate": 4.427912989100434e-06, + "loss": 1.596, + "step": 5227 + }, + { + "epoch": 0.6976247664798505, + "grad_norm": 0.968567527529441, + "learning_rate": 4.42432471872351e-06, + "loss": 1.5294, + "step": 5228 + }, + { + "epoch": 0.6977582065652522, + "grad_norm": 0.9687730959837667, + "learning_rate": 4.420737489819253e-06, + "loss": 1.5265, + "step": 5229 + }, + { + "epoch": 0.6978916466506538, + "grad_norm": 1.0885904607859849, + "learning_rate": 4.417151303057707e-06, + "loss": 1.5883, + "step": 5230 + }, + { + "epoch": 0.6980250867360555, + "grad_norm": 1.025218940735289, + "learning_rate": 4.413566159108736e-06, + "loss": 1.514, + "step": 5231 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 2.751796441629812, + "learning_rate": 4.409982058641999e-06, + "loss": 1.5411, + "step": 5232 + }, + { + "epoch": 0.6982919669068588, + "grad_norm": 0.9234092508289905, + "learning_rate": 4.406399002326973e-06, + "loss": 1.5368, + "step": 5233 + }, + { + "epoch": 0.6984254069922605, + "grad_norm": 1.0403344284539555, + "learning_rate": 4.402816990832928e-06, + "loss": 1.5848, + "step": 5234 + }, + { + "epoch": 0.6985588470776621, + "grad_norm": 0.9148227947424791, + "learning_rate": 4.3992360248289416e-06, + "loss": 1.5145, + "step": 5235 + }, + { + "epoch": 0.6986922871630638, + 
"grad_norm": 0.9779207787661259, + "learning_rate": 4.395656104983895e-06, + "loss": 1.5853, + "step": 5236 + }, + { + "epoch": 0.6988257272484655, + "grad_norm": 1.111013896532024, + "learning_rate": 4.392077231966484e-06, + "loss": 1.5895, + "step": 5237 + }, + { + "epoch": 0.6989591673338671, + "grad_norm": 1.0099682804012988, + "learning_rate": 4.388499406445198e-06, + "loss": 1.5454, + "step": 5238 + }, + { + "epoch": 0.6990926074192687, + "grad_norm": 0.9508070443731435, + "learning_rate": 4.384922629088329e-06, + "loss": 1.5156, + "step": 5239 + }, + { + "epoch": 0.6992260475046704, + "grad_norm": 0.9772725704881033, + "learning_rate": 4.381346900563988e-06, + "loss": 1.5688, + "step": 5240 + }, + { + "epoch": 0.699359487590072, + "grad_norm": 1.1327443354133715, + "learning_rate": 4.377772221540073e-06, + "loss": 1.5734, + "step": 5241 + }, + { + "epoch": 0.6994929276754737, + "grad_norm": 1.080615395417145, + "learning_rate": 4.374198592684296e-06, + "loss": 1.5525, + "step": 5242 + }, + { + "epoch": 0.6996263677608754, + "grad_norm": 0.9705398879426412, + "learning_rate": 4.370626014664168e-06, + "loss": 1.5905, + "step": 5243 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 0.9423985690259806, + "learning_rate": 4.36705448814701e-06, + "loss": 1.5543, + "step": 5244 + }, + { + "epoch": 0.6998932479316787, + "grad_norm": 1.0013253966326623, + "learning_rate": 4.36348401379994e-06, + "loss": 1.6252, + "step": 5245 + }, + { + "epoch": 0.7000266880170803, + "grad_norm": 0.9849762775796472, + "learning_rate": 4.359914592289879e-06, + "loss": 1.5735, + "step": 5246 + }, + { + "epoch": 0.700160128102482, + "grad_norm": 0.9466224944504175, + "learning_rate": 4.3563462242835605e-06, + "loss": 1.5474, + "step": 5247 + }, + { + "epoch": 0.7002935681878837, + "grad_norm": 1.0452884135924572, + "learning_rate": 4.352778910447513e-06, + "loss": 1.55, + "step": 5248 + }, + { + "epoch": 0.7004270082732853, + "grad_norm": 0.9636258481231191, + "learning_rate": 4.349212651448067e-06, + "loss": 1.5592, + "step": 5249 + }, + { + "epoch": 0.7005604483586869, + "grad_norm": 0.9266243703586854, + "learning_rate": 4.345647447951359e-06, + "loss": 1.5426, + "step": 5250 + }, + { + "epoch": 0.7006938884440886, + "grad_norm": 0.9465277944454074, + "learning_rate": 4.3420833006233335e-06, + "loss": 1.5223, + "step": 5251 + }, + { + "epoch": 0.7008273285294903, + "grad_norm": 0.9476657743051038, + "learning_rate": 4.338520210129729e-06, + "loss": 1.5138, + "step": 5252 + }, + { + "epoch": 0.7009607686148919, + "grad_norm": 0.92673177526777, + "learning_rate": 4.33495817713609e-06, + "loss": 1.5719, + "step": 5253 + }, + { + "epoch": 0.7010942087002936, + "grad_norm": 1.1410712640987881, + "learning_rate": 4.3313972023077656e-06, + "loss": 1.5309, + "step": 5254 + }, + { + "epoch": 0.7012276487856952, + "grad_norm": 0.8997294098548428, + "learning_rate": 4.327837286309905e-06, + "loss": 1.5379, + "step": 5255 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 1.117573499608535, + "learning_rate": 4.324278429807459e-06, + "loss": 1.5973, + "step": 5256 + }, + { + "epoch": 0.7014945289564986, + "grad_norm": 0.9591244120794552, + "learning_rate": 4.320720633465178e-06, + "loss": 1.576, + "step": 5257 + }, + { + "epoch": 0.7016279690419002, + "grad_norm": 1.246892551311886, + "learning_rate": 4.317163897947626e-06, + "loss": 1.5957, + "step": 5258 + }, + { + "epoch": 0.7017614091273019, + "grad_norm": 0.9650925501800929, + "learning_rate": 4.3136082239191565e-06, + "loss": 1.5404, + "step": 5259 + }, + 
{ + "epoch": 0.7018948492127035, + "grad_norm": 0.937754472050194, + "learning_rate": 4.310053612043928e-06, + "loss": 1.5491, + "step": 5260 + }, + { + "epoch": 0.7020282892981051, + "grad_norm": 0.918105960005196, + "learning_rate": 4.306500062985903e-06, + "loss": 1.5244, + "step": 5261 + }, + { + "epoch": 0.7021617293835068, + "grad_norm": 0.9586676779477473, + "learning_rate": 4.302947577408839e-06, + "loss": 1.6228, + "step": 5262 + }, + { + "epoch": 0.7022951694689085, + "grad_norm": 0.9517519387872295, + "learning_rate": 4.299396155976308e-06, + "loss": 1.6056, + "step": 5263 + }, + { + "epoch": 0.7024286095543101, + "grad_norm": 1.495533408695814, + "learning_rate": 4.295845799351672e-06, + "loss": 1.5421, + "step": 5264 + }, + { + "epoch": 0.7025620496397118, + "grad_norm": 0.9864875327078859, + "learning_rate": 4.2922965081980964e-06, + "loss": 1.5468, + "step": 5265 + }, + { + "epoch": 0.7026954897251134, + "grad_norm": 0.9301887930563217, + "learning_rate": 4.288748283178546e-06, + "loss": 1.5442, + "step": 5266 + }, + { + "epoch": 0.702828929810515, + "grad_norm": 0.9633255736311295, + "learning_rate": 4.285201124955795e-06, + "loss": 1.6113, + "step": 5267 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 0.9458457528853886, + "learning_rate": 4.281655034192402e-06, + "loss": 1.5682, + "step": 5268 + }, + { + "epoch": 0.7030958099813184, + "grad_norm": 0.9396435856415133, + "learning_rate": 4.278110011550748e-06, + "loss": 1.5206, + "step": 5269 + }, + { + "epoch": 0.70322925006672, + "grad_norm": 0.932365193699769, + "learning_rate": 4.274566057692996e-06, + "loss": 1.5637, + "step": 5270 + }, + { + "epoch": 0.7033626901521217, + "grad_norm": 0.9356770658793856, + "learning_rate": 4.271023173281116e-06, + "loss": 1.5513, + "step": 5271 + }, + { + "epoch": 0.7034961302375233, + "grad_norm": 0.9470539843139907, + "learning_rate": 4.267481358976883e-06, + "loss": 1.5313, + "step": 5272 + }, + { + "epoch": 0.7036295703229251, + "grad_norm": 0.936074803599193, + "learning_rate": 4.263940615441865e-06, + "loss": 1.5915, + "step": 5273 + }, + { + "epoch": 0.7037630104083267, + "grad_norm": 0.9601362770329818, + "learning_rate": 4.260400943337433e-06, + "loss": 1.5683, + "step": 5274 + }, + { + "epoch": 0.7038964504937283, + "grad_norm": 0.9200436045608203, + "learning_rate": 4.2568623433247534e-06, + "loss": 1.5353, + "step": 5275 + }, + { + "epoch": 0.70402989057913, + "grad_norm": 1.2183542283592865, + "learning_rate": 4.253324816064803e-06, + "loss": 1.5893, + "step": 5276 + }, + { + "epoch": 0.7041633306645316, + "grad_norm": 0.9608670542983757, + "learning_rate": 4.24978836221835e-06, + "loss": 1.5719, + "step": 5277 + }, + { + "epoch": 0.7042967707499332, + "grad_norm": 1.0831628265033406, + "learning_rate": 4.246252982445957e-06, + "loss": 1.5496, + "step": 5278 + }, + { + "epoch": 0.704430210835335, + "grad_norm": 0.9288733637532146, + "learning_rate": 4.242718677408002e-06, + "loss": 1.5955, + "step": 5279 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 0.9220556310697627, + "learning_rate": 4.23918544776465e-06, + "loss": 1.5466, + "step": 5280 + }, + { + "epoch": 0.7046970910061382, + "grad_norm": 1.0053641177822847, + "learning_rate": 4.235653294175866e-06, + "loss": 1.5668, + "step": 5281 + }, + { + "epoch": 0.7048305310915399, + "grad_norm": 0.9592861362485582, + "learning_rate": 4.232122217301414e-06, + "loss": 1.5207, + "step": 5282 + }, + { + "epoch": 0.7049639711769415, + "grad_norm": 1.003209125290528, + "learning_rate": 4.228592217800865e-06, + 
"loss": 1.5636, + "step": 5283 + }, + { + "epoch": 0.7050974112623432, + "grad_norm": 0.9377732106678393, + "learning_rate": 4.2250632963335805e-06, + "loss": 1.5727, + "step": 5284 + }, + { + "epoch": 0.7052308513477449, + "grad_norm": 10.633257878300137, + "learning_rate": 4.221535453558718e-06, + "loss": 1.6243, + "step": 5285 + }, + { + "epoch": 0.7053642914331465, + "grad_norm": 0.9780546181932777, + "learning_rate": 4.218008690135247e-06, + "loss": 1.5263, + "step": 5286 + }, + { + "epoch": 0.7054977315185482, + "grad_norm": 1.016085908744265, + "learning_rate": 4.214483006721921e-06, + "loss": 1.5835, + "step": 5287 + }, + { + "epoch": 0.7056311716039498, + "grad_norm": 0.946908828245464, + "learning_rate": 4.2109584039773e-06, + "loss": 1.6154, + "step": 5288 + }, + { + "epoch": 0.7057646116893515, + "grad_norm": 0.9794302729269099, + "learning_rate": 4.207434882559734e-06, + "loss": 1.5523, + "step": 5289 + }, + { + "epoch": 0.7058980517747532, + "grad_norm": 1.1316618652316528, + "learning_rate": 4.2039124431273845e-06, + "loss": 1.5279, + "step": 5290 + }, + { + "epoch": 0.7060314918601548, + "grad_norm": 0.9378659449314672, + "learning_rate": 4.2003910863382005e-06, + "loss": 1.565, + "step": 5291 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 0.9503258372477722, + "learning_rate": 4.1968708128499256e-06, + "loss": 1.5009, + "step": 5292 + }, + { + "epoch": 0.7062983720309581, + "grad_norm": 0.9519765695098307, + "learning_rate": 4.1933516233201165e-06, + "loss": 1.6068, + "step": 5293 + }, + { + "epoch": 0.7064318121163597, + "grad_norm": 1.1619371544024075, + "learning_rate": 4.189833518406113e-06, + "loss": 1.5333, + "step": 5294 + }, + { + "epoch": 0.7065652522017614, + "grad_norm": 0.9543539047132761, + "learning_rate": 4.1863164987650575e-06, + "loss": 1.5793, + "step": 5295 + }, + { + "epoch": 0.7066986922871631, + "grad_norm": 0.9515900226999982, + "learning_rate": 4.182800565053884e-06, + "loss": 1.5893, + "step": 5296 + }, + { + "epoch": 0.7068321323725647, + "grad_norm": 0.9355762162950707, + "learning_rate": 4.179285717929338e-06, + "loss": 1.5553, + "step": 5297 + }, + { + "epoch": 0.7069655724579663, + "grad_norm": 0.9421727010797218, + "learning_rate": 4.175771958047947e-06, + "loss": 1.5825, + "step": 5298 + }, + { + "epoch": 0.707099012543368, + "grad_norm": 1.0799208667442957, + "learning_rate": 4.172259286066045e-06, + "loss": 1.5503, + "step": 5299 + }, + { + "epoch": 0.7072324526287697, + "grad_norm": 0.9679399491700288, + "learning_rate": 4.168747702639753e-06, + "loss": 1.5524, + "step": 5300 + }, + { + "epoch": 0.7073658927141714, + "grad_norm": 0.9691270167436293, + "learning_rate": 4.165237208425001e-06, + "loss": 1.5649, + "step": 5301 + }, + { + "epoch": 0.707499332799573, + "grad_norm": 0.9311122890360443, + "learning_rate": 4.1617278040775086e-06, + "loss": 1.514, + "step": 5302 + }, + { + "epoch": 0.7076327728849746, + "grad_norm": 0.8967366726125487, + "learning_rate": 4.158219490252791e-06, + "loss": 1.5358, + "step": 5303 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 0.9509487136930983, + "learning_rate": 4.154712267606161e-06, + "loss": 1.5899, + "step": 5304 + }, + { + "epoch": 0.707899653055778, + "grad_norm": 0.9712447192685107, + "learning_rate": 4.151206136792725e-06, + "loss": 1.5993, + "step": 5305 + }, + { + "epoch": 0.7080330931411796, + "grad_norm": 0.9453208811579048, + "learning_rate": 4.147701098467395e-06, + "loss": 1.5903, + "step": 5306 + }, + { + "epoch": 0.7081665332265813, + "grad_norm": 0.955132045264726, + 
"learning_rate": 4.144197153284869e-06, + "loss": 1.513, + "step": 5307 + }, + { + "epoch": 0.7082999733119829, + "grad_norm": 0.9400188884444305, + "learning_rate": 4.140694301899645e-06, + "loss": 1.551, + "step": 5308 + }, + { + "epoch": 0.7084334133973845, + "grad_norm": 0.9193593248190934, + "learning_rate": 4.137192544966013e-06, + "loss": 1.585, + "step": 5309 + }, + { + "epoch": 0.7085668534827863, + "grad_norm": 0.9312434980756322, + "learning_rate": 4.13369188313806e-06, + "loss": 1.5386, + "step": 5310 + }, + { + "epoch": 0.7087002935681879, + "grad_norm": 0.9271931561265911, + "learning_rate": 4.130192317069677e-06, + "loss": 1.5208, + "step": 5311 + }, + { + "epoch": 0.7088337336535896, + "grad_norm": 0.9970719656664363, + "learning_rate": 4.126693847414538e-06, + "loss": 1.5826, + "step": 5312 + }, + { + "epoch": 0.7089671737389912, + "grad_norm": 0.9479786344427346, + "learning_rate": 4.123196474826119e-06, + "loss": 1.5403, + "step": 5313 + }, + { + "epoch": 0.7091006138243928, + "grad_norm": 0.8979702441638332, + "learning_rate": 4.119700199957684e-06, + "loss": 1.5509, + "step": 5314 + }, + { + "epoch": 0.7092340539097945, + "grad_norm": 0.9895920970366872, + "learning_rate": 4.116205023462306e-06, + "loss": 1.5785, + "step": 5315 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 0.9499405479425314, + "learning_rate": 4.112710945992842e-06, + "loss": 1.5824, + "step": 5316 + }, + { + "epoch": 0.7095009340805978, + "grad_norm": 0.9355717622213187, + "learning_rate": 4.109217968201937e-06, + "loss": 1.5706, + "step": 5317 + }, + { + "epoch": 0.7096343741659995, + "grad_norm": 0.9477283832957076, + "learning_rate": 4.1057260907420524e-06, + "loss": 1.497, + "step": 5318 + }, + { + "epoch": 0.7097678142514011, + "grad_norm": 0.9551646576385708, + "learning_rate": 4.102235314265425e-06, + "loss": 1.5367, + "step": 5319 + }, + { + "epoch": 0.7099012543368027, + "grad_norm": 0.9248106472529288, + "learning_rate": 4.098745639424091e-06, + "loss": 1.5693, + "step": 5320 + }, + { + "epoch": 0.7100346944222045, + "grad_norm": 0.963099005180929, + "learning_rate": 4.095257066869881e-06, + "loss": 1.573, + "step": 5321 + }, + { + "epoch": 0.7101681345076061, + "grad_norm": 0.9687592868939011, + "learning_rate": 4.091769597254426e-06, + "loss": 1.5924, + "step": 5322 + }, + { + "epoch": 0.7103015745930077, + "grad_norm": 0.9896034088236875, + "learning_rate": 4.088283231229142e-06, + "loss": 1.5621, + "step": 5323 + }, + { + "epoch": 0.7104350146784094, + "grad_norm": 0.9341590041901182, + "learning_rate": 4.08479796944524e-06, + "loss": 1.552, + "step": 5324 + }, + { + "epoch": 0.710568454763811, + "grad_norm": 0.9304265266280125, + "learning_rate": 4.081313812553734e-06, + "loss": 1.5652, + "step": 5325 + }, + { + "epoch": 0.7107018948492128, + "grad_norm": 0.9605863893931225, + "learning_rate": 4.07783076120542e-06, + "loss": 1.5676, + "step": 5326 + }, + { + "epoch": 0.7108353349346144, + "grad_norm": 1.0997008119225524, + "learning_rate": 4.074348816050895e-06, + "loss": 1.6232, + "step": 5327 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 0.9045291420444287, + "learning_rate": 4.07086797774054e-06, + "loss": 1.5255, + "step": 5328 + }, + { + "epoch": 0.7111022151054177, + "grad_norm": 0.9985345986185172, + "learning_rate": 4.067388246924545e-06, + "loss": 1.6026, + "step": 5329 + }, + { + "epoch": 0.7112356551908193, + "grad_norm": 0.9841974283402879, + "learning_rate": 4.063909624252881e-06, + "loss": 1.5454, + "step": 5330 + }, + { + "epoch": 0.7113690952762209, + 
"grad_norm": 0.9526315404945778, + "learning_rate": 4.060432110375314e-06, + "loss": 1.5019, + "step": 5331 + }, + { + "epoch": 0.7115025353616227, + "grad_norm": 0.9491732500227824, + "learning_rate": 4.0569557059414e-06, + "loss": 1.5423, + "step": 5332 + }, + { + "epoch": 0.7116359754470243, + "grad_norm": 0.9491096007519367, + "learning_rate": 4.0534804116005e-06, + "loss": 1.5221, + "step": 5333 + }, + { + "epoch": 0.7117694155324259, + "grad_norm": 0.9284527190069253, + "learning_rate": 4.0500062280017545e-06, + "loss": 1.5625, + "step": 5334 + }, + { + "epoch": 0.7119028556178276, + "grad_norm": 0.9759261683989565, + "learning_rate": 4.0465331557941e-06, + "loss": 1.5516, + "step": 5335 + }, + { + "epoch": 0.7120362957032292, + "grad_norm": 1.0927488467118913, + "learning_rate": 4.0430611956262735e-06, + "loss": 1.5364, + "step": 5336 + }, + { + "epoch": 0.7121697357886309, + "grad_norm": 2.0805229151915445, + "learning_rate": 4.039590348146791e-06, + "loss": 1.5377, + "step": 5337 + }, + { + "epoch": 0.7123031758740326, + "grad_norm": 0.9353765748626967, + "learning_rate": 4.036120614003972e-06, + "loss": 1.571, + "step": 5338 + }, + { + "epoch": 0.7124366159594342, + "grad_norm": 0.9903701860089882, + "learning_rate": 4.032651993845917e-06, + "loss": 1.5247, + "step": 5339 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 0.9235475362427148, + "learning_rate": 4.029184488320531e-06, + "loss": 1.515, + "step": 5340 + }, + { + "epoch": 0.7127034961302375, + "grad_norm": 0.9026258019298207, + "learning_rate": 4.0257180980755025e-06, + "loss": 1.5408, + "step": 5341 + }, + { + "epoch": 0.7128369362156391, + "grad_norm": 1.2538215164072486, + "learning_rate": 4.022252823758311e-06, + "loss": 1.6143, + "step": 5342 + }, + { + "epoch": 0.7129703763010409, + "grad_norm": 0.9545237012787869, + "learning_rate": 4.018788666016236e-06, + "loss": 1.5585, + "step": 5343 + }, + { + "epoch": 0.7131038163864425, + "grad_norm": 0.9280503299040727, + "learning_rate": 4.015325625496339e-06, + "loss": 1.573, + "step": 5344 + }, + { + "epoch": 0.7132372564718441, + "grad_norm": 0.9990855517950481, + "learning_rate": 4.011863702845477e-06, + "loss": 1.5783, + "step": 5345 + }, + { + "epoch": 0.7133706965572458, + "grad_norm": 0.9614841419311843, + "learning_rate": 4.008402898710299e-06, + "loss": 1.5716, + "step": 5346 + }, + { + "epoch": 0.7135041366426474, + "grad_norm": 0.9450318885739145, + "learning_rate": 4.004943213737238e-06, + "loss": 1.5561, + "step": 5347 + }, + { + "epoch": 0.7136375767280491, + "grad_norm": 0.9957058765830085, + "learning_rate": 4.001484648572532e-06, + "loss": 1.5563, + "step": 5348 + }, + { + "epoch": 0.7137710168134508, + "grad_norm": 1.0067457396072523, + "learning_rate": 3.998027203862199e-06, + "loss": 1.5962, + "step": 5349 + }, + { + "epoch": 0.7139044568988524, + "grad_norm": 0.9853384038154768, + "learning_rate": 3.994570880252049e-06, + "loss": 1.5547, + "step": 5350 + }, + { + "epoch": 0.714037896984254, + "grad_norm": 0.9554713328733841, + "learning_rate": 3.991115678387684e-06, + "loss": 1.5627, + "step": 5351 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 1.0861226656873586, + "learning_rate": 3.987661598914496e-06, + "loss": 1.6133, + "step": 5352 + }, + { + "epoch": 0.7143047771550574, + "grad_norm": 0.9363068359607128, + "learning_rate": 3.984208642477665e-06, + "loss": 1.5493, + "step": 5353 + }, + { + "epoch": 0.7144382172404591, + "grad_norm": 0.9496779214889088, + "learning_rate": 3.9807568097221705e-06, + "loss": 1.5819, + "step": 5354 + 
}, + { + "epoch": 0.7145716573258607, + "grad_norm": 0.9293168489795315, + "learning_rate": 3.977306101292773e-06, + "loss": 1.5272, + "step": 5355 + }, + { + "epoch": 0.7147050974112623, + "grad_norm": 0.962632697448326, + "learning_rate": 3.973856517834021e-06, + "loss": 1.5798, + "step": 5356 + }, + { + "epoch": 0.714838537496664, + "grad_norm": 0.9584702412606565, + "learning_rate": 3.9704080599902635e-06, + "loss": 1.5512, + "step": 5357 + }, + { + "epoch": 0.7149719775820657, + "grad_norm": 0.9815244757813827, + "learning_rate": 3.966960728405633e-06, + "loss": 1.6353, + "step": 5358 + }, + { + "epoch": 0.7151054176674673, + "grad_norm": 0.9928728802532544, + "learning_rate": 3.963514523724049e-06, + "loss": 1.5299, + "step": 5359 + }, + { + "epoch": 0.715238857752869, + "grad_norm": 0.9623909333997922, + "learning_rate": 3.96006944658922e-06, + "loss": 1.5684, + "step": 5360 + }, + { + "epoch": 0.7153722978382706, + "grad_norm": 0.9112193396740259, + "learning_rate": 3.956625497644655e-06, + "loss": 1.5723, + "step": 5361 + }, + { + "epoch": 0.7155057379236722, + "grad_norm": 0.9459655781359935, + "learning_rate": 3.95318267753364e-06, + "loss": 1.5027, + "step": 5362 + }, + { + "epoch": 0.715639178009074, + "grad_norm": 1.0216963512845265, + "learning_rate": 3.949740986899257e-06, + "loss": 1.5619, + "step": 5363 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 14.9027623425261, + "learning_rate": 3.946300426384368e-06, + "loss": 1.5481, + "step": 5364 + }, + { + "epoch": 0.7159060581798772, + "grad_norm": 1.0993613787167713, + "learning_rate": 3.942860996631639e-06, + "loss": 1.587, + "step": 5365 + }, + { + "epoch": 0.7160394982652789, + "grad_norm": 1.0313006265813407, + "learning_rate": 3.939422698283513e-06, + "loss": 1.5491, + "step": 5366 + }, + { + "epoch": 0.7161729383506805, + "grad_norm": 0.9390749627476853, + "learning_rate": 3.9359855319822205e-06, + "loss": 1.5661, + "step": 5367 + }, + { + "epoch": 0.7163063784360822, + "grad_norm": 0.9760457226921616, + "learning_rate": 3.932549498369793e-06, + "loss": 1.6101, + "step": 5368 + }, + { + "epoch": 0.7164398185214839, + "grad_norm": 0.958734457032826, + "learning_rate": 3.92911459808804e-06, + "loss": 1.5229, + "step": 5369 + }, + { + "epoch": 0.7165732586068855, + "grad_norm": 0.9526647222869271, + "learning_rate": 3.925680831778559e-06, + "loss": 1.5667, + "step": 5370 + }, + { + "epoch": 0.7167066986922872, + "grad_norm": 1.071079462379526, + "learning_rate": 3.922248200082736e-06, + "loss": 1.5439, + "step": 5371 + }, + { + "epoch": 0.7168401387776888, + "grad_norm": 1.0754749545538824, + "learning_rate": 3.918816703641757e-06, + "loss": 1.4997, + "step": 5372 + }, + { + "epoch": 0.7169735788630904, + "grad_norm": 0.9839191618810875, + "learning_rate": 3.915386343096579e-06, + "loss": 1.5861, + "step": 5373 + }, + { + "epoch": 0.7171070189484922, + "grad_norm": 0.9696932839295033, + "learning_rate": 3.911957119087953e-06, + "loss": 1.5767, + "step": 5374 + }, + { + "epoch": 0.7172404590338938, + "grad_norm": 1.0878185921141816, + "learning_rate": 3.908529032256426e-06, + "loss": 1.5695, + "step": 5375 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 1.0356241772862398, + "learning_rate": 3.905102083242321e-06, + "loss": 1.5772, + "step": 5376 + }, + { + "epoch": 0.7175073392046971, + "grad_norm": 0.9534440152222651, + "learning_rate": 3.901676272685755e-06, + "loss": 1.5578, + "step": 5377 + }, + { + "epoch": 0.7176407792900987, + "grad_norm": 1.0211708448584833, + "learning_rate": 3.898251601226626e-06, 
+ "loss": 1.4683, + "step": 5378 + }, + { + "epoch": 0.7177742193755005, + "grad_norm": 1.0468960541282286, + "learning_rate": 3.894828069504629e-06, + "loss": 1.5454, + "step": 5379 + }, + { + "epoch": 0.7179076594609021, + "grad_norm": 0.9536008377149614, + "learning_rate": 3.89140567815924e-06, + "loss": 1.5293, + "step": 5380 + }, + { + "epoch": 0.7180410995463037, + "grad_norm": 1.0458155687344466, + "learning_rate": 3.8879844278297164e-06, + "loss": 1.6136, + "step": 5381 + }, + { + "epoch": 0.7181745396317054, + "grad_norm": 0.9300674323746942, + "learning_rate": 3.884564319155119e-06, + "loss": 1.5283, + "step": 5382 + }, + { + "epoch": 0.718307979717107, + "grad_norm": 1.147948065522506, + "learning_rate": 3.881145352774278e-06, + "loss": 1.4485, + "step": 5383 + }, + { + "epoch": 0.7184414198025086, + "grad_norm": 0.9414728729601566, + "learning_rate": 3.877727529325821e-06, + "loss": 1.5722, + "step": 5384 + }, + { + "epoch": 0.7185748598879104, + "grad_norm": 0.9845671466215753, + "learning_rate": 3.874310849448152e-06, + "loss": 1.5702, + "step": 5385 + }, + { + "epoch": 0.718708299973312, + "grad_norm": 1.2446034917882773, + "learning_rate": 3.870895313779477e-06, + "loss": 1.5208, + "step": 5386 + }, + { + "epoch": 0.7188417400587136, + "grad_norm": 0.9504776390771918, + "learning_rate": 3.867480922957775e-06, + "loss": 1.5833, + "step": 5387 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 0.9383944211772788, + "learning_rate": 3.864067677620815e-06, + "loss": 1.5096, + "step": 5388 + }, + { + "epoch": 0.7191086202295169, + "grad_norm": 0.9197073694471657, + "learning_rate": 3.860655578406153e-06, + "loss": 1.4891, + "step": 5389 + }, + { + "epoch": 0.7192420603149186, + "grad_norm": 1.293587939218792, + "learning_rate": 3.857244625951125e-06, + "loss": 1.5577, + "step": 5390 + }, + { + "epoch": 0.7193755004003203, + "grad_norm": 0.9425333596458038, + "learning_rate": 3.8538348208928675e-06, + "loss": 1.547, + "step": 5391 + }, + { + "epoch": 0.7195089404857219, + "grad_norm": 0.9187340276232536, + "learning_rate": 3.850426163868289e-06, + "loss": 1.5602, + "step": 5392 + }, + { + "epoch": 0.7196423805711236, + "grad_norm": 1.0066203632714577, + "learning_rate": 3.847018655514087e-06, + "loss": 1.5686, + "step": 5393 + }, + { + "epoch": 0.7197758206565252, + "grad_norm": 0.9101557342470021, + "learning_rate": 3.843612296466747e-06, + "loss": 1.5074, + "step": 5394 + }, + { + "epoch": 0.7199092607419268, + "grad_norm": 1.0091781445483228, + "learning_rate": 3.840207087362535e-06, + "loss": 1.5506, + "step": 5395 + }, + { + "epoch": 0.7200427008273286, + "grad_norm": 0.9766556965941918, + "learning_rate": 3.836803028837506e-06, + "loss": 1.575, + "step": 5396 + }, + { + "epoch": 0.7201761409127302, + "grad_norm": 0.9326384838247909, + "learning_rate": 3.833400121527502e-06, + "loss": 1.5406, + "step": 5397 + }, + { + "epoch": 0.7203095809981318, + "grad_norm": 0.9729805047938272, + "learning_rate": 3.829998366068147e-06, + "loss": 1.583, + "step": 5398 + }, + { + "epoch": 0.7204430210835335, + "grad_norm": 1.010555826446732, + "learning_rate": 3.826597763094844e-06, + "loss": 1.533, + "step": 5399 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 0.9416257579609061, + "learning_rate": 3.823198313242799e-06, + "loss": 1.5651, + "step": 5400 + }, + { + "epoch": 0.7207099012543368, + "grad_norm": 0.9745957493904607, + "learning_rate": 3.81980001714698e-06, + "loss": 1.5579, + "step": 5401 + }, + { + "epoch": 0.7208433413397385, + "grad_norm": 0.9508022115629777, + 
"learning_rate": 3.816402875442155e-06, + "loss": 1.5297, + "step": 5402 + }, + { + "epoch": 0.7209767814251401, + "grad_norm": 9.996627671307085, + "learning_rate": 3.813006888762867e-06, + "loss": 1.5849, + "step": 5403 + }, + { + "epoch": 0.7211102215105417, + "grad_norm": 0.9362163294366803, + "learning_rate": 3.809612057743454e-06, + "loss": 1.5874, + "step": 5404 + }, + { + "epoch": 0.7212436615959434, + "grad_norm": 0.9650373486942456, + "learning_rate": 3.8062183830180278e-06, + "loss": 1.5313, + "step": 5405 + }, + { + "epoch": 0.7213771016813451, + "grad_norm": 0.9201142210045637, + "learning_rate": 3.802825865220485e-06, + "loss": 1.5155, + "step": 5406 + }, + { + "epoch": 0.7215105417667468, + "grad_norm": 0.9383591060889741, + "learning_rate": 3.799434504984517e-06, + "loss": 1.5318, + "step": 5407 + }, + { + "epoch": 0.7216439818521484, + "grad_norm": 0.9356509582385691, + "learning_rate": 3.7960443029435876e-06, + "loss": 1.5374, + "step": 5408 + }, + { + "epoch": 0.72177742193755, + "grad_norm": 0.9314768342510902, + "learning_rate": 3.7926552597309472e-06, + "loss": 1.597, + "step": 5409 + }, + { + "epoch": 0.7219108620229517, + "grad_norm": 1.230724666512068, + "learning_rate": 3.7892673759796285e-06, + "loss": 1.5506, + "step": 5410 + }, + { + "epoch": 0.7220443021083534, + "grad_norm": 0.8916809377299305, + "learning_rate": 3.7858806523224546e-06, + "loss": 1.4881, + "step": 5411 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 1.150653767752609, + "learning_rate": 3.7824950893920256e-06, + "loss": 1.5351, + "step": 5412 + }, + { + "epoch": 0.7223111822791567, + "grad_norm": 0.9860122847150068, + "learning_rate": 3.7791106878207206e-06, + "loss": 1.5517, + "step": 5413 + }, + { + "epoch": 0.7224446223645583, + "grad_norm": 0.9187176873452039, + "learning_rate": 3.7757274482407146e-06, + "loss": 1.5182, + "step": 5414 + }, + { + "epoch": 0.7225780624499599, + "grad_norm": 0.9502222725296599, + "learning_rate": 3.7723453712839566e-06, + "loss": 1.5383, + "step": 5415 + }, + { + "epoch": 0.7227115025353616, + "grad_norm": 0.9501143358026882, + "learning_rate": 3.768964457582177e-06, + "loss": 1.6326, + "step": 5416 + }, + { + "epoch": 0.7228449426207633, + "grad_norm": 0.9523336545668618, + "learning_rate": 3.7655847077668915e-06, + "loss": 1.5764, + "step": 5417 + }, + { + "epoch": 0.7229783827061649, + "grad_norm": 1.0389137372207382, + "learning_rate": 3.7622061224694038e-06, + "loss": 1.5737, + "step": 5418 + }, + { + "epoch": 0.7231118227915666, + "grad_norm": 0.9455855773831311, + "learning_rate": 3.758828702320794e-06, + "loss": 1.5635, + "step": 5419 + }, + { + "epoch": 0.7232452628769682, + "grad_norm": 0.9693626405402851, + "learning_rate": 3.7554524479519196e-06, + "loss": 1.5343, + "step": 5420 + }, + { + "epoch": 0.72337870296237, + "grad_norm": 0.9574938541001241, + "learning_rate": 3.7520773599934347e-06, + "loss": 1.5699, + "step": 5421 + }, + { + "epoch": 0.7235121430477716, + "grad_norm": 0.940692924931385, + "learning_rate": 3.748703439075764e-06, + "loss": 1.5674, + "step": 5422 + }, + { + "epoch": 0.7236455831331732, + "grad_norm": 0.9270797016669773, + "learning_rate": 3.7453306858291163e-06, + "loss": 1.517, + "step": 5423 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 0.9375714221926796, + "learning_rate": 3.741959100883481e-06, + "loss": 1.5181, + "step": 5424 + }, + { + "epoch": 0.7239124633039765, + "grad_norm": 0.9283234071509436, + "learning_rate": 3.738588684868639e-06, + "loss": 1.5591, + "step": 5425 + }, + { + "epoch": 
0.7240459033893781, + "grad_norm": 0.9733008805447092, + "learning_rate": 3.7352194384141426e-06, + "loss": 1.5131, + "step": 5426 + }, + { + "epoch": 0.7241793434747799, + "grad_norm": 0.9570280233761831, + "learning_rate": 3.731851362149327e-06, + "loss": 1.5628, + "step": 5427 + }, + { + "epoch": 0.7243127835601815, + "grad_norm": 1.2042047907577633, + "learning_rate": 3.7284844567033083e-06, + "loss": 1.538, + "step": 5428 + }, + { + "epoch": 0.7244462236455831, + "grad_norm": 0.9741593159201551, + "learning_rate": 3.7251187227049924e-06, + "loss": 1.5553, + "step": 5429 + }, + { + "epoch": 0.7245796637309848, + "grad_norm": 0.9197479513774383, + "learning_rate": 3.7217541607830576e-06, + "loss": 1.5162, + "step": 5430 + }, + { + "epoch": 0.7247131038163864, + "grad_norm": 0.9403320524684858, + "learning_rate": 3.718390771565964e-06, + "loss": 1.5288, + "step": 5431 + }, + { + "epoch": 0.724846543901788, + "grad_norm": 0.95182884051891, + "learning_rate": 3.7150285556819563e-06, + "loss": 1.5433, + "step": 5432 + }, + { + "epoch": 0.7249799839871898, + "grad_norm": 1.0304654498405337, + "learning_rate": 3.711667513759053e-06, + "loss": 1.622, + "step": 5433 + }, + { + "epoch": 0.7251134240725914, + "grad_norm": 0.98382338711681, + "learning_rate": 3.708307646425068e-06, + "loss": 1.5138, + "step": 5434 + }, + { + "epoch": 0.7252468641579931, + "grad_norm": 1.058008764362752, + "learning_rate": 3.704948954307579e-06, + "loss": 1.5539, + "step": 5435 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 1.0067806065661722, + "learning_rate": 3.7015914380339545e-06, + "loss": 1.5266, + "step": 5436 + }, + { + "epoch": 0.7255137443287963, + "grad_norm": 0.9964621996103804, + "learning_rate": 3.6982350982313387e-06, + "loss": 1.5728, + "step": 5437 + }, + { + "epoch": 0.7256471844141981, + "grad_norm": 1.2415140773272089, + "learning_rate": 3.694879935526655e-06, + "loss": 1.4833, + "step": 5438 + }, + { + "epoch": 0.7257806244995997, + "grad_norm": 1.0064068768265315, + "learning_rate": 3.691525950546617e-06, + "loss": 1.5633, + "step": 5439 + }, + { + "epoch": 0.7259140645850013, + "grad_norm": 1.335840952388961, + "learning_rate": 3.6881731439177058e-06, + "loss": 1.5479, + "step": 5440 + }, + { + "epoch": 0.726047504670403, + "grad_norm": 0.9765575128964399, + "learning_rate": 3.6848215162661894e-06, + "loss": 1.5502, + "step": 5441 + }, + { + "epoch": 0.7261809447558046, + "grad_norm": 0.9318261067807464, + "learning_rate": 3.6814710682181088e-06, + "loss": 1.525, + "step": 5442 + }, + { + "epoch": 0.7263143848412063, + "grad_norm": 0.9722119023080255, + "learning_rate": 3.6781218003992967e-06, + "loss": 1.5279, + "step": 5443 + }, + { + "epoch": 0.726447824926608, + "grad_norm": 0.9517245541221244, + "learning_rate": 3.6747737134353557e-06, + "loss": 1.5848, + "step": 5444 + }, + { + "epoch": 0.7265812650120096, + "grad_norm": 9.955006013526363, + "learning_rate": 3.6714268079516657e-06, + "loss": 1.6209, + "step": 5445 + }, + { + "epoch": 0.7267147050974113, + "grad_norm": 0.9547952003339406, + "learning_rate": 3.668081084573398e-06, + "loss": 1.6035, + "step": 5446 + }, + { + "epoch": 0.7268481451828129, + "grad_norm": 0.965245818039803, + "learning_rate": 3.6647365439254923e-06, + "loss": 1.5966, + "step": 5447 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 0.9878697703561382, + "learning_rate": 3.6613931866326704e-06, + "loss": 1.5274, + "step": 5448 + }, + { + "epoch": 0.7271150253536163, + "grad_norm": 0.9432582883437258, + "learning_rate": 3.658051013319429e-06, + 
"loss": 1.5539, + "step": 5449 + }, + { + "epoch": 0.7272484654390179, + "grad_norm": 0.9559304328129197, + "learning_rate": 3.6547100246100576e-06, + "loss": 1.5629, + "step": 5450 + }, + { + "epoch": 0.7273819055244195, + "grad_norm": 1.0818678239934452, + "learning_rate": 3.6513702211286086e-06, + "loss": 1.5481, + "step": 5451 + }, + { + "epoch": 0.7275153456098212, + "grad_norm": 0.9568005743025887, + "learning_rate": 3.6480316034989173e-06, + "loss": 1.5597, + "step": 5452 + }, + { + "epoch": 0.7276487856952228, + "grad_norm": 0.9041772250496767, + "learning_rate": 3.644694172344606e-06, + "loss": 1.534, + "step": 5453 + }, + { + "epoch": 0.7277822257806245, + "grad_norm": 0.9169233659753793, + "learning_rate": 3.6413579282890655e-06, + "loss": 1.5407, + "step": 5454 + }, + { + "epoch": 0.7279156658660262, + "grad_norm": 0.9466397250058866, + "learning_rate": 3.638022871955469e-06, + "loss": 1.5682, + "step": 5455 + }, + { + "epoch": 0.7280491059514278, + "grad_norm": 1.1961458887332446, + "learning_rate": 3.6346890039667616e-06, + "loss": 1.5361, + "step": 5456 + }, + { + "epoch": 0.7281825460368294, + "grad_norm": 0.9721270068626529, + "learning_rate": 3.6313563249456806e-06, + "loss": 1.5553, + "step": 5457 + }, + { + "epoch": 0.7283159861222311, + "grad_norm": 0.9791647245219625, + "learning_rate": 3.6280248355147273e-06, + "loss": 1.5297, + "step": 5458 + }, + { + "epoch": 0.7284494262076328, + "grad_norm": 0.9573169731204293, + "learning_rate": 3.6246945362961882e-06, + "loss": 1.5585, + "step": 5459 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 1.0013874444620199, + "learning_rate": 3.62136542791212e-06, + "loss": 1.5126, + "step": 5460 + }, + { + "epoch": 0.7287163063784361, + "grad_norm": 0.9187983530032887, + "learning_rate": 3.6180375109843703e-06, + "loss": 1.5512, + "step": 5461 + }, + { + "epoch": 0.7288497464638377, + "grad_norm": 1.0234386688020545, + "learning_rate": 3.614710786134552e-06, + "loss": 1.5312, + "step": 5462 + }, + { + "epoch": 0.7289831865492394, + "grad_norm": 0.9315553183193896, + "learning_rate": 3.6113852539840567e-06, + "loss": 1.5634, + "step": 5463 + }, + { + "epoch": 0.729116626634641, + "grad_norm": 0.925019388625177, + "learning_rate": 3.6080609151540613e-06, + "loss": 1.5594, + "step": 5464 + }, + { + "epoch": 0.7292500667200427, + "grad_norm": 0.935710933008227, + "learning_rate": 3.604737770265513e-06, + "loss": 1.4976, + "step": 5465 + }, + { + "epoch": 0.7293835068054444, + "grad_norm": 1.4420915957678724, + "learning_rate": 3.6014158199391358e-06, + "loss": 1.5603, + "step": 5466 + }, + { + "epoch": 0.729516946890846, + "grad_norm": 1.0497135426493642, + "learning_rate": 3.5980950647954294e-06, + "loss": 1.5248, + "step": 5467 + }, + { + "epoch": 0.7296503869762476, + "grad_norm": 0.9269061560231096, + "learning_rate": 3.5947755054546795e-06, + "loss": 1.5422, + "step": 5468 + }, + { + "epoch": 0.7297838270616493, + "grad_norm": 0.9527020311342057, + "learning_rate": 3.5914571425369393e-06, + "loss": 1.5373, + "step": 5469 + }, + { + "epoch": 0.729917267147051, + "grad_norm": 0.9372367174715784, + "learning_rate": 3.5881399766620373e-06, + "loss": 1.5657, + "step": 5470 + }, + { + "epoch": 0.7300507072324526, + "grad_norm": 0.9581054862273867, + "learning_rate": 3.58482400844959e-06, + "loss": 1.5739, + "step": 5471 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 0.947441074687153, + "learning_rate": 3.5815092385189777e-06, + "loss": 1.5728, + "step": 5472 + }, + { + "epoch": 0.7303175874032559, + "grad_norm": 
1.0228746400085942, + "learning_rate": 3.5781956674893626e-06, + "loss": 1.5334, + "step": 5473 + }, + { + "epoch": 0.7304510274886576, + "grad_norm": 0.9536334081802301, + "learning_rate": 3.574883295979682e-06, + "loss": 1.562, + "step": 5474 + }, + { + "epoch": 0.7305844675740593, + "grad_norm": 0.9606672650917066, + "learning_rate": 3.5715721246086486e-06, + "loss": 1.5526, + "step": 5475 + }, + { + "epoch": 0.7307179076594609, + "grad_norm": 0.9577088048584481, + "learning_rate": 3.568262153994749e-06, + "loss": 1.5837, + "step": 5476 + }, + { + "epoch": 0.7308513477448626, + "grad_norm": 0.9719144515399968, + "learning_rate": 3.5649533847562544e-06, + "loss": 1.5316, + "step": 5477 + }, + { + "epoch": 0.7309847878302642, + "grad_norm": 1.004032695088233, + "learning_rate": 3.5616458175112013e-06, + "loss": 1.593, + "step": 5478 + }, + { + "epoch": 0.7311182279156658, + "grad_norm": 0.9494207677824381, + "learning_rate": 3.558339452877406e-06, + "loss": 1.5683, + "step": 5479 + }, + { + "epoch": 0.7312516680010676, + "grad_norm": 0.9698864247819369, + "learning_rate": 3.5550342914724613e-06, + "loss": 1.585, + "step": 5480 + }, + { + "epoch": 0.7313851080864692, + "grad_norm": 0.9508423495123989, + "learning_rate": 3.5517303339137267e-06, + "loss": 1.5207, + "step": 5481 + }, + { + "epoch": 0.7315185481718708, + "grad_norm": 1.2166047965486122, + "learning_rate": 3.5484275808183544e-06, + "loss": 1.5507, + "step": 5482 + }, + { + "epoch": 0.7316519882572725, + "grad_norm": 0.9436409445639349, + "learning_rate": 3.5451260328032555e-06, + "loss": 1.579, + "step": 5483 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 1.2067580184607811, + "learning_rate": 3.541825690485119e-06, + "loss": 1.5399, + "step": 5484 + }, + { + "epoch": 0.7319188684280757, + "grad_norm": 0.9128886571978432, + "learning_rate": 3.5385265544804172e-06, + "loss": 1.5099, + "step": 5485 + }, + { + "epoch": 0.7320523085134775, + "grad_norm": 0.9232766943936563, + "learning_rate": 3.535228625405389e-06, + "loss": 1.5688, + "step": 5486 + }, + { + "epoch": 0.7321857485988791, + "grad_norm": 0.9443761629982292, + "learning_rate": 3.531931903876049e-06, + "loss": 1.5515, + "step": 5487 + }, + { + "epoch": 0.7323191886842808, + "grad_norm": 0.9281491684957837, + "learning_rate": 3.5286363905081843e-06, + "loss": 1.5491, + "step": 5488 + }, + { + "epoch": 0.7324526287696824, + "grad_norm": 1.0926741045880084, + "learning_rate": 3.525342085917366e-06, + "loss": 1.5447, + "step": 5489 + }, + { + "epoch": 0.732586068855084, + "grad_norm": 0.9386932615205383, + "learning_rate": 3.5220489907189293e-06, + "loss": 1.5314, + "step": 5490 + }, + { + "epoch": 0.7327195089404858, + "grad_norm": 0.9166276573659027, + "learning_rate": 3.518757105527988e-06, + "loss": 1.5458, + "step": 5491 + }, + { + "epoch": 0.7328529490258874, + "grad_norm": 0.9123097547816297, + "learning_rate": 3.5154664309594224e-06, + "loss": 1.5524, + "step": 5492 + }, + { + "epoch": 0.732986389111289, + "grad_norm": 1.1139458388619161, + "learning_rate": 3.5121769676279026e-06, + "loss": 1.5188, + "step": 5493 + }, + { + "epoch": 0.7331198291966907, + "grad_norm": 0.958270690060531, + "learning_rate": 3.508888716147859e-06, + "loss": 1.5596, + "step": 5494 + }, + { + "epoch": 0.7332532692820923, + "grad_norm": 0.9709805252564494, + "learning_rate": 3.5056016771334954e-06, + "loss": 1.5756, + "step": 5495 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 1.2247215933789883, + "learning_rate": 3.5023158511988e-06, + "loss": 1.597, + "step": 5496 + }, + 
{ + "epoch": 0.7335201494528957, + "grad_norm": 0.9334813075768208, + "learning_rate": 3.4990312389575253e-06, + "loss": 1.5761, + "step": 5497 + }, + { + "epoch": 0.7336535895382973, + "grad_norm": 0.9468394042300734, + "learning_rate": 3.495747841023198e-06, + "loss": 1.547, + "step": 5498 + }, + { + "epoch": 0.7337870296236989, + "grad_norm": 0.9478146984584698, + "learning_rate": 3.4924656580091176e-06, + "loss": 1.6384, + "step": 5499 + }, + { + "epoch": 0.7339204697091006, + "grad_norm": 0.967091384617016, + "learning_rate": 3.4891846905283645e-06, + "loss": 1.5693, + "step": 5500 + }, + { + "epoch": 0.7340539097945022, + "grad_norm": 0.9249763240908678, + "learning_rate": 3.4859049391937827e-06, + "loss": 1.5253, + "step": 5501 + }, + { + "epoch": 0.734187349879904, + "grad_norm": 0.96990837542742, + "learning_rate": 3.48262640461799e-06, + "loss": 1.5783, + "step": 5502 + }, + { + "epoch": 0.7343207899653056, + "grad_norm": 0.9655104010410549, + "learning_rate": 3.479349087413384e-06, + "loss": 1.5925, + "step": 5503 + }, + { + "epoch": 0.7344542300507072, + "grad_norm": 0.9165403211065593, + "learning_rate": 3.4760729881921286e-06, + "loss": 1.4963, + "step": 5504 + }, + { + "epoch": 0.7345876701361089, + "grad_norm": 0.9683466520946085, + "learning_rate": 3.472798107566161e-06, + "loss": 1.6019, + "step": 5505 + }, + { + "epoch": 0.7347211102215105, + "grad_norm": 0.9552291626544579, + "learning_rate": 3.469524446147189e-06, + "loss": 1.5571, + "step": 5506 + }, + { + "epoch": 0.7348545503069122, + "grad_norm": 0.9311445516993911, + "learning_rate": 3.466252004546702e-06, + "loss": 1.5499, + "step": 5507 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 1.1146712718754999, + "learning_rate": 3.46298078337595e-06, + "loss": 1.5284, + "step": 5508 + }, + { + "epoch": 0.7351214304777155, + "grad_norm": 0.9571872321016519, + "learning_rate": 3.4597107832459575e-06, + "loss": 1.6109, + "step": 5509 + }, + { + "epoch": 0.7352548705631171, + "grad_norm": 0.9544626733275546, + "learning_rate": 3.456442004767531e-06, + "loss": 1.584, + "step": 5510 + }, + { + "epoch": 0.7353883106485188, + "grad_norm": 0.9436856603892109, + "learning_rate": 3.4531744485512362e-06, + "loss": 1.6014, + "step": 5511 + }, + { + "epoch": 0.7355217507339205, + "grad_norm": 0.9393458177163648, + "learning_rate": 3.4499081152074156e-06, + "loss": 1.5539, + "step": 5512 + }, + { + "epoch": 0.7356551908193221, + "grad_norm": 0.9422174761137512, + "learning_rate": 3.446643005346181e-06, + "loss": 1.569, + "step": 5513 + }, + { + "epoch": 0.7357886309047238, + "grad_norm": 0.9392923260054774, + "learning_rate": 3.4433791195774237e-06, + "loss": 1.545, + "step": 5514 + }, + { + "epoch": 0.7359220709901254, + "grad_norm": 0.9545864141748973, + "learning_rate": 3.440116458510796e-06, + "loss": 1.5286, + "step": 5515 + }, + { + "epoch": 0.7360555110755271, + "grad_norm": 0.9852651403635309, + "learning_rate": 3.4368550227557272e-06, + "loss": 1.5654, + "step": 5516 + }, + { + "epoch": 0.7361889511609288, + "grad_norm": 0.9339962134756217, + "learning_rate": 3.433594812921416e-06, + "loss": 1.507, + "step": 5517 + }, + { + "epoch": 0.7363223912463304, + "grad_norm": 0.9437597981521707, + "learning_rate": 3.4303358296168287e-06, + "loss": 1.5623, + "step": 5518 + }, + { + "epoch": 0.7364558313317321, + "grad_norm": 0.9428139658289024, + "learning_rate": 3.4270780734507136e-06, + "loss": 1.514, + "step": 5519 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 0.9476341999117343, + "learning_rate": 
3.4238215450315794e-06, + "loss": 1.5945, + "step": 5520 + }, + { + "epoch": 0.7367227115025353, + "grad_norm": 0.9394390246936388, + "learning_rate": 3.420566244967707e-06, + "loss": 1.5188, + "step": 5521 + }, + { + "epoch": 0.736856151587937, + "grad_norm": 0.9506593690055114, + "learning_rate": 3.417312173867151e-06, + "loss": 1.5694, + "step": 5522 + }, + { + "epoch": 0.7369895916733387, + "grad_norm": 1.0129257670360825, + "learning_rate": 3.4140593323377336e-06, + "loss": 1.6033, + "step": 5523 + }, + { + "epoch": 0.7371230317587403, + "grad_norm": 0.9480996004589485, + "learning_rate": 3.410807720987046e-06, + "loss": 1.5339, + "step": 5524 + }, + { + "epoch": 0.737256471844142, + "grad_norm": 0.927903797277357, + "learning_rate": 3.4075573404224594e-06, + "loss": 1.5274, + "step": 5525 + }, + { + "epoch": 0.7373899119295436, + "grad_norm": 0.9379989494588817, + "learning_rate": 3.4043081912511033e-06, + "loss": 1.5647, + "step": 5526 + }, + { + "epoch": 0.7375233520149453, + "grad_norm": 0.9218438261963459, + "learning_rate": 3.4010602740798795e-06, + "loss": 1.5587, + "step": 5527 + }, + { + "epoch": 0.737656792100347, + "grad_norm": 0.9400943430420226, + "learning_rate": 3.3978135895154674e-06, + "loss": 1.5383, + "step": 5528 + }, + { + "epoch": 0.7377902321857486, + "grad_norm": 0.9268874491253772, + "learning_rate": 3.394568138164308e-06, + "loss": 1.5851, + "step": 5529 + }, + { + "epoch": 0.7379236722711503, + "grad_norm": 0.9115511290678305, + "learning_rate": 3.3913239206326154e-06, + "loss": 1.5734, + "step": 5530 + }, + { + "epoch": 0.7380571123565519, + "grad_norm": 0.9222620027598017, + "learning_rate": 3.388080937526368e-06, + "loss": 1.5531, + "step": 5531 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 0.9826304238549518, + "learning_rate": 3.3848391894513253e-06, + "loss": 1.5357, + "step": 5532 + }, + { + "epoch": 0.7383239925273553, + "grad_norm": 0.9812896742852667, + "learning_rate": 3.381598677013006e-06, + "loss": 1.6455, + "step": 5533 + }, + { + "epoch": 0.7384574326127569, + "grad_norm": 1.1940903464741186, + "learning_rate": 3.3783594008166963e-06, + "loss": 1.5608, + "step": 5534 + }, + { + "epoch": 0.7385908726981585, + "grad_norm": 1.0051527729116323, + "learning_rate": 3.3751213614674647e-06, + "loss": 1.4906, + "step": 5535 + }, + { + "epoch": 0.7387243127835602, + "grad_norm": 0.9448310950987612, + "learning_rate": 3.3718845595701356e-06, + "loss": 1.5768, + "step": 5536 + }, + { + "epoch": 0.7388577528689618, + "grad_norm": 0.9253739341701587, + "learning_rate": 3.3686489957293057e-06, + "loss": 1.5335, + "step": 5537 + }, + { + "epoch": 0.7389911929543634, + "grad_norm": 0.9565771682432299, + "learning_rate": 3.3654146705493407e-06, + "loss": 1.5332, + "step": 5538 + }, + { + "epoch": 0.7391246330397652, + "grad_norm": 0.9417608192865259, + "learning_rate": 3.3621815846343797e-06, + "loss": 1.5608, + "step": 5539 + }, + { + "epoch": 0.7392580731251668, + "grad_norm": 0.9655822978265454, + "learning_rate": 3.3589497385883253e-06, + "loss": 1.5471, + "step": 5540 + }, + { + "epoch": 0.7393915132105685, + "grad_norm": 0.9690529415186309, + "learning_rate": 3.355719133014844e-06, + "loss": 1.5502, + "step": 5541 + }, + { + "epoch": 0.7395249532959701, + "grad_norm": 1.2161957457480974, + "learning_rate": 3.3524897685173853e-06, + "loss": 1.5819, + "step": 5542 + }, + { + "epoch": 0.7396583933813717, + "grad_norm": 0.9441515276869841, + "learning_rate": 3.3492616456991524e-06, + "loss": 1.5041, + "step": 5543 + }, + { + "epoch": 
0.7397918334667735, + "grad_norm": 0.960027360606955, + "learning_rate": 3.346034765163123e-06, + "loss": 1.5315, + "step": 5544 + }, + { + "epoch": 0.7399252735521751, + "grad_norm": 0.9330330601144551, + "learning_rate": 3.342809127512038e-06, + "loss": 1.5264, + "step": 5545 + }, + { + "epoch": 0.7400587136375767, + "grad_norm": 1.1030181822089198, + "learning_rate": 3.3395847333484153e-06, + "loss": 1.5608, + "step": 5546 + }, + { + "epoch": 0.7401921537229784, + "grad_norm": 0.9682087080598382, + "learning_rate": 3.3363615832745344e-06, + "loss": 1.5755, + "step": 5547 + }, + { + "epoch": 0.74032559380838, + "grad_norm": 0.9159493952844268, + "learning_rate": 3.333139677892436e-06, + "loss": 1.555, + "step": 5548 + }, + { + "epoch": 0.7404590338937816, + "grad_norm": 0.9659206359188842, + "learning_rate": 3.3299190178039464e-06, + "loss": 1.6404, + "step": 5549 + }, + { + "epoch": 0.7405924739791834, + "grad_norm": 1.1568043350228006, + "learning_rate": 3.3266996036106415e-06, + "loss": 1.5864, + "step": 5550 + }, + { + "epoch": 0.740725914064585, + "grad_norm": 0.923838794736863, + "learning_rate": 3.323481435913871e-06, + "loss": 1.5286, + "step": 5551 + }, + { + "epoch": 0.7408593541499866, + "grad_norm": 0.9701142939931843, + "learning_rate": 3.320264515314752e-06, + "loss": 1.515, + "step": 5552 + }, + { + "epoch": 0.7409927942353883, + "grad_norm": 0.9266738295815183, + "learning_rate": 3.3170488424141713e-06, + "loss": 1.5394, + "step": 5553 + }, + { + "epoch": 0.74112623432079, + "grad_norm": 0.9206286448502721, + "learning_rate": 3.31383441781278e-06, + "loss": 1.5438, + "step": 5554 + }, + { + "epoch": 0.7412596744061917, + "grad_norm": 0.931353809690202, + "learning_rate": 3.310621242110994e-06, + "loss": 1.5102, + "step": 5555 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 0.9508289819339473, + "learning_rate": 3.307409315908995e-06, + "loss": 1.5811, + "step": 5556 + }, + { + "epoch": 0.7415265545769949, + "grad_norm": 0.959176272650246, + "learning_rate": 3.3041986398067416e-06, + "loss": 1.599, + "step": 5557 + }, + { + "epoch": 0.7416599946623966, + "grad_norm": 0.9375548404744494, + "learning_rate": 3.3009892144039478e-06, + "loss": 1.5679, + "step": 5558 + }, + { + "epoch": 0.7417934347477982, + "grad_norm": 0.9217050733117328, + "learning_rate": 3.2977810403000977e-06, + "loss": 1.5366, + "step": 5559 + }, + { + "epoch": 0.7419268748331999, + "grad_norm": 0.9481402337066958, + "learning_rate": 3.29457411809444e-06, + "loss": 1.5539, + "step": 5560 + }, + { + "epoch": 0.7420603149186016, + "grad_norm": 0.9269200198823844, + "learning_rate": 3.291368448385992e-06, + "loss": 1.511, + "step": 5561 + }, + { + "epoch": 0.7421937550040032, + "grad_norm": 0.9484784210641377, + "learning_rate": 3.2881640317735386e-06, + "loss": 1.5821, + "step": 5562 + }, + { + "epoch": 0.7423271950894048, + "grad_norm": 0.9728501168391839, + "learning_rate": 3.2849608688556276e-06, + "loss": 1.5608, + "step": 5563 + }, + { + "epoch": 0.7424606351748065, + "grad_norm": 0.9582363784642031, + "learning_rate": 3.2817589602305732e-06, + "loss": 1.5794, + "step": 5564 + }, + { + "epoch": 0.7425940752602082, + "grad_norm": 0.9398306842093537, + "learning_rate": 3.2785583064964545e-06, + "loss": 1.5571, + "step": 5565 + }, + { + "epoch": 0.7427275153456098, + "grad_norm": 0.9467894900781275, + "learning_rate": 3.2753589082511152e-06, + "loss": 1.5512, + "step": 5566 + }, + { + "epoch": 0.7428609554310115, + "grad_norm": 0.9367465516942596, + "learning_rate": 3.2721607660921716e-06, + 
"loss": 1.5603, + "step": 5567 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 0.9499101812547797, + "learning_rate": 3.2689638806169977e-06, + "loss": 1.5349, + "step": 5568 + }, + { + "epoch": 0.7431278356018148, + "grad_norm": 0.9191430811722798, + "learning_rate": 3.265768252422734e-06, + "loss": 1.4816, + "step": 5569 + }, + { + "epoch": 0.7432612756872164, + "grad_norm": 1.0165076519034235, + "learning_rate": 3.2625738821062868e-06, + "loss": 1.5599, + "step": 5570 + }, + { + "epoch": 0.7433947157726181, + "grad_norm": 0.9387670444451917, + "learning_rate": 3.259380770264332e-06, + "loss": 1.5309, + "step": 5571 + }, + { + "epoch": 0.7435281558580198, + "grad_norm": 0.970178143993145, + "learning_rate": 3.256188917493306e-06, + "loss": 1.5265, + "step": 5572 + }, + { + "epoch": 0.7436615959434214, + "grad_norm": 0.9464180115309929, + "learning_rate": 3.2529983243894046e-06, + "loss": 1.5373, + "step": 5573 + }, + { + "epoch": 0.743795036028823, + "grad_norm": 0.9513946770758279, + "learning_rate": 3.2498089915486032e-06, + "loss": 1.5375, + "step": 5574 + }, + { + "epoch": 0.7439284761142247, + "grad_norm": 0.9395784172257111, + "learning_rate": 3.2466209195666266e-06, + "loss": 1.5607, + "step": 5575 + }, + { + "epoch": 0.7440619161996264, + "grad_norm": 1.081019248269522, + "learning_rate": 3.2434341090389734e-06, + "loss": 1.5749, + "step": 5576 + }, + { + "epoch": 0.744195356285028, + "grad_norm": 0.9327852617129883, + "learning_rate": 3.240248560560899e-06, + "loss": 1.5236, + "step": 5577 + }, + { + "epoch": 0.7443287963704297, + "grad_norm": 0.9878773231924048, + "learning_rate": 3.237064274727433e-06, + "loss": 1.5515, + "step": 5578 + }, + { + "epoch": 0.7444622364558313, + "grad_norm": 1.010601298560983, + "learning_rate": 3.233881252133363e-06, + "loss": 1.5761, + "step": 5579 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 0.9550690993331595, + "learning_rate": 3.230699493373236e-06, + "loss": 1.5799, + "step": 5580 + }, + { + "epoch": 0.7447291166266347, + "grad_norm": 0.979629622187405, + "learning_rate": 3.2275189990413746e-06, + "loss": 1.5323, + "step": 5581 + }, + { + "epoch": 0.7448625567120363, + "grad_norm": 1.0433957484563827, + "learning_rate": 3.224339769731858e-06, + "loss": 1.4937, + "step": 5582 + }, + { + "epoch": 0.744995996797438, + "grad_norm": 0.9320393784593036, + "learning_rate": 3.2211618060385285e-06, + "loss": 1.5419, + "step": 5583 + }, + { + "epoch": 0.7451294368828396, + "grad_norm": 0.9316119615350978, + "learning_rate": 3.2179851085549897e-06, + "loss": 1.5457, + "step": 5584 + }, + { + "epoch": 0.7452628769682412, + "grad_norm": 0.9283803212880422, + "learning_rate": 3.2148096778746195e-06, + "loss": 1.5546, + "step": 5585 + }, + { + "epoch": 0.745396317053643, + "grad_norm": 0.9313780199697975, + "learning_rate": 3.21163551459055e-06, + "loss": 1.5485, + "step": 5586 + }, + { + "epoch": 0.7455297571390446, + "grad_norm": 1.001529063138306, + "learning_rate": 3.2084626192956745e-06, + "loss": 1.5779, + "step": 5587 + }, + { + "epoch": 0.7456631972244462, + "grad_norm": 1.0662137147201587, + "learning_rate": 3.2052909925826604e-06, + "loss": 1.5484, + "step": 5588 + }, + { + "epoch": 0.7457966373098479, + "grad_norm": 0.9485702607614251, + "learning_rate": 3.202120635043928e-06, + "loss": 1.5674, + "step": 5589 + }, + { + "epoch": 0.7459300773952495, + "grad_norm": 0.9472743543057982, + "learning_rate": 3.198951547271665e-06, + "loss": 1.5625, + "step": 5590 + }, + { + "epoch": 0.7460635174806511, + "grad_norm": 
0.9051518915153008, + "learning_rate": 3.1957837298578154e-06, + "loss": 1.5522, + "step": 5591 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 0.9130917382566023, + "learning_rate": 3.1926171833941e-06, + "loss": 1.5459, + "step": 5592 + }, + { + "epoch": 0.7463303976514545, + "grad_norm": 1.149337316223058, + "learning_rate": 3.18945190847199e-06, + "loss": 1.5367, + "step": 5593 + }, + { + "epoch": 0.7464638377368562, + "grad_norm": 1.0675304023468861, + "learning_rate": 3.1862879056827225e-06, + "loss": 1.5151, + "step": 5594 + }, + { + "epoch": 0.7465972778222578, + "grad_norm": 0.9568872060715193, + "learning_rate": 3.1831251756172943e-06, + "loss": 1.5452, + "step": 5595 + }, + { + "epoch": 0.7467307179076594, + "grad_norm": 1.0314564637633439, + "learning_rate": 3.1799637188664736e-06, + "loss": 1.5902, + "step": 5596 + }, + { + "epoch": 0.7468641579930612, + "grad_norm": 0.9252768296530237, + "learning_rate": 3.1768035360207806e-06, + "loss": 1.5499, + "step": 5597 + }, + { + "epoch": 0.7469975980784628, + "grad_norm": 1.2637363917190654, + "learning_rate": 3.173644627670499e-06, + "loss": 1.505, + "step": 5598 + }, + { + "epoch": 0.7471310381638644, + "grad_norm": 1.0200228137515415, + "learning_rate": 3.1704869944056826e-06, + "loss": 1.583, + "step": 5599 + }, + { + "epoch": 0.7472644782492661, + "grad_norm": 1.2623537137006278, + "learning_rate": 3.1673306368161394e-06, + "loss": 1.5582, + "step": 5600 + }, + { + "epoch": 0.7473979183346677, + "grad_norm": 0.9942148402882126, + "learning_rate": 3.16417555549144e-06, + "loss": 1.5138, + "step": 5601 + }, + { + "epoch": 0.7475313584200693, + "grad_norm": 0.9769665826208432, + "learning_rate": 3.1610217510209184e-06, + "loss": 1.5111, + "step": 5602 + }, + { + "epoch": 0.7476647985054711, + "grad_norm": 1.0229850987939395, + "learning_rate": 3.1578692239936693e-06, + "loss": 1.5134, + "step": 5603 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 0.9351520451791383, + "learning_rate": 3.1547179749985445e-06, + "loss": 1.6311, + "step": 5604 + }, + { + "epoch": 0.7479316786762743, + "grad_norm": 0.9204017069129004, + "learning_rate": 3.1515680046241693e-06, + "loss": 1.5303, + "step": 5605 + }, + { + "epoch": 0.748065118761676, + "grad_norm": 0.9599046278620328, + "learning_rate": 3.148419313458918e-06, + "loss": 1.5719, + "step": 5606 + }, + { + "epoch": 0.7481985588470776, + "grad_norm": 0.948039514652451, + "learning_rate": 3.1452719020909317e-06, + "loss": 1.5452, + "step": 5607 + }, + { + "epoch": 0.7483319989324794, + "grad_norm": 0.9481417279723116, + "learning_rate": 3.1421257711081097e-06, + "loss": 1.5983, + "step": 5608 + }, + { + "epoch": 0.748465439017881, + "grad_norm": 0.9485239077024988, + "learning_rate": 3.13898092109811e-06, + "loss": 1.531, + "step": 5609 + }, + { + "epoch": 0.7485988791032826, + "grad_norm": 1.1358614513622873, + "learning_rate": 3.135837352648362e-06, + "loss": 1.5357, + "step": 5610 + }, + { + "epoch": 0.7487323191886843, + "grad_norm": 0.970237163017964, + "learning_rate": 3.1326950663460466e-06, + "loss": 1.5521, + "step": 5611 + }, + { + "epoch": 0.7488657592740859, + "grad_norm": 0.9433944906141754, + "learning_rate": 3.1295540627781006e-06, + "loss": 1.6221, + "step": 5612 + }, + { + "epoch": 0.7489991993594876, + "grad_norm": 1.0020127389963633, + "learning_rate": 3.1264143425312366e-06, + "loss": 1.5551, + "step": 5613 + }, + { + "epoch": 0.7491326394448893, + "grad_norm": 0.9670401814605709, + "learning_rate": 3.123275906191915e-06, + "loss": 1.5776, + "step": 5614 + }, + 
{ + "epoch": 0.7492660795302909, + "grad_norm": 0.9361899965589122, + "learning_rate": 3.120138754346359e-06, + "loss": 1.5674, + "step": 5615 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 0.976493101276714, + "learning_rate": 3.1170028875805504e-06, + "loss": 1.5411, + "step": 5616 + }, + { + "epoch": 0.7495329597010942, + "grad_norm": 0.9634598582821797, + "learning_rate": 3.11386830648024e-06, + "loss": 1.5704, + "step": 5617 + }, + { + "epoch": 0.7496663997864959, + "grad_norm": 0.9640441671842809, + "learning_rate": 3.1107350116309275e-06, + "loss": 1.5383, + "step": 5618 + }, + { + "epoch": 0.7497998398718975, + "grad_norm": 0.9555181724485184, + "learning_rate": 3.1076030036178763e-06, + "loss": 1.6026, + "step": 5619 + }, + { + "epoch": 0.7499332799572992, + "grad_norm": 1.2807705278680843, + "learning_rate": 3.104472283026113e-06, + "loss": 1.5881, + "step": 5620 + }, + { + "epoch": 0.7500667200427008, + "grad_norm": 0.9356127813233456, + "learning_rate": 3.1013428504404187e-06, + "loss": 1.561, + "step": 5621 + }, + { + "epoch": 0.7502001601281025, + "grad_norm": 0.9571532107573171, + "learning_rate": 3.098214706445336e-06, + "loss": 1.5119, + "step": 5622 + }, + { + "epoch": 0.7503336002135041, + "grad_norm": 0.9491196174548282, + "learning_rate": 3.0950878516251636e-06, + "loss": 1.5096, + "step": 5623 + }, + { + "epoch": 0.7504670402989058, + "grad_norm": 0.9417652753789462, + "learning_rate": 3.0919622865639677e-06, + "loss": 1.5102, + "step": 5624 + }, + { + "epoch": 0.7506004803843075, + "grad_norm": 0.9630569879115026, + "learning_rate": 3.088838011845566e-06, + "loss": 1.5818, + "step": 5625 + }, + { + "epoch": 0.7507339204697091, + "grad_norm": 0.9368704080314552, + "learning_rate": 3.0857150280535365e-06, + "loss": 1.5337, + "step": 5626 + }, + { + "epoch": 0.7508673605551107, + "grad_norm": 0.9571714147311596, + "learning_rate": 3.082593335771216e-06, + "loss": 1.5982, + "step": 5627 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 0.9254871682343206, + "learning_rate": 3.079472935581704e-06, + "loss": 1.5988, + "step": 5628 + }, + { + "epoch": 0.7511342407259141, + "grad_norm": 0.9612391153764122, + "learning_rate": 3.0763538280678563e-06, + "loss": 1.5205, + "step": 5629 + }, + { + "epoch": 0.7512676808113157, + "grad_norm": 0.952348233253775, + "learning_rate": 3.0732360138122806e-06, + "loss": 1.5686, + "step": 5630 + }, + { + "epoch": 0.7514011208967174, + "grad_norm": 0.9494761120662082, + "learning_rate": 3.0701194933973566e-06, + "loss": 1.5929, + "step": 5631 + }, + { + "epoch": 0.751534560982119, + "grad_norm": 0.917699574591178, + "learning_rate": 3.0670042674052116e-06, + "loss": 1.524, + "step": 5632 + }, + { + "epoch": 0.7516680010675206, + "grad_norm": 1.0333026625817825, + "learning_rate": 3.0638903364177343e-06, + "loss": 1.5439, + "step": 5633 + }, + { + "epoch": 0.7518014411529224, + "grad_norm": 0.9417912177877552, + "learning_rate": 3.0607777010165683e-06, + "loss": 1.6148, + "step": 5634 + }, + { + "epoch": 0.751934881238324, + "grad_norm": 0.9406321738897914, + "learning_rate": 3.057666361783126e-06, + "loss": 1.583, + "step": 5635 + }, + { + "epoch": 0.7520683213237257, + "grad_norm": 1.1578117225090305, + "learning_rate": 3.054556319298565e-06, + "loss": 1.5209, + "step": 5636 + }, + { + "epoch": 0.7522017614091273, + "grad_norm": 0.9600519785804953, + "learning_rate": 3.051447574143803e-06, + "loss": 1.5374, + "step": 5637 + }, + { + "epoch": 0.7523352014945289, + "grad_norm": 0.9270313809638928, + "learning_rate": 
3.048340126899526e-06, + "loss": 1.5212, + "step": 5638 + }, + { + "epoch": 0.7524686415799307, + "grad_norm": 0.9114207753606299, + "learning_rate": 3.045233978146166e-06, + "loss": 1.5157, + "step": 5639 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 0.981499892052798, + "learning_rate": 3.0421291284639143e-06, + "loss": 1.5459, + "step": 5640 + }, + { + "epoch": 0.7527355217507339, + "grad_norm": 0.9458653564470572, + "learning_rate": 3.0390255784327215e-06, + "loss": 1.5687, + "step": 5641 + }, + { + "epoch": 0.7528689618361356, + "grad_norm": 1.2124512830726673, + "learning_rate": 3.0359233286322986e-06, + "loss": 1.5388, + "step": 5642 + }, + { + "epoch": 0.7530024019215372, + "grad_norm": 0.9508117630773106, + "learning_rate": 3.0328223796421107e-06, + "loss": 1.5721, + "step": 5643 + }, + { + "epoch": 0.7531358420069388, + "grad_norm": 0.9592182453362529, + "learning_rate": 3.029722732041377e-06, + "loss": 1.5769, + "step": 5644 + }, + { + "epoch": 0.7532692820923406, + "grad_norm": 1.08965444186456, + "learning_rate": 3.0266243864090772e-06, + "loss": 1.5829, + "step": 5645 + }, + { + "epoch": 0.7534027221777422, + "grad_norm": 2.9774916226895147, + "learning_rate": 3.0235273433239475e-06, + "loss": 1.5859, + "step": 5646 + }, + { + "epoch": 0.7535361622631438, + "grad_norm": 1.1074468709821748, + "learning_rate": 3.0204316033644765e-06, + "loss": 1.5437, + "step": 5647 + }, + { + "epoch": 0.7536696023485455, + "grad_norm": 0.9565752685671445, + "learning_rate": 3.01733716710892e-06, + "loss": 1.5924, + "step": 5648 + }, + { + "epoch": 0.7538030424339471, + "grad_norm": 0.9585562387187216, + "learning_rate": 3.0142440351352797e-06, + "loss": 1.5454, + "step": 5649 + }, + { + "epoch": 0.7539364825193489, + "grad_norm": 1.056117358661078, + "learning_rate": 3.0111522080213185e-06, + "loss": 1.5113, + "step": 5650 + }, + { + "epoch": 0.7540699226047505, + "grad_norm": 1.2766508430903627, + "learning_rate": 3.008061686344551e-06, + "loss": 1.5445, + "step": 5651 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 0.9812676587742001, + "learning_rate": 3.0049724706822557e-06, + "loss": 1.5591, + "step": 5652 + }, + { + "epoch": 0.7543368027755538, + "grad_norm": 0.9349340551684687, + "learning_rate": 3.001884561611463e-06, + "loss": 1.5658, + "step": 5653 + }, + { + "epoch": 0.7544702428609554, + "grad_norm": 1.2097788395210352, + "learning_rate": 2.998797959708958e-06, + "loss": 1.4703, + "step": 5654 + }, + { + "epoch": 0.754603682946357, + "grad_norm": 0.9150765980891424, + "learning_rate": 2.995712665551278e-06, + "loss": 1.5338, + "step": 5655 + }, + { + "epoch": 0.7547371230317588, + "grad_norm": 1.1493932341594215, + "learning_rate": 2.9926286797147284e-06, + "loss": 1.5119, + "step": 5656 + }, + { + "epoch": 0.7548705631171604, + "grad_norm": 0.9295681174421622, + "learning_rate": 2.989546002775361e-06, + "loss": 1.5785, + "step": 5657 + }, + { + "epoch": 0.755004003202562, + "grad_norm": 0.9844513532497011, + "learning_rate": 2.9864646353089822e-06, + "loss": 1.513, + "step": 5658 + }, + { + "epoch": 0.7551374432879637, + "grad_norm": 0.9691752292557538, + "learning_rate": 2.983384577891154e-06, + "loss": 1.5196, + "step": 5659 + }, + { + "epoch": 0.7552708833733653, + "grad_norm": 1.0348255829580888, + "learning_rate": 2.980305831097203e-06, + "loss": 1.5305, + "step": 5660 + }, + { + "epoch": 0.7554043234587671, + "grad_norm": 0.9795461493134596, + "learning_rate": 2.9772283955022006e-06, + "loss": 1.5619, + "step": 5661 + }, + { + "epoch": 0.7555377635441687, + 
"grad_norm": 0.9480708314986227, + "learning_rate": 2.9741522716809734e-06, + "loss": 1.5426, + "step": 5662 + }, + { + "epoch": 0.7556712036295703, + "grad_norm": 0.9365703829056337, + "learning_rate": 2.971077460208113e-06, + "loss": 1.5243, + "step": 5663 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 0.9664797943870821, + "learning_rate": 2.9680039616579558e-06, + "loss": 1.5866, + "step": 5664 + }, + { + "epoch": 0.7559380838003736, + "grad_norm": 0.9326504176989024, + "learning_rate": 2.964931776604596e-06, + "loss": 1.5703, + "step": 5665 + }, + { + "epoch": 0.7560715238857753, + "grad_norm": 0.9662933536347439, + "learning_rate": 2.9618609056218797e-06, + "loss": 1.5854, + "step": 5666 + }, + { + "epoch": 0.756204963971177, + "grad_norm": 1.0951318602049294, + "learning_rate": 2.9587913492834174e-06, + "loss": 1.5744, + "step": 5667 + }, + { + "epoch": 0.7563384040565786, + "grad_norm": 0.9387753384543569, + "learning_rate": 2.9557231081625637e-06, + "loss": 1.591, + "step": 5668 + }, + { + "epoch": 0.7564718441419802, + "grad_norm": 0.9885761349123516, + "learning_rate": 2.9526561828324286e-06, + "loss": 1.5488, + "step": 5669 + }, + { + "epoch": 0.7566052842273819, + "grad_norm": 0.9272808115972163, + "learning_rate": 2.9495905738658846e-06, + "loss": 1.5201, + "step": 5670 + }, + { + "epoch": 0.7567387243127836, + "grad_norm": 0.9268527670988117, + "learning_rate": 2.946526281835549e-06, + "loss": 1.571, + "step": 5671 + }, + { + "epoch": 0.7568721643981852, + "grad_norm": 1.0147610911957214, + "learning_rate": 2.9434633073137976e-06, + "loss": 1.586, + "step": 5672 + }, + { + "epoch": 0.7570056044835869, + "grad_norm": 0.9383855121743576, + "learning_rate": 2.940401650872755e-06, + "loss": 1.5405, + "step": 5673 + }, + { + "epoch": 0.7571390445689885, + "grad_norm": 1.0683692922351982, + "learning_rate": 2.9373413130843108e-06, + "loss": 1.5319, + "step": 5674 + }, + { + "epoch": 0.7572724846543902, + "grad_norm": 0.9501481416038345, + "learning_rate": 2.934282294520099e-06, + "loss": 1.5894, + "step": 5675 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 0.9176546375459858, + "learning_rate": 2.931224595751504e-06, + "loss": 1.4954, + "step": 5676 + }, + { + "epoch": 0.7575393648251935, + "grad_norm": 0.947992780591222, + "learning_rate": 2.9281682173496764e-06, + "loss": 1.5466, + "step": 5677 + }, + { + "epoch": 0.7576728049105952, + "grad_norm": 0.9790622488674117, + "learning_rate": 2.925113159885511e-06, + "loss": 1.5469, + "step": 5678 + }, + { + "epoch": 0.7578062449959968, + "grad_norm": 0.9636768377878497, + "learning_rate": 2.922059423929656e-06, + "loss": 1.5419, + "step": 5679 + }, + { + "epoch": 0.7579396850813984, + "grad_norm": 0.9219842776449305, + "learning_rate": 2.9190070100525124e-06, + "loss": 1.4979, + "step": 5680 + }, + { + "epoch": 0.7580731251668001, + "grad_norm": 0.9863967160238304, + "learning_rate": 2.9159559188242428e-06, + "loss": 1.5115, + "step": 5681 + }, + { + "epoch": 0.7582065652522018, + "grad_norm": 0.9540589917238507, + "learning_rate": 2.9129061508147514e-06, + "loss": 1.5829, + "step": 5682 + }, + { + "epoch": 0.7583400053376034, + "grad_norm": 0.9415496333259665, + "learning_rate": 2.9098577065937027e-06, + "loss": 1.5155, + "step": 5683 + }, + { + "epoch": 0.7584734454230051, + "grad_norm": 1.0264214386562416, + "learning_rate": 2.906810586730506e-06, + "loss": 1.5762, + "step": 5684 + }, + { + "epoch": 0.7586068855084067, + "grad_norm": 1.0085987174197029, + "learning_rate": 2.903764791794337e-06, + "loss": 1.6053, + 
"step": 5685 + }, + { + "epoch": 0.7587403255938083, + "grad_norm": 0.9746022220580336, + "learning_rate": 2.9007203223541104e-06, + "loss": 1.5505, + "step": 5686 + }, + { + "epoch": 0.75887376567921, + "grad_norm": 1.1436325712704063, + "learning_rate": 2.8976771789784986e-06, + "loss": 1.5532, + "step": 5687 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 0.9458780762382005, + "learning_rate": 2.894635362235928e-06, + "loss": 1.5335, + "step": 5688 + }, + { + "epoch": 0.7591406458500134, + "grad_norm": 0.9586718389826383, + "learning_rate": 2.8915948726945696e-06, + "loss": 1.5686, + "step": 5689 + }, + { + "epoch": 0.759274085935415, + "grad_norm": 0.9374091977052081, + "learning_rate": 2.8885557109223615e-06, + "loss": 1.5533, + "step": 5690 + }, + { + "epoch": 0.7594075260208166, + "grad_norm": 0.9210874426208345, + "learning_rate": 2.8855178774869784e-06, + "loss": 1.6033, + "step": 5691 + }, + { + "epoch": 0.7595409661062184, + "grad_norm": 0.9425967721286689, + "learning_rate": 2.882481372955855e-06, + "loss": 1.5192, + "step": 5692 + }, + { + "epoch": 0.75967440619162, + "grad_norm": 0.9409853280626861, + "learning_rate": 2.879446197896176e-06, + "loss": 1.5309, + "step": 5693 + }, + { + "epoch": 0.7598078462770216, + "grad_norm": 0.9175409560809977, + "learning_rate": 2.8764123528748724e-06, + "loss": 1.5253, + "step": 5694 + }, + { + "epoch": 0.7599412863624233, + "grad_norm": 1.0633079652522899, + "learning_rate": 2.8733798384586398e-06, + "loss": 1.5564, + "step": 5695 + }, + { + "epoch": 0.7600747264478249, + "grad_norm": 1.0083618844062685, + "learning_rate": 2.870348655213914e-06, + "loss": 1.5223, + "step": 5696 + }, + { + "epoch": 0.7602081665332265, + "grad_norm": 1.1484670895236269, + "learning_rate": 2.8673188037068843e-06, + "loss": 1.5488, + "step": 5697 + }, + { + "epoch": 0.7603416066186283, + "grad_norm": 0.9648612510439487, + "learning_rate": 2.8642902845034914e-06, + "loss": 1.62, + "step": 5698 + }, + { + "epoch": 0.7604750467040299, + "grad_norm": 0.9409488162926377, + "learning_rate": 2.8612630981694322e-06, + "loss": 1.5146, + "step": 5699 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 1.0185521319833666, + "learning_rate": 2.85823724527015e-06, + "loss": 1.5524, + "step": 5700 + }, + { + "epoch": 0.7607419268748332, + "grad_norm": 0.9261998030680365, + "learning_rate": 2.855212726370834e-06, + "loss": 1.5362, + "step": 5701 + }, + { + "epoch": 0.7608753669602348, + "grad_norm": 0.9131576322837639, + "learning_rate": 2.852189542036439e-06, + "loss": 1.5431, + "step": 5702 + }, + { + "epoch": 0.7610088070456366, + "grad_norm": 0.9672674722537751, + "learning_rate": 2.849167692831655e-06, + "loss": 1.5413, + "step": 5703 + }, + { + "epoch": 0.7611422471310382, + "grad_norm": 0.9672341780718317, + "learning_rate": 2.8461471793209318e-06, + "loss": 1.5816, + "step": 5704 + }, + { + "epoch": 0.7612756872164398, + "grad_norm": 1.024093269678312, + "learning_rate": 2.8431280020684615e-06, + "loss": 1.5202, + "step": 5705 + }, + { + "epoch": 0.7614091273018415, + "grad_norm": 0.9730376429468712, + "learning_rate": 2.840110161638201e-06, + "loss": 1.5619, + "step": 5706 + }, + { + "epoch": 0.7615425673872431, + "grad_norm": 0.9277118027287636, + "learning_rate": 2.8370936585938425e-06, + "loss": 1.5741, + "step": 5707 + }, + { + "epoch": 0.7616760074726447, + "grad_norm": 0.9385522263679275, + "learning_rate": 2.834078493498833e-06, + "loss": 1.5243, + "step": 5708 + }, + { + "epoch": 0.7618094475580465, + "grad_norm": 0.921781169116257, + 
"learning_rate": 2.8310646669163777e-06, + "loss": 1.5539, + "step": 5709 + }, + { + "epoch": 0.7619428876434481, + "grad_norm": 0.9708668936556057, + "learning_rate": 2.8280521794094205e-06, + "loss": 1.5721, + "step": 5710 + }, + { + "epoch": 0.7620763277288497, + "grad_norm": 0.9753523420806991, + "learning_rate": 2.8250410315406606e-06, + "loss": 1.5386, + "step": 5711 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 1.1177484636132677, + "learning_rate": 2.822031223872543e-06, + "loss": 1.5526, + "step": 5712 + }, + { + "epoch": 0.762343207899653, + "grad_norm": 0.9770528135249951, + "learning_rate": 2.8190227569672722e-06, + "loss": 1.5853, + "step": 5713 + }, + { + "epoch": 0.7624766479850547, + "grad_norm": 1.3032469298982818, + "learning_rate": 2.8160156313867915e-06, + "loss": 1.6291, + "step": 5714 + }, + { + "epoch": 0.7626100880704564, + "grad_norm": 0.9638843608824444, + "learning_rate": 2.813009847692795e-06, + "loss": 1.5735, + "step": 5715 + }, + { + "epoch": 0.762743528155858, + "grad_norm": 1.1791110749699807, + "learning_rate": 2.8100054064467355e-06, + "loss": 1.5285, + "step": 5716 + }, + { + "epoch": 0.7628769682412597, + "grad_norm": 16.86319433750556, + "learning_rate": 2.807002308209804e-06, + "loss": 1.5693, + "step": 5717 + }, + { + "epoch": 0.7630104083266613, + "grad_norm": 0.9439903919518662, + "learning_rate": 2.8040005535429472e-06, + "loss": 1.5474, + "step": 5718 + }, + { + "epoch": 0.763143848412063, + "grad_norm": 0.961074528153183, + "learning_rate": 2.8010001430068535e-06, + "loss": 1.5005, + "step": 5719 + }, + { + "epoch": 0.7632772884974647, + "grad_norm": 0.9524749124010052, + "learning_rate": 2.7980010771619727e-06, + "loss": 1.5212, + "step": 5720 + }, + { + "epoch": 0.7634107285828663, + "grad_norm": 0.9533980355725618, + "learning_rate": 2.795003356568492e-06, + "loss": 1.5894, + "step": 5721 + }, + { + "epoch": 0.7635441686682679, + "grad_norm": 0.9213712133256243, + "learning_rate": 2.792006981786354e-06, + "loss": 1.5208, + "step": 5722 + }, + { + "epoch": 0.7636776087536696, + "grad_norm": 0.9294323574854917, + "learning_rate": 2.7890119533752415e-06, + "loss": 1.5441, + "step": 5723 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 0.9784510201172563, + "learning_rate": 2.7860182718945993e-06, + "loss": 1.581, + "step": 5724 + }, + { + "epoch": 0.7639444889244729, + "grad_norm": 0.9588683754336024, + "learning_rate": 2.7830259379036095e-06, + "loss": 1.5307, + "step": 5725 + }, + { + "epoch": 0.7640779290098746, + "grad_norm": 0.9143625087310671, + "learning_rate": 2.7800349519612023e-06, + "loss": 1.5518, + "step": 5726 + }, + { + "epoch": 0.7642113690952762, + "grad_norm": 0.9792840961883873, + "learning_rate": 2.777045314626068e-06, + "loss": 1.4893, + "step": 5727 + }, + { + "epoch": 0.7643448091806779, + "grad_norm": 0.9591101744910538, + "learning_rate": 2.7740570264566325e-06, + "loss": 1.576, + "step": 5728 + }, + { + "epoch": 0.7644782492660795, + "grad_norm": 1.0493610713948585, + "learning_rate": 2.771070088011073e-06, + "loss": 1.5944, + "step": 5729 + }, + { + "epoch": 0.7646116893514812, + "grad_norm": 0.940470963878361, + "learning_rate": 2.7680844998473176e-06, + "loss": 1.6017, + "step": 5730 + }, + { + "epoch": 0.7647451294368829, + "grad_norm": 0.9114372371817887, + "learning_rate": 2.7651002625230394e-06, + "loss": 1.5043, + "step": 5731 + }, + { + "epoch": 0.7648785695222845, + "grad_norm": 0.9514804602724689, + "learning_rate": 2.7621173765956553e-06, + "loss": 1.556, + "step": 5732 + }, + { + "epoch": 
0.7650120096076861, + "grad_norm": 0.931967453227579, + "learning_rate": 2.7591358426223437e-06, + "loss": 1.5529, + "step": 5733 + }, + { + "epoch": 0.7651454496930878, + "grad_norm": 0.9641034664380269, + "learning_rate": 2.756155661160015e-06, + "loss": 1.5584, + "step": 5734 + }, + { + "epoch": 0.7652788897784895, + "grad_norm": 1.0406330527700243, + "learning_rate": 2.753176832765334e-06, + "loss": 1.5685, + "step": 5735 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 1.1171214620153784, + "learning_rate": 2.750199357994714e-06, + "loss": 1.5532, + "step": 5736 + }, + { + "epoch": 0.7655457699492928, + "grad_norm": 0.9866430386535575, + "learning_rate": 2.7472232374043084e-06, + "loss": 1.5261, + "step": 5737 + }, + { + "epoch": 0.7656792100346944, + "grad_norm": 0.9361087966486119, + "learning_rate": 2.7442484715500286e-06, + "loss": 1.5359, + "step": 5738 + }, + { + "epoch": 0.765812650120096, + "grad_norm": 0.9609366943677766, + "learning_rate": 2.741275060987525e-06, + "loss": 1.544, + "step": 5739 + }, + { + "epoch": 0.7659460902054978, + "grad_norm": 1.0456283664316062, + "learning_rate": 2.7383030062721926e-06, + "loss": 1.5098, + "step": 5740 + }, + { + "epoch": 0.7660795302908994, + "grad_norm": 0.9071979960109646, + "learning_rate": 2.7353323079591842e-06, + "loss": 1.5621, + "step": 5741 + }, + { + "epoch": 0.7662129703763011, + "grad_norm": 0.9422303587878669, + "learning_rate": 2.7323629666033903e-06, + "loss": 1.5661, + "step": 5742 + }, + { + "epoch": 0.7663464104617027, + "grad_norm": 1.0122385866589452, + "learning_rate": 2.729394982759449e-06, + "loss": 1.5319, + "step": 5743 + }, + { + "epoch": 0.7664798505471043, + "grad_norm": 1.1141139885458986, + "learning_rate": 2.726428356981742e-06, + "loss": 1.5295, + "step": 5744 + }, + { + "epoch": 0.766613290632506, + "grad_norm": 0.982862912585683, + "learning_rate": 2.7234630898244084e-06, + "loss": 1.5591, + "step": 5745 + }, + { + "epoch": 0.7667467307179077, + "grad_norm": 1.075501803864991, + "learning_rate": 2.720499181841324e-06, + "loss": 1.5615, + "step": 5746 + }, + { + "epoch": 0.7668801708033093, + "grad_norm": 1.028976839916406, + "learning_rate": 2.7175366335861087e-06, + "loss": 1.5477, + "step": 5747 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 1.0645308095468675, + "learning_rate": 2.714575445612139e-06, + "loss": 1.5542, + "step": 5748 + }, + { + "epoch": 0.7671470509741126, + "grad_norm": 0.9392883647391629, + "learning_rate": 2.7116156184725285e-06, + "loss": 1.5699, + "step": 5749 + }, + { + "epoch": 0.7672804910595142, + "grad_norm": 0.965446906684568, + "learning_rate": 2.708657152720139e-06, + "loss": 1.5796, + "step": 5750 + }, + { + "epoch": 0.767413931144916, + "grad_norm": 1.0070562909813836, + "learning_rate": 2.705700048907576e-06, + "loss": 1.5419, + "step": 5751 + }, + { + "epoch": 0.7675473712303176, + "grad_norm": 0.9415702130765551, + "learning_rate": 2.7027443075871974e-06, + "loss": 1.5344, + "step": 5752 + }, + { + "epoch": 0.7676808113157192, + "grad_norm": 1.007240476174913, + "learning_rate": 2.6997899293110997e-06, + "loss": 1.5223, + "step": 5753 + }, + { + "epoch": 0.7678142514011209, + "grad_norm": 0.9583583605966772, + "learning_rate": 2.696836914631127e-06, + "loss": 1.4856, + "step": 5754 + }, + { + "epoch": 0.7679476914865225, + "grad_norm": 0.9286247656808373, + "learning_rate": 2.6938852640988666e-06, + "loss": 1.5845, + "step": 5755 + }, + { + "epoch": 0.7680811315719243, + "grad_norm": 1.0066591400064955, + "learning_rate": 2.690934978265659e-06, + 
"loss": 1.5767, + "step": 5756 + }, + { + "epoch": 0.7682145716573259, + "grad_norm": 1.052076839954032, + "learning_rate": 2.68798605768258e-06, + "loss": 1.5927, + "step": 5757 + }, + { + "epoch": 0.7683480117427275, + "grad_norm": 1.0098347870196225, + "learning_rate": 2.685038502900452e-06, + "loss": 1.5831, + "step": 5758 + }, + { + "epoch": 0.7684814518281292, + "grad_norm": 1.0546240360318806, + "learning_rate": 2.682092314469851e-06, + "loss": 1.5924, + "step": 5759 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 0.985560507673923, + "learning_rate": 2.6791474929410877e-06, + "loss": 1.5819, + "step": 5760 + }, + { + "epoch": 0.7687483319989324, + "grad_norm": 0.9490509821938993, + "learning_rate": 2.6762040388642217e-06, + "loss": 1.5278, + "step": 5761 + }, + { + "epoch": 0.7688817720843342, + "grad_norm": 0.9668194942766439, + "learning_rate": 2.6732619527890547e-06, + "loss": 1.5719, + "step": 5762 + }, + { + "epoch": 0.7690152121697358, + "grad_norm": 1.0376817559211673, + "learning_rate": 2.670321235265139e-06, + "loss": 1.5694, + "step": 5763 + }, + { + "epoch": 0.7691486522551374, + "grad_norm": 1.1459513474130882, + "learning_rate": 2.6673818868417646e-06, + "loss": 1.5727, + "step": 5764 + }, + { + "epoch": 0.7692820923405391, + "grad_norm": 0.9389863744334982, + "learning_rate": 2.6644439080679662e-06, + "loss": 1.5679, + "step": 5765 + }, + { + "epoch": 0.7694155324259407, + "grad_norm": 0.9437950047313849, + "learning_rate": 2.6615072994925308e-06, + "loss": 1.5559, + "step": 5766 + }, + { + "epoch": 0.7695489725113424, + "grad_norm": 0.955667607095649, + "learning_rate": 2.65857206166398e-06, + "loss": 1.526, + "step": 5767 + }, + { + "epoch": 0.7696824125967441, + "grad_norm": 0.8988996567956318, + "learning_rate": 2.655638195130582e-06, + "loss": 1.4886, + "step": 5768 + }, + { + "epoch": 0.7698158526821457, + "grad_norm": 1.0552144404288448, + "learning_rate": 2.652705700440348e-06, + "loss": 1.5902, + "step": 5769 + }, + { + "epoch": 0.7699492927675474, + "grad_norm": 0.9570299871105503, + "learning_rate": 2.64977457814104e-06, + "loss": 1.5706, + "step": 5770 + }, + { + "epoch": 0.770082732852949, + "grad_norm": 0.9664283679747206, + "learning_rate": 2.6468448287801552e-06, + "loss": 1.5595, + "step": 5771 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 0.929178580968516, + "learning_rate": 2.6439164529049376e-06, + "loss": 1.5301, + "step": 5772 + }, + { + "epoch": 0.7703496130237524, + "grad_norm": 0.9305117830654621, + "learning_rate": 2.6409894510623744e-06, + "loss": 1.5678, + "step": 5773 + }, + { + "epoch": 0.770483053109154, + "grad_norm": 1.0322434008439814, + "learning_rate": 2.6380638237991963e-06, + "loss": 1.5298, + "step": 5774 + }, + { + "epoch": 0.7706164931945556, + "grad_norm": 0.9590848759112884, + "learning_rate": 2.6351395716618746e-06, + "loss": 1.5437, + "step": 5775 + }, + { + "epoch": 0.7707499332799573, + "grad_norm": 0.9231804585207396, + "learning_rate": 2.632216695196631e-06, + "loss": 1.4736, + "step": 5776 + }, + { + "epoch": 0.770883373365359, + "grad_norm": 0.9786231108386787, + "learning_rate": 2.6292951949494237e-06, + "loss": 1.5593, + "step": 5777 + }, + { + "epoch": 0.7710168134507606, + "grad_norm": 0.9603881195608965, + "learning_rate": 2.626375071465955e-06, + "loss": 1.5821, + "step": 5778 + }, + { + "epoch": 0.7711502535361623, + "grad_norm": 0.9430362885685091, + "learning_rate": 2.623456325291669e-06, + "loss": 1.5404, + "step": 5779 + }, + { + "epoch": 0.7712836936215639, + "grad_norm": 
0.9446912811828378, + "learning_rate": 2.6205389569717586e-06, + "loss": 1.5394, + "step": 5780 + }, + { + "epoch": 0.7714171337069655, + "grad_norm": 0.9412298808949228, + "learning_rate": 2.6176229670511533e-06, + "loss": 1.5538, + "step": 5781 + }, + { + "epoch": 0.7715505737923672, + "grad_norm": 1.3095768335252589, + "learning_rate": 2.6147083560745257e-06, + "loss": 1.5564, + "step": 5782 + }, + { + "epoch": 0.7716840138777689, + "grad_norm": 0.9918252751196945, + "learning_rate": 2.6117951245862893e-06, + "loss": 1.5451, + "step": 5783 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 0.996958048566631, + "learning_rate": 2.608883273130609e-06, + "loss": 1.5229, + "step": 5784 + }, + { + "epoch": 0.7719508940485722, + "grad_norm": 0.957110530564296, + "learning_rate": 2.6059728022513832e-06, + "loss": 1.55, + "step": 5785 + }, + { + "epoch": 0.7720843341339738, + "grad_norm": 1.0399836848407493, + "learning_rate": 2.603063712492252e-06, + "loss": 1.579, + "step": 5786 + }, + { + "epoch": 0.7722177742193755, + "grad_norm": 0.9408064311310185, + "learning_rate": 2.6001560043966e-06, + "loss": 1.5572, + "step": 5787 + }, + { + "epoch": 0.7723512143047772, + "grad_norm": 0.9412201777682861, + "learning_rate": 2.5972496785075594e-06, + "loss": 1.544, + "step": 5788 + }, + { + "epoch": 0.7724846543901788, + "grad_norm": 1.0115310422207648, + "learning_rate": 2.594344735367995e-06, + "loss": 1.5008, + "step": 5789 + }, + { + "epoch": 0.7726180944755805, + "grad_norm": 0.9227438226603408, + "learning_rate": 2.591441175520514e-06, + "loss": 1.5712, + "step": 5790 + }, + { + "epoch": 0.7727515345609821, + "grad_norm": 0.9594213056108911, + "learning_rate": 2.5885389995074752e-06, + "loss": 1.5903, + "step": 5791 + }, + { + "epoch": 0.7728849746463837, + "grad_norm": 1.086165595831062, + "learning_rate": 2.5856382078709685e-06, + "loss": 1.5691, + "step": 5792 + }, + { + "epoch": 0.7730184147317855, + "grad_norm": 0.9505020761449048, + "learning_rate": 2.5827388011528296e-06, + "loss": 1.5793, + "step": 5793 + }, + { + "epoch": 0.7731518548171871, + "grad_norm": 3.5110780857670107, + "learning_rate": 2.57984077989463e-06, + "loss": 1.5527, + "step": 5794 + }, + { + "epoch": 0.7732852949025887, + "grad_norm": 0.9455488863259419, + "learning_rate": 2.576944144637694e-06, + "loss": 1.5232, + "step": 5795 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 0.9672606415809805, + "learning_rate": 2.574048895923078e-06, + "loss": 1.5559, + "step": 5796 + }, + { + "epoch": 0.773552175073392, + "grad_norm": 0.9385093821290592, + "learning_rate": 2.571155034291577e-06, + "loss": 1.5565, + "step": 5797 + }, + { + "epoch": 0.7736856151587937, + "grad_norm": 0.9520678330195554, + "learning_rate": 2.5682625602837384e-06, + "loss": 1.5558, + "step": 5798 + }, + { + "epoch": 0.7738190552441954, + "grad_norm": 0.99051318050388, + "learning_rate": 2.5653714744398393e-06, + "loss": 1.5109, + "step": 5799 + }, + { + "epoch": 0.773952495329597, + "grad_norm": 0.984714941570732, + "learning_rate": 2.5624817772999033e-06, + "loss": 1.5115, + "step": 5800 + }, + { + "epoch": 0.7740859354149987, + "grad_norm": 0.9194604559966288, + "learning_rate": 2.5595934694036882e-06, + "loss": 1.4929, + "step": 5801 + }, + { + "epoch": 0.7742193755004003, + "grad_norm": 0.9247157704040789, + "learning_rate": 2.556706551290704e-06, + "loss": 1.5369, + "step": 5802 + }, + { + "epoch": 0.7743528155858019, + "grad_norm": 0.9360390213094082, + "learning_rate": 2.5538210235001913e-06, + "loss": 1.5576, + "step": 5803 + }, + { + 
"epoch": 0.7744862556712037, + "grad_norm": 0.9064471735808803, + "learning_rate": 2.5509368865711304e-06, + "loss": 1.5415, + "step": 5804 + }, + { + "epoch": 0.7746196957566053, + "grad_norm": 0.9513137240894838, + "learning_rate": 2.548054141042251e-06, + "loss": 1.5708, + "step": 5805 + }, + { + "epoch": 0.7747531358420069, + "grad_norm": 1.2175807518906996, + "learning_rate": 2.5451727874520148e-06, + "loss": 1.514, + "step": 5806 + }, + { + "epoch": 0.7748865759274086, + "grad_norm": 0.9892844938681642, + "learning_rate": 2.542292826338626e-06, + "loss": 1.5524, + "step": 5807 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 1.0937981716259788, + "learning_rate": 2.5394142582400238e-06, + "loss": 1.4936, + "step": 5808 + }, + { + "epoch": 0.775153456098212, + "grad_norm": 1.169626597958446, + "learning_rate": 2.5365370836938997e-06, + "loss": 1.5551, + "step": 5809 + }, + { + "epoch": 0.7752868961836136, + "grad_norm": 0.9516314581228, + "learning_rate": 2.5336613032376744e-06, + "loss": 1.5305, + "step": 5810 + }, + { + "epoch": 0.7754203362690152, + "grad_norm": 0.9404267765234818, + "learning_rate": 2.5307869174085086e-06, + "loss": 1.5901, + "step": 5811 + }, + { + "epoch": 0.7755537763544169, + "grad_norm": 0.9359769036295512, + "learning_rate": 2.527913926743305e-06, + "loss": 1.5633, + "step": 5812 + }, + { + "epoch": 0.7756872164398185, + "grad_norm": 0.9494830051173815, + "learning_rate": 2.5250423317787086e-06, + "loss": 1.5296, + "step": 5813 + }, + { + "epoch": 0.7758206565252201, + "grad_norm": 0.9336095498356144, + "learning_rate": 2.522172133051101e-06, + "loss": 1.5527, + "step": 5814 + }, + { + "epoch": 0.7759540966106219, + "grad_norm": 0.9206894715103576, + "learning_rate": 2.5193033310965987e-06, + "loss": 1.5765, + "step": 5815 + }, + { + "epoch": 0.7760875366960235, + "grad_norm": 0.9285054392212316, + "learning_rate": 2.516435926451064e-06, + "loss": 1.5186, + "step": 5816 + }, + { + "epoch": 0.7762209767814251, + "grad_norm": 0.9469454273921929, + "learning_rate": 2.513569919650095e-06, + "loss": 1.5748, + "step": 5817 + }, + { + "epoch": 0.7763544168668268, + "grad_norm": 1.0422340407226904, + "learning_rate": 2.510705311229025e-06, + "loss": 1.5367, + "step": 5818 + }, + { + "epoch": 0.7764878569522284, + "grad_norm": 0.9341799059535522, + "learning_rate": 2.507842101722938e-06, + "loss": 1.5679, + "step": 5819 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 0.9183397021685292, + "learning_rate": 2.5049802916666445e-06, + "loss": 1.5037, + "step": 5820 + }, + { + "epoch": 0.7767547371230318, + "grad_norm": 0.932957030523527, + "learning_rate": 2.5021198815946978e-06, + "loss": 1.5195, + "step": 5821 + }, + { + "epoch": 0.7768881772084334, + "grad_norm": 0.9241883977653392, + "learning_rate": 2.4992608720413893e-06, + "loss": 1.5227, + "step": 5822 + }, + { + "epoch": 0.7770216172938351, + "grad_norm": 1.2172435346645842, + "learning_rate": 2.496403263540752e-06, + "loss": 1.5402, + "step": 5823 + }, + { + "epoch": 0.7771550573792367, + "grad_norm": 0.9449596951384638, + "learning_rate": 2.493547056626554e-06, + "loss": 1.5355, + "step": 5824 + }, + { + "epoch": 0.7772884974646384, + "grad_norm": 1.0524048742416596, + "learning_rate": 2.490692251832302e-06, + "loss": 1.5999, + "step": 5825 + }, + { + "epoch": 0.7774219375500401, + "grad_norm": 0.984791837305669, + "learning_rate": 2.4878388496912377e-06, + "loss": 1.5482, + "step": 5826 + }, + { + "epoch": 0.7775553776354417, + "grad_norm": 0.9575185903107368, + "learning_rate": 
2.4849868507363506e-06, + "loss": 1.5152, + "step": 5827 + }, + { + "epoch": 0.7776888177208433, + "grad_norm": 0.9431849505627886, + "learning_rate": 2.482136255500357e-06, + "loss": 1.5557, + "step": 5828 + }, + { + "epoch": 0.777822257806245, + "grad_norm": 0.9027138718964596, + "learning_rate": 2.479287064515714e-06, + "loss": 1.5192, + "step": 5829 + }, + { + "epoch": 0.7779556978916466, + "grad_norm": 0.9769963398115894, + "learning_rate": 2.476439278314624e-06, + "loss": 1.5003, + "step": 5830 + }, + { + "epoch": 0.7780891379770483, + "grad_norm": 1.0871003730614877, + "learning_rate": 2.473592897429018e-06, + "loss": 1.5801, + "step": 5831 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 0.9763548093718147, + "learning_rate": 2.470747922390567e-06, + "loss": 1.5494, + "step": 5832 + }, + { + "epoch": 0.7783560181478516, + "grad_norm": 0.984785198847868, + "learning_rate": 2.467904353730678e-06, + "loss": 1.5478, + "step": 5833 + }, + { + "epoch": 0.7784894582332532, + "grad_norm": 0.9601413586909818, + "learning_rate": 2.465062191980503e-06, + "loss": 1.5631, + "step": 5834 + }, + { + "epoch": 0.7786228983186549, + "grad_norm": 1.0051540823117064, + "learning_rate": 2.462221437670921e-06, + "loss": 1.5519, + "step": 5835 + }, + { + "epoch": 0.7787563384040566, + "grad_norm": 1.0448166746922907, + "learning_rate": 2.4593820913325507e-06, + "loss": 1.5673, + "step": 5836 + }, + { + "epoch": 0.7788897784894583, + "grad_norm": 1.0626411844360724, + "learning_rate": 2.4565441534957558e-06, + "loss": 1.5583, + "step": 5837 + }, + { + "epoch": 0.7790232185748599, + "grad_norm": 1.0838658214565313, + "learning_rate": 2.453707624690628e-06, + "loss": 1.5399, + "step": 5838 + }, + { + "epoch": 0.7791566586602615, + "grad_norm": 1.0694003456730499, + "learning_rate": 2.4508725054469973e-06, + "loss": 1.5874, + "step": 5839 + }, + { + "epoch": 0.7792900987456632, + "grad_norm": 1.0215015403096692, + "learning_rate": 2.4480387962944297e-06, + "loss": 1.545, + "step": 5840 + }, + { + "epoch": 0.7794235388310649, + "grad_norm": 0.947422869064629, + "learning_rate": 2.4452064977622357e-06, + "loss": 1.541, + "step": 5841 + }, + { + "epoch": 0.7795569789164665, + "grad_norm": 0.9555266023407392, + "learning_rate": 2.4423756103794538e-06, + "loss": 1.5314, + "step": 5842 + }, + { + "epoch": 0.7796904190018682, + "grad_norm": 1.012624888514248, + "learning_rate": 2.439546134674858e-06, + "loss": 1.5672, + "step": 5843 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 1.2038001979285369, + "learning_rate": 2.436718071176969e-06, + "loss": 1.5503, + "step": 5844 + }, + { + "epoch": 0.7799572991726714, + "grad_norm": 0.9750829570129861, + "learning_rate": 2.4338914204140317e-06, + "loss": 1.5958, + "step": 5845 + }, + { + "epoch": 0.7800907392580732, + "grad_norm": 1.0711410399768173, + "learning_rate": 2.4310661829140335e-06, + "loss": 1.5094, + "step": 5846 + }, + { + "epoch": 0.7802241793434748, + "grad_norm": 0.960236661793123, + "learning_rate": 2.4282423592046956e-06, + "loss": 1.5084, + "step": 5847 + }, + { + "epoch": 0.7803576194288764, + "grad_norm": 0.9702210353839494, + "learning_rate": 2.425419949813479e-06, + "loss": 1.5571, + "step": 5848 + }, + { + "epoch": 0.7804910595142781, + "grad_norm": 0.9253780960251425, + "learning_rate": 2.4225989552675756e-06, + "loss": 1.579, + "step": 5849 + }, + { + "epoch": 0.7806244995996797, + "grad_norm": 0.9233717044203158, + "learning_rate": 2.419779376093916e-06, + "loss": 1.5668, + "step": 5850 + }, + { + "epoch": 0.7807579396850814, + 
"grad_norm": 0.9348350250160402, + "learning_rate": 2.416961212819162e-06, + "loss": 1.536, + "step": 5851 + }, + { + "epoch": 0.7808913797704831, + "grad_norm": 0.9548401099327721, + "learning_rate": 2.41414446596972e-06, + "loss": 1.5628, + "step": 5852 + }, + { + "epoch": 0.7810248198558847, + "grad_norm": 0.956552292282773, + "learning_rate": 2.411329136071724e-06, + "loss": 1.6029, + "step": 5853 + }, + { + "epoch": 0.7811582599412864, + "grad_norm": 0.9985446373657281, + "learning_rate": 2.4085152236510445e-06, + "loss": 1.5279, + "step": 5854 + }, + { + "epoch": 0.781291700026688, + "grad_norm": 0.9286530398266631, + "learning_rate": 2.4057027292332857e-06, + "loss": 1.5534, + "step": 5855 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 0.9838982529058777, + "learning_rate": 2.4028916533437963e-06, + "loss": 1.5773, + "step": 5856 + }, + { + "epoch": 0.7815585801974914, + "grad_norm": 0.9685070922452822, + "learning_rate": 2.4000819965076506e-06, + "loss": 1.5407, + "step": 5857 + }, + { + "epoch": 0.781692020282893, + "grad_norm": 0.9289471086756744, + "learning_rate": 2.3972737592496576e-06, + "loss": 1.5773, + "step": 5858 + }, + { + "epoch": 0.7818254603682946, + "grad_norm": 0.9607754330554751, + "learning_rate": 2.3944669420943678e-06, + "loss": 1.4774, + "step": 5859 + }, + { + "epoch": 0.7819589004536963, + "grad_norm": 1.0361591280939995, + "learning_rate": 2.3916615455660574e-06, + "loss": 1.5427, + "step": 5860 + }, + { + "epoch": 0.7820923405390979, + "grad_norm": 0.9200656629737055, + "learning_rate": 2.3888575701887483e-06, + "loss": 1.5319, + "step": 5861 + }, + { + "epoch": 0.7822257806244995, + "grad_norm": 1.2418620489944565, + "learning_rate": 2.3860550164861908e-06, + "loss": 1.5005, + "step": 5862 + }, + { + "epoch": 0.7823592207099013, + "grad_norm": 0.9438642577089267, + "learning_rate": 2.3832538849818663e-06, + "loss": 1.5266, + "step": 5863 + }, + { + "epoch": 0.7824926607953029, + "grad_norm": 0.9488414068156699, + "learning_rate": 2.3804541761989974e-06, + "loss": 1.568, + "step": 5864 + }, + { + "epoch": 0.7826261008807046, + "grad_norm": 0.9217470739999812, + "learning_rate": 2.377655890660533e-06, + "loss": 1.4917, + "step": 5865 + }, + { + "epoch": 0.7827595409661062, + "grad_norm": 0.9403617005912336, + "learning_rate": 2.374859028889168e-06, + "loss": 1.5904, + "step": 5866 + }, + { + "epoch": 0.7828929810515078, + "grad_norm": 0.9377360220679098, + "learning_rate": 2.372063591407321e-06, + "loss": 1.529, + "step": 5867 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 0.9289834664828129, + "learning_rate": 2.3692695787371443e-06, + "loss": 1.5424, + "step": 5868 + }, + { + "epoch": 0.7831598612223112, + "grad_norm": 0.9284928064699219, + "learning_rate": 2.366476991400535e-06, + "loss": 1.5683, + "step": 5869 + }, + { + "epoch": 0.7832933013077128, + "grad_norm": 0.9414971322161758, + "learning_rate": 2.3636858299191113e-06, + "loss": 1.5869, + "step": 5870 + }, + { + "epoch": 0.7834267413931145, + "grad_norm": 1.049554007951963, + "learning_rate": 2.3608960948142334e-06, + "loss": 1.581, + "step": 5871 + }, + { + "epoch": 0.7835601814785161, + "grad_norm": 0.9261270821311429, + "learning_rate": 2.3581077866069868e-06, + "loss": 1.5777, + "step": 5872 + }, + { + "epoch": 0.7836936215639178, + "grad_norm": 0.9228475363078417, + "learning_rate": 2.3553209058182025e-06, + "loss": 1.5916, + "step": 5873 + }, + { + "epoch": 0.7838270616493195, + "grad_norm": 0.9372723449214597, + "learning_rate": 2.3525354529684354e-06, + "loss": 1.4841, + 
"step": 5874 + }, + { + "epoch": 0.7839605017347211, + "grad_norm": 1.0271631020714844, + "learning_rate": 2.349751428577972e-06, + "loss": 1.5588, + "step": 5875 + }, + { + "epoch": 0.7840939418201228, + "grad_norm": 0.9083784084217819, + "learning_rate": 2.346968833166844e-06, + "loss": 1.5394, + "step": 5876 + }, + { + "epoch": 0.7842273819055244, + "grad_norm": 0.950058509885086, + "learning_rate": 2.3441876672548046e-06, + "loss": 1.5469, + "step": 5877 + }, + { + "epoch": 0.784360821990926, + "grad_norm": 0.9649329509240152, + "learning_rate": 2.341407931361345e-06, + "loss": 1.6095, + "step": 5878 + }, + { + "epoch": 0.7844942620763278, + "grad_norm": 0.9865778802161118, + "learning_rate": 2.338629626005684e-06, + "loss": 1.5117, + "step": 5879 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 0.9479757386340179, + "learning_rate": 2.335852751706784e-06, + "loss": 1.5221, + "step": 5880 + }, + { + "epoch": 0.784761142247131, + "grad_norm": 1.0079567810813972, + "learning_rate": 2.3330773089833303e-06, + "loss": 1.5785, + "step": 5881 + }, + { + "epoch": 0.7848945823325327, + "grad_norm": 0.9604536184535951, + "learning_rate": 2.3303032983537445e-06, + "loss": 1.5498, + "step": 5882 + }, + { + "epoch": 0.7850280224179343, + "grad_norm": 0.9554406788386689, + "learning_rate": 2.327530720336176e-06, + "loss": 1.4614, + "step": 5883 + }, + { + "epoch": 0.785161462503336, + "grad_norm": 0.9339548764005035, + "learning_rate": 2.3247595754485185e-06, + "loss": 1.5512, + "step": 5884 + }, + { + "epoch": 0.7852949025887377, + "grad_norm": 0.9499142303394195, + "learning_rate": 2.321989864208386e-06, + "loss": 1.5241, + "step": 5885 + }, + { + "epoch": 0.7854283426741393, + "grad_norm": 0.976577687622226, + "learning_rate": 2.319221587133127e-06, + "loss": 1.5734, + "step": 5886 + }, + { + "epoch": 0.7855617827595409, + "grad_norm": 1.0810206584465782, + "learning_rate": 2.316454744739829e-06, + "loss": 1.5224, + "step": 5887 + }, + { + "epoch": 0.7856952228449426, + "grad_norm": 0.9179787586115475, + "learning_rate": 2.313689337545304e-06, + "loss": 1.472, + "step": 5888 + }, + { + "epoch": 0.7858286629303443, + "grad_norm": 0.9423241752395348, + "learning_rate": 2.310925366066099e-06, + "loss": 1.5131, + "step": 5889 + }, + { + "epoch": 0.785962103015746, + "grad_norm": 1.0074056359823236, + "learning_rate": 2.308162830818489e-06, + "loss": 1.5842, + "step": 5890 + }, + { + "epoch": 0.7860955431011476, + "grad_norm": 1.140943510869096, + "learning_rate": 2.3054017323184907e-06, + "loss": 1.5383, + "step": 5891 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 0.9686268722021224, + "learning_rate": 2.3026420710818433e-06, + "loss": 1.569, + "step": 5892 + }, + { + "epoch": 0.7863624232719509, + "grad_norm": 1.0903976628772232, + "learning_rate": 2.299883847624015e-06, + "loss": 1.5378, + "step": 5893 + }, + { + "epoch": 0.7864958633573526, + "grad_norm": 1.0249636474092916, + "learning_rate": 2.297127062460218e-06, + "loss": 1.4946, + "step": 5894 + }, + { + "epoch": 0.7866293034427542, + "grad_norm": 0.9160337674447874, + "learning_rate": 2.2943717161053856e-06, + "loss": 1.563, + "step": 5895 + }, + { + "epoch": 0.7867627435281559, + "grad_norm": 0.992905443952185, + "learning_rate": 2.2916178090741848e-06, + "loss": 1.5425, + "step": 5896 + }, + { + "epoch": 0.7868961836135575, + "grad_norm": 1.0099924911665865, + "learning_rate": 2.2888653418810114e-06, + "loss": 1.5827, + "step": 5897 + }, + { + "epoch": 0.7870296236989591, + "grad_norm": 0.9732471135416901, + "learning_rate": 
2.2861143150400002e-06, + "loss": 1.5571, + "step": 5898 + }, + { + "epoch": 0.7871630637843609, + "grad_norm": 0.9352056643017831, + "learning_rate": 2.2833647290650084e-06, + "loss": 1.526, + "step": 5899 + }, + { + "epoch": 0.7872965038697625, + "grad_norm": 0.9444113116845154, + "learning_rate": 2.2806165844696284e-06, + "loss": 1.5729, + "step": 5900 + }, + { + "epoch": 0.7874299439551641, + "grad_norm": 0.9816140583735885, + "learning_rate": 2.277869881767182e-06, + "loss": 1.5734, + "step": 5901 + }, + { + "epoch": 0.7875633840405658, + "grad_norm": 1.2377672190640088, + "learning_rate": 2.2751246214707223e-06, + "loss": 1.5564, + "step": 5902 + }, + { + "epoch": 0.7876968241259674, + "grad_norm": 0.9171027423735342, + "learning_rate": 2.2723808040930282e-06, + "loss": 1.4997, + "step": 5903 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 1.0105329476647573, + "learning_rate": 2.2696384301466224e-06, + "loss": 1.5311, + "step": 5904 + }, + { + "epoch": 0.7879637042967708, + "grad_norm": 0.9299980163921587, + "learning_rate": 2.2668975001437433e-06, + "loss": 1.5344, + "step": 5905 + }, + { + "epoch": 0.7880971443821724, + "grad_norm": 0.9457043818410217, + "learning_rate": 2.264158014596366e-06, + "loss": 1.5521, + "step": 5906 + }, + { + "epoch": 0.7882305844675741, + "grad_norm": 1.2092222254834237, + "learning_rate": 2.2614199740161935e-06, + "loss": 1.5422, + "step": 5907 + }, + { + "epoch": 0.7883640245529757, + "grad_norm": 1.0681054745909435, + "learning_rate": 2.2586833789146657e-06, + "loss": 1.5889, + "step": 5908 + }, + { + "epoch": 0.7884974646383773, + "grad_norm": 0.9368681322416695, + "learning_rate": 2.2559482298029447e-06, + "loss": 1.5816, + "step": 5909 + }, + { + "epoch": 0.7886309047237791, + "grad_norm": 0.9228201723152973, + "learning_rate": 2.2532145271919247e-06, + "loss": 1.5449, + "step": 5910 + }, + { + "epoch": 0.7887643448091807, + "grad_norm": 0.943660701996512, + "learning_rate": 2.250482271592228e-06, + "loss": 1.5683, + "step": 5911 + }, + { + "epoch": 0.7888977848945823, + "grad_norm": 0.929628779619005, + "learning_rate": 2.247751463514214e-06, + "loss": 1.5122, + "step": 5912 + }, + { + "epoch": 0.789031224979984, + "grad_norm": 0.9387161790518015, + "learning_rate": 2.245022103467964e-06, + "loss": 1.5627, + "step": 5913 + }, + { + "epoch": 0.7891646650653856, + "grad_norm": 1.0924957479088928, + "learning_rate": 2.242294191963291e-06, + "loss": 1.5348, + "step": 5914 + }, + { + "epoch": 0.7892981051507872, + "grad_norm": 0.9504151391255395, + "learning_rate": 2.2395677295097362e-06, + "loss": 1.523, + "step": 5915 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 0.9629607665332615, + "learning_rate": 2.236842716616575e-06, + "loss": 1.549, + "step": 5916 + }, + { + "epoch": 0.7895649853215906, + "grad_norm": 1.0660833347570207, + "learning_rate": 2.2341191537928087e-06, + "loss": 1.5227, + "step": 5917 + }, + { + "epoch": 0.7896984254069923, + "grad_norm": 0.953518244934922, + "learning_rate": 2.2313970415471643e-06, + "loss": 1.5568, + "step": 5918 + }, + { + "epoch": 0.7898318654923939, + "grad_norm": 1.0025411935106783, + "learning_rate": 2.2286763803881052e-06, + "loss": 1.5277, + "step": 5919 + }, + { + "epoch": 0.7899653055777955, + "grad_norm": 0.9253963712749954, + "learning_rate": 2.2259571708238194e-06, + "loss": 1.506, + "step": 5920 + }, + { + "epoch": 0.7900987456631973, + "grad_norm": 0.9896713961958903, + "learning_rate": 2.2232394133622225e-06, + "loss": 1.5484, + "step": 5921 + }, + { + "epoch": 0.7902321857485989, 
+ "grad_norm": 1.1168051427517722, + "learning_rate": 2.220523108510959e-06, + "loss": 1.5591, + "step": 5922 + }, + { + "epoch": 0.7903656258340005, + "grad_norm": 0.9692435749687704, + "learning_rate": 2.2178082567774086e-06, + "loss": 1.5639, + "step": 5923 + }, + { + "epoch": 0.7904990659194022, + "grad_norm": 1.0227885427358632, + "learning_rate": 2.2150948586686728e-06, + "loss": 1.5236, + "step": 5924 + }, + { + "epoch": 0.7906325060048038, + "grad_norm": 0.9422956336171604, + "learning_rate": 2.21238291469158e-06, + "loss": 1.5855, + "step": 5925 + }, + { + "epoch": 0.7907659460902055, + "grad_norm": 1.3638108258879078, + "learning_rate": 2.2096724253526956e-06, + "loss": 1.5657, + "step": 5926 + }, + { + "epoch": 0.7908993861756072, + "grad_norm": 0.8997297461212911, + "learning_rate": 2.2069633911583067e-06, + "loss": 1.5429, + "step": 5927 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 0.964122119886273, + "learning_rate": 2.204255812614429e-06, + "loss": 1.5413, + "step": 5928 + }, + { + "epoch": 0.7911662663464104, + "grad_norm": 0.9482056082342423, + "learning_rate": 2.201549690226804e-06, + "loss": 1.5358, + "step": 5929 + }, + { + "epoch": 0.7912997064318121, + "grad_norm": 1.0861037743666542, + "learning_rate": 2.198845024500912e-06, + "loss": 1.5515, + "step": 5930 + }, + { + "epoch": 0.7914331465172137, + "grad_norm": 0.9464827990376123, + "learning_rate": 2.196141815941949e-06, + "loss": 1.5723, + "step": 5931 + }, + { + "epoch": 0.7915665866026155, + "grad_norm": 0.934897802356449, + "learning_rate": 2.193440065054843e-06, + "loss": 1.5334, + "step": 5932 + }, + { + "epoch": 0.7917000266880171, + "grad_norm": 0.9891163916956548, + "learning_rate": 2.1907397723442536e-06, + "loss": 1.5606, + "step": 5933 + }, + { + "epoch": 0.7918334667734187, + "grad_norm": 0.9667703054129473, + "learning_rate": 2.188040938314564e-06, + "loss": 1.5315, + "step": 5934 + }, + { + "epoch": 0.7919669068588204, + "grad_norm": 0.9296819166809209, + "learning_rate": 2.1853435634698837e-06, + "loss": 1.5643, + "step": 5935 + }, + { + "epoch": 0.792100346944222, + "grad_norm": 0.9552094294250992, + "learning_rate": 2.1826476483140503e-06, + "loss": 1.5044, + "step": 5936 + }, + { + "epoch": 0.7922337870296237, + "grad_norm": 1.0811395195378388, + "learning_rate": 2.1799531933506346e-06, + "loss": 1.5888, + "step": 5937 + }, + { + "epoch": 0.7923672271150254, + "grad_norm": 1.0395532411915318, + "learning_rate": 2.177260199082928e-06, + "loss": 1.5514, + "step": 5938 + }, + { + "epoch": 0.792500667200427, + "grad_norm": 0.9561248759072963, + "learning_rate": 2.174568666013951e-06, + "loss": 1.5294, + "step": 5939 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 0.9242513920785822, + "learning_rate": 2.1718785946464483e-06, + "loss": 1.5342, + "step": 5940 + }, + { + "epoch": 0.7927675473712303, + "grad_norm": 0.9534837808168115, + "learning_rate": 2.1691899854829014e-06, + "loss": 1.5325, + "step": 5941 + }, + { + "epoch": 0.792900987456632, + "grad_norm": 0.9322289028522723, + "learning_rate": 2.1665028390255073e-06, + "loss": 1.5002, + "step": 5942 + }, + { + "epoch": 0.7930344275420337, + "grad_norm": 1.1723532471213491, + "learning_rate": 2.1638171557761943e-06, + "loss": 1.5653, + "step": 5943 + }, + { + "epoch": 0.7931678676274353, + "grad_norm": 1.1432508740043539, + "learning_rate": 2.1611329362366195e-06, + "loss": 1.5889, + "step": 5944 + }, + { + "epoch": 0.7933013077128369, + "grad_norm": 0.9622094585479455, + "learning_rate": 2.1584501809081628e-06, + "loss": 1.604, + 
"step": 5945 + }, + { + "epoch": 0.7934347477982386, + "grad_norm": 0.9865589871665827, + "learning_rate": 2.1557688902919305e-06, + "loss": 1.5216, + "step": 5946 + }, + { + "epoch": 0.7935681878836403, + "grad_norm": 1.0241441606366657, + "learning_rate": 2.1530890648887628e-06, + "loss": 1.5134, + "step": 5947 + }, + { + "epoch": 0.7937016279690419, + "grad_norm": 0.9301015988878937, + "learning_rate": 2.150410705199216e-06, + "loss": 1.5294, + "step": 5948 + }, + { + "epoch": 0.7938350680544436, + "grad_norm": 0.9621334353445851, + "learning_rate": 2.147733811723579e-06, + "loss": 1.5781, + "step": 5949 + }, + { + "epoch": 0.7939685081398452, + "grad_norm": 0.9655006569053254, + "learning_rate": 2.145058384961862e-06, + "loss": 1.5538, + "step": 5950 + }, + { + "epoch": 0.7941019482252468, + "grad_norm": 1.0251786646717194, + "learning_rate": 2.1423844254138105e-06, + "loss": 1.5959, + "step": 5951 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 0.9424616777571333, + "learning_rate": 2.139711933578885e-06, + "loss": 1.5048, + "step": 5952 + }, + { + "epoch": 0.7943688283960502, + "grad_norm": 0.9412547582328011, + "learning_rate": 2.1370409099562774e-06, + "loss": 1.5633, + "step": 5953 + }, + { + "epoch": 0.7945022684814518, + "grad_norm": 1.016228232058081, + "learning_rate": 2.134371355044902e-06, + "loss": 1.5567, + "step": 5954 + }, + { + "epoch": 0.7946357085668535, + "grad_norm": 0.9500451975852726, + "learning_rate": 2.131703269343407e-06, + "loss": 1.5571, + "step": 5955 + }, + { + "epoch": 0.7947691486522551, + "grad_norm": 0.9985865820172933, + "learning_rate": 2.1290366533501572e-06, + "loss": 1.5439, + "step": 5956 + }, + { + "epoch": 0.7949025887376568, + "grad_norm": 1.0616401273574188, + "learning_rate": 2.126371507563244e-06, + "loss": 1.5213, + "step": 5957 + }, + { + "epoch": 0.7950360288230585, + "grad_norm": 0.9523800662623815, + "learning_rate": 2.1237078324804906e-06, + "loss": 1.5169, + "step": 5958 + }, + { + "epoch": 0.7951694689084601, + "grad_norm": 0.9444862343077112, + "learning_rate": 2.1210456285994397e-06, + "loss": 1.5903, + "step": 5959 + }, + { + "epoch": 0.7953029089938618, + "grad_norm": 1.037122868292527, + "learning_rate": 2.1183848964173604e-06, + "loss": 1.5931, + "step": 5960 + }, + { + "epoch": 0.7954363490792634, + "grad_norm": 0.9387631641570781, + "learning_rate": 2.115725636431243e-06, + "loss": 1.5488, + "step": 5961 + }, + { + "epoch": 0.795569789164665, + "grad_norm": 1.1749210941871775, + "learning_rate": 2.1130678491378143e-06, + "loss": 1.544, + "step": 5962 + }, + { + "epoch": 0.7957032292500668, + "grad_norm": 0.9411833484581066, + "learning_rate": 2.110411535033515e-06, + "loss": 1.5385, + "step": 5963 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 0.9404179032536638, + "learning_rate": 2.1077566946145124e-06, + "loss": 1.5881, + "step": 5964 + }, + { + "epoch": 0.79597010942087, + "grad_norm": 0.9781497662602053, + "learning_rate": 2.1051033283767054e-06, + "loss": 1.5534, + "step": 5965 + }, + { + "epoch": 0.7961035495062717, + "grad_norm": 0.9400404759624681, + "learning_rate": 2.102451436815709e-06, + "loss": 1.5625, + "step": 5966 + }, + { + "epoch": 0.7962369895916733, + "grad_norm": 0.91952723034791, + "learning_rate": 2.0998010204268683e-06, + "loss": 1.5542, + "step": 5967 + }, + { + "epoch": 0.7963704296770749, + "grad_norm": 0.9219414927078943, + "learning_rate": 2.0971520797052468e-06, + "loss": 1.5401, + "step": 5968 + }, + { + "epoch": 0.7965038697624767, + "grad_norm": 0.9602262059129528, + 
"learning_rate": 2.0945046151456417e-06, + "loss": 1.5379, + "step": 5969 + }, + { + "epoch": 0.7966373098478783, + "grad_norm": 0.9345892591110488, + "learning_rate": 2.091858627242568e-06, + "loss": 1.5246, + "step": 5970 + }, + { + "epoch": 0.79677074993328, + "grad_norm": 0.9407983789526347, + "learning_rate": 2.089214116490261e-06, + "loss": 1.5373, + "step": 5971 + }, + { + "epoch": 0.7969041900186816, + "grad_norm": 0.9833161312725881, + "learning_rate": 2.0865710833826936e-06, + "loss": 1.547, + "step": 5972 + }, + { + "epoch": 0.7970376301040832, + "grad_norm": 1.0593932491674578, + "learning_rate": 2.0839295284135486e-06, + "loss": 1.5509, + "step": 5973 + }, + { + "epoch": 0.797171070189485, + "grad_norm": 1.046152778746016, + "learning_rate": 2.0812894520762416e-06, + "loss": 1.6058, + "step": 5974 + }, + { + "epoch": 0.7973045102748866, + "grad_norm": 1.0912168422367754, + "learning_rate": 2.078650854863903e-06, + "loss": 1.5468, + "step": 5975 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 0.9464811881645303, + "learning_rate": 2.0760137372694e-06, + "loss": 1.551, + "step": 5976 + }, + { + "epoch": 0.7975713904456899, + "grad_norm": 0.9185730065970497, + "learning_rate": 2.0733780997853124e-06, + "loss": 1.4854, + "step": 5977 + }, + { + "epoch": 0.7977048305310915, + "grad_norm": 0.9526089149618532, + "learning_rate": 2.0707439429039454e-06, + "loss": 1.6068, + "step": 5978 + }, + { + "epoch": 0.7978382706164932, + "grad_norm": 0.9587953235425581, + "learning_rate": 2.0681112671173344e-06, + "loss": 1.6037, + "step": 5979 + }, + { + "epoch": 0.7979717107018949, + "grad_norm": 0.9361644772074595, + "learning_rate": 2.065480072917231e-06, + "loss": 1.5868, + "step": 5980 + }, + { + "epoch": 0.7981051507872965, + "grad_norm": 0.9534441446175175, + "learning_rate": 2.062850360795112e-06, + "loss": 1.5701, + "step": 5981 + }, + { + "epoch": 0.7982385908726981, + "grad_norm": 0.9946923698582439, + "learning_rate": 2.060222131242178e-06, + "loss": 1.5967, + "step": 5982 + }, + { + "epoch": 0.7983720309580998, + "grad_norm": 0.9626053858910402, + "learning_rate": 2.057595384749349e-06, + "loss": 1.578, + "step": 5983 + }, + { + "epoch": 0.7985054710435014, + "grad_norm": 1.1359335916882283, + "learning_rate": 2.054970121807278e-06, + "loss": 1.5129, + "step": 5984 + }, + { + "epoch": 0.7986389111289032, + "grad_norm": 1.1202578382909025, + "learning_rate": 2.0523463429063295e-06, + "loss": 1.5608, + "step": 5985 + }, + { + "epoch": 0.7987723512143048, + "grad_norm": 1.043896917536381, + "learning_rate": 2.0497240485365975e-06, + "loss": 1.5364, + "step": 5986 + }, + { + "epoch": 0.7989057912997064, + "grad_norm": 1.0753022531810645, + "learning_rate": 2.0471032391878963e-06, + "loss": 1.5591, + "step": 5987 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 1.2319921435864252, + "learning_rate": 2.0444839153497618e-06, + "loss": 1.5409, + "step": 5988 + }, + { + "epoch": 0.7991726714705097, + "grad_norm": 1.00815621813523, + "learning_rate": 2.041866077511453e-06, + "loss": 1.5381, + "step": 5989 + }, + { + "epoch": 0.7993061115559114, + "grad_norm": 0.9256965403165084, + "learning_rate": 2.039249726161957e-06, + "loss": 1.5051, + "step": 5990 + }, + { + "epoch": 0.7994395516413131, + "grad_norm": 0.9415742466538367, + "learning_rate": 2.0366348617899745e-06, + "loss": 1.4909, + "step": 5991 + }, + { + "epoch": 0.7995729917267147, + "grad_norm": 1.204722600477448, + "learning_rate": 2.0340214848839347e-06, + "loss": 1.5813, + "step": 5992 + }, + { + "epoch": 
0.7997064318121163, + "grad_norm": 0.9223199394194576, + "learning_rate": 2.0314095959319822e-06, + "loss": 1.5269, + "step": 5993 + }, + { + "epoch": 0.799839871897518, + "grad_norm": 0.9377131565914819, + "learning_rate": 2.0287991954219945e-06, + "loss": 1.4804, + "step": 5994 + }, + { + "epoch": 0.7999733119829197, + "grad_norm": 0.9766250429689536, + "learning_rate": 2.0261902838415605e-06, + "loss": 1.5481, + "step": 5995 + }, + { + "epoch": 0.8001067520683213, + "grad_norm": 0.9549561111606707, + "learning_rate": 2.023582861677995e-06, + "loss": 1.5329, + "step": 5996 + }, + { + "epoch": 0.800240192153723, + "grad_norm": 0.9789891454720687, + "learning_rate": 2.020976929418338e-06, + "loss": 1.5963, + "step": 5997 + }, + { + "epoch": 0.8003736322391246, + "grad_norm": 0.9398469925192877, + "learning_rate": 2.018372487549346e-06, + "loss": 1.4924, + "step": 5998 + }, + { + "epoch": 0.8005070723245263, + "grad_norm": 0.9346327961340182, + "learning_rate": 2.015769536557499e-06, + "loss": 1.5465, + "step": 5999 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 0.9516873692804836, + "learning_rate": 2.0131680769289964e-06, + "loss": 1.6011, + "step": 6000 + }, + { + "epoch": 0.8007739524953296, + "grad_norm": 1.0067538076216085, + "learning_rate": 2.0105681091497653e-06, + "loss": 1.5703, + "step": 6001 + }, + { + "epoch": 0.8009073925807313, + "grad_norm": 0.9585095677361529, + "learning_rate": 2.007969633705449e-06, + "loss": 1.5311, + "step": 6002 + }, + { + "epoch": 0.8010408326661329, + "grad_norm": 0.9360385818007098, + "learning_rate": 2.00537265108141e-06, + "loss": 1.5563, + "step": 6003 + }, + { + "epoch": 0.8011742727515345, + "grad_norm": 0.9620281711842354, + "learning_rate": 2.002777161762739e-06, + "loss": 1.5605, + "step": 6004 + }, + { + "epoch": 0.8013077128369362, + "grad_norm": 0.9717123448452647, + "learning_rate": 2.000183166234244e-06, + "loss": 1.5998, + "step": 6005 + }, + { + "epoch": 0.8014411529223379, + "grad_norm": 1.2665547727823556, + "learning_rate": 1.997590664980451e-06, + "loss": 1.5412, + "step": 6006 + }, + { + "epoch": 0.8015745930077395, + "grad_norm": 0.9436274926419813, + "learning_rate": 1.9949996584856093e-06, + "loss": 1.5404, + "step": 6007 + }, + { + "epoch": 0.8017080330931412, + "grad_norm": 0.9485403732848344, + "learning_rate": 1.992410147233692e-06, + "loss": 1.4864, + "step": 6008 + }, + { + "epoch": 0.8018414731785428, + "grad_norm": 0.9397520552993349, + "learning_rate": 1.989822131708391e-06, + "loss": 1.5356, + "step": 6009 + }, + { + "epoch": 0.8019749132639445, + "grad_norm": 0.9678083881364095, + "learning_rate": 1.9872356123931137e-06, + "loss": 1.5437, + "step": 6010 + }, + { + "epoch": 0.8021083533493462, + "grad_norm": 8.488638953717834, + "learning_rate": 1.984650589770998e-06, + "loss": 1.5579, + "step": 6011 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 0.9477151839816731, + "learning_rate": 1.982067064324893e-06, + "loss": 1.5687, + "step": 6012 + }, + { + "epoch": 0.8023752335201495, + "grad_norm": 0.9561015940175384, + "learning_rate": 1.979485036537373e-06, + "loss": 1.5424, + "step": 6013 + }, + { + "epoch": 0.8025086736055511, + "grad_norm": 0.940578131890771, + "learning_rate": 1.976904506890729e-06, + "loss": 1.5604, + "step": 6014 + }, + { + "epoch": 0.8026421136909527, + "grad_norm": 0.971107847461901, + "learning_rate": 1.9743254758669794e-06, + "loss": 1.5516, + "step": 6015 + }, + { + "epoch": 0.8027755537763545, + "grad_norm": 1.151724003936446, + "learning_rate": 1.971747943947855e-06, + 
"loss": 1.5418, + "step": 6016 + }, + { + "epoch": 0.8029089938617561, + "grad_norm": 0.9436979624078382, + "learning_rate": 1.969171911614809e-06, + "loss": 1.5554, + "step": 6017 + }, + { + "epoch": 0.8030424339471577, + "grad_norm": 1.088649547266759, + "learning_rate": 1.9665973793490134e-06, + "loss": 1.546, + "step": 6018 + }, + { + "epoch": 0.8031758740325594, + "grad_norm": 1.1775693108073233, + "learning_rate": 1.964024347631367e-06, + "loss": 1.5331, + "step": 6019 + }, + { + "epoch": 0.803309314117961, + "grad_norm": 0.9338043323421088, + "learning_rate": 1.9614528169424784e-06, + "loss": 1.5825, + "step": 6020 + }, + { + "epoch": 0.8034427542033626, + "grad_norm": 0.9745864204912411, + "learning_rate": 1.958882787762678e-06, + "loss": 1.5838, + "step": 6021 + }, + { + "epoch": 0.8035761942887644, + "grad_norm": 1.5277909690793727, + "learning_rate": 1.9563142605720254e-06, + "loss": 1.5049, + "step": 6022 + }, + { + "epoch": 0.803709634374166, + "grad_norm": 0.9821884824130115, + "learning_rate": 1.953747235850287e-06, + "loss": 1.5272, + "step": 6023 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 0.9310206254375682, + "learning_rate": 1.9511817140769563e-06, + "loss": 1.5323, + "step": 6024 + }, + { + "epoch": 0.8039765145449693, + "grad_norm": 1.0213597847479, + "learning_rate": 1.948617695731242e-06, + "loss": 1.5192, + "step": 6025 + }, + { + "epoch": 0.8041099546303709, + "grad_norm": 0.9562151575602164, + "learning_rate": 1.9460551812920703e-06, + "loss": 1.5758, + "step": 6026 + }, + { + "epoch": 0.8042433947157727, + "grad_norm": 0.9387251801749865, + "learning_rate": 1.943494171238095e-06, + "loss": 1.5662, + "step": 6027 + }, + { + "epoch": 0.8043768348011743, + "grad_norm": 0.9762010066502882, + "learning_rate": 1.9409346660476834e-06, + "loss": 1.573, + "step": 6028 + }, + { + "epoch": 0.8045102748865759, + "grad_norm": 1.087852765493359, + "learning_rate": 1.938376666198919e-06, + "loss": 1.5465, + "step": 6029 + }, + { + "epoch": 0.8046437149719776, + "grad_norm": 0.938575775358334, + "learning_rate": 1.935820172169609e-06, + "loss": 1.5516, + "step": 6030 + }, + { + "epoch": 0.8047771550573792, + "grad_norm": 0.965176200493375, + "learning_rate": 1.933265184437274e-06, + "loss": 1.5584, + "step": 6031 + }, + { + "epoch": 0.8049105951427808, + "grad_norm": 1.2915982271448758, + "learning_rate": 1.930711703479162e-06, + "loss": 1.594, + "step": 6032 + }, + { + "epoch": 0.8050440352281826, + "grad_norm": 0.9273187149756227, + "learning_rate": 1.928159729772231e-06, + "loss": 1.5509, + "step": 6033 + }, + { + "epoch": 0.8051774753135842, + "grad_norm": 0.9188671634655157, + "learning_rate": 1.925609263793162e-06, + "loss": 1.5233, + "step": 6034 + }, + { + "epoch": 0.8053109153989858, + "grad_norm": 1.009702927758053, + "learning_rate": 1.9230603060183493e-06, + "loss": 1.5312, + "step": 6035 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 1.100233587871606, + "learning_rate": 1.920512856923914e-06, + "loss": 1.5386, + "step": 6036 + }, + { + "epoch": 0.8055777955697891, + "grad_norm": 0.9308659676568186, + "learning_rate": 1.9179669169856896e-06, + "loss": 1.537, + "step": 6037 + }, + { + "epoch": 0.8057112356551909, + "grad_norm": 0.9407626383495175, + "learning_rate": 1.915422486679227e-06, + "loss": 1.5401, + "step": 6038 + }, + { + "epoch": 0.8058446757405925, + "grad_norm": 0.9436636824443787, + "learning_rate": 1.912879566479795e-06, + "loss": 1.5742, + "step": 6039 + }, + { + "epoch": 0.8059781158259941, + "grad_norm": 0.9418856484629331, + 
"learning_rate": 1.9103381568623868e-06, + "loss": 1.5231, + "step": 6040 + }, + { + "epoch": 0.8061115559113958, + "grad_norm": 0.9938636042780336, + "learning_rate": 1.907798258301707e-06, + "loss": 1.5326, + "step": 6041 + }, + { + "epoch": 0.8062449959967974, + "grad_norm": 0.9167069778310091, + "learning_rate": 1.9052598712721771e-06, + "loss": 1.5517, + "step": 6042 + }, + { + "epoch": 0.8063784360821991, + "grad_norm": 0.9396867142428339, + "learning_rate": 1.9027229962479433e-06, + "loss": 1.5704, + "step": 6043 + }, + { + "epoch": 0.8065118761676008, + "grad_norm": 0.9421359789583224, + "learning_rate": 1.9001876337028635e-06, + "loss": 1.5256, + "step": 6044 + }, + { + "epoch": 0.8066453162530024, + "grad_norm": 0.973198022271884, + "learning_rate": 1.8976537841105136e-06, + "loss": 1.528, + "step": 6045 + }, + { + "epoch": 0.806778756338404, + "grad_norm": 0.9243015337796414, + "learning_rate": 1.895121447944185e-06, + "loss": 1.5705, + "step": 6046 + }, + { + "epoch": 0.8069121964238057, + "grad_norm": 0.9373269634820086, + "learning_rate": 1.8925906256768957e-06, + "loss": 1.5274, + "step": 6047 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 1.108272451485646, + "learning_rate": 1.8900613177813708e-06, + "loss": 1.5485, + "step": 6048 + }, + { + "epoch": 0.807179076594609, + "grad_norm": 0.9455270267638988, + "learning_rate": 1.8875335247300564e-06, + "loss": 1.5525, + "step": 6049 + }, + { + "epoch": 0.8073125166800107, + "grad_norm": 0.948942726258177, + "learning_rate": 1.8850072469951142e-06, + "loss": 1.5362, + "step": 6050 + }, + { + "epoch": 0.8074459567654123, + "grad_norm": 1.0549675256800648, + "learning_rate": 1.8824824850484269e-06, + "loss": 1.5568, + "step": 6051 + }, + { + "epoch": 0.807579396850814, + "grad_norm": 0.9693613341191636, + "learning_rate": 1.8799592393615906e-06, + "loss": 1.5386, + "step": 6052 + }, + { + "epoch": 0.8077128369362157, + "grad_norm": 1.002495730710032, + "learning_rate": 1.877437510405915e-06, + "loss": 1.5567, + "step": 6053 + }, + { + "epoch": 0.8078462770216173, + "grad_norm": 0.9399879061201342, + "learning_rate": 1.8749172986524378e-06, + "loss": 1.5198, + "step": 6054 + }, + { + "epoch": 0.807979717107019, + "grad_norm": 0.916955889393747, + "learning_rate": 1.8723986045719e-06, + "loss": 1.5238, + "step": 6055 + }, + { + "epoch": 0.8081131571924206, + "grad_norm": 0.9305269696315253, + "learning_rate": 1.8698814286347678e-06, + "loss": 1.5412, + "step": 6056 + }, + { + "epoch": 0.8082465972778222, + "grad_norm": 0.9234504269114264, + "learning_rate": 1.867365771311216e-06, + "loss": 1.4926, + "step": 6057 + }, + { + "epoch": 0.808380037363224, + "grad_norm": 0.9598446073788602, + "learning_rate": 1.8648516330711486e-06, + "loss": 1.5225, + "step": 6058 + }, + { + "epoch": 0.8085134774486256, + "grad_norm": 1.006044534644564, + "learning_rate": 1.8623390143841735e-06, + "loss": 1.5641, + "step": 6059 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 0.9891625291510784, + "learning_rate": 1.8598279157196164e-06, + "loss": 1.5121, + "step": 6060 + }, + { + "epoch": 0.8087803576194289, + "grad_norm": 0.9313511828058488, + "learning_rate": 1.8573183375465286e-06, + "loss": 1.5156, + "step": 6061 + }, + { + "epoch": 0.8089137977048305, + "grad_norm": 0.9339926309721247, + "learning_rate": 1.8548102803336677e-06, + "loss": 1.5326, + "step": 6062 + }, + { + "epoch": 0.8090472377902321, + "grad_norm": 0.9664081912314659, + "learning_rate": 1.8523037445495095e-06, + "loss": 1.5461, + "step": 6063 + }, + { + "epoch": 
0.8091806778756339, + "grad_norm": 1.12857579777555, + "learning_rate": 1.8497987306622446e-06, + "loss": 1.5084, + "step": 6064 + }, + { + "epoch": 0.8093141179610355, + "grad_norm": 1.0205222142779857, + "learning_rate": 1.8472952391397846e-06, + "loss": 1.5501, + "step": 6065 + }, + { + "epoch": 0.8094475580464372, + "grad_norm": 0.9269906202615147, + "learning_rate": 1.844793270449753e-06, + "loss": 1.5647, + "step": 6066 + }, + { + "epoch": 0.8095809981318388, + "grad_norm": 0.9111441101773377, + "learning_rate": 1.8422928250594884e-06, + "loss": 1.532, + "step": 6067 + }, + { + "epoch": 0.8097144382172404, + "grad_norm": 1.0423582302266032, + "learning_rate": 1.8397939034360424e-06, + "loss": 1.5852, + "step": 6068 + }, + { + "epoch": 0.8098478783026422, + "grad_norm": 0.9495371122585425, + "learning_rate": 1.837296506046189e-06, + "loss": 1.5211, + "step": 6069 + }, + { + "epoch": 0.8099813183880438, + "grad_norm": 0.9361902779942389, + "learning_rate": 1.834800633356414e-06, + "loss": 1.5148, + "step": 6070 + }, + { + "epoch": 0.8101147584734454, + "grad_norm": 1.234996354576864, + "learning_rate": 1.8323062858329155e-06, + "loss": 1.6077, + "step": 6071 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 0.9278663038573935, + "learning_rate": 1.82981346394161e-06, + "loss": 1.5558, + "step": 6072 + }, + { + "epoch": 0.8103816386442487, + "grad_norm": 0.9476924229244504, + "learning_rate": 1.827322168148129e-06, + "loss": 1.5618, + "step": 6073 + }, + { + "epoch": 0.8105150787296503, + "grad_norm": 0.9319818911496207, + "learning_rate": 1.8248323989178151e-06, + "loss": 1.5607, + "step": 6074 + }, + { + "epoch": 0.8106485188150521, + "grad_norm": 0.9145783092870352, + "learning_rate": 1.8223441567157329e-06, + "loss": 1.5446, + "step": 6075 + }, + { + "epoch": 0.8107819589004537, + "grad_norm": 0.9381651247628452, + "learning_rate": 1.8198574420066572e-06, + "loss": 1.5931, + "step": 6076 + }, + { + "epoch": 0.8109153989858553, + "grad_norm": 0.9248267670752973, + "learning_rate": 1.8173722552550766e-06, + "loss": 1.5598, + "step": 6077 + }, + { + "epoch": 0.811048839071257, + "grad_norm": 0.933228793651718, + "learning_rate": 1.8148885969251928e-06, + "loss": 1.5767, + "step": 6078 + }, + { + "epoch": 0.8111822791566586, + "grad_norm": 0.907509359626052, + "learning_rate": 1.8124064674809316e-06, + "loss": 1.5065, + "step": 6079 + }, + { + "epoch": 0.8113157192420604, + "grad_norm": 0.9228783070319467, + "learning_rate": 1.8099258673859221e-06, + "loss": 1.4855, + "step": 6080 + }, + { + "epoch": 0.811449159327462, + "grad_norm": 1.0177477019581869, + "learning_rate": 1.8074467971035136e-06, + "loss": 1.4958, + "step": 6081 + }, + { + "epoch": 0.8115825994128636, + "grad_norm": 0.9588231215311086, + "learning_rate": 1.8049692570967647e-06, + "loss": 1.5646, + "step": 6082 + }, + { + "epoch": 0.8117160394982653, + "grad_norm": 0.9155594177985604, + "learning_rate": 1.8024932478284584e-06, + "loss": 1.5231, + "step": 6083 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 0.9565317839445175, + "learning_rate": 1.8000187697610804e-06, + "loss": 1.6076, + "step": 6084 + }, + { + "epoch": 0.8119829196690685, + "grad_norm": 0.9745756544326605, + "learning_rate": 1.7975458233568332e-06, + "loss": 1.5622, + "step": 6085 + }, + { + "epoch": 0.8121163597544703, + "grad_norm": 1.0317394403946698, + "learning_rate": 1.7950744090776407e-06, + "loss": 1.5838, + "step": 6086 + }, + { + "epoch": 0.8122497998398719, + "grad_norm": 0.9537967660036222, + "learning_rate": 
1.792604527385131e-06, + "loss": 1.524, + "step": 6087 + }, + { + "epoch": 0.8123832399252735, + "grad_norm": 1.1140810286258387, + "learning_rate": 1.7901361787406524e-06, + "loss": 1.5551, + "step": 6088 + }, + { + "epoch": 0.8125166800106752, + "grad_norm": 0.9970484456453595, + "learning_rate": 1.7876693636052588e-06, + "loss": 1.563, + "step": 6089 + }, + { + "epoch": 0.8126501200960768, + "grad_norm": 0.975647861270303, + "learning_rate": 1.785204082439731e-06, + "loss": 1.5874, + "step": 6090 + }, + { + "epoch": 0.8127835601814786, + "grad_norm": 0.9369203293774957, + "learning_rate": 1.7827403357045514e-06, + "loss": 1.5696, + "step": 6091 + }, + { + "epoch": 0.8129170002668802, + "grad_norm": 1.0641811139775401, + "learning_rate": 1.7802781238599164e-06, + "loss": 1.5656, + "step": 6092 + }, + { + "epoch": 0.8130504403522818, + "grad_norm": 0.9475062980136262, + "learning_rate": 1.777817447365746e-06, + "loss": 1.5263, + "step": 6093 + }, + { + "epoch": 0.8131838804376835, + "grad_norm": 0.9652813026259098, + "learning_rate": 1.7753583066816637e-06, + "loss": 1.5176, + "step": 6094 + }, + { + "epoch": 0.8133173205230851, + "grad_norm": 1.1089594490573167, + "learning_rate": 1.7729007022670085e-06, + "loss": 1.5231, + "step": 6095 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 0.9258367136985601, + "learning_rate": 1.7704446345808312e-06, + "loss": 1.5387, + "step": 6096 + }, + { + "epoch": 0.8135842006938885, + "grad_norm": 1.0577847232171296, + "learning_rate": 1.7679901040819004e-06, + "loss": 1.6006, + "step": 6097 + }, + { + "epoch": 0.8137176407792901, + "grad_norm": 1.1127882285883361, + "learning_rate": 1.7655371112286946e-06, + "loss": 1.5717, + "step": 6098 + }, + { + "epoch": 0.8138510808646917, + "grad_norm": 1.0673924071414274, + "learning_rate": 1.7630856564793996e-06, + "loss": 1.5309, + "step": 6099 + }, + { + "epoch": 0.8139845209500934, + "grad_norm": 0.9464251733577566, + "learning_rate": 1.760635740291926e-06, + "loss": 1.5303, + "step": 6100 + }, + { + "epoch": 0.814117961035495, + "grad_norm": 0.9319478908646294, + "learning_rate": 1.758187363123889e-06, + "loss": 1.5401, + "step": 6101 + }, + { + "epoch": 0.8142514011208967, + "grad_norm": 0.9287040927493834, + "learning_rate": 1.7557405254326144e-06, + "loss": 1.5454, + "step": 6102 + }, + { + "epoch": 0.8143848412062984, + "grad_norm": 0.9409713573767424, + "learning_rate": 1.7532952276751424e-06, + "loss": 1.5564, + "step": 6103 + }, + { + "epoch": 0.8145182812917, + "grad_norm": 1.1149430849686321, + "learning_rate": 1.7508514703082336e-06, + "loss": 1.5285, + "step": 6104 + }, + { + "epoch": 0.8146517213771017, + "grad_norm": 0.942572754627203, + "learning_rate": 1.748409253788349e-06, + "loss": 1.5839, + "step": 6105 + }, + { + "epoch": 0.8147851614625033, + "grad_norm": 1.0681724159465906, + "learning_rate": 1.7459685785716651e-06, + "loss": 1.5143, + "step": 6106 + }, + { + "epoch": 0.814918601547905, + "grad_norm": 0.9013869560225362, + "learning_rate": 1.7435294451140783e-06, + "loss": 1.5133, + "step": 6107 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 0.9616884493441265, + "learning_rate": 1.7410918538711875e-06, + "loss": 1.6119, + "step": 6108 + }, + { + "epoch": 0.8151854817187083, + "grad_norm": 0.9680636039329384, + "learning_rate": 1.738655805298307e-06, + "loss": 1.5284, + "step": 6109 + }, + { + "epoch": 0.8153189218041099, + "grad_norm": 0.9024891296925446, + "learning_rate": 1.7362212998504635e-06, + "loss": 1.5047, + "step": 6110 + }, + { + "epoch": 0.8154523618895116, 
+ "grad_norm": 0.9523905191421255, + "learning_rate": 1.7337883379823917e-06, + "loss": 1.5871, + "step": 6111 + }, + { + "epoch": 0.8155858019749133, + "grad_norm": 0.9263795793949324, + "learning_rate": 1.731356920148547e-06, + "loss": 1.5285, + "step": 6112 + }, + { + "epoch": 0.8157192420603149, + "grad_norm": 0.9647766098487651, + "learning_rate": 1.7289270468030871e-06, + "loss": 1.5834, + "step": 6113 + }, + { + "epoch": 0.8158526821457166, + "grad_norm": 0.9197615859806225, + "learning_rate": 1.726498718399886e-06, + "loss": 1.5364, + "step": 6114 + }, + { + "epoch": 0.8159861222311182, + "grad_norm": 0.9626572376096142, + "learning_rate": 1.7240719353925262e-06, + "loss": 1.5519, + "step": 6115 + }, + { + "epoch": 0.8161195623165198, + "grad_norm": 0.9396787834195818, + "learning_rate": 1.7216466982343039e-06, + "loss": 1.5418, + "step": 6116 + }, + { + "epoch": 0.8162530024019216, + "grad_norm": 0.91235670010565, + "learning_rate": 1.7192230073782234e-06, + "loss": 1.477, + "step": 6117 + }, + { + "epoch": 0.8163864424873232, + "grad_norm": 0.9262321266478081, + "learning_rate": 1.7168008632770072e-06, + "loss": 1.5578, + "step": 6118 + }, + { + "epoch": 0.8165198825727249, + "grad_norm": 0.9400952842702311, + "learning_rate": 1.7143802663830821e-06, + "loss": 1.4842, + "step": 6119 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 0.9299282662667695, + "learning_rate": 1.7119612171485877e-06, + "loss": 1.5302, + "step": 6120 + }, + { + "epoch": 0.8167867627435281, + "grad_norm": 0.9932044315186542, + "learning_rate": 1.7095437160253725e-06, + "loss": 1.5168, + "step": 6121 + }, + { + "epoch": 0.8169202028289299, + "grad_norm": 0.9580569276859812, + "learning_rate": 1.707127763465004e-06, + "loss": 1.5769, + "step": 6122 + }, + { + "epoch": 0.8170536429143315, + "grad_norm": 1.102986018038682, + "learning_rate": 1.7047133599187515e-06, + "loss": 1.531, + "step": 6123 + }, + { + "epoch": 0.8171870829997331, + "grad_norm": 0.9193541580129797, + "learning_rate": 1.702300505837594e-06, + "loss": 1.5448, + "step": 6124 + }, + { + "epoch": 0.8173205230851348, + "grad_norm": 0.9456674504166714, + "learning_rate": 1.6998892016722334e-06, + "loss": 1.5154, + "step": 6125 + }, + { + "epoch": 0.8174539631705364, + "grad_norm": 1.3332667316333, + "learning_rate": 1.6974794478730683e-06, + "loss": 1.5605, + "step": 6126 + }, + { + "epoch": 0.817587403255938, + "grad_norm": 0.9558729747564527, + "learning_rate": 1.695071244890215e-06, + "loss": 1.5821, + "step": 6127 + }, + { + "epoch": 0.8177208433413398, + "grad_norm": 1.2620823710527826, + "learning_rate": 1.6926645931734964e-06, + "loss": 1.5201, + "step": 6128 + }, + { + "epoch": 0.8178542834267414, + "grad_norm": 0.9179178473951229, + "learning_rate": 1.6902594931724503e-06, + "loss": 1.5354, + "step": 6129 + }, + { + "epoch": 0.817987723512143, + "grad_norm": 0.9450627126995589, + "learning_rate": 1.687855945336322e-06, + "loss": 1.5345, + "step": 6130 + }, + { + "epoch": 0.8181211635975447, + "grad_norm": 0.9178343790581889, + "learning_rate": 1.6854539501140632e-06, + "loss": 1.55, + "step": 6131 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 0.9964264932180854, + "learning_rate": 1.6830535079543432e-06, + "loss": 1.5697, + "step": 6132 + }, + { + "epoch": 0.8183880437683481, + "grad_norm": 0.9890708742005667, + "learning_rate": 1.6806546193055374e-06, + "loss": 1.5219, + "step": 6133 + }, + { + "epoch": 0.8185214838537497, + "grad_norm": 0.9229456288469, + "learning_rate": 1.6782572846157285e-06, + "loss": 1.5209, + 
"step": 6134 + }, + { + "epoch": 0.8186549239391513, + "grad_norm": 0.9130464886045846, + "learning_rate": 1.6758615043327097e-06, + "loss": 1.4973, + "step": 6135 + }, + { + "epoch": 0.818788364024553, + "grad_norm": 0.953168022773084, + "learning_rate": 1.6734672789039907e-06, + "loss": 1.5345, + "step": 6136 + }, + { + "epoch": 0.8189218041099546, + "grad_norm": 0.9178014077533045, + "learning_rate": 1.6710746087767826e-06, + "loss": 1.5069, + "step": 6137 + }, + { + "epoch": 0.8190552441953562, + "grad_norm": 0.9433818817785761, + "learning_rate": 1.668683494398008e-06, + "loss": 1.5628, + "step": 6138 + }, + { + "epoch": 0.819188684280758, + "grad_norm": 0.955567446026117, + "learning_rate": 1.6662939362143028e-06, + "loss": 1.5812, + "step": 6139 + }, + { + "epoch": 0.8193221243661596, + "grad_norm": 1.0316675276310674, + "learning_rate": 1.6639059346720065e-06, + "loss": 1.5408, + "step": 6140 + }, + { + "epoch": 0.8194555644515612, + "grad_norm": 1.0994368830180041, + "learning_rate": 1.6615194902171728e-06, + "loss": 1.5808, + "step": 6141 + }, + { + "epoch": 0.8195890045369629, + "grad_norm": 0.9318559913000551, + "learning_rate": 1.659134603295558e-06, + "loss": 1.5227, + "step": 6142 + }, + { + "epoch": 0.8197224446223645, + "grad_norm": 0.9700753022715083, + "learning_rate": 1.6567512743526383e-06, + "loss": 1.5809, + "step": 6143 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 0.9487897185182752, + "learning_rate": 1.6543695038335882e-06, + "loss": 1.5474, + "step": 6144 + }, + { + "epoch": 0.8199893247931679, + "grad_norm": 0.9629831813661353, + "learning_rate": 1.651989292183297e-06, + "loss": 1.5777, + "step": 6145 + }, + { + "epoch": 0.8201227648785695, + "grad_norm": 1.057479149875002, + "learning_rate": 1.6496106398463574e-06, + "loss": 1.5347, + "step": 6146 + }, + { + "epoch": 0.8202562049639712, + "grad_norm": 0.933470698014775, + "learning_rate": 1.6472335472670798e-06, + "loss": 1.5632, + "step": 6147 + }, + { + "epoch": 0.8203896450493728, + "grad_norm": 1.5917968282853159, + "learning_rate": 1.6448580148894755e-06, + "loss": 1.5685, + "step": 6148 + }, + { + "epoch": 0.8205230851347745, + "grad_norm": 0.9576309077077291, + "learning_rate": 1.6424840431572652e-06, + "loss": 1.5568, + "step": 6149 + }, + { + "epoch": 0.8206565252201762, + "grad_norm": 1.0240140958776593, + "learning_rate": 1.6401116325138843e-06, + "loss": 1.5005, + "step": 6150 + }, + { + "epoch": 0.8207899653055778, + "grad_norm": 1.042576719135628, + "learning_rate": 1.6377407834024694e-06, + "loss": 1.5537, + "step": 6151 + }, + { + "epoch": 0.8209234053909794, + "grad_norm": 0.9365442898192005, + "learning_rate": 1.6353714962658684e-06, + "loss": 1.5258, + "step": 6152 + }, + { + "epoch": 0.8210568454763811, + "grad_norm": 6.035336114616692, + "learning_rate": 1.6330037715466373e-06, + "loss": 1.5402, + "step": 6153 + }, + { + "epoch": 0.8211902855617828, + "grad_norm": 1.2568273540776527, + "learning_rate": 1.630637609687037e-06, + "loss": 1.5612, + "step": 6154 + }, + { + "epoch": 0.8213237256471844, + "grad_norm": 0.9408521308120555, + "learning_rate": 1.6282730111290446e-06, + "loss": 1.5436, + "step": 6155 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 0.9116594461537203, + "learning_rate": 1.6259099763143383e-06, + "loss": 1.532, + "step": 6156 + }, + { + "epoch": 0.8215906058179877, + "grad_norm": 0.9640703464570622, + "learning_rate": 1.623548505684306e-06, + "loss": 1.5475, + "step": 6157 + }, + { + "epoch": 0.8217240459033894, + "grad_norm": 0.9628540017226783, + 
"learning_rate": 1.6211885996800426e-06, + "loss": 1.5435, + "step": 6158 + }, + { + "epoch": 0.821857485988791, + "grad_norm": 1.0175932870044642, + "learning_rate": 1.6188302587423532e-06, + "loss": 1.5133, + "step": 6159 + }, + { + "epoch": 0.8219909260741927, + "grad_norm": 0.9328929841123439, + "learning_rate": 1.6164734833117458e-06, + "loss": 1.4553, + "step": 6160 + }, + { + "epoch": 0.8221243661595944, + "grad_norm": 0.9326921991444765, + "learning_rate": 1.6141182738284444e-06, + "loss": 1.58, + "step": 6161 + }, + { + "epoch": 0.822257806244996, + "grad_norm": 0.9198207496240384, + "learning_rate": 1.611764630732372e-06, + "loss": 1.5411, + "step": 6162 + }, + { + "epoch": 0.8223912463303976, + "grad_norm": 0.9240771169421748, + "learning_rate": 1.60941255446316e-06, + "loss": 1.5323, + "step": 6163 + }, + { + "epoch": 0.8225246864157993, + "grad_norm": 0.9435650366109184, + "learning_rate": 1.607062045460156e-06, + "loss": 1.5653, + "step": 6164 + }, + { + "epoch": 0.822658126501201, + "grad_norm": 1.0152052475294857, + "learning_rate": 1.6047131041624041e-06, + "loss": 1.5638, + "step": 6165 + }, + { + "epoch": 0.8227915665866026, + "grad_norm": 1.0524123141586792, + "learning_rate": 1.6023657310086605e-06, + "loss": 1.5782, + "step": 6166 + }, + { + "epoch": 0.8229250066720043, + "grad_norm": 0.9311840684459297, + "learning_rate": 1.600019926437385e-06, + "loss": 1.5414, + "step": 6167 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 0.9019457165743004, + "learning_rate": 1.597675690886753e-06, + "loss": 1.5351, + "step": 6168 + }, + { + "epoch": 0.8231918868428075, + "grad_norm": 0.923450760055443, + "learning_rate": 1.5953330247946375e-06, + "loss": 1.5313, + "step": 6169 + }, + { + "epoch": 0.8233253269282093, + "grad_norm": 1.2831560841203558, + "learning_rate": 1.5929919285986195e-06, + "loss": 1.5338, + "step": 6170 + }, + { + "epoch": 0.8234587670136109, + "grad_norm": 0.9339060300388902, + "learning_rate": 1.5906524027359948e-06, + "loss": 1.5511, + "step": 6171 + }, + { + "epoch": 0.8235922070990126, + "grad_norm": 0.935595003361667, + "learning_rate": 1.5883144476437572e-06, + "loss": 1.5072, + "step": 6172 + }, + { + "epoch": 0.8237256471844142, + "grad_norm": 0.9158207412122499, + "learning_rate": 1.5859780637586098e-06, + "loss": 1.5835, + "step": 6173 + }, + { + "epoch": 0.8238590872698158, + "grad_norm": 1.050328096524351, + "learning_rate": 1.5836432515169608e-06, + "loss": 1.5829, + "step": 6174 + }, + { + "epoch": 0.8239925273552176, + "grad_norm": 1.0263704705140466, + "learning_rate": 1.5813100113549307e-06, + "loss": 1.5493, + "step": 6175 + }, + { + "epoch": 0.8241259674406192, + "grad_norm": 0.9214587200849176, + "learning_rate": 1.5789783437083406e-06, + "loss": 1.5124, + "step": 6176 + }, + { + "epoch": 0.8242594075260208, + "grad_norm": 0.9624753786839196, + "learning_rate": 1.5766482490127176e-06, + "loss": 1.5424, + "step": 6177 + }, + { + "epoch": 0.8243928476114225, + "grad_norm": 0.9397643380771115, + "learning_rate": 1.5743197277032974e-06, + "loss": 1.5551, + "step": 6178 + }, + { + "epoch": 0.8245262876968241, + "grad_norm": 1.0598447563742128, + "learning_rate": 1.5719927802150236e-06, + "loss": 1.5502, + "step": 6179 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 1.3413789438510157, + "learning_rate": 1.5696674069825425e-06, + "loss": 1.5785, + "step": 6180 + }, + { + "epoch": 0.8247931678676275, + "grad_norm": 1.0362130690683422, + "learning_rate": 1.5673436084402039e-06, + "loss": 1.5472, + "step": 6181 + }, + { + "epoch": 
0.8249266079530291, + "grad_norm": 1.190451521816014, + "learning_rate": 1.565021385022073e-06, + "loss": 1.4974, + "step": 6182 + }, + { + "epoch": 0.8250600480384307, + "grad_norm": 1.0064225238687168, + "learning_rate": 1.5627007371619107e-06, + "loss": 1.5868, + "step": 6183 + }, + { + "epoch": 0.8251934881238324, + "grad_norm": 0.9183532599911997, + "learning_rate": 1.560381665293189e-06, + "loss": 1.5492, + "step": 6184 + }, + { + "epoch": 0.825326928209234, + "grad_norm": 0.9451229117253995, + "learning_rate": 1.5580641698490805e-06, + "loss": 1.5507, + "step": 6185 + }, + { + "epoch": 0.8254603682946358, + "grad_norm": 0.9256676754994859, + "learning_rate": 1.5557482512624733e-06, + "loss": 1.5438, + "step": 6186 + }, + { + "epoch": 0.8255938083800374, + "grad_norm": 1.0241913570296761, + "learning_rate": 1.5534339099659512e-06, + "loss": 1.5336, + "step": 6187 + }, + { + "epoch": 0.825727248465439, + "grad_norm": 0.9648855141551541, + "learning_rate": 1.551121146391804e-06, + "loss": 1.5565, + "step": 6188 + }, + { + "epoch": 0.8258606885508407, + "grad_norm": 0.9417547056164751, + "learning_rate": 1.5488099609720353e-06, + "loss": 1.5198, + "step": 6189 + }, + { + "epoch": 0.8259941286362423, + "grad_norm": 1.019075853789808, + "learning_rate": 1.546500354138346e-06, + "loss": 1.5174, + "step": 6190 + }, + { + "epoch": 0.826127568721644, + "grad_norm": 0.950534403894064, + "learning_rate": 1.5441923263221426e-06, + "loss": 1.5386, + "step": 6191 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 0.9522247872246755, + "learning_rate": 1.5418858779545387e-06, + "loss": 1.5558, + "step": 6192 + }, + { + "epoch": 0.8263944488924473, + "grad_norm": 1.14220578778169, + "learning_rate": 1.5395810094663544e-06, + "loss": 1.5794, + "step": 6193 + }, + { + "epoch": 0.8265278889778489, + "grad_norm": 0.9516184349282555, + "learning_rate": 1.537277721288113e-06, + "loss": 1.5279, + "step": 6194 + }, + { + "epoch": 0.8266613290632506, + "grad_norm": 0.9428708938228508, + "learning_rate": 1.5349760138500414e-06, + "loss": 1.5303, + "step": 6195 + }, + { + "epoch": 0.8267947691486522, + "grad_norm": 0.906052569291839, + "learning_rate": 1.5326758875820724e-06, + "loss": 1.535, + "step": 6196 + }, + { + "epoch": 0.8269282092340539, + "grad_norm": 0.9243034140621643, + "learning_rate": 1.5303773429138414e-06, + "loss": 1.5628, + "step": 6197 + }, + { + "epoch": 0.8270616493194556, + "grad_norm": 0.9422165125262247, + "learning_rate": 1.5280803802746947e-06, + "loss": 1.5517, + "step": 6198 + }, + { + "epoch": 0.8271950894048572, + "grad_norm": 1.0325744245704787, + "learning_rate": 1.5257850000936768e-06, + "loss": 1.5963, + "step": 6199 + }, + { + "epoch": 0.8273285294902589, + "grad_norm": 1.1261838747600312, + "learning_rate": 1.5234912027995374e-06, + "loss": 1.5573, + "step": 6200 + }, + { + "epoch": 0.8274619695756605, + "grad_norm": 1.0318114988538098, + "learning_rate": 1.5211989888207335e-06, + "loss": 1.5389, + "step": 6201 + }, + { + "epoch": 0.8275954096610622, + "grad_norm": 0.9575726653864882, + "learning_rate": 1.5189083585854213e-06, + "loss": 1.5731, + "step": 6202 + }, + { + "epoch": 0.8277288497464639, + "grad_norm": 0.9495250274991178, + "learning_rate": 1.516619312521469e-06, + "loss": 1.5819, + "step": 6203 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 0.9001039404937866, + "learning_rate": 1.514331851056442e-06, + "loss": 1.5126, + "step": 6204 + }, + { + "epoch": 0.8279957299172671, + "grad_norm": 0.9596548897156311, + "learning_rate": 1.512045974617612e-06, + 
"loss": 1.5236, + "step": 6205 + }, + { + "epoch": 0.8281291700026688, + "grad_norm": 0.948592313867705, + "learning_rate": 1.5097616836319528e-06, + "loss": 1.5963, + "step": 6206 + }, + { + "epoch": 0.8282626100880705, + "grad_norm": 0.9678395783216994, + "learning_rate": 1.507478978526149e-06, + "loss": 1.5601, + "step": 6207 + }, + { + "epoch": 0.8283960501734721, + "grad_norm": 0.9282502916058483, + "learning_rate": 1.5051978597265814e-06, + "loss": 1.5221, + "step": 6208 + }, + { + "epoch": 0.8285294902588738, + "grad_norm": 1.1106582385865, + "learning_rate": 1.502918327659335e-06, + "loss": 1.5289, + "step": 6209 + }, + { + "epoch": 0.8286629303442754, + "grad_norm": 1.1265979346871835, + "learning_rate": 1.5006403827502014e-06, + "loss": 1.5341, + "step": 6210 + }, + { + "epoch": 0.828796370429677, + "grad_norm": 1.035765628780522, + "learning_rate": 1.4983640254246767e-06, + "loss": 1.5652, + "step": 6211 + }, + { + "epoch": 0.8289298105150787, + "grad_norm": 0.9611613457634671, + "learning_rate": 1.4960892561079577e-06, + "loss": 1.5555, + "step": 6212 + }, + { + "epoch": 0.8290632506004804, + "grad_norm": 0.9101115150391339, + "learning_rate": 1.4938160752249431e-06, + "loss": 1.5279, + "step": 6213 + }, + { + "epoch": 0.8291966906858821, + "grad_norm": 0.9444030552766006, + "learning_rate": 1.4915444832002413e-06, + "loss": 1.6253, + "step": 6214 + }, + { + "epoch": 0.8293301307712837, + "grad_norm": 0.9679370076321139, + "learning_rate": 1.4892744804581572e-06, + "loss": 1.5131, + "step": 6215 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 0.9687392610701684, + "learning_rate": 1.4870060674227016e-06, + "loss": 1.5411, + "step": 6216 + }, + { + "epoch": 0.829597010942087, + "grad_norm": 0.9324868882302184, + "learning_rate": 1.4847392445175867e-06, + "loss": 1.5546, + "step": 6217 + }, + { + "epoch": 0.8297304510274887, + "grad_norm": 0.9659785539366846, + "learning_rate": 1.482474012166234e-06, + "loss": 1.5354, + "step": 6218 + }, + { + "epoch": 0.8298638911128903, + "grad_norm": 1.0403468201365482, + "learning_rate": 1.4802103707917591e-06, + "loss": 1.5384, + "step": 6219 + }, + { + "epoch": 0.829997331198292, + "grad_norm": 0.9235060298849219, + "learning_rate": 1.4779483208169832e-06, + "loss": 1.4958, + "step": 6220 + }, + { + "epoch": 0.8301307712836936, + "grad_norm": 0.9948663890358413, + "learning_rate": 1.4756878626644367e-06, + "loss": 1.5092, + "step": 6221 + }, + { + "epoch": 0.8302642113690952, + "grad_norm": 1.2438225582984503, + "learning_rate": 1.4734289967563442e-06, + "loss": 1.5525, + "step": 6222 + }, + { + "epoch": 0.830397651454497, + "grad_norm": 0.9394237729404794, + "learning_rate": 1.471171723514636e-06, + "loss": 1.5704, + "step": 6223 + }, + { + "epoch": 0.8305310915398986, + "grad_norm": 0.9220696013800785, + "learning_rate": 1.468916043360944e-06, + "loss": 1.5649, + "step": 6224 + }, + { + "epoch": 0.8306645316253003, + "grad_norm": 0.9661997734554147, + "learning_rate": 1.4666619567166074e-06, + "loss": 1.5551, + "step": 6225 + }, + { + "epoch": 0.8307979717107019, + "grad_norm": 0.9713988388879101, + "learning_rate": 1.4644094640026607e-06, + "loss": 1.4893, + "step": 6226 + }, + { + "epoch": 0.8309314117961035, + "grad_norm": 0.9362262546367968, + "learning_rate": 1.4621585656398429e-06, + "loss": 1.5044, + "step": 6227 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 1.070256981646457, + "learning_rate": 1.459909262048601e-06, + "loss": 1.5317, + "step": 6228 + }, + { + "epoch": 0.8311982919669069, + "grad_norm": 
0.9309802597099771, + "learning_rate": 1.4576615536490756e-06, + "loss": 1.5314, + "step": 6229 + }, + { + "epoch": 0.8313317320523085, + "grad_norm": 0.9534061214073728, + "learning_rate": 1.4554154408611142e-06, + "loss": 1.5432, + "step": 6230 + }, + { + "epoch": 0.8314651721377102, + "grad_norm": 0.9260408668974777, + "learning_rate": 1.4531709241042624e-06, + "loss": 1.4923, + "step": 6231 + }, + { + "epoch": 0.8315986122231118, + "grad_norm": 1.0984291256861187, + "learning_rate": 1.4509280037977746e-06, + "loss": 1.5121, + "step": 6232 + }, + { + "epoch": 0.8317320523085134, + "grad_norm": 1.080816725382994, + "learning_rate": 1.4486866803606003e-06, + "loss": 1.5719, + "step": 6233 + }, + { + "epoch": 0.8318654923939152, + "grad_norm": 0.9403182913647171, + "learning_rate": 1.4464469542113924e-06, + "loss": 1.5661, + "step": 6234 + }, + { + "epoch": 0.8319989324793168, + "grad_norm": 0.9315844916031217, + "learning_rate": 1.4442088257685105e-06, + "loss": 1.5355, + "step": 6235 + }, + { + "epoch": 0.8321323725647184, + "grad_norm": 0.9523132017213204, + "learning_rate": 1.4419722954500071e-06, + "loss": 1.6324, + "step": 6236 + }, + { + "epoch": 0.8322658126501201, + "grad_norm": 0.9122663743215488, + "learning_rate": 1.4397373636736435e-06, + "loss": 1.51, + "step": 6237 + }, + { + "epoch": 0.8323992527355217, + "grad_norm": 1.2517381014691573, + "learning_rate": 1.4375040308568765e-06, + "loss": 1.6326, + "step": 6238 + }, + { + "epoch": 0.8325326928209235, + "grad_norm": 1.0826573600314662, + "learning_rate": 1.4352722974168675e-06, + "loss": 1.5585, + "step": 6239 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 0.9348054260521418, + "learning_rate": 1.4330421637704828e-06, + "loss": 1.4989, + "step": 6240 + }, + { + "epoch": 0.8327995729917267, + "grad_norm": 0.9451331002738911, + "learning_rate": 1.4308136303342835e-06, + "loss": 1.5498, + "step": 6241 + }, + { + "epoch": 0.8329330130771284, + "grad_norm": 0.9455134123416408, + "learning_rate": 1.4285866975245333e-06, + "loss": 1.4996, + "step": 6242 + }, + { + "epoch": 0.83306645316253, + "grad_norm": 0.9642660018609069, + "learning_rate": 1.4263613657571995e-06, + "loss": 1.5439, + "step": 6243 + }, + { + "epoch": 0.8331998932479316, + "grad_norm": 1.2808082607240958, + "learning_rate": 1.4241376354479475e-06, + "loss": 1.5403, + "step": 6244 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.8043818463976273, + "learning_rate": 1.4219155070121438e-06, + "loss": 1.5046, + "step": 6245 + }, + { + "epoch": 0.833466773418735, + "grad_norm": 0.949954061324863, + "learning_rate": 1.4196949808648597e-06, + "loss": 1.5568, + "step": 6246 + }, + { + "epoch": 0.8336002135041366, + "grad_norm": 0.9241430802845367, + "learning_rate": 1.4174760574208634e-06, + "loss": 1.5384, + "step": 6247 + }, + { + "epoch": 0.8337336535895383, + "grad_norm": 0.9247044084518099, + "learning_rate": 1.4152587370946235e-06, + "loss": 1.5395, + "step": 6248 + }, + { + "epoch": 0.8338670936749399, + "grad_norm": 0.9470481656442147, + "learning_rate": 1.4130430203003088e-06, + "loss": 1.5524, + "step": 6249 + }, + { + "epoch": 0.8340005337603416, + "grad_norm": 0.95349918298535, + "learning_rate": 1.4108289074517934e-06, + "loss": 1.592, + "step": 6250 + }, + { + "epoch": 0.8341339738457433, + "grad_norm": 0.930859710005307, + "learning_rate": 1.4086163989626467e-06, + "loss": 1.5467, + "step": 6251 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 0.9827202874058659, + "learning_rate": 1.4064054952461382e-06, + "loss": 1.5415, + "step": 
6252 + }, + { + "epoch": 0.8344008540165466, + "grad_norm": 1.0609242939930341, + "learning_rate": 1.404196196715244e-06, + "loss": 1.5581, + "step": 6253 + }, + { + "epoch": 0.8345342941019482, + "grad_norm": 0.9879789904417466, + "learning_rate": 1.401988503782633e-06, + "loss": 1.541, + "step": 6254 + }, + { + "epoch": 0.8346677341873499, + "grad_norm": 0.9589864610692524, + "learning_rate": 1.3997824168606777e-06, + "loss": 1.5305, + "step": 6255 + }, + { + "epoch": 0.8348011742727516, + "grad_norm": 0.929209062077639, + "learning_rate": 1.397577936361446e-06, + "loss": 1.5345, + "step": 6256 + }, + { + "epoch": 0.8349346143581532, + "grad_norm": 0.9515351783274919, + "learning_rate": 1.3953750626967178e-06, + "loss": 1.5299, + "step": 6257 + }, + { + "epoch": 0.8350680544435548, + "grad_norm": 1.000540782983064, + "learning_rate": 1.39317379627796e-06, + "loss": 1.5766, + "step": 6258 + }, + { + "epoch": 0.8352014945289565, + "grad_norm": 1.0567054179998463, + "learning_rate": 1.3909741375163422e-06, + "loss": 1.5723, + "step": 6259 + }, + { + "epoch": 0.8353349346143581, + "grad_norm": 0.9549905329939615, + "learning_rate": 1.3887760868227396e-06, + "loss": 1.5789, + "step": 6260 + }, + { + "epoch": 0.8354683746997598, + "grad_norm": 0.9872576585804954, + "learning_rate": 1.386579644607723e-06, + "loss": 1.5734, + "step": 6261 + }, + { + "epoch": 0.8356018147851615, + "grad_norm": 0.9367144021734644, + "learning_rate": 1.3843848112815594e-06, + "loss": 1.5708, + "step": 6262 + }, + { + "epoch": 0.8357352548705631, + "grad_norm": 0.9513673423313426, + "learning_rate": 1.3821915872542202e-06, + "loss": 1.5313, + "step": 6263 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 0.9567825688256574, + "learning_rate": 1.3799999729353764e-06, + "loss": 1.5118, + "step": 6264 + }, + { + "epoch": 0.8360021350413664, + "grad_norm": 1.0863657842635757, + "learning_rate": 1.3778099687343948e-06, + "loss": 1.5669, + "step": 6265 + }, + { + "epoch": 0.8361355751267681, + "grad_norm": 0.9751182715087741, + "learning_rate": 1.375621575060343e-06, + "loss": 1.5848, + "step": 6266 + }, + { + "epoch": 0.8362690152121698, + "grad_norm": 0.9552963864700986, + "learning_rate": 1.3734347923219893e-06, + "loss": 1.5594, + "step": 6267 + }, + { + "epoch": 0.8364024552975714, + "grad_norm": 0.9334563882233021, + "learning_rate": 1.3712496209278004e-06, + "loss": 1.5085, + "step": 6268 + }, + { + "epoch": 0.836535895382973, + "grad_norm": 0.9418447706556196, + "learning_rate": 1.3690660612859397e-06, + "loss": 1.484, + "step": 6269 + }, + { + "epoch": 0.8366693354683747, + "grad_norm": 1.2452469016859455, + "learning_rate": 1.36688411380427e-06, + "loss": 1.5939, + "step": 6270 + }, + { + "epoch": 0.8368027755537764, + "grad_norm": 0.9630484251295978, + "learning_rate": 1.3647037788903582e-06, + "loss": 1.5588, + "step": 6271 + }, + { + "epoch": 0.836936215639178, + "grad_norm": 0.9163101746800517, + "learning_rate": 1.3625250569514636e-06, + "loss": 1.543, + "step": 6272 + }, + { + "epoch": 0.8370696557245797, + "grad_norm": 1.0159540192644183, + "learning_rate": 1.3603479483945482e-06, + "loss": 1.5317, + "step": 6273 + }, + { + "epoch": 0.8372030958099813, + "grad_norm": 0.9495771661136421, + "learning_rate": 1.3581724536262664e-06, + "loss": 1.5721, + "step": 6274 + }, + { + "epoch": 0.8373365358953829, + "grad_norm": 0.9259908173056178, + "learning_rate": 1.3559985730529824e-06, + "loss": 1.5375, + "step": 6275 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 1.0746035612986713, + 
"learning_rate": 1.353826307080749e-06, + "loss": 1.545, + "step": 6276 + }, + { + "epoch": 0.8376034160661863, + "grad_norm": 1.206161587602088, + "learning_rate": 1.3516556561153182e-06, + "loss": 1.4896, + "step": 6277 + }, + { + "epoch": 0.8377368561515879, + "grad_norm": 0.9394100024042479, + "learning_rate": 1.3494866205621492e-06, + "loss": 1.569, + "step": 6278 + }, + { + "epoch": 0.8378702962369896, + "grad_norm": 0.9298549718077799, + "learning_rate": 1.347319200826389e-06, + "loss": 1.5184, + "step": 6279 + }, + { + "epoch": 0.8380037363223912, + "grad_norm": 0.9364031759470136, + "learning_rate": 1.3451533973128873e-06, + "loss": 1.5611, + "step": 6280 + }, + { + "epoch": 0.838137176407793, + "grad_norm": 0.9438099018882347, + "learning_rate": 1.3429892104261922e-06, + "loss": 1.5884, + "step": 6281 + }, + { + "epoch": 0.8382706164931946, + "grad_norm": 1.0963915983521988, + "learning_rate": 1.3408266405705462e-06, + "loss": 1.5056, + "step": 6282 + }, + { + "epoch": 0.8384040565785962, + "grad_norm": 1.0553961263791714, + "learning_rate": 1.3386656881498982e-06, + "loss": 1.5072, + "step": 6283 + }, + { + "epoch": 0.8385374966639979, + "grad_norm": 0.9313093194946619, + "learning_rate": 1.3365063535678868e-06, + "loss": 1.5504, + "step": 6284 + }, + { + "epoch": 0.8386709367493995, + "grad_norm": 0.9472337824960516, + "learning_rate": 1.3343486372278502e-06, + "loss": 1.5925, + "step": 6285 + }, + { + "epoch": 0.8388043768348011, + "grad_norm": 0.9563239409348798, + "learning_rate": 1.3321925395328261e-06, + "loss": 1.4905, + "step": 6286 + }, + { + "epoch": 0.8389378169202029, + "grad_norm": 0.9193497426197157, + "learning_rate": 1.33003806088555e-06, + "loss": 1.5251, + "step": 6287 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 0.9463362147811565, + "learning_rate": 1.3278852016884491e-06, + "loss": 1.5755, + "step": 6288 + }, + { + "epoch": 0.8392046970910061, + "grad_norm": 1.0465969350369784, + "learning_rate": 1.3257339623436606e-06, + "loss": 1.558, + "step": 6289 + }, + { + "epoch": 0.8393381371764078, + "grad_norm": 0.9078117750928425, + "learning_rate": 1.323584343253007e-06, + "loss": 1.5181, + "step": 6290 + }, + { + "epoch": 0.8394715772618094, + "grad_norm": 0.9471706272387931, + "learning_rate": 1.3214363448180111e-06, + "loss": 1.5852, + "step": 6291 + }, + { + "epoch": 0.839605017347211, + "grad_norm": 0.9438263601613096, + "learning_rate": 1.3192899674398985e-06, + "loss": 1.5328, + "step": 6292 + }, + { + "epoch": 0.8397384574326128, + "grad_norm": 12.634716581956164, + "learning_rate": 1.317145211519587e-06, + "loss": 1.551, + "step": 6293 + }, + { + "epoch": 0.8398718975180144, + "grad_norm": 0.947237397048608, + "learning_rate": 1.315002077457692e-06, + "loss": 1.523, + "step": 6294 + }, + { + "epoch": 0.8400053376034161, + "grad_norm": 0.9268524675831373, + "learning_rate": 1.3128605656545245e-06, + "loss": 1.5293, + "step": 6295 + }, + { + "epoch": 0.8401387776888177, + "grad_norm": 0.9206635334147892, + "learning_rate": 1.3107206765100987e-06, + "loss": 1.5504, + "step": 6296 + }, + { + "epoch": 0.8402722177742193, + "grad_norm": 0.9437804573296683, + "learning_rate": 1.3085824104241185e-06, + "loss": 1.5366, + "step": 6297 + }, + { + "epoch": 0.8404056578596211, + "grad_norm": 0.9532930752867279, + "learning_rate": 1.3064457677959874e-06, + "loss": 1.5794, + "step": 6298 + }, + { + "epoch": 0.8405390979450227, + "grad_norm": 0.9331423826329348, + "learning_rate": 1.3043107490248086e-06, + "loss": 1.5141, + "step": 6299 + }, + { + "epoch": 
0.8406725380304243, + "grad_norm": 0.9831369699762285, + "learning_rate": 1.3021773545093775e-06, + "loss": 1.5939, + "step": 6300 + }, + { + "epoch": 0.840805978115826, + "grad_norm": 0.9671730377179253, + "learning_rate": 1.3000455846481886e-06, + "loss": 1.5556, + "step": 6301 + }, + { + "epoch": 0.8409394182012276, + "grad_norm": 0.9400385680698677, + "learning_rate": 1.29791543983943e-06, + "loss": 1.5407, + "step": 6302 + }, + { + "epoch": 0.8410728582866293, + "grad_norm": 0.9409729894045126, + "learning_rate": 1.2957869204809925e-06, + "loss": 1.5392, + "step": 6303 + }, + { + "epoch": 0.841206298372031, + "grad_norm": 0.94097206563248, + "learning_rate": 1.2936600269704559e-06, + "loss": 1.5605, + "step": 6304 + }, + { + "epoch": 0.8413397384574326, + "grad_norm": 0.9949943289521407, + "learning_rate": 1.291534759705102e-06, + "loss": 1.6005, + "step": 6305 + }, + { + "epoch": 0.8414731785428343, + "grad_norm": 0.9355636029437794, + "learning_rate": 1.2894111190819025e-06, + "loss": 1.5574, + "step": 6306 + }, + { + "epoch": 0.8416066186282359, + "grad_norm": 0.9725584670762065, + "learning_rate": 1.2872891054975346e-06, + "loss": 1.5632, + "step": 6307 + }, + { + "epoch": 0.8417400587136376, + "grad_norm": 0.9010547164650611, + "learning_rate": 1.2851687193483642e-06, + "loss": 1.4956, + "step": 6308 + }, + { + "epoch": 0.8418734987990393, + "grad_norm": 0.975052957442709, + "learning_rate": 1.2830499610304526e-06, + "loss": 1.6136, + "step": 6309 + }, + { + "epoch": 0.8420069388844409, + "grad_norm": 0.9766365640836961, + "learning_rate": 1.280932830939564e-06, + "loss": 1.5266, + "step": 6310 + }, + { + "epoch": 0.8421403789698425, + "grad_norm": 0.9456532975205986, + "learning_rate": 1.2788173294711526e-06, + "loss": 1.5745, + "step": 6311 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 0.9410865383508794, + "learning_rate": 1.2767034570203685e-06, + "loss": 1.5397, + "step": 6312 + }, + { + "epoch": 0.8424072591406458, + "grad_norm": 0.9832507755872817, + "learning_rate": 1.2745912139820594e-06, + "loss": 1.589, + "step": 6313 + }, + { + "epoch": 0.8425406992260475, + "grad_norm": 0.9510871209768055, + "learning_rate": 1.2724806007507706e-06, + "loss": 1.5622, + "step": 6314 + }, + { + "epoch": 0.8426741393114492, + "grad_norm": 0.9418455606671381, + "learning_rate": 1.2703716177207393e-06, + "loss": 1.5386, + "step": 6315 + }, + { + "epoch": 0.8428075793968508, + "grad_norm": 0.9267882316614677, + "learning_rate": 1.2682642652858968e-06, + "loss": 1.5246, + "step": 6316 + }, + { + "epoch": 0.8429410194822524, + "grad_norm": 0.9241066105531126, + "learning_rate": 1.2661585438398771e-06, + "loss": 1.5255, + "step": 6317 + }, + { + "epoch": 0.8430744595676541, + "grad_norm": 0.9299217006327872, + "learning_rate": 1.2640544537760035e-06, + "loss": 1.5331, + "step": 6318 + }, + { + "epoch": 0.8432078996530558, + "grad_norm": 0.9460477011701305, + "learning_rate": 1.261951995487295e-06, + "loss": 1.5356, + "step": 6319 + }, + { + "epoch": 0.8433413397384575, + "grad_norm": 0.99568183474739, + "learning_rate": 1.259851169366465e-06, + "loss": 1.5396, + "step": 6320 + }, + { + "epoch": 0.8434747798238591, + "grad_norm": 0.9531319408928621, + "learning_rate": 1.2577519758059286e-06, + "loss": 1.5645, + "step": 6321 + }, + { + "epoch": 0.8436082199092607, + "grad_norm": 1.4786773551886767, + "learning_rate": 1.255654415197789e-06, + "loss": 1.5033, + "step": 6322 + }, + { + "epoch": 0.8437416599946624, + "grad_norm": 0.9837349009581106, + "learning_rate": 
1.2535584879338469e-06, + "loss": 1.5878, + "step": 6323 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 0.9380148129468954, + "learning_rate": 1.2514641944055961e-06, + "loss": 1.5127, + "step": 6324 + }, + { + "epoch": 0.8440085401654657, + "grad_norm": 0.9800681388655729, + "learning_rate": 1.2493715350042267e-06, + "loss": 1.5177, + "step": 6325 + }, + { + "epoch": 0.8441419802508674, + "grad_norm": 0.9363139844260511, + "learning_rate": 1.2472805101206265e-06, + "loss": 1.5309, + "step": 6326 + }, + { + "epoch": 0.844275420336269, + "grad_norm": 0.9716477758023239, + "learning_rate": 1.2451911201453747e-06, + "loss": 1.5331, + "step": 6327 + }, + { + "epoch": 0.8444088604216706, + "grad_norm": 1.037355252448747, + "learning_rate": 1.243103365468743e-06, + "loss": 1.5556, + "step": 6328 + }, + { + "epoch": 0.8445423005070724, + "grad_norm": 1.127301529246472, + "learning_rate": 1.2410172464807024e-06, + "loss": 1.5456, + "step": 6329 + }, + { + "epoch": 0.844675740592474, + "grad_norm": 0.9421048807023837, + "learning_rate": 1.2389327635709136e-06, + "loss": 1.4901, + "step": 6330 + }, + { + "epoch": 0.8448091806778756, + "grad_norm": 1.1002964846418235, + "learning_rate": 1.2368499171287374e-06, + "loss": 1.6021, + "step": 6331 + }, + { + "epoch": 0.8449426207632773, + "grad_norm": 0.9387328752601601, + "learning_rate": 1.2347687075432246e-06, + "loss": 1.5047, + "step": 6332 + }, + { + "epoch": 0.8450760608486789, + "grad_norm": 0.9123192171247185, + "learning_rate": 1.2326891352031223e-06, + "loss": 1.5134, + "step": 6333 + }, + { + "epoch": 0.8452095009340806, + "grad_norm": 1.1014224330465954, + "learning_rate": 1.2306112004968662e-06, + "loss": 1.5769, + "step": 6334 + }, + { + "epoch": 0.8453429410194823, + "grad_norm": 0.9516250282510863, + "learning_rate": 1.2285349038125981e-06, + "loss": 1.585, + "step": 6335 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 0.9572784838289615, + "learning_rate": 1.2264602455381424e-06, + "loss": 1.5436, + "step": 6336 + }, + { + "epoch": 0.8456098211902856, + "grad_norm": 0.927276956609323, + "learning_rate": 1.2243872260610223e-06, + "loss": 1.4966, + "step": 6337 + }, + { + "epoch": 0.8457432612756872, + "grad_norm": 0.9439895084993872, + "learning_rate": 1.2223158457684526e-06, + "loss": 1.5114, + "step": 6338 + }, + { + "epoch": 0.8458767013610888, + "grad_norm": 1.0329741808696042, + "learning_rate": 1.2202461050473469e-06, + "loss": 1.5154, + "step": 6339 + }, + { + "epoch": 0.8460101414464906, + "grad_norm": 0.9806884542561266, + "learning_rate": 1.2181780042843071e-06, + "loss": 1.4914, + "step": 6340 + }, + { + "epoch": 0.8461435815318922, + "grad_norm": 1.0466476181426838, + "learning_rate": 1.2161115438656301e-06, + "loss": 1.5529, + "step": 6341 + }, + { + "epoch": 0.8462770216172938, + "grad_norm": 0.9141443517568483, + "learning_rate": 1.2140467241773103e-06, + "loss": 1.513, + "step": 6342 + }, + { + "epoch": 0.8464104617026955, + "grad_norm": 0.9944934691896574, + "learning_rate": 1.2119835456050311e-06, + "loss": 1.5567, + "step": 6343 + }, + { + "epoch": 0.8465439017880971, + "grad_norm": 1.0618279777174555, + "learning_rate": 1.2099220085341689e-06, + "loss": 1.5453, + "step": 6344 + }, + { + "epoch": 0.8466773418734987, + "grad_norm": 0.9826611891622069, + "learning_rate": 1.2078621133497958e-06, + "loss": 1.5373, + "step": 6345 + }, + { + "epoch": 0.8468107819589005, + "grad_norm": 0.9429543528239474, + "learning_rate": 1.2058038604366796e-06, + "loss": 1.5587, + "step": 6346 + }, + { + "epoch": 
0.8469442220443021, + "grad_norm": 0.9780300370698021, + "learning_rate": 1.2037472501792757e-06, + "loss": 1.5841, + "step": 6347 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 0.9485414768814703, + "learning_rate": 1.201692282961735e-06, + "loss": 1.532, + "step": 6348 + }, + { + "epoch": 0.8472111022151054, + "grad_norm": 0.9528273449657707, + "learning_rate": 1.1996389591679047e-06, + "loss": 1.5611, + "step": 6349 + }, + { + "epoch": 0.847344542300507, + "grad_norm": 0.9369740412100928, + "learning_rate": 1.1975872791813225e-06, + "loss": 1.5032, + "step": 6350 + }, + { + "epoch": 0.8474779823859088, + "grad_norm": 0.9223426425400875, + "learning_rate": 1.1955372433852163e-06, + "loss": 1.5512, + "step": 6351 + }, + { + "epoch": 0.8476114224713104, + "grad_norm": 0.9430681441104803, + "learning_rate": 1.1934888521625076e-06, + "loss": 1.561, + "step": 6352 + }, + { + "epoch": 0.847744862556712, + "grad_norm": 0.969082370915136, + "learning_rate": 1.1914421058958192e-06, + "loss": 1.5353, + "step": 6353 + }, + { + "epoch": 0.8478783026421137, + "grad_norm": 1.0413763556214914, + "learning_rate": 1.1893970049674553e-06, + "loss": 1.5652, + "step": 6354 + }, + { + "epoch": 0.8480117427275153, + "grad_norm": 0.9204143577512675, + "learning_rate": 1.1873535497594157e-06, + "loss": 1.5406, + "step": 6355 + }, + { + "epoch": 0.848145182812917, + "grad_norm": 1.1508462678819407, + "learning_rate": 1.185311740653401e-06, + "loss": 1.5223, + "step": 6356 + }, + { + "epoch": 0.8482786228983187, + "grad_norm": 1.0061378039252045, + "learning_rate": 1.1832715780307924e-06, + "loss": 1.5438, + "step": 6357 + }, + { + "epoch": 0.8484120629837203, + "grad_norm": 1.0156019695357563, + "learning_rate": 1.181233062272672e-06, + "loss": 1.5484, + "step": 6358 + }, + { + "epoch": 0.8485455030691219, + "grad_norm": 0.9583797003702161, + "learning_rate": 1.1791961937598073e-06, + "loss": 1.5511, + "step": 6359 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 1.77815502053269, + "learning_rate": 1.1771609728726674e-06, + "loss": 1.5944, + "step": 6360 + }, + { + "epoch": 0.8488123832399252, + "grad_norm": 0.9601898228216447, + "learning_rate": 1.1751273999914059e-06, + "loss": 1.5584, + "step": 6361 + }, + { + "epoch": 0.848945823325327, + "grad_norm": 1.0337878007106784, + "learning_rate": 1.173095475495869e-06, + "loss": 1.5255, + "step": 6362 + }, + { + "epoch": 0.8490792634107286, + "grad_norm": 1.069797310519057, + "learning_rate": 1.171065199765602e-06, + "loss": 1.5929, + "step": 6363 + }, + { + "epoch": 0.8492127034961302, + "grad_norm": 1.2157650247272633, + "learning_rate": 1.1690365731798337e-06, + "loss": 1.5466, + "step": 6364 + }, + { + "epoch": 0.8493461435815319, + "grad_norm": 1.0078551254584747, + "learning_rate": 1.1670095961174889e-06, + "loss": 1.5735, + "step": 6365 + }, + { + "epoch": 0.8494795836669335, + "grad_norm": 0.9609917847295425, + "learning_rate": 1.1649842689571855e-06, + "loss": 1.5789, + "step": 6366 + }, + { + "epoch": 0.8496130237523352, + "grad_norm": 0.9255278292579112, + "learning_rate": 1.1629605920772292e-06, + "loss": 1.5726, + "step": 6367 + }, + { + "epoch": 0.8497464638377369, + "grad_norm": 0.9461037345263595, + "learning_rate": 1.1609385658556183e-06, + "loss": 1.572, + "step": 6368 + }, + { + "epoch": 0.8498799039231385, + "grad_norm": 0.9622357864602324, + "learning_rate": 1.1589181906700498e-06, + "loss": 1.5018, + "step": 6369 + }, + { + "epoch": 0.8500133440085401, + "grad_norm": 0.9486160725025564, + "learning_rate": 1.156899466897904e-06, 
+ "loss": 1.5379, + "step": 6370 + }, + { + "epoch": 0.8501467840939418, + "grad_norm": 1.0361906044731701, + "learning_rate": 1.1548823949162546e-06, + "loss": 1.5514, + "step": 6371 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 0.9657512138409541, + "learning_rate": 1.1528669751018684e-06, + "loss": 1.508, + "step": 6372 + }, + { + "epoch": 0.8504136642647452, + "grad_norm": 0.9981316716898595, + "learning_rate": 1.1508532078312007e-06, + "loss": 1.5569, + "step": 6373 + }, + { + "epoch": 0.8505471043501468, + "grad_norm": 0.9476506985506088, + "learning_rate": 1.1488410934804051e-06, + "loss": 1.539, + "step": 6374 + }, + { + "epoch": 0.8506805444355484, + "grad_norm": 1.026783945248529, + "learning_rate": 1.1468306324253187e-06, + "loss": 1.55, + "step": 6375 + }, + { + "epoch": 0.8508139845209501, + "grad_norm": 1.0018298913831105, + "learning_rate": 1.1448218250414734e-06, + "loss": 1.5283, + "step": 6376 + }, + { + "epoch": 0.8509474246063518, + "grad_norm": 0.9541809577080766, + "learning_rate": 1.1428146717040888e-06, + "loss": 1.4845, + "step": 6377 + }, + { + "epoch": 0.8510808646917534, + "grad_norm": 0.94168687520855, + "learning_rate": 1.1408091727880822e-06, + "loss": 1.5261, + "step": 6378 + }, + { + "epoch": 0.8512143047771551, + "grad_norm": 0.9534267240246544, + "learning_rate": 1.1388053286680566e-06, + "loss": 1.5505, + "step": 6379 + }, + { + "epoch": 0.8513477448625567, + "grad_norm": 0.9608021930487494, + "learning_rate": 1.1368031397183055e-06, + "loss": 1.5709, + "step": 6380 + }, + { + "epoch": 0.8514811849479583, + "grad_norm": 1.0827148794751535, + "learning_rate": 1.1348026063128193e-06, + "loss": 1.54, + "step": 6381 + }, + { + "epoch": 0.85161462503336, + "grad_norm": 0.9496776082560817, + "learning_rate": 1.1328037288252714e-06, + "loss": 1.5394, + "step": 6382 + }, + { + "epoch": 0.8517480651187617, + "grad_norm": 0.925719277344387, + "learning_rate": 1.1308065076290298e-06, + "loss": 1.5546, + "step": 6383 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 0.99649007906056, + "learning_rate": 1.1288109430971516e-06, + "loss": 1.5796, + "step": 6384 + }, + { + "epoch": 0.852014945289565, + "grad_norm": 0.9217233177569367, + "learning_rate": 1.1268170356023889e-06, + "loss": 1.4987, + "step": 6385 + }, + { + "epoch": 0.8521483853749666, + "grad_norm": 0.9795549846257534, + "learning_rate": 1.124824785517179e-06, + "loss": 1.498, + "step": 6386 + }, + { + "epoch": 0.8522818254603683, + "grad_norm": 0.9466460505441463, + "learning_rate": 1.122834193213649e-06, + "loss": 1.5069, + "step": 6387 + }, + { + "epoch": 0.85241526554577, + "grad_norm": 0.9622325848501593, + "learning_rate": 1.1208452590636243e-06, + "loss": 1.572, + "step": 6388 + }, + { + "epoch": 0.8525487056311716, + "grad_norm": 0.907176146573117, + "learning_rate": 1.1188579834386116e-06, + "loss": 1.5447, + "step": 6389 + }, + { + "epoch": 0.8526821457165733, + "grad_norm": 1.227317725567885, + "learning_rate": 1.1168723667098115e-06, + "loss": 1.5481, + "step": 6390 + }, + { + "epoch": 0.8528155858019749, + "grad_norm": 0.9872294854469825, + "learning_rate": 1.1148884092481138e-06, + "loss": 1.5726, + "step": 6391 + }, + { + "epoch": 0.8529490258873765, + "grad_norm": 0.9774988409822314, + "learning_rate": 1.1129061114241024e-06, + "loss": 1.5505, + "step": 6392 + }, + { + "epoch": 0.8530824659727783, + "grad_norm": 0.9342823752805767, + "learning_rate": 1.1109254736080456e-06, + "loss": 1.5665, + "step": 6393 + }, + { + "epoch": 0.8532159060581799, + "grad_norm": 
0.945834873384594, + "learning_rate": 1.1089464961699025e-06, + "loss": 1.5237, + "step": 6394 + }, + { + "epoch": 0.8533493461435815, + "grad_norm": 1.0390817853688066, + "learning_rate": 1.106969179479328e-06, + "loss": 1.559, + "step": 6395 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 0.9915119704103906, + "learning_rate": 1.1049935239056598e-06, + "loss": 1.5479, + "step": 6396 + }, + { + "epoch": 0.8536162263143848, + "grad_norm": 0.9778650816854455, + "learning_rate": 1.1030195298179269e-06, + "loss": 1.5655, + "step": 6397 + }, + { + "epoch": 0.8537496663997864, + "grad_norm": 0.9412423616517623, + "learning_rate": 1.101047197584849e-06, + "loss": 1.5287, + "step": 6398 + }, + { + "epoch": 0.8538831064851882, + "grad_norm": 0.9125544805452201, + "learning_rate": 1.0990765275748383e-06, + "loss": 1.5672, + "step": 6399 + }, + { + "epoch": 0.8540165465705898, + "grad_norm": 0.9887312218815218, + "learning_rate": 1.09710752015599e-06, + "loss": 1.5265, + "step": 6400 + }, + { + "epoch": 0.8541499866559915, + "grad_norm": 1.0483588182325054, + "learning_rate": 1.0951401756960934e-06, + "loss": 1.5733, + "step": 6401 + }, + { + "epoch": 0.8542834267413931, + "grad_norm": 0.9407674603973352, + "learning_rate": 1.0931744945626276e-06, + "loss": 1.5581, + "step": 6402 + }, + { + "epoch": 0.8544168668267947, + "grad_norm": 0.9313568972065337, + "learning_rate": 1.0912104771227584e-06, + "loss": 1.5329, + "step": 6403 + }, + { + "epoch": 0.8545503069121965, + "grad_norm": 0.9194823821651573, + "learning_rate": 1.0892481237433405e-06, + "loss": 1.5163, + "step": 6404 + }, + { + "epoch": 0.8546837469975981, + "grad_norm": 1.0427301724789466, + "learning_rate": 1.08728743479092e-06, + "loss": 1.5137, + "step": 6405 + }, + { + "epoch": 0.8548171870829997, + "grad_norm": 0.9377269194420035, + "learning_rate": 1.085328410631733e-06, + "loss": 1.4877, + "step": 6406 + }, + { + "epoch": 0.8549506271684014, + "grad_norm": 1.021530858485102, + "learning_rate": 1.0833710516317009e-06, + "loss": 1.599, + "step": 6407 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 0.9614653085487166, + "learning_rate": 1.081415358156438e-06, + "loss": 1.562, + "step": 6408 + }, + { + "epoch": 0.8552175073392047, + "grad_norm": 0.9332515315220702, + "learning_rate": 1.0794613305712432e-06, + "loss": 1.5861, + "step": 6409 + }, + { + "epoch": 0.8553509474246064, + "grad_norm": 0.9465013594113041, + "learning_rate": 1.0775089692411057e-06, + "loss": 1.544, + "step": 6410 + }, + { + "epoch": 0.855484387510008, + "grad_norm": 0.9299855464015608, + "learning_rate": 1.075558274530709e-06, + "loss": 1.5169, + "step": 6411 + }, + { + "epoch": 0.8556178275954096, + "grad_norm": 0.9633862952339556, + "learning_rate": 1.073609246804418e-06, + "loss": 1.5498, + "step": 6412 + }, + { + "epoch": 0.8557512676808113, + "grad_norm": 0.9683201294799403, + "learning_rate": 1.0716618864262885e-06, + "loss": 1.4671, + "step": 6413 + }, + { + "epoch": 0.855884707766213, + "grad_norm": 0.9649479650839727, + "learning_rate": 1.0697161937600665e-06, + "loss": 1.57, + "step": 6414 + }, + { + "epoch": 0.8560181478516147, + "grad_norm": 0.9185273611508583, + "learning_rate": 1.0677721691691833e-06, + "loss": 1.5408, + "step": 6415 + }, + { + "epoch": 0.8561515879370163, + "grad_norm": 0.9695842140595766, + "learning_rate": 1.0658298130167599e-06, + "loss": 1.507, + "step": 6416 + }, + { + "epoch": 0.8562850280224179, + "grad_norm": 0.9303971531538695, + "learning_rate": 1.0638891256656103e-06, + "loss": 1.4942, + "step": 6417 + }, + 
{ + "epoch": 0.8564184681078196, + "grad_norm": 1.0549407029198568, + "learning_rate": 1.0619501074782313e-06, + "loss": 1.5486, + "step": 6418 + }, + { + "epoch": 0.8565519081932212, + "grad_norm": 3.80436306285889, + "learning_rate": 1.0600127588168063e-06, + "loss": 1.5436, + "step": 6419 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 0.9231151613693299, + "learning_rate": 1.0580770800432139e-06, + "loss": 1.5291, + "step": 6420 + }, + { + "epoch": 0.8568187883640246, + "grad_norm": 1.076962491434642, + "learning_rate": 1.0561430715190158e-06, + "loss": 1.5237, + "step": 6421 + }, + { + "epoch": 0.8569522284494262, + "grad_norm": 0.9284052147555532, + "learning_rate": 1.054210733605462e-06, + "loss": 1.5671, + "step": 6422 + }, + { + "epoch": 0.8570856685348278, + "grad_norm": 1.013864860833294, + "learning_rate": 1.0522800666634891e-06, + "loss": 1.513, + "step": 6423 + }, + { + "epoch": 0.8572191086202295, + "grad_norm": 1.0640857162799244, + "learning_rate": 1.050351071053729e-06, + "loss": 1.5686, + "step": 6424 + }, + { + "epoch": 0.8573525487056312, + "grad_norm": 1.6541230030547414, + "learning_rate": 1.0484237471364922e-06, + "loss": 1.5551, + "step": 6425 + }, + { + "epoch": 0.8574859887910328, + "grad_norm": 1.0049096936614765, + "learning_rate": 1.0464980952717807e-06, + "loss": 1.507, + "step": 6426 + }, + { + "epoch": 0.8576194288764345, + "grad_norm": 0.9371213973946467, + "learning_rate": 1.0445741158192879e-06, + "loss": 1.5747, + "step": 6427 + }, + { + "epoch": 0.8577528689618361, + "grad_norm": 1.2240293324907436, + "learning_rate": 1.0426518091383886e-06, + "loss": 1.6061, + "step": 6428 + }, + { + "epoch": 0.8578863090472378, + "grad_norm": 0.9655483000092794, + "learning_rate": 1.0407311755881477e-06, + "loss": 1.5201, + "step": 6429 + }, + { + "epoch": 0.8580197491326395, + "grad_norm": 1.0963997227966682, + "learning_rate": 1.0388122155273162e-06, + "loss": 1.54, + "step": 6430 + }, + { + "epoch": 0.8581531892180411, + "grad_norm": 1.0333139188267229, + "learning_rate": 1.0368949293143383e-06, + "loss": 1.5161, + "step": 6431 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 0.9470028994674716, + "learning_rate": 1.0349793173073386e-06, + "loss": 1.5838, + "step": 6432 + }, + { + "epoch": 0.8584200693888444, + "grad_norm": 0.945024534455689, + "learning_rate": 1.0330653798641288e-06, + "loss": 1.5591, + "step": 6433 + }, + { + "epoch": 0.858553509474246, + "grad_norm": 0.9404662766746317, + "learning_rate": 1.0311531173422172e-06, + "loss": 1.5622, + "step": 6434 + }, + { + "epoch": 0.8586869495596477, + "grad_norm": 0.9927286739349651, + "learning_rate": 1.0292425300987885e-06, + "loss": 1.5112, + "step": 6435 + }, + { + "epoch": 0.8588203896450494, + "grad_norm": 0.9324477538639491, + "learning_rate": 1.027333618490719e-06, + "loss": 1.5527, + "step": 6436 + }, + { + "epoch": 0.858953829730451, + "grad_norm": 0.9841322098310834, + "learning_rate": 1.0254263828745704e-06, + "loss": 1.5666, + "step": 6437 + }, + { + "epoch": 0.8590872698158527, + "grad_norm": 0.9705280053887522, + "learning_rate": 1.0235208236065964e-06, + "loss": 1.4959, + "step": 6438 + }, + { + "epoch": 0.8592207099012543, + "grad_norm": 0.961832358126531, + "learning_rate": 1.0216169410427312e-06, + "loss": 1.5583, + "step": 6439 + }, + { + "epoch": 0.859354149986656, + "grad_norm": 0.937404869373422, + "learning_rate": 1.0197147355385983e-06, + "loss": 1.4833, + "step": 6440 + }, + { + "epoch": 0.8594875900720577, + "grad_norm": 0.9488407481430322, + "learning_rate": 
1.0178142074495068e-06, + "loss": 1.5485, + "step": 6441 + }, + { + "epoch": 0.8596210301574593, + "grad_norm": 0.9318508881776466, + "learning_rate": 1.015915357130457e-06, + "loss": 1.5505, + "step": 6442 + }, + { + "epoch": 0.859754470242861, + "grad_norm": 0.9180739689951576, + "learning_rate": 1.0140181849361307e-06, + "loss": 1.4841, + "step": 6443 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 0.9168330491896693, + "learning_rate": 1.012122691220897e-06, + "loss": 1.4934, + "step": 6444 + }, + { + "epoch": 0.8600213504136642, + "grad_norm": 0.936550488916631, + "learning_rate": 1.0102288763388147e-06, + "loss": 1.5639, + "step": 6445 + }, + { + "epoch": 0.860154790499066, + "grad_norm": 1.2407428273088301, + "learning_rate": 1.0083367406436263e-06, + "loss": 1.5286, + "step": 6446 + }, + { + "epoch": 0.8602882305844676, + "grad_norm": 0.9564893007922771, + "learning_rate": 1.0064462844887613e-06, + "loss": 1.5186, + "step": 6447 + }, + { + "epoch": 0.8604216706698692, + "grad_norm": 0.9357452140960504, + "learning_rate": 1.004557508227333e-06, + "loss": 1.5172, + "step": 6448 + }, + { + "epoch": 0.8605551107552709, + "grad_norm": 0.8972018016158957, + "learning_rate": 1.0026704122121466e-06, + "loss": 1.5275, + "step": 6449 + }, + { + "epoch": 0.8606885508406725, + "grad_norm": 0.9998053596687703, + "learning_rate": 1.0007849967956884e-06, + "loss": 1.5544, + "step": 6450 + }, + { + "epoch": 0.8608219909260741, + "grad_norm": 0.9489874805787626, + "learning_rate": 9.989012623301343e-07, + "loss": 1.5508, + "step": 6451 + }, + { + "epoch": 0.8609554310114759, + "grad_norm": 0.9471306443210268, + "learning_rate": 9.970192091673414e-07, + "loss": 1.534, + "step": 6452 + }, + { + "epoch": 0.8610888710968775, + "grad_norm": 0.9137682040739373, + "learning_rate": 9.951388376588567e-07, + "loss": 1.5459, + "step": 6453 + }, + { + "epoch": 0.8612223111822792, + "grad_norm": 0.9641556651125667, + "learning_rate": 9.932601481559146e-07, + "loss": 1.5387, + "step": 6454 + }, + { + "epoch": 0.8613557512676808, + "grad_norm": 0.93922770841347, + "learning_rate": 9.91383141009431e-07, + "loss": 1.5581, + "step": 6455 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 0.9436691135229323, + "learning_rate": 9.8950781657001e-07, + "loss": 1.5473, + "step": 6456 + }, + { + "epoch": 0.8616226314384842, + "grad_norm": 0.9338614269793593, + "learning_rate": 9.876341751879404e-07, + "loss": 1.5505, + "step": 6457 + }, + { + "epoch": 0.8617560715238858, + "grad_norm": 0.9397414325734582, + "learning_rate": 9.857622172131952e-07, + "loss": 1.5642, + "step": 6458 + }, + { + "epoch": 0.8618895116092874, + "grad_norm": 1.0086274613066035, + "learning_rate": 9.838919429954386e-07, + "loss": 1.5355, + "step": 6459 + }, + { + "epoch": 0.8620229516946891, + "grad_norm": 0.9481076250398918, + "learning_rate": 9.820233528840151e-07, + "loss": 1.5557, + "step": 6460 + }, + { + "epoch": 0.8621563917800907, + "grad_norm": 0.9155281021750805, + "learning_rate": 9.801564472279557e-07, + "loss": 1.4963, + "step": 6461 + }, + { + "epoch": 0.8622898318654924, + "grad_norm": 0.9144924099874533, + "learning_rate": 9.782912263759748e-07, + "loss": 1.5214, + "step": 6462 + }, + { + "epoch": 0.8624232719508941, + "grad_norm": 0.9363225569423211, + "learning_rate": 9.764276906764792e-07, + "loss": 1.5297, + "step": 6463 + }, + { + "epoch": 0.8625567120362957, + "grad_norm": 0.9435473694327043, + "learning_rate": 9.745658404775537e-07, + "loss": 1.5394, + "step": 6464 + }, + { + "epoch": 0.8626901521216973, + 
"grad_norm": 0.9985762329589031, + "learning_rate": 9.727056761269693e-07, + "loss": 1.5468, + "step": 6465 + }, + { + "epoch": 0.862823592207099, + "grad_norm": 0.9290866980377026, + "learning_rate": 9.708471979721868e-07, + "loss": 1.5702, + "step": 6466 + }, + { + "epoch": 0.8629570322925006, + "grad_norm": 1.1072478162840556, + "learning_rate": 9.689904063603461e-07, + "loss": 1.582, + "step": 6467 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 0.9126478135993858, + "learning_rate": 9.671353016382767e-07, + "loss": 1.5269, + "step": 6468 + }, + { + "epoch": 0.863223912463304, + "grad_norm": 0.9443342912207419, + "learning_rate": 9.65281884152487e-07, + "loss": 1.5555, + "step": 6469 + }, + { + "epoch": 0.8633573525487056, + "grad_norm": 0.9209254675884877, + "learning_rate": 9.634301542491798e-07, + "loss": 1.5324, + "step": 6470 + }, + { + "epoch": 0.8634907926341073, + "grad_norm": 0.9549939583445304, + "learning_rate": 9.61580112274234e-07, + "loss": 1.5308, + "step": 6471 + }, + { + "epoch": 0.8636242327195089, + "grad_norm": 0.9770641027377173, + "learning_rate": 9.59731758573217e-07, + "loss": 1.544, + "step": 6472 + }, + { + "epoch": 0.8637576728049106, + "grad_norm": 0.8817507518794163, + "learning_rate": 9.578850934913786e-07, + "loss": 1.5241, + "step": 6473 + }, + { + "epoch": 0.8638911128903123, + "grad_norm": 0.9128330688258248, + "learning_rate": 9.560401173736588e-07, + "loss": 1.5328, + "step": 6474 + }, + { + "epoch": 0.8640245529757139, + "grad_norm": 0.9271171282021506, + "learning_rate": 9.541968305646754e-07, + "loss": 1.5089, + "step": 6475 + }, + { + "epoch": 0.8641579930611155, + "grad_norm": 0.898516087846143, + "learning_rate": 9.523552334087316e-07, + "loss": 1.4933, + "step": 6476 + }, + { + "epoch": 0.8642914331465172, + "grad_norm": 0.9455820905578254, + "learning_rate": 9.505153262498201e-07, + "loss": 1.5412, + "step": 6477 + }, + { + "epoch": 0.8644248732319189, + "grad_norm": 0.9769692174530545, + "learning_rate": 9.486771094316149e-07, + "loss": 1.5058, + "step": 6478 + }, + { + "epoch": 0.8645583133173205, + "grad_norm": 0.9225921262405403, + "learning_rate": 9.468405832974714e-07, + "loss": 1.4898, + "step": 6479 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 0.9336861605888874, + "learning_rate": 9.450057481904306e-07, + "loss": 1.5869, + "step": 6480 + }, + { + "epoch": 0.8648251934881238, + "grad_norm": 1.1283612150349465, + "learning_rate": 9.431726044532241e-07, + "loss": 1.5553, + "step": 6481 + }, + { + "epoch": 0.8649586335735255, + "grad_norm": 0.9124111539042673, + "learning_rate": 9.41341152428259e-07, + "loss": 1.5299, + "step": 6482 + }, + { + "epoch": 0.8650920736589272, + "grad_norm": 0.9550521448834836, + "learning_rate": 9.395113924576271e-07, + "loss": 1.549, + "step": 6483 + }, + { + "epoch": 0.8652255137443288, + "grad_norm": 0.9872576025252855, + "learning_rate": 9.376833248831119e-07, + "loss": 1.5379, + "step": 6484 + }, + { + "epoch": 0.8653589538297305, + "grad_norm": 0.9222828401540125, + "learning_rate": 9.358569500461734e-07, + "loss": 1.5166, + "step": 6485 + }, + { + "epoch": 0.8654923939151321, + "grad_norm": 1.0447537425480853, + "learning_rate": 9.340322682879577e-07, + "loss": 1.5124, + "step": 6486 + }, + { + "epoch": 0.8656258340005337, + "grad_norm": 0.9039392609995586, + "learning_rate": 9.322092799492921e-07, + "loss": 1.5092, + "step": 6487 + }, + { + "epoch": 0.8657592740859354, + "grad_norm": 0.9498842193910904, + "learning_rate": 9.303879853706955e-07, + "loss": 1.5568, + "step": 6488 + }, + 
{ + "epoch": 0.8658927141713371, + "grad_norm": 0.9356401186158558, + "learning_rate": 9.285683848923599e-07, + "loss": 1.5631, + "step": 6489 + }, + { + "epoch": 0.8660261542567387, + "grad_norm": 0.9816096213234118, + "learning_rate": 9.26750478854167e-07, + "loss": 1.5634, + "step": 6490 + }, + { + "epoch": 0.8661595943421404, + "grad_norm": 0.9902314871977751, + "learning_rate": 9.249342675956841e-07, + "loss": 1.5368, + "step": 6491 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 0.913769147225799, + "learning_rate": 9.231197514561552e-07, + "loss": 1.494, + "step": 6492 + }, + { + "epoch": 0.8664264745129436, + "grad_norm": 0.9901239051487305, + "learning_rate": 9.213069307745137e-07, + "loss": 1.529, + "step": 6493 + }, + { + "epoch": 0.8665599145983454, + "grad_norm": 0.9186278024861386, + "learning_rate": 9.194958058893722e-07, + "loss": 1.5525, + "step": 6494 + }, + { + "epoch": 0.866693354683747, + "grad_norm": 0.9601791869003774, + "learning_rate": 9.176863771390288e-07, + "loss": 1.5595, + "step": 6495 + }, + { + "epoch": 0.8668267947691487, + "grad_norm": 0.9344491574122844, + "learning_rate": 9.158786448614621e-07, + "loss": 1.5765, + "step": 6496 + }, + { + "epoch": 0.8669602348545503, + "grad_norm": 1.0059039142212696, + "learning_rate": 9.140726093943409e-07, + "loss": 1.5543, + "step": 6497 + }, + { + "epoch": 0.8670936749399519, + "grad_norm": 1.1316836388517142, + "learning_rate": 9.122682710750074e-07, + "loss": 1.5017, + "step": 6498 + }, + { + "epoch": 0.8672271150253537, + "grad_norm": 0.9402893670787278, + "learning_rate": 9.104656302404946e-07, + "loss": 1.5371, + "step": 6499 + }, + { + "epoch": 0.8673605551107553, + "grad_norm": 0.9832968180647175, + "learning_rate": 9.086646872275129e-07, + "loss": 1.5387, + "step": 6500 + }, + { + "epoch": 0.8674939951961569, + "grad_norm": 0.9514391898183131, + "learning_rate": 9.068654423724577e-07, + "loss": 1.5823, + "step": 6501 + }, + { + "epoch": 0.8676274352815586, + "grad_norm": 0.9488421342873816, + "learning_rate": 9.050678960114101e-07, + "loss": 1.5656, + "step": 6502 + }, + { + "epoch": 0.8677608753669602, + "grad_norm": 0.9115733795737984, + "learning_rate": 9.032720484801294e-07, + "loss": 1.5759, + "step": 6503 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 0.9981354349705964, + "learning_rate": 9.014779001140606e-07, + "loss": 1.574, + "step": 6504 + }, + { + "epoch": 0.8680277555377636, + "grad_norm": 0.9520562761893402, + "learning_rate": 8.996854512483277e-07, + "loss": 1.6245, + "step": 6505 + }, + { + "epoch": 0.8681611956231652, + "grad_norm": 1.4726974542567874, + "learning_rate": 8.978947022177431e-07, + "loss": 1.525, + "step": 6506 + }, + { + "epoch": 0.8682946357085669, + "grad_norm": 1.184012125670187, + "learning_rate": 8.961056533567969e-07, + "loss": 1.5583, + "step": 6507 + }, + { + "epoch": 0.8684280757939685, + "grad_norm": 0.9351096655055477, + "learning_rate": 8.943183049996606e-07, + "loss": 1.6061, + "step": 6508 + }, + { + "epoch": 0.8685615158793701, + "grad_norm": 0.9305843754964107, + "learning_rate": 8.925326574801952e-07, + "loss": 1.5381, + "step": 6509 + }, + { + "epoch": 0.8686949559647719, + "grad_norm": 1.3119179296076262, + "learning_rate": 8.907487111319368e-07, + "loss": 1.5312, + "step": 6510 + }, + { + "epoch": 0.8688283960501735, + "grad_norm": 0.9409917789541443, + "learning_rate": 8.889664662881059e-07, + "loss": 1.5381, + "step": 6511 + }, + { + "epoch": 0.8689618361355751, + "grad_norm": 0.9287019422318122, + "learning_rate": 8.871859232816049e-07, + 
"loss": 1.5654, + "step": 6512 + }, + { + "epoch": 0.8690952762209768, + "grad_norm": 0.9231687758110173, + "learning_rate": 8.854070824450223e-07, + "loss": 1.6082, + "step": 6513 + }, + { + "epoch": 0.8692287163063784, + "grad_norm": 0.9338271404408033, + "learning_rate": 8.836299441106222e-07, + "loss": 1.5037, + "step": 6514 + }, + { + "epoch": 0.86936215639178, + "grad_norm": 0.9727417740878118, + "learning_rate": 8.818545086103536e-07, + "loss": 1.5739, + "step": 6515 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 0.95148914834048, + "learning_rate": 8.800807762758501e-07, + "loss": 1.5617, + "step": 6516 + }, + { + "epoch": 0.8696290365625834, + "grad_norm": 0.9361374474665792, + "learning_rate": 8.783087474384245e-07, + "loss": 1.5317, + "step": 6517 + }, + { + "epoch": 0.869762476647985, + "grad_norm": 0.9353510556127035, + "learning_rate": 8.765384224290697e-07, + "loss": 1.5276, + "step": 6518 + }, + { + "epoch": 0.8698959167333867, + "grad_norm": 0.9385834806918207, + "learning_rate": 8.747698015784612e-07, + "loss": 1.5294, + "step": 6519 + }, + { + "epoch": 0.8700293568187883, + "grad_norm": 0.97192379174236, + "learning_rate": 8.730028852169614e-07, + "loss": 1.516, + "step": 6520 + }, + { + "epoch": 0.8701627969041901, + "grad_norm": 0.9442829507665126, + "learning_rate": 8.712376736746075e-07, + "loss": 1.4664, + "step": 6521 + }, + { + "epoch": 0.8702962369895917, + "grad_norm": 1.0520474911116193, + "learning_rate": 8.694741672811191e-07, + "loss": 1.5211, + "step": 6522 + }, + { + "epoch": 0.8704296770749933, + "grad_norm": 0.9648717939397788, + "learning_rate": 8.677123663659038e-07, + "loss": 1.5513, + "step": 6523 + }, + { + "epoch": 0.870563117160395, + "grad_norm": 0.968648455882469, + "learning_rate": 8.659522712580437e-07, + "loss": 1.5971, + "step": 6524 + }, + { + "epoch": 0.8706965572457966, + "grad_norm": 0.9731547427610653, + "learning_rate": 8.641938822863039e-07, + "loss": 1.5163, + "step": 6525 + }, + { + "epoch": 0.8708299973311983, + "grad_norm": 1.3042383135891946, + "learning_rate": 8.624371997791292e-07, + "loss": 1.5791, + "step": 6526 + }, + { + "epoch": 0.8709634374166, + "grad_norm": 0.9354023468065126, + "learning_rate": 8.60682224064654e-07, + "loss": 1.5338, + "step": 6527 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 1.0952681463977836, + "learning_rate": 8.589289554706826e-07, + "loss": 1.4912, + "step": 6528 + }, + { + "epoch": 0.8712303175874032, + "grad_norm": 0.9461007651182953, + "learning_rate": 8.571773943247063e-07, + "loss": 1.5812, + "step": 6529 + }, + { + "epoch": 0.8713637576728049, + "grad_norm": 0.9328438835651939, + "learning_rate": 8.554275409539004e-07, + "loss": 1.5688, + "step": 6530 + }, + { + "epoch": 0.8714971977582066, + "grad_norm": 0.9452776089490901, + "learning_rate": 8.536793956851141e-07, + "loss": 1.519, + "step": 6531 + }, + { + "epoch": 0.8716306378436082, + "grad_norm": 0.9314474567474438, + "learning_rate": 8.519329588448822e-07, + "loss": 1.487, + "step": 6532 + }, + { + "epoch": 0.8717640779290099, + "grad_norm": 0.9203268868498949, + "learning_rate": 8.501882307594167e-07, + "loss": 1.5422, + "step": 6533 + }, + { + "epoch": 0.8718975180144115, + "grad_norm": 0.9548253170186739, + "learning_rate": 8.484452117546171e-07, + "loss": 1.5782, + "step": 6534 + }, + { + "epoch": 0.8720309580998132, + "grad_norm": 0.9327588816923488, + "learning_rate": 8.467039021560575e-07, + "loss": 1.5449, + "step": 6535 + }, + { + "epoch": 0.8721643981852149, + "grad_norm": 1.116947865086207, + 
"learning_rate": 8.449643022889953e-07, + "loss": 1.5331, + "step": 6536 + }, + { + "epoch": 0.8722978382706165, + "grad_norm": 0.9154028744010556, + "learning_rate": 8.432264124783662e-07, + "loss": 1.5358, + "step": 6537 + }, + { + "epoch": 0.8724312783560182, + "grad_norm": 0.9584163843597435, + "learning_rate": 8.414902330487906e-07, + "loss": 1.5348, + "step": 6538 + }, + { + "epoch": 0.8725647184414198, + "grad_norm": 0.96915727974722, + "learning_rate": 8.397557643245646e-07, + "loss": 1.5153, + "step": 6539 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 0.9454548862508139, + "learning_rate": 8.38023006629669e-07, + "loss": 1.5727, + "step": 6540 + }, + { + "epoch": 0.8728315986122231, + "grad_norm": 0.9381925491031865, + "learning_rate": 8.36291960287764e-07, + "loss": 1.5877, + "step": 6541 + }, + { + "epoch": 0.8729650386976248, + "grad_norm": 0.9475264001662446, + "learning_rate": 8.345626256221873e-07, + "loss": 1.5583, + "step": 6542 + }, + { + "epoch": 0.8730984787830264, + "grad_norm": 0.9063919928958889, + "learning_rate": 8.328350029559595e-07, + "loss": 1.5421, + "step": 6543 + }, + { + "epoch": 0.8732319188684281, + "grad_norm": 1.162151324167172, + "learning_rate": 8.311090926117793e-07, + "loss": 1.5868, + "step": 6544 + }, + { + "epoch": 0.8733653589538297, + "grad_norm": 0.9586262421769765, + "learning_rate": 8.293848949120309e-07, + "loss": 1.5314, + "step": 6545 + }, + { + "epoch": 0.8734987990392313, + "grad_norm": 0.9299431509946539, + "learning_rate": 8.276624101787733e-07, + "loss": 1.4904, + "step": 6546 + }, + { + "epoch": 0.8736322391246331, + "grad_norm": 1.1008446800425085, + "learning_rate": 8.259416387337437e-07, + "loss": 1.5502, + "step": 6547 + }, + { + "epoch": 0.8737656792100347, + "grad_norm": 0.9407230063487958, + "learning_rate": 8.242225808983684e-07, + "loss": 1.5496, + "step": 6548 + }, + { + "epoch": 0.8738991192954364, + "grad_norm": 0.9950896376938949, + "learning_rate": 8.225052369937436e-07, + "loss": 1.569, + "step": 6549 + }, + { + "epoch": 0.874032559380838, + "grad_norm": 0.9343428376024484, + "learning_rate": 8.207896073406518e-07, + "loss": 1.5303, + "step": 6550 + }, + { + "epoch": 0.8741659994662396, + "grad_norm": 0.9213894936942306, + "learning_rate": 8.1907569225955e-07, + "loss": 1.5117, + "step": 6551 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 1.0003551884905675, + "learning_rate": 8.17363492070582e-07, + "loss": 1.5317, + "step": 6552 + }, + { + "epoch": 0.874432879637043, + "grad_norm": 0.952551223522479, + "learning_rate": 8.156530070935654e-07, + "loss": 1.5314, + "step": 6553 + }, + { + "epoch": 0.8745663197224446, + "grad_norm": 0.9370104185043665, + "learning_rate": 8.139442376479967e-07, + "loss": 1.5173, + "step": 6554 + }, + { + "epoch": 0.8746997598078463, + "grad_norm": 1.0396921232304424, + "learning_rate": 8.122371840530597e-07, + "loss": 1.5413, + "step": 6555 + }, + { + "epoch": 0.8748331998932479, + "grad_norm": 0.9443695541062251, + "learning_rate": 8.105318466276102e-07, + "loss": 1.5138, + "step": 6556 + }, + { + "epoch": 0.8749666399786495, + "grad_norm": 0.9529659601953869, + "learning_rate": 8.088282256901858e-07, + "loss": 1.5592, + "step": 6557 + }, + { + "epoch": 0.8751000800640513, + "grad_norm": 0.9345505185968372, + "learning_rate": 8.07126321559002e-07, + "loss": 1.5475, + "step": 6558 + }, + { + "epoch": 0.8752335201494529, + "grad_norm": 0.9949322978905206, + "learning_rate": 8.054261345519576e-07, + "loss": 1.5437, + "step": 6559 + }, + { + "epoch": 0.8753669602348545, + 
"grad_norm": 1.3528014835986673, + "learning_rate": 8.037276649866277e-07, + "loss": 1.5508, + "step": 6560 + }, + { + "epoch": 0.8755004003202562, + "grad_norm": 0.9126381930458377, + "learning_rate": 8.02030913180264e-07, + "loss": 1.5883, + "step": 6561 + }, + { + "epoch": 0.8756338404056578, + "grad_norm": 0.9059415684652883, + "learning_rate": 8.003358794498051e-07, + "loss": 1.4654, + "step": 6562 + }, + { + "epoch": 0.8757672804910596, + "grad_norm": 0.9317951935750919, + "learning_rate": 7.986425641118612e-07, + "loss": 1.526, + "step": 6563 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 1.2757830287361367, + "learning_rate": 7.969509674827258e-07, + "loss": 1.5593, + "step": 6564 + }, + { + "epoch": 0.8760341606618628, + "grad_norm": 0.9421033561329558, + "learning_rate": 7.952610898783675e-07, + "loss": 1.5101, + "step": 6565 + }, + { + "epoch": 0.8761676007472645, + "grad_norm": 0.9277438199353042, + "learning_rate": 7.935729316144381e-07, + "loss": 1.5095, + "step": 6566 + }, + { + "epoch": 0.8763010408326661, + "grad_norm": 0.9336579625689782, + "learning_rate": 7.918864930062675e-07, + "loss": 1.5145, + "step": 6567 + }, + { + "epoch": 0.8764344809180677, + "grad_norm": 0.9777046753987705, + "learning_rate": 7.902017743688606e-07, + "loss": 1.5592, + "step": 6568 + }, + { + "epoch": 0.8765679210034695, + "grad_norm": 0.9684594181447197, + "learning_rate": 7.885187760169033e-07, + "loss": 1.572, + "step": 6569 + }, + { + "epoch": 0.8767013610888711, + "grad_norm": 1.267214799587763, + "learning_rate": 7.868374982647642e-07, + "loss": 1.5555, + "step": 6570 + }, + { + "epoch": 0.8768348011742727, + "grad_norm": 0.9419267296602276, + "learning_rate": 7.851579414264843e-07, + "loss": 1.5705, + "step": 6571 + }, + { + "epoch": 0.8769682412596744, + "grad_norm": 0.9304158422245021, + "learning_rate": 7.834801058157837e-07, + "loss": 1.5284, + "step": 6572 + }, + { + "epoch": 0.877101681345076, + "grad_norm": 0.9745745483632863, + "learning_rate": 7.818039917460674e-07, + "loss": 1.512, + "step": 6573 + }, + { + "epoch": 0.8772351214304777, + "grad_norm": 0.9525590369369205, + "learning_rate": 7.801295995304125e-07, + "loss": 1.532, + "step": 6574 + }, + { + "epoch": 0.8773685615158794, + "grad_norm": 0.9198789574840573, + "learning_rate": 7.78456929481577e-07, + "loss": 1.516, + "step": 6575 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 0.901804075657504, + "learning_rate": 7.767859819119927e-07, + "loss": 1.5354, + "step": 6576 + }, + { + "epoch": 0.8776354416866827, + "grad_norm": 0.9437717919984645, + "learning_rate": 7.751167571337792e-07, + "loss": 1.5562, + "step": 6577 + }, + { + "epoch": 0.8777688817720843, + "grad_norm": 0.901156691623584, + "learning_rate": 7.734492554587269e-07, + "loss": 1.566, + "step": 6578 + }, + { + "epoch": 0.877902321857486, + "grad_norm": 0.9507122398403107, + "learning_rate": 7.717834771983046e-07, + "loss": 1.5809, + "step": 6579 + }, + { + "epoch": 0.8780357619428877, + "grad_norm": 0.9154819364254669, + "learning_rate": 7.70119422663661e-07, + "loss": 1.5289, + "step": 6580 + }, + { + "epoch": 0.8781692020282893, + "grad_norm": 0.9235692837642071, + "learning_rate": 7.684570921656231e-07, + "loss": 1.5468, + "step": 6581 + }, + { + "epoch": 0.8783026421136909, + "grad_norm": 0.9431590240253189, + "learning_rate": 7.667964860146959e-07, + "loss": 1.5715, + "step": 6582 + }, + { + "epoch": 0.8784360821990926, + "grad_norm": 0.9841453617835374, + "learning_rate": 7.651376045210612e-07, + "loss": 1.5703, + "step": 6583 + }, + { + 
"epoch": 0.8785695222844943, + "grad_norm": 0.9280133482016604, + "learning_rate": 7.63480447994579e-07, + "loss": 1.5325, + "step": 6584 + }, + { + "epoch": 0.8787029623698959, + "grad_norm": 0.9042299305841511, + "learning_rate": 7.618250167447871e-07, + "loss": 1.5151, + "step": 6585 + }, + { + "epoch": 0.8788364024552976, + "grad_norm": 0.9303954686816721, + "learning_rate": 7.601713110809006e-07, + "loss": 1.5533, + "step": 6586 + }, + { + "epoch": 0.8789698425406992, + "grad_norm": 0.9450693162515615, + "learning_rate": 7.585193313118155e-07, + "loss": 1.5242, + "step": 6587 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 0.9224963862036915, + "learning_rate": 7.568690777461008e-07, + "loss": 1.5231, + "step": 6588 + }, + { + "epoch": 0.8792367227115025, + "grad_norm": 1.051976485103453, + "learning_rate": 7.552205506920052e-07, + "loss": 1.5829, + "step": 6589 + }, + { + "epoch": 0.8793701627969042, + "grad_norm": 0.9433620420881246, + "learning_rate": 7.535737504574536e-07, + "loss": 1.4849, + "step": 6590 + }, + { + "epoch": 0.8795036028823059, + "grad_norm": 1.0769771830246422, + "learning_rate": 7.519286773500522e-07, + "loss": 1.5769, + "step": 6591 + }, + { + "epoch": 0.8796370429677075, + "grad_norm": 0.9338463460938362, + "learning_rate": 7.502853316770808e-07, + "loss": 1.5084, + "step": 6592 + }, + { + "epoch": 0.8797704830531091, + "grad_norm": 0.9513071114461021, + "learning_rate": 7.48643713745496e-07, + "loss": 1.4993, + "step": 6593 + }, + { + "epoch": 0.8799039231385108, + "grad_norm": 0.9736174380569763, + "learning_rate": 7.47003823861936e-07, + "loss": 1.5405, + "step": 6594 + }, + { + "epoch": 0.8800373632239125, + "grad_norm": 0.9381351170469457, + "learning_rate": 7.453656623327132e-07, + "loss": 1.5456, + "step": 6595 + }, + { + "epoch": 0.8801708033093141, + "grad_norm": 0.9345529815258716, + "learning_rate": 7.437292294638155e-07, + "loss": 1.509, + "step": 6596 + }, + { + "epoch": 0.8803042433947158, + "grad_norm": 0.9678042966662639, + "learning_rate": 7.420945255609102e-07, + "loss": 1.4996, + "step": 6597 + }, + { + "epoch": 0.8804376834801174, + "grad_norm": 0.9243133790343941, + "learning_rate": 7.404615509293444e-07, + "loss": 1.5459, + "step": 6598 + }, + { + "epoch": 0.880571123565519, + "grad_norm": 0.9421650715900654, + "learning_rate": 7.388303058741364e-07, + "loss": 1.5101, + "step": 6599 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 0.9521800237087055, + "learning_rate": 7.372007906999856e-07, + "loss": 1.56, + "step": 6600 + }, + { + "epoch": 0.8808380037363224, + "grad_norm": 0.9828304259855544, + "learning_rate": 7.355730057112643e-07, + "loss": 1.5758, + "step": 6601 + }, + { + "epoch": 0.8809714438217241, + "grad_norm": 0.9439889966484398, + "learning_rate": 7.339469512120268e-07, + "loss": 1.5744, + "step": 6602 + }, + { + "epoch": 0.8811048839071257, + "grad_norm": 0.9616630127654491, + "learning_rate": 7.323226275060014e-07, + "loss": 1.5572, + "step": 6603 + }, + { + "epoch": 0.8812383239925273, + "grad_norm": 0.9218601721204037, + "learning_rate": 7.307000348965909e-07, + "loss": 1.4996, + "step": 6604 + }, + { + "epoch": 0.881371764077929, + "grad_norm": 1.0701166098993966, + "learning_rate": 7.290791736868819e-07, + "loss": 1.5297, + "step": 6605 + }, + { + "epoch": 0.8815052041633307, + "grad_norm": 0.9708535489021737, + "learning_rate": 7.274600441796287e-07, + "loss": 1.519, + "step": 6606 + }, + { + "epoch": 0.8816386442487323, + "grad_norm": 0.9728154176736694, + "learning_rate": 7.258426466772672e-07, + 
"loss": 1.5832, + "step": 6607 + }, + { + "epoch": 0.881772084334134, + "grad_norm": 0.9383164976637655, + "learning_rate": 7.242269814819081e-07, + "loss": 1.5461, + "step": 6608 + }, + { + "epoch": 0.8819055244195356, + "grad_norm": 0.9213707626663843, + "learning_rate": 7.226130488953409e-07, + "loss": 1.5343, + "step": 6609 + }, + { + "epoch": 0.8820389645049372, + "grad_norm": 0.9620469927915996, + "learning_rate": 7.210008492190301e-07, + "loss": 1.5522, + "step": 6610 + }, + { + "epoch": 0.882172404590339, + "grad_norm": 0.9341015762591438, + "learning_rate": 7.193903827541127e-07, + "loss": 1.4715, + "step": 6611 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 0.9956857458374051, + "learning_rate": 7.177816498014101e-07, + "loss": 1.5903, + "step": 6612 + }, + { + "epoch": 0.8824392847611422, + "grad_norm": 0.9857230476852422, + "learning_rate": 7.161746506614142e-07, + "loss": 1.5852, + "step": 6613 + }, + { + "epoch": 0.8825727248465439, + "grad_norm": 0.9343322357829968, + "learning_rate": 7.145693856342928e-07, + "loss": 1.5051, + "step": 6614 + }, + { + "epoch": 0.8827061649319455, + "grad_norm": 0.9231910471570226, + "learning_rate": 7.129658550198892e-07, + "loss": 1.5404, + "step": 6615 + }, + { + "epoch": 0.8828396050173473, + "grad_norm": 0.9422235323401871, + "learning_rate": 7.113640591177296e-07, + "loss": 1.5096, + "step": 6616 + }, + { + "epoch": 0.8829730451027489, + "grad_norm": 0.9268050292928415, + "learning_rate": 7.097639982270077e-07, + "loss": 1.5613, + "step": 6617 + }, + { + "epoch": 0.8831064851881505, + "grad_norm": 1.0117584913067665, + "learning_rate": 7.081656726465968e-07, + "loss": 1.5219, + "step": 6618 + }, + { + "epoch": 0.8832399252735522, + "grad_norm": 1.0082915428918369, + "learning_rate": 7.065690826750482e-07, + "loss": 1.564, + "step": 6619 + }, + { + "epoch": 0.8833733653589538, + "grad_norm": 0.9226919224076718, + "learning_rate": 7.049742286105843e-07, + "loss": 1.4563, + "step": 6620 + }, + { + "epoch": 0.8835068054443554, + "grad_norm": 0.9235669483501304, + "learning_rate": 7.033811107511079e-07, + "loss": 1.5549, + "step": 6621 + }, + { + "epoch": 0.8836402455297572, + "grad_norm": 0.9675042520116867, + "learning_rate": 7.017897293941934e-07, + "loss": 1.5154, + "step": 6622 + }, + { + "epoch": 0.8837736856151588, + "grad_norm": 0.9532787577491273, + "learning_rate": 7.002000848370938e-07, + "loss": 1.5967, + "step": 6623 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 0.9172226848899977, + "learning_rate": 6.98612177376734e-07, + "loss": 1.5259, + "step": 6624 + }, + { + "epoch": 0.8840405657859621, + "grad_norm": 0.9383987828753451, + "learning_rate": 6.970260073097213e-07, + "loss": 1.5119, + "step": 6625 + }, + { + "epoch": 0.8841740058713637, + "grad_norm": 1.2105279069704957, + "learning_rate": 6.954415749323318e-07, + "loss": 1.5682, + "step": 6626 + }, + { + "epoch": 0.8843074459567654, + "grad_norm": 0.9799759626641332, + "learning_rate": 6.9385888054052e-07, + "loss": 1.631, + "step": 6627 + }, + { + "epoch": 0.8844408860421671, + "grad_norm": 0.9054994550461375, + "learning_rate": 6.92277924429915e-07, + "loss": 1.4996, + "step": 6628 + }, + { + "epoch": 0.8845743261275687, + "grad_norm": 0.9271188131214871, + "learning_rate": 6.906987068958193e-07, + "loss": 1.6082, + "step": 6629 + }, + { + "epoch": 0.8847077662129704, + "grad_norm": 0.9259197742858476, + "learning_rate": 6.89121228233216e-07, + "loss": 1.5372, + "step": 6630 + }, + { + "epoch": 0.884841206298372, + "grad_norm": 0.9170649924609936, + 
"learning_rate": 6.875454887367605e-07, + "loss": 1.5452, + "step": 6631 + }, + { + "epoch": 0.8849746463837737, + "grad_norm": 1.2210390039887022, + "learning_rate": 6.859714887007796e-07, + "loss": 1.5014, + "step": 6632 + }, + { + "epoch": 0.8851080864691754, + "grad_norm": 0.9534407665379254, + "learning_rate": 6.843992284192802e-07, + "loss": 1.5863, + "step": 6633 + }, + { + "epoch": 0.885241526554577, + "grad_norm": 0.961514819564435, + "learning_rate": 6.82828708185943e-07, + "loss": 1.5202, + "step": 6634 + }, + { + "epoch": 0.8853749666399786, + "grad_norm": 0.9797343158237848, + "learning_rate": 6.812599282941246e-07, + "loss": 1.5998, + "step": 6635 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 0.9254630062159546, + "learning_rate": 6.796928890368515e-07, + "loss": 1.5759, + "step": 6636 + }, + { + "epoch": 0.885641846810782, + "grad_norm": 0.921610927618888, + "learning_rate": 6.781275907068329e-07, + "loss": 1.5506, + "step": 6637 + }, + { + "epoch": 0.8857752868961836, + "grad_norm": 1.1965720258668404, + "learning_rate": 6.765640335964463e-07, + "loss": 1.4921, + "step": 6638 + }, + { + "epoch": 0.8859087269815853, + "grad_norm": 0.9199991742942989, + "learning_rate": 6.750022179977467e-07, + "loss": 1.5276, + "step": 6639 + }, + { + "epoch": 0.8860421670669869, + "grad_norm": 0.9414233305947609, + "learning_rate": 6.73442144202463e-07, + "loss": 1.5265, + "step": 6640 + }, + { + "epoch": 0.8861756071523885, + "grad_norm": 0.9303782628792104, + "learning_rate": 6.718838125020011e-07, + "loss": 1.4829, + "step": 6641 + }, + { + "epoch": 0.8863090472377902, + "grad_norm": 1.244221234348905, + "learning_rate": 6.703272231874391e-07, + "loss": 1.5792, + "step": 6642 + }, + { + "epoch": 0.8864424873231919, + "grad_norm": 0.9051064947661962, + "learning_rate": 6.687723765495268e-07, + "loss": 1.558, + "step": 6643 + }, + { + "epoch": 0.8865759274085936, + "grad_norm": 1.0602549890874733, + "learning_rate": 6.672192728786964e-07, + "loss": 1.5136, + "step": 6644 + }, + { + "epoch": 0.8867093674939952, + "grad_norm": 1.015376492992175, + "learning_rate": 6.656679124650489e-07, + "loss": 1.526, + "step": 6645 + }, + { + "epoch": 0.8868428075793968, + "grad_norm": 0.9763385623831438, + "learning_rate": 6.641182955983594e-07, + "loss": 1.5627, + "step": 6646 + }, + { + "epoch": 0.8869762476647985, + "grad_norm": 0.9484361059315187, + "learning_rate": 6.625704225680773e-07, + "loss": 1.5681, + "step": 6647 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 1.1736786559443448, + "learning_rate": 6.610242936633315e-07, + "loss": 1.52, + "step": 6648 + }, + { + "epoch": 0.8872431278356018, + "grad_norm": 1.0652253170073327, + "learning_rate": 6.594799091729198e-07, + "loss": 1.5487, + "step": 6649 + }, + { + "epoch": 0.8873765679210035, + "grad_norm": 0.9130108805332712, + "learning_rate": 6.579372693853137e-07, + "loss": 1.5838, + "step": 6650 + }, + { + "epoch": 0.8875100080064051, + "grad_norm": 0.9461766707449843, + "learning_rate": 6.563963745886637e-07, + "loss": 1.5557, + "step": 6651 + }, + { + "epoch": 0.8876434480918067, + "grad_norm": 0.9345577948179379, + "learning_rate": 6.548572250707896e-07, + "loss": 1.5397, + "step": 6652 + }, + { + "epoch": 0.8877768881772085, + "grad_norm": 0.9306784187277369, + "learning_rate": 6.533198211191871e-07, + "loss": 1.5641, + "step": 6653 + }, + { + "epoch": 0.8879103282626101, + "grad_norm": 0.9904047827430623, + "learning_rate": 6.517841630210254e-07, + "loss": 1.5339, + "step": 6654 + }, + { + "epoch": 0.8880437683480118, + 
"grad_norm": 0.9847791868831337, + "learning_rate": 6.502502510631503e-07, + "loss": 1.5608, + "step": 6655 + }, + { + "epoch": 0.8881772084334134, + "grad_norm": 1.2239512456758879, + "learning_rate": 6.487180855320762e-07, + "loss": 1.5556, + "step": 6656 + }, + { + "epoch": 0.888310648518815, + "grad_norm": 0.9473098956072885, + "learning_rate": 6.471876667139954e-07, + "loss": 1.5671, + "step": 6657 + }, + { + "epoch": 0.8884440886042168, + "grad_norm": 0.9150781295209052, + "learning_rate": 6.456589948947733e-07, + "loss": 1.4855, + "step": 6658 + }, + { + "epoch": 0.8885775286896184, + "grad_norm": 0.9262691372999762, + "learning_rate": 6.441320703599474e-07, + "loss": 1.5296, + "step": 6659 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 1.2521772098087953, + "learning_rate": 6.426068933947305e-07, + "loss": 1.5772, + "step": 6660 + }, + { + "epoch": 0.8888444088604217, + "grad_norm": 0.9718277083295738, + "learning_rate": 6.410834642840069e-07, + "loss": 1.5483, + "step": 6661 + }, + { + "epoch": 0.8889778489458233, + "grad_norm": 0.9249506579829485, + "learning_rate": 6.395617833123379e-07, + "loss": 1.5262, + "step": 6662 + }, + { + "epoch": 0.8891112890312249, + "grad_norm": 0.991283938492636, + "learning_rate": 6.380418507639563e-07, + "loss": 1.5752, + "step": 6663 + }, + { + "epoch": 0.8892447291166267, + "grad_norm": 1.145717875907442, + "learning_rate": 6.365236669227659e-07, + "loss": 1.5203, + "step": 6664 + }, + { + "epoch": 0.8893781692020283, + "grad_norm": 0.9459042028552311, + "learning_rate": 6.350072320723477e-07, + "loss": 1.5782, + "step": 6665 + }, + { + "epoch": 0.8895116092874299, + "grad_norm": 0.951409036273284, + "learning_rate": 6.33492546495954e-07, + "loss": 1.5228, + "step": 6666 + }, + { + "epoch": 0.8896450493728316, + "grad_norm": 0.9376247682782157, + "learning_rate": 6.319796104765097e-07, + "loss": 1.5482, + "step": 6667 + }, + { + "epoch": 0.8897784894582332, + "grad_norm": 0.9343467840146905, + "learning_rate": 6.304684242966164e-07, + "loss": 1.5819, + "step": 6668 + }, + { + "epoch": 0.889911929543635, + "grad_norm": 0.9677052840154801, + "learning_rate": 6.289589882385461e-07, + "loss": 1.5253, + "step": 6669 + }, + { + "epoch": 0.8900453696290366, + "grad_norm": 0.9467119490943833, + "learning_rate": 6.274513025842421e-07, + "loss": 1.5582, + "step": 6670 + }, + { + "epoch": 0.8901788097144382, + "grad_norm": 0.9311722512072491, + "learning_rate": 6.25945367615326e-07, + "loss": 1.5084, + "step": 6671 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 1.1535295894188413, + "learning_rate": 6.244411836130848e-07, + "loss": 1.5288, + "step": 6672 + }, + { + "epoch": 0.8904456898852415, + "grad_norm": 0.9398613520690328, + "learning_rate": 6.229387508584872e-07, + "loss": 1.541, + "step": 6673 + }, + { + "epoch": 0.8905791299706431, + "grad_norm": 0.9460664090823528, + "learning_rate": 6.214380696321698e-07, + "loss": 1.5259, + "step": 6674 + }, + { + "epoch": 0.8907125700560449, + "grad_norm": 0.9192830875852988, + "learning_rate": 6.199391402144406e-07, + "loss": 1.5079, + "step": 6675 + }, + { + "epoch": 0.8908460101414465, + "grad_norm": 0.9624182903854707, + "learning_rate": 6.184419628852845e-07, + "loss": 1.514, + "step": 6676 + }, + { + "epoch": 0.8909794502268481, + "grad_norm": 0.9567443782327006, + "learning_rate": 6.169465379243578e-07, + "loss": 1.5349, + "step": 6677 + }, + { + "epoch": 0.8911128903122498, + "grad_norm": 1.173850544587069, + "learning_rate": 6.154528656109871e-07, + "loss": 1.5518, + "step": 6678 + }, + { 
+ "epoch": 0.8912463303976514, + "grad_norm": 0.9397559763327206, + "learning_rate": 6.139609462241724e-07, + "loss": 1.524, + "step": 6679 + }, + { + "epoch": 0.8913797704830531, + "grad_norm": 0.9739117546526423, + "learning_rate": 6.124707800425911e-07, + "loss": 1.5663, + "step": 6680 + }, + { + "epoch": 0.8915132105684548, + "grad_norm": 0.9526569594291362, + "learning_rate": 6.109823673445869e-07, + "loss": 1.5959, + "step": 6681 + }, + { + "epoch": 0.8916466506538564, + "grad_norm": 1.2208803452356254, + "learning_rate": 6.094957084081765e-07, + "loss": 1.527, + "step": 6682 + }, + { + "epoch": 0.8917800907392581, + "grad_norm": 0.9533030946813461, + "learning_rate": 6.080108035110543e-07, + "loss": 1.5158, + "step": 6683 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 0.9373914483177651, + "learning_rate": 6.06527652930583e-07, + "loss": 1.5035, + "step": 6684 + }, + { + "epoch": 0.8920469709100614, + "grad_norm": 0.9206005814229758, + "learning_rate": 6.050462569437965e-07, + "loss": 1.5194, + "step": 6685 + }, + { + "epoch": 0.8921804109954631, + "grad_norm": 0.9411608802351985, + "learning_rate": 6.035666158274034e-07, + "loss": 1.5614, + "step": 6686 + }, + { + "epoch": 0.8923138510808647, + "grad_norm": 0.9549233585076925, + "learning_rate": 6.020887298577838e-07, + "loss": 1.524, + "step": 6687 + }, + { + "epoch": 0.8924472911662663, + "grad_norm": 1.198668371629655, + "learning_rate": 6.006125993109913e-07, + "loss": 1.5915, + "step": 6688 + }, + { + "epoch": 0.892580731251668, + "grad_norm": 0.9623095343289453, + "learning_rate": 5.991382244627475e-07, + "loss": 1.51, + "step": 6689 + }, + { + "epoch": 0.8927141713370697, + "grad_norm": 0.9678788722229147, + "learning_rate": 5.976656055884522e-07, + "loss": 1.5247, + "step": 6690 + }, + { + "epoch": 0.8928476114224713, + "grad_norm": 1.0279035033137902, + "learning_rate": 5.961947429631721e-07, + "loss": 1.5636, + "step": 6691 + }, + { + "epoch": 0.892981051507873, + "grad_norm": 0.9304167484501092, + "learning_rate": 5.947256368616483e-07, + "loss": 1.5038, + "step": 6692 + }, + { + "epoch": 0.8931144915932746, + "grad_norm": 0.9374804062894364, + "learning_rate": 5.932582875582904e-07, + "loss": 1.5582, + "step": 6693 + }, + { + "epoch": 0.8932479316786762, + "grad_norm": 0.9034683624937042, + "learning_rate": 5.917926953271857e-07, + "loss": 1.5244, + "step": 6694 + }, + { + "epoch": 0.893381371764078, + "grad_norm": 1.0506006939018857, + "learning_rate": 5.903288604420887e-07, + "loss": 1.5336, + "step": 6695 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 0.9685577008965324, + "learning_rate": 5.888667831764283e-07, + "loss": 1.5201, + "step": 6696 + }, + { + "epoch": 0.8936482519348813, + "grad_norm": 0.9546554805063521, + "learning_rate": 5.874064638033017e-07, + "loss": 1.5315, + "step": 6697 + }, + { + "epoch": 0.8937816920202829, + "grad_norm": 0.9383844169484069, + "learning_rate": 5.859479025954817e-07, + "loss": 1.5399, + "step": 6698 + }, + { + "epoch": 0.8939151321056845, + "grad_norm": 1.0001219462580064, + "learning_rate": 5.844910998254117e-07, + "loss": 1.534, + "step": 6699 + }, + { + "epoch": 0.8940485721910862, + "grad_norm": 1.0514709520939773, + "learning_rate": 5.830360557652026e-07, + "loss": 1.5436, + "step": 6700 + }, + { + "epoch": 0.8941820122764879, + "grad_norm": 0.9603464211449854, + "learning_rate": 5.815827706866439e-07, + "loss": 1.6296, + "step": 6701 + }, + { + "epoch": 0.8943154523618895, + "grad_norm": 0.9699909980766925, + "learning_rate": 5.801312448611907e-07, + 
"loss": 1.5712, + "step": 6702 + }, + { + "epoch": 0.8944488924472912, + "grad_norm": 0.9081518549220741, + "learning_rate": 5.786814785599715e-07, + "loss": 1.5363, + "step": 6703 + }, + { + "epoch": 0.8945823325326928, + "grad_norm": 0.9477859920909177, + "learning_rate": 5.772334720537854e-07, + "loss": 1.5438, + "step": 6704 + }, + { + "epoch": 0.8947157726180944, + "grad_norm": 0.9467954261018504, + "learning_rate": 5.757872256131048e-07, + "loss": 1.5359, + "step": 6705 + }, + { + "epoch": 0.8948492127034962, + "grad_norm": 1.0635463978582553, + "learning_rate": 5.743427395080736e-07, + "loss": 1.5161, + "step": 6706 + }, + { + "epoch": 0.8949826527888978, + "grad_norm": 0.9139015356788128, + "learning_rate": 5.729000140085017e-07, + "loss": 1.5263, + "step": 6707 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 1.2623806658299492, + "learning_rate": 5.714590493838768e-07, + "loss": 1.5617, + "step": 6708 + }, + { + "epoch": 0.8952495329597011, + "grad_norm": 1.0270566802124301, + "learning_rate": 5.700198459033535e-07, + "loss": 1.5226, + "step": 6709 + }, + { + "epoch": 0.8953829730451027, + "grad_norm": 0.9557701098327668, + "learning_rate": 5.685824038357568e-07, + "loss": 1.554, + "step": 6710 + }, + { + "epoch": 0.8955164131305045, + "grad_norm": 0.8951591320979515, + "learning_rate": 5.671467234495875e-07, + "loss": 1.5101, + "step": 6711 + }, + { + "epoch": 0.8956498532159061, + "grad_norm": 0.9290933211646522, + "learning_rate": 5.657128050130134e-07, + "loss": 1.5454, + "step": 6712 + }, + { + "epoch": 0.8957832933013077, + "grad_norm": 0.935571799488784, + "learning_rate": 5.642806487938746e-07, + "loss": 1.6001, + "step": 6713 + }, + { + "epoch": 0.8959167333867094, + "grad_norm": 0.967264821795858, + "learning_rate": 5.628502550596781e-07, + "loss": 1.4861, + "step": 6714 + }, + { + "epoch": 0.896050173472111, + "grad_norm": 0.9487698266935534, + "learning_rate": 5.614216240776105e-07, + "loss": 1.5693, + "step": 6715 + }, + { + "epoch": 0.8961836135575126, + "grad_norm": 1.1670691771030441, + "learning_rate": 5.599947561145214e-07, + "loss": 1.4934, + "step": 6716 + }, + { + "epoch": 0.8963170536429144, + "grad_norm": 0.960589734890621, + "learning_rate": 5.585696514369321e-07, + "loss": 1.5477, + "step": 6717 + }, + { + "epoch": 0.896450493728316, + "grad_norm": 0.935167699678419, + "learning_rate": 5.571463103110375e-07, + "loss": 1.4821, + "step": 6718 + }, + { + "epoch": 0.8965839338137176, + "grad_norm": 0.9947979187682457, + "learning_rate": 5.557247330027016e-07, + "loss": 1.5684, + "step": 6719 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 0.9593305751018268, + "learning_rate": 5.543049197774586e-07, + "loss": 1.5996, + "step": 6720 + }, + { + "epoch": 0.8968508139845209, + "grad_norm": 0.9375856914820678, + "learning_rate": 5.52886870900512e-07, + "loss": 1.5154, + "step": 6721 + }, + { + "epoch": 0.8969842540699227, + "grad_norm": 1.2862051330318942, + "learning_rate": 5.514705866367387e-07, + "loss": 1.5196, + "step": 6722 + }, + { + "epoch": 0.8971176941553243, + "grad_norm": 0.9340659004674106, + "learning_rate": 5.500560672506861e-07, + "loss": 1.4784, + "step": 6723 + }, + { + "epoch": 0.8972511342407259, + "grad_norm": 0.945137802994412, + "learning_rate": 5.486433130065672e-07, + "loss": 1.5399, + "step": 6724 + }, + { + "epoch": 0.8973845743261276, + "grad_norm": 1.0209524963685872, + "learning_rate": 5.472323241682687e-07, + "loss": 1.5503, + "step": 6725 + }, + { + "epoch": 0.8975180144115292, + "grad_norm": 0.927685929338529, + 
"learning_rate": 5.458231009993498e-07, + "loss": 1.5844, + "step": 6726 + }, + { + "epoch": 0.8976514544969308, + "grad_norm": 0.9428359886770269, + "learning_rate": 5.444156437630365e-07, + "loss": 1.5389, + "step": 6727 + }, + { + "epoch": 0.8977848945823326, + "grad_norm": 0.9334863494390306, + "learning_rate": 5.430099527222244e-07, + "loss": 1.5162, + "step": 6728 + }, + { + "epoch": 0.8979183346677342, + "grad_norm": 0.9436338050010254, + "learning_rate": 5.416060281394797e-07, + "loss": 1.5382, + "step": 6729 + }, + { + "epoch": 0.8980517747531358, + "grad_norm": 0.9365789196387994, + "learning_rate": 5.402038702770418e-07, + "loss": 1.5249, + "step": 6730 + }, + { + "epoch": 0.8981852148385375, + "grad_norm": 0.9221528407309991, + "learning_rate": 5.388034793968189e-07, + "loss": 1.5386, + "step": 6731 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 0.9515618324069464, + "learning_rate": 5.374048557603828e-07, + "loss": 1.5362, + "step": 6732 + }, + { + "epoch": 0.8984520950093408, + "grad_norm": 0.9944556911301887, + "learning_rate": 5.360079996289869e-07, + "loss": 1.5167, + "step": 6733 + }, + { + "epoch": 0.8985855350947425, + "grad_norm": 0.97850772806653, + "learning_rate": 5.346129112635445e-07, + "loss": 1.5102, + "step": 6734 + }, + { + "epoch": 0.8987189751801441, + "grad_norm": 0.9540868569995883, + "learning_rate": 5.332195909246429e-07, + "loss": 1.5752, + "step": 6735 + }, + { + "epoch": 0.8988524152655458, + "grad_norm": 0.9221941861156301, + "learning_rate": 5.318280388725372e-07, + "loss": 1.5118, + "step": 6736 + }, + { + "epoch": 0.8989858553509474, + "grad_norm": 1.0270466084981307, + "learning_rate": 5.304382553671561e-07, + "loss": 1.5436, + "step": 6737 + }, + { + "epoch": 0.899119295436349, + "grad_norm": 0.9203067458998816, + "learning_rate": 5.290502406680931e-07, + "loss": 1.5437, + "step": 6738 + }, + { + "epoch": 0.8992527355217508, + "grad_norm": 0.9501536472353344, + "learning_rate": 5.276639950346129e-07, + "loss": 1.5075, + "step": 6739 + }, + { + "epoch": 0.8993861756071524, + "grad_norm": 0.9334210192752896, + "learning_rate": 5.262795187256542e-07, + "loss": 1.5422, + "step": 6740 + }, + { + "epoch": 0.899519615692554, + "grad_norm": 0.9567078114398553, + "learning_rate": 5.248968119998188e-07, + "loss": 1.5471, + "step": 6741 + }, + { + "epoch": 0.8996530557779557, + "grad_norm": 0.9282038591345071, + "learning_rate": 5.235158751153801e-07, + "loss": 1.4623, + "step": 6742 + }, + { + "epoch": 0.8997864958633573, + "grad_norm": 0.9384691548104735, + "learning_rate": 5.22136708330282e-07, + "loss": 1.5249, + "step": 6743 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 1.0717530433261342, + "learning_rate": 5.20759311902137e-07, + "loss": 1.5446, + "step": 6744 + }, + { + "epoch": 0.9000533760341607, + "grad_norm": 0.949517161915402, + "learning_rate": 5.193836860882284e-07, + "loss": 1.5237, + "step": 6745 + }, + { + "epoch": 0.9001868161195623, + "grad_norm": 0.9209721246022604, + "learning_rate": 5.180098311455051e-07, + "loss": 1.4865, + "step": 6746 + }, + { + "epoch": 0.9003202562049639, + "grad_norm": 1.0530255691032593, + "learning_rate": 5.166377473305894e-07, + "loss": 1.5196, + "step": 6747 + }, + { + "epoch": 0.9004536962903656, + "grad_norm": 0.9330131313465537, + "learning_rate": 5.15267434899771e-07, + "loss": 1.5512, + "step": 6748 + }, + { + "epoch": 0.9005871363757673, + "grad_norm": 0.9202256141014292, + "learning_rate": 5.138988941090084e-07, + "loss": 1.5196, + "step": 6749 + }, + { + "epoch": 0.900720576461169, + 
"grad_norm": 0.9650715822764238, + "learning_rate": 5.125321252139282e-07, + "loss": 1.5177, + "step": 6750 + }, + { + "epoch": 0.9008540165465706, + "grad_norm": 0.9433756615509346, + "learning_rate": 5.111671284698283e-07, + "loss": 1.5432, + "step": 6751 + }, + { + "epoch": 0.9009874566319722, + "grad_norm": 0.9533022305140074, + "learning_rate": 5.098039041316738e-07, + "loss": 1.5537, + "step": 6752 + }, + { + "epoch": 0.9011208967173739, + "grad_norm": 0.9346416532826066, + "learning_rate": 5.084424524540999e-07, + "loss": 1.5495, + "step": 6753 + }, + { + "epoch": 0.9012543368027756, + "grad_norm": 1.1020333932337052, + "learning_rate": 5.070827736914119e-07, + "loss": 1.5476, + "step": 6754 + }, + { + "epoch": 0.9013877768881772, + "grad_norm": 0.9410283837732905, + "learning_rate": 5.057248680975802e-07, + "loss": 1.5037, + "step": 6755 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 1.0772876014109762, + "learning_rate": 5.043687359262472e-07, + "loss": 1.5047, + "step": 6756 + }, + { + "epoch": 0.9016546570589805, + "grad_norm": 0.9439200358382797, + "learning_rate": 5.030143774307205e-07, + "loss": 1.5584, + "step": 6757 + }, + { + "epoch": 0.9017880971443821, + "grad_norm": 1.0161041213410495, + "learning_rate": 5.016617928639822e-07, + "loss": 1.5249, + "step": 6758 + }, + { + "epoch": 0.9019215372297839, + "grad_norm": 0.9412328793676751, + "learning_rate": 5.003109824786789e-07, + "loss": 1.5376, + "step": 6759 + }, + { + "epoch": 0.9020549773151855, + "grad_norm": 0.9639374913924775, + "learning_rate": 4.989619465271245e-07, + "loss": 1.5075, + "step": 6760 + }, + { + "epoch": 0.9021884174005871, + "grad_norm": 0.9124862517814768, + "learning_rate": 4.976146852613062e-07, + "loss": 1.5236, + "step": 6761 + }, + { + "epoch": 0.9023218574859888, + "grad_norm": 0.9328183814360856, + "learning_rate": 4.96269198932876e-07, + "loss": 1.5257, + "step": 6762 + }, + { + "epoch": 0.9024552975713904, + "grad_norm": 0.9480856180023016, + "learning_rate": 4.949254877931564e-07, + "loss": 1.6097, + "step": 6763 + }, + { + "epoch": 0.9025887376567922, + "grad_norm": 0.9452684747563659, + "learning_rate": 4.935835520931342e-07, + "loss": 1.5583, + "step": 6764 + }, + { + "epoch": 0.9027221777421938, + "grad_norm": 0.9435189294655035, + "learning_rate": 4.922433920834713e-07, + "loss": 1.5576, + "step": 6765 + }, + { + "epoch": 0.9028556178275954, + "grad_norm": 0.9100774226094155, + "learning_rate": 4.909050080144928e-07, + "loss": 1.5061, + "step": 6766 + }, + { + "epoch": 0.9029890579129971, + "grad_norm": 0.9188166446494538, + "learning_rate": 4.895684001361933e-07, + "loss": 1.525, + "step": 6767 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 0.9513533676277205, + "learning_rate": 4.882335686982354e-07, + "loss": 1.5194, + "step": 6768 + }, + { + "epoch": 0.9032559380838003, + "grad_norm": 0.9901332701917303, + "learning_rate": 4.869005139499528e-07, + "loss": 1.5135, + "step": 6769 + }, + { + "epoch": 0.9033893781692021, + "grad_norm": 0.9252469459313303, + "learning_rate": 4.85569236140343e-07, + "loss": 1.5236, + "step": 6770 + }, + { + "epoch": 0.9035228182546037, + "grad_norm": 1.0114758784078768, + "learning_rate": 4.842397355180728e-07, + "loss": 1.5166, + "step": 6771 + }, + { + "epoch": 0.9036562583400053, + "grad_norm": 0.972194230662802, + "learning_rate": 4.829120123314801e-07, + "loss": 1.5787, + "step": 6772 + }, + { + "epoch": 0.903789698425407, + "grad_norm": 0.917284388104258, + "learning_rate": 4.815860668285688e-07, + "loss": 1.5353, + "step": 6773 + 
}, + { + "epoch": 0.9039231385108086, + "grad_norm": 0.9549812627537546, + "learning_rate": 4.802618992570074e-07, + "loss": 1.545, + "step": 6774 + }, + { + "epoch": 0.9040565785962102, + "grad_norm": 1.052982946218394, + "learning_rate": 4.789395098641359e-07, + "loss": 1.5932, + "step": 6775 + }, + { + "epoch": 0.904190018681612, + "grad_norm": 1.1732404735888684, + "learning_rate": 4.776188988969643e-07, + "loss": 1.4899, + "step": 6776 + }, + { + "epoch": 0.9043234587670136, + "grad_norm": 1.0066806501063883, + "learning_rate": 4.7630006660216665e-07, + "loss": 1.5585, + "step": 6777 + }, + { + "epoch": 0.9044568988524153, + "grad_norm": 0.9579935580737865, + "learning_rate": 4.749830132260824e-07, + "loss": 1.5712, + "step": 6778 + }, + { + "epoch": 0.9045903389378169, + "grad_norm": 0.9169006378445536, + "learning_rate": 4.736677390147271e-07, + "loss": 1.5371, + "step": 6779 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 0.9413950267463799, + "learning_rate": 4.723542442137774e-07, + "loss": 1.5586, + "step": 6780 + }, + { + "epoch": 0.9048572191086203, + "grad_norm": 0.9375274450226259, + "learning_rate": 4.710425290685772e-07, + "loss": 1.526, + "step": 6781 + }, + { + "epoch": 0.9049906591940219, + "grad_norm": 0.920951060703472, + "learning_rate": 4.697325938241404e-07, + "loss": 1.5496, + "step": 6782 + }, + { + "epoch": 0.9051240992794235, + "grad_norm": 0.9705124398869889, + "learning_rate": 4.684244387251513e-07, + "loss": 1.552, + "step": 6783 + }, + { + "epoch": 0.9052575393648252, + "grad_norm": 0.9307149713548113, + "learning_rate": 4.671180640159545e-07, + "loss": 1.5031, + "step": 6784 + }, + { + "epoch": 0.9053909794502268, + "grad_norm": 1.0044776590484505, + "learning_rate": 4.65813469940567e-07, + "loss": 1.5668, + "step": 6785 + }, + { + "epoch": 0.9055244195356285, + "grad_norm": 0.8995467573403738, + "learning_rate": 4.645106567426738e-07, + "loss": 1.4998, + "step": 6786 + }, + { + "epoch": 0.9056578596210302, + "grad_norm": 0.9441419162843016, + "learning_rate": 4.632096246656237e-07, + "loss": 1.5544, + "step": 6787 + }, + { + "epoch": 0.9057912997064318, + "grad_norm": 1.37214339045131, + "learning_rate": 4.619103739524355e-07, + "loss": 1.5009, + "step": 6788 + }, + { + "epoch": 0.9059247397918335, + "grad_norm": 0.9569798512304817, + "learning_rate": 4.6061290484579304e-07, + "loss": 1.5217, + "step": 6789 + }, + { + "epoch": 0.9060581798772351, + "grad_norm": 0.9238396452899695, + "learning_rate": 4.5931721758805117e-07, + "loss": 1.5632, + "step": 6790 + }, + { + "epoch": 0.9061916199626368, + "grad_norm": 0.9533994593848583, + "learning_rate": 4.5802331242122855e-07, + "loss": 1.559, + "step": 6791 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 1.0268411283902035, + "learning_rate": 4.567311895870108e-07, + "loss": 1.517, + "step": 6792 + }, + { + "epoch": 0.9064585001334401, + "grad_norm": 1.0100719989531157, + "learning_rate": 4.554408493267537e-07, + "loss": 1.5145, + "step": 6793 + }, + { + "epoch": 0.9065919402188417, + "grad_norm": 1.1648731959473748, + "learning_rate": 4.5415229188147667e-07, + "loss": 1.5143, + "step": 6794 + }, + { + "epoch": 0.9067253803042434, + "grad_norm": 0.92959204449064, + "learning_rate": 4.5286551749186726e-07, + "loss": 1.5744, + "step": 6795 + }, + { + "epoch": 0.906858820389645, + "grad_norm": 0.9435972338142883, + "learning_rate": 4.5158052639828085e-07, + "loss": 1.5309, + "step": 6796 + }, + { + "epoch": 0.9069922604750467, + "grad_norm": 1.0579708377998769, + "learning_rate": 
4.5029731884074004e-07, + "loss": 1.5452, + "step": 6797 + }, + { + "epoch": 0.9071257005604484, + "grad_norm": 1.6099878391111002, + "learning_rate": 4.4901589505893297e-07, + "loss": 1.535, + "step": 6798 + }, + { + "epoch": 0.90725914064585, + "grad_norm": 0.9431159488281261, + "learning_rate": 4.4773625529221486e-07, + "loss": 1.5271, + "step": 6799 + }, + { + "epoch": 0.9073925807312516, + "grad_norm": 0.9302835803444568, + "learning_rate": 4.464583997796057e-07, + "loss": 1.5343, + "step": 6800 + }, + { + "epoch": 0.9075260208166533, + "grad_norm": 0.9727837080368901, + "learning_rate": 4.451823287597978e-07, + "loss": 1.5523, + "step": 6801 + }, + { + "epoch": 0.907659460902055, + "grad_norm": 0.9311133151109089, + "learning_rate": 4.4390804247114503e-07, + "loss": 1.5352, + "step": 6802 + }, + { + "epoch": 0.9077929009874567, + "grad_norm": 0.9834944210354547, + "learning_rate": 4.4263554115166805e-07, + "loss": 1.5093, + "step": 6803 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 0.924803540849428, + "learning_rate": 4.413648250390601e-07, + "loss": 1.5702, + "step": 6804 + }, + { + "epoch": 0.9080597811582599, + "grad_norm": 1.0516475698411472, + "learning_rate": 4.400958943706724e-07, + "loss": 1.566, + "step": 6805 + }, + { + "epoch": 0.9081932212436616, + "grad_norm": 1.0934743180424145, + "learning_rate": 4.388287493835286e-07, + "loss": 1.5735, + "step": 6806 + }, + { + "epoch": 0.9083266613290633, + "grad_norm": 0.9552392103374737, + "learning_rate": 4.3756339031431394e-07, + "loss": 1.563, + "step": 6807 + }, + { + "epoch": 0.9084601014144649, + "grad_norm": 0.9653006612408468, + "learning_rate": 4.362998173993882e-07, + "loss": 1.5681, + "step": 6808 + }, + { + "epoch": 0.9085935414998666, + "grad_norm": 0.9718108077335248, + "learning_rate": 4.3503803087476926e-07, + "loss": 1.5478, + "step": 6809 + }, + { + "epoch": 0.9087269815852682, + "grad_norm": 0.9602753698421362, + "learning_rate": 4.337780309761441e-07, + "loss": 1.554, + "step": 6810 + }, + { + "epoch": 0.9088604216706698, + "grad_norm": 0.9424771815385352, + "learning_rate": 4.3251981793886787e-07, + "loss": 1.6088, + "step": 6811 + }, + { + "epoch": 0.9089938617560716, + "grad_norm": 0.9274015813847016, + "learning_rate": 4.312633919979603e-07, + "loss": 1.5803, + "step": 6812 + }, + { + "epoch": 0.9091273018414732, + "grad_norm": 0.9230912583513253, + "learning_rate": 4.300087533881059e-07, + "loss": 1.5684, + "step": 6813 + }, + { + "epoch": 0.9092607419268748, + "grad_norm": 0.9305252391203362, + "learning_rate": 4.2875590234365825e-07, + "loss": 1.4863, + "step": 6814 + }, + { + "epoch": 0.9093941820122765, + "grad_norm": 0.9029270378762562, + "learning_rate": 4.2750483909863584e-07, + "loss": 1.5725, + "step": 6815 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 0.9383970397120545, + "learning_rate": 4.2625556388672163e-07, + "loss": 1.5339, + "step": 6816 + }, + { + "epoch": 0.9096610621830798, + "grad_norm": 0.9345725438722642, + "learning_rate": 4.2500807694126677e-07, + "loss": 1.5505, + "step": 6817 + }, + { + "epoch": 0.9097945022684815, + "grad_norm": 0.9259234911532692, + "learning_rate": 4.2376237849528936e-07, + "loss": 1.5278, + "step": 6818 + }, + { + "epoch": 0.9099279423538831, + "grad_norm": 1.0476543726975824, + "learning_rate": 4.2251846878146873e-07, + "loss": 1.5572, + "step": 6819 + }, + { + "epoch": 0.9100613824392848, + "grad_norm": 0.9337601320766555, + "learning_rate": 4.2127634803215576e-07, + "loss": 1.5409, + "step": 6820 + }, + { + "epoch": 0.9101948225246864, + 
"grad_norm": 0.9335871299642117, + "learning_rate": 4.2003601647936156e-07, + "loss": 1.5401, + "step": 6821 + }, + { + "epoch": 0.910328262610088, + "grad_norm": 0.9622149934684122, + "learning_rate": 4.187974743547674e-07, + "loss": 1.5137, + "step": 6822 + }, + { + "epoch": 0.9104617026954898, + "grad_norm": 0.9148887346642454, + "learning_rate": 4.175607218897204e-07, + "loss": 1.5505, + "step": 6823 + }, + { + "epoch": 0.9105951427808914, + "grad_norm": 1.0530922369152327, + "learning_rate": 4.1632575931522923e-07, + "loss": 1.5171, + "step": 6824 + }, + { + "epoch": 0.910728582866293, + "grad_norm": 0.9530413630746577, + "learning_rate": 4.150925868619726e-07, + "loss": 1.6038, + "step": 6825 + }, + { + "epoch": 0.9108620229516947, + "grad_norm": 0.9248293389790625, + "learning_rate": 4.138612047602919e-07, + "loss": 1.53, + "step": 6826 + }, + { + "epoch": 0.9109954630370963, + "grad_norm": 0.9435050547492767, + "learning_rate": 4.126316132401975e-07, + "loss": 1.5497, + "step": 6827 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 0.9172974815783576, + "learning_rate": 4.1140381253135907e-07, + "loss": 1.53, + "step": 6828 + }, + { + "epoch": 0.9112623432078997, + "grad_norm": 0.9198332491276605, + "learning_rate": 4.101778028631198e-07, + "loss": 1.4973, + "step": 6829 + }, + { + "epoch": 0.9113957832933013, + "grad_norm": 0.9340095445157393, + "learning_rate": 4.0895358446448205e-07, + "loss": 1.5311, + "step": 6830 + }, + { + "epoch": 0.911529223378703, + "grad_norm": 0.9218669809304821, + "learning_rate": 4.077311575641174e-07, + "loss": 1.4977, + "step": 6831 + }, + { + "epoch": 0.9116626634641046, + "grad_norm": 0.9386440639251815, + "learning_rate": 4.0651052239035873e-07, + "loss": 1.4714, + "step": 6832 + }, + { + "epoch": 0.9117961035495062, + "grad_norm": 0.9188422179185332, + "learning_rate": 4.0529167917121024e-07, + "loss": 1.5086, + "step": 6833 + }, + { + "epoch": 0.911929543634908, + "grad_norm": 0.9929237105805191, + "learning_rate": 4.040746281343355e-07, + "loss": 1.5435, + "step": 6834 + }, + { + "epoch": 0.9120629837203096, + "grad_norm": 0.9262323335599812, + "learning_rate": 4.0285936950706705e-07, + "loss": 1.543, + "step": 6835 + }, + { + "epoch": 0.9121964238057112, + "grad_norm": 0.9413275796185295, + "learning_rate": 4.016459035164e-07, + "loss": 1.5475, + "step": 6836 + }, + { + "epoch": 0.9123298638911129, + "grad_norm": 0.9355976771543438, + "learning_rate": 4.0043423038899743e-07, + "loss": 1.5646, + "step": 6837 + }, + { + "epoch": 0.9124633039765145, + "grad_norm": 0.9308452239118499, + "learning_rate": 3.99224350351185e-07, + "loss": 1.5282, + "step": 6838 + }, + { + "epoch": 0.9125967440619162, + "grad_norm": 0.9352299931869122, + "learning_rate": 3.9801626362895527e-07, + "loss": 1.5478, + "step": 6839 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 0.920552426822107, + "learning_rate": 3.9680997044796445e-07, + "loss": 1.571, + "step": 6840 + }, + { + "epoch": 0.9128636242327195, + "grad_norm": 0.9614541460432983, + "learning_rate": 3.9560547103353553e-07, + "loss": 1.5716, + "step": 6841 + }, + { + "epoch": 0.9129970643181211, + "grad_norm": 0.921161207602936, + "learning_rate": 3.9440276561065306e-07, + "loss": 1.5062, + "step": 6842 + }, + { + "epoch": 0.9131305044035228, + "grad_norm": 0.9298591480513266, + "learning_rate": 3.932018544039717e-07, + "loss": 1.5823, + "step": 6843 + }, + { + "epoch": 0.9132639444889245, + "grad_norm": 0.9522285441788466, + "learning_rate": 3.920027376378055e-07, + "loss": 1.5636, + "step": 6844 + 
}, + { + "epoch": 0.9133973845743262, + "grad_norm": 0.9048091631898952, + "learning_rate": 3.9080541553613737e-07, + "loss": 1.5236, + "step": 6845 + }, + { + "epoch": 0.9135308246597278, + "grad_norm": 0.8997967100666988, + "learning_rate": 3.8960988832261184e-07, + "loss": 1.543, + "step": 6846 + }, + { + "epoch": 0.9136642647451294, + "grad_norm": 0.9344054356817217, + "learning_rate": 3.884161562205413e-07, + "loss": 1.5238, + "step": 6847 + }, + { + "epoch": 0.9137977048305311, + "grad_norm": 0.9433085668577765, + "learning_rate": 3.872242194529019e-07, + "loss": 1.5323, + "step": 6848 + }, + { + "epoch": 0.9139311449159327, + "grad_norm": 0.9298922807904612, + "learning_rate": 3.8603407824233e-07, + "loss": 1.5804, + "step": 6849 + }, + { + "epoch": 0.9140645850013344, + "grad_norm": 0.916116285836366, + "learning_rate": 3.8484573281113546e-07, + "loss": 1.5167, + "step": 6850 + }, + { + "epoch": 0.9141980250867361, + "grad_norm": 1.0047053088456148, + "learning_rate": 3.8365918338128525e-07, + "loss": 1.5509, + "step": 6851 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 0.9283300536875279, + "learning_rate": 3.8247443017441323e-07, + "loss": 1.5111, + "step": 6852 + }, + { + "epoch": 0.9144649052575393, + "grad_norm": 1.1042519917914184, + "learning_rate": 3.812914734118167e-07, + "loss": 1.5714, + "step": 6853 + }, + { + "epoch": 0.914598345342941, + "grad_norm": 0.9081858321179135, + "learning_rate": 3.8011031331446125e-07, + "loss": 1.5021, + "step": 6854 + }, + { + "epoch": 0.9147317854283427, + "grad_norm": 0.9771021703620095, + "learning_rate": 3.7893095010297255e-07, + "loss": 1.5391, + "step": 6855 + }, + { + "epoch": 0.9148652255137443, + "grad_norm": 0.9348735621584969, + "learning_rate": 3.7775338399764106e-07, + "loss": 1.5353, + "step": 6856 + }, + { + "epoch": 0.914998665599146, + "grad_norm": 1.217330780323833, + "learning_rate": 3.765776152184264e-07, + "loss": 1.5584, + "step": 6857 + }, + { + "epoch": 0.9151321056845476, + "grad_norm": 0.9415726884299152, + "learning_rate": 3.754036439849451e-07, + "loss": 1.5495, + "step": 6858 + }, + { + "epoch": 0.9152655457699493, + "grad_norm": 0.9466565821101811, + "learning_rate": 3.742314705164829e-07, + "loss": 1.5281, + "step": 6859 + }, + { + "epoch": 0.915398985855351, + "grad_norm": 0.9439757015604497, + "learning_rate": 3.7306109503198797e-07, + "loss": 1.6172, + "step": 6860 + }, + { + "epoch": 0.9155324259407526, + "grad_norm": 0.9358623022886202, + "learning_rate": 3.718925177500743e-07, + "loss": 1.5008, + "step": 6861 + }, + { + "epoch": 0.9156658660261543, + "grad_norm": 0.9532844740769746, + "learning_rate": 3.707257388890195e-07, + "loss": 1.5318, + "step": 6862 + }, + { + "epoch": 0.9157993061115559, + "grad_norm": 0.9749521833223408, + "learning_rate": 3.6956075866676155e-07, + "loss": 1.528, + "step": 6863 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 1.1423465704015638, + "learning_rate": 3.6839757730090744e-07, + "loss": 1.5111, + "step": 6864 + }, + { + "epoch": 0.9160661862823593, + "grad_norm": 0.9134442615213153, + "learning_rate": 3.672361950087266e-07, + "loss": 1.5284, + "step": 6865 + }, + { + "epoch": 0.9161996263677609, + "grad_norm": 0.9286686714079571, + "learning_rate": 3.660766120071513e-07, + "loss": 1.4942, + "step": 6866 + }, + { + "epoch": 0.9163330664531625, + "grad_norm": 0.9431823883198829, + "learning_rate": 3.649188285127769e-07, + "loss": 1.5244, + "step": 6867 + }, + { + "epoch": 0.9164665065385642, + "grad_norm": 1.0666489900172478, + "learning_rate": 
3.637628447418673e-07, + "loss": 1.5604, + "step": 6868 + }, + { + "epoch": 0.9165999466239658, + "grad_norm": 0.9836730783947925, + "learning_rate": 3.626086609103463e-07, + "loss": 1.5952, + "step": 6869 + }, + { + "epoch": 0.9167333867093675, + "grad_norm": 0.9593749865507312, + "learning_rate": 3.614562772338015e-07, + "loss": 1.4742, + "step": 6870 + }, + { + "epoch": 0.9168668267947692, + "grad_norm": 1.0579631089786683, + "learning_rate": 3.6030569392748294e-07, + "loss": 1.5043, + "step": 6871 + }, + { + "epoch": 0.9170002668801708, + "grad_norm": 0.979103974391311, + "learning_rate": 3.5915691120631093e-07, + "loss": 1.5323, + "step": 6872 + }, + { + "epoch": 0.9171337069655725, + "grad_norm": 1.503246770281354, + "learning_rate": 3.5800992928486265e-07, + "loss": 1.5514, + "step": 6873 + }, + { + "epoch": 0.9172671470509741, + "grad_norm": 0.9405027952328543, + "learning_rate": 3.568647483773813e-07, + "loss": 1.5276, + "step": 6874 + }, + { + "epoch": 0.9174005871363757, + "grad_norm": 1.0134385203890537, + "learning_rate": 3.557213686977723e-07, + "loss": 1.5814, + "step": 6875 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 0.9200398848804521, + "learning_rate": 3.545797904596082e-07, + "loss": 1.5544, + "step": 6876 + }, + { + "epoch": 0.9176674673071791, + "grad_norm": 0.9685297046655957, + "learning_rate": 3.5344001387612293e-07, + "loss": 1.5557, + "step": 6877 + }, + { + "epoch": 0.9178009073925807, + "grad_norm": 0.9225117368528908, + "learning_rate": 3.5230203916021277e-07, + "loss": 1.5779, + "step": 6878 + }, + { + "epoch": 0.9179343474779824, + "grad_norm": 0.9089807335651439, + "learning_rate": 3.511658665244377e-07, + "loss": 1.4707, + "step": 6879 + }, + { + "epoch": 0.918067787563384, + "grad_norm": 1.0909826397284352, + "learning_rate": 3.5003149618102253e-07, + "loss": 1.5554, + "step": 6880 + }, + { + "epoch": 0.9182012276487856, + "grad_norm": 0.9370669644460864, + "learning_rate": 3.4889892834185533e-07, + "loss": 1.4784, + "step": 6881 + }, + { + "epoch": 0.9183346677341874, + "grad_norm": 0.9322223361726935, + "learning_rate": 3.4776816321848596e-07, + "loss": 1.5147, + "step": 6882 + }, + { + "epoch": 0.918468107819589, + "grad_norm": 0.9329739735097461, + "learning_rate": 3.466392010221298e-07, + "loss": 1.5383, + "step": 6883 + }, + { + "epoch": 0.9186015479049907, + "grad_norm": 0.9151417878518067, + "learning_rate": 3.4551204196366263e-07, + "loss": 1.5576, + "step": 6884 + }, + { + "epoch": 0.9187349879903923, + "grad_norm": 0.951435126300393, + "learning_rate": 3.4438668625362493e-07, + "loss": 1.5314, + "step": 6885 + }, + { + "epoch": 0.9188684280757939, + "grad_norm": 0.913279499004807, + "learning_rate": 3.432631341022219e-07, + "loss": 1.4513, + "step": 6886 + }, + { + "epoch": 0.9190018681611957, + "grad_norm": 0.9441844786118736, + "learning_rate": 3.42141385719319e-07, + "loss": 1.5387, + "step": 6887 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 0.932767307846722, + "learning_rate": 3.410214413144464e-07, + "loss": 1.5246, + "step": 6888 + }, + { + "epoch": 0.9192687483319989, + "grad_norm": 0.9472847800493374, + "learning_rate": 3.399033010967967e-07, + "loss": 1.4983, + "step": 6889 + }, + { + "epoch": 0.9194021884174006, + "grad_norm": 1.0419370287145757, + "learning_rate": 3.3878696527522624e-07, + "loss": 1.5103, + "step": 6890 + }, + { + "epoch": 0.9195356285028022, + "grad_norm": 0.9681983142545559, + "learning_rate": 3.3767243405825487e-07, + "loss": 1.5462, + "step": 6891 + }, + { + "epoch": 0.9196690685882039, + 
"grad_norm": 0.9052614017344128, + "learning_rate": 3.3655970765406056e-07, + "loss": 1.4736, + "step": 6892 + }, + { + "epoch": 0.9198025086736056, + "grad_norm": 1.0228191814158987, + "learning_rate": 3.354487862704925e-07, + "loss": 1.5639, + "step": 6893 + }, + { + "epoch": 0.9199359487590072, + "grad_norm": 0.9360422534666102, + "learning_rate": 3.3433967011505586e-07, + "loss": 1.5466, + "step": 6894 + }, + { + "epoch": 0.9200693888444088, + "grad_norm": 0.9264242257227067, + "learning_rate": 3.332323593949205e-07, + "loss": 1.5214, + "step": 6895 + }, + { + "epoch": 0.9202028289298105, + "grad_norm": 0.9785840279808523, + "learning_rate": 3.3212685431691983e-07, + "loss": 1.521, + "step": 6896 + }, + { + "epoch": 0.9203362690152121, + "grad_norm": 0.990110652510918, + "learning_rate": 3.310231550875509e-07, + "loss": 1.539, + "step": 6897 + }, + { + "epoch": 0.9204697091006139, + "grad_norm": 0.9933025587703744, + "learning_rate": 3.29921261912971e-07, + "loss": 1.5062, + "step": 6898 + }, + { + "epoch": 0.9206031491860155, + "grad_norm": 0.924550064236254, + "learning_rate": 3.28821174999e-07, + "loss": 1.5394, + "step": 6899 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 0.9765816922537065, + "learning_rate": 3.277228945511246e-07, + "loss": 1.4902, + "step": 6900 + }, + { + "epoch": 0.9208700293568188, + "grad_norm": 0.9523736840444362, + "learning_rate": 3.266264207744885e-07, + "loss": 1.5404, + "step": 6901 + }, + { + "epoch": 0.9210034694422204, + "grad_norm": 0.9485498472126853, + "learning_rate": 3.2553175387390225e-07, + "loss": 1.5547, + "step": 6902 + }, + { + "epoch": 0.9211369095276221, + "grad_norm": 0.9345918935854542, + "learning_rate": 3.2443889405383564e-07, + "loss": 1.4989, + "step": 6903 + }, + { + "epoch": 0.9212703496130238, + "grad_norm": 0.9620003150256877, + "learning_rate": 3.2334784151842434e-07, + "loss": 1.5763, + "step": 6904 + }, + { + "epoch": 0.9214037896984254, + "grad_norm": 0.9533414019445823, + "learning_rate": 3.2225859647146306e-07, + "loss": 1.5578, + "step": 6905 + }, + { + "epoch": 0.921537229783827, + "grad_norm": 1.030837787394548, + "learning_rate": 3.2117115911640904e-07, + "loss": 1.5385, + "step": 6906 + }, + { + "epoch": 0.9216706698692287, + "grad_norm": 0.9838369808924554, + "learning_rate": 3.200855296563865e-07, + "loss": 1.5466, + "step": 6907 + }, + { + "epoch": 0.9218041099546304, + "grad_norm": 0.9369615843893399, + "learning_rate": 3.1900170829417765e-07, + "loss": 1.5616, + "step": 6908 + }, + { + "epoch": 0.921937550040032, + "grad_norm": 1.0512907143003218, + "learning_rate": 3.179196952322272e-07, + "loss": 1.5901, + "step": 6909 + }, + { + "epoch": 0.9220709901254337, + "grad_norm": 0.9920000789006053, + "learning_rate": 3.168394906726413e-07, + "loss": 1.5857, + "step": 6910 + }, + { + "epoch": 0.9222044302108353, + "grad_norm": 0.9257587202232741, + "learning_rate": 3.1576109481719294e-07, + "loss": 1.5313, + "step": 6911 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 0.9418319592189258, + "learning_rate": 3.146845078673133e-07, + "loss": 1.539, + "step": 6912 + }, + { + "epoch": 0.9224713103816387, + "grad_norm": 0.9070087342084403, + "learning_rate": 3.136097300240948e-07, + "loss": 1.4764, + "step": 6913 + }, + { + "epoch": 0.9226047504670403, + "grad_norm": 0.965385596189811, + "learning_rate": 3.125367614882957e-07, + "loss": 1.5912, + "step": 6914 + }, + { + "epoch": 0.922738190552442, + "grad_norm": 0.9457846928440798, + "learning_rate": 3.114656024603346e-07, + "loss": 1.6022, + "step": 6915 + 
}, + { + "epoch": 0.9228716306378436, + "grad_norm": 0.9626967040370848, + "learning_rate": 3.1039625314028934e-07, + "loss": 1.5788, + "step": 6916 + }, + { + "epoch": 0.9230050707232452, + "grad_norm": 0.9262926987232699, + "learning_rate": 3.093287137279044e-07, + "loss": 1.5076, + "step": 6917 + }, + { + "epoch": 0.923138510808647, + "grad_norm": 0.9263671794999331, + "learning_rate": 3.0826298442258263e-07, + "loss": 1.5376, + "step": 6918 + }, + { + "epoch": 0.9232719508940486, + "grad_norm": 1.0802339378714219, + "learning_rate": 3.071990654233925e-07, + "loss": 1.5194, + "step": 6919 + }, + { + "epoch": 0.9234053909794502, + "grad_norm": 0.9517175443121075, + "learning_rate": 3.0613695692905955e-07, + "loss": 1.5472, + "step": 6920 + }, + { + "epoch": 0.9235388310648519, + "grad_norm": 0.9997691999553755, + "learning_rate": 3.050766591379739e-07, + "loss": 1.5521, + "step": 6921 + }, + { + "epoch": 0.9236722711502535, + "grad_norm": 0.927201693441311, + "learning_rate": 3.040181722481872e-07, + "loss": 1.5507, + "step": 6922 + }, + { + "epoch": 0.9238057112356551, + "grad_norm": 0.9436121306544267, + "learning_rate": 3.0296149645741344e-07, + "loss": 1.5319, + "step": 6923 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 0.9358298860342295, + "learning_rate": 3.0190663196302706e-07, + "loss": 1.512, + "step": 6924 + }, + { + "epoch": 0.9240725914064585, + "grad_norm": 0.9671848262898582, + "learning_rate": 3.008535789620648e-07, + "loss": 1.5107, + "step": 6925 + }, + { + "epoch": 0.9242060314918602, + "grad_norm": 0.9462619779524947, + "learning_rate": 2.99802337651226e-07, + "loss": 1.5133, + "step": 6926 + }, + { + "epoch": 0.9243394715772618, + "grad_norm": 0.9375880200694752, + "learning_rate": 2.987529082268692e-07, + "loss": 1.5224, + "step": 6927 + }, + { + "epoch": 0.9244729116626634, + "grad_norm": 0.9185590934633807, + "learning_rate": 2.977052908850142e-07, + "loss": 1.5366, + "step": 6928 + }, + { + "epoch": 0.9246063517480652, + "grad_norm": 0.925660519615923, + "learning_rate": 2.9665948582134783e-07, + "loss": 1.5373, + "step": 6929 + }, + { + "epoch": 0.9247397918334668, + "grad_norm": 0.9558442289500324, + "learning_rate": 2.9561549323121385e-07, + "loss": 1.5556, + "step": 6930 + }, + { + "epoch": 0.9248732319188684, + "grad_norm": 0.9105723816514041, + "learning_rate": 2.9457331330961513e-07, + "loss": 1.5277, + "step": 6931 + }, + { + "epoch": 0.9250066720042701, + "grad_norm": 0.9402020298370549, + "learning_rate": 2.935329462512226e-07, + "loss": 1.5444, + "step": 6932 + }, + { + "epoch": 0.9251401120896717, + "grad_norm": 0.9389120664607241, + "learning_rate": 2.9249439225036313e-07, + "loss": 1.5906, + "step": 6933 + }, + { + "epoch": 0.9252735521750733, + "grad_norm": 0.9566777094490774, + "learning_rate": 2.91457651501027e-07, + "loss": 1.5388, + "step": 6934 + }, + { + "epoch": 0.9254069922604751, + "grad_norm": 0.9475669400048962, + "learning_rate": 2.904227241968638e-07, + "loss": 1.5122, + "step": 6935 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 0.9606016426402592, + "learning_rate": 2.8938961053118997e-07, + "loss": 1.5747, + "step": 6936 + }, + { + "epoch": 0.9256738724312784, + "grad_norm": 0.9416160631017734, + "learning_rate": 2.8835831069697786e-07, + "loss": 1.5942, + "step": 6937 + }, + { + "epoch": 0.92580731251668, + "grad_norm": 0.9255067631796212, + "learning_rate": 2.873288248868611e-07, + "loss": 1.5681, + "step": 6938 + }, + { + "epoch": 0.9259407526020816, + "grad_norm": 0.9354963560086351, + "learning_rate": 
2.86301153293137e-07, + "loss": 1.5262, + "step": 6939 + }, + { + "epoch": 0.9260741926874834, + "grad_norm": 0.9413294820231983, + "learning_rate": 2.852752961077632e-07, + "loss": 1.535, + "step": 6940 + }, + { + "epoch": 0.926207632772885, + "grad_norm": 0.9362918263402716, + "learning_rate": 2.842512535223585e-07, + "loss": 1.5211, + "step": 6941 + }, + { + "epoch": 0.9263410728582866, + "grad_norm": 0.921974590840743, + "learning_rate": 2.832290257282e-07, + "loss": 1.5103, + "step": 6942 + }, + { + "epoch": 0.9264745129436883, + "grad_norm": 0.9369764170400131, + "learning_rate": 2.822086129162305e-07, + "loss": 1.5596, + "step": 6943 + }, + { + "epoch": 0.9266079530290899, + "grad_norm": 0.9147557346642654, + "learning_rate": 2.811900152770519e-07, + "loss": 1.5058, + "step": 6944 + }, + { + "epoch": 0.9267413931144916, + "grad_norm": 0.9679302190588822, + "learning_rate": 2.8017323300092435e-07, + "loss": 1.576, + "step": 6945 + }, + { + "epoch": 0.9268748331998933, + "grad_norm": 0.9466621416815499, + "learning_rate": 2.791582662777725e-07, + "loss": 1.5155, + "step": 6946 + }, + { + "epoch": 0.9270082732852949, + "grad_norm": 1.0259125716265545, + "learning_rate": 2.7814511529718146e-07, + "loss": 1.588, + "step": 6947 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 0.9713536021169017, + "learning_rate": 2.771337802483953e-07, + "loss": 1.505, + "step": 6948 + }, + { + "epoch": 0.9272751534560982, + "grad_norm": 0.9124231485781754, + "learning_rate": 2.7612426132031966e-07, + "loss": 1.5179, + "step": 6949 + }, + { + "epoch": 0.9274085935414998, + "grad_norm": 0.9450377082479692, + "learning_rate": 2.751165587015214e-07, + "loss": 1.516, + "step": 6950 + }, + { + "epoch": 0.9275420336269016, + "grad_norm": 0.9430651969927591, + "learning_rate": 2.7411067258022896e-07, + "loss": 1.5616, + "step": 6951 + }, + { + "epoch": 0.9276754737123032, + "grad_norm": 1.1461572084836693, + "learning_rate": 2.731066031443275e-07, + "loss": 1.5797, + "step": 6952 + }, + { + "epoch": 0.9278089137977048, + "grad_norm": 0.9850748413044481, + "learning_rate": 2.721043505813692e-07, + "loss": 1.6142, + "step": 6953 + }, + { + "epoch": 0.9279423538831065, + "grad_norm": 0.9410326470678416, + "learning_rate": 2.7110391507856215e-07, + "loss": 1.5513, + "step": 6954 + }, + { + "epoch": 0.9280757939685081, + "grad_norm": 0.9717540522614107, + "learning_rate": 2.7010529682277573e-07, + "loss": 1.5271, + "step": 6955 + }, + { + "epoch": 0.9282092340539098, + "grad_norm": 1.2527445070154288, + "learning_rate": 2.691084960005408e-07, + "loss": 1.554, + "step": 6956 + }, + { + "epoch": 0.9283426741393115, + "grad_norm": 0.9380409123192367, + "learning_rate": 2.681135127980483e-07, + "loss": 1.4966, + "step": 6957 + }, + { + "epoch": 0.9284761142247131, + "grad_norm": 1.067794635668956, + "learning_rate": 2.671203474011508e-07, + "loss": 1.4979, + "step": 6958 + }, + { + "epoch": 0.9286095543101147, + "grad_norm": 0.924552942569084, + "learning_rate": 2.6612899999535867e-07, + "loss": 1.6066, + "step": 6959 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 0.9844576271371721, + "learning_rate": 2.65139470765845e-07, + "loss": 1.558, + "step": 6960 + }, + { + "epoch": 0.9288764344809181, + "grad_norm": 0.9671888751234345, + "learning_rate": 2.64151759897443e-07, + "loss": 1.5542, + "step": 6961 + }, + { + "epoch": 0.9290098745663197, + "grad_norm": 0.9178373406964115, + "learning_rate": 2.6316586757464513e-07, + "loss": 1.5549, + "step": 6962 + }, + { + "epoch": 0.9291433146517214, + "grad_norm": 
0.9799980886987, + "learning_rate": 2.621817939816051e-07, + "loss": 1.5763, + "step": 6963 + }, + { + "epoch": 0.929276754737123, + "grad_norm": 0.9239659156537983, + "learning_rate": 2.6119953930213713e-07, + "loss": 1.5654, + "step": 6964 + }, + { + "epoch": 0.9294101948225247, + "grad_norm": 1.0117689513131394, + "learning_rate": 2.602191037197155e-07, + "loss": 1.6097, + "step": 6965 + }, + { + "epoch": 0.9295436349079264, + "grad_norm": 0.9259063770945314, + "learning_rate": 2.592404874174714e-07, + "loss": 1.5442, + "step": 6966 + }, + { + "epoch": 0.929677074993328, + "grad_norm": 0.94289731705226, + "learning_rate": 2.582636905782032e-07, + "loss": 1.5511, + "step": 6967 + }, + { + "epoch": 0.9298105150787297, + "grad_norm": 1.2363951568921343, + "learning_rate": 2.572887133843638e-07, + "loss": 1.5284, + "step": 6968 + }, + { + "epoch": 0.9299439551641313, + "grad_norm": 0.9237895244839454, + "learning_rate": 2.5631555601806746e-07, + "loss": 1.5666, + "step": 6969 + }, + { + "epoch": 0.9300773952495329, + "grad_norm": 1.2534199107639514, + "learning_rate": 2.5534421866108884e-07, + "loss": 1.5386, + "step": 6970 + }, + { + "epoch": 0.9302108353349346, + "grad_norm": 0.9440071222676694, + "learning_rate": 2.543747014948639e-07, + "loss": 1.5252, + "step": 6971 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 0.9410423473034675, + "learning_rate": 2.5340700470048664e-07, + "loss": 1.5539, + "step": 6972 + }, + { + "epoch": 0.9304777155057379, + "grad_norm": 0.9370283821223521, + "learning_rate": 2.524411284587114e-07, + "loss": 1.5497, + "step": 6973 + }, + { + "epoch": 0.9306111555911396, + "grad_norm": 0.9353742776563619, + "learning_rate": 2.5147707294995274e-07, + "loss": 1.4892, + "step": 6974 + }, + { + "epoch": 0.9307445956765412, + "grad_norm": 0.9297881657463101, + "learning_rate": 2.505148383542866e-07, + "loss": 1.5467, + "step": 6975 + }, + { + "epoch": 0.9308780357619428, + "grad_norm": 0.952275461349787, + "learning_rate": 2.495544248514459e-07, + "loss": 1.4877, + "step": 6976 + }, + { + "epoch": 0.9310114758473446, + "grad_norm": 0.9152447233126888, + "learning_rate": 2.48595832620826e-07, + "loss": 1.5295, + "step": 6977 + }, + { + "epoch": 0.9311449159327462, + "grad_norm": 0.91217801833338, + "learning_rate": 2.476390618414803e-07, + "loss": 1.5292, + "step": 6978 + }, + { + "epoch": 0.9312783560181479, + "grad_norm": 0.927701396795192, + "learning_rate": 2.4668411269212377e-07, + "loss": 1.5662, + "step": 6979 + }, + { + "epoch": 0.9314117961035495, + "grad_norm": 0.9404375961303916, + "learning_rate": 2.4573098535112913e-07, + "loss": 1.5882, + "step": 6980 + }, + { + "epoch": 0.9315452361889511, + "grad_norm": 0.9405292351716275, + "learning_rate": 2.4477967999652854e-07, + "loss": 1.512, + "step": 6981 + }, + { + "epoch": 0.9316786762743529, + "grad_norm": 0.9390791133758444, + "learning_rate": 2.438301968060186e-07, + "loss": 1.5456, + "step": 6982 + }, + { + "epoch": 0.9318121163597545, + "grad_norm": 0.9294723456622169, + "learning_rate": 2.4288253595694865e-07, + "loss": 1.5763, + "step": 6983 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 0.9086002326565813, + "learning_rate": 2.419366976263315e-07, + "loss": 1.5412, + "step": 6984 + }, + { + "epoch": 0.9320789965305578, + "grad_norm": 1.0686971025848702, + "learning_rate": 2.409926819908404e-07, + "loss": 1.5746, + "step": 6985 + }, + { + "epoch": 0.9322124366159594, + "grad_norm": 1.0891885771737662, + "learning_rate": 2.400504892268052e-07, + "loss": 1.5662, + "step": 6986 + }, + { + 
"epoch": 0.932345876701361, + "grad_norm": 0.9460922151250107, + "learning_rate": 2.391101195102175e-07, + "loss": 1.557, + "step": 6987 + }, + { + "epoch": 0.9324793167867628, + "grad_norm": 0.985651528432505, + "learning_rate": 2.3817157301672777e-07, + "loss": 1.578, + "step": 6988 + }, + { + "epoch": 0.9326127568721644, + "grad_norm": 0.9664112206315217, + "learning_rate": 2.372348499216459e-07, + "loss": 1.539, + "step": 6989 + }, + { + "epoch": 0.932746196957566, + "grad_norm": 0.9490309404868328, + "learning_rate": 2.3629995039994082e-07, + "loss": 1.5389, + "step": 6990 + }, + { + "epoch": 0.9328796370429677, + "grad_norm": 0.9577672566554155, + "learning_rate": 2.3536687462624053e-07, + "loss": 1.5601, + "step": 6991 + }, + { + "epoch": 0.9330130771283693, + "grad_norm": 0.9223528519716557, + "learning_rate": 2.3443562277483345e-07, + "loss": 1.5533, + "step": 6992 + }, + { + "epoch": 0.9331465172137711, + "grad_norm": 0.9427663030847381, + "learning_rate": 2.335061950196671e-07, + "loss": 1.5562, + "step": 6993 + }, + { + "epoch": 0.9332799572991727, + "grad_norm": 0.9523796363575632, + "learning_rate": 2.3257859153434815e-07, + "loss": 1.556, + "step": 6994 + }, + { + "epoch": 0.9334133973845743, + "grad_norm": 0.9514855637959606, + "learning_rate": 2.3165281249214133e-07, + "loss": 1.5486, + "step": 6995 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 0.8960652793709516, + "learning_rate": 2.307288580659728e-07, + "loss": 1.4996, + "step": 6996 + }, + { + "epoch": 0.9336802775553776, + "grad_norm": 0.9468839774939591, + "learning_rate": 2.2980672842842665e-07, + "loss": 1.5557, + "step": 6997 + }, + { + "epoch": 0.9338137176407793, + "grad_norm": 0.9323423502741678, + "learning_rate": 2.288864237517463e-07, + "loss": 1.5423, + "step": 6998 + }, + { + "epoch": 0.933947157726181, + "grad_norm": 0.9477440100001612, + "learning_rate": 2.2796794420783198e-07, + "loss": 1.5839, + "step": 6999 + }, + { + "epoch": 0.9340805978115826, + "grad_norm": 0.9719508936768406, + "learning_rate": 2.2705128996824755e-07, + "loss": 1.5303, + "step": 7000 + }, + { + "epoch": 0.9342140378969842, + "grad_norm": 0.9511975135556928, + "learning_rate": 2.2613646120421383e-07, + "loss": 1.5622, + "step": 7001 + }, + { + "epoch": 0.9343474779823859, + "grad_norm": 0.9164128395943615, + "learning_rate": 2.2522345808660861e-07, + "loss": 1.5558, + "step": 7002 + }, + { + "epoch": 0.9344809180677875, + "grad_norm": 0.9532778700819494, + "learning_rate": 2.24312280785971e-07, + "loss": 1.5872, + "step": 7003 + }, + { + "epoch": 0.9346143581531893, + "grad_norm": 0.9256358437386732, + "learning_rate": 2.2340292947250043e-07, + "loss": 1.5318, + "step": 7004 + }, + { + "epoch": 0.9347477982385909, + "grad_norm": 1.0723225175898141, + "learning_rate": 2.2249540431605099e-07, + "loss": 1.5304, + "step": 7005 + }, + { + "epoch": 0.9348812383239925, + "grad_norm": 0.941909432602312, + "learning_rate": 2.2158970548613934e-07, + "loss": 1.4993, + "step": 7006 + }, + { + "epoch": 0.9350146784093942, + "grad_norm": 0.9580854716526129, + "learning_rate": 2.2068583315193902e-07, + "loss": 1.5505, + "step": 7007 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 0.9149155992270841, + "learning_rate": 2.197837874822839e-07, + "loss": 1.5243, + "step": 7008 + }, + { + "epoch": 0.9352815585801975, + "grad_norm": 0.931850561675305, + "learning_rate": 2.1888356864566363e-07, + "loss": 1.5763, + "step": 7009 + }, + { + "epoch": 0.9354149986655992, + "grad_norm": 0.945264466148816, + "learning_rate": 
2.1798517681023257e-07, + "loss": 1.5681, + "step": 7010 + }, + { + "epoch": 0.9355484387510008, + "grad_norm": 1.1408246656369734, + "learning_rate": 2.1708861214379762e-07, + "loss": 1.5508, + "step": 7011 + }, + { + "epoch": 0.9356818788364024, + "grad_norm": 0.9305152136394855, + "learning_rate": 2.1619387481382704e-07, + "loss": 1.4831, + "step": 7012 + }, + { + "epoch": 0.9358153189218041, + "grad_norm": 0.9374514423774111, + "learning_rate": 2.153009649874471e-07, + "loss": 1.534, + "step": 7013 + }, + { + "epoch": 0.9359487590072058, + "grad_norm": 0.9373740971918221, + "learning_rate": 2.144098828314445e-07, + "loss": 1.4639, + "step": 7014 + }, + { + "epoch": 0.9360821990926074, + "grad_norm": 0.9202329086139532, + "learning_rate": 2.1352062851226263e-07, + "loss": 1.5189, + "step": 7015 + }, + { + "epoch": 0.9362156391780091, + "grad_norm": 0.9251964887586024, + "learning_rate": 2.1263320219600426e-07, + "loss": 1.5786, + "step": 7016 + }, + { + "epoch": 0.9363490792634107, + "grad_norm": 0.9284256574113852, + "learning_rate": 2.1174760404843008e-07, + "loss": 1.526, + "step": 7017 + }, + { + "epoch": 0.9364825193488124, + "grad_norm": 1.882242068390261, + "learning_rate": 2.1086383423496004e-07, + "loss": 1.5768, + "step": 7018 + }, + { + "epoch": 0.936615959434214, + "grad_norm": 0.9901007118211904, + "learning_rate": 2.0998189292067316e-07, + "loss": 1.5077, + "step": 7019 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 1.0159260253271438, + "learning_rate": 2.0910178027030326e-07, + "loss": 1.5657, + "step": 7020 + }, + { + "epoch": 0.9368828396050174, + "grad_norm": 0.9401229086492431, + "learning_rate": 2.082234964482488e-07, + "loss": 1.5582, + "step": 7021 + }, + { + "epoch": 0.937016279690419, + "grad_norm": 1.0751751887062786, + "learning_rate": 2.073470416185619e-07, + "loss": 1.5412, + "step": 7022 + }, + { + "epoch": 0.9371497197758206, + "grad_norm": 0.9506968457876054, + "learning_rate": 2.0647241594495381e-07, + "loss": 1.525, + "step": 7023 + }, + { + "epoch": 0.9372831598612223, + "grad_norm": 0.9301700959194582, + "learning_rate": 2.0559961959079278e-07, + "loss": 1.531, + "step": 7024 + }, + { + "epoch": 0.937416599946624, + "grad_norm": 0.9323275478960292, + "learning_rate": 2.047286527191117e-07, + "loss": 1.5362, + "step": 7025 + }, + { + "epoch": 0.9375500400320256, + "grad_norm": 0.9191649723560835, + "learning_rate": 2.0385951549259486e-07, + "loss": 1.5815, + "step": 7026 + }, + { + "epoch": 0.9376834801174273, + "grad_norm": 1.032563096761044, + "learning_rate": 2.0299220807358578e-07, + "loss": 1.5741, + "step": 7027 + }, + { + "epoch": 0.9378169202028289, + "grad_norm": 0.987366129369346, + "learning_rate": 2.0212673062409038e-07, + "loss": 1.5807, + "step": 7028 + }, + { + "epoch": 0.9379503602882305, + "grad_norm": 0.9364694767430964, + "learning_rate": 2.0126308330576937e-07, + "loss": 1.5519, + "step": 7029 + }, + { + "epoch": 0.9380838003736323, + "grad_norm": 1.4286340232745998, + "learning_rate": 2.004012662799404e-07, + "loss": 1.5545, + "step": 7030 + }, + { + "epoch": 0.9382172404590339, + "grad_norm": 0.935633863001671, + "learning_rate": 1.9954127970758131e-07, + "loss": 1.5272, + "step": 7031 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 0.9511612705616507, + "learning_rate": 1.986831237493303e-07, + "loss": 1.5378, + "step": 7032 + }, + { + "epoch": 0.9384841206298372, + "grad_norm": 0.9426846301005706, + "learning_rate": 1.9782679856548025e-07, + "loss": 1.5306, + "step": 7033 + }, + { + "epoch": 0.9386175607152388, + 
"grad_norm": 1.055465469484963, + "learning_rate": 1.969723043159799e-07, + "loss": 1.5642, + "step": 7034 + }, + { + "epoch": 0.9387510008006406, + "grad_norm": 0.9183296763521461, + "learning_rate": 1.961196411604438e-07, + "loss": 1.5428, + "step": 7035 + }, + { + "epoch": 0.9388844408860422, + "grad_norm": 0.9134482621394847, + "learning_rate": 1.9526880925813673e-07, + "loss": 1.501, + "step": 7036 + }, + { + "epoch": 0.9390178809714438, + "grad_norm": 0.9419067060611698, + "learning_rate": 1.9441980876798493e-07, + "loss": 1.5601, + "step": 7037 + }, + { + "epoch": 0.9391513210568455, + "grad_norm": 0.9369479568180347, + "learning_rate": 1.9357263984857044e-07, + "loss": 1.5471, + "step": 7038 + }, + { + "epoch": 0.9392847611422471, + "grad_norm": 0.9557500312485121, + "learning_rate": 1.9272730265813887e-07, + "loss": 1.6029, + "step": 7039 + }, + { + "epoch": 0.9394182012276487, + "grad_norm": 1.0554599506743223, + "learning_rate": 1.9188379735458618e-07, + "loss": 1.5511, + "step": 7040 + }, + { + "epoch": 0.9395516413130505, + "grad_norm": 1.0458439629457223, + "learning_rate": 1.9104212409546964e-07, + "loss": 1.5568, + "step": 7041 + }, + { + "epoch": 0.9396850813984521, + "grad_norm": 1.0948545675024355, + "learning_rate": 1.902022830380068e-07, + "loss": 1.537, + "step": 7042 + }, + { + "epoch": 0.9398185214838537, + "grad_norm": 1.004814199766514, + "learning_rate": 1.8936427433906778e-07, + "loss": 1.5177, + "step": 7043 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 0.9607332814184879, + "learning_rate": 1.8852809815518514e-07, + "loss": 1.5538, + "step": 7044 + }, + { + "epoch": 0.940085401654657, + "grad_norm": 0.9487552535897916, + "learning_rate": 1.8769375464254503e-07, + "loss": 1.5562, + "step": 7045 + }, + { + "epoch": 0.9402188417400588, + "grad_norm": 0.9083170430971447, + "learning_rate": 1.868612439569939e-07, + "loss": 1.5094, + "step": 7046 + }, + { + "epoch": 0.9403522818254604, + "grad_norm": 0.9634251033944141, + "learning_rate": 1.8603056625403627e-07, + "loss": 1.5631, + "step": 7047 + }, + { + "epoch": 0.940485721910862, + "grad_norm": 0.9656510203512615, + "learning_rate": 1.8520172168883243e-07, + "loss": 1.5834, + "step": 7048 + }, + { + "epoch": 0.9406191619962637, + "grad_norm": 1.0000660756193365, + "learning_rate": 1.843747104162008e-07, + "loss": 1.582, + "step": 7049 + }, + { + "epoch": 0.9407526020816653, + "grad_norm": 1.1327819091986768, + "learning_rate": 1.8354953259061892e-07, + "loss": 1.5539, + "step": 7050 + }, + { + "epoch": 0.940886042167067, + "grad_norm": 0.9810323631990697, + "learning_rate": 1.8272618836621902e-07, + "loss": 1.5371, + "step": 7051 + }, + { + "epoch": 0.9410194822524687, + "grad_norm": 0.9130660342403393, + "learning_rate": 1.819046778967948e-07, + "loss": 1.522, + "step": 7052 + }, + { + "epoch": 0.9411529223378703, + "grad_norm": 0.9194929376758173, + "learning_rate": 1.8108500133579233e-07, + "loss": 1.4903, + "step": 7053 + }, + { + "epoch": 0.9412863624232719, + "grad_norm": 1.2769265280912176, + "learning_rate": 1.8026715883631917e-07, + "loss": 1.5293, + "step": 7054 + }, + { + "epoch": 0.9414198025086736, + "grad_norm": 0.983577585375118, + "learning_rate": 1.7945115055113983e-07, + "loss": 1.5471, + "step": 7055 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 1.2831644360579264, + "learning_rate": 1.7863697663267231e-07, + "loss": 1.5696, + "step": 7056 + }, + { + "epoch": 0.9416866826794769, + "grad_norm": 0.9321407428486242, + "learning_rate": 1.7782463723299947e-07, + "loss": 1.5499, + 
"step": 7057 + }, + { + "epoch": 0.9418201227648786, + "grad_norm": 0.9239769506776327, + "learning_rate": 1.7701413250385324e-07, + "loss": 1.552, + "step": 7058 + }, + { + "epoch": 0.9419535628502802, + "grad_norm": 0.9822924040248578, + "learning_rate": 1.7620546259662808e-07, + "loss": 1.5701, + "step": 7059 + }, + { + "epoch": 0.9420870029356819, + "grad_norm": 0.9264197624873486, + "learning_rate": 1.7539862766237536e-07, + "loss": 1.5301, + "step": 7060 + }, + { + "epoch": 0.9422204430210835, + "grad_norm": 0.9098879290150069, + "learning_rate": 1.7459362785180122e-07, + "loss": 1.497, + "step": 7061 + }, + { + "epoch": 0.9423538831064852, + "grad_norm": 0.9543713844981121, + "learning_rate": 1.7379046331527094e-07, + "loss": 1.4844, + "step": 7062 + }, + { + "epoch": 0.9424873231918869, + "grad_norm": 1.091370085062858, + "learning_rate": 1.729891342028067e-07, + "loss": 1.5259, + "step": 7063 + }, + { + "epoch": 0.9426207632772885, + "grad_norm": 1.0782943171835628, + "learning_rate": 1.7218964066408773e-07, + "loss": 1.5141, + "step": 7064 + }, + { + "epoch": 0.9427542033626901, + "grad_norm": 0.9167701552489729, + "learning_rate": 1.7139198284845005e-07, + "loss": 1.4914, + "step": 7065 + }, + { + "epoch": 0.9428876434480918, + "grad_norm": 0.9315361672522141, + "learning_rate": 1.7059616090488561e-07, + "loss": 1.553, + "step": 7066 + }, + { + "epoch": 0.9430210835334935, + "grad_norm": 0.941731632670938, + "learning_rate": 1.6980217498204777e-07, + "loss": 1.5184, + "step": 7067 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 1.0951605019526698, + "learning_rate": 1.6901002522824228e-07, + "loss": 1.518, + "step": 7068 + }, + { + "epoch": 0.9432879637042968, + "grad_norm": 0.9413032824460642, + "learning_rate": 1.6821971179143415e-07, + "loss": 1.4946, + "step": 7069 + }, + { + "epoch": 0.9434214037896984, + "grad_norm": 0.9158286385674823, + "learning_rate": 1.6743123481924417e-07, + "loss": 1.5623, + "step": 7070 + }, + { + "epoch": 0.9435548438751001, + "grad_norm": 0.9396586297163264, + "learning_rate": 1.666445944589523e-07, + "loss": 1.5608, + "step": 7071 + }, + { + "epoch": 0.9436882839605018, + "grad_norm": 0.9392711665239271, + "learning_rate": 1.6585979085749326e-07, + "loss": 1.5315, + "step": 7072 + }, + { + "epoch": 0.9438217240459034, + "grad_norm": 0.9382386473834549, + "learning_rate": 1.6507682416145865e-07, + "loss": 1.578, + "step": 7073 + }, + { + "epoch": 0.9439551641313051, + "grad_norm": 0.9007505568219435, + "learning_rate": 1.6429569451709925e-07, + "loss": 1.5409, + "step": 7074 + }, + { + "epoch": 0.9440886042167067, + "grad_norm": 0.9402668564767092, + "learning_rate": 1.635164020703206e-07, + "loss": 1.5789, + "step": 7075 + }, + { + "epoch": 0.9442220443021083, + "grad_norm": 0.9642072770932624, + "learning_rate": 1.6273894696668514e-07, + "loss": 1.5041, + "step": 7076 + }, + { + "epoch": 0.94435548438751, + "grad_norm": 0.9169461721627207, + "learning_rate": 1.6196332935141225e-07, + "loss": 1.5472, + "step": 7077 + }, + { + "epoch": 0.9444889244729117, + "grad_norm": 1.3273948346582414, + "learning_rate": 1.6118954936938046e-07, + "loss": 1.5359, + "step": 7078 + }, + { + "epoch": 0.9446223645583133, + "grad_norm": 1.129474162644648, + "learning_rate": 1.604176071651209e-07, + "loss": 1.5153, + "step": 7079 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 0.9293810385080555, + "learning_rate": 1.596475028828237e-07, + "loss": 1.5414, + "step": 7080 + }, + { + "epoch": 0.9448892447291166, + "grad_norm": 0.9382288057598555, + 
"learning_rate": 1.588792366663383e-07, + "loss": 1.5622, + "step": 7081 + }, + { + "epoch": 0.9450226848145182, + "grad_norm": 0.989321226514514, + "learning_rate": 1.5811280865916435e-07, + "loss": 1.5804, + "step": 7082 + }, + { + "epoch": 0.94515612489992, + "grad_norm": 0.9390344091231897, + "learning_rate": 1.5734821900446507e-07, + "loss": 1.5219, + "step": 7083 + }, + { + "epoch": 0.9452895649853216, + "grad_norm": 1.0089198056152286, + "learning_rate": 1.56585467845054e-07, + "loss": 1.5103, + "step": 7084 + }, + { + "epoch": 0.9454230050707233, + "grad_norm": 0.9927138491828363, + "learning_rate": 1.5582455532340836e-07, + "loss": 1.5626, + "step": 7085 + }, + { + "epoch": 0.9455564451561249, + "grad_norm": 6.151724858717579, + "learning_rate": 1.5506548158165437e-07, + "loss": 1.5719, + "step": 7086 + }, + { + "epoch": 0.9456898852415265, + "grad_norm": 0.9752024728609847, + "learning_rate": 1.5430824676157974e-07, + "loss": 1.5091, + "step": 7087 + }, + { + "epoch": 0.9458233253269283, + "grad_norm": 0.9228271908165766, + "learning_rate": 1.535528510046258e-07, + "loss": 1.5471, + "step": 7088 + }, + { + "epoch": 0.9459567654123299, + "grad_norm": 0.9142093131081027, + "learning_rate": 1.5279929445189523e-07, + "loss": 1.56, + "step": 7089 + }, + { + "epoch": 0.9460902054977315, + "grad_norm": 1.040271354196889, + "learning_rate": 1.5204757724414098e-07, + "loss": 1.5649, + "step": 7090 + }, + { + "epoch": 0.9462236455831332, + "grad_norm": 0.944030106842017, + "learning_rate": 1.512976995217774e-07, + "loss": 1.5361, + "step": 7091 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 0.9345127025549201, + "learning_rate": 1.5054966142487027e-07, + "loss": 1.5856, + "step": 7092 + }, + { + "epoch": 0.9464905257539364, + "grad_norm": 0.9282759654458305, + "learning_rate": 1.4980346309314776e-07, + "loss": 1.5434, + "step": 7093 + }, + { + "epoch": 0.9466239658393382, + "grad_norm": 0.9226489628851323, + "learning_rate": 1.4905910466598727e-07, + "loss": 1.569, + "step": 7094 + }, + { + "epoch": 0.9467574059247398, + "grad_norm": 0.9279025309090545, + "learning_rate": 1.4831658628243096e-07, + "loss": 1.5276, + "step": 7095 + }, + { + "epoch": 0.9468908460101414, + "grad_norm": 0.9423761860152485, + "learning_rate": 1.4757590808117006e-07, + "loss": 1.5249, + "step": 7096 + }, + { + "epoch": 0.9470242860955431, + "grad_norm": 1.004907421039197, + "learning_rate": 1.4683707020055614e-07, + "loss": 1.5448, + "step": 7097 + }, + { + "epoch": 0.9471577261809447, + "grad_norm": 0.9258429181621549, + "learning_rate": 1.4610007277859328e-07, + "loss": 1.5371, + "step": 7098 + }, + { + "epoch": 0.9472911662663465, + "grad_norm": 0.9395593033423324, + "learning_rate": 1.453649159529469e-07, + "loss": 1.5647, + "step": 7099 + }, + { + "epoch": 0.9474246063517481, + "grad_norm": 0.9180402841843257, + "learning_rate": 1.446315998609349e-07, + "loss": 1.5026, + "step": 7100 + }, + { + "epoch": 0.9475580464371497, + "grad_norm": 0.9387391943964363, + "learning_rate": 1.4390012463953329e-07, + "loss": 1.4874, + "step": 7101 + }, + { + "epoch": 0.9476914865225514, + "grad_norm": 0.9403053185795006, + "learning_rate": 1.4317049042537167e-07, + "loss": 1.557, + "step": 7102 + }, + { + "epoch": 0.947824926607953, + "grad_norm": 0.9716572056708245, + "learning_rate": 1.424426973547377e-07, + "loss": 1.5695, + "step": 7103 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 0.9638053337694246, + "learning_rate": 1.4171674556357705e-07, + "loss": 1.528, + "step": 7104 + }, + { + "epoch": 
0.9480918067787564, + "grad_norm": 0.9326742291482022, + "learning_rate": 1.4099263518748574e-07, + "loss": 1.5284, + "step": 7105 + }, + { + "epoch": 0.948225246864158, + "grad_norm": 0.9202879189907175, + "learning_rate": 1.402703663617222e-07, + "loss": 1.5171, + "step": 7106 + }, + { + "epoch": 0.9483586869495596, + "grad_norm": 0.9499293926304142, + "learning_rate": 1.395499392211963e-07, + "loss": 1.5009, + "step": 7107 + }, + { + "epoch": 0.9484921270349613, + "grad_norm": 0.9191237857115953, + "learning_rate": 1.388313539004771e-07, + "loss": 1.5325, + "step": 7108 + }, + { + "epoch": 0.948625567120363, + "grad_norm": 0.9281253464726291, + "learning_rate": 1.3811461053378606e-07, + "loss": 1.5305, + "step": 7109 + }, + { + "epoch": 0.9487590072057646, + "grad_norm": 0.9391089740766908, + "learning_rate": 1.3739970925500613e-07, + "loss": 1.5645, + "step": 7110 + }, + { + "epoch": 0.9488924472911663, + "grad_norm": 0.9906753900198737, + "learning_rate": 1.3668665019766937e-07, + "loss": 1.5169, + "step": 7111 + }, + { + "epoch": 0.9490258873765679, + "grad_norm": 0.9307202205615231, + "learning_rate": 1.35975433494967e-07, + "loss": 1.5244, + "step": 7112 + }, + { + "epoch": 0.9491593274619696, + "grad_norm": 0.9307053074686159, + "learning_rate": 1.3526605927974946e-07, + "loss": 1.5106, + "step": 7113 + }, + { + "epoch": 0.9492927675473712, + "grad_norm": 0.9528924062350823, + "learning_rate": 1.3455852768451738e-07, + "loss": 1.4496, + "step": 7114 + }, + { + "epoch": 0.9494262076327729, + "grad_norm": 1.4092694693096828, + "learning_rate": 1.3385283884143064e-07, + "loss": 1.5641, + "step": 7115 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 0.9508352494225291, + "learning_rate": 1.3314899288230154e-07, + "loss": 1.5428, + "step": 7116 + }, + { + "epoch": 0.9496930878035762, + "grad_norm": 1.0036182199394033, + "learning_rate": 1.3244698993860383e-07, + "loss": 1.5242, + "step": 7117 + }, + { + "epoch": 0.9498265278889778, + "grad_norm": 0.9239362307261929, + "learning_rate": 1.317468301414615e-07, + "loss": 1.5461, + "step": 7118 + }, + { + "epoch": 0.9499599679743795, + "grad_norm": 0.9265412901982545, + "learning_rate": 1.310485136216566e-07, + "loss": 1.5534, + "step": 7119 + }, + { + "epoch": 0.9500934080597812, + "grad_norm": 0.9381184097035782, + "learning_rate": 1.3035204050962702e-07, + "loss": 1.5232, + "step": 7120 + }, + { + "epoch": 0.9502268481451828, + "grad_norm": 1.0080877900305665, + "learning_rate": 1.2965741093546757e-07, + "loss": 1.5221, + "step": 7121 + }, + { + "epoch": 0.9503602882305845, + "grad_norm": 0.9245848574437733, + "learning_rate": 1.289646250289245e-07, + "loss": 1.5618, + "step": 7122 + }, + { + "epoch": 0.9504937283159861, + "grad_norm": 0.9340954109480951, + "learning_rate": 1.282736829194031e-07, + "loss": 1.539, + "step": 7123 + }, + { + "epoch": 0.9506271684013877, + "grad_norm": 0.9413706371278181, + "learning_rate": 1.2758458473596468e-07, + "loss": 1.5756, + "step": 7124 + }, + { + "epoch": 0.9507606084867894, + "grad_norm": 0.9435011931603419, + "learning_rate": 1.2689733060732512e-07, + "loss": 1.5168, + "step": 7125 + }, + { + "epoch": 0.9508940485721911, + "grad_norm": 0.9640763949138873, + "learning_rate": 1.2621192066185394e-07, + "loss": 1.5767, + "step": 7126 + }, + { + "epoch": 0.9510274886575928, + "grad_norm": 1.0666760841057328, + "learning_rate": 1.2552835502757765e-07, + "loss": 1.5459, + "step": 7127 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 0.9177228398177878, + "learning_rate": 
1.2484663383217964e-07, + "loss": 1.483, + "step": 7128 + }, + { + "epoch": 0.951294368828396, + "grad_norm": 0.9400026184127935, + "learning_rate": 1.241667572029992e-07, + "loss": 1.5753, + "step": 7129 + }, + { + "epoch": 0.9514278089137977, + "grad_norm": 1.0492605217878297, + "learning_rate": 1.2348872526702693e-07, + "loss": 1.5503, + "step": 7130 + }, + { + "epoch": 0.9515612489991994, + "grad_norm": 0.9251817708144794, + "learning_rate": 1.2281253815091154e-07, + "loss": 1.5143, + "step": 7131 + }, + { + "epoch": 0.951694689084601, + "grad_norm": 0.940577041105443, + "learning_rate": 1.2213819598095979e-07, + "loss": 1.5524, + "step": 7132 + }, + { + "epoch": 0.9518281291700027, + "grad_norm": 0.9912924069921497, + "learning_rate": 1.2146569888312865e-07, + "loss": 1.505, + "step": 7133 + }, + { + "epoch": 0.9519615692554043, + "grad_norm": 1.0163650859754099, + "learning_rate": 1.2079504698303324e-07, + "loss": 1.5415, + "step": 7134 + }, + { + "epoch": 0.9520950093408059, + "grad_norm": 0.91474346735063, + "learning_rate": 1.2012624040594445e-07, + "loss": 1.5454, + "step": 7135 + }, + { + "epoch": 0.9522284494262077, + "grad_norm": 0.9327035920232858, + "learning_rate": 1.194592792767879e-07, + "loss": 1.5117, + "step": 7136 + }, + { + "epoch": 0.9523618895116093, + "grad_norm": 0.9217306273503845, + "learning_rate": 1.1879416372014285e-07, + "loss": 1.529, + "step": 7137 + }, + { + "epoch": 0.9524953295970109, + "grad_norm": 0.9609918293396442, + "learning_rate": 1.1813089386024657e-07, + "loss": 1.5807, + "step": 7138 + }, + { + "epoch": 0.9526287696824126, + "grad_norm": 1.112063642124847, + "learning_rate": 1.1746946982098995e-07, + "loss": 1.5532, + "step": 7139 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 0.9466676036098123, + "learning_rate": 1.1680989172591972e-07, + "loss": 1.52, + "step": 7140 + }, + { + "epoch": 0.952895649853216, + "grad_norm": 0.9269239682104546, + "learning_rate": 1.1615215969823734e-07, + "loss": 1.5417, + "step": 7141 + }, + { + "epoch": 0.9530290899386176, + "grad_norm": 1.38625103234689, + "learning_rate": 1.1549627386080009e-07, + "loss": 1.5533, + "step": 7142 + }, + { + "epoch": 0.9531625300240192, + "grad_norm": 0.9665414589291429, + "learning_rate": 1.1484223433611885e-07, + "loss": 1.5013, + "step": 7143 + }, + { + "epoch": 0.9532959701094209, + "grad_norm": 1.1016470069768787, + "learning_rate": 1.1419004124636146e-07, + "loss": 1.5066, + "step": 7144 + }, + { + "epoch": 0.9534294101948225, + "grad_norm": 1.1136680127273073, + "learning_rate": 1.1353969471335047e-07, + "loss": 1.5206, + "step": 7145 + }, + { + "epoch": 0.9535628502802241, + "grad_norm": 0.9664150023916923, + "learning_rate": 1.1289119485856315e-07, + "loss": 1.5303, + "step": 7146 + }, + { + "epoch": 0.9536962903656259, + "grad_norm": 0.9298992429566982, + "learning_rate": 1.122445418031315e-07, + "loss": 1.5278, + "step": 7147 + }, + { + "epoch": 0.9538297304510275, + "grad_norm": 1.1638106219236741, + "learning_rate": 1.1159973566784221e-07, + "loss": 1.5079, + "step": 7148 + }, + { + "epoch": 0.9539631705364291, + "grad_norm": 0.9586600372591042, + "learning_rate": 1.1095677657314008e-07, + "loss": 1.5023, + "step": 7149 + }, + { + "epoch": 0.9540966106218308, + "grad_norm": 0.9231194918030783, + "learning_rate": 1.1031566463912014e-07, + "loss": 1.5314, + "step": 7150 + }, + { + "epoch": 0.9542300507072324, + "grad_norm": 0.9169704635554706, + "learning_rate": 1.0967639998553659e-07, + "loss": 1.5137, + "step": 7151 + }, + { + "epoch": 0.9543634907926342, + 
"grad_norm": 0.9623890401190696, + "learning_rate": 1.0903898273179503e-07, + "loss": 1.54, + "step": 7152 + }, + { + "epoch": 0.9544969308780358, + "grad_norm": 0.9269400628043825, + "learning_rate": 1.0840341299695911e-07, + "loss": 1.5609, + "step": 7153 + }, + { + "epoch": 0.9546303709634374, + "grad_norm": 0.9465474447152321, + "learning_rate": 1.0776969089974609e-07, + "loss": 1.5059, + "step": 7154 + }, + { + "epoch": 0.9547638110488391, + "grad_norm": 0.9356339177793419, + "learning_rate": 1.0713781655852684e-07, + "loss": 1.545, + "step": 7155 + }, + { + "epoch": 0.9548972511342407, + "grad_norm": 0.927542167441195, + "learning_rate": 1.0650779009132917e-07, + "loss": 1.5608, + "step": 7156 + }, + { + "epoch": 0.9550306912196423, + "grad_norm": 0.9369805873387638, + "learning_rate": 1.0587961161583448e-07, + "loss": 1.5508, + "step": 7157 + }, + { + "epoch": 0.9551641313050441, + "grad_norm": 0.9523970569734105, + "learning_rate": 1.0525328124938006e-07, + "loss": 1.5117, + "step": 7158 + }, + { + "epoch": 0.9552975713904457, + "grad_norm": 0.9097601640170604, + "learning_rate": 1.0462879910895674e-07, + "loss": 1.5232, + "step": 7159 + }, + { + "epoch": 0.9554310114758473, + "grad_norm": 0.9589791998825555, + "learning_rate": 1.0400616531121011e-07, + "loss": 1.56, + "step": 7160 + }, + { + "epoch": 0.955564451561249, + "grad_norm": 0.9522319609125601, + "learning_rate": 1.033853799724427e-07, + "loss": 1.543, + "step": 7161 + }, + { + "epoch": 0.9556978916466506, + "grad_norm": 0.9285567819238145, + "learning_rate": 1.0276644320860729e-07, + "loss": 1.5558, + "step": 7162 + }, + { + "epoch": 0.9558313317320523, + "grad_norm": 0.9595527770779047, + "learning_rate": 1.021493551353181e-07, + "loss": 1.5226, + "step": 7163 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 0.9018522623887555, + "learning_rate": 1.0153411586783734e-07, + "loss": 1.5229, + "step": 7164 + }, + { + "epoch": 0.9560982119028556, + "grad_norm": 1.0712384937201727, + "learning_rate": 1.0092072552108534e-07, + "loss": 1.5517, + "step": 7165 + }, + { + "epoch": 0.9562316519882573, + "grad_norm": 0.9341382301028408, + "learning_rate": 1.0030918420963598e-07, + "loss": 1.5233, + "step": 7166 + }, + { + "epoch": 0.9563650920736589, + "grad_norm": 0.9928816977811972, + "learning_rate": 9.969949204772011e-08, + "loss": 1.5388, + "step": 7167 + }, + { + "epoch": 0.9564985321590606, + "grad_norm": 0.9410197074503386, + "learning_rate": 9.909164914921886e-08, + "loss": 1.6122, + "step": 7168 + }, + { + "epoch": 0.9566319722444623, + "grad_norm": 0.9028807413403246, + "learning_rate": 9.848565562767143e-08, + "loss": 1.4786, + "step": 7169 + }, + { + "epoch": 0.9567654123298639, + "grad_norm": 0.9567117405537122, + "learning_rate": 9.788151159627168e-08, + "loss": 1.5112, + "step": 7170 + }, + { + "epoch": 0.9568988524152655, + "grad_norm": 0.9811327272197673, + "learning_rate": 9.727921716786492e-08, + "loss": 1.5945, + "step": 7171 + }, + { + "epoch": 0.9570322925006672, + "grad_norm": 1.0524690809980066, + "learning_rate": 9.667877245495338e-08, + "loss": 1.5584, + "step": 7172 + }, + { + "epoch": 0.9571657325860689, + "grad_norm": 0.9613936267762925, + "learning_rate": 9.608017756969512e-08, + "loss": 1.5532, + "step": 7173 + }, + { + "epoch": 0.9572991726714705, + "grad_norm": 1.2156726555516593, + "learning_rate": 9.548343262389736e-08, + "loss": 1.6236, + "step": 7174 + }, + { + "epoch": 0.9574326127568722, + "grad_norm": 0.9579182785549096, + "learning_rate": 9.488853772902762e-08, + "loss": 1.5155, + 
"step": 7175 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 0.9311509207700649, + "learning_rate": 9.429549299620589e-08, + "loss": 1.5794, + "step": 7176 + }, + { + "epoch": 0.9576994929276754, + "grad_norm": 0.9409890634401489, + "learning_rate": 9.370429853620578e-08, + "loss": 1.5597, + "step": 7177 + }, + { + "epoch": 0.9578329330130771, + "grad_norm": 0.9199706623701136, + "learning_rate": 9.311495445945451e-08, + "loss": 1.5511, + "step": 7178 + }, + { + "epoch": 0.9579663730984788, + "grad_norm": 0.9160829798550142, + "learning_rate": 9.252746087603626e-08, + "loss": 1.5512, + "step": 7179 + }, + { + "epoch": 0.9580998131838805, + "grad_norm": 0.9742308156882673, + "learning_rate": 9.194181789568657e-08, + "loss": 1.5763, + "step": 7180 + }, + { + "epoch": 0.9582332532692821, + "grad_norm": 1.6586965890912353, + "learning_rate": 9.135802562779794e-08, + "loss": 1.4843, + "step": 7181 + }, + { + "epoch": 0.9583666933546837, + "grad_norm": 0.9929783009078508, + "learning_rate": 9.077608418141648e-08, + "loss": 1.565, + "step": 7182 + }, + { + "epoch": 0.9585001334400854, + "grad_norm": 0.9342398271698058, + "learning_rate": 9.019599366524079e-08, + "loss": 1.524, + "step": 7183 + }, + { + "epoch": 0.9586335735254871, + "grad_norm": 0.9169837288541282, + "learning_rate": 8.961775418762752e-08, + "loss": 1.495, + "step": 7184 + }, + { + "epoch": 0.9587670136108887, + "grad_norm": 0.9084724578571916, + "learning_rate": 8.904136585658251e-08, + "loss": 1.5988, + "step": 7185 + }, + { + "epoch": 0.9589004536962904, + "grad_norm": 0.9504512059920357, + "learning_rate": 8.846682877977075e-08, + "loss": 1.5612, + "step": 7186 + }, + { + "epoch": 0.959033893781692, + "grad_norm": 0.9505995914507539, + "learning_rate": 8.78941430645075e-08, + "loss": 1.5368, + "step": 7187 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 1.1809298932125372, + "learning_rate": 8.732330881776608e-08, + "loss": 1.5281, + "step": 7188 + }, + { + "epoch": 0.9593007739524954, + "grad_norm": 0.9314187782139279, + "learning_rate": 8.675432614617008e-08, + "loss": 1.5489, + "step": 7189 + }, + { + "epoch": 0.959434214037897, + "grad_norm": 0.9232377132180151, + "learning_rate": 8.618719515599894e-08, + "loss": 1.5449, + "step": 7190 + }, + { + "epoch": 0.9595676541232986, + "grad_norm": 1.0270042691413004, + "learning_rate": 8.56219159531868e-08, + "loss": 1.5986, + "step": 7191 + }, + { + "epoch": 0.9597010942087003, + "grad_norm": 1.2061551149492193, + "learning_rate": 8.505848864332145e-08, + "loss": 1.5768, + "step": 7192 + }, + { + "epoch": 0.9598345342941019, + "grad_norm": 0.9340146167897232, + "learning_rate": 8.449691333164423e-08, + "loss": 1.5385, + "step": 7193 + }, + { + "epoch": 0.9599679743795037, + "grad_norm": 0.9296173201446742, + "learning_rate": 8.393719012305124e-08, + "loss": 1.5808, + "step": 7194 + }, + { + "epoch": 0.9601014144649053, + "grad_norm": 0.9802388493036821, + "learning_rate": 8.337931912209329e-08, + "loss": 1.6037, + "step": 7195 + }, + { + "epoch": 0.9602348545503069, + "grad_norm": 0.9331120869630394, + "learning_rate": 8.28233004329737e-08, + "loss": 1.5445, + "step": 7196 + }, + { + "epoch": 0.9603682946357086, + "grad_norm": 1.025665909891443, + "learning_rate": 8.22691341595494e-08, + "loss": 1.5029, + "step": 7197 + }, + { + "epoch": 0.9605017347211102, + "grad_norm": 0.9441651844649627, + "learning_rate": 8.171682040533314e-08, + "loss": 1.5366, + "step": 7198 + }, + { + "epoch": 0.9606351748065118, + "grad_norm": 0.914204298926644, + "learning_rate": 
8.116635927349126e-08, + "loss": 1.4671, + "step": 7199 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 0.9547090251419438, + "learning_rate": 8.061775086684376e-08, + "loss": 1.5321, + "step": 7200 + }, + { + "epoch": 0.9609020549773152, + "grad_norm": 0.9231643897440952, + "learning_rate": 8.007099528786311e-08, + "loss": 1.4749, + "step": 7201 + }, + { + "epoch": 0.9610354950627168, + "grad_norm": 0.9330739495247855, + "learning_rate": 7.952609263867872e-08, + "loss": 1.549, + "step": 7202 + }, + { + "epoch": 0.9611689351481185, + "grad_norm": 0.9436536688017126, + "learning_rate": 7.89830430210714e-08, + "loss": 1.573, + "step": 7203 + }, + { + "epoch": 0.9613023752335201, + "grad_norm": 0.9289138338185312, + "learning_rate": 7.844184653647669e-08, + "loss": 1.4821, + "step": 7204 + }, + { + "epoch": 0.9614358153189217, + "grad_norm": 1.0587966606284311, + "learning_rate": 7.79025032859837e-08, + "loss": 1.5745, + "step": 7205 + }, + { + "epoch": 0.9615692554043235, + "grad_norm": 0.9391981736849577, + "learning_rate": 7.736501337033519e-08, + "loss": 1.5196, + "step": 7206 + }, + { + "epoch": 0.9617026954897251, + "grad_norm": 0.921984477255766, + "learning_rate": 7.682937688992975e-08, + "loss": 1.5816, + "step": 7207 + }, + { + "epoch": 0.9618361355751268, + "grad_norm": 0.9210876386975504, + "learning_rate": 7.629559394481622e-08, + "loss": 1.5665, + "step": 7208 + }, + { + "epoch": 0.9619695756605284, + "grad_norm": 0.9320013031423707, + "learning_rate": 7.576366463470042e-08, + "loss": 1.4974, + "step": 7209 + }, + { + "epoch": 0.96210301574593, + "grad_norm": 1.108953281361308, + "learning_rate": 7.523358905894063e-08, + "loss": 1.548, + "step": 7210 + }, + { + "epoch": 0.9622364558313318, + "grad_norm": 0.9286055208909001, + "learning_rate": 7.470536731654876e-08, + "loss": 1.4809, + "step": 7211 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 0.9397318541414119, + "learning_rate": 7.417899950619035e-08, + "loss": 1.5441, + "step": 7212 + }, + { + "epoch": 0.962503336002135, + "grad_norm": 0.928266538779687, + "learning_rate": 7.36544857261845e-08, + "loss": 1.5575, + "step": 7213 + }, + { + "epoch": 0.9626367760875367, + "grad_norm": 1.1602227605302893, + "learning_rate": 7.31318260745062e-08, + "loss": 1.4993, + "step": 7214 + }, + { + "epoch": 0.9627702161729383, + "grad_norm": 0.933054423819918, + "learning_rate": 7.261102064878067e-08, + "loss": 1.5281, + "step": 7215 + }, + { + "epoch": 0.96290365625834, + "grad_norm": 0.984048837363307, + "learning_rate": 7.2092069546289e-08, + "loss": 1.5852, + "step": 7216 + }, + { + "epoch": 0.9630370963437417, + "grad_norm": 0.9736412117865043, + "learning_rate": 7.157497286396475e-08, + "loss": 1.5516, + "step": 7217 + }, + { + "epoch": 0.9631705364291433, + "grad_norm": 1.0788935314073798, + "learning_rate": 7.105973069839622e-08, + "loss": 1.5497, + "step": 7218 + }, + { + "epoch": 0.963303976514545, + "grad_norm": 1.0829067853857433, + "learning_rate": 7.054634314582531e-08, + "loss": 1.5787, + "step": 7219 + }, + { + "epoch": 0.9634374165999466, + "grad_norm": 0.9424299932130654, + "learning_rate": 7.003481030214642e-08, + "loss": 1.5995, + "step": 7220 + }, + { + "epoch": 0.9635708566853483, + "grad_norm": 0.9587712887267895, + "learning_rate": 6.952513226290758e-08, + "loss": 1.5399, + "step": 7221 + }, + { + "epoch": 0.96370429677075, + "grad_norm": 0.9129861391767959, + "learning_rate": 6.90173091233115e-08, + "loss": 1.5154, + "step": 7222 + }, + { + "epoch": 0.9638377368561516, + "grad_norm": 
1.0939988766367443, + "learning_rate": 6.851134097821344e-08, + "loss": 1.5632, + "step": 7223 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 0.936664559385459, + "learning_rate": 6.800722792212333e-08, + "loss": 1.5212, + "step": 7224 + }, + { + "epoch": 0.9641046170269549, + "grad_norm": 0.93098974114669, + "learning_rate": 6.750497004920253e-08, + "loss": 1.5135, + "step": 7225 + }, + { + "epoch": 0.9642380571123566, + "grad_norm": 0.9927276595661103, + "learning_rate": 6.700456745326822e-08, + "loss": 1.5372, + "step": 7226 + }, + { + "epoch": 0.9643714971977582, + "grad_norm": 1.0843133033461316, + "learning_rate": 6.650602022778785e-08, + "loss": 1.5053, + "step": 7227 + }, + { + "epoch": 0.9645049372831599, + "grad_norm": 1.1442758247625016, + "learning_rate": 6.600932846588692e-08, + "loss": 1.5681, + "step": 7228 + }, + { + "epoch": 0.9646383773685615, + "grad_norm": 0.9544653994484005, + "learning_rate": 6.551449226034124e-08, + "loss": 1.4815, + "step": 7229 + }, + { + "epoch": 0.9647718174539631, + "grad_norm": 0.9771673417993357, + "learning_rate": 6.502151170357906e-08, + "loss": 1.5094, + "step": 7230 + }, + { + "epoch": 0.9649052575393648, + "grad_norm": 0.9952586049134462, + "learning_rate": 6.453038688768454e-08, + "loss": 1.5057, + "step": 7231 + }, + { + "epoch": 0.9650386976247665, + "grad_norm": 0.9468439711909263, + "learning_rate": 6.404111790439427e-08, + "loss": 1.5695, + "step": 7232 + }, + { + "epoch": 0.9651721377101682, + "grad_norm": 0.9741329945289054, + "learning_rate": 6.355370484509848e-08, + "loss": 1.5713, + "step": 7233 + }, + { + "epoch": 0.9653055777955698, + "grad_norm": 1.18170513807492, + "learning_rate": 6.306814780083992e-08, + "loss": 1.521, + "step": 7234 + }, + { + "epoch": 0.9654390178809714, + "grad_norm": 0.9705328509871132, + "learning_rate": 6.258444686231491e-08, + "loss": 1.5803, + "step": 7235 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 0.8989950439543595, + "learning_rate": 6.21026021198734e-08, + "loss": 1.4914, + "step": 7236 + }, + { + "epoch": 0.9657058980517748, + "grad_norm": 0.9190782535619162, + "learning_rate": 6.162261366351785e-08, + "loss": 1.5458, + "step": 7237 + }, + { + "epoch": 0.9658393381371764, + "grad_norm": 0.9325432801015195, + "learning_rate": 6.114448158290653e-08, + "loss": 1.557, + "step": 7238 + }, + { + "epoch": 0.9659727782225781, + "grad_norm": 0.9129405790473425, + "learning_rate": 6.066820596734801e-08, + "loss": 1.5548, + "step": 7239 + }, + { + "epoch": 0.9661062183079797, + "grad_norm": 0.9077011984641612, + "learning_rate": 6.019378690580447e-08, + "loss": 1.515, + "step": 7240 + }, + { + "epoch": 0.9662396583933813, + "grad_norm": 0.9878409436375739, + "learning_rate": 5.972122448689278e-08, + "loss": 1.5372, + "step": 7241 + }, + { + "epoch": 0.966373098478783, + "grad_norm": 0.9403367481562539, + "learning_rate": 5.925051879888233e-08, + "loss": 1.531, + "step": 7242 + }, + { + "epoch": 0.9665065385641847, + "grad_norm": 0.9133740011830598, + "learning_rate": 5.878166992969503e-08, + "loss": 1.5515, + "step": 7243 + }, + { + "epoch": 0.9666399786495863, + "grad_norm": 0.9468373367676625, + "learning_rate": 5.8314677966906376e-08, + "loss": 1.5626, + "step": 7244 + }, + { + "epoch": 0.966773418734988, + "grad_norm": 0.9583047387897976, + "learning_rate": 5.784954299774548e-08, + "loss": 1.5667, + "step": 7245 + }, + { + "epoch": 0.9669068588203896, + "grad_norm": 1.1076391186400782, + "learning_rate": 5.738626510909506e-08, + "loss": 1.5306, + "step": 7246 + }, + { + "epoch": 
0.9670402989057914, + "grad_norm": 0.9531885401281327, + "learning_rate": 5.692484438748924e-08, + "loss": 1.5195, + "step": 7247 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 0.9286226294276955, + "learning_rate": 5.646528091911574e-08, + "loss": 1.5468, + "step": 7248 + }, + { + "epoch": 0.9673071790765946, + "grad_norm": 1.0932763529633334, + "learning_rate": 5.6007574789817e-08, + "loss": 1.5583, + "step": 7249 + }, + { + "epoch": 0.9674406191619963, + "grad_norm": 0.9368950168319131, + "learning_rate": 5.5551726085086854e-08, + "loss": 1.5233, + "step": 7250 + }, + { + "epoch": 0.9675740592473979, + "grad_norm": 1.0046496115780676, + "learning_rate": 5.509773489007164e-08, + "loss": 1.5453, + "step": 7251 + }, + { + "epoch": 0.9677074993327995, + "grad_norm": 0.9110736411180317, + "learning_rate": 5.4645601289572414e-08, + "loss": 1.5221, + "step": 7252 + }, + { + "epoch": 0.9678409394182013, + "grad_norm": 0.9490066324920002, + "learning_rate": 5.419532536804384e-08, + "loss": 1.5655, + "step": 7253 + }, + { + "epoch": 0.9679743795036029, + "grad_norm": 0.962378240370102, + "learning_rate": 5.3746907209590856e-08, + "loss": 1.5207, + "step": 7254 + }, + { + "epoch": 0.9681078195890045, + "grad_norm": 1.1335586421572892, + "learning_rate": 5.3300346897973143e-08, + "loss": 1.5354, + "step": 7255 + }, + { + "epoch": 0.9682412596744062, + "grad_norm": 0.9713944583777968, + "learning_rate": 5.285564451660285e-08, + "loss": 1.5339, + "step": 7256 + }, + { + "epoch": 0.9683746997598078, + "grad_norm": 0.9407589092657578, + "learning_rate": 5.2412800148546884e-08, + "loss": 1.5333, + "step": 7257 + }, + { + "epoch": 0.9685081398452094, + "grad_norm": 0.9498219588447119, + "learning_rate": 5.197181387652128e-08, + "loss": 1.5341, + "step": 7258 + }, + { + "epoch": 0.9686415799306112, + "grad_norm": 1.184152865294345, + "learning_rate": 5.153268578289794e-08, + "loss": 1.5766, + "step": 7259 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 0.9171966164362452, + "learning_rate": 5.109541594970235e-08, + "loss": 1.4931, + "step": 7260 + }, + { + "epoch": 0.9689084601014145, + "grad_norm": 0.9294958426402554, + "learning_rate": 5.06600044586103e-08, + "loss": 1.507, + "step": 7261 + }, + { + "epoch": 0.9690419001868161, + "grad_norm": 1.0547850975106445, + "learning_rate": 5.022645139095117e-08, + "loss": 1.568, + "step": 7262 + }, + { + "epoch": 0.9691753402722177, + "grad_norm": 0.9867765801059861, + "learning_rate": 4.979475682770907e-08, + "loss": 1.516, + "step": 7263 + }, + { + "epoch": 0.9693087803576195, + "grad_norm": 0.9697836144717429, + "learning_rate": 4.936492084951949e-08, + "loss": 1.5803, + "step": 7264 + }, + { + "epoch": 0.9694422204430211, + "grad_norm": 0.9720384959061538, + "learning_rate": 4.893694353666934e-08, + "loss": 1.5052, + "step": 7265 + }, + { + "epoch": 0.9695756605284227, + "grad_norm": 0.9188699165623265, + "learning_rate": 4.851082496910242e-08, + "loss": 1.5465, + "step": 7266 + }, + { + "epoch": 0.9697091006138244, + "grad_norm": 0.9508384028938736, + "learning_rate": 4.808656522641064e-08, + "loss": 1.5879, + "step": 7267 + }, + { + "epoch": 0.969842540699226, + "grad_norm": 1.2180128273543447, + "learning_rate": 4.766416438784172e-08, + "loss": 1.5716, + "step": 7268 + }, + { + "epoch": 0.9699759807846277, + "grad_norm": 0.9353158222871983, + "learning_rate": 4.7243622532294756e-08, + "loss": 1.5542, + "step": 7269 + }, + { + "epoch": 0.9701094208700294, + "grad_norm": 0.9324250384056547, + "learning_rate": 4.682493973832358e-08, + "loss": 
1.5415, + "step": 7270 + }, + { + "epoch": 0.970242860955431, + "grad_norm": 0.9495787614825245, + "learning_rate": 4.6408116084132313e-08, + "loss": 1.5307, + "step": 7271 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 0.9474456637216822, + "learning_rate": 4.599315164757867e-08, + "loss": 1.5638, + "step": 7272 + }, + { + "epoch": 0.9705097411262343, + "grad_norm": 0.9360048271462087, + "learning_rate": 4.558004650617398e-08, + "loss": 1.5106, + "step": 7273 + }, + { + "epoch": 0.970643181211636, + "grad_norm": 1.0394175365220302, + "learning_rate": 4.5168800737080965e-08, + "loss": 1.5319, + "step": 7274 + }, + { + "epoch": 0.9707766212970377, + "grad_norm": 0.9512409507257436, + "learning_rate": 4.4759414417117064e-08, + "loss": 1.542, + "step": 7275 + }, + { + "epoch": 0.9709100613824393, + "grad_norm": 0.938780333252122, + "learning_rate": 4.435188762274778e-08, + "loss": 1.5228, + "step": 7276 + }, + { + "epoch": 0.9710435014678409, + "grad_norm": 0.9585863277465944, + "learning_rate": 4.3946220430098885e-08, + "loss": 1.5299, + "step": 7277 + }, + { + "epoch": 0.9711769415532426, + "grad_norm": 1.2292729065904253, + "learning_rate": 4.354241291494088e-08, + "loss": 1.5671, + "step": 7278 + }, + { + "epoch": 0.9713103816386442, + "grad_norm": 0.9701786083988774, + "learning_rate": 4.314046515270121e-08, + "loss": 1.5487, + "step": 7279 + }, + { + "epoch": 0.9714438217240459, + "grad_norm": 0.9290988392315854, + "learning_rate": 4.2740377218459805e-08, + "loss": 1.5489, + "step": 7280 + }, + { + "epoch": 0.9715772618094476, + "grad_norm": 0.9516211408003198, + "learning_rate": 4.234214918694912e-08, + "loss": 1.5158, + "step": 7281 + }, + { + "epoch": 0.9717107018948492, + "grad_norm": 0.9216274492882804, + "learning_rate": 4.194578113255188e-08, + "loss": 1.4825, + "step": 7282 + }, + { + "epoch": 0.9718441419802508, + "grad_norm": 0.956433000469751, + "learning_rate": 4.1551273129306624e-08, + "loss": 1.5503, + "step": 7283 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 0.959454062575314, + "learning_rate": 4.11586252509022e-08, + "loss": 1.5901, + "step": 7284 + }, + { + "epoch": 0.9721110221510542, + "grad_norm": 0.9623469747948837, + "learning_rate": 4.076783757068103e-08, + "loss": 1.5469, + "step": 7285 + }, + { + "epoch": 0.9722444622364559, + "grad_norm": 0.941165654507255, + "learning_rate": 4.037891016163697e-08, + "loss": 1.5266, + "step": 7286 + }, + { + "epoch": 0.9723779023218575, + "grad_norm": 0.953573546159197, + "learning_rate": 3.999184309641857e-08, + "loss": 1.5544, + "step": 7287 + }, + { + "epoch": 0.9725113424072591, + "grad_norm": 1.0783246600687082, + "learning_rate": 3.960663644732465e-08, + "loss": 1.515, + "step": 7288 + }, + { + "epoch": 0.9726447824926608, + "grad_norm": 0.9455437522062007, + "learning_rate": 3.922329028630767e-08, + "loss": 1.62, + "step": 7289 + }, + { + "epoch": 0.9727782225780625, + "grad_norm": 0.933995861984615, + "learning_rate": 3.8841804684972564e-08, + "loss": 1.6147, + "step": 7290 + }, + { + "epoch": 0.9729116626634641, + "grad_norm": 0.9369601765686556, + "learning_rate": 3.846217971457677e-08, + "loss": 1.4899, + "step": 7291 + }, + { + "epoch": 0.9730451027488658, + "grad_norm": 0.9638712887226217, + "learning_rate": 3.808441544602914e-08, + "loss": 1.572, + "step": 7292 + }, + { + "epoch": 0.9731785428342674, + "grad_norm": 0.9358530817042976, + "learning_rate": 3.7708511949891e-08, + "loss": 1.5785, + "step": 7293 + }, + { + "epoch": 0.973311982919669, + "grad_norm": 0.956879412152412, + 
"learning_rate": 3.7334469296378406e-08, + "loss": 1.5398, + "step": 7294 + }, + { + "epoch": 0.9734454230050708, + "grad_norm": 0.9452032315669289, + "learning_rate": 3.69622875553588e-08, + "loss": 1.563, + "step": 7295 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 0.9496476555370976, + "learning_rate": 3.659196679634991e-08, + "loss": 1.5428, + "step": 7296 + }, + { + "epoch": 0.973712303175874, + "grad_norm": 0.9327306321435094, + "learning_rate": 3.622350708852307e-08, + "loss": 1.5386, + "step": 7297 + }, + { + "epoch": 0.9738457432612757, + "grad_norm": 0.9712502348220395, + "learning_rate": 3.5856908500704335e-08, + "loss": 1.5161, + "step": 7298 + }, + { + "epoch": 0.9739791833466773, + "grad_norm": 0.9257919013138982, + "learning_rate": 3.5492171101368935e-08, + "loss": 1.5558, + "step": 7299 + }, + { + "epoch": 0.974112623432079, + "grad_norm": 0.9385635102986536, + "learning_rate": 3.512929495864459e-08, + "loss": 1.5224, + "step": 7300 + }, + { + "epoch": 0.9742460635174807, + "grad_norm": 0.9404378377963817, + "learning_rate": 3.476828014031486e-08, + "loss": 1.5744, + "step": 7301 + }, + { + "epoch": 0.9743795036028823, + "grad_norm": 0.9341637296301126, + "learning_rate": 3.440912671381136e-08, + "loss": 1.557, + "step": 7302 + }, + { + "epoch": 0.974512943688284, + "grad_norm": 0.9208183908614412, + "learning_rate": 3.4051834746221534e-08, + "loss": 1.5211, + "step": 7303 + }, + { + "epoch": 0.9746463837736856, + "grad_norm": 0.9118064472195799, + "learning_rate": 3.369640430428089e-08, + "loss": 1.5383, + "step": 7304 + }, + { + "epoch": 0.9747798238590872, + "grad_norm": 0.9329162779176879, + "learning_rate": 3.3342835454382996e-08, + "loss": 1.5513, + "step": 7305 + }, + { + "epoch": 0.974913263944489, + "grad_norm": 0.951014370362824, + "learning_rate": 3.299112826256723e-08, + "loss": 1.5102, + "step": 7306 + }, + { + "epoch": 0.9750467040298906, + "grad_norm": 1.047900440973857, + "learning_rate": 3.264128279453105e-08, + "loss": 1.554, + "step": 7307 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 0.9197324429445751, + "learning_rate": 3.229329911561996e-08, + "loss": 1.5731, + "step": 7308 + }, + { + "epoch": 0.9753135842006939, + "grad_norm": 0.9333452848398622, + "learning_rate": 3.1947177290834184e-08, + "loss": 1.588, + "step": 7309 + }, + { + "epoch": 0.9754470242860955, + "grad_norm": 0.9777880188410225, + "learning_rate": 3.160291738482535e-08, + "loss": 1.5162, + "step": 7310 + }, + { + "epoch": 0.9755804643714971, + "grad_norm": 0.9366198386385366, + "learning_rate": 3.1260519461896456e-08, + "loss": 1.5737, + "step": 7311 + }, + { + "epoch": 0.9757139044568989, + "grad_norm": 0.9239853504065914, + "learning_rate": 3.091998358600523e-08, + "loss": 1.554, + "step": 7312 + }, + { + "epoch": 0.9758473445423005, + "grad_norm": 0.9193926283503289, + "learning_rate": 3.0581309820757466e-08, + "loss": 1.4998, + "step": 7313 + }, + { + "epoch": 0.9759807846277022, + "grad_norm": 0.9795340493304713, + "learning_rate": 3.0244498229415885e-08, + "loss": 1.4955, + "step": 7314 + }, + { + "epoch": 0.9761142247131038, + "grad_norm": 0.9333034891767017, + "learning_rate": 2.990954887489239e-08, + "loss": 1.4969, + "step": 7315 + }, + { + "epoch": 0.9762476647985054, + "grad_norm": 0.9282493967471739, + "learning_rate": 2.957646181975027e-08, + "loss": 1.5149, + "step": 7316 + }, + { + "epoch": 0.9763811048839072, + "grad_norm": 0.9838269293925297, + "learning_rate": 2.9245237126208638e-08, + "loss": 1.5284, + "step": 7317 + }, + { + "epoch": 
0.9765145449693088, + "grad_norm": 0.9463715264539991, + "learning_rate": 2.8915874856134672e-08, + "loss": 1.5611, + "step": 7318 + }, + { + "epoch": 0.9766479850547104, + "grad_norm": 0.9355403594679113, + "learning_rate": 2.858837507105028e-08, + "loss": 1.5054, + "step": 7319 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 1.2260606411830046, + "learning_rate": 2.826273783212874e-08, + "loss": 1.5631, + "step": 7320 + }, + { + "epoch": 0.9769148652255137, + "grad_norm": 0.9215276937501206, + "learning_rate": 2.7938963200195844e-08, + "loss": 1.5625, + "step": 7321 + }, + { + "epoch": 0.9770483053109154, + "grad_norm": 1.0251253524014017, + "learning_rate": 2.7617051235727666e-08, + "loss": 1.5516, + "step": 7322 + }, + { + "epoch": 0.9771817453963171, + "grad_norm": 0.9213818791054645, + "learning_rate": 2.7297001998854987e-08, + "loss": 1.5389, + "step": 7323 + }, + { + "epoch": 0.9773151854817187, + "grad_norm": 0.941181131336436, + "learning_rate": 2.697881554935888e-08, + "loss": 1.5423, + "step": 7324 + }, + { + "epoch": 0.9774486255671203, + "grad_norm": 0.8999309030256885, + "learning_rate": 2.666249194667292e-08, + "loss": 1.5542, + "step": 7325 + }, + { + "epoch": 0.977582065652522, + "grad_norm": 0.9331555186069251, + "learning_rate": 2.6348031249882057e-08, + "loss": 1.5703, + "step": 7326 + }, + { + "epoch": 0.9777155057379237, + "grad_norm": 1.0508215046605072, + "learning_rate": 2.603543351772486e-08, + "loss": 1.5462, + "step": 7327 + }, + { + "epoch": 0.9778489458233254, + "grad_norm": 0.9500724069103249, + "learning_rate": 2.5724698808591297e-08, + "loss": 1.5018, + "step": 7328 + }, + { + "epoch": 0.977982385908727, + "grad_norm": 0.950196772138831, + "learning_rate": 2.541582718052271e-08, + "loss": 1.4982, + "step": 7329 + }, + { + "epoch": 0.9781158259941286, + "grad_norm": 1.022399182945929, + "learning_rate": 2.5108818691212956e-08, + "loss": 1.5155, + "step": 7330 + }, + { + "epoch": 0.9782492660795303, + "grad_norm": 1.092331980943047, + "learning_rate": 2.4803673398006157e-08, + "loss": 1.5725, + "step": 7331 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 0.9255819560432912, + "learning_rate": 2.4500391357902277e-08, + "loss": 1.4885, + "step": 7332 + }, + { + "epoch": 0.9785161462503336, + "grad_norm": 0.9513513483680666, + "learning_rate": 2.4198972627549335e-08, + "loss": 1.5104, + "step": 7333 + }, + { + "epoch": 0.9786495863357353, + "grad_norm": 0.8903283569870852, + "learning_rate": 2.389941726325007e-08, + "loss": 1.5539, + "step": 7334 + }, + { + "epoch": 0.9787830264211369, + "grad_norm": 0.9338400501650106, + "learning_rate": 2.3601725320957503e-08, + "loss": 1.4993, + "step": 7335 + }, + { + "epoch": 0.9789164665065385, + "grad_norm": 0.971758384623749, + "learning_rate": 2.3305896856277154e-08, + "loss": 1.5658, + "step": 7336 + }, + { + "epoch": 0.9790499065919402, + "grad_norm": 0.9672566853309181, + "learning_rate": 2.3011931924465936e-08, + "loss": 1.5645, + "step": 7337 + }, + { + "epoch": 0.9791833466773419, + "grad_norm": 0.9218261068570299, + "learning_rate": 2.2719830580434366e-08, + "loss": 1.5638, + "step": 7338 + }, + { + "epoch": 0.9793167867627435, + "grad_norm": 0.9367852380054548, + "learning_rate": 2.2429592878742134e-08, + "loss": 1.547, + "step": 7339 + }, + { + "epoch": 0.9794502268481452, + "grad_norm": 0.9339712727726341, + "learning_rate": 2.2141218873602544e-08, + "loss": 1.514, + "step": 7340 + }, + { + "epoch": 0.9795836669335468, + "grad_norm": 0.9417814446241446, + "learning_rate": 2.1854708618882504e-08, 
+ "loss": 1.5222, + "step": 7341 + }, + { + "epoch": 0.9797171070189485, + "grad_norm": 0.9339447722012021, + "learning_rate": 2.1570062168095873e-08, + "loss": 1.5335, + "step": 7342 + }, + { + "epoch": 0.9798505471043502, + "grad_norm": 1.0402602333871376, + "learning_rate": 2.1287279574414566e-08, + "loss": 1.534, + "step": 7343 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 0.9326106042217657, + "learning_rate": 2.100636089065633e-08, + "loss": 1.4977, + "step": 7344 + }, + { + "epoch": 0.9801174272751535, + "grad_norm": 0.930957256971068, + "learning_rate": 2.0727306169294747e-08, + "loss": 1.5236, + "step": 7345 + }, + { + "epoch": 0.9802508673605551, + "grad_norm": 0.9473415741877985, + "learning_rate": 2.0450115462454788e-08, + "loss": 1.5171, + "step": 7346 + }, + { + "epoch": 0.9803843074459567, + "grad_norm": 0.9215591618457778, + "learning_rate": 2.0174788821911706e-08, + "loss": 1.4988, + "step": 7347 + }, + { + "epoch": 0.9805177475313585, + "grad_norm": 0.9501836671635087, + "learning_rate": 1.990132629909325e-08, + "loss": 1.5363, + "step": 7348 + }, + { + "epoch": 0.9806511876167601, + "grad_norm": 1.0095366173547495, + "learning_rate": 1.9629727945079668e-08, + "loss": 1.5768, + "step": 7349 + }, + { + "epoch": 0.9807846277021617, + "grad_norm": 0.9307006869728814, + "learning_rate": 1.9359993810601495e-08, + "loss": 1.5595, + "step": 7350 + }, + { + "epoch": 0.9809180677875634, + "grad_norm": 0.9742637399272271, + "learning_rate": 1.9092123946042873e-08, + "loss": 1.5153, + "step": 7351 + }, + { + "epoch": 0.981051507872965, + "grad_norm": 0.9314930322710189, + "learning_rate": 1.882611840143822e-08, + "loss": 1.5104, + "step": 7352 + }, + { + "epoch": 0.9811849479583667, + "grad_norm": 0.9381519532395032, + "learning_rate": 1.856197722647557e-08, + "loss": 1.5689, + "step": 7353 + }, + { + "epoch": 0.9813183880437684, + "grad_norm": 0.9039239250179172, + "learning_rate": 1.8299700470492122e-08, + "loss": 1.5105, + "step": 7354 + }, + { + "epoch": 0.98145182812917, + "grad_norm": 0.9396159105126344, + "learning_rate": 1.8039288182478686e-08, + "loss": 1.5139, + "step": 7355 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 0.9951647404292933, + "learning_rate": 1.778074041107747e-08, + "loss": 1.5556, + "step": 7356 + }, + { + "epoch": 0.9817187082999733, + "grad_norm": 0.970298389786309, + "learning_rate": 1.7524057204582058e-08, + "loss": 1.5558, + "step": 7357 + }, + { + "epoch": 0.9818521483853749, + "grad_norm": 0.9653437553551486, + "learning_rate": 1.7269238610938544e-08, + "loss": 1.5561, + "step": 7358 + }, + { + "epoch": 0.9819855884707767, + "grad_norm": 1.006023062423882, + "learning_rate": 1.701628467774219e-08, + "loss": 1.5283, + "step": 7359 + }, + { + "epoch": 0.9821190285561783, + "grad_norm": 0.9396565250832867, + "learning_rate": 1.6765195452245198e-08, + "loss": 1.5434, + "step": 7360 + }, + { + "epoch": 0.9822524686415799, + "grad_norm": 0.9608296788404236, + "learning_rate": 1.6515970981344498e-08, + "loss": 1.4794, + "step": 7361 + }, + { + "epoch": 0.9823859087269816, + "grad_norm": 1.0298643297103718, + "learning_rate": 1.6268611311595072e-08, + "loss": 1.5356, + "step": 7362 + }, + { + "epoch": 0.9825193488123832, + "grad_norm": 0.9122880384105183, + "learning_rate": 1.6023116489199962e-08, + "loss": 1.4973, + "step": 7363 + }, + { + "epoch": 0.9826527888977848, + "grad_norm": 0.9280142474276555, + "learning_rate": 1.5779486560014713e-08, + "loss": 1.5525, + "step": 7364 + }, + { + "epoch": 0.9827862289831866, + "grad_norm": 
0.9191238933559271, + "learning_rate": 1.553772156954736e-08, + "loss": 1.4899, + "step": 7365 + }, + { + "epoch": 0.9829196690685882, + "grad_norm": 0.9177971381786734, + "learning_rate": 1.529782156295512e-08, + "loss": 1.5066, + "step": 7366 + }, + { + "epoch": 0.9830531091539899, + "grad_norm": 0.9212731057030706, + "learning_rate": 1.505978658505103e-08, + "loss": 1.502, + "step": 7367 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 0.9287034571182928, + "learning_rate": 1.4823616680295083e-08, + "loss": 1.5333, + "step": 7368 + }, + { + "epoch": 0.9833199893247931, + "grad_norm": 0.9141592195619862, + "learning_rate": 1.4589311892801994e-08, + "loss": 1.5547, + "step": 7369 + }, + { + "epoch": 0.9834534294101949, + "grad_norm": 0.9139700337452249, + "learning_rate": 1.4356872266337862e-08, + "loss": 1.556, + "step": 7370 + }, + { + "epoch": 0.9835868694955965, + "grad_norm": 0.9454395337463968, + "learning_rate": 1.4126297844317959e-08, + "loss": 1.5366, + "step": 7371 + }, + { + "epoch": 0.9837203095809981, + "grad_norm": 0.9532365221783823, + "learning_rate": 1.389758866981339e-08, + "loss": 1.5516, + "step": 7372 + }, + { + "epoch": 0.9838537496663998, + "grad_norm": 0.9276600231944385, + "learning_rate": 1.367074478554331e-08, + "loss": 1.549, + "step": 7373 + }, + { + "epoch": 0.9839871897518014, + "grad_norm": 0.9614937926329741, + "learning_rate": 1.3445766233878277e-08, + "loss": 1.5544, + "step": 7374 + }, + { + "epoch": 0.984120629837203, + "grad_norm": 0.8899635478946575, + "learning_rate": 1.322265305684356e-08, + "loss": 1.4937, + "step": 7375 + }, + { + "epoch": 0.9842540699226048, + "grad_norm": 0.9288487499950765, + "learning_rate": 1.3001405296113601e-08, + "loss": 1.5814, + "step": 7376 + }, + { + "epoch": 0.9843875100080064, + "grad_norm": 1.1647701919689324, + "learning_rate": 1.2782022993015342e-08, + "loss": 1.5296, + "step": 7377 + }, + { + "epoch": 0.984520950093408, + "grad_norm": 0.9358750657202466, + "learning_rate": 1.2564506188526004e-08, + "loss": 1.5041, + "step": 7378 + }, + { + "epoch": 0.9846543901788097, + "grad_norm": 0.9197611351339202, + "learning_rate": 1.2348854923275311e-08, + "loss": 1.5391, + "step": 7379 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 0.9827592983158957, + "learning_rate": 1.2135069237545483e-08, + "loss": 1.535, + "step": 7380 + }, + { + "epoch": 0.9849212703496131, + "grad_norm": 0.9615691598090853, + "learning_rate": 1.1923149171267911e-08, + "loss": 1.6116, + "step": 7381 + }, + { + "epoch": 0.9850547104350147, + "grad_norm": 0.9273458595588824, + "learning_rate": 1.171309476402871e-08, + "loss": 1.5576, + "step": 7382 + }, + { + "epoch": 0.9851881505204163, + "grad_norm": 0.9460589830522615, + "learning_rate": 1.1504906055060938e-08, + "loss": 1.5578, + "step": 7383 + }, + { + "epoch": 0.985321590605818, + "grad_norm": 0.9954193989279615, + "learning_rate": 1.1298583083254599e-08, + "loss": 1.5602, + "step": 7384 + }, + { + "epoch": 0.9854550306912196, + "grad_norm": 0.9445437464464703, + "learning_rate": 1.1094125887146644e-08, + "loss": 1.5294, + "step": 7385 + }, + { + "epoch": 0.9855884707766213, + "grad_norm": 0.9104360078993541, + "learning_rate": 1.0891534504928747e-08, + "loss": 1.5428, + "step": 7386 + }, + { + "epoch": 0.985721910862023, + "grad_norm": 1.0236669925899005, + "learning_rate": 1.0690808974441747e-08, + "loss": 1.6108, + "step": 7387 + }, + { + "epoch": 0.9858553509474246, + "grad_norm": 0.9071583750469212, + "learning_rate": 1.0491949333178986e-08, + "loss": 1.5168, + "step": 7388 
+ }, + { + "epoch": 0.9859887910328262, + "grad_norm": 0.9927493294549666, + "learning_rate": 1.0294955618285196e-08, + "loss": 1.5921, + "step": 7389 + }, + { + "epoch": 0.9861222311182279, + "grad_norm": 0.9363046349406777, + "learning_rate": 1.00998278665565e-08, + "loss": 1.5694, + "step": 7390 + }, + { + "epoch": 0.9862556712036296, + "grad_norm": 0.9135114422937861, + "learning_rate": 9.906566114440408e-09, + "loss": 1.5794, + "step": 7391 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 0.9437284819277648, + "learning_rate": 9.715170398036933e-09, + "loss": 1.584, + "step": 7392 + }, + { + "epoch": 0.9865225513744329, + "grad_norm": 0.9211484661856915, + "learning_rate": 9.525640753095256e-09, + "loss": 1.5472, + "step": 7393 + }, + { + "epoch": 0.9866559914598345, + "grad_norm": 0.9176256061269126, + "learning_rate": 9.337977215018169e-09, + "loss": 1.4833, + "step": 7394 + }, + { + "epoch": 0.9867894315452362, + "grad_norm": 0.9564873181171374, + "learning_rate": 9.152179818859853e-09, + "loss": 1.5476, + "step": 7395 + }, + { + "epoch": 0.9869228716306379, + "grad_norm": 0.89959690847276, + "learning_rate": 8.968248599323659e-09, + "loss": 1.525, + "step": 7396 + }, + { + "epoch": 0.9870563117160395, + "grad_norm": 0.919080132940536, + "learning_rate": 8.78618359076544e-09, + "loss": 1.5491, + "step": 7397 + }, + { + "epoch": 0.9871897518014412, + "grad_norm": 1.4794996154790685, + "learning_rate": 8.605984827195767e-09, + "loss": 1.5364, + "step": 7398 + }, + { + "epoch": 0.9873231918868428, + "grad_norm": 1.0985804202382334, + "learning_rate": 8.427652342271053e-09, + "loss": 1.529, + "step": 7399 + }, + { + "epoch": 0.9874566319722444, + "grad_norm": 1.1736650971200993, + "learning_rate": 8.251186169301318e-09, + "loss": 1.5596, + "step": 7400 + }, + { + "epoch": 0.9875900720576462, + "grad_norm": 1.0303346794805686, + "learning_rate": 8.076586341251303e-09, + "loss": 1.4777, + "step": 7401 + }, + { + "epoch": 0.9877235121430478, + "grad_norm": 0.9872701203867904, + "learning_rate": 7.903852890732699e-09, + "loss": 1.5144, + "step": 7402 + }, + { + "epoch": 0.9878569522284494, + "grad_norm": 0.9850487540902245, + "learning_rate": 7.73298585000859e-09, + "loss": 1.5604, + "step": 7403 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 0.9471651528476187, + "learning_rate": 7.563985250997884e-09, + "loss": 1.5555, + "step": 7404 + }, + { + "epoch": 0.9881238323992527, + "grad_norm": 0.9420598022378026, + "learning_rate": 7.3968511252664466e-09, + "loss": 1.517, + "step": 7405 + }, + { + "epoch": 0.9882572724846543, + "grad_norm": 0.9419967634572181, + "learning_rate": 7.231583504032636e-09, + "loss": 1.5475, + "step": 7406 + }, + { + "epoch": 0.9883907125700561, + "grad_norm": 1.0474390058108154, + "learning_rate": 7.0681824181673134e-09, + "loss": 1.5696, + "step": 7407 + }, + { + "epoch": 0.9885241526554577, + "grad_norm": 1.036920220314024, + "learning_rate": 6.906647898191621e-09, + "loss": 1.554, + "step": 7408 + }, + { + "epoch": 0.9886575927408594, + "grad_norm": 0.928746553820107, + "learning_rate": 6.7469799742780895e-09, + "loss": 1.5312, + "step": 7409 + }, + { + "epoch": 0.988791032826261, + "grad_norm": 0.9153121582186817, + "learning_rate": 6.589178676251751e-09, + "loss": 1.528, + "step": 7410 + }, + { + "epoch": 0.9889244729116626, + "grad_norm": 1.0836635609150753, + "learning_rate": 6.433244033587916e-09, + "loss": 1.5573, + "step": 7411 + }, + { + "epoch": 0.9890579129970644, + "grad_norm": 1.221340256835362, + "learning_rate": 6.279176075412175e-09, 
+ "loss": 1.5926, + "step": 7412 + }, + { + "epoch": 0.989191353082466, + "grad_norm": 0.9091108579676047, + "learning_rate": 6.12697483050484e-09, + "loss": 1.5529, + "step": 7413 + }, + { + "epoch": 0.9893247931678676, + "grad_norm": 0.92677410973629, + "learning_rate": 5.976640327293171e-09, + "loss": 1.541, + "step": 7414 + }, + { + "epoch": 0.9894582332532693, + "grad_norm": 0.9383472475782342, + "learning_rate": 5.828172593858039e-09, + "loss": 1.4556, + "step": 7415 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 0.9238065180005837, + "learning_rate": 5.681571657933927e-09, + "loss": 1.5083, + "step": 7416 + }, + { + "epoch": 0.9897251134240725, + "grad_norm": 0.9377466504722105, + "learning_rate": 5.536837546902263e-09, + "loss": 1.5493, + "step": 7417 + }, + { + "epoch": 0.9898585535094743, + "grad_norm": 0.9335726191972902, + "learning_rate": 5.393970287796979e-09, + "loss": 1.571, + "step": 7418 + }, + { + "epoch": 0.9899919935948759, + "grad_norm": 1.1669560380448265, + "learning_rate": 5.2529699073067265e-09, + "loss": 1.504, + "step": 7419 + }, + { + "epoch": 0.9901254336802775, + "grad_norm": 0.9537386546748483, + "learning_rate": 5.113836431765995e-09, + "loss": 1.5505, + "step": 7420 + }, + { + "epoch": 0.9902588737656792, + "grad_norm": 0.9165621627626547, + "learning_rate": 4.976569887165106e-09, + "loss": 1.5176, + "step": 7421 + }, + { + "epoch": 0.9903923138510808, + "grad_norm": 0.9389326153560223, + "learning_rate": 4.8411702991435494e-09, + "loss": 1.5421, + "step": 7422 + }, + { + "epoch": 0.9905257539364826, + "grad_norm": 0.9939334972185406, + "learning_rate": 4.707637692992206e-09, + "loss": 1.5195, + "step": 7423 + }, + { + "epoch": 0.9906591940218842, + "grad_norm": 1.0278495243989596, + "learning_rate": 4.575972093653347e-09, + "loss": 1.5995, + "step": 7424 + }, + { + "epoch": 0.9907926341072858, + "grad_norm": 0.9567686246995181, + "learning_rate": 4.4461735257206315e-09, + "loss": 1.459, + "step": 7425 + }, + { + "epoch": 0.9909260741926875, + "grad_norm": 1.136055181982407, + "learning_rate": 4.318242013439111e-09, + "loss": 1.4569, + "step": 7426 + }, + { + "epoch": 0.9910595142780891, + "grad_norm": 0.9458609375171427, + "learning_rate": 4.1921775807041154e-09, + "loss": 1.5101, + "step": 7427 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 0.9533736019573524, + "learning_rate": 4.067980251064585e-09, + "loss": 1.5205, + "step": 7428 + }, + { + "epoch": 0.9913263944488925, + "grad_norm": 1.0479939592780765, + "learning_rate": 3.94565004771863e-09, + "loss": 1.6043, + "step": 7429 + }, + { + "epoch": 0.9914598345342941, + "grad_norm": 0.9972412023234489, + "learning_rate": 3.825186993515751e-09, + "loss": 1.5548, + "step": 7430 + }, + { + "epoch": 0.9915932746196957, + "grad_norm": 0.9272943886701682, + "learning_rate": 3.7065911109568365e-09, + "loss": 1.5379, + "step": 7431 + }, + { + "epoch": 0.9917267147050974, + "grad_norm": 0.9473938913490474, + "learning_rate": 3.589862422195278e-09, + "loss": 1.5721, + "step": 7432 + }, + { + "epoch": 0.991860154790499, + "grad_norm": 0.9185317252383915, + "learning_rate": 3.4750009490336355e-09, + "loss": 1.5664, + "step": 7433 + }, + { + "epoch": 0.9919935948759008, + "grad_norm": 0.9791716481267135, + "learning_rate": 3.3620067129269683e-09, + "loss": 1.5587, + "step": 7434 + }, + { + "epoch": 0.9921270349613024, + "grad_norm": 0.9173521049585661, + "learning_rate": 3.2508797349817268e-09, + "loss": 1.5304, + "step": 7435 + }, + { + "epoch": 0.992260475046704, + "grad_norm": 0.9632980233243099, 
+ "learning_rate": 3.14162003595353e-09, + "loss": 1.5745, + "step": 7436 + }, + { + "epoch": 0.9923939151321057, + "grad_norm": 0.9581161698339242, + "learning_rate": 3.034227636253828e-09, + "loss": 1.5666, + "step": 7437 + }, + { + "epoch": 0.9925273552175073, + "grad_norm": 0.9413492862074422, + "learning_rate": 2.9287025559399108e-09, + "loss": 1.5062, + "step": 7438 + }, + { + "epoch": 0.992660795302909, + "grad_norm": 0.9747881941796354, + "learning_rate": 2.8250448147237875e-09, + "loss": 1.4891, + "step": 7439 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 0.9824468800744949, + "learning_rate": 2.723254431967748e-09, + "loss": 1.5657, + "step": 7440 + }, + { + "epoch": 0.9929276754737123, + "grad_norm": 0.9476470497339562, + "learning_rate": 2.623331426683251e-09, + "loss": 1.5411, + "step": 7441 + }, + { + "epoch": 0.9930611155591139, + "grad_norm": 0.9089786350328128, + "learning_rate": 2.525275817536477e-09, + "loss": 1.573, + "step": 7442 + }, + { + "epoch": 0.9931945556445156, + "grad_norm": 0.9170076888295607, + "learning_rate": 2.429087622842774e-09, + "loss": 1.5052, + "step": 7443 + }, + { + "epoch": 0.9933279957299173, + "grad_norm": 0.9562919014528187, + "learning_rate": 2.334766860568882e-09, + "loss": 1.5247, + "step": 7444 + }, + { + "epoch": 0.9934614358153189, + "grad_norm": 0.9327221876692189, + "learning_rate": 2.2423135483329306e-09, + "loss": 1.5501, + "step": 7445 + }, + { + "epoch": 0.9935948759007206, + "grad_norm": 0.9382241408409218, + "learning_rate": 2.151727703404438e-09, + "loss": 1.5226, + "step": 7446 + }, + { + "epoch": 0.9937283159861222, + "grad_norm": 0.9581003164477122, + "learning_rate": 2.0630093427032038e-09, + "loss": 1.5585, + "step": 7447 + }, + { + "epoch": 0.9938617560715239, + "grad_norm": 1.049522255731219, + "learning_rate": 1.9761584828004164e-09, + "loss": 1.5329, + "step": 7448 + }, + { + "epoch": 0.9939951961569256, + "grad_norm": 0.9641339166472345, + "learning_rate": 1.8911751399197655e-09, + "loss": 1.5166, + "step": 7449 + }, + { + "epoch": 0.9941286362423272, + "grad_norm": 1.1001878062084773, + "learning_rate": 1.808059329935219e-09, + "loss": 1.5105, + "step": 7450 + }, + { + "epoch": 0.9942620763277289, + "grad_norm": 0.9672405162002357, + "learning_rate": 1.726811068371026e-09, + "loss": 1.5858, + "step": 7451 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 0.9497524896899447, + "learning_rate": 1.6474303704039352e-09, + "loss": 1.534, + "step": 7452 + }, + { + "epoch": 0.9945289564985321, + "grad_norm": 0.9337846143023557, + "learning_rate": 1.5699172508620852e-09, + "loss": 1.4735, + "step": 7453 + }, + { + "epoch": 0.9946623965839338, + "grad_norm": 1.083632648142817, + "learning_rate": 1.4942717242227844e-09, + "loss": 1.5346, + "step": 7454 + }, + { + "epoch": 0.9947958366693355, + "grad_norm": 0.9438573488466988, + "learning_rate": 1.4204938046158412e-09, + "loss": 1.5054, + "step": 7455 + }, + { + "epoch": 0.9949292767547371, + "grad_norm": 0.9556477000553659, + "learning_rate": 1.3485835058224538e-09, + "loss": 1.5491, + "step": 7456 + }, + { + "epoch": 0.9950627168401388, + "grad_norm": 0.9113180378493281, + "learning_rate": 1.278540841275211e-09, + "loss": 1.4985, + "step": 7457 + }, + { + "epoch": 0.9951961569255404, + "grad_norm": 0.9329564036235153, + "learning_rate": 1.2103658240569805e-09, + "loss": 1.5467, + "step": 7458 + }, + { + "epoch": 0.995329597010942, + "grad_norm": 0.931682502757043, + "learning_rate": 1.1440584669020205e-09, + "loss": 1.5791, + "step": 7459 + }, + { + "epoch": 
0.9954630370963438, + "grad_norm": 1.1265746496215818, + "learning_rate": 1.0796187821959792e-09, + "loss": 1.589, + "step": 7460 + }, + { + "epoch": 0.9955964771817454, + "grad_norm": 1.139830131670322, + "learning_rate": 1.017046781973674e-09, + "loss": 1.5046, + "step": 7461 + }, + { + "epoch": 0.9957299172671471, + "grad_norm": 0.9401769463698738, + "learning_rate": 9.563424779257535e-10, + "loss": 1.4786, + "step": 7462 + }, + { + "epoch": 0.9958633573525487, + "grad_norm": 1.0561437037692578, + "learning_rate": 8.97505881388705e-10, + "loss": 1.5136, + "step": 7463 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 1.0139954761378014, + "learning_rate": 8.405370033548466e-10, + "loss": 1.5199, + "step": 7464 + }, + { + "epoch": 0.9961302375233521, + "grad_norm": 0.9447397803000418, + "learning_rate": 7.854358544623353e-10, + "loss": 1.5971, + "step": 7465 + }, + { + "epoch": 0.9962636776087537, + "grad_norm": 0.9299070974900873, + "learning_rate": 7.322024450062693e-10, + "loss": 1.5235, + "step": 7466 + }, + { + "epoch": 0.9963971176941553, + "grad_norm": 0.9261836009002768, + "learning_rate": 6.808367849286956e-10, + "loss": 1.5404, + "step": 7467 + }, + { + "epoch": 0.996530557779557, + "grad_norm": 0.9439837053013257, + "learning_rate": 6.313388838230517e-10, + "loss": 1.5434, + "step": 7468 + }, + { + "epoch": 0.9966639978649586, + "grad_norm": 0.9180840699289173, + "learning_rate": 5.83708750937495e-10, + "loss": 1.4938, + "step": 7469 + }, + { + "epoch": 0.9967974379503602, + "grad_norm": 0.9964338171196607, + "learning_rate": 5.379463951671326e-10, + "loss": 1.5414, + "step": 7470 + }, + { + "epoch": 0.996930878035762, + "grad_norm": 0.967091118915389, + "learning_rate": 4.940518250606818e-10, + "loss": 1.5766, + "step": 7471 + }, + { + "epoch": 0.9970643181211636, + "grad_norm": 0.9025190757694261, + "learning_rate": 4.5202504881602936e-10, + "loss": 1.5351, + "step": 7472 + }, + { + "epoch": 0.9971977582065652, + "grad_norm": 0.92342728170414, + "learning_rate": 4.118660742846725e-10, + "loss": 1.546, + "step": 7473 + }, + { + "epoch": 0.9973311982919669, + "grad_norm": 0.9285850741711503, + "learning_rate": 3.735749089661678e-10, + "loss": 1.5224, + "step": 7474 + }, + { + "epoch": 0.9974646383773685, + "grad_norm": 0.9333801469713987, + "learning_rate": 3.371515600147923e-10, + "loss": 1.5543, + "step": 7475 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 0.9326257599816116, + "learning_rate": 3.0259603423288266e-10, + "loss": 1.548, + "step": 7476 + }, + { + "epoch": 0.9977315185481719, + "grad_norm": 0.9283677855379775, + "learning_rate": 2.699083380741652e-10, + "loss": 1.562, + "step": 7477 + }, + { + "epoch": 0.9978649586335735, + "grad_norm": 0.9887750864167223, + "learning_rate": 2.3908847764597674e-10, + "loss": 1.5263, + "step": 7478 + }, + { + "epoch": 0.9979983987189752, + "grad_norm": 1.0609376235918477, + "learning_rate": 2.101364587048238e-10, + "loss": 1.5183, + "step": 7479 + }, + { + "epoch": 0.9981318388043768, + "grad_norm": 0.9276456917237319, + "learning_rate": 1.8305228665860264e-10, + "loss": 1.5592, + "step": 7480 + }, + { + "epoch": 0.9982652788897785, + "grad_norm": 0.9511667430507186, + "learning_rate": 1.578359665654894e-10, + "loss": 1.6094, + "step": 7481 + }, + { + "epoch": 0.9983987189751802, + "grad_norm": 0.935192544085142, + "learning_rate": 1.3448750313616032e-10, + "loss": 1.5063, + "step": 7482 + }, + { + "epoch": 0.9985321590605818, + "grad_norm": 0.9411754750534707, + "learning_rate": 1.130069007315715e-10, + "loss": 
1.5261, + "step": 7483 + }, + { + "epoch": 0.9986655991459834, + "grad_norm": 1.1219517093294005, + "learning_rate": 9.339416336517915e-11, + "loss": 1.5611, + "step": 7484 + }, + { + "epoch": 0.9987990392313851, + "grad_norm": 1.1069618633698497, + "learning_rate": 7.56492946984988e-11, + "loss": 1.5211, + "step": 7485 + }, + { + "epoch": 0.9989324793167867, + "grad_norm": 0.9326395703982616, + "learning_rate": 5.977229804776663e-11, + "loss": 1.5316, + "step": 7486 + }, + { + "epoch": 0.9990659194021884, + "grad_norm": 1.0224397864563275, + "learning_rate": 4.576317637838834e-11, + "loss": 1.5785, + "step": 7487 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 1.2901485937141546, + "learning_rate": 3.3621932306049375e-11, + "loss": 1.5255, + "step": 7488 + }, + { + "epoch": 0.9993327995729917, + "grad_norm": 0.9764240130637175, + "learning_rate": 2.3348568098935376e-11, + "loss": 1.5258, + "step": 7489 + }, + { + "epoch": 0.9994662396583934, + "grad_norm": 0.9510632886911196, + "learning_rate": 1.494308567662195e-11, + "loss": 1.5212, + "step": 7490 + }, + { + "epoch": 0.999599679743795, + "grad_norm": 0.9937550478187936, + "learning_rate": 8.405486608964452e-12, + "loss": 1.5589, + "step": 7491 + }, + { + "epoch": 0.9997331198291967, + "grad_norm": 0.9234682778153688, + "learning_rate": 3.735772117208214e-12, + "loss": 1.501, + "step": 7492 + }, + { + "epoch": 0.9998665599145984, + "grad_norm": 0.9297623176763948, + "learning_rate": 9.339430728783073e-13, + "loss": 1.5671, + "step": 7493 + }, + { + "epoch": 1.0, + "grad_norm": 1.109193681967266, + "learning_rate": 0.0, + "loss": 1.5531, + "step": 7494 + }, + { + "epoch": 1.0, + "step": 7494, + "total_flos": 2.902622270823963e+19, + "train_loss": 1.6068721134280217, + "train_runtime": 34738.8495, + "train_samples_per_second": 36.238, + "train_steps_per_second": 0.216 + } + ], + "logging_steps": 1.0, + "max_steps": 7494, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.902622270823963e+19, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}