diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5936927565773118, + "eval_steps": 500, + "global_step": 120000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 296.6432189941406, + "learning_rate": 2.0000000000000002e-07, + "loss": 36.1442, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 282.62713623046875, + "learning_rate": 4.0000000000000003e-07, + "loss": 36.2463, + "step": 200 + }, + { + "epoch": 0.0, + "grad_norm": 271.8738098144531, + "learning_rate": 6.000000000000001e-07, + "loss": 36.3589, + "step": 300 + }, + { + "epoch": 0.0, + "grad_norm": 287.5734558105469, + "learning_rate": 8.000000000000001e-07, + "loss": 36.0855, + "step": 400 + }, + { + "epoch": 0.0, + "grad_norm": 259.08685302734375, + "learning_rate": 1.0000000000000002e-06, + "loss": 35.8685, + "step": 500 + }, + { + "epoch": 0.0, + "grad_norm": 292.2701416015625, + "learning_rate": 1.2000000000000002e-06, + "loss": 35.4046, + "step": 600 + }, + { + "epoch": 0.0, + "grad_norm": 307.0222473144531, + "learning_rate": 1.4000000000000001e-06, + "loss": 34.6875, + "step": 700 + }, + { + "epoch": 0.0, + "grad_norm": 274.7489929199219, + "learning_rate": 1.6000000000000001e-06, + "loss": 34.1517, + "step": 800 + }, + { + "epoch": 0.0, + "grad_norm": 240.46612548828125, + "learning_rate": 1.8000000000000001e-06, + "loss": 33.4424, + "step": 900 + }, + { + "epoch": 0.0, + "grad_norm": 268.32684326171875, + "learning_rate": 2.0000000000000003e-06, + "loss": 32.5807, + "step": 1000 + }, + { + "epoch": 0.01, + "grad_norm": 258.87274169921875, + "learning_rate": 2.2e-06, + "loss": 30.8752, + "step": 1100 + }, + { + "epoch": 0.01, + "grad_norm": 288.45611572265625, + "learning_rate": 2.4000000000000003e-06, + "loss": 29.5351, + "step": 1200 + }, + { + "epoch": 0.01, + "grad_norm": 261.72149658203125, + "learning_rate": 2.6e-06, + "loss": 27.398, + "step": 1300 + }, + { + "epoch": 0.01, + "grad_norm": 331.3612365722656, + "learning_rate": 2.8000000000000003e-06, + "loss": 24.6465, + "step": 1400 + }, + { + "epoch": 0.01, + "grad_norm": 215.5654296875, + "learning_rate": 3e-06, + "loss": 21.7979, + "step": 1500 + }, + { + "epoch": 0.01, + "grad_norm": 222.94651794433594, + "learning_rate": 3.2000000000000003e-06, + "loss": 18.6465, + "step": 1600 + }, + { + "epoch": 0.01, + "grad_norm": 252.55087280273438, + "learning_rate": 3.4000000000000005e-06, + "loss": 15.7462, + "step": 1700 + }, + { + "epoch": 0.01, + "grad_norm": 273.9644470214844, + "learning_rate": 3.6000000000000003e-06, + "loss": 13.9379, + "step": 1800 + }, + { + "epoch": 0.01, + "grad_norm": 205.786376953125, + "learning_rate": 3.8000000000000005e-06, + "loss": 12.2574, + "step": 1900 + }, + { + "epoch": 0.01, + "grad_norm": 151.63124084472656, + "learning_rate": 4.000000000000001e-06, + "loss": 9.865, + "step": 2000 + }, + { + "epoch": 0.01, + "grad_norm": 120.38298034667969, + "learning_rate": 4.2000000000000004e-06, + "loss": 8.9936, + "step": 2100 + }, + { + "epoch": 0.01, + "grad_norm": 93.93321990966797, + "learning_rate": 4.4e-06, + "loss": 8.3415, + "step": 2200 + }, + { + "epoch": 0.01, + "grad_norm": 102.68135833740234, + "learning_rate": 4.600000000000001e-06, + "loss": 7.4711, + "step": 2300 + }, + { + "epoch": 0.01, + "grad_norm": 250.72817993164062, + "learning_rate": 4.800000000000001e-06, + "loss": 6.5714, + "step": 2400 + }, + { + "epoch": 0.01, + "grad_norm": 249.8506317138672, + "learning_rate": 5e-06, + "loss": 5.9448, + "step": 2500 + }, + { + "epoch": 0.01, + "grad_norm": 66.83155059814453, + "learning_rate": 5.2e-06, + "loss": 5.6368, + "step": 2600 + }, + { + "epoch": 0.01, + "grad_norm": 51.391082763671875, + "learning_rate": 5.400000000000001e-06, + "loss": 4.8538, + "step": 2700 + }, + { + "epoch": 0.01, + "grad_norm": 50.51924133300781, + "learning_rate": 5.600000000000001e-06, + "loss": 4.5733, + "step": 2800 + }, + { + "epoch": 0.01, + "grad_norm": 33.91701889038086, + "learning_rate": 5.8e-06, + "loss": 4.1586, + "step": 2900 + }, + { + "epoch": 0.01, + "grad_norm": 41.544532775878906, + "learning_rate": 6e-06, + "loss": 3.8914, + "step": 3000 + }, + { + "epoch": 0.02, + "grad_norm": 44.3348274230957, + "learning_rate": 6.200000000000001e-06, + "loss": 3.4145, + "step": 3100 + }, + { + "epoch": 0.02, + "grad_norm": 27.11107063293457, + "learning_rate": 6.4000000000000006e-06, + "loss": 3.2646, + "step": 3200 + }, + { + "epoch": 0.02, + "grad_norm": 144.55479431152344, + "learning_rate": 6.600000000000001e-06, + "loss": 3.1211, + "step": 3300 + }, + { + "epoch": 0.02, + "grad_norm": 16.845191955566406, + "learning_rate": 6.800000000000001e-06, + "loss": 3.0997, + "step": 3400 + }, + { + "epoch": 0.02, + "grad_norm": 13.76279067993164, + "learning_rate": 7e-06, + "loss": 2.7326, + "step": 3500 + }, + { + "epoch": 0.02, + "grad_norm": 10.799291610717773, + "learning_rate": 7.2000000000000005e-06, + "loss": 2.743, + "step": 3600 + }, + { + "epoch": 0.02, + "grad_norm": 10.540129661560059, + "learning_rate": 7.4e-06, + "loss": 2.6056, + "step": 3700 + }, + { + "epoch": 0.02, + "grad_norm": 11.966181755065918, + "learning_rate": 7.600000000000001e-06, + "loss": 2.4526, + "step": 3800 + }, + { + "epoch": 0.02, + "grad_norm": 16.517465591430664, + "learning_rate": 7.800000000000002e-06, + "loss": 2.419, + "step": 3900 + }, + { + "epoch": 0.02, + "grad_norm": 11.533743858337402, + "learning_rate": 8.000000000000001e-06, + "loss": 2.3781, + "step": 4000 + }, + { + "epoch": 0.02, + "grad_norm": 12.458216667175293, + "learning_rate": 8.2e-06, + "loss": 2.296, + "step": 4100 + }, + { + "epoch": 0.02, + "grad_norm": 7.576855182647705, + "learning_rate": 8.400000000000001e-06, + "loss": 2.269, + "step": 4200 + }, + { + "epoch": 0.02, + "grad_norm": 6.14406681060791, + "learning_rate": 8.6e-06, + "loss": 2.2047, + "step": 4300 + }, + { + "epoch": 0.02, + "grad_norm": 8.389999389648438, + "learning_rate": 8.8e-06, + "loss": 2.1709, + "step": 4400 + }, + { + "epoch": 0.02, + "grad_norm": 9.2235689163208, + "learning_rate": 9e-06, + "loss": 2.0764, + "step": 4500 + }, + { + "epoch": 0.02, + "grad_norm": 19.56966209411621, + "learning_rate": 9.200000000000002e-06, + "loss": 2.0616, + "step": 4600 + }, + { + "epoch": 0.02, + "grad_norm": 14.260141372680664, + "learning_rate": 9.4e-06, + "loss": 2.0158, + "step": 4700 + }, + { + "epoch": 0.02, + "grad_norm": 6.950816631317139, + "learning_rate": 9.600000000000001e-06, + "loss": 1.9903, + "step": 4800 + }, + { + "epoch": 0.02, + "grad_norm": 6.523265361785889, + "learning_rate": 9.800000000000001e-06, + "loss": 1.9903, + "step": 4900 + }, + { + "epoch": 0.02, + "grad_norm": 5.682758331298828, + "learning_rate": 1e-05, + "loss": 2.0119, + "step": 5000 + }, + { + "epoch": 0.03, + "grad_norm": 6.691493988037109, + "learning_rate": 9.99937486520531e-06, + "loss": 1.966, + "step": 5100 + }, + { + "epoch": 0.03, + "grad_norm": 6.045505046844482, + "learning_rate": 9.99874973041062e-06, + "loss": 1.9175, + "step": 5200 + }, + { + "epoch": 0.03, + "grad_norm": 5.446534633636475, + "learning_rate": 9.99812459561593e-06, + "loss": 1.9086, + "step": 5300 + }, + { + "epoch": 0.03, + "grad_norm": 6.22329044342041, + "learning_rate": 9.99749946082124e-06, + "loss": 1.8706, + "step": 5400 + }, + { + "epoch": 0.03, + "grad_norm": 6.028206825256348, + "learning_rate": 9.99687432602655e-06, + "loss": 1.8183, + "step": 5500 + }, + { + "epoch": 0.03, + "grad_norm": 5.474781036376953, + "learning_rate": 9.99624919123186e-06, + "loss": 1.9045, + "step": 5600 + }, + { + "epoch": 0.03, + "grad_norm": 4.7177886962890625, + "learning_rate": 9.99562405643717e-06, + "loss": 1.8141, + "step": 5700 + }, + { + "epoch": 0.03, + "grad_norm": 6.469454288482666, + "learning_rate": 9.99499892164248e-06, + "loss": 1.8079, + "step": 5800 + }, + { + "epoch": 0.03, + "grad_norm": 5.826772689819336, + "learning_rate": 9.99437378684779e-06, + "loss": 1.852, + "step": 5900 + }, + { + "epoch": 0.03, + "grad_norm": 4.368248462677002, + "learning_rate": 9.9937486520531e-06, + "loss": 1.8207, + "step": 6000 + }, + { + "epoch": 0.03, + "grad_norm": 6.686717987060547, + "learning_rate": 9.99312351725841e-06, + "loss": 1.7868, + "step": 6100 + }, + { + "epoch": 0.03, + "grad_norm": 4.562761306762695, + "learning_rate": 9.99249838246372e-06, + "loss": 1.8201, + "step": 6200 + }, + { + "epoch": 0.03, + "grad_norm": 4.788825511932373, + "learning_rate": 9.99187324766903e-06, + "loss": 1.7568, + "step": 6300 + }, + { + "epoch": 0.03, + "grad_norm": 5.2414870262146, + "learning_rate": 9.991248112874338e-06, + "loss": 1.7635, + "step": 6400 + }, + { + "epoch": 0.03, + "grad_norm": 4.775527477264404, + "learning_rate": 9.99062297807965e-06, + "loss": 1.7465, + "step": 6500 + }, + { + "epoch": 0.03, + "grad_norm": 5.719698429107666, + "learning_rate": 9.989997843284958e-06, + "loss": 1.7776, + "step": 6600 + }, + { + "epoch": 0.03, + "grad_norm": 5.103610515594482, + "learning_rate": 9.98937270849027e-06, + "loss": 1.7364, + "step": 6700 + }, + { + "epoch": 0.03, + "grad_norm": 5.553420543670654, + "learning_rate": 9.988747573695578e-06, + "loss": 1.7341, + "step": 6800 + }, + { + "epoch": 0.03, + "grad_norm": 4.687087535858154, + "learning_rate": 9.98812243890089e-06, + "loss": 1.7586, + "step": 6900 + }, + { + "epoch": 0.03, + "grad_norm": 6.242082595825195, + "learning_rate": 9.987497304106198e-06, + "loss": 1.7255, + "step": 7000 + }, + { + "epoch": 0.04, + "grad_norm": 7.58695650100708, + "learning_rate": 9.98687216931151e-06, + "loss": 1.724, + "step": 7100 + }, + { + "epoch": 0.04, + "grad_norm": 4.764819622039795, + "learning_rate": 9.986247034516818e-06, + "loss": 1.6934, + "step": 7200 + }, + { + "epoch": 0.04, + "grad_norm": 5.300253868103027, + "learning_rate": 9.985621899722129e-06, + "loss": 1.6773, + "step": 7300 + }, + { + "epoch": 0.04, + "grad_norm": 4.917991638183594, + "learning_rate": 9.984996764927437e-06, + "loss": 1.7492, + "step": 7400 + }, + { + "epoch": 0.04, + "grad_norm": 9.860074043273926, + "learning_rate": 9.984371630132749e-06, + "loss": 1.6835, + "step": 7500 + }, + { + "epoch": 0.04, + "grad_norm": 4.517050743103027, + "learning_rate": 9.983746495338059e-06, + "loss": 1.6981, + "step": 7600 + }, + { + "epoch": 0.04, + "grad_norm": 4.88366174697876, + "learning_rate": 9.983121360543367e-06, + "loss": 1.7226, + "step": 7700 + }, + { + "epoch": 0.04, + "grad_norm": 4.612452983856201, + "learning_rate": 9.982496225748679e-06, + "loss": 1.6904, + "step": 7800 + }, + { + "epoch": 0.04, + "grad_norm": 4.865972518920898, + "learning_rate": 9.981871090953987e-06, + "loss": 1.6969, + "step": 7900 + }, + { + "epoch": 0.04, + "grad_norm": 4.375401973724365, + "learning_rate": 9.981245956159299e-06, + "loss": 1.6524, + "step": 8000 + }, + { + "epoch": 0.04, + "grad_norm": 5.660288333892822, + "learning_rate": 9.980620821364607e-06, + "loss": 1.6866, + "step": 8100 + }, + { + "epoch": 0.04, + "grad_norm": 4.874125957489014, + "learning_rate": 9.979995686569918e-06, + "loss": 1.6697, + "step": 8200 + }, + { + "epoch": 0.04, + "grad_norm": 5.102114200592041, + "learning_rate": 9.979370551775227e-06, + "loss": 1.6429, + "step": 8300 + }, + { + "epoch": 0.04, + "grad_norm": 4.398207664489746, + "learning_rate": 9.978745416980538e-06, + "loss": 1.6489, + "step": 8400 + }, + { + "epoch": 0.04, + "grad_norm": 8.623647689819336, + "learning_rate": 9.978120282185847e-06, + "loss": 1.6278, + "step": 8500 + }, + { + "epoch": 0.04, + "grad_norm": 4.62777853012085, + "learning_rate": 9.977495147391158e-06, + "loss": 1.6555, + "step": 8600 + }, + { + "epoch": 0.04, + "grad_norm": 5.873004913330078, + "learning_rate": 9.976870012596466e-06, + "loss": 1.6624, + "step": 8700 + }, + { + "epoch": 0.04, + "grad_norm": 4.456192493438721, + "learning_rate": 9.976244877801778e-06, + "loss": 1.6201, + "step": 8800 + }, + { + "epoch": 0.04, + "grad_norm": 5.2842183113098145, + "learning_rate": 9.975619743007086e-06, + "loss": 1.6334, + "step": 8900 + }, + { + "epoch": 0.04, + "grad_norm": 5.298410415649414, + "learning_rate": 9.974994608212396e-06, + "loss": 1.6472, + "step": 9000 + }, + { + "epoch": 0.05, + "grad_norm": 5.887086391448975, + "learning_rate": 9.974369473417706e-06, + "loss": 1.6124, + "step": 9100 + }, + { + "epoch": 0.05, + "grad_norm": 4.660162925720215, + "learning_rate": 9.973744338623016e-06, + "loss": 1.6131, + "step": 9200 + }, + { + "epoch": 0.05, + "grad_norm": 5.750434398651123, + "learning_rate": 9.973119203828326e-06, + "loss": 1.6205, + "step": 9300 + }, + { + "epoch": 0.05, + "grad_norm": 5.6416707038879395, + "learning_rate": 9.972494069033636e-06, + "loss": 1.619, + "step": 9400 + }, + { + "epoch": 0.05, + "grad_norm": 4.29620361328125, + "learning_rate": 9.971868934238946e-06, + "loss": 1.6062, + "step": 9500 + }, + { + "epoch": 0.05, + "grad_norm": 4.172244071960449, + "learning_rate": 9.971243799444256e-06, + "loss": 1.6139, + "step": 9600 + }, + { + "epoch": 0.05, + "grad_norm": 3.9731390476226807, + "learning_rate": 9.970618664649566e-06, + "loss": 1.5706, + "step": 9700 + }, + { + "epoch": 0.05, + "grad_norm": 4.9260454177856445, + "learning_rate": 9.969993529854876e-06, + "loss": 1.6224, + "step": 9800 + }, + { + "epoch": 0.05, + "grad_norm": 5.228986740112305, + "learning_rate": 9.969368395060186e-06, + "loss": 1.5969, + "step": 9900 + }, + { + "epoch": 0.05, + "grad_norm": 5.494061470031738, + "learning_rate": 9.968743260265495e-06, + "loss": 1.6129, + "step": 10000 + }, + { + "epoch": 0.05, + "grad_norm": 4.893834590911865, + "learning_rate": 9.968118125470805e-06, + "loss": 1.5918, + "step": 10100 + }, + { + "epoch": 0.05, + "grad_norm": 4.201370716094971, + "learning_rate": 9.967492990676115e-06, + "loss": 1.571, + "step": 10200 + }, + { + "epoch": 0.05, + "grad_norm": 6.3033576011657715, + "learning_rate": 9.966867855881425e-06, + "loss": 1.6216, + "step": 10300 + }, + { + "epoch": 0.05, + "grad_norm": 5.211835861206055, + "learning_rate": 9.966242721086735e-06, + "loss": 1.6006, + "step": 10400 + }, + { + "epoch": 0.05, + "grad_norm": 4.3779730796813965, + "learning_rate": 9.965617586292045e-06, + "loss": 1.6184, + "step": 10500 + }, + { + "epoch": 0.05, + "grad_norm": 4.778099060058594, + "learning_rate": 9.964992451497355e-06, + "loss": 1.5855, + "step": 10600 + }, + { + "epoch": 0.05, + "grad_norm": 7.489856243133545, + "learning_rate": 9.964367316702665e-06, + "loss": 1.5827, + "step": 10700 + }, + { + "epoch": 0.05, + "grad_norm": 4.601972579956055, + "learning_rate": 9.963742181907975e-06, + "loss": 1.6029, + "step": 10800 + }, + { + "epoch": 0.05, + "grad_norm": 4.222909450531006, + "learning_rate": 9.963117047113285e-06, + "loss": 1.6027, + "step": 10900 + }, + { + "epoch": 0.05, + "grad_norm": 4.561893939971924, + "learning_rate": 9.962491912318595e-06, + "loss": 1.592, + "step": 11000 + }, + { + "epoch": 0.05, + "grad_norm": 4.908820152282715, + "learning_rate": 9.961866777523905e-06, + "loss": 1.612, + "step": 11100 + }, + { + "epoch": 0.06, + "grad_norm": 4.185163974761963, + "learning_rate": 9.961241642729215e-06, + "loss": 1.5834, + "step": 11200 + }, + { + "epoch": 0.06, + "grad_norm": 5.215177536010742, + "learning_rate": 9.960616507934524e-06, + "loss": 1.5741, + "step": 11300 + }, + { + "epoch": 0.06, + "grad_norm": 6.018292427062988, + "learning_rate": 9.959991373139834e-06, + "loss": 1.5687, + "step": 11400 + }, + { + "epoch": 0.06, + "grad_norm": 6.539705276489258, + "learning_rate": 9.959366238345144e-06, + "loss": 1.5967, + "step": 11500 + }, + { + "epoch": 0.06, + "grad_norm": 5.168763637542725, + "learning_rate": 9.958741103550454e-06, + "loss": 1.5716, + "step": 11600 + }, + { + "epoch": 0.06, + "grad_norm": 4.548024654388428, + "learning_rate": 9.958115968755764e-06, + "loss": 1.5576, + "step": 11700 + }, + { + "epoch": 0.06, + "grad_norm": 5.756062030792236, + "learning_rate": 9.957490833961072e-06, + "loss": 1.5742, + "step": 11800 + }, + { + "epoch": 0.06, + "grad_norm": 5.219858646392822, + "learning_rate": 9.956865699166384e-06, + "loss": 1.5807, + "step": 11900 + }, + { + "epoch": 0.06, + "grad_norm": 4.460545063018799, + "learning_rate": 9.956240564371692e-06, + "loss": 1.5859, + "step": 12000 + }, + { + "epoch": 0.06, + "grad_norm": 4.683807849884033, + "learning_rate": 9.955615429577004e-06, + "loss": 1.5769, + "step": 12100 + }, + { + "epoch": 0.06, + "grad_norm": 6.083448886871338, + "learning_rate": 9.954990294782312e-06, + "loss": 1.5701, + "step": 12200 + }, + { + "epoch": 0.06, + "grad_norm": 5.151342391967773, + "learning_rate": 9.954365159987624e-06, + "loss": 1.5834, + "step": 12300 + }, + { + "epoch": 0.06, + "grad_norm": 4.23958158493042, + "learning_rate": 9.953740025192932e-06, + "loss": 1.5797, + "step": 12400 + }, + { + "epoch": 0.06, + "grad_norm": 5.883495330810547, + "learning_rate": 9.953114890398244e-06, + "loss": 1.5584, + "step": 12500 + }, + { + "epoch": 0.06, + "grad_norm": 4.5151190757751465, + "learning_rate": 9.952489755603552e-06, + "loss": 1.5723, + "step": 12600 + }, + { + "epoch": 0.06, + "grad_norm": 4.234920024871826, + "learning_rate": 9.951864620808863e-06, + "loss": 1.5404, + "step": 12700 + }, + { + "epoch": 0.06, + "grad_norm": 4.942254543304443, + "learning_rate": 9.951239486014173e-06, + "loss": 1.5373, + "step": 12800 + }, + { + "epoch": 0.06, + "grad_norm": 4.068475723266602, + "learning_rate": 9.950614351219483e-06, + "loss": 1.5584, + "step": 12900 + }, + { + "epoch": 0.06, + "grad_norm": 4.250300407409668, + "learning_rate": 9.949989216424793e-06, + "loss": 1.6023, + "step": 13000 + }, + { + "epoch": 0.06, + "grad_norm": 4.279661178588867, + "learning_rate": 9.949364081630101e-06, + "loss": 1.5517, + "step": 13100 + }, + { + "epoch": 0.07, + "grad_norm": 4.5876946449279785, + "learning_rate": 9.948738946835413e-06, + "loss": 1.5142, + "step": 13200 + }, + { + "epoch": 0.07, + "grad_norm": 4.220710754394531, + "learning_rate": 9.948113812040721e-06, + "loss": 1.601, + "step": 13300 + }, + { + "epoch": 0.07, + "grad_norm": 4.183436870574951, + "learning_rate": 9.947488677246033e-06, + "loss": 1.5508, + "step": 13400 + }, + { + "epoch": 0.07, + "grad_norm": 4.20064640045166, + "learning_rate": 9.946863542451341e-06, + "loss": 1.532, + "step": 13500 + }, + { + "epoch": 0.07, + "grad_norm": 5.151244640350342, + "learning_rate": 9.946238407656653e-06, + "loss": 1.5256, + "step": 13600 + }, + { + "epoch": 0.07, + "grad_norm": 4.308995246887207, + "learning_rate": 9.945613272861961e-06, + "loss": 1.5349, + "step": 13700 + }, + { + "epoch": 0.07, + "grad_norm": 5.477377891540527, + "learning_rate": 9.944988138067273e-06, + "loss": 1.5365, + "step": 13800 + }, + { + "epoch": 0.07, + "grad_norm": 5.085025310516357, + "learning_rate": 9.944363003272581e-06, + "loss": 1.5038, + "step": 13900 + }, + { + "epoch": 0.07, + "grad_norm": 4.769080638885498, + "learning_rate": 9.943737868477893e-06, + "loss": 1.5387, + "step": 14000 + }, + { + "epoch": 0.07, + "grad_norm": 4.0054931640625, + "learning_rate": 9.9431127336832e-06, + "loss": 1.5018, + "step": 14100 + }, + { + "epoch": 0.07, + "grad_norm": 4.712356090545654, + "learning_rate": 9.942487598888512e-06, + "loss": 1.5049, + "step": 14200 + }, + { + "epoch": 0.07, + "grad_norm": 4.574007034301758, + "learning_rate": 9.94186246409382e-06, + "loss": 1.5316, + "step": 14300 + }, + { + "epoch": 0.07, + "grad_norm": 4.079704761505127, + "learning_rate": 9.94123732929913e-06, + "loss": 1.527, + "step": 14400 + }, + { + "epoch": 0.07, + "grad_norm": 4.1134490966796875, + "learning_rate": 9.94061219450444e-06, + "loss": 1.5472, + "step": 14500 + }, + { + "epoch": 0.07, + "grad_norm": 5.486052989959717, + "learning_rate": 9.93998705970975e-06, + "loss": 1.5063, + "step": 14600 + }, + { + "epoch": 0.07, + "grad_norm": 4.8249921798706055, + "learning_rate": 9.93936192491506e-06, + "loss": 1.5116, + "step": 14700 + }, + { + "epoch": 0.07, + "grad_norm": 4.1462931632995605, + "learning_rate": 9.93873679012037e-06, + "loss": 1.5259, + "step": 14800 + }, + { + "epoch": 0.07, + "grad_norm": 4.222506999969482, + "learning_rate": 9.93811165532568e-06, + "loss": 1.5378, + "step": 14900 + }, + { + "epoch": 0.07, + "grad_norm": 3.8890185356140137, + "learning_rate": 9.93748652053099e-06, + "loss": 1.5424, + "step": 15000 + }, + { + "epoch": 0.07, + "grad_norm": 6.006450176239014, + "learning_rate": 9.9368613857363e-06, + "loss": 1.5276, + "step": 15100 + }, + { + "epoch": 0.08, + "grad_norm": 4.570881366729736, + "learning_rate": 9.93623625094161e-06, + "loss": 1.5173, + "step": 15200 + }, + { + "epoch": 0.08, + "grad_norm": 4.1144633293151855, + "learning_rate": 9.93561111614692e-06, + "loss": 1.4905, + "step": 15300 + }, + { + "epoch": 0.08, + "grad_norm": 5.034255027770996, + "learning_rate": 9.93498598135223e-06, + "loss": 1.5321, + "step": 15400 + }, + { + "epoch": 0.08, + "grad_norm": 4.831255912780762, + "learning_rate": 9.93436084655754e-06, + "loss": 1.5297, + "step": 15500 + }, + { + "epoch": 0.08, + "grad_norm": 4.800346851348877, + "learning_rate": 9.93373571176285e-06, + "loss": 1.5109, + "step": 15600 + }, + { + "epoch": 0.08, + "grad_norm": 4.187744617462158, + "learning_rate": 9.93311057696816e-06, + "loss": 1.4896, + "step": 15700 + }, + { + "epoch": 0.08, + "grad_norm": 4.569481372833252, + "learning_rate": 9.93248544217347e-06, + "loss": 1.5254, + "step": 15800 + }, + { + "epoch": 0.08, + "grad_norm": 4.826055526733398, + "learning_rate": 9.93186030737878e-06, + "loss": 1.5003, + "step": 15900 + }, + { + "epoch": 0.08, + "grad_norm": 5.592813491821289, + "learning_rate": 9.93123517258409e-06, + "loss": 1.5386, + "step": 16000 + }, + { + "epoch": 0.08, + "grad_norm": 4.18519926071167, + "learning_rate": 9.9306100377894e-06, + "loss": 1.5192, + "step": 16100 + }, + { + "epoch": 0.08, + "grad_norm": 3.737257719039917, + "learning_rate": 9.92998490299471e-06, + "loss": 1.5049, + "step": 16200 + }, + { + "epoch": 0.08, + "grad_norm": 4.077634334564209, + "learning_rate": 9.929359768200019e-06, + "loss": 1.4786, + "step": 16300 + }, + { + "epoch": 0.08, + "grad_norm": 3.7141683101654053, + "learning_rate": 9.928734633405329e-06, + "loss": 1.5008, + "step": 16400 + }, + { + "epoch": 0.08, + "grad_norm": 5.656344413757324, + "learning_rate": 9.928109498610639e-06, + "loss": 1.4907, + "step": 16500 + }, + { + "epoch": 0.08, + "grad_norm": 4.02158784866333, + "learning_rate": 9.927484363815949e-06, + "loss": 1.5436, + "step": 16600 + }, + { + "epoch": 0.08, + "grad_norm": 4.094794273376465, + "learning_rate": 9.926859229021259e-06, + "loss": 1.5135, + "step": 16700 + }, + { + "epoch": 0.08, + "grad_norm": 5.3504533767700195, + "learning_rate": 9.926234094226569e-06, + "loss": 1.4825, + "step": 16800 + }, + { + "epoch": 0.08, + "grad_norm": 4.326951503753662, + "learning_rate": 9.925608959431879e-06, + "loss": 1.5042, + "step": 16900 + }, + { + "epoch": 0.08, + "grad_norm": 4.341583728790283, + "learning_rate": 9.924983824637187e-06, + "loss": 1.5239, + "step": 17000 + }, + { + "epoch": 0.08, + "grad_norm": 4.5446648597717285, + "learning_rate": 9.924358689842499e-06, + "loss": 1.5104, + "step": 17100 + }, + { + "epoch": 0.09, + "grad_norm": 4.787079811096191, + "learning_rate": 9.923733555047807e-06, + "loss": 1.4917, + "step": 17200 + }, + { + "epoch": 0.09, + "grad_norm": 4.259307384490967, + "learning_rate": 9.923108420253118e-06, + "loss": 1.5162, + "step": 17300 + }, + { + "epoch": 0.09, + "grad_norm": 4.553911209106445, + "learning_rate": 9.922483285458427e-06, + "loss": 1.4805, + "step": 17400 + }, + { + "epoch": 0.09, + "grad_norm": 4.846059322357178, + "learning_rate": 9.921858150663738e-06, + "loss": 1.51, + "step": 17500 + }, + { + "epoch": 0.09, + "grad_norm": 4.385834217071533, + "learning_rate": 9.921233015869046e-06, + "loss": 1.4883, + "step": 17600 + }, + { + "epoch": 0.09, + "grad_norm": 4.686222553253174, + "learning_rate": 9.920607881074358e-06, + "loss": 1.525, + "step": 17700 + }, + { + "epoch": 0.09, + "grad_norm": 3.7362844944000244, + "learning_rate": 9.919982746279666e-06, + "loss": 1.4877, + "step": 17800 + }, + { + "epoch": 0.09, + "grad_norm": 4.107117652893066, + "learning_rate": 9.919357611484978e-06, + "loss": 1.5353, + "step": 17900 + }, + { + "epoch": 0.09, + "grad_norm": 4.318368911743164, + "learning_rate": 9.918732476690288e-06, + "loss": 1.4682, + "step": 18000 + }, + { + "epoch": 0.09, + "grad_norm": 4.914721488952637, + "learning_rate": 9.918107341895598e-06, + "loss": 1.5261, + "step": 18100 + }, + { + "epoch": 0.09, + "grad_norm": 4.494168281555176, + "learning_rate": 9.917482207100908e-06, + "loss": 1.5013, + "step": 18200 + }, + { + "epoch": 0.09, + "grad_norm": 5.026334762573242, + "learning_rate": 9.916857072306216e-06, + "loss": 1.5093, + "step": 18300 + }, + { + "epoch": 0.09, + "grad_norm": 4.3312907218933105, + "learning_rate": 9.916231937511528e-06, + "loss": 1.5224, + "step": 18400 + }, + { + "epoch": 0.09, + "grad_norm": 3.9422335624694824, + "learning_rate": 9.915606802716836e-06, + "loss": 1.5059, + "step": 18500 + }, + { + "epoch": 0.09, + "grad_norm": 4.773715496063232, + "learning_rate": 9.914981667922147e-06, + "loss": 1.5031, + "step": 18600 + }, + { + "epoch": 0.09, + "grad_norm": 5.202546119689941, + "learning_rate": 9.914356533127456e-06, + "loss": 1.5133, + "step": 18700 + }, + { + "epoch": 0.09, + "grad_norm": 4.315513610839844, + "learning_rate": 9.913731398332767e-06, + "loss": 1.5343, + "step": 18800 + }, + { + "epoch": 0.09, + "grad_norm": 4.325439929962158, + "learning_rate": 9.913106263538076e-06, + "loss": 1.4993, + "step": 18900 + }, + { + "epoch": 0.09, + "grad_norm": 4.189039707183838, + "learning_rate": 9.912481128743387e-06, + "loss": 1.4871, + "step": 19000 + }, + { + "epoch": 0.09, + "grad_norm": 4.019628524780273, + "learning_rate": 9.911855993948695e-06, + "loss": 1.5294, + "step": 19100 + }, + { + "epoch": 0.09, + "grad_norm": 3.682359457015991, + "learning_rate": 9.911230859154007e-06, + "loss": 1.4728, + "step": 19200 + }, + { + "epoch": 0.1, + "grad_norm": 3.9543027877807617, + "learning_rate": 9.910605724359315e-06, + "loss": 1.473, + "step": 19300 + }, + { + "epoch": 0.1, + "grad_norm": 3.9522476196289062, + "learning_rate": 9.909980589564627e-06, + "loss": 1.5023, + "step": 19400 + }, + { + "epoch": 0.1, + "grad_norm": 3.9608728885650635, + "learning_rate": 9.909355454769935e-06, + "loss": 1.4532, + "step": 19500 + }, + { + "epoch": 0.1, + "grad_norm": 4.24020528793335, + "learning_rate": 9.908730319975245e-06, + "loss": 1.4691, + "step": 19600 + }, + { + "epoch": 0.1, + "grad_norm": 4.217301845550537, + "learning_rate": 9.908105185180555e-06, + "loss": 1.4704, + "step": 19700 + }, + { + "epoch": 0.1, + "grad_norm": 3.651137113571167, + "learning_rate": 9.907480050385865e-06, + "loss": 1.4933, + "step": 19800 + }, + { + "epoch": 0.1, + "grad_norm": 4.657069206237793, + "learning_rate": 9.906854915591175e-06, + "loss": 1.4778, + "step": 19900 + }, + { + "epoch": 0.1, + "grad_norm": 4.250380992889404, + "learning_rate": 9.906229780796485e-06, + "loss": 1.4931, + "step": 20000 + }, + { + "epoch": 0.1, + "grad_norm": 4.308979034423828, + "learning_rate": 9.905604646001795e-06, + "loss": 1.4946, + "step": 20100 + }, + { + "epoch": 0.1, + "grad_norm": 4.854809284210205, + "learning_rate": 9.904979511207105e-06, + "loss": 1.4399, + "step": 20200 + }, + { + "epoch": 0.1, + "grad_norm": 4.725897789001465, + "learning_rate": 9.904354376412414e-06, + "loss": 1.4711, + "step": 20300 + }, + { + "epoch": 0.1, + "grad_norm": 4.014156818389893, + "learning_rate": 9.903729241617724e-06, + "loss": 1.5038, + "step": 20400 + }, + { + "epoch": 0.1, + "grad_norm": 6.402193546295166, + "learning_rate": 9.903104106823034e-06, + "loss": 1.4607, + "step": 20500 + }, + { + "epoch": 0.1, + "grad_norm": 4.356836795806885, + "learning_rate": 9.902478972028344e-06, + "loss": 1.4767, + "step": 20600 + }, + { + "epoch": 0.1, + "grad_norm": 4.174656867980957, + "learning_rate": 9.901853837233654e-06, + "loss": 1.4675, + "step": 20700 + }, + { + "epoch": 0.1, + "grad_norm": 3.668475866317749, + "learning_rate": 9.901228702438964e-06, + "loss": 1.4529, + "step": 20800 + }, + { + "epoch": 0.1, + "grad_norm": 3.7700912952423096, + "learning_rate": 9.900603567644274e-06, + "loss": 1.4521, + "step": 20900 + }, + { + "epoch": 0.1, + "grad_norm": 3.570835828781128, + "learning_rate": 9.899978432849584e-06, + "loss": 1.4323, + "step": 21000 + }, + { + "epoch": 0.1, + "grad_norm": 3.7499380111694336, + "learning_rate": 9.899353298054894e-06, + "loss": 1.4644, + "step": 21100 + }, + { + "epoch": 0.1, + "grad_norm": 3.8630640506744385, + "learning_rate": 9.898728163260204e-06, + "loss": 1.4801, + "step": 21200 + }, + { + "epoch": 0.11, + "grad_norm": 4.1705145835876465, + "learning_rate": 9.898103028465514e-06, + "loss": 1.4649, + "step": 21300 + }, + { + "epoch": 0.11, + "grad_norm": 4.312972545623779, + "learning_rate": 9.897477893670824e-06, + "loss": 1.4696, + "step": 21400 + }, + { + "epoch": 0.11, + "grad_norm": 4.0083088874816895, + "learning_rate": 9.896852758876134e-06, + "loss": 1.4557, + "step": 21500 + }, + { + "epoch": 0.11, + "grad_norm": 3.791517496109009, + "learning_rate": 9.896227624081444e-06, + "loss": 1.4507, + "step": 21600 + }, + { + "epoch": 0.11, + "grad_norm": 4.635531425476074, + "learning_rate": 9.895602489286753e-06, + "loss": 1.4922, + "step": 21700 + }, + { + "epoch": 0.11, + "grad_norm": 3.6790366172790527, + "learning_rate": 9.894977354492063e-06, + "loss": 1.456, + "step": 21800 + }, + { + "epoch": 0.11, + "grad_norm": 4.739284038543701, + "learning_rate": 9.894352219697373e-06, + "loss": 1.4993, + "step": 21900 + }, + { + "epoch": 0.11, + "grad_norm": 3.6052489280700684, + "learning_rate": 9.893727084902683e-06, + "loss": 1.4599, + "step": 22000 + }, + { + "epoch": 0.11, + "grad_norm": 4.582137584686279, + "learning_rate": 9.893101950107993e-06, + "loss": 1.464, + "step": 22100 + }, + { + "epoch": 0.11, + "grad_norm": 4.414693355560303, + "learning_rate": 9.892476815313303e-06, + "loss": 1.4383, + "step": 22200 + }, + { + "epoch": 0.11, + "grad_norm": 4.012635707855225, + "learning_rate": 9.891851680518613e-06, + "loss": 1.4496, + "step": 22300 + }, + { + "epoch": 0.11, + "grad_norm": 3.935889482498169, + "learning_rate": 9.891226545723921e-06, + "loss": 1.44, + "step": 22400 + }, + { + "epoch": 0.11, + "grad_norm": 3.735189199447632, + "learning_rate": 9.890601410929233e-06, + "loss": 1.4396, + "step": 22500 + }, + { + "epoch": 0.11, + "grad_norm": 7.265974998474121, + "learning_rate": 9.889976276134541e-06, + "loss": 1.4367, + "step": 22600 + }, + { + "epoch": 0.11, + "grad_norm": 3.6876790523529053, + "learning_rate": 9.889351141339853e-06, + "loss": 1.4477, + "step": 22700 + }, + { + "epoch": 0.11, + "grad_norm": 4.87882661819458, + "learning_rate": 9.888726006545161e-06, + "loss": 1.4687, + "step": 22800 + }, + { + "epoch": 0.11, + "grad_norm": 4.071088790893555, + "learning_rate": 9.888100871750473e-06, + "loss": 1.5096, + "step": 22900 + }, + { + "epoch": 0.11, + "grad_norm": 3.7322299480438232, + "learning_rate": 9.887475736955783e-06, + "loss": 1.4701, + "step": 23000 + }, + { + "epoch": 0.11, + "grad_norm": 4.085651397705078, + "learning_rate": 9.886850602161092e-06, + "loss": 1.4518, + "step": 23100 + }, + { + "epoch": 0.11, + "grad_norm": 3.951169729232788, + "learning_rate": 9.886225467366402e-06, + "loss": 1.4336, + "step": 23200 + }, + { + "epoch": 0.12, + "grad_norm": 4.026634693145752, + "learning_rate": 9.885600332571712e-06, + "loss": 1.476, + "step": 23300 + }, + { + "epoch": 0.12, + "grad_norm": 5.0271477699279785, + "learning_rate": 9.884975197777022e-06, + "loss": 1.4925, + "step": 23400 + }, + { + "epoch": 0.12, + "grad_norm": 3.727571725845337, + "learning_rate": 9.884350062982332e-06, + "loss": 1.452, + "step": 23500 + }, + { + "epoch": 0.12, + "grad_norm": 3.8245084285736084, + "learning_rate": 9.883724928187642e-06, + "loss": 1.4597, + "step": 23600 + }, + { + "epoch": 0.12, + "grad_norm": 4.508169174194336, + "learning_rate": 9.88309979339295e-06, + "loss": 1.4481, + "step": 23700 + }, + { + "epoch": 0.12, + "grad_norm": 4.080810070037842, + "learning_rate": 9.882474658598262e-06, + "loss": 1.4807, + "step": 23800 + }, + { + "epoch": 0.12, + "grad_norm": 4.269100666046143, + "learning_rate": 9.88184952380357e-06, + "loss": 1.4108, + "step": 23900 + }, + { + "epoch": 0.12, + "grad_norm": 4.10108757019043, + "learning_rate": 9.881224389008882e-06, + "loss": 1.4592, + "step": 24000 + }, + { + "epoch": 0.12, + "grad_norm": 4.869362831115723, + "learning_rate": 9.88059925421419e-06, + "loss": 1.4687, + "step": 24100 + }, + { + "epoch": 0.12, + "grad_norm": 3.7101335525512695, + "learning_rate": 9.879974119419502e-06, + "loss": 1.4748, + "step": 24200 + }, + { + "epoch": 0.12, + "grad_norm": 3.676862955093384, + "learning_rate": 9.87934898462481e-06, + "loss": 1.4439, + "step": 24300 + }, + { + "epoch": 0.12, + "grad_norm": 4.2332844734191895, + "learning_rate": 9.878723849830121e-06, + "loss": 1.4371, + "step": 24400 + }, + { + "epoch": 0.12, + "grad_norm": 3.947660207748413, + "learning_rate": 9.87809871503543e-06, + "loss": 1.4619, + "step": 24500 + }, + { + "epoch": 0.12, + "grad_norm": 4.331580638885498, + "learning_rate": 9.877473580240741e-06, + "loss": 1.4332, + "step": 24600 + }, + { + "epoch": 0.12, + "grad_norm": 5.24333381652832, + "learning_rate": 9.87684844544605e-06, + "loss": 1.4474, + "step": 24700 + }, + { + "epoch": 0.12, + "grad_norm": 3.499051570892334, + "learning_rate": 9.876223310651361e-06, + "loss": 1.4303, + "step": 24800 + }, + { + "epoch": 0.12, + "grad_norm": 3.9360058307647705, + "learning_rate": 9.87559817585667e-06, + "loss": 1.4731, + "step": 24900 + }, + { + "epoch": 0.12, + "grad_norm": 4.0263352394104, + "learning_rate": 9.87497304106198e-06, + "loss": 1.4429, + "step": 25000 + }, + { + "epoch": 0.12, + "grad_norm": 3.827765941619873, + "learning_rate": 9.87434790626729e-06, + "loss": 1.4312, + "step": 25100 + }, + { + "epoch": 0.12, + "grad_norm": 3.680577278137207, + "learning_rate": 9.8737227714726e-06, + "loss": 1.4571, + "step": 25200 + }, + { + "epoch": 0.13, + "grad_norm": 4.409987926483154, + "learning_rate": 9.873097636677909e-06, + "loss": 1.4708, + "step": 25300 + }, + { + "epoch": 0.13, + "grad_norm": 4.233061790466309, + "learning_rate": 9.872472501883219e-06, + "loss": 1.4079, + "step": 25400 + }, + { + "epoch": 0.13, + "grad_norm": 3.923621416091919, + "learning_rate": 9.871847367088529e-06, + "loss": 1.4389, + "step": 25500 + }, + { + "epoch": 0.13, + "grad_norm": 3.695704698562622, + "learning_rate": 9.871222232293839e-06, + "loss": 1.448, + "step": 25600 + }, + { + "epoch": 0.13, + "grad_norm": 4.188453674316406, + "learning_rate": 9.870597097499149e-06, + "loss": 1.4356, + "step": 25700 + }, + { + "epoch": 0.13, + "grad_norm": 4.8711934089660645, + "learning_rate": 9.869971962704459e-06, + "loss": 1.4221, + "step": 25800 + }, + { + "epoch": 0.13, + "grad_norm": 4.016773223876953, + "learning_rate": 9.869346827909769e-06, + "loss": 1.4409, + "step": 25900 + }, + { + "epoch": 0.13, + "grad_norm": 4.488391399383545, + "learning_rate": 9.868721693115079e-06, + "loss": 1.4609, + "step": 26000 + }, + { + "epoch": 0.13, + "grad_norm": 4.66510534286499, + "learning_rate": 9.868096558320389e-06, + "loss": 1.4132, + "step": 26100 + }, + { + "epoch": 0.13, + "grad_norm": 3.9409704208374023, + "learning_rate": 9.867471423525698e-06, + "loss": 1.4406, + "step": 26200 + }, + { + "epoch": 0.13, + "grad_norm": 3.3582875728607178, + "learning_rate": 9.866846288731008e-06, + "loss": 1.438, + "step": 26300 + }, + { + "epoch": 0.13, + "grad_norm": 4.238399505615234, + "learning_rate": 9.866221153936318e-06, + "loss": 1.4201, + "step": 26400 + }, + { + "epoch": 0.13, + "grad_norm": 3.6502299308776855, + "learning_rate": 9.865596019141628e-06, + "loss": 1.4592, + "step": 26500 + }, + { + "epoch": 0.13, + "grad_norm": 3.8077006340026855, + "learning_rate": 9.864970884346938e-06, + "loss": 1.4556, + "step": 26600 + }, + { + "epoch": 0.13, + "grad_norm": 4.270641326904297, + "learning_rate": 9.864345749552248e-06, + "loss": 1.4101, + "step": 26700 + }, + { + "epoch": 0.13, + "grad_norm": 4.743376731872559, + "learning_rate": 9.863720614757558e-06, + "loss": 1.4719, + "step": 26800 + }, + { + "epoch": 0.13, + "grad_norm": 4.345736980438232, + "learning_rate": 9.863095479962868e-06, + "loss": 1.4002, + "step": 26900 + }, + { + "epoch": 0.13, + "grad_norm": 3.8192217350006104, + "learning_rate": 9.862470345168178e-06, + "loss": 1.4697, + "step": 27000 + }, + { + "epoch": 0.13, + "grad_norm": 4.685102939605713, + "learning_rate": 9.861845210373488e-06, + "loss": 1.4519, + "step": 27100 + }, + { + "epoch": 0.13, + "grad_norm": 3.690993070602417, + "learning_rate": 9.861220075578798e-06, + "loss": 1.4424, + "step": 27200 + }, + { + "epoch": 0.14, + "grad_norm": 3.8806326389312744, + "learning_rate": 9.860594940784108e-06, + "loss": 1.4416, + "step": 27300 + }, + { + "epoch": 0.14, + "grad_norm": 3.362546443939209, + "learning_rate": 9.859969805989418e-06, + "loss": 1.4245, + "step": 27400 + }, + { + "epoch": 0.14, + "grad_norm": 4.167792320251465, + "learning_rate": 9.859344671194728e-06, + "loss": 1.4645, + "step": 27500 + }, + { + "epoch": 0.14, + "grad_norm": 4.120845317840576, + "learning_rate": 9.858719536400036e-06, + "loss": 1.3847, + "step": 27600 + }, + { + "epoch": 0.14, + "grad_norm": 3.8441598415374756, + "learning_rate": 9.858094401605347e-06, + "loss": 1.4244, + "step": 27700 + }, + { + "epoch": 0.14, + "grad_norm": 3.3462672233581543, + "learning_rate": 9.857469266810656e-06, + "loss": 1.4401, + "step": 27800 + }, + { + "epoch": 0.14, + "grad_norm": 4.065661430358887, + "learning_rate": 9.856844132015967e-06, + "loss": 1.4212, + "step": 27900 + }, + { + "epoch": 0.14, + "grad_norm": 3.5907657146453857, + "learning_rate": 9.856218997221275e-06, + "loss": 1.4639, + "step": 28000 + }, + { + "epoch": 0.14, + "grad_norm": 3.701472759246826, + "learning_rate": 9.855593862426587e-06, + "loss": 1.4052, + "step": 28100 + }, + { + "epoch": 0.14, + "grad_norm": 3.7131853103637695, + "learning_rate": 9.854968727631897e-06, + "loss": 1.4237, + "step": 28200 + }, + { + "epoch": 0.14, + "grad_norm": 3.156214475631714, + "learning_rate": 9.854343592837207e-06, + "loss": 1.4364, + "step": 28300 + }, + { + "epoch": 0.14, + "grad_norm": 4.9435715675354, + "learning_rate": 9.853718458042517e-06, + "loss": 1.4352, + "step": 28400 + }, + { + "epoch": 0.14, + "grad_norm": 3.94811749458313, + "learning_rate": 9.853093323247827e-06, + "loss": 1.4303, + "step": 28500 + }, + { + "epoch": 0.14, + "grad_norm": 3.5269935131073, + "learning_rate": 9.852468188453137e-06, + "loss": 1.4214, + "step": 28600 + }, + { + "epoch": 0.14, + "grad_norm": 4.688473224639893, + "learning_rate": 9.851843053658447e-06, + "loss": 1.3854, + "step": 28700 + }, + { + "epoch": 0.14, + "grad_norm": 4.054961204528809, + "learning_rate": 9.851217918863757e-06, + "loss": 1.432, + "step": 28800 + }, + { + "epoch": 0.14, + "grad_norm": 3.178467273712158, + "learning_rate": 9.850592784069065e-06, + "loss": 1.4439, + "step": 28900 + }, + { + "epoch": 0.14, + "grad_norm": 4.031513690948486, + "learning_rate": 9.849967649274376e-06, + "loss": 1.4228, + "step": 29000 + }, + { + "epoch": 0.14, + "grad_norm": 3.9268980026245117, + "learning_rate": 9.849342514479685e-06, + "loss": 1.4002, + "step": 29100 + }, + { + "epoch": 0.14, + "grad_norm": 3.176645040512085, + "learning_rate": 9.848717379684996e-06, + "loss": 1.4732, + "step": 29200 + }, + { + "epoch": 0.14, + "grad_norm": 2.7952117919921875, + "learning_rate": 9.848092244890304e-06, + "loss": 1.4609, + "step": 29300 + }, + { + "epoch": 0.15, + "grad_norm": 3.6165409088134766, + "learning_rate": 9.847467110095616e-06, + "loss": 1.4179, + "step": 29400 + }, + { + "epoch": 0.15, + "grad_norm": 4.359500408172607, + "learning_rate": 9.846841975300924e-06, + "loss": 1.4443, + "step": 29500 + }, + { + "epoch": 0.15, + "grad_norm": 4.256430625915527, + "learning_rate": 9.846216840506236e-06, + "loss": 1.4205, + "step": 29600 + }, + { + "epoch": 0.15, + "grad_norm": 4.939763069152832, + "learning_rate": 9.845591705711544e-06, + "loss": 1.3889, + "step": 29700 + }, + { + "epoch": 0.15, + "grad_norm": 3.5934700965881348, + "learning_rate": 9.844966570916856e-06, + "loss": 1.4045, + "step": 29800 + }, + { + "epoch": 0.15, + "grad_norm": 3.0760035514831543, + "learning_rate": 9.844341436122164e-06, + "loss": 1.4534, + "step": 29900 + }, + { + "epoch": 0.15, + "grad_norm": 4.314694881439209, + "learning_rate": 9.843716301327476e-06, + "loss": 1.4284, + "step": 30000 + }, + { + "epoch": 0.15, + "grad_norm": 3.9042022228240967, + "learning_rate": 9.843091166532784e-06, + "loss": 1.4249, + "step": 30100 + }, + { + "epoch": 0.15, + "grad_norm": 3.454749822616577, + "learning_rate": 9.842466031738094e-06, + "loss": 1.4291, + "step": 30200 + }, + { + "epoch": 0.15, + "grad_norm": 3.8640189170837402, + "learning_rate": 9.841840896943404e-06, + "loss": 1.4514, + "step": 30300 + }, + { + "epoch": 0.15, + "grad_norm": 4.65750789642334, + "learning_rate": 9.841215762148714e-06, + "loss": 1.4626, + "step": 30400 + }, + { + "epoch": 0.15, + "grad_norm": 4.030206680297852, + "learning_rate": 9.840590627354024e-06, + "loss": 1.4038, + "step": 30500 + }, + { + "epoch": 0.15, + "grad_norm": 4.036793231964111, + "learning_rate": 9.839965492559334e-06, + "loss": 1.4225, + "step": 30600 + }, + { + "epoch": 0.15, + "grad_norm": 3.980349063873291, + "learning_rate": 9.839340357764643e-06, + "loss": 1.4334, + "step": 30700 + }, + { + "epoch": 0.15, + "grad_norm": 4.157260894775391, + "learning_rate": 9.838715222969953e-06, + "loss": 1.453, + "step": 30800 + }, + { + "epoch": 0.15, + "grad_norm": 3.416947841644287, + "learning_rate": 9.838090088175263e-06, + "loss": 1.4176, + "step": 30900 + }, + { + "epoch": 0.15, + "grad_norm": 5.0742645263671875, + "learning_rate": 9.837464953380573e-06, + "loss": 1.4492, + "step": 31000 + }, + { + "epoch": 0.15, + "grad_norm": 3.5331027507781982, + "learning_rate": 9.836839818585883e-06, + "loss": 1.4258, + "step": 31100 + }, + { + "epoch": 0.15, + "grad_norm": 3.268676280975342, + "learning_rate": 9.836214683791193e-06, + "loss": 1.4081, + "step": 31200 + }, + { + "epoch": 0.15, + "grad_norm": 3.619158983230591, + "learning_rate": 9.835589548996503e-06, + "loss": 1.3998, + "step": 31300 + }, + { + "epoch": 0.16, + "grad_norm": 3.513633966445923, + "learning_rate": 9.834964414201813e-06, + "loss": 1.4102, + "step": 31400 + }, + { + "epoch": 0.16, + "grad_norm": 3.7973320484161377, + "learning_rate": 9.834339279407123e-06, + "loss": 1.3846, + "step": 31500 + }, + { + "epoch": 0.16, + "grad_norm": 4.910383701324463, + "learning_rate": 9.833714144612433e-06, + "loss": 1.3886, + "step": 31600 + }, + { + "epoch": 0.16, + "grad_norm": 3.820688009262085, + "learning_rate": 9.833089009817743e-06, + "loss": 1.4178, + "step": 31700 + }, + { + "epoch": 0.16, + "grad_norm": 3.5854384899139404, + "learning_rate": 9.832463875023053e-06, + "loss": 1.4154, + "step": 31800 + }, + { + "epoch": 0.16, + "grad_norm": 3.8664228916168213, + "learning_rate": 9.831838740228363e-06, + "loss": 1.4255, + "step": 31900 + }, + { + "epoch": 0.16, + "grad_norm": 3.179574728012085, + "learning_rate": 9.831213605433673e-06, + "loss": 1.4039, + "step": 32000 + }, + { + "epoch": 0.16, + "grad_norm": 4.206747531890869, + "learning_rate": 9.830588470638982e-06, + "loss": 1.4066, + "step": 32100 + }, + { + "epoch": 0.16, + "grad_norm": 4.408509731292725, + "learning_rate": 9.829963335844292e-06, + "loss": 1.4028, + "step": 32200 + }, + { + "epoch": 0.16, + "grad_norm": 5.01927375793457, + "learning_rate": 9.829338201049602e-06, + "loss": 1.4415, + "step": 32300 + }, + { + "epoch": 0.16, + "grad_norm": 3.166085720062256, + "learning_rate": 9.828713066254912e-06, + "loss": 1.4698, + "step": 32400 + }, + { + "epoch": 0.16, + "grad_norm": 3.4799346923828125, + "learning_rate": 9.828087931460222e-06, + "loss": 1.4055, + "step": 32500 + }, + { + "epoch": 0.16, + "grad_norm": 3.68662428855896, + "learning_rate": 9.827462796665532e-06, + "loss": 1.3942, + "step": 32600 + }, + { + "epoch": 0.16, + "grad_norm": 4.4798970222473145, + "learning_rate": 9.826837661870842e-06, + "loss": 1.4293, + "step": 32700 + }, + { + "epoch": 0.16, + "grad_norm": 3.4319188594818115, + "learning_rate": 9.826212527076152e-06, + "loss": 1.4134, + "step": 32800 + }, + { + "epoch": 0.16, + "grad_norm": 3.2756521701812744, + "learning_rate": 9.825587392281462e-06, + "loss": 1.4535, + "step": 32900 + }, + { + "epoch": 0.16, + "grad_norm": 3.3544061183929443, + "learning_rate": 9.82496225748677e-06, + "loss": 1.3997, + "step": 33000 + }, + { + "epoch": 0.16, + "grad_norm": 3.7909374237060547, + "learning_rate": 9.824337122692082e-06, + "loss": 1.4043, + "step": 33100 + }, + { + "epoch": 0.16, + "grad_norm": 3.8240981101989746, + "learning_rate": 9.82371198789739e-06, + "loss": 1.428, + "step": 33200 + }, + { + "epoch": 0.16, + "grad_norm": 3.214618682861328, + "learning_rate": 9.823086853102702e-06, + "loss": 1.3999, + "step": 33300 + }, + { + "epoch": 0.17, + "grad_norm": 8.105681419372559, + "learning_rate": 9.822461718308011e-06, + "loss": 1.4082, + "step": 33400 + }, + { + "epoch": 0.17, + "grad_norm": 4.449899196624756, + "learning_rate": 9.821836583513321e-06, + "loss": 1.4273, + "step": 33500 + }, + { + "epoch": 0.17, + "grad_norm": 4.17997932434082, + "learning_rate": 9.821211448718631e-06, + "loss": 1.4291, + "step": 33600 + }, + { + "epoch": 0.17, + "grad_norm": 3.813826322555542, + "learning_rate": 9.820586313923941e-06, + "loss": 1.407, + "step": 33700 + }, + { + "epoch": 0.17, + "grad_norm": 3.7879397869110107, + "learning_rate": 9.819961179129251e-06, + "loss": 1.4159, + "step": 33800 + }, + { + "epoch": 0.17, + "grad_norm": 3.9027743339538574, + "learning_rate": 9.819336044334561e-06, + "loss": 1.439, + "step": 33900 + }, + { + "epoch": 0.17, + "grad_norm": 3.7069435119628906, + "learning_rate": 9.818710909539871e-06, + "loss": 1.4117, + "step": 34000 + }, + { + "epoch": 0.17, + "grad_norm": 3.904519557952881, + "learning_rate": 9.818085774745181e-06, + "loss": 1.4227, + "step": 34100 + }, + { + "epoch": 0.17, + "grad_norm": 3.545767068862915, + "learning_rate": 9.817460639950491e-06, + "loss": 1.4344, + "step": 34200 + }, + { + "epoch": 0.17, + "grad_norm": 3.717536687850952, + "learning_rate": 9.816835505155799e-06, + "loss": 1.447, + "step": 34300 + }, + { + "epoch": 0.17, + "grad_norm": 3.036220073699951, + "learning_rate": 9.81621037036111e-06, + "loss": 1.4124, + "step": 34400 + }, + { + "epoch": 0.17, + "grad_norm": 4.334647178649902, + "learning_rate": 9.815585235566419e-06, + "loss": 1.4347, + "step": 34500 + }, + { + "epoch": 0.17, + "grad_norm": 3.7795979976654053, + "learning_rate": 9.81496010077173e-06, + "loss": 1.417, + "step": 34600 + }, + { + "epoch": 0.17, + "grad_norm": 3.4146125316619873, + "learning_rate": 9.814334965977039e-06, + "loss": 1.3847, + "step": 34700 + }, + { + "epoch": 0.17, + "grad_norm": 3.193895101547241, + "learning_rate": 9.81370983118235e-06, + "loss": 1.4084, + "step": 34800 + }, + { + "epoch": 0.17, + "grad_norm": 3.3683910369873047, + "learning_rate": 9.813084696387659e-06, + "loss": 1.4393, + "step": 34900 + }, + { + "epoch": 0.17, + "grad_norm": 4.252421855926514, + "learning_rate": 9.81245956159297e-06, + "loss": 1.4294, + "step": 35000 + }, + { + "epoch": 0.17, + "grad_norm": 3.6922848224639893, + "learning_rate": 9.811834426798279e-06, + "loss": 1.4212, + "step": 35100 + }, + { + "epoch": 0.17, + "grad_norm": 3.27756667137146, + "learning_rate": 9.81120929200359e-06, + "loss": 1.4185, + "step": 35200 + }, + { + "epoch": 0.17, + "grad_norm": 3.377180337905884, + "learning_rate": 9.810584157208898e-06, + "loss": 1.3991, + "step": 35300 + }, + { + "epoch": 0.18, + "grad_norm": 3.4141881465911865, + "learning_rate": 9.80995902241421e-06, + "loss": 1.3816, + "step": 35400 + }, + { + "epoch": 0.18, + "grad_norm": 3.6975343227386475, + "learning_rate": 9.809333887619518e-06, + "loss": 1.4275, + "step": 35500 + }, + { + "epoch": 0.18, + "grad_norm": 3.303208112716675, + "learning_rate": 9.808708752824828e-06, + "loss": 1.3996, + "step": 35600 + }, + { + "epoch": 0.18, + "grad_norm": 3.1281208992004395, + "learning_rate": 9.808083618030138e-06, + "loss": 1.4573, + "step": 35700 + }, + { + "epoch": 0.18, + "grad_norm": 4.216818809509277, + "learning_rate": 9.807458483235448e-06, + "loss": 1.4264, + "step": 35800 + }, + { + "epoch": 0.18, + "grad_norm": 3.6236705780029297, + "learning_rate": 9.806833348440758e-06, + "loss": 1.42, + "step": 35900 + }, + { + "epoch": 0.18, + "grad_norm": 3.4652881622314453, + "learning_rate": 9.806208213646068e-06, + "loss": 1.4509, + "step": 36000 + }, + { + "epoch": 0.18, + "grad_norm": 4.3565449714660645, + "learning_rate": 9.805583078851378e-06, + "loss": 1.3968, + "step": 36100 + }, + { + "epoch": 0.18, + "grad_norm": 5.522129535675049, + "learning_rate": 9.804957944056688e-06, + "loss": 1.4402, + "step": 36200 + }, + { + "epoch": 0.18, + "grad_norm": 3.9470767974853516, + "learning_rate": 9.804332809261998e-06, + "loss": 1.3922, + "step": 36300 + }, + { + "epoch": 0.18, + "grad_norm": 3.978543758392334, + "learning_rate": 9.803707674467308e-06, + "loss": 1.403, + "step": 36400 + }, + { + "epoch": 0.18, + "grad_norm": 5.382244110107422, + "learning_rate": 9.803082539672618e-06, + "loss": 1.3968, + "step": 36500 + }, + { + "epoch": 0.18, + "grad_norm": 4.595647811889648, + "learning_rate": 9.802457404877927e-06, + "loss": 1.4002, + "step": 36600 + }, + { + "epoch": 0.18, + "grad_norm": 3.6310489177703857, + "learning_rate": 9.801832270083237e-06, + "loss": 1.4122, + "step": 36700 + }, + { + "epoch": 0.18, + "grad_norm": 3.4216363430023193, + "learning_rate": 9.801207135288547e-06, + "loss": 1.3859, + "step": 36800 + }, + { + "epoch": 0.18, + "grad_norm": 3.4577724933624268, + "learning_rate": 9.800582000493857e-06, + "loss": 1.4206, + "step": 36900 + }, + { + "epoch": 0.18, + "grad_norm": 4.211758136749268, + "learning_rate": 9.799956865699167e-06, + "loss": 1.4146, + "step": 37000 + }, + { + "epoch": 0.18, + "grad_norm": 3.5187759399414062, + "learning_rate": 9.799331730904477e-06, + "loss": 1.3983, + "step": 37100 + }, + { + "epoch": 0.18, + "grad_norm": 3.524277925491333, + "learning_rate": 9.798706596109787e-06, + "loss": 1.39, + "step": 37200 + }, + { + "epoch": 0.18, + "grad_norm": 3.746493339538574, + "learning_rate": 9.798081461315097e-06, + "loss": 1.4351, + "step": 37300 + }, + { + "epoch": 0.19, + "grad_norm": 3.489757537841797, + "learning_rate": 9.797456326520407e-06, + "loss": 1.4215, + "step": 37400 + }, + { + "epoch": 0.19, + "grad_norm": 3.862546443939209, + "learning_rate": 9.796831191725717e-06, + "loss": 1.3839, + "step": 37500 + }, + { + "epoch": 0.19, + "grad_norm": 3.700289487838745, + "learning_rate": 9.796206056931027e-06, + "loss": 1.4134, + "step": 37600 + }, + { + "epoch": 0.19, + "grad_norm": 4.463230609893799, + "learning_rate": 9.795580922136337e-06, + "loss": 1.4094, + "step": 37700 + }, + { + "epoch": 0.19, + "grad_norm": 3.6630661487579346, + "learning_rate": 9.794955787341647e-06, + "loss": 1.4008, + "step": 37800 + }, + { + "epoch": 0.19, + "grad_norm": 4.630967140197754, + "learning_rate": 9.794330652546956e-06, + "loss": 1.3759, + "step": 37900 + }, + { + "epoch": 0.19, + "grad_norm": 3.3025717735290527, + "learning_rate": 9.793705517752266e-06, + "loss": 1.3884, + "step": 38000 + }, + { + "epoch": 0.19, + "grad_norm": 3.3258678913116455, + "learning_rate": 9.793080382957576e-06, + "loss": 1.386, + "step": 38100 + }, + { + "epoch": 0.19, + "grad_norm": 3.719531536102295, + "learning_rate": 9.792455248162885e-06, + "loss": 1.3964, + "step": 38200 + }, + { + "epoch": 0.19, + "grad_norm": 3.2938575744628906, + "learning_rate": 9.791830113368196e-06, + "loss": 1.4057, + "step": 38300 + }, + { + "epoch": 0.19, + "grad_norm": 4.785384654998779, + "learning_rate": 9.791204978573506e-06, + "loss": 1.4134, + "step": 38400 + }, + { + "epoch": 0.19, + "grad_norm": 3.3767313957214355, + "learning_rate": 9.790579843778816e-06, + "loss": 1.4139, + "step": 38500 + }, + { + "epoch": 0.19, + "grad_norm": 2.9999425411224365, + "learning_rate": 9.789954708984126e-06, + "loss": 1.4039, + "step": 38600 + }, + { + "epoch": 0.19, + "grad_norm": 4.019780158996582, + "learning_rate": 9.789329574189436e-06, + "loss": 1.3637, + "step": 38700 + }, + { + "epoch": 0.19, + "grad_norm": 3.2933456897735596, + "learning_rate": 9.788704439394746e-06, + "loss": 1.3784, + "step": 38800 + }, + { + "epoch": 0.19, + "grad_norm": 3.511465549468994, + "learning_rate": 9.788079304600056e-06, + "loss": 1.3845, + "step": 38900 + }, + { + "epoch": 0.19, + "grad_norm": 3.031588077545166, + "learning_rate": 9.787454169805366e-06, + "loss": 1.3815, + "step": 39000 + }, + { + "epoch": 0.19, + "grad_norm": 3.726041078567505, + "learning_rate": 9.786829035010676e-06, + "loss": 1.4031, + "step": 39100 + }, + { + "epoch": 0.19, + "grad_norm": 3.459808349609375, + "learning_rate": 9.786203900215986e-06, + "loss": 1.4168, + "step": 39200 + }, + { + "epoch": 0.19, + "grad_norm": 5.884055137634277, + "learning_rate": 9.785578765421295e-06, + "loss": 1.3897, + "step": 39300 + }, + { + "epoch": 0.19, + "grad_norm": 3.8769099712371826, + "learning_rate": 9.784953630626605e-06, + "loss": 1.4214, + "step": 39400 + }, + { + "epoch": 0.2, + "grad_norm": 4.353100776672363, + "learning_rate": 9.784328495831914e-06, + "loss": 1.3875, + "step": 39500 + }, + { + "epoch": 0.2, + "grad_norm": 3.665733575820923, + "learning_rate": 9.783703361037225e-06, + "loss": 1.4033, + "step": 39600 + }, + { + "epoch": 0.2, + "grad_norm": 4.098516941070557, + "learning_rate": 9.783078226242533e-06, + "loss": 1.3755, + "step": 39700 + }, + { + "epoch": 0.2, + "grad_norm": 3.6719651222229004, + "learning_rate": 9.782453091447845e-06, + "loss": 1.4025, + "step": 39800 + }, + { + "epoch": 0.2, + "grad_norm": 3.7323970794677734, + "learning_rate": 9.781827956653153e-06, + "loss": 1.4072, + "step": 39900 + }, + { + "epoch": 0.2, + "grad_norm": 3.6012964248657227, + "learning_rate": 9.781202821858465e-06, + "loss": 1.3839, + "step": 40000 + }, + { + "epoch": 0.2, + "grad_norm": 3.1830966472625732, + "learning_rate": 9.780577687063773e-06, + "loss": 1.4025, + "step": 40100 + }, + { + "epoch": 0.2, + "grad_norm": 3.503458261489868, + "learning_rate": 9.779952552269085e-06, + "loss": 1.3849, + "step": 40200 + }, + { + "epoch": 0.2, + "grad_norm": 3.934358596801758, + "learning_rate": 9.779327417474393e-06, + "loss": 1.3908, + "step": 40300 + }, + { + "epoch": 0.2, + "grad_norm": 3.263597249984741, + "learning_rate": 9.778702282679705e-06, + "loss": 1.4144, + "step": 40400 + }, + { + "epoch": 0.2, + "grad_norm": 3.6019351482391357, + "learning_rate": 9.778077147885013e-06, + "loss": 1.3966, + "step": 40500 + }, + { + "epoch": 0.2, + "grad_norm": 3.211871862411499, + "learning_rate": 9.777452013090324e-06, + "loss": 1.4029, + "step": 40600 + }, + { + "epoch": 0.2, + "grad_norm": 4.445366382598877, + "learning_rate": 9.776826878295633e-06, + "loss": 1.4192, + "step": 40700 + }, + { + "epoch": 0.2, + "grad_norm": 3.834134340286255, + "learning_rate": 9.776201743500943e-06, + "loss": 1.3971, + "step": 40800 + }, + { + "epoch": 0.2, + "grad_norm": 11.223153114318848, + "learning_rate": 9.775576608706253e-06, + "loss": 1.4199, + "step": 40900 + }, + { + "epoch": 0.2, + "grad_norm": 3.168875217437744, + "learning_rate": 9.774951473911563e-06, + "loss": 1.403, + "step": 41000 + }, + { + "epoch": 0.2, + "grad_norm": 4.082376480102539, + "learning_rate": 9.774326339116872e-06, + "loss": 1.3758, + "step": 41100 + }, + { + "epoch": 0.2, + "grad_norm": 3.381903648376465, + "learning_rate": 9.773701204322182e-06, + "loss": 1.4124, + "step": 41200 + }, + { + "epoch": 0.2, + "grad_norm": 3.220072031021118, + "learning_rate": 9.773076069527492e-06, + "loss": 1.3731, + "step": 41300 + }, + { + "epoch": 0.2, + "grad_norm": 4.445113182067871, + "learning_rate": 9.772450934732802e-06, + "loss": 1.409, + "step": 41400 + }, + { + "epoch": 0.21, + "grad_norm": 4.551964282989502, + "learning_rate": 9.771825799938112e-06, + "loss": 1.4092, + "step": 41500 + }, + { + "epoch": 0.21, + "grad_norm": 3.120997905731201, + "learning_rate": 9.771200665143422e-06, + "loss": 1.4316, + "step": 41600 + }, + { + "epoch": 0.21, + "grad_norm": 3.3942294120788574, + "learning_rate": 9.770575530348732e-06, + "loss": 1.3752, + "step": 41700 + }, + { + "epoch": 0.21, + "grad_norm": 3.3764448165893555, + "learning_rate": 9.769950395554042e-06, + "loss": 1.4238, + "step": 41800 + }, + { + "epoch": 0.21, + "grad_norm": 3.600349187850952, + "learning_rate": 9.769325260759352e-06, + "loss": 1.4012, + "step": 41900 + }, + { + "epoch": 0.21, + "grad_norm": 3.6565279960632324, + "learning_rate": 9.768700125964662e-06, + "loss": 1.3768, + "step": 42000 + }, + { + "epoch": 0.21, + "grad_norm": 3.65138578414917, + "learning_rate": 9.768074991169972e-06, + "loss": 1.4022, + "step": 42100 + }, + { + "epoch": 0.21, + "grad_norm": 3.3732988834381104, + "learning_rate": 9.767449856375282e-06, + "loss": 1.409, + "step": 42200 + }, + { + "epoch": 0.21, + "grad_norm": 3.8248541355133057, + "learning_rate": 9.766824721580592e-06, + "loss": 1.4121, + "step": 42300 + }, + { + "epoch": 0.21, + "grad_norm": 3.3323121070861816, + "learning_rate": 9.766199586785901e-06, + "loss": 1.4203, + "step": 42400 + }, + { + "epoch": 0.21, + "grad_norm": 3.0881714820861816, + "learning_rate": 9.765574451991211e-06, + "loss": 1.3993, + "step": 42500 + }, + { + "epoch": 0.21, + "grad_norm": 3.6461262702941895, + "learning_rate": 9.764949317196521e-06, + "loss": 1.3842, + "step": 42600 + }, + { + "epoch": 0.21, + "grad_norm": 3.2910470962524414, + "learning_rate": 9.764324182401831e-06, + "loss": 1.4108, + "step": 42700 + }, + { + "epoch": 0.21, + "grad_norm": 3.8803622722625732, + "learning_rate": 9.763699047607141e-06, + "loss": 1.3962, + "step": 42800 + }, + { + "epoch": 0.21, + "grad_norm": 3.9977149963378906, + "learning_rate": 9.763073912812451e-06, + "loss": 1.3876, + "step": 42900 + }, + { + "epoch": 0.21, + "grad_norm": 3.4803717136383057, + "learning_rate": 9.762448778017761e-06, + "loss": 1.3601, + "step": 43000 + }, + { + "epoch": 0.21, + "grad_norm": 3.819903612136841, + "learning_rate": 9.761823643223071e-06, + "loss": 1.3891, + "step": 43100 + }, + { + "epoch": 0.21, + "grad_norm": 3.582854747772217, + "learning_rate": 9.761198508428381e-06, + "loss": 1.4145, + "step": 43200 + }, + { + "epoch": 0.21, + "grad_norm": 3.323837995529175, + "learning_rate": 9.76057337363369e-06, + "loss": 1.4141, + "step": 43300 + }, + { + "epoch": 0.21, + "grad_norm": 2.7181520462036133, + "learning_rate": 9.759948238839e-06, + "loss": 1.3464, + "step": 43400 + }, + { + "epoch": 0.22, + "grad_norm": 3.4599475860595703, + "learning_rate": 9.75932310404431e-06, + "loss": 1.3775, + "step": 43500 + }, + { + "epoch": 0.22, + "grad_norm": 3.005889654159546, + "learning_rate": 9.75869796924962e-06, + "loss": 1.4078, + "step": 43600 + }, + { + "epoch": 0.22, + "grad_norm": 3.152175188064575, + "learning_rate": 9.75807283445493e-06, + "loss": 1.411, + "step": 43700 + }, + { + "epoch": 0.22, + "grad_norm": 3.951000213623047, + "learning_rate": 9.75744769966024e-06, + "loss": 1.3737, + "step": 43800 + }, + { + "epoch": 0.22, + "grad_norm": 3.722508192062378, + "learning_rate": 9.75682256486555e-06, + "loss": 1.4092, + "step": 43900 + }, + { + "epoch": 0.22, + "grad_norm": 4.0199761390686035, + "learning_rate": 9.75619743007086e-06, + "loss": 1.3964, + "step": 44000 + }, + { + "epoch": 0.22, + "grad_norm": 3.306147336959839, + "learning_rate": 9.75557229527617e-06, + "loss": 1.3772, + "step": 44100 + }, + { + "epoch": 0.22, + "grad_norm": 3.0931670665740967, + "learning_rate": 9.75494716048148e-06, + "loss": 1.3848, + "step": 44200 + }, + { + "epoch": 0.22, + "grad_norm": 3.4775798320770264, + "learning_rate": 9.75432202568679e-06, + "loss": 1.391, + "step": 44300 + }, + { + "epoch": 0.22, + "grad_norm": 2.973440170288086, + "learning_rate": 9.7536968908921e-06, + "loss": 1.3993, + "step": 44400 + }, + { + "epoch": 0.22, + "grad_norm": 2.9864256381988525, + "learning_rate": 9.75307175609741e-06, + "loss": 1.3523, + "step": 44500 + }, + { + "epoch": 0.22, + "grad_norm": 3.9546356201171875, + "learning_rate": 9.75244662130272e-06, + "loss": 1.3527, + "step": 44600 + }, + { + "epoch": 0.22, + "grad_norm": 6.238116264343262, + "learning_rate": 9.75182148650803e-06, + "loss": 1.3624, + "step": 44700 + }, + { + "epoch": 0.22, + "grad_norm": 3.241060495376587, + "learning_rate": 9.75119635171334e-06, + "loss": 1.3875, + "step": 44800 + }, + { + "epoch": 0.22, + "grad_norm": 3.17099666595459, + "learning_rate": 9.750571216918648e-06, + "loss": 1.381, + "step": 44900 + }, + { + "epoch": 0.22, + "grad_norm": 3.647505283355713, + "learning_rate": 9.74994608212396e-06, + "loss": 1.3673, + "step": 45000 + }, + { + "epoch": 0.22, + "grad_norm": 3.5791754722595215, + "learning_rate": 9.749320947329268e-06, + "loss": 1.3936, + "step": 45100 + }, + { + "epoch": 0.22, + "grad_norm": 4.946603775024414, + "learning_rate": 9.74869581253458e-06, + "loss": 1.4135, + "step": 45200 + }, + { + "epoch": 0.22, + "grad_norm": 2.9948465824127197, + "learning_rate": 9.748070677739888e-06, + "loss": 1.3824, + "step": 45300 + }, + { + "epoch": 0.22, + "grad_norm": 3.355520725250244, + "learning_rate": 9.7474455429452e-06, + "loss": 1.3757, + "step": 45400 + }, + { + "epoch": 0.23, + "grad_norm": 3.599808931350708, + "learning_rate": 9.746820408150508e-06, + "loss": 1.4019, + "step": 45500 + }, + { + "epoch": 0.23, + "grad_norm": 2.60089111328125, + "learning_rate": 9.746195273355819e-06, + "loss": 1.4147, + "step": 45600 + }, + { + "epoch": 0.23, + "grad_norm": 3.074833393096924, + "learning_rate": 9.745570138561127e-06, + "loss": 1.4084, + "step": 45700 + }, + { + "epoch": 0.23, + "grad_norm": 3.3081626892089844, + "learning_rate": 9.744945003766439e-06, + "loss": 1.3999, + "step": 45800 + }, + { + "epoch": 0.23, + "grad_norm": 4.088558673858643, + "learning_rate": 9.744319868971747e-06, + "loss": 1.398, + "step": 45900 + }, + { + "epoch": 0.23, + "grad_norm": 3.7625129222869873, + "learning_rate": 9.743694734177059e-06, + "loss": 1.388, + "step": 46000 + }, + { + "epoch": 0.23, + "grad_norm": 4.328056335449219, + "learning_rate": 9.743069599382367e-06, + "loss": 1.3721, + "step": 46100 + }, + { + "epoch": 0.23, + "grad_norm": 3.1336140632629395, + "learning_rate": 9.742444464587677e-06, + "loss": 1.3865, + "step": 46200 + }, + { + "epoch": 0.23, + "grad_norm": 3.0789365768432617, + "learning_rate": 9.741819329792987e-06, + "loss": 1.3609, + "step": 46300 + }, + { + "epoch": 0.23, + "grad_norm": 3.569803237915039, + "learning_rate": 9.741194194998297e-06, + "loss": 1.3954, + "step": 46400 + }, + { + "epoch": 0.23, + "grad_norm": 2.9342846870422363, + "learning_rate": 9.740569060203607e-06, + "loss": 1.3381, + "step": 46500 + }, + { + "epoch": 0.23, + "grad_norm": 3.4400010108947754, + "learning_rate": 9.739943925408917e-06, + "loss": 1.3582, + "step": 46600 + }, + { + "epoch": 0.23, + "grad_norm": 3.698220729827881, + "learning_rate": 9.739318790614227e-06, + "loss": 1.4165, + "step": 46700 + }, + { + "epoch": 0.23, + "grad_norm": 3.7116315364837646, + "learning_rate": 9.738693655819537e-06, + "loss": 1.3643, + "step": 46800 + }, + { + "epoch": 0.23, + "grad_norm": 4.515981674194336, + "learning_rate": 9.738068521024846e-06, + "loss": 1.3899, + "step": 46900 + }, + { + "epoch": 0.23, + "grad_norm": 3.1222646236419678, + "learning_rate": 9.737443386230156e-06, + "loss": 1.3497, + "step": 47000 + }, + { + "epoch": 0.23, + "grad_norm": 3.3143482208251953, + "learning_rate": 9.736818251435466e-06, + "loss": 1.3906, + "step": 47100 + }, + { + "epoch": 0.23, + "grad_norm": 3.673861026763916, + "learning_rate": 9.736193116640776e-06, + "loss": 1.3742, + "step": 47200 + }, + { + "epoch": 0.23, + "grad_norm": 3.1223580837249756, + "learning_rate": 9.735567981846086e-06, + "loss": 1.3679, + "step": 47300 + }, + { + "epoch": 0.23, + "grad_norm": 3.321925640106201, + "learning_rate": 9.734942847051396e-06, + "loss": 1.3872, + "step": 47400 + }, + { + "epoch": 0.24, + "grad_norm": 3.3574061393737793, + "learning_rate": 9.734317712256706e-06, + "loss": 1.3536, + "step": 47500 + }, + { + "epoch": 0.24, + "grad_norm": 3.0097270011901855, + "learning_rate": 9.733692577462016e-06, + "loss": 1.3867, + "step": 47600 + }, + { + "epoch": 0.24, + "grad_norm": 6.731925964355469, + "learning_rate": 9.733067442667326e-06, + "loss": 1.3671, + "step": 47700 + }, + { + "epoch": 0.24, + "grad_norm": 4.5036468505859375, + "learning_rate": 9.732442307872636e-06, + "loss": 1.3935, + "step": 47800 + }, + { + "epoch": 0.24, + "grad_norm": 5.059147357940674, + "learning_rate": 9.731817173077946e-06, + "loss": 1.3693, + "step": 47900 + }, + { + "epoch": 0.24, + "grad_norm": 3.65627384185791, + "learning_rate": 9.731192038283256e-06, + "loss": 1.3838, + "step": 48000 + }, + { + "epoch": 0.24, + "grad_norm": 3.8300678730010986, + "learning_rate": 9.730566903488566e-06, + "loss": 1.4273, + "step": 48100 + }, + { + "epoch": 0.24, + "grad_norm": 3.2943530082702637, + "learning_rate": 9.729941768693876e-06, + "loss": 1.3598, + "step": 48200 + }, + { + "epoch": 0.24, + "grad_norm": 3.6148500442504883, + "learning_rate": 9.729316633899185e-06, + "loss": 1.3916, + "step": 48300 + }, + { + "epoch": 0.24, + "grad_norm": 3.14809250831604, + "learning_rate": 9.728691499104495e-06, + "loss": 1.3596, + "step": 48400 + }, + { + "epoch": 0.24, + "grad_norm": 3.4826507568359375, + "learning_rate": 9.728066364309805e-06, + "loss": 1.4113, + "step": 48500 + }, + { + "epoch": 0.24, + "grad_norm": 3.761301279067993, + "learning_rate": 9.727441229515115e-06, + "loss": 1.3811, + "step": 48600 + }, + { + "epoch": 0.24, + "grad_norm": 3.280597448348999, + "learning_rate": 9.726816094720425e-06, + "loss": 1.3735, + "step": 48700 + }, + { + "epoch": 0.24, + "grad_norm": 3.2056515216827393, + "learning_rate": 9.726190959925735e-06, + "loss": 1.3928, + "step": 48800 + }, + { + "epoch": 0.24, + "grad_norm": 5.535262107849121, + "learning_rate": 9.725565825131045e-06, + "loss": 1.3795, + "step": 48900 + }, + { + "epoch": 0.24, + "grad_norm": 3.71197509765625, + "learning_rate": 9.724940690336355e-06, + "loss": 1.3956, + "step": 49000 + }, + { + "epoch": 0.24, + "grad_norm": 3.048292875289917, + "learning_rate": 9.724315555541665e-06, + "loss": 1.3756, + "step": 49100 + }, + { + "epoch": 0.24, + "grad_norm": 3.6094841957092285, + "learning_rate": 9.723690420746975e-06, + "loss": 1.4143, + "step": 49200 + }, + { + "epoch": 0.24, + "grad_norm": 4.416449546813965, + "learning_rate": 9.723065285952285e-06, + "loss": 1.3618, + "step": 49300 + }, + { + "epoch": 0.24, + "grad_norm": 4.372152328491211, + "learning_rate": 9.722440151157595e-06, + "loss": 1.402, + "step": 49400 + }, + { + "epoch": 0.24, + "grad_norm": 3.1622934341430664, + "learning_rate": 9.721815016362905e-06, + "loss": 1.3914, + "step": 49500 + }, + { + "epoch": 0.25, + "grad_norm": 3.1704394817352295, + "learning_rate": 9.721189881568214e-06, + "loss": 1.3827, + "step": 49600 + }, + { + "epoch": 0.25, + "grad_norm": 3.9178764820098877, + "learning_rate": 9.720564746773524e-06, + "loss": 1.371, + "step": 49700 + }, + { + "epoch": 0.25, + "grad_norm": 3.839916706085205, + "learning_rate": 9.719939611978834e-06, + "loss": 1.361, + "step": 49800 + }, + { + "epoch": 0.25, + "grad_norm": 3.5647811889648438, + "learning_rate": 9.719314477184144e-06, + "loss": 1.3857, + "step": 49900 + }, + { + "epoch": 0.25, + "grad_norm": 3.2756240367889404, + "learning_rate": 9.718689342389454e-06, + "loss": 1.375, + "step": 50000 + }, + { + "epoch": 0.25, + "grad_norm": 4.051654815673828, + "learning_rate": 9.718064207594762e-06, + "loss": 1.3941, + "step": 50100 + }, + { + "epoch": 0.25, + "grad_norm": 4.137097358703613, + "learning_rate": 9.717439072800074e-06, + "loss": 1.3892, + "step": 50200 + }, + { + "epoch": 0.25, + "grad_norm": 3.646369457244873, + "learning_rate": 9.716813938005382e-06, + "loss": 1.3846, + "step": 50300 + }, + { + "epoch": 0.25, + "grad_norm": 3.162900686264038, + "learning_rate": 9.716188803210694e-06, + "loss": 1.4173, + "step": 50400 + }, + { + "epoch": 0.25, + "grad_norm": 3.1182548999786377, + "learning_rate": 9.715563668416002e-06, + "loss": 1.3769, + "step": 50500 + }, + { + "epoch": 0.25, + "grad_norm": 5.595252513885498, + "learning_rate": 9.714938533621314e-06, + "loss": 1.4087, + "step": 50600 + }, + { + "epoch": 0.25, + "grad_norm": 2.782058000564575, + "learning_rate": 9.714313398826622e-06, + "loss": 1.4179, + "step": 50700 + }, + { + "epoch": 0.25, + "grad_norm": 3.0897421836853027, + "learning_rate": 9.713688264031934e-06, + "loss": 1.3671, + "step": 50800 + }, + { + "epoch": 0.25, + "grad_norm": 3.455578565597534, + "learning_rate": 9.713063129237242e-06, + "loss": 1.401, + "step": 50900 + }, + { + "epoch": 0.25, + "grad_norm": 3.877256155014038, + "learning_rate": 9.712437994442553e-06, + "loss": 1.4114, + "step": 51000 + }, + { + "epoch": 0.25, + "grad_norm": 3.3103723526000977, + "learning_rate": 9.711812859647862e-06, + "loss": 1.3895, + "step": 51100 + }, + { + "epoch": 0.25, + "grad_norm": 3.909396171569824, + "learning_rate": 9.711187724853173e-06, + "loss": 1.3632, + "step": 51200 + }, + { + "epoch": 0.25, + "grad_norm": 3.4170010089874268, + "learning_rate": 9.710562590058482e-06, + "loss": 1.3819, + "step": 51300 + }, + { + "epoch": 0.25, + "grad_norm": 3.1259448528289795, + "learning_rate": 9.709937455263791e-06, + "loss": 1.3894, + "step": 51400 + }, + { + "epoch": 0.25, + "grad_norm": 4.449690818786621, + "learning_rate": 9.709312320469101e-06, + "loss": 1.3998, + "step": 51500 + }, + { + "epoch": 0.26, + "grad_norm": 3.47631573677063, + "learning_rate": 9.708687185674411e-06, + "loss": 1.374, + "step": 51600 + }, + { + "epoch": 0.26, + "grad_norm": 3.5324013233184814, + "learning_rate": 9.708062050879721e-06, + "loss": 1.3518, + "step": 51700 + }, + { + "epoch": 0.26, + "grad_norm": 3.9155433177948, + "learning_rate": 9.707436916085031e-06, + "loss": 1.3848, + "step": 51800 + }, + { + "epoch": 0.26, + "grad_norm": 3.4327306747436523, + "learning_rate": 9.706811781290341e-06, + "loss": 1.3653, + "step": 51900 + }, + { + "epoch": 0.26, + "grad_norm": 6.06643533706665, + "learning_rate": 9.706186646495651e-06, + "loss": 1.3892, + "step": 52000 + }, + { + "epoch": 0.26, + "grad_norm": 3.518132448196411, + "learning_rate": 9.705561511700961e-06, + "loss": 1.3433, + "step": 52100 + }, + { + "epoch": 0.26, + "grad_norm": 3.2915585041046143, + "learning_rate": 9.704936376906271e-06, + "loss": 1.3579, + "step": 52200 + }, + { + "epoch": 0.26, + "grad_norm": 3.804596185684204, + "learning_rate": 9.70431124211158e-06, + "loss": 1.3809, + "step": 52300 + }, + { + "epoch": 0.26, + "grad_norm": 4.617377758026123, + "learning_rate": 9.70368610731689e-06, + "loss": 1.398, + "step": 52400 + }, + { + "epoch": 0.26, + "grad_norm": 3.0816659927368164, + "learning_rate": 9.7030609725222e-06, + "loss": 1.3652, + "step": 52500 + }, + { + "epoch": 0.26, + "grad_norm": 3.5903923511505127, + "learning_rate": 9.70243583772751e-06, + "loss": 1.3767, + "step": 52600 + }, + { + "epoch": 0.26, + "grad_norm": 3.43280029296875, + "learning_rate": 9.70181070293282e-06, + "loss": 1.3802, + "step": 52700 + }, + { + "epoch": 0.26, + "grad_norm": 4.5906081199646, + "learning_rate": 9.70118556813813e-06, + "loss": 1.3726, + "step": 52800 + }, + { + "epoch": 0.26, + "grad_norm": 3.5110647678375244, + "learning_rate": 9.70056043334344e-06, + "loss": 1.4125, + "step": 52900 + }, + { + "epoch": 0.26, + "grad_norm": 3.5731992721557617, + "learning_rate": 9.69993529854875e-06, + "loss": 1.3971, + "step": 53000 + }, + { + "epoch": 0.26, + "grad_norm": 3.522200584411621, + "learning_rate": 9.69931016375406e-06, + "loss": 1.3758, + "step": 53100 + }, + { + "epoch": 0.26, + "grad_norm": 3.754093647003174, + "learning_rate": 9.69868502895937e-06, + "loss": 1.4125, + "step": 53200 + }, + { + "epoch": 0.26, + "grad_norm": 4.088795185089111, + "learning_rate": 9.69805989416468e-06, + "loss": 1.374, + "step": 53300 + }, + { + "epoch": 0.26, + "grad_norm": 3.095700263977051, + "learning_rate": 9.69743475936999e-06, + "loss": 1.3475, + "step": 53400 + }, + { + "epoch": 0.26, + "grad_norm": 3.6446001529693604, + "learning_rate": 9.6968096245753e-06, + "loss": 1.3675, + "step": 53500 + }, + { + "epoch": 0.27, + "grad_norm": 3.0287554264068604, + "learning_rate": 9.69618448978061e-06, + "loss": 1.3648, + "step": 53600 + }, + { + "epoch": 0.27, + "grad_norm": 3.5153772830963135, + "learning_rate": 9.69555935498592e-06, + "loss": 1.3526, + "step": 53700 + }, + { + "epoch": 0.27, + "grad_norm": 3.402449131011963, + "learning_rate": 9.69493422019123e-06, + "loss": 1.4021, + "step": 53800 + }, + { + "epoch": 0.27, + "grad_norm": 4.223129749298096, + "learning_rate": 9.69430908539654e-06, + "loss": 1.3754, + "step": 53900 + }, + { + "epoch": 0.27, + "grad_norm": 2.7301337718963623, + "learning_rate": 9.69368395060185e-06, + "loss": 1.411, + "step": 54000 + }, + { + "epoch": 0.27, + "grad_norm": 3.3157832622528076, + "learning_rate": 9.69305881580716e-06, + "loss": 1.3695, + "step": 54100 + }, + { + "epoch": 0.27, + "grad_norm": 4.0217671394348145, + "learning_rate": 9.69243368101247e-06, + "loss": 1.3822, + "step": 54200 + }, + { + "epoch": 0.27, + "grad_norm": 3.575080633163452, + "learning_rate": 9.69180854621778e-06, + "loss": 1.3578, + "step": 54300 + }, + { + "epoch": 0.27, + "grad_norm": 3.4445888996124268, + "learning_rate": 9.69118341142309e-06, + "loss": 1.3795, + "step": 54400 + }, + { + "epoch": 0.27, + "grad_norm": 3.173060894012451, + "learning_rate": 9.6905582766284e-06, + "loss": 1.3446, + "step": 54500 + }, + { + "epoch": 0.27, + "grad_norm": 3.8823065757751465, + "learning_rate": 9.689933141833709e-06, + "loss": 1.3696, + "step": 54600 + }, + { + "epoch": 0.27, + "grad_norm": 3.5092880725860596, + "learning_rate": 9.689308007039019e-06, + "loss": 1.3262, + "step": 54700 + }, + { + "epoch": 0.27, + "grad_norm": 3.270498514175415, + "learning_rate": 9.688682872244329e-06, + "loss": 1.3905, + "step": 54800 + }, + { + "epoch": 0.27, + "grad_norm": 3.0580673217773438, + "learning_rate": 9.688057737449639e-06, + "loss": 1.3707, + "step": 54900 + }, + { + "epoch": 0.27, + "grad_norm": 2.982407569885254, + "learning_rate": 9.687432602654949e-06, + "loss": 1.3909, + "step": 55000 + }, + { + "epoch": 0.27, + "grad_norm": 4.194490432739258, + "learning_rate": 9.686807467860259e-06, + "loss": 1.3703, + "step": 55100 + }, + { + "epoch": 0.27, + "grad_norm": 3.5486743450164795, + "learning_rate": 9.686182333065569e-06, + "loss": 1.3528, + "step": 55200 + }, + { + "epoch": 0.27, + "grad_norm": 3.081116199493408, + "learning_rate": 9.685557198270879e-06, + "loss": 1.3929, + "step": 55300 + }, + { + "epoch": 0.27, + "grad_norm": 3.644366979598999, + "learning_rate": 9.684932063476189e-06, + "loss": 1.3847, + "step": 55400 + }, + { + "epoch": 0.27, + "grad_norm": 2.9201712608337402, + "learning_rate": 9.684306928681497e-06, + "loss": 1.3689, + "step": 55500 + }, + { + "epoch": 0.28, + "grad_norm": 3.1488590240478516, + "learning_rate": 9.683681793886808e-06, + "loss": 1.3573, + "step": 55600 + }, + { + "epoch": 0.28, + "grad_norm": 3.4069724082946777, + "learning_rate": 9.683056659092117e-06, + "loss": 1.3447, + "step": 55700 + }, + { + "epoch": 0.28, + "grad_norm": 3.669130563735962, + "learning_rate": 9.682431524297428e-06, + "loss": 1.3629, + "step": 55800 + }, + { + "epoch": 0.28, + "grad_norm": 3.5612809658050537, + "learning_rate": 9.681806389502736e-06, + "loss": 1.3463, + "step": 55900 + }, + { + "epoch": 0.28, + "grad_norm": 3.3988492488861084, + "learning_rate": 9.681181254708048e-06, + "loss": 1.3262, + "step": 56000 + }, + { + "epoch": 0.28, + "grad_norm": 3.286510705947876, + "learning_rate": 9.680556119913356e-06, + "loss": 1.3671, + "step": 56100 + }, + { + "epoch": 0.28, + "grad_norm": 4.079017639160156, + "learning_rate": 9.679930985118668e-06, + "loss": 1.3691, + "step": 56200 + }, + { + "epoch": 0.28, + "grad_norm": 5.038201808929443, + "learning_rate": 9.679305850323976e-06, + "loss": 1.3684, + "step": 56300 + }, + { + "epoch": 0.28, + "grad_norm": 2.9335787296295166, + "learning_rate": 9.678680715529288e-06, + "loss": 1.3651, + "step": 56400 + }, + { + "epoch": 0.28, + "grad_norm": 3.750838279724121, + "learning_rate": 9.678055580734596e-06, + "loss": 1.3979, + "step": 56500 + }, + { + "epoch": 0.28, + "grad_norm": 3.269113779067993, + "learning_rate": 9.677430445939908e-06, + "loss": 1.4005, + "step": 56600 + }, + { + "epoch": 0.28, + "grad_norm": 2.9525506496429443, + "learning_rate": 9.676805311145216e-06, + "loss": 1.3535, + "step": 56700 + }, + { + "epoch": 0.28, + "grad_norm": 4.0349273681640625, + "learning_rate": 9.676180176350526e-06, + "loss": 1.3568, + "step": 56800 + }, + { + "epoch": 0.28, + "grad_norm": 3.9644954204559326, + "learning_rate": 9.675555041555836e-06, + "loss": 1.3368, + "step": 56900 + }, + { + "epoch": 0.28, + "grad_norm": 3.748861312866211, + "learning_rate": 9.674929906761146e-06, + "loss": 1.3788, + "step": 57000 + }, + { + "epoch": 0.28, + "grad_norm": 3.927027940750122, + "learning_rate": 9.674304771966456e-06, + "loss": 1.3369, + "step": 57100 + }, + { + "epoch": 0.28, + "grad_norm": 3.5700511932373047, + "learning_rate": 9.673679637171766e-06, + "loss": 1.3476, + "step": 57200 + }, + { + "epoch": 0.28, + "grad_norm": 3.2039616107940674, + "learning_rate": 9.673054502377075e-06, + "loss": 1.358, + "step": 57300 + }, + { + "epoch": 0.28, + "grad_norm": 3.7656171321868896, + "learning_rate": 9.672429367582385e-06, + "loss": 1.3656, + "step": 57400 + }, + { + "epoch": 0.28, + "grad_norm": 3.84936261177063, + "learning_rate": 9.671804232787695e-06, + "loss": 1.4221, + "step": 57500 + }, + { + "epoch": 0.28, + "grad_norm": 3.145597457885742, + "learning_rate": 9.671179097993005e-06, + "loss": 1.3444, + "step": 57600 + }, + { + "epoch": 0.29, + "grad_norm": 3.1017513275146484, + "learning_rate": 9.670553963198315e-06, + "loss": 1.3604, + "step": 57700 + }, + { + "epoch": 0.29, + "grad_norm": 3.460015058517456, + "learning_rate": 9.669928828403625e-06, + "loss": 1.3556, + "step": 57800 + }, + { + "epoch": 0.29, + "grad_norm": 3.2410836219787598, + "learning_rate": 9.669303693608935e-06, + "loss": 1.3468, + "step": 57900 + }, + { + "epoch": 0.29, + "grad_norm": 4.7556843757629395, + "learning_rate": 9.668678558814245e-06, + "loss": 1.3535, + "step": 58000 + }, + { + "epoch": 0.29, + "grad_norm": 2.8711953163146973, + "learning_rate": 9.668053424019555e-06, + "loss": 1.3587, + "step": 58100 + }, + { + "epoch": 0.29, + "grad_norm": 2.9098100662231445, + "learning_rate": 9.667428289224865e-06, + "loss": 1.3866, + "step": 58200 + }, + { + "epoch": 0.29, + "grad_norm": 3.6795027256011963, + "learning_rate": 9.666803154430175e-06, + "loss": 1.3692, + "step": 58300 + }, + { + "epoch": 0.29, + "grad_norm": 4.239531517028809, + "learning_rate": 9.666178019635485e-06, + "loss": 1.3898, + "step": 58400 + }, + { + "epoch": 0.29, + "grad_norm": 2.871461868286133, + "learning_rate": 9.665552884840795e-06, + "loss": 1.3863, + "step": 58500 + }, + { + "epoch": 0.29, + "grad_norm": 3.8275647163391113, + "learning_rate": 9.664927750046104e-06, + "loss": 1.4035, + "step": 58600 + }, + { + "epoch": 0.29, + "grad_norm": 3.168945074081421, + "learning_rate": 9.664302615251414e-06, + "loss": 1.4028, + "step": 58700 + }, + { + "epoch": 0.29, + "grad_norm": 3.4457874298095703, + "learning_rate": 9.663677480456724e-06, + "loss": 1.3484, + "step": 58800 + }, + { + "epoch": 0.29, + "grad_norm": 3.0998809337615967, + "learning_rate": 9.663052345662034e-06, + "loss": 1.3533, + "step": 58900 + }, + { + "epoch": 0.29, + "grad_norm": 3.2760820388793945, + "learning_rate": 9.662427210867344e-06, + "loss": 1.3733, + "step": 59000 + }, + { + "epoch": 0.29, + "grad_norm": 3.1642961502075195, + "learning_rate": 9.661802076072654e-06, + "loss": 1.3675, + "step": 59100 + }, + { + "epoch": 0.29, + "grad_norm": 3.5796260833740234, + "learning_rate": 9.661176941277964e-06, + "loss": 1.3842, + "step": 59200 + }, + { + "epoch": 0.29, + "grad_norm": 3.379223108291626, + "learning_rate": 9.660551806483274e-06, + "loss": 1.3, + "step": 59300 + }, + { + "epoch": 0.29, + "grad_norm": 3.7059500217437744, + "learning_rate": 9.659926671688584e-06, + "loss": 1.3801, + "step": 59400 + }, + { + "epoch": 0.29, + "grad_norm": 3.9644994735717773, + "learning_rate": 9.659301536893894e-06, + "loss": 1.3882, + "step": 59500 + }, + { + "epoch": 0.29, + "grad_norm": 3.516009569168091, + "learning_rate": 9.658676402099204e-06, + "loss": 1.3507, + "step": 59600 + }, + { + "epoch": 0.3, + "grad_norm": 3.6307122707366943, + "learning_rate": 9.658051267304514e-06, + "loss": 1.3887, + "step": 59700 + }, + { + "epoch": 0.3, + "grad_norm": 2.963676929473877, + "learning_rate": 9.657426132509824e-06, + "loss": 1.3574, + "step": 59800 + }, + { + "epoch": 0.3, + "grad_norm": 3.6590583324432373, + "learning_rate": 9.656800997715134e-06, + "loss": 1.3766, + "step": 59900 + }, + { + "epoch": 0.3, + "grad_norm": 3.9890248775482178, + "learning_rate": 9.656175862920443e-06, + "loss": 1.3766, + "step": 60000 + }, + { + "epoch": 0.3, + "grad_norm": 3.7033519744873047, + "learning_rate": 9.655550728125753e-06, + "loss": 1.3653, + "step": 60100 + }, + { + "epoch": 0.3, + "grad_norm": 3.2749149799346924, + "learning_rate": 9.654925593331063e-06, + "loss": 1.3555, + "step": 60200 + }, + { + "epoch": 0.3, + "grad_norm": 2.6017117500305176, + "learning_rate": 9.654300458536373e-06, + "loss": 1.3379, + "step": 60300 + }, + { + "epoch": 0.3, + "grad_norm": 3.2133805751800537, + "learning_rate": 9.653675323741683e-06, + "loss": 1.3383, + "step": 60400 + }, + { + "epoch": 0.3, + "grad_norm": 3.5152649879455566, + "learning_rate": 9.653050188946993e-06, + "loss": 1.3821, + "step": 60500 + }, + { + "epoch": 0.3, + "grad_norm": 3.8433949947357178, + "learning_rate": 9.652425054152303e-06, + "loss": 1.3648, + "step": 60600 + }, + { + "epoch": 0.3, + "grad_norm": 3.219630241394043, + "learning_rate": 9.651799919357611e-06, + "loss": 1.3381, + "step": 60700 + }, + { + "epoch": 0.3, + "grad_norm": 2.9422874450683594, + "learning_rate": 9.651174784562923e-06, + "loss": 1.3596, + "step": 60800 + }, + { + "epoch": 0.3, + "grad_norm": 3.0160861015319824, + "learning_rate": 9.650549649768231e-06, + "loss": 1.4086, + "step": 60900 + }, + { + "epoch": 0.3, + "grad_norm": 3.6983797550201416, + "learning_rate": 9.649924514973543e-06, + "loss": 1.3653, + "step": 61000 + }, + { + "epoch": 0.3, + "grad_norm": 3.4345366954803467, + "learning_rate": 9.649299380178851e-06, + "loss": 1.3836, + "step": 61100 + }, + { + "epoch": 0.3, + "grad_norm": 4.864907741546631, + "learning_rate": 9.648674245384163e-06, + "loss": 1.3395, + "step": 61200 + }, + { + "epoch": 0.3, + "grad_norm": 3.0346243381500244, + "learning_rate": 9.64804911058947e-06, + "loss": 1.3483, + "step": 61300 + }, + { + "epoch": 0.3, + "grad_norm": 3.271688938140869, + "learning_rate": 9.647423975794782e-06, + "loss": 1.3718, + "step": 61400 + }, + { + "epoch": 0.3, + "grad_norm": 5.519439697265625, + "learning_rate": 9.64679884100009e-06, + "loss": 1.3516, + "step": 61500 + }, + { + "epoch": 0.3, + "grad_norm": 3.676679849624634, + "learning_rate": 9.646173706205402e-06, + "loss": 1.3786, + "step": 61600 + }, + { + "epoch": 0.31, + "grad_norm": 2.799685001373291, + "learning_rate": 9.64554857141071e-06, + "loss": 1.3622, + "step": 61700 + }, + { + "epoch": 0.31, + "grad_norm": 3.5693440437316895, + "learning_rate": 9.644923436616022e-06, + "loss": 1.3506, + "step": 61800 + }, + { + "epoch": 0.31, + "grad_norm": 4.081248760223389, + "learning_rate": 9.64429830182133e-06, + "loss": 1.3551, + "step": 61900 + }, + { + "epoch": 0.31, + "grad_norm": 3.275651454925537, + "learning_rate": 9.64367316702664e-06, + "loss": 1.3691, + "step": 62000 + }, + { + "epoch": 0.31, + "grad_norm": 3.2802531719207764, + "learning_rate": 9.64304803223195e-06, + "loss": 1.3654, + "step": 62100 + }, + { + "epoch": 0.31, + "grad_norm": 2.8903188705444336, + "learning_rate": 9.64242289743726e-06, + "loss": 1.3694, + "step": 62200 + }, + { + "epoch": 0.31, + "grad_norm": 2.767051935195923, + "learning_rate": 9.64179776264257e-06, + "loss": 1.4189, + "step": 62300 + }, + { + "epoch": 0.31, + "grad_norm": 3.3065268993377686, + "learning_rate": 9.64117262784788e-06, + "loss": 1.3528, + "step": 62400 + }, + { + "epoch": 0.31, + "grad_norm": 3.6622681617736816, + "learning_rate": 9.64054749305319e-06, + "loss": 1.3744, + "step": 62500 + }, + { + "epoch": 0.31, + "grad_norm": 2.5773024559020996, + "learning_rate": 9.6399223582585e-06, + "loss": 1.3678, + "step": 62600 + }, + { + "epoch": 0.31, + "grad_norm": 4.050888538360596, + "learning_rate": 9.63929722346381e-06, + "loss": 1.3459, + "step": 62700 + }, + { + "epoch": 0.31, + "grad_norm": 3.3641510009765625, + "learning_rate": 9.63867208866912e-06, + "loss": 1.3289, + "step": 62800 + }, + { + "epoch": 0.31, + "grad_norm": 3.2410778999328613, + "learning_rate": 9.63804695387443e-06, + "loss": 1.3582, + "step": 62900 + }, + { + "epoch": 0.31, + "grad_norm": 3.7819199562072754, + "learning_rate": 9.63742181907974e-06, + "loss": 1.3535, + "step": 63000 + }, + { + "epoch": 0.31, + "grad_norm": 3.4329464435577393, + "learning_rate": 9.63679668428505e-06, + "loss": 1.3637, + "step": 63100 + }, + { + "epoch": 0.31, + "grad_norm": 3.7776496410369873, + "learning_rate": 9.63617154949036e-06, + "loss": 1.3427, + "step": 63200 + }, + { + "epoch": 0.31, + "grad_norm": 3.6807868480682373, + "learning_rate": 9.63554641469567e-06, + "loss": 1.3366, + "step": 63300 + }, + { + "epoch": 0.31, + "grad_norm": 3.182055711746216, + "learning_rate": 9.63492127990098e-06, + "loss": 1.3907, + "step": 63400 + }, + { + "epoch": 0.31, + "grad_norm": 3.0613508224487305, + "learning_rate": 9.63429614510629e-06, + "loss": 1.3628, + "step": 63500 + }, + { + "epoch": 0.31, + "grad_norm": 3.813504219055176, + "learning_rate": 9.633671010311599e-06, + "loss": 1.3534, + "step": 63600 + }, + { + "epoch": 0.32, + "grad_norm": 3.0388875007629395, + "learning_rate": 9.633045875516909e-06, + "loss": 1.3701, + "step": 63700 + }, + { + "epoch": 0.32, + "grad_norm": 3.5311150550842285, + "learning_rate": 9.632420740722219e-06, + "loss": 1.3419, + "step": 63800 + }, + { + "epoch": 0.32, + "grad_norm": 3.283538341522217, + "learning_rate": 9.631795605927529e-06, + "loss": 1.3772, + "step": 63900 + }, + { + "epoch": 0.32, + "grad_norm": 2.6988024711608887, + "learning_rate": 9.631170471132839e-06, + "loss": 1.3633, + "step": 64000 + }, + { + "epoch": 0.32, + "grad_norm": 3.738215684890747, + "learning_rate": 9.630545336338149e-06, + "loss": 1.3814, + "step": 64100 + }, + { + "epoch": 0.32, + "grad_norm": 3.899857997894287, + "learning_rate": 9.629920201543459e-06, + "loss": 1.3787, + "step": 64200 + }, + { + "epoch": 0.32, + "grad_norm": 3.2490193843841553, + "learning_rate": 9.629295066748769e-06, + "loss": 1.347, + "step": 64300 + }, + { + "epoch": 0.32, + "grad_norm": 3.262529134750366, + "learning_rate": 9.628669931954079e-06, + "loss": 1.3405, + "step": 64400 + }, + { + "epoch": 0.32, + "grad_norm": 3.1799771785736084, + "learning_rate": 9.628044797159388e-06, + "loss": 1.3796, + "step": 64500 + }, + { + "epoch": 0.32, + "grad_norm": 3.5044260025024414, + "learning_rate": 9.627419662364698e-06, + "loss": 1.322, + "step": 64600 + }, + { + "epoch": 0.32, + "grad_norm": 3.560049295425415, + "learning_rate": 9.626794527570008e-06, + "loss": 1.356, + "step": 64700 + }, + { + "epoch": 0.32, + "grad_norm": 2.3910019397735596, + "learning_rate": 9.626169392775318e-06, + "loss": 1.3546, + "step": 64800 + }, + { + "epoch": 0.32, + "grad_norm": 4.631550312042236, + "learning_rate": 9.625544257980628e-06, + "loss": 1.3664, + "step": 64900 + }, + { + "epoch": 0.32, + "grad_norm": 3.415191650390625, + "learning_rate": 9.624919123185938e-06, + "loss": 1.3818, + "step": 65000 + }, + { + "epoch": 0.32, + "grad_norm": 3.071653127670288, + "learning_rate": 9.624293988391248e-06, + "loss": 1.3947, + "step": 65100 + }, + { + "epoch": 0.32, + "grad_norm": 3.575773000717163, + "learning_rate": 9.623668853596558e-06, + "loss": 1.3392, + "step": 65200 + }, + { + "epoch": 0.32, + "grad_norm": 3.1757047176361084, + "learning_rate": 9.623043718801868e-06, + "loss": 1.3619, + "step": 65300 + }, + { + "epoch": 0.32, + "grad_norm": 2.487311840057373, + "learning_rate": 9.622418584007178e-06, + "loss": 1.3538, + "step": 65400 + }, + { + "epoch": 0.32, + "grad_norm": 2.791187047958374, + "learning_rate": 9.621793449212488e-06, + "loss": 1.343, + "step": 65500 + }, + { + "epoch": 0.32, + "grad_norm": 2.840940237045288, + "learning_rate": 9.621168314417798e-06, + "loss": 1.3479, + "step": 65600 + }, + { + "epoch": 0.33, + "grad_norm": 3.4006075859069824, + "learning_rate": 9.620543179623108e-06, + "loss": 1.3742, + "step": 65700 + }, + { + "epoch": 0.33, + "grad_norm": 2.8290023803710938, + "learning_rate": 9.619918044828418e-06, + "loss": 1.3825, + "step": 65800 + }, + { + "epoch": 0.33, + "grad_norm": 4.114961624145508, + "learning_rate": 9.619292910033727e-06, + "loss": 1.3742, + "step": 65900 + }, + { + "epoch": 0.33, + "grad_norm": 3.4792447090148926, + "learning_rate": 9.618667775239037e-06, + "loss": 1.3442, + "step": 66000 + }, + { + "epoch": 0.33, + "grad_norm": 3.3174169063568115, + "learning_rate": 9.618042640444346e-06, + "loss": 1.3896, + "step": 66100 + }, + { + "epoch": 0.33, + "grad_norm": 2.7161898612976074, + "learning_rate": 9.617417505649657e-06, + "loss": 1.3492, + "step": 66200 + }, + { + "epoch": 0.33, + "grad_norm": 3.3158161640167236, + "learning_rate": 9.616792370854965e-06, + "loss": 1.3378, + "step": 66300 + }, + { + "epoch": 0.33, + "grad_norm": 4.810378074645996, + "learning_rate": 9.616167236060277e-06, + "loss": 1.3482, + "step": 66400 + }, + { + "epoch": 0.33, + "grad_norm": 3.8272716999053955, + "learning_rate": 9.615542101265585e-06, + "loss": 1.3624, + "step": 66500 + }, + { + "epoch": 0.33, + "grad_norm": 3.1543540954589844, + "learning_rate": 9.614916966470897e-06, + "loss": 1.3703, + "step": 66600 + }, + { + "epoch": 0.33, + "grad_norm": 3.571789503097534, + "learning_rate": 9.614291831676205e-06, + "loss": 1.3467, + "step": 66700 + }, + { + "epoch": 0.33, + "grad_norm": 3.8913381099700928, + "learning_rate": 9.613666696881517e-06, + "loss": 1.3629, + "step": 66800 + }, + { + "epoch": 0.33, + "grad_norm": 3.3162176609039307, + "learning_rate": 9.613041562086825e-06, + "loss": 1.399, + "step": 66900 + }, + { + "epoch": 0.33, + "grad_norm": 3.26802396774292, + "learning_rate": 9.612416427292137e-06, + "loss": 1.3481, + "step": 67000 + }, + { + "epoch": 0.33, + "grad_norm": 3.7507224082946777, + "learning_rate": 9.611791292497445e-06, + "loss": 1.3875, + "step": 67100 + }, + { + "epoch": 0.33, + "grad_norm": 3.059849262237549, + "learning_rate": 9.611166157702756e-06, + "loss": 1.3402, + "step": 67200 + }, + { + "epoch": 0.33, + "grad_norm": 2.9418632984161377, + "learning_rate": 9.610541022908065e-06, + "loss": 1.385, + "step": 67300 + }, + { + "epoch": 0.33, + "grad_norm": 3.126817464828491, + "learning_rate": 9.609915888113375e-06, + "loss": 1.3342, + "step": 67400 + }, + { + "epoch": 0.33, + "grad_norm": 3.908066749572754, + "learning_rate": 9.609290753318685e-06, + "loss": 1.3658, + "step": 67500 + }, + { + "epoch": 0.33, + "grad_norm": 3.3799283504486084, + "learning_rate": 9.608665618523994e-06, + "loss": 1.3392, + "step": 67600 + }, + { + "epoch": 0.33, + "grad_norm": 3.0953500270843506, + "learning_rate": 9.608040483729304e-06, + "loss": 1.3664, + "step": 67700 + }, + { + "epoch": 0.34, + "grad_norm": 3.4090096950531006, + "learning_rate": 9.607415348934614e-06, + "loss": 1.3913, + "step": 67800 + }, + { + "epoch": 0.34, + "grad_norm": 3.0916600227355957, + "learning_rate": 9.606790214139924e-06, + "loss": 1.3482, + "step": 67900 + }, + { + "epoch": 0.34, + "grad_norm": 4.232104778289795, + "learning_rate": 9.606165079345234e-06, + "loss": 1.3663, + "step": 68000 + }, + { + "epoch": 0.34, + "grad_norm": 3.300558090209961, + "learning_rate": 9.605539944550544e-06, + "loss": 1.3608, + "step": 68100 + }, + { + "epoch": 0.34, + "grad_norm": 2.795227527618408, + "learning_rate": 9.604914809755854e-06, + "loss": 1.3637, + "step": 68200 + }, + { + "epoch": 0.34, + "grad_norm": 3.083174467086792, + "learning_rate": 9.604289674961164e-06, + "loss": 1.3269, + "step": 68300 + }, + { + "epoch": 0.34, + "grad_norm": 3.8292133808135986, + "learning_rate": 9.603664540166474e-06, + "loss": 1.3311, + "step": 68400 + }, + { + "epoch": 0.34, + "grad_norm": 3.3727259635925293, + "learning_rate": 9.603039405371784e-06, + "loss": 1.3233, + "step": 68500 + }, + { + "epoch": 0.34, + "grad_norm": 3.0696310997009277, + "learning_rate": 9.602414270577094e-06, + "loss": 1.3217, + "step": 68600 + }, + { + "epoch": 0.34, + "grad_norm": 5.1085591316223145, + "learning_rate": 9.601789135782404e-06, + "loss": 1.395, + "step": 68700 + }, + { + "epoch": 0.34, + "grad_norm": 4.036706447601318, + "learning_rate": 9.601164000987714e-06, + "loss": 1.3763, + "step": 68800 + }, + { + "epoch": 0.34, + "grad_norm": 3.823237419128418, + "learning_rate": 9.600538866193024e-06, + "loss": 1.3868, + "step": 68900 + }, + { + "epoch": 0.34, + "grad_norm": 3.535228729248047, + "learning_rate": 9.599913731398333e-06, + "loss": 1.3714, + "step": 69000 + }, + { + "epoch": 0.34, + "grad_norm": 3.333162546157837, + "learning_rate": 9.599288596603643e-06, + "loss": 1.3509, + "step": 69100 + }, + { + "epoch": 0.34, + "grad_norm": 3.901670455932617, + "learning_rate": 9.598663461808953e-06, + "loss": 1.3486, + "step": 69200 + }, + { + "epoch": 0.34, + "grad_norm": 2.89204478263855, + "learning_rate": 9.598038327014263e-06, + "loss": 1.3975, + "step": 69300 + }, + { + "epoch": 0.34, + "grad_norm": 3.5590710639953613, + "learning_rate": 9.597413192219573e-06, + "loss": 1.3822, + "step": 69400 + }, + { + "epoch": 0.34, + "grad_norm": 3.234952449798584, + "learning_rate": 9.596788057424883e-06, + "loss": 1.3942, + "step": 69500 + }, + { + "epoch": 0.34, + "grad_norm": 3.125939130783081, + "learning_rate": 9.596162922630193e-06, + "loss": 1.3573, + "step": 69600 + }, + { + "epoch": 0.34, + "grad_norm": 3.25191593170166, + "learning_rate": 9.595537787835503e-06, + "loss": 1.3742, + "step": 69700 + }, + { + "epoch": 0.35, + "grad_norm": 3.0981853008270264, + "learning_rate": 9.594912653040813e-06, + "loss": 1.3272, + "step": 69800 + }, + { + "epoch": 0.35, + "grad_norm": 2.660688638687134, + "learning_rate": 9.594287518246123e-06, + "loss": 1.3859, + "step": 69900 + }, + { + "epoch": 0.35, + "grad_norm": 4.212889671325684, + "learning_rate": 9.593662383451433e-06, + "loss": 1.3685, + "step": 70000 + }, + { + "epoch": 0.35, + "grad_norm": 2.602475643157959, + "learning_rate": 9.593037248656743e-06, + "loss": 1.3486, + "step": 70100 + }, + { + "epoch": 0.35, + "grad_norm": 3.8037405014038086, + "learning_rate": 9.592412113862053e-06, + "loss": 1.3094, + "step": 70200 + }, + { + "epoch": 0.35, + "grad_norm": 3.735767364501953, + "learning_rate": 9.591786979067363e-06, + "loss": 1.3443, + "step": 70300 + }, + { + "epoch": 0.35, + "grad_norm": 3.10837984085083, + "learning_rate": 9.591161844272672e-06, + "loss": 1.3637, + "step": 70400 + }, + { + "epoch": 0.35, + "grad_norm": 3.339202880859375, + "learning_rate": 9.590536709477982e-06, + "loss": 1.374, + "step": 70500 + }, + { + "epoch": 0.35, + "grad_norm": 4.676008224487305, + "learning_rate": 9.589911574683292e-06, + "loss": 1.3609, + "step": 70600 + }, + { + "epoch": 0.35, + "grad_norm": 3.2127492427825928, + "learning_rate": 9.589286439888602e-06, + "loss": 1.3443, + "step": 70700 + }, + { + "epoch": 0.35, + "grad_norm": 3.552145481109619, + "learning_rate": 9.588661305093912e-06, + "loss": 1.3653, + "step": 70800 + }, + { + "epoch": 0.35, + "grad_norm": 4.267813205718994, + "learning_rate": 9.588036170299222e-06, + "loss": 1.3316, + "step": 70900 + }, + { + "epoch": 0.35, + "grad_norm": 4.004978179931641, + "learning_rate": 9.587411035504532e-06, + "loss": 1.3259, + "step": 71000 + }, + { + "epoch": 0.35, + "grad_norm": 3.433945417404175, + "learning_rate": 9.586785900709842e-06, + "loss": 1.3475, + "step": 71100 + }, + { + "epoch": 0.35, + "grad_norm": 3.3748490810394287, + "learning_rate": 9.586160765915152e-06, + "loss": 1.3442, + "step": 71200 + }, + { + "epoch": 0.35, + "grad_norm": 3.2221007347106934, + "learning_rate": 9.58553563112046e-06, + "loss": 1.3705, + "step": 71300 + }, + { + "epoch": 0.35, + "grad_norm": 2.846968173980713, + "learning_rate": 9.584910496325772e-06, + "loss": 1.3411, + "step": 71400 + }, + { + "epoch": 0.35, + "grad_norm": 3.973281145095825, + "learning_rate": 9.58428536153108e-06, + "loss": 1.3503, + "step": 71500 + }, + { + "epoch": 0.35, + "grad_norm": 2.7122104167938232, + "learning_rate": 9.583660226736392e-06, + "loss": 1.3581, + "step": 71600 + }, + { + "epoch": 0.35, + "grad_norm": 4.12910795211792, + "learning_rate": 9.5830350919417e-06, + "loss": 1.385, + "step": 71700 + }, + { + "epoch": 0.36, + "grad_norm": 3.4491500854492188, + "learning_rate": 9.582409957147011e-06, + "loss": 1.3626, + "step": 71800 + }, + { + "epoch": 0.36, + "grad_norm": 4.059682846069336, + "learning_rate": 9.58178482235232e-06, + "loss": 1.3595, + "step": 71900 + }, + { + "epoch": 0.36, + "grad_norm": 3.2482686042785645, + "learning_rate": 9.581159687557631e-06, + "loss": 1.3339, + "step": 72000 + }, + { + "epoch": 0.36, + "grad_norm": 3.3053741455078125, + "learning_rate": 9.58053455276294e-06, + "loss": 1.3656, + "step": 72100 + }, + { + "epoch": 0.36, + "grad_norm": 3.101283311843872, + "learning_rate": 9.579909417968251e-06, + "loss": 1.3751, + "step": 72200 + }, + { + "epoch": 0.36, + "grad_norm": 3.7894277572631836, + "learning_rate": 9.57928428317356e-06, + "loss": 1.3744, + "step": 72300 + }, + { + "epoch": 0.36, + "grad_norm": 3.6949033737182617, + "learning_rate": 9.578659148378871e-06, + "loss": 1.3434, + "step": 72400 + }, + { + "epoch": 0.36, + "grad_norm": 3.2511017322540283, + "learning_rate": 9.57803401358418e-06, + "loss": 1.3809, + "step": 72500 + }, + { + "epoch": 0.36, + "grad_norm": 2.6631274223327637, + "learning_rate": 9.577408878789489e-06, + "loss": 1.3232, + "step": 72600 + }, + { + "epoch": 0.36, + "grad_norm": 3.0832998752593994, + "learning_rate": 9.576783743994799e-06, + "loss": 1.3785, + "step": 72700 + }, + { + "epoch": 0.36, + "grad_norm": 4.4912238121032715, + "learning_rate": 9.576158609200109e-06, + "loss": 1.3558, + "step": 72800 + }, + { + "epoch": 0.36, + "grad_norm": 3.720935821533203, + "learning_rate": 9.575533474405419e-06, + "loss": 1.3547, + "step": 72900 + }, + { + "epoch": 0.36, + "grad_norm": 3.69688081741333, + "learning_rate": 9.574908339610729e-06, + "loss": 1.3409, + "step": 73000 + }, + { + "epoch": 0.36, + "grad_norm": 4.521012783050537, + "learning_rate": 9.574283204816039e-06, + "loss": 1.3701, + "step": 73100 + }, + { + "epoch": 0.36, + "grad_norm": 3.9866528511047363, + "learning_rate": 9.573658070021349e-06, + "loss": 1.3623, + "step": 73200 + }, + { + "epoch": 0.36, + "grad_norm": 4.300259590148926, + "learning_rate": 9.573032935226659e-06, + "loss": 1.3562, + "step": 73300 + }, + { + "epoch": 0.36, + "grad_norm": 3.585087299346924, + "learning_rate": 9.572407800431969e-06, + "loss": 1.3574, + "step": 73400 + }, + { + "epoch": 0.36, + "grad_norm": 3.3413264751434326, + "learning_rate": 9.571782665637278e-06, + "loss": 1.3586, + "step": 73500 + }, + { + "epoch": 0.36, + "grad_norm": 3.7272746562957764, + "learning_rate": 9.571157530842588e-06, + "loss": 1.3525, + "step": 73600 + }, + { + "epoch": 0.36, + "grad_norm": 3.167235851287842, + "learning_rate": 9.570532396047898e-06, + "loss": 1.3545, + "step": 73700 + }, + { + "epoch": 0.37, + "grad_norm": 3.007138252258301, + "learning_rate": 9.569907261253208e-06, + "loss": 1.3638, + "step": 73800 + }, + { + "epoch": 0.37, + "grad_norm": 3.402449607849121, + "learning_rate": 9.569282126458518e-06, + "loss": 1.3394, + "step": 73900 + }, + { + "epoch": 0.37, + "grad_norm": 3.050807237625122, + "learning_rate": 9.568656991663828e-06, + "loss": 1.3477, + "step": 74000 + }, + { + "epoch": 0.37, + "grad_norm": 3.470465660095215, + "learning_rate": 9.568031856869138e-06, + "loss": 1.3573, + "step": 74100 + }, + { + "epoch": 0.37, + "grad_norm": 3.1874587535858154, + "learning_rate": 9.567406722074448e-06, + "loss": 1.3545, + "step": 74200 + }, + { + "epoch": 0.37, + "grad_norm": 3.022789478302002, + "learning_rate": 9.566781587279758e-06, + "loss": 1.3857, + "step": 74300 + }, + { + "epoch": 0.37, + "grad_norm": 3.696437358856201, + "learning_rate": 9.566156452485068e-06, + "loss": 1.3845, + "step": 74400 + }, + { + "epoch": 0.37, + "grad_norm": 3.3129115104675293, + "learning_rate": 9.565531317690378e-06, + "loss": 1.3342, + "step": 74500 + }, + { + "epoch": 0.37, + "grad_norm": 3.0286476612091064, + "learning_rate": 9.564906182895688e-06, + "loss": 1.3451, + "step": 74600 + }, + { + "epoch": 0.37, + "grad_norm": 3.14780330657959, + "learning_rate": 9.564281048100998e-06, + "loss": 1.3501, + "step": 74700 + }, + { + "epoch": 0.37, + "grad_norm": 3.660125732421875, + "learning_rate": 9.563655913306308e-06, + "loss": 1.3974, + "step": 74800 + }, + { + "epoch": 0.37, + "grad_norm": 3.7873997688293457, + "learning_rate": 9.563030778511617e-06, + "loss": 1.3328, + "step": 74900 + }, + { + "epoch": 0.37, + "grad_norm": 4.175543785095215, + "learning_rate": 9.562405643716927e-06, + "loss": 1.3871, + "step": 75000 + }, + { + "epoch": 0.37, + "grad_norm": 3.0854318141937256, + "learning_rate": 9.561780508922237e-06, + "loss": 1.3436, + "step": 75100 + }, + { + "epoch": 0.37, + "grad_norm": 3.430039882659912, + "learning_rate": 9.561155374127547e-06, + "loss": 1.3614, + "step": 75200 + }, + { + "epoch": 0.37, + "grad_norm": 3.1078710556030273, + "learning_rate": 9.560530239332857e-06, + "loss": 1.3788, + "step": 75300 + }, + { + "epoch": 0.37, + "grad_norm": 3.394430160522461, + "learning_rate": 9.559905104538167e-06, + "loss": 1.3534, + "step": 75400 + }, + { + "epoch": 0.37, + "grad_norm": 3.4498708248138428, + "learning_rate": 9.559279969743477e-06, + "loss": 1.3875, + "step": 75500 + }, + { + "epoch": 0.37, + "grad_norm": 3.169480562210083, + "learning_rate": 9.558654834948787e-06, + "loss": 1.3391, + "step": 75600 + }, + { + "epoch": 0.37, + "grad_norm": 3.4374375343322754, + "learning_rate": 9.558029700154097e-06, + "loss": 1.3657, + "step": 75700 + }, + { + "epoch": 0.38, + "grad_norm": 3.0859546661376953, + "learning_rate": 9.557404565359407e-06, + "loss": 1.3765, + "step": 75800 + }, + { + "epoch": 0.38, + "grad_norm": 3.567939281463623, + "learning_rate": 9.556779430564717e-06, + "loss": 1.3432, + "step": 75900 + }, + { + "epoch": 0.38, + "grad_norm": 3.236070156097412, + "learning_rate": 9.556154295770027e-06, + "loss": 1.3402, + "step": 76000 + }, + { + "epoch": 0.38, + "grad_norm": 3.366365432739258, + "learning_rate": 9.555529160975337e-06, + "loss": 1.3736, + "step": 76100 + }, + { + "epoch": 0.38, + "grad_norm": 4.573514461517334, + "learning_rate": 9.554904026180646e-06, + "loss": 1.3581, + "step": 76200 + }, + { + "epoch": 0.38, + "grad_norm": 3.199225664138794, + "learning_rate": 9.554278891385956e-06, + "loss": 1.3452, + "step": 76300 + }, + { + "epoch": 0.38, + "grad_norm": 3.0722098350524902, + "learning_rate": 9.553653756591266e-06, + "loss": 1.3785, + "step": 76400 + }, + { + "epoch": 0.38, + "grad_norm": 3.137385606765747, + "learning_rate": 9.553028621796576e-06, + "loss": 1.3521, + "step": 76500 + }, + { + "epoch": 0.38, + "grad_norm": 4.893807888031006, + "learning_rate": 9.552403487001886e-06, + "loss": 1.3572, + "step": 76600 + }, + { + "epoch": 0.38, + "grad_norm": 4.010082721710205, + "learning_rate": 9.551778352207194e-06, + "loss": 1.3483, + "step": 76700 + }, + { + "epoch": 0.38, + "grad_norm": 3.6857099533081055, + "learning_rate": 9.551153217412506e-06, + "loss": 1.3347, + "step": 76800 + }, + { + "epoch": 0.38, + "grad_norm": 2.510134696960449, + "learning_rate": 9.550528082617814e-06, + "loss": 1.3461, + "step": 76900 + }, + { + "epoch": 0.38, + "grad_norm": 3.9825291633605957, + "learning_rate": 9.549902947823126e-06, + "loss": 1.3415, + "step": 77000 + }, + { + "epoch": 0.38, + "grad_norm": 3.156740427017212, + "learning_rate": 9.549277813028434e-06, + "loss": 1.3305, + "step": 77100 + }, + { + "epoch": 0.38, + "grad_norm": 3.245800256729126, + "learning_rate": 9.548652678233746e-06, + "loss": 1.3573, + "step": 77200 + }, + { + "epoch": 0.38, + "grad_norm": 2.6874351501464844, + "learning_rate": 9.548027543439054e-06, + "loss": 1.3443, + "step": 77300 + }, + { + "epoch": 0.38, + "grad_norm": 3.6892011165618896, + "learning_rate": 9.547402408644366e-06, + "loss": 1.3192, + "step": 77400 + }, + { + "epoch": 0.38, + "grad_norm": 2.505993604660034, + "learning_rate": 9.546777273849674e-06, + "loss": 1.3812, + "step": 77500 + }, + { + "epoch": 0.38, + "grad_norm": 3.5395193099975586, + "learning_rate": 9.546152139054985e-06, + "loss": 1.3604, + "step": 77600 + }, + { + "epoch": 0.38, + "grad_norm": 3.2124781608581543, + "learning_rate": 9.545527004260294e-06, + "loss": 1.312, + "step": 77700 + }, + { + "epoch": 0.38, + "grad_norm": 3.8713743686676025, + "learning_rate": 9.544901869465605e-06, + "loss": 1.3435, + "step": 77800 + }, + { + "epoch": 0.39, + "grad_norm": 3.1610865592956543, + "learning_rate": 9.544276734670914e-06, + "loss": 1.3696, + "step": 77900 + }, + { + "epoch": 0.39, + "grad_norm": 7.323131561279297, + "learning_rate": 9.543651599876223e-06, + "loss": 1.357, + "step": 78000 + }, + { + "epoch": 0.39, + "grad_norm": 3.0851237773895264, + "learning_rate": 9.543026465081533e-06, + "loss": 1.3406, + "step": 78100 + }, + { + "epoch": 0.39, + "grad_norm": 3.637321949005127, + "learning_rate": 9.542401330286843e-06, + "loss": 1.376, + "step": 78200 + }, + { + "epoch": 0.39, + "grad_norm": 2.876664638519287, + "learning_rate": 9.541776195492153e-06, + "loss": 1.4049, + "step": 78300 + }, + { + "epoch": 0.39, + "grad_norm": 3.146031618118286, + "learning_rate": 9.541151060697463e-06, + "loss": 1.3484, + "step": 78400 + }, + { + "epoch": 0.39, + "grad_norm": 4.596341609954834, + "learning_rate": 9.540525925902773e-06, + "loss": 1.3518, + "step": 78500 + }, + { + "epoch": 0.39, + "grad_norm": 5.041236400604248, + "learning_rate": 9.539900791108083e-06, + "loss": 1.362, + "step": 78600 + }, + { + "epoch": 0.39, + "grad_norm": 3.9177463054656982, + "learning_rate": 9.539275656313393e-06, + "loss": 1.3586, + "step": 78700 + }, + { + "epoch": 0.39, + "grad_norm": 3.115206003189087, + "learning_rate": 9.538650521518703e-06, + "loss": 1.367, + "step": 78800 + }, + { + "epoch": 0.39, + "grad_norm": 2.846676826477051, + "learning_rate": 9.538025386724013e-06, + "loss": 1.3699, + "step": 78900 + }, + { + "epoch": 0.39, + "grad_norm": 3.625420331954956, + "learning_rate": 9.537400251929323e-06, + "loss": 1.3505, + "step": 79000 + }, + { + "epoch": 0.39, + "grad_norm": 3.315352439880371, + "learning_rate": 9.536775117134633e-06, + "loss": 1.3456, + "step": 79100 + }, + { + "epoch": 0.39, + "grad_norm": 3.249753475189209, + "learning_rate": 9.536149982339943e-06, + "loss": 1.3538, + "step": 79200 + }, + { + "epoch": 0.39, + "grad_norm": 3.9315223693847656, + "learning_rate": 9.535524847545253e-06, + "loss": 1.3459, + "step": 79300 + }, + { + "epoch": 0.39, + "grad_norm": 3.4720170497894287, + "learning_rate": 9.534899712750562e-06, + "loss": 1.3935, + "step": 79400 + }, + { + "epoch": 0.39, + "grad_norm": 2.97334885597229, + "learning_rate": 9.534274577955872e-06, + "loss": 1.3526, + "step": 79500 + }, + { + "epoch": 0.39, + "grad_norm": 2.218647003173828, + "learning_rate": 9.533649443161182e-06, + "loss": 1.3845, + "step": 79600 + }, + { + "epoch": 0.39, + "grad_norm": 3.644829034805298, + "learning_rate": 9.533024308366492e-06, + "loss": 1.3515, + "step": 79700 + }, + { + "epoch": 0.39, + "grad_norm": 4.018405437469482, + "learning_rate": 9.532399173571802e-06, + "loss": 1.3544, + "step": 79800 + }, + { + "epoch": 0.4, + "grad_norm": 3.210761308670044, + "learning_rate": 9.531774038777112e-06, + "loss": 1.3554, + "step": 79900 + }, + { + "epoch": 0.4, + "grad_norm": 3.046523094177246, + "learning_rate": 9.531148903982422e-06, + "loss": 1.3733, + "step": 80000 + }, + { + "epoch": 0.4, + "grad_norm": 3.437032699584961, + "learning_rate": 9.530523769187732e-06, + "loss": 1.347, + "step": 80100 + }, + { + "epoch": 0.4, + "grad_norm": 3.712858200073242, + "learning_rate": 9.529898634393042e-06, + "loss": 1.3856, + "step": 80200 + }, + { + "epoch": 0.4, + "grad_norm": 2.859689474105835, + "learning_rate": 9.529273499598352e-06, + "loss": 1.3312, + "step": 80300 + }, + { + "epoch": 0.4, + "grad_norm": 3.8814845085144043, + "learning_rate": 9.528648364803662e-06, + "loss": 1.3362, + "step": 80400 + }, + { + "epoch": 0.4, + "grad_norm": 2.983851909637451, + "learning_rate": 9.528023230008972e-06, + "loss": 1.3689, + "step": 80500 + }, + { + "epoch": 0.4, + "grad_norm": 3.3227264881134033, + "learning_rate": 9.527398095214282e-06, + "loss": 1.3362, + "step": 80600 + }, + { + "epoch": 0.4, + "grad_norm": 3.825824499130249, + "learning_rate": 9.526772960419591e-06, + "loss": 1.3496, + "step": 80700 + }, + { + "epoch": 0.4, + "grad_norm": 3.376059055328369, + "learning_rate": 9.526147825624901e-06, + "loss": 1.3781, + "step": 80800 + }, + { + "epoch": 0.4, + "grad_norm": 3.2187156677246094, + "learning_rate": 9.525522690830211e-06, + "loss": 1.4142, + "step": 80900 + }, + { + "epoch": 0.4, + "grad_norm": 3.073812246322632, + "learning_rate": 9.524897556035521e-06, + "loss": 1.3185, + "step": 81000 + }, + { + "epoch": 0.4, + "grad_norm": 2.7107346057891846, + "learning_rate": 9.524272421240831e-06, + "loss": 1.322, + "step": 81100 + }, + { + "epoch": 0.4, + "grad_norm": 3.378969669342041, + "learning_rate": 9.523647286446141e-06, + "loss": 1.3438, + "step": 81200 + }, + { + "epoch": 0.4, + "grad_norm": 4.337489604949951, + "learning_rate": 9.523022151651451e-06, + "loss": 1.3326, + "step": 81300 + }, + { + "epoch": 0.4, + "grad_norm": 4.453660488128662, + "learning_rate": 9.522397016856761e-06, + "loss": 1.3624, + "step": 81400 + }, + { + "epoch": 0.4, + "grad_norm": 3.236886501312256, + "learning_rate": 9.521771882062071e-06, + "loss": 1.3704, + "step": 81500 + }, + { + "epoch": 0.4, + "grad_norm": 3.969984531402588, + "learning_rate": 9.52114674726738e-06, + "loss": 1.3605, + "step": 81600 + }, + { + "epoch": 0.4, + "grad_norm": 2.707930326461792, + "learning_rate": 9.52052161247269e-06, + "loss": 1.3272, + "step": 81700 + }, + { + "epoch": 0.4, + "grad_norm": 3.0617573261260986, + "learning_rate": 9.519896477678e-06, + "loss": 1.3779, + "step": 81800 + }, + { + "epoch": 0.41, + "grad_norm": 2.8938345909118652, + "learning_rate": 9.519271342883309e-06, + "loss": 1.3453, + "step": 81900 + }, + { + "epoch": 0.41, + "grad_norm": 3.273656129837036, + "learning_rate": 9.51864620808862e-06, + "loss": 1.3535, + "step": 82000 + }, + { + "epoch": 0.41, + "grad_norm": 3.6416726112365723, + "learning_rate": 9.518021073293929e-06, + "loss": 1.393, + "step": 82100 + }, + { + "epoch": 0.41, + "grad_norm": 2.7089104652404785, + "learning_rate": 9.51739593849924e-06, + "loss": 1.3568, + "step": 82200 + }, + { + "epoch": 0.41, + "grad_norm": 3.872784376144409, + "learning_rate": 9.516770803704549e-06, + "loss": 1.3725, + "step": 82300 + }, + { + "epoch": 0.41, + "grad_norm": 3.3895182609558105, + "learning_rate": 9.51614566890986e-06, + "loss": 1.3347, + "step": 82400 + }, + { + "epoch": 0.41, + "grad_norm": 3.349815845489502, + "learning_rate": 9.515520534115168e-06, + "loss": 1.2965, + "step": 82500 + }, + { + "epoch": 0.41, + "grad_norm": 3.8851418495178223, + "learning_rate": 9.51489539932048e-06, + "loss": 1.3547, + "step": 82600 + }, + { + "epoch": 0.41, + "grad_norm": 3.7153375148773193, + "learning_rate": 9.514270264525788e-06, + "loss": 1.3502, + "step": 82700 + }, + { + "epoch": 0.41, + "grad_norm": 2.9336204528808594, + "learning_rate": 9.5136451297311e-06, + "loss": 1.3727, + "step": 82800 + }, + { + "epoch": 0.41, + "grad_norm": 2.910884141921997, + "learning_rate": 9.513019994936408e-06, + "loss": 1.3643, + "step": 82900 + }, + { + "epoch": 0.41, + "grad_norm": 2.9535582065582275, + "learning_rate": 9.51239486014172e-06, + "loss": 1.3532, + "step": 83000 + }, + { + "epoch": 0.41, + "grad_norm": 3.453658103942871, + "learning_rate": 9.511769725347028e-06, + "loss": 1.3747, + "step": 83100 + }, + { + "epoch": 0.41, + "grad_norm": 4.163629055023193, + "learning_rate": 9.511144590552338e-06, + "loss": 1.3686, + "step": 83200 + }, + { + "epoch": 0.41, + "grad_norm": 3.291599988937378, + "learning_rate": 9.510519455757648e-06, + "loss": 1.3195, + "step": 83300 + }, + { + "epoch": 0.41, + "grad_norm": 4.140781879425049, + "learning_rate": 9.509894320962958e-06, + "loss": 1.3454, + "step": 83400 + }, + { + "epoch": 0.41, + "grad_norm": 3.2356150150299072, + "learning_rate": 9.509269186168268e-06, + "loss": 1.3656, + "step": 83500 + }, + { + "epoch": 0.41, + "grad_norm": 2.98710298538208, + "learning_rate": 9.508644051373578e-06, + "loss": 1.371, + "step": 83600 + }, + { + "epoch": 0.41, + "grad_norm": 2.949601650238037, + "learning_rate": 9.508018916578888e-06, + "loss": 1.3794, + "step": 83700 + }, + { + "epoch": 0.41, + "grad_norm": 2.5830845832824707, + "learning_rate": 9.507393781784198e-06, + "loss": 1.3665, + "step": 83800 + }, + { + "epoch": 0.42, + "grad_norm": 3.1843700408935547, + "learning_rate": 9.506768646989507e-06, + "loss": 1.3463, + "step": 83900 + }, + { + "epoch": 0.42, + "grad_norm": 3.233429193496704, + "learning_rate": 9.506143512194817e-06, + "loss": 1.3741, + "step": 84000 + }, + { + "epoch": 0.42, + "grad_norm": 3.79780650138855, + "learning_rate": 9.505518377400127e-06, + "loss": 1.3006, + "step": 84100 + }, + { + "epoch": 0.42, + "grad_norm": 3.684920310974121, + "learning_rate": 9.504893242605437e-06, + "loss": 1.3613, + "step": 84200 + }, + { + "epoch": 0.42, + "grad_norm": 4.043038368225098, + "learning_rate": 9.504268107810747e-06, + "loss": 1.3762, + "step": 84300 + }, + { + "epoch": 0.42, + "grad_norm": 3.4520349502563477, + "learning_rate": 9.503642973016057e-06, + "loss": 1.3339, + "step": 84400 + }, + { + "epoch": 0.42, + "grad_norm": 3.5900933742523193, + "learning_rate": 9.503017838221367e-06, + "loss": 1.3507, + "step": 84500 + }, + { + "epoch": 0.42, + "grad_norm": 4.365208625793457, + "learning_rate": 9.502392703426677e-06, + "loss": 1.354, + "step": 84600 + }, + { + "epoch": 0.42, + "grad_norm": 3.7963385581970215, + "learning_rate": 9.501767568631987e-06, + "loss": 1.3377, + "step": 84700 + }, + { + "epoch": 0.42, + "grad_norm": 3.07368803024292, + "learning_rate": 9.501142433837297e-06, + "loss": 1.3544, + "step": 84800 + }, + { + "epoch": 0.42, + "grad_norm": 3.9033076763153076, + "learning_rate": 9.500517299042607e-06, + "loss": 1.3466, + "step": 84900 + }, + { + "epoch": 0.42, + "grad_norm": 2.946506977081299, + "learning_rate": 9.499892164247917e-06, + "loss": 1.3524, + "step": 85000 + }, + { + "epoch": 0.42, + "grad_norm": 4.097044944763184, + "learning_rate": 9.499267029453227e-06, + "loss": 1.3342, + "step": 85100 + }, + { + "epoch": 0.42, + "grad_norm": 3.7137930393218994, + "learning_rate": 9.498641894658536e-06, + "loss": 1.3737, + "step": 85200 + }, + { + "epoch": 0.42, + "grad_norm": 3.2496094703674316, + "learning_rate": 9.498016759863846e-06, + "loss": 1.339, + "step": 85300 + }, + { + "epoch": 0.42, + "grad_norm": 4.326569557189941, + "learning_rate": 9.497391625069156e-06, + "loss": 1.372, + "step": 85400 + }, + { + "epoch": 0.42, + "grad_norm": 2.918201208114624, + "learning_rate": 9.496766490274466e-06, + "loss": 1.341, + "step": 85500 + }, + { + "epoch": 0.42, + "grad_norm": 3.4720118045806885, + "learning_rate": 9.496141355479776e-06, + "loss": 1.3525, + "step": 85600 + }, + { + "epoch": 0.42, + "grad_norm": 3.20745587348938, + "learning_rate": 9.495516220685086e-06, + "loss": 1.3664, + "step": 85700 + }, + { + "epoch": 0.42, + "grad_norm": 3.320747137069702, + "learning_rate": 9.494891085890396e-06, + "loss": 1.326, + "step": 85800 + }, + { + "epoch": 0.42, + "grad_norm": 2.690807342529297, + "learning_rate": 9.494265951095706e-06, + "loss": 1.3689, + "step": 85900 + }, + { + "epoch": 0.43, + "grad_norm": 4.453171253204346, + "learning_rate": 9.493640816301016e-06, + "loss": 1.374, + "step": 86000 + }, + { + "epoch": 0.43, + "grad_norm": 3.375361204147339, + "learning_rate": 9.493015681506326e-06, + "loss": 1.3427, + "step": 86100 + }, + { + "epoch": 0.43, + "grad_norm": 3.053560495376587, + "learning_rate": 9.492390546711636e-06, + "loss": 1.3418, + "step": 86200 + }, + { + "epoch": 0.43, + "grad_norm": 4.028963565826416, + "learning_rate": 9.491765411916946e-06, + "loss": 1.3203, + "step": 86300 + }, + { + "epoch": 0.43, + "grad_norm": 3.746544599533081, + "learning_rate": 9.491140277122256e-06, + "loss": 1.3121, + "step": 86400 + }, + { + "epoch": 0.43, + "grad_norm": 3.1117103099823, + "learning_rate": 9.490515142327566e-06, + "loss": 1.3383, + "step": 86500 + }, + { + "epoch": 0.43, + "grad_norm": 3.2640998363494873, + "learning_rate": 9.489890007532875e-06, + "loss": 1.3384, + "step": 86600 + }, + { + "epoch": 0.43, + "grad_norm": 3.436328172683716, + "learning_rate": 9.489264872738185e-06, + "loss": 1.3587, + "step": 86700 + }, + { + "epoch": 0.43, + "grad_norm": 3.372560977935791, + "learning_rate": 9.488639737943495e-06, + "loss": 1.4054, + "step": 86800 + }, + { + "epoch": 0.43, + "grad_norm": 2.880247116088867, + "learning_rate": 9.488014603148805e-06, + "loss": 1.3166, + "step": 86900 + }, + { + "epoch": 0.43, + "grad_norm": 3.686885356903076, + "learning_rate": 9.487389468354115e-06, + "loss": 1.3629, + "step": 87000 + }, + { + "epoch": 0.43, + "grad_norm": 3.168898820877075, + "learning_rate": 9.486764333559425e-06, + "loss": 1.3868, + "step": 87100 + }, + { + "epoch": 0.43, + "grad_norm": 3.3519859313964844, + "learning_rate": 9.486139198764735e-06, + "loss": 1.3696, + "step": 87200 + }, + { + "epoch": 0.43, + "grad_norm": 2.844688892364502, + "learning_rate": 9.485514063970043e-06, + "loss": 1.3498, + "step": 87300 + }, + { + "epoch": 0.43, + "grad_norm": 3.061849594116211, + "learning_rate": 9.484888929175355e-06, + "loss": 1.3692, + "step": 87400 + }, + { + "epoch": 0.43, + "grad_norm": 4.100019931793213, + "learning_rate": 9.484263794380663e-06, + "loss": 1.3407, + "step": 87500 + }, + { + "epoch": 0.43, + "grad_norm": 3.519801378250122, + "learning_rate": 9.483638659585975e-06, + "loss": 1.3565, + "step": 87600 + }, + { + "epoch": 0.43, + "grad_norm": 6.410887241363525, + "learning_rate": 9.483013524791283e-06, + "loss": 1.3291, + "step": 87700 + }, + { + "epoch": 0.43, + "grad_norm": 3.080322504043579, + "learning_rate": 9.482388389996595e-06, + "loss": 1.3519, + "step": 87800 + }, + { + "epoch": 0.43, + "grad_norm": 3.0409817695617676, + "learning_rate": 9.481763255201903e-06, + "loss": 1.3519, + "step": 87900 + }, + { + "epoch": 0.44, + "grad_norm": 4.2845025062561035, + "learning_rate": 9.481138120407214e-06, + "loss": 1.3436, + "step": 88000 + }, + { + "epoch": 0.44, + "grad_norm": 3.1132304668426514, + "learning_rate": 9.480512985612523e-06, + "loss": 1.3145, + "step": 88100 + }, + { + "epoch": 0.44, + "grad_norm": 4.386362075805664, + "learning_rate": 9.479887850817834e-06, + "loss": 1.3483, + "step": 88200 + }, + { + "epoch": 0.44, + "grad_norm": 2.7619481086730957, + "learning_rate": 9.479262716023143e-06, + "loss": 1.312, + "step": 88300 + }, + { + "epoch": 0.44, + "grad_norm": 3.451927900314331, + "learning_rate": 9.478637581228454e-06, + "loss": 1.3807, + "step": 88400 + }, + { + "epoch": 0.44, + "grad_norm": 3.5724120140075684, + "learning_rate": 9.478012446433762e-06, + "loss": 1.3469, + "step": 88500 + }, + { + "epoch": 0.44, + "grad_norm": 4.330935955047607, + "learning_rate": 9.477387311639072e-06, + "loss": 1.3136, + "step": 88600 + }, + { + "epoch": 0.44, + "grad_norm": 3.6509666442871094, + "learning_rate": 9.476762176844382e-06, + "loss": 1.3645, + "step": 88700 + }, + { + "epoch": 0.44, + "grad_norm": 3.39678692817688, + "learning_rate": 9.476137042049692e-06, + "loss": 1.3229, + "step": 88800 + }, + { + "epoch": 0.44, + "grad_norm": 3.2337393760681152, + "learning_rate": 9.475511907255002e-06, + "loss": 1.3473, + "step": 88900 + }, + { + "epoch": 0.44, + "grad_norm": 2.9486355781555176, + "learning_rate": 9.474886772460312e-06, + "loss": 1.341, + "step": 89000 + }, + { + "epoch": 0.44, + "grad_norm": 3.5861918926239014, + "learning_rate": 9.474261637665622e-06, + "loss": 1.3708, + "step": 89100 + }, + { + "epoch": 0.44, + "grad_norm": 2.8153584003448486, + "learning_rate": 9.473636502870932e-06, + "loss": 1.35, + "step": 89200 + }, + { + "epoch": 0.44, + "grad_norm": 3.0656278133392334, + "learning_rate": 9.473011368076242e-06, + "loss": 1.3564, + "step": 89300 + }, + { + "epoch": 0.44, + "grad_norm": 3.5475146770477295, + "learning_rate": 9.472386233281552e-06, + "loss": 1.3178, + "step": 89400 + }, + { + "epoch": 0.44, + "grad_norm": 4.837975025177002, + "learning_rate": 9.471761098486862e-06, + "loss": 1.3588, + "step": 89500 + }, + { + "epoch": 0.44, + "grad_norm": 3.626478433609009, + "learning_rate": 9.471135963692172e-06, + "loss": 1.3288, + "step": 89600 + }, + { + "epoch": 0.44, + "grad_norm": 2.8399198055267334, + "learning_rate": 9.470510828897481e-06, + "loss": 1.3542, + "step": 89700 + }, + { + "epoch": 0.44, + "grad_norm": 3.478510856628418, + "learning_rate": 9.469885694102791e-06, + "loss": 1.3282, + "step": 89800 + }, + { + "epoch": 0.44, + "grad_norm": 3.0036330223083496, + "learning_rate": 9.469260559308101e-06, + "loss": 1.3135, + "step": 89900 + }, + { + "epoch": 0.45, + "grad_norm": 3.49764084815979, + "learning_rate": 9.468635424513411e-06, + "loss": 1.3216, + "step": 90000 + }, + { + "epoch": 0.45, + "grad_norm": 4.711456298828125, + "learning_rate": 9.468010289718721e-06, + "loss": 1.3746, + "step": 90100 + }, + { + "epoch": 0.45, + "grad_norm": 3.8252532482147217, + "learning_rate": 9.467385154924031e-06, + "loss": 1.3375, + "step": 90200 + }, + { + "epoch": 0.45, + "grad_norm": 3.445317029953003, + "learning_rate": 9.466760020129341e-06, + "loss": 1.3523, + "step": 90300 + }, + { + "epoch": 0.45, + "grad_norm": 2.879566192626953, + "learning_rate": 9.466134885334651e-06, + "loss": 1.3184, + "step": 90400 + }, + { + "epoch": 0.45, + "grad_norm": 3.891055107116699, + "learning_rate": 9.465509750539961e-06, + "loss": 1.3246, + "step": 90500 + }, + { + "epoch": 0.45, + "grad_norm": 2.6852951049804688, + "learning_rate": 9.46488461574527e-06, + "loss": 1.3803, + "step": 90600 + }, + { + "epoch": 0.45, + "grad_norm": 2.9267516136169434, + "learning_rate": 9.46425948095058e-06, + "loss": 1.3254, + "step": 90700 + }, + { + "epoch": 0.45, + "grad_norm": 2.6373448371887207, + "learning_rate": 9.46363434615589e-06, + "loss": 1.3266, + "step": 90800 + }, + { + "epoch": 0.45, + "grad_norm": 3.703024387359619, + "learning_rate": 9.4630092113612e-06, + "loss": 1.3552, + "step": 90900 + }, + { + "epoch": 0.45, + "grad_norm": 2.5792810916900635, + "learning_rate": 9.46238407656651e-06, + "loss": 1.3365, + "step": 91000 + }, + { + "epoch": 0.45, + "grad_norm": 3.6020054817199707, + "learning_rate": 9.46175894177182e-06, + "loss": 1.3287, + "step": 91100 + }, + { + "epoch": 0.45, + "grad_norm": 3.713806390762329, + "learning_rate": 9.46113380697713e-06, + "loss": 1.3537, + "step": 91200 + }, + { + "epoch": 0.45, + "grad_norm": 4.407512187957764, + "learning_rate": 9.46050867218244e-06, + "loss": 1.3835, + "step": 91300 + }, + { + "epoch": 0.45, + "grad_norm": 2.683220863342285, + "learning_rate": 9.45988353738775e-06, + "loss": 1.3206, + "step": 91400 + }, + { + "epoch": 0.45, + "grad_norm": 3.046828031539917, + "learning_rate": 9.45925840259306e-06, + "loss": 1.3251, + "step": 91500 + }, + { + "epoch": 0.45, + "grad_norm": 2.8515381813049316, + "learning_rate": 9.45863326779837e-06, + "loss": 1.3432, + "step": 91600 + }, + { + "epoch": 0.45, + "grad_norm": 3.381223678588867, + "learning_rate": 9.45800813300368e-06, + "loss": 1.355, + "step": 91700 + }, + { + "epoch": 0.45, + "grad_norm": 3.301053762435913, + "learning_rate": 9.45738299820899e-06, + "loss": 1.3406, + "step": 91800 + }, + { + "epoch": 0.45, + "grad_norm": 2.924475908279419, + "learning_rate": 9.4567578634143e-06, + "loss": 1.3444, + "step": 91900 + }, + { + "epoch": 0.46, + "grad_norm": 3.036510705947876, + "learning_rate": 9.45613272861961e-06, + "loss": 1.3519, + "step": 92000 + }, + { + "epoch": 0.46, + "grad_norm": 2.7162649631500244, + "learning_rate": 9.45550759382492e-06, + "loss": 1.3407, + "step": 92100 + }, + { + "epoch": 0.46, + "grad_norm": 2.7335431575775146, + "learning_rate": 9.45488245903023e-06, + "loss": 1.3941, + "step": 92200 + }, + { + "epoch": 0.46, + "grad_norm": 2.4760313034057617, + "learning_rate": 9.45425732423554e-06, + "loss": 1.3481, + "step": 92300 + }, + { + "epoch": 0.46, + "grad_norm": 3.327454090118408, + "learning_rate": 9.45363218944085e-06, + "loss": 1.3313, + "step": 92400 + }, + { + "epoch": 0.46, + "grad_norm": 3.170297861099243, + "learning_rate": 9.453007054646158e-06, + "loss": 1.3222, + "step": 92500 + }, + { + "epoch": 0.46, + "grad_norm": 3.097593307495117, + "learning_rate": 9.45238191985147e-06, + "loss": 1.3615, + "step": 92600 + }, + { + "epoch": 0.46, + "grad_norm": 2.889549493789673, + "learning_rate": 9.451756785056778e-06, + "loss": 1.3447, + "step": 92700 + }, + { + "epoch": 0.46, + "grad_norm": 2.5188488960266113, + "learning_rate": 9.45113165026209e-06, + "loss": 1.3844, + "step": 92800 + }, + { + "epoch": 0.46, + "grad_norm": 2.8199424743652344, + "learning_rate": 9.450506515467397e-06, + "loss": 1.3751, + "step": 92900 + }, + { + "epoch": 0.46, + "grad_norm": 2.9179065227508545, + "learning_rate": 9.449881380672709e-06, + "loss": 1.3483, + "step": 93000 + }, + { + "epoch": 0.46, + "grad_norm": 3.69584584236145, + "learning_rate": 9.449256245878017e-06, + "loss": 1.3613, + "step": 93100 + }, + { + "epoch": 0.46, + "grad_norm": 4.401488780975342, + "learning_rate": 9.448631111083329e-06, + "loss": 1.3625, + "step": 93200 + }, + { + "epoch": 0.46, + "grad_norm": 2.9850871562957764, + "learning_rate": 9.448005976288637e-06, + "loss": 1.3522, + "step": 93300 + }, + { + "epoch": 0.46, + "grad_norm": 3.8156750202178955, + "learning_rate": 9.447380841493949e-06, + "loss": 1.3387, + "step": 93400 + }, + { + "epoch": 0.46, + "grad_norm": 3.664689779281616, + "learning_rate": 9.446755706699257e-06, + "loss": 1.3233, + "step": 93500 + }, + { + "epoch": 0.46, + "grad_norm": 4.119280815124512, + "learning_rate": 9.446130571904569e-06, + "loss": 1.396, + "step": 93600 + }, + { + "epoch": 0.46, + "grad_norm": 2.9794814586639404, + "learning_rate": 9.445505437109877e-06, + "loss": 1.3731, + "step": 93700 + }, + { + "epoch": 0.46, + "grad_norm": 2.943528890609741, + "learning_rate": 9.444880302315187e-06, + "loss": 1.3452, + "step": 93800 + }, + { + "epoch": 0.46, + "grad_norm": 2.734614610671997, + "learning_rate": 9.444255167520497e-06, + "loss": 1.346, + "step": 93900 + }, + { + "epoch": 0.47, + "grad_norm": 3.5047719478607178, + "learning_rate": 9.443630032725807e-06, + "loss": 1.3618, + "step": 94000 + }, + { + "epoch": 0.47, + "grad_norm": 3.040126323699951, + "learning_rate": 9.443004897931117e-06, + "loss": 1.3461, + "step": 94100 + }, + { + "epoch": 0.47, + "grad_norm": 3.951183795928955, + "learning_rate": 9.442379763136426e-06, + "loss": 1.3402, + "step": 94200 + }, + { + "epoch": 0.47, + "grad_norm": 3.8336355686187744, + "learning_rate": 9.441754628341736e-06, + "loss": 1.3211, + "step": 94300 + }, + { + "epoch": 0.47, + "grad_norm": 2.9649171829223633, + "learning_rate": 9.441129493547046e-06, + "loss": 1.3538, + "step": 94400 + }, + { + "epoch": 0.47, + "grad_norm": 3.0716583728790283, + "learning_rate": 9.440504358752356e-06, + "loss": 1.3518, + "step": 94500 + }, + { + "epoch": 0.47, + "grad_norm": 2.94270658493042, + "learning_rate": 9.439879223957666e-06, + "loss": 1.3278, + "step": 94600 + }, + { + "epoch": 0.47, + "grad_norm": 2.8442766666412354, + "learning_rate": 9.439254089162976e-06, + "loss": 1.32, + "step": 94700 + }, + { + "epoch": 0.47, + "grad_norm": 3.8846325874328613, + "learning_rate": 9.438628954368286e-06, + "loss": 1.3391, + "step": 94800 + }, + { + "epoch": 0.47, + "grad_norm": 2.698730230331421, + "learning_rate": 9.438003819573596e-06, + "loss": 1.3777, + "step": 94900 + }, + { + "epoch": 0.47, + "grad_norm": 3.3867924213409424, + "learning_rate": 9.437378684778906e-06, + "loss": 1.3563, + "step": 95000 + }, + { + "epoch": 0.47, + "grad_norm": 2.769615411758423, + "learning_rate": 9.436753549984216e-06, + "loss": 1.3298, + "step": 95100 + }, + { + "epoch": 0.47, + "grad_norm": 3.6002724170684814, + "learning_rate": 9.436128415189526e-06, + "loss": 1.3425, + "step": 95200 + }, + { + "epoch": 0.47, + "grad_norm": 3.228452205657959, + "learning_rate": 9.435503280394836e-06, + "loss": 1.3429, + "step": 95300 + }, + { + "epoch": 0.47, + "grad_norm": 3.423189401626587, + "learning_rate": 9.434878145600146e-06, + "loss": 1.3768, + "step": 95400 + }, + { + "epoch": 0.47, + "grad_norm": 3.5708446502685547, + "learning_rate": 9.434253010805456e-06, + "loss": 1.3125, + "step": 95500 + }, + { + "epoch": 0.47, + "grad_norm": 2.9108211994171143, + "learning_rate": 9.433627876010765e-06, + "loss": 1.3347, + "step": 95600 + }, + { + "epoch": 0.47, + "grad_norm": 3.365302324295044, + "learning_rate": 9.433002741216075e-06, + "loss": 1.3277, + "step": 95700 + }, + { + "epoch": 0.47, + "grad_norm": 3.0356671810150146, + "learning_rate": 9.432377606421385e-06, + "loss": 1.3345, + "step": 95800 + }, + { + "epoch": 0.47, + "grad_norm": 2.6186368465423584, + "learning_rate": 9.431752471626695e-06, + "loss": 1.3059, + "step": 95900 + }, + { + "epoch": 0.47, + "grad_norm": 3.4578585624694824, + "learning_rate": 9.431127336832005e-06, + "loss": 1.3635, + "step": 96000 + }, + { + "epoch": 0.48, + "grad_norm": 2.8224425315856934, + "learning_rate": 9.430502202037315e-06, + "loss": 1.3325, + "step": 96100 + }, + { + "epoch": 0.48, + "grad_norm": 3.635671377182007, + "learning_rate": 9.429877067242625e-06, + "loss": 1.3007, + "step": 96200 + }, + { + "epoch": 0.48, + "grad_norm": 2.900747299194336, + "learning_rate": 9.429251932447935e-06, + "loss": 1.3317, + "step": 96300 + }, + { + "epoch": 0.48, + "grad_norm": 2.8643271923065186, + "learning_rate": 9.428626797653245e-06, + "loss": 1.3489, + "step": 96400 + }, + { + "epoch": 0.48, + "grad_norm": 3.498797655105591, + "learning_rate": 9.428001662858555e-06, + "loss": 1.3263, + "step": 96500 + }, + { + "epoch": 0.48, + "grad_norm": 2.8632445335388184, + "learning_rate": 9.427376528063865e-06, + "loss": 1.3465, + "step": 96600 + }, + { + "epoch": 0.48, + "grad_norm": 2.886178970336914, + "learning_rate": 9.426751393269175e-06, + "loss": 1.3622, + "step": 96700 + }, + { + "epoch": 0.48, + "grad_norm": 3.605872869491577, + "learning_rate": 9.426126258474485e-06, + "loss": 1.3404, + "step": 96800 + }, + { + "epoch": 0.48, + "grad_norm": 4.709196090698242, + "learning_rate": 9.425501123679795e-06, + "loss": 1.3344, + "step": 96900 + }, + { + "epoch": 0.48, + "grad_norm": 3.5497000217437744, + "learning_rate": 9.424875988885104e-06, + "loss": 1.3536, + "step": 97000 + }, + { + "epoch": 0.48, + "grad_norm": 3.165081024169922, + "learning_rate": 9.424250854090414e-06, + "loss": 1.3453, + "step": 97100 + }, + { + "epoch": 0.48, + "grad_norm": 3.4329254627227783, + "learning_rate": 9.423625719295724e-06, + "loss": 1.3044, + "step": 97200 + }, + { + "epoch": 0.48, + "grad_norm": 3.0184082984924316, + "learning_rate": 9.423000584501034e-06, + "loss": 1.344, + "step": 97300 + }, + { + "epoch": 0.48, + "grad_norm": 3.776301383972168, + "learning_rate": 9.422375449706344e-06, + "loss": 1.3315, + "step": 97400 + }, + { + "epoch": 0.48, + "grad_norm": 2.908074140548706, + "learning_rate": 9.421750314911654e-06, + "loss": 1.3613, + "step": 97500 + }, + { + "epoch": 0.48, + "grad_norm": 3.8458778858184814, + "learning_rate": 9.421125180116964e-06, + "loss": 1.3665, + "step": 97600 + }, + { + "epoch": 0.48, + "grad_norm": 3.778986930847168, + "learning_rate": 9.420500045322274e-06, + "loss": 1.3271, + "step": 97700 + }, + { + "epoch": 0.48, + "grad_norm": 4.841845512390137, + "learning_rate": 9.419874910527584e-06, + "loss": 1.3842, + "step": 97800 + }, + { + "epoch": 0.48, + "grad_norm": 2.875431537628174, + "learning_rate": 9.419249775732892e-06, + "loss": 1.3734, + "step": 97900 + }, + { + "epoch": 0.48, + "grad_norm": 3.327831268310547, + "learning_rate": 9.418624640938204e-06, + "loss": 1.3225, + "step": 98000 + }, + { + "epoch": 0.49, + "grad_norm": 3.921052932739258, + "learning_rate": 9.417999506143512e-06, + "loss": 1.3103, + "step": 98100 + }, + { + "epoch": 0.49, + "grad_norm": 3.3352317810058594, + "learning_rate": 9.417374371348824e-06, + "loss": 1.33, + "step": 98200 + }, + { + "epoch": 0.49, + "grad_norm": 2.6515772342681885, + "learning_rate": 9.416749236554132e-06, + "loss": 1.3325, + "step": 98300 + }, + { + "epoch": 0.49, + "grad_norm": 2.6556906700134277, + "learning_rate": 9.416124101759443e-06, + "loss": 1.3537, + "step": 98400 + }, + { + "epoch": 0.49, + "grad_norm": 3.394216775894165, + "learning_rate": 9.415498966964752e-06, + "loss": 1.337, + "step": 98500 + }, + { + "epoch": 0.49, + "grad_norm": 3.2017979621887207, + "learning_rate": 9.414873832170063e-06, + "loss": 1.3697, + "step": 98600 + }, + { + "epoch": 0.49, + "grad_norm": 4.548534393310547, + "learning_rate": 9.414248697375371e-06, + "loss": 1.353, + "step": 98700 + }, + { + "epoch": 0.49, + "grad_norm": 3.0345072746276855, + "learning_rate": 9.413623562580683e-06, + "loss": 1.3475, + "step": 98800 + }, + { + "epoch": 0.49, + "grad_norm": 2.3942067623138428, + "learning_rate": 9.412998427785991e-06, + "loss": 1.293, + "step": 98900 + }, + { + "epoch": 0.49, + "grad_norm": 2.577939033508301, + "learning_rate": 9.412373292991303e-06, + "loss": 1.3267, + "step": 99000 + }, + { + "epoch": 0.49, + "grad_norm": 3.599987745285034, + "learning_rate": 9.411748158196611e-06, + "loss": 1.3248, + "step": 99100 + }, + { + "epoch": 0.49, + "grad_norm": 2.732025623321533, + "learning_rate": 9.411123023401921e-06, + "loss": 1.3331, + "step": 99200 + }, + { + "epoch": 0.49, + "grad_norm": 3.382721185684204, + "learning_rate": 9.410497888607231e-06, + "loss": 1.3492, + "step": 99300 + }, + { + "epoch": 0.49, + "grad_norm": 3.670431613922119, + "learning_rate": 9.409872753812541e-06, + "loss": 1.3651, + "step": 99400 + }, + { + "epoch": 0.49, + "grad_norm": 2.932300329208374, + "learning_rate": 9.409247619017851e-06, + "loss": 1.3176, + "step": 99500 + }, + { + "epoch": 0.49, + "grad_norm": 4.241666316986084, + "learning_rate": 9.40862248422316e-06, + "loss": 1.3056, + "step": 99600 + }, + { + "epoch": 0.49, + "grad_norm": 2.6911585330963135, + "learning_rate": 9.40799734942847e-06, + "loss": 1.3425, + "step": 99700 + }, + { + "epoch": 0.49, + "grad_norm": 2.879465103149414, + "learning_rate": 9.40737221463378e-06, + "loss": 1.3395, + "step": 99800 + }, + { + "epoch": 0.49, + "grad_norm": 2.778740167617798, + "learning_rate": 9.40674707983909e-06, + "loss": 1.3426, + "step": 99900 + }, + { + "epoch": 0.49, + "grad_norm": 2.6691386699676514, + "learning_rate": 9.4061219450444e-06, + "loss": 1.3696, + "step": 100000 + }, + { + "epoch": 0.5, + "grad_norm": 4.0432562828063965, + "learning_rate": 9.40549681024971e-06, + "loss": 1.368, + "step": 100100 + }, + { + "epoch": 0.5, + "grad_norm": 2.7415411472320557, + "learning_rate": 9.40487167545502e-06, + "loss": 1.3368, + "step": 100200 + }, + { + "epoch": 0.5, + "grad_norm": 2.6961042881011963, + "learning_rate": 9.404246540660332e-06, + "loss": 1.3821, + "step": 100300 + }, + { + "epoch": 0.5, + "grad_norm": 3.592819929122925, + "learning_rate": 9.40362140586564e-06, + "loss": 1.3313, + "step": 100400 + }, + { + "epoch": 0.5, + "grad_norm": 3.588106632232666, + "learning_rate": 9.40299627107095e-06, + "loss": 1.3369, + "step": 100500 + }, + { + "epoch": 0.5, + "grad_norm": 3.1717209815979004, + "learning_rate": 9.40237113627626e-06, + "loss": 1.3063, + "step": 100600 + }, + { + "epoch": 0.5, + "grad_norm": 3.9011149406433105, + "learning_rate": 9.40174600148157e-06, + "loss": 1.3371, + "step": 100700 + }, + { + "epoch": 0.5, + "grad_norm": 2.861337184906006, + "learning_rate": 9.40112086668688e-06, + "loss": 1.3511, + "step": 100800 + }, + { + "epoch": 0.5, + "grad_norm": 3.2174508571624756, + "learning_rate": 9.40049573189219e-06, + "loss": 1.3374, + "step": 100900 + }, + { + "epoch": 0.5, + "grad_norm": 3.301086664199829, + "learning_rate": 9.3998705970975e-06, + "loss": 1.3024, + "step": 101000 + }, + { + "epoch": 0.5, + "grad_norm": 3.3187129497528076, + "learning_rate": 9.39924546230281e-06, + "loss": 1.349, + "step": 101100 + }, + { + "epoch": 0.5, + "grad_norm": 2.6953561305999756, + "learning_rate": 9.39862032750812e-06, + "loss": 1.3528, + "step": 101200 + }, + { + "epoch": 0.5, + "grad_norm": 3.3338325023651123, + "learning_rate": 9.39799519271343e-06, + "loss": 1.3511, + "step": 101300 + }, + { + "epoch": 0.5, + "grad_norm": 3.3522443771362305, + "learning_rate": 9.39737005791874e-06, + "loss": 1.3373, + "step": 101400 + }, + { + "epoch": 0.5, + "grad_norm": 2.7400362491607666, + "learning_rate": 9.39674492312405e-06, + "loss": 1.3535, + "step": 101500 + }, + { + "epoch": 0.5, + "grad_norm": 2.793731212615967, + "learning_rate": 9.39611978832936e-06, + "loss": 1.3184, + "step": 101600 + }, + { + "epoch": 0.5, + "grad_norm": 2.759066581726074, + "learning_rate": 9.39549465353467e-06, + "loss": 1.3489, + "step": 101700 + }, + { + "epoch": 0.5, + "grad_norm": 4.7479681968688965, + "learning_rate": 9.39486951873998e-06, + "loss": 1.3498, + "step": 101800 + }, + { + "epoch": 0.5, + "grad_norm": 3.2522835731506348, + "learning_rate": 9.394244383945289e-06, + "loss": 1.3476, + "step": 101900 + }, + { + "epoch": 0.5, + "grad_norm": 4.208197593688965, + "learning_rate": 9.393619249150599e-06, + "loss": 1.3289, + "step": 102000 + }, + { + "epoch": 0.51, + "grad_norm": 2.8332533836364746, + "learning_rate": 9.392994114355909e-06, + "loss": 1.3501, + "step": 102100 + }, + { + "epoch": 0.51, + "grad_norm": 3.675553798675537, + "learning_rate": 9.392368979561219e-06, + "loss": 1.3559, + "step": 102200 + }, + { + "epoch": 0.51, + "grad_norm": 2.7442257404327393, + "learning_rate": 9.391743844766529e-06, + "loss": 1.346, + "step": 102300 + }, + { + "epoch": 0.51, + "grad_norm": 3.129180431365967, + "learning_rate": 9.391118709971839e-06, + "loss": 1.3265, + "step": 102400 + }, + { + "epoch": 0.51, + "grad_norm": 3.1826012134552, + "learning_rate": 9.390493575177149e-06, + "loss": 1.3488, + "step": 102500 + }, + { + "epoch": 0.51, + "grad_norm": 3.8879926204681396, + "learning_rate": 9.389868440382459e-06, + "loss": 1.3814, + "step": 102600 + }, + { + "epoch": 0.51, + "grad_norm": 4.066867828369141, + "learning_rate": 9.389243305587769e-06, + "loss": 1.3454, + "step": 102700 + }, + { + "epoch": 0.51, + "grad_norm": 3.3028340339660645, + "learning_rate": 9.388618170793078e-06, + "loss": 1.3661, + "step": 102800 + }, + { + "epoch": 0.51, + "grad_norm": 2.9503161907196045, + "learning_rate": 9.387993035998388e-06, + "loss": 1.3326, + "step": 102900 + }, + { + "epoch": 0.51, + "grad_norm": 3.030353546142578, + "learning_rate": 9.387367901203698e-06, + "loss": 1.3436, + "step": 103000 + }, + { + "epoch": 0.51, + "grad_norm": 3.6172800064086914, + "learning_rate": 9.386742766409007e-06, + "loss": 1.3545, + "step": 103100 + }, + { + "epoch": 0.51, + "grad_norm": 3.2115883827209473, + "learning_rate": 9.386117631614318e-06, + "loss": 1.3345, + "step": 103200 + }, + { + "epoch": 0.51, + "grad_norm": 2.9105865955352783, + "learning_rate": 9.385492496819626e-06, + "loss": 1.3667, + "step": 103300 + }, + { + "epoch": 0.51, + "grad_norm": 4.278082847595215, + "learning_rate": 9.384867362024938e-06, + "loss": 1.3497, + "step": 103400 + }, + { + "epoch": 0.51, + "grad_norm": 3.113901376724243, + "learning_rate": 9.384242227230246e-06, + "loss": 1.3216, + "step": 103500 + }, + { + "epoch": 0.51, + "grad_norm": 3.9379656314849854, + "learning_rate": 9.383617092435558e-06, + "loss": 1.3469, + "step": 103600 + }, + { + "epoch": 0.51, + "grad_norm": 3.53202748298645, + "learning_rate": 9.382991957640866e-06, + "loss": 1.3488, + "step": 103700 + }, + { + "epoch": 0.51, + "grad_norm": 2.973238468170166, + "learning_rate": 9.382366822846178e-06, + "loss": 1.3387, + "step": 103800 + }, + { + "epoch": 0.51, + "grad_norm": 3.464711904525757, + "learning_rate": 9.381741688051486e-06, + "loss": 1.3472, + "step": 103900 + }, + { + "epoch": 0.51, + "grad_norm": 2.8774147033691406, + "learning_rate": 9.381116553256798e-06, + "loss": 1.3786, + "step": 104000 + }, + { + "epoch": 0.52, + "grad_norm": 3.0821099281311035, + "learning_rate": 9.380491418462106e-06, + "loss": 1.3329, + "step": 104100 + }, + { + "epoch": 0.52, + "grad_norm": 4.228484630584717, + "learning_rate": 9.379866283667417e-06, + "loss": 1.3329, + "step": 104200 + }, + { + "epoch": 0.52, + "grad_norm": 3.113833427429199, + "learning_rate": 9.379241148872726e-06, + "loss": 1.373, + "step": 104300 + }, + { + "epoch": 0.52, + "grad_norm": 3.078624963760376, + "learning_rate": 9.378616014078036e-06, + "loss": 1.3409, + "step": 104400 + }, + { + "epoch": 0.52, + "grad_norm": 2.510251045227051, + "learning_rate": 9.377990879283346e-06, + "loss": 1.3393, + "step": 104500 + }, + { + "epoch": 0.52, + "grad_norm": 3.0739970207214355, + "learning_rate": 9.377365744488655e-06, + "loss": 1.3696, + "step": 104600 + }, + { + "epoch": 0.52, + "grad_norm": 3.1729655265808105, + "learning_rate": 9.376740609693965e-06, + "loss": 1.3229, + "step": 104700 + }, + { + "epoch": 0.52, + "grad_norm": 3.003714084625244, + "learning_rate": 9.376115474899275e-06, + "loss": 1.3309, + "step": 104800 + }, + { + "epoch": 0.52, + "grad_norm": 3.0130865573883057, + "learning_rate": 9.375490340104585e-06, + "loss": 1.3736, + "step": 104900 + }, + { + "epoch": 0.52, + "grad_norm": 3.3182711601257324, + "learning_rate": 9.374865205309895e-06, + "loss": 1.342, + "step": 105000 + }, + { + "epoch": 0.52, + "grad_norm": 2.8089771270751953, + "learning_rate": 9.374240070515205e-06, + "loss": 1.3187, + "step": 105100 + }, + { + "epoch": 0.52, + "grad_norm": 3.321974515914917, + "learning_rate": 9.373614935720515e-06, + "loss": 1.3676, + "step": 105200 + }, + { + "epoch": 0.52, + "grad_norm": 3.114701986312866, + "learning_rate": 9.372989800925825e-06, + "loss": 1.3746, + "step": 105300 + }, + { + "epoch": 0.52, + "grad_norm": 3.1047348976135254, + "learning_rate": 9.372364666131135e-06, + "loss": 1.3437, + "step": 105400 + }, + { + "epoch": 0.52, + "grad_norm": 3.5590476989746094, + "learning_rate": 9.371739531336446e-06, + "loss": 1.3496, + "step": 105500 + }, + { + "epoch": 0.52, + "grad_norm": 3.6819374561309814, + "learning_rate": 9.371114396541755e-06, + "loss": 1.3254, + "step": 105600 + }, + { + "epoch": 0.52, + "grad_norm": 3.0282516479492188, + "learning_rate": 9.370489261747066e-06, + "loss": 1.3568, + "step": 105700 + }, + { + "epoch": 0.52, + "grad_norm": 3.659374475479126, + "learning_rate": 9.369864126952375e-06, + "loss": 1.3331, + "step": 105800 + }, + { + "epoch": 0.52, + "grad_norm": 2.9225759506225586, + "learning_rate": 9.369238992157685e-06, + "loss": 1.3293, + "step": 105900 + }, + { + "epoch": 0.52, + "grad_norm": 3.179713726043701, + "learning_rate": 9.368613857362994e-06, + "loss": 1.3342, + "step": 106000 + }, + { + "epoch": 0.52, + "grad_norm": 3.126467704772949, + "learning_rate": 9.367988722568304e-06, + "loss": 1.3441, + "step": 106100 + }, + { + "epoch": 0.53, + "grad_norm": 4.179965019226074, + "learning_rate": 9.367363587773614e-06, + "loss": 1.3225, + "step": 106200 + }, + { + "epoch": 0.53, + "grad_norm": 4.020696640014648, + "learning_rate": 9.366738452978924e-06, + "loss": 1.3263, + "step": 106300 + }, + { + "epoch": 0.53, + "grad_norm": 4.02736759185791, + "learning_rate": 9.366113318184234e-06, + "loss": 1.3621, + "step": 106400 + }, + { + "epoch": 0.53, + "grad_norm": 2.7809269428253174, + "learning_rate": 9.365488183389544e-06, + "loss": 1.3583, + "step": 106500 + }, + { + "epoch": 0.53, + "grad_norm": 3.92323637008667, + "learning_rate": 9.364863048594854e-06, + "loss": 1.359, + "step": 106600 + }, + { + "epoch": 0.53, + "grad_norm": 3.1310439109802246, + "learning_rate": 9.364237913800164e-06, + "loss": 1.3296, + "step": 106700 + }, + { + "epoch": 0.53, + "grad_norm": 2.9712395668029785, + "learning_rate": 9.363612779005474e-06, + "loss": 1.3118, + "step": 106800 + }, + { + "epoch": 0.53, + "grad_norm": 3.047405481338501, + "learning_rate": 9.362987644210784e-06, + "loss": 1.3441, + "step": 106900 + }, + { + "epoch": 0.53, + "grad_norm": 4.126023292541504, + "learning_rate": 9.362362509416094e-06, + "loss": 1.3484, + "step": 107000 + }, + { + "epoch": 0.53, + "grad_norm": 4.234996318817139, + "learning_rate": 9.361737374621404e-06, + "loss": 1.2947, + "step": 107100 + }, + { + "epoch": 0.53, + "grad_norm": 3.2066574096679688, + "learning_rate": 9.361112239826714e-06, + "loss": 1.3595, + "step": 107200 + }, + { + "epoch": 0.53, + "grad_norm": 2.9832849502563477, + "learning_rate": 9.360487105032023e-06, + "loss": 1.324, + "step": 107300 + }, + { + "epoch": 0.53, + "grad_norm": 3.168886661529541, + "learning_rate": 9.359861970237333e-06, + "loss": 1.3508, + "step": 107400 + }, + { + "epoch": 0.53, + "grad_norm": 4.0027995109558105, + "learning_rate": 9.359236835442643e-06, + "loss": 1.344, + "step": 107500 + }, + { + "epoch": 0.53, + "grad_norm": 3.394458770751953, + "learning_rate": 9.358611700647953e-06, + "loss": 1.3295, + "step": 107600 + }, + { + "epoch": 0.53, + "grad_norm": 2.4304399490356445, + "learning_rate": 9.357986565853263e-06, + "loss": 1.3473, + "step": 107700 + }, + { + "epoch": 0.53, + "grad_norm": 2.8694140911102295, + "learning_rate": 9.357361431058573e-06, + "loss": 1.3052, + "step": 107800 + }, + { + "epoch": 0.53, + "grad_norm": 2.8801755905151367, + "learning_rate": 9.356736296263883e-06, + "loss": 1.3495, + "step": 107900 + }, + { + "epoch": 0.53, + "grad_norm": 2.713139057159424, + "learning_rate": 9.356111161469193e-06, + "loss": 1.3552, + "step": 108000 + }, + { + "epoch": 0.53, + "grad_norm": 3.1576766967773438, + "learning_rate": 9.355486026674503e-06, + "loss": 1.3425, + "step": 108100 + }, + { + "epoch": 0.54, + "grad_norm": 3.07737398147583, + "learning_rate": 9.354860891879813e-06, + "loss": 1.2991, + "step": 108200 + }, + { + "epoch": 0.54, + "grad_norm": 3.1460793018341064, + "learning_rate": 9.354235757085123e-06, + "loss": 1.3528, + "step": 108300 + }, + { + "epoch": 0.54, + "grad_norm": 3.4123237133026123, + "learning_rate": 9.353610622290433e-06, + "loss": 1.3549, + "step": 108400 + }, + { + "epoch": 0.54, + "grad_norm": 4.295971870422363, + "learning_rate": 9.352985487495741e-06, + "loss": 1.32, + "step": 108500 + }, + { + "epoch": 0.54, + "grad_norm": 7.587291240692139, + "learning_rate": 9.352360352701053e-06, + "loss": 1.3205, + "step": 108600 + }, + { + "epoch": 0.54, + "grad_norm": 2.6797661781311035, + "learning_rate": 9.35173521790636e-06, + "loss": 1.3383, + "step": 108700 + }, + { + "epoch": 0.54, + "grad_norm": 2.9257397651672363, + "learning_rate": 9.351110083111672e-06, + "loss": 1.343, + "step": 108800 + }, + { + "epoch": 0.54, + "grad_norm": 2.9473204612731934, + "learning_rate": 9.35048494831698e-06, + "loss": 1.3423, + "step": 108900 + }, + { + "epoch": 0.54, + "grad_norm": 3.4466347694396973, + "learning_rate": 9.349859813522292e-06, + "loss": 1.3261, + "step": 109000 + }, + { + "epoch": 0.54, + "grad_norm": 3.173017978668213, + "learning_rate": 9.3492346787276e-06, + "loss": 1.3222, + "step": 109100 + }, + { + "epoch": 0.54, + "grad_norm": 3.7112643718719482, + "learning_rate": 9.348609543932912e-06, + "loss": 1.3528, + "step": 109200 + }, + { + "epoch": 0.54, + "grad_norm": 2.747380495071411, + "learning_rate": 9.34798440913822e-06, + "loss": 1.343, + "step": 109300 + }, + { + "epoch": 0.54, + "grad_norm": 2.6822452545166016, + "learning_rate": 9.347359274343532e-06, + "loss": 1.3478, + "step": 109400 + }, + { + "epoch": 0.54, + "grad_norm": 3.6231350898742676, + "learning_rate": 9.34673413954884e-06, + "loss": 1.3569, + "step": 109500 + }, + { + "epoch": 0.54, + "grad_norm": 2.985164165496826, + "learning_rate": 9.346109004754152e-06, + "loss": 1.358, + "step": 109600 + }, + { + "epoch": 0.54, + "grad_norm": 3.304058790206909, + "learning_rate": 9.34548386995946e-06, + "loss": 1.3312, + "step": 109700 + }, + { + "epoch": 0.54, + "grad_norm": 3.1674203872680664, + "learning_rate": 9.34485873516477e-06, + "loss": 1.3499, + "step": 109800 + }, + { + "epoch": 0.54, + "grad_norm": 3.0080056190490723, + "learning_rate": 9.34423360037008e-06, + "loss": 1.3268, + "step": 109900 + }, + { + "epoch": 0.54, + "grad_norm": 3.8802080154418945, + "learning_rate": 9.34360846557539e-06, + "loss": 1.3494, + "step": 110000 + }, + { + "epoch": 0.54, + "grad_norm": 3.842288017272949, + "learning_rate": 9.3429833307807e-06, + "loss": 1.3368, + "step": 110100 + }, + { + "epoch": 0.55, + "grad_norm": 3.315469980239868, + "learning_rate": 9.34235819598601e-06, + "loss": 1.3744, + "step": 110200 + }, + { + "epoch": 0.55, + "grad_norm": 5.507584095001221, + "learning_rate": 9.34173306119132e-06, + "loss": 1.3635, + "step": 110300 + }, + { + "epoch": 0.55, + "grad_norm": 3.494532585144043, + "learning_rate": 9.34110792639663e-06, + "loss": 1.3553, + "step": 110400 + }, + { + "epoch": 0.55, + "grad_norm": 2.602483034133911, + "learning_rate": 9.34048279160194e-06, + "loss": 1.3783, + "step": 110500 + }, + { + "epoch": 0.55, + "grad_norm": 4.201229572296143, + "learning_rate": 9.33985765680725e-06, + "loss": 1.3466, + "step": 110600 + }, + { + "epoch": 0.55, + "grad_norm": 3.809846878051758, + "learning_rate": 9.339232522012561e-06, + "loss": 1.3274, + "step": 110700 + }, + { + "epoch": 0.55, + "grad_norm": 2.964759588241577, + "learning_rate": 9.33860738721787e-06, + "loss": 1.3954, + "step": 110800 + }, + { + "epoch": 0.55, + "grad_norm": 2.925959587097168, + "learning_rate": 9.33798225242318e-06, + "loss": 1.3258, + "step": 110900 + }, + { + "epoch": 0.55, + "grad_norm": 2.654022455215454, + "learning_rate": 9.337357117628489e-06, + "loss": 1.3504, + "step": 111000 + }, + { + "epoch": 0.55, + "grad_norm": 3.296046257019043, + "learning_rate": 9.336731982833799e-06, + "loss": 1.3136, + "step": 111100 + }, + { + "epoch": 0.55, + "grad_norm": 3.804032564163208, + "learning_rate": 9.336106848039109e-06, + "loss": 1.3428, + "step": 111200 + }, + { + "epoch": 0.55, + "grad_norm": 3.254333019256592, + "learning_rate": 9.335481713244419e-06, + "loss": 1.3583, + "step": 111300 + }, + { + "epoch": 0.55, + "grad_norm": 2.7955849170684814, + "learning_rate": 9.334856578449729e-06, + "loss": 1.3294, + "step": 111400 + }, + { + "epoch": 0.55, + "grad_norm": 4.444815635681152, + "learning_rate": 9.334231443655039e-06, + "loss": 1.3727, + "step": 111500 + }, + { + "epoch": 0.55, + "grad_norm": 2.5120747089385986, + "learning_rate": 9.333606308860349e-06, + "loss": 1.3454, + "step": 111600 + }, + { + "epoch": 0.55, + "grad_norm": 3.015342950820923, + "learning_rate": 9.332981174065659e-06, + "loss": 1.319, + "step": 111700 + }, + { + "epoch": 0.55, + "grad_norm": 3.021049976348877, + "learning_rate": 9.332356039270968e-06, + "loss": 1.3498, + "step": 111800 + }, + { + "epoch": 0.55, + "grad_norm": 3.354524850845337, + "learning_rate": 9.331730904476278e-06, + "loss": 1.3552, + "step": 111900 + }, + { + "epoch": 0.55, + "grad_norm": 2.649719476699829, + "learning_rate": 9.331105769681588e-06, + "loss": 1.3276, + "step": 112000 + }, + { + "epoch": 0.55, + "grad_norm": 3.1411306858062744, + "learning_rate": 9.330480634886898e-06, + "loss": 1.3616, + "step": 112100 + }, + { + "epoch": 0.56, + "grad_norm": 3.030653238296509, + "learning_rate": 9.329855500092208e-06, + "loss": 1.3253, + "step": 112200 + }, + { + "epoch": 0.56, + "grad_norm": 2.987105369567871, + "learning_rate": 9.329230365297518e-06, + "loss": 1.3133, + "step": 112300 + }, + { + "epoch": 0.56, + "grad_norm": 3.9337832927703857, + "learning_rate": 9.328605230502828e-06, + "loss": 1.3697, + "step": 112400 + }, + { + "epoch": 0.56, + "grad_norm": 2.6805777549743652, + "learning_rate": 9.327980095708138e-06, + "loss": 1.3689, + "step": 112500 + }, + { + "epoch": 0.56, + "grad_norm": 3.582444667816162, + "learning_rate": 9.327354960913448e-06, + "loss": 1.3746, + "step": 112600 + }, + { + "epoch": 0.56, + "grad_norm": 4.040530204772949, + "learning_rate": 9.326729826118758e-06, + "loss": 1.3484, + "step": 112700 + }, + { + "epoch": 0.56, + "grad_norm": 3.6632535457611084, + "learning_rate": 9.326104691324068e-06, + "loss": 1.3191, + "step": 112800 + }, + { + "epoch": 0.56, + "grad_norm": 2.814882278442383, + "learning_rate": 9.325479556529378e-06, + "loss": 1.3331, + "step": 112900 + }, + { + "epoch": 0.56, + "grad_norm": 3.2695939540863037, + "learning_rate": 9.324854421734688e-06, + "loss": 1.3307, + "step": 113000 + }, + { + "epoch": 0.56, + "grad_norm": 3.6386842727661133, + "learning_rate": 9.324229286939998e-06, + "loss": 1.3464, + "step": 113100 + }, + { + "epoch": 0.56, + "grad_norm": 2.7537262439727783, + "learning_rate": 9.323604152145307e-06, + "loss": 1.3177, + "step": 113200 + }, + { + "epoch": 0.56, + "grad_norm": 3.279010057449341, + "learning_rate": 9.322979017350617e-06, + "loss": 1.353, + "step": 113300 + }, + { + "epoch": 0.56, + "grad_norm": 3.230193614959717, + "learning_rate": 9.322353882555927e-06, + "loss": 1.3301, + "step": 113400 + }, + { + "epoch": 0.56, + "grad_norm": 2.851243257522583, + "learning_rate": 9.321728747761237e-06, + "loss": 1.3535, + "step": 113500 + }, + { + "epoch": 0.56, + "grad_norm": 2.865309000015259, + "learning_rate": 9.321103612966547e-06, + "loss": 1.3836, + "step": 113600 + }, + { + "epoch": 0.56, + "grad_norm": 3.171292781829834, + "learning_rate": 9.320478478171855e-06, + "loss": 1.3426, + "step": 113700 + }, + { + "epoch": 0.56, + "grad_norm": 3.774327278137207, + "learning_rate": 9.319853343377167e-06, + "loss": 1.3376, + "step": 113800 + }, + { + "epoch": 0.56, + "grad_norm": 3.775113344192505, + "learning_rate": 9.319228208582475e-06, + "loss": 1.3253, + "step": 113900 + }, + { + "epoch": 0.56, + "grad_norm": 3.0207529067993164, + "learning_rate": 9.318603073787787e-06, + "loss": 1.3176, + "step": 114000 + }, + { + "epoch": 0.56, + "grad_norm": 3.2777695655822754, + "learning_rate": 9.317977938993095e-06, + "loss": 1.3699, + "step": 114100 + }, + { + "epoch": 0.56, + "grad_norm": 3.0100061893463135, + "learning_rate": 9.317352804198407e-06, + "loss": 1.3158, + "step": 114200 + }, + { + "epoch": 0.57, + "grad_norm": 3.428809881210327, + "learning_rate": 9.316727669403715e-06, + "loss": 1.3165, + "step": 114300 + }, + { + "epoch": 0.57, + "grad_norm": 2.833083391189575, + "learning_rate": 9.316102534609027e-06, + "loss": 1.3487, + "step": 114400 + }, + { + "epoch": 0.57, + "grad_norm": 2.81231951713562, + "learning_rate": 9.315477399814335e-06, + "loss": 1.327, + "step": 114500 + }, + { + "epoch": 0.57, + "grad_norm": 2.9721994400024414, + "learning_rate": 9.314852265019646e-06, + "loss": 1.3189, + "step": 114600 + }, + { + "epoch": 0.57, + "grad_norm": 2.969564437866211, + "learning_rate": 9.314227130224955e-06, + "loss": 1.3487, + "step": 114700 + }, + { + "epoch": 0.57, + "grad_norm": 3.1244125366210938, + "learning_rate": 9.313601995430266e-06, + "loss": 1.3391, + "step": 114800 + }, + { + "epoch": 0.57, + "grad_norm": 3.785893201828003, + "learning_rate": 9.312976860635575e-06, + "loss": 1.3017, + "step": 114900 + }, + { + "epoch": 0.57, + "grad_norm": 2.9908628463745117, + "learning_rate": 9.312351725840884e-06, + "loss": 1.3004, + "step": 115000 + }, + { + "epoch": 0.57, + "grad_norm": 2.7877655029296875, + "learning_rate": 9.311726591046194e-06, + "loss": 1.3345, + "step": 115100 + }, + { + "epoch": 0.57, + "grad_norm": 2.73725962638855, + "learning_rate": 9.311101456251504e-06, + "loss": 1.3164, + "step": 115200 + }, + { + "epoch": 0.57, + "grad_norm": 2.6744511127471924, + "learning_rate": 9.310476321456814e-06, + "loss": 1.3362, + "step": 115300 + }, + { + "epoch": 0.57, + "grad_norm": 2.9426522254943848, + "learning_rate": 9.309851186662124e-06, + "loss": 1.3111, + "step": 115400 + }, + { + "epoch": 0.57, + "grad_norm": 3.818319797515869, + "learning_rate": 9.309226051867434e-06, + "loss": 1.3193, + "step": 115500 + }, + { + "epoch": 0.57, + "grad_norm": 3.0991666316986084, + "learning_rate": 9.308600917072744e-06, + "loss": 1.3144, + "step": 115600 + }, + { + "epoch": 0.57, + "grad_norm": 3.6219863891601562, + "learning_rate": 9.307975782278054e-06, + "loss": 1.3397, + "step": 115700 + }, + { + "epoch": 0.57, + "grad_norm": 3.1432971954345703, + "learning_rate": 9.307350647483364e-06, + "loss": 1.3259, + "step": 115800 + }, + { + "epoch": 0.57, + "grad_norm": 4.15132999420166, + "learning_rate": 9.306725512688675e-06, + "loss": 1.3362, + "step": 115900 + }, + { + "epoch": 0.57, + "grad_norm": 4.191103935241699, + "learning_rate": 9.306100377893984e-06, + "loss": 1.3117, + "step": 116000 + }, + { + "epoch": 0.57, + "grad_norm": 3.6365630626678467, + "learning_rate": 9.305475243099295e-06, + "loss": 1.3397, + "step": 116100 + }, + { + "epoch": 0.57, + "grad_norm": 2.899077892303467, + "learning_rate": 9.304850108304604e-06, + "loss": 1.3434, + "step": 116200 + }, + { + "epoch": 0.58, + "grad_norm": 3.006063461303711, + "learning_rate": 9.304224973509915e-06, + "loss": 1.3148, + "step": 116300 + }, + { + "epoch": 0.58, + "grad_norm": 3.2103986740112305, + "learning_rate": 9.303599838715223e-06, + "loss": 1.3313, + "step": 116400 + }, + { + "epoch": 0.58, + "grad_norm": 2.6371185779571533, + "learning_rate": 9.302974703920533e-06, + "loss": 1.3219, + "step": 116500 + }, + { + "epoch": 0.58, + "grad_norm": 3.5810282230377197, + "learning_rate": 9.302349569125843e-06, + "loss": 1.3199, + "step": 116600 + }, + { + "epoch": 0.58, + "grad_norm": 3.7903432846069336, + "learning_rate": 9.301724434331153e-06, + "loss": 1.3604, + "step": 116700 + }, + { + "epoch": 0.58, + "grad_norm": 3.0497376918792725, + "learning_rate": 9.301099299536463e-06, + "loss": 1.3517, + "step": 116800 + }, + { + "epoch": 0.58, + "grad_norm": 3.0921273231506348, + "learning_rate": 9.300474164741773e-06, + "loss": 1.3192, + "step": 116900 + }, + { + "epoch": 0.58, + "grad_norm": 4.081624507904053, + "learning_rate": 9.299849029947083e-06, + "loss": 1.3376, + "step": 117000 + }, + { + "epoch": 0.58, + "grad_norm": 3.9647045135498047, + "learning_rate": 9.299223895152393e-06, + "loss": 1.3501, + "step": 117100 + }, + { + "epoch": 0.58, + "grad_norm": 3.3014702796936035, + "learning_rate": 9.298598760357703e-06, + "loss": 1.3467, + "step": 117200 + }, + { + "epoch": 0.58, + "grad_norm": 4.240477085113525, + "learning_rate": 9.297973625563013e-06, + "loss": 1.3094, + "step": 117300 + }, + { + "epoch": 0.58, + "grad_norm": 3.317046642303467, + "learning_rate": 9.297348490768323e-06, + "loss": 1.3586, + "step": 117400 + }, + { + "epoch": 0.58, + "grad_norm": 2.8633594512939453, + "learning_rate": 9.296723355973633e-06, + "loss": 1.3272, + "step": 117500 + }, + { + "epoch": 0.58, + "grad_norm": 3.435241937637329, + "learning_rate": 9.296098221178943e-06, + "loss": 1.3683, + "step": 117600 + }, + { + "epoch": 0.58, + "grad_norm": 2.955159902572632, + "learning_rate": 9.295473086384252e-06, + "loss": 1.35, + "step": 117700 + }, + { + "epoch": 0.58, + "grad_norm": 2.941067695617676, + "learning_rate": 9.294847951589562e-06, + "loss": 1.3438, + "step": 117800 + }, + { + "epoch": 0.58, + "grad_norm": 4.773413181304932, + "learning_rate": 9.294222816794872e-06, + "loss": 1.3167, + "step": 117900 + }, + { + "epoch": 0.58, + "grad_norm": 2.976818084716797, + "learning_rate": 9.293597682000182e-06, + "loss": 1.3415, + "step": 118000 + }, + { + "epoch": 0.58, + "grad_norm": 2.769272804260254, + "learning_rate": 9.292972547205492e-06, + "loss": 1.3596, + "step": 118100 + }, + { + "epoch": 0.58, + "grad_norm": 3.0079257488250732, + "learning_rate": 9.292347412410802e-06, + "loss": 1.3479, + "step": 118200 + }, + { + "epoch": 0.59, + "grad_norm": 6.148379802703857, + "learning_rate": 9.291722277616112e-06, + "loss": 1.3324, + "step": 118300 + }, + { + "epoch": 0.59, + "grad_norm": 3.0416197776794434, + "learning_rate": 9.291097142821422e-06, + "loss": 1.307, + "step": 118400 + }, + { + "epoch": 0.59, + "grad_norm": 2.869318962097168, + "learning_rate": 9.290472008026732e-06, + "loss": 1.3217, + "step": 118500 + }, + { + "epoch": 0.59, + "grad_norm": 3.5434398651123047, + "learning_rate": 9.289846873232042e-06, + "loss": 1.3356, + "step": 118600 + }, + { + "epoch": 0.59, + "grad_norm": 3.5270133018493652, + "learning_rate": 9.289221738437352e-06, + "loss": 1.3206, + "step": 118700 + }, + { + "epoch": 0.59, + "grad_norm": 5.0058369636535645, + "learning_rate": 9.288596603642662e-06, + "loss": 1.3386, + "step": 118800 + }, + { + "epoch": 0.59, + "grad_norm": 3.7998316287994385, + "learning_rate": 9.287971468847972e-06, + "loss": 1.2882, + "step": 118900 + }, + { + "epoch": 0.59, + "grad_norm": 4.403027534484863, + "learning_rate": 9.287346334053281e-06, + "loss": 1.3318, + "step": 119000 + }, + { + "epoch": 0.59, + "grad_norm": 3.3011553287506104, + "learning_rate": 9.28672119925859e-06, + "loss": 1.3395, + "step": 119100 + }, + { + "epoch": 0.59, + "grad_norm": 3.5392231941223145, + "learning_rate": 9.286096064463901e-06, + "loss": 1.3309, + "step": 119200 + }, + { + "epoch": 0.59, + "grad_norm": 3.0157182216644287, + "learning_rate": 9.28547092966921e-06, + "loss": 1.3212, + "step": 119300 + }, + { + "epoch": 0.59, + "grad_norm": 3.292978048324585, + "learning_rate": 9.284845794874521e-06, + "loss": 1.3419, + "step": 119400 + }, + { + "epoch": 0.59, + "grad_norm": 3.0125534534454346, + "learning_rate": 9.28422066007983e-06, + "loss": 1.3237, + "step": 119500 + }, + { + "epoch": 0.59, + "grad_norm": 3.421067476272583, + "learning_rate": 9.283595525285141e-06, + "loss": 1.343, + "step": 119600 + }, + { + "epoch": 0.59, + "grad_norm": 3.0257959365844727, + "learning_rate": 9.28297039049045e-06, + "loss": 1.3339, + "step": 119700 + }, + { + "epoch": 0.59, + "grad_norm": 3.5511181354522705, + "learning_rate": 9.282345255695761e-06, + "loss": 1.2891, + "step": 119800 + }, + { + "epoch": 0.59, + "grad_norm": 3.5419836044311523, + "learning_rate": 9.281720120901069e-06, + "loss": 1.3189, + "step": 119900 + }, + { + "epoch": 0.59, + "grad_norm": 2.700242757797241, + "learning_rate": 9.28109498610638e-06, + "loss": 1.3302, + "step": 120000 + } + ], + "logging_steps": 100, + "max_steps": 1604655, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 10000, + "total_flos": 2.4386875134941594e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}