diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.988280338049213, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014941401690246065, + "grad_norm": 2.2013847827911377, + "learning_rate": 4.9975e-05, + "loss": 6.2498, + "step": 1 + }, + { + "epoch": 0.002988280338049213, + "grad_norm": 2.3608126640319824, + "learning_rate": 4.995e-05, + "loss": 6.4856, + "step": 2 + }, + { + "epoch": 0.0044824205070738195, + "grad_norm": 2.7555134296417236, + "learning_rate": 4.992500000000001e-05, + "loss": 6.5519, + "step": 3 + }, + { + "epoch": 0.005976560676098426, + "grad_norm": 2.7937450408935547, + "learning_rate": 4.99e-05, + "loss": 6.6085, + "step": 4 + }, + { + "epoch": 0.0074707008451230334, + "grad_norm": 3.405029058456421, + "learning_rate": 4.9875000000000006e-05, + "loss": 6.4712, + "step": 5 + }, + { + "epoch": 0.008964841014147639, + "grad_norm": 2.5555474758148193, + "learning_rate": 4.9850000000000006e-05, + "loss": 6.7113, + "step": 6 + }, + { + "epoch": 0.010458981183172246, + "grad_norm": 3.9536023139953613, + "learning_rate": 4.9825000000000005e-05, + "loss": 6.3192, + "step": 7 + }, + { + "epoch": 0.011953121352196852, + "grad_norm": 3.3245086669921875, + "learning_rate": 4.9800000000000004e-05, + "loss": 6.6072, + "step": 8 + }, + { + "epoch": 0.01344726152122146, + "grad_norm": 3.1858065128326416, + "learning_rate": 4.9775000000000004e-05, + "loss": 6.3415, + "step": 9 + }, + { + "epoch": 0.014941401690246067, + "grad_norm": 3.4829370975494385, + "learning_rate": 4.975e-05, + "loss": 6.5856, + "step": 10 + }, + { + "epoch": 0.016435541859270673, + "grad_norm": 3.615802526473999, + "learning_rate": 4.9725e-05, + "loss": 6.0762, + "step": 11 + }, + { + "epoch": 0.017929682028295278, + "grad_norm": 3.8015692234039307, + "learning_rate": 4.97e-05, + "loss": 6.2116, + "step": 12 + }, + { + "epoch": 0.019423822197319886, + "grad_norm": 3.587550163269043, + "learning_rate": 4.967500000000001e-05, + "loss": 6.1468, + "step": 13 + }, + { + "epoch": 0.02091796236634449, + "grad_norm": 3.3607726097106934, + "learning_rate": 4.965e-05, + "loss": 6.063, + "step": 14 + }, + { + "epoch": 0.0224121025353691, + "grad_norm": 2.8799898624420166, + "learning_rate": 4.962500000000001e-05, + "loss": 5.7877, + "step": 15 + }, + { + "epoch": 0.023906242704393704, + "grad_norm": 3.8595519065856934, + "learning_rate": 4.96e-05, + "loss": 5.6556, + "step": 16 + }, + { + "epoch": 0.025400382873418312, + "grad_norm": 3.544861316680908, + "learning_rate": 4.9575000000000006e-05, + "loss": 6.2185, + "step": 17 + }, + { + "epoch": 0.02689452304244292, + "grad_norm": 3.7592461109161377, + "learning_rate": 4.9550000000000005e-05, + "loss": 5.6731, + "step": 18 + }, + { + "epoch": 0.028388663211467526, + "grad_norm": 4.296304225921631, + "learning_rate": 4.9525000000000004e-05, + "loss": 5.7575, + "step": 19 + }, + { + "epoch": 0.029882803380492134, + "grad_norm": 3.575627088546753, + "learning_rate": 4.9500000000000004e-05, + "loss": 5.6663, + "step": 20 + }, + { + "epoch": 0.03137694354951674, + "grad_norm": 3.7745871543884277, + "learning_rate": 4.9475e-05, + "loss": 5.4617, + "step": 21 + }, + { + "epoch": 0.03287108371854135, + "grad_norm": 3.608196258544922, + "learning_rate": 4.945e-05, + "loss": 5.5331, + "step": 22 + }, + { + "epoch": 0.03436522388756595, + "grad_norm": 3.33453106880188, + "learning_rate": 4.9425e-05, + "loss": 5.3907, + "step": 23 + }, + { + "epoch": 0.035859364056590556, + "grad_norm": 4.210156440734863, + "learning_rate": 4.94e-05, + "loss": 5.1661, + "step": 24 + }, + { + "epoch": 0.03735350422561517, + "grad_norm": 3.6300511360168457, + "learning_rate": 4.937500000000001e-05, + "loss": 5.0496, + "step": 25 + }, + { + "epoch": 0.03884764439463977, + "grad_norm": 4.4206671714782715, + "learning_rate": 4.935e-05, + "loss": 4.7727, + "step": 26 + }, + { + "epoch": 0.04034178456366438, + "grad_norm": 3.644752025604248, + "learning_rate": 4.9325000000000006e-05, + "loss": 4.832, + "step": 27 + }, + { + "epoch": 0.04183592473268898, + "grad_norm": 3.7340240478515625, + "learning_rate": 4.93e-05, + "loss": 4.9109, + "step": 28 + }, + { + "epoch": 0.043330064901713594, + "grad_norm": 3.7647705078125, + "learning_rate": 4.9275000000000005e-05, + "loss": 4.7356, + "step": 29 + }, + { + "epoch": 0.0448242050707382, + "grad_norm": 3.2214791774749756, + "learning_rate": 4.9250000000000004e-05, + "loss": 4.3419, + "step": 30 + }, + { + "epoch": 0.046318345239762804, + "grad_norm": 3.0918350219726562, + "learning_rate": 4.9225000000000004e-05, + "loss": 4.8978, + "step": 31 + }, + { + "epoch": 0.04781248540878741, + "grad_norm": 3.302729368209839, + "learning_rate": 4.92e-05, + "loss": 4.6097, + "step": 32 + }, + { + "epoch": 0.04930662557781202, + "grad_norm": 3.2680063247680664, + "learning_rate": 4.9175e-05, + "loss": 4.8161, + "step": 33 + }, + { + "epoch": 0.050800765746836625, + "grad_norm": 2.8400955200195312, + "learning_rate": 4.915e-05, + "loss": 4.236, + "step": 34 + }, + { + "epoch": 0.05229490591586123, + "grad_norm": 3.210604429244995, + "learning_rate": 4.9125e-05, + "loss": 4.4979, + "step": 35 + }, + { + "epoch": 0.05378904608488584, + "grad_norm": 2.803765058517456, + "learning_rate": 4.91e-05, + "loss": 4.0503, + "step": 36 + }, + { + "epoch": 0.055283186253910446, + "grad_norm": 3.3603885173797607, + "learning_rate": 4.907500000000001e-05, + "loss": 3.8568, + "step": 37 + }, + { + "epoch": 0.05677732642293505, + "grad_norm": 2.6471378803253174, + "learning_rate": 4.905e-05, + "loss": 4.2472, + "step": 38 + }, + { + "epoch": 0.058271466591959656, + "grad_norm": 3.159874439239502, + "learning_rate": 4.9025000000000006e-05, + "loss": 4.2832, + "step": 39 + }, + { + "epoch": 0.05976560676098427, + "grad_norm": 2.405097484588623, + "learning_rate": 4.9e-05, + "loss": 3.7952, + "step": 40 + }, + { + "epoch": 0.06125974693000887, + "grad_norm": 3.4419732093811035, + "learning_rate": 4.8975000000000005e-05, + "loss": 3.8268, + "step": 41 + }, + { + "epoch": 0.06275388709903348, + "grad_norm": 3.0664589405059814, + "learning_rate": 4.8950000000000004e-05, + "loss": 3.8268, + "step": 42 + }, + { + "epoch": 0.06424802726805809, + "grad_norm": 2.7936577796936035, + "learning_rate": 4.8925e-05, + "loss": 3.8516, + "step": 43 + }, + { + "epoch": 0.0657421674370827, + "grad_norm": 2.6919713020324707, + "learning_rate": 4.89e-05, + "loss": 3.5816, + "step": 44 + }, + { + "epoch": 0.0672363076061073, + "grad_norm": 2.2412493228912354, + "learning_rate": 4.8875e-05, + "loss": 3.7505, + "step": 45 + }, + { + "epoch": 0.0687304477751319, + "grad_norm": 2.645268440246582, + "learning_rate": 4.885e-05, + "loss": 3.4058, + "step": 46 + }, + { + "epoch": 0.07022458794415651, + "grad_norm": 2.3000919818878174, + "learning_rate": 4.8825e-05, + "loss": 3.328, + "step": 47 + }, + { + "epoch": 0.07171872811318111, + "grad_norm": 2.3414597511291504, + "learning_rate": 4.88e-05, + "loss": 3.246, + "step": 48 + }, + { + "epoch": 0.07321286828220572, + "grad_norm": 2.2140865325927734, + "learning_rate": 4.8775000000000007e-05, + "loss": 3.2796, + "step": 49 + }, + { + "epoch": 0.07470700845123034, + "grad_norm": 1.9956824779510498, + "learning_rate": 4.875e-05, + "loss": 3.2875, + "step": 50 + }, + { + "epoch": 0.07620114862025494, + "grad_norm": 1.9508098363876343, + "learning_rate": 4.8725000000000005e-05, + "loss": 3.0613, + "step": 51 + }, + { + "epoch": 0.07769528878927955, + "grad_norm": 1.8123174905776978, + "learning_rate": 4.87e-05, + "loss": 3.2806, + "step": 52 + }, + { + "epoch": 0.07918942895830415, + "grad_norm": 2.2940430641174316, + "learning_rate": 4.8675000000000004e-05, + "loss": 3.2443, + "step": 53 + }, + { + "epoch": 0.08068356912732876, + "grad_norm": 1.803460955619812, + "learning_rate": 4.8650000000000003e-05, + "loss": 3.17, + "step": 54 + }, + { + "epoch": 0.08217770929635336, + "grad_norm": 1.5010923147201538, + "learning_rate": 4.8625e-05, + "loss": 3.0046, + "step": 55 + }, + { + "epoch": 0.08367184946537796, + "grad_norm": 1.386048436164856, + "learning_rate": 4.86e-05, + "loss": 2.9352, + "step": 56 + }, + { + "epoch": 0.08516598963440258, + "grad_norm": 1.6107743978500366, + "learning_rate": 4.8575e-05, + "loss": 3.0539, + "step": 57 + }, + { + "epoch": 0.08666012980342719, + "grad_norm": 1.336033821105957, + "learning_rate": 4.855e-05, + "loss": 2.9155, + "step": 58 + }, + { + "epoch": 0.0881542699724518, + "grad_norm": 1.5201988220214844, + "learning_rate": 4.8525e-05, + "loss": 2.8984, + "step": 59 + }, + { + "epoch": 0.0896484101414764, + "grad_norm": 1.316045880317688, + "learning_rate": 4.85e-05, + "loss": 2.7086, + "step": 60 + }, + { + "epoch": 0.091142550310501, + "grad_norm": 1.3400336503982544, + "learning_rate": 4.8475000000000006e-05, + "loss": 2.8407, + "step": 61 + }, + { + "epoch": 0.09263669047952561, + "grad_norm": 1.3193714618682861, + "learning_rate": 4.845e-05, + "loss": 2.8589, + "step": 62 + }, + { + "epoch": 0.09413083064855021, + "grad_norm": 1.0813387632369995, + "learning_rate": 4.8425000000000005e-05, + "loss": 2.4711, + "step": 63 + }, + { + "epoch": 0.09562497081757482, + "grad_norm": 1.3248521089553833, + "learning_rate": 4.8400000000000004e-05, + "loss": 2.9868, + "step": 64 + }, + { + "epoch": 0.09711911098659944, + "grad_norm": 1.227431058883667, + "learning_rate": 4.8375000000000004e-05, + "loss": 2.6897, + "step": 65 + }, + { + "epoch": 0.09861325115562404, + "grad_norm": 1.2781943082809448, + "learning_rate": 4.835e-05, + "loss": 2.6999, + "step": 66 + }, + { + "epoch": 0.10010739132464865, + "grad_norm": 1.2900972366333008, + "learning_rate": 4.8325e-05, + "loss": 2.9534, + "step": 67 + }, + { + "epoch": 0.10160153149367325, + "grad_norm": 1.0637670755386353, + "learning_rate": 4.83e-05, + "loss": 2.8384, + "step": 68 + }, + { + "epoch": 0.10309567166269785, + "grad_norm": 1.0107454061508179, + "learning_rate": 4.8275e-05, + "loss": 2.7397, + "step": 69 + }, + { + "epoch": 0.10458981183172246, + "grad_norm": 1.1608697175979614, + "learning_rate": 4.825e-05, + "loss": 2.686, + "step": 70 + }, + { + "epoch": 0.10608395200074706, + "grad_norm": 1.0701357126235962, + "learning_rate": 4.822500000000001e-05, + "loss": 2.6855, + "step": 71 + }, + { + "epoch": 0.10757809216977168, + "grad_norm": 1.022800326347351, + "learning_rate": 4.82e-05, + "loss": 2.5465, + "step": 72 + }, + { + "epoch": 0.10907223233879629, + "grad_norm": 1.0642368793487549, + "learning_rate": 4.8175000000000005e-05, + "loss": 2.6804, + "step": 73 + }, + { + "epoch": 0.11056637250782089, + "grad_norm": 0.907848596572876, + "learning_rate": 4.815e-05, + "loss": 2.4118, + "step": 74 + }, + { + "epoch": 0.1120605126768455, + "grad_norm": 1.2253451347351074, + "learning_rate": 4.8125000000000004e-05, + "loss": 2.6588, + "step": 75 + }, + { + "epoch": 0.1135546528458701, + "grad_norm": 1.0567305088043213, + "learning_rate": 4.8100000000000004e-05, + "loss": 2.615, + "step": 76 + }, + { + "epoch": 0.1150487930148947, + "grad_norm": 1.2600438594818115, + "learning_rate": 4.8075e-05, + "loss": 2.7854, + "step": 77 + }, + { + "epoch": 0.11654293318391931, + "grad_norm": 1.0818730592727661, + "learning_rate": 4.805e-05, + "loss": 2.2592, + "step": 78 + }, + { + "epoch": 0.11803707335294392, + "grad_norm": 1.1989450454711914, + "learning_rate": 4.8025e-05, + "loss": 2.6293, + "step": 79 + }, + { + "epoch": 0.11953121352196854, + "grad_norm": 1.0635451078414917, + "learning_rate": 4.8e-05, + "loss": 2.5789, + "step": 80 + }, + { + "epoch": 0.12102535369099314, + "grad_norm": 1.5961538553237915, + "learning_rate": 4.7975e-05, + "loss": 2.1708, + "step": 81 + }, + { + "epoch": 0.12251949386001774, + "grad_norm": 0.886963963508606, + "learning_rate": 4.795e-05, + "loss": 2.3345, + "step": 82 + }, + { + "epoch": 0.12401363402904235, + "grad_norm": 1.1009526252746582, + "learning_rate": 4.7925000000000006e-05, + "loss": 2.3957, + "step": 83 + }, + { + "epoch": 0.12550777419806697, + "grad_norm": 1.0787116289138794, + "learning_rate": 4.79e-05, + "loss": 2.4759, + "step": 84 + }, + { + "epoch": 0.12700191436709157, + "grad_norm": 0.9887699484825134, + "learning_rate": 4.7875000000000005e-05, + "loss": 2.4005, + "step": 85 + }, + { + "epoch": 0.12849605453611618, + "grad_norm": 0.9390841722488403, + "learning_rate": 4.785e-05, + "loss": 2.2801, + "step": 86 + }, + { + "epoch": 0.12999019470514078, + "grad_norm": 0.9958310127258301, + "learning_rate": 4.7825000000000004e-05, + "loss": 2.4892, + "step": 87 + }, + { + "epoch": 0.1314843348741654, + "grad_norm": 0.8534329533576965, + "learning_rate": 4.78e-05, + "loss": 1.9908, + "step": 88 + }, + { + "epoch": 0.13297847504319, + "grad_norm": 0.8332351446151733, + "learning_rate": 4.7775e-05, + "loss": 2.2827, + "step": 89 + }, + { + "epoch": 0.1344726152122146, + "grad_norm": 0.845981776714325, + "learning_rate": 4.775e-05, + "loss": 2.4051, + "step": 90 + }, + { + "epoch": 0.1359667553812392, + "grad_norm": 0.9278237223625183, + "learning_rate": 4.7725e-05, + "loss": 2.3518, + "step": 91 + }, + { + "epoch": 0.1374608955502638, + "grad_norm": 1.276440143585205, + "learning_rate": 4.77e-05, + "loss": 2.7287, + "step": 92 + }, + { + "epoch": 0.1389550357192884, + "grad_norm": 1.0569902658462524, + "learning_rate": 4.7675e-05, + "loss": 2.5747, + "step": 93 + }, + { + "epoch": 0.14044917588831302, + "grad_norm": 0.9122519493103027, + "learning_rate": 4.765e-05, + "loss": 2.186, + "step": 94 + }, + { + "epoch": 0.14194331605733762, + "grad_norm": 0.751939594745636, + "learning_rate": 4.7625000000000006e-05, + "loss": 2.3423, + "step": 95 + }, + { + "epoch": 0.14343745622636223, + "grad_norm": 0.95006263256073, + "learning_rate": 4.76e-05, + "loss": 2.5864, + "step": 96 + }, + { + "epoch": 0.14493159639538683, + "grad_norm": 0.9910762906074524, + "learning_rate": 4.7575000000000004e-05, + "loss": 2.312, + "step": 97 + }, + { + "epoch": 0.14642573656441144, + "grad_norm": 1.133583664894104, + "learning_rate": 4.755e-05, + "loss": 2.4763, + "step": 98 + }, + { + "epoch": 0.14791987673343607, + "grad_norm": 1.2710939645767212, + "learning_rate": 4.7525e-05, + "loss": 2.605, + "step": 99 + }, + { + "epoch": 0.14941401690246067, + "grad_norm": 1.0130882263183594, + "learning_rate": 4.75e-05, + "loss": 2.416, + "step": 100 + }, + { + "epoch": 0.15090815707148528, + "grad_norm": 1.4635257720947266, + "learning_rate": 4.7475e-05, + "loss": 2.6169, + "step": 101 + }, + { + "epoch": 0.15240229724050988, + "grad_norm": 0.9598097801208496, + "learning_rate": 4.745e-05, + "loss": 2.4536, + "step": 102 + }, + { + "epoch": 0.1538964374095345, + "grad_norm": 0.8063713908195496, + "learning_rate": 4.7425e-05, + "loss": 2.2044, + "step": 103 + }, + { + "epoch": 0.1553905775785591, + "grad_norm": 0.9389742612838745, + "learning_rate": 4.74e-05, + "loss": 2.2766, + "step": 104 + }, + { + "epoch": 0.1568847177475837, + "grad_norm": 0.7887735962867737, + "learning_rate": 4.7375e-05, + "loss": 1.8076, + "step": 105 + }, + { + "epoch": 0.1583788579166083, + "grad_norm": 0.8454030156135559, + "learning_rate": 4.735e-05, + "loss": 2.3198, + "step": 106 + }, + { + "epoch": 0.1598729980856329, + "grad_norm": 1.3265002965927124, + "learning_rate": 4.7325000000000005e-05, + "loss": 2.1676, + "step": 107 + }, + { + "epoch": 0.1613671382546575, + "grad_norm": 0.8800100088119507, + "learning_rate": 4.73e-05, + "loss": 2.0926, + "step": 108 + }, + { + "epoch": 0.16286127842368212, + "grad_norm": 0.8939815163612366, + "learning_rate": 4.7275000000000004e-05, + "loss": 2.1965, + "step": 109 + }, + { + "epoch": 0.16435541859270672, + "grad_norm": 0.9632463455200195, + "learning_rate": 4.7249999999999997e-05, + "loss": 2.1399, + "step": 110 + }, + { + "epoch": 0.16584955876173132, + "grad_norm": 1.092432975769043, + "learning_rate": 4.7225e-05, + "loss": 2.2975, + "step": 111 + }, + { + "epoch": 0.16734369893075593, + "grad_norm": 0.9222836494445801, + "learning_rate": 4.72e-05, + "loss": 2.3583, + "step": 112 + }, + { + "epoch": 0.16883783909978053, + "grad_norm": 0.9905472993850708, + "learning_rate": 4.7175e-05, + "loss": 2.0784, + "step": 113 + }, + { + "epoch": 0.17033197926880517, + "grad_norm": 0.6727510094642639, + "learning_rate": 4.715e-05, + "loss": 1.9479, + "step": 114 + }, + { + "epoch": 0.17182611943782977, + "grad_norm": 1.0887380838394165, + "learning_rate": 4.7125e-05, + "loss": 2.3239, + "step": 115 + }, + { + "epoch": 0.17332025960685438, + "grad_norm": 0.7983910441398621, + "learning_rate": 4.71e-05, + "loss": 2.1595, + "step": 116 + }, + { + "epoch": 0.17481439977587898, + "grad_norm": 0.8539843559265137, + "learning_rate": 4.7075e-05, + "loss": 1.9513, + "step": 117 + }, + { + "epoch": 0.1763085399449036, + "grad_norm": 0.8280434608459473, + "learning_rate": 4.705e-05, + "loss": 2.0896, + "step": 118 + }, + { + "epoch": 0.1778026801139282, + "grad_norm": 0.9694619178771973, + "learning_rate": 4.7025000000000005e-05, + "loss": 2.3539, + "step": 119 + }, + { + "epoch": 0.1792968202829528, + "grad_norm": 1.0440824031829834, + "learning_rate": 4.7e-05, + "loss": 1.7679, + "step": 120 + }, + { + "epoch": 0.1807909604519774, + "grad_norm": 0.8723125457763672, + "learning_rate": 4.6975000000000003e-05, + "loss": 2.0644, + "step": 121 + }, + { + "epoch": 0.182285100621002, + "grad_norm": 0.726588249206543, + "learning_rate": 4.695e-05, + "loss": 1.9727, + "step": 122 + }, + { + "epoch": 0.1837792407900266, + "grad_norm": 0.6932450532913208, + "learning_rate": 4.6925e-05, + "loss": 1.8967, + "step": 123 + }, + { + "epoch": 0.18527338095905121, + "grad_norm": 0.6471284627914429, + "learning_rate": 4.69e-05, + "loss": 1.8208, + "step": 124 + }, + { + "epoch": 0.18676752112807582, + "grad_norm": 0.6124398708343506, + "learning_rate": 4.6875e-05, + "loss": 1.9418, + "step": 125 + }, + { + "epoch": 0.18826166129710042, + "grad_norm": 0.705912709236145, + "learning_rate": 4.685000000000001e-05, + "loss": 1.9967, + "step": 126 + }, + { + "epoch": 0.18975580146612503, + "grad_norm": 0.8094897866249084, + "learning_rate": 4.6825e-05, + "loss": 1.9594, + "step": 127 + }, + { + "epoch": 0.19124994163514963, + "grad_norm": 0.5901124477386475, + "learning_rate": 4.6800000000000006e-05, + "loss": 2.0488, + "step": 128 + }, + { + "epoch": 0.19274408180417427, + "grad_norm": 0.7298286557197571, + "learning_rate": 4.6775000000000005e-05, + "loss": 2.1777, + "step": 129 + }, + { + "epoch": 0.19423822197319887, + "grad_norm": 0.5055814981460571, + "learning_rate": 4.6750000000000005e-05, + "loss": 1.7352, + "step": 130 + }, + { + "epoch": 0.19573236214222348, + "grad_norm": 0.47730177640914917, + "learning_rate": 4.6725000000000004e-05, + "loss": 1.8427, + "step": 131 + }, + { + "epoch": 0.19722650231124808, + "grad_norm": 0.5291617512702942, + "learning_rate": 4.6700000000000003e-05, + "loss": 1.9703, + "step": 132 + }, + { + "epoch": 0.19872064248027269, + "grad_norm": 0.6130346059799194, + "learning_rate": 4.6675e-05, + "loss": 2.0337, + "step": 133 + }, + { + "epoch": 0.2002147826492973, + "grad_norm": 0.600026547908783, + "learning_rate": 4.665e-05, + "loss": 2.1049, + "step": 134 + }, + { + "epoch": 0.2017089228183219, + "grad_norm": 0.9462055563926697, + "learning_rate": 4.6625e-05, + "loss": 1.8345, + "step": 135 + }, + { + "epoch": 0.2032030629873465, + "grad_norm": 0.6463867425918579, + "learning_rate": 4.660000000000001e-05, + "loss": 1.9301, + "step": 136 + }, + { + "epoch": 0.2046972031563711, + "grad_norm": 0.7604928016662598, + "learning_rate": 4.6575e-05, + "loss": 1.9863, + "step": 137 + }, + { + "epoch": 0.2061913433253957, + "grad_norm": 0.5646475553512573, + "learning_rate": 4.655000000000001e-05, + "loss": 2.0415, + "step": 138 + }, + { + "epoch": 0.20768548349442031, + "grad_norm": 0.5505641102790833, + "learning_rate": 4.6525e-05, + "loss": 1.9946, + "step": 139 + }, + { + "epoch": 0.20917962366344492, + "grad_norm": 0.5963394641876221, + "learning_rate": 4.6500000000000005e-05, + "loss": 1.7218, + "step": 140 + }, + { + "epoch": 0.21067376383246952, + "grad_norm": 1.011972188949585, + "learning_rate": 4.6475000000000005e-05, + "loss": 2.0932, + "step": 141 + }, + { + "epoch": 0.21216790400149413, + "grad_norm": 0.5645290017127991, + "learning_rate": 4.6450000000000004e-05, + "loss": 1.8721, + "step": 142 + }, + { + "epoch": 0.21366204417051873, + "grad_norm": 0.543994128704071, + "learning_rate": 4.6425000000000004e-05, + "loss": 1.8209, + "step": 143 + }, + { + "epoch": 0.21515618433954337, + "grad_norm": 0.6430279612541199, + "learning_rate": 4.64e-05, + "loss": 1.9537, + "step": 144 + }, + { + "epoch": 0.21665032450856797, + "grad_norm": 0.8161587119102478, + "learning_rate": 4.6375e-05, + "loss": 1.9483, + "step": 145 + }, + { + "epoch": 0.21814446467759258, + "grad_norm": 0.597017765045166, + "learning_rate": 4.635e-05, + "loss": 1.9586, + "step": 146 + }, + { + "epoch": 0.21963860484661718, + "grad_norm": 0.6092536449432373, + "learning_rate": 4.6325e-05, + "loss": 1.9523, + "step": 147 + }, + { + "epoch": 0.22113274501564179, + "grad_norm": 0.8086355328559875, + "learning_rate": 4.630000000000001e-05, + "loss": 1.7828, + "step": 148 + }, + { + "epoch": 0.2226268851846664, + "grad_norm": 0.4737461507320404, + "learning_rate": 4.6275e-05, + "loss": 1.936, + "step": 149 + }, + { + "epoch": 0.224121025353691, + "grad_norm": 0.7633687853813171, + "learning_rate": 4.6250000000000006e-05, + "loss": 1.9736, + "step": 150 + }, + { + "epoch": 0.2256151655227156, + "grad_norm": 0.6465319991111755, + "learning_rate": 4.6225e-05, + "loss": 1.6353, + "step": 151 + }, + { + "epoch": 0.2271093056917402, + "grad_norm": 0.7993711233139038, + "learning_rate": 4.6200000000000005e-05, + "loss": 1.8528, + "step": 152 + }, + { + "epoch": 0.2286034458607648, + "grad_norm": 0.5374677777290344, + "learning_rate": 4.6175000000000004e-05, + "loss": 1.6402, + "step": 153 + }, + { + "epoch": 0.2300975860297894, + "grad_norm": 0.8310596942901611, + "learning_rate": 4.6150000000000004e-05, + "loss": 2.1833, + "step": 154 + }, + { + "epoch": 0.23159172619881402, + "grad_norm": 0.4872928261756897, + "learning_rate": 4.6125e-05, + "loss": 1.7698, + "step": 155 + }, + { + "epoch": 0.23308586636783862, + "grad_norm": 0.587377667427063, + "learning_rate": 4.61e-05, + "loss": 2.0211, + "step": 156 + }, + { + "epoch": 0.23458000653686323, + "grad_norm": 0.5460837483406067, + "learning_rate": 4.6075e-05, + "loss": 1.721, + "step": 157 + }, + { + "epoch": 0.23607414670588783, + "grad_norm": 0.7595339417457581, + "learning_rate": 4.605e-05, + "loss": 1.6841, + "step": 158 + }, + { + "epoch": 0.23756828687491247, + "grad_norm": 0.4985441565513611, + "learning_rate": 4.6025e-05, + "loss": 1.8048, + "step": 159 + }, + { + "epoch": 0.23906242704393707, + "grad_norm": 0.8894737958908081, + "learning_rate": 4.600000000000001e-05, + "loss": 2.1808, + "step": 160 + }, + { + "epoch": 0.24055656721296167, + "grad_norm": 0.49112433195114136, + "learning_rate": 4.5975e-05, + "loss": 1.7699, + "step": 161 + }, + { + "epoch": 0.24205070738198628, + "grad_norm": 0.8248782157897949, + "learning_rate": 4.5950000000000006e-05, + "loss": 1.8133, + "step": 162 + }, + { + "epoch": 0.24354484755101088, + "grad_norm": 1.1163716316223145, + "learning_rate": 4.5925e-05, + "loss": 1.755, + "step": 163 + }, + { + "epoch": 0.2450389877200355, + "grad_norm": 0.7373276948928833, + "learning_rate": 4.5900000000000004e-05, + "loss": 1.6897, + "step": 164 + }, + { + "epoch": 0.2465331278890601, + "grad_norm": 0.4406241476535797, + "learning_rate": 4.5875000000000004e-05, + "loss": 1.6829, + "step": 165 + }, + { + "epoch": 0.2480272680580847, + "grad_norm": 0.6179890036582947, + "learning_rate": 4.585e-05, + "loss": 1.9277, + "step": 166 + }, + { + "epoch": 0.2495214082271093, + "grad_norm": 0.600563108921051, + "learning_rate": 4.5825e-05, + "loss": 1.8681, + "step": 167 + }, + { + "epoch": 0.25101554839613394, + "grad_norm": 0.5876383781433105, + "learning_rate": 4.58e-05, + "loss": 1.8027, + "step": 168 + }, + { + "epoch": 0.25250968856515854, + "grad_norm": 0.6557374596595764, + "learning_rate": 4.5775e-05, + "loss": 2.0037, + "step": 169 + }, + { + "epoch": 0.25400382873418315, + "grad_norm": 0.5612519383430481, + "learning_rate": 4.575e-05, + "loss": 1.8295, + "step": 170 + }, + { + "epoch": 0.25549796890320775, + "grad_norm": 0.550281822681427, + "learning_rate": 4.5725e-05, + "loss": 1.7208, + "step": 171 + }, + { + "epoch": 0.25699210907223236, + "grad_norm": 0.5334203243255615, + "learning_rate": 4.5700000000000006e-05, + "loss": 1.7105, + "step": 172 + }, + { + "epoch": 0.25848624924125696, + "grad_norm": 0.5756668448448181, + "learning_rate": 4.5675e-05, + "loss": 1.8567, + "step": 173 + }, + { + "epoch": 0.25998038941028156, + "grad_norm": 0.5913483500480652, + "learning_rate": 4.5650000000000005e-05, + "loss": 1.9032, + "step": 174 + }, + { + "epoch": 0.26147452957930617, + "grad_norm": 0.7093982100486755, + "learning_rate": 4.5625e-05, + "loss": 2.0607, + "step": 175 + }, + { + "epoch": 0.2629686697483308, + "grad_norm": 0.7194991707801819, + "learning_rate": 4.5600000000000004e-05, + "loss": 1.7715, + "step": 176 + }, + { + "epoch": 0.2644628099173554, + "grad_norm": 0.5984293222427368, + "learning_rate": 4.5575e-05, + "loss": 1.8837, + "step": 177 + }, + { + "epoch": 0.26595695008638, + "grad_norm": 0.5316656827926636, + "learning_rate": 4.555e-05, + "loss": 1.8816, + "step": 178 + }, + { + "epoch": 0.2674510902554046, + "grad_norm": 0.41810792684555054, + "learning_rate": 4.5525e-05, + "loss": 1.6823, + "step": 179 + }, + { + "epoch": 0.2689452304244292, + "grad_norm": 0.6792067885398865, + "learning_rate": 4.55e-05, + "loss": 1.9474, + "step": 180 + }, + { + "epoch": 0.2704393705934538, + "grad_norm": 0.758362889289856, + "learning_rate": 4.5475e-05, + "loss": 1.8358, + "step": 181 + }, + { + "epoch": 0.2719335107624784, + "grad_norm": 0.5369607210159302, + "learning_rate": 4.545000000000001e-05, + "loss": 1.8121, + "step": 182 + }, + { + "epoch": 0.273427650931503, + "grad_norm": 0.47773298621177673, + "learning_rate": 4.5425e-05, + "loss": 1.5175, + "step": 183 + }, + { + "epoch": 0.2749217911005276, + "grad_norm": 0.6082251667976379, + "learning_rate": 4.5400000000000006e-05, + "loss": 1.6287, + "step": 184 + }, + { + "epoch": 0.2764159312695522, + "grad_norm": 0.7682433724403381, + "learning_rate": 4.5375e-05, + "loss": 1.8996, + "step": 185 + }, + { + "epoch": 0.2779100714385768, + "grad_norm": 1.259735345840454, + "learning_rate": 4.5350000000000005e-05, + "loss": 1.7165, + "step": 186 + }, + { + "epoch": 0.2794042116076014, + "grad_norm": 0.7863439321517944, + "learning_rate": 4.5325000000000004e-05, + "loss": 1.8585, + "step": 187 + }, + { + "epoch": 0.28089835177662603, + "grad_norm": 0.5054369568824768, + "learning_rate": 4.53e-05, + "loss": 1.9339, + "step": 188 + }, + { + "epoch": 0.28239249194565064, + "grad_norm": 0.5214366912841797, + "learning_rate": 4.5275e-05, + "loss": 1.6759, + "step": 189 + }, + { + "epoch": 0.28388663211467524, + "grad_norm": 0.48575031757354736, + "learning_rate": 4.525e-05, + "loss": 1.897, + "step": 190 + }, + { + "epoch": 0.28538077228369985, + "grad_norm": 0.4781228303909302, + "learning_rate": 4.5225e-05, + "loss": 1.7445, + "step": 191 + }, + { + "epoch": 0.28687491245272445, + "grad_norm": 0.5672656297683716, + "learning_rate": 4.52e-05, + "loss": 1.7473, + "step": 192 + }, + { + "epoch": 0.28836905262174906, + "grad_norm": 1.3074836730957031, + "learning_rate": 4.5175e-05, + "loss": 1.7179, + "step": 193 + }, + { + "epoch": 0.28986319279077366, + "grad_norm": 0.4941707253456116, + "learning_rate": 4.5150000000000006e-05, + "loss": 1.8377, + "step": 194 + }, + { + "epoch": 0.29135733295979827, + "grad_norm": 0.6247252225875854, + "learning_rate": 4.5125e-05, + "loss": 1.5313, + "step": 195 + }, + { + "epoch": 0.29285147312882287, + "grad_norm": 0.6214970946311951, + "learning_rate": 4.5100000000000005e-05, + "loss": 1.8108, + "step": 196 + }, + { + "epoch": 0.29434561329784753, + "grad_norm": 0.589272677898407, + "learning_rate": 4.5075e-05, + "loss": 1.6296, + "step": 197 + }, + { + "epoch": 0.29583975346687214, + "grad_norm": 0.4383275508880615, + "learning_rate": 4.5050000000000004e-05, + "loss": 1.7754, + "step": 198 + }, + { + "epoch": 0.29733389363589674, + "grad_norm": 0.4272954761981964, + "learning_rate": 4.5025000000000003e-05, + "loss": 1.6274, + "step": 199 + }, + { + "epoch": 0.29882803380492134, + "grad_norm": 0.714189887046814, + "learning_rate": 4.5e-05, + "loss": 1.8001, + "step": 200 + }, + { + "epoch": 0.30032217397394595, + "grad_norm": 0.7614679932594299, + "learning_rate": 4.4975e-05, + "loss": 1.6916, + "step": 201 + }, + { + "epoch": 0.30181631414297055, + "grad_norm": 0.5107477307319641, + "learning_rate": 4.495e-05, + "loss": 1.6286, + "step": 202 + }, + { + "epoch": 0.30331045431199516, + "grad_norm": 1.2998956441879272, + "learning_rate": 4.4925e-05, + "loss": 1.9156, + "step": 203 + }, + { + "epoch": 0.30480459448101976, + "grad_norm": 0.4685801863670349, + "learning_rate": 4.49e-05, + "loss": 1.7799, + "step": 204 + }, + { + "epoch": 0.30629873465004437, + "grad_norm": 0.7038085460662842, + "learning_rate": 4.4875e-05, + "loss": 1.7224, + "step": 205 + }, + { + "epoch": 0.307792874819069, + "grad_norm": 0.5876782536506653, + "learning_rate": 4.4850000000000006e-05, + "loss": 1.6782, + "step": 206 + }, + { + "epoch": 0.3092870149880936, + "grad_norm": 0.7861223220825195, + "learning_rate": 4.4825e-05, + "loss": 1.7567, + "step": 207 + }, + { + "epoch": 0.3107811551571182, + "grad_norm": 0.7584080100059509, + "learning_rate": 4.4800000000000005e-05, + "loss": 1.7193, + "step": 208 + }, + { + "epoch": 0.3122752953261428, + "grad_norm": 0.6750642657279968, + "learning_rate": 4.4775e-05, + "loss": 1.6588, + "step": 209 + }, + { + "epoch": 0.3137694354951674, + "grad_norm": 0.6683038473129272, + "learning_rate": 4.4750000000000004e-05, + "loss": 1.6351, + "step": 210 + }, + { + "epoch": 0.315263575664192, + "grad_norm": 0.7034425735473633, + "learning_rate": 4.4725e-05, + "loss": 1.6917, + "step": 211 + }, + { + "epoch": 0.3167577158332166, + "grad_norm": 0.6255552172660828, + "learning_rate": 4.47e-05, + "loss": 1.7779, + "step": 212 + }, + { + "epoch": 0.3182518560022412, + "grad_norm": 0.7990907430648804, + "learning_rate": 4.4675e-05, + "loss": 1.6069, + "step": 213 + }, + { + "epoch": 0.3197459961712658, + "grad_norm": 1.1993693113327026, + "learning_rate": 4.465e-05, + "loss": 1.9121, + "step": 214 + }, + { + "epoch": 0.3212401363402904, + "grad_norm": 0.5611770749092102, + "learning_rate": 4.4625e-05, + "loss": 1.62, + "step": 215 + }, + { + "epoch": 0.322734276509315, + "grad_norm": 0.7897306680679321, + "learning_rate": 4.46e-05, + "loss": 1.5772, + "step": 216 + }, + { + "epoch": 0.3242284166783396, + "grad_norm": 0.7402398586273193, + "learning_rate": 4.4575e-05, + "loss": 1.7087, + "step": 217 + }, + { + "epoch": 0.32572255684736423, + "grad_norm": 0.5716530680656433, + "learning_rate": 4.4550000000000005e-05, + "loss": 1.6714, + "step": 218 + }, + { + "epoch": 0.32721669701638884, + "grad_norm": 0.6520130634307861, + "learning_rate": 4.4525e-05, + "loss": 1.6386, + "step": 219 + }, + { + "epoch": 0.32871083718541344, + "grad_norm": 0.5196349024772644, + "learning_rate": 4.4500000000000004e-05, + "loss": 1.6034, + "step": 220 + }, + { + "epoch": 0.33020497735443805, + "grad_norm": 0.6718409657478333, + "learning_rate": 4.4475e-05, + "loss": 1.6676, + "step": 221 + }, + { + "epoch": 0.33169911752346265, + "grad_norm": 1.0844227075576782, + "learning_rate": 4.445e-05, + "loss": 1.6749, + "step": 222 + }, + { + "epoch": 0.33319325769248725, + "grad_norm": 0.6650180816650391, + "learning_rate": 4.4425e-05, + "loss": 1.6038, + "step": 223 + }, + { + "epoch": 0.33468739786151186, + "grad_norm": 0.5425866842269897, + "learning_rate": 4.44e-05, + "loss": 1.8488, + "step": 224 + }, + { + "epoch": 0.33618153803053646, + "grad_norm": 0.4744863510131836, + "learning_rate": 4.4375e-05, + "loss": 1.6235, + "step": 225 + }, + { + "epoch": 0.33767567819956107, + "grad_norm": 0.564362645149231, + "learning_rate": 4.435e-05, + "loss": 1.6729, + "step": 226 + }, + { + "epoch": 0.33916981836858573, + "grad_norm": 0.3389017879962921, + "learning_rate": 4.4325e-05, + "loss": 1.5262, + "step": 227 + }, + { + "epoch": 0.34066395853761033, + "grad_norm": 0.7532176971435547, + "learning_rate": 4.43e-05, + "loss": 1.7369, + "step": 228 + }, + { + "epoch": 0.34215809870663494, + "grad_norm": 0.6776916980743408, + "learning_rate": 4.4275e-05, + "loss": 1.7183, + "step": 229 + }, + { + "epoch": 0.34365223887565954, + "grad_norm": 0.5167742371559143, + "learning_rate": 4.4250000000000005e-05, + "loss": 1.7791, + "step": 230 + }, + { + "epoch": 0.34514637904468415, + "grad_norm": 0.6937631368637085, + "learning_rate": 4.4225e-05, + "loss": 1.7255, + "step": 231 + }, + { + "epoch": 0.34664051921370875, + "grad_norm": 0.5599237084388733, + "learning_rate": 4.4200000000000004e-05, + "loss": 1.6517, + "step": 232 + }, + { + "epoch": 0.34813465938273336, + "grad_norm": 0.6251773238182068, + "learning_rate": 4.4174999999999996e-05, + "loss": 1.6929, + "step": 233 + }, + { + "epoch": 0.34962879955175796, + "grad_norm": 0.9755148887634277, + "learning_rate": 4.415e-05, + "loss": 1.6177, + "step": 234 + }, + { + "epoch": 0.35112293972078257, + "grad_norm": 0.951604962348938, + "learning_rate": 4.4125e-05, + "loss": 1.5687, + "step": 235 + }, + { + "epoch": 0.3526170798898072, + "grad_norm": 0.45756927132606506, + "learning_rate": 4.41e-05, + "loss": 1.6375, + "step": 236 + }, + { + "epoch": 0.3541112200588318, + "grad_norm": 0.4150543808937073, + "learning_rate": 4.4075e-05, + "loss": 1.5537, + "step": 237 + }, + { + "epoch": 0.3556053602278564, + "grad_norm": 0.4883075952529907, + "learning_rate": 4.405e-05, + "loss": 1.6506, + "step": 238 + }, + { + "epoch": 0.357099500396881, + "grad_norm": 0.5154853463172913, + "learning_rate": 4.4025e-05, + "loss": 1.7254, + "step": 239 + }, + { + "epoch": 0.3585936405659056, + "grad_norm": 1.2003599405288696, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.6396, + "step": 240 + }, + { + "epoch": 0.3600877807349302, + "grad_norm": 0.5465105175971985, + "learning_rate": 4.3975e-05, + "loss": 1.5758, + "step": 241 + }, + { + "epoch": 0.3615819209039548, + "grad_norm": 0.5422613620758057, + "learning_rate": 4.3950000000000004e-05, + "loss": 1.5582, + "step": 242 + }, + { + "epoch": 0.3630760610729794, + "grad_norm": 0.44329527020454407, + "learning_rate": 4.3925e-05, + "loss": 1.6287, + "step": 243 + }, + { + "epoch": 0.364570201242004, + "grad_norm": 0.7963645458221436, + "learning_rate": 4.39e-05, + "loss": 1.736, + "step": 244 + }, + { + "epoch": 0.3660643414110286, + "grad_norm": 0.490032434463501, + "learning_rate": 4.3875e-05, + "loss": 1.6711, + "step": 245 + }, + { + "epoch": 0.3675584815800532, + "grad_norm": 1.1757748126983643, + "learning_rate": 4.385e-05, + "loss": 1.6462, + "step": 246 + }, + { + "epoch": 0.3690526217490778, + "grad_norm": 0.39032885432243347, + "learning_rate": 4.3825e-05, + "loss": 1.7074, + "step": 247 + }, + { + "epoch": 0.37054676191810243, + "grad_norm": 0.5092072486877441, + "learning_rate": 4.38e-05, + "loss": 1.4719, + "step": 248 + }, + { + "epoch": 0.37204090208712703, + "grad_norm": 0.6537796258926392, + "learning_rate": 4.3775e-05, + "loss": 1.7254, + "step": 249 + }, + { + "epoch": 0.37353504225615164, + "grad_norm": 0.9034501314163208, + "learning_rate": 4.375e-05, + "loss": 1.5082, + "step": 250 + }, + { + "epoch": 0.37502918242517624, + "grad_norm": 1.2041537761688232, + "learning_rate": 4.3725000000000006e-05, + "loss": 1.8014, + "step": 251 + }, + { + "epoch": 0.37652332259420085, + "grad_norm": 0.6997256278991699, + "learning_rate": 4.3700000000000005e-05, + "loss": 1.7564, + "step": 252 + }, + { + "epoch": 0.37801746276322545, + "grad_norm": 0.6815714240074158, + "learning_rate": 4.3675000000000005e-05, + "loss": 1.5752, + "step": 253 + }, + { + "epoch": 0.37951160293225006, + "grad_norm": 2.3094871044158936, + "learning_rate": 4.3650000000000004e-05, + "loss": 1.6464, + "step": 254 + }, + { + "epoch": 0.38100574310127466, + "grad_norm": 0.5100992918014526, + "learning_rate": 4.3625e-05, + "loss": 1.8269, + "step": 255 + }, + { + "epoch": 0.38249988327029927, + "grad_norm": 0.4920763373374939, + "learning_rate": 4.36e-05, + "loss": 1.5146, + "step": 256 + }, + { + "epoch": 0.38399402343932393, + "grad_norm": 0.5346136093139648, + "learning_rate": 4.3575e-05, + "loss": 1.6748, + "step": 257 + }, + { + "epoch": 0.38548816360834853, + "grad_norm": 0.5572606921195984, + "learning_rate": 4.355e-05, + "loss": 1.6669, + "step": 258 + }, + { + "epoch": 0.38698230377737314, + "grad_norm": 0.344314843416214, + "learning_rate": 4.352500000000001e-05, + "loss": 1.6193, + "step": 259 + }, + { + "epoch": 0.38847644394639774, + "grad_norm": 0.47694844007492065, + "learning_rate": 4.35e-05, + "loss": 1.6713, + "step": 260 + }, + { + "epoch": 0.38997058411542235, + "grad_norm": 0.5320355892181396, + "learning_rate": 4.3475000000000006e-05, + "loss": 1.5918, + "step": 261 + }, + { + "epoch": 0.39146472428444695, + "grad_norm": 0.5045621395111084, + "learning_rate": 4.345e-05, + "loss": 1.5785, + "step": 262 + }, + { + "epoch": 0.39295886445347156, + "grad_norm": 0.48489850759506226, + "learning_rate": 4.3425000000000005e-05, + "loss": 1.8115, + "step": 263 + }, + { + "epoch": 0.39445300462249616, + "grad_norm": 0.8452106714248657, + "learning_rate": 4.3400000000000005e-05, + "loss": 1.6116, + "step": 264 + }, + { + "epoch": 0.39594714479152077, + "grad_norm": 0.6507668495178223, + "learning_rate": 4.3375000000000004e-05, + "loss": 1.5557, + "step": 265 + }, + { + "epoch": 0.39744128496054537, + "grad_norm": 0.5312817692756653, + "learning_rate": 4.335e-05, + "loss": 1.5833, + "step": 266 + }, + { + "epoch": 0.39893542512957, + "grad_norm": 1.0674476623535156, + "learning_rate": 4.3325e-05, + "loss": 1.6319, + "step": 267 + }, + { + "epoch": 0.4004295652985946, + "grad_norm": 0.4685145616531372, + "learning_rate": 4.33e-05, + "loss": 1.6653, + "step": 268 + }, + { + "epoch": 0.4019237054676192, + "grad_norm": 0.6625531315803528, + "learning_rate": 4.3275e-05, + "loss": 1.5749, + "step": 269 + }, + { + "epoch": 0.4034178456366438, + "grad_norm": 0.46707406640052795, + "learning_rate": 4.325e-05, + "loss": 1.4153, + "step": 270 + }, + { + "epoch": 0.4049119858056684, + "grad_norm": 0.7175610065460205, + "learning_rate": 4.322500000000001e-05, + "loss": 1.6583, + "step": 271 + }, + { + "epoch": 0.406406125974693, + "grad_norm": 1.0776153802871704, + "learning_rate": 4.32e-05, + "loss": 1.6923, + "step": 272 + }, + { + "epoch": 0.4079002661437176, + "grad_norm": 0.453715443611145, + "learning_rate": 4.3175000000000006e-05, + "loss": 1.6136, + "step": 273 + }, + { + "epoch": 0.4093944063127422, + "grad_norm": 0.5465681552886963, + "learning_rate": 4.315e-05, + "loss": 1.5902, + "step": 274 + }, + { + "epoch": 0.4108885464817668, + "grad_norm": 0.49540913105010986, + "learning_rate": 4.3125000000000005e-05, + "loss": 1.4397, + "step": 275 + }, + { + "epoch": 0.4123826866507914, + "grad_norm": 0.39481091499328613, + "learning_rate": 4.3100000000000004e-05, + "loss": 1.4795, + "step": 276 + }, + { + "epoch": 0.413876826819816, + "grad_norm": 0.8381078839302063, + "learning_rate": 4.3075000000000003e-05, + "loss": 1.5042, + "step": 277 + }, + { + "epoch": 0.41537096698884063, + "grad_norm": 0.366020143032074, + "learning_rate": 4.305e-05, + "loss": 1.4619, + "step": 278 + }, + { + "epoch": 0.41686510715786523, + "grad_norm": 0.3974578082561493, + "learning_rate": 4.3025e-05, + "loss": 1.6508, + "step": 279 + }, + { + "epoch": 0.41835924732688984, + "grad_norm": 0.38582220673561096, + "learning_rate": 4.3e-05, + "loss": 1.7035, + "step": 280 + }, + { + "epoch": 0.41985338749591444, + "grad_norm": 0.5376545786857605, + "learning_rate": 4.2975e-05, + "loss": 1.5625, + "step": 281 + }, + { + "epoch": 0.42134752766493905, + "grad_norm": 0.3950735032558441, + "learning_rate": 4.295e-05, + "loss": 1.58, + "step": 282 + }, + { + "epoch": 0.42284166783396365, + "grad_norm": 0.496115505695343, + "learning_rate": 4.2925000000000007e-05, + "loss": 1.7686, + "step": 283 + }, + { + "epoch": 0.42433580800298826, + "grad_norm": 0.5910571813583374, + "learning_rate": 4.29e-05, + "loss": 1.6702, + "step": 284 + }, + { + "epoch": 0.42582994817201286, + "grad_norm": 0.3945789039134979, + "learning_rate": 4.2875000000000005e-05, + "loss": 1.7765, + "step": 285 + }, + { + "epoch": 0.42732408834103747, + "grad_norm": 0.5639134645462036, + "learning_rate": 4.285e-05, + "loss": 1.4734, + "step": 286 + }, + { + "epoch": 0.4288182285100621, + "grad_norm": 0.3879479765892029, + "learning_rate": 4.2825000000000004e-05, + "loss": 1.4693, + "step": 287 + }, + { + "epoch": 0.43031236867908673, + "grad_norm": 0.5454258322715759, + "learning_rate": 4.2800000000000004e-05, + "loss": 1.3784, + "step": 288 + }, + { + "epoch": 0.43180650884811134, + "grad_norm": 0.4519095718860626, + "learning_rate": 4.2775e-05, + "loss": 1.6286, + "step": 289 + }, + { + "epoch": 0.43330064901713594, + "grad_norm": 0.49812281131744385, + "learning_rate": 4.275e-05, + "loss": 1.6295, + "step": 290 + }, + { + "epoch": 0.43479478918616055, + "grad_norm": 0.5681044459342957, + "learning_rate": 4.2725e-05, + "loss": 1.5457, + "step": 291 + }, + { + "epoch": 0.43628892935518515, + "grad_norm": 0.3775969445705414, + "learning_rate": 4.27e-05, + "loss": 1.4316, + "step": 292 + }, + { + "epoch": 0.43778306952420976, + "grad_norm": 0.5119141340255737, + "learning_rate": 4.2675e-05, + "loss": 1.5647, + "step": 293 + }, + { + "epoch": 0.43927720969323436, + "grad_norm": 0.5567271113395691, + "learning_rate": 4.265e-05, + "loss": 1.6077, + "step": 294 + }, + { + "epoch": 0.44077134986225897, + "grad_norm": 2.239450216293335, + "learning_rate": 4.2625000000000006e-05, + "loss": 1.4764, + "step": 295 + }, + { + "epoch": 0.44226549003128357, + "grad_norm": 0.41082441806793213, + "learning_rate": 4.26e-05, + "loss": 1.6279, + "step": 296 + }, + { + "epoch": 0.4437596302003082, + "grad_norm": 0.3343771994113922, + "learning_rate": 4.2575000000000005e-05, + "loss": 1.5487, + "step": 297 + }, + { + "epoch": 0.4452537703693328, + "grad_norm": 0.35104796290397644, + "learning_rate": 4.2550000000000004e-05, + "loss": 1.5431, + "step": 298 + }, + { + "epoch": 0.4467479105383574, + "grad_norm": 0.7141225337982178, + "learning_rate": 4.2525000000000004e-05, + "loss": 1.5845, + "step": 299 + }, + { + "epoch": 0.448242050707382, + "grad_norm": 1.133215069770813, + "learning_rate": 4.25e-05, + "loss": 1.5753, + "step": 300 + }, + { + "epoch": 0.4497361908764066, + "grad_norm": 0.419767826795578, + "learning_rate": 4.2475e-05, + "loss": 1.5059, + "step": 301 + }, + { + "epoch": 0.4512303310454312, + "grad_norm": 0.3364299535751343, + "learning_rate": 4.245e-05, + "loss": 1.5082, + "step": 302 + }, + { + "epoch": 0.4527244712144558, + "grad_norm": 2.21221661567688, + "learning_rate": 4.2425e-05, + "loss": 1.7031, + "step": 303 + }, + { + "epoch": 0.4542186113834804, + "grad_norm": 1.6208406686782837, + "learning_rate": 4.24e-05, + "loss": 1.657, + "step": 304 + }, + { + "epoch": 0.455712751552505, + "grad_norm": 0.6496986150741577, + "learning_rate": 4.237500000000001e-05, + "loss": 1.8416, + "step": 305 + }, + { + "epoch": 0.4572068917215296, + "grad_norm": 0.3318963050842285, + "learning_rate": 4.235e-05, + "loss": 1.7409, + "step": 306 + }, + { + "epoch": 0.4587010318905542, + "grad_norm": 0.5762292742729187, + "learning_rate": 4.2325000000000006e-05, + "loss": 1.5737, + "step": 307 + }, + { + "epoch": 0.4601951720595788, + "grad_norm": 0.7929126024246216, + "learning_rate": 4.23e-05, + "loss": 1.6096, + "step": 308 + }, + { + "epoch": 0.46168931222860343, + "grad_norm": 0.3540865480899811, + "learning_rate": 4.2275000000000004e-05, + "loss": 1.4596, + "step": 309 + }, + { + "epoch": 0.46318345239762804, + "grad_norm": 0.3595794141292572, + "learning_rate": 4.2250000000000004e-05, + "loss": 1.6072, + "step": 310 + }, + { + "epoch": 0.46467759256665264, + "grad_norm": 0.8183721899986267, + "learning_rate": 4.2225e-05, + "loss": 1.4945, + "step": 311 + }, + { + "epoch": 0.46617173273567725, + "grad_norm": 0.36932075023651123, + "learning_rate": 4.22e-05, + "loss": 1.6256, + "step": 312 + }, + { + "epoch": 0.46766587290470185, + "grad_norm": 1.8945622444152832, + "learning_rate": 4.2175e-05, + "loss": 1.7877, + "step": 313 + }, + { + "epoch": 0.46916001307372646, + "grad_norm": 0.4660819172859192, + "learning_rate": 4.215e-05, + "loss": 1.3404, + "step": 314 + }, + { + "epoch": 0.47065415324275106, + "grad_norm": 0.3449573814868927, + "learning_rate": 4.2125e-05, + "loss": 1.4818, + "step": 315 + }, + { + "epoch": 0.47214829341177567, + "grad_norm": 0.5242890119552612, + "learning_rate": 4.21e-05, + "loss": 1.6351, + "step": 316 + }, + { + "epoch": 0.4736424335808003, + "grad_norm": 0.709052324295044, + "learning_rate": 4.2075000000000006e-05, + "loss": 1.4658, + "step": 317 + }, + { + "epoch": 0.47513657374982493, + "grad_norm": 0.4316338300704956, + "learning_rate": 4.205e-05, + "loss": 1.4741, + "step": 318 + }, + { + "epoch": 0.47663071391884954, + "grad_norm": 0.6699191331863403, + "learning_rate": 4.2025000000000005e-05, + "loss": 1.5725, + "step": 319 + }, + { + "epoch": 0.47812485408787414, + "grad_norm": 0.5343687534332275, + "learning_rate": 4.2e-05, + "loss": 1.5691, + "step": 320 + }, + { + "epoch": 0.47961899425689875, + "grad_norm": 0.35519710183143616, + "learning_rate": 4.1975000000000004e-05, + "loss": 1.2731, + "step": 321 + }, + { + "epoch": 0.48111313442592335, + "grad_norm": 0.385275274515152, + "learning_rate": 4.195e-05, + "loss": 1.345, + "step": 322 + }, + { + "epoch": 0.48260727459494795, + "grad_norm": 0.6153557300567627, + "learning_rate": 4.1925e-05, + "loss": 1.5702, + "step": 323 + }, + { + "epoch": 0.48410141476397256, + "grad_norm": 0.6316625475883484, + "learning_rate": 4.19e-05, + "loss": 1.4381, + "step": 324 + }, + { + "epoch": 0.48559555493299716, + "grad_norm": 2.4540088176727295, + "learning_rate": 4.1875e-05, + "loss": 1.3345, + "step": 325 + }, + { + "epoch": 0.48708969510202177, + "grad_norm": 0.3455633819103241, + "learning_rate": 4.185e-05, + "loss": 1.2002, + "step": 326 + }, + { + "epoch": 0.4885838352710464, + "grad_norm": 0.6012340188026428, + "learning_rate": 4.1825e-05, + "loss": 1.5849, + "step": 327 + }, + { + "epoch": 0.490077975440071, + "grad_norm": 0.6323389410972595, + "learning_rate": 4.18e-05, + "loss": 1.5271, + "step": 328 + }, + { + "epoch": 0.4915721156090956, + "grad_norm": 0.5738844871520996, + "learning_rate": 4.1775000000000006e-05, + "loss": 1.6768, + "step": 329 + }, + { + "epoch": 0.4930662557781202, + "grad_norm": 0.5335976481437683, + "learning_rate": 4.175e-05, + "loss": 1.625, + "step": 330 + }, + { + "epoch": 0.4945603959471448, + "grad_norm": 0.40596872568130493, + "learning_rate": 4.1725000000000005e-05, + "loss": 1.573, + "step": 331 + }, + { + "epoch": 0.4960545361161694, + "grad_norm": 0.46490761637687683, + "learning_rate": 4.17e-05, + "loss": 1.3669, + "step": 332 + }, + { + "epoch": 0.497548676285194, + "grad_norm": 0.42321160435676575, + "learning_rate": 4.1675e-05, + "loss": 1.4261, + "step": 333 + }, + { + "epoch": 0.4990428164542186, + "grad_norm": 0.8209381699562073, + "learning_rate": 4.165e-05, + "loss": 1.3413, + "step": 334 + }, + { + "epoch": 0.5005369566232433, + "grad_norm": 0.3648524880409241, + "learning_rate": 4.1625e-05, + "loss": 1.3683, + "step": 335 + }, + { + "epoch": 0.5020310967922679, + "grad_norm": 0.4434033930301666, + "learning_rate": 4.16e-05, + "loss": 1.5501, + "step": 336 + }, + { + "epoch": 0.5035252369612925, + "grad_norm": 0.44914835691452026, + "learning_rate": 4.1575e-05, + "loss": 1.7285, + "step": 337 + }, + { + "epoch": 0.5050193771303171, + "grad_norm": 0.4023708701133728, + "learning_rate": 4.155e-05, + "loss": 1.4152, + "step": 338 + }, + { + "epoch": 0.5065135172993417, + "grad_norm": 0.39679622650146484, + "learning_rate": 4.1525e-05, + "loss": 1.5925, + "step": 339 + }, + { + "epoch": 0.5080076574683663, + "grad_norm": 0.41201382875442505, + "learning_rate": 4.15e-05, + "loss": 1.4728, + "step": 340 + }, + { + "epoch": 0.5095017976373909, + "grad_norm": 0.37450897693634033, + "learning_rate": 4.1475000000000005e-05, + "loss": 1.3821, + "step": 341 + }, + { + "epoch": 0.5109959378064155, + "grad_norm": 3.6442813873291016, + "learning_rate": 4.145e-05, + "loss": 1.4883, + "step": 342 + }, + { + "epoch": 0.5124900779754401, + "grad_norm": 0.529839813709259, + "learning_rate": 4.1425000000000004e-05, + "loss": 1.5838, + "step": 343 + }, + { + "epoch": 0.5139842181444647, + "grad_norm": 0.47011345624923706, + "learning_rate": 4.14e-05, + "loss": 1.4973, + "step": 344 + }, + { + "epoch": 0.5154783583134893, + "grad_norm": 0.7854222059249878, + "learning_rate": 4.1375e-05, + "loss": 1.526, + "step": 345 + }, + { + "epoch": 0.5169724984825139, + "grad_norm": 0.3422839641571045, + "learning_rate": 4.135e-05, + "loss": 1.7585, + "step": 346 + }, + { + "epoch": 0.5184666386515385, + "grad_norm": 0.3906049132347107, + "learning_rate": 4.1325e-05, + "loss": 1.5914, + "step": 347 + }, + { + "epoch": 0.5199607788205631, + "grad_norm": 0.8085627555847168, + "learning_rate": 4.13e-05, + "loss": 1.5909, + "step": 348 + }, + { + "epoch": 0.5214549189895877, + "grad_norm": 0.571151614189148, + "learning_rate": 4.1275e-05, + "loss": 1.5494, + "step": 349 + }, + { + "epoch": 0.5229490591586123, + "grad_norm": 1.0025105476379395, + "learning_rate": 4.125e-05, + "loss": 1.7711, + "step": 350 + }, + { + "epoch": 0.5244431993276369, + "grad_norm": 0.361357182264328, + "learning_rate": 4.1225e-05, + "loss": 1.494, + "step": 351 + }, + { + "epoch": 0.5259373394966615, + "grad_norm": 0.3900647759437561, + "learning_rate": 4.12e-05, + "loss": 1.5864, + "step": 352 + }, + { + "epoch": 0.5274314796656862, + "grad_norm": 0.4274671673774719, + "learning_rate": 4.1175000000000005e-05, + "loss": 1.4905, + "step": 353 + }, + { + "epoch": 0.5289256198347108, + "grad_norm": 0.3727322220802307, + "learning_rate": 4.115e-05, + "loss": 1.5137, + "step": 354 + }, + { + "epoch": 0.5304197600037354, + "grad_norm": 0.49744248390197754, + "learning_rate": 4.1125000000000004e-05, + "loss": 1.509, + "step": 355 + }, + { + "epoch": 0.53191390017276, + "grad_norm": 0.39131686091423035, + "learning_rate": 4.11e-05, + "loss": 1.5525, + "step": 356 + }, + { + "epoch": 0.5334080403417846, + "grad_norm": 1.4326519966125488, + "learning_rate": 4.1075e-05, + "loss": 1.5982, + "step": 357 + }, + { + "epoch": 0.5349021805108092, + "grad_norm": 0.4607619047164917, + "learning_rate": 4.105e-05, + "loss": 1.3984, + "step": 358 + }, + { + "epoch": 0.5363963206798338, + "grad_norm": 0.5902805328369141, + "learning_rate": 4.1025e-05, + "loss": 1.4801, + "step": 359 + }, + { + "epoch": 0.5378904608488584, + "grad_norm": 0.34372955560684204, + "learning_rate": 4.1e-05, + "loss": 1.4714, + "step": 360 + }, + { + "epoch": 0.539384601017883, + "grad_norm": 0.32892340421676636, + "learning_rate": 4.0975e-05, + "loss": 1.5977, + "step": 361 + }, + { + "epoch": 0.5408787411869076, + "grad_norm": 0.45786258578300476, + "learning_rate": 4.095e-05, + "loss": 1.6396, + "step": 362 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 0.6414769887924194, + "learning_rate": 4.0925000000000005e-05, + "loss": 1.3731, + "step": 363 + }, + { + "epoch": 0.5438670215249568, + "grad_norm": 0.4823474586009979, + "learning_rate": 4.09e-05, + "loss": 1.4974, + "step": 364 + }, + { + "epoch": 0.5453611616939814, + "grad_norm": 1.1086201667785645, + "learning_rate": 4.0875000000000004e-05, + "loss": 1.6457, + "step": 365 + }, + { + "epoch": 0.546855301863006, + "grad_norm": 0.9799003005027771, + "learning_rate": 4.085e-05, + "loss": 1.4262, + "step": 366 + }, + { + "epoch": 0.5483494420320306, + "grad_norm": 0.5253656506538391, + "learning_rate": 4.0825e-05, + "loss": 1.3437, + "step": 367 + }, + { + "epoch": 0.5498435822010552, + "grad_norm": 0.6868051290512085, + "learning_rate": 4.08e-05, + "loss": 1.464, + "step": 368 + }, + { + "epoch": 0.5513377223700798, + "grad_norm": 0.33976659178733826, + "learning_rate": 4.0775e-05, + "loss": 1.693, + "step": 369 + }, + { + "epoch": 0.5528318625391044, + "grad_norm": 0.3966132700443268, + "learning_rate": 4.075e-05, + "loss": 1.6771, + "step": 370 + }, + { + "epoch": 0.554326002708129, + "grad_norm": 0.39192917943000793, + "learning_rate": 4.0725e-05, + "loss": 1.5507, + "step": 371 + }, + { + "epoch": 0.5558201428771536, + "grad_norm": 1.481609582901001, + "learning_rate": 4.07e-05, + "loss": 1.4611, + "step": 372 + }, + { + "epoch": 0.5573142830461782, + "grad_norm": 0.6329764723777771, + "learning_rate": 4.0675e-05, + "loss": 1.542, + "step": 373 + }, + { + "epoch": 0.5588084232152029, + "grad_norm": 0.41724011301994324, + "learning_rate": 4.065e-05, + "loss": 1.6033, + "step": 374 + }, + { + "epoch": 0.5603025633842275, + "grad_norm": 0.32675087451934814, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.5204, + "step": 375 + }, + { + "epoch": 0.5617967035532521, + "grad_norm": 0.35794204473495483, + "learning_rate": 4.0600000000000004e-05, + "loss": 1.4091, + "step": 376 + }, + { + "epoch": 0.5632908437222767, + "grad_norm": 0.6309773921966553, + "learning_rate": 4.0575000000000004e-05, + "loss": 1.4087, + "step": 377 + }, + { + "epoch": 0.5647849838913013, + "grad_norm": 0.4585449993610382, + "learning_rate": 4.055e-05, + "loss": 1.5238, + "step": 378 + }, + { + "epoch": 0.5662791240603259, + "grad_norm": 0.3686904311180115, + "learning_rate": 4.0525e-05, + "loss": 1.5567, + "step": 379 + }, + { + "epoch": 0.5677732642293505, + "grad_norm": 0.4154909551143646, + "learning_rate": 4.05e-05, + "loss": 1.4935, + "step": 380 + }, + { + "epoch": 0.5692674043983751, + "grad_norm": 0.302909255027771, + "learning_rate": 4.0475e-05, + "loss": 1.5521, + "step": 381 + }, + { + "epoch": 0.5707615445673997, + "grad_norm": 0.40441974997520447, + "learning_rate": 4.045000000000001e-05, + "loss": 1.6756, + "step": 382 + }, + { + "epoch": 0.5722556847364243, + "grad_norm": 0.4657610058784485, + "learning_rate": 4.0425e-05, + "loss": 1.5313, + "step": 383 + }, + { + "epoch": 0.5737498249054489, + "grad_norm": 1.0451929569244385, + "learning_rate": 4.0400000000000006e-05, + "loss": 1.5619, + "step": 384 + }, + { + "epoch": 0.5752439650744735, + "grad_norm": 0.4887607991695404, + "learning_rate": 4.0375e-05, + "loss": 1.5529, + "step": 385 + }, + { + "epoch": 0.5767381052434981, + "grad_norm": 0.4439973533153534, + "learning_rate": 4.0350000000000005e-05, + "loss": 1.5859, + "step": 386 + }, + { + "epoch": 0.5782322454125227, + "grad_norm": 0.36021965742111206, + "learning_rate": 4.0325000000000004e-05, + "loss": 1.5467, + "step": 387 + }, + { + "epoch": 0.5797263855815473, + "grad_norm": 0.9734621047973633, + "learning_rate": 4.0300000000000004e-05, + "loss": 1.6785, + "step": 388 + }, + { + "epoch": 0.5812205257505719, + "grad_norm": 0.4874991774559021, + "learning_rate": 4.0275e-05, + "loss": 1.462, + "step": 389 + }, + { + "epoch": 0.5827146659195965, + "grad_norm": 0.3743913769721985, + "learning_rate": 4.025e-05, + "loss": 1.4352, + "step": 390 + }, + { + "epoch": 0.5842088060886211, + "grad_norm": 0.5307121276855469, + "learning_rate": 4.0225e-05, + "loss": 1.5183, + "step": 391 + }, + { + "epoch": 0.5857029462576457, + "grad_norm": 0.616095244884491, + "learning_rate": 4.02e-05, + "loss": 1.4568, + "step": 392 + }, + { + "epoch": 0.5871970864266705, + "grad_norm": 0.7378297448158264, + "learning_rate": 4.0175e-05, + "loss": 1.6607, + "step": 393 + }, + { + "epoch": 0.5886912265956951, + "grad_norm": 0.38973790407180786, + "learning_rate": 4.015000000000001e-05, + "loss": 1.5029, + "step": 394 + }, + { + "epoch": 0.5901853667647197, + "grad_norm": 0.7084895968437195, + "learning_rate": 4.0125e-05, + "loss": 1.6412, + "step": 395 + }, + { + "epoch": 0.5916795069337443, + "grad_norm": 0.615085244178772, + "learning_rate": 4.0100000000000006e-05, + "loss": 1.48, + "step": 396 + }, + { + "epoch": 0.5931736471027689, + "grad_norm": 0.5237393379211426, + "learning_rate": 4.0075e-05, + "loss": 1.5518, + "step": 397 + }, + { + "epoch": 0.5946677872717935, + "grad_norm": 0.5331382155418396, + "learning_rate": 4.0050000000000004e-05, + "loss": 1.5439, + "step": 398 + }, + { + "epoch": 0.5961619274408181, + "grad_norm": 0.36154451966285706, + "learning_rate": 4.0025000000000004e-05, + "loss": 1.7163, + "step": 399 + }, + { + "epoch": 0.5976560676098427, + "grad_norm": 0.3548503816127777, + "learning_rate": 4e-05, + "loss": 1.4117, + "step": 400 + }, + { + "epoch": 0.5991502077788673, + "grad_norm": 0.5904132127761841, + "learning_rate": 3.9975e-05, + "loss": 1.4805, + "step": 401 + }, + { + "epoch": 0.6006443479478919, + "grad_norm": 0.2864095866680145, + "learning_rate": 3.995e-05, + "loss": 1.4976, + "step": 402 + }, + { + "epoch": 0.6021384881169165, + "grad_norm": 1.153709888458252, + "learning_rate": 3.9925e-05, + "loss": 1.3897, + "step": 403 + }, + { + "epoch": 0.6036326282859411, + "grad_norm": 0.48970621824264526, + "learning_rate": 3.99e-05, + "loss": 1.5441, + "step": 404 + }, + { + "epoch": 0.6051267684549657, + "grad_norm": 0.8419874906539917, + "learning_rate": 3.9875e-05, + "loss": 1.5113, + "step": 405 + }, + { + "epoch": 0.6066209086239903, + "grad_norm": 0.429151713848114, + "learning_rate": 3.9850000000000006e-05, + "loss": 1.5316, + "step": 406 + }, + { + "epoch": 0.6081150487930149, + "grad_norm": 0.24945476651191711, + "learning_rate": 3.9825e-05, + "loss": 1.3466, + "step": 407 + }, + { + "epoch": 0.6096091889620395, + "grad_norm": 0.8487308621406555, + "learning_rate": 3.9800000000000005e-05, + "loss": 1.5499, + "step": 408 + }, + { + "epoch": 0.6111033291310641, + "grad_norm": 0.5790063738822937, + "learning_rate": 3.9775e-05, + "loss": 1.5398, + "step": 409 + }, + { + "epoch": 0.6125974693000887, + "grad_norm": 0.3966883718967438, + "learning_rate": 3.9750000000000004e-05, + "loss": 1.7663, + "step": 410 + }, + { + "epoch": 0.6140916094691133, + "grad_norm": 0.3764961361885071, + "learning_rate": 3.9725e-05, + "loss": 1.6983, + "step": 411 + }, + { + "epoch": 0.615585749638138, + "grad_norm": 0.3106807470321655, + "learning_rate": 3.97e-05, + "loss": 1.4007, + "step": 412 + }, + { + "epoch": 0.6170798898071626, + "grad_norm": 0.5766478776931763, + "learning_rate": 3.9675e-05, + "loss": 1.8415, + "step": 413 + }, + { + "epoch": 0.6185740299761872, + "grad_norm": 0.662083089351654, + "learning_rate": 3.965e-05, + "loss": 1.4447, + "step": 414 + }, + { + "epoch": 0.6200681701452118, + "grad_norm": 0.7968102097511292, + "learning_rate": 3.9625e-05, + "loss": 1.7127, + "step": 415 + }, + { + "epoch": 0.6215623103142364, + "grad_norm": 0.3866075575351715, + "learning_rate": 3.960000000000001e-05, + "loss": 1.6128, + "step": 416 + }, + { + "epoch": 0.623056450483261, + "grad_norm": 0.40726110339164734, + "learning_rate": 3.9575e-05, + "loss": 1.4841, + "step": 417 + }, + { + "epoch": 0.6245505906522856, + "grad_norm": 0.3251897990703583, + "learning_rate": 3.9550000000000006e-05, + "loss": 1.4696, + "step": 418 + }, + { + "epoch": 0.6260447308213102, + "grad_norm": 0.5213460326194763, + "learning_rate": 3.9525e-05, + "loss": 1.6224, + "step": 419 + }, + { + "epoch": 0.6275388709903348, + "grad_norm": 0.33423733711242676, + "learning_rate": 3.9500000000000005e-05, + "loss": 1.5016, + "step": 420 + }, + { + "epoch": 0.6290330111593594, + "grad_norm": 0.39543136954307556, + "learning_rate": 3.9475000000000004e-05, + "loss": 1.5712, + "step": 421 + }, + { + "epoch": 0.630527151328384, + "grad_norm": 0.4221692383289337, + "learning_rate": 3.9450000000000003e-05, + "loss": 1.3305, + "step": 422 + }, + { + "epoch": 0.6320212914974086, + "grad_norm": 0.3848581314086914, + "learning_rate": 3.9425e-05, + "loss": 1.5393, + "step": 423 + }, + { + "epoch": 0.6335154316664332, + "grad_norm": 0.510204553604126, + "learning_rate": 3.94e-05, + "loss": 1.4739, + "step": 424 + }, + { + "epoch": 0.6350095718354578, + "grad_norm": 0.37247434258461, + "learning_rate": 3.9375e-05, + "loss": 1.3264, + "step": 425 + }, + { + "epoch": 0.6365037120044824, + "grad_norm": 0.6320198774337769, + "learning_rate": 3.935e-05, + "loss": 1.6585, + "step": 426 + }, + { + "epoch": 0.637997852173507, + "grad_norm": 0.4277353584766388, + "learning_rate": 3.9325e-05, + "loss": 1.5301, + "step": 427 + }, + { + "epoch": 0.6394919923425316, + "grad_norm": 0.33874911069869995, + "learning_rate": 3.9300000000000007e-05, + "loss": 1.4133, + "step": 428 + }, + { + "epoch": 0.6409861325115562, + "grad_norm": 0.5412879586219788, + "learning_rate": 3.9275e-05, + "loss": 1.4922, + "step": 429 + }, + { + "epoch": 0.6424802726805808, + "grad_norm": 0.8273118138313293, + "learning_rate": 3.9250000000000005e-05, + "loss": 1.4497, + "step": 430 + }, + { + "epoch": 0.6439744128496054, + "grad_norm": 0.42817798256874084, + "learning_rate": 3.9225e-05, + "loss": 1.6066, + "step": 431 + }, + { + "epoch": 0.64546855301863, + "grad_norm": 0.5471529960632324, + "learning_rate": 3.9200000000000004e-05, + "loss": 1.4121, + "step": 432 + }, + { + "epoch": 0.6469626931876546, + "grad_norm": 0.48357531428337097, + "learning_rate": 3.9175000000000004e-05, + "loss": 1.4167, + "step": 433 + }, + { + "epoch": 0.6484568333566793, + "grad_norm": 0.561120331287384, + "learning_rate": 3.915e-05, + "loss": 1.3161, + "step": 434 + }, + { + "epoch": 0.6499509735257039, + "grad_norm": 0.37435629963874817, + "learning_rate": 3.9125e-05, + "loss": 1.4888, + "step": 435 + }, + { + "epoch": 0.6514451136947285, + "grad_norm": 0.302143931388855, + "learning_rate": 3.91e-05, + "loss": 1.3934, + "step": 436 + }, + { + "epoch": 0.6529392538637531, + "grad_norm": 0.6095237135887146, + "learning_rate": 3.9075e-05, + "loss": 1.4218, + "step": 437 + }, + { + "epoch": 0.6544333940327777, + "grad_norm": 0.6860696077346802, + "learning_rate": 3.905e-05, + "loss": 1.4912, + "step": 438 + }, + { + "epoch": 0.6559275342018023, + "grad_norm": 0.34080833196640015, + "learning_rate": 3.9025e-05, + "loss": 1.3564, + "step": 439 + }, + { + "epoch": 0.6574216743708269, + "grad_norm": 0.568854808807373, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.5372, + "step": 440 + }, + { + "epoch": 0.6589158145398515, + "grad_norm": 0.3843095898628235, + "learning_rate": 3.8975e-05, + "loss": 1.6467, + "step": 441 + }, + { + "epoch": 0.6604099547088761, + "grad_norm": 0.7436652183532715, + "learning_rate": 3.8950000000000005e-05, + "loss": 1.5655, + "step": 442 + }, + { + "epoch": 0.6619040948779007, + "grad_norm": 0.6002988219261169, + "learning_rate": 3.8925e-05, + "loss": 1.6488, + "step": 443 + }, + { + "epoch": 0.6633982350469253, + "grad_norm": 0.9423897862434387, + "learning_rate": 3.8900000000000004e-05, + "loss": 1.4125, + "step": 444 + }, + { + "epoch": 0.6648923752159499, + "grad_norm": 0.6755229234695435, + "learning_rate": 3.8875e-05, + "loss": 1.4681, + "step": 445 + }, + { + "epoch": 0.6663865153849745, + "grad_norm": 0.7312940955162048, + "learning_rate": 3.885e-05, + "loss": 1.8124, + "step": 446 + }, + { + "epoch": 0.6678806555539991, + "grad_norm": 0.48998695611953735, + "learning_rate": 3.8825e-05, + "loss": 1.5058, + "step": 447 + }, + { + "epoch": 0.6693747957230237, + "grad_norm": 0.41998910903930664, + "learning_rate": 3.88e-05, + "loss": 1.3735, + "step": 448 + }, + { + "epoch": 0.6708689358920483, + "grad_norm": 0.5682637095451355, + "learning_rate": 3.8775e-05, + "loss": 1.5505, + "step": 449 + }, + { + "epoch": 0.6723630760610729, + "grad_norm": 0.6611406803131104, + "learning_rate": 3.875e-05, + "loss": 1.6073, + "step": 450 + }, + { + "epoch": 0.6738572162300975, + "grad_norm": 0.3332504332065582, + "learning_rate": 3.8725e-05, + "loss": 1.4912, + "step": 451 + }, + { + "epoch": 0.6753513563991221, + "grad_norm": 0.3210924565792084, + "learning_rate": 3.8700000000000006e-05, + "loss": 1.4406, + "step": 452 + }, + { + "epoch": 0.6768454965681469, + "grad_norm": 0.5756349563598633, + "learning_rate": 3.8675e-05, + "loss": 1.5155, + "step": 453 + }, + { + "epoch": 0.6783396367371715, + "grad_norm": 0.33762139081954956, + "learning_rate": 3.8650000000000004e-05, + "loss": 1.577, + "step": 454 + }, + { + "epoch": 0.6798337769061961, + "grad_norm": 0.39244896173477173, + "learning_rate": 3.8625e-05, + "loss": 1.4908, + "step": 455 + }, + { + "epoch": 0.6813279170752207, + "grad_norm": 0.270628422498703, + "learning_rate": 3.86e-05, + "loss": 1.409, + "step": 456 + }, + { + "epoch": 0.6828220572442453, + "grad_norm": 0.3090542256832123, + "learning_rate": 3.8575e-05, + "loss": 1.5653, + "step": 457 + }, + { + "epoch": 0.6843161974132699, + "grad_norm": 0.46093299984931946, + "learning_rate": 3.855e-05, + "loss": 1.6372, + "step": 458 + }, + { + "epoch": 0.6858103375822945, + "grad_norm": 0.6079492568969727, + "learning_rate": 3.8525e-05, + "loss": 1.4893, + "step": 459 + }, + { + "epoch": 0.6873044777513191, + "grad_norm": 1.2198094129562378, + "learning_rate": 3.85e-05, + "loss": 1.5613, + "step": 460 + }, + { + "epoch": 0.6887986179203437, + "grad_norm": 0.3242771029472351, + "learning_rate": 3.8475e-05, + "loss": 1.4498, + "step": 461 + }, + { + "epoch": 0.6902927580893683, + "grad_norm": 0.2971858084201813, + "learning_rate": 3.845e-05, + "loss": 1.5638, + "step": 462 + }, + { + "epoch": 0.6917868982583929, + "grad_norm": 0.2709638178348541, + "learning_rate": 3.8425e-05, + "loss": 1.3314, + "step": 463 + }, + { + "epoch": 0.6932810384274175, + "grad_norm": 0.25972089171409607, + "learning_rate": 3.8400000000000005e-05, + "loss": 1.3006, + "step": 464 + }, + { + "epoch": 0.6947751785964421, + "grad_norm": 0.2617763876914978, + "learning_rate": 3.8375e-05, + "loss": 1.3331, + "step": 465 + }, + { + "epoch": 0.6962693187654667, + "grad_norm": 0.5027948021888733, + "learning_rate": 3.8350000000000004e-05, + "loss": 1.524, + "step": 466 + }, + { + "epoch": 0.6977634589344913, + "grad_norm": 0.637778639793396, + "learning_rate": 3.8324999999999996e-05, + "loss": 1.376, + "step": 467 + }, + { + "epoch": 0.6992575991035159, + "grad_norm": 0.315097451210022, + "learning_rate": 3.83e-05, + "loss": 1.4328, + "step": 468 + }, + { + "epoch": 0.7007517392725405, + "grad_norm": 0.311083048582077, + "learning_rate": 3.8275e-05, + "loss": 1.3065, + "step": 469 + }, + { + "epoch": 0.7022458794415651, + "grad_norm": 0.39416611194610596, + "learning_rate": 3.825e-05, + "loss": 1.5988, + "step": 470 + }, + { + "epoch": 0.7037400196105897, + "grad_norm": 0.33352068066596985, + "learning_rate": 3.8225e-05, + "loss": 1.5433, + "step": 471 + }, + { + "epoch": 0.7052341597796143, + "grad_norm": 0.9079835414886475, + "learning_rate": 3.82e-05, + "loss": 1.6284, + "step": 472 + }, + { + "epoch": 0.706728299948639, + "grad_norm": 0.370746910572052, + "learning_rate": 3.8175e-05, + "loss": 1.3519, + "step": 473 + }, + { + "epoch": 0.7082224401176636, + "grad_norm": 0.6235456466674805, + "learning_rate": 3.8150000000000006e-05, + "loss": 1.6093, + "step": 474 + }, + { + "epoch": 0.7097165802866882, + "grad_norm": 0.3454919457435608, + "learning_rate": 3.8125e-05, + "loss": 1.4376, + "step": 475 + }, + { + "epoch": 0.7112107204557128, + "grad_norm": 1.0801193714141846, + "learning_rate": 3.8100000000000005e-05, + "loss": 1.4585, + "step": 476 + }, + { + "epoch": 0.7127048606247374, + "grad_norm": 0.30193617939949036, + "learning_rate": 3.8075e-05, + "loss": 1.5557, + "step": 477 + }, + { + "epoch": 0.714199000793762, + "grad_norm": 0.8482101559638977, + "learning_rate": 3.805e-05, + "loss": 1.4223, + "step": 478 + }, + { + "epoch": 0.7156931409627866, + "grad_norm": 0.35040053725242615, + "learning_rate": 3.8025e-05, + "loss": 1.5185, + "step": 479 + }, + { + "epoch": 0.7171872811318112, + "grad_norm": 0.468925803899765, + "learning_rate": 3.8e-05, + "loss": 1.4051, + "step": 480 + }, + { + "epoch": 0.7186814213008358, + "grad_norm": 0.2951725423336029, + "learning_rate": 3.7975e-05, + "loss": 1.4082, + "step": 481 + }, + { + "epoch": 0.7201755614698604, + "grad_norm": 0.2716180086135864, + "learning_rate": 3.795e-05, + "loss": 1.3357, + "step": 482 + }, + { + "epoch": 0.721669701638885, + "grad_norm": 0.6728554368019104, + "learning_rate": 3.7925e-05, + "loss": 1.3553, + "step": 483 + }, + { + "epoch": 0.7231638418079096, + "grad_norm": 0.42035940289497375, + "learning_rate": 3.79e-05, + "loss": 1.382, + "step": 484 + }, + { + "epoch": 0.7246579819769342, + "grad_norm": 0.5181907415390015, + "learning_rate": 3.7875e-05, + "loss": 1.5328, + "step": 485 + }, + { + "epoch": 0.7261521221459588, + "grad_norm": 0.30366700887680054, + "learning_rate": 3.7850000000000005e-05, + "loss": 1.4639, + "step": 486 + }, + { + "epoch": 0.7276462623149834, + "grad_norm": 0.30150046944618225, + "learning_rate": 3.7825e-05, + "loss": 1.336, + "step": 487 + }, + { + "epoch": 0.729140402484008, + "grad_norm": 1.3998833894729614, + "learning_rate": 3.7800000000000004e-05, + "loss": 1.7007, + "step": 488 + }, + { + "epoch": 0.7306345426530326, + "grad_norm": 0.3373676836490631, + "learning_rate": 3.7775e-05, + "loss": 1.289, + "step": 489 + }, + { + "epoch": 0.7321286828220572, + "grad_norm": 0.43975895643234253, + "learning_rate": 3.775e-05, + "loss": 1.6339, + "step": 490 + }, + { + "epoch": 0.7336228229910818, + "grad_norm": 0.2881298363208771, + "learning_rate": 3.7725e-05, + "loss": 1.6197, + "step": 491 + }, + { + "epoch": 0.7351169631601064, + "grad_norm": 0.2758522033691406, + "learning_rate": 3.77e-05, + "loss": 1.4038, + "step": 492 + }, + { + "epoch": 0.736611103329131, + "grad_norm": 0.2488110512495041, + "learning_rate": 3.7675e-05, + "loss": 1.4352, + "step": 493 + }, + { + "epoch": 0.7381052434981556, + "grad_norm": 0.2574450671672821, + "learning_rate": 3.765e-05, + "loss": 1.2815, + "step": 494 + }, + { + "epoch": 0.7395993836671803, + "grad_norm": 0.38335368037223816, + "learning_rate": 3.7625e-05, + "loss": 1.2665, + "step": 495 + }, + { + "epoch": 0.7410935238362049, + "grad_norm": 0.3784824311733246, + "learning_rate": 3.76e-05, + "loss": 1.6348, + "step": 496 + }, + { + "epoch": 0.7425876640052295, + "grad_norm": 0.32331913709640503, + "learning_rate": 3.7575e-05, + "loss": 1.4485, + "step": 497 + }, + { + "epoch": 0.7440818041742541, + "grad_norm": 0.636945903301239, + "learning_rate": 3.7550000000000005e-05, + "loss": 1.474, + "step": 498 + }, + { + "epoch": 0.7455759443432787, + "grad_norm": 0.4890618920326233, + "learning_rate": 3.7525e-05, + "loss": 1.4858, + "step": 499 + }, + { + "epoch": 0.7470700845123033, + "grad_norm": 1.4802592992782593, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.5492, + "step": 500 + }, + { + "epoch": 0.7485642246813279, + "grad_norm": 0.8308455348014832, + "learning_rate": 3.7475e-05, + "loss": 1.4273, + "step": 501 + }, + { + "epoch": 0.7500583648503525, + "grad_norm": 0.3105123043060303, + "learning_rate": 3.745e-05, + "loss": 1.6754, + "step": 502 + }, + { + "epoch": 0.7515525050193771, + "grad_norm": 0.3114786744117737, + "learning_rate": 3.7425e-05, + "loss": 1.4259, + "step": 503 + }, + { + "epoch": 0.7530466451884017, + "grad_norm": 0.4946189224720001, + "learning_rate": 3.74e-05, + "loss": 1.5702, + "step": 504 + }, + { + "epoch": 0.7545407853574263, + "grad_norm": 0.3033464550971985, + "learning_rate": 3.737500000000001e-05, + "loss": 1.509, + "step": 505 + }, + { + "epoch": 0.7560349255264509, + "grad_norm": 0.3518037796020508, + "learning_rate": 3.735e-05, + "loss": 1.555, + "step": 506 + }, + { + "epoch": 0.7575290656954755, + "grad_norm": 4.805240631103516, + "learning_rate": 3.7325000000000006e-05, + "loss": 1.4278, + "step": 507 + }, + { + "epoch": 0.7590232058645001, + "grad_norm": 0.2884902358055115, + "learning_rate": 3.73e-05, + "loss": 1.3305, + "step": 508 + }, + { + "epoch": 0.7605173460335247, + "grad_norm": 0.48153796792030334, + "learning_rate": 3.7275000000000005e-05, + "loss": 1.5316, + "step": 509 + }, + { + "epoch": 0.7620114862025493, + "grad_norm": 0.4066573977470398, + "learning_rate": 3.7250000000000004e-05, + "loss": 1.7447, + "step": 510 + }, + { + "epoch": 0.7635056263715739, + "grad_norm": 0.35123422741889954, + "learning_rate": 3.7225000000000004e-05, + "loss": 1.4725, + "step": 511 + }, + { + "epoch": 0.7649997665405985, + "grad_norm": 4.914884090423584, + "learning_rate": 3.72e-05, + "loss": 1.4226, + "step": 512 + }, + { + "epoch": 0.7664939067096233, + "grad_norm": 0.6892772912979126, + "learning_rate": 3.7175e-05, + "loss": 1.4583, + "step": 513 + }, + { + "epoch": 0.7679880468786479, + "grad_norm": 0.7510517239570618, + "learning_rate": 3.715e-05, + "loss": 1.5202, + "step": 514 + }, + { + "epoch": 0.7694821870476725, + "grad_norm": 0.3141719102859497, + "learning_rate": 3.7125e-05, + "loss": 1.4267, + "step": 515 + }, + { + "epoch": 0.7709763272166971, + "grad_norm": 0.36383432149887085, + "learning_rate": 3.71e-05, + "loss": 1.6657, + "step": 516 + }, + { + "epoch": 0.7724704673857217, + "grad_norm": 0.30458924174308777, + "learning_rate": 3.707500000000001e-05, + "loss": 1.3829, + "step": 517 + }, + { + "epoch": 0.7739646075547463, + "grad_norm": 0.4784793257713318, + "learning_rate": 3.705e-05, + "loss": 1.3675, + "step": 518 + }, + { + "epoch": 0.7754587477237709, + "grad_norm": 0.6077926754951477, + "learning_rate": 3.7025000000000005e-05, + "loss": 1.4977, + "step": 519 + }, + { + "epoch": 0.7769528878927955, + "grad_norm": 0.5152701139450073, + "learning_rate": 3.7e-05, + "loss": 1.6965, + "step": 520 + }, + { + "epoch": 0.7784470280618201, + "grad_norm": 0.2685502767562866, + "learning_rate": 3.6975000000000004e-05, + "loss": 1.3124, + "step": 521 + }, + { + "epoch": 0.7799411682308447, + "grad_norm": 0.3213263750076294, + "learning_rate": 3.6950000000000004e-05, + "loss": 1.4155, + "step": 522 + }, + { + "epoch": 0.7814353083998693, + "grad_norm": 0.5548874735832214, + "learning_rate": 3.6925e-05, + "loss": 1.302, + "step": 523 + }, + { + "epoch": 0.7829294485688939, + "grad_norm": 0.3476009666919708, + "learning_rate": 3.69e-05, + "loss": 1.593, + "step": 524 + }, + { + "epoch": 0.7844235887379185, + "grad_norm": 0.45263391733169556, + "learning_rate": 3.6875e-05, + "loss": 1.5412, + "step": 525 + }, + { + "epoch": 0.7859177289069431, + "grad_norm": 0.3879849910736084, + "learning_rate": 3.685e-05, + "loss": 1.4468, + "step": 526 + }, + { + "epoch": 0.7874118690759677, + "grad_norm": 0.3185078203678131, + "learning_rate": 3.6825e-05, + "loss": 1.4325, + "step": 527 + }, + { + "epoch": 0.7889060092449923, + "grad_norm": 0.3549663722515106, + "learning_rate": 3.68e-05, + "loss": 1.5194, + "step": 528 + }, + { + "epoch": 0.7904001494140169, + "grad_norm": 0.7724360823631287, + "learning_rate": 3.6775000000000006e-05, + "loss": 1.5654, + "step": 529 + }, + { + "epoch": 0.7918942895830415, + "grad_norm": 0.48684829473495483, + "learning_rate": 3.675e-05, + "loss": 1.4374, + "step": 530 + }, + { + "epoch": 0.7933884297520661, + "grad_norm": 0.35977497696876526, + "learning_rate": 3.6725000000000005e-05, + "loss": 1.4841, + "step": 531 + }, + { + "epoch": 0.7948825699210907, + "grad_norm": 0.5556880831718445, + "learning_rate": 3.6700000000000004e-05, + "loss": 1.575, + "step": 532 + }, + { + "epoch": 0.7963767100901153, + "grad_norm": 0.3836474120616913, + "learning_rate": 3.6675000000000004e-05, + "loss": 1.4394, + "step": 533 + }, + { + "epoch": 0.79787085025914, + "grad_norm": 0.606639564037323, + "learning_rate": 3.665e-05, + "loss": 1.4676, + "step": 534 + }, + { + "epoch": 0.7993649904281646, + "grad_norm": 0.37593191862106323, + "learning_rate": 3.6625e-05, + "loss": 1.4674, + "step": 535 + }, + { + "epoch": 0.8008591305971892, + "grad_norm": 0.33318042755126953, + "learning_rate": 3.66e-05, + "loss": 1.3682, + "step": 536 + }, + { + "epoch": 0.8023532707662138, + "grad_norm": 0.427814781665802, + "learning_rate": 3.6575e-05, + "loss": 1.5771, + "step": 537 + }, + { + "epoch": 0.8038474109352384, + "grad_norm": 0.5013397336006165, + "learning_rate": 3.655e-05, + "loss": 1.5273, + "step": 538 + }, + { + "epoch": 0.805341551104263, + "grad_norm": 0.333675742149353, + "learning_rate": 3.652500000000001e-05, + "loss": 1.3863, + "step": 539 + }, + { + "epoch": 0.8068356912732876, + "grad_norm": 0.27623650431632996, + "learning_rate": 3.65e-05, + "loss": 1.4684, + "step": 540 + }, + { + "epoch": 0.8083298314423122, + "grad_norm": 0.8387918472290039, + "learning_rate": 3.6475000000000006e-05, + "loss": 1.5025, + "step": 541 + }, + { + "epoch": 0.8098239716113368, + "grad_norm": 0.39421287178993225, + "learning_rate": 3.645e-05, + "loss": 1.5331, + "step": 542 + }, + { + "epoch": 0.8113181117803614, + "grad_norm": 0.44059473276138306, + "learning_rate": 3.6425000000000004e-05, + "loss": 1.5072, + "step": 543 + }, + { + "epoch": 0.812812251949386, + "grad_norm": 0.4156414866447449, + "learning_rate": 3.6400000000000004e-05, + "loss": 1.3086, + "step": 544 + }, + { + "epoch": 0.8143063921184106, + "grad_norm": 0.4378097355365753, + "learning_rate": 3.6375e-05, + "loss": 1.2068, + "step": 545 + }, + { + "epoch": 0.8158005322874352, + "grad_norm": 0.45686447620391846, + "learning_rate": 3.635e-05, + "loss": 1.4368, + "step": 546 + }, + { + "epoch": 0.8172946724564598, + "grad_norm": 0.3088967502117157, + "learning_rate": 3.6325e-05, + "loss": 1.5133, + "step": 547 + }, + { + "epoch": 0.8187888126254844, + "grad_norm": 0.28649911284446716, + "learning_rate": 3.63e-05, + "loss": 1.5194, + "step": 548 + }, + { + "epoch": 0.820282952794509, + "grad_norm": 0.28436142206192017, + "learning_rate": 3.6275e-05, + "loss": 1.3408, + "step": 549 + }, + { + "epoch": 0.8217770929635336, + "grad_norm": 0.4080042243003845, + "learning_rate": 3.625e-05, + "loss": 1.5412, + "step": 550 + }, + { + "epoch": 0.8232712331325582, + "grad_norm": 0.47546809911727905, + "learning_rate": 3.6225000000000006e-05, + "loss": 1.6426, + "step": 551 + }, + { + "epoch": 0.8247653733015828, + "grad_norm": 0.30255812406539917, + "learning_rate": 3.62e-05, + "loss": 1.2971, + "step": 552 + }, + { + "epoch": 0.8262595134706074, + "grad_norm": 0.526885986328125, + "learning_rate": 3.6175000000000005e-05, + "loss": 1.4314, + "step": 553 + }, + { + "epoch": 0.827753653639632, + "grad_norm": 0.3695448040962219, + "learning_rate": 3.615e-05, + "loss": 1.5206, + "step": 554 + }, + { + "epoch": 0.8292477938086567, + "grad_norm": 0.36277472972869873, + "learning_rate": 3.6125000000000004e-05, + "loss": 1.4134, + "step": 555 + }, + { + "epoch": 0.8307419339776813, + "grad_norm": 0.2916204631328583, + "learning_rate": 3.61e-05, + "loss": 1.4018, + "step": 556 + }, + { + "epoch": 0.8322360741467059, + "grad_norm": 0.26475563645362854, + "learning_rate": 3.6075e-05, + "loss": 1.3982, + "step": 557 + }, + { + "epoch": 0.8337302143157305, + "grad_norm": 0.35685673356056213, + "learning_rate": 3.605e-05, + "loss": 1.4475, + "step": 558 + }, + { + "epoch": 0.8352243544847551, + "grad_norm": 0.3294183909893036, + "learning_rate": 3.6025e-05, + "loss": 1.263, + "step": 559 + }, + { + "epoch": 0.8367184946537797, + "grad_norm": 0.3987225890159607, + "learning_rate": 3.6e-05, + "loss": 1.3704, + "step": 560 + }, + { + "epoch": 0.8382126348228043, + "grad_norm": 0.40317440032958984, + "learning_rate": 3.5975e-05, + "loss": 1.5723, + "step": 561 + }, + { + "epoch": 0.8397067749918289, + "grad_norm": 0.3420385718345642, + "learning_rate": 3.595e-05, + "loss": 1.5984, + "step": 562 + }, + { + "epoch": 0.8412009151608535, + "grad_norm": 0.3810725510120392, + "learning_rate": 3.5925000000000006e-05, + "loss": 1.4563, + "step": 563 + }, + { + "epoch": 0.8426950553298781, + "grad_norm": 0.5123723745346069, + "learning_rate": 3.59e-05, + "loss": 1.4666, + "step": 564 + }, + { + "epoch": 0.8441891954989027, + "grad_norm": 0.21530090272426605, + "learning_rate": 3.5875000000000005e-05, + "loss": 1.3619, + "step": 565 + }, + { + "epoch": 0.8456833356679273, + "grad_norm": 1.0108494758605957, + "learning_rate": 3.585e-05, + "loss": 1.4085, + "step": 566 + }, + { + "epoch": 0.8471774758369519, + "grad_norm": 0.47152891755104065, + "learning_rate": 3.5825000000000003e-05, + "loss": 1.4704, + "step": 567 + }, + { + "epoch": 0.8486716160059765, + "grad_norm": 0.3395189046859741, + "learning_rate": 3.58e-05, + "loss": 1.468, + "step": 568 + }, + { + "epoch": 0.8501657561750011, + "grad_norm": 0.2822551727294922, + "learning_rate": 3.5775e-05, + "loss": 1.2872, + "step": 569 + }, + { + "epoch": 0.8516598963440257, + "grad_norm": 0.6422669887542725, + "learning_rate": 3.575e-05, + "loss": 1.3155, + "step": 570 + }, + { + "epoch": 0.8531540365130503, + "grad_norm": 0.45001545548439026, + "learning_rate": 3.5725e-05, + "loss": 1.4375, + "step": 571 + }, + { + "epoch": 0.8546481766820749, + "grad_norm": 0.26399505138397217, + "learning_rate": 3.57e-05, + "loss": 1.394, + "step": 572 + }, + { + "epoch": 0.8561423168510995, + "grad_norm": 0.3477565050125122, + "learning_rate": 3.5675e-05, + "loss": 1.3054, + "step": 573 + }, + { + "epoch": 0.8576364570201243, + "grad_norm": 0.33781954646110535, + "learning_rate": 3.565e-05, + "loss": 1.3096, + "step": 574 + }, + { + "epoch": 0.8591305971891489, + "grad_norm": 0.2942800521850586, + "learning_rate": 3.5625000000000005e-05, + "loss": 1.4921, + "step": 575 + }, + { + "epoch": 0.8606247373581735, + "grad_norm": 0.47856467962265015, + "learning_rate": 3.56e-05, + "loss": 1.2843, + "step": 576 + }, + { + "epoch": 0.8621188775271981, + "grad_norm": 0.369923859834671, + "learning_rate": 3.5575000000000004e-05, + "loss": 1.5034, + "step": 577 + }, + { + "epoch": 0.8636130176962227, + "grad_norm": 0.27429136633872986, + "learning_rate": 3.555e-05, + "loss": 1.3235, + "step": 578 + }, + { + "epoch": 0.8651071578652473, + "grad_norm": 0.42868244647979736, + "learning_rate": 3.5525e-05, + "loss": 1.437, + "step": 579 + }, + { + "epoch": 0.8666012980342719, + "grad_norm": 0.24422957003116608, + "learning_rate": 3.55e-05, + "loss": 1.2913, + "step": 580 + }, + { + "epoch": 0.8680954382032965, + "grad_norm": 0.34310224652290344, + "learning_rate": 3.5475e-05, + "loss": 1.3075, + "step": 581 + }, + { + "epoch": 0.8695895783723211, + "grad_norm": 0.436494380235672, + "learning_rate": 3.545e-05, + "loss": 1.5981, + "step": 582 + }, + { + "epoch": 0.8710837185413457, + "grad_norm": 0.2664638161659241, + "learning_rate": 3.5425e-05, + "loss": 1.3887, + "step": 583 + }, + { + "epoch": 0.8725778587103703, + "grad_norm": 0.24781467020511627, + "learning_rate": 3.54e-05, + "loss": 1.3543, + "step": 584 + }, + { + "epoch": 0.8740719988793949, + "grad_norm": 0.2947968542575836, + "learning_rate": 3.5375e-05, + "loss": 1.3933, + "step": 585 + }, + { + "epoch": 0.8755661390484195, + "grad_norm": 0.3375110328197479, + "learning_rate": 3.535e-05, + "loss": 1.4935, + "step": 586 + }, + { + "epoch": 0.8770602792174441, + "grad_norm": 0.3563228249549866, + "learning_rate": 3.5325000000000005e-05, + "loss": 1.5311, + "step": 587 + }, + { + "epoch": 0.8785544193864687, + "grad_norm": 0.38266658782958984, + "learning_rate": 3.53e-05, + "loss": 1.4301, + "step": 588 + }, + { + "epoch": 0.8800485595554933, + "grad_norm": 0.2319614440202713, + "learning_rate": 3.5275000000000004e-05, + "loss": 1.3498, + "step": 589 + }, + { + "epoch": 0.8815426997245179, + "grad_norm": 0.4004661440849304, + "learning_rate": 3.525e-05, + "loss": 1.5444, + "step": 590 + }, + { + "epoch": 0.8830368398935425, + "grad_norm": 0.25451788306236267, + "learning_rate": 3.5225e-05, + "loss": 1.4218, + "step": 591 + }, + { + "epoch": 0.8845309800625671, + "grad_norm": 0.46508535742759705, + "learning_rate": 3.52e-05, + "loss": 1.426, + "step": 592 + }, + { + "epoch": 0.8860251202315917, + "grad_norm": 0.45118725299835205, + "learning_rate": 3.5175e-05, + "loss": 1.4752, + "step": 593 + }, + { + "epoch": 0.8875192604006163, + "grad_norm": 0.30142486095428467, + "learning_rate": 3.515e-05, + "loss": 1.4359, + "step": 594 + }, + { + "epoch": 0.889013400569641, + "grad_norm": 0.38366007804870605, + "learning_rate": 3.5125e-05, + "loss": 1.3149, + "step": 595 + }, + { + "epoch": 0.8905075407386656, + "grad_norm": 0.275544136762619, + "learning_rate": 3.51e-05, + "loss": 1.2721, + "step": 596 + }, + { + "epoch": 0.8920016809076902, + "grad_norm": 0.3999973237514496, + "learning_rate": 3.5075000000000006e-05, + "loss": 1.4521, + "step": 597 + }, + { + "epoch": 0.8934958210767148, + "grad_norm": 0.22361697256565094, + "learning_rate": 3.505e-05, + "loss": 1.3303, + "step": 598 + }, + { + "epoch": 0.8949899612457394, + "grad_norm": 0.3005754351615906, + "learning_rate": 3.5025000000000004e-05, + "loss": 1.2179, + "step": 599 + }, + { + "epoch": 0.896484101414764, + "grad_norm": 0.3509037494659424, + "learning_rate": 3.5e-05, + "loss": 1.3804, + "step": 600 + }, + { + "epoch": 0.8979782415837886, + "grad_norm": 0.32248327136039734, + "learning_rate": 3.4975e-05, + "loss": 1.4697, + "step": 601 + }, + { + "epoch": 0.8994723817528132, + "grad_norm": 0.6695634126663208, + "learning_rate": 3.495e-05, + "loss": 1.5914, + "step": 602 + }, + { + "epoch": 0.9009665219218378, + "grad_norm": 0.39635902643203735, + "learning_rate": 3.4925e-05, + "loss": 1.1656, + "step": 603 + }, + { + "epoch": 0.9024606620908624, + "grad_norm": 0.44676870107650757, + "learning_rate": 3.49e-05, + "loss": 1.277, + "step": 604 + }, + { + "epoch": 0.903954802259887, + "grad_norm": 0.2875045835971832, + "learning_rate": 3.4875e-05, + "loss": 1.464, + "step": 605 + }, + { + "epoch": 0.9054489424289116, + "grad_norm": 0.4377475082874298, + "learning_rate": 3.485e-05, + "loss": 1.3933, + "step": 606 + }, + { + "epoch": 0.9069430825979362, + "grad_norm": 0.38153204321861267, + "learning_rate": 3.4825e-05, + "loss": 1.3474, + "step": 607 + }, + { + "epoch": 0.9084372227669608, + "grad_norm": 0.3615608215332031, + "learning_rate": 3.48e-05, + "loss": 1.3261, + "step": 608 + }, + { + "epoch": 0.9099313629359854, + "grad_norm": 0.4355512857437134, + "learning_rate": 3.4775000000000005e-05, + "loss": 1.4538, + "step": 609 + }, + { + "epoch": 0.91142550310501, + "grad_norm": 0.30448365211486816, + "learning_rate": 3.475e-05, + "loss": 1.5085, + "step": 610 + }, + { + "epoch": 0.9129196432740346, + "grad_norm": 0.2584160566329956, + "learning_rate": 3.4725000000000004e-05, + "loss": 1.2786, + "step": 611 + }, + { + "epoch": 0.9144137834430592, + "grad_norm": 0.2645299434661865, + "learning_rate": 3.4699999999999996e-05, + "loss": 1.4541, + "step": 612 + }, + { + "epoch": 0.9159079236120838, + "grad_norm": 0.29261913895606995, + "learning_rate": 3.4675e-05, + "loss": 1.4672, + "step": 613 + }, + { + "epoch": 0.9174020637811084, + "grad_norm": 0.33550161123275757, + "learning_rate": 3.465e-05, + "loss": 1.3547, + "step": 614 + }, + { + "epoch": 0.918896203950133, + "grad_norm": 1.3263543844223022, + "learning_rate": 3.4625e-05, + "loss": 1.4612, + "step": 615 + }, + { + "epoch": 0.9203903441191577, + "grad_norm": 0.28431329131126404, + "learning_rate": 3.46e-05, + "loss": 1.5397, + "step": 616 + }, + { + "epoch": 0.9218844842881823, + "grad_norm": 0.6457627415657043, + "learning_rate": 3.4575e-05, + "loss": 1.4713, + "step": 617 + }, + { + "epoch": 0.9233786244572069, + "grad_norm": 1.0010414123535156, + "learning_rate": 3.455e-05, + "loss": 1.5088, + "step": 618 + }, + { + "epoch": 0.9248727646262315, + "grad_norm": 0.3725411891937256, + "learning_rate": 3.4525e-05, + "loss": 1.3313, + "step": 619 + }, + { + "epoch": 0.9263669047952561, + "grad_norm": 0.4637717604637146, + "learning_rate": 3.45e-05, + "loss": 1.3926, + "step": 620 + }, + { + "epoch": 0.9278610449642807, + "grad_norm": 0.591336190700531, + "learning_rate": 3.4475000000000005e-05, + "loss": 1.3868, + "step": 621 + }, + { + "epoch": 0.9293551851333053, + "grad_norm": 0.44118255376815796, + "learning_rate": 3.445e-05, + "loss": 1.3341, + "step": 622 + }, + { + "epoch": 0.9308493253023299, + "grad_norm": 0.8001243472099304, + "learning_rate": 3.4425e-05, + "loss": 1.4864, + "step": 623 + }, + { + "epoch": 0.9323434654713545, + "grad_norm": 0.23716165125370026, + "learning_rate": 3.4399999999999996e-05, + "loss": 1.2912, + "step": 624 + }, + { + "epoch": 0.9338376056403791, + "grad_norm": 0.2823966443538666, + "learning_rate": 3.4375e-05, + "loss": 1.3462, + "step": 625 + }, + { + "epoch": 0.9353317458094037, + "grad_norm": 0.4168063700199127, + "learning_rate": 3.435e-05, + "loss": 1.383, + "step": 626 + }, + { + "epoch": 0.9368258859784283, + "grad_norm": 0.3376123905181885, + "learning_rate": 3.4325e-05, + "loss": 1.2524, + "step": 627 + }, + { + "epoch": 0.9383200261474529, + "grad_norm": 0.5403313040733337, + "learning_rate": 3.430000000000001e-05, + "loss": 1.3068, + "step": 628 + }, + { + "epoch": 0.9398141663164775, + "grad_norm": 0.36801594495773315, + "learning_rate": 3.4275e-05, + "loss": 1.3774, + "step": 629 + }, + { + "epoch": 0.9413083064855021, + "grad_norm": 0.31388500332832336, + "learning_rate": 3.4250000000000006e-05, + "loss": 1.3882, + "step": 630 + }, + { + "epoch": 0.9428024466545267, + "grad_norm": 0.6890659332275391, + "learning_rate": 3.4225e-05, + "loss": 1.3958, + "step": 631 + }, + { + "epoch": 0.9442965868235513, + "grad_norm": 0.3605225086212158, + "learning_rate": 3.4200000000000005e-05, + "loss": 1.4728, + "step": 632 + }, + { + "epoch": 0.9457907269925759, + "grad_norm": 0.3409678041934967, + "learning_rate": 3.4175000000000004e-05, + "loss": 1.2731, + "step": 633 + }, + { + "epoch": 0.9472848671616007, + "grad_norm": 0.22081942856311798, + "learning_rate": 3.415e-05, + "loss": 1.4976, + "step": 634 + }, + { + "epoch": 0.9487790073306253, + "grad_norm": 0.2751425504684448, + "learning_rate": 3.4125e-05, + "loss": 1.395, + "step": 635 + }, + { + "epoch": 0.9502731474996499, + "grad_norm": 0.4033777117729187, + "learning_rate": 3.41e-05, + "loss": 1.4546, + "step": 636 + }, + { + "epoch": 0.9517672876686745, + "grad_norm": 0.7273349761962891, + "learning_rate": 3.4075e-05, + "loss": 1.4307, + "step": 637 + }, + { + "epoch": 0.9532614278376991, + "grad_norm": 0.4339013993740082, + "learning_rate": 3.405e-05, + "loss": 1.4301, + "step": 638 + }, + { + "epoch": 0.9547555680067237, + "grad_norm": 0.3764870762825012, + "learning_rate": 3.4025e-05, + "loss": 1.2367, + "step": 639 + }, + { + "epoch": 0.9562497081757483, + "grad_norm": 0.468927264213562, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.1901, + "step": 640 + }, + { + "epoch": 0.9577438483447729, + "grad_norm": 0.25675371289253235, + "learning_rate": 3.3975e-05, + "loss": 1.1584, + "step": 641 + }, + { + "epoch": 0.9592379885137975, + "grad_norm": 0.40362444519996643, + "learning_rate": 3.3950000000000005e-05, + "loss": 1.4858, + "step": 642 + }, + { + "epoch": 0.9607321286828221, + "grad_norm": 0.38865911960601807, + "learning_rate": 3.3925e-05, + "loss": 1.4717, + "step": 643 + }, + { + "epoch": 0.9622262688518467, + "grad_norm": 0.4919620156288147, + "learning_rate": 3.3900000000000004e-05, + "loss": 1.3452, + "step": 644 + }, + { + "epoch": 0.9637204090208713, + "grad_norm": 0.2999139130115509, + "learning_rate": 3.3875000000000003e-05, + "loss": 1.3253, + "step": 645 + }, + { + "epoch": 0.9652145491898959, + "grad_norm": 0.3142748177051544, + "learning_rate": 3.385e-05, + "loss": 1.4305, + "step": 646 + }, + { + "epoch": 0.9667086893589205, + "grad_norm": 0.36084461212158203, + "learning_rate": 3.3825e-05, + "loss": 1.4515, + "step": 647 + }, + { + "epoch": 0.9682028295279451, + "grad_norm": 0.29961463809013367, + "learning_rate": 3.38e-05, + "loss": 1.5675, + "step": 648 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.26652055978775024, + "learning_rate": 3.3775e-05, + "loss": 1.4403, + "step": 649 + }, + { + "epoch": 0.9711911098659943, + "grad_norm": 0.601382315158844, + "learning_rate": 3.375000000000001e-05, + "loss": 1.4065, + "step": 650 + }, + { + "epoch": 0.9726852500350189, + "grad_norm": 0.2838383913040161, + "learning_rate": 3.3725e-05, + "loss": 1.3804, + "step": 651 + }, + { + "epoch": 0.9741793902040435, + "grad_norm": 0.3205418586730957, + "learning_rate": 3.3700000000000006e-05, + "loss": 1.2481, + "step": 652 + }, + { + "epoch": 0.9756735303730681, + "grad_norm": 0.28523319959640503, + "learning_rate": 3.3675e-05, + "loss": 1.5713, + "step": 653 + }, + { + "epoch": 0.9771676705420927, + "grad_norm": 0.3010360598564148, + "learning_rate": 3.3650000000000005e-05, + "loss": 1.5305, + "step": 654 + }, + { + "epoch": 0.9786618107111174, + "grad_norm": 0.22972212731838226, + "learning_rate": 3.3625000000000004e-05, + "loss": 1.362, + "step": 655 + }, + { + "epoch": 0.980155950880142, + "grad_norm": 0.31966134905815125, + "learning_rate": 3.3600000000000004e-05, + "loss": 1.3665, + "step": 656 + }, + { + "epoch": 0.9816500910491666, + "grad_norm": 0.39883023500442505, + "learning_rate": 3.3575e-05, + "loss": 1.4166, + "step": 657 + }, + { + "epoch": 0.9831442312181912, + "grad_norm": 0.33253997564315796, + "learning_rate": 3.355e-05, + "loss": 1.3789, + "step": 658 + }, + { + "epoch": 0.9846383713872158, + "grad_norm": 0.2807278633117676, + "learning_rate": 3.3525e-05, + "loss": 1.5887, + "step": 659 + }, + { + "epoch": 0.9861325115562404, + "grad_norm": 0.29328641295433044, + "learning_rate": 3.35e-05, + "loss": 1.2736, + "step": 660 + }, + { + "epoch": 0.987626651725265, + "grad_norm": 0.3489589989185333, + "learning_rate": 3.3475e-05, + "loss": 1.4426, + "step": 661 + }, + { + "epoch": 0.9891207918942896, + "grad_norm": 0.26897239685058594, + "learning_rate": 3.345000000000001e-05, + "loss": 1.5113, + "step": 662 + }, + { + "epoch": 0.9906149320633142, + "grad_norm": 0.3970909118652344, + "learning_rate": 3.3425e-05, + "loss": 1.4399, + "step": 663 + }, + { + "epoch": 0.9921090722323388, + "grad_norm": 0.31349503993988037, + "learning_rate": 3.3400000000000005e-05, + "loss": 1.4667, + "step": 664 + }, + { + "epoch": 0.9936032124013634, + "grad_norm": 0.4700935482978821, + "learning_rate": 3.3375e-05, + "loss": 1.3598, + "step": 665 + }, + { + "epoch": 0.995097352570388, + "grad_norm": 0.25302940607070923, + "learning_rate": 3.3350000000000004e-05, + "loss": 1.2526, + "step": 666 + }, + { + "epoch": 0.9965914927394126, + "grad_norm": 0.34383395314216614, + "learning_rate": 3.3325000000000004e-05, + "loss": 1.4012, + "step": 667 + }, + { + "epoch": 0.9980856329084372, + "grad_norm": 0.34091341495513916, + "learning_rate": 3.33e-05, + "loss": 1.3117, + "step": 668 + }, + { + "epoch": 0.9995797730774618, + "grad_norm": 0.3936751186847687, + "learning_rate": 3.3275e-05, + "loss": 1.3979, + "step": 669 + }, + { + "epoch": 1.0010739132464865, + "grad_norm": 0.3573870360851288, + "learning_rate": 3.325e-05, + "loss": 1.3728, + "step": 670 + }, + { + "epoch": 1.002568053415511, + "grad_norm": 0.23025058209896088, + "learning_rate": 3.3225e-05, + "loss": 1.4986, + "step": 671 + }, + { + "epoch": 1.0040621935845357, + "grad_norm": 0.30586880445480347, + "learning_rate": 3.32e-05, + "loss": 1.391, + "step": 672 + }, + { + "epoch": 1.0055563337535602, + "grad_norm": 0.27675920724868774, + "learning_rate": 3.3175e-05, + "loss": 1.2885, + "step": 673 + }, + { + "epoch": 1.007050473922585, + "grad_norm": 0.24607954919338226, + "learning_rate": 3.3150000000000006e-05, + "loss": 1.4229, + "step": 674 + }, + { + "epoch": 1.0085446140916094, + "grad_norm": 0.2698909640312195, + "learning_rate": 3.3125e-05, + "loss": 1.3594, + "step": 675 + }, + { + "epoch": 1.0100387542606342, + "grad_norm": 0.33533975481987, + "learning_rate": 3.3100000000000005e-05, + "loss": 1.3268, + "step": 676 + }, + { + "epoch": 1.0115328944296587, + "grad_norm": 0.3288819491863251, + "learning_rate": 3.3075e-05, + "loss": 1.5193, + "step": 677 + }, + { + "epoch": 1.0130270345986834, + "grad_norm": 0.49441808462142944, + "learning_rate": 3.3050000000000004e-05, + "loss": 1.4717, + "step": 678 + }, + { + "epoch": 1.0145211747677079, + "grad_norm": 0.47882768511772156, + "learning_rate": 3.3025e-05, + "loss": 1.3084, + "step": 679 + }, + { + "epoch": 1.0160153149367326, + "grad_norm": 0.3494391143321991, + "learning_rate": 3.3e-05, + "loss": 1.3419, + "step": 680 + }, + { + "epoch": 1.017509455105757, + "grad_norm": 0.41978195309638977, + "learning_rate": 3.2975e-05, + "loss": 1.5436, + "step": 681 + }, + { + "epoch": 1.0190035952747818, + "grad_norm": 0.25467464327812195, + "learning_rate": 3.295e-05, + "loss": 1.3994, + "step": 682 + }, + { + "epoch": 1.0204977354438063, + "grad_norm": 0.6259259581565857, + "learning_rate": 3.2925e-05, + "loss": 1.2119, + "step": 683 + }, + { + "epoch": 1.021991875612831, + "grad_norm": 0.3374781012535095, + "learning_rate": 3.29e-05, + "loss": 1.3365, + "step": 684 + }, + { + "epoch": 1.0234860157818555, + "grad_norm": 1.0253171920776367, + "learning_rate": 3.2875e-05, + "loss": 1.3974, + "step": 685 + }, + { + "epoch": 1.0249801559508802, + "grad_norm": 0.27831631898880005, + "learning_rate": 3.2850000000000006e-05, + "loss": 1.3793, + "step": 686 + }, + { + "epoch": 1.0264742961199047, + "grad_norm": 0.2690315842628479, + "learning_rate": 3.2825e-05, + "loss": 1.4334, + "step": 687 + }, + { + "epoch": 1.0279684362889294, + "grad_norm": 0.26815739274024963, + "learning_rate": 3.2800000000000004e-05, + "loss": 1.4584, + "step": 688 + }, + { + "epoch": 1.029462576457954, + "grad_norm": 0.829439640045166, + "learning_rate": 3.2775e-05, + "loss": 1.3885, + "step": 689 + }, + { + "epoch": 1.0309567166269786, + "grad_norm": 0.29329952597618103, + "learning_rate": 3.275e-05, + "loss": 1.5287, + "step": 690 + }, + { + "epoch": 1.0324508567960031, + "grad_norm": 0.2592604458332062, + "learning_rate": 3.2725e-05, + "loss": 1.3663, + "step": 691 + }, + { + "epoch": 1.0339449969650278, + "grad_norm": 0.4112248718738556, + "learning_rate": 3.27e-05, + "loss": 1.409, + "step": 692 + }, + { + "epoch": 1.0354391371340523, + "grad_norm": 0.4711347222328186, + "learning_rate": 3.2675e-05, + "loss": 1.5283, + "step": 693 + }, + { + "epoch": 1.036933277303077, + "grad_norm": 0.2854495048522949, + "learning_rate": 3.265e-05, + "loss": 1.1964, + "step": 694 + }, + { + "epoch": 1.0384274174721015, + "grad_norm": 0.3027067184448242, + "learning_rate": 3.2625e-05, + "loss": 1.3459, + "step": 695 + }, + { + "epoch": 1.0399215576411263, + "grad_norm": 0.2927369177341461, + "learning_rate": 3.26e-05, + "loss": 1.3208, + "step": 696 + }, + { + "epoch": 1.0414156978101508, + "grad_norm": 0.26334816217422485, + "learning_rate": 3.2575e-05, + "loss": 1.4442, + "step": 697 + }, + { + "epoch": 1.0429098379791755, + "grad_norm": 0.6484261155128479, + "learning_rate": 3.2550000000000005e-05, + "loss": 1.3031, + "step": 698 + }, + { + "epoch": 1.0444039781482, + "grad_norm": 0.32218441367149353, + "learning_rate": 3.2525e-05, + "loss": 1.2947, + "step": 699 + }, + { + "epoch": 1.0458981183172247, + "grad_norm": 1.0116149187088013, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.3189, + "step": 700 + }, + { + "epoch": 1.0473922584862492, + "grad_norm": 0.35297858715057373, + "learning_rate": 3.2474999999999997e-05, + "loss": 1.5605, + "step": 701 + }, + { + "epoch": 1.0488863986552739, + "grad_norm": 0.3126513659954071, + "learning_rate": 3.245e-05, + "loss": 1.5413, + "step": 702 + }, + { + "epoch": 1.0503805388242984, + "grad_norm": 0.25015923380851746, + "learning_rate": 3.2425e-05, + "loss": 1.3003, + "step": 703 + }, + { + "epoch": 1.051874678993323, + "grad_norm": 0.2285999357700348, + "learning_rate": 3.24e-05, + "loss": 1.3829, + "step": 704 + }, + { + "epoch": 1.0533688191623476, + "grad_norm": 0.2972055673599243, + "learning_rate": 3.2375e-05, + "loss": 1.5097, + "step": 705 + }, + { + "epoch": 1.0548629593313723, + "grad_norm": 0.3634691834449768, + "learning_rate": 3.235e-05, + "loss": 1.4746, + "step": 706 + }, + { + "epoch": 1.0563570995003968, + "grad_norm": 2.4061403274536133, + "learning_rate": 3.2325e-05, + "loss": 1.4028, + "step": 707 + }, + { + "epoch": 1.0578512396694215, + "grad_norm": 0.5329898595809937, + "learning_rate": 3.2300000000000006e-05, + "loss": 1.4579, + "step": 708 + }, + { + "epoch": 1.059345379838446, + "grad_norm": 0.28464484214782715, + "learning_rate": 3.2275e-05, + "loss": 1.5056, + "step": 709 + }, + { + "epoch": 1.0608395200074707, + "grad_norm": 0.39857912063598633, + "learning_rate": 3.2250000000000005e-05, + "loss": 1.4314, + "step": 710 + }, + { + "epoch": 1.0623336601764952, + "grad_norm": 0.2124282419681549, + "learning_rate": 3.2225e-05, + "loss": 1.3051, + "step": 711 + }, + { + "epoch": 1.06382780034552, + "grad_norm": 0.31750163435935974, + "learning_rate": 3.2200000000000003e-05, + "loss": 1.5166, + "step": 712 + }, + { + "epoch": 1.0653219405145444, + "grad_norm": 0.25398388504981995, + "learning_rate": 3.2175e-05, + "loss": 1.3385, + "step": 713 + }, + { + "epoch": 1.0668160806835691, + "grad_norm": 0.3630509674549103, + "learning_rate": 3.215e-05, + "loss": 1.2873, + "step": 714 + }, + { + "epoch": 1.0683102208525936, + "grad_norm": 0.3998902440071106, + "learning_rate": 3.2125e-05, + "loss": 1.4107, + "step": 715 + }, + { + "epoch": 1.0698043610216184, + "grad_norm": 0.2822936475276947, + "learning_rate": 3.21e-05, + "loss": 1.4899, + "step": 716 + }, + { + "epoch": 1.0712985011906428, + "grad_norm": 0.2608018219470978, + "learning_rate": 3.2075e-05, + "loss": 1.2648, + "step": 717 + }, + { + "epoch": 1.0727926413596676, + "grad_norm": 0.24499449133872986, + "learning_rate": 3.205e-05, + "loss": 1.51, + "step": 718 + }, + { + "epoch": 1.074286781528692, + "grad_norm": 0.29815685749053955, + "learning_rate": 3.2025e-05, + "loss": 1.1699, + "step": 719 + }, + { + "epoch": 1.0757809216977168, + "grad_norm": 0.38822293281555176, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.4869, + "step": 720 + }, + { + "epoch": 1.0772750618667413, + "grad_norm": 0.2638522684574127, + "learning_rate": 3.1975e-05, + "loss": 1.2973, + "step": 721 + }, + { + "epoch": 1.078769202035766, + "grad_norm": 0.5482388734817505, + "learning_rate": 3.1950000000000004e-05, + "loss": 1.5617, + "step": 722 + }, + { + "epoch": 1.0802633422047907, + "grad_norm": 0.3795502185821533, + "learning_rate": 3.1925e-05, + "loss": 1.3082, + "step": 723 + }, + { + "epoch": 1.0817574823738152, + "grad_norm": 0.30303916335105896, + "learning_rate": 3.19e-05, + "loss": 1.3434, + "step": 724 + }, + { + "epoch": 1.0832516225428397, + "grad_norm": 0.44353151321411133, + "learning_rate": 3.1875e-05, + "loss": 1.211, + "step": 725 + }, + { + "epoch": 1.0847457627118644, + "grad_norm": 0.47806358337402344, + "learning_rate": 3.185e-05, + "loss": 1.3748, + "step": 726 + }, + { + "epoch": 1.0862399028808891, + "grad_norm": 0.32267263531684875, + "learning_rate": 3.1825e-05, + "loss": 1.3756, + "step": 727 + }, + { + "epoch": 1.0877340430499136, + "grad_norm": 0.33392220735549927, + "learning_rate": 3.18e-05, + "loss": 1.4036, + "step": 728 + }, + { + "epoch": 1.0892281832189383, + "grad_norm": 0.38585975766181946, + "learning_rate": 3.1775e-05, + "loss": 1.4745, + "step": 729 + }, + { + "epoch": 1.0907223233879628, + "grad_norm": 0.23577117919921875, + "learning_rate": 3.175e-05, + "loss": 1.273, + "step": 730 + }, + { + "epoch": 1.0922164635569875, + "grad_norm": 0.3241201937198639, + "learning_rate": 3.1725e-05, + "loss": 1.3136, + "step": 731 + }, + { + "epoch": 1.093710603726012, + "grad_norm": 0.3119312524795532, + "learning_rate": 3.1700000000000005e-05, + "loss": 1.3497, + "step": 732 + }, + { + "epoch": 1.0952047438950367, + "grad_norm": 0.18843188881874084, + "learning_rate": 3.1675e-05, + "loss": 1.3864, + "step": 733 + }, + { + "epoch": 1.0966988840640612, + "grad_norm": 0.21842527389526367, + "learning_rate": 3.1650000000000004e-05, + "loss": 1.3344, + "step": 734 + }, + { + "epoch": 1.098193024233086, + "grad_norm": 0.28005000948905945, + "learning_rate": 3.1624999999999996e-05, + "loss": 1.2891, + "step": 735 + }, + { + "epoch": 1.0996871644021105, + "grad_norm": 0.23576630651950836, + "learning_rate": 3.16e-05, + "loss": 1.5126, + "step": 736 + }, + { + "epoch": 1.1011813045711352, + "grad_norm": 2.74788236618042, + "learning_rate": 3.1575e-05, + "loss": 1.365, + "step": 737 + }, + { + "epoch": 1.1026754447401597, + "grad_norm": 0.42025816440582275, + "learning_rate": 3.155e-05, + "loss": 1.3187, + "step": 738 + }, + { + "epoch": 1.1041695849091844, + "grad_norm": 0.3011794984340668, + "learning_rate": 3.1525e-05, + "loss": 1.2736, + "step": 739 + }, + { + "epoch": 1.1056637250782089, + "grad_norm": 0.2144736796617508, + "learning_rate": 3.15e-05, + "loss": 1.4265, + "step": 740 + }, + { + "epoch": 1.1071578652472336, + "grad_norm": 1.246840000152588, + "learning_rate": 3.1475e-05, + "loss": 1.5073, + "step": 741 + }, + { + "epoch": 1.108652005416258, + "grad_norm": 0.20965440571308136, + "learning_rate": 3.145e-05, + "loss": 1.2651, + "step": 742 + }, + { + "epoch": 1.1101461455852828, + "grad_norm": 0.3805752694606781, + "learning_rate": 3.1425e-05, + "loss": 1.4304, + "step": 743 + }, + { + "epoch": 1.1116402857543073, + "grad_norm": 0.3760318458080292, + "learning_rate": 3.1400000000000004e-05, + "loss": 1.2205, + "step": 744 + }, + { + "epoch": 1.113134425923332, + "grad_norm": 0.302948534488678, + "learning_rate": 3.1375e-05, + "loss": 1.3653, + "step": 745 + }, + { + "epoch": 1.1146285660923565, + "grad_norm": 0.5752202868461609, + "learning_rate": 3.135e-05, + "loss": 1.2555, + "step": 746 + }, + { + "epoch": 1.1161227062613812, + "grad_norm": 0.26672235131263733, + "learning_rate": 3.1324999999999996e-05, + "loss": 1.453, + "step": 747 + }, + { + "epoch": 1.1176168464304057, + "grad_norm": 0.261255145072937, + "learning_rate": 3.13e-05, + "loss": 1.3766, + "step": 748 + }, + { + "epoch": 1.1191109865994304, + "grad_norm": 0.3653060495853424, + "learning_rate": 3.1275e-05, + "loss": 1.4956, + "step": 749 + }, + { + "epoch": 1.120605126768455, + "grad_norm": 0.35471010208129883, + "learning_rate": 3.125e-05, + "loss": 1.3901, + "step": 750 + }, + { + "epoch": 1.1220992669374796, + "grad_norm": 0.2914360463619232, + "learning_rate": 3.122500000000001e-05, + "loss": 1.3893, + "step": 751 + }, + { + "epoch": 1.1235934071065041, + "grad_norm": 0.3225819170475006, + "learning_rate": 3.12e-05, + "loss": 1.2372, + "step": 752 + }, + { + "epoch": 1.1250875472755288, + "grad_norm": 0.3456740975379944, + "learning_rate": 3.1175000000000006e-05, + "loss": 1.4392, + "step": 753 + }, + { + "epoch": 1.1265816874445533, + "grad_norm": 0.37935250997543335, + "learning_rate": 3.115e-05, + "loss": 1.2221, + "step": 754 + }, + { + "epoch": 1.128075827613578, + "grad_norm": 0.34086886048316956, + "learning_rate": 3.1125000000000004e-05, + "loss": 1.4491, + "step": 755 + }, + { + "epoch": 1.1295699677826025, + "grad_norm": 0.3890403211116791, + "learning_rate": 3.1100000000000004e-05, + "loss": 1.3886, + "step": 756 + }, + { + "epoch": 1.1310641079516273, + "grad_norm": 0.19871480762958527, + "learning_rate": 3.1075e-05, + "loss": 1.2324, + "step": 757 + }, + { + "epoch": 1.1325582481206518, + "grad_norm": 0.4740760922431946, + "learning_rate": 3.105e-05, + "loss": 1.5408, + "step": 758 + }, + { + "epoch": 1.1340523882896765, + "grad_norm": 0.25011759996414185, + "learning_rate": 3.1025e-05, + "loss": 1.5476, + "step": 759 + }, + { + "epoch": 1.135546528458701, + "grad_norm": 0.23516173660755157, + "learning_rate": 3.1e-05, + "loss": 1.4722, + "step": 760 + }, + { + "epoch": 1.1370406686277257, + "grad_norm": 0.20128604769706726, + "learning_rate": 3.0975e-05, + "loss": 1.3795, + "step": 761 + }, + { + "epoch": 1.1385348087967502, + "grad_norm": 0.21477344632148743, + "learning_rate": 3.095e-05, + "loss": 1.4503, + "step": 762 + }, + { + "epoch": 1.140028948965775, + "grad_norm": 0.3315068483352661, + "learning_rate": 3.0925000000000006e-05, + "loss": 1.4391, + "step": 763 + }, + { + "epoch": 1.1415230891347994, + "grad_norm": 0.30626919865608215, + "learning_rate": 3.09e-05, + "loss": 1.362, + "step": 764 + }, + { + "epoch": 1.143017229303824, + "grad_norm": 0.5180973410606384, + "learning_rate": 3.0875000000000005e-05, + "loss": 1.3855, + "step": 765 + }, + { + "epoch": 1.1445113694728486, + "grad_norm": 0.4765807092189789, + "learning_rate": 3.0850000000000004e-05, + "loss": 1.4129, + "step": 766 + }, + { + "epoch": 1.1460055096418733, + "grad_norm": 0.3589872121810913, + "learning_rate": 3.0825000000000004e-05, + "loss": 1.4661, + "step": 767 + }, + { + "epoch": 1.1474996498108978, + "grad_norm": 0.23404400050640106, + "learning_rate": 3.08e-05, + "loss": 1.3543, + "step": 768 + }, + { + "epoch": 1.1489937899799225, + "grad_norm": 0.34380847215652466, + "learning_rate": 3.0775e-05, + "loss": 1.4952, + "step": 769 + }, + { + "epoch": 1.150487930148947, + "grad_norm": 0.3612429201602936, + "learning_rate": 3.075e-05, + "loss": 1.1269, + "step": 770 + }, + { + "epoch": 1.1519820703179717, + "grad_norm": 0.17686842381954193, + "learning_rate": 3.0725e-05, + "loss": 1.3392, + "step": 771 + }, + { + "epoch": 1.1534762104869962, + "grad_norm": 0.21031558513641357, + "learning_rate": 3.07e-05, + "loss": 1.3877, + "step": 772 + }, + { + "epoch": 1.154970350656021, + "grad_norm": 0.24665507674217224, + "learning_rate": 3.067500000000001e-05, + "loss": 1.39, + "step": 773 + }, + { + "epoch": 1.1564644908250454, + "grad_norm": 0.206196591258049, + "learning_rate": 3.065e-05, + "loss": 1.4427, + "step": 774 + }, + { + "epoch": 1.1579586309940701, + "grad_norm": 0.8704677224159241, + "learning_rate": 3.0625000000000006e-05, + "loss": 1.6024, + "step": 775 + }, + { + "epoch": 1.1594527711630946, + "grad_norm": 0.36794281005859375, + "learning_rate": 3.06e-05, + "loss": 1.4583, + "step": 776 + }, + { + "epoch": 1.1609469113321194, + "grad_norm": 0.38979122042655945, + "learning_rate": 3.0575000000000005e-05, + "loss": 1.5151, + "step": 777 + }, + { + "epoch": 1.162441051501144, + "grad_norm": 0.3037826716899872, + "learning_rate": 3.0550000000000004e-05, + "loss": 1.4722, + "step": 778 + }, + { + "epoch": 1.1639351916701686, + "grad_norm": 1.2709904909133911, + "learning_rate": 3.0525e-05, + "loss": 1.3094, + "step": 779 + }, + { + "epoch": 1.165429331839193, + "grad_norm": 0.3158092498779297, + "learning_rate": 3.05e-05, + "loss": 1.2846, + "step": 780 + }, + { + "epoch": 1.1669234720082178, + "grad_norm": 0.5657904744148254, + "learning_rate": 3.0475000000000002e-05, + "loss": 1.4704, + "step": 781 + }, + { + "epoch": 1.1684176121772425, + "grad_norm": 0.2906689941883087, + "learning_rate": 3.045e-05, + "loss": 1.1798, + "step": 782 + }, + { + "epoch": 1.169911752346267, + "grad_norm": 0.5362401604652405, + "learning_rate": 3.0425000000000004e-05, + "loss": 1.2779, + "step": 783 + }, + { + "epoch": 1.1714058925152915, + "grad_norm": 0.5080185532569885, + "learning_rate": 3.04e-05, + "loss": 1.5285, + "step": 784 + }, + { + "epoch": 1.1729000326843162, + "grad_norm": 0.4617675840854645, + "learning_rate": 3.0375000000000003e-05, + "loss": 1.2205, + "step": 785 + }, + { + "epoch": 1.174394172853341, + "grad_norm": 0.38406652212142944, + "learning_rate": 3.035e-05, + "loss": 1.3588, + "step": 786 + }, + { + "epoch": 1.1758883130223654, + "grad_norm": 0.373355895280838, + "learning_rate": 3.0325000000000002e-05, + "loss": 1.1637, + "step": 787 + }, + { + "epoch": 1.17738245319139, + "grad_norm": 0.5508015155792236, + "learning_rate": 3.03e-05, + "loss": 1.4598, + "step": 788 + }, + { + "epoch": 1.1788765933604146, + "grad_norm": 0.24885353446006775, + "learning_rate": 3.0275000000000004e-05, + "loss": 1.4141, + "step": 789 + }, + { + "epoch": 1.1803707335294393, + "grad_norm": 0.33441323041915894, + "learning_rate": 3.025e-05, + "loss": 1.3876, + "step": 790 + }, + { + "epoch": 1.1818648736984638, + "grad_norm": 0.2754193842411041, + "learning_rate": 3.0225000000000003e-05, + "loss": 1.6, + "step": 791 + }, + { + "epoch": 1.1833590138674885, + "grad_norm": 0.34586068987846375, + "learning_rate": 3.02e-05, + "loss": 1.5467, + "step": 792 + }, + { + "epoch": 1.184853154036513, + "grad_norm": 0.3228543996810913, + "learning_rate": 3.0175e-05, + "loss": 1.3096, + "step": 793 + }, + { + "epoch": 1.1863472942055378, + "grad_norm": 0.3345719575881958, + "learning_rate": 3.015e-05, + "loss": 1.4846, + "step": 794 + }, + { + "epoch": 1.1878414343745622, + "grad_norm": 0.267182320356369, + "learning_rate": 3.0125000000000004e-05, + "loss": 1.2385, + "step": 795 + }, + { + "epoch": 1.189335574543587, + "grad_norm": 0.29065176844596863, + "learning_rate": 3.01e-05, + "loss": 1.3653, + "step": 796 + }, + { + "epoch": 1.1908297147126115, + "grad_norm": 0.3694697916507721, + "learning_rate": 3.0075000000000003e-05, + "loss": 1.2429, + "step": 797 + }, + { + "epoch": 1.1923238548816362, + "grad_norm": 0.3948061466217041, + "learning_rate": 3.0050000000000002e-05, + "loss": 1.3771, + "step": 798 + }, + { + "epoch": 1.1938179950506607, + "grad_norm": 0.24674028158187866, + "learning_rate": 3.0025000000000005e-05, + "loss": 1.2335, + "step": 799 + }, + { + "epoch": 1.1953121352196854, + "grad_norm": 0.32700109481811523, + "learning_rate": 3e-05, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 1.1968062753887099, + "grad_norm": 0.45519524812698364, + "learning_rate": 2.9975000000000004e-05, + "loss": 1.2227, + "step": 801 + }, + { + "epoch": 1.1983004155577346, + "grad_norm": 0.21541063487529755, + "learning_rate": 2.995e-05, + "loss": 1.2894, + "step": 802 + }, + { + "epoch": 1.199794555726759, + "grad_norm": 0.28006261587142944, + "learning_rate": 2.9925000000000002e-05, + "loss": 1.4293, + "step": 803 + }, + { + "epoch": 1.2012886958957838, + "grad_norm": 0.2731017470359802, + "learning_rate": 2.9900000000000002e-05, + "loss": 1.1694, + "step": 804 + }, + { + "epoch": 1.2027828360648083, + "grad_norm": 0.22104504704475403, + "learning_rate": 2.9875000000000004e-05, + "loss": 1.282, + "step": 805 + }, + { + "epoch": 1.204276976233833, + "grad_norm": 0.2765119969844818, + "learning_rate": 2.985e-05, + "loss": 1.214, + "step": 806 + }, + { + "epoch": 1.2057711164028575, + "grad_norm": 0.24433647096157074, + "learning_rate": 2.9825000000000003e-05, + "loss": 1.3806, + "step": 807 + }, + { + "epoch": 1.2072652565718822, + "grad_norm": 0.3554052710533142, + "learning_rate": 2.98e-05, + "loss": 1.2306, + "step": 808 + }, + { + "epoch": 1.2087593967409067, + "grad_norm": 0.3093665838241577, + "learning_rate": 2.9775000000000002e-05, + "loss": 1.2714, + "step": 809 + }, + { + "epoch": 1.2102535369099314, + "grad_norm": 0.3403176963329315, + "learning_rate": 2.975e-05, + "loss": 1.2194, + "step": 810 + }, + { + "epoch": 1.211747677078956, + "grad_norm": 0.24468843638896942, + "learning_rate": 2.9725000000000004e-05, + "loss": 1.3873, + "step": 811 + }, + { + "epoch": 1.2132418172479806, + "grad_norm": 0.287300705909729, + "learning_rate": 2.97e-05, + "loss": 1.3787, + "step": 812 + }, + { + "epoch": 1.2147359574170051, + "grad_norm": 0.494862824678421, + "learning_rate": 2.9675000000000003e-05, + "loss": 1.3717, + "step": 813 + }, + { + "epoch": 1.2162300975860298, + "grad_norm": 0.32385581731796265, + "learning_rate": 2.965e-05, + "loss": 1.3565, + "step": 814 + }, + { + "epoch": 1.2177242377550543, + "grad_norm": 0.24930880963802338, + "learning_rate": 2.9625000000000002e-05, + "loss": 1.414, + "step": 815 + }, + { + "epoch": 1.219218377924079, + "grad_norm": 0.6832636594772339, + "learning_rate": 2.96e-05, + "loss": 1.4534, + "step": 816 + }, + { + "epoch": 1.2207125180931035, + "grad_norm": 0.6891420483589172, + "learning_rate": 2.9575000000000004e-05, + "loss": 1.1974, + "step": 817 + }, + { + "epoch": 1.2222066582621283, + "grad_norm": 0.26387810707092285, + "learning_rate": 2.955e-05, + "loss": 1.5896, + "step": 818 + }, + { + "epoch": 1.2237007984311528, + "grad_norm": 0.3951552212238312, + "learning_rate": 2.9525000000000003e-05, + "loss": 1.6081, + "step": 819 + }, + { + "epoch": 1.2251949386001775, + "grad_norm": 0.36233994364738464, + "learning_rate": 2.95e-05, + "loss": 1.4092, + "step": 820 + }, + { + "epoch": 1.226689078769202, + "grad_norm": 0.24323561787605286, + "learning_rate": 2.9475e-05, + "loss": 1.1833, + "step": 821 + }, + { + "epoch": 1.2281832189382267, + "grad_norm": 0.25665760040283203, + "learning_rate": 2.945e-05, + "loss": 1.3723, + "step": 822 + }, + { + "epoch": 1.2296773591072512, + "grad_norm": 0.8483716249465942, + "learning_rate": 2.9425000000000004e-05, + "loss": 1.3616, + "step": 823 + }, + { + "epoch": 1.231171499276276, + "grad_norm": 0.276362806558609, + "learning_rate": 2.94e-05, + "loss": 1.1512, + "step": 824 + }, + { + "epoch": 1.2326656394453004, + "grad_norm": 0.31287723779678345, + "learning_rate": 2.9375000000000003e-05, + "loss": 1.363, + "step": 825 + }, + { + "epoch": 1.234159779614325, + "grad_norm": 0.48258379101753235, + "learning_rate": 2.935e-05, + "loss": 1.4223, + "step": 826 + }, + { + "epoch": 1.2356539197833496, + "grad_norm": 0.3383966088294983, + "learning_rate": 2.9325e-05, + "loss": 1.3193, + "step": 827 + }, + { + "epoch": 1.2371480599523743, + "grad_norm": 0.20877547562122345, + "learning_rate": 2.93e-05, + "loss": 1.2692, + "step": 828 + }, + { + "epoch": 1.2386422001213988, + "grad_norm": 0.21563665568828583, + "learning_rate": 2.9275000000000003e-05, + "loss": 1.1647, + "step": 829 + }, + { + "epoch": 1.2401363402904235, + "grad_norm": 0.3213925361633301, + "learning_rate": 2.925e-05, + "loss": 1.4321, + "step": 830 + }, + { + "epoch": 1.241630480459448, + "grad_norm": 0.40108075737953186, + "learning_rate": 2.9225000000000002e-05, + "loss": 1.2396, + "step": 831 + }, + { + "epoch": 1.2431246206284727, + "grad_norm": 0.30319443345069885, + "learning_rate": 2.9199999999999998e-05, + "loss": 1.4283, + "step": 832 + }, + { + "epoch": 1.2446187607974972, + "grad_norm": 0.2038179188966751, + "learning_rate": 2.9175e-05, + "loss": 1.3823, + "step": 833 + }, + { + "epoch": 1.246112900966522, + "grad_norm": 0.3502418100833893, + "learning_rate": 2.915e-05, + "loss": 1.2678, + "step": 834 + }, + { + "epoch": 1.2476070411355464, + "grad_norm": 0.21862909197807312, + "learning_rate": 2.9125000000000003e-05, + "loss": 1.3333, + "step": 835 + }, + { + "epoch": 1.2491011813045712, + "grad_norm": 0.3174634873867035, + "learning_rate": 2.91e-05, + "loss": 1.2475, + "step": 836 + }, + { + "epoch": 1.2505953214735959, + "grad_norm": 0.26239484548568726, + "learning_rate": 2.9075000000000002e-05, + "loss": 1.3712, + "step": 837 + }, + { + "epoch": 1.2520894616426204, + "grad_norm": 0.40120023488998413, + "learning_rate": 2.9049999999999998e-05, + "loss": 1.3346, + "step": 838 + }, + { + "epoch": 1.2535836018116449, + "grad_norm": 0.5198585391044617, + "learning_rate": 2.9025e-05, + "loss": 1.4444, + "step": 839 + }, + { + "epoch": 1.2550777419806696, + "grad_norm": 0.34028634428977966, + "learning_rate": 2.9e-05, + "loss": 1.2473, + "step": 840 + }, + { + "epoch": 1.2565718821496943, + "grad_norm": 0.35604262351989746, + "learning_rate": 2.8975000000000003e-05, + "loss": 1.3676, + "step": 841 + }, + { + "epoch": 1.2580660223187188, + "grad_norm": 0.2712811529636383, + "learning_rate": 2.895e-05, + "loss": 1.4811, + "step": 842 + }, + { + "epoch": 1.2595601624877433, + "grad_norm": 0.5156970620155334, + "learning_rate": 2.8925000000000002e-05, + "loss": 1.2853, + "step": 843 + }, + { + "epoch": 1.261054302656768, + "grad_norm": 0.560204803943634, + "learning_rate": 2.8899999999999998e-05, + "loss": 1.3404, + "step": 844 + }, + { + "epoch": 1.2625484428257927, + "grad_norm": 0.3760669529438019, + "learning_rate": 2.8875e-05, + "loss": 1.4041, + "step": 845 + }, + { + "epoch": 1.2640425829948172, + "grad_norm": 0.2032732367515564, + "learning_rate": 2.885e-05, + "loss": 1.4157, + "step": 846 + }, + { + "epoch": 1.2655367231638417, + "grad_norm": 0.5017582774162292, + "learning_rate": 2.8825000000000003e-05, + "loss": 1.5672, + "step": 847 + }, + { + "epoch": 1.2670308633328664, + "grad_norm": 0.3427116572856903, + "learning_rate": 2.88e-05, + "loss": 1.3806, + "step": 848 + }, + { + "epoch": 1.2685250035018911, + "grad_norm": 0.22686722874641418, + "learning_rate": 2.8775e-05, + "loss": 1.3692, + "step": 849 + }, + { + "epoch": 1.2700191436709156, + "grad_norm": 0.252910315990448, + "learning_rate": 2.8749999999999997e-05, + "loss": 1.3763, + "step": 850 + }, + { + "epoch": 1.2715132838399401, + "grad_norm": 0.3212681710720062, + "learning_rate": 2.8725e-05, + "loss": 1.1975, + "step": 851 + }, + { + "epoch": 1.2730074240089648, + "grad_norm": 0.42948630452156067, + "learning_rate": 2.87e-05, + "loss": 1.6315, + "step": 852 + }, + { + "epoch": 1.2745015641779895, + "grad_norm": 0.21065746247768402, + "learning_rate": 2.8675000000000002e-05, + "loss": 1.3967, + "step": 853 + }, + { + "epoch": 1.275995704347014, + "grad_norm": 0.2895759046077728, + "learning_rate": 2.865e-05, + "loss": 1.3564, + "step": 854 + }, + { + "epoch": 1.2774898445160385, + "grad_norm": 0.35186535120010376, + "learning_rate": 2.8625e-05, + "loss": 1.2381, + "step": 855 + }, + { + "epoch": 1.2789839846850632, + "grad_norm": 0.2961024045944214, + "learning_rate": 2.86e-05, + "loss": 1.4581, + "step": 856 + }, + { + "epoch": 1.280478124854088, + "grad_norm": 0.23608747124671936, + "learning_rate": 2.8575000000000003e-05, + "loss": 1.397, + "step": 857 + }, + { + "epoch": 1.2819722650231125, + "grad_norm": 0.24908743798732758, + "learning_rate": 2.855e-05, + "loss": 1.16, + "step": 858 + }, + { + "epoch": 1.283466405192137, + "grad_norm": 5.011353969573975, + "learning_rate": 2.8525000000000002e-05, + "loss": 1.3649, + "step": 859 + }, + { + "epoch": 1.2849605453611617, + "grad_norm": 0.20676688849925995, + "learning_rate": 2.8499999999999998e-05, + "loss": 1.4075, + "step": 860 + }, + { + "epoch": 1.2864546855301864, + "grad_norm": 0.43557146191596985, + "learning_rate": 2.8475e-05, + "loss": 1.3004, + "step": 861 + }, + { + "epoch": 1.2879488256992109, + "grad_norm": 0.421140193939209, + "learning_rate": 2.845e-05, + "loss": 1.3747, + "step": 862 + }, + { + "epoch": 1.2894429658682356, + "grad_norm": 0.30302608013153076, + "learning_rate": 2.8425000000000003e-05, + "loss": 1.4815, + "step": 863 + }, + { + "epoch": 1.29093710603726, + "grad_norm": 0.2589489221572876, + "learning_rate": 2.84e-05, + "loss": 1.3638, + "step": 864 + }, + { + "epoch": 1.2924312462062848, + "grad_norm": 0.3756728768348694, + "learning_rate": 2.8375000000000002e-05, + "loss": 1.2456, + "step": 865 + }, + { + "epoch": 1.2939253863753093, + "grad_norm": 0.36108818650245667, + "learning_rate": 2.8349999999999998e-05, + "loss": 1.5138, + "step": 866 + }, + { + "epoch": 1.295419526544334, + "grad_norm": 0.4813844859600067, + "learning_rate": 2.8325e-05, + "loss": 1.4441, + "step": 867 + }, + { + "epoch": 1.2969136667133585, + "grad_norm": 0.31173527240753174, + "learning_rate": 2.83e-05, + "loss": 1.2089, + "step": 868 + }, + { + "epoch": 1.2984078068823832, + "grad_norm": 0.1975783407688141, + "learning_rate": 2.8275000000000003e-05, + "loss": 1.3155, + "step": 869 + }, + { + "epoch": 1.2999019470514077, + "grad_norm": 0.2667165696620941, + "learning_rate": 2.825e-05, + "loss": 1.2374, + "step": 870 + }, + { + "epoch": 1.3013960872204324, + "grad_norm": 0.2623427212238312, + "learning_rate": 2.8225e-05, + "loss": 1.2428, + "step": 871 + }, + { + "epoch": 1.302890227389457, + "grad_norm": 0.38565951585769653, + "learning_rate": 2.8199999999999998e-05, + "loss": 1.4143, + "step": 872 + }, + { + "epoch": 1.3043843675584816, + "grad_norm": 0.2516254186630249, + "learning_rate": 2.8175e-05, + "loss": 1.3205, + "step": 873 + }, + { + "epoch": 1.3058785077275061, + "grad_norm": 0.2702304422855377, + "learning_rate": 2.815e-05, + "loss": 1.4655, + "step": 874 + }, + { + "epoch": 1.3073726478965308, + "grad_norm": 0.23490573465824127, + "learning_rate": 2.8125000000000003e-05, + "loss": 1.3573, + "step": 875 + }, + { + "epoch": 1.3088667880655553, + "grad_norm": 0.2213398814201355, + "learning_rate": 2.8100000000000005e-05, + "loss": 1.4382, + "step": 876 + }, + { + "epoch": 1.31036092823458, + "grad_norm": 0.8293248414993286, + "learning_rate": 2.8075e-05, + "loss": 1.5697, + "step": 877 + }, + { + "epoch": 1.3118550684036046, + "grad_norm": 0.3292347490787506, + "learning_rate": 2.8050000000000004e-05, + "loss": 1.3114, + "step": 878 + }, + { + "epoch": 1.3133492085726293, + "grad_norm": 0.59178626537323, + "learning_rate": 2.8025e-05, + "loss": 1.5344, + "step": 879 + }, + { + "epoch": 1.3148433487416538, + "grad_norm": 0.19311776757240295, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.428, + "step": 880 + }, + { + "epoch": 1.3163374889106785, + "grad_norm": 0.2914370894432068, + "learning_rate": 2.7975000000000002e-05, + "loss": 1.2551, + "step": 881 + }, + { + "epoch": 1.317831629079703, + "grad_norm": 0.5101747512817383, + "learning_rate": 2.7950000000000005e-05, + "loss": 1.3906, + "step": 882 + }, + { + "epoch": 1.3193257692487277, + "grad_norm": 0.309827983379364, + "learning_rate": 2.7925e-05, + "loss": 1.2756, + "step": 883 + }, + { + "epoch": 1.3208199094177522, + "grad_norm": 0.17280730605125427, + "learning_rate": 2.7900000000000004e-05, + "loss": 1.2323, + "step": 884 + }, + { + "epoch": 1.322314049586777, + "grad_norm": 0.22583577036857605, + "learning_rate": 2.7875e-05, + "loss": 1.398, + "step": 885 + }, + { + "epoch": 1.3238081897558014, + "grad_norm": 0.32121244072914124, + "learning_rate": 2.7850000000000003e-05, + "loss": 1.2139, + "step": 886 + }, + { + "epoch": 1.325302329924826, + "grad_norm": 0.22227728366851807, + "learning_rate": 2.7825000000000002e-05, + "loss": 1.3756, + "step": 887 + }, + { + "epoch": 1.3267964700938506, + "grad_norm": 0.2605989873409271, + "learning_rate": 2.7800000000000005e-05, + "loss": 1.3884, + "step": 888 + }, + { + "epoch": 1.3282906102628753, + "grad_norm": 0.32979118824005127, + "learning_rate": 2.7775e-05, + "loss": 1.5079, + "step": 889 + }, + { + "epoch": 1.3297847504318998, + "grad_norm": 0.2826998233795166, + "learning_rate": 2.7750000000000004e-05, + "loss": 1.2673, + "step": 890 + }, + { + "epoch": 1.3312788906009245, + "grad_norm": 0.3953245282173157, + "learning_rate": 2.7725e-05, + "loss": 1.462, + "step": 891 + }, + { + "epoch": 1.3327730307699492, + "grad_norm": 0.3536979854106903, + "learning_rate": 2.7700000000000002e-05, + "loss": 1.4166, + "step": 892 + }, + { + "epoch": 1.3342671709389737, + "grad_norm": 0.2883892059326172, + "learning_rate": 2.7675000000000002e-05, + "loss": 1.4311, + "step": 893 + }, + { + "epoch": 1.3357613111079982, + "grad_norm": 0.9811177253723145, + "learning_rate": 2.7650000000000005e-05, + "loss": 1.559, + "step": 894 + }, + { + "epoch": 1.337255451277023, + "grad_norm": 0.2742171883583069, + "learning_rate": 2.7625e-05, + "loss": 1.388, + "step": 895 + }, + { + "epoch": 1.3387495914460477, + "grad_norm": 0.421430766582489, + "learning_rate": 2.7600000000000003e-05, + "loss": 1.447, + "step": 896 + }, + { + "epoch": 1.3402437316150722, + "grad_norm": 0.21545341610908508, + "learning_rate": 2.7575e-05, + "loss": 1.3656, + "step": 897 + }, + { + "epoch": 1.3417378717840966, + "grad_norm": 0.23525476455688477, + "learning_rate": 2.7550000000000002e-05, + "loss": 1.548, + "step": 898 + }, + { + "epoch": 1.3432320119531214, + "grad_norm": 0.233047753572464, + "learning_rate": 2.7525e-05, + "loss": 1.4095, + "step": 899 + }, + { + "epoch": 1.344726152122146, + "grad_norm": 0.2701248526573181, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.2175, + "step": 900 + }, + { + "epoch": 1.3462202922911706, + "grad_norm": 0.2926495671272278, + "learning_rate": 2.7475e-05, + "loss": 1.3209, + "step": 901 + }, + { + "epoch": 1.347714432460195, + "grad_norm": 0.24261537194252014, + "learning_rate": 2.7450000000000003e-05, + "loss": 1.2946, + "step": 902 + }, + { + "epoch": 1.3492085726292198, + "grad_norm": 0.22992177307605743, + "learning_rate": 2.7425e-05, + "loss": 1.4592, + "step": 903 + }, + { + "epoch": 1.3507027127982445, + "grad_norm": 0.33490830659866333, + "learning_rate": 2.7400000000000002e-05, + "loss": 1.5484, + "step": 904 + }, + { + "epoch": 1.352196852967269, + "grad_norm": 0.4046538174152374, + "learning_rate": 2.7375e-05, + "loss": 1.5581, + "step": 905 + }, + { + "epoch": 1.3536909931362935, + "grad_norm": 0.24933142960071564, + "learning_rate": 2.7350000000000004e-05, + "loss": 1.3697, + "step": 906 + }, + { + "epoch": 1.3551851333053182, + "grad_norm": 0.2170824408531189, + "learning_rate": 2.7325e-05, + "loss": 1.27, + "step": 907 + }, + { + "epoch": 1.356679273474343, + "grad_norm": 0.19052165746688843, + "learning_rate": 2.7300000000000003e-05, + "loss": 1.5267, + "step": 908 + }, + { + "epoch": 1.3581734136433674, + "grad_norm": 0.28672656416893005, + "learning_rate": 2.7275e-05, + "loss": 1.1791, + "step": 909 + }, + { + "epoch": 1.359667553812392, + "grad_norm": 0.19672513008117676, + "learning_rate": 2.725e-05, + "loss": 1.2772, + "step": 910 + }, + { + "epoch": 1.3611616939814166, + "grad_norm": 0.24307642877101898, + "learning_rate": 2.7225e-05, + "loss": 1.5807, + "step": 911 + }, + { + "epoch": 1.3626558341504413, + "grad_norm": 0.5582717657089233, + "learning_rate": 2.7200000000000004e-05, + "loss": 1.4225, + "step": 912 + }, + { + "epoch": 1.3641499743194658, + "grad_norm": 0.2660907804965973, + "learning_rate": 2.7175e-05, + "loss": 1.4568, + "step": 913 + }, + { + "epoch": 1.3656441144884903, + "grad_norm": 1.315430998802185, + "learning_rate": 2.7150000000000003e-05, + "loss": 1.4961, + "step": 914 + }, + { + "epoch": 1.367138254657515, + "grad_norm": 0.23424766957759857, + "learning_rate": 2.7125000000000002e-05, + "loss": 1.1405, + "step": 915 + }, + { + "epoch": 1.3686323948265398, + "grad_norm": 0.3419889509677887, + "learning_rate": 2.7100000000000005e-05, + "loss": 1.3257, + "step": 916 + }, + { + "epoch": 1.3701265349955642, + "grad_norm": 0.26014795899391174, + "learning_rate": 2.7075e-05, + "loss": 1.2588, + "step": 917 + }, + { + "epoch": 1.3716206751645887, + "grad_norm": 0.25678837299346924, + "learning_rate": 2.7050000000000004e-05, + "loss": 1.2701, + "step": 918 + }, + { + "epoch": 1.3731148153336135, + "grad_norm": 0.252119243144989, + "learning_rate": 2.7025e-05, + "loss": 1.2999, + "step": 919 + }, + { + "epoch": 1.3746089555026382, + "grad_norm": 0.21296586096286774, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.1938, + "step": 920 + }, + { + "epoch": 1.3761030956716627, + "grad_norm": 0.3696118891239166, + "learning_rate": 2.6975000000000002e-05, + "loss": 1.3354, + "step": 921 + }, + { + "epoch": 1.3775972358406874, + "grad_norm": 0.24564127624034882, + "learning_rate": 2.6950000000000005e-05, + "loss": 1.2584, + "step": 922 + }, + { + "epoch": 1.3790913760097119, + "grad_norm": 0.3945300877094269, + "learning_rate": 2.6925e-05, + "loss": 1.3515, + "step": 923 + }, + { + "epoch": 1.3805855161787366, + "grad_norm": 0.3990738093852997, + "learning_rate": 2.6900000000000003e-05, + "loss": 1.2732, + "step": 924 + }, + { + "epoch": 1.382079656347761, + "grad_norm": 0.22197787463665009, + "learning_rate": 2.6875e-05, + "loss": 1.1481, + "step": 925 + }, + { + "epoch": 1.3835737965167858, + "grad_norm": 0.24173837900161743, + "learning_rate": 2.6850000000000002e-05, + "loss": 1.3123, + "step": 926 + }, + { + "epoch": 1.3850679366858103, + "grad_norm": 0.21471425890922546, + "learning_rate": 2.6825e-05, + "loss": 1.3757, + "step": 927 + }, + { + "epoch": 1.386562076854835, + "grad_norm": 0.6325602531433105, + "learning_rate": 2.6800000000000004e-05, + "loss": 1.397, + "step": 928 + }, + { + "epoch": 1.3880562170238595, + "grad_norm": 0.19349870085716248, + "learning_rate": 2.6775e-05, + "loss": 1.2721, + "step": 929 + }, + { + "epoch": 1.3895503571928842, + "grad_norm": 0.25107133388519287, + "learning_rate": 2.6750000000000003e-05, + "loss": 1.4071, + "step": 930 + }, + { + "epoch": 1.3910444973619087, + "grad_norm": 0.20738127827644348, + "learning_rate": 2.6725e-05, + "loss": 1.3528, + "step": 931 + }, + { + "epoch": 1.3925386375309334, + "grad_norm": 0.31351810693740845, + "learning_rate": 2.6700000000000002e-05, + "loss": 1.2623, + "step": 932 + }, + { + "epoch": 1.394032777699958, + "grad_norm": 0.213214710354805, + "learning_rate": 2.6675e-05, + "loss": 1.2801, + "step": 933 + }, + { + "epoch": 1.3955269178689826, + "grad_norm": 0.25812122225761414, + "learning_rate": 2.6650000000000004e-05, + "loss": 1.2651, + "step": 934 + }, + { + "epoch": 1.3970210580380071, + "grad_norm": 0.3310001492500305, + "learning_rate": 2.6625e-05, + "loss": 1.4703, + "step": 935 + }, + { + "epoch": 1.3985151982070319, + "grad_norm": 0.7875789403915405, + "learning_rate": 2.6600000000000003e-05, + "loss": 1.4694, + "step": 936 + }, + { + "epoch": 1.4000093383760563, + "grad_norm": 0.25254467129707336, + "learning_rate": 2.6575e-05, + "loss": 1.4147, + "step": 937 + }, + { + "epoch": 1.401503478545081, + "grad_norm": 0.24608255922794342, + "learning_rate": 2.655e-05, + "loss": 1.4127, + "step": 938 + }, + { + "epoch": 1.4029976187141056, + "grad_norm": 0.39427649974823, + "learning_rate": 2.6525e-05, + "loss": 1.2834, + "step": 939 + }, + { + "epoch": 1.4044917588831303, + "grad_norm": 0.3120604157447815, + "learning_rate": 2.6500000000000004e-05, + "loss": 1.2606, + "step": 940 + }, + { + "epoch": 1.4059858990521548, + "grad_norm": 0.5380797982215881, + "learning_rate": 2.6475e-05, + "loss": 1.2931, + "step": 941 + }, + { + "epoch": 1.4074800392211795, + "grad_norm": 0.6395367980003357, + "learning_rate": 2.6450000000000003e-05, + "loss": 1.2245, + "step": 942 + }, + { + "epoch": 1.408974179390204, + "grad_norm": 0.27643725275993347, + "learning_rate": 2.6425e-05, + "loss": 1.4102, + "step": 943 + }, + { + "epoch": 1.4104683195592287, + "grad_norm": 0.31579136848449707, + "learning_rate": 2.64e-05, + "loss": 1.4558, + "step": 944 + }, + { + "epoch": 1.4119624597282532, + "grad_norm": 0.24337553977966309, + "learning_rate": 2.6375e-05, + "loss": 1.4894, + "step": 945 + }, + { + "epoch": 1.413456599897278, + "grad_norm": 0.40405669808387756, + "learning_rate": 2.6350000000000004e-05, + "loss": 1.4032, + "step": 946 + }, + { + "epoch": 1.4149507400663024, + "grad_norm": 0.5983033776283264, + "learning_rate": 2.6325e-05, + "loss": 1.3462, + "step": 947 + }, + { + "epoch": 1.416444880235327, + "grad_norm": 0.2644513249397278, + "learning_rate": 2.6300000000000002e-05, + "loss": 1.5256, + "step": 948 + }, + { + "epoch": 1.4179390204043516, + "grad_norm": 0.37031152844429016, + "learning_rate": 2.6275e-05, + "loss": 1.4239, + "step": 949 + }, + { + "epoch": 1.4194331605733763, + "grad_norm": 0.23210497200489044, + "learning_rate": 2.625e-05, + "loss": 1.4778, + "step": 950 + }, + { + "epoch": 1.420927300742401, + "grad_norm": 0.21914657950401306, + "learning_rate": 2.6225e-05, + "loss": 1.4029, + "step": 951 + }, + { + "epoch": 1.4224214409114255, + "grad_norm": 0.26754164695739746, + "learning_rate": 2.6200000000000003e-05, + "loss": 1.2504, + "step": 952 + }, + { + "epoch": 1.42391558108045, + "grad_norm": 0.22635582089424133, + "learning_rate": 2.6175e-05, + "loss": 1.2113, + "step": 953 + }, + { + "epoch": 1.4254097212494747, + "grad_norm": 0.22673365473747253, + "learning_rate": 2.6150000000000002e-05, + "loss": 1.3774, + "step": 954 + }, + { + "epoch": 1.4269038614184995, + "grad_norm": 0.39390039443969727, + "learning_rate": 2.6124999999999998e-05, + "loss": 1.5364, + "step": 955 + }, + { + "epoch": 1.428398001587524, + "grad_norm": 0.2662557065486908, + "learning_rate": 2.61e-05, + "loss": 1.4081, + "step": 956 + }, + { + "epoch": 1.4298921417565484, + "grad_norm": 1.5610660314559937, + "learning_rate": 2.6075e-05, + "loss": 1.3799, + "step": 957 + }, + { + "epoch": 1.4313862819255732, + "grad_norm": 0.33333706855773926, + "learning_rate": 2.6050000000000003e-05, + "loss": 1.3966, + "step": 958 + }, + { + "epoch": 1.4328804220945979, + "grad_norm": 0.29869765043258667, + "learning_rate": 2.6025e-05, + "loss": 1.3118, + "step": 959 + }, + { + "epoch": 1.4343745622636224, + "grad_norm": 0.26354825496673584, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.3255, + "step": 960 + }, + { + "epoch": 1.4358687024326469, + "grad_norm": 0.41859930753707886, + "learning_rate": 2.5974999999999998e-05, + "loss": 1.3504, + "step": 961 + }, + { + "epoch": 1.4373628426016716, + "grad_norm": 0.21529413759708405, + "learning_rate": 2.595e-05, + "loss": 1.2007, + "step": 962 + }, + { + "epoch": 1.4388569827706963, + "grad_norm": 0.20801830291748047, + "learning_rate": 2.5925e-05, + "loss": 1.4386, + "step": 963 + }, + { + "epoch": 1.4403511229397208, + "grad_norm": 0.3053770661354065, + "learning_rate": 2.5900000000000003e-05, + "loss": 1.254, + "step": 964 + }, + { + "epoch": 1.4418452631087453, + "grad_norm": 0.2524198889732361, + "learning_rate": 2.5875e-05, + "loss": 1.3625, + "step": 965 + }, + { + "epoch": 1.44333940327777, + "grad_norm": 0.27251192927360535, + "learning_rate": 2.585e-05, + "loss": 1.2657, + "step": 966 + }, + { + "epoch": 1.4448335434467947, + "grad_norm": 0.2282276451587677, + "learning_rate": 2.5824999999999998e-05, + "loss": 1.2054, + "step": 967 + }, + { + "epoch": 1.4463276836158192, + "grad_norm": 0.2785608768463135, + "learning_rate": 2.58e-05, + "loss": 1.4328, + "step": 968 + }, + { + "epoch": 1.4478218237848437, + "grad_norm": 0.3191685378551483, + "learning_rate": 2.5775e-05, + "loss": 1.3032, + "step": 969 + }, + { + "epoch": 1.4493159639538684, + "grad_norm": 0.26658880710601807, + "learning_rate": 2.5750000000000002e-05, + "loss": 1.436, + "step": 970 + }, + { + "epoch": 1.4508101041228931, + "grad_norm": 0.532778263092041, + "learning_rate": 2.5725e-05, + "loss": 1.4724, + "step": 971 + }, + { + "epoch": 1.4523042442919176, + "grad_norm": 0.2609519958496094, + "learning_rate": 2.57e-05, + "loss": 1.3608, + "step": 972 + }, + { + "epoch": 1.4537983844609421, + "grad_norm": 0.2050527185201645, + "learning_rate": 2.5675e-05, + "loss": 1.3276, + "step": 973 + }, + { + "epoch": 1.4552925246299668, + "grad_norm": 0.23767416179180145, + "learning_rate": 2.5650000000000003e-05, + "loss": 1.3668, + "step": 974 + }, + { + "epoch": 1.4567866647989915, + "grad_norm": 0.43234944343566895, + "learning_rate": 2.5625e-05, + "loss": 1.4892, + "step": 975 + }, + { + "epoch": 1.458280804968016, + "grad_norm": 0.2602043151855469, + "learning_rate": 2.5600000000000002e-05, + "loss": 1.4824, + "step": 976 + }, + { + "epoch": 1.4597749451370405, + "grad_norm": 0.2256672978401184, + "learning_rate": 2.5574999999999998e-05, + "loss": 1.4242, + "step": 977 + }, + { + "epoch": 1.4612690853060653, + "grad_norm": 0.24697938561439514, + "learning_rate": 2.555e-05, + "loss": 1.2936, + "step": 978 + }, + { + "epoch": 1.46276322547509, + "grad_norm": 0.3508719205856323, + "learning_rate": 2.5525e-05, + "loss": 1.2935, + "step": 979 + }, + { + "epoch": 1.4642573656441145, + "grad_norm": 0.245631143450737, + "learning_rate": 2.5500000000000003e-05, + "loss": 1.3749, + "step": 980 + }, + { + "epoch": 1.4657515058131392, + "grad_norm": 0.34013769030570984, + "learning_rate": 2.5475e-05, + "loss": 1.2343, + "step": 981 + }, + { + "epoch": 1.4672456459821637, + "grad_norm": 0.8150389790534973, + "learning_rate": 2.5450000000000002e-05, + "loss": 1.2601, + "step": 982 + }, + { + "epoch": 1.4687397861511884, + "grad_norm": 0.25743216276168823, + "learning_rate": 2.5424999999999998e-05, + "loss": 1.1799, + "step": 983 + }, + { + "epoch": 1.4702339263202129, + "grad_norm": 0.429471492767334, + "learning_rate": 2.54e-05, + "loss": 1.5372, + "step": 984 + }, + { + "epoch": 1.4717280664892376, + "grad_norm": 0.294306218624115, + "learning_rate": 2.5375e-05, + "loss": 1.2718, + "step": 985 + }, + { + "epoch": 1.473222206658262, + "grad_norm": 0.22676728665828705, + "learning_rate": 2.5350000000000003e-05, + "loss": 1.1818, + "step": 986 + }, + { + "epoch": 1.4747163468272868, + "grad_norm": 0.25699350237846375, + "learning_rate": 2.5325e-05, + "loss": 1.2167, + "step": 987 + }, + { + "epoch": 1.4762104869963113, + "grad_norm": 0.23200049996376038, + "learning_rate": 2.5300000000000002e-05, + "loss": 1.3309, + "step": 988 + }, + { + "epoch": 1.477704627165336, + "grad_norm": 0.3287791311740875, + "learning_rate": 2.5274999999999998e-05, + "loss": 1.3764, + "step": 989 + }, + { + "epoch": 1.4791987673343605, + "grad_norm": 0.5922759771347046, + "learning_rate": 2.525e-05, + "loss": 1.4138, + "step": 990 + }, + { + "epoch": 1.4806929075033852, + "grad_norm": 0.21253745257854462, + "learning_rate": 2.5225e-05, + "loss": 1.151, + "step": 991 + }, + { + "epoch": 1.4821870476724097, + "grad_norm": 0.3864874839782715, + "learning_rate": 2.5200000000000003e-05, + "loss": 1.3244, + "step": 992 + }, + { + "epoch": 1.4836811878414344, + "grad_norm": 0.30087751150131226, + "learning_rate": 2.5175e-05, + "loss": 1.3481, + "step": 993 + }, + { + "epoch": 1.485175328010459, + "grad_norm": 0.2523193657398224, + "learning_rate": 2.515e-05, + "loss": 1.3509, + "step": 994 + }, + { + "epoch": 1.4866694681794836, + "grad_norm": 0.19127090275287628, + "learning_rate": 2.5124999999999997e-05, + "loss": 1.379, + "step": 995 + }, + { + "epoch": 1.4881636083485081, + "grad_norm": 0.5360708832740784, + "learning_rate": 2.51e-05, + "loss": 1.1747, + "step": 996 + }, + { + "epoch": 1.4896577485175329, + "grad_norm": 0.17975042760372162, + "learning_rate": 2.5075e-05, + "loss": 1.1739, + "step": 997 + }, + { + "epoch": 1.4911518886865573, + "grad_norm": 0.4710841476917267, + "learning_rate": 2.5050000000000002e-05, + "loss": 1.4195, + "step": 998 + }, + { + "epoch": 1.492646028855582, + "grad_norm": 0.2385946810245514, + "learning_rate": 2.5025e-05, + "loss": 1.3315, + "step": 999 + }, + { + "epoch": 1.4941401690246066, + "grad_norm": 0.424209326505661, + "learning_rate": 2.5e-05, + "loss": 1.2433, + "step": 1000 + }, + { + "epoch": 1.4956343091936313, + "grad_norm": 0.34269607067108154, + "learning_rate": 2.4975e-05, + "loss": 1.4392, + "step": 1001 + }, + { + "epoch": 1.4971284493626558, + "grad_norm": 1.2078485488891602, + "learning_rate": 2.495e-05, + "loss": 1.3833, + "step": 1002 + }, + { + "epoch": 1.4986225895316805, + "grad_norm": 0.3181706666946411, + "learning_rate": 2.4925000000000003e-05, + "loss": 1.3839, + "step": 1003 + }, + { + "epoch": 1.500116729700705, + "grad_norm": 0.4560883045196533, + "learning_rate": 2.4900000000000002e-05, + "loss": 1.48, + "step": 1004 + }, + { + "epoch": 1.5016108698697297, + "grad_norm": 0.5617378950119019, + "learning_rate": 2.4875e-05, + "loss": 1.216, + "step": 1005 + }, + { + "epoch": 1.5031050100387544, + "grad_norm": 0.28854793310165405, + "learning_rate": 2.485e-05, + "loss": 1.2834, + "step": 1006 + }, + { + "epoch": 1.504599150207779, + "grad_norm": 0.29387474060058594, + "learning_rate": 2.4825e-05, + "loss": 1.0273, + "step": 1007 + }, + { + "epoch": 1.5060932903768034, + "grad_norm": 1.235810399055481, + "learning_rate": 2.48e-05, + "loss": 1.4177, + "step": 1008 + }, + { + "epoch": 1.5075874305458281, + "grad_norm": 0.2525739371776581, + "learning_rate": 2.4775000000000003e-05, + "loss": 1.4798, + "step": 1009 + }, + { + "epoch": 1.5090815707148528, + "grad_norm": 0.437465101480484, + "learning_rate": 2.4750000000000002e-05, + "loss": 1.6346, + "step": 1010 + }, + { + "epoch": 1.5105757108838773, + "grad_norm": 0.19477544724941254, + "learning_rate": 2.4725e-05, + "loss": 1.2425, + "step": 1011 + }, + { + "epoch": 1.5120698510529018, + "grad_norm": 0.23110416531562805, + "learning_rate": 2.47e-05, + "loss": 1.2046, + "step": 1012 + }, + { + "epoch": 1.5135639912219265, + "grad_norm": 0.2584448456764221, + "learning_rate": 2.4675e-05, + "loss": 1.2616, + "step": 1013 + }, + { + "epoch": 1.5150581313909512, + "grad_norm": 0.24508026242256165, + "learning_rate": 2.465e-05, + "loss": 1.5643, + "step": 1014 + }, + { + "epoch": 1.5165522715599757, + "grad_norm": 0.6831825375556946, + "learning_rate": 2.4625000000000002e-05, + "loss": 1.3956, + "step": 1015 + }, + { + "epoch": 1.5180464117290002, + "grad_norm": 0.2081739604473114, + "learning_rate": 2.46e-05, + "loss": 1.1868, + "step": 1016 + }, + { + "epoch": 1.519540551898025, + "grad_norm": 0.36565127968788147, + "learning_rate": 2.4575e-05, + "loss": 1.3864, + "step": 1017 + }, + { + "epoch": 1.5210346920670497, + "grad_norm": 1.2851471900939941, + "learning_rate": 2.455e-05, + "loss": 1.3694, + "step": 1018 + }, + { + "epoch": 1.5225288322360742, + "grad_norm": 0.24699068069458008, + "learning_rate": 2.4525e-05, + "loss": 1.4973, + "step": 1019 + }, + { + "epoch": 1.5240229724050987, + "grad_norm": 0.25761643052101135, + "learning_rate": 2.45e-05, + "loss": 1.0198, + "step": 1020 + }, + { + "epoch": 1.5255171125741234, + "grad_norm": 0.24483232200145721, + "learning_rate": 2.4475000000000002e-05, + "loss": 1.2763, + "step": 1021 + }, + { + "epoch": 1.527011252743148, + "grad_norm": 0.23036424815654755, + "learning_rate": 2.445e-05, + "loss": 1.2409, + "step": 1022 + }, + { + "epoch": 1.5285053929121726, + "grad_norm": 0.3976009786128998, + "learning_rate": 2.4425e-05, + "loss": 1.2585, + "step": 1023 + }, + { + "epoch": 1.529999533081197, + "grad_norm": 0.21544378995895386, + "learning_rate": 2.44e-05, + "loss": 1.5668, + "step": 1024 + }, + { + "epoch": 1.5314936732502218, + "grad_norm": 0.19233135879039764, + "learning_rate": 2.4375e-05, + "loss": 1.3254, + "step": 1025 + }, + { + "epoch": 1.5329878134192465, + "grad_norm": 0.22133982181549072, + "learning_rate": 2.435e-05, + "loss": 1.3993, + "step": 1026 + }, + { + "epoch": 1.534481953588271, + "grad_norm": 0.324274480342865, + "learning_rate": 2.4325000000000002e-05, + "loss": 1.4625, + "step": 1027 + }, + { + "epoch": 1.5359760937572955, + "grad_norm": 0.21806937456130981, + "learning_rate": 2.43e-05, + "loss": 1.3523, + "step": 1028 + }, + { + "epoch": 1.5374702339263202, + "grad_norm": 0.23030443489551544, + "learning_rate": 2.4275e-05, + "loss": 1.2272, + "step": 1029 + }, + { + "epoch": 1.538964374095345, + "grad_norm": 0.37339186668395996, + "learning_rate": 2.425e-05, + "loss": 1.3109, + "step": 1030 + }, + { + "epoch": 1.5404585142643694, + "grad_norm": 0.26914238929748535, + "learning_rate": 2.4225e-05, + "loss": 1.2883, + "step": 1031 + }, + { + "epoch": 1.541952654433394, + "grad_norm": 0.20728585124015808, + "learning_rate": 2.4200000000000002e-05, + "loss": 1.2369, + "step": 1032 + }, + { + "epoch": 1.5434467946024186, + "grad_norm": 0.4167015254497528, + "learning_rate": 2.4175e-05, + "loss": 1.36, + "step": 1033 + }, + { + "epoch": 1.5449409347714433, + "grad_norm": 0.255819171667099, + "learning_rate": 2.415e-05, + "loss": 1.5077, + "step": 1034 + }, + { + "epoch": 1.5464350749404678, + "grad_norm": 0.2544974386692047, + "learning_rate": 2.4125e-05, + "loss": 1.3009, + "step": 1035 + }, + { + "epoch": 1.5479292151094923, + "grad_norm": 0.27418991923332214, + "learning_rate": 2.41e-05, + "loss": 1.425, + "step": 1036 + }, + { + "epoch": 1.549423355278517, + "grad_norm": 0.5164713263511658, + "learning_rate": 2.4075e-05, + "loss": 1.5062, + "step": 1037 + }, + { + "epoch": 1.5509174954475418, + "grad_norm": 0.19975262880325317, + "learning_rate": 2.4050000000000002e-05, + "loss": 1.3069, + "step": 1038 + }, + { + "epoch": 1.5524116356165663, + "grad_norm": 0.2381182760000229, + "learning_rate": 2.4025e-05, + "loss": 1.3012, + "step": 1039 + }, + { + "epoch": 1.5539057757855907, + "grad_norm": 0.2583548426628113, + "learning_rate": 2.4e-05, + "loss": 1.1985, + "step": 1040 + }, + { + "epoch": 1.5553999159546155, + "grad_norm": 0.2762662470340729, + "learning_rate": 2.3975e-05, + "loss": 1.354, + "step": 1041 + }, + { + "epoch": 1.5568940561236402, + "grad_norm": 0.2436295598745346, + "learning_rate": 2.395e-05, + "loss": 1.5246, + "step": 1042 + }, + { + "epoch": 1.5583881962926647, + "grad_norm": 0.24542199075222015, + "learning_rate": 2.3925e-05, + "loss": 1.4547, + "step": 1043 + }, + { + "epoch": 1.5598823364616892, + "grad_norm": 0.27998894453048706, + "learning_rate": 2.39e-05, + "loss": 1.337, + "step": 1044 + }, + { + "epoch": 1.5613764766307139, + "grad_norm": 0.2425457239151001, + "learning_rate": 2.3875e-05, + "loss": 1.219, + "step": 1045 + }, + { + "epoch": 1.5628706167997386, + "grad_norm": 0.26620247960090637, + "learning_rate": 2.385e-05, + "loss": 1.4698, + "step": 1046 + }, + { + "epoch": 1.564364756968763, + "grad_norm": 0.2812005579471588, + "learning_rate": 2.3825e-05, + "loss": 1.2821, + "step": 1047 + }, + { + "epoch": 1.5658588971377876, + "grad_norm": 0.24852269887924194, + "learning_rate": 2.38e-05, + "loss": 1.2905, + "step": 1048 + }, + { + "epoch": 1.5673530373068123, + "grad_norm": 0.2867446541786194, + "learning_rate": 2.3775e-05, + "loss": 1.3753, + "step": 1049 + }, + { + "epoch": 1.568847177475837, + "grad_norm": 0.18901658058166504, + "learning_rate": 2.375e-05, + "loss": 1.2445, + "step": 1050 + }, + { + "epoch": 1.5703413176448615, + "grad_norm": 0.2708028256893158, + "learning_rate": 2.3725e-05, + "loss": 1.3819, + "step": 1051 + }, + { + "epoch": 1.571835457813886, + "grad_norm": 0.2241097092628479, + "learning_rate": 2.37e-05, + "loss": 1.2094, + "step": 1052 + }, + { + "epoch": 1.5733295979829107, + "grad_norm": 0.22352498769760132, + "learning_rate": 2.3675e-05, + "loss": 1.2761, + "step": 1053 + }, + { + "epoch": 1.5748237381519354, + "grad_norm": 0.28223153948783875, + "learning_rate": 2.365e-05, + "loss": 1.2256, + "step": 1054 + }, + { + "epoch": 1.57631787832096, + "grad_norm": 0.17800620198249817, + "learning_rate": 2.3624999999999998e-05, + "loss": 1.1712, + "step": 1055 + }, + { + "epoch": 1.5778120184899846, + "grad_norm": 0.39222070574760437, + "learning_rate": 2.36e-05, + "loss": 1.5474, + "step": 1056 + }, + { + "epoch": 1.5793061586590094, + "grad_norm": 0.24694950878620148, + "learning_rate": 2.3575e-05, + "loss": 1.4975, + "step": 1057 + }, + { + "epoch": 1.5808002988280339, + "grad_norm": 0.2800028622150421, + "learning_rate": 2.355e-05, + "loss": 1.4577, + "step": 1058 + }, + { + "epoch": 1.5822944389970584, + "grad_norm": 0.22472792863845825, + "learning_rate": 2.3525e-05, + "loss": 1.2913, + "step": 1059 + }, + { + "epoch": 1.583788579166083, + "grad_norm": 0.37456855177879333, + "learning_rate": 2.35e-05, + "loss": 1.5394, + "step": 1060 + }, + { + "epoch": 1.5852827193351078, + "grad_norm": 0.25892317295074463, + "learning_rate": 2.3475e-05, + "loss": 1.3467, + "step": 1061 + }, + { + "epoch": 1.5867768595041323, + "grad_norm": 0.2283022403717041, + "learning_rate": 2.345e-05, + "loss": 1.421, + "step": 1062 + }, + { + "epoch": 1.5882709996731568, + "grad_norm": 0.4213232398033142, + "learning_rate": 2.3425000000000004e-05, + "loss": 1.1265, + "step": 1063 + }, + { + "epoch": 1.5897651398421815, + "grad_norm": 0.1526349037885666, + "learning_rate": 2.3400000000000003e-05, + "loss": 1.2766, + "step": 1064 + }, + { + "epoch": 1.5912592800112062, + "grad_norm": 0.3344525098800659, + "learning_rate": 2.3375000000000002e-05, + "loss": 1.2623, + "step": 1065 + }, + { + "epoch": 1.5927534201802307, + "grad_norm": 0.2371218055486679, + "learning_rate": 2.3350000000000002e-05, + "loss": 1.4743, + "step": 1066 + }, + { + "epoch": 1.5942475603492552, + "grad_norm": 0.2002929300069809, + "learning_rate": 2.3325e-05, + "loss": 1.301, + "step": 1067 + }, + { + "epoch": 1.59574170051828, + "grad_norm": 0.24492183327674866, + "learning_rate": 2.3300000000000004e-05, + "loss": 1.3952, + "step": 1068 + }, + { + "epoch": 1.5972358406873046, + "grad_norm": 0.36826092004776, + "learning_rate": 2.3275000000000003e-05, + "loss": 1.1323, + "step": 1069 + }, + { + "epoch": 1.5987299808563291, + "grad_norm": 0.22465544939041138, + "learning_rate": 2.3250000000000003e-05, + "loss": 1.482, + "step": 1070 + }, + { + "epoch": 1.6002241210253536, + "grad_norm": 0.40929779410362244, + "learning_rate": 2.3225000000000002e-05, + "loss": 1.3384, + "step": 1071 + }, + { + "epoch": 1.6017182611943783, + "grad_norm": 0.26005005836486816, + "learning_rate": 2.32e-05, + "loss": 1.2387, + "step": 1072 + }, + { + "epoch": 1.603212401363403, + "grad_norm": 0.2737712264060974, + "learning_rate": 2.3175e-05, + "loss": 1.3755, + "step": 1073 + }, + { + "epoch": 1.6047065415324275, + "grad_norm": 0.20897279679775238, + "learning_rate": 2.3150000000000004e-05, + "loss": 1.5051, + "step": 1074 + }, + { + "epoch": 1.606200681701452, + "grad_norm": 0.23773355782032013, + "learning_rate": 2.3125000000000003e-05, + "loss": 1.2317, + "step": 1075 + }, + { + "epoch": 1.6076948218704767, + "grad_norm": 0.17421811819076538, + "learning_rate": 2.3100000000000002e-05, + "loss": 1.3285, + "step": 1076 + }, + { + "epoch": 1.6091889620395015, + "grad_norm": 0.242172509431839, + "learning_rate": 2.3075000000000002e-05, + "loss": 1.3524, + "step": 1077 + }, + { + "epoch": 1.610683102208526, + "grad_norm": 0.1881755292415619, + "learning_rate": 2.305e-05, + "loss": 1.3131, + "step": 1078 + }, + { + "epoch": 1.6121772423775504, + "grad_norm": 0.39744487404823303, + "learning_rate": 2.3025e-05, + "loss": 1.4211, + "step": 1079 + }, + { + "epoch": 1.6136713825465752, + "grad_norm": 0.28824132680892944, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.4354, + "step": 1080 + }, + { + "epoch": 1.6151655227155999, + "grad_norm": 0.23316174745559692, + "learning_rate": 2.2975000000000003e-05, + "loss": 1.3879, + "step": 1081 + }, + { + "epoch": 1.6166596628846244, + "grad_norm": 0.20622767508029938, + "learning_rate": 2.2950000000000002e-05, + "loss": 1.4856, + "step": 1082 + }, + { + "epoch": 1.6181538030536489, + "grad_norm": 0.26608648896217346, + "learning_rate": 2.2925e-05, + "loss": 1.3223, + "step": 1083 + }, + { + "epoch": 1.6196479432226736, + "grad_norm": 0.38514405488967896, + "learning_rate": 2.29e-05, + "loss": 1.2364, + "step": 1084 + }, + { + "epoch": 1.6211420833916983, + "grad_norm": 0.4655008912086487, + "learning_rate": 2.2875e-05, + "loss": 1.283, + "step": 1085 + }, + { + "epoch": 1.6226362235607228, + "grad_norm": 0.334542453289032, + "learning_rate": 2.2850000000000003e-05, + "loss": 1.4633, + "step": 1086 + }, + { + "epoch": 1.6241303637297473, + "grad_norm": 0.2390364110469818, + "learning_rate": 2.2825000000000003e-05, + "loss": 1.4632, + "step": 1087 + }, + { + "epoch": 1.625624503898772, + "grad_norm": 0.2931368350982666, + "learning_rate": 2.2800000000000002e-05, + "loss": 1.3178, + "step": 1088 + }, + { + "epoch": 1.6271186440677967, + "grad_norm": 0.2291143387556076, + "learning_rate": 2.2775e-05, + "loss": 1.4035, + "step": 1089 + }, + { + "epoch": 1.6286127842368212, + "grad_norm": 0.21086153388023376, + "learning_rate": 2.275e-05, + "loss": 1.4254, + "step": 1090 + }, + { + "epoch": 1.6301069244058457, + "grad_norm": 0.20831753313541412, + "learning_rate": 2.2725000000000003e-05, + "loss": 1.332, + "step": 1091 + }, + { + "epoch": 1.6316010645748704, + "grad_norm": 0.3618925213813782, + "learning_rate": 2.2700000000000003e-05, + "loss": 1.2239, + "step": 1092 + }, + { + "epoch": 1.6330952047438951, + "grad_norm": 0.3585363030433655, + "learning_rate": 2.2675000000000002e-05, + "loss": 1.2772, + "step": 1093 + }, + { + "epoch": 1.6345893449129196, + "grad_norm": 0.24144862592220306, + "learning_rate": 2.265e-05, + "loss": 1.4067, + "step": 1094 + }, + { + "epoch": 1.6360834850819441, + "grad_norm": 0.3754459619522095, + "learning_rate": 2.2625e-05, + "loss": 1.466, + "step": 1095 + }, + { + "epoch": 1.6375776252509688, + "grad_norm": 0.21260520815849304, + "learning_rate": 2.26e-05, + "loss": 1.1853, + "step": 1096 + }, + { + "epoch": 1.6390717654199936, + "grad_norm": 0.21034535765647888, + "learning_rate": 2.2575000000000003e-05, + "loss": 1.1168, + "step": 1097 + }, + { + "epoch": 1.640565905589018, + "grad_norm": 0.23104006052017212, + "learning_rate": 2.2550000000000003e-05, + "loss": 1.2363, + "step": 1098 + }, + { + "epoch": 1.6420600457580425, + "grad_norm": 0.23428553342819214, + "learning_rate": 2.2525000000000002e-05, + "loss": 1.3021, + "step": 1099 + }, + { + "epoch": 1.6435541859270673, + "grad_norm": 0.4666197896003723, + "learning_rate": 2.25e-05, + "loss": 1.4213, + "step": 1100 + }, + { + "epoch": 1.645048326096092, + "grad_norm": 0.6136429309844971, + "learning_rate": 2.2475e-05, + "loss": 1.2789, + "step": 1101 + }, + { + "epoch": 1.6465424662651165, + "grad_norm": 0.5505061149597168, + "learning_rate": 2.245e-05, + "loss": 1.3622, + "step": 1102 + }, + { + "epoch": 1.648036606434141, + "grad_norm": 0.20704680681228638, + "learning_rate": 2.2425000000000003e-05, + "loss": 1.3519, + "step": 1103 + }, + { + "epoch": 1.6495307466031657, + "grad_norm": 0.24088571965694427, + "learning_rate": 2.2400000000000002e-05, + "loss": 1.2892, + "step": 1104 + }, + { + "epoch": 1.6510248867721904, + "grad_norm": 0.6935633420944214, + "learning_rate": 2.2375000000000002e-05, + "loss": 1.332, + "step": 1105 + }, + { + "epoch": 1.6525190269412149, + "grad_norm": 0.2827127277851105, + "learning_rate": 2.235e-05, + "loss": 1.3024, + "step": 1106 + }, + { + "epoch": 1.6540131671102394, + "grad_norm": 0.25590217113494873, + "learning_rate": 2.2325e-05, + "loss": 1.3248, + "step": 1107 + }, + { + "epoch": 1.655507307279264, + "grad_norm": 0.19156008958816528, + "learning_rate": 2.23e-05, + "loss": 1.4835, + "step": 1108 + }, + { + "epoch": 1.6570014474482888, + "grad_norm": 0.18704158067703247, + "learning_rate": 2.2275000000000003e-05, + "loss": 1.2183, + "step": 1109 + }, + { + "epoch": 1.6584955876173133, + "grad_norm": 0.21460570394992828, + "learning_rate": 2.2250000000000002e-05, + "loss": 1.2204, + "step": 1110 + }, + { + "epoch": 1.6599897277863378, + "grad_norm": 0.4342014789581299, + "learning_rate": 2.2225e-05, + "loss": 1.5425, + "step": 1111 + }, + { + "epoch": 1.6614838679553625, + "grad_norm": 0.25505074858665466, + "learning_rate": 2.22e-05, + "loss": 1.2719, + "step": 1112 + }, + { + "epoch": 1.6629780081243872, + "grad_norm": 0.4050869047641754, + "learning_rate": 2.2175e-05, + "loss": 1.2727, + "step": 1113 + }, + { + "epoch": 1.6644721482934117, + "grad_norm": 6.226749897003174, + "learning_rate": 2.215e-05, + "loss": 1.149, + "step": 1114 + }, + { + "epoch": 1.6659662884624362, + "grad_norm": 0.17852473258972168, + "learning_rate": 2.2125000000000002e-05, + "loss": 1.3976, + "step": 1115 + }, + { + "epoch": 1.667460428631461, + "grad_norm": 0.3547992706298828, + "learning_rate": 2.2100000000000002e-05, + "loss": 1.2597, + "step": 1116 + }, + { + "epoch": 1.6689545688004856, + "grad_norm": 0.285022109746933, + "learning_rate": 2.2075e-05, + "loss": 1.4945, + "step": 1117 + }, + { + "epoch": 1.6704487089695101, + "grad_norm": 0.2697555422782898, + "learning_rate": 2.205e-05, + "loss": 1.2034, + "step": 1118 + }, + { + "epoch": 1.6719428491385349, + "grad_norm": 0.2486453652381897, + "learning_rate": 2.2025e-05, + "loss": 1.3333, + "step": 1119 + }, + { + "epoch": 1.6734369893075596, + "grad_norm": 0.3298685550689697, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.2115, + "step": 1120 + }, + { + "epoch": 1.674931129476584, + "grad_norm": 0.2030079960823059, + "learning_rate": 2.1975000000000002e-05, + "loss": 1.4112, + "step": 1121 + }, + { + "epoch": 1.6764252696456086, + "grad_norm": 0.32204288244247437, + "learning_rate": 2.195e-05, + "loss": 1.3559, + "step": 1122 + }, + { + "epoch": 1.6779194098146333, + "grad_norm": 0.22706016898155212, + "learning_rate": 2.1925e-05, + "loss": 1.1504, + "step": 1123 + }, + { + "epoch": 1.679413549983658, + "grad_norm": 0.3011496067047119, + "learning_rate": 2.19e-05, + "loss": 1.3425, + "step": 1124 + }, + { + "epoch": 1.6809076901526825, + "grad_norm": 0.2117116004228592, + "learning_rate": 2.1875e-05, + "loss": 1.428, + "step": 1125 + }, + { + "epoch": 1.682401830321707, + "grad_norm": 0.2260589450597763, + "learning_rate": 2.1850000000000003e-05, + "loss": 1.2989, + "step": 1126 + }, + { + "epoch": 1.6838959704907317, + "grad_norm": 0.282379150390625, + "learning_rate": 2.1825000000000002e-05, + "loss": 1.5454, + "step": 1127 + }, + { + "epoch": 1.6853901106597564, + "grad_norm": 0.2984197735786438, + "learning_rate": 2.18e-05, + "loss": 1.4015, + "step": 1128 + }, + { + "epoch": 1.686884250828781, + "grad_norm": 0.23150713741779327, + "learning_rate": 2.1775e-05, + "loss": 1.1436, + "step": 1129 + }, + { + "epoch": 1.6883783909978054, + "grad_norm": 0.21945536136627197, + "learning_rate": 2.175e-05, + "loss": 1.3747, + "step": 1130 + }, + { + "epoch": 1.6898725311668301, + "grad_norm": 0.18954990804195404, + "learning_rate": 2.1725e-05, + "loss": 1.3046, + "step": 1131 + }, + { + "epoch": 1.6913666713358548, + "grad_norm": 0.3952544927597046, + "learning_rate": 2.1700000000000002e-05, + "loss": 1.3724, + "step": 1132 + }, + { + "epoch": 1.6928608115048793, + "grad_norm": 1.5406503677368164, + "learning_rate": 2.1675e-05, + "loss": 1.3255, + "step": 1133 + }, + { + "epoch": 1.6943549516739038, + "grad_norm": 0.20292794704437256, + "learning_rate": 2.165e-05, + "loss": 1.2522, + "step": 1134 + }, + { + "epoch": 1.6958490918429285, + "grad_norm": 0.25634780526161194, + "learning_rate": 2.1625e-05, + "loss": 1.3941, + "step": 1135 + }, + { + "epoch": 1.6973432320119533, + "grad_norm": 0.5052337050437927, + "learning_rate": 2.16e-05, + "loss": 1.49, + "step": 1136 + }, + { + "epoch": 1.6988373721809777, + "grad_norm": 0.3816353678703308, + "learning_rate": 2.1575e-05, + "loss": 1.367, + "step": 1137 + }, + { + "epoch": 1.7003315123500022, + "grad_norm": 0.19778917729854584, + "learning_rate": 2.1550000000000002e-05, + "loss": 1.3428, + "step": 1138 + }, + { + "epoch": 1.701825652519027, + "grad_norm": 0.214355006814003, + "learning_rate": 2.1525e-05, + "loss": 1.3231, + "step": 1139 + }, + { + "epoch": 1.7033197926880517, + "grad_norm": 0.8011763691902161, + "learning_rate": 2.15e-05, + "loss": 1.2472, + "step": 1140 + }, + { + "epoch": 1.7048139328570762, + "grad_norm": 0.21572141349315643, + "learning_rate": 2.1475e-05, + "loss": 1.3991, + "step": 1141 + }, + { + "epoch": 1.7063080730261007, + "grad_norm": 0.17696185410022736, + "learning_rate": 2.145e-05, + "loss": 1.2861, + "step": 1142 + }, + { + "epoch": 1.7078022131951254, + "grad_norm": 0.2655123472213745, + "learning_rate": 2.1425e-05, + "loss": 1.2419, + "step": 1143 + }, + { + "epoch": 1.70929635336415, + "grad_norm": 0.30516761541366577, + "learning_rate": 2.1400000000000002e-05, + "loss": 1.2123, + "step": 1144 + }, + { + "epoch": 1.7107904935331746, + "grad_norm": 0.33619871735572815, + "learning_rate": 2.1375e-05, + "loss": 1.3795, + "step": 1145 + }, + { + "epoch": 1.712284633702199, + "grad_norm": 0.3656558096408844, + "learning_rate": 2.135e-05, + "loss": 1.4199, + "step": 1146 + }, + { + "epoch": 1.7137787738712238, + "grad_norm": 0.17710858583450317, + "learning_rate": 2.1325e-05, + "loss": 1.2199, + "step": 1147 + }, + { + "epoch": 1.7152729140402485, + "grad_norm": 0.2536126375198364, + "learning_rate": 2.13e-05, + "loss": 1.3821, + "step": 1148 + }, + { + "epoch": 1.716767054209273, + "grad_norm": 0.5065903663635254, + "learning_rate": 2.1275000000000002e-05, + "loss": 1.2265, + "step": 1149 + }, + { + "epoch": 1.7182611943782975, + "grad_norm": 0.21185562014579773, + "learning_rate": 2.125e-05, + "loss": 1.3668, + "step": 1150 + }, + { + "epoch": 1.7197553345473222, + "grad_norm": 3.1472411155700684, + "learning_rate": 2.1225e-05, + "loss": 1.4943, + "step": 1151 + }, + { + "epoch": 1.721249474716347, + "grad_norm": 0.19197702407836914, + "learning_rate": 2.12e-05, + "loss": 1.328, + "step": 1152 + }, + { + "epoch": 1.7227436148853714, + "grad_norm": 0.34128865599632263, + "learning_rate": 2.1175e-05, + "loss": 1.2109, + "step": 1153 + }, + { + "epoch": 1.724237755054396, + "grad_norm": 0.25855574011802673, + "learning_rate": 2.115e-05, + "loss": 1.5121, + "step": 1154 + }, + { + "epoch": 1.7257318952234206, + "grad_norm": 0.32073017954826355, + "learning_rate": 2.1125000000000002e-05, + "loss": 1.2945, + "step": 1155 + }, + { + "epoch": 1.7272260353924453, + "grad_norm": 0.449855774641037, + "learning_rate": 2.11e-05, + "loss": 1.4703, + "step": 1156 + }, + { + "epoch": 1.7287201755614698, + "grad_norm": 0.6917439103126526, + "learning_rate": 2.1075e-05, + "loss": 1.4651, + "step": 1157 + }, + { + "epoch": 1.7302143157304943, + "grad_norm": 0.5707057118415833, + "learning_rate": 2.105e-05, + "loss": 1.3591, + "step": 1158 + }, + { + "epoch": 1.731708455899519, + "grad_norm": 0.24529029428958893, + "learning_rate": 2.1025e-05, + "loss": 1.3453, + "step": 1159 + }, + { + "epoch": 1.7332025960685438, + "grad_norm": 0.24148188531398773, + "learning_rate": 2.1e-05, + "loss": 1.4411, + "step": 1160 + }, + { + "epoch": 1.7346967362375683, + "grad_norm": 0.28139758110046387, + "learning_rate": 2.0975e-05, + "loss": 1.263, + "step": 1161 + }, + { + "epoch": 1.7361908764065928, + "grad_norm": 0.27365148067474365, + "learning_rate": 2.095e-05, + "loss": 1.2998, + "step": 1162 + }, + { + "epoch": 1.7376850165756175, + "grad_norm": 0.2578750550746918, + "learning_rate": 2.0925e-05, + "loss": 1.0968, + "step": 1163 + }, + { + "epoch": 1.7391791567446422, + "grad_norm": 0.27883380651474, + "learning_rate": 2.09e-05, + "loss": 1.2196, + "step": 1164 + }, + { + "epoch": 1.7406732969136667, + "grad_norm": 0.22383853793144226, + "learning_rate": 2.0875e-05, + "loss": 1.3016, + "step": 1165 + }, + { + "epoch": 1.7421674370826912, + "grad_norm": 0.2515240013599396, + "learning_rate": 2.085e-05, + "loss": 1.4733, + "step": 1166 + }, + { + "epoch": 1.7436615772517159, + "grad_norm": 0.2814958393573761, + "learning_rate": 2.0825e-05, + "loss": 1.2962, + "step": 1167 + }, + { + "epoch": 1.7451557174207406, + "grad_norm": 0.23939631879329681, + "learning_rate": 2.08e-05, + "loss": 1.4228, + "step": 1168 + }, + { + "epoch": 1.746649857589765, + "grad_norm": 0.25634765625, + "learning_rate": 2.0775e-05, + "loss": 1.2722, + "step": 1169 + }, + { + "epoch": 1.7481439977587896, + "grad_norm": 1.8628501892089844, + "learning_rate": 2.075e-05, + "loss": 1.2133, + "step": 1170 + }, + { + "epoch": 1.7496381379278143, + "grad_norm": 0.2669479548931122, + "learning_rate": 2.0725e-05, + "loss": 1.2378, + "step": 1171 + }, + { + "epoch": 1.751132278096839, + "grad_norm": 0.23712307214736938, + "learning_rate": 2.07e-05, + "loss": 1.1109, + "step": 1172 + }, + { + "epoch": 1.7526264182658635, + "grad_norm": 0.18234212696552277, + "learning_rate": 2.0675e-05, + "loss": 1.1441, + "step": 1173 + }, + { + "epoch": 1.754120558434888, + "grad_norm": 0.30427947640419006, + "learning_rate": 2.065e-05, + "loss": 1.3749, + "step": 1174 + }, + { + "epoch": 1.7556146986039127, + "grad_norm": 0.21525146067142487, + "learning_rate": 2.0625e-05, + "loss": 1.3148, + "step": 1175 + }, + { + "epoch": 1.7571088387729374, + "grad_norm": 0.36460453271865845, + "learning_rate": 2.06e-05, + "loss": 1.2794, + "step": 1176 + }, + { + "epoch": 1.758602978941962, + "grad_norm": 0.38430091738700867, + "learning_rate": 2.0575e-05, + "loss": 1.2026, + "step": 1177 + }, + { + "epoch": 1.7600971191109867, + "grad_norm": 0.3811454474925995, + "learning_rate": 2.055e-05, + "loss": 1.0977, + "step": 1178 + }, + { + "epoch": 1.7615912592800114, + "grad_norm": 0.2520235478878021, + "learning_rate": 2.0525e-05, + "loss": 1.3843, + "step": 1179 + }, + { + "epoch": 1.7630853994490359, + "grad_norm": 0.30874738097190857, + "learning_rate": 2.05e-05, + "loss": 1.2541, + "step": 1180 + }, + { + "epoch": 1.7645795396180604, + "grad_norm": 0.378074586391449, + "learning_rate": 2.0475e-05, + "loss": 1.1875, + "step": 1181 + }, + { + "epoch": 1.766073679787085, + "grad_norm": 0.39348742365837097, + "learning_rate": 2.045e-05, + "loss": 1.2389, + "step": 1182 + }, + { + "epoch": 1.7675678199561098, + "grad_norm": 0.2662319540977478, + "learning_rate": 2.0425e-05, + "loss": 1.3791, + "step": 1183 + }, + { + "epoch": 1.7690619601251343, + "grad_norm": 0.27314624190330505, + "learning_rate": 2.04e-05, + "loss": 1.5183, + "step": 1184 + }, + { + "epoch": 1.7705561002941588, + "grad_norm": 0.2799110412597656, + "learning_rate": 2.0375e-05, + "loss": 1.3552, + "step": 1185 + }, + { + "epoch": 1.7720502404631835, + "grad_norm": 0.38711944222450256, + "learning_rate": 2.035e-05, + "loss": 1.3103, + "step": 1186 + }, + { + "epoch": 1.7735443806322082, + "grad_norm": 0.22410157322883606, + "learning_rate": 2.0325e-05, + "loss": 1.3525, + "step": 1187 + }, + { + "epoch": 1.7750385208012327, + "grad_norm": 0.21705371141433716, + "learning_rate": 2.0300000000000002e-05, + "loss": 1.3232, + "step": 1188 + }, + { + "epoch": 1.7765326609702572, + "grad_norm": 0.18889586627483368, + "learning_rate": 2.0275e-05, + "loss": 1.0816, + "step": 1189 + }, + { + "epoch": 1.778026801139282, + "grad_norm": 0.32244136929512024, + "learning_rate": 2.025e-05, + "loss": 1.3168, + "step": 1190 + }, + { + "epoch": 1.7795209413083066, + "grad_norm": 0.17929215729236603, + "learning_rate": 2.0225000000000004e-05, + "loss": 1.286, + "step": 1191 + }, + { + "epoch": 1.7810150814773311, + "grad_norm": 0.21204537153244019, + "learning_rate": 2.0200000000000003e-05, + "loss": 1.1635, + "step": 1192 + }, + { + "epoch": 1.7825092216463556, + "grad_norm": 0.22701428830623627, + "learning_rate": 2.0175000000000003e-05, + "loss": 1.4269, + "step": 1193 + }, + { + "epoch": 1.7840033618153803, + "grad_norm": 0.37344813346862793, + "learning_rate": 2.0150000000000002e-05, + "loss": 1.4175, + "step": 1194 + }, + { + "epoch": 1.785497501984405, + "grad_norm": 0.2566252052783966, + "learning_rate": 2.0125e-05, + "loss": 1.332, + "step": 1195 + }, + { + "epoch": 1.7869916421534295, + "grad_norm": 0.3862023949623108, + "learning_rate": 2.01e-05, + "loss": 1.5337, + "step": 1196 + }, + { + "epoch": 1.788485782322454, + "grad_norm": 0.20119082927703857, + "learning_rate": 2.0075000000000003e-05, + "loss": 1.2904, + "step": 1197 + }, + { + "epoch": 1.7899799224914787, + "grad_norm": 0.3420519530773163, + "learning_rate": 2.0050000000000003e-05, + "loss": 1.1776, + "step": 1198 + }, + { + "epoch": 1.7914740626605035, + "grad_norm": 0.22510448098182678, + "learning_rate": 2.0025000000000002e-05, + "loss": 1.2816, + "step": 1199 + }, + { + "epoch": 1.792968202829528, + "grad_norm": 0.5224809050559998, + "learning_rate": 2e-05, + "loss": 1.357, + "step": 1200 + }, + { + "epoch": 1.7944623429985525, + "grad_norm": 0.23496760427951813, + "learning_rate": 1.9975e-05, + "loss": 1.3329, + "step": 1201 + }, + { + "epoch": 1.7959564831675772, + "grad_norm": 0.2106732875108719, + "learning_rate": 1.995e-05, + "loss": 1.4448, + "step": 1202 + }, + { + "epoch": 1.7974506233366019, + "grad_norm": 0.22301816940307617, + "learning_rate": 1.9925000000000003e-05, + "loss": 1.4227, + "step": 1203 + }, + { + "epoch": 1.7989447635056264, + "grad_norm": 0.20743943750858307, + "learning_rate": 1.9900000000000003e-05, + "loss": 1.4388, + "step": 1204 + }, + { + "epoch": 1.8004389036746509, + "grad_norm": 0.24213577806949615, + "learning_rate": 1.9875000000000002e-05, + "loss": 1.3773, + "step": 1205 + }, + { + "epoch": 1.8019330438436756, + "grad_norm": 0.2809053063392639, + "learning_rate": 1.985e-05, + "loss": 1.3239, + "step": 1206 + }, + { + "epoch": 1.8034271840127003, + "grad_norm": 0.17947784066200256, + "learning_rate": 1.9825e-05, + "loss": 1.3838, + "step": 1207 + }, + { + "epoch": 1.8049213241817248, + "grad_norm": 0.249902606010437, + "learning_rate": 1.9800000000000004e-05, + "loss": 1.3094, + "step": 1208 + }, + { + "epoch": 1.8064154643507493, + "grad_norm": 0.3139473795890808, + "learning_rate": 1.9775000000000003e-05, + "loss": 1.292, + "step": 1209 + }, + { + "epoch": 1.807909604519774, + "grad_norm": 0.3059343695640564, + "learning_rate": 1.9750000000000002e-05, + "loss": 1.4008, + "step": 1210 + }, + { + "epoch": 1.8094037446887987, + "grad_norm": 0.22615757584571838, + "learning_rate": 1.9725000000000002e-05, + "loss": 1.4586, + "step": 1211 + }, + { + "epoch": 1.8108978848578232, + "grad_norm": 0.395865261554718, + "learning_rate": 1.97e-05, + "loss": 1.2452, + "step": 1212 + }, + { + "epoch": 1.8123920250268477, + "grad_norm": 0.29632511734962463, + "learning_rate": 1.9675e-05, + "loss": 1.3564, + "step": 1213 + }, + { + "epoch": 1.8138861651958724, + "grad_norm": 0.3064177632331848, + "learning_rate": 1.9650000000000003e-05, + "loss": 1.2446, + "step": 1214 + }, + { + "epoch": 1.8153803053648971, + "grad_norm": 0.30816328525543213, + "learning_rate": 1.9625000000000003e-05, + "loss": 1.3273, + "step": 1215 + }, + { + "epoch": 1.8168744455339216, + "grad_norm": 0.18040741980075836, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.2015, + "step": 1216 + }, + { + "epoch": 1.8183685857029461, + "grad_norm": 0.24273858964443207, + "learning_rate": 1.9575e-05, + "loss": 1.2347, + "step": 1217 + }, + { + "epoch": 1.8198627258719708, + "grad_norm": 0.2441396564245224, + "learning_rate": 1.955e-05, + "loss": 1.3922, + "step": 1218 + }, + { + "epoch": 1.8213568660409956, + "grad_norm": 0.2110196053981781, + "learning_rate": 1.9525e-05, + "loss": 1.2187, + "step": 1219 + }, + { + "epoch": 1.82285100621002, + "grad_norm": 0.2482791244983673, + "learning_rate": 1.9500000000000003e-05, + "loss": 1.4491, + "step": 1220 + }, + { + "epoch": 1.8243451463790445, + "grad_norm": 1.2229297161102295, + "learning_rate": 1.9475000000000002e-05, + "loss": 1.3337, + "step": 1221 + }, + { + "epoch": 1.8258392865480693, + "grad_norm": 0.2569945156574249, + "learning_rate": 1.9450000000000002e-05, + "loss": 1.4649, + "step": 1222 + }, + { + "epoch": 1.827333426717094, + "grad_norm": 0.23530051112174988, + "learning_rate": 1.9425e-05, + "loss": 1.3162, + "step": 1223 + }, + { + "epoch": 1.8288275668861185, + "grad_norm": 0.23422133922576904, + "learning_rate": 1.94e-05, + "loss": 1.2149, + "step": 1224 + }, + { + "epoch": 1.830321707055143, + "grad_norm": 0.8085640072822571, + "learning_rate": 1.9375e-05, + "loss": 1.3529, + "step": 1225 + }, + { + "epoch": 1.8318158472241677, + "grad_norm": 0.23965558409690857, + "learning_rate": 1.9350000000000003e-05, + "loss": 1.2814, + "step": 1226 + }, + { + "epoch": 1.8333099873931924, + "grad_norm": 0.33934149146080017, + "learning_rate": 1.9325000000000002e-05, + "loss": 1.3801, + "step": 1227 + }, + { + "epoch": 1.834804127562217, + "grad_norm": 0.20922242105007172, + "learning_rate": 1.93e-05, + "loss": 1.2043, + "step": 1228 + }, + { + "epoch": 1.8362982677312414, + "grad_norm": 0.23952169716358185, + "learning_rate": 1.9275e-05, + "loss": 1.2686, + "step": 1229 + }, + { + "epoch": 1.837792407900266, + "grad_norm": 0.35494279861450195, + "learning_rate": 1.925e-05, + "loss": 1.42, + "step": 1230 + }, + { + "epoch": 1.8392865480692908, + "grad_norm": 0.2617303729057312, + "learning_rate": 1.9225e-05, + "loss": 1.3625, + "step": 1231 + }, + { + "epoch": 1.8407806882383153, + "grad_norm": 0.26504993438720703, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.2937, + "step": 1232 + }, + { + "epoch": 1.8422748284073398, + "grad_norm": 0.3170221149921417, + "learning_rate": 1.9175000000000002e-05, + "loss": 1.5529, + "step": 1233 + }, + { + "epoch": 1.8437689685763645, + "grad_norm": 0.24354490637779236, + "learning_rate": 1.915e-05, + "loss": 1.1315, + "step": 1234 + }, + { + "epoch": 1.8452631087453892, + "grad_norm": 0.2081626057624817, + "learning_rate": 1.9125e-05, + "loss": 1.4484, + "step": 1235 + }, + { + "epoch": 1.8467572489144137, + "grad_norm": 0.1551181972026825, + "learning_rate": 1.91e-05, + "loss": 1.3253, + "step": 1236 + }, + { + "epoch": 1.8482513890834384, + "grad_norm": 0.17210716009140015, + "learning_rate": 1.9075000000000003e-05, + "loss": 1.3218, + "step": 1237 + }, + { + "epoch": 1.8497455292524632, + "grad_norm": 0.2032608836889267, + "learning_rate": 1.9050000000000002e-05, + "loss": 1.2805, + "step": 1238 + }, + { + "epoch": 1.8512396694214877, + "grad_norm": 0.45856183767318726, + "learning_rate": 1.9025e-05, + "loss": 1.1087, + "step": 1239 + }, + { + "epoch": 1.8527338095905121, + "grad_norm": 0.18352068960666656, + "learning_rate": 1.9e-05, + "loss": 1.3764, + "step": 1240 + }, + { + "epoch": 1.8542279497595369, + "grad_norm": 0.2714996039867401, + "learning_rate": 1.8975e-05, + "loss": 1.5606, + "step": 1241 + }, + { + "epoch": 1.8557220899285616, + "grad_norm": 0.19584618508815765, + "learning_rate": 1.895e-05, + "loss": 1.3594, + "step": 1242 + }, + { + "epoch": 1.857216230097586, + "grad_norm": 0.2596234083175659, + "learning_rate": 1.8925000000000003e-05, + "loss": 1.3978, + "step": 1243 + }, + { + "epoch": 1.8587103702666106, + "grad_norm": 0.19500860571861267, + "learning_rate": 1.8900000000000002e-05, + "loss": 1.2443, + "step": 1244 + }, + { + "epoch": 1.8602045104356353, + "grad_norm": 0.6344171166419983, + "learning_rate": 1.8875e-05, + "loss": 1.591, + "step": 1245 + }, + { + "epoch": 1.86169865060466, + "grad_norm": 0.24306315183639526, + "learning_rate": 1.885e-05, + "loss": 1.3199, + "step": 1246 + }, + { + "epoch": 1.8631927907736845, + "grad_norm": 0.20791853964328766, + "learning_rate": 1.8825e-05, + "loss": 1.2792, + "step": 1247 + }, + { + "epoch": 1.864686930942709, + "grad_norm": 0.3387373685836792, + "learning_rate": 1.88e-05, + "loss": 1.3661, + "step": 1248 + }, + { + "epoch": 1.8661810711117337, + "grad_norm": 0.17687146365642548, + "learning_rate": 1.8775000000000002e-05, + "loss": 1.2467, + "step": 1249 + }, + { + "epoch": 1.8676752112807584, + "grad_norm": 0.2075313925743103, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.3595, + "step": 1250 + }, + { + "epoch": 1.869169351449783, + "grad_norm": 0.4056931734085083, + "learning_rate": 1.8725e-05, + "loss": 1.293, + "step": 1251 + }, + { + "epoch": 1.8706634916188074, + "grad_norm": 0.21190667152404785, + "learning_rate": 1.87e-05, + "loss": 1.2857, + "step": 1252 + }, + { + "epoch": 1.8721576317878321, + "grad_norm": 0.8928343653678894, + "learning_rate": 1.8675e-05, + "loss": 1.3515, + "step": 1253 + }, + { + "epoch": 1.8736517719568568, + "grad_norm": 0.20301412045955658, + "learning_rate": 1.865e-05, + "loss": 1.3271, + "step": 1254 + }, + { + "epoch": 1.8751459121258813, + "grad_norm": 0.21219702064990997, + "learning_rate": 1.8625000000000002e-05, + "loss": 1.3911, + "step": 1255 + }, + { + "epoch": 1.8766400522949058, + "grad_norm": 0.20927630364894867, + "learning_rate": 1.86e-05, + "loss": 1.1812, + "step": 1256 + }, + { + "epoch": 1.8781341924639305, + "grad_norm": 0.19185447692871094, + "learning_rate": 1.8575e-05, + "loss": 1.3061, + "step": 1257 + }, + { + "epoch": 1.8796283326329553, + "grad_norm": 0.3142222464084625, + "learning_rate": 1.855e-05, + "loss": 1.4002, + "step": 1258 + }, + { + "epoch": 1.8811224728019798, + "grad_norm": 0.2145937979221344, + "learning_rate": 1.8525e-05, + "loss": 1.192, + "step": 1259 + }, + { + "epoch": 1.8826166129710042, + "grad_norm": 0.15852953493595123, + "learning_rate": 1.85e-05, + "loss": 1.097, + "step": 1260 + }, + { + "epoch": 1.884110753140029, + "grad_norm": 0.3558714985847473, + "learning_rate": 1.8475000000000002e-05, + "loss": 1.1759, + "step": 1261 + }, + { + "epoch": 1.8856048933090537, + "grad_norm": 0.2350102961063385, + "learning_rate": 1.845e-05, + "loss": 1.2792, + "step": 1262 + }, + { + "epoch": 1.8870990334780782, + "grad_norm": 0.23744919896125793, + "learning_rate": 1.8425e-05, + "loss": 1.3294, + "step": 1263 + }, + { + "epoch": 1.8885931736471027, + "grad_norm": 0.4808562994003296, + "learning_rate": 1.84e-05, + "loss": 1.3644, + "step": 1264 + }, + { + "epoch": 1.8900873138161274, + "grad_norm": 0.18481269478797913, + "learning_rate": 1.8375e-05, + "loss": 1.4389, + "step": 1265 + }, + { + "epoch": 1.891581453985152, + "grad_norm": 0.1974407136440277, + "learning_rate": 1.8350000000000002e-05, + "loss": 1.3803, + "step": 1266 + }, + { + "epoch": 1.8930755941541766, + "grad_norm": 0.18426170945167542, + "learning_rate": 1.8325e-05, + "loss": 1.4062, + "step": 1267 + }, + { + "epoch": 1.894569734323201, + "grad_norm": 0.15900960564613342, + "learning_rate": 1.83e-05, + "loss": 1.2374, + "step": 1268 + }, + { + "epoch": 1.8960638744922258, + "grad_norm": 0.21791104972362518, + "learning_rate": 1.8275e-05, + "loss": 1.3535, + "step": 1269 + }, + { + "epoch": 1.8975580146612505, + "grad_norm": 0.2844353914260864, + "learning_rate": 1.825e-05, + "loss": 1.4481, + "step": 1270 + }, + { + "epoch": 1.899052154830275, + "grad_norm": 0.2063741832971573, + "learning_rate": 1.8225e-05, + "loss": 1.2813, + "step": 1271 + }, + { + "epoch": 1.9005462949992995, + "grad_norm": 0.28945934772491455, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.3534, + "step": 1272 + }, + { + "epoch": 1.9020404351683242, + "grad_norm": 0.3286309540271759, + "learning_rate": 1.8175e-05, + "loss": 1.335, + "step": 1273 + }, + { + "epoch": 1.903534575337349, + "grad_norm": 0.33386310935020447, + "learning_rate": 1.815e-05, + "loss": 1.1667, + "step": 1274 + }, + { + "epoch": 1.9050287155063734, + "grad_norm": 0.4026789665222168, + "learning_rate": 1.8125e-05, + "loss": 1.296, + "step": 1275 + }, + { + "epoch": 1.906522855675398, + "grad_norm": 0.1674850583076477, + "learning_rate": 1.81e-05, + "loss": 1.2641, + "step": 1276 + }, + { + "epoch": 1.9080169958444226, + "grad_norm": 0.6385492086410522, + "learning_rate": 1.8075e-05, + "loss": 1.2057, + "step": 1277 + }, + { + "epoch": 1.9095111360134474, + "grad_norm": 0.7805976271629333, + "learning_rate": 1.805e-05, + "loss": 1.3314, + "step": 1278 + }, + { + "epoch": 1.9110052761824718, + "grad_norm": 0.17184190452098846, + "learning_rate": 1.8025e-05, + "loss": 1.3208, + "step": 1279 + }, + { + "epoch": 1.9124994163514963, + "grad_norm": 0.21604672074317932, + "learning_rate": 1.8e-05, + "loss": 1.3815, + "step": 1280 + }, + { + "epoch": 1.913993556520521, + "grad_norm": 0.3038337826728821, + "learning_rate": 1.7975e-05, + "loss": 1.3725, + "step": 1281 + }, + { + "epoch": 1.9154876966895458, + "grad_norm": 0.2851409614086151, + "learning_rate": 1.795e-05, + "loss": 1.3311, + "step": 1282 + }, + { + "epoch": 1.9169818368585703, + "grad_norm": 0.22219157218933105, + "learning_rate": 1.7925e-05, + "loss": 1.3016, + "step": 1283 + }, + { + "epoch": 1.9184759770275948, + "grad_norm": 0.29200297594070435, + "learning_rate": 1.79e-05, + "loss": 1.2594, + "step": 1284 + }, + { + "epoch": 1.9199701171966195, + "grad_norm": 0.4923015832901001, + "learning_rate": 1.7875e-05, + "loss": 1.2475, + "step": 1285 + }, + { + "epoch": 1.9214642573656442, + "grad_norm": 0.22583426535129547, + "learning_rate": 1.785e-05, + "loss": 1.1543, + "step": 1286 + }, + { + "epoch": 1.9229583975346687, + "grad_norm": 0.21450522541999817, + "learning_rate": 1.7825e-05, + "loss": 1.3766, + "step": 1287 + }, + { + "epoch": 1.9244525377036932, + "grad_norm": 0.22731810808181763, + "learning_rate": 1.78e-05, + "loss": 1.3878, + "step": 1288 + }, + { + "epoch": 1.925946677872718, + "grad_norm": 0.17790710926055908, + "learning_rate": 1.7775e-05, + "loss": 1.2292, + "step": 1289 + }, + { + "epoch": 1.9274408180417426, + "grad_norm": 2.0318102836608887, + "learning_rate": 1.775e-05, + "loss": 1.3844, + "step": 1290 + }, + { + "epoch": 1.928934958210767, + "grad_norm": 0.21264667809009552, + "learning_rate": 1.7725e-05, + "loss": 1.3728, + "step": 1291 + }, + { + "epoch": 1.9304290983797916, + "grad_norm": 0.23059220612049103, + "learning_rate": 1.77e-05, + "loss": 1.3793, + "step": 1292 + }, + { + "epoch": 1.9319232385488163, + "grad_norm": 0.2054581195116043, + "learning_rate": 1.7675e-05, + "loss": 1.2632, + "step": 1293 + }, + { + "epoch": 1.933417378717841, + "grad_norm": 0.9071913361549377, + "learning_rate": 1.765e-05, + "loss": 1.4137, + "step": 1294 + }, + { + "epoch": 1.9349115188868655, + "grad_norm": 0.2947272062301636, + "learning_rate": 1.7625e-05, + "loss": 1.3661, + "step": 1295 + }, + { + "epoch": 1.93640565905589, + "grad_norm": 0.2021481841802597, + "learning_rate": 1.76e-05, + "loss": 1.2498, + "step": 1296 + }, + { + "epoch": 1.9378997992249147, + "grad_norm": 0.22752541303634644, + "learning_rate": 1.7575e-05, + "loss": 1.3359, + "step": 1297 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.21617968380451202, + "learning_rate": 1.755e-05, + "loss": 1.1765, + "step": 1298 + }, + { + "epoch": 1.940888079562964, + "grad_norm": 0.23794393241405487, + "learning_rate": 1.7525e-05, + "loss": 1.2394, + "step": 1299 + }, + { + "epoch": 1.9423822197319887, + "grad_norm": 0.22893854975700378, + "learning_rate": 1.75e-05, + "loss": 1.3831, + "step": 1300 + }, + { + "epoch": 1.9438763599010134, + "grad_norm": 0.2074202597141266, + "learning_rate": 1.7475e-05, + "loss": 1.3276, + "step": 1301 + }, + { + "epoch": 1.9453705000700379, + "grad_norm": 0.5460404753684998, + "learning_rate": 1.745e-05, + "loss": 1.2246, + "step": 1302 + }, + { + "epoch": 1.9468646402390624, + "grad_norm": 0.551843523979187, + "learning_rate": 1.7425e-05, + "loss": 1.2742, + "step": 1303 + }, + { + "epoch": 1.948358780408087, + "grad_norm": 0.2572923004627228, + "learning_rate": 1.74e-05, + "loss": 1.3383, + "step": 1304 + }, + { + "epoch": 1.9498529205771118, + "grad_norm": 0.20585736632347107, + "learning_rate": 1.7375e-05, + "loss": 1.4466, + "step": 1305 + }, + { + "epoch": 1.9513470607461363, + "grad_norm": 0.2732091546058655, + "learning_rate": 1.7349999999999998e-05, + "loss": 1.0664, + "step": 1306 + }, + { + "epoch": 1.9528412009151608, + "grad_norm": 0.2675059735774994, + "learning_rate": 1.7325e-05, + "loss": 1.2259, + "step": 1307 + }, + { + "epoch": 1.9543353410841855, + "grad_norm": 0.2213507890701294, + "learning_rate": 1.73e-05, + "loss": 1.3838, + "step": 1308 + }, + { + "epoch": 1.9558294812532102, + "grad_norm": 0.18887294828891754, + "learning_rate": 1.7275e-05, + "loss": 1.226, + "step": 1309 + }, + { + "epoch": 1.9573236214222347, + "grad_norm": 0.21098558604717255, + "learning_rate": 1.725e-05, + "loss": 1.4265, + "step": 1310 + }, + { + "epoch": 1.9588177615912592, + "grad_norm": 0.2039249986410141, + "learning_rate": 1.7225e-05, + "loss": 1.2338, + "step": 1311 + }, + { + "epoch": 1.960311901760284, + "grad_norm": 0.2207634598016739, + "learning_rate": 1.7199999999999998e-05, + "loss": 1.2166, + "step": 1312 + }, + { + "epoch": 1.9618060419293086, + "grad_norm": 0.2204556167125702, + "learning_rate": 1.7175e-05, + "loss": 1.2149, + "step": 1313 + }, + { + "epoch": 1.9633001820983331, + "grad_norm": 0.19095070660114288, + "learning_rate": 1.7150000000000004e-05, + "loss": 1.3607, + "step": 1314 + }, + { + "epoch": 1.9647943222673576, + "grad_norm": 0.1517525315284729, + "learning_rate": 1.7125000000000003e-05, + "loss": 1.172, + "step": 1315 + }, + { + "epoch": 1.9662884624363823, + "grad_norm": 0.21833693981170654, + "learning_rate": 1.7100000000000002e-05, + "loss": 1.4572, + "step": 1316 + }, + { + "epoch": 1.967782602605407, + "grad_norm": 0.2464878410100937, + "learning_rate": 1.7075e-05, + "loss": 1.4209, + "step": 1317 + }, + { + "epoch": 1.9692767427744315, + "grad_norm": 0.16277910768985748, + "learning_rate": 1.705e-05, + "loss": 1.4236, + "step": 1318 + }, + { + "epoch": 1.970770882943456, + "grad_norm": 0.23348970711231232, + "learning_rate": 1.7025e-05, + "loss": 1.0046, + "step": 1319 + }, + { + "epoch": 1.9722650231124808, + "grad_norm": 0.3407599627971649, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.3508, + "step": 1320 + }, + { + "epoch": 1.9737591632815055, + "grad_norm": 0.8567706346511841, + "learning_rate": 1.6975000000000003e-05, + "loss": 1.453, + "step": 1321 + }, + { + "epoch": 1.97525330345053, + "grad_norm": 0.18511173129081726, + "learning_rate": 1.6950000000000002e-05, + "loss": 1.3107, + "step": 1322 + }, + { + "epoch": 1.9767474436195545, + "grad_norm": 0.21256686747074127, + "learning_rate": 1.6925e-05, + "loss": 1.3865, + "step": 1323 + }, + { + "epoch": 1.9782415837885792, + "grad_norm": 0.25371041893959045, + "learning_rate": 1.69e-05, + "loss": 1.3664, + "step": 1324 + }, + { + "epoch": 1.9797357239576039, + "grad_norm": 0.15344837307929993, + "learning_rate": 1.6875000000000004e-05, + "loss": 1.2623, + "step": 1325 + }, + { + "epoch": 1.9812298641266284, + "grad_norm": 0.27859944105148315, + "learning_rate": 1.6850000000000003e-05, + "loss": 1.3419, + "step": 1326 + }, + { + "epoch": 1.9827240042956529, + "grad_norm": 0.30782634019851685, + "learning_rate": 1.6825000000000002e-05, + "loss": 1.1088, + "step": 1327 + }, + { + "epoch": 1.9842181444646776, + "grad_norm": 0.26927539706230164, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.3839, + "step": 1328 + }, + { + "epoch": 1.9857122846337023, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.6775e-05, + "loss": 1.3443, + "step": 1329 + }, + { + "epoch": 1.9872064248027268, + "grad_norm": 0.2376929074525833, + "learning_rate": 1.675e-05, + "loss": 1.2627, + "step": 1330 + }, + { + "epoch": 1.9887005649717513, + "grad_norm": 0.21043722331523895, + "learning_rate": 1.6725000000000003e-05, + "loss": 1.4348, + "step": 1331 + }, + { + "epoch": 1.990194705140776, + "grad_norm": 0.2253684550523758, + "learning_rate": 1.6700000000000003e-05, + "loss": 1.1814, + "step": 1332 + }, + { + "epoch": 1.9916888453098007, + "grad_norm": 0.23785364627838135, + "learning_rate": 1.6675000000000002e-05, + "loss": 1.0943, + "step": 1333 + }, + { + "epoch": 1.9931829854788252, + "grad_norm": 0.29076915979385376, + "learning_rate": 1.665e-05, + "loss": 1.381, + "step": 1334 + }, + { + "epoch": 1.9946771256478497, + "grad_norm": 0.21580135822296143, + "learning_rate": 1.6625e-05, + "loss": 1.2626, + "step": 1335 + }, + { + "epoch": 1.9961712658168744, + "grad_norm": 0.30649375915527344, + "learning_rate": 1.66e-05, + "loss": 1.5823, + "step": 1336 + }, + { + "epoch": 1.9976654059858991, + "grad_norm": 0.1969205141067505, + "learning_rate": 1.6575000000000003e-05, + "loss": 1.1942, + "step": 1337 + }, + { + "epoch": 1.9991595461549236, + "grad_norm": 0.3132281005382538, + "learning_rate": 1.6550000000000002e-05, + "loss": 1.4082, + "step": 1338 + }, + { + "epoch": 2.000653686323948, + "grad_norm": 0.1845480054616928, + "learning_rate": 1.6525000000000002e-05, + "loss": 1.3161, + "step": 1339 + }, + { + "epoch": 2.002147826492973, + "grad_norm": 0.3126375079154968, + "learning_rate": 1.65e-05, + "loss": 1.3993, + "step": 1340 + }, + { + "epoch": 2.0036419666619976, + "grad_norm": 0.42225363850593567, + "learning_rate": 1.6475e-05, + "loss": 1.2508, + "step": 1341 + }, + { + "epoch": 2.005136106831022, + "grad_norm": 0.16167034208774567, + "learning_rate": 1.645e-05, + "loss": 1.1553, + "step": 1342 + }, + { + "epoch": 2.0066302470000466, + "grad_norm": 0.19103498756885529, + "learning_rate": 1.6425000000000003e-05, + "loss": 1.2684, + "step": 1343 + }, + { + "epoch": 2.0081243871690715, + "grad_norm": 0.23756802082061768, + "learning_rate": 1.6400000000000002e-05, + "loss": 1.2853, + "step": 1344 + }, + { + "epoch": 2.009618527338096, + "grad_norm": 0.23275527358055115, + "learning_rate": 1.6375e-05, + "loss": 1.254, + "step": 1345 + }, + { + "epoch": 2.0111126675071205, + "grad_norm": 0.17952296137809753, + "learning_rate": 1.635e-05, + "loss": 1.2585, + "step": 1346 + }, + { + "epoch": 2.012606807676145, + "grad_norm": 0.9764080047607422, + "learning_rate": 1.6325e-05, + "loss": 1.2869, + "step": 1347 + }, + { + "epoch": 2.01410094784517, + "grad_norm": 0.22916385531425476, + "learning_rate": 1.63e-05, + "loss": 1.3214, + "step": 1348 + }, + { + "epoch": 2.0155950880141944, + "grad_norm": 0.18610818684101105, + "learning_rate": 1.6275000000000003e-05, + "loss": 1.2997, + "step": 1349 + }, + { + "epoch": 2.017089228183219, + "grad_norm": 0.19289493560791016, + "learning_rate": 1.6250000000000002e-05, + "loss": 1.2952, + "step": 1350 + }, + { + "epoch": 2.0185833683522434, + "grad_norm": 0.19546003639698029, + "learning_rate": 1.6225e-05, + "loss": 1.3537, + "step": 1351 + }, + { + "epoch": 2.0200775085212683, + "grad_norm": 0.23147369921207428, + "learning_rate": 1.62e-05, + "loss": 1.4243, + "step": 1352 + }, + { + "epoch": 2.021571648690293, + "grad_norm": 0.18282002210617065, + "learning_rate": 1.6175e-05, + "loss": 1.0762, + "step": 1353 + }, + { + "epoch": 2.0230657888593173, + "grad_norm": 0.18470488488674164, + "learning_rate": 1.6150000000000003e-05, + "loss": 1.4082, + "step": 1354 + }, + { + "epoch": 2.024559929028342, + "grad_norm": 0.2198750376701355, + "learning_rate": 1.6125000000000002e-05, + "loss": 1.4813, + "step": 1355 + }, + { + "epoch": 2.0260540691973667, + "grad_norm": 0.2406291961669922, + "learning_rate": 1.6100000000000002e-05, + "loss": 1.4522, + "step": 1356 + }, + { + "epoch": 2.0275482093663912, + "grad_norm": 0.3409310281276703, + "learning_rate": 1.6075e-05, + "loss": 1.501, + "step": 1357 + }, + { + "epoch": 2.0290423495354157, + "grad_norm": 0.2944510281085968, + "learning_rate": 1.605e-05, + "loss": 1.2892, + "step": 1358 + }, + { + "epoch": 2.0305364897044402, + "grad_norm": 0.2915669083595276, + "learning_rate": 1.6025e-05, + "loss": 1.2348, + "step": 1359 + }, + { + "epoch": 2.032030629873465, + "grad_norm": 0.5818542242050171, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.2945, + "step": 1360 + }, + { + "epoch": 2.0335247700424897, + "grad_norm": 0.6113788485527039, + "learning_rate": 1.5975000000000002e-05, + "loss": 1.4605, + "step": 1361 + }, + { + "epoch": 2.035018910211514, + "grad_norm": 0.20517785847187042, + "learning_rate": 1.595e-05, + "loss": 1.3224, + "step": 1362 + }, + { + "epoch": 2.0365130503805386, + "grad_norm": 0.16138556599617004, + "learning_rate": 1.5925e-05, + "loss": 1.1755, + "step": 1363 + }, + { + "epoch": 2.0380071905495636, + "grad_norm": 0.25767239928245544, + "learning_rate": 1.59e-05, + "loss": 1.1872, + "step": 1364 + }, + { + "epoch": 2.039501330718588, + "grad_norm": 0.2035055011510849, + "learning_rate": 1.5875e-05, + "loss": 1.5063, + "step": 1365 + }, + { + "epoch": 2.0409954708876126, + "grad_norm": 0.25032323598861694, + "learning_rate": 1.5850000000000002e-05, + "loss": 1.2495, + "step": 1366 + }, + { + "epoch": 2.042489611056637, + "grad_norm": 0.2417837679386139, + "learning_rate": 1.5825000000000002e-05, + "loss": 1.1995, + "step": 1367 + }, + { + "epoch": 2.043983751225662, + "grad_norm": 0.21311062574386597, + "learning_rate": 1.58e-05, + "loss": 1.2697, + "step": 1368 + }, + { + "epoch": 2.0454778913946865, + "grad_norm": 0.19063788652420044, + "learning_rate": 1.5775e-05, + "loss": 1.2617, + "step": 1369 + }, + { + "epoch": 2.046972031563711, + "grad_norm": 0.23214475810527802, + "learning_rate": 1.575e-05, + "loss": 1.2846, + "step": 1370 + }, + { + "epoch": 2.0484661717327355, + "grad_norm": 0.22164221107959747, + "learning_rate": 1.5725e-05, + "loss": 1.1985, + "step": 1371 + }, + { + "epoch": 2.0499603119017604, + "grad_norm": 0.42086952924728394, + "learning_rate": 1.5700000000000002e-05, + "loss": 1.2233, + "step": 1372 + }, + { + "epoch": 2.051454452070785, + "grad_norm": 0.19688980281352997, + "learning_rate": 1.5675e-05, + "loss": 1.4493, + "step": 1373 + }, + { + "epoch": 2.0529485922398094, + "grad_norm": 0.19368377327919006, + "learning_rate": 1.565e-05, + "loss": 1.3613, + "step": 1374 + }, + { + "epoch": 2.054442732408834, + "grad_norm": 0.17706935107707977, + "learning_rate": 1.5625e-05, + "loss": 1.3222, + "step": 1375 + }, + { + "epoch": 2.055936872577859, + "grad_norm": 0.37456485629081726, + "learning_rate": 1.56e-05, + "loss": 1.211, + "step": 1376 + }, + { + "epoch": 2.0574310127468833, + "grad_norm": 0.41341719031333923, + "learning_rate": 1.5575e-05, + "loss": 1.3862, + "step": 1377 + }, + { + "epoch": 2.058925152915908, + "grad_norm": 0.21140192449092865, + "learning_rate": 1.5550000000000002e-05, + "loss": 1.2258, + "step": 1378 + }, + { + "epoch": 2.0604192930849323, + "grad_norm": 0.22072890400886536, + "learning_rate": 1.5525e-05, + "loss": 1.1975, + "step": 1379 + }, + { + "epoch": 2.0619134332539573, + "grad_norm": 0.2429424673318863, + "learning_rate": 1.55e-05, + "loss": 1.1305, + "step": 1380 + }, + { + "epoch": 2.0634075734229818, + "grad_norm": 0.17470882833003998, + "learning_rate": 1.5475e-05, + "loss": 1.2887, + "step": 1381 + }, + { + "epoch": 2.0649017135920062, + "grad_norm": 0.2611446678638458, + "learning_rate": 1.545e-05, + "loss": 1.3548, + "step": 1382 + }, + { + "epoch": 2.0663958537610307, + "grad_norm": 0.18730558454990387, + "learning_rate": 1.5425000000000002e-05, + "loss": 1.3207, + "step": 1383 + }, + { + "epoch": 2.0678899939300557, + "grad_norm": 0.22779415547847748, + "learning_rate": 1.54e-05, + "loss": 1.3039, + "step": 1384 + }, + { + "epoch": 2.06938413409908, + "grad_norm": 0.22781801223754883, + "learning_rate": 1.5375e-05, + "loss": 1.2513, + "step": 1385 + }, + { + "epoch": 2.0708782742681047, + "grad_norm": 0.6857816576957703, + "learning_rate": 1.535e-05, + "loss": 1.3592, + "step": 1386 + }, + { + "epoch": 2.0723724144371296, + "grad_norm": 0.22949860990047455, + "learning_rate": 1.5325e-05, + "loss": 1.3609, + "step": 1387 + }, + { + "epoch": 2.073866554606154, + "grad_norm": 0.29369252920150757, + "learning_rate": 1.53e-05, + "loss": 1.3531, + "step": 1388 + }, + { + "epoch": 2.0753606947751786, + "grad_norm": 0.17681251466274261, + "learning_rate": 1.5275000000000002e-05, + "loss": 1.3514, + "step": 1389 + }, + { + "epoch": 2.076854834944203, + "grad_norm": 0.19840802252292633, + "learning_rate": 1.525e-05, + "loss": 1.3562, + "step": 1390 + }, + { + "epoch": 2.0783489751132276, + "grad_norm": 0.38067206740379333, + "learning_rate": 1.5225e-05, + "loss": 1.3926, + "step": 1391 + }, + { + "epoch": 2.0798431152822525, + "grad_norm": 0.23130175471305847, + "learning_rate": 1.52e-05, + "loss": 1.2658, + "step": 1392 + }, + { + "epoch": 2.081337255451277, + "grad_norm": 0.24068257212638855, + "learning_rate": 1.5175e-05, + "loss": 1.2055, + "step": 1393 + }, + { + "epoch": 2.0828313956203015, + "grad_norm": 0.2135651409626007, + "learning_rate": 1.515e-05, + "loss": 1.3947, + "step": 1394 + }, + { + "epoch": 2.0843255357893264, + "grad_norm": 0.25119951367378235, + "learning_rate": 1.5125e-05, + "loss": 1.4246, + "step": 1395 + }, + { + "epoch": 2.085819675958351, + "grad_norm": 0.2441864162683487, + "learning_rate": 1.51e-05, + "loss": 1.4057, + "step": 1396 + }, + { + "epoch": 2.0873138161273754, + "grad_norm": 0.3345242440700531, + "learning_rate": 1.5075e-05, + "loss": 1.4348, + "step": 1397 + }, + { + "epoch": 2.0888079562964, + "grad_norm": 0.2515486180782318, + "learning_rate": 1.505e-05, + "loss": 1.2924, + "step": 1398 + }, + { + "epoch": 2.090302096465425, + "grad_norm": 0.1982625424861908, + "learning_rate": 1.5025000000000001e-05, + "loss": 1.3543, + "step": 1399 + }, + { + "epoch": 2.0917962366344494, + "grad_norm": 0.2248583287000656, + "learning_rate": 1.5e-05, + "loss": 1.3344, + "step": 1400 + }, + { + "epoch": 2.093290376803474, + "grad_norm": 0.22528663277626038, + "learning_rate": 1.4975e-05, + "loss": 1.3416, + "step": 1401 + }, + { + "epoch": 2.0947845169724983, + "grad_norm": 0.19980430603027344, + "learning_rate": 1.4950000000000001e-05, + "loss": 1.3569, + "step": 1402 + }, + { + "epoch": 2.0962786571415233, + "grad_norm": 0.1910398304462433, + "learning_rate": 1.4925e-05, + "loss": 1.402, + "step": 1403 + }, + { + "epoch": 2.0977727973105478, + "grad_norm": 0.2712978720664978, + "learning_rate": 1.49e-05, + "loss": 1.3505, + "step": 1404 + }, + { + "epoch": 2.0992669374795723, + "grad_norm": 0.3270023763179779, + "learning_rate": 1.4875e-05, + "loss": 1.2635, + "step": 1405 + }, + { + "epoch": 2.1007610776485968, + "grad_norm": 0.2294662743806839, + "learning_rate": 1.485e-05, + "loss": 1.151, + "step": 1406 + }, + { + "epoch": 2.1022552178176217, + "grad_norm": 4.064767360687256, + "learning_rate": 1.4825e-05, + "loss": 1.3138, + "step": 1407 + }, + { + "epoch": 2.103749357986646, + "grad_norm": 0.44238874316215515, + "learning_rate": 1.48e-05, + "loss": 1.2766, + "step": 1408 + }, + { + "epoch": 2.1052434981556707, + "grad_norm": 0.42204442620277405, + "learning_rate": 1.4775e-05, + "loss": 1.4241, + "step": 1409 + }, + { + "epoch": 2.106737638324695, + "grad_norm": 1.8588513135910034, + "learning_rate": 1.475e-05, + "loss": 1.297, + "step": 1410 + }, + { + "epoch": 2.10823177849372, + "grad_norm": 0.21745501458644867, + "learning_rate": 1.4725e-05, + "loss": 1.2065, + "step": 1411 + }, + { + "epoch": 2.1097259186627446, + "grad_norm": 0.22868776321411133, + "learning_rate": 1.47e-05, + "loss": 1.3406, + "step": 1412 + }, + { + "epoch": 2.111220058831769, + "grad_norm": 0.23604220151901245, + "learning_rate": 1.4675e-05, + "loss": 1.3829, + "step": 1413 + }, + { + "epoch": 2.1127141990007936, + "grad_norm": 0.19632968306541443, + "learning_rate": 1.465e-05, + "loss": 1.2526, + "step": 1414 + }, + { + "epoch": 2.1142083391698185, + "grad_norm": 0.20098775625228882, + "learning_rate": 1.4625e-05, + "loss": 1.1966, + "step": 1415 + }, + { + "epoch": 2.115702479338843, + "grad_norm": 0.181330606341362, + "learning_rate": 1.4599999999999999e-05, + "loss": 1.4068, + "step": 1416 + }, + { + "epoch": 2.1171966195078675, + "grad_norm": 0.21224381029605865, + "learning_rate": 1.4575e-05, + "loss": 1.2888, + "step": 1417 + }, + { + "epoch": 2.118690759676892, + "grad_norm": 0.2324310541152954, + "learning_rate": 1.455e-05, + "loss": 1.2753, + "step": 1418 + }, + { + "epoch": 2.120184899845917, + "grad_norm": 0.18913941085338593, + "learning_rate": 1.4524999999999999e-05, + "loss": 1.2022, + "step": 1419 + }, + { + "epoch": 2.1216790400149415, + "grad_norm": 0.21584191918373108, + "learning_rate": 1.45e-05, + "loss": 1.3163, + "step": 1420 + }, + { + "epoch": 2.123173180183966, + "grad_norm": 0.17268019914627075, + "learning_rate": 1.4475e-05, + "loss": 1.1978, + "step": 1421 + }, + { + "epoch": 2.1246673203529904, + "grad_norm": 0.16772915422916412, + "learning_rate": 1.4449999999999999e-05, + "loss": 1.4418, + "step": 1422 + }, + { + "epoch": 2.1261614605220154, + "grad_norm": 0.2106754332780838, + "learning_rate": 1.4425e-05, + "loss": 1.4449, + "step": 1423 + }, + { + "epoch": 2.12765560069104, + "grad_norm": 0.3900804817676544, + "learning_rate": 1.44e-05, + "loss": 1.2949, + "step": 1424 + }, + { + "epoch": 2.1291497408600644, + "grad_norm": 0.21320056915283203, + "learning_rate": 1.4374999999999999e-05, + "loss": 1.1929, + "step": 1425 + }, + { + "epoch": 2.130643881029089, + "grad_norm": 0.8794408440589905, + "learning_rate": 1.435e-05, + "loss": 1.3492, + "step": 1426 + }, + { + "epoch": 2.132138021198114, + "grad_norm": 0.20707076787948608, + "learning_rate": 1.4325e-05, + "loss": 1.3354, + "step": 1427 + }, + { + "epoch": 2.1336321613671383, + "grad_norm": 0.19381563365459442, + "learning_rate": 1.43e-05, + "loss": 1.1734, + "step": 1428 + }, + { + "epoch": 2.135126301536163, + "grad_norm": 0.6543886065483093, + "learning_rate": 1.4275e-05, + "loss": 1.2938, + "step": 1429 + }, + { + "epoch": 2.1366204417051873, + "grad_norm": 0.2099548727273941, + "learning_rate": 1.4249999999999999e-05, + "loss": 1.4015, + "step": 1430 + }, + { + "epoch": 2.138114581874212, + "grad_norm": 0.1843765527009964, + "learning_rate": 1.4225e-05, + "loss": 1.2783, + "step": 1431 + }, + { + "epoch": 2.1396087220432367, + "grad_norm": 0.17574813961982727, + "learning_rate": 1.42e-05, + "loss": 1.1302, + "step": 1432 + }, + { + "epoch": 2.141102862212261, + "grad_norm": 0.18504847586154938, + "learning_rate": 1.4174999999999999e-05, + "loss": 1.3835, + "step": 1433 + }, + { + "epoch": 2.1425970023812857, + "grad_norm": 0.5265726447105408, + "learning_rate": 1.415e-05, + "loss": 1.2911, + "step": 1434 + }, + { + "epoch": 2.1440911425503106, + "grad_norm": 0.2602658271789551, + "learning_rate": 1.4125e-05, + "loss": 1.1983, + "step": 1435 + }, + { + "epoch": 2.145585282719335, + "grad_norm": 0.3915673792362213, + "learning_rate": 1.4099999999999999e-05, + "loss": 1.4647, + "step": 1436 + }, + { + "epoch": 2.1470794228883596, + "grad_norm": 0.22867648303508759, + "learning_rate": 1.4075e-05, + "loss": 1.2761, + "step": 1437 + }, + { + "epoch": 2.148573563057384, + "grad_norm": 0.3059532940387726, + "learning_rate": 1.4050000000000003e-05, + "loss": 1.2868, + "step": 1438 + }, + { + "epoch": 2.150067703226409, + "grad_norm": 0.17894840240478516, + "learning_rate": 1.4025000000000002e-05, + "loss": 1.2723, + "step": 1439 + }, + { + "epoch": 2.1515618433954335, + "grad_norm": 0.3326786458492279, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.2988, + "step": 1440 + }, + { + "epoch": 2.153055983564458, + "grad_norm": 0.24176494777202606, + "learning_rate": 1.3975000000000003e-05, + "loss": 1.5227, + "step": 1441 + }, + { + "epoch": 2.1545501237334825, + "grad_norm": 0.45723408460617065, + "learning_rate": 1.3950000000000002e-05, + "loss": 1.355, + "step": 1442 + }, + { + "epoch": 2.1560442639025075, + "grad_norm": 0.2570329010486603, + "learning_rate": 1.3925000000000001e-05, + "loss": 1.1168, + "step": 1443 + }, + { + "epoch": 2.157538404071532, + "grad_norm": 0.24646492302417755, + "learning_rate": 1.3900000000000002e-05, + "loss": 1.2781, + "step": 1444 + }, + { + "epoch": 2.1590325442405565, + "grad_norm": 0.21383704245090485, + "learning_rate": 1.3875000000000002e-05, + "loss": 1.4397, + "step": 1445 + }, + { + "epoch": 2.1605266844095814, + "grad_norm": 0.17543964087963104, + "learning_rate": 1.3850000000000001e-05, + "loss": 1.2952, + "step": 1446 + }, + { + "epoch": 2.162020824578606, + "grad_norm": 0.48758426308631897, + "learning_rate": 1.3825000000000002e-05, + "loss": 1.3334, + "step": 1447 + }, + { + "epoch": 2.1635149647476304, + "grad_norm": 0.25018617510795593, + "learning_rate": 1.3800000000000002e-05, + "loss": 1.3461, + "step": 1448 + }, + { + "epoch": 2.165009104916655, + "grad_norm": 0.2622385621070862, + "learning_rate": 1.3775000000000001e-05, + "loss": 1.2779, + "step": 1449 + }, + { + "epoch": 2.1665032450856794, + "grad_norm": 0.22317945957183838, + "learning_rate": 1.3750000000000002e-05, + "loss": 1.3226, + "step": 1450 + }, + { + "epoch": 2.1679973852547043, + "grad_norm": 0.25490185618400574, + "learning_rate": 1.3725000000000002e-05, + "loss": 1.2453, + "step": 1451 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 0.28227943181991577, + "learning_rate": 1.3700000000000001e-05, + "loss": 1.2265, + "step": 1452 + }, + { + "epoch": 2.1709856655927533, + "grad_norm": 0.24756629765033722, + "learning_rate": 1.3675000000000002e-05, + "loss": 1.4324, + "step": 1453 + }, + { + "epoch": 2.1724798057617782, + "grad_norm": 0.2755368649959564, + "learning_rate": 1.3650000000000001e-05, + "loss": 1.3576, + "step": 1454 + }, + { + "epoch": 2.1739739459308027, + "grad_norm": 0.2696566879749298, + "learning_rate": 1.3625e-05, + "loss": 1.39, + "step": 1455 + }, + { + "epoch": 2.1754680860998272, + "grad_norm": 0.20134660601615906, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.2761, + "step": 1456 + }, + { + "epoch": 2.1769622262688517, + "grad_norm": 0.1893608272075653, + "learning_rate": 1.3575000000000001e-05, + "loss": 1.1057, + "step": 1457 + }, + { + "epoch": 2.1784563664378767, + "grad_norm": 0.26885709166526794, + "learning_rate": 1.3550000000000002e-05, + "loss": 1.4219, + "step": 1458 + }, + { + "epoch": 2.179950506606901, + "grad_norm": 0.3592870831489563, + "learning_rate": 1.3525000000000002e-05, + "loss": 1.2893, + "step": 1459 + }, + { + "epoch": 2.1814446467759256, + "grad_norm": 0.2307906299829483, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.295, + "step": 1460 + }, + { + "epoch": 2.18293878694495, + "grad_norm": 0.23158007860183716, + "learning_rate": 1.3475000000000002e-05, + "loss": 1.2771, + "step": 1461 + }, + { + "epoch": 2.184432927113975, + "grad_norm": 0.3418627977371216, + "learning_rate": 1.3450000000000002e-05, + "loss": 1.2596, + "step": 1462 + }, + { + "epoch": 2.1859270672829996, + "grad_norm": 0.18299835920333862, + "learning_rate": 1.3425000000000001e-05, + "loss": 1.3383, + "step": 1463 + }, + { + "epoch": 2.187421207452024, + "grad_norm": 0.28012511134147644, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.4001, + "step": 1464 + }, + { + "epoch": 2.1889153476210486, + "grad_norm": 0.26353952288627625, + "learning_rate": 1.3375000000000002e-05, + "loss": 1.3973, + "step": 1465 + }, + { + "epoch": 2.1904094877900735, + "grad_norm": 0.2554903030395508, + "learning_rate": 1.3350000000000001e-05, + "loss": 1.2561, + "step": 1466 + }, + { + "epoch": 2.191903627959098, + "grad_norm": 0.21797333657741547, + "learning_rate": 1.3325000000000002e-05, + "loss": 1.2779, + "step": 1467 + }, + { + "epoch": 2.1933977681281225, + "grad_norm": 0.18936985731124878, + "learning_rate": 1.3300000000000001e-05, + "loss": 1.3408, + "step": 1468 + }, + { + "epoch": 2.194891908297147, + "grad_norm": 0.16586045920848846, + "learning_rate": 1.3275e-05, + "loss": 1.2249, + "step": 1469 + }, + { + "epoch": 2.196386048466172, + "grad_norm": 0.3664158582687378, + "learning_rate": 1.3250000000000002e-05, + "loss": 1.2892, + "step": 1470 + }, + { + "epoch": 2.1978801886351964, + "grad_norm": 0.16312362253665924, + "learning_rate": 1.3225000000000001e-05, + "loss": 1.2441, + "step": 1471 + }, + { + "epoch": 2.199374328804221, + "grad_norm": 0.5265755653381348, + "learning_rate": 1.32e-05, + "loss": 1.3866, + "step": 1472 + }, + { + "epoch": 2.2008684689732454, + "grad_norm": 0.2296518087387085, + "learning_rate": 1.3175000000000002e-05, + "loss": 1.3673, + "step": 1473 + }, + { + "epoch": 2.2023626091422703, + "grad_norm": 0.3048577308654785, + "learning_rate": 1.3150000000000001e-05, + "loss": 1.356, + "step": 1474 + }, + { + "epoch": 2.203856749311295, + "grad_norm": 0.2861064076423645, + "learning_rate": 1.3125e-05, + "loss": 1.2939, + "step": 1475 + }, + { + "epoch": 2.2053508894803193, + "grad_norm": 0.24364018440246582, + "learning_rate": 1.3100000000000002e-05, + "loss": 1.4165, + "step": 1476 + }, + { + "epoch": 2.206845029649344, + "grad_norm": 0.21503858268260956, + "learning_rate": 1.3075000000000001e-05, + "loss": 1.2548, + "step": 1477 + }, + { + "epoch": 2.2083391698183688, + "grad_norm": 0.2088351547718048, + "learning_rate": 1.305e-05, + "loss": 1.2361, + "step": 1478 + }, + { + "epoch": 2.2098333099873932, + "grad_norm": 0.21245965361595154, + "learning_rate": 1.3025000000000002e-05, + "loss": 1.2242, + "step": 1479 + }, + { + "epoch": 2.2113274501564177, + "grad_norm": 0.2069002240896225, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3018, + "step": 1480 + }, + { + "epoch": 2.2128215903254422, + "grad_norm": 0.182997465133667, + "learning_rate": 1.2975e-05, + "loss": 1.3056, + "step": 1481 + }, + { + "epoch": 2.214315730494467, + "grad_norm": 0.30062931776046753, + "learning_rate": 1.2950000000000001e-05, + "loss": 1.3128, + "step": 1482 + }, + { + "epoch": 2.2158098706634917, + "grad_norm": 0.16962677240371704, + "learning_rate": 1.2925e-05, + "loss": 1.2552, + "step": 1483 + }, + { + "epoch": 2.217304010832516, + "grad_norm": 0.22674013674259186, + "learning_rate": 1.29e-05, + "loss": 1.2199, + "step": 1484 + }, + { + "epoch": 2.2187981510015407, + "grad_norm": 0.30475592613220215, + "learning_rate": 1.2875000000000001e-05, + "loss": 1.3368, + "step": 1485 + }, + { + "epoch": 2.2202922911705656, + "grad_norm": 0.18455040454864502, + "learning_rate": 1.285e-05, + "loss": 1.2818, + "step": 1486 + }, + { + "epoch": 2.22178643133959, + "grad_norm": 0.1701553463935852, + "learning_rate": 1.2825000000000002e-05, + "loss": 1.2886, + "step": 1487 + }, + { + "epoch": 2.2232805715086146, + "grad_norm": 0.3963810205459595, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.3285, + "step": 1488 + }, + { + "epoch": 2.224774711677639, + "grad_norm": 0.3438395857810974, + "learning_rate": 1.2775e-05, + "loss": 1.2927, + "step": 1489 + }, + { + "epoch": 2.226268851846664, + "grad_norm": 0.21394677460193634, + "learning_rate": 1.2750000000000002e-05, + "loss": 1.3431, + "step": 1490 + }, + { + "epoch": 2.2277629920156885, + "grad_norm": 2.9313278198242188, + "learning_rate": 1.2725000000000001e-05, + "loss": 1.4353, + "step": 1491 + }, + { + "epoch": 2.229257132184713, + "grad_norm": 0.5035104751586914, + "learning_rate": 1.27e-05, + "loss": 1.4292, + "step": 1492 + }, + { + "epoch": 2.2307512723537375, + "grad_norm": 0.34209391474723816, + "learning_rate": 1.2675000000000001e-05, + "loss": 1.2921, + "step": 1493 + }, + { + "epoch": 2.2322454125227624, + "grad_norm": 0.26486945152282715, + "learning_rate": 1.2650000000000001e-05, + "loss": 1.2368, + "step": 1494 + }, + { + "epoch": 2.233739552691787, + "grad_norm": 0.21212176978588104, + "learning_rate": 1.2625e-05, + "loss": 1.5764, + "step": 1495 + }, + { + "epoch": 2.2352336928608114, + "grad_norm": 0.24310234189033508, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.4328, + "step": 1496 + }, + { + "epoch": 2.236727833029836, + "grad_norm": 0.20805980265140533, + "learning_rate": 1.2575e-05, + "loss": 1.149, + "step": 1497 + }, + { + "epoch": 2.238221973198861, + "grad_norm": 0.3014054596424103, + "learning_rate": 1.255e-05, + "loss": 1.2276, + "step": 1498 + }, + { + "epoch": 2.2397161133678853, + "grad_norm": 0.18532797694206238, + "learning_rate": 1.2525000000000001e-05, + "loss": 1.3269, + "step": 1499 + }, + { + "epoch": 2.24121025353691, + "grad_norm": 0.19393125176429749, + "learning_rate": 1.25e-05, + "loss": 1.1997, + "step": 1500 + }, + { + "epoch": 2.2427043937059343, + "grad_norm": 0.3987423777580261, + "learning_rate": 1.2475e-05, + "loss": 1.4332, + "step": 1501 + }, + { + "epoch": 2.2441985338749593, + "grad_norm": 0.17318999767303467, + "learning_rate": 1.2450000000000001e-05, + "loss": 1.1957, + "step": 1502 + }, + { + "epoch": 2.2456926740439838, + "grad_norm": 1.6794114112854004, + "learning_rate": 1.2425e-05, + "loss": 1.3814, + "step": 1503 + }, + { + "epoch": 2.2471868142130083, + "grad_norm": 0.2993067502975464, + "learning_rate": 1.24e-05, + "loss": 1.2473, + "step": 1504 + }, + { + "epoch": 2.248680954382033, + "grad_norm": 0.35345566272735596, + "learning_rate": 1.2375000000000001e-05, + "loss": 1.315, + "step": 1505 + }, + { + "epoch": 2.2501750945510577, + "grad_norm": 0.24247638881206512, + "learning_rate": 1.235e-05, + "loss": 1.1762, + "step": 1506 + }, + { + "epoch": 2.251669234720082, + "grad_norm": 0.29046157002449036, + "learning_rate": 1.2325e-05, + "loss": 1.2739, + "step": 1507 + }, + { + "epoch": 2.2531633748891067, + "grad_norm": 0.22655948996543884, + "learning_rate": 1.23e-05, + "loss": 1.2653, + "step": 1508 + }, + { + "epoch": 2.254657515058131, + "grad_norm": 0.2086847871541977, + "learning_rate": 1.2275e-05, + "loss": 1.2846, + "step": 1509 + }, + { + "epoch": 2.256151655227156, + "grad_norm": 0.2213176190853119, + "learning_rate": 1.225e-05, + "loss": 1.2555, + "step": 1510 + }, + { + "epoch": 2.2576457953961806, + "grad_norm": 0.22715193033218384, + "learning_rate": 1.2225e-05, + "loss": 1.1853, + "step": 1511 + }, + { + "epoch": 2.259139935565205, + "grad_norm": 0.6379641890525818, + "learning_rate": 1.22e-05, + "loss": 1.3702, + "step": 1512 + }, + { + "epoch": 2.26063407573423, + "grad_norm": 0.37107372283935547, + "learning_rate": 1.2175e-05, + "loss": 1.2811, + "step": 1513 + }, + { + "epoch": 2.2621282159032545, + "grad_norm": 0.21601450443267822, + "learning_rate": 1.215e-05, + "loss": 1.5273, + "step": 1514 + }, + { + "epoch": 2.263622356072279, + "grad_norm": 0.2155398428440094, + "learning_rate": 1.2125e-05, + "loss": 1.3442, + "step": 1515 + }, + { + "epoch": 2.2651164962413035, + "grad_norm": 0.2486492097377777, + "learning_rate": 1.2100000000000001e-05, + "loss": 1.4766, + "step": 1516 + }, + { + "epoch": 2.266610636410328, + "grad_norm": 0.2049209475517273, + "learning_rate": 1.2075e-05, + "loss": 1.1512, + "step": 1517 + }, + { + "epoch": 2.268104776579353, + "grad_norm": 0.2839110493659973, + "learning_rate": 1.205e-05, + "loss": 1.3691, + "step": 1518 + }, + { + "epoch": 2.2695989167483774, + "grad_norm": 0.25949689745903015, + "learning_rate": 1.2025000000000001e-05, + "loss": 1.4586, + "step": 1519 + }, + { + "epoch": 2.271093056917402, + "grad_norm": 0.2694650888442993, + "learning_rate": 1.2e-05, + "loss": 1.2326, + "step": 1520 + }, + { + "epoch": 2.272587197086427, + "grad_norm": 0.30505526065826416, + "learning_rate": 1.1975e-05, + "loss": 1.4554, + "step": 1521 + }, + { + "epoch": 2.2740813372554514, + "grad_norm": 0.20058128237724304, + "learning_rate": 1.195e-05, + "loss": 1.28, + "step": 1522 + }, + { + "epoch": 2.275575477424476, + "grad_norm": 0.2585764527320862, + "learning_rate": 1.1925e-05, + "loss": 1.2229, + "step": 1523 + }, + { + "epoch": 2.2770696175935004, + "grad_norm": 0.6231411695480347, + "learning_rate": 1.19e-05, + "loss": 1.3299, + "step": 1524 + }, + { + "epoch": 2.2785637577625253, + "grad_norm": 0.14931946992874146, + "learning_rate": 1.1875e-05, + "loss": 1.2723, + "step": 1525 + }, + { + "epoch": 2.28005789793155, + "grad_norm": 0.1917908936738968, + "learning_rate": 1.185e-05, + "loss": 1.2713, + "step": 1526 + }, + { + "epoch": 2.2815520381005743, + "grad_norm": 0.17372849583625793, + "learning_rate": 1.1825e-05, + "loss": 1.2496, + "step": 1527 + }, + { + "epoch": 2.2830461782695988, + "grad_norm": 0.18889087438583374, + "learning_rate": 1.18e-05, + "loss": 1.2819, + "step": 1528 + }, + { + "epoch": 2.2845403184386237, + "grad_norm": 0.23077258467674255, + "learning_rate": 1.1775e-05, + "loss": 1.1978, + "step": 1529 + }, + { + "epoch": 2.286034458607648, + "grad_norm": 0.24370890855789185, + "learning_rate": 1.175e-05, + "loss": 1.3296, + "step": 1530 + }, + { + "epoch": 2.2875285987766727, + "grad_norm": 0.5306088924407959, + "learning_rate": 1.1725e-05, + "loss": 1.2872, + "step": 1531 + }, + { + "epoch": 2.289022738945697, + "grad_norm": 0.15988537669181824, + "learning_rate": 1.1700000000000001e-05, + "loss": 1.2157, + "step": 1532 + }, + { + "epoch": 2.290516879114722, + "grad_norm": 0.2093038707971573, + "learning_rate": 1.1675000000000001e-05, + "loss": 1.1948, + "step": 1533 + }, + { + "epoch": 2.2920110192837466, + "grad_norm": 0.24095523357391357, + "learning_rate": 1.1650000000000002e-05, + "loss": 1.1789, + "step": 1534 + }, + { + "epoch": 2.293505159452771, + "grad_norm": 0.3092842102050781, + "learning_rate": 1.1625000000000001e-05, + "loss": 1.3552, + "step": 1535 + }, + { + "epoch": 2.2949992996217956, + "grad_norm": 0.32683873176574707, + "learning_rate": 1.16e-05, + "loss": 1.3141, + "step": 1536 + }, + { + "epoch": 2.2964934397908205, + "grad_norm": 0.24342307448387146, + "learning_rate": 1.1575000000000002e-05, + "loss": 1.2125, + "step": 1537 + }, + { + "epoch": 2.297987579959845, + "grad_norm": 0.21990644931793213, + "learning_rate": 1.1550000000000001e-05, + "loss": 1.1445, + "step": 1538 + }, + { + "epoch": 2.2994817201288695, + "grad_norm": 0.19987203180789948, + "learning_rate": 1.1525e-05, + "loss": 1.2947, + "step": 1539 + }, + { + "epoch": 2.300975860297894, + "grad_norm": 0.21032652258872986, + "learning_rate": 1.1500000000000002e-05, + "loss": 1.333, + "step": 1540 + }, + { + "epoch": 2.302470000466919, + "grad_norm": 0.5228366255760193, + "learning_rate": 1.1475000000000001e-05, + "loss": 1.3451, + "step": 1541 + }, + { + "epoch": 2.3039641406359435, + "grad_norm": 0.2448897361755371, + "learning_rate": 1.145e-05, + "loss": 1.3963, + "step": 1542 + }, + { + "epoch": 2.305458280804968, + "grad_norm": 0.27660074830055237, + "learning_rate": 1.1425000000000002e-05, + "loss": 1.2531, + "step": 1543 + }, + { + "epoch": 2.3069524209739924, + "grad_norm": 0.24510708451271057, + "learning_rate": 1.1400000000000001e-05, + "loss": 1.2857, + "step": 1544 + }, + { + "epoch": 2.3084465611430174, + "grad_norm": 0.19473227858543396, + "learning_rate": 1.1375e-05, + "loss": 1.237, + "step": 1545 + }, + { + "epoch": 2.309940701312042, + "grad_norm": 0.1733362078666687, + "learning_rate": 1.1350000000000001e-05, + "loss": 1.3782, + "step": 1546 + }, + { + "epoch": 2.3114348414810664, + "grad_norm": 0.21493330597877502, + "learning_rate": 1.1325e-05, + "loss": 1.2649, + "step": 1547 + }, + { + "epoch": 2.312928981650091, + "grad_norm": 0.3164669871330261, + "learning_rate": 1.13e-05, + "loss": 1.4352, + "step": 1548 + }, + { + "epoch": 2.314423121819116, + "grad_norm": 0.26332464814186096, + "learning_rate": 1.1275000000000001e-05, + "loss": 1.4147, + "step": 1549 + }, + { + "epoch": 2.3159172619881403, + "grad_norm": 0.21300271153450012, + "learning_rate": 1.125e-05, + "loss": 1.3022, + "step": 1550 + }, + { + "epoch": 2.317411402157165, + "grad_norm": 0.20926804840564728, + "learning_rate": 1.1225e-05, + "loss": 1.4585, + "step": 1551 + }, + { + "epoch": 2.3189055423261893, + "grad_norm": 0.19575107097625732, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.2977, + "step": 1552 + }, + { + "epoch": 2.320399682495214, + "grad_norm": 0.369392067193985, + "learning_rate": 1.1175e-05, + "loss": 1.2781, + "step": 1553 + }, + { + "epoch": 2.3218938226642387, + "grad_norm": 0.21531105041503906, + "learning_rate": 1.115e-05, + "loss": 1.1297, + "step": 1554 + }, + { + "epoch": 2.323387962833263, + "grad_norm": 0.2804863750934601, + "learning_rate": 1.1125000000000001e-05, + "loss": 1.3836, + "step": 1555 + }, + { + "epoch": 2.324882103002288, + "grad_norm": 0.23862357437610626, + "learning_rate": 1.11e-05, + "loss": 1.3508, + "step": 1556 + }, + { + "epoch": 2.3263762431713126, + "grad_norm": 0.2789822816848755, + "learning_rate": 1.1075e-05, + "loss": 1.3357, + "step": 1557 + }, + { + "epoch": 2.327870383340337, + "grad_norm": 0.4456028640270233, + "learning_rate": 1.1050000000000001e-05, + "loss": 1.259, + "step": 1558 + }, + { + "epoch": 2.3293645235093616, + "grad_norm": 0.25457635521888733, + "learning_rate": 1.1025e-05, + "loss": 1.2132, + "step": 1559 + }, + { + "epoch": 2.330858663678386, + "grad_norm": 0.4204428791999817, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3101, + "step": 1560 + }, + { + "epoch": 2.332352803847411, + "grad_norm": 0.3540639877319336, + "learning_rate": 1.0975e-05, + "loss": 1.3815, + "step": 1561 + }, + { + "epoch": 2.3338469440164356, + "grad_norm": 0.2761100232601166, + "learning_rate": 1.095e-05, + "loss": 1.3601, + "step": 1562 + }, + { + "epoch": 2.33534108418546, + "grad_norm": 0.29980364441871643, + "learning_rate": 1.0925000000000001e-05, + "loss": 1.2059, + "step": 1563 + }, + { + "epoch": 2.336835224354485, + "grad_norm": 0.24911551177501678, + "learning_rate": 1.09e-05, + "loss": 1.2788, + "step": 1564 + }, + { + "epoch": 2.3383293645235095, + "grad_norm": 0.37243562936782837, + "learning_rate": 1.0875e-05, + "loss": 1.1434, + "step": 1565 + }, + { + "epoch": 2.339823504692534, + "grad_norm": 0.2589317560195923, + "learning_rate": 1.0850000000000001e-05, + "loss": 1.397, + "step": 1566 + }, + { + "epoch": 2.3413176448615585, + "grad_norm": 0.21287329494953156, + "learning_rate": 1.0825e-05, + "loss": 1.076, + "step": 1567 + }, + { + "epoch": 2.342811785030583, + "grad_norm": 0.17493118345737457, + "learning_rate": 1.08e-05, + "loss": 1.3775, + "step": 1568 + }, + { + "epoch": 2.344305925199608, + "grad_norm": 0.24420218169689178, + "learning_rate": 1.0775000000000001e-05, + "loss": 1.4579, + "step": 1569 + }, + { + "epoch": 2.3458000653686324, + "grad_norm": 0.18936094641685486, + "learning_rate": 1.075e-05, + "loss": 1.3076, + "step": 1570 + }, + { + "epoch": 2.347294205537657, + "grad_norm": 0.24192200601100922, + "learning_rate": 1.0725e-05, + "loss": 1.3495, + "step": 1571 + }, + { + "epoch": 2.348788345706682, + "grad_norm": 0.32218971848487854, + "learning_rate": 1.0700000000000001e-05, + "loss": 1.4041, + "step": 1572 + }, + { + "epoch": 2.3502824858757063, + "grad_norm": 0.2487693578004837, + "learning_rate": 1.0675e-05, + "loss": 1.2895, + "step": 1573 + }, + { + "epoch": 2.351776626044731, + "grad_norm": 0.14848390221595764, + "learning_rate": 1.065e-05, + "loss": 1.2928, + "step": 1574 + }, + { + "epoch": 2.3532707662137553, + "grad_norm": 0.19325977563858032, + "learning_rate": 1.0625e-05, + "loss": 1.2783, + "step": 1575 + }, + { + "epoch": 2.35476490638278, + "grad_norm": 0.293878436088562, + "learning_rate": 1.06e-05, + "loss": 1.3369, + "step": 1576 + }, + { + "epoch": 2.3562590465518047, + "grad_norm": 0.2537538707256317, + "learning_rate": 1.0575e-05, + "loss": 1.3704, + "step": 1577 + }, + { + "epoch": 2.3577531867208292, + "grad_norm": 0.41805949807167053, + "learning_rate": 1.055e-05, + "loss": 1.3075, + "step": 1578 + }, + { + "epoch": 2.3592473268898537, + "grad_norm": 0.20566672086715698, + "learning_rate": 1.0525e-05, + "loss": 1.1988, + "step": 1579 + }, + { + "epoch": 2.3607414670588787, + "grad_norm": 0.21227644383907318, + "learning_rate": 1.05e-05, + "loss": 1.3543, + "step": 1580 + }, + { + "epoch": 2.362235607227903, + "grad_norm": 0.30649107694625854, + "learning_rate": 1.0475e-05, + "loss": 1.3404, + "step": 1581 + }, + { + "epoch": 2.3637297473969276, + "grad_norm": 0.2410416305065155, + "learning_rate": 1.045e-05, + "loss": 1.4158, + "step": 1582 + }, + { + "epoch": 2.365223887565952, + "grad_norm": 0.1789264678955078, + "learning_rate": 1.0425e-05, + "loss": 1.4128, + "step": 1583 + }, + { + "epoch": 2.366718027734977, + "grad_norm": 0.280666708946228, + "learning_rate": 1.04e-05, + "loss": 1.3142, + "step": 1584 + }, + { + "epoch": 2.3682121679040016, + "grad_norm": 0.2530580461025238, + "learning_rate": 1.0375e-05, + "loss": 1.2719, + "step": 1585 + }, + { + "epoch": 2.369706308073026, + "grad_norm": 0.3545680642127991, + "learning_rate": 1.035e-05, + "loss": 1.354, + "step": 1586 + }, + { + "epoch": 2.3712004482420506, + "grad_norm": 0.4236186742782593, + "learning_rate": 1.0325e-05, + "loss": 1.0796, + "step": 1587 + }, + { + "epoch": 2.3726945884110755, + "grad_norm": 0.22073593735694885, + "learning_rate": 1.03e-05, + "loss": 1.3344, + "step": 1588 + }, + { + "epoch": 2.3741887285801, + "grad_norm": 0.2675930857658386, + "learning_rate": 1.0275e-05, + "loss": 1.3888, + "step": 1589 + }, + { + "epoch": 2.3756828687491245, + "grad_norm": 0.24186880886554718, + "learning_rate": 1.025e-05, + "loss": 1.158, + "step": 1590 + }, + { + "epoch": 2.377177008918149, + "grad_norm": 0.36522793769836426, + "learning_rate": 1.0225e-05, + "loss": 1.3172, + "step": 1591 + }, + { + "epoch": 2.378671149087174, + "grad_norm": 0.33718815445899963, + "learning_rate": 1.02e-05, + "loss": 1.3242, + "step": 1592 + }, + { + "epoch": 2.3801652892561984, + "grad_norm": 0.19472959637641907, + "learning_rate": 1.0175e-05, + "loss": 1.0996, + "step": 1593 + }, + { + "epoch": 2.381659429425223, + "grad_norm": 0.2957690954208374, + "learning_rate": 1.0150000000000001e-05, + "loss": 1.4084, + "step": 1594 + }, + { + "epoch": 2.3831535695942474, + "grad_norm": 0.27677658200263977, + "learning_rate": 1.0125e-05, + "loss": 1.3074, + "step": 1595 + }, + { + "epoch": 2.3846477097632723, + "grad_norm": 0.2773936092853546, + "learning_rate": 1.0100000000000002e-05, + "loss": 1.2948, + "step": 1596 + }, + { + "epoch": 2.386141849932297, + "grad_norm": 0.3917483687400818, + "learning_rate": 1.0075000000000001e-05, + "loss": 1.325, + "step": 1597 + }, + { + "epoch": 2.3876359901013213, + "grad_norm": 0.20603208243846893, + "learning_rate": 1.005e-05, + "loss": 1.1257, + "step": 1598 + }, + { + "epoch": 2.389130130270346, + "grad_norm": 0.2912125885486603, + "learning_rate": 1.0025000000000001e-05, + "loss": 1.2564, + "step": 1599 + }, + { + "epoch": 2.3906242704393708, + "grad_norm": 0.2543962001800537, + "learning_rate": 1e-05, + "loss": 1.241, + "step": 1600 + }, + { + "epoch": 2.3921184106083953, + "grad_norm": 0.27241256833076477, + "learning_rate": 9.975e-06, + "loss": 1.3456, + "step": 1601 + }, + { + "epoch": 2.3936125507774197, + "grad_norm": 0.3361535668373108, + "learning_rate": 9.950000000000001e-06, + "loss": 1.2202, + "step": 1602 + }, + { + "epoch": 2.3951066909464442, + "grad_norm": 0.3284830152988434, + "learning_rate": 9.925e-06, + "loss": 1.2916, + "step": 1603 + }, + { + "epoch": 2.396600831115469, + "grad_norm": 0.2478102296590805, + "learning_rate": 9.900000000000002e-06, + "loss": 1.3076, + "step": 1604 + }, + { + "epoch": 2.3980949712844937, + "grad_norm": 0.19693656265735626, + "learning_rate": 9.875000000000001e-06, + "loss": 1.3351, + "step": 1605 + }, + { + "epoch": 2.399589111453518, + "grad_norm": 0.2959229350090027, + "learning_rate": 9.85e-06, + "loss": 1.2544, + "step": 1606 + }, + { + "epoch": 2.4010832516225427, + "grad_norm": 0.18445654213428497, + "learning_rate": 9.825000000000002e-06, + "loss": 1.2567, + "step": 1607 + }, + { + "epoch": 2.4025773917915676, + "grad_norm": 0.2136373221874237, + "learning_rate": 9.800000000000001e-06, + "loss": 1.5701, + "step": 1608 + }, + { + "epoch": 2.404071531960592, + "grad_norm": 0.23023132979869843, + "learning_rate": 9.775e-06, + "loss": 1.2132, + "step": 1609 + }, + { + "epoch": 2.4055656721296166, + "grad_norm": 0.23368653655052185, + "learning_rate": 9.750000000000002e-06, + "loss": 1.482, + "step": 1610 + }, + { + "epoch": 2.407059812298641, + "grad_norm": 0.2301778644323349, + "learning_rate": 9.725000000000001e-06, + "loss": 1.4484, + "step": 1611 + }, + { + "epoch": 2.408553952467666, + "grad_norm": 0.20297956466674805, + "learning_rate": 9.7e-06, + "loss": 1.304, + "step": 1612 + }, + { + "epoch": 2.4100480926366905, + "grad_norm": 0.207230806350708, + "learning_rate": 9.675000000000001e-06, + "loss": 1.2294, + "step": 1613 + }, + { + "epoch": 2.411542232805715, + "grad_norm": 0.2300531417131424, + "learning_rate": 9.65e-06, + "loss": 1.4689, + "step": 1614 + }, + { + "epoch": 2.41303637297474, + "grad_norm": 0.26880812644958496, + "learning_rate": 9.625e-06, + "loss": 1.3848, + "step": 1615 + }, + { + "epoch": 2.4145305131437644, + "grad_norm": 0.23205740749835968, + "learning_rate": 9.600000000000001e-06, + "loss": 1.3925, + "step": 1616 + }, + { + "epoch": 2.416024653312789, + "grad_norm": 0.21103398501873016, + "learning_rate": 9.575e-06, + "loss": 1.1464, + "step": 1617 + }, + { + "epoch": 2.4175187934818134, + "grad_norm": 0.2809285819530487, + "learning_rate": 9.55e-06, + "loss": 1.3438, + "step": 1618 + }, + { + "epoch": 2.419012933650838, + "grad_norm": 0.19656510651111603, + "learning_rate": 9.525000000000001e-06, + "loss": 1.2569, + "step": 1619 + }, + { + "epoch": 2.420507073819863, + "grad_norm": 0.30520132184028625, + "learning_rate": 9.5e-06, + "loss": 1.395, + "step": 1620 + }, + { + "epoch": 2.4220012139888873, + "grad_norm": 0.18955890834331512, + "learning_rate": 9.475e-06, + "loss": 1.2901, + "step": 1621 + }, + { + "epoch": 2.423495354157912, + "grad_norm": 0.18221187591552734, + "learning_rate": 9.450000000000001e-06, + "loss": 1.2954, + "step": 1622 + }, + { + "epoch": 2.424989494326937, + "grad_norm": 0.19730985164642334, + "learning_rate": 9.425e-06, + "loss": 1.3947, + "step": 1623 + }, + { + "epoch": 2.4264836344959613, + "grad_norm": 0.2914806604385376, + "learning_rate": 9.4e-06, + "loss": 1.3879, + "step": 1624 + }, + { + "epoch": 2.4279777746649858, + "grad_norm": 0.20165985822677612, + "learning_rate": 9.375000000000001e-06, + "loss": 1.2932, + "step": 1625 + }, + { + "epoch": 2.4294719148340103, + "grad_norm": 0.39926818013191223, + "learning_rate": 9.35e-06, + "loss": 1.2433, + "step": 1626 + }, + { + "epoch": 2.4309660550030348, + "grad_norm": 0.32672664523124695, + "learning_rate": 9.325e-06, + "loss": 1.2686, + "step": 1627 + }, + { + "epoch": 2.4324601951720597, + "grad_norm": 0.2178495228290558, + "learning_rate": 9.3e-06, + "loss": 1.4247, + "step": 1628 + }, + { + "epoch": 2.433954335341084, + "grad_norm": 0.24548844993114471, + "learning_rate": 9.275e-06, + "loss": 1.3101, + "step": 1629 + }, + { + "epoch": 2.4354484755101087, + "grad_norm": 0.39940354228019714, + "learning_rate": 9.25e-06, + "loss": 1.4471, + "step": 1630 + }, + { + "epoch": 2.4369426156791336, + "grad_norm": 0.2500586211681366, + "learning_rate": 9.225e-06, + "loss": 1.316, + "step": 1631 + }, + { + "epoch": 2.438436755848158, + "grad_norm": 0.18969309329986572, + "learning_rate": 9.2e-06, + "loss": 1.1965, + "step": 1632 + }, + { + "epoch": 2.4399308960171826, + "grad_norm": 0.2904510498046875, + "learning_rate": 9.175000000000001e-06, + "loss": 1.1452, + "step": 1633 + }, + { + "epoch": 2.441425036186207, + "grad_norm": 0.18344439566135406, + "learning_rate": 9.15e-06, + "loss": 1.4705, + "step": 1634 + }, + { + "epoch": 2.4429191763552316, + "grad_norm": 0.2132856845855713, + "learning_rate": 9.125e-06, + "loss": 1.2666, + "step": 1635 + }, + { + "epoch": 2.4444133165242565, + "grad_norm": 0.26673436164855957, + "learning_rate": 9.100000000000001e-06, + "loss": 1.3129, + "step": 1636 + }, + { + "epoch": 2.445907456693281, + "grad_norm": 0.29528361558914185, + "learning_rate": 9.075e-06, + "loss": 1.2356, + "step": 1637 + }, + { + "epoch": 2.4474015968623055, + "grad_norm": 0.23240934312343597, + "learning_rate": 9.05e-06, + "loss": 1.4241, + "step": 1638 + }, + { + "epoch": 2.4488957370313305, + "grad_norm": 0.19893576204776764, + "learning_rate": 9.025e-06, + "loss": 1.1293, + "step": 1639 + }, + { + "epoch": 2.450389877200355, + "grad_norm": 0.16400805115699768, + "learning_rate": 9e-06, + "loss": 1.232, + "step": 1640 + }, + { + "epoch": 2.4518840173693794, + "grad_norm": 0.1535254716873169, + "learning_rate": 8.975e-06, + "loss": 1.242, + "step": 1641 + }, + { + "epoch": 2.453378157538404, + "grad_norm": 0.35040244460105896, + "learning_rate": 8.95e-06, + "loss": 1.1444, + "step": 1642 + }, + { + "epoch": 2.4548722977074284, + "grad_norm": 0.20848290622234344, + "learning_rate": 8.925e-06, + "loss": 1.3863, + "step": 1643 + }, + { + "epoch": 2.4563664378764534, + "grad_norm": 0.22934338450431824, + "learning_rate": 8.9e-06, + "loss": 1.199, + "step": 1644 + }, + { + "epoch": 2.457860578045478, + "grad_norm": 0.19408972561359406, + "learning_rate": 8.875e-06, + "loss": 1.2936, + "step": 1645 + }, + { + "epoch": 2.4593547182145024, + "grad_norm": 0.21626761555671692, + "learning_rate": 8.85e-06, + "loss": 1.2033, + "step": 1646 + }, + { + "epoch": 2.4608488583835273, + "grad_norm": 0.19572362303733826, + "learning_rate": 8.825e-06, + "loss": 1.4063, + "step": 1647 + }, + { + "epoch": 2.462342998552552, + "grad_norm": 0.24215538799762726, + "learning_rate": 8.8e-06, + "loss": 1.2936, + "step": 1648 + }, + { + "epoch": 2.4638371387215763, + "grad_norm": 0.19395551085472107, + "learning_rate": 8.775e-06, + "loss": 1.3743, + "step": 1649 + }, + { + "epoch": 2.4653312788906008, + "grad_norm": 0.2217566967010498, + "learning_rate": 8.75e-06, + "loss": 1.2934, + "step": 1650 + }, + { + "epoch": 2.4668254190596257, + "grad_norm": 0.20734982192516327, + "learning_rate": 8.725e-06, + "loss": 1.3652, + "step": 1651 + }, + { + "epoch": 2.46831955922865, + "grad_norm": 0.21142613887786865, + "learning_rate": 8.7e-06, + "loss": 1.3618, + "step": 1652 + }, + { + "epoch": 2.4698136993976747, + "grad_norm": 0.3824837803840637, + "learning_rate": 8.674999999999999e-06, + "loss": 1.442, + "step": 1653 + }, + { + "epoch": 2.471307839566699, + "grad_norm": 0.3306943476200104, + "learning_rate": 8.65e-06, + "loss": 1.3585, + "step": 1654 + }, + { + "epoch": 2.472801979735724, + "grad_norm": 0.6740633845329285, + "learning_rate": 8.625e-06, + "loss": 1.4169, + "step": 1655 + }, + { + "epoch": 2.4742961199047486, + "grad_norm": 0.19902844727039337, + "learning_rate": 8.599999999999999e-06, + "loss": 1.3756, + "step": 1656 + }, + { + "epoch": 2.475790260073773, + "grad_norm": 0.3373403251171112, + "learning_rate": 8.575000000000002e-06, + "loss": 1.2222, + "step": 1657 + }, + { + "epoch": 2.4772844002427976, + "grad_norm": 0.21016831696033478, + "learning_rate": 8.550000000000001e-06, + "loss": 1.2962, + "step": 1658 + }, + { + "epoch": 2.4787785404118226, + "grad_norm": 0.241668701171875, + "learning_rate": 8.525e-06, + "loss": 1.2163, + "step": 1659 + }, + { + "epoch": 2.480272680580847, + "grad_norm": 0.2818886637687683, + "learning_rate": 8.500000000000002e-06, + "loss": 1.3332, + "step": 1660 + }, + { + "epoch": 2.4817668207498715, + "grad_norm": 0.23386937379837036, + "learning_rate": 8.475000000000001e-06, + "loss": 1.4589, + "step": 1661 + }, + { + "epoch": 2.483260960918896, + "grad_norm": 0.31478041410446167, + "learning_rate": 8.45e-06, + "loss": 1.3337, + "step": 1662 + }, + { + "epoch": 2.484755101087921, + "grad_norm": 0.1997414529323578, + "learning_rate": 8.425000000000001e-06, + "loss": 1.1732, + "step": 1663 + }, + { + "epoch": 2.4862492412569455, + "grad_norm": 0.27734795212745667, + "learning_rate": 8.400000000000001e-06, + "loss": 1.2748, + "step": 1664 + }, + { + "epoch": 2.48774338142597, + "grad_norm": 0.23954452574253082, + "learning_rate": 8.375e-06, + "loss": 1.2237, + "step": 1665 + }, + { + "epoch": 2.4892375215949945, + "grad_norm": 0.27997615933418274, + "learning_rate": 8.350000000000001e-06, + "loss": 1.3232, + "step": 1666 + }, + { + "epoch": 2.4907316617640194, + "grad_norm": 0.1582835614681244, + "learning_rate": 8.325e-06, + "loss": 1.2447, + "step": 1667 + }, + { + "epoch": 2.492225801933044, + "grad_norm": 0.9380664229393005, + "learning_rate": 8.3e-06, + "loss": 1.363, + "step": 1668 + }, + { + "epoch": 2.4937199421020684, + "grad_norm": 0.2942129969596863, + "learning_rate": 8.275000000000001e-06, + "loss": 1.3214, + "step": 1669 + }, + { + "epoch": 2.495214082271093, + "grad_norm": 0.6295128464698792, + "learning_rate": 8.25e-06, + "loss": 1.3361, + "step": 1670 + }, + { + "epoch": 2.496708222440118, + "grad_norm": 0.19411110877990723, + "learning_rate": 8.225e-06, + "loss": 1.2091, + "step": 1671 + }, + { + "epoch": 2.4982023626091423, + "grad_norm": 0.2738019824028015, + "learning_rate": 8.200000000000001e-06, + "loss": 1.5115, + "step": 1672 + }, + { + "epoch": 2.499696502778167, + "grad_norm": 0.21079149842262268, + "learning_rate": 8.175e-06, + "loss": 1.2621, + "step": 1673 + }, + { + "epoch": 2.5011906429471917, + "grad_norm": 0.34388652443885803, + "learning_rate": 8.15e-06, + "loss": 1.2276, + "step": 1674 + }, + { + "epoch": 2.5026847831162162, + "grad_norm": 0.25901058316230774, + "learning_rate": 8.125000000000001e-06, + "loss": 1.2417, + "step": 1675 + }, + { + "epoch": 2.5041789232852407, + "grad_norm": 0.39429500699043274, + "learning_rate": 8.1e-06, + "loss": 1.3081, + "step": 1676 + }, + { + "epoch": 2.505673063454265, + "grad_norm": 0.3238086700439453, + "learning_rate": 8.075000000000001e-06, + "loss": 1.4371, + "step": 1677 + }, + { + "epoch": 2.5071672036232897, + "grad_norm": 0.2773047387599945, + "learning_rate": 8.050000000000001e-06, + "loss": 1.287, + "step": 1678 + }, + { + "epoch": 2.5086613437923146, + "grad_norm": 0.20827920734882355, + "learning_rate": 8.025e-06, + "loss": 1.2057, + "step": 1679 + }, + { + "epoch": 2.510155483961339, + "grad_norm": 0.18946285545825958, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3255, + "step": 1680 + }, + { + "epoch": 2.5116496241303636, + "grad_norm": 0.2746080160140991, + "learning_rate": 7.975e-06, + "loss": 1.3344, + "step": 1681 + }, + { + "epoch": 2.5131437642993886, + "grad_norm": 0.21937142312526703, + "learning_rate": 7.95e-06, + "loss": 1.3531, + "step": 1682 + }, + { + "epoch": 2.514637904468413, + "grad_norm": 0.25020983815193176, + "learning_rate": 7.925000000000001e-06, + "loss": 1.3749, + "step": 1683 + }, + { + "epoch": 2.5161320446374376, + "grad_norm": 0.19704294204711914, + "learning_rate": 7.9e-06, + "loss": 1.3779, + "step": 1684 + }, + { + "epoch": 2.517626184806462, + "grad_norm": 0.14473195374011993, + "learning_rate": 7.875e-06, + "loss": 1.2173, + "step": 1685 + }, + { + "epoch": 2.5191203249754865, + "grad_norm": 0.17848993837833405, + "learning_rate": 7.850000000000001e-06, + "loss": 1.3379, + "step": 1686 + }, + { + "epoch": 2.5206144651445115, + "grad_norm": 0.23943091928958893, + "learning_rate": 7.825e-06, + "loss": 1.2539, + "step": 1687 + }, + { + "epoch": 2.522108605313536, + "grad_norm": 0.17616920173168182, + "learning_rate": 7.8e-06, + "loss": 1.1976, + "step": 1688 + }, + { + "epoch": 2.5236027454825605, + "grad_norm": 0.3900967240333557, + "learning_rate": 7.775000000000001e-06, + "loss": 1.1891, + "step": 1689 + }, + { + "epoch": 2.5250968856515854, + "grad_norm": 0.22222371399402618, + "learning_rate": 7.75e-06, + "loss": 1.3877, + "step": 1690 + }, + { + "epoch": 2.52659102582061, + "grad_norm": 0.2556477189064026, + "learning_rate": 7.725e-06, + "loss": 1.4977, + "step": 1691 + }, + { + "epoch": 2.5280851659896344, + "grad_norm": 0.1834731101989746, + "learning_rate": 7.7e-06, + "loss": 1.2776, + "step": 1692 + }, + { + "epoch": 2.529579306158659, + "grad_norm": 0.2504706084728241, + "learning_rate": 7.675e-06, + "loss": 1.2564, + "step": 1693 + }, + { + "epoch": 2.5310734463276834, + "grad_norm": 0.2430802583694458, + "learning_rate": 7.65e-06, + "loss": 1.2452, + "step": 1694 + }, + { + "epoch": 2.5325675864967083, + "grad_norm": 0.1919189840555191, + "learning_rate": 7.625e-06, + "loss": 1.3058, + "step": 1695 + }, + { + "epoch": 2.534061726665733, + "grad_norm": 0.1974821835756302, + "learning_rate": 7.6e-06, + "loss": 1.3763, + "step": 1696 + }, + { + "epoch": 2.5355558668347573, + "grad_norm": 0.1689937561750412, + "learning_rate": 7.575e-06, + "loss": 1.2053, + "step": 1697 + }, + { + "epoch": 2.5370500070037822, + "grad_norm": 0.308765172958374, + "learning_rate": 7.55e-06, + "loss": 1.3868, + "step": 1698 + }, + { + "epoch": 2.5385441471728067, + "grad_norm": 0.19889512658119202, + "learning_rate": 7.525e-06, + "loss": 1.3385, + "step": 1699 + }, + { + "epoch": 2.5400382873418312, + "grad_norm": 0.2441468983888626, + "learning_rate": 7.5e-06, + "loss": 1.3314, + "step": 1700 + }, + { + "epoch": 2.5415324275108557, + "grad_norm": 0.2642837464809418, + "learning_rate": 7.4750000000000004e-06, + "loss": 1.3409, + "step": 1701 + }, + { + "epoch": 2.5430265676798802, + "grad_norm": 0.17711983621120453, + "learning_rate": 7.45e-06, + "loss": 1.2269, + "step": 1702 + }, + { + "epoch": 2.544520707848905, + "grad_norm": 0.21140949428081512, + "learning_rate": 7.425e-06, + "loss": 1.2436, + "step": 1703 + }, + { + "epoch": 2.5460148480179297, + "grad_norm": 0.17483963072299957, + "learning_rate": 7.4e-06, + "loss": 1.306, + "step": 1704 + }, + { + "epoch": 2.547508988186954, + "grad_norm": 0.2772678732872009, + "learning_rate": 7.375e-06, + "loss": 1.3276, + "step": 1705 + }, + { + "epoch": 2.549003128355979, + "grad_norm": 0.18797051906585693, + "learning_rate": 7.35e-06, + "loss": 1.0811, + "step": 1706 + }, + { + "epoch": 2.5504972685250036, + "grad_norm": 0.4826465845108032, + "learning_rate": 7.325e-06, + "loss": 1.3009, + "step": 1707 + }, + { + "epoch": 2.551991408694028, + "grad_norm": 0.16395214200019836, + "learning_rate": 7.2999999999999996e-06, + "loss": 1.1172, + "step": 1708 + }, + { + "epoch": 2.5534855488630526, + "grad_norm": 0.20583327114582062, + "learning_rate": 7.275e-06, + "loss": 1.2985, + "step": 1709 + }, + { + "epoch": 2.554979689032077, + "grad_norm": 0.24862010776996613, + "learning_rate": 7.25e-06, + "loss": 1.4367, + "step": 1710 + }, + { + "epoch": 2.556473829201102, + "grad_norm": 0.2522416412830353, + "learning_rate": 7.2249999999999994e-06, + "loss": 1.0932, + "step": 1711 + }, + { + "epoch": 2.5579679693701265, + "grad_norm": 0.2193172425031662, + "learning_rate": 7.2e-06, + "loss": 1.4272, + "step": 1712 + }, + { + "epoch": 2.559462109539151, + "grad_norm": 0.23507429659366608, + "learning_rate": 7.175e-06, + "loss": 1.2178, + "step": 1713 + }, + { + "epoch": 2.560956249708176, + "grad_norm": 0.2719856798648834, + "learning_rate": 7.15e-06, + "loss": 1.4399, + "step": 1714 + }, + { + "epoch": 2.5624503898772004, + "grad_norm": 0.2828925848007202, + "learning_rate": 7.1249999999999995e-06, + "loss": 1.4121, + "step": 1715 + }, + { + "epoch": 2.563944530046225, + "grad_norm": 0.2640264332294464, + "learning_rate": 7.1e-06, + "loss": 1.3727, + "step": 1716 + }, + { + "epoch": 2.5654386702152494, + "grad_norm": 0.20125895738601685, + "learning_rate": 7.075e-06, + "loss": 1.3127, + "step": 1717 + }, + { + "epoch": 2.566932810384274, + "grad_norm": 0.2524392008781433, + "learning_rate": 7.049999999999999e-06, + "loss": 1.3005, + "step": 1718 + }, + { + "epoch": 2.568426950553299, + "grad_norm": 0.24239836633205414, + "learning_rate": 7.025000000000001e-06, + "loss": 1.4189, + "step": 1719 + }, + { + "epoch": 2.5699210907223233, + "grad_norm": 0.17018896341323853, + "learning_rate": 7.000000000000001e-06, + "loss": 1.2798, + "step": 1720 + }, + { + "epoch": 2.571415230891348, + "grad_norm": 0.1992681324481964, + "learning_rate": 6.975000000000001e-06, + "loss": 1.3166, + "step": 1721 + }, + { + "epoch": 2.5729093710603728, + "grad_norm": 0.19908703863620758, + "learning_rate": 6.950000000000001e-06, + "loss": 1.4153, + "step": 1722 + }, + { + "epoch": 2.5744035112293973, + "grad_norm": 0.26776745915412903, + "learning_rate": 6.925000000000001e-06, + "loss": 1.2286, + "step": 1723 + }, + { + "epoch": 2.5758976513984218, + "grad_norm": 0.2907460927963257, + "learning_rate": 6.900000000000001e-06, + "loss": 1.559, + "step": 1724 + }, + { + "epoch": 2.5773917915674467, + "grad_norm": 0.30556491017341614, + "learning_rate": 6.875000000000001e-06, + "loss": 1.3937, + "step": 1725 + }, + { + "epoch": 2.578885931736471, + "grad_norm": 0.2142094224691391, + "learning_rate": 6.8500000000000005e-06, + "loss": 1.3817, + "step": 1726 + }, + { + "epoch": 2.5803800719054957, + "grad_norm": 0.230880469083786, + "learning_rate": 6.825000000000001e-06, + "loss": 1.2456, + "step": 1727 + }, + { + "epoch": 2.58187421207452, + "grad_norm": 0.1611093133687973, + "learning_rate": 6.800000000000001e-06, + "loss": 1.2031, + "step": 1728 + }, + { + "epoch": 2.5833683522435447, + "grad_norm": 0.2961817979812622, + "learning_rate": 6.775000000000001e-06, + "loss": 1.2962, + "step": 1729 + }, + { + "epoch": 2.5848624924125696, + "grad_norm": 0.25123047828674316, + "learning_rate": 6.750000000000001e-06, + "loss": 1.237, + "step": 1730 + }, + { + "epoch": 2.586356632581594, + "grad_norm": 0.19058406352996826, + "learning_rate": 6.725000000000001e-06, + "loss": 1.3968, + "step": 1731 + }, + { + "epoch": 2.5878507727506186, + "grad_norm": 0.28494787216186523, + "learning_rate": 6.700000000000001e-06, + "loss": 1.519, + "step": 1732 + }, + { + "epoch": 2.5893449129196435, + "grad_norm": 0.26594290137290955, + "learning_rate": 6.6750000000000005e-06, + "loss": 1.2813, + "step": 1733 + }, + { + "epoch": 2.590839053088668, + "grad_norm": 0.362155556678772, + "learning_rate": 6.650000000000001e-06, + "loss": 1.3084, + "step": 1734 + }, + { + "epoch": 2.5923331932576925, + "grad_norm": 0.2774062156677246, + "learning_rate": 6.625000000000001e-06, + "loss": 1.3572, + "step": 1735 + }, + { + "epoch": 2.593827333426717, + "grad_norm": 0.26589712500572205, + "learning_rate": 6.6e-06, + "loss": 1.3727, + "step": 1736 + }, + { + "epoch": 2.5953214735957415, + "grad_norm": 0.1808343082666397, + "learning_rate": 6.5750000000000006e-06, + "loss": 1.2308, + "step": 1737 + }, + { + "epoch": 2.5968156137647664, + "grad_norm": 0.2863466739654541, + "learning_rate": 6.550000000000001e-06, + "loss": 1.2566, + "step": 1738 + }, + { + "epoch": 2.598309753933791, + "grad_norm": 0.39508727192878723, + "learning_rate": 6.525e-06, + "loss": 1.2495, + "step": 1739 + }, + { + "epoch": 2.5998038941028154, + "grad_norm": 0.20653335750102997, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.3729, + "step": 1740 + }, + { + "epoch": 2.6012980342718404, + "grad_norm": 0.2799397110939026, + "learning_rate": 6.475000000000001e-06, + "loss": 1.1533, + "step": 1741 + }, + { + "epoch": 2.602792174440865, + "grad_norm": 0.2511701285839081, + "learning_rate": 6.45e-06, + "loss": 1.3166, + "step": 1742 + }, + { + "epoch": 2.6042863146098894, + "grad_norm": 0.42425861954689026, + "learning_rate": 6.425e-06, + "loss": 1.2655, + "step": 1743 + }, + { + "epoch": 2.605780454778914, + "grad_norm": 0.16690030694007874, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.1529, + "step": 1744 + }, + { + "epoch": 2.6072745949479383, + "grad_norm": 0.29977497458457947, + "learning_rate": 6.375000000000001e-06, + "loss": 1.2709, + "step": 1745 + }, + { + "epoch": 2.6087687351169633, + "grad_norm": 0.17054329812526703, + "learning_rate": 6.35e-06, + "loss": 1.2346, + "step": 1746 + }, + { + "epoch": 2.6102628752859878, + "grad_norm": 0.18217666447162628, + "learning_rate": 6.3250000000000004e-06, + "loss": 1.363, + "step": 1747 + }, + { + "epoch": 2.6117570154550123, + "grad_norm": 0.18411266803741455, + "learning_rate": 6.300000000000001e-06, + "loss": 1.2045, + "step": 1748 + }, + { + "epoch": 2.613251155624037, + "grad_norm": 0.22760531306266785, + "learning_rate": 6.275e-06, + "loss": 1.3307, + "step": 1749 + }, + { + "epoch": 2.6147452957930617, + "grad_norm": 0.3128303587436676, + "learning_rate": 6.25e-06, + "loss": 1.4054, + "step": 1750 + }, + { + "epoch": 2.616239435962086, + "grad_norm": 0.1927785873413086, + "learning_rate": 6.2250000000000005e-06, + "loss": 1.2833, + "step": 1751 + }, + { + "epoch": 2.6177335761311107, + "grad_norm": 0.19453319907188416, + "learning_rate": 6.2e-06, + "loss": 1.1372, + "step": 1752 + }, + { + "epoch": 2.619227716300135, + "grad_norm": 0.3225673735141754, + "learning_rate": 6.175e-06, + "loss": 1.2579, + "step": 1753 + }, + { + "epoch": 2.62072185646916, + "grad_norm": 0.18228133022785187, + "learning_rate": 6.15e-06, + "loss": 1.4589, + "step": 1754 + }, + { + "epoch": 2.6222159966381846, + "grad_norm": 0.23270145058631897, + "learning_rate": 6.125e-06, + "loss": 1.403, + "step": 1755 + }, + { + "epoch": 2.623710136807209, + "grad_norm": 0.21594181656837463, + "learning_rate": 6.1e-06, + "loss": 1.4261, + "step": 1756 + }, + { + "epoch": 2.625204276976234, + "grad_norm": 0.3312102258205414, + "learning_rate": 6.075e-06, + "loss": 1.2938, + "step": 1757 + }, + { + "epoch": 2.6266984171452585, + "grad_norm": 0.2661645710468292, + "learning_rate": 6.0500000000000005e-06, + "loss": 1.411, + "step": 1758 + }, + { + "epoch": 2.628192557314283, + "grad_norm": 0.1631266474723816, + "learning_rate": 6.025e-06, + "loss": 1.3814, + "step": 1759 + }, + { + "epoch": 2.6296866974833075, + "grad_norm": 0.2169022560119629, + "learning_rate": 6e-06, + "loss": 1.4121, + "step": 1760 + }, + { + "epoch": 2.631180837652332, + "grad_norm": 0.23801642656326294, + "learning_rate": 5.975e-06, + "loss": 1.1812, + "step": 1761 + }, + { + "epoch": 2.632674977821357, + "grad_norm": 0.19962309300899506, + "learning_rate": 5.95e-06, + "loss": 1.2209, + "step": 1762 + }, + { + "epoch": 2.6341691179903814, + "grad_norm": 0.2108369767665863, + "learning_rate": 5.925e-06, + "loss": 1.315, + "step": 1763 + }, + { + "epoch": 2.635663258159406, + "grad_norm": 0.22944194078445435, + "learning_rate": 5.9e-06, + "loss": 1.2293, + "step": 1764 + }, + { + "epoch": 2.637157398328431, + "grad_norm": 0.23019960522651672, + "learning_rate": 5.875e-06, + "loss": 1.3415, + "step": 1765 + }, + { + "epoch": 2.6386515384974554, + "grad_norm": 0.18209655582904816, + "learning_rate": 5.850000000000001e-06, + "loss": 1.2341, + "step": 1766 + }, + { + "epoch": 2.64014567866648, + "grad_norm": 0.1567586064338684, + "learning_rate": 5.825000000000001e-06, + "loss": 1.121, + "step": 1767 + }, + { + "epoch": 2.6416398188355044, + "grad_norm": 0.87409907579422, + "learning_rate": 5.8e-06, + "loss": 1.2113, + "step": 1768 + }, + { + "epoch": 2.643133959004529, + "grad_norm": 0.2868337035179138, + "learning_rate": 5.775000000000001e-06, + "loss": 1.1993, + "step": 1769 + }, + { + "epoch": 2.644628099173554, + "grad_norm": 0.18613767623901367, + "learning_rate": 5.750000000000001e-06, + "loss": 1.1514, + "step": 1770 + }, + { + "epoch": 2.6461222393425783, + "grad_norm": 0.18503230810165405, + "learning_rate": 5.725e-06, + "loss": 1.1599, + "step": 1771 + }, + { + "epoch": 2.647616379511603, + "grad_norm": 0.30136963725090027, + "learning_rate": 5.7000000000000005e-06, + "loss": 1.2663, + "step": 1772 + }, + { + "epoch": 2.6491105196806277, + "grad_norm": 0.42282551527023315, + "learning_rate": 5.675000000000001e-06, + "loss": 1.3327, + "step": 1773 + }, + { + "epoch": 2.650604659849652, + "grad_norm": 0.7006960511207581, + "learning_rate": 5.65e-06, + "loss": 1.2708, + "step": 1774 + }, + { + "epoch": 2.6520988000186767, + "grad_norm": 0.15408888459205627, + "learning_rate": 5.625e-06, + "loss": 1.2682, + "step": 1775 + }, + { + "epoch": 2.653592940187701, + "grad_norm": 0.25027719140052795, + "learning_rate": 5.600000000000001e-06, + "loss": 1.3571, + "step": 1776 + }, + { + "epoch": 2.6550870803567257, + "grad_norm": 0.19256514310836792, + "learning_rate": 5.575e-06, + "loss": 1.2386, + "step": 1777 + }, + { + "epoch": 2.6565812205257506, + "grad_norm": 0.18717578053474426, + "learning_rate": 5.55e-06, + "loss": 1.1667, + "step": 1778 + }, + { + "epoch": 2.658075360694775, + "grad_norm": 0.16257885098457336, + "learning_rate": 5.5250000000000005e-06, + "loss": 1.2965, + "step": 1779 + }, + { + "epoch": 2.6595695008637996, + "grad_norm": 0.2796028256416321, + "learning_rate": 5.500000000000001e-06, + "loss": 1.3928, + "step": 1780 + }, + { + "epoch": 2.6610636410328246, + "grad_norm": 0.2988707423210144, + "learning_rate": 5.475e-06, + "loss": 1.4746, + "step": 1781 + }, + { + "epoch": 2.662557781201849, + "grad_norm": 0.2644762396812439, + "learning_rate": 5.45e-06, + "loss": 1.3648, + "step": 1782 + }, + { + "epoch": 2.6640519213708735, + "grad_norm": 0.23171842098236084, + "learning_rate": 5.4250000000000006e-06, + "loss": 1.266, + "step": 1783 + }, + { + "epoch": 2.6655460615398985, + "grad_norm": 0.21569930016994476, + "learning_rate": 5.4e-06, + "loss": 1.344, + "step": 1784 + }, + { + "epoch": 2.667040201708923, + "grad_norm": 0.31907474994659424, + "learning_rate": 5.375e-06, + "loss": 1.5272, + "step": 1785 + }, + { + "epoch": 2.6685343418779475, + "grad_norm": 0.18725049495697021, + "learning_rate": 5.3500000000000004e-06, + "loss": 1.4265, + "step": 1786 + }, + { + "epoch": 2.670028482046972, + "grad_norm": 0.22397437691688538, + "learning_rate": 5.325e-06, + "loss": 1.1694, + "step": 1787 + }, + { + "epoch": 2.6715226222159965, + "grad_norm": 0.2414587140083313, + "learning_rate": 5.3e-06, + "loss": 1.1475, + "step": 1788 + }, + { + "epoch": 2.6730167623850214, + "grad_norm": 0.307595819234848, + "learning_rate": 5.275e-06, + "loss": 1.36, + "step": 1789 + }, + { + "epoch": 2.674510902554046, + "grad_norm": 0.20186826586723328, + "learning_rate": 5.25e-06, + "loss": 1.35, + "step": 1790 + }, + { + "epoch": 2.6760050427230704, + "grad_norm": 0.20282383263111115, + "learning_rate": 5.225e-06, + "loss": 1.2654, + "step": 1791 + }, + { + "epoch": 2.6774991828920953, + "grad_norm": 0.21675416827201843, + "learning_rate": 5.2e-06, + "loss": 1.2554, + "step": 1792 + }, + { + "epoch": 2.67899332306112, + "grad_norm": 0.47374227643013, + "learning_rate": 5.175e-06, + "loss": 1.4186, + "step": 1793 + }, + { + "epoch": 2.6804874632301443, + "grad_norm": 0.17790532112121582, + "learning_rate": 5.15e-06, + "loss": 1.277, + "step": 1794 + }, + { + "epoch": 2.681981603399169, + "grad_norm": 0.19837185740470886, + "learning_rate": 5.125e-06, + "loss": 1.3053, + "step": 1795 + }, + { + "epoch": 2.6834757435681933, + "grad_norm": 0.174654021859169, + "learning_rate": 5.1e-06, + "loss": 1.2047, + "step": 1796 + }, + { + "epoch": 2.6849698837372182, + "grad_norm": 0.6168277263641357, + "learning_rate": 5.0750000000000005e-06, + "loss": 1.3662, + "step": 1797 + }, + { + "epoch": 2.6864640239062427, + "grad_norm": 0.33519673347473145, + "learning_rate": 5.050000000000001e-06, + "loss": 1.515, + "step": 1798 + }, + { + "epoch": 2.687958164075267, + "grad_norm": 0.2172117829322815, + "learning_rate": 5.025e-06, + "loss": 1.2729, + "step": 1799 + }, + { + "epoch": 2.689452304244292, + "grad_norm": 0.2625578045845032, + "learning_rate": 5e-06, + "loss": 1.3624, + "step": 1800 + }, + { + "epoch": 2.6909464444133167, + "grad_norm": 0.26138144731521606, + "learning_rate": 4.975000000000001e-06, + "loss": 1.2029, + "step": 1801 + }, + { + "epoch": 2.692440584582341, + "grad_norm": 0.25910723209381104, + "learning_rate": 4.950000000000001e-06, + "loss": 1.4087, + "step": 1802 + }, + { + "epoch": 2.6939347247513656, + "grad_norm": 0.3034740090370178, + "learning_rate": 4.925e-06, + "loss": 1.162, + "step": 1803 + }, + { + "epoch": 2.69542886492039, + "grad_norm": 0.21220143139362335, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.2157, + "step": 1804 + }, + { + "epoch": 2.696923005089415, + "grad_norm": 0.27650538086891174, + "learning_rate": 4.875000000000001e-06, + "loss": 1.3509, + "step": 1805 + }, + { + "epoch": 2.6984171452584396, + "grad_norm": 0.22808341681957245, + "learning_rate": 4.85e-06, + "loss": 1.2005, + "step": 1806 + }, + { + "epoch": 2.699911285427464, + "grad_norm": 0.18333181738853455, + "learning_rate": 4.825e-06, + "loss": 1.2351, + "step": 1807 + }, + { + "epoch": 2.701405425596489, + "grad_norm": 0.5237784385681152, + "learning_rate": 4.800000000000001e-06, + "loss": 1.3003, + "step": 1808 + }, + { + "epoch": 2.7028995657655135, + "grad_norm": 0.4465050995349884, + "learning_rate": 4.775e-06, + "loss": 1.474, + "step": 1809 + }, + { + "epoch": 2.704393705934538, + "grad_norm": 0.4441722333431244, + "learning_rate": 4.75e-06, + "loss": 1.4661, + "step": 1810 + }, + { + "epoch": 2.7058878461035625, + "grad_norm": 0.3523397743701935, + "learning_rate": 4.7250000000000005e-06, + "loss": 1.2954, + "step": 1811 + }, + { + "epoch": 2.707381986272587, + "grad_norm": 0.2340545803308487, + "learning_rate": 4.7e-06, + "loss": 1.1822, + "step": 1812 + }, + { + "epoch": 2.708876126441612, + "grad_norm": 0.34939736127853394, + "learning_rate": 4.675e-06, + "loss": 1.2706, + "step": 1813 + }, + { + "epoch": 2.7103702666106364, + "grad_norm": 0.24331608414649963, + "learning_rate": 4.65e-06, + "loss": 1.3334, + "step": 1814 + }, + { + "epoch": 2.711864406779661, + "grad_norm": 0.1805030107498169, + "learning_rate": 4.625e-06, + "loss": 1.3894, + "step": 1815 + }, + { + "epoch": 2.713358546948686, + "grad_norm": 0.6400303244590759, + "learning_rate": 4.6e-06, + "loss": 1.4153, + "step": 1816 + }, + { + "epoch": 2.7148526871177103, + "grad_norm": 0.2347690910100937, + "learning_rate": 4.575e-06, + "loss": 1.4021, + "step": 1817 + }, + { + "epoch": 2.716346827286735, + "grad_norm": 2.5086920261383057, + "learning_rate": 4.5500000000000005e-06, + "loss": 1.4322, + "step": 1818 + }, + { + "epoch": 2.7178409674557593, + "grad_norm": 0.18511001765727997, + "learning_rate": 4.525e-06, + "loss": 1.2838, + "step": 1819 + }, + { + "epoch": 2.719335107624784, + "grad_norm": 0.2383240908384323, + "learning_rate": 4.5e-06, + "loss": 1.0991, + "step": 1820 + }, + { + "epoch": 2.7208292477938087, + "grad_norm": 0.23585478961467743, + "learning_rate": 4.475e-06, + "loss": 1.2393, + "step": 1821 + }, + { + "epoch": 2.7223233879628332, + "grad_norm": 0.2568042278289795, + "learning_rate": 4.45e-06, + "loss": 1.1486, + "step": 1822 + }, + { + "epoch": 2.7238175281318577, + "grad_norm": 0.20862841606140137, + "learning_rate": 4.425e-06, + "loss": 1.1885, + "step": 1823 + }, + { + "epoch": 2.7253116683008827, + "grad_norm": 0.19219589233398438, + "learning_rate": 4.4e-06, + "loss": 1.4042, + "step": 1824 + }, + { + "epoch": 2.726805808469907, + "grad_norm": 0.3805523216724396, + "learning_rate": 4.375e-06, + "loss": 1.2164, + "step": 1825 + }, + { + "epoch": 2.7282999486389317, + "grad_norm": 0.49763739109039307, + "learning_rate": 4.35e-06, + "loss": 1.3467, + "step": 1826 + }, + { + "epoch": 2.729794088807956, + "grad_norm": 0.17709727585315704, + "learning_rate": 4.325e-06, + "loss": 1.1515, + "step": 1827 + }, + { + "epoch": 2.7312882289769806, + "grad_norm": 0.20847739279270172, + "learning_rate": 4.2999999999999995e-06, + "loss": 1.1841, + "step": 1828 + }, + { + "epoch": 2.7327823691460056, + "grad_norm": 0.21395619213581085, + "learning_rate": 4.2750000000000006e-06, + "loss": 1.3783, + "step": 1829 + }, + { + "epoch": 2.73427650931503, + "grad_norm": 0.4332375228404999, + "learning_rate": 4.250000000000001e-06, + "loss": 1.1825, + "step": 1830 + }, + { + "epoch": 2.7357706494840546, + "grad_norm": 0.8193243145942688, + "learning_rate": 4.225e-06, + "loss": 1.5271, + "step": 1831 + }, + { + "epoch": 2.7372647896530795, + "grad_norm": 0.19088123738765717, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.3111, + "step": 1832 + }, + { + "epoch": 2.738758929822104, + "grad_norm": 0.4072718620300293, + "learning_rate": 4.175000000000001e-06, + "loss": 1.2883, + "step": 1833 + }, + { + "epoch": 2.7402530699911285, + "grad_norm": 0.2014656513929367, + "learning_rate": 4.15e-06, + "loss": 1.2469, + "step": 1834 + }, + { + "epoch": 2.741747210160153, + "grad_norm": 0.2613576054573059, + "learning_rate": 4.125e-06, + "loss": 1.2822, + "step": 1835 + }, + { + "epoch": 2.7432413503291775, + "grad_norm": 0.21430745720863342, + "learning_rate": 4.1000000000000006e-06, + "loss": 1.2495, + "step": 1836 + }, + { + "epoch": 2.7447354904982024, + "grad_norm": 0.18431149423122406, + "learning_rate": 4.075e-06, + "loss": 1.3818, + "step": 1837 + }, + { + "epoch": 2.746229630667227, + "grad_norm": 0.2120877504348755, + "learning_rate": 4.05e-06, + "loss": 1.2272, + "step": 1838 + }, + { + "epoch": 2.7477237708362514, + "grad_norm": 2.143777370452881, + "learning_rate": 4.0250000000000004e-06, + "loss": 1.309, + "step": 1839 + }, + { + "epoch": 2.7492179110052763, + "grad_norm": 0.2145431488752365, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3135, + "step": 1840 + }, + { + "epoch": 2.750712051174301, + "grad_norm": 0.2832334339618683, + "learning_rate": 3.975e-06, + "loss": 1.3395, + "step": 1841 + }, + { + "epoch": 2.7522061913433253, + "grad_norm": 0.1784759759902954, + "learning_rate": 3.95e-06, + "loss": 1.3056, + "step": 1842 + }, + { + "epoch": 2.7537003315123503, + "grad_norm": 0.21450982987880707, + "learning_rate": 3.9250000000000005e-06, + "loss": 1.1001, + "step": 1843 + }, + { + "epoch": 2.7551944716813748, + "grad_norm": 0.2597975730895996, + "learning_rate": 3.9e-06, + "loss": 1.3779, + "step": 1844 + }, + { + "epoch": 2.7566886118503993, + "grad_norm": 0.2315620481967926, + "learning_rate": 3.875e-06, + "loss": 1.3914, + "step": 1845 + }, + { + "epoch": 2.7581827520194238, + "grad_norm": 0.25470679998397827, + "learning_rate": 3.85e-06, + "loss": 1.2633, + "step": 1846 + }, + { + "epoch": 2.7596768921884482, + "grad_norm": 0.2289915233850479, + "learning_rate": 3.825e-06, + "loss": 1.2158, + "step": 1847 + }, + { + "epoch": 2.761171032357473, + "grad_norm": 0.6363893151283264, + "learning_rate": 3.8e-06, + "loss": 1.4446, + "step": 1848 + }, + { + "epoch": 2.7626651725264977, + "grad_norm": 0.19268900156021118, + "learning_rate": 3.775e-06, + "loss": 1.2836, + "step": 1849 + }, + { + "epoch": 2.764159312695522, + "grad_norm": 0.515834629535675, + "learning_rate": 3.75e-06, + "loss": 1.3646, + "step": 1850 + }, + { + "epoch": 2.765653452864547, + "grad_norm": 0.2783832848072052, + "learning_rate": 3.725e-06, + "loss": 1.3876, + "step": 1851 + }, + { + "epoch": 2.7671475930335716, + "grad_norm": 0.21633508801460266, + "learning_rate": 3.7e-06, + "loss": 1.1456, + "step": 1852 + }, + { + "epoch": 2.768641733202596, + "grad_norm": 0.22185078263282776, + "learning_rate": 3.675e-06, + "loss": 1.5259, + "step": 1853 + }, + { + "epoch": 2.7701358733716206, + "grad_norm": 0.29004672169685364, + "learning_rate": 3.6499999999999998e-06, + "loss": 1.3148, + "step": 1854 + }, + { + "epoch": 2.771630013540645, + "grad_norm": 0.2006434053182602, + "learning_rate": 3.625e-06, + "loss": 1.1994, + "step": 1855 + }, + { + "epoch": 2.77312415370967, + "grad_norm": 0.21816261112689972, + "learning_rate": 3.6e-06, + "loss": 1.3848, + "step": 1856 + }, + { + "epoch": 2.7746182938786945, + "grad_norm": 0.21714898943901062, + "learning_rate": 3.575e-06, + "loss": 1.2319, + "step": 1857 + }, + { + "epoch": 2.776112434047719, + "grad_norm": 0.21400469541549683, + "learning_rate": 3.55e-06, + "loss": 1.4311, + "step": 1858 + }, + { + "epoch": 2.777606574216744, + "grad_norm": 0.20019961893558502, + "learning_rate": 3.5249999999999997e-06, + "loss": 1.3558, + "step": 1859 + }, + { + "epoch": 2.7791007143857684, + "grad_norm": 0.24723859131336212, + "learning_rate": 3.5000000000000004e-06, + "loss": 1.1142, + "step": 1860 + }, + { + "epoch": 2.780594854554793, + "grad_norm": 0.22617855668067932, + "learning_rate": 3.4750000000000006e-06, + "loss": 1.3948, + "step": 1861 + }, + { + "epoch": 2.7820889947238174, + "grad_norm": 0.324815958738327, + "learning_rate": 3.4500000000000004e-06, + "loss": 1.2817, + "step": 1862 + }, + { + "epoch": 2.783583134892842, + "grad_norm": 0.19511297345161438, + "learning_rate": 3.4250000000000002e-06, + "loss": 1.5176, + "step": 1863 + }, + { + "epoch": 2.785077275061867, + "grad_norm": 0.8152601718902588, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.1762, + "step": 1864 + }, + { + "epoch": 2.7865714152308914, + "grad_norm": 0.23388169705867767, + "learning_rate": 3.3750000000000003e-06, + "loss": 1.1857, + "step": 1865 + }, + { + "epoch": 2.788065555399916, + "grad_norm": 0.28998321294784546, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0647, + "step": 1866 + }, + { + "epoch": 2.789559695568941, + "grad_norm": 0.2541603446006775, + "learning_rate": 3.3250000000000004e-06, + "loss": 1.3563, + "step": 1867 + }, + { + "epoch": 2.7910538357379653, + "grad_norm": 0.25344282388687134, + "learning_rate": 3.3e-06, + "loss": 1.4685, + "step": 1868 + }, + { + "epoch": 2.7925479759069898, + "grad_norm": 0.22146444022655487, + "learning_rate": 3.2750000000000004e-06, + "loss": 1.3198, + "step": 1869 + }, + { + "epoch": 2.7940421160760143, + "grad_norm": 0.21067120134830475, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.3077, + "step": 1870 + }, + { + "epoch": 2.7955362562450388, + "grad_norm": 0.45738738775253296, + "learning_rate": 3.225e-06, + "loss": 1.3165, + "step": 1871 + }, + { + "epoch": 2.7970303964140637, + "grad_norm": 0.1831294596195221, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.208, + "step": 1872 + }, + { + "epoch": 2.798524536583088, + "grad_norm": 0.2563336491584778, + "learning_rate": 3.175e-06, + "loss": 1.4487, + "step": 1873 + }, + { + "epoch": 2.8000186767521127, + "grad_norm": 0.15740127861499786, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.2381, + "step": 1874 + }, + { + "epoch": 2.8015128169211376, + "grad_norm": 0.2575199007987976, + "learning_rate": 3.125e-06, + "loss": 1.2935, + "step": 1875 + }, + { + "epoch": 2.803006957090162, + "grad_norm": 0.20497269928455353, + "learning_rate": 3.1e-06, + "loss": 1.5041, + "step": 1876 + }, + { + "epoch": 2.8045010972591866, + "grad_norm": 0.4272780418395996, + "learning_rate": 3.075e-06, + "loss": 1.3039, + "step": 1877 + }, + { + "epoch": 2.805995237428211, + "grad_norm": 0.26943546533584595, + "learning_rate": 3.05e-06, + "loss": 1.1747, + "step": 1878 + }, + { + "epoch": 2.8074893775972356, + "grad_norm": 0.3755415678024292, + "learning_rate": 3.0250000000000003e-06, + "loss": 1.394, + "step": 1879 + }, + { + "epoch": 2.8089835177662605, + "grad_norm": 0.2485462874174118, + "learning_rate": 3e-06, + "loss": 1.317, + "step": 1880 + }, + { + "epoch": 2.810477657935285, + "grad_norm": 0.1786644607782364, + "learning_rate": 2.975e-06, + "loss": 1.2546, + "step": 1881 + }, + { + "epoch": 2.8119717981043095, + "grad_norm": 1.068226933479309, + "learning_rate": 2.95e-06, + "loss": 1.3132, + "step": 1882 + }, + { + "epoch": 2.8134659382733345, + "grad_norm": 0.19659435749053955, + "learning_rate": 2.9250000000000004e-06, + "loss": 1.2877, + "step": 1883 + }, + { + "epoch": 2.814960078442359, + "grad_norm": 0.2480842024087906, + "learning_rate": 2.9e-06, + "loss": 1.337, + "step": 1884 + }, + { + "epoch": 2.8164542186113835, + "grad_norm": 0.3635180592536926, + "learning_rate": 2.8750000000000004e-06, + "loss": 1.2722, + "step": 1885 + }, + { + "epoch": 2.817948358780408, + "grad_norm": 0.238739475607872, + "learning_rate": 2.8500000000000002e-06, + "loss": 1.4595, + "step": 1886 + }, + { + "epoch": 2.8194424989494324, + "grad_norm": 0.2953251600265503, + "learning_rate": 2.825e-06, + "loss": 1.4326, + "step": 1887 + }, + { + "epoch": 2.8209366391184574, + "grad_norm": 0.17587164044380188, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.3387, + "step": 1888 + }, + { + "epoch": 2.822430779287482, + "grad_norm": 0.2200200855731964, + "learning_rate": 2.775e-06, + "loss": 1.2648, + "step": 1889 + }, + { + "epoch": 2.8239249194565064, + "grad_norm": 0.2534538507461548, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.3252, + "step": 1890 + }, + { + "epoch": 2.8254190596255313, + "grad_norm": 0.23841866850852966, + "learning_rate": 2.725e-06, + "loss": 1.2432, + "step": 1891 + }, + { + "epoch": 2.826913199794556, + "grad_norm": 0.23968037962913513, + "learning_rate": 2.7e-06, + "loss": 1.1769, + "step": 1892 + }, + { + "epoch": 2.8284073399635803, + "grad_norm": 0.3135208189487457, + "learning_rate": 2.6750000000000002e-06, + "loss": 1.1362, + "step": 1893 + }, + { + "epoch": 2.829901480132605, + "grad_norm": 0.1951269954442978, + "learning_rate": 2.65e-06, + "loss": 1.3334, + "step": 1894 + }, + { + "epoch": 2.8313956203016293, + "grad_norm": 0.21999233961105347, + "learning_rate": 2.625e-06, + "loss": 1.348, + "step": 1895 + }, + { + "epoch": 2.832889760470654, + "grad_norm": 0.3923155963420868, + "learning_rate": 2.6e-06, + "loss": 1.2762, + "step": 1896 + }, + { + "epoch": 2.8343839006396787, + "grad_norm": 0.25877097249031067, + "learning_rate": 2.575e-06, + "loss": 1.2909, + "step": 1897 + }, + { + "epoch": 2.835878040808703, + "grad_norm": 0.2079850733280182, + "learning_rate": 2.55e-06, + "loss": 1.2758, + "step": 1898 + }, + { + "epoch": 2.837372180977728, + "grad_norm": 0.1782015711069107, + "learning_rate": 2.5250000000000004e-06, + "loss": 1.4591, + "step": 1899 + }, + { + "epoch": 2.8388663211467526, + "grad_norm": 0.21035560965538025, + "learning_rate": 2.5e-06, + "loss": 1.0855, + "step": 1900 + }, + { + "epoch": 2.840360461315777, + "grad_norm": 0.22053273022174835, + "learning_rate": 2.4750000000000004e-06, + "loss": 1.2851, + "step": 1901 + }, + { + "epoch": 2.841854601484802, + "grad_norm": 0.5123848915100098, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.3428, + "step": 1902 + }, + { + "epoch": 2.8433487416538266, + "grad_norm": 0.2593129873275757, + "learning_rate": 2.425e-06, + "loss": 1.4084, + "step": 1903 + }, + { + "epoch": 2.844842881822851, + "grad_norm": 0.25032708048820496, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.3046, + "step": 1904 + }, + { + "epoch": 2.8463370219918755, + "grad_norm": 0.19986706972122192, + "learning_rate": 2.375e-06, + "loss": 1.3027, + "step": 1905 + }, + { + "epoch": 2.8478311621609, + "grad_norm": 0.2648144066333771, + "learning_rate": 2.35e-06, + "loss": 1.2129, + "step": 1906 + }, + { + "epoch": 2.849325302329925, + "grad_norm": 0.19700418412685394, + "learning_rate": 2.325e-06, + "loss": 1.4243, + "step": 1907 + }, + { + "epoch": 2.8508194424989495, + "grad_norm": 0.2682986259460449, + "learning_rate": 2.3e-06, + "loss": 1.2794, + "step": 1908 + }, + { + "epoch": 2.852313582667974, + "grad_norm": 0.20905745029449463, + "learning_rate": 2.2750000000000002e-06, + "loss": 1.2348, + "step": 1909 + }, + { + "epoch": 2.853807722836999, + "grad_norm": 0.3586556315422058, + "learning_rate": 2.25e-06, + "loss": 1.3826, + "step": 1910 + }, + { + "epoch": 2.8553018630060234, + "grad_norm": 0.1797320693731308, + "learning_rate": 2.225e-06, + "loss": 1.2428, + "step": 1911 + }, + { + "epoch": 2.856796003175048, + "grad_norm": 0.21647144854068756, + "learning_rate": 2.2e-06, + "loss": 1.4246, + "step": 1912 + }, + { + "epoch": 2.8582901433440724, + "grad_norm": 0.42772310972213745, + "learning_rate": 2.175e-06, + "loss": 1.2403, + "step": 1913 + }, + { + "epoch": 2.859784283513097, + "grad_norm": 0.18217770755290985, + "learning_rate": 2.1499999999999997e-06, + "loss": 1.2052, + "step": 1914 + }, + { + "epoch": 2.861278423682122, + "grad_norm": 0.2103743702173233, + "learning_rate": 2.1250000000000004e-06, + "loss": 1.2553, + "step": 1915 + }, + { + "epoch": 2.8627725638511463, + "grad_norm": 0.25712189078330994, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.3833, + "step": 1916 + }, + { + "epoch": 2.864266704020171, + "grad_norm": 0.31101569533348083, + "learning_rate": 2.075e-06, + "loss": 1.3011, + "step": 1917 + }, + { + "epoch": 2.8657608441891957, + "grad_norm": 0.4393167197704315, + "learning_rate": 2.0500000000000003e-06, + "loss": 1.4259, + "step": 1918 + }, + { + "epoch": 2.8672549843582202, + "grad_norm": 0.2310882806777954, + "learning_rate": 2.025e-06, + "loss": 1.2061, + "step": 1919 + }, + { + "epoch": 2.8687491245272447, + "grad_norm": 0.22133032977581024, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.255, + "step": 1920 + }, + { + "epoch": 2.8702432646962692, + "grad_norm": 0.31930020451545715, + "learning_rate": 1.975e-06, + "loss": 1.4847, + "step": 1921 + }, + { + "epoch": 2.8717374048652937, + "grad_norm": 0.1897219717502594, + "learning_rate": 1.95e-06, + "loss": 1.4619, + "step": 1922 + }, + { + "epoch": 2.8732315450343187, + "grad_norm": 0.2287951558828354, + "learning_rate": 1.925e-06, + "loss": 1.1942, + "step": 1923 + }, + { + "epoch": 2.874725685203343, + "grad_norm": 0.1839810609817505, + "learning_rate": 1.9e-06, + "loss": 1.401, + "step": 1924 + }, + { + "epoch": 2.8762198253723676, + "grad_norm": 0.2828529179096222, + "learning_rate": 1.875e-06, + "loss": 1.373, + "step": 1925 + }, + { + "epoch": 2.8777139655413926, + "grad_norm": 0.18922413885593414, + "learning_rate": 1.85e-06, + "loss": 1.322, + "step": 1926 + }, + { + "epoch": 2.879208105710417, + "grad_norm": 0.27389007806777954, + "learning_rate": 1.8249999999999999e-06, + "loss": 1.1936, + "step": 1927 + }, + { + "epoch": 2.8807022458794416, + "grad_norm": 0.16232287883758545, + "learning_rate": 1.8e-06, + "loss": 1.1672, + "step": 1928 + }, + { + "epoch": 2.882196386048466, + "grad_norm": 0.2973114550113678, + "learning_rate": 1.775e-06, + "loss": 1.2634, + "step": 1929 + }, + { + "epoch": 2.8836905262174906, + "grad_norm": 0.357994019985199, + "learning_rate": 1.7500000000000002e-06, + "loss": 1.3833, + "step": 1930 + }, + { + "epoch": 2.8851846663865155, + "grad_norm": 0.1879359632730484, + "learning_rate": 1.7250000000000002e-06, + "loss": 1.2167, + "step": 1931 + }, + { + "epoch": 2.88667880655554, + "grad_norm": 0.24511118233203888, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.3985, + "step": 1932 + }, + { + "epoch": 2.8881729467245645, + "grad_norm": 0.19905032217502594, + "learning_rate": 1.6750000000000003e-06, + "loss": 1.26, + "step": 1933 + }, + { + "epoch": 2.8896670868935894, + "grad_norm": 0.22167952358722687, + "learning_rate": 1.65e-06, + "loss": 1.3348, + "step": 1934 + }, + { + "epoch": 2.891161227062614, + "grad_norm": 0.41242149472236633, + "learning_rate": 1.6250000000000001e-06, + "loss": 1.3367, + "step": 1935 + }, + { + "epoch": 2.8926553672316384, + "grad_norm": 0.26797640323638916, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1586, + "step": 1936 + }, + { + "epoch": 2.894149507400663, + "grad_norm": 0.20665213465690613, + "learning_rate": 1.5750000000000002e-06, + "loss": 1.2999, + "step": 1937 + }, + { + "epoch": 2.8956436475696874, + "grad_norm": 0.2320278286933899, + "learning_rate": 1.55e-06, + "loss": 1.2405, + "step": 1938 + }, + { + "epoch": 2.8971377877387123, + "grad_norm": 0.2435215711593628, + "learning_rate": 1.525e-06, + "loss": 1.4335, + "step": 1939 + }, + { + "epoch": 2.898631927907737, + "grad_norm": 0.202729731798172, + "learning_rate": 1.5e-06, + "loss": 1.2331, + "step": 1940 + }, + { + "epoch": 2.9001260680767613, + "grad_norm": 0.2189522087574005, + "learning_rate": 1.475e-06, + "loss": 1.2774, + "step": 1941 + }, + { + "epoch": 2.9016202082457863, + "grad_norm": 0.19130611419677734, + "learning_rate": 1.45e-06, + "loss": 1.2951, + "step": 1942 + }, + { + "epoch": 2.9031143484148108, + "grad_norm": 0.2006605565547943, + "learning_rate": 1.4250000000000001e-06, + "loss": 1.2765, + "step": 1943 + }, + { + "epoch": 2.9046084885838352, + "grad_norm": 0.7907626032829285, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.353, + "step": 1944 + }, + { + "epoch": 2.9061026287528597, + "grad_norm": 0.19130118191242218, + "learning_rate": 1.3750000000000002e-06, + "loss": 1.2826, + "step": 1945 + }, + { + "epoch": 2.9075967689218842, + "grad_norm": 0.2727353870868683, + "learning_rate": 1.35e-06, + "loss": 1.1702, + "step": 1946 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.28280338644981384, + "learning_rate": 1.325e-06, + "loss": 1.4216, + "step": 1947 + }, + { + "epoch": 2.9105850492599337, + "grad_norm": 0.17979927361011505, + "learning_rate": 1.3e-06, + "loss": 1.3568, + "step": 1948 + }, + { + "epoch": 2.912079189428958, + "grad_norm": 0.2216491550207138, + "learning_rate": 1.275e-06, + "loss": 1.2296, + "step": 1949 + }, + { + "epoch": 2.913573329597983, + "grad_norm": 1.3488661050796509, + "learning_rate": 1.25e-06, + "loss": 1.2513, + "step": 1950 + }, + { + "epoch": 2.9150674697670076, + "grad_norm": 2.417159080505371, + "learning_rate": 1.2250000000000001e-06, + "loss": 1.5253, + "step": 1951 + }, + { + "epoch": 2.916561609936032, + "grad_norm": 0.21374452114105225, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.3452, + "step": 1952 + }, + { + "epoch": 2.9180557501050566, + "grad_norm": 0.20700985193252563, + "learning_rate": 1.175e-06, + "loss": 1.2113, + "step": 1953 + }, + { + "epoch": 2.919549890274081, + "grad_norm": 0.19350235164165497, + "learning_rate": 1.15e-06, + "loss": 1.345, + "step": 1954 + }, + { + "epoch": 2.921044030443106, + "grad_norm": 0.1589677780866623, + "learning_rate": 1.125e-06, + "loss": 1.1717, + "step": 1955 + }, + { + "epoch": 2.9225381706121305, + "grad_norm": 0.22525258362293243, + "learning_rate": 1.1e-06, + "loss": 1.184, + "step": 1956 + }, + { + "epoch": 2.924032310781155, + "grad_norm": 0.17294123768806458, + "learning_rate": 1.0749999999999999e-06, + "loss": 1.2835, + "step": 1957 + }, + { + "epoch": 2.92552645095018, + "grad_norm": 0.21778512001037598, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3665, + "step": 1958 + }, + { + "epoch": 2.9270205911192044, + "grad_norm": 0.17190255224704742, + "learning_rate": 1.0250000000000001e-06, + "loss": 1.2313, + "step": 1959 + }, + { + "epoch": 2.928514731288229, + "grad_norm": 0.20868921279907227, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.2845, + "step": 1960 + }, + { + "epoch": 2.930008871457254, + "grad_norm": 0.19007806479930878, + "learning_rate": 9.75e-07, + "loss": 1.1979, + "step": 1961 + }, + { + "epoch": 2.9315030116262784, + "grad_norm": 0.48530903458595276, + "learning_rate": 9.5e-07, + "loss": 1.3446, + "step": 1962 + }, + { + "epoch": 2.932997151795303, + "grad_norm": 0.3450883626937866, + "learning_rate": 9.25e-07, + "loss": 1.3834, + "step": 1963 + }, + { + "epoch": 2.9344912919643273, + "grad_norm": 0.23902513086795807, + "learning_rate": 9e-07, + "loss": 1.286, + "step": 1964 + }, + { + "epoch": 2.935985432133352, + "grad_norm": 0.44902661442756653, + "learning_rate": 8.750000000000001e-07, + "loss": 1.2347, + "step": 1965 + }, + { + "epoch": 2.9374795723023768, + "grad_norm": 0.24165555834770203, + "learning_rate": 8.500000000000001e-07, + "loss": 1.254, + "step": 1966 + }, + { + "epoch": 2.9389737124714013, + "grad_norm": 0.20330724120140076, + "learning_rate": 8.25e-07, + "loss": 1.4177, + "step": 1967 + }, + { + "epoch": 2.9404678526404258, + "grad_norm": 0.25209009647369385, + "learning_rate": 8.000000000000001e-07, + "loss": 1.4941, + "step": 1968 + }, + { + "epoch": 2.9419619928094507, + "grad_norm": 0.240138441324234, + "learning_rate": 7.75e-07, + "loss": 1.3458, + "step": 1969 + }, + { + "epoch": 2.943456132978475, + "grad_norm": 0.34439221024513245, + "learning_rate": 7.5e-07, + "loss": 1.4309, + "step": 1970 + }, + { + "epoch": 2.9449502731474997, + "grad_norm": 0.22321772575378418, + "learning_rate": 7.25e-07, + "loss": 1.4066, + "step": 1971 + }, + { + "epoch": 2.946444413316524, + "grad_norm": 0.2187803089618683, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3323, + "step": 1972 + }, + { + "epoch": 2.9479385534855487, + "grad_norm": 0.2270226627588272, + "learning_rate": 6.75e-07, + "loss": 1.243, + "step": 1973 + }, + { + "epoch": 2.9494326936545736, + "grad_norm": 0.20585761964321136, + "learning_rate": 6.5e-07, + "loss": 1.2544, + "step": 1974 + }, + { + "epoch": 2.950926833823598, + "grad_norm": 0.18699559569358826, + "learning_rate": 6.25e-07, + "loss": 1.1969, + "step": 1975 + }, + { + "epoch": 2.9524209739926226, + "grad_norm": 0.5145549178123474, + "learning_rate": 6.000000000000001e-07, + "loss": 1.1359, + "step": 1976 + }, + { + "epoch": 2.9539151141616475, + "grad_norm": 0.23603378236293793, + "learning_rate": 5.75e-07, + "loss": 1.4074, + "step": 1977 + }, + { + "epoch": 2.955409254330672, + "grad_norm": 0.24418418109416962, + "learning_rate": 5.5e-07, + "loss": 1.4617, + "step": 1978 + }, + { + "epoch": 2.9569033944996965, + "grad_norm": 0.2115854173898697, + "learning_rate": 5.250000000000001e-07, + "loss": 1.3637, + "step": 1979 + }, + { + "epoch": 2.958397534668721, + "grad_norm": 0.2542995810508728, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2526, + "step": 1980 + }, + { + "epoch": 2.9598916748377455, + "grad_norm": 0.20433180034160614, + "learning_rate": 4.75e-07, + "loss": 1.2907, + "step": 1981 + }, + { + "epoch": 2.9613858150067704, + "grad_norm": 0.19650673866271973, + "learning_rate": 4.5e-07, + "loss": 1.2711, + "step": 1982 + }, + { + "epoch": 2.962879955175795, + "grad_norm": 0.4737726151943207, + "learning_rate": 4.2500000000000006e-07, + "loss": 1.2896, + "step": 1983 + }, + { + "epoch": 2.9643740953448194, + "grad_norm": 0.23286136984825134, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.219, + "step": 1984 + }, + { + "epoch": 2.9658682355138444, + "grad_norm": 0.17285366356372833, + "learning_rate": 3.75e-07, + "loss": 1.2118, + "step": 1985 + }, + { + "epoch": 2.967362375682869, + "grad_norm": 0.15881465375423431, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4014, + "step": 1986 + }, + { + "epoch": 2.9688565158518934, + "grad_norm": 0.19492295384407043, + "learning_rate": 3.25e-07, + "loss": 1.2168, + "step": 1987 + }, + { + "epoch": 2.970350656020918, + "grad_norm": 0.22486300766468048, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4393, + "step": 1988 + }, + { + "epoch": 2.9718447961899424, + "grad_norm": 0.24522030353546143, + "learning_rate": 2.75e-07, + "loss": 1.3018, + "step": 1989 + }, + { + "epoch": 2.9733389363589673, + "grad_norm": 0.34732580184936523, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.3166, + "step": 1990 + }, + { + "epoch": 2.974833076527992, + "grad_norm": 0.22350691258907318, + "learning_rate": 2.25e-07, + "loss": 1.3465, + "step": 1991 + }, + { + "epoch": 2.9763272166970163, + "grad_norm": 0.20143184065818787, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.2512, + "step": 1992 + }, + { + "epoch": 2.977821356866041, + "grad_norm": 0.31710419058799744, + "learning_rate": 1.7500000000000002e-07, + "loss": 1.1942, + "step": 1993 + }, + { + "epoch": 2.9793154970350657, + "grad_norm": 0.60710209608078, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.2493, + "step": 1994 + }, + { + "epoch": 2.98080963720409, + "grad_norm": 0.27234354615211487, + "learning_rate": 1.2500000000000002e-07, + "loss": 1.2728, + "step": 1995 + }, + { + "epoch": 2.9823037773731147, + "grad_norm": 0.2213677167892456, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.3398, + "step": 1996 + }, + { + "epoch": 2.983797917542139, + "grad_norm": 0.187736377120018, + "learning_rate": 7.500000000000001e-08, + "loss": 1.3615, + "step": 1997 + }, + { + "epoch": 2.985292057711164, + "grad_norm": 0.1614825576543808, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.1303, + "step": 1998 + }, + { + "epoch": 2.9867861978801886, + "grad_norm": 0.18150056898593903, + "learning_rate": 2.5000000000000002e-08, + "loss": 1.2191, + "step": 1999 + }, + { + "epoch": 2.988280338049213, + "grad_norm": 0.38465365767478943, + "learning_rate": 0.0, + "loss": 1.3653, + "step": 2000 + }, + { + "epoch": 2.988280338049213, + "step": 2000, + "total_flos": 3.3909236563968e+16, + "train_loss": 1.523793903529644, + "train_runtime": 5754.2327, + "train_samples_per_second": 11.122, + "train_steps_per_second": 0.348 + } + ], + "logging_steps": 1.0, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.3909236563968e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}