diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4501 @@ +{ + "best_metric": 60.0, + "best_model_checkpoint": "/data/users/bking2/tod_zero/outputs/runs/finetune/starcoder_3b/supervised/eiqracdg/checkpoint-28800", + "epoch": 0.8109888995894369, + "eval_steps": 3200, + "global_step": 32000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.92511785030365, + "learning_rate": 2.5e-06, + "loss": 1.186, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 1.4528826475143433, + "learning_rate": 5e-06, + "loss": 0.287, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 1.285142421722412, + "learning_rate": 4.9999696912850374e-06, + "loss": 0.2339, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 1.2731701135635376, + "learning_rate": 4.999878765875043e-06, + "loss": 0.2247, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 1.5167465209960938, + "learning_rate": 4.999727225974682e-06, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 1.0947519540786743, + "learning_rate": 4.999515075258341e-06, + "loss": 0.2034, + "step": 300 + }, + { + "epoch": 0.01, + "grad_norm": 1.1314687728881836, + "learning_rate": 4.999242318870029e-06, + "loss": 0.1977, + "step": 350 + }, + { + "epoch": 0.01, + "grad_norm": 1.172371506690979, + "learning_rate": 4.998908963423264e-06, + "loss": 0.1924, + "step": 400 + }, + { + "epoch": 0.01, + "grad_norm": 1.348120093345642, + "learning_rate": 4.998515017000907e-06, + "loss": 0.189, + "step": 450 + }, + { + "epoch": 0.01, + "grad_norm": 1.0465130805969238, + "learning_rate": 4.998060489154965e-06, + "loss": 0.1863, + "step": 500 + }, + { + "epoch": 0.01, + "grad_norm": 1.0949101448059082, + "learning_rate": 4.997545390906362e-06, + "loss": 0.1894, + "step": 550 + }, + { + "epoch": 0.02, + "grad_norm": 1.2342430353164673, + "learning_rate": 4.996969734744671e-06, + "loss": 0.1842, + "step": 600 + }, + { + "epoch": 0.02, + "grad_norm": 1.2161908149719238, + "learning_rate": 4.99633353462781e-06, + "loss": 0.1885, + "step": 650 + }, + { + "epoch": 0.02, + "grad_norm": 1.237592101097107, + "learning_rate": 4.995636805981707e-06, + "loss": 0.188, + "step": 700 + }, + { + "epoch": 0.02, + "grad_norm": 1.0181180238723755, + "learning_rate": 4.99487956569992e-06, + "loss": 0.1807, + "step": 750 + }, + { + "epoch": 0.02, + "grad_norm": 1.0869165658950806, + "learning_rate": 4.994061832143235e-06, + "loss": 0.1859, + "step": 800 + }, + { + "epoch": 0.02, + "grad_norm": 1.115837574005127, + "learning_rate": 4.993183625139212e-06, + "loss": 0.1733, + "step": 850 + }, + { + "epoch": 0.02, + "grad_norm": 0.9784592986106873, + "learning_rate": 4.992244965981714e-06, + "loss": 0.1781, + "step": 900 + }, + { + "epoch": 0.02, + "grad_norm": 1.1208012104034424, + "learning_rate": 4.991245877430382e-06, + "loss": 0.1793, + "step": 950 + }, + { + "epoch": 0.03, + "grad_norm": 1.1315056085586548, + "learning_rate": 4.990186383710089e-06, + "loss": 0.1737, + "step": 1000 + }, + { + "epoch": 0.03, + "grad_norm": 0.9561248421669006, + "learning_rate": 4.9890665105103484e-06, + "loss": 0.1755, + "step": 1050 + }, + { + "epoch": 0.03, + "grad_norm": 1.0493712425231934, + "learning_rate": 4.987886284984695e-06, + "loss": 0.1733, + "step": 1100 + }, + { + "epoch": 0.03, + "grad_norm": 1.0467311143875122, + "learning_rate": 4.986645735750025e-06, + "loss": 0.1742, + "step": 1150 + }, + { + "epoch": 0.03, + "grad_norm": 0.915791928768158, + "learning_rate": 4.985344892885899e-06, + "loss": 0.1722, + "step": 1200 + }, + { + "epoch": 0.03, + "grad_norm": 1.0208431482315063, + "learning_rate": 4.98398378793382e-06, + "loss": 0.1722, + "step": 1250 + }, + { + "epoch": 0.03, + "grad_norm": 1.0767868757247925, + "learning_rate": 4.982562453896458e-06, + "loss": 0.1726, + "step": 1300 + }, + { + "epoch": 0.03, + "grad_norm": 0.9579061269760132, + "learning_rate": 4.9810809252368615e-06, + "loss": 0.1688, + "step": 1350 + }, + { + "epoch": 0.04, + "grad_norm": 0.9485280513763428, + "learning_rate": 4.979539237877615e-06, + "loss": 0.1614, + "step": 1400 + }, + { + "epoch": 0.04, + "grad_norm": 0.9779006838798523, + "learning_rate": 4.977937429199968e-06, + "loss": 0.1694, + "step": 1450 + }, + { + "epoch": 0.04, + "grad_norm": 0.9930433034896851, + "learning_rate": 4.976275538042932e-06, + "loss": 0.1694, + "step": 1500 + }, + { + "epoch": 0.04, + "grad_norm": 1.1304377317428589, + "learning_rate": 4.974553604702332e-06, + "loss": 0.1697, + "step": 1550 + }, + { + "epoch": 0.04, + "grad_norm": 0.9419906139373779, + "learning_rate": 4.972771670929841e-06, + "loss": 0.1678, + "step": 1600 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315632581710815, + "learning_rate": 4.970929779931955e-06, + "loss": 0.1735, + "step": 1650 + }, + { + "epoch": 0.04, + "grad_norm": 1.015580415725708, + "learning_rate": 4.969027976368954e-06, + "loss": 0.1706, + "step": 1700 + }, + { + "epoch": 0.04, + "grad_norm": 1.059506893157959, + "learning_rate": 4.967066306353816e-06, + "loss": 0.1729, + "step": 1750 + }, + { + "epoch": 0.05, + "grad_norm": 0.8472514152526855, + "learning_rate": 4.9650448174510986e-06, + "loss": 0.1718, + "step": 1800 + }, + { + "epoch": 0.05, + "grad_norm": 1.6632156372070312, + "learning_rate": 4.9629635586757865e-06, + "loss": 0.1667, + "step": 1850 + }, + { + "epoch": 0.05, + "grad_norm": 1.0739672183990479, + "learning_rate": 4.960822580492103e-06, + "loss": 0.1654, + "step": 1900 + }, + { + "epoch": 0.05, + "grad_norm": 0.9874502420425415, + "learning_rate": 4.958621934812286e-06, + "loss": 0.1641, + "step": 1950 + }, + { + "epoch": 0.05, + "grad_norm": 0.9233792424201965, + "learning_rate": 4.95636167499533e-06, + "loss": 0.1648, + "step": 2000 + }, + { + "epoch": 0.05, + "grad_norm": 0.8490021228790283, + "learning_rate": 4.9540418558456915e-06, + "loss": 0.1688, + "step": 2050 + }, + { + "epoch": 0.05, + "grad_norm": 0.9746788144111633, + "learning_rate": 4.951662533611959e-06, + "loss": 0.169, + "step": 2100 + }, + { + "epoch": 0.05, + "grad_norm": 0.8029181957244873, + "learning_rate": 4.9492237659854946e-06, + "loss": 0.1645, + "step": 2150 + }, + { + "epoch": 0.06, + "grad_norm": 1.1753889322280884, + "learning_rate": 4.9467256120990255e-06, + "loss": 0.1692, + "step": 2200 + }, + { + "epoch": 0.06, + "grad_norm": 0.8767470717430115, + "learning_rate": 4.9441681325252215e-06, + "loss": 0.1617, + "step": 2250 + }, + { + "epoch": 0.06, + "grad_norm": 0.9515625238418579, + "learning_rate": 4.941551389275217e-06, + "loss": 0.1599, + "step": 2300 + }, + { + "epoch": 0.06, + "grad_norm": 0.9250404238700867, + "learning_rate": 4.938875445797112e-06, + "loss": 0.1678, + "step": 2350 + }, + { + "epoch": 0.06, + "grad_norm": 0.9468141794204712, + "learning_rate": 4.936140366974434e-06, + "loss": 0.1665, + "step": 2400 + }, + { + "epoch": 0.06, + "grad_norm": 0.9202454090118408, + "learning_rate": 4.933346219124562e-06, + "loss": 0.1579, + "step": 2450 + }, + { + "epoch": 0.06, + "grad_norm": 0.9553722739219666, + "learning_rate": 4.93049306999712e-06, + "loss": 0.1706, + "step": 2500 + }, + { + "epoch": 0.06, + "grad_norm": 0.8615751266479492, + "learning_rate": 4.927580988772336e-06, + "loss": 0.168, + "step": 2550 + }, + { + "epoch": 0.07, + "grad_norm": 0.9888120293617249, + "learning_rate": 4.9246100460593606e-06, + "loss": 0.1688, + "step": 2600 + }, + { + "epoch": 0.07, + "grad_norm": 0.915663480758667, + "learning_rate": 4.92158031389456e-06, + "loss": 0.1615, + "step": 2650 + }, + { + "epoch": 0.07, + "grad_norm": 0.9765244722366333, + "learning_rate": 4.918491865739763e-06, + "loss": 0.1571, + "step": 2700 + }, + { + "epoch": 0.07, + "grad_norm": 0.847814679145813, + "learning_rate": 4.915344776480487e-06, + "loss": 0.1602, + "step": 2750 + }, + { + "epoch": 0.07, + "grad_norm": 0.9848460555076599, + "learning_rate": 4.912139122424118e-06, + "loss": 0.1634, + "step": 2800 + }, + { + "epoch": 0.07, + "grad_norm": 0.8652825355529785, + "learning_rate": 4.908874981298058e-06, + "loss": 0.1594, + "step": 2850 + }, + { + "epoch": 0.07, + "grad_norm": 0.9553362131118774, + "learning_rate": 4.9055524322478456e-06, + "loss": 0.1681, + "step": 2900 + }, + { + "epoch": 0.07, + "grad_norm": 0.8901177644729614, + "learning_rate": 4.902171555835236e-06, + "loss": 0.1562, + "step": 2950 + }, + { + "epoch": 0.08, + "grad_norm": 1.047637701034546, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.1601, + "step": 3000 + }, + { + "epoch": 0.08, + "grad_norm": 0.8235883116722107, + "learning_rate": 4.895235150239159e-06, + "loss": 0.1666, + "step": 3050 + }, + { + "epoch": 0.08, + "grad_norm": 1.1097720861434937, + "learning_rate": 4.891679789242524e-06, + "loss": 0.1584, + "step": 3100 + }, + { + "epoch": 0.08, + "grad_norm": 0.8984447121620178, + "learning_rate": 4.8880664372530765e-06, + "loss": 0.1612, + "step": 3150 + }, + { + "epoch": 0.08, + "grad_norm": 0.9215020537376404, + "learning_rate": 4.884395181883661e-06, + "loss": 0.1606, + "step": 3200 + }, + { + "epoch": 0.08, + "grad_norm": 0.8073440790176392, + "learning_rate": 4.880666112151104e-06, + "loss": 0.1624, + "step": 3250 + }, + { + "epoch": 0.08, + "grad_norm": 0.9907032251358032, + "learning_rate": 4.876879318474056e-06, + "loss": 0.1526, + "step": 3300 + }, + { + "epoch": 0.08, + "grad_norm": 1.0674902200698853, + "learning_rate": 4.873034892670795e-06, + "loss": 0.1628, + "step": 3350 + }, + { + "epoch": 0.09, + "grad_norm": 0.886870801448822, + "learning_rate": 4.869132927957007e-06, + "loss": 0.1597, + "step": 3400 + }, + { + "epoch": 0.09, + "grad_norm": 0.8470545411109924, + "learning_rate": 4.8651735189435205e-06, + "loss": 0.155, + "step": 3450 + }, + { + "epoch": 0.09, + "grad_norm": 0.8206085562705994, + "learning_rate": 4.861156761634014e-06, + "loss": 0.1607, + "step": 3500 + }, + { + "epoch": 0.09, + "grad_norm": 0.9522375464439392, + "learning_rate": 4.857082753422691e-06, + "loss": 0.1622, + "step": 3550 + }, + { + "epoch": 0.09, + "grad_norm": 0.9393265843391418, + "learning_rate": 4.852951593091914e-06, + "loss": 0.1574, + "step": 3600 + }, + { + "epoch": 0.09, + "grad_norm": 0.89833003282547, + "learning_rate": 4.848763380809811e-06, + "loss": 0.1627, + "step": 3650 + }, + { + "epoch": 0.09, + "grad_norm": 0.8664096593856812, + "learning_rate": 4.844518218127849e-06, + "loss": 0.1569, + "step": 3700 + }, + { + "epoch": 0.1, + "grad_norm": 0.8381832838058472, + "learning_rate": 4.840216207978368e-06, + "loss": 0.1585, + "step": 3750 + }, + { + "epoch": 0.1, + "grad_norm": 0.7903485298156738, + "learning_rate": 4.835857454672087e-06, + "loss": 0.1591, + "step": 3800 + }, + { + "epoch": 0.1, + "grad_norm": 0.9844456315040588, + "learning_rate": 4.831442063895575e-06, + "loss": 0.1539, + "step": 3850 + }, + { + "epoch": 0.1, + "grad_norm": 0.8709468245506287, + "learning_rate": 4.8269701427086905e-06, + "loss": 0.1588, + "step": 3900 + }, + { + "epoch": 0.1, + "grad_norm": 0.9181196093559265, + "learning_rate": 4.822441799541979e-06, + "loss": 0.1569, + "step": 3950 + }, + { + "epoch": 0.1, + "grad_norm": 0.868691623210907, + "learning_rate": 4.8178571441940515e-06, + "loss": 0.1554, + "step": 4000 + }, + { + "epoch": 0.1, + "grad_norm": 0.9550264477729797, + "learning_rate": 4.813216287828917e-06, + "loss": 0.1595, + "step": 4050 + }, + { + "epoch": 0.1, + "grad_norm": 0.9788670539855957, + "learning_rate": 4.808519342973289e-06, + "loss": 0.158, + "step": 4100 + }, + { + "epoch": 0.11, + "grad_norm": 0.9521192908287048, + "learning_rate": 4.80376642351386e-06, + "loss": 0.1541, + "step": 4150 + }, + { + "epoch": 0.11, + "grad_norm": 0.9816685318946838, + "learning_rate": 4.798957644694533e-06, + "loss": 0.1555, + "step": 4200 + }, + { + "epoch": 0.11, + "grad_norm": 0.8248597383499146, + "learning_rate": 4.794093123113635e-06, + "loss": 0.1575, + "step": 4250 + }, + { + "epoch": 0.11, + "grad_norm": 0.8728579878807068, + "learning_rate": 4.789172976721082e-06, + "loss": 0.1582, + "step": 4300 + }, + { + "epoch": 0.11, + "grad_norm": 0.8118800520896912, + "learning_rate": 4.7841973248155275e-06, + "loss": 0.1606, + "step": 4350 + }, + { + "epoch": 0.11, + "grad_norm": 0.8186320066452026, + "learning_rate": 4.779166288041463e-06, + "loss": 0.1515, + "step": 4400 + }, + { + "epoch": 0.11, + "grad_norm": 0.8599525690078735, + "learning_rate": 4.7740799883862966e-06, + "loss": 0.1604, + "step": 4450 + }, + { + "epoch": 0.11, + "grad_norm": 0.9356303811073303, + "learning_rate": 4.7689385491773934e-06, + "loss": 0.155, + "step": 4500 + }, + { + "epoch": 0.12, + "grad_norm": 0.9057179689407349, + "learning_rate": 4.7637420950790855e-06, + "loss": 0.1561, + "step": 4550 + }, + { + "epoch": 0.12, + "grad_norm": 0.953428328037262, + "learning_rate": 4.75849075208965e-06, + "loss": 0.1624, + "step": 4600 + }, + { + "epoch": 0.12, + "grad_norm": 0.8495259284973145, + "learning_rate": 4.7531846475382526e-06, + "loss": 0.1583, + "step": 4650 + }, + { + "epoch": 0.12, + "grad_norm": 0.8086969256401062, + "learning_rate": 4.7478239100818626e-06, + "loss": 0.1514, + "step": 4700 + }, + { + "epoch": 0.12, + "grad_norm": 0.8309988975524902, + "learning_rate": 4.742408669702131e-06, + "loss": 0.1586, + "step": 4750 + }, + { + "epoch": 0.12, + "grad_norm": 0.8857250809669495, + "learning_rate": 4.736939057702239e-06, + "loss": 0.1534, + "step": 4800 + }, + { + "epoch": 0.12, + "grad_norm": 0.9398581981658936, + "learning_rate": 4.731415206703714e-06, + "loss": 0.1522, + "step": 4850 + }, + { + "epoch": 0.12, + "grad_norm": 0.8459595441818237, + "learning_rate": 4.725837250643218e-06, + "loss": 0.1488, + "step": 4900 + }, + { + "epoch": 0.13, + "grad_norm": 0.9744529724121094, + "learning_rate": 4.720205324769296e-06, + "loss": 0.153, + "step": 4950 + }, + { + "epoch": 0.13, + "grad_norm": 0.8761635422706604, + "learning_rate": 4.714519565639095e-06, + "loss": 0.1558, + "step": 5000 + }, + { + "epoch": 0.13, + "grad_norm": 0.7558119297027588, + "learning_rate": 4.708780111115058e-06, + "loss": 0.1563, + "step": 5050 + }, + { + "epoch": 0.13, + "grad_norm": 0.9466744661331177, + "learning_rate": 4.702987100361578e-06, + "loss": 0.1578, + "step": 5100 + }, + { + "epoch": 0.13, + "grad_norm": 0.9422277808189392, + "learning_rate": 4.697140673841624e-06, + "loss": 0.147, + "step": 5150 + }, + { + "epoch": 0.13, + "grad_norm": 0.7488685846328735, + "learning_rate": 4.6912409733133365e-06, + "loss": 0.1535, + "step": 5200 + }, + { + "epoch": 0.13, + "grad_norm": 0.8875038027763367, + "learning_rate": 4.685288141826589e-06, + "loss": 0.1491, + "step": 5250 + }, + { + "epoch": 0.13, + "grad_norm": 1.1153545379638672, + "learning_rate": 4.679282323719519e-06, + "loss": 0.1523, + "step": 5300 + }, + { + "epoch": 0.14, + "grad_norm": 0.9965418577194214, + "learning_rate": 4.67322366461503e-06, + "loss": 0.1499, + "step": 5350 + }, + { + "epoch": 0.14, + "grad_norm": 0.7546530961990356, + "learning_rate": 4.66711231141726e-06, + "loss": 0.1517, + "step": 5400 + }, + { + "epoch": 0.14, + "grad_norm": 0.8970305323600769, + "learning_rate": 4.660948412308018e-06, + "loss": 0.1546, + "step": 5450 + }, + { + "epoch": 0.14, + "grad_norm": 1.0205148458480835, + "learning_rate": 4.654732116743193e-06, + "loss": 0.1532, + "step": 5500 + }, + { + "epoch": 0.14, + "grad_norm": 0.8177210092544556, + "learning_rate": 4.64846357544913e-06, + "loss": 0.1437, + "step": 5550 + }, + { + "epoch": 0.14, + "grad_norm": 1.0768853425979614, + "learning_rate": 4.642142940418973e-06, + "loss": 0.1499, + "step": 5600 + }, + { + "epoch": 0.14, + "grad_norm": 0.8934694528579712, + "learning_rate": 4.635770364908984e-06, + "loss": 0.1538, + "step": 5650 + }, + { + "epoch": 0.14, + "grad_norm": 0.9092695713043213, + "learning_rate": 4.629346003434822e-06, + "loss": 0.1507, + "step": 5700 + }, + { + "epoch": 0.15, + "grad_norm": 0.8804992437362671, + "learning_rate": 4.622870011767798e-06, + "loss": 0.1494, + "step": 5750 + }, + { + "epoch": 0.15, + "grad_norm": 0.7924209237098694, + "learning_rate": 4.616342546931103e-06, + "loss": 0.155, + "step": 5800 + }, + { + "epoch": 0.15, + "grad_norm": 0.8898280262947083, + "learning_rate": 4.609763767195991e-06, + "loss": 0.1511, + "step": 5850 + }, + { + "epoch": 0.15, + "grad_norm": 0.8630085587501526, + "learning_rate": 4.603133832077953e-06, + "loss": 0.1486, + "step": 5900 + }, + { + "epoch": 0.15, + "grad_norm": 0.7191097140312195, + "learning_rate": 4.596452902332839e-06, + "loss": 0.1497, + "step": 5950 + }, + { + "epoch": 0.15, + "grad_norm": 0.815141499042511, + "learning_rate": 4.589721139952964e-06, + "loss": 0.1503, + "step": 6000 + }, + { + "epoch": 0.15, + "grad_norm": 0.8735461831092834, + "learning_rate": 4.582938708163183e-06, + "loss": 0.1532, + "step": 6050 + }, + { + "epoch": 0.15, + "grad_norm": 0.851352334022522, + "learning_rate": 4.576105771416928e-06, + "loss": 0.1527, + "step": 6100 + }, + { + "epoch": 0.16, + "grad_norm": 0.7596533298492432, + "learning_rate": 4.569222495392227e-06, + "loss": 0.153, + "step": 6150 + }, + { + "epoch": 0.16, + "grad_norm": 0.8966355919837952, + "learning_rate": 4.562289046987679e-06, + "loss": 0.1493, + "step": 6200 + }, + { + "epoch": 0.16, + "grad_norm": 0.8785326480865479, + "learning_rate": 4.555305594318414e-06, + "loss": 0.1458, + "step": 6250 + }, + { + "epoch": 0.16, + "grad_norm": 0.8890253305435181, + "learning_rate": 4.548272306712013e-06, + "loss": 0.1495, + "step": 6300 + }, + { + "epoch": 0.16, + "grad_norm": 0.8389493823051453, + "learning_rate": 4.541189354704403e-06, + "loss": 0.1554, + "step": 6350 + }, + { + "epoch": 0.16, + "grad_norm": 0.8545738458633423, + "learning_rate": 4.534056910035724e-06, + "loss": 0.1512, + "step": 6400 + }, + { + "epoch": 0.16, + "grad_norm": 0.859368085861206, + "learning_rate": 4.5268751456461605e-06, + "loss": 0.148, + "step": 6450 + }, + { + "epoch": 0.16, + "grad_norm": 0.9073702692985535, + "learning_rate": 4.5196442356717526e-06, + "loss": 0.1522, + "step": 6500 + }, + { + "epoch": 0.17, + "grad_norm": 0.8107109069824219, + "learning_rate": 4.512364355440172e-06, + "loss": 0.1506, + "step": 6550 + }, + { + "epoch": 0.17, + "grad_norm": 0.6515960693359375, + "learning_rate": 4.505035681466472e-06, + "loss": 0.1459, + "step": 6600 + }, + { + "epoch": 0.17, + "grad_norm": 0.8400324583053589, + "learning_rate": 4.497658391448803e-06, + "loss": 0.1486, + "step": 6650 + }, + { + "epoch": 0.17, + "grad_norm": 0.8535892367362976, + "learning_rate": 4.49023266426411e-06, + "loss": 0.1533, + "step": 6700 + }, + { + "epoch": 0.17, + "grad_norm": 0.8867340087890625, + "learning_rate": 4.482758679963792e-06, + "loss": 0.1546, + "step": 6750 + }, + { + "epoch": 0.17, + "grad_norm": 0.8694542050361633, + "learning_rate": 4.475236619769336e-06, + "loss": 0.1462, + "step": 6800 + }, + { + "epoch": 0.17, + "grad_norm": 0.8599735498428345, + "learning_rate": 4.4676666660679265e-06, + "loss": 0.1472, + "step": 6850 + }, + { + "epoch": 0.17, + "grad_norm": 0.8738076686859131, + "learning_rate": 4.460049002408018e-06, + "loss": 0.1521, + "step": 6900 + }, + { + "epoch": 0.18, + "grad_norm": 0.9029892683029175, + "learning_rate": 4.452383813494887e-06, + "loss": 0.1534, + "step": 6950 + }, + { + "epoch": 0.18, + "grad_norm": 0.828575849533081, + "learning_rate": 4.444671285186155e-06, + "loss": 0.1487, + "step": 7000 + }, + { + "epoch": 0.18, + "grad_norm": 0.7742441892623901, + "learning_rate": 4.4369116044872786e-06, + "loss": 0.1449, + "step": 7050 + }, + { + "epoch": 0.18, + "grad_norm": 0.8644819855690002, + "learning_rate": 4.42910495954702e-06, + "loss": 0.1532, + "step": 7100 + }, + { + "epoch": 0.18, + "grad_norm": 0.955912172794342, + "learning_rate": 4.421251539652879e-06, + "loss": 0.1479, + "step": 7150 + }, + { + "epoch": 0.18, + "grad_norm": 0.8100504875183105, + "learning_rate": 4.413351535226507e-06, + "loss": 0.1515, + "step": 7200 + }, + { + "epoch": 0.18, + "grad_norm": 0.8034056425094604, + "learning_rate": 4.4054051378190915e-06, + "loss": 0.1509, + "step": 7250 + }, + { + "epoch": 0.19, + "grad_norm": 1.0016452074050903, + "learning_rate": 4.397412540106707e-06, + "loss": 0.1477, + "step": 7300 + }, + { + "epoch": 0.19, + "grad_norm": 0.7266576290130615, + "learning_rate": 4.3893739358856465e-06, + "loss": 0.1382, + "step": 7350 + }, + { + "epoch": 0.19, + "grad_norm": 0.8133652210235596, + "learning_rate": 4.38128952006772e-06, + "loss": 0.148, + "step": 7400 + }, + { + "epoch": 0.19, + "grad_norm": 0.8336594104766846, + "learning_rate": 4.373159488675533e-06, + "loss": 0.1487, + "step": 7450 + }, + { + "epoch": 0.19, + "grad_norm": 0.7953880429267883, + "learning_rate": 4.364984038837727e-06, + "loss": 0.1471, + "step": 7500 + }, + { + "epoch": 0.19, + "grad_norm": 0.8772010803222656, + "learning_rate": 4.356763368784207e-06, + "loss": 0.1509, + "step": 7550 + }, + { + "epoch": 0.19, + "grad_norm": 0.8125504851341248, + "learning_rate": 4.348497677841328e-06, + "loss": 0.1477, + "step": 7600 + }, + { + "epoch": 0.19, + "grad_norm": 0.8239701390266418, + "learning_rate": 4.340187166427067e-06, + "loss": 0.1439, + "step": 7650 + }, + { + "epoch": 0.2, + "grad_norm": 0.8961694240570068, + "learning_rate": 4.331832036046162e-06, + "loss": 0.1484, + "step": 7700 + }, + { + "epoch": 0.2, + "grad_norm": 0.8788717985153198, + "learning_rate": 4.323432489285223e-06, + "loss": 0.1459, + "step": 7750 + }, + { + "epoch": 0.2, + "grad_norm": 0.8486537337303162, + "learning_rate": 4.3149887298078275e-06, + "loss": 0.1457, + "step": 7800 + }, + { + "epoch": 0.2, + "grad_norm": 0.914941132068634, + "learning_rate": 4.306500962349573e-06, + "loss": 0.144, + "step": 7850 + }, + { + "epoch": 0.2, + "grad_norm": 0.8333587050437927, + "learning_rate": 4.2979693927131205e-06, + "loss": 0.1434, + "step": 7900 + }, + { + "epoch": 0.2, + "grad_norm": 0.8071913719177246, + "learning_rate": 4.289394227763199e-06, + "loss": 0.1498, + "step": 7950 + }, + { + "epoch": 0.2, + "grad_norm": 0.7681615948677063, + "learning_rate": 4.2807756754215926e-06, + "loss": 0.1461, + "step": 8000 + }, + { + "epoch": 0.2, + "grad_norm": 0.8993908762931824, + "learning_rate": 4.272113944662099e-06, + "loss": 0.146, + "step": 8050 + }, + { + "epoch": 0.21, + "grad_norm": 0.8620333671569824, + "learning_rate": 4.263409245505461e-06, + "loss": 0.1444, + "step": 8100 + }, + { + "epoch": 0.21, + "grad_norm": 0.8208089470863342, + "learning_rate": 4.254661789014274e-06, + "loss": 0.1415, + "step": 8150 + }, + { + "epoch": 0.21, + "grad_norm": 0.7570805549621582, + "learning_rate": 4.2458717872878715e-06, + "loss": 0.1406, + "step": 8200 + }, + { + "epoch": 0.21, + "grad_norm": 0.8517532348632812, + "learning_rate": 4.237039453457179e-06, + "loss": 0.1439, + "step": 8250 + }, + { + "epoch": 0.21, + "grad_norm": 0.7504327893257141, + "learning_rate": 4.228165001679547e-06, + "loss": 0.1408, + "step": 8300 + }, + { + "epoch": 0.21, + "grad_norm": 0.7522333264350891, + "learning_rate": 4.219248647133559e-06, + "loss": 0.1434, + "step": 8350 + }, + { + "epoch": 0.21, + "grad_norm": 0.9496443271636963, + "learning_rate": 4.210290606013813e-06, + "loss": 0.1472, + "step": 8400 + }, + { + "epoch": 0.21, + "grad_norm": 1.0091819763183594, + "learning_rate": 4.2012910955256825e-06, + "loss": 0.1502, + "step": 8450 + }, + { + "epoch": 0.22, + "grad_norm": 0.9098952412605286, + "learning_rate": 4.192250333880045e-06, + "loss": 0.1488, + "step": 8500 + }, + { + "epoch": 0.22, + "grad_norm": 0.6374802589416504, + "learning_rate": 4.183168540287995e-06, + "loss": 0.1443, + "step": 8550 + }, + { + "epoch": 0.22, + "grad_norm": 0.7613967657089233, + "learning_rate": 4.174045934955527e-06, + "loss": 0.149, + "step": 8600 + }, + { + "epoch": 0.22, + "grad_norm": 0.8770780563354492, + "learning_rate": 4.164882739078197e-06, + "loss": 0.1432, + "step": 8650 + }, + { + "epoch": 0.22, + "grad_norm": 0.8647637963294983, + "learning_rate": 4.155679174835758e-06, + "loss": 0.1453, + "step": 8700 + }, + { + "epoch": 0.22, + "grad_norm": 0.8997677564620972, + "learning_rate": 4.146435465386776e-06, + "loss": 0.1429, + "step": 8750 + }, + { + "epoch": 0.22, + "grad_norm": 0.702021598815918, + "learning_rate": 4.137151834863213e-06, + "loss": 0.1448, + "step": 8800 + }, + { + "epoch": 0.22, + "grad_norm": 0.8877437710762024, + "learning_rate": 4.1278285083649985e-06, + "loss": 0.1449, + "step": 8850 + }, + { + "epoch": 0.23, + "grad_norm": 0.8944686055183411, + "learning_rate": 4.11846571195457e-06, + "loss": 0.1522, + "step": 8900 + }, + { + "epoch": 0.23, + "grad_norm": 0.9450408816337585, + "learning_rate": 4.1090636726513875e-06, + "loss": 0.1453, + "step": 8950 + }, + { + "epoch": 0.23, + "grad_norm": 0.7170203328132629, + "learning_rate": 4.0996226184264355e-06, + "loss": 0.1469, + "step": 9000 + }, + { + "epoch": 0.23, + "grad_norm": 0.8969573974609375, + "learning_rate": 4.090142778196692e-06, + "loss": 0.1494, + "step": 9050 + }, + { + "epoch": 0.23, + "grad_norm": 0.9128451347351074, + "learning_rate": 4.080624381819577e-06, + "loss": 0.1413, + "step": 9100 + }, + { + "epoch": 0.23, + "grad_norm": 0.8542819023132324, + "learning_rate": 4.071067660087379e-06, + "loss": 0.1418, + "step": 9150 + }, + { + "epoch": 0.23, + "grad_norm": 0.8860799670219421, + "learning_rate": 4.061472844721664e-06, + "loss": 0.1408, + "step": 9200 + }, + { + "epoch": 0.23, + "grad_norm": 0.8985223770141602, + "learning_rate": 4.05184016836765e-06, + "loss": 0.1444, + "step": 9250 + }, + { + "epoch": 0.24, + "grad_norm": 0.7234982252120972, + "learning_rate": 4.042169864588571e-06, + "loss": 0.1469, + "step": 9300 + }, + { + "epoch": 0.24, + "grad_norm": 0.7840049266815186, + "learning_rate": 4.032462167860012e-06, + "loss": 0.1367, + "step": 9350 + }, + { + "epoch": 0.24, + "grad_norm": 0.8657572269439697, + "learning_rate": 4.022717313564223e-06, + "loss": 0.1456, + "step": 9400 + }, + { + "epoch": 0.24, + "grad_norm": 0.8249016404151917, + "learning_rate": 4.012935537984414e-06, + "loss": 0.1398, + "step": 9450 + }, + { + "epoch": 0.24, + "grad_norm": 0.9602368474006653, + "learning_rate": 4.0031170782990214e-06, + "loss": 0.1465, + "step": 9500 + }, + { + "epoch": 0.24, + "grad_norm": 0.7444416284561157, + "learning_rate": 3.993262172575962e-06, + "loss": 0.1415, + "step": 9550 + }, + { + "epoch": 0.24, + "grad_norm": 0.8414493799209595, + "learning_rate": 3.983371059766862e-06, + "loss": 0.1487, + "step": 9600 + }, + { + "epoch": 0.24, + "grad_norm": 0.8294157385826111, + "learning_rate": 3.973443979701252e-06, + "loss": 0.1406, + "step": 9650 + }, + { + "epoch": 0.25, + "grad_norm": 0.8009164333343506, + "learning_rate": 3.963481173080768e-06, + "loss": 0.1445, + "step": 9700 + }, + { + "epoch": 0.25, + "grad_norm": 0.8331242203712463, + "learning_rate": 3.9534828814733e-06, + "loss": 0.1388, + "step": 9750 + }, + { + "epoch": 0.25, + "grad_norm": 0.8543001413345337, + "learning_rate": 3.943449347307146e-06, + "loss": 0.1399, + "step": 9800 + }, + { + "epoch": 0.25, + "grad_norm": 0.625985324382782, + "learning_rate": 3.9333808138651265e-06, + "loss": 0.1376, + "step": 9850 + }, + { + "epoch": 0.25, + "grad_norm": 0.7723961472511292, + "learning_rate": 3.923277525278691e-06, + "loss": 0.1443, + "step": 9900 + }, + { + "epoch": 0.25, + "grad_norm": 0.8138018846511841, + "learning_rate": 3.913139726521993e-06, + "loss": 0.1358, + "step": 9950 + }, + { + "epoch": 0.25, + "grad_norm": 0.7179055213928223, + "learning_rate": 3.9029676634059565e-06, + "loss": 0.1376, + "step": 10000 + }, + { + "epoch": 0.25, + "grad_norm": 0.7832974195480347, + "learning_rate": 3.89276158257231e-06, + "loss": 0.1391, + "step": 10050 + }, + { + "epoch": 0.26, + "grad_norm": 0.6319305300712585, + "learning_rate": 3.882521731487609e-06, + "loss": 0.1394, + "step": 10100 + }, + { + "epoch": 0.26, + "grad_norm": 0.8627734184265137, + "learning_rate": 3.872248358437236e-06, + "loss": 0.1437, + "step": 10150 + }, + { + "epoch": 0.26, + "grad_norm": 0.8356693387031555, + "learning_rate": 3.861941712519379e-06, + "loss": 0.1395, + "step": 10200 + }, + { + "epoch": 0.26, + "grad_norm": 0.9000275135040283, + "learning_rate": 3.8516020436389945e-06, + "loss": 0.1415, + "step": 10250 + }, + { + "epoch": 0.26, + "grad_norm": 0.8124093413352966, + "learning_rate": 3.841229602501742e-06, + "loss": 0.1425, + "step": 10300 + }, + { + "epoch": 0.26, + "grad_norm": 0.7685266137123108, + "learning_rate": 3.8308246406079116e-06, + "loss": 0.1444, + "step": 10350 + }, + { + "epoch": 0.26, + "grad_norm": 0.858231782913208, + "learning_rate": 3.820387410246324e-06, + "loss": 0.1408, + "step": 10400 + }, + { + "epoch": 0.26, + "grad_norm": 0.7910386323928833, + "learning_rate": 3.809918164488208e-06, + "loss": 0.1404, + "step": 10450 + }, + { + "epoch": 0.27, + "grad_norm": 0.7267195582389832, + "learning_rate": 3.7994171571810756e-06, + "loss": 0.1444, + "step": 10500 + }, + { + "epoch": 0.27, + "grad_norm": 0.9443105459213257, + "learning_rate": 3.788884642942555e-06, + "loss": 0.1427, + "step": 10550 + }, + { + "epoch": 0.27, + "grad_norm": 0.8015535473823547, + "learning_rate": 3.7783208771542237e-06, + "loss": 0.1386, + "step": 10600 + }, + { + "epoch": 0.27, + "grad_norm": 0.8959324955940247, + "learning_rate": 3.7677261159554145e-06, + "loss": 0.1428, + "step": 10650 + }, + { + "epoch": 0.27, + "grad_norm": 0.8010550737380981, + "learning_rate": 3.757100616237006e-06, + "loss": 0.1407, + "step": 10700 + }, + { + "epoch": 0.27, + "grad_norm": 0.7638524174690247, + "learning_rate": 3.746444635635191e-06, + "loss": 0.1413, + "step": 10750 + }, + { + "epoch": 0.27, + "grad_norm": 0.8806131482124329, + "learning_rate": 3.735758432525234e-06, + "loss": 0.1392, + "step": 10800 + }, + { + "epoch": 0.27, + "grad_norm": 0.763507068157196, + "learning_rate": 3.725042266015201e-06, + "loss": 0.1414, + "step": 10850 + }, + { + "epoch": 0.28, + "grad_norm": 0.7763858437538147, + "learning_rate": 3.7142963959396805e-06, + "loss": 0.142, + "step": 10900 + }, + { + "epoch": 0.28, + "grad_norm": 0.8945651054382324, + "learning_rate": 3.7035210828534846e-06, + "loss": 0.1444, + "step": 10950 + }, + { + "epoch": 0.28, + "grad_norm": 0.8169758915901184, + "learning_rate": 3.692716588025327e-06, + "loss": 0.1392, + "step": 11000 + }, + { + "epoch": 0.28, + "grad_norm": 0.8331303596496582, + "learning_rate": 3.68188317343149e-06, + "loss": 0.1408, + "step": 11050 + }, + { + "epoch": 0.28, + "grad_norm": 0.7105779051780701, + "learning_rate": 3.671021101749476e-06, + "loss": 0.1369, + "step": 11100 + }, + { + "epoch": 0.28, + "grad_norm": 0.8562634587287903, + "learning_rate": 3.6601306363516297e-06, + "loss": 0.1476, + "step": 11150 + }, + { + "epoch": 0.28, + "grad_norm": 0.831794798374176, + "learning_rate": 3.649212041298763e-06, + "loss": 0.141, + "step": 11200 + }, + { + "epoch": 0.29, + "grad_norm": 0.7753794193267822, + "learning_rate": 3.638265581333742e-06, + "loss": 0.1396, + "step": 11250 + }, + { + "epoch": 0.29, + "grad_norm": 0.8438231348991394, + "learning_rate": 3.627291521875076e-06, + "loss": 0.1411, + "step": 11300 + }, + { + "epoch": 0.29, + "grad_norm": 0.8898864388465881, + "learning_rate": 3.616290129010476e-06, + "loss": 0.1363, + "step": 11350 + }, + { + "epoch": 0.29, + "grad_norm": 0.8584467768669128, + "learning_rate": 3.605261669490407e-06, + "loss": 0.1437, + "step": 11400 + }, + { + "epoch": 0.29, + "grad_norm": 0.8061217665672302, + "learning_rate": 3.5942064107216183e-06, + "loss": 0.1408, + "step": 11450 + }, + { + "epoch": 0.29, + "grad_norm": 0.7721576690673828, + "learning_rate": 3.5831246207606597e-06, + "loss": 0.14, + "step": 11500 + }, + { + "epoch": 0.29, + "grad_norm": 0.7443546056747437, + "learning_rate": 3.57201656830738e-06, + "loss": 0.1416, + "step": 11550 + }, + { + "epoch": 0.29, + "grad_norm": 0.8907784819602966, + "learning_rate": 3.5608825226984168e-06, + "loss": 0.1409, + "step": 11600 + }, + { + "epoch": 0.3, + "grad_norm": 0.8339740633964539, + "learning_rate": 3.549722753900662e-06, + "loss": 0.1403, + "step": 11650 + }, + { + "epoch": 0.3, + "grad_norm": 0.844826340675354, + "learning_rate": 3.5385375325047167e-06, + "loss": 0.135, + "step": 11700 + }, + { + "epoch": 0.3, + "grad_norm": 0.875929057598114, + "learning_rate": 3.5273271297183302e-06, + "loss": 0.1404, + "step": 11750 + }, + { + "epoch": 0.3, + "grad_norm": 0.7786159515380859, + "learning_rate": 3.516091817359825e-06, + "loss": 0.1372, + "step": 11800 + }, + { + "epoch": 0.3, + "grad_norm": 0.9039524793624878, + "learning_rate": 3.5048318678515052e-06, + "loss": 0.1383, + "step": 11850 + }, + { + "epoch": 0.3, + "grad_norm": 0.8629227876663208, + "learning_rate": 3.493547554213051e-06, + "loss": 0.1365, + "step": 11900 + }, + { + "epoch": 0.3, + "grad_norm": 0.9458798170089722, + "learning_rate": 3.482239150054898e-06, + "loss": 0.1377, + "step": 11950 + }, + { + "epoch": 0.3, + "grad_norm": 0.7950788736343384, + "learning_rate": 3.470906929571605e-06, + "loss": 0.1415, + "step": 12000 + }, + { + "epoch": 0.31, + "grad_norm": 0.9919633865356445, + "learning_rate": 3.459551167535205e-06, + "loss": 0.1375, + "step": 12050 + }, + { + "epoch": 0.31, + "grad_norm": 0.8692936301231384, + "learning_rate": 3.4481721392885415e-06, + "loss": 0.1406, + "step": 12100 + }, + { + "epoch": 0.31, + "grad_norm": 0.7293336391448975, + "learning_rate": 3.4367701207385944e-06, + "loss": 0.1379, + "step": 12150 + }, + { + "epoch": 0.31, + "grad_norm": 0.9827872514724731, + "learning_rate": 3.425345388349787e-06, + "loss": 0.1406, + "step": 12200 + }, + { + "epoch": 0.31, + "grad_norm": 0.8083829283714294, + "learning_rate": 3.4138982191372838e-06, + "loss": 0.1451, + "step": 12250 + }, + { + "epoch": 0.31, + "grad_norm": 0.8368284106254578, + "learning_rate": 3.402428890660279e-06, + "loss": 0.1371, + "step": 12300 + }, + { + "epoch": 0.31, + "grad_norm": 0.7933076024055481, + "learning_rate": 3.390937681015256e-06, + "loss": 0.136, + "step": 12350 + }, + { + "epoch": 0.31, + "grad_norm": 0.7652096748352051, + "learning_rate": 3.379424868829254e-06, + "loss": 0.1349, + "step": 12400 + }, + { + "epoch": 0.32, + "grad_norm": 0.9007067084312439, + "learning_rate": 3.367890733253108e-06, + "loss": 0.1457, + "step": 12450 + }, + { + "epoch": 0.32, + "grad_norm": 0.744931161403656, + "learning_rate": 3.3563355539546795e-06, + "loss": 0.1413, + "step": 12500 + }, + { + "epoch": 0.32, + "grad_norm": 0.8155565857887268, + "learning_rate": 3.3447596111120767e-06, + "loss": 0.1332, + "step": 12550 + }, + { + "epoch": 0.32, + "grad_norm": 0.9961397647857666, + "learning_rate": 3.333163185406861e-06, + "loss": 0.1396, + "step": 12600 + }, + { + "epoch": 0.32, + "grad_norm": 0.7819132208824158, + "learning_rate": 3.321546558017243e-06, + "loss": 0.1355, + "step": 12650 + }, + { + "epoch": 0.32, + "grad_norm": 0.7664303779602051, + "learning_rate": 3.309910010611259e-06, + "loss": 0.1379, + "step": 12700 + }, + { + "epoch": 0.32, + "grad_norm": 0.7041974067687988, + "learning_rate": 3.29825382533995e-06, + "loss": 0.1365, + "step": 12750 + }, + { + "epoch": 0.32, + "grad_norm": 0.8403826355934143, + "learning_rate": 3.286578284830513e-06, + "loss": 0.1435, + "step": 12800 + }, + { + "epoch": 0.33, + "grad_norm": 0.7580990791320801, + "learning_rate": 3.2748836721794514e-06, + "loss": 0.1364, + "step": 12850 + }, + { + "epoch": 0.33, + "grad_norm": 0.7609158158302307, + "learning_rate": 3.263170270945709e-06, + "loss": 0.1374, + "step": 12900 + }, + { + "epoch": 0.33, + "grad_norm": 0.8184811472892761, + "learning_rate": 3.2514383651437987e-06, + "loss": 0.1396, + "step": 12950 + }, + { + "epoch": 0.33, + "grad_norm": 0.948807954788208, + "learning_rate": 3.239688239236911e-06, + "loss": 0.1339, + "step": 13000 + }, + { + "epoch": 0.33, + "grad_norm": 0.8595229983329773, + "learning_rate": 3.2279201781300206e-06, + "loss": 0.1374, + "step": 13050 + }, + { + "epoch": 0.33, + "grad_norm": 0.8162684440612793, + "learning_rate": 3.2161344671629736e-06, + "loss": 0.135, + "step": 13100 + }, + { + "epoch": 0.33, + "grad_norm": 0.7258612513542175, + "learning_rate": 3.2043313921035747e-06, + "loss": 0.1376, + "step": 13150 + }, + { + "epoch": 0.33, + "grad_norm": 0.8645867705345154, + "learning_rate": 3.1925112391406534e-06, + "loss": 0.142, + "step": 13200 + }, + { + "epoch": 0.34, + "grad_norm": 0.7972207069396973, + "learning_rate": 3.1806742948771276e-06, + "loss": 0.1351, + "step": 13250 + }, + { + "epoch": 0.34, + "grad_norm": 0.9695426225662231, + "learning_rate": 3.168820846323053e-06, + "loss": 0.1403, + "step": 13300 + }, + { + "epoch": 0.34, + "grad_norm": 0.6566038727760315, + "learning_rate": 3.1569511808886633e-06, + "loss": 0.1342, + "step": 13350 + }, + { + "epoch": 0.34, + "grad_norm": 0.8785290122032166, + "learning_rate": 3.1450655863774053e-06, + "loss": 0.1391, + "step": 13400 + }, + { + "epoch": 0.34, + "grad_norm": 0.7746759653091431, + "learning_rate": 3.1331643509789553e-06, + "loss": 0.1385, + "step": 13450 + }, + { + "epoch": 0.34, + "grad_norm": 0.7730455994606018, + "learning_rate": 3.121247763262235e-06, + "loss": 0.1336, + "step": 13500 + }, + { + "epoch": 0.34, + "grad_norm": 0.864236056804657, + "learning_rate": 3.1093161121684118e-06, + "loss": 0.1384, + "step": 13550 + }, + { + "epoch": 0.34, + "grad_norm": 0.9014241099357605, + "learning_rate": 3.097369687003896e-06, + "loss": 0.1302, + "step": 13600 + }, + { + "epoch": 0.35, + "grad_norm": 0.8704994916915894, + "learning_rate": 3.085408777433323e-06, + "loss": 0.1347, + "step": 13650 + }, + { + "epoch": 0.35, + "grad_norm": 0.793929934501648, + "learning_rate": 3.0734336734725327e-06, + "loss": 0.1396, + "step": 13700 + }, + { + "epoch": 0.35, + "grad_norm": 0.7899141907691956, + "learning_rate": 3.0614446654815346e-06, + "loss": 0.1331, + "step": 13750 + }, + { + "epoch": 0.35, + "grad_norm": 0.7553165555000305, + "learning_rate": 3.049442044157469e-06, + "loss": 0.1326, + "step": 13800 + }, + { + "epoch": 0.35, + "grad_norm": 0.7848679423332214, + "learning_rate": 3.0374261005275606e-06, + "loss": 0.1379, + "step": 13850 + }, + { + "epoch": 0.35, + "grad_norm": 0.9563062191009521, + "learning_rate": 3.025397125942056e-06, + "loss": 0.1357, + "step": 13900 + }, + { + "epoch": 0.35, + "grad_norm": 0.7703486084938049, + "learning_rate": 3.0133554120671653e-06, + "loss": 0.1317, + "step": 13950 + }, + { + "epoch": 0.35, + "grad_norm": 0.8696340322494507, + "learning_rate": 3.001301250877987e-06, + "loss": 0.1309, + "step": 14000 + }, + { + "epoch": 0.36, + "grad_norm": 0.7226243615150452, + "learning_rate": 2.9892349346514306e-06, + "loss": 0.1346, + "step": 14050 + }, + { + "epoch": 0.36, + "grad_norm": 0.8346455097198486, + "learning_rate": 2.977156755959126e-06, + "loss": 0.1421, + "step": 14100 + }, + { + "epoch": 0.36, + "grad_norm": 0.9874286651611328, + "learning_rate": 2.9650670076603342e-06, + "loss": 0.137, + "step": 14150 + }, + { + "epoch": 0.36, + "grad_norm": 0.7056658267974854, + "learning_rate": 2.952965982894844e-06, + "loss": 0.1355, + "step": 14200 + }, + { + "epoch": 0.36, + "grad_norm": 0.8198770880699158, + "learning_rate": 2.9408539750758625e-06, + "loss": 0.1353, + "step": 14250 + }, + { + "epoch": 0.36, + "grad_norm": 0.8781945109367371, + "learning_rate": 2.9287312778829047e-06, + "loss": 0.1333, + "step": 14300 + }, + { + "epoch": 0.36, + "grad_norm": 0.8908482193946838, + "learning_rate": 2.9165981852546688e-06, + "loss": 0.14, + "step": 14350 + }, + { + "epoch": 0.36, + "grad_norm": 0.872043251991272, + "learning_rate": 2.9044549913819125e-06, + "loss": 0.1327, + "step": 14400 + }, + { + "epoch": 0.37, + "grad_norm": 0.8322109580039978, + "learning_rate": 2.892301990700316e-06, + "loss": 0.1391, + "step": 14450 + }, + { + "epoch": 0.37, + "grad_norm": 0.8258897662162781, + "learning_rate": 2.8801394778833475e-06, + "loss": 0.1381, + "step": 14500 + }, + { + "epoch": 0.37, + "grad_norm": 0.9012707471847534, + "learning_rate": 2.8679677478351147e-06, + "loss": 0.1336, + "step": 14550 + }, + { + "epoch": 0.37, + "grad_norm": 0.9831441044807434, + "learning_rate": 2.8557870956832135e-06, + "loss": 0.135, + "step": 14600 + }, + { + "epoch": 0.37, + "grad_norm": 0.8136524558067322, + "learning_rate": 2.8435978167715753e-06, + "loss": 0.1367, + "step": 14650 + }, + { + "epoch": 0.37, + "grad_norm": 0.7816210985183716, + "learning_rate": 2.8314002066533053e-06, + "loss": 0.13, + "step": 14700 + }, + { + "epoch": 0.37, + "grad_norm": 0.8131623268127441, + "learning_rate": 2.8191945610835138e-06, + "loss": 0.1387, + "step": 14750 + }, + { + "epoch": 0.38, + "grad_norm": 0.7747004628181458, + "learning_rate": 2.8069811760121463e-06, + "loss": 0.1369, + "step": 14800 + }, + { + "epoch": 0.38, + "grad_norm": 0.715416669845581, + "learning_rate": 2.794760347576809e-06, + "loss": 0.1353, + "step": 14850 + }, + { + "epoch": 0.38, + "grad_norm": 0.8204655051231384, + "learning_rate": 2.7825323720955853e-06, + "loss": 0.1377, + "step": 14900 + }, + { + "epoch": 0.38, + "grad_norm": 0.6974436044692993, + "learning_rate": 2.7702975460598545e-06, + "loss": 0.1352, + "step": 14950 + }, + { + "epoch": 0.38, + "grad_norm": 0.8297330141067505, + "learning_rate": 2.7580561661271015e-06, + "loss": 0.1385, + "step": 15000 + }, + { + "epoch": 0.38, + "grad_norm": 0.7207173705101013, + "learning_rate": 2.7458085291137213e-06, + "loss": 0.1352, + "step": 15050 + }, + { + "epoch": 0.38, + "grad_norm": 0.7374753355979919, + "learning_rate": 2.733554931987825e-06, + "loss": 0.1357, + "step": 15100 + }, + { + "epoch": 0.38, + "grad_norm": 0.8534835577011108, + "learning_rate": 2.7212956718620404e-06, + "loss": 0.1286, + "step": 15150 + }, + { + "epoch": 0.39, + "grad_norm": 0.8803167939186096, + "learning_rate": 2.709031045986302e-06, + "loss": 0.1302, + "step": 15200 + }, + { + "epoch": 0.39, + "grad_norm": 0.7593905925750732, + "learning_rate": 2.6967613517406514e-06, + "loss": 0.1344, + "step": 15250 + }, + { + "epoch": 0.39, + "grad_norm": 0.8493793606758118, + "learning_rate": 2.68448688662802e-06, + "loss": 0.133, + "step": 15300 + }, + { + "epoch": 0.39, + "grad_norm": 0.7607067823410034, + "learning_rate": 2.6722079482670196e-06, + "loss": 0.1355, + "step": 15350 + }, + { + "epoch": 0.39, + "grad_norm": 0.6717010736465454, + "learning_rate": 2.6599248343847243e-06, + "loss": 0.1337, + "step": 15400 + }, + { + "epoch": 0.39, + "grad_norm": 0.8807181715965271, + "learning_rate": 2.6476378428094523e-06, + "loss": 0.1308, + "step": 15450 + }, + { + "epoch": 0.39, + "grad_norm": 0.7503638863563538, + "learning_rate": 2.6353472714635443e-06, + "loss": 0.1358, + "step": 15500 + }, + { + "epoch": 0.39, + "grad_norm": 0.8454340100288391, + "learning_rate": 2.6230534183561385e-06, + "loss": 0.1349, + "step": 15550 + }, + { + "epoch": 0.4, + "grad_norm": 0.7447136640548706, + "learning_rate": 2.6107565815759473e-06, + "loss": 0.1366, + "step": 15600 + }, + { + "epoch": 0.4, + "grad_norm": 0.891941249370575, + "learning_rate": 2.598457059284027e-06, + "loss": 0.1387, + "step": 15650 + }, + { + "epoch": 0.4, + "grad_norm": 0.8272932767868042, + "learning_rate": 2.5861551497065497e-06, + "loss": 0.1324, + "step": 15700 + }, + { + "epoch": 0.4, + "grad_norm": 0.8277787566184998, + "learning_rate": 2.5738511511275716e-06, + "loss": 0.1344, + "step": 15750 + }, + { + "epoch": 0.4, + "grad_norm": 0.7462809681892395, + "learning_rate": 2.5615453618818033e-06, + "loss": 0.1289, + "step": 15800 + }, + { + "epoch": 0.4, + "grad_norm": 0.6402873396873474, + "learning_rate": 2.5492380803473705e-06, + "loss": 0.1296, + "step": 15850 + }, + { + "epoch": 0.4, + "grad_norm": 0.8029218912124634, + "learning_rate": 2.5369296049385837e-06, + "loss": 0.1354, + "step": 15900 + }, + { + "epoch": 0.4, + "grad_norm": 0.912257969379425, + "learning_rate": 2.5246202340987004e-06, + "loss": 0.129, + "step": 15950 + }, + { + "epoch": 0.41, + "grad_norm": 0.8321228623390198, + "learning_rate": 2.5123102662926912e-06, + "loss": 0.1346, + "step": 16000 + }, + { + "epoch": 0.41, + "grad_norm": 0.7897807955741882, + "learning_rate": 2.5e-06, + "loss": 0.1343, + "step": 16050 + }, + { + "epoch": 0.41, + "grad_norm": 0.8118634223937988, + "learning_rate": 2.4876897337073105e-06, + "loss": 0.1316, + "step": 16100 + }, + { + "epoch": 0.41, + "grad_norm": 0.8614838123321533, + "learning_rate": 2.475379765901301e-06, + "loss": 0.1356, + "step": 16150 + }, + { + "epoch": 0.41, + "grad_norm": 0.8548163175582886, + "learning_rate": 2.4630703950614176e-06, + "loss": 0.1342, + "step": 16200 + }, + { + "epoch": 0.41, + "grad_norm": 0.8339961171150208, + "learning_rate": 2.45076191965263e-06, + "loss": 0.1253, + "step": 16250 + }, + { + "epoch": 0.41, + "grad_norm": 0.7520368695259094, + "learning_rate": 2.4384546381181975e-06, + "loss": 0.1281, + "step": 16300 + }, + { + "epoch": 0.41, + "grad_norm": 0.8519024848937988, + "learning_rate": 2.4261488488724284e-06, + "loss": 0.1339, + "step": 16350 + }, + { + "epoch": 0.42, + "grad_norm": 0.6523202061653137, + "learning_rate": 2.413844850293451e-06, + "loss": 0.1354, + "step": 16400 + }, + { + "epoch": 0.42, + "grad_norm": 0.8685447573661804, + "learning_rate": 2.4015429407159746e-06, + "loss": 0.134, + "step": 16450 + }, + { + "epoch": 0.42, + "grad_norm": 0.7570633292198181, + "learning_rate": 2.3892434184240536e-06, + "loss": 0.131, + "step": 16500 + }, + { + "epoch": 0.42, + "grad_norm": 0.8787110447883606, + "learning_rate": 2.3769465816438627e-06, + "loss": 0.1305, + "step": 16550 + }, + { + "epoch": 0.42, + "grad_norm": 0.8211491107940674, + "learning_rate": 2.3646527285364565e-06, + "loss": 0.133, + "step": 16600 + }, + { + "epoch": 0.42, + "grad_norm": 0.8259795308113098, + "learning_rate": 2.3523621571905485e-06, + "loss": 0.1324, + "step": 16650 + }, + { + "epoch": 0.42, + "grad_norm": 0.8258258700370789, + "learning_rate": 2.340075165615276e-06, + "loss": 0.1305, + "step": 16700 + }, + { + "epoch": 0.42, + "grad_norm": 0.8116273283958435, + "learning_rate": 2.3277920517329813e-06, + "loss": 0.1341, + "step": 16750 + }, + { + "epoch": 0.43, + "grad_norm": 0.8958277702331543, + "learning_rate": 2.315513113371981e-06, + "loss": 0.1274, + "step": 16800 + }, + { + "epoch": 0.43, + "grad_norm": 0.7367030382156372, + "learning_rate": 2.303238648259349e-06, + "loss": 0.1276, + "step": 16850 + }, + { + "epoch": 0.43, + "grad_norm": 0.7800854444503784, + "learning_rate": 2.2909689540136986e-06, + "loss": 0.1347, + "step": 16900 + }, + { + "epoch": 0.43, + "grad_norm": 0.7913332581520081, + "learning_rate": 2.27870432813796e-06, + "loss": 0.1293, + "step": 16950 + }, + { + "epoch": 0.43, + "grad_norm": 0.7659708261489868, + "learning_rate": 2.2664450680121757e-06, + "loss": 0.1334, + "step": 17000 + }, + { + "epoch": 0.43, + "grad_norm": 0.7981456518173218, + "learning_rate": 2.254191470886279e-06, + "loss": 0.1356, + "step": 17050 + }, + { + "epoch": 0.43, + "grad_norm": 0.8289954662322998, + "learning_rate": 2.241943833872899e-06, + "loss": 0.1305, + "step": 17100 + }, + { + "epoch": 0.43, + "grad_norm": 0.8367072343826294, + "learning_rate": 2.2297024539401463e-06, + "loss": 0.1307, + "step": 17150 + }, + { + "epoch": 0.44, + "grad_norm": 0.8345316648483276, + "learning_rate": 2.2174676279044155e-06, + "loss": 0.1283, + "step": 17200 + }, + { + "epoch": 0.44, + "grad_norm": 0.8068302869796753, + "learning_rate": 2.2052396524231924e-06, + "loss": 0.1342, + "step": 17250 + }, + { + "epoch": 0.44, + "grad_norm": 0.811231791973114, + "learning_rate": 2.193018823987854e-06, + "loss": 0.1374, + "step": 17300 + }, + { + "epoch": 0.44, + "grad_norm": 0.8022611141204834, + "learning_rate": 2.180805438916487e-06, + "loss": 0.1296, + "step": 17350 + }, + { + "epoch": 0.44, + "grad_norm": 0.8078568577766418, + "learning_rate": 2.1685997933466947e-06, + "loss": 0.1273, + "step": 17400 + }, + { + "epoch": 0.44, + "grad_norm": 0.749242901802063, + "learning_rate": 2.1564021832284255e-06, + "loss": 0.1302, + "step": 17450 + }, + { + "epoch": 0.44, + "grad_norm": 0.8693391680717468, + "learning_rate": 2.1442129043167877e-06, + "loss": 0.1296, + "step": 17500 + }, + { + "epoch": 0.44, + "grad_norm": 0.9399083852767944, + "learning_rate": 2.1320322521648857e-06, + "loss": 0.127, + "step": 17550 + }, + { + "epoch": 0.45, + "grad_norm": 0.8048396110534668, + "learning_rate": 2.119860522116653e-06, + "loss": 0.128, + "step": 17600 + }, + { + "epoch": 0.45, + "grad_norm": 0.7869442701339722, + "learning_rate": 2.1076980092996845e-06, + "loss": 0.1324, + "step": 17650 + }, + { + "epoch": 0.45, + "grad_norm": 0.8741517066955566, + "learning_rate": 2.0955450086180883e-06, + "loss": 0.1351, + "step": 17700 + }, + { + "epoch": 0.45, + "grad_norm": 0.9408276081085205, + "learning_rate": 2.083401814745332e-06, + "loss": 0.1331, + "step": 17750 + }, + { + "epoch": 0.45, + "grad_norm": 0.8211988210678101, + "learning_rate": 2.071268722117096e-06, + "loss": 0.1281, + "step": 17800 + }, + { + "epoch": 0.45, + "grad_norm": 0.8505455851554871, + "learning_rate": 2.0591460249241383e-06, + "loss": 0.1296, + "step": 17850 + }, + { + "epoch": 0.45, + "grad_norm": 0.8733687400817871, + "learning_rate": 2.0470340171051567e-06, + "loss": 0.1339, + "step": 17900 + }, + { + "epoch": 0.45, + "grad_norm": 0.8266494274139404, + "learning_rate": 2.034932992339666e-06, + "loss": 0.1283, + "step": 17950 + }, + { + "epoch": 0.46, + "grad_norm": 0.7836053967475891, + "learning_rate": 2.022843244040874e-06, + "loss": 0.1243, + "step": 18000 + }, + { + "epoch": 0.46, + "grad_norm": 0.9171106815338135, + "learning_rate": 2.0107650653485707e-06, + "loss": 0.1341, + "step": 18050 + }, + { + "epoch": 0.46, + "grad_norm": 0.796025812625885, + "learning_rate": 1.998698749122014e-06, + "loss": 0.1336, + "step": 18100 + }, + { + "epoch": 0.46, + "grad_norm": 0.8582082986831665, + "learning_rate": 1.986644587932835e-06, + "loss": 0.1311, + "step": 18150 + }, + { + "epoch": 0.46, + "grad_norm": 0.7605078220367432, + "learning_rate": 1.9746028740579453e-06, + "loss": 0.1288, + "step": 18200 + }, + { + "epoch": 0.46, + "grad_norm": 0.8088697195053101, + "learning_rate": 1.96257389947244e-06, + "loss": 0.1285, + "step": 18250 + }, + { + "epoch": 0.46, + "grad_norm": 0.8578440546989441, + "learning_rate": 1.9505579558425315e-06, + "loss": 0.1346, + "step": 18300 + }, + { + "epoch": 0.47, + "grad_norm": 0.7750033140182495, + "learning_rate": 1.938555334518466e-06, + "loss": 0.1306, + "step": 18350 + }, + { + "epoch": 0.47, + "grad_norm": 0.8363983631134033, + "learning_rate": 1.926566326527468e-06, + "loss": 0.1313, + "step": 18400 + }, + { + "epoch": 0.47, + "grad_norm": 0.7722070813179016, + "learning_rate": 1.914591222566678e-06, + "loss": 0.1337, + "step": 18450 + }, + { + "epoch": 0.47, + "grad_norm": 0.8674912452697754, + "learning_rate": 1.9026303129961049e-06, + "loss": 0.1268, + "step": 18500 + }, + { + "epoch": 0.47, + "grad_norm": 1.0055360794067383, + "learning_rate": 1.8906838878315886e-06, + "loss": 0.1299, + "step": 18550 + }, + { + "epoch": 0.47, + "grad_norm": 0.797717273235321, + "learning_rate": 1.878752236737765e-06, + "loss": 0.1261, + "step": 18600 + }, + { + "epoch": 0.47, + "grad_norm": 0.8092938661575317, + "learning_rate": 1.8668356490210449e-06, + "loss": 0.1362, + "step": 18650 + }, + { + "epoch": 0.47, + "grad_norm": 0.8348585367202759, + "learning_rate": 1.8549344136225946e-06, + "loss": 0.1276, + "step": 18700 + }, + { + "epoch": 0.48, + "grad_norm": 0.8768981099128723, + "learning_rate": 1.8430488191113373e-06, + "loss": 0.1332, + "step": 18750 + }, + { + "epoch": 0.48, + "grad_norm": 0.7733373045921326, + "learning_rate": 1.8311791536769485e-06, + "loss": 0.134, + "step": 18800 + }, + { + "epoch": 0.48, + "grad_norm": 0.8418094515800476, + "learning_rate": 1.819325705122873e-06, + "loss": 0.133, + "step": 18850 + }, + { + "epoch": 0.48, + "grad_norm": 0.8411191701889038, + "learning_rate": 1.8074887608593477e-06, + "loss": 0.1304, + "step": 18900 + }, + { + "epoch": 0.48, + "grad_norm": 0.9010744094848633, + "learning_rate": 1.7956686078964257e-06, + "loss": 0.131, + "step": 18950 + }, + { + "epoch": 0.48, + "grad_norm": 0.8585550785064697, + "learning_rate": 1.7838655328370268e-06, + "loss": 0.1308, + "step": 19000 + }, + { + "epoch": 0.48, + "grad_norm": 0.8417059183120728, + "learning_rate": 1.7720798218699798e-06, + "loss": 0.1282, + "step": 19050 + }, + { + "epoch": 0.48, + "grad_norm": 0.8636181354522705, + "learning_rate": 1.7603117607630892e-06, + "loss": 0.1256, + "step": 19100 + }, + { + "epoch": 0.49, + "grad_norm": 0.8818506598472595, + "learning_rate": 1.7485616348562023e-06, + "loss": 0.1306, + "step": 19150 + }, + { + "epoch": 0.49, + "grad_norm": 0.8800364136695862, + "learning_rate": 1.7368297290542918e-06, + "loss": 0.1303, + "step": 19200 + }, + { + "epoch": 0.49, + "grad_norm": 0.8845418095588684, + "learning_rate": 1.72511632782055e-06, + "loss": 0.131, + "step": 19250 + }, + { + "epoch": 0.49, + "grad_norm": 0.9156243205070496, + "learning_rate": 1.7134217151694873e-06, + "loss": 0.1269, + "step": 19300 + }, + { + "epoch": 0.49, + "grad_norm": 0.772602915763855, + "learning_rate": 1.7017461746600506e-06, + "loss": 0.1321, + "step": 19350 + }, + { + "epoch": 0.49, + "grad_norm": 0.7767558097839355, + "learning_rate": 1.690089989388741e-06, + "loss": 0.1302, + "step": 19400 + }, + { + "epoch": 0.49, + "grad_norm": 0.816466212272644, + "learning_rate": 1.678453441982758e-06, + "loss": 0.132, + "step": 19450 + }, + { + "epoch": 0.49, + "grad_norm": 0.746376633644104, + "learning_rate": 1.66683681459314e-06, + "loss": 0.1321, + "step": 19500 + }, + { + "epoch": 0.5, + "grad_norm": 0.9455410242080688, + "learning_rate": 1.6552403888879243e-06, + "loss": 0.1265, + "step": 19550 + }, + { + "epoch": 0.5, + "grad_norm": 0.9499367475509644, + "learning_rate": 1.6436644460453218e-06, + "loss": 0.1266, + "step": 19600 + }, + { + "epoch": 0.5, + "grad_norm": 0.8672183156013489, + "learning_rate": 1.6321092667468926e-06, + "loss": 0.1288, + "step": 19650 + }, + { + "epoch": 0.5, + "grad_norm": 0.7655101418495178, + "learning_rate": 1.6205751311707463e-06, + "loss": 0.1346, + "step": 19700 + }, + { + "epoch": 0.5, + "grad_norm": 0.7775772213935852, + "learning_rate": 1.6090623189847443e-06, + "loss": 0.1276, + "step": 19750 + }, + { + "epoch": 0.5, + "grad_norm": 0.8449127078056335, + "learning_rate": 1.5975711093397223e-06, + "loss": 0.1323, + "step": 19800 + }, + { + "epoch": 0.5, + "grad_norm": 0.9375335574150085, + "learning_rate": 1.5861017808627167e-06, + "loss": 0.1338, + "step": 19850 + }, + { + "epoch": 0.5, + "grad_norm": 0.7458611130714417, + "learning_rate": 1.574654611650214e-06, + "loss": 0.1307, + "step": 19900 + }, + { + "epoch": 0.51, + "grad_norm": 0.749284029006958, + "learning_rate": 1.5632298792614064e-06, + "loss": 0.1216, + "step": 19950 + }, + { + "epoch": 0.51, + "grad_norm": 0.8348965644836426, + "learning_rate": 1.5518278607114585e-06, + "loss": 0.1253, + "step": 20000 + }, + { + "epoch": 0.51, + "grad_norm": 0.9566074013710022, + "learning_rate": 1.540448832464796e-06, + "loss": 0.1289, + "step": 20050 + }, + { + "epoch": 0.51, + "grad_norm": 0.8038182854652405, + "learning_rate": 1.5290930704283953e-06, + "loss": 0.1266, + "step": 20100 + }, + { + "epoch": 0.51, + "grad_norm": 0.8434205055236816, + "learning_rate": 1.517760849945103e-06, + "loss": 0.1276, + "step": 20150 + }, + { + "epoch": 0.51, + "grad_norm": 0.7553625702857971, + "learning_rate": 1.5064524457869506e-06, + "loss": 0.128, + "step": 20200 + }, + { + "epoch": 0.51, + "grad_norm": 0.7524191737174988, + "learning_rate": 1.4951681321484952e-06, + "loss": 0.1313, + "step": 20250 + }, + { + "epoch": 0.51, + "grad_norm": 0.8314839601516724, + "learning_rate": 1.4839081826401756e-06, + "loss": 0.1255, + "step": 20300 + }, + { + "epoch": 0.52, + "grad_norm": 0.8954294323921204, + "learning_rate": 1.47267287028167e-06, + "loss": 0.1259, + "step": 20350 + }, + { + "epoch": 0.52, + "grad_norm": 0.7757198810577393, + "learning_rate": 1.4614624674952843e-06, + "loss": 0.1288, + "step": 20400 + }, + { + "epoch": 0.52, + "grad_norm": 0.9685442447662354, + "learning_rate": 1.4502772460993387e-06, + "loss": 0.1313, + "step": 20450 + }, + { + "epoch": 0.52, + "grad_norm": 0.8876562118530273, + "learning_rate": 1.4391174773015836e-06, + "loss": 0.1326, + "step": 20500 + }, + { + "epoch": 0.52, + "grad_norm": 1.0486761331558228, + "learning_rate": 1.4279834316926217e-06, + "loss": 0.126, + "step": 20550 + }, + { + "epoch": 0.52, + "grad_norm": 0.806884765625, + "learning_rate": 1.4168753792393413e-06, + "loss": 0.1286, + "step": 20600 + }, + { + "epoch": 0.52, + "grad_norm": 0.8478713631629944, + "learning_rate": 1.405793589278382e-06, + "loss": 0.1248, + "step": 20650 + }, + { + "epoch": 0.52, + "grad_norm": 1.0112961530685425, + "learning_rate": 1.394738330509593e-06, + "loss": 0.1265, + "step": 20700 + }, + { + "epoch": 0.53, + "grad_norm": 0.9463728666305542, + "learning_rate": 1.3837098709895246e-06, + "loss": 0.1274, + "step": 20750 + }, + { + "epoch": 0.53, + "grad_norm": 0.8236443996429443, + "learning_rate": 1.3727084781249251e-06, + "loss": 0.1287, + "step": 20800 + }, + { + "epoch": 0.53, + "grad_norm": 0.8493944406509399, + "learning_rate": 1.3617344186662585e-06, + "loss": 0.1252, + "step": 20850 + }, + { + "epoch": 0.53, + "grad_norm": 0.7850596904754639, + "learning_rate": 1.3507879587012378e-06, + "loss": 0.1199, + "step": 20900 + }, + { + "epoch": 0.53, + "grad_norm": 1.119826316833496, + "learning_rate": 1.3398693636483707e-06, + "loss": 0.1259, + "step": 20950 + }, + { + "epoch": 0.53, + "grad_norm": 0.8658459782600403, + "learning_rate": 1.328978898250525e-06, + "loss": 0.1276, + "step": 21000 + }, + { + "epoch": 0.53, + "grad_norm": 0.8346827030181885, + "learning_rate": 1.31811682656851e-06, + "loss": 0.126, + "step": 21050 + }, + { + "epoch": 0.53, + "grad_norm": 1.0707381963729858, + "learning_rate": 1.307283411974674e-06, + "loss": 0.1266, + "step": 21100 + }, + { + "epoch": 0.54, + "grad_norm": 0.8419653177261353, + "learning_rate": 1.2964789171465164e-06, + "loss": 0.1253, + "step": 21150 + }, + { + "epoch": 0.54, + "grad_norm": 0.6659058332443237, + "learning_rate": 1.2857036040603204e-06, + "loss": 0.1251, + "step": 21200 + }, + { + "epoch": 0.54, + "grad_norm": 0.9063161015510559, + "learning_rate": 1.2749577339848007e-06, + "loss": 0.1245, + "step": 21250 + }, + { + "epoch": 0.54, + "grad_norm": 0.9174826741218567, + "learning_rate": 1.2642415674747675e-06, + "loss": 0.1322, + "step": 21300 + }, + { + "epoch": 0.54, + "grad_norm": 0.8082279562950134, + "learning_rate": 1.25355536436481e-06, + "loss": 0.1294, + "step": 21350 + }, + { + "epoch": 0.54, + "grad_norm": 0.9118791222572327, + "learning_rate": 1.2428993837629943e-06, + "loss": 0.1277, + "step": 21400 + }, + { + "epoch": 0.54, + "grad_norm": 0.9657939076423645, + "learning_rate": 1.2322738840445867e-06, + "loss": 0.1281, + "step": 21450 + }, + { + "epoch": 0.54, + "grad_norm": 0.8594872355461121, + "learning_rate": 1.2216791228457778e-06, + "loss": 0.1193, + "step": 21500 + }, + { + "epoch": 0.55, + "grad_norm": 1.0133215188980103, + "learning_rate": 1.2111153570574454e-06, + "loss": 0.1275, + "step": 21550 + }, + { + "epoch": 0.55, + "grad_norm": 0.8617738485336304, + "learning_rate": 1.2005828428189256e-06, + "loss": 0.1269, + "step": 21600 + }, + { + "epoch": 0.55, + "grad_norm": 0.7881460189819336, + "learning_rate": 1.1900818355117918e-06, + "loss": 0.1251, + "step": 21650 + }, + { + "epoch": 0.55, + "grad_norm": 0.8519832491874695, + "learning_rate": 1.1796125897536782e-06, + "loss": 0.1311, + "step": 21700 + }, + { + "epoch": 0.55, + "grad_norm": 0.8465796113014221, + "learning_rate": 1.1691753593920884e-06, + "loss": 0.1293, + "step": 21750 + }, + { + "epoch": 0.55, + "grad_norm": 0.892995297908783, + "learning_rate": 1.1587703974982583e-06, + "loss": 0.1272, + "step": 21800 + }, + { + "epoch": 0.55, + "grad_norm": 0.8453019261360168, + "learning_rate": 1.148397956361007e-06, + "loss": 0.1267, + "step": 21850 + }, + { + "epoch": 0.56, + "grad_norm": 0.7927883863449097, + "learning_rate": 1.1380582874806208e-06, + "loss": 0.1274, + "step": 21900 + }, + { + "epoch": 0.56, + "grad_norm": 0.848616898059845, + "learning_rate": 1.127751641562765e-06, + "loss": 0.1318, + "step": 21950 + }, + { + "epoch": 0.56, + "grad_norm": 0.8871294260025024, + "learning_rate": 1.1174782685123919e-06, + "loss": 0.1278, + "step": 22000 + }, + { + "epoch": 0.56, + "grad_norm": 0.9262788891792297, + "learning_rate": 1.107238417427691e-06, + "loss": 0.1242, + "step": 22050 + }, + { + "epoch": 0.56, + "grad_norm": 1.0166550874710083, + "learning_rate": 1.0970323365940443e-06, + "loss": 0.1271, + "step": 22100 + }, + { + "epoch": 0.56, + "grad_norm": 0.9763181805610657, + "learning_rate": 1.0868602734780075e-06, + "loss": 0.1266, + "step": 22150 + }, + { + "epoch": 0.56, + "grad_norm": 0.8121241927146912, + "learning_rate": 1.0767224747213102e-06, + "loss": 0.1286, + "step": 22200 + }, + { + "epoch": 0.56, + "grad_norm": 0.7165303230285645, + "learning_rate": 1.0666191861348741e-06, + "loss": 0.1275, + "step": 22250 + }, + { + "epoch": 0.57, + "grad_norm": 1.0297795534133911, + "learning_rate": 1.0565506526928548e-06, + "loss": 0.1251, + "step": 22300 + }, + { + "epoch": 0.57, + "grad_norm": 0.8937394022941589, + "learning_rate": 1.0465171185267007e-06, + "loss": 0.1297, + "step": 22350 + }, + { + "epoch": 0.57, + "grad_norm": 0.9162909388542175, + "learning_rate": 1.036518826919233e-06, + "loss": 0.1325, + "step": 22400 + }, + { + "epoch": 0.57, + "grad_norm": 0.8973715305328369, + "learning_rate": 1.0265560202987474e-06, + "loss": 0.1296, + "step": 22450 + }, + { + "epoch": 0.57, + "grad_norm": 0.8906801342964172, + "learning_rate": 1.0166289402331391e-06, + "loss": 0.128, + "step": 22500 + }, + { + "epoch": 0.57, + "grad_norm": 0.7446568608283997, + "learning_rate": 1.006737827424038e-06, + "loss": 0.1238, + "step": 22550 + }, + { + "epoch": 0.57, + "grad_norm": 1.0760340690612793, + "learning_rate": 9.9688292170098e-07, + "loss": 0.129, + "step": 22600 + }, + { + "epoch": 0.57, + "grad_norm": 0.9284375309944153, + "learning_rate": 9.870644620155878e-07, + "loss": 0.125, + "step": 22650 + }, + { + "epoch": 0.58, + "grad_norm": 0.9257674217224121, + "learning_rate": 9.77282686435777e-07, + "loss": 0.1272, + "step": 22700 + }, + { + "epoch": 0.58, + "grad_norm": 0.7764928936958313, + "learning_rate": 9.67537832139989e-07, + "loss": 0.1224, + "step": 22750 + }, + { + "epoch": 0.58, + "grad_norm": 0.7755612730979919, + "learning_rate": 9.578301354114292e-07, + "loss": 0.1279, + "step": 22800 + }, + { + "epoch": 0.58, + "grad_norm": 0.847436785697937, + "learning_rate": 9.481598316323504e-07, + "loss": 0.1273, + "step": 22850 + }, + { + "epoch": 0.58, + "grad_norm": 0.8615244030952454, + "learning_rate": 9.385271552783376e-07, + "loss": 0.1267, + "step": 22900 + }, + { + "epoch": 0.58, + "grad_norm": 0.8775952458381653, + "learning_rate": 9.289323399126216e-07, + "loss": 0.1276, + "step": 22950 + }, + { + "epoch": 0.58, + "grad_norm": 0.9326611161231995, + "learning_rate": 9.193756181804248e-07, + "loss": 0.1296, + "step": 23000 + }, + { + "epoch": 0.58, + "grad_norm": 0.6743093132972717, + "learning_rate": 9.098572218033084e-07, + "loss": 0.1279, + "step": 23050 + }, + { + "epoch": 0.59, + "grad_norm": 0.9707579612731934, + "learning_rate": 9.003773815735644e-07, + "loss": 0.1283, + "step": 23100 + }, + { + "epoch": 0.59, + "grad_norm": 0.8160597681999207, + "learning_rate": 8.90936327348613e-07, + "loss": 0.127, + "step": 23150 + }, + { + "epoch": 0.59, + "grad_norm": 1.070728063583374, + "learning_rate": 8.815342880454312e-07, + "loss": 0.1265, + "step": 23200 + }, + { + "epoch": 0.59, + "grad_norm": 0.945746123790741, + "learning_rate": 8.721714916350019e-07, + "loss": 0.1241, + "step": 23250 + }, + { + "epoch": 0.59, + "grad_norm": 0.844608724117279, + "learning_rate": 8.628481651367876e-07, + "loss": 0.1271, + "step": 23300 + }, + { + "epoch": 0.59, + "grad_norm": 0.8871841430664062, + "learning_rate": 8.535645346132246e-07, + "loss": 0.1313, + "step": 23350 + }, + { + "epoch": 0.59, + "grad_norm": 0.791871964931488, + "learning_rate": 8.443208251642418e-07, + "loss": 0.1242, + "step": 23400 + }, + { + "epoch": 0.59, + "grad_norm": 0.9160509705543518, + "learning_rate": 8.351172609218033e-07, + "loss": 0.1217, + "step": 23450 + }, + { + "epoch": 0.6, + "grad_norm": 0.7962935566902161, + "learning_rate": 8.259540650444736e-07, + "loss": 0.128, + "step": 23500 + }, + { + "epoch": 0.6, + "grad_norm": 0.7934704422950745, + "learning_rate": 8.168314597120059e-07, + "loss": 0.1217, + "step": 23550 + }, + { + "epoch": 0.6, + "grad_norm": 0.7720403671264648, + "learning_rate": 8.077496661199557e-07, + "loss": 0.1252, + "step": 23600 + }, + { + "epoch": 0.6, + "grad_norm": 0.806203305721283, + "learning_rate": 7.987089044743182e-07, + "loss": 0.1276, + "step": 23650 + }, + { + "epoch": 0.6, + "grad_norm": 0.9752463698387146, + "learning_rate": 7.897093939861878e-07, + "loss": 0.1288, + "step": 23700 + }, + { + "epoch": 0.6, + "grad_norm": 0.8070717453956604, + "learning_rate": 7.807513528664415e-07, + "loss": 0.1272, + "step": 23750 + }, + { + "epoch": 0.6, + "grad_norm": 0.773583710193634, + "learning_rate": 7.71834998320454e-07, + "loss": 0.1192, + "step": 23800 + }, + { + "epoch": 0.6, + "grad_norm": 0.7892506122589111, + "learning_rate": 7.629605465428211e-07, + "loss": 0.1217, + "step": 23850 + }, + { + "epoch": 0.61, + "grad_norm": 0.6606497764587402, + "learning_rate": 7.541282127121291e-07, + "loss": 0.1222, + "step": 23900 + }, + { + "epoch": 0.61, + "grad_norm": 0.8119127154350281, + "learning_rate": 7.453382109857269e-07, + "loss": 0.1241, + "step": 23950 + }, + { + "epoch": 0.61, + "grad_norm": 0.7261886596679688, + "learning_rate": 7.365907544945398e-07, + "loss": 0.1294, + "step": 24000 + }, + { + "epoch": 0.61, + "grad_norm": 0.8544095754623413, + "learning_rate": 7.27886055337902e-07, + "loss": 0.126, + "step": 24050 + }, + { + "epoch": 0.61, + "grad_norm": 0.9304544925689697, + "learning_rate": 7.192243245784075e-07, + "loss": 0.1296, + "step": 24100 + }, + { + "epoch": 0.61, + "grad_norm": 0.8993602395057678, + "learning_rate": 7.106057722368012e-07, + "loss": 0.1225, + "step": 24150 + }, + { + "epoch": 0.61, + "grad_norm": 0.7909960150718689, + "learning_rate": 7.020306072868804e-07, + "loss": 0.1261, + "step": 24200 + }, + { + "epoch": 0.61, + "grad_norm": 0.8330161571502686, + "learning_rate": 6.934990376504269e-07, + "loss": 0.1242, + "step": 24250 + }, + { + "epoch": 0.62, + "grad_norm": 0.9058279395103455, + "learning_rate": 6.850112701921735e-07, + "loss": 0.1224, + "step": 24300 + }, + { + "epoch": 0.62, + "grad_norm": 0.9598106741905212, + "learning_rate": 6.76567510714777e-07, + "loss": 0.1261, + "step": 24350 + }, + { + "epoch": 0.62, + "grad_norm": 0.8675400018692017, + "learning_rate": 6.681679639538388e-07, + "loss": 0.1275, + "step": 24400 + }, + { + "epoch": 0.62, + "grad_norm": 0.7243128418922424, + "learning_rate": 6.598128335729332e-07, + "loss": 0.1301, + "step": 24450 + }, + { + "epoch": 0.62, + "grad_norm": 0.866690456867218, + "learning_rate": 6.515023221586722e-07, + "loss": 0.1276, + "step": 24500 + }, + { + "epoch": 0.62, + "grad_norm": 0.8871277570724487, + "learning_rate": 6.432366312157933e-07, + "loss": 0.1285, + "step": 24550 + }, + { + "epoch": 0.62, + "grad_norm": 0.6913353800773621, + "learning_rate": 6.35015961162273e-07, + "loss": 0.1207, + "step": 24600 + }, + { + "epoch": 0.62, + "grad_norm": 0.8807539939880371, + "learning_rate": 6.268405113244677e-07, + "loss": 0.1227, + "step": 24650 + }, + { + "epoch": 0.63, + "grad_norm": 0.9127259850502014, + "learning_rate": 6.187104799322805e-07, + "loss": 0.1234, + "step": 24700 + }, + { + "epoch": 0.63, + "grad_norm": 1.0802431106567383, + "learning_rate": 6.106260641143547e-07, + "loss": 0.1257, + "step": 24750 + }, + { + "epoch": 0.63, + "grad_norm": 0.8640583157539368, + "learning_rate": 6.025874598932937e-07, + "loss": 0.1195, + "step": 24800 + }, + { + "epoch": 0.63, + "grad_norm": 0.8064579963684082, + "learning_rate": 5.945948621809092e-07, + "loss": 0.1266, + "step": 24850 + }, + { + "epoch": 0.63, + "grad_norm": 0.8165306448936462, + "learning_rate": 5.866484647734935e-07, + "loss": 0.1229, + "step": 24900 + }, + { + "epoch": 0.63, + "grad_norm": 0.885006308555603, + "learning_rate": 5.787484603471221e-07, + "loss": 0.1215, + "step": 24950 + }, + { + "epoch": 0.63, + "grad_norm": 1.0080711841583252, + "learning_rate": 5.708950404529812e-07, + "loss": 0.1233, + "step": 25000 + }, + { + "epoch": 0.63, + "grad_norm": 0.8697314858436584, + "learning_rate": 5.630883955127211e-07, + "loss": 0.1215, + "step": 25050 + }, + { + "epoch": 0.64, + "grad_norm": 0.8583015203475952, + "learning_rate": 5.553287148138462e-07, + "loss": 0.1267, + "step": 25100 + }, + { + "epoch": 0.64, + "grad_norm": 0.771619439125061, + "learning_rate": 5.47616186505113e-07, + "loss": 0.1185, + "step": 25150 + }, + { + "epoch": 0.64, + "grad_norm": 0.8931272625923157, + "learning_rate": 5.399509975919828e-07, + "loss": 0.1232, + "step": 25200 + }, + { + "epoch": 0.64, + "grad_norm": 0.9257284998893738, + "learning_rate": 5.323333339320739e-07, + "loss": 0.126, + "step": 25250 + }, + { + "epoch": 0.64, + "grad_norm": 0.7422381639480591, + "learning_rate": 5.247633802306637e-07, + "loss": 0.1258, + "step": 25300 + }, + { + "epoch": 0.64, + "grad_norm": 0.8282724022865295, + "learning_rate": 5.172413200362092e-07, + "loss": 0.1244, + "step": 25350 + }, + { + "epoch": 0.64, + "grad_norm": 0.9066148996353149, + "learning_rate": 5.097673357358906e-07, + "loss": 0.1227, + "step": 25400 + }, + { + "epoch": 0.64, + "grad_norm": 1.0274531841278076, + "learning_rate": 5.023416085511976e-07, + "loss": 0.1206, + "step": 25450 + }, + { + "epoch": 0.65, + "grad_norm": 0.8062939047813416, + "learning_rate": 4.949643185335288e-07, + "loss": 0.121, + "step": 25500 + }, + { + "epoch": 0.65, + "grad_norm": 0.9104852080345154, + "learning_rate": 4.876356445598279e-07, + "loss": 0.1318, + "step": 25550 + }, + { + "epoch": 0.65, + "grad_norm": 0.9484885334968567, + "learning_rate": 4.803557643282486e-07, + "loss": 0.1215, + "step": 25600 + }, + { + "epoch": 0.65, + "grad_norm": 1.0123718976974487, + "learning_rate": 4.731248543538405e-07, + "loss": 0.1232, + "step": 25650 + }, + { + "epoch": 0.65, + "grad_norm": 0.7490533590316772, + "learning_rate": 4.6594308996427696e-07, + "loss": 0.1226, + "step": 25700 + }, + { + "epoch": 0.65, + "grad_norm": 0.867254912853241, + "learning_rate": 4.588106452955973e-07, + "loss": 0.1183, + "step": 25750 + }, + { + "epoch": 0.65, + "grad_norm": 0.889826238155365, + "learning_rate": 4.517276932879877e-07, + "loss": 0.1229, + "step": 25800 + }, + { + "epoch": 0.66, + "grad_norm": 0.8290361166000366, + "learning_rate": 4.446944056815866e-07, + "loss": 0.123, + "step": 25850 + }, + { + "epoch": 0.66, + "grad_norm": 0.8148057460784912, + "learning_rate": 4.377109530123216e-07, + "loss": 0.1275, + "step": 25900 + }, + { + "epoch": 0.66, + "grad_norm": 0.8523454666137695, + "learning_rate": 4.307775046077739e-07, + "loss": 0.1227, + "step": 25950 + }, + { + "epoch": 0.66, + "grad_norm": 0.8022044897079468, + "learning_rate": 4.2389422858307244e-07, + "loss": 0.1273, + "step": 26000 + }, + { + "epoch": 0.66, + "grad_norm": 0.7871876955032349, + "learning_rate": 4.1706129183681834e-07, + "loss": 0.1281, + "step": 26050 + }, + { + "epoch": 0.66, + "grad_norm": 0.8694405555725098, + "learning_rate": 4.10278860047037e-07, + "loss": 0.1202, + "step": 26100 + }, + { + "epoch": 0.66, + "grad_norm": 0.9443354606628418, + "learning_rate": 4.035470976671621e-07, + "loss": 0.1194, + "step": 26150 + }, + { + "epoch": 0.66, + "grad_norm": 0.8417434096336365, + "learning_rate": 3.9686616792204677e-07, + "loss": 0.1213, + "step": 26200 + }, + { + "epoch": 0.67, + "grad_norm": 0.855648398399353, + "learning_rate": 3.902362328040091e-07, + "loss": 0.1272, + "step": 26250 + }, + { + "epoch": 0.67, + "grad_norm": 0.8079158067703247, + "learning_rate": 3.836574530688983e-07, + "loss": 0.1237, + "step": 26300 + }, + { + "epoch": 0.67, + "grad_norm": 0.9010571837425232, + "learning_rate": 3.7712998823220243e-07, + "loss": 0.1268, + "step": 26350 + }, + { + "epoch": 0.67, + "grad_norm": 0.9768967032432556, + "learning_rate": 3.7065399656517955e-07, + "loss": 0.1226, + "step": 26400 + }, + { + "epoch": 0.67, + "grad_norm": 0.720100462436676, + "learning_rate": 3.6422963509101626e-07, + "loss": 0.1212, + "step": 26450 + }, + { + "epoch": 0.67, + "grad_norm": 0.8054412603378296, + "learning_rate": 3.578570595810274e-07, + "loss": 0.12, + "step": 26500 + }, + { + "epoch": 0.67, + "grad_norm": 0.7987850904464722, + "learning_rate": 3.515364245508704e-07, + "loss": 0.1205, + "step": 26550 + }, + { + "epoch": 0.67, + "grad_norm": 0.7818375825881958, + "learning_rate": 3.452678832568071e-07, + "loss": 0.1301, + "step": 26600 + }, + { + "epoch": 0.68, + "grad_norm": 0.8323056101799011, + "learning_rate": 3.390515876919831e-07, + "loss": 0.1208, + "step": 26650 + }, + { + "epoch": 0.68, + "grad_norm": 0.8122744560241699, + "learning_rate": 3.328876885827406e-07, + "loss": 0.121, + "step": 26700 + }, + { + "epoch": 0.68, + "grad_norm": 0.9000005125999451, + "learning_rate": 3.267763353849704e-07, + "loss": 0.1187, + "step": 26750 + }, + { + "epoch": 0.68, + "grad_norm": 0.7364488840103149, + "learning_rate": 3.207176762804814e-07, + "loss": 0.1228, + "step": 26800 + }, + { + "epoch": 0.68, + "grad_norm": 0.9550331234931946, + "learning_rate": 3.1471185817341153e-07, + "loss": 0.1197, + "step": 26850 + }, + { + "epoch": 0.68, + "grad_norm": 0.8683730959892273, + "learning_rate": 3.0875902668666386e-07, + "loss": 0.1279, + "step": 26900 + }, + { + "epoch": 0.68, + "grad_norm": 0.8578702807426453, + "learning_rate": 3.0285932615837646e-07, + "loss": 0.1212, + "step": 26950 + }, + { + "epoch": 0.68, + "grad_norm": 0.9135186076164246, + "learning_rate": 2.970128996384228e-07, + "loss": 0.1219, + "step": 27000 + }, + { + "epoch": 0.69, + "grad_norm": 0.8612381815910339, + "learning_rate": 2.9121988888494297e-07, + "loss": 0.1274, + "step": 27050 + }, + { + "epoch": 0.69, + "grad_norm": 0.8324002027511597, + "learning_rate": 2.854804343609058e-07, + "loss": 0.1282, + "step": 27100 + }, + { + "epoch": 0.69, + "grad_norm": 0.8618267774581909, + "learning_rate": 2.7979467523070484e-07, + "loss": 0.1219, + "step": 27150 + }, + { + "epoch": 0.69, + "grad_norm": 0.8603401780128479, + "learning_rate": 2.741627493567822e-07, + "loss": 0.1225, + "step": 27200 + }, + { + "epoch": 0.69, + "grad_norm": 1.0212452411651611, + "learning_rate": 2.685847932962868e-07, + "loss": 0.1225, + "step": 27250 + }, + { + "epoch": 0.69, + "grad_norm": 0.7207959294319153, + "learning_rate": 2.630609422977623e-07, + "loss": 0.1266, + "step": 27300 + }, + { + "epoch": 0.69, + "grad_norm": 0.9578468203544617, + "learning_rate": 2.575913302978697e-07, + "loss": 0.1274, + "step": 27350 + }, + { + "epoch": 0.69, + "grad_norm": 1.033157229423523, + "learning_rate": 2.5217608991813774e-07, + "loss": 0.116, + "step": 27400 + }, + { + "epoch": 0.7, + "grad_norm": 0.8012121915817261, + "learning_rate": 2.468153524617478e-07, + "loss": 0.121, + "step": 27450 + }, + { + "epoch": 0.7, + "grad_norm": 0.7251663208007812, + "learning_rate": 2.4150924791035037e-07, + "loss": 0.1213, + "step": 27500 + }, + { + "epoch": 0.7, + "grad_norm": 0.8772590160369873, + "learning_rate": 2.3625790492091544e-07, + "loss": 0.1205, + "step": 27550 + }, + { + "epoch": 0.7, + "grad_norm": 0.9037516713142395, + "learning_rate": 2.3106145082260777e-07, + "loss": 0.1234, + "step": 27600 + }, + { + "epoch": 0.7, + "grad_norm": 0.8488568067550659, + "learning_rate": 2.2592001161370392e-07, + "loss": 0.1203, + "step": 27650 + }, + { + "epoch": 0.7, + "grad_norm": 0.8486872911453247, + "learning_rate": 2.2083371195853797e-07, + "loss": 0.1309, + "step": 27700 + }, + { + "epoch": 0.7, + "grad_norm": 0.964940071105957, + "learning_rate": 2.158026751844733e-07, + "loss": 0.1228, + "step": 27750 + }, + { + "epoch": 0.7, + "grad_norm": 0.8965867161750793, + "learning_rate": 2.1082702327891918e-07, + "loss": 0.13, + "step": 27800 + }, + { + "epoch": 0.71, + "grad_norm": 0.8563171029090881, + "learning_rate": 2.0590687688636619e-07, + "loss": 0.1187, + "step": 27850 + }, + { + "epoch": 0.71, + "grad_norm": 1.000481367111206, + "learning_rate": 2.0104235530546745e-07, + "loss": 0.1267, + "step": 27900 + }, + { + "epoch": 0.71, + "grad_norm": 0.793648898601532, + "learning_rate": 1.9623357648614088e-07, + "loss": 0.1225, + "step": 27950 + }, + { + "epoch": 0.71, + "grad_norm": 0.8979523181915283, + "learning_rate": 1.914806570267111e-07, + "loss": 0.1234, + "step": 28000 + }, + { + "epoch": 0.71, + "grad_norm": 0.9331807494163513, + "learning_rate": 1.8678371217108387e-07, + "loss": 0.1197, + "step": 28050 + }, + { + "epoch": 0.71, + "grad_norm": 0.8871297240257263, + "learning_rate": 1.821428558059493e-07, + "loss": 0.1159, + "step": 28100 + }, + { + "epoch": 0.71, + "grad_norm": 1.0340665578842163, + "learning_rate": 1.7755820045802146e-07, + "loss": 0.1266, + "step": 28150 + }, + { + "epoch": 0.71, + "grad_norm": 1.055516004562378, + "learning_rate": 1.7302985729131e-07, + "loss": 0.1255, + "step": 28200 + }, + { + "epoch": 0.72, + "grad_norm": 0.8390622138977051, + "learning_rate": 1.6855793610442484e-07, + "loss": 0.1256, + "step": 28250 + }, + { + "epoch": 0.72, + "grad_norm": 0.8009083867073059, + "learning_rate": 1.6414254532791357e-07, + "loss": 0.1215, + "step": 28300 + }, + { + "epoch": 0.72, + "grad_norm": 1.0313583612442017, + "learning_rate": 1.5978379202163275e-07, + "loss": 0.1274, + "step": 28350 + }, + { + "epoch": 0.72, + "grad_norm": 0.9737227559089661, + "learning_rate": 1.554817818721513e-07, + "loss": 0.1248, + "step": 28400 + }, + { + "epoch": 0.72, + "grad_norm": 1.0153248310089111, + "learning_rate": 1.51236619190189e-07, + "loss": 0.1247, + "step": 28450 + }, + { + "epoch": 0.72, + "grad_norm": 0.8174459338188171, + "learning_rate": 1.4704840690808658e-07, + "loss": 0.122, + "step": 28500 + }, + { + "epoch": 0.72, + "grad_norm": 0.9065647125244141, + "learning_rate": 1.4291724657730904e-07, + "loss": 0.1245, + "step": 28550 + }, + { + "epoch": 0.72, + "grad_norm": 0.9131068587303162, + "learning_rate": 1.3884323836598656e-07, + "loss": 0.1199, + "step": 28600 + }, + { + "epoch": 0.73, + "grad_norm": 0.8409487009048462, + "learning_rate": 1.348264810564809e-07, + "loss": 0.1282, + "step": 28650 + }, + { + "epoch": 0.73, + "grad_norm": 0.8395945429801941, + "learning_rate": 1.3086707204299415e-07, + "loss": 0.1264, + "step": 28700 + }, + { + "epoch": 0.73, + "grad_norm": 0.8718541860580444, + "learning_rate": 1.269651073292058e-07, + "loss": 0.1204, + "step": 28750 + }, + { + "epoch": 0.73, + "grad_norm": 0.8746705055236816, + "learning_rate": 1.2312068152594448e-07, + "loss": 0.1244, + "step": 28800 + }, + { + "epoch": 0.73, + "grad_norm": 0.8849207758903503, + "learning_rate": 1.1933388784889617e-07, + "loss": 0.1215, + "step": 28850 + }, + { + "epoch": 0.73, + "grad_norm": 0.9171212315559387, + "learning_rate": 1.1560481811633911e-07, + "loss": 0.1274, + "step": 28900 + }, + { + "epoch": 0.73, + "grad_norm": 0.9320688247680664, + "learning_rate": 1.1193356274692424e-07, + "loss": 0.1211, + "step": 28950 + }, + { + "epoch": 0.73, + "grad_norm": 0.7832902073860168, + "learning_rate": 1.0832021075747712e-07, + "loss": 0.1257, + "step": 29000 + }, + { + "epoch": 0.74, + "grad_norm": 0.8894676566123962, + "learning_rate": 1.047648497608414e-07, + "loss": 0.1268, + "step": 29050 + }, + { + "epoch": 0.74, + "grad_norm": 0.7472261786460876, + "learning_rate": 1.0126756596375687e-07, + "loss": 0.1222, + "step": 29100 + }, + { + "epoch": 0.74, + "grad_norm": 0.9430793523788452, + "learning_rate": 9.782844416476423e-08, + "loss": 0.1219, + "step": 29150 + }, + { + "epoch": 0.74, + "grad_norm": 0.8395842909812927, + "learning_rate": 9.444756775215446e-08, + "loss": 0.1212, + "step": 29200 + }, + { + "epoch": 0.74, + "grad_norm": 0.8106001019477844, + "learning_rate": 9.112501870194273e-08, + "loss": 0.124, + "step": 29250 + }, + { + "epoch": 0.74, + "grad_norm": 0.9083417654037476, + "learning_rate": 8.786087757588269e-08, + "loss": 0.1269, + "step": 29300 + }, + { + "epoch": 0.74, + "grad_norm": 0.9193884134292603, + "learning_rate": 8.465522351951305e-08, + "loss": 0.1207, + "step": 29350 + }, + { + "epoch": 0.75, + "grad_norm": 0.9699887633323669, + "learning_rate": 8.150813426023752e-08, + "loss": 0.1245, + "step": 29400 + }, + { + "epoch": 0.75, + "grad_norm": 0.984941303730011, + "learning_rate": 7.841968610544121e-08, + "loss": 0.1228, + "step": 29450 + }, + { + "epoch": 0.75, + "grad_norm": 1.0322271585464478, + "learning_rate": 7.538995394063996e-08, + "loss": 0.1247, + "step": 29500 + }, + { + "epoch": 0.75, + "grad_norm": 0.9415401220321655, + "learning_rate": 7.241901122766515e-08, + "loss": 0.1209, + "step": 29550 + }, + { + "epoch": 0.75, + "grad_norm": 0.8878545165061951, + "learning_rate": 6.950693000288056e-08, + "loss": 0.1268, + "step": 29600 + }, + { + "epoch": 0.75, + "grad_norm": 0.8881374001502991, + "learning_rate": 6.665378087543889e-08, + "loss": 0.1239, + "step": 29650 + }, + { + "epoch": 0.75, + "grad_norm": 0.8045355081558228, + "learning_rate": 6.385963302556642e-08, + "loss": 0.1197, + "step": 29700 + }, + { + "epoch": 0.75, + "grad_norm": 0.8438431620597839, + "learning_rate": 6.112455420288821e-08, + "loss": 0.1242, + "step": 29750 + }, + { + "epoch": 0.76, + "grad_norm": 0.8916789293289185, + "learning_rate": 5.844861072478336e-08, + "loss": 0.1244, + "step": 29800 + }, + { + "epoch": 0.76, + "grad_norm": 0.8392596244812012, + "learning_rate": 5.583186747477848e-08, + "loss": 0.1234, + "step": 29850 + }, + { + "epoch": 0.76, + "grad_norm": 0.8782594203948975, + "learning_rate": 5.32743879009745e-08, + "loss": 0.1225, + "step": 29900 + }, + { + "epoch": 0.76, + "grad_norm": 0.8895407319068909, + "learning_rate": 5.077623401450599e-08, + "loss": 0.1212, + "step": 29950 + }, + { + "epoch": 0.76, + "grad_norm": 0.9181334376335144, + "learning_rate": 4.8337466388040935e-08, + "loss": 0.1203, + "step": 30000 + }, + { + "epoch": 0.76, + "grad_norm": 0.7273566126823425, + "learning_rate": 4.595814415430916e-08, + "loss": 0.1235, + "step": 30050 + }, + { + "epoch": 0.76, + "grad_norm": 0.8395570516586304, + "learning_rate": 4.3638325004670134e-08, + "loss": 0.1275, + "step": 30100 + }, + { + "epoch": 0.76, + "grad_norm": 0.9657353758811951, + "learning_rate": 4.1378065187714365e-08, + "loss": 0.1194, + "step": 30150 + }, + { + "epoch": 0.77, + "grad_norm": 0.8621689081192017, + "learning_rate": 3.917741950789727e-08, + "loss": 0.1219, + "step": 30200 + }, + { + "epoch": 0.77, + "grad_norm": 0.8802511692047119, + "learning_rate": 3.703644132421386e-08, + "loss": 0.1246, + "step": 30250 + }, + { + "epoch": 0.77, + "grad_norm": 0.8545549511909485, + "learning_rate": 3.4955182548901956e-08, + "loss": 0.1236, + "step": 30300 + }, + { + "epoch": 0.77, + "grad_norm": 0.9442453384399414, + "learning_rate": 3.293369364618465e-08, + "loss": 0.1255, + "step": 30350 + }, + { + "epoch": 0.77, + "grad_norm": 0.8238327503204346, + "learning_rate": 3.097202363104679e-08, + "loss": 0.1223, + "step": 30400 + }, + { + "epoch": 0.77, + "grad_norm": 0.9696635007858276, + "learning_rate": 2.9070220068045663e-08, + "loss": 0.1203, + "step": 30450 + }, + { + "epoch": 0.77, + "grad_norm": 0.760059118270874, + "learning_rate": 2.722832907015971e-08, + "loss": 0.1241, + "step": 30500 + }, + { + "epoch": 0.77, + "grad_norm": 0.9633733034133911, + "learning_rate": 2.544639529766829e-08, + "loss": 0.1282, + "step": 30550 + }, + { + "epoch": 0.78, + "grad_norm": 0.949769914150238, + "learning_rate": 2.3724461957068955e-08, + "loss": 0.1244, + "step": 30600 + }, + { + "epoch": 0.78, + "grad_norm": 0.8858135342597961, + "learning_rate": 2.206257080003188e-08, + "loss": 0.1241, + "step": 30650 + }, + { + "epoch": 0.78, + "grad_norm": 0.8067658543586731, + "learning_rate": 2.0460762122385124e-08, + "loss": 0.1162, + "step": 30700 + }, + { + "epoch": 0.78, + "grad_norm": 0.874151885509491, + "learning_rate": 1.8919074763138757e-08, + "loss": 0.1246, + "step": 30750 + }, + { + "epoch": 0.78, + "grad_norm": 0.8034783601760864, + "learning_rate": 1.7437546103542814e-08, + "loss": 0.1246, + "step": 30800 + }, + { + "epoch": 0.78, + "grad_norm": 0.9454385042190552, + "learning_rate": 1.6016212066181368e-08, + "loss": 0.122, + "step": 30850 + }, + { + "epoch": 0.78, + "grad_norm": 0.8052737712860107, + "learning_rate": 1.4655107114101008e-08, + "loss": 0.1256, + "step": 30900 + }, + { + "epoch": 0.78, + "grad_norm": 0.9731265902519226, + "learning_rate": 1.3354264249975379e-08, + "loss": 0.1246, + "step": 30950 + }, + { + "epoch": 0.79, + "grad_norm": 0.8946937918663025, + "learning_rate": 1.2113715015304728e-08, + "loss": 0.1245, + "step": 31000 + }, + { + "epoch": 0.79, + "grad_norm": 0.8165032863616943, + "learning_rate": 1.0933489489651783e-08, + "loss": 0.12, + "step": 31050 + }, + { + "epoch": 0.79, + "grad_norm": 0.8956162333488464, + "learning_rate": 9.81361628991151e-09, + "loss": 0.1226, + "step": 31100 + }, + { + "epoch": 0.79, + "grad_norm": 0.9210562705993652, + "learning_rate": 8.754122569618329e-09, + "loss": 0.123, + "step": 31150 + }, + { + "epoch": 0.79, + "grad_norm": 0.9154261946678162, + "learning_rate": 7.755034018286644e-09, + "loss": 0.1242, + "step": 31200 + }, + { + "epoch": 0.79, + "grad_norm": 0.7360591888427734, + "learning_rate": 6.816374860788566e-09, + "loss": 0.1215, + "step": 31250 + }, + { + "epoch": 0.79, + "grad_norm": 0.83289635181427, + "learning_rate": 5.938167856766319e-09, + "loss": 0.1224, + "step": 31300 + }, + { + "epoch": 0.79, + "grad_norm": 0.7892395257949829, + "learning_rate": 5.120434300080745e-09, + "loss": 0.1207, + "step": 31350 + }, + { + "epoch": 0.8, + "grad_norm": 0.9500446915626526, + "learning_rate": 4.363194018293937e-09, + "loss": 0.13, + "step": 31400 + }, + { + "epoch": 0.8, + "grad_norm": 0.8159528970718384, + "learning_rate": 3.666465372190453e-09, + "loss": 0.1224, + "step": 31450 + }, + { + "epoch": 0.8, + "grad_norm": 0.7910547852516174, + "learning_rate": 3.030265255329623e-09, + "loss": 0.1228, + "step": 31500 + }, + { + "epoch": 0.8, + "grad_norm": 0.8346930742263794, + "learning_rate": 2.4546090936383717e-09, + "loss": 0.1239, + "step": 31550 + }, + { + "epoch": 0.8, + "grad_norm": 0.8468943238258362, + "learning_rate": 1.9395108450351308e-09, + "loss": 0.119, + "step": 31600 + }, + { + "epoch": 0.8, + "grad_norm": 0.8092018961906433, + "learning_rate": 1.4849829990931653e-09, + "loss": 0.1221, + "step": 31650 + }, + { + "epoch": 0.8, + "grad_norm": 0.8936166763305664, + "learning_rate": 1.0910365767358155e-09, + "loss": 0.1218, + "step": 31700 + }, + { + "epoch": 0.8, + "grad_norm": 1.029935598373413, + "learning_rate": 7.576811299714326e-10, + "loss": 0.1292, + "step": 31750 + }, + { + "epoch": 0.81, + "grad_norm": 0.8104945421218872, + "learning_rate": 4.849247416599534e-10, + "loss": 0.1189, + "step": 31800 + }, + { + "epoch": 0.81, + "grad_norm": 0.7662272453308105, + "learning_rate": 2.727740253177791e-10, + "loss": 0.1243, + "step": 31850 + }, + { + "epoch": 0.81, + "grad_norm": 0.8399808406829834, + "learning_rate": 1.2123412495762543e-10, + "loss": 0.1229, + "step": 31900 + }, + { + "epoch": 0.81, + "grad_norm": 0.7275273203849792, + "learning_rate": 3.0308714963067644e-11, + "loss": 0.1182, + "step": 31950 + }, + { + "epoch": 0.81, + "grad_norm": 0.9211012721061707, + "learning_rate": 0.0, + "loss": 0.1208, + "step": 32000 + } + ], + "logging_steps": 50, + "max_steps": 32000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3200, + "total_flos": 2.2216620291373056e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}