|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.5257033857892228,
  "eval_steps": 500,
  "global_step": 4000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0019074868860276585,
      "grad_norm": 5.96875,
      "learning_rate": 1.9987282207808727e-05,
      "loss": 1.8153,
      "mean_token_accuracy": 0.5708044067025184,
      "step": 5
    },
    {
      "epoch": 0.003814973772055317,
      "grad_norm": 5.40625,
      "learning_rate": 1.997456441561745e-05,
      "loss": 1.5088,
      "mean_token_accuracy": 0.6002451926469803,
      "step": 10
    },
    {
      "epoch": 0.005722460658082976,
      "grad_norm": 5.5625,
      "learning_rate": 1.9961846623426175e-05,
      "loss": 1.5696,
      "mean_token_accuracy": 0.5999565117061139,
      "step": 15
    },
    {
      "epoch": 0.007629947544110634,
      "grad_norm": 6.0,
      "learning_rate": 1.9949128831234897e-05,
      "loss": 1.4962,
      "mean_token_accuracy": 0.6145946934819222,
      "step": 20
    },
    {
      "epoch": 0.009537434430138292,
      "grad_norm": 5.0625,
      "learning_rate": 1.9936411039043622e-05,
      "loss": 1.4145,
      "mean_token_accuracy": 0.6282299846410752,
      "step": 25
    },
    {
      "epoch": 0.011444921316165951,
      "grad_norm": 5.75,
      "learning_rate": 1.9923693246852348e-05,
      "loss": 1.4615,
      "mean_token_accuracy": 0.6196133770048619,
      "step": 30
    },
    {
      "epoch": 0.01335240820219361,
      "grad_norm": 4.84375,
      "learning_rate": 1.9910975454661073e-05,
      "loss": 1.4118,
      "mean_token_accuracy": 0.6272433631122112,
      "step": 35
    },
    {
      "epoch": 0.015259895088221268,
      "grad_norm": 5.0625,
      "learning_rate": 1.98982576624698e-05,
      "loss": 1.3874,
      "mean_token_accuracy": 0.6295698702335357,
      "step": 40
    },
    {
      "epoch": 0.017167381974248927,
      "grad_norm": 5.46875,
      "learning_rate": 1.988553987027852e-05,
      "loss": 1.4997,
      "mean_token_accuracy": 0.6131108298897743,
      "step": 45
    },
    {
      "epoch": 0.019074868860276584,
      "grad_norm": 4.59375,
      "learning_rate": 1.9872822078087246e-05,
      "loss": 1.3386,
      "mean_token_accuracy": 0.6355413243174552,
      "step": 50
    },
    {
      "epoch": 0.020982355746304245,
      "grad_norm": 4.46875,
      "learning_rate": 1.986010428589597e-05,
      "loss": 1.3453,
      "mean_token_accuracy": 0.6322750248014927,
      "step": 55
    },
    {
      "epoch": 0.022889842632331903,
      "grad_norm": 5.0625,
      "learning_rate": 1.9847386493704694e-05,
      "loss": 1.4213,
      "mean_token_accuracy": 0.6266717866063118,
      "step": 60
    },
    {
      "epoch": 0.02479732951835956,
      "grad_norm": 5.5625,
      "learning_rate": 1.983466870151342e-05,
      "loss": 1.3913,
      "mean_token_accuracy": 0.6290012784302235,
      "step": 65
    },
    {
      "epoch": 0.02670481640438722,
      "grad_norm": 5.0625,
      "learning_rate": 1.982195090932214e-05,
      "loss": 1.3871,
      "mean_token_accuracy": 0.627622963488102,
      "step": 70
    },
    {
      "epoch": 0.02861230329041488,
      "grad_norm": 4.84375,
      "learning_rate": 1.9809233117130867e-05,
      "loss": 1.3232,
      "mean_token_accuracy": 0.6432097807526589,
      "step": 75
    },
    {
      "epoch": 0.030519790176442536,
      "grad_norm": 5.96875,
      "learning_rate": 1.9796515324939593e-05,
      "loss": 1.3287,
      "mean_token_accuracy": 0.6405125185847282,
      "step": 80
    },
    {
      "epoch": 0.03242727706247019,
      "grad_norm": 4.78125,
      "learning_rate": 1.9783797532748318e-05,
      "loss": 1.4044,
      "mean_token_accuracy": 0.6292843967676163,
      "step": 85
    },
    {
      "epoch": 0.034334763948497854,
      "grad_norm": 4.6875,
      "learning_rate": 1.977107974055704e-05,
      "loss": 1.3368,
      "mean_token_accuracy": 0.6363929770886898,
      "step": 90
    },
    {
      "epoch": 0.036242250834525515,
      "grad_norm": 4.6875,
      "learning_rate": 1.9758361948365766e-05,
      "loss": 1.3156,
      "mean_token_accuracy": 0.6389718689024448,
      "step": 95
    },
    {
      "epoch": 0.03814973772055317,
      "grad_norm": 4.84375,
      "learning_rate": 1.974564415617449e-05,
      "loss": 1.3213,
      "mean_token_accuracy": 0.6437985837459564,
      "step": 100
    },
    {
      "epoch": 0.04005722460658083,
      "grad_norm": 5.34375,
      "learning_rate": 1.9732926363983213e-05,
      "loss": 1.3403,
      "mean_token_accuracy": 0.6304401338100434,
      "step": 105
    },
    {
      "epoch": 0.04196471149260849,
      "grad_norm": 5.21875,
      "learning_rate": 1.972020857179194e-05,
      "loss": 1.4318,
      "mean_token_accuracy": 0.6314706668257714,
      "step": 110
    },
    {
      "epoch": 0.043872198378636144,
      "grad_norm": 5.625,
      "learning_rate": 1.970749077960066e-05,
      "loss": 1.3123,
      "mean_token_accuracy": 0.6482092589139938,
      "step": 115
    },
    {
      "epoch": 0.045779685264663805,
      "grad_norm": 5.03125,
      "learning_rate": 1.9694772987409387e-05,
      "loss": 1.2722,
      "mean_token_accuracy": 0.6475286811590195,
      "step": 120
    },
    {
      "epoch": 0.047687172150691466,
      "grad_norm": 4.78125,
      "learning_rate": 1.9682055195218112e-05,
      "loss": 1.3184,
      "mean_token_accuracy": 0.636443517357111,
      "step": 125
    },
    {
      "epoch": 0.04959465903671912,
      "grad_norm": 4.75,
      "learning_rate": 1.9669337403026834e-05,
      "loss": 1.3604,
      "mean_token_accuracy": 0.6383991658687591,
      "step": 130
    },
    {
      "epoch": 0.05150214592274678,
      "grad_norm": 6.3125,
      "learning_rate": 1.965661961083556e-05,
      "loss": 1.3624,
      "mean_token_accuracy": 0.6359823271632195,
      "step": 135
    },
    {
      "epoch": 0.05340963280877444,
      "grad_norm": 4.9375,
      "learning_rate": 1.9643901818644285e-05,
      "loss": 1.3622,
      "mean_token_accuracy": 0.6360629022121429,
      "step": 140
    },
    {
      "epoch": 0.055317119694802096,
      "grad_norm": 5.25,
      "learning_rate": 1.963118402645301e-05,
      "loss": 1.3007,
      "mean_token_accuracy": 0.6486773908138275,
      "step": 145
    },
    {
      "epoch": 0.05722460658082976,
      "grad_norm": 4.625,
      "learning_rate": 1.9618466234261733e-05,
      "loss": 1.259,
      "mean_token_accuracy": 0.6483664289116859,
      "step": 150
    },
    {
      "epoch": 0.05913209346685742,
      "grad_norm": 4.4375,
      "learning_rate": 1.9605748442070458e-05,
      "loss": 1.2701,
      "mean_token_accuracy": 0.6497010916471482,
      "step": 155
    },
    {
      "epoch": 0.06103958035288507,
      "grad_norm": 5.0625,
      "learning_rate": 1.9593030649879184e-05,
      "loss": 1.2925,
      "mean_token_accuracy": 0.645444954931736,
      "step": 160
    },
    {
      "epoch": 0.06294706723891273,
      "grad_norm": 4.46875,
      "learning_rate": 1.9580312857687906e-05,
      "loss": 1.3194,
      "mean_token_accuracy": 0.6443428501486779,
      "step": 165
    },
    {
      "epoch": 0.06485455412494039,
      "grad_norm": 5.15625,
      "learning_rate": 1.956759506549663e-05,
      "loss": 1.2786,
      "mean_token_accuracy": 0.6503037214279175,
      "step": 170
    },
    {
      "epoch": 0.06676204101096805,
      "grad_norm": 5.65625,
      "learning_rate": 1.9554877273305353e-05,
      "loss": 1.3482,
      "mean_token_accuracy": 0.6470891699194908,
      "step": 175
    },
    {
      "epoch": 0.06866952789699571,
      "grad_norm": 4.21875,
      "learning_rate": 1.954215948111408e-05,
      "loss": 1.3022,
      "mean_token_accuracy": 0.6400970220565796,
      "step": 180
    },
    {
      "epoch": 0.07057701478302336,
      "grad_norm": 5.03125,
      "learning_rate": 1.9529441688922804e-05,
      "loss": 1.2821,
      "mean_token_accuracy": 0.6461471430957317,
      "step": 185
    },
    {
      "epoch": 0.07248450166905103,
      "grad_norm": 5.0,
      "learning_rate": 1.951672389673153e-05,
      "loss": 1.2824,
      "mean_token_accuracy": 0.649085208773613,
      "step": 190
    },
    {
      "epoch": 0.07439198855507868,
      "grad_norm": 4.5,
      "learning_rate": 1.9504006104540255e-05,
      "loss": 1.2508,
      "mean_token_accuracy": 0.6536262959241868,
      "step": 195
    },
    {
      "epoch": 0.07629947544110634,
      "grad_norm": 4.9375,
      "learning_rate": 1.9491288312348978e-05,
      "loss": 1.3044,
      "mean_token_accuracy": 0.6513546489179134,
      "step": 200
    },
    {
      "epoch": 0.078206962327134,
      "grad_norm": 5.125,
      "learning_rate": 1.9478570520157703e-05,
      "loss": 1.2265,
      "mean_token_accuracy": 0.6645126178860664,
      "step": 205
    },
    {
      "epoch": 0.08011444921316166,
      "grad_norm": 5.875,
      "learning_rate": 1.9465852727966425e-05,
      "loss": 1.3156,
      "mean_token_accuracy": 0.6494258716702461,
      "step": 210
    },
    {
      "epoch": 0.08202193609918931,
      "grad_norm": 5.0,
      "learning_rate": 1.945313493577515e-05,
      "loss": 1.1655,
      "mean_token_accuracy": 0.6783517330884934,
      "step": 215
    },
    {
      "epoch": 0.08392942298521698,
      "grad_norm": 5.4375,
      "learning_rate": 1.9440417143583876e-05,
      "loss": 1.2306,
      "mean_token_accuracy": 0.6659620314836502,
      "step": 220
    },
    {
      "epoch": 0.08583690987124463,
      "grad_norm": 4.40625,
      "learning_rate": 1.9427699351392598e-05,
      "loss": 1.3798,
      "mean_token_accuracy": 0.6320724219083786,
      "step": 225
    },
    {
      "epoch": 0.08774439675727229,
      "grad_norm": 4.75,
      "learning_rate": 1.9414981559201324e-05,
      "loss": 1.2437,
      "mean_token_accuracy": 0.6454930439591408,
      "step": 230
    },
    {
      "epoch": 0.08965188364329996,
      "grad_norm": 5.90625,
      "learning_rate": 1.940226376701005e-05,
      "loss": 1.2523,
      "mean_token_accuracy": 0.6506562553346157,
      "step": 235
    },
    {
      "epoch": 0.09155937052932761,
      "grad_norm": 4.75,
      "learning_rate": 1.938954597481877e-05,
      "loss": 1.2268,
      "mean_token_accuracy": 0.6620514318346977,
      "step": 240
    },
    {
      "epoch": 0.09346685741535526,
      "grad_norm": 4.53125,
      "learning_rate": 1.9376828182627497e-05,
      "loss": 1.2215,
      "mean_token_accuracy": 0.6610720351338386,
      "step": 245
    },
    {
      "epoch": 0.09537434430138293,
      "grad_norm": 5.15625,
      "learning_rate": 1.9364110390436222e-05,
      "loss": 1.2494,
      "mean_token_accuracy": 0.653127409517765,
      "step": 250
    },
    {
      "epoch": 0.09728183118741059,
      "grad_norm": 5.1875,
      "learning_rate": 1.9351392598244948e-05,
      "loss": 1.3126,
      "mean_token_accuracy": 0.6540979892015457,
      "step": 255
    },
    {
      "epoch": 0.09918931807343824,
      "grad_norm": 5.15625,
      "learning_rate": 1.933867480605367e-05,
      "loss": 1.2498,
      "mean_token_accuracy": 0.6507880866527558,
      "step": 260
    },
    {
      "epoch": 0.10109680495946591,
      "grad_norm": 5.8125,
      "learning_rate": 1.9325957013862396e-05,
      "loss": 1.3294,
      "mean_token_accuracy": 0.6403125211596489,
      "step": 265
    },
    {
      "epoch": 0.10300429184549356,
      "grad_norm": 6.09375,
      "learning_rate": 1.9313239221671118e-05,
      "loss": 1.3099,
      "mean_token_accuracy": 0.6520273745059967,
      "step": 270
    },
    {
      "epoch": 0.10491177873152122,
      "grad_norm": 5.40625,
      "learning_rate": 1.9300521429479843e-05,
      "loss": 1.2857,
      "mean_token_accuracy": 0.6506179749965668,
      "step": 275
    },
    {
      "epoch": 0.10681926561754888,
      "grad_norm": 5.71875,
      "learning_rate": 1.928780363728857e-05,
      "loss": 1.3496,
      "mean_token_accuracy": 0.6408744707703591,
      "step": 280
    },
    {
      "epoch": 0.10872675250357654,
      "grad_norm": 5.0625,
      "learning_rate": 1.927508584509729e-05,
      "loss": 1.2012,
      "mean_token_accuracy": 0.6639982044696808,
      "step": 285
    },
    {
      "epoch": 0.11063423938960419,
      "grad_norm": 5.5,
      "learning_rate": 1.9262368052906016e-05,
      "loss": 1.2449,
      "mean_token_accuracy": 0.6594835132360458,
      "step": 290
    },
    {
      "epoch": 0.11254172627563186,
      "grad_norm": 4.125,
      "learning_rate": 1.9249650260714742e-05,
      "loss": 1.1314,
      "mean_token_accuracy": 0.6807294517755509,
      "step": 295
    },
    {
      "epoch": 0.11444921316165951,
      "grad_norm": 5.25,
      "learning_rate": 1.9236932468523467e-05,
      "loss": 1.2528,
      "mean_token_accuracy": 0.6543513402342797,
      "step": 300
    },
    {
      "epoch": 0.11635670004768717,
      "grad_norm": 4.5,
      "learning_rate": 1.9224214676332193e-05,
      "loss": 1.201,
      "mean_token_accuracy": 0.6674724757671356,
      "step": 305
    },
    {
      "epoch": 0.11826418693371483,
      "grad_norm": 5.09375,
      "learning_rate": 1.9211496884140915e-05,
      "loss": 1.2232,
      "mean_token_accuracy": 0.6661024749279022,
      "step": 310
    },
    {
      "epoch": 0.12017167381974249,
      "grad_norm": 4.65625,
      "learning_rate": 1.919877909194964e-05,
      "loss": 1.2015,
      "mean_token_accuracy": 0.6703806266188621,
      "step": 315
    },
    {
      "epoch": 0.12207916070577014,
      "grad_norm": 4.90625,
      "learning_rate": 1.9186061299758362e-05,
      "loss": 1.265,
      "mean_token_accuracy": 0.6571035169064998,
      "step": 320
    },
    {
      "epoch": 0.12398664759179781,
      "grad_norm": 4.5625,
      "learning_rate": 1.9173343507567088e-05,
      "loss": 1.1817,
      "mean_token_accuracy": 0.6720203042030335,
      "step": 325
    },
    {
      "epoch": 0.12589413447782546,
      "grad_norm": 5.40625,
      "learning_rate": 1.916062571537581e-05,
      "loss": 1.3305,
      "mean_token_accuracy": 0.6445886738598346,
      "step": 330
    },
    {
      "epoch": 0.12780162136385312,
      "grad_norm": 5.34375,
      "learning_rate": 1.9147907923184536e-05,
      "loss": 1.2302,
      "mean_token_accuracy": 0.6632393077015877,
      "step": 335
    },
    {
      "epoch": 0.12970910824988077,
      "grad_norm": 4.40625,
      "learning_rate": 1.913519013099326e-05,
      "loss": 1.2463,
      "mean_token_accuracy": 0.6618235319852829,
      "step": 340
    },
    {
      "epoch": 0.13161659513590845,
      "grad_norm": 4.84375,
      "learning_rate": 1.9122472338801987e-05,
      "loss": 1.1995,
      "mean_token_accuracy": 0.6734424993395806,
      "step": 345
    },
    {
      "epoch": 0.1335240820219361,
      "grad_norm": 6.09375,
      "learning_rate": 1.9109754546610712e-05,
      "loss": 1.1695,
      "mean_token_accuracy": 0.6744470730423927,
      "step": 350
    },
    {
      "epoch": 0.13543156890796376,
      "grad_norm": 4.4375,
      "learning_rate": 1.9097036754419434e-05,
      "loss": 1.235,
      "mean_token_accuracy": 0.6613016352057457,
      "step": 355
    },
    {
      "epoch": 0.13733905579399142,
      "grad_norm": 5.15625,
      "learning_rate": 1.908431896222816e-05,
      "loss": 1.1908,
      "mean_token_accuracy": 0.6631178431212902,
      "step": 360
    },
    {
      "epoch": 0.13924654268001907,
      "grad_norm": 6.03125,
      "learning_rate": 1.9071601170036885e-05,
      "loss": 1.2538,
      "mean_token_accuracy": 0.6583275809884072,
      "step": 365
    },
    {
      "epoch": 0.14115402956604672,
      "grad_norm": 4.875,
      "learning_rate": 1.9058883377845607e-05,
      "loss": 1.1481,
      "mean_token_accuracy": 0.6743377096951008,
      "step": 370
    },
    {
      "epoch": 0.1430615164520744,
      "grad_norm": 4.65625,
      "learning_rate": 1.9046165585654333e-05,
      "loss": 1.1758,
      "mean_token_accuracy": 0.6755423441529274,
      "step": 375
    },
    {
      "epoch": 0.14496900333810206,
      "grad_norm": 4.21875,
      "learning_rate": 1.9033447793463055e-05,
      "loss": 1.1526,
      "mean_token_accuracy": 0.6709615409374237,
      "step": 380
    },
    {
      "epoch": 0.1468764902241297,
      "grad_norm": 5.34375,
      "learning_rate": 1.902073000127178e-05,
      "loss": 1.1614,
      "mean_token_accuracy": 0.6803930580615998,
      "step": 385
    },
    {
      "epoch": 0.14878397711015737,
      "grad_norm": 5.15625,
      "learning_rate": 1.9008012209080503e-05,
      "loss": 1.1651,
      "mean_token_accuracy": 0.6757953256368637,
      "step": 390
    },
    {
      "epoch": 0.15069146399618502,
      "grad_norm": 5.65625,
      "learning_rate": 1.8995294416889228e-05,
      "loss": 1.2219,
      "mean_token_accuracy": 0.6725533396005631,
      "step": 395
    },
    {
      "epoch": 0.15259895088221268,
      "grad_norm": 4.90625,
      "learning_rate": 1.8982576624697954e-05,
      "loss": 1.2598,
      "mean_token_accuracy": 0.6562705941498279,
      "step": 400
    },
    {
      "epoch": 0.15450643776824036,
      "grad_norm": 4.59375,
      "learning_rate": 1.896985883250668e-05,
      "loss": 1.1586,
      "mean_token_accuracy": 0.6731373474001885,
      "step": 405
    },
    {
      "epoch": 0.156413924654268,
      "grad_norm": 5.65625,
      "learning_rate": 1.8957141040315405e-05,
      "loss": 1.2544,
      "mean_token_accuracy": 0.6664155155420304,
      "step": 410
    },
    {
      "epoch": 0.15832141154029566,
      "grad_norm": 5.09375,
      "learning_rate": 1.8944423248124127e-05,
      "loss": 1.2045,
      "mean_token_accuracy": 0.6721395581960679,
      "step": 415
    },
    {
      "epoch": 0.16022889842632332,
      "grad_norm": 4.65625,
      "learning_rate": 1.8931705455932852e-05,
      "loss": 1.1623,
      "mean_token_accuracy": 0.6825975701212883,
      "step": 420
    },
    {
      "epoch": 0.16213638531235097,
      "grad_norm": 5.25,
      "learning_rate": 1.8918987663741578e-05,
      "loss": 1.2345,
      "mean_token_accuracy": 0.655138723552227,
      "step": 425
    },
    {
      "epoch": 0.16404387219837863,
      "grad_norm": 4.46875,
      "learning_rate": 1.89062698715503e-05,
      "loss": 1.1583,
      "mean_token_accuracy": 0.6771410465240478,
      "step": 430
    },
    {
      "epoch": 0.1659513590844063,
      "grad_norm": 4.34375,
      "learning_rate": 1.8893552079359025e-05,
      "loss": 1.2226,
      "mean_token_accuracy": 0.6566869288682937,
      "step": 435
    },
    {
      "epoch": 0.16785884597043396,
      "grad_norm": 4.8125,
      "learning_rate": 1.8880834287167747e-05,
      "loss": 1.2531,
      "mean_token_accuracy": 0.6537335075438022,
      "step": 440
    },
    {
      "epoch": 0.16976633285646162,
      "grad_norm": 5.1875,
      "learning_rate": 1.8868116494976473e-05,
      "loss": 1.1736,
      "mean_token_accuracy": 0.6741863384842872,
      "step": 445
    },
    {
      "epoch": 0.17167381974248927,
      "grad_norm": 5.0,
      "learning_rate": 1.88553987027852e-05,
      "loss": 1.1912,
      "mean_token_accuracy": 0.6718283355236053,
      "step": 450
    },
    {
      "epoch": 0.17358130662851692,
      "grad_norm": 5.03125,
      "learning_rate": 1.8842680910593924e-05,
      "loss": 1.2016,
      "mean_token_accuracy": 0.670920492708683,
      "step": 455
    },
    {
      "epoch": 0.17548879351454458,
      "grad_norm": 5.375,
      "learning_rate": 1.882996311840265e-05,
      "loss": 1.1818,
      "mean_token_accuracy": 0.6705762408673763,
      "step": 460
    },
    {
      "epoch": 0.17739628040057226,
      "grad_norm": 5.9375,
      "learning_rate": 1.881724532621137e-05,
      "loss": 1.3159,
      "mean_token_accuracy": 0.6546284504234791,
      "step": 465
    },
    {
      "epoch": 0.1793037672865999,
      "grad_norm": 4.8125,
      "learning_rate": 1.8804527534020097e-05,
      "loss": 1.2004,
      "mean_token_accuracy": 0.6597321718931198,
      "step": 470
    },
    {
      "epoch": 0.18121125417262757,
      "grad_norm": 6.9375,
      "learning_rate": 1.879180974182882e-05,
      "loss": 1.1672,
      "mean_token_accuracy": 0.6835288152098655,
      "step": 475
    },
    {
      "epoch": 0.18311874105865522,
      "grad_norm": 5.0625,
      "learning_rate": 1.8779091949637545e-05,
      "loss": 1.2113,
      "mean_token_accuracy": 0.6654989182949066,
      "step": 480
    },
    {
      "epoch": 0.18502622794468288,
      "grad_norm": 5.09375,
      "learning_rate": 1.876637415744627e-05,
      "loss": 1.076,
      "mean_token_accuracy": 0.6919501051306725,
      "step": 485
    },
    {
      "epoch": 0.18693371483071053,
      "grad_norm": 4.96875,
      "learning_rate": 1.8753656365254992e-05,
      "loss": 1.1946,
      "mean_token_accuracy": 0.671660166978836,
      "step": 490
    },
    {
      "epoch": 0.1888412017167382,
      "grad_norm": 5.875,
      "learning_rate": 1.8740938573063718e-05,
      "loss": 1.2334,
      "mean_token_accuracy": 0.6634502306580543,
      "step": 495
    },
    {
      "epoch": 0.19074868860276586,
      "grad_norm": 5.34375,
      "learning_rate": 1.872822078087244e-05,
      "loss": 1.1412,
      "mean_token_accuracy": 0.6843819186091423,
      "step": 500
    },
    {
      "epoch": 0.19265617548879352,
      "grad_norm": 4.875,
      "learning_rate": 1.8715502988681165e-05,
      "loss": 1.1017,
      "mean_token_accuracy": 0.6859075799584389,
      "step": 505
    },
    {
      "epoch": 0.19456366237482117,
      "grad_norm": 5.0,
      "learning_rate": 1.870278519648989e-05,
      "loss": 1.099,
      "mean_token_accuracy": 0.6794175133109093,
      "step": 510
    },
    {
      "epoch": 0.19647114926084883,
      "grad_norm": 5.0,
      "learning_rate": 1.8690067404298616e-05,
      "loss": 1.162,
      "mean_token_accuracy": 0.6769187614321709,
      "step": 515
    },
    {
      "epoch": 0.19837863614687648,
      "grad_norm": 4.21875,
      "learning_rate": 1.8677349612107342e-05,
      "loss": 1.1241,
      "mean_token_accuracy": 0.6816845044493676,
      "step": 520
    },
    {
      "epoch": 0.20028612303290416,
      "grad_norm": 7.09375,
      "learning_rate": 1.8664631819916064e-05,
      "loss": 1.1331,
      "mean_token_accuracy": 0.6768893599510193,
      "step": 525
    },
    {
      "epoch": 0.20219360991893182,
      "grad_norm": 4.25,
      "learning_rate": 1.865191402772479e-05,
      "loss": 1.2657,
      "mean_token_accuracy": 0.6479664385318756,
      "step": 530
    },
    {
      "epoch": 0.20410109680495947,
      "grad_norm": 4.5,
      "learning_rate": 1.863919623553351e-05,
      "loss": 1.2343,
      "mean_token_accuracy": 0.6734806634485722,
      "step": 535
    },
    {
      "epoch": 0.20600858369098712,
      "grad_norm": 12.3125,
      "learning_rate": 1.8626478443342237e-05,
      "loss": 1.1263,
      "mean_token_accuracy": 0.6808793410658837,
      "step": 540
    },
    {
      "epoch": 0.20791607057701478,
      "grad_norm": 5.9375,
      "learning_rate": 1.8613760651150963e-05,
      "loss": 1.0617,
      "mean_token_accuracy": 0.6977316424250603,
      "step": 545
    },
    {
      "epoch": 0.20982355746304243,
      "grad_norm": 5.96875,
      "learning_rate": 1.8601042858959685e-05,
      "loss": 1.1229,
      "mean_token_accuracy": 0.6831489652395248,
      "step": 550
    },
    {
      "epoch": 0.2117310443490701,
      "grad_norm": 4.84375,
      "learning_rate": 1.858832506676841e-05,
      "loss": 1.1216,
      "mean_token_accuracy": 0.6838791735470295,
      "step": 555
    },
    {
      "epoch": 0.21363853123509777,
      "grad_norm": 4.78125,
      "learning_rate": 1.8575607274577136e-05,
      "loss": 1.2059,
      "mean_token_accuracy": 0.6669023260474205,
      "step": 560
    },
    {
      "epoch": 0.21554601812112542,
      "grad_norm": 4.875,
      "learning_rate": 1.856288948238586e-05,
      "loss": 1.1515,
      "mean_token_accuracy": 0.6748893111944199,
      "step": 565
    },
    {
      "epoch": 0.21745350500715308,
      "grad_norm": 4.21875,
      "learning_rate": 1.8550171690194583e-05,
      "loss": 1.063,
      "mean_token_accuracy": 0.7043292924761773,
      "step": 570
    },
    {
      "epoch": 0.21936099189318073,
      "grad_norm": 5.34375,
      "learning_rate": 1.853745389800331e-05,
      "loss": 1.1071,
      "mean_token_accuracy": 0.6816813468933105,
      "step": 575
    },
    {
      "epoch": 0.22126847877920838,
      "grad_norm": 7.25,
      "learning_rate": 1.8524736105812034e-05,
      "loss": 1.1493,
      "mean_token_accuracy": 0.6842595711350441,
      "step": 580
    },
    {
      "epoch": 0.22317596566523606,
      "grad_norm": 6.3125,
      "learning_rate": 1.8512018313620756e-05,
      "loss": 1.2036,
      "mean_token_accuracy": 0.6606533020734787,
      "step": 585
    },
    {
      "epoch": 0.22508345255126372,
      "grad_norm": 5.1875,
      "learning_rate": 1.8499300521429482e-05,
      "loss": 1.0329,
      "mean_token_accuracy": 0.7044132232666016,
      "step": 590
    },
    {
      "epoch": 0.22699093943729137,
      "grad_norm": 4.90625,
      "learning_rate": 1.8486582729238204e-05,
      "loss": 1.183,
      "mean_token_accuracy": 0.6742020189762116,
      "step": 595
    },
    {
      "epoch": 0.22889842632331903,
      "grad_norm": 5.375,
      "learning_rate": 1.847386493704693e-05,
      "loss": 1.1212,
      "mean_token_accuracy": 0.6900173485279083,
      "step": 600
    },
    {
      "epoch": 0.23080591320934668,
      "grad_norm": 5.5,
      "learning_rate": 1.8461147144855655e-05,
      "loss": 1.1654,
      "mean_token_accuracy": 0.6756381630897522,
      "step": 605
    },
    {
      "epoch": 0.23271340009537433,
      "grad_norm": 4.8125,
      "learning_rate": 1.8448429352664377e-05,
      "loss": 1.1592,
      "mean_token_accuracy": 0.6697646602988243,
      "step": 610
    },
    {
      "epoch": 0.23462088698140202,
      "grad_norm": 4.84375,
      "learning_rate": 1.8435711560473103e-05,
      "loss": 1.0309,
      "mean_token_accuracy": 0.6977804109454155,
      "step": 615
    },
    {
      "epoch": 0.23652837386742967,
      "grad_norm": 5.78125,
      "learning_rate": 1.8422993768281828e-05,
      "loss": 1.1705,
      "mean_token_accuracy": 0.6789061531424523,
      "step": 620
    },
    {
      "epoch": 0.23843586075345732,
      "grad_norm": 4.84375,
      "learning_rate": 1.8410275976090554e-05,
      "loss": 1.1715,
      "mean_token_accuracy": 0.6736443802714348,
      "step": 625
    },
    {
      "epoch": 0.24034334763948498,
      "grad_norm": 4.6875,
      "learning_rate": 1.8397558183899276e-05,
      "loss": 1.1535,
      "mean_token_accuracy": 0.6837850168347359,
      "step": 630
    },
    {
      "epoch": 0.24225083452551263,
      "grad_norm": 5.625,
      "learning_rate": 1.8384840391708e-05,
      "loss": 1.1403,
      "mean_token_accuracy": 0.6860886000096797,
      "step": 635
    },
    {
      "epoch": 0.24415832141154029,
      "grad_norm": 5.78125,
      "learning_rate": 1.8372122599516727e-05,
      "loss": 1.1777,
      "mean_token_accuracy": 0.6730990558862686,
      "step": 640
    },
    {
      "epoch": 0.24606580829756797,
      "grad_norm": 4.96875,
      "learning_rate": 1.835940480732545e-05,
      "loss": 1.2174,
      "mean_token_accuracy": 0.6607669338583946,
      "step": 645
    },
    {
      "epoch": 0.24797329518359562,
      "grad_norm": 5.03125,
      "learning_rate": 1.8346687015134174e-05,
      "loss": 1.1306,
      "mean_token_accuracy": 0.6789732642471791,
      "step": 650
    },
    {
      "epoch": 0.24988078206962328,
      "grad_norm": 4.5,
      "learning_rate": 1.8333969222942896e-05,
      "loss": 1.0958,
      "mean_token_accuracy": 0.6940956100821495,
      "step": 655
    },
    {
      "epoch": 0.25178826895565093,
      "grad_norm": 4.53125,
      "learning_rate": 1.8321251430751622e-05,
      "loss": 1.1623,
      "mean_token_accuracy": 0.6806924149394036,
      "step": 660
    },
    {
      "epoch": 0.2536957558416786,
      "grad_norm": 5.59375,
      "learning_rate": 1.8308533638560347e-05,
      "loss": 1.1438,
      "mean_token_accuracy": 0.6787467435002327,
      "step": 665
    },
    {
      "epoch": 0.25560324272770624,
      "grad_norm": 4.65625,
      "learning_rate": 1.8295815846369073e-05,
      "loss": 1.2902,
      "mean_token_accuracy": 0.655436672270298,
      "step": 670
    },
    {
      "epoch": 0.2575107296137339,
      "grad_norm": 4.46875,
      "learning_rate": 1.82830980541778e-05,
      "loss": 1.1499,
      "mean_token_accuracy": 0.6697306737303734,
      "step": 675
    },
    {
      "epoch": 0.25941821649976154,
      "grad_norm": 5.78125,
      "learning_rate": 1.827038026198652e-05,
      "loss": 1.1432,
      "mean_token_accuracy": 0.6788436755537987,
      "step": 680
    },
    {
      "epoch": 0.2613257033857892,
      "grad_norm": 6.40625,
      "learning_rate": 1.8257662469795246e-05,
      "loss": 1.1457,
      "mean_token_accuracy": 0.6794642567634582,
      "step": 685
    },
    {
      "epoch": 0.2632331902718169,
      "grad_norm": 4.5625,
      "learning_rate": 1.8244944677603968e-05,
      "loss": 1.1842,
      "mean_token_accuracy": 0.6739339649677276,
      "step": 690
    },
    {
      "epoch": 0.26514067715784456,
      "grad_norm": 5.375,
      "learning_rate": 1.8232226885412694e-05,
      "loss": 1.1979,
      "mean_token_accuracy": 0.671060286462307,
      "step": 695
    },
    {
      "epoch": 0.2670481640438722,
      "grad_norm": 5.9375,
      "learning_rate": 1.821950909322142e-05,
      "loss": 1.1926,
      "mean_token_accuracy": 0.68006531894207,
      "step": 700
    },
    {
      "epoch": 0.26895565092989987,
      "grad_norm": 4.5,
      "learning_rate": 1.820679130103014e-05,
      "loss": 1.1425,
      "mean_token_accuracy": 0.686011116206646,
      "step": 705
    },
    {
      "epoch": 0.2708631378159275,
      "grad_norm": 4.125,
      "learning_rate": 1.8194073508838867e-05,
      "loss": 1.1241,
      "mean_token_accuracy": 0.6813527546823025,
      "step": 710
    },
    {
      "epoch": 0.2727706247019552,
      "grad_norm": 4.59375,
      "learning_rate": 1.8181355716647592e-05,
      "loss": 1.0763,
      "mean_token_accuracy": 0.6978838533163071,
      "step": 715
    },
    {
      "epoch": 0.27467811158798283,
      "grad_norm": 4.71875,
      "learning_rate": 1.8168637924456314e-05,
      "loss": 1.1214,
      "mean_token_accuracy": 0.6879947543144226,
      "step": 720
    },
    {
      "epoch": 0.2765855984740105,
      "grad_norm": 4.65625,
      "learning_rate": 1.815592013226504e-05,
      "loss": 1.145,
      "mean_token_accuracy": 0.6740229934453964,
      "step": 725
    },
    {
      "epoch": 0.27849308536003814,
      "grad_norm": 5.0,
      "learning_rate": 1.8143202340073765e-05,
      "loss": 1.0915,
      "mean_token_accuracy": 0.6966636836528778,
      "step": 730
    },
    {
      "epoch": 0.2804005722460658,
      "grad_norm": 5.46875,
      "learning_rate": 1.813048454788249e-05,
      "loss": 1.1534,
      "mean_token_accuracy": 0.6881409972906113,
      "step": 735
    },
    {
      "epoch": 0.28230805913209345,
      "grad_norm": 5.21875,
      "learning_rate": 1.8117766755691213e-05,
      "loss": 1.1306,
      "mean_token_accuracy": 0.6856201700866222,
      "step": 740
    },
    {
      "epoch": 0.2842155460181211,
      "grad_norm": 4.3125,
      "learning_rate": 1.810504896349994e-05,
      "loss": 1.1514,
      "mean_token_accuracy": 0.6833734557032585,
      "step": 745
    },
    {
      "epoch": 0.2861230329041488,
      "grad_norm": 5.1875,
      "learning_rate": 1.809233117130866e-05,
      "loss": 1.1379,
      "mean_token_accuracy": 0.6824650421738625,
      "step": 750
    },
    {
      "epoch": 0.28803051979017646,
      "grad_norm": 4.8125,
      "learning_rate": 1.8079613379117386e-05,
      "loss": 1.1685,
      "mean_token_accuracy": 0.6773202955722809,
      "step": 755
    },
    {
      "epoch": 0.2899380066762041,
      "grad_norm": 5.09375,
      "learning_rate": 1.806689558692611e-05,
      "loss": 1.1683,
      "mean_token_accuracy": 0.6770651459693908,
      "step": 760
    },
    {
      "epoch": 0.2918454935622318,
      "grad_norm": 4.875,
      "learning_rate": 1.8054177794734834e-05,
      "loss": 1.0853,
      "mean_token_accuracy": 0.6914259925484657,
      "step": 765
    },
    {
      "epoch": 0.2937529804482594,
      "grad_norm": 5.0,
      "learning_rate": 1.804146000254356e-05,
      "loss": 1.1336,
      "mean_token_accuracy": 0.6820079162716866,
      "step": 770
    },
    {
      "epoch": 0.2956604673342871,
      "grad_norm": 4.09375,
      "learning_rate": 1.8028742210352285e-05,
      "loss": 1.1233,
      "mean_token_accuracy": 0.6821878552436829,
      "step": 775
    },
    {
      "epoch": 0.29756795422031473,
      "grad_norm": 4.71875,
      "learning_rate": 1.801602441816101e-05,
      "loss": 1.1204,
      "mean_token_accuracy": 0.692288200557232,
      "step": 780
    },
    {
      "epoch": 0.2994754411063424,
      "grad_norm": 4.9375,
      "learning_rate": 1.8003306625969736e-05,
      "loss": 1.2294,
      "mean_token_accuracy": 0.673324004560709,
      "step": 785
    },
    {
      "epoch": 0.30138292799237004,
      "grad_norm": 4.9375,
      "learning_rate": 1.7990588833778458e-05,
      "loss": 1.2059,
      "mean_token_accuracy": 0.6697646111249924,
      "step": 790
    },
    {
      "epoch": 0.3032904148783977,
      "grad_norm": 4.5625,
      "learning_rate": 1.7977871041587183e-05,
      "loss": 1.1745,
      "mean_token_accuracy": 0.6823094062507152,
      "step": 795
    },
    {
      "epoch": 0.30519790176442535,
      "grad_norm": 4.625,
      "learning_rate": 1.7965153249395905e-05,
      "loss": 1.1519,
      "mean_token_accuracy": 0.6778446674346924,
      "step": 800
    },
    {
      "epoch": 0.307105388650453,
      "grad_norm": 4.53125,
      "learning_rate": 1.795243545720463e-05,
      "loss": 1.0918,
      "mean_token_accuracy": 0.6978467896580696,
      "step": 805
    },
    {
      "epoch": 0.3090128755364807,
      "grad_norm": 3.921875,
      "learning_rate": 1.7939717665013353e-05,
      "loss": 1.1631,
      "mean_token_accuracy": 0.6788024313747882,
      "step": 810
    },
    {
      "epoch": 0.31092036242250837,
      "grad_norm": 4.78125,
      "learning_rate": 1.792699987282208e-05,
      "loss": 1.1259,
      "mean_token_accuracy": 0.6834620237350464,
      "step": 815
    },
    {
      "epoch": 0.312827849308536,
      "grad_norm": 5.21875,
      "learning_rate": 1.7914282080630804e-05,
      "loss": 1.0979,
      "mean_token_accuracy": 0.6804828964173794,
      "step": 820
    },
    {
      "epoch": 0.3147353361945637,
      "grad_norm": 4.875,
      "learning_rate": 1.790156428843953e-05,
      "loss": 1.204,
      "mean_token_accuracy": 0.6713850289583206,
      "step": 825
    },
    {
      "epoch": 0.31664282308059133,
      "grad_norm": 4.6875,
      "learning_rate": 1.788884649624825e-05,
      "loss": 1.14,
      "mean_token_accuracy": 0.6814699381589889,
      "step": 830
    },
    {
      "epoch": 0.318550309966619,
      "grad_norm": 4.3125,
      "learning_rate": 1.7876128704056977e-05,
      "loss": 1.1383,
      "mean_token_accuracy": 0.672491405904293,
      "step": 835
    },
    {
      "epoch": 0.32045779685264664,
      "grad_norm": 4.875,
      "learning_rate": 1.7863410911865703e-05,
      "loss": 1.1993,
      "mean_token_accuracy": 0.662742418050766,
      "step": 840
    },
    {
      "epoch": 0.3223652837386743,
      "grad_norm": 4.53125,
      "learning_rate": 1.7850693119674428e-05,
      "loss": 1.0142,
      "mean_token_accuracy": 0.7142936125397682,
      "step": 845
    },
    {
      "epoch": 0.32427277062470194,
      "grad_norm": 4.6875,
      "learning_rate": 1.783797532748315e-05,
      "loss": 1.1083,
      "mean_token_accuracy": 0.6907312035560608,
      "step": 850
    },
    {
      "epoch": 0.3261802575107296,
      "grad_norm": 4.40625,
      "learning_rate": 1.7825257535291876e-05,
      "loss": 1.0248,
      "mean_token_accuracy": 0.7127422258257866,
      "step": 855
    },
    {
      "epoch": 0.32808774439675725,
      "grad_norm": 4.375,
      "learning_rate": 1.7812539743100598e-05,
      "loss": 1.1234,
      "mean_token_accuracy": 0.6854491457343102,
      "step": 860
    },
    {
      "epoch": 0.3299952312827849,
      "grad_norm": 5.0625,
      "learning_rate": 1.7799821950909323e-05,
      "loss": 1.1691,
      "mean_token_accuracy": 0.6794403240084648,
      "step": 865
    },
    {
      "epoch": 0.3319027181688126,
      "grad_norm": 5.5625,
      "learning_rate": 1.7787104158718045e-05,
      "loss": 1.1286,
      "mean_token_accuracy": 0.6781167238950729,
      "step": 870
    },
    {
      "epoch": 0.33381020505484027,
      "grad_norm": 4.8125,
      "learning_rate": 1.777438636652677e-05,
      "loss": 1.1314,
      "mean_token_accuracy": 0.6972567990422249,
      "step": 875
    },
    {
      "epoch": 0.3357176919408679,
      "grad_norm": 5.34375,
      "learning_rate": 1.7761668574335496e-05,
      "loss": 1.0309,
      "mean_token_accuracy": 0.7109489843249321,
      "step": 880
    },
    {
      "epoch": 0.3376251788268956,
      "grad_norm": 5.53125,
      "learning_rate": 1.7748950782144222e-05,
      "loss": 1.122,
      "mean_token_accuracy": 0.6877056941390037,
      "step": 885
    },
    {
      "epoch": 0.33953266571292323,
      "grad_norm": 4.53125,
      "learning_rate": 1.7736232989952947e-05,
      "loss": 1.0609,
      "mean_token_accuracy": 0.7033521652221679,
      "step": 890
    },
    {
      "epoch": 0.3414401525989509,
      "grad_norm": 4.125,
      "learning_rate": 1.772351519776167e-05,
      "loss": 1.0418,
      "mean_token_accuracy": 0.7021679773926734,
      "step": 895
    },
    {
      "epoch": 0.34334763948497854,
      "grad_norm": 4.4375,
      "learning_rate": 1.7710797405570395e-05,
      "loss": 1.1233,
      "mean_token_accuracy": 0.696443286538124,
      "step": 900
    },
    {
      "epoch": 0.3452551263710062,
      "grad_norm": 4.53125,
      "learning_rate": 1.7698079613379117e-05,
      "loss": 1.043,
      "mean_token_accuracy": 0.7000239789485931,
      "step": 905
    },
    {
      "epoch": 0.34716261325703385,
      "grad_norm": 4.75,
      "learning_rate": 1.7685361821187843e-05,
      "loss": 1.0404,
      "mean_token_accuracy": 0.7076486960053444,
      "step": 910
    },
    {
      "epoch": 0.3490701001430615,
      "grad_norm": 4.78125,
      "learning_rate": 1.7672644028996568e-05,
      "loss": 1.2165,
      "mean_token_accuracy": 0.6694424465298653,
      "step": 915
    },
    {
      "epoch": 0.35097758702908916,
      "grad_norm": 5.15625,
      "learning_rate": 1.765992623680529e-05,
      "loss": 1.1061,
      "mean_token_accuracy": 0.689212466776371,
      "step": 920
    },
    {
      "epoch": 0.3528850739151168,
      "grad_norm": 4.9375,
      "learning_rate": 1.7647208444614016e-05,
      "loss": 1.1573,
      "mean_token_accuracy": 0.6761819615960121,
      "step": 925
    },
    {
      "epoch": 0.3547925608011445,
      "grad_norm": 4.96875,
      "learning_rate": 1.763449065242274e-05,
      "loss": 1.0511,
      "mean_token_accuracy": 0.7017502933740616,
      "step": 930
    },
    {
      "epoch": 0.3567000476871722,
      "grad_norm": 5.0625,
      "learning_rate": 1.7621772860231467e-05,
      "loss": 1.176,
      "mean_token_accuracy": 0.6732675984501839,
      "step": 935
    },
    {
      "epoch": 0.3586075345731998,
      "grad_norm": 4.1875,
      "learning_rate": 1.760905506804019e-05,
      "loss": 1.0912,
      "mean_token_accuracy": 0.6839179575443268,
      "step": 940
    },
    {
      "epoch": 0.3605150214592275,
      "grad_norm": 5.53125,
      "learning_rate": 1.7596337275848914e-05,
      "loss": 1.1048,
      "mean_token_accuracy": 0.6849195197224617,
      "step": 945
    },
    {
      "epoch": 0.36242250834525513,
      "grad_norm": 4.5625,
      "learning_rate": 1.758361948365764e-05,
      "loss": 1.0162,
      "mean_token_accuracy": 0.6860754758119583,
      "step": 950
    },
    {
      "epoch": 0.3643299952312828,
      "grad_norm": 4.59375,
      "learning_rate": 1.7570901691466362e-05,
      "loss": 1.0933,
      "mean_token_accuracy": 0.6905333071947097,
      "step": 955
    },
    {
      "epoch": 0.36623748211731044,
      "grad_norm": 4.9375,
      "learning_rate": 1.7558183899275088e-05,
      "loss": 1.0952,
      "mean_token_accuracy": 0.6910875916481019,
      "step": 960
    },
    {
      "epoch": 0.3681449690033381,
      "grad_norm": 5.25,
      "learning_rate": 1.754546610708381e-05,
      "loss": 1.2019,
      "mean_token_accuracy": 0.6728987120091915,
      "step": 965
    },
    {
      "epoch": 0.37005245588936575,
      "grad_norm": 3.890625,
      "learning_rate": 1.7532748314892535e-05,
      "loss": 1.0999,
      "mean_token_accuracy": 0.6894315019249916,
      "step": 970
    },
    {
      "epoch": 0.3719599427753934,
      "grad_norm": 4.28125,
      "learning_rate": 1.752003052270126e-05,
      "loss": 1.0943,
      "mean_token_accuracy": 0.6812393218278885,
      "step": 975
    },
    {
      "epoch": 0.37386742966142106,
      "grad_norm": 4.5,
      "learning_rate": 1.7507312730509983e-05,
      "loss": 1.1383,
      "mean_token_accuracy": 0.6867758512496949,
      "step": 980
    },
    {
      "epoch": 0.3757749165474487,
      "grad_norm": 4.46875,
      "learning_rate": 1.7494594938318708e-05,
      "loss": 1.0795,
      "mean_token_accuracy": 0.6977563664317131,
      "step": 985
    },
    {
      "epoch": 0.3776824034334764,
      "grad_norm": 5.46875,
      "learning_rate": 1.7481877146127434e-05,
      "loss": 1.0531,
      "mean_token_accuracy": 0.7000179141759872,
      "step": 990
    },
    {
      "epoch": 0.3795898903195041,
      "grad_norm": 5.09375,
      "learning_rate": 1.746915935393616e-05,
      "loss": 1.1515,
      "mean_token_accuracy": 0.679880291223526,
      "step": 995
    },
    {
      "epoch": 0.38149737720553173,
      "grad_norm": 4.96875,
      "learning_rate": 1.7456441561744885e-05,
      "loss": 1.139,
      "mean_token_accuracy": 0.6736231818795204,
      "step": 1000
    },
    {
      "epoch": 0.3834048640915594,
      "grad_norm": 4.5625,
      "learning_rate": 1.7443723769553607e-05,
      "loss": 1.161,
      "mean_token_accuracy": 0.6775983899831772,
      "step": 1005
    },
    {
      "epoch": 0.38531235097758704,
      "grad_norm": 5.65625,
      "learning_rate": 1.7431005977362332e-05,
      "loss": 1.1337,
      "mean_token_accuracy": 0.6791507929563523,
      "step": 1010
    },
    {
      "epoch": 0.3872198378636147,
      "grad_norm": 4.71875,
      "learning_rate": 1.7418288185171054e-05,
      "loss": 1.0665,
      "mean_token_accuracy": 0.6990373253822326,
      "step": 1015
    },
    {
      "epoch": 0.38912732474964234,
      "grad_norm": 5.34375,
      "learning_rate": 1.740557039297978e-05,
      "loss": 0.9875,
      "mean_token_accuracy": 0.7201899453997612,
      "step": 1020
    },
    {
      "epoch": 0.39103481163567,
      "grad_norm": 5.15625,
      "learning_rate": 1.7392852600788502e-05,
      "loss": 1.1103,
      "mean_token_accuracy": 0.6919103190302849,
      "step": 1025
    },
    {
      "epoch": 0.39294229852169765,
      "grad_norm": 4.90625,
      "learning_rate": 1.7380134808597228e-05,
      "loss": 1.1621,
      "mean_token_accuracy": 0.6763632833957672,
      "step": 1030
    },
    {
      "epoch": 0.3948497854077253,
      "grad_norm": 5.375,
      "learning_rate": 1.7367417016405953e-05,
      "loss": 1.0891,
      "mean_token_accuracy": 0.6873229309916496,
      "step": 1035
    },
    {
      "epoch": 0.39675727229375296,
      "grad_norm": 4.9375,
      "learning_rate": 1.735469922421468e-05,
      "loss": 1.0736,
      "mean_token_accuracy": 0.6943288549780846,
      "step": 1040
    },
    {
      "epoch": 0.3986647591797806,
      "grad_norm": 4.84375,
      "learning_rate": 1.7341981432023404e-05,
      "loss": 1.1739,
      "mean_token_accuracy": 0.6772709146142006,
      "step": 1045
    },
    {
      "epoch": 0.4005722460658083,
      "grad_norm": 4.9375,
      "learning_rate": 1.7329263639832126e-05,
      "loss": 1.132,
      "mean_token_accuracy": 0.6861761540174485,
      "step": 1050
    },
    {
      "epoch": 0.402479732951836,
      "grad_norm": 5.0,
      "learning_rate": 1.731654584764085e-05,
      "loss": 1.0849,
      "mean_token_accuracy": 0.6876397714018821,
      "step": 1055
    },
    {
      "epoch": 0.40438721983786363,
      "grad_norm": 5.28125,
      "learning_rate": 1.7303828055449577e-05,
      "loss": 1.1119,
      "mean_token_accuracy": 0.6828581809997558,
      "step": 1060
    },
    {
      "epoch": 0.4062947067238913,
      "grad_norm": 4.78125,
      "learning_rate": 1.72911102632583e-05,
      "loss": 1.1542,
      "mean_token_accuracy": 0.6845688432455063,
      "step": 1065
    },
    {
      "epoch": 0.40820219360991894,
      "grad_norm": 4.78125,
      "learning_rate": 1.7278392471067025e-05,
      "loss": 1.245,
      "mean_token_accuracy": 0.6623896270990371,
      "step": 1070
    },
    {
      "epoch": 0.4101096804959466,
      "grad_norm": 4.40625,
      "learning_rate": 1.7265674678875747e-05,
      "loss": 0.9855,
      "mean_token_accuracy": 0.7169230833649636,
      "step": 1075
    },
    {
      "epoch": 0.41201716738197425,
      "grad_norm": 5.28125,
      "learning_rate": 1.7252956886684472e-05,
      "loss": 1.0003,
      "mean_token_accuracy": 0.7067764922976494,
      "step": 1080
    },
    {
      "epoch": 0.4139246542680019,
      "grad_norm": 4.71875,
      "learning_rate": 1.7240239094493198e-05,
      "loss": 1.0566,
      "mean_token_accuracy": 0.7076253116130828,
      "step": 1085
    },
    {
      "epoch": 0.41583214115402956,
      "grad_norm": 4.8125,
      "learning_rate": 1.722752130230192e-05,
      "loss": 1.0692,
      "mean_token_accuracy": 0.6991090714931488,
      "step": 1090
    },
    {
      "epoch": 0.4177396280400572,
      "grad_norm": 4.84375,
      "learning_rate": 1.7214803510110646e-05,
      "loss": 1.1702,
      "mean_token_accuracy": 0.6843620404601097,
      "step": 1095
    },
    {
      "epoch": 0.41964711492608486,
      "grad_norm": 5.34375,
      "learning_rate": 1.720208571791937e-05,
      "loss": 1.1131,
      "mean_token_accuracy": 0.6780217066407204,
      "step": 1100
    },
    {
      "epoch": 0.4215546018121125,
      "grad_norm": 5.03125,
      "learning_rate": 1.7189367925728097e-05,
      "loss": 1.1733,
      "mean_token_accuracy": 0.6738237209618092,
      "step": 1105
    },
    {
      "epoch": 0.4234620886981402,
      "grad_norm": 4.15625,
      "learning_rate": 1.717665013353682e-05,
      "loss": 1.0971,
      "mean_token_accuracy": 0.6967151150107384,
      "step": 1110
    },
    {
      "epoch": 0.4253695755841679,
      "grad_norm": 6.125,
      "learning_rate": 1.7163932341345544e-05,
      "loss": 1.0593,
      "mean_token_accuracy": 0.6975896939635277,
      "step": 1115
    },
    {
      "epoch": 0.42727706247019553,
      "grad_norm": 4.875,
      "learning_rate": 1.715121454915427e-05,
      "loss": 1.045,
      "mean_token_accuracy": 0.6995080903172493,
      "step": 1120
    },
    {
      "epoch": 0.4291845493562232,
      "grad_norm": 4.6875,
      "learning_rate": 1.7138496756962992e-05,
      "loss": 1.1514,
      "mean_token_accuracy": 0.6870008051395416,
      "step": 1125
    },
    {
      "epoch": 0.43109203624225084,
      "grad_norm": 4.9375,
      "learning_rate": 1.7125778964771717e-05,
      "loss": 1.147,
      "mean_token_accuracy": 0.6756279736757278,
      "step": 1130
    },
    {
      "epoch": 0.4329995231282785,
      "grad_norm": 3.953125,
      "learning_rate": 1.711306117258044e-05,
      "loss": 1.0626,
      "mean_token_accuracy": 0.6975025564432145,
      "step": 1135
    },
    {
      "epoch": 0.43490701001430615,
      "grad_norm": 4.78125,
      "learning_rate": 1.7100343380389165e-05,
      "loss": 1.0843,
      "mean_token_accuracy": 0.6906091332435608,
      "step": 1140
    },
    {
      "epoch": 0.4368144969003338,
      "grad_norm": 4.84375,
      "learning_rate": 1.708762558819789e-05,
      "loss": 1.1134,
      "mean_token_accuracy": 0.6875185921788216,
      "step": 1145
    },
    {
      "epoch": 0.43872198378636146,
      "grad_norm": 5.28125,
      "learning_rate": 1.7074907796006616e-05,
      "loss": 1.1802,
      "mean_token_accuracy": 0.6715085253119468,
      "step": 1150
    },
    {
      "epoch": 0.4406294706723891,
      "grad_norm": 5.1875,
      "learning_rate": 1.706219000381534e-05,
      "loss": 1.0848,
      "mean_token_accuracy": 0.6938497066497803,
      "step": 1155
    },
    {
      "epoch": 0.44253695755841677,
      "grad_norm": 4.34375,
      "learning_rate": 1.7049472211624063e-05,
      "loss": 0.99,
      "mean_token_accuracy": 0.7224840387701988,
      "step": 1160
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 5.0625,
      "learning_rate": 1.703675441943279e-05,
      "loss": 1.0878,
      "mean_token_accuracy": 0.6980762526392936,
      "step": 1165
    },
    {
      "epoch": 0.44635193133047213,
      "grad_norm": 4.84375,
      "learning_rate": 1.702403662724151e-05,
      "loss": 1.0144,
      "mean_token_accuracy": 0.7007738411426544,
      "step": 1170
    },
    {
      "epoch": 0.4482594182164998,
      "grad_norm": 4.71875,
      "learning_rate": 1.7011318835050237e-05,
      "loss": 1.0875,
      "mean_token_accuracy": 0.6976178154349327,
      "step": 1175
    },
    {
      "epoch": 0.45016690510252744,
      "grad_norm": 4.25,
      "learning_rate": 1.6998601042858962e-05,
      "loss": 1.1086,
      "mean_token_accuracy": 0.6936508253216743,
      "step": 1180
    },
    {
      "epoch": 0.4520743919885551,
      "grad_norm": 4.625,
      "learning_rate": 1.6985883250667684e-05,
      "loss": 1.0357,
      "mean_token_accuracy": 0.7059789746999741,
      "step": 1185
    },
    {
      "epoch": 0.45398187887458274,
      "grad_norm": 5.84375,
      "learning_rate": 1.697316545847641e-05,
      "loss": 1.1516,
      "mean_token_accuracy": 0.6797626093029976,
      "step": 1190
    },
    {
      "epoch": 0.4558893657606104,
      "grad_norm": 4.59375,
      "learning_rate": 1.6960447666285135e-05,
      "loss": 1.0598,
      "mean_token_accuracy": 0.6971547856926918,
      "step": 1195
    },
    {
      "epoch": 0.45779685264663805,
      "grad_norm": 4.34375,
      "learning_rate": 1.6947729874093857e-05,
      "loss": 0.9843,
      "mean_token_accuracy": 0.7089567899703979,
      "step": 1200
    },
    {
      "epoch": 0.4597043395326657,
      "grad_norm": 5.6875,
      "learning_rate": 1.6935012081902583e-05,
      "loss": 1.1058,
      "mean_token_accuracy": 0.6865063227713109,
      "step": 1205
    },
    {
      "epoch": 0.46161182641869336,
      "grad_norm": 4.65625,
      "learning_rate": 1.6922294289711308e-05,
      "loss": 1.1277,
      "mean_token_accuracy": 0.6827705770730972,
      "step": 1210
    },
    {
      "epoch": 0.463519313304721,
      "grad_norm": 4.875,
      "learning_rate": 1.6909576497520034e-05,
      "loss": 1.1664,
      "mean_token_accuracy": 0.6819883540272713,
      "step": 1215
    },
    {
      "epoch": 0.46542680019074867,
      "grad_norm": 5.09375,
      "learning_rate": 1.6896858705328756e-05,
      "loss": 1.0562,
      "mean_token_accuracy": 0.6979849010705947,
      "step": 1220
    },
    {
      "epoch": 0.4673342870767763,
      "grad_norm": 4.875,
      "learning_rate": 1.688414091313748e-05,
      "loss": 1.1879,
      "mean_token_accuracy": 0.6789781466126442,
      "step": 1225
    },
    {
      "epoch": 0.46924177396280403,
      "grad_norm": 4.6875,
      "learning_rate": 1.6871423120946204e-05,
      "loss": 1.1305,
      "mean_token_accuracy": 0.6878492012619972,
      "step": 1230
    },
    {
      "epoch": 0.4711492608488317,
      "grad_norm": 4.90625,
      "learning_rate": 1.685870532875493e-05,
      "loss": 1.1696,
      "mean_token_accuracy": 0.6815823867917061,
      "step": 1235
    },
    {
      "epoch": 0.47305674773485934,
      "grad_norm": 4.4375,
      "learning_rate": 1.6845987536563655e-05,
      "loss": 0.9887,
      "mean_token_accuracy": 0.7134695425629616,
      "step": 1240
    },
    {
      "epoch": 0.474964234620887,
      "grad_norm": 4.625,
      "learning_rate": 1.6833269744372377e-05,
      "loss": 1.0254,
      "mean_token_accuracy": 0.7122278586030006,
      "step": 1245
    },
    {
      "epoch": 0.47687172150691465,
      "grad_norm": 5.25,
      "learning_rate": 1.6820551952181102e-05,
      "loss": 1.123,
      "mean_token_accuracy": 0.6886913120746613,
      "step": 1250
    },
    {
      "epoch": 0.4787792083929423,
      "grad_norm": 4.5625,
      "learning_rate": 1.6807834159989828e-05,
      "loss": 1.1054,
      "mean_token_accuracy": 0.6899633683264256,
      "step": 1255
    },
    {
      "epoch": 0.48068669527896996,
      "grad_norm": 4.5,
      "learning_rate": 1.6795116367798553e-05,
      "loss": 0.9764,
      "mean_token_accuracy": 0.7191405698657036,
      "step": 1260
    },
    {
      "epoch": 0.4825941821649976,
      "grad_norm": 4.9375,
      "learning_rate": 1.6782398575607275e-05,
      "loss": 1.0562,
      "mean_token_accuracy": 0.7035323694348335,
      "step": 1265
    },
    {
      "epoch": 0.48450166905102526,
      "grad_norm": 4.25,
      "learning_rate": 1.6769680783416e-05,
      "loss": 1.0389,
      "mean_token_accuracy": 0.708684840798378,
      "step": 1270
    },
    {
      "epoch": 0.4864091559370529,
      "grad_norm": 4.375,
      "learning_rate": 1.6756962991224726e-05,
      "loss": 1.0037,
      "mean_token_accuracy": 0.7036843597888947,
      "step": 1275
    },
    {
      "epoch": 0.48831664282308057,
      "grad_norm": 4.46875,
      "learning_rate": 1.674424519903345e-05,
      "loss": 0.9991,
      "mean_token_accuracy": 0.708541002869606,
      "step": 1280
    },
    {
      "epoch": 0.4902241297091082,
      "grad_norm": 4.53125,
      "learning_rate": 1.6731527406842174e-05,
      "loss": 1.0307,
      "mean_token_accuracy": 0.7065932080149651,
      "step": 1285
    },
    {
      "epoch": 0.49213161659513593,
      "grad_norm": 5.3125,
      "learning_rate": 1.6718809614650896e-05,
      "loss": 1.1893,
      "mean_token_accuracy": 0.6674706935882568,
      "step": 1290
    },
    {
      "epoch": 0.4940391034811636,
      "grad_norm": 4.75,
      "learning_rate": 1.670609182245962e-05,
      "loss": 1.0691,
      "mean_token_accuracy": 0.6954927012324333,
      "step": 1295
    },
    {
      "epoch": 0.49594659036719124,
      "grad_norm": 5.0,
      "learning_rate": 1.6693374030268347e-05,
      "loss": 1.1,
      "mean_token_accuracy": 0.687789686024189,
      "step": 1300
    },
    {
      "epoch": 0.4978540772532189,
      "grad_norm": 5.4375,
      "learning_rate": 1.6680656238077072e-05,
      "loss": 0.995,
      "mean_token_accuracy": 0.7104071035981179,
      "step": 1305
    },
    {
      "epoch": 0.49976156413924655,
      "grad_norm": 8.9375,
      "learning_rate": 1.6667938445885795e-05,
      "loss": 1.1605,
      "mean_token_accuracy": 0.6653927579522133,
      "step": 1310
    },
    {
      "epoch": 0.5016690510252741,
      "grad_norm": 5.28125,
      "learning_rate": 1.665522065369452e-05,
      "loss": 1.0855,
      "mean_token_accuracy": 0.6951776430010795,
      "step": 1315
    },
    {
      "epoch": 0.5035765379113019,
      "grad_norm": 5.40625,
      "learning_rate": 1.6642502861503246e-05,
      "loss": 1.0628,
      "mean_token_accuracy": 0.6940785989165306,
      "step": 1320
    },
    {
      "epoch": 0.5054840247973296,
      "grad_norm": 5.34375,
      "learning_rate": 1.6629785069311968e-05,
      "loss": 0.988,
      "mean_token_accuracy": 0.7169103771448135,
      "step": 1325
    },
    {
      "epoch": 0.5073915116833572,
      "grad_norm": 6.0,
      "learning_rate": 1.6617067277120693e-05,
      "loss": 1.1389,
      "mean_token_accuracy": 0.6772550821304322,
      "step": 1330
    },
    {
      "epoch": 0.5092989985693849,
      "grad_norm": 5.21875,
      "learning_rate": 1.660434948492942e-05,
      "loss": 1.0482,
      "mean_token_accuracy": 0.7066885620355606,
      "step": 1335
    },
    {
      "epoch": 0.5112064854554125,
      "grad_norm": 4.46875,
      "learning_rate": 1.659163169273814e-05,
      "loss": 1.0581,
      "mean_token_accuracy": 0.6999363213777542,
      "step": 1340
    },
    {
      "epoch": 0.5131139723414402,
      "grad_norm": 4.625,
      "learning_rate": 1.6578913900546866e-05,
      "loss": 1.0167,
      "mean_token_accuracy": 0.7108445912599564,
      "step": 1345
    },
    {
      "epoch": 0.5150214592274678,
      "grad_norm": 4.34375,
      "learning_rate": 1.656619610835559e-05,
      "loss": 1.0204,
      "mean_token_accuracy": 0.7141309767961502,
      "step": 1350
    },
    {
      "epoch": 0.5169289461134955,
      "grad_norm": 5.0,
      "learning_rate": 1.6553478316164314e-05,
      "loss": 1.0762,
      "mean_token_accuracy": 0.681866991519928,
      "step": 1355
    },
    {
      "epoch": 0.5188364329995231,
      "grad_norm": 4.75,
      "learning_rate": 1.654076052397304e-05,
      "loss": 1.11,
      "mean_token_accuracy": 0.6910865843296051,
      "step": 1360
    },
    {
      "epoch": 0.5207439198855508,
      "grad_norm": 4.9375,
      "learning_rate": 1.6528042731781765e-05,
      "loss": 1.1043,
      "mean_token_accuracy": 0.6882475554943085,
      "step": 1365
    },
    {
      "epoch": 0.5226514067715784,
      "grad_norm": 4.25,
      "learning_rate": 1.651532493959049e-05,
      "loss": 1.0554,
      "mean_token_accuracy": 0.7098303481936454,
      "step": 1370
    },
    {
      "epoch": 0.5245588936576061,
      "grad_norm": 4.875,
      "learning_rate": 1.6502607147399213e-05,
      "loss": 1.0832,
      "mean_token_accuracy": 0.7008169665932655,
      "step": 1375
    },
    {
      "epoch": 0.5264663805436338,
      "grad_norm": 4.75,
      "learning_rate": 1.6489889355207938e-05,
      "loss": 1.0496,
      "mean_token_accuracy": 0.7001980841159821,
      "step": 1380
    },
    {
      "epoch": 0.5283738674296614,
      "grad_norm": 4.96875,
      "learning_rate": 1.647717156301666e-05,
      "loss": 1.058,
      "mean_token_accuracy": 0.6955149456858635,
      "step": 1385
    },
    {
      "epoch": 0.5302813543156891,
      "grad_norm": 4.5,
      "learning_rate": 1.6464453770825386e-05,
      "loss": 1.0637,
      "mean_token_accuracy": 0.6973527297377586,
      "step": 1390
    },
    {
      "epoch": 0.5321888412017167,
      "grad_norm": 4.65625,
      "learning_rate": 1.645173597863411e-05,
      "loss": 1.0224,
      "mean_token_accuracy": 0.6985811904072762,
      "step": 1395
    },
    {
      "epoch": 0.5340963280877444,
      "grad_norm": 5.90625,
      "learning_rate": 1.6439018186442833e-05,
      "loss": 0.9889,
      "mean_token_accuracy": 0.7180039718747139,
      "step": 1400
    },
    {
      "epoch": 0.536003814973772,
      "grad_norm": 5.65625,
      "learning_rate": 1.642630039425156e-05,
      "loss": 1.1035,
      "mean_token_accuracy": 0.6911762669682503,
      "step": 1405
    },
    {
      "epoch": 0.5379113018597997,
      "grad_norm": 5.3125,
      "learning_rate": 1.6413582602060284e-05,
      "loss": 1.121,
      "mean_token_accuracy": 0.6914720147848129,
      "step": 1410
    },
    {
      "epoch": 0.5398187887458273,
      "grad_norm": 4.6875,
      "learning_rate": 1.640086480986901e-05,
      "loss": 1.0923,
      "mean_token_accuracy": 0.7057502642273903,
      "step": 1415
    },
    {
      "epoch": 0.541726275631855,
      "grad_norm": 4.03125,
      "learning_rate": 1.6388147017677732e-05,
      "loss": 1.0645,
      "mean_token_accuracy": 0.6903545215725899,
      "step": 1420
    },
    {
      "epoch": 0.5436337625178826,
      "grad_norm": 4.46875,
      "learning_rate": 1.6375429225486457e-05,
      "loss": 1.067,
      "mean_token_accuracy": 0.6956262946128845,
      "step": 1425
    },
    {
      "epoch": 0.5455412494039104,
      "grad_norm": 5.40625,
      "learning_rate": 1.6362711433295183e-05,
      "loss": 1.051,
      "mean_token_accuracy": 0.7086583986878395,
      "step": 1430
    },
    {
      "epoch": 0.547448736289938,
      "grad_norm": 5.15625,
      "learning_rate": 1.6349993641103905e-05,
      "loss": 1.083,
      "mean_token_accuracy": 0.6992680087685585,
      "step": 1435
    },
    {
      "epoch": 0.5493562231759657,
      "grad_norm": 4.875,
      "learning_rate": 1.633727584891263e-05,
      "loss": 1.048,
      "mean_token_accuracy": 0.7062048301100731,
      "step": 1440
    },
    {
      "epoch": 0.5512637100619934,
      "grad_norm": 4.375,
      "learning_rate": 1.6324558056721353e-05,
      "loss": 1.0279,
      "mean_token_accuracy": 0.6988534897565841,
      "step": 1445
    },
    {
      "epoch": 0.553171196948021,
      "grad_norm": 5.59375,
      "learning_rate": 1.6311840264530078e-05,
      "loss": 1.1266,
      "mean_token_accuracy": 0.6839306525886059,
      "step": 1450
    },
    {
      "epoch": 0.5550786838340487,
      "grad_norm": 5.25,
      "learning_rate": 1.6299122472338804e-05,
      "loss": 1.0217,
      "mean_token_accuracy": 0.711203609406948,
      "step": 1455
    },
    {
      "epoch": 0.5569861707200763,
      "grad_norm": 4.59375,
      "learning_rate": 1.6286404680147526e-05,
      "loss": 1.0203,
      "mean_token_accuracy": 0.7061103895306587,
      "step": 1460
    },
    {
      "epoch": 0.558893657606104,
      "grad_norm": 4.5,
      "learning_rate": 1.627368688795625e-05,
      "loss": 1.1219,
      "mean_token_accuracy": 0.6869349181652069,
      "step": 1465
    },
    {
      "epoch": 0.5608011444921316,
      "grad_norm": 5.78125,
      "learning_rate": 1.6260969095764977e-05,
      "loss": 1.0224,
      "mean_token_accuracy": 0.7102344155311584,
      "step": 1470
    },
    {
      "epoch": 0.5627086313781593,
      "grad_norm": 4.71875,
      "learning_rate": 1.6248251303573702e-05,
      "loss": 1.0301,
      "mean_token_accuracy": 0.7014199420809746,
      "step": 1475
    },
    {
      "epoch": 0.5646161182641869,
      "grad_norm": 4.84375,
      "learning_rate": 1.6235533511382428e-05,
      "loss": 1.0919,
      "mean_token_accuracy": 0.6958119504153728,
      "step": 1480
    },
    {
      "epoch": 0.5665236051502146,
      "grad_norm": 3.796875,
      "learning_rate": 1.622281571919115e-05,
      "loss": 1.0172,
      "mean_token_accuracy": 0.7171666666865348,
      "step": 1485
    },
    {
      "epoch": 0.5684310920362422,
      "grad_norm": 4.9375,
      "learning_rate": 1.6210097926999875e-05,
      "loss": 1.0293,
      "mean_token_accuracy": 0.7110596433281898,
      "step": 1490
    },
    {
      "epoch": 0.5703385789222699,
      "grad_norm": 5.0,
      "learning_rate": 1.6197380134808597e-05,
      "loss": 1.0437,
      "mean_token_accuracy": 0.7055989898741245,
      "step": 1495
    },
    {
      "epoch": 0.5722460658082976,
      "grad_norm": 4.5625,
      "learning_rate": 1.6184662342617323e-05,
      "loss": 1.1109,
      "mean_token_accuracy": 0.6983527675271034,
      "step": 1500
    },
    {
      "epoch": 0.5741535526943252,
      "grad_norm": 5.21875,
      "learning_rate": 1.6171944550426045e-05,
      "loss": 1.0585,
      "mean_token_accuracy": 0.7039476573467255,
      "step": 1505
    },
{ |
|
"epoch": 0.5760610395803529, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.615922675823477e-05, |
|
"loss": 1.0795, |
|
"mean_token_accuracy": 0.6893104076385498, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.5779685264663805, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.6146508966043496e-05, |
|
"loss": 0.9975, |
|
"mean_token_accuracy": 0.7137760400772095, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.5798760133524082, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.613379117385222e-05, |
|
"loss": 1.0283, |
|
"mean_token_accuracy": 0.7071486204862595, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.5817835002384358, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.6121073381660947e-05, |
|
"loss": 0.9634, |
|
"mean_token_accuracy": 0.7267276033759117, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.5836909871244635, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.610835558946967e-05, |
|
"loss": 1.141, |
|
"mean_token_accuracy": 0.6760995179414749, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.5855984740104911, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.6095637797278395e-05, |
|
"loss": 1.0584, |
|
"mean_token_accuracy": 0.7001825541257858, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.5875059608965189, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.608292000508712e-05, |
|
"loss": 0.9554, |
|
"mean_token_accuracy": 0.7191137507557869, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.5894134477825465, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.6070202212895842e-05, |
|
"loss": 1.0338, |
|
"mean_token_accuracy": 0.6991492182016372, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.5913209346685742, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.6057484420704568e-05, |
|
"loss": 1.1116, |
|
"mean_token_accuracy": 0.6837974414229393, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5932284215546018, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.604476662851329e-05, |
|
"loss": 1.1166, |
|
"mean_token_accuracy": 0.6914651602506637, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.5951359084406295, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.6032048836322015e-05, |
|
"loss": 1.092, |
|
"mean_token_accuracy": 0.6894845418632031, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5970433953266572, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.601933104413074e-05, |
|
"loss": 1.0382, |
|
"mean_token_accuracy": 0.7044162392616272, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.5989508822126848, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.6006613251939463e-05, |
|
"loss": 1.1431, |
|
"mean_token_accuracy": 0.6869931921362877, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.6008583690987125, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.599389545974819e-05, |
|
"loss": 0.9842, |
|
"mean_token_accuracy": 0.7128385215997696, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.6027658559847401, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.5981177667556914e-05, |
|
"loss": 0.9261, |
|
"mean_token_accuracy": 0.7247092142701149, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.6046733428707678, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.596845987536564e-05, |
|
"loss": 1.1425, |
|
"mean_token_accuracy": 0.690905112028122, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.6065808297567954, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.595574208317436e-05, |
|
"loss": 1.0104, |
|
"mean_token_accuracy": 0.7160186618566513, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.6084883166428231, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.5943024290983087e-05, |
|
"loss": 1.0177, |
|
"mean_token_accuracy": 0.7115990072488785, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.6103958035288507, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.5930306498791813e-05, |
|
"loss": 0.9649, |
|
"mean_token_accuracy": 0.7167477622628212, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6123032904148784, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.5917588706600535e-05, |
|
"loss": 1.0788, |
|
"mean_token_accuracy": 0.6938311874866485, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.614210777300906, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.590487091440926e-05, |
|
"loss": 1.0125, |
|
"mean_token_accuracy": 0.7016476511955261, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.6161182641869337, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.5892153122217982e-05, |
|
"loss": 1.0339, |
|
"mean_token_accuracy": 0.7032722011208534, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.6180257510729614, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.5879435330026708e-05, |
|
"loss": 1.0328, |
|
"mean_token_accuracy": 0.7095901571214199, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.619933237958989, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.5866717537835433e-05, |
|
"loss": 1.0373, |
|
"mean_token_accuracy": 0.7035743817687035, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.6218407248450167, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.585399974564416e-05, |
|
"loss": 0.9541, |
|
"mean_token_accuracy": 0.7259443908929825, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6237482117310443, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.5841281953452884e-05, |
|
"loss": 1.163, |
|
"mean_token_accuracy": 0.6807891383767128, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.625655698617072, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.5828564161261606e-05, |
|
"loss": 0.9723, |
|
"mean_token_accuracy": 0.7241554304957389, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6275631855030996, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.5815846369070332e-05, |
|
"loss": 1.0736, |
|
"mean_token_accuracy": 0.7066580310463906, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.6294706723891274, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.5803128576879054e-05, |
|
"loss": 1.0553, |
|
"mean_token_accuracy": 0.7062164053320885, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.631378159275155, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.579041078468778e-05, |
|
"loss": 1.0238, |
|
"mean_token_accuracy": 0.7111857526004315, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.6332856461611827, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.5777692992496505e-05, |
|
"loss": 0.9547, |
|
"mean_token_accuracy": 0.7140131160616875, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6351931330472103, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.5764975200305227e-05, |
|
"loss": 1.0719, |
|
"mean_token_accuracy": 0.6966261744499207, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.637100619933238, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.5752257408113953e-05, |
|
"loss": 1.0385, |
|
"mean_token_accuracy": 0.7105199143290519, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.6390081068192656, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.5739539615922678e-05, |
|
"loss": 1.0362, |
|
"mean_token_accuracy": 0.7015564523637294, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.6409155937052933, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.57268218237314e-05, |
|
"loss": 0.9366, |
|
"mean_token_accuracy": 0.7337832853198052, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.642823080591321, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.5714104031540126e-05, |
|
"loss": 1.1166, |
|
"mean_token_accuracy": 0.6753425896167755, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.6447305674773486, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.570138623934885e-05, |
|
"loss": 1.0364, |
|
"mean_token_accuracy": 0.7042675077915191, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.6466380543633763, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.5688668447157577e-05, |
|
"loss": 0.9948, |
|
"mean_token_accuracy": 0.7185570999979973, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.6485455412494039, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.56759506549663e-05, |
|
"loss": 1.0878, |
|
"mean_token_accuracy": 0.6928838163614273, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6504530281354316, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.5663232862775024e-05, |
|
"loss": 1.0671, |
|
"mean_token_accuracy": 0.7043869346380234, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.6523605150214592, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.5650515070583746e-05, |
|
"loss": 1.0721, |
|
"mean_token_accuracy": 0.7013254553079605, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.6542680019074869, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.5637797278392472e-05, |
|
"loss": 1.0065, |
|
"mean_token_accuracy": 0.7042224168777466, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.6561754887935145, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.5625079486201197e-05, |
|
"loss": 1.0739, |
|
"mean_token_accuracy": 0.6945244207978248, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.6580829756795422, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.561236169400992e-05, |
|
"loss": 1.0859, |
|
"mean_token_accuracy": 0.6950926452875137, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.6599904625655698, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.5599643901818645e-05, |
|
"loss": 0.9103, |
|
"mean_token_accuracy": 0.7312288954854012, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.6618979494515975, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.558692610962737e-05, |
|
"loss": 0.9913, |
|
"mean_token_accuracy": 0.7258316233754158, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.6638054363376252, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 1.5574208317436096e-05, |
|
"loss": 1.1365, |
|
"mean_token_accuracy": 0.6847386986017228, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.6657129232236528, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.5561490525244818e-05, |
|
"loss": 0.9208, |
|
"mean_token_accuracy": 0.7318013399839401, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.6676204101096805, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.5548772733053544e-05, |
|
"loss": 1.0381, |
|
"mean_token_accuracy": 0.7117675840854645, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6695278969957081, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.553605494086227e-05, |
|
"loss": 0.9807, |
|
"mean_token_accuracy": 0.7145498290657997, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.6714353838817358, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.552333714867099e-05, |
|
"loss": 0.9746, |
|
"mean_token_accuracy": 0.7150619328022003, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.6733428707677634, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.5510619356479717e-05, |
|
"loss": 1.0595, |
|
"mean_token_accuracy": 0.7089547023177147, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.6752503576537912, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.549790156428844e-05, |
|
"loss": 1.0631, |
|
"mean_token_accuracy": 0.7015356197953224, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.6771578445398188, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 1.5485183772097164e-05, |
|
"loss": 1.0188, |
|
"mean_token_accuracy": 0.7043671816587448, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.6790653314258465, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.547246597990589e-05, |
|
"loss": 1.017, |
|
"mean_token_accuracy": 0.707620695233345, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.6809728183118741, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.5459748187714615e-05, |
|
"loss": 1.0261, |
|
"mean_token_accuracy": 0.7107647344470024, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.6828803051979018, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.5447030395523338e-05, |
|
"loss": 0.976, |
|
"mean_token_accuracy": 0.7236540615558624, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.6847877920839294, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.5434312603332063e-05, |
|
"loss": 1.0489, |
|
"mean_token_accuracy": 0.7052806288003921, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.6866952789699571, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.542159481114079e-05, |
|
"loss": 1.0648, |
|
"mean_token_accuracy": 0.6937704533338547, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6886027658559848, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.540887701894951e-05, |
|
"loss": 1.0472, |
|
"mean_token_accuracy": 0.7000033929944038, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.6905102527420124, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.5396159226758236e-05, |
|
"loss": 0.9398, |
|
"mean_token_accuracy": 0.7206048682332039, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.6924177396280401, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.538344143456696e-05, |
|
"loss": 1.078, |
|
"mean_token_accuracy": 0.6949393272399902, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.6943252265140677, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.5370723642375684e-05, |
|
"loss": 0.9936, |
|
"mean_token_accuracy": 0.716967236995697, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.6962327134000954, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.535800585018441e-05, |
|
"loss": 1.0597, |
|
"mean_token_accuracy": 0.7033153355121613, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.698140200286123, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.534528805799313e-05, |
|
"loss": 1.0689, |
|
"mean_token_accuracy": 0.6991181001067162, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.7000476871721507, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.5332570265801857e-05, |
|
"loss": 1.0807, |
|
"mean_token_accuracy": 0.6999934658408165, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.7019551740581783, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.5319852473610582e-05, |
|
"loss": 0.9682, |
|
"mean_token_accuracy": 0.723272667825222, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.703862660944206, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.5307134681419308e-05, |
|
"loss": 1.031, |
|
"mean_token_accuracy": 0.6966592162847519, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.7057701478302336, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.5294416889228033e-05, |
|
"loss": 1.0431, |
|
"mean_token_accuracy": 0.714342576265335, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7076776347162613, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.5281699097036755e-05, |
|
"loss": 1.0259, |
|
"mean_token_accuracy": 0.7083830907940865, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.709585121602289, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.526898130484548e-05, |
|
"loss": 0.9847, |
|
"mean_token_accuracy": 0.7163553655147552, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7114926084883166, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 1.5256263512654203e-05, |
|
"loss": 0.9951, |
|
"mean_token_accuracy": 0.7172233253717423, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.7134000953743443, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.5243545720462929e-05, |
|
"loss": 1.0708, |
|
"mean_token_accuracy": 0.6974482744932174, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.7153075822603719, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.5230827928271654e-05, |
|
"loss": 1.0573, |
|
"mean_token_accuracy": 0.7091765016317367, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.7172150691463997, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.5218110136080378e-05, |
|
"loss": 0.969, |
|
"mean_token_accuracy": 0.7144028797745705, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7191225560324273, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.5205392343889103e-05, |
|
"loss": 1.0841, |
|
"mean_token_accuracy": 0.696842522919178, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.721030042918455, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.5192674551697825e-05, |
|
"loss": 0.9193, |
|
"mean_token_accuracy": 0.7247643247246742, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.7229375298044826, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.5179956759506551e-05, |
|
"loss": 0.9693, |
|
"mean_token_accuracy": 0.7202573090791702, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.7248450166905103, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.5167238967315275e-05, |
|
"loss": 0.9176, |
|
"mean_token_accuracy": 0.7282809093594551, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7267525035765379, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.5154521175124e-05, |
|
"loss": 0.9122, |
|
"mean_token_accuracy": 0.7338912770152092, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.7286599904625656, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.5141803382932724e-05, |
|
"loss": 1.0756, |
|
"mean_token_accuracy": 0.6949716225266457, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.7305674773485932, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.5129085590741448e-05, |
|
"loss": 1.0789, |
|
"mean_token_accuracy": 0.6967040143907071, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.7324749642346209, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.5116367798550173e-05, |
|
"loss": 0.9752, |
|
"mean_token_accuracy": 0.7204620942473412, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7343824511206486, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.5103650006358897e-05, |
|
"loss": 1.0254, |
|
"mean_token_accuracy": 0.7050608977675438, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.7362899380066762, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.5090932214167621e-05, |
|
"loss": 0.9845, |
|
"mean_token_accuracy": 0.718043963611126, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.7381974248927039, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.5078214421976347e-05, |
|
"loss": 0.9798, |
|
"mean_token_accuracy": 0.7155537083745003, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.7401049117787315, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.506549662978507e-05, |
|
"loss": 0.9766, |
|
"mean_token_accuracy": 0.7140335828065872, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.7420123986647592, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.5052778837593796e-05, |
|
"loss": 1.048, |
|
"mean_token_accuracy": 0.6943504139780998, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.7439198855507868, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.5040061045402518e-05, |
|
"loss": 0.9615, |
|
"mean_token_accuracy": 0.7220544546842576, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7458273724368145, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.5027343253211243e-05, |
|
"loss": 0.9978, |
|
"mean_token_accuracy": 0.7115991845726967, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.7477348593228421, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.5014625461019967e-05, |
|
"loss": 0.9384, |
|
"mean_token_accuracy": 0.7263565197587013, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.7496423462088698, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.5001907668828693e-05, |
|
"loss": 1.0709, |
|
"mean_token_accuracy": 0.6875549122691155, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.7515498330948974, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.4989189876637418e-05, |
|
"loss": 1.0374, |
|
"mean_token_accuracy": 0.7037866428494454, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.7534573199809251, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.497647208444614e-05, |
|
"loss": 0.9627, |
|
"mean_token_accuracy": 0.7117268234491348, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.7553648068669528, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.4963754292254866e-05, |
|
"loss": 0.9921, |
|
"mean_token_accuracy": 0.7058862507343292, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.7572722937529804, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.495103650006359e-05, |
|
"loss": 0.961, |
|
"mean_token_accuracy": 0.7215966627001762, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.7591797806390082, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.4938318707872315e-05, |
|
"loss": 0.8637, |
|
"mean_token_accuracy": 0.7477660223841667, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.7610872675250357, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.492560091568104e-05, |
|
"loss": 0.9912, |
|
"mean_token_accuracy": 0.7125525683164596, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.7629947544110635, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.4912883123489763e-05, |
|
"loss": 0.9836, |
|
"mean_token_accuracy": 0.7163909748196602, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7649022412970911, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.4900165331298488e-05, |
|
"loss": 0.9218, |
|
"mean_token_accuracy": 0.718401075899601, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.7668097281831188, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.4887447539107212e-05, |
|
"loss": 1.07, |
|
"mean_token_accuracy": 0.7017799213528633, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.7687172150691464, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.4874729746915938e-05, |
|
"loss": 1.0801, |
|
"mean_token_accuracy": 0.689344696700573, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.7706247019551741, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.486201195472466e-05, |
|
"loss": 1.0465, |
|
"mean_token_accuracy": 0.7119330614805222, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.7725321888412017, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.4849294162533385e-05, |
|
"loss": 0.9299, |
|
"mean_token_accuracy": 0.7306962683796883, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.7744396757272294, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.483657637034211e-05, |
|
"loss": 1.0421, |
|
"mean_token_accuracy": 0.7032359898090362, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.776347162613257, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.4823858578150834e-05, |
|
"loss": 1.004, |
|
"mean_token_accuracy": 0.7154980972409248, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.7782546494992847, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.4811140785959558e-05, |
|
"loss": 0.9309, |
|
"mean_token_accuracy": 0.7144239723682404, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.7801621363853124, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.4798422993768282e-05, |
|
"loss": 0.9303, |
|
"mean_token_accuracy": 0.7244376420974732, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.78206962327134, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.4785705201577008e-05, |
|
"loss": 1.0593, |
|
"mean_token_accuracy": 0.7060457020998001, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7839771101573677, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.4772987409385733e-05, |
|
"loss": 0.9526, |
|
"mean_token_accuracy": 0.724238371104002, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.7858845970433953, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 1.4760269617194455e-05, |
|
"loss": 1.0683, |
|
"mean_token_accuracy": 0.6990483224391937, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.787792083929423, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.474755182500318e-05, |
|
"loss": 0.9964, |
|
"mean_token_accuracy": 0.7195842653512955, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.7896995708154506, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.4734834032811905e-05, |
|
"loss": 0.936, |
|
"mean_token_accuracy": 0.7289351716637611, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.7916070577014783, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.472211624062063e-05, |
|
"loss": 1.0424, |
|
"mean_token_accuracy": 0.7045046918094158, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.7935145445875059, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.4709398448429352e-05, |
|
"loss": 0.9917, |
|
"mean_token_accuracy": 0.7202805817127228, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.7954220314735336, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.4696680656238078e-05, |
|
"loss": 0.9864, |
|
"mean_token_accuracy": 0.7165962055325508, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.7973295183595612, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.4683962864046803e-05, |
|
"loss": 1.003, |
|
"mean_token_accuracy": 0.7066600769758224, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.7992370052455889, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.4671245071855527e-05, |
|
"loss": 0.9686, |
|
"mean_token_accuracy": 0.7235840618610382, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.8011444921316166, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.4658527279664252e-05, |
|
"loss": 1.0147, |
|
"mean_token_accuracy": 0.7132967829704284, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8030519790176442, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.4645809487472975e-05, |
|
"loss": 1.0229, |
|
"mean_token_accuracy": 0.7088969826698304, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.804959465903672, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.46330916952817e-05, |
|
"loss": 1.0721, |
|
"mean_token_accuracy": 0.6959314554929733, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.8068669527896996, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.4620373903090426e-05, |
|
"loss": 1.0519, |
|
"mean_token_accuracy": 0.7039619326591492, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.8087744396757273, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.460765611089915e-05, |
|
"loss": 0.9795, |
|
"mean_token_accuracy": 0.7203118950128555, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.8106819265617549, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.4594938318707875e-05, |
|
"loss": 0.9688, |
|
"mean_token_accuracy": 0.721127749979496, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.8125894134477826, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.4582220526516597e-05, |
|
"loss": 1.0835, |
|
"mean_token_accuracy": 0.6876327477395534, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.8144969003338102, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.4569502734325322e-05, |
|
"loss": 1.0776, |
|
"mean_token_accuracy": 0.6945757657289505, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.8164043872198379, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.4556784942134046e-05, |
|
"loss": 1.0365, |
|
"mean_token_accuracy": 0.7062824577093124, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.8183118741058655, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.4544067149942772e-05, |
|
"loss": 1.0154, |
|
"mean_token_accuracy": 0.7210542999207974, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.8202193609918932, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.4531349357751496e-05, |
|
"loss": 1.1141, |
|
"mean_token_accuracy": 0.6904696643352508, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.8221268478779208, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.451863156556022e-05, |
|
"loss": 0.9764, |
|
"mean_token_accuracy": 0.7087605282664299, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.8240343347639485, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.4505913773368945e-05, |
|
"loss": 0.9799, |
|
"mean_token_accuracy": 0.704703937470913, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.8259418216499762, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.4493195981177669e-05, |
|
"loss": 1.001, |
|
"mean_token_accuracy": 0.7080218985676765, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.8278493085360038, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.4480478188986392e-05, |
|
"loss": 1.0226, |
|
"mean_token_accuracy": 0.7065225437283515, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.8297567954220315, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.4467760396795118e-05, |
|
"loss": 0.9912, |
|
"mean_token_accuracy": 0.7132961705327034, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.8316642823080591, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.4455042604603842e-05, |
|
"loss": 0.9855, |
|
"mean_token_accuracy": 0.7152161702513695, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.8335717691940868, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.4442324812412567e-05, |
|
"loss": 1.0422, |
|
"mean_token_accuracy": 0.7072960436344147, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.8354792560801144, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.442960702022129e-05, |
|
"loss": 0.971, |
|
"mean_token_accuracy": 0.7153645426034927, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.8373867429661421, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.4416889228030015e-05, |
|
"loss": 0.993, |
|
"mean_token_accuracy": 0.7126340731978417, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.8392942298521697, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.4404171435838739e-05, |
|
"loss": 0.954, |
|
"mean_token_accuracy": 0.727930200099945, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8412017167381974, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.4391453643647464e-05, |
|
"loss": 0.9531, |
|
"mean_token_accuracy": 0.7239082336425782, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.843109203624225, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.437873585145619e-05, |
|
"loss": 1.0409, |
|
"mean_token_accuracy": 0.7055112421512604, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.8450166905102527, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.4366018059264912e-05, |
|
"loss": 0.9722, |
|
"mean_token_accuracy": 0.7141941577196121, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.8469241773962805, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.4353300267073637e-05, |
|
"loss": 1.0716, |
|
"mean_token_accuracy": 0.6948425002396107, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.848831664282308, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.4340582474882361e-05, |
|
"loss": 0.985, |
|
"mean_token_accuracy": 0.7037705272436142, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.8507391511683358, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.4327864682691087e-05, |
|
"loss": 1.0183, |
|
"mean_token_accuracy": 0.7084992684423923, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.8526466380543634, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.4315146890499812e-05, |
|
"loss": 1.0079, |
|
"mean_token_accuracy": 0.7166560679674149, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.8545541249403911, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.4302429098308534e-05, |
|
"loss": 0.9975, |
|
"mean_token_accuracy": 0.7056717827916146, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.8564616118264187, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.428971130611726e-05, |
|
"loss": 1.0258, |
|
"mean_token_accuracy": 0.7125526055693626, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.8583690987124464, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.4276993513925984e-05, |
|
"loss": 1.0043, |
|
"mean_token_accuracy": 0.7112932525575161, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.860276585598474, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.4264275721734709e-05, |
|
"loss": 0.9908, |
|
"mean_token_accuracy": 0.7215219050645828, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.8621840724845017, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.4251557929543431e-05, |
|
"loss": 0.9868, |
|
"mean_token_accuracy": 0.711732842028141, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.8640915593705293, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.4238840137352157e-05, |
|
"loss": 1.0097, |
|
"mean_token_accuracy": 0.6953270882368088, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.865999046256557, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.4226122345160882e-05, |
|
"loss": 0.9891, |
|
"mean_token_accuracy": 0.7083195835351944, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.8679065331425846, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.4213404552969606e-05, |
|
"loss": 0.9258, |
|
"mean_token_accuracy": 0.7308154091238975, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.8698140200286123, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.420068676077833e-05, |
|
"loss": 1.0778, |
|
"mean_token_accuracy": 0.7014591008424759, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.87172150691464, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.4187968968587054e-05, |
|
"loss": 0.983, |
|
"mean_token_accuracy": 0.7186863839626312, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.8736289938006676, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.4175251176395779e-05, |
|
"loss": 0.9944, |
|
"mean_token_accuracy": 0.7090497985482216, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.8755364806866953, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.4162533384204505e-05, |
|
"loss": 1.0525, |
|
"mean_token_accuracy": 0.6964134000241756, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.8774439675727229, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.4149815592013227e-05, |
|
"loss": 0.9733, |
|
"mean_token_accuracy": 0.7252121046185493, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8793514544587506, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.4137097799821952e-05, |
|
"loss": 0.959, |
|
"mean_token_accuracy": 0.7168088331818581, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.8812589413447782, |
|
"grad_norm": 6.875, |
|
"learning_rate": 1.4124380007630676e-05, |
|
"loss": 1.049, |
|
"mean_token_accuracy": 0.7068444952368736, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.8831664282308059, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.4111662215439401e-05, |
|
"loss": 0.9387, |
|
"mean_token_accuracy": 0.7282044783234596, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.8850739151168335, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.4098944423248124e-05, |
|
"loss": 1.0492, |
|
"mean_token_accuracy": 0.6986595824360847, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.8869814020028612, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.4086226631056849e-05, |
|
"loss": 1.0166, |
|
"mean_token_accuracy": 0.70717094540596, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.4073508838865575e-05, |
|
"loss": 1.0353, |
|
"mean_token_accuracy": 0.7046495825052261, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.8907963757749165, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.4060791046674298e-05, |
|
"loss": 1.004, |
|
"mean_token_accuracy": 0.7123620569705963, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.8927038626609443, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.4048073254483024e-05, |
|
"loss": 0.9766, |
|
"mean_token_accuracy": 0.7214268952608108, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.8946113495469719, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.4035355462291746e-05, |
|
"loss": 1.0042, |
|
"mean_token_accuracy": 0.7193110853433609, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.8965188364329996, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.4022637670100472e-05, |
|
"loss": 0.9944, |
|
"mean_token_accuracy": 0.7094147495925427, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8984263233190272, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.4009919877909197e-05, |
|
"loss": 0.9404, |
|
"mean_token_accuracy": 0.7051554054021836, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.9003338102050549, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.399720208571792e-05, |
|
"loss": 0.9273, |
|
"mean_token_accuracy": 0.7224482171237468, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.9022412970910825, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.3984484293526646e-05, |
|
"loss": 0.9338, |
|
"mean_token_accuracy": 0.7283406421542168, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.9041487839771102, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.3971766501335368e-05, |
|
"loss": 0.963, |
|
"mean_token_accuracy": 0.7229938983917237, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.9060562708631378, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.3959048709144094e-05, |
|
"loss": 0.9619, |
|
"mean_token_accuracy": 0.7297323048114777, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.9079637577491655, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.3946330916952818e-05, |
|
"loss": 0.9825, |
|
"mean_token_accuracy": 0.7321062237024307, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.9098712446351931, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.3933613124761543e-05, |
|
"loss": 0.9912, |
|
"mean_token_accuracy": 0.7135132804512978, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.9117787315212208, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.3920895332570267e-05, |
|
"loss": 0.9794, |
|
"mean_token_accuracy": 0.7198825925588608, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.9136862184072484, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.3908177540378991e-05, |
|
"loss": 1.0248, |
|
"mean_token_accuracy": 0.7084431156516076, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.9155937052932761, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.3895459748187716e-05, |
|
"loss": 0.9445, |
|
"mean_token_accuracy": 0.7225293383002281, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9175011921793038, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.388274195599644e-05, |
|
"loss": 0.9686, |
|
"mean_token_accuracy": 0.7221581146121026, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.9194086790653314, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.3870024163805164e-05, |
|
"loss": 0.8985, |
|
"mean_token_accuracy": 0.73746228069067, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.9213161659513591, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 1.385730637161389e-05, |
|
"loss": 1.0963, |
|
"mean_token_accuracy": 0.6906205818057061, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.9232236528373867, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.3844588579422613e-05, |
|
"loss": 1.0587, |
|
"mean_token_accuracy": 0.6952868282794953, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.9251311397234144, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.3831870787231339e-05, |
|
"loss": 1.0485, |
|
"mean_token_accuracy": 0.7097712486982346, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.927038626609442, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.3819152995040061e-05, |
|
"loss": 0.9926, |
|
"mean_token_accuracy": 0.7256608709692955, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.9289461134954697, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.3806435202848786e-05, |
|
"loss": 0.9483, |
|
"mean_token_accuracy": 0.7213496834039688, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.9308536003814973, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.379371741065751e-05, |
|
"loss": 0.9997, |
|
"mean_token_accuracy": 0.71124257594347, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.932761087267525, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.3780999618466236e-05, |
|
"loss": 0.9698, |
|
"mean_token_accuracy": 0.7182813182473182, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.9346685741535526, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.3768281826274961e-05, |
|
"loss": 1.0038, |
|
"mean_token_accuracy": 0.7101509183645248, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.9365760610395804, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 1.3755564034083683e-05, |
|
"loss": 1.0164, |
|
"mean_token_accuracy": 0.7154742404818535, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.9384835479256081, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.3742846241892409e-05, |
|
"loss": 0.9209, |
|
"mean_token_accuracy": 0.7325319960713387, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.9403910348116357, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.3730128449701133e-05, |
|
"loss": 0.9261, |
|
"mean_token_accuracy": 0.7324376836419105, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.9422985216976634, |
|
"grad_norm": 6.625, |
|
"learning_rate": 1.3717410657509858e-05, |
|
"loss": 1.0196, |
|
"mean_token_accuracy": 0.6997216045856476, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.944206008583691, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.3704692865318584e-05, |
|
"loss": 0.9358, |
|
"mean_token_accuracy": 0.7200413174927235, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.9461134954697187, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.3691975073127306e-05, |
|
"loss": 0.9271, |
|
"mean_token_accuracy": 0.7231632620096207, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.9480209823557463, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.3679257280936031e-05, |
|
"loss": 0.9233, |
|
"mean_token_accuracy": 0.7239905461668968, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.949928469241774, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.3666539488744755e-05, |
|
"loss": 0.869, |
|
"mean_token_accuracy": 0.7495769336819649, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.9518359561278016, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.365382169655348e-05, |
|
"loss": 1.0349, |
|
"mean_token_accuracy": 0.7054173357784748, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.9537434430138293, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 1.3641103904362203e-05, |
|
"loss": 0.9375, |
|
"mean_token_accuracy": 0.7280614987015724, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9556509298998569, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 1.3628386112170928e-05, |
|
"loss": 0.9994, |
|
"mean_token_accuracy": 0.7150517120957375, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.9575584167858846, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.3615668319979654e-05, |
|
"loss": 0.9513, |
|
"mean_token_accuracy": 0.732499985396862, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.9594659036719122, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.3602950527788377e-05, |
|
"loss": 0.9393, |
|
"mean_token_accuracy": 0.7373679198324681, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.9613733905579399, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.3590232735597101e-05, |
|
"loss": 0.9434, |
|
"mean_token_accuracy": 0.7251186773180962, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.9632808774439676, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.3577514943405825e-05, |
|
"loss": 1.0031, |
|
"mean_token_accuracy": 0.7138618201017379, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.9651883643299952, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.356479715121455e-05, |
|
"loss": 0.9804, |
|
"mean_token_accuracy": 0.7263573125004769, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.9670958512160229, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.3552079359023276e-05, |
|
"loss": 0.9581, |
|
"mean_token_accuracy": 0.7182695418596268, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.9690033381020505, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.3539361566831998e-05, |
|
"loss": 1.0186, |
|
"mean_token_accuracy": 0.7014960646629333, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.9709108249880782, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.3526643774640724e-05, |
|
"loss": 0.9072, |
|
"mean_token_accuracy": 0.7355842962861061, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.9728183118741058, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.3513925982449447e-05, |
|
"loss": 0.9532, |
|
"mean_token_accuracy": 0.7224856913089752, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.9747257987601335, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.3501208190258173e-05, |
|
"loss": 1.0263, |
|
"mean_token_accuracy": 0.7150218620896339, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.9766332856461611, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.3488490398066895e-05, |
|
"loss": 0.979, |
|
"mean_token_accuracy": 0.7201207339763641, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.9785407725321889, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.347577260587562e-05, |
|
"loss": 1.0678, |
|
"mean_token_accuracy": 0.7005651786923408, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.9804482594182165, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.3463054813684346e-05, |
|
"loss": 0.9428, |
|
"mean_token_accuracy": 0.719219633936882, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.9823557463042442, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.345033702149307e-05, |
|
"loss": 0.9287, |
|
"mean_token_accuracy": 0.7292002603411675, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.9842632331902719, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.3437619229301795e-05, |
|
"loss": 0.8934, |
|
"mean_token_accuracy": 0.7359087854623795, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.9861707200762995, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.3424901437110517e-05, |
|
"loss": 0.9668, |
|
"mean_token_accuracy": 0.7158159494400025, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.9880782069623272, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.3412183644919243e-05, |
|
"loss": 0.9261, |
|
"mean_token_accuracy": 0.7298913650214672, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.9899856938483548, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.3399465852727968e-05, |
|
"loss": 0.9487, |
|
"mean_token_accuracy": 0.7226217985153198, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.9918931807343825, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.3386748060536692e-05, |
|
"loss": 0.9619, |
|
"mean_token_accuracy": 0.7170055121183395, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.9938006676204101, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.3374030268345418e-05, |
|
"loss": 1.1289, |
|
"mean_token_accuracy": 0.6791918635368347, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.9957081545064378, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.336131247615414e-05, |
|
"loss": 0.9903, |
|
"mean_token_accuracy": 0.7179454803466797, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.9976156413924654, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.3348594683962865e-05, |
|
"loss": 0.9445, |
|
"mean_token_accuracy": 0.7281292766332627, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.9995231282784931, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 1.333587689177159e-05, |
|
"loss": 0.9597, |
|
"mean_token_accuracy": 0.7316592544317245, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.0011444921316166, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.3323159099580315e-05, |
|
"loss": 0.8169, |
|
"mean_token_accuracy": 0.7703618687741897, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.0030519790176442, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.3310441307389039e-05, |
|
"loss": 0.8116, |
|
"mean_token_accuracy": 0.7598389968276024, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.004959465903672, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.3297723515197762e-05, |
|
"loss": 0.6731, |
|
"mean_token_accuracy": 0.8001046404242516, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 1.0068669527896996, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.3285005723006488e-05, |
|
"loss": 0.7272, |
|
"mean_token_accuracy": 0.7857492476701736, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.0087744396757272, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.3272287930815212e-05, |
|
"loss": 0.6981, |
|
"mean_token_accuracy": 0.7862312287092209, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 1.0106819265617548, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.3259570138623935e-05, |
|
"loss": 0.7172, |
|
"mean_token_accuracy": 0.7825499027967453, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.0125894134477826, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.324685234643266e-05, |
|
"loss": 0.7289, |
|
"mean_token_accuracy": 0.777584858238697, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 1.0144969003338102, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.3234134554241385e-05, |
|
"loss": 0.7725, |
|
"mean_token_accuracy": 0.774682505428791, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.0164043872198378, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.322141676205011e-05, |
|
"loss": 0.714, |
|
"mean_token_accuracy": 0.784676456451416, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 1.0183118741058654, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.3208698969858832e-05, |
|
"loss": 0.701, |
|
"mean_token_accuracy": 0.7884877413511276, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.0202193609918933, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.3195981177667558e-05, |
|
"loss": 0.8432, |
|
"mean_token_accuracy": 0.7497532099485398, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 1.0221268478779209, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.3183263385476282e-05, |
|
"loss": 0.7722, |
|
"mean_token_accuracy": 0.7718176633119583, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.0240343347639485, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.3170545593285007e-05, |
|
"loss": 0.6933, |
|
"mean_token_accuracy": 0.7811264783143997, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 1.025941821649976, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.3157827801093733e-05, |
|
"loss": 0.7905, |
|
"mean_token_accuracy": 0.7633818849921227, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.0278493085360039, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.3145110008902455e-05, |
|
"loss": 0.6934, |
|
"mean_token_accuracy": 0.7909208744764328, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 1.0297567954220315, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 1.313239221671118e-05, |
|
"loss": 0.7197, |
|
"mean_token_accuracy": 0.7824130252003669, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.031664282308059, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.3119674424519904e-05, |
|
"loss": 0.795, |
|
"mean_token_accuracy": 0.7646695077419281, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 1.0335717691940869, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.310695663232863e-05, |
|
"loss": 0.7501, |
|
"mean_token_accuracy": 0.7809911444783211, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.0354792560801145, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.3094238840137352e-05, |
|
"loss": 0.707, |
|
"mean_token_accuracy": 0.7913937494158745, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 1.037386742966142, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.3081521047946077e-05, |
|
"loss": 0.63, |
|
"mean_token_accuracy": 0.8109789371490479, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.0392942298521697, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 1.3068803255754803e-05, |
|
"loss": 0.7498, |
|
"mean_token_accuracy": 0.7778886809945107, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 1.0412017167381975, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.3056085463563526e-05, |
|
"loss": 0.7035, |
|
"mean_token_accuracy": 0.7928727805614472, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.043109203624225, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.3043367671372252e-05, |
|
"loss": 0.7275, |
|
"mean_token_accuracy": 0.7794766634702682, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 1.0450166905102527, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.3030649879180974e-05, |
|
"loss": 0.7762, |
|
"mean_token_accuracy": 0.770900085568428, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.0469241773962803, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.30179320869897e-05, |
|
"loss": 0.685, |
|
"mean_token_accuracy": 0.7873307526111603, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 1.0488316642823081, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.3005214294798425e-05, |
|
"loss": 0.7436, |
|
"mean_token_accuracy": 0.7779841095209121, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.0507391511683357, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.2992496502607149e-05, |
|
"loss": 0.7905, |
|
"mean_token_accuracy": 0.7763501286506653, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 1.0526466380543633, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.2979778710415873e-05, |
|
"loss": 0.7612, |
|
"mean_token_accuracy": 0.7685739085078239, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.0545541249403911, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.2967060918224597e-05, |
|
"loss": 0.6927, |
|
"mean_token_accuracy": 0.7887519240379334, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 1.0564616118264187, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.2954343126033322e-05, |
|
"loss": 0.7351, |
|
"mean_token_accuracy": 0.7762645557522774, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.0583690987124463, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.2941625333842046e-05, |
|
"loss": 0.7275, |
|
"mean_token_accuracy": 0.7789256662130356, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 1.060276585598474, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.292890754165077e-05, |
|
"loss": 0.6901, |
|
"mean_token_accuracy": 0.7860920026898384, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.0621840724845018, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.2916189749459495e-05, |
|
"loss": 0.6762, |
|
"mean_token_accuracy": 0.7937915295362472, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 1.0640915593705293, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.2903471957268219e-05, |
|
"loss": 0.7657, |
|
"mean_token_accuracy": 0.7764147505164146, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.065999046256557, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.2890754165076944e-05, |
|
"loss": 0.6942, |
|
"mean_token_accuracy": 0.7820986464619637, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 1.0679065331425845, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.2878036372885667e-05, |
|
"loss": 0.7147, |
|
"mean_token_accuracy": 0.7832359343767166, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.0698140200286124, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.2865318580694392e-05, |
|
"loss": 0.7236, |
|
"mean_token_accuracy": 0.7760476425290108, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 1.07172150691464, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.2852600788503118e-05, |
|
"loss": 0.6761, |
|
"mean_token_accuracy": 0.7974345803260803, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.0736289938006676, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.2839882996311841e-05, |
|
"loss": 0.6539, |
|
"mean_token_accuracy": 0.7917591854929924, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 1.0755364806866954, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.2827165204120567e-05, |
|
"loss": 0.7374, |
|
"mean_token_accuracy": 0.7774428620934486, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.077443967572723, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.2814447411929289e-05, |
|
"loss": 0.723, |
|
"mean_token_accuracy": 0.7769853323698044, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 1.0793514544587506, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 1.2801729619738014e-05, |
|
"loss": 0.7696, |
|
"mean_token_accuracy": 0.766608040034771, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.0812589413447782, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.2789011827546738e-05, |
|
"loss": 0.7817, |
|
"mean_token_accuracy": 0.769975657761097, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 1.083166428230806, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.2776294035355464e-05, |
|
"loss": 0.7443, |
|
"mean_token_accuracy": 0.7756567880511284, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.0850739151168336, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.276357624316419e-05, |
|
"loss": 0.7636, |
|
"mean_token_accuracy": 0.7678534090518951, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 1.0869814020028612, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 1.2750858450972911e-05, |
|
"loss": 0.7688, |
|
"mean_token_accuracy": 0.7699078634381294, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.0888888888888888, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.2738140658781637e-05, |
|
"loss": 0.6738, |
|
"mean_token_accuracy": 0.7892726883292198, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 1.0907963757749166, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.272542286659036e-05, |
|
"loss": 0.6766, |
|
"mean_token_accuracy": 0.7931236639618874, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.0927038626609442, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.2712705074399086e-05, |
|
"loss": 0.6785, |
|
"mean_token_accuracy": 0.7956051483750344, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 1.0946113495469718, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.269998728220781e-05, |
|
"loss": 0.7536, |
|
"mean_token_accuracy": 0.7718766152858734, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.0965188364329994, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.2687269490016534e-05, |
|
"loss": 0.7712, |
|
"mean_token_accuracy": 0.7525050655007363, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 1.0984263233190272, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.267455169782526e-05, |
|
"loss": 0.801, |
|
"mean_token_accuracy": 0.7586658120155334, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.1003338102050548, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.2661833905633983e-05, |
|
"loss": 0.7505, |
|
"mean_token_accuracy": 0.7702452227473259, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 1.1022412970910824, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.2649116113442707e-05, |
|
"loss": 0.7451, |
|
"mean_token_accuracy": 0.7831007912755013, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.1041487839771102, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.263639832125143e-05, |
|
"loss": 0.7195, |
|
"mean_token_accuracy": 0.7821832373738289, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 1.1060562708631378, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.2623680529060156e-05, |
|
"loss": 0.7329, |
|
"mean_token_accuracy": 0.7784585759043694, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.1079637577491654, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.2610962736868882e-05, |
|
"loss": 0.7102, |
|
"mean_token_accuracy": 0.7830314084887504, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 1.109871244635193, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.2598244944677604e-05, |
|
"loss": 0.7185, |
|
"mean_token_accuracy": 0.7852053195238113, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.1117787315212209, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.258552715248633e-05, |
|
"loss": 0.7364, |
|
"mean_token_accuracy": 0.7896745055913925, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 1.1136862184072485, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.2572809360295053e-05, |
|
"loss": 0.7305, |
|
"mean_token_accuracy": 0.7816007971763611, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.115593705293276, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.2560091568103779e-05, |
|
"loss": 0.6798, |
|
"mean_token_accuracy": 0.7947070822119713, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 1.1175011921793039, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.2547373775912504e-05, |
|
"loss": 0.7053, |
|
"mean_token_accuracy": 0.7792468458414078, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.1194086790653315, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.2534655983721226e-05, |
|
"loss": 0.7326, |
|
"mean_token_accuracy": 0.7784487336874009, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 1.121316165951359, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.2521938191529952e-05, |
|
"loss": 0.7323, |
|
"mean_token_accuracy": 0.7857000678777695, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.1232236528373867, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.2509220399338676e-05, |
|
"loss": 0.7108, |
|
"mean_token_accuracy": 0.774834556877613, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 1.1251311397234145, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.2496502607147401e-05, |
|
"loss": 0.763, |
|
"mean_token_accuracy": 0.7830505579710006, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.127038626609442, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.2483784814956123e-05, |
|
"loss": 0.7907, |
|
"mean_token_accuracy": 0.7585530668497086, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 1.1289461134954697, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.2471067022764849e-05, |
|
"loss": 0.6787, |
|
"mean_token_accuracy": 0.8010306641459465, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.1308536003814973, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 1.2458349230573574e-05, |
|
"loss": 0.7014, |
|
"mean_token_accuracy": 0.7869263395667077, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 1.1327610872675251, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.2445631438382298e-05, |
|
"loss": 0.6731, |
|
"mean_token_accuracy": 0.7962010264396667, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.1346685741535527, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.2432913646191023e-05, |
|
"loss": 0.6931, |
|
"mean_token_accuracy": 0.7810707330703736, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 1.1365760610395803, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.2420195853999746e-05, |
|
"loss": 0.6993, |
|
"mean_token_accuracy": 0.7831882372498512, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.138483547925608, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.2407478061808471e-05, |
|
"loss": 0.7372, |
|
"mean_token_accuracy": 0.7778501972556114, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 1.1403910348116357, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.2394760269617197e-05, |
|
"loss": 0.7833, |
|
"mean_token_accuracy": 0.7658291473984719, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.1422985216976633, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.238204247742592e-05, |
|
"loss": 0.7703, |
|
"mean_token_accuracy": 0.7754868105053901, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 1.144206008583691, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.2369324685234644e-05, |
|
"loss": 0.7596, |
|
"mean_token_accuracy": 0.7784150972962379, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.1461134954697187, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.2356606893043368e-05, |
|
"loss": 0.699, |
|
"mean_token_accuracy": 0.8043418154120445, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 1.1480209823557463, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.2343889100852093e-05, |
|
"loss": 0.7489, |
|
"mean_token_accuracy": 0.7779771253466606, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.149928469241774, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.2331171308660817e-05, |
|
"loss": 0.7085, |
|
"mean_token_accuracy": 0.7849825263023377, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 1.1518359561278015, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.2318453516469541e-05, |
|
"loss": 0.7101, |
|
"mean_token_accuracy": 0.7911660373210907, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.1537434430138294, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.2305735724278267e-05, |
|
"loss": 0.6447, |
|
"mean_token_accuracy": 0.8056324139237404, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 1.155650929899857, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.229301793208699e-05, |
|
"loss": 0.7109, |
|
"mean_token_accuracy": 0.7876276299357414, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.1575584167858846, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.2280300139895716e-05, |
|
"loss": 0.8102, |
|
"mean_token_accuracy": 0.7673216596245765, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 1.1594659036719124, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.2267582347704438e-05, |
|
"loss": 0.7131, |
|
"mean_token_accuracy": 0.7869996011257172, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.16137339055794, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.2254864555513164e-05, |
|
"loss": 0.7036, |
|
"mean_token_accuracy": 0.7789271324872971, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 1.1632808774439676, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.2242146763321889e-05, |
|
"loss": 0.6824, |
|
"mean_token_accuracy": 0.7872760623693467, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.1651883643299952, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.2229428971130613e-05, |
|
"loss": 0.7335, |
|
"mean_token_accuracy": 0.7788919404149055, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 1.1670958512160228, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.2216711178939338e-05, |
|
"loss": 0.7988, |
|
"mean_token_accuracy": 0.7685728743672371, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.1690033381020506, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.220399338674806e-05, |
|
"loss": 0.7423, |
|
"mean_token_accuracy": 0.7701018214225769, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 1.1709108249880782, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 1.2191275594556786e-05, |
|
"loss": 0.7445, |
|
"mean_token_accuracy": 0.7846373036503792, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.1728183118741058, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.217855780236551e-05, |
|
"loss": 0.6958, |
|
"mean_token_accuracy": 0.7816796407103539, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 1.1747257987601336, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.2165840010174235e-05, |
|
"loss": 0.6823, |
|
"mean_token_accuracy": 0.7943092510104179, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.1766332856461612, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.215312221798296e-05, |
|
"loss": 0.6901, |
|
"mean_token_accuracy": 0.7862473502755165, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 1.1785407725321888, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.2140404425791683e-05, |
|
"loss": 0.6357, |
|
"mean_token_accuracy": 0.8068152844905854, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.1804482594182164, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.2127686633600408e-05, |
|
"loss": 0.6244, |
|
"mean_token_accuracy": 0.8068492501974106, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 1.1823557463042442, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.2114968841409132e-05, |
|
"loss": 0.7346, |
|
"mean_token_accuracy": 0.7738267377018928, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.1842632331902718, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.2102251049217858e-05, |
|
"loss": 0.6948, |
|
"mean_token_accuracy": 0.7892757222056389, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 1.1861707200762994, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.2089533257026581e-05, |
|
"loss": 0.676, |
|
"mean_token_accuracy": 0.7894925311207771, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.1880782069623272, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.2076815464835305e-05, |
|
"loss": 0.712, |
|
"mean_token_accuracy": 0.7807778924703598, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 1.1899856938483548, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.206409767264403e-05, |
|
"loss": 0.6698, |
|
"mean_token_accuracy": 0.7984389364719391, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.1918931807343824, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.2051379880452755e-05, |
|
"loss": 0.6725, |
|
"mean_token_accuracy": 0.800166568160057, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 1.19380066762041, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.2038662088261478e-05, |
|
"loss": 0.7385, |
|
"mean_token_accuracy": 0.7819548204541207, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.1957081545064379, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.2025944296070202e-05, |
|
"loss": 0.6656, |
|
"mean_token_accuracy": 0.7992453247308731, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 1.1976156413924655, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.2013226503878928e-05, |
|
"loss": 0.6904, |
|
"mean_token_accuracy": 0.7905550047755241, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.199523128278493, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.2000508711687653e-05, |
|
"loss": 0.7657, |
|
"mean_token_accuracy": 0.7733726158738137, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 1.2014306151645207, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.1987790919496375e-05, |
|
"loss": 0.6699, |
|
"mean_token_accuracy": 0.7883818462491036, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.2033381020505485, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.19750731273051e-05, |
|
"loss": 0.7322, |
|
"mean_token_accuracy": 0.7788220182061195, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 1.205245588936576, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.1962355335113825e-05, |
|
"loss": 0.6893, |
|
"mean_token_accuracy": 0.775249108672142, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.2071530758226037, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 1.194963754292255e-05, |
|
"loss": 0.7337, |
|
"mean_token_accuracy": 0.784166394174099, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 1.2090605627086313, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.1936919750731276e-05, |
|
"loss": 0.7038, |
|
"mean_token_accuracy": 0.7948532626032829, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.210968049594659, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 1.1924201958539998e-05, |
|
"loss": 0.753, |
|
"mean_token_accuracy": 0.7781305342912674, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 1.2128755364806867, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.1911484166348723e-05, |
|
"loss": 0.6694, |
|
"mean_token_accuracy": 0.800914041697979, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.2147830233667143, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.1898766374157447e-05, |
|
"loss": 0.6664, |
|
"mean_token_accuracy": 0.7857102379202843, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 1.216690510252742, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.1886048581966173e-05, |
|
"loss": 0.6609, |
|
"mean_token_accuracy": 0.7949806705117226, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.2185979971387697, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.1873330789774895e-05, |
|
"loss": 0.7093, |
|
"mean_token_accuracy": 0.7808532416820526, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 1.2205054840247973, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.186061299758362e-05, |
|
"loss": 0.7516, |
|
"mean_token_accuracy": 0.7698055237531662, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.222412970910825, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.1847895205392346e-05, |
|
"loss": 0.6743, |
|
"mean_token_accuracy": 0.8001621171832085, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 1.2243204577968527, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.183517741320107e-05, |
|
"loss": 0.7656, |
|
"mean_token_accuracy": 0.7759733751416207, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.2262279446828803, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.1822459621009795e-05, |
|
"loss": 0.736, |
|
"mean_token_accuracy": 0.7686377540230751, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 1.228135431568908, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.1809741828818517e-05, |
|
"loss": 0.7115, |
|
"mean_token_accuracy": 0.7793835371732711, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.2300429184549357, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.1797024036627243e-05, |
|
"loss": 0.7457, |
|
"mean_token_accuracy": 0.7806543171405792, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 1.2319504053409633, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.1784306244435968e-05, |
|
"loss": 0.7483, |
|
"mean_token_accuracy": 0.7799697473645211, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.233857892226991, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.1771588452244692e-05, |
|
"loss": 0.7715, |
|
"mean_token_accuracy": 0.7469990506768227, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 1.2357653791130185, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.1758870660053416e-05, |
|
"loss": 0.6083, |
|
"mean_token_accuracy": 0.8102578550577164, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.2376728659990461, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.174615286786214e-05, |
|
"loss": 0.752, |
|
"mean_token_accuracy": 0.7763517677783967, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 1.239580352885074, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.1733435075670865e-05, |
|
"loss": 0.739, |
|
"mean_token_accuracy": 0.7704811751842499, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.2414878397711016, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.1720717283479589e-05, |
|
"loss": 0.693, |
|
"mean_token_accuracy": 0.7942864954471588, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 1.2433953266571292, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.1707999491288313e-05, |
|
"loss": 0.7277, |
|
"mean_token_accuracy": 0.7802015751600265, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.245302813543157, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 1.1695281699097038e-05, |
|
"loss": 0.8018, |
|
"mean_token_accuracy": 0.7585472777485848, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 1.2472103004291846, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 1.1682563906905762e-05, |
|
"loss": 0.7667, |
|
"mean_token_accuracy": 0.773290790617466, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.2491177873152122, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.1669846114714487e-05, |
|
"loss": 0.7078, |
|
"mean_token_accuracy": 0.7810175165534019, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 1.2510252742012398, |
|
"grad_norm": 6.375, |
|
"learning_rate": 1.165712832252321e-05, |
|
"loss": 0.7716, |
|
"mean_token_accuracy": 0.7618390426039696, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.2529327610872676, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.1644410530331935e-05, |
|
"loss": 0.6536, |
|
"mean_token_accuracy": 0.794977605342865, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 1.2548402479732952, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.163169273814066e-05, |
|
"loss": 0.6787, |
|
"mean_token_accuracy": 0.7936605170369149, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.2567477348593228, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 1.1618974945949384e-05, |
|
"loss": 0.7416, |
|
"mean_token_accuracy": 0.7848603546619415, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 1.2586552217453506, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.160625715375811e-05, |
|
"loss": 0.7217, |
|
"mean_token_accuracy": 0.7866264447569847, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.2605627086313782, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.1593539361566832e-05, |
|
"loss": 0.6698, |
|
"mean_token_accuracy": 0.8022088572382927, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 1.2624701955174058, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.1580821569375557e-05, |
|
"loss": 0.7169, |
|
"mean_token_accuracy": 0.7868107482790947, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.2643776824034334, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.1568103777184281e-05, |
|
"loss": 0.6532, |
|
"mean_token_accuracy": 0.8017155960202217, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 1.266285169289461, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.1555385984993007e-05, |
|
"loss": 0.7316, |
|
"mean_token_accuracy": 0.7813733011484146, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.2681926561754888, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.1542668192801732e-05, |
|
"loss": 0.7266, |
|
"mean_token_accuracy": 0.7774209037423134, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 1.2701001430615164, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.1529950400610454e-05, |
|
"loss": 0.706, |
|
"mean_token_accuracy": 0.7916549324989319, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.2720076299475442, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.151723260841918e-05, |
|
"loss": 0.7483, |
|
"mean_token_accuracy": 0.7811658516526222, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 1.2739151168335718, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.1504514816227904e-05, |
|
"loss": 0.7015, |
|
"mean_token_accuracy": 0.7907307639718055, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.2758226037195994, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.1491797024036629e-05, |
|
"loss": 0.7421, |
|
"mean_token_accuracy": 0.7704378560185432, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 1.277730090605627, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.1479079231845353e-05, |
|
"loss": 0.6951, |
|
"mean_token_accuracy": 0.7920941516757012, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.2796375774916546, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.1466361439654077e-05, |
|
"loss": 0.7488, |
|
"mean_token_accuracy": 0.7743821710348129, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 1.2815450643776825, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.1453643647462802e-05, |
|
"loss": 0.7239, |
|
"mean_token_accuracy": 0.7782898634672165, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.28345255126371, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.1440925855271526e-05, |
|
"loss": 0.7275, |
|
"mean_token_accuracy": 0.7705945268273353, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 1.2853600381497376, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.142820806308025e-05, |
|
"loss": 0.7028, |
|
"mean_token_accuracy": 0.7802629828453064, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.2872675250357655, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 1.1415490270888974e-05, |
|
"loss": 0.7664, |
|
"mean_token_accuracy": 0.7695603922009469, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 1.289175011921793, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.14027724786977e-05, |
|
"loss": 0.728, |
|
"mean_token_accuracy": 0.7842082038521767, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.2910824988078207, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.1390054686506425e-05, |
|
"loss": 0.6729, |
|
"mean_token_accuracy": 0.7945865884423255, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 1.2929899856938483, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.1377336894315147e-05, |
|
"loss": 0.6643, |
|
"mean_token_accuracy": 0.7997446030378341, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.294897472579876, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.1364619102123872e-05, |
|
"loss": 0.7113, |
|
"mean_token_accuracy": 0.7903872340917587, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 1.2968049594659037, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.1351901309932596e-05, |
|
"loss": 0.6903, |
|
"mean_token_accuracy": 0.7984808310866356, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.2987124463519313, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.1339183517741322e-05, |
|
"loss": 0.7383, |
|
"mean_token_accuracy": 0.7875708505511284, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 1.300619933237959, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 1.1326465725550047e-05, |
|
"loss": 0.6604, |
|
"mean_token_accuracy": 0.7980678513646126, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.3025274201239867, |
|
"grad_norm": 6.875, |
|
"learning_rate": 1.131374793335877e-05, |
|
"loss": 0.7434, |
|
"mean_token_accuracy": 0.7777546659111977, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 1.3044349070100143, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.1301030141167495e-05, |
|
"loss": 0.6542, |
|
"mean_token_accuracy": 0.7975090846419335, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.306342393896042, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.1288312348976218e-05, |
|
"loss": 0.7533, |
|
"mean_token_accuracy": 0.7681490883231163, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 1.3082498807820695, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 1.1275594556784944e-05, |
|
"loss": 0.7697, |
|
"mean_token_accuracy": 0.7731455415487289, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.3101573676680973, |
|
"grad_norm": 6.375, |
|
"learning_rate": 1.1262876764593666e-05, |
|
"loss": 0.6563, |
|
"mean_token_accuracy": 0.8001643344759941, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 1.312064854554125, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.1250158972402392e-05, |
|
"loss": 0.6893, |
|
"mean_token_accuracy": 0.787324196100235, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.3139723414401527, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.1237441180211117e-05, |
|
"loss": 0.6895, |
|
"mean_token_accuracy": 0.7794161334633827, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 1.3158798283261803, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.1224723388019841e-05, |
|
"loss": 0.6712, |
|
"mean_token_accuracy": 0.8040449827909469, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.317787315212208, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 1.1212005595828566e-05, |
|
"loss": 0.7596, |
|
"mean_token_accuracy": 0.7752531677484512, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 1.3196948020982355, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 1.1199287803637289e-05, |
|
"loss": 0.7522, |
|
"mean_token_accuracy": 0.7756654977798462, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.3216022889842631, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.1186570011446014e-05, |
|
"loss": 0.6728, |
|
"mean_token_accuracy": 0.7911383360624313, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 1.323509775870291, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.117385221925474e-05, |
|
"loss": 0.6693, |
|
"mean_token_accuracy": 0.7900627195835114, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.3254172627563185, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.1161134427063463e-05, |
|
"loss": 0.8004, |
|
"mean_token_accuracy": 0.7619607031345368, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 1.3273247496423461, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.1148416634872187e-05, |
|
"loss": 0.6342, |
|
"mean_token_accuracy": 0.8038983285427094, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.329232236528374, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.1135698842680911e-05, |
|
"loss": 0.6636, |
|
"mean_token_accuracy": 0.7987726837396621, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 1.3311397234144016, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.1122981050489636e-05, |
|
"loss": 0.7975, |
|
"mean_token_accuracy": 0.7736665666103363, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 1.3330472103004292, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.1110263258298359e-05, |
|
"loss": 0.671, |
|
"mean_token_accuracy": 0.7992818504571915, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 1.3349546971864568, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 1.1097545466107084e-05, |
|
"loss": 0.7656, |
|
"mean_token_accuracy": 0.767841525375843, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.3368621840724846, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.108482767391581e-05, |
|
"loss": 0.7973, |
|
"mean_token_accuracy": 0.764432281255722, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 1.3387696709585122, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.1072109881724533e-05, |
|
"loss": 0.753, |
|
"mean_token_accuracy": 0.7767688825726509, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 1.3406771578445398, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.1059392089533259e-05, |
|
"loss": 0.7506, |
|
"mean_token_accuracy": 0.7745794802904129, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 1.3425846447305676, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.1046674297341981e-05, |
|
"loss": 0.6841, |
|
"mean_token_accuracy": 0.7984428569674492, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 1.3444921316165952, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 1.1033956505150706e-05, |
|
"loss": 0.7079, |
|
"mean_token_accuracy": 0.7906140476465225, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 1.3463996185026228, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.102123871295943e-05, |
|
"loss": 0.6787, |
|
"mean_token_accuracy": 0.7917294785380363, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 1.3483071053886504, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.1008520920768156e-05, |
|
"loss": 0.7256, |
|
"mean_token_accuracy": 0.7792046666145325, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 1.350214592274678, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.0995803128576881e-05, |
|
"loss": 0.6745, |
|
"mean_token_accuracy": 0.794202433526516, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.3521220791607058, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.0983085336385603e-05, |
|
"loss": 0.73, |
|
"mean_token_accuracy": 0.7751868382096291, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 1.3540295660467334, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.0970367544194329e-05, |
|
"loss": 0.7783, |
|
"mean_token_accuracy": 0.7688572570681572, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.355937052932761, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.0957649752003053e-05, |
|
"loss": 0.7878, |
|
"mean_token_accuracy": 0.7709324531257152, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 1.3578445398187888, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.0944931959811778e-05, |
|
"loss": 0.6636, |
|
"mean_token_accuracy": 0.799449859559536, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.3597520267048164, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.0932214167620504e-05, |
|
"loss": 0.7111, |
|
"mean_token_accuracy": 0.7865828841924667, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 1.361659513590844, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 1.0919496375429226e-05, |
|
"loss": 0.7151, |
|
"mean_token_accuracy": 0.7823675647377968, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 1.3635670004768716, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.0906778583237951e-05, |
|
"loss": 0.6872, |
|
"mean_token_accuracy": 0.790128941833973, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 1.3654744873628994, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.0894060791046675e-05, |
|
"loss": 0.6401, |
|
"mean_token_accuracy": 0.8002132594585418, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.367381974248927, |
|
"grad_norm": 7.5, |
|
"learning_rate": 1.08813429988554e-05, |
|
"loss": 0.7567, |
|
"mean_token_accuracy": 0.77416540235281, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 1.3692894611349546, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.0868625206664123e-05, |
|
"loss": 0.6764, |
|
"mean_token_accuracy": 0.787289160490036, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 1.3711969480209825, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.0855907414472848e-05, |
|
"loss": 0.7577, |
|
"mean_token_accuracy": 0.7855016678571701, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 1.37310443490701, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.0843189622281574e-05, |
|
"loss": 0.7051, |
|
"mean_token_accuracy": 0.7881559386849404, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.3750119217930377, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.0830471830090298e-05, |
|
"loss": 0.7592, |
|
"mean_token_accuracy": 0.7797208964824677, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 1.3769194086790653, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.0817754037899021e-05, |
|
"loss": 0.6396, |
|
"mean_token_accuracy": 0.8072021082043648, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 1.3788268955650929, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.0805036245707745e-05, |
|
"loss": 0.7734, |
|
"mean_token_accuracy": 0.7690646216273308, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 1.3807343824511207, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.079231845351647e-05, |
|
"loss": 0.6875, |
|
"mean_token_accuracy": 0.7913995578885078, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 1.3826418693371483, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.0779600661325196e-05, |
|
"loss": 0.6543, |
|
"mean_token_accuracy": 0.8033325642347335, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 1.384549356223176, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.0766882869133918e-05, |
|
"loss": 0.7016, |
|
"mean_token_accuracy": 0.7871606081724167, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.3864568431092037, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.0754165076942644e-05, |
|
"loss": 0.7866, |
|
"mean_token_accuracy": 0.7680317148566246, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 1.3883643299952313, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.0741447284751368e-05, |
|
"loss": 0.6679, |
|
"mean_token_accuracy": 0.7980614572763443, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.390271816881259, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.0728729492560093e-05, |
|
"loss": 0.7072, |
|
"mean_token_accuracy": 0.7797451242804527, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 1.3921793037672865, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 1.0716011700368815e-05, |
|
"loss": 0.7171, |
|
"mean_token_accuracy": 0.7874438062310218, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.3940867906533143, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.070329390817754e-05, |
|
"loss": 0.7401, |
|
"mean_token_accuracy": 0.7841003894805908, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 1.395994277539342, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.0690576115986266e-05, |
|
"loss": 0.6398, |
|
"mean_token_accuracy": 0.7971999242901802, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.3979017644253695, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.067785832379499e-05, |
|
"loss": 0.6566, |
|
"mean_token_accuracy": 0.8056228026747704, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 1.3998092513113973, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.0665140531603715e-05, |
|
"loss": 0.6996, |
|
"mean_token_accuracy": 0.7968237593770027, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.401716738197425, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.0652422739412438e-05, |
|
"loss": 0.7803, |
|
"mean_token_accuracy": 0.7777243912220001, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 1.4036242250834525, |
|
"grad_norm": 7.125, |
|
"learning_rate": 1.0639704947221163e-05, |
|
"loss": 0.691, |
|
"mean_token_accuracy": 0.7860228613018989, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.4055317119694801, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.0626987155029889e-05, |
|
"loss": 0.6445, |
|
"mean_token_accuracy": 0.8084975272417069, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 1.407439198855508, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.0614269362838612e-05, |
|
"loss": 0.6662, |
|
"mean_token_accuracy": 0.7929752454161644, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.4093466857415355, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.0601551570647338e-05, |
|
"loss": 0.7406, |
|
"mean_token_accuracy": 0.7814078807830811, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 1.4112541726275631, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 1.058883377845606e-05, |
|
"loss": 0.7051, |
|
"mean_token_accuracy": 0.7858014822006225, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.413161659513591, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.0576115986264785e-05, |
|
"loss": 0.7126, |
|
"mean_token_accuracy": 0.7788995161652565, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 1.4150691463996186, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.056339819407351e-05, |
|
"loss": 0.7203, |
|
"mean_token_accuracy": 0.790349793434143, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.4169766332856462, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.0550680401882235e-05, |
|
"loss": 0.7395, |
|
"mean_token_accuracy": 0.7782655239105225, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 1.4188841201716738, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.0537962609690959e-05, |
|
"loss": 0.6044, |
|
"mean_token_accuracy": 0.8102027118206024, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.4207916070577014, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.0525244817499682e-05, |
|
"loss": 0.7614, |
|
"mean_token_accuracy": 0.7794598892331124, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 1.4226990939437292, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.0512527025308408e-05, |
|
"loss": 0.694, |
|
"mean_token_accuracy": 0.785606586933136, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.4246065808297568, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.049980923311713e-05, |
|
"loss": 0.7178, |
|
"mean_token_accuracy": 0.7886698007583618, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 1.4265140677157844, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 1.0487091440925856e-05, |
|
"loss": 0.8059, |
|
"mean_token_accuracy": 0.7648954957723617, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.4284215546018122, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.0474373648734581e-05, |
|
"loss": 0.6423, |
|
"mean_token_accuracy": 0.8022066548466682, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 1.4303290414878398, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.0461655856543305e-05, |
|
"loss": 0.6728, |
|
"mean_token_accuracy": 0.7947381287813187, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.4322365283738674, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.044893806435203e-05, |
|
"loss": 0.7399, |
|
"mean_token_accuracy": 0.7824591249227524, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 1.434144015259895, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.0436220272160752e-05, |
|
"loss": 0.7446, |
|
"mean_token_accuracy": 0.7775179803371429, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.4360515021459228, |
|
"grad_norm": 6.5, |
|
"learning_rate": 1.0423502479969478e-05, |
|
"loss": 0.763, |
|
"mean_token_accuracy": 0.7766611501574516, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 1.4379589890319504, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 1.0410784687778202e-05, |
|
"loss": 0.7096, |
|
"mean_token_accuracy": 0.7917313784360885, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.439866475917978, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.0398066895586927e-05, |
|
"loss": 0.6619, |
|
"mean_token_accuracy": 0.7976467430591583, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 1.4417739628040058, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.0385349103395653e-05, |
|
"loss": 0.6963, |
|
"mean_token_accuracy": 0.7884339362382888, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.4436814496900334, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.0372631311204375e-05, |
|
"loss": 0.7326, |
|
"mean_token_accuracy": 0.7857313707470894, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 1.445588936576061, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.03599135190131e-05, |
|
"loss": 0.6981, |
|
"mean_token_accuracy": 0.7860577270388603, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.4474964234620886, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.0347195726821824e-05, |
|
"loss": 0.7086, |
|
"mean_token_accuracy": 0.7905237257480622, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 1.4494039103481162, |
|
"grad_norm": 6.375, |
|
"learning_rate": 1.033447793463055e-05, |
|
"loss": 0.7141, |
|
"mean_token_accuracy": 0.7852094903588295, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.451311397234144, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.0321760142439275e-05, |
|
"loss": 0.722, |
|
"mean_token_accuracy": 0.7839903935790062, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 1.4532188841201716, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.0309042350247997e-05, |
|
"loss": 0.6321, |
|
"mean_token_accuracy": 0.8083162158727646, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.4551263710061995, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.0296324558056723e-05, |
|
"loss": 0.7022, |
|
"mean_token_accuracy": 0.7952652171254158, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 1.457033857892227, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.0283606765865447e-05, |
|
"loss": 0.6933, |
|
"mean_token_accuracy": 0.7907264307141304, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.4589413447782547, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 1.0270888973674172e-05, |
|
"loss": 0.7134, |
|
"mean_token_accuracy": 0.7855746790766716, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 1.4608488316642823, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.0258171181482894e-05, |
|
"loss": 0.7339, |
|
"mean_token_accuracy": 0.7784282520413399, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.4627563185503099, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.024545338929162e-05, |
|
"loss": 0.6768, |
|
"mean_token_accuracy": 0.794599574804306, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 1.4646638054363377, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.0232735597100345e-05, |
|
"loss": 0.7027, |
|
"mean_token_accuracy": 0.7912124559283257, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.4665712923223653, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.0220017804909069e-05, |
|
"loss": 0.6917, |
|
"mean_token_accuracy": 0.7966108277440072, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 1.4684787792083929, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.0207300012717793e-05, |
|
"loss": 0.7194, |
|
"mean_token_accuracy": 0.7810886323451995, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.4703862660944207, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.0194582220526517e-05, |
|
"loss": 0.7036, |
|
"mean_token_accuracy": 0.7848643451929093, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 1.4722937529804483, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.0181864428335242e-05, |
|
"loss": 0.6481, |
|
"mean_token_accuracy": 0.7968376025557518, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.4742012398664759, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.0169146636143968e-05, |
|
"loss": 0.881, |
|
"mean_token_accuracy": 0.772949455678463, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 1.4761087267525035, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.015642884395269e-05, |
|
"loss": 0.7054, |
|
"mean_token_accuracy": 0.7943061590194702, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.4780162136385313, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.0143711051761415e-05, |
|
"loss": 0.6809, |
|
"mean_token_accuracy": 0.7910932347178459, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 1.479923700524559, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.0130993259570139e-05, |
|
"loss": 0.7266, |
|
"mean_token_accuracy": 0.7764055386185647, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.4818311874105865, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.0118275467378865e-05, |
|
"loss": 0.6531, |
|
"mean_token_accuracy": 0.8028417259454728, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 1.4837386742966143, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.0105557675187587e-05, |
|
"loss": 0.7847, |
|
"mean_token_accuracy": 0.7660407572984695, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.485646161182642, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.0092839882996312e-05, |
|
"loss": 0.6556, |
|
"mean_token_accuracy": 0.7957696139812469, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 1.4875536480686695, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.0080122090805038e-05, |
|
"loss": 0.7379, |
|
"mean_token_accuracy": 0.7747633382678032, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.4894611349546971, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.0067404298613761e-05, |
|
"loss": 0.6545, |
|
"mean_token_accuracy": 0.8000090837478637, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 1.4913686218407247, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.0054686506422487e-05, |
|
"loss": 0.6919, |
|
"mean_token_accuracy": 0.7933750420808792, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.4932761087267525, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.0041968714231209e-05, |
|
"loss": 0.8278, |
|
"mean_token_accuracy": 0.754620935022831, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 1.4951835956127801, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.0029250922039935e-05, |
|
"loss": 0.6829, |
|
"mean_token_accuracy": 0.7846381962299347, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.497091082498808, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.001653312984866e-05, |
|
"loss": 0.729, |
|
"mean_token_accuracy": 0.7845353007316589, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 1.4989985693848356, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.0003815337657384e-05, |
|
"loss": 0.7197, |
|
"mean_token_accuracy": 0.7817666932940484, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.5009060562708632, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 9.991097545466108e-06, |
|
"loss": 0.7118, |
|
"mean_token_accuracy": 0.7892649322748184, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 1.5028135431568908, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 9.978379753274833e-06, |
|
"loss": 0.6926, |
|
"mean_token_accuracy": 0.7978235989809036, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.5047210300429184, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 9.965661961083557e-06, |
|
"loss": 0.7213, |
|
"mean_token_accuracy": 0.781156699359417, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 1.506628516928946, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 9.95294416889228e-06, |
|
"loss": 0.7274, |
|
"mean_token_accuracy": 0.7924999624490738, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.5085360038149738, |
|
"grad_norm": 4.75, |
|
"learning_rate": 9.940226376701006e-06, |
|
"loss": 0.7102, |
|
"mean_token_accuracy": 0.7911828309297562, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 1.5104434907010016, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 9.92750858450973e-06, |
|
"loss": 0.7476, |
|
"mean_token_accuracy": 0.7776550844311714, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.5123509775870292, |
|
"grad_norm": 5.75, |
|
"learning_rate": 9.914790792318454e-06, |
|
"loss": 0.6937, |
|
"mean_token_accuracy": 0.7937393441796303, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 1.5142584644730568, |
|
"grad_norm": 5.75, |
|
"learning_rate": 9.90207300012718e-06, |
|
"loss": 0.6873, |
|
"mean_token_accuracy": 0.7911945432424545, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.5161659513590844, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 9.889355207935903e-06, |
|
"loss": 0.6954, |
|
"mean_token_accuracy": 0.7914523154497146, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 1.518073438245112, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 9.876637415744627e-06, |
|
"loss": 0.7153, |
|
"mean_token_accuracy": 0.7951628744602204, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.5199809251311396, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 9.863919623553352e-06, |
|
"loss": 0.7331, |
|
"mean_token_accuracy": 0.7814614549279213, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 1.5218884120171674, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 9.851201831362076e-06, |
|
"loss": 0.6445, |
|
"mean_token_accuracy": 0.7996955350041389, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.523795898903195, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 9.8384840391708e-06, |
|
"loss": 0.698, |
|
"mean_token_accuracy": 0.7938384771347046, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 1.5257033857892228, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 9.825766246979526e-06, |
|
"loss": 0.7347, |
|
"mean_token_accuracy": 0.7821006596088409, |
|
"step": 4000 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 7863, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0626197926734889e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|