|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9988901220865705, |
|
"eval_steps": 100, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011098779134295227, |
|
"grad_norm": 0.32028938698735326, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 1.1057, |
|
"mean_token_accuracy": 0.7074955803776843, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.022197558268590455, |
|
"grad_norm": 0.19534757580522985, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 1.0688, |
|
"mean_token_accuracy": 0.7163213776876576, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.033296337402885685, |
|
"grad_norm": 0.1962807122200424, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.0285, |
|
"mean_token_accuracy": 0.7219675252672673, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04439511653718091, |
|
"grad_norm": 0.17661044382753963, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.9569, |
|
"mean_token_accuracy": 0.7338179788852269, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05549389567147614, |
|
"grad_norm": 0.13298871259606682, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 0.96, |
|
"mean_token_accuracy": 0.7289285538378297, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06659267480577137, |
|
"grad_norm": 0.10139226456438258, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.9003, |
|
"mean_token_accuracy": 0.7429319064764655, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07769145394006659, |
|
"grad_norm": 0.09671244610636456, |
|
"learning_rate": 1.555555555555556e-05, |
|
"loss": 0.8848, |
|
"mean_token_accuracy": 0.7450362054794413, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08879023307436182, |
|
"grad_norm": 0.10358003016336022, |
|
"learning_rate": 1.7777777777777777e-05, |
|
"loss": 0.8681, |
|
"mean_token_accuracy": 0.7483839065381315, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09988901220865705, |
|
"grad_norm": 0.07881610121393638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8442, |
|
"mean_token_accuracy": 0.7542158990266088, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11098779134295228, |
|
"grad_norm": 0.07695709459998314, |
|
"learning_rate": 1.9992479525042305e-05, |
|
"loss": 0.8202, |
|
"mean_token_accuracy": 0.7594429564506759, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1220865704772475, |
|
"grad_norm": 0.0770940463572419, |
|
"learning_rate": 1.996992941167792e-05, |
|
"loss": 0.8204, |
|
"mean_token_accuracy": 0.7596416663786116, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13318534961154274, |
|
"grad_norm": 0.07120273045521322, |
|
"learning_rate": 1.9932383577419432e-05, |
|
"loss": 0.819, |
|
"mean_token_accuracy": 0.7587201214083239, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14428412874583796, |
|
"grad_norm": 0.07558536731907865, |
|
"learning_rate": 1.9879898494768093e-05, |
|
"loss": 0.8027, |
|
"mean_token_accuracy": 0.7628218193365146, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.15538290788013318, |
|
"grad_norm": 0.0724884828137313, |
|
"learning_rate": 1.9812553106273848e-05, |
|
"loss": 0.8126, |
|
"mean_token_accuracy": 0.7601266253946841, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.16648168701442842, |
|
"grad_norm": 0.0721988261195697, |
|
"learning_rate": 1.973044870579824e-05, |
|
"loss": 0.8027, |
|
"mean_token_accuracy": 0.7625601714037885, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.17758046614872364, |
|
"grad_norm": 0.07812772834648507, |
|
"learning_rate": 1.9633708786158803e-05, |
|
"loss": 0.803, |
|
"mean_token_accuracy": 0.7625912882794786, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.18867924528301888, |
|
"grad_norm": 0.06226943817830889, |
|
"learning_rate": 1.9522478853384154e-05, |
|
"loss": 0.7936, |
|
"mean_token_accuracy": 0.7635993724131902, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1997780244173141, |
|
"grad_norm": 0.0695985028428393, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.7841, |
|
"mean_token_accuracy": 0.7661331851833721, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.21087680355160932, |
|
"grad_norm": 0.07119143212066893, |
|
"learning_rate": 1.9257239692688907e-05, |
|
"loss": 0.7904, |
|
"mean_token_accuracy": 0.7651270507722707, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"grad_norm": 0.06900642736695052, |
|
"learning_rate": 1.9103629409661468e-05, |
|
"loss": 0.7805, |
|
"mean_token_accuracy": 0.7667522330276506, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"eval_loss": 0.8109510540962219, |
|
"eval_mean_token_accuracy": 0.7568694476131612, |
|
"eval_runtime": 2.9489, |
|
"eval_samples_per_second": 43.744, |
|
"eval_steps_per_second": 3.73, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23307436182019978, |
|
"grad_norm": 0.07584475595205237, |
|
"learning_rate": 1.8936326403234125e-05, |
|
"loss": 0.7819, |
|
"mean_token_accuracy": 0.7674415677558553, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.244173140954495, |
|
"grad_norm": 0.06583872042093958, |
|
"learning_rate": 1.8755582313020912e-05, |
|
"loss": 0.7819, |
|
"mean_token_accuracy": 0.766467737858396, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.25527192008879024, |
|
"grad_norm": 0.07082657098771901, |
|
"learning_rate": 1.8561668995302668e-05, |
|
"loss": 0.7831, |
|
"mean_token_accuracy": 0.76569958171409, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2663706992230855, |
|
"grad_norm": 0.06762014391699019, |
|
"learning_rate": 1.8354878114129368e-05, |
|
"loss": 0.7729, |
|
"mean_token_accuracy": 0.7691614712931135, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.27746947835738067, |
|
"grad_norm": 0.06572171584857384, |
|
"learning_rate": 1.8135520702629677e-05, |
|
"loss": 0.7727, |
|
"mean_token_accuracy": 0.7692371318506055, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2885682574916759, |
|
"grad_norm": 0.07387797497484562, |
|
"learning_rate": 1.7903926695187595e-05, |
|
"loss": 0.7748, |
|
"mean_token_accuracy": 0.767360264364967, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.29966703662597116, |
|
"grad_norm": 0.0648507090395773, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.7876, |
|
"mean_token_accuracy": 0.7646269472458946, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.31076581576026635, |
|
"grad_norm": 0.07594530254646713, |
|
"learning_rate": 1.740544013109005e-05, |
|
"loss": 0.763, |
|
"mean_token_accuracy": 0.7709959638413746, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3218645948945616, |
|
"grad_norm": 0.0732533507550857, |
|
"learning_rate": 1.7139297345578992e-05, |
|
"loss": 0.783, |
|
"mean_token_accuracy": 0.7649811510602629, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.33296337402885684, |
|
"grad_norm": 0.06649981966991503, |
|
"learning_rate": 1.686241637868734e-05, |
|
"loss": 0.7699, |
|
"mean_token_accuracy": 0.7692172433619623, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.34406215316315203, |
|
"grad_norm": 0.0663190385068203, |
|
"learning_rate": 1.657521368569064e-05, |
|
"loss": 0.7701, |
|
"mean_token_accuracy": 0.7685507996235139, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3551609322974473, |
|
"grad_norm": 0.07086248919589423, |
|
"learning_rate": 1.627812124672099e-05, |
|
"loss": 0.7916, |
|
"mean_token_accuracy": 0.762897097306175, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3662597114317425, |
|
"grad_norm": 0.06620650991928366, |
|
"learning_rate": 1.5971585917027864e-05, |
|
"loss": 0.7802, |
|
"mean_token_accuracy": 0.7649210363933718, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 0.06905926805248713, |
|
"learning_rate": 1.5656068754865388e-05, |
|
"loss": 0.7677, |
|
"mean_token_accuracy": 0.7696914957690235, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.38845726970033295, |
|
"grad_norm": 0.07914298364560284, |
|
"learning_rate": 1.5332044328016916e-05, |
|
"loss": 0.7672, |
|
"mean_token_accuracy": 0.7682120122871129, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3995560488346282, |
|
"grad_norm": 0.07115670436565744, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.7774, |
|
"mean_token_accuracy": 0.7662370568414045, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.41065482796892344, |
|
"grad_norm": 0.06396528657795297, |
|
"learning_rate": 1.4660435197025391e-05, |
|
"loss": 0.7478, |
|
"mean_token_accuracy": 0.7749828723486186, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.42175360710321863, |
|
"grad_norm": 0.0628671379282561, |
|
"learning_rate": 1.4313860656812537e-05, |
|
"loss": 0.7527, |
|
"mean_token_accuracy": 0.7723581904090526, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4328523862375139, |
|
"grad_norm": 0.06693399129139697, |
|
"learning_rate": 1.396079766039157e-05, |
|
"loss": 0.7709, |
|
"mean_token_accuracy": 0.768176693477312, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"grad_norm": 0.06898843383807368, |
|
"learning_rate": 1.3601777248047105e-05, |
|
"loss": 0.7558, |
|
"mean_token_accuracy": 0.7725129471022092, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"eval_loss": 0.7829984426498413, |
|
"eval_mean_token_accuracy": 0.7632673514716071, |
|
"eval_runtime": 2.5178, |
|
"eval_samples_per_second": 51.234, |
|
"eval_steps_per_second": 4.369, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4550499445061043, |
|
"grad_norm": 0.06712291331425221, |
|
"learning_rate": 1.3237339420583213e-05, |
|
"loss": 0.7423, |
|
"mean_token_accuracy": 0.7751750598902079, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.46614872364039955, |
|
"grad_norm": 0.07066852759980123, |
|
"learning_rate": 1.2868032327110904e-05, |
|
"loss": 0.7847, |
|
"mean_token_accuracy": 0.7637066112136016, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4772475027746948, |
|
"grad_norm": 0.06797517881945832, |
|
"learning_rate": 1.2494411440579814e-05, |
|
"loss": 0.7627, |
|
"mean_token_accuracy": 0.7701866684088337, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.48834628190899, |
|
"grad_norm": 0.0740366450868316, |
|
"learning_rate": 1.211703872229411e-05, |
|
"loss": 0.7554, |
|
"mean_token_accuracy": 0.7726324974593143, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.49944506104328523, |
|
"grad_norm": 0.06603564857589732, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.7703, |
|
"mean_token_accuracy": 0.7674938853021173, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5105438401775805, |
|
"grad_norm": 0.06456424170382646, |
|
"learning_rate": 1.1353312997501313e-05, |
|
"loss": 0.7681, |
|
"mean_token_accuracy": 0.7680860839714745, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5216426193118757, |
|
"grad_norm": 0.0680877993193484, |
|
"learning_rate": 1.0968108707031792e-05, |
|
"loss": 0.7568, |
|
"mean_token_accuracy": 0.7713919990696916, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.532741398446171, |
|
"grad_norm": 0.06457964510433087, |
|
"learning_rate": 1.0581448289104759e-05, |
|
"loss": 0.7489, |
|
"mean_token_accuracy": 0.7742981274310832, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5438401775804661, |
|
"grad_norm": 0.06309191183030195, |
|
"learning_rate": 1.0193913317718245e-05, |
|
"loss": 0.7518, |
|
"mean_token_accuracy": 0.7737425585967734, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.5549389567147613, |
|
"grad_norm": 0.06092764730736236, |
|
"learning_rate": 9.806086682281759e-06, |
|
"loss": 0.761, |
|
"mean_token_accuracy": 0.7695176080139992, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5660377358490566, |
|
"grad_norm": 0.06517875424025751, |
|
"learning_rate": 9.418551710895243e-06, |
|
"loss": 0.7451, |
|
"mean_token_accuracy": 0.7737921205376608, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5771365149833518, |
|
"grad_norm": 0.06586926609081382, |
|
"learning_rate": 9.03189129296821e-06, |
|
"loss": 0.7272, |
|
"mean_token_accuracy": 0.7790233444795842, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.06455882882155867, |
|
"learning_rate": 8.646687002498692e-06, |
|
"loss": 0.7493, |
|
"mean_token_accuracy": 0.7728544867539606, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5993340732519423, |
|
"grad_norm": 0.06419037506595338, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.7447, |
|
"mean_token_accuracy": 0.774466472618278, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6104328523862376, |
|
"grad_norm": 0.06483038923609051, |
|
"learning_rate": 7.882961277705897e-06, |
|
"loss": 0.7467, |
|
"mean_token_accuracy": 0.7736083802309587, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6215316315205327, |
|
"grad_norm": 0.06889145412976934, |
|
"learning_rate": 7.505588559420188e-06, |
|
"loss": 0.7423, |
|
"mean_token_accuracy": 0.7752051151920257, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.632630410654828, |
|
"grad_norm": 0.06931221562176564, |
|
"learning_rate": 7.131967672889101e-06, |
|
"loss": 0.7766, |
|
"mean_token_accuracy": 0.7648978880464531, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6437291897891232, |
|
"grad_norm": 0.0645937272519637, |
|
"learning_rate": 6.762660579416791e-06, |
|
"loss": 0.7529, |
|
"mean_token_accuracy": 0.7729185441854456, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6548279689234184, |
|
"grad_norm": 0.06415673551420226, |
|
"learning_rate": 6.3982227519528986e-06, |
|
"loss": 0.7601, |
|
"mean_token_accuracy": 0.7699141088391924, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"grad_norm": 0.06845419068799381, |
|
"learning_rate": 6.039202339608432e-06, |
|
"loss": 0.7519, |
|
"mean_token_accuracy": 0.77195855669696, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"eval_loss": 0.7694035172462463, |
|
"eval_mean_token_accuracy": 0.7656926514001757, |
|
"eval_runtime": 2.5109, |
|
"eval_samples_per_second": 51.376, |
|
"eval_steps_per_second": 4.381, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6770255271920089, |
|
"grad_norm": 0.06481669897887246, |
|
"learning_rate": 5.686139343187468e-06, |
|
"loss": 0.7368, |
|
"mean_token_accuracy": 0.7768982703775535, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.6881243063263041, |
|
"grad_norm": 0.06555414493127953, |
|
"learning_rate": 5.339564802974615e-06, |
|
"loss": 0.7496, |
|
"mean_token_accuracy": 0.7741114122022307, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6992230854605993, |
|
"grad_norm": 0.06321637577126818, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.7379, |
|
"mean_token_accuracy": 0.7763521164662632, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7103218645948945, |
|
"grad_norm": 0.06534174478047594, |
|
"learning_rate": 4.66795567198309e-06, |
|
"loss": 0.7186, |
|
"mean_token_accuracy": 0.7821842581348311, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7214206437291898, |
|
"grad_norm": 0.0656418642194091, |
|
"learning_rate": 4.343931245134616e-06, |
|
"loss": 0.7429, |
|
"mean_token_accuracy": 0.7750543137352807, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.732519422863485, |
|
"grad_norm": 0.06142434056090795, |
|
"learning_rate": 4.028414082972141e-06, |
|
"loss": 0.7432, |
|
"mean_token_accuracy": 0.775647557164172, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7436182019977803, |
|
"grad_norm": 0.06566009724006543, |
|
"learning_rate": 3.7218787532790167e-06, |
|
"loss": 0.7527, |
|
"mean_token_accuracy": 0.7717515416461624, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 0.06813787133901092, |
|
"learning_rate": 3.424786314309365e-06, |
|
"loss": 0.7397, |
|
"mean_token_accuracy": 0.7761784463020833, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7658157602663707, |
|
"grad_norm": 0.061296004900329354, |
|
"learning_rate": 3.1375836213126653e-06, |
|
"loss": 0.7547, |
|
"mean_token_accuracy": 0.7708365795261976, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.7769145394006659, |
|
"grad_norm": 0.06086151158973008, |
|
"learning_rate": 2.8607026544210115e-06, |
|
"loss": 0.7449, |
|
"mean_token_accuracy": 0.7737567810246656, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7880133185349611, |
|
"grad_norm": 0.06066361422328344, |
|
"learning_rate": 2.594559868909956e-06, |
|
"loss": 0.7526, |
|
"mean_token_accuracy": 0.7728921789426538, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.7991120976692564, |
|
"grad_norm": 0.061244946934632546, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.7428, |
|
"mean_token_accuracy": 0.7745905285585644, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8102108768035516, |
|
"grad_norm": 0.059950100330008936, |
|
"learning_rate": 2.0960733048124082e-06, |
|
"loss": 0.735, |
|
"mean_token_accuracy": 0.7771190685426068, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.8213096559378469, |
|
"grad_norm": 0.062267646898023575, |
|
"learning_rate": 1.8644792973703252e-06, |
|
"loss": 0.757, |
|
"mean_token_accuracy": 0.7711234095884432, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.832408435072142, |
|
"grad_norm": 0.05795250590930739, |
|
"learning_rate": 1.6451218858706374e-06, |
|
"loss": 0.7396, |
|
"mean_token_accuracy": 0.7762826225755755, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.8435072142064373, |
|
"grad_norm": 0.05646366375927419, |
|
"learning_rate": 1.4383310046973365e-06, |
|
"loss": 0.742, |
|
"mean_token_accuracy": 0.7754256081182773, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8546059933407325, |
|
"grad_norm": 0.05933272693822518, |
|
"learning_rate": 1.2444176869790925e-06, |
|
"loss": 0.7457, |
|
"mean_token_accuracy": 0.7742744235316502, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.8657047724750278, |
|
"grad_norm": 0.057540816299949595, |
|
"learning_rate": 1.0636735967658785e-06, |
|
"loss": 0.736, |
|
"mean_token_accuracy": 0.7765978708392828, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.876803551609323, |
|
"grad_norm": 0.05844756012864216, |
|
"learning_rate": 8.963705903385344e-07, |
|
"loss": 0.7183, |
|
"mean_token_accuracy": 0.7822006045277151, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"grad_norm": 0.05810320914079056, |
|
"learning_rate": 7.427603073110967e-07, |
|
"loss": 0.7442, |
|
"mean_token_accuracy": 0.7742584918108283, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"eval_loss": 0.7638587951660156, |
|
"eval_mean_token_accuracy": 0.7671856273651642, |
|
"eval_runtime": 2.5128, |
|
"eval_samples_per_second": 51.338, |
|
"eval_steps_per_second": 4.378, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8990011098779135, |
|
"grad_norm": 0.05687291272451405, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.7355, |
|
"mean_token_accuracy": 0.7774604937569412, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9100998890122086, |
|
"grad_norm": 0.05899588715439542, |
|
"learning_rate": 4.775211466158469e-07, |
|
"loss": 0.7319, |
|
"mean_token_accuracy": 0.7774400412989902, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9211986681465039, |
|
"grad_norm": 0.05577638815550462, |
|
"learning_rate": 3.662912138411967e-07, |
|
"loss": 0.7178, |
|
"mean_token_accuracy": 0.7821916392303806, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.9322974472807991, |
|
"grad_norm": 0.05865365285983047, |
|
"learning_rate": 2.6955129420176193e-07, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.778973121325068, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9433962264150944, |
|
"grad_norm": 0.06038263301804733, |
|
"learning_rate": 1.874468937261531e-07, |
|
"loss": 0.7372, |
|
"mean_token_accuracy": 0.7765551528106018, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9544950055493896, |
|
"grad_norm": 0.059800638962906945, |
|
"learning_rate": 1.201015052319099e-07, |
|
"loss": 0.7291, |
|
"mean_token_accuracy": 0.7794756294446461, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9655937846836848, |
|
"grad_norm": 0.056012625483331906, |
|
"learning_rate": 6.761642258056977e-08, |
|
"loss": 0.7255, |
|
"mean_token_accuracy": 0.7798846256811043, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.97669256381798, |
|
"grad_norm": 0.055552002158992475, |
|
"learning_rate": 3.0070588322079765e-08, |
|
"loss": 0.7351, |
|
"mean_token_accuracy": 0.7766328434320331, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9877913429522752, |
|
"grad_norm": 0.05677914033692145, |
|
"learning_rate": 7.520474957699586e-09, |
|
"loss": 0.7351, |
|
"mean_token_accuracy": 0.7767306519256947, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.9988901220865705, |
|
"grad_norm": 0.06170289905752787, |
|
"learning_rate": 0.0, |
|
"loss": 0.7528, |
|
"mean_token_accuracy": 0.7717083040396344, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9988901220865705, |
|
"step": 450, |
|
"total_flos": 6.086450124265882e+17, |
|
"train_loss": 0.7795855527453952, |
|
"train_runtime": 1563.698, |
|
"train_samples_per_second": 13.828, |
|
"train_steps_per_second": 0.288 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.086450124265882e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|