{
  "best_metric": 1.9983205795288086,
  "best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_ortho_r16/checkpoint-16",
  "epoch": 0.9983071342200726,
  "eval_steps": 8,
  "global_step": 387,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025796049979846837,
      "grad_norm": 7.310375690460205,
      "learning_rate": 1.25e-05,
      "loss": 2.1533,
      "step": 1
    },
    {
      "epoch": 0.010318419991938735,
      "grad_norm": 7.721648216247559,
      "learning_rate": 5e-05,
      "loss": 2.0899,
      "step": 4
    },
    {
      "epoch": 0.02063683998387747,
      "grad_norm": 4.991214275360107,
      "learning_rate": 0.0001,
      "loss": 1.9566,
      "step": 8
    },
    {
      "epoch": 0.02063683998387747,
      "eval_loss": 2.0118165016174316,
      "eval_runtime": 141.9765,
      "eval_samples_per_second": 1.726,
      "eval_steps_per_second": 0.866,
      "step": 8
    },
    {
      "epoch": 0.030955259975816204,
      "grad_norm": 4.564762592315674,
      "learning_rate": 9.997251843068762e-05,
      "loss": 2.0017,
      "step": 12
    },
    {
      "epoch": 0.04127367996775494,
      "grad_norm": 4.318169593811035,
      "learning_rate": 9.989010393221656e-05,
      "loss": 2.0191,
      "step": 16
    },
    {
      "epoch": 0.04127367996775494,
      "eval_loss": 1.9983205795288086,
      "eval_runtime": 142.0598,
      "eval_samples_per_second": 1.725,
      "eval_steps_per_second": 0.866,
      "step": 16
    },
    {
      "epoch": 0.051592099959693674,
      "grad_norm": 4.341538429260254,
      "learning_rate": 9.97528470997769e-05,
      "loss": 2.1128,
      "step": 20
    },
    {
      "epoch": 0.06191051995163241,
      "grad_norm": 3.7456655502319336,
      "learning_rate": 9.956089881469482e-05,
      "loss": 2.0779,
      "step": 24
    },
    {
      "epoch": 0.06191051995163241,
      "eval_loss": 2.0212316513061523,
      "eval_runtime": 139.6844,
      "eval_samples_per_second": 1.754,
      "eval_steps_per_second": 0.881,
      "step": 24
    },
    {
      "epoch": 0.07222893994357114,
      "grad_norm": 3.6751935482025146,
      "learning_rate": 9.931447007857432e-05,
      "loss": 2.0376,
      "step": 28
    },
    {
      "epoch": 0.08254735993550988,
      "grad_norm": 3.707818031311035,
      "learning_rate": 9.901383178135113e-05,
      "loss": 2.0339,
      "step": 32
    },
    {
      "epoch": 0.08254735993550988,
      "eval_loss": 2.0204849243164062,
      "eval_runtime": 266.838,
      "eval_samples_per_second": 0.918,
      "eval_steps_per_second": 0.461,
      "step": 32
    },
    {
      "epoch": 0.09286577992744861,
      "grad_norm": 3.904897689819336,
      "learning_rate": 9.865931440351337e-05,
      "loss": 2.0485,
      "step": 36
    },
    {
      "epoch": 0.10318419991938735,
      "grad_norm": 3.803988456726074,
      "learning_rate": 9.825130765281668e-05,
      "loss": 2.0429,
      "step": 40
    },
    {
      "epoch": 0.10318419991938735,
      "eval_loss": 2.0132498741149902,
      "eval_runtime": 135.71,
      "eval_samples_per_second": 1.805,
      "eval_steps_per_second": 0.906,
      "step": 40
    },
    {
      "epoch": 0.11350261991132608,
      "grad_norm": 4.293553829193115,
      "learning_rate": 9.779026003589304e-05,
      "loss": 2.0392,
      "step": 44
    },
    {
      "epoch": 0.12382103990326482,
      "grad_norm": 4.716602802276611,
      "learning_rate": 9.727667836522407e-05,
      "loss": 2.0601,
      "step": 48
    },
    {
      "epoch": 0.12382103990326482,
      "eval_loss": 2.0219218730926514,
      "eval_runtime": 138.9003,
      "eval_samples_per_second": 1.764,
      "eval_steps_per_second": 0.886,
      "step": 48
    },
    {
      "epoch": 0.13413945989520354,
      "grad_norm": 4.0252580642700195,
      "learning_rate": 9.6711127202021e-05,
      "loss": 2.0285,
      "step": 52
    },
    {
      "epoch": 0.1444578798871423,
      "grad_norm": 3.81237530708313,
      "learning_rate": 9.609422823562345e-05,
      "loss": 2.041,
      "step": 56
    },
    {
      "epoch": 0.1444578798871423,
      "eval_loss": 2.017057180404663,
      "eval_runtime": 136.05,
      "eval_samples_per_second": 1.801,
      "eval_steps_per_second": 0.904,
      "step": 56
    },
    {
      "epoch": 0.154776299879081,
      "grad_norm": 4.155681610107422,
      "learning_rate": 9.542665960009959e-05,
      "loss": 2.0558,
      "step": 60
    },
    {
      "epoch": 0.16509471987101976,
      "grad_norm": 3.761030435562134,
      "learning_rate": 9.470915512879852e-05,
      "loss": 2.0602,
      "step": 64
    },
    {
      "epoch": 0.16509471987101976,
      "eval_loss": 2.0229883193969727,
      "eval_runtime": 136.2202,
      "eval_samples_per_second": 1.799,
      "eval_steps_per_second": 0.903,
      "step": 64
    },
    {
      "epoch": 0.17541313986295848,
      "grad_norm": 4.972229480743408,
      "learning_rate": 9.394250354767467e-05,
      "loss": 2.0754,
      "step": 68
    },
    {
      "epoch": 0.18573155985489723,
      "grad_norm": 3.9014289379119873,
      "learning_rate": 9.312754760827061e-05,
      "loss": 2.0341,
      "step": 72
    },
    {
      "epoch": 0.18573155985489723,
      "eval_loss": 2.0311481952667236,
      "eval_runtime": 255.6823,
      "eval_samples_per_second": 0.958,
      "eval_steps_per_second": 0.481,
      "step": 72
    },
    {
      "epoch": 0.19604997984683595,
      "grad_norm": 4.544624328613281,
      "learning_rate": 9.226518316131176e-05,
      "loss": 2.0667,
      "step": 76
    },
    {
      "epoch": 0.2063683998387747,
      "grad_norm": 4.004471778869629,
      "learning_rate": 9.1356358171931e-05,
      "loss": 2.0378,
      "step": 80
    },
    {
      "epoch": 0.2063683998387747,
      "eval_loss": 2.0319135189056396,
      "eval_runtime": 138.1562,
      "eval_samples_per_second": 1.773,
      "eval_steps_per_second": 0.89,
      "step": 80
    },
    {
      "epoch": 0.21668681983071342,
      "grad_norm": 4.80974006652832,
      "learning_rate": 9.040207167760586e-05,
      "loss": 2.0514,
      "step": 84
    },
    {
      "epoch": 0.22700523982265217,
      "grad_norm": 4.184769153594971,
      "learning_rate": 8.940337268995385e-05,
      "loss": 2.0961,
      "step": 88
    },
    {
      "epoch": 0.22700523982265217,
      "eval_loss": 2.0402026176452637,
      "eval_runtime": 141.956,
      "eval_samples_per_second": 1.726,
      "eval_steps_per_second": 0.866,
      "step": 88
    },
    {
      "epoch": 0.2373236598145909,
      "grad_norm": 4.9076409339904785,
      "learning_rate": 8.836135904159302e-05,
      "loss": 2.0838,
      "step": 92
    },
    {
      "epoch": 0.24764207980652964,
      "grad_norm": 5.397791385650635,
      "learning_rate": 8.727717617933544e-05,
      "loss": 2.106,
      "step": 96
    },
    {
      "epoch": 0.24764207980652964,
      "eval_loss": 2.020820140838623,
      "eval_runtime": 136.0024,
      "eval_samples_per_second": 1.801,
      "eval_steps_per_second": 0.904,
      "step": 96
    },
    {
      "epoch": 0.25796049979846836,
      "grad_norm": 4.622895240783691,
      "learning_rate": 8.615201590504017e-05,
      "loss": 2.0765,
      "step": 100
    },
    {
      "epoch": 0.2682789197904071,
      "grad_norm": 4.172301769256592,
      "learning_rate": 8.498711506550983e-05,
      "loss": 2.1219,
      "step": 104
    },
    {
      "epoch": 0.2682789197904071,
      "eval_loss": 2.0327656269073486,
      "eval_runtime": 126.7113,
      "eval_samples_per_second": 1.934,
      "eval_steps_per_second": 0.971,
      "step": 104
    },
    {
      "epoch": 0.27859733978234585,
      "grad_norm": 4.718521595001221,
      "learning_rate": 8.378375419287099e-05,
      "loss": 2.0347,
      "step": 108
    },
    {
      "epoch": 0.2889157597742846,
      "grad_norm": 4.244528293609619,
      "learning_rate": 8.25432560969328e-05,
      "loss": 2.0569,
      "step": 112
    },
    {
      "epoch": 0.2889157597742846,
      "eval_loss": 2.052781105041504,
      "eval_runtime": 252.644,
      "eval_samples_per_second": 0.97,
      "eval_steps_per_second": 0.487,
      "step": 112
    },
    {
      "epoch": 0.2992341797662233,
      "grad_norm": 5.245513439178467,
      "learning_rate": 8.126698441107146e-05,
      "loss": 2.1042,
      "step": 116
    },
    {
      "epoch": 0.309552599758162,
      "grad_norm": 5.125065326690674,
      "learning_rate": 7.995634209323886e-05,
      "loss": 2.1062,
      "step": 120
    },
    {
      "epoch": 0.309552599758162,
      "eval_loss": 2.035524845123291,
      "eval_runtime": 141.6722,
      "eval_samples_per_second": 1.729,
      "eval_steps_per_second": 0.868,
      "step": 120
    },
    {
      "epoch": 0.3198710197501008,
      "grad_norm": 3.9797890186309814,
      "learning_rate": 7.861276988374302e-05,
      "loss": 2.0977,
      "step": 124
    },
    {
      "epoch": 0.3301894397420395,
      "grad_norm": 4.666868686676025,
      "learning_rate": 7.723774472149601e-05,
      "loss": 2.0522,
      "step": 128
    },
    {
      "epoch": 0.3301894397420395,
      "eval_loss": 2.036526679992676,
      "eval_runtime": 133.5887,
      "eval_samples_per_second": 1.834,
      "eval_steps_per_second": 0.921,
      "step": 128
    },
    {
      "epoch": 0.34050785973397824,
      "grad_norm": 4.081684589385986,
      "learning_rate": 7.583277812046993e-05,
      "loss": 2.0314,
      "step": 132
    },
    {
      "epoch": 0.35082627972591696,
      "grad_norm": 4.402740955352783,
      "learning_rate": 7.439941450814591e-05,
      "loss": 2.0631,
      "step": 136
    },
    {
      "epoch": 0.35082627972591696,
      "eval_loss": 2.029966115951538,
      "eval_runtime": 129.43,
      "eval_samples_per_second": 1.893,
      "eval_steps_per_second": 0.95,
      "step": 136
    },
    {
      "epoch": 0.3611446997178557,
      "grad_norm": 4.357232570648193,
      "learning_rate": 7.293922952778239e-05,
      "loss": 2.0651,
      "step": 140
    },
    {
      "epoch": 0.37146311970979445,
      "grad_norm": 4.258504390716553,
      "learning_rate": 7.145382830636924e-05,
      "loss": 2.1052,
      "step": 144
    },
    {
      "epoch": 0.37146311970979445,
      "eval_loss": 2.040853500366211,
      "eval_runtime": 266.0097,
      "eval_samples_per_second": 0.921,
      "eval_steps_per_second": 0.462,
      "step": 144
    },
    {
      "epoch": 0.3817815397017332,
      "grad_norm": 4.097599506378174,
      "learning_rate": 6.994484369017143e-05,
      "loss": 2.0833,
      "step": 148
    },
    {
      "epoch": 0.3920999596936719,
      "grad_norm": 4.530006408691406,
      "learning_rate": 6.841393444980177e-05,
      "loss": 2.0875,
      "step": 152
    },
    {
      "epoch": 0.3920999596936719,
      "eval_loss": 2.045410394668579,
      "eval_runtime": 253.392,
      "eval_samples_per_second": 0.967,
      "eval_steps_per_second": 0.485,
      "step": 152
    },
    {
      "epoch": 0.4024183796856106,
      "grad_norm": 4.209381580352783,
      "learning_rate": 6.686278345679625e-05,
      "loss": 2.1119,
      "step": 156
    },
    {
      "epoch": 0.4127367996775494,
      "grad_norm": 5.037073612213135,
      "learning_rate": 6.529309583369605e-05,
      "loss": 2.0854,
      "step": 160
    },
    {
      "epoch": 0.4127367996775494,
      "eval_loss": 2.02734112739563,
      "eval_runtime": 140.0201,
      "eval_samples_per_second": 1.75,
      "eval_steps_per_second": 0.878,
      "step": 160
    },
    {
      "epoch": 0.4230552196694881,
      "grad_norm": 3.838669776916504,
      "learning_rate": 6.370659707966967e-05,
      "loss": 2.0698,
      "step": 164
    },
    {
      "epoch": 0.43337363966142683,
      "grad_norm": 4.199975967407227,
      "learning_rate": 6.2105031173736e-05,
      "loss": 2.0533,
      "step": 168
    },
    {
      "epoch": 0.43337363966142683,
      "eval_loss": 2.0528602600097656,
      "eval_runtime": 136.9999,
      "eval_samples_per_second": 1.788,
      "eval_steps_per_second": 0.898,
      "step": 168
    },
    {
      "epoch": 0.44369205965336556,
      "grad_norm": 4.185637950897217,
      "learning_rate": 6.049015865767318e-05,
      "loss": 2.0949,
      "step": 172
    },
    {
      "epoch": 0.45401047964530433,
      "grad_norm": 4.430467128753662,
      "learning_rate": 5.88637547007204e-05,
      "loss": 2.1096,
      "step": 176
    },
    {
      "epoch": 0.45401047964530433,
      "eval_loss": 2.0372908115386963,
      "eval_runtime": 126.2447,
      "eval_samples_per_second": 1.941,
      "eval_steps_per_second": 0.974,
      "step": 176
    },
    {
      "epoch": 0.46432889963724305,
      "grad_norm": 4.2635416984558105,
      "learning_rate": 5.722760714820057e-05,
      "loss": 2.0936,
      "step": 180
    },
    {
      "epoch": 0.4746473196291818,
      "grad_norm": 3.9342944622039795,
      "learning_rate": 5.5583514556208514e-05,
      "loss": 2.0288,
      "step": 184
    },
    {
      "epoch": 0.4746473196291818,
      "eval_loss": 2.028881549835205,
      "eval_runtime": 251.1061,
      "eval_samples_per_second": 0.976,
      "eval_steps_per_second": 0.49,
      "step": 184
    },
    {
      "epoch": 0.4849657396211205,
      "grad_norm": 4.739011287689209,
      "learning_rate": 5.393328421452514e-05,
      "loss": 2.0552,
      "step": 188
    },
    {
      "epoch": 0.49528415961305927,
      "grad_norm": 4.251812934875488,
      "learning_rate": 5.2278730159931076e-05,
      "loss": 2.1344,
      "step": 192
    },
    {
      "epoch": 0.49528415961305927,
      "eval_loss": 2.037468671798706,
      "eval_runtime": 127.6312,
      "eval_samples_per_second": 1.92,
      "eval_steps_per_second": 0.964,
      "step": 192
    },
    {
      "epoch": 0.505602579604998,
      "grad_norm": 4.010369777679443,
      "learning_rate": 5.062167118210367e-05,
      "loss": 2.1012,
      "step": 196
    },
    {
      "epoch": 0.5159209995969367,
      "grad_norm": 4.2648844718933105,
      "learning_rate": 4.896392882428901e-05,
      "loss": 2.0952,
      "step": 200
    },
    {
      "epoch": 0.5159209995969367,
      "eval_loss": 2.044463634490967,
      "eval_runtime": 144.9327,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.849,
      "step": 200
    },
    {
      "epoch": 0.5262394195888754,
      "grad_norm": 5.115091800689697,
      "learning_rate": 4.730732538094749e-05,
      "loss": 2.092,
      "step": 204
    },
    {
      "epoch": 0.5365578395808142,
      "grad_norm": 4.243156909942627,
      "learning_rate": 4.565368189457313e-05,
      "loss": 2.0613,
      "step": 208
    },
    {
      "epoch": 0.5365578395808142,
      "eval_loss": 2.0373597145080566,
      "eval_runtime": 139.7505,
      "eval_samples_per_second": 1.753,
      "eval_steps_per_second": 0.88,
      "step": 208
    },
    {
      "epoch": 0.5468762595727529,
      "grad_norm": 4.147082328796387,
      "learning_rate": 4.400481615388948e-05,
      "loss": 2.0742,
      "step": 212
    },
    {
      "epoch": 0.5571946795646917,
      "grad_norm": 4.240519046783447,
      "learning_rate": 4.236254069562213e-05,
      "loss": 2.0441,
      "step": 216
    },
    {
      "epoch": 0.5571946795646917,
      "eval_loss": 2.0225021839141846,
      "eval_runtime": 140.6705,
      "eval_samples_per_second": 1.742,
      "eval_steps_per_second": 0.874,
      "step": 216
    },
    {
      "epoch": 0.5675130995566304,
      "grad_norm": 5.2874650955200195,
      "learning_rate": 4.0728660812044536e-05,
      "loss": 2.034,
      "step": 220
    },
    {
      "epoch": 0.5778315195485692,
      "grad_norm": 4.569270610809326,
      "learning_rate": 3.910497256648742e-05,
      "loss": 2.0493,
      "step": 224
    },
    {
      "epoch": 0.5778315195485692,
      "eval_loss": 2.0379703044891357,
      "eval_runtime": 245.5153,
      "eval_samples_per_second": 0.998,
      "eval_steps_per_second": 0.501,
      "step": 224
    },
    {
      "epoch": 0.5881499395405079,
      "grad_norm": 3.7465648651123047,
      "learning_rate": 3.749326081899329e-05,
      "loss": 2.0379,
      "step": 228
    },
    {
      "epoch": 0.5984683595324466,
      "grad_norm": 4.019279479980469,
      "learning_rate": 3.589529726428615e-05,
      "loss": 2.0568,
      "step": 232
    },
    {
      "epoch": 0.5984683595324466,
      "eval_loss": 2.021915912628174,
      "eval_runtime": 134.5206,
      "eval_samples_per_second": 1.821,
      "eval_steps_per_second": 0.914,
      "step": 232
    },
    {
      "epoch": 0.6087867795243853,
      "grad_norm": 3.8277833461761475,
      "learning_rate": 3.431283848421347e-05,
      "loss": 2.021,
      "step": 236
    },
    {
      "epoch": 0.619105199516324,
      "grad_norm": 3.819260358810425,
      "learning_rate": 3.274762401680124e-05,
      "loss": 2.0477,
      "step": 240
    },
    {
      "epoch": 0.619105199516324,
      "eval_loss": 2.0260963439941406,
      "eval_runtime": 128.1656,
      "eval_samples_per_second": 1.912,
      "eval_steps_per_second": 0.96,
      "step": 240
    },
    {
      "epoch": 0.6294236195082628,
      "grad_norm": 3.959582805633545,
      "learning_rate": 3.120137444404442e-05,
      "loss": 2.073,
      "step": 244
    },
    {
      "epoch": 0.6397420395002016,
      "grad_norm": 4.131293296813965,
      "learning_rate": 2.9675789500535328e-05,
      "loss": 2.1065,
      "step": 248
    },
    {
      "epoch": 0.6397420395002016,
      "eval_loss": 2.0310373306274414,
      "eval_runtime": 121.8664,
      "eval_samples_per_second": 2.01,
      "eval_steps_per_second": 1.009,
      "step": 248
    },
    {
      "epoch": 0.6500604594921403,
      "grad_norm": 3.729912519454956,
      "learning_rate": 2.8172546205008683e-05,
      "loss": 2.0231,
      "step": 252
    },
    {
      "epoch": 0.660378879484079,
      "grad_norm": 3.492910861968994,
      "learning_rate": 2.6693297016857188e-05,
      "loss": 2.0245,
      "step": 256
    },
    {
      "epoch": 0.660378879484079,
      "eval_loss": 2.020824670791626,
      "eval_runtime": 138.0864,
      "eval_samples_per_second": 1.774,
      "eval_steps_per_second": 0.891,
      "step": 256
    },
    {
      "epoch": 0.6706972994760177,
      "grad_norm": 4.330714225769043,
      "learning_rate": 2.523966801964468e-05,
      "loss": 2.0436,
      "step": 260
    },
    {
      "epoch": 0.6810157194679565,
      "grad_norm": 4.122495174407959,
      "learning_rate": 2.3813257133612827e-05,
      "loss": 2.1013,
      "step": 264
    },
    {
      "epoch": 0.6810157194679565,
      "eval_loss": 2.0270016193389893,
      "eval_runtime": 254.5885,
      "eval_samples_per_second": 0.962,
      "eval_steps_per_second": 0.483,
      "step": 264
    },
    {
      "epoch": 0.6913341394598952,
      "grad_norm": 4.589378833770752,
      "learning_rate": 2.2415632359146856e-05,
      "loss": 2.0243,
      "step": 268
    },
    {
      "epoch": 0.7016525594518339,
      "grad_norm": 3.9164700508117676,
      "learning_rate": 2.104833005313131e-05,
      "loss": 2.0356,
      "step": 272
    },
    {
      "epoch": 0.7016525594518339,
      "eval_loss": 2.0204756259918213,
      "eval_runtime": 139.155,
      "eval_samples_per_second": 1.761,
      "eval_steps_per_second": 0.884,
      "step": 272
    },
    {
      "epoch": 0.7119709794437726,
      "grad_norm": 4.024805545806885,
      "learning_rate": 1.971285324008994e-05,
      "loss": 2.0727,
      "step": 276
    },
    {
      "epoch": 0.7222893994357114,
      "grad_norm": 3.9658894538879395,
      "learning_rate": 1.84106699599668e-05,
      "loss": 2.0815,
      "step": 280
    },
    {
      "epoch": 0.7222893994357114,
      "eval_loss": 2.011732339859009,
      "eval_runtime": 135.732,
      "eval_samples_per_second": 1.805,
      "eval_steps_per_second": 0.906,
      "step": 280
    },
    {
      "epoch": 0.7326078194276502,
      "grad_norm": 4.008228778839111,
      "learning_rate": 1.7143211654364762e-05,
      "loss": 2.0872,
      "step": 284
    },
    {
      "epoch": 0.7429262394195889,
      "grad_norm": 4.043550968170166,
      "learning_rate": 1.5911871593014837e-05,
      "loss": 2.0898,
      "step": 288
    },
    {
      "epoch": 0.7429262394195889,
      "eval_loss": 2.0174756050109863,
      "eval_runtime": 134.8529,
      "eval_samples_per_second": 1.817,
      "eval_steps_per_second": 0.912,
      "step": 288
    },
    {
      "epoch": 0.7532446594115276,
      "grad_norm": 4.166672706604004,
      "learning_rate": 1.4718003342206722e-05,
      "loss": 2.0861,
      "step": 292
    },
    {
      "epoch": 0.7635630794034663,
      "grad_norm": 4.1882100105285645,
      "learning_rate": 1.3562919276863844e-05,
      "loss": 2.0529,
      "step": 296
    },
    {
      "epoch": 0.7635630794034663,
      "eval_loss": 2.017113208770752,
      "eval_runtime": 126.4791,
      "eval_samples_per_second": 1.937,
      "eval_steps_per_second": 0.972,
      "step": 296
    },
    {
      "epoch": 0.7738814993954051,
      "grad_norm": 4.4058709144592285,
      "learning_rate": 1.2447889137898293e-05,
      "loss": 2.0136,
      "step": 300
    },
    {
      "epoch": 0.7841999193873438,
      "grad_norm": 4.662993431091309,
      "learning_rate": 1.1374138636432053e-05,
      "loss": 2.0281,
      "step": 304
    },
    {
      "epoch": 0.7841999193873438,
      "eval_loss": 2.0133650302886963,
      "eval_runtime": 244.3912,
      "eval_samples_per_second": 1.002,
      "eval_steps_per_second": 0.503,
      "step": 304
    },
    {
      "epoch": 0.7945183393792825,
      "grad_norm": 4.017709732055664,
      "learning_rate": 1.0342848106418368e-05,
      "loss": 2.0556,
      "step": 308
    },
    {
      "epoch": 0.8048367593712212,
      "grad_norm": 4.138412952423096,
      "learning_rate": 9.35515120714447e-06,
      "loss": 2.0473,
      "step": 312
    },
    {
      "epoch": 0.8048367593712212,
      "eval_loss": 2.01501727104187,
      "eval_runtime": 142.6675,
      "eval_samples_per_second": 1.717,
      "eval_steps_per_second": 0.862,
      "step": 312
    },
    {
      "epoch": 0.8151551793631601,
      "grad_norm": 4.134696006774902,
      "learning_rate": 8.41213367704224e-06,
      "loss": 2.0193,
      "step": 316
    },
    {
      "epoch": 0.8254735993550988,
      "grad_norm": 3.4751768112182617,
      "learning_rate": 7.51483214017637e-06,
      "loss": 2.0315,
      "step": 320
    },
    {
      "epoch": 0.8254735993550988,
      "eval_loss": 2.008768320083618,
      "eval_runtime": 122.1967,
      "eval_samples_per_second": 2.005,
      "eval_steps_per_second": 1.007,
      "step": 320
    },
    {
      "epoch": 0.8357920193470375,
      "grad_norm": 4.169270038604736,
      "learning_rate": 6.664232966721995e-06,
      "loss": 2.1053,
      "step": 324
    },
    {
      "epoch": 0.8461104393389762,
      "grad_norm": 3.8954155445098877,
      "learning_rate": 5.8612711886848196e-06,
      "loss": 2.0215,
      "step": 328
    },
    {
      "epoch": 0.8461104393389762,
      "eval_loss": 2.0070595741271973,
      "eval_runtime": 130.4734,
      "eval_samples_per_second": 1.878,
      "eval_steps_per_second": 0.943,
      "step": 328
    },
    {
      "epoch": 0.856428859330915,
      "grad_norm": 4.543677806854248,
      "learning_rate": 5.106829472055202e-06,
      "loss": 2.046,
      "step": 332
    },
    {
      "epoch": 0.8667472793228537,
      "grad_norm": 3.7477471828460693,
      "learning_rate": 4.401737146526219e-06,
      "loss": 2.0003,
      "step": 336
    },
    {
      "epoch": 0.8667472793228537,
      "eval_loss": 2.0092661380767822,
      "eval_runtime": 263.0794,
      "eval_samples_per_second": 0.931,
      "eval_steps_per_second": 0.468,
      "step": 336
    },
    {
      "epoch": 0.8770656993147924,
      "grad_norm": 3.9242446422576904,
      "learning_rate": 3.7467692938425057e-06,
      "loss": 2.0887,
      "step": 340
    },
    {
      "epoch": 0.8873841193067311,
      "grad_norm": 4.085505962371826,
      "learning_rate": 3.142645895781715e-06,
      "loss": 2.0561,
      "step": 344
    },
    {
      "epoch": 0.8873841193067311,
      "eval_loss": 2.013634443283081,
      "eval_runtime": 131.734,
      "eval_samples_per_second": 1.86,
      "eval_steps_per_second": 0.934,
      "step": 344
    },
    {
      "epoch": 0.8977025392986699,
      "grad_norm": 3.602693557739258,
      "learning_rate": 2.5900310427053044e-06,
      "loss": 2.0014,
      "step": 348
    },
    {
      "epoch": 0.9080209592906087,
      "grad_norm": 4.137627124786377,
      "learning_rate": 2.089532203548794e-06,
      "loss": 2.0407,
      "step": 352
    },
    {
      "epoch": 0.9080209592906087,
      "eval_loss": 2.013155460357666,
      "eval_runtime": 140.8366,
      "eval_samples_per_second": 1.74,
      "eval_steps_per_second": 0.873,
      "step": 352
    },
    {
      "epoch": 0.9183393792825474,
      "grad_norm": 4.600844860076904,
      "learning_rate": 1.6416995580537664e-06,
      "loss": 2.0908,
      "step": 356
    },
    {
      "epoch": 0.9286577992744861,
      "grad_norm": 3.9122657775878906,
      "learning_rate": 1.247025391975698e-06,
      "loss": 2.0257,
      "step": 360
    },
    {
      "epoch": 0.9286577992744861,
      "eval_loss": 2.0104849338531494,
      "eval_runtime": 131.7523,
      "eval_samples_per_second": 1.86,
      "eval_steps_per_second": 0.934,
      "step": 360
    },
    {
      "epoch": 0.9389762192664248,
      "grad_norm": 3.483215093612671,
      "learning_rate": 9.059435559326257e-07,
      "loss": 2.0517,
      "step": 364
    },
    {
      "epoch": 0.9492946392583635,
      "grad_norm": 3.709639072418213,
      "learning_rate": 6.188289884893062e-07,
      "loss": 2.0294,
      "step": 368
    },
    {
      "epoch": 0.9492946392583635,
      "eval_loss": 2.0089664459228516,
      "eval_runtime": 143.1227,
      "eval_samples_per_second": 1.712,
      "eval_steps_per_second": 0.859,
      "step": 368
    },
    {
      "epoch": 0.9596130592503023,
      "grad_norm": 3.766313314437866,
      "learning_rate": 3.8599730400115107e-07,
      "loss": 2.0648,
      "step": 372
    },
    {
      "epoch": 0.969931479242241,
      "grad_norm": 3.7077174186706543,
      "learning_rate": 2.0770444567118075e-07,
      "loss": 2.0321,
      "step": 376
    },
    {
      "epoch": 0.969931479242241,
      "eval_loss": 2.0088565349578857,
      "eval_runtime": 248.9022,
      "eval_samples_per_second": 0.984,
      "eval_steps_per_second": 0.494,
      "step": 376
    },
    {
      "epoch": 0.9802498992341797,
      "grad_norm": 3.656935930252075,
      "learning_rate": 8.414640420116305e-08,
      "loss": 2.0558,
      "step": 380
    },
    {
      "epoch": 0.9905683192261185,
      "grad_norm": 3.686316967010498,
      "learning_rate": 1.5459002346324135e-08,
      "loss": 2.0516,
      "step": 384
    },
    {
      "epoch": 0.9905683192261185,
      "eval_loss": 2.009140729904175,
      "eval_runtime": 135.7573,
      "eval_samples_per_second": 1.805,
      "eval_steps_per_second": 0.906,
      "step": 384
    }
  ],
  "logging_steps": 4,
  "max_steps": 387,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 8,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.875183525374198e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}