{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9987063389391979,
  "eval_steps": 500,
  "global_step": 386,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00258732212160414,
      "grad_norm": 1.5696258219863308e+34,
      "learning_rate": 1.282051282051282e-08,
      "loss": 34.5991,
      "step": 1
    },
    {
      "epoch": 0.0129366106080207,
      "grad_norm": 2.4800051592297763e+32,
      "learning_rate": 6.410256410256409e-08,
      "loss": 35.1328,
      "step": 5
    },
    {
      "epoch": 0.0258732212160414,
      "grad_norm": 2.54523168619682e+30,
      "learning_rate": 1.2820512820512818e-07,
      "loss": 35.3514,
      "step": 10
    },
    {
      "epoch": 0.03880983182406209,
      "grad_norm": 2.9350637382111784e+28,
      "learning_rate": 1.9230769230769231e-07,
      "loss": 35.2418,
      "step": 15
    },
    {
      "epoch": 0.0517464424320828,
      "grad_norm": 6.845379509396579e+24,
      "learning_rate": 2.5641025641025636e-07,
      "loss": 35.4029,
      "step": 20
    },
    {
      "epoch": 0.0646830530401035,
      "grad_norm": 9.432421146054794e+16,
      "learning_rate": 3.2051282051282055e-07,
      "loss": 35.4307,
      "step": 25
    },
    {
      "epoch": 0.07761966364812418,
      "grad_norm": 103996205868.7185,
      "learning_rate": 3.8461538461538463e-07,
      "loss": 35.3465,
      "step": 30
    },
    {
      "epoch": 0.09055627425614489,
      "grad_norm": 7902.581384813984,
      "learning_rate": 4.487179487179487e-07,
      "loss": 35.2609,
      "step": 35
    },
    {
      "epoch": 0.1034928848641656,
      "grad_norm": 3891.3695056510064,
      "learning_rate": 4.999897541535663e-07,
      "loss": 30.0438,
      "step": 40
    },
    {
      "epoch": 0.11642949547218628,
      "grad_norm": 2358.218700340259,
      "learning_rate": 4.996312377016688e-07,
      "loss": 20.8297,
      "step": 45
    },
    {
      "epoch": 0.129366106080207,
      "grad_norm": 330.05121170654246,
      "learning_rate": 4.987612684376705e-07,
      "loss": 12.961,
      "step": 50
    },
    {
      "epoch": 0.1423027166882277,
      "grad_norm": 193.68319402032964,
      "learning_rate": 4.973816287836379e-07,
      "loss": 10.809,
      "step": 55
    },
    {
      "epoch": 0.15523932729624837,
      "grad_norm": 63.544778628785686,
      "learning_rate": 4.954951453913442e-07,
      "loss": 10.385,
      "step": 60
    },
    {
      "epoch": 0.16817593790426907,
      "grad_norm": 26.370287565962407,
      "learning_rate": 4.931056833509313e-07,
      "loss": 10.1709,
      "step": 65
    },
    {
      "epoch": 0.18111254851228978,
      "grad_norm": 99.65626298246401,
      "learning_rate": 4.902181382719843e-07,
      "loss": 10.0785,
      "step": 70
    },
    {
      "epoch": 0.19404915912031048,
      "grad_norm": 91.97251247083376,
      "learning_rate": 4.868384262532425e-07,
      "loss": 10.0305,
      "step": 75
    },
    {
      "epoch": 0.2069857697283312,
      "grad_norm": 133.24722674771854,
      "learning_rate": 4.829734717614995e-07,
      "loss": 9.9683,
      "step": 80
    },
    {
      "epoch": 0.21992238033635186,
      "grad_norm": 49.44784904214244,
      "learning_rate": 4.78631193444524e-07,
      "loss": 9.8086,
      "step": 85
    },
    {
      "epoch": 0.23285899094437257,
      "grad_norm": 39.83457542693803,
      "learning_rate": 4.738204879070702e-07,
      "loss": 9.6444,
      "step": 90
    },
    {
      "epoch": 0.24579560155239327,
      "grad_norm": 35.524332578393064,
      "learning_rate": 4.6855121148321705e-07,
      "loss": 9.5317,
      "step": 95
    },
    {
      "epoch": 0.258732212160414,
      "grad_norm": 21.89346239249435,
      "learning_rate": 4.6283416004238185e-07,
      "loss": 9.4199,
      "step": 100
    },
    {
      "epoch": 0.2716688227684347,
      "grad_norm": 18.371198207871824,
      "learning_rate": 4.566810468703828e-07,
      "loss": 9.3422,
      "step": 105
    },
    {
      "epoch": 0.2846054333764554,
      "grad_norm": 14.363049133916364,
      "learning_rate": 4.5010447867086775e-07,
      "loss": 9.2499,
      "step": 110
    },
    {
      "epoch": 0.2975420439844761,
      "grad_norm": 25.239020822491135,
      "learning_rate": 4.431179297362797e-07,
      "loss": 9.1466,
      "step": 115
    },
    {
      "epoch": 0.31047865459249674,
      "grad_norm": 50.305464578443626,
      "learning_rate": 4.3573571434127553e-07,
      "loss": 9.1168,
      "step": 120
    },
    {
      "epoch": 0.32341526520051744,
      "grad_norm": 29.946521630452818,
      "learning_rate": 4.2797295741516337e-07,
      "loss": 9.0978,
      "step": 125
    },
    {
      "epoch": 0.33635187580853815,
      "grad_norm": 35.470152215729236,
      "learning_rate": 4.1984556355344205e-07,
      "loss": 9.0445,
      "step": 130
    },
    {
      "epoch": 0.34928848641655885,
      "grad_norm": 26.3769064790735,
      "learning_rate": 4.1137018443193496e-07,
      "loss": 9.0052,
      "step": 135
    },
    {
      "epoch": 0.36222509702457956,
      "grad_norm": 28.18463108090832,
      "learning_rate": 4.025641846902812e-07,
      "loss": 8.9554,
      "step": 140
    },
    {
      "epoch": 0.37516170763260026,
      "grad_norm": 23.521159788503216,
      "learning_rate": 3.9344560635468183e-07,
      "loss": 8.9328,
      "step": 145
    },
    {
      "epoch": 0.38809831824062097,
      "grad_norm": 20.007227770493905,
      "learning_rate": 3.8403313187279446e-07,
      "loss": 8.8458,
      "step": 150
    },
    {
      "epoch": 0.40103492884864167,
      "grad_norm": 20.506423559359675,
      "learning_rate": 3.743460458365114e-07,
      "loss": 8.8147,
      "step": 155
    },
    {
      "epoch": 0.4139715394566624,
      "grad_norm": 20.107542907366796,
      "learning_rate": 3.644041954710432e-07,
      "loss": 8.7916,
      "step": 160
    },
    {
      "epoch": 0.4269081500646831,
      "grad_norm": 18.743137987266405,
      "learning_rate": 3.5422794997126223e-07,
      "loss": 8.7202,
      "step": 165
    },
    {
      "epoch": 0.4398447606727037,
      "grad_norm": 21.53271448781464,
      "learning_rate": 3.438381587686152e-07,
      "loss": 8.6901,
      "step": 170
    },
    {
      "epoch": 0.45278137128072443,
      "grad_norm": 29.119808165824566,
      "learning_rate": 3.3325610881411314e-07,
      "loss": 8.6836,
      "step": 175
    },
    {
      "epoch": 0.46571798188874514,
      "grad_norm": 19.101582760449293,
      "learning_rate": 3.225034809649149e-07,
      "loss": 8.6107,
      "step": 180
    },
    {
      "epoch": 0.47865459249676584,
      "grad_norm": 26.344817395383185,
      "learning_rate": 3.116023055638638e-07,
      "loss": 8.5785,
      "step": 185
    },
    {
      "epoch": 0.49159120310478654,
      "grad_norm": 22.757551882374766,
      "learning_rate": 3.005749173029856e-07,
      "loss": 8.5407,
      "step": 190
    },
    {
      "epoch": 0.5045278137128072,
      "grad_norm": 17.287888797801536,
      "learning_rate": 2.894439094634258e-07,
      "loss": 8.4908,
      "step": 195
    },
    {
      "epoch": 0.517464424320828,
      "grad_norm": 18.78518698061505,
      "learning_rate": 2.782320876255818e-07,
      "loss": 8.4408,
      "step": 200
    },
    {
      "epoch": 0.5304010349288486,
      "grad_norm": 26.147386948788057,
      "learning_rate": 2.6696242294426794e-07,
      "loss": 8.4371,
      "step": 205
    },
    {
      "epoch": 0.5433376455368694,
      "grad_norm": 20.864739238751334,
      "learning_rate": 2.5565800508464693e-07,
      "loss": 8.3946,
      "step": 210
    },
    {
      "epoch": 0.55627425614489,
      "grad_norm": 25.03041019448578,
      "learning_rate": 2.443419949153531e-07,
      "loss": 8.3438,
      "step": 215
    },
    {
      "epoch": 0.5692108667529108,
      "grad_norm": 20.710123234104667,
      "learning_rate": 2.3303757705573201e-07,
      "loss": 8.3685,
      "step": 220
    },
    {
      "epoch": 0.5821474773609314,
      "grad_norm": 20.005194039944502,
      "learning_rate": 2.217679123744182e-07,
      "loss": 8.2943,
      "step": 225
    },
    {
      "epoch": 0.5950840879689522,
      "grad_norm": 20.64021311062467,
      "learning_rate": 2.1055609053657423e-07,
      "loss": 8.2681,
      "step": 230
    },
    {
      "epoch": 0.6080206985769728,
      "grad_norm": 18.041357467771427,
      "learning_rate": 1.9942508269701447e-07,
      "loss": 8.2574,
      "step": 235
    },
    {
      "epoch": 0.6209573091849935,
      "grad_norm": 24.18533351853082,
      "learning_rate": 1.883976944361362e-07,
      "loss": 8.2044,
      "step": 240
    },
    {
      "epoch": 0.6338939197930142,
      "grad_norm": 19.308910630600565,
      "learning_rate": 1.7749651903508505e-07,
      "loss": 8.1439,
      "step": 245
    },
    {
      "epoch": 0.6468305304010349,
      "grad_norm": 18.940923202491184,
      "learning_rate": 1.6674389118588684e-07,
      "loss": 8.2016,
      "step": 250
    },
    {
      "epoch": 0.6597671410090556,
      "grad_norm": 22.62903264366213,
      "learning_rate": 1.5616184123138476e-07,
      "loss": 8.1482,
      "step": 255
    },
    {
      "epoch": 0.6727037516170763,
      "grad_norm": 18.71414418245413,
      "learning_rate": 1.457720500287379e-07,
      "loss": 8.1104,
      "step": 260
    },
    {
      "epoch": 0.685640362225097,
      "grad_norm": 18.16814176627225,
      "learning_rate": 1.3559580452895682e-07,
      "loss": 8.1109,
      "step": 265
    },
    {
      "epoch": 0.6985769728331177,
      "grad_norm": 19.84573072806573,
      "learning_rate": 1.2565395416348867e-07,
      "loss": 8.0936,
      "step": 270
    },
    {
      "epoch": 0.7115135834411385,
      "grad_norm": 19.40761310343157,
      "learning_rate": 1.1596686812720555e-07,
      "loss": 8.081,
      "step": 275
    },
    {
      "epoch": 0.7244501940491591,
      "grad_norm": 17.694041027215786,
      "learning_rate": 1.065543936453182e-07,
      "loss": 8.0583,
      "step": 280
    },
    {
      "epoch": 0.7373868046571799,
      "grad_norm": 19.473414715193275,
      "learning_rate": 9.743581530971878e-08,
      "loss": 8.0536,
      "step": 285
    },
    {
      "epoch": 0.7503234152652005,
      "grad_norm": 20.279917617864715,
      "learning_rate": 8.862981556806499e-08,
      "loss": 8.0876,
      "step": 290
    },
    {
      "epoch": 0.7632600258732212,
      "grad_norm": 18.215356879442886,
      "learning_rate": 8.0154436446558e-08,
      "loss": 8.0188,
      "step": 295
    },
    {
      "epoch": 0.7761966364812419,
      "grad_norm": 17.250109490543473,
      "learning_rate": 7.202704258483663e-08,
      "loss": 8.0066,
      "step": 300
    },
    {
      "epoch": 0.7891332470892626,
      "grad_norm": 16.664504705841736,
      "learning_rate": 6.426428565872443e-08,
      "loss": 8.0198,
      "step": 305
    },
    {
      "epoch": 0.8020698576972833,
      "grad_norm": 21.103826588222557,
      "learning_rate": 5.688207026372027e-08,
      "loss": 8.0141,
      "step": 310
    },
    {
      "epoch": 0.815006468305304,
      "grad_norm": 17.388005585117728,
      "learning_rate": 4.989552132913219e-08,
      "loss": 7.9909,
      "step": 315
    },
    {
      "epoch": 0.8279430789133247,
      "grad_norm": 17.869968121368707,
      "learning_rate": 4.331895312961725e-08,
      "loss": 7.9866,
      "step": 320
    },
    {
      "epoch": 0.8408796895213454,
      "grad_norm": 19.683008826871088,
      "learning_rate": 3.7165839957618156e-08,
      "loss": 7.9899,
      "step": 325
    },
    {
      "epoch": 0.8538163001293662,
      "grad_norm": 17.460030572155425,
      "learning_rate": 3.144878851678298e-08,
      "loss": 7.9855,
      "step": 330
    },
    {
      "epoch": 0.8667529107373868,
      "grad_norm": 17.02791948891275,
      "learning_rate": 2.617951209292979e-08,
      "loss": 8.0126,
      "step": 335
    },
    {
      "epoch": 0.8796895213454075,
      "grad_norm": 20.515442084242775,
      "learning_rate": 2.136880655547596e-08,
      "loss": 8.0,
      "step": 340
    },
    {
      "epoch": 0.8926261319534282,
      "grad_norm": 16.829511963680876,
      "learning_rate": 1.7026528238500426e-08,
      "loss": 7.9638,
      "step": 345
    },
    {
      "epoch": 0.9055627425614489,
      "grad_norm": 17.7377760938193,
      "learning_rate": 1.3161573746757415e-08,
      "loss": 7.923,
      "step": 350
    },
    {
      "epoch": 0.9184993531694696,
      "grad_norm": 17.924298918004375,
      "learning_rate": 9.78186172801565e-09,
      "loss": 7.9911,
      "step": 355
    },
    {
      "epoch": 0.9314359637774903,
      "grad_norm": 17.683222436895043,
      "learning_rate": 6.894316649068643e-09,
      "loss": 7.9886,
      "step": 360
    },
    {
      "epoch": 0.944372574385511,
      "grad_norm": 17.323184239913775,
      "learning_rate": 4.50485460865585e-09,
      "loss": 7.9649,
      "step": 365
    },
    {
      "epoch": 0.9573091849935317,
      "grad_norm": 16.759552285384956,
      "learning_rate": 2.6183712163621308e-09,
      "loss": 7.9909,
      "step": 370
    },
    {
      "epoch": 0.9702457956015524,
      "grad_norm": 16.40682058120129,
      "learning_rate": 1.2387315623294536e-09,
      "loss": 8.0056,
      "step": 375
    },
    {
      "epoch": 0.9831824062095731,
      "grad_norm": 16.270242929964763,
      "learning_rate": 3.6876229833118776e-10,
      "loss": 7.9802,
      "step": 380
    },
    {
      "epoch": 0.9961190168175937,
      "grad_norm": 17.17813750433213,
      "learning_rate": 1.0245846433665217e-11,
      "loss": 7.9839,
      "step": 385
    },
    {
      "epoch": 0.9987063389391979,
      "eval_loss": 7.827143669128418,
      "eval_runtime": 3.6254,
      "eval_samples_per_second": 62.613,
      "eval_steps_per_second": 1.103,
      "step": 386
    },
    {
      "epoch": 0.9987063389391979,
      "step": 386,
      "total_flos": 161536404357120.0,
      "train_loss": 11.514628536342958,
      "train_runtime": 5043.2067,
      "train_samples_per_second": 19.608,
      "train_steps_per_second": 0.077
    }
  ],
  "logging_steps": 5,
  "max_steps": 386,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 161536404357120.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}