{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9987063389391979,
"eval_steps": 500,
"global_step": 386,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00258732212160414,
"grad_norm": 1.5696258219863308e+34,
"learning_rate": 1.282051282051282e-08,
"loss": 34.5991,
"step": 1
},
{
"epoch": 0.0129366106080207,
"grad_norm": 2.4800051592297763e+32,
"learning_rate": 6.410256410256409e-08,
"loss": 35.1328,
"step": 5
},
{
"epoch": 0.0258732212160414,
"grad_norm": 2.54523168619682e+30,
"learning_rate": 1.2820512820512818e-07,
"loss": 35.3514,
"step": 10
},
{
"epoch": 0.03880983182406209,
"grad_norm": 2.9350637382111784e+28,
"learning_rate": 1.9230769230769231e-07,
"loss": 35.2418,
"step": 15
},
{
"epoch": 0.0517464424320828,
"grad_norm": 6.845379509396579e+24,
"learning_rate": 2.5641025641025636e-07,
"loss": 35.4029,
"step": 20
},
{
"epoch": 0.0646830530401035,
"grad_norm": 9.432421146054794e+16,
"learning_rate": 3.2051282051282055e-07,
"loss": 35.4307,
"step": 25
},
{
"epoch": 0.07761966364812418,
"grad_norm": 103996205868.7185,
"learning_rate": 3.8461538461538463e-07,
"loss": 35.3465,
"step": 30
},
{
"epoch": 0.09055627425614489,
"grad_norm": 7902.581384813984,
"learning_rate": 4.487179487179487e-07,
"loss": 35.2609,
"step": 35
},
{
"epoch": 0.1034928848641656,
"grad_norm": 3891.3695056510064,
"learning_rate": 4.999897541535663e-07,
"loss": 30.0438,
"step": 40
},
{
"epoch": 0.11642949547218628,
"grad_norm": 2358.218700340259,
"learning_rate": 4.996312377016688e-07,
"loss": 20.8297,
"step": 45
},
{
"epoch": 0.129366106080207,
"grad_norm": 330.05121170654246,
"learning_rate": 4.987612684376705e-07,
"loss": 12.961,
"step": 50
},
{
"epoch": 0.1423027166882277,
"grad_norm": 193.68319402032964,
"learning_rate": 4.973816287836379e-07,
"loss": 10.809,
"step": 55
},
{
"epoch": 0.15523932729624837,
"grad_norm": 63.544778628785686,
"learning_rate": 4.954951453913442e-07,
"loss": 10.385,
"step": 60
},
{
"epoch": 0.16817593790426907,
"grad_norm": 26.370287565962407,
"learning_rate": 4.931056833509313e-07,
"loss": 10.1709,
"step": 65
},
{
"epoch": 0.18111254851228978,
"grad_norm": 99.65626298246401,
"learning_rate": 4.902181382719843e-07,
"loss": 10.0785,
"step": 70
},
{
"epoch": 0.19404915912031048,
"grad_norm": 91.97251247083376,
"learning_rate": 4.868384262532425e-07,
"loss": 10.0305,
"step": 75
},
{
"epoch": 0.2069857697283312,
"grad_norm": 133.24722674771854,
"learning_rate": 4.829734717614995e-07,
"loss": 9.9683,
"step": 80
},
{
"epoch": 0.21992238033635186,
"grad_norm": 49.44784904214244,
"learning_rate": 4.78631193444524e-07,
"loss": 9.8086,
"step": 85
},
{
"epoch": 0.23285899094437257,
"grad_norm": 39.83457542693803,
"learning_rate": 4.738204879070702e-07,
"loss": 9.6444,
"step": 90
},
{
"epoch": 0.24579560155239327,
"grad_norm": 35.524332578393064,
"learning_rate": 4.6855121148321705e-07,
"loss": 9.5317,
"step": 95
},
{
"epoch": 0.258732212160414,
"grad_norm": 21.89346239249435,
"learning_rate": 4.6283416004238185e-07,
"loss": 9.4199,
"step": 100
},
{
"epoch": 0.2716688227684347,
"grad_norm": 18.371198207871824,
"learning_rate": 4.566810468703828e-07,
"loss": 9.3422,
"step": 105
},
{
"epoch": 0.2846054333764554,
"grad_norm": 14.363049133916364,
"learning_rate": 4.5010447867086775e-07,
"loss": 9.2499,
"step": 110
},
{
"epoch": 0.2975420439844761,
"grad_norm": 25.239020822491135,
"learning_rate": 4.431179297362797e-07,
"loss": 9.1466,
"step": 115
},
{
"epoch": 0.31047865459249674,
"grad_norm": 50.305464578443626,
"learning_rate": 4.3573571434127553e-07,
"loss": 9.1168,
"step": 120
},
{
"epoch": 0.32341526520051744,
"grad_norm": 29.946521630452818,
"learning_rate": 4.2797295741516337e-07,
"loss": 9.0978,
"step": 125
},
{
"epoch": 0.33635187580853815,
"grad_norm": 35.470152215729236,
"learning_rate": 4.1984556355344205e-07,
"loss": 9.0445,
"step": 130
},
{
"epoch": 0.34928848641655885,
"grad_norm": 26.3769064790735,
"learning_rate": 4.1137018443193496e-07,
"loss": 9.0052,
"step": 135
},
{
"epoch": 0.36222509702457956,
"grad_norm": 28.18463108090832,
"learning_rate": 4.025641846902812e-07,
"loss": 8.9554,
"step": 140
},
{
"epoch": 0.37516170763260026,
"grad_norm": 23.521159788503216,
"learning_rate": 3.9344560635468183e-07,
"loss": 8.9328,
"step": 145
},
{
"epoch": 0.38809831824062097,
"grad_norm": 20.007227770493905,
"learning_rate": 3.8403313187279446e-07,
"loss": 8.8458,
"step": 150
},
{
"epoch": 0.40103492884864167,
"grad_norm": 20.506423559359675,
"learning_rate": 3.743460458365114e-07,
"loss": 8.8147,
"step": 155
},
{
"epoch": 0.4139715394566624,
"grad_norm": 20.107542907366796,
"learning_rate": 3.644041954710432e-07,
"loss": 8.7916,
"step": 160
},
{
"epoch": 0.4269081500646831,
"grad_norm": 18.743137987266405,
"learning_rate": 3.5422794997126223e-07,
"loss": 8.7202,
"step": 165
},
{
"epoch": 0.4398447606727037,
"grad_norm": 21.53271448781464,
"learning_rate": 3.438381587686152e-07,
"loss": 8.6901,
"step": 170
},
{
"epoch": 0.45278137128072443,
"grad_norm": 29.119808165824566,
"learning_rate": 3.3325610881411314e-07,
"loss": 8.6836,
"step": 175
},
{
"epoch": 0.46571798188874514,
"grad_norm": 19.101582760449293,
"learning_rate": 3.225034809649149e-07,
"loss": 8.6107,
"step": 180
},
{
"epoch": 0.47865459249676584,
"grad_norm": 26.344817395383185,
"learning_rate": 3.116023055638638e-07,
"loss": 8.5785,
"step": 185
},
{
"epoch": 0.49159120310478654,
"grad_norm": 22.757551882374766,
"learning_rate": 3.005749173029856e-07,
"loss": 8.5407,
"step": 190
},
{
"epoch": 0.5045278137128072,
"grad_norm": 17.287888797801536,
"learning_rate": 2.894439094634258e-07,
"loss": 8.4908,
"step": 195
},
{
"epoch": 0.517464424320828,
"grad_norm": 18.78518698061505,
"learning_rate": 2.782320876255818e-07,
"loss": 8.4408,
"step": 200
},
{
"epoch": 0.5304010349288486,
"grad_norm": 26.147386948788057,
"learning_rate": 2.6696242294426794e-07,
"loss": 8.4371,
"step": 205
},
{
"epoch": 0.5433376455368694,
"grad_norm": 20.864739238751334,
"learning_rate": 2.5565800508464693e-07,
"loss": 8.3946,
"step": 210
},
{
"epoch": 0.55627425614489,
"grad_norm": 25.03041019448578,
"learning_rate": 2.443419949153531e-07,
"loss": 8.3438,
"step": 215
},
{
"epoch": 0.5692108667529108,
"grad_norm": 20.710123234104667,
"learning_rate": 2.3303757705573201e-07,
"loss": 8.3685,
"step": 220
},
{
"epoch": 0.5821474773609314,
"grad_norm": 20.005194039944502,
"learning_rate": 2.217679123744182e-07,
"loss": 8.2943,
"step": 225
},
{
"epoch": 0.5950840879689522,
"grad_norm": 20.64021311062467,
"learning_rate": 2.1055609053657423e-07,
"loss": 8.2681,
"step": 230
},
{
"epoch": 0.6080206985769728,
"grad_norm": 18.041357467771427,
"learning_rate": 1.9942508269701447e-07,
"loss": 8.2574,
"step": 235
},
{
"epoch": 0.6209573091849935,
"grad_norm": 24.18533351853082,
"learning_rate": 1.883976944361362e-07,
"loss": 8.2044,
"step": 240
},
{
"epoch": 0.6338939197930142,
"grad_norm": 19.308910630600565,
"learning_rate": 1.7749651903508505e-07,
"loss": 8.1439,
"step": 245
},
{
"epoch": 0.6468305304010349,
"grad_norm": 18.940923202491184,
"learning_rate": 1.6674389118588684e-07,
"loss": 8.2016,
"step": 250
},
{
"epoch": 0.6597671410090556,
"grad_norm": 22.62903264366213,
"learning_rate": 1.5616184123138476e-07,
"loss": 8.1482,
"step": 255
},
{
"epoch": 0.6727037516170763,
"grad_norm": 18.71414418245413,
"learning_rate": 1.457720500287379e-07,
"loss": 8.1104,
"step": 260
},
{
"epoch": 0.685640362225097,
"grad_norm": 18.16814176627225,
"learning_rate": 1.3559580452895682e-07,
"loss": 8.1109,
"step": 265
},
{
"epoch": 0.6985769728331177,
"grad_norm": 19.84573072806573,
"learning_rate": 1.2565395416348867e-07,
"loss": 8.0936,
"step": 270
},
{
"epoch": 0.7115135834411385,
"grad_norm": 19.40761310343157,
"learning_rate": 1.1596686812720555e-07,
"loss": 8.081,
"step": 275
},
{
"epoch": 0.7244501940491591,
"grad_norm": 17.694041027215786,
"learning_rate": 1.065543936453182e-07,
"loss": 8.0583,
"step": 280
},
{
"epoch": 0.7373868046571799,
"grad_norm": 19.473414715193275,
"learning_rate": 9.743581530971878e-08,
"loss": 8.0536,
"step": 285
},
{
"epoch": 0.7503234152652005,
"grad_norm": 20.279917617864715,
"learning_rate": 8.862981556806499e-08,
"loss": 8.0876,
"step": 290
},
{
"epoch": 0.7632600258732212,
"grad_norm": 18.215356879442886,
"learning_rate": 8.0154436446558e-08,
"loss": 8.0188,
"step": 295
},
{
"epoch": 0.7761966364812419,
"grad_norm": 17.250109490543473,
"learning_rate": 7.202704258483663e-08,
"loss": 8.0066,
"step": 300
},
{
"epoch": 0.7891332470892626,
"grad_norm": 16.664504705841736,
"learning_rate": 6.426428565872443e-08,
"loss": 8.0198,
"step": 305
},
{
"epoch": 0.8020698576972833,
"grad_norm": 21.103826588222557,
"learning_rate": 5.688207026372027e-08,
"loss": 8.0141,
"step": 310
},
{
"epoch": 0.815006468305304,
"grad_norm": 17.388005585117728,
"learning_rate": 4.989552132913219e-08,
"loss": 7.9909,
"step": 315
},
{
"epoch": 0.8279430789133247,
"grad_norm": 17.869968121368707,
"learning_rate": 4.331895312961725e-08,
"loss": 7.9866,
"step": 320
},
{
"epoch": 0.8408796895213454,
"grad_norm": 19.683008826871088,
"learning_rate": 3.7165839957618156e-08,
"loss": 7.9899,
"step": 325
},
{
"epoch": 0.8538163001293662,
"grad_norm": 17.460030572155425,
"learning_rate": 3.144878851678298e-08,
"loss": 7.9855,
"step": 330
},
{
"epoch": 0.8667529107373868,
"grad_norm": 17.02791948891275,
"learning_rate": 2.617951209292979e-08,
"loss": 8.0126,
"step": 335
},
{
"epoch": 0.8796895213454075,
"grad_norm": 20.515442084242775,
"learning_rate": 2.136880655547596e-08,
"loss": 8.0,
"step": 340
},
{
"epoch": 0.8926261319534282,
"grad_norm": 16.829511963680876,
"learning_rate": 1.7026528238500426e-08,
"loss": 7.9638,
"step": 345
},
{
"epoch": 0.9055627425614489,
"grad_norm": 17.7377760938193,
"learning_rate": 1.3161573746757415e-08,
"loss": 7.923,
"step": 350
},
{
"epoch": 0.9184993531694696,
"grad_norm": 17.924298918004375,
"learning_rate": 9.78186172801565e-09,
"loss": 7.9911,
"step": 355
},
{
"epoch": 0.9314359637774903,
"grad_norm": 17.683222436895043,
"learning_rate": 6.894316649068643e-09,
"loss": 7.9886,
"step": 360
},
{
"epoch": 0.944372574385511,
"grad_norm": 17.323184239913775,
"learning_rate": 4.50485460865585e-09,
"loss": 7.9649,
"step": 365
},
{
"epoch": 0.9573091849935317,
"grad_norm": 16.759552285384956,
"learning_rate": 2.6183712163621308e-09,
"loss": 7.9909,
"step": 370
},
{
"epoch": 0.9702457956015524,
"grad_norm": 16.40682058120129,
"learning_rate": 1.2387315623294536e-09,
"loss": 8.0056,
"step": 375
},
{
"epoch": 0.9831824062095731,
"grad_norm": 16.270242929964763,
"learning_rate": 3.6876229833118776e-10,
"loss": 7.9802,
"step": 380
},
{
"epoch": 0.9961190168175937,
"grad_norm": 17.17813750433213,
"learning_rate": 1.0245846433665217e-11,
"loss": 7.9839,
"step": 385
},
{
"epoch": 0.9987063389391979,
"eval_loss": 7.827143669128418,
"eval_runtime": 3.6254,
"eval_samples_per_second": 62.613,
"eval_steps_per_second": 1.103,
"step": 386
},
{
"epoch": 0.9987063389391979,
"step": 386,
"total_flos": 161536404357120.0,
"train_loss": 11.514628536342958,
"train_runtime": 5043.2067,
"train_samples_per_second": 19.608,
"train_steps_per_second": 0.077
}
],
"logging_steps": 5,
"max_steps": 386,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 161536404357120.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}