{ "best_metric": 0.8475400805473328, "best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_metamath_ortho/checkpoint-13", "epoch": 0.9898744430943702, "eval_steps": 13, "global_step": 611, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016200891049007696, "grad_norm": 15.34029483795166, "learning_rate": 2.3076923076923076e-05, "loss": 0.9735, "step": 1 }, { "epoch": 0.011340623734305387, "grad_norm": 9.837395668029785, "learning_rate": 0.00016153846153846153, "loss": 0.7761, "step": 7 }, { "epoch": 0.021061158363710003, "eval_loss": 0.8475400805473328, "eval_runtime": 13.0358, "eval_samples_per_second": 38.356, "eval_steps_per_second": 4.833, "step": 13 }, { "epoch": 0.022681247468610773, "grad_norm": 23.360591888427734, "learning_rate": 0.0002999979709808197, "loss": 0.7321, "step": 14 }, { "epoch": 0.03402187120291616, "grad_norm": 149.70909118652344, "learning_rate": 0.0002998701612152596, "loss": 5.7285, "step": 21 }, { "epoch": 0.042122316727420006, "eval_loss": 7.124204158782959, "eval_runtime": 87.0674, "eval_samples_per_second": 5.743, "eval_steps_per_second": 0.724, "step": 26 }, { "epoch": 0.04536249493722155, "grad_norm": 12.206794738769531, "learning_rate": 0.0002995437011859465, "loss": 7.8931, "step": 28 }, { "epoch": 0.056703118671526935, "grad_norm": 5.862812042236328, "learning_rate": 0.00029901902360990936, "loss": 6.6463, "step": 35 }, { "epoch": 0.06318347509113001, "eval_loss": 6.46243143081665, "eval_runtime": 69.1872, "eval_samples_per_second": 7.227, "eval_steps_per_second": 0.911, "step": 39 }, { "epoch": 0.06804374240583232, "grad_norm": 20.95379638671875, "learning_rate": 0.00029829682393805085, "loss": 6.4581, "step": 42 }, { "epoch": 0.0793843661401377, "grad_norm": 7.908657550811768, "learning_rate": 0.0002973780594333385, "loss": 6.3183, "step": 49 }, { "epoch": 0.08424463345484001, "eval_loss": 6.2700419425964355, "eval_runtime": 12.519, "eval_samples_per_second": 39.939, "eval_steps_per_second": 5.032, "step": 52 }, { "epoch": 0.0907249898744431, "grad_norm": 10.674615859985352, "learning_rate": 0.00029626394790197025, "loss": 6.2305, "step": 56 }, { "epoch": 0.10206561360874848, "grad_norm": 12.813573837280273, "learning_rate": 0.00029495596607919305, "loss": 6.3056, "step": 63 }, { "epoch": 0.10530579181855002, "eval_loss": 6.351102828979492, "eval_runtime": 98.3387, "eval_samples_per_second": 5.084, "eval_steps_per_second": 0.641, "step": 65 }, { "epoch": 0.11340623734305387, "grad_norm": 4.913240909576416, "learning_rate": 0.00029345584767191685, "loss": 6.2761, "step": 70 }, { "epoch": 0.12474686107735926, "grad_norm": 4.377003192901611, "learning_rate": 0.0002917655810607161, "loss": 6.2849, "step": 77 }, { "epoch": 0.12636695018226002, "eval_loss": 6.280069351196289, "eval_runtime": 95.8677, "eval_samples_per_second": 5.216, "eval_steps_per_second": 0.657, "step": 78 }, { "epoch": 0.13608748481166463, "grad_norm": 6.089831829071045, "learning_rate": 0.0002898874066642667, "loss": 6.2835, "step": 84 }, { "epoch": 0.14742810854597002, "grad_norm": 5.015829563140869, "learning_rate": 0.00028782381396971003, "loss": 6.2952, "step": 91 }, { "epoch": 0.14742810854597002, "eval_loss": 6.3205132484436035, "eval_runtime": 12.5936, "eval_samples_per_second": 39.703, "eval_steps_per_second": 5.003, "step": 91 }, { "epoch": 0.1587687322802754, "grad_norm": 5.68073844909668, "learning_rate": 0.00028557753823288173, "loss": 6.2939, "step": 98 }, { "epoch": 0.16848926690968002, "eval_loss": 6.356600284576416, "eval_runtime": 94.6198, "eval_samples_per_second": 5.284, "eval_steps_per_second": 0.666, "step": 104 }, { "epoch": 0.1701093560145808, "grad_norm": 7.282411098480225, "learning_rate": 0.0002831515568527781, "loss": 6.3244, "step": 105 }, { "epoch": 0.1814499797488862, "grad_norm": 3.9774227142333984, "learning_rate": 0.00028054908542506627, "loss": 6.2779, "step": 112 }, { "epoch": 0.18955042527339003, "eval_loss": 6.257962703704834, "eval_runtime": 94.7616, "eval_samples_per_second": 5.276, "eval_steps_per_second": 0.665, "step": 117 }, { "epoch": 0.19279060348319157, "grad_norm": 17.488378524780273, "learning_rate": 0.00027777357347986823, "loss": 6.2659, "step": 119 }, { "epoch": 0.20413122721749696, "grad_norm": 6.908725261688232, "learning_rate": 0.00027482869990946986, "loss": 6.087, "step": 126 }, { "epoch": 0.21061158363710003, "eval_loss": 5.979724407196045, "eval_runtime": 12.841, "eval_samples_per_second": 38.938, "eval_steps_per_second": 4.906, "step": 130 }, { "epoch": 0.21547185095180235, "grad_norm": 61.65908432006836, "learning_rate": 0.0002717183680920135, "loss": 6.1182, "step": 133 }, { "epoch": 0.22681247468610774, "grad_norm": 4.640040397644043, "learning_rate": 0.00026844670071763906, "loss": 5.8495, "step": 140 }, { "epoch": 0.23167274200081003, "eval_loss": 5.868290901184082, "eval_runtime": 97.3603, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.647, "step": 143 }, { "epoch": 0.23815309842041313, "grad_norm": 13.159008979797363, "learning_rate": 0.00026501803432393037, "loss": 5.7222, "step": 147 }, { "epoch": 0.24949372215471852, "grad_norm": 7.11221981048584, "learning_rate": 0.00026143691354791145, "loss": 5.6782, "step": 154 }, { "epoch": 0.25273390036452004, "eval_loss": 5.517679691314697, "eval_runtime": 90.6952, "eval_samples_per_second": 5.513, "eval_steps_per_second": 0.695, "step": 156 }, { "epoch": 0.2608343458890239, "grad_norm": 28.23392105102539, "learning_rate": 0.00025770808510220956, "loss": 5.4829, "step": 161 }, { "epoch": 0.27217496962332927, "grad_norm": 8.193851470947266, "learning_rate": 0.00025383649148337105, "loss": 5.4335, "step": 168 }, { "epoch": 0.27379505872823007, "eval_loss": 5.388493537902832, "eval_runtime": 12.9486, "eval_samples_per_second": 38.614, "eval_steps_per_second": 4.865, "step": 169 }, { "epoch": 0.28351559335763465, "grad_norm": 10.434927940368652, "learning_rate": 0.0002498272644206695, "loss": 5.3232, "step": 175 }, { "epoch": 0.29485621709194004, "grad_norm": 102.87190246582031, "learning_rate": 0.0002456857180740884, "loss": 5.4451, "step": 182 }, { "epoch": 0.29485621709194004, "eval_loss": 5.794764995574951, "eval_runtime": 92.9079, "eval_samples_per_second": 5.382, "eval_steps_per_second": 0.678, "step": 182 }, { "epoch": 0.30619684082624543, "grad_norm": 22.954904556274414, "learning_rate": 0.0002414173419904956, "loss": 5.5833, "step": 189 }, { "epoch": 0.3159173754556501, "eval_loss": 5.288653373718262, "eval_runtime": 96.562, "eval_samples_per_second": 5.178, "eval_steps_per_second": 0.652, "step": 195 }, { "epoch": 0.3175374645605508, "grad_norm": 11.466552734375, "learning_rate": 0.00023702779382734566, "loss": 5.3074, "step": 196 }, { "epoch": 0.3288780882948562, "grad_norm": 6.876862049102783, "learning_rate": 0.0002325228918535541, "loss": 5.2684, "step": 203 }, { "epoch": 0.33697853381936005, "eval_loss": 5.303614616394043, "eval_runtime": 12.9708, "eval_samples_per_second": 38.548, "eval_steps_per_second": 4.857, "step": 208 }, { "epoch": 0.3402187120291616, "grad_norm": 39.3548698425293, "learning_rate": 0.00022790860723748442, "loss": 5.2485, "step": 210 }, { "epoch": 0.351559335763467, "grad_norm": 16.2589111328125, "learning_rate": 0.00022319105613226921, "loss": 5.1159, "step": 217 }, { "epoch": 0.3580396921830701, "eval_loss": 5.110955238342285, "eval_runtime": 81.347, "eval_samples_per_second": 6.147, "eval_steps_per_second": 0.774, "step": 221 }, { "epoch": 0.3628999594977724, "grad_norm": 39.86543655395508, "learning_rate": 0.00021837649156895706, "loss": 5.0867, "step": 224 }, { "epoch": 0.37424058323207776, "grad_norm": 36.43711471557617, "learning_rate": 0.00021347129516822945, "loss": 5.0046, "step": 231 }, { "epoch": 0.37910085054678005, "eval_loss": 4.980620861053467, "eval_runtime": 83.5688, "eval_samples_per_second": 5.983, "eval_steps_per_second": 0.754, "step": 234 }, { "epoch": 0.38558120696638315, "grad_norm": 37.00041198730469, "learning_rate": 0.00020848196868167505, "loss": 4.9737, "step": 238 }, { "epoch": 0.39692183070068854, "grad_norm": 18.919788360595703, "learning_rate": 0.000203415125373832, "loss": 4.9134, "step": 245 }, { "epoch": 0.4001620089104901, "eval_loss": 4.938173770904541, "eval_runtime": 13.0027, "eval_samples_per_second": 38.454, "eval_steps_per_second": 4.845, "step": 247 }, { "epoch": 0.4082624544349939, "grad_norm": 14.608779907226562, "learning_rate": 0.00019827748125642242, "loss": 4.9585, "step": 252 }, { "epoch": 0.4196030781692993, "grad_norm": 12.640972137451172, "learning_rate": 0.0001930758461863965, "loss": 4.9145, "step": 259 }, { "epoch": 0.42122316727420006, "eval_loss": 4.95436429977417, "eval_runtime": 95.9176, "eval_samples_per_second": 5.213, "eval_steps_per_second": 0.657, "step": 260 }, { "epoch": 0.4309437019036047, "grad_norm": 12.592373847961426, "learning_rate": 0.0001878171148395872, "loss": 4.9281, "step": 266 }, { "epoch": 0.4422843256379101, "grad_norm": 20.04509735107422, "learning_rate": 0.00018250825757193848, "loss": 4.7976, "step": 273 }, { "epoch": 0.4422843256379101, "eval_loss": 4.795408248901367, "eval_runtime": 96.9219, "eval_samples_per_second": 5.159, "eval_steps_per_second": 0.65, "step": 273 }, { "epoch": 0.4536249493722155, "grad_norm": 15.908781051635742, "learning_rate": 0.0001771563111804211, "loss": 4.7328, "step": 280 }, { "epoch": 0.46334548400162007, "eval_loss": 4.689673900604248, "eval_runtime": 13.057, "eval_samples_per_second": 38.294, "eval_steps_per_second": 4.825, "step": 286 }, { "epoch": 0.46496557310652087, "grad_norm": 15.201325416564941, "learning_rate": 0.0001717683695758819, "loss": 4.5762, "step": 287 }, { "epoch": 0.47630619684082626, "grad_norm": 13.56847095489502, "learning_rate": 0.00016635157438018983, "loss": 4.6799, "step": 294 }, { "epoch": 0.4844066423653301, "eval_loss": 4.579278469085693, "eval_runtime": 95.898, "eval_samples_per_second": 5.214, "eval_steps_per_second": 0.657, "step": 299 }, { "epoch": 0.48764682057513165, "grad_norm": 21.771726608276367, "learning_rate": 0.0001609131054601416, "loss": 4.6044, "step": 301 }, { "epoch": 0.49898744430943703, "grad_norm": 10.611679077148438, "learning_rate": 0.00015546017141067432, "loss": 4.5047, "step": 308 }, { "epoch": 0.5054678007290401, "eval_loss": 4.66025972366333, "eval_runtime": 87.88, "eval_samples_per_second": 5.69, "eval_steps_per_second": 0.717, "step": 312 }, { "epoch": 0.5103280680437424, "grad_norm": 30.076244354248047, "learning_rate": 0.00015, "loss": 4.6057, "step": 315 }, { "epoch": 0.5216686917780478, "grad_norm": 49.967533111572266, "learning_rate": 0.0001445398285893257, "loss": 4.529, "step": 322 }, { "epoch": 0.5265289590927501, "eval_loss": 4.440496444702148, "eval_runtime": 13.0558, "eval_samples_per_second": 38.297, "eval_steps_per_second": 4.825, "step": 325 }, { "epoch": 0.5330093155123532, "grad_norm": 54.9085693359375, "learning_rate": 0.0001390868945398584, "loss": 4.3789, "step": 329 }, { "epoch": 0.5443499392466585, "grad_norm": 23.348072052001953, "learning_rate": 0.00013364842561981014, "loss": 4.3835, "step": 336 }, { "epoch": 0.5475901174564601, "eval_loss": 4.391562461853027, "eval_runtime": 99.2344, "eval_samples_per_second": 5.039, "eval_steps_per_second": 0.635, "step": 338 }, { "epoch": 0.555690562980964, "grad_norm": 21.500411987304688, "learning_rate": 0.00012823163042411807, "loss": 4.3474, "step": 343 }, { "epoch": 0.5670311867152693, "grad_norm": 19.43412971496582, "learning_rate": 0.0001228436888195789, "loss": 4.4279, "step": 350 }, { "epoch": 0.56865127582017, "eval_loss": 4.286036968231201, "eval_runtime": 86.192, "eval_samples_per_second": 5.801, "eval_steps_per_second": 0.731, "step": 351 }, { "epoch": 0.5783718104495748, "grad_norm": 25.414024353027344, "learning_rate": 0.00011749174242806152, "loss": 4.3234, "step": 357 }, { "epoch": 0.5897124341838801, "grad_norm": 28.848846435546875, "learning_rate": 0.00011218288516041279, "loss": 4.3177, "step": 364 }, { "epoch": 0.5897124341838801, "eval_loss": 4.3171234130859375, "eval_runtime": 13.0877, "eval_samples_per_second": 38.204, "eval_steps_per_second": 4.814, "step": 364 }, { "epoch": 0.6010530579181855, "grad_norm": 39.80937194824219, "learning_rate": 0.00010692415381360349, "loss": 4.39, "step": 371 }, { "epoch": 0.6107735925475901, "eval_loss": 4.327215671539307, "eval_runtime": 88.3705, "eval_samples_per_second": 5.658, "eval_steps_per_second": 0.713, "step": 377 }, { "epoch": 0.6123936816524909, "grad_norm": 42.101409912109375, "learning_rate": 0.00010172251874357757, "loss": 4.3718, "step": 378 }, { "epoch": 0.6237343053867963, "grad_norm": 70.50567626953125, "learning_rate": 9.658487462616794e-05, "loss": 4.3138, "step": 385 }, { "epoch": 0.6318347509113001, "eval_loss": 4.375349521636963, "eval_runtime": 99.9359, "eval_samples_per_second": 5.003, "eval_steps_per_second": 0.63, "step": 390 }, { "epoch": 0.6350749291211016, "grad_norm": 68.3165512084961, "learning_rate": 9.151803131832493e-05, "loss": 4.3688, "step": 392 }, { "epoch": 0.6464155528554071, "grad_norm": 32.950931549072266, "learning_rate": 8.652870483177049e-05, "loss": 4.2269, "step": 399 }, { "epoch": 0.6528959092750102, "eval_loss": 4.3338847160339355, "eval_runtime": 13.0873, "eval_samples_per_second": 38.205, "eval_steps_per_second": 4.814, "step": 403 }, { "epoch": 0.6577561765897124, "grad_norm": 34.02058792114258, "learning_rate": 8.162350843104291e-05, "loss": 4.2261, "step": 406 }, { "epoch": 0.6690968003240179, "grad_norm": 16.940593719482422, "learning_rate": 7.680894386773072e-05, "loss": 4.1075, "step": 413 }, { "epoch": 0.6739570676387201, "eval_loss": 4.1692986488342285, "eval_runtime": 91.0436, "eval_samples_per_second": 5.492, "eval_steps_per_second": 0.692, "step": 416 }, { "epoch": 0.6804374240583232, "grad_norm": 23.74928092956543, "learning_rate": 7.209139276251558e-05, "loss": 4.195, "step": 420 }, { "epoch": 0.6917780477926286, "grad_norm": 24.23059844970703, "learning_rate": 6.747710814644589e-05, "loss": 4.2285, "step": 427 }, { "epoch": 0.6950182260024301, "eval_loss": 4.118698596954346, "eval_runtime": 95.8606, "eval_samples_per_second": 5.216, "eval_steps_per_second": 0.657, "step": 429 }, { "epoch": 0.703118671526934, "grad_norm": 7.636209011077881, "learning_rate": 6.297220617265435e-05, "loss": 4.1181, "step": 434 }, { "epoch": 0.7144592952612394, "grad_norm": 34.99061584472656, "learning_rate": 5.858265800950438e-05, "loss": 4.1297, "step": 441 }, { "epoch": 0.7160793843661402, "eval_loss": 4.1250529289245605, "eval_runtime": 13.0819, "eval_samples_per_second": 38.221, "eval_steps_per_second": 4.816, "step": 442 }, { "epoch": 0.7257999189955447, "grad_norm": 10.205436706542969, "learning_rate": 5.4314281925911634e-05, "loss": 4.0536, "step": 448 }, { "epoch": 0.7371405427298502, "grad_norm": 14.625051498413086, "learning_rate": 5.0172735579330526e-05, "loss": 4.0021, "step": 455 }, { "epoch": 0.7371405427298502, "eval_loss": 4.036532878875732, "eval_runtime": 97.6614, "eval_samples_per_second": 5.12, "eval_steps_per_second": 0.645, "step": 455 }, { "epoch": 0.7484811664641555, "grad_norm": 17.34446907043457, "learning_rate": 4.616350851662895e-05, "loss": 4.0089, "step": 462 }, { "epoch": 0.7582017010935601, "eval_loss": 4.002528667449951, "eval_runtime": 95.8182, "eval_samples_per_second": 5.218, "eval_steps_per_second": 0.657, "step": 468 }, { "epoch": 0.759821790198461, "grad_norm": 14.288081169128418, "learning_rate": 4.229191489779047e-05, "loss": 3.9673, "step": 469 }, { "epoch": 0.7711624139327663, "grad_norm": 5.4416937828063965, "learning_rate": 3.8563086452088506e-05, "loss": 3.9458, "step": 476 }, { "epoch": 0.7792628594572701, "eval_loss": 3.992373466491699, "eval_runtime": 13.0871, "eval_samples_per_second": 38.206, "eval_steps_per_second": 4.814, "step": 481 }, { "epoch": 0.7825030376670717, "grad_norm": 12.701101303100586, "learning_rate": 3.498196567606959e-05, "loss": 3.9595, "step": 483 }, { "epoch": 0.7938436614013771, "grad_norm": 8.193514823913574, "learning_rate": 3.1553299282360966e-05, "loss": 3.9405, "step": 490 }, { "epoch": 0.8003240178209802, "eval_loss": 3.925393581390381, "eval_runtime": 84.435, "eval_samples_per_second": 5.922, "eval_steps_per_second": 0.746, "step": 494 }, { "epoch": 0.8051842851356824, "grad_norm": 15.101180076599121, "learning_rate": 2.828163190798644e-05, "loss": 3.8519, "step": 497 }, { "epoch": 0.8165249088699879, "grad_norm": 7.726598739624023, "learning_rate": 2.5171300090530106e-05, "loss": 3.9594, "step": 504 }, { "epoch": 0.8213851761846902, "eval_loss": 3.888984441757202, "eval_runtime": 80.8243, "eval_samples_per_second": 6.186, "eval_steps_per_second": 0.779, "step": 507 }, { "epoch": 0.8278655326042932, "grad_norm": 23.742778778076172, "learning_rate": 2.2226426520131734e-05, "loss": 3.8512, "step": 511 }, { "epoch": 0.8392061563385986, "grad_norm": 31.11621856689453, "learning_rate": 1.9450914574933725e-05, "loss": 3.9056, "step": 518 }, { "epoch": 0.8424463345484001, "eval_loss": 3.8773889541625977, "eval_runtime": 13.0864, "eval_samples_per_second": 38.208, "eval_steps_per_second": 4.814, "step": 520 }, { "epoch": 0.850546780072904, "grad_norm": 21.033510208129883, "learning_rate": 1.6848443147221828e-05, "loss": 3.8635, "step": 525 }, { "epoch": 0.8618874038072094, "grad_norm": 6.028651237487793, "learning_rate": 1.4422461767118233e-05, "loss": 3.8639, "step": 532 }, { "epoch": 0.8635074929121102, "eval_loss": 3.8758437633514404, "eval_runtime": 86.7534, "eval_samples_per_second": 5.763, "eval_steps_per_second": 0.726, "step": 533 }, { "epoch": 0.8732280275415147, "grad_norm": 15.711346626281738, "learning_rate": 1.2176186030289936e-05, "loss": 3.86, "step": 539 }, { "epoch": 0.8845686512758202, "grad_norm": 6.367184638977051, "learning_rate": 1.011259333573326e-05, "loss": 3.8543, "step": 546 }, { "epoch": 0.8845686512758202, "eval_loss": 3.867968797683716, "eval_runtime": 91.8465, "eval_samples_per_second": 5.444, "eval_steps_per_second": 0.686, "step": 546 }, { "epoch": 0.8959092750101255, "grad_norm": 14.810521125793457, "learning_rate": 8.234418939283866e-06, "loss": 3.9097, "step": 553 }, { "epoch": 0.9056298096395302, "eval_loss": 3.8501880168914795, "eval_runtime": 13.1149, "eval_samples_per_second": 38.125, "eval_steps_per_second": 4.804, "step": 559 }, { "epoch": 0.907249898744431, "grad_norm": 9.555310249328613, "learning_rate": 6.544152328083152e-06, "loss": 3.8848, "step": 560 }, { "epoch": 0.9185905224787363, "grad_norm": 11.84811782836914, "learning_rate": 5.044033920806933e-06, "loss": 3.8503, "step": 567 }, { "epoch": 0.9266909680032401, "eval_loss": 3.828705072402954, "eval_runtime": 96.4307, "eval_samples_per_second": 5.185, "eval_steps_per_second": 0.653, "step": 572 }, { "epoch": 0.9299311462130417, "grad_norm": 9.427416801452637, "learning_rate": 3.7360520980297514e-06, "loss": 3.8457, "step": 574 }, { "epoch": 0.9412717699473471, "grad_norm": 14.008903503417969, "learning_rate": 2.6219405666614402e-06, "loss": 3.789, "step": 581 }, { "epoch": 0.9477521263669502, "eval_loss": 3.8357293605804443, "eval_runtime": 99.2387, "eval_samples_per_second": 5.038, "eval_steps_per_second": 0.635, "step": 585 }, { "epoch": 0.9526123936816525, "grad_norm": 8.99583625793457, "learning_rate": 1.7031760619491353e-06, "loss": 3.8241, "step": 588 }, { "epoch": 0.9639530174159578, "grad_norm": 4.564662933349609, "learning_rate": 9.809763900905875e-07, "loss": 3.7923, "step": 595 }, { "epoch": 0.9688132847306602, "eval_loss": 3.8298635482788086, "eval_runtime": 13.1339, "eval_samples_per_second": 38.069, "eval_steps_per_second": 4.797, "step": 598 }, { "epoch": 0.9752936411502633, "grad_norm": 12.884811401367188, "learning_rate": 4.562988140535073e-07, "loss": 3.8022, "step": 602 }, { "epoch": 0.9866342648845686, "grad_norm": 4.150009632110596, "learning_rate": 1.298387847403437e-07, "loss": 3.8071, "step": 609 }, { "epoch": 0.9898744430943702, "eval_loss": 3.831874370574951, "eval_runtime": 92.7823, "eval_samples_per_second": 5.389, "eval_steps_per_second": 0.679, "step": 611 } ], "logging_steps": 7, "max_steps": 617, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.92558921542271e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }