{ "best_metric": 0.4760262072086334, "best_model_checkpoint": "/home/datta0/models/lora_final/Meta-Llama-3-8B_metamath_ortho/checkpoint-611", "epoch": 0.9898744430943702, "eval_steps": 13, "global_step": 611, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016200891049007696, "grad_norm": 13.657631874084473, "learning_rate": 2.3076923076923076e-05, "loss": 1.2159, "step": 1 }, { "epoch": 0.011340623734305387, "grad_norm": 4.022308349609375, "learning_rate": 0.00016153846153846153, "loss": 0.9348, "step": 7 }, { "epoch": 0.021061158363710003, "eval_loss": 0.7025884389877319, "eval_runtime": 12.1198, "eval_samples_per_second": 41.255, "eval_steps_per_second": 5.198, "step": 13 }, { "epoch": 0.022681247468610773, "grad_norm": 2.0974254608154297, "learning_rate": 0.0002999979709808197, "loss": 0.6798, "step": 14 }, { "epoch": 0.03402187120291616, "grad_norm": 2.313894033432007, "learning_rate": 0.0002998701612152596, "loss": 0.6609, "step": 21 }, { "epoch": 0.042122316727420006, "eval_loss": 0.6959722638130188, "eval_runtime": 76.7856, "eval_samples_per_second": 6.512, "eval_steps_per_second": 0.82, "step": 26 }, { "epoch": 0.04536249493722155, "grad_norm": 2.249401569366455, "learning_rate": 0.0002995437011859465, "loss": 0.6709, "step": 28 }, { "epoch": 0.056703118671526935, "grad_norm": 1.9721312522888184, "learning_rate": 0.00029901902360990936, "loss": 0.6695, "step": 35 }, { "epoch": 0.06318347509113001, "eval_loss": 0.6784710884094238, "eval_runtime": 79.9084, "eval_samples_per_second": 6.257, "eval_steps_per_second": 0.788, "step": 39 }, { "epoch": 0.06804374240583232, "grad_norm": 2.082033395767212, "learning_rate": 0.00029829682393805085, "loss": 0.6472, "step": 42 }, { "epoch": 0.0793843661401377, "grad_norm": 2.075465202331543, "learning_rate": 0.0002973780594333385, "loss": 0.6578, "step": 49 }, { "epoch": 0.08424463345484001, "eval_loss": 0.6770208477973938, "eval_runtime": 12.0924, "eval_samples_per_second": 41.348, "eval_steps_per_second": 5.21, "step": 52 }, { "epoch": 0.0907249898744431, "grad_norm": 2.3162450790405273, "learning_rate": 0.00029626394790197025, "loss": 0.651, "step": 56 }, { "epoch": 0.10206561360874848, "grad_norm": 2.016871929168701, "learning_rate": 0.00029495596607919305, "loss": 0.6222, "step": 63 }, { "epoch": 0.10530579181855002, "eval_loss": 0.6747815012931824, "eval_runtime": 57.4761, "eval_samples_per_second": 8.699, "eval_steps_per_second": 1.096, "step": 65 }, { "epoch": 0.11340623734305387, "grad_norm": 2.5055863857269287, "learning_rate": 0.00029345584767191685, "loss": 0.6403, "step": 70 }, { "epoch": 0.12474686107735926, "grad_norm": 2.1047961711883545, "learning_rate": 0.0002917655810607161, "loss": 0.6331, "step": 77 }, { "epoch": 0.12636695018226002, "eval_loss": 0.6662291288375854, "eval_runtime": 71.288, "eval_samples_per_second": 7.014, "eval_steps_per_second": 0.884, "step": 78 }, { "epoch": 0.13608748481166463, "grad_norm": 1.7916154861450195, "learning_rate": 0.0002898874066642667, "loss": 0.6259, "step": 84 }, { "epoch": 0.14742810854597002, "grad_norm": 2.046363592147827, "learning_rate": 0.00028782381396971003, "loss": 0.6413, "step": 91 }, { "epoch": 0.14742810854597002, "eval_loss": 0.6609434485435486, "eval_runtime": 12.131, "eval_samples_per_second": 41.217, "eval_steps_per_second": 5.193, "step": 91 }, { "epoch": 0.1587687322802754, "grad_norm": 1.9861701726913452, "learning_rate": 0.00028557753823288173, "loss": 0.631, "step": 98 }, { "epoch": 0.16848926690968002, "eval_loss": 0.6538010239601135, "eval_runtime": 80.9566, "eval_samples_per_second": 6.176, "eval_steps_per_second": 0.778, "step": 104 }, { "epoch": 0.1701093560145808, "grad_norm": 1.846071720123291, "learning_rate": 0.0002831515568527781, "loss": 0.6475, "step": 105 }, { "epoch": 0.1814499797488862, "grad_norm": 2.3317582607269287, "learning_rate": 0.00028054908542506627, "loss": 0.6115, "step": 112 }, { "epoch": 0.18955042527339003, "eval_loss": 0.6490179896354675, "eval_runtime": 82.3645, "eval_samples_per_second": 6.071, "eval_steps_per_second": 0.765, "step": 117 }, { "epoch": 0.19279060348319157, "grad_norm": 1.7408918142318726, "learning_rate": 0.00027777357347986823, "loss": 0.6253, "step": 119 }, { "epoch": 0.20413122721749696, "grad_norm": 1.6225473880767822, "learning_rate": 0.00027482869990946986, "loss": 0.6097, "step": 126 }, { "epoch": 0.21061158363710003, "eval_loss": 0.6458165049552917, "eval_runtime": 12.1355, "eval_samples_per_second": 41.201, "eval_steps_per_second": 5.191, "step": 130 }, { "epoch": 0.21547185095180235, "grad_norm": 2.08821964263916, "learning_rate": 0.0002717183680920135, "loss": 0.6016, "step": 133 }, { "epoch": 0.22681247468610774, "grad_norm": 2.0556275844573975, "learning_rate": 0.00026844670071763906, "loss": 0.6052, "step": 140 }, { "epoch": 0.23167274200081003, "eval_loss": 0.6532984375953674, "eval_runtime": 80.8235, "eval_samples_per_second": 6.186, "eval_steps_per_second": 0.779, "step": 143 }, { "epoch": 0.23815309842041313, "grad_norm": 2.4191389083862305, "learning_rate": 0.00026501803432393037, "loss": 0.6318, "step": 147 }, { "epoch": 0.24949372215471852, "grad_norm": 2.222801923751831, "learning_rate": 0.00026143691354791145, "loss": 0.6131, "step": 154 }, { "epoch": 0.25273390036452004, "eval_loss": 0.6334775686264038, "eval_runtime": 82.9349, "eval_samples_per_second": 6.029, "eval_steps_per_second": 0.76, "step": 156 }, { "epoch": 0.2608343458890239, "grad_norm": 1.741701364517212, "learning_rate": 0.00025770808510220956, "loss": 0.6015, "step": 161 }, { "epoch": 0.27217496962332927, "grad_norm": 1.7539088726043701, "learning_rate": 0.00025383649148337105, "loss": 0.6012, "step": 168 }, { "epoch": 0.27379505872823007, "eval_loss": 0.6165497303009033, "eval_runtime": 12.1274, "eval_samples_per_second": 41.229, "eval_steps_per_second": 5.195, "step": 169 }, { "epoch": 0.28351559335763465, "grad_norm": 2.0264108180999756, "learning_rate": 0.0002498272644206695, "loss": 0.5974, "step": 175 }, { "epoch": 0.29485621709194004, "grad_norm": 1.7606844902038574, "learning_rate": 0.0002456857180740884, "loss": 0.6002, "step": 182 }, { "epoch": 0.29485621709194004, "eval_loss": 0.620435893535614, "eval_runtime": 85.68, "eval_samples_per_second": 5.836, "eval_steps_per_second": 0.735, "step": 182 }, { "epoch": 0.30619684082624543, "grad_norm": 1.691292643547058, "learning_rate": 0.0002414173419904956, "loss": 0.5849, "step": 189 }, { "epoch": 0.3159173754556501, "eval_loss": 0.6220102906227112, "eval_runtime": 82.9019, "eval_samples_per_second": 6.031, "eval_steps_per_second": 0.76, "step": 195 }, { "epoch": 0.3175374645605508, "grad_norm": 1.944412350654602, "learning_rate": 0.00023702779382734566, "loss": 0.5901, "step": 196 }, { "epoch": 0.3288780882948562, "grad_norm": 1.9867457151412964, "learning_rate": 0.0002325228918535541, "loss": 0.5689, "step": 203 }, { "epoch": 0.33697853381936005, "eval_loss": 0.6159170866012573, "eval_runtime": 12.1326, "eval_samples_per_second": 41.211, "eval_steps_per_second": 5.193, "step": 208 }, { "epoch": 0.3402187120291616, "grad_norm": 1.955578327178955, "learning_rate": 0.00022790860723748442, "loss": 0.5924, "step": 210 }, { "epoch": 0.351559335763467, "grad_norm": 1.8735487461090088, "learning_rate": 0.00022319105613226921, "loss": 0.5781, "step": 217 }, { "epoch": 0.3580396921830701, "eval_loss": 0.6110228300094604, "eval_runtime": 81.2619, "eval_samples_per_second": 6.153, "eval_steps_per_second": 0.775, "step": 221 }, { "epoch": 0.3628999594977724, "grad_norm": 2.644158363342285, "learning_rate": 0.00021837649156895706, "loss": 0.5891, "step": 224 }, { "epoch": 0.37424058323207776, "grad_norm": 1.511711597442627, "learning_rate": 0.00021347129516822945, "loss": 0.5765, "step": 231 }, { "epoch": 0.37910085054678005, "eval_loss": 0.6027255654335022, "eval_runtime": 82.4543, "eval_samples_per_second": 6.064, "eval_steps_per_second": 0.764, "step": 234 }, { "epoch": 0.38558120696638315, "grad_norm": 1.8768197298049927, "learning_rate": 0.00020848196868167505, "loss": 0.5932, "step": 238 }, { "epoch": 0.39692183070068854, "grad_norm": 1.6268372535705566, "learning_rate": 0.000203415125373832, "loss": 0.5899, "step": 245 }, { "epoch": 0.4001620089104901, "eval_loss": 0.5983073711395264, "eval_runtime": 12.1671, "eval_samples_per_second": 41.094, "eval_steps_per_second": 5.178, "step": 247 }, { "epoch": 0.4082624544349939, "grad_norm": 2.1390023231506348, "learning_rate": 0.00019827748125642242, "loss": 0.5816, "step": 252 }, { "epoch": 0.4196030781692993, "grad_norm": 1.6386767625808716, "learning_rate": 0.0001930758461863965, "loss": 0.5638, "step": 259 }, { "epoch": 0.42122316727420006, "eval_loss": 0.5904666781425476, "eval_runtime": 81.2985, "eval_samples_per_second": 6.15, "eval_steps_per_second": 0.775, "step": 260 }, { "epoch": 0.4309437019036047, "grad_norm": 1.5721111297607422, "learning_rate": 0.0001878171148395872, "loss": 0.5694, "step": 266 }, { "epoch": 0.4422843256379101, "grad_norm": 1.6830506324768066, "learning_rate": 0.00018250825757193848, "loss": 0.5716, "step": 273 }, { "epoch": 0.4422843256379101, "eval_loss": 0.5874103903770447, "eval_runtime": 81.2386, "eval_samples_per_second": 6.155, "eval_steps_per_second": 0.775, "step": 273 }, { "epoch": 0.4536249493722155, "grad_norm": 1.5602617263793945, "learning_rate": 0.0001771563111804211, "loss": 0.5729, "step": 280 }, { "epoch": 0.46334548400162007, "eval_loss": 0.5809030532836914, "eval_runtime": 12.1549, "eval_samples_per_second": 41.136, "eval_steps_per_second": 5.183, "step": 286 }, { "epoch": 0.46496557310652087, "grad_norm": 1.4982717037200928, "learning_rate": 0.0001717683695758819, "loss": 0.5542, "step": 287 }, { "epoch": 0.47630619684082626, "grad_norm": 1.5617115497589111, "learning_rate": 0.00016635157438018983, "loss": 0.5691, "step": 294 }, { "epoch": 0.4844066423653301, "eval_loss": 0.5728867650032043, "eval_runtime": 89.8515, "eval_samples_per_second": 5.565, "eval_steps_per_second": 0.701, "step": 299 }, { "epoch": 0.48764682057513165, "grad_norm": 1.7522519826889038, "learning_rate": 0.0001609131054601416, "loss": 0.5704, "step": 301 }, { "epoch": 0.49898744430943703, "grad_norm": 1.4569737911224365, "learning_rate": 0.00015546017141067432, "loss": 0.5441, "step": 308 }, { "epoch": 0.5054678007290401, "eval_loss": 0.5658866167068481, "eval_runtime": 100.9717, "eval_samples_per_second": 4.952, "eval_steps_per_second": 0.624, "step": 312 }, { "epoch": 0.5103280680437424, "grad_norm": 1.8204305171966553, "learning_rate": 0.00015, "loss": 0.5583, "step": 315 }, { "epoch": 0.5216686917780478, "grad_norm": 1.7424700260162354, "learning_rate": 0.0001445398285893257, "loss": 0.5468, "step": 322 }, { "epoch": 0.5265289590927501, "eval_loss": 0.5584321022033691, "eval_runtime": 12.166, "eval_samples_per_second": 41.098, "eval_steps_per_second": 5.178, "step": 325 }, { "epoch": 0.5330093155123532, "grad_norm": 1.5261849164962769, "learning_rate": 0.0001390868945398584, "loss": 0.547, "step": 329 }, { "epoch": 0.5443499392466585, "grad_norm": 2.1022207736968994, "learning_rate": 0.00013364842561981014, "loss": 0.536, "step": 336 }, { "epoch": 0.5475901174564601, "eval_loss": 0.554373025894165, "eval_runtime": 80.7911, "eval_samples_per_second": 6.189, "eval_steps_per_second": 0.78, "step": 338 }, { "epoch": 0.555690562980964, "grad_norm": 1.375921607017517, "learning_rate": 0.00012823163042411807, "loss": 0.5271, "step": 343 }, { "epoch": 0.5670311867152693, "grad_norm": 1.1903116703033447, "learning_rate": 0.0001228436888195789, "loss": 0.5277, "step": 350 }, { "epoch": 0.56865127582017, "eval_loss": 0.5474019050598145, "eval_runtime": 91.4838, "eval_samples_per_second": 5.465, "eval_steps_per_second": 0.689, "step": 351 }, { "epoch": 0.5783718104495748, "grad_norm": 1.5861282348632812, "learning_rate": 0.00011749174242806152, "loss": 0.5373, "step": 357 }, { "epoch": 0.5897124341838801, "grad_norm": 1.3687732219696045, "learning_rate": 0.00011218288516041279, "loss": 0.5052, "step": 364 }, { "epoch": 0.5897124341838801, "eval_loss": 0.5396790504455566, "eval_runtime": 12.1166, "eval_samples_per_second": 41.266, "eval_steps_per_second": 5.199, "step": 364 }, { "epoch": 0.6010530579181855, "grad_norm": 1.3852468729019165, "learning_rate": 0.00010692415381360349, "loss": 0.5185, "step": 371 }, { "epoch": 0.6107735925475901, "eval_loss": 0.5308696031570435, "eval_runtime": 82.1061, "eval_samples_per_second": 6.09, "eval_steps_per_second": 0.767, "step": 377 }, { "epoch": 0.6123936816524909, "grad_norm": 1.3524726629257202, "learning_rate": 0.00010172251874357757, "loss": 0.5112, "step": 378 }, { "epoch": 0.6237343053867963, "grad_norm": 1.8277941942214966, "learning_rate": 9.658487462616794e-05, "loss": 0.5161, "step": 385 }, { "epoch": 0.6318347509113001, "eval_loss": 0.5261584520339966, "eval_runtime": 103.2902, "eval_samples_per_second": 4.841, "eval_steps_per_second": 0.61, "step": 390 }, { "epoch": 0.6350749291211016, "grad_norm": 1.4200196266174316, "learning_rate": 9.151803131832493e-05, "loss": 0.5237, "step": 392 }, { "epoch": 0.6464155528554071, "grad_norm": 1.3601250648498535, "learning_rate": 8.652870483177049e-05, "loss": 0.5056, "step": 399 }, { "epoch": 0.6528959092750102, "eval_loss": 0.5227454304695129, "eval_runtime": 12.1817, "eval_samples_per_second": 41.045, "eval_steps_per_second": 5.172, "step": 403 }, { "epoch": 0.6577561765897124, "grad_norm": 1.8182213306427002, "learning_rate": 8.162350843104291e-05, "loss": 0.5122, "step": 406 }, { "epoch": 0.6690968003240179, "grad_norm": 1.909848928451538, "learning_rate": 7.680894386773072e-05, "loss": 0.5091, "step": 413 }, { "epoch": 0.6739570676387201, "eval_loss": 0.5164161324501038, "eval_runtime": 82.8439, "eval_samples_per_second": 6.035, "eval_steps_per_second": 0.76, "step": 416 }, { "epoch": 0.6804374240583232, "grad_norm": 1.4879406690597534, "learning_rate": 7.209139276251558e-05, "loss": 0.5023, "step": 420 }, { "epoch": 0.6917780477926286, "grad_norm": 1.6504310369491577, "learning_rate": 6.747710814644589e-05, "loss": 0.492, "step": 427 }, { "epoch": 0.6950182260024301, "eval_loss": 0.511544942855835, "eval_runtime": 83.9119, "eval_samples_per_second": 5.959, "eval_steps_per_second": 0.751, "step": 429 }, { "epoch": 0.703118671526934, "grad_norm": 1.7625768184661865, "learning_rate": 6.297220617265435e-05, "loss": 0.484, "step": 434 }, { "epoch": 0.7144592952612394, "grad_norm": 1.3635530471801758, "learning_rate": 5.858265800950438e-05, "loss": 0.4936, "step": 441 }, { "epoch": 0.7160793843661402, "eval_loss": 0.506995677947998, "eval_runtime": 12.1546, "eval_samples_per_second": 41.137, "eval_steps_per_second": 5.183, "step": 442 }, { "epoch": 0.7257999189955447, "grad_norm": 1.4940941333770752, "learning_rate": 5.4314281925911634e-05, "loss": 0.4918, "step": 448 }, { "epoch": 0.7371405427298502, "grad_norm": 1.3953615427017212, "learning_rate": 5.0172735579330526e-05, "loss": 0.4818, "step": 455 }, { "epoch": 0.7371405427298502, "eval_loss": 0.5005263686180115, "eval_runtime": 82.2428, "eval_samples_per_second": 6.08, "eval_steps_per_second": 0.766, "step": 455 }, { "epoch": 0.7484811664641555, "grad_norm": 1.418354868888855, "learning_rate": 4.616350851662895e-05, "loss": 0.4762, "step": 462 }, { "epoch": 0.7582017010935601, "eval_loss": 0.49856895208358765, "eval_runtime": 82.2573, "eval_samples_per_second": 6.078, "eval_steps_per_second": 0.766, "step": 468 }, { "epoch": 0.759821790198461, "grad_norm": 1.4406681060791016, "learning_rate": 4.229191489779047e-05, "loss": 0.4979, "step": 469 }, { "epoch": 0.7711624139327663, "grad_norm": 1.495307445526123, "learning_rate": 3.8563086452088506e-05, "loss": 0.4685, "step": 476 }, { "epoch": 0.7792628594572701, "eval_loss": 0.4937572777271271, "eval_runtime": 12.1882, "eval_samples_per_second": 41.023, "eval_steps_per_second": 5.169, "step": 481 }, { "epoch": 0.7825030376670717, "grad_norm": 1.4766854047775269, "learning_rate": 3.498196567606959e-05, "loss": 0.485, "step": 483 }, { "epoch": 0.7938436614013771, "grad_norm": 1.3768912553787231, "learning_rate": 3.1553299282360966e-05, "loss": 0.4614, "step": 490 }, { "epoch": 0.8003240178209802, "eval_loss": 0.49044370651245117, "eval_runtime": 82.6246, "eval_samples_per_second": 6.051, "eval_steps_per_second": 0.762, "step": 494 }, { "epoch": 0.8051842851356824, "grad_norm": 1.2158797979354858, "learning_rate": 2.828163190798644e-05, "loss": 0.4779, "step": 497 }, { "epoch": 0.8165249088699879, "grad_norm": 1.4826974868774414, "learning_rate": 2.5171300090530106e-05, "loss": 0.4942, "step": 504 }, { "epoch": 0.8213851761846902, "eval_loss": 0.4870362877845764, "eval_runtime": 97.2746, "eval_samples_per_second": 5.14, "eval_steps_per_second": 0.648, "step": 507 }, { "epoch": 0.8278655326042932, "grad_norm": 1.782825231552124, "learning_rate": 2.2226426520131734e-05, "loss": 0.4714, "step": 511 }, { "epoch": 0.8392061563385986, "grad_norm": 1.2692053318023682, "learning_rate": 1.9450914574933725e-05, "loss": 0.4767, "step": 518 }, { "epoch": 0.8424463345484001, "eval_loss": 0.48374512791633606, "eval_runtime": 12.151, "eval_samples_per_second": 41.149, "eval_steps_per_second": 5.185, "step": 520 }, { "epoch": 0.850546780072904, "grad_norm": 1.1831132173538208, "learning_rate": 1.6848443147221828e-05, "loss": 0.4555, "step": 525 }, { "epoch": 0.8618874038072094, "grad_norm": 1.2183641195297241, "learning_rate": 1.4422461767118233e-05, "loss": 0.4589, "step": 532 }, { "epoch": 0.8635074929121102, "eval_loss": 0.48187872767448425, "eval_runtime": 85.4092, "eval_samples_per_second": 5.854, "eval_steps_per_second": 0.738, "step": 533 }, { "epoch": 0.8732280275415147, "grad_norm": 1.3544665575027466, "learning_rate": 1.2176186030289936e-05, "loss": 0.4624, "step": 539 }, { "epoch": 0.8845686512758202, "grad_norm": 1.4471445083618164, "learning_rate": 1.011259333573326e-05, "loss": 0.4806, "step": 546 }, { "epoch": 0.8845686512758202, "eval_loss": 0.4796264171600342, "eval_runtime": 87.2314, "eval_samples_per_second": 5.732, "eval_steps_per_second": 0.722, "step": 546 }, { "epoch": 0.8959092750101255, "grad_norm": 1.3122518062591553, "learning_rate": 8.234418939283866e-06, "loss": 0.4647, "step": 553 }, { "epoch": 0.9056298096395302, "eval_loss": 0.4782348573207855, "eval_runtime": 12.1368, "eval_samples_per_second": 41.197, "eval_steps_per_second": 5.191, "step": 559 }, { "epoch": 0.907249898744431, "grad_norm": 1.2790122032165527, "learning_rate": 6.544152328083152e-06, "loss": 0.4736, "step": 560 }, { "epoch": 0.9185905224787363, "grad_norm": 1.3850630521774292, "learning_rate": 5.044033920806933e-06, "loss": 0.461, "step": 567 }, { "epoch": 0.9266909680032401, "eval_loss": 0.4772534668445587, "eval_runtime": 55.6609, "eval_samples_per_second": 8.983, "eval_steps_per_second": 1.132, "step": 572 }, { "epoch": 0.9299311462130417, "grad_norm": 1.3517329692840576, "learning_rate": 3.7360520980297514e-06, "loss": 0.4651, "step": 574 }, { "epoch": 0.9412717699473471, "grad_norm": 1.5328634977340698, "learning_rate": 2.6219405666614402e-06, "loss": 0.4718, "step": 581 }, { "epoch": 0.9477521263669502, "eval_loss": 0.476561576128006, "eval_runtime": 52.8481, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.192, "step": 585 }, { "epoch": 0.9526123936816525, "grad_norm": 1.2334227561950684, "learning_rate": 1.7031760619491353e-06, "loss": 0.4506, "step": 588 }, { "epoch": 0.9639530174159578, "grad_norm": 1.2170372009277344, "learning_rate": 9.809763900905875e-07, "loss": 0.4684, "step": 595 }, { "epoch": 0.9688132847306602, "eval_loss": 0.47612592577934265, "eval_runtime": 12.1852, "eval_samples_per_second": 41.033, "eval_steps_per_second": 5.17, "step": 598 }, { "epoch": 0.9752936411502633, "grad_norm": 1.179705023765564, "learning_rate": 4.562988140535073e-07, "loss": 0.4611, "step": 602 }, { "epoch": 0.9866342648845686, "grad_norm": 1.2852600812911987, "learning_rate": 1.298387847403437e-07, "loss": 0.4716, "step": 609 }, { "epoch": 0.9898744430943702, "eval_loss": 0.4760262072086334, "eval_runtime": 64.7528, "eval_samples_per_second": 7.722, "eval_steps_per_second": 0.973, "step": 611 } ], "logging_steps": 7, "max_steps": 617, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.394413366880502e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }