{ "best_metric": 0.18747980892658234, "best_model_checkpoint": "/home/datta0/models/lora_final/Qwen2-7B_metamath_reverse/checkpoint-13", "epoch": 0.9995949777237748, "eval_steps": 13, "global_step": 617, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016200891049007696, "grad_norm": 4.52362585067749, "learning_rate": 2.3076923076923076e-05, "loss": 0.2546, "step": 1 }, { "epoch": 0.011340623734305387, "grad_norm": 1.5261310338974, "learning_rate": 0.00016153846153846153, "loss": 0.1753, "step": 7 }, { "epoch": 0.021061158363710003, "eval_loss": 0.18747980892658234, "eval_runtime": 12.4138, "eval_samples_per_second": 40.278, "eval_steps_per_second": 5.075, "step": 13 }, { "epoch": 0.022681247468610773, "grad_norm": 2.1798665523529053, "learning_rate": 0.0002999979709808197, "loss": 0.1715, "step": 14 }, { "epoch": 0.03402187120291616, "grad_norm": 2.5627083778381348, "learning_rate": 0.0002998701612152596, "loss": 0.2036, "step": 21 }, { "epoch": 0.042122316727420006, "eval_loss": 0.2523616552352905, "eval_runtime": 83.5071, "eval_samples_per_second": 5.988, "eval_steps_per_second": 0.754, "step": 26 }, { "epoch": 0.04536249493722155, "grad_norm": 2.336559772491455, "learning_rate": 0.0002995437011859465, "loss": 0.2407, "step": 28 }, { "epoch": 0.056703118671526935, "grad_norm": 2.8058784008026123, "learning_rate": 0.00029901902360990936, "loss": 0.2585, "step": 35 }, { "epoch": 0.06318347509113001, "eval_loss": 0.2876286506652832, "eval_runtime": 74.7685, "eval_samples_per_second": 6.687, "eval_steps_per_second": 0.843, "step": 39 }, { "epoch": 0.06804374240583232, "grad_norm": 2.3905534744262695, "learning_rate": 0.00029829682393805085, "loss": 0.2638, "step": 42 }, { "epoch": 0.0793843661401377, "grad_norm": 2.4771692752838135, "learning_rate": 0.0002973780594333385, "loss": 0.2848, "step": 49 }, { "epoch": 0.08424463345484001, "eval_loss": 0.31458961963653564, "eval_runtime": 12.4219, "eval_samples_per_second": 40.252, "eval_steps_per_second": 5.072, "step": 52 }, { "epoch": 0.0907249898744431, "grad_norm": 2.488586902618408, "learning_rate": 0.00029626394790197025, "loss": 0.3027, "step": 56 }, { "epoch": 0.10206561360874848, "grad_norm": 2.716110944747925, "learning_rate": 0.00029495596607919305, "loss": 0.2997, "step": 63 }, { "epoch": 0.10530579181855002, "eval_loss": 0.3230917453765869, "eval_runtime": 88.9179, "eval_samples_per_second": 5.623, "eval_steps_per_second": 0.709, "step": 65 }, { "epoch": 0.11340623734305387, "grad_norm": 2.5921218395233154, "learning_rate": 0.00029345584767191685, "loss": 0.3052, "step": 70 }, { "epoch": 0.12474686107735926, "grad_norm": 2.5036072731018066, "learning_rate": 0.0002917655810607161, "loss": 0.3196, "step": 77 }, { "epoch": 0.12636695018226002, "eval_loss": 0.3350300192832947, "eval_runtime": 89.9537, "eval_samples_per_second": 5.558, "eval_steps_per_second": 0.7, "step": 78 }, { "epoch": 0.13608748481166463, "grad_norm": 2.4138002395629883, "learning_rate": 0.0002898874066642667, "loss": 0.3225, "step": 84 }, { "epoch": 0.14742810854597002, "grad_norm": 2.376221179962158, "learning_rate": 0.00028782381396971003, "loss": 0.3263, "step": 91 }, { "epoch": 0.14742810854597002, "eval_loss": 0.3406391739845276, "eval_runtime": 12.3974, "eval_samples_per_second": 40.331, "eval_steps_per_second": 5.082, "step": 91 }, { "epoch": 0.1587687322802754, "grad_norm": 2.2181365489959717, "learning_rate": 0.00028557753823288173, "loss": 0.3148, "step": 98 }, { "epoch": 
0.16848926690968002, "eval_loss": 0.3401181101799011, "eval_runtime": 90.3528, "eval_samples_per_second": 5.534, "eval_steps_per_second": 0.697, "step": 104 }, { "epoch": 0.1701093560145808, "grad_norm": 2.313964366912842, "learning_rate": 0.0002831515568527781, "loss": 0.3245, "step": 105 }, { "epoch": 0.1814499797488862, "grad_norm": 2.2569937705993652, "learning_rate": 0.00028054908542506627, "loss": 0.3297, "step": 112 }, { "epoch": 0.18955042527339003, "eval_loss": 0.3456376791000366, "eval_runtime": 88.1843, "eval_samples_per_second": 5.67, "eval_steps_per_second": 0.714, "step": 117 }, { "epoch": 0.19279060348319157, "grad_norm": 2.462419033050537, "learning_rate": 0.00027777357347986823, "loss": 0.3189, "step": 119 }, { "epoch": 0.20413122721749696, "grad_norm": 2.143909215927124, "learning_rate": 0.00027482869990946986, "loss": 0.3221, "step": 126 }, { "epoch": 0.21061158363710003, "eval_loss": 0.3476516604423523, "eval_runtime": 12.3771, "eval_samples_per_second": 40.397, "eval_steps_per_second": 5.09, "step": 130 }, { "epoch": 0.21547185095180235, "grad_norm": 1.9863917827606201, "learning_rate": 0.0002717183680920135, "loss": 0.3263, "step": 133 }, { "epoch": 0.22681247468610774, "grad_norm": 2.3500940799713135, "learning_rate": 0.00026844670071763906, "loss": 0.3359, "step": 140 }, { "epoch": 0.23167274200081003, "eval_loss": 0.3490840792655945, "eval_runtime": 92.3634, "eval_samples_per_second": 5.413, "eval_steps_per_second": 0.682, "step": 143 }, { "epoch": 0.23815309842041313, "grad_norm": 2.0207138061523438, "learning_rate": 0.00026501803432393037, "loss": 0.327, "step": 147 }, { "epoch": 0.24949372215471852, "grad_norm": 2.1643483638763428, "learning_rate": 0.00026143691354791145, "loss": 0.3296, "step": 154 }, { "epoch": 0.25273390036452004, "eval_loss": 0.3398853838443756, "eval_runtime": 91.6137, "eval_samples_per_second": 5.458, "eval_steps_per_second": 0.688, "step": 156 }, { "epoch": 0.2608343458890239, "grad_norm": 2.1574275493621826, "learning_rate": 0.00025770808510220956, "loss": 0.3279, "step": 161 }, { "epoch": 0.27217496962332927, "grad_norm": 2.1672723293304443, "learning_rate": 0.00025383649148337105, "loss": 0.3361, "step": 168 }, { "epoch": 0.27379505872823007, "eval_loss": 0.34163641929626465, "eval_runtime": 12.4303, "eval_samples_per_second": 40.224, "eval_steps_per_second": 5.068, "step": 169 }, { "epoch": 0.28351559335763465, "grad_norm": 2.2144994735717773, "learning_rate": 0.0002498272644206695, "loss": 0.3341, "step": 175 }, { "epoch": 0.29485621709194004, "grad_norm": 2.0570850372314453, "learning_rate": 0.0002456857180740884, "loss": 0.3187, "step": 182 }, { "epoch": 0.29485621709194004, "eval_loss": 0.3376466631889343, "eval_runtime": 74.564, "eval_samples_per_second": 6.706, "eval_steps_per_second": 0.845, "step": 182 }, { "epoch": 0.30619684082624543, "grad_norm": 2.23764967918396, "learning_rate": 0.0002414173419904956, "loss": 0.3285, "step": 189 }, { "epoch": 0.3159173754556501, "eval_loss": 0.33701062202453613, "eval_runtime": 85.9424, "eval_samples_per_second": 5.818, "eval_steps_per_second": 0.733, "step": 195 }, { "epoch": 0.3175374645605508, "grad_norm": 1.9056419134140015, "learning_rate": 0.00023702779382734566, "loss": 0.3197, "step": 196 }, { "epoch": 0.3288780882948562, "grad_norm": 2.03104305267334, "learning_rate": 0.0002325228918535541, "loss": 0.3189, "step": 203 }, { "epoch": 0.33697853381936005, "eval_loss": 0.33059319853782654, "eval_runtime": 12.4539, "eval_samples_per_second": 40.148, "eval_steps_per_second": 5.059, 
"step": 208 }, { "epoch": 0.3402187120291616, "grad_norm": 2.170870780944824, "learning_rate": 0.00022790860723748442, "loss": 0.3232, "step": 210 }, { "epoch": 0.351559335763467, "grad_norm": 1.942113995552063, "learning_rate": 0.00022319105613226921, "loss": 0.3154, "step": 217 }, { "epoch": 0.3580396921830701, "eval_loss": 0.32931268215179443, "eval_runtime": 86.8422, "eval_samples_per_second": 5.758, "eval_steps_per_second": 0.725, "step": 221 }, { "epoch": 0.3628999594977724, "grad_norm": 1.8157175779342651, "learning_rate": 0.00021837649156895706, "loss": 0.3205, "step": 224 }, { "epoch": 0.37424058323207776, "grad_norm": 2.035099983215332, "learning_rate": 0.00021347129516822945, "loss": 0.3149, "step": 231 }, { "epoch": 0.37910085054678005, "eval_loss": 0.3263063430786133, "eval_runtime": 78.1253, "eval_samples_per_second": 6.4, "eval_steps_per_second": 0.806, "step": 234 }, { "epoch": 0.38558120696638315, "grad_norm": 2.07989764213562, "learning_rate": 0.00020848196868167505, "loss": 0.3115, "step": 238 }, { "epoch": 0.39692183070068854, "grad_norm": 1.8494740724563599, "learning_rate": 0.000203415125373832, "loss": 0.3099, "step": 245 }, { "epoch": 0.4001620089104901, "eval_loss": 0.32076019048690796, "eval_runtime": 12.4446, "eval_samples_per_second": 40.178, "eval_steps_per_second": 5.062, "step": 247 }, { "epoch": 0.4082624544349939, "grad_norm": 2.0092668533325195, "learning_rate": 0.00019827748125642242, "loss": 0.3131, "step": 252 }, { "epoch": 0.4196030781692993, "grad_norm": 2.0937247276306152, "learning_rate": 0.0001930758461863965, "loss": 0.3089, "step": 259 }, { "epoch": 0.42122316727420006, "eval_loss": 0.3143251836299896, "eval_runtime": 77.5222, "eval_samples_per_second": 6.45, "eval_steps_per_second": 0.813, "step": 260 }, { "epoch": 0.4309437019036047, "grad_norm": 2.0988640785217285, "learning_rate": 0.0001878171148395872, "loss": 0.3124, "step": 266 }, { "epoch": 0.4422843256379101, "grad_norm": 2.4692976474761963, "learning_rate": 0.00018250825757193848, "loss": 0.3125, "step": 273 }, { "epoch": 0.4422843256379101, "eval_loss": 0.31036385893821716, "eval_runtime": 84.8095, "eval_samples_per_second": 5.896, "eval_steps_per_second": 0.743, "step": 273 }, { "epoch": 0.4536249493722155, "grad_norm": 1.8889740705490112, "learning_rate": 0.0001771563111804211, "loss": 0.2959, "step": 280 }, { "epoch": 0.46334548400162007, "eval_loss": 0.30606991052627563, "eval_runtime": 12.4497, "eval_samples_per_second": 40.162, "eval_steps_per_second": 5.06, "step": 286 }, { "epoch": 0.46496557310652087, "grad_norm": 2.023824691772461, "learning_rate": 0.0001717683695758819, "loss": 0.299, "step": 287 }, { "epoch": 0.47630619684082626, "grad_norm": 2.0295095443725586, "learning_rate": 0.00016635157438018983, "loss": 0.3042, "step": 294 }, { "epoch": 0.4844066423653301, "eval_loss": 0.2992589473724365, "eval_runtime": 75.8225, "eval_samples_per_second": 6.594, "eval_steps_per_second": 0.831, "step": 299 }, { "epoch": 0.48764682057513165, "grad_norm": 1.946453332901001, "learning_rate": 0.0001609131054601416, "loss": 0.294, "step": 301 }, { "epoch": 0.49898744430943703, "grad_norm": 1.9718040227890015, "learning_rate": 0.00015546017141067432, "loss": 0.2829, "step": 308 }, { "epoch": 0.5054678007290401, "eval_loss": 0.294008731842041, "eval_runtime": 82.5163, "eval_samples_per_second": 6.059, "eval_steps_per_second": 0.763, "step": 312 }, { "epoch": 0.5103280680437424, "grad_norm": 1.8706125020980835, "learning_rate": 0.00015, "loss": 0.2802, "step": 315 }, { "epoch": 
0.5216686917780478, "grad_norm": 1.961658000946045, "learning_rate": 0.0001445398285893257, "loss": 0.2832, "step": 322 }, { "epoch": 0.5265289590927501, "eval_loss": 0.2878178656101227, "eval_runtime": 12.4298, "eval_samples_per_second": 40.226, "eval_steps_per_second": 5.068, "step": 325 }, { "epoch": 0.5330093155123532, "grad_norm": 1.774943232536316, "learning_rate": 0.0001390868945398584, "loss": 0.2763, "step": 329 }, { "epoch": 0.5443499392466585, "grad_norm": 2.2113044261932373, "learning_rate": 0.00013364842561981014, "loss": 0.2715, "step": 336 }, { "epoch": 0.5475901174564601, "eval_loss": 0.2820768654346466, "eval_runtime": 85.9045, "eval_samples_per_second": 5.82, "eval_steps_per_second": 0.733, "step": 338 }, { "epoch": 0.555690562980964, "grad_norm": 1.9216161966323853, "learning_rate": 0.00012823163042411807, "loss": 0.2706, "step": 343 }, { "epoch": 0.5670311867152693, "grad_norm": 2.2038073539733887, "learning_rate": 0.0001228436888195789, "loss": 0.2702, "step": 350 }, { "epoch": 0.56865127582017, "eval_loss": 0.27528542280197144, "eval_runtime": 76.8796, "eval_samples_per_second": 6.504, "eval_steps_per_second": 0.819, "step": 351 }, { "epoch": 0.5783718104495748, "grad_norm": 2.00290846824646, "learning_rate": 0.00011749174242806152, "loss": 0.2704, "step": 357 }, { "epoch": 0.5897124341838801, "grad_norm": 1.8399847745895386, "learning_rate": 0.00011218288516041279, "loss": 0.2687, "step": 364 }, { "epoch": 0.5897124341838801, "eval_loss": 0.2687266170978546, "eval_runtime": 12.4501, "eval_samples_per_second": 40.16, "eval_steps_per_second": 5.06, "step": 364 }, { "epoch": 0.6010530579181855, "grad_norm": 1.754230260848999, "learning_rate": 0.00010692415381360349, "loss": 0.2604, "step": 371 }, { "epoch": 0.6107735925475901, "eval_loss": 0.26285678148269653, "eval_runtime": 89.7111, "eval_samples_per_second": 5.573, "eval_steps_per_second": 0.702, "step": 377 }, { "epoch": 0.6123936816524909, "grad_norm": 2.0383028984069824, "learning_rate": 0.00010172251874357757, "loss": 0.2565, "step": 378 }, { "epoch": 0.6237343053867963, "grad_norm": 1.9088571071624756, "learning_rate": 9.658487462616794e-05, "loss": 0.252, "step": 385 }, { "epoch": 0.6318347509113001, "eval_loss": 0.25791773200035095, "eval_runtime": 88.6204, "eval_samples_per_second": 5.642, "eval_steps_per_second": 0.711, "step": 390 }, { "epoch": 0.6350749291211016, "grad_norm": 1.8957221508026123, "learning_rate": 9.151803131832493e-05, "loss": 0.2709, "step": 392 }, { "epoch": 0.6464155528554071, "grad_norm": 1.6993653774261475, "learning_rate": 8.652870483177049e-05, "loss": 0.2537, "step": 399 }, { "epoch": 0.6528959092750102, "eval_loss": 0.2528708577156067, "eval_runtime": 12.4911, "eval_samples_per_second": 40.028, "eval_steps_per_second": 5.044, "step": 403 }, { "epoch": 0.6577561765897124, "grad_norm": 1.844677209854126, "learning_rate": 8.162350843104291e-05, "loss": 0.2468, "step": 406 }, { "epoch": 0.6690968003240179, "grad_norm": 1.943462610244751, "learning_rate": 7.680894386773072e-05, "loss": 0.2535, "step": 413 }, { "epoch": 0.6739570676387201, "eval_loss": 0.24768707156181335, "eval_runtime": 89.239, "eval_samples_per_second": 5.603, "eval_steps_per_second": 0.706, "step": 416 }, { "epoch": 0.6804374240583232, "grad_norm": 1.9346702098846436, "learning_rate": 7.209139276251558e-05, "loss": 0.2472, "step": 420 }, { "epoch": 0.6917780477926286, "grad_norm": 1.684934377670288, "learning_rate": 6.747710814644589e-05, "loss": 0.2442, "step": 427 }, { "epoch": 0.6950182260024301, "eval_loss": 
0.24250428378582, "eval_runtime": 90.1456, "eval_samples_per_second": 5.547, "eval_steps_per_second": 0.699, "step": 429 }, { "epoch": 0.703118671526934, "grad_norm": 1.652444839477539, "learning_rate": 6.297220617265435e-05, "loss": 0.233, "step": 434 }, { "epoch": 0.7144592952612394, "grad_norm": 1.749850869178772, "learning_rate": 5.858265800950438e-05, "loss": 0.2451, "step": 441 }, { "epoch": 0.7160793843661402, "eval_loss": 0.237756609916687, "eval_runtime": 12.447, "eval_samples_per_second": 40.17, "eval_steps_per_second": 5.061, "step": 442 }, { "epoch": 0.7257999189955447, "grad_norm": 1.607130527496338, "learning_rate": 5.4314281925911634e-05, "loss": 0.2291, "step": 448 }, { "epoch": 0.7371405427298502, "grad_norm": 1.7240676879882812, "learning_rate": 5.0172735579330526e-05, "loss": 0.2275, "step": 455 }, { "epoch": 0.7371405427298502, "eval_loss": 0.23375177383422852, "eval_runtime": 88.4413, "eval_samples_per_second": 5.653, "eval_steps_per_second": 0.712, "step": 455 }, { "epoch": 0.7484811664641555, "grad_norm": 1.821494698524475, "learning_rate": 4.616350851662895e-05, "loss": 0.2288, "step": 462 }, { "epoch": 0.7582017010935601, "eval_loss": 0.23097103834152222, "eval_runtime": 91.1616, "eval_samples_per_second": 5.485, "eval_steps_per_second": 0.691, "step": 468 }, { "epoch": 0.759821790198461, "grad_norm": 1.6781857013702393, "learning_rate": 4.229191489779047e-05, "loss": 0.2291, "step": 469 }, { "epoch": 0.7711624139327663, "grad_norm": 1.8011544942855835, "learning_rate": 3.8563086452088506e-05, "loss": 0.2323, "step": 476 }, { "epoch": 0.7792628594572701, "eval_loss": 0.22944015264511108, "eval_runtime": 12.4573, "eval_samples_per_second": 40.137, "eval_steps_per_second": 5.057, "step": 481 }, { "epoch": 0.7825030376670717, "grad_norm": 1.6624199151992798, "learning_rate": 3.498196567606959e-05, "loss": 0.2238, "step": 483 }, { "epoch": 0.7938436614013771, "grad_norm": 1.7592713832855225, "learning_rate": 3.1553299282360966e-05, "loss": 0.2254, "step": 490 }, { "epoch": 0.8003240178209802, "eval_loss": 0.225956991314888, "eval_runtime": 79.2699, "eval_samples_per_second": 6.308, "eval_steps_per_second": 0.795, "step": 494 }, { "epoch": 0.8051842851356824, "grad_norm": 1.757391333580017, "learning_rate": 2.828163190798644e-05, "loss": 0.2203, "step": 497 }, { "epoch": 0.8165249088699879, "grad_norm": 1.7900511026382446, "learning_rate": 2.5171300090530106e-05, "loss": 0.2142, "step": 504 }, { "epoch": 0.8213851761846902, "eval_loss": 0.22206208109855652, "eval_runtime": 85.0506, "eval_samples_per_second": 5.879, "eval_steps_per_second": 0.741, "step": 507 }, { "epoch": 0.8278655326042932, "grad_norm": 1.7866923809051514, "learning_rate": 2.2226426520131734e-05, "loss": 0.2147, "step": 511 }, { "epoch": 0.8392061563385986, "grad_norm": 1.5760256052017212, "learning_rate": 1.9450914574933725e-05, "loss": 0.219, "step": 518 }, { "epoch": 0.8424463345484001, "eval_loss": 0.21945638954639435, "eval_runtime": 12.4577, "eval_samples_per_second": 40.136, "eval_steps_per_second": 5.057, "step": 520 }, { "epoch": 0.850546780072904, "grad_norm": 1.7267919778823853, "learning_rate": 1.6848443147221828e-05, "loss": 0.2101, "step": 525 }, { "epoch": 0.8618874038072094, "grad_norm": 1.6345723867416382, "learning_rate": 1.4422461767118233e-05, "loss": 0.2133, "step": 532 }, { "epoch": 0.8635074929121102, "eval_loss": 0.2179647982120514, "eval_runtime": 85.1994, "eval_samples_per_second": 5.869, "eval_steps_per_second": 0.739, "step": 533 }, { "epoch": 0.8732280275415147, 
"grad_norm": 1.4804401397705078, "learning_rate": 1.2176186030289936e-05, "loss": 0.2136, "step": 539 }, { "epoch": 0.8845686512758202, "grad_norm": 1.593434453010559, "learning_rate": 1.011259333573326e-05, "loss": 0.2095, "step": 546 }, { "epoch": 0.8845686512758202, "eval_loss": 0.21640805900096893, "eval_runtime": 75.9951, "eval_samples_per_second": 6.579, "eval_steps_per_second": 0.829, "step": 546 }, { "epoch": 0.8959092750101255, "grad_norm": 1.5899996757507324, "learning_rate": 8.234418939283866e-06, "loss": 0.2067, "step": 553 }, { "epoch": 0.9056298096395302, "eval_loss": 0.2154710441827774, "eval_runtime": 12.505, "eval_samples_per_second": 39.984, "eval_steps_per_second": 5.038, "step": 559 }, { "epoch": 0.907249898744431, "grad_norm": 1.567068099975586, "learning_rate": 6.544152328083152e-06, "loss": 0.2117, "step": 560 }, { "epoch": 0.9185905224787363, "grad_norm": 1.676525354385376, "learning_rate": 5.044033920806933e-06, "loss": 0.2073, "step": 567 }, { "epoch": 0.9266909680032401, "eval_loss": 0.21462255716323853, "eval_runtime": 87.8678, "eval_samples_per_second": 5.69, "eval_steps_per_second": 0.717, "step": 572 }, { "epoch": 0.9299311462130417, "grad_norm": 1.5873310565948486, "learning_rate": 3.7360520980297514e-06, "loss": 0.208, "step": 574 }, { "epoch": 0.9412717699473471, "grad_norm": 1.7347217798233032, "learning_rate": 2.6219405666614402e-06, "loss": 0.2124, "step": 581 }, { "epoch": 0.9477521263669502, "eval_loss": 0.21402177214622498, "eval_runtime": 87.5614, "eval_samples_per_second": 5.71, "eval_steps_per_second": 0.719, "step": 585 }, { "epoch": 0.9526123936816525, "grad_norm": 1.8601125478744507, "learning_rate": 1.7031760619491353e-06, "loss": 0.2076, "step": 588 }, { "epoch": 0.9639530174159578, "grad_norm": 1.6909899711608887, "learning_rate": 9.809763900905875e-07, "loss": 0.2115, "step": 595 }, { "epoch": 0.9688132847306602, "eval_loss": 0.21379441022872925, "eval_runtime": 12.4885, "eval_samples_per_second": 40.037, "eval_steps_per_second": 5.045, "step": 598 }, { "epoch": 0.9752936411502633, "grad_norm": 1.5291049480438232, "learning_rate": 4.562988140535073e-07, "loss": 0.2071, "step": 602 }, { "epoch": 0.9866342648845686, "grad_norm": 1.775606632232666, "learning_rate": 1.298387847403437e-07, "loss": 0.2127, "step": 609 }, { "epoch": 0.9898744430943702, "eval_loss": 0.21362636983394623, "eval_runtime": 90.3482, "eval_samples_per_second": 5.534, "eval_steps_per_second": 0.697, "step": 611 }, { "epoch": 0.9979748886188741, "grad_norm": 1.7174122333526611, "learning_rate": 2.029019180288527e-09, "loss": 0.2113, "step": 616 } ], "logging_steps": 7, "max_steps": 617, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.47773295983788e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }