{ "best_metric": 0.506678581237793, "best_model_checkpoint": "/home/datta0/models/lora_final/Meta-Llama-3-8B_metamath_reverse/checkpoint-611", "epoch": 0.9995949777237748, "eval_steps": 13, "global_step": 617, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016200891049007696, "grad_norm": 21.833539962768555, "learning_rate": 2.3076923076923076e-05, "loss": 1.2178, "step": 1 }, { "epoch": 0.011340623734305387, "grad_norm": 4.27054500579834, "learning_rate": 0.00016153846153846153, "loss": 0.8667, "step": 7 }, { "epoch": 0.021061158363710003, "eval_loss": 0.7332164645195007, "eval_runtime": 12.1685, "eval_samples_per_second": 41.09, "eval_steps_per_second": 5.177, "step": 13 }, { "epoch": 0.022681247468610773, "grad_norm": 4.137913703918457, "learning_rate": 0.0002999979709808197, "loss": 0.6948, "step": 14 }, { "epoch": 0.03402187120291616, "grad_norm": 6.889961242675781, "learning_rate": 0.0002998701612152596, "loss": 0.7009, "step": 21 }, { "epoch": 0.042122316727420006, "eval_loss": 0.7322337627410889, "eval_runtime": 98.0914, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.642, "step": 26 }, { "epoch": 0.04536249493722155, "grad_norm": 7.484898567199707, "learning_rate": 0.0002995437011859465, "loss": 0.7139, "step": 28 }, { "epoch": 0.056703118671526935, "grad_norm": 9.079093933105469, "learning_rate": 0.00029901902360990936, "loss": 0.7161, "step": 35 }, { "epoch": 0.06318347509113001, "eval_loss": 0.7228567004203796, "eval_runtime": 99.5149, "eval_samples_per_second": 5.024, "eval_steps_per_second": 0.633, "step": 39 }, { "epoch": 0.06804374240583232, "grad_norm": 4.264255523681641, "learning_rate": 0.00029829682393805085, "loss": 0.6852, "step": 42 }, { "epoch": 0.0793843661401377, "grad_norm": 4.362518787384033, "learning_rate": 0.0002973780594333385, "loss": 0.6909, "step": 49 }, { "epoch": 0.08424463345484001, "eval_loss": 0.7190279364585876, "eval_runtime": 12.1598, "eval_samples_per_second": 41.119, "eval_steps_per_second": 5.181, "step": 52 }, { "epoch": 0.0907249898744431, "grad_norm": 5.616891384124756, "learning_rate": 0.00029626394790197025, "loss": 0.6897, "step": 56 }, { "epoch": 0.10206561360874848, "grad_norm": 7.568899154663086, "learning_rate": 0.00029495596607919305, "loss": 0.6541, "step": 63 }, { "epoch": 0.10530579181855002, "eval_loss": 0.708296000957489, "eval_runtime": 108.5306, "eval_samples_per_second": 4.607, "eval_steps_per_second": 0.58, "step": 65 }, { "epoch": 0.11340623734305387, "grad_norm": 6.362358570098877, "learning_rate": 0.00029345584767191685, "loss": 0.6792, "step": 70 }, { "epoch": 0.12474686107735926, "grad_norm": 5.232390403747559, "learning_rate": 0.0002917655810607161, "loss": 0.6704, "step": 77 }, { "epoch": 0.12636695018226002, "eval_loss": 0.7013765573501587, "eval_runtime": 107.5502, "eval_samples_per_second": 4.649, "eval_steps_per_second": 0.586, "step": 78 }, { "epoch": 0.13608748481166463, "grad_norm": 5.135895252227783, "learning_rate": 0.0002898874066642667, "loss": 0.6633, "step": 84 }, { "epoch": 0.14742810854597002, "grad_norm": 6.573320388793945, "learning_rate": 0.00028782381396971003, "loss": 0.6806, "step": 91 }, { "epoch": 0.14742810854597002, "eval_loss": 0.699935257434845, "eval_runtime": 12.1729, "eval_samples_per_second": 41.075, "eval_steps_per_second": 5.175, "step": 91 }, { "epoch": 0.1587687322802754, "grad_norm": 6.816903591156006, "learning_rate": 0.00028557753823288173, "loss": 0.6735, "step": 98 }, { "epoch": 
0.16848926690968002, "eval_loss": 0.6932142376899719, "eval_runtime": 108.078, "eval_samples_per_second": 4.626, "eval_steps_per_second": 0.583, "step": 104 }, { "epoch": 0.1701093560145808, "grad_norm": 5.952498435974121, "learning_rate": 0.0002831515568527781, "loss": 0.6837, "step": 105 }, { "epoch": 0.1814499797488862, "grad_norm": 7.350447177886963, "learning_rate": 0.00028054908542506627, "loss": 0.6509, "step": 112 }, { "epoch": 0.18955042527339003, "eval_loss": 0.6961794495582581, "eval_runtime": 119.5602, "eval_samples_per_second": 4.182, "eval_steps_per_second": 0.527, "step": 117 }, { "epoch": 0.19279060348319157, "grad_norm": 5.018365859985352, "learning_rate": 0.00027777357347986823, "loss": 0.6661, "step": 119 }, { "epoch": 0.20413122721749696, "grad_norm": 6.165168285369873, "learning_rate": 0.00027482869990946986, "loss": 0.6537, "step": 126 }, { "epoch": 0.21061158363710003, "eval_loss": 0.6906590461730957, "eval_runtime": 12.1745, "eval_samples_per_second": 41.069, "eval_steps_per_second": 5.175, "step": 130 }, { "epoch": 0.21547185095180235, "grad_norm": 7.098977088928223, "learning_rate": 0.0002717183680920135, "loss": 0.6461, "step": 133 }, { "epoch": 0.22681247468610774, "grad_norm": 4.91437292098999, "learning_rate": 0.00026844670071763906, "loss": 0.6508, "step": 140 }, { "epoch": 0.23167274200081003, "eval_loss": 0.6892094612121582, "eval_runtime": 99.6868, "eval_samples_per_second": 5.016, "eval_steps_per_second": 0.632, "step": 143 }, { "epoch": 0.23815309842041313, "grad_norm": 5.432809352874756, "learning_rate": 0.00026501803432393037, "loss": 0.6696, "step": 147 }, { "epoch": 0.24949372215471852, "grad_norm": 5.0243706703186035, "learning_rate": 0.00026143691354791145, "loss": 0.6594, "step": 154 }, { "epoch": 0.25273390036452004, "eval_loss": 0.6816443204879761, "eval_runtime": 102.7811, "eval_samples_per_second": 4.865, "eval_steps_per_second": 0.613, "step": 156 }, { "epoch": 0.2608343458890239, "grad_norm": 4.282688617706299, "learning_rate": 0.00025770808510220956, "loss": 0.6503, "step": 161 }, { "epoch": 0.27217496962332927, "grad_norm": 5.6248884201049805, "learning_rate": 0.00025383649148337105, "loss": 0.6534, "step": 168 }, { "epoch": 0.27379505872823007, "eval_loss": 0.6733805537223816, "eval_runtime": 12.1401, "eval_samples_per_second": 41.186, "eval_steps_per_second": 5.189, "step": 169 }, { "epoch": 0.28351559335763465, "grad_norm": 6.263326168060303, "learning_rate": 0.0002498272644206695, "loss": 0.657, "step": 175 }, { "epoch": 0.29485621709194004, "grad_norm": 5.819485187530518, "learning_rate": 0.0002456857180740884, "loss": 0.6559, "step": 182 }, { "epoch": 0.29485621709194004, "eval_loss": 0.6744294166564941, "eval_runtime": 93.9309, "eval_samples_per_second": 5.323, "eval_steps_per_second": 0.671, "step": 182 }, { "epoch": 0.30619684082624543, "grad_norm": 4.828428268432617, "learning_rate": 0.0002414173419904956, "loss": 0.6391, "step": 189 }, { "epoch": 0.3159173754556501, "eval_loss": 0.6739293932914734, "eval_runtime": 94.7015, "eval_samples_per_second": 5.28, "eval_steps_per_second": 0.665, "step": 195 }, { "epoch": 0.3175374645605508, "grad_norm": 4.6714348793029785, "learning_rate": 0.00023702779382734566, "loss": 0.6434, "step": 196 }, { "epoch": 0.3288780882948562, "grad_norm": 4.983964920043945, "learning_rate": 0.0002325228918535541, "loss": 0.6115, "step": 203 }, { "epoch": 0.33697853381936005, "eval_loss": 0.6627940535545349, "eval_runtime": 12.1956, "eval_samples_per_second": 40.998, "eval_steps_per_second": 5.166, "step": 
208 }, { "epoch": 0.3402187120291616, "grad_norm": 4.924707412719727, "learning_rate": 0.00022790860723748442, "loss": 0.6406, "step": 210 }, { "epoch": 0.351559335763467, "grad_norm": 5.098822116851807, "learning_rate": 0.00022319105613226921, "loss": 0.6261, "step": 217 }, { "epoch": 0.3580396921830701, "eval_loss": 0.6547751426696777, "eval_runtime": 95.155, "eval_samples_per_second": 5.255, "eval_steps_per_second": 0.662, "step": 221 }, { "epoch": 0.3628999594977724, "grad_norm": 4.204422950744629, "learning_rate": 0.00021837649156895706, "loss": 0.6338, "step": 224 }, { "epoch": 0.37424058323207776, "grad_norm": 6.743808746337891, "learning_rate": 0.00021347129516822945, "loss": 0.6288, "step": 231 }, { "epoch": 0.37910085054678005, "eval_loss": 0.6544941067695618, "eval_runtime": 94.1977, "eval_samples_per_second": 5.308, "eval_steps_per_second": 0.669, "step": 234 }, { "epoch": 0.38558120696638315, "grad_norm": 4.939682960510254, "learning_rate": 0.00020848196868167505, "loss": 0.641, "step": 238 }, { "epoch": 0.39692183070068854, "grad_norm": 3.6235039234161377, "learning_rate": 0.000203415125373832, "loss": 0.6377, "step": 245 }, { "epoch": 0.4001620089104901, "eval_loss": 0.6509546041488647, "eval_runtime": 12.1816, "eval_samples_per_second": 41.046, "eval_steps_per_second": 5.172, "step": 247 }, { "epoch": 0.4082624544349939, "grad_norm": 3.9135453701019287, "learning_rate": 0.00019827748125642242, "loss": 0.6295, "step": 252 }, { "epoch": 0.4196030781692993, "grad_norm": 4.376216411590576, "learning_rate": 0.0001930758461863965, "loss": 0.6106, "step": 259 }, { "epoch": 0.42122316727420006, "eval_loss": 0.6465097069740295, "eval_runtime": 95.6315, "eval_samples_per_second": 5.228, "eval_steps_per_second": 0.659, "step": 260 }, { "epoch": 0.4309437019036047, "grad_norm": 4.2812418937683105, "learning_rate": 0.0001878171148395872, "loss": 0.6208, "step": 266 }, { "epoch": 0.4422843256379101, "grad_norm": 5.637602806091309, "learning_rate": 0.00018250825757193848, "loss": 0.6203, "step": 273 }, { "epoch": 0.4422843256379101, "eval_loss": 0.637697160243988, "eval_runtime": 96.3956, "eval_samples_per_second": 5.187, "eval_steps_per_second": 0.654, "step": 273 }, { "epoch": 0.4536249493722155, "grad_norm": 3.6267099380493164, "learning_rate": 0.0001771563111804211, "loss": 0.6196, "step": 280 }, { "epoch": 0.46334548400162007, "eval_loss": 0.6275562047958374, "eval_runtime": 12.1998, "eval_samples_per_second": 40.984, "eval_steps_per_second": 5.164, "step": 286 }, { "epoch": 0.46496557310652087, "grad_norm": 5.152669906616211, "learning_rate": 0.0001717683695758819, "loss": 0.6019, "step": 287 }, { "epoch": 0.47630619684082626, "grad_norm": 3.877650022506714, "learning_rate": 0.00016635157438018983, "loss": 0.6146, "step": 294 }, { "epoch": 0.4844066423653301, "eval_loss": 0.6215513944625854, "eval_runtime": 109.2195, "eval_samples_per_second": 4.578, "eval_steps_per_second": 0.577, "step": 299 }, { "epoch": 0.48764682057513165, "grad_norm": 4.388294696807861, "learning_rate": 0.0001609131054601416, "loss": 0.6115, "step": 301 }, { "epoch": 0.49898744430943703, "grad_norm": 3.800142765045166, "learning_rate": 0.00015546017141067432, "loss": 0.5931, "step": 308 }, { "epoch": 0.5054678007290401, "eval_loss": 0.6187200546264648, "eval_runtime": 102.485, "eval_samples_per_second": 4.879, "eval_steps_per_second": 0.615, "step": 312 }, { "epoch": 0.5103280680437424, "grad_norm": 3.4495720863342285, "learning_rate": 0.00015, "loss": 0.6109, "step": 315 }, { "epoch": 0.5216686917780478, 
"grad_norm": 4.938626289367676, "learning_rate": 0.0001445398285893257, "loss": 0.5926, "step": 322 }, { "epoch": 0.5265289590927501, "eval_loss": 0.605811357498169, "eval_runtime": 12.2293, "eval_samples_per_second": 40.885, "eval_steps_per_second": 5.152, "step": 325 }, { "epoch": 0.5330093155123532, "grad_norm": 2.700883626937866, "learning_rate": 0.0001390868945398584, "loss": 0.5906, "step": 329 }, { "epoch": 0.5443499392466585, "grad_norm": 2.808101177215576, "learning_rate": 0.00013364842561981014, "loss": 0.5807, "step": 336 }, { "epoch": 0.5475901174564601, "eval_loss": 0.6017518043518066, "eval_runtime": 107.7874, "eval_samples_per_second": 4.639, "eval_steps_per_second": 0.584, "step": 338 }, { "epoch": 0.555690562980964, "grad_norm": 5.031184196472168, "learning_rate": 0.00012823163042411807, "loss": 0.5699, "step": 343 }, { "epoch": 0.5670311867152693, "grad_norm": 2.7506072521209717, "learning_rate": 0.0001228436888195789, "loss": 0.5738, "step": 350 }, { "epoch": 0.56865127582017, "eval_loss": 0.591549813747406, "eval_runtime": 108.9536, "eval_samples_per_second": 4.589, "eval_steps_per_second": 0.578, "step": 351 }, { "epoch": 0.5783718104495748, "grad_norm": 3.25809645652771, "learning_rate": 0.00011749174242806152, "loss": 0.5777, "step": 357 }, { "epoch": 0.5897124341838801, "grad_norm": 3.419900417327881, "learning_rate": 0.00011218288516041279, "loss": 0.5509, "step": 364 }, { "epoch": 0.5897124341838801, "eval_loss": 0.5852319002151489, "eval_runtime": 12.2275, "eval_samples_per_second": 40.892, "eval_steps_per_second": 5.152, "step": 364 }, { "epoch": 0.6010530579181855, "grad_norm": 3.074368953704834, "learning_rate": 0.00010692415381360349, "loss": 0.5641, "step": 371 }, { "epoch": 0.6107735925475901, "eval_loss": 0.5814855694770813, "eval_runtime": 97.6377, "eval_samples_per_second": 5.121, "eval_steps_per_second": 0.645, "step": 377 }, { "epoch": 0.6123936816524909, "grad_norm": 3.7298901081085205, "learning_rate": 0.00010172251874357757, "loss": 0.5582, "step": 378 }, { "epoch": 0.6237343053867963, "grad_norm": 3.000365972518921, "learning_rate": 9.658487462616794e-05, "loss": 0.5606, "step": 385 }, { "epoch": 0.6318347509113001, "eval_loss": 0.5722550749778748, "eval_runtime": 108.1224, "eval_samples_per_second": 4.624, "eval_steps_per_second": 0.583, "step": 390 }, { "epoch": 0.6350749291211016, "grad_norm": 3.4912805557250977, "learning_rate": 9.151803131832493e-05, "loss": 0.5679, "step": 392 }, { "epoch": 0.6464155528554071, "grad_norm": 3.3884501457214355, "learning_rate": 8.652870483177049e-05, "loss": 0.5478, "step": 399 }, { "epoch": 0.6528959092750102, "eval_loss": 0.5653434991836548, "eval_runtime": 12.2742, "eval_samples_per_second": 40.736, "eval_steps_per_second": 5.133, "step": 403 }, { "epoch": 0.6577561765897124, "grad_norm": 2.968888282775879, "learning_rate": 8.162350843104291e-05, "loss": 0.5543, "step": 406 }, { "epoch": 0.6690968003240179, "grad_norm": 2.6131746768951416, "learning_rate": 7.680894386773072e-05, "loss": 0.5451, "step": 413 }, { "epoch": 0.6739570676387201, "eval_loss": 0.5613173246383667, "eval_runtime": 96.25, "eval_samples_per_second": 5.195, "eval_steps_per_second": 0.655, "step": 416 }, { "epoch": 0.6804374240583232, "grad_norm": 3.4300389289855957, "learning_rate": 7.209139276251558e-05, "loss": 0.5393, "step": 420 }, { "epoch": 0.6917780477926286, "grad_norm": 2.889118194580078, "learning_rate": 6.747710814644589e-05, "loss": 0.5362, "step": 427 }, { "epoch": 0.6950182260024301, "eval_loss": 0.5555723905563354, 
"eval_runtime": 105.9062, "eval_samples_per_second": 4.721, "eval_steps_per_second": 0.595, "step": 429 }, { "epoch": 0.703118671526934, "grad_norm": 3.436547040939331, "learning_rate": 6.297220617265435e-05, "loss": 0.5238, "step": 434 }, { "epoch": 0.7144592952612394, "grad_norm": 2.8543200492858887, "learning_rate": 5.858265800950438e-05, "loss": 0.5328, "step": 441 }, { "epoch": 0.7160793843661402, "eval_loss": 0.5473873019218445, "eval_runtime": 12.2208, "eval_samples_per_second": 40.914, "eval_steps_per_second": 5.155, "step": 442 }, { "epoch": 0.7257999189955447, "grad_norm": 2.4065663814544678, "learning_rate": 5.4314281925911634e-05, "loss": 0.527, "step": 448 }, { "epoch": 0.7371405427298502, "grad_norm": 2.7027878761291504, "learning_rate": 5.0172735579330526e-05, "loss": 0.5185, "step": 455 }, { "epoch": 0.7371405427298502, "eval_loss": 0.5413048267364502, "eval_runtime": 95.8607, "eval_samples_per_second": 5.216, "eval_steps_per_second": 0.657, "step": 455 }, { "epoch": 0.7484811664641555, "grad_norm": 3.0348236560821533, "learning_rate": 4.616350851662895e-05, "loss": 0.5127, "step": 462 }, { "epoch": 0.7582017010935601, "eval_loss": 0.5359441041946411, "eval_runtime": 104.7029, "eval_samples_per_second": 4.775, "eval_steps_per_second": 0.602, "step": 468 }, { "epoch": 0.759821790198461, "grad_norm": 3.366955280303955, "learning_rate": 4.229191489779047e-05, "loss": 0.5387, "step": 469 }, { "epoch": 0.7711624139327663, "grad_norm": 3.5870349407196045, "learning_rate": 3.8563086452088506e-05, "loss": 0.5036, "step": 476 }, { "epoch": 0.7792628594572701, "eval_loss": 0.5299127101898193, "eval_runtime": 12.1891, "eval_samples_per_second": 41.02, "eval_steps_per_second": 5.169, "step": 481 }, { "epoch": 0.7825030376670717, "grad_norm": 2.6109859943389893, "learning_rate": 3.498196567606959e-05, "loss": 0.5213, "step": 483 }, { "epoch": 0.7938436614013771, "grad_norm": 3.2474308013916016, "learning_rate": 3.1553299282360966e-05, "loss": 0.4922, "step": 490 }, { "epoch": 0.8003240178209802, "eval_loss": 0.5264901518821716, "eval_runtime": 102.9216, "eval_samples_per_second": 4.858, "eval_steps_per_second": 0.612, "step": 494 }, { "epoch": 0.8051842851356824, "grad_norm": 2.540072202682495, "learning_rate": 2.828163190798644e-05, "loss": 0.511, "step": 497 }, { "epoch": 0.8165249088699879, "grad_norm": 3.375274419784546, "learning_rate": 2.5171300090530106e-05, "loss": 0.5246, "step": 504 }, { "epoch": 0.8213851761846902, "eval_loss": 0.5218872427940369, "eval_runtime": 104.3935, "eval_samples_per_second": 4.79, "eval_steps_per_second": 0.603, "step": 507 }, { "epoch": 0.8278655326042932, "grad_norm": 2.5530292987823486, "learning_rate": 2.2226426520131734e-05, "loss": 0.5028, "step": 511 }, { "epoch": 0.8392061563385986, "grad_norm": 2.5905301570892334, "learning_rate": 1.9450914574933725e-05, "loss": 0.5088, "step": 518 }, { "epoch": 0.8424463345484001, "eval_loss": 0.517468273639679, "eval_runtime": 12.2173, "eval_samples_per_second": 40.925, "eval_steps_per_second": 5.157, "step": 520 }, { "epoch": 0.850546780072904, "grad_norm": 2.506103992462158, "learning_rate": 1.6848443147221828e-05, "loss": 0.4833, "step": 525 }, { "epoch": 0.8618874038072094, "grad_norm": 2.6942226886749268, "learning_rate": 1.4422461767118233e-05, "loss": 0.4908, "step": 532 }, { "epoch": 0.8635074929121102, "eval_loss": 0.5149521231651306, "eval_runtime": 109.6256, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.575, "step": 533 }, { "epoch": 0.8732280275415147, "grad_norm": 
3.0847156047821045, "learning_rate": 1.2176186030289936e-05, "loss": 0.4903, "step": 539 }, { "epoch": 0.8845686512758202, "grad_norm": 3.1814467906951904, "learning_rate": 1.011259333573326e-05, "loss": 0.5091, "step": 546 }, { "epoch": 0.8845686512758202, "eval_loss": 0.5119792222976685, "eval_runtime": 106.4399, "eval_samples_per_second": 4.697, "eval_steps_per_second": 0.592, "step": 546 }, { "epoch": 0.8959092750101255, "grad_norm": 2.7887933254241943, "learning_rate": 8.234418939283866e-06, "loss": 0.4902, "step": 553 }, { "epoch": 0.9056298096395302, "eval_loss": 0.5096355676651001, "eval_runtime": 12.246, "eval_samples_per_second": 40.83, "eval_steps_per_second": 5.145, "step": 559 }, { "epoch": 0.907249898744431, "grad_norm": 2.728210210800171, "learning_rate": 6.544152328083152e-06, "loss": 0.5004, "step": 560 }, { "epoch": 0.9185905224787363, "grad_norm": 2.694986581802368, "learning_rate": 5.044033920806933e-06, "loss": 0.4865, "step": 567 }, { "epoch": 0.9266909680032401, "eval_loss": 0.5082899332046509, "eval_runtime": 108.3733, "eval_samples_per_second": 4.614, "eval_steps_per_second": 0.581, "step": 572 }, { "epoch": 0.9299311462130417, "grad_norm": 2.781261920928955, "learning_rate": 3.7360520980297514e-06, "loss": 0.4896, "step": 574 }, { "epoch": 0.9412717699473471, "grad_norm": 2.8047616481781006, "learning_rate": 2.6219405666614402e-06, "loss": 0.5007, "step": 581 }, { "epoch": 0.9477521263669502, "eval_loss": 0.5072164535522461, "eval_runtime": 109.1206, "eval_samples_per_second": 4.582, "eval_steps_per_second": 0.577, "step": 585 }, { "epoch": 0.9526123936816525, "grad_norm": 2.2356209754943848, "learning_rate": 1.7031760619491353e-06, "loss": 0.4764, "step": 588 }, { "epoch": 0.9639530174159578, "grad_norm": 2.597925901412964, "learning_rate": 9.809763900905875e-07, "loss": 0.5001, "step": 595 }, { "epoch": 0.9688132847306602, "eval_loss": 0.5067591071128845, "eval_runtime": 12.2032, "eval_samples_per_second": 40.973, "eval_steps_per_second": 5.163, "step": 598 }, { "epoch": 0.9752936411502633, "grad_norm": 2.403350353240967, "learning_rate": 4.562988140535073e-07, "loss": 0.4896, "step": 602 }, { "epoch": 0.9866342648845686, "grad_norm": 2.5980334281921387, "learning_rate": 1.298387847403437e-07, "loss": 0.4989, "step": 609 }, { "epoch": 0.9898744430943702, "eval_loss": 0.506678581237793, "eval_runtime": 104.7374, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.602, "step": 611 }, { "epoch": 0.9979748886188741, "grad_norm": 3.160571813583374, "learning_rate": 2.029019180288527e-09, "loss": 0.4892, "step": 616 } ], "logging_steps": 7, "max_steps": 617, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.463772026185974e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }
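
The object above has the shape of a Hugging Face Trainer state log (it matches the usual trainer_state.json layout): "log_history" interleaves training records (loss, grad_norm, learning_rate, logged every 7 steps) with evaluation records (eval_loss, logged every 13 steps), and "best_model_checkpoint" points at the checkpoint whose eval_loss equals "best_metric" (0.5067 at step 611). Below is a minimal sketch, assuming the file is saved locally as trainer_state.json (the path is an assumption, not taken from the log), showing how the train/eval curves and the best step can be pulled out of this structure; the plotting part is optional and only runs if matplotlib is installed.

import json

# Assumed local path to a file with the structure shown above
# (a Hugging Face Trainer state log); adjust as needed.
STATE_PATH = "trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss".
train_points = [(e["step"], e["loss"])
                for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"])
               for e in state["log_history"] if "eval_loss" in e]

# The lowest eval_loss should match "best_metric" / "best_model_checkpoint".
best_step, best_loss = min(eval_points, key=lambda p: p[1])
print(f"best eval_loss {best_loss:.4f} at step {best_step}")
print(f"recorded best checkpoint: {state['best_model_checkpoint']}")

# Optional: plot both curves if matplotlib is available.
try:
    import matplotlib.pyplot as plt
    plt.plot(*zip(*train_points), label="train loss")
    plt.plot(*zip(*eval_points), label="eval loss")
    plt.xlabel("step")
    plt.ylabel("loss")
    plt.legend()
    plt.savefig("loss_curves.png")
except ImportError:
    pass

Running this against the log above would report step 611 as the best evaluation point, which agrees with the recorded best_model_checkpoint (checkpoint-611).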