{ "best_metric": 2.2199668884277344, "best_model_checkpoint": "/home/datta0/models/lora_final/Meta-Llama-3-8B_pct_ortho/checkpoint-328", "epoch": 0.9887351142581268, "eval_steps": 8, "global_step": 384, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002574831026713872, "grad_norm": 4.8250555992126465, "learning_rate": 3.75e-05, "loss": 2.3613, "step": 1 }, { "epoch": 0.010299324106855488, "grad_norm": 3.812638998031616, "learning_rate": 0.00015, "loss": 2.3464, "step": 4 }, { "epoch": 0.020598648213710977, "grad_norm": 4.135324478149414, "learning_rate": 0.0003, "loss": 2.2594, "step": 8 }, { "epoch": 0.020598648213710977, "eval_loss": 2.2432966232299805, "eval_runtime": 10.4685, "eval_samples_per_second": 23.404, "eval_steps_per_second": 2.961, "step": 8 }, { "epoch": 0.03089797232056646, "grad_norm": 3.940943956375122, "learning_rate": 0.0002999179886011389, "loss": 2.2353, "step": 12 }, { "epoch": 0.04119729642742195, "grad_norm": 3.0164599418640137, "learning_rate": 0.00029967204408281613, "loss": 2.2616, "step": 16 }, { "epoch": 0.04119729642742195, "eval_loss": 2.2376890182495117, "eval_runtime": 10.472, "eval_samples_per_second": 23.396, "eval_steps_per_second": 2.96, "step": 16 }, { "epoch": 0.051496620534277435, "grad_norm": 3.438307046890259, "learning_rate": 0.0002992624353817517, "loss": 2.2166, "step": 20 }, { "epoch": 0.06179594464113292, "grad_norm": 3.0681569576263428, "learning_rate": 0.00029868961039904624, "loss": 2.2041, "step": 24 }, { "epoch": 0.06179594464113292, "eval_loss": 2.2342782020568848, "eval_runtime": 10.4748, "eval_samples_per_second": 23.389, "eval_steps_per_second": 2.959, "step": 24 }, { "epoch": 0.07209526874798841, "grad_norm": 2.55240535736084, "learning_rate": 0.00029795419551040833, "loss": 2.2296, "step": 28 }, { "epoch": 0.0823945928548439, "grad_norm": 2.451361894607544, "learning_rate": 0.0002970569948812214, "loss": 2.2802, "step": 32 }, { "epoch": 0.0823945928548439, "eval_loss": 2.253261089324951, "eval_runtime": 10.4553, "eval_samples_per_second": 23.433, "eval_steps_per_second": 2.965, "step": 32 }, { "epoch": 0.09269391696169939, "grad_norm": 5.104744911193848, "learning_rate": 0.0002959989895872009, "loss": 2.2541, "step": 36 }, { "epoch": 0.10299324106855487, "grad_norm": 2.3457939624786377, "learning_rate": 0.0002947813365416023, "loss": 2.2832, "step": 40 }, { "epoch": 0.10299324106855487, "eval_loss": 2.2618255615234375, "eval_runtime": 10.4563, "eval_samples_per_second": 23.431, "eval_steps_per_second": 2.965, "step": 40 }, { "epoch": 0.11329256517541036, "grad_norm": 3.244114398956299, "learning_rate": 0.0002934053672301536, "loss": 2.2859, "step": 44 }, { "epoch": 0.12359188928226585, "grad_norm": 2.5838561058044434, "learning_rate": 0.00029187258625509513, "loss": 2.2484, "step": 48 }, { "epoch": 0.12359188928226585, "eval_loss": 2.25663685798645, "eval_runtime": 10.4533, "eval_samples_per_second": 23.438, "eval_steps_per_second": 2.966, "step": 48 }, { "epoch": 0.13389121338912133, "grad_norm": 2.609206199645996, "learning_rate": 0.0002901846696899191, "loss": 2.3116, "step": 52 }, { "epoch": 0.14419053749597682, "grad_norm": 3.638091802597046, "learning_rate": 0.0002883434632466077, "loss": 2.2735, "step": 56 }, { "epoch": 0.14419053749597682, "eval_loss": 2.2576816082000732, "eval_runtime": 10.4026, "eval_samples_per_second": 23.552, "eval_steps_per_second": 2.98, "step": 56 }, { "epoch": 0.15448986160283232, "grad_norm": 2.5911107063293457, "learning_rate": 0.00028635098025737434, "loss": 2.2803, "step": 60 }, { "epoch": 0.1647891857096878, "grad_norm": 2.4840705394744873, "learning_rate": 0.0002842093994731145, "loss": 2.293, "step": 64 }, { "epoch": 0.1647891857096878, "eval_loss": 2.2783665657043457, "eval_runtime": 10.3852, "eval_samples_per_second": 23.591, "eval_steps_per_second": 2.985, "step": 64 }, { "epoch": 0.17508850981654328, "grad_norm": 2.24977970123291, "learning_rate": 0.00028192106268097334, "loss": 2.3169, "step": 68 }, { "epoch": 0.18538783392339878, "grad_norm": 2.666362762451172, "learning_rate": 0.0002794884721436361, "loss": 2.2518, "step": 72 }, { "epoch": 0.18538783392339878, "eval_loss": 2.2818338871002197, "eval_runtime": 10.3527, "eval_samples_per_second": 23.665, "eval_steps_per_second": 2.994, "step": 72 }, { "epoch": 0.19568715803025427, "grad_norm": 2.480801582336426, "learning_rate": 0.0002769142878631403, "loss": 2.3334, "step": 76 }, { "epoch": 0.20598648213710974, "grad_norm": 2.343674898147583, "learning_rate": 0.000274201324672203, "loss": 2.2922, "step": 80 }, { "epoch": 0.20598648213710974, "eval_loss": 2.291419744491577, "eval_runtime": 10.3695, "eval_samples_per_second": 23.627, "eval_steps_per_second": 2.99, "step": 80 }, { "epoch": 0.21628580624396523, "grad_norm": 2.1927764415740967, "learning_rate": 0.0002713525491562421, "loss": 2.2794, "step": 84 }, { "epoch": 0.22658513035082073, "grad_norm": 2.764920711517334, "learning_rate": 0.00026837107640945905, "loss": 2.3199, "step": 88 }, { "epoch": 0.22658513035082073, "eval_loss": 2.2720041275024414, "eval_runtime": 10.3284, "eval_samples_per_second": 23.721, "eval_steps_per_second": 3.001, "step": 88 }, { "epoch": 0.23688445445767622, "grad_norm": 2.5399646759033203, "learning_rate": 0.00026526016662852886, "loss": 2.2966, "step": 92 }, { "epoch": 0.2471837785645317, "grad_norm": 2.4544336795806885, "learning_rate": 0.0002620232215476231, "loss": 2.3807, "step": 96 }, { "epoch": 0.2471837785645317, "eval_loss": 2.278792142868042, "eval_runtime": 10.3488, "eval_samples_per_second": 23.674, "eval_steps_per_second": 2.996, "step": 96 }, { "epoch": 0.2574831026713872, "grad_norm": 2.6318697929382324, "learning_rate": 0.00025866378071866334, "loss": 2.3591, "step": 100 }, { "epoch": 0.26778242677824265, "grad_norm": 2.789501667022705, "learning_rate": 0.00025518551764087326, "loss": 2.3528, "step": 104 }, { "epoch": 0.26778242677824265, "eval_loss": 2.2830569744110107, "eval_runtime": 10.2996, "eval_samples_per_second": 23.787, "eval_steps_per_second": 3.01, "step": 104 }, { "epoch": 0.2780817508850982, "grad_norm": 2.526961326599121, "learning_rate": 0.00025159223574386114, "loss": 2.3028, "step": 108 }, { "epoch": 0.28838107499195365, "grad_norm": 3.0287418365478516, "learning_rate": 0.00024788786422862526, "loss": 2.3144, "step": 112 }, { "epoch": 0.28838107499195365, "eval_loss": 2.295750379562378, "eval_runtime": 89.6533, "eval_samples_per_second": 2.733, "eval_steps_per_second": 0.346, "step": 112 }, { "epoch": 0.29868039909880917, "grad_norm": 3.0724334716796875, "learning_rate": 0.00024407645377103054, "loss": 2.3251, "step": 116 }, { "epoch": 0.30897972320566464, "grad_norm": 3.3411991596221924, "learning_rate": 0.00024016217209245374, "loss": 2.3652, "step": 120 }, { "epoch": 0.30897972320566464, "eval_loss": 2.295003652572632, "eval_runtime": 92.6572, "eval_samples_per_second": 2.644, "eval_steps_per_second": 0.335, "step": 120 }, { "epoch": 0.3192790473125201, "grad_norm": 2.637554407119751, "learning_rate": 0.0002361492994024415, "loss": 2.2994, "step": 124 }, { "epoch": 0.3295783714193756, "grad_norm": 2.461324453353882, "learning_rate": 0.00023204222371836405, "loss": 2.3637, "step": 128 }, { "epoch": 0.3295783714193756, "eval_loss": 2.2845101356506348, "eval_runtime": 94.8437, "eval_samples_per_second": 2.583, "eval_steps_per_second": 0.327, "step": 128 }, { "epoch": 0.3398776955262311, "grad_norm": 2.624124765396118, "learning_rate": 0.00022784543606718227, "loss": 2.332, "step": 132 }, { "epoch": 0.35017701963308656, "grad_norm": 2.3923728466033936, "learning_rate": 0.0002235635255745762, "loss": 2.3014, "step": 136 }, { "epoch": 0.35017701963308656, "eval_loss": 2.278064250946045, "eval_runtime": 101.7542, "eval_samples_per_second": 2.408, "eval_steps_per_second": 0.305, "step": 136 }, { "epoch": 0.3604763437399421, "grad_norm": 3.1025068759918213, "learning_rate": 0.00021920117444680317, "loss": 2.3819, "step": 140 }, { "epoch": 0.37077566784679755, "grad_norm": 2.671719551086426, "learning_rate": 0.0002147631528507739, "loss": 2.3067, "step": 144 }, { "epoch": 0.37077566784679755, "eval_loss": 2.2829501628875732, "eval_runtime": 101.7933, "eval_samples_per_second": 2.407, "eval_steps_per_second": 0.305, "step": 144 }, { "epoch": 0.381074991953653, "grad_norm": 2.460653781890869, "learning_rate": 0.0002102543136979454, "loss": 2.2942, "step": 148 }, { "epoch": 0.39137431606050854, "grad_norm": 2.579962730407715, "learning_rate": 0.0002056795873377331, "loss": 2.3242, "step": 152 }, { "epoch": 0.39137431606050854, "eval_loss": 2.2787716388702393, "eval_runtime": 105.0919, "eval_samples_per_second": 2.331, "eval_steps_per_second": 0.295, "step": 152 }, { "epoch": 0.401673640167364, "grad_norm": 2.271052837371826, "learning_rate": 0.00020104397616624645, "loss": 2.3009, "step": 156 }, { "epoch": 0.4119729642742195, "grad_norm": 3.09191632270813, "learning_rate": 0.0001963525491562421, "loss": 2.3184, "step": 160 }, { "epoch": 0.4119729642742195, "eval_loss": 2.265949249267578, "eval_runtime": 100.9124, "eval_samples_per_second": 2.428, "eval_steps_per_second": 0.307, "step": 160 }, { "epoch": 0.422272288381075, "grad_norm": 2.4084908962249756, "learning_rate": 0.00019161043631427666, "loss": 2.2862, "step": 164 }, { "epoch": 0.43257161248793047, "grad_norm": 2.453981399536133, "learning_rate": 0.00018682282307111987, "loss": 2.3574, "step": 168 }, { "epoch": 0.43257161248793047, "eval_loss": 2.278163433074951, "eval_runtime": 10.4957, "eval_samples_per_second": 23.343, "eval_steps_per_second": 2.954, "step": 168 }, { "epoch": 0.442870936594786, "grad_norm": 2.5125510692596436, "learning_rate": 0.00018199494461156203, "loss": 2.3157, "step": 172 }, { "epoch": 0.45317026070164146, "grad_norm": 3.002711057662964, "learning_rate": 0.00017713208014981648, "loss": 2.3006, "step": 176 }, { "epoch": 0.45317026070164146, "eval_loss": 2.2733473777770996, "eval_runtime": 10.4821, "eval_samples_per_second": 23.373, "eval_steps_per_second": 2.957, "step": 176 }, { "epoch": 0.4634695848084969, "grad_norm": 2.4078829288482666, "learning_rate": 0.00017223954715677627, "loss": 2.3307, "step": 180 }, { "epoch": 0.47376890891535245, "grad_norm": 2.81722092628479, "learning_rate": 0.00016732269554543794, "loss": 2.3082, "step": 184 }, { "epoch": 0.47376890891535245, "eval_loss": 2.2698850631713867, "eval_runtime": 10.4759, "eval_samples_per_second": 23.387, "eval_steps_per_second": 2.959, "step": 184 }, { "epoch": 0.4840682330222079, "grad_norm": 2.851599931716919, "learning_rate": 0.00016238690182084986, "loss": 2.2678, "step": 188 }, { "epoch": 0.4943675571290634, "grad_norm": 3.283416271209717, "learning_rate": 0.00015743756320098332, "loss": 2.3097, "step": 192 }, { "epoch": 0.4943675571290634, "eval_loss": 2.2615249156951904, "eval_runtime": 10.4874, "eval_samples_per_second": 23.361, "eval_steps_per_second": 2.956, "step": 192 }, { "epoch": 0.5046668812359189, "grad_norm": 2.369040012359619, "learning_rate": 0.00015248009171495378, "loss": 2.304, "step": 196 }, { "epoch": 0.5149662053427744, "grad_norm": 2.1018686294555664, "learning_rate": 0.00014751990828504622, "loss": 2.3003, "step": 200 }, { "epoch": 0.5149662053427744, "eval_loss": 2.264901638031006, "eval_runtime": 10.4653, "eval_samples_per_second": 23.411, "eval_steps_per_second": 2.962, "step": 200 }, { "epoch": 0.5252655294496299, "grad_norm": 2.4146642684936523, "learning_rate": 0.00014256243679901663, "loss": 2.2735, "step": 204 }, { "epoch": 0.5355648535564853, "grad_norm": 2.4833011627197266, "learning_rate": 0.00013761309817915014, "loss": 2.3027, "step": 208 }, { "epoch": 0.5355648535564853, "eval_loss": 2.2593843936920166, "eval_runtime": 10.4677, "eval_samples_per_second": 23.405, "eval_steps_per_second": 2.961, "step": 208 }, { "epoch": 0.5458641776633408, "grad_norm": 2.3660264015197754, "learning_rate": 0.00013267730445456208, "loss": 2.2475, "step": 212 }, { "epoch": 0.5561635017701964, "grad_norm": 2.8252508640289307, "learning_rate": 0.00012776045284322368, "loss": 2.3262, "step": 216 }, { "epoch": 0.5561635017701964, "eval_loss": 2.249122142791748, "eval_runtime": 10.4377, "eval_samples_per_second": 23.473, "eval_steps_per_second": 2.97, "step": 216 }, { "epoch": 0.5664628258770518, "grad_norm": 2.3867785930633545, "learning_rate": 0.00012286791985018355, "loss": 2.2732, "step": 220 }, { "epoch": 0.5767621499839073, "grad_norm": 2.2431817054748535, "learning_rate": 0.00011800505538843798, "loss": 2.3118, "step": 224 }, { "epoch": 0.5767621499839073, "eval_loss": 2.2598836421966553, "eval_runtime": 10.3894, "eval_samples_per_second": 23.582, "eval_steps_per_second": 2.984, "step": 224 }, { "epoch": 0.5870614740907628, "grad_norm": 2.2373931407928467, "learning_rate": 0.00011317717692888012, "loss": 2.3041, "step": 228 }, { "epoch": 0.5973607981976183, "grad_norm": 2.278763771057129, "learning_rate": 0.00010838956368572334, "loss": 2.2904, "step": 232 }, { "epoch": 0.5973607981976183, "eval_loss": 2.26226544380188, "eval_runtime": 10.39, "eval_samples_per_second": 23.58, "eval_steps_per_second": 2.984, "step": 232 }, { "epoch": 0.6076601223044737, "grad_norm": 2.2224009037017822, "learning_rate": 0.0001036474508437579, "loss": 2.3356, "step": 236 }, { "epoch": 0.6179594464113293, "grad_norm": 1.9502872228622437, "learning_rate": 9.895602383375353e-05, "loss": 2.2519, "step": 240 }, { "epoch": 0.6179594464113293, "eval_loss": 2.249502658843994, "eval_runtime": 10.3323, "eval_samples_per_second": 23.712, "eval_steps_per_second": 3.0, "step": 240 }, { "epoch": 0.6282587705181848, "grad_norm": 1.9756816625595093, "learning_rate": 9.432041266226686e-05, "loss": 2.3011, "step": 244 }, { "epoch": 0.6385580946250402, "grad_norm": 2.24442195892334, "learning_rate": 8.97456863020546e-05, "loss": 2.2907, "step": 248 }, { "epoch": 0.6385580946250402, "eval_loss": 2.252591848373413, "eval_runtime": 10.375, "eval_samples_per_second": 23.615, "eval_steps_per_second": 2.988, "step": 248 }, { "epoch": 0.6488574187318957, "grad_norm": 2.2955076694488525, "learning_rate": 8.523684714922608e-05, "loss": 2.2493, "step": 252 }, { "epoch": 0.6591567428387513, "grad_norm": 2.3158631324768066, "learning_rate": 8.079882555319684e-05, "loss": 2.2864, "step": 256 }, { "epoch": 0.6591567428387513, "eval_loss": 2.25117826461792, "eval_runtime": 10.3184, "eval_samples_per_second": 23.744, "eval_steps_per_second": 3.004, "step": 256 }, { "epoch": 0.6694560669456067, "grad_norm": 1.8003387451171875, "learning_rate": 7.643647442542382e-05, "loss": 2.2806, "step": 260 }, { "epoch": 0.6797553910524622, "grad_norm": 2.0942351818084717, "learning_rate": 7.215456393281776e-05, "loss": 2.242, "step": 264 }, { "epoch": 0.6797553910524622, "eval_loss": 2.249201774597168, "eval_runtime": 10.2986, "eval_samples_per_second": 23.79, "eval_steps_per_second": 3.01, "step": 264 }, { "epoch": 0.6900547151593177, "grad_norm": 2.113708019256592, "learning_rate": 6.795777628163599e-05, "loss": 2.2817, "step": 268 }, { "epoch": 0.7003540392661731, "grad_norm": 2.8915371894836426, "learning_rate": 6.385070059755846e-05, "loss": 2.2941, "step": 272 }, { "epoch": 0.7003540392661731, "eval_loss": 2.2414655685424805, "eval_runtime": 81.0572, "eval_samples_per_second": 3.023, "eval_steps_per_second": 0.382, "step": 272 }, { "epoch": 0.7106533633730286, "grad_norm": 2.2395060062408447, "learning_rate": 5.983782790754623e-05, "loss": 2.2913, "step": 276 }, { "epoch": 0.7209526874798842, "grad_norm": 2.083782911300659, "learning_rate": 5.592354622896944e-05, "loss": 2.2799, "step": 280 }, { "epoch": 0.7209526874798842, "eval_loss": 2.238328695297241, "eval_runtime": 47.7655, "eval_samples_per_second": 5.129, "eval_steps_per_second": 0.649, "step": 280 }, { "epoch": 0.7312520115867396, "grad_norm": 2.176422595977783, "learning_rate": 5.211213577137469e-05, "loss": 2.2807, "step": 284 }, { "epoch": 0.7415513356935951, "grad_norm": 2.0646986961364746, "learning_rate": 4.840776425613886e-05, "loss": 2.2881, "step": 288 }, { "epoch": 0.7415513356935951, "eval_loss": 2.235785484313965, "eval_runtime": 48.1216, "eval_samples_per_second": 5.091, "eval_steps_per_second": 0.644, "step": 288 }, { "epoch": 0.7518506598004506, "grad_norm": 1.7966443300247192, "learning_rate": 4.481448235912671e-05, "loss": 2.2754, "step": 292 }, { "epoch": 0.762149983907306, "grad_norm": 1.8745654821395874, "learning_rate": 4.133621928133665e-05, "loss": 2.2797, "step": 296 }, { "epoch": 0.762149983907306, "eval_loss": 2.2380549907684326, "eval_runtime": 51.7197, "eval_samples_per_second": 4.737, "eval_steps_per_second": 0.599, "step": 296 }, { "epoch": 0.7724493080141616, "grad_norm": 2.07661771774292, "learning_rate": 3.797677845237696e-05, "loss": 2.2368, "step": 300 }, { "epoch": 0.7827486321210171, "grad_norm": 2.3376801013946533, "learning_rate": 3.473983337147118e-05, "loss": 2.3197, "step": 304 }, { "epoch": 0.7827486321210171, "eval_loss": 2.2254865169525146, "eval_runtime": 50.4873, "eval_samples_per_second": 4.853, "eval_steps_per_second": 0.614, "step": 304 }, { "epoch": 0.7930479562278725, "grad_norm": 2.3710439205169678, "learning_rate": 3.162892359054098e-05, "loss": 2.2302, "step": 308 }, { "epoch": 0.803347280334728, "grad_norm": 2.251873731613159, "learning_rate": 2.8647450843757897e-05, "loss": 2.2507, "step": 312 }, { "epoch": 0.803347280334728, "eval_loss": 2.228350877761841, "eval_runtime": 84.236, "eval_samples_per_second": 2.908, "eval_steps_per_second": 0.368, "step": 312 }, { "epoch": 0.8136466044415835, "grad_norm": 2.0786876678466797, "learning_rate": 2.5798675327796993e-05, "loss": 2.2143, "step": 316 }, { "epoch": 0.823945928548439, "grad_norm": 2.0034406185150146, "learning_rate": 2.3085712136859668e-05, "loss": 2.236, "step": 320 }, { "epoch": 0.823945928548439, "eval_loss": 2.2312803268432617, "eval_runtime": 81.5754, "eval_samples_per_second": 3.003, "eval_steps_per_second": 0.38, "step": 320 }, { "epoch": 0.8342452526552945, "grad_norm": 1.921857476234436, "learning_rate": 2.0511527856363912e-05, "loss": 2.2961, "step": 324 }, { "epoch": 0.84454457676215, "grad_norm": 1.6842375993728638, "learning_rate": 1.8078937319026654e-05, "loss": 2.2667, "step": 328 }, { "epoch": 0.84454457676215, "eval_loss": 2.2199668884277344, "eval_runtime": 10.4841, "eval_samples_per_second": 23.369, "eval_steps_per_second": 2.957, "step": 328 }, { "epoch": 0.8548439008690055, "grad_norm": 1.776142954826355, "learning_rate": 1.579060052688548e-05, "loss": 2.2333, "step": 332 }, { "epoch": 0.8651432249758609, "grad_norm": 2.0090060234069824, "learning_rate": 1.3649019742625623e-05, "loss": 2.2763, "step": 336 }, { "epoch": 0.8651432249758609, "eval_loss": 2.2255115509033203, "eval_runtime": 10.47, "eval_samples_per_second": 23.4, "eval_steps_per_second": 2.961, "step": 336 }, { "epoch": 0.8754425490827165, "grad_norm": 2.0487890243530273, "learning_rate": 1.1656536753392287e-05, "loss": 2.2732, "step": 340 }, { "epoch": 0.885741873189572, "grad_norm": 1.6611812114715576, "learning_rate": 9.815330310080887e-06, "loss": 2.2915, "step": 344 }, { "epoch": 0.885741873189572, "eval_loss": 2.2228875160217285, "eval_runtime": 10.4667, "eval_samples_per_second": 23.407, "eval_steps_per_second": 2.962, "step": 344 }, { "epoch": 0.8960411972964274, "grad_norm": 1.7494527101516724, "learning_rate": 8.127413744904804e-06, "loss": 2.2629, "step": 348 }, { "epoch": 0.9063405214032829, "grad_norm": 2.0750112533569336, "learning_rate": 6.594632769846353e-06, "loss": 2.2554, "step": 352 }, { "epoch": 0.9063405214032829, "eval_loss": 2.221097230911255, "eval_runtime": 10.4622, "eval_samples_per_second": 23.418, "eval_steps_per_second": 2.963, "step": 352 }, { "epoch": 0.9166398455101384, "grad_norm": 1.803432583808899, "learning_rate": 5.218663458397715e-06, "loss": 2.2508, "step": 356 }, { "epoch": 0.9269391696169939, "grad_norm": 1.7050764560699463, "learning_rate": 4.001010412799138e-06, "loss": 2.2237, "step": 360 }, { "epoch": 0.9269391696169939, "eval_loss": 2.224059820175171, "eval_runtime": 10.5244, "eval_samples_per_second": 23.279, "eval_steps_per_second": 2.946, "step": 360 }, { "epoch": 0.9372384937238494, "grad_norm": 1.768523097038269, "learning_rate": 2.9430051187785962e-06, "loss": 2.2772, "step": 364 }, { "epoch": 0.9475378178307049, "grad_norm": 1.7917652130126953, "learning_rate": 2.0458044895916513e-06, "loss": 2.2446, "step": 368 }, { "epoch": 0.9475378178307049, "eval_loss": 2.2229297161102295, "eval_runtime": 10.4777, "eval_samples_per_second": 23.383, "eval_steps_per_second": 2.959, "step": 368 }, { "epoch": 0.9578371419375603, "grad_norm": 1.6179791688919067, "learning_rate": 1.3103896009537207e-06, "loss": 2.2329, "step": 372 }, { "epoch": 0.9681364660444158, "grad_norm": 1.8671883344650269, "learning_rate": 7.375646182482875e-07, "loss": 2.2926, "step": 376 }, { "epoch": 0.9681364660444158, "eval_loss": 2.2224321365356445, "eval_runtime": 10.4643, "eval_samples_per_second": 23.413, "eval_steps_per_second": 2.962, "step": 376 }, { "epoch": 0.9784357901512714, "grad_norm": 2.0502476692199707, "learning_rate": 3.2795591718381975e-07, "loss": 2.2313, "step": 380 }, { "epoch": 0.9887351142581268, "grad_norm": 1.8618476390838623, "learning_rate": 8.201139886109264e-08, "loss": 2.2813, "step": 384 }, { "epoch": 0.9887351142581268, "eval_loss": 2.2218899726867676, "eval_runtime": 10.462, "eval_samples_per_second": 23.418, "eval_steps_per_second": 2.963, "step": 384 } ], "logging_steps": 4, "max_steps": 388, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 8, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.99888145270571e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }