{ "best_metric": 2.191589117050171, "best_model_checkpoint": "/home/datta0/models/lora_final/Meta-Llama-3-8B_pct_default/checkpoint-384", "epoch": 0.9990344383649823, "eval_steps": 8, "global_step": 388, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002574831026713872, "grad_norm": 7.236040115356445, "learning_rate": 3.75e-05, "loss": 2.3601, "step": 1 }, { "epoch": 0.010299324106855488, "grad_norm": 8.456689834594727, "learning_rate": 0.00015, "loss": 2.331, "step": 4 }, { "epoch": 0.020598648213710977, "grad_norm": 6.369657516479492, "learning_rate": 0.0003, "loss": 2.2535, "step": 8 }, { "epoch": 0.020598648213710977, "eval_loss": 2.273538827896118, "eval_runtime": 10.4749, "eval_samples_per_second": 23.389, "eval_steps_per_second": 2.959, "step": 8 }, { "epoch": 0.03089797232056646, "grad_norm": 11.503490447998047, "learning_rate": 0.0002999179886011389, "loss": 2.2455, "step": 12 }, { "epoch": 0.04119729642742195, "grad_norm": 7.761600494384766, "learning_rate": 0.00029967204408281613, "loss": 2.2839, "step": 16 }, { "epoch": 0.04119729642742195, "eval_loss": 2.2722208499908447, "eval_runtime": 10.4676, "eval_samples_per_second": 23.405, "eval_steps_per_second": 2.962, "step": 16 }, { "epoch": 0.051496620534277435, "grad_norm": 8.975590705871582, "learning_rate": 0.0002992624353817517, "loss": 2.2622, "step": 20 }, { "epoch": 0.06179594464113292, "grad_norm": 12.953361511230469, "learning_rate": 0.00029868961039904624, "loss": 2.2318, "step": 24 }, { "epoch": 0.06179594464113292, "eval_loss": 2.27947735786438, "eval_runtime": 10.4821, "eval_samples_per_second": 23.373, "eval_steps_per_second": 2.957, "step": 24 }, { "epoch": 0.07209526874798841, "grad_norm": 10.436603546142578, "learning_rate": 0.00029795419551040833, "loss": 2.2613, "step": 28 }, { "epoch": 0.0823945928548439, "grad_norm": 18.31772232055664, "learning_rate": 0.0002970569948812214, "loss": 2.3022, "step": 32 }, { "epoch": 0.0823945928548439, "eval_loss": 2.272110939025879, "eval_runtime": 10.4777, "eval_samples_per_second": 23.383, "eval_steps_per_second": 2.959, "step": 32 }, { "epoch": 0.09269391696169939, "grad_norm": 15.171878814697266, "learning_rate": 0.0002959989895872009, "loss": 2.2609, "step": 36 }, { "epoch": 0.10299324106855487, "grad_norm": 10.8043794631958, "learning_rate": 0.0002947813365416023, "loss": 2.2843, "step": 40 }, { "epoch": 0.10299324106855487, "eval_loss": 2.2608468532562256, "eval_runtime": 10.4804, "eval_samples_per_second": 23.377, "eval_steps_per_second": 2.958, "step": 40 }, { "epoch": 0.11329256517541036, "grad_norm": 8.218811988830566, "learning_rate": 0.0002934053672301536, "loss": 2.2849, "step": 44 }, { "epoch": 0.12359188928226585, "grad_norm": 9.294170379638672, "learning_rate": 0.00029187258625509513, "loss": 2.2433, "step": 48 }, { "epoch": 0.12359188928226585, "eval_loss": 2.2513418197631836, "eval_runtime": 10.4749, "eval_samples_per_second": 23.389, "eval_steps_per_second": 2.959, "step": 48 }, { "epoch": 0.13389121338912133, "grad_norm": 5.276401996612549, "learning_rate": 0.0002901846696899191, "loss": 2.3066, "step": 52 }, { "epoch": 0.14419053749597682, "grad_norm": 5.453143119812012, "learning_rate": 0.0002883434632466077, "loss": 2.2617, "step": 56 }, { "epoch": 0.14419053749597682, "eval_loss": 2.2631678581237793, "eval_runtime": 10.4651, "eval_samples_per_second": 23.411, "eval_steps_per_second": 2.962, "step": 56 }, { "epoch": 0.15448986160283232, "grad_norm": 4.665058135986328, "learning_rate": 0.00028635098025737434, "loss": 2.2787, "step": 60 }, { "epoch": 0.1647891857096878, "grad_norm": 5.689172267913818, "learning_rate": 0.0002842093994731145, "loss": 2.2962, "step": 64 }, { "epoch": 0.1647891857096878, "eval_loss": 2.276745080947876, "eval_runtime": 10.4548, "eval_samples_per_second": 23.434, "eval_steps_per_second": 2.965, "step": 64 }, { "epoch": 0.17508850981654328, "grad_norm": 5.17114782333374, "learning_rate": 0.00028192106268097334, "loss": 2.3209, "step": 68 }, { "epoch": 0.18538783392339878, "grad_norm": 7.912550926208496, "learning_rate": 0.0002794884721436361, "loss": 2.2573, "step": 72 }, { "epoch": 0.18538783392339878, "eval_loss": 2.2800114154815674, "eval_runtime": 10.441, "eval_samples_per_second": 23.465, "eval_steps_per_second": 2.969, "step": 72 }, { "epoch": 0.19568715803025427, "grad_norm": 7.60773229598999, "learning_rate": 0.0002769142878631403, "loss": 2.3339, "step": 76 }, { "epoch": 0.20598648213710974, "grad_norm": 4.8477959632873535, "learning_rate": 0.000274201324672203, "loss": 2.2856, "step": 80 }, { "epoch": 0.20598648213710974, "eval_loss": 2.2750115394592285, "eval_runtime": 10.4164, "eval_samples_per_second": 23.521, "eval_steps_per_second": 2.976, "step": 80 }, { "epoch": 0.21628580624396523, "grad_norm": 6.659757614135742, "learning_rate": 0.0002713525491562421, "loss": 2.2837, "step": 84 }, { "epoch": 0.22658513035082073, "grad_norm": 6.980467319488525, "learning_rate": 0.00026837107640945905, "loss": 2.3158, "step": 88 }, { "epoch": 0.22658513035082073, "eval_loss": 2.2799086570739746, "eval_runtime": 10.3932, "eval_samples_per_second": 23.573, "eval_steps_per_second": 2.983, "step": 88 }, { "epoch": 0.23688445445767622, "grad_norm": 5.191193103790283, "learning_rate": 0.00026526016662852886, "loss": 2.301, "step": 92 }, { "epoch": 0.2471837785645317, "grad_norm": 5.624630451202393, "learning_rate": 0.0002620232215476231, "loss": 2.3622, "step": 96 }, { "epoch": 0.2471837785645317, "eval_loss": 2.2860019207000732, "eval_runtime": 10.3766, "eval_samples_per_second": 23.611, "eval_steps_per_second": 2.987, "step": 96 }, { "epoch": 0.2574831026713872, "grad_norm": 5.981316089630127, "learning_rate": 0.00025866378071866334, "loss": 2.3561, "step": 100 }, { "epoch": 0.26778242677824265, "grad_norm": 6.887294292449951, "learning_rate": 0.00025518551764087326, "loss": 2.357, "step": 104 }, { "epoch": 0.26778242677824265, "eval_loss": 2.290139675140381, "eval_runtime": 10.3578, "eval_samples_per_second": 23.654, "eval_steps_per_second": 2.993, "step": 104 }, { "epoch": 0.2780817508850982, "grad_norm": 6.06417179107666, "learning_rate": 0.00025159223574386114, "loss": 2.3055, "step": 108 }, { "epoch": 0.28838107499195365, "grad_norm": 6.5543928146362305, "learning_rate": 0.00024788786422862526, "loss": 2.3124, "step": 112 }, { "epoch": 0.28838107499195365, "eval_loss": 2.2984819412231445, "eval_runtime": 44.4261, "eval_samples_per_second": 5.515, "eval_steps_per_second": 0.698, "step": 112 }, { "epoch": 0.29868039909880917, "grad_norm": 5.269571304321289, "learning_rate": 0.00024407645377103054, "loss": 2.3163, "step": 116 }, { "epoch": 0.30897972320566464, "grad_norm": 5.49562931060791, "learning_rate": 0.00024016217209245374, "loss": 2.3646, "step": 120 }, { "epoch": 0.30897972320566464, "eval_loss": 2.2943296432495117, "eval_runtime": 44.1467, "eval_samples_per_second": 5.55, "eval_steps_per_second": 0.702, "step": 120 }, { "epoch": 0.3192790473125201, "grad_norm": 7.95536994934082, "learning_rate": 0.0002361492994024415, "loss": 2.3003, "step": 124 }, { "epoch": 0.3295783714193756, "grad_norm": 5.131826400756836, "learning_rate": 0.00023204222371836405, "loss": 2.3591, "step": 128 }, { "epoch": 0.3295783714193756, "eval_loss": 2.2891480922698975, "eval_runtime": 44.0608, "eval_samples_per_second": 5.56, "eval_steps_per_second": 0.704, "step": 128 }, { "epoch": 0.3398776955262311, "grad_norm": 6.059808731079102, "learning_rate": 0.00022784543606718227, "loss": 2.3418, "step": 132 }, { "epoch": 0.35017701963308656, "grad_norm": 6.209086894989014, "learning_rate": 0.0002235635255745762, "loss": 2.3085, "step": 136 }, { "epoch": 0.35017701963308656, "eval_loss": 2.2922980785369873, "eval_runtime": 44.0389, "eval_samples_per_second": 5.563, "eval_steps_per_second": 0.704, "step": 136 }, { "epoch": 0.3604763437399421, "grad_norm": 5.690346717834473, "learning_rate": 0.00021920117444680317, "loss": 2.3863, "step": 140 }, { "epoch": 0.37077566784679755, "grad_norm": 4.371445178985596, "learning_rate": 0.0002147631528507739, "loss": 2.3054, "step": 144 }, { "epoch": 0.37077566784679755, "eval_loss": 2.287841558456421, "eval_runtime": 44.2544, "eval_samples_per_second": 5.536, "eval_steps_per_second": 0.7, "step": 144 }, { "epoch": 0.381074991953653, "grad_norm": 6.41508674621582, "learning_rate": 0.0002102543136979454, "loss": 2.2997, "step": 148 }, { "epoch": 0.39137431606050854, "grad_norm": 4.227716445922852, "learning_rate": 0.0002056795873377331, "loss": 2.3203, "step": 152 }, { "epoch": 0.39137431606050854, "eval_loss": 2.282947063446045, "eval_runtime": 44.1342, "eval_samples_per_second": 5.551, "eval_steps_per_second": 0.702, "step": 152 }, { "epoch": 0.401673640167364, "grad_norm": 4.451204776763916, "learning_rate": 0.00020104397616624645, "loss": 2.2989, "step": 156 }, { "epoch": 0.4119729642742195, "grad_norm": 5.805531978607178, "learning_rate": 0.0001963525491562421, "loss": 2.2995, "step": 160 }, { "epoch": 0.4119729642742195, "eval_loss": 2.2782602310180664, "eval_runtime": 44.9558, "eval_samples_per_second": 5.45, "eval_steps_per_second": 0.69, "step": 160 }, { "epoch": 0.422272288381075, "grad_norm": 4.269804000854492, "learning_rate": 0.00019161043631427666, "loss": 2.2861, "step": 164 }, { "epoch": 0.43257161248793047, "grad_norm": 4.4612135887146, "learning_rate": 0.00018682282307111987, "loss": 2.356, "step": 168 }, { "epoch": 0.43257161248793047, "eval_loss": 2.275855779647827, "eval_runtime": 10.4715, "eval_samples_per_second": 23.397, "eval_steps_per_second": 2.96, "step": 168 }, { "epoch": 0.442870936594786, "grad_norm": 5.098966598510742, "learning_rate": 0.00018199494461156203, "loss": 2.3096, "step": 172 }, { "epoch": 0.45317026070164146, "grad_norm": 5.2250871658325195, "learning_rate": 0.00017713208014981648, "loss": 2.2942, "step": 176 }, { "epoch": 0.45317026070164146, "eval_loss": 2.2719717025756836, "eval_runtime": 10.4779, "eval_samples_per_second": 23.383, "eval_steps_per_second": 2.959, "step": 176 }, { "epoch": 0.4634695848084969, "grad_norm": 4.393130302429199, "learning_rate": 0.00017223954715677627, "loss": 2.3204, "step": 180 }, { "epoch": 0.47376890891535245, "grad_norm": 3.999812602996826, "learning_rate": 0.00016732269554543794, "loss": 2.2987, "step": 184 }, { "epoch": 0.47376890891535245, "eval_loss": 2.2650279998779297, "eval_runtime": 10.4729, "eval_samples_per_second": 23.394, "eval_steps_per_second": 2.96, "step": 184 }, { "epoch": 0.4840682330222079, "grad_norm": 4.32090425491333, "learning_rate": 0.00016238690182084986, "loss": 2.2684, "step": 188 }, { "epoch": 0.4943675571290634, "grad_norm": 4.725154876708984, "learning_rate": 0.00015743756320098332, "loss": 2.3025, "step": 192 }, { "epoch": 0.4943675571290634, "eval_loss": 2.2645103931427, "eval_runtime": 10.4773, "eval_samples_per_second": 23.384, "eval_steps_per_second": 2.959, "step": 192 }, { "epoch": 0.5046668812359189, "grad_norm": 4.0945892333984375, "learning_rate": 0.00015248009171495378, "loss": 2.2888, "step": 196 }, { "epoch": 0.5149662053427744, "grad_norm": 4.477442264556885, "learning_rate": 0.00014751990828504622, "loss": 2.294, "step": 200 }, { "epoch": 0.5149662053427744, "eval_loss": 2.2624104022979736, "eval_runtime": 10.4821, "eval_samples_per_second": 23.373, "eval_steps_per_second": 2.957, "step": 200 }, { "epoch": 0.5252655294496299, "grad_norm": 3.748276710510254, "learning_rate": 0.00014256243679901663, "loss": 2.2632, "step": 204 }, { "epoch": 0.5355648535564853, "grad_norm": 4.243557453155518, "learning_rate": 0.00013761309817915014, "loss": 2.2959, "step": 208 }, { "epoch": 0.5355648535564853, "eval_loss": 2.2678449153900146, "eval_runtime": 10.4697, "eval_samples_per_second": 23.401, "eval_steps_per_second": 2.961, "step": 208 }, { "epoch": 0.5458641776633408, "grad_norm": 3.7038612365722656, "learning_rate": 0.00013267730445456208, "loss": 2.2426, "step": 212 }, { "epoch": 0.5561635017701964, "grad_norm": 5.393055438995361, "learning_rate": 0.00012776045284322368, "loss": 2.3074, "step": 216 }, { "epoch": 0.5561635017701964, "eval_loss": 2.2524540424346924, "eval_runtime": 10.4594, "eval_samples_per_second": 23.424, "eval_steps_per_second": 2.964, "step": 216 }, { "epoch": 0.5664628258770518, "grad_norm": 5.647940635681152, "learning_rate": 0.00012286791985018355, "loss": 2.2712, "step": 220 }, { "epoch": 0.5767621499839073, "grad_norm": 3.919537305831909, "learning_rate": 0.00011800505538843798, "loss": 2.2862, "step": 224 }, { "epoch": 0.5767621499839073, "eval_loss": 2.2530322074890137, "eval_runtime": 10.464, "eval_samples_per_second": 23.414, "eval_steps_per_second": 2.963, "step": 224 }, { "epoch": 0.5870614740907628, "grad_norm": 4.026923656463623, "learning_rate": 0.00011317717692888012, "loss": 2.2933, "step": 228 }, { "epoch": 0.5973607981976183, "grad_norm": 4.218359470367432, "learning_rate": 0.00010838956368572334, "loss": 2.2745, "step": 232 }, { "epoch": 0.5973607981976183, "eval_loss": 2.2494354248046875, "eval_runtime": 10.4537, "eval_samples_per_second": 23.437, "eval_steps_per_second": 2.965, "step": 232 }, { "epoch": 0.6076601223044737, "grad_norm": 3.6584322452545166, "learning_rate": 0.0001036474508437579, "loss": 2.3179, "step": 236 }, { "epoch": 0.6179594464113293, "grad_norm": 3.86560320854187, "learning_rate": 9.895602383375353e-05, "loss": 2.2422, "step": 240 }, { "epoch": 0.6179594464113293, "eval_loss": 2.239821434020996, "eval_runtime": 10.4187, "eval_samples_per_second": 23.515, "eval_steps_per_second": 2.975, "step": 240 }, { "epoch": 0.6282587705181848, "grad_norm": 6.043626308441162, "learning_rate": 9.432041266226686e-05, "loss": 2.2811, "step": 244 }, { "epoch": 0.6385580946250402, "grad_norm": 4.4510393142700195, "learning_rate": 8.97456863020546e-05, "loss": 2.275, "step": 248 }, { "epoch": 0.6385580946250402, "eval_loss": 2.2399165630340576, "eval_runtime": 10.4081, "eval_samples_per_second": 23.539, "eval_steps_per_second": 2.978, "step": 248 }, { "epoch": 0.6488574187318957, "grad_norm": 3.6573245525360107, "learning_rate": 8.523684714922608e-05, "loss": 2.2348, "step": 252 }, { "epoch": 0.6591567428387513, "grad_norm": 4.474267482757568, "learning_rate": 8.079882555319684e-05, "loss": 2.2632, "step": 256 }, { "epoch": 0.6591567428387513, "eval_loss": 2.2398147583007812, "eval_runtime": 10.3617, "eval_samples_per_second": 23.645, "eval_steps_per_second": 2.992, "step": 256 }, { "epoch": 0.6694560669456067, "grad_norm": 4.0849995613098145, "learning_rate": 7.643647442542382e-05, "loss": 2.2624, "step": 260 }, { "epoch": 0.6797553910524622, "grad_norm": 3.5736443996429443, "learning_rate": 7.215456393281776e-05, "loss": 2.2198, "step": 264 }, { "epoch": 0.6797553910524622, "eval_loss": 2.2288267612457275, "eval_runtime": 10.3503, "eval_samples_per_second": 23.671, "eval_steps_per_second": 2.995, "step": 264 }, { "epoch": 0.6900547151593177, "grad_norm": 3.700812816619873, "learning_rate": 6.795777628163599e-05, "loss": 2.2498, "step": 268 }, { "epoch": 0.7003540392661731, "grad_norm": 3.252271890640259, "learning_rate": 6.385070059755846e-05, "loss": 2.2732, "step": 272 }, { "epoch": 0.7003540392661731, "eval_loss": 2.2233266830444336, "eval_runtime": 44.1123, "eval_samples_per_second": 5.554, "eval_steps_per_second": 0.703, "step": 272 }, { "epoch": 0.7106533633730286, "grad_norm": 3.207204818725586, "learning_rate": 5.983782790754623e-05, "loss": 2.2661, "step": 276 }, { "epoch": 0.7209526874798842, "grad_norm": 3.933907985687256, "learning_rate": 5.592354622896944e-05, "loss": 2.2576, "step": 280 }, { "epoch": 0.7209526874798842, "eval_loss": 2.2177813053131104, "eval_runtime": 45.0771, "eval_samples_per_second": 5.435, "eval_steps_per_second": 0.688, "step": 280 }, { "epoch": 0.7312520115867396, "grad_norm": 3.623133659362793, "learning_rate": 5.211213577137469e-05, "loss": 2.253, "step": 284 }, { "epoch": 0.7415513356935951, "grad_norm": 3.6505510807037354, "learning_rate": 4.840776425613886e-05, "loss": 2.2606, "step": 288 }, { "epoch": 0.7415513356935951, "eval_loss": 2.2098140716552734, "eval_runtime": 44.4469, "eval_samples_per_second": 5.512, "eval_steps_per_second": 0.697, "step": 288 }, { "epoch": 0.7518506598004506, "grad_norm": 3.72786021232605, "learning_rate": 4.481448235912671e-05, "loss": 2.2492, "step": 292 }, { "epoch": 0.762149983907306, "grad_norm": 3.0990045070648193, "learning_rate": 4.133621928133665e-05, "loss": 2.2559, "step": 296 }, { "epoch": 0.762149983907306, "eval_loss": 2.2151412963867188, "eval_runtime": 44.8388, "eval_samples_per_second": 5.464, "eval_steps_per_second": 0.691, "step": 296 }, { "epoch": 0.7724493080141616, "grad_norm": 3.5728273391723633, "learning_rate": 3.797677845237696e-05, "loss": 2.2018, "step": 300 }, { "epoch": 0.7827486321210171, "grad_norm": 3.1590304374694824, "learning_rate": 3.473983337147118e-05, "loss": 2.2852, "step": 304 }, { "epoch": 0.7827486321210171, "eval_loss": 2.2048428058624268, "eval_runtime": 45.1815, "eval_samples_per_second": 5.423, "eval_steps_per_second": 0.686, "step": 304 }, { "epoch": 0.7930479562278725, "grad_norm": 3.989384412765503, "learning_rate": 3.162892359054098e-05, "loss": 2.2077, "step": 308 }, { "epoch": 0.803347280334728, "grad_norm": 4.393310070037842, "learning_rate": 2.8647450843757897e-05, "loss": 2.2252, "step": 312 }, { "epoch": 0.803347280334728, "eval_loss": 2.2025868892669678, "eval_runtime": 45.7427, "eval_samples_per_second": 5.356, "eval_steps_per_second": 0.678, "step": 312 }, { "epoch": 0.8136466044415835, "grad_norm": 3.008932590484619, "learning_rate": 2.5798675327796993e-05, "loss": 2.1846, "step": 316 }, { "epoch": 0.823945928548439, "grad_norm": 3.083261251449585, "learning_rate": 2.3085712136859668e-05, "loss": 2.2024, "step": 320 }, { "epoch": 0.823945928548439, "eval_loss": 2.2029316425323486, "eval_runtime": 44.6941, "eval_samples_per_second": 5.482, "eval_steps_per_second": 0.694, "step": 320 }, { "epoch": 0.8342452526552945, "grad_norm": 3.446568250656128, "learning_rate": 2.0511527856363912e-05, "loss": 2.2635, "step": 324 }, { "epoch": 0.84454457676215, "grad_norm": 3.727339029312134, "learning_rate": 1.8078937319026654e-05, "loss": 2.2339, "step": 328 }, { "epoch": 0.84454457676215, "eval_loss": 2.196918249130249, "eval_runtime": 10.48, "eval_samples_per_second": 23.378, "eval_steps_per_second": 2.958, "step": 328 }, { "epoch": 0.8548439008690055, "grad_norm": 3.6961987018585205, "learning_rate": 1.579060052688548e-05, "loss": 2.21, "step": 332 }, { "epoch": 0.8651432249758609, "grad_norm": 2.9594602584838867, "learning_rate": 1.3649019742625623e-05, "loss": 2.2468, "step": 336 }, { "epoch": 0.8651432249758609, "eval_loss": 2.197859048843384, "eval_runtime": 10.4828, "eval_samples_per_second": 23.372, "eval_steps_per_second": 2.957, "step": 336 }, { "epoch": 0.8754425490827165, "grad_norm": 3.7084383964538574, "learning_rate": 1.1656536753392287e-05, "loss": 2.2378, "step": 340 }, { "epoch": 0.885741873189572, "grad_norm": 4.392552852630615, "learning_rate": 9.815330310080887e-06, "loss": 2.2582, "step": 344 }, { "epoch": 0.885741873189572, "eval_loss": 2.1931886672973633, "eval_runtime": 10.4854, "eval_samples_per_second": 23.366, "eval_steps_per_second": 2.956, "step": 344 }, { "epoch": 0.8960411972964274, "grad_norm": 3.1651930809020996, "learning_rate": 8.127413744904804e-06, "loss": 2.2329, "step": 348 }, { "epoch": 0.9063405214032829, "grad_norm": 3.342674493789673, "learning_rate": 6.594632769846353e-06, "loss": 2.223, "step": 352 }, { "epoch": 0.9063405214032829, "eval_loss": 2.1925430297851562, "eval_runtime": 10.4791, "eval_samples_per_second": 23.38, "eval_steps_per_second": 2.958, "step": 352 }, { "epoch": 0.9166398455101384, "grad_norm": 3.3589231967926025, "learning_rate": 5.218663458397715e-06, "loss": 2.2147, "step": 356 }, { "epoch": 0.9269391696169939, "grad_norm": 2.758237361907959, "learning_rate": 4.001010412799138e-06, "loss": 2.1887, "step": 360 }, { "epoch": 0.9269391696169939, "eval_loss": 2.1936519145965576, "eval_runtime": 10.4939, "eval_samples_per_second": 23.347, "eval_steps_per_second": 2.954, "step": 360 }, { "epoch": 0.9372384937238494, "grad_norm": 3.3825769424438477, "learning_rate": 2.9430051187785962e-06, "loss": 2.2439, "step": 364 }, { "epoch": 0.9475378178307049, "grad_norm": 3.4175548553466797, "learning_rate": 2.0458044895916513e-06, "loss": 2.218, "step": 368 }, { "epoch": 0.9475378178307049, "eval_loss": 2.1924386024475098, "eval_runtime": 10.4759, "eval_samples_per_second": 23.387, "eval_steps_per_second": 2.959, "step": 368 }, { "epoch": 0.9578371419375603, "grad_norm": 3.223475217819214, "learning_rate": 1.3103896009537207e-06, "loss": 2.195, "step": 372 }, { "epoch": 0.9681364660444158, "grad_norm": 2.9230334758758545, "learning_rate": 7.375646182482875e-07, "loss": 2.258, "step": 376 }, { "epoch": 0.9681364660444158, "eval_loss": 2.1916966438293457, "eval_runtime": 10.4626, "eval_samples_per_second": 23.417, "eval_steps_per_second": 2.963, "step": 376 }, { "epoch": 0.9784357901512714, "grad_norm": 4.080029487609863, "learning_rate": 3.2795591718381975e-07, "loss": 2.1991, "step": 380 }, { "epoch": 0.9887351142581268, "grad_norm": 2.951921224594116, "learning_rate": 8.201139886109264e-08, "loss": 2.2479, "step": 384 }, { "epoch": 0.9887351142581268, "eval_loss": 2.191589117050171, "eval_runtime": 10.4621, "eval_samples_per_second": 23.418, "eval_steps_per_second": 2.963, "step": 384 }, { "epoch": 0.9990344383649823, "grad_norm": 3.3245644569396973, "learning_rate": 0.0, "loss": 2.2378, "step": 388 } ], "logging_steps": 4, "max_steps": 388, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 8, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.087198145554678e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }