{ "best_metric": 0.777683675289154, "best_model_checkpoint": "saves/starcoder2-7b/lora/sft/checkpoint-1500", "epoch": 0.23942537909018355, "eval_steps": 100, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.628385603427887, "learning_rate": 4.999999126897802e-05, "loss": 1.2582, "step": 5 }, { "epoch": 0.0, "grad_norm": 1.0855119228363037, "learning_rate": 4.999996507591817e-05, "loss": 0.801, "step": 10 }, { "epoch": 0.0, "grad_norm": 1.5689586400985718, "learning_rate": 4.9999921420838745e-05, "loss": 1.067, "step": 15 }, { "epoch": 0.0, "grad_norm": 2.0851330757141113, "learning_rate": 4.999986030377024e-05, "loss": 1.2953, "step": 20 }, { "epoch": 0.0, "grad_norm": 1.397479772567749, "learning_rate": 4.999978172475535e-05, "loss": 0.9826, "step": 25 }, { "epoch": 0.0, "grad_norm": 1.344118595123291, "learning_rate": 4.9999685683848954e-05, "loss": 0.9485, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.158163070678711, "learning_rate": 4.9999596278606616e-05, "loss": 0.8103, "step": 35 }, { "epoch": 0.01, "grad_norm": 1.602233648300171, "learning_rate": 4.999946880647276e-05, "loss": 0.8648, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.557242751121521, "learning_rate": 4.999932387266596e-05, "loss": 1.0198, "step": 45 }, { "epoch": 0.01, "grad_norm": 1.36068856716156, "learning_rate": 4.999916147728746e-05, "loss": 0.9367, "step": 50 }, { "epoch": 0.01, "grad_norm": 1.3263639211654663, "learning_rate": 4.999898162045068e-05, "loss": 0.9695, "step": 55 }, { "epoch": 0.01, "grad_norm": 1.333601474761963, "learning_rate": 4.999878430228126e-05, "loss": 1.1509, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.4753800630569458, "learning_rate": 4.999856952291702e-05, "loss": 1.1461, "step": 65 }, { "epoch": 0.01, "grad_norm": 1.5096240043640137, "learning_rate": 4.9998337282507965e-05, "loss": 1.1722, "step": 70 }, { "epoch": 0.01, "grad_norm": 1.189892053604126, "learning_rate": 4.999808758121633e-05, "loss": 1.1834, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.9292634725570679, "learning_rate": 4.999782041921651e-05, "loss": 0.9498, "step": 80 }, { "epoch": 0.01, "grad_norm": 2.1775777339935303, "learning_rate": 4.9997535796695134e-05, "loss": 0.9346, "step": 85 }, { "epoch": 0.01, "grad_norm": 1.6854296922683716, "learning_rate": 4.999723371385099e-05, "loss": 1.119, "step": 90 }, { "epoch": 0.02, "grad_norm": 1.4571490287780762, "learning_rate": 4.999691417089507e-05, "loss": 0.8671, "step": 95 }, { "epoch": 0.02, "grad_norm": 1.277044653892517, "learning_rate": 4.999657716805059e-05, "loss": 1.2469, "step": 100 }, { "epoch": 0.02, "eval_loss": 0.8478816747665405, "eval_runtime": 96.2736, "eval_samples_per_second": 7.24, "eval_steps_per_second": 7.24, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.6687743067741394, "learning_rate": 4.9996222705552933e-05, "loss": 0.735, "step": 105 }, { "epoch": 0.02, "grad_norm": 1.3488354682922363, "learning_rate": 4.9995850783649665e-05, "loss": 0.8344, "step": 110 }, { "epoch": 0.02, "grad_norm": 1.1043323278427124, "learning_rate": 4.9995461402600593e-05, "loss": 0.8254, "step": 115 }, { "epoch": 0.02, "grad_norm": 0.9382895827293396, "learning_rate": 4.9995054562677684e-05, "loss": 0.9179, "step": 120 }, { "epoch": 0.02, "grad_norm": 1.2824612855911255, "learning_rate": 4.9994630264165107e-05, "loss": 0.8663, "step": 125 }, { "epoch": 0.02, "grad_norm": 1.0491925477981567, "learning_rate": 4.999418850735923e-05, "loss": 0.9247, "step": 130 }, { "epoch": 0.02, "grad_norm": 1.3642233610153198, "learning_rate": 4.99937292925686e-05, "loss": 0.8253, "step": 135 }, { "epoch": 0.02, "grad_norm": 3.747757911682129, "learning_rate": 4.9993252620113976e-05, "loss": 1.0245, "step": 140 }, { "epoch": 0.02, "grad_norm": 1.299494981765747, "learning_rate": 4.999275849032832e-05, "loss": 0.8723, "step": 145 }, { "epoch": 0.02, "grad_norm": 1.7195830345153809, "learning_rate": 4.999224690355675e-05, "loss": 1.0524, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.9922987222671509, "learning_rate": 4.9991717860156616e-05, "loss": 0.9502, "step": 155 }, { "epoch": 0.03, "grad_norm": 1.0577458143234253, "learning_rate": 4.9991171360497437e-05, "loss": 1.0115, "step": 160 }, { "epoch": 0.03, "grad_norm": 1.0001195669174194, "learning_rate": 4.999060740496093e-05, "loss": 1.1999, "step": 165 }, { "epoch": 0.03, "grad_norm": 1.2456804513931274, "learning_rate": 4.999002599394102e-05, "loss": 0.8882, "step": 170 }, { "epoch": 0.03, "grad_norm": 1.0445325374603271, "learning_rate": 4.9989427127843814e-05, "loss": 1.0615, "step": 175 }, { "epoch": 0.03, "grad_norm": 1.2410887479782104, "learning_rate": 4.9988810807087584e-05, "loss": 1.1068, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.8935971260070801, "learning_rate": 4.998817703210285e-05, "loss": 0.6683, "step": 185 }, { "epoch": 0.03, "grad_norm": 1.1614488363265991, "learning_rate": 4.9987525803332265e-05, "loss": 0.7446, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.9392004013061523, "learning_rate": 4.998685712123072e-05, "loss": 0.7397, "step": 195 }, { "epoch": 0.03, "grad_norm": 1.0314444303512573, "learning_rate": 4.9986170986265266e-05, "loss": 1.3584, "step": 200 }, { "epoch": 0.03, "eval_loss": 0.8368077278137207, "eval_runtime": 96.5262, "eval_samples_per_second": 7.221, "eval_steps_per_second": 7.221, "step": 200 }, { "epoch": 0.03, "grad_norm": 0.8964811563491821, "learning_rate": 4.998546739891516e-05, "loss": 0.9546, "step": 205 }, { "epoch": 0.03, "grad_norm": 1.0679796934127808, "learning_rate": 4.998474635967185e-05, "loss": 0.864, "step": 210 }, { "epoch": 0.03, "grad_norm": 1.2340985536575317, "learning_rate": 4.998400786903896e-05, "loss": 0.885, "step": 215 }, { "epoch": 0.04, "grad_norm": 1.7219617366790771, "learning_rate": 4.9983251927532315e-05, "loss": 1.1069, "step": 220 }, { "epoch": 0.04, "grad_norm": 1.1480705738067627, "learning_rate": 4.9982478535679924e-05, "loss": 1.0416, "step": 225 }, { "epoch": 0.04, "grad_norm": 1.515589714050293, "learning_rate": 4.9981687694021996e-05, "loss": 1.1844, "step": 230 }, { "epoch": 0.04, "grad_norm": 1.6687963008880615, "learning_rate": 4.998087940311091e-05, "loss": 0.8664, "step": 235 }, { "epoch": 0.04, "grad_norm": 1.9256645441055298, "learning_rate": 4.998005366351125e-05, "loss": 1.0125, "step": 240 }, { "epoch": 0.04, "grad_norm": 1.2500052452087402, "learning_rate": 4.997921047579978e-05, "loss": 1.1374, "step": 245 }, { "epoch": 0.04, "grad_norm": 1.0543216466903687, "learning_rate": 4.9978349840565434e-05, "loss": 0.8502, "step": 250 }, { "epoch": 0.04, "grad_norm": 1.3009012937545776, "learning_rate": 4.997747175840937e-05, "loss": 1.0357, "step": 255 }, { "epoch": 0.04, "grad_norm": 0.8456661105155945, "learning_rate": 4.997657622994491e-05, "loss": 0.6883, "step": 260 }, { "epoch": 0.04, "grad_norm": 0.5856515765190125, "learning_rate": 4.9975663255797555e-05, "loss": 0.7656, "step": 265 }, { "epoch": 0.04, "grad_norm": 0.973818302154541, "learning_rate": 4.997473283660501e-05, "loss": 0.823, "step": 270 }, { "epoch": 0.04, "grad_norm": 0.9960187673568726, "learning_rate": 4.997378497301715e-05, "loss": 0.8726, "step": 275 }, { "epoch": 0.04, "grad_norm": 1.2900679111480713, "learning_rate": 4.997281966569604e-05, "loss": 0.9781, "step": 280 }, { "epoch": 0.05, "grad_norm": 1.828894853591919, "learning_rate": 4.9971836915315926e-05, "loss": 0.8932, "step": 285 }, { "epoch": 0.05, "grad_norm": 1.239621877670288, "learning_rate": 4.9970836722563256e-05, "loss": 1.2022, "step": 290 }, { "epoch": 0.05, "grad_norm": 1.0117149353027344, "learning_rate": 4.996981908813664e-05, "loss": 0.8032, "step": 295 }, { "epoch": 0.05, "grad_norm": 0.8861119747161865, "learning_rate": 4.996878401274687e-05, "loss": 1.0651, "step": 300 }, { "epoch": 0.05, "eval_loss": 0.8281473517417908, "eval_runtime": 96.5283, "eval_samples_per_second": 7.221, "eval_steps_per_second": 7.221, "step": 300 }, { "epoch": 0.05, "grad_norm": 0.8583046197891235, "learning_rate": 4.996773149711693e-05, "loss": 0.8784, "step": 305 }, { "epoch": 0.05, "grad_norm": 2.5717499256134033, "learning_rate": 4.9966661541981984e-05, "loss": 0.8395, "step": 310 }, { "epoch": 0.05, "grad_norm": 0.982342004776001, "learning_rate": 4.9965574148089376e-05, "loss": 0.9869, "step": 315 }, { "epoch": 0.05, "grad_norm": 0.9000777006149292, "learning_rate": 4.9964469316198633e-05, "loss": 0.8435, "step": 320 }, { "epoch": 0.05, "grad_norm": 0.8733209371566772, "learning_rate": 4.9963347047081464e-05, "loss": 0.7281, "step": 325 }, { "epoch": 0.05, "grad_norm": 3.323739767074585, "learning_rate": 4.9962207341521746e-05, "loss": 1.1013, "step": 330 }, { "epoch": 0.05, "grad_norm": 1.7102876901626587, "learning_rate": 4.996105020031554e-05, "loss": 0.8276, "step": 335 }, { "epoch": 0.05, "grad_norm": 0.9196123480796814, "learning_rate": 4.995987562427109e-05, "loss": 0.8274, "step": 340 }, { "epoch": 0.06, "grad_norm": 1.210099458694458, "learning_rate": 4.995868361420883e-05, "loss": 1.3257, "step": 345 }, { "epoch": 0.06, "grad_norm": 0.8923581838607788, "learning_rate": 4.9957474170961335e-05, "loss": 0.6815, "step": 350 }, { "epoch": 0.06, "grad_norm": 0.9576735496520996, "learning_rate": 4.9956247295373396e-05, "loss": 1.23, "step": 355 }, { "epoch": 0.06, "grad_norm": 1.3774089813232422, "learning_rate": 4.995500298830196e-05, "loss": 1.0556, "step": 360 }, { "epoch": 0.06, "grad_norm": 1.1523677110671997, "learning_rate": 4.995374125061614e-05, "loss": 1.1787, "step": 365 }, { "epoch": 0.06, "grad_norm": 0.8310608863830566, "learning_rate": 4.9952462083197246e-05, "loss": 0.8525, "step": 370 }, { "epoch": 0.06, "grad_norm": 0.9814196825027466, "learning_rate": 4.9951165486938765e-05, "loss": 0.8522, "step": 375 }, { "epoch": 0.06, "grad_norm": 0.9878122210502625, "learning_rate": 4.994985146274633e-05, "loss": 0.6618, "step": 380 }, { "epoch": 0.06, "grad_norm": 1.2652586698532104, "learning_rate": 4.994852001153777e-05, "loss": 1.0489, "step": 385 }, { "epoch": 0.06, "grad_norm": 1.2940975427627563, "learning_rate": 4.994717113424307e-05, "loss": 1.104, "step": 390 }, { "epoch": 0.06, "grad_norm": 0.9636249542236328, "learning_rate": 4.99458048318044e-05, "loss": 0.9228, "step": 395 }, { "epoch": 0.06, "grad_norm": 0.8122813105583191, "learning_rate": 4.994442110517611e-05, "loss": 0.9209, "step": 400 }, { "epoch": 0.06, "eval_loss": 0.8184689879417419, "eval_runtime": 96.4572, "eval_samples_per_second": 7.226, "eval_steps_per_second": 7.226, "step": 400 }, { "epoch": 0.06, "grad_norm": 0.8742052912712097, "learning_rate": 4.99430199553247e-05, "loss": 0.9608, "step": 405 }, { "epoch": 0.07, "grad_norm": 0.5679522752761841, "learning_rate": 4.9941601383228835e-05, "loss": 0.5963, "step": 410 }, { "epoch": 0.07, "grad_norm": 1.0234627723693848, "learning_rate": 4.994016538987938e-05, "loss": 0.8642, "step": 415 }, { "epoch": 0.07, "grad_norm": 0.8581897616386414, "learning_rate": 4.993871197627934e-05, "loss": 0.8993, "step": 420 }, { "epoch": 0.07, "grad_norm": 1.4666485786437988, "learning_rate": 4.9937241143443904e-05, "loss": 0.8565, "step": 425 }, { "epoch": 0.07, "grad_norm": 1.1166578531265259, "learning_rate": 4.993575289240041e-05, "loss": 0.881, "step": 430 }, { "epoch": 0.07, "grad_norm": 1.303992748260498, "learning_rate": 4.9934247224188393e-05, "loss": 0.9962, "step": 435 }, { "epoch": 0.07, "grad_norm": 0.9011989235877991, "learning_rate": 4.993272413985952e-05, "loss": 0.9316, "step": 440 }, { "epoch": 0.07, "grad_norm": 0.8321458101272583, "learning_rate": 4.993118364047764e-05, "loss": 0.7889, "step": 445 }, { "epoch": 0.07, "grad_norm": 0.7780352234840393, "learning_rate": 4.992962572711877e-05, "loss": 0.8287, "step": 450 }, { "epoch": 0.07, "grad_norm": 0.9090210199356079, "learning_rate": 4.992805040087108e-05, "loss": 0.7018, "step": 455 }, { "epoch": 0.07, "grad_norm": 0.8694137334823608, "learning_rate": 4.9926457662834906e-05, "loss": 0.8484, "step": 460 }, { "epoch": 0.07, "grad_norm": 0.6327371001243591, "learning_rate": 4.992484751412274e-05, "loss": 0.716, "step": 465 }, { "epoch": 0.08, "grad_norm": 1.200668215751648, "learning_rate": 4.9923219955859254e-05, "loss": 0.9525, "step": 470 }, { "epoch": 0.08, "grad_norm": 0.8530198931694031, "learning_rate": 4.9921574989181266e-05, "loss": 0.744, "step": 475 }, { "epoch": 0.08, "grad_norm": 1.168479323387146, "learning_rate": 4.991991261523775e-05, "loss": 0.729, "step": 480 }, { "epoch": 0.08, "grad_norm": 0.9499714970588684, "learning_rate": 4.9918232835189834e-05, "loss": 0.7725, "step": 485 }, { "epoch": 0.08, "grad_norm": 0.8434467911720276, "learning_rate": 4.991653565021084e-05, "loss": 1.1558, "step": 490 }, { "epoch": 0.08, "grad_norm": 0.7665804624557495, "learning_rate": 4.99148210614862e-05, "loss": 1.0208, "step": 495 }, { "epoch": 0.08, "grad_norm": 0.5782546401023865, "learning_rate": 4.991308907021353e-05, "loss": 0.8306, "step": 500 }, { "epoch": 0.08, "eval_loss": 0.8132078051567078, "eval_runtime": 96.433, "eval_samples_per_second": 7.228, "eval_steps_per_second": 7.228, "step": 500 }, { "epoch": 0.08, "grad_norm": 1.0821778774261475, "learning_rate": 4.9911339677602584e-05, "loss": 0.9503, "step": 505 }, { "epoch": 0.08, "grad_norm": 0.5409029126167297, "learning_rate": 4.99095728848753e-05, "loss": 0.8586, "step": 510 }, { "epoch": 0.08, "grad_norm": 0.9011789560317993, "learning_rate": 4.990778869326575e-05, "loss": 0.7981, "step": 515 }, { "epoch": 0.08, "grad_norm": 1.0092263221740723, "learning_rate": 4.990598710402013e-05, "loss": 1.0174, "step": 520 }, { "epoch": 0.08, "grad_norm": 1.4362307786941528, "learning_rate": 4.9904168118396844e-05, "loss": 0.8373, "step": 525 }, { "epoch": 0.08, "grad_norm": 2.1772639751434326, "learning_rate": 4.9902331737666414e-05, "loss": 0.9599, "step": 530 }, { "epoch": 0.09, "grad_norm": 0.9610542058944702, "learning_rate": 4.990047796311151e-05, "loss": 0.6895, "step": 535 }, { "epoch": 0.09, "grad_norm": 0.9922348260879517, "learning_rate": 4.989860679602698e-05, "loss": 0.7315, "step": 540 }, { "epoch": 0.09, "grad_norm": 1.2409151792526245, "learning_rate": 4.9896718237719785e-05, "loss": 0.8574, "step": 545 }, { "epoch": 0.09, "grad_norm": 1.016333818435669, "learning_rate": 4.9894812289509046e-05, "loss": 1.1248, "step": 550 }, { "epoch": 0.09, "grad_norm": 0.9131489396095276, "learning_rate": 4.989288895272604e-05, "loss": 0.9847, "step": 555 }, { "epoch": 0.09, "grad_norm": 1.215469479560852, "learning_rate": 4.989094822871419e-05, "loss": 0.912, "step": 560 }, { "epoch": 0.09, "grad_norm": 1.0536105632781982, "learning_rate": 4.988899011882903e-05, "loss": 0.8425, "step": 565 }, { "epoch": 0.09, "grad_norm": 1.9705311059951782, "learning_rate": 4.988701462443829e-05, "loss": 0.9385, "step": 570 }, { "epoch": 0.09, "grad_norm": 1.2488442659378052, "learning_rate": 4.98850217469218e-05, "loss": 0.7865, "step": 575 }, { "epoch": 0.09, "grad_norm": 1.7318600416183472, "learning_rate": 4.988301148767157e-05, "loss": 0.8231, "step": 580 }, { "epoch": 0.09, "grad_norm": 0.8247858881950378, "learning_rate": 4.9880983848091704e-05, "loss": 0.8553, "step": 585 }, { "epoch": 0.09, "grad_norm": 0.858172595500946, "learning_rate": 4.987893882959849e-05, "loss": 1.3952, "step": 590 }, { "epoch": 0.09, "grad_norm": 1.2286418676376343, "learning_rate": 4.987687643362033e-05, "loss": 0.837, "step": 595 }, { "epoch": 0.1, "grad_norm": 1.034350872039795, "learning_rate": 4.9874796661597765e-05, "loss": 0.9175, "step": 600 }, { "epoch": 0.1, "eval_loss": 0.8063747882843018, "eval_runtime": 96.4224, "eval_samples_per_second": 7.229, "eval_steps_per_second": 7.229, "step": 600 }, { "epoch": 0.1, "grad_norm": 0.7192366123199463, "learning_rate": 4.987269951498348e-05, "loss": 0.8563, "step": 605 }, { "epoch": 0.1, "grad_norm": 1.2645854949951172, "learning_rate": 4.98705849952423e-05, "loss": 0.6663, "step": 610 }, { "epoch": 0.1, "grad_norm": 1.0610381364822388, "learning_rate": 4.9868453103851176e-05, "loss": 0.8452, "step": 615 }, { "epoch": 0.1, "grad_norm": 0.8550002574920654, "learning_rate": 4.986630384229919e-05, "loss": 0.8894, "step": 620 }, { "epoch": 0.1, "grad_norm": 0.7490519285202026, "learning_rate": 4.986413721208757e-05, "loss": 0.9106, "step": 625 }, { "epoch": 0.1, "grad_norm": 0.557860255241394, "learning_rate": 4.986195321472965e-05, "loss": 0.685, "step": 630 }, { "epoch": 0.1, "grad_norm": 0.7450752258300781, "learning_rate": 4.9859751851750934e-05, "loss": 0.8472, "step": 635 }, { "epoch": 0.1, "grad_norm": 1.176376461982727, "learning_rate": 4.985753312468903e-05, "loss": 1.0197, "step": 640 }, { "epoch": 0.1, "grad_norm": 1.0625300407409668, "learning_rate": 4.985529703509367e-05, "loss": 0.9685, "step": 645 }, { "epoch": 0.1, "grad_norm": 0.8808372616767883, "learning_rate": 4.985304358452672e-05, "loss": 0.8612, "step": 650 }, { "epoch": 0.1, "grad_norm": 0.8110201954841614, "learning_rate": 4.985077277456218e-05, "loss": 0.8401, "step": 655 }, { "epoch": 0.11, "grad_norm": 0.9364888072013855, "learning_rate": 4.984848460678618e-05, "loss": 0.6197, "step": 660 }, { "epoch": 0.11, "grad_norm": 1.0113518238067627, "learning_rate": 4.984617908279694e-05, "loss": 0.9889, "step": 665 }, { "epoch": 0.11, "grad_norm": 1.1148868799209595, "learning_rate": 4.984385620420485e-05, "loss": 0.9558, "step": 670 }, { "epoch": 0.11, "grad_norm": 0.9506175518035889, "learning_rate": 4.984151597263238e-05, "loss": 0.7323, "step": 675 }, { "epoch": 0.11, "grad_norm": 1.0044193267822266, "learning_rate": 4.983915838971415e-05, "loss": 0.7504, "step": 680 }, { "epoch": 0.11, "grad_norm": 2.2674214839935303, "learning_rate": 4.9836783457096875e-05, "loss": 1.032, "step": 685 }, { "epoch": 0.11, "grad_norm": 1.4945333003997803, "learning_rate": 4.983439117643942e-05, "loss": 1.0359, "step": 690 }, { "epoch": 0.11, "grad_norm": 0.9860715866088867, "learning_rate": 4.9831981549412744e-05, "loss": 1.1152, "step": 695 }, { "epoch": 0.11, "grad_norm": 0.8287227153778076, "learning_rate": 4.982955457769992e-05, "loss": 0.8157, "step": 700 }, { "epoch": 0.11, "eval_loss": 0.8022791743278503, "eval_runtime": 96.5324, "eval_samples_per_second": 7.22, "eval_steps_per_second": 7.22, "step": 700 }, { "epoch": 0.11, "grad_norm": 0.9216273427009583, "learning_rate": 4.9827110262996144e-05, "loss": 0.8395, "step": 705 }, { "epoch": 0.11, "grad_norm": 0.7642357349395752, "learning_rate": 4.982464860700874e-05, "loss": 0.8817, "step": 710 }, { "epoch": 0.11, "grad_norm": 0.8851175308227539, "learning_rate": 4.982216961145711e-05, "loss": 0.8558, "step": 715 }, { "epoch": 0.11, "grad_norm": 0.44226109981536865, "learning_rate": 4.98196732780728e-05, "loss": 0.882, "step": 720 }, { "epoch": 0.12, "grad_norm": 0.8005027174949646, "learning_rate": 4.981715960859945e-05, "loss": 0.8835, "step": 725 }, { "epoch": 0.12, "grad_norm": 0.7451304793357849, "learning_rate": 4.981462860479281e-05, "loss": 0.8551, "step": 730 }, { "epoch": 0.12, "grad_norm": 1.1069347858428955, "learning_rate": 4.9812080268420745e-05, "loss": 0.999, "step": 735 }, { "epoch": 0.12, "grad_norm": 0.8892244100570679, "learning_rate": 4.980951460126322e-05, "loss": 1.012, "step": 740 }, { "epoch": 0.12, "grad_norm": 0.8935977816581726, "learning_rate": 4.9806931605112305e-05, "loss": 0.9911, "step": 745 }, { "epoch": 0.12, "grad_norm": 0.8456961512565613, "learning_rate": 4.9804331281772176e-05, "loss": 0.7595, "step": 750 }, { "epoch": 0.12, "grad_norm": 0.78443443775177, "learning_rate": 4.980171363305911e-05, "loss": 0.8308, "step": 755 }, { "epoch": 0.12, "grad_norm": 1.0028038024902344, "learning_rate": 4.979907866080149e-05, "loss": 0.9637, "step": 760 }, { "epoch": 0.12, "grad_norm": 1.1801577806472778, "learning_rate": 4.9796426366839786e-05, "loss": 0.6159, "step": 765 }, { "epoch": 0.12, "grad_norm": 0.8370681405067444, "learning_rate": 4.979375675302659e-05, "loss": 0.9276, "step": 770 }, { "epoch": 0.12, "grad_norm": 0.8605382442474365, "learning_rate": 4.979106982122658e-05, "loss": 1.1077, "step": 775 }, { "epoch": 0.12, "grad_norm": 0.7788259387016296, "learning_rate": 4.978836557331652e-05, "loss": 0.8172, "step": 780 }, { "epoch": 0.13, "grad_norm": 1.4312686920166016, "learning_rate": 4.978564401118528e-05, "loss": 0.8759, "step": 785 }, { "epoch": 0.13, "grad_norm": 0.9109662175178528, "learning_rate": 4.978290513673381e-05, "loss": 0.947, "step": 790 }, { "epoch": 0.13, "grad_norm": 1.1819065809249878, "learning_rate": 4.9780148951875195e-05, "loss": 0.7364, "step": 795 }, { "epoch": 0.13, "grad_norm": 0.9400575160980225, "learning_rate": 4.977737545853455e-05, "loss": 0.9469, "step": 800 }, { "epoch": 0.13, "eval_loss": 0.7995806932449341, "eval_runtime": 96.5877, "eval_samples_per_second": 7.216, "eval_steps_per_second": 7.216, "step": 800 }, { "epoch": 0.13, "grad_norm": 1.693812370300293, "learning_rate": 4.9774584658649126e-05, "loss": 0.9433, "step": 805 }, { "epoch": 0.13, "grad_norm": 1.0892895460128784, "learning_rate": 4.9771776554168234e-05, "loss": 0.7027, "step": 810 }, { "epoch": 0.13, "grad_norm": 0.9118362665176392, "learning_rate": 4.976895114705329e-05, "loss": 0.9468, "step": 815 }, { "epoch": 0.13, "grad_norm": 0.8032681345939636, "learning_rate": 4.976610843927779e-05, "loss": 0.7927, "step": 820 }, { "epoch": 0.13, "grad_norm": 1.168225646018982, "learning_rate": 4.976324843282732e-05, "loss": 0.9673, "step": 825 }, { "epoch": 0.13, "grad_norm": 1.077602744102478, "learning_rate": 4.976037112969953e-05, "loss": 0.9156, "step": 830 }, { "epoch": 0.13, "grad_norm": 0.8643108606338501, "learning_rate": 4.9757476531904165e-05, "loss": 0.6999, "step": 835 }, { "epoch": 0.13, "grad_norm": 0.933397650718689, "learning_rate": 4.975456464146306e-05, "loss": 0.8828, "step": 840 }, { "epoch": 0.13, "grad_norm": 0.7036295533180237, "learning_rate": 4.975163546041011e-05, "loss": 0.8709, "step": 845 }, { "epoch": 0.14, "grad_norm": 0.5974694490432739, "learning_rate": 4.974868899079128e-05, "loss": 0.7594, "step": 850 }, { "epoch": 0.14, "grad_norm": 0.7244943380355835, "learning_rate": 4.974572523466465e-05, "loss": 0.8714, "step": 855 }, { "epoch": 0.14, "grad_norm": 0.5783522725105286, "learning_rate": 4.9742744194100345e-05, "loss": 0.8941, "step": 860 }, { "epoch": 0.14, "grad_norm": 0.7480617761611938, "learning_rate": 4.973974587118055e-05, "loss": 0.9798, "step": 865 }, { "epoch": 0.14, "grad_norm": 0.7548874020576477, "learning_rate": 4.973673026799956e-05, "loss": 0.7767, "step": 870 }, { "epoch": 0.14, "grad_norm": 0.7075071930885315, "learning_rate": 4.97336973866637e-05, "loss": 0.7779, "step": 875 }, { "epoch": 0.14, "grad_norm": 0.7042987942695618, "learning_rate": 4.97306472292914e-05, "loss": 0.8249, "step": 880 }, { "epoch": 0.14, "grad_norm": 1.0242459774017334, "learning_rate": 4.972757979801313e-05, "loss": 0.9223, "step": 885 }, { "epoch": 0.14, "grad_norm": 0.6138095259666443, "learning_rate": 4.9724495094971436e-05, "loss": 0.9842, "step": 890 }, { "epoch": 0.14, "grad_norm": 0.7905042767524719, "learning_rate": 4.9721393122320925e-05, "loss": 0.8738, "step": 895 }, { "epoch": 0.14, "grad_norm": 0.9658048748970032, "learning_rate": 4.9718273882228265e-05, "loss": 0.8872, "step": 900 }, { "epoch": 0.14, "eval_loss": 0.7954564690589905, "eval_runtime": 96.643, "eval_samples_per_second": 7.212, "eval_steps_per_second": 7.212, "step": 900 }, { "epoch": 0.14, "grad_norm": 0.8425014019012451, "learning_rate": 4.97151373768722e-05, "loss": 0.778, "step": 905 }, { "epoch": 0.15, "grad_norm": 0.5527231693267822, "learning_rate": 4.971198360844351e-05, "loss": 0.8332, "step": 910 }, { "epoch": 0.15, "grad_norm": 0.7870334386825562, "learning_rate": 4.9708812579145056e-05, "loss": 0.9265, "step": 915 }, { "epoch": 0.15, "grad_norm": 0.9935321807861328, "learning_rate": 4.970562429119173e-05, "loss": 0.7243, "step": 920 }, { "epoch": 0.15, "grad_norm": 0.9546892046928406, "learning_rate": 4.970241874681051e-05, "loss": 0.9908, "step": 925 }, { "epoch": 0.15, "grad_norm": 0.7340118885040283, "learning_rate": 4.969919594824039e-05, "loss": 0.7932, "step": 930 }, { "epoch": 0.15, "grad_norm": 5.1686015129089355, "learning_rate": 4.9695955897732453e-05, "loss": 0.9842, "step": 935 }, { "epoch": 0.15, "grad_norm": 0.9721456170082092, "learning_rate": 4.9692698597549815e-05, "loss": 0.9271, "step": 940 }, { "epoch": 0.15, "grad_norm": 0.6477334499359131, "learning_rate": 4.9689424049967623e-05, "loss": 0.934, "step": 945 }, { "epoch": 0.15, "grad_norm": 1.0759055614471436, "learning_rate": 4.968613225727311e-05, "loss": 1.0465, "step": 950 }, { "epoch": 0.15, "grad_norm": 0.7222158908843994, "learning_rate": 4.968282322176552e-05, "loss": 0.7732, "step": 955 }, { "epoch": 0.15, "grad_norm": 0.8591343760490417, "learning_rate": 4.9679496945756155e-05, "loss": 0.9062, "step": 960 }, { "epoch": 0.15, "grad_norm": 1.8495111465454102, "learning_rate": 4.967615343156837e-05, "loss": 0.8861, "step": 965 }, { "epoch": 0.15, "grad_norm": 0.6847331523895264, "learning_rate": 4.967279268153753e-05, "loss": 0.8001, "step": 970 }, { "epoch": 0.16, "grad_norm": 0.690113365650177, "learning_rate": 4.9669414698011074e-05, "loss": 0.7378, "step": 975 }, { "epoch": 0.16, "grad_norm": 0.8349626064300537, "learning_rate": 4.9666019483348456e-05, "loss": 0.7193, "step": 980 }, { "epoch": 0.16, "grad_norm": 0.6444108486175537, "learning_rate": 4.966260703992116e-05, "loss": 0.8729, "step": 985 }, { "epoch": 0.16, "grad_norm": 0.9515655040740967, "learning_rate": 4.965917737011274e-05, "loss": 0.7532, "step": 990 }, { "epoch": 0.16, "grad_norm": 0.8138986229896545, "learning_rate": 4.965573047631873e-05, "loss": 1.0124, "step": 995 }, { "epoch": 0.16, "grad_norm": 1.0182080268859863, "learning_rate": 4.9652266360946745e-05, "loss": 0.8842, "step": 1000 }, { "epoch": 0.16, "eval_loss": 0.7912728190422058, "eval_runtime": 96.5004, "eval_samples_per_second": 7.223, "eval_steps_per_second": 7.223, "step": 1000 }, { "epoch": 0.16, "grad_norm": 0.9665297269821167, "learning_rate": 4.96487850264164e-05, "loss": 1.0155, "step": 1005 }, { "epoch": 0.16, "grad_norm": 1.1356585025787354, "learning_rate": 4.964528647515933e-05, "loss": 0.8705, "step": 1010 }, { "epoch": 0.16, "grad_norm": 0.5548833608627319, "learning_rate": 4.9641770709619234e-05, "loss": 0.9634, "step": 1015 }, { "epoch": 0.16, "grad_norm": 0.8028444647789001, "learning_rate": 4.9638237732251794e-05, "loss": 0.8722, "step": 1020 }, { "epoch": 0.16, "grad_norm": 0.934234082698822, "learning_rate": 4.9634687545524724e-05, "loss": 0.9731, "step": 1025 }, { "epoch": 0.16, "grad_norm": 0.7293463349342346, "learning_rate": 4.963112015191778e-05, "loss": 1.0237, "step": 1030 }, { "epoch": 0.17, "grad_norm": 0.6442769169807434, "learning_rate": 4.962753555392271e-05, "loss": 1.1331, "step": 1035 }, { "epoch": 0.17, "grad_norm": 0.7877534031867981, "learning_rate": 4.962393375404331e-05, "loss": 1.0737, "step": 1040 }, { "epoch": 0.17, "grad_norm": 0.5739997625350952, "learning_rate": 4.9620314754795343e-05, "loss": 0.8836, "step": 1045 }, { "epoch": 0.17, "grad_norm": 0.7318402528762817, "learning_rate": 4.9616678558706634e-05, "loss": 0.9981, "step": 1050 }, { "epoch": 0.17, "grad_norm": 0.5463365316390991, "learning_rate": 4.961302516831699e-05, "loss": 0.7336, "step": 1055 }, { "epoch": 0.17, "grad_norm": 0.7839176654815674, "learning_rate": 4.960935458617824e-05, "loss": 1.025, "step": 1060 }, { "epoch": 0.17, "grad_norm": 0.7076404690742493, "learning_rate": 4.9605666814854225e-05, "loss": 0.833, "step": 1065 }, { "epoch": 0.17, "grad_norm": 0.732940673828125, "learning_rate": 4.960196185692077e-05, "loss": 0.5103, "step": 1070 }, { "epoch": 0.17, "grad_norm": 0.7256388068199158, "learning_rate": 4.959823971496574e-05, "loss": 0.8617, "step": 1075 }, { "epoch": 0.17, "grad_norm": 1.1714242696762085, "learning_rate": 4.959450039158898e-05, "loss": 1.0345, "step": 1080 }, { "epoch": 0.17, "grad_norm": 0.5849193930625916, "learning_rate": 4.9590743889402325e-05, "loss": 0.729, "step": 1085 }, { "epoch": 0.17, "grad_norm": 0.6283109784126282, "learning_rate": 4.958697021102963e-05, "loss": 0.8527, "step": 1090 }, { "epoch": 0.17, "grad_norm": 0.6387770175933838, "learning_rate": 4.9583179359106746e-05, "loss": 0.7411, "step": 1095 }, { "epoch": 0.18, "grad_norm": 0.5853758454322815, "learning_rate": 4.957937133628151e-05, "loss": 0.7909, "step": 1100 }, { "epoch": 0.18, "eval_loss": 0.7863278985023499, "eval_runtime": 96.3784, "eval_samples_per_second": 7.232, "eval_steps_per_second": 7.232, "step": 1100 }, { "epoch": 0.18, "grad_norm": 0.9301708936691284, "learning_rate": 4.9575546145213755e-05, "loss": 0.7149, "step": 1105 }, { "epoch": 0.18, "grad_norm": 1.125088095664978, "learning_rate": 4.9571703788575314e-05, "loss": 0.8034, "step": 1110 }, { "epoch": 0.18, "grad_norm": 1.0697988271713257, "learning_rate": 4.956784426905e-05, "loss": 0.8874, "step": 1115 }, { "epoch": 0.18, "grad_norm": 0.7094873189926147, "learning_rate": 4.956396758933361e-05, "loss": 0.6612, "step": 1120 }, { "epoch": 0.18, "grad_norm": 0.8048680424690247, "learning_rate": 4.956007375213393e-05, "loss": 0.9558, "step": 1125 }, { "epoch": 0.18, "grad_norm": 0.8820949196815491, "learning_rate": 4.9556162760170756e-05, "loss": 0.9442, "step": 1130 }, { "epoch": 0.18, "grad_norm": 0.7214958071708679, "learning_rate": 4.955223461617583e-05, "loss": 0.8392, "step": 1135 }, { "epoch": 0.18, "grad_norm": 0.8364250063896179, "learning_rate": 4.954828932289288e-05, "loss": 0.9834, "step": 1140 }, { "epoch": 0.18, "grad_norm": 0.8735854625701904, "learning_rate": 4.954432688307764e-05, "loss": 0.8817, "step": 1145 }, { "epoch": 0.18, "grad_norm": 0.810013473033905, "learning_rate": 4.9540347299497805e-05, "loss": 0.7723, "step": 1150 }, { "epoch": 0.18, "grad_norm": 0.8791002035140991, "learning_rate": 4.953635057493302e-05, "loss": 0.706, "step": 1155 }, { "epoch": 0.19, "grad_norm": 0.7556783556938171, "learning_rate": 4.953233671217493e-05, "loss": 0.8145, "step": 1160 }, { "epoch": 0.19, "grad_norm": 1.3251086473464966, "learning_rate": 4.952830571402716e-05, "loss": 0.8413, "step": 1165 }, { "epoch": 0.19, "grad_norm": 0.8531173467636108, "learning_rate": 4.952425758330527e-05, "loss": 0.8236, "step": 1170 }, { "epoch": 0.19, "grad_norm": 1.0738744735717773, "learning_rate": 4.952019232283681e-05, "loss": 0.8357, "step": 1175 }, { "epoch": 0.19, "grad_norm": 0.7908213138580322, "learning_rate": 4.9516109935461306e-05, "loss": 0.6165, "step": 1180 }, { "epoch": 0.19, "grad_norm": 0.9802565574645996, "learning_rate": 4.951201042403021e-05, "loss": 0.7203, "step": 1185 }, { "epoch": 0.19, "grad_norm": 0.7866708636283875, "learning_rate": 4.9507893791406974e-05, "loss": 0.8479, "step": 1190 }, { "epoch": 0.19, "grad_norm": 0.6721138954162598, "learning_rate": 4.950376004046698e-05, "loss": 0.8871, "step": 1195 }, { "epoch": 0.19, "grad_norm": 1.1981366872787476, "learning_rate": 4.9499609174097574e-05, "loss": 0.8196, "step": 1200 }, { "epoch": 0.19, "eval_loss": 0.7843652367591858, "eval_runtime": 96.5411, "eval_samples_per_second": 7.22, "eval_steps_per_second": 7.22, "step": 1200 }, { "epoch": 0.19, "grad_norm": 0.7013841867446899, "learning_rate": 4.9495441195198064e-05, "loss": 1.0009, "step": 1205 }, { "epoch": 0.19, "grad_norm": 0.8476290702819824, "learning_rate": 4.949125610667972e-05, "loss": 0.5127, "step": 1210 }, { "epoch": 0.19, "grad_norm": 0.7680797576904297, "learning_rate": 4.9487053911465735e-05, "loss": 0.7003, "step": 1215 }, { "epoch": 0.19, "grad_norm": 0.9771925806999207, "learning_rate": 4.948283461249127e-05, "loss": 1.1135, "step": 1220 }, { "epoch": 0.2, "grad_norm": 1.4247405529022217, "learning_rate": 4.947859821270342e-05, "loss": 0.8253, "step": 1225 }, { "epoch": 0.2, "grad_norm": 1.184887409210205, "learning_rate": 4.947434471506125e-05, "loss": 1.1208, "step": 1230 }, { "epoch": 0.2, "grad_norm": 0.7579745054244995, "learning_rate": 4.9470074122535745e-05, "loss": 1.1363, "step": 1235 }, { "epoch": 0.2, "grad_norm": 0.8529625535011292, "learning_rate": 4.9465786438109826e-05, "loss": 0.8699, "step": 1240 }, { "epoch": 0.2, "grad_norm": 1.810576319694519, "learning_rate": 4.9461481664778374e-05, "loss": 1.0166, "step": 1245 }, { "epoch": 0.2, "grad_norm": 0.8605110049247742, "learning_rate": 4.9457159805548187e-05, "loss": 0.9427, "step": 1250 }, { "epoch": 0.2, "grad_norm": 0.59971684217453, "learning_rate": 4.945282086343801e-05, "loss": 0.6536, "step": 1255 }, { "epoch": 0.2, "grad_norm": 1.0233818292617798, "learning_rate": 4.9448464841478506e-05, "loss": 0.9505, "step": 1260 }, { "epoch": 0.2, "grad_norm": 0.8945149779319763, "learning_rate": 4.9444091742712293e-05, "loss": 0.8416, "step": 1265 }, { "epoch": 0.2, "grad_norm": 0.702805757522583, "learning_rate": 4.9439701570193886e-05, "loss": 0.9419, "step": 1270 }, { "epoch": 0.2, "grad_norm": 0.7464181184768677, "learning_rate": 4.9435294326989745e-05, "loss": 0.7972, "step": 1275 }, { "epoch": 0.2, "grad_norm": 1.1765002012252808, "learning_rate": 4.943175624360097e-05, "loss": 0.9914, "step": 1280 }, { "epoch": 0.21, "grad_norm": 0.6549853682518005, "learning_rate": 4.9427318280928034e-05, "loss": 0.8924, "step": 1285 }, { "epoch": 0.21, "grad_norm": 0.5978650450706482, "learning_rate": 4.942286325621888e-05, "loss": 0.6224, "step": 1290 }, { "epoch": 0.21, "grad_norm": 0.7752617597579956, "learning_rate": 4.941839117258523e-05, "loss": 0.8666, "step": 1295 }, { "epoch": 0.21, "grad_norm": 0.6919072866439819, "learning_rate": 4.941390203315078e-05, "loss": 0.9341, "step": 1300 }, { "epoch": 0.21, "eval_loss": 0.7824844717979431, "eval_runtime": 96.8874, "eval_samples_per_second": 7.194, "eval_steps_per_second": 7.194, "step": 1300 }, { "epoch": 0.21, "grad_norm": 0.7222729325294495, "learning_rate": 4.94093958410511e-05, "loss": 0.9925, "step": 1305 }, { "epoch": 0.21, "grad_norm": 0.9575716853141785, "learning_rate": 4.9404872599433686e-05, "loss": 0.8623, "step": 1310 }, { "epoch": 0.21, "grad_norm": 0.7721400260925293, "learning_rate": 4.940033231145793e-05, "loss": 1.0061, "step": 1315 }, { "epoch": 0.21, "grad_norm": 0.7019990682601929, "learning_rate": 4.9395774980295165e-05, "loss": 0.8697, "step": 1320 }, { "epoch": 0.21, "grad_norm": 0.7828916907310486, "learning_rate": 4.939120060912858e-05, "loss": 1.0066, "step": 1325 }, { "epoch": 0.21, "grad_norm": 1.0238871574401855, "learning_rate": 4.93866092011533e-05, "loss": 1.0285, "step": 1330 }, { "epoch": 0.21, "grad_norm": 0.48669734597206116, "learning_rate": 4.938200075957634e-05, "loss": 0.7454, "step": 1335 }, { "epoch": 0.21, "grad_norm": 0.8834619522094727, "learning_rate": 4.93773752876166e-05, "loss": 0.9998, "step": 1340 }, { "epoch": 0.21, "grad_norm": 0.6462609767913818, "learning_rate": 4.9372732788504905e-05, "loss": 0.7278, "step": 1345 }, { "epoch": 0.22, "grad_norm": 0.7309257388114929, "learning_rate": 4.936807326548395e-05, "loss": 0.7301, "step": 1350 }, { "epoch": 0.22, "grad_norm": 0.8515027165412903, "learning_rate": 4.936339672180833e-05, "loss": 0.8307, "step": 1355 }, { "epoch": 0.22, "grad_norm": 0.913206934928894, "learning_rate": 4.935870316074451e-05, "loss": 0.9467, "step": 1360 }, { "epoch": 0.22, "grad_norm": 0.6705841422080994, "learning_rate": 4.935399258557088e-05, "loss": 0.7124, "step": 1365 }, { "epoch": 0.22, "grad_norm": 0.676695704460144, "learning_rate": 4.934926499957767e-05, "loss": 0.9318, "step": 1370 }, { "epoch": 0.22, "grad_norm": 1.0529104471206665, "learning_rate": 4.934452040606703e-05, "loss": 1.0307, "step": 1375 }, { "epoch": 0.22, "grad_norm": 0.7150225639343262, "learning_rate": 4.933975880835296e-05, "loss": 0.8718, "step": 1380 }, { "epoch": 0.22, "grad_norm": 0.7180047035217285, "learning_rate": 4.933498020976135e-05, "loss": 0.7515, "step": 1385 }, { "epoch": 0.22, "grad_norm": 1.0961759090423584, "learning_rate": 4.933018461362997e-05, "loss": 0.8797, "step": 1390 }, { "epoch": 0.22, "grad_norm": 0.830609142780304, "learning_rate": 4.9325372023308446e-05, "loss": 0.6927, "step": 1395 }, { "epoch": 0.22, "grad_norm": 0.5277318358421326, "learning_rate": 4.9320542442158305e-05, "loss": 0.8801, "step": 1400 }, { "epoch": 0.22, "eval_loss": 0.7787255644798279, "eval_runtime": 96.8812, "eval_samples_per_second": 7.194, "eval_steps_per_second": 7.194, "step": 1400 }, { "epoch": 0.22, "grad_norm": 1.3845161199569702, "learning_rate": 4.931569587355289e-05, "loss": 0.8782, "step": 1405 }, { "epoch": 0.23, "grad_norm": 0.8579941987991333, "learning_rate": 4.9310832320877476e-05, "loss": 0.713, "step": 1410 }, { "epoch": 0.23, "grad_norm": 0.2643532454967499, "learning_rate": 4.930595178752914e-05, "loss": 0.9781, "step": 1415 }, { "epoch": 0.23, "grad_norm": 0.4968445897102356, "learning_rate": 4.930105427691685e-05, "loss": 0.93, "step": 1420 }, { "epoch": 0.23, "grad_norm": 0.9254417419433594, "learning_rate": 4.929613979246144e-05, "loss": 0.6353, "step": 1425 }, { "epoch": 0.23, "grad_norm": 0.9814417958259583, "learning_rate": 4.9291208337595574e-05, "loss": 0.9672, "step": 1430 }, { "epoch": 0.23, "grad_norm": 0.7159338593482971, "learning_rate": 4.928625991576379e-05, "loss": 0.9482, "step": 1435 }, { "epoch": 0.23, "grad_norm": 0.623866617679596, "learning_rate": 4.9281294530422476e-05, "loss": 0.623, "step": 1440 }, { "epoch": 0.23, "grad_norm": 0.8750379681587219, "learning_rate": 4.927631218503985e-05, "loss": 0.772, "step": 1445 }, { "epoch": 0.23, "grad_norm": 0.5593128800392151, "learning_rate": 4.9271312883096e-05, "loss": 0.6579, "step": 1450 }, { "epoch": 0.23, "grad_norm": 0.6411569714546204, "learning_rate": 4.9266296628082834e-05, "loss": 0.9239, "step": 1455 }, { "epoch": 0.23, "grad_norm": 0.9317705631256104, "learning_rate": 4.9261263423504135e-05, "loss": 0.9315, "step": 1460 }, { "epoch": 0.23, "grad_norm": 0.8312699198722839, "learning_rate": 4.9256213272875486e-05, "loss": 0.7334, "step": 1465 }, { "epoch": 0.23, "grad_norm": 0.6170663833618164, "learning_rate": 4.925114617972433e-05, "loss": 0.8603, "step": 1470 }, { "epoch": 0.24, "grad_norm": 0.7176920771598816, "learning_rate": 4.924606214758995e-05, "loss": 0.8738, "step": 1475 }, { "epoch": 0.24, "grad_norm": 0.8957033157348633, "learning_rate": 4.924096118002343e-05, "loss": 0.8861, "step": 1480 }, { "epoch": 0.24, "grad_norm": 0.5490685701370239, "learning_rate": 4.923584328058772e-05, "loss": 0.712, "step": 1485 }, { "epoch": 0.24, "grad_norm": 0.7401763796806335, "learning_rate": 4.923070845285757e-05, "loss": 0.8118, "step": 1490 }, { "epoch": 0.24, "grad_norm": 0.7380841374397278, "learning_rate": 4.922555670041957e-05, "loss": 0.8476, "step": 1495 }, { "epoch": 0.24, "grad_norm": 1.0009427070617676, "learning_rate": 4.922038802687212e-05, "loss": 0.9109, "step": 1500 }, { "epoch": 0.24, "eval_loss": 0.777683675289154, "eval_runtime": 96.9147, "eval_samples_per_second": 7.192, "eval_steps_per_second": 7.192, "step": 1500 } ], "logging_steps": 5, "max_steps": 18795, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 6.4732976381952e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }