{ "best_metric": 0.7912728190422058, "best_model_checkpoint": "saves/starcoder2-7b/lora/sft/checkpoint-1000", "epoch": 0.1596169193934557, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.628385603427887, "learning_rate": 4.999999126897802e-05, "loss": 1.2582, "step": 5 }, { "epoch": 0.0, "grad_norm": 1.0855119228363037, "learning_rate": 4.999996507591817e-05, "loss": 0.801, "step": 10 }, { "epoch": 0.0, "grad_norm": 1.5689586400985718, "learning_rate": 4.9999921420838745e-05, "loss": 1.067, "step": 15 }, { "epoch": 0.0, "grad_norm": 2.0851330757141113, "learning_rate": 4.999986030377024e-05, "loss": 1.2953, "step": 20 }, { "epoch": 0.0, "grad_norm": 1.397479772567749, "learning_rate": 4.999978172475535e-05, "loss": 0.9826, "step": 25 }, { "epoch": 0.0, "grad_norm": 1.344118595123291, "learning_rate": 4.9999685683848954e-05, "loss": 0.9485, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.158163070678711, "learning_rate": 4.9999596278606616e-05, "loss": 0.8103, "step": 35 }, { "epoch": 0.01, "grad_norm": 1.602233648300171, "learning_rate": 4.999946880647276e-05, "loss": 0.8648, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.557242751121521, "learning_rate": 4.999932387266596e-05, "loss": 1.0198, "step": 45 }, { "epoch": 0.01, "grad_norm": 1.36068856716156, "learning_rate": 4.999916147728746e-05, "loss": 0.9367, "step": 50 }, { "epoch": 0.01, "grad_norm": 1.3263639211654663, "learning_rate": 4.999898162045068e-05, "loss": 0.9695, "step": 55 }, { "epoch": 0.01, "grad_norm": 1.333601474761963, "learning_rate": 4.999878430228126e-05, "loss": 1.1509, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.4753800630569458, "learning_rate": 4.999856952291702e-05, "loss": 1.1461, "step": 65 }, { "epoch": 0.01, "grad_norm": 1.5096240043640137, "learning_rate": 4.9998337282507965e-05, "loss": 1.1722, "step": 70 }, { "epoch": 0.01, "grad_norm": 1.189892053604126, "learning_rate": 4.999808758121633e-05, "loss": 1.1834, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.9292634725570679, "learning_rate": 4.999782041921651e-05, "loss": 0.9498, "step": 80 }, { "epoch": 0.01, "grad_norm": 2.1775777339935303, "learning_rate": 4.9997535796695134e-05, "loss": 0.9346, "step": 85 }, { "epoch": 0.01, "grad_norm": 1.6854296922683716, "learning_rate": 4.999723371385099e-05, "loss": 1.119, "step": 90 }, { "epoch": 0.02, "grad_norm": 1.4571490287780762, "learning_rate": 4.999691417089507e-05, "loss": 0.8671, "step": 95 }, { "epoch": 0.02, "grad_norm": 1.277044653892517, "learning_rate": 4.999657716805059e-05, "loss": 1.2469, "step": 100 }, { "epoch": 0.02, "eval_loss": 0.8478816747665405, "eval_runtime": 96.2736, "eval_samples_per_second": 7.24, "eval_steps_per_second": 7.24, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.6687743067741394, "learning_rate": 4.9996222705552933e-05, "loss": 0.735, "step": 105 }, { "epoch": 0.02, "grad_norm": 1.3488354682922363, "learning_rate": 4.9995850783649665e-05, "loss": 0.8344, "step": 110 }, { "epoch": 0.02, "grad_norm": 1.1043323278427124, "learning_rate": 4.9995461402600593e-05, "loss": 0.8254, "step": 115 }, { "epoch": 0.02, "grad_norm": 0.9382895827293396, "learning_rate": 4.9995054562677684e-05, "loss": 0.9179, "step": 120 }, { "epoch": 0.02, "grad_norm": 1.2824612855911255, "learning_rate": 4.9994630264165107e-05, "loss": 0.8663, "step": 125 }, { "epoch": 0.02, "grad_norm": 1.0491925477981567, "learning_rate": 4.999418850735923e-05, "loss": 0.9247, "step": 130 }, { "epoch": 0.02, "grad_norm": 1.3642233610153198, "learning_rate": 4.99937292925686e-05, "loss": 0.8253, "step": 135 }, { "epoch": 0.02, "grad_norm": 3.747757911682129, "learning_rate": 4.9993252620113976e-05, "loss": 1.0245, "step": 140 }, { "epoch": 0.02, "grad_norm": 1.299494981765747, "learning_rate": 4.999275849032832e-05, "loss": 0.8723, "step": 145 }, { "epoch": 0.02, "grad_norm": 1.7195830345153809, "learning_rate": 4.999224690355675e-05, "loss": 1.0524, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.9922987222671509, "learning_rate": 4.9991717860156616e-05, "loss": 0.9502, "step": 155 }, { "epoch": 0.03, "grad_norm": 1.0577458143234253, "learning_rate": 4.9991171360497437e-05, "loss": 1.0115, "step": 160 }, { "epoch": 0.03, "grad_norm": 1.0001195669174194, "learning_rate": 4.999060740496093e-05, "loss": 1.1999, "step": 165 }, { "epoch": 0.03, "grad_norm": 1.2456804513931274, "learning_rate": 4.999002599394102e-05, "loss": 0.8882, "step": 170 }, { "epoch": 0.03, "grad_norm": 1.0445325374603271, "learning_rate": 4.9989427127843814e-05, "loss": 1.0615, "step": 175 }, { "epoch": 0.03, "grad_norm": 1.2410887479782104, "learning_rate": 4.9988810807087584e-05, "loss": 1.1068, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.8935971260070801, "learning_rate": 4.998817703210285e-05, "loss": 0.6683, "step": 185 }, { "epoch": 0.03, "grad_norm": 1.1614488363265991, "learning_rate": 4.9987525803332265e-05, "loss": 0.7446, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.9392004013061523, "learning_rate": 4.998685712123072e-05, "loss": 0.7397, "step": 195 }, { "epoch": 0.03, "grad_norm": 1.0314444303512573, "learning_rate": 4.9986170986265266e-05, "loss": 1.3584, "step": 200 }, { "epoch": 0.03, "eval_loss": 0.8368077278137207, "eval_runtime": 96.5262, "eval_samples_per_second": 7.221, "eval_steps_per_second": 7.221, "step": 200 }, { "epoch": 0.03, "grad_norm": 0.8964811563491821, "learning_rate": 4.998546739891516e-05, "loss": 0.9546, "step": 205 }, { "epoch": 0.03, "grad_norm": 1.0679796934127808, "learning_rate": 4.998474635967185e-05, "loss": 0.864, "step": 210 }, { "epoch": 0.03, "grad_norm": 1.2340985536575317, "learning_rate": 4.998400786903896e-05, "loss": 0.885, "step": 215 }, { "epoch": 0.04, "grad_norm": 1.7219617366790771, "learning_rate": 4.9983251927532315e-05, "loss": 1.1069, "step": 220 }, { "epoch": 0.04, "grad_norm": 1.1480705738067627, "learning_rate": 4.9982478535679924e-05, "loss": 1.0416, "step": 225 }, { "epoch": 0.04, "grad_norm": 1.515589714050293, "learning_rate": 4.9981687694021996e-05, "loss": 1.1844, "step": 230 }, { "epoch": 0.04, "grad_norm": 1.6687963008880615, "learning_rate": 4.998087940311091e-05, "loss": 0.8664, "step": 235 }, { "epoch": 0.04, "grad_norm": 1.9256645441055298, "learning_rate": 4.998005366351125e-05, "loss": 1.0125, "step": 240 }, { "epoch": 0.04, "grad_norm": 1.2500052452087402, "learning_rate": 4.997921047579978e-05, "loss": 1.1374, "step": 245 }, { "epoch": 0.04, "grad_norm": 1.0543216466903687, "learning_rate": 4.9978349840565434e-05, "loss": 0.8502, "step": 250 }, { "epoch": 0.04, "grad_norm": 1.3009012937545776, "learning_rate": 4.997747175840937e-05, "loss": 1.0357, "step": 255 }, { "epoch": 0.04, "grad_norm": 0.8456661105155945, "learning_rate": 4.997657622994491e-05, "loss": 0.6883, "step": 260 }, { "epoch": 0.04, "grad_norm": 0.5856515765190125, "learning_rate": 4.9975663255797555e-05, "loss": 0.7656, "step": 265 }, { "epoch": 0.04, "grad_norm": 0.973818302154541, "learning_rate": 4.997473283660501e-05, "loss": 0.823, "step": 270 }, { "epoch": 0.04, "grad_norm": 0.9960187673568726, "learning_rate": 4.997378497301715e-05, "loss": 0.8726, "step": 275 }, { "epoch": 0.04, "grad_norm": 1.2900679111480713, "learning_rate": 4.997281966569604e-05, "loss": 0.9781, "step": 280 }, { "epoch": 0.05, "grad_norm": 1.828894853591919, "learning_rate": 4.9971836915315926e-05, "loss": 0.8932, "step": 285 }, { "epoch": 0.05, "grad_norm": 1.239621877670288, "learning_rate": 4.9970836722563256e-05, "loss": 1.2022, "step": 290 }, { "epoch": 0.05, "grad_norm": 1.0117149353027344, "learning_rate": 4.996981908813664e-05, "loss": 0.8032, "step": 295 }, { "epoch": 0.05, "grad_norm": 0.8861119747161865, "learning_rate": 4.996878401274687e-05, "loss": 1.0651, "step": 300 }, { "epoch": 0.05, "eval_loss": 0.8281473517417908, "eval_runtime": 96.5283, "eval_samples_per_second": 7.221, "eval_steps_per_second": 7.221, "step": 300 }, { "epoch": 0.05, "grad_norm": 0.8583046197891235, "learning_rate": 4.996773149711693e-05, "loss": 0.8784, "step": 305 }, { "epoch": 0.05, "grad_norm": 2.5717499256134033, "learning_rate": 4.9966661541981984e-05, "loss": 0.8395, "step": 310 }, { "epoch": 0.05, "grad_norm": 0.982342004776001, "learning_rate": 4.9965574148089376e-05, "loss": 0.9869, "step": 315 }, { "epoch": 0.05, "grad_norm": 0.9000777006149292, "learning_rate": 4.9964469316198633e-05, "loss": 0.8435, "step": 320 }, { "epoch": 0.05, "grad_norm": 0.8733209371566772, "learning_rate": 4.9963347047081464e-05, "loss": 0.7281, "step": 325 }, { "epoch": 0.05, "grad_norm": 3.323739767074585, "learning_rate": 4.9962207341521746e-05, "loss": 1.1013, "step": 330 }, { "epoch": 0.05, "grad_norm": 1.7102876901626587, "learning_rate": 4.996105020031554e-05, "loss": 0.8276, "step": 335 }, { "epoch": 0.05, "grad_norm": 0.9196123480796814, "learning_rate": 4.995987562427109e-05, "loss": 0.8274, "step": 340 }, { "epoch": 0.06, "grad_norm": 1.210099458694458, "learning_rate": 4.995868361420883e-05, "loss": 1.3257, "step": 345 }, { "epoch": 0.06, "grad_norm": 0.8923581838607788, "learning_rate": 4.9957474170961335e-05, "loss": 0.6815, "step": 350 }, { "epoch": 0.06, "grad_norm": 0.9576735496520996, "learning_rate": 4.9956247295373396e-05, "loss": 1.23, "step": 355 }, { "epoch": 0.06, "grad_norm": 1.3774089813232422, "learning_rate": 4.995500298830196e-05, "loss": 1.0556, "step": 360 }, { "epoch": 0.06, "grad_norm": 1.1523677110671997, "learning_rate": 4.995374125061614e-05, "loss": 1.1787, "step": 365 }, { "epoch": 0.06, "grad_norm": 0.8310608863830566, "learning_rate": 4.9952462083197246e-05, "loss": 0.8525, "step": 370 }, { "epoch": 0.06, "grad_norm": 0.9814196825027466, "learning_rate": 4.9951165486938765e-05, "loss": 0.8522, "step": 375 }, { "epoch": 0.06, "grad_norm": 0.9878122210502625, "learning_rate": 4.994985146274633e-05, "loss": 0.6618, "step": 380 }, { "epoch": 0.06, "grad_norm": 1.2652586698532104, "learning_rate": 4.994852001153777e-05, "loss": 1.0489, "step": 385 }, { "epoch": 0.06, "grad_norm": 1.2940975427627563, "learning_rate": 4.994717113424307e-05, "loss": 1.104, "step": 390 }, { "epoch": 0.06, "grad_norm": 0.9636249542236328, "learning_rate": 4.99458048318044e-05, "loss": 0.9228, "step": 395 }, { "epoch": 0.06, "grad_norm": 0.8122813105583191, "learning_rate": 4.994442110517611e-05, "loss": 0.9209, "step": 400 }, { "epoch": 0.06, "eval_loss": 0.8184689879417419, "eval_runtime": 96.4572, "eval_samples_per_second": 7.226, "eval_steps_per_second": 7.226, "step": 400 }, { "epoch": 0.06, "grad_norm": 0.8742052912712097, "learning_rate": 4.99430199553247e-05, "loss": 0.9608, "step": 405 }, { "epoch": 0.07, "grad_norm": 0.5679522752761841, "learning_rate": 4.9941601383228835e-05, "loss": 0.5963, "step": 410 }, { "epoch": 0.07, "grad_norm": 1.0234627723693848, "learning_rate": 4.994016538987938e-05, "loss": 0.8642, "step": 415 }, { "epoch": 0.07, "grad_norm": 0.8581897616386414, "learning_rate": 4.993871197627934e-05, "loss": 0.8993, "step": 420 }, { "epoch": 0.07, "grad_norm": 1.4666485786437988, "learning_rate": 4.9937241143443904e-05, "loss": 0.8565, "step": 425 }, { "epoch": 0.07, "grad_norm": 1.1166578531265259, "learning_rate": 4.993575289240041e-05, "loss": 0.881, "step": 430 }, { "epoch": 0.07, "grad_norm": 1.303992748260498, "learning_rate": 4.9934247224188393e-05, "loss": 0.9962, "step": 435 }, { "epoch": 0.07, "grad_norm": 0.9011989235877991, "learning_rate": 4.993272413985952e-05, "loss": 0.9316, "step": 440 }, { "epoch": 0.07, "grad_norm": 0.8321458101272583, "learning_rate": 4.993118364047764e-05, "loss": 0.7889, "step": 445 }, { "epoch": 0.07, "grad_norm": 0.7780352234840393, "learning_rate": 4.992962572711877e-05, "loss": 0.8287, "step": 450 }, { "epoch": 0.07, "grad_norm": 0.9090210199356079, "learning_rate": 4.992805040087108e-05, "loss": 0.7018, "step": 455 }, { "epoch": 0.07, "grad_norm": 0.8694137334823608, "learning_rate": 4.9926457662834906e-05, "loss": 0.8484, "step": 460 }, { "epoch": 0.07, "grad_norm": 0.6327371001243591, "learning_rate": 4.992484751412274e-05, "loss": 0.716, "step": 465 }, { "epoch": 0.08, "grad_norm": 1.200668215751648, "learning_rate": 4.9923219955859254e-05, "loss": 0.9525, "step": 470 }, { "epoch": 0.08, "grad_norm": 0.8530198931694031, "learning_rate": 4.9921574989181266e-05, "loss": 0.744, "step": 475 }, { "epoch": 0.08, "grad_norm": 1.168479323387146, "learning_rate": 4.991991261523775e-05, "loss": 0.729, "step": 480 }, { "epoch": 0.08, "grad_norm": 0.9499714970588684, "learning_rate": 4.9918232835189834e-05, "loss": 0.7725, "step": 485 }, { "epoch": 0.08, "grad_norm": 0.8434467911720276, "learning_rate": 4.991653565021084e-05, "loss": 1.1558, "step": 490 }, { "epoch": 0.08, "grad_norm": 0.7665804624557495, "learning_rate": 4.99148210614862e-05, "loss": 1.0208, "step": 495 }, { "epoch": 0.08, "grad_norm": 0.5782546401023865, "learning_rate": 4.991308907021353e-05, "loss": 0.8306, "step": 500 }, { "epoch": 0.08, "eval_loss": 0.8132078051567078, "eval_runtime": 96.433, "eval_samples_per_second": 7.228, "eval_steps_per_second": 7.228, "step": 500 }, { "epoch": 0.08, "grad_norm": 1.0821778774261475, "learning_rate": 4.9911339677602584e-05, "loss": 0.9503, "step": 505 }, { "epoch": 0.08, "grad_norm": 0.5409029126167297, "learning_rate": 4.99095728848753e-05, "loss": 0.8586, "step": 510 }, { "epoch": 0.08, "grad_norm": 0.9011789560317993, "learning_rate": 4.990778869326575e-05, "loss": 0.7981, "step": 515 }, { "epoch": 0.08, "grad_norm": 1.0092263221740723, "learning_rate": 4.990598710402013e-05, "loss": 1.0174, "step": 520 }, { "epoch": 0.08, "grad_norm": 1.4362307786941528, "learning_rate": 4.9904168118396844e-05, "loss": 0.8373, "step": 525 }, { "epoch": 0.08, "grad_norm": 2.1772639751434326, "learning_rate": 4.9902331737666414e-05, "loss": 0.9599, "step": 530 }, { "epoch": 0.09, "grad_norm": 0.9610542058944702, "learning_rate": 4.990047796311151e-05, "loss": 0.6895, "step": 535 }, { "epoch": 0.09, "grad_norm": 0.9922348260879517, "learning_rate": 4.989860679602698e-05, "loss": 0.7315, "step": 540 }, { "epoch": 0.09, "grad_norm": 1.2409151792526245, "learning_rate": 4.9896718237719785e-05, "loss": 0.8574, "step": 545 }, { "epoch": 0.09, "grad_norm": 1.016333818435669, "learning_rate": 4.9894812289509046e-05, "loss": 1.1248, "step": 550 }, { "epoch": 0.09, "grad_norm": 0.9131489396095276, "learning_rate": 4.989288895272604e-05, "loss": 0.9847, "step": 555 }, { "epoch": 0.09, "grad_norm": 1.215469479560852, "learning_rate": 4.989094822871419e-05, "loss": 0.912, "step": 560 }, { "epoch": 0.09, "grad_norm": 1.0536105632781982, "learning_rate": 4.988899011882903e-05, "loss": 0.8425, "step": 565 }, { "epoch": 0.09, "grad_norm": 1.9705311059951782, "learning_rate": 4.988701462443829e-05, "loss": 0.9385, "step": 570 }, { "epoch": 0.09, "grad_norm": 1.2488442659378052, "learning_rate": 4.98850217469218e-05, "loss": 0.7865, "step": 575 }, { "epoch": 0.09, "grad_norm": 1.7318600416183472, "learning_rate": 4.988301148767157e-05, "loss": 0.8231, "step": 580 }, { "epoch": 0.09, "grad_norm": 0.8247858881950378, "learning_rate": 4.9880983848091704e-05, "loss": 0.8553, "step": 585 }, { "epoch": 0.09, "grad_norm": 0.858172595500946, "learning_rate": 4.987893882959849e-05, "loss": 1.3952, "step": 590 }, { "epoch": 0.09, "grad_norm": 1.2286418676376343, "learning_rate": 4.987687643362033e-05, "loss": 0.837, "step": 595 }, { "epoch": 0.1, "grad_norm": 1.034350872039795, "learning_rate": 4.9874796661597765e-05, "loss": 0.9175, "step": 600 }, { "epoch": 0.1, "eval_loss": 0.8063747882843018, "eval_runtime": 96.4224, "eval_samples_per_second": 7.229, "eval_steps_per_second": 7.229, "step": 600 }, { "epoch": 0.1, "grad_norm": 0.7192366123199463, "learning_rate": 4.987269951498348e-05, "loss": 0.8563, "step": 605 }, { "epoch": 0.1, "grad_norm": 1.2645854949951172, "learning_rate": 4.98705849952423e-05, "loss": 0.6663, "step": 610 }, { "epoch": 0.1, "grad_norm": 1.0610381364822388, "learning_rate": 4.9868453103851176e-05, "loss": 0.8452, "step": 615 }, { "epoch": 0.1, "grad_norm": 0.8550002574920654, "learning_rate": 4.986630384229919e-05, "loss": 0.8894, "step": 620 }, { "epoch": 0.1, "grad_norm": 0.7490519285202026, "learning_rate": 4.986413721208757e-05, "loss": 0.9106, "step": 625 }, { "epoch": 0.1, "grad_norm": 0.557860255241394, "learning_rate": 4.986195321472965e-05, "loss": 0.685, "step": 630 }, { "epoch": 0.1, "grad_norm": 0.7450752258300781, "learning_rate": 4.9859751851750934e-05, "loss": 0.8472, "step": 635 }, { "epoch": 0.1, "grad_norm": 1.176376461982727, "learning_rate": 4.985753312468903e-05, "loss": 1.0197, "step": 640 }, { "epoch": 0.1, "grad_norm": 1.0625300407409668, "learning_rate": 4.985529703509367e-05, "loss": 0.9685, "step": 645 }, { "epoch": 0.1, "grad_norm": 0.8808372616767883, "learning_rate": 4.985304358452672e-05, "loss": 0.8612, "step": 650 }, { "epoch": 0.1, "grad_norm": 0.8110201954841614, "learning_rate": 4.985077277456218e-05, "loss": 0.8401, "step": 655 }, { "epoch": 0.11, "grad_norm": 0.9364888072013855, "learning_rate": 4.984848460678618e-05, "loss": 0.6197, "step": 660 }, { "epoch": 0.11, "grad_norm": 1.0113518238067627, "learning_rate": 4.984617908279694e-05, "loss": 0.9889, "step": 665 }, { "epoch": 0.11, "grad_norm": 1.1148868799209595, "learning_rate": 4.984385620420485e-05, "loss": 0.9558, "step": 670 }, { "epoch": 0.11, "grad_norm": 0.9506175518035889, "learning_rate": 4.984151597263238e-05, "loss": 0.7323, "step": 675 }, { "epoch": 0.11, "grad_norm": 1.0044193267822266, "learning_rate": 4.983915838971415e-05, "loss": 0.7504, "step": 680 }, { "epoch": 0.11, "grad_norm": 2.2674214839935303, "learning_rate": 4.9836783457096875e-05, "loss": 1.032, "step": 685 }, { "epoch": 0.11, "grad_norm": 1.4945333003997803, "learning_rate": 4.983439117643942e-05, "loss": 1.0359, "step": 690 }, { "epoch": 0.11, "grad_norm": 0.9860715866088867, "learning_rate": 4.9831981549412744e-05, "loss": 1.1152, "step": 695 }, { "epoch": 0.11, "grad_norm": 0.8287227153778076, "learning_rate": 4.982955457769992e-05, "loss": 0.8157, "step": 700 }, { "epoch": 0.11, "eval_loss": 0.8022791743278503, "eval_runtime": 96.5324, "eval_samples_per_second": 7.22, "eval_steps_per_second": 7.22, "step": 700 }, { "epoch": 0.11, "grad_norm": 0.9216273427009583, "learning_rate": 4.9827110262996144e-05, "loss": 0.8395, "step": 705 }, { "epoch": 0.11, "grad_norm": 0.7642357349395752, "learning_rate": 4.982464860700874e-05, "loss": 0.8817, "step": 710 }, { "epoch": 0.11, "grad_norm": 0.8851175308227539, "learning_rate": 4.982216961145711e-05, "loss": 0.8558, "step": 715 }, { "epoch": 0.11, "grad_norm": 0.44226109981536865, "learning_rate": 4.98196732780728e-05, "loss": 0.882, "step": 720 }, { "epoch": 0.12, "grad_norm": 0.8005027174949646, "learning_rate": 4.981715960859945e-05, "loss": 0.8835, "step": 725 }, { "epoch": 0.12, "grad_norm": 0.7451304793357849, "learning_rate": 4.981462860479281e-05, "loss": 0.8551, "step": 730 }, { "epoch": 0.12, "grad_norm": 1.1069347858428955, "learning_rate": 4.9812080268420745e-05, "loss": 0.999, "step": 735 }, { "epoch": 0.12, "grad_norm": 0.8892244100570679, "learning_rate": 4.980951460126322e-05, "loss": 1.012, "step": 740 }, { "epoch": 0.12, "grad_norm": 0.8935977816581726, "learning_rate": 4.9806931605112305e-05, "loss": 0.9911, "step": 745 }, { "epoch": 0.12, "grad_norm": 0.8456961512565613, "learning_rate": 4.9804331281772176e-05, "loss": 0.7595, "step": 750 }, { "epoch": 0.12, "grad_norm": 0.78443443775177, "learning_rate": 4.980171363305911e-05, "loss": 0.8308, "step": 755 }, { "epoch": 0.12, "grad_norm": 1.0028038024902344, "learning_rate": 4.979907866080149e-05, "loss": 0.9637, "step": 760 }, { "epoch": 0.12, "grad_norm": 1.1801577806472778, "learning_rate": 4.9796426366839786e-05, "loss": 0.6159, "step": 765 }, { "epoch": 0.12, "grad_norm": 0.8370681405067444, "learning_rate": 4.979375675302659e-05, "loss": 0.9276, "step": 770 }, { "epoch": 0.12, "grad_norm": 0.8605382442474365, "learning_rate": 4.979106982122658e-05, "loss": 1.1077, "step": 775 }, { "epoch": 0.12, "grad_norm": 0.7788259387016296, "learning_rate": 4.978836557331652e-05, "loss": 0.8172, "step": 780 }, { "epoch": 0.13, "grad_norm": 1.4312686920166016, "learning_rate": 4.978564401118528e-05, "loss": 0.8759, "step": 785 }, { "epoch": 0.13, "grad_norm": 0.9109662175178528, "learning_rate": 4.978290513673381e-05, "loss": 0.947, "step": 790 }, { "epoch": 0.13, "grad_norm": 1.1819065809249878, "learning_rate": 4.9780148951875195e-05, "loss": 0.7364, "step": 795 }, { "epoch": 0.13, "grad_norm": 0.9400575160980225, "learning_rate": 4.977737545853455e-05, "loss": 0.9469, "step": 800 }, { "epoch": 0.13, "eval_loss": 0.7995806932449341, "eval_runtime": 96.5877, "eval_samples_per_second": 7.216, "eval_steps_per_second": 7.216, "step": 800 }, { "epoch": 0.13, "grad_norm": 1.693812370300293, "learning_rate": 4.9774584658649126e-05, "loss": 0.9433, "step": 805 }, { "epoch": 0.13, "grad_norm": 1.0892895460128784, "learning_rate": 4.9771776554168234e-05, "loss": 0.7027, "step": 810 }, { "epoch": 0.13, "grad_norm": 0.9118362665176392, "learning_rate": 4.976895114705329e-05, "loss": 0.9468, "step": 815 }, { "epoch": 0.13, "grad_norm": 0.8032681345939636, "learning_rate": 4.976610843927779e-05, "loss": 0.7927, "step": 820 }, { "epoch": 0.13, "grad_norm": 1.168225646018982, "learning_rate": 4.976324843282732e-05, "loss": 0.9673, "step": 825 }, { "epoch": 0.13, "grad_norm": 1.077602744102478, "learning_rate": 4.976037112969953e-05, "loss": 0.9156, "step": 830 }, { "epoch": 0.13, "grad_norm": 0.8643108606338501, "learning_rate": 4.9757476531904165e-05, "loss": 0.6999, "step": 835 }, { "epoch": 0.13, "grad_norm": 0.933397650718689, "learning_rate": 4.975456464146306e-05, "loss": 0.8828, "step": 840 }, { "epoch": 0.13, "grad_norm": 0.7036295533180237, "learning_rate": 4.975163546041011e-05, "loss": 0.8709, "step": 845 }, { "epoch": 0.14, "grad_norm": 0.5974694490432739, "learning_rate": 4.974868899079128e-05, "loss": 0.7594, "step": 850 }, { "epoch": 0.14, "grad_norm": 0.7244943380355835, "learning_rate": 4.974572523466465e-05, "loss": 0.8714, "step": 855 }, { "epoch": 0.14, "grad_norm": 0.5783522725105286, "learning_rate": 4.9742744194100345e-05, "loss": 0.8941, "step": 860 }, { "epoch": 0.14, "grad_norm": 0.7480617761611938, "learning_rate": 4.973974587118055e-05, "loss": 0.9798, "step": 865 }, { "epoch": 0.14, "grad_norm": 0.7548874020576477, "learning_rate": 4.973673026799956e-05, "loss": 0.7767, "step": 870 }, { "epoch": 0.14, "grad_norm": 0.7075071930885315, "learning_rate": 4.97336973866637e-05, "loss": 0.7779, "step": 875 }, { "epoch": 0.14, "grad_norm": 0.7042987942695618, "learning_rate": 4.97306472292914e-05, "loss": 0.8249, "step": 880 }, { "epoch": 0.14, "grad_norm": 1.0242459774017334, "learning_rate": 4.972757979801313e-05, "loss": 0.9223, "step": 885 }, { "epoch": 0.14, "grad_norm": 0.6138095259666443, "learning_rate": 4.9724495094971436e-05, "loss": 0.9842, "step": 890 }, { "epoch": 0.14, "grad_norm": 0.7905042767524719, "learning_rate": 4.9721393122320925e-05, "loss": 0.8738, "step": 895 }, { "epoch": 0.14, "grad_norm": 0.9658048748970032, "learning_rate": 4.9718273882228265e-05, "loss": 0.8872, "step": 900 }, { "epoch": 0.14, "eval_loss": 0.7954564690589905, "eval_runtime": 96.643, "eval_samples_per_second": 7.212, "eval_steps_per_second": 7.212, "step": 900 }, { "epoch": 0.14, "grad_norm": 0.8425014019012451, "learning_rate": 4.97151373768722e-05, "loss": 0.778, "step": 905 }, { "epoch": 0.15, "grad_norm": 0.5527231693267822, "learning_rate": 4.971198360844351e-05, "loss": 0.8332, "step": 910 }, { "epoch": 0.15, "grad_norm": 0.7870334386825562, "learning_rate": 4.9708812579145056e-05, "loss": 0.9265, "step": 915 }, { "epoch": 0.15, "grad_norm": 0.9935321807861328, "learning_rate": 4.970562429119173e-05, "loss": 0.7243, "step": 920 }, { "epoch": 0.15, "grad_norm": 0.9546892046928406, "learning_rate": 4.970241874681051e-05, "loss": 0.9908, "step": 925 }, { "epoch": 0.15, "grad_norm": 0.7340118885040283, "learning_rate": 4.969919594824039e-05, "loss": 0.7932, "step": 930 }, { "epoch": 0.15, "grad_norm": 5.1686015129089355, "learning_rate": 4.9695955897732453e-05, "loss": 0.9842, "step": 935 }, { "epoch": 0.15, "grad_norm": 0.9721456170082092, "learning_rate": 4.9692698597549815e-05, "loss": 0.9271, "step": 940 }, { "epoch": 0.15, "grad_norm": 0.6477334499359131, "learning_rate": 4.9689424049967623e-05, "loss": 0.934, "step": 945 }, { "epoch": 0.15, "grad_norm": 1.0759055614471436, "learning_rate": 4.968613225727311e-05, "loss": 1.0465, "step": 950 }, { "epoch": 0.15, "grad_norm": 0.7222158908843994, "learning_rate": 4.968282322176552e-05, "loss": 0.7732, "step": 955 }, { "epoch": 0.15, "grad_norm": 0.8591343760490417, "learning_rate": 4.9679496945756155e-05, "loss": 0.9062, "step": 960 }, { "epoch": 0.15, "grad_norm": 1.8495111465454102, "learning_rate": 4.967615343156837e-05, "loss": 0.8861, "step": 965 }, { "epoch": 0.15, "grad_norm": 0.6847331523895264, "learning_rate": 4.967279268153753e-05, "loss": 0.8001, "step": 970 }, { "epoch": 0.16, "grad_norm": 0.690113365650177, "learning_rate": 4.9669414698011074e-05, "loss": 0.7378, "step": 975 }, { "epoch": 0.16, "grad_norm": 0.8349626064300537, "learning_rate": 4.9666019483348456e-05, "loss": 0.7193, "step": 980 }, { "epoch": 0.16, "grad_norm": 0.6444108486175537, "learning_rate": 4.966260703992116e-05, "loss": 0.8729, "step": 985 }, { "epoch": 0.16, "grad_norm": 0.9515655040740967, "learning_rate": 4.965917737011274e-05, "loss": 0.7532, "step": 990 }, { "epoch": 0.16, "grad_norm": 0.8138986229896545, "learning_rate": 4.965573047631873e-05, "loss": 1.0124, "step": 995 }, { "epoch": 0.16, "grad_norm": 1.0182080268859863, "learning_rate": 4.9652266360946745e-05, "loss": 0.8842, "step": 1000 }, { "epoch": 0.16, "eval_loss": 0.7912728190422058, "eval_runtime": 96.5004, "eval_samples_per_second": 7.223, "eval_steps_per_second": 7.223, "step": 1000 } ], "logging_steps": 5, "max_steps": 18795, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 4.3155317587968e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }