{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9994277539341918, "eval_steps": 500, "global_step": 3494, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 46.73702167943208, "learning_rate": 1.4285714285714288e-08, "loss": 1.4108, "step": 1 }, { "epoch": 0.01, "grad_norm": 44.025654647587054, "learning_rate": 1.4285714285714287e-07, "loss": 1.404, "step": 10 }, { "epoch": 0.01, "grad_norm": 17.050573864369657, "learning_rate": 2.8571428571428575e-07, "loss": 1.262, "step": 20 }, { "epoch": 0.02, "grad_norm": 12.362997846073997, "learning_rate": 4.285714285714286e-07, "loss": 1.0064, "step": 30 }, { "epoch": 0.02, "grad_norm": 4.544949126575649, "learning_rate": 5.714285714285715e-07, "loss": 0.8356, "step": 40 }, { "epoch": 0.03, "grad_norm": 3.303210825544356, "learning_rate": 7.142857142857143e-07, "loss": 0.772, "step": 50 }, { "epoch": 0.03, "grad_norm": 2.6886321352067735, "learning_rate": 8.571428571428572e-07, "loss": 0.768, "step": 60 }, { "epoch": 0.04, "grad_norm": 2.528853230744217, "learning_rate": 1.0000000000000002e-06, "loss": 0.7464, "step": 70 }, { "epoch": 0.05, "grad_norm": 2.4854198868922994, "learning_rate": 1.142857142857143e-06, "loss": 0.7499, "step": 80 }, { "epoch": 0.05, "grad_norm": 2.5204846897703335, "learning_rate": 1.2857142857142856e-06, "loss": 0.7407, "step": 90 }, { "epoch": 0.06, "grad_norm": 2.6727574539079075, "learning_rate": 1.4285714285714286e-06, "loss": 0.7418, "step": 100 }, { "epoch": 0.06, "grad_norm": 2.4909264997558713, "learning_rate": 1.5714285714285714e-06, "loss": 0.7155, "step": 110 }, { "epoch": 0.07, "grad_norm": 2.4511405269140574, "learning_rate": 1.7142857142857145e-06, "loss": 0.7188, "step": 120 }, { "epoch": 0.07, "grad_norm": 2.29941417948333, "learning_rate": 1.8571428571428573e-06, "loss": 0.7025, "step": 130 }, { "epoch": 0.08, "grad_norm": 2.25479052126887, "learning_rate": 2.0000000000000003e-06, "loss": 0.7147, "step": 140 }, { "epoch": 0.09, "grad_norm": 2.259049566829717, "learning_rate": 2.1428571428571427e-06, "loss": 0.7054, "step": 150 }, { "epoch": 0.09, "grad_norm": 2.360482364120735, "learning_rate": 2.285714285714286e-06, "loss": 0.7065, "step": 160 }, { "epoch": 0.1, "grad_norm": 3.542698113557103, "learning_rate": 2.428571428571429e-06, "loss": 0.6936, "step": 170 }, { "epoch": 0.1, "grad_norm": 2.5159436855022985, "learning_rate": 2.571428571428571e-06, "loss": 0.6988, "step": 180 }, { "epoch": 0.11, "grad_norm": 2.2866610023159404, "learning_rate": 2.7142857142857144e-06, "loss": 0.6676, "step": 190 }, { "epoch": 0.11, "grad_norm": 2.413352451235453, "learning_rate": 2.8571428571428573e-06, "loss": 0.6835, "step": 200 }, { "epoch": 0.12, "grad_norm": 2.142867928366355, "learning_rate": 3e-06, "loss": 0.6748, "step": 210 }, { "epoch": 0.13, "grad_norm": 2.2732969439494917, "learning_rate": 3.142857142857143e-06, "loss": 0.6766, "step": 220 }, { "epoch": 0.13, "grad_norm": 2.363769090094736, "learning_rate": 3.285714285714286e-06, "loss": 0.6744, "step": 230 }, { "epoch": 0.14, "grad_norm": 2.2070896050087927, "learning_rate": 3.428571428571429e-06, "loss": 0.6529, "step": 240 }, { "epoch": 0.14, "grad_norm": 2.270856142543171, "learning_rate": 3.5714285714285718e-06, "loss": 0.6721, "step": 250 }, { "epoch": 0.15, "grad_norm": 2.458909086153812, "learning_rate": 3.7142857142857146e-06, "loss": 0.6549, "step": 260 }, { "epoch": 0.15, "grad_norm": 2.38712999789014, "learning_rate": 3.857142857142858e-06, "loss": 0.6592, "step": 270 }, { "epoch": 0.16, "grad_norm": 2.1901720461163383, "learning_rate": 4.000000000000001e-06, "loss": 0.6543, "step": 280 }, { "epoch": 0.17, "grad_norm": 2.448025889828106, "learning_rate": 4.1428571428571435e-06, "loss": 0.6633, "step": 290 }, { "epoch": 0.17, "grad_norm": 2.3714159790058376, "learning_rate": 4.2857142857142855e-06, "loss": 0.6751, "step": 300 }, { "epoch": 0.18, "grad_norm": 2.286975644624705, "learning_rate": 4.428571428571429e-06, "loss": 0.653, "step": 310 }, { "epoch": 0.18, "grad_norm": 2.2971551515931705, "learning_rate": 4.571428571428572e-06, "loss": 0.6566, "step": 320 }, { "epoch": 0.19, "grad_norm": 2.2863181903895944, "learning_rate": 4.714285714285715e-06, "loss": 0.651, "step": 330 }, { "epoch": 0.19, "grad_norm": 2.1454553859203016, "learning_rate": 4.857142857142858e-06, "loss": 0.6449, "step": 340 }, { "epoch": 0.2, "grad_norm": 2.3164450357661655, "learning_rate": 5e-06, "loss": 0.64, "step": 350 }, { "epoch": 0.21, "grad_norm": 2.3977191539499514, "learning_rate": 4.9998751923890465e-06, "loss": 0.6537, "step": 360 }, { "epoch": 0.21, "grad_norm": 2.0695920697937074, "learning_rate": 4.999500782017737e-06, "loss": 0.6318, "step": 370 }, { "epoch": 0.22, "grad_norm": 2.191828287138372, "learning_rate": 4.9988768062694815e-06, "loss": 0.6372, "step": 380 }, { "epoch": 0.22, "grad_norm": 2.216026606359093, "learning_rate": 4.99800332744582e-06, "loss": 0.6308, "step": 390 }, { "epoch": 0.23, "grad_norm": 2.316444311670876, "learning_rate": 4.996880432760195e-06, "loss": 0.6345, "step": 400 }, { "epoch": 0.23, "grad_norm": 2.120057918512993, "learning_rate": 4.99550823432925e-06, "loss": 0.632, "step": 410 }, { "epoch": 0.24, "grad_norm": 2.226615533896781, "learning_rate": 4.993886869161631e-06, "loss": 0.6347, "step": 420 }, { "epoch": 0.25, "grad_norm": 2.221954699719473, "learning_rate": 4.992016499144309e-06, "loss": 0.6473, "step": 430 }, { "epoch": 0.25, "grad_norm": 2.0658073751369943, "learning_rate": 4.9898973110264126e-06, "loss": 0.61, "step": 440 }, { "epoch": 0.26, "grad_norm": 2.3331061006391582, "learning_rate": 4.98752951640059e-06, "loss": 0.6364, "step": 450 }, { "epoch": 0.26, "grad_norm": 2.159695737484098, "learning_rate": 4.984913351681871e-06, "loss": 0.6224, "step": 460 }, { "epoch": 0.27, "grad_norm": 2.100223862687521, "learning_rate": 4.982049078084072e-06, "loss": 0.6386, "step": 470 }, { "epoch": 0.27, "grad_norm": 2.260448805877094, "learning_rate": 4.9789369815937075e-06, "loss": 0.6329, "step": 480 }, { "epoch": 0.28, "grad_norm": 2.2109790785096957, "learning_rate": 4.975577372941441e-06, "loss": 0.6126, "step": 490 }, { "epoch": 0.29, "grad_norm": 174.20789307969923, "learning_rate": 4.9719705875710545e-06, "loss": 0.6226, "step": 500 }, { "epoch": 0.29, "eval_loss": 0.6215025186538696, "eval_runtime": 530.7348, "eval_samples_per_second": 23.332, "eval_steps_per_second": 0.729, "step": 500 }, { "epoch": 0.29, "grad_norm": 59.547555932050344, "learning_rate": 4.968116985605963e-06, "loss": 0.676, "step": 510 }, { "epoch": 0.3, "grad_norm": 5.0119747977301135, "learning_rate": 4.964016951813248e-06, "loss": 0.6415, "step": 520 }, { "epoch": 0.3, "grad_norm": 2.4337121629645613, "learning_rate": 4.959670895565248e-06, "loss": 0.6195, "step": 530 }, { "epoch": 0.31, "grad_norm": 2.175705909128136, "learning_rate": 4.955079250798682e-06, "loss": 0.6362, "step": 540 }, { "epoch": 0.31, "grad_norm": 2.189681659202465, "learning_rate": 4.950242475971321e-06, "loss": 0.6351, "step": 550 }, { "epoch": 0.32, "grad_norm": 2.1123816587264144, "learning_rate": 4.9451610540162114e-06, "loss": 0.6284, "step": 560 }, { "epoch": 0.33, "grad_norm": 2.210287811195601, "learning_rate": 4.939835492293463e-06, "loss": 0.6219, "step": 570 }, { "epoch": 0.33, "grad_norm": 2.12233798540241, "learning_rate": 4.934266322539584e-06, "loss": 0.6331, "step": 580 }, { "epoch": 0.34, "grad_norm": 2.1741763930826132, "learning_rate": 4.92845410081439e-06, "loss": 0.6166, "step": 590 }, { "epoch": 0.34, "grad_norm": 22.85902590832313, "learning_rate": 4.922399407445491e-06, "loss": 0.6174, "step": 600 }, { "epoch": 0.35, "grad_norm": 2.2404434892749006, "learning_rate": 4.9161028469703345e-06, "loss": 0.6224, "step": 610 }, { "epoch": 0.35, "grad_norm": 2.2681579472027527, "learning_rate": 4.909565048075859e-06, "loss": 0.6281, "step": 620 }, { "epoch": 0.36, "grad_norm": 2.194186874479834, "learning_rate": 4.902786663535714e-06, "loss": 0.5974, "step": 630 }, { "epoch": 0.37, "grad_norm": 1.9753405498524519, "learning_rate": 4.895768370145081e-06, "loss": 0.6133, "step": 640 }, { "epoch": 0.37, "grad_norm": 1.9627709932695874, "learning_rate": 4.888510868653107e-06, "loss": 0.5979, "step": 650 }, { "epoch": 0.38, "grad_norm": 2.0978691375994383, "learning_rate": 4.881014883692931e-06, "loss": 0.6153, "step": 660 }, { "epoch": 0.38, "grad_norm": 1.9322421934439415, "learning_rate": 4.87328116370933e-06, "loss": 0.6091, "step": 670 }, { "epoch": 0.39, "grad_norm": 2.09891544330605, "learning_rate": 4.865310480883998e-06, "loss": 0.6114, "step": 680 }, { "epoch": 0.39, "grad_norm": 2.1055346718055223, "learning_rate": 4.857103631058438e-06, "loss": 0.613, "step": 690 }, { "epoch": 0.4, "grad_norm": 2.1490868409285557, "learning_rate": 4.848661433654509e-06, "loss": 0.6013, "step": 700 }, { "epoch": 0.41, "grad_norm": 2.0204589894962233, "learning_rate": 4.8399847315926e-06, "loss": 0.612, "step": 710 }, { "epoch": 0.41, "grad_norm": 2.119211573140167, "learning_rate": 4.831074391207477e-06, "loss": 0.6056, "step": 720 }, { "epoch": 0.42, "grad_norm": 2.1096094058746195, "learning_rate": 4.821931302161775e-06, "loss": 0.5906, "step": 730 }, { "epoch": 0.42, "grad_norm": 2.0352305435483338, "learning_rate": 4.812556377357175e-06, "loss": 0.5875, "step": 740 }, { "epoch": 0.43, "grad_norm": 2.095591458734852, "learning_rate": 4.8029505528432516e-06, "loss": 0.6023, "step": 750 }, { "epoch": 0.43, "grad_norm": 2.2591386947868095, "learning_rate": 4.793114787724013e-06, "loss": 0.5852, "step": 760 }, { "epoch": 0.44, "grad_norm": 2.1831334550022943, "learning_rate": 4.783050064062135e-06, "loss": 0.6132, "step": 770 }, { "epoch": 0.45, "grad_norm": 2.079194401695253, "learning_rate": 4.77275738678091e-06, "loss": 0.5954, "step": 780 }, { "epoch": 0.45, "grad_norm": 2.0851818697723115, "learning_rate": 4.7622377835639065e-06, "loss": 0.5904, "step": 790 }, { "epoch": 0.46, "grad_norm": 2.240834597974655, "learning_rate": 4.751492304752362e-06, "loss": 0.5881, "step": 800 }, { "epoch": 0.46, "grad_norm": 2.1904386633647763, "learning_rate": 4.740522023240308e-06, "loss": 0.5938, "step": 810 }, { "epoch": 0.47, "grad_norm": 2.0784086731070306, "learning_rate": 4.729328034367447e-06, "loss": 0.5893, "step": 820 }, { "epoch": 0.47, "grad_norm": 2.2844448327810905, "learning_rate": 4.717911455809782e-06, "loss": 0.5846, "step": 830 }, { "epoch": 0.48, "grad_norm": 2.455456003484141, "learning_rate": 4.7062734274680336e-06, "loss": 0.5885, "step": 840 }, { "epoch": 0.49, "grad_norm": 2.140011586996601, "learning_rate": 4.694415111353809e-06, "loss": 0.5876, "step": 850 }, { "epoch": 0.49, "grad_norm": 2.063342954699121, "learning_rate": 4.682337691473593e-06, "loss": 0.5864, "step": 860 }, { "epoch": 0.5, "grad_norm": 2.0835701678769625, "learning_rate": 4.670042373710524e-06, "loss": 0.5892, "step": 870 }, { "epoch": 0.5, "grad_norm": 2.2923758881575034, "learning_rate": 4.657530385703989e-06, "loss": 0.5805, "step": 880 }, { "epoch": 0.51, "grad_norm": 2.084287121874536, "learning_rate": 4.644802976727054e-06, "loss": 0.5906, "step": 890 }, { "epoch": 0.52, "grad_norm": 2.0504569347730994, "learning_rate": 4.631861417561725e-06, "loss": 0.5727, "step": 900 }, { "epoch": 0.52, "grad_norm": 2.2226821794710268, "learning_rate": 4.618707000372067e-06, "loss": 0.5665, "step": 910 }, { "epoch": 0.53, "grad_norm": 2.0195324875616167, "learning_rate": 4.605341038575188e-06, "loss": 0.5819, "step": 920 }, { "epoch": 0.53, "grad_norm": 2.2720158701812023, "learning_rate": 4.591764866710093e-06, "loss": 0.5702, "step": 930 }, { "epoch": 0.54, "grad_norm": 2.052632552841863, "learning_rate": 4.577979840304446e-06, "loss": 0.5704, "step": 940 }, { "epoch": 0.54, "grad_norm": 2.07608686655621, "learning_rate": 4.563987335739216e-06, "loss": 0.5705, "step": 950 }, { "epoch": 0.55, "grad_norm": 2.144036474992449, "learning_rate": 4.549788750111256e-06, "loss": 0.584, "step": 960 }, { "epoch": 0.56, "grad_norm": 2.038070637205325, "learning_rate": 4.535385501093806e-06, "loss": 0.5709, "step": 970 }, { "epoch": 0.56, "grad_norm": 3.788413264230372, "learning_rate": 4.520779026794947e-06, "loss": 0.5787, "step": 980 }, { "epoch": 0.57, "grad_norm": 2.221280506319893, "learning_rate": 4.505970785614007e-06, "loss": 0.568, "step": 990 }, { "epoch": 0.57, "grad_norm": 2.1628156092927973, "learning_rate": 4.490962256095951e-06, "loss": 0.5616, "step": 1000 }, { "epoch": 0.57, "eval_loss": 0.5683727860450745, "eval_runtime": 531.1147, "eval_samples_per_second": 23.315, "eval_steps_per_second": 0.729, "step": 1000 }, { "epoch": 0.58, "grad_norm": 2.2129498718686396, "learning_rate": 4.475754936783749e-06, "loss": 0.569, "step": 1010 }, { "epoch": 0.58, "grad_norm": 2.1882249016919277, "learning_rate": 4.460350346068755e-06, "loss": 0.5649, "step": 1020 }, { "epoch": 0.59, "grad_norm": 2.3225486571596194, "learning_rate": 4.4447500220390994e-06, "loss": 0.5692, "step": 1030 }, { "epoch": 0.6, "grad_norm": 2.316022975291687, "learning_rate": 4.428955522326122e-06, "loss": 0.575, "step": 1040 }, { "epoch": 0.6, "grad_norm": 2.078276101778362, "learning_rate": 4.4129684239488415e-06, "loss": 0.5644, "step": 1050 }, { "epoch": 0.61, "grad_norm": 2.1582893234263714, "learning_rate": 4.396790323156503e-06, "loss": 0.5647, "step": 1060 }, { "epoch": 0.61, "grad_norm": 2.1862836513401933, "learning_rate": 4.380422835269193e-06, "loss": 0.5736, "step": 1070 }, { "epoch": 0.62, "grad_norm": 2.2841548421776636, "learning_rate": 4.363867594516561e-06, "loss": 0.5655, "step": 1080 }, { "epoch": 0.62, "grad_norm": 2.280570400516185, "learning_rate": 4.347126253874644e-06, "loss": 0.562, "step": 1090 }, { "epoch": 0.63, "grad_norm": 2.2076164515797547, "learning_rate": 4.3302004849008264e-06, "loss": 0.5641, "step": 1100 }, { "epoch": 0.64, "grad_norm": 2.360813863915769, "learning_rate": 4.313091977566938e-06, "loss": 0.5657, "step": 1110 }, { "epoch": 0.64, "grad_norm": 2.2467158945171457, "learning_rate": 4.2958024400905215e-06, "loss": 0.5626, "step": 1120 }, { "epoch": 0.65, "grad_norm": 2.1411390923548197, "learning_rate": 4.278333598764271e-06, "loss": 0.5471, "step": 1130 }, { "epoch": 0.65, "grad_norm": 2.1385307098732103, "learning_rate": 4.260687197783669e-06, "loss": 0.5581, "step": 1140 }, { "epoch": 0.66, "grad_norm": 2.212052709050042, "learning_rate": 4.242864999072832e-06, "loss": 0.556, "step": 1150 }, { "epoch": 0.66, "grad_norm": 2.210641112539053, "learning_rate": 4.224868782108595e-06, "loss": 0.5396, "step": 1160 }, { "epoch": 0.67, "grad_norm": 2.154384214473984, "learning_rate": 4.206700343742835e-06, "loss": 0.5499, "step": 1170 }, { "epoch": 0.68, "grad_norm": 2.3776492042765645, "learning_rate": 4.1883614980230625e-06, "loss": 0.5472, "step": 1180 }, { "epoch": 0.68, "grad_norm": 2.4769783683847133, "learning_rate": 4.169854076011293e-06, "loss": 0.5449, "step": 1190 }, { "epoch": 0.69, "grad_norm": 2.2374406025722604, "learning_rate": 4.151179925601228e-06, "loss": 0.5514, "step": 1200 }, { "epoch": 0.69, "grad_norm": 2.2566112951367314, "learning_rate": 4.13234091133375e-06, "loss": 0.5474, "step": 1210 }, { "epoch": 0.7, "grad_norm": 2.367681149601566, "learning_rate": 4.113338914210745e-06, "loss": 0.5539, "step": 1220 }, { "epoch": 0.7, "grad_norm": 2.1661561664676077, "learning_rate": 4.094175831507307e-06, "loss": 0.5537, "step": 1230 }, { "epoch": 0.71, "grad_norm": 2.2108776725962787, "learning_rate": 4.074853576582293e-06, "loss": 0.5384, "step": 1240 }, { "epoch": 0.72, "grad_norm": 2.437065860368966, "learning_rate": 4.055374078687283e-06, "loss": 0.5557, "step": 1250 }, { "epoch": 0.72, "grad_norm": 2.3757604501117604, "learning_rate": 4.03573928277395e-06, "loss": 0.5429, "step": 1260 }, { "epoch": 0.73, "grad_norm": 2.299133822726468, "learning_rate": 4.015951149299875e-06, "loss": 0.5373, "step": 1270 }, { "epoch": 0.73, "grad_norm": 2.192017012064702, "learning_rate": 3.996011654032785e-06, "loss": 0.5448, "step": 1280 }, { "epoch": 0.74, "grad_norm": 2.304906002469218, "learning_rate": 3.975922787853296e-06, "loss": 0.5394, "step": 1290 }, { "epoch": 0.74, "grad_norm": 2.436764853283025, "learning_rate": 3.955686556556123e-06, "loss": 0.5447, "step": 1300 }, { "epoch": 0.75, "grad_norm": 2.494565076053036, "learning_rate": 3.935304980649813e-06, "loss": 0.5272, "step": 1310 }, { "epoch": 0.76, "grad_norm": 2.228063884542341, "learning_rate": 3.914780095155003e-06, "loss": 0.5432, "step": 1320 }, { "epoch": 0.76, "grad_norm": 2.2231946750720732, "learning_rate": 3.894113949401233e-06, "loss": 0.5286, "step": 1330 }, { "epoch": 0.77, "grad_norm": 2.503855593624124, "learning_rate": 3.873308606822323e-06, "loss": 0.5427, "step": 1340 }, { "epoch": 0.77, "grad_norm": 2.350038662257984, "learning_rate": 3.852366144750358e-06, "loss": 0.5359, "step": 1350 }, { "epoch": 0.78, "grad_norm": 2.4984837734626075, "learning_rate": 3.831288654208264e-06, "loss": 0.5362, "step": 1360 }, { "epoch": 0.78, "grad_norm": 2.4092005158856806, "learning_rate": 3.810078239701033e-06, "loss": 0.5427, "step": 1370 }, { "epoch": 0.79, "grad_norm": 2.4167184400341757, "learning_rate": 3.788737019005593e-06, "loss": 0.5292, "step": 1380 }, { "epoch": 0.8, "grad_norm": 2.489261713837357, "learning_rate": 3.7672671229593617e-06, "loss": 0.5328, "step": 1390 }, { "epoch": 0.8, "grad_norm": 2.3416784142161937, "learning_rate": 3.7456706952474853e-06, "loss": 0.5337, "step": 1400 }, { "epoch": 0.81, "grad_norm": 2.3405171601297887, "learning_rate": 3.723949892188801e-06, "loss": 0.5306, "step": 1410 }, { "epoch": 0.81, "grad_norm": 2.5435746837026616, "learning_rate": 3.7021068825205385e-06, "loss": 0.536, "step": 1420 }, { "epoch": 0.82, "grad_norm": 2.517397794058062, "learning_rate": 3.680143847181783e-06, "loss": 0.5304, "step": 1430 }, { "epoch": 0.82, "grad_norm": 2.4432716242411665, "learning_rate": 3.6580629790957073e-06, "loss": 0.5304, "step": 1440 }, { "epoch": 0.83, "grad_norm": 2.930860876384952, "learning_rate": 3.635866482950628e-06, "loss": 0.5347, "step": 1450 }, { "epoch": 0.84, "grad_norm": 2.9347320732859865, "learning_rate": 3.613556574979869e-06, "loss": 0.5301, "step": 1460 }, { "epoch": 0.84, "grad_norm": 3.563032283293063, "learning_rate": 3.5911354827404817e-06, "loss": 0.5218, "step": 1470 }, { "epoch": 0.85, "grad_norm": 3.2275844302722705, "learning_rate": 3.568605444890832e-06, "loss": 0.5352, "step": 1480 }, { "epoch": 0.85, "grad_norm": 2.6488408673845845, "learning_rate": 3.545968710967079e-06, "loss": 0.5343, "step": 1490 }, { "epoch": 0.86, "grad_norm": 540.9225446614429, "learning_rate": 3.5232275411585665e-06, "loss": 0.5384, "step": 1500 }, { "epoch": 0.86, "eval_loss": 0.5287850499153137, "eval_runtime": 532.1842, "eval_samples_per_second": 23.268, "eval_steps_per_second": 0.727, "step": 1500 }, { "epoch": 0.86, "grad_norm": 8.321452242337378, "learning_rate": 3.500384206082155e-06, "loss": 0.5295, "step": 1510 }, { "epoch": 0.87, "grad_norm": 2.815363351399787, "learning_rate": 3.4774409865555064e-06, "loss": 0.54, "step": 1520 }, { "epoch": 0.88, "grad_norm": 2.64837752834866, "learning_rate": 3.4544001733693534e-06, "loss": 0.5205, "step": 1530 }, { "epoch": 0.88, "grad_norm": 2.876814676291678, "learning_rate": 3.4312640670587738e-06, "loss": 0.5263, "step": 1540 }, { "epoch": 0.89, "grad_norm": 2.8619136430308094, "learning_rate": 3.408034977673493e-06, "loss": 0.5272, "step": 1550 }, { "epoch": 0.89, "grad_norm": 2.5910982551975787, "learning_rate": 3.384715224547232e-06, "loss": 0.516, "step": 1560 }, { "epoch": 0.9, "grad_norm": 2.3973340910193968, "learning_rate": 3.36130713606613e-06, "loss": 0.5157, "step": 1570 }, { "epoch": 0.9, "grad_norm": 2.4753957257302983, "learning_rate": 3.337813049436268e-06, "loss": 0.5151, "step": 1580 }, { "epoch": 0.91, "grad_norm": 2.4327407017468077, "learning_rate": 3.3142353104503055e-06, "loss": 0.5113, "step": 1590 }, { "epoch": 0.92, "grad_norm": 2.583443162287294, "learning_rate": 3.290576273253262e-06, "loss": 0.5137, "step": 1600 }, { "epoch": 0.92, "grad_norm": 2.8083503640404754, "learning_rate": 3.2668383001074646e-06, "loss": 0.5139, "step": 1610 }, { "epoch": 0.93, "grad_norm": 2.561938411287161, "learning_rate": 3.2430237611566874e-06, "loss": 0.5087, "step": 1620 }, { "epoch": 0.93, "grad_norm": 4.820890836324099, "learning_rate": 3.2191350341895012e-06, "loss": 0.5175, "step": 1630 }, { "epoch": 0.94, "grad_norm": 4.206676693632578, "learning_rate": 3.1951745044018584e-06, "loss": 0.5271, "step": 1640 }, { "epoch": 0.94, "grad_norm": 25.523830030080394, "learning_rate": 3.1711445641589435e-06, "loss": 0.5622, "step": 1650 }, { "epoch": 0.95, "grad_norm": 11.240090997530515, "learning_rate": 3.147047612756302e-06, "loss": 0.9079, "step": 1660 }, { "epoch": 0.96, "grad_norm": 29.22518153420536, "learning_rate": 3.122886056180284e-06, "loss": 0.7066, "step": 1670 }, { "epoch": 0.96, "grad_norm": 4.019885752857641, "learning_rate": 3.098662306867811e-06, "loss": 0.6342, "step": 1680 }, { "epoch": 0.97, "grad_norm": 5.411201020641005, "learning_rate": 3.074378783465507e-06, "loss": 0.6105, "step": 1690 }, { "epoch": 0.97, "grad_norm": 8.822887992474177, "learning_rate": 3.050037910588206e-06, "loss": 0.6115, "step": 1700 }, { "epoch": 0.98, "grad_norm": 5.0785295408950155, "learning_rate": 3.0256421185768616e-06, "loss": 0.5663, "step": 1710 }, { "epoch": 0.98, "grad_norm": 2.9678315863647526, "learning_rate": 3.0011938432558873e-06, "loss": 0.5802, "step": 1720 }, { "epoch": 0.99, "grad_norm": 3.412236320518679, "learning_rate": 2.976695525689952e-06, "loss": 0.5684, "step": 1730 }, { "epoch": 1.0, "grad_norm": 3.2044150474863655, "learning_rate": 2.9521496119402453e-06, "loss": 0.5535, "step": 1740 }, { "epoch": 1.0, "grad_norm": 3.9473879559645852, "learning_rate": 2.9275585528202516e-06, "loss": 0.5314, "step": 1750 }, { "epoch": 1.01, "grad_norm": 3.2348412490515455, "learning_rate": 2.9029248036510406e-06, "loss": 0.4765, "step": 1760 }, { "epoch": 1.01, "grad_norm": 3.0685208563178854, "learning_rate": 2.87825082401612e-06, "loss": 0.4748, "step": 1770 }, { "epoch": 1.02, "grad_norm": 2.852153504448215, "learning_rate": 2.8535390775158495e-06, "loss": 0.47, "step": 1780 }, { "epoch": 1.02, "grad_norm": 4.143044242647944, "learning_rate": 2.8287920315214647e-06, "loss": 0.4573, "step": 1790 }, { "epoch": 1.03, "grad_norm": 34.463143285380006, "learning_rate": 2.8040121569287155e-06, "loss": 0.4584, "step": 1800 }, { "epoch": 1.04, "grad_norm": 2.661591324494075, "learning_rate": 2.7792019279111604e-06, "loss": 0.4471, "step": 1810 }, { "epoch": 1.04, "grad_norm": 2.6072582596419567, "learning_rate": 2.7543638216731284e-06, "loss": 0.4489, "step": 1820 }, { "epoch": 1.05, "grad_norm": 2.8084493663354397, "learning_rate": 2.7295003182023794e-06, "loss": 0.4476, "step": 1830 }, { "epoch": 1.05, "grad_norm": 2.8081294693878798, "learning_rate": 2.7046139000224882e-06, "loss": 0.4527, "step": 1840 }, { "epoch": 1.06, "grad_norm": 2.8187716457470255, "learning_rate": 2.6797070519449735e-06, "loss": 0.4371, "step": 1850 }, { "epoch": 1.06, "grad_norm": 2.6664541594312734, "learning_rate": 2.654782260821198e-06, "loss": 0.4489, "step": 1860 }, { "epoch": 1.07, "grad_norm": 3.1660587876790207, "learning_rate": 2.6298420152940706e-06, "loss": 0.4464, "step": 1870 }, { "epoch": 1.08, "grad_norm": 2.578632187270232, "learning_rate": 2.6048888055495586e-06, "loss": 0.4436, "step": 1880 }, { "epoch": 1.08, "grad_norm": 2.898387161362659, "learning_rate": 2.5799251230680572e-06, "loss": 0.4373, "step": 1890 }, { "epoch": 1.09, "grad_norm": 2.675237350045505, "learning_rate": 2.554953460375624e-06, "loss": 0.4524, "step": 1900 }, { "epoch": 1.09, "grad_norm": 2.714776711198956, "learning_rate": 2.529976310795108e-06, "loss": 0.4321, "step": 1910 }, { "epoch": 1.1, "grad_norm": 2.710787483431792, "learning_rate": 2.5049961681972035e-06, "loss": 0.4319, "step": 1920 }, { "epoch": 1.1, "grad_norm": 2.6437855709831743, "learning_rate": 2.4800155267514455e-06, "loss": 0.431, "step": 1930 }, { "epoch": 1.11, "grad_norm": 2.566308240056163, "learning_rate": 2.455036880677177e-06, "loss": 0.4358, "step": 1940 }, { "epoch": 1.12, "grad_norm": 2.6611983864059927, "learning_rate": 2.430062723994513e-06, "loss": 0.4343, "step": 1950 }, { "epoch": 1.12, "grad_norm": 2.6472716826792393, "learning_rate": 2.4050955502753164e-06, "loss": 0.4481, "step": 1960 }, { "epoch": 1.13, "grad_norm": 2.698904335701275, "learning_rate": 2.38013785239423e-06, "loss": 0.4314, "step": 1970 }, { "epoch": 1.13, "grad_norm": 2.5697229032197413, "learning_rate": 2.3551921222797736e-06, "loss": 0.4374, "step": 1980 }, { "epoch": 1.14, "grad_norm": 2.9083035166905145, "learning_rate": 2.330260850665529e-06, "loss": 0.4311, "step": 1990 }, { "epoch": 1.14, "grad_norm": 2.864477813697365, "learning_rate": 2.305346526841456e-06, "loss": 0.4248, "step": 2000 }, { "epoch": 1.14, "eval_loss": 0.5098316669464111, "eval_runtime": 531.7861, "eval_samples_per_second": 23.286, "eval_steps_per_second": 0.728, "step": 2000 }, { "epoch": 1.15, "grad_norm": 2.487093905262324, "learning_rate": 2.2804516384053422e-06, "loss": 0.4278, "step": 2010 }, { "epoch": 1.16, "grad_norm": 2.4697444658310115, "learning_rate": 2.2555786710144274e-06, "loss": 0.4185, "step": 2020 }, { "epoch": 1.16, "grad_norm": 2.7888802934203376, "learning_rate": 2.2307301081372228e-06, "loss": 0.4205, "step": 2030 }, { "epoch": 1.17, "grad_norm": 2.7782246570408655, "learning_rate": 2.205908430805542e-06, "loss": 0.4186, "step": 2040 }, { "epoch": 1.17, "grad_norm": 2.855124513920933, "learning_rate": 2.181116117366783e-06, "loss": 0.4287, "step": 2050 }, { "epoch": 1.18, "grad_norm": 2.5830367229868982, "learning_rate": 2.1563556432364754e-06, "loss": 0.4266, "step": 2060 }, { "epoch": 1.18, "grad_norm": 2.7855942926027617, "learning_rate": 2.131629480651116e-06, "loss": 0.4169, "step": 2070 }, { "epoch": 1.19, "grad_norm": 2.625295284188482, "learning_rate": 2.1069400984213282e-06, "loss": 0.4189, "step": 2080 }, { "epoch": 1.2, "grad_norm": 2.656457764922864, "learning_rate": 2.0822899616853643e-06, "loss": 0.4393, "step": 2090 }, { "epoch": 1.2, "grad_norm": 2.3861057485273998, "learning_rate": 2.0576815316629615e-06, "loss": 0.4137, "step": 2100 }, { "epoch": 1.21, "grad_norm": 2.775579784902102, "learning_rate": 2.033117265409611e-06, "loss": 0.4107, "step": 2110 }, { "epoch": 1.21, "grad_norm": 2.7447136775018057, "learning_rate": 2.0085996155712205e-06, "loss": 0.4098, "step": 2120 }, { "epoch": 1.22, "grad_norm": 2.7615464883658265, "learning_rate": 1.984131030139231e-06, "loss": 0.4202, "step": 2130 }, { "epoch": 1.22, "grad_norm": 3.0966486839292684, "learning_rate": 1.9597139522061977e-06, "loss": 0.4307, "step": 2140 }, { "epoch": 1.23, "grad_norm": 2.596350491119526, "learning_rate": 1.9353508197218492e-06, "loss": 0.412, "step": 2150 }, { "epoch": 1.24, "grad_norm": 2.376403726879793, "learning_rate": 1.911044065249674e-06, "loss": 0.4115, "step": 2160 }, { "epoch": 1.24, "grad_norm": 2.77907836700564, "learning_rate": 1.886796115724039e-06, "loss": 0.4191, "step": 2170 }, { "epoch": 1.25, "grad_norm": 2.361792411282615, "learning_rate": 1.8626093922078614e-06, "loss": 0.4135, "step": 2180 }, { "epoch": 1.25, "grad_norm": 2.721607968377175, "learning_rate": 1.838486309650888e-06, "loss": 0.4221, "step": 2190 }, { "epoch": 1.26, "grad_norm": 2.6544964211718334, "learning_rate": 1.8144292766485588e-06, "loss": 0.411, "step": 2200 }, { "epoch": 1.26, "grad_norm": 2.3573667729775147, "learning_rate": 1.7904406952015258e-06, "loss": 0.4098, "step": 2210 }, { "epoch": 1.27, "grad_norm": 2.8700396902241354, "learning_rate": 1.7665229604758237e-06, "loss": 0.4163, "step": 2220 }, { "epoch": 1.28, "grad_norm": 3.1872115128374534, "learning_rate": 1.7426784605637153e-06, "loss": 0.4098, "step": 2230 }, { "epoch": 1.28, "grad_norm": 2.5720647224489075, "learning_rate": 1.7189095762452552e-06, "loss": 0.419, "step": 2240 }, { "epoch": 1.29, "grad_norm": 2.7721403217588594, "learning_rate": 1.6952186807505778e-06, "loss": 0.4201, "step": 2250 }, { "epoch": 1.29, "grad_norm": 3.3795392317416413, "learning_rate": 1.6716081395229374e-06, "loss": 0.4175, "step": 2260 }, { "epoch": 1.3, "grad_norm": 2.792505110856509, "learning_rate": 1.648080309982528e-06, "loss": 0.4047, "step": 2270 }, { "epoch": 1.3, "grad_norm": 2.5045890389309537, "learning_rate": 1.624637541291108e-06, "loss": 0.4139, "step": 2280 }, { "epoch": 1.31, "grad_norm": 2.5076429338592066, "learning_rate": 1.6012821741174394e-06, "loss": 0.4166, "step": 2290 }, { "epoch": 1.32, "grad_norm": 2.6078986626262983, "learning_rate": 1.5780165404035875e-06, "loss": 0.4128, "step": 2300 }, { "epoch": 1.32, "grad_norm": 2.5396271208193713, "learning_rate": 1.5548429631320804e-06, "loss": 0.4236, "step": 2310 }, { "epoch": 1.33, "grad_norm": 2.4770713027568325, "learning_rate": 1.5317637560939717e-06, "loss": 0.4105, "step": 2320 }, { "epoch": 1.33, "grad_norm": 2.7342907090878397, "learning_rate": 1.5087812236578158e-06, "loss": 0.4097, "step": 2330 }, { "epoch": 1.34, "grad_norm": 2.474686962501904, "learning_rate": 1.4858976605395858e-06, "loss": 0.4028, "step": 2340 }, { "epoch": 1.34, "grad_norm": 2.541050407831277, "learning_rate": 1.4631153515735566e-06, "loss": 0.4092, "step": 2350 }, { "epoch": 1.35, "grad_norm": 2.3319036310065915, "learning_rate": 1.4404365714841717e-06, "loss": 0.4095, "step": 2360 }, { "epoch": 1.36, "grad_norm": 2.8467247569785523, "learning_rate": 1.4178635846589195e-06, "loss": 0.404, "step": 2370 }, { "epoch": 1.36, "grad_norm": 2.6269760063806475, "learning_rate": 1.395398644922249e-06, "loss": 0.4165, "step": 2380 }, { "epoch": 1.37, "grad_norm": 2.7166759530722486, "learning_rate": 1.3730439953105242e-06, "loss": 0.4018, "step": 2390 }, { "epoch": 1.37, "grad_norm": 2.4008152588750478, "learning_rate": 1.350801867848076e-06, "loss": 0.4027, "step": 2400 }, { "epoch": 1.38, "grad_norm": 2.681524401478661, "learning_rate": 1.3286744833243373e-06, "loss": 0.4058, "step": 2410 }, { "epoch": 1.38, "grad_norm": 2.4832885931873934, "learning_rate": 1.306664051072108e-06, "loss": 0.4101, "step": 2420 }, { "epoch": 1.39, "grad_norm": 2.9652476007510726, "learning_rate": 1.2847727687469573e-06, "loss": 0.411, "step": 2430 }, { "epoch": 1.4, "grad_norm": 2.8095821252599187, "learning_rate": 1.2630028221078083e-06, "loss": 0.4131, "step": 2440 }, { "epoch": 1.4, "grad_norm": 2.5494893937849428, "learning_rate": 1.2413563847986814e-06, "loss": 0.4092, "step": 2450 }, { "epoch": 1.41, "grad_norm": 2.725138704030533, "learning_rate": 1.2198356181316793e-06, "loss": 0.4091, "step": 2460 }, { "epoch": 1.41, "grad_norm": 2.39868053795732, "learning_rate": 1.1984426708711804e-06, "loss": 0.3978, "step": 2470 }, { "epoch": 1.42, "grad_norm": 2.488515661168399, "learning_rate": 1.1771796790192963e-06, "loss": 0.403, "step": 2480 }, { "epoch": 1.42, "grad_norm": 2.5950371989879493, "learning_rate": 1.1560487656025985e-06, "loss": 0.3983, "step": 2490 }, { "epoch": 1.43, "grad_norm": 2.8164587473243325, "learning_rate": 1.1350520404601445e-06, "loss": 0.3969, "step": 2500 }, { "epoch": 1.43, "eval_loss": 0.48087671399116516, "eval_runtime": 531.108, "eval_samples_per_second": 23.315, "eval_steps_per_second": 0.729, "step": 2500 }, { "epoch": 1.44, "grad_norm": 2.6911969387062875, "learning_rate": 1.1141916000328152e-06, "loss": 0.4027, "step": 2510 }, { "epoch": 1.44, "grad_norm": 2.476764330966686, "learning_rate": 1.0934695271539975e-06, "loss": 0.4074, "step": 2520 }, { "epoch": 1.45, "grad_norm": 2.5619945448411605, "learning_rate": 1.0728878908416194e-06, "loss": 0.4083, "step": 2530 }, { "epoch": 1.45, "grad_norm": 2.9402161507985296, "learning_rate": 1.0524487460915675e-06, "loss": 0.3922, "step": 2540 }, { "epoch": 1.46, "grad_norm": 2.5346303173308335, "learning_rate": 1.0321541336725027e-06, "loss": 0.4034, "step": 2550 }, { "epoch": 1.46, "grad_norm": 2.3264995133931476, "learning_rate": 1.0120060799220976e-06, "loss": 0.3961, "step": 2560 }, { "epoch": 1.47, "grad_norm": 2.3682299479571474, "learning_rate": 9.920065965447167e-07, "loss": 0.407, "step": 2570 }, { "epoch": 1.48, "grad_norm": 2.4443777701467524, "learning_rate": 9.721576804105507e-07, "loss": 0.3965, "step": 2580 }, { "epoch": 1.48, "grad_norm": 2.5713687903588864, "learning_rate": 9.524613133562422e-07, "loss": 0.3996, "step": 2590 }, { "epoch": 1.49, "grad_norm": 2.4771454646234545, "learning_rate": 9.329194619870047e-07, "loss": 0.4024, "step": 2600 }, { "epoch": 1.49, "grad_norm": 2.6576827146080233, "learning_rate": 9.135340774802656e-07, "loss": 0.4005, "step": 2610 }, { "epoch": 1.5, "grad_norm": 2.5894019593645274, "learning_rate": 8.943070953908448e-07, "loss": 0.4028, "step": 2620 }, { "epoch": 1.51, "grad_norm": 2.514759035169968, "learning_rate": 8.752404354577052e-07, "loss": 0.4059, "step": 2630 }, { "epoch": 1.51, "grad_norm": 2.5607304476769044, "learning_rate": 8.563360014122648e-07, "loss": 0.4039, "step": 2640 }, { "epoch": 1.52, "grad_norm": 5.14637237359729, "learning_rate": 8.375956807883245e-07, "loss": 0.3901, "step": 2650 }, { "epoch": 1.52, "grad_norm": 2.2734216891063395, "learning_rate": 8.190213447336001e-07, "loss": 0.3899, "step": 2660 }, { "epoch": 1.53, "grad_norm": 2.471190008443679, "learning_rate": 8.006148478228986e-07, "loss": 0.4055, "step": 2670 }, { "epoch": 1.53, "grad_norm": 2.71999964571031, "learning_rate": 7.823780278729443e-07, "loss": 0.3961, "step": 2680 }, { "epoch": 1.54, "grad_norm": 2.418310205284732, "learning_rate": 7.643127057588811e-07, "loss": 0.3813, "step": 2690 }, { "epoch": 1.55, "grad_norm": 2.5665802451050044, "learning_rate": 7.464206852324624e-07, "loss": 0.3927, "step": 2700 }, { "epoch": 1.55, "grad_norm": 2.7770319022822343, "learning_rate": 7.287037527419605e-07, "loss": 0.3946, "step": 2710 }, { "epoch": 1.56, "grad_norm": 2.549475746375355, "learning_rate": 7.111636772537875e-07, "loss": 0.3915, "step": 2720 }, { "epoch": 1.56, "grad_norm": 2.688664419149691, "learning_rate": 6.938022100758782e-07, "loss": 0.4044, "step": 2730 }, { "epoch": 1.57, "grad_norm": 2.5993657832655788, "learning_rate": 6.766210846828256e-07, "loss": 0.3957, "step": 2740 }, { "epoch": 1.57, "grad_norm": 2.5309747823008215, "learning_rate": 6.596220165428002e-07, "loss": 0.3962, "step": 2750 }, { "epoch": 1.58, "grad_norm": 2.6144066056011517, "learning_rate": 6.428067029462693e-07, "loss": 0.3912, "step": 2760 }, { "epoch": 1.59, "grad_norm": 2.7527841475063357, "learning_rate": 6.261768228365267e-07, "loss": 0.3987, "step": 2770 }, { "epoch": 1.59, "grad_norm": 2.432371850837855, "learning_rate": 6.097340366420562e-07, "loss": 0.4078, "step": 2780 }, { "epoch": 1.6, "grad_norm": 2.625005423707898, "learning_rate": 5.934799861107507e-07, "loss": 0.3918, "step": 2790 }, { "epoch": 1.6, "grad_norm": 2.4813103577109326, "learning_rate": 5.774162941459802e-07, "loss": 0.3925, "step": 2800 }, { "epoch": 1.61, "grad_norm": 2.9410397977745215, "learning_rate": 5.615445646445586e-07, "loss": 0.3884, "step": 2810 }, { "epoch": 1.61, "grad_norm": 2.3507919061555858, "learning_rate": 5.45866382336599e-07, "loss": 0.389, "step": 2820 }, { "epoch": 1.62, "grad_norm": 2.419134938253293, "learning_rate": 5.303833126272837e-07, "loss": 0.3911, "step": 2830 }, { "epoch": 1.63, "grad_norm": 2.705274207526381, "learning_rate": 5.150969014405652e-07, "loss": 0.3806, "step": 2840 }, { "epoch": 1.63, "grad_norm": 2.818214976762935, "learning_rate": 5.000086750648125e-07, "loss": 0.3992, "step": 2850 }, { "epoch": 1.64, "grad_norm": 2.6412303699783144, "learning_rate": 4.851201400004133e-07, "loss": 0.3982, "step": 2860 }, { "epoch": 1.64, "grad_norm": 2.557106526975151, "learning_rate": 4.704327828093641e-07, "loss": 0.389, "step": 2870 }, { "epoch": 1.65, "grad_norm": 2.654458887909101, "learning_rate": 4.5594806996683256e-07, "loss": 0.3956, "step": 2880 }, { "epoch": 1.65, "grad_norm": 2.3963528740076225, "learning_rate": 4.4166744771474367e-07, "loss": 0.3923, "step": 2890 }, { "epoch": 1.66, "grad_norm": 2.6404819468490524, "learning_rate": 4.2759234191737443e-07, "loss": 0.3873, "step": 2900 }, { "epoch": 1.67, "grad_norm": 2.430447706914984, "learning_rate": 4.1372415791898794e-07, "loss": 0.3892, "step": 2910 }, { "epoch": 1.67, "grad_norm": 2.72283555848891, "learning_rate": 4.000642804035132e-07, "loss": 0.3972, "step": 2920 }, { "epoch": 1.68, "grad_norm": 48.58377587713292, "learning_rate": 3.8661407325629533e-07, "loss": 0.4079, "step": 2930 }, { "epoch": 1.68, "grad_norm": 2.624060213750708, "learning_rate": 3.7337487942790964e-07, "loss": 0.3892, "step": 2940 }, { "epoch": 1.69, "grad_norm": 2.9699815624549677, "learning_rate": 3.6034802080007814e-07, "loss": 0.3999, "step": 2950 }, { "epoch": 1.69, "grad_norm": 3.25458088141065, "learning_rate": 3.475347980536842e-07, "loss": 0.385, "step": 2960 }, { "epoch": 1.7, "grad_norm": 2.445550971601129, "learning_rate": 3.3493649053890325e-07, "loss": 0.3929, "step": 2970 }, { "epoch": 1.71, "grad_norm": 2.5114168114259887, "learning_rate": 3.225543561474667e-07, "loss": 0.3998, "step": 2980 }, { "epoch": 1.71, "grad_norm": 2.9780387761779252, "learning_rate": 3.1038963118706246e-07, "loss": 0.397, "step": 2990 }, { "epoch": 1.72, "grad_norm": 2.5876649071384636, "learning_rate": 2.984435302578992e-07, "loss": 0.3933, "step": 3000 }, { "epoch": 1.72, "eval_loss": 0.466826856136322, "eval_runtime": 531.3559, "eval_samples_per_second": 23.305, "eval_steps_per_second": 0.728, "step": 3000 }, { "epoch": 1.72, "grad_norm": 2.856546256466058, "learning_rate": 2.8671724613143106e-07, "loss": 0.3843, "step": 3010 }, { "epoch": 1.73, "grad_norm": 2.65010522081742, "learning_rate": 2.7521194963126387e-07, "loss": 0.3928, "step": 3020 }, { "epoch": 1.73, "grad_norm": 4.365159018177608, "learning_rate": 2.639287895162526e-07, "loss": 0.3842, "step": 3030 }, { "epoch": 1.74, "grad_norm": 2.5069662246327655, "learning_rate": 2.5286889236580485e-07, "loss": 0.3919, "step": 3040 }, { "epoch": 1.75, "grad_norm": 2.44175870329633, "learning_rate": 2.420333624673912e-07, "loss": 0.3858, "step": 3050 }, { "epoch": 1.75, "grad_norm": 2.408625264190103, "learning_rate": 2.314232817062939e-07, "loss": 0.3847, "step": 3060 }, { "epoch": 1.76, "grad_norm": 2.6673150680165176, "learning_rate": 2.2103970945757668e-07, "loss": 0.3912, "step": 3070 }, { "epoch": 1.76, "grad_norm": 3.416210049564414, "learning_rate": 2.1088368248031703e-07, "loss": 0.3887, "step": 3080 }, { "epoch": 1.77, "grad_norm": 2.4831114598380104, "learning_rate": 2.0095621481408535e-07, "loss": 0.3847, "step": 3090 }, { "epoch": 1.77, "grad_norm": 2.4518249271789823, "learning_rate": 1.9125829767770026e-07, "loss": 0.3869, "step": 3100 }, { "epoch": 1.78, "grad_norm": 2.431921711992356, "learning_rate": 1.817908993702555e-07, "loss": 0.4038, "step": 3110 }, { "epoch": 1.79, "grad_norm": 2.401086232330709, "learning_rate": 1.725549651744446e-07, "loss": 0.3896, "step": 3120 }, { "epoch": 1.79, "grad_norm": 2.8030599868787345, "learning_rate": 1.6355141726217132e-07, "loss": 0.4034, "step": 3130 }, { "epoch": 1.8, "grad_norm": 2.5196493937058126, "learning_rate": 1.547811546024816e-07, "loss": 0.3898, "step": 3140 }, { "epoch": 1.8, "grad_norm": 2.532168588674458, "learning_rate": 1.4624505287179818e-07, "loss": 0.3812, "step": 3150 }, { "epoch": 1.81, "grad_norm": 2.9831369418646254, "learning_rate": 1.3794396436649205e-07, "loss": 0.396, "step": 3160 }, { "epoch": 1.81, "grad_norm": 2.714395312518003, "learning_rate": 1.298787179177835e-07, "loss": 0.384, "step": 3170 }, { "epoch": 1.82, "grad_norm": 2.4865221680081038, "learning_rate": 1.2205011880898565e-07, "loss": 0.3841, "step": 3180 }, { "epoch": 1.83, "grad_norm": 2.4597701864630026, "learning_rate": 1.1445894869509866e-07, "loss": 0.3927, "step": 3190 }, { "epoch": 1.83, "grad_norm": 2.4358583132347906, "learning_rate": 1.0710596552476915e-07, "loss": 0.392, "step": 3200 }, { "epoch": 1.84, "grad_norm": 2.386461680357716, "learning_rate": 9.999190346460602e-08, "loss": 0.385, "step": 3210 }, { "epoch": 1.84, "grad_norm": 2.780440529108891, "learning_rate": 9.31174728258824e-08, "loss": 0.3833, "step": 3220 }, { "epoch": 1.85, "grad_norm": 2.474009294682335, "learning_rate": 8.648335999360935e-08, "loss": 0.3824, "step": 3230 }, { "epoch": 1.85, "grad_norm": 2.409537607728227, "learning_rate": 8.009022735800537e-08, "loss": 0.3888, "step": 3240 }, { "epoch": 1.86, "grad_norm": 2.4245907679021195, "learning_rate": 7.39387132483596e-08, "loss": 0.3856, "step": 3250 }, { "epoch": 1.87, "grad_norm": 2.692749484502572, "learning_rate": 6.802943186929617e-08, "loss": 0.3803, "step": 3260 }, { "epoch": 1.87, "grad_norm": 2.4379196195711215, "learning_rate": 6.236297323944679e-08, "loss": 0.3831, "step": 3270 }, { "epoch": 1.88, "grad_norm": 3.048508349939611, "learning_rate": 5.693990313254433e-08, "loss": 0.397, "step": 3280 }, { "epoch": 1.88, "grad_norm": 2.887339309122361, "learning_rate": 5.1760763020927143e-08, "loss": 0.3893, "step": 3290 }, { "epoch": 1.89, "grad_norm": 2.3788921808030072, "learning_rate": 4.682607002147899e-08, "loss": 0.3954, "step": 3300 }, { "epoch": 1.89, "grad_norm": 2.57063336475747, "learning_rate": 4.2136316843994775e-08, "loss": 0.3891, "step": 3310 }, { "epoch": 1.9, "grad_norm": 2.4304903487638803, "learning_rate": 3.769197174198658e-08, "loss": 0.3879, "step": 3320 }, { "epoch": 1.91, "grad_norm": 2.637983143379398, "learning_rate": 3.349347846593021e-08, "loss": 0.3946, "step": 3330 }, { "epoch": 1.91, "grad_norm": 2.3414639863414073, "learning_rate": 2.954125621895787e-08, "loss": 0.4075, "step": 3340 }, { "epoch": 1.92, "grad_norm": 2.528267339071513, "learning_rate": 2.5835699615002764e-08, "loss": 0.3892, "step": 3350 }, { "epoch": 1.92, "grad_norm": 2.7385307714857325, "learning_rate": 2.2377178639398368e-08, "loss": 0.3905, "step": 3360 }, { "epoch": 1.93, "grad_norm": 2.4232300548313512, "learning_rate": 1.9166038611937166e-08, "loss": 0.3924, "step": 3370 }, { "epoch": 1.93, "grad_norm": 2.5681127030737065, "learning_rate": 1.620260015239128e-08, "loss": 0.3973, "step": 3380 }, { "epoch": 1.94, "grad_norm": 2.514711802598368, "learning_rate": 1.3487159148500572e-08, "loss": 0.3837, "step": 3390 }, { "epoch": 1.95, "grad_norm": 2.8751043735648554, "learning_rate": 1.1019986726427955e-08, "loss": 0.3877, "step": 3400 }, { "epoch": 1.95, "grad_norm": 2.526450637506157, "learning_rate": 8.80132922369048e-09, "loss": 0.3747, "step": 3410 }, { "epoch": 1.96, "grad_norm": 2.538792948888959, "learning_rate": 6.831408164561792e-09, "loss": 0.3864, "step": 3420 }, { "epoch": 1.96, "grad_norm": 2.5203237951269233, "learning_rate": 5.1104202379553825e-09, "loss": 0.3909, "step": 3430 }, { "epoch": 1.97, "grad_norm": 2.4438999348564607, "learning_rate": 3.6385372777839046e-09, "loss": 0.3898, "step": 3440 }, { "epoch": 1.97, "grad_norm": 2.552607870699825, "learning_rate": 2.415906245804289e-09, "loss": 0.3792, "step": 3450 }, { "epoch": 1.98, "grad_norm": 2.601255703179851, "learning_rate": 1.4426492169430929e-09, "loss": 0.3942, "step": 3460 }, { "epoch": 1.99, "grad_norm": 2.4765130524935453, "learning_rate": 7.188633671079137e-10, "loss": 0.3937, "step": 3470 }, { "epoch": 1.99, "grad_norm": 2.4315037049643573, "learning_rate": 2.4462096348515373e-10, "loss": 0.3962, "step": 3480 }, { "epoch": 2.0, "grad_norm": 2.3352186798660193, "learning_rate": 1.996935732384575e-11, "loss": 0.3873, "step": 3490 }, { "epoch": 2.0, "step": 3494, "total_flos": 731467827118080.0, "train_loss": 0.5086251636130099, "train_runtime": 42324.6962, "train_samples_per_second": 5.284, "train_steps_per_second": 0.083 } ], "logging_steps": 10, "max_steps": 3494, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 731467827118080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }