{ "best_metric": null, "best_model_checkpoint": null, "epoch": 76.92307692307692, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3076923076923077, "grad_norm": 5.159167766571045, "learning_rate": 7.8125e-08, "loss": 4.2745, "step": 10 }, { "epoch": 0.6153846153846154, "grad_norm": 5.482397079467773, "learning_rate": 1.5625e-07, "loss": 4.2677, "step": 20 }, { "epoch": 0.9230769230769231, "grad_norm": 5.341490268707275, "learning_rate": 2.3437500000000003e-07, "loss": 4.2277, "step": 30 }, { "epoch": 1.2307692307692308, "grad_norm": 5.534327983856201, "learning_rate": 3.125e-07, "loss": 4.2715, "step": 40 }, { "epoch": 1.5384615384615383, "grad_norm": 5.407191753387451, "learning_rate": 3.90625e-07, "loss": 4.2371, "step": 50 }, { "epoch": 1.8461538461538463, "grad_norm": 5.150612831115723, "learning_rate": 4.6875000000000006e-07, "loss": 4.1953, "step": 60 }, { "epoch": 2.1538461538461537, "grad_norm": 5.620100021362305, "learning_rate": 5.468750000000001e-07, "loss": 4.2908, "step": 70 }, { "epoch": 2.4615384615384617, "grad_norm": 5.80696964263916, "learning_rate": 6.25e-07, "loss": 4.2603, "step": 80 }, { "epoch": 2.769230769230769, "grad_norm": 5.940330982208252, "learning_rate": 7.03125e-07, "loss": 4.2049, "step": 90 }, { "epoch": 3.076923076923077, "grad_norm": 5.8099141120910645, "learning_rate": 7.8125e-07, "loss": 4.1638, "step": 100 }, { "epoch": 3.3846153846153846, "grad_norm": 5.388895511627197, "learning_rate": 8.59375e-07, "loss": 4.1409, "step": 110 }, { "epoch": 3.6923076923076925, "grad_norm": 5.919471263885498, "learning_rate": 9.375000000000001e-07, "loss": 4.1658, "step": 120 }, { "epoch": 4.0, "grad_norm": 13097.25390625, "learning_rate": 1.0156250000000001e-06, "loss": 4.1936, "step": 130 }, { "epoch": 4.3076923076923075, "grad_norm": 4.802829265594482, "learning_rate": 1.0937500000000001e-06, "loss": 4.021, "step": 140 }, { "epoch": 4.615384615384615, "grad_norm": 5.628367900848389, "learning_rate": 1.1718750000000001e-06, "loss": 4.019, "step": 150 }, { "epoch": 4.923076923076923, "grad_norm": 5.601304531097412, "learning_rate": 1.25e-06, "loss": 3.9075, "step": 160 }, { "epoch": 5.230769230769231, "grad_norm": 5.029603004455566, "learning_rate": 1.328125e-06, "loss": 3.8952, "step": 170 }, { "epoch": 5.538461538461538, "grad_norm": 5.625918388366699, "learning_rate": 1.40625e-06, "loss": 3.7809, "step": 180 }, { "epoch": 5.846153846153846, "grad_norm": 4.302311420440674, "learning_rate": 1.484375e-06, "loss": 3.5823, "step": 190 }, { "epoch": 6.153846153846154, "grad_norm": 4.250982284545898, "learning_rate": 1.5625e-06, "loss": 3.5112, "step": 200 }, { "epoch": 6.461538461538462, "grad_norm": 3.137059211730957, "learning_rate": 1.640625e-06, "loss": 3.3867, "step": 210 }, { "epoch": 6.769230769230769, "grad_norm": 3.2033724784851074, "learning_rate": 1.71875e-06, "loss": 3.2788, "step": 220 }, { "epoch": 7.076923076923077, "grad_norm": 2.8167309761047363, "learning_rate": 1.796875e-06, "loss": 3.1566, "step": 230 }, { "epoch": 7.384615384615385, "grad_norm": 2.167381525039673, "learning_rate": 1.8750000000000003e-06, "loss": 2.9642, "step": 240 }, { "epoch": 7.6923076923076925, "grad_norm": 2.2277944087982178, "learning_rate": 1.953125e-06, "loss": 2.8886, "step": 250 }, { "epoch": 8.0, "grad_norm": 60692.35546875, "learning_rate": 2.0312500000000002e-06, "loss": 2.7726, "step": 260 }, { "epoch": 8.307692307692308, "grad_norm": 1.6853564977645874, "learning_rate": 2.109375e-06, "loss": 2.7062, "step": 270 }, { "epoch": 8.615384615384615, "grad_norm": 1.5454535484313965, "learning_rate": 2.1875000000000002e-06, "loss": 2.5508, "step": 280 }, { "epoch": 8.923076923076923, "grad_norm": 1.2037118673324585, "learning_rate": 2.265625e-06, "loss": 2.4639, "step": 290 }, { "epoch": 9.23076923076923, "grad_norm": 1.0261240005493164, "learning_rate": 2.3437500000000002e-06, "loss": 2.4103, "step": 300 }, { "epoch": 9.538461538461538, "grad_norm": 0.9358808994293213, "learning_rate": 2.421875e-06, "loss": 2.3032, "step": 310 }, { "epoch": 9.846153846153847, "grad_norm": 0.7383924722671509, "learning_rate": 2.5e-06, "loss": 2.3002, "step": 320 }, { "epoch": 10.153846153846153, "grad_norm": 0.685702383518219, "learning_rate": 2.5781250000000004e-06, "loss": 2.2148, "step": 330 }, { "epoch": 10.461538461538462, "grad_norm": 0.6645168662071228, "learning_rate": 2.65625e-06, "loss": 2.1989, "step": 340 }, { "epoch": 10.76923076923077, "grad_norm": 0.7011102437973022, "learning_rate": 2.7343750000000004e-06, "loss": 2.1496, "step": 350 }, { "epoch": 11.076923076923077, "grad_norm": 0.5761039853096008, "learning_rate": 2.8125e-06, "loss": 2.1465, "step": 360 }, { "epoch": 11.384615384615385, "grad_norm": 0.562958836555481, "learning_rate": 2.8906250000000004e-06, "loss": 2.0871, "step": 370 }, { "epoch": 11.692307692307692, "grad_norm": 0.5713663101196289, "learning_rate": 2.96875e-06, "loss": 2.0732, "step": 380 }, { "epoch": 12.0, "grad_norm": 16827.7421875, "learning_rate": 3.0468750000000004e-06, "loss": 2.0684, "step": 390 }, { "epoch": 12.307692307692308, "grad_norm": 0.5909234285354614, "learning_rate": 3.125e-06, "loss": 2.0554, "step": 400 }, { "epoch": 12.615384615384615, "grad_norm": 0.5658320784568787, "learning_rate": 3.2031250000000004e-06, "loss": 1.9929, "step": 410 }, { "epoch": 12.923076923076923, "grad_norm": 0.5382928848266602, "learning_rate": 3.28125e-06, "loss": 1.9913, "step": 420 }, { "epoch": 13.23076923076923, "grad_norm": 0.7106872200965881, "learning_rate": 3.3593750000000003e-06, "loss": 1.9562, "step": 430 }, { "epoch": 13.538461538461538, "grad_norm": 0.5338084697723389, "learning_rate": 3.4375e-06, "loss": 1.9503, "step": 440 }, { "epoch": 13.846153846153847, "grad_norm": 0.524355411529541, "learning_rate": 3.5156250000000003e-06, "loss": 1.9499, "step": 450 }, { "epoch": 14.153846153846153, "grad_norm": 0.5286893248558044, "learning_rate": 3.59375e-06, "loss": 1.9225, "step": 460 }, { "epoch": 14.461538461538462, "grad_norm": 0.5003280639648438, "learning_rate": 3.6718750000000003e-06, "loss": 1.8688, "step": 470 }, { "epoch": 14.76923076923077, "grad_norm": 0.5744629502296448, "learning_rate": 3.7500000000000005e-06, "loss": 1.8728, "step": 480 }, { "epoch": 15.076923076923077, "grad_norm": 0.47028881311416626, "learning_rate": 3.828125000000001e-06, "loss": 1.818, "step": 490 }, { "epoch": 15.384615384615385, "grad_norm": 0.5076237320899963, "learning_rate": 3.90625e-06, "loss": 1.8375, "step": 500 }, { "epoch": 15.692307692307692, "grad_norm": 0.5628578066825867, "learning_rate": 3.984375e-06, "loss": 1.823, "step": 510 }, { "epoch": 16.0, "grad_norm": 35255.69921875, "learning_rate": 4.0625000000000005e-06, "loss": 1.8249, "step": 520 }, { "epoch": 16.307692307692307, "grad_norm": 0.5536410212516785, "learning_rate": 4.140625000000001e-06, "loss": 1.7998, "step": 530 }, { "epoch": 16.615384615384617, "grad_norm": 0.6619865298271179, "learning_rate": 4.21875e-06, "loss": 1.775, "step": 540 }, { "epoch": 16.923076923076923, "grad_norm": 0.6591458320617676, "learning_rate": 4.296875e-06, "loss": 1.7605, "step": 550 }, { "epoch": 17.23076923076923, "grad_norm": 0.8084316253662109, "learning_rate": 4.3750000000000005e-06, "loss": 1.7278, "step": 560 }, { "epoch": 17.53846153846154, "grad_norm": 0.6782126426696777, "learning_rate": 4.453125000000001e-06, "loss": 1.7261, "step": 570 }, { "epoch": 17.846153846153847, "grad_norm": 0.6113712191581726, "learning_rate": 4.53125e-06, "loss": 1.7178, "step": 580 }, { "epoch": 18.153846153846153, "grad_norm": 0.6165570020675659, "learning_rate": 4.609375e-06, "loss": 1.6865, "step": 590 }, { "epoch": 18.46153846153846, "grad_norm": 0.7881684303283691, "learning_rate": 4.6875000000000004e-06, "loss": 1.6793, "step": 600 }, { "epoch": 18.76923076923077, "grad_norm": 0.672874927520752, "learning_rate": 4.765625000000001e-06, "loss": 1.6661, "step": 610 }, { "epoch": 19.076923076923077, "grad_norm": 0.730848491191864, "learning_rate": 4.84375e-06, "loss": 1.6431, "step": 620 }, { "epoch": 19.384615384615383, "grad_norm": 0.6730669736862183, "learning_rate": 4.921875e-06, "loss": 1.6155, "step": 630 }, { "epoch": 19.692307692307693, "grad_norm": 0.6560551524162292, "learning_rate": 5e-06, "loss": 1.6395, "step": 640 }, { "epoch": 20.0, "grad_norm": 4129.79052734375, "learning_rate": 4.999811754597862e-06, "loss": 1.5897, "step": 650 }, { "epoch": 20.307692307692307, "grad_norm": 0.6594120860099792, "learning_rate": 4.999247046740511e-06, "loss": 1.5829, "step": 660 }, { "epoch": 20.615384615384617, "grad_norm": 0.893703818321228, "learning_rate": 4.998305961470874e-06, "loss": 1.558, "step": 670 }, { "epoch": 20.923076923076923, "grad_norm": 0.7144062519073486, "learning_rate": 4.996988640512931e-06, "loss": 1.5373, "step": 680 }, { "epoch": 21.23076923076923, "grad_norm": 0.8453310132026672, "learning_rate": 4.995295282250373e-06, "loss": 1.4909, "step": 690 }, { "epoch": 21.53846153846154, "grad_norm": 0.8293094635009766, "learning_rate": 4.993226141696726e-06, "loss": 1.4967, "step": 700 }, { "epoch": 21.846153846153847, "grad_norm": 0.7181118726730347, "learning_rate": 4.990781530456945e-06, "loss": 1.4857, "step": 710 }, { "epoch": 22.153846153846153, "grad_norm": 0.844008207321167, "learning_rate": 4.987961816680493e-06, "loss": 1.495, "step": 720 }, { "epoch": 22.46153846153846, "grad_norm": 0.8761960864067078, "learning_rate": 4.984767425005891e-06, "loss": 1.4224, "step": 730 }, { "epoch": 22.76923076923077, "grad_norm": 0.8939017057418823, "learning_rate": 4.981198836496776e-06, "loss": 1.4063, "step": 740 }, { "epoch": 23.076923076923077, "grad_norm": 0.828834593296051, "learning_rate": 4.97725658856945e-06, "loss": 1.429, "step": 750 }, { "epoch": 23.384615384615383, "grad_norm": 0.945023775100708, "learning_rate": 4.972941274911953e-06, "loss": 1.3588, "step": 760 }, { "epoch": 23.692307692307693, "grad_norm": 1.122968316078186, "learning_rate": 4.968253545394647e-06, "loss": 1.3309, "step": 770 }, { "epoch": 24.0, "grad_norm": 93171.5546875, "learning_rate": 4.9631941059723535e-06, "loss": 1.335, "step": 780 }, { "epoch": 24.307692307692307, "grad_norm": 1.0523442029953003, "learning_rate": 4.957763718578042e-06, "loss": 1.3347, "step": 790 }, { "epoch": 24.615384615384617, "grad_norm": 1.179585576057434, "learning_rate": 4.9519632010080765e-06, "loss": 1.2618, "step": 800 }, { "epoch": 24.923076923076923, "grad_norm": 0.854586124420166, "learning_rate": 4.9457934267990695e-06, "loss": 1.2321, "step": 810 }, { "epoch": 25.23076923076923, "grad_norm": 1.0027631521224976, "learning_rate": 4.939255325096322e-06, "loss": 1.2041, "step": 820 }, { "epoch": 25.53846153846154, "grad_norm": 1.045911192893982, "learning_rate": 4.932349880513901e-06, "loss": 1.1746, "step": 830 }, { "epoch": 25.846153846153847, "grad_norm": 0.9622510075569153, "learning_rate": 4.925078132986361e-06, "loss": 1.1846, "step": 840 }, { "epoch": 26.153846153846153, "grad_norm": 1.2767952680587769, "learning_rate": 4.917441177612131e-06, "loss": 1.1605, "step": 850 }, { "epoch": 26.46153846153846, "grad_norm": 0.9456568360328674, "learning_rate": 4.9094401644886e-06, "loss": 1.1373, "step": 860 }, { "epoch": 26.76923076923077, "grad_norm": 1.3368542194366455, "learning_rate": 4.901076298538915e-06, "loss": 1.0879, "step": 870 }, { "epoch": 27.076923076923077, "grad_norm": 1.4178813695907593, "learning_rate": 4.8923508393305224e-06, "loss": 1.0359, "step": 880 }, { "epoch": 27.384615384615383, "grad_norm": 1.4570515155792236, "learning_rate": 4.883265100885484e-06, "loss": 1.0313, "step": 890 }, { "epoch": 27.692307692307693, "grad_norm": 1.415483832359314, "learning_rate": 4.873820451482592e-06, "loss": 1.034, "step": 900 }, { "epoch": 28.0, "grad_norm": 1146730.75, "learning_rate": 4.864018313451304e-06, "loss": 0.8984, "step": 910 }, { "epoch": 28.307692307692307, "grad_norm": 1.3334933519363403, "learning_rate": 4.8538601629575525e-06, "loss": 0.9298, "step": 920 }, { "epoch": 28.615384615384617, "grad_norm": 1.1457735300064087, "learning_rate": 4.843347529781438e-06, "loss": 0.94, "step": 930 }, { "epoch": 28.923076923076923, "grad_norm": 1.3328795433044434, "learning_rate": 4.832481997086848e-06, "loss": 0.8961, "step": 940 }, { "epoch": 29.23076923076923, "grad_norm": 1.455394983291626, "learning_rate": 4.82126520118304e-06, "loss": 0.9099, "step": 950 }, { "epoch": 29.53846153846154, "grad_norm": 1.7109308242797852, "learning_rate": 4.809698831278217e-06, "loss": 0.8214, "step": 960 }, { "epoch": 29.846153846153847, "grad_norm": 1.392449140548706, "learning_rate": 4.797784629225145e-06, "loss": 0.813, "step": 970 }, { "epoch": 30.153846153846153, "grad_norm": 1.8153551816940308, "learning_rate": 4.7855243892588275e-06, "loss": 0.7938, "step": 980 }, { "epoch": 30.46153846153846, "grad_norm": 1.7134582996368408, "learning_rate": 4.772919957726306e-06, "loss": 0.8109, "step": 990 }, { "epoch": 30.76923076923077, "grad_norm": 1.7986242771148682, "learning_rate": 4.759973232808609e-06, "loss": 0.6657, "step": 1000 }, { "epoch": 31.076923076923077, "grad_norm": 1.5258599519729614, "learning_rate": 4.746686164234885e-06, "loss": 0.7157, "step": 1010 }, { "epoch": 31.384615384615383, "grad_norm": 1.645753264427185, "learning_rate": 4.7330607529887885e-06, "loss": 0.6876, "step": 1020 }, { "epoch": 31.692307692307693, "grad_norm": 1.4284254312515259, "learning_rate": 4.719099051007136e-06, "loss": 0.6913, "step": 1030 }, { "epoch": 32.0, "grad_norm": 322668.5625, "learning_rate": 4.704803160870888e-06, "loss": 0.6297, "step": 1040 }, { "epoch": 32.30769230769231, "grad_norm": 1.519518256187439, "learning_rate": 4.6901752354885166e-06, "loss": 0.617, "step": 1050 }, { "epoch": 32.61538461538461, "grad_norm": 1.3290555477142334, "learning_rate": 4.675217477771779e-06, "loss": 0.6476, "step": 1060 }, { "epoch": 32.92307692307692, "grad_norm": 1.5469777584075928, "learning_rate": 4.659932140303967e-06, "loss": 0.5633, "step": 1070 }, { "epoch": 33.23076923076923, "grad_norm": 1.5148197412490845, "learning_rate": 4.644321525000681e-06, "loss": 0.5595, "step": 1080 }, { "epoch": 33.53846153846154, "grad_norm": 1.7467495203018188, "learning_rate": 4.628387982763163e-06, "loss": 0.5691, "step": 1090 }, { "epoch": 33.84615384615385, "grad_norm": 1.9587457180023193, "learning_rate": 4.612133913124268e-06, "loss": 0.4897, "step": 1100 }, { "epoch": 34.15384615384615, "grad_norm": 1.3276329040527344, "learning_rate": 4.595561763887095e-06, "loss": 0.5177, "step": 1110 }, { "epoch": 34.46153846153846, "grad_norm": 1.5996724367141724, "learning_rate": 4.578674030756364e-06, "loss": 0.4505, "step": 1120 }, { "epoch": 34.76923076923077, "grad_norm": 1.8829643726348877, "learning_rate": 4.561473256962564e-06, "loss": 0.5024, "step": 1130 }, { "epoch": 35.07692307692308, "grad_norm": 2.2122015953063965, "learning_rate": 4.54396203287896e-06, "loss": 0.466, "step": 1140 }, { "epoch": 35.38461538461539, "grad_norm": 1.784812092781067, "learning_rate": 4.526142995631488e-06, "loss": 0.4057, "step": 1150 }, { "epoch": 35.69230769230769, "grad_norm": 1.3397154808044434, "learning_rate": 4.508018828701613e-06, "loss": 0.4513, "step": 1160 }, { "epoch": 36.0, "grad_norm": 368474.65625, "learning_rate": 4.489592261522209e-06, "loss": 0.3743, "step": 1170 }, { "epoch": 36.30769230769231, "grad_norm": 1.8128660917282104, "learning_rate": 4.470866069066516e-06, "loss": 0.3919, "step": 1180 }, { "epoch": 36.61538461538461, "grad_norm": 1.7692885398864746, "learning_rate": 4.451843071430236e-06, "loss": 0.3589, "step": 1190 }, { "epoch": 36.92307692307692, "grad_norm": 1.45668363571167, "learning_rate": 4.432526133406843e-06, "loss": 0.3578, "step": 1200 }, { "epoch": 37.23076923076923, "grad_norm": 1.255777359008789, "learning_rate": 4.412918164056148e-06, "loss": 0.3436, "step": 1210 }, { "epoch": 37.53846153846154, "grad_norm": 1.4039700031280518, "learning_rate": 4.393022116266212e-06, "loss": 0.3054, "step": 1220 }, { "epoch": 37.84615384615385, "grad_norm": 2.489480495452881, "learning_rate": 4.372840986308649e-06, "loss": 0.3145, "step": 1230 }, { "epoch": 38.15384615384615, "grad_norm": 2.235471725463867, "learning_rate": 4.352377813387398e-06, "loss": 0.3915, "step": 1240 }, { "epoch": 38.46153846153846, "grad_norm": 1.5027036666870117, "learning_rate": 4.331635679181032e-06, "loss": 0.2823, "step": 1250 }, { "epoch": 38.76923076923077, "grad_norm": 1.8041187524795532, "learning_rate": 4.3106177073786684e-06, "loss": 0.2837, "step": 1260 }, { "epoch": 39.07692307692308, "grad_norm": 1.359180212020874, "learning_rate": 4.289327063209548e-06, "loss": 0.277, "step": 1270 }, { "epoch": 39.38461538461539, "grad_norm": 1.5031533241271973, "learning_rate": 4.267766952966369e-06, "loss": 0.2959, "step": 1280 }, { "epoch": 39.69230769230769, "grad_norm": 1.5086129903793335, "learning_rate": 4.245940623522433e-06, "loss": 0.236, "step": 1290 }, { "epoch": 40.0, "grad_norm": 8115298.5, "learning_rate": 4.223851361842668e-06, "loss": 0.2243, "step": 1300 }, { "epoch": 40.30769230769231, "grad_norm": 1.59078049659729, "learning_rate": 4.201502494488633e-06, "loss": 0.2423, "step": 1310 }, { "epoch": 40.61538461538461, "grad_norm": 1.6530065536499023, "learning_rate": 4.178897387117547e-06, "loss": 0.2217, "step": 1320 }, { "epoch": 40.92307692307692, "grad_norm": 1.6133264303207397, "learning_rate": 4.15603944397543e-06, "loss": 0.2185, "step": 1330 }, { "epoch": 41.23076923076923, "grad_norm": 1.7328965663909912, "learning_rate": 4.132932107384442e-06, "loss": 0.2183, "step": 1340 }, { "epoch": 41.53846153846154, "grad_norm": 1.5189462900161743, "learning_rate": 4.109578857224478e-06, "loss": 0.2167, "step": 1350 }, { "epoch": 41.84615384615385, "grad_norm": 1.807603359222412, "learning_rate": 4.085983210409114e-06, "loss": 0.193, "step": 1360 }, { "epoch": 42.15384615384615, "grad_norm": 1.4981666803359985, "learning_rate": 4.062148720355967e-06, "loss": 0.1724, "step": 1370 }, { "epoch": 42.46153846153846, "grad_norm": 1.40492844581604, "learning_rate": 4.038078976451567e-06, "loss": 0.1658, "step": 1380 }, { "epoch": 42.76923076923077, "grad_norm": 1.4978539943695068, "learning_rate": 4.013777603510815e-06, "loss": 0.1555, "step": 1390 }, { "epoch": 43.07692307692308, "grad_norm": 1.2119354009628296, "learning_rate": 3.989248261231084e-06, "loss": 0.1833, "step": 1400 }, { "epoch": 43.38461538461539, "grad_norm": 1.857128381729126, "learning_rate": 3.964494643641097e-06, "loss": 0.1622, "step": 1410 }, { "epoch": 43.69230769230769, "grad_norm": 1.624523639678955, "learning_rate": 3.939520478544614e-06, "loss": 0.1499, "step": 1420 }, { "epoch": 44.0, "grad_norm": 53768.84375, "learning_rate": 3.914329526959033e-06, "loss": 0.1712, "step": 1430 }, { "epoch": 44.30769230769231, "grad_norm": 1.3771990537643433, "learning_rate": 3.888925582549006e-06, "loss": 0.1334, "step": 1440 }, { "epoch": 44.61538461538461, "grad_norm": 2.549403190612793, "learning_rate": 3.863312471055116e-06, "loss": 0.1833, "step": 1450 }, { "epoch": 44.92307692307692, "grad_norm": 1.4592410326004028, "learning_rate": 3.8374940497177435e-06, "loss": 0.1174, "step": 1460 }, { "epoch": 45.23076923076923, "grad_norm": 1.5769482851028442, "learning_rate": 3.8114742066961722e-06, "loss": 0.1207, "step": 1470 }, { "epoch": 45.53846153846154, "grad_norm": 1.1684468984603882, "learning_rate": 3.785256860483054e-06, "loss": 0.1275, "step": 1480 }, { "epoch": 45.84615384615385, "grad_norm": 1.1915509700775146, "learning_rate": 3.7588459593142944e-06, "loss": 0.1253, "step": 1490 }, { "epoch": 46.15384615384615, "grad_norm": 1.3312920331954956, "learning_rate": 3.7322454805744605e-06, "loss": 0.1159, "step": 1500 }, { "epoch": 46.46153846153846, "grad_norm": 1.1740134954452515, "learning_rate": 3.7054594301978075e-06, "loss": 0.1312, "step": 1510 }, { "epoch": 46.76923076923077, "grad_norm": 0.9904786348342896, "learning_rate": 3.6784918420649952e-06, "loss": 0.1171, "step": 1520 }, { "epoch": 47.07692307692308, "grad_norm": 1.7853026390075684, "learning_rate": 3.6513467773956002e-06, "loss": 0.1229, "step": 1530 }, { "epoch": 47.38461538461539, "grad_norm": 0.924367368221283, "learning_rate": 3.624028324136517e-06, "loss": 0.1087, "step": 1540 }, { "epoch": 47.69230769230769, "grad_norm": 1.3565295934677124, "learning_rate": 3.5965405963463197e-06, "loss": 0.113, "step": 1550 }, { "epoch": 48.0, "grad_norm": 27509738.0, "learning_rate": 3.5688877335757055e-06, "loss": 0.0949, "step": 1560 }, { "epoch": 48.30769230769231, "grad_norm": 1.4091086387634277, "learning_rate": 3.5410739002440938e-06, "loss": 0.095, "step": 1570 }, { "epoch": 48.61538461538461, "grad_norm": 1.1597000360488892, "learning_rate": 3.5131032850124745e-06, "loss": 0.0987, "step": 1580 }, { "epoch": 48.92307692307692, "grad_norm": 1.2181200981140137, "learning_rate": 3.484980100152621e-06, "loss": 0.0894, "step": 1590 }, { "epoch": 49.23076923076923, "grad_norm": 1.0059666633605957, "learning_rate": 3.4567085809127247e-06, "loss": 0.0962, "step": 1600 }, { "epoch": 49.53846153846154, "grad_norm": 1.6103034019470215, "learning_rate": 3.4282929848795944e-06, "loss": 0.0842, "step": 1610 }, { "epoch": 49.84615384615385, "grad_norm": 1.6124812364578247, "learning_rate": 3.399737591337471e-06, "loss": 0.0981, "step": 1620 }, { "epoch": 50.15384615384615, "grad_norm": 1.14884614944458, "learning_rate": 3.3710467006235865e-06, "loss": 0.0996, "step": 1630 }, { "epoch": 50.46153846153846, "grad_norm": 1.4607173204421997, "learning_rate": 3.3422246334805504e-06, "loss": 0.1034, "step": 1640 }, { "epoch": 50.76923076923077, "grad_norm": 1.092631459236145, "learning_rate": 3.313275730405658e-06, "loss": 0.0801, "step": 1650 }, { "epoch": 51.07692307692308, "grad_norm": 1.0797463655471802, "learning_rate": 3.2842043509972294e-06, "loss": 0.0795, "step": 1660 }, { "epoch": 51.38461538461539, "grad_norm": 1.503728985786438, "learning_rate": 3.2550148732980707e-06, "loss": 0.0744, "step": 1670 }, { "epoch": 51.69230769230769, "grad_norm": 1.0997803211212158, "learning_rate": 3.225711693136156e-06, "loss": 0.0741, "step": 1680 }, { "epoch": 52.0, "grad_norm": 10298226.0, "learning_rate": 3.196299223462633e-06, "loss": 0.0699, "step": 1690 }, { "epoch": 52.30769230769231, "grad_norm": 1.0617835521697998, "learning_rate": 3.1667818936872463e-06, "loss": 0.074, "step": 1700 }, { "epoch": 52.61538461538461, "grad_norm": 0.8478608131408691, "learning_rate": 3.137164149011287e-06, "loss": 0.0577, "step": 1710 }, { "epoch": 52.92307692307692, "grad_norm": 1.827232003211975, "learning_rate": 3.10745044975816e-06, "loss": 0.0871, "step": 1720 }, { "epoch": 53.23076923076923, "grad_norm": 1.0407119989395142, "learning_rate": 3.0776452707016784e-06, "loss": 0.0682, "step": 1730 }, { "epoch": 53.53846153846154, "grad_norm": 1.375914454460144, "learning_rate": 3.0477531003921745e-06, "loss": 0.0735, "step": 1740 }, { "epoch": 53.84615384615385, "grad_norm": 1.3044782876968384, "learning_rate": 3.0177784404805466e-06, "loss": 0.0617, "step": 1750 }, { "epoch": 54.15384615384615, "grad_norm": 0.6784080862998962, "learning_rate": 2.9877258050403214e-06, "loss": 0.0494, "step": 1760 }, { "epoch": 54.46153846153846, "grad_norm": 1.0913127660751343, "learning_rate": 2.957599719887853e-06, "loss": 0.0685, "step": 1770 }, { "epoch": 54.76923076923077, "grad_norm": 1.2988967895507812, "learning_rate": 2.9274047219007533e-06, "loss": 0.069, "step": 1780 }, { "epoch": 55.07692307692308, "grad_norm": 1.0163013935089111, "learning_rate": 2.8971453583346536e-06, "loss": 0.0545, "step": 1790 }, { "epoch": 55.38461538461539, "grad_norm": 0.7982223033905029, "learning_rate": 2.8668261861384045e-06, "loss": 0.0509, "step": 1800 }, { "epoch": 55.69230769230769, "grad_norm": 1.5185413360595703, "learning_rate": 2.8364517712678157e-06, "loss": 0.0708, "step": 1810 }, { "epoch": 56.0, "grad_norm": 54588.5390625, "learning_rate": 2.806026687998041e-06, "loss": 0.0531, "step": 1820 }, { "epoch": 56.30769230769231, "grad_norm": 0.9050544500350952, "learning_rate": 2.775555518234708e-06, "loss": 0.0529, "step": 1830 }, { "epoch": 56.61538461538461, "grad_norm": 1.0293651819229126, "learning_rate": 2.7450428508239024e-06, "loss": 0.0572, "step": 1840 }, { "epoch": 56.92307692307692, "grad_norm": 1.1838538646697998, "learning_rate": 2.7144932808611002e-06, "loss": 0.0616, "step": 1850 }, { "epoch": 57.23076923076923, "grad_norm": 1.2836631536483765, "learning_rate": 2.683911408999169e-06, "loss": 0.0536, "step": 1860 }, { "epoch": 57.53846153846154, "grad_norm": 1.0823341608047485, "learning_rate": 2.6533018407555216e-06, "loss": 0.0476, "step": 1870 }, { "epoch": 57.84615384615385, "grad_norm": 1.0420117378234863, "learning_rate": 2.6226691858185454e-06, "loss": 0.0527, "step": 1880 }, { "epoch": 58.15384615384615, "grad_norm": 1.0533287525177002, "learning_rate": 2.5920180573533975e-06, "loss": 0.0623, "step": 1890 }, { "epoch": 58.46153846153846, "grad_norm": 0.9571561217308044, "learning_rate": 2.561353071307281e-06, "loss": 0.0498, "step": 1900 }, { "epoch": 58.76923076923077, "grad_norm": 1.224555253982544, "learning_rate": 2.5306788457143e-06, "loss": 0.0447, "step": 1910 }, { "epoch": 59.07692307692308, "grad_norm": 1.900484561920166, "learning_rate": 2.5e-06, "loss": 0.0578, "step": 1920 }, { "epoch": 59.38461538461539, "grad_norm": 0.9753430485725403, "learning_rate": 2.4693211542857005e-06, "loss": 0.052, "step": 1930 }, { "epoch": 59.69230769230769, "grad_norm": 1.0865504741668701, "learning_rate": 2.43864692869272e-06, "loss": 0.0447, "step": 1940 }, { "epoch": 60.0, "grad_norm": 39835.109375, "learning_rate": 2.407981942646603e-06, "loss": 0.0477, "step": 1950 }, { "epoch": 60.30769230769231, "grad_norm": 1.1388822793960571, "learning_rate": 2.377330814181455e-06, "loss": 0.0516, "step": 1960 }, { "epoch": 60.61538461538461, "grad_norm": 1.2782968282699585, "learning_rate": 2.346698159244479e-06, "loss": 0.0501, "step": 1970 }, { "epoch": 60.92307692307692, "grad_norm": 1.6132394075393677, "learning_rate": 2.3160885910008317e-06, "loss": 0.0408, "step": 1980 }, { "epoch": 61.23076923076923, "grad_norm": 0.8268479108810425, "learning_rate": 2.2855067191389006e-06, "loss": 0.0407, "step": 1990 }, { "epoch": 61.53846153846154, "grad_norm": 0.9518911838531494, "learning_rate": 2.2549571491760985e-06, "loss": 0.0455, "step": 2000 }, { "epoch": 61.84615384615385, "grad_norm": 1.2631924152374268, "learning_rate": 2.2244444817652923e-06, "loss": 0.0462, "step": 2010 }, { "epoch": 62.15384615384615, "grad_norm": 1.0491702556610107, "learning_rate": 2.19397331200196e-06, "loss": 0.0384, "step": 2020 }, { "epoch": 62.46153846153846, "grad_norm": 1.37276291847229, "learning_rate": 2.1635482287321848e-06, "loss": 0.0384, "step": 2030 }, { "epoch": 62.76923076923077, "grad_norm": 0.911716103553772, "learning_rate": 2.133173813861596e-06, "loss": 0.0489, "step": 2040 }, { "epoch": 63.07692307692308, "grad_norm": 0.8453769683837891, "learning_rate": 2.102854641665347e-06, "loss": 0.0519, "step": 2050 }, { "epoch": 63.38461538461539, "grad_norm": 1.1368571519851685, "learning_rate": 2.072595278099247e-06, "loss": 0.0383, "step": 2060 }, { "epoch": 63.69230769230769, "grad_norm": 1.6040756702423096, "learning_rate": 2.042400280112148e-06, "loss": 0.0421, "step": 2070 }, { "epoch": 64.0, "grad_norm": 664656.875, "learning_rate": 2.01227419495968e-06, "loss": 0.0473, "step": 2080 }, { "epoch": 64.3076923076923, "grad_norm": 0.9847516417503357, "learning_rate": 1.982221559519454e-06, "loss": 0.042, "step": 2090 }, { "epoch": 64.61538461538461, "grad_norm": 0.77010577917099, "learning_rate": 1.952246899607826e-06, "loss": 0.042, "step": 2100 }, { "epoch": 64.92307692307692, "grad_norm": 1.2908334732055664, "learning_rate": 1.9223547292983225e-06, "loss": 0.0416, "step": 2110 }, { "epoch": 65.23076923076923, "grad_norm": 0.7256491184234619, "learning_rate": 1.8925495502418407e-06, "loss": 0.0353, "step": 2120 }, { "epoch": 65.53846153846153, "grad_norm": 1.212572455406189, "learning_rate": 1.862835850988714e-06, "loss": 0.0427, "step": 2130 }, { "epoch": 65.84615384615384, "grad_norm": 1.154390811920166, "learning_rate": 1.8332181063127543e-06, "loss": 0.0396, "step": 2140 }, { "epoch": 66.15384615384616, "grad_norm": 0.8889743685722351, "learning_rate": 1.8037007765373677e-06, "loss": 0.0381, "step": 2150 }, { "epoch": 66.46153846153847, "grad_norm": 1.1566401720046997, "learning_rate": 1.7742883068638447e-06, "loss": 0.041, "step": 2160 }, { "epoch": 66.76923076923077, "grad_norm": 0.9389579892158508, "learning_rate": 1.74498512670193e-06, "loss": 0.0385, "step": 2170 }, { "epoch": 67.07692307692308, "grad_norm": 0.709125816822052, "learning_rate": 1.7157956490027716e-06, "loss": 0.0432, "step": 2180 }, { "epoch": 67.38461538461539, "grad_norm": 1.1108027696609497, "learning_rate": 1.686724269594343e-06, "loss": 0.0395, "step": 2190 }, { "epoch": 67.6923076923077, "grad_norm": 1.087064266204834, "learning_rate": 1.6577753665194502e-06, "loss": 0.0402, "step": 2200 }, { "epoch": 68.0, "grad_norm": 3785425.75, "learning_rate": 1.628953299376414e-06, "loss": 0.0351, "step": 2210 }, { "epoch": 68.3076923076923, "grad_norm": 0.9498631954193115, "learning_rate": 1.6002624086625296e-06, "loss": 0.0402, "step": 2220 }, { "epoch": 68.61538461538461, "grad_norm": 0.6672590374946594, "learning_rate": 1.5717070151204064e-06, "loss": 0.0333, "step": 2230 }, { "epoch": 68.92307692307692, "grad_norm": 0.671436607837677, "learning_rate": 1.5432914190872757e-06, "loss": 0.0387, "step": 2240 }, { "epoch": 69.23076923076923, "grad_norm": 0.7344552874565125, "learning_rate": 1.5150198998473802e-06, "loss": 0.0345, "step": 2250 }, { "epoch": 69.53846153846153, "grad_norm": 0.5938952565193176, "learning_rate": 1.4868967149875257e-06, "loss": 0.0343, "step": 2260 }, { "epoch": 69.84615384615384, "grad_norm": 0.862509548664093, "learning_rate": 1.4589260997559077e-06, "loss": 0.0359, "step": 2270 }, { "epoch": 70.15384615384616, "grad_norm": 0.8928817510604858, "learning_rate": 1.4311122664242955e-06, "loss": 0.0406, "step": 2280 }, { "epoch": 70.46153846153847, "grad_norm": 0.81640625, "learning_rate": 1.4034594036536816e-06, "loss": 0.0332, "step": 2290 }, { "epoch": 70.76923076923077, "grad_norm": 0.5396949648857117, "learning_rate": 1.3759716758634833e-06, "loss": 0.0389, "step": 2300 }, { "epoch": 71.07692307692308, "grad_norm": 0.918347954750061, "learning_rate": 1.3486532226044e-06, "loss": 0.0329, "step": 2310 }, { "epoch": 71.38461538461539, "grad_norm": 0.8731901049613953, "learning_rate": 1.3215081579350058e-06, "loss": 0.0355, "step": 2320 }, { "epoch": 71.6923076923077, "grad_norm": 0.9578891396522522, "learning_rate": 1.294540569802193e-06, "loss": 0.037, "step": 2330 }, { "epoch": 72.0, "grad_norm": 3753634.75, "learning_rate": 1.2677545194255403e-06, "loss": 0.0372, "step": 2340 }, { "epoch": 72.3076923076923, "grad_norm": 0.9850757122039795, "learning_rate": 1.2411540406857064e-06, "loss": 0.0388, "step": 2350 }, { "epoch": 72.61538461538461, "grad_norm": 1.0889559984207153, "learning_rate": 1.214743139516946e-06, "loss": 0.0329, "step": 2360 }, { "epoch": 72.92307692307692, "grad_norm": 1.2561447620391846, "learning_rate": 1.1885257933038282e-06, "loss": 0.0334, "step": 2370 }, { "epoch": 73.23076923076923, "grad_norm": 1.08214271068573, "learning_rate": 1.1625059502822575e-06, "loss": 0.0438, "step": 2380 }, { "epoch": 73.53846153846153, "grad_norm": 0.6585337519645691, "learning_rate": 1.1366875289448844e-06, "loss": 0.0283, "step": 2390 }, { "epoch": 73.84615384615384, "grad_norm": 1.001531720161438, "learning_rate": 1.1110744174509952e-06, "loss": 0.0405, "step": 2400 }, { "epoch": 74.15384615384616, "grad_norm": 0.774890124797821, "learning_rate": 1.0856704730409667e-06, "loss": 0.035, "step": 2410 }, { "epoch": 74.46153846153847, "grad_norm": 0.8112567663192749, "learning_rate": 1.0604795214553867e-06, "loss": 0.0371, "step": 2420 }, { "epoch": 74.76923076923077, "grad_norm": 0.7539799809455872, "learning_rate": 1.035505356358903e-06, "loss": 0.0363, "step": 2430 }, { "epoch": 75.07692307692308, "grad_norm": 1.047004222869873, "learning_rate": 1.0107517387689168e-06, "loss": 0.0305, "step": 2440 }, { "epoch": 75.38461538461539, "grad_norm": 0.5877715945243835, "learning_rate": 9.862223964891864e-07, "loss": 0.032, "step": 2450 }, { "epoch": 75.6923076923077, "grad_norm": 0.7707840204238892, "learning_rate": 9.61921023548433e-07, "loss": 0.0345, "step": 2460 }, { "epoch": 76.0, "grad_norm": 8456043.0, "learning_rate": 9.378512796440345e-07, "loss": 0.0362, "step": 2470 }, { "epoch": 76.3076923076923, "grad_norm": 1.2004197835922241, "learning_rate": 9.140167895908867e-07, "loss": 0.0318, "step": 2480 }, { "epoch": 76.61538461538461, "grad_norm": 0.7192149758338928, "learning_rate": 8.904211427755219e-07, "loss": 0.0339, "step": 2490 }, { "epoch": 76.92307692307692, "grad_norm": 0.8858992457389832, "learning_rate": 8.670678926155588e-07, "loss": 0.0365, "step": 2500 } ], "logging_steps": 10, "max_steps": 3200, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.4736165944792064e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }