2020-Q3-75p-filtered-random / trainer_state.json
DouglasPontes's picture
Training in progress, step 32000
8da7923 verified
raw
history blame
81 kB
{
"best_metric": 3.1026012897491455,
"best_model_checkpoint": "./model_tweets_2020_Q3_75/checkpoint-1408000",
"epoch": 20.28620453565723,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07,
"eval_loss": 3.451251983642578,
"eval_runtime": 109.3752,
"eval_samples_per_second": 910.865,
"eval_steps_per_second": 56.932,
"step": 8000
},
{
"epoch": 0.14,
"learning_rate": 4.0726666666666665e-07,
"loss": 3.6645,
"step": 16000
},
{
"epoch": 0.14,
"eval_loss": 3.379037380218506,
"eval_runtime": 110.7092,
"eval_samples_per_second": 899.889,
"eval_steps_per_second": 56.246,
"step": 16000
},
{
"epoch": 0.2,
"eval_loss": 3.322641611099243,
"eval_runtime": 110.5211,
"eval_samples_per_second": 901.42,
"eval_steps_per_second": 56.342,
"step": 24000
},
{
"epoch": 0.27,
"learning_rate": 4.0453333333333336e-07,
"loss": 3.4678,
"step": 32000
},
{
"epoch": 0.27,
"eval_loss": 3.290071964263916,
"eval_runtime": 110.2146,
"eval_samples_per_second": 903.927,
"eval_steps_per_second": 56.499,
"step": 32000
},
{
"epoch": 0.34,
"eval_loss": 3.2590460777282715,
"eval_runtime": 110.8956,
"eval_samples_per_second": 898.376,
"eval_steps_per_second": 56.152,
"step": 40000
},
{
"epoch": 0.41,
"learning_rate": 4.018e-07,
"loss": 3.4118,
"step": 48000
},
{
"epoch": 0.41,
"eval_loss": 3.2533791065216064,
"eval_runtime": 111.7727,
"eval_samples_per_second": 891.327,
"eval_steps_per_second": 55.711,
"step": 48000
},
{
"epoch": 0.47,
"eval_loss": 3.235682725906372,
"eval_runtime": 111.5419,
"eval_samples_per_second": 893.171,
"eval_steps_per_second": 55.827,
"step": 56000
},
{
"epoch": 0.54,
"learning_rate": 3.9906666666666667e-07,
"loss": 3.3843,
"step": 64000
},
{
"epoch": 0.54,
"eval_loss": 3.222811460494995,
"eval_runtime": 111.4969,
"eval_samples_per_second": 893.532,
"eval_steps_per_second": 55.849,
"step": 64000
},
{
"epoch": 0.61,
"eval_loss": 3.233112096786499,
"eval_runtime": 111.9875,
"eval_samples_per_second": 889.617,
"eval_steps_per_second": 55.604,
"step": 72000
},
{
"epoch": 0.68,
"learning_rate": 3.963333333333333e-07,
"loss": 3.3633,
"step": 80000
},
{
"epoch": 0.68,
"eval_loss": 3.204684019088745,
"eval_runtime": 111.4416,
"eval_samples_per_second": 893.975,
"eval_steps_per_second": 55.877,
"step": 80000
},
{
"epoch": 0.74,
"eval_loss": 3.213789463043213,
"eval_runtime": 111.3931,
"eval_samples_per_second": 894.364,
"eval_steps_per_second": 55.901,
"step": 88000
},
{
"epoch": 0.81,
"learning_rate": 3.936e-07,
"loss": 3.3474,
"step": 96000
},
{
"epoch": 0.81,
"eval_loss": 3.2049667835235596,
"eval_runtime": 111.0388,
"eval_samples_per_second": 897.218,
"eval_steps_per_second": 56.08,
"step": 96000
},
{
"epoch": 0.88,
"eval_loss": 3.2050797939300537,
"eval_runtime": 111.9364,
"eval_samples_per_second": 890.024,
"eval_steps_per_second": 55.63,
"step": 104000
},
{
"epoch": 0.95,
"learning_rate": 3.908666666666667e-07,
"loss": 3.3414,
"step": 112000
},
{
"epoch": 0.95,
"eval_loss": 3.1929500102996826,
"eval_runtime": 111.7306,
"eval_samples_per_second": 891.663,
"eval_steps_per_second": 55.732,
"step": 112000
},
{
"epoch": 1.01,
"eval_loss": 3.200173854827881,
"eval_runtime": 112.2148,
"eval_samples_per_second": 887.815,
"eval_steps_per_second": 55.492,
"step": 120000
},
{
"epoch": 1.08,
"learning_rate": 3.8813333333333334e-07,
"loss": 3.335,
"step": 128000
},
{
"epoch": 1.08,
"eval_loss": 3.1920297145843506,
"eval_runtime": 112.5389,
"eval_samples_per_second": 885.258,
"eval_steps_per_second": 55.332,
"step": 128000
},
{
"epoch": 1.15,
"eval_loss": 3.1913902759552,
"eval_runtime": 113.3938,
"eval_samples_per_second": 878.584,
"eval_steps_per_second": 54.915,
"step": 136000
},
{
"epoch": 1.22,
"learning_rate": 3.854e-07,
"loss": 3.3283,
"step": 144000
},
{
"epoch": 1.22,
"eval_loss": 3.1852760314941406,
"eval_runtime": 112.5608,
"eval_samples_per_second": 885.086,
"eval_steps_per_second": 55.321,
"step": 144000
},
{
"epoch": 1.28,
"eval_loss": 3.1824891567230225,
"eval_runtime": 111.9288,
"eval_samples_per_second": 890.084,
"eval_steps_per_second": 55.634,
"step": 152000
},
{
"epoch": 1.35,
"learning_rate": 3.8266666666666665e-07,
"loss": 3.3276,
"step": 160000
},
{
"epoch": 1.35,
"eval_loss": 3.1827123165130615,
"eval_runtime": 111.957,
"eval_samples_per_second": 889.86,
"eval_steps_per_second": 55.62,
"step": 160000
},
{
"epoch": 1.42,
"eval_loss": 3.175551176071167,
"eval_runtime": 111.803,
"eval_samples_per_second": 891.086,
"eval_steps_per_second": 55.696,
"step": 168000
},
{
"epoch": 1.49,
"learning_rate": 3.799333333333333e-07,
"loss": 3.323,
"step": 176000
},
{
"epoch": 1.49,
"eval_loss": 3.186506509780884,
"eval_runtime": 111.1821,
"eval_samples_per_second": 896.061,
"eval_steps_per_second": 56.007,
"step": 176000
},
{
"epoch": 1.56,
"eval_loss": 3.174856662750244,
"eval_runtime": 111.9657,
"eval_samples_per_second": 889.79,
"eval_steps_per_second": 55.615,
"step": 184000
},
{
"epoch": 1.62,
"learning_rate": 3.772e-07,
"loss": 3.3275,
"step": 192000
},
{
"epoch": 1.62,
"eval_loss": 3.1782140731811523,
"eval_runtime": 112.4385,
"eval_samples_per_second": 886.049,
"eval_steps_per_second": 55.381,
"step": 192000
},
{
"epoch": 1.69,
"eval_loss": 3.167551040649414,
"eval_runtime": 110.6744,
"eval_samples_per_second": 900.172,
"eval_steps_per_second": 56.264,
"step": 200000
},
{
"epoch": 1.76,
"learning_rate": 3.7446666666666667e-07,
"loss": 3.3309,
"step": 208000
},
{
"epoch": 1.76,
"eval_loss": 3.1831653118133545,
"eval_runtime": 111.0316,
"eval_samples_per_second": 897.276,
"eval_steps_per_second": 56.083,
"step": 208000
},
{
"epoch": 1.83,
"eval_loss": 3.1743879318237305,
"eval_runtime": 111.5589,
"eval_samples_per_second": 893.035,
"eval_steps_per_second": 55.818,
"step": 216000
},
{
"epoch": 1.89,
"learning_rate": 3.7173333333333333e-07,
"loss": 3.3166,
"step": 224000
},
{
"epoch": 1.89,
"eval_loss": 3.164480447769165,
"eval_runtime": 112.2934,
"eval_samples_per_second": 887.194,
"eval_steps_per_second": 55.453,
"step": 224000
},
{
"epoch": 1.96,
"eval_loss": 3.177034616470337,
"eval_runtime": 111.2262,
"eval_samples_per_second": 895.706,
"eval_steps_per_second": 55.985,
"step": 232000
},
{
"epoch": 2.03,
"learning_rate": 3.69e-07,
"loss": 3.3206,
"step": 240000
},
{
"epoch": 2.03,
"eval_loss": 3.165634870529175,
"eval_runtime": 110.3932,
"eval_samples_per_second": 902.465,
"eval_steps_per_second": 56.407,
"step": 240000
},
{
"epoch": 2.1,
"eval_loss": 3.1560590267181396,
"eval_runtime": 111.4599,
"eval_samples_per_second": 893.828,
"eval_steps_per_second": 55.868,
"step": 248000
},
{
"epoch": 2.16,
"learning_rate": 3.6626666666666664e-07,
"loss": 3.3228,
"step": 256000
},
{
"epoch": 2.16,
"eval_loss": 3.1664586067199707,
"eval_runtime": 111.3781,
"eval_samples_per_second": 894.485,
"eval_steps_per_second": 55.909,
"step": 256000
},
{
"epoch": 2.23,
"eval_loss": 3.165701389312744,
"eval_runtime": 112.0697,
"eval_samples_per_second": 888.965,
"eval_steps_per_second": 55.564,
"step": 264000
},
{
"epoch": 2.3,
"learning_rate": 3.6353333333333335e-07,
"loss": 3.3208,
"step": 272000
},
{
"epoch": 2.3,
"eval_loss": 3.169295072555542,
"eval_runtime": 112.0597,
"eval_samples_per_second": 889.044,
"eval_steps_per_second": 55.569,
"step": 272000
},
{
"epoch": 2.37,
"eval_loss": 3.1777946949005127,
"eval_runtime": 111.8143,
"eval_samples_per_second": 890.995,
"eval_steps_per_second": 55.691,
"step": 280000
},
{
"epoch": 2.43,
"learning_rate": 3.608e-07,
"loss": 3.3106,
"step": 288000
},
{
"epoch": 2.43,
"eval_loss": 3.1760342121124268,
"eval_runtime": 111.7773,
"eval_samples_per_second": 891.29,
"eval_steps_per_second": 55.709,
"step": 288000
},
{
"epoch": 2.5,
"eval_loss": 3.1663529872894287,
"eval_runtime": 111.6908,
"eval_samples_per_second": 891.98,
"eval_steps_per_second": 55.752,
"step": 296000
},
{
"epoch": 2.57,
"learning_rate": 3.5806666666666666e-07,
"loss": 3.3189,
"step": 304000
},
{
"epoch": 2.57,
"eval_loss": 3.167732000350952,
"eval_runtime": 111.5586,
"eval_samples_per_second": 893.037,
"eval_steps_per_second": 55.818,
"step": 304000
},
{
"epoch": 2.64,
"eval_loss": 3.160001277923584,
"eval_runtime": 111.7384,
"eval_samples_per_second": 891.6,
"eval_steps_per_second": 55.728,
"step": 312000
},
{
"epoch": 2.7,
"learning_rate": 3.553333333333333e-07,
"loss": 3.319,
"step": 320000
},
{
"epoch": 2.7,
"eval_loss": 3.1570096015930176,
"eval_runtime": 111.8911,
"eval_samples_per_second": 890.384,
"eval_steps_per_second": 55.652,
"step": 320000
},
{
"epoch": 2.77,
"eval_loss": 3.169872760772705,
"eval_runtime": 110.7472,
"eval_samples_per_second": 899.58,
"eval_steps_per_second": 56.227,
"step": 328000
},
{
"epoch": 2.84,
"learning_rate": 3.5259999999999997e-07,
"loss": 3.3236,
"step": 336000
},
{
"epoch": 2.84,
"eval_loss": 3.1577506065368652,
"eval_runtime": 111.5,
"eval_samples_per_second": 893.507,
"eval_steps_per_second": 55.848,
"step": 336000
},
{
"epoch": 2.91,
"eval_loss": 3.1665139198303223,
"eval_runtime": 111.674,
"eval_samples_per_second": 892.115,
"eval_steps_per_second": 55.761,
"step": 344000
},
{
"epoch": 2.98,
"learning_rate": 3.498666666666667e-07,
"loss": 3.3205,
"step": 352000
},
{
"epoch": 2.98,
"eval_loss": 3.1557507514953613,
"eval_runtime": 111.4997,
"eval_samples_per_second": 893.509,
"eval_steps_per_second": 55.848,
"step": 352000
},
{
"epoch": 3.04,
"eval_loss": 3.167837381362915,
"eval_runtime": 111.1802,
"eval_samples_per_second": 896.077,
"eval_steps_per_second": 56.008,
"step": 360000
},
{
"epoch": 3.11,
"learning_rate": 3.4713333333333333e-07,
"loss": 3.3114,
"step": 368000
},
{
"epoch": 3.11,
"eval_loss": 3.159724473953247,
"eval_runtime": 111.0206,
"eval_samples_per_second": 897.365,
"eval_steps_per_second": 56.089,
"step": 368000
},
{
"epoch": 3.18,
"eval_loss": 3.161818265914917,
"eval_runtime": 111.1411,
"eval_samples_per_second": 896.392,
"eval_steps_per_second": 56.028,
"step": 376000
},
{
"epoch": 3.25,
"learning_rate": 3.444e-07,
"loss": 3.3067,
"step": 384000
},
{
"epoch": 3.25,
"eval_loss": 3.158372640609741,
"eval_runtime": 112.1096,
"eval_samples_per_second": 888.649,
"eval_steps_per_second": 55.544,
"step": 384000
},
{
"epoch": 3.31,
"eval_loss": 3.1597392559051514,
"eval_runtime": 112.0977,
"eval_samples_per_second": 888.742,
"eval_steps_per_second": 55.55,
"step": 392000
},
{
"epoch": 3.38,
"learning_rate": 3.416666666666667e-07,
"loss": 3.314,
"step": 400000
},
{
"epoch": 3.38,
"eval_loss": 3.1565074920654297,
"eval_runtime": 111.7357,
"eval_samples_per_second": 891.622,
"eval_steps_per_second": 55.73,
"step": 400000
},
{
"epoch": 3.45,
"eval_loss": 3.161231517791748,
"eval_runtime": 111.0757,
"eval_samples_per_second": 896.92,
"eval_steps_per_second": 56.061,
"step": 408000
},
{
"epoch": 3.52,
"learning_rate": 3.3893333333333335e-07,
"loss": 3.3183,
"step": 416000
},
{
"epoch": 3.52,
"eval_loss": 3.163727283477783,
"eval_runtime": 111.1047,
"eval_samples_per_second": 896.685,
"eval_steps_per_second": 56.046,
"step": 416000
},
{
"epoch": 3.58,
"eval_loss": 3.1568877696990967,
"eval_runtime": 111.7771,
"eval_samples_per_second": 891.292,
"eval_steps_per_second": 55.709,
"step": 424000
},
{
"epoch": 3.65,
"learning_rate": 3.3619999999999995e-07,
"loss": 3.318,
"step": 432000
},
{
"epoch": 3.65,
"eval_loss": 3.157625198364258,
"eval_runtime": 111.9872,
"eval_samples_per_second": 889.62,
"eval_steps_per_second": 55.605,
"step": 432000
},
{
"epoch": 3.72,
"eval_loss": 3.163891077041626,
"eval_runtime": 113.1182,
"eval_samples_per_second": 880.725,
"eval_steps_per_second": 55.049,
"step": 440000
},
{
"epoch": 3.79,
"learning_rate": 3.3346666666666666e-07,
"loss": 3.3114,
"step": 448000
},
{
"epoch": 3.79,
"eval_loss": 3.1459975242614746,
"eval_runtime": 113.0006,
"eval_samples_per_second": 881.641,
"eval_steps_per_second": 55.106,
"step": 448000
},
{
"epoch": 3.85,
"eval_loss": 3.161134719848633,
"eval_runtime": 112.6662,
"eval_samples_per_second": 884.258,
"eval_steps_per_second": 55.269,
"step": 456000
},
{
"epoch": 3.92,
"learning_rate": 3.307333333333333e-07,
"loss": 3.3068,
"step": 464000
},
{
"epoch": 3.92,
"eval_loss": 3.158674716949463,
"eval_runtime": 111.6387,
"eval_samples_per_second": 892.396,
"eval_steps_per_second": 55.778,
"step": 464000
},
{
"epoch": 3.99,
"eval_loss": 3.154172897338867,
"eval_runtime": 111.5221,
"eval_samples_per_second": 893.33,
"eval_steps_per_second": 55.836,
"step": 472000
},
{
"epoch": 4.06,
"learning_rate": 3.28e-07,
"loss": 3.3166,
"step": 480000
},
{
"epoch": 4.06,
"eval_loss": 3.142169237136841,
"eval_runtime": 112.2969,
"eval_samples_per_second": 887.166,
"eval_steps_per_second": 55.451,
"step": 480000
},
{
"epoch": 4.12,
"eval_loss": 3.160404920578003,
"eval_runtime": 112.5525,
"eval_samples_per_second": 885.152,
"eval_steps_per_second": 55.325,
"step": 488000
},
{
"epoch": 4.19,
"learning_rate": 3.252666666666667e-07,
"loss": 3.3057,
"step": 496000
},
{
"epoch": 4.19,
"eval_loss": 3.1586716175079346,
"eval_runtime": 112.4095,
"eval_samples_per_second": 886.278,
"eval_steps_per_second": 55.396,
"step": 496000
},
{
"epoch": 4.26,
"eval_loss": 3.1575887203216553,
"eval_runtime": 113.1006,
"eval_samples_per_second": 880.862,
"eval_steps_per_second": 55.057,
"step": 504000
},
{
"epoch": 4.33,
"learning_rate": 3.2253333333333334e-07,
"loss": 3.3176,
"step": 512000
},
{
"epoch": 4.33,
"eval_loss": 3.160233974456787,
"eval_runtime": 111.8966,
"eval_samples_per_second": 890.34,
"eval_steps_per_second": 55.65,
"step": 512000
},
{
"epoch": 4.4,
"eval_loss": 3.154435157775879,
"eval_runtime": 112.2439,
"eval_samples_per_second": 887.585,
"eval_steps_per_second": 55.477,
"step": 520000
},
{
"epoch": 4.46,
"learning_rate": 3.198e-07,
"loss": 3.3126,
"step": 528000
},
{
"epoch": 4.46,
"eval_loss": 3.1477601528167725,
"eval_runtime": 112.0306,
"eval_samples_per_second": 889.275,
"eval_steps_per_second": 55.583,
"step": 528000
},
{
"epoch": 4.53,
"eval_loss": 3.1520204544067383,
"eval_runtime": 111.7071,
"eval_samples_per_second": 891.85,
"eval_steps_per_second": 55.744,
"step": 536000
},
{
"epoch": 4.6,
"learning_rate": 3.1706666666666665e-07,
"loss": 3.3044,
"step": 544000
},
{
"epoch": 4.6,
"eval_loss": 3.158066749572754,
"eval_runtime": 111.3952,
"eval_samples_per_second": 894.348,
"eval_steps_per_second": 55.9,
"step": 544000
},
{
"epoch": 4.67,
"eval_loss": 3.1625027656555176,
"eval_runtime": 111.6253,
"eval_samples_per_second": 892.503,
"eval_steps_per_second": 55.785,
"step": 552000
},
{
"epoch": 4.73,
"learning_rate": 3.1433333333333336e-07,
"loss": 3.3118,
"step": 560000
},
{
"epoch": 4.73,
"eval_loss": 3.1510021686553955,
"eval_runtime": 111.8688,
"eval_samples_per_second": 890.561,
"eval_steps_per_second": 55.663,
"step": 560000
},
{
"epoch": 4.8,
"eval_loss": 3.154784917831421,
"eval_runtime": 112.8767,
"eval_samples_per_second": 882.609,
"eval_steps_per_second": 55.166,
"step": 568000
},
{
"epoch": 4.87,
"learning_rate": 3.116e-07,
"loss": 3.3085,
"step": 576000
},
{
"epoch": 4.87,
"eval_loss": 3.153918743133545,
"eval_runtime": 113.0859,
"eval_samples_per_second": 880.977,
"eval_steps_per_second": 55.064,
"step": 576000
},
{
"epoch": 4.94,
"eval_loss": 3.1503121852874756,
"eval_runtime": 111.3015,
"eval_samples_per_second": 895.1,
"eval_steps_per_second": 55.947,
"step": 584000
},
{
"epoch": 5.0,
"learning_rate": 3.0886666666666667e-07,
"loss": 3.3014,
"step": 592000
},
{
"epoch": 5.0,
"eval_loss": 3.1503639221191406,
"eval_runtime": 112.1576,
"eval_samples_per_second": 888.268,
"eval_steps_per_second": 55.52,
"step": 592000
},
{
"epoch": 5.07,
"eval_loss": 3.1534154415130615,
"eval_runtime": 112.02,
"eval_samples_per_second": 889.359,
"eval_steps_per_second": 55.588,
"step": 600000
},
{
"epoch": 5.14,
"learning_rate": 3.061333333333333e-07,
"loss": 3.3115,
"step": 608000
},
{
"epoch": 5.14,
"eval_loss": 3.155081033706665,
"eval_runtime": 112.107,
"eval_samples_per_second": 888.669,
"eval_steps_per_second": 55.545,
"step": 608000
},
{
"epoch": 5.21,
"eval_loss": 3.1493077278137207,
"eval_runtime": 112.698,
"eval_samples_per_second": 884.009,
"eval_steps_per_second": 55.254,
"step": 616000
},
{
"epoch": 5.27,
"learning_rate": 3.034e-07,
"loss": 3.3079,
"step": 624000
},
{
"epoch": 5.27,
"eval_loss": 3.1427001953125,
"eval_runtime": 112.7088,
"eval_samples_per_second": 883.924,
"eval_steps_per_second": 55.249,
"step": 624000
},
{
"epoch": 5.34,
"eval_loss": 3.1499979496002197,
"eval_runtime": 113.2934,
"eval_samples_per_second": 879.363,
"eval_steps_per_second": 54.964,
"step": 632000
},
{
"epoch": 5.41,
"learning_rate": 3.0066666666666663e-07,
"loss": 3.3138,
"step": 640000
},
{
"epoch": 5.41,
"eval_loss": 3.1546239852905273,
"eval_runtime": 112.2247,
"eval_samples_per_second": 887.737,
"eval_steps_per_second": 55.487,
"step": 640000
},
{
"epoch": 5.48,
"eval_loss": 3.1481516361236572,
"eval_runtime": 112.549,
"eval_samples_per_second": 885.179,
"eval_steps_per_second": 55.327,
"step": 648000
},
{
"epoch": 5.54,
"learning_rate": 2.9793333333333334e-07,
"loss": 3.3096,
"step": 656000
},
{
"epoch": 5.54,
"eval_loss": 3.1346185207366943,
"eval_runtime": 112.618,
"eval_samples_per_second": 884.637,
"eval_steps_per_second": 55.293,
"step": 656000
},
{
"epoch": 5.61,
"eval_loss": 3.1328086853027344,
"eval_runtime": 112.4516,
"eval_samples_per_second": 885.946,
"eval_steps_per_second": 55.375,
"step": 664000
},
{
"epoch": 5.68,
"learning_rate": 2.952e-07,
"loss": 3.3121,
"step": 672000
},
{
"epoch": 5.68,
"eval_loss": 3.14998197555542,
"eval_runtime": 113.0205,
"eval_samples_per_second": 881.486,
"eval_steps_per_second": 55.096,
"step": 672000
},
{
"epoch": 5.75,
"eval_loss": 3.131186008453369,
"eval_runtime": 112.1192,
"eval_samples_per_second": 888.573,
"eval_steps_per_second": 55.539,
"step": 680000
},
{
"epoch": 5.82,
"learning_rate": 2.9246666666666665e-07,
"loss": 3.3195,
"step": 688000
},
{
"epoch": 5.82,
"eval_loss": 3.143950939178467,
"eval_runtime": 114.0106,
"eval_samples_per_second": 873.831,
"eval_steps_per_second": 54.618,
"step": 688000
},
{
"epoch": 5.88,
"eval_loss": 3.1190884113311768,
"eval_runtime": 113.1962,
"eval_samples_per_second": 880.118,
"eval_steps_per_second": 55.011,
"step": 696000
},
{
"epoch": 5.95,
"learning_rate": 2.897333333333333e-07,
"loss": 3.3091,
"step": 704000
},
{
"epoch": 5.95,
"eval_loss": 3.139669179916382,
"eval_runtime": 112.4234,
"eval_samples_per_second": 886.168,
"eval_steps_per_second": 55.389,
"step": 704000
},
{
"epoch": 6.02,
"eval_loss": 3.1484687328338623,
"eval_runtime": 113.6323,
"eval_samples_per_second": 876.74,
"eval_steps_per_second": 54.8,
"step": 712000
},
{
"epoch": 6.09,
"learning_rate": 2.8699999999999996e-07,
"loss": 3.3089,
"step": 720000
},
{
"epoch": 6.09,
"eval_loss": 3.134004592895508,
"eval_runtime": 112.7952,
"eval_samples_per_second": 883.247,
"eval_steps_per_second": 55.206,
"step": 720000
},
{
"epoch": 6.15,
"eval_loss": 3.1384811401367188,
"eval_runtime": 112.7828,
"eval_samples_per_second": 883.344,
"eval_steps_per_second": 55.212,
"step": 728000
},
{
"epoch": 6.22,
"learning_rate": 2.8426666666666667e-07,
"loss": 3.3062,
"step": 736000
},
{
"epoch": 6.22,
"eval_loss": 3.1357834339141846,
"eval_runtime": 113.0764,
"eval_samples_per_second": 881.05,
"eval_steps_per_second": 55.069,
"step": 736000
},
{
"epoch": 6.29,
"eval_loss": 3.1295759677886963,
"eval_runtime": 113.0992,
"eval_samples_per_second": 880.873,
"eval_steps_per_second": 55.058,
"step": 744000
},
{
"epoch": 6.36,
"learning_rate": 2.815333333333333e-07,
"loss": 3.3102,
"step": 752000
},
{
"epoch": 6.36,
"eval_loss": 3.1260080337524414,
"eval_runtime": 113.4791,
"eval_samples_per_second": 877.924,
"eval_steps_per_second": 54.874,
"step": 752000
},
{
"epoch": 6.42,
"eval_loss": 3.142832040786743,
"eval_runtime": 113.2715,
"eval_samples_per_second": 879.533,
"eval_steps_per_second": 54.974,
"step": 760000
},
{
"epoch": 6.49,
"learning_rate": 2.7880000000000003e-07,
"loss": 3.3088,
"step": 768000
},
{
"epoch": 6.49,
"eval_loss": 3.137244939804077,
"eval_runtime": 112.7233,
"eval_samples_per_second": 883.81,
"eval_steps_per_second": 55.241,
"step": 768000
},
{
"epoch": 6.56,
"eval_loss": 3.1404080390930176,
"eval_runtime": 112.6566,
"eval_samples_per_second": 884.333,
"eval_steps_per_second": 55.274,
"step": 776000
},
{
"epoch": 6.63,
"learning_rate": 2.7606666666666664e-07,
"loss": 3.3096,
"step": 784000
},
{
"epoch": 6.63,
"eval_loss": 3.1362013816833496,
"eval_runtime": 112.5766,
"eval_samples_per_second": 884.962,
"eval_steps_per_second": 55.313,
"step": 784000
},
{
"epoch": 6.69,
"eval_loss": 3.1407511234283447,
"eval_runtime": 113.0016,
"eval_samples_per_second": 881.633,
"eval_steps_per_second": 55.105,
"step": 792000
},
{
"epoch": 6.76,
"learning_rate": 2.733333333333333e-07,
"loss": 3.3079,
"step": 800000
},
{
"epoch": 6.76,
"eval_loss": 3.134976625442505,
"eval_runtime": 112.2018,
"eval_samples_per_second": 887.918,
"eval_steps_per_second": 55.498,
"step": 800000
},
{
"epoch": 6.83,
"eval_loss": 3.146075487136841,
"eval_runtime": 112.9151,
"eval_samples_per_second": 882.309,
"eval_steps_per_second": 55.148,
"step": 808000
},
{
"epoch": 6.9,
"learning_rate": 2.706e-07,
"loss": 3.3099,
"step": 816000
},
{
"epoch": 6.9,
"eval_loss": 3.142042398452759,
"eval_runtime": 112.8773,
"eval_samples_per_second": 882.604,
"eval_steps_per_second": 55.166,
"step": 816000
},
{
"epoch": 6.96,
"eval_loss": 3.121290922164917,
"eval_runtime": 112.4177,
"eval_samples_per_second": 886.213,
"eval_steps_per_second": 55.392,
"step": 824000
},
{
"epoch": 7.03,
"learning_rate": 2.6786666666666666e-07,
"loss": 3.3015,
"step": 832000
},
{
"epoch": 7.03,
"eval_loss": 3.136622905731201,
"eval_runtime": 112.757,
"eval_samples_per_second": 883.546,
"eval_steps_per_second": 55.225,
"step": 832000
},
{
"epoch": 7.1,
"eval_loss": 3.1400632858276367,
"eval_runtime": 112.5633,
"eval_samples_per_second": 885.067,
"eval_steps_per_second": 55.32,
"step": 840000
},
{
"epoch": 7.17,
"learning_rate": 2.651333333333333e-07,
"loss": 3.3045,
"step": 848000
},
{
"epoch": 7.17,
"eval_loss": 3.129455804824829,
"eval_runtime": 112.7781,
"eval_samples_per_second": 883.381,
"eval_steps_per_second": 55.215,
"step": 848000
},
{
"epoch": 7.24,
"eval_loss": 3.1323471069335938,
"eval_runtime": 113.1417,
"eval_samples_per_second": 880.542,
"eval_steps_per_second": 55.037,
"step": 856000
},
{
"epoch": 7.3,
"learning_rate": 2.624e-07,
"loss": 3.3085,
"step": 864000
},
{
"epoch": 7.3,
"eval_loss": 3.1367900371551514,
"eval_runtime": 112.7893,
"eval_samples_per_second": 883.293,
"eval_steps_per_second": 55.209,
"step": 864000
},
{
"epoch": 7.37,
"eval_loss": 3.1274642944335938,
"eval_runtime": 113.8432,
"eval_samples_per_second": 875.116,
"eval_steps_per_second": 54.698,
"step": 872000
},
{
"epoch": 7.44,
"learning_rate": 2.596666666666667e-07,
"loss": 3.3061,
"step": 880000
},
{
"epoch": 7.44,
"eval_loss": 3.1325767040252686,
"eval_runtime": 112.5258,
"eval_samples_per_second": 885.362,
"eval_steps_per_second": 55.338,
"step": 880000
},
{
"epoch": 7.51,
"eval_loss": 3.137669801712036,
"eval_runtime": 112.5494,
"eval_samples_per_second": 885.175,
"eval_steps_per_second": 55.327,
"step": 888000
},
{
"epoch": 7.57,
"learning_rate": 2.5693333333333333e-07,
"loss": 3.309,
"step": 896000
},
{
"epoch": 7.57,
"eval_loss": 3.1406917572021484,
"eval_runtime": 112.4096,
"eval_samples_per_second": 886.276,
"eval_steps_per_second": 55.396,
"step": 896000
},
{
"epoch": 7.64,
"eval_loss": 3.132387399673462,
"eval_runtime": 113.3694,
"eval_samples_per_second": 878.773,
"eval_steps_per_second": 54.927,
"step": 904000
},
{
"epoch": 7.71,
"learning_rate": 2.542e-07,
"loss": 3.3024,
"step": 912000
},
{
"epoch": 7.71,
"eval_loss": 3.1187102794647217,
"eval_runtime": 112.3891,
"eval_samples_per_second": 886.438,
"eval_steps_per_second": 55.406,
"step": 912000
},
{
"epoch": 7.78,
"eval_loss": 3.1514384746551514,
"eval_runtime": 112.9329,
"eval_samples_per_second": 882.17,
"eval_steps_per_second": 55.139,
"step": 920000
},
{
"epoch": 7.84,
"learning_rate": 2.5146666666666664e-07,
"loss": 3.2955,
"step": 928000
},
{
"epoch": 7.84,
"eval_loss": 3.135131359100342,
"eval_runtime": 113.0723,
"eval_samples_per_second": 881.082,
"eval_steps_per_second": 55.071,
"step": 928000
},
{
"epoch": 7.91,
"eval_loss": 3.1307849884033203,
"eval_runtime": 112.9789,
"eval_samples_per_second": 881.811,
"eval_steps_per_second": 55.117,
"step": 936000
},
{
"epoch": 7.98,
"learning_rate": 2.4873333333333335e-07,
"loss": 3.3122,
"step": 944000
},
{
"epoch": 7.98,
"eval_loss": 3.1404905319213867,
"eval_runtime": 113.3801,
"eval_samples_per_second": 878.69,
"eval_steps_per_second": 54.921,
"step": 944000
},
{
"epoch": 8.05,
"eval_loss": 3.129053831100464,
"eval_runtime": 113.1579,
"eval_samples_per_second": 880.416,
"eval_steps_per_second": 55.029,
"step": 952000
},
{
"epoch": 8.11,
"learning_rate": 2.46e-07,
"loss": 3.304,
"step": 960000
},
{
"epoch": 8.11,
"eval_loss": 3.1244165897369385,
"eval_runtime": 113.5289,
"eval_samples_per_second": 877.539,
"eval_steps_per_second": 54.849,
"step": 960000
},
{
"epoch": 8.18,
"eval_loss": 3.1409430503845215,
"eval_runtime": 113.3422,
"eval_samples_per_second": 878.984,
"eval_steps_per_second": 54.94,
"step": 968000
},
{
"epoch": 8.25,
"learning_rate": 2.4326666666666666e-07,
"loss": 3.3046,
"step": 976000
},
{
"epoch": 8.25,
"eval_loss": 3.135524272918701,
"eval_runtime": 112.7434,
"eval_samples_per_second": 883.653,
"eval_steps_per_second": 55.232,
"step": 976000
},
{
"epoch": 8.32,
"eval_loss": 3.141561269760132,
"eval_runtime": 113.0047,
"eval_samples_per_second": 881.609,
"eval_steps_per_second": 55.104,
"step": 984000
},
{
"epoch": 8.38,
"learning_rate": 2.405333333333333e-07,
"loss": 3.3022,
"step": 992000
},
{
"epoch": 8.38,
"eval_loss": 3.1258225440979004,
"eval_runtime": 113.3174,
"eval_samples_per_second": 879.177,
"eval_steps_per_second": 54.952,
"step": 992000
},
{
"epoch": 8.45,
"eval_loss": 3.1332101821899414,
"eval_runtime": 113.3836,
"eval_samples_per_second": 878.663,
"eval_steps_per_second": 54.92,
"step": 1000000
},
{
"epoch": 8.52,
"learning_rate": 2.3779999999999997e-07,
"loss": 3.3004,
"step": 1008000
},
{
"epoch": 8.52,
"eval_loss": 3.143005847930908,
"eval_runtime": 113.3372,
"eval_samples_per_second": 879.023,
"eval_steps_per_second": 54.942,
"step": 1008000
},
{
"epoch": 8.59,
"eval_loss": 3.1281683444976807,
"eval_runtime": 113.2531,
"eval_samples_per_second": 879.675,
"eval_steps_per_second": 54.983,
"step": 1016000
},
{
"epoch": 8.66,
"learning_rate": 2.3506666666666668e-07,
"loss": 3.3045,
"step": 1024000
},
{
"epoch": 8.66,
"eval_loss": 3.1286985874176025,
"eval_runtime": 112.5841,
"eval_samples_per_second": 884.903,
"eval_steps_per_second": 55.31,
"step": 1024000
},
{
"epoch": 8.72,
"eval_loss": 3.1368112564086914,
"eval_runtime": 113.404,
"eval_samples_per_second": 878.505,
"eval_steps_per_second": 54.91,
"step": 1032000
},
{
"epoch": 8.79,
"learning_rate": 2.3233333333333334e-07,
"loss": 3.3047,
"step": 1040000
},
{
"epoch": 8.79,
"eval_loss": 3.136190891265869,
"eval_runtime": 113.3739,
"eval_samples_per_second": 878.738,
"eval_steps_per_second": 54.924,
"step": 1040000
},
{
"epoch": 8.86,
"eval_loss": 3.1267800331115723,
"eval_runtime": 113.3548,
"eval_samples_per_second": 878.886,
"eval_steps_per_second": 54.934,
"step": 1048000
},
{
"epoch": 8.93,
"learning_rate": 2.2960000000000002e-07,
"loss": 3.3044,
"step": 1056000
},
{
"epoch": 8.93,
"eval_loss": 3.1329193115234375,
"eval_runtime": 113.2679,
"eval_samples_per_second": 879.56,
"eval_steps_per_second": 54.976,
"step": 1056000
},
{
"epoch": 8.99,
"eval_loss": 3.124464273452759,
"eval_runtime": 112.1804,
"eval_samples_per_second": 888.087,
"eval_steps_per_second": 55.509,
"step": 1064000
},
{
"epoch": 9.06,
"learning_rate": 2.2686666666666667e-07,
"loss": 3.2961,
"step": 1072000
},
{
"epoch": 9.06,
"eval_loss": 3.127128839492798,
"eval_runtime": 112.9944,
"eval_samples_per_second": 881.69,
"eval_steps_per_second": 55.109,
"step": 1072000
},
{
"epoch": 9.13,
"eval_loss": 3.130047559738159,
"eval_runtime": 113.791,
"eval_samples_per_second": 875.518,
"eval_steps_per_second": 54.723,
"step": 1080000
},
{
"epoch": 9.2,
"learning_rate": 2.2413333333333333e-07,
"loss": 3.2999,
"step": 1088000
},
{
"epoch": 9.2,
"eval_loss": 3.136892080307007,
"eval_runtime": 113.0086,
"eval_samples_per_second": 881.579,
"eval_steps_per_second": 55.102,
"step": 1088000
},
{
"epoch": 9.26,
"eval_loss": 3.1424949169158936,
"eval_runtime": 113.6203,
"eval_samples_per_second": 876.833,
"eval_steps_per_second": 54.805,
"step": 1096000
},
{
"epoch": 9.33,
"learning_rate": 2.214e-07,
"loss": 3.3012,
"step": 1104000
},
{
"epoch": 9.33,
"eval_loss": 3.121316432952881,
"eval_runtime": 113.7295,
"eval_samples_per_second": 875.991,
"eval_steps_per_second": 54.753,
"step": 1104000
},
{
"epoch": 9.4,
"eval_loss": 3.1285130977630615,
"eval_runtime": 114.4392,
"eval_samples_per_second": 870.558,
"eval_steps_per_second": 54.413,
"step": 1112000
},
{
"epoch": 9.47,
"learning_rate": 2.1866666666666667e-07,
"loss": 3.3008,
"step": 1120000
},
{
"epoch": 9.47,
"eval_loss": 3.135331869125366,
"eval_runtime": 114.4873,
"eval_samples_per_second": 870.193,
"eval_steps_per_second": 54.39,
"step": 1120000
},
{
"epoch": 9.53,
"eval_loss": 3.136654853820801,
"eval_runtime": 113.6856,
"eval_samples_per_second": 876.329,
"eval_steps_per_second": 54.774,
"step": 1128000
},
{
"epoch": 9.6,
"learning_rate": 2.1593333333333332e-07,
"loss": 3.3028,
"step": 1136000
},
{
"epoch": 9.6,
"eval_loss": 3.129446029663086,
"eval_runtime": 113.0995,
"eval_samples_per_second": 880.871,
"eval_steps_per_second": 55.058,
"step": 1136000
},
{
"epoch": 9.67,
"eval_loss": 3.133976459503174,
"eval_runtime": 113.1955,
"eval_samples_per_second": 880.124,
"eval_steps_per_second": 55.011,
"step": 1144000
},
{
"epoch": 9.74,
"learning_rate": 2.132e-07,
"loss": 3.3043,
"step": 1152000
},
{
"epoch": 9.74,
"eval_loss": 3.1329877376556396,
"eval_runtime": 113.2026,
"eval_samples_per_second": 880.068,
"eval_steps_per_second": 55.008,
"step": 1152000
},
{
"epoch": 9.8,
"eval_loss": 3.13797664642334,
"eval_runtime": 113.0427,
"eval_samples_per_second": 881.313,
"eval_steps_per_second": 55.085,
"step": 1160000
},
{
"epoch": 9.87,
"learning_rate": 2.1046666666666666e-07,
"loss": 3.2976,
"step": 1168000
},
{
"epoch": 9.87,
"eval_loss": 3.119840145111084,
"eval_runtime": 113.6008,
"eval_samples_per_second": 876.983,
"eval_steps_per_second": 54.815,
"step": 1168000
},
{
"epoch": 9.94,
"eval_loss": 3.128972053527832,
"eval_runtime": 114.1658,
"eval_samples_per_second": 872.643,
"eval_steps_per_second": 54.543,
"step": 1176000
},
{
"epoch": 10.01,
"learning_rate": 2.0773333333333334e-07,
"loss": 3.3048,
"step": 1184000
},
{
"epoch": 10.01,
"eval_loss": 3.1457955837249756,
"eval_runtime": 113.9721,
"eval_samples_per_second": 874.126,
"eval_steps_per_second": 54.636,
"step": 1184000
},
{
"epoch": 10.08,
"eval_loss": 3.1274356842041016,
"eval_runtime": 114.22,
"eval_samples_per_second": 872.229,
"eval_steps_per_second": 54.518,
"step": 1192000
},
{
"epoch": 10.14,
"learning_rate": 2.05e-07,
"loss": 3.3038,
"step": 1200000
},
{
"epoch": 10.14,
"eval_loss": 3.1180973052978516,
"eval_runtime": 114.324,
"eval_samples_per_second": 871.436,
"eval_steps_per_second": 54.468,
"step": 1200000
},
{
"epoch": 10.21,
"eval_loss": 3.127936601638794,
"eval_runtime": 113.9464,
"eval_samples_per_second": 874.324,
"eval_steps_per_second": 54.649,
"step": 1208000
},
{
"epoch": 10.28,
"learning_rate": 2.0226666666666668e-07,
"loss": 3.3066,
"step": 1216000
},
{
"epoch": 10.28,
"eval_loss": 3.122974395751953,
"eval_runtime": 115.0259,
"eval_samples_per_second": 866.118,
"eval_steps_per_second": 54.136,
"step": 1216000
},
{
"epoch": 10.35,
"eval_loss": 3.1319711208343506,
"eval_runtime": 114.7784,
"eval_samples_per_second": 867.986,
"eval_steps_per_second": 54.252,
"step": 1224000
},
{
"epoch": 10.41,
"learning_rate": 1.9953333333333333e-07,
"loss": 3.3019,
"step": 1232000
},
{
"epoch": 10.41,
"eval_loss": 3.1202657222747803,
"eval_runtime": 115.5672,
"eval_samples_per_second": 862.061,
"eval_steps_per_second": 53.882,
"step": 1232000
},
{
"epoch": 10.48,
"eval_loss": 3.134918451309204,
"eval_runtime": 114.8028,
"eval_samples_per_second": 867.801,
"eval_steps_per_second": 54.241,
"step": 1240000
},
{
"epoch": 10.55,
"learning_rate": 1.968e-07,
"loss": 3.3037,
"step": 1248000
},
{
"epoch": 10.55,
"eval_loss": 3.132294178009033,
"eval_runtime": 115.0121,
"eval_samples_per_second": 866.222,
"eval_steps_per_second": 54.142,
"step": 1248000
},
{
"epoch": 10.62,
"eval_loss": 3.134295701980591,
"eval_runtime": 114.186,
"eval_samples_per_second": 872.489,
"eval_steps_per_second": 54.534,
"step": 1256000
},
{
"epoch": 10.68,
"learning_rate": 1.9406666666666667e-07,
"loss": 3.2868,
"step": 1264000
},
{
"epoch": 10.68,
"eval_loss": 3.1262283325195312,
"eval_runtime": 114.6026,
"eval_samples_per_second": 869.317,
"eval_steps_per_second": 54.336,
"step": 1264000
},
{
"epoch": 10.75,
"eval_loss": 3.1265833377838135,
"eval_runtime": 114.6327,
"eval_samples_per_second": 869.089,
"eval_steps_per_second": 54.321,
"step": 1272000
},
{
"epoch": 10.82,
"learning_rate": 1.9133333333333333e-07,
"loss": 3.3033,
"step": 1280000
},
{
"epoch": 10.82,
"eval_loss": 3.1282565593719482,
"eval_runtime": 113.7083,
"eval_samples_per_second": 876.154,
"eval_steps_per_second": 54.763,
"step": 1280000
},
{
"epoch": 10.89,
"eval_loss": 3.1290106773376465,
"eval_runtime": 113.8399,
"eval_samples_per_second": 875.141,
"eval_steps_per_second": 54.7,
"step": 1288000
},
{
"epoch": 10.95,
"learning_rate": 1.886e-07,
"loss": 3.2984,
"step": 1296000
},
{
"epoch": 10.95,
"eval_loss": 3.1177093982696533,
"eval_runtime": 114.6951,
"eval_samples_per_second": 868.616,
"eval_steps_per_second": 54.292,
"step": 1296000
},
{
"epoch": 11.02,
"eval_loss": 3.123425245285034,
"eval_runtime": 113.7751,
"eval_samples_per_second": 875.64,
"eval_steps_per_second": 54.731,
"step": 1304000
},
{
"epoch": 11.09,
"learning_rate": 1.8586666666666666e-07,
"loss": 3.2982,
"step": 1312000
},
{
"epoch": 11.09,
"eval_loss": 3.1309823989868164,
"eval_runtime": 114.1207,
"eval_samples_per_second": 872.988,
"eval_steps_per_second": 54.565,
"step": 1312000
},
{
"epoch": 11.16,
"eval_loss": 3.1408894062042236,
"eval_runtime": 113.7168,
"eval_samples_per_second": 876.089,
"eval_steps_per_second": 54.759,
"step": 1320000
},
{
"epoch": 11.23,
"learning_rate": 1.8313333333333332e-07,
"loss": 3.303,
"step": 1328000
},
{
"epoch": 11.23,
"eval_loss": 3.132986545562744,
"eval_runtime": 114.412,
"eval_samples_per_second": 870.766,
"eval_steps_per_second": 54.426,
"step": 1328000
},
{
"epoch": 11.29,
"eval_loss": 3.1281206607818604,
"eval_runtime": 114.0643,
"eval_samples_per_second": 873.42,
"eval_steps_per_second": 54.592,
"step": 1336000
},
{
"epoch": 11.36,
"learning_rate": 1.804e-07,
"loss": 3.2976,
"step": 1344000
},
{
"epoch": 11.36,
"eval_loss": 3.1286239624023438,
"eval_runtime": 114.1147,
"eval_samples_per_second": 873.034,
"eval_steps_per_second": 54.568,
"step": 1344000
},
{
"epoch": 11.43,
"eval_loss": 3.1282992362976074,
"eval_runtime": 114.374,
"eval_samples_per_second": 871.055,
"eval_steps_per_second": 54.444,
"step": 1352000
},
{
"epoch": 11.5,
"learning_rate": 1.7766666666666666e-07,
"loss": 3.2923,
"step": 1360000
},
{
"epoch": 11.5,
"eval_loss": 3.114553451538086,
"eval_runtime": 112.5162,
"eval_samples_per_second": 885.437,
"eval_steps_per_second": 55.343,
"step": 1360000
},
{
"epoch": 11.56,
"eval_loss": 3.1387319564819336,
"eval_runtime": 113.9184,
"eval_samples_per_second": 874.539,
"eval_steps_per_second": 54.662,
"step": 1368000
},
{
"epoch": 11.63,
"learning_rate": 1.7493333333333334e-07,
"loss": 3.2988,
"step": 1376000
},
{
"epoch": 11.63,
"eval_loss": 3.1278181076049805,
"eval_runtime": 114.7175,
"eval_samples_per_second": 868.447,
"eval_steps_per_second": 54.281,
"step": 1376000
},
{
"epoch": 11.7,
"eval_loss": 3.1225082874298096,
"eval_runtime": 113.872,
"eval_samples_per_second": 874.894,
"eval_steps_per_second": 54.684,
"step": 1384000
},
{
"epoch": 11.77,
"learning_rate": 1.722e-07,
"loss": 3.299,
"step": 1392000
},
{
"epoch": 11.77,
"eval_loss": 3.1341497898101807,
"eval_runtime": 113.9675,
"eval_samples_per_second": 874.161,
"eval_steps_per_second": 54.638,
"step": 1392000
},
{
"epoch": 11.83,
"eval_loss": 3.1210529804229736,
"eval_runtime": 113.6828,
"eval_samples_per_second": 876.351,
"eval_steps_per_second": 54.775,
"step": 1400000
},
{
"epoch": 11.9,
"learning_rate": 1.6946666666666668e-07,
"loss": 3.2993,
"step": 1408000
},
{
"epoch": 11.9,
"eval_loss": 3.1026012897491455,
"eval_runtime": 114.1409,
"eval_samples_per_second": 872.834,
"eval_steps_per_second": 54.555,
"step": 1408000
},
{
"epoch": 11.97,
"eval_loss": 3.1222946643829346,
"eval_runtime": 113.2607,
"eval_samples_per_second": 879.617,
"eval_steps_per_second": 54.979,
"step": 1416000
},
{
"epoch": 12.04,
"learning_rate": 1.6673333333333333e-07,
"loss": 3.2942,
"step": 1424000
},
{
"epoch": 12.04,
"eval_loss": 3.1199705600738525,
"eval_runtime": 114.2703,
"eval_samples_per_second": 871.845,
"eval_steps_per_second": 54.494,
"step": 1424000
},
{
"epoch": 12.1,
"eval_loss": 3.1245763301849365,
"eval_runtime": 114.7753,
"eval_samples_per_second": 868.009,
"eval_steps_per_second": 54.254,
"step": 1432000
},
{
"epoch": 12.17,
"learning_rate": 1.64e-07,
"loss": 3.3062,
"step": 1440000
},
{
"epoch": 12.17,
"eval_loss": 3.1325275897979736,
"eval_runtime": 114.1964,
"eval_samples_per_second": 872.409,
"eval_steps_per_second": 54.529,
"step": 1440000
},
{
"epoch": 12.24,
"eval_loss": 3.138754367828369,
"eval_runtime": 113.4408,
"eval_samples_per_second": 878.22,
"eval_steps_per_second": 54.892,
"step": 1448000
},
{
"epoch": 12.31,
"learning_rate": 1.6126666666666667e-07,
"loss": 3.297,
"step": 1456000
},
{
"epoch": 12.31,
"eval_loss": 3.1370742321014404,
"eval_runtime": 114.3725,
"eval_samples_per_second": 871.066,
"eval_steps_per_second": 54.445,
"step": 1456000
},
{
"epoch": 12.37,
"eval_loss": 3.1272239685058594,
"eval_runtime": 114.3424,
"eval_samples_per_second": 871.295,
"eval_steps_per_second": 54.459,
"step": 1464000
},
{
"epoch": 12.44,
"learning_rate": 1.5853333333333332e-07,
"loss": 3.3033,
"step": 1472000
},
{
"epoch": 12.44,
"eval_loss": 3.1231026649475098,
"eval_runtime": 113.6228,
"eval_samples_per_second": 876.813,
"eval_steps_per_second": 54.804,
"step": 1472000
},
{
"epoch": 12.51,
"eval_loss": 3.131573438644409,
"eval_runtime": 114.5622,
"eval_samples_per_second": 869.623,
"eval_steps_per_second": 54.355,
"step": 1480000
},
{
"epoch": 12.58,
"learning_rate": 1.558e-07,
"loss": 3.291,
"step": 1488000
},
{
"epoch": 12.58,
"eval_loss": 3.139345169067383,
"eval_runtime": 114.0858,
"eval_samples_per_second": 873.255,
"eval_steps_per_second": 54.582,
"step": 1488000
},
{
"epoch": 12.65,
"eval_loss": 3.1269216537475586,
"eval_runtime": 114.7525,
"eval_samples_per_second": 868.182,
"eval_steps_per_second": 54.265,
"step": 1496000
},
{
"epoch": 12.71,
"learning_rate": 1.5306666666666666e-07,
"loss": 3.3054,
"step": 1504000
},
{
"epoch": 12.71,
"eval_loss": 3.1363420486450195,
"eval_runtime": 114.3108,
"eval_samples_per_second": 871.536,
"eval_steps_per_second": 54.474,
"step": 1504000
},
{
"epoch": 12.78,
"eval_loss": 3.1249115467071533,
"eval_runtime": 114.4828,
"eval_samples_per_second": 870.227,
"eval_steps_per_second": 54.392,
"step": 1512000
},
{
"epoch": 12.85,
"learning_rate": 1.5033333333333332e-07,
"loss": 3.2908,
"step": 1520000
},
{
"epoch": 12.85,
"eval_loss": 3.1309752464294434,
"eval_runtime": 114.385,
"eval_samples_per_second": 870.97,
"eval_steps_per_second": 54.439,
"step": 1520000
},
{
"epoch": 12.92,
"eval_loss": 3.121305465698242,
"eval_runtime": 115.3916,
"eval_samples_per_second": 863.373,
"eval_steps_per_second": 53.964,
"step": 1528000
},
{
"epoch": 12.98,
"learning_rate": 1.476e-07,
"loss": 3.2987,
"step": 1536000
},
{
"epoch": 12.98,
"eval_loss": 3.122337818145752,
"eval_runtime": 115.0493,
"eval_samples_per_second": 865.942,
"eval_steps_per_second": 54.125,
"step": 1536000
},
{
"epoch": 13.05,
"eval_loss": 3.1133577823638916,
"eval_runtime": 113.7876,
"eval_samples_per_second": 875.544,
"eval_steps_per_second": 54.725,
"step": 1544000
},
{
"epoch": 13.12,
"learning_rate": 1.4486666666666665e-07,
"loss": 3.2965,
"step": 1552000
},
{
"epoch": 13.12,
"eval_loss": 3.1168410778045654,
"eval_runtime": 113.5718,
"eval_samples_per_second": 877.207,
"eval_steps_per_second": 54.829,
"step": 1552000
},
{
"epoch": 13.19,
"eval_loss": 3.1230275630950928,
"eval_runtime": 113.4112,
"eval_samples_per_second": 878.449,
"eval_steps_per_second": 54.906,
"step": 1560000
},
{
"epoch": 13.25,
"learning_rate": 1.4213333333333334e-07,
"loss": 3.2931,
"step": 1568000
},
{
"epoch": 13.25,
"eval_loss": 3.113243818283081,
"eval_runtime": 114.3885,
"eval_samples_per_second": 870.944,
"eval_steps_per_second": 54.437,
"step": 1568000
},
{
"epoch": 13.32,
"eval_loss": 3.119607925415039,
"eval_runtime": 114.327,
"eval_samples_per_second": 871.412,
"eval_steps_per_second": 54.467,
"step": 1576000
},
{
"epoch": 13.39,
"learning_rate": 1.3940000000000002e-07,
"loss": 3.301,
"step": 1584000
},
{
"epoch": 13.39,
"eval_loss": 3.1286561489105225,
"eval_runtime": 114.3561,
"eval_samples_per_second": 871.191,
"eval_steps_per_second": 54.453,
"step": 1584000
},
{
"epoch": 13.46,
"eval_loss": 3.114452838897705,
"eval_runtime": 114.7858,
"eval_samples_per_second": 867.929,
"eval_steps_per_second": 54.249,
"step": 1592000
},
{
"epoch": 13.52,
"learning_rate": 1.3666666666666665e-07,
"loss": 3.3004,
"step": 1600000
},
{
"epoch": 13.52,
"eval_loss": 3.129112482070923,
"eval_runtime": 115.2447,
"eval_samples_per_second": 864.473,
"eval_steps_per_second": 54.033,
"step": 1600000
},
{
"epoch": 13.59,
"eval_loss": 3.1145238876342773,
"eval_runtime": 114.3883,
"eval_samples_per_second": 870.946,
"eval_steps_per_second": 54.437,
"step": 1608000
},
{
"epoch": 13.66,
"learning_rate": 1.3393333333333333e-07,
"loss": 3.2992,
"step": 1616000
},
{
"epoch": 13.66,
"eval_loss": 3.129173755645752,
"eval_runtime": 114.4593,
"eval_samples_per_second": 870.405,
"eval_steps_per_second": 54.404,
"step": 1616000
},
{
"epoch": 13.73,
"eval_loss": 3.124779462814331,
"eval_runtime": 114.1908,
"eval_samples_per_second": 872.452,
"eval_steps_per_second": 54.532,
"step": 1624000
},
{
"epoch": 13.79,
"learning_rate": 1.312e-07,
"loss": 3.2974,
"step": 1632000
},
{
"epoch": 13.79,
"eval_loss": 3.1315438747406006,
"eval_runtime": 114.5685,
"eval_samples_per_second": 869.576,
"eval_steps_per_second": 54.352,
"step": 1632000
},
{
"epoch": 13.86,
"eval_loss": 3.111248254776001,
"eval_runtime": 114.8609,
"eval_samples_per_second": 867.362,
"eval_steps_per_second": 54.213,
"step": 1640000
},
{
"epoch": 13.93,
"learning_rate": 1.2846666666666667e-07,
"loss": 3.2993,
"step": 1648000
},
{
"epoch": 13.93,
"eval_loss": 3.121676206588745,
"eval_runtime": 115.6258,
"eval_samples_per_second": 861.625,
"eval_steps_per_second": 53.855,
"step": 1648000
},
{
"epoch": 14.0,
"eval_loss": 3.136202812194824,
"eval_runtime": 115.2064,
"eval_samples_per_second": 864.761,
"eval_steps_per_second": 54.051,
"step": 1656000
},
{
"epoch": 14.07,
"learning_rate": 1.2573333333333332e-07,
"loss": 3.2934,
"step": 1664000
},
{
"epoch": 14.07,
"eval_loss": 3.1199350357055664,
"eval_runtime": 114.9755,
"eval_samples_per_second": 866.498,
"eval_steps_per_second": 54.159,
"step": 1664000
},
{
"epoch": 14.13,
"eval_loss": 3.1276044845581055,
"eval_runtime": 114.7877,
"eval_samples_per_second": 867.916,
"eval_steps_per_second": 54.248,
"step": 1672000
},
{
"epoch": 14.2,
"learning_rate": 1.23e-07,
"loss": 3.2964,
"step": 1680000
},
{
"epoch": 14.2,
"eval_loss": 3.1164281368255615,
"eval_runtime": 115.6081,
"eval_samples_per_second": 861.756,
"eval_steps_per_second": 53.863,
"step": 1680000
},
{
"epoch": 14.27,
"eval_loss": 3.117210865020752,
"eval_runtime": 114.8184,
"eval_samples_per_second": 867.683,
"eval_steps_per_second": 54.233,
"step": 1688000
},
{
"epoch": 14.34,
"learning_rate": 1.2026666666666666e-07,
"loss": 3.305,
"step": 1696000
},
{
"epoch": 14.34,
"eval_loss": 3.1319870948791504,
"eval_runtime": 114.4707,
"eval_samples_per_second": 870.319,
"eval_steps_per_second": 54.398,
"step": 1696000
},
{
"epoch": 14.4,
"eval_loss": 3.1268680095672607,
"eval_runtime": 115.2616,
"eval_samples_per_second": 864.347,
"eval_steps_per_second": 54.025,
"step": 1704000
},
{
"epoch": 14.47,
"learning_rate": 1.1753333333333334e-07,
"loss": 3.3022,
"step": 1712000
},
{
"epoch": 14.47,
"eval_loss": 3.1107068061828613,
"eval_runtime": 115.3427,
"eval_samples_per_second": 863.739,
"eval_steps_per_second": 53.987,
"step": 1712000
},
{
"epoch": 14.54,
"eval_loss": 3.1096973419189453,
"eval_runtime": 115.7245,
"eval_samples_per_second": 860.89,
"eval_steps_per_second": 53.809,
"step": 1720000
},
{
"epoch": 14.61,
"learning_rate": 1.1480000000000001e-07,
"loss": 3.2969,
"step": 1728000
},
{
"epoch": 14.61,
"eval_loss": 3.117579460144043,
"eval_runtime": 114.9075,
"eval_samples_per_second": 867.01,
"eval_steps_per_second": 54.191,
"step": 1728000
},
{
"epoch": 14.67,
"eval_loss": 3.1282224655151367,
"eval_runtime": 114.9382,
"eval_samples_per_second": 866.779,
"eval_steps_per_second": 54.177,
"step": 1736000
},
{
"epoch": 14.74,
"learning_rate": 1.1206666666666666e-07,
"loss": 3.2976,
"step": 1744000
},
{
"epoch": 14.74,
"eval_loss": 3.1195032596588135,
"eval_runtime": 114.3423,
"eval_samples_per_second": 871.296,
"eval_steps_per_second": 54.459,
"step": 1744000
},
{
"epoch": 14.81,
"eval_loss": 3.115382432937622,
"eval_runtime": 115.6972,
"eval_samples_per_second": 861.093,
"eval_steps_per_second": 53.822,
"step": 1752000
},
{
"epoch": 14.88,
"learning_rate": 1.0933333333333333e-07,
"loss": 3.3004,
"step": 1760000
},
{
"epoch": 14.88,
"eval_loss": 3.114680767059326,
"eval_runtime": 114.7064,
"eval_samples_per_second": 868.53,
"eval_steps_per_second": 54.286,
"step": 1760000
},
{
"epoch": 14.94,
"eval_loss": 3.109429359436035,
"eval_runtime": 121.4142,
"eval_samples_per_second": 820.547,
"eval_steps_per_second": 51.287,
"step": 1768000
},
{
"epoch": 15.01,
"learning_rate": 1.066e-07,
"loss": 3.2908,
"step": 1776000
},
{
"epoch": 15.01,
"eval_loss": 3.1313207149505615,
"eval_runtime": 122.4119,
"eval_samples_per_second": 813.859,
"eval_steps_per_second": 50.869,
"step": 1776000
},
{
"epoch": 15.08,
"eval_loss": 3.128021001815796,
"eval_runtime": 121.5867,
"eval_samples_per_second": 819.382,
"eval_steps_per_second": 51.214,
"step": 1784000
},
{
"epoch": 15.15,
"learning_rate": 1.0386666666666667e-07,
"loss": 3.2896,
"step": 1792000
},
{
"epoch": 15.15,
"eval_loss": 3.130439043045044,
"eval_runtime": 123.1458,
"eval_samples_per_second": 809.009,
"eval_steps_per_second": 50.566,
"step": 1792000
},
{
"epoch": 15.21,
"eval_loss": 3.1329095363616943,
"eval_runtime": 122.3076,
"eval_samples_per_second": 814.553,
"eval_steps_per_second": 50.913,
"step": 1800000
},
{
"epoch": 15.28,
"learning_rate": 1.0113333333333334e-07,
"loss": 3.3061,
"step": 1808000
},
{
"epoch": 15.28,
"eval_loss": 3.119783401489258,
"eval_runtime": 122.902,
"eval_samples_per_second": 810.614,
"eval_steps_per_second": 50.666,
"step": 1808000
},
{
"epoch": 15.35,
"eval_loss": 3.1258275508880615,
"eval_runtime": 124.0345,
"eval_samples_per_second": 803.212,
"eval_steps_per_second": 50.204,
"step": 1816000
},
{
"epoch": 15.42,
"learning_rate": 9.84e-08,
"loss": 3.3056,
"step": 1824000
},
{
"epoch": 15.42,
"eval_loss": 3.125251531600952,
"eval_runtime": 122.5875,
"eval_samples_per_second": 812.693,
"eval_steps_per_second": 50.796,
"step": 1824000
},
{
"epoch": 15.49,
"eval_loss": 3.1200268268585205,
"eval_runtime": 123.5094,
"eval_samples_per_second": 806.627,
"eval_steps_per_second": 50.417,
"step": 1832000
},
{
"epoch": 15.55,
"learning_rate": 9.566666666666666e-08,
"loss": 3.2921,
"step": 1840000
},
{
"epoch": 15.55,
"eval_loss": 3.138437032699585,
"eval_runtime": 122.6438,
"eval_samples_per_second": 812.32,
"eval_steps_per_second": 50.773,
"step": 1840000
},
{
"epoch": 15.62,
"eval_loss": 3.1225225925445557,
"eval_runtime": 123.7318,
"eval_samples_per_second": 805.177,
"eval_steps_per_second": 50.327,
"step": 1848000
},
{
"epoch": 15.69,
"learning_rate": 9.293333333333333e-08,
"loss": 3.2895,
"step": 1856000
},
{
"epoch": 15.69,
"eval_loss": 3.128382921218872,
"eval_runtime": 123.4511,
"eval_samples_per_second": 807.008,
"eval_steps_per_second": 50.441,
"step": 1856000
},
{
"epoch": 15.76,
"eval_loss": 3.1200921535491943,
"eval_runtime": 122.7543,
"eval_samples_per_second": 811.589,
"eval_steps_per_second": 50.727,
"step": 1864000
},
{
"epoch": 15.82,
"learning_rate": 9.02e-08,
"loss": 3.293,
"step": 1872000
},
{
"epoch": 15.82,
"eval_loss": 3.1255955696105957,
"eval_runtime": 123.1108,
"eval_samples_per_second": 809.239,
"eval_steps_per_second": 50.58,
"step": 1872000
},
{
"epoch": 15.89,
"eval_loss": 3.116579055786133,
"eval_runtime": 123.1245,
"eval_samples_per_second": 809.149,
"eval_steps_per_second": 50.575,
"step": 1880000
},
{
"epoch": 15.96,
"learning_rate": 8.746666666666667e-08,
"loss": 3.2963,
"step": 1888000
},
{
"epoch": 15.96,
"eval_loss": 3.1218485832214355,
"eval_runtime": 123.513,
"eval_samples_per_second": 806.603,
"eval_steps_per_second": 50.416,
"step": 1888000
},
{
"epoch": 16.03,
"eval_loss": 3.1192948818206787,
"eval_runtime": 123.2874,
"eval_samples_per_second": 808.079,
"eval_steps_per_second": 50.508,
"step": 1896000
},
{
"epoch": 16.09,
"learning_rate": 8.473333333333334e-08,
"loss": 3.2908,
"step": 1904000
},
{
"epoch": 16.09,
"eval_loss": 3.12041974067688,
"eval_runtime": 121.8605,
"eval_samples_per_second": 817.541,
"eval_steps_per_second": 51.099,
"step": 1904000
},
{
"epoch": 16.16,
"eval_loss": 3.132479429244995,
"eval_runtime": 121.752,
"eval_samples_per_second": 818.27,
"eval_steps_per_second": 51.145,
"step": 1912000
},
{
"epoch": 16.23,
"learning_rate": 8.2e-08,
"loss": 3.3039,
"step": 1920000
},
{
"epoch": 16.23,
"eval_loss": 3.1090898513793945,
"eval_runtime": 121.4862,
"eval_samples_per_second": 820.06,
"eval_steps_per_second": 51.257,
"step": 1920000
},
{
"epoch": 16.3,
"eval_loss": 3.125005006790161,
"eval_runtime": 121.872,
"eval_samples_per_second": 817.464,
"eval_steps_per_second": 51.095,
"step": 1928000
},
{
"epoch": 16.36,
"learning_rate": 7.926666666666666e-08,
"loss": 3.3011,
"step": 1936000
},
{
"epoch": 16.36,
"eval_loss": 3.121675968170166,
"eval_runtime": 121.4859,
"eval_samples_per_second": 820.062,
"eval_steps_per_second": 51.257,
"step": 1936000
},
{
"epoch": 16.43,
"eval_loss": 3.120821237564087,
"eval_runtime": 122.7757,
"eval_samples_per_second": 811.447,
"eval_steps_per_second": 50.719,
"step": 1944000
},
{
"epoch": 16.5,
"learning_rate": 7.653333333333333e-08,
"loss": 3.3003,
"step": 1952000
},
{
"epoch": 16.5,
"eval_loss": 3.1109042167663574,
"eval_runtime": 122.9588,
"eval_samples_per_second": 810.239,
"eval_steps_per_second": 50.643,
"step": 1952000
},
{
"epoch": 16.57,
"eval_loss": 3.125174045562744,
"eval_runtime": 123.2276,
"eval_samples_per_second": 808.471,
"eval_steps_per_second": 50.533,
"step": 1960000
},
{
"epoch": 16.63,
"learning_rate": 7.38e-08,
"loss": 3.3012,
"step": 1968000
},
{
"epoch": 16.63,
"eval_loss": 3.112320899963379,
"eval_runtime": 123.5751,
"eval_samples_per_second": 806.198,
"eval_steps_per_second": 50.39,
"step": 1968000
},
{
"epoch": 16.7,
"eval_loss": 3.121267080307007,
"eval_runtime": 123.2561,
"eval_samples_per_second": 808.285,
"eval_steps_per_second": 50.521,
"step": 1976000
},
{
"epoch": 16.77,
"learning_rate": 7.106666666666667e-08,
"loss": 3.2885,
"step": 1984000
},
{
"epoch": 16.77,
"eval_loss": 3.1219470500946045,
"eval_runtime": 124.2602,
"eval_samples_per_second": 801.753,
"eval_steps_per_second": 50.113,
"step": 1984000
},
{
"epoch": 16.84,
"eval_loss": 3.1254475116729736,
"eval_runtime": 123.4006,
"eval_samples_per_second": 807.338,
"eval_steps_per_second": 50.462,
"step": 1992000
},
{
"epoch": 16.91,
"learning_rate": 6.833333333333332e-08,
"loss": 3.2982,
"step": 2000000
},
{
"epoch": 16.91,
"eval_loss": 3.1259801387786865,
"eval_runtime": 123.6708,
"eval_samples_per_second": 805.574,
"eval_steps_per_second": 50.351,
"step": 2000000
},
{
"epoch": 16.97,
"eval_loss": 3.1166510581970215,
"eval_runtime": 123.5064,
"eval_samples_per_second": 806.646,
"eval_steps_per_second": 50.418,
"step": 2008000
},
{
"epoch": 17.04,
"learning_rate": 6.56e-08,
"loss": 3.2962,
"step": 2016000
},
{
"epoch": 17.04,
"eval_loss": 3.108151435852051,
"eval_runtime": 122.3197,
"eval_samples_per_second": 814.472,
"eval_steps_per_second": 50.908,
"step": 2016000
},
{
"epoch": 17.11,
"eval_loss": 3.120389699935913,
"eval_runtime": 123.9621,
"eval_samples_per_second": 803.681,
"eval_steps_per_second": 50.233,
"step": 2024000
},
{
"epoch": 17.18,
"learning_rate": 6.286666666666666e-08,
"loss": 3.2889,
"step": 2032000
},
{
"epoch": 17.18,
"eval_loss": 3.1235997676849365,
"eval_runtime": 124.4028,
"eval_samples_per_second": 800.834,
"eval_steps_per_second": 50.055,
"step": 2032000
},
{
"epoch": 17.24,
"eval_loss": 3.13246488571167,
"eval_runtime": 123.0626,
"eval_samples_per_second": 809.555,
"eval_steps_per_second": 50.6,
"step": 2040000
},
{
"epoch": 17.31,
"learning_rate": 6.013333333333333e-08,
"loss": 3.2892,
"step": 2048000
},
{
"epoch": 17.31,
"eval_loss": 3.1200194358825684,
"eval_runtime": 124.0744,
"eval_samples_per_second": 802.954,
"eval_steps_per_second": 50.188,
"step": 2048000
},
{
"epoch": 17.38,
"eval_loss": 3.1231026649475098,
"eval_runtime": 122.7547,
"eval_samples_per_second": 811.586,
"eval_steps_per_second": 50.727,
"step": 2056000
},
{
"epoch": 17.45,
"learning_rate": 5.7400000000000004e-08,
"loss": 3.3028,
"step": 2064000
},
{
"epoch": 17.45,
"eval_loss": 3.1202361583709717,
"eval_runtime": 122.376,
"eval_samples_per_second": 814.098,
"eval_steps_per_second": 50.884,
"step": 2064000
},
{
"epoch": 17.51,
"eval_loss": 3.1188881397247314,
"eval_runtime": 123.9465,
"eval_samples_per_second": 803.782,
"eval_steps_per_second": 50.239,
"step": 2072000
},
{
"epoch": 17.58,
"learning_rate": 5.4666666666666666e-08,
"loss": 3.2889,
"step": 2080000
},
{
"epoch": 17.58,
"eval_loss": 3.1336753368377686,
"eval_runtime": 123.6532,
"eval_samples_per_second": 805.689,
"eval_steps_per_second": 50.359,
"step": 2080000
},
{
"epoch": 17.65,
"eval_loss": 3.1155591011047363,
"eval_runtime": 124.2421,
"eval_samples_per_second": 801.87,
"eval_steps_per_second": 50.12,
"step": 2088000
},
{
"epoch": 17.72,
"learning_rate": 5.1933333333333335e-08,
"loss": 3.2985,
"step": 2096000
},
{
"epoch": 17.72,
"eval_loss": 3.1258046627044678,
"eval_runtime": 123.2939,
"eval_samples_per_second": 808.037,
"eval_steps_per_second": 50.505,
"step": 2096000
},
{
"epoch": 17.78,
"eval_loss": 3.1358370780944824,
"eval_runtime": 123.3415,
"eval_samples_per_second": 807.725,
"eval_steps_per_second": 50.486,
"step": 2104000
},
{
"epoch": 17.85,
"learning_rate": 4.92e-08,
"loss": 3.2949,
"step": 2112000
},
{
"epoch": 17.85,
"eval_loss": 3.1270527839660645,
"eval_runtime": 115.5858,
"eval_samples_per_second": 861.922,
"eval_steps_per_second": 53.873,
"step": 2112000
},
{
"epoch": 17.92,
"eval_loss": 3.1249983310699463,
"eval_runtime": 114.4039,
"eval_samples_per_second": 870.827,
"eval_steps_per_second": 54.43,
"step": 2120000
},
{
"epoch": 17.99,
"learning_rate": 4.6466666666666666e-08,
"loss": 3.2987,
"step": 2128000
},
{
"epoch": 17.99,
"eval_loss": 3.124422550201416,
"eval_runtime": 113.825,
"eval_samples_per_second": 875.256,
"eval_steps_per_second": 54.707,
"step": 2128000
},
{
"epoch": 18.05,
"eval_loss": 3.1221253871917725,
"eval_runtime": 113.4341,
"eval_samples_per_second": 878.272,
"eval_steps_per_second": 54.895,
"step": 2136000
},
{
"epoch": 18.12,
"learning_rate": 4.3733333333333335e-08,
"loss": 3.2884,
"step": 2144000
},
{
"epoch": 18.12,
"eval_loss": 3.1197779178619385,
"eval_runtime": 114.383,
"eval_samples_per_second": 870.986,
"eval_steps_per_second": 54.44,
"step": 2144000
},
{
"epoch": 18.19,
"eval_loss": 3.1169650554656982,
"eval_runtime": 115.2342,
"eval_samples_per_second": 864.553,
"eval_steps_per_second": 54.038,
"step": 2152000
},
{
"epoch": 18.26,
"learning_rate": 4.1e-08,
"loss": 3.2918,
"step": 2160000
},
{
"epoch": 18.26,
"eval_loss": 3.1158599853515625,
"eval_runtime": 114.9399,
"eval_samples_per_second": 866.766,
"eval_steps_per_second": 54.176,
"step": 2160000
},
{
"epoch": 18.33,
"eval_loss": 3.1153085231781006,
"eval_runtime": 114.0048,
"eval_samples_per_second": 873.876,
"eval_steps_per_second": 54.621,
"step": 2168000
},
{
"epoch": 18.39,
"learning_rate": 3.8266666666666665e-08,
"loss": 3.2995,
"step": 2176000
},
{
"epoch": 18.39,
"eval_loss": 3.120265483856201,
"eval_runtime": 114.5155,
"eval_samples_per_second": 869.979,
"eval_steps_per_second": 54.377,
"step": 2176000
},
{
"epoch": 18.46,
"eval_loss": 3.110717535018921,
"eval_runtime": 114.5457,
"eval_samples_per_second": 869.749,
"eval_steps_per_second": 54.363,
"step": 2184000
},
{
"epoch": 18.53,
"learning_rate": 3.5533333333333334e-08,
"loss": 3.3003,
"step": 2192000
},
{
"epoch": 18.53,
"eval_loss": 3.1211767196655273,
"eval_runtime": 114.2326,
"eval_samples_per_second": 872.133,
"eval_steps_per_second": 54.512,
"step": 2192000
},
{
"epoch": 18.6,
"eval_loss": 3.133021593093872,
"eval_runtime": 123.3425,
"eval_samples_per_second": 807.718,
"eval_steps_per_second": 50.485,
"step": 2200000
},
{
"epoch": 18.66,
"learning_rate": 3.28e-08,
"loss": 3.2921,
"step": 2208000
},
{
"epoch": 18.66,
"eval_loss": 3.1160311698913574,
"eval_runtime": 123.8562,
"eval_samples_per_second": 804.369,
"eval_steps_per_second": 50.276,
"step": 2208000
},
{
"epoch": 18.73,
"eval_loss": 3.1191678047180176,
"eval_runtime": 124.221,
"eval_samples_per_second": 802.006,
"eval_steps_per_second": 50.128,
"step": 2216000
},
{
"epoch": 18.8,
"learning_rate": 3.0066666666666665e-08,
"loss": 3.293,
"step": 2224000
},
{
"epoch": 18.8,
"eval_loss": 3.1164309978485107,
"eval_runtime": 123.2955,
"eval_samples_per_second": 808.026,
"eval_steps_per_second": 50.505,
"step": 2224000
},
{
"epoch": 18.87,
"eval_loss": 3.1224827766418457,
"eval_runtime": 124.8121,
"eval_samples_per_second": 798.208,
"eval_steps_per_second": 49.891,
"step": 2232000
},
{
"epoch": 18.93,
"learning_rate": 2.7333333333333333e-08,
"loss": 3.2969,
"step": 2240000
},
{
"epoch": 18.93,
"eval_loss": 3.1243343353271484,
"eval_runtime": 125.6337,
"eval_samples_per_second": 792.988,
"eval_steps_per_second": 49.565,
"step": 2240000
},
{
"epoch": 19.0,
"eval_loss": 3.115158796310425,
"eval_runtime": 123.9293,
"eval_samples_per_second": 803.894,
"eval_steps_per_second": 50.246,
"step": 2248000
},
{
"epoch": 19.07,
"learning_rate": 2.46e-08,
"loss": 3.2891,
"step": 2256000
},
{
"epoch": 19.07,
"eval_loss": 3.132289171218872,
"eval_runtime": 125.9455,
"eval_samples_per_second": 791.025,
"eval_steps_per_second": 49.442,
"step": 2256000
},
{
"epoch": 19.14,
"eval_loss": 3.1076977252960205,
"eval_runtime": 125.1878,
"eval_samples_per_second": 795.812,
"eval_steps_per_second": 49.741,
"step": 2264000
},
{
"epoch": 19.2,
"learning_rate": 2.1866666666666667e-08,
"loss": 3.2903,
"step": 2272000
},
{
"epoch": 19.2,
"eval_loss": 3.134789228439331,
"eval_runtime": 114.7197,
"eval_samples_per_second": 868.43,
"eval_steps_per_second": 54.28,
"step": 2272000
},
{
"epoch": 19.27,
"eval_loss": 3.120239019393921,
"eval_runtime": 115.2866,
"eval_samples_per_second": 864.159,
"eval_steps_per_second": 54.013,
"step": 2280000
},
{
"epoch": 19.34,
"learning_rate": 1.9133333333333333e-08,
"loss": 3.2986,
"step": 2288000
},
{
"epoch": 19.34,
"eval_loss": 3.122042655944824,
"eval_runtime": 114.9192,
"eval_samples_per_second": 866.922,
"eval_steps_per_second": 54.186,
"step": 2288000
},
{
"epoch": 19.41,
"eval_loss": 3.12359881401062,
"eval_runtime": 114.2736,
"eval_samples_per_second": 871.82,
"eval_steps_per_second": 54.492,
"step": 2296000
},
{
"epoch": 19.47,
"learning_rate": 1.64e-08,
"loss": 3.293,
"step": 2304000
},
{
"epoch": 19.47,
"eval_loss": 3.122392177581787,
"eval_runtime": 114.9052,
"eval_samples_per_second": 867.028,
"eval_steps_per_second": 54.192,
"step": 2304000
},
{
"epoch": 19.54,
"eval_loss": 3.1246843338012695,
"eval_runtime": 115.1423,
"eval_samples_per_second": 865.242,
"eval_steps_per_second": 54.081,
"step": 2312000
},
{
"epoch": 19.61,
"learning_rate": 1.3666666666666667e-08,
"loss": 3.299,
"step": 2320000
},
{
"epoch": 19.61,
"eval_loss": 3.1234774589538574,
"eval_runtime": 114.9644,
"eval_samples_per_second": 866.581,
"eval_steps_per_second": 54.165,
"step": 2320000
},
{
"epoch": 19.68,
"eval_loss": 3.120058059692383,
"eval_runtime": 114.2563,
"eval_samples_per_second": 871.952,
"eval_steps_per_second": 54.5,
"step": 2328000
},
{
"epoch": 19.75,
"learning_rate": 1.0933333333333334e-08,
"loss": 3.2898,
"step": 2336000
},
{
"epoch": 19.75,
"eval_loss": 3.1162607669830322,
"eval_runtime": 114.3246,
"eval_samples_per_second": 871.431,
"eval_steps_per_second": 54.468,
"step": 2336000
},
{
"epoch": 19.81,
"eval_loss": 3.1289384365081787,
"eval_runtime": 113.9183,
"eval_samples_per_second": 874.539,
"eval_steps_per_second": 54.662,
"step": 2344000
},
{
"epoch": 19.88,
"learning_rate": 8.2e-09,
"loss": 3.2956,
"step": 2352000
},
{
"epoch": 19.88,
"eval_loss": 3.1197969913482666,
"eval_runtime": 114.6787,
"eval_samples_per_second": 868.74,
"eval_steps_per_second": 54.3,
"step": 2352000
},
{
"epoch": 19.95,
"eval_loss": 3.1250617504119873,
"eval_runtime": 114.9296,
"eval_samples_per_second": 866.844,
"eval_steps_per_second": 54.181,
"step": 2360000
},
{
"epoch": 20.02,
"learning_rate": 5.466666666666667e-09,
"loss": 3.2926,
"step": 2368000
},
{
"epoch": 20.02,
"eval_loss": 3.1086537837982178,
"eval_runtime": 115.0292,
"eval_samples_per_second": 866.093,
"eval_steps_per_second": 54.134,
"step": 2368000
},
{
"epoch": 20.08,
"eval_loss": 3.109729051589966,
"eval_runtime": 115.2619,
"eval_samples_per_second": 864.345,
"eval_steps_per_second": 54.025,
"step": 2376000
},
{
"epoch": 20.15,
"learning_rate": 2.7333333333333334e-09,
"loss": 3.2958,
"step": 2384000
},
{
"epoch": 20.15,
"eval_loss": 3.126241445541382,
"eval_runtime": 114.8392,
"eval_samples_per_second": 867.526,
"eval_steps_per_second": 54.224,
"step": 2384000
},
{
"epoch": 20.22,
"eval_loss": 3.1308016777038574,
"eval_runtime": 115.2099,
"eval_samples_per_second": 864.735,
"eval_steps_per_second": 54.049,
"step": 2392000
},
{
"epoch": 20.29,
"learning_rate": 0.0,
"loss": 3.2862,
"step": 2400000
},
{
"epoch": 20.29,
"eval_loss": 3.1129300594329834,
"eval_runtime": 115.5611,
"eval_samples_per_second": 862.107,
"eval_steps_per_second": 53.885,
"step": 2400000
},
{
"epoch": 20.29,
"step": 2400000,
"total_flos": 7.704255639100524e+17,
"train_loss": 3.30909029296875,
"train_runtime": 182182.8159,
"train_samples_per_second": 210.777,
"train_steps_per_second": 13.174
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 21,
"save_steps": 32000,
"total_flos": 7.704255639100524e+17,
"trial_name": null,
"trial_params": null
}