2020-Q4-25p-filtered-random / trainer_state.json
DouglasPontes's picture
End of training
d8a8c04 verified
raw
history blame
80.8 kB
{
"best_metric": 2.256277322769165,
"best_model_checkpoint": "./model_tweets_2020_Q4_25/checkpoint-1952000",
"epoch": 6.73682319488226,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"eval_loss": 2.580249547958374,
"eval_runtime": 320.9634,
"eval_samples_per_second": 934.686,
"eval_steps_per_second": 58.418,
"step": 8000
},
{
"epoch": 0.04,
"learning_rate": 4.0726666666666665e-07,
"loss": 2.8151,
"step": 16000
},
{
"epoch": 0.04,
"eval_loss": 2.488163471221924,
"eval_runtime": 321.5644,
"eval_samples_per_second": 932.939,
"eval_steps_per_second": 58.309,
"step": 16000
},
{
"epoch": 0.07,
"eval_loss": 2.429165840148926,
"eval_runtime": 321.2729,
"eval_samples_per_second": 933.786,
"eval_steps_per_second": 58.362,
"step": 24000
},
{
"epoch": 0.09,
"learning_rate": 4.0453333333333336e-07,
"loss": 2.5636,
"step": 32000
},
{
"epoch": 0.09,
"eval_loss": 2.3980140686035156,
"eval_runtime": 321.6728,
"eval_samples_per_second": 932.625,
"eval_steps_per_second": 58.289,
"step": 32000
},
{
"epoch": 0.11,
"eval_loss": 2.3799262046813965,
"eval_runtime": 323.0695,
"eval_samples_per_second": 928.593,
"eval_steps_per_second": 58.037,
"step": 40000
},
{
"epoch": 0.13,
"learning_rate": 4.018e-07,
"loss": 2.4947,
"step": 48000
},
{
"epoch": 0.13,
"eval_loss": 2.3665478229522705,
"eval_runtime": 322.6194,
"eval_samples_per_second": 929.888,
"eval_steps_per_second": 58.118,
"step": 48000
},
{
"epoch": 0.16,
"eval_loss": 2.345531940460205,
"eval_runtime": 322.4114,
"eval_samples_per_second": 930.488,
"eval_steps_per_second": 58.156,
"step": 56000
},
{
"epoch": 0.18,
"learning_rate": 3.9906666666666667e-07,
"loss": 2.473,
"step": 64000
},
{
"epoch": 0.18,
"eval_loss": 2.341932773590088,
"eval_runtime": 324.4552,
"eval_samples_per_second": 924.627,
"eval_steps_per_second": 57.789,
"step": 64000
},
{
"epoch": 0.2,
"eval_loss": 2.3307127952575684,
"eval_runtime": 322.5941,
"eval_samples_per_second": 929.961,
"eval_steps_per_second": 58.123,
"step": 72000
},
{
"epoch": 0.22,
"learning_rate": 3.963333333333333e-07,
"loss": 2.4512,
"step": 80000
},
{
"epoch": 0.22,
"eval_loss": 2.3288769721984863,
"eval_runtime": 322.3925,
"eval_samples_per_second": 930.543,
"eval_steps_per_second": 58.159,
"step": 80000
},
{
"epoch": 0.25,
"eval_loss": 2.325032949447632,
"eval_runtime": 322.736,
"eval_samples_per_second": 929.552,
"eval_steps_per_second": 58.097,
"step": 88000
},
{
"epoch": 0.27,
"learning_rate": 3.936e-07,
"loss": 2.4421,
"step": 96000
},
{
"epoch": 0.27,
"eval_loss": 2.318911075592041,
"eval_runtime": 323.2095,
"eval_samples_per_second": 928.19,
"eval_steps_per_second": 58.012,
"step": 96000
},
{
"epoch": 0.29,
"eval_loss": 2.3199880123138428,
"eval_runtime": 323.2363,
"eval_samples_per_second": 928.114,
"eval_steps_per_second": 58.007,
"step": 104000
},
{
"epoch": 0.31,
"learning_rate": 3.908666666666667e-07,
"loss": 2.4354,
"step": 112000
},
{
"epoch": 0.31,
"eval_loss": 2.3154587745666504,
"eval_runtime": 323.95,
"eval_samples_per_second": 926.069,
"eval_steps_per_second": 57.879,
"step": 112000
},
{
"epoch": 0.34,
"eval_loss": 2.313781976699829,
"eval_runtime": 324.5922,
"eval_samples_per_second": 924.237,
"eval_steps_per_second": 57.765,
"step": 120000
},
{
"epoch": 0.36,
"learning_rate": 3.8813333333333334e-07,
"loss": 2.4324,
"step": 128000
},
{
"epoch": 0.36,
"eval_loss": 2.305436372756958,
"eval_runtime": 323.2003,
"eval_samples_per_second": 928.217,
"eval_steps_per_second": 58.014,
"step": 128000
},
{
"epoch": 0.38,
"eval_loss": 2.302849054336548,
"eval_runtime": 323.3577,
"eval_samples_per_second": 927.765,
"eval_steps_per_second": 57.985,
"step": 136000
},
{
"epoch": 0.4,
"learning_rate": 3.854e-07,
"loss": 2.4253,
"step": 144000
},
{
"epoch": 0.4,
"eval_loss": 2.3029212951660156,
"eval_runtime": 324.8316,
"eval_samples_per_second": 923.555,
"eval_steps_per_second": 57.722,
"step": 144000
},
{
"epoch": 0.43,
"eval_loss": 2.3006043434143066,
"eval_runtime": 323.2225,
"eval_samples_per_second": 928.153,
"eval_steps_per_second": 58.01,
"step": 152000
},
{
"epoch": 0.45,
"learning_rate": 3.8266666666666665e-07,
"loss": 2.4156,
"step": 160000
},
{
"epoch": 0.45,
"eval_loss": 2.300135612487793,
"eval_runtime": 323.6582,
"eval_samples_per_second": 926.904,
"eval_steps_per_second": 57.931,
"step": 160000
},
{
"epoch": 0.47,
"eval_loss": 2.298043727874756,
"eval_runtime": 322.8658,
"eval_samples_per_second": 929.179,
"eval_steps_per_second": 58.074,
"step": 168000
},
{
"epoch": 0.49,
"learning_rate": 3.799333333333333e-07,
"loss": 2.4165,
"step": 176000
},
{
"epoch": 0.49,
"eval_loss": 2.291269063949585,
"eval_runtime": 323.9312,
"eval_samples_per_second": 926.122,
"eval_steps_per_second": 57.883,
"step": 176000
},
{
"epoch": 0.52,
"eval_loss": 2.297363519668579,
"eval_runtime": 323.2402,
"eval_samples_per_second": 928.102,
"eval_steps_per_second": 58.006,
"step": 184000
},
{
"epoch": 0.54,
"learning_rate": 3.772e-07,
"loss": 2.4131,
"step": 192000
},
{
"epoch": 0.54,
"eval_loss": 2.2906086444854736,
"eval_runtime": 323.5876,
"eval_samples_per_second": 927.106,
"eval_steps_per_second": 57.944,
"step": 192000
},
{
"epoch": 0.56,
"eval_loss": 2.2908411026000977,
"eval_runtime": 324.835,
"eval_samples_per_second": 923.546,
"eval_steps_per_second": 57.722,
"step": 200000
},
{
"epoch": 0.58,
"learning_rate": 3.7446666666666667e-07,
"loss": 2.407,
"step": 208000
},
{
"epoch": 0.58,
"eval_loss": 2.289541482925415,
"eval_runtime": 323.2737,
"eval_samples_per_second": 928.006,
"eval_steps_per_second": 58.0,
"step": 208000
},
{
"epoch": 0.61,
"eval_loss": 2.2865185737609863,
"eval_runtime": 323.7161,
"eval_samples_per_second": 926.738,
"eval_steps_per_second": 57.921,
"step": 216000
},
{
"epoch": 0.63,
"learning_rate": 3.7173333333333333e-07,
"loss": 2.4153,
"step": 224000
},
{
"epoch": 0.63,
"eval_loss": 2.2913596630096436,
"eval_runtime": 323.8117,
"eval_samples_per_second": 926.464,
"eval_steps_per_second": 57.904,
"step": 224000
},
{
"epoch": 0.65,
"eval_loss": 2.280600070953369,
"eval_runtime": 324.7681,
"eval_samples_per_second": 923.736,
"eval_steps_per_second": 57.734,
"step": 232000
},
{
"epoch": 0.67,
"learning_rate": 3.69e-07,
"loss": 2.4011,
"step": 240000
},
{
"epoch": 0.67,
"eval_loss": 2.2818994522094727,
"eval_runtime": 324.8269,
"eval_samples_per_second": 923.569,
"eval_steps_per_second": 57.723,
"step": 240000
},
{
"epoch": 0.7,
"eval_loss": 2.2854413986206055,
"eval_runtime": 324.8244,
"eval_samples_per_second": 923.576,
"eval_steps_per_second": 57.724,
"step": 248000
},
{
"epoch": 0.72,
"learning_rate": 3.6626666666666664e-07,
"loss": 2.4087,
"step": 256000
},
{
"epoch": 0.72,
"eval_loss": 2.283675193786621,
"eval_runtime": 326.3862,
"eval_samples_per_second": 919.157,
"eval_steps_per_second": 57.447,
"step": 256000
},
{
"epoch": 0.74,
"eval_loss": 2.286595106124878,
"eval_runtime": 327.6717,
"eval_samples_per_second": 915.551,
"eval_steps_per_second": 57.222,
"step": 264000
},
{
"epoch": 0.76,
"learning_rate": 3.6353333333333335e-07,
"loss": 2.4059,
"step": 272000
},
{
"epoch": 0.76,
"eval_loss": 2.285534143447876,
"eval_runtime": 326.6584,
"eval_samples_per_second": 918.391,
"eval_steps_per_second": 57.399,
"step": 272000
},
{
"epoch": 0.79,
"eval_loss": 2.28678560256958,
"eval_runtime": 329.4988,
"eval_samples_per_second": 910.474,
"eval_steps_per_second": 56.905,
"step": 280000
},
{
"epoch": 0.81,
"learning_rate": 3.608e-07,
"loss": 2.4086,
"step": 288000
},
{
"epoch": 0.81,
"eval_loss": 2.277035713195801,
"eval_runtime": 327.402,
"eval_samples_per_second": 916.305,
"eval_steps_per_second": 57.269,
"step": 288000
},
{
"epoch": 0.83,
"eval_loss": 2.2788984775543213,
"eval_runtime": 328.7295,
"eval_samples_per_second": 912.604,
"eval_steps_per_second": 57.038,
"step": 296000
},
{
"epoch": 0.85,
"learning_rate": 3.5806666666666666e-07,
"loss": 2.4093,
"step": 304000
},
{
"epoch": 0.85,
"eval_loss": 2.2792067527770996,
"eval_runtime": 328.6343,
"eval_samples_per_second": 912.869,
"eval_steps_per_second": 57.054,
"step": 304000
},
{
"epoch": 0.88,
"eval_loss": 2.2796542644500732,
"eval_runtime": 328.9041,
"eval_samples_per_second": 912.12,
"eval_steps_per_second": 57.007,
"step": 312000
},
{
"epoch": 0.9,
"learning_rate": 3.553333333333333e-07,
"loss": 2.4036,
"step": 320000
},
{
"epoch": 0.9,
"eval_loss": 2.2794368267059326,
"eval_runtime": 327.0881,
"eval_samples_per_second": 917.184,
"eval_steps_per_second": 57.324,
"step": 320000
},
{
"epoch": 0.92,
"eval_loss": 2.2767865657806396,
"eval_runtime": 325.4813,
"eval_samples_per_second": 921.712,
"eval_steps_per_second": 57.607,
"step": 328000
},
{
"epoch": 0.94,
"learning_rate": 3.5259999999999997e-07,
"loss": 2.4063,
"step": 336000
},
{
"epoch": 0.94,
"eval_loss": 2.28360652923584,
"eval_runtime": 326.0539,
"eval_samples_per_second": 920.093,
"eval_steps_per_second": 57.506,
"step": 336000
},
{
"epoch": 0.97,
"eval_loss": 2.2808754444122314,
"eval_runtime": 324.6753,
"eval_samples_per_second": 924.0,
"eval_steps_per_second": 57.75,
"step": 344000
},
{
"epoch": 0.99,
"learning_rate": 3.498666666666667e-07,
"loss": 2.4047,
"step": 352000
},
{
"epoch": 0.99,
"eval_loss": 2.280778408050537,
"eval_runtime": 325.3269,
"eval_samples_per_second": 922.149,
"eval_steps_per_second": 57.634,
"step": 352000
},
{
"epoch": 1.01,
"eval_loss": 2.28403377532959,
"eval_runtime": 325.2468,
"eval_samples_per_second": 922.377,
"eval_steps_per_second": 57.649,
"step": 360000
},
{
"epoch": 1.03,
"learning_rate": 3.4713333333333333e-07,
"loss": 2.4084,
"step": 368000
},
{
"epoch": 1.03,
"eval_loss": 2.279930591583252,
"eval_runtime": 327.9631,
"eval_samples_per_second": 914.737,
"eval_steps_per_second": 57.171,
"step": 368000
},
{
"epoch": 1.06,
"eval_loss": 2.272570848464966,
"eval_runtime": 327.8275,
"eval_samples_per_second": 915.115,
"eval_steps_per_second": 57.195,
"step": 376000
},
{
"epoch": 1.08,
"learning_rate": 3.444e-07,
"loss": 2.4041,
"step": 384000
},
{
"epoch": 1.08,
"eval_loss": 2.2823517322540283,
"eval_runtime": 328.4584,
"eval_samples_per_second": 913.358,
"eval_steps_per_second": 57.085,
"step": 384000
},
{
"epoch": 1.1,
"eval_loss": 2.278149127960205,
"eval_runtime": 326.9556,
"eval_samples_per_second": 917.556,
"eval_steps_per_second": 57.347,
"step": 392000
},
{
"epoch": 1.12,
"learning_rate": 3.416666666666667e-07,
"loss": 2.4034,
"step": 400000
},
{
"epoch": 1.12,
"eval_loss": 2.275142192840576,
"eval_runtime": 326.8439,
"eval_samples_per_second": 917.869,
"eval_steps_per_second": 57.367,
"step": 400000
},
{
"epoch": 1.15,
"eval_loss": 2.2760984897613525,
"eval_runtime": 325.9846,
"eval_samples_per_second": 920.289,
"eval_steps_per_second": 57.518,
"step": 408000
},
{
"epoch": 1.17,
"learning_rate": 3.3893333333333335e-07,
"loss": 2.3951,
"step": 416000
},
{
"epoch": 1.17,
"eval_loss": 2.2731635570526123,
"eval_runtime": 326.1395,
"eval_samples_per_second": 919.852,
"eval_steps_per_second": 57.491,
"step": 416000
},
{
"epoch": 1.19,
"eval_loss": 2.2709577083587646,
"eval_runtime": 326.1973,
"eval_samples_per_second": 919.689,
"eval_steps_per_second": 57.481,
"step": 424000
},
{
"epoch": 1.21,
"learning_rate": 3.3619999999999995e-07,
"loss": 2.409,
"step": 432000
},
{
"epoch": 1.21,
"eval_loss": 2.277972936630249,
"eval_runtime": 325.3949,
"eval_samples_per_second": 921.957,
"eval_steps_per_second": 57.622,
"step": 432000
},
{
"epoch": 1.24,
"eval_loss": 2.2714641094207764,
"eval_runtime": 325.6353,
"eval_samples_per_second": 921.276,
"eval_steps_per_second": 57.58,
"step": 440000
},
{
"epoch": 1.26,
"learning_rate": 3.3346666666666666e-07,
"loss": 2.3985,
"step": 448000
},
{
"epoch": 1.26,
"eval_loss": 2.279003620147705,
"eval_runtime": 326.3983,
"eval_samples_per_second": 919.122,
"eval_steps_per_second": 57.445,
"step": 448000
},
{
"epoch": 1.28,
"eval_loss": 2.276561737060547,
"eval_runtime": 326.5381,
"eval_samples_per_second": 918.729,
"eval_steps_per_second": 57.421,
"step": 456000
},
{
"epoch": 1.3,
"learning_rate": 3.307333333333333e-07,
"loss": 2.4016,
"step": 464000
},
{
"epoch": 1.3,
"eval_loss": 2.2744641304016113,
"eval_runtime": 326.438,
"eval_samples_per_second": 919.011,
"eval_steps_per_second": 57.438,
"step": 464000
},
{
"epoch": 1.32,
"eval_loss": 2.2719147205352783,
"eval_runtime": 326.1182,
"eval_samples_per_second": 919.912,
"eval_steps_per_second": 57.494,
"step": 472000
},
{
"epoch": 1.35,
"learning_rate": 3.28e-07,
"loss": 2.3978,
"step": 480000
},
{
"epoch": 1.35,
"eval_loss": 2.2755250930786133,
"eval_runtime": 326.0946,
"eval_samples_per_second": 919.978,
"eval_steps_per_second": 57.499,
"step": 480000
},
{
"epoch": 1.37,
"eval_loss": 2.269918203353882,
"eval_runtime": 326.8772,
"eval_samples_per_second": 917.776,
"eval_steps_per_second": 57.361,
"step": 488000
},
{
"epoch": 1.39,
"learning_rate": 3.252666666666667e-07,
"loss": 2.406,
"step": 496000
},
{
"epoch": 1.39,
"eval_loss": 2.282317876815796,
"eval_runtime": 325.8019,
"eval_samples_per_second": 920.805,
"eval_steps_per_second": 57.55,
"step": 496000
},
{
"epoch": 1.41,
"eval_loss": 2.2735817432403564,
"eval_runtime": 326.0969,
"eval_samples_per_second": 919.972,
"eval_steps_per_second": 57.498,
"step": 504000
},
{
"epoch": 1.44,
"learning_rate": 3.2253333333333334e-07,
"loss": 2.3958,
"step": 512000
},
{
"epoch": 1.44,
"eval_loss": 2.2728230953216553,
"eval_runtime": 326.2067,
"eval_samples_per_second": 919.662,
"eval_steps_per_second": 57.479,
"step": 512000
},
{
"epoch": 1.46,
"eval_loss": 2.2762703895568848,
"eval_runtime": 326.3243,
"eval_samples_per_second": 919.331,
"eval_steps_per_second": 57.458,
"step": 520000
},
{
"epoch": 1.48,
"learning_rate": 3.198e-07,
"loss": 2.406,
"step": 528000
},
{
"epoch": 1.48,
"eval_loss": 2.2780961990356445,
"eval_runtime": 325.8653,
"eval_samples_per_second": 920.626,
"eval_steps_per_second": 57.539,
"step": 528000
},
{
"epoch": 1.5,
"eval_loss": 2.2722842693328857,
"eval_runtime": 326.0044,
"eval_samples_per_second": 920.233,
"eval_steps_per_second": 57.515,
"step": 536000
},
{
"epoch": 1.53,
"learning_rate": 3.1706666666666665e-07,
"loss": 2.4,
"step": 544000
},
{
"epoch": 1.53,
"eval_loss": 2.273293972015381,
"eval_runtime": 326.966,
"eval_samples_per_second": 917.527,
"eval_steps_per_second": 57.345,
"step": 544000
},
{
"epoch": 1.55,
"eval_loss": 2.271476984024048,
"eval_runtime": 326.8892,
"eval_samples_per_second": 917.742,
"eval_steps_per_second": 57.359,
"step": 552000
},
{
"epoch": 1.57,
"learning_rate": 3.1433333333333336e-07,
"loss": 2.3998,
"step": 560000
},
{
"epoch": 1.57,
"eval_loss": 2.271629810333252,
"eval_runtime": 326.5264,
"eval_samples_per_second": 918.762,
"eval_steps_per_second": 57.423,
"step": 560000
},
{
"epoch": 1.59,
"eval_loss": 2.27506422996521,
"eval_runtime": 326.712,
"eval_samples_per_second": 918.24,
"eval_steps_per_second": 57.39,
"step": 568000
},
{
"epoch": 1.62,
"learning_rate": 3.116e-07,
"loss": 2.4017,
"step": 576000
},
{
"epoch": 1.62,
"eval_loss": 2.274268865585327,
"eval_runtime": 326.6112,
"eval_samples_per_second": 918.523,
"eval_steps_per_second": 57.408,
"step": 576000
},
{
"epoch": 1.64,
"eval_loss": 2.2739031314849854,
"eval_runtime": 326.4511,
"eval_samples_per_second": 918.974,
"eval_steps_per_second": 57.436,
"step": 584000
},
{
"epoch": 1.66,
"learning_rate": 3.0886666666666667e-07,
"loss": 2.4019,
"step": 592000
},
{
"epoch": 1.66,
"eval_loss": 2.275505542755127,
"eval_runtime": 329.4605,
"eval_samples_per_second": 910.58,
"eval_steps_per_second": 56.911,
"step": 592000
},
{
"epoch": 1.68,
"eval_loss": 2.269094228744507,
"eval_runtime": 327.3789,
"eval_samples_per_second": 916.369,
"eval_steps_per_second": 57.273,
"step": 600000
},
{
"epoch": 1.71,
"learning_rate": 3.061333333333333e-07,
"loss": 2.398,
"step": 608000
},
{
"epoch": 1.71,
"eval_loss": 2.2705538272857666,
"eval_runtime": 327.1271,
"eval_samples_per_second": 917.075,
"eval_steps_per_second": 57.317,
"step": 608000
},
{
"epoch": 1.73,
"eval_loss": 2.270341634750366,
"eval_runtime": 326.9286,
"eval_samples_per_second": 917.632,
"eval_steps_per_second": 57.352,
"step": 616000
},
{
"epoch": 1.75,
"learning_rate": 3.034e-07,
"loss": 2.4027,
"step": 624000
},
{
"epoch": 1.75,
"eval_loss": 2.2657225131988525,
"eval_runtime": 326.8016,
"eval_samples_per_second": 917.988,
"eval_steps_per_second": 57.374,
"step": 624000
},
{
"epoch": 1.77,
"eval_loss": 2.267418146133423,
"eval_runtime": 326.6227,
"eval_samples_per_second": 918.491,
"eval_steps_per_second": 57.406,
"step": 632000
},
{
"epoch": 1.8,
"learning_rate": 3.0066666666666663e-07,
"loss": 2.4,
"step": 640000
},
{
"epoch": 1.8,
"eval_loss": 2.2748591899871826,
"eval_runtime": 326.8527,
"eval_samples_per_second": 917.845,
"eval_steps_per_second": 57.365,
"step": 640000
},
{
"epoch": 1.82,
"eval_loss": 2.2713701725006104,
"eval_runtime": 326.3767,
"eval_samples_per_second": 919.183,
"eval_steps_per_second": 57.449,
"step": 648000
},
{
"epoch": 1.84,
"learning_rate": 2.9793333333333334e-07,
"loss": 2.4046,
"step": 656000
},
{
"epoch": 1.84,
"eval_loss": 2.2694690227508545,
"eval_runtime": 326.9136,
"eval_samples_per_second": 917.674,
"eval_steps_per_second": 57.355,
"step": 656000
},
{
"epoch": 1.86,
"eval_loss": 2.2724227905273438,
"eval_runtime": 326.9654,
"eval_samples_per_second": 917.528,
"eval_steps_per_second": 57.346,
"step": 664000
},
{
"epoch": 1.89,
"learning_rate": 2.952e-07,
"loss": 2.4033,
"step": 672000
},
{
"epoch": 1.89,
"eval_loss": 2.2697391510009766,
"eval_runtime": 326.8958,
"eval_samples_per_second": 917.724,
"eval_steps_per_second": 57.358,
"step": 672000
},
{
"epoch": 1.91,
"eval_loss": 2.2697041034698486,
"eval_runtime": 326.8461,
"eval_samples_per_second": 917.863,
"eval_steps_per_second": 57.366,
"step": 680000
},
{
"epoch": 1.93,
"learning_rate": 2.9246666666666665e-07,
"loss": 2.3981,
"step": 688000
},
{
"epoch": 1.93,
"eval_loss": 2.267427444458008,
"eval_runtime": 327.9149,
"eval_samples_per_second": 914.872,
"eval_steps_per_second": 57.179,
"step": 688000
},
{
"epoch": 1.95,
"eval_loss": 2.266889810562134,
"eval_runtime": 327.4325,
"eval_samples_per_second": 916.219,
"eval_steps_per_second": 57.264,
"step": 696000
},
{
"epoch": 1.98,
"learning_rate": 2.897333333333333e-07,
"loss": 2.4029,
"step": 704000
},
{
"epoch": 1.98,
"eval_loss": 2.275509834289551,
"eval_runtime": 327.0353,
"eval_samples_per_second": 917.332,
"eval_steps_per_second": 57.333,
"step": 704000
},
{
"epoch": 2.0,
"eval_loss": 2.2664170265197754,
"eval_runtime": 329.3443,
"eval_samples_per_second": 910.901,
"eval_steps_per_second": 56.931,
"step": 712000
},
{
"epoch": 2.02,
"learning_rate": 2.8699999999999996e-07,
"loss": 2.4046,
"step": 720000
},
{
"epoch": 2.02,
"eval_loss": 2.2758920192718506,
"eval_runtime": 328.0111,
"eval_samples_per_second": 914.603,
"eval_steps_per_second": 57.163,
"step": 720000
},
{
"epoch": 2.04,
"eval_loss": 2.2689473628997803,
"eval_runtime": 327.8597,
"eval_samples_per_second": 915.026,
"eval_steps_per_second": 57.189,
"step": 728000
},
{
"epoch": 2.07,
"learning_rate": 2.8426666666666667e-07,
"loss": 2.4056,
"step": 736000
},
{
"epoch": 2.07,
"eval_loss": 2.2710442543029785,
"eval_runtime": 327.6707,
"eval_samples_per_second": 915.553,
"eval_steps_per_second": 57.222,
"step": 736000
},
{
"epoch": 2.09,
"eval_loss": 2.2743895053863525,
"eval_runtime": 326.938,
"eval_samples_per_second": 917.605,
"eval_steps_per_second": 57.35,
"step": 744000
},
{
"epoch": 2.11,
"learning_rate": 2.815333333333333e-07,
"loss": 2.4036,
"step": 752000
},
{
"epoch": 2.11,
"eval_loss": 2.265347719192505,
"eval_runtime": 327.8639,
"eval_samples_per_second": 915.014,
"eval_steps_per_second": 57.188,
"step": 752000
},
{
"epoch": 2.13,
"eval_loss": 2.264220952987671,
"eval_runtime": 328.2384,
"eval_samples_per_second": 913.97,
"eval_steps_per_second": 57.123,
"step": 760000
},
{
"epoch": 2.16,
"learning_rate": 2.7880000000000003e-07,
"loss": 2.3961,
"step": 768000
},
{
"epoch": 2.16,
"eval_loss": 2.2702980041503906,
"eval_runtime": 328.1483,
"eval_samples_per_second": 914.221,
"eval_steps_per_second": 57.139,
"step": 768000
},
{
"epoch": 2.18,
"eval_loss": 2.2682902812957764,
"eval_runtime": 327.4533,
"eval_samples_per_second": 916.161,
"eval_steps_per_second": 57.26,
"step": 776000
},
{
"epoch": 2.2,
"learning_rate": 2.7606666666666664e-07,
"loss": 2.3939,
"step": 784000
},
{
"epoch": 2.2,
"eval_loss": 2.2746386528015137,
"eval_runtime": 327.8678,
"eval_samples_per_second": 915.003,
"eval_steps_per_second": 57.188,
"step": 784000
},
{
"epoch": 2.22,
"eval_loss": 2.2666993141174316,
"eval_runtime": 329.1807,
"eval_samples_per_second": 911.353,
"eval_steps_per_second": 56.96,
"step": 792000
},
{
"epoch": 2.25,
"learning_rate": 2.733333333333333e-07,
"loss": 2.3998,
"step": 800000
},
{
"epoch": 2.25,
"eval_loss": 2.268972396850586,
"eval_runtime": 328.4073,
"eval_samples_per_second": 913.5,
"eval_steps_per_second": 57.094,
"step": 800000
},
{
"epoch": 2.27,
"eval_loss": 2.2696826457977295,
"eval_runtime": 329.554,
"eval_samples_per_second": 910.321,
"eval_steps_per_second": 56.895,
"step": 808000
},
{
"epoch": 2.29,
"learning_rate": 2.706e-07,
"loss": 2.3921,
"step": 816000
},
{
"epoch": 2.29,
"eval_loss": 2.268064498901367,
"eval_runtime": 328.2902,
"eval_samples_per_second": 913.826,
"eval_steps_per_second": 57.114,
"step": 816000
},
{
"epoch": 2.31,
"eval_loss": 2.27397084236145,
"eval_runtime": 328.4539,
"eval_samples_per_second": 913.37,
"eval_steps_per_second": 57.086,
"step": 824000
},
{
"epoch": 2.34,
"learning_rate": 2.6786666666666666e-07,
"loss": 2.4011,
"step": 832000
},
{
"epoch": 2.34,
"eval_loss": 2.270357608795166,
"eval_runtime": 328.9931,
"eval_samples_per_second": 911.873,
"eval_steps_per_second": 56.992,
"step": 832000
},
{
"epoch": 2.36,
"eval_loss": 2.2666330337524414,
"eval_runtime": 328.6018,
"eval_samples_per_second": 912.959,
"eval_steps_per_second": 57.06,
"step": 840000
},
{
"epoch": 2.38,
"learning_rate": 2.651333333333333e-07,
"loss": 2.3948,
"step": 848000
},
{
"epoch": 2.38,
"eval_loss": 2.2689247131347656,
"eval_runtime": 328.0791,
"eval_samples_per_second": 914.414,
"eval_steps_per_second": 57.151,
"step": 848000
},
{
"epoch": 2.4,
"eval_loss": 2.2741663455963135,
"eval_runtime": 329.8118,
"eval_samples_per_second": 909.61,
"eval_steps_per_second": 56.851,
"step": 856000
},
{
"epoch": 2.43,
"learning_rate": 2.624e-07,
"loss": 2.3957,
"step": 864000
},
{
"epoch": 2.43,
"eval_loss": 2.2755067348480225,
"eval_runtime": 329.6482,
"eval_samples_per_second": 910.061,
"eval_steps_per_second": 56.879,
"step": 864000
},
{
"epoch": 2.45,
"eval_loss": 2.268922805786133,
"eval_runtime": 328.948,
"eval_samples_per_second": 911.998,
"eval_steps_per_second": 57.0,
"step": 872000
},
{
"epoch": 2.47,
"learning_rate": 2.596666666666667e-07,
"loss": 2.3971,
"step": 880000
},
{
"epoch": 2.47,
"eval_loss": 2.271690607070923,
"eval_runtime": 328.8273,
"eval_samples_per_second": 912.333,
"eval_steps_per_second": 57.021,
"step": 880000
},
{
"epoch": 2.49,
"eval_loss": 2.2689971923828125,
"eval_runtime": 329.7312,
"eval_samples_per_second": 909.832,
"eval_steps_per_second": 56.864,
"step": 888000
},
{
"epoch": 2.52,
"learning_rate": 2.5693333333333333e-07,
"loss": 2.3982,
"step": 896000
},
{
"epoch": 2.52,
"eval_loss": 2.264453649520874,
"eval_runtime": 329.0657,
"eval_samples_per_second": 911.672,
"eval_steps_per_second": 56.98,
"step": 896000
},
{
"epoch": 2.54,
"eval_loss": 2.2726194858551025,
"eval_runtime": 328.4591,
"eval_samples_per_second": 913.356,
"eval_steps_per_second": 57.085,
"step": 904000
},
{
"epoch": 2.56,
"learning_rate": 2.542e-07,
"loss": 2.4005,
"step": 912000
},
{
"epoch": 2.56,
"eval_loss": 2.262789011001587,
"eval_runtime": 329.0087,
"eval_samples_per_second": 911.83,
"eval_steps_per_second": 56.989,
"step": 912000
},
{
"epoch": 2.58,
"eval_loss": 2.2725658416748047,
"eval_runtime": 331.131,
"eval_samples_per_second": 905.986,
"eval_steps_per_second": 56.624,
"step": 920000
},
{
"epoch": 2.6,
"learning_rate": 2.5146666666666664e-07,
"loss": 2.4037,
"step": 928000
},
{
"epoch": 2.6,
"eval_loss": 2.2759974002838135,
"eval_runtime": 329.3386,
"eval_samples_per_second": 910.917,
"eval_steps_per_second": 56.932,
"step": 928000
},
{
"epoch": 2.63,
"eval_loss": 2.2662434577941895,
"eval_runtime": 331.0495,
"eval_samples_per_second": 906.209,
"eval_steps_per_second": 56.638,
"step": 936000
},
{
"epoch": 2.65,
"learning_rate": 2.4873333333333335e-07,
"loss": 2.4031,
"step": 944000
},
{
"epoch": 2.65,
"eval_loss": 2.272948741912842,
"eval_runtime": 329.451,
"eval_samples_per_second": 910.606,
"eval_steps_per_second": 56.913,
"step": 944000
},
{
"epoch": 2.67,
"eval_loss": 2.270596742630005,
"eval_runtime": 328.9394,
"eval_samples_per_second": 912.022,
"eval_steps_per_second": 57.001,
"step": 952000
},
{
"epoch": 2.69,
"learning_rate": 2.46e-07,
"loss": 2.4025,
"step": 960000
},
{
"epoch": 2.69,
"eval_loss": 2.2684247493743896,
"eval_runtime": 328.8064,
"eval_samples_per_second": 912.391,
"eval_steps_per_second": 57.024,
"step": 960000
},
{
"epoch": 2.72,
"eval_loss": 2.2634849548339844,
"eval_runtime": 329.3927,
"eval_samples_per_second": 910.767,
"eval_steps_per_second": 56.923,
"step": 968000
},
{
"epoch": 2.74,
"learning_rate": 2.4326666666666666e-07,
"loss": 2.409,
"step": 976000
},
{
"epoch": 2.74,
"eval_loss": 2.2605979442596436,
"eval_runtime": 330.2691,
"eval_samples_per_second": 908.35,
"eval_steps_per_second": 56.772,
"step": 976000
},
{
"epoch": 2.76,
"eval_loss": 2.2664294242858887,
"eval_runtime": 334.9875,
"eval_samples_per_second": 895.556,
"eval_steps_per_second": 55.972,
"step": 984000
},
{
"epoch": 2.78,
"learning_rate": 2.405333333333333e-07,
"loss": 2.4085,
"step": 992000
},
{
"epoch": 2.78,
"eval_loss": 2.2646701335906982,
"eval_runtime": 332.6305,
"eval_samples_per_second": 901.902,
"eval_steps_per_second": 56.369,
"step": 992000
},
{
"epoch": 2.81,
"eval_loss": 2.265587329864502,
"eval_runtime": 330.3094,
"eval_samples_per_second": 908.239,
"eval_steps_per_second": 56.765,
"step": 1000000
},
{
"epoch": 2.83,
"learning_rate": 2.3779999999999997e-07,
"loss": 2.3971,
"step": 1008000
},
{
"epoch": 2.83,
"eval_loss": 2.265507221221924,
"eval_runtime": 332.5509,
"eval_samples_per_second": 902.118,
"eval_steps_per_second": 56.382,
"step": 1008000
},
{
"epoch": 2.85,
"eval_loss": 2.2681467533111572,
"eval_runtime": 329.8973,
"eval_samples_per_second": 909.374,
"eval_steps_per_second": 56.836,
"step": 1016000
},
{
"epoch": 2.87,
"learning_rate": 2.3506666666666668e-07,
"loss": 2.3946,
"step": 1024000
},
{
"epoch": 2.87,
"eval_loss": 2.267101526260376,
"eval_runtime": 329.91,
"eval_samples_per_second": 909.339,
"eval_steps_per_second": 56.834,
"step": 1024000
},
{
"epoch": 2.9,
"eval_loss": 2.2659785747528076,
"eval_runtime": 332.3096,
"eval_samples_per_second": 902.772,
"eval_steps_per_second": 56.423,
"step": 1032000
},
{
"epoch": 2.92,
"learning_rate": 2.3233333333333334e-07,
"loss": 2.4063,
"step": 1040000
},
{
"epoch": 2.92,
"eval_loss": 2.2696707248687744,
"eval_runtime": 329.9244,
"eval_samples_per_second": 909.299,
"eval_steps_per_second": 56.831,
"step": 1040000
},
{
"epoch": 2.94,
"eval_loss": 2.2705624103546143,
"eval_runtime": 330.8986,
"eval_samples_per_second": 906.622,
"eval_steps_per_second": 56.664,
"step": 1048000
},
{
"epoch": 2.96,
"learning_rate": 2.2960000000000002e-07,
"loss": 2.399,
"step": 1056000
},
{
"epoch": 2.96,
"eval_loss": 2.2625114917755127,
"eval_runtime": 330.8268,
"eval_samples_per_second": 906.819,
"eval_steps_per_second": 56.676,
"step": 1056000
},
{
"epoch": 2.99,
"eval_loss": 2.26986026763916,
"eval_runtime": 330.4632,
"eval_samples_per_second": 907.817,
"eval_steps_per_second": 56.739,
"step": 1064000
},
{
"epoch": 3.01,
"learning_rate": 2.2686666666666667e-07,
"loss": 2.4024,
"step": 1072000
},
{
"epoch": 3.01,
"eval_loss": 2.2622313499450684,
"eval_runtime": 331.2446,
"eval_samples_per_second": 905.675,
"eval_steps_per_second": 56.605,
"step": 1072000
},
{
"epoch": 3.03,
"eval_loss": 2.269458293914795,
"eval_runtime": 330.8485,
"eval_samples_per_second": 906.759,
"eval_steps_per_second": 56.672,
"step": 1080000
},
{
"epoch": 3.05,
"learning_rate": 2.2413333333333333e-07,
"loss": 2.4035,
"step": 1088000
},
{
"epoch": 3.05,
"eval_loss": 2.2699954509735107,
"eval_runtime": 332.9859,
"eval_samples_per_second": 900.939,
"eval_steps_per_second": 56.309,
"step": 1088000
},
{
"epoch": 3.08,
"eval_loss": 2.262361526489258,
"eval_runtime": 333.2535,
"eval_samples_per_second": 900.216,
"eval_steps_per_second": 56.263,
"step": 1096000
},
{
"epoch": 3.1,
"learning_rate": 2.214e-07,
"loss": 2.4061,
"step": 1104000
},
{
"epoch": 3.1,
"eval_loss": 2.2690372467041016,
"eval_runtime": 332.4767,
"eval_samples_per_second": 902.319,
"eval_steps_per_second": 56.395,
"step": 1104000
},
{
"epoch": 3.12,
"eval_loss": 2.265334367752075,
"eval_runtime": 333.384,
"eval_samples_per_second": 899.863,
"eval_steps_per_second": 56.241,
"step": 1112000
},
{
"epoch": 3.14,
"learning_rate": 2.1866666666666667e-07,
"loss": 2.4044,
"step": 1120000
},
{
"epoch": 3.14,
"eval_loss": 2.267867088317871,
"eval_runtime": 332.4491,
"eval_samples_per_second": 902.394,
"eval_steps_per_second": 56.4,
"step": 1120000
},
{
"epoch": 3.17,
"eval_loss": 2.2657666206359863,
"eval_runtime": 337.9264,
"eval_samples_per_second": 887.767,
"eval_steps_per_second": 55.485,
"step": 1128000
},
{
"epoch": 3.19,
"learning_rate": 2.1593333333333332e-07,
"loss": 2.3996,
"step": 1136000
},
{
"epoch": 3.19,
"eval_loss": 2.2680134773254395,
"eval_runtime": 335.9795,
"eval_samples_per_second": 892.912,
"eval_steps_per_second": 55.807,
"step": 1136000
},
{
"epoch": 3.21,
"eval_loss": 2.26682186126709,
"eval_runtime": 332.0277,
"eval_samples_per_second": 903.539,
"eval_steps_per_second": 56.471,
"step": 1144000
},
{
"epoch": 3.23,
"learning_rate": 2.132e-07,
"loss": 2.3943,
"step": 1152000
},
{
"epoch": 3.23,
"eval_loss": 2.2689149379730225,
"eval_runtime": 332.3397,
"eval_samples_per_second": 902.691,
"eval_steps_per_second": 56.418,
"step": 1152000
},
{
"epoch": 3.26,
"eval_loss": 2.2701900005340576,
"eval_runtime": 333.2287,
"eval_samples_per_second": 900.283,
"eval_steps_per_second": 56.268,
"step": 1160000
},
{
"epoch": 3.28,
"learning_rate": 2.1046666666666666e-07,
"loss": 2.3948,
"step": 1168000
},
{
"epoch": 3.28,
"eval_loss": 2.2652790546417236,
"eval_runtime": 332.3733,
"eval_samples_per_second": 902.6,
"eval_steps_per_second": 56.412,
"step": 1168000
},
{
"epoch": 3.3,
"eval_loss": 2.262141466140747,
"eval_runtime": 332.7579,
"eval_samples_per_second": 901.556,
"eval_steps_per_second": 56.347,
"step": 1176000
},
{
"epoch": 3.32,
"learning_rate": 2.0773333333333334e-07,
"loss": 2.4047,
"step": 1184000
},
{
"epoch": 3.32,
"eval_loss": 2.272305488586426,
"eval_runtime": 332.21,
"eval_samples_per_second": 903.043,
"eval_steps_per_second": 56.44,
"step": 1184000
},
{
"epoch": 3.35,
"eval_loss": 2.271768808364868,
"eval_runtime": 334.301,
"eval_samples_per_second": 897.395,
"eval_steps_per_second": 56.087,
"step": 1192000
},
{
"epoch": 3.37,
"learning_rate": 2.05e-07,
"loss": 2.4057,
"step": 1200000
},
{
"epoch": 3.37,
"eval_loss": 2.266768217086792,
"eval_runtime": 331.8859,
"eval_samples_per_second": 903.925,
"eval_steps_per_second": 56.495,
"step": 1200000
},
{
"epoch": 3.39,
"eval_loss": 2.264948844909668,
"eval_runtime": 333.4261,
"eval_samples_per_second": 899.75,
"eval_steps_per_second": 56.234,
"step": 1208000
},
{
"epoch": 3.41,
"learning_rate": 2.0226666666666668e-07,
"loss": 2.3901,
"step": 1216000
},
{
"epoch": 3.41,
"eval_loss": 2.2699382305145264,
"eval_runtime": 334.7905,
"eval_samples_per_second": 896.083,
"eval_steps_per_second": 56.005,
"step": 1216000
},
{
"epoch": 3.44,
"eval_loss": 2.2682831287384033,
"eval_runtime": 335.082,
"eval_samples_per_second": 895.303,
"eval_steps_per_second": 55.956,
"step": 1224000
},
{
"epoch": 3.46,
"learning_rate": 1.9953333333333333e-07,
"loss": 2.3942,
"step": 1232000
},
{
"epoch": 3.46,
"eval_loss": 2.2679033279418945,
"eval_runtime": 333.2769,
"eval_samples_per_second": 900.152,
"eval_steps_per_second": 56.26,
"step": 1232000
},
{
"epoch": 3.48,
"eval_loss": 2.264688014984131,
"eval_runtime": 335.8312,
"eval_samples_per_second": 893.306,
"eval_steps_per_second": 55.832,
"step": 1240000
},
{
"epoch": 3.5,
"learning_rate": 1.968e-07,
"loss": 2.4052,
"step": 1248000
},
{
"epoch": 3.5,
"eval_loss": 2.265596866607666,
"eval_runtime": 333.6068,
"eval_samples_per_second": 899.262,
"eval_steps_per_second": 56.204,
"step": 1248000
},
{
"epoch": 3.53,
"eval_loss": 2.267854690551758,
"eval_runtime": 333.2939,
"eval_samples_per_second": 900.107,
"eval_steps_per_second": 56.257,
"step": 1256000
},
{
"epoch": 3.55,
"learning_rate": 1.9406666666666667e-07,
"loss": 2.401,
"step": 1264000
},
{
"epoch": 3.55,
"eval_loss": 2.268515110015869,
"eval_runtime": 332.5102,
"eval_samples_per_second": 902.228,
"eval_steps_per_second": 56.389,
"step": 1264000
},
{
"epoch": 3.57,
"eval_loss": 2.26540207862854,
"eval_runtime": 332.9978,
"eval_samples_per_second": 900.907,
"eval_steps_per_second": 56.307,
"step": 1272000
},
{
"epoch": 3.59,
"learning_rate": 1.9133333333333333e-07,
"loss": 2.4012,
"step": 1280000
},
{
"epoch": 3.59,
"eval_loss": 2.260671854019165,
"eval_runtime": 333.82,
"eval_samples_per_second": 898.688,
"eval_steps_per_second": 56.168,
"step": 1280000
},
{
"epoch": 3.62,
"eval_loss": 2.2668306827545166,
"eval_runtime": 334.7781,
"eval_samples_per_second": 896.116,
"eval_steps_per_second": 56.007,
"step": 1288000
},
{
"epoch": 3.64,
"learning_rate": 1.886e-07,
"loss": 2.4015,
"step": 1296000
},
{
"epoch": 3.64,
"eval_loss": 2.267199754714966,
"eval_runtime": 333.9129,
"eval_samples_per_second": 898.438,
"eval_steps_per_second": 56.152,
"step": 1296000
},
{
"epoch": 3.66,
"eval_loss": 2.268502712249756,
"eval_runtime": 334.246,
"eval_samples_per_second": 897.542,
"eval_steps_per_second": 56.096,
"step": 1304000
},
{
"epoch": 3.68,
"learning_rate": 1.8586666666666666e-07,
"loss": 2.4039,
"step": 1312000
},
{
"epoch": 3.68,
"eval_loss": 2.267529010772705,
"eval_runtime": 333.8135,
"eval_samples_per_second": 898.705,
"eval_steps_per_second": 56.169,
"step": 1312000
},
{
"epoch": 3.71,
"eval_loss": 2.2702226638793945,
"eval_runtime": 336.4463,
"eval_samples_per_second": 891.673,
"eval_steps_per_second": 55.73,
"step": 1320000
},
{
"epoch": 3.73,
"learning_rate": 1.8313333333333332e-07,
"loss": 2.3927,
"step": 1328000
},
{
"epoch": 3.73,
"eval_loss": 2.268892526626587,
"eval_runtime": 334.6454,
"eval_samples_per_second": 896.471,
"eval_steps_per_second": 56.029,
"step": 1328000
},
{
"epoch": 3.75,
"eval_loss": 2.2673678398132324,
"eval_runtime": 334.3792,
"eval_samples_per_second": 897.185,
"eval_steps_per_second": 56.074,
"step": 1336000
},
{
"epoch": 3.77,
"learning_rate": 1.804e-07,
"loss": 2.3998,
"step": 1344000
},
{
"epoch": 3.77,
"eval_loss": 2.2693703174591064,
"eval_runtime": 336.7748,
"eval_samples_per_second": 890.803,
"eval_steps_per_second": 55.675,
"step": 1344000
},
{
"epoch": 3.8,
"eval_loss": 2.264862298965454,
"eval_runtime": 336.6189,
"eval_samples_per_second": 891.216,
"eval_steps_per_second": 55.701,
"step": 1352000
},
{
"epoch": 3.82,
"learning_rate": 1.7766666666666666e-07,
"loss": 2.404,
"step": 1360000
},
{
"epoch": 3.82,
"eval_loss": 2.263476848602295,
"eval_runtime": 333.0441,
"eval_samples_per_second": 900.782,
"eval_steps_per_second": 56.299,
"step": 1360000
},
{
"epoch": 3.84,
"eval_loss": 2.2680845260620117,
"eval_runtime": 333.2221,
"eval_samples_per_second": 900.301,
"eval_steps_per_second": 56.269,
"step": 1368000
},
{
"epoch": 3.86,
"learning_rate": 1.7493333333333334e-07,
"loss": 2.4023,
"step": 1376000
},
{
"epoch": 3.86,
"eval_loss": 2.260050058364868,
"eval_runtime": 333.6835,
"eval_samples_per_second": 899.056,
"eval_steps_per_second": 56.191,
"step": 1376000
},
{
"epoch": 3.88,
"eval_loss": 2.2660913467407227,
"eval_runtime": 334.5678,
"eval_samples_per_second": 896.679,
"eval_steps_per_second": 56.042,
"step": 1384000
},
{
"epoch": 3.91,
"learning_rate": 1.722e-07,
"loss": 2.393,
"step": 1392000
},
{
"epoch": 3.91,
"eval_loss": 2.261288642883301,
"eval_runtime": 334.5524,
"eval_samples_per_second": 896.721,
"eval_steps_per_second": 56.045,
"step": 1392000
},
{
"epoch": 3.93,
"eval_loss": 2.271660327911377,
"eval_runtime": 334.4275,
"eval_samples_per_second": 897.055,
"eval_steps_per_second": 56.066,
"step": 1400000
},
{
"epoch": 3.95,
"learning_rate": 1.6946666666666668e-07,
"loss": 2.402,
"step": 1408000
},
{
"epoch": 3.95,
"eval_loss": 2.2671592235565186,
"eval_runtime": 333.6753,
"eval_samples_per_second": 899.078,
"eval_steps_per_second": 56.192,
"step": 1408000
},
{
"epoch": 3.97,
"eval_loss": 2.263709545135498,
"eval_runtime": 333.67,
"eval_samples_per_second": 899.092,
"eval_steps_per_second": 56.193,
"step": 1416000
},
{
"epoch": 4.0,
"learning_rate": 1.6673333333333333e-07,
"loss": 2.4047,
"step": 1424000
},
{
"epoch": 4.0,
"eval_loss": 2.2704622745513916,
"eval_runtime": 336.6456,
"eval_samples_per_second": 891.145,
"eval_steps_per_second": 55.697,
"step": 1424000
},
{
"epoch": 4.02,
"eval_loss": 2.2682485580444336,
"eval_runtime": 337.2045,
"eval_samples_per_second": 889.668,
"eval_steps_per_second": 55.604,
"step": 1432000
},
{
"epoch": 4.04,
"learning_rate": 1.64e-07,
"loss": 2.4045,
"step": 1440000
},
{
"epoch": 4.04,
"eval_loss": 2.2630040645599365,
"eval_runtime": 335.66,
"eval_samples_per_second": 893.761,
"eval_steps_per_second": 55.86,
"step": 1440000
},
{
"epoch": 4.06,
"eval_loss": 2.269909143447876,
"eval_runtime": 336.6708,
"eval_samples_per_second": 891.078,
"eval_steps_per_second": 55.692,
"step": 1448000
},
{
"epoch": 4.09,
"learning_rate": 1.6126666666666667e-07,
"loss": 2.3973,
"step": 1456000
},
{
"epoch": 4.09,
"eval_loss": 2.2578797340393066,
"eval_runtime": 335.7138,
"eval_samples_per_second": 893.618,
"eval_steps_per_second": 55.851,
"step": 1456000
},
{
"epoch": 4.11,
"eval_loss": 2.2601444721221924,
"eval_runtime": 334.2559,
"eval_samples_per_second": 897.516,
"eval_steps_per_second": 56.095,
"step": 1464000
},
{
"epoch": 4.13,
"learning_rate": 1.5853333333333332e-07,
"loss": 2.399,
"step": 1472000
},
{
"epoch": 4.13,
"eval_loss": 2.26086688041687,
"eval_runtime": 334.4066,
"eval_samples_per_second": 897.112,
"eval_steps_per_second": 56.069,
"step": 1472000
},
{
"epoch": 4.15,
"eval_loss": 2.269728660583496,
"eval_runtime": 334.0805,
"eval_samples_per_second": 897.987,
"eval_steps_per_second": 56.124,
"step": 1480000
},
{
"epoch": 4.18,
"learning_rate": 1.558e-07,
"loss": 2.399,
"step": 1488000
},
{
"epoch": 4.18,
"eval_loss": 2.2630419731140137,
"eval_runtime": 334.5552,
"eval_samples_per_second": 896.713,
"eval_steps_per_second": 56.045,
"step": 1488000
},
{
"epoch": 4.2,
"eval_loss": 2.2658443450927734,
"eval_runtime": 336.5508,
"eval_samples_per_second": 891.396,
"eval_steps_per_second": 55.712,
"step": 1496000
},
{
"epoch": 4.22,
"learning_rate": 1.5306666666666666e-07,
"loss": 2.3995,
"step": 1504000
},
{
"epoch": 4.22,
"eval_loss": 2.265606641769409,
"eval_runtime": 335.2841,
"eval_samples_per_second": 894.763,
"eval_steps_per_second": 55.923,
"step": 1504000
},
{
"epoch": 4.24,
"eval_loss": 2.2688894271850586,
"eval_runtime": 337.311,
"eval_samples_per_second": 889.387,
"eval_steps_per_second": 55.587,
"step": 1512000
},
{
"epoch": 4.27,
"learning_rate": 1.5033333333333332e-07,
"loss": 2.3929,
"step": 1520000
},
{
"epoch": 4.27,
"eval_loss": 2.2678134441375732,
"eval_runtime": 337.3214,
"eval_samples_per_second": 889.359,
"eval_steps_per_second": 55.585,
"step": 1520000
},
{
"epoch": 4.29,
"eval_loss": 2.2694430351257324,
"eval_runtime": 336.6085,
"eval_samples_per_second": 891.243,
"eval_steps_per_second": 55.703,
"step": 1528000
},
{
"epoch": 4.31,
"learning_rate": 1.476e-07,
"loss": 2.404,
"step": 1536000
},
{
"epoch": 4.31,
"eval_loss": 2.2631914615631104,
"eval_runtime": 337.5687,
"eval_samples_per_second": 888.708,
"eval_steps_per_second": 55.544,
"step": 1536000
},
{
"epoch": 4.33,
"eval_loss": 2.2656803131103516,
"eval_runtime": 336.4606,
"eval_samples_per_second": 891.635,
"eval_steps_per_second": 55.727,
"step": 1544000
},
{
"epoch": 4.36,
"learning_rate": 1.4486666666666665e-07,
"loss": 2.3932,
"step": 1552000
},
{
"epoch": 4.36,
"eval_loss": 2.2641873359680176,
"eval_runtime": 335.6292,
"eval_samples_per_second": 893.844,
"eval_steps_per_second": 55.865,
"step": 1552000
},
{
"epoch": 4.38,
"eval_loss": 2.260714054107666,
"eval_runtime": 335.5993,
"eval_samples_per_second": 893.923,
"eval_steps_per_second": 55.87,
"step": 1560000
},
{
"epoch": 4.4,
"learning_rate": 1.4213333333333334e-07,
"loss": 2.3985,
"step": 1568000
},
{
"epoch": 4.4,
"eval_loss": 2.2634730339050293,
"eval_runtime": 335.566,
"eval_samples_per_second": 894.012,
"eval_steps_per_second": 55.876,
"step": 1568000
},
{
"epoch": 4.42,
"eval_loss": 2.2645463943481445,
"eval_runtime": 337.3641,
"eval_samples_per_second": 889.247,
"eval_steps_per_second": 55.578,
"step": 1576000
},
{
"epoch": 4.45,
"learning_rate": 1.3940000000000002e-07,
"loss": 2.3997,
"step": 1584000
},
{
"epoch": 4.45,
"eval_loss": 2.2654054164886475,
"eval_runtime": 336.173,
"eval_samples_per_second": 892.398,
"eval_steps_per_second": 55.775,
"step": 1584000
},
{
"epoch": 4.47,
"eval_loss": 2.2672231197357178,
"eval_runtime": 336.1452,
"eval_samples_per_second": 892.472,
"eval_steps_per_second": 55.779,
"step": 1592000
},
{
"epoch": 4.49,
"learning_rate": 1.3666666666666665e-07,
"loss": 2.396,
"step": 1600000
},
{
"epoch": 4.49,
"eval_loss": 2.2665934562683105,
"eval_runtime": 336.5057,
"eval_samples_per_second": 891.515,
"eval_steps_per_second": 55.72,
"step": 1600000
},
{
"epoch": 4.51,
"eval_loss": 2.2708349227905273,
"eval_runtime": 335.6471,
"eval_samples_per_second": 893.796,
"eval_steps_per_second": 55.862,
"step": 1608000
},
{
"epoch": 4.54,
"learning_rate": 1.3393333333333333e-07,
"loss": 2.4012,
"step": 1616000
},
{
"epoch": 4.54,
"eval_loss": 2.2706656455993652,
"eval_runtime": 335.6113,
"eval_samples_per_second": 893.891,
"eval_steps_per_second": 55.868,
"step": 1616000
},
{
"epoch": 4.56,
"eval_loss": 2.2683677673339844,
"eval_runtime": 335.9133,
"eval_samples_per_second": 893.087,
"eval_steps_per_second": 55.818,
"step": 1624000
},
{
"epoch": 4.58,
"learning_rate": 1.312e-07,
"loss": 2.4074,
"step": 1632000
},
{
"epoch": 4.58,
"eval_loss": 2.2676126956939697,
"eval_runtime": 336.2793,
"eval_samples_per_second": 892.116,
"eval_steps_per_second": 55.757,
"step": 1632000
},
{
"epoch": 4.6,
"eval_loss": 2.2657711505889893,
"eval_runtime": 336.5159,
"eval_samples_per_second": 891.488,
"eval_steps_per_second": 55.718,
"step": 1640000
},
{
"epoch": 4.63,
"learning_rate": 1.2846666666666667e-07,
"loss": 2.3965,
"step": 1648000
},
{
"epoch": 4.63,
"eval_loss": 2.2716164588928223,
"eval_runtime": 335.6672,
"eval_samples_per_second": 893.742,
"eval_steps_per_second": 55.859,
"step": 1648000
},
{
"epoch": 4.65,
"eval_loss": 2.2655858993530273,
"eval_runtime": 335.9521,
"eval_samples_per_second": 892.984,
"eval_steps_per_second": 55.812,
"step": 1656000
},
{
"epoch": 4.67,
"learning_rate": 1.2573333333333332e-07,
"loss": 2.4021,
"step": 1664000
},
{
"epoch": 4.67,
"eval_loss": 2.2689690589904785,
"eval_runtime": 336.4235,
"eval_samples_per_second": 891.733,
"eval_steps_per_second": 55.733,
"step": 1664000
},
{
"epoch": 4.69,
"eval_loss": 2.265604257583618,
"eval_runtime": 337.1771,
"eval_samples_per_second": 889.74,
"eval_steps_per_second": 55.609,
"step": 1672000
},
{
"epoch": 4.72,
"learning_rate": 1.23e-07,
"loss": 2.3981,
"step": 1680000
},
{
"epoch": 4.72,
"eval_loss": 2.2659354209899902,
"eval_runtime": 337.0582,
"eval_samples_per_second": 890.054,
"eval_steps_per_second": 55.628,
"step": 1680000
},
{
"epoch": 4.74,
"eval_loss": 2.2666890621185303,
"eval_runtime": 336.7986,
"eval_samples_per_second": 890.74,
"eval_steps_per_second": 55.671,
"step": 1688000
},
{
"epoch": 4.76,
"learning_rate": 1.2026666666666666e-07,
"loss": 2.3974,
"step": 1696000
},
{
"epoch": 4.76,
"eval_loss": 2.2654528617858887,
"eval_runtime": 338.6552,
"eval_samples_per_second": 885.857,
"eval_steps_per_second": 55.366,
"step": 1696000
},
{
"epoch": 4.78,
"eval_loss": 2.2675693035125732,
"eval_runtime": 336.4191,
"eval_samples_per_second": 891.745,
"eval_steps_per_second": 55.734,
"step": 1704000
},
{
"epoch": 4.81,
"learning_rate": 1.1753333333333334e-07,
"loss": 2.3964,
"step": 1712000
},
{
"epoch": 4.81,
"eval_loss": 2.265490770339966,
"eval_runtime": 338.7304,
"eval_samples_per_second": 885.66,
"eval_steps_per_second": 55.354,
"step": 1712000
},
{
"epoch": 4.83,
"eval_loss": 2.2635693550109863,
"eval_runtime": 337.2341,
"eval_samples_per_second": 889.59,
"eval_steps_per_second": 55.599,
"step": 1720000
},
{
"epoch": 4.85,
"learning_rate": 1.1480000000000001e-07,
"loss": 2.3933,
"step": 1728000
},
{
"epoch": 4.85,
"eval_loss": 2.267894983291626,
"eval_runtime": 337.1638,
"eval_samples_per_second": 889.775,
"eval_steps_per_second": 55.611,
"step": 1728000
},
{
"epoch": 4.87,
"eval_loss": 2.266650438308716,
"eval_runtime": 337.1959,
"eval_samples_per_second": 889.69,
"eval_steps_per_second": 55.606,
"step": 1736000
},
{
"epoch": 4.9,
"learning_rate": 1.1206666666666666e-07,
"loss": 2.4066,
"step": 1744000
},
{
"epoch": 4.9,
"eval_loss": 2.264688730239868,
"eval_runtime": 338.0924,
"eval_samples_per_second": 887.331,
"eval_steps_per_second": 55.458,
"step": 1744000
},
{
"epoch": 4.92,
"eval_loss": 2.265735149383545,
"eval_runtime": 338.8846,
"eval_samples_per_second": 885.257,
"eval_steps_per_second": 55.329,
"step": 1752000
},
{
"epoch": 4.94,
"learning_rate": 1.0933333333333333e-07,
"loss": 2.4027,
"step": 1760000
},
{
"epoch": 4.94,
"eval_loss": 2.2628121376037598,
"eval_runtime": 337.9881,
"eval_samples_per_second": 887.605,
"eval_steps_per_second": 55.475,
"step": 1760000
},
{
"epoch": 4.96,
"eval_loss": 2.2642323970794678,
"eval_runtime": 339.1796,
"eval_samples_per_second": 884.487,
"eval_steps_per_second": 55.28,
"step": 1768000
},
{
"epoch": 4.99,
"learning_rate": 1.066e-07,
"loss": 2.4029,
"step": 1776000
},
{
"epoch": 4.99,
"eval_loss": 2.2676889896392822,
"eval_runtime": 338.3313,
"eval_samples_per_second": 886.705,
"eval_steps_per_second": 55.419,
"step": 1776000
},
{
"epoch": 5.01,
"eval_loss": 2.2704169750213623,
"eval_runtime": 340.3735,
"eval_samples_per_second": 881.385,
"eval_steps_per_second": 55.087,
"step": 1784000
},
{
"epoch": 5.03,
"learning_rate": 1.0386666666666667e-07,
"loss": 2.3958,
"step": 1792000
},
{
"epoch": 5.03,
"eval_loss": 2.2650022506713867,
"eval_runtime": 337.884,
"eval_samples_per_second": 887.879,
"eval_steps_per_second": 55.492,
"step": 1792000
},
{
"epoch": 5.05,
"eval_loss": 2.265009880065918,
"eval_runtime": 339.0311,
"eval_samples_per_second": 884.875,
"eval_steps_per_second": 55.305,
"step": 1800000
},
{
"epoch": 5.08,
"learning_rate": 1.0113333333333334e-07,
"loss": 2.4054,
"step": 1808000
},
{
"epoch": 5.08,
"eval_loss": 2.2680423259735107,
"eval_runtime": 338.3773,
"eval_samples_per_second": 886.584,
"eval_steps_per_second": 55.412,
"step": 1808000
},
{
"epoch": 5.1,
"eval_loss": 2.2601048946380615,
"eval_runtime": 338.8902,
"eval_samples_per_second": 885.243,
"eval_steps_per_second": 55.328,
"step": 1816000
},
{
"epoch": 5.12,
"learning_rate": 9.84e-08,
"loss": 2.3984,
"step": 1824000
},
{
"epoch": 5.12,
"eval_loss": 2.267129898071289,
"eval_runtime": 341.218,
"eval_samples_per_second": 879.203,
"eval_steps_per_second": 54.95,
"step": 1824000
},
{
"epoch": 5.14,
"eval_loss": 2.263897657394409,
"eval_runtime": 339.0811,
"eval_samples_per_second": 884.744,
"eval_steps_per_second": 55.296,
"step": 1832000
},
{
"epoch": 5.16,
"learning_rate": 9.566666666666666e-08,
"loss": 2.4005,
"step": 1840000
},
{
"epoch": 5.16,
"eval_loss": 2.262948989868164,
"eval_runtime": 338.4625,
"eval_samples_per_second": 886.361,
"eval_steps_per_second": 55.398,
"step": 1840000
},
{
"epoch": 5.19,
"eval_loss": 2.2656354904174805,
"eval_runtime": 339.1914,
"eval_samples_per_second": 884.456,
"eval_steps_per_second": 55.279,
"step": 1848000
},
{
"epoch": 5.21,
"learning_rate": 9.293333333333333e-08,
"loss": 2.3962,
"step": 1856000
},
{
"epoch": 5.21,
"eval_loss": 2.2646210193634033,
"eval_runtime": 339.4764,
"eval_samples_per_second": 883.714,
"eval_steps_per_second": 55.232,
"step": 1856000
},
{
"epoch": 5.23,
"eval_loss": 2.2571327686309814,
"eval_runtime": 340.4494,
"eval_samples_per_second": 881.188,
"eval_steps_per_second": 55.074,
"step": 1864000
},
{
"epoch": 5.25,
"learning_rate": 9.02e-08,
"loss": 2.4033,
"step": 1872000
},
{
"epoch": 5.25,
"eval_loss": 2.2689077854156494,
"eval_runtime": 339.6348,
"eval_samples_per_second": 883.302,
"eval_steps_per_second": 55.206,
"step": 1872000
},
{
"epoch": 5.28,
"eval_loss": 2.263167381286621,
"eval_runtime": 340.3091,
"eval_samples_per_second": 881.552,
"eval_steps_per_second": 55.097,
"step": 1880000
},
{
"epoch": 5.3,
"learning_rate": 8.746666666666667e-08,
"loss": 2.4064,
"step": 1888000
},
{
"epoch": 5.3,
"eval_loss": 2.2632765769958496,
"eval_runtime": 342.5582,
"eval_samples_per_second": 875.764,
"eval_steps_per_second": 54.735,
"step": 1888000
},
{
"epoch": 5.32,
"eval_loss": 2.2693655490875244,
"eval_runtime": 342.7491,
"eval_samples_per_second": 875.276,
"eval_steps_per_second": 54.705,
"step": 1896000
},
{
"epoch": 5.34,
"learning_rate": 8.473333333333334e-08,
"loss": 2.3967,
"step": 1904000
},
{
"epoch": 5.34,
"eval_loss": 2.2685184478759766,
"eval_runtime": 342.158,
"eval_samples_per_second": 876.788,
"eval_steps_per_second": 54.799,
"step": 1904000
},
{
"epoch": 5.37,
"eval_loss": 2.2636401653289795,
"eval_runtime": 341.2652,
"eval_samples_per_second": 879.082,
"eval_steps_per_second": 54.943,
"step": 1912000
},
{
"epoch": 5.39,
"learning_rate": 8.2e-08,
"loss": 2.4002,
"step": 1920000
},
{
"epoch": 5.39,
"eval_loss": 2.268721103668213,
"eval_runtime": 343.2554,
"eval_samples_per_second": 873.985,
"eval_steps_per_second": 54.624,
"step": 1920000
},
{
"epoch": 5.41,
"eval_loss": 2.263157844543457,
"eval_runtime": 341.2197,
"eval_samples_per_second": 879.199,
"eval_steps_per_second": 54.95,
"step": 1928000
},
{
"epoch": 5.43,
"learning_rate": 7.926666666666666e-08,
"loss": 2.4045,
"step": 1936000
},
{
"epoch": 5.43,
"eval_loss": 2.262470006942749,
"eval_runtime": 342.6853,
"eval_samples_per_second": 875.439,
"eval_steps_per_second": 54.715,
"step": 1936000
},
{
"epoch": 5.46,
"eval_loss": 2.267735242843628,
"eval_runtime": 346.6665,
"eval_samples_per_second": 865.385,
"eval_steps_per_second": 54.087,
"step": 1944000
},
{
"epoch": 5.48,
"learning_rate": 7.653333333333333e-08,
"loss": 2.4096,
"step": 1952000
},
{
"epoch": 5.48,
"eval_loss": 2.256277322769165,
"eval_runtime": 340.6214,
"eval_samples_per_second": 880.743,
"eval_steps_per_second": 55.046,
"step": 1952000
},
{
"epoch": 5.5,
"eval_loss": 2.264164447784424,
"eval_runtime": 341.931,
"eval_samples_per_second": 877.37,
"eval_steps_per_second": 54.836,
"step": 1960000
},
{
"epoch": 5.52,
"learning_rate": 7.38e-08,
"loss": 2.4004,
"step": 1968000
},
{
"epoch": 5.52,
"eval_loss": 2.269155979156494,
"eval_runtime": 342.3742,
"eval_samples_per_second": 876.234,
"eval_steps_per_second": 54.765,
"step": 1968000
},
{
"epoch": 5.55,
"eval_loss": 2.2696123123168945,
"eval_runtime": 345.6816,
"eval_samples_per_second": 867.851,
"eval_steps_per_second": 54.241,
"step": 1976000
},
{
"epoch": 5.57,
"learning_rate": 7.106666666666667e-08,
"loss": 2.4065,
"step": 1984000
},
{
"epoch": 5.57,
"eval_loss": 2.2579238414764404,
"eval_runtime": 341.8896,
"eval_samples_per_second": 877.476,
"eval_steps_per_second": 54.842,
"step": 1984000
},
{
"epoch": 5.59,
"eval_loss": 2.266026020050049,
"eval_runtime": 344.4173,
"eval_samples_per_second": 871.036,
"eval_steps_per_second": 54.44,
"step": 1992000
},
{
"epoch": 5.61,
"learning_rate": 6.833333333333332e-08,
"loss": 2.4025,
"step": 2000000
},
{
"epoch": 5.61,
"eval_loss": 2.2654054164886475,
"eval_runtime": 342.2708,
"eval_samples_per_second": 876.499,
"eval_steps_per_second": 54.781,
"step": 2000000
},
{
"epoch": 5.64,
"eval_loss": 2.2706494331359863,
"eval_runtime": 341.5445,
"eval_samples_per_second": 878.363,
"eval_steps_per_second": 54.898,
"step": 2008000
},
{
"epoch": 5.66,
"learning_rate": 6.56e-08,
"loss": 2.3993,
"step": 2016000
},
{
"epoch": 5.66,
"eval_loss": 2.270448684692383,
"eval_runtime": 340.9974,
"eval_samples_per_second": 879.772,
"eval_steps_per_second": 54.986,
"step": 2016000
},
{
"epoch": 5.68,
"eval_loss": 2.2663590908050537,
"eval_runtime": 340.7056,
"eval_samples_per_second": 880.526,
"eval_steps_per_second": 55.033,
"step": 2024000
},
{
"epoch": 5.7,
"learning_rate": 6.286666666666666e-08,
"loss": 2.4034,
"step": 2032000
},
{
"epoch": 5.7,
"eval_loss": 2.2659454345703125,
"eval_runtime": 341.9489,
"eval_samples_per_second": 877.324,
"eval_steps_per_second": 54.833,
"step": 2032000
},
{
"epoch": 5.73,
"eval_loss": 2.268005609512329,
"eval_runtime": 340.8655,
"eval_samples_per_second": 880.113,
"eval_steps_per_second": 55.007,
"step": 2040000
},
{
"epoch": 5.75,
"learning_rate": 6.013333333333333e-08,
"loss": 2.4004,
"step": 2048000
},
{
"epoch": 5.75,
"eval_loss": 2.2611002922058105,
"eval_runtime": 340.9511,
"eval_samples_per_second": 879.891,
"eval_steps_per_second": 54.993,
"step": 2048000
},
{
"epoch": 5.77,
"eval_loss": 2.264587879180908,
"eval_runtime": 342.5116,
"eval_samples_per_second": 875.883,
"eval_steps_per_second": 54.743,
"step": 2056000
},
{
"epoch": 5.79,
"learning_rate": 5.7400000000000004e-08,
"loss": 2.4025,
"step": 2064000
},
{
"epoch": 5.79,
"eval_loss": 2.268247604370117,
"eval_runtime": 343.4269,
"eval_samples_per_second": 873.548,
"eval_steps_per_second": 54.597,
"step": 2064000
},
{
"epoch": 5.82,
"eval_loss": 2.264587640762329,
"eval_runtime": 341.3392,
"eval_samples_per_second": 878.891,
"eval_steps_per_second": 54.931,
"step": 2072000
},
{
"epoch": 5.84,
"learning_rate": 5.4666666666666666e-08,
"loss": 2.4063,
"step": 2080000
},
{
"epoch": 5.84,
"eval_loss": 2.2597994804382324,
"eval_runtime": 343.1178,
"eval_samples_per_second": 874.335,
"eval_steps_per_second": 54.646,
"step": 2080000
},
{
"epoch": 5.86,
"eval_loss": 2.267334461212158,
"eval_runtime": 344.4059,
"eval_samples_per_second": 871.065,
"eval_steps_per_second": 54.442,
"step": 2088000
},
{
"epoch": 5.88,
"learning_rate": 5.1933333333333335e-08,
"loss": 2.4071,
"step": 2096000
},
{
"epoch": 5.88,
"eval_loss": 2.264587879180908,
"eval_runtime": 342.5952,
"eval_samples_per_second": 875.669,
"eval_steps_per_second": 54.729,
"step": 2096000
},
{
"epoch": 5.91,
"eval_loss": 2.2672042846679688,
"eval_runtime": 342.3657,
"eval_samples_per_second": 876.256,
"eval_steps_per_second": 54.766,
"step": 2104000
},
{
"epoch": 5.93,
"learning_rate": 4.92e-08,
"loss": 2.401,
"step": 2112000
},
{
"epoch": 5.93,
"eval_loss": 2.2647833824157715,
"eval_runtime": 343.2309,
"eval_samples_per_second": 874.047,
"eval_steps_per_second": 54.628,
"step": 2112000
},
{
"epoch": 5.95,
"eval_loss": 2.2654144763946533,
"eval_runtime": 344.1951,
"eval_samples_per_second": 871.599,
"eval_steps_per_second": 54.475,
"step": 2120000
},
{
"epoch": 5.97,
"learning_rate": 4.6466666666666666e-08,
"loss": 2.402,
"step": 2128000
},
{
"epoch": 5.97,
"eval_loss": 2.2664010524749756,
"eval_runtime": 342.7081,
"eval_samples_per_second": 875.381,
"eval_steps_per_second": 54.711,
"step": 2128000
},
{
"epoch": 6.0,
"eval_loss": 2.2682883739471436,
"eval_runtime": 342.1336,
"eval_samples_per_second": 876.851,
"eval_steps_per_second": 54.803,
"step": 2136000
},
{
"epoch": 6.02,
"learning_rate": 4.3733333333333335e-08,
"loss": 2.4004,
"step": 2144000
},
{
"epoch": 6.02,
"eval_loss": 2.261821985244751,
"eval_runtime": 343.7815,
"eval_samples_per_second": 872.647,
"eval_steps_per_second": 54.54,
"step": 2144000
},
{
"epoch": 6.04,
"eval_loss": 2.2668938636779785,
"eval_runtime": 344.1074,
"eval_samples_per_second": 871.821,
"eval_steps_per_second": 54.489,
"step": 2152000
},
{
"epoch": 6.06,
"learning_rate": 4.1e-08,
"loss": 2.4001,
"step": 2160000
},
{
"epoch": 6.06,
"eval_loss": 2.2630324363708496,
"eval_runtime": 341.9786,
"eval_samples_per_second": 877.248,
"eval_steps_per_second": 54.828,
"step": 2160000
},
{
"epoch": 6.09,
"eval_loss": 2.2631518840789795,
"eval_runtime": 341.9226,
"eval_samples_per_second": 877.391,
"eval_steps_per_second": 54.837,
"step": 2168000
},
{
"epoch": 6.11,
"learning_rate": 3.8266666666666665e-08,
"loss": 2.4046,
"step": 2176000
},
{
"epoch": 6.11,
"eval_loss": 2.26960825920105,
"eval_runtime": 344.2789,
"eval_samples_per_second": 871.387,
"eval_steps_per_second": 54.462,
"step": 2176000
},
{
"epoch": 6.13,
"eval_loss": 2.2641026973724365,
"eval_runtime": 343.3436,
"eval_samples_per_second": 873.76,
"eval_steps_per_second": 54.61,
"step": 2184000
},
{
"epoch": 6.15,
"learning_rate": 3.5533333333333334e-08,
"loss": 2.405,
"step": 2192000
},
{
"epoch": 6.15,
"eval_loss": 2.262655735015869,
"eval_runtime": 344.8039,
"eval_samples_per_second": 870.06,
"eval_steps_per_second": 54.379,
"step": 2192000
},
{
"epoch": 6.18,
"eval_loss": 2.268143653869629,
"eval_runtime": 343.934,
"eval_samples_per_second": 872.26,
"eval_steps_per_second": 54.516,
"step": 2200000
},
{
"epoch": 6.2,
"learning_rate": 3.28e-08,
"loss": 2.4063,
"step": 2208000
},
{
"epoch": 6.2,
"eval_loss": 2.2603704929351807,
"eval_runtime": 342.6448,
"eval_samples_per_second": 875.542,
"eval_steps_per_second": 54.721,
"step": 2208000
},
{
"epoch": 6.22,
"eval_loss": 2.271454095840454,
"eval_runtime": 343.324,
"eval_samples_per_second": 873.81,
"eval_steps_per_second": 54.613,
"step": 2216000
},
{
"epoch": 6.24,
"learning_rate": 3.0066666666666665e-08,
"loss": 2.3991,
"step": 2224000
},
{
"epoch": 6.24,
"eval_loss": 2.268319606781006,
"eval_runtime": 342.6834,
"eval_samples_per_second": 875.444,
"eval_steps_per_second": 54.715,
"step": 2224000
},
{
"epoch": 6.27,
"eval_loss": 2.265730857849121,
"eval_runtime": 346.275,
"eval_samples_per_second": 866.363,
"eval_steps_per_second": 54.148,
"step": 2232000
},
{
"epoch": 6.29,
"learning_rate": 2.7333333333333333e-08,
"loss": 2.405,
"step": 2240000
},
{
"epoch": 6.29,
"eval_loss": 2.2645092010498047,
"eval_runtime": 343.3622,
"eval_samples_per_second": 873.713,
"eval_steps_per_second": 54.607,
"step": 2240000
},
{
"epoch": 6.31,
"eval_loss": 2.2676303386688232,
"eval_runtime": 343.161,
"eval_samples_per_second": 874.225,
"eval_steps_per_second": 54.639,
"step": 2248000
},
{
"epoch": 6.33,
"learning_rate": 2.46e-08,
"loss": 2.3941,
"step": 2256000
},
{
"epoch": 6.33,
"eval_loss": 2.270566463470459,
"eval_runtime": 344.7989,
"eval_samples_per_second": 870.072,
"eval_steps_per_second": 54.38,
"step": 2256000
},
{
"epoch": 6.36,
"eval_loss": 2.259324312210083,
"eval_runtime": 344.3396,
"eval_samples_per_second": 871.233,
"eval_steps_per_second": 54.452,
"step": 2264000
},
{
"epoch": 6.38,
"learning_rate": 2.1866666666666667e-08,
"loss": 2.4041,
"step": 2272000
},
{
"epoch": 6.38,
"eval_loss": 2.267908811569214,
"eval_runtime": 344.2377,
"eval_samples_per_second": 871.491,
"eval_steps_per_second": 54.468,
"step": 2272000
},
{
"epoch": 6.4,
"eval_loss": 2.2643110752105713,
"eval_runtime": 343.3047,
"eval_samples_per_second": 873.859,
"eval_steps_per_second": 54.616,
"step": 2280000
},
{
"epoch": 6.42,
"learning_rate": 1.9133333333333333e-08,
"loss": 2.4001,
"step": 2288000
},
{
"epoch": 6.42,
"eval_loss": 2.2728431224823,
"eval_runtime": 343.644,
"eval_samples_per_second": 872.996,
"eval_steps_per_second": 54.562,
"step": 2288000
},
{
"epoch": 6.44,
"eval_loss": 2.263103485107422,
"eval_runtime": 343.0897,
"eval_samples_per_second": 874.407,
"eval_steps_per_second": 54.65,
"step": 2296000
},
{
"epoch": 6.47,
"learning_rate": 1.64e-08,
"loss": 2.3983,
"step": 2304000
},
{
"epoch": 6.47,
"eval_loss": 2.263552188873291,
"eval_runtime": 344.7078,
"eval_samples_per_second": 870.302,
"eval_steps_per_second": 54.394,
"step": 2304000
},
{
"epoch": 6.49,
"eval_loss": 2.262969732284546,
"eval_runtime": 343.3199,
"eval_samples_per_second": 873.821,
"eval_steps_per_second": 54.614,
"step": 2312000
},
{
"epoch": 6.51,
"learning_rate": 1.3666666666666667e-08,
"loss": 2.4003,
"step": 2320000
},
{
"epoch": 6.51,
"eval_loss": 2.2662770748138428,
"eval_runtime": 344.4313,
"eval_samples_per_second": 871.001,
"eval_steps_per_second": 54.438,
"step": 2320000
},
{
"epoch": 6.53,
"eval_loss": 2.264718770980835,
"eval_runtime": 344.3318,
"eval_samples_per_second": 871.253,
"eval_steps_per_second": 54.453,
"step": 2328000
},
{
"epoch": 6.56,
"learning_rate": 1.0933333333333334e-08,
"loss": 2.3981,
"step": 2336000
},
{
"epoch": 6.56,
"eval_loss": 2.2669222354888916,
"eval_runtime": 344.4268,
"eval_samples_per_second": 871.012,
"eval_steps_per_second": 54.438,
"step": 2336000
},
{
"epoch": 6.58,
"eval_loss": 2.266000509262085,
"eval_runtime": 344.4815,
"eval_samples_per_second": 870.874,
"eval_steps_per_second": 54.43,
"step": 2344000
},
{
"epoch": 6.6,
"learning_rate": 8.2e-09,
"loss": 2.3951,
"step": 2352000
},
{
"epoch": 6.6,
"eval_loss": 2.2692267894744873,
"eval_runtime": 344.0579,
"eval_samples_per_second": 871.946,
"eval_steps_per_second": 54.497,
"step": 2352000
},
{
"epoch": 6.62,
"eval_loss": 2.264406442642212,
"eval_runtime": 344.7783,
"eval_samples_per_second": 870.124,
"eval_steps_per_second": 54.383,
"step": 2360000
},
{
"epoch": 6.65,
"learning_rate": 5.466666666666667e-09,
"loss": 2.4013,
"step": 2368000
},
{
"epoch": 6.65,
"eval_loss": 2.2610132694244385,
"eval_runtime": 344.5393,
"eval_samples_per_second": 870.728,
"eval_steps_per_second": 54.42,
"step": 2368000
},
{
"epoch": 6.67,
"eval_loss": 2.26550555229187,
"eval_runtime": 344.5292,
"eval_samples_per_second": 870.754,
"eval_steps_per_second": 54.422,
"step": 2376000
},
{
"epoch": 6.69,
"learning_rate": 2.7333333333333334e-09,
"loss": 2.4,
"step": 2384000
},
{
"epoch": 6.69,
"eval_loss": 2.25915789604187,
"eval_runtime": 344.8958,
"eval_samples_per_second": 869.828,
"eval_steps_per_second": 54.364,
"step": 2384000
},
{
"epoch": 6.71,
"eval_loss": 2.266591787338257,
"eval_runtime": 344.5939,
"eval_samples_per_second": 870.59,
"eval_steps_per_second": 54.412,
"step": 2392000
},
{
"epoch": 6.74,
"learning_rate": 0.0,
"loss": 2.3975,
"step": 2400000
},
{
"epoch": 6.74,
"eval_loss": 2.2684991359710693,
"eval_runtime": 344.5329,
"eval_samples_per_second": 870.744,
"eval_steps_per_second": 54.422,
"step": 2400000
},
{
"epoch": 6.74,
"step": 2400000,
"total_flos": 8.367702695823237e+17,
"train_loss": 2.4076748518880207,
"train_runtime": 247856.7094,
"train_samples_per_second": 154.928,
"train_steps_per_second": 9.683
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 7,
"save_steps": 32000,
"total_flos": 8.367702695823237e+17,
"trial_name": null,
"trial_params": null
}