|
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 30.0,
|
|
"eval_steps": 100,
|
|
"global_step": 28800,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.10416666666666667,
|
|
"eval_loss": 0.07793249189853668,
|
|
"eval_runtime": 3.1433,
|
|
"eval_samples_per_second": 159.069,
|
|
"eval_steps_per_second": 15.907,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.20833333333333334,
|
|
"eval_loss": 0.060976069420576096,
|
|
"eval_runtime": 5.2088,
|
|
"eval_samples_per_second": 95.992,
|
|
"eval_steps_per_second": 9.599,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.3125,
|
|
"eval_loss": 0.06986938416957855,
|
|
"eval_runtime": 5.0795,
|
|
"eval_samples_per_second": 98.436,
|
|
"eval_steps_per_second": 9.844,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.4166666666666667,
|
|
"eval_loss": 0.049215417355298996,
|
|
"eval_runtime": 3.1571,
|
|
"eval_samples_per_second": 158.372,
|
|
"eval_steps_per_second": 15.837,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.5208333333333334,
|
|
"grad_norm": 2.454777240753174,
|
|
"learning_rate": 8.680555555555556e-06,
|
|
"loss": 0.1299,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.5208333333333334,
|
|
"eval_loss": 0.05049608275294304,
|
|
"eval_runtime": 3.1336,
|
|
"eval_samples_per_second": 159.56,
|
|
"eval_steps_per_second": 15.956,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.625,
|
|
"eval_loss": 0.05414561927318573,
|
|
"eval_runtime": 3.0995,
|
|
"eval_samples_per_second": 161.315,
|
|
"eval_steps_per_second": 16.131,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.7291666666666666,
|
|
"eval_loss": 0.058882277458906174,
|
|
"eval_runtime": 3.2082,
|
|
"eval_samples_per_second": 155.852,
|
|
"eval_steps_per_second": 15.585,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"eval_loss": 0.05974648892879486,
|
|
"eval_runtime": 4.9789,
|
|
"eval_samples_per_second": 100.424,
|
|
"eval_steps_per_second": 10.042,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.9375,
|
|
"eval_loss": 0.06364227831363678,
|
|
"eval_runtime": 3.1858,
|
|
"eval_samples_per_second": 156.948,
|
|
"eval_steps_per_second": 15.695,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.0416666666666667,
|
|
"grad_norm": 0.6711832284927368,
|
|
"learning_rate": 1.736111111111111e-05,
|
|
"loss": 0.1028,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.0416666666666667,
|
|
"eval_loss": 0.07583945989608765,
|
|
"eval_runtime": 3.1485,
|
|
"eval_samples_per_second": 158.808,
|
|
"eval_steps_per_second": 15.881,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.1458333333333333,
|
|
"eval_loss": 0.06294066458940506,
|
|
"eval_runtime": 3.7666,
|
|
"eval_samples_per_second": 132.746,
|
|
"eval_steps_per_second": 13.275,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.25,
|
|
"eval_loss": 0.06619992107152939,
|
|
"eval_runtime": 3.2391,
|
|
"eval_samples_per_second": 154.363,
|
|
"eval_steps_per_second": 15.436,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.3541666666666667,
|
|
"eval_loss": 0.09093113243579865,
|
|
"eval_runtime": 5.1419,
|
|
"eval_samples_per_second": 97.241,
|
|
"eval_steps_per_second": 9.724,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.4583333333333333,
|
|
"eval_loss": 0.10987065732479095,
|
|
"eval_runtime": 3.0882,
|
|
"eval_samples_per_second": 161.906,
|
|
"eval_steps_per_second": 16.191,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.5625,
|
|
"grad_norm": 23.796688079833984,
|
|
"learning_rate": 2.604166666666667e-05,
|
|
"loss": 0.091,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.5625,
|
|
"eval_loss": 0.10730883479118347,
|
|
"eval_runtime": 3.112,
|
|
"eval_samples_per_second": 160.667,
|
|
"eval_steps_per_second": 16.067,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"eval_loss": 0.11321321874856949,
|
|
"eval_runtime": 3.1391,
|
|
"eval_samples_per_second": 159.279,
|
|
"eval_steps_per_second": 15.928,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.7708333333333335,
|
|
"eval_loss": 0.09883977472782135,
|
|
"eval_runtime": 5.0716,
|
|
"eval_samples_per_second": 98.587,
|
|
"eval_steps_per_second": 9.859,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 1.875,
|
|
"eval_loss": 0.11777649074792862,
|
|
"eval_runtime": 5.1982,
|
|
"eval_samples_per_second": 96.187,
|
|
"eval_steps_per_second": 9.619,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 1.9791666666666665,
|
|
"eval_loss": 0.09968377649784088,
|
|
"eval_runtime": 3.1348,
|
|
"eval_samples_per_second": 159.499,
|
|
"eval_steps_per_second": 15.95,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.0833333333333335,
|
|
"grad_norm": 1.0350979566574097,
|
|
"learning_rate": 3.472222222222222e-05,
|
|
"loss": 0.1147,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.0833333333333335,
|
|
"eval_loss": 0.0993076041340828,
|
|
"eval_runtime": 3.1904,
|
|
"eval_samples_per_second": 156.719,
|
|
"eval_steps_per_second": 15.672,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.1875,
|
|
"eval_loss": 0.09541837126016617,
|
|
"eval_runtime": 3.0402,
|
|
"eval_samples_per_second": 164.463,
|
|
"eval_steps_per_second": 16.446,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.2916666666666665,
|
|
"eval_loss": 0.1211095005273819,
|
|
"eval_runtime": 3.0483,
|
|
"eval_samples_per_second": 164.026,
|
|
"eval_steps_per_second": 16.403,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.3958333333333335,
|
|
"eval_loss": 0.10396522283554077,
|
|
"eval_runtime": 3.0322,
|
|
"eval_samples_per_second": 164.895,
|
|
"eval_steps_per_second": 16.489,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"eval_loss": 0.15123824775218964,
|
|
"eval_runtime": 3.0437,
|
|
"eval_samples_per_second": 164.275,
|
|
"eval_steps_per_second": 16.427,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 2.6041666666666665,
|
|
"grad_norm": 2.144970178604126,
|
|
"learning_rate": 4.340277777777778e-05,
|
|
"loss": 0.0832,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 2.6041666666666665,
|
|
"eval_loss": 0.16908738017082214,
|
|
"eval_runtime": 4.9423,
|
|
"eval_samples_per_second": 101.167,
|
|
"eval_steps_per_second": 10.117,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 2.7083333333333335,
|
|
"eval_loss": 0.16467152535915375,
|
|
"eval_runtime": 3.999,
|
|
"eval_samples_per_second": 125.033,
|
|
"eval_steps_per_second": 12.503,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 2.8125,
|
|
"eval_loss": 0.19610275328159332,
|
|
"eval_runtime": 4.7478,
|
|
"eval_samples_per_second": 105.311,
|
|
"eval_steps_per_second": 10.531,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 2.9166666666666665,
|
|
"eval_loss": 0.2006768137216568,
|
|
"eval_runtime": 4.7959,
|
|
"eval_samples_per_second": 104.256,
|
|
"eval_steps_per_second": 10.426,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 3.0208333333333335,
|
|
"eval_loss": 0.1452997922897339,
|
|
"eval_runtime": 4.8491,
|
|
"eval_samples_per_second": 103.111,
|
|
"eval_steps_per_second": 10.311,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 3.125,
|
|
"grad_norm": 8.211258888244629,
|
|
"learning_rate": 4.976851851851852e-05,
|
|
"loss": 0.115,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 3.125,
|
|
"eval_loss": 0.17185473442077637,
|
|
"eval_runtime": 4.968,
|
|
"eval_samples_per_second": 100.644,
|
|
"eval_steps_per_second": 10.064,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 3.2291666666666665,
|
|
"eval_loss": 0.1926930844783783,
|
|
"eval_runtime": 3.0476,
|
|
"eval_samples_per_second": 164.064,
|
|
"eval_steps_per_second": 16.406,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 3.3333333333333335,
|
|
"eval_loss": 0.20631477236747742,
|
|
"eval_runtime": 3.0178,
|
|
"eval_samples_per_second": 165.682,
|
|
"eval_steps_per_second": 16.568,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 3.4375,
|
|
"eval_loss": 0.15469783544540405,
|
|
"eval_runtime": 3.0052,
|
|
"eval_samples_per_second": 166.38,
|
|
"eval_steps_per_second": 16.638,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 3.5416666666666665,
|
|
"eval_loss": 0.13334515690803528,
|
|
"eval_runtime": 3.0679,
|
|
"eval_samples_per_second": 162.979,
|
|
"eval_steps_per_second": 16.298,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 3.6458333333333335,
|
|
"grad_norm": 1.2755959033966064,
|
|
"learning_rate": 4.880401234567901e-05,
|
|
"loss": 0.0954,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 3.6458333333333335,
|
|
"eval_loss": 0.16306829452514648,
|
|
"eval_runtime": 3.1165,
|
|
"eval_samples_per_second": 160.436,
|
|
"eval_steps_per_second": 16.044,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 3.75,
|
|
"eval_loss": 0.211606964468956,
|
|
"eval_runtime": 2.9864,
|
|
"eval_samples_per_second": 167.428,
|
|
"eval_steps_per_second": 16.743,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 3.8541666666666665,
|
|
"eval_loss": 0.18188658356666565,
|
|
"eval_runtime": 2.984,
|
|
"eval_samples_per_second": 167.563,
|
|
"eval_steps_per_second": 16.756,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 3.9583333333333335,
|
|
"eval_loss": 0.1797247678041458,
|
|
"eval_runtime": 2.9814,
|
|
"eval_samples_per_second": 167.708,
|
|
"eval_steps_per_second": 16.771,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 4.0625,
|
|
"eval_loss": 0.16481846570968628,
|
|
"eval_runtime": 3.059,
|
|
"eval_samples_per_second": 163.454,
|
|
"eval_steps_per_second": 16.345,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 4.166666666666667,
|
|
"grad_norm": 7.385697364807129,
|
|
"learning_rate": 4.783950617283951e-05,
|
|
"loss": 0.0823,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 4.166666666666667,
|
|
"eval_loss": 0.20793281495571136,
|
|
"eval_runtime": 2.996,
|
|
"eval_samples_per_second": 166.89,
|
|
"eval_steps_per_second": 16.689,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 4.270833333333333,
|
|
"eval_loss": 0.1974990963935852,
|
|
"eval_runtime": 3.0328,
|
|
"eval_samples_per_second": 164.862,
|
|
"eval_steps_per_second": 16.486,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 4.375,
|
|
"eval_loss": 0.24861294031143188,
|
|
"eval_runtime": 2.9906,
|
|
"eval_samples_per_second": 167.189,
|
|
"eval_steps_per_second": 16.719,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 4.479166666666667,
|
|
"eval_loss": 0.1775345653295517,
|
|
"eval_runtime": 2.9925,
|
|
"eval_samples_per_second": 167.083,
|
|
"eval_steps_per_second": 16.708,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 4.583333333333333,
|
|
"eval_loss": 0.1704624593257904,
|
|
"eval_runtime": 3.0684,
|
|
"eval_samples_per_second": 162.95,
|
|
"eval_steps_per_second": 16.295,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 4.6875,
|
|
"grad_norm": 0.6013280749320984,
|
|
"learning_rate": 4.6875e-05,
|
|
"loss": 0.0549,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 4.6875,
|
|
"eval_loss": 0.22908172011375427,
|
|
"eval_runtime": 3.0692,
|
|
"eval_samples_per_second": 162.907,
|
|
"eval_steps_per_second": 16.291,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 4.791666666666667,
|
|
"eval_loss": 0.2069811075925827,
|
|
"eval_runtime": 2.9792,
|
|
"eval_samples_per_second": 167.83,
|
|
"eval_steps_per_second": 16.783,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 4.895833333333333,
|
|
"eval_loss": 0.20501156151294708,
|
|
"eval_runtime": 2.9759,
|
|
"eval_samples_per_second": 168.015,
|
|
"eval_steps_per_second": 16.801,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"eval_loss": 0.1834719032049179,
|
|
"eval_runtime": 2.9965,
|
|
"eval_samples_per_second": 166.864,
|
|
"eval_steps_per_second": 16.686,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 5.104166666666667,
|
|
"eval_loss": 0.1747453212738037,
|
|
"eval_runtime": 3.0464,
|
|
"eval_samples_per_second": 164.127,
|
|
"eval_steps_per_second": 16.413,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 5.208333333333333,
|
|
"grad_norm": 0.5279375910758972,
|
|
"learning_rate": 4.591049382716049e-05,
|
|
"loss": 0.0518,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 5.208333333333333,
|
|
"eval_loss": 0.21121186017990112,
|
|
"eval_runtime": 2.9779,
|
|
"eval_samples_per_second": 167.902,
|
|
"eval_steps_per_second": 16.79,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 5.3125,
|
|
"eval_loss": 0.20287540555000305,
|
|
"eval_runtime": 3.0068,
|
|
"eval_samples_per_second": 166.292,
|
|
"eval_steps_per_second": 16.629,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 5.416666666666667,
|
|
"eval_loss": 0.20998916029930115,
|
|
"eval_runtime": 3.0065,
|
|
"eval_samples_per_second": 166.307,
|
|
"eval_steps_per_second": 16.631,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 5.520833333333333,
|
|
"eval_loss": 0.2178017646074295,
|
|
"eval_runtime": 2.9901,
|
|
"eval_samples_per_second": 167.22,
|
|
"eval_steps_per_second": 16.722,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 5.625,
|
|
"eval_loss": 0.2072754055261612,
|
|
"eval_runtime": 3.0001,
|
|
"eval_samples_per_second": 166.662,
|
|
"eval_steps_per_second": 16.666,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 5.729166666666667,
|
|
"grad_norm": 1.3259690999984741,
|
|
"learning_rate": 4.494598765432099e-05,
|
|
"loss": 0.0363,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 5.729166666666667,
|
|
"eval_loss": 0.16645069420337677,
|
|
"eval_runtime": 3.0133,
|
|
"eval_samples_per_second": 165.933,
|
|
"eval_steps_per_second": 16.593,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 5.833333333333333,
|
|
"eval_loss": 0.18585015833377838,
|
|
"eval_runtime": 3.0016,
|
|
"eval_samples_per_second": 166.577,
|
|
"eval_steps_per_second": 16.658,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 5.9375,
|
|
"eval_loss": 0.24183067679405212,
|
|
"eval_runtime": 3.0665,
|
|
"eval_samples_per_second": 163.05,
|
|
"eval_steps_per_second": 16.305,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 6.041666666666667,
|
|
"eval_loss": 0.22054076194763184,
|
|
"eval_runtime": 2.9689,
|
|
"eval_samples_per_second": 168.413,
|
|
"eval_steps_per_second": 16.841,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 6.145833333333333,
|
|
"eval_loss": 0.17449523508548737,
|
|
"eval_runtime": 3.0168,
|
|
"eval_samples_per_second": 165.739,
|
|
"eval_steps_per_second": 16.574,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 6.25,
|
|
"grad_norm": 0.3047299087047577,
|
|
"learning_rate": 4.3981481481481486e-05,
|
|
"loss": 0.0305,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 6.25,
|
|
"eval_loss": 0.20409347116947174,
|
|
"eval_runtime": 2.9665,
|
|
"eval_samples_per_second": 168.549,
|
|
"eval_steps_per_second": 16.855,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 6.354166666666667,
|
|
"eval_loss": 0.23080451786518097,
|
|
"eval_runtime": 3.0955,
|
|
"eval_samples_per_second": 161.524,
|
|
"eval_steps_per_second": 16.152,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 6.458333333333333,
|
|
"eval_loss": 0.23451215028762817,
|
|
"eval_runtime": 2.9667,
|
|
"eval_samples_per_second": 168.535,
|
|
"eval_steps_per_second": 16.854,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 6.5625,
|
|
"eval_loss": 0.1807590276002884,
|
|
"eval_runtime": 3.0657,
|
|
"eval_samples_per_second": 163.093,
|
|
"eval_steps_per_second": 16.309,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 6.666666666666667,
|
|
"eval_loss": 0.21056562662124634,
|
|
"eval_runtime": 2.9809,
|
|
"eval_samples_per_second": 167.732,
|
|
"eval_steps_per_second": 16.773,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 6.770833333333333,
|
|
"grad_norm": 0.7045068144798279,
|
|
"learning_rate": 4.301697530864198e-05,
|
|
"loss": 0.0221,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 6.770833333333333,
|
|
"eval_loss": 0.2072342485189438,
|
|
"eval_runtime": 2.97,
|
|
"eval_samples_per_second": 168.353,
|
|
"eval_steps_per_second": 16.835,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 6.875,
|
|
"eval_loss": 0.2312939465045929,
|
|
"eval_runtime": 3.0018,
|
|
"eval_samples_per_second": 166.566,
|
|
"eval_steps_per_second": 16.657,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 6.979166666666667,
|
|
"eval_loss": 0.2178051918745041,
|
|
"eval_runtime": 3.0112,
|
|
"eval_samples_per_second": 166.045,
|
|
"eval_steps_per_second": 16.605,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 7.083333333333333,
|
|
"eval_loss": 0.1962529867887497,
|
|
"eval_runtime": 2.9373,
|
|
"eval_samples_per_second": 170.222,
|
|
"eval_steps_per_second": 17.022,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 7.1875,
|
|
"eval_loss": 0.23245805501937866,
|
|
"eval_runtime": 2.9287,
|
|
"eval_samples_per_second": 170.727,
|
|
"eval_steps_per_second": 17.073,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 7.291666666666667,
|
|
"grad_norm": 0.3293949365615845,
|
|
"learning_rate": 4.205246913580247e-05,
|
|
"loss": 0.0191,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 7.291666666666667,
|
|
"eval_loss": 0.20682944357395172,
|
|
"eval_runtime": 2.922,
|
|
"eval_samples_per_second": 171.118,
|
|
"eval_steps_per_second": 17.112,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 7.395833333333333,
|
|
"eval_loss": 0.2781427800655365,
|
|
"eval_runtime": 3.0086,
|
|
"eval_samples_per_second": 166.191,
|
|
"eval_steps_per_second": 16.619,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 7.5,
|
|
"eval_loss": 0.23378807306289673,
|
|
"eval_runtime": 2.928,
|
|
"eval_samples_per_second": 170.763,
|
|
"eval_steps_per_second": 17.076,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 7.604166666666667,
|
|
"eval_loss": 0.20770449936389923,
|
|
"eval_runtime": 2.9745,
|
|
"eval_samples_per_second": 168.097,
|
|
"eval_steps_per_second": 16.81,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 7.708333333333333,
|
|
"eval_loss": 0.19365566968917847,
|
|
"eval_runtime": 2.9194,
|
|
"eval_samples_per_second": 171.267,
|
|
"eval_steps_per_second": 17.127,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 7.8125,
|
|
"grad_norm": 0.15344583988189697,
|
|
"learning_rate": 4.1087962962962965e-05,
|
|
"loss": 0.0185,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 7.8125,
|
|
"eval_loss": 0.22151848673820496,
|
|
"eval_runtime": 2.901,
|
|
"eval_samples_per_second": 172.356,
|
|
"eval_steps_per_second": 17.236,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 7.916666666666667,
|
|
"eval_loss": 0.18537622690200806,
|
|
"eval_runtime": 2.9225,
|
|
"eval_samples_per_second": 171.086,
|
|
"eval_steps_per_second": 17.109,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 8.020833333333334,
|
|
"eval_loss": 0.21262435615062714,
|
|
"eval_runtime": 2.9556,
|
|
"eval_samples_per_second": 169.168,
|
|
"eval_steps_per_second": 16.917,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"epoch": 8.125,
|
|
"eval_loss": 0.24334414303302765,
|
|
"eval_runtime": 2.9324,
|
|
"eval_samples_per_second": 170.51,
|
|
"eval_steps_per_second": 17.051,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"epoch": 8.229166666666666,
|
|
"eval_loss": 0.2494126558303833,
|
|
"eval_runtime": 2.9435,
|
|
"eval_samples_per_second": 169.864,
|
|
"eval_steps_per_second": 16.986,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"epoch": 8.333333333333334,
|
|
"grad_norm": 0.12840139865875244,
|
|
"learning_rate": 4.012345679012346e-05,
|
|
"loss": 0.0152,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 8.333333333333334,
|
|
"eval_loss": 0.18908418715000153,
|
|
"eval_runtime": 2.9931,
|
|
"eval_samples_per_second": 167.052,
|
|
"eval_steps_per_second": 16.705,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 8.4375,
|
|
"eval_loss": 0.21541763842105865,
|
|
"eval_runtime": 2.9627,
|
|
"eval_samples_per_second": 168.765,
|
|
"eval_steps_per_second": 16.876,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 8.541666666666666,
|
|
"eval_loss": 0.1759854555130005,
|
|
"eval_runtime": 2.9612,
|
|
"eval_samples_per_second": 168.849,
|
|
"eval_steps_per_second": 16.885,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"epoch": 8.645833333333334,
|
|
"eval_loss": 0.19276337325572968,
|
|
"eval_runtime": 3.0833,
|
|
"eval_samples_per_second": 162.166,
|
|
"eval_steps_per_second": 16.217,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"epoch": 8.75,
|
|
"eval_loss": 0.2762078642845154,
|
|
"eval_runtime": 2.9688,
|
|
"eval_samples_per_second": 168.42,
|
|
"eval_steps_per_second": 16.842,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"epoch": 8.854166666666666,
|
|
"grad_norm": 0.2024412602186203,
|
|
"learning_rate": 3.915895061728395e-05,
|
|
"loss": 0.0131,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 8.854166666666666,
|
|
"eval_loss": 0.2193083018064499,
|
|
"eval_runtime": 3.0051,
|
|
"eval_samples_per_second": 166.385,
|
|
"eval_steps_per_second": 16.639,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 8.958333333333334,
|
|
"eval_loss": 0.2521556615829468,
|
|
"eval_runtime": 2.9901,
|
|
"eval_samples_per_second": 167.221,
|
|
"eval_steps_per_second": 16.722,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"epoch": 9.0625,
|
|
"eval_loss": 0.25700661540031433,
|
|
"eval_runtime": 2.9726,
|
|
"eval_samples_per_second": 168.202,
|
|
"eval_steps_per_second": 16.82,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"epoch": 9.166666666666666,
|
|
"eval_loss": 0.2857784628868103,
|
|
"eval_runtime": 3.0972,
|
|
"eval_samples_per_second": 161.436,
|
|
"eval_steps_per_second": 16.144,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"epoch": 9.270833333333334,
|
|
"eval_loss": 0.24449588358402252,
|
|
"eval_runtime": 2.9884,
|
|
"eval_samples_per_second": 167.312,
|
|
"eval_steps_per_second": 16.731,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"epoch": 9.375,
|
|
"grad_norm": 0.406023770570755,
|
|
"learning_rate": 3.8194444444444444e-05,
|
|
"loss": 0.0109,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 9.375,
|
|
"eval_loss": 0.2514427900314331,
|
|
"eval_runtime": 2.9485,
|
|
"eval_samples_per_second": 169.58,
|
|
"eval_steps_per_second": 16.958,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 9.479166666666666,
|
|
"eval_loss": 0.2540574371814728,
|
|
"eval_runtime": 2.9427,
|
|
"eval_samples_per_second": 169.914,
|
|
"eval_steps_per_second": 16.991,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"epoch": 9.583333333333334,
|
|
"eval_loss": 0.24942024052143097,
|
|
"eval_runtime": 2.8732,
|
|
"eval_samples_per_second": 174.023,
|
|
"eval_steps_per_second": 17.402,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"epoch": 9.6875,
|
|
"eval_loss": 0.25903967022895813,
|
|
"eval_runtime": 2.8416,
|
|
"eval_samples_per_second": 175.956,
|
|
"eval_steps_per_second": 17.596,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"epoch": 9.791666666666666,
|
|
"eval_loss": 0.21632665395736694,
|
|
"eval_runtime": 2.787,
|
|
"eval_samples_per_second": 179.408,
|
|
"eval_steps_per_second": 17.941,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"epoch": 9.895833333333334,
|
|
"grad_norm": 0.3254503309726715,
|
|
"learning_rate": 3.722993827160494e-05,
|
|
"loss": 0.0117,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 9.895833333333334,
|
|
"eval_loss": 0.22778008878231049,
|
|
"eval_runtime": 2.7626,
|
|
"eval_samples_per_second": 180.991,
|
|
"eval_steps_per_second": 18.099,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"eval_loss": 0.25083282589912415,
|
|
"eval_runtime": 2.6929,
|
|
"eval_samples_per_second": 185.676,
|
|
"eval_steps_per_second": 18.568,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"epoch": 10.104166666666666,
|
|
"eval_loss": 0.27844151854515076,
|
|
"eval_runtime": 2.6854,
|
|
"eval_samples_per_second": 186.195,
|
|
"eval_steps_per_second": 18.62,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"epoch": 10.208333333333334,
|
|
"eval_loss": 0.2655443549156189,
|
|
"eval_runtime": 2.6574,
|
|
"eval_samples_per_second": 188.155,
|
|
"eval_steps_per_second": 18.815,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"epoch": 10.3125,
|
|
"eval_loss": 0.27213552594184875,
|
|
"eval_runtime": 2.7641,
|
|
"eval_samples_per_second": 180.888,
|
|
"eval_steps_per_second": 18.089,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"epoch": 10.416666666666666,
|
|
"grad_norm": 0.01915908418595791,
|
|
"learning_rate": 3.626543209876543e-05,
|
|
"loss": 0.01,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 10.416666666666666,
|
|
"eval_loss": 0.2086724191904068,
|
|
"eval_runtime": 2.6276,
|
|
"eval_samples_per_second": 190.287,
|
|
"eval_steps_per_second": 19.029,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 10.520833333333334,
|
|
"eval_loss": 0.21828189492225647,
|
|
"eval_runtime": 2.7043,
|
|
"eval_samples_per_second": 184.893,
|
|
"eval_steps_per_second": 18.489,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"epoch": 10.625,
|
|
"eval_loss": 0.17361362278461456,
|
|
"eval_runtime": 2.6359,
|
|
"eval_samples_per_second": 189.686,
|
|
"eval_steps_per_second": 18.969,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"epoch": 10.729166666666666,
|
|
"eval_loss": 0.23900029063224792,
|
|
"eval_runtime": 2.6622,
|
|
"eval_samples_per_second": 187.815,
|
|
"eval_steps_per_second": 18.781,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"epoch": 10.833333333333334,
|
|
"eval_loss": 0.21116891503334045,
|
|
"eval_runtime": 2.7512,
|
|
"eval_samples_per_second": 181.737,
|
|
"eval_steps_per_second": 18.174,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"epoch": 10.9375,
|
|
"grad_norm": 0.15779471397399902,
|
|
"learning_rate": 3.530092592592593e-05,
|
|
"loss": 0.0079,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 10.9375,
|
|
"eval_loss": 0.22312691807746887,
|
|
"eval_runtime": 2.6146,
|
|
"eval_samples_per_second": 191.236,
|
|
"eval_steps_per_second": 19.124,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 11.041666666666666,
|
|
"eval_loss": 0.19351090490818024,
|
|
"eval_runtime": 2.6666,
|
|
"eval_samples_per_second": 187.503,
|
|
"eval_steps_per_second": 18.75,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"epoch": 11.145833333333334,
|
|
"eval_loss": 0.22429315745830536,
|
|
"eval_runtime": 2.7037,
|
|
"eval_samples_per_second": 184.933,
|
|
"eval_steps_per_second": 18.493,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"epoch": 11.25,
|
|
"eval_loss": 0.2761378884315491,
|
|
"eval_runtime": 2.6376,
|
|
"eval_samples_per_second": 189.563,
|
|
"eval_steps_per_second": 18.956,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"epoch": 11.354166666666666,
|
|
"eval_loss": 0.21286267042160034,
|
|
"eval_runtime": 2.7376,
|
|
"eval_samples_per_second": 182.64,
|
|
"eval_steps_per_second": 18.264,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"epoch": 11.458333333333334,
|
|
"grad_norm": 28.01507568359375,
|
|
"learning_rate": 3.4336419753086427e-05,
|
|
"loss": 0.0076,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 11.458333333333334,
|
|
"eval_loss": 0.16870196163654327,
|
|
"eval_runtime": 2.6961,
|
|
"eval_samples_per_second": 185.451,
|
|
"eval_steps_per_second": 18.545,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 11.5625,
|
|
"eval_loss": 0.24428625404834747,
|
|
"eval_runtime": 2.6808,
|
|
"eval_samples_per_second": 186.514,
|
|
"eval_steps_per_second": 18.651,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"epoch": 11.666666666666666,
|
|
"eval_loss": 0.24231122434139252,
|
|
"eval_runtime": 2.6904,
|
|
"eval_samples_per_second": 185.843,
|
|
"eval_steps_per_second": 18.584,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"epoch": 11.770833333333334,
|
|
"eval_loss": 0.29215526580810547,
|
|
"eval_runtime": 2.6528,
|
|
"eval_samples_per_second": 188.481,
|
|
"eval_steps_per_second": 18.848,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"epoch": 11.875,
|
|
"eval_loss": 0.22481079399585724,
|
|
"eval_runtime": 2.6638,
|
|
"eval_samples_per_second": 187.703,
|
|
"eval_steps_per_second": 18.77,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"epoch": 11.979166666666666,
|
|
"grad_norm": 0.09294537454843521,
|
|
"learning_rate": 3.337191358024692e-05,
|
|
"loss": 0.0074,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 11.979166666666666,
|
|
"eval_loss": 0.21293191611766815,
|
|
"eval_runtime": 2.706,
|
|
"eval_samples_per_second": 184.775,
|
|
"eval_steps_per_second": 18.478,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 12.083333333333334,
|
|
"eval_loss": 0.23273849487304688,
|
|
"eval_runtime": 2.6588,
|
|
"eval_samples_per_second": 188.053,
|
|
"eval_steps_per_second": 18.805,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"epoch": 12.1875,
|
|
"eval_loss": 0.23750942945480347,
|
|
"eval_runtime": 2.6738,
|
|
"eval_samples_per_second": 186.999,
|
|
"eval_steps_per_second": 18.7,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"epoch": 12.291666666666666,
|
|
"eval_loss": 0.26990386843681335,
|
|
"eval_runtime": 2.6509,
|
|
"eval_samples_per_second": 188.618,
|
|
"eval_steps_per_second": 18.862,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"epoch": 12.395833333333334,
|
|
"eval_loss": 0.24741919338703156,
|
|
"eval_runtime": 2.6554,
|
|
"eval_samples_per_second": 188.299,
|
|
"eval_steps_per_second": 18.83,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"epoch": 12.5,
|
|
"grad_norm": 1.0478854179382324,
|
|
"learning_rate": 3.240740740740741e-05,
|
|
"loss": 0.0056,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 12.5,
|
|
"eval_loss": 0.22709093987941742,
|
|
"eval_runtime": 2.7368,
|
|
"eval_samples_per_second": 182.693,
|
|
"eval_steps_per_second": 18.269,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 12.604166666666666,
|
|
"eval_loss": 0.2405824214220047,
|
|
"eval_runtime": 2.6757,
|
|
"eval_samples_per_second": 186.87,
|
|
"eval_steps_per_second": 18.687,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"epoch": 12.708333333333334,
|
|
"eval_loss": 0.28877463936805725,
|
|
"eval_runtime": 2.671,
|
|
"eval_samples_per_second": 187.194,
|
|
"eval_steps_per_second": 18.719,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"epoch": 12.8125,
|
|
"eval_loss": 0.3177616596221924,
|
|
"eval_runtime": 2.6749,
|
|
"eval_samples_per_second": 186.922,
|
|
"eval_steps_per_second": 18.692,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"epoch": 12.916666666666666,
|
|
"eval_loss": 0.2896740734577179,
|
|
"eval_runtime": 2.6676,
|
|
"eval_samples_per_second": 187.434,
|
|
"eval_steps_per_second": 18.743,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"epoch": 13.020833333333334,
|
|
"grad_norm": 0.09383181482553482,
|
|
"learning_rate": 3.14429012345679e-05,
|
|
"loss": 0.0068,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 13.020833333333334,
|
|
"eval_loss": 0.2690945267677307,
|
|
"eval_runtime": 2.6854,
|
|
"eval_samples_per_second": 186.195,
|
|
"eval_steps_per_second": 18.619,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 13.125,
|
|
"eval_loss": 0.28043872117996216,
|
|
"eval_runtime": 2.7005,
|
|
"eval_samples_per_second": 185.153,
|
|
"eval_steps_per_second": 18.515,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"epoch": 13.229166666666666,
|
|
"eval_loss": 0.2700558006763458,
|
|
"eval_runtime": 2.664,
|
|
"eval_samples_per_second": 187.69,
|
|
"eval_steps_per_second": 18.769,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"epoch": 13.333333333333334,
|
|
"eval_loss": 0.2863883078098297,
|
|
"eval_runtime": 2.6459,
|
|
"eval_samples_per_second": 188.974,
|
|
"eval_steps_per_second": 18.897,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"epoch": 13.4375,
|
|
"eval_loss": 0.31038790941238403,
|
|
"eval_runtime": 2.6638,
|
|
"eval_samples_per_second": 187.7,
|
|
"eval_steps_per_second": 18.77,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"epoch": 13.541666666666666,
|
|
"grad_norm": 0.030933663249015808,
|
|
"learning_rate": 3.04783950617284e-05,
|
|
"loss": 0.0053,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"epoch": 13.541666666666666,
|
|
"eval_loss": 0.2956462502479553,
|
|
"eval_runtime": 2.7335,
|
|
"eval_samples_per_second": 182.914,
|
|
"eval_steps_per_second": 18.291,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"epoch": 13.645833333333334,
|
|
"eval_loss": 0.24711303412914276,
|
|
"eval_runtime": 2.6773,
|
|
"eval_samples_per_second": 186.754,
|
|
"eval_steps_per_second": 18.675,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"epoch": 13.75,
|
|
"eval_loss": 0.2535928785800934,
|
|
"eval_runtime": 2.6697,
|
|
"eval_samples_per_second": 187.286,
|
|
"eval_steps_per_second": 18.729,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"epoch": 13.854166666666666,
|
|
"eval_loss": 0.23705850541591644,
|
|
"eval_runtime": 2.7698,
|
|
"eval_samples_per_second": 180.518,
|
|
"eval_steps_per_second": 18.052,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"epoch": 13.958333333333334,
|
|
"eval_loss": 0.23593567311763763,
|
|
"eval_runtime": 2.6612,
|
|
"eval_samples_per_second": 187.888,
|
|
"eval_steps_per_second": 18.789,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"epoch": 14.0625,
|
|
"grad_norm": 0.031106941401958466,
|
|
"learning_rate": 2.951388888888889e-05,
|
|
"loss": 0.0049,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"epoch": 14.0625,
|
|
"eval_loss": 0.2466021180152893,
|
|
"eval_runtime": 2.6433,
|
|
"eval_samples_per_second": 189.159,
|
|
"eval_steps_per_second": 18.916,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"epoch": 14.166666666666666,
|
|
"eval_loss": 0.27085715532302856,
|
|
"eval_runtime": 2.6665,
|
|
"eval_samples_per_second": 187.513,
|
|
"eval_steps_per_second": 18.751,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"epoch": 14.270833333333334,
|
|
"eval_loss": 0.23776446282863617,
|
|
"eval_runtime": 2.6623,
|
|
"eval_samples_per_second": 187.806,
|
|
"eval_steps_per_second": 18.781,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"epoch": 14.375,
|
|
"eval_loss": 0.22552721202373505,
|
|
"eval_runtime": 2.696,
|
|
"eval_samples_per_second": 185.463,
|
|
"eval_steps_per_second": 18.546,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"epoch": 14.479166666666666,
|
|
"eval_loss": 0.25723254680633545,
|
|
"eval_runtime": 2.759,
|
|
"eval_samples_per_second": 181.228,
|
|
"eval_steps_per_second": 18.123,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"epoch": 14.583333333333334,
|
|
"grad_norm": 0.014980652369558811,
|
|
"learning_rate": 2.8549382716049384e-05,
|
|
"loss": 0.0034,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"epoch": 14.583333333333334,
|
|
"eval_loss": 0.24281509220600128,
|
|
"eval_runtime": 2.6514,
|
|
"eval_samples_per_second": 188.58,
|
|
"eval_steps_per_second": 18.858,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"epoch": 14.6875,
|
|
"eval_loss": 0.28589245676994324,
|
|
"eval_runtime": 2.6758,
|
|
"eval_samples_per_second": 186.862,
|
|
"eval_steps_per_second": 18.686,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"epoch": 14.791666666666666,
|
|
"eval_loss": 0.23985494673252106,
|
|
"eval_runtime": 2.6688,
|
|
"eval_samples_per_second": 187.353,
|
|
"eval_steps_per_second": 18.735,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"epoch": 14.895833333333334,
|
|
"eval_loss": 0.2654513716697693,
|
|
"eval_runtime": 2.7853,
|
|
"eval_samples_per_second": 179.514,
|
|
"eval_steps_per_second": 17.951,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"epoch": 15.0,
|
|
"eval_loss": 0.24361200630664825,
|
|
"eval_runtime": 2.6858,
|
|
"eval_samples_per_second": 186.162,
|
|
"eval_steps_per_second": 18.616,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"epoch": 15.104166666666666,
|
|
"grad_norm": 0.06275557726621628,
|
|
"learning_rate": 2.7584876543209875e-05,
|
|
"loss": 0.0025,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"epoch": 15.104166666666666,
|
|
"eval_loss": 0.22555358707904816,
|
|
"eval_runtime": 2.6629,
|
|
"eval_samples_per_second": 187.767,
|
|
"eval_steps_per_second": 18.777,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"epoch": 15.208333333333334,
|
|
"eval_loss": 0.2367800772190094,
|
|
"eval_runtime": 2.6382,
|
|
"eval_samples_per_second": 189.52,
|
|
"eval_steps_per_second": 18.952,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"epoch": 15.3125,
|
|
"eval_loss": 0.2452574521303177,
|
|
"eval_runtime": 2.7269,
|
|
"eval_samples_per_second": 183.358,
|
|
"eval_steps_per_second": 18.336,
|
|
"step": 14700
|
|
},
|
|
{
|
|
"epoch": 15.416666666666666,
|
|
"eval_loss": 0.23891642689704895,
|
|
"eval_runtime": 2.7402,
|
|
"eval_samples_per_second": 182.469,
|
|
"eval_steps_per_second": 18.247,
|
|
"step": 14800
|
|
},
|
|
{
|
|
"epoch": 15.520833333333334,
|
|
"eval_loss": 0.25828251242637634,
|
|
"eval_runtime": 2.6596,
|
|
"eval_samples_per_second": 187.998,
|
|
"eval_steps_per_second": 18.8,
|
|
"step": 14900
|
|
},
|
|
{
|
|
"epoch": 15.625,
|
|
"grad_norm": 0.019014783203601837,
|
|
"learning_rate": 2.6620370370370372e-05,
|
|
"loss": 0.0028,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 15.625,
|
|
"eval_loss": 0.21553362905979156,
|
|
"eval_runtime": 2.685,
|
|
"eval_samples_per_second": 186.223,
|
|
"eval_steps_per_second": 18.622,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 15.729166666666666,
|
|
"eval_loss": 0.1873685121536255,
|
|
"eval_runtime": 2.679,
|
|
"eval_samples_per_second": 186.639,
|
|
"eval_steps_per_second": 18.664,
|
|
"step": 15100
|
|
},
|
|
{
|
|
"epoch": 15.833333333333334,
|
|
"eval_loss": 0.23436906933784485,
|
|
"eval_runtime": 2.6734,
|
|
"eval_samples_per_second": 187.027,
|
|
"eval_steps_per_second": 18.703,
|
|
"step": 15200
|
|
},
|
|
{
|
|
"epoch": 15.9375,
|
|
"eval_loss": 0.21372827887535095,
|
|
"eval_runtime": 2.6738,
|
|
"eval_samples_per_second": 187.0,
|
|
"eval_steps_per_second": 18.7,
|
|
"step": 15300
|
|
},
|
|
{
|
|
"epoch": 16.041666666666668,
|
|
"eval_loss": 0.28128960728645325,
|
|
"eval_runtime": 2.721,
|
|
"eval_samples_per_second": 183.754,
|
|
"eval_steps_per_second": 18.375,
|
|
"step": 15400
|
|
},
|
|
{
|
|
"epoch": 16.145833333333332,
|
|
"grad_norm": 0.001963632879778743,
|
|
"learning_rate": 2.5655864197530866e-05,
|
|
"loss": 0.0038,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"epoch": 16.145833333333332,
|
|
"eval_loss": 0.2411942183971405,
|
|
"eval_runtime": 2.6679,
|
|
"eval_samples_per_second": 187.412,
|
|
"eval_steps_per_second": 18.741,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"epoch": 16.25,
|
|
"eval_loss": 0.27309754490852356,
|
|
"eval_runtime": 2.6866,
|
|
"eval_samples_per_second": 186.109,
|
|
"eval_steps_per_second": 18.611,
|
|
"step": 15600
|
|
},
|
|
{
|
|
"epoch": 16.354166666666668,
|
|
"eval_loss": 0.26635468006134033,
|
|
"eval_runtime": 2.6688,
|
|
"eval_samples_per_second": 187.351,
|
|
"eval_steps_per_second": 18.735,
|
|
"step": 15700
|
|
},
|
|
{
|
|
"epoch": 16.458333333333332,
|
|
"eval_loss": 0.2760668694972992,
|
|
"eval_runtime": 2.7331,
|
|
"eval_samples_per_second": 182.941,
|
|
"eval_steps_per_second": 18.294,
|
|
"step": 15800
|
|
},
|
|
{
|
|
"epoch": 16.5625,
|
|
"eval_loss": 0.2560645341873169,
|
|
"eval_runtime": 2.662,
|
|
"eval_samples_per_second": 187.83,
|
|
"eval_steps_per_second": 18.783,
|
|
"step": 15900
|
|
},
|
|
{
|
|
"epoch": 16.666666666666668,
|
|
"grad_norm": 0.04559716209769249,
|
|
"learning_rate": 2.4691358024691357e-05,
|
|
"loss": 0.0024,
|
|
"step": 16000
|
|
},
|
|
{
|
|
"epoch": 16.666666666666668,
|
|
"eval_loss": 0.31702032685279846,
|
|
"eval_runtime": 2.6834,
|
|
"eval_samples_per_second": 186.33,
|
|
"eval_steps_per_second": 18.633,
|
|
"step": 16000
|
|
},
|
|
{
|
|
"epoch": 16.770833333333332,
|
|
"eval_loss": 0.23096273839473724,
|
|
"eval_runtime": 2.643,
|
|
"eval_samples_per_second": 189.182,
|
|
"eval_steps_per_second": 18.918,
|
|
"step": 16100
|
|
},
|
|
{
|
|
"epoch": 16.875,
|
|
"eval_loss": 0.2803973853588104,
|
|
"eval_runtime": 2.6697,
|
|
"eval_samples_per_second": 187.288,
|
|
"eval_steps_per_second": 18.729,
|
|
"step": 16200
|
|
},
|
|
{
|
|
"epoch": 16.979166666666668,
|
|
"eval_loss": 0.2678157091140747,
|
|
"eval_runtime": 2.7817,
|
|
"eval_samples_per_second": 179.745,
|
|
"eval_steps_per_second": 17.975,
|
|
"step": 16300
|
|
},
|
|
{
|
|
"epoch": 17.083333333333332,
|
|
"eval_loss": 0.27891919016838074,
|
|
"eval_runtime": 2.8206,
|
|
"eval_samples_per_second": 177.27,
|
|
"eval_steps_per_second": 17.727,
|
|
"step": 16400
|
|
},
|
|
{
|
|
"epoch": 17.1875,
|
|
"grad_norm": 0.025386014953255653,
|
|
"learning_rate": 2.3726851851851854e-05,
|
|
"loss": 0.0021,
|
|
"step": 16500
|
|
},
|
|
{
|
|
"epoch": 17.1875,
|
|
"eval_loss": 0.2423981875181198,
|
|
"eval_runtime": 2.6721,
|
|
"eval_samples_per_second": 187.12,
|
|
"eval_steps_per_second": 18.712,
|
|
"step": 16500
|
|
},
|
|
{
|
|
"epoch": 17.291666666666668,
|
|
"eval_loss": 0.26049262285232544,
|
|
"eval_runtime": 2.7078,
|
|
"eval_samples_per_second": 184.654,
|
|
"eval_steps_per_second": 18.465,
|
|
"step": 16600
|
|
},
|
|
{
|
|
"epoch": 17.395833333333332,
|
|
"eval_loss": 0.30610647797584534,
|
|
"eval_runtime": 2.6975,
|
|
"eval_samples_per_second": 185.357,
|
|
"eval_steps_per_second": 18.536,
|
|
"step": 16700
|
|
},
|
|
{
|
|
"epoch": 17.5,
|
|
"eval_loss": 0.26282158493995667,
|
|
"eval_runtime": 2.7046,
|
|
"eval_samples_per_second": 184.872,
|
|
"eval_steps_per_second": 18.487,
|
|
"step": 16800
|
|
},
|
|
{
|
|
"epoch": 17.604166666666668,
|
|
"eval_loss": 0.24968083202838898,
|
|
"eval_runtime": 2.6698,
|
|
"eval_samples_per_second": 187.28,
|
|
"eval_steps_per_second": 18.728,
|
|
"step": 16900
|
|
},
|
|
{
|
|
"epoch": 17.708333333333332,
|
|
"grad_norm": 0.038250915706157684,
|
|
"learning_rate": 2.2762345679012348e-05,
|
|
"loss": 0.0033,
|
|
"step": 17000
|
|
},
|
|
{
|
|
"epoch": 17.708333333333332,
|
|
"eval_loss": 0.27164506912231445,
|
|
"eval_runtime": 2.6512,
|
|
"eval_samples_per_second": 188.591,
|
|
"eval_steps_per_second": 18.859,
|
|
"step": 17000
|
|
},
|
|
{
|
|
"epoch": 17.8125,
|
|
"eval_loss": 0.26982831954956055,
|
|
"eval_runtime": 2.7234,
|
|
"eval_samples_per_second": 183.593,
|
|
"eval_steps_per_second": 18.359,
|
|
"step": 17100
|
|
},
|
|
{
|
|
"epoch": 17.916666666666668,
|
|
"eval_loss": 0.2670990824699402,
|
|
"eval_runtime": 2.7233,
|
|
"eval_samples_per_second": 183.601,
|
|
"eval_steps_per_second": 18.36,
|
|
"step": 17200
|
|
},
|
|
{
|
|
"epoch": 18.020833333333332,
|
|
"eval_loss": 0.24990878999233246,
|
|
"eval_runtime": 2.6806,
|
|
"eval_samples_per_second": 186.525,
|
|
"eval_steps_per_second": 18.653,
|
|
"step": 17300
|
|
},
|
|
{
|
|
"epoch": 18.125,
|
|
"eval_loss": 0.2912808656692505,
|
|
"eval_runtime": 2.6688,
|
|
"eval_samples_per_second": 187.351,
|
|
"eval_steps_per_second": 18.735,
|
|
"step": 17400
|
|
},
|
|
{
|
|
"epoch": 18.229166666666668,
|
|
"grad_norm": 0.021504106000065804,
|
|
"learning_rate": 2.179783950617284e-05,
|
|
"loss": 0.003,
|
|
"step": 17500
|
|
},
|
|
{
|
|
"epoch": 18.229166666666668,
|
|
"eval_loss": 0.26864093542099,
|
|
"eval_runtime": 2.6597,
|
|
"eval_samples_per_second": 187.99,
|
|
"eval_steps_per_second": 18.799,
|
|
"step": 17500
|
|
},
|
|
{
|
|
"epoch": 18.333333333333332,
|
|
"eval_loss": 0.2561093866825104,
|
|
"eval_runtime": 2.7405,
|
|
"eval_samples_per_second": 182.449,
|
|
"eval_steps_per_second": 18.245,
|
|
"step": 17600
|
|
},
|
|
{
|
|
"epoch": 18.4375,
|
|
"eval_loss": 0.2752689719200134,
|
|
"eval_runtime": 2.8003,
|
|
"eval_samples_per_second": 178.554,
|
|
"eval_steps_per_second": 17.855,
|
|
"step": 17700
|
|
},
|
|
{
|
|
"epoch": 18.541666666666668,
|
|
"eval_loss": 0.25926899909973145,
|
|
"eval_runtime": 2.6884,
|
|
"eval_samples_per_second": 185.987,
|
|
"eval_steps_per_second": 18.599,
|
|
"step": 17800
|
|
},
|
|
{
|
|
"epoch": 18.645833333333332,
|
|
"eval_loss": 0.26980525255203247,
|
|
"eval_runtime": 2.7348,
|
|
"eval_samples_per_second": 182.831,
|
|
"eval_steps_per_second": 18.283,
|
|
"step": 17900
|
|
},
|
|
{
|
|
"epoch": 18.75,
|
|
"grad_norm": 0.17659549415111542,
|
|
"learning_rate": 2.0833333333333336e-05,
|
|
"loss": 0.0023,
|
|
"step": 18000
|
|
},
|
|
{
|
|
"epoch": 18.75,
|
|
"eval_loss": 0.22183029353618622,
|
|
"eval_runtime": 2.7514,
|
|
"eval_samples_per_second": 181.729,
|
|
"eval_steps_per_second": 18.173,
|
|
"step": 18000
|
|
},
|
|
{
|
|
"epoch": 18.854166666666668,
|
|
"eval_loss": 0.245712548494339,
|
|
"eval_runtime": 2.6909,
|
|
"eval_samples_per_second": 185.812,
|
|
"eval_steps_per_second": 18.581,
|
|
"step": 18100
|
|
},
|
|
{
|
|
"epoch": 18.958333333333332,
|
|
"eval_loss": 0.23490136861801147,
|
|
"eval_runtime": 2.6739,
|
|
"eval_samples_per_second": 186.991,
|
|
"eval_steps_per_second": 18.699,
|
|
"step": 18200
|
|
},
|
|
{
|
|
"epoch": 19.0625,
|
|
"eval_loss": 0.2799266278743744,
|
|
"eval_runtime": 2.6965,
|
|
"eval_samples_per_second": 185.424,
|
|
"eval_steps_per_second": 18.542,
|
|
"step": 18300
|
|
},
|
|
{
|
|
"epoch": 19.166666666666668,
|
|
"eval_loss": 0.2952316403388977,
|
|
"eval_runtime": 2.7321,
|
|
"eval_samples_per_second": 183.006,
|
|
"eval_steps_per_second": 18.301,
|
|
"step": 18400
|
|
},
|
|
{
|
|
"epoch": 19.270833333333332,
|
|
"grad_norm": 0.023679744452238083,
|
|
"learning_rate": 1.9868827160493827e-05,
|
|
"loss": 0.0018,
|
|
"step": 18500
|
|
},
|
|
{
|
|
"epoch": 19.270833333333332,
|
|
"eval_loss": 0.23330457508563995,
|
|
"eval_runtime": 2.6526,
|
|
"eval_samples_per_second": 188.496,
|
|
"eval_steps_per_second": 18.85,
|
|
"step": 18500
|
|
},
|
|
{
|
|
"epoch": 19.375,
|
|
"eval_loss": 0.26152676343917847,
|
|
"eval_runtime": 2.6547,
|
|
"eval_samples_per_second": 188.345,
|
|
"eval_steps_per_second": 18.834,
|
|
"step": 18600
|
|
},
|
|
{
|
|
"epoch": 19.479166666666668,
|
|
"eval_loss": 0.2680070996284485,
|
|
"eval_runtime": 2.6519,
|
|
"eval_samples_per_second": 188.545,
|
|
"eval_steps_per_second": 18.855,
|
|
"step": 18700
|
|
},
|
|
{
|
|
"epoch": 19.583333333333332,
|
|
"eval_loss": 0.219146266579628,
|
|
"eval_runtime": 2.6887,
|
|
"eval_samples_per_second": 185.964,
|
|
"eval_steps_per_second": 18.596,
|
|
"step": 18800
|
|
},
|
|
{
|
|
"epoch": 19.6875,
|
|
"eval_loss": 0.2633296251296997,
|
|
"eval_runtime": 2.716,
|
|
"eval_samples_per_second": 184.095,
|
|
"eval_steps_per_second": 18.41,
|
|
"step": 18900
|
|
},
|
|
{
|
|
"epoch": 19.791666666666668,
|
|
"grad_norm": 0.1822015345096588,
|
|
"learning_rate": 1.8904320987654324e-05,
|
|
"loss": 0.0019,
|
|
"step": 19000
|
|
},
|
|
{
|
|
"epoch": 19.791666666666668,
|
|
"eval_loss": 0.22770513594150543,
|
|
"eval_runtime": 2.6476,
|
|
"eval_samples_per_second": 188.849,
|
|
"eval_steps_per_second": 18.885,
|
|
"step": 19000
|
|
},
|
|
{
|
|
"epoch": 19.895833333333332,
|
|
"eval_loss": 0.2615948021411896,
|
|
"eval_runtime": 2.6985,
|
|
"eval_samples_per_second": 185.288,
|
|
"eval_steps_per_second": 18.529,
|
|
"step": 19100
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"eval_loss": 0.23876366019248962,
|
|
"eval_runtime": 2.6951,
|
|
"eval_samples_per_second": 185.519,
|
|
"eval_steps_per_second": 18.552,
|
|
"step": 19200
|
|
},
|
|
{
|
|
"epoch": 20.104166666666668,
|
|
"eval_loss": 0.22909638285636902,
|
|
"eval_runtime": 2.6935,
|
|
"eval_samples_per_second": 185.63,
|
|
"eval_steps_per_second": 18.563,
|
|
"step": 19300
|
|
},
|
|
{
|
|
"epoch": 20.208333333333332,
|
|
"eval_loss": 0.2323145866394043,
|
|
"eval_runtime": 2.7334,
|
|
"eval_samples_per_second": 182.92,
|
|
"eval_steps_per_second": 18.292,
|
|
"step": 19400
|
|
},
|
|
{
|
|
"epoch": 20.3125,
|
|
"grad_norm": 0.013318472541868687,
|
|
"learning_rate": 1.7939814814814815e-05,
|
|
"loss": 0.0013,
|
|
"step": 19500
|
|
},
|
|
{
|
|
"epoch": 20.3125,
|
|
"eval_loss": 0.2226209044456482,
|
|
"eval_runtime": 2.655,
|
|
"eval_samples_per_second": 188.322,
|
|
"eval_steps_per_second": 18.832,
|
|
"step": 19500
|
|
},
|
|
{
|
|
"epoch": 20.416666666666668,
|
|
"eval_loss": 0.24030183255672455,
|
|
"eval_runtime": 2.6564,
|
|
"eval_samples_per_second": 188.222,
|
|
"eval_steps_per_second": 18.822,
|
|
"step": 19600
|
|
},
|
|
{
|
|
"epoch": 20.520833333333332,
|
|
"eval_loss": 0.2417810559272766,
|
|
"eval_runtime": 2.7353,
|
|
"eval_samples_per_second": 182.794,
|
|
"eval_steps_per_second": 18.279,
|
|
"step": 19700
|
|
},
|
|
{
|
|
"epoch": 20.625,
|
|
"eval_loss": 0.2691485583782196,
|
|
"eval_runtime": 2.7924,
|
|
"eval_samples_per_second": 179.056,
|
|
"eval_steps_per_second": 17.906,
|
|
"step": 19800
|
|
},
|
|
{
|
|
"epoch": 20.729166666666668,
|
|
"eval_loss": 0.21901768445968628,
|
|
"eval_runtime": 2.6311,
|
|
"eval_samples_per_second": 190.038,
|
|
"eval_steps_per_second": 19.004,
|
|
"step": 19900
|
|
},
|
|
{
|
|
"epoch": 20.833333333333332,
|
|
"grad_norm": 0.024880312383174896,
|
|
"learning_rate": 1.697530864197531e-05,
|
|
"loss": 0.0014,
|
|
"step": 20000
|
|
},
|
|
{
|
|
"epoch": 20.833333333333332,
|
|
"eval_loss": 0.24805013835430145,
|
|
"eval_runtime": 2.6373,
|
|
"eval_samples_per_second": 189.588,
|
|
"eval_steps_per_second": 18.959,
|
|
"step": 20000
|
|
},
|
|
{
|
|
"epoch": 20.9375,
|
|
"eval_loss": 0.22968819737434387,
|
|
"eval_runtime": 2.6422,
|
|
"eval_samples_per_second": 189.237,
|
|
"eval_steps_per_second": 18.924,
|
|
"step": 20100
|
|
},
|
|
{
|
|
"epoch": 21.041666666666668,
|
|
"eval_loss": 0.2395009696483612,
|
|
"eval_runtime": 2.6643,
|
|
"eval_samples_per_second": 187.67,
|
|
"eval_steps_per_second": 18.767,
|
|
"step": 20200
|
|
},
|
|
{
|
|
"epoch": 21.145833333333332,
|
|
"eval_loss": 0.2686944603919983,
|
|
"eval_runtime": 2.7421,
|
|
"eval_samples_per_second": 182.345,
|
|
"eval_steps_per_second": 18.235,
|
|
"step": 20300
|
|
},
|
|
{
|
|
"epoch": 21.25,
|
|
"eval_loss": 0.2748400866985321,
|
|
"eval_runtime": 2.6544,
|
|
"eval_samples_per_second": 188.369,
|
|
"eval_steps_per_second": 18.837,
|
|
"step": 20400
|
|
},
|
|
{
|
|
"epoch": 21.354166666666668,
|
|
"grad_norm": 0.020624302327632904,
|
|
"learning_rate": 1.6010802469135803e-05,
|
|
"loss": 0.0013,
|
|
"step": 20500
|
|
},
|
|
{
|
|
"epoch": 21.354166666666668,
|
|
"eval_loss": 0.2482401579618454,
|
|
"eval_runtime": 2.6547,
|
|
"eval_samples_per_second": 188.344,
|
|
"eval_steps_per_second": 18.834,
|
|
"step": 20500
|
|
},
|
|
{
|
|
"epoch": 21.458333333333332,
|
|
"eval_loss": 0.2579880952835083,
|
|
"eval_runtime": 2.6892,
|
|
"eval_samples_per_second": 185.929,
|
|
"eval_steps_per_second": 18.593,
|
|
"step": 20600
|
|
},
|
|
{
|
|
"epoch": 21.5625,
|
|
"eval_loss": 0.2505187690258026,
|
|
"eval_runtime": 2.7076,
|
|
"eval_samples_per_second": 184.666,
|
|
"eval_steps_per_second": 18.467,
|
|
"step": 20700
|
|
},
|
|
{
|
|
"epoch": 21.666666666666668,
|
|
"eval_loss": 0.24572314321994781,
|
|
"eval_runtime": 2.6772,
|
|
"eval_samples_per_second": 186.761,
|
|
"eval_steps_per_second": 18.676,
|
|
"step": 20800
|
|
},
|
|
{
|
|
"epoch": 21.770833333333332,
|
|
"eval_loss": 0.2615715265274048,
|
|
"eval_runtime": 2.6506,
|
|
"eval_samples_per_second": 188.636,
|
|
"eval_steps_per_second": 18.864,
|
|
"step": 20900
|
|
},
|
|
{
|
|
"epoch": 21.875,
|
|
"grad_norm": 0.10809088498353958,
|
|
"learning_rate": 1.5046296296296297e-05,
|
|
"loss": 0.0012,
|
|
"step": 21000
|
|
},
|
|
{
|
|
"epoch": 21.875,
|
|
"eval_loss": 0.24613332748413086,
|
|
"eval_runtime": 2.664,
|
|
"eval_samples_per_second": 187.687,
|
|
"eval_steps_per_second": 18.769,
|
|
"step": 21000
|
|
},
|
|
{
|
|
"epoch": 21.979166666666668,
|
|
"eval_loss": 0.2297743260860443,
|
|
"eval_runtime": 2.8188,
|
|
"eval_samples_per_second": 177.378,
|
|
"eval_steps_per_second": 17.738,
|
|
"step": 21100
|
|
},
|
|
{
|
|
"epoch": 22.083333333333332,
|
|
"eval_loss": 0.20882727205753326,
|
|
"eval_runtime": 2.6539,
|
|
"eval_samples_per_second": 188.4,
|
|
"eval_steps_per_second": 18.84,
|
|
"step": 21200
|
|
},
|
|
{
|
|
"epoch": 22.1875,
|
|
"eval_loss": 0.24317896366119385,
|
|
"eval_runtime": 2.6727,
|
|
"eval_samples_per_second": 187.08,
|
|
"eval_steps_per_second": 18.708,
|
|
"step": 21300
|
|
},
|
|
{
|
|
"epoch": 22.291666666666668,
|
|
"eval_loss": 0.24090611934661865,
|
|
"eval_runtime": 2.6745,
|
|
"eval_samples_per_second": 186.949,
|
|
"eval_steps_per_second": 18.695,
|
|
"step": 21400
|
|
},
|
|
{
|
|
"epoch": 22.395833333333332,
|
|
"grad_norm": 0.013452223502099514,
|
|
"learning_rate": 1.4081790123456789e-05,
|
|
"loss": 0.0006,
|
|
"step": 21500
|
|
},
|
|
{
|
|
"epoch": 22.395833333333332,
|
|
"eval_loss": 0.23966069519519806,
|
|
"eval_runtime": 2.6467,
|
|
"eval_samples_per_second": 188.915,
|
|
"eval_steps_per_second": 18.892,
|
|
"step": 21500
|
|
},
|
|
{
|
|
"epoch": 22.5,
|
|
"eval_loss": 0.2442026436328888,
|
|
"eval_runtime": 2.752,
|
|
"eval_samples_per_second": 181.687,
|
|
"eval_steps_per_second": 18.169,
|
|
"step": 21600
|
|
},
|
|
{
|
|
"epoch": 22.604166666666668,
|
|
"eval_loss": 0.2610822916030884,
|
|
"eval_runtime": 2.7823,
|
|
"eval_samples_per_second": 179.709,
|
|
"eval_steps_per_second": 17.971,
|
|
"step": 21700
|
|
},
|
|
{
|
|
"epoch": 22.708333333333332,
|
|
"eval_loss": 0.2411614954471588,
|
|
"eval_runtime": 2.6894,
|
|
"eval_samples_per_second": 185.916,
|
|
"eval_steps_per_second": 18.592,
|
|
"step": 21800
|
|
},
|
|
{
|
|
"epoch": 22.8125,
|
|
"eval_loss": 0.28125372529029846,
|
|
"eval_runtime": 2.6995,
|
|
"eval_samples_per_second": 185.22,
|
|
"eval_steps_per_second": 18.522,
|
|
"step": 21900
|
|
},
|
|
{
|
|
"epoch": 22.916666666666668,
|
|
"grad_norm": 0.01698753982782364,
|
|
"learning_rate": 1.3117283950617285e-05,
|
|
"loss": 0.0016,
|
|
"step": 22000
|
|
},
|
|
{
|
|
"epoch": 22.916666666666668,
|
|
"eval_loss": 0.2897321879863739,
|
|
"eval_runtime": 2.6638,
|
|
"eval_samples_per_second": 187.699,
|
|
"eval_steps_per_second": 18.77,
|
|
"step": 22000
|
|
},
|
|
{
|
|
"epoch": 23.020833333333332,
|
|
"eval_loss": 0.2508152425289154,
|
|
"eval_runtime": 2.6526,
|
|
"eval_samples_per_second": 188.492,
|
|
"eval_steps_per_second": 18.849,
|
|
"step": 22100
|
|
},
|
|
{
|
|
"epoch": 23.125,
|
|
"eval_loss": 0.2747707664966583,
|
|
"eval_runtime": 2.7474,
|
|
"eval_samples_per_second": 181.988,
|
|
"eval_steps_per_second": 18.199,
|
|
"step": 22200
|
|
},
|
|
{
|
|
"epoch": 23.229166666666668,
|
|
"eval_loss": 0.24645893275737762,
|
|
"eval_runtime": 2.7117,
|
|
"eval_samples_per_second": 184.383,
|
|
"eval_steps_per_second": 18.438,
|
|
"step": 22300
|
|
},
|
|
{
|
|
"epoch": 23.333333333333332,
|
|
"eval_loss": 0.22453027963638306,
|
|
"eval_runtime": 2.6879,
|
|
"eval_samples_per_second": 186.022,
|
|
"eval_steps_per_second": 18.602,
|
|
"step": 22400
|
|
},
|
|
{
|
|
"epoch": 23.4375,
|
|
"grad_norm": 0.020492762327194214,
|
|
"learning_rate": 1.2152777777777779e-05,
|
|
"loss": 0.0012,
|
|
"step": 22500
|
|
},
|
|
{
|
|
"epoch": 23.4375,
|
|
"eval_loss": 0.2575179934501648,
|
|
"eval_runtime": 2.657,
|
|
"eval_samples_per_second": 188.185,
|
|
"eval_steps_per_second": 18.818,
|
|
"step": 22500
|
|
},
|
|
{
|
|
"epoch": 23.541666666666668,
|
|
"eval_loss": 0.2540989816188812,
|
|
"eval_runtime": 2.6727,
|
|
"eval_samples_per_second": 187.077,
|
|
"eval_steps_per_second": 18.708,
|
|
"step": 22600
|
|
},
|
|
{
|
|
"epoch": 23.645833333333332,
|
|
"eval_loss": 0.26418963074684143,
|
|
"eval_runtime": 2.6889,
|
|
"eval_samples_per_second": 185.951,
|
|
"eval_steps_per_second": 18.595,
|
|
"step": 22700
|
|
},
|
|
{
|
|
"epoch": 23.75,
|
|
"eval_loss": 0.27296414971351624,
|
|
"eval_runtime": 2.6695,
|
|
"eval_samples_per_second": 187.304,
|
|
"eval_steps_per_second": 18.73,
|
|
"step": 22800
|
|
},
|
|
{
|
|
"epoch": 23.854166666666668,
|
|
"eval_loss": 0.28103941679000854,
|
|
"eval_runtime": 2.6878,
|
|
"eval_samples_per_second": 186.024,
|
|
"eval_steps_per_second": 18.602,
|
|
"step": 22900
|
|
},
|
|
{
|
|
"epoch": 23.958333333333332,
|
|
"grad_norm": 0.019232362508773804,
|
|
"learning_rate": 1.1188271604938271e-05,
|
|
"loss": 0.0009,
|
|
"step": 23000
|
|
},
|
|
{
|
|
"epoch": 23.958333333333332,
|
|
"eval_loss": 0.29488101601600647,
|
|
"eval_runtime": 2.671,
|
|
"eval_samples_per_second": 187.195,
|
|
"eval_steps_per_second": 18.72,
|
|
"step": 23000
|
|
},
|
|
{
|
|
"epoch": 24.0625,
|
|
"eval_loss": 0.2643510103225708,
|
|
"eval_runtime": 2.6969,
|
|
"eval_samples_per_second": 185.398,
|
|
"eval_steps_per_second": 18.54,
|
|
"step": 23100
|
|
},
|
|
{
|
|
"epoch": 24.166666666666668,
|
|
"eval_loss": 0.28387248516082764,
|
|
"eval_runtime": 2.8223,
|
|
"eval_samples_per_second": 177.159,
|
|
"eval_steps_per_second": 17.716,
|
|
"step": 23200
|
|
},
|
|
{
|
|
"epoch": 24.270833333333332,
|
|
"eval_loss": 0.28446924686431885,
|
|
"eval_runtime": 2.79,
|
|
"eval_samples_per_second": 179.211,
|
|
"eval_steps_per_second": 17.921,
|
|
"step": 23300
|
|
},
|
|
{
|
|
"epoch": 24.375,
|
|
"eval_loss": 0.24371571838855743,
|
|
"eval_runtime": 2.6806,
|
|
"eval_samples_per_second": 186.527,
|
|
"eval_steps_per_second": 18.653,
|
|
"step": 23400
|
|
},
|
|
{
|
|
"epoch": 24.479166666666668,
|
|
"grad_norm": 0.09378495067358017,
|
|
"learning_rate": 1.0223765432098765e-05,
|
|
"loss": 0.0012,
|
|
"step": 23500
|
|
},
|
|
{
|
|
"epoch": 24.479166666666668,
|
|
"eval_loss": 0.2529699206352234,
|
|
"eval_runtime": 2.7479,
|
|
"eval_samples_per_second": 181.955,
|
|
"eval_steps_per_second": 18.196,
|
|
"step": 23500
|
|
},
|
|
{
|
|
"epoch": 24.583333333333332,
|
|
"eval_loss": 0.23549579083919525,
|
|
"eval_runtime": 2.7468,
|
|
"eval_samples_per_second": 182.03,
|
|
"eval_steps_per_second": 18.203,
|
|
"step": 23600
|
|
},
|
|
{
|
|
"epoch": 24.6875,
|
|
"eval_loss": 0.24692101776599884,
|
|
"eval_runtime": 2.687,
|
|
"eval_samples_per_second": 186.078,
|
|
"eval_steps_per_second": 18.608,
|
|
"step": 23700
|
|
},
|
|
{
|
|
"epoch": 24.791666666666668,
|
|
"eval_loss": 0.2594464123249054,
|
|
"eval_runtime": 2.7055,
|
|
"eval_samples_per_second": 184.81,
|
|
"eval_steps_per_second": 18.481,
|
|
"step": 23800
|
|
},
|
|
{
|
|
"epoch": 24.895833333333332,
|
|
"eval_loss": 0.2568516731262207,
|
|
"eval_runtime": 2.7026,
|
|
"eval_samples_per_second": 185.01,
|
|
"eval_steps_per_second": 18.501,
|
|
"step": 23900
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"grad_norm": 0.01909373700618744,
|
|
"learning_rate": 9.259259259259259e-06,
|
|
"loss": 0.0006,
|
|
"step": 24000
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"eval_loss": 0.23979052901268005,
|
|
"eval_runtime": 2.6347,
|
|
"eval_samples_per_second": 189.774,
|
|
"eval_steps_per_second": 18.977,
|
|
"step": 24000
|
|
},
|
|
{
|
|
"epoch": 25.104166666666668,
|
|
"eval_loss": 0.25141003727912903,
|
|
"eval_runtime": 2.6995,
|
|
"eval_samples_per_second": 185.219,
|
|
"eval_steps_per_second": 18.522,
|
|
"step": 24100
|
|
},
|
|
{
|
|
"epoch": 25.208333333333332,
|
|
"eval_loss": 0.2534993886947632,
|
|
"eval_runtime": 2.6878,
|
|
"eval_samples_per_second": 186.027,
|
|
"eval_steps_per_second": 18.603,
|
|
"step": 24200
|
|
},
|
|
{
|
|
"epoch": 25.3125,
|
|
"eval_loss": 0.2161099910736084,
|
|
"eval_runtime": 2.6906,
|
|
"eval_samples_per_second": 185.834,
|
|
"eval_steps_per_second": 18.583,
|
|
"step": 24300
|
|
},
|
|
{
|
|
"epoch": 25.416666666666668,
|
|
"eval_loss": 0.2284189760684967,
|
|
"eval_runtime": 2.6783,
|
|
"eval_samples_per_second": 186.688,
|
|
"eval_steps_per_second": 18.669,
|
|
"step": 24400
|
|
},
|
|
{
|
|
"epoch": 25.520833333333332,
|
|
"grad_norm": 0.01035739853978157,
|
|
"learning_rate": 8.294753086419753e-06,
|
|
"loss": 0.0005,
|
|
"step": 24500
|
|
},
|
|
{
|
|
"epoch": 25.520833333333332,
|
|
"eval_loss": 0.23142491281032562,
|
|
"eval_runtime": 2.7424,
|
|
"eval_samples_per_second": 182.32,
|
|
"eval_steps_per_second": 18.232,
|
|
"step": 24500
|
|
},
|
|
{
|
|
"epoch": 25.625,
|
|
"eval_loss": 0.22284680604934692,
|
|
"eval_runtime": 2.667,
|
|
"eval_samples_per_second": 187.477,
|
|
"eval_steps_per_second": 18.748,
|
|
"step": 24600
|
|
},
|
|
{
|
|
"epoch": 25.729166666666668,
|
|
"eval_loss": 0.23332533240318298,
|
|
"eval_runtime": 2.6695,
|
|
"eval_samples_per_second": 187.298,
|
|
"eval_steps_per_second": 18.73,
|
|
"step": 24700
|
|
},
|
|
{
|
|
"epoch": 25.833333333333332,
|
|
"eval_loss": 0.2309611290693283,
|
|
"eval_runtime": 2.7134,
|
|
"eval_samples_per_second": 184.273,
|
|
"eval_steps_per_second": 18.427,
|
|
"step": 24800
|
|
},
|
|
{
|
|
"epoch": 25.9375,
|
|
"eval_loss": 0.2237754762172699,
|
|
"eval_runtime": 2.6731,
|
|
"eval_samples_per_second": 187.045,
|
|
"eval_steps_per_second": 18.705,
|
|
"step": 24900
|
|
},
|
|
{
|
|
"epoch": 26.041666666666668,
|
|
"grad_norm": 0.023038456216454506,
|
|
"learning_rate": 7.330246913580248e-06,
|
|
"loss": 0.0015,
|
|
"step": 25000
|
|
},
|
|
{
|
|
"epoch": 26.041666666666668,
|
|
"eval_loss": 0.22988422214984894,
|
|
"eval_runtime": 2.6773,
|
|
"eval_samples_per_second": 186.754,
|
|
"eval_steps_per_second": 18.675,
|
|
"step": 25000
|
|
},
|
|
{
|
|
"epoch": 26.145833333333332,
|
|
"eval_loss": 0.2248789370059967,
|
|
"eval_runtime": 2.6901,
|
|
"eval_samples_per_second": 185.864,
|
|
"eval_steps_per_second": 18.586,
|
|
"step": 25100
|
|
},
|
|
{
|
|
"epoch": 26.25,
|
|
"eval_loss": 0.21588632464408875,
|
|
"eval_runtime": 2.7349,
|
|
"eval_samples_per_second": 182.823,
|
|
"eval_steps_per_second": 18.282,
|
|
"step": 25200
|
|
},
|
|
{
|
|
"epoch": 26.354166666666668,
|
|
"eval_loss": 0.22912240028381348,
|
|
"eval_runtime": 2.6991,
|
|
"eval_samples_per_second": 185.249,
|
|
"eval_steps_per_second": 18.525,
|
|
"step": 25300
|
|
},
|
|
{
|
|
"epoch": 26.458333333333332,
|
|
"eval_loss": 0.24369725584983826,
|
|
"eval_runtime": 2.6901,
|
|
"eval_samples_per_second": 185.865,
|
|
"eval_steps_per_second": 18.586,
|
|
"step": 25400
|
|
},
|
|
{
|
|
"epoch": 26.5625,
|
|
"grad_norm": 0.004963716492056847,
|
|
"learning_rate": 6.365740740740741e-06,
|
|
"loss": 0.0009,
|
|
"step": 25500
|
|
},
|
|
{
|
|
"epoch": 26.5625,
|
|
"eval_loss": 0.24551524221897125,
|
|
"eval_runtime": 2.6597,
|
|
"eval_samples_per_second": 187.991,
|
|
"eval_steps_per_second": 18.799,
|
|
"step": 25500
|
|
},
|
|
{
|
|
"epoch": 26.666666666666668,
|
|
"eval_loss": 0.2321571707725525,
|
|
"eval_runtime": 2.7245,
|
|
"eval_samples_per_second": 183.522,
|
|
"eval_steps_per_second": 18.352,
|
|
"step": 25600
|
|
},
|
|
{
|
|
"epoch": 26.770833333333332,
|
|
"eval_loss": 0.2371620088815689,
|
|
"eval_runtime": 2.6794,
|
|
"eval_samples_per_second": 186.607,
|
|
"eval_steps_per_second": 18.661,
|
|
"step": 25700
|
|
},
|
|
{
|
|
"epoch": 26.875,
|
|
"eval_loss": 0.24450993537902832,
|
|
"eval_runtime": 2.6866,
|
|
"eval_samples_per_second": 186.112,
|
|
"eval_steps_per_second": 18.611,
|
|
"step": 25800
|
|
},
|
|
{
|
|
"epoch": 26.979166666666668,
|
|
"eval_loss": 0.24821239709854126,
|
|
"eval_runtime": 2.6972,
|
|
"eval_samples_per_second": 185.379,
|
|
"eval_steps_per_second": 18.538,
|
|
"step": 25900
|
|
},
|
|
{
|
|
"epoch": 27.083333333333332,
|
|
"grad_norm": 0.018454568460583687,
|
|
"learning_rate": 5.401234567901234e-06,
|
|
"loss": 0.0004,
|
|
"step": 26000
|
|
},
|
|
{
|
|
"epoch": 27.083333333333332,
|
|
"eval_loss": 0.23901359736919403,
|
|
"eval_runtime": 2.6658,
|
|
"eval_samples_per_second": 187.559,
|
|
"eval_steps_per_second": 18.756,
|
|
"step": 26000
|
|
},
|
|
{
|
|
"epoch": 27.1875,
|
|
"eval_loss": 0.24414636194705963,
|
|
"eval_runtime": 2.7605,
|
|
"eval_samples_per_second": 181.124,
|
|
"eval_steps_per_second": 18.112,
|
|
"step": 26100
|
|
},
|
|
{
|
|
"epoch": 27.291666666666668,
|
|
"eval_loss": 0.22809472680091858,
|
|
"eval_runtime": 2.6599,
|
|
"eval_samples_per_second": 187.975,
|
|
"eval_steps_per_second": 18.798,
|
|
"step": 26200
|
|
},
|
|
{
|
|
"epoch": 27.395833333333332,
|
|
"eval_loss": 0.23070019483566284,
|
|
"eval_runtime": 2.6735,
|
|
"eval_samples_per_second": 187.017,
|
|
"eval_steps_per_second": 18.702,
|
|
"step": 26300
|
|
},
|
|
{
|
|
"epoch": 27.5,
|
|
"eval_loss": 0.22740352153778076,
|
|
"eval_runtime": 2.6801,
|
|
"eval_samples_per_second": 186.562,
|
|
"eval_steps_per_second": 18.656,
|
|
"step": 26400
|
|
},
|
|
{
|
|
"epoch": 27.604166666666668,
|
|
"grad_norm": 0.024484924972057343,
|
|
"learning_rate": 4.436728395061729e-06,
|
|
"loss": 0.0003,
|
|
"step": 26500
|
|
},
|
|
{
|
|
"epoch": 27.604166666666668,
|
|
"eval_loss": 0.23273907601833344,
|
|
"eval_runtime": 2.6798,
|
|
"eval_samples_per_second": 186.583,
|
|
"eval_steps_per_second": 18.658,
|
|
"step": 26500
|
|
},
|
|
{
|
|
"epoch": 27.708333333333332,
|
|
"eval_loss": 0.246334508061409,
|
|
"eval_runtime": 2.7151,
|
|
"eval_samples_per_second": 184.157,
|
|
"eval_steps_per_second": 18.416,
|
|
"step": 26600
|
|
},
|
|
{
|
|
"epoch": 27.8125,
|
|
"eval_loss": 0.2416531890630722,
|
|
"eval_runtime": 2.7757,
|
|
"eval_samples_per_second": 180.134,
|
|
"eval_steps_per_second": 18.013,
|
|
"step": 26700
|
|
},
|
|
{
|
|
"epoch": 27.916666666666668,
|
|
"eval_loss": 0.2483607828617096,
|
|
"eval_runtime": 2.7131,
|
|
"eval_samples_per_second": 184.294,
|
|
"eval_steps_per_second": 18.429,
|
|
"step": 26800
|
|
},
|
|
{
|
|
"epoch": 28.020833333333332,
|
|
"eval_loss": 0.252419114112854,
|
|
"eval_runtime": 2.6895,
|
|
"eval_samples_per_second": 185.906,
|
|
"eval_steps_per_second": 18.591,
|
|
"step": 26900
|
|
},
|
|
{
|
|
"epoch": 28.125,
|
|
"grad_norm": 0.0021363645792007446,
|
|
"learning_rate": 3.4722222222222224e-06,
|
|
"loss": 0.0005,
|
|
"step": 27000
|
|
},
|
|
{
|
|
"epoch": 28.125,
|
|
"eval_loss": 0.24915319681167603,
|
|
"eval_runtime": 2.6788,
|
|
"eval_samples_per_second": 186.648,
|
|
"eval_steps_per_second": 18.665,
|
|
"step": 27000
|
|
},
|
|
{
|
|
"epoch": 28.229166666666668,
|
|
"eval_loss": 0.2466410994529724,
|
|
"eval_runtime": 2.7305,
|
|
"eval_samples_per_second": 183.118,
|
|
"eval_steps_per_second": 18.312,
|
|
"step": 27100
|
|
},
|
|
{
|
|
"epoch": 28.333333333333332,
|
|
"eval_loss": 0.23784339427947998,
|
|
"eval_runtime": 2.7758,
|
|
"eval_samples_per_second": 180.131,
|
|
"eval_steps_per_second": 18.013,
|
|
"step": 27200
|
|
},
|
|
{
|
|
"epoch": 28.4375,
|
|
"eval_loss": 0.23728083074092865,
|
|
"eval_runtime": 2.7003,
|
|
"eval_samples_per_second": 185.168,
|
|
"eval_steps_per_second": 18.517,
|
|
"step": 27300
|
|
},
|
|
{
|
|
"epoch": 28.541666666666668,
|
|
"eval_loss": 0.23528096079826355,
|
|
"eval_runtime": 2.6782,
|
|
"eval_samples_per_second": 186.695,
|
|
"eval_steps_per_second": 18.669,
|
|
"step": 27400
|
|
},
|
|
{
|
|
"epoch": 28.645833333333332,
|
|
"grad_norm": 0.004232197534292936,
|
|
"learning_rate": 2.5077160493827164e-06,
|
|
"loss": 0.0007,
|
|
"step": 27500
|
|
},
|
|
{
|
|
"epoch": 28.645833333333332,
|
|
"eval_loss": 0.22034627199172974,
|
|
"eval_runtime": 2.6537,
|
|
"eval_samples_per_second": 188.417,
|
|
"eval_steps_per_second": 18.842,
|
|
"step": 27500
|
|
},
|
|
{
|
|
"epoch": 28.75,
|
|
"eval_loss": 0.22053499519824982,
|
|
"eval_runtime": 2.7004,
|
|
"eval_samples_per_second": 185.159,
|
|
"eval_steps_per_second": 18.516,
|
|
"step": 27600
|
|
},
|
|
{
|
|
"epoch": 28.854166666666668,
|
|
"eval_loss": 0.2195887714624405,
|
|
"eval_runtime": 2.7033,
|
|
"eval_samples_per_second": 184.958,
|
|
"eval_steps_per_second": 18.496,
|
|
"step": 27700
|
|
},
|
|
{
|
|
"epoch": 28.958333333333332,
|
|
"eval_loss": 0.2224828451871872,
|
|
"eval_runtime": 2.7202,
|
|
"eval_samples_per_second": 183.811,
|
|
"eval_steps_per_second": 18.381,
|
|
"step": 27800
|
|
},
|
|
{
|
|
"epoch": 29.0625,
|
|
"eval_loss": 0.22565923631191254,
|
|
"eval_runtime": 2.7224,
|
|
"eval_samples_per_second": 183.659,
|
|
"eval_steps_per_second": 18.366,
|
|
"step": 27900
|
|
},
|
|
{
|
|
"epoch": 29.166666666666668,
|
|
"grad_norm": 0.005333024077117443,
|
|
"learning_rate": 1.5432098765432098e-06,
|
|
"loss": 0.0001,
|
|
"step": 28000
|
|
},
|
|
{
|
|
"epoch": 29.166666666666668,
|
|
"eval_loss": 0.2284005731344223,
|
|
"eval_runtime": 2.7316,
|
|
"eval_samples_per_second": 183.043,
|
|
"eval_steps_per_second": 18.304,
|
|
"step": 28000
|
|
},
|
|
{
|
|
"epoch": 29.270833333333332,
|
|
"eval_loss": 0.22724701464176178,
|
|
"eval_runtime": 2.7988,
|
|
"eval_samples_per_second": 178.649,
|
|
"eval_steps_per_second": 17.865,
|
|
"step": 28100
|
|
},
|
|
{
|
|
"epoch": 29.375,
|
|
"eval_loss": 0.23275510966777802,
|
|
"eval_runtime": 2.8416,
|
|
"eval_samples_per_second": 175.955,
|
|
"eval_steps_per_second": 17.595,
|
|
"step": 28200
|
|
},
|
|
{
|
|
"epoch": 29.479166666666668,
|
|
"eval_loss": 0.23228801786899567,
|
|
"eval_runtime": 2.8546,
|
|
"eval_samples_per_second": 175.155,
|
|
"eval_steps_per_second": 17.516,
|
|
"step": 28300
|
|
},
|
|
{
|
|
"epoch": 29.583333333333332,
|
|
"eval_loss": 0.23211157321929932,
|
|
"eval_runtime": 2.8814,
|
|
"eval_samples_per_second": 173.527,
|
|
"eval_steps_per_second": 17.353,
|
|
"step": 28400
|
|
},
|
|
{
|
|
"epoch": 29.6875,
|
|
"grad_norm": 0.001249138847924769,
|
|
"learning_rate": 5.787037037037037e-07,
|
|
"loss": 0.0007,
|
|
"step": 28500
|
|
},
|
|
{
|
|
"epoch": 29.6875,
|
|
"eval_loss": 0.23103657364845276,
|
|
"eval_runtime": 2.9118,
|
|
"eval_samples_per_second": 171.717,
|
|
"eval_steps_per_second": 17.172,
|
|
"step": 28500
|
|
},
|
|
{
|
|
"epoch": 29.791666666666668,
|
|
"eval_loss": 0.23130209743976593,
|
|
"eval_runtime": 3.0208,
|
|
"eval_samples_per_second": 165.52,
|
|
"eval_steps_per_second": 16.552,
|
|
"step": 28600
|
|
},
|
|
{
|
|
"epoch": 29.895833333333332,
|
|
"eval_loss": 0.2313297837972641,
|
|
"eval_runtime": 3.1209,
|
|
"eval_samples_per_second": 160.208,
|
|
"eval_steps_per_second": 16.021,
|
|
"step": 28700
|
|
},
|
|
{
|
|
"epoch": 30.0,
|
|
"eval_loss": 0.23098842799663544,
|
|
"eval_runtime": 3.0691,
|
|
"eval_samples_per_second": 162.912,
|
|
"eval_steps_per_second": 16.291,
|
|
"step": 28800
|
|
}
|
|
],
|
|
"logging_steps": 500,
|
|
"max_steps": 28800,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 30,
|
|
"save_steps": 1200,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 10,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|