bge-large_osllmai_V1.6 / trainer_state.json
alinemati's picture
Upload 10 files
ea405a4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 100,
"global_step": 28800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10416666666666667,
"eval_loss": 0.07793249189853668,
"eval_runtime": 3.1433,
"eval_samples_per_second": 159.069,
"eval_steps_per_second": 15.907,
"step": 100
},
{
"epoch": 0.20833333333333334,
"eval_loss": 0.060976069420576096,
"eval_runtime": 5.2088,
"eval_samples_per_second": 95.992,
"eval_steps_per_second": 9.599,
"step": 200
},
{
"epoch": 0.3125,
"eval_loss": 0.06986938416957855,
"eval_runtime": 5.0795,
"eval_samples_per_second": 98.436,
"eval_steps_per_second": 9.844,
"step": 300
},
{
"epoch": 0.4166666666666667,
"eval_loss": 0.049215417355298996,
"eval_runtime": 3.1571,
"eval_samples_per_second": 158.372,
"eval_steps_per_second": 15.837,
"step": 400
},
{
"epoch": 0.5208333333333334,
"grad_norm": 2.454777240753174,
"learning_rate": 8.680555555555556e-06,
"loss": 0.1299,
"step": 500
},
{
"epoch": 0.5208333333333334,
"eval_loss": 0.05049608275294304,
"eval_runtime": 3.1336,
"eval_samples_per_second": 159.56,
"eval_steps_per_second": 15.956,
"step": 500
},
{
"epoch": 0.625,
"eval_loss": 0.05414561927318573,
"eval_runtime": 3.0995,
"eval_samples_per_second": 161.315,
"eval_steps_per_second": 16.131,
"step": 600
},
{
"epoch": 0.7291666666666666,
"eval_loss": 0.058882277458906174,
"eval_runtime": 3.2082,
"eval_samples_per_second": 155.852,
"eval_steps_per_second": 15.585,
"step": 700
},
{
"epoch": 0.8333333333333334,
"eval_loss": 0.05974648892879486,
"eval_runtime": 4.9789,
"eval_samples_per_second": 100.424,
"eval_steps_per_second": 10.042,
"step": 800
},
{
"epoch": 0.9375,
"eval_loss": 0.06364227831363678,
"eval_runtime": 3.1858,
"eval_samples_per_second": 156.948,
"eval_steps_per_second": 15.695,
"step": 900
},
{
"epoch": 1.0416666666666667,
"grad_norm": 0.6711832284927368,
"learning_rate": 1.736111111111111e-05,
"loss": 0.1028,
"step": 1000
},
{
"epoch": 1.0416666666666667,
"eval_loss": 0.07583945989608765,
"eval_runtime": 3.1485,
"eval_samples_per_second": 158.808,
"eval_steps_per_second": 15.881,
"step": 1000
},
{
"epoch": 1.1458333333333333,
"eval_loss": 0.06294066458940506,
"eval_runtime": 3.7666,
"eval_samples_per_second": 132.746,
"eval_steps_per_second": 13.275,
"step": 1100
},
{
"epoch": 1.25,
"eval_loss": 0.06619992107152939,
"eval_runtime": 3.2391,
"eval_samples_per_second": 154.363,
"eval_steps_per_second": 15.436,
"step": 1200
},
{
"epoch": 1.3541666666666667,
"eval_loss": 0.09093113243579865,
"eval_runtime": 5.1419,
"eval_samples_per_second": 97.241,
"eval_steps_per_second": 9.724,
"step": 1300
},
{
"epoch": 1.4583333333333333,
"eval_loss": 0.10987065732479095,
"eval_runtime": 3.0882,
"eval_samples_per_second": 161.906,
"eval_steps_per_second": 16.191,
"step": 1400
},
{
"epoch": 1.5625,
"grad_norm": 23.796688079833984,
"learning_rate": 2.604166666666667e-05,
"loss": 0.091,
"step": 1500
},
{
"epoch": 1.5625,
"eval_loss": 0.10730883479118347,
"eval_runtime": 3.112,
"eval_samples_per_second": 160.667,
"eval_steps_per_second": 16.067,
"step": 1500
},
{
"epoch": 1.6666666666666665,
"eval_loss": 0.11321321874856949,
"eval_runtime": 3.1391,
"eval_samples_per_second": 159.279,
"eval_steps_per_second": 15.928,
"step": 1600
},
{
"epoch": 1.7708333333333335,
"eval_loss": 0.09883977472782135,
"eval_runtime": 5.0716,
"eval_samples_per_second": 98.587,
"eval_steps_per_second": 9.859,
"step": 1700
},
{
"epoch": 1.875,
"eval_loss": 0.11777649074792862,
"eval_runtime": 5.1982,
"eval_samples_per_second": 96.187,
"eval_steps_per_second": 9.619,
"step": 1800
},
{
"epoch": 1.9791666666666665,
"eval_loss": 0.09968377649784088,
"eval_runtime": 3.1348,
"eval_samples_per_second": 159.499,
"eval_steps_per_second": 15.95,
"step": 1900
},
{
"epoch": 2.0833333333333335,
"grad_norm": 1.0350979566574097,
"learning_rate": 3.472222222222222e-05,
"loss": 0.1147,
"step": 2000
},
{
"epoch": 2.0833333333333335,
"eval_loss": 0.0993076041340828,
"eval_runtime": 3.1904,
"eval_samples_per_second": 156.719,
"eval_steps_per_second": 15.672,
"step": 2000
},
{
"epoch": 2.1875,
"eval_loss": 0.09541837126016617,
"eval_runtime": 3.0402,
"eval_samples_per_second": 164.463,
"eval_steps_per_second": 16.446,
"step": 2100
},
{
"epoch": 2.2916666666666665,
"eval_loss": 0.1211095005273819,
"eval_runtime": 3.0483,
"eval_samples_per_second": 164.026,
"eval_steps_per_second": 16.403,
"step": 2200
},
{
"epoch": 2.3958333333333335,
"eval_loss": 0.10396522283554077,
"eval_runtime": 3.0322,
"eval_samples_per_second": 164.895,
"eval_steps_per_second": 16.489,
"step": 2300
},
{
"epoch": 2.5,
"eval_loss": 0.15123824775218964,
"eval_runtime": 3.0437,
"eval_samples_per_second": 164.275,
"eval_steps_per_second": 16.427,
"step": 2400
},
{
"epoch": 2.6041666666666665,
"grad_norm": 2.144970178604126,
"learning_rate": 4.340277777777778e-05,
"loss": 0.0832,
"step": 2500
},
{
"epoch": 2.6041666666666665,
"eval_loss": 0.16908738017082214,
"eval_runtime": 4.9423,
"eval_samples_per_second": 101.167,
"eval_steps_per_second": 10.117,
"step": 2500
},
{
"epoch": 2.7083333333333335,
"eval_loss": 0.16467152535915375,
"eval_runtime": 3.999,
"eval_samples_per_second": 125.033,
"eval_steps_per_second": 12.503,
"step": 2600
},
{
"epoch": 2.8125,
"eval_loss": 0.19610275328159332,
"eval_runtime": 4.7478,
"eval_samples_per_second": 105.311,
"eval_steps_per_second": 10.531,
"step": 2700
},
{
"epoch": 2.9166666666666665,
"eval_loss": 0.2006768137216568,
"eval_runtime": 4.7959,
"eval_samples_per_second": 104.256,
"eval_steps_per_second": 10.426,
"step": 2800
},
{
"epoch": 3.0208333333333335,
"eval_loss": 0.1452997922897339,
"eval_runtime": 4.8491,
"eval_samples_per_second": 103.111,
"eval_steps_per_second": 10.311,
"step": 2900
},
{
"epoch": 3.125,
"grad_norm": 8.211258888244629,
"learning_rate": 4.976851851851852e-05,
"loss": 0.115,
"step": 3000
},
{
"epoch": 3.125,
"eval_loss": 0.17185473442077637,
"eval_runtime": 4.968,
"eval_samples_per_second": 100.644,
"eval_steps_per_second": 10.064,
"step": 3000
},
{
"epoch": 3.2291666666666665,
"eval_loss": 0.1926930844783783,
"eval_runtime": 3.0476,
"eval_samples_per_second": 164.064,
"eval_steps_per_second": 16.406,
"step": 3100
},
{
"epoch": 3.3333333333333335,
"eval_loss": 0.20631477236747742,
"eval_runtime": 3.0178,
"eval_samples_per_second": 165.682,
"eval_steps_per_second": 16.568,
"step": 3200
},
{
"epoch": 3.4375,
"eval_loss": 0.15469783544540405,
"eval_runtime": 3.0052,
"eval_samples_per_second": 166.38,
"eval_steps_per_second": 16.638,
"step": 3300
},
{
"epoch": 3.5416666666666665,
"eval_loss": 0.13334515690803528,
"eval_runtime": 3.0679,
"eval_samples_per_second": 162.979,
"eval_steps_per_second": 16.298,
"step": 3400
},
{
"epoch": 3.6458333333333335,
"grad_norm": 1.2755959033966064,
"learning_rate": 4.880401234567901e-05,
"loss": 0.0954,
"step": 3500
},
{
"epoch": 3.6458333333333335,
"eval_loss": 0.16306829452514648,
"eval_runtime": 3.1165,
"eval_samples_per_second": 160.436,
"eval_steps_per_second": 16.044,
"step": 3500
},
{
"epoch": 3.75,
"eval_loss": 0.211606964468956,
"eval_runtime": 2.9864,
"eval_samples_per_second": 167.428,
"eval_steps_per_second": 16.743,
"step": 3600
},
{
"epoch": 3.8541666666666665,
"eval_loss": 0.18188658356666565,
"eval_runtime": 2.984,
"eval_samples_per_second": 167.563,
"eval_steps_per_second": 16.756,
"step": 3700
},
{
"epoch": 3.9583333333333335,
"eval_loss": 0.1797247678041458,
"eval_runtime": 2.9814,
"eval_samples_per_second": 167.708,
"eval_steps_per_second": 16.771,
"step": 3800
},
{
"epoch": 4.0625,
"eval_loss": 0.16481846570968628,
"eval_runtime": 3.059,
"eval_samples_per_second": 163.454,
"eval_steps_per_second": 16.345,
"step": 3900
},
{
"epoch": 4.166666666666667,
"grad_norm": 7.385697364807129,
"learning_rate": 4.783950617283951e-05,
"loss": 0.0823,
"step": 4000
},
{
"epoch": 4.166666666666667,
"eval_loss": 0.20793281495571136,
"eval_runtime": 2.996,
"eval_samples_per_second": 166.89,
"eval_steps_per_second": 16.689,
"step": 4000
},
{
"epoch": 4.270833333333333,
"eval_loss": 0.1974990963935852,
"eval_runtime": 3.0328,
"eval_samples_per_second": 164.862,
"eval_steps_per_second": 16.486,
"step": 4100
},
{
"epoch": 4.375,
"eval_loss": 0.24861294031143188,
"eval_runtime": 2.9906,
"eval_samples_per_second": 167.189,
"eval_steps_per_second": 16.719,
"step": 4200
},
{
"epoch": 4.479166666666667,
"eval_loss": 0.1775345653295517,
"eval_runtime": 2.9925,
"eval_samples_per_second": 167.083,
"eval_steps_per_second": 16.708,
"step": 4300
},
{
"epoch": 4.583333333333333,
"eval_loss": 0.1704624593257904,
"eval_runtime": 3.0684,
"eval_samples_per_second": 162.95,
"eval_steps_per_second": 16.295,
"step": 4400
},
{
"epoch": 4.6875,
"grad_norm": 0.6013280749320984,
"learning_rate": 4.6875e-05,
"loss": 0.0549,
"step": 4500
},
{
"epoch": 4.6875,
"eval_loss": 0.22908172011375427,
"eval_runtime": 3.0692,
"eval_samples_per_second": 162.907,
"eval_steps_per_second": 16.291,
"step": 4500
},
{
"epoch": 4.791666666666667,
"eval_loss": 0.2069811075925827,
"eval_runtime": 2.9792,
"eval_samples_per_second": 167.83,
"eval_steps_per_second": 16.783,
"step": 4600
},
{
"epoch": 4.895833333333333,
"eval_loss": 0.20501156151294708,
"eval_runtime": 2.9759,
"eval_samples_per_second": 168.015,
"eval_steps_per_second": 16.801,
"step": 4700
},
{
"epoch": 5.0,
"eval_loss": 0.1834719032049179,
"eval_runtime": 2.9965,
"eval_samples_per_second": 166.864,
"eval_steps_per_second": 16.686,
"step": 4800
},
{
"epoch": 5.104166666666667,
"eval_loss": 0.1747453212738037,
"eval_runtime": 3.0464,
"eval_samples_per_second": 164.127,
"eval_steps_per_second": 16.413,
"step": 4900
},
{
"epoch": 5.208333333333333,
"grad_norm": 0.5279375910758972,
"learning_rate": 4.591049382716049e-05,
"loss": 0.0518,
"step": 5000
},
{
"epoch": 5.208333333333333,
"eval_loss": 0.21121186017990112,
"eval_runtime": 2.9779,
"eval_samples_per_second": 167.902,
"eval_steps_per_second": 16.79,
"step": 5000
},
{
"epoch": 5.3125,
"eval_loss": 0.20287540555000305,
"eval_runtime": 3.0068,
"eval_samples_per_second": 166.292,
"eval_steps_per_second": 16.629,
"step": 5100
},
{
"epoch": 5.416666666666667,
"eval_loss": 0.20998916029930115,
"eval_runtime": 3.0065,
"eval_samples_per_second": 166.307,
"eval_steps_per_second": 16.631,
"step": 5200
},
{
"epoch": 5.520833333333333,
"eval_loss": 0.2178017646074295,
"eval_runtime": 2.9901,
"eval_samples_per_second": 167.22,
"eval_steps_per_second": 16.722,
"step": 5300
},
{
"epoch": 5.625,
"eval_loss": 0.2072754055261612,
"eval_runtime": 3.0001,
"eval_samples_per_second": 166.662,
"eval_steps_per_second": 16.666,
"step": 5400
},
{
"epoch": 5.729166666666667,
"grad_norm": 1.3259690999984741,
"learning_rate": 4.494598765432099e-05,
"loss": 0.0363,
"step": 5500
},
{
"epoch": 5.729166666666667,
"eval_loss": 0.16645069420337677,
"eval_runtime": 3.0133,
"eval_samples_per_second": 165.933,
"eval_steps_per_second": 16.593,
"step": 5500
},
{
"epoch": 5.833333333333333,
"eval_loss": 0.18585015833377838,
"eval_runtime": 3.0016,
"eval_samples_per_second": 166.577,
"eval_steps_per_second": 16.658,
"step": 5600
},
{
"epoch": 5.9375,
"eval_loss": 0.24183067679405212,
"eval_runtime": 3.0665,
"eval_samples_per_second": 163.05,
"eval_steps_per_second": 16.305,
"step": 5700
},
{
"epoch": 6.041666666666667,
"eval_loss": 0.22054076194763184,
"eval_runtime": 2.9689,
"eval_samples_per_second": 168.413,
"eval_steps_per_second": 16.841,
"step": 5800
},
{
"epoch": 6.145833333333333,
"eval_loss": 0.17449523508548737,
"eval_runtime": 3.0168,
"eval_samples_per_second": 165.739,
"eval_steps_per_second": 16.574,
"step": 5900
},
{
"epoch": 6.25,
"grad_norm": 0.3047299087047577,
"learning_rate": 4.3981481481481486e-05,
"loss": 0.0305,
"step": 6000
},
{
"epoch": 6.25,
"eval_loss": 0.20409347116947174,
"eval_runtime": 2.9665,
"eval_samples_per_second": 168.549,
"eval_steps_per_second": 16.855,
"step": 6000
},
{
"epoch": 6.354166666666667,
"eval_loss": 0.23080451786518097,
"eval_runtime": 3.0955,
"eval_samples_per_second": 161.524,
"eval_steps_per_second": 16.152,
"step": 6100
},
{
"epoch": 6.458333333333333,
"eval_loss": 0.23451215028762817,
"eval_runtime": 2.9667,
"eval_samples_per_second": 168.535,
"eval_steps_per_second": 16.854,
"step": 6200
},
{
"epoch": 6.5625,
"eval_loss": 0.1807590276002884,
"eval_runtime": 3.0657,
"eval_samples_per_second": 163.093,
"eval_steps_per_second": 16.309,
"step": 6300
},
{
"epoch": 6.666666666666667,
"eval_loss": 0.21056562662124634,
"eval_runtime": 2.9809,
"eval_samples_per_second": 167.732,
"eval_steps_per_second": 16.773,
"step": 6400
},
{
"epoch": 6.770833333333333,
"grad_norm": 0.7045068144798279,
"learning_rate": 4.301697530864198e-05,
"loss": 0.0221,
"step": 6500
},
{
"epoch": 6.770833333333333,
"eval_loss": 0.2072342485189438,
"eval_runtime": 2.97,
"eval_samples_per_second": 168.353,
"eval_steps_per_second": 16.835,
"step": 6500
},
{
"epoch": 6.875,
"eval_loss": 0.2312939465045929,
"eval_runtime": 3.0018,
"eval_samples_per_second": 166.566,
"eval_steps_per_second": 16.657,
"step": 6600
},
{
"epoch": 6.979166666666667,
"eval_loss": 0.2178051918745041,
"eval_runtime": 3.0112,
"eval_samples_per_second": 166.045,
"eval_steps_per_second": 16.605,
"step": 6700
},
{
"epoch": 7.083333333333333,
"eval_loss": 0.1962529867887497,
"eval_runtime": 2.9373,
"eval_samples_per_second": 170.222,
"eval_steps_per_second": 17.022,
"step": 6800
},
{
"epoch": 7.1875,
"eval_loss": 0.23245805501937866,
"eval_runtime": 2.9287,
"eval_samples_per_second": 170.727,
"eval_steps_per_second": 17.073,
"step": 6900
},
{
"epoch": 7.291666666666667,
"grad_norm": 0.3293949365615845,
"learning_rate": 4.205246913580247e-05,
"loss": 0.0191,
"step": 7000
},
{
"epoch": 7.291666666666667,
"eval_loss": 0.20682944357395172,
"eval_runtime": 2.922,
"eval_samples_per_second": 171.118,
"eval_steps_per_second": 17.112,
"step": 7000
},
{
"epoch": 7.395833333333333,
"eval_loss": 0.2781427800655365,
"eval_runtime": 3.0086,
"eval_samples_per_second": 166.191,
"eval_steps_per_second": 16.619,
"step": 7100
},
{
"epoch": 7.5,
"eval_loss": 0.23378807306289673,
"eval_runtime": 2.928,
"eval_samples_per_second": 170.763,
"eval_steps_per_second": 17.076,
"step": 7200
},
{
"epoch": 7.604166666666667,
"eval_loss": 0.20770449936389923,
"eval_runtime": 2.9745,
"eval_samples_per_second": 168.097,
"eval_steps_per_second": 16.81,
"step": 7300
},
{
"epoch": 7.708333333333333,
"eval_loss": 0.19365566968917847,
"eval_runtime": 2.9194,
"eval_samples_per_second": 171.267,
"eval_steps_per_second": 17.127,
"step": 7400
},
{
"epoch": 7.8125,
"grad_norm": 0.15344583988189697,
"learning_rate": 4.1087962962962965e-05,
"loss": 0.0185,
"step": 7500
},
{
"epoch": 7.8125,
"eval_loss": 0.22151848673820496,
"eval_runtime": 2.901,
"eval_samples_per_second": 172.356,
"eval_steps_per_second": 17.236,
"step": 7500
},
{
"epoch": 7.916666666666667,
"eval_loss": 0.18537622690200806,
"eval_runtime": 2.9225,
"eval_samples_per_second": 171.086,
"eval_steps_per_second": 17.109,
"step": 7600
},
{
"epoch": 8.020833333333334,
"eval_loss": 0.21262435615062714,
"eval_runtime": 2.9556,
"eval_samples_per_second": 169.168,
"eval_steps_per_second": 16.917,
"step": 7700
},
{
"epoch": 8.125,
"eval_loss": 0.24334414303302765,
"eval_runtime": 2.9324,
"eval_samples_per_second": 170.51,
"eval_steps_per_second": 17.051,
"step": 7800
},
{
"epoch": 8.229166666666666,
"eval_loss": 0.2494126558303833,
"eval_runtime": 2.9435,
"eval_samples_per_second": 169.864,
"eval_steps_per_second": 16.986,
"step": 7900
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.12840139865875244,
"learning_rate": 4.012345679012346e-05,
"loss": 0.0152,
"step": 8000
},
{
"epoch": 8.333333333333334,
"eval_loss": 0.18908418715000153,
"eval_runtime": 2.9931,
"eval_samples_per_second": 167.052,
"eval_steps_per_second": 16.705,
"step": 8000
},
{
"epoch": 8.4375,
"eval_loss": 0.21541763842105865,
"eval_runtime": 2.9627,
"eval_samples_per_second": 168.765,
"eval_steps_per_second": 16.876,
"step": 8100
},
{
"epoch": 8.541666666666666,
"eval_loss": 0.1759854555130005,
"eval_runtime": 2.9612,
"eval_samples_per_second": 168.849,
"eval_steps_per_second": 16.885,
"step": 8200
},
{
"epoch": 8.645833333333334,
"eval_loss": 0.19276337325572968,
"eval_runtime": 3.0833,
"eval_samples_per_second": 162.166,
"eval_steps_per_second": 16.217,
"step": 8300
},
{
"epoch": 8.75,
"eval_loss": 0.2762078642845154,
"eval_runtime": 2.9688,
"eval_samples_per_second": 168.42,
"eval_steps_per_second": 16.842,
"step": 8400
},
{
"epoch": 8.854166666666666,
"grad_norm": 0.2024412602186203,
"learning_rate": 3.915895061728395e-05,
"loss": 0.0131,
"step": 8500
},
{
"epoch": 8.854166666666666,
"eval_loss": 0.2193083018064499,
"eval_runtime": 3.0051,
"eval_samples_per_second": 166.385,
"eval_steps_per_second": 16.639,
"step": 8500
},
{
"epoch": 8.958333333333334,
"eval_loss": 0.2521556615829468,
"eval_runtime": 2.9901,
"eval_samples_per_second": 167.221,
"eval_steps_per_second": 16.722,
"step": 8600
},
{
"epoch": 9.0625,
"eval_loss": 0.25700661540031433,
"eval_runtime": 2.9726,
"eval_samples_per_second": 168.202,
"eval_steps_per_second": 16.82,
"step": 8700
},
{
"epoch": 9.166666666666666,
"eval_loss": 0.2857784628868103,
"eval_runtime": 3.0972,
"eval_samples_per_second": 161.436,
"eval_steps_per_second": 16.144,
"step": 8800
},
{
"epoch": 9.270833333333334,
"eval_loss": 0.24449588358402252,
"eval_runtime": 2.9884,
"eval_samples_per_second": 167.312,
"eval_steps_per_second": 16.731,
"step": 8900
},
{
"epoch": 9.375,
"grad_norm": 0.406023770570755,
"learning_rate": 3.8194444444444444e-05,
"loss": 0.0109,
"step": 9000
},
{
"epoch": 9.375,
"eval_loss": 0.2514427900314331,
"eval_runtime": 2.9485,
"eval_samples_per_second": 169.58,
"eval_steps_per_second": 16.958,
"step": 9000
},
{
"epoch": 9.479166666666666,
"eval_loss": 0.2540574371814728,
"eval_runtime": 2.9427,
"eval_samples_per_second": 169.914,
"eval_steps_per_second": 16.991,
"step": 9100
},
{
"epoch": 9.583333333333334,
"eval_loss": 0.24942024052143097,
"eval_runtime": 2.8732,
"eval_samples_per_second": 174.023,
"eval_steps_per_second": 17.402,
"step": 9200
},
{
"epoch": 9.6875,
"eval_loss": 0.25903967022895813,
"eval_runtime": 2.8416,
"eval_samples_per_second": 175.956,
"eval_steps_per_second": 17.596,
"step": 9300
},
{
"epoch": 9.791666666666666,
"eval_loss": 0.21632665395736694,
"eval_runtime": 2.787,
"eval_samples_per_second": 179.408,
"eval_steps_per_second": 17.941,
"step": 9400
},
{
"epoch": 9.895833333333334,
"grad_norm": 0.3254503309726715,
"learning_rate": 3.722993827160494e-05,
"loss": 0.0117,
"step": 9500
},
{
"epoch": 9.895833333333334,
"eval_loss": 0.22778008878231049,
"eval_runtime": 2.7626,
"eval_samples_per_second": 180.991,
"eval_steps_per_second": 18.099,
"step": 9500
},
{
"epoch": 10.0,
"eval_loss": 0.25083282589912415,
"eval_runtime": 2.6929,
"eval_samples_per_second": 185.676,
"eval_steps_per_second": 18.568,
"step": 9600
},
{
"epoch": 10.104166666666666,
"eval_loss": 0.27844151854515076,
"eval_runtime": 2.6854,
"eval_samples_per_second": 186.195,
"eval_steps_per_second": 18.62,
"step": 9700
},
{
"epoch": 10.208333333333334,
"eval_loss": 0.2655443549156189,
"eval_runtime": 2.6574,
"eval_samples_per_second": 188.155,
"eval_steps_per_second": 18.815,
"step": 9800
},
{
"epoch": 10.3125,
"eval_loss": 0.27213552594184875,
"eval_runtime": 2.7641,
"eval_samples_per_second": 180.888,
"eval_steps_per_second": 18.089,
"step": 9900
},
{
"epoch": 10.416666666666666,
"grad_norm": 0.01915908418595791,
"learning_rate": 3.626543209876543e-05,
"loss": 0.01,
"step": 10000
},
{
"epoch": 10.416666666666666,
"eval_loss": 0.2086724191904068,
"eval_runtime": 2.6276,
"eval_samples_per_second": 190.287,
"eval_steps_per_second": 19.029,
"step": 10000
},
{
"epoch": 10.520833333333334,
"eval_loss": 0.21828189492225647,
"eval_runtime": 2.7043,
"eval_samples_per_second": 184.893,
"eval_steps_per_second": 18.489,
"step": 10100
},
{
"epoch": 10.625,
"eval_loss": 0.17361362278461456,
"eval_runtime": 2.6359,
"eval_samples_per_second": 189.686,
"eval_steps_per_second": 18.969,
"step": 10200
},
{
"epoch": 10.729166666666666,
"eval_loss": 0.23900029063224792,
"eval_runtime": 2.6622,
"eval_samples_per_second": 187.815,
"eval_steps_per_second": 18.781,
"step": 10300
},
{
"epoch": 10.833333333333334,
"eval_loss": 0.21116891503334045,
"eval_runtime": 2.7512,
"eval_samples_per_second": 181.737,
"eval_steps_per_second": 18.174,
"step": 10400
},
{
"epoch": 10.9375,
"grad_norm": 0.15779471397399902,
"learning_rate": 3.530092592592593e-05,
"loss": 0.0079,
"step": 10500
},
{
"epoch": 10.9375,
"eval_loss": 0.22312691807746887,
"eval_runtime": 2.6146,
"eval_samples_per_second": 191.236,
"eval_steps_per_second": 19.124,
"step": 10500
},
{
"epoch": 11.041666666666666,
"eval_loss": 0.19351090490818024,
"eval_runtime": 2.6666,
"eval_samples_per_second": 187.503,
"eval_steps_per_second": 18.75,
"step": 10600
},
{
"epoch": 11.145833333333334,
"eval_loss": 0.22429315745830536,
"eval_runtime": 2.7037,
"eval_samples_per_second": 184.933,
"eval_steps_per_second": 18.493,
"step": 10700
},
{
"epoch": 11.25,
"eval_loss": 0.2761378884315491,
"eval_runtime": 2.6376,
"eval_samples_per_second": 189.563,
"eval_steps_per_second": 18.956,
"step": 10800
},
{
"epoch": 11.354166666666666,
"eval_loss": 0.21286267042160034,
"eval_runtime": 2.7376,
"eval_samples_per_second": 182.64,
"eval_steps_per_second": 18.264,
"step": 10900
},
{
"epoch": 11.458333333333334,
"grad_norm": 28.01507568359375,
"learning_rate": 3.4336419753086427e-05,
"loss": 0.0076,
"step": 11000
},
{
"epoch": 11.458333333333334,
"eval_loss": 0.16870196163654327,
"eval_runtime": 2.6961,
"eval_samples_per_second": 185.451,
"eval_steps_per_second": 18.545,
"step": 11000
},
{
"epoch": 11.5625,
"eval_loss": 0.24428625404834747,
"eval_runtime": 2.6808,
"eval_samples_per_second": 186.514,
"eval_steps_per_second": 18.651,
"step": 11100
},
{
"epoch": 11.666666666666666,
"eval_loss": 0.24231122434139252,
"eval_runtime": 2.6904,
"eval_samples_per_second": 185.843,
"eval_steps_per_second": 18.584,
"step": 11200
},
{
"epoch": 11.770833333333334,
"eval_loss": 0.29215526580810547,
"eval_runtime": 2.6528,
"eval_samples_per_second": 188.481,
"eval_steps_per_second": 18.848,
"step": 11300
},
{
"epoch": 11.875,
"eval_loss": 0.22481079399585724,
"eval_runtime": 2.6638,
"eval_samples_per_second": 187.703,
"eval_steps_per_second": 18.77,
"step": 11400
},
{
"epoch": 11.979166666666666,
"grad_norm": 0.09294537454843521,
"learning_rate": 3.337191358024692e-05,
"loss": 0.0074,
"step": 11500
},
{
"epoch": 11.979166666666666,
"eval_loss": 0.21293191611766815,
"eval_runtime": 2.706,
"eval_samples_per_second": 184.775,
"eval_steps_per_second": 18.478,
"step": 11500
},
{
"epoch": 12.083333333333334,
"eval_loss": 0.23273849487304688,
"eval_runtime": 2.6588,
"eval_samples_per_second": 188.053,
"eval_steps_per_second": 18.805,
"step": 11600
},
{
"epoch": 12.1875,
"eval_loss": 0.23750942945480347,
"eval_runtime": 2.6738,
"eval_samples_per_second": 186.999,
"eval_steps_per_second": 18.7,
"step": 11700
},
{
"epoch": 12.291666666666666,
"eval_loss": 0.26990386843681335,
"eval_runtime": 2.6509,
"eval_samples_per_second": 188.618,
"eval_steps_per_second": 18.862,
"step": 11800
},
{
"epoch": 12.395833333333334,
"eval_loss": 0.24741919338703156,
"eval_runtime": 2.6554,
"eval_samples_per_second": 188.299,
"eval_steps_per_second": 18.83,
"step": 11900
},
{
"epoch": 12.5,
"grad_norm": 1.0478854179382324,
"learning_rate": 3.240740740740741e-05,
"loss": 0.0056,
"step": 12000
},
{
"epoch": 12.5,
"eval_loss": 0.22709093987941742,
"eval_runtime": 2.7368,
"eval_samples_per_second": 182.693,
"eval_steps_per_second": 18.269,
"step": 12000
},
{
"epoch": 12.604166666666666,
"eval_loss": 0.2405824214220047,
"eval_runtime": 2.6757,
"eval_samples_per_second": 186.87,
"eval_steps_per_second": 18.687,
"step": 12100
},
{
"epoch": 12.708333333333334,
"eval_loss": 0.28877463936805725,
"eval_runtime": 2.671,
"eval_samples_per_second": 187.194,
"eval_steps_per_second": 18.719,
"step": 12200
},
{
"epoch": 12.8125,
"eval_loss": 0.3177616596221924,
"eval_runtime": 2.6749,
"eval_samples_per_second": 186.922,
"eval_steps_per_second": 18.692,
"step": 12300
},
{
"epoch": 12.916666666666666,
"eval_loss": 0.2896740734577179,
"eval_runtime": 2.6676,
"eval_samples_per_second": 187.434,
"eval_steps_per_second": 18.743,
"step": 12400
},
{
"epoch": 13.020833333333334,
"grad_norm": 0.09383181482553482,
"learning_rate": 3.14429012345679e-05,
"loss": 0.0068,
"step": 12500
},
{
"epoch": 13.020833333333334,
"eval_loss": 0.2690945267677307,
"eval_runtime": 2.6854,
"eval_samples_per_second": 186.195,
"eval_steps_per_second": 18.619,
"step": 12500
},
{
"epoch": 13.125,
"eval_loss": 0.28043872117996216,
"eval_runtime": 2.7005,
"eval_samples_per_second": 185.153,
"eval_steps_per_second": 18.515,
"step": 12600
},
{
"epoch": 13.229166666666666,
"eval_loss": 0.2700558006763458,
"eval_runtime": 2.664,
"eval_samples_per_second": 187.69,
"eval_steps_per_second": 18.769,
"step": 12700
},
{
"epoch": 13.333333333333334,
"eval_loss": 0.2863883078098297,
"eval_runtime": 2.6459,
"eval_samples_per_second": 188.974,
"eval_steps_per_second": 18.897,
"step": 12800
},
{
"epoch": 13.4375,
"eval_loss": 0.31038790941238403,
"eval_runtime": 2.6638,
"eval_samples_per_second": 187.7,
"eval_steps_per_second": 18.77,
"step": 12900
},
{
"epoch": 13.541666666666666,
"grad_norm": 0.030933663249015808,
"learning_rate": 3.04783950617284e-05,
"loss": 0.0053,
"step": 13000
},
{
"epoch": 13.541666666666666,
"eval_loss": 0.2956462502479553,
"eval_runtime": 2.7335,
"eval_samples_per_second": 182.914,
"eval_steps_per_second": 18.291,
"step": 13000
},
{
"epoch": 13.645833333333334,
"eval_loss": 0.24711303412914276,
"eval_runtime": 2.6773,
"eval_samples_per_second": 186.754,
"eval_steps_per_second": 18.675,
"step": 13100
},
{
"epoch": 13.75,
"eval_loss": 0.2535928785800934,
"eval_runtime": 2.6697,
"eval_samples_per_second": 187.286,
"eval_steps_per_second": 18.729,
"step": 13200
},
{
"epoch": 13.854166666666666,
"eval_loss": 0.23705850541591644,
"eval_runtime": 2.7698,
"eval_samples_per_second": 180.518,
"eval_steps_per_second": 18.052,
"step": 13300
},
{
"epoch": 13.958333333333334,
"eval_loss": 0.23593567311763763,
"eval_runtime": 2.6612,
"eval_samples_per_second": 187.888,
"eval_steps_per_second": 18.789,
"step": 13400
},
{
"epoch": 14.0625,
"grad_norm": 0.031106941401958466,
"learning_rate": 2.951388888888889e-05,
"loss": 0.0049,
"step": 13500
},
{
"epoch": 14.0625,
"eval_loss": 0.2466021180152893,
"eval_runtime": 2.6433,
"eval_samples_per_second": 189.159,
"eval_steps_per_second": 18.916,
"step": 13500
},
{
"epoch": 14.166666666666666,
"eval_loss": 0.27085715532302856,
"eval_runtime": 2.6665,
"eval_samples_per_second": 187.513,
"eval_steps_per_second": 18.751,
"step": 13600
},
{
"epoch": 14.270833333333334,
"eval_loss": 0.23776446282863617,
"eval_runtime": 2.6623,
"eval_samples_per_second": 187.806,
"eval_steps_per_second": 18.781,
"step": 13700
},
{
"epoch": 14.375,
"eval_loss": 0.22552721202373505,
"eval_runtime": 2.696,
"eval_samples_per_second": 185.463,
"eval_steps_per_second": 18.546,
"step": 13800
},
{
"epoch": 14.479166666666666,
"eval_loss": 0.25723254680633545,
"eval_runtime": 2.759,
"eval_samples_per_second": 181.228,
"eval_steps_per_second": 18.123,
"step": 13900
},
{
"epoch": 14.583333333333334,
"grad_norm": 0.014980652369558811,
"learning_rate": 2.8549382716049384e-05,
"loss": 0.0034,
"step": 14000
},
{
"epoch": 14.583333333333334,
"eval_loss": 0.24281509220600128,
"eval_runtime": 2.6514,
"eval_samples_per_second": 188.58,
"eval_steps_per_second": 18.858,
"step": 14000
},
{
"epoch": 14.6875,
"eval_loss": 0.28589245676994324,
"eval_runtime": 2.6758,
"eval_samples_per_second": 186.862,
"eval_steps_per_second": 18.686,
"step": 14100
},
{
"epoch": 14.791666666666666,
"eval_loss": 0.23985494673252106,
"eval_runtime": 2.6688,
"eval_samples_per_second": 187.353,
"eval_steps_per_second": 18.735,
"step": 14200
},
{
"epoch": 14.895833333333334,
"eval_loss": 0.2654513716697693,
"eval_runtime": 2.7853,
"eval_samples_per_second": 179.514,
"eval_steps_per_second": 17.951,
"step": 14300
},
{
"epoch": 15.0,
"eval_loss": 0.24361200630664825,
"eval_runtime": 2.6858,
"eval_samples_per_second": 186.162,
"eval_steps_per_second": 18.616,
"step": 14400
},
{
"epoch": 15.104166666666666,
"grad_norm": 0.06275557726621628,
"learning_rate": 2.7584876543209875e-05,
"loss": 0.0025,
"step": 14500
},
{
"epoch": 15.104166666666666,
"eval_loss": 0.22555358707904816,
"eval_runtime": 2.6629,
"eval_samples_per_second": 187.767,
"eval_steps_per_second": 18.777,
"step": 14500
},
{
"epoch": 15.208333333333334,
"eval_loss": 0.2367800772190094,
"eval_runtime": 2.6382,
"eval_samples_per_second": 189.52,
"eval_steps_per_second": 18.952,
"step": 14600
},
{
"epoch": 15.3125,
"eval_loss": 0.2452574521303177,
"eval_runtime": 2.7269,
"eval_samples_per_second": 183.358,
"eval_steps_per_second": 18.336,
"step": 14700
},
{
"epoch": 15.416666666666666,
"eval_loss": 0.23891642689704895,
"eval_runtime": 2.7402,
"eval_samples_per_second": 182.469,
"eval_steps_per_second": 18.247,
"step": 14800
},
{
"epoch": 15.520833333333334,
"eval_loss": 0.25828251242637634,
"eval_runtime": 2.6596,
"eval_samples_per_second": 187.998,
"eval_steps_per_second": 18.8,
"step": 14900
},
{
"epoch": 15.625,
"grad_norm": 0.019014783203601837,
"learning_rate": 2.6620370370370372e-05,
"loss": 0.0028,
"step": 15000
},
{
"epoch": 15.625,
"eval_loss": 0.21553362905979156,
"eval_runtime": 2.685,
"eval_samples_per_second": 186.223,
"eval_steps_per_second": 18.622,
"step": 15000
},
{
"epoch": 15.729166666666666,
"eval_loss": 0.1873685121536255,
"eval_runtime": 2.679,
"eval_samples_per_second": 186.639,
"eval_steps_per_second": 18.664,
"step": 15100
},
{
"epoch": 15.833333333333334,
"eval_loss": 0.23436906933784485,
"eval_runtime": 2.6734,
"eval_samples_per_second": 187.027,
"eval_steps_per_second": 18.703,
"step": 15200
},
{
"epoch": 15.9375,
"eval_loss": 0.21372827887535095,
"eval_runtime": 2.6738,
"eval_samples_per_second": 187.0,
"eval_steps_per_second": 18.7,
"step": 15300
},
{
"epoch": 16.041666666666668,
"eval_loss": 0.28128960728645325,
"eval_runtime": 2.721,
"eval_samples_per_second": 183.754,
"eval_steps_per_second": 18.375,
"step": 15400
},
{
"epoch": 16.145833333333332,
"grad_norm": 0.001963632879778743,
"learning_rate": 2.5655864197530866e-05,
"loss": 0.0038,
"step": 15500
},
{
"epoch": 16.145833333333332,
"eval_loss": 0.2411942183971405,
"eval_runtime": 2.6679,
"eval_samples_per_second": 187.412,
"eval_steps_per_second": 18.741,
"step": 15500
},
{
"epoch": 16.25,
"eval_loss": 0.27309754490852356,
"eval_runtime": 2.6866,
"eval_samples_per_second": 186.109,
"eval_steps_per_second": 18.611,
"step": 15600
},
{
"epoch": 16.354166666666668,
"eval_loss": 0.26635468006134033,
"eval_runtime": 2.6688,
"eval_samples_per_second": 187.351,
"eval_steps_per_second": 18.735,
"step": 15700
},
{
"epoch": 16.458333333333332,
"eval_loss": 0.2760668694972992,
"eval_runtime": 2.7331,
"eval_samples_per_second": 182.941,
"eval_steps_per_second": 18.294,
"step": 15800
},
{
"epoch": 16.5625,
"eval_loss": 0.2560645341873169,
"eval_runtime": 2.662,
"eval_samples_per_second": 187.83,
"eval_steps_per_second": 18.783,
"step": 15900
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.04559716209769249,
"learning_rate": 2.4691358024691357e-05,
"loss": 0.0024,
"step": 16000
},
{
"epoch": 16.666666666666668,
"eval_loss": 0.31702032685279846,
"eval_runtime": 2.6834,
"eval_samples_per_second": 186.33,
"eval_steps_per_second": 18.633,
"step": 16000
},
{
"epoch": 16.770833333333332,
"eval_loss": 0.23096273839473724,
"eval_runtime": 2.643,
"eval_samples_per_second": 189.182,
"eval_steps_per_second": 18.918,
"step": 16100
},
{
"epoch": 16.875,
"eval_loss": 0.2803973853588104,
"eval_runtime": 2.6697,
"eval_samples_per_second": 187.288,
"eval_steps_per_second": 18.729,
"step": 16200
},
{
"epoch": 16.979166666666668,
"eval_loss": 0.2678157091140747,
"eval_runtime": 2.7817,
"eval_samples_per_second": 179.745,
"eval_steps_per_second": 17.975,
"step": 16300
},
{
"epoch": 17.083333333333332,
"eval_loss": 0.27891919016838074,
"eval_runtime": 2.8206,
"eval_samples_per_second": 177.27,
"eval_steps_per_second": 17.727,
"step": 16400
},
{
"epoch": 17.1875,
"grad_norm": 0.025386014953255653,
"learning_rate": 2.3726851851851854e-05,
"loss": 0.0021,
"step": 16500
},
{
"epoch": 17.1875,
"eval_loss": 0.2423981875181198,
"eval_runtime": 2.6721,
"eval_samples_per_second": 187.12,
"eval_steps_per_second": 18.712,
"step": 16500
},
{
"epoch": 17.291666666666668,
"eval_loss": 0.26049262285232544,
"eval_runtime": 2.7078,
"eval_samples_per_second": 184.654,
"eval_steps_per_second": 18.465,
"step": 16600
},
{
"epoch": 17.395833333333332,
"eval_loss": 0.30610647797584534,
"eval_runtime": 2.6975,
"eval_samples_per_second": 185.357,
"eval_steps_per_second": 18.536,
"step": 16700
},
{
"epoch": 17.5,
"eval_loss": 0.26282158493995667,
"eval_runtime": 2.7046,
"eval_samples_per_second": 184.872,
"eval_steps_per_second": 18.487,
"step": 16800
},
{
"epoch": 17.604166666666668,
"eval_loss": 0.24968083202838898,
"eval_runtime": 2.6698,
"eval_samples_per_second": 187.28,
"eval_steps_per_second": 18.728,
"step": 16900
},
{
"epoch": 17.708333333333332,
"grad_norm": 0.038250915706157684,
"learning_rate": 2.2762345679012348e-05,
"loss": 0.0033,
"step": 17000
},
{
"epoch": 17.708333333333332,
"eval_loss": 0.27164506912231445,
"eval_runtime": 2.6512,
"eval_samples_per_second": 188.591,
"eval_steps_per_second": 18.859,
"step": 17000
},
{
"epoch": 17.8125,
"eval_loss": 0.26982831954956055,
"eval_runtime": 2.7234,
"eval_samples_per_second": 183.593,
"eval_steps_per_second": 18.359,
"step": 17100
},
{
"epoch": 17.916666666666668,
"eval_loss": 0.2670990824699402,
"eval_runtime": 2.7233,
"eval_samples_per_second": 183.601,
"eval_steps_per_second": 18.36,
"step": 17200
},
{
"epoch": 18.020833333333332,
"eval_loss": 0.24990878999233246,
"eval_runtime": 2.6806,
"eval_samples_per_second": 186.525,
"eval_steps_per_second": 18.653,
"step": 17300
},
{
"epoch": 18.125,
"eval_loss": 0.2912808656692505,
"eval_runtime": 2.6688,
"eval_samples_per_second": 187.351,
"eval_steps_per_second": 18.735,
"step": 17400
},
{
"epoch": 18.229166666666668,
"grad_norm": 0.021504106000065804,
"learning_rate": 2.179783950617284e-05,
"loss": 0.003,
"step": 17500
},
{
"epoch": 18.229166666666668,
"eval_loss": 0.26864093542099,
"eval_runtime": 2.6597,
"eval_samples_per_second": 187.99,
"eval_steps_per_second": 18.799,
"step": 17500
},
{
"epoch": 18.333333333333332,
"eval_loss": 0.2561093866825104,
"eval_runtime": 2.7405,
"eval_samples_per_second": 182.449,
"eval_steps_per_second": 18.245,
"step": 17600
},
{
"epoch": 18.4375,
"eval_loss": 0.2752689719200134,
"eval_runtime": 2.8003,
"eval_samples_per_second": 178.554,
"eval_steps_per_second": 17.855,
"step": 17700
},
{
"epoch": 18.541666666666668,
"eval_loss": 0.25926899909973145,
"eval_runtime": 2.6884,
"eval_samples_per_second": 185.987,
"eval_steps_per_second": 18.599,
"step": 17800
},
{
"epoch": 18.645833333333332,
"eval_loss": 0.26980525255203247,
"eval_runtime": 2.7348,
"eval_samples_per_second": 182.831,
"eval_steps_per_second": 18.283,
"step": 17900
},
{
"epoch": 18.75,
"grad_norm": 0.17659549415111542,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.0023,
"step": 18000
},
{
"epoch": 18.75,
"eval_loss": 0.22183029353618622,
"eval_runtime": 2.7514,
"eval_samples_per_second": 181.729,
"eval_steps_per_second": 18.173,
"step": 18000
},
{
"epoch": 18.854166666666668,
"eval_loss": 0.245712548494339,
"eval_runtime": 2.6909,
"eval_samples_per_second": 185.812,
"eval_steps_per_second": 18.581,
"step": 18100
},
{
"epoch": 18.958333333333332,
"eval_loss": 0.23490136861801147,
"eval_runtime": 2.6739,
"eval_samples_per_second": 186.991,
"eval_steps_per_second": 18.699,
"step": 18200
},
{
"epoch": 19.0625,
"eval_loss": 0.2799266278743744,
"eval_runtime": 2.6965,
"eval_samples_per_second": 185.424,
"eval_steps_per_second": 18.542,
"step": 18300
},
{
"epoch": 19.166666666666668,
"eval_loss": 0.2952316403388977,
"eval_runtime": 2.7321,
"eval_samples_per_second": 183.006,
"eval_steps_per_second": 18.301,
"step": 18400
},
{
"epoch": 19.270833333333332,
"grad_norm": 0.023679744452238083,
"learning_rate": 1.9868827160493827e-05,
"loss": 0.0018,
"step": 18500
},
{
"epoch": 19.270833333333332,
"eval_loss": 0.23330457508563995,
"eval_runtime": 2.6526,
"eval_samples_per_second": 188.496,
"eval_steps_per_second": 18.85,
"step": 18500
},
{
"epoch": 19.375,
"eval_loss": 0.26152676343917847,
"eval_runtime": 2.6547,
"eval_samples_per_second": 188.345,
"eval_steps_per_second": 18.834,
"step": 18600
},
{
"epoch": 19.479166666666668,
"eval_loss": 0.2680070996284485,
"eval_runtime": 2.6519,
"eval_samples_per_second": 188.545,
"eval_steps_per_second": 18.855,
"step": 18700
},
{
"epoch": 19.583333333333332,
"eval_loss": 0.219146266579628,
"eval_runtime": 2.6887,
"eval_samples_per_second": 185.964,
"eval_steps_per_second": 18.596,
"step": 18800
},
{
"epoch": 19.6875,
"eval_loss": 0.2633296251296997,
"eval_runtime": 2.716,
"eval_samples_per_second": 184.095,
"eval_steps_per_second": 18.41,
"step": 18900
},
{
"epoch": 19.791666666666668,
"grad_norm": 0.1822015345096588,
"learning_rate": 1.8904320987654324e-05,
"loss": 0.0019,
"step": 19000
},
{
"epoch": 19.791666666666668,
"eval_loss": 0.22770513594150543,
"eval_runtime": 2.6476,
"eval_samples_per_second": 188.849,
"eval_steps_per_second": 18.885,
"step": 19000
},
{
"epoch": 19.895833333333332,
"eval_loss": 0.2615948021411896,
"eval_runtime": 2.6985,
"eval_samples_per_second": 185.288,
"eval_steps_per_second": 18.529,
"step": 19100
},
{
"epoch": 20.0,
"eval_loss": 0.23876366019248962,
"eval_runtime": 2.6951,
"eval_samples_per_second": 185.519,
"eval_steps_per_second": 18.552,
"step": 19200
},
{
"epoch": 20.104166666666668,
"eval_loss": 0.22909638285636902,
"eval_runtime": 2.6935,
"eval_samples_per_second": 185.63,
"eval_steps_per_second": 18.563,
"step": 19300
},
{
"epoch": 20.208333333333332,
"eval_loss": 0.2323145866394043,
"eval_runtime": 2.7334,
"eval_samples_per_second": 182.92,
"eval_steps_per_second": 18.292,
"step": 19400
},
{
"epoch": 20.3125,
"grad_norm": 0.013318472541868687,
"learning_rate": 1.7939814814814815e-05,
"loss": 0.0013,
"step": 19500
},
{
"epoch": 20.3125,
"eval_loss": 0.2226209044456482,
"eval_runtime": 2.655,
"eval_samples_per_second": 188.322,
"eval_steps_per_second": 18.832,
"step": 19500
},
{
"epoch": 20.416666666666668,
"eval_loss": 0.24030183255672455,
"eval_runtime": 2.6564,
"eval_samples_per_second": 188.222,
"eval_steps_per_second": 18.822,
"step": 19600
},
{
"epoch": 20.520833333333332,
"eval_loss": 0.2417810559272766,
"eval_runtime": 2.7353,
"eval_samples_per_second": 182.794,
"eval_steps_per_second": 18.279,
"step": 19700
},
{
"epoch": 20.625,
"eval_loss": 0.2691485583782196,
"eval_runtime": 2.7924,
"eval_samples_per_second": 179.056,
"eval_steps_per_second": 17.906,
"step": 19800
},
{
"epoch": 20.729166666666668,
"eval_loss": 0.21901768445968628,
"eval_runtime": 2.6311,
"eval_samples_per_second": 190.038,
"eval_steps_per_second": 19.004,
"step": 19900
},
{
"epoch": 20.833333333333332,
"grad_norm": 0.024880312383174896,
"learning_rate": 1.697530864197531e-05,
"loss": 0.0014,
"step": 20000
},
{
"epoch": 20.833333333333332,
"eval_loss": 0.24805013835430145,
"eval_runtime": 2.6373,
"eval_samples_per_second": 189.588,
"eval_steps_per_second": 18.959,
"step": 20000
},
{
"epoch": 20.9375,
"eval_loss": 0.22968819737434387,
"eval_runtime": 2.6422,
"eval_samples_per_second": 189.237,
"eval_steps_per_second": 18.924,
"step": 20100
},
{
"epoch": 21.041666666666668,
"eval_loss": 0.2395009696483612,
"eval_runtime": 2.6643,
"eval_samples_per_second": 187.67,
"eval_steps_per_second": 18.767,
"step": 20200
},
{
"epoch": 21.145833333333332,
"eval_loss": 0.2686944603919983,
"eval_runtime": 2.7421,
"eval_samples_per_second": 182.345,
"eval_steps_per_second": 18.235,
"step": 20300
},
{
"epoch": 21.25,
"eval_loss": 0.2748400866985321,
"eval_runtime": 2.6544,
"eval_samples_per_second": 188.369,
"eval_steps_per_second": 18.837,
"step": 20400
},
{
"epoch": 21.354166666666668,
"grad_norm": 0.020624302327632904,
"learning_rate": 1.6010802469135803e-05,
"loss": 0.0013,
"step": 20500
},
{
"epoch": 21.354166666666668,
"eval_loss": 0.2482401579618454,
"eval_runtime": 2.6547,
"eval_samples_per_second": 188.344,
"eval_steps_per_second": 18.834,
"step": 20500
},
{
"epoch": 21.458333333333332,
"eval_loss": 0.2579880952835083,
"eval_runtime": 2.6892,
"eval_samples_per_second": 185.929,
"eval_steps_per_second": 18.593,
"step": 20600
},
{
"epoch": 21.5625,
"eval_loss": 0.2505187690258026,
"eval_runtime": 2.7076,
"eval_samples_per_second": 184.666,
"eval_steps_per_second": 18.467,
"step": 20700
},
{
"epoch": 21.666666666666668,
"eval_loss": 0.24572314321994781,
"eval_runtime": 2.6772,
"eval_samples_per_second": 186.761,
"eval_steps_per_second": 18.676,
"step": 20800
},
{
"epoch": 21.770833333333332,
"eval_loss": 0.2615715265274048,
"eval_runtime": 2.6506,
"eval_samples_per_second": 188.636,
"eval_steps_per_second": 18.864,
"step": 20900
},
{
"epoch": 21.875,
"grad_norm": 0.10809088498353958,
"learning_rate": 1.5046296296296297e-05,
"loss": 0.0012,
"step": 21000
},
{
"epoch": 21.875,
"eval_loss": 0.24613332748413086,
"eval_runtime": 2.664,
"eval_samples_per_second": 187.687,
"eval_steps_per_second": 18.769,
"step": 21000
},
{
"epoch": 21.979166666666668,
"eval_loss": 0.2297743260860443,
"eval_runtime": 2.8188,
"eval_samples_per_second": 177.378,
"eval_steps_per_second": 17.738,
"step": 21100
},
{
"epoch": 22.083333333333332,
"eval_loss": 0.20882727205753326,
"eval_runtime": 2.6539,
"eval_samples_per_second": 188.4,
"eval_steps_per_second": 18.84,
"step": 21200
},
{
"epoch": 22.1875,
"eval_loss": 0.24317896366119385,
"eval_runtime": 2.6727,
"eval_samples_per_second": 187.08,
"eval_steps_per_second": 18.708,
"step": 21300
},
{
"epoch": 22.291666666666668,
"eval_loss": 0.24090611934661865,
"eval_runtime": 2.6745,
"eval_samples_per_second": 186.949,
"eval_steps_per_second": 18.695,
"step": 21400
},
{
"epoch": 22.395833333333332,
"grad_norm": 0.013452223502099514,
"learning_rate": 1.4081790123456789e-05,
"loss": 0.0006,
"step": 21500
},
{
"epoch": 22.395833333333332,
"eval_loss": 0.23966069519519806,
"eval_runtime": 2.6467,
"eval_samples_per_second": 188.915,
"eval_steps_per_second": 18.892,
"step": 21500
},
{
"epoch": 22.5,
"eval_loss": 0.2442026436328888,
"eval_runtime": 2.752,
"eval_samples_per_second": 181.687,
"eval_steps_per_second": 18.169,
"step": 21600
},
{
"epoch": 22.604166666666668,
"eval_loss": 0.2610822916030884,
"eval_runtime": 2.7823,
"eval_samples_per_second": 179.709,
"eval_steps_per_second": 17.971,
"step": 21700
},
{
"epoch": 22.708333333333332,
"eval_loss": 0.2411614954471588,
"eval_runtime": 2.6894,
"eval_samples_per_second": 185.916,
"eval_steps_per_second": 18.592,
"step": 21800
},
{
"epoch": 22.8125,
"eval_loss": 0.28125372529029846,
"eval_runtime": 2.6995,
"eval_samples_per_second": 185.22,
"eval_steps_per_second": 18.522,
"step": 21900
},
{
"epoch": 22.916666666666668,
"grad_norm": 0.01698753982782364,
"learning_rate": 1.3117283950617285e-05,
"loss": 0.0016,
"step": 22000
},
{
"epoch": 22.916666666666668,
"eval_loss": 0.2897321879863739,
"eval_runtime": 2.6638,
"eval_samples_per_second": 187.699,
"eval_steps_per_second": 18.77,
"step": 22000
},
{
"epoch": 23.020833333333332,
"eval_loss": 0.2508152425289154,
"eval_runtime": 2.6526,
"eval_samples_per_second": 188.492,
"eval_steps_per_second": 18.849,
"step": 22100
},
{
"epoch": 23.125,
"eval_loss": 0.2747707664966583,
"eval_runtime": 2.7474,
"eval_samples_per_second": 181.988,
"eval_steps_per_second": 18.199,
"step": 22200
},
{
"epoch": 23.229166666666668,
"eval_loss": 0.24645893275737762,
"eval_runtime": 2.7117,
"eval_samples_per_second": 184.383,
"eval_steps_per_second": 18.438,
"step": 22300
},
{
"epoch": 23.333333333333332,
"eval_loss": 0.22453027963638306,
"eval_runtime": 2.6879,
"eval_samples_per_second": 186.022,
"eval_steps_per_second": 18.602,
"step": 22400
},
{
"epoch": 23.4375,
"grad_norm": 0.020492762327194214,
"learning_rate": 1.2152777777777779e-05,
"loss": 0.0012,
"step": 22500
},
{
"epoch": 23.4375,
"eval_loss": 0.2575179934501648,
"eval_runtime": 2.657,
"eval_samples_per_second": 188.185,
"eval_steps_per_second": 18.818,
"step": 22500
},
{
"epoch": 23.541666666666668,
"eval_loss": 0.2540989816188812,
"eval_runtime": 2.6727,
"eval_samples_per_second": 187.077,
"eval_steps_per_second": 18.708,
"step": 22600
},
{
"epoch": 23.645833333333332,
"eval_loss": 0.26418963074684143,
"eval_runtime": 2.6889,
"eval_samples_per_second": 185.951,
"eval_steps_per_second": 18.595,
"step": 22700
},
{
"epoch": 23.75,
"eval_loss": 0.27296414971351624,
"eval_runtime": 2.6695,
"eval_samples_per_second": 187.304,
"eval_steps_per_second": 18.73,
"step": 22800
},
{
"epoch": 23.854166666666668,
"eval_loss": 0.28103941679000854,
"eval_runtime": 2.6878,
"eval_samples_per_second": 186.024,
"eval_steps_per_second": 18.602,
"step": 22900
},
{
"epoch": 23.958333333333332,
"grad_norm": 0.019232362508773804,
"learning_rate": 1.1188271604938271e-05,
"loss": 0.0009,
"step": 23000
},
{
"epoch": 23.958333333333332,
"eval_loss": 0.29488101601600647,
"eval_runtime": 2.671,
"eval_samples_per_second": 187.195,
"eval_steps_per_second": 18.72,
"step": 23000
},
{
"epoch": 24.0625,
"eval_loss": 0.2643510103225708,
"eval_runtime": 2.6969,
"eval_samples_per_second": 185.398,
"eval_steps_per_second": 18.54,
"step": 23100
},
{
"epoch": 24.166666666666668,
"eval_loss": 0.28387248516082764,
"eval_runtime": 2.8223,
"eval_samples_per_second": 177.159,
"eval_steps_per_second": 17.716,
"step": 23200
},
{
"epoch": 24.270833333333332,
"eval_loss": 0.28446924686431885,
"eval_runtime": 2.79,
"eval_samples_per_second": 179.211,
"eval_steps_per_second": 17.921,
"step": 23300
},
{
"epoch": 24.375,
"eval_loss": 0.24371571838855743,
"eval_runtime": 2.6806,
"eval_samples_per_second": 186.527,
"eval_steps_per_second": 18.653,
"step": 23400
},
{
"epoch": 24.479166666666668,
"grad_norm": 0.09378495067358017,
"learning_rate": 1.0223765432098765e-05,
"loss": 0.0012,
"step": 23500
},
{
"epoch": 24.479166666666668,
"eval_loss": 0.2529699206352234,
"eval_runtime": 2.7479,
"eval_samples_per_second": 181.955,
"eval_steps_per_second": 18.196,
"step": 23500
},
{
"epoch": 24.583333333333332,
"eval_loss": 0.23549579083919525,
"eval_runtime": 2.7468,
"eval_samples_per_second": 182.03,
"eval_steps_per_second": 18.203,
"step": 23600
},
{
"epoch": 24.6875,
"eval_loss": 0.24692101776599884,
"eval_runtime": 2.687,
"eval_samples_per_second": 186.078,
"eval_steps_per_second": 18.608,
"step": 23700
},
{
"epoch": 24.791666666666668,
"eval_loss": 0.2594464123249054,
"eval_runtime": 2.7055,
"eval_samples_per_second": 184.81,
"eval_steps_per_second": 18.481,
"step": 23800
},
{
"epoch": 24.895833333333332,
"eval_loss": 0.2568516731262207,
"eval_runtime": 2.7026,
"eval_samples_per_second": 185.01,
"eval_steps_per_second": 18.501,
"step": 23900
},
{
"epoch": 25.0,
"grad_norm": 0.01909373700618744,
"learning_rate": 9.259259259259259e-06,
"loss": 0.0006,
"step": 24000
},
{
"epoch": 25.0,
"eval_loss": 0.23979052901268005,
"eval_runtime": 2.6347,
"eval_samples_per_second": 189.774,
"eval_steps_per_second": 18.977,
"step": 24000
},
{
"epoch": 25.104166666666668,
"eval_loss": 0.25141003727912903,
"eval_runtime": 2.6995,
"eval_samples_per_second": 185.219,
"eval_steps_per_second": 18.522,
"step": 24100
},
{
"epoch": 25.208333333333332,
"eval_loss": 0.2534993886947632,
"eval_runtime": 2.6878,
"eval_samples_per_second": 186.027,
"eval_steps_per_second": 18.603,
"step": 24200
},
{
"epoch": 25.3125,
"eval_loss": 0.2161099910736084,
"eval_runtime": 2.6906,
"eval_samples_per_second": 185.834,
"eval_steps_per_second": 18.583,
"step": 24300
},
{
"epoch": 25.416666666666668,
"eval_loss": 0.2284189760684967,
"eval_runtime": 2.6783,
"eval_samples_per_second": 186.688,
"eval_steps_per_second": 18.669,
"step": 24400
},
{
"epoch": 25.520833333333332,
"grad_norm": 0.01035739853978157,
"learning_rate": 8.294753086419753e-06,
"loss": 0.0005,
"step": 24500
},
{
"epoch": 25.520833333333332,
"eval_loss": 0.23142491281032562,
"eval_runtime": 2.7424,
"eval_samples_per_second": 182.32,
"eval_steps_per_second": 18.232,
"step": 24500
},
{
"epoch": 25.625,
"eval_loss": 0.22284680604934692,
"eval_runtime": 2.667,
"eval_samples_per_second": 187.477,
"eval_steps_per_second": 18.748,
"step": 24600
},
{
"epoch": 25.729166666666668,
"eval_loss": 0.23332533240318298,
"eval_runtime": 2.6695,
"eval_samples_per_second": 187.298,
"eval_steps_per_second": 18.73,
"step": 24700
},
{
"epoch": 25.833333333333332,
"eval_loss": 0.2309611290693283,
"eval_runtime": 2.7134,
"eval_samples_per_second": 184.273,
"eval_steps_per_second": 18.427,
"step": 24800
},
{
"epoch": 25.9375,
"eval_loss": 0.2237754762172699,
"eval_runtime": 2.6731,
"eval_samples_per_second": 187.045,
"eval_steps_per_second": 18.705,
"step": 24900
},
{
"epoch": 26.041666666666668,
"grad_norm": 0.023038456216454506,
"learning_rate": 7.330246913580248e-06,
"loss": 0.0015,
"step": 25000
},
{
"epoch": 26.041666666666668,
"eval_loss": 0.22988422214984894,
"eval_runtime": 2.6773,
"eval_samples_per_second": 186.754,
"eval_steps_per_second": 18.675,
"step": 25000
},
{
"epoch": 26.145833333333332,
"eval_loss": 0.2248789370059967,
"eval_runtime": 2.6901,
"eval_samples_per_second": 185.864,
"eval_steps_per_second": 18.586,
"step": 25100
},
{
"epoch": 26.25,
"eval_loss": 0.21588632464408875,
"eval_runtime": 2.7349,
"eval_samples_per_second": 182.823,
"eval_steps_per_second": 18.282,
"step": 25200
},
{
"epoch": 26.354166666666668,
"eval_loss": 0.22912240028381348,
"eval_runtime": 2.6991,
"eval_samples_per_second": 185.249,
"eval_steps_per_second": 18.525,
"step": 25300
},
{
"epoch": 26.458333333333332,
"eval_loss": 0.24369725584983826,
"eval_runtime": 2.6901,
"eval_samples_per_second": 185.865,
"eval_steps_per_second": 18.586,
"step": 25400
},
{
"epoch": 26.5625,
"grad_norm": 0.004963716492056847,
"learning_rate": 6.365740740740741e-06,
"loss": 0.0009,
"step": 25500
},
{
"epoch": 26.5625,
"eval_loss": 0.24551524221897125,
"eval_runtime": 2.6597,
"eval_samples_per_second": 187.991,
"eval_steps_per_second": 18.799,
"step": 25500
},
{
"epoch": 26.666666666666668,
"eval_loss": 0.2321571707725525,
"eval_runtime": 2.7245,
"eval_samples_per_second": 183.522,
"eval_steps_per_second": 18.352,
"step": 25600
},
{
"epoch": 26.770833333333332,
"eval_loss": 0.2371620088815689,
"eval_runtime": 2.6794,
"eval_samples_per_second": 186.607,
"eval_steps_per_second": 18.661,
"step": 25700
},
{
"epoch": 26.875,
"eval_loss": 0.24450993537902832,
"eval_runtime": 2.6866,
"eval_samples_per_second": 186.112,
"eval_steps_per_second": 18.611,
"step": 25800
},
{
"epoch": 26.979166666666668,
"eval_loss": 0.24821239709854126,
"eval_runtime": 2.6972,
"eval_samples_per_second": 185.379,
"eval_steps_per_second": 18.538,
"step": 25900
},
{
"epoch": 27.083333333333332,
"grad_norm": 0.018454568460583687,
"learning_rate": 5.401234567901234e-06,
"loss": 0.0004,
"step": 26000
},
{
"epoch": 27.083333333333332,
"eval_loss": 0.23901359736919403,
"eval_runtime": 2.6658,
"eval_samples_per_second": 187.559,
"eval_steps_per_second": 18.756,
"step": 26000
},
{
"epoch": 27.1875,
"eval_loss": 0.24414636194705963,
"eval_runtime": 2.7605,
"eval_samples_per_second": 181.124,
"eval_steps_per_second": 18.112,
"step": 26100
},
{
"epoch": 27.291666666666668,
"eval_loss": 0.22809472680091858,
"eval_runtime": 2.6599,
"eval_samples_per_second": 187.975,
"eval_steps_per_second": 18.798,
"step": 26200
},
{
"epoch": 27.395833333333332,
"eval_loss": 0.23070019483566284,
"eval_runtime": 2.6735,
"eval_samples_per_second": 187.017,
"eval_steps_per_second": 18.702,
"step": 26300
},
{
"epoch": 27.5,
"eval_loss": 0.22740352153778076,
"eval_runtime": 2.6801,
"eval_samples_per_second": 186.562,
"eval_steps_per_second": 18.656,
"step": 26400
},
{
"epoch": 27.604166666666668,
"grad_norm": 0.024484924972057343,
"learning_rate": 4.436728395061729e-06,
"loss": 0.0003,
"step": 26500
},
{
"epoch": 27.604166666666668,
"eval_loss": 0.23273907601833344,
"eval_runtime": 2.6798,
"eval_samples_per_second": 186.583,
"eval_steps_per_second": 18.658,
"step": 26500
},
{
"epoch": 27.708333333333332,
"eval_loss": 0.246334508061409,
"eval_runtime": 2.7151,
"eval_samples_per_second": 184.157,
"eval_steps_per_second": 18.416,
"step": 26600
},
{
"epoch": 27.8125,
"eval_loss": 0.2416531890630722,
"eval_runtime": 2.7757,
"eval_samples_per_second": 180.134,
"eval_steps_per_second": 18.013,
"step": 26700
},
{
"epoch": 27.916666666666668,
"eval_loss": 0.2483607828617096,
"eval_runtime": 2.7131,
"eval_samples_per_second": 184.294,
"eval_steps_per_second": 18.429,
"step": 26800
},
{
"epoch": 28.020833333333332,
"eval_loss": 0.252419114112854,
"eval_runtime": 2.6895,
"eval_samples_per_second": 185.906,
"eval_steps_per_second": 18.591,
"step": 26900
},
{
"epoch": 28.125,
"grad_norm": 0.0021363645792007446,
"learning_rate": 3.4722222222222224e-06,
"loss": 0.0005,
"step": 27000
},
{
"epoch": 28.125,
"eval_loss": 0.24915319681167603,
"eval_runtime": 2.6788,
"eval_samples_per_second": 186.648,
"eval_steps_per_second": 18.665,
"step": 27000
},
{
"epoch": 28.229166666666668,
"eval_loss": 0.2466410994529724,
"eval_runtime": 2.7305,
"eval_samples_per_second": 183.118,
"eval_steps_per_second": 18.312,
"step": 27100
},
{
"epoch": 28.333333333333332,
"eval_loss": 0.23784339427947998,
"eval_runtime": 2.7758,
"eval_samples_per_second": 180.131,
"eval_steps_per_second": 18.013,
"step": 27200
},
{
"epoch": 28.4375,
"eval_loss": 0.23728083074092865,
"eval_runtime": 2.7003,
"eval_samples_per_second": 185.168,
"eval_steps_per_second": 18.517,
"step": 27300
},
{
"epoch": 28.541666666666668,
"eval_loss": 0.23528096079826355,
"eval_runtime": 2.6782,
"eval_samples_per_second": 186.695,
"eval_steps_per_second": 18.669,
"step": 27400
},
{
"epoch": 28.645833333333332,
"grad_norm": 0.004232197534292936,
"learning_rate": 2.5077160493827164e-06,
"loss": 0.0007,
"step": 27500
},
{
"epoch": 28.645833333333332,
"eval_loss": 0.22034627199172974,
"eval_runtime": 2.6537,
"eval_samples_per_second": 188.417,
"eval_steps_per_second": 18.842,
"step": 27500
},
{
"epoch": 28.75,
"eval_loss": 0.22053499519824982,
"eval_runtime": 2.7004,
"eval_samples_per_second": 185.159,
"eval_steps_per_second": 18.516,
"step": 27600
},
{
"epoch": 28.854166666666668,
"eval_loss": 0.2195887714624405,
"eval_runtime": 2.7033,
"eval_samples_per_second": 184.958,
"eval_steps_per_second": 18.496,
"step": 27700
},
{
"epoch": 28.958333333333332,
"eval_loss": 0.2224828451871872,
"eval_runtime": 2.7202,
"eval_samples_per_second": 183.811,
"eval_steps_per_second": 18.381,
"step": 27800
},
{
"epoch": 29.0625,
"eval_loss": 0.22565923631191254,
"eval_runtime": 2.7224,
"eval_samples_per_second": 183.659,
"eval_steps_per_second": 18.366,
"step": 27900
},
{
"epoch": 29.166666666666668,
"grad_norm": 0.005333024077117443,
"learning_rate": 1.5432098765432098e-06,
"loss": 0.0001,
"step": 28000
},
{
"epoch": 29.166666666666668,
"eval_loss": 0.2284005731344223,
"eval_runtime": 2.7316,
"eval_samples_per_second": 183.043,
"eval_steps_per_second": 18.304,
"step": 28000
},
{
"epoch": 29.270833333333332,
"eval_loss": 0.22724701464176178,
"eval_runtime": 2.7988,
"eval_samples_per_second": 178.649,
"eval_steps_per_second": 17.865,
"step": 28100
},
{
"epoch": 29.375,
"eval_loss": 0.23275510966777802,
"eval_runtime": 2.8416,
"eval_samples_per_second": 175.955,
"eval_steps_per_second": 17.595,
"step": 28200
},
{
"epoch": 29.479166666666668,
"eval_loss": 0.23228801786899567,
"eval_runtime": 2.8546,
"eval_samples_per_second": 175.155,
"eval_steps_per_second": 17.516,
"step": 28300
},
{
"epoch": 29.583333333333332,
"eval_loss": 0.23211157321929932,
"eval_runtime": 2.8814,
"eval_samples_per_second": 173.527,
"eval_steps_per_second": 17.353,
"step": 28400
},
{
"epoch": 29.6875,
"grad_norm": 0.001249138847924769,
"learning_rate": 5.787037037037037e-07,
"loss": 0.0007,
"step": 28500
},
{
"epoch": 29.6875,
"eval_loss": 0.23103657364845276,
"eval_runtime": 2.9118,
"eval_samples_per_second": 171.717,
"eval_steps_per_second": 17.172,
"step": 28500
},
{
"epoch": 29.791666666666668,
"eval_loss": 0.23130209743976593,
"eval_runtime": 3.0208,
"eval_samples_per_second": 165.52,
"eval_steps_per_second": 16.552,
"step": 28600
},
{
"epoch": 29.895833333333332,
"eval_loss": 0.2313297837972641,
"eval_runtime": 3.1209,
"eval_samples_per_second": 160.208,
"eval_steps_per_second": 16.021,
"step": 28700
},
{
"epoch": 30.0,
"eval_loss": 0.23098842799663544,
"eval_runtime": 3.0691,
"eval_samples_per_second": 162.912,
"eval_steps_per_second": 16.291,
"step": 28800
}
],
"logging_steps": 500,
"max_steps": 28800,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 1200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}