quen_2.5_lora / trainer_state.json
sujithatz's picture
sujithatz/finbot-quen2.5-merged_adapter
10cfcac verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 3,
"global_step": 592,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04054054054054054,
"grad_norm": 5.807250022888184,
"learning_rate": 5e-05,
"loss": 3.1119,
"step": 3
},
{
"epoch": 0.04054054054054054,
"eval_loss": 3.1016640663146973,
"eval_runtime": 1.0551,
"eval_samples_per_second": 15.164,
"eval_steps_per_second": 3.791,
"step": 3
},
{
"epoch": 0.08108108108108109,
"grad_norm": 4.004100322723389,
"learning_rate": 0.0001,
"loss": 2.8734,
"step": 6
},
{
"epoch": 0.08108108108108109,
"eval_loss": 2.6094236373901367,
"eval_runtime": 1.0592,
"eval_samples_per_second": 15.106,
"eval_steps_per_second": 3.777,
"step": 6
},
{
"epoch": 0.12162162162162163,
"grad_norm": 3.935053586959839,
"learning_rate": 9.999353337510526e-05,
"loss": 2.4188,
"step": 9
},
{
"epoch": 0.12162162162162163,
"eval_loss": 2.1545872688293457,
"eval_runtime": 1.0511,
"eval_samples_per_second": 15.222,
"eval_steps_per_second": 3.805,
"step": 9
},
{
"epoch": 0.16216216216216217,
"grad_norm": 5.741048812866211,
"learning_rate": 9.997413517311055e-05,
"loss": 1.9335,
"step": 12
},
{
"epoch": 0.16216216216216217,
"eval_loss": 1.786160945892334,
"eval_runtime": 1.0532,
"eval_samples_per_second": 15.192,
"eval_steps_per_second": 3.798,
"step": 12
},
{
"epoch": 0.20270270270270271,
"grad_norm": 4.155601978302002,
"learning_rate": 9.99418104116517e-05,
"loss": 1.5361,
"step": 15
},
{
"epoch": 0.20270270270270271,
"eval_loss": 1.4731855392456055,
"eval_runtime": 1.0511,
"eval_samples_per_second": 15.222,
"eval_steps_per_second": 3.805,
"step": 15
},
{
"epoch": 0.24324324324324326,
"grad_norm": 2.4831109046936035,
"learning_rate": 9.989656745201298e-05,
"loss": 1.314,
"step": 18
},
{
"epoch": 0.24324324324324326,
"eval_loss": 1.2790606021881104,
"eval_runtime": 1.0553,
"eval_samples_per_second": 15.161,
"eval_steps_per_second": 3.79,
"step": 18
},
{
"epoch": 0.28378378378378377,
"grad_norm": 1.9509971141815186,
"learning_rate": 9.983841799696438e-05,
"loss": 1.1747,
"step": 21
},
{
"epoch": 0.28378378378378377,
"eval_loss": 1.1653475761413574,
"eval_runtime": 1.0512,
"eval_samples_per_second": 15.221,
"eval_steps_per_second": 3.805,
"step": 21
},
{
"epoch": 0.32432432432432434,
"grad_norm": 2.245741367340088,
"learning_rate": 9.976737708773445e-05,
"loss": 1.1407,
"step": 24
},
{
"epoch": 0.32432432432432434,
"eval_loss": 1.110356092453003,
"eval_runtime": 1.0534,
"eval_samples_per_second": 15.188,
"eval_steps_per_second": 3.797,
"step": 24
},
{
"epoch": 0.36486486486486486,
"grad_norm": 2.0690531730651855,
"learning_rate": 9.968346310011964e-05,
"loss": 1.1734,
"step": 27
},
{
"epoch": 0.36486486486486486,
"eval_loss": 1.088733434677124,
"eval_runtime": 1.0508,
"eval_samples_per_second": 15.226,
"eval_steps_per_second": 3.806,
"step": 27
},
{
"epoch": 0.40540540540540543,
"grad_norm": 1.8963656425476074,
"learning_rate": 9.958669773973123e-05,
"loss": 1.0495,
"step": 30
},
{
"epoch": 0.40540540540540543,
"eval_loss": 1.0401344299316406,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.197,
"eval_steps_per_second": 3.799,
"step": 30
},
{
"epoch": 0.44594594594594594,
"grad_norm": 1.753909945487976,
"learning_rate": 9.947710603638078e-05,
"loss": 1.0401,
"step": 33
},
{
"epoch": 0.44594594594594594,
"eval_loss": 0.990611732006073,
"eval_runtime": 1.0507,
"eval_samples_per_second": 15.227,
"eval_steps_per_second": 3.807,
"step": 33
},
{
"epoch": 0.4864864864864865,
"grad_norm": 2.1073760986328125,
"learning_rate": 9.935471633760573e-05,
"loss": 1.0623,
"step": 36
},
{
"epoch": 0.4864864864864865,
"eval_loss": 0.9593618512153625,
"eval_runtime": 1.0535,
"eval_samples_per_second": 15.188,
"eval_steps_per_second": 3.797,
"step": 36
},
{
"epoch": 0.527027027027027,
"grad_norm": 1.5675249099731445,
"learning_rate": 9.921956030133701e-05,
"loss": 0.8152,
"step": 39
},
{
"epoch": 0.527027027027027,
"eval_loss": 0.9366932511329651,
"eval_runtime": 1.0514,
"eval_samples_per_second": 15.218,
"eval_steps_per_second": 3.805,
"step": 39
},
{
"epoch": 0.5675675675675675,
"grad_norm": 2.219888210296631,
"learning_rate": 9.907167288771019e-05,
"loss": 0.9261,
"step": 42
},
{
"epoch": 0.5675675675675675,
"eval_loss": 0.9247606992721558,
"eval_runtime": 1.0532,
"eval_samples_per_second": 15.192,
"eval_steps_per_second": 3.798,
"step": 42
},
{
"epoch": 0.6081081081081081,
"grad_norm": 1.6866446733474731,
"learning_rate": 9.891109235002249e-05,
"loss": 0.9469,
"step": 45
},
{
"epoch": 0.6081081081081081,
"eval_loss": 0.9134540557861328,
"eval_runtime": 1.0562,
"eval_samples_per_second": 15.149,
"eval_steps_per_second": 3.787,
"step": 45
},
{
"epoch": 0.6486486486486487,
"grad_norm": 1.7272800207138062,
"learning_rate": 9.8737860224838e-05,
"loss": 0.8381,
"step": 48
},
{
"epoch": 0.6486486486486487,
"eval_loss": 0.8871217370033264,
"eval_runtime": 1.0527,
"eval_samples_per_second": 15.199,
"eval_steps_per_second": 3.8,
"step": 48
},
{
"epoch": 0.6891891891891891,
"grad_norm": 2.6152303218841553,
"learning_rate": 9.855202132124365e-05,
"loss": 0.8456,
"step": 51
},
{
"epoch": 0.6891891891891891,
"eval_loss": 0.8553087711334229,
"eval_runtime": 1.0521,
"eval_samples_per_second": 15.208,
"eval_steps_per_second": 3.802,
"step": 51
},
{
"epoch": 0.7297297297297297,
"grad_norm": 1.8282960653305054,
"learning_rate": 9.835362370925868e-05,
"loss": 0.908,
"step": 54
},
{
"epoch": 0.7297297297297297,
"eval_loss": 0.8271682858467102,
"eval_runtime": 1.052,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.802,
"step": 54
},
{
"epoch": 0.7702702702702703,
"grad_norm": 2.466750383377075,
"learning_rate": 9.814271870740054e-05,
"loss": 0.999,
"step": 57
},
{
"epoch": 0.7702702702702703,
"eval_loss": 0.8151593208312988,
"eval_runtime": 1.0549,
"eval_samples_per_second": 15.167,
"eval_steps_per_second": 3.792,
"step": 57
},
{
"epoch": 0.8108108108108109,
"grad_norm": 1.8908120393753052,
"learning_rate": 9.791936086941064e-05,
"loss": 0.897,
"step": 60
},
{
"epoch": 0.8108108108108109,
"eval_loss": 0.8052847981452942,
"eval_runtime": 1.0512,
"eval_samples_per_second": 15.22,
"eval_steps_per_second": 3.805,
"step": 60
},
{
"epoch": 0.8513513513513513,
"grad_norm": 1.9563689231872559,
"learning_rate": 9.768360797014324e-05,
"loss": 0.8747,
"step": 63
},
{
"epoch": 0.8513513513513513,
"eval_loss": 0.7914941906929016,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.803,
"step": 63
},
{
"epoch": 0.8918918918918919,
"grad_norm": 1.9292480945587158,
"learning_rate": 9.7435520990621e-05,
"loss": 1.0646,
"step": 66
},
{
"epoch": 0.8918918918918919,
"eval_loss": 0.7872657179832458,
"eval_runtime": 1.0526,
"eval_samples_per_second": 15.201,
"eval_steps_per_second": 3.8,
"step": 66
},
{
"epoch": 0.9324324324324325,
"grad_norm": 1.7248555421829224,
"learning_rate": 9.717516410226145e-05,
"loss": 0.6771,
"step": 69
},
{
"epoch": 0.9324324324324325,
"eval_loss": 0.7814666628837585,
"eval_runtime": 1.0522,
"eval_samples_per_second": 15.207,
"eval_steps_per_second": 3.802,
"step": 69
},
{
"epoch": 0.972972972972973,
"grad_norm": 2.171896457672119,
"learning_rate": 9.690260465027801e-05,
"loss": 0.9386,
"step": 72
},
{
"epoch": 0.972972972972973,
"eval_loss": 0.7634860873222351,
"eval_runtime": 1.0498,
"eval_samples_per_second": 15.241,
"eval_steps_per_second": 3.81,
"step": 72
},
{
"epoch": 1.0135135135135136,
"grad_norm": 1.625179409980774,
"learning_rate": 9.661791313626018e-05,
"loss": 0.6348,
"step": 75
},
{
"epoch": 1.0135135135135136,
"eval_loss": 0.75515216588974,
"eval_runtime": 1.0536,
"eval_samples_per_second": 15.186,
"eval_steps_per_second": 3.796,
"step": 75
},
{
"epoch": 1.054054054054054,
"grad_norm": 1.4293404817581177,
"learning_rate": 9.632116319993725e-05,
"loss": 0.5763,
"step": 78
},
{
"epoch": 1.054054054054054,
"eval_loss": 0.7473800182342529,
"eval_runtime": 1.0524,
"eval_samples_per_second": 15.203,
"eval_steps_per_second": 3.801,
"step": 78
},
{
"epoch": 1.0945945945945945,
"grad_norm": 1.9279707670211792,
"learning_rate": 9.601243160013023e-05,
"loss": 0.7059,
"step": 81
},
{
"epoch": 1.0945945945945945,
"eval_loss": 0.7430617213249207,
"eval_runtime": 1.0539,
"eval_samples_per_second": 15.181,
"eval_steps_per_second": 3.795,
"step": 81
},
{
"epoch": 1.135135135135135,
"grad_norm": 1.7644144296646118,
"learning_rate": 9.56917981948971e-05,
"loss": 0.6111,
"step": 84
},
{
"epoch": 1.135135135135135,
"eval_loss": 0.7393875122070312,
"eval_runtime": 1.0525,
"eval_samples_per_second": 15.202,
"eval_steps_per_second": 3.8,
"step": 84
},
{
"epoch": 1.1756756756756757,
"grad_norm": 1.4910467863082886,
"learning_rate": 9.535934592087627e-05,
"loss": 0.6937,
"step": 87
},
{
"epoch": 1.1756756756756757,
"eval_loss": 0.7415614724159241,
"eval_runtime": 1.0533,
"eval_samples_per_second": 15.191,
"eval_steps_per_second": 3.798,
"step": 87
},
{
"epoch": 1.2162162162162162,
"grad_norm": 1.989018440246582,
"learning_rate": 9.50151607718338e-05,
"loss": 0.6408,
"step": 90
},
{
"epoch": 1.2162162162162162,
"eval_loss": 0.7331891059875488,
"eval_runtime": 1.0504,
"eval_samples_per_second": 15.232,
"eval_steps_per_second": 3.808,
"step": 90
},
{
"epoch": 1.2567567567567568,
"grad_norm": 1.5546590089797974,
"learning_rate": 9.465933177641982e-05,
"loss": 0.5931,
"step": 93
},
{
"epoch": 1.2567567567567568,
"eval_loss": 0.7319458723068237,
"eval_runtime": 1.0532,
"eval_samples_per_second": 15.191,
"eval_steps_per_second": 3.798,
"step": 93
},
{
"epoch": 1.2972972972972974,
"grad_norm": 2.128746271133423,
"learning_rate": 9.429195097513993e-05,
"loss": 0.5792,
"step": 96
},
{
"epoch": 1.2972972972972974,
"eval_loss": 0.7179479598999023,
"eval_runtime": 1.0504,
"eval_samples_per_second": 15.232,
"eval_steps_per_second": 3.808,
"step": 96
},
{
"epoch": 1.3378378378378377,
"grad_norm": 2.069204092025757,
"learning_rate": 9.391311339654753e-05,
"loss": 0.5502,
"step": 99
},
{
"epoch": 1.3378378378378377,
"eval_loss": 0.7083268165588379,
"eval_runtime": 1.0531,
"eval_samples_per_second": 15.193,
"eval_steps_per_second": 3.798,
"step": 99
},
{
"epoch": 1.3783783783783785,
"grad_norm": 2.069469928741455,
"learning_rate": 9.352291703266331e-05,
"loss": 0.7356,
"step": 102
},
{
"epoch": 1.3783783783783785,
"eval_loss": 0.7048563957214355,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.803,
"step": 102
},
{
"epoch": 1.4189189189189189,
"grad_norm": 1.507051706314087,
"learning_rate": 9.31214628136281e-05,
"loss": 0.5204,
"step": 105
},
{
"epoch": 1.4189189189189189,
"eval_loss": 0.6983195543289185,
"eval_runtime": 1.0543,
"eval_samples_per_second": 15.176,
"eval_steps_per_second": 3.794,
"step": 105
},
{
"epoch": 1.4594594594594594,
"grad_norm": 1.918865442276001,
"learning_rate": 9.270885458159575e-05,
"loss": 0.6132,
"step": 108
},
{
"epoch": 1.4594594594594594,
"eval_loss": 0.6857842803001404,
"eval_runtime": 1.0525,
"eval_samples_per_second": 15.202,
"eval_steps_per_second": 3.8,
"step": 108
},
{
"epoch": 1.5,
"grad_norm": 2.062997341156006,
"learning_rate": 9.228519906387288e-05,
"loss": 0.7527,
"step": 111
},
{
"epoch": 1.5,
"eval_loss": 0.6743776798248291,
"eval_runtime": 1.0512,
"eval_samples_per_second": 15.221,
"eval_steps_per_second": 3.805,
"step": 111
},
{
"epoch": 1.5405405405405406,
"grad_norm": 1.8099018335342407,
"learning_rate": 9.185060584531217e-05,
"loss": 0.6798,
"step": 114
},
{
"epoch": 1.5405405405405406,
"eval_loss": 0.6715844869613647,
"eval_runtime": 1.0529,
"eval_samples_per_second": 15.196,
"eval_steps_per_second": 3.799,
"step": 114
},
{
"epoch": 1.5810810810810811,
"grad_norm": 2.0540611743927,
"learning_rate": 9.140518733996672e-05,
"loss": 0.7266,
"step": 117
},
{
"epoch": 1.5810810810810811,
"eval_loss": 0.6656138896942139,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.204,
"eval_steps_per_second": 3.801,
"step": 117
},
{
"epoch": 1.6216216216216215,
"grad_norm": 2.3945634365081787,
"learning_rate": 9.094905876201229e-05,
"loss": 0.5347,
"step": 120
},
{
"epoch": 1.6216216216216215,
"eval_loss": 0.6710730791091919,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 120
},
{
"epoch": 1.6621621621621623,
"grad_norm": 2.006612777709961,
"learning_rate": 9.048233809594561e-05,
"loss": 0.6522,
"step": 123
},
{
"epoch": 1.6621621621621623,
"eval_loss": 0.6679877042770386,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.211,
"eval_steps_per_second": 3.803,
"step": 123
},
{
"epoch": 1.7027027027027026,
"grad_norm": 1.751696228981018,
"learning_rate": 9.000514606606581e-05,
"loss": 0.8567,
"step": 126
},
{
"epoch": 1.7027027027027026,
"eval_loss": 0.6558159589767456,
"eval_runtime": 1.0531,
"eval_samples_per_second": 15.193,
"eval_steps_per_second": 3.798,
"step": 126
},
{
"epoch": 1.7432432432432432,
"grad_norm": 1.5286139249801636,
"learning_rate": 8.951760610524724e-05,
"loss": 0.5204,
"step": 129
},
{
"epoch": 1.7432432432432432,
"eval_loss": 0.6488269567489624,
"eval_runtime": 1.0516,
"eval_samples_per_second": 15.215,
"eval_steps_per_second": 3.804,
"step": 129
},
{
"epoch": 1.7837837837837838,
"grad_norm": 2.1092898845672607,
"learning_rate": 8.901984432301185e-05,
"loss": 0.6443,
"step": 132
},
{
"epoch": 1.7837837837837838,
"eval_loss": 0.6392868161201477,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 132
},
{
"epoch": 1.8243243243243243,
"grad_norm": 1.7279053926467896,
"learning_rate": 8.851198947290894e-05,
"loss": 0.5436,
"step": 135
},
{
"epoch": 1.8243243243243243,
"eval_loss": 0.6321672201156616,
"eval_runtime": 1.0499,
"eval_samples_per_second": 15.239,
"eval_steps_per_second": 3.81,
"step": 135
},
{
"epoch": 1.864864864864865,
"grad_norm": 2.6842877864837646,
"learning_rate": 8.799417291921117e-05,
"loss": 0.6054,
"step": 138
},
{
"epoch": 1.864864864864865,
"eval_loss": 0.6346270442008972,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.198,
"eval_steps_per_second": 3.799,
"step": 138
},
{
"epoch": 1.9054054054054053,
"grad_norm": 1.9958398342132568,
"learning_rate": 8.746652860293523e-05,
"loss": 0.4488,
"step": 141
},
{
"epoch": 1.9054054054054053,
"eval_loss": 0.6389164924621582,
"eval_runtime": 1.0505,
"eval_samples_per_second": 15.231,
"eval_steps_per_second": 3.808,
"step": 141
},
{
"epoch": 1.945945945945946,
"grad_norm": 2.0705783367156982,
"learning_rate": 8.692919300719595e-05,
"loss": 0.7171,
"step": 144
},
{
"epoch": 1.945945945945946,
"eval_loss": 0.632194995880127,
"eval_runtime": 1.0537,
"eval_samples_per_second": 15.184,
"eval_steps_per_second": 3.796,
"step": 144
},
{
"epoch": 1.9864864864864864,
"grad_norm": 2.0737218856811523,
"learning_rate": 8.638230512190298e-05,
"loss": 0.5383,
"step": 147
},
{
"epoch": 1.9864864864864864,
"eval_loss": 0.6272808313369751,
"eval_runtime": 1.0507,
"eval_samples_per_second": 15.228,
"eval_steps_per_second": 3.807,
"step": 147
},
{
"epoch": 2.027027027027027,
"grad_norm": 1.6119190454483032,
"learning_rate": 8.58260064078088e-05,
"loss": 0.4812,
"step": 150
},
{
"epoch": 2.027027027027027,
"eval_loss": 0.6234598755836487,
"eval_runtime": 1.0541,
"eval_samples_per_second": 15.179,
"eval_steps_per_second": 3.795,
"step": 150
},
{
"epoch": 2.0675675675675675,
"grad_norm": 2.104738712310791,
"learning_rate": 8.526044075991802e-05,
"loss": 0.7911,
"step": 153
},
{
"epoch": 2.0675675675675675,
"eval_loss": 0.6295649409294128,
"eval_runtime": 1.0504,
"eval_samples_per_second": 15.232,
"eval_steps_per_second": 3.808,
"step": 153
},
{
"epoch": 2.108108108108108,
"grad_norm": 2.041696786880493,
"learning_rate": 8.468575447026651e-05,
"loss": 0.514,
"step": 156
},
{
"epoch": 2.108108108108108,
"eval_loss": 0.6444165706634521,
"eval_runtime": 1.0539,
"eval_samples_per_second": 15.182,
"eval_steps_per_second": 3.795,
"step": 156
},
{
"epoch": 2.1486486486486487,
"grad_norm": 1.7887616157531738,
"learning_rate": 8.410209619008101e-05,
"loss": 0.4481,
"step": 159
},
{
"epoch": 2.1486486486486487,
"eval_loss": 0.6452795267105103,
"eval_runtime": 1.0508,
"eval_samples_per_second": 15.227,
"eval_steps_per_second": 3.807,
"step": 159
},
{
"epoch": 2.189189189189189,
"grad_norm": 2.2852938175201416,
"learning_rate": 8.350961689132808e-05,
"loss": 0.3983,
"step": 162
},
{
"epoch": 2.189189189189189,
"eval_loss": 0.6356573104858398,
"eval_runtime": 1.0538,
"eval_samples_per_second": 15.183,
"eval_steps_per_second": 3.796,
"step": 162
},
{
"epoch": 2.22972972972973,
"grad_norm": 1.3814259767532349,
"learning_rate": 8.290846982766305e-05,
"loss": 0.2386,
"step": 165
},
{
"epoch": 2.22972972972973,
"eval_loss": 0.632733166217804,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 165
},
{
"epoch": 2.27027027027027,
"grad_norm": 2.624509572982788,
"learning_rate": 8.22988104947886e-05,
"loss": 0.4447,
"step": 168
},
{
"epoch": 2.27027027027027,
"eval_loss": 0.6358802318572998,
"eval_runtime": 1.0518,
"eval_samples_per_second": 15.212,
"eval_steps_per_second": 3.803,
"step": 168
},
{
"epoch": 2.310810810810811,
"grad_norm": 2.1006217002868652,
"learning_rate": 8.168079659023349e-05,
"loss": 0.4302,
"step": 171
},
{
"epoch": 2.310810810810811,
"eval_loss": 0.6386667490005493,
"eval_runtime": 1.0534,
"eval_samples_per_second": 15.188,
"eval_steps_per_second": 3.797,
"step": 171
},
{
"epoch": 2.3513513513513513,
"grad_norm": 2.631301164627075,
"learning_rate": 8.105458797256178e-05,
"loss": 0.4514,
"step": 174
},
{
"epoch": 2.3513513513513513,
"eval_loss": 0.6402238607406616,
"eval_runtime": 1.0545,
"eval_samples_per_second": 15.174,
"eval_steps_per_second": 3.793,
"step": 174
},
{
"epoch": 2.391891891891892,
"grad_norm": 1.4005826711654663,
"learning_rate": 8.04203466200229e-05,
"loss": 0.2813,
"step": 177
},
{
"epoch": 2.391891891891892,
"eval_loss": 0.6313220262527466,
"eval_runtime": 1.0541,
"eval_samples_per_second": 15.178,
"eval_steps_per_second": 3.795,
"step": 177
},
{
"epoch": 2.4324324324324325,
"grad_norm": 2.4380390644073486,
"learning_rate": 7.977823658865364e-05,
"loss": 0.4747,
"step": 180
},
{
"epoch": 2.4324324324324325,
"eval_loss": 0.6258513927459717,
"eval_runtime": 1.0533,
"eval_samples_per_second": 15.191,
"eval_steps_per_second": 3.798,
"step": 180
},
{
"epoch": 2.472972972972973,
"grad_norm": 2.3655426502227783,
"learning_rate": 7.912842396984254e-05,
"loss": 0.547,
"step": 183
},
{
"epoch": 2.472972972972973,
"eval_loss": 0.6256988048553467,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 183
},
{
"epoch": 2.5135135135135136,
"grad_norm": 1.9949471950531006,
"learning_rate": 7.847107684736792e-05,
"loss": 0.3154,
"step": 186
},
{
"epoch": 2.5135135135135136,
"eval_loss": 0.6247289776802063,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.205,
"eval_steps_per_second": 3.801,
"step": 186
},
{
"epoch": 2.554054054054054,
"grad_norm": 3.2453622817993164,
"learning_rate": 7.780636525392046e-05,
"loss": 0.5583,
"step": 189
},
{
"epoch": 2.554054054054054,
"eval_loss": 0.6129618883132935,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.803,
"step": 189
},
{
"epoch": 2.5945945945945947,
"grad_norm": 2.022986888885498,
"learning_rate": 7.713446112712169e-05,
"loss": 0.5726,
"step": 192
},
{
"epoch": 2.5945945945945947,
"eval_loss": 0.6086827516555786,
"eval_runtime": 1.0543,
"eval_samples_per_second": 15.175,
"eval_steps_per_second": 3.794,
"step": 192
},
{
"epoch": 2.635135135135135,
"grad_norm": 2.429865598678589,
"learning_rate": 7.645553826504969e-05,
"loss": 0.4701,
"step": 195
},
{
"epoch": 2.635135135135135,
"eval_loss": 0.6085944175720215,
"eval_runtime": 1.0521,
"eval_samples_per_second": 15.208,
"eval_steps_per_second": 3.802,
"step": 195
},
{
"epoch": 2.6756756756756754,
"grad_norm": 1.991803526878357,
"learning_rate": 7.576977228128376e-05,
"loss": 0.4866,
"step": 198
},
{
"epoch": 2.6756756756756754,
"eval_loss": 0.6133272647857666,
"eval_runtime": 1.0535,
"eval_samples_per_second": 15.187,
"eval_steps_per_second": 3.797,
"step": 198
},
{
"epoch": 2.7162162162162162,
"grad_norm": 2.537832021713257,
"learning_rate": 7.50773405594792e-05,
"loss": 0.4015,
"step": 201
},
{
"epoch": 2.7162162162162162,
"eval_loss": 0.6213403940200806,
"eval_runtime": 1.0524,
"eval_samples_per_second": 15.203,
"eval_steps_per_second": 3.801,
"step": 201
},
{
"epoch": 2.756756756756757,
"grad_norm": 1.758016586303711,
"learning_rate": 7.437842220748441e-05,
"loss": 0.4277,
"step": 204
},
{
"epoch": 2.756756756756757,
"eval_loss": 0.623763382434845,
"eval_runtime": 1.0527,
"eval_samples_per_second": 15.198,
"eval_steps_per_second": 3.8,
"step": 204
},
{
"epoch": 2.7972972972972974,
"grad_norm": 1.8930737972259521,
"learning_rate": 7.367319801101196e-05,
"loss": 0.3157,
"step": 207
},
{
"epoch": 2.7972972972972974,
"eval_loss": 0.6248853206634521,
"eval_runtime": 1.0562,
"eval_samples_per_second": 15.149,
"eval_steps_per_second": 3.787,
"step": 207
},
{
"epoch": 2.8378378378378377,
"grad_norm": 2.071988105773926,
"learning_rate": 7.296185038687566e-05,
"loss": 0.3883,
"step": 210
},
{
"epoch": 2.8378378378378377,
"eval_loss": 0.6209710240364075,
"eval_runtime": 1.0518,
"eval_samples_per_second": 15.212,
"eval_steps_per_second": 3.803,
"step": 210
},
{
"epoch": 2.8783783783783785,
"grad_norm": 1.579237937927246,
"learning_rate": 7.224456333580573e-05,
"loss": 0.5436,
"step": 213
},
{
"epoch": 2.8783783783783785,
"eval_loss": 0.6127223968505859,
"eval_runtime": 1.0524,
"eval_samples_per_second": 15.204,
"eval_steps_per_second": 3.801,
"step": 213
},
{
"epoch": 2.918918918918919,
"grad_norm": 2.4129927158355713,
"learning_rate": 7.152152239485419e-05,
"loss": 0.526,
"step": 216
},
{
"epoch": 2.918918918918919,
"eval_loss": 0.6055560111999512,
"eval_runtime": 1.0502,
"eval_samples_per_second": 15.236,
"eval_steps_per_second": 3.809,
"step": 216
},
{
"epoch": 2.9594594594594597,
"grad_norm": 2.252251148223877,
"learning_rate": 7.079291458940301e-05,
"loss": 0.4465,
"step": 219
},
{
"epoch": 2.9594594594594597,
"eval_loss": 0.5982283353805542,
"eval_runtime": 1.0529,
"eval_samples_per_second": 15.197,
"eval_steps_per_second": 3.799,
"step": 219
},
{
"epoch": 3.0,
"grad_norm": 1.9773114919662476,
"learning_rate": 7.005892838478711e-05,
"loss": 0.3692,
"step": 222
},
{
"epoch": 3.0,
"eval_loss": 0.5916565656661987,
"eval_runtime": 1.0501,
"eval_samples_per_second": 15.237,
"eval_steps_per_second": 3.809,
"step": 222
},
{
"epoch": 3.0405405405405403,
"grad_norm": 1.1434626579284668,
"learning_rate": 6.931975363754502e-05,
"loss": 0.3022,
"step": 225
},
{
"epoch": 3.0405405405405403,
"eval_loss": 0.5955583453178406,
"eval_runtime": 1.0535,
"eval_samples_per_second": 15.187,
"eval_steps_per_second": 3.797,
"step": 225
},
{
"epoch": 3.081081081081081,
"grad_norm": 1.9162238836288452,
"learning_rate": 6.85755815463096e-05,
"loss": 0.2875,
"step": 228
},
{
"epoch": 3.081081081081081,
"eval_loss": 0.6152929067611694,
"eval_runtime": 1.0516,
"eval_samples_per_second": 15.215,
"eval_steps_per_second": 3.804,
"step": 228
},
{
"epoch": 3.1216216216216215,
"grad_norm": 2.688631057739258,
"learning_rate": 6.782660460235174e-05,
"loss": 0.5544,
"step": 231
},
{
"epoch": 3.1216216216216215,
"eval_loss": 0.6343094110488892,
"eval_runtime": 1.052,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.802,
"step": 231
},
{
"epoch": 3.1621621621621623,
"grad_norm": 2.58313250541687,
"learning_rate": 6.707301653978945e-05,
"loss": 0.4159,
"step": 234
},
{
"epoch": 3.1621621621621623,
"eval_loss": 0.6369538307189941,
"eval_runtime": 1.0524,
"eval_samples_per_second": 15.203,
"eval_steps_per_second": 3.801,
"step": 234
},
{
"epoch": 3.2027027027027026,
"grad_norm": 2.2415409088134766,
"learning_rate": 6.63150122854758e-05,
"loss": 0.4963,
"step": 237
},
{
"epoch": 3.2027027027027026,
"eval_loss": 0.6289186477661133,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.198,
"eval_steps_per_second": 3.799,
"step": 237
},
{
"epoch": 3.2432432432432434,
"grad_norm": 2.974931240081787,
"learning_rate": 6.5552787908578e-05,
"loss": 0.3248,
"step": 240
},
{
"epoch": 3.2432432432432434,
"eval_loss": 0.6189987659454346,
"eval_runtime": 1.0515,
"eval_samples_per_second": 15.217,
"eval_steps_per_second": 3.804,
"step": 240
},
{
"epoch": 3.2837837837837838,
"grad_norm": 2.0078535079956055,
"learning_rate": 6.478654056986131e-05,
"loss": 0.349,
"step": 243
},
{
"epoch": 3.2837837837837838,
"eval_loss": 0.6110680103302002,
"eval_runtime": 1.0532,
"eval_samples_per_second": 15.192,
"eval_steps_per_second": 3.798,
"step": 243
},
{
"epoch": 3.3243243243243246,
"grad_norm": 2.6236143112182617,
"learning_rate": 6.401646847069039e-05,
"loss": 0.3107,
"step": 246
},
{
"epoch": 3.3243243243243246,
"eval_loss": 0.6120755672454834,
"eval_runtime": 1.0508,
"eval_samples_per_second": 15.227,
"eval_steps_per_second": 3.807,
"step": 246
},
{
"epoch": 3.364864864864865,
"grad_norm": 1.75555419921875,
"learning_rate": 6.32427708017615e-05,
"loss": 0.2219,
"step": 249
},
{
"epoch": 3.364864864864865,
"eval_loss": 0.6196171641349792,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.204,
"eval_steps_per_second": 3.801,
"step": 249
},
{
"epoch": 3.4054054054054053,
"grad_norm": 3.003138303756714,
"learning_rate": 6.246564769157894e-05,
"loss": 0.251,
"step": 252
},
{
"epoch": 3.4054054054054053,
"eval_loss": 0.6273298263549805,
"eval_runtime": 1.0546,
"eval_samples_per_second": 15.171,
"eval_steps_per_second": 3.793,
"step": 252
},
{
"epoch": 3.445945945945946,
"grad_norm": 2.2066917419433594,
"learning_rate": 6.168530015468872e-05,
"loss": 0.3366,
"step": 255
},
{
"epoch": 3.445945945945946,
"eval_loss": 0.6258885860443115,
"eval_runtime": 1.0514,
"eval_samples_per_second": 15.217,
"eval_steps_per_second": 3.804,
"step": 255
},
{
"epoch": 3.4864864864864864,
"grad_norm": 1.7121000289916992,
"learning_rate": 6.0901930039683184e-05,
"loss": 0.3182,
"step": 258
},
{
"epoch": 3.4864864864864864,
"eval_loss": 0.6243223547935486,
"eval_runtime": 1.0739,
"eval_samples_per_second": 14.898,
"eval_steps_per_second": 3.725,
"step": 258
},
{
"epoch": 3.527027027027027,
"grad_norm": 2.7600913047790527,
"learning_rate": 6.011573997698985e-05,
"loss": 0.4133,
"step": 261
},
{
"epoch": 3.527027027027027,
"eval_loss": 0.6259996294975281,
"eval_runtime": 1.0561,
"eval_samples_per_second": 15.151,
"eval_steps_per_second": 3.788,
"step": 261
},
{
"epoch": 3.5675675675675675,
"grad_norm": 2.611302614212036,
"learning_rate": 5.9326933326457956e-05,
"loss": 0.3297,
"step": 264
},
{
"epoch": 3.5675675675675675,
"eval_loss": 0.6303350925445557,
"eval_runtime": 1.0534,
"eval_samples_per_second": 15.189,
"eval_steps_per_second": 3.797,
"step": 264
},
{
"epoch": 3.608108108108108,
"grad_norm": 1.6527258157730103,
"learning_rate": 5.8535714124756434e-05,
"loss": 0.2276,
"step": 267
},
{
"epoch": 3.608108108108108,
"eval_loss": 0.6364917159080505,
"eval_runtime": 1.052,
"eval_samples_per_second": 15.209,
"eval_steps_per_second": 3.802,
"step": 267
},
{
"epoch": 3.6486486486486487,
"grad_norm": 1.1108059883117676,
"learning_rate": 5.774228703259678e-05,
"loss": 0.1842,
"step": 270
},
{
"epoch": 3.6486486486486487,
"eval_loss": 0.6382502317428589,
"eval_runtime": 1.0549,
"eval_samples_per_second": 15.168,
"eval_steps_per_second": 3.792,
"step": 270
},
{
"epoch": 3.689189189189189,
"grad_norm": 2.822380781173706,
"learning_rate": 5.694685728179442e-05,
"loss": 0.4961,
"step": 273
},
{
"epoch": 3.689189189189189,
"eval_loss": 0.6313918828964233,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.205,
"eval_steps_per_second": 3.801,
"step": 273
},
{
"epoch": 3.72972972972973,
"grad_norm": 2.4894397258758545,
"learning_rate": 5.6149630622182526e-05,
"loss": 0.3785,
"step": 276
},
{
"epoch": 3.72972972972973,
"eval_loss": 0.6239753365516663,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 276
},
{
"epoch": 3.77027027027027,
"grad_norm": 2.1039986610412598,
"learning_rate": 5.535081326839165e-05,
"loss": 0.2834,
"step": 279
},
{
"epoch": 3.77027027027027,
"eval_loss": 0.6189073920249939,
"eval_runtime": 1.0515,
"eval_samples_per_second": 15.217,
"eval_steps_per_second": 3.804,
"step": 279
},
{
"epoch": 3.810810810810811,
"grad_norm": 2.7096340656280518,
"learning_rate": 5.455061184650921e-05,
"loss": 0.3397,
"step": 282
},
{
"epoch": 3.810810810810811,
"eval_loss": 0.6138538122177124,
"eval_runtime": 1.0521,
"eval_samples_per_second": 15.208,
"eval_steps_per_second": 3.802,
"step": 282
},
{
"epoch": 3.8513513513513513,
"grad_norm": 2.030907154083252,
"learning_rate": 5.3749233340632674e-05,
"loss": 0.2795,
"step": 285
},
{
"epoch": 3.8513513513513513,
"eval_loss": 0.6104437708854675,
"eval_runtime": 1.0581,
"eval_samples_per_second": 15.122,
"eval_steps_per_second": 3.78,
"step": 285
},
{
"epoch": 3.891891891891892,
"grad_norm": 2.061206340789795,
"learning_rate": 5.2946885039329866e-05,
"loss": 0.3114,
"step": 288
},
{
"epoch": 3.891891891891892,
"eval_loss": 0.6077687740325928,
"eval_runtime": 1.0527,
"eval_samples_per_second": 15.199,
"eval_steps_per_second": 3.8,
"step": 288
},
{
"epoch": 3.9324324324324325,
"grad_norm": 2.062087059020996,
"learning_rate": 5.2143774482020744e-05,
"loss": 0.2395,
"step": 291
},
{
"epoch": 3.9324324324324325,
"eval_loss": 0.6111433506011963,
"eval_runtime": 1.0517,
"eval_samples_per_second": 15.214,
"eval_steps_per_second": 3.804,
"step": 291
},
{
"epoch": 3.972972972972973,
"grad_norm": 1.6344010829925537,
"learning_rate": 5.134010940529429e-05,
"loss": 0.1948,
"step": 294
},
{
"epoch": 3.972972972972973,
"eval_loss": 0.6142452955245972,
"eval_runtime": 1.0529,
"eval_samples_per_second": 15.196,
"eval_steps_per_second": 3.799,
"step": 294
},
{
"epoch": 4.013513513513513,
"grad_norm": 1.9017384052276611,
"learning_rate": 5.053609768917413e-05,
"loss": 0.2284,
"step": 297
},
{
"epoch": 4.013513513513513,
"eval_loss": 0.6194114685058594,
"eval_runtime": 1.0515,
"eval_samples_per_second": 15.217,
"eval_steps_per_second": 3.804,
"step": 297
},
{
"epoch": 4.054054054054054,
"grad_norm": 2.1609394550323486,
"learning_rate": 4.973194730334748e-05,
"loss": 0.2638,
"step": 300
},
{
"epoch": 4.054054054054054,
"eval_loss": 0.6303145885467529,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.194,
"eval_steps_per_second": 3.798,
"step": 300
},
{
"epoch": 4.094594594594595,
"grad_norm": 1.5275555849075317,
"learning_rate": 4.892786625337047e-05,
"loss": 0.252,
"step": 303
},
{
"epoch": 4.094594594594595,
"eval_loss": 0.6517325639724731,
"eval_runtime": 1.051,
"eval_samples_per_second": 15.224,
"eval_steps_per_second": 3.806,
"step": 303
},
{
"epoch": 4.135135135135135,
"grad_norm": 2.807483434677124,
"learning_rate": 4.8124062526864534e-05,
"loss": 0.183,
"step": 306
},
{
"epoch": 4.135135135135135,
"eval_loss": 0.6644703149795532,
"eval_runtime": 1.0531,
"eval_samples_per_second": 15.193,
"eval_steps_per_second": 3.798,
"step": 306
},
{
"epoch": 4.175675675675675,
"grad_norm": 2.6279256343841553,
"learning_rate": 4.7320744039717154e-05,
"loss": 0.2415,
"step": 309
},
{
"epoch": 4.175675675675675,
"eval_loss": 0.6603893041610718,
"eval_runtime": 1.0531,
"eval_samples_per_second": 15.193,
"eval_steps_per_second": 3.798,
"step": 309
},
{
"epoch": 4.216216216216216,
"grad_norm": 0.42106354236602783,
"learning_rate": 4.651811858230149e-05,
"loss": 0.1791,
"step": 312
},
{
"epoch": 4.216216216216216,
"eval_loss": 0.652984082698822,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 312
},
{
"epoch": 4.256756756756757,
"grad_norm": 2.064615249633789,
"learning_rate": 4.571639376572806e-05,
"loss": 0.2013,
"step": 315
},
{
"epoch": 4.256756756756757,
"eval_loss": 0.6488903760910034,
"eval_runtime": 1.0505,
"eval_samples_per_second": 15.23,
"eval_steps_per_second": 3.808,
"step": 315
},
{
"epoch": 4.297297297297297,
"grad_norm": 2.4248170852661133,
"learning_rate": 4.491577696814318e-05,
"loss": 0.1827,
"step": 318
},
{
"epoch": 4.297297297297297,
"eval_loss": 0.653176486492157,
"eval_runtime": 1.0536,
"eval_samples_per_second": 15.186,
"eval_steps_per_second": 3.797,
"step": 318
},
{
"epoch": 4.337837837837838,
"grad_norm": 2.055769443511963,
"learning_rate": 4.411647528108743e-05,
"loss": 0.1792,
"step": 321
},
{
"epoch": 4.337837837837838,
"eval_loss": 0.6584765315055847,
"eval_runtime": 1.052,
"eval_samples_per_second": 15.209,
"eval_steps_per_second": 3.802,
"step": 321
},
{
"epoch": 4.378378378378378,
"grad_norm": 3.4611449241638184,
"learning_rate": 4.331869545592834e-05,
"loss": 0.2568,
"step": 324
},
{
"epoch": 4.378378378378378,
"eval_loss": 0.6628451347351074,
"eval_runtime": 1.055,
"eval_samples_per_second": 15.166,
"eval_steps_per_second": 3.791,
"step": 324
},
{
"epoch": 4.418918918918919,
"grad_norm": 1.6108025312423706,
"learning_rate": 4.252264385038098e-05,
"loss": 0.1682,
"step": 327
},
{
"epoch": 4.418918918918919,
"eval_loss": 0.66502845287323,
"eval_runtime": 1.0508,
"eval_samples_per_second": 15.227,
"eval_steps_per_second": 3.807,
"step": 327
},
{
"epoch": 4.45945945945946,
"grad_norm": 1.828131914138794,
"learning_rate": 4.1728526375130614e-05,
"loss": 0.25,
"step": 330
},
{
"epoch": 4.45945945945946,
"eval_loss": 0.6729562282562256,
"eval_runtime": 1.0534,
"eval_samples_per_second": 15.189,
"eval_steps_per_second": 3.797,
"step": 330
},
{
"epoch": 4.5,
"grad_norm": 2.5057499408721924,
"learning_rate": 4.093654844057059e-05,
"loss": 0.2664,
"step": 333
},
{
"epoch": 4.5,
"eval_loss": 0.6741403937339783,
"eval_runtime": 1.052,
"eval_samples_per_second": 15.209,
"eval_steps_per_second": 3.802,
"step": 333
},
{
"epoch": 4.54054054054054,
"grad_norm": 1.6008535623550415,
"learning_rate": 4.014691490367e-05,
"loss": 0.2316,
"step": 336
},
{
"epoch": 4.54054054054054,
"eval_loss": 0.6773088574409485,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.194,
"eval_steps_per_second": 3.799,
"step": 336
},
{
"epoch": 4.581081081081081,
"grad_norm": 2.551591157913208,
"learning_rate": 3.935983001498439e-05,
"loss": 0.3467,
"step": 339
},
{
"epoch": 4.581081081081081,
"eval_loss": 0.6705477237701416,
"eval_runtime": 1.0509,
"eval_samples_per_second": 15.226,
"eval_steps_per_second": 3.806,
"step": 339
},
{
"epoch": 4.621621621621622,
"grad_norm": 2.130202054977417,
"learning_rate": 3.857549736582316e-05,
"loss": 0.2426,
"step": 342
},
{
"epoch": 4.621621621621622,
"eval_loss": 0.6681296825408936,
"eval_runtime": 1.0529,
"eval_samples_per_second": 15.196,
"eval_steps_per_second": 3.799,
"step": 342
},
{
"epoch": 4.662162162162162,
"grad_norm": 2.043670415878296,
"learning_rate": 3.7794119835587685e-05,
"loss": 0.2421,
"step": 345
},
{
"epoch": 4.662162162162162,
"eval_loss": 0.6622060537338257,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.803,
"step": 345
},
{
"epoch": 4.702702702702703,
"grad_norm": 1.9365885257720947,
"learning_rate": 3.701589953929354e-05,
"loss": 0.4063,
"step": 348
},
{
"epoch": 4.702702702702703,
"eval_loss": 0.6608781814575195,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.197,
"eval_steps_per_second": 3.799,
"step": 348
},
{
"epoch": 4.743243243243243,
"grad_norm": 2.596634864807129,
"learning_rate": 3.62410377752904e-05,
"loss": 0.2255,
"step": 351
},
{
"epoch": 4.743243243243243,
"eval_loss": 0.6569182276725769,
"eval_runtime": 1.0522,
"eval_samples_per_second": 15.206,
"eval_steps_per_second": 3.802,
"step": 351
},
{
"epoch": 4.783783783783784,
"grad_norm": 2.039332628250122,
"learning_rate": 3.546973497319319e-05,
"loss": 0.1933,
"step": 354
},
{
"epoch": 4.783783783783784,
"eval_loss": 0.6534222364425659,
"eval_runtime": 1.0498,
"eval_samples_per_second": 15.241,
"eval_steps_per_second": 3.81,
"step": 354
},
{
"epoch": 4.824324324324325,
"grad_norm": 1.994629144668579,
"learning_rate": 3.4702190642037944e-05,
"loss": 0.1975,
"step": 357
},
{
"epoch": 4.824324324324325,
"eval_loss": 0.649687647819519,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.204,
"eval_steps_per_second": 3.801,
"step": 357
},
{
"epoch": 4.864864864864865,
"grad_norm": 2.154684543609619,
"learning_rate": 3.393860331867589e-05,
"loss": 0.3065,
"step": 360
},
{
"epoch": 4.864864864864865,
"eval_loss": 0.6491411924362183,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.803,
"step": 360
},
{
"epoch": 4.905405405405405,
"grad_norm": 1.61858069896698,
"learning_rate": 3.317917051641877e-05,
"loss": 0.1641,
"step": 363
},
{
"epoch": 4.905405405405405,
"eval_loss": 0.651297926902771,
"eval_runtime": 1.0521,
"eval_samples_per_second": 15.208,
"eval_steps_per_second": 3.802,
"step": 363
},
{
"epoch": 4.945945945945946,
"grad_norm": 2.7362637519836426,
"learning_rate": 3.242408867394919e-05,
"loss": 0.2032,
"step": 366
},
{
"epoch": 4.945945945945946,
"eval_loss": 0.6552869081497192,
"eval_runtime": 1.0506,
"eval_samples_per_second": 15.229,
"eval_steps_per_second": 3.807,
"step": 366
},
{
"epoch": 4.986486486486487,
"grad_norm": 2.0567097663879395,
"learning_rate": 3.167355310450877e-05,
"loss": 0.1886,
"step": 369
},
{
"epoch": 4.986486486486487,
"eval_loss": 0.6590157747268677,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.197,
"eval_steps_per_second": 3.799,
"step": 369
},
{
"epoch": 5.027027027027027,
"grad_norm": 1.5418853759765625,
"learning_rate": 3.092775794537741e-05,
"loss": 0.2539,
"step": 372
},
{
"epoch": 5.027027027027027,
"eval_loss": 0.6676727533340454,
"eval_runtime": 1.0516,
"eval_samples_per_second": 15.215,
"eval_steps_per_second": 3.804,
"step": 372
},
{
"epoch": 5.0675675675675675,
"grad_norm": 1.229972004890442,
"learning_rate": 3.0186896107656803e-05,
"loss": 0.1464,
"step": 375
},
{
"epoch": 5.0675675675675675,
"eval_loss": 0.687861979007721,
"eval_runtime": 1.0539,
"eval_samples_per_second": 15.182,
"eval_steps_per_second": 3.796,
"step": 375
},
{
"epoch": 5.108108108108108,
"grad_norm": 2.421496868133545,
"learning_rate": 2.9451159226371095e-05,
"loss": 0.2295,
"step": 378
},
{
"epoch": 5.108108108108108,
"eval_loss": 0.7066453695297241,
"eval_runtime": 1.0503,
"eval_samples_per_second": 15.233,
"eval_steps_per_second": 3.808,
"step": 378
},
{
"epoch": 5.148648648648648,
"grad_norm": 2.3475804328918457,
"learning_rate": 2.8720737610897575e-05,
"loss": 0.1438,
"step": 381
},
{
"epoch": 5.148648648648648,
"eval_loss": 0.7166962623596191,
"eval_runtime": 1.0534,
"eval_samples_per_second": 15.189,
"eval_steps_per_second": 3.797,
"step": 381
},
{
"epoch": 5.1891891891891895,
"grad_norm": 2.2746946811676025,
"learning_rate": 2.799582019574033e-05,
"loss": 0.1603,
"step": 384
},
{
"epoch": 5.1891891891891895,
"eval_loss": 0.7134541273117065,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.211,
"eval_steps_per_second": 3.803,
"step": 384
},
{
"epoch": 5.22972972972973,
"grad_norm": 1.2550048828125,
"learning_rate": 2.7276594491659525e-05,
"loss": 0.1379,
"step": 387
},
{
"epoch": 5.22972972972973,
"eval_loss": 0.7095359563827515,
"eval_runtime": 1.0543,
"eval_samples_per_second": 15.176,
"eval_steps_per_second": 3.794,
"step": 387
},
{
"epoch": 5.27027027027027,
"grad_norm": 1.7738205194473267,
"learning_rate": 2.656324653716884e-05,
"loss": 0.2783,
"step": 390
},
{
"epoch": 5.27027027027027,
"eval_loss": 0.7103461623191833,
"eval_runtime": 1.0515,
"eval_samples_per_second": 15.216,
"eval_steps_per_second": 3.804,
"step": 390
},
{
"epoch": 5.3108108108108105,
"grad_norm": 2.2887580394744873,
"learning_rate": 2.5855960850413935e-05,
"loss": 0.1575,
"step": 393
},
{
"epoch": 5.3108108108108105,
"eval_loss": 0.7042403817176819,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.204,
"eval_steps_per_second": 3.801,
"step": 393
},
{
"epoch": 5.351351351351352,
"grad_norm": 2.6281135082244873,
"learning_rate": 2.5154920381444025e-05,
"loss": 0.1743,
"step": 396
},
{
"epoch": 5.351351351351352,
"eval_loss": 0.7114053964614868,
"eval_runtime": 1.0527,
"eval_samples_per_second": 15.199,
"eval_steps_per_second": 3.8,
"step": 396
},
{
"epoch": 5.391891891891892,
"grad_norm": 1.8125991821289062,
"learning_rate": 2.4460306464889022e-05,
"loss": 0.1168,
"step": 399
},
{
"epoch": 5.391891891891892,
"eval_loss": 0.7083012461662292,
"eval_runtime": 1.0506,
"eval_samples_per_second": 15.23,
"eval_steps_per_second": 3.807,
"step": 399
},
{
"epoch": 5.4324324324324325,
"grad_norm": 2.5157058238983154,
"learning_rate": 2.3772298773054757e-05,
"loss": 0.284,
"step": 402
},
{
"epoch": 5.4324324324324325,
"eval_loss": 0.7072416543960571,
"eval_runtime": 1.0524,
"eval_samples_per_second": 15.204,
"eval_steps_per_second": 3.801,
"step": 402
},
{
"epoch": 5.472972972972973,
"grad_norm": 0.8739199042320251,
"learning_rate": 2.309107526944792e-05,
"loss": 0.1013,
"step": 405
},
{
"epoch": 5.472972972972973,
"eval_loss": 0.7062889933586121,
"eval_runtime": 1.051,
"eval_samples_per_second": 15.223,
"eval_steps_per_second": 3.806,
"step": 405
},
{
"epoch": 5.513513513513513,
"grad_norm": 2.2809295654296875,
"learning_rate": 2.2416812162743223e-05,
"loss": 0.2612,
"step": 408
},
{
"epoch": 5.513513513513513,
"eval_loss": 0.70506751537323,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 408
},
{
"epoch": 5.554054054054054,
"grad_norm": 2.2030365467071533,
"learning_rate": 2.17496838612043e-05,
"loss": 0.1343,
"step": 411
},
{
"epoch": 5.554054054054054,
"eval_loss": 0.7102519273757935,
"eval_runtime": 1.0534,
"eval_samples_per_second": 15.188,
"eval_steps_per_second": 3.797,
"step": 411
},
{
"epoch": 5.594594594594595,
"grad_norm": 1.4592159986495972,
"learning_rate": 2.1089862927570475e-05,
"loss": 0.1009,
"step": 414
},
{
"epoch": 5.594594594594595,
"eval_loss": 0.7105306386947632,
"eval_runtime": 1.0533,
"eval_samples_per_second": 15.19,
"eval_steps_per_second": 3.797,
"step": 414
},
{
"epoch": 5.635135135135135,
"grad_norm": 2.2018954753875732,
"learning_rate": 2.0437520034420776e-05,
"loss": 0.3127,
"step": 417
},
{
"epoch": 5.635135135135135,
"eval_loss": 0.7089606523513794,
"eval_runtime": 1.0533,
"eval_samples_per_second": 15.191,
"eval_steps_per_second": 3.798,
"step": 417
},
{
"epoch": 5.675675675675675,
"grad_norm": 1.8359624147415161,
"learning_rate": 1.979282392002691e-05,
"loss": 0.1355,
"step": 420
},
{
"epoch": 5.675675675675675,
"eval_loss": 0.7059516906738281,
"eval_runtime": 1.0526,
"eval_samples_per_second": 15.201,
"eval_steps_per_second": 3.8,
"step": 420
},
{
"epoch": 5.716216216216216,
"grad_norm": 2.3145079612731934,
"learning_rate": 1.9155941344706546e-05,
"loss": 0.1345,
"step": 423
},
{
"epoch": 5.716216216216216,
"eval_loss": 0.705683171749115,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.21,
"eval_steps_per_second": 3.802,
"step": 423
},
{
"epoch": 5.756756756756757,
"grad_norm": 1.7434961795806885,
"learning_rate": 1.852703704768842e-05,
"loss": 0.1865,
"step": 426
},
{
"epoch": 5.756756756756757,
"eval_loss": 0.7038547396659851,
"eval_runtime": 1.0535,
"eval_samples_per_second": 15.188,
"eval_steps_per_second": 3.797,
"step": 426
},
{
"epoch": 5.797297297297297,
"grad_norm": 1.5850327014923096,
"learning_rate": 1.7906273704499845e-05,
"loss": 0.119,
"step": 429
},
{
"epoch": 5.797297297297297,
"eval_loss": 0.7066537737846375,
"eval_runtime": 1.0521,
"eval_samples_per_second": 15.208,
"eval_steps_per_second": 3.802,
"step": 429
},
{
"epoch": 5.837837837837838,
"grad_norm": 1.599552035331726,
"learning_rate": 1.7293811884888344e-05,
"loss": 0.149,
"step": 432
},
{
"epoch": 5.837837837837838,
"eval_loss": 0.7120293974876404,
"eval_runtime": 1.0536,
"eval_samples_per_second": 15.185,
"eval_steps_per_second": 3.796,
"step": 432
},
{
"epoch": 5.878378378378378,
"grad_norm": 1.8353303670883179,
"learning_rate": 1.6689810011287932e-05,
"loss": 0.1748,
"step": 435
},
{
"epoch": 5.878378378378378,
"eval_loss": 0.7123138308525085,
"eval_runtime": 1.0524,
"eval_samples_per_second": 15.203,
"eval_steps_per_second": 3.801,
"step": 435
},
{
"epoch": 5.918918918918919,
"grad_norm": 1.4937026500701904,
"learning_rate": 1.6094424317840723e-05,
"loss": 0.1781,
"step": 438
},
{
"epoch": 5.918918918918919,
"eval_loss": 0.7113088965415955,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.198,
"eval_steps_per_second": 3.799,
"step": 438
},
{
"epoch": 5.95945945945946,
"grad_norm": 2.0092716217041016,
"learning_rate": 1.550780880998456e-05,
"loss": 0.2075,
"step": 441
},
{
"epoch": 5.95945945945946,
"eval_loss": 0.7117879390716553,
"eval_runtime": 1.0532,
"eval_samples_per_second": 15.192,
"eval_steps_per_second": 3.798,
"step": 441
},
{
"epoch": 6.0,
"grad_norm": 2.762338161468506,
"learning_rate": 1.4930115224617353e-05,
"loss": 0.1591,
"step": 444
},
{
"epoch": 6.0,
"eval_loss": 0.7111848592758179,
"eval_runtime": 1.0522,
"eval_samples_per_second": 15.206,
"eval_steps_per_second": 3.801,
"step": 444
},
{
"epoch": 6.04054054054054,
"grad_norm": 1.825244665145874,
"learning_rate": 1.436149299084789e-05,
"loss": 0.1224,
"step": 447
},
{
"epoch": 6.04054054054054,
"eval_loss": 0.7117843627929688,
"eval_runtime": 1.0529,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 447
},
{
"epoch": 6.081081081081081,
"grad_norm": 0.9274085760116577,
"learning_rate": 1.380208919134392e-05,
"loss": 0.2234,
"step": 450
},
{
"epoch": 6.081081081081081,
"eval_loss": 0.7170644402503967,
"eval_runtime": 1.0513,
"eval_samples_per_second": 15.219,
"eval_steps_per_second": 3.805,
"step": 450
},
{
"epoch": 6.121621621621622,
"grad_norm": 1.5220532417297363,
"learning_rate": 1.3252048524286842e-05,
"loss": 0.1165,
"step": 453
},
{
"epoch": 6.121621621621622,
"eval_loss": 0.7227377891540527,
"eval_runtime": 1.0532,
"eval_samples_per_second": 15.191,
"eval_steps_per_second": 3.798,
"step": 453
},
{
"epoch": 6.162162162162162,
"grad_norm": 1.669662594795227,
"learning_rate": 1.271151326594352e-05,
"loss": 0.2518,
"step": 456
},
{
"epoch": 6.162162162162162,
"eval_loss": 0.7325636148452759,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.205,
"eval_steps_per_second": 3.801,
"step": 456
},
{
"epoch": 6.202702702702703,
"grad_norm": 1.6538748741149902,
"learning_rate": 1.2180623233864253e-05,
"loss": 0.1288,
"step": 459
},
{
"epoch": 6.202702702702703,
"eval_loss": 0.7430564165115356,
"eval_runtime": 1.0597,
"eval_samples_per_second": 15.099,
"eval_steps_per_second": 3.775,
"step": 459
},
{
"epoch": 6.243243243243243,
"grad_norm": 1.5836577415466309,
"learning_rate": 1.1659515750716955e-05,
"loss": 0.1176,
"step": 462
},
{
"epoch": 6.243243243243243,
"eval_loss": 0.7481391429901123,
"eval_runtime": 1.0512,
"eval_samples_per_second": 15.221,
"eval_steps_per_second": 3.805,
"step": 462
},
{
"epoch": 6.283783783783784,
"grad_norm": 1.0982418060302734,
"learning_rate": 1.1148325608766585e-05,
"loss": 0.1231,
"step": 465
},
{
"epoch": 6.283783783783784,
"eval_loss": 0.7511347532272339,
"eval_runtime": 1.0552,
"eval_samples_per_second": 15.163,
"eval_steps_per_second": 3.791,
"step": 465
},
{
"epoch": 6.324324324324325,
"grad_norm": 1.9232176542282104,
"learning_rate": 1.0647185035009038e-05,
"loss": 0.146,
"step": 468
},
{
"epoch": 6.324324324324325,
"eval_loss": 0.7529792785644531,
"eval_runtime": 1.0535,
"eval_samples_per_second": 15.188,
"eval_steps_per_second": 3.797,
"step": 468
},
{
"epoch": 6.364864864864865,
"grad_norm": 2.5786333084106445,
"learning_rate": 1.0156223656968694e-05,
"loss": 0.1169,
"step": 471
},
{
"epoch": 6.364864864864865,
"eval_loss": 0.7518468499183655,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.205,
"eval_steps_per_second": 3.801,
"step": 471
},
{
"epoch": 6.405405405405405,
"grad_norm": 1.4718759059906006,
"learning_rate": 9.675568469168388e-06,
"loss": 0.1048,
"step": 474
},
{
"epoch": 6.405405405405405,
"eval_loss": 0.7540909051895142,
"eval_runtime": 1.049,
"eval_samples_per_second": 15.253,
"eval_steps_per_second": 3.813,
"step": 474
},
{
"epoch": 6.445945945945946,
"grad_norm": 1.3492368459701538,
"learning_rate": 9.205343800280219e-06,
"loss": 0.1092,
"step": 477
},
{
"epoch": 6.445945945945946,
"eval_loss": 0.750686764717102,
"eval_runtime": 1.0533,
"eval_samples_per_second": 15.19,
"eval_steps_per_second": 3.798,
"step": 477
},
{
"epoch": 6.486486486486487,
"grad_norm": 2.10587739944458,
"learning_rate": 8.745671280966177e-06,
"loss": 0.1458,
"step": 480
},
{
"epoch": 6.486486486486487,
"eval_loss": 0.7518497705459595,
"eval_runtime": 1.0499,
"eval_samples_per_second": 15.239,
"eval_steps_per_second": 3.81,
"step": 480
},
{
"epoch": 6.527027027027027,
"grad_norm": 0.8871177434921265,
"learning_rate": 8.296669812416547e-06,
"loss": 0.2177,
"step": 483
},
{
"epoch": 6.527027027027027,
"eval_loss": 0.7509324550628662,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.198,
"eval_steps_per_second": 3.8,
"step": 483
},
{
"epoch": 6.5675675675675675,
"grad_norm": 1.299116611480713,
"learning_rate": 7.858455535594306e-06,
"loss": 0.1585,
"step": 486
},
{
"epoch": 6.5675675675675675,
"eval_loss": 0.7509753108024597,
"eval_runtime": 1.0507,
"eval_samples_per_second": 15.228,
"eval_steps_per_second": 3.807,
"step": 486
},
{
"epoch": 6.608108108108108,
"grad_norm": 1.8996071815490723,
"learning_rate": 7.431141801193508e-06,
"loss": 0.1337,
"step": 489
},
{
"epoch": 6.608108108108108,
"eval_loss": 0.7546273469924927,
"eval_runtime": 1.0538,
"eval_samples_per_second": 15.183,
"eval_steps_per_second": 3.796,
"step": 489
},
{
"epoch": 6.648648648648649,
"grad_norm": 2.193199634552002,
"learning_rate": 7.014839140319485e-06,
"loss": 0.122,
"step": 492
},
{
"epoch": 6.648648648648649,
"eval_loss": 0.7523775100708008,
"eval_runtime": 1.0517,
"eval_samples_per_second": 15.213,
"eval_steps_per_second": 3.803,
"step": 492
},
{
"epoch": 6.6891891891891895,
"grad_norm": 1.310517430305481,
"learning_rate": 6.609655235898227e-06,
"loss": 0.0793,
"step": 495
},
{
"epoch": 6.6891891891891895,
"eval_loss": 0.7553800344467163,
"eval_runtime": 1.0524,
"eval_samples_per_second": 15.203,
"eval_steps_per_second": 3.801,
"step": 495
},
{
"epoch": 6.72972972972973,
"grad_norm": 1.7615861892700195,
"learning_rate": 6.215694894822699e-06,
"loss": 0.1544,
"step": 498
},
{
"epoch": 6.72972972972973,
"eval_loss": 0.7521288394927979,
"eval_runtime": 1.0505,
"eval_samples_per_second": 15.231,
"eval_steps_per_second": 3.808,
"step": 498
},
{
"epoch": 6.77027027027027,
"grad_norm": 1.4952490329742432,
"learning_rate": 5.83306002084284e-06,
"loss": 0.1387,
"step": 501
},
{
"epoch": 6.77027027027027,
"eval_loss": 0.7528640627861023,
"eval_runtime": 1.052,
"eval_samples_per_second": 15.209,
"eval_steps_per_second": 3.802,
"step": 501
},
{
"epoch": 6.8108108108108105,
"grad_norm": 1.7409045696258545,
"learning_rate": 5.461849588206724e-06,
"loss": 0.1253,
"step": 504
},
{
"epoch": 6.8108108108108105,
"eval_loss": 0.7528926134109497,
"eval_runtime": 1.059,
"eval_samples_per_second": 15.108,
"eval_steps_per_second": 3.777,
"step": 504
},
{
"epoch": 6.851351351351351,
"grad_norm": 0.7362686395645142,
"learning_rate": 5.102159616059365e-06,
"loss": 0.1296,
"step": 507
},
{
"epoch": 6.851351351351351,
"eval_loss": 0.7542049884796143,
"eval_runtime": 1.0521,
"eval_samples_per_second": 15.207,
"eval_steps_per_second": 3.802,
"step": 507
},
{
"epoch": 6.891891891891892,
"grad_norm": 0.806505560874939,
"learning_rate": 4.754083143605869e-06,
"loss": 0.1094,
"step": 510
},
{
"epoch": 6.891891891891892,
"eval_loss": 0.7515612840652466,
"eval_runtime": 1.0559,
"eval_samples_per_second": 15.152,
"eval_steps_per_second": 3.788,
"step": 510
},
{
"epoch": 6.9324324324324325,
"grad_norm": 1.5709373950958252,
"learning_rate": 4.417710206045533e-06,
"loss": 0.1009,
"step": 513
},
{
"epoch": 6.9324324324324325,
"eval_loss": 0.751240611076355,
"eval_runtime": 1.0523,
"eval_samples_per_second": 15.205,
"eval_steps_per_second": 3.801,
"step": 513
},
{
"epoch": 6.972972972972973,
"grad_norm": 1.2641761302947998,
"learning_rate": 4.093127811282821e-06,
"loss": 0.1871,
"step": 516
},
{
"epoch": 6.972972972972973,
"eval_loss": 0.7525576949119568,
"eval_runtime": 1.0539,
"eval_samples_per_second": 15.181,
"eval_steps_per_second": 3.795,
"step": 516
},
{
"epoch": 7.013513513513513,
"grad_norm": 0.9734938144683838,
"learning_rate": 3.7804199174215183e-06,
"loss": 0.1017,
"step": 519
},
{
"epoch": 7.013513513513513,
"eval_loss": 0.7537960410118103,
"eval_runtime": 1.0511,
"eval_samples_per_second": 15.222,
"eval_steps_per_second": 3.805,
"step": 519
},
{
"epoch": 7.054054054054054,
"grad_norm": 1.4745818376541138,
"learning_rate": 3.479667411047677e-06,
"loss": 0.1536,
"step": 522
},
{
"epoch": 7.054054054054054,
"eval_loss": 0.7529079914093018,
"eval_runtime": 1.0543,
"eval_samples_per_second": 15.176,
"eval_steps_per_second": 3.794,
"step": 522
},
{
"epoch": 7.094594594594595,
"grad_norm": 1.0725492238998413,
"learning_rate": 3.1909480863070884e-06,
"loss": 0.0886,
"step": 525
},
{
"epoch": 7.094594594594595,
"eval_loss": 0.7565038204193115,
"eval_runtime": 1.0511,
"eval_samples_per_second": 15.222,
"eval_steps_per_second": 3.806,
"step": 525
},
{
"epoch": 7.135135135135135,
"grad_norm": 1.1345540285110474,
"learning_rate": 2.9143366247826598e-06,
"loss": 0.0983,
"step": 528
},
{
"epoch": 7.135135135135135,
"eval_loss": 0.7576066255569458,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.198,
"eval_steps_per_second": 3.799,
"step": 528
},
{
"epoch": 7.175675675675675,
"grad_norm": 1.122189998626709,
"learning_rate": 2.6499045761769315e-06,
"loss": 0.084,
"step": 531
},
{
"epoch": 7.175675675675675,
"eval_loss": 0.758578896522522,
"eval_runtime": 1.0508,
"eval_samples_per_second": 15.227,
"eval_steps_per_second": 3.807,
"step": 531
},
{
"epoch": 7.216216216216216,
"grad_norm": 1.6193064451217651,
"learning_rate": 2.397720339804649e-06,
"loss": 0.099,
"step": 534
},
{
"epoch": 7.216216216216216,
"eval_loss": 0.7563527822494507,
"eval_runtime": 1.0563,
"eval_samples_per_second": 15.147,
"eval_steps_per_second": 3.787,
"step": 534
},
{
"epoch": 7.256756756756757,
"grad_norm": 1.373356580734253,
"learning_rate": 2.1578491469002373e-06,
"loss": 0.1089,
"step": 537
},
{
"epoch": 7.256756756756757,
"eval_loss": 0.7592064142227173,
"eval_runtime": 1.0528,
"eval_samples_per_second": 15.197,
"eval_steps_per_second": 3.799,
"step": 537
},
{
"epoch": 7.297297297297297,
"grad_norm": 1.1875869035720825,
"learning_rate": 1.9303530437448035e-06,
"loss": 0.1145,
"step": 540
},
{
"epoch": 7.297297297297297,
"eval_loss": 0.7611518502235413,
"eval_runtime": 1.0529,
"eval_samples_per_second": 15.196,
"eval_steps_per_second": 3.799,
"step": 540
},
{
"epoch": 7.337837837837838,
"grad_norm": 1.8787821531295776,
"learning_rate": 1.7152908756169262e-06,
"loss": 0.1823,
"step": 543
},
{
"epoch": 7.337837837837838,
"eval_loss": 0.7614726424217224,
"eval_runtime": 1.0548,
"eval_samples_per_second": 15.168,
"eval_steps_per_second": 3.792,
"step": 543
},
{
"epoch": 7.378378378378378,
"grad_norm": 1.9469506740570068,
"learning_rate": 1.5127182715714006e-06,
"loss": 0.2784,
"step": 546
},
{
"epoch": 7.378378378378378,
"eval_loss": 0.7602246999740601,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.194,
"eval_steps_per_second": 3.799,
"step": 546
},
{
"epoch": 7.418918918918919,
"grad_norm": 1.6328327655792236,
"learning_rate": 1.3226876300500123e-06,
"loss": 0.0887,
"step": 549
},
{
"epoch": 7.418918918918919,
"eval_loss": 0.7616763114929199,
"eval_runtime": 1.0504,
"eval_samples_per_second": 15.232,
"eval_steps_per_second": 3.808,
"step": 549
},
{
"epoch": 7.45945945945946,
"grad_norm": 1.5713064670562744,
"learning_rate": 1.1452481053278396e-06,
"loss": 0.1133,
"step": 552
},
{
"epoch": 7.45945945945946,
"eval_loss": 0.7640103101730347,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 552
},
{
"epoch": 7.5,
"grad_norm": 1.5901539325714111,
"learning_rate": 9.804455947988067e-07,
"loss": 0.1207,
"step": 555
},
{
"epoch": 7.5,
"eval_loss": 0.7629836797714233,
"eval_runtime": 1.0516,
"eval_samples_per_second": 15.216,
"eval_steps_per_second": 3.804,
"step": 555
},
{
"epoch": 7.54054054054054,
"grad_norm": 1.5648808479309082,
"learning_rate": 8.283227271035976e-07,
"loss": 0.0954,
"step": 558
},
{
"epoch": 7.54054054054054,
"eval_loss": 0.7643275260925293,
"eval_runtime": 1.0548,
"eval_samples_per_second": 15.169,
"eval_steps_per_second": 3.792,
"step": 558
},
{
"epoch": 7.581081081081081,
"grad_norm": 1.6403340101242065,
"learning_rate": 6.889188511031542e-07,
"loss": 0.1135,
"step": 561
},
{
"epoch": 7.581081081081081,
"eval_loss": 0.7628697156906128,
"eval_runtime": 1.0531,
"eval_samples_per_second": 15.194,
"eval_steps_per_second": 3.798,
"step": 561
},
{
"epoch": 7.621621621621622,
"grad_norm": 1.393983244895935,
"learning_rate": 5.622700257004676e-07,
"loss": 0.096,
"step": 564
},
{
"epoch": 7.621621621621622,
"eval_loss": 0.7637063264846802,
"eval_runtime": 1.0544,
"eval_samples_per_second": 15.174,
"eval_steps_per_second": 3.793,
"step": 564
},
{
"epoch": 7.662162162162162,
"grad_norm": 1.2016361951828003,
"learning_rate": 4.484090105134231e-07,
"loss": 0.1088,
"step": 567
},
{
"epoch": 7.662162162162162,
"eval_loss": 0.7655338048934937,
"eval_runtime": 1.0534,
"eval_samples_per_second": 15.189,
"eval_steps_per_second": 3.797,
"step": 567
},
{
"epoch": 7.702702702702703,
"grad_norm": 1.1388864517211914,
"learning_rate": 3.4736525740104444e-07,
"loss": 0.1628,
"step": 570
},
{
"epoch": 7.702702702702703,
"eval_loss": 0.7655097842216492,
"eval_runtime": 1.053,
"eval_samples_per_second": 15.195,
"eval_steps_per_second": 3.799,
"step": 570
},
{
"epoch": 7.743243243243243,
"grad_norm": 1.9650497436523438,
"learning_rate": 2.591649028453047e-07,
"loss": 0.1431,
"step": 573
},
{
"epoch": 7.743243243243243,
"eval_loss": 0.7649960517883301,
"eval_runtime": 1.0519,
"eval_samples_per_second": 15.211,
"eval_steps_per_second": 3.803,
"step": 573
},
{
"epoch": 7.783783783783784,
"grad_norm": 1.7549225091934204,
"learning_rate": 1.8383076119053432e-07,
"loss": 0.1034,
"step": 576
},
{
"epoch": 7.783783783783784,
"eval_loss": 0.763870358467102,
"eval_runtime": 1.0529,
"eval_samples_per_second": 15.196,
"eval_steps_per_second": 3.799,
"step": 576
},
{
"epoch": 7.824324324324325,
"grad_norm": 1.7549595832824707,
"learning_rate": 1.2138231874217475e-07,
"loss": 0.181,
"step": 579
},
{
"epoch": 7.824324324324325,
"eval_loss": 0.7637079358100891,
"eval_runtime": 1.0546,
"eval_samples_per_second": 15.172,
"eval_steps_per_second": 3.793,
"step": 579
},
{
"epoch": 7.864864864864865,
"grad_norm": 1.3891515731811523,
"learning_rate": 7.183572872632715e-08,
"loss": 0.062,
"step": 582
},
{
"epoch": 7.864864864864865,
"eval_loss": 0.7649126052856445,
"eval_runtime": 1.0509,
"eval_samples_per_second": 15.225,
"eval_steps_per_second": 3.806,
"step": 582
},
{
"epoch": 7.905405405405405,
"grad_norm": 1.0669249296188354,
"learning_rate": 3.5203807111489074e-08,
"loss": 0.0769,
"step": 585
},
{
"epoch": 7.905405405405405,
"eval_loss": 0.7653980255126953,
"eval_runtime": 1.0536,
"eval_samples_per_second": 15.185,
"eval_steps_per_second": 3.796,
"step": 585
},
{
"epoch": 7.945945945945946,
"grad_norm": 2.3302104473114014,
"learning_rate": 1.1496029293511789e-08,
"loss": 0.1951,
"step": 588
},
{
"epoch": 7.945945945945946,
"eval_loss": 0.7646524906158447,
"eval_runtime": 1.0566,
"eval_samples_per_second": 15.143,
"eval_steps_per_second": 3.786,
"step": 588
},
{
"epoch": 7.986486486486487,
"grad_norm": 1.9744952917099,
"learning_rate": 7.185276446441958e-10,
"loss": 0.1175,
"step": 591
},
{
"epoch": 7.986486486486487,
"eval_loss": 0.765015721321106,
"eval_runtime": 1.0522,
"eval_samples_per_second": 15.206,
"eval_steps_per_second": 3.801,
"step": 591
},
{
"epoch": 8.0,
"step": 592,
"total_flos": 1188976147968000.0,
"train_loss": 0.4161318518926163,
"train_runtime": 741.424,
"train_samples_per_second": 3.194,
"train_steps_per_second": 0.798
}
],
"logging_steps": 3,
"max_steps": 592,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1188976147968000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}