gte-small-pairscore / trainer_state.json
youssefkhalil320's picture
Upload folder using huggingface_hub
01cce3b verified
{
"best_metric": 0.823376476764679,
"best_model_checkpoint": "./gte-small-pairscore/checkpoint-38500",
"epoch": 2.4142471938295604,
"eval_steps": 100,
"global_step": 38500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006270771932024832,
"grad_norm": 8.922538757324219,
"learning_rate": 3.135287662643047e-07,
"loss": 6.3669,
"step": 100
},
{
"epoch": 0.006270771932024832,
"eval_loss": 6.6512651443481445,
"eval_runtime": 499.3604,
"eval_samples_per_second": 255.471,
"eval_steps_per_second": 7.984,
"step": 100
},
{
"epoch": 0.012541543864049664,
"grad_norm": 8.918222427368164,
"learning_rate": 6.270575325286094e-07,
"loss": 6.1795,
"step": 200
},
{
"epoch": 0.012541543864049664,
"eval_loss": 6.254122734069824,
"eval_runtime": 501.3941,
"eval_samples_per_second": 254.435,
"eval_steps_per_second": 7.952,
"step": 200
},
{
"epoch": 0.018812315796074498,
"grad_norm": 10.144318580627441,
"learning_rate": 9.405862987929143e-07,
"loss": 5.893,
"step": 300
},
{
"epoch": 0.018812315796074498,
"eval_loss": 5.773278713226318,
"eval_runtime": 495.3793,
"eval_samples_per_second": 257.524,
"eval_steps_per_second": 8.048,
"step": 300
},
{
"epoch": 0.025083087728099328,
"grad_norm": 9.239725112915039,
"learning_rate": 1.2541150650572189e-06,
"loss": 5.5099,
"step": 400
},
{
"epoch": 0.025083087728099328,
"eval_loss": 5.362614154815674,
"eval_runtime": 502.6963,
"eval_samples_per_second": 253.776,
"eval_steps_per_second": 7.931,
"step": 400
},
{
"epoch": 0.03135385966012416,
"grad_norm": 9.863190650939941,
"learning_rate": 1.5676438313215239e-06,
"loss": 5.1589,
"step": 500
},
{
"epoch": 0.03135385966012416,
"eval_loss": 4.990196228027344,
"eval_runtime": 497.9208,
"eval_samples_per_second": 256.209,
"eval_steps_per_second": 8.007,
"step": 500
},
{
"epoch": 0.037624631592148995,
"grad_norm": 13.765459060668945,
"learning_rate": 1.8811725975858286e-06,
"loss": 4.8599,
"step": 600
},
{
"epoch": 0.037624631592148995,
"eval_loss": 4.652267932891846,
"eval_runtime": 502.9499,
"eval_samples_per_second": 253.648,
"eval_steps_per_second": 7.927,
"step": 600
},
{
"epoch": 0.04389540352417383,
"grad_norm": 12.12948226928711,
"learning_rate": 2.1915660761874904e-06,
"loss": 4.6075,
"step": 700
},
{
"epoch": 0.04389540352417383,
"eval_loss": 4.423278331756592,
"eval_runtime": 500.0391,
"eval_samples_per_second": 255.124,
"eval_steps_per_second": 7.973,
"step": 700
},
{
"epoch": 0.050166175456198656,
"grad_norm": 19.26013946533203,
"learning_rate": 2.505094842451795e-06,
"loss": 4.3831,
"step": 800
},
{
"epoch": 0.050166175456198656,
"eval_loss": 4.243069171905518,
"eval_runtime": 502.9493,
"eval_samples_per_second": 253.648,
"eval_steps_per_second": 7.927,
"step": 800
},
{
"epoch": 0.05643694738822349,
"grad_norm": 15.950737953186035,
"learning_rate": 2.8186236087161e-06,
"loss": 4.1737,
"step": 900
},
{
"epoch": 0.05643694738822349,
"eval_loss": 4.134955883026123,
"eval_runtime": 500.339,
"eval_samples_per_second": 254.971,
"eval_steps_per_second": 7.969,
"step": 900
},
{
"epoch": 0.06270771932024832,
"grad_norm": 22.265525817871094,
"learning_rate": 3.1290170873177617e-06,
"loss": 4.0266,
"step": 1000
},
{
"epoch": 0.06270771932024832,
"eval_loss": 4.0326995849609375,
"eval_runtime": 498.5852,
"eval_samples_per_second": 255.868,
"eval_steps_per_second": 7.997,
"step": 1000
},
{
"epoch": 0.06897849125227315,
"grad_norm": 25.777463912963867,
"learning_rate": 3.4425458535820665e-06,
"loss": 3.9526,
"step": 1100
},
{
"epoch": 0.06897849125227315,
"eval_loss": 3.928138256072998,
"eval_runtime": 500.7885,
"eval_samples_per_second": 254.742,
"eval_steps_per_second": 7.961,
"step": 1100
},
{
"epoch": 0.07524926318429799,
"grad_norm": 23.928476333618164,
"learning_rate": 3.756074619846371e-06,
"loss": 3.8773,
"step": 1200
},
{
"epoch": 0.07524926318429799,
"eval_loss": 3.873471260070801,
"eval_runtime": 500.5002,
"eval_samples_per_second": 254.889,
"eval_steps_per_second": 7.966,
"step": 1200
},
{
"epoch": 0.08152003511632282,
"grad_norm": 33.25246810913086,
"learning_rate": 4.069603386110676e-06,
"loss": 3.7856,
"step": 1300
},
{
"epoch": 0.08152003511632282,
"eval_loss": 3.777860164642334,
"eval_runtime": 502.3139,
"eval_samples_per_second": 253.969,
"eval_steps_per_second": 7.937,
"step": 1300
},
{
"epoch": 0.08779080704834766,
"grad_norm": 29.081541061401367,
"learning_rate": 4.383132152374981e-06,
"loss": 3.5994,
"step": 1400
},
{
"epoch": 0.08779080704834766,
"eval_loss": 3.705378532409668,
"eval_runtime": 500.945,
"eval_samples_per_second": 254.663,
"eval_steps_per_second": 7.959,
"step": 1400
},
{
"epoch": 0.09406157898037248,
"grad_norm": 32.77998352050781,
"learning_rate": 4.696660918639286e-06,
"loss": 3.7067,
"step": 1500
},
{
"epoch": 0.09406157898037248,
"eval_loss": 3.615516185760498,
"eval_runtime": 495.8157,
"eval_samples_per_second": 257.297,
"eval_steps_per_second": 8.041,
"step": 1500
},
{
"epoch": 0.10033235091239731,
"grad_norm": 80.04820251464844,
"learning_rate": 5.01018968490359e-06,
"loss": 3.5471,
"step": 1600
},
{
"epoch": 0.10033235091239731,
"eval_loss": 3.57977032661438,
"eval_runtime": 494.2912,
"eval_samples_per_second": 258.091,
"eval_steps_per_second": 8.066,
"step": 1600
},
{
"epoch": 0.10660312284442215,
"grad_norm": 32.79664611816406,
"learning_rate": 5.323718451167895e-06,
"loss": 3.6679,
"step": 1700
},
{
"epoch": 0.10660312284442215,
"eval_loss": 3.4653944969177246,
"eval_runtime": 487.8092,
"eval_samples_per_second": 261.52,
"eval_steps_per_second": 8.173,
"step": 1700
},
{
"epoch": 0.11287389477644698,
"grad_norm": 52.47648620605469,
"learning_rate": 5.6372472174322e-06,
"loss": 3.4484,
"step": 1800
},
{
"epoch": 0.11287389477644698,
"eval_loss": 3.4174623489379883,
"eval_runtime": 488.4427,
"eval_samples_per_second": 261.181,
"eval_steps_per_second": 8.163,
"step": 1800
},
{
"epoch": 0.11914466670847182,
"grad_norm": 56.593727111816406,
"learning_rate": 5.947640696033862e-06,
"loss": 3.377,
"step": 1900
},
{
"epoch": 0.11914466670847182,
"eval_loss": 3.412893056869507,
"eval_runtime": 487.7362,
"eval_samples_per_second": 261.559,
"eval_steps_per_second": 8.175,
"step": 1900
},
{
"epoch": 0.12541543864049665,
"grad_norm": 47.65892028808594,
"learning_rate": 6.2611694622981665e-06,
"loss": 3.4259,
"step": 2000
},
{
"epoch": 0.12541543864049665,
"eval_loss": 3.3347389698028564,
"eval_runtime": 485.6244,
"eval_samples_per_second": 262.697,
"eval_steps_per_second": 8.21,
"step": 2000
},
{
"epoch": 0.13168621057252147,
"grad_norm": 46.80497741699219,
"learning_rate": 6.574698228562471e-06,
"loss": 3.4832,
"step": 2100
},
{
"epoch": 0.13168621057252147,
"eval_loss": 3.2113163471221924,
"eval_runtime": 485.3601,
"eval_samples_per_second": 262.84,
"eval_steps_per_second": 8.215,
"step": 2100
},
{
"epoch": 0.1379569825045463,
"grad_norm": 61.08994674682617,
"learning_rate": 6.888226994826775e-06,
"loss": 3.3043,
"step": 2200
},
{
"epoch": 0.1379569825045463,
"eval_loss": 3.164067268371582,
"eval_runtime": 486.9272,
"eval_samples_per_second": 261.994,
"eval_steps_per_second": 8.188,
"step": 2200
},
{
"epoch": 0.14422775443657115,
"grad_norm": 54.029170989990234,
"learning_rate": 7.201755761091081e-06,
"loss": 3.2344,
"step": 2300
},
{
"epoch": 0.14422775443657115,
"eval_loss": 3.1528868675231934,
"eval_runtime": 485.6647,
"eval_samples_per_second": 262.675,
"eval_steps_per_second": 8.209,
"step": 2300
},
{
"epoch": 0.15049852636859598,
"grad_norm": 51.89152526855469,
"learning_rate": 7.5152845273553855e-06,
"loss": 3.1238,
"step": 2400
},
{
"epoch": 0.15049852636859598,
"eval_loss": 3.2577104568481445,
"eval_runtime": 483.4268,
"eval_samples_per_second": 263.891,
"eval_steps_per_second": 8.247,
"step": 2400
},
{
"epoch": 0.1567692983006208,
"grad_norm": 42.53623962402344,
"learning_rate": 7.828813293619691e-06,
"loss": 3.1456,
"step": 2500
},
{
"epoch": 0.1567692983006208,
"eval_loss": 3.067770481109619,
"eval_runtime": 483.4427,
"eval_samples_per_second": 263.882,
"eval_steps_per_second": 8.247,
"step": 2500
},
{
"epoch": 0.16304007023264563,
"grad_norm": 40.32865905761719,
"learning_rate": 8.142342059883996e-06,
"loss": 3.0223,
"step": 2600
},
{
"epoch": 0.16304007023264563,
"eval_loss": 3.000631332397461,
"eval_runtime": 490.1586,
"eval_samples_per_second": 260.267,
"eval_steps_per_second": 8.134,
"step": 2600
},
{
"epoch": 0.16931084216467046,
"grad_norm": 52.34198760986328,
"learning_rate": 8.4558708261483e-06,
"loss": 3.2046,
"step": 2700
},
{
"epoch": 0.16931084216467046,
"eval_loss": 2.9682161808013916,
"eval_runtime": 501.7817,
"eval_samples_per_second": 254.238,
"eval_steps_per_second": 7.946,
"step": 2700
},
{
"epoch": 0.17558161409669532,
"grad_norm": 54.570518493652344,
"learning_rate": 8.766264304749962e-06,
"loss": 3.0866,
"step": 2800
},
{
"epoch": 0.17558161409669532,
"eval_loss": 3.0524070262908936,
"eval_runtime": 492.0201,
"eval_samples_per_second": 259.282,
"eval_steps_per_second": 8.103,
"step": 2800
},
{
"epoch": 0.18185238602872014,
"grad_norm": 91.97798156738281,
"learning_rate": 9.079793071014266e-06,
"loss": 2.9271,
"step": 2900
},
{
"epoch": 0.18185238602872014,
"eval_loss": 3.057253122329712,
"eval_runtime": 483.8055,
"eval_samples_per_second": 263.684,
"eval_steps_per_second": 8.241,
"step": 2900
},
{
"epoch": 0.18812315796074497,
"grad_norm": 72.72432708740234,
"learning_rate": 9.393321837278571e-06,
"loss": 2.7692,
"step": 3000
},
{
"epoch": 0.18812315796074497,
"eval_loss": 3.0557968616485596,
"eval_runtime": 481.3118,
"eval_samples_per_second": 265.051,
"eval_steps_per_second": 8.284,
"step": 3000
},
{
"epoch": 0.1943939298927698,
"grad_norm": 66.55966186523438,
"learning_rate": 9.706850603542876e-06,
"loss": 3.1498,
"step": 3100
},
{
"epoch": 0.1943939298927698,
"eval_loss": 2.786630153656006,
"eval_runtime": 470.7517,
"eval_samples_per_second": 270.996,
"eval_steps_per_second": 8.469,
"step": 3100
},
{
"epoch": 0.20066470182479462,
"grad_norm": 60.73588180541992,
"learning_rate": 1.002037936980718e-05,
"loss": 3.0683,
"step": 3200
},
{
"epoch": 0.20066470182479462,
"eval_loss": 2.847790002822876,
"eval_runtime": 479.7323,
"eval_samples_per_second": 265.923,
"eval_steps_per_second": 8.311,
"step": 3200
},
{
"epoch": 0.20693547375681948,
"grad_norm": 138.51925659179688,
"learning_rate": 1.0333908136071484e-05,
"loss": 2.5776,
"step": 3300
},
{
"epoch": 0.20693547375681948,
"eval_loss": 2.9458932876586914,
"eval_runtime": 476.173,
"eval_samples_per_second": 267.911,
"eval_steps_per_second": 8.373,
"step": 3300
},
{
"epoch": 0.2132062456888443,
"grad_norm": 44.883033752441406,
"learning_rate": 1.064743690233579e-05,
"loss": 2.9394,
"step": 3400
},
{
"epoch": 0.2132062456888443,
"eval_loss": 2.7133240699768066,
"eval_runtime": 484.1605,
"eval_samples_per_second": 263.491,
"eval_steps_per_second": 8.235,
"step": 3400
},
{
"epoch": 0.21947701762086913,
"grad_norm": 62.664493560791016,
"learning_rate": 1.0960965668600095e-05,
"loss": 2.6996,
"step": 3500
},
{
"epoch": 0.21947701762086913,
"eval_loss": 2.8582112789154053,
"eval_runtime": 486.4854,
"eval_samples_per_second": 262.232,
"eval_steps_per_second": 8.196,
"step": 3500
},
{
"epoch": 0.22574778955289396,
"grad_norm": 121.68364715576172,
"learning_rate": 1.12744944348644e-05,
"loss": 2.569,
"step": 3600
},
{
"epoch": 0.22574778955289396,
"eval_loss": 2.8092362880706787,
"eval_runtime": 488.2917,
"eval_samples_per_second": 261.262,
"eval_steps_per_second": 8.165,
"step": 3600
},
{
"epoch": 0.23201856148491878,
"grad_norm": 62.47746658325195,
"learning_rate": 1.1588023201128705e-05,
"loss": 2.6535,
"step": 3700
},
{
"epoch": 0.23201856148491878,
"eval_loss": 2.7977445125579834,
"eval_runtime": 489.6066,
"eval_samples_per_second": 260.56,
"eval_steps_per_second": 8.143,
"step": 3700
},
{
"epoch": 0.23828933341694364,
"grad_norm": 61.19312286376953,
"learning_rate": 1.1901551967393011e-05,
"loss": 2.6679,
"step": 3800
},
{
"epoch": 0.23828933341694364,
"eval_loss": 2.8578476905822754,
"eval_runtime": 490.8833,
"eval_samples_per_second": 259.883,
"eval_steps_per_second": 8.122,
"step": 3800
},
{
"epoch": 0.24456010534896847,
"grad_norm": 52.05066680908203,
"learning_rate": 1.2215080733657314e-05,
"loss": 2.592,
"step": 3900
},
{
"epoch": 0.24456010534896847,
"eval_loss": 2.8251442909240723,
"eval_runtime": 487.5816,
"eval_samples_per_second": 261.642,
"eval_steps_per_second": 8.177,
"step": 3900
},
{
"epoch": 0.2508308772809933,
"grad_norm": 72.0737533569336,
"learning_rate": 1.2525474212258977e-05,
"loss": 2.4931,
"step": 4000
},
{
"epoch": 0.2508308772809933,
"eval_loss": 2.5975987911224365,
"eval_runtime": 483.614,
"eval_samples_per_second": 263.789,
"eval_steps_per_second": 8.244,
"step": 4000
},
{
"epoch": 0.25710164921301815,
"grad_norm": 142.94813537597656,
"learning_rate": 1.283900297852328e-05,
"loss": 2.3012,
"step": 4100
},
{
"epoch": 0.25710164921301815,
"eval_loss": 2.926022529602051,
"eval_runtime": 482.5775,
"eval_samples_per_second": 264.355,
"eval_steps_per_second": 8.262,
"step": 4100
},
{
"epoch": 0.26337242114504295,
"grad_norm": 188.10948181152344,
"learning_rate": 1.3152531744787585e-05,
"loss": 2.4728,
"step": 4200
},
{
"epoch": 0.26337242114504295,
"eval_loss": 2.7869389057159424,
"eval_runtime": 484.6597,
"eval_samples_per_second": 263.22,
"eval_steps_per_second": 8.226,
"step": 4200
},
{
"epoch": 0.2696431930770678,
"grad_norm": 88.77122497558594,
"learning_rate": 1.3466060511051891e-05,
"loss": 2.4391,
"step": 4300
},
{
"epoch": 0.2696431930770678,
"eval_loss": 2.898672580718994,
"eval_runtime": 488.7638,
"eval_samples_per_second": 261.009,
"eval_steps_per_second": 8.157,
"step": 4300
},
{
"epoch": 0.2759139650090926,
"grad_norm": 24.550174713134766,
"learning_rate": 1.3779589277316194e-05,
"loss": 2.3825,
"step": 4400
},
{
"epoch": 0.2759139650090926,
"eval_loss": 2.7803783416748047,
"eval_runtime": 479.1092,
"eval_samples_per_second": 266.269,
"eval_steps_per_second": 8.322,
"step": 4400
},
{
"epoch": 0.28218473694111745,
"grad_norm": 262.1357727050781,
"learning_rate": 1.40931180435805e-05,
"loss": 2.6257,
"step": 4500
},
{
"epoch": 0.28218473694111745,
"eval_loss": 2.8308775424957275,
"eval_runtime": 481.1992,
"eval_samples_per_second": 265.113,
"eval_steps_per_second": 8.286,
"step": 4500
},
{
"epoch": 0.2884555088731423,
"grad_norm": 106.33345794677734,
"learning_rate": 1.4406646809844804e-05,
"loss": 2.4304,
"step": 4600
},
{
"epoch": 0.2884555088731423,
"eval_loss": 3.241865873336792,
"eval_runtime": 480.8967,
"eval_samples_per_second": 265.279,
"eval_steps_per_second": 8.291,
"step": 4600
},
{
"epoch": 0.2947262808051671,
"grad_norm": 24.162464141845703,
"learning_rate": 1.472017557610911e-05,
"loss": 3.0246,
"step": 4700
},
{
"epoch": 0.2947262808051671,
"eval_loss": 2.5731775760650635,
"eval_runtime": 478.865,
"eval_samples_per_second": 266.405,
"eval_steps_per_second": 8.326,
"step": 4700
},
{
"epoch": 0.30099705273719196,
"grad_norm": 89.0951919555664,
"learning_rate": 1.5033704342373415e-05,
"loss": 2.6894,
"step": 4800
},
{
"epoch": 0.30099705273719196,
"eval_loss": 2.8057522773742676,
"eval_runtime": 469.6303,
"eval_samples_per_second": 271.643,
"eval_steps_per_second": 8.49,
"step": 4800
},
{
"epoch": 0.30726782466921676,
"grad_norm": 10.197610855102539,
"learning_rate": 1.5347233108637716e-05,
"loss": 2.5333,
"step": 4900
},
{
"epoch": 0.30726782466921676,
"eval_loss": 2.4581992626190186,
"eval_runtime": 480.2298,
"eval_samples_per_second": 265.648,
"eval_steps_per_second": 8.302,
"step": 4900
},
{
"epoch": 0.3135385966012416,
"grad_norm": 10.737910270690918,
"learning_rate": 1.5660761874902023e-05,
"loss": 2.3268,
"step": 5000
},
{
"epoch": 0.3135385966012416,
"eval_loss": 2.8622071743011475,
"eval_runtime": 477.9553,
"eval_samples_per_second": 266.912,
"eval_steps_per_second": 8.342,
"step": 5000
},
{
"epoch": 0.31980936853326647,
"grad_norm": 107.68405151367188,
"learning_rate": 1.597429064116633e-05,
"loss": 2.6996,
"step": 5100
},
{
"epoch": 0.31980936853326647,
"eval_loss": 2.751514196395874,
"eval_runtime": 479.8929,
"eval_samples_per_second": 265.834,
"eval_steps_per_second": 8.308,
"step": 5100
},
{
"epoch": 0.32608014046529127,
"grad_norm": 137.2300567626953,
"learning_rate": 1.6287819407430632e-05,
"loss": 2.8175,
"step": 5200
},
{
"epoch": 0.32608014046529127,
"eval_loss": 2.5842323303222656,
"eval_runtime": 473.9066,
"eval_samples_per_second": 269.192,
"eval_steps_per_second": 8.413,
"step": 5200
},
{
"epoch": 0.3323509123973161,
"grad_norm": 30.23833465576172,
"learning_rate": 1.660134817369494e-05,
"loss": 2.1244,
"step": 5300
},
{
"epoch": 0.3323509123973161,
"eval_loss": 2.725175380706787,
"eval_runtime": 479.3619,
"eval_samples_per_second": 266.129,
"eval_steps_per_second": 8.317,
"step": 5300
},
{
"epoch": 0.3386216843293409,
"grad_norm": 66.7165756225586,
"learning_rate": 1.6914876939959242e-05,
"loss": 2.7331,
"step": 5400
},
{
"epoch": 0.3386216843293409,
"eval_loss": 2.5052876472473145,
"eval_runtime": 482.2091,
"eval_samples_per_second": 264.557,
"eval_steps_per_second": 8.268,
"step": 5400
},
{
"epoch": 0.3448924562613658,
"grad_norm": 20.616701126098633,
"learning_rate": 1.722840570622355e-05,
"loss": 2.3226,
"step": 5500
},
{
"epoch": 0.3448924562613658,
"eval_loss": 2.2429914474487305,
"eval_runtime": 483.8758,
"eval_samples_per_second": 263.646,
"eval_steps_per_second": 8.24,
"step": 5500
},
{
"epoch": 0.35116322819339063,
"grad_norm": 10.704608917236328,
"learning_rate": 1.754193447248785e-05,
"loss": 2.0706,
"step": 5600
},
{
"epoch": 0.35116322819339063,
"eval_loss": 2.6055426597595215,
"eval_runtime": 481.5222,
"eval_samples_per_second": 264.935,
"eval_steps_per_second": 8.28,
"step": 5600
},
{
"epoch": 0.35743400012541543,
"grad_norm": 116.87510681152344,
"learning_rate": 1.7852327951089514e-05,
"loss": 2.2461,
"step": 5700
},
{
"epoch": 0.35743400012541543,
"eval_loss": 2.894943952560425,
"eval_runtime": 482.8194,
"eval_samples_per_second": 264.223,
"eval_steps_per_second": 8.258,
"step": 5700
},
{
"epoch": 0.3637047720574403,
"grad_norm": 75.6421890258789,
"learning_rate": 1.8165856717353817e-05,
"loss": 2.6365,
"step": 5800
},
{
"epoch": 0.3637047720574403,
"eval_loss": 2.5271661281585693,
"eval_runtime": 486.8779,
"eval_samples_per_second": 262.021,
"eval_steps_per_second": 8.189,
"step": 5800
},
{
"epoch": 0.3699755439894651,
"grad_norm": 25.05718231201172,
"learning_rate": 1.8479385483618124e-05,
"loss": 2.7119,
"step": 5900
},
{
"epoch": 0.3699755439894651,
"eval_loss": 2.433084011077881,
"eval_runtime": 486.349,
"eval_samples_per_second": 262.305,
"eval_steps_per_second": 8.198,
"step": 5900
},
{
"epoch": 0.37624631592148994,
"grad_norm": 88.68294525146484,
"learning_rate": 1.8792914249882427e-05,
"loss": 2.6146,
"step": 6000
},
{
"epoch": 0.37624631592148994,
"eval_loss": 2.385845899581909,
"eval_runtime": 485.6975,
"eval_samples_per_second": 262.657,
"eval_steps_per_second": 8.209,
"step": 6000
},
{
"epoch": 0.3825170878535148,
"grad_norm": 185.446533203125,
"learning_rate": 1.9106443016146733e-05,
"loss": 2.1998,
"step": 6100
},
{
"epoch": 0.3825170878535148,
"eval_loss": 2.6891462802886963,
"eval_runtime": 488.7846,
"eval_samples_per_second": 260.998,
"eval_steps_per_second": 8.157,
"step": 6100
},
{
"epoch": 0.3887878597855396,
"grad_norm": 105.49547576904297,
"learning_rate": 1.9419971782411036e-05,
"loss": 2.5076,
"step": 6200
},
{
"epoch": 0.3887878597855396,
"eval_loss": 2.3827390670776367,
"eval_runtime": 493.7693,
"eval_samples_per_second": 258.364,
"eval_steps_per_second": 8.075,
"step": 6200
},
{
"epoch": 0.39505863171756445,
"grad_norm": 20.25705909729004,
"learning_rate": 1.9733500548675343e-05,
"loss": 2.5244,
"step": 6300
},
{
"epoch": 0.39505863171756445,
"eval_loss": 2.6522157192230225,
"eval_runtime": 500.1844,
"eval_samples_per_second": 255.05,
"eval_steps_per_second": 7.971,
"step": 6300
},
{
"epoch": 0.40132940364958924,
"grad_norm": 1.7094597816467285,
"learning_rate": 1.9994774338518353e-05,
"loss": 2.0613,
"step": 6400
},
{
"epoch": 0.40132940364958924,
"eval_loss": 2.4750421047210693,
"eval_runtime": 501.9454,
"eval_samples_per_second": 254.155,
"eval_steps_per_second": 7.943,
"step": 6400
},
{
"epoch": 0.4076001755816141,
"grad_norm": 2.9270060062408447,
"learning_rate": 1.995993659530736e-05,
"loss": 2.465,
"step": 6500
},
{
"epoch": 0.4076001755816141,
"eval_loss": 2.525411367416382,
"eval_runtime": 501.3359,
"eval_samples_per_second": 254.464,
"eval_steps_per_second": 7.953,
"step": 6500
},
{
"epoch": 0.41387094751363895,
"grad_norm": 95.53108215332031,
"learning_rate": 1.9925098852096362e-05,
"loss": 2.3201,
"step": 6600
},
{
"epoch": 0.41387094751363895,
"eval_loss": 2.2248587608337402,
"eval_runtime": 495.8221,
"eval_samples_per_second": 257.294,
"eval_steps_per_second": 8.041,
"step": 6600
},
{
"epoch": 0.42014171944566375,
"grad_norm": 19.441762924194336,
"learning_rate": 1.9890261108885365e-05,
"loss": 2.234,
"step": 6700
},
{
"epoch": 0.42014171944566375,
"eval_loss": 2.5168297290802,
"eval_runtime": 497.6361,
"eval_samples_per_second": 256.356,
"eval_steps_per_second": 8.012,
"step": 6700
},
{
"epoch": 0.4264124913776886,
"grad_norm": 13.225996017456055,
"learning_rate": 1.985542336567437e-05,
"loss": 2.1277,
"step": 6800
},
{
"epoch": 0.4264124913776886,
"eval_loss": 2.5358171463012695,
"eval_runtime": 498.1476,
"eval_samples_per_second": 256.093,
"eval_steps_per_second": 8.004,
"step": 6800
},
{
"epoch": 0.4326832633097134,
"grad_norm": 103.2215347290039,
"learning_rate": 1.9820585622463378e-05,
"loss": 2.3801,
"step": 6900
},
{
"epoch": 0.4326832633097134,
"eval_loss": 2.4991824626922607,
"eval_runtime": 500.8539,
"eval_samples_per_second": 254.709,
"eval_steps_per_second": 7.96,
"step": 6900
},
{
"epoch": 0.43895403524173826,
"grad_norm": 97.55316925048828,
"learning_rate": 1.9785747879252384e-05,
"loss": 2.1443,
"step": 7000
},
{
"epoch": 0.43895403524173826,
"eval_loss": 2.4043357372283936,
"eval_runtime": 499.124,
"eval_samples_per_second": 255.592,
"eval_steps_per_second": 7.988,
"step": 7000
},
{
"epoch": 0.4452248071737631,
"grad_norm": 233.4646759033203,
"learning_rate": 1.9750910136041387e-05,
"loss": 1.9136,
"step": 7100
},
{
"epoch": 0.4452248071737631,
"eval_loss": 2.3874008655548096,
"eval_runtime": 497.0926,
"eval_samples_per_second": 256.636,
"eval_steps_per_second": 8.021,
"step": 7100
},
{
"epoch": 0.4514955791057879,
"grad_norm": 111.59117889404297,
"learning_rate": 1.9716072392830394e-05,
"loss": 2.3067,
"step": 7200
},
{
"epoch": 0.4514955791057879,
"eval_loss": 2.647474765777588,
"eval_runtime": 497.6919,
"eval_samples_per_second": 256.327,
"eval_steps_per_second": 8.011,
"step": 7200
},
{
"epoch": 0.45776635103781277,
"grad_norm": 156.6864776611328,
"learning_rate": 1.96812346496194e-05,
"loss": 2.1464,
"step": 7300
},
{
"epoch": 0.45776635103781277,
"eval_loss": 2.4704177379608154,
"eval_runtime": 500.9739,
"eval_samples_per_second": 254.648,
"eval_steps_per_second": 7.958,
"step": 7300
},
{
"epoch": 0.46403712296983757,
"grad_norm": 22.16613006591797,
"learning_rate": 1.9646396906408406e-05,
"loss": 2.2151,
"step": 7400
},
{
"epoch": 0.46403712296983757,
"eval_loss": 2.519892692565918,
"eval_runtime": 501.5459,
"eval_samples_per_second": 254.358,
"eval_steps_per_second": 7.949,
"step": 7400
},
{
"epoch": 0.4703078949018624,
"grad_norm": 251.9285125732422,
"learning_rate": 1.961155916319741e-05,
"loss": 2.4653,
"step": 7500
},
{
"epoch": 0.4703078949018624,
"eval_loss": 2.529334545135498,
"eval_runtime": 497.5167,
"eval_samples_per_second": 256.418,
"eval_steps_per_second": 8.014,
"step": 7500
},
{
"epoch": 0.4765786668338873,
"grad_norm": 222.22967529296875,
"learning_rate": 1.9576721419986416e-05,
"loss": 2.4425,
"step": 7600
},
{
"epoch": 0.4765786668338873,
"eval_loss": 2.126385450363159,
"eval_runtime": 501.9759,
"eval_samples_per_second": 254.14,
"eval_steps_per_second": 7.943,
"step": 7600
},
{
"epoch": 0.4828494387659121,
"grad_norm": 135.4093780517578,
"learning_rate": 1.954188367677542e-05,
"loss": 2.3138,
"step": 7700
},
{
"epoch": 0.4828494387659121,
"eval_loss": 2.18104887008667,
"eval_runtime": 499.0397,
"eval_samples_per_second": 255.635,
"eval_steps_per_second": 7.989,
"step": 7700
},
{
"epoch": 0.48912021069793693,
"grad_norm": 16.642980575561523,
"learning_rate": 1.9507045933564425e-05,
"loss": 2.247,
"step": 7800
},
{
"epoch": 0.48912021069793693,
"eval_loss": 2.1403872966766357,
"eval_runtime": 487.5913,
"eval_samples_per_second": 261.637,
"eval_steps_per_second": 8.177,
"step": 7800
},
{
"epoch": 0.49539098262996173,
"grad_norm": 163.52439880371094,
"learning_rate": 1.947220819035343e-05,
"loss": 2.1621,
"step": 7900
},
{
"epoch": 0.49539098262996173,
"eval_loss": 2.2122886180877686,
"eval_runtime": 498.6473,
"eval_samples_per_second": 255.836,
"eval_steps_per_second": 7.996,
"step": 7900
},
{
"epoch": 0.5016617545619866,
"grad_norm": 0.11034490168094635,
"learning_rate": 1.9437370447142438e-05,
"loss": 2.1338,
"step": 8000
},
{
"epoch": 0.5016617545619866,
"eval_loss": 2.5108418464660645,
"eval_runtime": 497.4577,
"eval_samples_per_second": 256.448,
"eval_steps_per_second": 8.015,
"step": 8000
},
{
"epoch": 0.5079325264940114,
"grad_norm": 73.57258605957031,
"learning_rate": 1.940253270393144e-05,
"loss": 2.1846,
"step": 8100
},
{
"epoch": 0.5079325264940114,
"eval_loss": 2.149299383163452,
"eval_runtime": 500.0715,
"eval_samples_per_second": 255.108,
"eval_steps_per_second": 7.973,
"step": 8100
},
{
"epoch": 0.5142032984260363,
"grad_norm": 71.24880981445312,
"learning_rate": 1.9367694960720447e-05,
"loss": 2.1167,
"step": 8200
},
{
"epoch": 0.5142032984260363,
"eval_loss": 2.287858486175537,
"eval_runtime": 500.8627,
"eval_samples_per_second": 254.705,
"eval_steps_per_second": 7.96,
"step": 8200
},
{
"epoch": 0.520474070358061,
"grad_norm": 138.7628936767578,
"learning_rate": 1.933285721750945e-05,
"loss": 2.2143,
"step": 8300
},
{
"epoch": 0.520474070358061,
"eval_loss": 2.1663804054260254,
"eval_runtime": 501.8798,
"eval_samples_per_second": 254.188,
"eval_steps_per_second": 7.944,
"step": 8300
},
{
"epoch": 0.5267448422900859,
"grad_norm": 17.088781356811523,
"learning_rate": 1.9298019474298456e-05,
"loss": 2.3152,
"step": 8400
},
{
"epoch": 0.5267448422900859,
"eval_loss": 2.1071760654449463,
"eval_runtime": 495.7183,
"eval_samples_per_second": 257.348,
"eval_steps_per_second": 8.043,
"step": 8400
},
{
"epoch": 0.5330156142221107,
"grad_norm": 267.6972351074219,
"learning_rate": 1.9263181731087462e-05,
"loss": 1.7618,
"step": 8500
},
{
"epoch": 0.5330156142221107,
"eval_loss": 2.032350540161133,
"eval_runtime": 498.1752,
"eval_samples_per_second": 256.079,
"eval_steps_per_second": 8.003,
"step": 8500
},
{
"epoch": 0.5392863861541356,
"grad_norm": 117.72229766845703,
"learning_rate": 1.922834398787647e-05,
"loss": 2.0777,
"step": 8600
},
{
"epoch": 0.5392863861541356,
"eval_loss": 2.4468319416046143,
"eval_runtime": 495.2364,
"eval_samples_per_second": 257.598,
"eval_steps_per_second": 8.051,
"step": 8600
},
{
"epoch": 0.5455571580861605,
"grad_norm": 1.1375752687454224,
"learning_rate": 1.9193506244665472e-05,
"loss": 2.1573,
"step": 8700
},
{
"epoch": 0.5455571580861605,
"eval_loss": 2.2053027153015137,
"eval_runtime": 485.0279,
"eval_samples_per_second": 263.02,
"eval_steps_per_second": 8.22,
"step": 8700
},
{
"epoch": 0.5518279300181852,
"grad_norm": 244.1565704345703,
"learning_rate": 1.9158668501454478e-05,
"loss": 1.9831,
"step": 8800
},
{
"epoch": 0.5518279300181852,
"eval_loss": 2.3276798725128174,
"eval_runtime": 499.4822,
"eval_samples_per_second": 255.409,
"eval_steps_per_second": 7.982,
"step": 8800
},
{
"epoch": 0.55809870195021,
"grad_norm": 69.60086059570312,
"learning_rate": 1.912383075824348e-05,
"loss": 1.9083,
"step": 8900
},
{
"epoch": 0.55809870195021,
"eval_loss": 1.9949347972869873,
"eval_runtime": 495.1776,
"eval_samples_per_second": 257.629,
"eval_steps_per_second": 8.052,
"step": 8900
},
{
"epoch": 0.5643694738822349,
"grad_norm": 1.5420753955841064,
"learning_rate": 1.9088993015032487e-05,
"loss": 1.932,
"step": 9000
},
{
"epoch": 0.5643694738822349,
"eval_loss": 1.9848002195358276,
"eval_runtime": 498.1194,
"eval_samples_per_second": 256.107,
"eval_steps_per_second": 8.004,
"step": 9000
},
{
"epoch": 0.5706402458142598,
"grad_norm": 9.46451473236084,
"learning_rate": 1.9054155271821494e-05,
"loss": 2.3223,
"step": 9100
},
{
"epoch": 0.5706402458142598,
"eval_loss": 1.9191622734069824,
"eval_runtime": 495.6516,
"eval_samples_per_second": 257.382,
"eval_steps_per_second": 8.044,
"step": 9100
},
{
"epoch": 0.5769110177462846,
"grad_norm": 22.84164047241211,
"learning_rate": 1.90193175286105e-05,
"loss": 1.7583,
"step": 9200
},
{
"epoch": 0.5769110177462846,
"eval_loss": 2.0066075325012207,
"eval_runtime": 497.8537,
"eval_samples_per_second": 256.244,
"eval_steps_per_second": 8.008,
"step": 9200
},
{
"epoch": 0.5831817896783094,
"grad_norm": 3.343338966369629,
"learning_rate": 1.8984479785399503e-05,
"loss": 1.6394,
"step": 9300
},
{
"epoch": 0.5831817896783094,
"eval_loss": 2.0322048664093018,
"eval_runtime": 485.1706,
"eval_samples_per_second": 262.943,
"eval_steps_per_second": 8.218,
"step": 9300
},
{
"epoch": 0.5894525616103342,
"grad_norm": 13.116720199584961,
"learning_rate": 1.894964204218851e-05,
"loss": 1.973,
"step": 9400
},
{
"epoch": 0.5894525616103342,
"eval_loss": 2.100987195968628,
"eval_runtime": 492.3424,
"eval_samples_per_second": 259.112,
"eval_steps_per_second": 8.098,
"step": 9400
},
{
"epoch": 0.5957233335423591,
"grad_norm": 136.55160522460938,
"learning_rate": 1.8914804298977512e-05,
"loss": 2.2377,
"step": 9500
},
{
"epoch": 0.5957233335423591,
"eval_loss": 2.11759090423584,
"eval_runtime": 495.0098,
"eval_samples_per_second": 257.716,
"eval_steps_per_second": 8.054,
"step": 9500
},
{
"epoch": 0.6019941054743839,
"grad_norm": 18.554906845092773,
"learning_rate": 1.887996655576652e-05,
"loss": 2.2269,
"step": 9600
},
{
"epoch": 0.6019941054743839,
"eval_loss": 2.002722978591919,
"eval_runtime": 498.3286,
"eval_samples_per_second": 256.0,
"eval_steps_per_second": 8.001,
"step": 9600
},
{
"epoch": 0.6082648774064088,
"grad_norm": 118.95328521728516,
"learning_rate": 1.8845477189987635e-05,
"loss": 1.971,
"step": 9700
},
{
"epoch": 0.6082648774064088,
"eval_loss": 1.9329177141189575,
"eval_runtime": 493.8369,
"eval_samples_per_second": 258.328,
"eval_steps_per_second": 8.074,
"step": 9700
},
{
"epoch": 0.6145356493384335,
"grad_norm": 0.6365923285484314,
"learning_rate": 1.881063944677664e-05,
"loss": 1.8982,
"step": 9800
},
{
"epoch": 0.6145356493384335,
"eval_loss": 1.9797492027282715,
"eval_runtime": 496.6699,
"eval_samples_per_second": 256.855,
"eval_steps_per_second": 8.027,
"step": 9800
},
{
"epoch": 0.6208064212704584,
"grad_norm": 88.75743865966797,
"learning_rate": 1.8775801703565644e-05,
"loss": 2.2853,
"step": 9900
},
{
"epoch": 0.6208064212704584,
"eval_loss": 1.8433477878570557,
"eval_runtime": 502.8332,
"eval_samples_per_second": 253.706,
"eval_steps_per_second": 7.929,
"step": 9900
},
{
"epoch": 0.6270771932024832,
"grad_norm": 2.4320499897003174,
"learning_rate": 1.874096396035465e-05,
"loss": 1.6657,
"step": 10000
},
{
"epoch": 0.6270771932024832,
"eval_loss": 2.0090935230255127,
"eval_runtime": 487.5525,
"eval_samples_per_second": 261.658,
"eval_steps_per_second": 8.178,
"step": 10000
},
{
"epoch": 0.6333479651345081,
"grad_norm": 94.46017456054688,
"learning_rate": 1.8706126217143653e-05,
"loss": 2.0732,
"step": 10100
},
{
"epoch": 0.6333479651345081,
"eval_loss": 1.7602357864379883,
"eval_runtime": 476.6373,
"eval_samples_per_second": 267.65,
"eval_steps_per_second": 8.365,
"step": 10100
},
{
"epoch": 0.6396187370665329,
"grad_norm": 186.80111694335938,
"learning_rate": 1.867128847393266e-05,
"loss": 1.6951,
"step": 10200
},
{
"epoch": 0.6396187370665329,
"eval_loss": 1.8849464654922485,
"eval_runtime": 494.2546,
"eval_samples_per_second": 258.11,
"eval_steps_per_second": 8.067,
"step": 10200
},
{
"epoch": 0.6458895089985577,
"grad_norm": 175.21151733398438,
"learning_rate": 1.8636450730721666e-05,
"loss": 1.6548,
"step": 10300
},
{
"epoch": 0.6458895089985577,
"eval_loss": 2.0065879821777344,
"eval_runtime": 492.3572,
"eval_samples_per_second": 259.105,
"eval_steps_per_second": 8.098,
"step": 10300
},
{
"epoch": 0.6521602809305825,
"grad_norm": 1.4941706657409668,
"learning_rate": 1.8601612987510672e-05,
"loss": 1.7187,
"step": 10400
},
{
"epoch": 0.6521602809305825,
"eval_loss": 1.9644232988357544,
"eval_runtime": 502.7154,
"eval_samples_per_second": 253.766,
"eval_steps_per_second": 7.931,
"step": 10400
},
{
"epoch": 0.6584310528626074,
"grad_norm": 2.646362543106079,
"learning_rate": 1.8566775244299675e-05,
"loss": 2.1948,
"step": 10500
},
{
"epoch": 0.6584310528626074,
"eval_loss": 1.8391690254211426,
"eval_runtime": 478.8123,
"eval_samples_per_second": 266.434,
"eval_steps_per_second": 8.327,
"step": 10500
},
{
"epoch": 0.6647018247946322,
"grad_norm": 47.12744140625,
"learning_rate": 1.8531937501088682e-05,
"loss": 1.9756,
"step": 10600
},
{
"epoch": 0.6647018247946322,
"eval_loss": 1.8404371738433838,
"eval_runtime": 487.2993,
"eval_samples_per_second": 261.794,
"eval_steps_per_second": 8.182,
"step": 10600
},
{
"epoch": 0.6709725967266571,
"grad_norm": 0.8185029029846191,
"learning_rate": 1.8497099757877685e-05,
"loss": 1.7644,
"step": 10700
},
{
"epoch": 0.6709725967266571,
"eval_loss": 1.910104751586914,
"eval_runtime": 487.8167,
"eval_samples_per_second": 261.516,
"eval_steps_per_second": 8.173,
"step": 10700
},
{
"epoch": 0.6772433686586818,
"grad_norm": 115.93226623535156,
"learning_rate": 1.846226201466669e-05,
"loss": 1.6295,
"step": 10800
},
{
"epoch": 0.6772433686586818,
"eval_loss": 1.943991780281067,
"eval_runtime": 489.7195,
"eval_samples_per_second": 260.5,
"eval_steps_per_second": 8.141,
"step": 10800
},
{
"epoch": 0.6835141405907067,
"grad_norm": 112.30680084228516,
"learning_rate": 1.8427772648887807e-05,
"loss": 1.7687,
"step": 10900
},
{
"epoch": 0.6835141405907067,
"eval_loss": 1.903131127357483,
"eval_runtime": 492.5747,
"eval_samples_per_second": 258.99,
"eval_steps_per_second": 8.094,
"step": 10900
},
{
"epoch": 0.6897849125227316,
"grad_norm": 105.42852783203125,
"learning_rate": 1.839293490567681e-05,
"loss": 1.8203,
"step": 11000
},
{
"epoch": 0.6897849125227316,
"eval_loss": 1.9650237560272217,
"eval_runtime": 495.7922,
"eval_samples_per_second": 257.309,
"eval_steps_per_second": 8.042,
"step": 11000
},
{
"epoch": 0.6960556844547564,
"grad_norm": 143.69154357910156,
"learning_rate": 1.8358097162465817e-05,
"loss": 2.3055,
"step": 11100
},
{
"epoch": 0.6960556844547564,
"eval_loss": 1.8432321548461914,
"eval_runtime": 494.4248,
"eval_samples_per_second": 258.021,
"eval_steps_per_second": 8.064,
"step": 11100
},
{
"epoch": 0.7023264563867813,
"grad_norm": 55.0260009765625,
"learning_rate": 1.8323259419254823e-05,
"loss": 1.8294,
"step": 11200
},
{
"epoch": 0.7023264563867813,
"eval_loss": 1.7363530397415161,
"eval_runtime": 498.3754,
"eval_samples_per_second": 255.976,
"eval_steps_per_second": 8.0,
"step": 11200
},
{
"epoch": 0.708597228318806,
"grad_norm": 20.01657485961914,
"learning_rate": 1.828842167604383e-05,
"loss": 2.0026,
"step": 11300
},
{
"epoch": 0.708597228318806,
"eval_loss": 1.789391279220581,
"eval_runtime": 487.8486,
"eval_samples_per_second": 261.499,
"eval_steps_per_second": 8.173,
"step": 11300
},
{
"epoch": 0.7148680002508309,
"grad_norm": 103.39813232421875,
"learning_rate": 1.8253583932832832e-05,
"loss": 1.9916,
"step": 11400
},
{
"epoch": 0.7148680002508309,
"eval_loss": 1.8342993259429932,
"eval_runtime": 493.6206,
"eval_samples_per_second": 258.441,
"eval_steps_per_second": 8.077,
"step": 11400
},
{
"epoch": 0.7211387721828557,
"grad_norm": 24.854759216308594,
"learning_rate": 1.821874618962184e-05,
"loss": 1.8698,
"step": 11500
},
{
"epoch": 0.7211387721828557,
"eval_loss": 1.807905673980713,
"eval_runtime": 494.1372,
"eval_samples_per_second": 258.171,
"eval_steps_per_second": 8.069,
"step": 11500
},
{
"epoch": 0.7274095441148806,
"grad_norm": 2.62512469291687,
"learning_rate": 1.818390844641084e-05,
"loss": 1.5213,
"step": 11600
},
{
"epoch": 0.7274095441148806,
"eval_loss": 1.684904932975769,
"eval_runtime": 492.1644,
"eval_samples_per_second": 259.206,
"eval_steps_per_second": 8.101,
"step": 11600
},
{
"epoch": 0.7336803160469054,
"grad_norm": 68.22614288330078,
"learning_rate": 1.8149070703199848e-05,
"loss": 1.7462,
"step": 11700
},
{
"epoch": 0.7336803160469054,
"eval_loss": 1.732839822769165,
"eval_runtime": 502.4559,
"eval_samples_per_second": 253.897,
"eval_steps_per_second": 7.935,
"step": 11700
},
{
"epoch": 0.7399510879789302,
"grad_norm": 54.836814880371094,
"learning_rate": 1.8114232959988854e-05,
"loss": 1.3519,
"step": 11800
},
{
"epoch": 0.7399510879789302,
"eval_loss": 1.8369685411453247,
"eval_runtime": 491.7357,
"eval_samples_per_second": 259.432,
"eval_steps_per_second": 8.108,
"step": 11800
},
{
"epoch": 0.746221859910955,
"grad_norm": 189.6983184814453,
"learning_rate": 1.807939521677786e-05,
"loss": 1.4935,
"step": 11900
},
{
"epoch": 0.746221859910955,
"eval_loss": 1.72471022605896,
"eval_runtime": 497.1147,
"eval_samples_per_second": 256.625,
"eval_steps_per_second": 8.02,
"step": 11900
},
{
"epoch": 0.7524926318429799,
"grad_norm": 103.63326263427734,
"learning_rate": 1.8044557473566863e-05,
"loss": 1.1721,
"step": 12000
},
{
"epoch": 0.7524926318429799,
"eval_loss": 1.6529266834259033,
"eval_runtime": 496.0569,
"eval_samples_per_second": 257.172,
"eval_steps_per_second": 8.037,
"step": 12000
},
{
"epoch": 0.7587634037750047,
"grad_norm": 118.19406127929688,
"learning_rate": 1.8009719730355866e-05,
"loss": 2.2432,
"step": 12100
},
{
"epoch": 0.7587634037750047,
"eval_loss": 1.6328880786895752,
"eval_runtime": 502.1124,
"eval_samples_per_second": 254.071,
"eval_steps_per_second": 7.94,
"step": 12100
},
{
"epoch": 0.7650341757070296,
"grad_norm": 135.55650329589844,
"learning_rate": 1.7974881987144873e-05,
"loss": 1.6931,
"step": 12200
},
{
"epoch": 0.7650341757070296,
"eval_loss": 1.6563047170639038,
"eval_runtime": 492.9421,
"eval_samples_per_second": 258.797,
"eval_steps_per_second": 8.088,
"step": 12200
},
{
"epoch": 0.7713049476390543,
"grad_norm": 3.689490795135498,
"learning_rate": 1.794004424393388e-05,
"loss": 1.2736,
"step": 12300
},
{
"epoch": 0.7713049476390543,
"eval_loss": 1.6984437704086304,
"eval_runtime": 495.6061,
"eval_samples_per_second": 257.406,
"eval_steps_per_second": 8.045,
"step": 12300
},
{
"epoch": 0.7775757195710792,
"grad_norm": 88.78681945800781,
"learning_rate": 1.7905206500722885e-05,
"loss": 1.7063,
"step": 12400
},
{
"epoch": 0.7775757195710792,
"eval_loss": 1.6574100255966187,
"eval_runtime": 504.1606,
"eval_samples_per_second": 253.038,
"eval_steps_per_second": 7.908,
"step": 12400
},
{
"epoch": 0.783846491503104,
"grad_norm": 20.35865592956543,
"learning_rate": 1.787036875751189e-05,
"loss": 1.7921,
"step": 12500
},
{
"epoch": 0.783846491503104,
"eval_loss": 1.7759722471237183,
"eval_runtime": 497.8975,
"eval_samples_per_second": 256.221,
"eval_steps_per_second": 8.008,
"step": 12500
},
{
"epoch": 0.7901172634351289,
"grad_norm": 2.046844720840454,
"learning_rate": 1.7835531014300895e-05,
"loss": 1.875,
"step": 12600
},
{
"epoch": 0.7901172634351289,
"eval_loss": 1.7148810625076294,
"eval_runtime": 492.3064,
"eval_samples_per_second": 259.131,
"eval_steps_per_second": 8.099,
"step": 12600
},
{
"epoch": 0.7963880353671537,
"grad_norm": 0.9655187129974365,
"learning_rate": 1.78006932710899e-05,
"loss": 1.4435,
"step": 12700
},
{
"epoch": 0.7963880353671537,
"eval_loss": 1.8084521293640137,
"eval_runtime": 502.53,
"eval_samples_per_second": 253.859,
"eval_steps_per_second": 7.934,
"step": 12700
},
{
"epoch": 0.8026588072991785,
"grad_norm": 135.72523498535156,
"learning_rate": 1.7765855527878907e-05,
"loss": 1.5271,
"step": 12800
},
{
"epoch": 0.8026588072991785,
"eval_loss": 1.7246832847595215,
"eval_runtime": 498.9684,
"eval_samples_per_second": 255.672,
"eval_steps_per_second": 7.99,
"step": 12800
},
{
"epoch": 0.8089295792312033,
"grad_norm": 1.4582579135894775,
"learning_rate": 1.773101778466791e-05,
"loss": 1.618,
"step": 12900
},
{
"epoch": 0.8089295792312033,
"eval_loss": 1.6542091369628906,
"eval_runtime": 498.6777,
"eval_samples_per_second": 255.821,
"eval_steps_per_second": 7.995,
"step": 12900
},
{
"epoch": 0.8152003511632282,
"grad_norm": 240.90525817871094,
"learning_rate": 1.7696528418889027e-05,
"loss": 1.9788,
"step": 13000
},
{
"epoch": 0.8152003511632282,
"eval_loss": 1.5685710906982422,
"eval_runtime": 501.55,
"eval_samples_per_second": 254.355,
"eval_steps_per_second": 7.949,
"step": 13000
},
{
"epoch": 0.821471123095253,
"grad_norm": 0.4606687128543854,
"learning_rate": 1.7661690675678033e-05,
"loss": 1.8213,
"step": 13100
},
{
"epoch": 0.821471123095253,
"eval_loss": 1.560313105583191,
"eval_runtime": 495.8302,
"eval_samples_per_second": 257.29,
"eval_steps_per_second": 8.041,
"step": 13100
},
{
"epoch": 0.8277418950272779,
"grad_norm": 33.05907440185547,
"learning_rate": 1.7626852932467036e-05,
"loss": 1.3661,
"step": 13200
},
{
"epoch": 0.8277418950272779,
"eval_loss": 1.637640118598938,
"eval_runtime": 499.6063,
"eval_samples_per_second": 255.345,
"eval_steps_per_second": 7.98,
"step": 13200
},
{
"epoch": 0.8340126669593027,
"grad_norm": 58.993228912353516,
"learning_rate": 1.7592015189256042e-05,
"loss": 1.3852,
"step": 13300
},
{
"epoch": 0.8340126669593027,
"eval_loss": 1.595252513885498,
"eval_runtime": 497.3714,
"eval_samples_per_second": 256.492,
"eval_steps_per_second": 8.016,
"step": 13300
},
{
"epoch": 0.8402834388913275,
"grad_norm": 52.913265228271484,
"learning_rate": 1.7557177446045045e-05,
"loss": 1.4673,
"step": 13400
},
{
"epoch": 0.8402834388913275,
"eval_loss": 1.634629487991333,
"eval_runtime": 497.4186,
"eval_samples_per_second": 256.468,
"eval_steps_per_second": 8.015,
"step": 13400
},
{
"epoch": 0.8465542108233524,
"grad_norm": 126.8105697631836,
"learning_rate": 1.752233970283405e-05,
"loss": 1.6684,
"step": 13500
},
{
"epoch": 0.8465542108233524,
"eval_loss": 1.5818397998809814,
"eval_runtime": 499.5245,
"eval_samples_per_second": 255.387,
"eval_steps_per_second": 7.982,
"step": 13500
},
{
"epoch": 0.8528249827553772,
"grad_norm": 130.67335510253906,
"learning_rate": 1.7487501959623058e-05,
"loss": 1.686,
"step": 13600
},
{
"epoch": 0.8528249827553772,
"eval_loss": 1.5840120315551758,
"eval_runtime": 500.7181,
"eval_samples_per_second": 254.778,
"eval_steps_per_second": 7.963,
"step": 13600
},
{
"epoch": 0.8590957546874021,
"grad_norm": 3.3967671394348145,
"learning_rate": 1.7452664216412064e-05,
"loss": 1.4397,
"step": 13700
},
{
"epoch": 0.8590957546874021,
"eval_loss": 1.5855337381362915,
"eval_runtime": 502.231,
"eval_samples_per_second": 254.011,
"eval_steps_per_second": 7.939,
"step": 13700
},
{
"epoch": 0.8653665266194268,
"grad_norm": 446.9328918457031,
"learning_rate": 1.7417826473201067e-05,
"loss": 1.5973,
"step": 13800
},
{
"epoch": 0.8653665266194268,
"eval_loss": 1.720745325088501,
"eval_runtime": 501.237,
"eval_samples_per_second": 254.514,
"eval_steps_per_second": 7.954,
"step": 13800
},
{
"epoch": 0.8716372985514517,
"grad_norm": 0.6950648427009583,
"learning_rate": 1.7382988729990073e-05,
"loss": 1.221,
"step": 13900
},
{
"epoch": 0.8716372985514517,
"eval_loss": 1.638085961341858,
"eval_runtime": 499.3245,
"eval_samples_per_second": 255.489,
"eval_steps_per_second": 7.985,
"step": 13900
},
{
"epoch": 0.8779080704834765,
"grad_norm": 24.994272232055664,
"learning_rate": 1.7348150986779076e-05,
"loss": 1.2082,
"step": 14000
},
{
"epoch": 0.8779080704834765,
"eval_loss": 1.6335324048995972,
"eval_runtime": 501.6663,
"eval_samples_per_second": 254.297,
"eval_steps_per_second": 7.948,
"step": 14000
},
{
"epoch": 0.8841788424155014,
"grad_norm": 0.017005544155836105,
"learning_rate": 1.7313313243568083e-05,
"loss": 1.5399,
"step": 14100
},
{
"epoch": 0.8841788424155014,
"eval_loss": 1.643354058265686,
"eval_runtime": 500.7206,
"eval_samples_per_second": 254.777,
"eval_steps_per_second": 7.963,
"step": 14100
},
{
"epoch": 0.8904496143475262,
"grad_norm": 48.26883316040039,
"learning_rate": 1.727847550035709e-05,
"loss": 1.5265,
"step": 14200
},
{
"epoch": 0.8904496143475262,
"eval_loss": 1.7265760898590088,
"eval_runtime": 503.0351,
"eval_samples_per_second": 253.605,
"eval_steps_per_second": 7.926,
"step": 14200
},
{
"epoch": 0.896720386279551,
"grad_norm": 4.5458149909973145,
"learning_rate": 1.7243637757146095e-05,
"loss": 0.9321,
"step": 14300
},
{
"epoch": 0.896720386279551,
"eval_loss": 1.5980534553527832,
"eval_runtime": 500.0744,
"eval_samples_per_second": 255.106,
"eval_steps_per_second": 7.973,
"step": 14300
},
{
"epoch": 0.9029911582115758,
"grad_norm": 1.9558783769607544,
"learning_rate": 1.72088000139351e-05,
"loss": 1.1133,
"step": 14400
},
{
"epoch": 0.9029911582115758,
"eval_loss": 1.612575650215149,
"eval_runtime": 502.1556,
"eval_samples_per_second": 254.049,
"eval_steps_per_second": 7.94,
"step": 14400
},
{
"epoch": 0.9092619301436007,
"grad_norm": 71.19198608398438,
"learning_rate": 1.7173962270724105e-05,
"loss": 1.0754,
"step": 14500
},
{
"epoch": 0.9092619301436007,
"eval_loss": 1.6227186918258667,
"eval_runtime": 493.8123,
"eval_samples_per_second": 258.341,
"eval_steps_per_second": 8.074,
"step": 14500
},
{
"epoch": 0.9155327020756255,
"grad_norm": 0.26305466890335083,
"learning_rate": 1.7139124527513108e-05,
"loss": 1.3486,
"step": 14600
},
{
"epoch": 0.9155327020756255,
"eval_loss": 1.6142776012420654,
"eval_runtime": 499.3615,
"eval_samples_per_second": 255.47,
"eval_steps_per_second": 7.984,
"step": 14600
},
{
"epoch": 0.9218034740076504,
"grad_norm": 35.207157135009766,
"learning_rate": 1.7104286784302114e-05,
"loss": 1.6338,
"step": 14700
},
{
"epoch": 0.9218034740076504,
"eval_loss": 1.5451936721801758,
"eval_runtime": 501.5927,
"eval_samples_per_second": 254.334,
"eval_steps_per_second": 7.949,
"step": 14700
},
{
"epoch": 0.9280742459396751,
"grad_norm": 213.60140991210938,
"learning_rate": 1.706944904109112e-05,
"loss": 1.389,
"step": 14800
},
{
"epoch": 0.9280742459396751,
"eval_loss": 1.6098874807357788,
"eval_runtime": 501.8582,
"eval_samples_per_second": 254.199,
"eval_steps_per_second": 7.944,
"step": 14800
},
{
"epoch": 0.9343450178717,
"grad_norm": 111.08502960205078,
"learning_rate": 1.7034611297880123e-05,
"loss": 1.3776,
"step": 14900
},
{
"epoch": 0.9343450178717,
"eval_loss": 1.6435140371322632,
"eval_runtime": 502.6322,
"eval_samples_per_second": 253.808,
"eval_steps_per_second": 7.932,
"step": 14900
},
{
"epoch": 0.9406157898037248,
"grad_norm": 18.123170852661133,
"learning_rate": 1.699977355466913e-05,
"loss": 1.8714,
"step": 15000
},
{
"epoch": 0.9406157898037248,
"eval_loss": 1.537667989730835,
"eval_runtime": 501.0111,
"eval_samples_per_second": 254.629,
"eval_steps_per_second": 7.958,
"step": 15000
},
{
"epoch": 0.9468865617357497,
"grad_norm": 155.93455505371094,
"learning_rate": 1.6964935811458133e-05,
"loss": 1.1286,
"step": 15100
},
{
"epoch": 0.9468865617357497,
"eval_loss": 1.6325874328613281,
"eval_runtime": 502.9153,
"eval_samples_per_second": 253.665,
"eval_steps_per_second": 7.928,
"step": 15100
},
{
"epoch": 0.9531573336677746,
"grad_norm": 172.1987762451172,
"learning_rate": 1.693009806824714e-05,
"loss": 1.4029,
"step": 15200
},
{
"epoch": 0.9531573336677746,
"eval_loss": 1.6255732774734497,
"eval_runtime": 508.4447,
"eval_samples_per_second": 250.906,
"eval_steps_per_second": 7.842,
"step": 15200
},
{
"epoch": 0.9594281055997993,
"grad_norm": 6.499632835388184,
"learning_rate": 1.6895260325036145e-05,
"loss": 1.7772,
"step": 15300
},
{
"epoch": 0.9594281055997993,
"eval_loss": 1.5221425294876099,
"eval_runtime": 502.3511,
"eval_samples_per_second": 253.95,
"eval_steps_per_second": 7.937,
"step": 15300
},
{
"epoch": 0.9656988775318242,
"grad_norm": 0.45312049984931946,
"learning_rate": 1.686042258182515e-05,
"loss": 1.3415,
"step": 15400
},
{
"epoch": 0.9656988775318242,
"eval_loss": 1.5603629350662231,
"eval_runtime": 502.2256,
"eval_samples_per_second": 254.013,
"eval_steps_per_second": 7.939,
"step": 15400
},
{
"epoch": 0.971969649463849,
"grad_norm": 1.405121922492981,
"learning_rate": 1.6825584838614155e-05,
"loss": 1.1088,
"step": 15500
},
{
"epoch": 0.971969649463849,
"eval_loss": 1.574865698814392,
"eval_runtime": 517.5942,
"eval_samples_per_second": 246.471,
"eval_steps_per_second": 7.703,
"step": 15500
},
{
"epoch": 0.9782404213958739,
"grad_norm": 6.808924198150635,
"learning_rate": 1.679074709540316e-05,
"loss": 1.4602,
"step": 15600
},
{
"epoch": 0.9782404213958739,
"eval_loss": 1.494147777557373,
"eval_runtime": 505.721,
"eval_samples_per_second": 252.258,
"eval_steps_per_second": 7.884,
"step": 15600
},
{
"epoch": 0.9845111933278987,
"grad_norm": 88.74259185791016,
"learning_rate": 1.6755909352192167e-05,
"loss": 1.867,
"step": 15700
},
{
"epoch": 0.9845111933278987,
"eval_loss": 1.3730698823928833,
"eval_runtime": 515.7676,
"eval_samples_per_second": 247.344,
"eval_steps_per_second": 7.73,
"step": 15700
},
{
"epoch": 0.9907819652599235,
"grad_norm": 3.1625919342041016,
"learning_rate": 1.672141998641328e-05,
"loss": 1.4541,
"step": 15800
},
{
"epoch": 0.9907819652599235,
"eval_loss": 1.4205607175827026,
"eval_runtime": 486.2625,
"eval_samples_per_second": 262.352,
"eval_steps_per_second": 8.199,
"step": 15800
},
{
"epoch": 0.9970527371919483,
"grad_norm": 135.7765655517578,
"learning_rate": 1.6686930620634396e-05,
"loss": 1.1966,
"step": 15900
},
{
"epoch": 0.9970527371919483,
"eval_loss": 1.4982208013534546,
"eval_runtime": 494.4178,
"eval_samples_per_second": 258.025,
"eval_steps_per_second": 8.064,
"step": 15900
},
{
"epoch": 1.0033235091239732,
"grad_norm": 97.84881591796875,
"learning_rate": 1.6652092877423403e-05,
"loss": 1.1447,
"step": 16000
},
{
"epoch": 1.0033235091239732,
"eval_loss": 1.5120809078216553,
"eval_runtime": 491.4115,
"eval_samples_per_second": 259.603,
"eval_steps_per_second": 8.113,
"step": 16000
},
{
"epoch": 1.009594281055998,
"grad_norm": 152.9120635986328,
"learning_rate": 1.6617255134212406e-05,
"loss": 1.1266,
"step": 16100
},
{
"epoch": 1.009594281055998,
"eval_loss": 1.4103273153305054,
"eval_runtime": 501.3393,
"eval_samples_per_second": 254.462,
"eval_steps_per_second": 7.953,
"step": 16100
},
{
"epoch": 1.0158650529880229,
"grad_norm": 10.062068939208984,
"learning_rate": 1.6582417391001412e-05,
"loss": 1.1971,
"step": 16200
},
{
"epoch": 1.0158650529880229,
"eval_loss": 1.5044476985931396,
"eval_runtime": 500.4234,
"eval_samples_per_second": 254.928,
"eval_steps_per_second": 7.967,
"step": 16200
},
{
"epoch": 1.0221358249200476,
"grad_norm": 385.3752136230469,
"learning_rate": 1.6547579647790418e-05,
"loss": 1.3376,
"step": 16300
},
{
"epoch": 1.0221358249200476,
"eval_loss": 1.5336840152740479,
"eval_runtime": 494.8747,
"eval_samples_per_second": 257.786,
"eval_steps_per_second": 8.057,
"step": 16300
},
{
"epoch": 1.0284065968520726,
"grad_norm": 0.33529093861579895,
"learning_rate": 1.6512741904579425e-05,
"loss": 1.7977,
"step": 16400
},
{
"epoch": 1.0284065968520726,
"eval_loss": 1.5711828470230103,
"eval_runtime": 502.7844,
"eval_samples_per_second": 253.731,
"eval_steps_per_second": 7.93,
"step": 16400
},
{
"epoch": 1.0346773687840973,
"grad_norm": 228.05165100097656,
"learning_rate": 1.6477904161368428e-05,
"loss": 1.6946,
"step": 16500
},
{
"epoch": 1.0346773687840973,
"eval_loss": 1.5322738885879517,
"eval_runtime": 498.3968,
"eval_samples_per_second": 255.965,
"eval_steps_per_second": 8.0,
"step": 16500
},
{
"epoch": 1.040948140716122,
"grad_norm": 1.2080790996551514,
"learning_rate": 1.6443066418157434e-05,
"loss": 0.8674,
"step": 16600
},
{
"epoch": 1.040948140716122,
"eval_loss": 1.4461946487426758,
"eval_runtime": 492.3648,
"eval_samples_per_second": 259.101,
"eval_steps_per_second": 8.098,
"step": 16600
},
{
"epoch": 1.047218912648147,
"grad_norm": 68.50479888916016,
"learning_rate": 1.6408228674946437e-05,
"loss": 1.6447,
"step": 16700
},
{
"epoch": 1.047218912648147,
"eval_loss": 1.483079433441162,
"eval_runtime": 496.9095,
"eval_samples_per_second": 256.731,
"eval_steps_per_second": 8.024,
"step": 16700
},
{
"epoch": 1.0534896845801718,
"grad_norm": 0.08792801946401596,
"learning_rate": 1.6373390931735443e-05,
"loss": 1.2709,
"step": 16800
},
{
"epoch": 1.0534896845801718,
"eval_loss": 1.575551986694336,
"eval_runtime": 503.9395,
"eval_samples_per_second": 253.149,
"eval_steps_per_second": 7.912,
"step": 16800
},
{
"epoch": 1.0597604565121967,
"grad_norm": 1.81405770778656,
"learning_rate": 1.633855318852445e-05,
"loss": 1.5217,
"step": 16900
},
{
"epoch": 1.0597604565121967,
"eval_loss": 1.5059562921524048,
"eval_runtime": 506.8836,
"eval_samples_per_second": 251.679,
"eval_steps_per_second": 7.866,
"step": 16900
},
{
"epoch": 1.0660312284442215,
"grad_norm": 38.73731231689453,
"learning_rate": 1.6303715445313456e-05,
"loss": 1.2986,
"step": 17000
},
{
"epoch": 1.0660312284442215,
"eval_loss": 1.4834423065185547,
"eval_runtime": 503.4795,
"eval_samples_per_second": 253.381,
"eval_steps_per_second": 7.919,
"step": 17000
},
{
"epoch": 1.0723020003762462,
"grad_norm": 0.7970458269119263,
"learning_rate": 1.626887770210246e-05,
"loss": 0.9976,
"step": 17100
},
{
"epoch": 1.0723020003762462,
"eval_loss": 1.4840906858444214,
"eval_runtime": 504.3957,
"eval_samples_per_second": 252.92,
"eval_steps_per_second": 7.905,
"step": 17100
},
{
"epoch": 1.0785727723082712,
"grad_norm": 0.04621260613203049,
"learning_rate": 1.6234039958891465e-05,
"loss": 1.3457,
"step": 17200
},
{
"epoch": 1.0785727723082712,
"eval_loss": 1.4227601289749146,
"eval_runtime": 493.9233,
"eval_samples_per_second": 258.283,
"eval_steps_per_second": 8.072,
"step": 17200
},
{
"epoch": 1.084843544240296,
"grad_norm": 0.5272818803787231,
"learning_rate": 1.6199202215680468e-05,
"loss": 0.987,
"step": 17300
},
{
"epoch": 1.084843544240296,
"eval_loss": 1.3806939125061035,
"eval_runtime": 501.4303,
"eval_samples_per_second": 254.416,
"eval_steps_per_second": 7.951,
"step": 17300
},
{
"epoch": 1.091114316172321,
"grad_norm": 0.4564209282398224,
"learning_rate": 1.6164364472469474e-05,
"loss": 1.2714,
"step": 17400
},
{
"epoch": 1.091114316172321,
"eval_loss": 1.3470913171768188,
"eval_runtime": 501.1914,
"eval_samples_per_second": 254.538,
"eval_steps_per_second": 7.955,
"step": 17400
},
{
"epoch": 1.0973850881043457,
"grad_norm": 14.678479194641113,
"learning_rate": 1.612952672925848e-05,
"loss": 1.298,
"step": 17500
},
{
"epoch": 1.0973850881043457,
"eval_loss": 1.4133707284927368,
"eval_runtime": 499.6488,
"eval_samples_per_second": 255.323,
"eval_steps_per_second": 7.98,
"step": 17500
},
{
"epoch": 1.1036558600363704,
"grad_norm": 1.6324628591537476,
"learning_rate": 1.6094688986047484e-05,
"loss": 0.9522,
"step": 17600
},
{
"epoch": 1.1036558600363704,
"eval_loss": 1.4225292205810547,
"eval_runtime": 501.7975,
"eval_samples_per_second": 254.23,
"eval_steps_per_second": 7.945,
"step": 17600
},
{
"epoch": 1.1099266319683954,
"grad_norm": 1.6328845024108887,
"learning_rate": 1.605985124283649e-05,
"loss": 1.0634,
"step": 17700
},
{
"epoch": 1.1099266319683954,
"eval_loss": 1.4474034309387207,
"eval_runtime": 505.9682,
"eval_samples_per_second": 252.134,
"eval_steps_per_second": 7.88,
"step": 17700
},
{
"epoch": 1.11619740390042,
"grad_norm": 0.9931433200836182,
"learning_rate": 1.6025013499625493e-05,
"loss": 1.2889,
"step": 17800
},
{
"epoch": 1.11619740390042,
"eval_loss": 1.4678562879562378,
"eval_runtime": 503.5781,
"eval_samples_per_second": 253.331,
"eval_steps_per_second": 7.917,
"step": 17800
},
{
"epoch": 1.122468175832445,
"grad_norm": 59.28689956665039,
"learning_rate": 1.5990175756414503e-05,
"loss": 1.7532,
"step": 17900
},
{
"epoch": 1.122468175832445,
"eval_loss": 1.3757271766662598,
"eval_runtime": 499.9963,
"eval_samples_per_second": 255.146,
"eval_steps_per_second": 7.974,
"step": 17900
},
{
"epoch": 1.1287389477644698,
"grad_norm": 72.52947998046875,
"learning_rate": 1.5955338013203506e-05,
"loss": 1.6613,
"step": 18000
},
{
"epoch": 1.1287389477644698,
"eval_loss": 1.3807989358901978,
"eval_runtime": 492.0199,
"eval_samples_per_second": 259.282,
"eval_steps_per_second": 8.103,
"step": 18000
},
{
"epoch": 1.1350097196964946,
"grad_norm": 29.813941955566406,
"learning_rate": 1.5920500269992512e-05,
"loss": 1.1765,
"step": 18100
},
{
"epoch": 1.1350097196964946,
"eval_loss": 1.3903069496154785,
"eval_runtime": 501.0104,
"eval_samples_per_second": 254.629,
"eval_steps_per_second": 7.958,
"step": 18100
},
{
"epoch": 1.1412804916285195,
"grad_norm": 1.3140065670013428,
"learning_rate": 1.5885662526781515e-05,
"loss": 1.2787,
"step": 18200
},
{
"epoch": 1.1412804916285195,
"eval_loss": 1.3920559883117676,
"eval_runtime": 500.2005,
"eval_samples_per_second": 255.042,
"eval_steps_per_second": 7.971,
"step": 18200
},
{
"epoch": 1.1475512635605443,
"grad_norm": 0.21044209599494934,
"learning_rate": 1.585082478357052e-05,
"loss": 1.2532,
"step": 18300
},
{
"epoch": 1.1475512635605443,
"eval_loss": 1.3519495725631714,
"eval_runtime": 505.681,
"eval_samples_per_second": 252.278,
"eval_steps_per_second": 7.884,
"step": 18300
},
{
"epoch": 1.1538220354925692,
"grad_norm": 56.845211029052734,
"learning_rate": 1.5815987040359528e-05,
"loss": 1.8056,
"step": 18400
},
{
"epoch": 1.1538220354925692,
"eval_loss": 1.2984182834625244,
"eval_runtime": 507.7377,
"eval_samples_per_second": 251.256,
"eval_steps_per_second": 7.852,
"step": 18400
},
{
"epoch": 1.160092807424594,
"grad_norm": 99.5033950805664,
"learning_rate": 1.5781149297148534e-05,
"loss": 1.0985,
"step": 18500
},
{
"epoch": 1.160092807424594,
"eval_loss": 1.3321679830551147,
"eval_runtime": 504.9231,
"eval_samples_per_second": 252.656,
"eval_steps_per_second": 7.896,
"step": 18500
},
{
"epoch": 1.1663635793566187,
"grad_norm": 75.43387603759766,
"learning_rate": 1.5746311553937537e-05,
"loss": 1.8665,
"step": 18600
},
{
"epoch": 1.1663635793566187,
"eval_loss": 1.4059826135635376,
"eval_runtime": 495.7712,
"eval_samples_per_second": 257.32,
"eval_steps_per_second": 8.042,
"step": 18600
},
{
"epoch": 1.1726343512886437,
"grad_norm": 111.51386260986328,
"learning_rate": 1.5711473810726543e-05,
"loss": 1.2427,
"step": 18700
},
{
"epoch": 1.1726343512886437,
"eval_loss": 1.3774936199188232,
"eval_runtime": 502.6217,
"eval_samples_per_second": 253.813,
"eval_steps_per_second": 7.932,
"step": 18700
},
{
"epoch": 1.1789051232206684,
"grad_norm": 1.3077305555343628,
"learning_rate": 1.5676636067515546e-05,
"loss": 1.1241,
"step": 18800
},
{
"epoch": 1.1789051232206684,
"eval_loss": 1.3168435096740723,
"eval_runtime": 498.3092,
"eval_samples_per_second": 256.01,
"eval_steps_per_second": 8.001,
"step": 18800
},
{
"epoch": 1.1851758951526934,
"grad_norm": 29.557662963867188,
"learning_rate": 1.5641798324304553e-05,
"loss": 1.2348,
"step": 18900
},
{
"epoch": 1.1851758951526934,
"eval_loss": 1.353879690170288,
"eval_runtime": 503.07,
"eval_samples_per_second": 253.587,
"eval_steps_per_second": 7.925,
"step": 18900
},
{
"epoch": 1.1914466670847181,
"grad_norm": 65.81330871582031,
"learning_rate": 1.560696058109356e-05,
"loss": 1.1709,
"step": 19000
},
{
"epoch": 1.1914466670847181,
"eval_loss": 1.3540174961090088,
"eval_runtime": 498.735,
"eval_samples_per_second": 255.791,
"eval_steps_per_second": 7.994,
"step": 19000
},
{
"epoch": 1.1977174390167429,
"grad_norm": 48.844017028808594,
"learning_rate": 1.5572122837882565e-05,
"loss": 0.8844,
"step": 19100
},
{
"epoch": 1.1977174390167429,
"eval_loss": 1.3141909837722778,
"eval_runtime": 498.6069,
"eval_samples_per_second": 255.857,
"eval_steps_per_second": 7.996,
"step": 19100
},
{
"epoch": 1.2039882109487678,
"grad_norm": 3.451929807662964,
"learning_rate": 1.5537285094671568e-05,
"loss": 1.0035,
"step": 19200
},
{
"epoch": 1.2039882109487678,
"eval_loss": 1.3781260251998901,
"eval_runtime": 506.2945,
"eval_samples_per_second": 251.972,
"eval_steps_per_second": 7.875,
"step": 19200
},
{
"epoch": 1.2102589828807926,
"grad_norm": 77.69365692138672,
"learning_rate": 1.5502447351460575e-05,
"loss": 1.4279,
"step": 19300
},
{
"epoch": 1.2102589828807926,
"eval_loss": 1.261493444442749,
"eval_runtime": 498.5065,
"eval_samples_per_second": 255.908,
"eval_steps_per_second": 7.998,
"step": 19300
},
{
"epoch": 1.2165297548128176,
"grad_norm": 21.791259765625,
"learning_rate": 1.5467609608249577e-05,
"loss": 1.3327,
"step": 19400
},
{
"epoch": 1.2165297548128176,
"eval_loss": 1.2696096897125244,
"eval_runtime": 498.1301,
"eval_samples_per_second": 256.102,
"eval_steps_per_second": 8.004,
"step": 19400
},
{
"epoch": 1.2228005267448423,
"grad_norm": 2.250319242477417,
"learning_rate": 1.5432771865038584e-05,
"loss": 0.993,
"step": 19500
},
{
"epoch": 1.2228005267448423,
"eval_loss": 1.3169900178909302,
"eval_runtime": 495.9918,
"eval_samples_per_second": 257.206,
"eval_steps_per_second": 8.038,
"step": 19500
},
{
"epoch": 1.229071298676867,
"grad_norm": 73.77873229980469,
"learning_rate": 1.539793412182759e-05,
"loss": 0.7869,
"step": 19600
},
{
"epoch": 1.229071298676867,
"eval_loss": 1.2967498302459717,
"eval_runtime": 497.8866,
"eval_samples_per_second": 256.227,
"eval_steps_per_second": 8.008,
"step": 19600
},
{
"epoch": 1.235342070608892,
"grad_norm": 0.07626141607761383,
"learning_rate": 1.5363096378616596e-05,
"loss": 0.985,
"step": 19700
},
{
"epoch": 1.235342070608892,
"eval_loss": 1.3056693077087402,
"eval_runtime": 494.5073,
"eval_samples_per_second": 257.978,
"eval_steps_per_second": 8.063,
"step": 19700
},
{
"epoch": 1.2416128425409168,
"grad_norm": 4.803875923156738,
"learning_rate": 1.53282586354056e-05,
"loss": 1.1603,
"step": 19800
},
{
"epoch": 1.2416128425409168,
"eval_loss": 1.2796647548675537,
"eval_runtime": 496.7276,
"eval_samples_per_second": 256.825,
"eval_steps_per_second": 8.027,
"step": 19800
},
{
"epoch": 1.2478836144729417,
"grad_norm": 63.491329193115234,
"learning_rate": 1.5293420892194606e-05,
"loss": 1.2469,
"step": 19900
},
{
"epoch": 1.2478836144729417,
"eval_loss": 1.2394485473632812,
"eval_runtime": 504.5722,
"eval_samples_per_second": 252.832,
"eval_steps_per_second": 7.902,
"step": 19900
},
{
"epoch": 1.2541543864049665,
"grad_norm": 155.53126525878906,
"learning_rate": 1.525858314898361e-05,
"loss": 1.521,
"step": 20000
},
{
"epoch": 1.2541543864049665,
"eval_loss": 1.2309328317642212,
"eval_runtime": 499.147,
"eval_samples_per_second": 255.58,
"eval_steps_per_second": 7.988,
"step": 20000
},
{
"epoch": 1.2604251583369912,
"grad_norm": 0.10026417672634125,
"learning_rate": 1.5223745405772617e-05,
"loss": 1.2632,
"step": 20100
},
{
"epoch": 1.2604251583369912,
"eval_loss": 1.2352900505065918,
"eval_runtime": 498.6806,
"eval_samples_per_second": 255.819,
"eval_steps_per_second": 7.995,
"step": 20100
},
{
"epoch": 1.2666959302690162,
"grad_norm": 20.156579971313477,
"learning_rate": 1.518890766256162e-05,
"loss": 1.3621,
"step": 20200
},
{
"epoch": 1.2666959302690162,
"eval_loss": 1.2432923316955566,
"eval_runtime": 488.6088,
"eval_samples_per_second": 261.092,
"eval_steps_per_second": 8.16,
"step": 20200
},
{
"epoch": 1.272966702201041,
"grad_norm": 1.3594141006469727,
"learning_rate": 1.5154069919350624e-05,
"loss": 1.5145,
"step": 20300
},
{
"epoch": 1.272966702201041,
"eval_loss": 1.3064727783203125,
"eval_runtime": 501.5288,
"eval_samples_per_second": 254.366,
"eval_steps_per_second": 7.95,
"step": 20300
},
{
"epoch": 1.2792374741330659,
"grad_norm": 26.742637634277344,
"learning_rate": 1.511923217613963e-05,
"loss": 1.3708,
"step": 20400
},
{
"epoch": 1.2792374741330659,
"eval_loss": 1.2422964572906494,
"eval_runtime": 504.9841,
"eval_samples_per_second": 252.626,
"eval_steps_per_second": 7.895,
"step": 20400
},
{
"epoch": 1.2855082460650906,
"grad_norm": 666.2847290039062,
"learning_rate": 1.5084394432928635e-05,
"loss": 1.1716,
"step": 20500
},
{
"epoch": 1.2855082460650906,
"eval_loss": 1.2922592163085938,
"eval_runtime": 502.0283,
"eval_samples_per_second": 254.113,
"eval_steps_per_second": 7.942,
"step": 20500
},
{
"epoch": 1.2917790179971154,
"grad_norm": 0.90843665599823,
"learning_rate": 1.5049556689717642e-05,
"loss": 1.419,
"step": 20600
},
{
"epoch": 1.2917790179971154,
"eval_loss": 1.2193955183029175,
"eval_runtime": 496.5986,
"eval_samples_per_second": 256.892,
"eval_steps_per_second": 8.029,
"step": 20600
},
{
"epoch": 1.2980497899291403,
"grad_norm": 174.012451171875,
"learning_rate": 1.5014718946506646e-05,
"loss": 1.1644,
"step": 20700
},
{
"epoch": 1.2980497899291403,
"eval_loss": 1.2368745803833008,
"eval_runtime": 500.022,
"eval_samples_per_second": 255.133,
"eval_steps_per_second": 7.974,
"step": 20700
},
{
"epoch": 1.304320561861165,
"grad_norm": 7.468738555908203,
"learning_rate": 1.4979881203295653e-05,
"loss": 1.6589,
"step": 20800
},
{
"epoch": 1.304320561861165,
"eval_loss": 1.1971392631530762,
"eval_runtime": 500.4989,
"eval_samples_per_second": 254.89,
"eval_steps_per_second": 7.966,
"step": 20800
},
{
"epoch": 1.31059133379319,
"grad_norm": 120.70152282714844,
"learning_rate": 1.4945043460084656e-05,
"loss": 1.0299,
"step": 20900
},
{
"epoch": 1.31059133379319,
"eval_loss": 1.2342555522918701,
"eval_runtime": 499.8846,
"eval_samples_per_second": 255.203,
"eval_steps_per_second": 7.976,
"step": 20900
},
{
"epoch": 1.3168621057252148,
"grad_norm": 90.38188934326172,
"learning_rate": 1.4910205716873662e-05,
"loss": 1.3452,
"step": 21000
},
{
"epoch": 1.3168621057252148,
"eval_loss": 1.2725248336791992,
"eval_runtime": 490.3505,
"eval_samples_per_second": 260.165,
"eval_steps_per_second": 8.131,
"step": 21000
},
{
"epoch": 1.3231328776572395,
"grad_norm": 0.8048076033592224,
"learning_rate": 1.4875367973662667e-05,
"loss": 1.4234,
"step": 21100
},
{
"epoch": 1.3231328776572395,
"eval_loss": 1.2416248321533203,
"eval_runtime": 493.5915,
"eval_samples_per_second": 258.457,
"eval_steps_per_second": 8.078,
"step": 21100
},
{
"epoch": 1.3294036495892645,
"grad_norm": 102.93982696533203,
"learning_rate": 1.4840530230451673e-05,
"loss": 1.2496,
"step": 21200
},
{
"epoch": 1.3294036495892645,
"eval_loss": 1.3609205484390259,
"eval_runtime": 501.4555,
"eval_samples_per_second": 254.403,
"eval_steps_per_second": 7.951,
"step": 21200
},
{
"epoch": 1.3356744215212892,
"grad_norm": 0.0473560094833374,
"learning_rate": 1.4805692487240678e-05,
"loss": 1.2133,
"step": 21300
},
{
"epoch": 1.3356744215212892,
"eval_loss": 1.2892857789993286,
"eval_runtime": 486.7923,
"eval_samples_per_second": 262.067,
"eval_steps_per_second": 8.19,
"step": 21300
},
{
"epoch": 1.3419451934533142,
"grad_norm": 0.2829754948616028,
"learning_rate": 1.4770854744029684e-05,
"loss": 0.8682,
"step": 21400
},
{
"epoch": 1.3419451934533142,
"eval_loss": 1.2352983951568604,
"eval_runtime": 501.6238,
"eval_samples_per_second": 254.318,
"eval_steps_per_second": 7.948,
"step": 21400
},
{
"epoch": 1.348215965385339,
"grad_norm": 0.09349790215492249,
"learning_rate": 1.4736017000818687e-05,
"loss": 0.9499,
"step": 21500
},
{
"epoch": 1.348215965385339,
"eval_loss": 1.2423368692398071,
"eval_runtime": 503.2262,
"eval_samples_per_second": 253.508,
"eval_steps_per_second": 7.923,
"step": 21500
},
{
"epoch": 1.3544867373173637,
"grad_norm": 0.7133996486663818,
"learning_rate": 1.4701179257607693e-05,
"loss": 1.2896,
"step": 21600
},
{
"epoch": 1.3544867373173637,
"eval_loss": 1.1796832084655762,
"eval_runtime": 504.2727,
"eval_samples_per_second": 252.982,
"eval_steps_per_second": 7.906,
"step": 21600
},
{
"epoch": 1.3607575092493887,
"grad_norm": 44.3637580871582,
"learning_rate": 1.4666341514396698e-05,
"loss": 1.2392,
"step": 21700
},
{
"epoch": 1.3607575092493887,
"eval_loss": 1.1962292194366455,
"eval_runtime": 504.4317,
"eval_samples_per_second": 252.902,
"eval_steps_per_second": 7.904,
"step": 21700
},
{
"epoch": 1.3670282811814134,
"grad_norm": 41.141788482666016,
"learning_rate": 1.4631503771185704e-05,
"loss": 0.9206,
"step": 21800
},
{
"epoch": 1.3670282811814134,
"eval_loss": 1.2483233213424683,
"eval_runtime": 502.7012,
"eval_samples_per_second": 253.773,
"eval_steps_per_second": 7.931,
"step": 21800
},
{
"epoch": 1.3732990531134384,
"grad_norm": 0.8109003901481628,
"learning_rate": 1.4596666027974709e-05,
"loss": 1.174,
"step": 21900
},
{
"epoch": 1.3732990531134384,
"eval_loss": 1.23282790184021,
"eval_runtime": 505.2234,
"eval_samples_per_second": 252.506,
"eval_steps_per_second": 7.892,
"step": 21900
},
{
"epoch": 1.379569825045463,
"grad_norm": 74.6466293334961,
"learning_rate": 1.4562176662195823e-05,
"loss": 1.6361,
"step": 22000
},
{
"epoch": 1.379569825045463,
"eval_loss": 1.1558316946029663,
"eval_runtime": 501.4654,
"eval_samples_per_second": 254.398,
"eval_steps_per_second": 7.951,
"step": 22000
},
{
"epoch": 1.3858405969774878,
"grad_norm": 5.058000087738037,
"learning_rate": 1.452733891898483e-05,
"loss": 0.8284,
"step": 22100
},
{
"epoch": 1.3858405969774878,
"eval_loss": 1.271115779876709,
"eval_runtime": 493.6867,
"eval_samples_per_second": 258.407,
"eval_steps_per_second": 8.076,
"step": 22100
},
{
"epoch": 1.3921113689095128,
"grad_norm": 1.647706389427185,
"learning_rate": 1.4492849553205946e-05,
"loss": 1.2814,
"step": 22200
},
{
"epoch": 1.3921113689095128,
"eval_loss": 1.246185064315796,
"eval_runtime": 492.0347,
"eval_samples_per_second": 259.274,
"eval_steps_per_second": 8.103,
"step": 22200
},
{
"epoch": 1.3983821408415376,
"grad_norm": 0.4397072494029999,
"learning_rate": 1.4458011809994949e-05,
"loss": 1.1595,
"step": 22300
},
{
"epoch": 1.3983821408415376,
"eval_loss": 1.2613025903701782,
"eval_runtime": 498.6845,
"eval_samples_per_second": 255.817,
"eval_steps_per_second": 7.995,
"step": 22300
},
{
"epoch": 1.4046529127735625,
"grad_norm": 16.611690521240234,
"learning_rate": 1.4423174066783955e-05,
"loss": 1.3129,
"step": 22400
},
{
"epoch": 1.4046529127735625,
"eval_loss": 1.1816045045852661,
"eval_runtime": 496.2598,
"eval_samples_per_second": 257.067,
"eval_steps_per_second": 8.034,
"step": 22400
},
{
"epoch": 1.4109236847055873,
"grad_norm": 69.52592468261719,
"learning_rate": 1.438833632357296e-05,
"loss": 1.1353,
"step": 22500
},
{
"epoch": 1.4109236847055873,
"eval_loss": 1.245389699935913,
"eval_runtime": 494.4633,
"eval_samples_per_second": 258.001,
"eval_steps_per_second": 8.063,
"step": 22500
},
{
"epoch": 1.417194456637612,
"grad_norm": 6.014486789703369,
"learning_rate": 1.4353498580361966e-05,
"loss": 1.3302,
"step": 22600
},
{
"epoch": 1.417194456637612,
"eval_loss": 1.1397989988327026,
"eval_runtime": 503.4853,
"eval_samples_per_second": 253.378,
"eval_steps_per_second": 7.919,
"step": 22600
},
{
"epoch": 1.423465228569637,
"grad_norm": 2.0832605361938477,
"learning_rate": 1.4318660837150971e-05,
"loss": 1.1591,
"step": 22700
},
{
"epoch": 1.423465228569637,
"eval_loss": 1.2935895919799805,
"eval_runtime": 495.0142,
"eval_samples_per_second": 257.714,
"eval_steps_per_second": 8.054,
"step": 22700
},
{
"epoch": 1.4297360005016617,
"grad_norm": 4.5407891273498535,
"learning_rate": 1.4283823093939975e-05,
"loss": 0.6551,
"step": 22800
},
{
"epoch": 1.4297360005016617,
"eval_loss": 1.2345027923583984,
"eval_runtime": 489.171,
"eval_samples_per_second": 260.792,
"eval_steps_per_second": 8.151,
"step": 22800
},
{
"epoch": 1.4360067724336867,
"grad_norm": 78.76990509033203,
"learning_rate": 1.4248985350728982e-05,
"loss": 1.2884,
"step": 22900
},
{
"epoch": 1.4360067724336867,
"eval_loss": 1.1629202365875244,
"eval_runtime": 483.3185,
"eval_samples_per_second": 263.95,
"eval_steps_per_second": 8.249,
"step": 22900
},
{
"epoch": 1.4422775443657114,
"grad_norm": 110.63036346435547,
"learning_rate": 1.4214147607517985e-05,
"loss": 1.1769,
"step": 23000
},
{
"epoch": 1.4422775443657114,
"eval_loss": 1.2339965105056763,
"eval_runtime": 486.5591,
"eval_samples_per_second": 262.192,
"eval_steps_per_second": 8.194,
"step": 23000
},
{
"epoch": 1.4485483162977362,
"grad_norm": 126.27979278564453,
"learning_rate": 1.4179309864306991e-05,
"loss": 1.1331,
"step": 23100
},
{
"epoch": 1.4485483162977362,
"eval_loss": 1.2035988569259644,
"eval_runtime": 490.9417,
"eval_samples_per_second": 259.852,
"eval_steps_per_second": 8.121,
"step": 23100
},
{
"epoch": 1.4548190882297611,
"grad_norm": 0.48294782638549805,
"learning_rate": 1.4144472121095996e-05,
"loss": 1.1008,
"step": 23200
},
{
"epoch": 1.4548190882297611,
"eval_loss": 1.1685419082641602,
"eval_runtime": 485.6527,
"eval_samples_per_second": 262.682,
"eval_steps_per_second": 8.21,
"step": 23200
},
{
"epoch": 1.4610898601617859,
"grad_norm": 6.466658115386963,
"learning_rate": 1.4109634377885002e-05,
"loss": 1.1487,
"step": 23300
},
{
"epoch": 1.4610898601617859,
"eval_loss": 1.1274471282958984,
"eval_runtime": 496.8021,
"eval_samples_per_second": 256.786,
"eval_steps_per_second": 8.025,
"step": 23300
},
{
"epoch": 1.4673606320938108,
"grad_norm": 18.893667221069336,
"learning_rate": 1.4074796634674007e-05,
"loss": 0.7753,
"step": 23400
},
{
"epoch": 1.4673606320938108,
"eval_loss": 1.1737704277038574,
"eval_runtime": 486.0352,
"eval_samples_per_second": 262.475,
"eval_steps_per_second": 8.203,
"step": 23400
},
{
"epoch": 1.4736314040258356,
"grad_norm": 19.157712936401367,
"learning_rate": 1.4039958891463013e-05,
"loss": 1.3236,
"step": 23500
},
{
"epoch": 1.4736314040258356,
"eval_loss": 1.2376619577407837,
"eval_runtime": 488.4737,
"eval_samples_per_second": 261.164,
"eval_steps_per_second": 8.162,
"step": 23500
},
{
"epoch": 1.4799021759578603,
"grad_norm": 9.691899299621582,
"learning_rate": 1.4005121148252016e-05,
"loss": 0.919,
"step": 23600
},
{
"epoch": 1.4799021759578603,
"eval_loss": 1.2018409967422485,
"eval_runtime": 493.2156,
"eval_samples_per_second": 258.654,
"eval_steps_per_second": 8.084,
"step": 23600
},
{
"epoch": 1.4861729478898853,
"grad_norm": 98.8059310913086,
"learning_rate": 1.3970283405041022e-05,
"loss": 0.8516,
"step": 23700
},
{
"epoch": 1.4861729478898853,
"eval_loss": 1.2296911478042603,
"eval_runtime": 499.0547,
"eval_samples_per_second": 255.627,
"eval_steps_per_second": 7.989,
"step": 23700
},
{
"epoch": 1.49244371982191,
"grad_norm": 22.1707706451416,
"learning_rate": 1.3935445661830027e-05,
"loss": 1.092,
"step": 23800
},
{
"epoch": 1.49244371982191,
"eval_loss": 1.1629080772399902,
"eval_runtime": 493.4212,
"eval_samples_per_second": 258.546,
"eval_steps_per_second": 8.08,
"step": 23800
},
{
"epoch": 1.498714491753935,
"grad_norm": 0.31641775369644165,
"learning_rate": 1.3900607918619033e-05,
"loss": 0.673,
"step": 23900
},
{
"epoch": 1.498714491753935,
"eval_loss": 1.2161920070648193,
"eval_runtime": 495.1904,
"eval_samples_per_second": 257.622,
"eval_steps_per_second": 8.051,
"step": 23900
},
{
"epoch": 1.5049852636859598,
"grad_norm": 0.4521692097187042,
"learning_rate": 1.3865770175408038e-05,
"loss": 0.994,
"step": 24000
},
{
"epoch": 1.5049852636859598,
"eval_loss": 1.1778312921524048,
"eval_runtime": 494.7958,
"eval_samples_per_second": 257.828,
"eval_steps_per_second": 8.058,
"step": 24000
},
{
"epoch": 1.5112560356179845,
"grad_norm": 1.2718249559402466,
"learning_rate": 1.3830932432197044e-05,
"loss": 0.8766,
"step": 24100
},
{
"epoch": 1.5112560356179845,
"eval_loss": 1.1902062892913818,
"eval_runtime": 498.2478,
"eval_samples_per_second": 256.041,
"eval_steps_per_second": 8.002,
"step": 24100
},
{
"epoch": 1.5175268075500095,
"grad_norm": 78.13153076171875,
"learning_rate": 1.3796094688986047e-05,
"loss": 1.3818,
"step": 24200
},
{
"epoch": 1.5175268075500095,
"eval_loss": 1.1638315916061401,
"eval_runtime": 475.6768,
"eval_samples_per_second": 268.191,
"eval_steps_per_second": 8.382,
"step": 24200
},
{
"epoch": 1.5237975794820342,
"grad_norm": 11.799439430236816,
"learning_rate": 1.3761256945775054e-05,
"loss": 1.1215,
"step": 24300
},
{
"epoch": 1.5237975794820342,
"eval_loss": 1.1665599346160889,
"eval_runtime": 493.9156,
"eval_samples_per_second": 258.287,
"eval_steps_per_second": 8.072,
"step": 24300
},
{
"epoch": 1.5300683514140592,
"grad_norm": 0.15210537612438202,
"learning_rate": 1.3726419202564058e-05,
"loss": 0.8485,
"step": 24400
},
{
"epoch": 1.5300683514140592,
"eval_loss": 1.190748929977417,
"eval_runtime": 489.6338,
"eval_samples_per_second": 260.546,
"eval_steps_per_second": 8.143,
"step": 24400
},
{
"epoch": 1.536339123346084,
"grad_norm": 111.32445526123047,
"learning_rate": 1.3691581459353065e-05,
"loss": 1.1033,
"step": 24500
},
{
"epoch": 1.536339123346084,
"eval_loss": 1.2317506074905396,
"eval_runtime": 495.4364,
"eval_samples_per_second": 257.494,
"eval_steps_per_second": 8.047,
"step": 24500
},
{
"epoch": 1.5426098952781087,
"grad_norm": 4.906432151794434,
"learning_rate": 1.365674371614207e-05,
"loss": 0.9001,
"step": 24600
},
{
"epoch": 1.5426098952781087,
"eval_loss": 1.2112876176834106,
"eval_runtime": 501.1743,
"eval_samples_per_second": 254.546,
"eval_steps_per_second": 7.955,
"step": 24600
},
{
"epoch": 1.5488806672101336,
"grad_norm": 3.4020934104919434,
"learning_rate": 1.3622254350363184e-05,
"loss": 1.3256,
"step": 24700
},
{
"epoch": 1.5488806672101336,
"eval_loss": 1.23091721534729,
"eval_runtime": 488.223,
"eval_samples_per_second": 261.299,
"eval_steps_per_second": 8.166,
"step": 24700
},
{
"epoch": 1.5551514391421584,
"grad_norm": 172.33592224121094,
"learning_rate": 1.358741660715219e-05,
"loss": 0.8162,
"step": 24800
},
{
"epoch": 1.5551514391421584,
"eval_loss": 1.213860273361206,
"eval_runtime": 492.7571,
"eval_samples_per_second": 258.894,
"eval_steps_per_second": 8.091,
"step": 24800
},
{
"epoch": 1.5614222110741833,
"grad_norm": 1.1643731594085693,
"learning_rate": 1.3552578863941195e-05,
"loss": 0.5741,
"step": 24900
},
{
"epoch": 1.5614222110741833,
"eval_loss": 1.237512469291687,
"eval_runtime": 506.8194,
"eval_samples_per_second": 251.711,
"eval_steps_per_second": 7.867,
"step": 24900
},
{
"epoch": 1.567692983006208,
"grad_norm": 106.2492446899414,
"learning_rate": 1.3517741120730201e-05,
"loss": 0.883,
"step": 25000
},
{
"epoch": 1.567692983006208,
"eval_loss": 1.203902244567871,
"eval_runtime": 495.5543,
"eval_samples_per_second": 257.433,
"eval_steps_per_second": 8.046,
"step": 25000
},
{
"epoch": 1.5739637549382328,
"grad_norm": 24.915504455566406,
"learning_rate": 1.3482903377519206e-05,
"loss": 1.1212,
"step": 25100
},
{
"epoch": 1.5739637549382328,
"eval_loss": 1.1428111791610718,
"eval_runtime": 489.9866,
"eval_samples_per_second": 260.358,
"eval_steps_per_second": 8.137,
"step": 25100
},
{
"epoch": 1.5802345268702578,
"grad_norm": 0.43622246384620667,
"learning_rate": 1.3448065634308212e-05,
"loss": 0.8229,
"step": 25200
},
{
"epoch": 1.5802345268702578,
"eval_loss": 1.2338348627090454,
"eval_runtime": 488.67,
"eval_samples_per_second": 261.06,
"eval_steps_per_second": 8.159,
"step": 25200
},
{
"epoch": 1.5865052988022825,
"grad_norm": 76.21497344970703,
"learning_rate": 1.3413227891097215e-05,
"loss": 0.8856,
"step": 25300
},
{
"epoch": 1.5865052988022825,
"eval_loss": 1.146145224571228,
"eval_runtime": 504.1995,
"eval_samples_per_second": 253.019,
"eval_steps_per_second": 7.908,
"step": 25300
},
{
"epoch": 1.5927760707343075,
"grad_norm": 114.51611328125,
"learning_rate": 1.337839014788622e-05,
"loss": 1.2323,
"step": 25400
},
{
"epoch": 1.5927760707343075,
"eval_loss": 1.1568622589111328,
"eval_runtime": 492.991,
"eval_samples_per_second": 258.771,
"eval_steps_per_second": 8.087,
"step": 25400
},
{
"epoch": 1.5990468426663322,
"grad_norm": 3.8696110248565674,
"learning_rate": 1.3343552404675226e-05,
"loss": 0.9724,
"step": 25500
},
{
"epoch": 1.5990468426663322,
"eval_loss": 1.1549348831176758,
"eval_runtime": 499.5621,
"eval_samples_per_second": 255.368,
"eval_steps_per_second": 7.981,
"step": 25500
},
{
"epoch": 1.605317614598357,
"grad_norm": 1.6167796850204468,
"learning_rate": 1.330871466146423e-05,
"loss": 1.0791,
"step": 25600
},
{
"epoch": 1.605317614598357,
"eval_loss": 1.1160709857940674,
"eval_runtime": 484.6001,
"eval_samples_per_second": 263.252,
"eval_steps_per_second": 8.227,
"step": 25600
},
{
"epoch": 1.611588386530382,
"grad_norm": 138.8144073486328,
"learning_rate": 1.3273876918253237e-05,
"loss": 0.9845,
"step": 25700
},
{
"epoch": 1.611588386530382,
"eval_loss": 1.1060998439788818,
"eval_runtime": 496.5423,
"eval_samples_per_second": 256.921,
"eval_steps_per_second": 8.03,
"step": 25700
},
{
"epoch": 1.6178591584624067,
"grad_norm": 4.400548934936523,
"learning_rate": 1.3239039175042242e-05,
"loss": 1.1591,
"step": 25800
},
{
"epoch": 1.6178591584624067,
"eval_loss": 1.110283613204956,
"eval_runtime": 486.9154,
"eval_samples_per_second": 262.0,
"eval_steps_per_second": 8.188,
"step": 25800
},
{
"epoch": 1.6241299303944317,
"grad_norm": 239.38189697265625,
"learning_rate": 1.3204201431831248e-05,
"loss": 1.116,
"step": 25900
},
{
"epoch": 1.6241299303944317,
"eval_loss": 1.1404825448989868,
"eval_runtime": 492.7605,
"eval_samples_per_second": 258.892,
"eval_steps_per_second": 8.091,
"step": 25900
},
{
"epoch": 1.6304007023264564,
"grad_norm": 232.2500457763672,
"learning_rate": 1.3169363688620251e-05,
"loss": 1.2221,
"step": 26000
},
{
"epoch": 1.6304007023264564,
"eval_loss": 1.1528397798538208,
"eval_runtime": 487.3414,
"eval_samples_per_second": 261.771,
"eval_steps_per_second": 8.181,
"step": 26000
},
{
"epoch": 1.6366714742584811,
"grad_norm": 5.894351959228516,
"learning_rate": 1.3134525945409257e-05,
"loss": 0.9085,
"step": 26100
},
{
"epoch": 1.6366714742584811,
"eval_loss": 1.139626145362854,
"eval_runtime": 480.0477,
"eval_samples_per_second": 265.749,
"eval_steps_per_second": 8.305,
"step": 26100
},
{
"epoch": 1.642942246190506,
"grad_norm": 0.19382409751415253,
"learning_rate": 1.3099688202198262e-05,
"loss": 0.9543,
"step": 26200
},
{
"epoch": 1.642942246190506,
"eval_loss": 1.195331335067749,
"eval_runtime": 487.008,
"eval_samples_per_second": 261.951,
"eval_steps_per_second": 8.187,
"step": 26200
},
{
"epoch": 1.6492130181225308,
"grad_norm": 240.2974090576172,
"learning_rate": 1.3064850458987268e-05,
"loss": 1.1855,
"step": 26300
},
{
"epoch": 1.6492130181225308,
"eval_loss": 1.1792023181915283,
"eval_runtime": 487.2539,
"eval_samples_per_second": 261.818,
"eval_steps_per_second": 8.183,
"step": 26300
},
{
"epoch": 1.6554837900545558,
"grad_norm": 5.021773338317871,
"learning_rate": 1.3030012715776273e-05,
"loss": 1.0583,
"step": 26400
},
{
"epoch": 1.6554837900545558,
"eval_loss": 1.1666100025177002,
"eval_runtime": 490.5958,
"eval_samples_per_second": 260.035,
"eval_steps_per_second": 8.127,
"step": 26400
},
{
"epoch": 1.6617545619865806,
"grad_norm": 0.47061604261398315,
"learning_rate": 1.299517497256528e-05,
"loss": 0.6583,
"step": 26500
},
{
"epoch": 1.6617545619865806,
"eval_loss": 1.1151552200317383,
"eval_runtime": 489.33,
"eval_samples_per_second": 260.708,
"eval_steps_per_second": 8.148,
"step": 26500
},
{
"epoch": 1.6680253339186053,
"grad_norm": 0.7339816689491272,
"learning_rate": 1.2960337229354282e-05,
"loss": 1.3067,
"step": 26600
},
{
"epoch": 1.6680253339186053,
"eval_loss": 1.0397262573242188,
"eval_runtime": 490.7479,
"eval_samples_per_second": 259.954,
"eval_steps_per_second": 8.124,
"step": 26600
},
{
"epoch": 1.6742961058506303,
"grad_norm": 0.43579697608947754,
"learning_rate": 1.2925499486143289e-05,
"loss": 1.5336,
"step": 26700
},
{
"epoch": 1.6742961058506303,
"eval_loss": 1.1244205236434937,
"eval_runtime": 504.0991,
"eval_samples_per_second": 253.069,
"eval_steps_per_second": 7.909,
"step": 26700
},
{
"epoch": 1.680566877782655,
"grad_norm": 0.877700686454773,
"learning_rate": 1.2890661742932293e-05,
"loss": 0.614,
"step": 26800
},
{
"epoch": 1.680566877782655,
"eval_loss": 1.1273393630981445,
"eval_runtime": 490.8071,
"eval_samples_per_second": 259.923,
"eval_steps_per_second": 8.123,
"step": 26800
},
{
"epoch": 1.68683764971468,
"grad_norm": 2.61261248588562,
"learning_rate": 1.28558239997213e-05,
"loss": 1.0336,
"step": 26900
},
{
"epoch": 1.68683764971468,
"eval_loss": 1.067978024482727,
"eval_runtime": 488.4991,
"eval_samples_per_second": 261.151,
"eval_steps_per_second": 8.162,
"step": 26900
},
{
"epoch": 1.6931084216467047,
"grad_norm": 1.7996759414672852,
"learning_rate": 1.2821334633942416e-05,
"loss": 1.462,
"step": 27000
},
{
"epoch": 1.6931084216467047,
"eval_loss": 1.0983270406723022,
"eval_runtime": 497.9625,
"eval_samples_per_second": 256.188,
"eval_steps_per_second": 8.007,
"step": 27000
},
{
"epoch": 1.6993791935787295,
"grad_norm": 0.4661722183227539,
"learning_rate": 1.2786496890731419e-05,
"loss": 0.8858,
"step": 27100
},
{
"epoch": 1.6993791935787295,
"eval_loss": 1.0672377347946167,
"eval_runtime": 488.5627,
"eval_samples_per_second": 261.117,
"eval_steps_per_second": 8.161,
"step": 27100
},
{
"epoch": 1.7056499655107544,
"grad_norm": 131.8981475830078,
"learning_rate": 1.2751659147520425e-05,
"loss": 0.7494,
"step": 27200
},
{
"epoch": 1.7056499655107544,
"eval_loss": 1.1623871326446533,
"eval_runtime": 489.2152,
"eval_samples_per_second": 260.769,
"eval_steps_per_second": 8.15,
"step": 27200
},
{
"epoch": 1.7119207374427792,
"grad_norm": 1.5505995750427246,
"learning_rate": 1.271682140430943e-05,
"loss": 0.8152,
"step": 27300
},
{
"epoch": 1.7119207374427792,
"eval_loss": 1.0928338766098022,
"eval_runtime": 485.3945,
"eval_samples_per_second": 262.821,
"eval_steps_per_second": 8.214,
"step": 27300
},
{
"epoch": 1.7181915093748041,
"grad_norm": 0.11606509238481522,
"learning_rate": 1.2681983661098436e-05,
"loss": 0.7785,
"step": 27400
},
{
"epoch": 1.7181915093748041,
"eval_loss": 1.0952435731887817,
"eval_runtime": 490.1873,
"eval_samples_per_second": 260.252,
"eval_steps_per_second": 8.134,
"step": 27400
},
{
"epoch": 1.7244622813068289,
"grad_norm": 60.00815963745117,
"learning_rate": 1.264714591788744e-05,
"loss": 1.0471,
"step": 27500
},
{
"epoch": 1.7244622813068289,
"eval_loss": 1.0999162197113037,
"eval_runtime": 472.8514,
"eval_samples_per_second": 269.793,
"eval_steps_per_second": 8.432,
"step": 27500
},
{
"epoch": 1.7307330532388536,
"grad_norm": 0.18325106799602509,
"learning_rate": 1.2612308174676447e-05,
"loss": 1.0994,
"step": 27600
},
{
"epoch": 1.7307330532388536,
"eval_loss": 0.9880152344703674,
"eval_runtime": 489.527,
"eval_samples_per_second": 260.603,
"eval_steps_per_second": 8.145,
"step": 27600
},
{
"epoch": 1.7370038251708786,
"grad_norm": 33.887603759765625,
"learning_rate": 1.257747043146545e-05,
"loss": 1.0706,
"step": 27700
},
{
"epoch": 1.7370038251708786,
"eval_loss": 1.0416243076324463,
"eval_runtime": 486.6381,
"eval_samples_per_second": 262.15,
"eval_steps_per_second": 8.193,
"step": 27700
},
{
"epoch": 1.7432745971029033,
"grad_norm": 122.05184936523438,
"learning_rate": 1.2542632688254456e-05,
"loss": 1.1158,
"step": 27800
},
{
"epoch": 1.7432745971029033,
"eval_loss": 1.0675890445709229,
"eval_runtime": 488.3694,
"eval_samples_per_second": 261.22,
"eval_steps_per_second": 8.164,
"step": 27800
},
{
"epoch": 1.7495453690349283,
"grad_norm": 3.5680992603302,
"learning_rate": 1.2507794945043461e-05,
"loss": 0.9893,
"step": 27900
},
{
"epoch": 1.7495453690349283,
"eval_loss": 1.0288848876953125,
"eval_runtime": 487.5059,
"eval_samples_per_second": 261.683,
"eval_steps_per_second": 8.178,
"step": 27900
},
{
"epoch": 1.755816140966953,
"grad_norm": 0.61468905210495,
"learning_rate": 1.2472957201832467e-05,
"loss": 1.2939,
"step": 28000
},
{
"epoch": 1.755816140966953,
"eval_loss": 1.0149768590927124,
"eval_runtime": 496.1264,
"eval_samples_per_second": 257.136,
"eval_steps_per_second": 8.036,
"step": 28000
},
{
"epoch": 1.7620869128989778,
"grad_norm": 0.23548483848571777,
"learning_rate": 1.2438119458621472e-05,
"loss": 0.9543,
"step": 28100
},
{
"epoch": 1.7620869128989778,
"eval_loss": 1.076741099357605,
"eval_runtime": 494.571,
"eval_samples_per_second": 257.945,
"eval_steps_per_second": 8.062,
"step": 28100
},
{
"epoch": 1.7683576848310028,
"grad_norm": 0.04505012556910515,
"learning_rate": 1.2403281715410475e-05,
"loss": 0.7907,
"step": 28200
},
{
"epoch": 1.7683576848310028,
"eval_loss": 1.071725845336914,
"eval_runtime": 498.1358,
"eval_samples_per_second": 256.099,
"eval_steps_per_second": 8.004,
"step": 28200
},
{
"epoch": 1.7746284567630275,
"grad_norm": 0.3665514886379242,
"learning_rate": 1.2368443972199481e-05,
"loss": 0.92,
"step": 28300
},
{
"epoch": 1.7746284567630275,
"eval_loss": 1.1132545471191406,
"eval_runtime": 494.9621,
"eval_samples_per_second": 257.741,
"eval_steps_per_second": 8.055,
"step": 28300
},
{
"epoch": 1.7808992286950525,
"grad_norm": 2.6903622150421143,
"learning_rate": 1.2333606228988486e-05,
"loss": 0.8636,
"step": 28400
},
{
"epoch": 1.7808992286950525,
"eval_loss": 1.070193886756897,
"eval_runtime": 487.101,
"eval_samples_per_second": 261.901,
"eval_steps_per_second": 8.185,
"step": 28400
},
{
"epoch": 1.7871700006270772,
"grad_norm": 246.5596923828125,
"learning_rate": 1.2298768485777492e-05,
"loss": 0.9118,
"step": 28500
},
{
"epoch": 1.7871700006270772,
"eval_loss": 1.0536377429962158,
"eval_runtime": 500.9429,
"eval_samples_per_second": 254.664,
"eval_steps_per_second": 7.959,
"step": 28500
},
{
"epoch": 1.793440772559102,
"grad_norm": 15.87330150604248,
"learning_rate": 1.2263930742566497e-05,
"loss": 1.2643,
"step": 28600
},
{
"epoch": 1.793440772559102,
"eval_loss": 1.135445237159729,
"eval_runtime": 491.8209,
"eval_samples_per_second": 259.387,
"eval_steps_per_second": 8.107,
"step": 28600
},
{
"epoch": 1.7997115444911267,
"grad_norm": 0.04285774007439613,
"learning_rate": 1.2229092999355503e-05,
"loss": 0.8284,
"step": 28700
},
{
"epoch": 1.7997115444911267,
"eval_loss": 1.0714679956436157,
"eval_runtime": 491.3195,
"eval_samples_per_second": 259.652,
"eval_steps_per_second": 8.115,
"step": 28700
},
{
"epoch": 1.8059823164231517,
"grad_norm": 50.862327575683594,
"learning_rate": 1.2194255256144508e-05,
"loss": 0.8447,
"step": 28800
},
{
"epoch": 1.8059823164231517,
"eval_loss": 1.0457782745361328,
"eval_runtime": 497.4392,
"eval_samples_per_second": 256.457,
"eval_steps_per_second": 8.015,
"step": 28800
},
{
"epoch": 1.8122530883551766,
"grad_norm": 1.507433295249939,
"learning_rate": 1.2159417512933514e-05,
"loss": 1.2102,
"step": 28900
},
{
"epoch": 1.8122530883551766,
"eval_loss": 1.1000713109970093,
"eval_runtime": 492.3678,
"eval_samples_per_second": 259.099,
"eval_steps_per_second": 8.098,
"step": 28900
},
{
"epoch": 1.8185238602872014,
"grad_norm": 182.16946411132812,
"learning_rate": 1.2124579769722517e-05,
"loss": 1.1042,
"step": 29000
},
{
"epoch": 1.8185238602872014,
"eval_loss": 1.0364127159118652,
"eval_runtime": 493.1395,
"eval_samples_per_second": 258.694,
"eval_steps_per_second": 8.085,
"step": 29000
},
{
"epoch": 1.824794632219226,
"grad_norm": 84.75048065185547,
"learning_rate": 1.2089742026511523e-05,
"loss": 0.9638,
"step": 29100
},
{
"epoch": 1.824794632219226,
"eval_loss": 1.0946918725967407,
"eval_runtime": 494.4031,
"eval_samples_per_second": 258.032,
"eval_steps_per_second": 8.064,
"step": 29100
},
{
"epoch": 1.8310654041512509,
"grad_norm": 0.5844135284423828,
"learning_rate": 1.2054904283300528e-05,
"loss": 0.6847,
"step": 29200
},
{
"epoch": 1.8310654041512509,
"eval_loss": 1.0311741828918457,
"eval_runtime": 481.6292,
"eval_samples_per_second": 264.876,
"eval_steps_per_second": 8.278,
"step": 29200
},
{
"epoch": 1.8373361760832758,
"grad_norm": 21.12558364868164,
"learning_rate": 1.2020066540089534e-05,
"loss": 1.7671,
"step": 29300
},
{
"epoch": 1.8373361760832758,
"eval_loss": 1.0470467805862427,
"eval_runtime": 494.9594,
"eval_samples_per_second": 257.742,
"eval_steps_per_second": 8.055,
"step": 29300
},
{
"epoch": 1.8436069480153008,
"grad_norm": 7.0535407066345215,
"learning_rate": 1.1985228796878539e-05,
"loss": 0.7525,
"step": 29400
},
{
"epoch": 1.8436069480153008,
"eval_loss": 1.1158130168914795,
"eval_runtime": 492.3408,
"eval_samples_per_second": 259.113,
"eval_steps_per_second": 8.098,
"step": 29400
},
{
"epoch": 1.8498777199473255,
"grad_norm": 0.11249526590108871,
"learning_rate": 1.1950391053667545e-05,
"loss": 1.2843,
"step": 29500
},
{
"epoch": 1.8498777199473255,
"eval_loss": 1.0139508247375488,
"eval_runtime": 483.0492,
"eval_samples_per_second": 264.097,
"eval_steps_per_second": 8.254,
"step": 29500
},
{
"epoch": 1.8561484918793503,
"grad_norm": 72.12831115722656,
"learning_rate": 1.191590168788866e-05,
"loss": 0.6844,
"step": 29600
},
{
"epoch": 1.8561484918793503,
"eval_loss": 1.1603798866271973,
"eval_runtime": 491.4897,
"eval_samples_per_second": 259.562,
"eval_steps_per_second": 8.112,
"step": 29600
},
{
"epoch": 1.862419263811375,
"grad_norm": 21.705537796020508,
"learning_rate": 1.1881063944677665e-05,
"loss": 1.2824,
"step": 29700
},
{
"epoch": 1.862419263811375,
"eval_loss": 1.0052319765090942,
"eval_runtime": 487.7473,
"eval_samples_per_second": 261.553,
"eval_steps_per_second": 8.174,
"step": 29700
},
{
"epoch": 1.8686900357434,
"grad_norm": 1.3453004360198975,
"learning_rate": 1.1846226201466671e-05,
"loss": 1.314,
"step": 29800
},
{
"epoch": 1.8686900357434,
"eval_loss": 1.0322686433792114,
"eval_runtime": 480.0979,
"eval_samples_per_second": 265.721,
"eval_steps_per_second": 8.305,
"step": 29800
},
{
"epoch": 1.874960807675425,
"grad_norm": 5.6963677406311035,
"learning_rate": 1.1811388458255676e-05,
"loss": 1.0796,
"step": 29900
},
{
"epoch": 1.874960807675425,
"eval_loss": 1.0885429382324219,
"eval_runtime": 483.9557,
"eval_samples_per_second": 263.603,
"eval_steps_per_second": 8.238,
"step": 29900
},
{
"epoch": 1.8812315796074497,
"grad_norm": 0.3642306923866272,
"learning_rate": 1.1776550715044682e-05,
"loss": 1.0012,
"step": 30000
},
{
"epoch": 1.8812315796074497,
"eval_loss": 1.0266896486282349,
"eval_runtime": 498.6153,
"eval_samples_per_second": 255.853,
"eval_steps_per_second": 7.996,
"step": 30000
},
{
"epoch": 1.8875023515394744,
"grad_norm": 45.68118667602539,
"learning_rate": 1.1741712971833685e-05,
"loss": 1.4932,
"step": 30100
},
{
"epoch": 1.8875023515394744,
"eval_loss": 1.0438352823257446,
"eval_runtime": 479.3134,
"eval_samples_per_second": 266.156,
"eval_steps_per_second": 8.318,
"step": 30100
},
{
"epoch": 1.8937731234714992,
"grad_norm": 159.10227966308594,
"learning_rate": 1.1706875228622691e-05,
"loss": 1.0404,
"step": 30200
},
{
"epoch": 1.8937731234714992,
"eval_loss": 1.0162733793258667,
"eval_runtime": 484.5198,
"eval_samples_per_second": 263.296,
"eval_steps_per_second": 8.229,
"step": 30200
},
{
"epoch": 1.9000438954035241,
"grad_norm": 9.165184020996094,
"learning_rate": 1.1672037485411696e-05,
"loss": 0.614,
"step": 30300
},
{
"epoch": 1.9000438954035241,
"eval_loss": 1.0366989374160767,
"eval_runtime": 494.4949,
"eval_samples_per_second": 257.984,
"eval_steps_per_second": 8.063,
"step": 30300
},
{
"epoch": 1.906314667335549,
"grad_norm": 93.2901840209961,
"learning_rate": 1.1637199742200702e-05,
"loss": 1.2676,
"step": 30400
},
{
"epoch": 1.906314667335549,
"eval_loss": 1.080250859260559,
"eval_runtime": 506.0169,
"eval_samples_per_second": 252.11,
"eval_steps_per_second": 7.879,
"step": 30400
},
{
"epoch": 1.9125854392675739,
"grad_norm": 22.93528938293457,
"learning_rate": 1.1602361998989707e-05,
"loss": 1.2431,
"step": 30500
},
{
"epoch": 1.9125854392675739,
"eval_loss": 1.042752742767334,
"eval_runtime": 482.2307,
"eval_samples_per_second": 264.546,
"eval_steps_per_second": 8.268,
"step": 30500
},
{
"epoch": 1.9188562111995986,
"grad_norm": 44.19611358642578,
"learning_rate": 1.1567524255778713e-05,
"loss": 1.4063,
"step": 30600
},
{
"epoch": 1.9188562111995986,
"eval_loss": 1.0318702459335327,
"eval_runtime": 482.0351,
"eval_samples_per_second": 264.653,
"eval_steps_per_second": 8.271,
"step": 30600
},
{
"epoch": 1.9251269831316233,
"grad_norm": 0.21961411833763123,
"learning_rate": 1.1532686512567716e-05,
"loss": 0.7787,
"step": 30700
},
{
"epoch": 1.9251269831316233,
"eval_loss": 0.9666246175765991,
"eval_runtime": 497.4003,
"eval_samples_per_second": 256.478,
"eval_steps_per_second": 8.016,
"step": 30700
},
{
"epoch": 1.9313977550636483,
"grad_norm": 5.579217910766602,
"learning_rate": 1.1497848769356722e-05,
"loss": 1.0311,
"step": 30800
},
{
"epoch": 1.9313977550636483,
"eval_loss": 1.0375796556472778,
"eval_runtime": 496.1027,
"eval_samples_per_second": 257.148,
"eval_steps_per_second": 8.037,
"step": 30800
},
{
"epoch": 1.9376685269956733,
"grad_norm": 0.01572820357978344,
"learning_rate": 1.1463011026145727e-05,
"loss": 1.0353,
"step": 30900
},
{
"epoch": 1.9376685269956733,
"eval_loss": 0.9868729114532471,
"eval_runtime": 491.6277,
"eval_samples_per_second": 259.489,
"eval_steps_per_second": 8.11,
"step": 30900
},
{
"epoch": 1.943939298927698,
"grad_norm": 1.0484445095062256,
"learning_rate": 1.1428173282934732e-05,
"loss": 1.2221,
"step": 31000
},
{
"epoch": 1.943939298927698,
"eval_loss": 0.968561589717865,
"eval_runtime": 499.708,
"eval_samples_per_second": 255.293,
"eval_steps_per_second": 7.979,
"step": 31000
},
{
"epoch": 1.9502100708597228,
"grad_norm": 123.73536682128906,
"learning_rate": 1.1393335539723738e-05,
"loss": 0.5806,
"step": 31100
},
{
"epoch": 1.9502100708597228,
"eval_loss": 0.9662685394287109,
"eval_runtime": 496.0179,
"eval_samples_per_second": 257.192,
"eval_steps_per_second": 8.038,
"step": 31100
},
{
"epoch": 1.9564808427917475,
"grad_norm": 265.9390869140625,
"learning_rate": 1.1358497796512741e-05,
"loss": 0.6919,
"step": 31200
},
{
"epoch": 1.9564808427917475,
"eval_loss": 0.9837759733200073,
"eval_runtime": 481.0273,
"eval_samples_per_second": 265.207,
"eval_steps_per_second": 8.289,
"step": 31200
},
{
"epoch": 1.9627516147237725,
"grad_norm": 1.0015980005264282,
"learning_rate": 1.1323660053301749e-05,
"loss": 0.8028,
"step": 31300
},
{
"epoch": 1.9627516147237725,
"eval_loss": 0.9759084582328796,
"eval_runtime": 487.9887,
"eval_samples_per_second": 261.424,
"eval_steps_per_second": 8.17,
"step": 31300
},
{
"epoch": 1.9690223866557974,
"grad_norm": 31.675607681274414,
"learning_rate": 1.1288822310090752e-05,
"loss": 0.8365,
"step": 31400
},
{
"epoch": 1.9690223866557974,
"eval_loss": 0.9640862345695496,
"eval_runtime": 496.5309,
"eval_samples_per_second": 256.927,
"eval_steps_per_second": 8.03,
"step": 31400
},
{
"epoch": 1.9752931585878222,
"grad_norm": 1.1243913173675537,
"learning_rate": 1.1253984566879758e-05,
"loss": 0.7518,
"step": 31500
},
{
"epoch": 1.9752931585878222,
"eval_loss": 1.008094310760498,
"eval_runtime": 499.5695,
"eval_samples_per_second": 255.364,
"eval_steps_per_second": 7.981,
"step": 31500
},
{
"epoch": 1.981563930519847,
"grad_norm": 216.04434204101562,
"learning_rate": 1.1219495201100875e-05,
"loss": 1.0654,
"step": 31600
},
{
"epoch": 1.981563930519847,
"eval_loss": 0.9843435287475586,
"eval_runtime": 480.8256,
"eval_samples_per_second": 265.319,
"eval_steps_per_second": 8.292,
"step": 31600
},
{
"epoch": 1.9878347024518717,
"grad_norm": 0.3936084806919098,
"learning_rate": 1.1184657457889878e-05,
"loss": 0.8637,
"step": 31700
},
{
"epoch": 1.9878347024518717,
"eval_loss": 0.963536262512207,
"eval_runtime": 481.4757,
"eval_samples_per_second": 264.96,
"eval_steps_per_second": 8.281,
"step": 31700
},
{
"epoch": 1.9941054743838966,
"grad_norm": 8.97900104522705,
"learning_rate": 1.1149819714678884e-05,
"loss": 0.8663,
"step": 31800
},
{
"epoch": 1.9941054743838966,
"eval_loss": 0.9537881016731262,
"eval_runtime": 488.2812,
"eval_samples_per_second": 261.268,
"eval_steps_per_second": 8.165,
"step": 31800
},
{
"epoch": 2.0003762463159216,
"grad_norm": 0.23352281749248505,
"learning_rate": 1.1114981971467889e-05,
"loss": 0.8524,
"step": 31900
},
{
"epoch": 2.0003762463159216,
"eval_loss": 0.9627546072006226,
"eval_runtime": 476.3461,
"eval_samples_per_second": 267.814,
"eval_steps_per_second": 8.37,
"step": 31900
},
{
"epoch": 2.0066470182479463,
"grad_norm": 10.038532257080078,
"learning_rate": 1.1080144228256895e-05,
"loss": 1.2748,
"step": 32000
},
{
"epoch": 2.0066470182479463,
"eval_loss": 0.9381898641586304,
"eval_runtime": 483.6522,
"eval_samples_per_second": 263.768,
"eval_steps_per_second": 8.244,
"step": 32000
},
{
"epoch": 2.012917790179971,
"grad_norm": 3.102550745010376,
"learning_rate": 1.10453064850459e-05,
"loss": 0.8138,
"step": 32100
},
{
"epoch": 2.012917790179971,
"eval_loss": 0.9460862874984741,
"eval_runtime": 486.1122,
"eval_samples_per_second": 262.433,
"eval_steps_per_second": 8.202,
"step": 32100
},
{
"epoch": 2.019188562111996,
"grad_norm": 5.872899532318115,
"learning_rate": 1.1010468741834906e-05,
"loss": 0.4484,
"step": 32200
},
{
"epoch": 2.019188562111996,
"eval_loss": 0.9221316576004028,
"eval_runtime": 489.5035,
"eval_samples_per_second": 260.615,
"eval_steps_per_second": 8.145,
"step": 32200
},
{
"epoch": 2.025459334044021,
"grad_norm": 70.84674072265625,
"learning_rate": 1.0975630998623909e-05,
"loss": 0.8839,
"step": 32300
},
{
"epoch": 2.025459334044021,
"eval_loss": 0.9566515684127808,
"eval_runtime": 497.3551,
"eval_samples_per_second": 256.501,
"eval_steps_per_second": 8.016,
"step": 32300
},
{
"epoch": 2.0317301059760458,
"grad_norm": 20.528474807739258,
"learning_rate": 1.0940793255412915e-05,
"loss": 0.7599,
"step": 32400
},
{
"epoch": 2.0317301059760458,
"eval_loss": 0.9439575672149658,
"eval_runtime": 475.9709,
"eval_samples_per_second": 268.025,
"eval_steps_per_second": 8.377,
"step": 32400
},
{
"epoch": 2.0380008779080705,
"grad_norm": 0.2569330930709839,
"learning_rate": 1.090595551220192e-05,
"loss": 0.8665,
"step": 32500
},
{
"epoch": 2.0380008779080705,
"eval_loss": 0.9651756882667542,
"eval_runtime": 476.2761,
"eval_samples_per_second": 267.853,
"eval_steps_per_second": 8.371,
"step": 32500
},
{
"epoch": 2.0442716498400952,
"grad_norm": 160.0611572265625,
"learning_rate": 1.0871117768990926e-05,
"loss": 0.5802,
"step": 32600
},
{
"epoch": 2.0442716498400952,
"eval_loss": 0.9474946856498718,
"eval_runtime": 498.1214,
"eval_samples_per_second": 256.106,
"eval_steps_per_second": 8.004,
"step": 32600
},
{
"epoch": 2.05054242177212,
"grad_norm": 13.137542724609375,
"learning_rate": 1.083628002577993e-05,
"loss": 0.7731,
"step": 32700
},
{
"epoch": 2.05054242177212,
"eval_loss": 0.9197245240211487,
"eval_runtime": 471.6865,
"eval_samples_per_second": 270.459,
"eval_steps_per_second": 8.453,
"step": 32700
},
{
"epoch": 2.056813193704145,
"grad_norm": 4.745016574859619,
"learning_rate": 1.0801442282568937e-05,
"loss": 0.7913,
"step": 32800
},
{
"epoch": 2.056813193704145,
"eval_loss": 1.002418875694275,
"eval_runtime": 444.3682,
"eval_samples_per_second": 287.086,
"eval_steps_per_second": 8.972,
"step": 32800
},
{
"epoch": 2.06308396563617,
"grad_norm": 273.15252685546875,
"learning_rate": 1.0766604539357942e-05,
"loss": 0.7758,
"step": 32900
},
{
"epoch": 2.06308396563617,
"eval_loss": 0.9257067441940308,
"eval_runtime": 479.6839,
"eval_samples_per_second": 265.95,
"eval_steps_per_second": 8.312,
"step": 32900
},
{
"epoch": 2.0693547375681947,
"grad_norm": 0.2749234437942505,
"learning_rate": 1.0731766796146948e-05,
"loss": 0.7468,
"step": 33000
},
{
"epoch": 2.0693547375681947,
"eval_loss": 0.9662745594978333,
"eval_runtime": 482.8123,
"eval_samples_per_second": 264.227,
"eval_steps_per_second": 8.258,
"step": 33000
},
{
"epoch": 2.0756255095002194,
"grad_norm": 2.7121362686157227,
"learning_rate": 1.0696929052935951e-05,
"loss": 0.9947,
"step": 33100
},
{
"epoch": 2.0756255095002194,
"eval_loss": 0.9788134098052979,
"eval_runtime": 488.227,
"eval_samples_per_second": 261.296,
"eval_steps_per_second": 8.166,
"step": 33100
},
{
"epoch": 2.081896281432244,
"grad_norm": 0.2543056905269623,
"learning_rate": 1.0662091309724957e-05,
"loss": 0.5618,
"step": 33200
},
{
"epoch": 2.081896281432244,
"eval_loss": 0.948021650314331,
"eval_runtime": 491.5864,
"eval_samples_per_second": 259.511,
"eval_steps_per_second": 8.11,
"step": 33200
},
{
"epoch": 2.0881670533642693,
"grad_norm": 0.034537989646196365,
"learning_rate": 1.0627253566513962e-05,
"loss": 0.8805,
"step": 33300
},
{
"epoch": 2.0881670533642693,
"eval_loss": 0.9520492553710938,
"eval_runtime": 482.9571,
"eval_samples_per_second": 264.148,
"eval_steps_per_second": 8.255,
"step": 33300
},
{
"epoch": 2.094437825296294,
"grad_norm": 4.662662982940674,
"learning_rate": 1.0592415823302968e-05,
"loss": 0.9755,
"step": 33400
},
{
"epoch": 2.094437825296294,
"eval_loss": 0.9288346767425537,
"eval_runtime": 495.4516,
"eval_samples_per_second": 257.486,
"eval_steps_per_second": 8.047,
"step": 33400
},
{
"epoch": 2.100708597228319,
"grad_norm": 64.40668487548828,
"learning_rate": 1.0557578080091973e-05,
"loss": 0.8942,
"step": 33500
},
{
"epoch": 2.100708597228319,
"eval_loss": 0.9233998656272888,
"eval_runtime": 486.5273,
"eval_samples_per_second": 262.209,
"eval_steps_per_second": 8.195,
"step": 33500
},
{
"epoch": 2.1069793691603436,
"grad_norm": 2.1412320137023926,
"learning_rate": 1.0522740336880976e-05,
"loss": 0.7242,
"step": 33600
},
{
"epoch": 2.1069793691603436,
"eval_loss": 0.9412585496902466,
"eval_runtime": 481.4435,
"eval_samples_per_second": 264.978,
"eval_steps_per_second": 8.281,
"step": 33600
},
{
"epoch": 2.1132501410923683,
"grad_norm": 5.01767635345459,
"learning_rate": 1.0487902593669982e-05,
"loss": 0.6231,
"step": 33700
},
{
"epoch": 2.1132501410923683,
"eval_loss": 0.9660213589668274,
"eval_runtime": 480.6062,
"eval_samples_per_second": 265.44,
"eval_steps_per_second": 8.296,
"step": 33700
},
{
"epoch": 2.1195209130243935,
"grad_norm": 0.02841496281325817,
"learning_rate": 1.0453064850458987e-05,
"loss": 0.7144,
"step": 33800
},
{
"epoch": 2.1195209130243935,
"eval_loss": 0.8900822997093201,
"eval_runtime": 506.9048,
"eval_samples_per_second": 251.669,
"eval_steps_per_second": 7.865,
"step": 33800
},
{
"epoch": 2.1257916849564182,
"grad_norm": 14.184029579162598,
"learning_rate": 1.0418227107247993e-05,
"loss": 0.7139,
"step": 33900
},
{
"epoch": 2.1257916849564182,
"eval_loss": 0.9535605907440186,
"eval_runtime": 467.6722,
"eval_samples_per_second": 272.781,
"eval_steps_per_second": 8.525,
"step": 33900
},
{
"epoch": 2.132062456888443,
"grad_norm": 247.51730346679688,
"learning_rate": 1.0383389364036998e-05,
"loss": 0.6378,
"step": 34000
},
{
"epoch": 2.132062456888443,
"eval_loss": 0.9369811415672302,
"eval_runtime": 467.3096,
"eval_samples_per_second": 272.992,
"eval_steps_per_second": 8.532,
"step": 34000
},
{
"epoch": 2.1383332288204677,
"grad_norm": 0.265493243932724,
"learning_rate": 1.0348551620826004e-05,
"loss": 0.7607,
"step": 34100
},
{
"epoch": 2.1383332288204677,
"eval_loss": 0.9209387898445129,
"eval_runtime": 460.3387,
"eval_samples_per_second": 277.126,
"eval_steps_per_second": 8.661,
"step": 34100
},
{
"epoch": 2.1446040007524925,
"grad_norm": 4.44495153427124,
"learning_rate": 1.0313713877615009e-05,
"loss": 0.8667,
"step": 34200
},
{
"epoch": 2.1446040007524925,
"eval_loss": 0.9734475016593933,
"eval_runtime": 472.3123,
"eval_samples_per_second": 270.101,
"eval_steps_per_second": 8.441,
"step": 34200
},
{
"epoch": 2.1508747726845177,
"grad_norm": 1.1490778923034668,
"learning_rate": 1.0278876134404015e-05,
"loss": 0.8533,
"step": 34300
},
{
"epoch": 2.1508747726845177,
"eval_loss": 0.9177405834197998,
"eval_runtime": 481.8576,
"eval_samples_per_second": 264.75,
"eval_steps_per_second": 8.274,
"step": 34300
},
{
"epoch": 2.1571455446165424,
"grad_norm": 6.377614498138428,
"learning_rate": 1.0244038391193018e-05,
"loss": 0.6395,
"step": 34400
},
{
"epoch": 2.1571455446165424,
"eval_loss": 0.9285467863082886,
"eval_runtime": 491.7764,
"eval_samples_per_second": 259.411,
"eval_steps_per_second": 8.107,
"step": 34400
},
{
"epoch": 2.163416316548567,
"grad_norm": 63.10408401489258,
"learning_rate": 1.0209200647982025e-05,
"loss": 0.7377,
"step": 34500
},
{
"epoch": 2.163416316548567,
"eval_loss": 0.9046958088874817,
"eval_runtime": 472.1262,
"eval_samples_per_second": 270.207,
"eval_steps_per_second": 8.445,
"step": 34500
},
{
"epoch": 2.169687088480592,
"grad_norm": 0.07853188365697861,
"learning_rate": 1.017471128220314e-05,
"loss": 0.7787,
"step": 34600
},
{
"epoch": 2.169687088480592,
"eval_loss": 0.9967793822288513,
"eval_runtime": 488.6457,
"eval_samples_per_second": 261.073,
"eval_steps_per_second": 8.159,
"step": 34600
},
{
"epoch": 2.1759578604126166,
"grad_norm": 44.51852035522461,
"learning_rate": 1.0139873538992144e-05,
"loss": 0.6561,
"step": 34700
},
{
"epoch": 2.1759578604126166,
"eval_loss": 0.9653065800666809,
"eval_runtime": 480.7646,
"eval_samples_per_second": 265.352,
"eval_steps_per_second": 8.293,
"step": 34700
},
{
"epoch": 2.182228632344642,
"grad_norm": 37.319366455078125,
"learning_rate": 1.010503579578115e-05,
"loss": 0.6169,
"step": 34800
},
{
"epoch": 2.182228632344642,
"eval_loss": 0.9403988122940063,
"eval_runtime": 467.6154,
"eval_samples_per_second": 272.814,
"eval_steps_per_second": 8.526,
"step": 34800
},
{
"epoch": 2.1884994042766666,
"grad_norm": 0.24766607582569122,
"learning_rate": 1.0070198052570155e-05,
"loss": 0.7643,
"step": 34900
},
{
"epoch": 2.1884994042766666,
"eval_loss": 0.9397174715995789,
"eval_runtime": 492.9746,
"eval_samples_per_second": 258.78,
"eval_steps_per_second": 8.088,
"step": 34900
},
{
"epoch": 2.1947701762086913,
"grad_norm": 1.6579983234405518,
"learning_rate": 1.0035360309359161e-05,
"loss": 0.998,
"step": 35000
},
{
"epoch": 2.1947701762086913,
"eval_loss": 0.9152400493621826,
"eval_runtime": 488.9266,
"eval_samples_per_second": 260.923,
"eval_steps_per_second": 8.155,
"step": 35000
},
{
"epoch": 2.201040948140716,
"grad_norm": 14.633705139160156,
"learning_rate": 1.0000522566148166e-05,
"loss": 0.8246,
"step": 35100
},
{
"epoch": 2.201040948140716,
"eval_loss": 0.9512937068939209,
"eval_runtime": 497.3249,
"eval_samples_per_second": 256.516,
"eval_steps_per_second": 8.017,
"step": 35100
},
{
"epoch": 2.207311720072741,
"grad_norm": 162.75132751464844,
"learning_rate": 9.96568482293717e-06,
"loss": 0.6655,
"step": 35200
},
{
"epoch": 2.207311720072741,
"eval_loss": 0.9354454278945923,
"eval_runtime": 479.342,
"eval_samples_per_second": 266.14,
"eval_steps_per_second": 8.318,
"step": 35200
},
{
"epoch": 2.213582492004766,
"grad_norm": 0.0890607163310051,
"learning_rate": 9.930847079726175e-06,
"loss": 0.9279,
"step": 35300
},
{
"epoch": 2.213582492004766,
"eval_loss": 0.9034134745597839,
"eval_runtime": 495.8444,
"eval_samples_per_second": 257.282,
"eval_steps_per_second": 8.041,
"step": 35300
},
{
"epoch": 2.2198532639367907,
"grad_norm": 12.482114791870117,
"learning_rate": 9.896009336515181e-06,
"loss": 0.4239,
"step": 35400
},
{
"epoch": 2.2198532639367907,
"eval_loss": 0.9606735706329346,
"eval_runtime": 472.1675,
"eval_samples_per_second": 270.184,
"eval_steps_per_second": 8.444,
"step": 35400
},
{
"epoch": 2.2261240358688155,
"grad_norm": 193.45916748046875,
"learning_rate": 9.861171593304186e-06,
"loss": 1.0023,
"step": 35500
},
{
"epoch": 2.2261240358688155,
"eval_loss": 0.8731982707977295,
"eval_runtime": 502.841,
"eval_samples_per_second": 253.702,
"eval_steps_per_second": 7.929,
"step": 35500
},
{
"epoch": 2.23239480780084,
"grad_norm": 25.368621826171875,
"learning_rate": 9.826333850093192e-06,
"loss": 0.7426,
"step": 35600
},
{
"epoch": 2.23239480780084,
"eval_loss": 0.8882994651794434,
"eval_runtime": 489.5037,
"eval_samples_per_second": 260.615,
"eval_steps_per_second": 8.145,
"step": 35600
},
{
"epoch": 2.238665579732865,
"grad_norm": 6.321267127990723,
"learning_rate": 9.791496106882197e-06,
"loss": 0.8675,
"step": 35700
},
{
"epoch": 2.238665579732865,
"eval_loss": 0.9296298027038574,
"eval_runtime": 481.2287,
"eval_samples_per_second": 265.096,
"eval_steps_per_second": 8.285,
"step": 35700
},
{
"epoch": 2.24493635166489,
"grad_norm": 0.16120706498622894,
"learning_rate": 9.756658363671202e-06,
"loss": 0.9226,
"step": 35800
},
{
"epoch": 2.24493635166489,
"eval_loss": 0.9145704507827759,
"eval_runtime": 507.1996,
"eval_samples_per_second": 251.522,
"eval_steps_per_second": 7.861,
"step": 35800
},
{
"epoch": 2.251207123596915,
"grad_norm": 14.761024475097656,
"learning_rate": 9.721820620460208e-06,
"loss": 0.4944,
"step": 35900
},
{
"epoch": 2.251207123596915,
"eval_loss": 0.9145201444625854,
"eval_runtime": 480.7973,
"eval_samples_per_second": 265.334,
"eval_steps_per_second": 8.292,
"step": 35900
},
{
"epoch": 2.2574778955289396,
"grad_norm": 15.988486289978027,
"learning_rate": 9.686982877249213e-06,
"loss": 0.9663,
"step": 36000
},
{
"epoch": 2.2574778955289396,
"eval_loss": 0.8893073201179504,
"eval_runtime": 466.9887,
"eval_samples_per_second": 273.18,
"eval_steps_per_second": 8.538,
"step": 36000
},
{
"epoch": 2.2637486674609644,
"grad_norm": 8.232684135437012,
"learning_rate": 9.652493511470327e-06,
"loss": 0.6455,
"step": 36100
},
{
"epoch": 2.2637486674609644,
"eval_loss": 0.9238069653511047,
"eval_runtime": 471.8533,
"eval_samples_per_second": 270.364,
"eval_steps_per_second": 8.45,
"step": 36100
},
{
"epoch": 2.270019439392989,
"grad_norm": 0.20196978747844696,
"learning_rate": 9.617655768259333e-06,
"loss": 0.9673,
"step": 36200
},
{
"epoch": 2.270019439392989,
"eval_loss": 0.8942546248435974,
"eval_runtime": 497.8096,
"eval_samples_per_second": 256.267,
"eval_steps_per_second": 8.009,
"step": 36200
},
{
"epoch": 2.2762902113250143,
"grad_norm": 180.26956176757812,
"learning_rate": 9.582818025048338e-06,
"loss": 0.7974,
"step": 36300
},
{
"epoch": 2.2762902113250143,
"eval_loss": 0.8620045185089111,
"eval_runtime": 505.8787,
"eval_samples_per_second": 252.179,
"eval_steps_per_second": 7.881,
"step": 36300
},
{
"epoch": 2.282560983257039,
"grad_norm": 118.34184265136719,
"learning_rate": 9.547980281837343e-06,
"loss": 0.9777,
"step": 36400
},
{
"epoch": 2.282560983257039,
"eval_loss": 0.8812283873558044,
"eval_runtime": 484.5263,
"eval_samples_per_second": 263.292,
"eval_steps_per_second": 8.229,
"step": 36400
},
{
"epoch": 2.288831755189064,
"grad_norm": 1.6557927131652832,
"learning_rate": 9.513142538626349e-06,
"loss": 0.8741,
"step": 36500
},
{
"epoch": 2.288831755189064,
"eval_loss": 0.8862267732620239,
"eval_runtime": 499.5387,
"eval_samples_per_second": 255.38,
"eval_steps_per_second": 7.981,
"step": 36500
},
{
"epoch": 2.2951025271210885,
"grad_norm": 0.09699351340532303,
"learning_rate": 9.478304795415354e-06,
"loss": 0.9642,
"step": 36600
},
{
"epoch": 2.2951025271210885,
"eval_loss": 0.9157158732414246,
"eval_runtime": 473.3855,
"eval_samples_per_second": 269.489,
"eval_steps_per_second": 8.422,
"step": 36600
},
{
"epoch": 2.3013732990531133,
"grad_norm": 166.3496551513672,
"learning_rate": 9.44346705220436e-06,
"loss": 0.9225,
"step": 36700
},
{
"epoch": 2.3013732990531133,
"eval_loss": 0.8784195780754089,
"eval_runtime": 480.9986,
"eval_samples_per_second": 265.223,
"eval_steps_per_second": 8.289,
"step": 36700
},
{
"epoch": 2.3076440709851385,
"grad_norm": 3.2308545112609863,
"learning_rate": 9.408629308993365e-06,
"loss": 0.6789,
"step": 36800
},
{
"epoch": 2.3076440709851385,
"eval_loss": 0.9065931439399719,
"eval_runtime": 486.2861,
"eval_samples_per_second": 262.339,
"eval_steps_per_second": 8.199,
"step": 36800
},
{
"epoch": 2.313914842917163,
"grad_norm": 117.87212371826172,
"learning_rate": 9.37379156578237e-06,
"loss": 0.6726,
"step": 36900
},
{
"epoch": 2.313914842917163,
"eval_loss": 0.9090869426727295,
"eval_runtime": 479.1369,
"eval_samples_per_second": 266.254,
"eval_steps_per_second": 8.321,
"step": 36900
},
{
"epoch": 2.320185614849188,
"grad_norm": 64.78949737548828,
"learning_rate": 9.338953822571376e-06,
"loss": 0.7326,
"step": 37000
},
{
"epoch": 2.320185614849188,
"eval_loss": 0.9202622175216675,
"eval_runtime": 484.0573,
"eval_samples_per_second": 263.547,
"eval_steps_per_second": 8.237,
"step": 37000
},
{
"epoch": 2.3264563867812127,
"grad_norm": 259.2130126953125,
"learning_rate": 9.30411607936038e-06,
"loss": 1.007,
"step": 37100
},
{
"epoch": 2.3264563867812127,
"eval_loss": 0.9124699234962463,
"eval_runtime": 494.1141,
"eval_samples_per_second": 258.183,
"eval_steps_per_second": 8.069,
"step": 37100
},
{
"epoch": 2.3327271587132374,
"grad_norm": 3.8969433307647705,
"learning_rate": 9.269278336149385e-06,
"loss": 0.6134,
"step": 37200
},
{
"epoch": 2.3327271587132374,
"eval_loss": 0.8837085366249084,
"eval_runtime": 473.1604,
"eval_samples_per_second": 269.617,
"eval_steps_per_second": 8.426,
"step": 37200
},
{
"epoch": 2.3389979306452626,
"grad_norm": 0.8037031292915344,
"learning_rate": 9.234440592938391e-06,
"loss": 0.9051,
"step": 37300
},
{
"epoch": 2.3389979306452626,
"eval_loss": 0.8945268392562866,
"eval_runtime": 488.4927,
"eval_samples_per_second": 261.154,
"eval_steps_per_second": 8.162,
"step": 37300
},
{
"epoch": 2.3452687025772874,
"grad_norm": 70.98564910888672,
"learning_rate": 9.199602849727396e-06,
"loss": 0.837,
"step": 37400
},
{
"epoch": 2.3452687025772874,
"eval_loss": 0.8740183711051941,
"eval_runtime": 492.9522,
"eval_samples_per_second": 258.792,
"eval_steps_per_second": 8.088,
"step": 37400
},
{
"epoch": 2.351539474509312,
"grad_norm": 1.006698489189148,
"learning_rate": 9.1647651065164e-06,
"loss": 0.7615,
"step": 37500
},
{
"epoch": 2.351539474509312,
"eval_loss": 0.916473388671875,
"eval_runtime": 499.2162,
"eval_samples_per_second": 255.545,
"eval_steps_per_second": 7.987,
"step": 37500
},
{
"epoch": 2.357810246441337,
"grad_norm": 0.15957336127758026,
"learning_rate": 9.129927363305405e-06,
"loss": 0.8304,
"step": 37600
},
{
"epoch": 2.357810246441337,
"eval_loss": 0.9107189774513245,
"eval_runtime": 494.2784,
"eval_samples_per_second": 258.097,
"eval_steps_per_second": 8.066,
"step": 37600
},
{
"epoch": 2.3640810183733616,
"grad_norm": 0.21330799162387848,
"learning_rate": 9.09508962009441e-06,
"loss": 0.6255,
"step": 37700
},
{
"epoch": 2.3640810183733616,
"eval_loss": 0.8891344666481018,
"eval_runtime": 489.2061,
"eval_samples_per_second": 260.774,
"eval_steps_per_second": 8.15,
"step": 37700
},
{
"epoch": 2.370351790305387,
"grad_norm": 1.2431716918945312,
"learning_rate": 9.060251876883416e-06,
"loss": 0.6775,
"step": 37800
},
{
"epoch": 2.370351790305387,
"eval_loss": 0.8907997608184814,
"eval_runtime": 497.9968,
"eval_samples_per_second": 256.17,
"eval_steps_per_second": 8.006,
"step": 37800
},
{
"epoch": 2.3766225622374115,
"grad_norm": 0.6021884679794312,
"learning_rate": 9.025414133672421e-06,
"loss": 0.7159,
"step": 37900
},
{
"epoch": 2.3766225622374115,
"eval_loss": 0.8589950203895569,
"eval_runtime": 499.2581,
"eval_samples_per_second": 255.523,
"eval_steps_per_second": 7.986,
"step": 37900
},
{
"epoch": 2.3828933341694363,
"grad_norm": 0.35575389862060547,
"learning_rate": 8.990576390461425e-06,
"loss": 0.6422,
"step": 38000
},
{
"epoch": 2.3828933341694363,
"eval_loss": 0.8558962941169739,
"eval_runtime": 493.9361,
"eval_samples_per_second": 258.276,
"eval_steps_per_second": 8.072,
"step": 38000
},
{
"epoch": 2.389164106101461,
"grad_norm": 0.1820683479309082,
"learning_rate": 8.955738647250432e-06,
"loss": 0.7773,
"step": 38100
},
{
"epoch": 2.389164106101461,
"eval_loss": 0.8600557446479797,
"eval_runtime": 493.6945,
"eval_samples_per_second": 258.403,
"eval_steps_per_second": 8.076,
"step": 38100
},
{
"epoch": 2.3954348780334858,
"grad_norm": 0.02349485270678997,
"learning_rate": 8.920900904039436e-06,
"loss": 0.5457,
"step": 38200
},
{
"epoch": 2.3954348780334858,
"eval_loss": 0.8856033086776733,
"eval_runtime": 499.9609,
"eval_samples_per_second": 255.164,
"eval_steps_per_second": 7.975,
"step": 38200
},
{
"epoch": 2.401705649965511,
"grad_norm": 38.077266693115234,
"learning_rate": 8.886063160828443e-06,
"loss": 0.4997,
"step": 38300
},
{
"epoch": 2.401705649965511,
"eval_loss": 0.8785237669944763,
"eval_runtime": 503.5878,
"eval_samples_per_second": 253.326,
"eval_steps_per_second": 7.917,
"step": 38300
},
{
"epoch": 2.4079764218975357,
"grad_norm": 100.00057983398438,
"learning_rate": 8.851225417617447e-06,
"loss": 0.6319,
"step": 38400
},
{
"epoch": 2.4079764218975357,
"eval_loss": 0.885017454624176,
"eval_runtime": 496.4876,
"eval_samples_per_second": 256.949,
"eval_steps_per_second": 8.03,
"step": 38400
},
{
"epoch": 2.4142471938295604,
"grad_norm": 71.8719253540039,
"learning_rate": 8.816387674406452e-06,
"loss": 0.7096,
"step": 38500
},
{
"epoch": 2.4142471938295604,
"eval_loss": 0.823376476764679,
"eval_runtime": 500.2741,
"eval_samples_per_second": 255.004,
"eval_steps_per_second": 7.97,
"step": 38500
}
],
"logging_steps": 100,
"max_steps": 63788,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}