bgem3-sft-20240523-e1 / trainer_state.json
nntoan209's picture
Upload folder using huggingface_hub
cec3540 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3409,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 11.125381634627788,
"learning_rate": 5.308241808752198e-06,
"loss": 0.4004,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 8.92423656628485,
"learning_rate": 7.508241808752199e-06,
"loss": 0.4709,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 8.41715845868157,
"learning_rate": 8.795159310338741e-06,
"loss": 0.3973,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 15.774318751695052,
"learning_rate": 9.708241808752198e-06,
"loss": 0.4057,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 8.099327492758947,
"learning_rate": 1.0416483617504396e-05,
"loss": 0.2547,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 7.413713903241163,
"learning_rate": 1.099515931033874e-05,
"loss": 0.2041,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 9.03722608897779,
"learning_rate": 1.1484422637278927e-05,
"loss": 0.2867,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 8.92455629586844,
"learning_rate": 1.1908241808752199e-05,
"loss": 0.2939,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 10.156800616083926,
"learning_rate": 1.2282076811925285e-05,
"loss": 0.2955,
"step": 45
},
{
"epoch": 0.01,
"grad_norm": 9.420288964636175,
"learning_rate": 1.2616483617504393e-05,
"loss": 0.1845,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 16.959594279780564,
"learning_rate": 1.2918991369754252e-05,
"loss": 0.2387,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 6.074452366890137,
"learning_rate": 1.3195159310338741e-05,
"loss": 0.2455,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 4.742335711555924,
"learning_rate": 1.3449209188662602e-05,
"loss": 0.159,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": 5.288626310542604,
"learning_rate": 1.3684422637278928e-05,
"loss": 0.1843,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 7.44716471541915,
"learning_rate": 1.3903401119090938e-05,
"loss": 0.1788,
"step": 75
},
{
"epoch": 0.02,
"grad_norm": 3.821096016199364,
"learning_rate": 1.4108241808752197e-05,
"loss": 0.1514,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 8.992446696823665,
"learning_rate": 1.4300660059502947e-05,
"loss": 0.2482,
"step": 85
},
{
"epoch": 0.03,
"grad_norm": 5.839197680608412,
"learning_rate": 1.4482076811925287e-05,
"loss": 0.2065,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 8.087974815050226,
"learning_rate": 1.4653682338328086e-05,
"loss": 0.2201,
"step": 95
},
{
"epoch": 0.03,
"grad_norm": 9.987995189856553,
"learning_rate": 1.4816483617504398e-05,
"loss": 0.1501,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 7.240449600169706,
"learning_rate": 1.4971340138865471e-05,
"loss": 0.2253,
"step": 105
},
{
"epoch": 0.03,
"grad_norm": 15.014801352117672,
"learning_rate": 1.5118991369754255e-05,
"loss": 0.1938,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 8.588750321209607,
"learning_rate": 1.5260078112077627e-05,
"loss": 0.2299,
"step": 115
},
{
"epoch": 0.04,
"grad_norm": 20.519070570571127,
"learning_rate": 1.5395159310338742e-05,
"loss": 0.2432,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 4.505660841015201,
"learning_rate": 1.5524725426256594e-05,
"loss": 0.127,
"step": 125
},
{
"epoch": 0.04,
"grad_norm": 6.3767344789277605,
"learning_rate": 1.56492091886626e-05,
"loss": 0.192,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 6.2113668041489545,
"learning_rate": 1.576899431351183e-05,
"loss": 0.2247,
"step": 135
},
{
"epoch": 0.04,
"grad_norm": 11.909747553663436,
"learning_rate": 1.5884422637278926e-05,
"loss": 0.2069,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 5.701699776433005,
"learning_rate": 1.5995799998032858e-05,
"loss": 0.1573,
"step": 145
},
{
"epoch": 0.04,
"grad_norm": 4.996573800175535,
"learning_rate": 1.6103401119090937e-05,
"loss": 0.1758,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 9.43286827567575,
"learning_rate": 1.6207473691603323e-05,
"loss": 0.1664,
"step": 155
},
{
"epoch": 0.05,
"grad_norm": 28.24015920275371,
"learning_rate": 1.6308241808752197e-05,
"loss": 0.1844,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 8.923975388298482,
"learning_rate": 1.6405908871340797e-05,
"loss": 0.2391,
"step": 165
},
{
"epoch": 0.05,
"grad_norm": 6.282811516394321,
"learning_rate": 1.6500660059502946e-05,
"loss": 0.1676,
"step": 170
},
{
"epoch": 0.05,
"grad_norm": 10.30498920082885,
"learning_rate": 1.6592664446031127e-05,
"loss": 0.1934,
"step": 175
},
{
"epoch": 0.05,
"grad_norm": 5.1185908366180195,
"learning_rate": 1.6682076811925287e-05,
"loss": 0.1531,
"step": 180
},
{
"epoch": 0.05,
"grad_norm": 4.756718752566272,
"learning_rate": 1.6769039213135887e-05,
"loss": 0.1765,
"step": 185
},
{
"epoch": 0.06,
"grad_norm": 6.629667545228336,
"learning_rate": 1.6853682338328088e-05,
"loss": 0.2043,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 1.1680908649614796,
"learning_rate": 1.6936126690249144e-05,
"loss": 0.1845,
"step": 195
},
{
"epoch": 0.06,
"grad_norm": 2.594897280738297,
"learning_rate": 1.7016483617504395e-05,
"loss": 0.1142,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 2.9032680841669585,
"learning_rate": 1.7094856218911983e-05,
"loss": 0.2007,
"step": 205
},
{
"epoch": 0.06,
"grad_norm": 6.848184342158123,
"learning_rate": 1.717134013886547e-05,
"loss": 0.1239,
"step": 210
},
{
"epoch": 0.06,
"grad_norm": 6.717780464657794,
"learning_rate": 1.7246024269096814e-05,
"loss": 0.2148,
"step": 215
},
{
"epoch": 0.06,
"grad_norm": 2.418335623324901,
"learning_rate": 1.7318991369754256e-05,
"loss": 0.1228,
"step": 220
},
{
"epoch": 0.07,
"grad_norm": 6.090782655933177,
"learning_rate": 1.739031862067748e-05,
"loss": 0.2336,
"step": 225
},
{
"epoch": 0.07,
"grad_norm": 9.653963182390594,
"learning_rate": 1.7460078112077626e-05,
"loss": 0.1624,
"step": 230
},
{
"epoch": 0.07,
"grad_norm": 6.782020630535402,
"learning_rate": 1.7528337282443e-05,
"loss": 0.2082,
"step": 235
},
{
"epoch": 0.07,
"grad_norm": 7.955935238397691,
"learning_rate": 1.7595159310338742e-05,
"loss": 0.1203,
"step": 240
},
{
"epoch": 0.07,
"grad_norm": 11.86700555838968,
"learning_rate": 1.766060346580566e-05,
"loss": 0.2336,
"step": 245
},
{
"epoch": 0.07,
"grad_norm": 7.932728692529283,
"learning_rate": 1.772472542625659e-05,
"loss": 0.1557,
"step": 250
},
{
"epoch": 0.07,
"grad_norm": 9.367369941926402,
"learning_rate": 1.7787577561089487e-05,
"loss": 0.1778,
"step": 255
},
{
"epoch": 0.08,
"grad_norm": 8.119258788907397,
"learning_rate": 1.7849209188662603e-05,
"loss": 0.1641,
"step": 260
},
{
"epoch": 0.08,
"grad_norm": 8.877824360780032,
"learning_rate": 1.7909666808791235e-05,
"loss": 0.2865,
"step": 265
},
{
"epoch": 0.08,
"grad_norm": 5.311739339787183,
"learning_rate": 1.796899431351183e-05,
"loss": 0.1644,
"step": 270
},
{
"epoch": 0.08,
"grad_norm": 3.648816107438258,
"learning_rate": 1.802723317850645e-05,
"loss": 0.1989,
"step": 275
},
{
"epoch": 0.08,
"grad_norm": 37.56990879513816,
"learning_rate": 1.8084422637278925e-05,
"loss": 0.1892,
"step": 280
},
{
"epoch": 0.08,
"grad_norm": 16.74826541858141,
"learning_rate": 1.8140599839914632e-05,
"loss": 0.2343,
"step": 285
},
{
"epoch": 0.09,
"grad_norm": 4.863631858596725,
"learning_rate": 1.8195799998032857e-05,
"loss": 0.2181,
"step": 290
},
{
"epoch": 0.09,
"grad_norm": 3.9113582322289524,
"learning_rate": 1.8250056517348252e-05,
"loss": 0.1715,
"step": 295
},
{
"epoch": 0.09,
"grad_norm": 11.84889258509689,
"learning_rate": 1.830340111909094e-05,
"loss": 0.1459,
"step": 300
},
{
"epoch": 0.09,
"grad_norm": 4.8514319753979915,
"learning_rate": 1.8355863951390547e-05,
"loss": 0.136,
"step": 305
},
{
"epoch": 0.09,
"grad_norm": 10.329272877304971,
"learning_rate": 1.8407473691603325e-05,
"loss": 0.161,
"step": 310
},
{
"epoch": 0.09,
"grad_norm": 15.22633652105913,
"learning_rate": 1.8458257640452014e-05,
"loss": 0.1721,
"step": 315
},
{
"epoch": 0.09,
"grad_norm": 4.230396160936562,
"learning_rate": 1.8508241808752197e-05,
"loss": 0.1408,
"step": 320
},
{
"epoch": 0.1,
"grad_norm": 4.777596867016981,
"learning_rate": 1.85574509974148e-05,
"loss": 0.1496,
"step": 325
},
{
"epoch": 0.1,
"grad_norm": 5.111340439300221,
"learning_rate": 1.8605908871340793e-05,
"loss": 0.2215,
"step": 330
},
{
"epoch": 0.1,
"grad_norm": 12.347791565416415,
"learning_rate": 1.8653638027759297e-05,
"loss": 0.1494,
"step": 335
},
{
"epoch": 0.1,
"grad_norm": 9.015406243961076,
"learning_rate": 1.8700660059502946e-05,
"loss": 0.1212,
"step": 340
},
{
"epoch": 0.1,
"grad_norm": 5.797822201400517,
"learning_rate": 1.874699561366417e-05,
"loss": 0.142,
"step": 345
},
{
"epoch": 0.1,
"grad_norm": 17.174149644614868,
"learning_rate": 1.8792664446031123e-05,
"loss": 0.1458,
"step": 350
},
{
"epoch": 0.1,
"grad_norm": 11.455289753213496,
"learning_rate": 1.88376854716625e-05,
"loss": 0.2466,
"step": 355
},
{
"epoch": 0.11,
"grad_norm": 4.808218103853913,
"learning_rate": 1.8882076811925286e-05,
"loss": 0.1382,
"step": 360
},
{
"epoch": 0.11,
"grad_norm": 6.0172560821404835,
"learning_rate": 1.8925855838288235e-05,
"loss": 0.2062,
"step": 365
},
{
"epoch": 0.11,
"grad_norm": 7.361219818784043,
"learning_rate": 1.8969039213135886e-05,
"loss": 0.2005,
"step": 370
},
{
"epoch": 0.11,
"grad_norm": 12.237376033790202,
"learning_rate": 1.9011642927843134e-05,
"loss": 0.2208,
"step": 375
},
{
"epoch": 0.11,
"grad_norm": 4.98412721789337,
"learning_rate": 1.9053682338328088e-05,
"loss": 0.1888,
"step": 380
},
{
"epoch": 0.11,
"grad_norm": 4.540797625815676,
"learning_rate": 1.9095172198280984e-05,
"loss": 0.1749,
"step": 385
},
{
"epoch": 0.11,
"grad_norm": 3.822828217331199,
"learning_rate": 1.9136126690249147e-05,
"loss": 0.0922,
"step": 390
},
{
"epoch": 0.12,
"grad_norm": 3.990734216544265,
"learning_rate": 1.9176559454741825e-05,
"loss": 0.1863,
"step": 395
},
{
"epoch": 0.12,
"grad_norm": 6.910160942356377,
"learning_rate": 1.9216483617504394e-05,
"loss": 0.2068,
"step": 400
},
{
"epoch": 0.12,
"grad_norm": 4.781285105059034,
"learning_rate": 1.9255911815098372e-05,
"loss": 0.1525,
"step": 405
},
{
"epoch": 0.12,
"grad_norm": 13.76373104788082,
"learning_rate": 1.9294856218911982e-05,
"loss": 0.2154,
"step": 410
},
{
"epoch": 0.12,
"grad_norm": 7.48460945287791,
"learning_rate": 1.9333328557715434e-05,
"loss": 0.1214,
"step": 415
},
{
"epoch": 0.12,
"grad_norm": 9.890476894625708,
"learning_rate": 1.937134013886547e-05,
"loss": 0.1381,
"step": 420
},
{
"epoch": 0.12,
"grad_norm": 9.300476915169963,
"learning_rate": 1.9408901868255147e-05,
"loss": 0.0847,
"step": 425
},
{
"epoch": 0.13,
"grad_norm": 6.528248658568062,
"learning_rate": 1.9446024269096816e-05,
"loss": 0.1411,
"step": 430
},
{
"epoch": 0.13,
"grad_norm": 3.682181191664401,
"learning_rate": 1.94827174996194e-05,
"loss": 0.1426,
"step": 435
},
{
"epoch": 0.13,
"grad_norm": 8.43703521578652,
"learning_rate": 1.951899136975425e-05,
"loss": 0.1579,
"step": 440
},
{
"epoch": 0.13,
"grad_norm": 6.56291531656417,
"learning_rate": 1.9554855356878272e-05,
"loss": 0.2021,
"step": 445
},
{
"epoch": 0.13,
"grad_norm": 10.066497956266401,
"learning_rate": 1.9590318620677484e-05,
"loss": 0.1154,
"step": 450
},
{
"epoch": 0.13,
"grad_norm": 5.966271330169713,
"learning_rate": 1.962539001718933e-05,
"loss": 0.2115,
"step": 455
},
{
"epoch": 0.13,
"grad_norm": 5.856232881470111,
"learning_rate": 1.9660078112077626e-05,
"loss": 0.1418,
"step": 460
},
{
"epoch": 0.14,
"grad_norm": 6.240360608562805,
"learning_rate": 1.9694391193189866e-05,
"loss": 0.1586,
"step": 465
},
{
"epoch": 0.14,
"grad_norm": 9.757750340699735,
"learning_rate": 1.9728337282443e-05,
"loss": 0.1449,
"step": 470
},
{
"epoch": 0.14,
"grad_norm": 7.712636650272143,
"learning_rate": 1.9761924147080285e-05,
"loss": 0.135,
"step": 475
},
{
"epoch": 0.14,
"grad_norm": 4.515393736306068,
"learning_rate": 1.9795159310338744e-05,
"loss": 0.129,
"step": 480
},
{
"epoch": 0.14,
"grad_norm": 0.7671227888175197,
"learning_rate": 1.982805006156388e-05,
"loss": 0.222,
"step": 485
},
{
"epoch": 0.14,
"grad_norm": 9.304084815966336,
"learning_rate": 1.9860603465805653e-05,
"loss": 0.1849,
"step": 490
},
{
"epoch": 0.15,
"grad_norm": 9.749297679157017,
"learning_rate": 1.989282637292734e-05,
"loss": 0.1362,
"step": 495
},
{
"epoch": 0.15,
"grad_norm": 21.698855546577796,
"learning_rate": 1.9924725426256592e-05,
"loss": 0.2026,
"step": 500
},
{
"epoch": 0.15,
"grad_norm": 8.82338005089098,
"learning_rate": 1.995630707080615e-05,
"loss": 0.1667,
"step": 505
},
{
"epoch": 0.15,
"grad_norm": 26.86060661079716,
"learning_rate": 1.998757756108949e-05,
"loss": 0.175,
"step": 510
},
{
"epoch": 0.15,
"grad_norm": 13.652883014206537,
"learning_rate": 1.9999995294744797e-05,
"loss": 0.1679,
"step": 515
},
{
"epoch": 0.15,
"grad_norm": 8.716096864047406,
"learning_rate": 1.9999966540423482e-05,
"loss": 0.1385,
"step": 520
},
{
"epoch": 0.15,
"grad_norm": 9.1722231121385,
"learning_rate": 1.99999116458866e-05,
"loss": 0.1694,
"step": 525
},
{
"epoch": 0.16,
"grad_norm": 20.79172007745624,
"learning_rate": 1.9999830611277667e-05,
"loss": 0.2038,
"step": 530
},
{
"epoch": 0.16,
"grad_norm": 7.595957745153807,
"learning_rate": 1.9999723436808522e-05,
"loss": 0.2264,
"step": 535
},
{
"epoch": 0.16,
"grad_norm": 3.7192450442308376,
"learning_rate": 1.9999590122759357e-05,
"loss": 0.1615,
"step": 540
},
{
"epoch": 0.16,
"grad_norm": 3.637037613572214,
"learning_rate": 1.9999430669478693e-05,
"loss": 0.0805,
"step": 545
},
{
"epoch": 0.16,
"grad_norm": 7.478389632328602,
"learning_rate": 1.999924507738338e-05,
"loss": 0.1623,
"step": 550
},
{
"epoch": 0.16,
"grad_norm": 0.5979318918112878,
"learning_rate": 1.9999033346958624e-05,
"loss": 0.1491,
"step": 555
},
{
"epoch": 0.16,
"grad_norm": 12.424319850795971,
"learning_rate": 1.999879547875794e-05,
"loss": 0.1631,
"step": 560
},
{
"epoch": 0.17,
"grad_norm": 5.76364771735449,
"learning_rate": 1.9998531473403187e-05,
"loss": 0.2197,
"step": 565
},
{
"epoch": 0.17,
"grad_norm": 6.606496630551793,
"learning_rate": 1.999824133158455e-05,
"loss": 0.1468,
"step": 570
},
{
"epoch": 0.17,
"grad_norm": 4.41747007920085,
"learning_rate": 1.999792505406055e-05,
"loss": 0.1366,
"step": 575
},
{
"epoch": 0.17,
"grad_norm": 5.075879536599382,
"learning_rate": 1.999758264165802e-05,
"loss": 0.0992,
"step": 580
},
{
"epoch": 0.17,
"grad_norm": 22.308467554033715,
"learning_rate": 1.9997214095272135e-05,
"loss": 0.2057,
"step": 585
},
{
"epoch": 0.17,
"grad_norm": 7.279463258176968,
"learning_rate": 1.9996819415866377e-05,
"loss": 0.1195,
"step": 590
},
{
"epoch": 0.17,
"grad_norm": 2.9315010662776357,
"learning_rate": 1.9996398604472556e-05,
"loss": 0.0958,
"step": 595
},
{
"epoch": 0.18,
"grad_norm": 21.026513936696166,
"learning_rate": 1.9995951662190794e-05,
"loss": 0.1842,
"step": 600
},
{
"epoch": 0.18,
"grad_norm": 4.622313054019638,
"learning_rate": 1.9995478590189534e-05,
"loss": 0.1048,
"step": 605
},
{
"epoch": 0.18,
"grad_norm": 6.403749452434921,
"learning_rate": 1.9994979389705517e-05,
"loss": 0.0971,
"step": 610
},
{
"epoch": 0.18,
"grad_norm": 14.94556621011129,
"learning_rate": 1.9994454062043795e-05,
"loss": 0.2127,
"step": 615
},
{
"epoch": 0.18,
"grad_norm": 8.09929499442626,
"learning_rate": 1.999390260857774e-05,
"loss": 0.2903,
"step": 620
},
{
"epoch": 0.18,
"grad_norm": 7.006300679141809,
"learning_rate": 1.9993325030749006e-05,
"loss": 0.1775,
"step": 625
},
{
"epoch": 0.18,
"grad_norm": 9.59543458146263,
"learning_rate": 1.9992721330067547e-05,
"loss": 0.2223,
"step": 630
},
{
"epoch": 0.19,
"grad_norm": 12.495151647629797,
"learning_rate": 1.9992091508111616e-05,
"loss": 0.2557,
"step": 635
},
{
"epoch": 0.19,
"grad_norm": 11.89488787437466,
"learning_rate": 1.9991435566527757e-05,
"loss": 0.2852,
"step": 640
},
{
"epoch": 0.19,
"grad_norm": 3.188905144434785,
"learning_rate": 1.999075350703078e-05,
"loss": 0.0961,
"step": 645
},
{
"epoch": 0.19,
"grad_norm": 4.654862369916143,
"learning_rate": 1.99900453314038e-05,
"loss": 0.1578,
"step": 650
},
{
"epoch": 0.19,
"grad_norm": 4.32187595326047,
"learning_rate": 1.9989311041498186e-05,
"loss": 0.127,
"step": 655
},
{
"epoch": 0.19,
"grad_norm": 4.215063448210065,
"learning_rate": 1.9988550639233587e-05,
"loss": 0.1392,
"step": 660
},
{
"epoch": 0.2,
"grad_norm": 15.535200940500214,
"learning_rate": 1.998776412659792e-05,
"loss": 0.1974,
"step": 665
},
{
"epoch": 0.2,
"grad_norm": 4.760906747957173,
"learning_rate": 1.998695150564736e-05,
"loss": 0.1434,
"step": 670
},
{
"epoch": 0.2,
"grad_norm": 0.061155960694268596,
"learning_rate": 1.998611277850633e-05,
"loss": 0.117,
"step": 675
},
{
"epoch": 0.2,
"grad_norm": 4.953787678092304,
"learning_rate": 1.9985247947367508e-05,
"loss": 0.1831,
"step": 680
},
{
"epoch": 0.2,
"grad_norm": 4.678233435016873,
"learning_rate": 1.9984357014491816e-05,
"loss": 0.1597,
"step": 685
},
{
"epoch": 0.2,
"grad_norm": 3.229829256723626,
"learning_rate": 1.9983439982208417e-05,
"loss": 0.1587,
"step": 690
},
{
"epoch": 0.2,
"grad_norm": 4.885033869708901,
"learning_rate": 1.9982496852914696e-05,
"loss": 0.18,
"step": 695
},
{
"epoch": 0.21,
"grad_norm": 6.329976580969712,
"learning_rate": 1.9981527629076265e-05,
"loss": 0.1383,
"step": 700
},
{
"epoch": 0.21,
"grad_norm": 8.616010763659258,
"learning_rate": 1.9980532313226964e-05,
"loss": 0.1439,
"step": 705
},
{
"epoch": 0.21,
"grad_norm": 5.805509239551478,
"learning_rate": 1.9979510907968834e-05,
"loss": 0.1916,
"step": 710
},
{
"epoch": 0.21,
"grad_norm": 9.354847725343019,
"learning_rate": 1.9978463415972135e-05,
"loss": 0.1291,
"step": 715
},
{
"epoch": 0.21,
"grad_norm": 11.028811436490342,
"learning_rate": 1.997738983997531e-05,
"loss": 0.1329,
"step": 720
},
{
"epoch": 0.21,
"grad_norm": 5.398403758317581,
"learning_rate": 1.9976290182784994e-05,
"loss": 0.169,
"step": 725
},
{
"epoch": 0.21,
"grad_norm": 1.5150079159167873,
"learning_rate": 1.9975164447276022e-05,
"loss": 0.14,
"step": 730
},
{
"epoch": 0.22,
"grad_norm": 5.437256906271298,
"learning_rate": 1.9974012636391393e-05,
"loss": 0.1706,
"step": 735
},
{
"epoch": 0.22,
"grad_norm": 4.686594964713854,
"learning_rate": 1.9972834753142275e-05,
"loss": 0.1086,
"step": 740
},
{
"epoch": 0.22,
"grad_norm": 7.059615364727769,
"learning_rate": 1.9971630800607995e-05,
"loss": 0.1391,
"step": 745
},
{
"epoch": 0.22,
"grad_norm": 3.759488626159761,
"learning_rate": 1.9970400781936044e-05,
"loss": 0.1252,
"step": 750
},
{
"epoch": 0.22,
"grad_norm": 5.196072033882789,
"learning_rate": 1.9969144700342042e-05,
"loss": 0.1907,
"step": 755
},
{
"epoch": 0.22,
"grad_norm": 12.217160172705722,
"learning_rate": 1.9967862559109757e-05,
"loss": 0.2199,
"step": 760
},
{
"epoch": 0.22,
"grad_norm": 4.605767548552978,
"learning_rate": 1.996655436159108e-05,
"loss": 0.0851,
"step": 765
},
{
"epoch": 0.23,
"grad_norm": 2.7976726055332968,
"learning_rate": 1.9965220111206022e-05,
"loss": 0.1428,
"step": 770
},
{
"epoch": 0.23,
"grad_norm": 6.6315563077229465,
"learning_rate": 1.9963859811442695e-05,
"loss": 0.1419,
"step": 775
},
{
"epoch": 0.23,
"grad_norm": 7.1281321818622025,
"learning_rate": 1.996247346585733e-05,
"loss": 0.1895,
"step": 780
},
{
"epoch": 0.23,
"grad_norm": 5.305688238343365,
"learning_rate": 1.9961061078074236e-05,
"loss": 0.1973,
"step": 785
},
{
"epoch": 0.23,
"grad_norm": 6.655387141934961,
"learning_rate": 1.99596226517858e-05,
"loss": 0.222,
"step": 790
},
{
"epoch": 0.23,
"grad_norm": 3.802457245856603,
"learning_rate": 1.9958158190752497e-05,
"loss": 0.1619,
"step": 795
},
{
"epoch": 0.23,
"grad_norm": 4.205338699906657,
"learning_rate": 1.9956667698802847e-05,
"loss": 0.1411,
"step": 800
},
{
"epoch": 0.24,
"grad_norm": 18.6994174478946,
"learning_rate": 1.9955151179833437e-05,
"loss": 0.1763,
"step": 805
},
{
"epoch": 0.24,
"grad_norm": 5.203119684623317,
"learning_rate": 1.995360863780889e-05,
"loss": 0.1651,
"step": 810
},
{
"epoch": 0.24,
"grad_norm": 1.1272129969070257,
"learning_rate": 1.9952040076761857e-05,
"loss": 0.1222,
"step": 815
},
{
"epoch": 0.24,
"grad_norm": 7.271805429556148,
"learning_rate": 1.9950445500793015e-05,
"loss": 0.1072,
"step": 820
},
{
"epoch": 0.24,
"grad_norm": 7.652271467084596,
"learning_rate": 1.994882491407105e-05,
"loss": 0.1221,
"step": 825
},
{
"epoch": 0.24,
"grad_norm": 11.746166496698574,
"learning_rate": 1.9947178320832656e-05,
"loss": 0.2158,
"step": 830
},
{
"epoch": 0.24,
"grad_norm": 6.5191485713782535,
"learning_rate": 1.99455057253825e-05,
"loss": 0.1689,
"step": 835
},
{
"epoch": 0.25,
"grad_norm": 5.7533957364325286,
"learning_rate": 1.9943807132093236e-05,
"loss": 0.2428,
"step": 840
},
{
"epoch": 0.25,
"grad_norm": 7.743116459704382,
"learning_rate": 1.9942082545405485e-05,
"loss": 0.2132,
"step": 845
},
{
"epoch": 0.25,
"grad_norm": 7.284870082767622,
"learning_rate": 1.9940331969827816e-05,
"loss": 0.1184,
"step": 850
},
{
"epoch": 0.25,
"grad_norm": 6.8174806638918355,
"learning_rate": 1.9938555409936746e-05,
"loss": 0.1717,
"step": 855
},
{
"epoch": 0.25,
"grad_norm": 4.522024299566416,
"learning_rate": 1.9936752870376722e-05,
"loss": 0.1544,
"step": 860
},
{
"epoch": 0.25,
"grad_norm": 10.08832302588539,
"learning_rate": 1.9934924355860107e-05,
"loss": 0.1735,
"step": 865
},
{
"epoch": 0.26,
"grad_norm": 12.48917866555821,
"learning_rate": 1.993306987116717e-05,
"loss": 0.1627,
"step": 870
},
{
"epoch": 0.26,
"grad_norm": 5.430627799088825,
"learning_rate": 1.993118942114608e-05,
"loss": 0.1113,
"step": 875
},
{
"epoch": 0.26,
"grad_norm": 6.3813002626417905,
"learning_rate": 1.992928301071288e-05,
"loss": 0.1718,
"step": 880
},
{
"epoch": 0.26,
"grad_norm": 13.52771259964932,
"learning_rate": 1.9927350644851477e-05,
"loss": 0.1118,
"step": 885
},
{
"epoch": 0.26,
"grad_norm": 3.5900299721319406,
"learning_rate": 1.9925392328613644e-05,
"loss": 0.1357,
"step": 890
},
{
"epoch": 0.26,
"grad_norm": 10.835472443494583,
"learning_rate": 1.992340806711899e-05,
"loss": 0.0815,
"step": 895
},
{
"epoch": 0.26,
"grad_norm": 6.39064559373592,
"learning_rate": 1.992139786555496e-05,
"loss": 0.2278,
"step": 900
},
{
"epoch": 0.27,
"grad_norm": 14.873787966804063,
"learning_rate": 1.9919361729176798e-05,
"loss": 0.1245,
"step": 905
},
{
"epoch": 0.27,
"grad_norm": 0.8036248310449102,
"learning_rate": 1.991729966330756e-05,
"loss": 0.1297,
"step": 910
},
{
"epoch": 0.27,
"grad_norm": 2.7695654260331835,
"learning_rate": 1.991521167333809e-05,
"loss": 0.1493,
"step": 915
},
{
"epoch": 0.27,
"grad_norm": 7.413157143500232,
"learning_rate": 1.9913097764727006e-05,
"loss": 0.1712,
"step": 920
},
{
"epoch": 0.27,
"grad_norm": 5.35044325659026,
"learning_rate": 1.9910957943000678e-05,
"loss": 0.1923,
"step": 925
},
{
"epoch": 0.27,
"grad_norm": 4.939892539272544,
"learning_rate": 1.9908792213753223e-05,
"loss": 0.1262,
"step": 930
},
{
"epoch": 0.27,
"grad_norm": 4.425995433503358,
"learning_rate": 1.990660058264649e-05,
"loss": 0.1316,
"step": 935
},
{
"epoch": 0.28,
"grad_norm": 6.682173559661954,
"learning_rate": 1.9904383055410045e-05,
"loss": 0.2628,
"step": 940
},
{
"epoch": 0.28,
"grad_norm": 19.871499575052912,
"learning_rate": 1.9902139637841146e-05,
"loss": 0.1646,
"step": 945
},
{
"epoch": 0.28,
"grad_norm": 6.951640795770302,
"learning_rate": 1.989987033580475e-05,
"loss": 0.1733,
"step": 950
},
{
"epoch": 0.28,
"grad_norm": 9.387316535142354,
"learning_rate": 1.989757515523346e-05,
"loss": 0.1448,
"step": 955
},
{
"epoch": 0.28,
"grad_norm": 3.9401918928016197,
"learning_rate": 1.9895254102127562e-05,
"loss": 0.1421,
"step": 960
},
{
"epoch": 0.28,
"grad_norm": 14.253092112437391,
"learning_rate": 1.989290718255496e-05,
"loss": 0.205,
"step": 965
},
{
"epoch": 0.28,
"grad_norm": 3.6398631596257545,
"learning_rate": 1.9890534402651184e-05,
"loss": 0.0899,
"step": 970
},
{
"epoch": 0.29,
"grad_norm": 15.630617642490721,
"learning_rate": 1.988813576861938e-05,
"loss": 0.1328,
"step": 975
},
{
"epoch": 0.29,
"grad_norm": 6.6456920006850115,
"learning_rate": 1.9885711286730267e-05,
"loss": 0.1899,
"step": 980
},
{
"epoch": 0.29,
"grad_norm": 7.585995077745574,
"learning_rate": 1.9883260963322152e-05,
"loss": 0.1583,
"step": 985
},
{
"epoch": 0.29,
"grad_norm": 3.476123842146937,
"learning_rate": 1.98807848048009e-05,
"loss": 0.1855,
"step": 990
},
{
"epoch": 0.29,
"grad_norm": 4.179578662061332,
"learning_rate": 1.987828281763991e-05,
"loss": 0.1681,
"step": 995
},
{
"epoch": 0.29,
"grad_norm": 3.565609561977014,
"learning_rate": 1.9875755008380104e-05,
"loss": 0.1187,
"step": 1000
},
{
"epoch": 0.29,
"grad_norm": 9.26279986638738,
"learning_rate": 1.9873201383629913e-05,
"loss": 0.1337,
"step": 1005
},
{
"epoch": 0.3,
"grad_norm": 3.569019308801666,
"learning_rate": 1.987062195006526e-05,
"loss": 0.0932,
"step": 1010
},
{
"epoch": 0.3,
"grad_norm": 1.8954161789376982,
"learning_rate": 1.986801671442953e-05,
"loss": 0.1272,
"step": 1015
},
{
"epoch": 0.3,
"grad_norm": 6.549425452273108,
"learning_rate": 1.986538568353358e-05,
"loss": 0.2543,
"step": 1020
},
{
"epoch": 0.3,
"grad_norm": 6.401069146499022,
"learning_rate": 1.9862728864255677e-05,
"loss": 0.1339,
"step": 1025
},
{
"epoch": 0.3,
"grad_norm": 14.623867414912695,
"learning_rate": 1.9860046263541537e-05,
"loss": 0.1368,
"step": 1030
},
{
"epoch": 0.3,
"grad_norm": 7.5226028629064166,
"learning_rate": 1.9857337888404254e-05,
"loss": 0.1315,
"step": 1035
},
{
"epoch": 0.31,
"grad_norm": 6.3020352784733795,
"learning_rate": 1.985460374592431e-05,
"loss": 0.2022,
"step": 1040
},
{
"epoch": 0.31,
"grad_norm": 35.72017508640075,
"learning_rate": 1.9851843843249552e-05,
"loss": 0.1907,
"step": 1045
},
{
"epoch": 0.31,
"grad_norm": 7.856532164781736,
"learning_rate": 1.9849058187595173e-05,
"loss": 0.1042,
"step": 1050
},
{
"epoch": 0.31,
"grad_norm": 4.939914354516286,
"learning_rate": 1.9846246786243682e-05,
"loss": 0.1883,
"step": 1055
},
{
"epoch": 0.31,
"grad_norm": 5.847586448522947,
"learning_rate": 1.9843409646544912e-05,
"loss": 0.1352,
"step": 1060
},
{
"epoch": 0.31,
"grad_norm": 1.7662920071735315,
"learning_rate": 1.984054677591597e-05,
"loss": 0.1266,
"step": 1065
},
{
"epoch": 0.31,
"grad_norm": 5.934002297797303,
"learning_rate": 1.9837658181841236e-05,
"loss": 0.1282,
"step": 1070
},
{
"epoch": 0.32,
"grad_norm": 7.322599560603541,
"learning_rate": 1.9834743871872333e-05,
"loss": 0.1002,
"step": 1075
},
{
"epoch": 0.32,
"grad_norm": 7.354422877300046,
"learning_rate": 1.9831803853628122e-05,
"loss": 0.1347,
"step": 1080
},
{
"epoch": 0.32,
"grad_norm": 4.244910165579806,
"learning_rate": 1.9828838134794668e-05,
"loss": 0.191,
"step": 1085
},
{
"epoch": 0.32,
"grad_norm": 2.189412507115152,
"learning_rate": 1.9825846723125222e-05,
"loss": 0.129,
"step": 1090
},
{
"epoch": 0.32,
"grad_norm": 4.334446446188146,
"learning_rate": 1.9822829626440213e-05,
"loss": 0.1606,
"step": 1095
},
{
"epoch": 0.32,
"grad_norm": 10.79965169839475,
"learning_rate": 1.9819786852627208e-05,
"loss": 0.2085,
"step": 1100
},
{
"epoch": 0.32,
"grad_norm": 5.047688232660633,
"learning_rate": 1.9816718409640904e-05,
"loss": 0.1335,
"step": 1105
},
{
"epoch": 0.33,
"grad_norm": 3.0521785647450033,
"learning_rate": 1.9813624305503105e-05,
"loss": 0.0902,
"step": 1110
},
{
"epoch": 0.33,
"grad_norm": 4.404798119689452,
"learning_rate": 1.9810504548302706e-05,
"loss": 0.1504,
"step": 1115
},
{
"epoch": 0.33,
"grad_norm": 4.276958235694841,
"learning_rate": 1.980735914619566e-05,
"loss": 0.1787,
"step": 1120
},
{
"epoch": 0.33,
"grad_norm": 6.67294781745391,
"learning_rate": 1.9804188107404973e-05,
"loss": 0.1485,
"step": 1125
},
{
"epoch": 0.33,
"grad_norm": 1.4659143223309605,
"learning_rate": 1.9800991440220652e-05,
"loss": 0.1161,
"step": 1130
},
{
"epoch": 0.33,
"grad_norm": 12.717469246978801,
"learning_rate": 1.979776915299973e-05,
"loss": 0.1409,
"step": 1135
},
{
"epoch": 0.33,
"grad_norm": 7.082211154981896,
"learning_rate": 1.9794521254166197e-05,
"loss": 0.1543,
"step": 1140
},
{
"epoch": 0.34,
"grad_norm": 5.796340300790522,
"learning_rate": 1.9791247752211014e-05,
"loss": 0.1151,
"step": 1145
},
{
"epoch": 0.34,
"grad_norm": 5.586857817269538,
"learning_rate": 1.978794865569207e-05,
"loss": 0.1199,
"step": 1150
},
{
"epoch": 0.34,
"grad_norm": 12.706251296959476,
"learning_rate": 1.9784623973234158e-05,
"loss": 0.1619,
"step": 1155
},
{
"epoch": 0.34,
"grad_norm": 6.66191554843005,
"learning_rate": 1.978127371352898e-05,
"loss": 0.1827,
"step": 1160
},
{
"epoch": 0.34,
"grad_norm": 1.7314817335468968,
"learning_rate": 1.9777897885335077e-05,
"loss": 0.1299,
"step": 1165
},
{
"epoch": 0.34,
"grad_norm": 3.1709913704813957,
"learning_rate": 1.9774496497477863e-05,
"loss": 0.0935,
"step": 1170
},
{
"epoch": 0.34,
"grad_norm": 6.802763534426451,
"learning_rate": 1.9771069558849553e-05,
"loss": 0.1747,
"step": 1175
},
{
"epoch": 0.35,
"grad_norm": 5.666618159965396,
"learning_rate": 1.9767617078409162e-05,
"loss": 0.1248,
"step": 1180
},
{
"epoch": 0.35,
"grad_norm": 12.07309712013722,
"learning_rate": 1.9764139065182485e-05,
"loss": 0.1369,
"step": 1185
},
{
"epoch": 0.35,
"grad_norm": 5.0768838627507895,
"learning_rate": 1.976063552826206e-05,
"loss": 0.1426,
"step": 1190
},
{
"epoch": 0.35,
"grad_norm": 2.6195923645371915,
"learning_rate": 1.9757106476807156e-05,
"loss": 0.1414,
"step": 1195
},
{
"epoch": 0.35,
"grad_norm": 8.745582119219046,
"learning_rate": 1.975355192004374e-05,
"loss": 0.1537,
"step": 1200
},
{
"epoch": 0.35,
"grad_norm": 4.3619589308843425,
"learning_rate": 1.9749971867264468e-05,
"loss": 0.2326,
"step": 1205
},
{
"epoch": 0.35,
"grad_norm": 9.986127768195958,
"learning_rate": 1.9746366327828637e-05,
"loss": 0.0914,
"step": 1210
},
{
"epoch": 0.36,
"grad_norm": 11.679210640348971,
"learning_rate": 1.9742735311162177e-05,
"loss": 0.1284,
"step": 1215
},
{
"epoch": 0.36,
"grad_norm": 4.432897195420396,
"learning_rate": 1.973907882675763e-05,
"loss": 0.1578,
"step": 1220
},
{
"epoch": 0.36,
"grad_norm": 9.382375435412673,
"learning_rate": 1.973539688417411e-05,
"loss": 0.1637,
"step": 1225
},
{
"epoch": 0.36,
"grad_norm": 5.830067001664431,
"learning_rate": 1.973168949303729e-05,
"loss": 0.1468,
"step": 1230
},
{
"epoch": 0.36,
"grad_norm": 11.741616362731765,
"learning_rate": 1.9727956663039367e-05,
"loss": 0.1691,
"step": 1235
},
{
"epoch": 0.36,
"grad_norm": 7.39202415980895,
"learning_rate": 1.9724198403939053e-05,
"loss": 0.1809,
"step": 1240
},
{
"epoch": 0.37,
"grad_norm": 4.693742994361657,
"learning_rate": 1.9720414725561538e-05,
"loss": 0.0858,
"step": 1245
},
{
"epoch": 0.37,
"grad_norm": 8.617997574967658,
"learning_rate": 1.9716605637798452e-05,
"loss": 0.1308,
"step": 1250
},
{
"epoch": 0.37,
"grad_norm": 2.6119778380512626,
"learning_rate": 1.9712771150607865e-05,
"loss": 0.1205,
"step": 1255
},
{
"epoch": 0.37,
"grad_norm": 5.216314969786015,
"learning_rate": 1.9708911274014247e-05,
"loss": 0.1245,
"step": 1260
},
{
"epoch": 0.37,
"grad_norm": 5.528129775162225,
"learning_rate": 1.970502601810844e-05,
"loss": 0.1876,
"step": 1265
},
{
"epoch": 0.37,
"grad_norm": 5.524231849539429,
"learning_rate": 1.9701115393047636e-05,
"loss": 0.1143,
"step": 1270
},
{
"epoch": 0.37,
"grad_norm": 4.229434350974042,
"learning_rate": 1.969717940905535e-05,
"loss": 0.1273,
"step": 1275
},
{
"epoch": 0.38,
"grad_norm": 6.514210060225279,
"learning_rate": 1.9693218076421395e-05,
"loss": 0.1034,
"step": 1280
},
{
"epoch": 0.38,
"grad_norm": 14.7017638724717,
"learning_rate": 1.9689231405501844e-05,
"loss": 0.1529,
"step": 1285
},
{
"epoch": 0.38,
"grad_norm": 4.964386181339109,
"learning_rate": 1.968521940671903e-05,
"loss": 0.1567,
"step": 1290
},
{
"epoch": 0.38,
"grad_norm": 5.222923574403755,
"learning_rate": 1.9681182090561467e-05,
"loss": 0.1144,
"step": 1295
},
{
"epoch": 0.38,
"grad_norm": 5.523495340651447,
"learning_rate": 1.96771194675839e-05,
"loss": 0.1311,
"step": 1300
},
{
"epoch": 0.38,
"grad_norm": 6.064321028150695,
"learning_rate": 1.9673031548407197e-05,
"loss": 0.1282,
"step": 1305
},
{
"epoch": 0.38,
"grad_norm": 7.022609980188127,
"learning_rate": 1.9668918343718377e-05,
"loss": 0.1735,
"step": 1310
},
{
"epoch": 0.39,
"grad_norm": 5.386703878469319,
"learning_rate": 1.9664779864270553e-05,
"loss": 0.146,
"step": 1315
},
{
"epoch": 0.39,
"grad_norm": 2.2817281979534587,
"learning_rate": 1.966061612088292e-05,
"loss": 0.1566,
"step": 1320
},
{
"epoch": 0.39,
"grad_norm": 5.647148438986119,
"learning_rate": 1.965642712444072e-05,
"loss": 0.1508,
"step": 1325
},
{
"epoch": 0.39,
"grad_norm": 14.636180809770323,
"learning_rate": 1.965221288589521e-05,
"loss": 0.1366,
"step": 1330
},
{
"epoch": 0.39,
"grad_norm": 3.796916066642304,
"learning_rate": 1.9647973416263634e-05,
"loss": 0.1562,
"step": 1335
},
{
"epoch": 0.39,
"grad_norm": 8.39363234629941,
"learning_rate": 1.964370872662921e-05,
"loss": 0.0938,
"step": 1340
},
{
"epoch": 0.39,
"grad_norm": 6.500094779864121,
"learning_rate": 1.963941882814108e-05,
"loss": 0.1746,
"step": 1345
},
{
"epoch": 0.4,
"grad_norm": 3.5283001757400054,
"learning_rate": 1.963510373201428e-05,
"loss": 0.1419,
"step": 1350
},
{
"epoch": 0.4,
"grad_norm": 2.5908302511117327,
"learning_rate": 1.9630763449529747e-05,
"loss": 0.0663,
"step": 1355
},
{
"epoch": 0.4,
"grad_norm": 6.447702267666421,
"learning_rate": 1.962639799203423e-05,
"loss": 0.1506,
"step": 1360
},
{
"epoch": 0.4,
"grad_norm": 7.258886744369148,
"learning_rate": 1.962200737094032e-05,
"loss": 0.1705,
"step": 1365
},
{
"epoch": 0.4,
"grad_norm": 8.355297998196825,
"learning_rate": 1.9617591597726372e-05,
"loss": 0.2185,
"step": 1370
},
{
"epoch": 0.4,
"grad_norm": 11.059705823190226,
"learning_rate": 1.9613150683936513e-05,
"loss": 0.164,
"step": 1375
},
{
"epoch": 0.4,
"grad_norm": 7.496281019677461,
"learning_rate": 1.9608684641180584e-05,
"loss": 0.1868,
"step": 1380
},
{
"epoch": 0.41,
"grad_norm": 9.337392423896086,
"learning_rate": 1.9604193481134123e-05,
"loss": 0.128,
"step": 1385
},
{
"epoch": 0.41,
"grad_norm": 6.741067698323382,
"learning_rate": 1.9599677215538333e-05,
"loss": 0.1304,
"step": 1390
},
{
"epoch": 0.41,
"grad_norm": 5.97327894379059,
"learning_rate": 1.959513585620005e-05,
"loss": 0.1129,
"step": 1395
},
{
"epoch": 0.41,
"grad_norm": 6.2990643124323,
"learning_rate": 1.9590569414991718e-05,
"loss": 0.2452,
"step": 1400
},
{
"epoch": 0.41,
"grad_norm": 3.9874003574274552,
"learning_rate": 1.9585977903851334e-05,
"loss": 0.1288,
"step": 1405
},
{
"epoch": 0.41,
"grad_norm": 10.71406937880321,
"learning_rate": 1.9581361334782453e-05,
"loss": 0.1682,
"step": 1410
},
{
"epoch": 0.42,
"grad_norm": 3.1321367786011076,
"learning_rate": 1.957671971985414e-05,
"loss": 0.1461,
"step": 1415
},
{
"epoch": 0.42,
"grad_norm": 6.601166084621551,
"learning_rate": 1.9572053071200922e-05,
"loss": 0.1642,
"step": 1420
},
{
"epoch": 0.42,
"grad_norm": 6.815298083273188,
"learning_rate": 1.9567361401022784e-05,
"loss": 0.2203,
"step": 1425
},
{
"epoch": 0.42,
"grad_norm": 8.60817902664583,
"learning_rate": 1.9562644721585123e-05,
"loss": 0.1246,
"step": 1430
},
{
"epoch": 0.42,
"grad_norm": 3.607077127205811,
"learning_rate": 1.9557903045218708e-05,
"loss": 0.0977,
"step": 1435
},
{
"epoch": 0.42,
"grad_norm": 5.81146788919113,
"learning_rate": 1.955313638431967e-05,
"loss": 0.1038,
"step": 1440
},
{
"epoch": 0.42,
"grad_norm": 8.172735748764952,
"learning_rate": 1.954834475134945e-05,
"loss": 0.1653,
"step": 1445
},
{
"epoch": 0.43,
"grad_norm": 6.240754575344053,
"learning_rate": 1.9543528158834775e-05,
"loss": 0.1734,
"step": 1450
},
{
"epoch": 0.43,
"grad_norm": 5.739038717343691,
"learning_rate": 1.953868661936762e-05,
"loss": 0.1477,
"step": 1455
},
{
"epoch": 0.43,
"grad_norm": 3.09005933315755,
"learning_rate": 1.9533820145605184e-05,
"loss": 0.1303,
"step": 1460
},
{
"epoch": 0.43,
"grad_norm": 9.269129587273234,
"learning_rate": 1.9528928750269847e-05,
"loss": 0.1188,
"step": 1465
},
{
"epoch": 0.43,
"grad_norm": 3.9756210801612446,
"learning_rate": 1.9524012446149144e-05,
"loss": 0.1011,
"step": 1470
},
{
"epoch": 0.43,
"grad_norm": 6.014169492139025,
"learning_rate": 1.9519071246095734e-05,
"loss": 0.1843,
"step": 1475
},
{
"epoch": 0.43,
"grad_norm": 6.42414136559558,
"learning_rate": 1.951410516302735e-05,
"loss": 0.1429,
"step": 1480
},
{
"epoch": 0.44,
"grad_norm": 5.990872434024276,
"learning_rate": 1.950911420992678e-05,
"loss": 0.0871,
"step": 1485
},
{
"epoch": 0.44,
"grad_norm": 3.5783586051011795,
"learning_rate": 1.9504098399841835e-05,
"loss": 0.1602,
"step": 1490
},
{
"epoch": 0.44,
"grad_norm": 6.318287528571422,
"learning_rate": 1.9499057745885308e-05,
"loss": 0.134,
"step": 1495
},
{
"epoch": 0.44,
"grad_norm": 12.51258277827576,
"learning_rate": 1.949399226123493e-05,
"loss": 0.1537,
"step": 1500
},
{
"epoch": 0.44,
"grad_norm": 3.886808062730125,
"learning_rate": 1.9488901959133365e-05,
"loss": 0.1997,
"step": 1505
},
{
"epoch": 0.44,
"grad_norm": 2.309686134450491,
"learning_rate": 1.9483786852888144e-05,
"loss": 0.1105,
"step": 1510
},
{
"epoch": 0.44,
"grad_norm": 4.355694711066838,
"learning_rate": 1.947864695587165e-05,
"loss": 0.1538,
"step": 1515
},
{
"epoch": 0.45,
"grad_norm": 2.986939771149088,
"learning_rate": 1.9473482281521063e-05,
"loss": 0.0769,
"step": 1520
},
{
"epoch": 0.45,
"grad_norm": 2.8973623579771397,
"learning_rate": 1.946829284333836e-05,
"loss": 0.0987,
"step": 1525
},
{
"epoch": 0.45,
"grad_norm": 2.0533675098464874,
"learning_rate": 1.9463078654890242e-05,
"loss": 0.1066,
"step": 1530
},
{
"epoch": 0.45,
"grad_norm": 7.8889622513250774,
"learning_rate": 1.945783972980812e-05,
"loss": 0.0989,
"step": 1535
},
{
"epoch": 0.45,
"grad_norm": 1.470844560891904,
"learning_rate": 1.945257608178807e-05,
"loss": 0.0536,
"step": 1540
},
{
"epoch": 0.45,
"grad_norm": 3.097540950673788,
"learning_rate": 1.9447287724590808e-05,
"loss": 0.1487,
"step": 1545
},
{
"epoch": 0.45,
"grad_norm": 2.236294115632501,
"learning_rate": 1.9441974672041636e-05,
"loss": 0.1272,
"step": 1550
},
{
"epoch": 0.46,
"grad_norm": 6.08682373802408,
"learning_rate": 1.943663693803043e-05,
"loss": 0.1234,
"step": 1555
},
{
"epoch": 0.46,
"grad_norm": 2.659517679298076,
"learning_rate": 1.9431274536511577e-05,
"loss": 0.107,
"step": 1560
},
{
"epoch": 0.46,
"grad_norm": 4.176352623298332,
"learning_rate": 1.9425887481503964e-05,
"loss": 0.1275,
"step": 1565
},
{
"epoch": 0.46,
"grad_norm": 3.247258717513404,
"learning_rate": 1.9420475787090926e-05,
"loss": 0.1282,
"step": 1570
},
{
"epoch": 0.46,
"grad_norm": 3.345767544439658,
"learning_rate": 1.9415039467420207e-05,
"loss": 0.0917,
"step": 1575
},
{
"epoch": 0.46,
"grad_norm": 2.368596730771716,
"learning_rate": 1.9409578536703936e-05,
"loss": 0.1262,
"step": 1580
},
{
"epoch": 0.46,
"grad_norm": 12.620428787419241,
"learning_rate": 1.9404093009218568e-05,
"loss": 0.1687,
"step": 1585
},
{
"epoch": 0.47,
"grad_norm": 2.3476402040970985,
"learning_rate": 1.939858289930489e-05,
"loss": 0.0849,
"step": 1590
},
{
"epoch": 0.47,
"grad_norm": 9.157447293999722,
"learning_rate": 1.9393048221367924e-05,
"loss": 0.1322,
"step": 1595
},
{
"epoch": 0.47,
"grad_norm": 10.047629795227984,
"learning_rate": 1.9387488989876937e-05,
"loss": 0.1215,
"step": 1600
},
{
"epoch": 0.47,
"grad_norm": 4.324110021146038,
"learning_rate": 1.938190521936538e-05,
"loss": 0.1228,
"step": 1605
},
{
"epoch": 0.47,
"grad_norm": 2.520014942669844,
"learning_rate": 1.937629692443086e-05,
"loss": 0.1538,
"step": 1610
},
{
"epoch": 0.47,
"grad_norm": 5.789832430771725,
"learning_rate": 1.9370664119735096e-05,
"loss": 0.1508,
"step": 1615
},
{
"epoch": 0.48,
"grad_norm": 3.248881220822288,
"learning_rate": 1.9365006820003883e-05,
"loss": 0.1051,
"step": 1620
},
{
"epoch": 0.48,
"grad_norm": 3.1423550675382623,
"learning_rate": 1.935932504002705e-05,
"loss": 0.0786,
"step": 1625
},
{
"epoch": 0.48,
"grad_norm": 4.9862187243188645,
"learning_rate": 1.935361879465843e-05,
"loss": 0.089,
"step": 1630
},
{
"epoch": 0.48,
"grad_norm": 1.9635393731249728,
"learning_rate": 1.9347888098815814e-05,
"loss": 0.0699,
"step": 1635
},
{
"epoch": 0.48,
"grad_norm": 2.1963931644611914,
"learning_rate": 1.9342132967480914e-05,
"loss": 0.1087,
"step": 1640
},
{
"epoch": 0.48,
"grad_norm": 2.689006294580755,
"learning_rate": 1.9336353415699316e-05,
"loss": 0.0622,
"step": 1645
},
{
"epoch": 0.48,
"grad_norm": 2.1154762474968005,
"learning_rate": 1.933054945858046e-05,
"loss": 0.0961,
"step": 1650
},
{
"epoch": 0.49,
"grad_norm": 6.7523638506838095,
"learning_rate": 1.932472111129758e-05,
"loss": 0.1685,
"step": 1655
},
{
"epoch": 0.49,
"grad_norm": 1.5984988800226272,
"learning_rate": 1.931886838908768e-05,
"loss": 0.1272,
"step": 1660
},
{
"epoch": 0.49,
"grad_norm": 4.090521085044347,
"learning_rate": 1.9312991307251476e-05,
"loss": 0.0738,
"step": 1665
},
{
"epoch": 0.49,
"grad_norm": 3.0964168267397385,
"learning_rate": 1.9307089881153383e-05,
"loss": 0.095,
"step": 1670
},
{
"epoch": 0.49,
"grad_norm": 1.7867271806779057,
"learning_rate": 1.9301164126221444e-05,
"loss": 0.1104,
"step": 1675
},
{
"epoch": 0.49,
"grad_norm": 2.565341397593321,
"learning_rate": 1.929521405794732e-05,
"loss": 0.1075,
"step": 1680
},
{
"epoch": 0.49,
"grad_norm": 0.7933305187602832,
"learning_rate": 1.9289239691886213e-05,
"loss": 0.0703,
"step": 1685
},
{
"epoch": 0.5,
"grad_norm": 2.7412621656740184,
"learning_rate": 1.9283241043656865e-05,
"loss": 0.1091,
"step": 1690
},
{
"epoch": 0.5,
"grad_norm": 3.6919637269194117,
"learning_rate": 1.9277218128941493e-05,
"loss": 0.0399,
"step": 1695
},
{
"epoch": 0.5,
"grad_norm": 3.3309770998366575,
"learning_rate": 1.927117096348575e-05,
"loss": 0.1115,
"step": 1700
},
{
"epoch": 0.5,
"grad_norm": 3.120277651390199,
"learning_rate": 1.9265099563098698e-05,
"loss": 0.1292,
"step": 1705
},
{
"epoch": 0.5,
"grad_norm": 3.567470496202032,
"learning_rate": 1.9259003943652743e-05,
"loss": 0.1023,
"step": 1710
},
{
"epoch": 0.5,
"grad_norm": 3.1015901732471365,
"learning_rate": 1.9252884121083613e-05,
"loss": 0.098,
"step": 1715
},
{
"epoch": 0.5,
"grad_norm": 3.3435838604589216,
"learning_rate": 1.924674011139031e-05,
"loss": 0.1034,
"step": 1720
},
{
"epoch": 0.51,
"grad_norm": 2.751303537238706,
"learning_rate": 1.924057193063507e-05,
"loss": 0.0751,
"step": 1725
},
{
"epoch": 0.51,
"grad_norm": 2.8435568272302034,
"learning_rate": 1.923437959494331e-05,
"loss": 0.0938,
"step": 1730
},
{
"epoch": 0.51,
"grad_norm": 3.491687713292557,
"learning_rate": 1.9228163120503612e-05,
"loss": 0.126,
"step": 1735
},
{
"epoch": 0.51,
"grad_norm": 2.065971227764955,
"learning_rate": 1.9221922523567643e-05,
"loss": 0.0992,
"step": 1740
},
{
"epoch": 0.51,
"grad_norm": 2.291836460643561,
"learning_rate": 1.9215657820450152e-05,
"loss": 0.1169,
"step": 1745
},
{
"epoch": 0.51,
"grad_norm": 6.805536473392368,
"learning_rate": 1.92093690275289e-05,
"loss": 0.1301,
"step": 1750
},
{
"epoch": 0.51,
"grad_norm": 2.1824217570154456,
"learning_rate": 1.920305616124462e-05,
"loss": 0.1125,
"step": 1755
},
{
"epoch": 0.52,
"grad_norm": 14.726348017337305,
"learning_rate": 1.9196719238100993e-05,
"loss": 0.1292,
"step": 1760
},
{
"epoch": 0.52,
"grad_norm": 4.930978645236691,
"learning_rate": 1.9190358274664586e-05,
"loss": 0.1418,
"step": 1765
},
{
"epoch": 0.52,
"grad_norm": 1.5342337673472881,
"learning_rate": 1.9183973287564806e-05,
"loss": 0.139,
"step": 1770
},
{
"epoch": 0.52,
"grad_norm": 2.427844265256046,
"learning_rate": 1.9177564293493876e-05,
"loss": 0.083,
"step": 1775
},
{
"epoch": 0.52,
"grad_norm": 2.1313388749998086,
"learning_rate": 1.9171131309206777e-05,
"loss": 0.1207,
"step": 1780
},
{
"epoch": 0.52,
"grad_norm": 3.126693266841522,
"learning_rate": 1.9164674351521203e-05,
"loss": 0.0963,
"step": 1785
},
{
"epoch": 0.53,
"grad_norm": 3.175825693492831,
"learning_rate": 1.9158193437317527e-05,
"loss": 0.1776,
"step": 1790
},
{
"epoch": 0.53,
"grad_norm": 2.9113721112544906,
"learning_rate": 1.9151688583538753e-05,
"loss": 0.1217,
"step": 1795
},
{
"epoch": 0.53,
"grad_norm": 3.1166239064049113,
"learning_rate": 1.9145159807190458e-05,
"loss": 0.0507,
"step": 1800
},
{
"epoch": 0.53,
"grad_norm": 8.142997644880616,
"learning_rate": 1.9138607125340777e-05,
"loss": 0.1586,
"step": 1805
},
{
"epoch": 0.53,
"grad_norm": 1.8022906748245269,
"learning_rate": 1.913203055512033e-05,
"loss": 0.1093,
"step": 1810
},
{
"epoch": 0.53,
"grad_norm": 1.8417345443000441,
"learning_rate": 1.9125430113722186e-05,
"loss": 0.1023,
"step": 1815
},
{
"epoch": 0.53,
"grad_norm": 3.399632737014696,
"learning_rate": 1.9118805818401825e-05,
"loss": 0.1,
"step": 1820
},
{
"epoch": 0.54,
"grad_norm": 2.709759019082288,
"learning_rate": 1.9112157686477092e-05,
"loss": 0.108,
"step": 1825
},
{
"epoch": 0.54,
"grad_norm": 2.8287642794394894,
"learning_rate": 1.910548573532814e-05,
"loss": 0.1119,
"step": 1830
},
{
"epoch": 0.54,
"grad_norm": 0.9463293108337878,
"learning_rate": 1.90987899823974e-05,
"loss": 0.106,
"step": 1835
},
{
"epoch": 0.54,
"grad_norm": 4.02788077621015,
"learning_rate": 1.9092070445189513e-05,
"loss": 0.1223,
"step": 1840
},
{
"epoch": 0.54,
"grad_norm": 6.586862447096695,
"learning_rate": 1.9085327141271325e-05,
"loss": 0.1612,
"step": 1845
},
{
"epoch": 0.54,
"grad_norm": 6.365460607459384,
"learning_rate": 1.907856008827178e-05,
"loss": 0.1469,
"step": 1850
},
{
"epoch": 0.54,
"grad_norm": 5.186222516601001,
"learning_rate": 1.907176930388195e-05,
"loss": 0.1176,
"step": 1855
},
{
"epoch": 0.55,
"grad_norm": 0.10245286406524089,
"learning_rate": 1.906495480585491e-05,
"loss": 0.0928,
"step": 1860
},
{
"epoch": 0.55,
"grad_norm": 2.469663523408353,
"learning_rate": 1.9058116612005757e-05,
"loss": 0.095,
"step": 1865
},
{
"epoch": 0.55,
"grad_norm": 2.7503837837835223,
"learning_rate": 1.905125474021152e-05,
"loss": 0.1103,
"step": 1870
},
{
"epoch": 0.55,
"grad_norm": 3.6629482112205047,
"learning_rate": 1.9044369208411127e-05,
"loss": 0.0769,
"step": 1875
},
{
"epoch": 0.55,
"grad_norm": 3.6257303751767043,
"learning_rate": 1.903746003460538e-05,
"loss": 0.1188,
"step": 1880
},
{
"epoch": 0.55,
"grad_norm": 3.1074798789565987,
"learning_rate": 1.9030527236856867e-05,
"loss": 0.0771,
"step": 1885
},
{
"epoch": 0.55,
"grad_norm": 1.9930789523758066,
"learning_rate": 1.9023570833289946e-05,
"loss": 0.1227,
"step": 1890
},
{
"epoch": 0.56,
"grad_norm": 5.3774295018042775,
"learning_rate": 1.9016590842090682e-05,
"loss": 0.1089,
"step": 1895
},
{
"epoch": 0.56,
"grad_norm": 3.988587549835179,
"learning_rate": 1.9009587281506815e-05,
"loss": 0.1095,
"step": 1900
},
{
"epoch": 0.56,
"grad_norm": 2.4025900613693385,
"learning_rate": 1.9002560169847688e-05,
"loss": 0.0744,
"step": 1905
},
{
"epoch": 0.56,
"grad_norm": 4.962479191119096,
"learning_rate": 1.8995509525484227e-05,
"loss": 0.1113,
"step": 1910
},
{
"epoch": 0.56,
"grad_norm": 3.222489638212013,
"learning_rate": 1.8988435366848867e-05,
"loss": 0.1122,
"step": 1915
},
{
"epoch": 0.56,
"grad_norm": 6.910634577936775,
"learning_rate": 1.8981337712435528e-05,
"loss": 0.1357,
"step": 1920
},
{
"epoch": 0.56,
"grad_norm": 2.770074917437753,
"learning_rate": 1.897421658079955e-05,
"loss": 0.1246,
"step": 1925
},
{
"epoch": 0.57,
"grad_norm": 1.9251395311195432,
"learning_rate": 1.8967071990557643e-05,
"loss": 0.1159,
"step": 1930
},
{
"epoch": 0.57,
"grad_norm": 5.562401417062251,
"learning_rate": 1.8959903960387852e-05,
"loss": 0.0945,
"step": 1935
},
{
"epoch": 0.57,
"grad_norm": 2.5785513785382435,
"learning_rate": 1.89527125090295e-05,
"loss": 0.1166,
"step": 1940
},
{
"epoch": 0.57,
"grad_norm": 2.3414558881424488,
"learning_rate": 1.8945497655283142e-05,
"loss": 0.1071,
"step": 1945
},
{
"epoch": 0.57,
"grad_norm": 3.8387053345665754,
"learning_rate": 1.8938259418010504e-05,
"loss": 0.078,
"step": 1950
},
{
"epoch": 0.57,
"grad_norm": 2.503163339080756,
"learning_rate": 1.8930997816134457e-05,
"loss": 0.1155,
"step": 1955
},
{
"epoch": 0.57,
"grad_norm": 3.006961850846919,
"learning_rate": 1.892371286863894e-05,
"loss": 0.1433,
"step": 1960
},
{
"epoch": 0.58,
"grad_norm": 2.509118046318532,
"learning_rate": 1.8916404594568934e-05,
"loss": 0.0889,
"step": 1965
},
{
"epoch": 0.58,
"grad_norm": 2.061207568655511,
"learning_rate": 1.8909073013030404e-05,
"loss": 0.1235,
"step": 1970
},
{
"epoch": 0.58,
"grad_norm": 2.1278253733682098,
"learning_rate": 1.8901718143190234e-05,
"loss": 0.0903,
"step": 1975
},
{
"epoch": 0.58,
"grad_norm": 2.546231615677142,
"learning_rate": 1.8894340004276208e-05,
"loss": 0.0992,
"step": 1980
},
{
"epoch": 0.58,
"grad_norm": 5.884018165275079,
"learning_rate": 1.8886938615576926e-05,
"loss": 0.1213,
"step": 1985
},
{
"epoch": 0.58,
"grad_norm": 3.048907575238015,
"learning_rate": 1.887951399644178e-05,
"loss": 0.076,
"step": 1990
},
{
"epoch": 0.59,
"grad_norm": 3.326277815721477,
"learning_rate": 1.8872066166280898e-05,
"loss": 0.097,
"step": 1995
},
{
"epoch": 0.59,
"grad_norm": 3.033210042047812,
"learning_rate": 1.8864595144565067e-05,
"loss": 0.1589,
"step": 2000
},
{
"epoch": 0.59,
"grad_norm": 3.2824483953691823,
"learning_rate": 1.8857100950825725e-05,
"loss": 0.1037,
"step": 2005
},
{
"epoch": 0.59,
"grad_norm": 3.4944060444975196,
"learning_rate": 1.8849583604654883e-05,
"loss": 0.1102,
"step": 2010
},
{
"epoch": 0.59,
"grad_norm": 3.5842552477679854,
"learning_rate": 1.8842043125705074e-05,
"loss": 0.0704,
"step": 2015
},
{
"epoch": 0.59,
"grad_norm": 2.5453422006053277,
"learning_rate": 1.883447953368931e-05,
"loss": 0.0902,
"step": 2020
},
{
"epoch": 0.59,
"grad_norm": 2.05893040027048,
"learning_rate": 1.8826892848381026e-05,
"loss": 0.1236,
"step": 2025
},
{
"epoch": 0.6,
"grad_norm": 2.5436602939843653,
"learning_rate": 1.881928308961403e-05,
"loss": 0.1127,
"step": 2030
},
{
"epoch": 0.6,
"grad_norm": 5.148596800219499,
"learning_rate": 1.8811650277282457e-05,
"loss": 0.1554,
"step": 2035
},
{
"epoch": 0.6,
"grad_norm": 3.4114441072977875,
"learning_rate": 1.88039944313407e-05,
"loss": 0.1361,
"step": 2040
},
{
"epoch": 0.6,
"grad_norm": 2.8127938011148865,
"learning_rate": 1.8796315571803373e-05,
"loss": 0.0995,
"step": 2045
},
{
"epoch": 0.6,
"grad_norm": 3.9625421467654545,
"learning_rate": 1.8788613718745258e-05,
"loss": 0.1007,
"step": 2050
},
{
"epoch": 0.6,
"grad_norm": 1.7804274416555144,
"learning_rate": 1.8780888892301246e-05,
"loss": 0.0831,
"step": 2055
},
{
"epoch": 0.6,
"grad_norm": 3.0212657925844457,
"learning_rate": 1.8773141112666282e-05,
"loss": 0.0983,
"step": 2060
},
{
"epoch": 0.61,
"grad_norm": 2.1666245700952618,
"learning_rate": 1.876537040009533e-05,
"loss": 0.1188,
"step": 2065
},
{
"epoch": 0.61,
"grad_norm": 1.3595918504356805,
"learning_rate": 1.8757576774903293e-05,
"loss": 0.0847,
"step": 2070
},
{
"epoch": 0.61,
"grad_norm": 3.125961613522519,
"learning_rate": 1.8749760257464987e-05,
"loss": 0.1239,
"step": 2075
},
{
"epoch": 0.61,
"grad_norm": 3.7003035601248655,
"learning_rate": 1.874192086821506e-05,
"loss": 0.1409,
"step": 2080
},
{
"epoch": 0.61,
"grad_norm": 2.379395082357442,
"learning_rate": 1.8734058627647974e-05,
"loss": 0.0724,
"step": 2085
},
{
"epoch": 0.61,
"grad_norm": 6.950334653412999,
"learning_rate": 1.872617355631791e-05,
"loss": 0.1478,
"step": 2090
},
{
"epoch": 0.61,
"grad_norm": 3.242450014080562,
"learning_rate": 1.871826567483875e-05,
"loss": 0.099,
"step": 2095
},
{
"epoch": 0.62,
"grad_norm": 3.6369097830549175,
"learning_rate": 1.8710335003884e-05,
"loss": 0.0874,
"step": 2100
},
{
"epoch": 0.62,
"grad_norm": 2.3740836935740575,
"learning_rate": 1.8702381564186752e-05,
"loss": 0.1088,
"step": 2105
},
{
"epoch": 0.62,
"grad_norm": 4.29882368809403,
"learning_rate": 1.8694405376539612e-05,
"loss": 0.1358,
"step": 2110
},
{
"epoch": 0.62,
"grad_norm": 1.233884641572376,
"learning_rate": 1.8686406461794663e-05,
"loss": 0.0848,
"step": 2115
},
{
"epoch": 0.62,
"grad_norm": 5.10093491156728,
"learning_rate": 1.86783848408634e-05,
"loss": 0.1664,
"step": 2120
},
{
"epoch": 0.62,
"grad_norm": 2.6372062628013366,
"learning_rate": 1.867034053471669e-05,
"loss": 0.0864,
"step": 2125
},
{
"epoch": 0.62,
"grad_norm": 1.9131502429850842,
"learning_rate": 1.8662273564384685e-05,
"loss": 0.0712,
"step": 2130
},
{
"epoch": 0.63,
"grad_norm": 2.2716382434584665,
"learning_rate": 1.8654183950956807e-05,
"loss": 0.1098,
"step": 2135
},
{
"epoch": 0.63,
"grad_norm": 2.980703122184445,
"learning_rate": 1.864607171558166e-05,
"loss": 0.1524,
"step": 2140
},
{
"epoch": 0.63,
"grad_norm": 2.6693066304605497,
"learning_rate": 1.863793687946699e-05,
"loss": 0.1263,
"step": 2145
},
{
"epoch": 0.63,
"grad_norm": 1.8713460923305671,
"learning_rate": 1.862977946387964e-05,
"loss": 0.1043,
"step": 2150
},
{
"epoch": 0.63,
"grad_norm": 2.173402756633364,
"learning_rate": 1.862159949014547e-05,
"loss": 0.1268,
"step": 2155
},
{
"epoch": 0.63,
"grad_norm": 2.1115579118349936,
"learning_rate": 1.861339697964932e-05,
"loss": 0.0871,
"step": 2160
},
{
"epoch": 0.64,
"grad_norm": 0.4714286720726806,
"learning_rate": 1.860517195383495e-05,
"loss": 0.1029,
"step": 2165
},
{
"epoch": 0.64,
"grad_norm": 2.133585471909289,
"learning_rate": 1.8596924434204963e-05,
"loss": 0.0858,
"step": 2170
},
{
"epoch": 0.64,
"grad_norm": 1.7496615491285803,
"learning_rate": 1.8588654442320796e-05,
"loss": 0.1081,
"step": 2175
},
{
"epoch": 0.64,
"grad_norm": 4.814425987182643,
"learning_rate": 1.8580361999802606e-05,
"loss": 0.1179,
"step": 2180
},
{
"epoch": 0.64,
"grad_norm": 1.9816374631526001,
"learning_rate": 1.8572047128329272e-05,
"loss": 0.1062,
"step": 2185
},
{
"epoch": 0.64,
"grad_norm": 6.646969625340206,
"learning_rate": 1.8563709849638286e-05,
"loss": 0.1477,
"step": 2190
},
{
"epoch": 0.64,
"grad_norm": 2.374535766203674,
"learning_rate": 1.8555350185525723e-05,
"loss": 0.1142,
"step": 2195
},
{
"epoch": 0.65,
"grad_norm": 3.365883417577783,
"learning_rate": 1.8546968157846195e-05,
"loss": 0.124,
"step": 2200
},
{
"epoch": 0.65,
"grad_norm": 2.119464733459411,
"learning_rate": 1.8538563788512757e-05,
"loss": 0.0861,
"step": 2205
},
{
"epoch": 0.65,
"grad_norm": 3.4982979586919245,
"learning_rate": 1.8530137099496886e-05,
"loss": 0.1153,
"step": 2210
},
{
"epoch": 0.65,
"grad_norm": 2.0143848714607326,
"learning_rate": 1.852168811282841e-05,
"loss": 0.0957,
"step": 2215
},
{
"epoch": 0.65,
"grad_norm": 5.713010148322335,
"learning_rate": 1.8513216850595434e-05,
"loss": 0.106,
"step": 2220
},
{
"epoch": 0.65,
"grad_norm": 3.66380234305124,
"learning_rate": 1.850472333494432e-05,
"loss": 0.1011,
"step": 2225
},
{
"epoch": 0.65,
"grad_norm": 2.082824212489925,
"learning_rate": 1.849620758807959e-05,
"loss": 0.1106,
"step": 2230
},
{
"epoch": 0.66,
"grad_norm": 3.1172243656282594,
"learning_rate": 1.8487669632263892e-05,
"loss": 0.1099,
"step": 2235
},
{
"epoch": 0.66,
"grad_norm": 3.8750958584649853,
"learning_rate": 1.8479109489817935e-05,
"loss": 0.0927,
"step": 2240
},
{
"epoch": 0.66,
"grad_norm": 2.658381425986548,
"learning_rate": 1.8470527183120425e-05,
"loss": 0.0768,
"step": 2245
},
{
"epoch": 0.66,
"grad_norm": 4.454354742139791,
"learning_rate": 1.8461922734608016e-05,
"loss": 0.0906,
"step": 2250
},
{
"epoch": 0.66,
"grad_norm": 3.7643868717903866,
"learning_rate": 1.845329616677525e-05,
"loss": 0.0937,
"step": 2255
},
{
"epoch": 0.66,
"grad_norm": 3.680014043034835,
"learning_rate": 1.8444647502174492e-05,
"loss": 0.087,
"step": 2260
},
{
"epoch": 0.66,
"grad_norm": 3.3837653881747585,
"learning_rate": 1.843597676341587e-05,
"loss": 0.0916,
"step": 2265
},
{
"epoch": 0.67,
"grad_norm": 2.121221324252522,
"learning_rate": 1.8427283973167225e-05,
"loss": 0.1221,
"step": 2270
},
{
"epoch": 0.67,
"grad_norm": 1.0325272482169887,
"learning_rate": 1.841856915415405e-05,
"loss": 0.0874,
"step": 2275
},
{
"epoch": 0.67,
"grad_norm": 2.0795962066445566,
"learning_rate": 1.840983232915942e-05,
"loss": 0.0741,
"step": 2280
},
{
"epoch": 0.67,
"grad_norm": 4.460588805671975,
"learning_rate": 1.840107352102395e-05,
"loss": 0.1488,
"step": 2285
},
{
"epoch": 0.67,
"grad_norm": 2.974661793437929,
"learning_rate": 1.839229275264572e-05,
"loss": 0.093,
"step": 2290
},
{
"epoch": 0.67,
"grad_norm": 3.137347736691432,
"learning_rate": 1.8383490046980212e-05,
"loss": 0.1,
"step": 2295
},
{
"epoch": 0.67,
"grad_norm": 7.90074357987843,
"learning_rate": 1.8374665427040276e-05,
"loss": 0.1362,
"step": 2300
},
{
"epoch": 0.68,
"grad_norm": 2.4540688223728826,
"learning_rate": 1.836581891589604e-05,
"loss": 0.1124,
"step": 2305
},
{
"epoch": 0.68,
"grad_norm": 4.638147996733137,
"learning_rate": 1.8356950536674858e-05,
"loss": 0.1031,
"step": 2310
},
{
"epoch": 0.68,
"grad_norm": 2.2696169707348544,
"learning_rate": 1.834806031256127e-05,
"loss": 0.0965,
"step": 2315
},
{
"epoch": 0.68,
"grad_norm": 6.130234330693954,
"learning_rate": 1.833914826679691e-05,
"loss": 0.079,
"step": 2320
},
{
"epoch": 0.68,
"grad_norm": 1.8781015810402948,
"learning_rate": 1.8330214422680467e-05,
"loss": 0.0791,
"step": 2325
},
{
"epoch": 0.68,
"grad_norm": 1.4069832479622444,
"learning_rate": 1.8321258803567613e-05,
"loss": 0.0831,
"step": 2330
},
{
"epoch": 0.68,
"grad_norm": 3.4630544831332664,
"learning_rate": 1.831228143287096e-05,
"loss": 0.1616,
"step": 2335
},
{
"epoch": 0.69,
"grad_norm": 2.9442217600685137,
"learning_rate": 1.8303282334059957e-05,
"loss": 0.1199,
"step": 2340
},
{
"epoch": 0.69,
"grad_norm": 2.0954935585893257,
"learning_rate": 1.8294261530660885e-05,
"loss": 0.1302,
"step": 2345
},
{
"epoch": 0.69,
"grad_norm": 4.890904817711451,
"learning_rate": 1.8285219046256758e-05,
"loss": 0.1025,
"step": 2350
},
{
"epoch": 0.69,
"grad_norm": 2.583755171594856,
"learning_rate": 1.8276154904487264e-05,
"loss": 0.1043,
"step": 2355
},
{
"epoch": 0.69,
"grad_norm": 8.260345907657872,
"learning_rate": 1.8267069129048707e-05,
"loss": 0.1782,
"step": 2360
},
{
"epoch": 0.69,
"grad_norm": 4.771965836395477,
"learning_rate": 1.8257961743693962e-05,
"loss": 0.0862,
"step": 2365
},
{
"epoch": 0.7,
"grad_norm": 2.554197211858201,
"learning_rate": 1.8248832772232394e-05,
"loss": 0.0851,
"step": 2370
},
{
"epoch": 0.7,
"grad_norm": 2.16812655587689,
"learning_rate": 1.8239682238529792e-05,
"loss": 0.0938,
"step": 2375
},
{
"epoch": 0.7,
"grad_norm": 3.3012819170031853,
"learning_rate": 1.8230510166508322e-05,
"loss": 0.0769,
"step": 2380
},
{
"epoch": 0.7,
"grad_norm": 1.998340872187495,
"learning_rate": 1.822131658014646e-05,
"loss": 0.0735,
"step": 2385
},
{
"epoch": 0.7,
"grad_norm": 1.9806358974129312,
"learning_rate": 1.8212101503478916e-05,
"loss": 0.14,
"step": 2390
},
{
"epoch": 0.7,
"grad_norm": 6.99507540091613,
"learning_rate": 1.8202864960596592e-05,
"loss": 0.0944,
"step": 2395
},
{
"epoch": 0.7,
"grad_norm": 2.4015686273341545,
"learning_rate": 1.8193606975646506e-05,
"loss": 0.0677,
"step": 2400
},
{
"epoch": 0.71,
"grad_norm": 3.3364791966330962,
"learning_rate": 1.8184327572831738e-05,
"loss": 0.0829,
"step": 2405
},
{
"epoch": 0.71,
"grad_norm": 6.874850462240894,
"learning_rate": 1.817502677641134e-05,
"loss": 0.1419,
"step": 2410
},
{
"epoch": 0.71,
"grad_norm": 2.6075283986023665,
"learning_rate": 1.8165704610700315e-05,
"loss": 0.1117,
"step": 2415
},
{
"epoch": 0.71,
"grad_norm": 2.148518397802211,
"learning_rate": 1.8156361100069524e-05,
"loss": 0.101,
"step": 2420
},
{
"epoch": 0.71,
"grad_norm": 3.4560468231137564,
"learning_rate": 1.8146996268945632e-05,
"loss": 0.0966,
"step": 2425
},
{
"epoch": 0.71,
"grad_norm": 2.384974065195782,
"learning_rate": 1.8137610141811037e-05,
"loss": 0.122,
"step": 2430
},
{
"epoch": 0.71,
"grad_norm": 2.362869907867881,
"learning_rate": 1.812820274320381e-05,
"loss": 0.1132,
"step": 2435
},
{
"epoch": 0.72,
"grad_norm": 6.9320506095293455,
"learning_rate": 1.811877409771764e-05,
"loss": 0.1524,
"step": 2440
},
{
"epoch": 0.72,
"grad_norm": 4.720326949611697,
"learning_rate": 1.8109324230001756e-05,
"loss": 0.1301,
"step": 2445
},
{
"epoch": 0.72,
"grad_norm": 2.1399702198416413,
"learning_rate": 1.8099853164760865e-05,
"loss": 0.0889,
"step": 2450
},
{
"epoch": 0.72,
"grad_norm": 6.7467640785050325,
"learning_rate": 1.80903609267551e-05,
"loss": 0.099,
"step": 2455
},
{
"epoch": 0.72,
"grad_norm": 2.325513463081622,
"learning_rate": 1.8080847540799942e-05,
"loss": 0.1064,
"step": 2460
},
{
"epoch": 0.72,
"grad_norm": 2.177610658971863,
"learning_rate": 1.8071313031766148e-05,
"loss": 0.0658,
"step": 2465
},
{
"epoch": 0.72,
"grad_norm": 4.977562149709799,
"learning_rate": 1.8061757424579716e-05,
"loss": 0.1207,
"step": 2470
},
{
"epoch": 0.73,
"grad_norm": 2.1681517377729382,
"learning_rate": 1.8052180744221784e-05,
"loss": 0.1197,
"step": 2475
},
{
"epoch": 0.73,
"grad_norm": 2.7090062894808002,
"learning_rate": 1.8042583015728598e-05,
"loss": 0.0792,
"step": 2480
},
{
"epoch": 0.73,
"grad_norm": 4.747077846148363,
"learning_rate": 1.8032964264191402e-05,
"loss": 0.1143,
"step": 2485
},
{
"epoch": 0.73,
"grad_norm": 2.74552501361813,
"learning_rate": 1.8023324514756436e-05,
"loss": 0.1265,
"step": 2490
},
{
"epoch": 0.73,
"grad_norm": 4.1581416593625296,
"learning_rate": 1.801366379262481e-05,
"loss": 0.072,
"step": 2495
},
{
"epoch": 0.73,
"grad_norm": 1.6406189691341908,
"learning_rate": 1.8003982123052474e-05,
"loss": 0.0814,
"step": 2500
},
{
"epoch": 0.73,
"grad_norm": 2.712884736792791,
"learning_rate": 1.7994279531350135e-05,
"loss": 0.0973,
"step": 2505
},
{
"epoch": 0.74,
"grad_norm": 1.204869182656131,
"learning_rate": 1.7984556042883195e-05,
"loss": 0.0725,
"step": 2510
},
{
"epoch": 0.74,
"grad_norm": 1.63571544112928,
"learning_rate": 1.7974811683071688e-05,
"loss": 0.1416,
"step": 2515
},
{
"epoch": 0.74,
"grad_norm": 0.8179280822029832,
"learning_rate": 1.7965046477390223e-05,
"loss": 0.08,
"step": 2520
},
{
"epoch": 0.74,
"grad_norm": 3.1456845650432697,
"learning_rate": 1.7955260451367887e-05,
"loss": 0.0939,
"step": 2525
},
{
"epoch": 0.74,
"grad_norm": 3.068511046140001,
"learning_rate": 1.7945453630588214e-05,
"loss": 0.074,
"step": 2530
},
{
"epoch": 0.74,
"grad_norm": 1.8530528834496558,
"learning_rate": 1.7935626040689087e-05,
"loss": 0.1254,
"step": 2535
},
{
"epoch": 0.75,
"grad_norm": 9.466013698713851,
"learning_rate": 1.7925777707362694e-05,
"loss": 0.1031,
"step": 2540
},
{
"epoch": 0.75,
"grad_norm": 2.8626798281758683,
"learning_rate": 1.791590865635546e-05,
"loss": 0.0906,
"step": 2545
},
{
"epoch": 0.75,
"grad_norm": 5.788311578849042,
"learning_rate": 1.7906018913467957e-05,
"loss": 0.1191,
"step": 2550
},
{
"epoch": 0.75,
"grad_norm": 15.34485630001676,
"learning_rate": 1.7896108504554858e-05,
"loss": 0.1703,
"step": 2555
},
{
"epoch": 0.75,
"grad_norm": 1.878644897302772,
"learning_rate": 1.7886177455524865e-05,
"loss": 0.0978,
"step": 2560
},
{
"epoch": 0.75,
"grad_norm": 3.9594573315714197,
"learning_rate": 1.7876225792340635e-05,
"loss": 0.1066,
"step": 2565
},
{
"epoch": 0.75,
"grad_norm": 6.412819373671454,
"learning_rate": 1.786625354101872e-05,
"loss": 0.1204,
"step": 2570
},
{
"epoch": 0.76,
"grad_norm": 9.689461195761101,
"learning_rate": 1.7856260727629495e-05,
"loss": 0.1137,
"step": 2575
},
{
"epoch": 0.76,
"grad_norm": 3.5251907080608054,
"learning_rate": 1.784624737829709e-05,
"loss": 0.1519,
"step": 2580
},
{
"epoch": 0.76,
"grad_norm": 2.735467659755077,
"learning_rate": 1.783621351919932e-05,
"loss": 0.0956,
"step": 2585
},
{
"epoch": 0.76,
"grad_norm": 2.579970560423387,
"learning_rate": 1.7826159176567616e-05,
"loss": 0.0965,
"step": 2590
},
{
"epoch": 0.76,
"grad_norm": 5.7447980487012185,
"learning_rate": 1.781608437668697e-05,
"loss": 0.1355,
"step": 2595
},
{
"epoch": 0.76,
"grad_norm": 2.746235872283559,
"learning_rate": 1.7805989145895847e-05,
"loss": 0.0879,
"step": 2600
},
{
"epoch": 0.76,
"grad_norm": 3.6558387720713794,
"learning_rate": 1.779587351058612e-05,
"loss": 0.1266,
"step": 2605
},
{
"epoch": 0.77,
"grad_norm": 2.79558623046737,
"learning_rate": 1.7785737497203013e-05,
"loss": 0.1805,
"step": 2610
},
{
"epoch": 0.77,
"grad_norm": 2.9429418600589123,
"learning_rate": 1.7775581132245026e-05,
"loss": 0.1069,
"step": 2615
},
{
"epoch": 0.77,
"grad_norm": 1.5095627662015014,
"learning_rate": 1.776540444226386e-05,
"loss": 0.047,
"step": 2620
},
{
"epoch": 0.77,
"grad_norm": 3.495377076072146,
"learning_rate": 1.775520745386434e-05,
"loss": 0.0633,
"step": 2625
},
{
"epoch": 0.77,
"grad_norm": 3.0385032472654325,
"learning_rate": 1.774499019370438e-05,
"loss": 0.1487,
"step": 2630
},
{
"epoch": 0.77,
"grad_norm": 10.965757999825739,
"learning_rate": 1.773475268849488e-05,
"loss": 0.1373,
"step": 2635
},
{
"epoch": 0.77,
"grad_norm": 2.8123652108000585,
"learning_rate": 1.772449496499966e-05,
"loss": 0.1187,
"step": 2640
},
{
"epoch": 0.78,
"grad_norm": 4.66597249908126,
"learning_rate": 1.77142170500354e-05,
"loss": 0.1114,
"step": 2645
},
{
"epoch": 0.78,
"grad_norm": 2.676562831364225,
"learning_rate": 1.770391897047157e-05,
"loss": 0.1213,
"step": 2650
},
{
"epoch": 0.78,
"grad_norm": 2.0506018569230973,
"learning_rate": 1.769360075323036e-05,
"loss": 0.1216,
"step": 2655
},
{
"epoch": 0.78,
"grad_norm": 3.4758686873519644,
"learning_rate": 1.7683262425286593e-05,
"loss": 0.1068,
"step": 2660
},
{
"epoch": 0.78,
"grad_norm": 16.360011105708523,
"learning_rate": 1.7672904013667675e-05,
"loss": 0.0836,
"step": 2665
},
{
"epoch": 0.78,
"grad_norm": 2.2472093017776724,
"learning_rate": 1.7662525545453518e-05,
"loss": 0.1206,
"step": 2670
},
{
"epoch": 0.78,
"grad_norm": 3.264614259094197,
"learning_rate": 1.7652127047776464e-05,
"loss": 0.0736,
"step": 2675
},
{
"epoch": 0.79,
"grad_norm": 4.838998543720377,
"learning_rate": 1.7641708547821218e-05,
"loss": 0.112,
"step": 2680
},
{
"epoch": 0.79,
"grad_norm": 1.889257267329085,
"learning_rate": 1.7631270072824786e-05,
"loss": 0.0915,
"step": 2685
},
{
"epoch": 0.79,
"grad_norm": 3.489350329159354,
"learning_rate": 1.762081165007638e-05,
"loss": 0.0872,
"step": 2690
},
{
"epoch": 0.79,
"grad_norm": 5.007436753100703,
"learning_rate": 1.7610333306917367e-05,
"loss": 0.1069,
"step": 2695
},
{
"epoch": 0.79,
"grad_norm": 1.8855245850871363,
"learning_rate": 1.75998350707412e-05,
"loss": 0.1096,
"step": 2700
},
{
"epoch": 0.79,
"grad_norm": 4.28080691529872,
"learning_rate": 1.7589316968993323e-05,
"loss": 0.1135,
"step": 2705
},
{
"epoch": 0.79,
"grad_norm": 2.188803148392315,
"learning_rate": 1.7578779029171128e-05,
"loss": 0.0631,
"step": 2710
},
{
"epoch": 0.8,
"grad_norm": 3.588543379312152,
"learning_rate": 1.7568221278823862e-05,
"loss": 0.1063,
"step": 2715
},
{
"epoch": 0.8,
"grad_norm": 2.6512479470769765,
"learning_rate": 1.7557643745552566e-05,
"loss": 0.0792,
"step": 2720
},
{
"epoch": 0.8,
"grad_norm": 2.4622361404876334,
"learning_rate": 1.7547046457009995e-05,
"loss": 0.0815,
"step": 2725
},
{
"epoch": 0.8,
"grad_norm": 1.181686879663521,
"learning_rate": 1.7536429440900554e-05,
"loss": 0.0855,
"step": 2730
},
{
"epoch": 0.8,
"grad_norm": 3.0605705567048047,
"learning_rate": 1.7525792724980225e-05,
"loss": 0.1384,
"step": 2735
},
{
"epoch": 0.8,
"grad_norm": 3.0361367388503884,
"learning_rate": 1.7515136337056476e-05,
"loss": 0.0652,
"step": 2740
},
{
"epoch": 0.81,
"grad_norm": 3.6999816430126207,
"learning_rate": 1.750446030498822e-05,
"loss": 0.1307,
"step": 2745
},
{
"epoch": 0.81,
"grad_norm": 1.7331629461515146,
"learning_rate": 1.7493764656685725e-05,
"loss": 0.085,
"step": 2750
},
{
"epoch": 0.81,
"grad_norm": 3.3426839965302104,
"learning_rate": 1.7483049420110526e-05,
"loss": 0.1107,
"step": 2755
},
{
"epoch": 0.81,
"grad_norm": 1.8945908369938274,
"learning_rate": 1.747231462327538e-05,
"loss": 0.0677,
"step": 2760
},
{
"epoch": 0.81,
"grad_norm": 2.375178480970911,
"learning_rate": 1.7461560294244185e-05,
"loss": 0.0816,
"step": 2765
},
{
"epoch": 0.81,
"grad_norm": 2.7160314363337323,
"learning_rate": 1.7450786461131886e-05,
"loss": 0.1479,
"step": 2770
},
{
"epoch": 0.81,
"grad_norm": 2.1767080856674283,
"learning_rate": 1.7439993152104424e-05,
"loss": 0.0701,
"step": 2775
},
{
"epoch": 0.82,
"grad_norm": 2.8479809150806425,
"learning_rate": 1.7429180395378667e-05,
"loss": 0.1213,
"step": 2780
},
{
"epoch": 0.82,
"grad_norm": 3.1833758133322387,
"learning_rate": 1.741834821922231e-05,
"loss": 0.0927,
"step": 2785
},
{
"epoch": 0.82,
"grad_norm": 3.7860456588982396,
"learning_rate": 1.7407496651953824e-05,
"loss": 0.0896,
"step": 2790
},
{
"epoch": 0.82,
"grad_norm": 3.127110340900602,
"learning_rate": 1.739662572194237e-05,
"loss": 0.0593,
"step": 2795
},
{
"epoch": 0.82,
"grad_norm": 1.805218044753091,
"learning_rate": 1.7385735457607728e-05,
"loss": 0.1007,
"step": 2800
},
{
"epoch": 0.82,
"grad_norm": 2.552175891261757,
"learning_rate": 1.7374825887420227e-05,
"loss": 0.1329,
"step": 2805
},
{
"epoch": 0.82,
"grad_norm": 1.4027082134325906,
"learning_rate": 1.7363897039900673e-05,
"loss": 0.0775,
"step": 2810
},
{
"epoch": 0.83,
"grad_norm": 2.5766792636413722,
"learning_rate": 1.7352948943620252e-05,
"loss": 0.1039,
"step": 2815
},
{
"epoch": 0.83,
"grad_norm": 2.315064631933743,
"learning_rate": 1.7341981627200486e-05,
"loss": 0.1002,
"step": 2820
},
{
"epoch": 0.83,
"grad_norm": 1.77275038528492,
"learning_rate": 1.733099511931314e-05,
"loss": 0.059,
"step": 2825
},
{
"epoch": 0.83,
"grad_norm": 3.6553802759984086,
"learning_rate": 1.731998944868015e-05,
"loss": 0.0873,
"step": 2830
},
{
"epoch": 0.83,
"grad_norm": 9.277418585224488,
"learning_rate": 1.730896464407355e-05,
"loss": 0.1054,
"step": 2835
},
{
"epoch": 0.83,
"grad_norm": 2.330825468379001,
"learning_rate": 1.7297920734315397e-05,
"loss": 0.0841,
"step": 2840
},
{
"epoch": 0.83,
"grad_norm": 1.5353749049117653,
"learning_rate": 1.728685774827769e-05,
"loss": 0.1018,
"step": 2845
},
{
"epoch": 0.84,
"grad_norm": 2.887634875297545,
"learning_rate": 1.7275775714882302e-05,
"loss": 0.1114,
"step": 2850
},
{
"epoch": 0.84,
"grad_norm": 2.453602349555872,
"learning_rate": 1.7264674663100908e-05,
"loss": 0.1401,
"step": 2855
},
{
"epoch": 0.84,
"grad_norm": 1.973689346976038,
"learning_rate": 1.7253554621954888e-05,
"loss": 0.1036,
"step": 2860
},
{
"epoch": 0.84,
"grad_norm": 2.3448153299848964,
"learning_rate": 1.7242415620515277e-05,
"loss": 0.1152,
"step": 2865
},
{
"epoch": 0.84,
"grad_norm": 3.1738861172586437,
"learning_rate": 1.7231257687902668e-05,
"loss": 0.1154,
"step": 2870
},
{
"epoch": 0.84,
"grad_norm": 5.216105630624163,
"learning_rate": 1.722008085328716e-05,
"loss": 0.0901,
"step": 2875
},
{
"epoch": 0.84,
"grad_norm": 1.687338997966392,
"learning_rate": 1.7208885145888262e-05,
"loss": 0.0996,
"step": 2880
},
{
"epoch": 0.85,
"grad_norm": 2.5681904204171584,
"learning_rate": 1.7197670594974815e-05,
"loss": 0.1147,
"step": 2885
},
{
"epoch": 0.85,
"grad_norm": 3.9337900031340416,
"learning_rate": 1.718643722986492e-05,
"loss": 0.0782,
"step": 2890
},
{
"epoch": 0.85,
"grad_norm": 4.399086614652283,
"learning_rate": 1.7175185079925877e-05,
"loss": 0.1258,
"step": 2895
},
{
"epoch": 0.85,
"grad_norm": 1.9000347309953698,
"learning_rate": 1.7163914174574092e-05,
"loss": 0.1434,
"step": 2900
},
{
"epoch": 0.85,
"grad_norm": 3.8364789467985507,
"learning_rate": 1.7152624543274994e-05,
"loss": 0.0865,
"step": 2905
},
{
"epoch": 0.85,
"grad_norm": 2.3865032691893213,
"learning_rate": 1.7141316215542975e-05,
"loss": 0.0866,
"step": 2910
},
{
"epoch": 0.86,
"grad_norm": 2.0857563297304167,
"learning_rate": 1.71299892209413e-05,
"loss": 0.0924,
"step": 2915
},
{
"epoch": 0.86,
"grad_norm": 2.345942984458089,
"learning_rate": 1.7118643589082043e-05,
"loss": 0.0708,
"step": 2920
},
{
"epoch": 0.86,
"grad_norm": 1.8641359939859798,
"learning_rate": 1.7107279349625992e-05,
"loss": 0.0788,
"step": 2925
},
{
"epoch": 0.86,
"grad_norm": 2.272737375583735,
"learning_rate": 1.7095896532282584e-05,
"loss": 0.0851,
"step": 2930
},
{
"epoch": 0.86,
"grad_norm": 2.4039124864082657,
"learning_rate": 1.7084495166809822e-05,
"loss": 0.1068,
"step": 2935
},
{
"epoch": 0.86,
"grad_norm": 1.1218714658208662,
"learning_rate": 1.707307528301421e-05,
"loss": 0.1091,
"step": 2940
},
{
"epoch": 0.86,
"grad_norm": 0.9891789998991855,
"learning_rate": 1.7061636910750646e-05,
"loss": 0.096,
"step": 2945
},
{
"epoch": 0.87,
"grad_norm": 3.194805823262511,
"learning_rate": 1.7050180079922373e-05,
"loss": 0.1006,
"step": 2950
},
{
"epoch": 0.87,
"grad_norm": 2.257798455534552,
"learning_rate": 1.7038704820480898e-05,
"loss": 0.0914,
"step": 2955
},
{
"epoch": 0.87,
"grad_norm": 5.78033176037585,
"learning_rate": 1.7027211162425888e-05,
"loss": 0.0965,
"step": 2960
},
{
"epoch": 0.87,
"grad_norm": 2.6184356901420025,
"learning_rate": 1.7015699135805122e-05,
"loss": 0.1114,
"step": 2965
},
{
"epoch": 0.87,
"grad_norm": 1.8106354297762863,
"learning_rate": 1.70041687707144e-05,
"loss": 0.0941,
"step": 2970
},
{
"epoch": 0.87,
"grad_norm": 1.245232646301207,
"learning_rate": 1.699262009729745e-05,
"loss": 0.0814,
"step": 2975
},
{
"epoch": 0.87,
"grad_norm": 3.442666904279059,
"learning_rate": 1.6981053145745877e-05,
"loss": 0.1418,
"step": 2980
},
{
"epoch": 0.88,
"grad_norm": 2.2823324901203557,
"learning_rate": 1.6969467946299073e-05,
"loss": 0.0593,
"step": 2985
},
{
"epoch": 0.88,
"grad_norm": 2.3494464059884015,
"learning_rate": 1.6957864529244123e-05,
"loss": 0.0696,
"step": 2990
},
{
"epoch": 0.88,
"grad_norm": 4.051449109074783,
"learning_rate": 1.694624292491575e-05,
"loss": 0.091,
"step": 2995
},
{
"epoch": 0.88,
"grad_norm": 2.6735251166535616,
"learning_rate": 1.6934603163696212e-05,
"loss": 0.089,
"step": 3000
},
{
"epoch": 0.88,
"grad_norm": 1.8986340958640284,
"learning_rate": 1.6922945276015244e-05,
"loss": 0.087,
"step": 3005
},
{
"epoch": 0.88,
"grad_norm": 3.912312190511564,
"learning_rate": 1.691126929234996e-05,
"loss": 0.1154,
"step": 3010
},
{
"epoch": 0.88,
"grad_norm": 3.9209068288748936,
"learning_rate": 1.6899575243224794e-05,
"loss": 0.0845,
"step": 3015
},
{
"epoch": 0.89,
"grad_norm": 0.986967487651937,
"learning_rate": 1.6887863159211403e-05,
"loss": 0.0563,
"step": 3020
},
{
"epoch": 0.89,
"grad_norm": 3.5756032794838775,
"learning_rate": 1.6876133070928584e-05,
"loss": 0.1154,
"step": 3025
},
{
"epoch": 0.89,
"grad_norm": 2.8732144989552793,
"learning_rate": 1.6864385009042215e-05,
"loss": 0.0882,
"step": 3030
},
{
"epoch": 0.89,
"grad_norm": 3.2146073135071584,
"learning_rate": 1.6852619004265157e-05,
"loss": 0.0746,
"step": 3035
},
{
"epoch": 0.89,
"grad_norm": 2.032882973038013,
"learning_rate": 1.684083508735718e-05,
"loss": 0.0919,
"step": 3040
},
{
"epoch": 0.89,
"grad_norm": 1.7212867050301923,
"learning_rate": 1.6829033289124876e-05,
"loss": 0.061,
"step": 3045
},
{
"epoch": 0.89,
"grad_norm": 2.0434776241308557,
"learning_rate": 1.681721364042159e-05,
"loss": 0.0608,
"step": 3050
},
{
"epoch": 0.9,
"grad_norm": 2.4595918397424077,
"learning_rate": 1.6805376172147335e-05,
"loss": 0.0618,
"step": 3055
},
{
"epoch": 0.9,
"grad_norm": 1.9062625696987283,
"learning_rate": 1.6793520915248704e-05,
"loss": 0.106,
"step": 3060
},
{
"epoch": 0.9,
"grad_norm": 3.0695618155829765,
"learning_rate": 1.6781647900718797e-05,
"loss": 0.0826,
"step": 3065
},
{
"epoch": 0.9,
"grad_norm": 2.5737887820804533,
"learning_rate": 1.676975715959714e-05,
"loss": 0.0896,
"step": 3070
},
{
"epoch": 0.9,
"grad_norm": 5.4035821406888855,
"learning_rate": 1.67578487229696e-05,
"loss": 0.0747,
"step": 3075
},
{
"epoch": 0.9,
"grad_norm": 4.031141412736291,
"learning_rate": 1.67459226219683e-05,
"loss": 0.1035,
"step": 3080
},
{
"epoch": 0.9,
"grad_norm": 2.8672651649371415,
"learning_rate": 1.6733978887771548e-05,
"loss": 0.1211,
"step": 3085
},
{
"epoch": 0.91,
"grad_norm": 3.622543160107051,
"learning_rate": 1.6722017551603752e-05,
"loss": 0.0782,
"step": 3090
},
{
"epoch": 0.91,
"grad_norm": 2.611185690533942,
"learning_rate": 1.6710038644735328e-05,
"loss": 0.1197,
"step": 3095
},
{
"epoch": 0.91,
"grad_norm": 2.239743805096798,
"learning_rate": 1.6698042198482645e-05,
"loss": 0.0694,
"step": 3100
},
{
"epoch": 0.91,
"grad_norm": 3.6697206061343963,
"learning_rate": 1.6686028244207902e-05,
"loss": 0.0896,
"step": 3105
},
{
"epoch": 0.91,
"grad_norm": 2.3344027475446776,
"learning_rate": 1.667399681331909e-05,
"loss": 0.0863,
"step": 3110
},
{
"epoch": 0.91,
"grad_norm": 5.716084510514838,
"learning_rate": 1.666194793726987e-05,
"loss": 0.0548,
"step": 3115
},
{
"epoch": 0.92,
"grad_norm": 3.1415750006162626,
"learning_rate": 1.6649881647559527e-05,
"loss": 0.0684,
"step": 3120
},
{
"epoch": 0.92,
"grad_norm": 2.1766147626420187,
"learning_rate": 1.6637797975732855e-05,
"loss": 0.0786,
"step": 3125
},
{
"epoch": 0.92,
"grad_norm": 2.602913079216308,
"learning_rate": 1.6625696953380104e-05,
"loss": 0.1321,
"step": 3130
},
{
"epoch": 0.92,
"grad_norm": 2.2070894756907418,
"learning_rate": 1.661357861213687e-05,
"loss": 0.0912,
"step": 3135
},
{
"epoch": 0.92,
"grad_norm": 1.6298032344638562,
"learning_rate": 1.6601442983684042e-05,
"loss": 0.0802,
"step": 3140
},
{
"epoch": 0.92,
"grad_norm": 2.7174691000044064,
"learning_rate": 1.658929009974768e-05,
"loss": 0.1251,
"step": 3145
},
{
"epoch": 0.92,
"grad_norm": 4.932366321915814,
"learning_rate": 1.657711999209898e-05,
"loss": 0.1141,
"step": 3150
},
{
"epoch": 0.93,
"grad_norm": 9.069104123184824,
"learning_rate": 1.656493269255415e-05,
"loss": 0.1253,
"step": 3155
},
{
"epoch": 0.93,
"grad_norm": 2.9830010731689076,
"learning_rate": 1.6552728232974344e-05,
"loss": 0.0736,
"step": 3160
},
{
"epoch": 0.93,
"grad_norm": 2.3767202070428732,
"learning_rate": 1.654050664526558e-05,
"loss": 0.1481,
"step": 3165
},
{
"epoch": 0.93,
"grad_norm": 2.1967464664457292,
"learning_rate": 1.6528267961378653e-05,
"loss": 0.0737,
"step": 3170
},
{
"epoch": 0.93,
"grad_norm": 2.5475708180655317,
"learning_rate": 1.651601221330906e-05,
"loss": 0.0965,
"step": 3175
},
{
"epoch": 0.93,
"grad_norm": 1.4514690521021125,
"learning_rate": 1.6503739433096893e-05,
"loss": 0.09,
"step": 3180
},
{
"epoch": 0.93,
"grad_norm": 1.0106614527500741,
"learning_rate": 1.649144965282679e-05,
"loss": 0.1028,
"step": 3185
},
{
"epoch": 0.94,
"grad_norm": 2.264948776878529,
"learning_rate": 1.647914290462781e-05,
"loss": 0.1099,
"step": 3190
},
{
"epoch": 0.94,
"grad_norm": 2.387800555259844,
"learning_rate": 1.6466819220673392e-05,
"loss": 0.0858,
"step": 3195
},
{
"epoch": 0.94,
"grad_norm": 2.2939673463714043,
"learning_rate": 1.6454478633181238e-05,
"loss": 0.0965,
"step": 3200
},
{
"epoch": 0.94,
"grad_norm": 2.8815140940347828,
"learning_rate": 1.6442121174413242e-05,
"loss": 0.084,
"step": 3205
},
{
"epoch": 0.94,
"grad_norm": 3.3710190642074394,
"learning_rate": 1.6429746876675406e-05,
"loss": 0.1348,
"step": 3210
},
{
"epoch": 0.94,
"grad_norm": 3.154298991388609,
"learning_rate": 1.6417355772317763e-05,
"loss": 0.1307,
"step": 3215
},
{
"epoch": 0.94,
"grad_norm": 2.431634251842803,
"learning_rate": 1.6404947893734263e-05,
"loss": 0.1269,
"step": 3220
},
{
"epoch": 0.95,
"grad_norm": 1.8476184413662657,
"learning_rate": 1.639252327336273e-05,
"loss": 0.0886,
"step": 3225
},
{
"epoch": 0.95,
"grad_norm": 2.856480869169723,
"learning_rate": 1.6380081943684733e-05,
"loss": 0.1183,
"step": 3230
},
{
"epoch": 0.95,
"grad_norm": 1.4292746192636032,
"learning_rate": 1.6367623937225553e-05,
"loss": 0.062,
"step": 3235
},
{
"epoch": 0.95,
"grad_norm": 1.8800704426452939,
"learning_rate": 1.6355149286554047e-05,
"loss": 0.1223,
"step": 3240
},
{
"epoch": 0.95,
"grad_norm": 3.2229475779778154,
"learning_rate": 1.6342658024282585e-05,
"loss": 0.1167,
"step": 3245
},
{
"epoch": 0.95,
"grad_norm": 5.6176440142314,
"learning_rate": 1.6330150183066983e-05,
"loss": 0.116,
"step": 3250
},
{
"epoch": 0.95,
"grad_norm": 2.101148600040731,
"learning_rate": 1.6317625795606378e-05,
"loss": 0.1162,
"step": 3255
},
{
"epoch": 0.96,
"grad_norm": 2.02917715092117,
"learning_rate": 1.6305084894643172e-05,
"loss": 0.1406,
"step": 3260
},
{
"epoch": 0.96,
"grad_norm": 2.478189120719845,
"learning_rate": 1.6292527512962947e-05,
"loss": 0.1065,
"step": 3265
},
{
"epoch": 0.96,
"grad_norm": 2.3396951822486196,
"learning_rate": 1.627995368339435e-05,
"loss": 0.1115,
"step": 3270
},
{
"epoch": 0.96,
"grad_norm": 3.7337628106177485,
"learning_rate": 1.6267363438809052e-05,
"loss": 0.0846,
"step": 3275
},
{
"epoch": 0.96,
"grad_norm": 4.176925288355294,
"learning_rate": 1.6254756812121612e-05,
"loss": 0.0425,
"step": 3280
},
{
"epoch": 0.96,
"grad_norm": 1.437234356425958,
"learning_rate": 1.6242133836289444e-05,
"loss": 0.1001,
"step": 3285
},
{
"epoch": 0.97,
"grad_norm": 1.6309867032536425,
"learning_rate": 1.6229494544312684e-05,
"loss": 0.0555,
"step": 3290
},
{
"epoch": 0.97,
"grad_norm": 3.254534712202672,
"learning_rate": 1.6216838969234124e-05,
"loss": 0.0781,
"step": 3295
},
{
"epoch": 0.97,
"grad_norm": 2.59152889519255,
"learning_rate": 1.620416714413913e-05,
"loss": 0.0997,
"step": 3300
},
{
"epoch": 0.97,
"grad_norm": 2.0546888924365527,
"learning_rate": 1.6191479102155556e-05,
"loss": 0.117,
"step": 3305
},
{
"epoch": 0.97,
"grad_norm": 3.4303903750156484,
"learning_rate": 1.617877487645364e-05,
"loss": 0.0715,
"step": 3310
},
{
"epoch": 0.97,
"grad_norm": 2.1827612639457574,
"learning_rate": 1.616605450024594e-05,
"loss": 0.1013,
"step": 3315
},
{
"epoch": 0.97,
"grad_norm": 2.865702647381268,
"learning_rate": 1.6153318006787223e-05,
"loss": 0.1131,
"step": 3320
},
{
"epoch": 0.98,
"grad_norm": 2.9285326499896946,
"learning_rate": 1.61405654293744e-05,
"loss": 0.0839,
"step": 3325
},
{
"epoch": 0.98,
"grad_norm": 1.9355299378240092,
"learning_rate": 1.6127796801346437e-05,
"loss": 0.0938,
"step": 3330
},
{
"epoch": 0.98,
"grad_norm": 4.344809423553806,
"learning_rate": 1.6115012156084242e-05,
"loss": 0.066,
"step": 3335
},
{
"epoch": 0.98,
"grad_norm": 5.030847000188601,
"learning_rate": 1.6102211527010608e-05,
"loss": 0.1054,
"step": 3340
},
{
"epoch": 0.98,
"grad_norm": 1.8609269020938257,
"learning_rate": 1.6089394947590123e-05,
"loss": 0.0936,
"step": 3345
},
{
"epoch": 0.98,
"grad_norm": 2.878476404530353,
"learning_rate": 1.6076562451329055e-05,
"loss": 0.0885,
"step": 3350
},
{
"epoch": 0.98,
"grad_norm": 2.297272109392417,
"learning_rate": 1.6063714071775297e-05,
"loss": 0.1302,
"step": 3355
},
{
"epoch": 0.99,
"grad_norm": 2.4727024357295484,
"learning_rate": 1.6050849842518265e-05,
"loss": 0.1108,
"step": 3360
},
{
"epoch": 0.99,
"grad_norm": 2.616633205645327,
"learning_rate": 1.60379697971888e-05,
"loss": 0.1337,
"step": 3365
},
{
"epoch": 0.99,
"grad_norm": 3.0531342208551266,
"learning_rate": 1.60250739694591e-05,
"loss": 0.1144,
"step": 3370
},
{
"epoch": 0.99,
"grad_norm": 2.658444025810342,
"learning_rate": 1.6012162393042625e-05,
"loss": 0.1986,
"step": 3375
},
{
"epoch": 0.99,
"grad_norm": 2.1995801611755392,
"learning_rate": 1.5999235101694003e-05,
"loss": 0.1367,
"step": 3380
},
{
"epoch": 0.99,
"grad_norm": 1.3930532278994612,
"learning_rate": 1.5986292129208938e-05,
"loss": 0.0491,
"step": 3385
},
{
"epoch": 0.99,
"grad_norm": 3.212502054295458,
"learning_rate": 1.597333350942414e-05,
"loss": 0.0862,
"step": 3390
},
{
"epoch": 1.0,
"grad_norm": 1.849378948097875,
"learning_rate": 1.5960359276217222e-05,
"loss": 0.0899,
"step": 3395
},
{
"epoch": 1.0,
"grad_norm": 3.045383499124253,
"learning_rate": 1.5947369463506614e-05,
"loss": 0.0809,
"step": 3400
},
{
"epoch": 1.0,
"grad_norm": 3.350584656668729,
"learning_rate": 1.5934364105251473e-05,
"loss": 0.0692,
"step": 3405
}
],
"logging_steps": 5,
"max_steps": 10227,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}