{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3409, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 11.125381634627788, "learning_rate": 5.308241808752198e-06, "loss": 0.4004, "step": 5 }, { "epoch": 0.0, "grad_norm": 8.92423656628485, "learning_rate": 7.508241808752199e-06, "loss": 0.4709, "step": 10 }, { "epoch": 0.0, "grad_norm": 8.41715845868157, "learning_rate": 8.795159310338741e-06, "loss": 0.3973, "step": 15 }, { "epoch": 0.01, "grad_norm": 15.774318751695052, "learning_rate": 9.708241808752198e-06, "loss": 0.4057, "step": 20 }, { "epoch": 0.01, "grad_norm": 8.099327492758947, "learning_rate": 1.0416483617504396e-05, "loss": 0.2547, "step": 25 }, { "epoch": 0.01, "grad_norm": 7.413713903241163, "learning_rate": 1.099515931033874e-05, "loss": 0.2041, "step": 30 }, { "epoch": 0.01, "grad_norm": 9.03722608897779, "learning_rate": 1.1484422637278927e-05, "loss": 0.2867, "step": 35 }, { "epoch": 0.01, "grad_norm": 8.92455629586844, "learning_rate": 1.1908241808752199e-05, "loss": 0.2939, "step": 40 }, { "epoch": 0.01, "grad_norm": 10.156800616083926, "learning_rate": 1.2282076811925285e-05, "loss": 0.2955, "step": 45 }, { "epoch": 0.01, "grad_norm": 9.420288964636175, "learning_rate": 1.2616483617504393e-05, "loss": 0.1845, "step": 50 }, { "epoch": 0.02, "grad_norm": 16.959594279780564, "learning_rate": 1.2918991369754252e-05, "loss": 0.2387, "step": 55 }, { "epoch": 0.02, "grad_norm": 6.074452366890137, "learning_rate": 1.3195159310338741e-05, "loss": 0.2455, "step": 60 }, { "epoch": 0.02, "grad_norm": 4.742335711555924, "learning_rate": 1.3449209188662602e-05, "loss": 0.159, "step": 65 }, { "epoch": 0.02, "grad_norm": 5.288626310542604, "learning_rate": 1.3684422637278928e-05, "loss": 0.1843, "step": 70 }, { "epoch": 0.02, "grad_norm": 7.44716471541915, "learning_rate": 1.3903401119090938e-05, "loss": 0.1788, "step": 75 }, { "epoch": 0.02, "grad_norm": 3.821096016199364, "learning_rate": 1.4108241808752197e-05, "loss": 0.1514, "step": 80 }, { "epoch": 0.02, "grad_norm": 8.992446696823665, "learning_rate": 1.4300660059502947e-05, "loss": 0.2482, "step": 85 }, { "epoch": 0.03, "grad_norm": 5.839197680608412, "learning_rate": 1.4482076811925287e-05, "loss": 0.2065, "step": 90 }, { "epoch": 0.03, "grad_norm": 8.087974815050226, "learning_rate": 1.4653682338328086e-05, "loss": 0.2201, "step": 95 }, { "epoch": 0.03, "grad_norm": 9.987995189856553, "learning_rate": 1.4816483617504398e-05, "loss": 0.1501, "step": 100 }, { "epoch": 0.03, "grad_norm": 7.240449600169706, "learning_rate": 1.4971340138865471e-05, "loss": 0.2253, "step": 105 }, { "epoch": 0.03, "grad_norm": 15.014801352117672, "learning_rate": 1.5118991369754255e-05, "loss": 0.1938, "step": 110 }, { "epoch": 0.03, "grad_norm": 8.588750321209607, "learning_rate": 1.5260078112077627e-05, "loss": 0.2299, "step": 115 }, { "epoch": 0.04, "grad_norm": 20.519070570571127, "learning_rate": 1.5395159310338742e-05, "loss": 0.2432, "step": 120 }, { "epoch": 0.04, "grad_norm": 4.505660841015201, "learning_rate": 1.5524725426256594e-05, "loss": 0.127, "step": 125 }, { "epoch": 0.04, "grad_norm": 6.3767344789277605, "learning_rate": 1.56492091886626e-05, "loss": 0.192, "step": 130 }, { "epoch": 0.04, "grad_norm": 6.2113668041489545, "learning_rate": 1.576899431351183e-05, "loss": 0.2247, "step": 135 }, { "epoch": 0.04, "grad_norm": 11.909747553663436, "learning_rate": 1.5884422637278926e-05, "loss": 0.2069, "step": 140 }, { "epoch": 0.04, "grad_norm": 5.701699776433005, "learning_rate": 1.5995799998032858e-05, "loss": 0.1573, "step": 145 }, { "epoch": 0.04, "grad_norm": 4.996573800175535, "learning_rate": 1.6103401119090937e-05, "loss": 0.1758, "step": 150 }, { "epoch": 0.05, "grad_norm": 9.43286827567575, "learning_rate": 1.6207473691603323e-05, "loss": 0.1664, "step": 155 }, { "epoch": 0.05, "grad_norm": 28.24015920275371, "learning_rate": 1.6308241808752197e-05, "loss": 0.1844, "step": 160 }, { "epoch": 0.05, "grad_norm": 8.923975388298482, "learning_rate": 1.6405908871340797e-05, "loss": 0.2391, "step": 165 }, { "epoch": 0.05, "grad_norm": 6.282811516394321, "learning_rate": 1.6500660059502946e-05, "loss": 0.1676, "step": 170 }, { "epoch": 0.05, "grad_norm": 10.30498920082885, "learning_rate": 1.6592664446031127e-05, "loss": 0.1934, "step": 175 }, { "epoch": 0.05, "grad_norm": 5.1185908366180195, "learning_rate": 1.6682076811925287e-05, "loss": 0.1531, "step": 180 }, { "epoch": 0.05, "grad_norm": 4.756718752566272, "learning_rate": 1.6769039213135887e-05, "loss": 0.1765, "step": 185 }, { "epoch": 0.06, "grad_norm": 6.629667545228336, "learning_rate": 1.6853682338328088e-05, "loss": 0.2043, "step": 190 }, { "epoch": 0.06, "grad_norm": 1.1680908649614796, "learning_rate": 1.6936126690249144e-05, "loss": 0.1845, "step": 195 }, { "epoch": 0.06, "grad_norm": 2.594897280738297, "learning_rate": 1.7016483617504395e-05, "loss": 0.1142, "step": 200 }, { "epoch": 0.06, "grad_norm": 2.9032680841669585, "learning_rate": 1.7094856218911983e-05, "loss": 0.2007, "step": 205 }, { "epoch": 0.06, "grad_norm": 6.848184342158123, "learning_rate": 1.717134013886547e-05, "loss": 0.1239, "step": 210 }, { "epoch": 0.06, "grad_norm": 6.717780464657794, "learning_rate": 1.7246024269096814e-05, "loss": 0.2148, "step": 215 }, { "epoch": 0.06, "grad_norm": 2.418335623324901, "learning_rate": 1.7318991369754256e-05, "loss": 0.1228, "step": 220 }, { "epoch": 0.07, "grad_norm": 6.090782655933177, "learning_rate": 1.739031862067748e-05, "loss": 0.2336, "step": 225 }, { "epoch": 0.07, "grad_norm": 9.653963182390594, "learning_rate": 1.7460078112077626e-05, "loss": 0.1624, "step": 230 }, { "epoch": 0.07, "grad_norm": 6.782020630535402, "learning_rate": 1.7528337282443e-05, "loss": 0.2082, "step": 235 }, { "epoch": 0.07, "grad_norm": 7.955935238397691, "learning_rate": 1.7595159310338742e-05, "loss": 0.1203, "step": 240 }, { "epoch": 0.07, "grad_norm": 11.86700555838968, "learning_rate": 1.766060346580566e-05, "loss": 0.2336, "step": 245 }, { "epoch": 0.07, "grad_norm": 7.932728692529283, "learning_rate": 1.772472542625659e-05, "loss": 0.1557, "step": 250 }, { "epoch": 0.07, "grad_norm": 9.367369941926402, "learning_rate": 1.7787577561089487e-05, "loss": 0.1778, "step": 255 }, { "epoch": 0.08, "grad_norm": 8.119258788907397, "learning_rate": 1.7849209188662603e-05, "loss": 0.1641, "step": 260 }, { "epoch": 0.08, "grad_norm": 8.877824360780032, "learning_rate": 1.7909666808791235e-05, "loss": 0.2865, "step": 265 }, { "epoch": 0.08, "grad_norm": 5.311739339787183, "learning_rate": 1.796899431351183e-05, "loss": 0.1644, "step": 270 }, { "epoch": 0.08, "grad_norm": 3.648816107438258, "learning_rate": 1.802723317850645e-05, "loss": 0.1989, "step": 275 }, { "epoch": 0.08, "grad_norm": 37.56990879513816, "learning_rate": 1.8084422637278925e-05, "loss": 0.1892, "step": 280 }, { "epoch": 0.08, "grad_norm": 16.74826541858141, "learning_rate": 1.8140599839914632e-05, "loss": 0.2343, "step": 285 }, { "epoch": 0.09, "grad_norm": 4.863631858596725, "learning_rate": 1.8195799998032857e-05, "loss": 0.2181, "step": 290 }, { "epoch": 0.09, "grad_norm": 3.9113582322289524, "learning_rate": 1.8250056517348252e-05, "loss": 0.1715, "step": 295 }, { "epoch": 0.09, "grad_norm": 11.84889258509689, "learning_rate": 1.830340111909094e-05, "loss": 0.1459, "step": 300 }, { "epoch": 0.09, "grad_norm": 4.8514319753979915, "learning_rate": 1.8355863951390547e-05, "loss": 0.136, "step": 305 }, { "epoch": 0.09, "grad_norm": 10.329272877304971, "learning_rate": 1.8407473691603325e-05, "loss": 0.161, "step": 310 }, { "epoch": 0.09, "grad_norm": 15.22633652105913, "learning_rate": 1.8458257640452014e-05, "loss": 0.1721, "step": 315 }, { "epoch": 0.09, "grad_norm": 4.230396160936562, "learning_rate": 1.8508241808752197e-05, "loss": 0.1408, "step": 320 }, { "epoch": 0.1, "grad_norm": 4.777596867016981, "learning_rate": 1.85574509974148e-05, "loss": 0.1496, "step": 325 }, { "epoch": 0.1, "grad_norm": 5.111340439300221, "learning_rate": 1.8605908871340793e-05, "loss": 0.2215, "step": 330 }, { "epoch": 0.1, "grad_norm": 12.347791565416415, "learning_rate": 1.8653638027759297e-05, "loss": 0.1494, "step": 335 }, { "epoch": 0.1, "grad_norm": 9.015406243961076, "learning_rate": 1.8700660059502946e-05, "loss": 0.1212, "step": 340 }, { "epoch": 0.1, "grad_norm": 5.797822201400517, "learning_rate": 1.874699561366417e-05, "loss": 0.142, "step": 345 }, { "epoch": 0.1, "grad_norm": 17.174149644614868, "learning_rate": 1.8792664446031123e-05, "loss": 0.1458, "step": 350 }, { "epoch": 0.1, "grad_norm": 11.455289753213496, "learning_rate": 1.88376854716625e-05, "loss": 0.2466, "step": 355 }, { "epoch": 0.11, "grad_norm": 4.808218103853913, "learning_rate": 1.8882076811925286e-05, "loss": 0.1382, "step": 360 }, { "epoch": 0.11, "grad_norm": 6.0172560821404835, "learning_rate": 1.8925855838288235e-05, "loss": 0.2062, "step": 365 }, { "epoch": 0.11, "grad_norm": 7.361219818784043, "learning_rate": 1.8969039213135886e-05, "loss": 0.2005, "step": 370 }, { "epoch": 0.11, "grad_norm": 12.237376033790202, "learning_rate": 1.9011642927843134e-05, "loss": 0.2208, "step": 375 }, { "epoch": 0.11, "grad_norm": 4.98412721789337, "learning_rate": 1.9053682338328088e-05, "loss": 0.1888, "step": 380 }, { "epoch": 0.11, "grad_norm": 4.540797625815676, "learning_rate": 1.9095172198280984e-05, "loss": 0.1749, "step": 385 }, { "epoch": 0.11, "grad_norm": 3.822828217331199, "learning_rate": 1.9136126690249147e-05, "loss": 0.0922, "step": 390 }, { "epoch": 0.12, "grad_norm": 3.990734216544265, "learning_rate": 1.9176559454741825e-05, "loss": 0.1863, "step": 395 }, { "epoch": 0.12, "grad_norm": 6.910160942356377, "learning_rate": 1.9216483617504394e-05, "loss": 0.2068, "step": 400 }, { "epoch": 0.12, "grad_norm": 4.781285105059034, "learning_rate": 1.9255911815098372e-05, "loss": 0.1525, "step": 405 }, { "epoch": 0.12, "grad_norm": 13.76373104788082, "learning_rate": 1.9294856218911982e-05, "loss": 0.2154, "step": 410 }, { "epoch": 0.12, "grad_norm": 7.48460945287791, "learning_rate": 1.9333328557715434e-05, "loss": 0.1214, "step": 415 }, { "epoch": 0.12, "grad_norm": 9.890476894625708, "learning_rate": 1.937134013886547e-05, "loss": 0.1381, "step": 420 }, { "epoch": 0.12, "grad_norm": 9.300476915169963, "learning_rate": 1.9408901868255147e-05, "loss": 0.0847, "step": 425 }, { "epoch": 0.13, "grad_norm": 6.528248658568062, "learning_rate": 1.9446024269096816e-05, "loss": 0.1411, "step": 430 }, { "epoch": 0.13, "grad_norm": 3.682181191664401, "learning_rate": 1.94827174996194e-05, "loss": 0.1426, "step": 435 }, { "epoch": 0.13, "grad_norm": 8.43703521578652, "learning_rate": 1.951899136975425e-05, "loss": 0.1579, "step": 440 }, { "epoch": 0.13, "grad_norm": 6.56291531656417, "learning_rate": 1.9554855356878272e-05, "loss": 0.2021, "step": 445 }, { "epoch": 0.13, "grad_norm": 10.066497956266401, "learning_rate": 1.9590318620677484e-05, "loss": 0.1154, "step": 450 }, { "epoch": 0.13, "grad_norm": 5.966271330169713, "learning_rate": 1.962539001718933e-05, "loss": 0.2115, "step": 455 }, { "epoch": 0.13, "grad_norm": 5.856232881470111, "learning_rate": 1.9660078112077626e-05, "loss": 0.1418, "step": 460 }, { "epoch": 0.14, "grad_norm": 6.240360608562805, "learning_rate": 1.9694391193189866e-05, "loss": 0.1586, "step": 465 }, { "epoch": 0.14, "grad_norm": 9.757750340699735, "learning_rate": 1.9728337282443e-05, "loss": 0.1449, "step": 470 }, { "epoch": 0.14, "grad_norm": 7.712636650272143, "learning_rate": 1.9761924147080285e-05, "loss": 0.135, "step": 475 }, { "epoch": 0.14, "grad_norm": 4.515393736306068, "learning_rate": 1.9795159310338744e-05, "loss": 0.129, "step": 480 }, { "epoch": 0.14, "grad_norm": 0.7671227888175197, "learning_rate": 1.982805006156388e-05, "loss": 0.222, "step": 485 }, { "epoch": 0.14, "grad_norm": 9.304084815966336, "learning_rate": 1.9860603465805653e-05, "loss": 0.1849, "step": 490 }, { "epoch": 0.15, "grad_norm": 9.749297679157017, "learning_rate": 1.989282637292734e-05, "loss": 0.1362, "step": 495 }, { "epoch": 0.15, "grad_norm": 21.698855546577796, "learning_rate": 1.9924725426256592e-05, "loss": 0.2026, "step": 500 }, { "epoch": 0.15, "grad_norm": 8.82338005089098, "learning_rate": 1.995630707080615e-05, "loss": 0.1667, "step": 505 }, { "epoch": 0.15, "grad_norm": 26.86060661079716, "learning_rate": 1.998757756108949e-05, "loss": 0.175, "step": 510 }, { "epoch": 0.15, "grad_norm": 13.652883014206537, "learning_rate": 1.9999995294744797e-05, "loss": 0.1679, "step": 515 }, { "epoch": 0.15, "grad_norm": 8.716096864047406, "learning_rate": 1.9999966540423482e-05, "loss": 0.1385, "step": 520 }, { "epoch": 0.15, "grad_norm": 9.1722231121385, "learning_rate": 1.99999116458866e-05, "loss": 0.1694, "step": 525 }, { "epoch": 0.16, "grad_norm": 20.79172007745624, "learning_rate": 1.9999830611277667e-05, "loss": 0.2038, "step": 530 }, { "epoch": 0.16, "grad_norm": 7.595957745153807, "learning_rate": 1.9999723436808522e-05, "loss": 0.2264, "step": 535 }, { "epoch": 0.16, "grad_norm": 3.7192450442308376, "learning_rate": 1.9999590122759357e-05, "loss": 0.1615, "step": 540 }, { "epoch": 0.16, "grad_norm": 3.637037613572214, "learning_rate": 1.9999430669478693e-05, "loss": 0.0805, "step": 545 }, { "epoch": 0.16, "grad_norm": 7.478389632328602, "learning_rate": 1.999924507738338e-05, "loss": 0.1623, "step": 550 }, { "epoch": 0.16, "grad_norm": 0.5979318918112878, "learning_rate": 1.9999033346958624e-05, "loss": 0.1491, "step": 555 }, { "epoch": 0.16, "grad_norm": 12.424319850795971, "learning_rate": 1.999879547875794e-05, "loss": 0.1631, "step": 560 }, { "epoch": 0.17, "grad_norm": 5.76364771735449, "learning_rate": 1.9998531473403187e-05, "loss": 0.2197, "step": 565 }, { "epoch": 0.17, "grad_norm": 6.606496630551793, "learning_rate": 1.999824133158455e-05, "loss": 0.1468, "step": 570 }, { "epoch": 0.17, "grad_norm": 4.41747007920085, "learning_rate": 1.999792505406055e-05, "loss": 0.1366, "step": 575 }, { "epoch": 0.17, "grad_norm": 5.075879536599382, "learning_rate": 1.999758264165802e-05, "loss": 0.0992, "step": 580 }, { "epoch": 0.17, "grad_norm": 22.308467554033715, "learning_rate": 1.9997214095272135e-05, "loss": 0.2057, "step": 585 }, { "epoch": 0.17, "grad_norm": 7.279463258176968, "learning_rate": 1.9996819415866377e-05, "loss": 0.1195, "step": 590 }, { "epoch": 0.17, "grad_norm": 2.9315010662776357, "learning_rate": 1.9996398604472556e-05, "loss": 0.0958, "step": 595 }, { "epoch": 0.18, "grad_norm": 21.026513936696166, "learning_rate": 1.9995951662190794e-05, "loss": 0.1842, "step": 600 }, { "epoch": 0.18, "grad_norm": 4.622313054019638, "learning_rate": 1.9995478590189534e-05, "loss": 0.1048, "step": 605 }, { "epoch": 0.18, "grad_norm": 6.403749452434921, "learning_rate": 1.9994979389705517e-05, "loss": 0.0971, "step": 610 }, { "epoch": 0.18, "grad_norm": 14.94556621011129, "learning_rate": 1.9994454062043795e-05, "loss": 0.2127, "step": 615 }, { "epoch": 0.18, "grad_norm": 8.09929499442626, "learning_rate": 1.999390260857774e-05, "loss": 0.2903, "step": 620 }, { "epoch": 0.18, "grad_norm": 7.006300679141809, "learning_rate": 1.9993325030749006e-05, "loss": 0.1775, "step": 625 }, { "epoch": 0.18, "grad_norm": 9.59543458146263, "learning_rate": 1.9992721330067547e-05, "loss": 0.2223, "step": 630 }, { "epoch": 0.19, "grad_norm": 12.495151647629797, "learning_rate": 1.9992091508111616e-05, "loss": 0.2557, "step": 635 }, { "epoch": 0.19, "grad_norm": 11.89488787437466, "learning_rate": 1.9991435566527757e-05, "loss": 0.2852, "step": 640 }, { "epoch": 0.19, "grad_norm": 3.188905144434785, "learning_rate": 1.999075350703078e-05, "loss": 0.0961, "step": 645 }, { "epoch": 0.19, "grad_norm": 4.654862369916143, "learning_rate": 1.99900453314038e-05, "loss": 0.1578, "step": 650 }, { "epoch": 0.19, "grad_norm": 4.32187595326047, "learning_rate": 1.9989311041498186e-05, "loss": 0.127, "step": 655 }, { "epoch": 0.19, "grad_norm": 4.215063448210065, "learning_rate": 1.9988550639233587e-05, "loss": 0.1392, "step": 660 }, { "epoch": 0.2, "grad_norm": 15.535200940500214, "learning_rate": 1.998776412659792e-05, "loss": 0.1974, "step": 665 }, { "epoch": 0.2, "grad_norm": 4.760906747957173, "learning_rate": 1.998695150564736e-05, "loss": 0.1434, "step": 670 }, { "epoch": 0.2, "grad_norm": 0.061155960694268596, "learning_rate": 1.998611277850633e-05, "loss": 0.117, "step": 675 }, { "epoch": 0.2, "grad_norm": 4.953787678092304, "learning_rate": 1.9985247947367508e-05, "loss": 0.1831, "step": 680 }, { "epoch": 0.2, "grad_norm": 4.678233435016873, "learning_rate": 1.9984357014491816e-05, "loss": 0.1597, "step": 685 }, { "epoch": 0.2, "grad_norm": 3.229829256723626, "learning_rate": 1.9983439982208417e-05, "loss": 0.1587, "step": 690 }, { "epoch": 0.2, "grad_norm": 4.885033869708901, "learning_rate": 1.9982496852914696e-05, "loss": 0.18, "step": 695 }, { "epoch": 0.21, "grad_norm": 6.329976580969712, "learning_rate": 1.9981527629076265e-05, "loss": 0.1383, "step": 700 }, { "epoch": 0.21, "grad_norm": 8.616010763659258, "learning_rate": 1.9980532313226964e-05, "loss": 0.1439, "step": 705 }, { "epoch": 0.21, "grad_norm": 5.805509239551478, "learning_rate": 1.9979510907968834e-05, "loss": 0.1916, "step": 710 }, { "epoch": 0.21, "grad_norm": 9.354847725343019, "learning_rate": 1.9978463415972135e-05, "loss": 0.1291, "step": 715 }, { "epoch": 0.21, "grad_norm": 11.028811436490342, "learning_rate": 1.997738983997531e-05, "loss": 0.1329, "step": 720 }, { "epoch": 0.21, "grad_norm": 5.398403758317581, "learning_rate": 1.9976290182784994e-05, "loss": 0.169, "step": 725 }, { "epoch": 0.21, "grad_norm": 1.5150079159167873, "learning_rate": 1.9975164447276022e-05, "loss": 0.14, "step": 730 }, { "epoch": 0.22, "grad_norm": 5.437256906271298, "learning_rate": 1.9974012636391393e-05, "loss": 0.1706, "step": 735 }, { "epoch": 0.22, "grad_norm": 4.686594964713854, "learning_rate": 1.9972834753142275e-05, "loss": 0.1086, "step": 740 }, { "epoch": 0.22, "grad_norm": 7.059615364727769, "learning_rate": 1.9971630800607995e-05, "loss": 0.1391, "step": 745 }, { "epoch": 0.22, "grad_norm": 3.759488626159761, "learning_rate": 1.9970400781936044e-05, "loss": 0.1252, "step": 750 }, { "epoch": 0.22, "grad_norm": 5.196072033882789, "learning_rate": 1.9969144700342042e-05, "loss": 0.1907, "step": 755 }, { "epoch": 0.22, "grad_norm": 12.217160172705722, "learning_rate": 1.9967862559109757e-05, "loss": 0.2199, "step": 760 }, { "epoch": 0.22, "grad_norm": 4.605767548552978, "learning_rate": 1.996655436159108e-05, "loss": 0.0851, "step": 765 }, { "epoch": 0.23, "grad_norm": 2.7976726055332968, "learning_rate": 1.9965220111206022e-05, "loss": 0.1428, "step": 770 }, { "epoch": 0.23, "grad_norm": 6.6315563077229465, "learning_rate": 1.9963859811442695e-05, "loss": 0.1419, "step": 775 }, { "epoch": 0.23, "grad_norm": 7.1281321818622025, "learning_rate": 1.996247346585733e-05, "loss": 0.1895, "step": 780 }, { "epoch": 0.23, "grad_norm": 5.305688238343365, "learning_rate": 1.9961061078074236e-05, "loss": 0.1973, "step": 785 }, { "epoch": 0.23, "grad_norm": 6.655387141934961, "learning_rate": 1.99596226517858e-05, "loss": 0.222, "step": 790 }, { "epoch": 0.23, "grad_norm": 3.802457245856603, "learning_rate": 1.9958158190752497e-05, "loss": 0.1619, "step": 795 }, { "epoch": 0.23, "grad_norm": 4.205338699906657, "learning_rate": 1.9956667698802847e-05, "loss": 0.1411, "step": 800 }, { "epoch": 0.24, "grad_norm": 18.6994174478946, "learning_rate": 1.9955151179833437e-05, "loss": 0.1763, "step": 805 }, { "epoch": 0.24, "grad_norm": 5.203119684623317, "learning_rate": 1.995360863780889e-05, "loss": 0.1651, "step": 810 }, { "epoch": 0.24, "grad_norm": 1.1272129969070257, "learning_rate": 1.9952040076761857e-05, "loss": 0.1222, "step": 815 }, { "epoch": 0.24, "grad_norm": 7.271805429556148, "learning_rate": 1.9950445500793015e-05, "loss": 0.1072, "step": 820 }, { "epoch": 0.24, "grad_norm": 7.652271467084596, "learning_rate": 1.994882491407105e-05, "loss": 0.1221, "step": 825 }, { "epoch": 0.24, "grad_norm": 11.746166496698574, "learning_rate": 1.9947178320832656e-05, "loss": 0.2158, "step": 830 }, { "epoch": 0.24, "grad_norm": 6.5191485713782535, "learning_rate": 1.99455057253825e-05, "loss": 0.1689, "step": 835 }, { "epoch": 0.25, "grad_norm": 5.7533957364325286, "learning_rate": 1.9943807132093236e-05, "loss": 0.2428, "step": 840 }, { "epoch": 0.25, "grad_norm": 7.743116459704382, "learning_rate": 1.9942082545405485e-05, "loss": 0.2132, "step": 845 }, { "epoch": 0.25, "grad_norm": 7.284870082767622, "learning_rate": 1.9940331969827816e-05, "loss": 0.1184, "step": 850 }, { "epoch": 0.25, "grad_norm": 6.8174806638918355, "learning_rate": 1.9938555409936746e-05, "loss": 0.1717, "step": 855 }, { "epoch": 0.25, "grad_norm": 4.522024299566416, "learning_rate": 1.9936752870376722e-05, "loss": 0.1544, "step": 860 }, { "epoch": 0.25, "grad_norm": 10.08832302588539, "learning_rate": 1.9934924355860107e-05, "loss": 0.1735, "step": 865 }, { "epoch": 0.26, "grad_norm": 12.48917866555821, "learning_rate": 1.993306987116717e-05, "loss": 0.1627, "step": 870 }, { "epoch": 0.26, "grad_norm": 5.430627799088825, "learning_rate": 1.993118942114608e-05, "loss": 0.1113, "step": 875 }, { "epoch": 0.26, "grad_norm": 6.3813002626417905, "learning_rate": 1.992928301071288e-05, "loss": 0.1718, "step": 880 }, { "epoch": 0.26, "grad_norm": 13.52771259964932, "learning_rate": 1.9927350644851477e-05, "loss": 0.1118, "step": 885 }, { "epoch": 0.26, "grad_norm": 3.5900299721319406, "learning_rate": 1.9925392328613644e-05, "loss": 0.1357, "step": 890 }, { "epoch": 0.26, "grad_norm": 10.835472443494583, "learning_rate": 1.992340806711899e-05, "loss": 0.0815, "step": 895 }, { "epoch": 0.26, "grad_norm": 6.39064559373592, "learning_rate": 1.992139786555496e-05, "loss": 0.2278, "step": 900 }, { "epoch": 0.27, "grad_norm": 14.873787966804063, "learning_rate": 1.9919361729176798e-05, "loss": 0.1245, "step": 905 }, { "epoch": 0.27, "grad_norm": 0.8036248310449102, "learning_rate": 1.991729966330756e-05, "loss": 0.1297, "step": 910 }, { "epoch": 0.27, "grad_norm": 2.7695654260331835, "learning_rate": 1.991521167333809e-05, "loss": 0.1493, "step": 915 }, { "epoch": 0.27, "grad_norm": 7.413157143500232, "learning_rate": 1.9913097764727006e-05, "loss": 0.1712, "step": 920 }, { "epoch": 0.27, "grad_norm": 5.35044325659026, "learning_rate": 1.9910957943000678e-05, "loss": 0.1923, "step": 925 }, { "epoch": 0.27, "grad_norm": 4.939892539272544, "learning_rate": 1.9908792213753223e-05, "loss": 0.1262, "step": 930 }, { "epoch": 0.27, "grad_norm": 4.425995433503358, "learning_rate": 1.990660058264649e-05, "loss": 0.1316, "step": 935 }, { "epoch": 0.28, "grad_norm": 6.682173559661954, "learning_rate": 1.9904383055410045e-05, "loss": 0.2628, "step": 940 }, { "epoch": 0.28, "grad_norm": 19.871499575052912, "learning_rate": 1.9902139637841146e-05, "loss": 0.1646, "step": 945 }, { "epoch": 0.28, "grad_norm": 6.951640795770302, "learning_rate": 1.989987033580475e-05, "loss": 0.1733, "step": 950 }, { "epoch": 0.28, "grad_norm": 9.387316535142354, "learning_rate": 1.989757515523346e-05, "loss": 0.1448, "step": 955 }, { "epoch": 0.28, "grad_norm": 3.9401918928016197, "learning_rate": 1.9895254102127562e-05, "loss": 0.1421, "step": 960 }, { "epoch": 0.28, "grad_norm": 14.253092112437391, "learning_rate": 1.989290718255496e-05, "loss": 0.205, "step": 965 }, { "epoch": 0.28, "grad_norm": 3.6398631596257545, "learning_rate": 1.9890534402651184e-05, "loss": 0.0899, "step": 970 }, { "epoch": 0.29, "grad_norm": 15.630617642490721, "learning_rate": 1.988813576861938e-05, "loss": 0.1328, "step": 975 }, { "epoch": 0.29, "grad_norm": 6.6456920006850115, "learning_rate": 1.9885711286730267e-05, "loss": 0.1899, "step": 980 }, { "epoch": 0.29, "grad_norm": 7.585995077745574, "learning_rate": 1.9883260963322152e-05, "loss": 0.1583, "step": 985 }, { "epoch": 0.29, "grad_norm": 3.476123842146937, "learning_rate": 1.98807848048009e-05, "loss": 0.1855, "step": 990 }, { "epoch": 0.29, "grad_norm": 4.179578662061332, "learning_rate": 1.987828281763991e-05, "loss": 0.1681, "step": 995 }, { "epoch": 0.29, "grad_norm": 3.565609561977014, "learning_rate": 1.9875755008380104e-05, "loss": 0.1187, "step": 1000 }, { "epoch": 0.29, "grad_norm": 9.26279986638738, "learning_rate": 1.9873201383629913e-05, "loss": 0.1337, "step": 1005 }, { "epoch": 0.3, "grad_norm": 3.569019308801666, "learning_rate": 1.987062195006526e-05, "loss": 0.0932, "step": 1010 }, { "epoch": 0.3, "grad_norm": 1.8954161789376982, "learning_rate": 1.986801671442953e-05, "loss": 0.1272, "step": 1015 }, { "epoch": 0.3, "grad_norm": 6.549425452273108, "learning_rate": 1.986538568353358e-05, "loss": 0.2543, "step": 1020 }, { "epoch": 0.3, "grad_norm": 6.401069146499022, "learning_rate": 1.9862728864255677e-05, "loss": 0.1339, "step": 1025 }, { "epoch": 0.3, "grad_norm": 14.623867414912695, "learning_rate": 1.9860046263541537e-05, "loss": 0.1368, "step": 1030 }, { "epoch": 0.3, "grad_norm": 7.5226028629064166, "learning_rate": 1.9857337888404254e-05, "loss": 0.1315, "step": 1035 }, { "epoch": 0.31, "grad_norm": 6.3020352784733795, "learning_rate": 1.985460374592431e-05, "loss": 0.2022, "step": 1040 }, { "epoch": 0.31, "grad_norm": 35.72017508640075, "learning_rate": 1.9851843843249552e-05, "loss": 0.1907, "step": 1045 }, { "epoch": 0.31, "grad_norm": 7.856532164781736, "learning_rate": 1.9849058187595173e-05, "loss": 0.1042, "step": 1050 }, { "epoch": 0.31, "grad_norm": 4.939914354516286, "learning_rate": 1.9846246786243682e-05, "loss": 0.1883, "step": 1055 }, { "epoch": 0.31, "grad_norm": 5.847586448522947, "learning_rate": 1.9843409646544912e-05, "loss": 0.1352, "step": 1060 }, { "epoch": 0.31, "grad_norm": 1.7662920071735315, "learning_rate": 1.984054677591597e-05, "loss": 0.1266, "step": 1065 }, { "epoch": 0.31, "grad_norm": 5.934002297797303, "learning_rate": 1.9837658181841236e-05, "loss": 0.1282, "step": 1070 }, { "epoch": 0.32, "grad_norm": 7.322599560603541, "learning_rate": 1.9834743871872333e-05, "loss": 0.1002, "step": 1075 }, { "epoch": 0.32, "grad_norm": 7.354422877300046, "learning_rate": 1.9831803853628122e-05, "loss": 0.1347, "step": 1080 }, { "epoch": 0.32, "grad_norm": 4.244910165579806, "learning_rate": 1.9828838134794668e-05, "loss": 0.191, "step": 1085 }, { "epoch": 0.32, "grad_norm": 2.189412507115152, "learning_rate": 1.9825846723125222e-05, "loss": 0.129, "step": 1090 }, { "epoch": 0.32, "grad_norm": 4.334446446188146, "learning_rate": 1.9822829626440213e-05, "loss": 0.1606, "step": 1095 }, { "epoch": 0.32, "grad_norm": 10.79965169839475, "learning_rate": 1.9819786852627208e-05, "loss": 0.2085, "step": 1100 }, { "epoch": 0.32, "grad_norm": 5.047688232660633, "learning_rate": 1.9816718409640904e-05, "loss": 0.1335, "step": 1105 }, { "epoch": 0.33, "grad_norm": 3.0521785647450033, "learning_rate": 1.9813624305503105e-05, "loss": 0.0902, "step": 1110 }, { "epoch": 0.33, "grad_norm": 4.404798119689452, "learning_rate": 1.9810504548302706e-05, "loss": 0.1504, "step": 1115 }, { "epoch": 0.33, "grad_norm": 4.276958235694841, "learning_rate": 1.980735914619566e-05, "loss": 0.1787, "step": 1120 }, { "epoch": 0.33, "grad_norm": 6.67294781745391, "learning_rate": 1.9804188107404973e-05, "loss": 0.1485, "step": 1125 }, { "epoch": 0.33, "grad_norm": 1.4659143223309605, "learning_rate": 1.9800991440220652e-05, "loss": 0.1161, "step": 1130 }, { "epoch": 0.33, "grad_norm": 12.717469246978801, "learning_rate": 1.979776915299973e-05, "loss": 0.1409, "step": 1135 }, { "epoch": 0.33, "grad_norm": 7.082211154981896, "learning_rate": 1.9794521254166197e-05, "loss": 0.1543, "step": 1140 }, { "epoch": 0.34, "grad_norm": 5.796340300790522, "learning_rate": 1.9791247752211014e-05, "loss": 0.1151, "step": 1145 }, { "epoch": 0.34, "grad_norm": 5.586857817269538, "learning_rate": 1.978794865569207e-05, "loss": 0.1199, "step": 1150 }, { "epoch": 0.34, "grad_norm": 12.706251296959476, "learning_rate": 1.9784623973234158e-05, "loss": 0.1619, "step": 1155 }, { "epoch": 0.34, "grad_norm": 6.66191554843005, "learning_rate": 1.978127371352898e-05, "loss": 0.1827, "step": 1160 }, { "epoch": 0.34, "grad_norm": 1.7314817335468968, "learning_rate": 1.9777897885335077e-05, "loss": 0.1299, "step": 1165 }, { "epoch": 0.34, "grad_norm": 3.1709913704813957, "learning_rate": 1.9774496497477863e-05, "loss": 0.0935, "step": 1170 }, { "epoch": 0.34, "grad_norm": 6.802763534426451, "learning_rate": 1.9771069558849553e-05, "loss": 0.1747, "step": 1175 }, { "epoch": 0.35, "grad_norm": 5.666618159965396, "learning_rate": 1.9767617078409162e-05, "loss": 0.1248, "step": 1180 }, { "epoch": 0.35, "grad_norm": 12.07309712013722, "learning_rate": 1.9764139065182485e-05, "loss": 0.1369, "step": 1185 }, { "epoch": 0.35, "grad_norm": 5.0768838627507895, "learning_rate": 1.976063552826206e-05, "loss": 0.1426, "step": 1190 }, { "epoch": 0.35, "grad_norm": 2.6195923645371915, "learning_rate": 1.9757106476807156e-05, "loss": 0.1414, "step": 1195 }, { "epoch": 0.35, "grad_norm": 8.745582119219046, "learning_rate": 1.975355192004374e-05, "loss": 0.1537, "step": 1200 }, { "epoch": 0.35, "grad_norm": 4.3619589308843425, "learning_rate": 1.9749971867264468e-05, "loss": 0.2326, "step": 1205 }, { "epoch": 0.35, "grad_norm": 9.986127768195958, "learning_rate": 1.9746366327828637e-05, "loss": 0.0914, "step": 1210 }, { "epoch": 0.36, "grad_norm": 11.679210640348971, "learning_rate": 1.9742735311162177e-05, "loss": 0.1284, "step": 1215 }, { "epoch": 0.36, "grad_norm": 4.432897195420396, "learning_rate": 1.973907882675763e-05, "loss": 0.1578, "step": 1220 }, { "epoch": 0.36, "grad_norm": 9.382375435412673, "learning_rate": 1.973539688417411e-05, "loss": 0.1637, "step": 1225 }, { "epoch": 0.36, "grad_norm": 5.830067001664431, "learning_rate": 1.973168949303729e-05, "loss": 0.1468, "step": 1230 }, { "epoch": 0.36, "grad_norm": 11.741616362731765, "learning_rate": 1.9727956663039367e-05, "loss": 0.1691, "step": 1235 }, { "epoch": 0.36, "grad_norm": 7.39202415980895, "learning_rate": 1.9724198403939053e-05, "loss": 0.1809, "step": 1240 }, { "epoch": 0.37, "grad_norm": 4.693742994361657, "learning_rate": 1.9720414725561538e-05, "loss": 0.0858, "step": 1245 }, { "epoch": 0.37, "grad_norm": 8.617997574967658, "learning_rate": 1.9716605637798452e-05, "loss": 0.1308, "step": 1250 }, { "epoch": 0.37, "grad_norm": 2.6119778380512626, "learning_rate": 1.9712771150607865e-05, "loss": 0.1205, "step": 1255 }, { "epoch": 0.37, "grad_norm": 5.216314969786015, "learning_rate": 1.9708911274014247e-05, "loss": 0.1245, "step": 1260 }, { "epoch": 0.37, "grad_norm": 5.528129775162225, "learning_rate": 1.970502601810844e-05, "loss": 0.1876, "step": 1265 }, { "epoch": 0.37, "grad_norm": 5.524231849539429, "learning_rate": 1.9701115393047636e-05, "loss": 0.1143, "step": 1270 }, { "epoch": 0.37, "grad_norm": 4.229434350974042, "learning_rate": 1.969717940905535e-05, "loss": 0.1273, "step": 1275 }, { "epoch": 0.38, "grad_norm": 6.514210060225279, "learning_rate": 1.9693218076421395e-05, "loss": 0.1034, "step": 1280 }, { "epoch": 0.38, "grad_norm": 14.7017638724717, "learning_rate": 1.9689231405501844e-05, "loss": 0.1529, "step": 1285 }, { "epoch": 0.38, "grad_norm": 4.964386181339109, "learning_rate": 1.968521940671903e-05, "loss": 0.1567, "step": 1290 }, { "epoch": 0.38, "grad_norm": 5.222923574403755, "learning_rate": 1.9681182090561467e-05, "loss": 0.1144, "step": 1295 }, { "epoch": 0.38, "grad_norm": 5.523495340651447, "learning_rate": 1.96771194675839e-05, "loss": 0.1311, "step": 1300 }, { "epoch": 0.38, "grad_norm": 6.064321028150695, "learning_rate": 1.9673031548407197e-05, "loss": 0.1282, "step": 1305 }, { "epoch": 0.38, "grad_norm": 7.022609980188127, "learning_rate": 1.9668918343718377e-05, "loss": 0.1735, "step": 1310 }, { "epoch": 0.39, "grad_norm": 5.386703878469319, "learning_rate": 1.9664779864270553e-05, "loss": 0.146, "step": 1315 }, { "epoch": 0.39, "grad_norm": 2.2817281979534587, "learning_rate": 1.966061612088292e-05, "loss": 0.1566, "step": 1320 }, { "epoch": 0.39, "grad_norm": 5.647148438986119, "learning_rate": 1.965642712444072e-05, "loss": 0.1508, "step": 1325 }, { "epoch": 0.39, "grad_norm": 14.636180809770323, "learning_rate": 1.965221288589521e-05, "loss": 0.1366, "step": 1330 }, { "epoch": 0.39, "grad_norm": 3.796916066642304, "learning_rate": 1.9647973416263634e-05, "loss": 0.1562, "step": 1335 }, { "epoch": 0.39, "grad_norm": 8.39363234629941, "learning_rate": 1.964370872662921e-05, "loss": 0.0938, "step": 1340 }, { "epoch": 0.39, "grad_norm": 6.500094779864121, "learning_rate": 1.963941882814108e-05, "loss": 0.1746, "step": 1345 }, { "epoch": 0.4, "grad_norm": 3.5283001757400054, "learning_rate": 1.963510373201428e-05, "loss": 0.1419, "step": 1350 }, { "epoch": 0.4, "grad_norm": 2.5908302511117327, "learning_rate": 1.9630763449529747e-05, "loss": 0.0663, "step": 1355 }, { "epoch": 0.4, "grad_norm": 6.447702267666421, "learning_rate": 1.962639799203423e-05, "loss": 0.1506, "step": 1360 }, { "epoch": 0.4, "grad_norm": 7.258886744369148, "learning_rate": 1.962200737094032e-05, "loss": 0.1705, "step": 1365 }, { "epoch": 0.4, "grad_norm": 8.355297998196825, "learning_rate": 1.9617591597726372e-05, "loss": 0.2185, "step": 1370 }, { "epoch": 0.4, "grad_norm": 11.059705823190226, "learning_rate": 1.9613150683936513e-05, "loss": 0.164, "step": 1375 }, { "epoch": 0.4, "grad_norm": 7.496281019677461, "learning_rate": 1.9608684641180584e-05, "loss": 0.1868, "step": 1380 }, { "epoch": 0.41, "grad_norm": 9.337392423896086, "learning_rate": 1.9604193481134123e-05, "loss": 0.128, "step": 1385 }, { "epoch": 0.41, "grad_norm": 6.741067698323382, "learning_rate": 1.9599677215538333e-05, "loss": 0.1304, "step": 1390 }, { "epoch": 0.41, "grad_norm": 5.97327894379059, "learning_rate": 1.959513585620005e-05, "loss": 0.1129, "step": 1395 }, { "epoch": 0.41, "grad_norm": 6.2990643124323, "learning_rate": 1.9590569414991718e-05, "loss": 0.2452, "step": 1400 }, { "epoch": 0.41, "grad_norm": 3.9874003574274552, "learning_rate": 1.9585977903851334e-05, "loss": 0.1288, "step": 1405 }, { "epoch": 0.41, "grad_norm": 10.71406937880321, "learning_rate": 1.9581361334782453e-05, "loss": 0.1682, "step": 1410 }, { "epoch": 0.42, "grad_norm": 3.1321367786011076, "learning_rate": 1.957671971985414e-05, "loss": 0.1461, "step": 1415 }, { "epoch": 0.42, "grad_norm": 6.601166084621551, "learning_rate": 1.9572053071200922e-05, "loss": 0.1642, "step": 1420 }, { "epoch": 0.42, "grad_norm": 6.815298083273188, "learning_rate": 1.9567361401022784e-05, "loss": 0.2203, "step": 1425 }, { "epoch": 0.42, "grad_norm": 8.60817902664583, "learning_rate": 1.9562644721585123e-05, "loss": 0.1246, "step": 1430 }, { "epoch": 0.42, "grad_norm": 3.607077127205811, "learning_rate": 1.9557903045218708e-05, "loss": 0.0977, "step": 1435 }, { "epoch": 0.42, "grad_norm": 5.81146788919113, "learning_rate": 1.955313638431967e-05, "loss": 0.1038, "step": 1440 }, { "epoch": 0.42, "grad_norm": 8.172735748764952, "learning_rate": 1.954834475134945e-05, "loss": 0.1653, "step": 1445 }, { "epoch": 0.43, "grad_norm": 6.240754575344053, "learning_rate": 1.9543528158834775e-05, "loss": 0.1734, "step": 1450 }, { "epoch": 0.43, "grad_norm": 5.739038717343691, "learning_rate": 1.953868661936762e-05, "loss": 0.1477, "step": 1455 }, { "epoch": 0.43, "grad_norm": 3.09005933315755, "learning_rate": 1.9533820145605184e-05, "loss": 0.1303, "step": 1460 }, { "epoch": 0.43, "grad_norm": 9.269129587273234, "learning_rate": 1.9528928750269847e-05, "loss": 0.1188, "step": 1465 }, { "epoch": 0.43, "grad_norm": 3.9756210801612446, "learning_rate": 1.9524012446149144e-05, "loss": 0.1011, "step": 1470 }, { "epoch": 0.43, "grad_norm": 6.014169492139025, "learning_rate": 1.9519071246095734e-05, "loss": 0.1843, "step": 1475 }, { "epoch": 0.43, "grad_norm": 6.42414136559558, "learning_rate": 1.951410516302735e-05, "loss": 0.1429, "step": 1480 }, { "epoch": 0.44, "grad_norm": 5.990872434024276, "learning_rate": 1.950911420992678e-05, "loss": 0.0871, "step": 1485 }, { "epoch": 0.44, "grad_norm": 3.5783586051011795, "learning_rate": 1.9504098399841835e-05, "loss": 0.1602, "step": 1490 }, { "epoch": 0.44, "grad_norm": 6.318287528571422, "learning_rate": 1.9499057745885308e-05, "loss": 0.134, "step": 1495 }, { "epoch": 0.44, "grad_norm": 12.51258277827576, "learning_rate": 1.949399226123493e-05, "loss": 0.1537, "step": 1500 }, { "epoch": 0.44, "grad_norm": 3.886808062730125, "learning_rate": 1.9488901959133365e-05, "loss": 0.1997, "step": 1505 }, { "epoch": 0.44, "grad_norm": 2.309686134450491, "learning_rate": 1.9483786852888144e-05, "loss": 0.1105, "step": 1510 }, { "epoch": 0.44, "grad_norm": 4.355694711066838, "learning_rate": 1.947864695587165e-05, "loss": 0.1538, "step": 1515 }, { "epoch": 0.45, "grad_norm": 2.986939771149088, "learning_rate": 1.9473482281521063e-05, "loss": 0.0769, "step": 1520 }, { "epoch": 0.45, "grad_norm": 2.8973623579771397, "learning_rate": 1.946829284333836e-05, "loss": 0.0987, "step": 1525 }, { "epoch": 0.45, "grad_norm": 2.0533675098464874, "learning_rate": 1.9463078654890242e-05, "loss": 0.1066, "step": 1530 }, { "epoch": 0.45, "grad_norm": 7.8889622513250774, "learning_rate": 1.945783972980812e-05, "loss": 0.0989, "step": 1535 }, { "epoch": 0.45, "grad_norm": 1.470844560891904, "learning_rate": 1.945257608178807e-05, "loss": 0.0536, "step": 1540 }, { "epoch": 0.45, "grad_norm": 3.097540950673788, "learning_rate": 1.9447287724590808e-05, "loss": 0.1487, "step": 1545 }, { "epoch": 0.45, "grad_norm": 2.236294115632501, "learning_rate": 1.9441974672041636e-05, "loss": 0.1272, "step": 1550 }, { "epoch": 0.46, "grad_norm": 6.08682373802408, "learning_rate": 1.943663693803043e-05, "loss": 0.1234, "step": 1555 }, { "epoch": 0.46, "grad_norm": 2.659517679298076, "learning_rate": 1.9431274536511577e-05, "loss": 0.107, "step": 1560 }, { "epoch": 0.46, "grad_norm": 4.176352623298332, "learning_rate": 1.9425887481503964e-05, "loss": 0.1275, "step": 1565 }, { "epoch": 0.46, "grad_norm": 3.247258717513404, "learning_rate": 1.9420475787090926e-05, "loss": 0.1282, "step": 1570 }, { "epoch": 0.46, "grad_norm": 3.345767544439658, "learning_rate": 1.9415039467420207e-05, "loss": 0.0917, "step": 1575 }, { "epoch": 0.46, "grad_norm": 2.368596730771716, "learning_rate": 1.9409578536703936e-05, "loss": 0.1262, "step": 1580 }, { "epoch": 0.46, "grad_norm": 12.620428787419241, "learning_rate": 1.9404093009218568e-05, "loss": 0.1687, "step": 1585 }, { "epoch": 0.47, "grad_norm": 2.3476402040970985, "learning_rate": 1.939858289930489e-05, "loss": 0.0849, "step": 1590 }, { "epoch": 0.47, "grad_norm": 9.157447293999722, "learning_rate": 1.9393048221367924e-05, "loss": 0.1322, "step": 1595 }, { "epoch": 0.47, "grad_norm": 10.047629795227984, "learning_rate": 1.9387488989876937e-05, "loss": 0.1215, "step": 1600 }, { "epoch": 0.47, "grad_norm": 4.324110021146038, "learning_rate": 1.938190521936538e-05, "loss": 0.1228, "step": 1605 }, { "epoch": 0.47, "grad_norm": 2.520014942669844, "learning_rate": 1.937629692443086e-05, "loss": 0.1538, "step": 1610 }, { "epoch": 0.47, "grad_norm": 5.789832430771725, "learning_rate": 1.9370664119735096e-05, "loss": 0.1508, "step": 1615 }, { "epoch": 0.48, "grad_norm": 3.248881220822288, "learning_rate": 1.9365006820003883e-05, "loss": 0.1051, "step": 1620 }, { "epoch": 0.48, "grad_norm": 3.1423550675382623, "learning_rate": 1.935932504002705e-05, "loss": 0.0786, "step": 1625 }, { "epoch": 0.48, "grad_norm": 4.9862187243188645, "learning_rate": 1.935361879465843e-05, "loss": 0.089, "step": 1630 }, { "epoch": 0.48, "grad_norm": 1.9635393731249728, "learning_rate": 1.9347888098815814e-05, "loss": 0.0699, "step": 1635 }, { "epoch": 0.48, "grad_norm": 2.1963931644611914, "learning_rate": 1.9342132967480914e-05, "loss": 0.1087, "step": 1640 }, { "epoch": 0.48, "grad_norm": 2.689006294580755, "learning_rate": 1.9336353415699316e-05, "loss": 0.0622, "step": 1645 }, { "epoch": 0.48, "grad_norm": 2.1154762474968005, "learning_rate": 1.933054945858046e-05, "loss": 0.0961, "step": 1650 }, { "epoch": 0.49, "grad_norm": 6.7523638506838095, "learning_rate": 1.932472111129758e-05, "loss": 0.1685, "step": 1655 }, { "epoch": 0.49, "grad_norm": 1.5984988800226272, "learning_rate": 1.931886838908768e-05, "loss": 0.1272, "step": 1660 }, { "epoch": 0.49, "grad_norm": 4.090521085044347, "learning_rate": 1.9312991307251476e-05, "loss": 0.0738, "step": 1665 }, { "epoch": 0.49, "grad_norm": 3.0964168267397385, "learning_rate": 1.9307089881153383e-05, "loss": 0.095, "step": 1670 }, { "epoch": 0.49, "grad_norm": 1.7867271806779057, "learning_rate": 1.9301164126221444e-05, "loss": 0.1104, "step": 1675 }, { "epoch": 0.49, "grad_norm": 2.565341397593321, "learning_rate": 1.929521405794732e-05, "loss": 0.1075, "step": 1680 }, { "epoch": 0.49, "grad_norm": 0.7933305187602832, "learning_rate": 1.9289239691886213e-05, "loss": 0.0703, "step": 1685 }, { "epoch": 0.5, "grad_norm": 2.7412621656740184, "learning_rate": 1.9283241043656865e-05, "loss": 0.1091, "step": 1690 }, { "epoch": 0.5, "grad_norm": 3.6919637269194117, "learning_rate": 1.9277218128941493e-05, "loss": 0.0399, "step": 1695 }, { "epoch": 0.5, "grad_norm": 3.3309770998366575, "learning_rate": 1.927117096348575e-05, "loss": 0.1115, "step": 1700 }, { "epoch": 0.5, "grad_norm": 3.120277651390199, "learning_rate": 1.9265099563098698e-05, "loss": 0.1292, "step": 1705 }, { "epoch": 0.5, "grad_norm": 3.567470496202032, "learning_rate": 1.9259003943652743e-05, "loss": 0.1023, "step": 1710 }, { "epoch": 0.5, "grad_norm": 3.1015901732471365, "learning_rate": 1.9252884121083613e-05, "loss": 0.098, "step": 1715 }, { "epoch": 0.5, "grad_norm": 3.3435838604589216, "learning_rate": 1.924674011139031e-05, "loss": 0.1034, "step": 1720 }, { "epoch": 0.51, "grad_norm": 2.751303537238706, "learning_rate": 1.924057193063507e-05, "loss": 0.0751, "step": 1725 }, { "epoch": 0.51, "grad_norm": 2.8435568272302034, "learning_rate": 1.923437959494331e-05, "loss": 0.0938, "step": 1730 }, { "epoch": 0.51, "grad_norm": 3.491687713292557, "learning_rate": 1.9228163120503612e-05, "loss": 0.126, "step": 1735 }, { "epoch": 0.51, "grad_norm": 2.065971227764955, "learning_rate": 1.9221922523567643e-05, "loss": 0.0992, "step": 1740 }, { "epoch": 0.51, "grad_norm": 2.291836460643561, "learning_rate": 1.9215657820450152e-05, "loss": 0.1169, "step": 1745 }, { "epoch": 0.51, "grad_norm": 6.805536473392368, "learning_rate": 1.92093690275289e-05, "loss": 0.1301, "step": 1750 }, { "epoch": 0.51, "grad_norm": 2.1824217570154456, "learning_rate": 1.920305616124462e-05, "loss": 0.1125, "step": 1755 }, { "epoch": 0.52, "grad_norm": 14.726348017337305, "learning_rate": 1.9196719238100993e-05, "loss": 0.1292, "step": 1760 }, { "epoch": 0.52, "grad_norm": 4.930978645236691, "learning_rate": 1.9190358274664586e-05, "loss": 0.1418, "step": 1765 }, { "epoch": 0.52, "grad_norm": 1.5342337673472881, "learning_rate": 1.9183973287564806e-05, "loss": 0.139, "step": 1770 }, { "epoch": 0.52, "grad_norm": 2.427844265256046, "learning_rate": 1.9177564293493876e-05, "loss": 0.083, "step": 1775 }, { "epoch": 0.52, "grad_norm": 2.1313388749998086, "learning_rate": 1.9171131309206777e-05, "loss": 0.1207, "step": 1780 }, { "epoch": 0.52, "grad_norm": 3.126693266841522, "learning_rate": 1.9164674351521203e-05, "loss": 0.0963, "step": 1785 }, { "epoch": 0.53, "grad_norm": 3.175825693492831, "learning_rate": 1.9158193437317527e-05, "loss": 0.1776, "step": 1790 }, { "epoch": 0.53, "grad_norm": 2.9113721112544906, "learning_rate": 1.9151688583538753e-05, "loss": 0.1217, "step": 1795 }, { "epoch": 0.53, "grad_norm": 3.1166239064049113, "learning_rate": 1.9145159807190458e-05, "loss": 0.0507, "step": 1800 }, { "epoch": 0.53, "grad_norm": 8.142997644880616, "learning_rate": 1.9138607125340777e-05, "loss": 0.1586, "step": 1805 }, { "epoch": 0.53, "grad_norm": 1.8022906748245269, "learning_rate": 1.913203055512033e-05, "loss": 0.1093, "step": 1810 }, { "epoch": 0.53, "grad_norm": 1.8417345443000441, "learning_rate": 1.9125430113722186e-05, "loss": 0.1023, "step": 1815 }, { "epoch": 0.53, "grad_norm": 3.399632737014696, "learning_rate": 1.9118805818401825e-05, "loss": 0.1, "step": 1820 }, { "epoch": 0.54, "grad_norm": 2.709759019082288, "learning_rate": 1.9112157686477092e-05, "loss": 0.108, "step": 1825 }, { "epoch": 0.54, "grad_norm": 2.8287642794394894, "learning_rate": 1.910548573532814e-05, "loss": 0.1119, "step": 1830 }, { "epoch": 0.54, "grad_norm": 0.9463293108337878, "learning_rate": 1.90987899823974e-05, "loss": 0.106, "step": 1835 }, { "epoch": 0.54, "grad_norm": 4.02788077621015, "learning_rate": 1.9092070445189513e-05, "loss": 0.1223, "step": 1840 }, { "epoch": 0.54, "grad_norm": 6.586862447096695, "learning_rate": 1.9085327141271325e-05, "loss": 0.1612, "step": 1845 }, { "epoch": 0.54, "grad_norm": 6.365460607459384, "learning_rate": 1.907856008827178e-05, "loss": 0.1469, "step": 1850 }, { "epoch": 0.54, "grad_norm": 5.186222516601001, "learning_rate": 1.907176930388195e-05, "loss": 0.1176, "step": 1855 }, { "epoch": 0.55, "grad_norm": 0.10245286406524089, "learning_rate": 1.906495480585491e-05, "loss": 0.0928, "step": 1860 }, { "epoch": 0.55, "grad_norm": 2.469663523408353, "learning_rate": 1.9058116612005757e-05, "loss": 0.095, "step": 1865 }, { "epoch": 0.55, "grad_norm": 2.7503837837835223, "learning_rate": 1.905125474021152e-05, "loss": 0.1103, "step": 1870 }, { "epoch": 0.55, "grad_norm": 3.6629482112205047, "learning_rate": 1.9044369208411127e-05, "loss": 0.0769, "step": 1875 }, { "epoch": 0.55, "grad_norm": 3.6257303751767043, "learning_rate": 1.903746003460538e-05, "loss": 0.1188, "step": 1880 }, { "epoch": 0.55, "grad_norm": 3.1074798789565987, "learning_rate": 1.9030527236856867e-05, "loss": 0.0771, "step": 1885 }, { "epoch": 0.55, "grad_norm": 1.9930789523758066, "learning_rate": 1.9023570833289946e-05, "loss": 0.1227, "step": 1890 }, { "epoch": 0.56, "grad_norm": 5.3774295018042775, "learning_rate": 1.9016590842090682e-05, "loss": 0.1089, "step": 1895 }, { "epoch": 0.56, "grad_norm": 3.988587549835179, "learning_rate": 1.9009587281506815e-05, "loss": 0.1095, "step": 1900 }, { "epoch": 0.56, "grad_norm": 2.4025900613693385, "learning_rate": 1.9002560169847688e-05, "loss": 0.0744, "step": 1905 }, { "epoch": 0.56, "grad_norm": 4.962479191119096, "learning_rate": 1.8995509525484227e-05, "loss": 0.1113, "step": 1910 }, { "epoch": 0.56, "grad_norm": 3.222489638212013, "learning_rate": 1.8988435366848867e-05, "loss": 0.1122, "step": 1915 }, { "epoch": 0.56, "grad_norm": 6.910634577936775, "learning_rate": 1.8981337712435528e-05, "loss": 0.1357, "step": 1920 }, { "epoch": 0.56, "grad_norm": 2.770074917437753, "learning_rate": 1.897421658079955e-05, "loss": 0.1246, "step": 1925 }, { "epoch": 0.57, "grad_norm": 1.9251395311195432, "learning_rate": 1.8967071990557643e-05, "loss": 0.1159, "step": 1930 }, { "epoch": 0.57, "grad_norm": 5.562401417062251, "learning_rate": 1.8959903960387852e-05, "loss": 0.0945, "step": 1935 }, { "epoch": 0.57, "grad_norm": 2.5785513785382435, "learning_rate": 1.89527125090295e-05, "loss": 0.1166, "step": 1940 }, { "epoch": 0.57, "grad_norm": 2.3414558881424488, "learning_rate": 1.8945497655283142e-05, "loss": 0.1071, "step": 1945 }, { "epoch": 0.57, "grad_norm": 3.8387053345665754, "learning_rate": 1.8938259418010504e-05, "loss": 0.078, "step": 1950 }, { "epoch": 0.57, "grad_norm": 2.503163339080756, "learning_rate": 1.8930997816134457e-05, "loss": 0.1155, "step": 1955 }, { "epoch": 0.57, "grad_norm": 3.006961850846919, "learning_rate": 1.892371286863894e-05, "loss": 0.1433, "step": 1960 }, { "epoch": 0.58, "grad_norm": 2.509118046318532, "learning_rate": 1.8916404594568934e-05, "loss": 0.0889, "step": 1965 }, { "epoch": 0.58, "grad_norm": 2.061207568655511, "learning_rate": 1.8909073013030404e-05, "loss": 0.1235, "step": 1970 }, { "epoch": 0.58, "grad_norm": 2.1278253733682098, "learning_rate": 1.8901718143190234e-05, "loss": 0.0903, "step": 1975 }, { "epoch": 0.58, "grad_norm": 2.546231615677142, "learning_rate": 1.8894340004276208e-05, "loss": 0.0992, "step": 1980 }, { "epoch": 0.58, "grad_norm": 5.884018165275079, "learning_rate": 1.8886938615576926e-05, "loss": 0.1213, "step": 1985 }, { "epoch": 0.58, "grad_norm": 3.048907575238015, "learning_rate": 1.887951399644178e-05, "loss": 0.076, "step": 1990 }, { "epoch": 0.59, "grad_norm": 3.326277815721477, "learning_rate": 1.8872066166280898e-05, "loss": 0.097, "step": 1995 }, { "epoch": 0.59, "grad_norm": 3.033210042047812, "learning_rate": 1.8864595144565067e-05, "loss": 0.1589, "step": 2000 }, { "epoch": 0.59, "grad_norm": 3.2824483953691823, "learning_rate": 1.8857100950825725e-05, "loss": 0.1037, "step": 2005 }, { "epoch": 0.59, "grad_norm": 3.4944060444975196, "learning_rate": 1.8849583604654883e-05, "loss": 0.1102, "step": 2010 }, { "epoch": 0.59, "grad_norm": 3.5842552477679854, "learning_rate": 1.8842043125705074e-05, "loss": 0.0704, "step": 2015 }, { "epoch": 0.59, "grad_norm": 2.5453422006053277, "learning_rate": 1.883447953368931e-05, "loss": 0.0902, "step": 2020 }, { "epoch": 0.59, "grad_norm": 2.05893040027048, "learning_rate": 1.8826892848381026e-05, "loss": 0.1236, "step": 2025 }, { "epoch": 0.6, "grad_norm": 2.5436602939843653, "learning_rate": 1.881928308961403e-05, "loss": 0.1127, "step": 2030 }, { "epoch": 0.6, "grad_norm": 5.148596800219499, "learning_rate": 1.8811650277282457e-05, "loss": 0.1554, "step": 2035 }, { "epoch": 0.6, "grad_norm": 3.4114441072977875, "learning_rate": 1.88039944313407e-05, "loss": 0.1361, "step": 2040 }, { "epoch": 0.6, "grad_norm": 2.8127938011148865, "learning_rate": 1.8796315571803373e-05, "loss": 0.0995, "step": 2045 }, { "epoch": 0.6, "grad_norm": 3.9625421467654545, "learning_rate": 1.8788613718745258e-05, "loss": 0.1007, "step": 2050 }, { "epoch": 0.6, "grad_norm": 1.7804274416555144, "learning_rate": 1.8780888892301246e-05, "loss": 0.0831, "step": 2055 }, { "epoch": 0.6, "grad_norm": 3.0212657925844457, "learning_rate": 1.8773141112666282e-05, "loss": 0.0983, "step": 2060 }, { "epoch": 0.61, "grad_norm": 2.1666245700952618, "learning_rate": 1.876537040009533e-05, "loss": 0.1188, "step": 2065 }, { "epoch": 0.61, "grad_norm": 1.3595918504356805, "learning_rate": 1.8757576774903293e-05, "loss": 0.0847, "step": 2070 }, { "epoch": 0.61, "grad_norm": 3.125961613522519, "learning_rate": 1.8749760257464987e-05, "loss": 0.1239, "step": 2075 }, { "epoch": 0.61, "grad_norm": 3.7003035601248655, "learning_rate": 1.874192086821506e-05, "loss": 0.1409, "step": 2080 }, { "epoch": 0.61, "grad_norm": 2.379395082357442, "learning_rate": 1.8734058627647974e-05, "loss": 0.0724, "step": 2085 }, { "epoch": 0.61, "grad_norm": 6.950334653412999, "learning_rate": 1.872617355631791e-05, "loss": 0.1478, "step": 2090 }, { "epoch": 0.61, "grad_norm": 3.242450014080562, "learning_rate": 1.871826567483875e-05, "loss": 0.099, "step": 2095 }, { "epoch": 0.62, "grad_norm": 3.6369097830549175, "learning_rate": 1.8710335003884e-05, "loss": 0.0874, "step": 2100 }, { "epoch": 0.62, "grad_norm": 2.3740836935740575, "learning_rate": 1.8702381564186752e-05, "loss": 0.1088, "step": 2105 }, { "epoch": 0.62, "grad_norm": 4.29882368809403, "learning_rate": 1.8694405376539612e-05, "loss": 0.1358, "step": 2110 }, { "epoch": 0.62, "grad_norm": 1.233884641572376, "learning_rate": 1.8686406461794663e-05, "loss": 0.0848, "step": 2115 }, { "epoch": 0.62, "grad_norm": 5.10093491156728, "learning_rate": 1.86783848408634e-05, "loss": 0.1664, "step": 2120 }, { "epoch": 0.62, "grad_norm": 2.6372062628013366, "learning_rate": 1.867034053471669e-05, "loss": 0.0864, "step": 2125 }, { "epoch": 0.62, "grad_norm": 1.9131502429850842, "learning_rate": 1.8662273564384685e-05, "loss": 0.0712, "step": 2130 }, { "epoch": 0.63, "grad_norm": 2.2716382434584665, "learning_rate": 1.8654183950956807e-05, "loss": 0.1098, "step": 2135 }, { "epoch": 0.63, "grad_norm": 2.980703122184445, "learning_rate": 1.864607171558166e-05, "loss": 0.1524, "step": 2140 }, { "epoch": 0.63, "grad_norm": 2.6693066304605497, "learning_rate": 1.863793687946699e-05, "loss": 0.1263, "step": 2145 }, { "epoch": 0.63, "grad_norm": 1.8713460923305671, "learning_rate": 1.862977946387964e-05, "loss": 0.1043, "step": 2150 }, { "epoch": 0.63, "grad_norm": 2.173402756633364, "learning_rate": 1.862159949014547e-05, "loss": 0.1268, "step": 2155 }, { "epoch": 0.63, "grad_norm": 2.1115579118349936, "learning_rate": 1.861339697964932e-05, "loss": 0.0871, "step": 2160 }, { "epoch": 0.64, "grad_norm": 0.4714286720726806, "learning_rate": 1.860517195383495e-05, "loss": 0.1029, "step": 2165 }, { "epoch": 0.64, "grad_norm": 2.133585471909289, "learning_rate": 1.8596924434204963e-05, "loss": 0.0858, "step": 2170 }, { "epoch": 0.64, "grad_norm": 1.7496615491285803, "learning_rate": 1.8588654442320796e-05, "loss": 0.1081, "step": 2175 }, { "epoch": 0.64, "grad_norm": 4.814425987182643, "learning_rate": 1.8580361999802606e-05, "loss": 0.1179, "step": 2180 }, { "epoch": 0.64, "grad_norm": 1.9816374631526001, "learning_rate": 1.8572047128329272e-05, "loss": 0.1062, "step": 2185 }, { "epoch": 0.64, "grad_norm": 6.646969625340206, "learning_rate": 1.8563709849638286e-05, "loss": 0.1477, "step": 2190 }, { "epoch": 0.64, "grad_norm": 2.374535766203674, "learning_rate": 1.8555350185525723e-05, "loss": 0.1142, "step": 2195 }, { "epoch": 0.65, "grad_norm": 3.365883417577783, "learning_rate": 1.8546968157846195e-05, "loss": 0.124, "step": 2200 }, { "epoch": 0.65, "grad_norm": 2.119464733459411, "learning_rate": 1.8538563788512757e-05, "loss": 0.0861, "step": 2205 }, { "epoch": 0.65, "grad_norm": 3.4982979586919245, "learning_rate": 1.8530137099496886e-05, "loss": 0.1153, "step": 2210 }, { "epoch": 0.65, "grad_norm": 2.0143848714607326, "learning_rate": 1.852168811282841e-05, "loss": 0.0957, "step": 2215 }, { "epoch": 0.65, "grad_norm": 5.713010148322335, "learning_rate": 1.8513216850595434e-05, "loss": 0.106, "step": 2220 }, { "epoch": 0.65, "grad_norm": 3.66380234305124, "learning_rate": 1.850472333494432e-05, "loss": 0.1011, "step": 2225 }, { "epoch": 0.65, "grad_norm": 2.082824212489925, "learning_rate": 1.849620758807959e-05, "loss": 0.1106, "step": 2230 }, { "epoch": 0.66, "grad_norm": 3.1172243656282594, "learning_rate": 1.8487669632263892e-05, "loss": 0.1099, "step": 2235 }, { "epoch": 0.66, "grad_norm": 3.8750958584649853, "learning_rate": 1.8479109489817935e-05, "loss": 0.0927, "step": 2240 }, { "epoch": 0.66, "grad_norm": 2.658381425986548, "learning_rate": 1.8470527183120425e-05, "loss": 0.0768, "step": 2245 }, { "epoch": 0.66, "grad_norm": 4.454354742139791, "learning_rate": 1.8461922734608016e-05, "loss": 0.0906, "step": 2250 }, { "epoch": 0.66, "grad_norm": 3.7643868717903866, "learning_rate": 1.845329616677525e-05, "loss": 0.0937, "step": 2255 }, { "epoch": 0.66, "grad_norm": 3.680014043034835, "learning_rate": 1.8444647502174492e-05, "loss": 0.087, "step": 2260 }, { "epoch": 0.66, "grad_norm": 3.3837653881747585, "learning_rate": 1.843597676341587e-05, "loss": 0.0916, "step": 2265 }, { "epoch": 0.67, "grad_norm": 2.121221324252522, "learning_rate": 1.8427283973167225e-05, "loss": 0.1221, "step": 2270 }, { "epoch": 0.67, "grad_norm": 1.0325272482169887, "learning_rate": 1.841856915415405e-05, "loss": 0.0874, "step": 2275 }, { "epoch": 0.67, "grad_norm": 2.0795962066445566, "learning_rate": 1.840983232915942e-05, "loss": 0.0741, "step": 2280 }, { "epoch": 0.67, "grad_norm": 4.460588805671975, "learning_rate": 1.840107352102395e-05, "loss": 0.1488, "step": 2285 }, { "epoch": 0.67, "grad_norm": 2.974661793437929, "learning_rate": 1.839229275264572e-05, "loss": 0.093, "step": 2290 }, { "epoch": 0.67, "grad_norm": 3.137347736691432, "learning_rate": 1.8383490046980212e-05, "loss": 0.1, "step": 2295 }, { "epoch": 0.67, "grad_norm": 7.90074357987843, "learning_rate": 1.8374665427040276e-05, "loss": 0.1362, "step": 2300 }, { "epoch": 0.68, "grad_norm": 2.4540688223728826, "learning_rate": 1.836581891589604e-05, "loss": 0.1124, "step": 2305 }, { "epoch": 0.68, "grad_norm": 4.638147996733137, "learning_rate": 1.8356950536674858e-05, "loss": 0.1031, "step": 2310 }, { "epoch": 0.68, "grad_norm": 2.2696169707348544, "learning_rate": 1.834806031256127e-05, "loss": 0.0965, "step": 2315 }, { "epoch": 0.68, "grad_norm": 6.130234330693954, "learning_rate": 1.833914826679691e-05, "loss": 0.079, "step": 2320 }, { "epoch": 0.68, "grad_norm": 1.8781015810402948, "learning_rate": 1.8330214422680467e-05, "loss": 0.0791, "step": 2325 }, { "epoch": 0.68, "grad_norm": 1.4069832479622444, "learning_rate": 1.8321258803567613e-05, "loss": 0.0831, "step": 2330 }, { "epoch": 0.68, "grad_norm": 3.4630544831332664, "learning_rate": 1.831228143287096e-05, "loss": 0.1616, "step": 2335 }, { "epoch": 0.69, "grad_norm": 2.9442217600685137, "learning_rate": 1.8303282334059957e-05, "loss": 0.1199, "step": 2340 }, { "epoch": 0.69, "grad_norm": 2.0954935585893257, "learning_rate": 1.8294261530660885e-05, "loss": 0.1302, "step": 2345 }, { "epoch": 0.69, "grad_norm": 4.890904817711451, "learning_rate": 1.8285219046256758e-05, "loss": 0.1025, "step": 2350 }, { "epoch": 0.69, "grad_norm": 2.583755171594856, "learning_rate": 1.8276154904487264e-05, "loss": 0.1043, "step": 2355 }, { "epoch": 0.69, "grad_norm": 8.260345907657872, "learning_rate": 1.8267069129048707e-05, "loss": 0.1782, "step": 2360 }, { "epoch": 0.69, "grad_norm": 4.771965836395477, "learning_rate": 1.8257961743693962e-05, "loss": 0.0862, "step": 2365 }, { "epoch": 0.7, "grad_norm": 2.554197211858201, "learning_rate": 1.8248832772232394e-05, "loss": 0.0851, "step": 2370 }, { "epoch": 0.7, "grad_norm": 2.16812655587689, "learning_rate": 1.8239682238529792e-05, "loss": 0.0938, "step": 2375 }, { "epoch": 0.7, "grad_norm": 3.3012819170031853, "learning_rate": 1.8230510166508322e-05, "loss": 0.0769, "step": 2380 }, { "epoch": 0.7, "grad_norm": 1.998340872187495, "learning_rate": 1.822131658014646e-05, "loss": 0.0735, "step": 2385 }, { "epoch": 0.7, "grad_norm": 1.9806358974129312, "learning_rate": 1.8212101503478916e-05, "loss": 0.14, "step": 2390 }, { "epoch": 0.7, "grad_norm": 6.99507540091613, "learning_rate": 1.8202864960596592e-05, "loss": 0.0944, "step": 2395 }, { "epoch": 0.7, "grad_norm": 2.4015686273341545, "learning_rate": 1.8193606975646506e-05, "loss": 0.0677, "step": 2400 }, { "epoch": 0.71, "grad_norm": 3.3364791966330962, "learning_rate": 1.8184327572831738e-05, "loss": 0.0829, "step": 2405 }, { "epoch": 0.71, "grad_norm": 6.874850462240894, "learning_rate": 1.817502677641134e-05, "loss": 0.1419, "step": 2410 }, { "epoch": 0.71, "grad_norm": 2.6075283986023665, "learning_rate": 1.8165704610700315e-05, "loss": 0.1117, "step": 2415 }, { "epoch": 0.71, "grad_norm": 2.148518397802211, "learning_rate": 1.8156361100069524e-05, "loss": 0.101, "step": 2420 }, { "epoch": 0.71, "grad_norm": 3.4560468231137564, "learning_rate": 1.8146996268945632e-05, "loss": 0.0966, "step": 2425 }, { "epoch": 0.71, "grad_norm": 2.384974065195782, "learning_rate": 1.8137610141811037e-05, "loss": 0.122, "step": 2430 }, { "epoch": 0.71, "grad_norm": 2.362869907867881, "learning_rate": 1.812820274320381e-05, "loss": 0.1132, "step": 2435 }, { "epoch": 0.72, "grad_norm": 6.9320506095293455, "learning_rate": 1.811877409771764e-05, "loss": 0.1524, "step": 2440 }, { "epoch": 0.72, "grad_norm": 4.720326949611697, "learning_rate": 1.8109324230001756e-05, "loss": 0.1301, "step": 2445 }, { "epoch": 0.72, "grad_norm": 2.1399702198416413, "learning_rate": 1.8099853164760865e-05, "loss": 0.0889, "step": 2450 }, { "epoch": 0.72, "grad_norm": 6.7467640785050325, "learning_rate": 1.80903609267551e-05, "loss": 0.099, "step": 2455 }, { "epoch": 0.72, "grad_norm": 2.325513463081622, "learning_rate": 1.8080847540799942e-05, "loss": 0.1064, "step": 2460 }, { "epoch": 0.72, "grad_norm": 2.177610658971863, "learning_rate": 1.8071313031766148e-05, "loss": 0.0658, "step": 2465 }, { "epoch": 0.72, "grad_norm": 4.977562149709799, "learning_rate": 1.8061757424579716e-05, "loss": 0.1207, "step": 2470 }, { "epoch": 0.73, "grad_norm": 2.1681517377729382, "learning_rate": 1.8052180744221784e-05, "loss": 0.1197, "step": 2475 }, { "epoch": 0.73, "grad_norm": 2.7090062894808002, "learning_rate": 1.8042583015728598e-05, "loss": 0.0792, "step": 2480 }, { "epoch": 0.73, "grad_norm": 4.747077846148363, "learning_rate": 1.8032964264191402e-05, "loss": 0.1143, "step": 2485 }, { "epoch": 0.73, "grad_norm": 2.74552501361813, "learning_rate": 1.8023324514756436e-05, "loss": 0.1265, "step": 2490 }, { "epoch": 0.73, "grad_norm": 4.1581416593625296, "learning_rate": 1.801366379262481e-05, "loss": 0.072, "step": 2495 }, { "epoch": 0.73, "grad_norm": 1.6406189691341908, "learning_rate": 1.8003982123052474e-05, "loss": 0.0814, "step": 2500 }, { "epoch": 0.73, "grad_norm": 2.712884736792791, "learning_rate": 1.7994279531350135e-05, "loss": 0.0973, "step": 2505 }, { "epoch": 0.74, "grad_norm": 1.204869182656131, "learning_rate": 1.7984556042883195e-05, "loss": 0.0725, "step": 2510 }, { "epoch": 0.74, "grad_norm": 1.63571544112928, "learning_rate": 1.7974811683071688e-05, "loss": 0.1416, "step": 2515 }, { "epoch": 0.74, "grad_norm": 0.8179280822029832, "learning_rate": 1.7965046477390223e-05, "loss": 0.08, "step": 2520 }, { "epoch": 0.74, "grad_norm": 3.1456845650432697, "learning_rate": 1.7955260451367887e-05, "loss": 0.0939, "step": 2525 }, { "epoch": 0.74, "grad_norm": 3.068511046140001, "learning_rate": 1.7945453630588214e-05, "loss": 0.074, "step": 2530 }, { "epoch": 0.74, "grad_norm": 1.8530528834496558, "learning_rate": 1.7935626040689087e-05, "loss": 0.1254, "step": 2535 }, { "epoch": 0.75, "grad_norm": 9.466013698713851, "learning_rate": 1.7925777707362694e-05, "loss": 0.1031, "step": 2540 }, { "epoch": 0.75, "grad_norm": 2.8626798281758683, "learning_rate": 1.791590865635546e-05, "loss": 0.0906, "step": 2545 }, { "epoch": 0.75, "grad_norm": 5.788311578849042, "learning_rate": 1.7906018913467957e-05, "loss": 0.1191, "step": 2550 }, { "epoch": 0.75, "grad_norm": 15.34485630001676, "learning_rate": 1.7896108504554858e-05, "loss": 0.1703, "step": 2555 }, { "epoch": 0.75, "grad_norm": 1.878644897302772, "learning_rate": 1.7886177455524865e-05, "loss": 0.0978, "step": 2560 }, { "epoch": 0.75, "grad_norm": 3.9594573315714197, "learning_rate": 1.7876225792340635e-05, "loss": 0.1066, "step": 2565 }, { "epoch": 0.75, "grad_norm": 6.412819373671454, "learning_rate": 1.786625354101872e-05, "loss": 0.1204, "step": 2570 }, { "epoch": 0.76, "grad_norm": 9.689461195761101, "learning_rate": 1.7856260727629495e-05, "loss": 0.1137, "step": 2575 }, { "epoch": 0.76, "grad_norm": 3.5251907080608054, "learning_rate": 1.784624737829709e-05, "loss": 0.1519, "step": 2580 }, { "epoch": 0.76, "grad_norm": 2.735467659755077, "learning_rate": 1.783621351919932e-05, "loss": 0.0956, "step": 2585 }, { "epoch": 0.76, "grad_norm": 2.579970560423387, "learning_rate": 1.7826159176567616e-05, "loss": 0.0965, "step": 2590 }, { "epoch": 0.76, "grad_norm": 5.7447980487012185, "learning_rate": 1.781608437668697e-05, "loss": 0.1355, "step": 2595 }, { "epoch": 0.76, "grad_norm": 2.746235872283559, "learning_rate": 1.7805989145895847e-05, "loss": 0.0879, "step": 2600 }, { "epoch": 0.76, "grad_norm": 3.6558387720713794, "learning_rate": 1.779587351058612e-05, "loss": 0.1266, "step": 2605 }, { "epoch": 0.77, "grad_norm": 2.79558623046737, "learning_rate": 1.7785737497203013e-05, "loss": 0.1805, "step": 2610 }, { "epoch": 0.77, "grad_norm": 2.9429418600589123, "learning_rate": 1.7775581132245026e-05, "loss": 0.1069, "step": 2615 }, { "epoch": 0.77, "grad_norm": 1.5095627662015014, "learning_rate": 1.776540444226386e-05, "loss": 0.047, "step": 2620 }, { "epoch": 0.77, "grad_norm": 3.495377076072146, "learning_rate": 1.775520745386434e-05, "loss": 0.0633, "step": 2625 }, { "epoch": 0.77, "grad_norm": 3.0385032472654325, "learning_rate": 1.774499019370438e-05, "loss": 0.1487, "step": 2630 }, { "epoch": 0.77, "grad_norm": 10.965757999825739, "learning_rate": 1.773475268849488e-05, "loss": 0.1373, "step": 2635 }, { "epoch": 0.77, "grad_norm": 2.8123652108000585, "learning_rate": 1.772449496499966e-05, "loss": 0.1187, "step": 2640 }, { "epoch": 0.78, "grad_norm": 4.66597249908126, "learning_rate": 1.77142170500354e-05, "loss": 0.1114, "step": 2645 }, { "epoch": 0.78, "grad_norm": 2.676562831364225, "learning_rate": 1.770391897047157e-05, "loss": 0.1213, "step": 2650 }, { "epoch": 0.78, "grad_norm": 2.0506018569230973, "learning_rate": 1.769360075323036e-05, "loss": 0.1216, "step": 2655 }, { "epoch": 0.78, "grad_norm": 3.4758686873519644, "learning_rate": 1.7683262425286593e-05, "loss": 0.1068, "step": 2660 }, { "epoch": 0.78, "grad_norm": 16.360011105708523, "learning_rate": 1.7672904013667675e-05, "loss": 0.0836, "step": 2665 }, { "epoch": 0.78, "grad_norm": 2.2472093017776724, "learning_rate": 1.7662525545453518e-05, "loss": 0.1206, "step": 2670 }, { "epoch": 0.78, "grad_norm": 3.264614259094197, "learning_rate": 1.7652127047776464e-05, "loss": 0.0736, "step": 2675 }, { "epoch": 0.79, "grad_norm": 4.838998543720377, "learning_rate": 1.7641708547821218e-05, "loss": 0.112, "step": 2680 }, { "epoch": 0.79, "grad_norm": 1.889257267329085, "learning_rate": 1.7631270072824786e-05, "loss": 0.0915, "step": 2685 }, { "epoch": 0.79, "grad_norm": 3.489350329159354, "learning_rate": 1.762081165007638e-05, "loss": 0.0872, "step": 2690 }, { "epoch": 0.79, "grad_norm": 5.007436753100703, "learning_rate": 1.7610333306917367e-05, "loss": 0.1069, "step": 2695 }, { "epoch": 0.79, "grad_norm": 1.8855245850871363, "learning_rate": 1.75998350707412e-05, "loss": 0.1096, "step": 2700 }, { "epoch": 0.79, "grad_norm": 4.28080691529872, "learning_rate": 1.7589316968993323e-05, "loss": 0.1135, "step": 2705 }, { "epoch": 0.79, "grad_norm": 2.188803148392315, "learning_rate": 1.7578779029171128e-05, "loss": 0.0631, "step": 2710 }, { "epoch": 0.8, "grad_norm": 3.588543379312152, "learning_rate": 1.7568221278823862e-05, "loss": 0.1063, "step": 2715 }, { "epoch": 0.8, "grad_norm": 2.6512479470769765, "learning_rate": 1.7557643745552566e-05, "loss": 0.0792, "step": 2720 }, { "epoch": 0.8, "grad_norm": 2.4622361404876334, "learning_rate": 1.7547046457009995e-05, "loss": 0.0815, "step": 2725 }, { "epoch": 0.8, "grad_norm": 1.181686879663521, "learning_rate": 1.7536429440900554e-05, "loss": 0.0855, "step": 2730 }, { "epoch": 0.8, "grad_norm": 3.0605705567048047, "learning_rate": 1.7525792724980225e-05, "loss": 0.1384, "step": 2735 }, { "epoch": 0.8, "grad_norm": 3.0361367388503884, "learning_rate": 1.7515136337056476e-05, "loss": 0.0652, "step": 2740 }, { "epoch": 0.81, "grad_norm": 3.6999816430126207, "learning_rate": 1.750446030498822e-05, "loss": 0.1307, "step": 2745 }, { "epoch": 0.81, "grad_norm": 1.7331629461515146, "learning_rate": 1.7493764656685725e-05, "loss": 0.085, "step": 2750 }, { "epoch": 0.81, "grad_norm": 3.3426839965302104, "learning_rate": 1.7483049420110526e-05, "loss": 0.1107, "step": 2755 }, { "epoch": 0.81, "grad_norm": 1.8945908369938274, "learning_rate": 1.747231462327538e-05, "loss": 0.0677, "step": 2760 }, { "epoch": 0.81, "grad_norm": 2.375178480970911, "learning_rate": 1.7461560294244185e-05, "loss": 0.0816, "step": 2765 }, { "epoch": 0.81, "grad_norm": 2.7160314363337323, "learning_rate": 1.7450786461131886e-05, "loss": 0.1479, "step": 2770 }, { "epoch": 0.81, "grad_norm": 2.1767080856674283, "learning_rate": 1.7439993152104424e-05, "loss": 0.0701, "step": 2775 }, { "epoch": 0.82, "grad_norm": 2.8479809150806425, "learning_rate": 1.7429180395378667e-05, "loss": 0.1213, "step": 2780 }, { "epoch": 0.82, "grad_norm": 3.1833758133322387, "learning_rate": 1.741834821922231e-05, "loss": 0.0927, "step": 2785 }, { "epoch": 0.82, "grad_norm": 3.7860456588982396, "learning_rate": 1.7407496651953824e-05, "loss": 0.0896, "step": 2790 }, { "epoch": 0.82, "grad_norm": 3.127110340900602, "learning_rate": 1.739662572194237e-05, "loss": 0.0593, "step": 2795 }, { "epoch": 0.82, "grad_norm": 1.805218044753091, "learning_rate": 1.7385735457607728e-05, "loss": 0.1007, "step": 2800 }, { "epoch": 0.82, "grad_norm": 2.552175891261757, "learning_rate": 1.7374825887420227e-05, "loss": 0.1329, "step": 2805 }, { "epoch": 0.82, "grad_norm": 1.4027082134325906, "learning_rate": 1.7363897039900673e-05, "loss": 0.0775, "step": 2810 }, { "epoch": 0.83, "grad_norm": 2.5766792636413722, "learning_rate": 1.7352948943620252e-05, "loss": 0.1039, "step": 2815 }, { "epoch": 0.83, "grad_norm": 2.315064631933743, "learning_rate": 1.7341981627200486e-05, "loss": 0.1002, "step": 2820 }, { "epoch": 0.83, "grad_norm": 1.77275038528492, "learning_rate": 1.733099511931314e-05, "loss": 0.059, "step": 2825 }, { "epoch": 0.83, "grad_norm": 3.6553802759984086, "learning_rate": 1.731998944868015e-05, "loss": 0.0873, "step": 2830 }, { "epoch": 0.83, "grad_norm": 9.277418585224488, "learning_rate": 1.730896464407355e-05, "loss": 0.1054, "step": 2835 }, { "epoch": 0.83, "grad_norm": 2.330825468379001, "learning_rate": 1.7297920734315397e-05, "loss": 0.0841, "step": 2840 }, { "epoch": 0.83, "grad_norm": 1.5353749049117653, "learning_rate": 1.728685774827769e-05, "loss": 0.1018, "step": 2845 }, { "epoch": 0.84, "grad_norm": 2.887634875297545, "learning_rate": 1.7275775714882302e-05, "loss": 0.1114, "step": 2850 }, { "epoch": 0.84, "grad_norm": 2.453602349555872, "learning_rate": 1.7264674663100908e-05, "loss": 0.1401, "step": 2855 }, { "epoch": 0.84, "grad_norm": 1.973689346976038, "learning_rate": 1.7253554621954888e-05, "loss": 0.1036, "step": 2860 }, { "epoch": 0.84, "grad_norm": 2.3448153299848964, "learning_rate": 1.7242415620515277e-05, "loss": 0.1152, "step": 2865 }, { "epoch": 0.84, "grad_norm": 3.1738861172586437, "learning_rate": 1.7231257687902668e-05, "loss": 0.1154, "step": 2870 }, { "epoch": 0.84, "grad_norm": 5.216105630624163, "learning_rate": 1.722008085328716e-05, "loss": 0.0901, "step": 2875 }, { "epoch": 0.84, "grad_norm": 1.687338997966392, "learning_rate": 1.7208885145888262e-05, "loss": 0.0996, "step": 2880 }, { "epoch": 0.85, "grad_norm": 2.5681904204171584, "learning_rate": 1.7197670594974815e-05, "loss": 0.1147, "step": 2885 }, { "epoch": 0.85, "grad_norm": 3.9337900031340416, "learning_rate": 1.718643722986492e-05, "loss": 0.0782, "step": 2890 }, { "epoch": 0.85, "grad_norm": 4.399086614652283, "learning_rate": 1.7175185079925877e-05, "loss": 0.1258, "step": 2895 }, { "epoch": 0.85, "grad_norm": 1.9000347309953698, "learning_rate": 1.7163914174574092e-05, "loss": 0.1434, "step": 2900 }, { "epoch": 0.85, "grad_norm": 3.8364789467985507, "learning_rate": 1.7152624543274994e-05, "loss": 0.0865, "step": 2905 }, { "epoch": 0.85, "grad_norm": 2.3865032691893213, "learning_rate": 1.7141316215542975e-05, "loss": 0.0866, "step": 2910 }, { "epoch": 0.86, "grad_norm": 2.0857563297304167, "learning_rate": 1.71299892209413e-05, "loss": 0.0924, "step": 2915 }, { "epoch": 0.86, "grad_norm": 2.345942984458089, "learning_rate": 1.7118643589082043e-05, "loss": 0.0708, "step": 2920 }, { "epoch": 0.86, "grad_norm": 1.8641359939859798, "learning_rate": 1.7107279349625992e-05, "loss": 0.0788, "step": 2925 }, { "epoch": 0.86, "grad_norm": 2.272737375583735, "learning_rate": 1.7095896532282584e-05, "loss": 0.0851, "step": 2930 }, { "epoch": 0.86, "grad_norm": 2.4039124864082657, "learning_rate": 1.7084495166809822e-05, "loss": 0.1068, "step": 2935 }, { "epoch": 0.86, "grad_norm": 1.1218714658208662, "learning_rate": 1.707307528301421e-05, "loss": 0.1091, "step": 2940 }, { "epoch": 0.86, "grad_norm": 0.9891789998991855, "learning_rate": 1.7061636910750646e-05, "loss": 0.096, "step": 2945 }, { "epoch": 0.87, "grad_norm": 3.194805823262511, "learning_rate": 1.7050180079922373e-05, "loss": 0.1006, "step": 2950 }, { "epoch": 0.87, "grad_norm": 2.257798455534552, "learning_rate": 1.7038704820480898e-05, "loss": 0.0914, "step": 2955 }, { "epoch": 0.87, "grad_norm": 5.78033176037585, "learning_rate": 1.7027211162425888e-05, "loss": 0.0965, "step": 2960 }, { "epoch": 0.87, "grad_norm": 2.6184356901420025, "learning_rate": 1.7015699135805122e-05, "loss": 0.1114, "step": 2965 }, { "epoch": 0.87, "grad_norm": 1.8106354297762863, "learning_rate": 1.70041687707144e-05, "loss": 0.0941, "step": 2970 }, { "epoch": 0.87, "grad_norm": 1.245232646301207, "learning_rate": 1.699262009729745e-05, "loss": 0.0814, "step": 2975 }, { "epoch": 0.87, "grad_norm": 3.442666904279059, "learning_rate": 1.6981053145745877e-05, "loss": 0.1418, "step": 2980 }, { "epoch": 0.88, "grad_norm": 2.2823324901203557, "learning_rate": 1.6969467946299073e-05, "loss": 0.0593, "step": 2985 }, { "epoch": 0.88, "grad_norm": 2.3494464059884015, "learning_rate": 1.6957864529244123e-05, "loss": 0.0696, "step": 2990 }, { "epoch": 0.88, "grad_norm": 4.051449109074783, "learning_rate": 1.694624292491575e-05, "loss": 0.091, "step": 2995 }, { "epoch": 0.88, "grad_norm": 2.6735251166535616, "learning_rate": 1.6934603163696212e-05, "loss": 0.089, "step": 3000 }, { "epoch": 0.88, "grad_norm": 1.8986340958640284, "learning_rate": 1.6922945276015244e-05, "loss": 0.087, "step": 3005 }, { "epoch": 0.88, "grad_norm": 3.912312190511564, "learning_rate": 1.691126929234996e-05, "loss": 0.1154, "step": 3010 }, { "epoch": 0.88, "grad_norm": 3.9209068288748936, "learning_rate": 1.6899575243224794e-05, "loss": 0.0845, "step": 3015 }, { "epoch": 0.89, "grad_norm": 0.986967487651937, "learning_rate": 1.6887863159211403e-05, "loss": 0.0563, "step": 3020 }, { "epoch": 0.89, "grad_norm": 3.5756032794838775, "learning_rate": 1.6876133070928584e-05, "loss": 0.1154, "step": 3025 }, { "epoch": 0.89, "grad_norm": 2.8732144989552793, "learning_rate": 1.6864385009042215e-05, "loss": 0.0882, "step": 3030 }, { "epoch": 0.89, "grad_norm": 3.2146073135071584, "learning_rate": 1.6852619004265157e-05, "loss": 0.0746, "step": 3035 }, { "epoch": 0.89, "grad_norm": 2.032882973038013, "learning_rate": 1.684083508735718e-05, "loss": 0.0919, "step": 3040 }, { "epoch": 0.89, "grad_norm": 1.7212867050301923, "learning_rate": 1.6829033289124876e-05, "loss": 0.061, "step": 3045 }, { "epoch": 0.89, "grad_norm": 2.0434776241308557, "learning_rate": 1.681721364042159e-05, "loss": 0.0608, "step": 3050 }, { "epoch": 0.9, "grad_norm": 2.4595918397424077, "learning_rate": 1.6805376172147335e-05, "loss": 0.0618, "step": 3055 }, { "epoch": 0.9, "grad_norm": 1.9062625696987283, "learning_rate": 1.6793520915248704e-05, "loss": 0.106, "step": 3060 }, { "epoch": 0.9, "grad_norm": 3.0695618155829765, "learning_rate": 1.6781647900718797e-05, "loss": 0.0826, "step": 3065 }, { "epoch": 0.9, "grad_norm": 2.5737887820804533, "learning_rate": 1.676975715959714e-05, "loss": 0.0896, "step": 3070 }, { "epoch": 0.9, "grad_norm": 5.4035821406888855, "learning_rate": 1.67578487229696e-05, "loss": 0.0747, "step": 3075 }, { "epoch": 0.9, "grad_norm": 4.031141412736291, "learning_rate": 1.67459226219683e-05, "loss": 0.1035, "step": 3080 }, { "epoch": 0.9, "grad_norm": 2.8672651649371415, "learning_rate": 1.6733978887771548e-05, "loss": 0.1211, "step": 3085 }, { "epoch": 0.91, "grad_norm": 3.622543160107051, "learning_rate": 1.6722017551603752e-05, "loss": 0.0782, "step": 3090 }, { "epoch": 0.91, "grad_norm": 2.611185690533942, "learning_rate": 1.6710038644735328e-05, "loss": 0.1197, "step": 3095 }, { "epoch": 0.91, "grad_norm": 2.239743805096798, "learning_rate": 1.6698042198482645e-05, "loss": 0.0694, "step": 3100 }, { "epoch": 0.91, "grad_norm": 3.6697206061343963, "learning_rate": 1.6686028244207902e-05, "loss": 0.0896, "step": 3105 }, { "epoch": 0.91, "grad_norm": 2.3344027475446776, "learning_rate": 1.667399681331909e-05, "loss": 0.0863, "step": 3110 }, { "epoch": 0.91, "grad_norm": 5.716084510514838, "learning_rate": 1.666194793726987e-05, "loss": 0.0548, "step": 3115 }, { "epoch": 0.92, "grad_norm": 3.1415750006162626, "learning_rate": 1.6649881647559527e-05, "loss": 0.0684, "step": 3120 }, { "epoch": 0.92, "grad_norm": 2.1766147626420187, "learning_rate": 1.6637797975732855e-05, "loss": 0.0786, "step": 3125 }, { "epoch": 0.92, "grad_norm": 2.602913079216308, "learning_rate": 1.6625696953380104e-05, "loss": 0.1321, "step": 3130 }, { "epoch": 0.92, "grad_norm": 2.2070894756907418, "learning_rate": 1.661357861213687e-05, "loss": 0.0912, "step": 3135 }, { "epoch": 0.92, "grad_norm": 1.6298032344638562, "learning_rate": 1.6601442983684042e-05, "loss": 0.0802, "step": 3140 }, { "epoch": 0.92, "grad_norm": 2.7174691000044064, "learning_rate": 1.658929009974768e-05, "loss": 0.1251, "step": 3145 }, { "epoch": 0.92, "grad_norm": 4.932366321915814, "learning_rate": 1.657711999209898e-05, "loss": 0.1141, "step": 3150 }, { "epoch": 0.93, "grad_norm": 9.069104123184824, "learning_rate": 1.656493269255415e-05, "loss": 0.1253, "step": 3155 }, { "epoch": 0.93, "grad_norm": 2.9830010731689076, "learning_rate": 1.6552728232974344e-05, "loss": 0.0736, "step": 3160 }, { "epoch": 0.93, "grad_norm": 2.3767202070428732, "learning_rate": 1.654050664526558e-05, "loss": 0.1481, "step": 3165 }, { "epoch": 0.93, "grad_norm": 2.1967464664457292, "learning_rate": 1.6528267961378653e-05, "loss": 0.0737, "step": 3170 }, { "epoch": 0.93, "grad_norm": 2.5475708180655317, "learning_rate": 1.651601221330906e-05, "loss": 0.0965, "step": 3175 }, { "epoch": 0.93, "grad_norm": 1.4514690521021125, "learning_rate": 1.6503739433096893e-05, "loss": 0.09, "step": 3180 }, { "epoch": 0.93, "grad_norm": 1.0106614527500741, "learning_rate": 1.649144965282679e-05, "loss": 0.1028, "step": 3185 }, { "epoch": 0.94, "grad_norm": 2.264948776878529, "learning_rate": 1.647914290462781e-05, "loss": 0.1099, "step": 3190 }, { "epoch": 0.94, "grad_norm": 2.387800555259844, "learning_rate": 1.6466819220673392e-05, "loss": 0.0858, "step": 3195 }, { "epoch": 0.94, "grad_norm": 2.2939673463714043, "learning_rate": 1.6454478633181238e-05, "loss": 0.0965, "step": 3200 }, { "epoch": 0.94, "grad_norm": 2.8815140940347828, "learning_rate": 1.6442121174413242e-05, "loss": 0.084, "step": 3205 }, { "epoch": 0.94, "grad_norm": 3.3710190642074394, "learning_rate": 1.6429746876675406e-05, "loss": 0.1348, "step": 3210 }, { "epoch": 0.94, "grad_norm": 3.154298991388609, "learning_rate": 1.6417355772317763e-05, "loss": 0.1307, "step": 3215 }, { "epoch": 0.94, "grad_norm": 2.431634251842803, "learning_rate": 1.6404947893734263e-05, "loss": 0.1269, "step": 3220 }, { "epoch": 0.95, "grad_norm": 1.8476184413662657, "learning_rate": 1.639252327336273e-05, "loss": 0.0886, "step": 3225 }, { "epoch": 0.95, "grad_norm": 2.856480869169723, "learning_rate": 1.6380081943684733e-05, "loss": 0.1183, "step": 3230 }, { "epoch": 0.95, "grad_norm": 1.4292746192636032, "learning_rate": 1.6367623937225553e-05, "loss": 0.062, "step": 3235 }, { "epoch": 0.95, "grad_norm": 1.8800704426452939, "learning_rate": 1.6355149286554047e-05, "loss": 0.1223, "step": 3240 }, { "epoch": 0.95, "grad_norm": 3.2229475779778154, "learning_rate": 1.6342658024282585e-05, "loss": 0.1167, "step": 3245 }, { "epoch": 0.95, "grad_norm": 5.6176440142314, "learning_rate": 1.6330150183066983e-05, "loss": 0.116, "step": 3250 }, { "epoch": 0.95, "grad_norm": 2.101148600040731, "learning_rate": 1.6317625795606378e-05, "loss": 0.1162, "step": 3255 }, { "epoch": 0.96, "grad_norm": 2.02917715092117, "learning_rate": 1.6305084894643172e-05, "loss": 0.1406, "step": 3260 }, { "epoch": 0.96, "grad_norm": 2.478189120719845, "learning_rate": 1.6292527512962947e-05, "loss": 0.1065, "step": 3265 }, { "epoch": 0.96, "grad_norm": 2.3396951822486196, "learning_rate": 1.627995368339435e-05, "loss": 0.1115, "step": 3270 }, { "epoch": 0.96, "grad_norm": 3.7337628106177485, "learning_rate": 1.6267363438809052e-05, "loss": 0.0846, "step": 3275 }, { "epoch": 0.96, "grad_norm": 4.176925288355294, "learning_rate": 1.6254756812121612e-05, "loss": 0.0425, "step": 3280 }, { "epoch": 0.96, "grad_norm": 1.437234356425958, "learning_rate": 1.6242133836289444e-05, "loss": 0.1001, "step": 3285 }, { "epoch": 0.97, "grad_norm": 1.6309867032536425, "learning_rate": 1.6229494544312684e-05, "loss": 0.0555, "step": 3290 }, { "epoch": 0.97, "grad_norm": 3.254534712202672, "learning_rate": 1.6216838969234124e-05, "loss": 0.0781, "step": 3295 }, { "epoch": 0.97, "grad_norm": 2.59152889519255, "learning_rate": 1.620416714413913e-05, "loss": 0.0997, "step": 3300 }, { "epoch": 0.97, "grad_norm": 2.0546888924365527, "learning_rate": 1.6191479102155556e-05, "loss": 0.117, "step": 3305 }, { "epoch": 0.97, "grad_norm": 3.4303903750156484, "learning_rate": 1.617877487645364e-05, "loss": 0.0715, "step": 3310 }, { "epoch": 0.97, "grad_norm": 2.1827612639457574, "learning_rate": 1.616605450024594e-05, "loss": 0.1013, "step": 3315 }, { "epoch": 0.97, "grad_norm": 2.865702647381268, "learning_rate": 1.6153318006787223e-05, "loss": 0.1131, "step": 3320 }, { "epoch": 0.98, "grad_norm": 2.9285326499896946, "learning_rate": 1.61405654293744e-05, "loss": 0.0839, "step": 3325 }, { "epoch": 0.98, "grad_norm": 1.9355299378240092, "learning_rate": 1.6127796801346437e-05, "loss": 0.0938, "step": 3330 }, { "epoch": 0.98, "grad_norm": 4.344809423553806, "learning_rate": 1.6115012156084242e-05, "loss": 0.066, "step": 3335 }, { "epoch": 0.98, "grad_norm": 5.030847000188601, "learning_rate": 1.6102211527010608e-05, "loss": 0.1054, "step": 3340 }, { "epoch": 0.98, "grad_norm": 1.8609269020938257, "learning_rate": 1.6089394947590123e-05, "loss": 0.0936, "step": 3345 }, { "epoch": 0.98, "grad_norm": 2.878476404530353, "learning_rate": 1.6076562451329055e-05, "loss": 0.0885, "step": 3350 }, { "epoch": 0.98, "grad_norm": 2.297272109392417, "learning_rate": 1.6063714071775297e-05, "loss": 0.1302, "step": 3355 }, { "epoch": 0.99, "grad_norm": 2.4727024357295484, "learning_rate": 1.6050849842518265e-05, "loss": 0.1108, "step": 3360 }, { "epoch": 0.99, "grad_norm": 2.616633205645327, "learning_rate": 1.60379697971888e-05, "loss": 0.1337, "step": 3365 }, { "epoch": 0.99, "grad_norm": 3.0531342208551266, "learning_rate": 1.60250739694591e-05, "loss": 0.1144, "step": 3370 }, { "epoch": 0.99, "grad_norm": 2.658444025810342, "learning_rate": 1.6012162393042625e-05, "loss": 0.1986, "step": 3375 }, { "epoch": 0.99, "grad_norm": 2.1995801611755392, "learning_rate": 1.5999235101694003e-05, "loss": 0.1367, "step": 3380 }, { "epoch": 0.99, "grad_norm": 1.3930532278994612, "learning_rate": 1.5986292129208938e-05, "loss": 0.0491, "step": 3385 }, { "epoch": 0.99, "grad_norm": 3.212502054295458, "learning_rate": 1.597333350942414e-05, "loss": 0.0862, "step": 3390 }, { "epoch": 1.0, "grad_norm": 1.849378948097875, "learning_rate": 1.5960359276217222e-05, "loss": 0.0899, "step": 3395 }, { "epoch": 1.0, "grad_norm": 3.045383499124253, "learning_rate": 1.5947369463506614e-05, "loss": 0.0809, "step": 3400 }, { "epoch": 1.0, "grad_norm": 3.350584656668729, "learning_rate": 1.5934364105251473e-05, "loss": 0.0692, "step": 3405 } ], "logging_steps": 5, "max_steps": 10227, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }