diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4788 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3409, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 11.125381634627788, + "learning_rate": 5.308241808752198e-06, + "loss": 0.4004, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 8.92423656628485, + "learning_rate": 7.508241808752199e-06, + "loss": 0.4709, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 8.41715845868157, + "learning_rate": 8.795159310338741e-06, + "loss": 0.3973, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 15.774318751695052, + "learning_rate": 9.708241808752198e-06, + "loss": 0.4057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 8.099327492758947, + "learning_rate": 1.0416483617504396e-05, + "loss": 0.2547, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 7.413713903241163, + "learning_rate": 1.099515931033874e-05, + "loss": 0.2041, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 9.03722608897779, + "learning_rate": 1.1484422637278927e-05, + "loss": 0.2867, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 8.92455629586844, + "learning_rate": 1.1908241808752199e-05, + "loss": 0.2939, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 10.156800616083926, + "learning_rate": 1.2282076811925285e-05, + "loss": 0.2955, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 9.420288964636175, + "learning_rate": 1.2616483617504393e-05, + "loss": 0.1845, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 16.959594279780564, + "learning_rate": 1.2918991369754252e-05, + "loss": 0.2387, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 6.074452366890137, + "learning_rate": 1.3195159310338741e-05, + "loss": 0.2455, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 4.742335711555924, + "learning_rate": 1.3449209188662602e-05, + "loss": 0.159, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 5.288626310542604, + "learning_rate": 1.3684422637278928e-05, + "loss": 0.1843, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 7.44716471541915, + "learning_rate": 1.3903401119090938e-05, + "loss": 0.1788, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 3.821096016199364, + "learning_rate": 1.4108241808752197e-05, + "loss": 0.1514, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 8.992446696823665, + "learning_rate": 1.4300660059502947e-05, + "loss": 0.2482, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 5.839197680608412, + "learning_rate": 1.4482076811925287e-05, + "loss": 0.2065, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 8.087974815050226, + "learning_rate": 1.4653682338328086e-05, + "loss": 0.2201, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 9.987995189856553, + "learning_rate": 1.4816483617504398e-05, + "loss": 0.1501, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 7.240449600169706, + "learning_rate": 1.4971340138865471e-05, + "loss": 0.2253, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 15.014801352117672, + "learning_rate": 1.5118991369754255e-05, + "loss": 0.1938, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 8.588750321209607, + "learning_rate": 1.5260078112077627e-05, + "loss": 0.2299, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 20.519070570571127, + "learning_rate": 1.5395159310338742e-05, + "loss": 0.2432, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 4.505660841015201, + "learning_rate": 1.5524725426256594e-05, + "loss": 0.127, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 6.3767344789277605, + "learning_rate": 1.56492091886626e-05, + "loss": 0.192, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 6.2113668041489545, + "learning_rate": 1.576899431351183e-05, + "loss": 0.2247, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 11.909747553663436, + "learning_rate": 1.5884422637278926e-05, + "loss": 0.2069, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 5.701699776433005, + "learning_rate": 1.5995799998032858e-05, + "loss": 0.1573, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 4.996573800175535, + "learning_rate": 1.6103401119090937e-05, + "loss": 0.1758, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 9.43286827567575, + "learning_rate": 1.6207473691603323e-05, + "loss": 0.1664, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 28.24015920275371, + "learning_rate": 1.6308241808752197e-05, + "loss": 0.1844, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 8.923975388298482, + "learning_rate": 1.6405908871340797e-05, + "loss": 0.2391, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 6.282811516394321, + "learning_rate": 1.6500660059502946e-05, + "loss": 0.1676, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 10.30498920082885, + "learning_rate": 1.6592664446031127e-05, + "loss": 0.1934, + "step": 175 + }, + { + "epoch": 0.05, + "grad_norm": 5.1185908366180195, + "learning_rate": 1.6682076811925287e-05, + "loss": 0.1531, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 4.756718752566272, + "learning_rate": 1.6769039213135887e-05, + "loss": 0.1765, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 6.629667545228336, + "learning_rate": 1.6853682338328088e-05, + "loss": 0.2043, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 1.1680908649614796, + "learning_rate": 1.6936126690249144e-05, + "loss": 0.1845, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 2.594897280738297, + "learning_rate": 1.7016483617504395e-05, + "loss": 0.1142, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 2.9032680841669585, + "learning_rate": 1.7094856218911983e-05, + "loss": 0.2007, + "step": 205 + }, + { + "epoch": 0.06, + "grad_norm": 6.848184342158123, + "learning_rate": 1.717134013886547e-05, + "loss": 0.1239, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 6.717780464657794, + "learning_rate": 1.7246024269096814e-05, + "loss": 0.2148, + "step": 215 + }, + { + "epoch": 0.06, + "grad_norm": 2.418335623324901, + "learning_rate": 1.7318991369754256e-05, + "loss": 0.1228, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 6.090782655933177, + "learning_rate": 1.739031862067748e-05, + "loss": 0.2336, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 9.653963182390594, + "learning_rate": 1.7460078112077626e-05, + "loss": 0.1624, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 6.782020630535402, + "learning_rate": 1.7528337282443e-05, + "loss": 0.2082, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 7.955935238397691, + "learning_rate": 1.7595159310338742e-05, + "loss": 0.1203, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 11.86700555838968, + "learning_rate": 1.766060346580566e-05, + "loss": 0.2336, + "step": 245 + }, + { + "epoch": 0.07, + "grad_norm": 7.932728692529283, + "learning_rate": 1.772472542625659e-05, + "loss": 0.1557, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 9.367369941926402, + "learning_rate": 1.7787577561089487e-05, + "loss": 0.1778, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 8.119258788907397, + "learning_rate": 1.7849209188662603e-05, + "loss": 0.1641, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 8.877824360780032, + "learning_rate": 1.7909666808791235e-05, + "loss": 0.2865, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 5.311739339787183, + "learning_rate": 1.796899431351183e-05, + "loss": 0.1644, + "step": 270 + }, + { + "epoch": 0.08, + "grad_norm": 3.648816107438258, + "learning_rate": 1.802723317850645e-05, + "loss": 0.1989, + "step": 275 + }, + { + "epoch": 0.08, + "grad_norm": 37.56990879513816, + "learning_rate": 1.8084422637278925e-05, + "loss": 0.1892, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 16.74826541858141, + "learning_rate": 1.8140599839914632e-05, + "loss": 0.2343, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 4.863631858596725, + "learning_rate": 1.8195799998032857e-05, + "loss": 0.2181, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 3.9113582322289524, + "learning_rate": 1.8250056517348252e-05, + "loss": 0.1715, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 11.84889258509689, + "learning_rate": 1.830340111909094e-05, + "loss": 0.1459, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 4.8514319753979915, + "learning_rate": 1.8355863951390547e-05, + "loss": 0.136, + "step": 305 + }, + { + "epoch": 0.09, + "grad_norm": 10.329272877304971, + "learning_rate": 1.8407473691603325e-05, + "loss": 0.161, + "step": 310 + }, + { + "epoch": 0.09, + "grad_norm": 15.22633652105913, + "learning_rate": 1.8458257640452014e-05, + "loss": 0.1721, + "step": 315 + }, + { + "epoch": 0.09, + "grad_norm": 4.230396160936562, + "learning_rate": 1.8508241808752197e-05, + "loss": 0.1408, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 4.777596867016981, + "learning_rate": 1.85574509974148e-05, + "loss": 0.1496, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 5.111340439300221, + "learning_rate": 1.8605908871340793e-05, + "loss": 0.2215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 12.347791565416415, + "learning_rate": 1.8653638027759297e-05, + "loss": 0.1494, + "step": 335 + }, + { + "epoch": 0.1, + "grad_norm": 9.015406243961076, + "learning_rate": 1.8700660059502946e-05, + "loss": 0.1212, + "step": 340 + }, + { + "epoch": 0.1, + "grad_norm": 5.797822201400517, + "learning_rate": 1.874699561366417e-05, + "loss": 0.142, + "step": 345 + }, + { + "epoch": 0.1, + "grad_norm": 17.174149644614868, + "learning_rate": 1.8792664446031123e-05, + "loss": 0.1458, + "step": 350 + }, + { + "epoch": 0.1, + "grad_norm": 11.455289753213496, + "learning_rate": 1.88376854716625e-05, + "loss": 0.2466, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 4.808218103853913, + "learning_rate": 1.8882076811925286e-05, + "loss": 0.1382, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 6.0172560821404835, + "learning_rate": 1.8925855838288235e-05, + "loss": 0.2062, + "step": 365 + }, + { + "epoch": 0.11, + "grad_norm": 7.361219818784043, + "learning_rate": 1.8969039213135886e-05, + "loss": 0.2005, + "step": 370 + }, + { + "epoch": 0.11, + "grad_norm": 12.237376033790202, + "learning_rate": 1.9011642927843134e-05, + "loss": 0.2208, + "step": 375 + }, + { + "epoch": 0.11, + "grad_norm": 4.98412721789337, + "learning_rate": 1.9053682338328088e-05, + "loss": 0.1888, + "step": 380 + }, + { + "epoch": 0.11, + "grad_norm": 4.540797625815676, + "learning_rate": 1.9095172198280984e-05, + "loss": 0.1749, + "step": 385 + }, + { + "epoch": 0.11, + "grad_norm": 3.822828217331199, + "learning_rate": 1.9136126690249147e-05, + "loss": 0.0922, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 3.990734216544265, + "learning_rate": 1.9176559454741825e-05, + "loss": 0.1863, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 6.910160942356377, + "learning_rate": 1.9216483617504394e-05, + "loss": 0.2068, + "step": 400 + }, + { + "epoch": 0.12, + "grad_norm": 4.781285105059034, + "learning_rate": 1.9255911815098372e-05, + "loss": 0.1525, + "step": 405 + }, + { + "epoch": 0.12, + "grad_norm": 13.76373104788082, + "learning_rate": 1.9294856218911982e-05, + "loss": 0.2154, + "step": 410 + }, + { + "epoch": 0.12, + "grad_norm": 7.48460945287791, + "learning_rate": 1.9333328557715434e-05, + "loss": 0.1214, + "step": 415 + }, + { + "epoch": 0.12, + "grad_norm": 9.890476894625708, + "learning_rate": 1.937134013886547e-05, + "loss": 0.1381, + "step": 420 + }, + { + "epoch": 0.12, + "grad_norm": 9.300476915169963, + "learning_rate": 1.9408901868255147e-05, + "loss": 0.0847, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 6.528248658568062, + "learning_rate": 1.9446024269096816e-05, + "loss": 0.1411, + "step": 430 + }, + { + "epoch": 0.13, + "grad_norm": 3.682181191664401, + "learning_rate": 1.94827174996194e-05, + "loss": 0.1426, + "step": 435 + }, + { + "epoch": 0.13, + "grad_norm": 8.43703521578652, + "learning_rate": 1.951899136975425e-05, + "loss": 0.1579, + "step": 440 + }, + { + "epoch": 0.13, + "grad_norm": 6.56291531656417, + "learning_rate": 1.9554855356878272e-05, + "loss": 0.2021, + "step": 445 + }, + { + "epoch": 0.13, + "grad_norm": 10.066497956266401, + "learning_rate": 1.9590318620677484e-05, + "loss": 0.1154, + "step": 450 + }, + { + "epoch": 0.13, + "grad_norm": 5.966271330169713, + "learning_rate": 1.962539001718933e-05, + "loss": 0.2115, + "step": 455 + }, + { + "epoch": 0.13, + "grad_norm": 5.856232881470111, + "learning_rate": 1.9660078112077626e-05, + "loss": 0.1418, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 6.240360608562805, + "learning_rate": 1.9694391193189866e-05, + "loss": 0.1586, + "step": 465 + }, + { + "epoch": 0.14, + "grad_norm": 9.757750340699735, + "learning_rate": 1.9728337282443e-05, + "loss": 0.1449, + "step": 470 + }, + { + "epoch": 0.14, + "grad_norm": 7.712636650272143, + "learning_rate": 1.9761924147080285e-05, + "loss": 0.135, + "step": 475 + }, + { + "epoch": 0.14, + "grad_norm": 4.515393736306068, + "learning_rate": 1.9795159310338744e-05, + "loss": 0.129, + "step": 480 + }, + { + "epoch": 0.14, + "grad_norm": 0.7671227888175197, + "learning_rate": 1.982805006156388e-05, + "loss": 0.222, + "step": 485 + }, + { + "epoch": 0.14, + "grad_norm": 9.304084815966336, + "learning_rate": 1.9860603465805653e-05, + "loss": 0.1849, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 9.749297679157017, + "learning_rate": 1.989282637292734e-05, + "loss": 0.1362, + "step": 495 + }, + { + "epoch": 0.15, + "grad_norm": 21.698855546577796, + "learning_rate": 1.9924725426256592e-05, + "loss": 0.2026, + "step": 500 + }, + { + "epoch": 0.15, + "grad_norm": 8.82338005089098, + "learning_rate": 1.995630707080615e-05, + "loss": 0.1667, + "step": 505 + }, + { + "epoch": 0.15, + "grad_norm": 26.86060661079716, + "learning_rate": 1.998757756108949e-05, + "loss": 0.175, + "step": 510 + }, + { + "epoch": 0.15, + "grad_norm": 13.652883014206537, + "learning_rate": 1.9999995294744797e-05, + "loss": 0.1679, + "step": 515 + }, + { + "epoch": 0.15, + "grad_norm": 8.716096864047406, + "learning_rate": 1.9999966540423482e-05, + "loss": 0.1385, + "step": 520 + }, + { + "epoch": 0.15, + "grad_norm": 9.1722231121385, + "learning_rate": 1.99999116458866e-05, + "loss": 0.1694, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 20.79172007745624, + "learning_rate": 1.9999830611277667e-05, + "loss": 0.2038, + "step": 530 + }, + { + "epoch": 0.16, + "grad_norm": 7.595957745153807, + "learning_rate": 1.9999723436808522e-05, + "loss": 0.2264, + "step": 535 + }, + { + "epoch": 0.16, + "grad_norm": 3.7192450442308376, + "learning_rate": 1.9999590122759357e-05, + "loss": 0.1615, + "step": 540 + }, + { + "epoch": 0.16, + "grad_norm": 3.637037613572214, + "learning_rate": 1.9999430669478693e-05, + "loss": 0.0805, + "step": 545 + }, + { + "epoch": 0.16, + "grad_norm": 7.478389632328602, + "learning_rate": 1.999924507738338e-05, + "loss": 0.1623, + "step": 550 + }, + { + "epoch": 0.16, + "grad_norm": 0.5979318918112878, + "learning_rate": 1.9999033346958624e-05, + "loss": 0.1491, + "step": 555 + }, + { + "epoch": 0.16, + "grad_norm": 12.424319850795971, + "learning_rate": 1.999879547875794e-05, + "loss": 0.1631, + "step": 560 + }, + { + "epoch": 0.17, + "grad_norm": 5.76364771735449, + "learning_rate": 1.9998531473403187e-05, + "loss": 0.2197, + "step": 565 + }, + { + "epoch": 0.17, + "grad_norm": 6.606496630551793, + "learning_rate": 1.999824133158455e-05, + "loss": 0.1468, + "step": 570 + }, + { + "epoch": 0.17, + "grad_norm": 4.41747007920085, + "learning_rate": 1.999792505406055e-05, + "loss": 0.1366, + "step": 575 + }, + { + "epoch": 0.17, + "grad_norm": 5.075879536599382, + "learning_rate": 1.999758264165802e-05, + "loss": 0.0992, + "step": 580 + }, + { + "epoch": 0.17, + "grad_norm": 22.308467554033715, + "learning_rate": 1.9997214095272135e-05, + "loss": 0.2057, + "step": 585 + }, + { + "epoch": 0.17, + "grad_norm": 7.279463258176968, + "learning_rate": 1.9996819415866377e-05, + "loss": 0.1195, + "step": 590 + }, + { + "epoch": 0.17, + "grad_norm": 2.9315010662776357, + "learning_rate": 1.9996398604472556e-05, + "loss": 0.0958, + "step": 595 + }, + { + "epoch": 0.18, + "grad_norm": 21.026513936696166, + "learning_rate": 1.9995951662190794e-05, + "loss": 0.1842, + "step": 600 + }, + { + "epoch": 0.18, + "grad_norm": 4.622313054019638, + "learning_rate": 1.9995478590189534e-05, + "loss": 0.1048, + "step": 605 + }, + { + "epoch": 0.18, + "grad_norm": 6.403749452434921, + "learning_rate": 1.9994979389705517e-05, + "loss": 0.0971, + "step": 610 + }, + { + "epoch": 0.18, + "grad_norm": 14.94556621011129, + "learning_rate": 1.9994454062043795e-05, + "loss": 0.2127, + "step": 615 + }, + { + "epoch": 0.18, + "grad_norm": 8.09929499442626, + "learning_rate": 1.999390260857774e-05, + "loss": 0.2903, + "step": 620 + }, + { + "epoch": 0.18, + "grad_norm": 7.006300679141809, + "learning_rate": 1.9993325030749006e-05, + "loss": 0.1775, + "step": 625 + }, + { + "epoch": 0.18, + "grad_norm": 9.59543458146263, + "learning_rate": 1.9992721330067547e-05, + "loss": 0.2223, + "step": 630 + }, + { + "epoch": 0.19, + "grad_norm": 12.495151647629797, + "learning_rate": 1.9992091508111616e-05, + "loss": 0.2557, + "step": 635 + }, + { + "epoch": 0.19, + "grad_norm": 11.89488787437466, + "learning_rate": 1.9991435566527757e-05, + "loss": 0.2852, + "step": 640 + }, + { + "epoch": 0.19, + "grad_norm": 3.188905144434785, + "learning_rate": 1.999075350703078e-05, + "loss": 0.0961, + "step": 645 + }, + { + "epoch": 0.19, + "grad_norm": 4.654862369916143, + "learning_rate": 1.99900453314038e-05, + "loss": 0.1578, + "step": 650 + }, + { + "epoch": 0.19, + "grad_norm": 4.32187595326047, + "learning_rate": 1.9989311041498186e-05, + "loss": 0.127, + "step": 655 + }, + { + "epoch": 0.19, + "grad_norm": 4.215063448210065, + "learning_rate": 1.9988550639233587e-05, + "loss": 0.1392, + "step": 660 + }, + { + "epoch": 0.2, + "grad_norm": 15.535200940500214, + "learning_rate": 1.998776412659792e-05, + "loss": 0.1974, + "step": 665 + }, + { + "epoch": 0.2, + "grad_norm": 4.760906747957173, + "learning_rate": 1.998695150564736e-05, + "loss": 0.1434, + "step": 670 + }, + { + "epoch": 0.2, + "grad_norm": 0.061155960694268596, + "learning_rate": 1.998611277850633e-05, + "loss": 0.117, + "step": 675 + }, + { + "epoch": 0.2, + "grad_norm": 4.953787678092304, + "learning_rate": 1.9985247947367508e-05, + "loss": 0.1831, + "step": 680 + }, + { + "epoch": 0.2, + "grad_norm": 4.678233435016873, + "learning_rate": 1.9984357014491816e-05, + "loss": 0.1597, + "step": 685 + }, + { + "epoch": 0.2, + "grad_norm": 3.229829256723626, + "learning_rate": 1.9983439982208417e-05, + "loss": 0.1587, + "step": 690 + }, + { + "epoch": 0.2, + "grad_norm": 4.885033869708901, + "learning_rate": 1.9982496852914696e-05, + "loss": 0.18, + "step": 695 + }, + { + "epoch": 0.21, + "grad_norm": 6.329976580969712, + "learning_rate": 1.9981527629076265e-05, + "loss": 0.1383, + "step": 700 + }, + { + "epoch": 0.21, + "grad_norm": 8.616010763659258, + "learning_rate": 1.9980532313226964e-05, + "loss": 0.1439, + "step": 705 + }, + { + "epoch": 0.21, + "grad_norm": 5.805509239551478, + "learning_rate": 1.9979510907968834e-05, + "loss": 0.1916, + "step": 710 + }, + { + "epoch": 0.21, + "grad_norm": 9.354847725343019, + "learning_rate": 1.9978463415972135e-05, + "loss": 0.1291, + "step": 715 + }, + { + "epoch": 0.21, + "grad_norm": 11.028811436490342, + "learning_rate": 1.997738983997531e-05, + "loss": 0.1329, + "step": 720 + }, + { + "epoch": 0.21, + "grad_norm": 5.398403758317581, + "learning_rate": 1.9976290182784994e-05, + "loss": 0.169, + "step": 725 + }, + { + "epoch": 0.21, + "grad_norm": 1.5150079159167873, + "learning_rate": 1.9975164447276022e-05, + "loss": 0.14, + "step": 730 + }, + { + "epoch": 0.22, + "grad_norm": 5.437256906271298, + "learning_rate": 1.9974012636391393e-05, + "loss": 0.1706, + "step": 735 + }, + { + "epoch": 0.22, + "grad_norm": 4.686594964713854, + "learning_rate": 1.9972834753142275e-05, + "loss": 0.1086, + "step": 740 + }, + { + "epoch": 0.22, + "grad_norm": 7.059615364727769, + "learning_rate": 1.9971630800607995e-05, + "loss": 0.1391, + "step": 745 + }, + { + "epoch": 0.22, + "grad_norm": 3.759488626159761, + "learning_rate": 1.9970400781936044e-05, + "loss": 0.1252, + "step": 750 + }, + { + "epoch": 0.22, + "grad_norm": 5.196072033882789, + "learning_rate": 1.9969144700342042e-05, + "loss": 0.1907, + "step": 755 + }, + { + "epoch": 0.22, + "grad_norm": 12.217160172705722, + "learning_rate": 1.9967862559109757e-05, + "loss": 0.2199, + "step": 760 + }, + { + "epoch": 0.22, + "grad_norm": 4.605767548552978, + "learning_rate": 1.996655436159108e-05, + "loss": 0.0851, + "step": 765 + }, + { + "epoch": 0.23, + "grad_norm": 2.7976726055332968, + "learning_rate": 1.9965220111206022e-05, + "loss": 0.1428, + "step": 770 + }, + { + "epoch": 0.23, + "grad_norm": 6.6315563077229465, + "learning_rate": 1.9963859811442695e-05, + "loss": 0.1419, + "step": 775 + }, + { + "epoch": 0.23, + "grad_norm": 7.1281321818622025, + "learning_rate": 1.996247346585733e-05, + "loss": 0.1895, + "step": 780 + }, + { + "epoch": 0.23, + "grad_norm": 5.305688238343365, + "learning_rate": 1.9961061078074236e-05, + "loss": 0.1973, + "step": 785 + }, + { + "epoch": 0.23, + "grad_norm": 6.655387141934961, + "learning_rate": 1.99596226517858e-05, + "loss": 0.222, + "step": 790 + }, + { + "epoch": 0.23, + "grad_norm": 3.802457245856603, + "learning_rate": 1.9958158190752497e-05, + "loss": 0.1619, + "step": 795 + }, + { + "epoch": 0.23, + "grad_norm": 4.205338699906657, + "learning_rate": 1.9956667698802847e-05, + "loss": 0.1411, + "step": 800 + }, + { + "epoch": 0.24, + "grad_norm": 18.6994174478946, + "learning_rate": 1.9955151179833437e-05, + "loss": 0.1763, + "step": 805 + }, + { + "epoch": 0.24, + "grad_norm": 5.203119684623317, + "learning_rate": 1.995360863780889e-05, + "loss": 0.1651, + "step": 810 + }, + { + "epoch": 0.24, + "grad_norm": 1.1272129969070257, + "learning_rate": 1.9952040076761857e-05, + "loss": 0.1222, + "step": 815 + }, + { + "epoch": 0.24, + "grad_norm": 7.271805429556148, + "learning_rate": 1.9950445500793015e-05, + "loss": 0.1072, + "step": 820 + }, + { + "epoch": 0.24, + "grad_norm": 7.652271467084596, + "learning_rate": 1.994882491407105e-05, + "loss": 0.1221, + "step": 825 + }, + { + "epoch": 0.24, + "grad_norm": 11.746166496698574, + "learning_rate": 1.9947178320832656e-05, + "loss": 0.2158, + "step": 830 + }, + { + "epoch": 0.24, + "grad_norm": 6.5191485713782535, + "learning_rate": 1.99455057253825e-05, + "loss": 0.1689, + "step": 835 + }, + { + "epoch": 0.25, + "grad_norm": 5.7533957364325286, + "learning_rate": 1.9943807132093236e-05, + "loss": 0.2428, + "step": 840 + }, + { + "epoch": 0.25, + "grad_norm": 7.743116459704382, + "learning_rate": 1.9942082545405485e-05, + "loss": 0.2132, + "step": 845 + }, + { + "epoch": 0.25, + "grad_norm": 7.284870082767622, + "learning_rate": 1.9940331969827816e-05, + "loss": 0.1184, + "step": 850 + }, + { + "epoch": 0.25, + "grad_norm": 6.8174806638918355, + "learning_rate": 1.9938555409936746e-05, + "loss": 0.1717, + "step": 855 + }, + { + "epoch": 0.25, + "grad_norm": 4.522024299566416, + "learning_rate": 1.9936752870376722e-05, + "loss": 0.1544, + "step": 860 + }, + { + "epoch": 0.25, + "grad_norm": 10.08832302588539, + "learning_rate": 1.9934924355860107e-05, + "loss": 0.1735, + "step": 865 + }, + { + "epoch": 0.26, + "grad_norm": 12.48917866555821, + "learning_rate": 1.993306987116717e-05, + "loss": 0.1627, + "step": 870 + }, + { + "epoch": 0.26, + "grad_norm": 5.430627799088825, + "learning_rate": 1.993118942114608e-05, + "loss": 0.1113, + "step": 875 + }, + { + "epoch": 0.26, + "grad_norm": 6.3813002626417905, + "learning_rate": 1.992928301071288e-05, + "loss": 0.1718, + "step": 880 + }, + { + "epoch": 0.26, + "grad_norm": 13.52771259964932, + "learning_rate": 1.9927350644851477e-05, + "loss": 0.1118, + "step": 885 + }, + { + "epoch": 0.26, + "grad_norm": 3.5900299721319406, + "learning_rate": 1.9925392328613644e-05, + "loss": 0.1357, + "step": 890 + }, + { + "epoch": 0.26, + "grad_norm": 10.835472443494583, + "learning_rate": 1.992340806711899e-05, + "loss": 0.0815, + "step": 895 + }, + { + "epoch": 0.26, + "grad_norm": 6.39064559373592, + "learning_rate": 1.992139786555496e-05, + "loss": 0.2278, + "step": 900 + }, + { + "epoch": 0.27, + "grad_norm": 14.873787966804063, + "learning_rate": 1.9919361729176798e-05, + "loss": 0.1245, + "step": 905 + }, + { + "epoch": 0.27, + "grad_norm": 0.8036248310449102, + "learning_rate": 1.991729966330756e-05, + "loss": 0.1297, + "step": 910 + }, + { + "epoch": 0.27, + "grad_norm": 2.7695654260331835, + "learning_rate": 1.991521167333809e-05, + "loss": 0.1493, + "step": 915 + }, + { + "epoch": 0.27, + "grad_norm": 7.413157143500232, + "learning_rate": 1.9913097764727006e-05, + "loss": 0.1712, + "step": 920 + }, + { + "epoch": 0.27, + "grad_norm": 5.35044325659026, + "learning_rate": 1.9910957943000678e-05, + "loss": 0.1923, + "step": 925 + }, + { + "epoch": 0.27, + "grad_norm": 4.939892539272544, + "learning_rate": 1.9908792213753223e-05, + "loss": 0.1262, + "step": 930 + }, + { + "epoch": 0.27, + "grad_norm": 4.425995433503358, + "learning_rate": 1.990660058264649e-05, + "loss": 0.1316, + "step": 935 + }, + { + "epoch": 0.28, + "grad_norm": 6.682173559661954, + "learning_rate": 1.9904383055410045e-05, + "loss": 0.2628, + "step": 940 + }, + { + "epoch": 0.28, + "grad_norm": 19.871499575052912, + "learning_rate": 1.9902139637841146e-05, + "loss": 0.1646, + "step": 945 + }, + { + "epoch": 0.28, + "grad_norm": 6.951640795770302, + "learning_rate": 1.989987033580475e-05, + "loss": 0.1733, + "step": 950 + }, + { + "epoch": 0.28, + "grad_norm": 9.387316535142354, + "learning_rate": 1.989757515523346e-05, + "loss": 0.1448, + "step": 955 + }, + { + "epoch": 0.28, + "grad_norm": 3.9401918928016197, + "learning_rate": 1.9895254102127562e-05, + "loss": 0.1421, + "step": 960 + }, + { + "epoch": 0.28, + "grad_norm": 14.253092112437391, + "learning_rate": 1.989290718255496e-05, + "loss": 0.205, + "step": 965 + }, + { + "epoch": 0.28, + "grad_norm": 3.6398631596257545, + "learning_rate": 1.9890534402651184e-05, + "loss": 0.0899, + "step": 970 + }, + { + "epoch": 0.29, + "grad_norm": 15.630617642490721, + "learning_rate": 1.988813576861938e-05, + "loss": 0.1328, + "step": 975 + }, + { + "epoch": 0.29, + "grad_norm": 6.6456920006850115, + "learning_rate": 1.9885711286730267e-05, + "loss": 0.1899, + "step": 980 + }, + { + "epoch": 0.29, + "grad_norm": 7.585995077745574, + "learning_rate": 1.9883260963322152e-05, + "loss": 0.1583, + "step": 985 + }, + { + "epoch": 0.29, + "grad_norm": 3.476123842146937, + "learning_rate": 1.98807848048009e-05, + "loss": 0.1855, + "step": 990 + }, + { + "epoch": 0.29, + "grad_norm": 4.179578662061332, + "learning_rate": 1.987828281763991e-05, + "loss": 0.1681, + "step": 995 + }, + { + "epoch": 0.29, + "grad_norm": 3.565609561977014, + "learning_rate": 1.9875755008380104e-05, + "loss": 0.1187, + "step": 1000 + }, + { + "epoch": 0.29, + "grad_norm": 9.26279986638738, + "learning_rate": 1.9873201383629913e-05, + "loss": 0.1337, + "step": 1005 + }, + { + "epoch": 0.3, + "grad_norm": 3.569019308801666, + "learning_rate": 1.987062195006526e-05, + "loss": 0.0932, + "step": 1010 + }, + { + "epoch": 0.3, + "grad_norm": 1.8954161789376982, + "learning_rate": 1.986801671442953e-05, + "loss": 0.1272, + "step": 1015 + }, + { + "epoch": 0.3, + "grad_norm": 6.549425452273108, + "learning_rate": 1.986538568353358e-05, + "loss": 0.2543, + "step": 1020 + }, + { + "epoch": 0.3, + "grad_norm": 6.401069146499022, + "learning_rate": 1.9862728864255677e-05, + "loss": 0.1339, + "step": 1025 + }, + { + "epoch": 0.3, + "grad_norm": 14.623867414912695, + "learning_rate": 1.9860046263541537e-05, + "loss": 0.1368, + "step": 1030 + }, + { + "epoch": 0.3, + "grad_norm": 7.5226028629064166, + "learning_rate": 1.9857337888404254e-05, + "loss": 0.1315, + "step": 1035 + }, + { + "epoch": 0.31, + "grad_norm": 6.3020352784733795, + "learning_rate": 1.985460374592431e-05, + "loss": 0.2022, + "step": 1040 + }, + { + "epoch": 0.31, + "grad_norm": 35.72017508640075, + "learning_rate": 1.9851843843249552e-05, + "loss": 0.1907, + "step": 1045 + }, + { + "epoch": 0.31, + "grad_norm": 7.856532164781736, + "learning_rate": 1.9849058187595173e-05, + "loss": 0.1042, + "step": 1050 + }, + { + "epoch": 0.31, + "grad_norm": 4.939914354516286, + "learning_rate": 1.9846246786243682e-05, + "loss": 0.1883, + "step": 1055 + }, + { + "epoch": 0.31, + "grad_norm": 5.847586448522947, + "learning_rate": 1.9843409646544912e-05, + "loss": 0.1352, + "step": 1060 + }, + { + "epoch": 0.31, + "grad_norm": 1.7662920071735315, + "learning_rate": 1.984054677591597e-05, + "loss": 0.1266, + "step": 1065 + }, + { + "epoch": 0.31, + "grad_norm": 5.934002297797303, + "learning_rate": 1.9837658181841236e-05, + "loss": 0.1282, + "step": 1070 + }, + { + "epoch": 0.32, + "grad_norm": 7.322599560603541, + "learning_rate": 1.9834743871872333e-05, + "loss": 0.1002, + "step": 1075 + }, + { + "epoch": 0.32, + "grad_norm": 7.354422877300046, + "learning_rate": 1.9831803853628122e-05, + "loss": 0.1347, + "step": 1080 + }, + { + "epoch": 0.32, + "grad_norm": 4.244910165579806, + "learning_rate": 1.9828838134794668e-05, + "loss": 0.191, + "step": 1085 + }, + { + "epoch": 0.32, + "grad_norm": 2.189412507115152, + "learning_rate": 1.9825846723125222e-05, + "loss": 0.129, + "step": 1090 + }, + { + "epoch": 0.32, + "grad_norm": 4.334446446188146, + "learning_rate": 1.9822829626440213e-05, + "loss": 0.1606, + "step": 1095 + }, + { + "epoch": 0.32, + "grad_norm": 10.79965169839475, + "learning_rate": 1.9819786852627208e-05, + "loss": 0.2085, + "step": 1100 + }, + { + "epoch": 0.32, + "grad_norm": 5.047688232660633, + "learning_rate": 1.9816718409640904e-05, + "loss": 0.1335, + "step": 1105 + }, + { + "epoch": 0.33, + "grad_norm": 3.0521785647450033, + "learning_rate": 1.9813624305503105e-05, + "loss": 0.0902, + "step": 1110 + }, + { + "epoch": 0.33, + "grad_norm": 4.404798119689452, + "learning_rate": 1.9810504548302706e-05, + "loss": 0.1504, + "step": 1115 + }, + { + "epoch": 0.33, + "grad_norm": 4.276958235694841, + "learning_rate": 1.980735914619566e-05, + "loss": 0.1787, + "step": 1120 + }, + { + "epoch": 0.33, + "grad_norm": 6.67294781745391, + "learning_rate": 1.9804188107404973e-05, + "loss": 0.1485, + "step": 1125 + }, + { + "epoch": 0.33, + "grad_norm": 1.4659143223309605, + "learning_rate": 1.9800991440220652e-05, + "loss": 0.1161, + "step": 1130 + }, + { + "epoch": 0.33, + "grad_norm": 12.717469246978801, + "learning_rate": 1.979776915299973e-05, + "loss": 0.1409, + "step": 1135 + }, + { + "epoch": 0.33, + "grad_norm": 7.082211154981896, + "learning_rate": 1.9794521254166197e-05, + "loss": 0.1543, + "step": 1140 + }, + { + "epoch": 0.34, + "grad_norm": 5.796340300790522, + "learning_rate": 1.9791247752211014e-05, + "loss": 0.1151, + "step": 1145 + }, + { + "epoch": 0.34, + "grad_norm": 5.586857817269538, + "learning_rate": 1.978794865569207e-05, + "loss": 0.1199, + "step": 1150 + }, + { + "epoch": 0.34, + "grad_norm": 12.706251296959476, + "learning_rate": 1.9784623973234158e-05, + "loss": 0.1619, + "step": 1155 + }, + { + "epoch": 0.34, + "grad_norm": 6.66191554843005, + "learning_rate": 1.978127371352898e-05, + "loss": 0.1827, + "step": 1160 + }, + { + "epoch": 0.34, + "grad_norm": 1.7314817335468968, + "learning_rate": 1.9777897885335077e-05, + "loss": 0.1299, + "step": 1165 + }, + { + "epoch": 0.34, + "grad_norm": 3.1709913704813957, + "learning_rate": 1.9774496497477863e-05, + "loss": 0.0935, + "step": 1170 + }, + { + "epoch": 0.34, + "grad_norm": 6.802763534426451, + "learning_rate": 1.9771069558849553e-05, + "loss": 0.1747, + "step": 1175 + }, + { + "epoch": 0.35, + "grad_norm": 5.666618159965396, + "learning_rate": 1.9767617078409162e-05, + "loss": 0.1248, + "step": 1180 + }, + { + "epoch": 0.35, + "grad_norm": 12.07309712013722, + "learning_rate": 1.9764139065182485e-05, + "loss": 0.1369, + "step": 1185 + }, + { + "epoch": 0.35, + "grad_norm": 5.0768838627507895, + "learning_rate": 1.976063552826206e-05, + "loss": 0.1426, + "step": 1190 + }, + { + "epoch": 0.35, + "grad_norm": 2.6195923645371915, + "learning_rate": 1.9757106476807156e-05, + "loss": 0.1414, + "step": 1195 + }, + { + "epoch": 0.35, + "grad_norm": 8.745582119219046, + "learning_rate": 1.975355192004374e-05, + "loss": 0.1537, + "step": 1200 + }, + { + "epoch": 0.35, + "grad_norm": 4.3619589308843425, + "learning_rate": 1.9749971867264468e-05, + "loss": 0.2326, + "step": 1205 + }, + { + "epoch": 0.35, + "grad_norm": 9.986127768195958, + "learning_rate": 1.9746366327828637e-05, + "loss": 0.0914, + "step": 1210 + }, + { + "epoch": 0.36, + "grad_norm": 11.679210640348971, + "learning_rate": 1.9742735311162177e-05, + "loss": 0.1284, + "step": 1215 + }, + { + "epoch": 0.36, + "grad_norm": 4.432897195420396, + "learning_rate": 1.973907882675763e-05, + "loss": 0.1578, + "step": 1220 + }, + { + "epoch": 0.36, + "grad_norm": 9.382375435412673, + "learning_rate": 1.973539688417411e-05, + "loss": 0.1637, + "step": 1225 + }, + { + "epoch": 0.36, + "grad_norm": 5.830067001664431, + "learning_rate": 1.973168949303729e-05, + "loss": 0.1468, + "step": 1230 + }, + { + "epoch": 0.36, + "grad_norm": 11.741616362731765, + "learning_rate": 1.9727956663039367e-05, + "loss": 0.1691, + "step": 1235 + }, + { + "epoch": 0.36, + "grad_norm": 7.39202415980895, + "learning_rate": 1.9724198403939053e-05, + "loss": 0.1809, + "step": 1240 + }, + { + "epoch": 0.37, + "grad_norm": 4.693742994361657, + "learning_rate": 1.9720414725561538e-05, + "loss": 0.0858, + "step": 1245 + }, + { + "epoch": 0.37, + "grad_norm": 8.617997574967658, + "learning_rate": 1.9716605637798452e-05, + "loss": 0.1308, + "step": 1250 + }, + { + "epoch": 0.37, + "grad_norm": 2.6119778380512626, + "learning_rate": 1.9712771150607865e-05, + "loss": 0.1205, + "step": 1255 + }, + { + "epoch": 0.37, + "grad_norm": 5.216314969786015, + "learning_rate": 1.9708911274014247e-05, + "loss": 0.1245, + "step": 1260 + }, + { + "epoch": 0.37, + "grad_norm": 5.528129775162225, + "learning_rate": 1.970502601810844e-05, + "loss": 0.1876, + "step": 1265 + }, + { + "epoch": 0.37, + "grad_norm": 5.524231849539429, + "learning_rate": 1.9701115393047636e-05, + "loss": 0.1143, + "step": 1270 + }, + { + "epoch": 0.37, + "grad_norm": 4.229434350974042, + "learning_rate": 1.969717940905535e-05, + "loss": 0.1273, + "step": 1275 + }, + { + "epoch": 0.38, + "grad_norm": 6.514210060225279, + "learning_rate": 1.9693218076421395e-05, + "loss": 0.1034, + "step": 1280 + }, + { + "epoch": 0.38, + "grad_norm": 14.7017638724717, + "learning_rate": 1.9689231405501844e-05, + "loss": 0.1529, + "step": 1285 + }, + { + "epoch": 0.38, + "grad_norm": 4.964386181339109, + "learning_rate": 1.968521940671903e-05, + "loss": 0.1567, + "step": 1290 + }, + { + "epoch": 0.38, + "grad_norm": 5.222923574403755, + "learning_rate": 1.9681182090561467e-05, + "loss": 0.1144, + "step": 1295 + }, + { + "epoch": 0.38, + "grad_norm": 5.523495340651447, + "learning_rate": 1.96771194675839e-05, + "loss": 0.1311, + "step": 1300 + }, + { + "epoch": 0.38, + "grad_norm": 6.064321028150695, + "learning_rate": 1.9673031548407197e-05, + "loss": 0.1282, + "step": 1305 + }, + { + "epoch": 0.38, + "grad_norm": 7.022609980188127, + "learning_rate": 1.9668918343718377e-05, + "loss": 0.1735, + "step": 1310 + }, + { + "epoch": 0.39, + "grad_norm": 5.386703878469319, + "learning_rate": 1.9664779864270553e-05, + "loss": 0.146, + "step": 1315 + }, + { + "epoch": 0.39, + "grad_norm": 2.2817281979534587, + "learning_rate": 1.966061612088292e-05, + "loss": 0.1566, + "step": 1320 + }, + { + "epoch": 0.39, + "grad_norm": 5.647148438986119, + "learning_rate": 1.965642712444072e-05, + "loss": 0.1508, + "step": 1325 + }, + { + "epoch": 0.39, + "grad_norm": 14.636180809770323, + "learning_rate": 1.965221288589521e-05, + "loss": 0.1366, + "step": 1330 + }, + { + "epoch": 0.39, + "grad_norm": 3.796916066642304, + "learning_rate": 1.9647973416263634e-05, + "loss": 0.1562, + "step": 1335 + }, + { + "epoch": 0.39, + "grad_norm": 8.39363234629941, + "learning_rate": 1.964370872662921e-05, + "loss": 0.0938, + "step": 1340 + }, + { + "epoch": 0.39, + "grad_norm": 6.500094779864121, + "learning_rate": 1.963941882814108e-05, + "loss": 0.1746, + "step": 1345 + }, + { + "epoch": 0.4, + "grad_norm": 3.5283001757400054, + "learning_rate": 1.963510373201428e-05, + "loss": 0.1419, + "step": 1350 + }, + { + "epoch": 0.4, + "grad_norm": 2.5908302511117327, + "learning_rate": 1.9630763449529747e-05, + "loss": 0.0663, + "step": 1355 + }, + { + "epoch": 0.4, + "grad_norm": 6.447702267666421, + "learning_rate": 1.962639799203423e-05, + "loss": 0.1506, + "step": 1360 + }, + { + "epoch": 0.4, + "grad_norm": 7.258886744369148, + "learning_rate": 1.962200737094032e-05, + "loss": 0.1705, + "step": 1365 + }, + { + "epoch": 0.4, + "grad_norm": 8.355297998196825, + "learning_rate": 1.9617591597726372e-05, + "loss": 0.2185, + "step": 1370 + }, + { + "epoch": 0.4, + "grad_norm": 11.059705823190226, + "learning_rate": 1.9613150683936513e-05, + "loss": 0.164, + "step": 1375 + }, + { + "epoch": 0.4, + "grad_norm": 7.496281019677461, + "learning_rate": 1.9608684641180584e-05, + "loss": 0.1868, + "step": 1380 + }, + { + "epoch": 0.41, + "grad_norm": 9.337392423896086, + "learning_rate": 1.9604193481134123e-05, + "loss": 0.128, + "step": 1385 + }, + { + "epoch": 0.41, + "grad_norm": 6.741067698323382, + "learning_rate": 1.9599677215538333e-05, + "loss": 0.1304, + "step": 1390 + }, + { + "epoch": 0.41, + "grad_norm": 5.97327894379059, + "learning_rate": 1.959513585620005e-05, + "loss": 0.1129, + "step": 1395 + }, + { + "epoch": 0.41, + "grad_norm": 6.2990643124323, + "learning_rate": 1.9590569414991718e-05, + "loss": 0.2452, + "step": 1400 + }, + { + "epoch": 0.41, + "grad_norm": 3.9874003574274552, + "learning_rate": 1.9585977903851334e-05, + "loss": 0.1288, + "step": 1405 + }, + { + "epoch": 0.41, + "grad_norm": 10.71406937880321, + "learning_rate": 1.9581361334782453e-05, + "loss": 0.1682, + "step": 1410 + }, + { + "epoch": 0.42, + "grad_norm": 3.1321367786011076, + "learning_rate": 1.957671971985414e-05, + "loss": 0.1461, + "step": 1415 + }, + { + "epoch": 0.42, + "grad_norm": 6.601166084621551, + "learning_rate": 1.9572053071200922e-05, + "loss": 0.1642, + "step": 1420 + }, + { + "epoch": 0.42, + "grad_norm": 6.815298083273188, + "learning_rate": 1.9567361401022784e-05, + "loss": 0.2203, + "step": 1425 + }, + { + "epoch": 0.42, + "grad_norm": 8.60817902664583, + "learning_rate": 1.9562644721585123e-05, + "loss": 0.1246, + "step": 1430 + }, + { + "epoch": 0.42, + "grad_norm": 3.607077127205811, + "learning_rate": 1.9557903045218708e-05, + "loss": 0.0977, + "step": 1435 + }, + { + "epoch": 0.42, + "grad_norm": 5.81146788919113, + "learning_rate": 1.955313638431967e-05, + "loss": 0.1038, + "step": 1440 + }, + { + "epoch": 0.42, + "grad_norm": 8.172735748764952, + "learning_rate": 1.954834475134945e-05, + "loss": 0.1653, + "step": 1445 + }, + { + "epoch": 0.43, + "grad_norm": 6.240754575344053, + "learning_rate": 1.9543528158834775e-05, + "loss": 0.1734, + "step": 1450 + }, + { + "epoch": 0.43, + "grad_norm": 5.739038717343691, + "learning_rate": 1.953868661936762e-05, + "loss": 0.1477, + "step": 1455 + }, + { + "epoch": 0.43, + "grad_norm": 3.09005933315755, + "learning_rate": 1.9533820145605184e-05, + "loss": 0.1303, + "step": 1460 + }, + { + "epoch": 0.43, + "grad_norm": 9.269129587273234, + "learning_rate": 1.9528928750269847e-05, + "loss": 0.1188, + "step": 1465 + }, + { + "epoch": 0.43, + "grad_norm": 3.9756210801612446, + "learning_rate": 1.9524012446149144e-05, + "loss": 0.1011, + "step": 1470 + }, + { + "epoch": 0.43, + "grad_norm": 6.014169492139025, + "learning_rate": 1.9519071246095734e-05, + "loss": 0.1843, + "step": 1475 + }, + { + "epoch": 0.43, + "grad_norm": 6.42414136559558, + "learning_rate": 1.951410516302735e-05, + "loss": 0.1429, + "step": 1480 + }, + { + "epoch": 0.44, + "grad_norm": 5.990872434024276, + "learning_rate": 1.950911420992678e-05, + "loss": 0.0871, + "step": 1485 + }, + { + "epoch": 0.44, + "grad_norm": 3.5783586051011795, + "learning_rate": 1.9504098399841835e-05, + "loss": 0.1602, + "step": 1490 + }, + { + "epoch": 0.44, + "grad_norm": 6.318287528571422, + "learning_rate": 1.9499057745885308e-05, + "loss": 0.134, + "step": 1495 + }, + { + "epoch": 0.44, + "grad_norm": 12.51258277827576, + "learning_rate": 1.949399226123493e-05, + "loss": 0.1537, + "step": 1500 + }, + { + "epoch": 0.44, + "grad_norm": 3.886808062730125, + "learning_rate": 1.9488901959133365e-05, + "loss": 0.1997, + "step": 1505 + }, + { + "epoch": 0.44, + "grad_norm": 2.309686134450491, + "learning_rate": 1.9483786852888144e-05, + "loss": 0.1105, + "step": 1510 + }, + { + "epoch": 0.44, + "grad_norm": 4.355694711066838, + "learning_rate": 1.947864695587165e-05, + "loss": 0.1538, + "step": 1515 + }, + { + "epoch": 0.45, + "grad_norm": 2.986939771149088, + "learning_rate": 1.9473482281521063e-05, + "loss": 0.0769, + "step": 1520 + }, + { + "epoch": 0.45, + "grad_norm": 2.8973623579771397, + "learning_rate": 1.946829284333836e-05, + "loss": 0.0987, + "step": 1525 + }, + { + "epoch": 0.45, + "grad_norm": 2.0533675098464874, + "learning_rate": 1.9463078654890242e-05, + "loss": 0.1066, + "step": 1530 + }, + { + "epoch": 0.45, + "grad_norm": 7.8889622513250774, + "learning_rate": 1.945783972980812e-05, + "loss": 0.0989, + "step": 1535 + }, + { + "epoch": 0.45, + "grad_norm": 1.470844560891904, + "learning_rate": 1.945257608178807e-05, + "loss": 0.0536, + "step": 1540 + }, + { + "epoch": 0.45, + "grad_norm": 3.097540950673788, + "learning_rate": 1.9447287724590808e-05, + "loss": 0.1487, + "step": 1545 + }, + { + "epoch": 0.45, + "grad_norm": 2.236294115632501, + "learning_rate": 1.9441974672041636e-05, + "loss": 0.1272, + "step": 1550 + }, + { + "epoch": 0.46, + "grad_norm": 6.08682373802408, + "learning_rate": 1.943663693803043e-05, + "loss": 0.1234, + "step": 1555 + }, + { + "epoch": 0.46, + "grad_norm": 2.659517679298076, + "learning_rate": 1.9431274536511577e-05, + "loss": 0.107, + "step": 1560 + }, + { + "epoch": 0.46, + "grad_norm": 4.176352623298332, + "learning_rate": 1.9425887481503964e-05, + "loss": 0.1275, + "step": 1565 + }, + { + "epoch": 0.46, + "grad_norm": 3.247258717513404, + "learning_rate": 1.9420475787090926e-05, + "loss": 0.1282, + "step": 1570 + }, + { + "epoch": 0.46, + "grad_norm": 3.345767544439658, + "learning_rate": 1.9415039467420207e-05, + "loss": 0.0917, + "step": 1575 + }, + { + "epoch": 0.46, + "grad_norm": 2.368596730771716, + "learning_rate": 1.9409578536703936e-05, + "loss": 0.1262, + "step": 1580 + }, + { + "epoch": 0.46, + "grad_norm": 12.620428787419241, + "learning_rate": 1.9404093009218568e-05, + "loss": 0.1687, + "step": 1585 + }, + { + "epoch": 0.47, + "grad_norm": 2.3476402040970985, + "learning_rate": 1.939858289930489e-05, + "loss": 0.0849, + "step": 1590 + }, + { + "epoch": 0.47, + "grad_norm": 9.157447293999722, + "learning_rate": 1.9393048221367924e-05, + "loss": 0.1322, + "step": 1595 + }, + { + "epoch": 0.47, + "grad_norm": 10.047629795227984, + "learning_rate": 1.9387488989876937e-05, + "loss": 0.1215, + "step": 1600 + }, + { + "epoch": 0.47, + "grad_norm": 4.324110021146038, + "learning_rate": 1.938190521936538e-05, + "loss": 0.1228, + "step": 1605 + }, + { + "epoch": 0.47, + "grad_norm": 2.520014942669844, + "learning_rate": 1.937629692443086e-05, + "loss": 0.1538, + "step": 1610 + }, + { + "epoch": 0.47, + "grad_norm": 5.789832430771725, + "learning_rate": 1.9370664119735096e-05, + "loss": 0.1508, + "step": 1615 + }, + { + "epoch": 0.48, + "grad_norm": 3.248881220822288, + "learning_rate": 1.9365006820003883e-05, + "loss": 0.1051, + "step": 1620 + }, + { + "epoch": 0.48, + "grad_norm": 3.1423550675382623, + "learning_rate": 1.935932504002705e-05, + "loss": 0.0786, + "step": 1625 + }, + { + "epoch": 0.48, + "grad_norm": 4.9862187243188645, + "learning_rate": 1.935361879465843e-05, + "loss": 0.089, + "step": 1630 + }, + { + "epoch": 0.48, + "grad_norm": 1.9635393731249728, + "learning_rate": 1.9347888098815814e-05, + "loss": 0.0699, + "step": 1635 + }, + { + "epoch": 0.48, + "grad_norm": 2.1963931644611914, + "learning_rate": 1.9342132967480914e-05, + "loss": 0.1087, + "step": 1640 + }, + { + "epoch": 0.48, + "grad_norm": 2.689006294580755, + "learning_rate": 1.9336353415699316e-05, + "loss": 0.0622, + "step": 1645 + }, + { + "epoch": 0.48, + "grad_norm": 2.1154762474968005, + "learning_rate": 1.933054945858046e-05, + "loss": 0.0961, + "step": 1650 + }, + { + "epoch": 0.49, + "grad_norm": 6.7523638506838095, + "learning_rate": 1.932472111129758e-05, + "loss": 0.1685, + "step": 1655 + }, + { + "epoch": 0.49, + "grad_norm": 1.5984988800226272, + "learning_rate": 1.931886838908768e-05, + "loss": 0.1272, + "step": 1660 + }, + { + "epoch": 0.49, + "grad_norm": 4.090521085044347, + "learning_rate": 1.9312991307251476e-05, + "loss": 0.0738, + "step": 1665 + }, + { + "epoch": 0.49, + "grad_norm": 3.0964168267397385, + "learning_rate": 1.9307089881153383e-05, + "loss": 0.095, + "step": 1670 + }, + { + "epoch": 0.49, + "grad_norm": 1.7867271806779057, + "learning_rate": 1.9301164126221444e-05, + "loss": 0.1104, + "step": 1675 + }, + { + "epoch": 0.49, + "grad_norm": 2.565341397593321, + "learning_rate": 1.929521405794732e-05, + "loss": 0.1075, + "step": 1680 + }, + { + "epoch": 0.49, + "grad_norm": 0.7933305187602832, + "learning_rate": 1.9289239691886213e-05, + "loss": 0.0703, + "step": 1685 + }, + { + "epoch": 0.5, + "grad_norm": 2.7412621656740184, + "learning_rate": 1.9283241043656865e-05, + "loss": 0.1091, + "step": 1690 + }, + { + "epoch": 0.5, + "grad_norm": 3.6919637269194117, + "learning_rate": 1.9277218128941493e-05, + "loss": 0.0399, + "step": 1695 + }, + { + "epoch": 0.5, + "grad_norm": 3.3309770998366575, + "learning_rate": 1.927117096348575e-05, + "loss": 0.1115, + "step": 1700 + }, + { + "epoch": 0.5, + "grad_norm": 3.120277651390199, + "learning_rate": 1.9265099563098698e-05, + "loss": 0.1292, + "step": 1705 + }, + { + "epoch": 0.5, + "grad_norm": 3.567470496202032, + "learning_rate": 1.9259003943652743e-05, + "loss": 0.1023, + "step": 1710 + }, + { + "epoch": 0.5, + "grad_norm": 3.1015901732471365, + "learning_rate": 1.9252884121083613e-05, + "loss": 0.098, + "step": 1715 + }, + { + "epoch": 0.5, + "grad_norm": 3.3435838604589216, + "learning_rate": 1.924674011139031e-05, + "loss": 0.1034, + "step": 1720 + }, + { + "epoch": 0.51, + "grad_norm": 2.751303537238706, + "learning_rate": 1.924057193063507e-05, + "loss": 0.0751, + "step": 1725 + }, + { + "epoch": 0.51, + "grad_norm": 2.8435568272302034, + "learning_rate": 1.923437959494331e-05, + "loss": 0.0938, + "step": 1730 + }, + { + "epoch": 0.51, + "grad_norm": 3.491687713292557, + "learning_rate": 1.9228163120503612e-05, + "loss": 0.126, + "step": 1735 + }, + { + "epoch": 0.51, + "grad_norm": 2.065971227764955, + "learning_rate": 1.9221922523567643e-05, + "loss": 0.0992, + "step": 1740 + }, + { + "epoch": 0.51, + "grad_norm": 2.291836460643561, + "learning_rate": 1.9215657820450152e-05, + "loss": 0.1169, + "step": 1745 + }, + { + "epoch": 0.51, + "grad_norm": 6.805536473392368, + "learning_rate": 1.92093690275289e-05, + "loss": 0.1301, + "step": 1750 + }, + { + "epoch": 0.51, + "grad_norm": 2.1824217570154456, + "learning_rate": 1.920305616124462e-05, + "loss": 0.1125, + "step": 1755 + }, + { + "epoch": 0.52, + "grad_norm": 14.726348017337305, + "learning_rate": 1.9196719238100993e-05, + "loss": 0.1292, + "step": 1760 + }, + { + "epoch": 0.52, + "grad_norm": 4.930978645236691, + "learning_rate": 1.9190358274664586e-05, + "loss": 0.1418, + "step": 1765 + }, + { + "epoch": 0.52, + "grad_norm": 1.5342337673472881, + "learning_rate": 1.9183973287564806e-05, + "loss": 0.139, + "step": 1770 + }, + { + "epoch": 0.52, + "grad_norm": 2.427844265256046, + "learning_rate": 1.9177564293493876e-05, + "loss": 0.083, + "step": 1775 + }, + { + "epoch": 0.52, + "grad_norm": 2.1313388749998086, + "learning_rate": 1.9171131309206777e-05, + "loss": 0.1207, + "step": 1780 + }, + { + "epoch": 0.52, + "grad_norm": 3.126693266841522, + "learning_rate": 1.9164674351521203e-05, + "loss": 0.0963, + "step": 1785 + }, + { + "epoch": 0.53, + "grad_norm": 3.175825693492831, + "learning_rate": 1.9158193437317527e-05, + "loss": 0.1776, + "step": 1790 + }, + { + "epoch": 0.53, + "grad_norm": 2.9113721112544906, + "learning_rate": 1.9151688583538753e-05, + "loss": 0.1217, + "step": 1795 + }, + { + "epoch": 0.53, + "grad_norm": 3.1166239064049113, + "learning_rate": 1.9145159807190458e-05, + "loss": 0.0507, + "step": 1800 + }, + { + "epoch": 0.53, + "grad_norm": 8.142997644880616, + "learning_rate": 1.9138607125340777e-05, + "loss": 0.1586, + "step": 1805 + }, + { + "epoch": 0.53, + "grad_norm": 1.8022906748245269, + "learning_rate": 1.913203055512033e-05, + "loss": 0.1093, + "step": 1810 + }, + { + "epoch": 0.53, + "grad_norm": 1.8417345443000441, + "learning_rate": 1.9125430113722186e-05, + "loss": 0.1023, + "step": 1815 + }, + { + "epoch": 0.53, + "grad_norm": 3.399632737014696, + "learning_rate": 1.9118805818401825e-05, + "loss": 0.1, + "step": 1820 + }, + { + "epoch": 0.54, + "grad_norm": 2.709759019082288, + "learning_rate": 1.9112157686477092e-05, + "loss": 0.108, + "step": 1825 + }, + { + "epoch": 0.54, + "grad_norm": 2.8287642794394894, + "learning_rate": 1.910548573532814e-05, + "loss": 0.1119, + "step": 1830 + }, + { + "epoch": 0.54, + "grad_norm": 0.9463293108337878, + "learning_rate": 1.90987899823974e-05, + "loss": 0.106, + "step": 1835 + }, + { + "epoch": 0.54, + "grad_norm": 4.02788077621015, + "learning_rate": 1.9092070445189513e-05, + "loss": 0.1223, + "step": 1840 + }, + { + "epoch": 0.54, + "grad_norm": 6.586862447096695, + "learning_rate": 1.9085327141271325e-05, + "loss": 0.1612, + "step": 1845 + }, + { + "epoch": 0.54, + "grad_norm": 6.365460607459384, + "learning_rate": 1.907856008827178e-05, + "loss": 0.1469, + "step": 1850 + }, + { + "epoch": 0.54, + "grad_norm": 5.186222516601001, + "learning_rate": 1.907176930388195e-05, + "loss": 0.1176, + "step": 1855 + }, + { + "epoch": 0.55, + "grad_norm": 0.10245286406524089, + "learning_rate": 1.906495480585491e-05, + "loss": 0.0928, + "step": 1860 + }, + { + "epoch": 0.55, + "grad_norm": 2.469663523408353, + "learning_rate": 1.9058116612005757e-05, + "loss": 0.095, + "step": 1865 + }, + { + "epoch": 0.55, + "grad_norm": 2.7503837837835223, + "learning_rate": 1.905125474021152e-05, + "loss": 0.1103, + "step": 1870 + }, + { + "epoch": 0.55, + "grad_norm": 3.6629482112205047, + "learning_rate": 1.9044369208411127e-05, + "loss": 0.0769, + "step": 1875 + }, + { + "epoch": 0.55, + "grad_norm": 3.6257303751767043, + "learning_rate": 1.903746003460538e-05, + "loss": 0.1188, + "step": 1880 + }, + { + "epoch": 0.55, + "grad_norm": 3.1074798789565987, + "learning_rate": 1.9030527236856867e-05, + "loss": 0.0771, + "step": 1885 + }, + { + "epoch": 0.55, + "grad_norm": 1.9930789523758066, + "learning_rate": 1.9023570833289946e-05, + "loss": 0.1227, + "step": 1890 + }, + { + "epoch": 0.56, + "grad_norm": 5.3774295018042775, + "learning_rate": 1.9016590842090682e-05, + "loss": 0.1089, + "step": 1895 + }, + { + "epoch": 0.56, + "grad_norm": 3.988587549835179, + "learning_rate": 1.9009587281506815e-05, + "loss": 0.1095, + "step": 1900 + }, + { + "epoch": 0.56, + "grad_norm": 2.4025900613693385, + "learning_rate": 1.9002560169847688e-05, + "loss": 0.0744, + "step": 1905 + }, + { + "epoch": 0.56, + "grad_norm": 4.962479191119096, + "learning_rate": 1.8995509525484227e-05, + "loss": 0.1113, + "step": 1910 + }, + { + "epoch": 0.56, + "grad_norm": 3.222489638212013, + "learning_rate": 1.8988435366848867e-05, + "loss": 0.1122, + "step": 1915 + }, + { + "epoch": 0.56, + "grad_norm": 6.910634577936775, + "learning_rate": 1.8981337712435528e-05, + "loss": 0.1357, + "step": 1920 + }, + { + "epoch": 0.56, + "grad_norm": 2.770074917437753, + "learning_rate": 1.897421658079955e-05, + "loss": 0.1246, + "step": 1925 + }, + { + "epoch": 0.57, + "grad_norm": 1.9251395311195432, + "learning_rate": 1.8967071990557643e-05, + "loss": 0.1159, + "step": 1930 + }, + { + "epoch": 0.57, + "grad_norm": 5.562401417062251, + "learning_rate": 1.8959903960387852e-05, + "loss": 0.0945, + "step": 1935 + }, + { + "epoch": 0.57, + "grad_norm": 2.5785513785382435, + "learning_rate": 1.89527125090295e-05, + "loss": 0.1166, + "step": 1940 + }, + { + "epoch": 0.57, + "grad_norm": 2.3414558881424488, + "learning_rate": 1.8945497655283142e-05, + "loss": 0.1071, + "step": 1945 + }, + { + "epoch": 0.57, + "grad_norm": 3.8387053345665754, + "learning_rate": 1.8938259418010504e-05, + "loss": 0.078, + "step": 1950 + }, + { + "epoch": 0.57, + "grad_norm": 2.503163339080756, + "learning_rate": 1.8930997816134457e-05, + "loss": 0.1155, + "step": 1955 + }, + { + "epoch": 0.57, + "grad_norm": 3.006961850846919, + "learning_rate": 1.892371286863894e-05, + "loss": 0.1433, + "step": 1960 + }, + { + "epoch": 0.58, + "grad_norm": 2.509118046318532, + "learning_rate": 1.8916404594568934e-05, + "loss": 0.0889, + "step": 1965 + }, + { + "epoch": 0.58, + "grad_norm": 2.061207568655511, + "learning_rate": 1.8909073013030404e-05, + "loss": 0.1235, + "step": 1970 + }, + { + "epoch": 0.58, + "grad_norm": 2.1278253733682098, + "learning_rate": 1.8901718143190234e-05, + "loss": 0.0903, + "step": 1975 + }, + { + "epoch": 0.58, + "grad_norm": 2.546231615677142, + "learning_rate": 1.8894340004276208e-05, + "loss": 0.0992, + "step": 1980 + }, + { + "epoch": 0.58, + "grad_norm": 5.884018165275079, + "learning_rate": 1.8886938615576926e-05, + "loss": 0.1213, + "step": 1985 + }, + { + "epoch": 0.58, + "grad_norm": 3.048907575238015, + "learning_rate": 1.887951399644178e-05, + "loss": 0.076, + "step": 1990 + }, + { + "epoch": 0.59, + "grad_norm": 3.326277815721477, + "learning_rate": 1.8872066166280898e-05, + "loss": 0.097, + "step": 1995 + }, + { + "epoch": 0.59, + "grad_norm": 3.033210042047812, + "learning_rate": 1.8864595144565067e-05, + "loss": 0.1589, + "step": 2000 + }, + { + "epoch": 0.59, + "grad_norm": 3.2824483953691823, + "learning_rate": 1.8857100950825725e-05, + "loss": 0.1037, + "step": 2005 + }, + { + "epoch": 0.59, + "grad_norm": 3.4944060444975196, + "learning_rate": 1.8849583604654883e-05, + "loss": 0.1102, + "step": 2010 + }, + { + "epoch": 0.59, + "grad_norm": 3.5842552477679854, + "learning_rate": 1.8842043125705074e-05, + "loss": 0.0704, + "step": 2015 + }, + { + "epoch": 0.59, + "grad_norm": 2.5453422006053277, + "learning_rate": 1.883447953368931e-05, + "loss": 0.0902, + "step": 2020 + }, + { + "epoch": 0.59, + "grad_norm": 2.05893040027048, + "learning_rate": 1.8826892848381026e-05, + "loss": 0.1236, + "step": 2025 + }, + { + "epoch": 0.6, + "grad_norm": 2.5436602939843653, + "learning_rate": 1.881928308961403e-05, + "loss": 0.1127, + "step": 2030 + }, + { + "epoch": 0.6, + "grad_norm": 5.148596800219499, + "learning_rate": 1.8811650277282457e-05, + "loss": 0.1554, + "step": 2035 + }, + { + "epoch": 0.6, + "grad_norm": 3.4114441072977875, + "learning_rate": 1.88039944313407e-05, + "loss": 0.1361, + "step": 2040 + }, + { + "epoch": 0.6, + "grad_norm": 2.8127938011148865, + "learning_rate": 1.8796315571803373e-05, + "loss": 0.0995, + "step": 2045 + }, + { + "epoch": 0.6, + "grad_norm": 3.9625421467654545, + "learning_rate": 1.8788613718745258e-05, + "loss": 0.1007, + "step": 2050 + }, + { + "epoch": 0.6, + "grad_norm": 1.7804274416555144, + "learning_rate": 1.8780888892301246e-05, + "loss": 0.0831, + "step": 2055 + }, + { + "epoch": 0.6, + "grad_norm": 3.0212657925844457, + "learning_rate": 1.8773141112666282e-05, + "loss": 0.0983, + "step": 2060 + }, + { + "epoch": 0.61, + "grad_norm": 2.1666245700952618, + "learning_rate": 1.876537040009533e-05, + "loss": 0.1188, + "step": 2065 + }, + { + "epoch": 0.61, + "grad_norm": 1.3595918504356805, + "learning_rate": 1.8757576774903293e-05, + "loss": 0.0847, + "step": 2070 + }, + { + "epoch": 0.61, + "grad_norm": 3.125961613522519, + "learning_rate": 1.8749760257464987e-05, + "loss": 0.1239, + "step": 2075 + }, + { + "epoch": 0.61, + "grad_norm": 3.7003035601248655, + "learning_rate": 1.874192086821506e-05, + "loss": 0.1409, + "step": 2080 + }, + { + "epoch": 0.61, + "grad_norm": 2.379395082357442, + "learning_rate": 1.8734058627647974e-05, + "loss": 0.0724, + "step": 2085 + }, + { + "epoch": 0.61, + "grad_norm": 6.950334653412999, + "learning_rate": 1.872617355631791e-05, + "loss": 0.1478, + "step": 2090 + }, + { + "epoch": 0.61, + "grad_norm": 3.242450014080562, + "learning_rate": 1.871826567483875e-05, + "loss": 0.099, + "step": 2095 + }, + { + "epoch": 0.62, + "grad_norm": 3.6369097830549175, + "learning_rate": 1.8710335003884e-05, + "loss": 0.0874, + "step": 2100 + }, + { + "epoch": 0.62, + "grad_norm": 2.3740836935740575, + "learning_rate": 1.8702381564186752e-05, + "loss": 0.1088, + "step": 2105 + }, + { + "epoch": 0.62, + "grad_norm": 4.29882368809403, + "learning_rate": 1.8694405376539612e-05, + "loss": 0.1358, + "step": 2110 + }, + { + "epoch": 0.62, + "grad_norm": 1.233884641572376, + "learning_rate": 1.8686406461794663e-05, + "loss": 0.0848, + "step": 2115 + }, + { + "epoch": 0.62, + "grad_norm": 5.10093491156728, + "learning_rate": 1.86783848408634e-05, + "loss": 0.1664, + "step": 2120 + }, + { + "epoch": 0.62, + "grad_norm": 2.6372062628013366, + "learning_rate": 1.867034053471669e-05, + "loss": 0.0864, + "step": 2125 + }, + { + "epoch": 0.62, + "grad_norm": 1.9131502429850842, + "learning_rate": 1.8662273564384685e-05, + "loss": 0.0712, + "step": 2130 + }, + { + "epoch": 0.63, + "grad_norm": 2.2716382434584665, + "learning_rate": 1.8654183950956807e-05, + "loss": 0.1098, + "step": 2135 + }, + { + "epoch": 0.63, + "grad_norm": 2.980703122184445, + "learning_rate": 1.864607171558166e-05, + "loss": 0.1524, + "step": 2140 + }, + { + "epoch": 0.63, + "grad_norm": 2.6693066304605497, + "learning_rate": 1.863793687946699e-05, + "loss": 0.1263, + "step": 2145 + }, + { + "epoch": 0.63, + "grad_norm": 1.8713460923305671, + "learning_rate": 1.862977946387964e-05, + "loss": 0.1043, + "step": 2150 + }, + { + "epoch": 0.63, + "grad_norm": 2.173402756633364, + "learning_rate": 1.862159949014547e-05, + "loss": 0.1268, + "step": 2155 + }, + { + "epoch": 0.63, + "grad_norm": 2.1115579118349936, + "learning_rate": 1.861339697964932e-05, + "loss": 0.0871, + "step": 2160 + }, + { + "epoch": 0.64, + "grad_norm": 0.4714286720726806, + "learning_rate": 1.860517195383495e-05, + "loss": 0.1029, + "step": 2165 + }, + { + "epoch": 0.64, + "grad_norm": 2.133585471909289, + "learning_rate": 1.8596924434204963e-05, + "loss": 0.0858, + "step": 2170 + }, + { + "epoch": 0.64, + "grad_norm": 1.7496615491285803, + "learning_rate": 1.8588654442320796e-05, + "loss": 0.1081, + "step": 2175 + }, + { + "epoch": 0.64, + "grad_norm": 4.814425987182643, + "learning_rate": 1.8580361999802606e-05, + "loss": 0.1179, + "step": 2180 + }, + { + "epoch": 0.64, + "grad_norm": 1.9816374631526001, + "learning_rate": 1.8572047128329272e-05, + "loss": 0.1062, + "step": 2185 + }, + { + "epoch": 0.64, + "grad_norm": 6.646969625340206, + "learning_rate": 1.8563709849638286e-05, + "loss": 0.1477, + "step": 2190 + }, + { + "epoch": 0.64, + "grad_norm": 2.374535766203674, + "learning_rate": 1.8555350185525723e-05, + "loss": 0.1142, + "step": 2195 + }, + { + "epoch": 0.65, + "grad_norm": 3.365883417577783, + "learning_rate": 1.8546968157846195e-05, + "loss": 0.124, + "step": 2200 + }, + { + "epoch": 0.65, + "grad_norm": 2.119464733459411, + "learning_rate": 1.8538563788512757e-05, + "loss": 0.0861, + "step": 2205 + }, + { + "epoch": 0.65, + "grad_norm": 3.4982979586919245, + "learning_rate": 1.8530137099496886e-05, + "loss": 0.1153, + "step": 2210 + }, + { + "epoch": 0.65, + "grad_norm": 2.0143848714607326, + "learning_rate": 1.852168811282841e-05, + "loss": 0.0957, + "step": 2215 + }, + { + "epoch": 0.65, + "grad_norm": 5.713010148322335, + "learning_rate": 1.8513216850595434e-05, + "loss": 0.106, + "step": 2220 + }, + { + "epoch": 0.65, + "grad_norm": 3.66380234305124, + "learning_rate": 1.850472333494432e-05, + "loss": 0.1011, + "step": 2225 + }, + { + "epoch": 0.65, + "grad_norm": 2.082824212489925, + "learning_rate": 1.849620758807959e-05, + "loss": 0.1106, + "step": 2230 + }, + { + "epoch": 0.66, + "grad_norm": 3.1172243656282594, + "learning_rate": 1.8487669632263892e-05, + "loss": 0.1099, + "step": 2235 + }, + { + "epoch": 0.66, + "grad_norm": 3.8750958584649853, + "learning_rate": 1.8479109489817935e-05, + "loss": 0.0927, + "step": 2240 + }, + { + "epoch": 0.66, + "grad_norm": 2.658381425986548, + "learning_rate": 1.8470527183120425e-05, + "loss": 0.0768, + "step": 2245 + }, + { + "epoch": 0.66, + "grad_norm": 4.454354742139791, + "learning_rate": 1.8461922734608016e-05, + "loss": 0.0906, + "step": 2250 + }, + { + "epoch": 0.66, + "grad_norm": 3.7643868717903866, + "learning_rate": 1.845329616677525e-05, + "loss": 0.0937, + "step": 2255 + }, + { + "epoch": 0.66, + "grad_norm": 3.680014043034835, + "learning_rate": 1.8444647502174492e-05, + "loss": 0.087, + "step": 2260 + }, + { + "epoch": 0.66, + "grad_norm": 3.3837653881747585, + "learning_rate": 1.843597676341587e-05, + "loss": 0.0916, + "step": 2265 + }, + { + "epoch": 0.67, + "grad_norm": 2.121221324252522, + "learning_rate": 1.8427283973167225e-05, + "loss": 0.1221, + "step": 2270 + }, + { + "epoch": 0.67, + "grad_norm": 1.0325272482169887, + "learning_rate": 1.841856915415405e-05, + "loss": 0.0874, + "step": 2275 + }, + { + "epoch": 0.67, + "grad_norm": 2.0795962066445566, + "learning_rate": 1.840983232915942e-05, + "loss": 0.0741, + "step": 2280 + }, + { + "epoch": 0.67, + "grad_norm": 4.460588805671975, + "learning_rate": 1.840107352102395e-05, + "loss": 0.1488, + "step": 2285 + }, + { + "epoch": 0.67, + "grad_norm": 2.974661793437929, + "learning_rate": 1.839229275264572e-05, + "loss": 0.093, + "step": 2290 + }, + { + "epoch": 0.67, + "grad_norm": 3.137347736691432, + "learning_rate": 1.8383490046980212e-05, + "loss": 0.1, + "step": 2295 + }, + { + "epoch": 0.67, + "grad_norm": 7.90074357987843, + "learning_rate": 1.8374665427040276e-05, + "loss": 0.1362, + "step": 2300 + }, + { + "epoch": 0.68, + "grad_norm": 2.4540688223728826, + "learning_rate": 1.836581891589604e-05, + "loss": 0.1124, + "step": 2305 + }, + { + "epoch": 0.68, + "grad_norm": 4.638147996733137, + "learning_rate": 1.8356950536674858e-05, + "loss": 0.1031, + "step": 2310 + }, + { + "epoch": 0.68, + "grad_norm": 2.2696169707348544, + "learning_rate": 1.834806031256127e-05, + "loss": 0.0965, + "step": 2315 + }, + { + "epoch": 0.68, + "grad_norm": 6.130234330693954, + "learning_rate": 1.833914826679691e-05, + "loss": 0.079, + "step": 2320 + }, + { + "epoch": 0.68, + "grad_norm": 1.8781015810402948, + "learning_rate": 1.8330214422680467e-05, + "loss": 0.0791, + "step": 2325 + }, + { + "epoch": 0.68, + "grad_norm": 1.4069832479622444, + "learning_rate": 1.8321258803567613e-05, + "loss": 0.0831, + "step": 2330 + }, + { + "epoch": 0.68, + "grad_norm": 3.4630544831332664, + "learning_rate": 1.831228143287096e-05, + "loss": 0.1616, + "step": 2335 + }, + { + "epoch": 0.69, + "grad_norm": 2.9442217600685137, + "learning_rate": 1.8303282334059957e-05, + "loss": 0.1199, + "step": 2340 + }, + { + "epoch": 0.69, + "grad_norm": 2.0954935585893257, + "learning_rate": 1.8294261530660885e-05, + "loss": 0.1302, + "step": 2345 + }, + { + "epoch": 0.69, + "grad_norm": 4.890904817711451, + "learning_rate": 1.8285219046256758e-05, + "loss": 0.1025, + "step": 2350 + }, + { + "epoch": 0.69, + "grad_norm": 2.583755171594856, + "learning_rate": 1.8276154904487264e-05, + "loss": 0.1043, + "step": 2355 + }, + { + "epoch": 0.69, + "grad_norm": 8.260345907657872, + "learning_rate": 1.8267069129048707e-05, + "loss": 0.1782, + "step": 2360 + }, + { + "epoch": 0.69, + "grad_norm": 4.771965836395477, + "learning_rate": 1.8257961743693962e-05, + "loss": 0.0862, + "step": 2365 + }, + { + "epoch": 0.7, + "grad_norm": 2.554197211858201, + "learning_rate": 1.8248832772232394e-05, + "loss": 0.0851, + "step": 2370 + }, + { + "epoch": 0.7, + "grad_norm": 2.16812655587689, + "learning_rate": 1.8239682238529792e-05, + "loss": 0.0938, + "step": 2375 + }, + { + "epoch": 0.7, + "grad_norm": 3.3012819170031853, + "learning_rate": 1.8230510166508322e-05, + "loss": 0.0769, + "step": 2380 + }, + { + "epoch": 0.7, + "grad_norm": 1.998340872187495, + "learning_rate": 1.822131658014646e-05, + "loss": 0.0735, + "step": 2385 + }, + { + "epoch": 0.7, + "grad_norm": 1.9806358974129312, + "learning_rate": 1.8212101503478916e-05, + "loss": 0.14, + "step": 2390 + }, + { + "epoch": 0.7, + "grad_norm": 6.99507540091613, + "learning_rate": 1.8202864960596592e-05, + "loss": 0.0944, + "step": 2395 + }, + { + "epoch": 0.7, + "grad_norm": 2.4015686273341545, + "learning_rate": 1.8193606975646506e-05, + "loss": 0.0677, + "step": 2400 + }, + { + "epoch": 0.71, + "grad_norm": 3.3364791966330962, + "learning_rate": 1.8184327572831738e-05, + "loss": 0.0829, + "step": 2405 + }, + { + "epoch": 0.71, + "grad_norm": 6.874850462240894, + "learning_rate": 1.817502677641134e-05, + "loss": 0.1419, + "step": 2410 + }, + { + "epoch": 0.71, + "grad_norm": 2.6075283986023665, + "learning_rate": 1.8165704610700315e-05, + "loss": 0.1117, + "step": 2415 + }, + { + "epoch": 0.71, + "grad_norm": 2.148518397802211, + "learning_rate": 1.8156361100069524e-05, + "loss": 0.101, + "step": 2420 + }, + { + "epoch": 0.71, + "grad_norm": 3.4560468231137564, + "learning_rate": 1.8146996268945632e-05, + "loss": 0.0966, + "step": 2425 + }, + { + "epoch": 0.71, + "grad_norm": 2.384974065195782, + "learning_rate": 1.8137610141811037e-05, + "loss": 0.122, + "step": 2430 + }, + { + "epoch": 0.71, + "grad_norm": 2.362869907867881, + "learning_rate": 1.812820274320381e-05, + "loss": 0.1132, + "step": 2435 + }, + { + "epoch": 0.72, + "grad_norm": 6.9320506095293455, + "learning_rate": 1.811877409771764e-05, + "loss": 0.1524, + "step": 2440 + }, + { + "epoch": 0.72, + "grad_norm": 4.720326949611697, + "learning_rate": 1.8109324230001756e-05, + "loss": 0.1301, + "step": 2445 + }, + { + "epoch": 0.72, + "grad_norm": 2.1399702198416413, + "learning_rate": 1.8099853164760865e-05, + "loss": 0.0889, + "step": 2450 + }, + { + "epoch": 0.72, + "grad_norm": 6.7467640785050325, + "learning_rate": 1.80903609267551e-05, + "loss": 0.099, + "step": 2455 + }, + { + "epoch": 0.72, + "grad_norm": 2.325513463081622, + "learning_rate": 1.8080847540799942e-05, + "loss": 0.1064, + "step": 2460 + }, + { + "epoch": 0.72, + "grad_norm": 2.177610658971863, + "learning_rate": 1.8071313031766148e-05, + "loss": 0.0658, + "step": 2465 + }, + { + "epoch": 0.72, + "grad_norm": 4.977562149709799, + "learning_rate": 1.8061757424579716e-05, + "loss": 0.1207, + "step": 2470 + }, + { + "epoch": 0.73, + "grad_norm": 2.1681517377729382, + "learning_rate": 1.8052180744221784e-05, + "loss": 0.1197, + "step": 2475 + }, + { + "epoch": 0.73, + "grad_norm": 2.7090062894808002, + "learning_rate": 1.8042583015728598e-05, + "loss": 0.0792, + "step": 2480 + }, + { + "epoch": 0.73, + "grad_norm": 4.747077846148363, + "learning_rate": 1.8032964264191402e-05, + "loss": 0.1143, + "step": 2485 + }, + { + "epoch": 0.73, + "grad_norm": 2.74552501361813, + "learning_rate": 1.8023324514756436e-05, + "loss": 0.1265, + "step": 2490 + }, + { + "epoch": 0.73, + "grad_norm": 4.1581416593625296, + "learning_rate": 1.801366379262481e-05, + "loss": 0.072, + "step": 2495 + }, + { + "epoch": 0.73, + "grad_norm": 1.6406189691341908, + "learning_rate": 1.8003982123052474e-05, + "loss": 0.0814, + "step": 2500 + }, + { + "epoch": 0.73, + "grad_norm": 2.712884736792791, + "learning_rate": 1.7994279531350135e-05, + "loss": 0.0973, + "step": 2505 + }, + { + "epoch": 0.74, + "grad_norm": 1.204869182656131, + "learning_rate": 1.7984556042883195e-05, + "loss": 0.0725, + "step": 2510 + }, + { + "epoch": 0.74, + "grad_norm": 1.63571544112928, + "learning_rate": 1.7974811683071688e-05, + "loss": 0.1416, + "step": 2515 + }, + { + "epoch": 0.74, + "grad_norm": 0.8179280822029832, + "learning_rate": 1.7965046477390223e-05, + "loss": 0.08, + "step": 2520 + }, + { + "epoch": 0.74, + "grad_norm": 3.1456845650432697, + "learning_rate": 1.7955260451367887e-05, + "loss": 0.0939, + "step": 2525 + }, + { + "epoch": 0.74, + "grad_norm": 3.068511046140001, + "learning_rate": 1.7945453630588214e-05, + "loss": 0.074, + "step": 2530 + }, + { + "epoch": 0.74, + "grad_norm": 1.8530528834496558, + "learning_rate": 1.7935626040689087e-05, + "loss": 0.1254, + "step": 2535 + }, + { + "epoch": 0.75, + "grad_norm": 9.466013698713851, + "learning_rate": 1.7925777707362694e-05, + "loss": 0.1031, + "step": 2540 + }, + { + "epoch": 0.75, + "grad_norm": 2.8626798281758683, + "learning_rate": 1.791590865635546e-05, + "loss": 0.0906, + "step": 2545 + }, + { + "epoch": 0.75, + "grad_norm": 5.788311578849042, + "learning_rate": 1.7906018913467957e-05, + "loss": 0.1191, + "step": 2550 + }, + { + "epoch": 0.75, + "grad_norm": 15.34485630001676, + "learning_rate": 1.7896108504554858e-05, + "loss": 0.1703, + "step": 2555 + }, + { + "epoch": 0.75, + "grad_norm": 1.878644897302772, + "learning_rate": 1.7886177455524865e-05, + "loss": 0.0978, + "step": 2560 + }, + { + "epoch": 0.75, + "grad_norm": 3.9594573315714197, + "learning_rate": 1.7876225792340635e-05, + "loss": 0.1066, + "step": 2565 + }, + { + "epoch": 0.75, + "grad_norm": 6.412819373671454, + "learning_rate": 1.786625354101872e-05, + "loss": 0.1204, + "step": 2570 + }, + { + "epoch": 0.76, + "grad_norm": 9.689461195761101, + "learning_rate": 1.7856260727629495e-05, + "loss": 0.1137, + "step": 2575 + }, + { + "epoch": 0.76, + "grad_norm": 3.5251907080608054, + "learning_rate": 1.784624737829709e-05, + "loss": 0.1519, + "step": 2580 + }, + { + "epoch": 0.76, + "grad_norm": 2.735467659755077, + "learning_rate": 1.783621351919932e-05, + "loss": 0.0956, + "step": 2585 + }, + { + "epoch": 0.76, + "grad_norm": 2.579970560423387, + "learning_rate": 1.7826159176567616e-05, + "loss": 0.0965, + "step": 2590 + }, + { + "epoch": 0.76, + "grad_norm": 5.7447980487012185, + "learning_rate": 1.781608437668697e-05, + "loss": 0.1355, + "step": 2595 + }, + { + "epoch": 0.76, + "grad_norm": 2.746235872283559, + "learning_rate": 1.7805989145895847e-05, + "loss": 0.0879, + "step": 2600 + }, + { + "epoch": 0.76, + "grad_norm": 3.6558387720713794, + "learning_rate": 1.779587351058612e-05, + "loss": 0.1266, + "step": 2605 + }, + { + "epoch": 0.77, + "grad_norm": 2.79558623046737, + "learning_rate": 1.7785737497203013e-05, + "loss": 0.1805, + "step": 2610 + }, + { + "epoch": 0.77, + "grad_norm": 2.9429418600589123, + "learning_rate": 1.7775581132245026e-05, + "loss": 0.1069, + "step": 2615 + }, + { + "epoch": 0.77, + "grad_norm": 1.5095627662015014, + "learning_rate": 1.776540444226386e-05, + "loss": 0.047, + "step": 2620 + }, + { + "epoch": 0.77, + "grad_norm": 3.495377076072146, + "learning_rate": 1.775520745386434e-05, + "loss": 0.0633, + "step": 2625 + }, + { + "epoch": 0.77, + "grad_norm": 3.0385032472654325, + "learning_rate": 1.774499019370438e-05, + "loss": 0.1487, + "step": 2630 + }, + { + "epoch": 0.77, + "grad_norm": 10.965757999825739, + "learning_rate": 1.773475268849488e-05, + "loss": 0.1373, + "step": 2635 + }, + { + "epoch": 0.77, + "grad_norm": 2.8123652108000585, + "learning_rate": 1.772449496499966e-05, + "loss": 0.1187, + "step": 2640 + }, + { + "epoch": 0.78, + "grad_norm": 4.66597249908126, + "learning_rate": 1.77142170500354e-05, + "loss": 0.1114, + "step": 2645 + }, + { + "epoch": 0.78, + "grad_norm": 2.676562831364225, + "learning_rate": 1.770391897047157e-05, + "loss": 0.1213, + "step": 2650 + }, + { + "epoch": 0.78, + "grad_norm": 2.0506018569230973, + "learning_rate": 1.769360075323036e-05, + "loss": 0.1216, + "step": 2655 + }, + { + "epoch": 0.78, + "grad_norm": 3.4758686873519644, + "learning_rate": 1.7683262425286593e-05, + "loss": 0.1068, + "step": 2660 + }, + { + "epoch": 0.78, + "grad_norm": 16.360011105708523, + "learning_rate": 1.7672904013667675e-05, + "loss": 0.0836, + "step": 2665 + }, + { + "epoch": 0.78, + "grad_norm": 2.2472093017776724, + "learning_rate": 1.7662525545453518e-05, + "loss": 0.1206, + "step": 2670 + }, + { + "epoch": 0.78, + "grad_norm": 3.264614259094197, + "learning_rate": 1.7652127047776464e-05, + "loss": 0.0736, + "step": 2675 + }, + { + "epoch": 0.79, + "grad_norm": 4.838998543720377, + "learning_rate": 1.7641708547821218e-05, + "loss": 0.112, + "step": 2680 + }, + { + "epoch": 0.79, + "grad_norm": 1.889257267329085, + "learning_rate": 1.7631270072824786e-05, + "loss": 0.0915, + "step": 2685 + }, + { + "epoch": 0.79, + "grad_norm": 3.489350329159354, + "learning_rate": 1.762081165007638e-05, + "loss": 0.0872, + "step": 2690 + }, + { + "epoch": 0.79, + "grad_norm": 5.007436753100703, + "learning_rate": 1.7610333306917367e-05, + "loss": 0.1069, + "step": 2695 + }, + { + "epoch": 0.79, + "grad_norm": 1.8855245850871363, + "learning_rate": 1.75998350707412e-05, + "loss": 0.1096, + "step": 2700 + }, + { + "epoch": 0.79, + "grad_norm": 4.28080691529872, + "learning_rate": 1.7589316968993323e-05, + "loss": 0.1135, + "step": 2705 + }, + { + "epoch": 0.79, + "grad_norm": 2.188803148392315, + "learning_rate": 1.7578779029171128e-05, + "loss": 0.0631, + "step": 2710 + }, + { + "epoch": 0.8, + "grad_norm": 3.588543379312152, + "learning_rate": 1.7568221278823862e-05, + "loss": 0.1063, + "step": 2715 + }, + { + "epoch": 0.8, + "grad_norm": 2.6512479470769765, + "learning_rate": 1.7557643745552566e-05, + "loss": 0.0792, + "step": 2720 + }, + { + "epoch": 0.8, + "grad_norm": 2.4622361404876334, + "learning_rate": 1.7547046457009995e-05, + "loss": 0.0815, + "step": 2725 + }, + { + "epoch": 0.8, + "grad_norm": 1.181686879663521, + "learning_rate": 1.7536429440900554e-05, + "loss": 0.0855, + "step": 2730 + }, + { + "epoch": 0.8, + "grad_norm": 3.0605705567048047, + "learning_rate": 1.7525792724980225e-05, + "loss": 0.1384, + "step": 2735 + }, + { + "epoch": 0.8, + "grad_norm": 3.0361367388503884, + "learning_rate": 1.7515136337056476e-05, + "loss": 0.0652, + "step": 2740 + }, + { + "epoch": 0.81, + "grad_norm": 3.6999816430126207, + "learning_rate": 1.750446030498822e-05, + "loss": 0.1307, + "step": 2745 + }, + { + "epoch": 0.81, + "grad_norm": 1.7331629461515146, + "learning_rate": 1.7493764656685725e-05, + "loss": 0.085, + "step": 2750 + }, + { + "epoch": 0.81, + "grad_norm": 3.3426839965302104, + "learning_rate": 1.7483049420110526e-05, + "loss": 0.1107, + "step": 2755 + }, + { + "epoch": 0.81, + "grad_norm": 1.8945908369938274, + "learning_rate": 1.747231462327538e-05, + "loss": 0.0677, + "step": 2760 + }, + { + "epoch": 0.81, + "grad_norm": 2.375178480970911, + "learning_rate": 1.7461560294244185e-05, + "loss": 0.0816, + "step": 2765 + }, + { + "epoch": 0.81, + "grad_norm": 2.7160314363337323, + "learning_rate": 1.7450786461131886e-05, + "loss": 0.1479, + "step": 2770 + }, + { + "epoch": 0.81, + "grad_norm": 2.1767080856674283, + "learning_rate": 1.7439993152104424e-05, + "loss": 0.0701, + "step": 2775 + }, + { + "epoch": 0.82, + "grad_norm": 2.8479809150806425, + "learning_rate": 1.7429180395378667e-05, + "loss": 0.1213, + "step": 2780 + }, + { + "epoch": 0.82, + "grad_norm": 3.1833758133322387, + "learning_rate": 1.741834821922231e-05, + "loss": 0.0927, + "step": 2785 + }, + { + "epoch": 0.82, + "grad_norm": 3.7860456588982396, + "learning_rate": 1.7407496651953824e-05, + "loss": 0.0896, + "step": 2790 + }, + { + "epoch": 0.82, + "grad_norm": 3.127110340900602, + "learning_rate": 1.739662572194237e-05, + "loss": 0.0593, + "step": 2795 + }, + { + "epoch": 0.82, + "grad_norm": 1.805218044753091, + "learning_rate": 1.7385735457607728e-05, + "loss": 0.1007, + "step": 2800 + }, + { + "epoch": 0.82, + "grad_norm": 2.552175891261757, + "learning_rate": 1.7374825887420227e-05, + "loss": 0.1329, + "step": 2805 + }, + { + "epoch": 0.82, + "grad_norm": 1.4027082134325906, + "learning_rate": 1.7363897039900673e-05, + "loss": 0.0775, + "step": 2810 + }, + { + "epoch": 0.83, + "grad_norm": 2.5766792636413722, + "learning_rate": 1.7352948943620252e-05, + "loss": 0.1039, + "step": 2815 + }, + { + "epoch": 0.83, + "grad_norm": 2.315064631933743, + "learning_rate": 1.7341981627200486e-05, + "loss": 0.1002, + "step": 2820 + }, + { + "epoch": 0.83, + "grad_norm": 1.77275038528492, + "learning_rate": 1.733099511931314e-05, + "loss": 0.059, + "step": 2825 + }, + { + "epoch": 0.83, + "grad_norm": 3.6553802759984086, + "learning_rate": 1.731998944868015e-05, + "loss": 0.0873, + "step": 2830 + }, + { + "epoch": 0.83, + "grad_norm": 9.277418585224488, + "learning_rate": 1.730896464407355e-05, + "loss": 0.1054, + "step": 2835 + }, + { + "epoch": 0.83, + "grad_norm": 2.330825468379001, + "learning_rate": 1.7297920734315397e-05, + "loss": 0.0841, + "step": 2840 + }, + { + "epoch": 0.83, + "grad_norm": 1.5353749049117653, + "learning_rate": 1.728685774827769e-05, + "loss": 0.1018, + "step": 2845 + }, + { + "epoch": 0.84, + "grad_norm": 2.887634875297545, + "learning_rate": 1.7275775714882302e-05, + "loss": 0.1114, + "step": 2850 + }, + { + "epoch": 0.84, + "grad_norm": 2.453602349555872, + "learning_rate": 1.7264674663100908e-05, + "loss": 0.1401, + "step": 2855 + }, + { + "epoch": 0.84, + "grad_norm": 1.973689346976038, + "learning_rate": 1.7253554621954888e-05, + "loss": 0.1036, + "step": 2860 + }, + { + "epoch": 0.84, + "grad_norm": 2.3448153299848964, + "learning_rate": 1.7242415620515277e-05, + "loss": 0.1152, + "step": 2865 + }, + { + "epoch": 0.84, + "grad_norm": 3.1738861172586437, + "learning_rate": 1.7231257687902668e-05, + "loss": 0.1154, + "step": 2870 + }, + { + "epoch": 0.84, + "grad_norm": 5.216105630624163, + "learning_rate": 1.722008085328716e-05, + "loss": 0.0901, + "step": 2875 + }, + { + "epoch": 0.84, + "grad_norm": 1.687338997966392, + "learning_rate": 1.7208885145888262e-05, + "loss": 0.0996, + "step": 2880 + }, + { + "epoch": 0.85, + "grad_norm": 2.5681904204171584, + "learning_rate": 1.7197670594974815e-05, + "loss": 0.1147, + "step": 2885 + }, + { + "epoch": 0.85, + "grad_norm": 3.9337900031340416, + "learning_rate": 1.718643722986492e-05, + "loss": 0.0782, + "step": 2890 + }, + { + "epoch": 0.85, + "grad_norm": 4.399086614652283, + "learning_rate": 1.7175185079925877e-05, + "loss": 0.1258, + "step": 2895 + }, + { + "epoch": 0.85, + "grad_norm": 1.9000347309953698, + "learning_rate": 1.7163914174574092e-05, + "loss": 0.1434, + "step": 2900 + }, + { + "epoch": 0.85, + "grad_norm": 3.8364789467985507, + "learning_rate": 1.7152624543274994e-05, + "loss": 0.0865, + "step": 2905 + }, + { + "epoch": 0.85, + "grad_norm": 2.3865032691893213, + "learning_rate": 1.7141316215542975e-05, + "loss": 0.0866, + "step": 2910 + }, + { + "epoch": 0.86, + "grad_norm": 2.0857563297304167, + "learning_rate": 1.71299892209413e-05, + "loss": 0.0924, + "step": 2915 + }, + { + "epoch": 0.86, + "grad_norm": 2.345942984458089, + "learning_rate": 1.7118643589082043e-05, + "loss": 0.0708, + "step": 2920 + }, + { + "epoch": 0.86, + "grad_norm": 1.8641359939859798, + "learning_rate": 1.7107279349625992e-05, + "loss": 0.0788, + "step": 2925 + }, + { + "epoch": 0.86, + "grad_norm": 2.272737375583735, + "learning_rate": 1.7095896532282584e-05, + "loss": 0.0851, + "step": 2930 + }, + { + "epoch": 0.86, + "grad_norm": 2.4039124864082657, + "learning_rate": 1.7084495166809822e-05, + "loss": 0.1068, + "step": 2935 + }, + { + "epoch": 0.86, + "grad_norm": 1.1218714658208662, + "learning_rate": 1.707307528301421e-05, + "loss": 0.1091, + "step": 2940 + }, + { + "epoch": 0.86, + "grad_norm": 0.9891789998991855, + "learning_rate": 1.7061636910750646e-05, + "loss": 0.096, + "step": 2945 + }, + { + "epoch": 0.87, + "grad_norm": 3.194805823262511, + "learning_rate": 1.7050180079922373e-05, + "loss": 0.1006, + "step": 2950 + }, + { + "epoch": 0.87, + "grad_norm": 2.257798455534552, + "learning_rate": 1.7038704820480898e-05, + "loss": 0.0914, + "step": 2955 + }, + { + "epoch": 0.87, + "grad_norm": 5.78033176037585, + "learning_rate": 1.7027211162425888e-05, + "loss": 0.0965, + "step": 2960 + }, + { + "epoch": 0.87, + "grad_norm": 2.6184356901420025, + "learning_rate": 1.7015699135805122e-05, + "loss": 0.1114, + "step": 2965 + }, + { + "epoch": 0.87, + "grad_norm": 1.8106354297762863, + "learning_rate": 1.70041687707144e-05, + "loss": 0.0941, + "step": 2970 + }, + { + "epoch": 0.87, + "grad_norm": 1.245232646301207, + "learning_rate": 1.699262009729745e-05, + "loss": 0.0814, + "step": 2975 + }, + { + "epoch": 0.87, + "grad_norm": 3.442666904279059, + "learning_rate": 1.6981053145745877e-05, + "loss": 0.1418, + "step": 2980 + }, + { + "epoch": 0.88, + "grad_norm": 2.2823324901203557, + "learning_rate": 1.6969467946299073e-05, + "loss": 0.0593, + "step": 2985 + }, + { + "epoch": 0.88, + "grad_norm": 2.3494464059884015, + "learning_rate": 1.6957864529244123e-05, + "loss": 0.0696, + "step": 2990 + }, + { + "epoch": 0.88, + "grad_norm": 4.051449109074783, + "learning_rate": 1.694624292491575e-05, + "loss": 0.091, + "step": 2995 + }, + { + "epoch": 0.88, + "grad_norm": 2.6735251166535616, + "learning_rate": 1.6934603163696212e-05, + "loss": 0.089, + "step": 3000 + }, + { + "epoch": 0.88, + "grad_norm": 1.8986340958640284, + "learning_rate": 1.6922945276015244e-05, + "loss": 0.087, + "step": 3005 + }, + { + "epoch": 0.88, + "grad_norm": 3.912312190511564, + "learning_rate": 1.691126929234996e-05, + "loss": 0.1154, + "step": 3010 + }, + { + "epoch": 0.88, + "grad_norm": 3.9209068288748936, + "learning_rate": 1.6899575243224794e-05, + "loss": 0.0845, + "step": 3015 + }, + { + "epoch": 0.89, + "grad_norm": 0.986967487651937, + "learning_rate": 1.6887863159211403e-05, + "loss": 0.0563, + "step": 3020 + }, + { + "epoch": 0.89, + "grad_norm": 3.5756032794838775, + "learning_rate": 1.6876133070928584e-05, + "loss": 0.1154, + "step": 3025 + }, + { + "epoch": 0.89, + "grad_norm": 2.8732144989552793, + "learning_rate": 1.6864385009042215e-05, + "loss": 0.0882, + "step": 3030 + }, + { + "epoch": 0.89, + "grad_norm": 3.2146073135071584, + "learning_rate": 1.6852619004265157e-05, + "loss": 0.0746, + "step": 3035 + }, + { + "epoch": 0.89, + "grad_norm": 2.032882973038013, + "learning_rate": 1.684083508735718e-05, + "loss": 0.0919, + "step": 3040 + }, + { + "epoch": 0.89, + "grad_norm": 1.7212867050301923, + "learning_rate": 1.6829033289124876e-05, + "loss": 0.061, + "step": 3045 + }, + { + "epoch": 0.89, + "grad_norm": 2.0434776241308557, + "learning_rate": 1.681721364042159e-05, + "loss": 0.0608, + "step": 3050 + }, + { + "epoch": 0.9, + "grad_norm": 2.4595918397424077, + "learning_rate": 1.6805376172147335e-05, + "loss": 0.0618, + "step": 3055 + }, + { + "epoch": 0.9, + "grad_norm": 1.9062625696987283, + "learning_rate": 1.6793520915248704e-05, + "loss": 0.106, + "step": 3060 + }, + { + "epoch": 0.9, + "grad_norm": 3.0695618155829765, + "learning_rate": 1.6781647900718797e-05, + "loss": 0.0826, + "step": 3065 + }, + { + "epoch": 0.9, + "grad_norm": 2.5737887820804533, + "learning_rate": 1.676975715959714e-05, + "loss": 0.0896, + "step": 3070 + }, + { + "epoch": 0.9, + "grad_norm": 5.4035821406888855, + "learning_rate": 1.67578487229696e-05, + "loss": 0.0747, + "step": 3075 + }, + { + "epoch": 0.9, + "grad_norm": 4.031141412736291, + "learning_rate": 1.67459226219683e-05, + "loss": 0.1035, + "step": 3080 + }, + { + "epoch": 0.9, + "grad_norm": 2.8672651649371415, + "learning_rate": 1.6733978887771548e-05, + "loss": 0.1211, + "step": 3085 + }, + { + "epoch": 0.91, + "grad_norm": 3.622543160107051, + "learning_rate": 1.6722017551603752e-05, + "loss": 0.0782, + "step": 3090 + }, + { + "epoch": 0.91, + "grad_norm": 2.611185690533942, + "learning_rate": 1.6710038644735328e-05, + "loss": 0.1197, + "step": 3095 + }, + { + "epoch": 0.91, + "grad_norm": 2.239743805096798, + "learning_rate": 1.6698042198482645e-05, + "loss": 0.0694, + "step": 3100 + }, + { + "epoch": 0.91, + "grad_norm": 3.6697206061343963, + "learning_rate": 1.6686028244207902e-05, + "loss": 0.0896, + "step": 3105 + }, + { + "epoch": 0.91, + "grad_norm": 2.3344027475446776, + "learning_rate": 1.667399681331909e-05, + "loss": 0.0863, + "step": 3110 + }, + { + "epoch": 0.91, + "grad_norm": 5.716084510514838, + "learning_rate": 1.666194793726987e-05, + "loss": 0.0548, + "step": 3115 + }, + { + "epoch": 0.92, + "grad_norm": 3.1415750006162626, + "learning_rate": 1.6649881647559527e-05, + "loss": 0.0684, + "step": 3120 + }, + { + "epoch": 0.92, + "grad_norm": 2.1766147626420187, + "learning_rate": 1.6637797975732855e-05, + "loss": 0.0786, + "step": 3125 + }, + { + "epoch": 0.92, + "grad_norm": 2.602913079216308, + "learning_rate": 1.6625696953380104e-05, + "loss": 0.1321, + "step": 3130 + }, + { + "epoch": 0.92, + "grad_norm": 2.2070894756907418, + "learning_rate": 1.661357861213687e-05, + "loss": 0.0912, + "step": 3135 + }, + { + "epoch": 0.92, + "grad_norm": 1.6298032344638562, + "learning_rate": 1.6601442983684042e-05, + "loss": 0.0802, + "step": 3140 + }, + { + "epoch": 0.92, + "grad_norm": 2.7174691000044064, + "learning_rate": 1.658929009974768e-05, + "loss": 0.1251, + "step": 3145 + }, + { + "epoch": 0.92, + "grad_norm": 4.932366321915814, + "learning_rate": 1.657711999209898e-05, + "loss": 0.1141, + "step": 3150 + }, + { + "epoch": 0.93, + "grad_norm": 9.069104123184824, + "learning_rate": 1.656493269255415e-05, + "loss": 0.1253, + "step": 3155 + }, + { + "epoch": 0.93, + "grad_norm": 2.9830010731689076, + "learning_rate": 1.6552728232974344e-05, + "loss": 0.0736, + "step": 3160 + }, + { + "epoch": 0.93, + "grad_norm": 2.3767202070428732, + "learning_rate": 1.654050664526558e-05, + "loss": 0.1481, + "step": 3165 + }, + { + "epoch": 0.93, + "grad_norm": 2.1967464664457292, + "learning_rate": 1.6528267961378653e-05, + "loss": 0.0737, + "step": 3170 + }, + { + "epoch": 0.93, + "grad_norm": 2.5475708180655317, + "learning_rate": 1.651601221330906e-05, + "loss": 0.0965, + "step": 3175 + }, + { + "epoch": 0.93, + "grad_norm": 1.4514690521021125, + "learning_rate": 1.6503739433096893e-05, + "loss": 0.09, + "step": 3180 + }, + { + "epoch": 0.93, + "grad_norm": 1.0106614527500741, + "learning_rate": 1.649144965282679e-05, + "loss": 0.1028, + "step": 3185 + }, + { + "epoch": 0.94, + "grad_norm": 2.264948776878529, + "learning_rate": 1.647914290462781e-05, + "loss": 0.1099, + "step": 3190 + }, + { + "epoch": 0.94, + "grad_norm": 2.387800555259844, + "learning_rate": 1.6466819220673392e-05, + "loss": 0.0858, + "step": 3195 + }, + { + "epoch": 0.94, + "grad_norm": 2.2939673463714043, + "learning_rate": 1.6454478633181238e-05, + "loss": 0.0965, + "step": 3200 + }, + { + "epoch": 0.94, + "grad_norm": 2.8815140940347828, + "learning_rate": 1.6442121174413242e-05, + "loss": 0.084, + "step": 3205 + }, + { + "epoch": 0.94, + "grad_norm": 3.3710190642074394, + "learning_rate": 1.6429746876675406e-05, + "loss": 0.1348, + "step": 3210 + }, + { + "epoch": 0.94, + "grad_norm": 3.154298991388609, + "learning_rate": 1.6417355772317763e-05, + "loss": 0.1307, + "step": 3215 + }, + { + "epoch": 0.94, + "grad_norm": 2.431634251842803, + "learning_rate": 1.6404947893734263e-05, + "loss": 0.1269, + "step": 3220 + }, + { + "epoch": 0.95, + "grad_norm": 1.8476184413662657, + "learning_rate": 1.639252327336273e-05, + "loss": 0.0886, + "step": 3225 + }, + { + "epoch": 0.95, + "grad_norm": 2.856480869169723, + "learning_rate": 1.6380081943684733e-05, + "loss": 0.1183, + "step": 3230 + }, + { + "epoch": 0.95, + "grad_norm": 1.4292746192636032, + "learning_rate": 1.6367623937225553e-05, + "loss": 0.062, + "step": 3235 + }, + { + "epoch": 0.95, + "grad_norm": 1.8800704426452939, + "learning_rate": 1.6355149286554047e-05, + "loss": 0.1223, + "step": 3240 + }, + { + "epoch": 0.95, + "grad_norm": 3.2229475779778154, + "learning_rate": 1.6342658024282585e-05, + "loss": 0.1167, + "step": 3245 + }, + { + "epoch": 0.95, + "grad_norm": 5.6176440142314, + "learning_rate": 1.6330150183066983e-05, + "loss": 0.116, + "step": 3250 + }, + { + "epoch": 0.95, + "grad_norm": 2.101148600040731, + "learning_rate": 1.6317625795606378e-05, + "loss": 0.1162, + "step": 3255 + }, + { + "epoch": 0.96, + "grad_norm": 2.02917715092117, + "learning_rate": 1.6305084894643172e-05, + "loss": 0.1406, + "step": 3260 + }, + { + "epoch": 0.96, + "grad_norm": 2.478189120719845, + "learning_rate": 1.6292527512962947e-05, + "loss": 0.1065, + "step": 3265 + }, + { + "epoch": 0.96, + "grad_norm": 2.3396951822486196, + "learning_rate": 1.627995368339435e-05, + "loss": 0.1115, + "step": 3270 + }, + { + "epoch": 0.96, + "grad_norm": 3.7337628106177485, + "learning_rate": 1.6267363438809052e-05, + "loss": 0.0846, + "step": 3275 + }, + { + "epoch": 0.96, + "grad_norm": 4.176925288355294, + "learning_rate": 1.6254756812121612e-05, + "loss": 0.0425, + "step": 3280 + }, + { + "epoch": 0.96, + "grad_norm": 1.437234356425958, + "learning_rate": 1.6242133836289444e-05, + "loss": 0.1001, + "step": 3285 + }, + { + "epoch": 0.97, + "grad_norm": 1.6309867032536425, + "learning_rate": 1.6229494544312684e-05, + "loss": 0.0555, + "step": 3290 + }, + { + "epoch": 0.97, + "grad_norm": 3.254534712202672, + "learning_rate": 1.6216838969234124e-05, + "loss": 0.0781, + "step": 3295 + }, + { + "epoch": 0.97, + "grad_norm": 2.59152889519255, + "learning_rate": 1.620416714413913e-05, + "loss": 0.0997, + "step": 3300 + }, + { + "epoch": 0.97, + "grad_norm": 2.0546888924365527, + "learning_rate": 1.6191479102155556e-05, + "loss": 0.117, + "step": 3305 + }, + { + "epoch": 0.97, + "grad_norm": 3.4303903750156484, + "learning_rate": 1.617877487645364e-05, + "loss": 0.0715, + "step": 3310 + }, + { + "epoch": 0.97, + "grad_norm": 2.1827612639457574, + "learning_rate": 1.616605450024594e-05, + "loss": 0.1013, + "step": 3315 + }, + { + "epoch": 0.97, + "grad_norm": 2.865702647381268, + "learning_rate": 1.6153318006787223e-05, + "loss": 0.1131, + "step": 3320 + }, + { + "epoch": 0.98, + "grad_norm": 2.9285326499896946, + "learning_rate": 1.61405654293744e-05, + "loss": 0.0839, + "step": 3325 + }, + { + "epoch": 0.98, + "grad_norm": 1.9355299378240092, + "learning_rate": 1.6127796801346437e-05, + "loss": 0.0938, + "step": 3330 + }, + { + "epoch": 0.98, + "grad_norm": 4.344809423553806, + "learning_rate": 1.6115012156084242e-05, + "loss": 0.066, + "step": 3335 + }, + { + "epoch": 0.98, + "grad_norm": 5.030847000188601, + "learning_rate": 1.6102211527010608e-05, + "loss": 0.1054, + "step": 3340 + }, + { + "epoch": 0.98, + "grad_norm": 1.8609269020938257, + "learning_rate": 1.6089394947590123e-05, + "loss": 0.0936, + "step": 3345 + }, + { + "epoch": 0.98, + "grad_norm": 2.878476404530353, + "learning_rate": 1.6076562451329055e-05, + "loss": 0.0885, + "step": 3350 + }, + { + "epoch": 0.98, + "grad_norm": 2.297272109392417, + "learning_rate": 1.6063714071775297e-05, + "loss": 0.1302, + "step": 3355 + }, + { + "epoch": 0.99, + "grad_norm": 2.4727024357295484, + "learning_rate": 1.6050849842518265e-05, + "loss": 0.1108, + "step": 3360 + }, + { + "epoch": 0.99, + "grad_norm": 2.616633205645327, + "learning_rate": 1.60379697971888e-05, + "loss": 0.1337, + "step": 3365 + }, + { + "epoch": 0.99, + "grad_norm": 3.0531342208551266, + "learning_rate": 1.60250739694591e-05, + "loss": 0.1144, + "step": 3370 + }, + { + "epoch": 0.99, + "grad_norm": 2.658444025810342, + "learning_rate": 1.6012162393042625e-05, + "loss": 0.1986, + "step": 3375 + }, + { + "epoch": 0.99, + "grad_norm": 2.1995801611755392, + "learning_rate": 1.5999235101694003e-05, + "loss": 0.1367, + "step": 3380 + }, + { + "epoch": 0.99, + "grad_norm": 1.3930532278994612, + "learning_rate": 1.5986292129208938e-05, + "loss": 0.0491, + "step": 3385 + }, + { + "epoch": 0.99, + "grad_norm": 3.212502054295458, + "learning_rate": 1.597333350942414e-05, + "loss": 0.0862, + "step": 3390 + }, + { + "epoch": 1.0, + "grad_norm": 1.849378948097875, + "learning_rate": 1.5960359276217222e-05, + "loss": 0.0899, + "step": 3395 + }, + { + "epoch": 1.0, + "grad_norm": 3.045383499124253, + "learning_rate": 1.5947369463506614e-05, + "loss": 0.0809, + "step": 3400 + }, + { + "epoch": 1.0, + "grad_norm": 3.350584656668729, + "learning_rate": 1.5934364105251473e-05, + "loss": 0.0692, + "step": 3405 + } + ], + "logging_steps": 5, + "max_steps": 10227, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}