diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4929 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3488, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014334862385321102, + "grad_norm": 10.930955258982435, + "learning_rate": 2.865329512893983e-07, + "loss": 1.1122, + "step": 5 + }, + { + "epoch": 0.0028669724770642203, + "grad_norm": 7.957126318923027, + "learning_rate": 5.730659025787966e-07, + "loss": 1.1033, + "step": 10 + }, + { + "epoch": 0.0043004587155963305, + "grad_norm": 6.252488126522345, + "learning_rate": 8.595988538681949e-07, + "loss": 1.1448, + "step": 15 + }, + { + "epoch": 0.005733944954128441, + "grad_norm": 6.589440813762702, + "learning_rate": 1.1461318051575932e-06, + "loss": 1.0284, + "step": 20 + }, + { + "epoch": 0.007167431192660551, + "grad_norm": 4.917441825534674, + "learning_rate": 1.4326647564469915e-06, + "loss": 1.0131, + "step": 25 + }, + { + "epoch": 0.008600917431192661, + "grad_norm": 4.611821748972413, + "learning_rate": 1.7191977077363897e-06, + "loss": 1.0492, + "step": 30 + }, + { + "epoch": 0.010034403669724771, + "grad_norm": 4.602326061969326, + "learning_rate": 2.005730659025788e-06, + "loss": 1.0249, + "step": 35 + }, + { + "epoch": 0.011467889908256881, + "grad_norm": 4.795975437768488, + "learning_rate": 2.2922636103151864e-06, + "loss": 0.9889, + "step": 40 + }, + { + "epoch": 0.012901376146788992, + "grad_norm": 4.881452695659322, + "learning_rate": 2.5787965616045845e-06, + "loss": 0.9654, + "step": 45 + }, + { + "epoch": 0.014334862385321102, + "grad_norm": 4.816747748013977, + "learning_rate": 2.865329512893983e-06, + "loss": 1.0479, + "step": 50 + }, + { + "epoch": 0.01576834862385321, + "grad_norm": 4.672562758642394, + "learning_rate": 3.151862464183381e-06, + "loss": 0.9692, + "step": 55 + }, + { + "epoch": 0.017201834862385322, + "grad_norm": 4.665632444621185, + "learning_rate": 3.4383954154727795e-06, + "loss": 1.0033, + "step": 60 + }, + { + "epoch": 0.01863532110091743, + "grad_norm": 4.5487339829077795, + "learning_rate": 3.724928366762178e-06, + "loss": 1.0206, + "step": 65 + }, + { + "epoch": 0.020068807339449542, + "grad_norm": 4.72120821418095, + "learning_rate": 4.011461318051576e-06, + "loss": 1.0065, + "step": 70 + }, + { + "epoch": 0.02150229357798165, + "grad_norm": 4.488698654642803, + "learning_rate": 4.2979942693409744e-06, + "loss": 0.9634, + "step": 75 + }, + { + "epoch": 0.022935779816513763, + "grad_norm": 4.502540106177661, + "learning_rate": 4.584527220630373e-06, + "loss": 0.9633, + "step": 80 + }, + { + "epoch": 0.02436926605504587, + "grad_norm": 4.538891592130377, + "learning_rate": 4.871060171919771e-06, + "loss": 0.9148, + "step": 85 + }, + { + "epoch": 0.025802752293577983, + "grad_norm": 4.89489963864345, + "learning_rate": 5.157593123209169e-06, + "loss": 1.0303, + "step": 90 + }, + { + "epoch": 0.02723623853211009, + "grad_norm": 4.546009685259081, + "learning_rate": 5.444126074498568e-06, + "loss": 1.0174, + "step": 95 + }, + { + "epoch": 0.028669724770642203, + "grad_norm": 4.542941293141969, + "learning_rate": 5.730659025787966e-06, + "loss": 1.0251, + "step": 100 + }, + { + "epoch": 0.030103211009174312, + "grad_norm": 4.571406282018777, + "learning_rate": 6.017191977077364e-06, + "loss": 1.0376, + "step": 105 + }, + { + "epoch": 0.03153669724770642, + "grad_norm": 4.75986045778734, + "learning_rate": 6.303724928366762e-06, + "loss": 0.9827, + "step": 110 + }, + { + "epoch": 0.03297018348623853, + "grad_norm": 4.393838501543807, + "learning_rate": 6.590257879656161e-06, + "loss": 0.9533, + "step": 115 + }, + { + "epoch": 0.034403669724770644, + "grad_norm": 4.780066323607276, + "learning_rate": 6.876790830945559e-06, + "loss": 1.0018, + "step": 120 + }, + { + "epoch": 0.03583715596330275, + "grad_norm": 4.414441537089264, + "learning_rate": 7.163323782234957e-06, + "loss": 1.0135, + "step": 125 + }, + { + "epoch": 0.03727064220183486, + "grad_norm": 4.301618750664585, + "learning_rate": 7.449856733524356e-06, + "loss": 1.0201, + "step": 130 + }, + { + "epoch": 0.03870412844036697, + "grad_norm": 4.3543481672910245, + "learning_rate": 7.736389684813753e-06, + "loss": 1.0236, + "step": 135 + }, + { + "epoch": 0.040137614678899085, + "grad_norm": 4.31887452196869, + "learning_rate": 8.022922636103152e-06, + "loss": 1.0413, + "step": 140 + }, + { + "epoch": 0.04157110091743119, + "grad_norm": 4.19580797580846, + "learning_rate": 8.30945558739255e-06, + "loss": 0.9999, + "step": 145 + }, + { + "epoch": 0.0430045871559633, + "grad_norm": 4.2941554889344635, + "learning_rate": 8.595988538681949e-06, + "loss": 1.0824, + "step": 150 + }, + { + "epoch": 0.04443807339449541, + "grad_norm": 4.430699539319586, + "learning_rate": 8.882521489971347e-06, + "loss": 1.0545, + "step": 155 + }, + { + "epoch": 0.045871559633027525, + "grad_norm": 4.357465541547218, + "learning_rate": 9.169054441260746e-06, + "loss": 1.0662, + "step": 160 + }, + { + "epoch": 0.047305045871559634, + "grad_norm": 5.722224893724134, + "learning_rate": 9.455587392550144e-06, + "loss": 1.0197, + "step": 165 + }, + { + "epoch": 0.04873853211009174, + "grad_norm": 4.374513298911362, + "learning_rate": 9.742120343839543e-06, + "loss": 1.046, + "step": 170 + }, + { + "epoch": 0.05017201834862385, + "grad_norm": 4.593743802840811, + "learning_rate": 1.0028653295128941e-05, + "loss": 1.0739, + "step": 175 + }, + { + "epoch": 0.051605504587155966, + "grad_norm": 4.333415176606838, + "learning_rate": 1.0315186246418338e-05, + "loss": 1.0833, + "step": 180 + }, + { + "epoch": 0.053038990825688075, + "grad_norm": 4.426601251103325, + "learning_rate": 1.0601719197707738e-05, + "loss": 1.0799, + "step": 185 + }, + { + "epoch": 0.05447247706422018, + "grad_norm": 4.556659147888879, + "learning_rate": 1.0888252148997137e-05, + "loss": 1.0405, + "step": 190 + }, + { + "epoch": 0.05590596330275229, + "grad_norm": 4.3383547238785605, + "learning_rate": 1.1174785100286533e-05, + "loss": 1.0189, + "step": 195 + }, + { + "epoch": 0.05733944954128441, + "grad_norm": 4.125983340413087, + "learning_rate": 1.1461318051575932e-05, + "loss": 1.0592, + "step": 200 + }, + { + "epoch": 0.058772935779816515, + "grad_norm": 4.479422603616749, + "learning_rate": 1.1747851002865332e-05, + "loss": 1.0626, + "step": 205 + }, + { + "epoch": 0.060206422018348624, + "grad_norm": 4.506035260293105, + "learning_rate": 1.2034383954154729e-05, + "loss": 1.0506, + "step": 210 + }, + { + "epoch": 0.06163990825688073, + "grad_norm": 5.22525942718489, + "learning_rate": 1.2320916905444127e-05, + "loss": 1.0855, + "step": 215 + }, + { + "epoch": 0.06307339449541284, + "grad_norm": 4.779500493076456, + "learning_rate": 1.2607449856733524e-05, + "loss": 1.0872, + "step": 220 + }, + { + "epoch": 0.06450688073394495, + "grad_norm": 4.1419371246845555, + "learning_rate": 1.2893982808022924e-05, + "loss": 1.0158, + "step": 225 + }, + { + "epoch": 0.06594036697247706, + "grad_norm": 4.55193247523381, + "learning_rate": 1.3180515759312323e-05, + "loss": 1.1341, + "step": 230 + }, + { + "epoch": 0.06737385321100918, + "grad_norm": 4.821922591302919, + "learning_rate": 1.346704871060172e-05, + "loss": 1.0317, + "step": 235 + }, + { + "epoch": 0.06880733944954129, + "grad_norm": 5.7218678278363075, + "learning_rate": 1.3753581661891118e-05, + "loss": 1.0929, + "step": 240 + }, + { + "epoch": 0.0702408256880734, + "grad_norm": 4.5897462960681885, + "learning_rate": 1.4040114613180518e-05, + "loss": 1.0707, + "step": 245 + }, + { + "epoch": 0.0716743119266055, + "grad_norm": 4.9044704729529265, + "learning_rate": 1.4326647564469915e-05, + "loss": 1.0589, + "step": 250 + }, + { + "epoch": 0.07310779816513761, + "grad_norm": 4.317942267526155, + "learning_rate": 1.4613180515759313e-05, + "loss": 1.0944, + "step": 255 + }, + { + "epoch": 0.07454128440366972, + "grad_norm": 4.019443011772793, + "learning_rate": 1.4899713467048712e-05, + "loss": 1.1014, + "step": 260 + }, + { + "epoch": 0.07597477064220183, + "grad_norm": 5.05645456273739, + "learning_rate": 1.518624641833811e-05, + "loss": 1.0855, + "step": 265 + }, + { + "epoch": 0.07740825688073394, + "grad_norm": 4.631991611940915, + "learning_rate": 1.5472779369627507e-05, + "loss": 1.1797, + "step": 270 + }, + { + "epoch": 0.07884174311926606, + "grad_norm": 4.863391448983006, + "learning_rate": 1.5759312320916907e-05, + "loss": 1.1189, + "step": 275 + }, + { + "epoch": 0.08027522935779817, + "grad_norm": 5.67171228826149, + "learning_rate": 1.6045845272206304e-05, + "loss": 1.0687, + "step": 280 + }, + { + "epoch": 0.08170871559633028, + "grad_norm": 4.863402157719064, + "learning_rate": 1.6332378223495704e-05, + "loss": 1.1033, + "step": 285 + }, + { + "epoch": 0.08314220183486239, + "grad_norm": 4.660992837053409, + "learning_rate": 1.66189111747851e-05, + "loss": 1.1543, + "step": 290 + }, + { + "epoch": 0.0845756880733945, + "grad_norm": 4.552069083735871, + "learning_rate": 1.69054441260745e-05, + "loss": 1.1464, + "step": 295 + }, + { + "epoch": 0.0860091743119266, + "grad_norm": 4.61726981167071, + "learning_rate": 1.7191977077363898e-05, + "loss": 1.1332, + "step": 300 + }, + { + "epoch": 0.08744266055045871, + "grad_norm": 4.9734394128477755, + "learning_rate": 1.7478510028653298e-05, + "loss": 1.0608, + "step": 305 + }, + { + "epoch": 0.08887614678899082, + "grad_norm": 4.080774915213187, + "learning_rate": 1.7765042979942695e-05, + "loss": 1.1737, + "step": 310 + }, + { + "epoch": 0.09030963302752294, + "grad_norm": 4.9010757003007805, + "learning_rate": 1.805157593123209e-05, + "loss": 1.152, + "step": 315 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 4.348268075612745, + "learning_rate": 1.833810888252149e-05, + "loss": 1.1493, + "step": 320 + }, + { + "epoch": 0.09317660550458716, + "grad_norm": 5.525129625035676, + "learning_rate": 1.8624641833810892e-05, + "loss": 1.109, + "step": 325 + }, + { + "epoch": 0.09461009174311927, + "grad_norm": 6.128793939259254, + "learning_rate": 1.891117478510029e-05, + "loss": 1.1883, + "step": 330 + }, + { + "epoch": 0.09604357798165138, + "grad_norm": 4.553408862504849, + "learning_rate": 1.9197707736389685e-05, + "loss": 1.1294, + "step": 335 + }, + { + "epoch": 0.09747706422018348, + "grad_norm": 4.512129266800345, + "learning_rate": 1.9484240687679085e-05, + "loss": 1.2076, + "step": 340 + }, + { + "epoch": 0.0989105504587156, + "grad_norm": 4.294698339999475, + "learning_rate": 1.9770773638968482e-05, + "loss": 1.1455, + "step": 345 + }, + { + "epoch": 0.1003440366972477, + "grad_norm": 4.649384755228939, + "learning_rate": 1.999999499173752e-05, + "loss": 1.142, + "step": 350 + }, + { + "epoch": 0.10177752293577981, + "grad_norm": 4.639825467109486, + "learning_rate": 1.999981970307739e-05, + "loss": 1.19, + "step": 355 + }, + { + "epoch": 0.10321100917431193, + "grad_norm": 4.447236882707046, + "learning_rate": 1.999939400630968e-05, + "loss": 1.1753, + "step": 360 + }, + { + "epoch": 0.10464449541284404, + "grad_norm": 4.092068716884874, + "learning_rate": 1.999871791209438e-05, + "loss": 1.1893, + "step": 365 + }, + { + "epoch": 0.10607798165137615, + "grad_norm": 4.398948638445328, + "learning_rate": 1.9997791437361734e-05, + "loss": 1.1555, + "step": 370 + }, + { + "epoch": 0.10751146788990826, + "grad_norm": 4.701129620714216, + "learning_rate": 1.9996614605311848e-05, + "loss": 1.2011, + "step": 375 + }, + { + "epoch": 0.10894495412844037, + "grad_norm": 4.821236633722439, + "learning_rate": 1.999518744541407e-05, + "loss": 1.1776, + "step": 380 + }, + { + "epoch": 0.11037844036697247, + "grad_norm": 4.335100849608752, + "learning_rate": 1.9993509993406297e-05, + "loss": 1.1902, + "step": 385 + }, + { + "epoch": 0.11181192660550458, + "grad_norm": 4.068825093466267, + "learning_rate": 1.9991582291294042e-05, + "loss": 1.1836, + "step": 390 + }, + { + "epoch": 0.11324541284403669, + "grad_norm": 3.9520872546047143, + "learning_rate": 1.9989404387349393e-05, + "loss": 1.1439, + "step": 395 + }, + { + "epoch": 0.11467889908256881, + "grad_norm": 4.582077126321336, + "learning_rate": 1.998697633610982e-05, + "loss": 1.2094, + "step": 400 + }, + { + "epoch": 0.11611238532110092, + "grad_norm": 4.38687782745242, + "learning_rate": 1.998429819837679e-05, + "loss": 1.1866, + "step": 405 + }, + { + "epoch": 0.11754587155963303, + "grad_norm": 4.6771014427247275, + "learning_rate": 1.998137004121425e-05, + "loss": 1.1587, + "step": 410 + }, + { + "epoch": 0.11897935779816514, + "grad_norm": 4.218200613872475, + "learning_rate": 1.9978191937946955e-05, + "loss": 1.1411, + "step": 415 + }, + { + "epoch": 0.12041284403669725, + "grad_norm": 4.509609936000089, + "learning_rate": 1.9974763968158614e-05, + "loss": 1.1772, + "step": 420 + }, + { + "epoch": 0.12184633027522936, + "grad_norm": 5.925174301278395, + "learning_rate": 1.9971086217689928e-05, + "loss": 1.2411, + "step": 425 + }, + { + "epoch": 0.12327981651376146, + "grad_norm": 4.022377669658578, + "learning_rate": 1.9967158778636405e-05, + "loss": 1.1887, + "step": 430 + }, + { + "epoch": 0.12471330275229357, + "grad_norm": 4.539584147379229, + "learning_rate": 1.996298174934608e-05, + "loss": 1.1717, + "step": 435 + }, + { + "epoch": 0.12614678899082568, + "grad_norm": 9.545718854729863, + "learning_rate": 1.9958555234417035e-05, + "loss": 1.1879, + "step": 440 + }, + { + "epoch": 0.1275802752293578, + "grad_norm": 22.053598752620175, + "learning_rate": 1.995387934469479e-05, + "loss": 1.172, + "step": 445 + }, + { + "epoch": 0.1290137614678899, + "grad_norm": 12.911881630750296, + "learning_rate": 1.994895419726953e-05, + "loss": 1.2034, + "step": 450 + }, + { + "epoch": 0.13044724770642202, + "grad_norm": 4.856805114224933, + "learning_rate": 1.9943779915473165e-05, + "loss": 1.1795, + "step": 455 + }, + { + "epoch": 0.13188073394495411, + "grad_norm": 4.286105662410557, + "learning_rate": 1.9938356628876235e-05, + "loss": 1.1866, + "step": 460 + }, + { + "epoch": 0.13331422018348624, + "grad_norm": 4.2624287742900995, + "learning_rate": 1.9932684473284687e-05, + "loss": 1.1894, + "step": 465 + }, + { + "epoch": 0.13474770642201836, + "grad_norm": 4.716264114725749, + "learning_rate": 1.9926763590736457e-05, + "loss": 1.2835, + "step": 470 + }, + { + "epoch": 0.13618119266055045, + "grad_norm": 4.2462330499210115, + "learning_rate": 1.992059412949791e-05, + "loss": 1.2412, + "step": 475 + }, + { + "epoch": 0.13761467889908258, + "grad_norm": 4.391420347453553, + "learning_rate": 1.9914176244060158e-05, + "loss": 1.1516, + "step": 480 + }, + { + "epoch": 0.13904816513761467, + "grad_norm": 75.66535297260735, + "learning_rate": 1.9907510095135142e-05, + "loss": 1.2722, + "step": 485 + }, + { + "epoch": 0.1404816513761468, + "grad_norm": 5.310902667535963, + "learning_rate": 1.9900595849651645e-05, + "loss": 1.2421, + "step": 490 + }, + { + "epoch": 0.1419151376146789, + "grad_norm": 5.025394485398511, + "learning_rate": 1.9893433680751105e-05, + "loss": 1.2633, + "step": 495 + }, + { + "epoch": 0.143348623853211, + "grad_norm": 5.520994924491881, + "learning_rate": 1.988602376778327e-05, + "loss": 1.189, + "step": 500 + }, + { + "epoch": 0.14478211009174313, + "grad_norm": 4.515316969142533, + "learning_rate": 1.9878366296301713e-05, + "loss": 1.1607, + "step": 505 + }, + { + "epoch": 0.14621559633027523, + "grad_norm": 4.532857835025907, + "learning_rate": 1.9870461458059188e-05, + "loss": 1.2315, + "step": 510 + }, + { + "epoch": 0.14764908256880735, + "grad_norm": 4.001206919662932, + "learning_rate": 1.9862309451002827e-05, + "loss": 1.1659, + "step": 515 + }, + { + "epoch": 0.14908256880733944, + "grad_norm": 4.137717431705149, + "learning_rate": 1.9853910479269165e-05, + "loss": 1.1732, + "step": 520 + }, + { + "epoch": 0.15051605504587157, + "grad_norm": 4.367905503345653, + "learning_rate": 1.9845264753179064e-05, + "loss": 1.2219, + "step": 525 + }, + { + "epoch": 0.15194954128440366, + "grad_norm": 3.942691834220351, + "learning_rate": 1.9836372489232416e-05, + "loss": 1.1208, + "step": 530 + }, + { + "epoch": 0.15338302752293578, + "grad_norm": 4.028647286087348, + "learning_rate": 1.982723391010273e-05, + "loss": 1.1499, + "step": 535 + }, + { + "epoch": 0.15481651376146788, + "grad_norm": 4.262774154684462, + "learning_rate": 1.9817849244631575e-05, + "loss": 1.2052, + "step": 540 + }, + { + "epoch": 0.15625, + "grad_norm": 4.0350996506196175, + "learning_rate": 1.9808218727822808e-05, + "loss": 1.1899, + "step": 545 + }, + { + "epoch": 0.15768348623853212, + "grad_norm": 3.806300412200292, + "learning_rate": 1.979834260083673e-05, + "loss": 1.2099, + "step": 550 + }, + { + "epoch": 0.15911697247706422, + "grad_norm": 4.447162495747635, + "learning_rate": 1.9788221110984026e-05, + "loss": 1.2123, + "step": 555 + }, + { + "epoch": 0.16055045871559634, + "grad_norm": 4.094827599966712, + "learning_rate": 1.977785451171958e-05, + "loss": 1.1411, + "step": 560 + }, + { + "epoch": 0.16198394495412843, + "grad_norm": 4.6429132125696535, + "learning_rate": 1.9767243062636122e-05, + "loss": 1.1914, + "step": 565 + }, + { + "epoch": 0.16341743119266056, + "grad_norm": 3.9726685572488227, + "learning_rate": 1.975638702945773e-05, + "loss": 1.1758, + "step": 570 + }, + { + "epoch": 0.16485091743119265, + "grad_norm": 3.9582277412235367, + "learning_rate": 1.974528668403318e-05, + "loss": 1.1839, + "step": 575 + }, + { + "epoch": 0.16628440366972477, + "grad_norm": 4.611199474059946, + "learning_rate": 1.973394230432913e-05, + "loss": 1.2155, + "step": 580 + }, + { + "epoch": 0.16771788990825687, + "grad_norm": 3.964842824608755, + "learning_rate": 1.972235417442317e-05, + "loss": 1.168, + "step": 585 + }, + { + "epoch": 0.169151376146789, + "grad_norm": 4.083964710145448, + "learning_rate": 1.9710522584496695e-05, + "loss": 1.2135, + "step": 590 + }, + { + "epoch": 0.1705848623853211, + "grad_norm": 3.8905730852414155, + "learning_rate": 1.9698447830827655e-05, + "loss": 1.1893, + "step": 595 + }, + { + "epoch": 0.1720183486238532, + "grad_norm": 4.399950123881084, + "learning_rate": 1.9686130215783124e-05, + "loss": 1.2244, + "step": 600 + }, + { + "epoch": 0.17345183486238533, + "grad_norm": 4.3066835429404495, + "learning_rate": 1.967357004781173e-05, + "loss": 1.1532, + "step": 605 + }, + { + "epoch": 0.17488532110091742, + "grad_norm": 5.071390390632784, + "learning_rate": 1.9660767641435926e-05, + "loss": 1.2137, + "step": 610 + }, + { + "epoch": 0.17631880733944955, + "grad_norm": 3.738266843064544, + "learning_rate": 1.964772331724414e-05, + "loss": 1.165, + "step": 615 + }, + { + "epoch": 0.17775229357798164, + "grad_norm": 3.885352429025116, + "learning_rate": 1.9634437401882707e-05, + "loss": 1.1899, + "step": 620 + }, + { + "epoch": 0.17918577981651376, + "grad_norm": 3.653935761439161, + "learning_rate": 1.962091022804772e-05, + "loss": 1.1506, + "step": 625 + }, + { + "epoch": 0.18061926605504589, + "grad_norm": 4.559227033021131, + "learning_rate": 1.960714213447668e-05, + "loss": 1.1645, + "step": 630 + }, + { + "epoch": 0.18205275229357798, + "grad_norm": 4.618925411728112, + "learning_rate": 1.959313346594004e-05, + "loss": 1.1802, + "step": 635 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 4.308837236054774, + "learning_rate": 1.9578884573232538e-05, + "loss": 1.2023, + "step": 640 + }, + { + "epoch": 0.1849197247706422, + "grad_norm": 4.304427359268019, + "learning_rate": 1.9564395813164428e-05, + "loss": 1.2244, + "step": 645 + }, + { + "epoch": 0.18635321100917432, + "grad_norm": 3.486413458583046, + "learning_rate": 1.9549667548552557e-05, + "loss": 1.2111, + "step": 650 + }, + { + "epoch": 0.1877866972477064, + "grad_norm": 3.7640427311235403, + "learning_rate": 1.9534700148211255e-05, + "loss": 1.1956, + "step": 655 + }, + { + "epoch": 0.18922018348623854, + "grad_norm": 4.222766727318349, + "learning_rate": 1.9519493986943125e-05, + "loss": 1.1996, + "step": 660 + }, + { + "epoch": 0.19065366972477063, + "grad_norm": 3.7868143111703256, + "learning_rate": 1.9504049445529632e-05, + "loss": 1.2093, + "step": 665 + }, + { + "epoch": 0.19208715596330275, + "grad_norm": 3.579431474524795, + "learning_rate": 1.94883669107216e-05, + "loss": 1.1785, + "step": 670 + }, + { + "epoch": 0.19352064220183487, + "grad_norm": 4.14899759337832, + "learning_rate": 1.9472446775229486e-05, + "loss": 1.2402, + "step": 675 + }, + { + "epoch": 0.19495412844036697, + "grad_norm": 3.7259985492864804, + "learning_rate": 1.9456289437713578e-05, + "loss": 1.1713, + "step": 680 + }, + { + "epoch": 0.1963876146788991, + "grad_norm": 4.525451494816849, + "learning_rate": 1.9439895302774007e-05, + "loss": 1.2179, + "step": 685 + }, + { + "epoch": 0.1978211009174312, + "grad_norm": 3.9639891914703185, + "learning_rate": 1.9423264780940602e-05, + "loss": 1.2163, + "step": 690 + }, + { + "epoch": 0.1992545871559633, + "grad_norm": 3.8210369748297612, + "learning_rate": 1.940639828866262e-05, + "loss": 1.2462, + "step": 695 + }, + { + "epoch": 0.2006880733944954, + "grad_norm": 4.499363334347953, + "learning_rate": 1.938929624829832e-05, + "loss": 1.2572, + "step": 700 + }, + { + "epoch": 0.20212155963302753, + "grad_norm": 32.95893389851377, + "learning_rate": 1.937195908810438e-05, + "loss": 1.2332, + "step": 705 + }, + { + "epoch": 0.20355504587155962, + "grad_norm": 4.056487188350767, + "learning_rate": 1.935438724222517e-05, + "loss": 1.1797, + "step": 710 + }, + { + "epoch": 0.20498853211009174, + "grad_norm": 3.940310991182425, + "learning_rate": 1.93365811506819e-05, + "loss": 1.2315, + "step": 715 + }, + { + "epoch": 0.20642201834862386, + "grad_norm": 3.92517292280482, + "learning_rate": 1.9318541259361573e-05, + "loss": 1.1824, + "step": 720 + }, + { + "epoch": 0.20785550458715596, + "grad_norm": 4.109451546798357, + "learning_rate": 1.9300268020005832e-05, + "loss": 1.2375, + "step": 725 + }, + { + "epoch": 0.20928899082568808, + "grad_norm": 4.168329616635684, + "learning_rate": 1.9281761890199666e-05, + "loss": 1.1812, + "step": 730 + }, + { + "epoch": 0.21072247706422018, + "grad_norm": 3.8618338414153435, + "learning_rate": 1.9263023333359918e-05, + "loss": 1.1903, + "step": 735 + }, + { + "epoch": 0.2121559633027523, + "grad_norm": 4.070643005805358, + "learning_rate": 1.9244052818723706e-05, + "loss": 1.2539, + "step": 740 + }, + { + "epoch": 0.2135894495412844, + "grad_norm": 3.9207051720216004, + "learning_rate": 1.9224850821336664e-05, + "loss": 1.2375, + "step": 745 + }, + { + "epoch": 0.21502293577981652, + "grad_norm": 4.14495928553583, + "learning_rate": 1.920541782204104e-05, + "loss": 1.253, + "step": 750 + }, + { + "epoch": 0.21645642201834864, + "grad_norm": 4.210904171053472, + "learning_rate": 1.918575430746367e-05, + "loss": 1.1987, + "step": 755 + }, + { + "epoch": 0.21788990825688073, + "grad_norm": 3.6674431622452883, + "learning_rate": 1.9165860770003774e-05, + "loss": 1.2427, + "step": 760 + }, + { + "epoch": 0.21932339449541285, + "grad_norm": 3.7503035449224127, + "learning_rate": 1.914573770782065e-05, + "loss": 1.1564, + "step": 765 + }, + { + "epoch": 0.22075688073394495, + "grad_norm": 4.206820549333469, + "learning_rate": 1.9125385624821162e-05, + "loss": 1.2026, + "step": 770 + }, + { + "epoch": 0.22219036697247707, + "grad_norm": 4.079050524248544, + "learning_rate": 1.9104805030647164e-05, + "loss": 1.2521, + "step": 775 + }, + { + "epoch": 0.22362385321100917, + "grad_norm": 3.8439532493920585, + "learning_rate": 1.908399644066272e-05, + "loss": 1.2247, + "step": 780 + }, + { + "epoch": 0.2250573394495413, + "grad_norm": 4.419153034660572, + "learning_rate": 1.906296037594117e-05, + "loss": 1.2827, + "step": 785 + }, + { + "epoch": 0.22649082568807338, + "grad_norm": 4.802038277224569, + "learning_rate": 1.904169736325215e-05, + "loss": 1.262, + "step": 790 + }, + { + "epoch": 0.2279243119266055, + "grad_norm": 4.284188903637777, + "learning_rate": 1.9020207935048317e-05, + "loss": 1.1532, + "step": 795 + }, + { + "epoch": 0.22935779816513763, + "grad_norm": 3.639715866595117, + "learning_rate": 1.8998492629452087e-05, + "loss": 1.2381, + "step": 800 + }, + { + "epoch": 0.23079128440366972, + "grad_norm": 3.741005239883841, + "learning_rate": 1.8976551990242122e-05, + "loss": 1.216, + "step": 805 + }, + { + "epoch": 0.23222477064220184, + "grad_norm": 3.6139511602383276, + "learning_rate": 1.895438656683972e-05, + "loss": 1.2473, + "step": 810 + }, + { + "epoch": 0.23365825688073394, + "grad_norm": 3.8300619382359846, + "learning_rate": 1.8931996914295065e-05, + "loss": 1.1882, + "step": 815 + }, + { + "epoch": 0.23509174311926606, + "grad_norm": 3.943889343109655, + "learning_rate": 1.8909383593273317e-05, + "loss": 1.2444, + "step": 820 + }, + { + "epoch": 0.23652522935779816, + "grad_norm": 4.115538048586424, + "learning_rate": 1.8886547170040575e-05, + "loss": 1.233, + "step": 825 + }, + { + "epoch": 0.23795871559633028, + "grad_norm": 5.787154468595948, + "learning_rate": 1.8863488216449702e-05, + "loss": 1.2236, + "step": 830 + }, + { + "epoch": 0.23939220183486237, + "grad_norm": 3.778052893153268, + "learning_rate": 1.8840207309926003e-05, + "loss": 1.2286, + "step": 835 + }, + { + "epoch": 0.2408256880733945, + "grad_norm": 3.435004162216508, + "learning_rate": 1.881670503345277e-05, + "loss": 1.1868, + "step": 840 + }, + { + "epoch": 0.24225917431192662, + "grad_norm": 5.261326157501783, + "learning_rate": 1.879298197555666e-05, + "loss": 1.2486, + "step": 845 + }, + { + "epoch": 0.2436926605504587, + "grad_norm": 3.651324240819765, + "learning_rate": 1.8769038730292993e-05, + "loss": 1.2383, + "step": 850 + }, + { + "epoch": 0.24512614678899083, + "grad_norm": 4.834151575695704, + "learning_rate": 1.8744875897230853e-05, + "loss": 1.2592, + "step": 855 + }, + { + "epoch": 0.24655963302752293, + "grad_norm": 3.9673796319727606, + "learning_rate": 1.872049408143808e-05, + "loss": 1.2135, + "step": 860 + }, + { + "epoch": 0.24799311926605505, + "grad_norm": 3.9495356227550293, + "learning_rate": 1.869589389346611e-05, + "loss": 1.2443, + "step": 865 + }, + { + "epoch": 0.24942660550458715, + "grad_norm": 3.8171283870857917, + "learning_rate": 1.8671075949334713e-05, + "loss": 1.1502, + "step": 870 + }, + { + "epoch": 0.25086009174311924, + "grad_norm": 3.594654406802359, + "learning_rate": 1.8646040870516526e-05, + "loss": 1.1831, + "step": 875 + }, + { + "epoch": 0.25229357798165136, + "grad_norm": 3.5772784131514337, + "learning_rate": 1.862078928392153e-05, + "loss": 1.2116, + "step": 880 + }, + { + "epoch": 0.2537270642201835, + "grad_norm": 4.016497035159711, + "learning_rate": 1.8595321821881322e-05, + "loss": 1.2156, + "step": 885 + }, + { + "epoch": 0.2551605504587156, + "grad_norm": 3.6584048014823622, + "learning_rate": 1.8569639122133304e-05, + "loss": 1.1687, + "step": 890 + }, + { + "epoch": 0.25659403669724773, + "grad_norm": 4.092581816708313, + "learning_rate": 1.8543741827804685e-05, + "loss": 1.2433, + "step": 895 + }, + { + "epoch": 0.2580275229357798, + "grad_norm": 10.965320915487338, + "learning_rate": 1.8517630587396413e-05, + "loss": 1.2365, + "step": 900 + }, + { + "epoch": 0.2594610091743119, + "grad_norm": 3.744425931996878, + "learning_rate": 1.8491306054766907e-05, + "loss": 1.2315, + "step": 905 + }, + { + "epoch": 0.26089449541284404, + "grad_norm": 4.446159019549649, + "learning_rate": 1.8464768889115684e-05, + "loss": 1.165, + "step": 910 + }, + { + "epoch": 0.26232798165137616, + "grad_norm": 3.4122914752905227, + "learning_rate": 1.8438019754966877e-05, + "loss": 1.1715, + "step": 915 + }, + { + "epoch": 0.26376146788990823, + "grad_norm": 3.681090000606899, + "learning_rate": 1.841105932215256e-05, + "loss": 1.2197, + "step": 920 + }, + { + "epoch": 0.26519495412844035, + "grad_norm": 3.723633399780894, + "learning_rate": 1.838388826579601e-05, + "loss": 1.2308, + "step": 925 + }, + { + "epoch": 0.2666284403669725, + "grad_norm": 3.79765489569063, + "learning_rate": 1.835650726629477e-05, + "loss": 1.2388, + "step": 930 + }, + { + "epoch": 0.2680619266055046, + "grad_norm": 3.636387647388923, + "learning_rate": 1.8328917009303634e-05, + "loss": 1.2296, + "step": 935 + }, + { + "epoch": 0.2694954128440367, + "grad_norm": 3.696962989830932, + "learning_rate": 1.830111818571745e-05, + "loss": 1.2048, + "step": 940 + }, + { + "epoch": 0.2709288990825688, + "grad_norm": 3.896582872078181, + "learning_rate": 1.8273111491653867e-05, + "loss": 1.2522, + "step": 945 + }, + { + "epoch": 0.2723623853211009, + "grad_norm": 3.7244583842983454, + "learning_rate": 1.824489762843584e-05, + "loss": 1.1896, + "step": 950 + }, + { + "epoch": 0.27379587155963303, + "grad_norm": 3.66702244239725, + "learning_rate": 1.821647730257413e-05, + "loss": 1.1777, + "step": 955 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 6.778664388382065, + "learning_rate": 1.818785122574956e-05, + "loss": 1.2239, + "step": 960 + }, + { + "epoch": 0.2766628440366973, + "grad_norm": 3.5798872160414095, + "learning_rate": 1.8159020114795226e-05, + "loss": 1.1532, + "step": 965 + }, + { + "epoch": 0.27809633027522934, + "grad_norm": 4.080407231530825, + "learning_rate": 1.8129984691678547e-05, + "loss": 1.2046, + "step": 970 + }, + { + "epoch": 0.27952981651376146, + "grad_norm": 3.9179230686635234, + "learning_rate": 1.8100745683483168e-05, + "loss": 1.254, + "step": 975 + }, + { + "epoch": 0.2809633027522936, + "grad_norm": 4.97899730667325, + "learning_rate": 1.807130382239075e-05, + "loss": 1.2084, + "step": 980 + }, + { + "epoch": 0.2823967889908257, + "grad_norm": 3.8642747207026242, + "learning_rate": 1.8041659845662663e-05, + "loss": 1.2014, + "step": 985 + }, + { + "epoch": 0.2838302752293578, + "grad_norm": 3.4483181116099964, + "learning_rate": 1.8011814495621506e-05, + "loss": 1.1435, + "step": 990 + }, + { + "epoch": 0.2852637614678899, + "grad_norm": 3.8042720182477434, + "learning_rate": 1.798176851963251e-05, + "loss": 1.2047, + "step": 995 + }, + { + "epoch": 0.286697247706422, + "grad_norm": 3.483685408627929, + "learning_rate": 1.7951522670084847e-05, + "loss": 1.171, + "step": 1000 + }, + { + "epoch": 0.28813073394495414, + "grad_norm": 3.487052716444198, + "learning_rate": 1.792107770437276e-05, + "loss": 1.2396, + "step": 1005 + }, + { + "epoch": 0.28956422018348627, + "grad_norm": 3.553005650055509, + "learning_rate": 1.789043438487662e-05, + "loss": 1.1149, + "step": 1010 + }, + { + "epoch": 0.29099770642201833, + "grad_norm": 3.5693421017081306, + "learning_rate": 1.785959347894383e-05, + "loss": 1.2387, + "step": 1015 + }, + { + "epoch": 0.29243119266055045, + "grad_norm": 3.535948224388696, + "learning_rate": 1.7828555758869602e-05, + "loss": 1.232, + "step": 1020 + }, + { + "epoch": 0.2938646788990826, + "grad_norm": 3.522215138188648, + "learning_rate": 1.7797322001877625e-05, + "loss": 1.2004, + "step": 1025 + }, + { + "epoch": 0.2952981651376147, + "grad_norm": 3.9401114869933362, + "learning_rate": 1.7765892990100593e-05, + "loss": 1.1954, + "step": 1030 + }, + { + "epoch": 0.29673165137614677, + "grad_norm": 3.9452874322873424, + "learning_rate": 1.773426951056064e-05, + "loss": 1.1752, + "step": 1035 + }, + { + "epoch": 0.2981651376146789, + "grad_norm": 3.77900474922427, + "learning_rate": 1.7702452355149606e-05, + "loss": 1.2023, + "step": 1040 + }, + { + "epoch": 0.299598623853211, + "grad_norm": 4.194810233115317, + "learning_rate": 1.7670442320609226e-05, + "loss": 1.1762, + "step": 1045 + }, + { + "epoch": 0.30103211009174313, + "grad_norm": 3.6317516711250524, + "learning_rate": 1.7638240208511162e-05, + "loss": 1.2036, + "step": 1050 + }, + { + "epoch": 0.30246559633027525, + "grad_norm": 3.613998915069475, + "learning_rate": 1.760584682523696e-05, + "loss": 1.1794, + "step": 1055 + }, + { + "epoch": 0.3038990825688073, + "grad_norm": 3.168795755449482, + "learning_rate": 1.7573262981957814e-05, + "loss": 1.1812, + "step": 1060 + }, + { + "epoch": 0.30533256880733944, + "grad_norm": 3.883990610964808, + "learning_rate": 1.7540489494614294e-05, + "loss": 1.2273, + "step": 1065 + }, + { + "epoch": 0.30676605504587157, + "grad_norm": 4.068560981758666, + "learning_rate": 1.7507527183895893e-05, + "loss": 1.2297, + "step": 1070 + }, + { + "epoch": 0.3081995412844037, + "grad_norm": 3.812795569574441, + "learning_rate": 1.747437687522047e-05, + "loss": 1.2037, + "step": 1075 + }, + { + "epoch": 0.30963302752293576, + "grad_norm": 3.7886870743963077, + "learning_rate": 1.744103939871361e-05, + "loss": 1.2235, + "step": 1080 + }, + { + "epoch": 0.3110665137614679, + "grad_norm": 3.3064001495116555, + "learning_rate": 1.7407515589187793e-05, + "loss": 1.213, + "step": 1085 + }, + { + "epoch": 0.3125, + "grad_norm": 3.999062074927365, + "learning_rate": 1.7373806286121532e-05, + "loss": 1.2586, + "step": 1090 + }, + { + "epoch": 0.3139334862385321, + "grad_norm": 4.371331122546795, + "learning_rate": 1.7339912333638322e-05, + "loss": 1.1731, + "step": 1095 + }, + { + "epoch": 0.31536697247706424, + "grad_norm": 3.8167405663507, + "learning_rate": 1.730583458048552e-05, + "loss": 1.2098, + "step": 1100 + }, + { + "epoch": 0.3168004587155963, + "grad_norm": 3.398825843337014, + "learning_rate": 1.727157388001307e-05, + "loss": 1.187, + "step": 1105 + }, + { + "epoch": 0.31823394495412843, + "grad_norm": 3.323984794217734, + "learning_rate": 1.723713109015217e-05, + "loss": 1.229, + "step": 1110 + }, + { + "epoch": 0.31966743119266056, + "grad_norm": 3.388696780319304, + "learning_rate": 1.720250707339374e-05, + "loss": 1.1616, + "step": 1115 + }, + { + "epoch": 0.3211009174311927, + "grad_norm": 3.7284978608483934, + "learning_rate": 1.7167702696766877e-05, + "loss": 1.173, + "step": 1120 + }, + { + "epoch": 0.32253440366972475, + "grad_norm": 4.037999124478957, + "learning_rate": 1.7132718831817093e-05, + "loss": 1.1695, + "step": 1125 + }, + { + "epoch": 0.32396788990825687, + "grad_norm": 3.2060752585491086, + "learning_rate": 1.7097556354584526e-05, + "loss": 1.1464, + "step": 1130 + }, + { + "epoch": 0.325401376146789, + "grad_norm": 3.72775795176933, + "learning_rate": 1.7062216145581997e-05, + "loss": 1.1237, + "step": 1135 + }, + { + "epoch": 0.3268348623853211, + "grad_norm": 3.783961044208216, + "learning_rate": 1.7026699089772937e-05, + "loss": 1.1899, + "step": 1140 + }, + { + "epoch": 0.32826834862385323, + "grad_norm": 3.5936905948051323, + "learning_rate": 1.699100607654926e-05, + "loss": 1.176, + "step": 1145 + }, + { + "epoch": 0.3297018348623853, + "grad_norm": 3.5296084880895156, + "learning_rate": 1.6955137999709075e-05, + "loss": 1.1445, + "step": 1150 + }, + { + "epoch": 0.3311353211009174, + "grad_norm": 3.278850491610551, + "learning_rate": 1.6919095757434288e-05, + "loss": 1.2269, + "step": 1155 + }, + { + "epoch": 0.33256880733944955, + "grad_norm": 3.6404322855144247, + "learning_rate": 1.6882880252268156e-05, + "loss": 1.1836, + "step": 1160 + }, + { + "epoch": 0.33400229357798167, + "grad_norm": 3.437933045156408, + "learning_rate": 1.6846492391092625e-05, + "loss": 1.2295, + "step": 1165 + }, + { + "epoch": 0.33543577981651373, + "grad_norm": 3.5611595737623167, + "learning_rate": 1.680993308510568e-05, + "loss": 1.1862, + "step": 1170 + }, + { + "epoch": 0.33686926605504586, + "grad_norm": 4.312054532006292, + "learning_rate": 1.6773203249798482e-05, + "loss": 1.1602, + "step": 1175 + }, + { + "epoch": 0.338302752293578, + "grad_norm": 3.598611274354031, + "learning_rate": 1.6736303804932475e-05, + "loss": 1.1825, + "step": 1180 + }, + { + "epoch": 0.3397362385321101, + "grad_norm": 3.779933724951351, + "learning_rate": 1.6699235674516334e-05, + "loss": 1.1847, + "step": 1185 + }, + { + "epoch": 0.3411697247706422, + "grad_norm": 3.590038112136874, + "learning_rate": 1.666199978678283e-05, + "loss": 1.1984, + "step": 1190 + }, + { + "epoch": 0.3426032110091743, + "grad_norm": 4.110223116861788, + "learning_rate": 1.6624597074165597e-05, + "loss": 1.2468, + "step": 1195 + }, + { + "epoch": 0.3440366972477064, + "grad_norm": 3.603557931052651, + "learning_rate": 1.6587028473275772e-05, + "loss": 1.1962, + "step": 1200 + }, + { + "epoch": 0.34547018348623854, + "grad_norm": 3.209591671498054, + "learning_rate": 1.6549294924878532e-05, + "loss": 1.1714, + "step": 1205 + }, + { + "epoch": 0.34690366972477066, + "grad_norm": 3.6243010245754337, + "learning_rate": 1.651139737386957e-05, + "loss": 1.1733, + "step": 1210 + }, + { + "epoch": 0.3483371559633027, + "grad_norm": 3.9078886802263697, + "learning_rate": 1.6473336769251388e-05, + "loss": 1.1715, + "step": 1215 + }, + { + "epoch": 0.34977064220183485, + "grad_norm": 3.6344742284976306, + "learning_rate": 1.6435114064109575e-05, + "loss": 1.1244, + "step": 1220 + }, + { + "epoch": 0.35120412844036697, + "grad_norm": 3.6952536271050964, + "learning_rate": 1.6396730215588913e-05, + "loss": 1.2089, + "step": 1225 + }, + { + "epoch": 0.3526376146788991, + "grad_norm": 3.429886933846167, + "learning_rate": 1.6358186184869417e-05, + "loss": 1.1892, + "step": 1230 + }, + { + "epoch": 0.3540711009174312, + "grad_norm": 3.093455767598654, + "learning_rate": 1.631948293714227e-05, + "loss": 1.2106, + "step": 1235 + }, + { + "epoch": 0.3555045871559633, + "grad_norm": 4.038152481713509, + "learning_rate": 1.6280621441585647e-05, + "loss": 1.1697, + "step": 1240 + }, + { + "epoch": 0.3569380733944954, + "grad_norm": 3.6999557746923095, + "learning_rate": 1.6241602671340448e-05, + "loss": 1.2376, + "step": 1245 + }, + { + "epoch": 0.3583715596330275, + "grad_norm": 3.6041781324392868, + "learning_rate": 1.6202427603485933e-05, + "loss": 1.1327, + "step": 1250 + }, + { + "epoch": 0.35980504587155965, + "grad_norm": 3.4145472213007473, + "learning_rate": 1.6163097219015245e-05, + "loss": 1.1372, + "step": 1255 + }, + { + "epoch": 0.36123853211009177, + "grad_norm": 3.429021637188865, + "learning_rate": 1.6123612502810865e-05, + "loss": 1.1646, + "step": 1260 + }, + { + "epoch": 0.36267201834862384, + "grad_norm": 3.6122171856293708, + "learning_rate": 1.6083974443619922e-05, + "loss": 1.2031, + "step": 1265 + }, + { + "epoch": 0.36410550458715596, + "grad_norm": 3.2252592769247617, + "learning_rate": 1.6044184034029445e-05, + "loss": 1.1669, + "step": 1270 + }, + { + "epoch": 0.3655389908256881, + "grad_norm": 3.513524941132868, + "learning_rate": 1.6004242270441523e-05, + "loss": 1.1871, + "step": 1275 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 3.5159805187675115, + "learning_rate": 1.596415015304833e-05, + "loss": 1.1851, + "step": 1280 + }, + { + "epoch": 0.36840596330275227, + "grad_norm": 3.3054440445349837, + "learning_rate": 1.5923908685807087e-05, + "loss": 1.1074, + "step": 1285 + }, + { + "epoch": 0.3698394495412844, + "grad_norm": 3.6946632308744136, + "learning_rate": 1.588351887641494e-05, + "loss": 1.1596, + "step": 1290 + }, + { + "epoch": 0.3712729357798165, + "grad_norm": 3.3924561467755785, + "learning_rate": 1.5842981736283686e-05, + "loss": 1.2262, + "step": 1295 + }, + { + "epoch": 0.37270642201834864, + "grad_norm": 3.623083608828368, + "learning_rate": 1.5802298280514487e-05, + "loss": 1.1823, + "step": 1300 + }, + { + "epoch": 0.37413990825688076, + "grad_norm": 3.0148932024272437, + "learning_rate": 1.5761469527872427e-05, + "loss": 1.1513, + "step": 1305 + }, + { + "epoch": 0.3755733944954128, + "grad_norm": 10.198288104380973, + "learning_rate": 1.572049650076101e-05, + "loss": 1.1498, + "step": 1310 + }, + { + "epoch": 0.37700688073394495, + "grad_norm": 3.623884808969302, + "learning_rate": 1.5679380225196546e-05, + "loss": 1.1857, + "step": 1315 + }, + { + "epoch": 0.37844036697247707, + "grad_norm": 21.814389727799938, + "learning_rate": 1.5638121730782486e-05, + "loss": 1.1748, + "step": 1320 + }, + { + "epoch": 0.3798738532110092, + "grad_norm": 3.7586731538464684, + "learning_rate": 1.5596722050683598e-05, + "loss": 1.2024, + "step": 1325 + }, + { + "epoch": 0.38130733944954126, + "grad_norm": 3.914930197959616, + "learning_rate": 1.555518222160013e-05, + "loss": 1.1873, + "step": 1330 + }, + { + "epoch": 0.3827408256880734, + "grad_norm": 3.645421580899915, + "learning_rate": 1.551350328374184e-05, + "loss": 1.1252, + "step": 1335 + }, + { + "epoch": 0.3841743119266055, + "grad_norm": 3.1973889011870376, + "learning_rate": 1.5471686280801933e-05, + "loss": 1.1839, + "step": 1340 + }, + { + "epoch": 0.3856077981651376, + "grad_norm": 3.5501087765790915, + "learning_rate": 1.5429732259930955e-05, + "loss": 1.1232, + "step": 1345 + }, + { + "epoch": 0.38704128440366975, + "grad_norm": 3.3946063622500016, + "learning_rate": 1.538764227171054e-05, + "loss": 1.1768, + "step": 1350 + }, + { + "epoch": 0.3884747706422018, + "grad_norm": 3.3089407247339975, + "learning_rate": 1.5345417370127123e-05, + "loss": 1.1697, + "step": 1355 + }, + { + "epoch": 0.38990825688073394, + "grad_norm": 4.435494305269373, + "learning_rate": 1.5303058612545534e-05, + "loss": 1.1301, + "step": 1360 + }, + { + "epoch": 0.39134174311926606, + "grad_norm": 3.1427532195841343, + "learning_rate": 1.5260567059682535e-05, + "loss": 1.1857, + "step": 1365 + }, + { + "epoch": 0.3927752293577982, + "grad_norm": 3.5070367063445684, + "learning_rate": 1.521794377558024e-05, + "loss": 1.1294, + "step": 1370 + }, + { + "epoch": 0.39420871559633025, + "grad_norm": 3.6077040833104896, + "learning_rate": 1.5175189827579489e-05, + "loss": 1.1469, + "step": 1375 + }, + { + "epoch": 0.3956422018348624, + "grad_norm": 3.401651617393664, + "learning_rate": 1.5132306286293096e-05, + "loss": 1.1383, + "step": 1380 + }, + { + "epoch": 0.3970756880733945, + "grad_norm": 3.3826428424139925, + "learning_rate": 1.5089294225579077e-05, + "loss": 1.1887, + "step": 1385 + }, + { + "epoch": 0.3985091743119266, + "grad_norm": 3.2830968866458345, + "learning_rate": 1.5046154722513718e-05, + "loss": 1.15, + "step": 1390 + }, + { + "epoch": 0.39994266055045874, + "grad_norm": 3.2832815737637113, + "learning_rate": 1.5002888857364624e-05, + "loss": 1.1811, + "step": 1395 + }, + { + "epoch": 0.4013761467889908, + "grad_norm": 3.222829419424945, + "learning_rate": 1.4959497713563677e-05, + "loss": 1.1696, + "step": 1400 + }, + { + "epoch": 0.40280963302752293, + "grad_norm": 3.739181624579295, + "learning_rate": 1.4915982377679885e-05, + "loss": 1.1978, + "step": 1405 + }, + { + "epoch": 0.40424311926605505, + "grad_norm": 3.383948957829039, + "learning_rate": 1.4872343939392189e-05, + "loss": 1.1724, + "step": 1410 + }, + { + "epoch": 0.4056766055045872, + "grad_norm": 3.7382791698251427, + "learning_rate": 1.482858349146216e-05, + "loss": 1.1471, + "step": 1415 + }, + { + "epoch": 0.40711009174311924, + "grad_norm": 3.510878161789182, + "learning_rate": 1.4784702129706655e-05, + "loss": 1.1337, + "step": 1420 + }, + { + "epoch": 0.40854357798165136, + "grad_norm": 3.505598802015759, + "learning_rate": 1.474070095297036e-05, + "loss": 1.187, + "step": 1425 + }, + { + "epoch": 0.4099770642201835, + "grad_norm": 3.2069715268799204, + "learning_rate": 1.469658106309828e-05, + "loss": 1.1136, + "step": 1430 + }, + { + "epoch": 0.4114105504587156, + "grad_norm": 4.1812266615604345, + "learning_rate": 1.465234356490815e-05, + "loss": 1.1718, + "step": 1435 + }, + { + "epoch": 0.41284403669724773, + "grad_norm": 3.3985858466934338, + "learning_rate": 1.4607989566162761e-05, + "loss": 1.1414, + "step": 1440 + }, + { + "epoch": 0.4142775229357798, + "grad_norm": 3.6381472550431093, + "learning_rate": 1.4563520177542226e-05, + "loss": 1.2166, + "step": 1445 + }, + { + "epoch": 0.4157110091743119, + "grad_norm": 3.268767608874384, + "learning_rate": 1.451893651261617e-05, + "loss": 1.1695, + "step": 1450 + }, + { + "epoch": 0.41714449541284404, + "grad_norm": 4.519674348990799, + "learning_rate": 1.4474239687815838e-05, + "loss": 1.189, + "step": 1455 + }, + { + "epoch": 0.41857798165137616, + "grad_norm": 3.594106437257434, + "learning_rate": 1.4429430822406138e-05, + "loss": 1.1956, + "step": 1460 + }, + { + "epoch": 0.42001146788990823, + "grad_norm": 3.838114634468203, + "learning_rate": 1.4384511038457624e-05, + "loss": 1.1348, + "step": 1465 + }, + { + "epoch": 0.42144495412844035, + "grad_norm": 3.3721934098468083, + "learning_rate": 1.4339481460818385e-05, + "loss": 1.135, + "step": 1470 + }, + { + "epoch": 0.4228784403669725, + "grad_norm": 3.3513743473211512, + "learning_rate": 1.429434321708588e-05, + "loss": 1.1443, + "step": 1475 + }, + { + "epoch": 0.4243119266055046, + "grad_norm": 3.3279635877417304, + "learning_rate": 1.4249097437578712e-05, + "loss": 1.1574, + "step": 1480 + }, + { + "epoch": 0.4257454128440367, + "grad_norm": 3.5340814143703896, + "learning_rate": 1.4203745255308306e-05, + "loss": 1.1607, + "step": 1485 + }, + { + "epoch": 0.4271788990825688, + "grad_norm": 3.3296679892148995, + "learning_rate": 1.4158287805950557e-05, + "loss": 1.199, + "step": 1490 + }, + { + "epoch": 0.4286123853211009, + "grad_norm": 3.401879333323161, + "learning_rate": 1.411272622781737e-05, + "loss": 1.1364, + "step": 1495 + }, + { + "epoch": 0.43004587155963303, + "grad_norm": 3.131916232702695, + "learning_rate": 1.4067061661828176e-05, + "loss": 1.1472, + "step": 1500 + }, + { + "epoch": 0.43147935779816515, + "grad_norm": 3.543349954734912, + "learning_rate": 1.4021295251481347e-05, + "loss": 1.1813, + "step": 1505 + }, + { + "epoch": 0.4329128440366973, + "grad_norm": 3.309506087624542, + "learning_rate": 1.3975428142825562e-05, + "loss": 1.1581, + "step": 1510 + }, + { + "epoch": 0.43434633027522934, + "grad_norm": 3.9281940551258097, + "learning_rate": 1.392946148443112e-05, + "loss": 1.1808, + "step": 1515 + }, + { + "epoch": 0.43577981651376146, + "grad_norm": 3.8214812468244346, + "learning_rate": 1.3883396427361169e-05, + "loss": 1.1616, + "step": 1520 + }, + { + "epoch": 0.4372133027522936, + "grad_norm": 3.997270424750147, + "learning_rate": 1.383723412514288e-05, + "loss": 1.1113, + "step": 1525 + }, + { + "epoch": 0.4386467889908257, + "grad_norm": 3.437008178425516, + "learning_rate": 1.3790975733738576e-05, + "loss": 1.1713, + "step": 1530 + }, + { + "epoch": 0.4400802752293578, + "grad_norm": 3.3811402901395984, + "learning_rate": 1.3744622411516758e-05, + "loss": 1.1347, + "step": 1535 + }, + { + "epoch": 0.4415137614678899, + "grad_norm": 3.3962664452333993, + "learning_rate": 1.3698175319223133e-05, + "loss": 1.1347, + "step": 1540 + }, + { + "epoch": 0.442947247706422, + "grad_norm": 3.6162635705687127, + "learning_rate": 1.3651635619951509e-05, + "loss": 1.1574, + "step": 1545 + }, + { + "epoch": 0.44438073394495414, + "grad_norm": 3.502705575568886, + "learning_rate": 1.360500447911471e-05, + "loss": 1.0691, + "step": 1550 + }, + { + "epoch": 0.44581422018348627, + "grad_norm": 3.473102068128584, + "learning_rate": 1.3558283064415357e-05, + "loss": 1.1018, + "step": 1555 + }, + { + "epoch": 0.44724770642201833, + "grad_norm": 3.5587976176438656, + "learning_rate": 1.3511472545816648e-05, + "loss": 1.1293, + "step": 1560 + }, + { + "epoch": 0.44868119266055045, + "grad_norm": 3.613085860404909, + "learning_rate": 1.3464574095513058e-05, + "loss": 1.1685, + "step": 1565 + }, + { + "epoch": 0.4501146788990826, + "grad_norm": 4.06622018524846, + "learning_rate": 1.3417588887900978e-05, + "loss": 1.1067, + "step": 1570 + }, + { + "epoch": 0.4515481651376147, + "grad_norm": 3.4649254770088698, + "learning_rate": 1.3370518099549315e-05, + "loss": 1.1026, + "step": 1575 + }, + { + "epoch": 0.45298165137614677, + "grad_norm": 2.952840725725241, + "learning_rate": 1.3323362909170018e-05, + "loss": 1.1283, + "step": 1580 + }, + { + "epoch": 0.4544151376146789, + "grad_norm": 3.253136251920175, + "learning_rate": 1.3276124497588585e-05, + "loss": 1.2414, + "step": 1585 + }, + { + "epoch": 0.455848623853211, + "grad_norm": 3.162846986211554, + "learning_rate": 1.3228804047714462e-05, + "loss": 1.1169, + "step": 1590 + }, + { + "epoch": 0.45728211009174313, + "grad_norm": 3.3272179350821505, + "learning_rate": 1.3181402744511446e-05, + "loss": 1.1756, + "step": 1595 + }, + { + "epoch": 0.45871559633027525, + "grad_norm": 3.1731644889698183, + "learning_rate": 1.3133921774968001e-05, + "loss": 1.1305, + "step": 1600 + }, + { + "epoch": 0.4601490825688073, + "grad_norm": 3.1249681327250904, + "learning_rate": 1.3086362328067536e-05, + "loss": 1.1603, + "step": 1605 + }, + { + "epoch": 0.46158256880733944, + "grad_norm": 3.376924777276381, + "learning_rate": 1.3038725594758632e-05, + "loss": 1.118, + "step": 1610 + }, + { + "epoch": 0.46301605504587157, + "grad_norm": 3.193633263846461, + "learning_rate": 1.2991012767925224e-05, + "loss": 1.1337, + "step": 1615 + }, + { + "epoch": 0.4644495412844037, + "grad_norm": 3.0745547065753915, + "learning_rate": 1.2943225042356714e-05, + "loss": 1.1252, + "step": 1620 + }, + { + "epoch": 0.46588302752293576, + "grad_norm": 7.556185912909887, + "learning_rate": 1.2895363614718082e-05, + "loss": 1.1517, + "step": 1625 + }, + { + "epoch": 0.4673165137614679, + "grad_norm": 3.3618721338615787, + "learning_rate": 1.2847429683519879e-05, + "loss": 1.1321, + "step": 1630 + }, + { + "epoch": 0.46875, + "grad_norm": 3.1654091942580904, + "learning_rate": 1.2799424449088246e-05, + "loss": 1.1475, + "step": 1635 + }, + { + "epoch": 0.4701834862385321, + "grad_norm": 3.3736619152907386, + "learning_rate": 1.2751349113534856e-05, + "loss": 1.1297, + "step": 1640 + }, + { + "epoch": 0.47161697247706424, + "grad_norm": 3.711148861718645, + "learning_rate": 1.2703204880726788e-05, + "loss": 1.1654, + "step": 1645 + }, + { + "epoch": 0.4730504587155963, + "grad_norm": 3.0399296608693946, + "learning_rate": 1.2654992956256397e-05, + "loss": 1.1072, + "step": 1650 + }, + { + "epoch": 0.47448394495412843, + "grad_norm": 3.299482801946242, + "learning_rate": 1.2606714547411138e-05, + "loss": 1.1338, + "step": 1655 + }, + { + "epoch": 0.47591743119266056, + "grad_norm": 4.109998693762478, + "learning_rate": 1.2558370863143298e-05, + "loss": 1.1517, + "step": 1660 + }, + { + "epoch": 0.4773509174311927, + "grad_norm": 3.6028647178766167, + "learning_rate": 1.250996311403976e-05, + "loss": 1.2193, + "step": 1665 + }, + { + "epoch": 0.47878440366972475, + "grad_norm": 3.088463522622337, + "learning_rate": 1.246149251229166e-05, + "loss": 1.1481, + "step": 1670 + }, + { + "epoch": 0.48021788990825687, + "grad_norm": 3.1966754732870033, + "learning_rate": 1.2412960271664046e-05, + "loss": 1.0923, + "step": 1675 + }, + { + "epoch": 0.481651376146789, + "grad_norm": 3.5520419027791044, + "learning_rate": 1.2364367607465483e-05, + "loss": 1.131, + "step": 1680 + }, + { + "epoch": 0.4830848623853211, + "grad_norm": 3.1911366000417103, + "learning_rate": 1.2315715736517624e-05, + "loss": 1.1524, + "step": 1685 + }, + { + "epoch": 0.48451834862385323, + "grad_norm": 3.1232221206590847, + "learning_rate": 1.2267005877124721e-05, + "loss": 1.1336, + "step": 1690 + }, + { + "epoch": 0.4859518348623853, + "grad_norm": 3.1919229672837606, + "learning_rate": 1.2218239249043143e-05, + "loss": 1.1163, + "step": 1695 + }, + { + "epoch": 0.4873853211009174, + "grad_norm": 3.6466393865332223, + "learning_rate": 1.2169417073450805e-05, + "loss": 1.1113, + "step": 1700 + }, + { + "epoch": 0.48881880733944955, + "grad_norm": 3.275240485720321, + "learning_rate": 1.2120540572916617e-05, + "loss": 1.1516, + "step": 1705 + }, + { + "epoch": 0.49025229357798167, + "grad_norm": 3.350383352712665, + "learning_rate": 1.2071610971369842e-05, + "loss": 1.1564, + "step": 1710 + }, + { + "epoch": 0.49168577981651373, + "grad_norm": 3.5074589174762236, + "learning_rate": 1.2022629494069466e-05, + "loss": 1.1004, + "step": 1715 + }, + { + "epoch": 0.49311926605504586, + "grad_norm": 3.4164962172597035, + "learning_rate": 1.1973597367573509e-05, + "loss": 1.0966, + "step": 1720 + }, + { + "epoch": 0.494552752293578, + "grad_norm": 3.3927103725177825, + "learning_rate": 1.19245158197083e-05, + "loss": 1.1635, + "step": 1725 + }, + { + "epoch": 0.4959862385321101, + "grad_norm": 2.9590270815185193, + "learning_rate": 1.1875386079537762e-05, + "loss": 1.1415, + "step": 1730 + }, + { + "epoch": 0.4974197247706422, + "grad_norm": 3.257588158306314, + "learning_rate": 1.1826209377332593e-05, + "loss": 1.1326, + "step": 1735 + }, + { + "epoch": 0.4988532110091743, + "grad_norm": 3.3051541372419737, + "learning_rate": 1.1776986944539498e-05, + "loss": 1.1278, + "step": 1740 + }, + { + "epoch": 0.5002866972477065, + "grad_norm": 3.523346784327648, + "learning_rate": 1.1727720013750319e-05, + "loss": 1.1099, + "step": 1745 + }, + { + "epoch": 0.5017201834862385, + "grad_norm": 3.3893012078265015, + "learning_rate": 1.1678409818671192e-05, + "loss": 1.1546, + "step": 1750 + }, + { + "epoch": 0.5031536697247706, + "grad_norm": 3.3226857594304713, + "learning_rate": 1.1629057594091639e-05, + "loss": 1.1061, + "step": 1755 + }, + { + "epoch": 0.5045871559633027, + "grad_norm": 3.507323841567208, + "learning_rate": 1.1579664575853667e-05, + "loss": 1.1551, + "step": 1760 + }, + { + "epoch": 0.5060206422018348, + "grad_norm": 3.1700645014865856, + "learning_rate": 1.1530232000820791e-05, + "loss": 1.1151, + "step": 1765 + }, + { + "epoch": 0.507454128440367, + "grad_norm": 3.397843452612661, + "learning_rate": 1.1480761106847088e-05, + "loss": 1.0967, + "step": 1770 + }, + { + "epoch": 0.5088876146788991, + "grad_norm": 3.296593763189619, + "learning_rate": 1.1431253132746187e-05, + "loss": 1.1229, + "step": 1775 + }, + { + "epoch": 0.5103211009174312, + "grad_norm": 3.1860797359537574, + "learning_rate": 1.138170931826025e-05, + "loss": 1.1305, + "step": 1780 + }, + { + "epoch": 0.5117545871559633, + "grad_norm": 3.2917554463603333, + "learning_rate": 1.133213090402893e-05, + "loss": 1.0911, + "step": 1785 + }, + { + "epoch": 0.5131880733944955, + "grad_norm": 3.158139008213761, + "learning_rate": 1.1282519131558302e-05, + "loss": 1.1095, + "step": 1790 + }, + { + "epoch": 0.5146215596330275, + "grad_norm": 3.22490301605298, + "learning_rate": 1.1232875243189765e-05, + "loss": 1.1695, + "step": 1795 + }, + { + "epoch": 0.5160550458715596, + "grad_norm": 3.586519697086409, + "learning_rate": 1.1183200482068949e-05, + "loss": 1.0989, + "step": 1800 + }, + { + "epoch": 0.5174885321100917, + "grad_norm": 3.265891742392178, + "learning_rate": 1.1133496092114576e-05, + "loss": 1.0851, + "step": 1805 + }, + { + "epoch": 0.5189220183486238, + "grad_norm": 3.1099004425125334, + "learning_rate": 1.1083763317987304e-05, + "loss": 1.1119, + "step": 1810 + }, + { + "epoch": 0.520355504587156, + "grad_norm": 4.5404304007913305, + "learning_rate": 1.103400340505858e-05, + "loss": 1.1143, + "step": 1815 + }, + { + "epoch": 0.5217889908256881, + "grad_norm": 3.4230865219558764, + "learning_rate": 1.0984217599379425e-05, + "loss": 1.1182, + "step": 1820 + }, + { + "epoch": 0.5232224770642202, + "grad_norm": 3.1427663664508456, + "learning_rate": 1.093440714764926e-05, + "loss": 1.1317, + "step": 1825 + }, + { + "epoch": 0.5246559633027523, + "grad_norm": 2.9784943507834476, + "learning_rate": 1.088457329718467e-05, + "loss": 1.0364, + "step": 1830 + }, + { + "epoch": 0.5260894495412844, + "grad_norm": 3.3656116327984806, + "learning_rate": 1.0834717295888168e-05, + "loss": 1.1432, + "step": 1835 + }, + { + "epoch": 0.5275229357798165, + "grad_norm": 2.8639878500754774, + "learning_rate": 1.0784840392216961e-05, + "loss": 1.1421, + "step": 1840 + }, + { + "epoch": 0.5289564220183486, + "grad_norm": 3.340151227567289, + "learning_rate": 1.0734943835151674e-05, + "loss": 1.0931, + "step": 1845 + }, + { + "epoch": 0.5303899082568807, + "grad_norm": 3.8035180968079993, + "learning_rate": 1.0685028874165075e-05, + "loss": 1.1179, + "step": 1850 + }, + { + "epoch": 0.5318233944954128, + "grad_norm": 3.070748628370571, + "learning_rate": 1.0635096759190792e-05, + "loss": 1.123, + "step": 1855 + }, + { + "epoch": 0.533256880733945, + "grad_norm": 3.226775333715127, + "learning_rate": 1.0585148740592013e-05, + "loss": 1.1159, + "step": 1860 + }, + { + "epoch": 0.5346903669724771, + "grad_norm": 3.236504164334084, + "learning_rate": 1.053518606913017e-05, + "loss": 1.1494, + "step": 1865 + }, + { + "epoch": 0.5361238532110092, + "grad_norm": 3.2302791306950702, + "learning_rate": 1.048520999593362e-05, + "loss": 1.1444, + "step": 1870 + }, + { + "epoch": 0.5375573394495413, + "grad_norm": 3.2832052841745916, + "learning_rate": 1.0435221772466318e-05, + "loss": 1.1469, + "step": 1875 + }, + { + "epoch": 0.5389908256880734, + "grad_norm": 3.2524360195076496, + "learning_rate": 1.0385222650496479e-05, + "loss": 1.1313, + "step": 1880 + }, + { + "epoch": 0.5404243119266054, + "grad_norm": 2.919300243143409, + "learning_rate": 1.0335213882065225e-05, + "loss": 1.1112, + "step": 1885 + }, + { + "epoch": 0.5418577981651376, + "grad_norm": 3.7356105416504546, + "learning_rate": 1.0285196719455242e-05, + "loss": 1.1113, + "step": 1890 + }, + { + "epoch": 0.5432912844036697, + "grad_norm": 3.1163300953894306, + "learning_rate": 1.0235172415159418e-05, + "loss": 1.0671, + "step": 1895 + }, + { + "epoch": 0.5447247706422018, + "grad_norm": 3.6338243642002976, + "learning_rate": 1.0185142221849469e-05, + "loss": 1.1405, + "step": 1900 + }, + { + "epoch": 0.5461582568807339, + "grad_norm": 3.4265846748133613, + "learning_rate": 1.0135107392344594e-05, + "loss": 1.1052, + "step": 1905 + }, + { + "epoch": 0.5475917431192661, + "grad_norm": 3.6433669703963774, + "learning_rate": 1.0085069179580076e-05, + "loss": 1.0958, + "step": 1910 + }, + { + "epoch": 0.5490252293577982, + "grad_norm": 3.070116617818365, + "learning_rate": 1.0035028836575922e-05, + "loss": 1.0856, + "step": 1915 + }, + { + "epoch": 0.5504587155963303, + "grad_norm": 3.3094892848191373, + "learning_rate": 9.984987616405486e-06, + "loss": 1.1775, + "step": 1920 + }, + { + "epoch": 0.5518922018348624, + "grad_norm": 3.1253255381505465, + "learning_rate": 9.934946772164082e-06, + "loss": 1.0962, + "step": 1925 + }, + { + "epoch": 0.5533256880733946, + "grad_norm": 3.2006466765888377, + "learning_rate": 9.884907556937619e-06, + "loss": 1.0732, + "step": 1930 + }, + { + "epoch": 0.5547591743119266, + "grad_norm": 3.1292256633487647, + "learning_rate": 9.834871223771204e-06, + "loss": 1.0887, + "step": 1935 + }, + { + "epoch": 0.5561926605504587, + "grad_norm": 3.1916225285491886, + "learning_rate": 9.78483902563778e-06, + "loss": 1.1239, + "step": 1940 + }, + { + "epoch": 0.5576261467889908, + "grad_norm": 3.0332428019262623, + "learning_rate": 9.73481221540674e-06, + "loss": 1.1231, + "step": 1945 + }, + { + "epoch": 0.5590596330275229, + "grad_norm": 3.045815826748736, + "learning_rate": 9.684792045812555e-06, + "loss": 1.1098, + "step": 1950 + }, + { + "epoch": 0.560493119266055, + "grad_norm": 3.0391892370593334, + "learning_rate": 9.634779769423412e-06, + "loss": 1.1044, + "step": 1955 + }, + { + "epoch": 0.5619266055045872, + "grad_norm": 3.055569668736528, + "learning_rate": 9.584776638609841e-06, + "loss": 1.0894, + "step": 1960 + }, + { + "epoch": 0.5633600917431193, + "grad_norm": 3.5406439314630167, + "learning_rate": 9.534783905513355e-06, + "loss": 1.1109, + "step": 1965 + }, + { + "epoch": 0.5647935779816514, + "grad_norm": 3.345905555913882, + "learning_rate": 9.484802822015087e-06, + "loss": 1.1138, + "step": 1970 + }, + { + "epoch": 0.5662270642201835, + "grad_norm": 3.296141105361695, + "learning_rate": 9.434834639704464e-06, + "loss": 1.1059, + "step": 1975 + }, + { + "epoch": 0.5676605504587156, + "grad_norm": 2.8031036854561697, + "learning_rate": 9.384880609847838e-06, + "loss": 1.0806, + "step": 1980 + }, + { + "epoch": 0.5690940366972477, + "grad_norm": 3.0977315843393165, + "learning_rate": 9.33494198335717e-06, + "loss": 1.1364, + "step": 1985 + }, + { + "epoch": 0.5705275229357798, + "grad_norm": 3.3929218329827604, + "learning_rate": 9.285020010758706e-06, + "loss": 1.1644, + "step": 1990 + }, + { + "epoch": 0.5719610091743119, + "grad_norm": 3.2746802917151396, + "learning_rate": 9.235115942161656e-06, + "loss": 1.0867, + "step": 1995 + }, + { + "epoch": 0.573394495412844, + "grad_norm": 3.4111204084118594, + "learning_rate": 9.18523102722688e-06, + "loss": 1.1372, + "step": 2000 + }, + { + "epoch": 0.5748279816513762, + "grad_norm": 3.0339025073036883, + "learning_rate": 9.135366515135617e-06, + "loss": 1.1156, + "step": 2005 + }, + { + "epoch": 0.5762614678899083, + "grad_norm": 3.152200571534001, + "learning_rate": 9.0855236545582e-06, + "loss": 1.2007, + "step": 2010 + }, + { + "epoch": 0.5776949541284404, + "grad_norm": 5.672098040763039, + "learning_rate": 9.035703693622762e-06, + "loss": 1.1011, + "step": 2015 + }, + { + "epoch": 0.5791284403669725, + "grad_norm": 3.0337007868840207, + "learning_rate": 8.985907879884011e-06, + "loss": 1.0502, + "step": 2020 + }, + { + "epoch": 0.5805619266055045, + "grad_norm": 3.397456497600358, + "learning_rate": 8.936137460291985e-06, + "loss": 1.0686, + "step": 2025 + }, + { + "epoch": 0.5819954128440367, + "grad_norm": 3.1557825825649983, + "learning_rate": 8.886393681160804e-06, + "loss": 1.1242, + "step": 2030 + }, + { + "epoch": 0.5834288990825688, + "grad_norm": 3.014868609795805, + "learning_rate": 8.836677788137488e-06, + "loss": 1.0823, + "step": 2035 + }, + { + "epoch": 0.5848623853211009, + "grad_norm": 3.1697976935253256, + "learning_rate": 8.78699102617076e-06, + "loss": 1.1477, + "step": 2040 + }, + { + "epoch": 0.586295871559633, + "grad_norm": 3.060910090014108, + "learning_rate": 8.737334639479843e-06, + "loss": 1.1047, + "step": 2045 + }, + { + "epoch": 0.5877293577981652, + "grad_norm": 3.3975873769480525, + "learning_rate": 8.687709871523346e-06, + "loss": 1.1472, + "step": 2050 + }, + { + "epoch": 0.5891628440366973, + "grad_norm": 3.103294993064734, + "learning_rate": 8.638117964968098e-06, + "loss": 1.1147, + "step": 2055 + }, + { + "epoch": 0.5905963302752294, + "grad_norm": 3.1038713521837176, + "learning_rate": 8.588560161658039e-06, + "loss": 1.1201, + "step": 2060 + }, + { + "epoch": 0.5920298165137615, + "grad_norm": 3.579612587244007, + "learning_rate": 8.539037702583108e-06, + "loss": 1.2046, + "step": 2065 + }, + { + "epoch": 0.5934633027522935, + "grad_norm": 3.1302516904022766, + "learning_rate": 8.489551827848197e-06, + "loss": 1.0237, + "step": 2070 + }, + { + "epoch": 0.5948967889908257, + "grad_norm": 3.24638481758472, + "learning_rate": 8.440103776642074e-06, + "loss": 1.0678, + "step": 2075 + }, + { + "epoch": 0.5963302752293578, + "grad_norm": 3.4237594408353083, + "learning_rate": 8.390694787206349e-06, + "loss": 1.1734, + "step": 2080 + }, + { + "epoch": 0.5977637614678899, + "grad_norm": 3.0719379679166625, + "learning_rate": 8.341326096804489e-06, + "loss": 1.0969, + "step": 2085 + }, + { + "epoch": 0.599197247706422, + "grad_norm": 3.114117068959359, + "learning_rate": 8.291998941690821e-06, + "loss": 1.1036, + "step": 2090 + }, + { + "epoch": 0.6006307339449541, + "grad_norm": 3.058349370894333, + "learning_rate": 8.242714557079563e-06, + "loss": 1.0749, + "step": 2095 + }, + { + "epoch": 0.6020642201834863, + "grad_norm": 3.036934322182637, + "learning_rate": 8.193474177113918e-06, + "loss": 1.0583, + "step": 2100 + }, + { + "epoch": 0.6034977064220184, + "grad_norm": 3.13799906852602, + "learning_rate": 8.144279034835157e-06, + "loss": 1.1374, + "step": 2105 + }, + { + "epoch": 0.6049311926605505, + "grad_norm": 3.303135805293391, + "learning_rate": 8.095130362151737e-06, + "loss": 1.0465, + "step": 2110 + }, + { + "epoch": 0.6063646788990825, + "grad_norm": 3.3978912865766686, + "learning_rate": 8.046029389808457e-06, + "loss": 1.055, + "step": 2115 + }, + { + "epoch": 0.6077981651376146, + "grad_norm": 3.47121454527395, + "learning_rate": 7.996977347355647e-06, + "loss": 1.1155, + "step": 2120 + }, + { + "epoch": 0.6092316513761468, + "grad_norm": 3.156947339229311, + "learning_rate": 7.947975463118361e-06, + "loss": 1.0363, + "step": 2125 + }, + { + "epoch": 0.6106651376146789, + "grad_norm": 3.2797688368791955, + "learning_rate": 7.899024964165634e-06, + "loss": 1.1381, + "step": 2130 + }, + { + "epoch": 0.612098623853211, + "grad_norm": 2.9662302688805275, + "learning_rate": 7.850127076279747e-06, + "loss": 1.1032, + "step": 2135 + }, + { + "epoch": 0.6135321100917431, + "grad_norm": 3.203748291163447, + "learning_rate": 7.801283023925536e-06, + "loss": 1.0517, + "step": 2140 + }, + { + "epoch": 0.6149655963302753, + "grad_norm": 3.319287430110112, + "learning_rate": 7.752494030219724e-06, + "loss": 1.115, + "step": 2145 + }, + { + "epoch": 0.6163990825688074, + "grad_norm": 3.7703557404425823, + "learning_rate": 7.703761316900293e-06, + "loss": 1.0813, + "step": 2150 + }, + { + "epoch": 0.6178325688073395, + "grad_norm": 3.2303522091544727, + "learning_rate": 7.655086104295904e-06, + "loss": 1.0668, + "step": 2155 + }, + { + "epoch": 0.6192660550458715, + "grad_norm": 3.1818989111713676, + "learning_rate": 7.606469611295315e-06, + "loss": 1.1368, + "step": 2160 + }, + { + "epoch": 0.6206995412844036, + "grad_norm": 3.0516455298261347, + "learning_rate": 7.5579130553168815e-06, + "loss": 1.1132, + "step": 2165 + }, + { + "epoch": 0.6221330275229358, + "grad_norm": 3.3307988467764087, + "learning_rate": 7.50941765227805e-06, + "loss": 1.0518, + "step": 2170 + }, + { + "epoch": 0.6235665137614679, + "grad_norm": 3.3783423662774363, + "learning_rate": 7.460984616564929e-06, + "loss": 1.1132, + "step": 2175 + }, + { + "epoch": 0.625, + "grad_norm": 3.634134930894941, + "learning_rate": 7.412615161001866e-06, + "loss": 1.0798, + "step": 2180 + }, + { + "epoch": 0.6264334862385321, + "grad_norm": 3.2417955659870796, + "learning_rate": 7.364310496821086e-06, + "loss": 1.0864, + "step": 2185 + }, + { + "epoch": 0.6278669724770642, + "grad_norm": 3.1150031782926395, + "learning_rate": 7.316071833632346e-06, + "loss": 1.1044, + "step": 2190 + }, + { + "epoch": 0.6293004587155964, + "grad_norm": 3.058907033640181, + "learning_rate": 7.2679003793926626e-06, + "loss": 1.1097, + "step": 2195 + }, + { + "epoch": 0.6307339449541285, + "grad_norm": 3.2942538258713348, + "learning_rate": 7.2197973403760614e-06, + "loss": 1.1005, + "step": 2200 + }, + { + "epoch": 0.6321674311926605, + "grad_norm": 3.225308816069398, + "learning_rate": 7.171763921143346e-06, + "loss": 1.0627, + "step": 2205 + }, + { + "epoch": 0.6336009174311926, + "grad_norm": 3.0409935206984566, + "learning_rate": 7.123801324511972e-06, + "loss": 1.0937, + "step": 2210 + }, + { + "epoch": 0.6350344036697247, + "grad_norm": 9.664703329802007, + "learning_rate": 7.075910751525895e-06, + "loss": 1.1024, + "step": 2215 + }, + { + "epoch": 0.6364678899082569, + "grad_norm": 3.057262418668561, + "learning_rate": 7.0280934014255195e-06, + "loss": 1.0857, + "step": 2220 + }, + { + "epoch": 0.637901376146789, + "grad_norm": 3.0509477926802395, + "learning_rate": 6.980350471617638e-06, + "loss": 1.0727, + "step": 2225 + }, + { + "epoch": 0.6393348623853211, + "grad_norm": 3.4647112333044743, + "learning_rate": 6.9326831576454835e-06, + "loss": 1.1128, + "step": 2230 + }, + { + "epoch": 0.6407683486238532, + "grad_norm": 8.615283468155432, + "learning_rate": 6.885092653158768e-06, + "loss": 1.1205, + "step": 2235 + }, + { + "epoch": 0.6422018348623854, + "grad_norm": 3.6518697308480927, + "learning_rate": 6.837580149883787e-06, + "loss": 1.0998, + "step": 2240 + }, + { + "epoch": 0.6436353211009175, + "grad_norm": 3.2292043015948493, + "learning_rate": 6.790146837593599e-06, + "loss": 1.0467, + "step": 2245 + }, + { + "epoch": 0.6450688073394495, + "grad_norm": 3.060662379650319, + "learning_rate": 6.7427939040782175e-06, + "loss": 1.0426, + "step": 2250 + }, + { + "epoch": 0.6465022935779816, + "grad_norm": 3.3641577098688087, + "learning_rate": 6.695522535114866e-06, + "loss": 1.0968, + "step": 2255 + }, + { + "epoch": 0.6479357798165137, + "grad_norm": 3.025310634435396, + "learning_rate": 6.64833391443829e-06, + "loss": 1.1066, + "step": 2260 + }, + { + "epoch": 0.6493692660550459, + "grad_norm": 3.3638409022750477, + "learning_rate": 6.601229223711123e-06, + "loss": 1.061, + "step": 2265 + }, + { + "epoch": 0.650802752293578, + "grad_norm": 3.0591413772292415, + "learning_rate": 6.554209642494267e-06, + "loss": 1.1305, + "step": 2270 + }, + { + "epoch": 0.6522362385321101, + "grad_norm": 3.048895188924846, + "learning_rate": 6.507276348217393e-06, + "loss": 1.0676, + "step": 2275 + }, + { + "epoch": 0.6536697247706422, + "grad_norm": 3.1625182877065, + "learning_rate": 6.460430516149433e-06, + "loss": 0.9783, + "step": 2280 + }, + { + "epoch": 0.6551032110091743, + "grad_norm": 3.0971219975300093, + "learning_rate": 6.413673319369145e-06, + "loss": 1.0947, + "step": 2285 + }, + { + "epoch": 0.6565366972477065, + "grad_norm": 3.0087113841047706, + "learning_rate": 6.36700592873576e-06, + "loss": 1.1115, + "step": 2290 + }, + { + "epoch": 0.6579701834862385, + "grad_norm": 3.240656660427129, + "learning_rate": 6.320429512859645e-06, + "loss": 1.0604, + "step": 2295 + }, + { + "epoch": 0.6594036697247706, + "grad_norm": 2.8631951161021423, + "learning_rate": 6.273945238073047e-06, + "loss": 1.0521, + "step": 2300 + }, + { + "epoch": 0.6608371559633027, + "grad_norm": 3.372695675980184, + "learning_rate": 6.227554268400875e-06, + "loss": 1.1259, + "step": 2305 + }, + { + "epoch": 0.6622706422018348, + "grad_norm": 3.0645684579682353, + "learning_rate": 6.1812577655315695e-06, + "loss": 1.0146, + "step": 2310 + }, + { + "epoch": 0.663704128440367, + "grad_norm": 3.2992667272685456, + "learning_rate": 6.135056888788004e-06, + "loss": 1.0908, + "step": 2315 + }, + { + "epoch": 0.6651376146788991, + "grad_norm": 3.134342552266428, + "learning_rate": 6.088952795098442e-06, + "loss": 1.0408, + "step": 2320 + }, + { + "epoch": 0.6665711009174312, + "grad_norm": 3.195978033002455, + "learning_rate": 6.042946638967586e-06, + "loss": 1.0734, + "step": 2325 + }, + { + "epoch": 0.6680045871559633, + "grad_norm": 3.244082712886425, + "learning_rate": 5.997039572447658e-06, + "loss": 1.0466, + "step": 2330 + }, + { + "epoch": 0.6694380733944955, + "grad_norm": 2.9373127993862105, + "learning_rate": 5.951232745109552e-06, + "loss": 1.0658, + "step": 2335 + }, + { + "epoch": 0.6708715596330275, + "grad_norm": 3.1038594855264967, + "learning_rate": 5.9055273040140374e-06, + "loss": 1.0916, + "step": 2340 + }, + { + "epoch": 0.6723050458715596, + "grad_norm": 3.1281256278361256, + "learning_rate": 5.859924393683056e-06, + "loss": 1.1042, + "step": 2345 + }, + { + "epoch": 0.6737385321100917, + "grad_norm": 2.9165714439580825, + "learning_rate": 5.8144251560710415e-06, + "loss": 1.0735, + "step": 2350 + }, + { + "epoch": 0.6751720183486238, + "grad_norm": 3.3690225312568955, + "learning_rate": 5.769030730536336e-06, + "loss": 1.0574, + "step": 2355 + }, + { + "epoch": 0.676605504587156, + "grad_norm": 3.298580191163876, + "learning_rate": 5.723742253812658e-06, + "loss": 1.1132, + "step": 2360 + }, + { + "epoch": 0.6780389908256881, + "grad_norm": 3.004930197106605, + "learning_rate": 5.678560859980621e-06, + "loss": 1.0691, + "step": 2365 + }, + { + "epoch": 0.6794724770642202, + "grad_norm": 3.1318112002832197, + "learning_rate": 5.633487680439362e-06, + "loss": 1.0305, + "step": 2370 + }, + { + "epoch": 0.6809059633027523, + "grad_norm": 3.139769183129407, + "learning_rate": 5.588523843878189e-06, + "loss": 1.0547, + "step": 2375 + }, + { + "epoch": 0.6823394495412844, + "grad_norm": 3.060531894281262, + "learning_rate": 5.543670476248327e-06, + "loss": 1.0354, + "step": 2380 + }, + { + "epoch": 0.6837729357798165, + "grad_norm": 3.1023960334424556, + "learning_rate": 5.498928700734713e-06, + "loss": 1.0565, + "step": 2385 + }, + { + "epoch": 0.6852064220183486, + "grad_norm": 3.1211535834048396, + "learning_rate": 5.454299637727885e-06, + "loss": 1.0662, + "step": 2390 + }, + { + "epoch": 0.6866399082568807, + "grad_norm": 2.99851195359665, + "learning_rate": 5.409784404795913e-06, + "loss": 1.0467, + "step": 2395 + }, + { + "epoch": 0.6880733944954128, + "grad_norm": 3.0166642308548606, + "learning_rate": 5.365384116656415e-06, + "loss": 1.1033, + "step": 2400 + }, + { + "epoch": 0.689506880733945, + "grad_norm": 3.1825150095020724, + "learning_rate": 5.321099885148652e-06, + "loss": 1.0541, + "step": 2405 + }, + { + "epoch": 0.6909403669724771, + "grad_norm": 3.3490852800179174, + "learning_rate": 5.2769328192056824e-06, + "loss": 1.0624, + "step": 2410 + }, + { + "epoch": 0.6923738532110092, + "grad_norm": 3.1423133524388698, + "learning_rate": 5.23288402482658e-06, + "loss": 1.0582, + "step": 2415 + }, + { + "epoch": 0.6938073394495413, + "grad_norm": 3.1573558030480235, + "learning_rate": 5.18895460504876e-06, + "loss": 1.0325, + "step": 2420 + }, + { + "epoch": 0.6952408256880734, + "grad_norm": 3.107763639677989, + "learning_rate": 5.145145659920348e-06, + "loss": 1.0017, + "step": 2425 + }, + { + "epoch": 0.6966743119266054, + "grad_norm": 3.3802618734734793, + "learning_rate": 5.101458286472618e-06, + "loss": 1.0718, + "step": 2430 + }, + { + "epoch": 0.6981077981651376, + "grad_norm": 2.918248752013308, + "learning_rate": 5.05789357869255e-06, + "loss": 1.0316, + "step": 2435 + }, + { + "epoch": 0.6995412844036697, + "grad_norm": 3.189799059727885, + "learning_rate": 5.01445262749542e-06, + "loss": 1.0851, + "step": 2440 + }, + { + "epoch": 0.7009747706422018, + "grad_norm": 3.2475620018827676, + "learning_rate": 4.9711365206974716e-06, + "loss": 1.0537, + "step": 2445 + }, + { + "epoch": 0.7024082568807339, + "grad_norm": 3.0707000997716047, + "learning_rate": 4.927946342988699e-06, + "loss": 1.0271, + "step": 2450 + }, + { + "epoch": 0.7038417431192661, + "grad_norm": 2.9176475930747543, + "learning_rate": 4.884883175905671e-06, + "loss": 1.0535, + "step": 2455 + }, + { + "epoch": 0.7052752293577982, + "grad_norm": 3.0577673743440923, + "learning_rate": 4.8419480978044395e-06, + "loss": 1.0848, + "step": 2460 + }, + { + "epoch": 0.7067087155963303, + "grad_norm": 3.3735135945939136, + "learning_rate": 4.799142183833561e-06, + "loss": 1.0651, + "step": 2465 + }, + { + "epoch": 0.7081422018348624, + "grad_norm": 3.170994136975781, + "learning_rate": 4.75646650590715e-06, + "loss": 1.0736, + "step": 2470 + }, + { + "epoch": 0.7095756880733946, + "grad_norm": 2.993299399999142, + "learning_rate": 4.713922132678055e-06, + "loss": 1.0415, + "step": 2475 + }, + { + "epoch": 0.7110091743119266, + "grad_norm": 3.4225173298945117, + "learning_rate": 4.671510129511074e-06, + "loss": 1.0363, + "step": 2480 + }, + { + "epoch": 0.7124426605504587, + "grad_norm": 3.0344762068899365, + "learning_rate": 4.629231558456306e-06, + "loss": 1.0414, + "step": 2485 + }, + { + "epoch": 0.7138761467889908, + "grad_norm": 3.1844264445491297, + "learning_rate": 4.587087478222539e-06, + "loss": 1.0714, + "step": 2490 + }, + { + "epoch": 0.7153096330275229, + "grad_norm": 3.09219275586818, + "learning_rate": 4.545078944150728e-06, + "loss": 1.017, + "step": 2495 + }, + { + "epoch": 0.716743119266055, + "grad_norm": 3.4728979799373976, + "learning_rate": 4.5032070081876e-06, + "loss": 1.0612, + "step": 2500 + }, + { + "epoch": 0.7181766055045872, + "grad_norm": 3.063801560649391, + "learning_rate": 4.4614727188592835e-06, + "loss": 1.0798, + "step": 2505 + }, + { + "epoch": 0.7196100917431193, + "grad_norm": 3.823900363914515, + "learning_rate": 4.419877121245058e-06, + "loss": 1.0435, + "step": 2510 + }, + { + "epoch": 0.7210435779816514, + "grad_norm": 2.8618913314337107, + "learning_rate": 4.378421256951192e-06, + "loss": 1.0102, + "step": 2515 + }, + { + "epoch": 0.7224770642201835, + "grad_norm": 2.9378204976736466, + "learning_rate": 4.337106164084861e-06, + "loss": 1.0824, + "step": 2520 + }, + { + "epoch": 0.7239105504587156, + "grad_norm": 2.945681604419468, + "learning_rate": 4.295932877228132e-06, + "loss": 0.9985, + "step": 2525 + }, + { + "epoch": 0.7253440366972477, + "grad_norm": 3.106814303596456, + "learning_rate": 4.254902427412082e-06, + "loss": 1.0317, + "step": 2530 + }, + { + "epoch": 0.7267775229357798, + "grad_norm": 3.1281573814995363, + "learning_rate": 4.214015842090969e-06, + "loss": 1.0491, + "step": 2535 + }, + { + "epoch": 0.7282110091743119, + "grad_norm": 2.795954987429199, + "learning_rate": 4.173274145116491e-06, + "loss": 0.9885, + "step": 2540 + }, + { + "epoch": 0.729644495412844, + "grad_norm": 3.3191476249085565, + "learning_rate": 4.1326783567121685e-06, + "loss": 1.0517, + "step": 2545 + }, + { + "epoch": 0.7310779816513762, + "grad_norm": 3.07441160825076, + "learning_rate": 4.092229493447788e-06, + "loss": 1.0512, + "step": 2550 + }, + { + "epoch": 0.7325114678899083, + "grad_norm": 3.3162975946352824, + "learning_rate": 4.051928568213942e-06, + "loss": 1.0476, + "step": 2555 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 3.222715903509193, + "learning_rate": 4.0117765901966635e-06, + "loss": 1.0401, + "step": 2560 + }, + { + "epoch": 0.7353784403669725, + "grad_norm": 3.1385505256509014, + "learning_rate": 3.9717745648521646e-06, + "loss": 1.0243, + "step": 2565 + }, + { + "epoch": 0.7368119266055045, + "grad_norm": 3.5703129773472506, + "learning_rate": 3.931923493881659e-06, + "loss": 1.0319, + "step": 2570 + }, + { + "epoch": 0.7382454128440367, + "grad_norm": 3.0801290385363393, + "learning_rate": 3.892224375206256e-06, + "loss": 1.0382, + "step": 2575 + }, + { + "epoch": 0.7396788990825688, + "grad_norm": 3.2917177730741676, + "learning_rate": 3.8526782029420005e-06, + "loss": 1.1191, + "step": 2580 + }, + { + "epoch": 0.7411123853211009, + "grad_norm": 3.1425801332889827, + "learning_rate": 3.8132859673749688e-06, + "loss": 1.0481, + "step": 2585 + }, + { + "epoch": 0.742545871559633, + "grad_norm": 3.1741701966057447, + "learning_rate": 3.774048654936454e-06, + "loss": 1.0072, + "step": 2590 + }, + { + "epoch": 0.7439793577981652, + "grad_norm": 3.4923814958157466, + "learning_rate": 3.7349672481782894e-06, + "loss": 1.01, + "step": 2595 + }, + { + "epoch": 0.7454128440366973, + "grad_norm": 3.121310877940095, + "learning_rate": 3.6960427257482343e-06, + "loss": 1.0804, + "step": 2600 + }, + { + "epoch": 0.7468463302752294, + "grad_norm": 3.1172405011336908, + "learning_rate": 3.657276062365457e-06, + "loss": 1.0313, + "step": 2605 + }, + { + "epoch": 0.7482798165137615, + "grad_norm": 3.0906434983571223, + "learning_rate": 3.618668228796143e-06, + "loss": 1.0378, + "step": 2610 + }, + { + "epoch": 0.7497133027522935, + "grad_norm": 3.1863927645425725, + "learning_rate": 3.580220191829178e-06, + "loss": 0.9929, + "step": 2615 + }, + { + "epoch": 0.7511467889908257, + "grad_norm": 3.0251252557702437, + "learning_rate": 3.5419329142519433e-06, + "loss": 1.0814, + "step": 2620 + }, + { + "epoch": 0.7525802752293578, + "grad_norm": 4.943706664249104, + "learning_rate": 3.5038073548261888e-06, + "loss": 1.0629, + "step": 2625 + }, + { + "epoch": 0.7540137614678899, + "grad_norm": 3.16322658334476, + "learning_rate": 3.46584446826405e-06, + "loss": 0.9881, + "step": 2630 + }, + { + "epoch": 0.755447247706422, + "grad_norm": 3.125095249964922, + "learning_rate": 3.428045205204125e-06, + "loss": 0.9923, + "step": 2635 + }, + { + "epoch": 0.7568807339449541, + "grad_norm": 2.973224447074909, + "learning_rate": 3.3904105121876764e-06, + "loss": 1.1052, + "step": 2640 + }, + { + "epoch": 0.7583142201834863, + "grad_norm": 2.996637163056179, + "learning_rate": 3.3529413316349145e-06, + "loss": 1.1049, + "step": 2645 + }, + { + "epoch": 0.7597477064220184, + "grad_norm": 2.935944096948497, + "learning_rate": 3.3156386018214193e-06, + "loss": 1.0712, + "step": 2650 + }, + { + "epoch": 0.7611811926605505, + "grad_norm": 3.038494065615691, + "learning_rate": 3.2785032568546304e-06, + "loss": 1.0472, + "step": 2655 + }, + { + "epoch": 0.7626146788990825, + "grad_norm": 3.2871929054536397, + "learning_rate": 3.2415362266504626e-06, + "loss": 1.0342, + "step": 2660 + }, + { + "epoch": 0.7640481651376146, + "grad_norm": 3.098314519979228, + "learning_rate": 3.2047384369100065e-06, + "loss": 1.0903, + "step": 2665 + }, + { + "epoch": 0.7654816513761468, + "grad_norm": 2.9569161427598787, + "learning_rate": 3.16811080909637e-06, + "loss": 1.0007, + "step": 2670 + }, + { + "epoch": 0.7669151376146789, + "grad_norm": 3.355325001454353, + "learning_rate": 3.1316542604115853e-06, + "loss": 1.0525, + "step": 2675 + }, + { + "epoch": 0.768348623853211, + "grad_norm": 3.7866942452127113, + "learning_rate": 3.095369703773652e-06, + "loss": 1.0607, + "step": 2680 + }, + { + "epoch": 0.7697821100917431, + "grad_norm": 2.8549205403217024, + "learning_rate": 3.0592580477936606e-06, + "loss": 1.0612, + "step": 2685 + }, + { + "epoch": 0.7712155963302753, + "grad_norm": 3.1911721995721547, + "learning_rate": 3.0233201967530647e-06, + "loss": 1.0756, + "step": 2690 + }, + { + "epoch": 0.7726490825688074, + "grad_norm": 3.254594926845702, + "learning_rate": 2.987557050581017e-06, + "loss": 1.0928, + "step": 2695 + }, + { + "epoch": 0.7740825688073395, + "grad_norm": 3.029915420885045, + "learning_rate": 2.9519695048318353e-06, + "loss": 0.9834, + "step": 2700 + }, + { + "epoch": 0.7755160550458715, + "grad_norm": 3.7010803603107614, + "learning_rate": 2.9165584506625864e-06, + "loss": 1.0616, + "step": 2705 + }, + { + "epoch": 0.7769495412844036, + "grad_norm": 3.0973140862278155, + "learning_rate": 2.8813247748107665e-06, + "loss": 1.0493, + "step": 2710 + }, + { + "epoch": 0.7783830275229358, + "grad_norm": 3.3576787529696217, + "learning_rate": 2.8462693595720938e-06, + "loss": 1.0743, + "step": 2715 + }, + { + "epoch": 0.7798165137614679, + "grad_norm": 3.074976481415647, + "learning_rate": 2.8113930827784076e-06, + "loss": 1.0157, + "step": 2720 + }, + { + "epoch": 0.78125, + "grad_norm": 3.0753397191274945, + "learning_rate": 2.776696817775707e-06, + "loss": 1.0266, + "step": 2725 + }, + { + "epoch": 0.7826834862385321, + "grad_norm": 3.09366006809429, + "learning_rate": 2.7421814334022624e-06, + "loss": 1.0243, + "step": 2730 + }, + { + "epoch": 0.7841169724770642, + "grad_norm": 3.6168913804023326, + "learning_rate": 2.7078477939668625e-06, + "loss": 1.0652, + "step": 2735 + }, + { + "epoch": 0.7855504587155964, + "grad_norm": 4.050361789957049, + "learning_rate": 2.673696759227177e-06, + "loss": 1.0636, + "step": 2740 + }, + { + "epoch": 0.7869839449541285, + "grad_norm": 3.249151536222312, + "learning_rate": 2.639729184368226e-06, + "loss": 1.0643, + "step": 2745 + }, + { + "epoch": 0.7884174311926605, + "grad_norm": 2.9287260819204692, + "learning_rate": 2.6059459199809545e-06, + "loss": 1.0028, + "step": 2750 + }, + { + "epoch": 0.7898509174311926, + "grad_norm": 3.0465964857735663, + "learning_rate": 2.5723478120409474e-06, + "loss": 1.0421, + "step": 2755 + }, + { + "epoch": 0.7912844036697247, + "grad_norm": 3.153016800252537, + "learning_rate": 2.5389357018872405e-06, + "loss": 1.0201, + "step": 2760 + }, + { + "epoch": 0.7927178899082569, + "grad_norm": 3.115248902197113, + "learning_rate": 2.505710426201239e-06, + "loss": 1.1122, + "step": 2765 + }, + { + "epoch": 0.794151376146789, + "grad_norm": 3.456526732392788, + "learning_rate": 2.4726728169857885e-06, + "loss": 0.9736, + "step": 2770 + }, + { + "epoch": 0.7955848623853211, + "grad_norm": 3.078706890825068, + "learning_rate": 2.439823701544328e-06, + "loss": 1.0208, + "step": 2775 + }, + { + "epoch": 0.7970183486238532, + "grad_norm": 3.0852747181382485, + "learning_rate": 2.407163902460167e-06, + "loss": 1.0112, + "step": 2780 + }, + { + "epoch": 0.7984518348623854, + "grad_norm": 2.9795351626224202, + "learning_rate": 2.3746942375758986e-06, + "loss": 1.0495, + "step": 2785 + }, + { + "epoch": 0.7998853211009175, + "grad_norm": 3.662965002906388, + "learning_rate": 2.3424155199729206e-06, + "loss": 1.0088, + "step": 2790 + }, + { + "epoch": 0.8013188073394495, + "grad_norm": 2.9979503759811146, + "learning_rate": 2.310328557951065e-06, + "loss": 0.9978, + "step": 2795 + }, + { + "epoch": 0.8027522935779816, + "grad_norm": 3.1993479502657856, + "learning_rate": 2.2784341550083577e-06, + "loss": 1.08, + "step": 2800 + }, + { + "epoch": 0.8041857798165137, + "grad_norm": 2.9645752085086716, + "learning_rate": 2.2467331098209098e-06, + "loss": 1.0019, + "step": 2805 + }, + { + "epoch": 0.8056192660550459, + "grad_norm": 3.0604845129074856, + "learning_rate": 2.215226216222911e-06, + "loss": 0.9847, + "step": 2810 + }, + { + "epoch": 0.807052752293578, + "grad_norm": 3.1249395082814235, + "learning_rate": 2.1839142631867396e-06, + "loss": 1.003, + "step": 2815 + }, + { + "epoch": 0.8084862385321101, + "grad_norm": 3.5887794377789746, + "learning_rate": 2.1527980348032263e-06, + "loss": 1.0915, + "step": 2820 + }, + { + "epoch": 0.8099197247706422, + "grad_norm": 2.9913673167068056, + "learning_rate": 2.121878310262008e-06, + "loss": 1.0346, + "step": 2825 + }, + { + "epoch": 0.8113532110091743, + "grad_norm": 3.076522728603463, + "learning_rate": 2.0911558638320117e-06, + "loss": 1.0254, + "step": 2830 + }, + { + "epoch": 0.8127866972477065, + "grad_norm": 3.4952190302417625, + "learning_rate": 2.0606314648420757e-06, + "loss": 1.0164, + "step": 2835 + }, + { + "epoch": 0.8142201834862385, + "grad_norm": 2.987547482751348, + "learning_rate": 2.0303058776616847e-06, + "loss": 1.0241, + "step": 2840 + }, + { + "epoch": 0.8156536697247706, + "grad_norm": 3.0295895740792265, + "learning_rate": 2.0001798616818137e-06, + "loss": 0.9397, + "step": 2845 + }, + { + "epoch": 0.8170871559633027, + "grad_norm": 3.2124562930166376, + "learning_rate": 1.970254171295931e-06, + "loss": 1.0349, + "step": 2850 + }, + { + "epoch": 0.8185206422018348, + "grad_norm": 2.7811469673449403, + "learning_rate": 1.940529555881101e-06, + "loss": 1.0372, + "step": 2855 + }, + { + "epoch": 0.819954128440367, + "grad_norm": 3.0716049369726854, + "learning_rate": 1.9110067597792094e-06, + "loss": 1.0258, + "step": 2860 + }, + { + "epoch": 0.8213876146788991, + "grad_norm": 3.31583577307639, + "learning_rate": 1.8816865222783354e-06, + "loss": 1.0659, + "step": 2865 + }, + { + "epoch": 0.8228211009174312, + "grad_norm": 3.3545193733702616, + "learning_rate": 1.8525695775942376e-06, + "loss": 0.997, + "step": 2870 + }, + { + "epoch": 0.8242545871559633, + "grad_norm": 3.082738506583357, + "learning_rate": 1.8236566548519664e-06, + "loss": 1.0109, + "step": 2875 + }, + { + "epoch": 0.8256880733944955, + "grad_norm": 2.9272236463346326, + "learning_rate": 1.7949484780675941e-06, + "loss": 1.0096, + "step": 2880 + }, + { + "epoch": 0.8271215596330275, + "grad_norm": 2.9288594245035937, + "learning_rate": 1.7664457661301103e-06, + "loss": 1.0126, + "step": 2885 + }, + { + "epoch": 0.8285550458715596, + "grad_norm": 2.9288774104923077, + "learning_rate": 1.7381492327834004e-06, + "loss": 1.0499, + "step": 2890 + }, + { + "epoch": 0.8299885321100917, + "grad_norm": 2.7877742435190065, + "learning_rate": 1.7100595866083713e-06, + "loss": 1.0041, + "step": 2895 + }, + { + "epoch": 0.8314220183486238, + "grad_norm": 3.0286412164914656, + "learning_rate": 1.6821775310052212e-06, + "loss": 0.9811, + "step": 2900 + }, + { + "epoch": 0.832855504587156, + "grad_norm": 2.7532642945829537, + "learning_rate": 1.65450376417582e-06, + "loss": 1.0096, + "step": 2905 + }, + { + "epoch": 0.8342889908256881, + "grad_norm": 2.9265157315568375, + "learning_rate": 1.6270389791062146e-06, + "loss": 1.0491, + "step": 2910 + }, + { + "epoch": 0.8357224770642202, + "grad_norm": 3.216652891176282, + "learning_rate": 1.5997838635492936e-06, + "loss": 1.0956, + "step": 2915 + }, + { + "epoch": 0.8371559633027523, + "grad_norm": 3.242345149185627, + "learning_rate": 1.5727391000075542e-06, + "loss": 0.9864, + "step": 2920 + }, + { + "epoch": 0.8385894495412844, + "grad_norm": 3.2259601086687204, + "learning_rate": 1.5459053657160084e-06, + "loss": 1.1009, + "step": 2925 + }, + { + "epoch": 0.8400229357798165, + "grad_norm": 2.744523780896053, + "learning_rate": 1.5192833326252377e-06, + "loss": 1.0163, + "step": 2930 + }, + { + "epoch": 0.8414564220183486, + "grad_norm": 2.828910888875833, + "learning_rate": 1.4928736673845534e-06, + "loss": 1.0037, + "step": 2935 + }, + { + "epoch": 0.8428899082568807, + "grad_norm": 3.062048982707707, + "learning_rate": 1.4666770313253054e-06, + "loss": 1.0248, + "step": 2940 + }, + { + "epoch": 0.8443233944954128, + "grad_norm": 3.1169358681462174, + "learning_rate": 1.4406940804443303e-06, + "loss": 1.0094, + "step": 2945 + }, + { + "epoch": 0.845756880733945, + "grad_norm": 3.045993870767794, + "learning_rate": 1.4149254653875167e-06, + "loss": 1.0523, + "step": 2950 + }, + { + "epoch": 0.8471903669724771, + "grad_norm": 3.0564501410586637, + "learning_rate": 1.389371831433507e-06, + "loss": 1.0095, + "step": 2955 + }, + { + "epoch": 0.8486238532110092, + "grad_norm": 3.4108278621499917, + "learning_rate": 1.3640338184775526e-06, + "loss": 1.0366, + "step": 2960 + }, + { + "epoch": 0.8500573394495413, + "grad_norm": 3.309723042860519, + "learning_rate": 1.3389120610154804e-06, + "loss": 1.015, + "step": 2965 + }, + { + "epoch": 0.8514908256880734, + "grad_norm": 3.40140275029476, + "learning_rate": 1.3140071881278106e-06, + "loss": 1.1159, + "step": 2970 + }, + { + "epoch": 0.8529243119266054, + "grad_norm": 3.0642718892804344, + "learning_rate": 1.2893198234639904e-06, + "loss": 0.983, + "step": 2975 + }, + { + "epoch": 0.8543577981651376, + "grad_norm": 3.5196505954999315, + "learning_rate": 1.2648505852267956e-06, + "loss": 1.0422, + "step": 2980 + }, + { + "epoch": 0.8557912844036697, + "grad_norm": 3.012388196153649, + "learning_rate": 1.240600086156839e-06, + "loss": 1.0161, + "step": 2985 + }, + { + "epoch": 0.8572247706422018, + "grad_norm": 4.224630882517724, + "learning_rate": 1.2165689335172248e-06, + "loss": 1.0435, + "step": 2990 + }, + { + "epoch": 0.8586582568807339, + "grad_norm": 3.3927019439454558, + "learning_rate": 1.1927577290783488e-06, + "loss": 1.0549, + "step": 2995 + }, + { + "epoch": 0.8600917431192661, + "grad_norm": 2.8680598337924965, + "learning_rate": 1.169167069102828e-06, + "loss": 1.062, + "step": 3000 + }, + { + "epoch": 0.8615252293577982, + "grad_norm": 3.2446852778299453, + "learning_rate": 1.1457975443305625e-06, + "loss": 1.0439, + "step": 3005 + }, + { + "epoch": 0.8629587155963303, + "grad_norm": 3.074025983059374, + "learning_rate": 1.1226497399639501e-06, + "loss": 1.0202, + "step": 3010 + }, + { + "epoch": 0.8643922018348624, + "grad_norm": 2.9805357814293423, + "learning_rate": 1.0997242356532335e-06, + "loss": 0.9824, + "step": 3015 + }, + { + "epoch": 0.8658256880733946, + "grad_norm": 3.377450943710277, + "learning_rate": 1.0770216054819782e-06, + "loss": 1.0292, + "step": 3020 + }, + { + "epoch": 0.8672591743119266, + "grad_norm": 3.203588510488362, + "learning_rate": 1.0545424179526963e-06, + "loss": 1.0008, + "step": 3025 + }, + { + "epoch": 0.8686926605504587, + "grad_norm": 3.3838873878755438, + "learning_rate": 1.03228723597262e-06, + "loss": 1.0276, + "step": 3030 + }, + { + "epoch": 0.8701261467889908, + "grad_norm": 3.0207979512808247, + "learning_rate": 1.0102566168395977e-06, + "loss": 1.0448, + "step": 3035 + }, + { + "epoch": 0.8715596330275229, + "grad_norm": 3.294936816848852, + "learning_rate": 9.884511122281427e-07, + "loss": 1.0123, + "step": 3040 + }, + { + "epoch": 0.872993119266055, + "grad_norm": 3.0263492251087802, + "learning_rate": 9.668712681756087e-07, + "loss": 0.9943, + "step": 3045 + }, + { + "epoch": 0.8744266055045872, + "grad_norm": 2.987557644263372, + "learning_rate": 9.455176250685338e-07, + "loss": 1.0392, + "step": 3050 + }, + { + "epoch": 0.8758600917431193, + "grad_norm": 2.827866834943389, + "learning_rate": 9.243907176290945e-07, + "loss": 1.0009, + "step": 3055 + }, + { + "epoch": 0.8772935779816514, + "grad_norm": 3.0479507652012106, + "learning_rate": 9.034910749017211e-07, + "loss": 1.0362, + "step": 3060 + }, + { + "epoch": 0.8787270642201835, + "grad_norm": 3.166615479232945, + "learning_rate": 8.828192202398455e-07, + "loss": 1.0649, + "step": 3065 + }, + { + "epoch": 0.8801605504587156, + "grad_norm": 3.086530733524286, + "learning_rate": 8.623756712928022e-07, + "loss": 1.0161, + "step": 3070 + }, + { + "epoch": 0.8815940366972477, + "grad_norm": 3.2339942958362995, + "learning_rate": 8.421609399928621e-07, + "loss": 1.0274, + "step": 3075 + }, + { + "epoch": 0.8830275229357798, + "grad_norm": 3.1624079222447845, + "learning_rate": 8.221755325424152e-07, + "loss": 0.963, + "step": 3080 + }, + { + "epoch": 0.8844610091743119, + "grad_norm": 6.67242001014344, + "learning_rate": 8.024199494012863e-07, + "loss": 0.994, + "step": 3085 + }, + { + "epoch": 0.885894495412844, + "grad_norm": 3.0370868865276424, + "learning_rate": 7.828946852742148e-07, + "loss": 1.0551, + "step": 3090 + }, + { + "epoch": 0.8873279816513762, + "grad_norm": 3.1653796046419185, + "learning_rate": 7.636002290984634e-07, + "loss": 1.0085, + "step": 3095 + }, + { + "epoch": 0.8887614678899083, + "grad_norm": 3.018418033888261, + "learning_rate": 7.445370640315642e-07, + "loss": 1.0017, + "step": 3100 + }, + { + "epoch": 0.8901949541284404, + "grad_norm": 3.543389563066827, + "learning_rate": 7.257056674392359e-07, + "loss": 1.002, + "step": 3105 + }, + { + "epoch": 0.8916284403669725, + "grad_norm": 3.1380320671176194, + "learning_rate": 7.071065108834197e-07, + "loss": 1.0311, + "step": 3110 + }, + { + "epoch": 0.8930619266055045, + "grad_norm": 3.107368067479966, + "learning_rate": 6.887400601104688e-07, + "loss": 0.9969, + "step": 3115 + }, + { + "epoch": 0.8944954128440367, + "grad_norm": 2.86265063408354, + "learning_rate": 6.706067750394951e-07, + "loss": 1.0004, + "step": 3120 + }, + { + "epoch": 0.8959288990825688, + "grad_norm": 2.921057543794976, + "learning_rate": 6.527071097508475e-07, + "loss": 1.0568, + "step": 3125 + }, + { + "epoch": 0.8973623853211009, + "grad_norm": 3.413457128089623, + "learning_rate": 6.350415124747378e-07, + "loss": 1.0966, + "step": 3130 + }, + { + "epoch": 0.898795871559633, + "grad_norm": 3.2754484907337913, + "learning_rate": 6.176104255800175e-07, + "loss": 1.0012, + "step": 3135 + }, + { + "epoch": 0.9002293577981652, + "grad_norm": 3.096817656277008, + "learning_rate": 6.004142855631068e-07, + "loss": 1.0409, + "step": 3140 + }, + { + "epoch": 0.9016628440366973, + "grad_norm": 3.2107227855247737, + "learning_rate": 5.834535230370586e-07, + "loss": 1.022, + "step": 3145 + }, + { + "epoch": 0.9030963302752294, + "grad_norm": 3.087915467643749, + "learning_rate": 5.66728562720772e-07, + "loss": 1.1023, + "step": 3150 + }, + { + "epoch": 0.9045298165137615, + "grad_norm": 3.1987341877696034, + "learning_rate": 5.502398234283657e-07, + "loss": 1.004, + "step": 3155 + }, + { + "epoch": 0.9059633027522935, + "grad_norm": 2.8182259878196527, + "learning_rate": 5.339877180586872e-07, + "loss": 1.0388, + "step": 3160 + }, + { + "epoch": 0.9073967889908257, + "grad_norm": 3.060289512500466, + "learning_rate": 5.179726535849649e-07, + "loss": 1.0461, + "step": 3165 + }, + { + "epoch": 0.9088302752293578, + "grad_norm": 3.1698304516587794, + "learning_rate": 5.0219503104463e-07, + "loss": 1.0247, + "step": 3170 + }, + { + "epoch": 0.9102637614678899, + "grad_norm": 3.130302466725309, + "learning_rate": 4.866552455292673e-07, + "loss": 1.0121, + "step": 3175 + }, + { + "epoch": 0.911697247706422, + "grad_norm": 3.42566263140553, + "learning_rate": 4.713536861747181e-07, + "loss": 1.0241, + "step": 3180 + }, + { + "epoch": 0.9131307339449541, + "grad_norm": 2.9858737991061974, + "learning_rate": 4.5629073615134466e-07, + "loss": 1.0616, + "step": 3185 + }, + { + "epoch": 0.9145642201834863, + "grad_norm": 3.285076013359202, + "learning_rate": 4.414667726544308e-07, + "loss": 0.9933, + "step": 3190 + }, + { + "epoch": 0.9159977064220184, + "grad_norm": 3.2338711561101006, + "learning_rate": 4.2688216689472984e-07, + "loss": 1.0764, + "step": 3195 + }, + { + "epoch": 0.9174311926605505, + "grad_norm": 3.1747891589130766, + "learning_rate": 4.125372840891817e-07, + "loss": 1.0288, + "step": 3200 + }, + { + "epoch": 0.9188646788990825, + "grad_norm": 3.4369303388998604, + "learning_rate": 3.984324834517583e-07, + "loss": 1.0151, + "step": 3205 + }, + { + "epoch": 0.9202981651376146, + "grad_norm": 2.9486157404109927, + "learning_rate": 3.845681181844718e-07, + "loss": 1.0339, + "step": 3210 + }, + { + "epoch": 0.9217316513761468, + "grad_norm": 3.026266781773246, + "learning_rate": 3.7094453546852706e-07, + "loss": 1.0372, + "step": 3215 + }, + { + "epoch": 0.9231651376146789, + "grad_norm": 3.6005808302403715, + "learning_rate": 3.575620764556331e-07, + "loss": 1.0675, + "step": 3220 + }, + { + "epoch": 0.924598623853211, + "grad_norm": 3.065531017189199, + "learning_rate": 3.4442107625945577e-07, + "loss": 0.9969, + "step": 3225 + }, + { + "epoch": 0.9260321100917431, + "grad_norm": 3.101619885450492, + "learning_rate": 3.3152186394722506e-07, + "loss": 1.004, + "step": 3230 + }, + { + "epoch": 0.9274655963302753, + "grad_norm": 3.069142098455305, + "learning_rate": 3.188647625315011e-07, + "loss": 1.0277, + "step": 3235 + }, + { + "epoch": 0.9288990825688074, + "grad_norm": 2.981358314660114, + "learning_rate": 3.064500889620792e-07, + "loss": 1.012, + "step": 3240 + }, + { + "epoch": 0.9303325688073395, + "grad_norm": 3.146209185290999, + "learning_rate": 2.9427815411805616e-07, + "loss": 1.0409, + "step": 3245 + }, + { + "epoch": 0.9317660550458715, + "grad_norm": 2.9139523212945, + "learning_rate": 2.823492628000435e-07, + "loss": 1.0108, + "step": 3250 + }, + { + "epoch": 0.9331995412844036, + "grad_norm": 3.2427768967167068, + "learning_rate": 2.7066371372253873e-07, + "loss": 1.0033, + "step": 3255 + }, + { + "epoch": 0.9346330275229358, + "grad_norm": 3.2929773628001073, + "learning_rate": 2.5922179950643833e-07, + "loss": 1.053, + "step": 3260 + }, + { + "epoch": 0.9360665137614679, + "grad_norm": 3.0857731418567473, + "learning_rate": 2.480238066717178e-07, + "loss": 1.0159, + "step": 3265 + }, + { + "epoch": 0.9375, + "grad_norm": 2.913155618655732, + "learning_rate": 2.370700156302541e-07, + "loss": 0.9997, + "step": 3270 + }, + { + "epoch": 0.9389334862385321, + "grad_norm": 2.9236470377703134, + "learning_rate": 2.2636070067879933e-07, + "loss": 1.0315, + "step": 3275 + }, + { + "epoch": 0.9403669724770642, + "grad_norm": 3.1616081864710113, + "learning_rate": 2.1589612999211697e-07, + "loss": 1.0573, + "step": 3280 + }, + { + "epoch": 0.9418004587155964, + "grad_norm": 3.2061935710058305, + "learning_rate": 2.056765656162685e-07, + "loss": 1.0344, + "step": 3285 + }, + { + "epoch": 0.9432339449541285, + "grad_norm": 3.1445007223648087, + "learning_rate": 1.9570226346204312e-07, + "loss": 1.0502, + "step": 3290 + }, + { + "epoch": 0.9446674311926605, + "grad_norm": 3.0523955190740226, + "learning_rate": 1.8597347329855742e-07, + "loss": 1.0277, + "step": 3295 + }, + { + "epoch": 0.9461009174311926, + "grad_norm": 3.10853151235872, + "learning_rate": 1.764904387469979e-07, + "loss": 0.9951, + "step": 3300 + }, + { + "epoch": 0.9475344036697247, + "grad_norm": 2.8610073341168327, + "learning_rate": 1.672533972745194e-07, + "loss": 1.0133, + "step": 3305 + }, + { + "epoch": 0.9489678899082569, + "grad_norm": 2.9659296802362154, + "learning_rate": 1.5826258018829866e-07, + "loss": 1.0146, + "step": 3310 + }, + { + "epoch": 0.950401376146789, + "grad_norm": 3.0101090249896187, + "learning_rate": 1.4951821262974563e-07, + "loss": 1.0064, + "step": 3315 + }, + { + "epoch": 0.9518348623853211, + "grad_norm": 2.873431144530414, + "learning_rate": 1.4102051356886027e-07, + "loss": 0.9988, + "step": 3320 + }, + { + "epoch": 0.9532683486238532, + "grad_norm": 2.861730520518137, + "learning_rate": 1.3276969579875453e-07, + "loss": 1.007, + "step": 3325 + }, + { + "epoch": 0.9547018348623854, + "grad_norm": 3.1539277667342964, + "learning_rate": 1.24765965930318e-07, + "loss": 1.017, + "step": 3330 + }, + { + "epoch": 0.9561353211009175, + "grad_norm": 3.182662519707628, + "learning_rate": 1.1700952438705171e-07, + "loss": 1.0008, + "step": 3335 + }, + { + "epoch": 0.9575688073394495, + "grad_norm": 3.078518777565793, + "learning_rate": 1.0950056540004029e-07, + "loss": 1.0649, + "step": 3340 + }, + { + "epoch": 0.9590022935779816, + "grad_norm": 2.8736168807619507, + "learning_rate": 1.0223927700309667e-07, + "loss": 1.0177, + "step": 3345 + }, + { + "epoch": 0.9604357798165137, + "grad_norm": 3.380358687506272, + "learning_rate": 9.522584102804599e-08, + "loss": 0.9955, + "step": 3350 + }, + { + "epoch": 0.9618692660550459, + "grad_norm": 3.209744698526145, + "learning_rate": 8.846043310017927e-08, + "loss": 1.0051, + "step": 3355 + }, + { + "epoch": 0.963302752293578, + "grad_norm": 3.135993664577884, + "learning_rate": 8.194322263385013e-08, + "loss": 1.0096, + "step": 3360 + }, + { + "epoch": 0.9647362385321101, + "grad_norm": 2.9702419226658128, + "learning_rate": 7.567437282823386e-08, + "loss": 0.9581, + "step": 3365 + }, + { + "epoch": 0.9661697247706422, + "grad_norm": 2.9902064692936365, + "learning_rate": 6.96540406632451e-08, + "loss": 1.0254, + "step": 3370 + }, + { + "epoch": 0.9676032110091743, + "grad_norm": 3.136047137814621, + "learning_rate": 6.388237689559762e-08, + "loss": 1.0386, + "step": 3375 + }, + { + "epoch": 0.9690366972477065, + "grad_norm": 3.209855131586944, + "learning_rate": 5.8359526055038476e-08, + "loss": 1.0493, + "step": 3380 + }, + { + "epoch": 0.9704701834862385, + "grad_norm": 3.2935877965995966, + "learning_rate": 5.3085626440724235e-08, + "loss": 1.0391, + "step": 3385 + }, + { + "epoch": 0.9719036697247706, + "grad_norm": 3.406107022140581, + "learning_rate": 4.8060810117757096e-08, + "loss": 1.0244, + "step": 3390 + }, + { + "epoch": 0.9733371559633027, + "grad_norm": 3.3748108058363377, + "learning_rate": 4.3285202913881944e-08, + "loss": 0.9787, + "step": 3395 + }, + { + "epoch": 0.9747706422018348, + "grad_norm": 3.0832449198208978, + "learning_rate": 3.875892441633e-08, + "loss": 0.9951, + "step": 3400 + }, + { + "epoch": 0.976204128440367, + "grad_norm": 3.186455532049428, + "learning_rate": 3.4482087968829014e-08, + "loss": 1.0315, + "step": 3405 + }, + { + "epoch": 0.9776376146788991, + "grad_norm": 2.9104798470989883, + "learning_rate": 3.045480066876105e-08, + "loss": 1.0092, + "step": 3410 + }, + { + "epoch": 0.9790711009174312, + "grad_norm": 3.037035090176394, + "learning_rate": 2.667716336448356e-08, + "loss": 0.9587, + "step": 3415 + }, + { + "epoch": 0.9805045871559633, + "grad_norm": 2.9918210850993785, + "learning_rate": 2.3149270652803592e-08, + "loss": 0.9973, + "step": 3420 + }, + { + "epoch": 0.9819380733944955, + "grad_norm": 3.183820306323636, + "learning_rate": 1.9871210876607484e-08, + "loss": 1.011, + "step": 3425 + }, + { + "epoch": 0.9833715596330275, + "grad_norm": 3.214411554963273, + "learning_rate": 1.6843066122649297e-08, + "loss": 0.9913, + "step": 3430 + }, + { + "epoch": 0.9848050458715596, + "grad_norm": 3.5216487010248088, + "learning_rate": 1.4064912219496907e-08, + "loss": 1.0442, + "step": 3435 + }, + { + "epoch": 0.9862385321100917, + "grad_norm": 3.1877265670501598, + "learning_rate": 1.1536818735630172e-08, + "loss": 1.0318, + "step": 3440 + }, + { + "epoch": 0.9876720183486238, + "grad_norm": 3.087482942801695, + "learning_rate": 9.25884897770013e-09, + "loss": 1.0607, + "step": 3445 + }, + { + "epoch": 0.989105504587156, + "grad_norm": 3.059255678756277, + "learning_rate": 7.231059988945799e-09, + "loss": 1.0148, + "step": 3450 + }, + { + "epoch": 0.9905389908256881, + "grad_norm": 3.239412598410617, + "learning_rate": 5.4535025477642224e-09, + "loss": 1.0422, + "step": 3455 + }, + { + "epoch": 0.9919724770642202, + "grad_norm": 3.144154368562322, + "learning_rate": 3.926221166434818e-09, + "loss": 1.0226, + "step": 3460 + }, + { + "epoch": 0.9934059633027523, + "grad_norm": 3.4686686147707877, + "learning_rate": 2.6492540900135976e-09, + "loss": 1.0359, + "step": 3465 + }, + { + "epoch": 0.9948394495412844, + "grad_norm": 2.9818281052270432, + "learning_rate": 1.6226332953661605e-09, + "loss": 0.988, + "step": 3470 + }, + { + "epoch": 0.9962729357798165, + "grad_norm": 2.940056641516156, + "learning_rate": 8.46384490373886e-10, + "loss": 1.0299, + "step": 3475 + }, + { + "epoch": 0.9977064220183486, + "grad_norm": 3.0373605695085883, + "learning_rate": 3.2052711328778297e-10, + "loss": 0.9876, + "step": 3480 + }, + { + "epoch": 0.9991399082568807, + "grad_norm": 3.0988136193131335, + "learning_rate": 4.5074332237771134e-11, + "loss": 1.0197, + "step": 3485 + }, + { + "epoch": 1.0, + "eval_loss": 1.0320301055908203, + "eval_runtime": 4528.3515, + "eval_samples_per_second": 18.469, + "eval_steps_per_second": 1.155, + "step": 3488 + }, + { + "epoch": 1.0, + "step": 3488, + "total_flos": 182579059752960.0, + "train_loss": 0.025541797006895784, + "train_runtime": 4827.4873, + "train_samples_per_second": 11.557, + "train_steps_per_second": 0.723 + } + ], + "logging_steps": 5, + "max_steps": 3488, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 182579059752960.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}