diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9576 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 6807, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00044072278536800354, + "grad_norm": 0.24354467550812778, + "learning_rate": 2.936857562408223e-07, + "loss": 2.1339, + "step": 1 + }, + { + "epoch": 0.0022036139268400176, + "grad_norm": 0.27040776216996587, + "learning_rate": 1.4684287812041115e-06, + "loss": 2.3535, + "step": 5 + }, + { + "epoch": 0.004407227853680035, + "grad_norm": 0.23978713638500837, + "learning_rate": 2.936857562408223e-06, + "loss": 2.0659, + "step": 10 + }, + { + "epoch": 0.006610841780520053, + "grad_norm": 0.2562898248528363, + "learning_rate": 4.4052863436123355e-06, + "loss": 2.0742, + "step": 15 + }, + { + "epoch": 0.00881445570736007, + "grad_norm": 0.2671940707327157, + "learning_rate": 5.873715124816446e-06, + "loss": 2.0397, + "step": 20 + }, + { + "epoch": 0.011018069634200088, + "grad_norm": 0.2840547143688825, + "learning_rate": 7.3421439060205585e-06, + "loss": 2.3034, + "step": 25 + }, + { + "epoch": 0.013221683561040106, + "grad_norm": 0.2739646963997833, + "learning_rate": 8.810572687224671e-06, + "loss": 2.0168, + "step": 30 + }, + { + "epoch": 0.015425297487880123, + "grad_norm": 0.36046684768779125, + "learning_rate": 1.0279001468428782e-05, + "loss": 2.1073, + "step": 35 + }, + { + "epoch": 0.01762891141472014, + "grad_norm": 0.29214076993878346, + "learning_rate": 1.1747430249632892e-05, + "loss": 1.9053, + "step": 40 + }, + { + "epoch": 0.01983252534156016, + "grad_norm": 0.4413639637555924, + "learning_rate": 1.3215859030837005e-05, + "loss": 2.272, + "step": 45 + }, + { + "epoch": 0.022036139268400177, + "grad_norm": 0.3645215252816923, + "learning_rate": 1.4684287812041117e-05, + "loss": 2.0033, + "step": 50 + }, + { + "epoch": 0.024239753195240195, + "grad_norm": 0.26241461964383006, + "learning_rate": 1.615271659324523e-05, + "loss": 2.0168, + "step": 55 + }, + { + "epoch": 0.026443367122080213, + "grad_norm": 0.3108898378679939, + "learning_rate": 1.7621145374449342e-05, + "loss": 2.1111, + "step": 60 + }, + { + "epoch": 0.02864698104892023, + "grad_norm": 0.2662326669897319, + "learning_rate": 1.9089574155653454e-05, + "loss": 1.8806, + "step": 65 + }, + { + "epoch": 0.030850594975760245, + "grad_norm": 0.16963519927101947, + "learning_rate": 2.0558002936857563e-05, + "loss": 2.1257, + "step": 70 + }, + { + "epoch": 0.03305420890260027, + "grad_norm": 0.23310716143802493, + "learning_rate": 2.2026431718061676e-05, + "loss": 1.8759, + "step": 75 + }, + { + "epoch": 0.03525782282944028, + "grad_norm": 0.2575993518919339, + "learning_rate": 2.3494860499265785e-05, + "loss": 1.9053, + "step": 80 + }, + { + "epoch": 0.0374614367562803, + "grad_norm": 0.2746073869104198, + "learning_rate": 2.4963289280469897e-05, + "loss": 1.7651, + "step": 85 + }, + { + "epoch": 0.03966505068312032, + "grad_norm": 0.2279037029835098, + "learning_rate": 2.643171806167401e-05, + "loss": 1.8627, + "step": 90 + }, + { + "epoch": 0.04186866460996033, + "grad_norm": 0.27296809706241965, + "learning_rate": 2.7900146842878122e-05, + "loss": 1.9057, + "step": 95 + }, + { + "epoch": 0.044072278536800354, + "grad_norm": 0.2670117144218064, + "learning_rate": 2.9368575624082234e-05, + "loss": 1.8911, + "step": 100 + }, + { + "epoch": 0.04627589246364037, + "grad_norm": 0.23511621219550868, + "learning_rate": 3.0837004405286347e-05, + "loss": 2.024, + "step": 105 + }, + { + "epoch": 0.04847950639048039, + "grad_norm": 0.24414856722360948, + "learning_rate": 3.230543318649046e-05, + "loss": 1.9756, + "step": 110 + }, + { + "epoch": 0.050683120317320404, + "grad_norm": 0.25077238640232496, + "learning_rate": 3.377386196769457e-05, + "loss": 1.7627, + "step": 115 + }, + { + "epoch": 0.052886734244160426, + "grad_norm": 0.30541323094735956, + "learning_rate": 3.5242290748898684e-05, + "loss": 1.9455, + "step": 120 + }, + { + "epoch": 0.05509034817100044, + "grad_norm": 0.2593735959377697, + "learning_rate": 3.6710719530102796e-05, + "loss": 1.9408, + "step": 125 + }, + { + "epoch": 0.05729396209784046, + "grad_norm": 0.2356435173973878, + "learning_rate": 3.817914831130691e-05, + "loss": 1.9746, + "step": 130 + }, + { + "epoch": 0.059497576024680476, + "grad_norm": 0.2556575372905199, + "learning_rate": 3.9647577092511014e-05, + "loss": 1.9133, + "step": 135 + }, + { + "epoch": 0.06170118995152049, + "grad_norm": 0.23671492386855494, + "learning_rate": 4.1116005873715127e-05, + "loss": 1.7228, + "step": 140 + }, + { + "epoch": 0.0639048038783605, + "grad_norm": 0.2522128793647996, + "learning_rate": 4.258443465491924e-05, + "loss": 1.9416, + "step": 145 + }, + { + "epoch": 0.06610841780520053, + "grad_norm": 0.259439737149894, + "learning_rate": 4.405286343612335e-05, + "loss": 2.0295, + "step": 150 + }, + { + "epoch": 0.06831203173204055, + "grad_norm": 0.2632402024665515, + "learning_rate": 4.5521292217327464e-05, + "loss": 1.7906, + "step": 155 + }, + { + "epoch": 0.07051564565888056, + "grad_norm": 0.28869506877919027, + "learning_rate": 4.698972099853157e-05, + "loss": 1.73, + "step": 160 + }, + { + "epoch": 0.07271925958572058, + "grad_norm": 0.2586750438465526, + "learning_rate": 4.845814977973568e-05, + "loss": 1.8038, + "step": 165 + }, + { + "epoch": 0.0749228735125606, + "grad_norm": 0.2506125082680772, + "learning_rate": 4.9926578560939794e-05, + "loss": 1.8558, + "step": 170 + }, + { + "epoch": 0.07712648743940062, + "grad_norm": 0.2876243539987618, + "learning_rate": 5.1395007342143906e-05, + "loss": 1.9784, + "step": 175 + }, + { + "epoch": 0.07933010136624064, + "grad_norm": 0.23288536708042795, + "learning_rate": 5.286343612334802e-05, + "loss": 1.7534, + "step": 180 + }, + { + "epoch": 0.08153371529308065, + "grad_norm": 0.25062968804040664, + "learning_rate": 5.433186490455213e-05, + "loss": 1.8032, + "step": 185 + }, + { + "epoch": 0.08373732921992066, + "grad_norm": 0.3358849445093002, + "learning_rate": 5.5800293685756244e-05, + "loss": 1.9325, + "step": 190 + }, + { + "epoch": 0.08594094314676069, + "grad_norm": 0.25234625121100573, + "learning_rate": 5.7268722466960356e-05, + "loss": 1.8855, + "step": 195 + }, + { + "epoch": 0.08814455707360071, + "grad_norm": 0.3365993483895123, + "learning_rate": 5.873715124816447e-05, + "loss": 1.8118, + "step": 200 + }, + { + "epoch": 0.09034817100044072, + "grad_norm": 0.311599846916865, + "learning_rate": 6.020558002936858e-05, + "loss": 2.0721, + "step": 205 + }, + { + "epoch": 0.09255178492728074, + "grad_norm": 0.3363293627242514, + "learning_rate": 6.167400881057269e-05, + "loss": 1.804, + "step": 210 + }, + { + "epoch": 0.09475539885412076, + "grad_norm": 0.3414057038117249, + "learning_rate": 6.31424375917768e-05, + "loss": 1.871, + "step": 215 + }, + { + "epoch": 0.09695901278096078, + "grad_norm": 0.3051539417193126, + "learning_rate": 6.461086637298092e-05, + "loss": 2.0932, + "step": 220 + }, + { + "epoch": 0.0991626267078008, + "grad_norm": 0.2859223655623353, + "learning_rate": 6.607929515418503e-05, + "loss": 1.8641, + "step": 225 + }, + { + "epoch": 0.10136624063464081, + "grad_norm": 0.30959368455808256, + "learning_rate": 6.754772393538914e-05, + "loss": 1.8437, + "step": 230 + }, + { + "epoch": 0.10356985456148082, + "grad_norm": 0.3544528067642694, + "learning_rate": 6.901615271659326e-05, + "loss": 1.9002, + "step": 235 + }, + { + "epoch": 0.10577346848832085, + "grad_norm": 0.34632827275641465, + "learning_rate": 7.048458149779737e-05, + "loss": 1.8398, + "step": 240 + }, + { + "epoch": 0.10797708241516087, + "grad_norm": 0.597473977518495, + "learning_rate": 7.195301027900148e-05, + "loss": 1.9357, + "step": 245 + }, + { + "epoch": 0.11018069634200088, + "grad_norm": 0.35953743798846133, + "learning_rate": 7.342143906020559e-05, + "loss": 1.944, + "step": 250 + }, + { + "epoch": 0.1123843102688409, + "grad_norm": 0.32165430097730124, + "learning_rate": 7.48898678414097e-05, + "loss": 1.8974, + "step": 255 + }, + { + "epoch": 0.11458792419568092, + "grad_norm": 0.2616578656705798, + "learning_rate": 7.635829662261382e-05, + "loss": 1.6988, + "step": 260 + }, + { + "epoch": 0.11679153812252094, + "grad_norm": 0.37507900741140277, + "learning_rate": 7.782672540381793e-05, + "loss": 1.9061, + "step": 265 + }, + { + "epoch": 0.11899515204936095, + "grad_norm": 0.3505810167092092, + "learning_rate": 7.929515418502203e-05, + "loss": 1.7829, + "step": 270 + }, + { + "epoch": 0.12119876597620097, + "grad_norm": 0.2905187770243884, + "learning_rate": 8.076358296622614e-05, + "loss": 1.896, + "step": 275 + }, + { + "epoch": 0.12340237990304098, + "grad_norm": 0.31667074390598227, + "learning_rate": 8.223201174743025e-05, + "loss": 1.8549, + "step": 280 + }, + { + "epoch": 0.125605993829881, + "grad_norm": 0.33256810111534474, + "learning_rate": 8.370044052863437e-05, + "loss": 1.9184, + "step": 285 + }, + { + "epoch": 0.127809607756721, + "grad_norm": 0.41094013265883617, + "learning_rate": 8.516886930983848e-05, + "loss": 1.9407, + "step": 290 + }, + { + "epoch": 0.13001322168356105, + "grad_norm": 0.3268179907564926, + "learning_rate": 8.663729809104259e-05, + "loss": 1.858, + "step": 295 + }, + { + "epoch": 0.13221683561040107, + "grad_norm": 0.4082234533551937, + "learning_rate": 8.81057268722467e-05, + "loss": 1.8172, + "step": 300 + }, + { + "epoch": 0.13442044953724108, + "grad_norm": 0.3303954200472522, + "learning_rate": 8.957415565345081e-05, + "loss": 1.7879, + "step": 305 + }, + { + "epoch": 0.1366240634640811, + "grad_norm": 0.3785434588284624, + "learning_rate": 9.104258443465493e-05, + "loss": 1.7931, + "step": 310 + }, + { + "epoch": 0.1388276773909211, + "grad_norm": 0.3603897034728447, + "learning_rate": 9.251101321585903e-05, + "loss": 1.7793, + "step": 315 + }, + { + "epoch": 0.14103129131776113, + "grad_norm": 0.2967904662976913, + "learning_rate": 9.397944199706314e-05, + "loss": 1.7092, + "step": 320 + }, + { + "epoch": 0.14323490524460114, + "grad_norm": 0.2976559195453983, + "learning_rate": 9.544787077826725e-05, + "loss": 1.8302, + "step": 325 + }, + { + "epoch": 0.14543851917144116, + "grad_norm": 0.34525724393698276, + "learning_rate": 9.691629955947136e-05, + "loss": 1.9488, + "step": 330 + }, + { + "epoch": 0.14764213309828117, + "grad_norm": 0.3473487500731534, + "learning_rate": 9.838472834067548e-05, + "loss": 1.8169, + "step": 335 + }, + { + "epoch": 0.1498457470251212, + "grad_norm": 0.35776737697830296, + "learning_rate": 9.985315712187959e-05, + "loss": 1.7562, + "step": 340 + }, + { + "epoch": 0.15204936095196123, + "grad_norm": 0.35067754753032443, + "learning_rate": 0.00010132158590308371, + "loss": 1.9597, + "step": 345 + }, + { + "epoch": 0.15425297487880124, + "grad_norm": 0.31111407532049806, + "learning_rate": 0.00010279001468428781, + "loss": 1.6977, + "step": 350 + }, + { + "epoch": 0.15645658880564126, + "grad_norm": 0.350081092372353, + "learning_rate": 0.00010425844346549194, + "loss": 1.7585, + "step": 355 + }, + { + "epoch": 0.15866020273248127, + "grad_norm": 0.3445787779571548, + "learning_rate": 0.00010572687224669604, + "loss": 1.7956, + "step": 360 + }, + { + "epoch": 0.16086381665932128, + "grad_norm": 0.2992117867602888, + "learning_rate": 0.00010719530102790014, + "loss": 1.7752, + "step": 365 + }, + { + "epoch": 0.1630674305861613, + "grad_norm": 0.3754242349038873, + "learning_rate": 0.00010866372980910426, + "loss": 1.8151, + "step": 370 + }, + { + "epoch": 0.16527104451300131, + "grad_norm": 0.36786720092684805, + "learning_rate": 0.00011013215859030836, + "loss": 1.8172, + "step": 375 + }, + { + "epoch": 0.16747465843984133, + "grad_norm": 0.3572694028170277, + "learning_rate": 0.00011160058737151249, + "loss": 1.8668, + "step": 380 + }, + { + "epoch": 0.16967827236668137, + "grad_norm": 0.34400588996372305, + "learning_rate": 0.00011306901615271659, + "loss": 1.9685, + "step": 385 + }, + { + "epoch": 0.17188188629352139, + "grad_norm": 0.3484332919699417, + "learning_rate": 0.00011453744493392071, + "loss": 1.999, + "step": 390 + }, + { + "epoch": 0.1740855002203614, + "grad_norm": 0.3167720096524829, + "learning_rate": 0.00011600587371512481, + "loss": 1.7002, + "step": 395 + }, + { + "epoch": 0.17628911414720141, + "grad_norm": 0.30865191358312394, + "learning_rate": 0.00011747430249632894, + "loss": 1.7808, + "step": 400 + }, + { + "epoch": 0.17849272807404143, + "grad_norm": 0.3635805195870158, + "learning_rate": 0.00011894273127753304, + "loss": 1.8711, + "step": 405 + }, + { + "epoch": 0.18069634200088144, + "grad_norm": 0.3113426681083048, + "learning_rate": 0.00012041116005873716, + "loss": 1.7559, + "step": 410 + }, + { + "epoch": 0.18289995592772146, + "grad_norm": 0.3500754402872819, + "learning_rate": 0.00012187958883994126, + "loss": 2.0623, + "step": 415 + }, + { + "epoch": 0.18510356985456147, + "grad_norm": 0.3002611868025747, + "learning_rate": 0.00012334801762114539, + "loss": 1.6155, + "step": 420 + }, + { + "epoch": 0.1873071837814015, + "grad_norm": 0.30728017118253237, + "learning_rate": 0.00012481644640234947, + "loss": 1.7865, + "step": 425 + }, + { + "epoch": 0.18951079770824153, + "grad_norm": 0.30435853108098426, + "learning_rate": 0.0001262848751835536, + "loss": 1.8101, + "step": 430 + }, + { + "epoch": 0.19171441163508154, + "grad_norm": 0.33728305221904875, + "learning_rate": 0.0001277533039647577, + "loss": 1.7445, + "step": 435 + }, + { + "epoch": 0.19391802556192156, + "grad_norm": 0.3269188152820616, + "learning_rate": 0.00012922173274596184, + "loss": 1.8443, + "step": 440 + }, + { + "epoch": 0.19612163948876157, + "grad_norm": 0.30824761799725153, + "learning_rate": 0.00013069016152716592, + "loss": 1.8385, + "step": 445 + }, + { + "epoch": 0.1983252534156016, + "grad_norm": 0.2692779779339547, + "learning_rate": 0.00013215859030837006, + "loss": 1.7256, + "step": 450 + }, + { + "epoch": 0.2005288673424416, + "grad_norm": 0.29633902366885784, + "learning_rate": 0.00013362701908957415, + "loss": 1.8496, + "step": 455 + }, + { + "epoch": 0.20273248126928162, + "grad_norm": 0.31921033190555953, + "learning_rate": 0.00013509544787077829, + "loss": 1.9118, + "step": 460 + }, + { + "epoch": 0.20493609519612163, + "grad_norm": 0.38557464681376724, + "learning_rate": 0.00013656387665198237, + "loss": 1.7992, + "step": 465 + }, + { + "epoch": 0.20713970912296165, + "grad_norm": 0.28175106471056965, + "learning_rate": 0.0001380323054331865, + "loss": 1.8768, + "step": 470 + }, + { + "epoch": 0.2093433230498017, + "grad_norm": 0.31789045276124145, + "learning_rate": 0.0001395007342143906, + "loss": 1.7628, + "step": 475 + }, + { + "epoch": 0.2115469369766417, + "grad_norm": 0.2412861323934925, + "learning_rate": 0.00014096916299559473, + "loss": 1.7988, + "step": 480 + }, + { + "epoch": 0.21375055090348172, + "grad_norm": 0.24858109179997467, + "learning_rate": 0.00014243759177679882, + "loss": 1.8673, + "step": 485 + }, + { + "epoch": 0.21595416483032173, + "grad_norm": 0.30755969433310765, + "learning_rate": 0.00014390602055800296, + "loss": 1.7105, + "step": 490 + }, + { + "epoch": 0.21815777875716175, + "grad_norm": 0.2249398504391689, + "learning_rate": 0.00014537444933920705, + "loss": 1.7837, + "step": 495 + }, + { + "epoch": 0.22036139268400176, + "grad_norm": 0.3335603893570301, + "learning_rate": 0.00014684287812041118, + "loss": 1.8333, + "step": 500 + }, + { + "epoch": 0.22256500661084178, + "grad_norm": 0.3038176355961198, + "learning_rate": 0.00014831130690161527, + "loss": 1.8275, + "step": 505 + }, + { + "epoch": 0.2247686205376818, + "grad_norm": 0.30793332193972345, + "learning_rate": 0.0001497797356828194, + "loss": 1.8177, + "step": 510 + }, + { + "epoch": 0.2269722344645218, + "grad_norm": 0.3072544700688745, + "learning_rate": 0.0001512481644640235, + "loss": 1.9828, + "step": 515 + }, + { + "epoch": 0.22917584839136185, + "grad_norm": 0.34999895392965585, + "learning_rate": 0.00015271659324522763, + "loss": 1.9117, + "step": 520 + }, + { + "epoch": 0.23137946231820186, + "grad_norm": 0.32911766969763456, + "learning_rate": 0.00015418502202643172, + "loss": 1.6697, + "step": 525 + }, + { + "epoch": 0.23358307624504188, + "grad_norm": 0.2617607674469746, + "learning_rate": 0.00015565345080763586, + "loss": 1.6296, + "step": 530 + }, + { + "epoch": 0.2357866901718819, + "grad_norm": 0.43182139308340584, + "learning_rate": 0.00015712187958883994, + "loss": 1.9794, + "step": 535 + }, + { + "epoch": 0.2379903040987219, + "grad_norm": 0.3127396374267501, + "learning_rate": 0.00015859030837004406, + "loss": 1.7436, + "step": 540 + }, + { + "epoch": 0.24019391802556192, + "grad_norm": 0.21415922561789702, + "learning_rate": 0.00016005873715124817, + "loss": 1.7742, + "step": 545 + }, + { + "epoch": 0.24239753195240193, + "grad_norm": 0.31852170762507637, + "learning_rate": 0.00016152716593245228, + "loss": 1.7969, + "step": 550 + }, + { + "epoch": 0.24460114587924195, + "grad_norm": 0.29928941424884764, + "learning_rate": 0.0001629955947136564, + "loss": 1.8178, + "step": 555 + }, + { + "epoch": 0.24680475980608196, + "grad_norm": 0.2145836014852306, + "learning_rate": 0.0001644640234948605, + "loss": 1.6149, + "step": 560 + }, + { + "epoch": 0.249008373732922, + "grad_norm": 0.2765877443446801, + "learning_rate": 0.00016593245227606462, + "loss": 1.859, + "step": 565 + }, + { + "epoch": 0.251211987659762, + "grad_norm": 0.26872348364190873, + "learning_rate": 0.00016740088105726873, + "loss": 1.8757, + "step": 570 + }, + { + "epoch": 0.25341560158660204, + "grad_norm": 0.3149952835357651, + "learning_rate": 0.00016886930983847284, + "loss": 1.869, + "step": 575 + }, + { + "epoch": 0.255619215513442, + "grad_norm": 0.2304679465113612, + "learning_rate": 0.00017033773861967696, + "loss": 1.7151, + "step": 580 + }, + { + "epoch": 0.25782282944028206, + "grad_norm": 0.24264862789474126, + "learning_rate": 0.00017180616740088107, + "loss": 2.0695, + "step": 585 + }, + { + "epoch": 0.2600264433671221, + "grad_norm": 0.2598067252093709, + "learning_rate": 0.00017327459618208518, + "loss": 1.839, + "step": 590 + }, + { + "epoch": 0.2622300572939621, + "grad_norm": 0.26705538323533523, + "learning_rate": 0.0001747430249632893, + "loss": 1.8008, + "step": 595 + }, + { + "epoch": 0.26443367122080214, + "grad_norm": 0.28479994342274373, + "learning_rate": 0.0001762114537444934, + "loss": 1.7284, + "step": 600 + }, + { + "epoch": 0.2666372851476421, + "grad_norm": 0.28497317726898896, + "learning_rate": 0.00017767988252569752, + "loss": 1.9214, + "step": 605 + }, + { + "epoch": 0.26884089907448216, + "grad_norm": 0.33639476801612694, + "learning_rate": 0.00017914831130690163, + "loss": 1.7516, + "step": 610 + }, + { + "epoch": 0.27104451300132215, + "grad_norm": 0.25526934350033054, + "learning_rate": 0.00018061674008810574, + "loss": 1.7688, + "step": 615 + }, + { + "epoch": 0.2732481269281622, + "grad_norm": 0.28869758623973935, + "learning_rate": 0.00018208516886930985, + "loss": 1.7843, + "step": 620 + }, + { + "epoch": 0.2754517408550022, + "grad_norm": 0.2782854068624019, + "learning_rate": 0.00018355359765051397, + "loss": 1.8081, + "step": 625 + }, + { + "epoch": 0.2776553547818422, + "grad_norm": 0.28975241284668296, + "learning_rate": 0.00018502202643171805, + "loss": 2.0243, + "step": 630 + }, + { + "epoch": 0.27985896870868227, + "grad_norm": 0.3308791482681284, + "learning_rate": 0.0001864904552129222, + "loss": 1.9301, + "step": 635 + }, + { + "epoch": 0.28206258263552225, + "grad_norm": 0.2725800531393519, + "learning_rate": 0.00018795888399412628, + "loss": 1.8126, + "step": 640 + }, + { + "epoch": 0.2842661965623623, + "grad_norm": 0.2347435274751105, + "learning_rate": 0.00018942731277533042, + "loss": 1.9526, + "step": 645 + }, + { + "epoch": 0.2864698104892023, + "grad_norm": 0.2594593727241724, + "learning_rate": 0.0001908957415565345, + "loss": 1.7362, + "step": 650 + }, + { + "epoch": 0.2886734244160423, + "grad_norm": 0.20428739622605385, + "learning_rate": 0.00019236417033773864, + "loss": 1.9143, + "step": 655 + }, + { + "epoch": 0.2908770383428823, + "grad_norm": 0.35231366470248277, + "learning_rate": 0.00019383259911894273, + "loss": 1.882, + "step": 660 + }, + { + "epoch": 0.29308065226972235, + "grad_norm": 0.28303949235132847, + "learning_rate": 0.00019530102790014687, + "loss": 1.9956, + "step": 665 + }, + { + "epoch": 0.29528426619656234, + "grad_norm": 0.28380137271318484, + "learning_rate": 0.00019676945668135095, + "loss": 1.9298, + "step": 670 + }, + { + "epoch": 0.2974878801234024, + "grad_norm": 0.29197603939206423, + "learning_rate": 0.0001982378854625551, + "loss": 1.9936, + "step": 675 + }, + { + "epoch": 0.2996914940502424, + "grad_norm": 0.3192279737304851, + "learning_rate": 0.00019970631424375918, + "loss": 1.856, + "step": 680 + }, + { + "epoch": 0.3018951079770824, + "grad_norm": 0.31555391042987874, + "learning_rate": 0.00019999978960491256, + "loss": 1.9492, + "step": 685 + }, + { + "epoch": 0.30409872190392245, + "grad_norm": 0.26468213545329267, + "learning_rate": 0.0001999989348763872, + "loss": 1.867, + "step": 690 + }, + { + "epoch": 0.30630233583076244, + "grad_norm": 0.2591553540599883, + "learning_rate": 0.0001999974226703463, + "loss": 1.7565, + "step": 695 + }, + { + "epoch": 0.3085059497576025, + "grad_norm": 0.33857027825097774, + "learning_rate": 0.00019999525299673244, + "loss": 1.8407, + "step": 700 + }, + { + "epoch": 0.31070956368444247, + "grad_norm": 0.27554046406812405, + "learning_rate": 0.0001999924258698108, + "loss": 1.8449, + "step": 705 + }, + { + "epoch": 0.3129131776112825, + "grad_norm": 0.2906036501277061, + "learning_rate": 0.0001999889413081694, + "loss": 1.9425, + "step": 710 + }, + { + "epoch": 0.3151167915381225, + "grad_norm": 0.24516499633235853, + "learning_rate": 0.00019998479933471862, + "loss": 1.8373, + "step": 715 + }, + { + "epoch": 0.31732040546496254, + "grad_norm": 0.261325768775238, + "learning_rate": 0.0001999799999766913, + "loss": 1.8655, + "step": 720 + }, + { + "epoch": 0.3195240193918026, + "grad_norm": 0.26292913191780404, + "learning_rate": 0.00019997454326564252, + "loss": 1.8011, + "step": 725 + }, + { + "epoch": 0.32172763331864257, + "grad_norm": 0.24941214673161538, + "learning_rate": 0.0001999684292374493, + "loss": 1.7063, + "step": 730 + }, + { + "epoch": 0.3239312472454826, + "grad_norm": 0.285047883815678, + "learning_rate": 0.00019996165793231038, + "loss": 1.9537, + "step": 735 + }, + { + "epoch": 0.3261348611723226, + "grad_norm": 0.2853612548244496, + "learning_rate": 0.0001999542293947461, + "loss": 1.7641, + "step": 740 + }, + { + "epoch": 0.32833847509916264, + "grad_norm": 0.2600200330765211, + "learning_rate": 0.00019994614367359792, + "loss": 1.8886, + "step": 745 + }, + { + "epoch": 0.33054208902600263, + "grad_norm": 0.36999348537839605, + "learning_rate": 0.00019993740082202818, + "loss": 1.798, + "step": 750 + }, + { + "epoch": 0.33274570295284267, + "grad_norm": 0.24647791783433215, + "learning_rate": 0.00019992800089751984, + "loss": 1.8922, + "step": 755 + }, + { + "epoch": 0.33494931687968266, + "grad_norm": 0.23414809097297196, + "learning_rate": 0.0001999179439618759, + "loss": 1.6675, + "step": 760 + }, + { + "epoch": 0.3371529308065227, + "grad_norm": 0.24171495213943817, + "learning_rate": 0.00019990723008121917, + "loss": 1.5054, + "step": 765 + }, + { + "epoch": 0.33935654473336274, + "grad_norm": 0.2462163957540956, + "learning_rate": 0.00019989585932599172, + "loss": 1.8441, + "step": 770 + }, + { + "epoch": 0.34156015866020273, + "grad_norm": 0.24192478872496498, + "learning_rate": 0.00019988383177095459, + "loss": 1.8535, + "step": 775 + }, + { + "epoch": 0.34376377258704277, + "grad_norm": 0.21554799200025276, + "learning_rate": 0.000199871147495187, + "loss": 1.7299, + "step": 780 + }, + { + "epoch": 0.34596738651388276, + "grad_norm": 0.29240426650240625, + "learning_rate": 0.00019985780658208618, + "loss": 1.9846, + "step": 785 + }, + { + "epoch": 0.3481710004407228, + "grad_norm": 0.32582827846868756, + "learning_rate": 0.00019984380911936648, + "loss": 1.5922, + "step": 790 + }, + { + "epoch": 0.3503746143675628, + "grad_norm": 0.3253623043496682, + "learning_rate": 0.00019982915519905912, + "loss": 1.7138, + "step": 795 + }, + { + "epoch": 0.35257822829440283, + "grad_norm": 0.3012253411690825, + "learning_rate": 0.00019981384491751133, + "loss": 1.9526, + "step": 800 + }, + { + "epoch": 0.3547818422212428, + "grad_norm": 0.32373710599771705, + "learning_rate": 0.00019979787837538587, + "loss": 2.0799, + "step": 805 + }, + { + "epoch": 0.35698545614808286, + "grad_norm": 0.2736459891711902, + "learning_rate": 0.00019978125567766023, + "loss": 1.8422, + "step": 810 + }, + { + "epoch": 0.3591890700749229, + "grad_norm": 0.28970333253517644, + "learning_rate": 0.00019976397693362614, + "loss": 1.8309, + "step": 815 + }, + { + "epoch": 0.3613926840017629, + "grad_norm": 0.271249926943477, + "learning_rate": 0.0001997460422568886, + "loss": 1.6581, + "step": 820 + }, + { + "epoch": 0.36359629792860293, + "grad_norm": 0.25582927997161137, + "learning_rate": 0.00019972745176536537, + "loss": 1.9441, + "step": 825 + }, + { + "epoch": 0.3657999118554429, + "grad_norm": 0.27987061803178176, + "learning_rate": 0.00019970820558128604, + "loss": 1.8015, + "step": 830 + }, + { + "epoch": 0.36800352578228296, + "grad_norm": 0.2956257318754398, + "learning_rate": 0.0001996883038311913, + "loss": 1.7853, + "step": 835 + }, + { + "epoch": 0.37020713970912295, + "grad_norm": 0.25727871315384043, + "learning_rate": 0.00019966774664593206, + "loss": 1.7594, + "step": 840 + }, + { + "epoch": 0.372410753635963, + "grad_norm": 0.3069722747545403, + "learning_rate": 0.00019964653416066868, + "loss": 1.9102, + "step": 845 + }, + { + "epoch": 0.374614367562803, + "grad_norm": 0.362312858514461, + "learning_rate": 0.0001996246665148699, + "loss": 1.8419, + "step": 850 + }, + { + "epoch": 0.376817981489643, + "grad_norm": 0.30043599837768675, + "learning_rate": 0.00019960214385231217, + "loss": 1.9281, + "step": 855 + }, + { + "epoch": 0.37902159541648306, + "grad_norm": 0.25742900200071334, + "learning_rate": 0.00019957896632107845, + "loss": 1.8382, + "step": 860 + }, + { + "epoch": 0.38122520934332305, + "grad_norm": 0.22904042108335546, + "learning_rate": 0.00019955513407355743, + "loss": 1.585, + "step": 865 + }, + { + "epoch": 0.3834288232701631, + "grad_norm": 0.35248924984262536, + "learning_rate": 0.0001995306472664425, + "loss": 1.8779, + "step": 870 + }, + { + "epoch": 0.3856324371970031, + "grad_norm": 0.21394107686214808, + "learning_rate": 0.00019950550606073056, + "loss": 1.6203, + "step": 875 + }, + { + "epoch": 0.3878360511238431, + "grad_norm": 0.23210191109497874, + "learning_rate": 0.00019947971062172118, + "loss": 1.7579, + "step": 880 + }, + { + "epoch": 0.3900396650506831, + "grad_norm": 0.280197585305159, + "learning_rate": 0.00019945326111901542, + "loss": 1.8697, + "step": 885 + }, + { + "epoch": 0.39224327897752315, + "grad_norm": 0.2007938963214883, + "learning_rate": 0.00019942615772651455, + "loss": 1.6718, + "step": 890 + }, + { + "epoch": 0.39444689290436313, + "grad_norm": 0.28396842059026434, + "learning_rate": 0.0001993984006224193, + "loss": 1.8261, + "step": 895 + }, + { + "epoch": 0.3966505068312032, + "grad_norm": 0.2970710682132302, + "learning_rate": 0.00019936998998922826, + "loss": 1.8988, + "step": 900 + }, + { + "epoch": 0.3988541207580432, + "grad_norm": 0.4794659287163716, + "learning_rate": 0.00019934092601373694, + "loss": 1.8387, + "step": 905 + }, + { + "epoch": 0.4010577346848832, + "grad_norm": 0.2810211439045603, + "learning_rate": 0.00019931120888703652, + "loss": 1.7516, + "step": 910 + }, + { + "epoch": 0.40326134861172325, + "grad_norm": 0.28984779802870314, + "learning_rate": 0.0001992808388045125, + "loss": 1.9212, + "step": 915 + }, + { + "epoch": 0.40546496253856323, + "grad_norm": 0.26294811280148017, + "learning_rate": 0.00019924981596584345, + "loss": 1.8798, + "step": 920 + }, + { + "epoch": 0.4076685764654033, + "grad_norm": 0.27546864810479915, + "learning_rate": 0.00019921814057499978, + "loss": 1.7595, + "step": 925 + }, + { + "epoch": 0.40987219039224326, + "grad_norm": 0.2506019656377615, + "learning_rate": 0.0001991858128402422, + "loss": 1.7625, + "step": 930 + }, + { + "epoch": 0.4120758043190833, + "grad_norm": 0.35470941281384444, + "learning_rate": 0.0001991528329741206, + "loss": 1.8644, + "step": 935 + }, + { + "epoch": 0.4142794182459233, + "grad_norm": 0.30456048523132856, + "learning_rate": 0.00019911920119347254, + "loss": 1.8427, + "step": 940 + }, + { + "epoch": 0.41648303217276333, + "grad_norm": 0.21180617855630868, + "learning_rate": 0.0001990849177194217, + "loss": 1.947, + "step": 945 + }, + { + "epoch": 0.4186866460996034, + "grad_norm": 0.27175217869149726, + "learning_rate": 0.00019904998277737668, + "loss": 1.5794, + "step": 950 + }, + { + "epoch": 0.42089026002644336, + "grad_norm": 0.25340188979799705, + "learning_rate": 0.00019901439659702924, + "loss": 1.655, + "step": 955 + }, + { + "epoch": 0.4230938739532834, + "grad_norm": 0.32039092541913483, + "learning_rate": 0.00019897815941235307, + "loss": 1.9448, + "step": 960 + }, + { + "epoch": 0.4252974878801234, + "grad_norm": 0.28203099326236186, + "learning_rate": 0.00019894127146160204, + "loss": 1.73, + "step": 965 + }, + { + "epoch": 0.42750110180696343, + "grad_norm": 0.790698073004524, + "learning_rate": 0.00019890373298730868, + "loss": 2.0466, + "step": 970 + }, + { + "epoch": 0.4297047157338034, + "grad_norm": 0.23107801878483133, + "learning_rate": 0.0001988655442362827, + "loss": 1.5805, + "step": 975 + }, + { + "epoch": 0.43190832966064346, + "grad_norm": 0.2396293478769625, + "learning_rate": 0.00019882670545960914, + "loss": 1.7482, + "step": 980 + }, + { + "epoch": 0.43411194358748345, + "grad_norm": 0.29159214942433453, + "learning_rate": 0.00019878721691264704, + "loss": 1.9851, + "step": 985 + }, + { + "epoch": 0.4363155575143235, + "grad_norm": 0.2825006648123125, + "learning_rate": 0.00019874707885502745, + "loss": 1.7534, + "step": 990 + }, + { + "epoch": 0.43851917144116354, + "grad_norm": 0.28034425756379244, + "learning_rate": 0.00019870629155065186, + "loss": 1.7489, + "step": 995 + }, + { + "epoch": 0.4407227853680035, + "grad_norm": 0.3181059895374247, + "learning_rate": 0.0001986648552676905, + "loss": 1.8798, + "step": 1000 + }, + { + "epoch": 0.44292639929484356, + "grad_norm": 0.20367681253555586, + "learning_rate": 0.0001986227702785805, + "loss": 1.8065, + "step": 1005 + }, + { + "epoch": 0.44513001322168355, + "grad_norm": 0.25873539379499144, + "learning_rate": 0.0001985800368600242, + "loss": 1.687, + "step": 1010 + }, + { + "epoch": 0.4473336271485236, + "grad_norm": 0.2342074611961214, + "learning_rate": 0.0001985366552929871, + "loss": 1.9431, + "step": 1015 + }, + { + "epoch": 0.4495372410753636, + "grad_norm": 0.48215863294928724, + "learning_rate": 0.00019849262586269642, + "loss": 1.8137, + "step": 1020 + }, + { + "epoch": 0.4517408550022036, + "grad_norm": 0.27724392120608005, + "learning_rate": 0.00019844794885863877, + "loss": 1.8311, + "step": 1025 + }, + { + "epoch": 0.4539444689290436, + "grad_norm": 0.22655546528476625, + "learning_rate": 0.00019840262457455855, + "loss": 1.6968, + "step": 1030 + }, + { + "epoch": 0.45614808285588365, + "grad_norm": 0.24804519754733267, + "learning_rate": 0.00019835665330845595, + "loss": 1.844, + "step": 1035 + }, + { + "epoch": 0.4583516967827237, + "grad_norm": 0.3053870909241666, + "learning_rate": 0.00019831003536258487, + "loss": 1.6674, + "step": 1040 + }, + { + "epoch": 0.4605553107095637, + "grad_norm": 0.2677054196356507, + "learning_rate": 0.00019826277104345109, + "loss": 1.994, + "step": 1045 + }, + { + "epoch": 0.4627589246364037, + "grad_norm": 0.2332936772139347, + "learning_rate": 0.0001982148606618102, + "loss": 1.7963, + "step": 1050 + }, + { + "epoch": 0.4649625385632437, + "grad_norm": 0.290304703836439, + "learning_rate": 0.00019816630453266555, + "loss": 1.8278, + "step": 1055 + }, + { + "epoch": 0.46716615249008375, + "grad_norm": 0.31440944408786714, + "learning_rate": 0.0001981171029752662, + "loss": 1.7053, + "step": 1060 + }, + { + "epoch": 0.46936976641692374, + "grad_norm": 0.31837742235142563, + "learning_rate": 0.00019806725631310476, + "loss": 1.8377, + "step": 1065 + }, + { + "epoch": 0.4715733803437638, + "grad_norm": 0.31835369363351046, + "learning_rate": 0.00019801676487391529, + "loss": 1.7635, + "step": 1070 + }, + { + "epoch": 0.47377699427060377, + "grad_norm": 0.32008261758346795, + "learning_rate": 0.0001979656289896712, + "loss": 1.8322, + "step": 1075 + }, + { + "epoch": 0.4759806081974438, + "grad_norm": 0.2436306554476478, + "learning_rate": 0.000197913848996583, + "loss": 1.7057, + "step": 1080 + }, + { + "epoch": 0.47818422212428385, + "grad_norm": 0.24420764132400904, + "learning_rate": 0.00019786142523509615, + "loss": 1.7756, + "step": 1085 + }, + { + "epoch": 0.48038783605112384, + "grad_norm": 0.24156752527695532, + "learning_rate": 0.00019780835804988876, + "loss": 1.7788, + "step": 1090 + }, + { + "epoch": 0.4825914499779639, + "grad_norm": 0.3341728235303617, + "learning_rate": 0.00019775464778986934, + "loss": 1.9, + "step": 1095 + }, + { + "epoch": 0.48479506390480387, + "grad_norm": 0.23920551280340352, + "learning_rate": 0.00019770029480817454, + "loss": 1.8956, + "step": 1100 + }, + { + "epoch": 0.4869986778316439, + "grad_norm": 0.2463312946508512, + "learning_rate": 0.00019764529946216682, + "loss": 1.5345, + "step": 1105 + }, + { + "epoch": 0.4892022917584839, + "grad_norm": 0.26366246938923554, + "learning_rate": 0.00019758966211343206, + "loss": 1.7621, + "step": 1110 + }, + { + "epoch": 0.49140590568532394, + "grad_norm": 0.3123943209000793, + "learning_rate": 0.00019753338312777718, + "loss": 1.815, + "step": 1115 + }, + { + "epoch": 0.4936095196121639, + "grad_norm": 0.2551443475914167, + "learning_rate": 0.00019747646287522784, + "loss": 1.8611, + "step": 1120 + }, + { + "epoch": 0.49581313353900397, + "grad_norm": 0.2332525414967985, + "learning_rate": 0.0001974189017300259, + "loss": 1.5384, + "step": 1125 + }, + { + "epoch": 0.498016747465844, + "grad_norm": 0.2861706387721841, + "learning_rate": 0.00019736070007062692, + "loss": 2.029, + "step": 1130 + }, + { + "epoch": 0.500220361392684, + "grad_norm": 0.4583730418300565, + "learning_rate": 0.00019730185827969784, + "loss": 1.6826, + "step": 1135 + }, + { + "epoch": 0.502423975319524, + "grad_norm": 0.1931367375627417, + "learning_rate": 0.00019724237674411432, + "loss": 1.6877, + "step": 1140 + }, + { + "epoch": 0.5046275892463641, + "grad_norm": 0.3082508866109686, + "learning_rate": 0.00019718225585495824, + "loss": 1.8148, + "step": 1145 + }, + { + "epoch": 0.5068312031732041, + "grad_norm": 0.18106453992956664, + "learning_rate": 0.00019712149600751517, + "loss": 1.6556, + "step": 1150 + }, + { + "epoch": 0.5090348171000441, + "grad_norm": 0.23000297672552616, + "learning_rate": 0.00019706009760127164, + "loss": 2.0152, + "step": 1155 + }, + { + "epoch": 0.511238431026884, + "grad_norm": 0.29387352965288377, + "learning_rate": 0.00019699806103991272, + "loss": 1.7962, + "step": 1160 + }, + { + "epoch": 0.5134420449537241, + "grad_norm": 0.23060861917057546, + "learning_rate": 0.00019693538673131917, + "loss": 1.8123, + "step": 1165 + }, + { + "epoch": 0.5156456588805641, + "grad_norm": 0.25329490761690543, + "learning_rate": 0.00019687207508756486, + "loss": 1.7052, + "step": 1170 + }, + { + "epoch": 0.5178492728074041, + "grad_norm": 0.3356944723720528, + "learning_rate": 0.00019680812652491408, + "loss": 1.7985, + "step": 1175 + }, + { + "epoch": 0.5200528867342442, + "grad_norm": 0.31492967037029806, + "learning_rate": 0.0001967435414638187, + "loss": 1.7971, + "step": 1180 + }, + { + "epoch": 0.5222565006610842, + "grad_norm": 0.2915314540239125, + "learning_rate": 0.00019667832032891554, + "loss": 1.9571, + "step": 1185 + }, + { + "epoch": 0.5244601145879242, + "grad_norm": 0.28065572919249093, + "learning_rate": 0.00019661246354902342, + "loss": 1.9185, + "step": 1190 + }, + { + "epoch": 0.5266637285147642, + "grad_norm": 0.22888871376259906, + "learning_rate": 0.00019654597155714044, + "loss": 1.7367, + "step": 1195 + }, + { + "epoch": 0.5288673424416043, + "grad_norm": 0.23069615225682083, + "learning_rate": 0.00019647884479044123, + "loss": 1.7333, + "step": 1200 + }, + { + "epoch": 0.5310709563684443, + "grad_norm": 0.25845750908289505, + "learning_rate": 0.00019641108369027385, + "loss": 1.5907, + "step": 1205 + }, + { + "epoch": 0.5332745702952842, + "grad_norm": 0.3069614304651339, + "learning_rate": 0.00019634268870215703, + "loss": 1.9282, + "step": 1210 + }, + { + "epoch": 0.5354781842221242, + "grad_norm": 0.3005722535622308, + "learning_rate": 0.00019627366027577726, + "loss": 1.6378, + "step": 1215 + }, + { + "epoch": 0.5376817981489643, + "grad_norm": 0.270244780455395, + "learning_rate": 0.00019620399886498578, + "loss": 1.6499, + "step": 1220 + }, + { + "epoch": 0.5398854120758043, + "grad_norm": 0.2796374681033303, + "learning_rate": 0.0001961337049277955, + "loss": 1.7962, + "step": 1225 + }, + { + "epoch": 0.5420890260026443, + "grad_norm": 0.2608673103569785, + "learning_rate": 0.00019606277892637823, + "loss": 1.6946, + "step": 1230 + }, + { + "epoch": 0.5442926399294844, + "grad_norm": 0.35197474426731085, + "learning_rate": 0.00019599122132706146, + "loss": 1.9751, + "step": 1235 + }, + { + "epoch": 0.5464962538563244, + "grad_norm": 0.26530267535084306, + "learning_rate": 0.0001959190326003253, + "loss": 1.7257, + "step": 1240 + }, + { + "epoch": 0.5486998677831644, + "grad_norm": 0.2540337035705083, + "learning_rate": 0.00019584621322079942, + "loss": 1.8693, + "step": 1245 + }, + { + "epoch": 0.5509034817100044, + "grad_norm": 0.2530867344602867, + "learning_rate": 0.00019577276366726003, + "loss": 1.6761, + "step": 1250 + }, + { + "epoch": 0.5531070956368445, + "grad_norm": 0.2549205571103006, + "learning_rate": 0.00019569868442262655, + "loss": 1.8729, + "step": 1255 + }, + { + "epoch": 0.5553107095636844, + "grad_norm": 0.2871173332255517, + "learning_rate": 0.00019562397597395857, + "loss": 1.8347, + "step": 1260 + }, + { + "epoch": 0.5575143234905244, + "grad_norm": 0.28369117286467893, + "learning_rate": 0.0001955486388124525, + "loss": 1.8547, + "step": 1265 + }, + { + "epoch": 0.5597179374173645, + "grad_norm": 0.2852722364768899, + "learning_rate": 0.00019547267343343857, + "loss": 1.6552, + "step": 1270 + }, + { + "epoch": 0.5619215513442045, + "grad_norm": 0.34585189639791736, + "learning_rate": 0.0001953960803363774, + "loss": 1.6727, + "step": 1275 + }, + { + "epoch": 0.5641251652710445, + "grad_norm": 0.2767216642372831, + "learning_rate": 0.00019531886002485674, + "loss": 1.8886, + "step": 1280 + }, + { + "epoch": 0.5663287791978845, + "grad_norm": 0.20036600537602942, + "learning_rate": 0.00019524101300658813, + "loss": 1.8685, + "step": 1285 + }, + { + "epoch": 0.5685323931247246, + "grad_norm": 0.261316473836974, + "learning_rate": 0.0001951625397934037, + "loss": 1.782, + "step": 1290 + }, + { + "epoch": 0.5707360070515646, + "grad_norm": 0.4236076570583356, + "learning_rate": 0.0001950834409012527, + "loss": 1.8318, + "step": 1295 + }, + { + "epoch": 0.5729396209784046, + "grad_norm": 0.28925306291568, + "learning_rate": 0.00019500371685019806, + "loss": 1.6012, + "step": 1300 + }, + { + "epoch": 0.5751432349052445, + "grad_norm": 0.2945702905493002, + "learning_rate": 0.0001949233681644131, + "loss": 1.9158, + "step": 1305 + }, + { + "epoch": 0.5773468488320846, + "grad_norm": 0.27644920036772963, + "learning_rate": 0.00019484239537217798, + "loss": 1.8232, + "step": 1310 + }, + { + "epoch": 0.5795504627589246, + "grad_norm": 0.24910322311175717, + "learning_rate": 0.00019476079900587626, + "loss": 2.0731, + "step": 1315 + }, + { + "epoch": 0.5817540766857646, + "grad_norm": 0.2746087574116961, + "learning_rate": 0.00019467857960199142, + "loss": 1.8429, + "step": 1320 + }, + { + "epoch": 0.5839576906126047, + "grad_norm": 0.30262920301398166, + "learning_rate": 0.00019459573770110335, + "loss": 1.7647, + "step": 1325 + }, + { + "epoch": 0.5861613045394447, + "grad_norm": 0.2570054670626858, + "learning_rate": 0.0001945122738478847, + "loss": 1.7511, + "step": 1330 + }, + { + "epoch": 0.5883649184662847, + "grad_norm": 0.2506934757685486, + "learning_rate": 0.00019442818859109737, + "loss": 1.9036, + "step": 1335 + }, + { + "epoch": 0.5905685323931247, + "grad_norm": 0.28984253571835894, + "learning_rate": 0.00019434348248358892, + "loss": 1.7763, + "step": 1340 + }, + { + "epoch": 0.5927721463199648, + "grad_norm": 0.28120650964379307, + "learning_rate": 0.00019425815608228888, + "loss": 1.8062, + "step": 1345 + }, + { + "epoch": 0.5949757602468048, + "grad_norm": 0.26796794914129696, + "learning_rate": 0.00019417220994820514, + "loss": 1.7886, + "step": 1350 + }, + { + "epoch": 0.5971793741736448, + "grad_norm": 0.27855604023858827, + "learning_rate": 0.00019408564464642024, + "loss": 1.6226, + "step": 1355 + }, + { + "epoch": 0.5993829881004848, + "grad_norm": 0.23996517928921976, + "learning_rate": 0.00019399846074608757, + "loss": 1.8206, + "step": 1360 + }, + { + "epoch": 0.6015866020273248, + "grad_norm": 0.2718103628870133, + "learning_rate": 0.00019391065882042786, + "loss": 1.622, + "step": 1365 + }, + { + "epoch": 0.6037902159541648, + "grad_norm": 0.40326599118637163, + "learning_rate": 0.00019382223944672516, + "loss": 1.8595, + "step": 1370 + }, + { + "epoch": 0.6059938298810048, + "grad_norm": 0.26188146699351833, + "learning_rate": 0.00019373320320632313, + "loss": 1.719, + "step": 1375 + }, + { + "epoch": 0.6081974438078449, + "grad_norm": 0.26353376595142103, + "learning_rate": 0.00019364355068462126, + "loss": 1.7599, + "step": 1380 + }, + { + "epoch": 0.6104010577346849, + "grad_norm": 0.27506310894040453, + "learning_rate": 0.00019355328247107106, + "loss": 1.6895, + "step": 1385 + }, + { + "epoch": 0.6126046716615249, + "grad_norm": 0.23293714707305346, + "learning_rate": 0.00019346239915917204, + "loss": 1.9199, + "step": 1390 + }, + { + "epoch": 0.6148082855883649, + "grad_norm": 0.2728024319863431, + "learning_rate": 0.00019337090134646787, + "loss": 1.6137, + "step": 1395 + }, + { + "epoch": 0.617011899515205, + "grad_norm": 0.2580080135345881, + "learning_rate": 0.00019327878963454253, + "loss": 1.9251, + "step": 1400 + }, + { + "epoch": 0.619215513442045, + "grad_norm": 0.18292171734629373, + "learning_rate": 0.00019318606462901625, + "loss": 1.6127, + "step": 1405 + }, + { + "epoch": 0.6214191273688849, + "grad_norm": 0.2540991707755816, + "learning_rate": 0.0001930927269395416, + "loss": 1.716, + "step": 1410 + }, + { + "epoch": 0.623622741295725, + "grad_norm": 0.2222252074542422, + "learning_rate": 0.00019299877717979944, + "loss": 1.649, + "step": 1415 + }, + { + "epoch": 0.625826355222565, + "grad_norm": 0.23656133580313402, + "learning_rate": 0.00019290421596749487, + "loss": 1.7321, + "step": 1420 + }, + { + "epoch": 0.628029969149405, + "grad_norm": 0.2770878731693278, + "learning_rate": 0.00019280904392435328, + "loss": 1.7982, + "step": 1425 + }, + { + "epoch": 0.630233583076245, + "grad_norm": 0.38713553349288304, + "learning_rate": 0.00019271326167611606, + "loss": 1.757, + "step": 1430 + }, + { + "epoch": 0.6324371970030851, + "grad_norm": 0.21826448168974197, + "learning_rate": 0.00019261686985253668, + "loss": 1.6568, + "step": 1435 + }, + { + "epoch": 0.6346408109299251, + "grad_norm": 0.28392855935252787, + "learning_rate": 0.00019251986908737646, + "loss": 1.6995, + "step": 1440 + }, + { + "epoch": 0.6368444248567651, + "grad_norm": 0.3335345636849735, + "learning_rate": 0.00019242226001840043, + "loss": 1.6445, + "step": 1445 + }, + { + "epoch": 0.6390480387836052, + "grad_norm": 0.23367317521663916, + "learning_rate": 0.0001923240432873731, + "loss": 1.7995, + "step": 1450 + }, + { + "epoch": 0.6412516527104452, + "grad_norm": 0.26851942754392183, + "learning_rate": 0.00019222521954005424, + "loss": 1.8078, + "step": 1455 + }, + { + "epoch": 0.6434552666372851, + "grad_norm": 0.28393938915488615, + "learning_rate": 0.00019212578942619474, + "loss": 1.7108, + "step": 1460 + }, + { + "epoch": 0.6456588805641251, + "grad_norm": 0.2481400915816766, + "learning_rate": 0.00019202575359953213, + "loss": 1.7509, + "step": 1465 + }, + { + "epoch": 0.6478624944909652, + "grad_norm": 0.22200215432596082, + "learning_rate": 0.00019192511271778656, + "loss": 1.6549, + "step": 1470 + }, + { + "epoch": 0.6500661084178052, + "grad_norm": 0.24388551441860462, + "learning_rate": 0.00019182386744265623, + "loss": 1.9977, + "step": 1475 + }, + { + "epoch": 0.6522697223446452, + "grad_norm": 0.28231311563927136, + "learning_rate": 0.00019172201843981314, + "loss": 1.7473, + "step": 1480 + }, + { + "epoch": 0.6544733362714852, + "grad_norm": 0.2911468005732327, + "learning_rate": 0.00019161956637889872, + "loss": 1.8572, + "step": 1485 + }, + { + "epoch": 0.6566769501983253, + "grad_norm": 0.2809891282005259, + "learning_rate": 0.0001915165119335194, + "loss": 1.6363, + "step": 1490 + }, + { + "epoch": 0.6588805641251653, + "grad_norm": 0.257092783900919, + "learning_rate": 0.0001914128557812422, + "loss": 1.6894, + "step": 1495 + }, + { + "epoch": 0.6610841780520053, + "grad_norm": 0.25492249438744313, + "learning_rate": 0.00019130859860359026, + "loss": 1.9549, + "step": 1500 + }, + { + "epoch": 0.6632877919788454, + "grad_norm": 0.29656417671091373, + "learning_rate": 0.00019120374108603843, + "loss": 1.882, + "step": 1505 + }, + { + "epoch": 0.6654914059056853, + "grad_norm": 0.26795937749586346, + "learning_rate": 0.0001910982839180086, + "loss": 1.7532, + "step": 1510 + }, + { + "epoch": 0.6676950198325253, + "grad_norm": 0.30019068058678056, + "learning_rate": 0.0001909922277928654, + "loss": 1.9185, + "step": 1515 + }, + { + "epoch": 0.6698986337593653, + "grad_norm": 0.2908491561313871, + "learning_rate": 0.00019088557340791136, + "loss": 1.8659, + "step": 1520 + }, + { + "epoch": 0.6721022476862054, + "grad_norm": 0.2664246491795904, + "learning_rate": 0.00019077832146438257, + "loss": 1.618, + "step": 1525 + }, + { + "epoch": 0.6743058616130454, + "grad_norm": 0.2670150987011056, + "learning_rate": 0.00019067047266744396, + "loss": 1.82, + "step": 1530 + }, + { + "epoch": 0.6765094755398854, + "grad_norm": 0.26014739460054054, + "learning_rate": 0.0001905620277261847, + "loss": 1.8267, + "step": 1535 + }, + { + "epoch": 0.6787130894667255, + "grad_norm": 0.39761269473071104, + "learning_rate": 0.00019045298735361345, + "loss": 1.7682, + "step": 1540 + }, + { + "epoch": 0.6809167033935655, + "grad_norm": 0.25574765659361764, + "learning_rate": 0.0001903433522666538, + "loss": 1.8709, + "step": 1545 + }, + { + "epoch": 0.6831203173204055, + "grad_norm": 0.2633667660696579, + "learning_rate": 0.00019023312318613945, + "loss": 1.7439, + "step": 1550 + }, + { + "epoch": 0.6853239312472454, + "grad_norm": 0.33365458542367915, + "learning_rate": 0.00019012230083680954, + "loss": 1.8991, + "step": 1555 + }, + { + "epoch": 0.6875275451740855, + "grad_norm": 0.26233949374382926, + "learning_rate": 0.0001900108859473039, + "loss": 1.7681, + "step": 1560 + }, + { + "epoch": 0.6897311591009255, + "grad_norm": 0.27517789876709425, + "learning_rate": 0.00018989887925015814, + "loss": 1.8164, + "step": 1565 + }, + { + "epoch": 0.6919347730277655, + "grad_norm": 0.22352922332415057, + "learning_rate": 0.00018978628148179897, + "loss": 1.6674, + "step": 1570 + }, + { + "epoch": 0.6941383869546055, + "grad_norm": 0.2705784713837994, + "learning_rate": 0.0001896730933825393, + "loss": 1.883, + "step": 1575 + }, + { + "epoch": 0.6963420008814456, + "grad_norm": 0.2333580239686404, + "learning_rate": 0.00018955931569657333, + "loss": 1.6889, + "step": 1580 + }, + { + "epoch": 0.6985456148082856, + "grad_norm": 0.29573837903023903, + "learning_rate": 0.00018944494917197172, + "loss": 1.8473, + "step": 1585 + }, + { + "epoch": 0.7007492287351256, + "grad_norm": 0.24394341802838163, + "learning_rate": 0.00018932999456067675, + "loss": 1.844, + "step": 1590 + }, + { + "epoch": 0.7029528426619657, + "grad_norm": 0.2454925329534158, + "learning_rate": 0.0001892144526184971, + "loss": 1.7824, + "step": 1595 + }, + { + "epoch": 0.7051564565888057, + "grad_norm": 0.23532373440650545, + "learning_rate": 0.00018909832410510315, + "loss": 1.8537, + "step": 1600 + }, + { + "epoch": 0.7073600705156456, + "grad_norm": 0.2602720785746293, + "learning_rate": 0.00018898160978402198, + "loss": 1.8717, + "step": 1605 + }, + { + "epoch": 0.7095636844424856, + "grad_norm": 0.23292617880883168, + "learning_rate": 0.00018886431042263208, + "loss": 1.703, + "step": 1610 + }, + { + "epoch": 0.7117672983693257, + "grad_norm": 0.32473590479932374, + "learning_rate": 0.0001887464267921587, + "loss": 1.7075, + "step": 1615 + }, + { + "epoch": 0.7139709122961657, + "grad_norm": 0.21131060430922607, + "learning_rate": 0.00018862795966766833, + "loss": 1.6993, + "step": 1620 + }, + { + "epoch": 0.7161745262230057, + "grad_norm": 0.2945260951578129, + "learning_rate": 0.0001885089098280641, + "loss": 1.7315, + "step": 1625 + }, + { + "epoch": 0.7183781401498458, + "grad_norm": 0.2988718316963067, + "learning_rate": 0.0001883892780560802, + "loss": 1.7079, + "step": 1630 + }, + { + "epoch": 0.7205817540766858, + "grad_norm": 0.2671861990469183, + "learning_rate": 0.00018826906513827704, + "loss": 1.8816, + "step": 1635 + }, + { + "epoch": 0.7227853680035258, + "grad_norm": 0.3539590759381266, + "learning_rate": 0.00018814827186503595, + "loss": 1.7559, + "step": 1640 + }, + { + "epoch": 0.7249889819303658, + "grad_norm": 0.2098483762554333, + "learning_rate": 0.00018802689903055396, + "loss": 1.8296, + "step": 1645 + }, + { + "epoch": 0.7271925958572059, + "grad_norm": 0.24342758941079398, + "learning_rate": 0.0001879049474328387, + "loss": 1.8845, + "step": 1650 + }, + { + "epoch": 0.7293962097840458, + "grad_norm": 0.30213591141833546, + "learning_rate": 0.00018778241787370303, + "loss": 1.7739, + "step": 1655 + }, + { + "epoch": 0.7315998237108858, + "grad_norm": 0.2796229215884198, + "learning_rate": 0.00018765931115875985, + "loss": 1.7238, + "step": 1660 + }, + { + "epoch": 0.7338034376377258, + "grad_norm": 0.2818539631618944, + "learning_rate": 0.00018753562809741673, + "loss": 1.7833, + "step": 1665 + }, + { + "epoch": 0.7360070515645659, + "grad_norm": 0.2534113717910187, + "learning_rate": 0.00018741136950287067, + "loss": 1.781, + "step": 1670 + }, + { + "epoch": 0.7382106654914059, + "grad_norm": 0.25817838823410116, + "learning_rate": 0.0001872865361921027, + "loss": 1.6845, + "step": 1675 + }, + { + "epoch": 0.7404142794182459, + "grad_norm": 0.25172352057271447, + "learning_rate": 0.00018716112898587247, + "loss": 1.9169, + "step": 1680 + }, + { + "epoch": 0.742617893345086, + "grad_norm": 0.257829203989891, + "learning_rate": 0.000187035148708713, + "loss": 1.7977, + "step": 1685 + }, + { + "epoch": 0.744821507271926, + "grad_norm": 0.21408758544817338, + "learning_rate": 0.00018690859618892506, + "loss": 1.6934, + "step": 1690 + }, + { + "epoch": 0.747025121198766, + "grad_norm": 0.32305529721640136, + "learning_rate": 0.0001867814722585719, + "loss": 1.811, + "step": 1695 + }, + { + "epoch": 0.749228735125606, + "grad_norm": 0.27366670016595385, + "learning_rate": 0.0001866537777534737, + "loss": 1.6083, + "step": 1700 + }, + { + "epoch": 0.751432349052446, + "grad_norm": 0.2978907514821807, + "learning_rate": 0.00018652551351320198, + "loss": 1.7621, + "step": 1705 + }, + { + "epoch": 0.753635962979286, + "grad_norm": 0.2682049213061407, + "learning_rate": 0.00018639668038107437, + "loss": 1.8008, + "step": 1710 + }, + { + "epoch": 0.755839576906126, + "grad_norm": 0.2422117187271709, + "learning_rate": 0.0001862672792041487, + "loss": 1.9899, + "step": 1715 + }, + { + "epoch": 0.7580431908329661, + "grad_norm": 0.27006478070278994, + "learning_rate": 0.0001861373108332177, + "loss": 1.7577, + "step": 1720 + }, + { + "epoch": 0.7602468047598061, + "grad_norm": 0.31176719361390687, + "learning_rate": 0.0001860067761228033, + "loss": 1.7494, + "step": 1725 + }, + { + "epoch": 0.7624504186866461, + "grad_norm": 0.2965711368621697, + "learning_rate": 0.00018587567593115098, + "loss": 1.9554, + "step": 1730 + }, + { + "epoch": 0.7646540326134861, + "grad_norm": 0.26922434332877426, + "learning_rate": 0.0001857440111202242, + "loss": 1.7415, + "step": 1735 + }, + { + "epoch": 0.7668576465403262, + "grad_norm": 0.25558172886185604, + "learning_rate": 0.00018561178255569879, + "loss": 1.7389, + "step": 1740 + }, + { + "epoch": 0.7690612604671662, + "grad_norm": 0.24853091753480136, + "learning_rate": 0.000185478991106957, + "loss": 1.9658, + "step": 1745 + }, + { + "epoch": 0.7712648743940062, + "grad_norm": 0.29324751667885357, + "learning_rate": 0.00018534563764708206, + "loss": 1.8161, + "step": 1750 + }, + { + "epoch": 0.7734684883208461, + "grad_norm": 0.2517432199347208, + "learning_rate": 0.00018521172305285236, + "loss": 1.6512, + "step": 1755 + }, + { + "epoch": 0.7756721022476862, + "grad_norm": 0.25142671678387746, + "learning_rate": 0.00018507724820473556, + "loss": 1.7221, + "step": 1760 + }, + { + "epoch": 0.7778757161745262, + "grad_norm": 0.2851848328431096, + "learning_rate": 0.00018494221398688307, + "loss": 1.9137, + "step": 1765 + }, + { + "epoch": 0.7800793301013662, + "grad_norm": 0.27922286690055653, + "learning_rate": 0.00018480662128712389, + "loss": 1.7529, + "step": 1770 + }, + { + "epoch": 0.7822829440282063, + "grad_norm": 0.24188014554995038, + "learning_rate": 0.00018467047099695905, + "loss": 1.7036, + "step": 1775 + }, + { + "epoch": 0.7844865579550463, + "grad_norm": 0.21832286860860883, + "learning_rate": 0.00018453376401155562, + "loss": 1.8127, + "step": 1780 + }, + { + "epoch": 0.7866901718818863, + "grad_norm": 0.26225286533808284, + "learning_rate": 0.00018439650122974087, + "loss": 1.7398, + "step": 1785 + }, + { + "epoch": 0.7888937858087263, + "grad_norm": 0.2585197080539731, + "learning_rate": 0.0001842586835539964, + "loss": 1.8645, + "step": 1790 + }, + { + "epoch": 0.7910973997355664, + "grad_norm": 0.24265804391287996, + "learning_rate": 0.00018412031189045196, + "loss": 1.7356, + "step": 1795 + }, + { + "epoch": 0.7933010136624064, + "grad_norm": 0.2590383769624523, + "learning_rate": 0.00018398138714887993, + "loss": 1.6518, + "step": 1800 + }, + { + "epoch": 0.7955046275892463, + "grad_norm": 0.23488722647726504, + "learning_rate": 0.00018384191024268894, + "loss": 1.8054, + "step": 1805 + }, + { + "epoch": 0.7977082415160864, + "grad_norm": 0.23606528165986015, + "learning_rate": 0.00018370188208891803, + "loss": 1.6994, + "step": 1810 + }, + { + "epoch": 0.7999118554429264, + "grad_norm": 0.29081075890767155, + "learning_rate": 0.00018356130360823068, + "loss": 1.987, + "step": 1815 + }, + { + "epoch": 0.8021154693697664, + "grad_norm": 0.24366570601075402, + "learning_rate": 0.00018342017572490858, + "loss": 1.5363, + "step": 1820 + }, + { + "epoch": 0.8043190832966064, + "grad_norm": 0.2677025615653137, + "learning_rate": 0.0001832784993668458, + "loss": 1.7781, + "step": 1825 + }, + { + "epoch": 0.8065226972234465, + "grad_norm": 0.22776397402134876, + "learning_rate": 0.0001831362754655424, + "loss": 1.8064, + "step": 1830 + }, + { + "epoch": 0.8087263111502865, + "grad_norm": 0.21327160658864644, + "learning_rate": 0.0001829935049560985, + "loss": 1.5278, + "step": 1835 + }, + { + "epoch": 0.8109299250771265, + "grad_norm": 0.4311681362534179, + "learning_rate": 0.0001828501887772081, + "loss": 1.9316, + "step": 1840 + }, + { + "epoch": 0.8131335390039665, + "grad_norm": 0.3127936675415962, + "learning_rate": 0.00018270632787115295, + "loss": 1.9393, + "step": 1845 + }, + { + "epoch": 0.8153371529308066, + "grad_norm": 0.2872735828645766, + "learning_rate": 0.0001825619231837962, + "loss": 1.8913, + "step": 1850 + }, + { + "epoch": 0.8175407668576465, + "grad_norm": 0.23659304750850696, + "learning_rate": 0.0001824169756645763, + "loss": 1.79, + "step": 1855 + }, + { + "epoch": 0.8197443807844865, + "grad_norm": 0.22502963250778782, + "learning_rate": 0.00018227148626650072, + "loss": 1.7616, + "step": 1860 + }, + { + "epoch": 0.8219479947113266, + "grad_norm": 0.22975193518596113, + "learning_rate": 0.00018212545594613978, + "loss": 1.7862, + "step": 1865 + }, + { + "epoch": 0.8241516086381666, + "grad_norm": 0.22813094138391649, + "learning_rate": 0.00018197888566362023, + "loss": 1.6909, + "step": 1870 + }, + { + "epoch": 0.8263552225650066, + "grad_norm": 0.3170639776475332, + "learning_rate": 0.00018183177638261895, + "loss": 1.8876, + "step": 1875 + }, + { + "epoch": 0.8285588364918466, + "grad_norm": 0.25433638493270494, + "learning_rate": 0.00018168412907035672, + "loss": 1.7447, + "step": 1880 + }, + { + "epoch": 0.8307624504186867, + "grad_norm": 0.2527115922098035, + "learning_rate": 0.00018153594469759175, + "loss": 1.7288, + "step": 1885 + }, + { + "epoch": 0.8329660643455267, + "grad_norm": 0.24448176152726306, + "learning_rate": 0.00018138722423861333, + "loss": 1.8385, + "step": 1890 + }, + { + "epoch": 0.8351696782723667, + "grad_norm": 0.2797043743850419, + "learning_rate": 0.00018123796867123548, + "loss": 1.8353, + "step": 1895 + }, + { + "epoch": 0.8373732921992068, + "grad_norm": 0.26948384291872024, + "learning_rate": 0.00018108817897679043, + "loss": 1.6995, + "step": 1900 + }, + { + "epoch": 0.8395769061260467, + "grad_norm": 0.23050209288787957, + "learning_rate": 0.00018093785614012228, + "loss": 1.7752, + "step": 1905 + }, + { + "epoch": 0.8417805200528867, + "grad_norm": 0.28186670258249874, + "learning_rate": 0.0001807870011495803, + "loss": 1.8608, + "step": 1910 + }, + { + "epoch": 0.8439841339797267, + "grad_norm": 0.29105838653881044, + "learning_rate": 0.00018063561499701282, + "loss": 1.8997, + "step": 1915 + }, + { + "epoch": 0.8461877479065668, + "grad_norm": 0.29426186229081736, + "learning_rate": 0.00018048369867776029, + "loss": 1.6416, + "step": 1920 + }, + { + "epoch": 0.8483913618334068, + "grad_norm": 0.21168547827802603, + "learning_rate": 0.00018033125319064902, + "loss": 1.8158, + "step": 1925 + }, + { + "epoch": 0.8505949757602468, + "grad_norm": 0.35188920795977724, + "learning_rate": 0.00018017827953798444, + "loss": 1.7531, + "step": 1930 + }, + { + "epoch": 0.8527985896870868, + "grad_norm": 0.2979347408479712, + "learning_rate": 0.0001800247787255447, + "loss": 1.9657, + "step": 1935 + }, + { + "epoch": 0.8550022036139269, + "grad_norm": 0.3102746795792285, + "learning_rate": 0.00017987075176257382, + "loss": 1.5273, + "step": 1940 + }, + { + "epoch": 0.8572058175407669, + "grad_norm": 0.2141203314519693, + "learning_rate": 0.00017971619966177524, + "loss": 1.7978, + "step": 1945 + }, + { + "epoch": 0.8594094314676068, + "grad_norm": 0.3010217472666534, + "learning_rate": 0.00017956112343930512, + "loss": 1.8066, + "step": 1950 + }, + { + "epoch": 0.8616130453944469, + "grad_norm": 0.2685511500450377, + "learning_rate": 0.00017940552411476566, + "loss": 1.8096, + "step": 1955 + }, + { + "epoch": 0.8638166593212869, + "grad_norm": 0.3648269512255727, + "learning_rate": 0.00017924940271119827, + "loss": 1.9212, + "step": 1960 + }, + { + "epoch": 0.8660202732481269, + "grad_norm": 0.23673592232409962, + "learning_rate": 0.00017909276025507696, + "loss": 1.9925, + "step": 1965 + }, + { + "epoch": 0.8682238871749669, + "grad_norm": 0.22051543871837528, + "learning_rate": 0.00017893559777630173, + "loss": 1.8895, + "step": 1970 + }, + { + "epoch": 0.870427501101807, + "grad_norm": 0.25128906373836973, + "learning_rate": 0.00017877791630819149, + "loss": 1.7637, + "step": 1975 + }, + { + "epoch": 0.872631115028647, + "grad_norm": 0.2982588862018805, + "learning_rate": 0.00017861971688747747, + "loss": 1.865, + "step": 1980 + }, + { + "epoch": 0.874834728955487, + "grad_norm": 0.2927080940082808, + "learning_rate": 0.00017846100055429642, + "loss": 1.742, + "step": 1985 + }, + { + "epoch": 0.8770383428823271, + "grad_norm": 0.30412581475879524, + "learning_rate": 0.00017830176835218368, + "loss": 1.6706, + "step": 1990 + }, + { + "epoch": 0.8792419568091671, + "grad_norm": 0.2902194881075315, + "learning_rate": 0.0001781420213280662, + "loss": 1.8014, + "step": 1995 + }, + { + "epoch": 0.881445570736007, + "grad_norm": 0.3378104042442177, + "learning_rate": 0.00017798176053225606, + "loss": 1.8318, + "step": 2000 + }, + { + "epoch": 0.883649184662847, + "grad_norm": 0.2465500138547731, + "learning_rate": 0.0001778209870184431, + "loss": 1.6756, + "step": 2005 + }, + { + "epoch": 0.8858527985896871, + "grad_norm": 0.27205493094420413, + "learning_rate": 0.00017765970184368835, + "loss": 1.7398, + "step": 2010 + }, + { + "epoch": 0.8880564125165271, + "grad_norm": 0.28881718844442433, + "learning_rate": 0.0001774979060684168, + "loss": 1.8652, + "step": 2015 + }, + { + "epoch": 0.8902600264433671, + "grad_norm": 0.2930186193777016, + "learning_rate": 0.0001773356007564107, + "loss": 1.7748, + "step": 2020 + }, + { + "epoch": 0.8924636403702071, + "grad_norm": 0.28319545050901546, + "learning_rate": 0.0001771727869748023, + "loss": 1.7198, + "step": 2025 + }, + { + "epoch": 0.8946672542970472, + "grad_norm": 0.26485391899745814, + "learning_rate": 0.000177009465794067, + "loss": 1.7109, + "step": 2030 + }, + { + "epoch": 0.8968708682238872, + "grad_norm": 0.2367084302014588, + "learning_rate": 0.0001768456382880163, + "loss": 1.773, + "step": 2035 + }, + { + "epoch": 0.8990744821507272, + "grad_norm": 0.28707015291179266, + "learning_rate": 0.00017668130553379063, + "loss": 1.8698, + "step": 2040 + }, + { + "epoch": 0.9012780960775673, + "grad_norm": 0.26521748140973644, + "learning_rate": 0.00017651646861185252, + "loss": 1.5433, + "step": 2045 + }, + { + "epoch": 0.9034817100044072, + "grad_norm": 0.25186929013428017, + "learning_rate": 0.0001763511286059791, + "loss": 1.7003, + "step": 2050 + }, + { + "epoch": 0.9056853239312472, + "grad_norm": 0.2646341657457682, + "learning_rate": 0.0001761852866032554, + "loss": 1.8017, + "step": 2055 + }, + { + "epoch": 0.9078889378580872, + "grad_norm": 0.27426709503702656, + "learning_rate": 0.0001760189436940669, + "loss": 1.717, + "step": 2060 + }, + { + "epoch": 0.9100925517849273, + "grad_norm": 0.2909513812455515, + "learning_rate": 0.00017585210097209242, + "loss": 1.8286, + "step": 2065 + }, + { + "epoch": 0.9122961657117673, + "grad_norm": 0.3195261689470233, + "learning_rate": 0.00017568475953429706, + "loss": 1.9248, + "step": 2070 + }, + { + "epoch": 0.9144997796386073, + "grad_norm": 0.2559454772629871, + "learning_rate": 0.00017551692048092487, + "loss": 1.9235, + "step": 2075 + }, + { + "epoch": 0.9167033935654474, + "grad_norm": 0.2787085951940099, + "learning_rate": 0.00017534858491549167, + "loss": 1.5563, + "step": 2080 + }, + { + "epoch": 0.9189070074922874, + "grad_norm": 0.27430346968172775, + "learning_rate": 0.00017517975394477765, + "loss": 1.7408, + "step": 2085 + }, + { + "epoch": 0.9211106214191274, + "grad_norm": 0.3057530869061833, + "learning_rate": 0.00017501042867882043, + "loss": 1.8029, + "step": 2090 + }, + { + "epoch": 0.9233142353459673, + "grad_norm": 0.28792933063158205, + "learning_rate": 0.0001748406102309073, + "loss": 1.7174, + "step": 2095 + }, + { + "epoch": 0.9255178492728074, + "grad_norm": 0.2926998767200136, + "learning_rate": 0.00017467029971756837, + "loss": 1.7753, + "step": 2100 + }, + { + "epoch": 0.9277214631996474, + "grad_norm": 0.3202640628143745, + "learning_rate": 0.00017449949825856881, + "loss": 1.6815, + "step": 2105 + }, + { + "epoch": 0.9299250771264874, + "grad_norm": 0.2898987783743886, + "learning_rate": 0.00017432820697690183, + "loss": 1.5471, + "step": 2110 + }, + { + "epoch": 0.9321286910533274, + "grad_norm": 0.25429358796976326, + "learning_rate": 0.00017415642699878108, + "loss": 1.801, + "step": 2115 + }, + { + "epoch": 0.9343323049801675, + "grad_norm": 0.2906871474145296, + "learning_rate": 0.00017398415945363326, + "loss": 1.7255, + "step": 2120 + }, + { + "epoch": 0.9365359189070075, + "grad_norm": 0.24941394640440692, + "learning_rate": 0.00017381140547409091, + "loss": 1.7265, + "step": 2125 + }, + { + "epoch": 0.9387395328338475, + "grad_norm": 0.2680278555985492, + "learning_rate": 0.00017363816619598462, + "loss": 1.8507, + "step": 2130 + }, + { + "epoch": 0.9409431467606876, + "grad_norm": 0.24326339969260075, + "learning_rate": 0.00017346444275833587, + "loss": 1.8278, + "step": 2135 + }, + { + "epoch": 0.9431467606875276, + "grad_norm": 0.2881192799230667, + "learning_rate": 0.00017329023630334935, + "loss": 1.6301, + "step": 2140 + }, + { + "epoch": 0.9453503746143676, + "grad_norm": 0.22324975257691354, + "learning_rate": 0.00017311554797640552, + "loss": 1.8182, + "step": 2145 + }, + { + "epoch": 0.9475539885412075, + "grad_norm": 0.38310870823903137, + "learning_rate": 0.0001729403789260531, + "loss": 1.6758, + "step": 2150 + }, + { + "epoch": 0.9497576024680476, + "grad_norm": 0.3082503271893105, + "learning_rate": 0.0001727647303040015, + "loss": 1.717, + "step": 2155 + }, + { + "epoch": 0.9519612163948876, + "grad_norm": 0.2910152870088472, + "learning_rate": 0.00017258860326511318, + "loss": 1.6762, + "step": 2160 + }, + { + "epoch": 0.9541648303217276, + "grad_norm": 0.2751619266213339, + "learning_rate": 0.00017241199896739614, + "loss": 1.5402, + "step": 2165 + }, + { + "epoch": 0.9563684442485677, + "grad_norm": 0.2633670365064433, + "learning_rate": 0.00017223491857199636, + "loss": 1.6089, + "step": 2170 + }, + { + "epoch": 0.9585720581754077, + "grad_norm": 0.26253468282607706, + "learning_rate": 0.00017205736324318999, + "loss": 1.8698, + "step": 2175 + }, + { + "epoch": 0.9607756721022477, + "grad_norm": 0.2623810146620032, + "learning_rate": 0.0001718793341483758, + "loss": 1.7996, + "step": 2180 + }, + { + "epoch": 0.9629792860290877, + "grad_norm": 0.2541467303967672, + "learning_rate": 0.00017170083245806757, + "loss": 1.7066, + "step": 2185 + }, + { + "epoch": 0.9651828999559278, + "grad_norm": 0.2450837271489877, + "learning_rate": 0.00017152185934588623, + "loss": 1.9326, + "step": 2190 + }, + { + "epoch": 0.9673865138827678, + "grad_norm": 0.3277904173770177, + "learning_rate": 0.00017134241598855236, + "loss": 1.941, + "step": 2195 + }, + { + "epoch": 0.9695901278096077, + "grad_norm": 0.26036896345598354, + "learning_rate": 0.0001711625035658782, + "loss": 1.7543, + "step": 2200 + }, + { + "epoch": 0.9717937417364477, + "grad_norm": 0.294563677729738, + "learning_rate": 0.00017098212326076008, + "loss": 1.6402, + "step": 2205 + }, + { + "epoch": 0.9739973556632878, + "grad_norm": 0.3220804392466043, + "learning_rate": 0.0001708012762591706, + "loss": 1.7078, + "step": 2210 + }, + { + "epoch": 0.9762009695901278, + "grad_norm": 0.2811686235748074, + "learning_rate": 0.00017061996375015078, + "loss": 1.9067, + "step": 2215 + }, + { + "epoch": 0.9784045835169678, + "grad_norm": 0.23864027527398188, + "learning_rate": 0.00017043818692580228, + "loss": 1.9146, + "step": 2220 + }, + { + "epoch": 0.9806081974438079, + "grad_norm": 0.2954583734368435, + "learning_rate": 0.00017025594698127965, + "loss": 1.791, + "step": 2225 + }, + { + "epoch": 0.9828118113706479, + "grad_norm": 0.2510099463630807, + "learning_rate": 0.00017007324511478223, + "loss": 1.5883, + "step": 2230 + }, + { + "epoch": 0.9850154252974879, + "grad_norm": 0.27874972856939056, + "learning_rate": 0.00016989008252754655, + "loss": 1.7865, + "step": 2235 + }, + { + "epoch": 0.9872190392243279, + "grad_norm": 0.2611091563705049, + "learning_rate": 0.00016970646042383826, + "loss": 1.8104, + "step": 2240 + }, + { + "epoch": 0.989422653151168, + "grad_norm": 0.28511909006315234, + "learning_rate": 0.00016952238001094428, + "loss": 1.6686, + "step": 2245 + }, + { + "epoch": 0.9916262670780079, + "grad_norm": 0.3381628494565006, + "learning_rate": 0.00016933784249916476, + "loss": 1.9412, + "step": 2250 + }, + { + "epoch": 0.9938298810048479, + "grad_norm": 0.28520119611042416, + "learning_rate": 0.00016915284910180533, + "loss": 1.6889, + "step": 2255 + }, + { + "epoch": 0.996033494931688, + "grad_norm": 0.2883852687543481, + "learning_rate": 0.00016896740103516895, + "loss": 1.8003, + "step": 2260 + }, + { + "epoch": 0.998237108858528, + "grad_norm": 0.24451797378236065, + "learning_rate": 0.0001687814995185479, + "loss": 1.8752, + "step": 2265 + }, + { + "epoch": 1.000440722785368, + "grad_norm": 0.33508649606500057, + "learning_rate": 0.00016859514577421592, + "loss": 1.8299, + "step": 2270 + }, + { + "epoch": 1.002644336712208, + "grad_norm": 0.2636718535549577, + "learning_rate": 0.00016840834102741997, + "loss": 1.706, + "step": 2275 + }, + { + "epoch": 1.004847950639048, + "grad_norm": 0.27046399540212357, + "learning_rate": 0.00016822108650637238, + "loss": 1.6819, + "step": 2280 + }, + { + "epoch": 1.007051564565888, + "grad_norm": 0.30877609431387876, + "learning_rate": 0.00016803338344224266, + "loss": 1.7218, + "step": 2285 + }, + { + "epoch": 1.0092551784927282, + "grad_norm": 0.23492423335336904, + "learning_rate": 0.00016784523306914934, + "loss": 1.57, + "step": 2290 + }, + { + "epoch": 1.0114587924195682, + "grad_norm": 0.2795118641324036, + "learning_rate": 0.00016765663662415204, + "loss": 1.7023, + "step": 2295 + }, + { + "epoch": 1.0136624063464081, + "grad_norm": 0.34933397332267874, + "learning_rate": 0.00016746759534724316, + "loss": 1.9401, + "step": 2300 + }, + { + "epoch": 1.0158660202732481, + "grad_norm": 0.2928363020374088, + "learning_rate": 0.00016727811048133985, + "loss": 1.7873, + "step": 2305 + }, + { + "epoch": 1.0180696342000881, + "grad_norm": 0.2765841121495588, + "learning_rate": 0.00016708818327227574, + "loss": 1.6925, + "step": 2310 + }, + { + "epoch": 1.020273248126928, + "grad_norm": 0.3407816261608907, + "learning_rate": 0.00016689781496879283, + "loss": 1.7583, + "step": 2315 + }, + { + "epoch": 1.022476862053768, + "grad_norm": 0.3837729171353601, + "learning_rate": 0.00016670700682253328, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.0246804759806083, + "grad_norm": 0.2833490201778371, + "learning_rate": 0.00016651576008803112, + "loss": 1.7306, + "step": 2325 + }, + { + "epoch": 1.0268840899074483, + "grad_norm": 0.24052446562056756, + "learning_rate": 0.00016632407602270398, + "loss": 1.6612, + "step": 2330 + }, + { + "epoch": 1.0290877038342883, + "grad_norm": 0.2706767115179465, + "learning_rate": 0.00016613195588684488, + "loss": 1.5943, + "step": 2335 + }, + { + "epoch": 1.0312913177611283, + "grad_norm": 0.2694153548940927, + "learning_rate": 0.00016593940094361407, + "loss": 1.7072, + "step": 2340 + }, + { + "epoch": 1.0334949316879682, + "grad_norm": 0.2675415111083724, + "learning_rate": 0.0001657464124590304, + "loss": 1.7392, + "step": 2345 + }, + { + "epoch": 1.0356985456148082, + "grad_norm": 0.2783138545253644, + "learning_rate": 0.00016555299170196332, + "loss": 1.7264, + "step": 2350 + }, + { + "epoch": 1.0379021595416482, + "grad_norm": 0.29818023025744944, + "learning_rate": 0.00016535913994412436, + "loss": 1.6038, + "step": 2355 + }, + { + "epoch": 1.0401057734684884, + "grad_norm": 0.30521070526088534, + "learning_rate": 0.00016516485846005882, + "loss": 1.652, + "step": 2360 + }, + { + "epoch": 1.0423093873953284, + "grad_norm": 0.24990284589728196, + "learning_rate": 0.00016497014852713738, + "loss": 1.5745, + "step": 2365 + }, + { + "epoch": 1.0445130013221684, + "grad_norm": 0.36544530123344093, + "learning_rate": 0.0001647750114255477, + "loss": 1.8678, + "step": 2370 + }, + { + "epoch": 1.0467166152490084, + "grad_norm": 0.2984936690547218, + "learning_rate": 0.000164579448438286, + "loss": 1.8263, + "step": 2375 + }, + { + "epoch": 1.0489202291758484, + "grad_norm": 0.3138432337464576, + "learning_rate": 0.00016438346085114865, + "loss": 1.7807, + "step": 2380 + }, + { + "epoch": 1.0511238431026884, + "grad_norm": 0.409332779624452, + "learning_rate": 0.00016418704995272373, + "loss": 1.8612, + "step": 2385 + }, + { + "epoch": 1.0533274570295283, + "grad_norm": 0.27134598526523496, + "learning_rate": 0.00016399021703438247, + "loss": 1.7323, + "step": 2390 + }, + { + "epoch": 1.0555310709563686, + "grad_norm": 0.2852569861948395, + "learning_rate": 0.0001637929633902708, + "loss": 1.7619, + "step": 2395 + }, + { + "epoch": 1.0577346848832085, + "grad_norm": 0.2913405836362002, + "learning_rate": 0.00016359529031730093, + "loss": 1.8196, + "step": 2400 + }, + { + "epoch": 1.0599382988100485, + "grad_norm": 0.3029325708755559, + "learning_rate": 0.00016339719911514272, + "loss": 1.7579, + "step": 2405 + }, + { + "epoch": 1.0621419127368885, + "grad_norm": 0.2738995593865038, + "learning_rate": 0.00016319869108621512, + "loss": 1.8309, + "step": 2410 + }, + { + "epoch": 1.0643455266637285, + "grad_norm": 0.26503420605103173, + "learning_rate": 0.00016299976753567772, + "loss": 1.708, + "step": 2415 + }, + { + "epoch": 1.0665491405905685, + "grad_norm": 0.32283303116470646, + "learning_rate": 0.00016280042977142204, + "loss": 1.6915, + "step": 2420 + }, + { + "epoch": 1.0687527545174085, + "grad_norm": 0.24107839513422735, + "learning_rate": 0.00016260067910406304, + "loss": 1.5685, + "step": 2425 + }, + { + "epoch": 1.0709563684442487, + "grad_norm": 0.29608435429464214, + "learning_rate": 0.00016240051684693042, + "loss": 1.7239, + "step": 2430 + }, + { + "epoch": 1.0731599823710887, + "grad_norm": 0.29986988301001716, + "learning_rate": 0.00016219994431606005, + "loss": 1.6816, + "step": 2435 + }, + { + "epoch": 1.0753635962979287, + "grad_norm": 0.24843757070636935, + "learning_rate": 0.00016199896283018527, + "loss": 1.5677, + "step": 2440 + }, + { + "epoch": 1.0775672102247686, + "grad_norm": 0.2983161128962509, + "learning_rate": 0.00016179757371072824, + "loss": 1.7859, + "step": 2445 + }, + { + "epoch": 1.0797708241516086, + "grad_norm": 0.2787284156730912, + "learning_rate": 0.00016159577828179123, + "loss": 1.562, + "step": 2450 + }, + { + "epoch": 1.0819744380784486, + "grad_norm": 0.3269743373279123, + "learning_rate": 0.0001613935778701479, + "loss": 1.8759, + "step": 2455 + }, + { + "epoch": 1.0841780520052886, + "grad_norm": 0.2802831939826356, + "learning_rate": 0.0001611909738052347, + "loss": 1.7401, + "step": 2460 + }, + { + "epoch": 1.0863816659321286, + "grad_norm": 0.23846388238067243, + "learning_rate": 0.000160987967419142, + "loss": 1.727, + "step": 2465 + }, + { + "epoch": 1.0885852798589688, + "grad_norm": 0.2659078946356941, + "learning_rate": 0.00016078456004660536, + "loss": 1.6454, + "step": 2470 + }, + { + "epoch": 1.0907888937858088, + "grad_norm": 0.29111397323788823, + "learning_rate": 0.00016058075302499673, + "loss": 1.7724, + "step": 2475 + }, + { + "epoch": 1.0929925077126488, + "grad_norm": 0.35649745669814514, + "learning_rate": 0.00016037654769431576, + "loss": 1.6527, + "step": 2480 + }, + { + "epoch": 1.0951961216394888, + "grad_norm": 0.28097148721584675, + "learning_rate": 0.00016017194539718086, + "loss": 1.7563, + "step": 2485 + }, + { + "epoch": 1.0973997355663287, + "grad_norm": 0.39237884989816185, + "learning_rate": 0.0001599669474788205, + "loss": 1.8656, + "step": 2490 + }, + { + "epoch": 1.0996033494931687, + "grad_norm": 0.2633010799321111, + "learning_rate": 0.00015976155528706415, + "loss": 1.7129, + "step": 2495 + }, + { + "epoch": 1.1018069634200087, + "grad_norm": 0.2719172135219212, + "learning_rate": 0.0001595557701723338, + "loss": 1.5688, + "step": 2500 + }, + { + "epoch": 1.104010577346849, + "grad_norm": 0.2731378192802766, + "learning_rate": 0.00015934959348763467, + "loss": 1.7727, + "step": 2505 + }, + { + "epoch": 1.106214191273689, + "grad_norm": 0.31808171740983593, + "learning_rate": 0.00015914302658854657, + "loss": 1.8461, + "step": 2510 + }, + { + "epoch": 1.108417805200529, + "grad_norm": 0.2869427974730242, + "learning_rate": 0.00015893607083321477, + "loss": 1.8664, + "step": 2515 + }, + { + "epoch": 1.110621419127369, + "grad_norm": 0.2187585282755932, + "learning_rate": 0.00015872872758234148, + "loss": 1.6029, + "step": 2520 + }, + { + "epoch": 1.1128250330542089, + "grad_norm": 0.2667619562054791, + "learning_rate": 0.00015852099819917639, + "loss": 1.8981, + "step": 2525 + }, + { + "epoch": 1.1150286469810489, + "grad_norm": 0.3372014415136272, + "learning_rate": 0.00015831288404950802, + "loss": 1.7639, + "step": 2530 + }, + { + "epoch": 1.1172322609078889, + "grad_norm": 0.3048158336194716, + "learning_rate": 0.0001581043865016547, + "loss": 1.6387, + "step": 2535 + }, + { + "epoch": 1.1194358748347288, + "grad_norm": 0.2971544124275263, + "learning_rate": 0.00015789550692645556, + "loss": 1.7692, + "step": 2540 + }, + { + "epoch": 1.121639488761569, + "grad_norm": 0.2518439420968902, + "learning_rate": 0.00015768624669726145, + "loss": 1.5533, + "step": 2545 + }, + { + "epoch": 1.123843102688409, + "grad_norm": 0.28947857100850594, + "learning_rate": 0.00015747660718992598, + "loss": 1.6443, + "step": 2550 + }, + { + "epoch": 1.126046716615249, + "grad_norm": 0.27744579032481387, + "learning_rate": 0.00015726658978279642, + "loss": 1.7146, + "step": 2555 + }, + { + "epoch": 1.128250330542089, + "grad_norm": 0.28434343297956843, + "learning_rate": 0.00015705619585670478, + "loss": 1.8686, + "step": 2560 + }, + { + "epoch": 1.130453944468929, + "grad_norm": 0.26116640553101533, + "learning_rate": 0.00015684542679495847, + "loss": 1.7831, + "step": 2565 + }, + { + "epoch": 1.132657558395769, + "grad_norm": 0.31380511545232626, + "learning_rate": 0.00015663428398333157, + "loss": 1.6778, + "step": 2570 + }, + { + "epoch": 1.134861172322609, + "grad_norm": 0.2747114435356579, + "learning_rate": 0.0001564227688100552, + "loss": 1.6324, + "step": 2575 + }, + { + "epoch": 1.1370647862494492, + "grad_norm": 0.2678076483406117, + "learning_rate": 0.00015621088266580904, + "loss": 1.4946, + "step": 2580 + }, + { + "epoch": 1.1392684001762892, + "grad_norm": 0.33262666113926864, + "learning_rate": 0.00015599862694371157, + "loss": 1.882, + "step": 2585 + }, + { + "epoch": 1.1414720141031292, + "grad_norm": 0.30861283361311576, + "learning_rate": 0.00015578600303931136, + "loss": 1.6738, + "step": 2590 + }, + { + "epoch": 1.1436756280299691, + "grad_norm": 0.30927635505724477, + "learning_rate": 0.00015557301235057767, + "loss": 1.7005, + "step": 2595 + }, + { + "epoch": 1.1458792419568091, + "grad_norm": 0.28688292761286854, + "learning_rate": 0.00015535965627789126, + "loss": 1.6462, + "step": 2600 + }, + { + "epoch": 1.1480828558836491, + "grad_norm": 0.27171604870426835, + "learning_rate": 0.00015514593622403532, + "loss": 1.585, + "step": 2605 + }, + { + "epoch": 1.150286469810489, + "grad_norm": 0.3092542662127764, + "learning_rate": 0.0001549318535941861, + "loss": 1.9096, + "step": 2610 + }, + { + "epoch": 1.1524900837373293, + "grad_norm": 0.2716117253477657, + "learning_rate": 0.00015471740979590377, + "loss": 1.7352, + "step": 2615 + }, + { + "epoch": 1.1546936976641693, + "grad_norm": 0.3066935084879198, + "learning_rate": 0.0001545026062391231, + "loss": 1.8141, + "step": 2620 + }, + { + "epoch": 1.1568973115910093, + "grad_norm": 0.346363671435922, + "learning_rate": 0.00015428744433614415, + "loss": 1.5573, + "step": 2625 + }, + { + "epoch": 1.1591009255178493, + "grad_norm": 0.29157425354464905, + "learning_rate": 0.00015407192550162318, + "loss": 1.5464, + "step": 2630 + }, + { + "epoch": 1.1613045394446893, + "grad_norm": 0.2806374114105095, + "learning_rate": 0.0001538560511525632, + "loss": 1.6386, + "step": 2635 + }, + { + "epoch": 1.1635081533715292, + "grad_norm": 0.2982356223064332, + "learning_rate": 0.0001536398227083046, + "loss": 1.7813, + "step": 2640 + }, + { + "epoch": 1.1657117672983692, + "grad_norm": 0.33202427377524757, + "learning_rate": 0.00015342324159051587, + "loss": 1.9532, + "step": 2645 + }, + { + "epoch": 1.1679153812252094, + "grad_norm": 0.33897582151852124, + "learning_rate": 0.00015320630922318444, + "loss": 1.7746, + "step": 2650 + }, + { + "epoch": 1.1701189951520494, + "grad_norm": 0.3321654404402584, + "learning_rate": 0.00015298902703260692, + "loss": 2.0143, + "step": 2655 + }, + { + "epoch": 1.1723226090788894, + "grad_norm": 0.21833177585011845, + "learning_rate": 0.0001527713964473802, + "loss": 1.6702, + "step": 2660 + }, + { + "epoch": 1.1745262230057294, + "grad_norm": 0.26595318714741284, + "learning_rate": 0.00015255341889839157, + "loss": 1.683, + "step": 2665 + }, + { + "epoch": 1.1767298369325694, + "grad_norm": 0.2880553889858748, + "learning_rate": 0.00015233509581880973, + "loss": 1.6248, + "step": 2670 + }, + { + "epoch": 1.1789334508594094, + "grad_norm": 0.2966201251141622, + "learning_rate": 0.0001521164286440751, + "loss": 1.5788, + "step": 2675 + }, + { + "epoch": 1.1811370647862494, + "grad_norm": 0.32819026600076084, + "learning_rate": 0.00015189741881189054, + "loss": 1.6132, + "step": 2680 + }, + { + "epoch": 1.1833406787130896, + "grad_norm": 0.25598509051489227, + "learning_rate": 0.00015167806776221178, + "loss": 1.7194, + "step": 2685 + }, + { + "epoch": 1.1855442926399296, + "grad_norm": 0.32216524743892266, + "learning_rate": 0.000151458376937238, + "loss": 1.6499, + "step": 2690 + }, + { + "epoch": 1.1877479065667695, + "grad_norm": 0.25107175983686675, + "learning_rate": 0.00015123834778140233, + "loss": 1.6059, + "step": 2695 + }, + { + "epoch": 1.1899515204936095, + "grad_norm": 0.26100544568346645, + "learning_rate": 0.00015101798174136247, + "loss": 1.7677, + "step": 2700 + }, + { + "epoch": 1.1921551344204495, + "grad_norm": 0.28164900490660866, + "learning_rate": 0.000150797280265991, + "loss": 1.6141, + "step": 2705 + }, + { + "epoch": 1.1943587483472895, + "grad_norm": 0.25606567943961145, + "learning_rate": 0.00015057624480636594, + "loss": 1.6868, + "step": 2710 + }, + { + "epoch": 1.1965623622741295, + "grad_norm": 0.30536561643444826, + "learning_rate": 0.0001503548768157612, + "loss": 1.515, + "step": 2715 + }, + { + "epoch": 1.1987659762009697, + "grad_norm": 0.2495814195185963, + "learning_rate": 0.00015013317774963708, + "loss": 1.5754, + "step": 2720 + }, + { + "epoch": 1.2009695901278097, + "grad_norm": 0.3292329862356163, + "learning_rate": 0.00014991114906563055, + "loss": 1.6599, + "step": 2725 + }, + { + "epoch": 1.2031732040546497, + "grad_norm": 0.28736997494401945, + "learning_rate": 0.00014968879222354597, + "loss": 1.6939, + "step": 2730 + }, + { + "epoch": 1.2053768179814897, + "grad_norm": 0.28066545139959265, + "learning_rate": 0.00014946610868534502, + "loss": 1.5954, + "step": 2735 + }, + { + "epoch": 1.2075804319083296, + "grad_norm": 0.27963157670324124, + "learning_rate": 0.00014924309991513757, + "loss": 1.6816, + "step": 2740 + }, + { + "epoch": 1.2097840458351696, + "grad_norm": 0.3074722093066757, + "learning_rate": 0.0001490197673791717, + "loss": 1.5102, + "step": 2745 + }, + { + "epoch": 1.2119876597620096, + "grad_norm": 0.5175039724428968, + "learning_rate": 0.00014879611254582428, + "loss": 1.8587, + "step": 2750 + }, + { + "epoch": 1.2141912736888498, + "grad_norm": 0.4607644456138267, + "learning_rate": 0.00014857213688559124, + "loss": 1.8861, + "step": 2755 + }, + { + "epoch": 1.2163948876156898, + "grad_norm": 0.203471979843397, + "learning_rate": 0.00014834784187107785, + "loss": 1.5549, + "step": 2760 + }, + { + "epoch": 1.2185985015425298, + "grad_norm": 0.27836500512738993, + "learning_rate": 0.00014812322897698912, + "loss": 1.6677, + "step": 2765 + }, + { + "epoch": 1.2208021154693698, + "grad_norm": 0.2681432382916657, + "learning_rate": 0.00014789829968012, + "loss": 1.8601, + "step": 2770 + }, + { + "epoch": 1.2230057293962098, + "grad_norm": 0.24822408823990583, + "learning_rate": 0.00014767305545934588, + "loss": 1.9008, + "step": 2775 + }, + { + "epoch": 1.2252093433230498, + "grad_norm": 0.26961514947075566, + "learning_rate": 0.00014744749779561258, + "loss": 1.7573, + "step": 2780 + }, + { + "epoch": 1.2274129572498897, + "grad_norm": 0.40650037835816966, + "learning_rate": 0.0001472216281719269, + "loss": 1.6177, + "step": 2785 + }, + { + "epoch": 1.22961657117673, + "grad_norm": 0.27436209441284687, + "learning_rate": 0.0001469954480733465, + "loss": 1.6021, + "step": 2790 + }, + { + "epoch": 1.23182018510357, + "grad_norm": 0.2563256590060921, + "learning_rate": 0.00014676895898697062, + "loss": 1.9842, + "step": 2795 + }, + { + "epoch": 1.23402379903041, + "grad_norm": 0.35591941682342815, + "learning_rate": 0.00014654216240192995, + "loss": 1.6028, + "step": 2800 + }, + { + "epoch": 1.23622741295725, + "grad_norm": 0.33349441263276575, + "learning_rate": 0.00014631505980937688, + "loss": 1.852, + "step": 2805 + }, + { + "epoch": 1.23843102688409, + "grad_norm": 0.31528740240587627, + "learning_rate": 0.0001460876527024758, + "loss": 1.5587, + "step": 2810 + }, + { + "epoch": 1.24063464081093, + "grad_norm": 0.25877213395041015, + "learning_rate": 0.00014585994257639324, + "loss": 1.5482, + "step": 2815 + }, + { + "epoch": 1.2428382547377699, + "grad_norm": 0.2910006006185105, + "learning_rate": 0.00014563193092828803, + "loss": 1.6998, + "step": 2820 + }, + { + "epoch": 1.24504186866461, + "grad_norm": 0.37486054420050446, + "learning_rate": 0.00014540361925730147, + "loss": 1.8516, + "step": 2825 + }, + { + "epoch": 1.24724548259145, + "grad_norm": 0.34335913235176224, + "learning_rate": 0.00014517500906454742, + "loss": 1.6384, + "step": 2830 + }, + { + "epoch": 1.24944909651829, + "grad_norm": 0.2930060418053195, + "learning_rate": 0.00014494610185310252, + "loss": 1.8508, + "step": 2835 + }, + { + "epoch": 1.25165271044513, + "grad_norm": 0.27045205482633095, + "learning_rate": 0.00014471689912799626, + "loss": 1.5935, + "step": 2840 + }, + { + "epoch": 1.25385632437197, + "grad_norm": 0.30870157000744336, + "learning_rate": 0.00014448740239620108, + "loss": 1.8287, + "step": 2845 + }, + { + "epoch": 1.25605993829881, + "grad_norm": 0.27087387170107313, + "learning_rate": 0.00014425761316662241, + "loss": 1.9209, + "step": 2850 + }, + { + "epoch": 1.25826355222565, + "grad_norm": 0.32362518237545235, + "learning_rate": 0.0001440275329500889, + "loss": 1.7297, + "step": 2855 + }, + { + "epoch": 1.2604671661524902, + "grad_norm": 0.3473479960796504, + "learning_rate": 0.00014379716325934236, + "loss": 1.7847, + "step": 2860 + }, + { + "epoch": 1.26267078007933, + "grad_norm": 0.2985625193084418, + "learning_rate": 0.0001435665056090278, + "loss": 1.7811, + "step": 2865 + }, + { + "epoch": 1.2648743940061702, + "grad_norm": 0.2549368072217001, + "learning_rate": 0.00014333556151568364, + "loss": 1.8424, + "step": 2870 + }, + { + "epoch": 1.2670780079330102, + "grad_norm": 0.3031064940015495, + "learning_rate": 0.00014310433249773146, + "loss": 1.8502, + "step": 2875 + }, + { + "epoch": 1.2692816218598502, + "grad_norm": 0.2398504180714546, + "learning_rate": 0.00014287282007546627, + "loss": 1.648, + "step": 2880 + }, + { + "epoch": 1.2714852357866901, + "grad_norm": 0.33873410733492354, + "learning_rate": 0.00014264102577104645, + "loss": 1.6617, + "step": 2885 + }, + { + "epoch": 1.2736888497135301, + "grad_norm": 0.2655552963090036, + "learning_rate": 0.00014240895110848365, + "loss": 1.7205, + "step": 2890 + }, + { + "epoch": 1.2758924636403703, + "grad_norm": 0.30714278749605195, + "learning_rate": 0.0001421765976136328, + "loss": 1.7343, + "step": 2895 + }, + { + "epoch": 1.27809607756721, + "grad_norm": 0.31410869299454564, + "learning_rate": 0.0001419439668141822, + "loss": 1.7369, + "step": 2900 + }, + { + "epoch": 1.2802996914940503, + "grad_norm": 0.29231072094243427, + "learning_rate": 0.0001417110602396434, + "loss": 1.5317, + "step": 2905 + }, + { + "epoch": 1.2825033054208903, + "grad_norm": 0.26156390622111436, + "learning_rate": 0.00014147787942134089, + "loss": 1.4907, + "step": 2910 + }, + { + "epoch": 1.2847069193477303, + "grad_norm": 0.2432071100976, + "learning_rate": 0.00014124442589240265, + "loss": 1.7181, + "step": 2915 + }, + { + "epoch": 1.2869105332745703, + "grad_norm": 0.28813004538064096, + "learning_rate": 0.00014101070118774936, + "loss": 1.7243, + "step": 2920 + }, + { + "epoch": 1.2891141472014103, + "grad_norm": 0.29339284132217475, + "learning_rate": 0.00014077670684408485, + "loss": 1.7679, + "step": 2925 + }, + { + "epoch": 1.2913177611282503, + "grad_norm": 0.2588877094640161, + "learning_rate": 0.00014054244439988566, + "loss": 1.72, + "step": 2930 + }, + { + "epoch": 1.2935213750550902, + "grad_norm": 0.3011003249431555, + "learning_rate": 0.0001403079153953911, + "loss": 1.879, + "step": 2935 + }, + { + "epoch": 1.2957249889819304, + "grad_norm": 0.29129639306491034, + "learning_rate": 0.00014007312137259307, + "loss": 1.7124, + "step": 2940 + }, + { + "epoch": 1.2979286029087704, + "grad_norm": 0.30248111079542994, + "learning_rate": 0.00013983806387522592, + "loss": 1.5669, + "step": 2945 + }, + { + "epoch": 1.3001322168356104, + "grad_norm": 0.39466981086485026, + "learning_rate": 0.00013960274444875628, + "loss": 1.7579, + "step": 2950 + }, + { + "epoch": 1.3023358307624504, + "grad_norm": 0.30172725615707247, + "learning_rate": 0.000139367164640373, + "loss": 1.8218, + "step": 2955 + }, + { + "epoch": 1.3045394446892904, + "grad_norm": 0.286453848416204, + "learning_rate": 0.00013913132599897683, + "loss": 1.9354, + "step": 2960 + }, + { + "epoch": 1.3067430586161304, + "grad_norm": 0.3902213358624171, + "learning_rate": 0.00013889523007517028, + "loss": 1.7235, + "step": 2965 + }, + { + "epoch": 1.3089466725429704, + "grad_norm": 0.2589601846721671, + "learning_rate": 0.00013865887842124755, + "loss": 1.5088, + "step": 2970 + }, + { + "epoch": 1.3111502864698106, + "grad_norm": 0.2838598126846581, + "learning_rate": 0.0001384222725911842, + "loss": 1.6694, + "step": 2975 + }, + { + "epoch": 1.3133539003966506, + "grad_norm": 0.2985112357585295, + "learning_rate": 0.00013818541414062683, + "loss": 1.8195, + "step": 2980 + }, + { + "epoch": 1.3155575143234906, + "grad_norm": 0.344496826298518, + "learning_rate": 0.0001379483046268832, + "loss": 1.7105, + "step": 2985 + }, + { + "epoch": 1.3177611282503305, + "grad_norm": 0.2832321651335367, + "learning_rate": 0.00013771094560891155, + "loss": 1.6398, + "step": 2990 + }, + { + "epoch": 1.3199647421771705, + "grad_norm": 0.3166967200246273, + "learning_rate": 0.00013747333864731073, + "loss": 1.8804, + "step": 2995 + }, + { + "epoch": 1.3221683561040105, + "grad_norm": 0.29889345173462817, + "learning_rate": 0.00013723548530430974, + "loss": 1.5327, + "step": 3000 + }, + { + "epoch": 1.3243719700308505, + "grad_norm": 0.2989561597186618, + "learning_rate": 0.00013699738714375748, + "loss": 1.8312, + "step": 3005 + }, + { + "epoch": 1.3265755839576907, + "grad_norm": 0.27767734489817053, + "learning_rate": 0.00013675904573111247, + "loss": 1.7797, + "step": 3010 + }, + { + "epoch": 1.3287791978845307, + "grad_norm": 0.3263812983049982, + "learning_rate": 0.00013652046263343262, + "loss": 1.7061, + "step": 3015 + }, + { + "epoch": 1.3309828118113707, + "grad_norm": 0.24589188706441673, + "learning_rate": 0.00013628163941936485, + "loss": 1.7644, + "step": 3020 + }, + { + "epoch": 1.3331864257382107, + "grad_norm": 0.277800302556096, + "learning_rate": 0.00013604257765913484, + "loss": 1.8151, + "step": 3025 + }, + { + "epoch": 1.3353900396650507, + "grad_norm": 0.23353363973323982, + "learning_rate": 0.0001358032789245366, + "loss": 1.7236, + "step": 3030 + }, + { + "epoch": 1.3375936535918906, + "grad_norm": 0.34178889147907426, + "learning_rate": 0.00013556374478892232, + "loss": 1.7669, + "step": 3035 + }, + { + "epoch": 1.3397972675187306, + "grad_norm": 0.28321134962453065, + "learning_rate": 0.00013532397682719185, + "loss": 1.6165, + "step": 3040 + }, + { + "epoch": 1.3420008814455708, + "grad_norm": 0.3160168756862356, + "learning_rate": 0.00013508397661578242, + "loss": 1.8131, + "step": 3045 + }, + { + "epoch": 1.3442044953724108, + "grad_norm": 0.2800833181613442, + "learning_rate": 0.0001348437457326582, + "loss": 1.9182, + "step": 3050 + }, + { + "epoch": 1.3464081092992508, + "grad_norm": 0.2679976561310916, + "learning_rate": 0.00013460328575730019, + "loss": 1.8312, + "step": 3055 + }, + { + "epoch": 1.3486117232260908, + "grad_norm": 0.2899158112172882, + "learning_rate": 0.00013436259827069534, + "loss": 1.8217, + "step": 3060 + }, + { + "epoch": 1.3508153371529308, + "grad_norm": 0.26525449374755994, + "learning_rate": 0.00013412168485532676, + "loss": 1.7636, + "step": 3065 + }, + { + "epoch": 1.3530189510797708, + "grad_norm": 0.2643909185128004, + "learning_rate": 0.00013388054709516272, + "loss": 1.6257, + "step": 3070 + }, + { + "epoch": 1.3552225650066108, + "grad_norm": 0.3217683133395989, + "learning_rate": 0.0001336391865756468, + "loss": 1.8385, + "step": 3075 + }, + { + "epoch": 1.357426178933451, + "grad_norm": 0.30256367251501726, + "learning_rate": 0.00013339760488368695, + "loss": 1.5994, + "step": 3080 + }, + { + "epoch": 1.359629792860291, + "grad_norm": 0.33945781722722157, + "learning_rate": 0.00013315580360764542, + "loss": 1.6502, + "step": 3085 + }, + { + "epoch": 1.361833406787131, + "grad_norm": 0.2343801676740979, + "learning_rate": 0.00013291378433732818, + "loss": 1.7302, + "step": 3090 + }, + { + "epoch": 1.364037020713971, + "grad_norm": 0.3789024872984378, + "learning_rate": 0.00013267154866397447, + "loss": 1.9092, + "step": 3095 + }, + { + "epoch": 1.366240634640811, + "grad_norm": 0.27745717625968813, + "learning_rate": 0.00013242909818024628, + "loss": 1.6587, + "step": 3100 + }, + { + "epoch": 1.368444248567651, + "grad_norm": 0.26534498515018917, + "learning_rate": 0.0001321864344802181, + "loss": 1.7184, + "step": 3105 + }, + { + "epoch": 1.3706478624944909, + "grad_norm": 0.25912670390615655, + "learning_rate": 0.00013194355915936611, + "loss": 1.7708, + "step": 3110 + }, + { + "epoch": 1.372851476421331, + "grad_norm": 0.23803080715278022, + "learning_rate": 0.000131700473814558, + "loss": 1.7224, + "step": 3115 + }, + { + "epoch": 1.375055090348171, + "grad_norm": 0.30746363012665606, + "learning_rate": 0.00013145718004404223, + "loss": 1.754, + "step": 3120 + }, + { + "epoch": 1.377258704275011, + "grad_norm": 0.2672636477164184, + "learning_rate": 0.00013121367944743777, + "loss": 1.6989, + "step": 3125 + }, + { + "epoch": 1.379462318201851, + "grad_norm": 0.28413083026015534, + "learning_rate": 0.0001309699736257232, + "loss": 1.6421, + "step": 3130 + }, + { + "epoch": 1.381665932128691, + "grad_norm": 0.3193377255035563, + "learning_rate": 0.00013072606418122667, + "loss": 1.8467, + "step": 3135 + }, + { + "epoch": 1.383869546055531, + "grad_norm": 0.2833583691387121, + "learning_rate": 0.00013048195271761498, + "loss": 1.6013, + "step": 3140 + }, + { + "epoch": 1.386073159982371, + "grad_norm": 0.25616581325290944, + "learning_rate": 0.00013023764083988323, + "loss": 1.7542, + "step": 3145 + }, + { + "epoch": 1.3882767739092112, + "grad_norm": 0.34179796686003233, + "learning_rate": 0.0001299931301543442, + "loss": 1.6674, + "step": 3150 + }, + { + "epoch": 1.390480387836051, + "grad_norm": 0.2612422912889042, + "learning_rate": 0.00012974842226861773, + "loss": 1.3979, + "step": 3155 + }, + { + "epoch": 1.3926840017628912, + "grad_norm": 0.30556335921079647, + "learning_rate": 0.0001295035187916204, + "loss": 1.7775, + "step": 3160 + }, + { + "epoch": 1.3948876156897312, + "grad_norm": 0.34927377227523054, + "learning_rate": 0.00012925842133355454, + "loss": 1.7384, + "step": 3165 + }, + { + "epoch": 1.3970912296165712, + "grad_norm": 0.35826503954646516, + "learning_rate": 0.00012901313150589806, + "loss": 1.8279, + "step": 3170 + }, + { + "epoch": 1.3992948435434112, + "grad_norm": 0.3558470664702752, + "learning_rate": 0.0001287676509213936, + "loss": 1.6467, + "step": 3175 + }, + { + "epoch": 1.4014984574702511, + "grad_norm": 0.2897608843662268, + "learning_rate": 0.00012852198119403798, + "loss": 1.6509, + "step": 3180 + }, + { + "epoch": 1.4037020713970914, + "grad_norm": 0.32428607006990234, + "learning_rate": 0.00012827612393907163, + "loss": 1.7118, + "step": 3185 + }, + { + "epoch": 1.4059056853239311, + "grad_norm": 0.29054441869310144, + "learning_rate": 0.0001280300807729679, + "loss": 1.6328, + "step": 3190 + }, + { + "epoch": 1.4081092992507713, + "grad_norm": 0.2672079347199706, + "learning_rate": 0.0001277838533134226, + "loss": 1.7875, + "step": 3195 + }, + { + "epoch": 1.4103129131776113, + "grad_norm": 0.3317588448314954, + "learning_rate": 0.00012753744317934307, + "loss": 1.9754, + "step": 3200 + }, + { + "epoch": 1.4125165271044513, + "grad_norm": 0.30976057441678767, + "learning_rate": 0.0001272908519908379, + "loss": 1.7292, + "step": 3205 + }, + { + "epoch": 1.4147201410312913, + "grad_norm": 0.2588899590749228, + "learning_rate": 0.00012704408136920585, + "loss": 1.661, + "step": 3210 + }, + { + "epoch": 1.4169237549581313, + "grad_norm": 0.34042631761749226, + "learning_rate": 0.0001267971329369256, + "loss": 1.7688, + "step": 3215 + }, + { + "epoch": 1.4191273688849715, + "grad_norm": 0.2613604846991468, + "learning_rate": 0.00012655000831764495, + "loss": 1.7979, + "step": 3220 + }, + { + "epoch": 1.4213309828118112, + "grad_norm": 0.30387612038339795, + "learning_rate": 0.00012630270913616985, + "loss": 1.6008, + "step": 3225 + }, + { + "epoch": 1.4235345967386515, + "grad_norm": 0.3245461529092582, + "learning_rate": 0.00012605523701845431, + "loss": 1.7394, + "step": 3230 + }, + { + "epoch": 1.4257382106654914, + "grad_norm": 0.23964868596701627, + "learning_rate": 0.00012580759359158905, + "loss": 1.5526, + "step": 3235 + }, + { + "epoch": 1.4279418245923314, + "grad_norm": 0.281150127445612, + "learning_rate": 0.00012555978048379133, + "loss": 1.6581, + "step": 3240 + }, + { + "epoch": 1.4301454385191714, + "grad_norm": 0.31208534783523834, + "learning_rate": 0.00012531179932439397, + "loss": 1.6698, + "step": 3245 + }, + { + "epoch": 1.4323490524460114, + "grad_norm": 0.3197645332854783, + "learning_rate": 0.00012506365174383467, + "loss": 1.8493, + "step": 3250 + }, + { + "epoch": 1.4345526663728516, + "grad_norm": 0.29747281096057276, + "learning_rate": 0.0001248153393736454, + "loss": 1.923, + "step": 3255 + }, + { + "epoch": 1.4367562802996914, + "grad_norm": 0.2706957926203667, + "learning_rate": 0.00012456686384644148, + "loss": 1.7219, + "step": 3260 + }, + { + "epoch": 1.4389598942265316, + "grad_norm": 0.3015008988665459, + "learning_rate": 0.00012431822679591112, + "loss": 1.6334, + "step": 3265 + }, + { + "epoch": 1.4411635081533716, + "grad_norm": 0.28824055515626146, + "learning_rate": 0.00012406942985680437, + "loss": 1.7096, + "step": 3270 + }, + { + "epoch": 1.4433671220802116, + "grad_norm": 0.28792375010811966, + "learning_rate": 0.00012382047466492262, + "loss": 1.6993, + "step": 3275 + }, + { + "epoch": 1.4455707360070515, + "grad_norm": 0.2778794221727809, + "learning_rate": 0.0001235713628571077, + "loss": 1.699, + "step": 3280 + }, + { + "epoch": 1.4477743499338915, + "grad_norm": 0.3173174516544841, + "learning_rate": 0.00012332209607123117, + "loss": 1.6214, + "step": 3285 + }, + { + "epoch": 1.4499779638607315, + "grad_norm": 0.30655650928697775, + "learning_rate": 0.0001230726759461836, + "loss": 1.7923, + "step": 3290 + }, + { + "epoch": 1.4521815777875715, + "grad_norm": 0.2517175305379352, + "learning_rate": 0.00012282310412186365, + "loss": 1.8434, + "step": 3295 + }, + { + "epoch": 1.4543851917144117, + "grad_norm": 0.24920356879351888, + "learning_rate": 0.0001225733822391675, + "loss": 1.6146, + "step": 3300 + }, + { + "epoch": 1.4565888056412517, + "grad_norm": 0.2950517285567546, + "learning_rate": 0.00012232351193997774, + "loss": 1.6819, + "step": 3305 + }, + { + "epoch": 1.4587924195680917, + "grad_norm": 0.3058013022960617, + "learning_rate": 0.000122073494867153, + "loss": 1.579, + "step": 3310 + }, + { + "epoch": 1.4609960334949317, + "grad_norm": 0.46566643268469327, + "learning_rate": 0.00012182333266451684, + "loss": 1.6713, + "step": 3315 + }, + { + "epoch": 1.4631996474217717, + "grad_norm": 0.3269081430168815, + "learning_rate": 0.00012157302697684695, + "loss": 1.6608, + "step": 3320 + }, + { + "epoch": 1.4654032613486117, + "grad_norm": 0.2472677464376836, + "learning_rate": 0.00012132257944986454, + "loss": 1.6504, + "step": 3325 + }, + { + "epoch": 1.4676068752754516, + "grad_norm": 0.3189526409165166, + "learning_rate": 0.00012107199173022327, + "loss": 1.6308, + "step": 3330 + }, + { + "epoch": 1.4698104892022918, + "grad_norm": 0.2439726953786154, + "learning_rate": 0.00012082126546549864, + "loss": 1.6694, + "step": 3335 + }, + { + "epoch": 1.4720141031291318, + "grad_norm": 0.28416189235796196, + "learning_rate": 0.000120570402304177, + "loss": 1.9048, + "step": 3340 + }, + { + "epoch": 1.4742177170559718, + "grad_norm": 0.25976522560441834, + "learning_rate": 0.00012031940389564478, + "loss": 1.7083, + "step": 3345 + }, + { + "epoch": 1.4764213309828118, + "grad_norm": 0.2874680056323443, + "learning_rate": 0.00012006827189017773, + "loss": 1.7914, + "step": 3350 + }, + { + "epoch": 1.4786249449096518, + "grad_norm": 0.35212840385267163, + "learning_rate": 0.00011981700793892982, + "loss": 1.8617, + "step": 3355 + }, + { + "epoch": 1.4808285588364918, + "grad_norm": 0.3258646795205973, + "learning_rate": 0.00011956561369392274, + "loss": 1.8569, + "step": 3360 + }, + { + "epoch": 1.4830321727633318, + "grad_norm": 0.25990120954046436, + "learning_rate": 0.0001193140908080346, + "loss": 1.7778, + "step": 3365 + }, + { + "epoch": 1.485235786690172, + "grad_norm": 0.2916499249746569, + "learning_rate": 0.00011906244093498955, + "loss": 1.7442, + "step": 3370 + }, + { + "epoch": 1.487439400617012, + "grad_norm": 0.38118475856684764, + "learning_rate": 0.00011881066572934644, + "loss": 1.6281, + "step": 3375 + }, + { + "epoch": 1.489643014543852, + "grad_norm": 0.33602332943649665, + "learning_rate": 0.00011855876684648837, + "loss": 1.6655, + "step": 3380 + }, + { + "epoch": 1.491846628470692, + "grad_norm": 0.3091891353046593, + "learning_rate": 0.00011830674594261145, + "loss": 1.818, + "step": 3385 + }, + { + "epoch": 1.494050242397532, + "grad_norm": 0.2675107541956203, + "learning_rate": 0.0001180546046747141, + "loss": 1.9917, + "step": 3390 + }, + { + "epoch": 1.496253856324372, + "grad_norm": 0.24171145502227592, + "learning_rate": 0.00011780234470058613, + "loss": 1.747, + "step": 3395 + }, + { + "epoch": 1.498457470251212, + "grad_norm": 0.31043683691075824, + "learning_rate": 0.0001175499676787978, + "loss": 1.7863, + "step": 3400 + }, + { + "epoch": 1.500661084178052, + "grad_norm": 0.25938236019167105, + "learning_rate": 0.000117297475268689, + "loss": 1.6216, + "step": 3405 + }, + { + "epoch": 1.5028646981048919, + "grad_norm": 0.2889407665894309, + "learning_rate": 0.00011704486913035819, + "loss": 1.7023, + "step": 3410 + }, + { + "epoch": 1.505068312031732, + "grad_norm": 0.2595635779433101, + "learning_rate": 0.00011679215092465163, + "loss": 1.6651, + "step": 3415 + }, + { + "epoch": 1.507271925958572, + "grad_norm": 0.3147360748379521, + "learning_rate": 0.00011653932231315245, + "loss": 1.6855, + "step": 3420 + }, + { + "epoch": 1.509475539885412, + "grad_norm": 0.2456935617451673, + "learning_rate": 0.00011628638495816955, + "loss": 1.6982, + "step": 3425 + }, + { + "epoch": 1.511679153812252, + "grad_norm": 0.2644287001406921, + "learning_rate": 0.00011603334052272696, + "loss": 1.7438, + "step": 3430 + }, + { + "epoch": 1.513882767739092, + "grad_norm": 0.3098487244790836, + "learning_rate": 0.0001157801906705526, + "loss": 1.7459, + "step": 3435 + }, + { + "epoch": 1.5160863816659322, + "grad_norm": 0.2635907160016382, + "learning_rate": 0.00011552693706606758, + "loss": 1.5969, + "step": 3440 + }, + { + "epoch": 1.518289995592772, + "grad_norm": 0.293285003547433, + "learning_rate": 0.00011527358137437516, + "loss": 1.7899, + "step": 3445 + }, + { + "epoch": 1.5204936095196122, + "grad_norm": 0.3154800148422279, + "learning_rate": 0.00011502012526124978, + "loss": 1.7859, + "step": 3450 + }, + { + "epoch": 1.5226972234464522, + "grad_norm": 0.3029474817652635, + "learning_rate": 0.00011476657039312613, + "loss": 1.8433, + "step": 3455 + }, + { + "epoch": 1.5249008373732922, + "grad_norm": 0.3394061586641444, + "learning_rate": 0.00011451291843708824, + "loss": 1.8191, + "step": 3460 + }, + { + "epoch": 1.5271044513001322, + "grad_norm": 0.28588437415991474, + "learning_rate": 0.00011425917106085844, + "loss": 1.6528, + "step": 3465 + }, + { + "epoch": 1.5293080652269722, + "grad_norm": 0.2624098766559022, + "learning_rate": 0.00011400532993278643, + "loss": 1.8208, + "step": 3470 + }, + { + "epoch": 1.5315116791538124, + "grad_norm": 0.29050819773398057, + "learning_rate": 0.00011375139672183834, + "loss": 1.763, + "step": 3475 + }, + { + "epoch": 1.5337152930806521, + "grad_norm": 0.28595831334373306, + "learning_rate": 0.00011349737309758572, + "loss": 1.6389, + "step": 3480 + }, + { + "epoch": 1.5359189070074923, + "grad_norm": 0.310106838673627, + "learning_rate": 0.00011324326073019458, + "loss": 1.7008, + "step": 3485 + }, + { + "epoch": 1.5381225209343323, + "grad_norm": 0.3425497639069633, + "learning_rate": 0.0001129890612904144, + "loss": 1.7975, + "step": 3490 + }, + { + "epoch": 1.5403261348611723, + "grad_norm": 0.3049155101860015, + "learning_rate": 0.0001127347764495671, + "loss": 1.6302, + "step": 3495 + }, + { + "epoch": 1.5425297487880123, + "grad_norm": 0.3083723366063809, + "learning_rate": 0.00011248040787953622, + "loss": 1.8779, + "step": 3500 + }, + { + "epoch": 1.5447333627148523, + "grad_norm": 0.2714184317474351, + "learning_rate": 0.00011222595725275562, + "loss": 1.6655, + "step": 3505 + }, + { + "epoch": 1.5469369766416925, + "grad_norm": 0.3020528286222207, + "learning_rate": 0.00011197142624219887, + "loss": 1.5374, + "step": 3510 + }, + { + "epoch": 1.5491405905685323, + "grad_norm": 0.270325291856936, + "learning_rate": 0.00011171681652136793, + "loss": 1.7442, + "step": 3515 + }, + { + "epoch": 1.5513442044953725, + "grad_norm": 0.2718748140775875, + "learning_rate": 0.00011146212976428232, + "loss": 1.793, + "step": 3520 + }, + { + "epoch": 1.5535478184222125, + "grad_norm": 0.2684213723870114, + "learning_rate": 0.00011120736764546799, + "loss": 1.5847, + "step": 3525 + }, + { + "epoch": 1.5557514323490524, + "grad_norm": 0.3293563453835575, + "learning_rate": 0.00011095253183994645, + "loss": 1.5808, + "step": 3530 + }, + { + "epoch": 1.5579550462758924, + "grad_norm": 0.32122911366332685, + "learning_rate": 0.0001106976240232237, + "loss": 1.7343, + "step": 3535 + }, + { + "epoch": 1.5601586602027324, + "grad_norm": 0.31939212525307864, + "learning_rate": 0.0001104426458712791, + "loss": 1.7123, + "step": 3540 + }, + { + "epoch": 1.5623622741295726, + "grad_norm": 0.2676344457188956, + "learning_rate": 0.00011018759906055463, + "loss": 1.4029, + "step": 3545 + }, + { + "epoch": 1.5645658880564124, + "grad_norm": 0.33141673784681086, + "learning_rate": 0.00010993248526794347, + "loss": 1.8105, + "step": 3550 + }, + { + "epoch": 1.5667695019832526, + "grad_norm": 0.2715916392293134, + "learning_rate": 0.00010967730617077938, + "loss": 1.73, + "step": 3555 + }, + { + "epoch": 1.5689731159100926, + "grad_norm": 0.2615038650065928, + "learning_rate": 0.00010942206344682541, + "loss": 1.7547, + "step": 3560 + }, + { + "epoch": 1.5711767298369326, + "grad_norm": 0.2662856667093564, + "learning_rate": 0.00010916675877426296, + "loss": 1.6934, + "step": 3565 + }, + { + "epoch": 1.5733803437637726, + "grad_norm": 0.20249417258651908, + "learning_rate": 0.00010891139383168072, + "loss": 1.7876, + "step": 3570 + }, + { + "epoch": 1.5755839576906125, + "grad_norm": 0.30704028921333987, + "learning_rate": 0.00010865597029806365, + "loss": 1.7228, + "step": 3575 + }, + { + "epoch": 1.5777875716174528, + "grad_norm": 0.32055060377455574, + "learning_rate": 0.00010840048985278195, + "loss": 1.7169, + "step": 3580 + }, + { + "epoch": 1.5799911855442925, + "grad_norm": 0.31196324685842286, + "learning_rate": 0.00010814495417557997, + "loss": 1.875, + "step": 3585 + }, + { + "epoch": 1.5821947994711327, + "grad_norm": 0.3005168077317045, + "learning_rate": 0.00010788936494656523, + "loss": 1.8862, + "step": 3590 + }, + { + "epoch": 1.5843984133979727, + "grad_norm": 0.2856041438770126, + "learning_rate": 0.00010763372384619738, + "loss": 1.6419, + "step": 3595 + }, + { + "epoch": 1.5866020273248127, + "grad_norm": 0.27459499931453724, + "learning_rate": 0.00010737803255527702, + "loss": 1.7495, + "step": 3600 + }, + { + "epoch": 1.5888056412516527, + "grad_norm": 0.23142425230470423, + "learning_rate": 0.00010712229275493489, + "loss": 1.7615, + "step": 3605 + }, + { + "epoch": 1.5910092551784927, + "grad_norm": 0.26704359594443944, + "learning_rate": 0.00010686650612662048, + "loss": 1.7043, + "step": 3610 + }, + { + "epoch": 1.5932128691053329, + "grad_norm": 0.2644858878666521, + "learning_rate": 0.00010661067435209135, + "loss": 1.8665, + "step": 3615 + }, + { + "epoch": 1.5954164830321726, + "grad_norm": 0.306618968505366, + "learning_rate": 0.00010635479911340176, + "loss": 1.8191, + "step": 3620 + }, + { + "epoch": 1.5976200969590129, + "grad_norm": 0.32629807608957256, + "learning_rate": 0.00010609888209289183, + "loss": 1.781, + "step": 3625 + }, + { + "epoch": 1.5998237108858528, + "grad_norm": 0.2701943161629176, + "learning_rate": 0.00010584292497317633, + "loss": 1.6162, + "step": 3630 + }, + { + "epoch": 1.6020273248126928, + "grad_norm": 0.40336230029221404, + "learning_rate": 0.00010558692943713373, + "loss": 1.722, + "step": 3635 + }, + { + "epoch": 1.6042309387395328, + "grad_norm": 0.2767732465609723, + "learning_rate": 0.000105330897167895, + "loss": 1.6427, + "step": 3640 + }, + { + "epoch": 1.6064345526663728, + "grad_norm": 0.28371049131821663, + "learning_rate": 0.00010507482984883268, + "loss": 1.6872, + "step": 3645 + }, + { + "epoch": 1.608638166593213, + "grad_norm": 0.2975217481512648, + "learning_rate": 0.00010481872916354978, + "loss": 1.6807, + "step": 3650 + }, + { + "epoch": 1.6108417805200528, + "grad_norm": 0.30522288055794994, + "learning_rate": 0.00010456259679586862, + "loss": 1.6253, + "step": 3655 + }, + { + "epoch": 1.613045394446893, + "grad_norm": 0.2893887034344458, + "learning_rate": 0.00010430643442981986, + "loss": 1.6465, + "step": 3660 + }, + { + "epoch": 1.615249008373733, + "grad_norm": 0.28628624467329145, + "learning_rate": 0.0001040502437496315, + "loss": 1.6428, + "step": 3665 + }, + { + "epoch": 1.617452622300573, + "grad_norm": 0.31683631498188874, + "learning_rate": 0.00010379402643971746, + "loss": 1.7033, + "step": 3670 + }, + { + "epoch": 1.619656236227413, + "grad_norm": 0.2627512390977551, + "learning_rate": 0.00010353778418466697, + "loss": 1.8805, + "step": 3675 + }, + { + "epoch": 1.621859850154253, + "grad_norm": 0.2976244892863047, + "learning_rate": 0.00010328151866923316, + "loss": 1.8013, + "step": 3680 + }, + { + "epoch": 1.6240634640810931, + "grad_norm": 0.31860629887164105, + "learning_rate": 0.00010302523157832216, + "loss": 1.648, + "step": 3685 + }, + { + "epoch": 1.626267078007933, + "grad_norm": 0.331987729083311, + "learning_rate": 0.00010276892459698182, + "loss": 1.6325, + "step": 3690 + }, + { + "epoch": 1.6284706919347731, + "grad_norm": 0.2818827039809831, + "learning_rate": 0.00010251259941039098, + "loss": 1.7213, + "step": 3695 + }, + { + "epoch": 1.6306743058616129, + "grad_norm": 0.2511192996283762, + "learning_rate": 0.00010225625770384797, + "loss": 1.5629, + "step": 3700 + }, + { + "epoch": 1.632877919788453, + "grad_norm": 0.29741607234777423, + "learning_rate": 0.00010199990116275988, + "loss": 1.7834, + "step": 3705 + }, + { + "epoch": 1.635081533715293, + "grad_norm": 0.30036469948366823, + "learning_rate": 0.00010174353147263125, + "loss": 1.4849, + "step": 3710 + }, + { + "epoch": 1.637285147642133, + "grad_norm": 0.30522753547462433, + "learning_rate": 0.00010148715031905312, + "loss": 1.8071, + "step": 3715 + }, + { + "epoch": 1.6394887615689733, + "grad_norm": 0.2769872314517015, + "learning_rate": 0.00010123075938769187, + "loss": 1.685, + "step": 3720 + }, + { + "epoch": 1.641692375495813, + "grad_norm": 0.2709018320510704, + "learning_rate": 0.00010097436036427816, + "loss": 1.7853, + "step": 3725 + }, + { + "epoch": 1.6438959894226532, + "grad_norm": 0.3446719925528754, + "learning_rate": 0.00010071795493459591, + "loss": 1.7783, + "step": 3730 + }, + { + "epoch": 1.646099603349493, + "grad_norm": 0.3208180036756325, + "learning_rate": 0.00010046154478447114, + "loss": 1.8982, + "step": 3735 + }, + { + "epoch": 1.6483032172763332, + "grad_norm": 0.289239117609306, + "learning_rate": 0.00010020513159976084, + "loss": 1.7313, + "step": 3740 + }, + { + "epoch": 1.6505068312031732, + "grad_norm": 0.30122205784761724, + "learning_rate": 9.994871706634204e-05, + "loss": 1.6831, + "step": 3745 + }, + { + "epoch": 1.6527104451300132, + "grad_norm": 0.2835725442151606, + "learning_rate": 9.96923028701006e-05, + "loss": 1.6129, + "step": 3750 + }, + { + "epoch": 1.6549140590568534, + "grad_norm": 0.4849263642027594, + "learning_rate": 9.943589069692014e-05, + "loss": 1.783, + "step": 3755 + }, + { + "epoch": 1.6571176729836932, + "grad_norm": 0.34060901148734135, + "learning_rate": 9.917948223267105e-05, + "loss": 1.642, + "step": 3760 + }, + { + "epoch": 1.6593212869105334, + "grad_norm": 0.3001957536668968, + "learning_rate": 9.892307916319919e-05, + "loss": 1.6005, + "step": 3765 + }, + { + "epoch": 1.6615249008373731, + "grad_norm": 0.3141764748091563, + "learning_rate": 9.866668317431514e-05, + "loss": 1.8968, + "step": 3770 + }, + { + "epoch": 1.6637285147642134, + "grad_norm": 0.3066452663914771, + "learning_rate": 9.841029595178282e-05, + "loss": 1.6288, + "step": 3775 + }, + { + "epoch": 1.6659321286910533, + "grad_norm": 0.2737312482046674, + "learning_rate": 9.815391918130848e-05, + "loss": 1.6151, + "step": 3780 + }, + { + "epoch": 1.6681357426178933, + "grad_norm": 0.337368580313636, + "learning_rate": 9.789755454852971e-05, + "loss": 1.6298, + "step": 3785 + }, + { + "epoch": 1.6703393565447333, + "grad_norm": 0.22864544249672591, + "learning_rate": 9.764120373900436e-05, + "loss": 1.7166, + "step": 3790 + }, + { + "epoch": 1.6725429704715733, + "grad_norm": 0.3199596986088049, + "learning_rate": 9.738486843819919e-05, + "loss": 1.4291, + "step": 3795 + }, + { + "epoch": 1.6747465843984135, + "grad_norm": 0.484113228627448, + "learning_rate": 9.712855033147921e-05, + "loss": 1.8267, + "step": 3800 + }, + { + "epoch": 1.6769501983252533, + "grad_norm": 0.29066627384947913, + "learning_rate": 9.68722511040962e-05, + "loss": 1.7618, + "step": 3805 + }, + { + "epoch": 1.6791538122520935, + "grad_norm": 0.2756578416564992, + "learning_rate": 9.661597244117802e-05, + "loss": 1.7626, + "step": 3810 + }, + { + "epoch": 1.6813574261789335, + "grad_norm": 0.26262162877838774, + "learning_rate": 9.635971602771716e-05, + "loss": 1.6437, + "step": 3815 + }, + { + "epoch": 1.6835610401057735, + "grad_norm": 0.3485930873206027, + "learning_rate": 9.61034835485598e-05, + "loss": 1.9054, + "step": 3820 + }, + { + "epoch": 1.6857646540326134, + "grad_norm": 0.347720471228885, + "learning_rate": 9.584727668839487e-05, + "loss": 1.6653, + "step": 3825 + }, + { + "epoch": 1.6879682679594534, + "grad_norm": 0.259896835339376, + "learning_rate": 9.559109713174282e-05, + "loss": 1.8298, + "step": 3830 + }, + { + "epoch": 1.6901718818862936, + "grad_norm": 0.27688333027135853, + "learning_rate": 9.533494656294458e-05, + "loss": 1.5074, + "step": 3835 + }, + { + "epoch": 1.6923754958131334, + "grad_norm": 0.3186540043438054, + "learning_rate": 9.507882666615049e-05, + "loss": 1.666, + "step": 3840 + }, + { + "epoch": 1.6945791097399736, + "grad_norm": 0.34137430082378867, + "learning_rate": 9.482273912530913e-05, + "loss": 1.7862, + "step": 3845 + }, + { + "epoch": 1.6967827236668136, + "grad_norm": 0.31312228735788816, + "learning_rate": 9.456668562415657e-05, + "loss": 1.6969, + "step": 3850 + }, + { + "epoch": 1.6989863375936536, + "grad_norm": 0.3211377859121694, + "learning_rate": 9.431066784620486e-05, + "loss": 1.8653, + "step": 3855 + }, + { + "epoch": 1.7011899515204936, + "grad_norm": 0.3443229930133863, + "learning_rate": 9.405468747473127e-05, + "loss": 1.696, + "step": 3860 + }, + { + "epoch": 1.7033935654473336, + "grad_norm": 0.2900871769643087, + "learning_rate": 9.379874619276707e-05, + "loss": 1.7654, + "step": 3865 + }, + { + "epoch": 1.7055971793741738, + "grad_norm": 0.3109956258190758, + "learning_rate": 9.354284568308665e-05, + "loss": 1.6229, + "step": 3870 + }, + { + "epoch": 1.7078007933010135, + "grad_norm": 0.29096423007153205, + "learning_rate": 9.328698762819623e-05, + "loss": 1.7274, + "step": 3875 + }, + { + "epoch": 1.7100044072278537, + "grad_norm": 0.2674081631314747, + "learning_rate": 9.303117371032284e-05, + "loss": 1.7598, + "step": 3880 + }, + { + "epoch": 1.7122080211546937, + "grad_norm": 0.3370488411342742, + "learning_rate": 9.277540561140342e-05, + "loss": 1.7854, + "step": 3885 + }, + { + "epoch": 1.7144116350815337, + "grad_norm": 0.2236164025203487, + "learning_rate": 9.251968501307365e-05, + "loss": 1.735, + "step": 3890 + }, + { + "epoch": 1.7166152490083737, + "grad_norm": 0.31253584348251195, + "learning_rate": 9.226401359665686e-05, + "loss": 1.5912, + "step": 3895 + }, + { + "epoch": 1.7188188629352137, + "grad_norm": 0.28523913611604496, + "learning_rate": 9.2008393043153e-05, + "loss": 1.7252, + "step": 3900 + }, + { + "epoch": 1.721022476862054, + "grad_norm": 0.3641271132364838, + "learning_rate": 9.17528250332277e-05, + "loss": 1.7001, + "step": 3905 + }, + { + "epoch": 1.7232260907888937, + "grad_norm": 0.34337087912187736, + "learning_rate": 9.149731124720104e-05, + "loss": 1.941, + "step": 3910 + }, + { + "epoch": 1.7254297047157339, + "grad_norm": 0.3388940885148476, + "learning_rate": 9.124185336503656e-05, + "loss": 1.751, + "step": 3915 + }, + { + "epoch": 1.7276333186425739, + "grad_norm": 0.3980423482061792, + "learning_rate": 9.098645306633029e-05, + "loss": 1.7571, + "step": 3920 + }, + { + "epoch": 1.7298369325694138, + "grad_norm": 0.28139197000268995, + "learning_rate": 9.073111203029972e-05, + "loss": 1.5225, + "step": 3925 + }, + { + "epoch": 1.7320405464962538, + "grad_norm": 0.28174263964005763, + "learning_rate": 9.04758319357726e-05, + "loss": 1.6746, + "step": 3930 + }, + { + "epoch": 1.7342441604230938, + "grad_norm": 0.2461518030614674, + "learning_rate": 9.0220614461176e-05, + "loss": 1.5447, + "step": 3935 + }, + { + "epoch": 1.736447774349934, + "grad_norm": 0.3297581828947644, + "learning_rate": 8.99654612845253e-05, + "loss": 1.5802, + "step": 3940 + }, + { + "epoch": 1.7386513882767738, + "grad_norm": 0.3434764691584567, + "learning_rate": 8.971037408341319e-05, + "loss": 1.6836, + "step": 3945 + }, + { + "epoch": 1.740855002203614, + "grad_norm": 0.2967197456557559, + "learning_rate": 8.94553545349985e-05, + "loss": 1.6141, + "step": 3950 + }, + { + "epoch": 1.743058616130454, + "grad_norm": 0.3235657879123794, + "learning_rate": 8.92004043159953e-05, + "loss": 1.8539, + "step": 3955 + }, + { + "epoch": 1.745262230057294, + "grad_norm": 0.30230482414452203, + "learning_rate": 8.894552510266172e-05, + "loss": 1.6447, + "step": 3960 + }, + { + "epoch": 1.747465843984134, + "grad_norm": 0.25937552544500664, + "learning_rate": 8.869071857078926e-05, + "loss": 1.7132, + "step": 3965 + }, + { + "epoch": 1.749669457910974, + "grad_norm": 0.3504915271341817, + "learning_rate": 8.843598639569134e-05, + "loss": 1.616, + "step": 3970 + }, + { + "epoch": 1.7518730718378142, + "grad_norm": 0.3391539087204084, + "learning_rate": 8.818133025219258e-05, + "loss": 1.7726, + "step": 3975 + }, + { + "epoch": 1.754076685764654, + "grad_norm": 0.2945081027363211, + "learning_rate": 8.79267518146177e-05, + "loss": 1.6354, + "step": 3980 + }, + { + "epoch": 1.7562802996914941, + "grad_norm": 0.3122456807955038, + "learning_rate": 8.767225275678054e-05, + "loss": 1.7703, + "step": 3985 + }, + { + "epoch": 1.7584839136183341, + "grad_norm": 0.2911403528384371, + "learning_rate": 8.741783475197301e-05, + "loss": 1.6184, + "step": 3990 + }, + { + "epoch": 1.760687527545174, + "grad_norm": 0.28063561583988544, + "learning_rate": 8.716349947295406e-05, + "loss": 1.6723, + "step": 3995 + }, + { + "epoch": 1.762891141472014, + "grad_norm": 0.24899262558923158, + "learning_rate": 8.690924859193877e-05, + "loss": 1.5999, + "step": 4000 + }, + { + "epoch": 1.765094755398854, + "grad_norm": 0.21950111519024182, + "learning_rate": 8.665508378058737e-05, + "loss": 1.59, + "step": 4005 + }, + { + "epoch": 1.7672983693256943, + "grad_norm": 0.2812929263006604, + "learning_rate": 8.640100670999413e-05, + "loss": 1.7219, + "step": 4010 + }, + { + "epoch": 1.769501983252534, + "grad_norm": 0.26800092694380395, + "learning_rate": 8.614701905067648e-05, + "loss": 1.5878, + "step": 4015 + }, + { + "epoch": 1.7717055971793743, + "grad_norm": 0.3421307184861735, + "learning_rate": 8.589312247256385e-05, + "loss": 1.5289, + "step": 4020 + }, + { + "epoch": 1.7739092111062142, + "grad_norm": 0.2759648702531539, + "learning_rate": 8.563931864498709e-05, + "loss": 1.7232, + "step": 4025 + }, + { + "epoch": 1.7761128250330542, + "grad_norm": 0.3083755515457062, + "learning_rate": 8.538560923666697e-05, + "loss": 1.5333, + "step": 4030 + }, + { + "epoch": 1.7783164389598942, + "grad_norm": 0.32344968491148207, + "learning_rate": 8.51319959157036e-05, + "loss": 1.6531, + "step": 4035 + }, + { + "epoch": 1.7805200528867342, + "grad_norm": 0.2628449279485873, + "learning_rate": 8.487848034956527e-05, + "loss": 1.7176, + "step": 4040 + }, + { + "epoch": 1.7827236668135744, + "grad_norm": 0.30081231536696845, + "learning_rate": 8.462506420507764e-05, + "loss": 1.6087, + "step": 4045 + }, + { + "epoch": 1.7849272807404142, + "grad_norm": 0.24172711814982975, + "learning_rate": 8.437174914841261e-05, + "loss": 1.6365, + "step": 4050 + }, + { + "epoch": 1.7871308946672544, + "grad_norm": 0.3240513311621846, + "learning_rate": 8.411853684507744e-05, + "loss": 1.6818, + "step": 4055 + }, + { + "epoch": 1.7893345085940942, + "grad_norm": 0.26629546007810934, + "learning_rate": 8.38654289599038e-05, + "loss": 1.6165, + "step": 4060 + }, + { + "epoch": 1.7915381225209344, + "grad_norm": 0.2718697282057903, + "learning_rate": 8.36124271570369e-05, + "loss": 1.7767, + "step": 4065 + }, + { + "epoch": 1.7937417364477743, + "grad_norm": 0.2991997948062704, + "learning_rate": 8.335953309992442e-05, + "loss": 1.6968, + "step": 4070 + }, + { + "epoch": 1.7959453503746143, + "grad_norm": 0.33154914217795595, + "learning_rate": 8.310674845130563e-05, + "loss": 1.8523, + "step": 4075 + }, + { + "epoch": 1.7981489643014545, + "grad_norm": 0.29895299240181794, + "learning_rate": 8.285407487320042e-05, + "loss": 1.5945, + "step": 4080 + }, + { + "epoch": 1.8003525782282943, + "grad_norm": 0.2947016318335874, + "learning_rate": 8.260151402689848e-05, + "loss": 1.625, + "step": 4085 + }, + { + "epoch": 1.8025561921551345, + "grad_norm": 0.3039906812402064, + "learning_rate": 8.234906757294829e-05, + "loss": 1.8956, + "step": 4090 + }, + { + "epoch": 1.8047598060819743, + "grad_norm": 0.311603713861004, + "learning_rate": 8.209673717114618e-05, + "loss": 1.5808, + "step": 4095 + }, + { + "epoch": 1.8069634200088145, + "grad_norm": 0.3208961582381735, + "learning_rate": 8.184452448052547e-05, + "loss": 1.4928, + "step": 4100 + }, + { + "epoch": 1.8091670339356545, + "grad_norm": 0.27150517803601, + "learning_rate": 8.15924311593456e-05, + "loss": 1.7155, + "step": 4105 + }, + { + "epoch": 1.8113706478624945, + "grad_norm": 0.31637619824367874, + "learning_rate": 8.134045886508108e-05, + "loss": 1.4761, + "step": 4110 + }, + { + "epoch": 1.8135742617893347, + "grad_norm": 0.2536681416637017, + "learning_rate": 8.108860925441076e-05, + "loss": 1.7682, + "step": 4115 + }, + { + "epoch": 1.8157778757161744, + "grad_norm": 0.2683657294785762, + "learning_rate": 8.083688398320681e-05, + "loss": 1.6091, + "step": 4120 + }, + { + "epoch": 1.8179814896430146, + "grad_norm": 0.3004847316685122, + "learning_rate": 8.058528470652396e-05, + "loss": 1.7524, + "step": 4125 + }, + { + "epoch": 1.8201851035698544, + "grad_norm": 0.3033198553489031, + "learning_rate": 8.03338130785885e-05, + "loss": 1.6975, + "step": 4130 + }, + { + "epoch": 1.8223887174966946, + "grad_norm": 0.2932969746872377, + "learning_rate": 8.008247075278742e-05, + "loss": 1.6345, + "step": 4135 + }, + { + "epoch": 1.8245923314235346, + "grad_norm": 0.29419726566032006, + "learning_rate": 7.983125938165758e-05, + "loss": 2.0007, + "step": 4140 + }, + { + "epoch": 1.8267959453503746, + "grad_norm": 0.36994662608619383, + "learning_rate": 7.958018061687494e-05, + "loss": 1.8041, + "step": 4145 + }, + { + "epoch": 1.8289995592772146, + "grad_norm": 0.3171521170329957, + "learning_rate": 7.932923610924343e-05, + "loss": 1.8268, + "step": 4150 + }, + { + "epoch": 1.8312031732040546, + "grad_norm": 0.29749910928069495, + "learning_rate": 7.907842750868441e-05, + "loss": 1.7521, + "step": 4155 + }, + { + "epoch": 1.8334067871308948, + "grad_norm": 0.28978378846287695, + "learning_rate": 7.882775646422547e-05, + "loss": 1.8141, + "step": 4160 + }, + { + "epoch": 1.8356104010577345, + "grad_norm": 0.32370603539130244, + "learning_rate": 7.857722462399009e-05, + "loss": 1.5852, + "step": 4165 + }, + { + "epoch": 1.8378140149845748, + "grad_norm": 0.3000864365709902, + "learning_rate": 7.832683363518621e-05, + "loss": 1.5174, + "step": 4170 + }, + { + "epoch": 1.8400176289114147, + "grad_norm": 0.30867413513998754, + "learning_rate": 7.807658514409587e-05, + "loss": 1.7091, + "step": 4175 + }, + { + "epoch": 1.8422212428382547, + "grad_norm": 0.2941777962805528, + "learning_rate": 7.782648079606412e-05, + "loss": 1.9314, + "step": 4180 + }, + { + "epoch": 1.8444248567650947, + "grad_norm": 0.26789438468880916, + "learning_rate": 7.757652223548836e-05, + "loss": 1.5959, + "step": 4185 + }, + { + "epoch": 1.8466284706919347, + "grad_norm": 0.2852613225045537, + "learning_rate": 7.732671110580746e-05, + "loss": 1.4776, + "step": 4190 + }, + { + "epoch": 1.848832084618775, + "grad_norm": 0.28204667104600045, + "learning_rate": 7.707704904949085e-05, + "loss": 1.6044, + "step": 4195 + }, + { + "epoch": 1.8510356985456147, + "grad_norm": 0.3433461472883869, + "learning_rate": 7.682753770802791e-05, + "loss": 1.8343, + "step": 4200 + }, + { + "epoch": 1.8532393124724549, + "grad_norm": 0.27355747329642977, + "learning_rate": 7.657817872191713e-05, + "loss": 1.6496, + "step": 4205 + }, + { + "epoch": 1.8554429263992949, + "grad_norm": 0.2860370187153179, + "learning_rate": 7.632897373065522e-05, + "loss": 1.5719, + "step": 4210 + }, + { + "epoch": 1.8576465403261349, + "grad_norm": 0.2618172945255669, + "learning_rate": 7.607992437272642e-05, + "loss": 1.6911, + "step": 4215 + }, + { + "epoch": 1.8598501542529748, + "grad_norm": 0.2570942578849571, + "learning_rate": 7.583103228559164e-05, + "loss": 1.7205, + "step": 4220 + }, + { + "epoch": 1.8620537681798148, + "grad_norm": 0.30089829678159197, + "learning_rate": 7.558229910567794e-05, + "loss": 1.7114, + "step": 4225 + }, + { + "epoch": 1.864257382106655, + "grad_norm": 0.3120407343609943, + "learning_rate": 7.533372646836736e-05, + "loss": 1.6438, + "step": 4230 + }, + { + "epoch": 1.8664609960334948, + "grad_norm": 0.29920830893784145, + "learning_rate": 7.508531600798657e-05, + "loss": 1.8773, + "step": 4235 + }, + { + "epoch": 1.868664609960335, + "grad_norm": 0.28932016728007587, + "learning_rate": 7.483706935779584e-05, + "loss": 1.5398, + "step": 4240 + }, + { + "epoch": 1.870868223887175, + "grad_norm": 0.3323660400642421, + "learning_rate": 7.458898814997852e-05, + "loss": 1.8427, + "step": 4245 + }, + { + "epoch": 1.873071837814015, + "grad_norm": 0.24905133812180258, + "learning_rate": 7.434107401563016e-05, + "loss": 1.5713, + "step": 4250 + }, + { + "epoch": 1.875275451740855, + "grad_norm": 0.3177914709078092, + "learning_rate": 7.409332858474772e-05, + "loss": 1.8163, + "step": 4255 + }, + { + "epoch": 1.877479065667695, + "grad_norm": 0.2731160161178577, + "learning_rate": 7.384575348621909e-05, + "loss": 1.502, + "step": 4260 + }, + { + "epoch": 1.8796826795945352, + "grad_norm": 0.318150552351912, + "learning_rate": 7.359835034781227e-05, + "loss": 1.709, + "step": 4265 + }, + { + "epoch": 1.881886293521375, + "grad_norm": 0.2992460408245423, + "learning_rate": 7.335112079616456e-05, + "loss": 1.6948, + "step": 4270 + }, + { + "epoch": 1.8840899074482151, + "grad_norm": 0.26242651145469986, + "learning_rate": 7.31040664567719e-05, + "loss": 1.5288, + "step": 4275 + }, + { + "epoch": 1.8862935213750551, + "grad_norm": 0.32400746012853343, + "learning_rate": 7.285718895397848e-05, + "loss": 1.7299, + "step": 4280 + }, + { + "epoch": 1.8884971353018951, + "grad_norm": 0.30579297267422695, + "learning_rate": 7.261048991096558e-05, + "loss": 1.8361, + "step": 4285 + }, + { + "epoch": 1.890700749228735, + "grad_norm": 0.28173595807802665, + "learning_rate": 7.236397094974119e-05, + "loss": 1.737, + "step": 4290 + }, + { + "epoch": 1.892904363155575, + "grad_norm": 0.3062487579152163, + "learning_rate": 7.211763369112934e-05, + "loss": 1.6612, + "step": 4295 + }, + { + "epoch": 1.8951079770824153, + "grad_norm": 0.29979139920156694, + "learning_rate": 7.18714797547594e-05, + "loss": 1.762, + "step": 4300 + }, + { + "epoch": 1.897311591009255, + "grad_norm": 0.3021721281737111, + "learning_rate": 7.162551075905538e-05, + "loss": 1.8317, + "step": 4305 + }, + { + "epoch": 1.8995152049360953, + "grad_norm": 0.2809347403792435, + "learning_rate": 7.137972832122532e-05, + "loss": 1.7406, + "step": 4310 + }, + { + "epoch": 1.9017188188629353, + "grad_norm": 0.31981826220168086, + "learning_rate": 7.113413405725069e-05, + "loss": 1.8273, + "step": 4315 + }, + { + "epoch": 1.9039224327897752, + "grad_norm": 0.27829653919403746, + "learning_rate": 7.088872958187578e-05, + "loss": 1.6196, + "step": 4320 + }, + { + "epoch": 1.9061260467166152, + "grad_norm": 0.2612389626891207, + "learning_rate": 7.064351650859704e-05, + "loss": 1.7173, + "step": 4325 + }, + { + "epoch": 1.9083296606434552, + "grad_norm": 0.3351698187645255, + "learning_rate": 7.039849644965246e-05, + "loss": 1.5561, + "step": 4330 + }, + { + "epoch": 1.9105332745702954, + "grad_norm": 0.276151473793176, + "learning_rate": 7.015367101601091e-05, + "loss": 1.5952, + "step": 4335 + }, + { + "epoch": 1.9127368884971352, + "grad_norm": 0.2866569066490694, + "learning_rate": 6.990904181736187e-05, + "loss": 1.7386, + "step": 4340 + }, + { + "epoch": 1.9149405024239754, + "grad_norm": 0.30680340268949835, + "learning_rate": 6.96646104621043e-05, + "loss": 1.793, + "step": 4345 + }, + { + "epoch": 1.9171441163508154, + "grad_norm": 0.2853878652168025, + "learning_rate": 6.942037855733661e-05, + "loss": 1.8032, + "step": 4350 + }, + { + "epoch": 1.9193477302776554, + "grad_norm": 0.3335656030865567, + "learning_rate": 6.917634770884571e-05, + "loss": 1.8019, + "step": 4355 + }, + { + "epoch": 1.9215513442044954, + "grad_norm": 0.26469328077404297, + "learning_rate": 6.893251952109668e-05, + "loss": 1.7769, + "step": 4360 + }, + { + "epoch": 1.9237549581313353, + "grad_norm": 0.3611778018097071, + "learning_rate": 6.868889559722213e-05, + "loss": 1.845, + "step": 4365 + }, + { + "epoch": 1.9259585720581756, + "grad_norm": 0.2607913972126291, + "learning_rate": 6.84454775390116e-05, + "loss": 1.6583, + "step": 4370 + }, + { + "epoch": 1.9281621859850153, + "grad_norm": 0.2952126078040435, + "learning_rate": 6.820226694690112e-05, + "loss": 1.4631, + "step": 4375 + }, + { + "epoch": 1.9303657999118555, + "grad_norm": 0.28021793360716346, + "learning_rate": 6.795926541996273e-05, + "loss": 1.7197, + "step": 4380 + }, + { + "epoch": 1.9325694138386955, + "grad_norm": 0.2431287772236666, + "learning_rate": 6.771647455589384e-05, + "loss": 1.7528, + "step": 4385 + }, + { + "epoch": 1.9347730277655355, + "grad_norm": 0.24277132942171645, + "learning_rate": 6.74738959510068e-05, + "loss": 1.52, + "step": 4390 + }, + { + "epoch": 1.9369766416923755, + "grad_norm": 0.2616438959393551, + "learning_rate": 6.723153120021833e-05, + "loss": 1.6386, + "step": 4395 + }, + { + "epoch": 1.9391802556192155, + "grad_norm": 0.3044048108764881, + "learning_rate": 6.698938189703918e-05, + "loss": 1.653, + "step": 4400 + }, + { + "epoch": 1.9413838695460557, + "grad_norm": 0.2952474841434869, + "learning_rate": 6.674744963356357e-05, + "loss": 1.6325, + "step": 4405 + }, + { + "epoch": 1.9435874834728954, + "grad_norm": 0.3104804592935538, + "learning_rate": 6.65057360004586e-05, + "loss": 1.7827, + "step": 4410 + }, + { + "epoch": 1.9457910973997357, + "grad_norm": 0.27421700814052996, + "learning_rate": 6.626424258695403e-05, + "loss": 1.6614, + "step": 4415 + }, + { + "epoch": 1.9479947113265754, + "grad_norm": 0.28977404672357854, + "learning_rate": 6.60229709808317e-05, + "loss": 1.8225, + "step": 4420 + }, + { + "epoch": 1.9501983252534156, + "grad_norm": 0.30875392634058607, + "learning_rate": 6.578192276841501e-05, + "loss": 1.7437, + "step": 4425 + }, + { + "epoch": 1.9524019391802556, + "grad_norm": 0.307292209389746, + "learning_rate": 6.554109953455864e-05, + "loss": 1.7637, + "step": 4430 + }, + { + "epoch": 1.9546055531070956, + "grad_norm": 0.34674701377289235, + "learning_rate": 6.53005028626381e-05, + "loss": 1.6782, + "step": 4435 + }, + { + "epoch": 1.9568091670339358, + "grad_norm": 0.2997355914742966, + "learning_rate": 6.506013433453926e-05, + "loss": 1.7479, + "step": 4440 + }, + { + "epoch": 1.9590127809607756, + "grad_norm": 0.33789074614600445, + "learning_rate": 6.4819995530648e-05, + "loss": 1.6811, + "step": 4445 + }, + { + "epoch": 1.9612163948876158, + "grad_norm": 0.3008656237866022, + "learning_rate": 6.45800880298397e-05, + "loss": 1.6704, + "step": 4450 + }, + { + "epoch": 1.9634200088144556, + "grad_norm": 0.31248757394845805, + "learning_rate": 6.434041340946909e-05, + "loss": 1.6695, + "step": 4455 + }, + { + "epoch": 1.9656236227412958, + "grad_norm": 0.3086687965739077, + "learning_rate": 6.41009732453597e-05, + "loss": 1.5949, + "step": 4460 + }, + { + "epoch": 1.9678272366681357, + "grad_norm": 0.2536971137620243, + "learning_rate": 6.386176911179353e-05, + "loss": 1.6463, + "step": 4465 + }, + { + "epoch": 1.9700308505949757, + "grad_norm": 0.3287438241265034, + "learning_rate": 6.362280258150074e-05, + "loss": 1.6429, + "step": 4470 + }, + { + "epoch": 1.972234464521816, + "grad_norm": 0.2565655853570647, + "learning_rate": 6.33840752256492e-05, + "loss": 1.5546, + "step": 4475 + }, + { + "epoch": 1.9744380784486557, + "grad_norm": 0.3055708289742853, + "learning_rate": 6.314558861383442e-05, + "loss": 1.7164, + "step": 4480 + }, + { + "epoch": 1.976641692375496, + "grad_norm": 0.26286511562137227, + "learning_rate": 6.29073443140689e-05, + "loss": 1.7841, + "step": 4485 + }, + { + "epoch": 1.9788453063023357, + "grad_norm": 0.2842174046049182, + "learning_rate": 6.266934389277204e-05, + "loss": 1.7053, + "step": 4490 + }, + { + "epoch": 1.981048920229176, + "grad_norm": 0.34820565260475556, + "learning_rate": 6.24315889147597e-05, + "loss": 1.7649, + "step": 4495 + }, + { + "epoch": 1.9832525341560159, + "grad_norm": 0.3290871426082691, + "learning_rate": 6.219408094323415e-05, + "loss": 1.6402, + "step": 4500 + }, + { + "epoch": 1.9854561480828559, + "grad_norm": 0.25604694542787715, + "learning_rate": 6.195682153977351e-05, + "loss": 1.5192, + "step": 4505 + }, + { + "epoch": 1.9876597620096959, + "grad_norm": 0.28478382829773047, + "learning_rate": 6.17198122643216e-05, + "loss": 1.649, + "step": 4510 + }, + { + "epoch": 1.9898633759365358, + "grad_norm": 0.2776833563766362, + "learning_rate": 6.148305467517768e-05, + "loss": 1.7351, + "step": 4515 + }, + { + "epoch": 1.992066989863376, + "grad_norm": 0.32063858764742265, + "learning_rate": 6.124655032898631e-05, + "loss": 1.8315, + "step": 4520 + }, + { + "epoch": 1.9942706037902158, + "grad_norm": 0.29572004437320715, + "learning_rate": 6.1010300780726925e-05, + "loss": 1.7337, + "step": 4525 + }, + { + "epoch": 1.996474217717056, + "grad_norm": 0.3096259639483799, + "learning_rate": 6.077430758370376e-05, + "loss": 1.759, + "step": 4530 + }, + { + "epoch": 1.998677831643896, + "grad_norm": 0.31442409226477874, + "learning_rate": 6.053857228953546e-05, + "loss": 1.7822, + "step": 4535 + }, + { + "epoch": 2.000881445570736, + "grad_norm": 0.2377883462279695, + "learning_rate": 6.03030964481452e-05, + "loss": 1.4966, + "step": 4540 + }, + { + "epoch": 2.003085059497576, + "grad_norm": 0.2900063226146335, + "learning_rate": 6.0067881607750134e-05, + "loss": 1.7189, + "step": 4545 + }, + { + "epoch": 2.005288673424416, + "grad_norm": 0.30629874279137115, + "learning_rate": 5.983292931485142e-05, + "loss": 1.5509, + "step": 4550 + }, + { + "epoch": 2.007492287351256, + "grad_norm": 0.2682166094278232, + "learning_rate": 5.9598241114223986e-05, + "loss": 1.5487, + "step": 4555 + }, + { + "epoch": 2.009695901278096, + "grad_norm": 0.3321377020430881, + "learning_rate": 5.936381854890646e-05, + "loss": 1.8033, + "step": 4560 + }, + { + "epoch": 2.011899515204936, + "grad_norm": 0.31810161747738896, + "learning_rate": 5.912966316019093e-05, + "loss": 1.621, + "step": 4565 + }, + { + "epoch": 2.014103129131776, + "grad_norm": 0.24729310525461057, + "learning_rate": 5.8895776487612765e-05, + "loss": 1.5993, + "step": 4570 + }, + { + "epoch": 2.016306743058616, + "grad_norm": 0.30363634336485235, + "learning_rate": 5.8662160068940655e-05, + "loss": 1.6749, + "step": 4575 + }, + { + "epoch": 2.0185103569854563, + "grad_norm": 0.2851113038101266, + "learning_rate": 5.84288154401664e-05, + "loss": 1.6956, + "step": 4580 + }, + { + "epoch": 2.020713970912296, + "grad_norm": 0.345208802987356, + "learning_rate": 5.81957441354948e-05, + "loss": 1.5851, + "step": 4585 + }, + { + "epoch": 2.0229175848391363, + "grad_norm": 0.33771374182942, + "learning_rate": 5.796294768733362e-05, + "loss": 1.5908, + "step": 4590 + }, + { + "epoch": 2.025121198765976, + "grad_norm": 0.29219932563018164, + "learning_rate": 5.773042762628342e-05, + "loss": 1.677, + "step": 4595 + }, + { + "epoch": 2.0273248126928163, + "grad_norm": 0.26977700042523883, + "learning_rate": 5.749818548112762e-05, + "loss": 1.6073, + "step": 4600 + }, + { + "epoch": 2.029528426619656, + "grad_norm": 0.30654875158910516, + "learning_rate": 5.726622277882243e-05, + "loss": 1.7622, + "step": 4605 + }, + { + "epoch": 2.0317320405464963, + "grad_norm": 0.3217700066222481, + "learning_rate": 5.703454104448665e-05, + "loss": 1.804, + "step": 4610 + }, + { + "epoch": 2.0339356544733365, + "grad_norm": 0.30632620282091305, + "learning_rate": 5.680314180139178e-05, + "loss": 1.7833, + "step": 4615 + }, + { + "epoch": 2.0361392684001762, + "grad_norm": 0.3023215763344122, + "learning_rate": 5.657202657095206e-05, + "loss": 1.7969, + "step": 4620 + }, + { + "epoch": 2.0383428823270164, + "grad_norm": 0.3078282400161021, + "learning_rate": 5.6341196872714394e-05, + "loss": 1.6958, + "step": 4625 + }, + { + "epoch": 2.040546496253856, + "grad_norm": 0.27870283271610047, + "learning_rate": 5.611065422434828e-05, + "loss": 1.5725, + "step": 4630 + }, + { + "epoch": 2.0427501101806964, + "grad_norm": 0.3114689828172716, + "learning_rate": 5.588040014163585e-05, + "loss": 1.562, + "step": 4635 + }, + { + "epoch": 2.044953724107536, + "grad_norm": 0.2912835457860766, + "learning_rate": 5.565043613846219e-05, + "loss": 1.7486, + "step": 4640 + }, + { + "epoch": 2.0471573380343764, + "grad_norm": 0.29628653391558857, + "learning_rate": 5.542076372680498e-05, + "loss": 1.7084, + "step": 4645 + }, + { + "epoch": 2.0493609519612166, + "grad_norm": 0.29948576765849355, + "learning_rate": 5.519138441672471e-05, + "loss": 1.6903, + "step": 4650 + }, + { + "epoch": 2.0515645658880564, + "grad_norm": 0.26477213023267704, + "learning_rate": 5.496229971635487e-05, + "loss": 1.6743, + "step": 4655 + }, + { + "epoch": 2.0537681798148966, + "grad_norm": 0.36060338821204513, + "learning_rate": 5.473351113189194e-05, + "loss": 1.8093, + "step": 4660 + }, + { + "epoch": 2.0559717937417363, + "grad_norm": 0.31416116481556966, + "learning_rate": 5.4505020167585396e-05, + "loss": 1.6409, + "step": 4665 + }, + { + "epoch": 2.0581754076685765, + "grad_norm": 0.23960293640658495, + "learning_rate": 5.4276828325727934e-05, + "loss": 1.5688, + "step": 4670 + }, + { + "epoch": 2.0603790215954163, + "grad_norm": 0.33607407453371085, + "learning_rate": 5.4048937106645613e-05, + "loss": 1.5812, + "step": 4675 + }, + { + "epoch": 2.0625826355222565, + "grad_norm": 0.3362219928372315, + "learning_rate": 5.3821348008687967e-05, + "loss": 1.7184, + "step": 4680 + }, + { + "epoch": 2.0647862494490967, + "grad_norm": 0.31128427440833256, + "learning_rate": 5.3594062528218025e-05, + "loss": 1.6606, + "step": 4685 + }, + { + "epoch": 2.0669898633759365, + "grad_norm": 0.28720777906212147, + "learning_rate": 5.336708215960258e-05, + "loss": 1.6505, + "step": 4690 + }, + { + "epoch": 2.0691934773027767, + "grad_norm": 0.35035039806258184, + "learning_rate": 5.314040839520253e-05, + "loss": 1.7716, + "step": 4695 + }, + { + "epoch": 2.0713970912296165, + "grad_norm": 0.2931752659797425, + "learning_rate": 5.291404272536275e-05, + "loss": 1.6877, + "step": 4700 + }, + { + "epoch": 2.0736007051564567, + "grad_norm": 0.2873712201718596, + "learning_rate": 5.268798663840243e-05, + "loss": 1.6062, + "step": 4705 + }, + { + "epoch": 2.0758043190832964, + "grad_norm": 0.26676705722923527, + "learning_rate": 5.2462241620605366e-05, + "loss": 1.6592, + "step": 4710 + }, + { + "epoch": 2.0780079330101366, + "grad_norm": 0.3509576273254501, + "learning_rate": 5.223680915621014e-05, + "loss": 1.671, + "step": 4715 + }, + { + "epoch": 2.080211546936977, + "grad_norm": 0.39209953129493824, + "learning_rate": 5.2011690727400285e-05, + "loss": 1.6385, + "step": 4720 + }, + { + "epoch": 2.0824151608638166, + "grad_norm": 0.3402308135526598, + "learning_rate": 5.178688781429455e-05, + "loss": 1.6095, + "step": 4725 + }, + { + "epoch": 2.084618774790657, + "grad_norm": 0.29891446961728113, + "learning_rate": 5.1562401894937365e-05, + "loss": 1.6653, + "step": 4730 + }, + { + "epoch": 2.0868223887174966, + "grad_norm": 0.2466567525935987, + "learning_rate": 5.133823444528889e-05, + "loss": 1.6558, + "step": 4735 + }, + { + "epoch": 2.089026002644337, + "grad_norm": 0.3676958248067488, + "learning_rate": 5.111438693921536e-05, + "loss": 1.6279, + "step": 4740 + }, + { + "epoch": 2.0912296165711766, + "grad_norm": 0.343788030408807, + "learning_rate": 5.089086084847954e-05, + "loss": 1.6951, + "step": 4745 + }, + { + "epoch": 2.0934332304980168, + "grad_norm": 0.3319476261884821, + "learning_rate": 5.066765764273078e-05, + "loss": 1.617, + "step": 4750 + }, + { + "epoch": 2.095636844424857, + "grad_norm": 0.33586220021618074, + "learning_rate": 5.044477878949571e-05, + "loss": 1.6601, + "step": 4755 + }, + { + "epoch": 2.0978404583516967, + "grad_norm": 0.27982003995931076, + "learning_rate": 5.0222225754168175e-05, + "loss": 1.6063, + "step": 4760 + }, + { + "epoch": 2.100044072278537, + "grad_norm": 0.3330827204915635, + "learning_rate": 5.000000000000002e-05, + "loss": 1.7513, + "step": 4765 + }, + { + "epoch": 2.1022476862053767, + "grad_norm": 0.34805614239612326, + "learning_rate": 4.97781029880911e-05, + "loss": 1.5524, + "step": 4770 + }, + { + "epoch": 2.104451300132217, + "grad_norm": 0.287355630557104, + "learning_rate": 4.955653617737995e-05, + "loss": 1.6138, + "step": 4775 + }, + { + "epoch": 2.1066549140590567, + "grad_norm": 0.29898227350779466, + "learning_rate": 4.9335301024634094e-05, + "loss": 1.6648, + "step": 4780 + }, + { + "epoch": 2.108858527985897, + "grad_norm": 0.26174467755819714, + "learning_rate": 4.911439898444036e-05, + "loss": 1.594, + "step": 4785 + }, + { + "epoch": 2.111062141912737, + "grad_norm": 0.35692436121383353, + "learning_rate": 4.889383150919543e-05, + "loss": 1.5403, + "step": 4790 + }, + { + "epoch": 2.113265755839577, + "grad_norm": 0.2926942813060104, + "learning_rate": 4.867360004909635e-05, + "loss": 1.754, + "step": 4795 + }, + { + "epoch": 2.115469369766417, + "grad_norm": 0.3290438654965286, + "learning_rate": 4.845370605213091e-05, + "loss": 1.5578, + "step": 4800 + }, + { + "epoch": 2.117672983693257, + "grad_norm": 0.3373040566376742, + "learning_rate": 4.823415096406806e-05, + "loss": 1.6939, + "step": 4805 + }, + { + "epoch": 2.119876597620097, + "grad_norm": 0.34391563105196393, + "learning_rate": 4.801493622844847e-05, + "loss": 1.7067, + "step": 4810 + }, + { + "epoch": 2.122080211546937, + "grad_norm": 0.3002969326243971, + "learning_rate": 4.779606328657513e-05, + "loss": 1.716, + "step": 4815 + }, + { + "epoch": 2.124283825473777, + "grad_norm": 0.3388857319061881, + "learning_rate": 4.75775335775038e-05, + "loss": 1.843, + "step": 4820 + }, + { + "epoch": 2.1264874394006172, + "grad_norm": 0.4156596099940998, + "learning_rate": 4.735934853803339e-05, + "loss": 1.7106, + "step": 4825 + }, + { + "epoch": 2.128691053327457, + "grad_norm": 0.2940085091207592, + "learning_rate": 4.71415096026968e-05, + "loss": 1.6581, + "step": 4830 + }, + { + "epoch": 2.130894667254297, + "grad_norm": 0.306112268047742, + "learning_rate": 4.692401820375134e-05, + "loss": 1.7315, + "step": 4835 + }, + { + "epoch": 2.133098281181137, + "grad_norm": 0.29996735672531655, + "learning_rate": 4.6706875771169265e-05, + "loss": 1.649, + "step": 4840 + }, + { + "epoch": 2.135301895107977, + "grad_norm": 0.27463359410451954, + "learning_rate": 4.64900837326284e-05, + "loss": 1.6608, + "step": 4845 + }, + { + "epoch": 2.137505509034817, + "grad_norm": 0.3165826117114464, + "learning_rate": 4.627364351350288e-05, + "loss": 1.6793, + "step": 4850 + }, + { + "epoch": 2.139709122961657, + "grad_norm": 0.3469777212157657, + "learning_rate": 4.605755653685366e-05, + "loss": 1.7027, + "step": 4855 + }, + { + "epoch": 2.1419127368884974, + "grad_norm": 0.28698621030093346, + "learning_rate": 4.584182422341915e-05, + "loss": 1.6516, + "step": 4860 + }, + { + "epoch": 2.144116350815337, + "grad_norm": 0.3294649962724426, + "learning_rate": 4.562644799160585e-05, + "loss": 1.7214, + "step": 4865 + }, + { + "epoch": 2.1463199647421773, + "grad_norm": 0.3185351974138239, + "learning_rate": 4.541142925747919e-05, + "loss": 1.6362, + "step": 4870 + }, + { + "epoch": 2.148523578669017, + "grad_norm": 0.3126109058989699, + "learning_rate": 4.519676943475408e-05, + "loss": 1.7064, + "step": 4875 + }, + { + "epoch": 2.1507271925958573, + "grad_norm": 0.5380540251828319, + "learning_rate": 4.4982469934785574e-05, + "loss": 1.6943, + "step": 4880 + }, + { + "epoch": 2.152930806522697, + "grad_norm": 0.3435754281589444, + "learning_rate": 4.4768532166559763e-05, + "loss": 1.715, + "step": 4885 + }, + { + "epoch": 2.1551344204495373, + "grad_norm": 0.28522234926734513, + "learning_rate": 4.455495753668428e-05, + "loss": 1.3429, + "step": 4890 + }, + { + "epoch": 2.157338034376377, + "grad_norm": 0.29145572840002815, + "learning_rate": 4.4341747449379335e-05, + "loss": 1.5995, + "step": 4895 + }, + { + "epoch": 2.1595416483032173, + "grad_norm": 0.47296843361442037, + "learning_rate": 4.412890330646815e-05, + "loss": 1.8911, + "step": 4900 + }, + { + "epoch": 2.1617452622300575, + "grad_norm": 0.3101777212408002, + "learning_rate": 4.391642650736811e-05, + "loss": 1.5388, + "step": 4905 + }, + { + "epoch": 2.1639488761568972, + "grad_norm": 0.27664700170021256, + "learning_rate": 4.370431844908119e-05, + "loss": 1.5866, + "step": 4910 + }, + { + "epoch": 2.1661524900837374, + "grad_norm": 0.29560765708014347, + "learning_rate": 4.349258052618509e-05, + "loss": 1.7198, + "step": 4915 + }, + { + "epoch": 2.168356104010577, + "grad_norm": 0.3363838001346494, + "learning_rate": 4.328121413082388e-05, + "loss": 1.6872, + "step": 4920 + }, + { + "epoch": 2.1705597179374174, + "grad_norm": 0.2850203542162651, + "learning_rate": 4.307022065269887e-05, + "loss": 1.6207, + "step": 4925 + }, + { + "epoch": 2.172763331864257, + "grad_norm": 0.32285248455004484, + "learning_rate": 4.285960147905946e-05, + "loss": 1.6117, + "step": 4930 + }, + { + "epoch": 2.1749669457910974, + "grad_norm": 0.27430213811378995, + "learning_rate": 4.264935799469417e-05, + "loss": 1.6949, + "step": 4935 + }, + { + "epoch": 2.1771705597179376, + "grad_norm": 0.46396848778851696, + "learning_rate": 4.2439491581921373e-05, + "loss": 1.6883, + "step": 4940 + }, + { + "epoch": 2.1793741736447774, + "grad_norm": 0.31449151532095687, + "learning_rate": 4.223000362058023e-05, + "loss": 1.6213, + "step": 4945 + }, + { + "epoch": 2.1815777875716176, + "grad_norm": 0.27602615724586665, + "learning_rate": 4.202089548802157e-05, + "loss": 1.6365, + "step": 4950 + }, + { + "epoch": 2.1837814014984573, + "grad_norm": 0.2989766105095882, + "learning_rate": 4.181216855909913e-05, + "loss": 1.6936, + "step": 4955 + }, + { + "epoch": 2.1859850154252976, + "grad_norm": 0.29722828765908, + "learning_rate": 4.16038242061601e-05, + "loss": 1.5737, + "step": 4960 + }, + { + "epoch": 2.1881886293521373, + "grad_norm": 0.2573098385617781, + "learning_rate": 4.139586379903629e-05, + "loss": 1.6852, + "step": 4965 + }, + { + "epoch": 2.1903922432789775, + "grad_norm": 0.2951816124750441, + "learning_rate": 4.1188288705035226e-05, + "loss": 1.645, + "step": 4970 + }, + { + "epoch": 2.1925958572058177, + "grad_norm": 0.33288427202987686, + "learning_rate": 4.098110028893105e-05, + "loss": 1.5257, + "step": 4975 + }, + { + "epoch": 2.1947994711326575, + "grad_norm": 0.3489398250177744, + "learning_rate": 4.077429991295549e-05, + "loss": 1.6671, + "step": 4980 + }, + { + "epoch": 2.1970030850594977, + "grad_norm": 0.2899374350308383, + "learning_rate": 4.056788893678898e-05, + "loss": 1.6132, + "step": 4985 + }, + { + "epoch": 2.1992066989863375, + "grad_norm": 0.3500814252732699, + "learning_rate": 4.036186871755173e-05, + "loss": 1.5695, + "step": 4990 + }, + { + "epoch": 2.2014103129131777, + "grad_norm": 0.3687163067932294, + "learning_rate": 4.015624060979486e-05, + "loss": 1.5143, + "step": 4995 + }, + { + "epoch": 2.2036139268400174, + "grad_norm": 0.33423795880289425, + "learning_rate": 3.995100596549128e-05, + "loss": 1.6156, + "step": 5000 + }, + { + "epoch": 2.2058175407668577, + "grad_norm": 0.2868033623703877, + "learning_rate": 3.9746166134026995e-05, + "loss": 1.5561, + "step": 5005 + }, + { + "epoch": 2.208021154693698, + "grad_norm": 0.34323096599301134, + "learning_rate": 3.9541722462192196e-05, + "loss": 1.4618, + "step": 5010 + }, + { + "epoch": 2.2102247686205376, + "grad_norm": 0.27958346490895863, + "learning_rate": 3.9337676294172424e-05, + "loss": 1.7082, + "step": 5015 + }, + { + "epoch": 2.212428382547378, + "grad_norm": 0.3257321749343037, + "learning_rate": 3.913402897153957e-05, + "loss": 1.6946, + "step": 5020 + }, + { + "epoch": 2.2146319964742176, + "grad_norm": 0.3365177337266649, + "learning_rate": 3.893078183324329e-05, + "loss": 1.6428, + "step": 5025 + }, + { + "epoch": 2.216835610401058, + "grad_norm": 0.34401308894946486, + "learning_rate": 3.8727936215602077e-05, + "loss": 1.5488, + "step": 5030 + }, + { + "epoch": 2.2190392243278976, + "grad_norm": 0.2845220746832339, + "learning_rate": 3.852549345229445e-05, + "loss": 1.6519, + "step": 5035 + }, + { + "epoch": 2.221242838254738, + "grad_norm": 0.25529471083550676, + "learning_rate": 3.832345487435019e-05, + "loss": 1.8166, + "step": 5040 + }, + { + "epoch": 2.223446452181578, + "grad_norm": 0.3591343853259783, + "learning_rate": 3.812182181014169e-05, + "loss": 1.7223, + "step": 5045 + }, + { + "epoch": 2.2256500661084178, + "grad_norm": 0.3054726611280714, + "learning_rate": 3.792059558537518e-05, + "loss": 1.8144, + "step": 5050 + }, + { + "epoch": 2.227853680035258, + "grad_norm": 0.30167111375727146, + "learning_rate": 3.7719777523081864e-05, + "loss": 1.5961, + "step": 5055 + }, + { + "epoch": 2.2300572939620977, + "grad_norm": 0.2916710789608964, + "learning_rate": 3.751936894360949e-05, + "loss": 1.7809, + "step": 5060 + }, + { + "epoch": 2.232260907888938, + "grad_norm": 0.3732355683913339, + "learning_rate": 3.731937116461336e-05, + "loss": 1.6552, + "step": 5065 + }, + { + "epoch": 2.2344645218157777, + "grad_norm": 0.35016354658091353, + "learning_rate": 3.7119785501047977e-05, + "loss": 1.649, + "step": 5070 + }, + { + "epoch": 2.236668135742618, + "grad_norm": 0.34880893384696754, + "learning_rate": 3.6920613265158124e-05, + "loss": 1.5914, + "step": 5075 + }, + { + "epoch": 2.2388717496694577, + "grad_norm": 0.3026190037167945, + "learning_rate": 3.672185576647047e-05, + "loss": 1.5736, + "step": 5080 + }, + { + "epoch": 2.241075363596298, + "grad_norm": 0.3484793101368692, + "learning_rate": 3.652351431178473e-05, + "loss": 1.7296, + "step": 5085 + }, + { + "epoch": 2.243278977523138, + "grad_norm": 0.2587698411979413, + "learning_rate": 3.6325590205165314e-05, + "loss": 1.7112, + "step": 5090 + }, + { + "epoch": 2.245482591449978, + "grad_norm": 0.25882732375987005, + "learning_rate": 3.612808474793261e-05, + "loss": 1.7223, + "step": 5095 + }, + { + "epoch": 2.247686205376818, + "grad_norm": 0.27481092620324143, + "learning_rate": 3.593099923865438e-05, + "loss": 1.5473, + "step": 5100 + }, + { + "epoch": 2.249889819303658, + "grad_norm": 0.3411799767876264, + "learning_rate": 3.573433497313731e-05, + "loss": 1.6459, + "step": 5105 + }, + { + "epoch": 2.252093433230498, + "grad_norm": 0.2887170405054378, + "learning_rate": 3.5538093244418525e-05, + "loss": 1.6195, + "step": 5110 + }, + { + "epoch": 2.254297047157338, + "grad_norm": 0.3884613083691619, + "learning_rate": 3.5342275342757046e-05, + "loss": 1.8638, + "step": 5115 + }, + { + "epoch": 2.256500661084178, + "grad_norm": 0.28793406477711025, + "learning_rate": 3.5146882555625226e-05, + "loss": 1.5124, + "step": 5120 + }, + { + "epoch": 2.2587042750110182, + "grad_norm": 0.30122690590343093, + "learning_rate": 3.495191616770034e-05, + "loss": 1.7147, + "step": 5125 + }, + { + "epoch": 2.260907888937858, + "grad_norm": 0.27840602962105204, + "learning_rate": 3.475737746085631e-05, + "loss": 1.5467, + "step": 5130 + }, + { + "epoch": 2.263111502864698, + "grad_norm": 0.3165143945785378, + "learning_rate": 3.456326771415498e-05, + "loss": 1.6215, + "step": 5135 + }, + { + "epoch": 2.265315116791538, + "grad_norm": 0.315000953366212, + "learning_rate": 3.436958820383783e-05, + "loss": 1.5548, + "step": 5140 + }, + { + "epoch": 2.267518730718378, + "grad_norm": 0.29091408470720853, + "learning_rate": 3.417634020331769e-05, + "loss": 1.786, + "step": 5145 + }, + { + "epoch": 2.269722344645218, + "grad_norm": 0.335006684656636, + "learning_rate": 3.398352498317029e-05, + "loss": 1.6015, + "step": 5150 + }, + { + "epoch": 2.271925958572058, + "grad_norm": 0.3244552469471718, + "learning_rate": 3.379114381112581e-05, + "loss": 1.653, + "step": 5155 + }, + { + "epoch": 2.2741295724988984, + "grad_norm": 0.32402505918566016, + "learning_rate": 3.359919795206065e-05, + "loss": 1.5578, + "step": 5160 + }, + { + "epoch": 2.276333186425738, + "grad_norm": 0.3417399295841799, + "learning_rate": 3.3407688667989124e-05, + "loss": 1.8143, + "step": 5165 + }, + { + "epoch": 2.2785368003525783, + "grad_norm": 0.3293628623372523, + "learning_rate": 3.321661721805519e-05, + "loss": 1.62, + "step": 5170 + }, + { + "epoch": 2.280740414279418, + "grad_norm": 0.3594982936202251, + "learning_rate": 3.302598485852401e-05, + "loss": 1.5937, + "step": 5175 + }, + { + "epoch": 2.2829440282062583, + "grad_norm": 0.2986899594367847, + "learning_rate": 3.283579284277378e-05, + "loss": 1.5761, + "step": 5180 + }, + { + "epoch": 2.285147642133098, + "grad_norm": 0.2633442789610725, + "learning_rate": 3.2646042421287625e-05, + "loss": 1.7272, + "step": 5185 + }, + { + "epoch": 2.2873512560599383, + "grad_norm": 0.3664081223341677, + "learning_rate": 3.245673484164521e-05, + "loss": 1.6607, + "step": 5190 + }, + { + "epoch": 2.2895548699867785, + "grad_norm": 0.36095917628329555, + "learning_rate": 3.2267871348514475e-05, + "loss": 1.6644, + "step": 5195 + }, + { + "epoch": 2.2917584839136182, + "grad_norm": 0.3155817104820564, + "learning_rate": 3.207945318364376e-05, + "loss": 1.7833, + "step": 5200 + }, + { + "epoch": 2.2939620978404585, + "grad_norm": 0.5525894084347752, + "learning_rate": 3.1891481585853224e-05, + "loss": 1.7846, + "step": 5205 + }, + { + "epoch": 2.2961657117672982, + "grad_norm": 0.3183611553729405, + "learning_rate": 3.1703957791027104e-05, + "loss": 1.8015, + "step": 5210 + }, + { + "epoch": 2.2983693256941384, + "grad_norm": 0.3341156230287187, + "learning_rate": 3.151688303210525e-05, + "loss": 1.4901, + "step": 5215 + }, + { + "epoch": 2.300572939620978, + "grad_norm": 0.29108487309353126, + "learning_rate": 3.133025853907531e-05, + "loss": 1.6021, + "step": 5220 + }, + { + "epoch": 2.3027765535478184, + "grad_norm": 0.31003713411448125, + "learning_rate": 3.114408553896437e-05, + "loss": 1.6835, + "step": 5225 + }, + { + "epoch": 2.3049801674746586, + "grad_norm": 0.31639971696298724, + "learning_rate": 3.09583652558311e-05, + "loss": 1.7131, + "step": 5230 + }, + { + "epoch": 2.3071837814014984, + "grad_norm": 0.320459066996447, + "learning_rate": 3.077309891075766e-05, + "loss": 1.7207, + "step": 5235 + }, + { + "epoch": 2.3093873953283386, + "grad_norm": 0.3658345444037042, + "learning_rate": 3.058828772184155e-05, + "loss": 1.637, + "step": 5240 + }, + { + "epoch": 2.3115910092551784, + "grad_norm": 0.3119356619189155, + "learning_rate": 3.0403932904187694e-05, + "loss": 1.7374, + "step": 5245 + }, + { + "epoch": 2.3137946231820186, + "grad_norm": 0.34062681606017264, + "learning_rate": 3.0220035669900493e-05, + "loss": 1.3662, + "step": 5250 + }, + { + "epoch": 2.3159982371088583, + "grad_norm": 0.3231089104173347, + "learning_rate": 3.0036597228075847e-05, + "loss": 1.7862, + "step": 5255 + }, + { + "epoch": 2.3182018510356985, + "grad_norm": 0.2845067164587982, + "learning_rate": 2.985361878479307e-05, + "loss": 1.6374, + "step": 5260 + }, + { + "epoch": 2.3204054649625387, + "grad_norm": 0.3158916323796938, + "learning_rate": 2.9671101543107037e-05, + "loss": 1.7791, + "step": 5265 + }, + { + "epoch": 2.3226090788893785, + "grad_norm": 0.3473567438345015, + "learning_rate": 2.9489046703040478e-05, + "loss": 1.6438, + "step": 5270 + }, + { + "epoch": 2.3248126928162187, + "grad_norm": 0.32956604729846517, + "learning_rate": 2.9307455461575728e-05, + "loss": 1.5174, + "step": 5275 + }, + { + "epoch": 2.3270163067430585, + "grad_norm": 0.31139915651994976, + "learning_rate": 2.9126329012647048e-05, + "loss": 1.6661, + "step": 5280 + }, + { + "epoch": 2.3292199206698987, + "grad_norm": 0.3916001778917859, + "learning_rate": 2.894566854713283e-05, + "loss": 1.7324, + "step": 5285 + }, + { + "epoch": 2.3314235345967385, + "grad_norm": 0.33538535373990935, + "learning_rate": 2.8765475252847696e-05, + "loss": 1.8397, + "step": 5290 + }, + { + "epoch": 2.3336271485235787, + "grad_norm": 0.4045876334606375, + "learning_rate": 2.8585750314534633e-05, + "loss": 1.761, + "step": 5295 + }, + { + "epoch": 2.335830762450419, + "grad_norm": 0.34476500843118907, + "learning_rate": 2.8406494913857264e-05, + "loss": 1.7239, + "step": 5300 + }, + { + "epoch": 2.3380343763772586, + "grad_norm": 0.3377854881404091, + "learning_rate": 2.8227710229392102e-05, + "loss": 1.6767, + "step": 5305 + }, + { + "epoch": 2.340237990304099, + "grad_norm": 0.2943468051471504, + "learning_rate": 2.8049397436620817e-05, + "loss": 1.7027, + "step": 5310 + }, + { + "epoch": 2.3424416042309386, + "grad_norm": 0.3325646272609782, + "learning_rate": 2.7871557707922356e-05, + "loss": 1.7092, + "step": 5315 + }, + { + "epoch": 2.344645218157779, + "grad_norm": 0.3194408987322575, + "learning_rate": 2.769419221256546e-05, + "loss": 1.5551, + "step": 5320 + }, + { + "epoch": 2.3468488320846186, + "grad_norm": 0.3154791851100986, + "learning_rate": 2.751730211670075e-05, + "loss": 1.5952, + "step": 5325 + }, + { + "epoch": 2.349052446011459, + "grad_norm": 0.37152524303673207, + "learning_rate": 2.7340888583353263e-05, + "loss": 1.6328, + "step": 5330 + }, + { + "epoch": 2.351256059938299, + "grad_norm": 0.364737719498444, + "learning_rate": 2.716495277241463e-05, + "loss": 1.5125, + "step": 5335 + }, + { + "epoch": 2.3534596738651388, + "grad_norm": 0.3524349639748066, + "learning_rate": 2.6989495840635615e-05, + "loss": 1.6589, + "step": 5340 + }, + { + "epoch": 2.355663287791979, + "grad_norm": 0.27640673317944237, + "learning_rate": 2.6814518941618326e-05, + "loss": 1.5661, + "step": 5345 + }, + { + "epoch": 2.3578669017188187, + "grad_norm": 0.31837314971643976, + "learning_rate": 2.6640023225808852e-05, + "loss": 1.7214, + "step": 5350 + }, + { + "epoch": 2.360070515645659, + "grad_norm": 0.29946292415188364, + "learning_rate": 2.6466009840489436e-05, + "loss": 1.4745, + "step": 5355 + }, + { + "epoch": 2.3622741295724987, + "grad_norm": 0.3368911389398718, + "learning_rate": 2.629247992977122e-05, + "loss": 1.6371, + "step": 5360 + }, + { + "epoch": 2.364477743499339, + "grad_norm": 0.3078861715002911, + "learning_rate": 2.6119434634586427e-05, + "loss": 1.6562, + "step": 5365 + }, + { + "epoch": 2.366681357426179, + "grad_norm": 0.39044087680489714, + "learning_rate": 2.5946875092681134e-05, + "loss": 1.7854, + "step": 5370 + }, + { + "epoch": 2.368884971353019, + "grad_norm": 0.3905475645799491, + "learning_rate": 2.5774802438607627e-05, + "loss": 1.7027, + "step": 5375 + }, + { + "epoch": 2.371088585279859, + "grad_norm": 0.3344158542218199, + "learning_rate": 2.5603217803716938e-05, + "loss": 1.6856, + "step": 5380 + }, + { + "epoch": 2.373292199206699, + "grad_norm": 0.3171235587081351, + "learning_rate": 2.5432122316151463e-05, + "loss": 1.6338, + "step": 5385 + }, + { + "epoch": 2.375495813133539, + "grad_norm": 0.31814287079371045, + "learning_rate": 2.5261517100837563e-05, + "loss": 1.6072, + "step": 5390 + }, + { + "epoch": 2.377699427060379, + "grad_norm": 0.3715801248464969, + "learning_rate": 2.509140327947814e-05, + "loss": 1.7025, + "step": 5395 + }, + { + "epoch": 2.379903040987219, + "grad_norm": 0.30689235196019754, + "learning_rate": 2.4921781970545178e-05, + "loss": 1.704, + "step": 5400 + }, + { + "epoch": 2.3821066549140593, + "grad_norm": 0.2783210103898173, + "learning_rate": 2.4752654289272568e-05, + "loss": 1.8138, + "step": 5405 + }, + { + "epoch": 2.384310268840899, + "grad_norm": 0.4115102536349836, + "learning_rate": 2.4584021347648645e-05, + "loss": 1.7562, + "step": 5410 + }, + { + "epoch": 2.3865138827677392, + "grad_norm": 0.320397115810562, + "learning_rate": 2.441588425440886e-05, + "loss": 1.7002, + "step": 5415 + }, + { + "epoch": 2.388717496694579, + "grad_norm": 0.32551710279990625, + "learning_rate": 2.424824411502856e-05, + "loss": 1.6053, + "step": 5420 + }, + { + "epoch": 2.390921110621419, + "grad_norm": 0.37060732842102345, + "learning_rate": 2.408110203171572e-05, + "loss": 1.6564, + "step": 5425 + }, + { + "epoch": 2.393124724548259, + "grad_norm": 0.33585884001747096, + "learning_rate": 2.3914459103403696e-05, + "loss": 1.7012, + "step": 5430 + }, + { + "epoch": 2.395328338475099, + "grad_norm": 0.297871745937653, + "learning_rate": 2.374831642574392e-05, + "loss": 1.7399, + "step": 5435 + }, + { + "epoch": 2.3975319524019394, + "grad_norm": 0.28026347006814506, + "learning_rate": 2.3582675091098717e-05, + "loss": 1.6698, + "step": 5440 + }, + { + "epoch": 2.399735566328779, + "grad_norm": 0.3092926951118008, + "learning_rate": 2.3417536188534327e-05, + "loss": 1.7019, + "step": 5445 + }, + { + "epoch": 2.4019391802556194, + "grad_norm": 0.29956315407231204, + "learning_rate": 2.3252900803813415e-05, + "loss": 1.7835, + "step": 5450 + }, + { + "epoch": 2.404142794182459, + "grad_norm": 0.3472861996721381, + "learning_rate": 2.3088770019388116e-05, + "loss": 1.7523, + "step": 5455 + }, + { + "epoch": 2.4063464081092993, + "grad_norm": 0.3346967409970768, + "learning_rate": 2.292514491439297e-05, + "loss": 1.543, + "step": 5460 + }, + { + "epoch": 2.408550022036139, + "grad_norm": 0.30667047442127315, + "learning_rate": 2.2762026564637717e-05, + "loss": 1.7131, + "step": 5465 + }, + { + "epoch": 2.4107536359629793, + "grad_norm": 0.32172911896887835, + "learning_rate": 2.259941604260024e-05, + "loss": 1.4888, + "step": 5470 + }, + { + "epoch": 2.4129572498898195, + "grad_norm": 0.33661859838509534, + "learning_rate": 2.2437314417419518e-05, + "loss": 1.6434, + "step": 5475 + }, + { + "epoch": 2.4151608638166593, + "grad_norm": 0.2766235524609549, + "learning_rate": 2.2275722754888662e-05, + "loss": 1.4497, + "step": 5480 + }, + { + "epoch": 2.4173644777434995, + "grad_norm": 0.31531488102987404, + "learning_rate": 2.211464211744787e-05, + "loss": 1.7619, + "step": 5485 + }, + { + "epoch": 2.4195680916703393, + "grad_norm": 0.4045821730130733, + "learning_rate": 2.195407356417737e-05, + "loss": 1.5253, + "step": 5490 + }, + { + "epoch": 2.4217717055971795, + "grad_norm": 0.3077739063590653, + "learning_rate": 2.1794018150790507e-05, + "loss": 1.3737, + "step": 5495 + }, + { + "epoch": 2.4239753195240192, + "grad_norm": 0.26933598773812595, + "learning_rate": 2.1634476929626868e-05, + "loss": 1.5562, + "step": 5500 + }, + { + "epoch": 2.4261789334508594, + "grad_norm": 0.3180561976663089, + "learning_rate": 2.1475450949645325e-05, + "loss": 1.8415, + "step": 5505 + }, + { + "epoch": 2.4283825473776997, + "grad_norm": 0.326460873305707, + "learning_rate": 2.1316941256417024e-05, + "loss": 1.6886, + "step": 5510 + }, + { + "epoch": 2.4305861613045394, + "grad_norm": 0.2651305009582311, + "learning_rate": 2.115894889211869e-05, + "loss": 1.556, + "step": 5515 + }, + { + "epoch": 2.4327897752313796, + "grad_norm": 0.285018315967615, + "learning_rate": 2.100147489552562e-05, + "loss": 1.6264, + "step": 5520 + }, + { + "epoch": 2.4349933891582194, + "grad_norm": 0.3039400323881749, + "learning_rate": 2.084452030200502e-05, + "loss": 1.68, + "step": 5525 + }, + { + "epoch": 2.4371970030850596, + "grad_norm": 0.4530331306706194, + "learning_rate": 2.068808614350899e-05, + "loss": 1.8822, + "step": 5530 + }, + { + "epoch": 2.4394006170118994, + "grad_norm": 0.32124417856720616, + "learning_rate": 2.0532173448567936e-05, + "loss": 1.6896, + "step": 5535 + }, + { + "epoch": 2.4416042309387396, + "grad_norm": 0.3526640578616687, + "learning_rate": 2.037678324228366e-05, + "loss": 1.3874, + "step": 5540 + }, + { + "epoch": 2.44380784486558, + "grad_norm": 0.3025841716329574, + "learning_rate": 2.022191654632274e-05, + "loss": 1.5026, + "step": 5545 + }, + { + "epoch": 2.4460114587924195, + "grad_norm": 0.3451825131993311, + "learning_rate": 2.0067574378909726e-05, + "loss": 1.6466, + "step": 5550 + }, + { + "epoch": 2.4482150727192598, + "grad_norm": 0.2946116685019073, + "learning_rate": 1.9913757754820483e-05, + "loss": 1.7246, + "step": 5555 + }, + { + "epoch": 2.4504186866460995, + "grad_norm": 0.3604250683804044, + "learning_rate": 1.976046768537544e-05, + "loss": 1.5967, + "step": 5560 + }, + { + "epoch": 2.4526223005729397, + "grad_norm": 0.356529608888727, + "learning_rate": 1.9607705178433124e-05, + "loss": 1.8344, + "step": 5565 + }, + { + "epoch": 2.4548259144997795, + "grad_norm": 0.3062637993405146, + "learning_rate": 1.9455471238383394e-05, + "loss": 1.727, + "step": 5570 + }, + { + "epoch": 2.4570295284266197, + "grad_norm": 0.2913261589880019, + "learning_rate": 1.9303766866140794e-05, + "loss": 1.6422, + "step": 5575 + }, + { + "epoch": 2.45923314235346, + "grad_norm": 0.28539967100565233, + "learning_rate": 1.9152593059138036e-05, + "loss": 1.5191, + "step": 5580 + }, + { + "epoch": 2.4614367562802997, + "grad_norm": 0.38278133166473177, + "learning_rate": 1.9001950811319624e-05, + "loss": 1.747, + "step": 5585 + }, + { + "epoch": 2.46364037020714, + "grad_norm": 0.3213863995857726, + "learning_rate": 1.885184111313494e-05, + "loss": 1.6493, + "step": 5590 + }, + { + "epoch": 2.4658439841339796, + "grad_norm": 0.3515687604082846, + "learning_rate": 1.870226495153199e-05, + "loss": 1.6207, + "step": 5595 + }, + { + "epoch": 2.46804759806082, + "grad_norm": 0.2818899743965069, + "learning_rate": 1.8553223309950907e-05, + "loss": 1.5783, + "step": 5600 + }, + { + "epoch": 2.4702512119876596, + "grad_norm": 0.32339197111018464, + "learning_rate": 1.8404717168317444e-05, + "loss": 1.6023, + "step": 5605 + }, + { + "epoch": 2.4724548259145, + "grad_norm": 0.32235437648153625, + "learning_rate": 1.8256747503036465e-05, + "loss": 1.5901, + "step": 5610 + }, + { + "epoch": 2.47465843984134, + "grad_norm": 0.2996816683579224, + "learning_rate": 1.8109315286985575e-05, + "loss": 1.7065, + "step": 5615 + }, + { + "epoch": 2.47686205376818, + "grad_norm": 0.34309522162413447, + "learning_rate": 1.7962421489508797e-05, + "loss": 1.7226, + "step": 5620 + }, + { + "epoch": 2.47906566769502, + "grad_norm": 0.29849313259138993, + "learning_rate": 1.7816067076410138e-05, + "loss": 1.7579, + "step": 5625 + }, + { + "epoch": 2.48126928162186, + "grad_norm": 0.3368071136503966, + "learning_rate": 1.7670253009947146e-05, + "loss": 1.5962, + "step": 5630 + }, + { + "epoch": 2.4834728955487, + "grad_norm": 0.3175198598873115, + "learning_rate": 1.7524980248824806e-05, + "loss": 1.7556, + "step": 5635 + }, + { + "epoch": 2.4856765094755398, + "grad_norm": 0.30889458937514946, + "learning_rate": 1.738024974818896e-05, + "loss": 1.7268, + "step": 5640 + }, + { + "epoch": 2.48788012340238, + "grad_norm": 0.30454000273861387, + "learning_rate": 1.7236062459620306e-05, + "loss": 1.6084, + "step": 5645 + }, + { + "epoch": 2.49008373732922, + "grad_norm": 0.30973625136809374, + "learning_rate": 1.7092419331127894e-05, + "loss": 1.581, + "step": 5650 + }, + { + "epoch": 2.49228735125606, + "grad_norm": 0.3312428081002817, + "learning_rate": 1.6949321307143096e-05, + "loss": 1.6826, + "step": 5655 + }, + { + "epoch": 2.4944909651829, + "grad_norm": 0.29438754562772956, + "learning_rate": 1.6806769328513226e-05, + "loss": 1.6531, + "step": 5660 + }, + { + "epoch": 2.49669457910974, + "grad_norm": 0.3072379816792179, + "learning_rate": 1.666476433249552e-05, + "loss": 1.6036, + "step": 5665 + }, + { + "epoch": 2.49889819303658, + "grad_norm": 0.2653188527319854, + "learning_rate": 1.6523307252750787e-05, + "loss": 1.6677, + "step": 5670 + }, + { + "epoch": 2.50110180696342, + "grad_norm": 0.31853001670175285, + "learning_rate": 1.6382399019337493e-05, + "loss": 1.7512, + "step": 5675 + }, + { + "epoch": 2.50330542089026, + "grad_norm": 0.3541014940940505, + "learning_rate": 1.6242040558705386e-05, + "loss": 1.4784, + "step": 5680 + }, + { + "epoch": 2.5055090348171003, + "grad_norm": 0.3022737220008577, + "learning_rate": 1.6102232793689652e-05, + "loss": 1.552, + "step": 5685 + }, + { + "epoch": 2.50771264874394, + "grad_norm": 0.2865580516371412, + "learning_rate": 1.5962976643504734e-05, + "loss": 1.6162, + "step": 5690 + }, + { + "epoch": 2.50991626267078, + "grad_norm": 0.33229646654342737, + "learning_rate": 1.5824273023738223e-05, + "loss": 1.7025, + "step": 5695 + }, + { + "epoch": 2.51211987659762, + "grad_norm": 0.25315774233489335, + "learning_rate": 1.5686122846344932e-05, + "loss": 1.6556, + "step": 5700 + }, + { + "epoch": 2.5143234905244602, + "grad_norm": 0.32926703632524384, + "learning_rate": 1.55485270196409e-05, + "loss": 1.7055, + "step": 5705 + }, + { + "epoch": 2.5165271044513, + "grad_norm": 0.38254621782613324, + "learning_rate": 1.541148644829743e-05, + "loss": 1.8189, + "step": 5710 + }, + { + "epoch": 2.51873071837814, + "grad_norm": 0.3262138673835723, + "learning_rate": 1.5275002033335016e-05, + "loss": 1.6328, + "step": 5715 + }, + { + "epoch": 2.5209343323049804, + "grad_norm": 0.36898284620242594, + "learning_rate": 1.5139074672117514e-05, + "loss": 1.7229, + "step": 5720 + }, + { + "epoch": 2.52313794623182, + "grad_norm": 0.29383774220673775, + "learning_rate": 1.500370525834639e-05, + "loss": 1.7057, + "step": 5725 + }, + { + "epoch": 2.52534156015866, + "grad_norm": 0.35255942435868104, + "learning_rate": 1.4868894682054535e-05, + "loss": 1.703, + "step": 5730 + }, + { + "epoch": 2.5275451740855, + "grad_norm": 0.3224975998851544, + "learning_rate": 1.473464382960057e-05, + "loss": 1.6255, + "step": 5735 + }, + { + "epoch": 2.5297487880123404, + "grad_norm": 0.2728863606551014, + "learning_rate": 1.4600953583663114e-05, + "loss": 1.5348, + "step": 5740 + }, + { + "epoch": 2.53195240193918, + "grad_norm": 0.3292693963632781, + "learning_rate": 1.4467824823234843e-05, + "loss": 1.6536, + "step": 5745 + }, + { + "epoch": 2.5341560158660204, + "grad_norm": 0.3530164649944846, + "learning_rate": 1.4335258423616737e-05, + "loss": 1.631, + "step": 5750 + }, + { + "epoch": 2.5363596297928606, + "grad_norm": 0.3107181811257503, + "learning_rate": 1.4203255256412318e-05, + "loss": 1.5969, + "step": 5755 + }, + { + "epoch": 2.5385632437197003, + "grad_norm": 0.3700803583018722, + "learning_rate": 1.407181618952199e-05, + "loss": 1.7883, + "step": 5760 + }, + { + "epoch": 2.54076685764654, + "grad_norm": 0.30243816752307084, + "learning_rate": 1.394094208713732e-05, + "loss": 1.652, + "step": 5765 + }, + { + "epoch": 2.5429704715733803, + "grad_norm": 0.3413616757398393, + "learning_rate": 1.3810633809735196e-05, + "loss": 1.7507, + "step": 5770 + }, + { + "epoch": 2.5451740855002205, + "grad_norm": 0.3120087119800215, + "learning_rate": 1.3680892214072405e-05, + "loss": 1.7198, + "step": 5775 + }, + { + "epoch": 2.5473776994270603, + "grad_norm": 0.3518504932831635, + "learning_rate": 1.3551718153179871e-05, + "loss": 1.8579, + "step": 5780 + }, + { + "epoch": 2.5495813133539005, + "grad_norm": 0.34752155952619, + "learning_rate": 1.3423112476357036e-05, + "loss": 1.5468, + "step": 5785 + }, + { + "epoch": 2.5517849272807407, + "grad_norm": 0.3260308226099618, + "learning_rate": 1.3295076029166265e-05, + "loss": 1.5258, + "step": 5790 + }, + { + "epoch": 2.5539885412075805, + "grad_norm": 0.35584838362058835, + "learning_rate": 1.3167609653427426e-05, + "loss": 1.7373, + "step": 5795 + }, + { + "epoch": 2.55619215513442, + "grad_norm": 0.32184918024972864, + "learning_rate": 1.304071418721221e-05, + "loss": 1.6741, + "step": 5800 + }, + { + "epoch": 2.5583957690612604, + "grad_norm": 0.315994062331127, + "learning_rate": 1.2914390464838655e-05, + "loss": 1.7156, + "step": 5805 + }, + { + "epoch": 2.5605993829881006, + "grad_norm": 0.23924524979523612, + "learning_rate": 1.2788639316865635e-05, + "loss": 1.738, + "step": 5810 + }, + { + "epoch": 2.5628029969149404, + "grad_norm": 0.3098121698841723, + "learning_rate": 1.266346157008753e-05, + "loss": 1.5198, + "step": 5815 + }, + { + "epoch": 2.5650066108417806, + "grad_norm": 0.3262487762949137, + "learning_rate": 1.2538858047528646e-05, + "loss": 1.587, + "step": 5820 + }, + { + "epoch": 2.5672102247686204, + "grad_norm": 0.33166973277913664, + "learning_rate": 1.2414829568437825e-05, + "loss": 1.5043, + "step": 5825 + }, + { + "epoch": 2.5694138386954606, + "grad_norm": 0.3022890966007592, + "learning_rate": 1.2291376948283139e-05, + "loss": 1.605, + "step": 5830 + }, + { + "epoch": 2.5716174526223003, + "grad_norm": 0.3389630036906691, + "learning_rate": 1.2168500998746435e-05, + "loss": 1.6955, + "step": 5835 + }, + { + "epoch": 2.5738210665491406, + "grad_norm": 0.3796380833169501, + "learning_rate": 1.2046202527718076e-05, + "loss": 1.6275, + "step": 5840 + }, + { + "epoch": 2.5760246804759808, + "grad_norm": 0.3238245323234113, + "learning_rate": 1.1924482339291554e-05, + "loss": 1.7289, + "step": 5845 + }, + { + "epoch": 2.5782282944028205, + "grad_norm": 0.30218042381327415, + "learning_rate": 1.1803341233758291e-05, + "loss": 1.5412, + "step": 5850 + }, + { + "epoch": 2.5804319083296607, + "grad_norm": 0.33206646260037787, + "learning_rate": 1.1682780007602268e-05, + "loss": 1.8011, + "step": 5855 + }, + { + "epoch": 2.5826355222565005, + "grad_norm": 0.3538395424898534, + "learning_rate": 1.1562799453494899e-05, + "loss": 1.7862, + "step": 5860 + }, + { + "epoch": 2.5848391361833407, + "grad_norm": 0.35528233062631853, + "learning_rate": 1.144340036028978e-05, + "loss": 1.685, + "step": 5865 + }, + { + "epoch": 2.5870427501101805, + "grad_norm": 0.33412660972869573, + "learning_rate": 1.132458351301744e-05, + "loss": 1.7575, + "step": 5870 + }, + { + "epoch": 2.5892463640370207, + "grad_norm": 0.3175660326892495, + "learning_rate": 1.1206349692880236e-05, + "loss": 1.5658, + "step": 5875 + }, + { + "epoch": 2.591449977963861, + "grad_norm": 0.30007433696171515, + "learning_rate": 1.1088699677247238e-05, + "loss": 1.586, + "step": 5880 + }, + { + "epoch": 2.5936535918907007, + "grad_norm": 0.3180669505120074, + "learning_rate": 1.097163423964912e-05, + "loss": 1.7819, + "step": 5885 + }, + { + "epoch": 2.595857205817541, + "grad_norm": 0.25188231699393177, + "learning_rate": 1.0855154149772994e-05, + "loss": 1.512, + "step": 5890 + }, + { + "epoch": 2.5980608197443806, + "grad_norm": 0.2725825366458783, + "learning_rate": 1.0739260173457355e-05, + "loss": 1.7237, + "step": 5895 + }, + { + "epoch": 2.600264433671221, + "grad_norm": 0.4353012547028696, + "learning_rate": 1.0623953072687265e-05, + "loss": 1.5664, + "step": 5900 + }, + { + "epoch": 2.6024680475980606, + "grad_norm": 0.3208919789812601, + "learning_rate": 1.0509233605588997e-05, + "loss": 1.7184, + "step": 5905 + }, + { + "epoch": 2.604671661524901, + "grad_norm": 0.2811764278977997, + "learning_rate": 1.0395102526425282e-05, + "loss": 1.6933, + "step": 5910 + }, + { + "epoch": 2.606875275451741, + "grad_norm": 0.3283193021777617, + "learning_rate": 1.0281560585590311e-05, + "loss": 1.709, + "step": 5915 + }, + { + "epoch": 2.609078889378581, + "grad_norm": 0.31601773043700576, + "learning_rate": 1.0168608529604783e-05, + "loss": 1.5517, + "step": 5920 + }, + { + "epoch": 2.611282503305421, + "grad_norm": 0.26388766629051863, + "learning_rate": 1.0056247101110972e-05, + "loss": 1.5716, + "step": 5925 + }, + { + "epoch": 2.6134861172322608, + "grad_norm": 0.3170004934043696, + "learning_rate": 9.944477038867838e-06, + "loss": 1.3933, + "step": 5930 + }, + { + "epoch": 2.615689731159101, + "grad_norm": 0.3026100677449506, + "learning_rate": 9.833299077746261e-06, + "loss": 1.538, + "step": 5935 + }, + { + "epoch": 2.6178933450859407, + "grad_norm": 0.32599901382842245, + "learning_rate": 9.72271394872416e-06, + "loss": 1.722, + "step": 5940 + }, + { + "epoch": 2.620096959012781, + "grad_norm": 0.31252235277224677, + "learning_rate": 9.612722378881578e-06, + "loss": 1.5777, + "step": 5945 + }, + { + "epoch": 2.622300572939621, + "grad_norm": 0.27949230064797415, + "learning_rate": 9.503325091396098e-06, + "loss": 1.7781, + "step": 5950 + }, + { + "epoch": 2.624504186866461, + "grad_norm": 0.3031767326868999, + "learning_rate": 9.394522805537931e-06, + "loss": 1.6123, + "step": 5955 + }, + { + "epoch": 2.626707800793301, + "grad_norm": 0.3596489809565846, + "learning_rate": 9.286316236665271e-06, + "loss": 1.8234, + "step": 5960 + }, + { + "epoch": 2.628911414720141, + "grad_norm": 0.37067976899261396, + "learning_rate": 9.178706096219547e-06, + "loss": 1.5176, + "step": 5965 + }, + { + "epoch": 2.631115028646981, + "grad_norm": 0.30187017500054564, + "learning_rate": 9.0716930917208e-06, + "loss": 1.5401, + "step": 5970 + }, + { + "epoch": 2.633318642573821, + "grad_norm": 0.338495742270805, + "learning_rate": 8.965277926762916e-06, + "loss": 1.5802, + "step": 5975 + }, + { + "epoch": 2.635522256500661, + "grad_norm": 0.33527970219407616, + "learning_rate": 8.859461301009186e-06, + "loss": 1.6115, + "step": 5980 + }, + { + "epoch": 2.6377258704275013, + "grad_norm": 0.3284526374151953, + "learning_rate": 8.754243910187498e-06, + "loss": 1.7051, + "step": 5985 + }, + { + "epoch": 2.639929484354341, + "grad_norm": 0.2820385948307506, + "learning_rate": 8.649626446085945e-06, + "loss": 1.4949, + "step": 5990 + }, + { + "epoch": 2.6421330982811813, + "grad_norm": 0.3357888909663005, + "learning_rate": 8.545609596548121e-06, + "loss": 1.5265, + "step": 5995 + }, + { + "epoch": 2.644336712208021, + "grad_norm": 0.24829105724152342, + "learning_rate": 8.442194045468733e-06, + "loss": 1.4418, + "step": 6000 + }, + { + "epoch": 2.6465403261348612, + "grad_norm": 0.3445954692738449, + "learning_rate": 8.339380472789016e-06, + "loss": 1.471, + "step": 6005 + }, + { + "epoch": 2.648743940061701, + "grad_norm": 0.34193200011238856, + "learning_rate": 8.237169554492297e-06, + "loss": 1.4714, + "step": 6010 + }, + { + "epoch": 2.650947553988541, + "grad_norm": 0.2994838498414857, + "learning_rate": 8.135561962599514e-06, + "loss": 1.5747, + "step": 6015 + }, + { + "epoch": 2.6531511679153814, + "grad_norm": 0.2510845629901009, + "learning_rate": 8.034558365164868e-06, + "loss": 1.6476, + "step": 6020 + }, + { + "epoch": 2.655354781842221, + "grad_norm": 0.3375793364015892, + "learning_rate": 7.934159426271403e-06, + "loss": 1.6063, + "step": 6025 + }, + { + "epoch": 2.6575583957690614, + "grad_norm": 0.33573003764748477, + "learning_rate": 7.834365806026578e-06, + "loss": 1.5814, + "step": 6030 + }, + { + "epoch": 2.659762009695901, + "grad_norm": 0.38295818181236296, + "learning_rate": 7.735178160557943e-06, + "loss": 1.7642, + "step": 6035 + }, + { + "epoch": 2.6619656236227414, + "grad_norm": 0.28003247319118707, + "learning_rate": 7.636597142009017e-06, + "loss": 1.7946, + "step": 6040 + }, + { + "epoch": 2.664169237549581, + "grad_norm": 0.367119348898622, + "learning_rate": 7.538623398534661e-06, + "loss": 1.5553, + "step": 6045 + }, + { + "epoch": 2.6663728514764213, + "grad_norm": 0.30087617582351794, + "learning_rate": 7.441257574297089e-06, + "loss": 1.56, + "step": 6050 + }, + { + "epoch": 2.6685764654032615, + "grad_norm": 0.2679029740855203, + "learning_rate": 7.344500309461511e-06, + "loss": 1.5622, + "step": 6055 + }, + { + "epoch": 2.6707800793301013, + "grad_norm": 0.38643212881302264, + "learning_rate": 7.248352240192002e-06, + "loss": 1.6344, + "step": 6060 + }, + { + "epoch": 2.6729836932569415, + "grad_norm": 0.3036596598041859, + "learning_rate": 7.15281399864719e-06, + "loss": 1.6126, + "step": 6065 + }, + { + "epoch": 2.6751873071837813, + "grad_norm": 0.3409000984647797, + "learning_rate": 7.057886212976239e-06, + "loss": 1.7453, + "step": 6070 + }, + { + "epoch": 2.6773909211106215, + "grad_norm": 0.3123434626912612, + "learning_rate": 6.963569507314627e-06, + "loss": 1.6624, + "step": 6075 + }, + { + "epoch": 2.6795945350374613, + "grad_norm": 0.3272418793360404, + "learning_rate": 6.8698645017801325e-06, + "loss": 1.8614, + "step": 6080 + }, + { + "epoch": 2.6817981489643015, + "grad_norm": 0.2432770598979605, + "learning_rate": 6.776771812468618e-06, + "loss": 1.6761, + "step": 6085 + }, + { + "epoch": 2.6840017628911417, + "grad_norm": 0.29638907037530365, + "learning_rate": 6.684292051450147e-06, + "loss": 1.5734, + "step": 6090 + }, + { + "epoch": 2.6862053768179814, + "grad_norm": 0.2927874947198909, + "learning_rate": 6.592425826764781e-06, + "loss": 1.6527, + "step": 6095 + }, + { + "epoch": 2.6884089907448216, + "grad_norm": 0.3262506822619773, + "learning_rate": 6.501173742418753e-06, + "loss": 1.7488, + "step": 6100 + }, + { + "epoch": 2.6906126046716614, + "grad_norm": 0.29388683856815984, + "learning_rate": 6.410536398380385e-06, + "loss": 1.7391, + "step": 6105 + }, + { + "epoch": 2.6928162185985016, + "grad_norm": 0.42564932659573224, + "learning_rate": 6.320514390576193e-06, + "loss": 1.5618, + "step": 6110 + }, + { + "epoch": 2.6950198325253414, + "grad_norm": 0.33120895230745345, + "learning_rate": 6.231108310886924e-06, + "loss": 1.5172, + "step": 6115 + }, + { + "epoch": 2.6972234464521816, + "grad_norm": 0.2970159546645706, + "learning_rate": 6.142318747143716e-06, + "loss": 1.5319, + "step": 6120 + }, + { + "epoch": 2.699427060379022, + "grad_norm": 0.3125793002150719, + "learning_rate": 6.054146283124218e-06, + "loss": 1.6401, + "step": 6125 + }, + { + "epoch": 2.7016306743058616, + "grad_norm": 0.30146872233206856, + "learning_rate": 5.966591498548724e-06, + "loss": 1.7384, + "step": 6130 + }, + { + "epoch": 2.7038342882327018, + "grad_norm": 0.3887993733574576, + "learning_rate": 5.8796549690763645e-06, + "loss": 1.8019, + "step": 6135 + }, + { + "epoch": 2.7060379021595415, + "grad_norm": 0.3033022818548619, + "learning_rate": 5.79333726630138e-06, + "loss": 1.5844, + "step": 6140 + }, + { + "epoch": 2.7082415160863818, + "grad_norm": 0.368025241100897, + "learning_rate": 5.7076389577493175e-06, + "loss": 1.8454, + "step": 6145 + }, + { + "epoch": 2.7104451300132215, + "grad_norm": 0.298047160015892, + "learning_rate": 5.622560606873262e-06, + "loss": 1.6045, + "step": 6150 + }, + { + "epoch": 2.7126487439400617, + "grad_norm": 0.30605420571316905, + "learning_rate": 5.538102773050235e-06, + "loss": 1.696, + "step": 6155 + }, + { + "epoch": 2.714852357866902, + "grad_norm": 0.3454395184873584, + "learning_rate": 5.454266011577369e-06, + "loss": 1.6258, + "step": 6160 + }, + { + "epoch": 2.7170559717937417, + "grad_norm": 0.30863792610727064, + "learning_rate": 5.371050873668437e-06, + "loss": 1.5895, + "step": 6165 + }, + { + "epoch": 2.719259585720582, + "grad_norm": 0.32671612304128317, + "learning_rate": 5.2884579064500615e-06, + "loss": 1.751, + "step": 6170 + }, + { + "epoch": 2.7214631996474217, + "grad_norm": 0.24404249683796003, + "learning_rate": 5.206487652958214e-06, + "loss": 1.5318, + "step": 6175 + }, + { + "epoch": 2.723666813574262, + "grad_norm": 0.31345216172176077, + "learning_rate": 5.125140652134652e-06, + "loss": 1.6814, + "step": 6180 + }, + { + "epoch": 2.7258704275011016, + "grad_norm": 0.3301202829233535, + "learning_rate": 5.044417438823279e-06, + "loss": 1.6688, + "step": 6185 + }, + { + "epoch": 2.728074041427942, + "grad_norm": 0.3180153311679256, + "learning_rate": 4.964318543766733e-06, + "loss": 1.8152, + "step": 6190 + }, + { + "epoch": 2.730277655354782, + "grad_norm": 0.3477053424267945, + "learning_rate": 4.884844493602847e-06, + "loss": 1.6068, + "step": 6195 + }, + { + "epoch": 2.732481269281622, + "grad_norm": 0.3829450747175287, + "learning_rate": 4.805995810861219e-06, + "loss": 1.5436, + "step": 6200 + }, + { + "epoch": 2.734684883208462, + "grad_norm": 0.35144134365971347, + "learning_rate": 4.727773013959702e-06, + "loss": 1.7733, + "step": 6205 + }, + { + "epoch": 2.736888497135302, + "grad_norm": 0.3511048030633929, + "learning_rate": 4.650176617201074e-06, + "loss": 1.7483, + "step": 6210 + }, + { + "epoch": 2.739092111062142, + "grad_norm": 0.3146009823620477, + "learning_rate": 4.573207130769663e-06, + "loss": 1.6416, + "step": 6215 + }, + { + "epoch": 2.7412957249889818, + "grad_norm": 0.2961123189548949, + "learning_rate": 4.496865060727917e-06, + "loss": 1.5871, + "step": 6220 + }, + { + "epoch": 2.743499338915822, + "grad_norm": 0.2827248539547275, + "learning_rate": 4.421150909013094e-06, + "loss": 1.6537, + "step": 6225 + }, + { + "epoch": 2.745702952842662, + "grad_norm": 0.3230949462447901, + "learning_rate": 4.346065173434055e-06, + "loss": 1.5128, + "step": 6230 + }, + { + "epoch": 2.747906566769502, + "grad_norm": 0.27203611650594783, + "learning_rate": 4.271608347667888e-06, + "loss": 1.6916, + "step": 6235 + }, + { + "epoch": 2.750110180696342, + "grad_norm": 0.316718312569841, + "learning_rate": 4.197780921256678e-06, + "loss": 1.7967, + "step": 6240 + }, + { + "epoch": 2.752313794623182, + "grad_norm": 0.31564921665656825, + "learning_rate": 4.1245833796043184e-06, + "loss": 1.5092, + "step": 6245 + }, + { + "epoch": 2.754517408550022, + "grad_norm": 0.3023941557144956, + "learning_rate": 4.052016203973319e-06, + "loss": 1.6864, + "step": 6250 + }, + { + "epoch": 2.756721022476862, + "grad_norm": 0.31013607515444674, + "learning_rate": 3.9800798714816566e-06, + "loss": 1.7096, + "step": 6255 + }, + { + "epoch": 2.758924636403702, + "grad_norm": 0.34766875505420175, + "learning_rate": 3.908774855099529e-06, + "loss": 1.6837, + "step": 6260 + }, + { + "epoch": 2.7611282503305423, + "grad_norm": 0.2943152497920515, + "learning_rate": 3.838101623646429e-06, + "loss": 1.6478, + "step": 6265 + }, + { + "epoch": 2.763331864257382, + "grad_norm": 0.32690567367741863, + "learning_rate": 3.768060641787874e-06, + "loss": 1.8321, + "step": 6270 + }, + { + "epoch": 2.765535478184222, + "grad_norm": 0.35174727110880194, + "learning_rate": 3.698652370032496e-06, + "loss": 1.7583, + "step": 6275 + }, + { + "epoch": 2.767739092111062, + "grad_norm": 0.30775438684619605, + "learning_rate": 3.6298772647289204e-06, + "loss": 1.7887, + "step": 6280 + }, + { + "epoch": 2.7699427060379023, + "grad_norm": 0.2974347187890948, + "learning_rate": 3.561735778062847e-06, + "loss": 1.5669, + "step": 6285 + }, + { + "epoch": 2.772146319964742, + "grad_norm": 0.32631762135645986, + "learning_rate": 3.4942283580539747e-06, + "loss": 1.5496, + "step": 6290 + }, + { + "epoch": 2.7743499338915822, + "grad_norm": 0.3078445012000501, + "learning_rate": 3.427355448553149e-06, + "loss": 1.4473, + "step": 6295 + }, + { + "epoch": 2.7765535478184225, + "grad_norm": 0.31874633296757016, + "learning_rate": 3.3611174892393848e-06, + "loss": 1.7297, + "step": 6300 + }, + { + "epoch": 2.778757161745262, + "grad_norm": 0.3877251343286691, + "learning_rate": 3.2955149156170373e-06, + "loss": 1.7889, + "step": 6305 + }, + { + "epoch": 2.780960775672102, + "grad_norm": 0.3050428005282347, + "learning_rate": 3.230548159012836e-06, + "loss": 1.7297, + "step": 6310 + }, + { + "epoch": 2.783164389598942, + "grad_norm": 0.3297939270880444, + "learning_rate": 3.1662176465731776e-06, + "loss": 1.7542, + "step": 6315 + }, + { + "epoch": 2.7853680035257824, + "grad_norm": 0.35972869738440505, + "learning_rate": 3.1025238012612146e-06, + "loss": 1.6169, + "step": 6320 + }, + { + "epoch": 2.787571617452622, + "grad_norm": 0.3384226865100565, + "learning_rate": 3.039467041854105e-06, + "loss": 1.6362, + "step": 6325 + }, + { + "epoch": 2.7897752313794624, + "grad_norm": 0.3670760451609635, + "learning_rate": 2.97704778294029e-06, + "loss": 1.6398, + "step": 6330 + }, + { + "epoch": 2.7919788453063026, + "grad_norm": 0.2854714227749342, + "learning_rate": 2.9152664349167415e-06, + "loss": 1.4325, + "step": 6335 + }, + { + "epoch": 2.7941824592331423, + "grad_norm": 0.3042468904319354, + "learning_rate": 2.854123403986253e-06, + "loss": 1.6423, + "step": 6340 + }, + { + "epoch": 2.796386073159982, + "grad_norm": 0.33954395775276786, + "learning_rate": 2.793619092154787e-06, + "loss": 1.6785, + "step": 6345 + }, + { + "epoch": 2.7985896870868223, + "grad_norm": 0.41052304755954266, + "learning_rate": 2.7337538972287967e-06, + "loss": 1.7808, + "step": 6350 + }, + { + "epoch": 2.8007933010136625, + "grad_norm": 0.2735021459128279, + "learning_rate": 2.674528212812721e-06, + "loss": 1.579, + "step": 6355 + }, + { + "epoch": 2.8029969149405023, + "grad_norm": 0.42116340427498267, + "learning_rate": 2.6159424283062507e-06, + "loss": 1.665, + "step": 6360 + }, + { + "epoch": 2.8052005288673425, + "grad_norm": 0.40395544915333126, + "learning_rate": 2.557996928901829e-06, + "loss": 1.6685, + "step": 6365 + }, + { + "epoch": 2.8074041427941827, + "grad_norm": 0.41455342226161546, + "learning_rate": 2.5006920955821465e-06, + "loss": 1.7578, + "step": 6370 + }, + { + "epoch": 2.8096077567210225, + "grad_norm": 0.32321328330549354, + "learning_rate": 2.4440283051176405e-06, + "loss": 1.7026, + "step": 6375 + }, + { + "epoch": 2.8118113706478622, + "grad_norm": 0.2956835138059502, + "learning_rate": 2.388005930063941e-06, + "loss": 1.8632, + "step": 6380 + }, + { + "epoch": 2.8140149845747024, + "grad_norm": 0.3268794222372435, + "learning_rate": 2.3326253387594753e-06, + "loss": 1.6233, + "step": 6385 + }, + { + "epoch": 2.8162185985015427, + "grad_norm": 0.3626797930342101, + "learning_rate": 2.277886895323078e-06, + "loss": 1.74, + "step": 6390 + }, + { + "epoch": 2.8184222124283824, + "grad_norm": 0.33358450270104334, + "learning_rate": 2.2237909596515396e-06, + "loss": 1.4655, + "step": 6395 + }, + { + "epoch": 2.8206258263552226, + "grad_norm": 0.29838126441173135, + "learning_rate": 2.1703378874172507e-06, + "loss": 1.4969, + "step": 6400 + }, + { + "epoch": 2.822829440282063, + "grad_norm": 0.34601838203569607, + "learning_rate": 2.117528030065907e-06, + "loss": 1.6886, + "step": 6405 + }, + { + "epoch": 2.8250330542089026, + "grad_norm": 0.3110562652655748, + "learning_rate": 2.0653617348141084e-06, + "loss": 1.4905, + "step": 6410 + }, + { + "epoch": 2.8272366681357424, + "grad_norm": 0.3274079081069346, + "learning_rate": 2.013839344647217e-06, + "loss": 1.6808, + "step": 6415 + }, + { + "epoch": 2.8294402820625826, + "grad_norm": 0.34133335820544286, + "learning_rate": 1.962961198316937e-06, + "loss": 1.7414, + "step": 6420 + }, + { + "epoch": 2.831643895989423, + "grad_norm": 0.3234135423197038, + "learning_rate": 1.912727630339217e-06, + "loss": 1.4927, + "step": 6425 + }, + { + "epoch": 2.8338475099162626, + "grad_norm": 0.31099973650003665, + "learning_rate": 1.8631389709919843e-06, + "loss": 1.5605, + "step": 6430 + }, + { + "epoch": 2.8360511238431028, + "grad_norm": 0.37303627703284004, + "learning_rate": 1.8141955463129912e-06, + "loss": 1.6712, + "step": 6435 + }, + { + "epoch": 2.838254737769943, + "grad_norm": 0.3614598619091711, + "learning_rate": 1.7658976780976944e-06, + "loss": 1.7914, + "step": 6440 + }, + { + "epoch": 2.8404583516967827, + "grad_norm": 0.30969623455642703, + "learning_rate": 1.7182456838971016e-06, + "loss": 1.5793, + "step": 6445 + }, + { + "epoch": 2.8426619656236225, + "grad_norm": 0.26680083024728835, + "learning_rate": 1.6712398770156734e-06, + "loss": 1.5423, + "step": 6450 + }, + { + "epoch": 2.8448655795504627, + "grad_norm": 0.38620184585663864, + "learning_rate": 1.6248805665093348e-06, + "loss": 1.7361, + "step": 6455 + }, + { + "epoch": 2.847069193477303, + "grad_norm": 0.29245866381267194, + "learning_rate": 1.5791680571833667e-06, + "loss": 1.4591, + "step": 6460 + }, + { + "epoch": 2.8492728074041427, + "grad_norm": 0.36925991583350115, + "learning_rate": 1.5341026495904409e-06, + "loss": 1.5466, + "step": 6465 + }, + { + "epoch": 2.851476421330983, + "grad_norm": 0.31621129108601936, + "learning_rate": 1.4896846400286323e-06, + "loss": 1.5198, + "step": 6470 + }, + { + "epoch": 2.853680035257823, + "grad_norm": 0.3918256317744239, + "learning_rate": 1.4459143205394876e-06, + "loss": 1.8413, + "step": 6475 + }, + { + "epoch": 2.855883649184663, + "grad_norm": 0.3333872096143664, + "learning_rate": 1.4027919789060818e-06, + "loss": 1.6091, + "step": 6480 + }, + { + "epoch": 2.8580872631115026, + "grad_norm": 0.350383265332482, + "learning_rate": 1.36031789865112e-06, + "loss": 1.77, + "step": 6485 + }, + { + "epoch": 2.860290877038343, + "grad_norm": 0.32052781126282354, + "learning_rate": 1.3184923590351062e-06, + "loss": 1.6178, + "step": 6490 + }, + { + "epoch": 2.862494490965183, + "grad_norm": 0.33800257995746813, + "learning_rate": 1.27731563505451e-06, + "loss": 1.6759, + "step": 6495 + }, + { + "epoch": 2.864698104892023, + "grad_norm": 0.32607540150243636, + "learning_rate": 1.236787997439892e-06, + "loss": 1.5576, + "step": 6500 + }, + { + "epoch": 2.866901718818863, + "grad_norm": 0.32271754224482374, + "learning_rate": 1.196909712654204e-06, + "loss": 1.5769, + "step": 6505 + }, + { + "epoch": 2.8691053327457032, + "grad_norm": 0.32279106125213675, + "learning_rate": 1.1576810428910012e-06, + "loss": 1.4904, + "step": 6510 + }, + { + "epoch": 2.871308946672543, + "grad_norm": 0.33356280248337183, + "learning_rate": 1.1191022460727007e-06, + "loss": 1.5742, + "step": 6515 + }, + { + "epoch": 2.8735125605993828, + "grad_norm": 0.3151589561132343, + "learning_rate": 1.0811735758489372e-06, + "loss": 1.6439, + "step": 6520 + }, + { + "epoch": 2.875716174526223, + "grad_norm": 0.3317166675779296, + "learning_rate": 1.04389528159482e-06, + "loss": 1.4983, + "step": 6525 + }, + { + "epoch": 2.877919788453063, + "grad_norm": 0.3198594908928924, + "learning_rate": 1.0072676084093902e-06, + "loss": 1.6749, + "step": 6530 + }, + { + "epoch": 2.880123402379903, + "grad_norm": 0.2937946173545102, + "learning_rate": 9.712907971139218e-07, + "loss": 1.7593, + "step": 6535 + }, + { + "epoch": 2.882327016306743, + "grad_norm": 0.3493222702605276, + "learning_rate": 9.359650842503565e-07, + "loss": 1.737, + "step": 6540 + }, + { + "epoch": 2.884530630233583, + "grad_norm": 0.3038129813169421, + "learning_rate": 9.012907020798156e-07, + "loss": 1.6078, + "step": 6545 + }, + { + "epoch": 2.886734244160423, + "grad_norm": 0.3031344362125413, + "learning_rate": 8.672678785809796e-07, + "loss": 1.6788, + "step": 6550 + }, + { + "epoch": 2.888937858087263, + "grad_norm": 0.31295231167033, + "learning_rate": 8.338968374486555e-07, + "loss": 1.734, + "step": 6555 + }, + { + "epoch": 2.891141472014103, + "grad_norm": 0.30464503391760195, + "learning_rate": 8.011777980922564e-07, + "loss": 1.6216, + "step": 6560 + }, + { + "epoch": 2.8933450859409433, + "grad_norm": 0.31057203787361704, + "learning_rate": 7.691109756344128e-07, + "loss": 1.6683, + "step": 6565 + }, + { + "epoch": 2.895548699867783, + "grad_norm": 0.3406431085406218, + "learning_rate": 7.376965809095193e-07, + "loss": 1.7457, + "step": 6570 + }, + { + "epoch": 2.8977523137946233, + "grad_norm": 0.26328954802317356, + "learning_rate": 7.06934820462346e-07, + "loss": 1.6027, + "step": 6575 + }, + { + "epoch": 2.899955927721463, + "grad_norm": 0.3427927943612674, + "learning_rate": 6.768258965467289e-07, + "loss": 1.7368, + "step": 6580 + }, + { + "epoch": 2.9021595416483033, + "grad_norm": 0.3099689467854988, + "learning_rate": 6.473700071241484e-07, + "loss": 1.7899, + "step": 6585 + }, + { + "epoch": 2.904363155575143, + "grad_norm": 0.38502367800150844, + "learning_rate": 6.185673458625418e-07, + "loss": 1.732, + "step": 6590 + }, + { + "epoch": 2.9065667695019832, + "grad_norm": 0.3501339480817073, + "learning_rate": 5.904181021349375e-07, + "loss": 1.6615, + "step": 6595 + }, + { + "epoch": 2.9087703834288234, + "grad_norm": 0.28836814250088966, + "learning_rate": 5.629224610182671e-07, + "loss": 1.5576, + "step": 6600 + }, + { + "epoch": 2.910973997355663, + "grad_norm": 0.2999170211226527, + "learning_rate": 5.360806032920995e-07, + "loss": 1.6333, + "step": 6605 + }, + { + "epoch": 2.9131776112825034, + "grad_norm": 0.3369079119860257, + "learning_rate": 5.09892705437498e-07, + "loss": 1.4815, + "step": 6610 + }, + { + "epoch": 2.915381225209343, + "grad_norm": 0.30167843329688654, + "learning_rate": 4.843589396358427e-07, + "loss": 1.7719, + "step": 6615 + }, + { + "epoch": 2.9175848391361834, + "grad_norm": 0.3352146963211379, + "learning_rate": 4.5947947376767663e-07, + "loss": 1.8039, + "step": 6620 + }, + { + "epoch": 2.919788453063023, + "grad_norm": 0.31153697039563194, + "learning_rate": 4.3525447141165023e-07, + "loss": 1.4281, + "step": 6625 + }, + { + "epoch": 2.9219920669898634, + "grad_norm": 0.3021129933679619, + "learning_rate": 4.116840918434006e-07, + "loss": 1.7845, + "step": 6630 + }, + { + "epoch": 2.9241956809167036, + "grad_norm": 0.3038448101025625, + "learning_rate": 3.887684900345301e-07, + "loss": 1.6785, + "step": 6635 + }, + { + "epoch": 2.9263992948435433, + "grad_norm": 0.29571689111577726, + "learning_rate": 3.665078166515623e-07, + "loss": 1.5903, + "step": 6640 + }, + { + "epoch": 2.9286029087703835, + "grad_norm": 0.3420405614774203, + "learning_rate": 3.449022180549766e-07, + "loss": 1.7721, + "step": 6645 + }, + { + "epoch": 2.9308065226972233, + "grad_norm": 0.28585767479477975, + "learning_rate": 3.2395183629824186e-07, + "loss": 1.6843, + "step": 6650 + }, + { + "epoch": 2.9330101366240635, + "grad_norm": 0.2974354225013896, + "learning_rate": 3.0365680912688434e-07, + "loss": 1.557, + "step": 6655 + }, + { + "epoch": 2.9352137505509033, + "grad_norm": 0.3294613096996469, + "learning_rate": 2.840172699775656e-07, + "loss": 1.5854, + "step": 6660 + }, + { + "epoch": 2.9374173644777435, + "grad_norm": 0.2754088803888805, + "learning_rate": 2.650333479771949e-07, + "loss": 1.5721, + "step": 6665 + }, + { + "epoch": 2.9396209784045837, + "grad_norm": 0.2983838481391387, + "learning_rate": 2.467051679421406e-07, + "loss": 1.6993, + "step": 6670 + }, + { + "epoch": 2.9418245923314235, + "grad_norm": 0.31155141036980233, + "learning_rate": 2.290328503773309e-07, + "loss": 1.687, + "step": 6675 + }, + { + "epoch": 2.9440282062582637, + "grad_norm": 0.29451801544591716, + "learning_rate": 2.1201651147554347e-07, + "loss": 1.7157, + "step": 6680 + }, + { + "epoch": 2.9462318201851034, + "grad_norm": 0.3316660290657273, + "learning_rate": 1.956562631165504e-07, + "loss": 1.6958, + "step": 6685 + }, + { + "epoch": 2.9484354341119436, + "grad_norm": 0.37499704382786475, + "learning_rate": 1.7995221286645215e-07, + "loss": 1.7061, + "step": 6690 + }, + { + "epoch": 2.9506390480387834, + "grad_norm": 0.21043643461218126, + "learning_rate": 1.6490446397696702e-07, + "loss": 1.3901, + "step": 6695 + }, + { + "epoch": 2.9528426619656236, + "grad_norm": 0.3260933032444566, + "learning_rate": 1.5051311538469837e-07, + "loss": 1.6567, + "step": 6700 + }, + { + "epoch": 2.955046275892464, + "grad_norm": 0.3164050584909926, + "learning_rate": 1.367782617105351e-07, + "loss": 1.6661, + "step": 6705 + }, + { + "epoch": 2.9572498898193036, + "grad_norm": 0.3495591454896855, + "learning_rate": 1.2369999325901881e-07, + "loss": 1.6197, + "step": 6710 + }, + { + "epoch": 2.959453503746144, + "grad_norm": 0.3351019168716464, + "learning_rate": 1.1127839601774437e-07, + "loss": 1.6162, + "step": 6715 + }, + { + "epoch": 2.9616571176729836, + "grad_norm": 0.34641497402663024, + "learning_rate": 9.951355165678244e-08, + "loss": 1.7908, + "step": 6720 + }, + { + "epoch": 2.9638607315998238, + "grad_norm": 0.37716614486780947, + "learning_rate": 8.840553752815783e-08, + "loss": 1.6302, + "step": 6725 + }, + { + "epoch": 2.9660643455266635, + "grad_norm": 0.3039829114023131, + "learning_rate": 7.79544266653609e-08, + "loss": 1.7006, + "step": 6730 + }, + { + "epoch": 2.9682679594535037, + "grad_norm": 0.31609676417425303, + "learning_rate": 6.816028778281469e-08, + "loss": 1.6702, + "step": 6735 + }, + { + "epoch": 2.970471573380344, + "grad_norm": 0.37777667052819425, + "learning_rate": 5.902318527547523e-08, + "loss": 1.4444, + "step": 6740 + }, + { + "epoch": 2.9726751873071837, + "grad_norm": 0.36388069806115425, + "learning_rate": 5.0543179218365265e-08, + "loss": 1.5438, + "step": 6745 + }, + { + "epoch": 2.974878801234024, + "grad_norm": 0.362171321908763, + "learning_rate": 4.272032536621895e-08, + "loss": 1.7638, + "step": 6750 + }, + { + "epoch": 2.9770824151608637, + "grad_norm": 0.3560152063618098, + "learning_rate": 3.5554675153082195e-08, + "loss": 1.6422, + "step": 6755 + }, + { + "epoch": 2.979286029087704, + "grad_norm": 0.2934857454303967, + "learning_rate": 2.9046275692012904e-08, + "loss": 1.5529, + "step": 6760 + }, + { + "epoch": 2.9814896430145437, + "grad_norm": 0.31059161987377404, + "learning_rate": 2.3195169774714586e-08, + "loss": 1.5975, + "step": 6765 + }, + { + "epoch": 2.983693256941384, + "grad_norm": 0.4364323493515609, + "learning_rate": 1.8001395871303228e-08, + "loss": 1.7395, + "step": 6770 + }, + { + "epoch": 2.985896870868224, + "grad_norm": 0.3199314191111005, + "learning_rate": 1.3464988130051925e-08, + "loss": 1.5351, + "step": 6775 + }, + { + "epoch": 2.988100484795064, + "grad_norm": 0.2946914049828253, + "learning_rate": 9.585976377124439e-09, + "loss": 1.519, + "step": 6780 + }, + { + "epoch": 2.990304098721904, + "grad_norm": 0.2758574818813751, + "learning_rate": 6.364386116419762e-09, + "loss": 1.4758, + "step": 6785 + }, + { + "epoch": 2.992507712648744, + "grad_norm": 0.329124521705272, + "learning_rate": 3.800238529416688e-09, + "loss": 1.8557, + "step": 6790 + }, + { + "epoch": 2.994711326575584, + "grad_norm": 0.3394680059091016, + "learning_rate": 1.8935504749628684e-09, + "loss": 1.7226, + "step": 6795 + }, + { + "epoch": 2.996914940502424, + "grad_norm": 0.27640804395611795, + "learning_rate": 6.443344892637093e-10, + "loss": 1.6186, + "step": 6800 + }, + { + "epoch": 2.999118554429264, + "grad_norm": 0.3138822790633703, + "learning_rate": 5.259878569363608e-11, + "loss": 1.6958, + "step": 6805 + }, + { + "epoch": 3.0, + "step": 6807, + "total_flos": 1.620091928969216e+16, + "train_loss": 1.72514724640379, + "train_runtime": 38823.0414, + "train_samples_per_second": 0.701, + "train_steps_per_second": 0.175 + } + ], + "logging_steps": 5, + "max_steps": 6807, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.620091928969216e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}