diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12832 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999452264884702, + "eval_steps": 500, + "global_step": 9128, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010954702305964835, + "grad_norm": 11.6875, + "learning_rate": 2.190580503833516e-08, + "loss": 1.6695, + "step": 1 + }, + { + "epoch": 0.0005477351152982418, + "grad_norm": 10.5, + "learning_rate": 1.095290251916758e-07, + "loss": 1.7344, + "step": 5 + }, + { + "epoch": 0.0010954702305964836, + "grad_norm": 13.1875, + "learning_rate": 2.190580503833516e-07, + "loss": 1.6283, + "step": 10 + }, + { + "epoch": 0.0016432053458947254, + "grad_norm": 8.5625, + "learning_rate": 3.285870755750274e-07, + "loss": 1.6588, + "step": 15 + }, + { + "epoch": 0.0021909404611929672, + "grad_norm": 9.4375, + "learning_rate": 4.381161007667032e-07, + "loss": 1.6831, + "step": 20 + }, + { + "epoch": 0.002738675576491209, + "grad_norm": 8.1875, + "learning_rate": 5.47645125958379e-07, + "loss": 1.6779, + "step": 25 + }, + { + "epoch": 0.003286410691789451, + "grad_norm": 11.25, + "learning_rate": 6.571741511500548e-07, + "loss": 1.7032, + "step": 30 + }, + { + "epoch": 0.0038341458070876924, + "grad_norm": 9.125, + "learning_rate": 7.667031763417306e-07, + "loss": 1.6961, + "step": 35 + }, + { + "epoch": 0.0043818809223859344, + "grad_norm": 9.5625, + "learning_rate": 8.762322015334064e-07, + "loss": 1.6005, + "step": 40 + }, + { + "epoch": 0.004929616037684176, + "grad_norm": 11.4375, + "learning_rate": 9.857612267250823e-07, + "loss": 1.6734, + "step": 45 + }, + { + "epoch": 0.005477351152982418, + "grad_norm": 8.5625, + "learning_rate": 1.095290251916758e-06, + "loss": 1.6523, + "step": 50 + }, + { + "epoch": 0.00602508626828066, + "grad_norm": 8.0625, + "learning_rate": 1.2048192771084338e-06, + "loss": 1.6274, + "step": 55 + }, + { + "epoch": 0.006572821383578902, + "grad_norm": 9.25, + "learning_rate": 1.3143483023001096e-06, + "loss": 1.6391, + "step": 60 + }, + { + "epoch": 0.007120556498877143, + "grad_norm": 7.09375, + "learning_rate": 1.4238773274917855e-06, + "loss": 1.6723, + "step": 65 + }, + { + "epoch": 0.007668291614175385, + "grad_norm": 8.75, + "learning_rate": 1.5334063526834611e-06, + "loss": 1.616, + "step": 70 + }, + { + "epoch": 0.008216026729473627, + "grad_norm": 9.5, + "learning_rate": 1.642935377875137e-06, + "loss": 1.6565, + "step": 75 + }, + { + "epoch": 0.008763761844771869, + "grad_norm": 8.875, + "learning_rate": 1.7524644030668128e-06, + "loss": 1.5556, + "step": 80 + }, + { + "epoch": 0.009311496960070111, + "grad_norm": 7.4375, + "learning_rate": 1.8619934282584886e-06, + "loss": 1.5898, + "step": 85 + }, + { + "epoch": 0.009859232075368351, + "grad_norm": 8.75, + "learning_rate": 1.9715224534501647e-06, + "loss": 1.5372, + "step": 90 + }, + { + "epoch": 0.010406967190666593, + "grad_norm": 7.15625, + "learning_rate": 2.0810514786418403e-06, + "loss": 1.5206, + "step": 95 + }, + { + "epoch": 0.010954702305964835, + "grad_norm": 5.65625, + "learning_rate": 2.190580503833516e-06, + "loss": 1.5475, + "step": 100 + }, + { + "epoch": 0.011502437421263077, + "grad_norm": 7.03125, + "learning_rate": 2.300109529025192e-06, + "loss": 1.5011, + "step": 105 + }, + { + "epoch": 0.01205017253656132, + "grad_norm": 4.1875, + "learning_rate": 2.4096385542168676e-06, + "loss": 1.4805, + "step": 110 + }, + { + "epoch": 0.012597907651859561, + "grad_norm": 4.5625, + "learning_rate": 2.5191675794085437e-06, + "loss": 1.4625, + "step": 115 + }, + { + "epoch": 0.013145642767157803, + "grad_norm": 4.25, + "learning_rate": 2.6286966046002193e-06, + "loss": 1.4329, + "step": 120 + }, + { + "epoch": 0.013693377882456044, + "grad_norm": 3.140625, + "learning_rate": 2.7382256297918953e-06, + "loss": 1.4614, + "step": 125 + }, + { + "epoch": 0.014241112997754286, + "grad_norm": 2.890625, + "learning_rate": 2.847754654983571e-06, + "loss": 1.43, + "step": 130 + }, + { + "epoch": 0.014788848113052528, + "grad_norm": 2.96875, + "learning_rate": 2.957283680175247e-06, + "loss": 1.4254, + "step": 135 + }, + { + "epoch": 0.01533658322835077, + "grad_norm": 2.375, + "learning_rate": 3.0668127053669222e-06, + "loss": 1.3846, + "step": 140 + }, + { + "epoch": 0.01588431834364901, + "grad_norm": 2.109375, + "learning_rate": 3.1763417305585983e-06, + "loss": 1.3968, + "step": 145 + }, + { + "epoch": 0.016432053458947254, + "grad_norm": 2.015625, + "learning_rate": 3.285870755750274e-06, + "loss": 1.3592, + "step": 150 + }, + { + "epoch": 0.016979788574245494, + "grad_norm": 3.03125, + "learning_rate": 3.39539978094195e-06, + "loss": 1.3269, + "step": 155 + }, + { + "epoch": 0.017527523689543738, + "grad_norm": 1.8359375, + "learning_rate": 3.5049288061336256e-06, + "loss": 1.3384, + "step": 160 + }, + { + "epoch": 0.018075258804841978, + "grad_norm": 1.734375, + "learning_rate": 3.6144578313253016e-06, + "loss": 1.3149, + "step": 165 + }, + { + "epoch": 0.018622993920140222, + "grad_norm": 1.5703125, + "learning_rate": 3.7239868565169773e-06, + "loss": 1.331, + "step": 170 + }, + { + "epoch": 0.019170729035438462, + "grad_norm": 1.78125, + "learning_rate": 3.833515881708653e-06, + "loss": 1.3452, + "step": 175 + }, + { + "epoch": 0.019718464150736702, + "grad_norm": 2.078125, + "learning_rate": 3.943044906900329e-06, + "loss": 1.2923, + "step": 180 + }, + { + "epoch": 0.020266199266034946, + "grad_norm": 1.734375, + "learning_rate": 4.0525739320920046e-06, + "loss": 1.351, + "step": 185 + }, + { + "epoch": 0.020813934381333186, + "grad_norm": 1.5078125, + "learning_rate": 4.162102957283681e-06, + "loss": 1.2837, + "step": 190 + }, + { + "epoch": 0.02136166949663143, + "grad_norm": 1.5625, + "learning_rate": 4.271631982475356e-06, + "loss": 1.2891, + "step": 195 + }, + { + "epoch": 0.02190940461192967, + "grad_norm": 1.5078125, + "learning_rate": 4.381161007667032e-06, + "loss": 1.3041, + "step": 200 + }, + { + "epoch": 0.022457139727227914, + "grad_norm": 1.625, + "learning_rate": 4.490690032858708e-06, + "loss": 1.2525, + "step": 205 + }, + { + "epoch": 0.023004874842526155, + "grad_norm": 1.609375, + "learning_rate": 4.600219058050384e-06, + "loss": 1.3449, + "step": 210 + }, + { + "epoch": 0.023552609957824395, + "grad_norm": 1.453125, + "learning_rate": 4.709748083242059e-06, + "loss": 1.2768, + "step": 215 + }, + { + "epoch": 0.02410034507312264, + "grad_norm": 1.5078125, + "learning_rate": 4.819277108433735e-06, + "loss": 1.28, + "step": 220 + }, + { + "epoch": 0.02464808018842088, + "grad_norm": 1.46875, + "learning_rate": 4.928806133625411e-06, + "loss": 1.2782, + "step": 225 + }, + { + "epoch": 0.025195815303719123, + "grad_norm": 1.6953125, + "learning_rate": 5.038335158817087e-06, + "loss": 1.2688, + "step": 230 + }, + { + "epoch": 0.025743550419017363, + "grad_norm": 1.421875, + "learning_rate": 5.1478641840087625e-06, + "loss": 1.2547, + "step": 235 + }, + { + "epoch": 0.026291285534315607, + "grad_norm": 1.5703125, + "learning_rate": 5.257393209200439e-06, + "loss": 1.2791, + "step": 240 + }, + { + "epoch": 0.026839020649613847, + "grad_norm": 1.5546875, + "learning_rate": 5.366922234392115e-06, + "loss": 1.2681, + "step": 245 + }, + { + "epoch": 0.027386755764912087, + "grad_norm": 1.671875, + "learning_rate": 5.476451259583791e-06, + "loss": 1.2936, + "step": 250 + }, + { + "epoch": 0.02793449088021033, + "grad_norm": 1.5234375, + "learning_rate": 5.585980284775466e-06, + "loss": 1.2555, + "step": 255 + }, + { + "epoch": 0.02848222599550857, + "grad_norm": 1.59375, + "learning_rate": 5.695509309967142e-06, + "loss": 1.2712, + "step": 260 + }, + { + "epoch": 0.029029961110806815, + "grad_norm": 1.59375, + "learning_rate": 5.805038335158818e-06, + "loss": 1.2957, + "step": 265 + }, + { + "epoch": 0.029577696226105055, + "grad_norm": 1.5546875, + "learning_rate": 5.914567360350494e-06, + "loss": 1.2894, + "step": 270 + }, + { + "epoch": 0.030125431341403296, + "grad_norm": 1.578125, + "learning_rate": 6.02409638554217e-06, + "loss": 1.2609, + "step": 275 + }, + { + "epoch": 0.03067316645670154, + "grad_norm": 1.609375, + "learning_rate": 6.1336254107338444e-06, + "loss": 1.2343, + "step": 280 + }, + { + "epoch": 0.03122090157199978, + "grad_norm": 1.703125, + "learning_rate": 6.2431544359255205e-06, + "loss": 1.2609, + "step": 285 + }, + { + "epoch": 0.03176863668729802, + "grad_norm": 1.4140625, + "learning_rate": 6.3526834611171965e-06, + "loss": 1.228, + "step": 290 + }, + { + "epoch": 0.032316371802596264, + "grad_norm": 1.5078125, + "learning_rate": 6.462212486308872e-06, + "loss": 1.2499, + "step": 295 + }, + { + "epoch": 0.03286410691789451, + "grad_norm": 1.5546875, + "learning_rate": 6.571741511500548e-06, + "loss": 1.2826, + "step": 300 + }, + { + "epoch": 0.03341184203319275, + "grad_norm": 1.5234375, + "learning_rate": 6.681270536692224e-06, + "loss": 1.2581, + "step": 305 + }, + { + "epoch": 0.03395957714849099, + "grad_norm": 1.7734375, + "learning_rate": 6.7907995618839e-06, + "loss": 1.2385, + "step": 310 + }, + { + "epoch": 0.03450731226378923, + "grad_norm": 1.7734375, + "learning_rate": 6.900328587075575e-06, + "loss": 1.2487, + "step": 315 + }, + { + "epoch": 0.035055047379087476, + "grad_norm": 1.7109375, + "learning_rate": 7.009857612267251e-06, + "loss": 1.2359, + "step": 320 + }, + { + "epoch": 0.03560278249438571, + "grad_norm": 1.734375, + "learning_rate": 7.119386637458927e-06, + "loss": 1.2917, + "step": 325 + }, + { + "epoch": 0.036150517609683956, + "grad_norm": 1.6484375, + "learning_rate": 7.228915662650603e-06, + "loss": 1.2511, + "step": 330 + }, + { + "epoch": 0.0366982527249822, + "grad_norm": 1.5234375, + "learning_rate": 7.3384446878422785e-06, + "loss": 1.2733, + "step": 335 + }, + { + "epoch": 0.037245987840280444, + "grad_norm": 1.421875, + "learning_rate": 7.4479737130339545e-06, + "loss": 1.25, + "step": 340 + }, + { + "epoch": 0.03779372295557868, + "grad_norm": 1.453125, + "learning_rate": 7.5575027382256306e-06, + "loss": 1.2793, + "step": 345 + }, + { + "epoch": 0.038341458070876924, + "grad_norm": 1.515625, + "learning_rate": 7.667031763417307e-06, + "loss": 1.2786, + "step": 350 + }, + { + "epoch": 0.03888919318617517, + "grad_norm": 1.5, + "learning_rate": 7.776560788608982e-06, + "loss": 1.2847, + "step": 355 + }, + { + "epoch": 0.039436928301473405, + "grad_norm": 1.4453125, + "learning_rate": 7.886089813800659e-06, + "loss": 1.3118, + "step": 360 + }, + { + "epoch": 0.03998466341677165, + "grad_norm": 1.4921875, + "learning_rate": 7.995618838992334e-06, + "loss": 1.2672, + "step": 365 + }, + { + "epoch": 0.04053239853206989, + "grad_norm": 1.484375, + "learning_rate": 8.105147864184009e-06, + "loss": 1.2752, + "step": 370 + }, + { + "epoch": 0.041080133647368136, + "grad_norm": 1.515625, + "learning_rate": 8.214676889375686e-06, + "loss": 1.2129, + "step": 375 + }, + { + "epoch": 0.04162786876266637, + "grad_norm": 1.46875, + "learning_rate": 8.324205914567361e-06, + "loss": 1.2755, + "step": 380 + }, + { + "epoch": 0.04217560387796462, + "grad_norm": 1.4140625, + "learning_rate": 8.433734939759038e-06, + "loss": 1.1983, + "step": 385 + }, + { + "epoch": 0.04272333899326286, + "grad_norm": 2.3125, + "learning_rate": 8.543263964950712e-06, + "loss": 1.2275, + "step": 390 + }, + { + "epoch": 0.0432710741085611, + "grad_norm": 1.390625, + "learning_rate": 8.652792990142389e-06, + "loss": 1.1843, + "step": 395 + }, + { + "epoch": 0.04381880922385934, + "grad_norm": 1.59375, + "learning_rate": 8.762322015334064e-06, + "loss": 1.2471, + "step": 400 + }, + { + "epoch": 0.044366544339157585, + "grad_norm": 1.5, + "learning_rate": 8.871851040525739e-06, + "loss": 1.2367, + "step": 405 + }, + { + "epoch": 0.04491427945445583, + "grad_norm": 1.53125, + "learning_rate": 8.981380065717416e-06, + "loss": 1.2504, + "step": 410 + }, + { + "epoch": 0.045462014569754065, + "grad_norm": 1.53125, + "learning_rate": 9.090909090909091e-06, + "loss": 1.3088, + "step": 415 + }, + { + "epoch": 0.04600974968505231, + "grad_norm": 1.4453125, + "learning_rate": 9.200438116100768e-06, + "loss": 1.2398, + "step": 420 + }, + { + "epoch": 0.04655748480035055, + "grad_norm": 1.484375, + "learning_rate": 9.309967141292443e-06, + "loss": 1.2922, + "step": 425 + }, + { + "epoch": 0.04710521991564879, + "grad_norm": 1.5234375, + "learning_rate": 9.419496166484118e-06, + "loss": 1.2491, + "step": 430 + }, + { + "epoch": 0.04765295503094703, + "grad_norm": 1.5078125, + "learning_rate": 9.529025191675795e-06, + "loss": 1.1853, + "step": 435 + }, + { + "epoch": 0.04820069014624528, + "grad_norm": 1.59375, + "learning_rate": 9.63855421686747e-06, + "loss": 1.2793, + "step": 440 + }, + { + "epoch": 0.04874842526154352, + "grad_norm": 1.4765625, + "learning_rate": 9.748083242059146e-06, + "loss": 1.2385, + "step": 445 + }, + { + "epoch": 0.04929616037684176, + "grad_norm": 1.4921875, + "learning_rate": 9.857612267250823e-06, + "loss": 1.2048, + "step": 450 + }, + { + "epoch": 0.04984389549214, + "grad_norm": 1.4375, + "learning_rate": 9.967141292442498e-06, + "loss": 1.208, + "step": 455 + }, + { + "epoch": 0.050391630607438245, + "grad_norm": 1.40625, + "learning_rate": 1.0076670317634175e-05, + "loss": 1.2326, + "step": 460 + }, + { + "epoch": 0.05093936572273648, + "grad_norm": 1.390625, + "learning_rate": 1.0186199342825848e-05, + "loss": 1.195, + "step": 465 + }, + { + "epoch": 0.051487100838034726, + "grad_norm": 1.3515625, + "learning_rate": 1.0295728368017525e-05, + "loss": 1.1941, + "step": 470 + }, + { + "epoch": 0.05203483595333297, + "grad_norm": 1.4453125, + "learning_rate": 1.04052573932092e-05, + "loss": 1.2478, + "step": 475 + }, + { + "epoch": 0.05258257106863121, + "grad_norm": 1.515625, + "learning_rate": 1.0514786418400877e-05, + "loss": 1.2514, + "step": 480 + }, + { + "epoch": 0.05313030618392945, + "grad_norm": 1.5859375, + "learning_rate": 1.0624315443592552e-05, + "loss": 1.2535, + "step": 485 + }, + { + "epoch": 0.053678041299227694, + "grad_norm": 1.40625, + "learning_rate": 1.073384446878423e-05, + "loss": 1.232, + "step": 490 + }, + { + "epoch": 0.05422577641452594, + "grad_norm": 1.6796875, + "learning_rate": 1.0843373493975904e-05, + "loss": 1.3235, + "step": 495 + }, + { + "epoch": 0.054773511529824175, + "grad_norm": 1.484375, + "learning_rate": 1.0952902519167581e-05, + "loss": 1.2356, + "step": 500 + }, + { + "epoch": 0.05532124664512242, + "grad_norm": 1.4609375, + "learning_rate": 1.1062431544359255e-05, + "loss": 1.1983, + "step": 505 + }, + { + "epoch": 0.05586898176042066, + "grad_norm": 1.46875, + "learning_rate": 1.1171960569550932e-05, + "loss": 1.2143, + "step": 510 + }, + { + "epoch": 0.0564167168757189, + "grad_norm": 1.515625, + "learning_rate": 1.1281489594742607e-05, + "loss": 1.2818, + "step": 515 + }, + { + "epoch": 0.05696445199101714, + "grad_norm": 1.5078125, + "learning_rate": 1.1391018619934284e-05, + "loss": 1.1754, + "step": 520 + }, + { + "epoch": 0.057512187106315386, + "grad_norm": 1.3828125, + "learning_rate": 1.1500547645125959e-05, + "loss": 1.2255, + "step": 525 + }, + { + "epoch": 0.05805992222161363, + "grad_norm": 1.4609375, + "learning_rate": 1.1610076670317636e-05, + "loss": 1.162, + "step": 530 + }, + { + "epoch": 0.05860765733691187, + "grad_norm": 1.65625, + "learning_rate": 1.1719605695509311e-05, + "loss": 1.2283, + "step": 535 + }, + { + "epoch": 0.05915539245221011, + "grad_norm": 1.5, + "learning_rate": 1.1829134720700988e-05, + "loss": 1.2758, + "step": 540 + }, + { + "epoch": 0.059703127567508354, + "grad_norm": 1.4453125, + "learning_rate": 1.1938663745892662e-05, + "loss": 1.2252, + "step": 545 + }, + { + "epoch": 0.06025086268280659, + "grad_norm": 1.421875, + "learning_rate": 1.204819277108434e-05, + "loss": 1.2171, + "step": 550 + }, + { + "epoch": 0.060798597798104835, + "grad_norm": 1.5078125, + "learning_rate": 1.2157721796276014e-05, + "loss": 1.2346, + "step": 555 + }, + { + "epoch": 0.06134633291340308, + "grad_norm": 1.34375, + "learning_rate": 1.2267250821467689e-05, + "loss": 1.2295, + "step": 560 + }, + { + "epoch": 0.06189406802870132, + "grad_norm": 1.4609375, + "learning_rate": 1.2376779846659366e-05, + "loss": 1.2068, + "step": 565 + }, + { + "epoch": 0.06244180314399956, + "grad_norm": 1.4140625, + "learning_rate": 1.2486308871851041e-05, + "loss": 1.1862, + "step": 570 + }, + { + "epoch": 0.0629895382592978, + "grad_norm": 1.390625, + "learning_rate": 1.2595837897042718e-05, + "loss": 1.219, + "step": 575 + }, + { + "epoch": 0.06353727337459604, + "grad_norm": 1.3984375, + "learning_rate": 1.2705366922234393e-05, + "loss": 1.2089, + "step": 580 + }, + { + "epoch": 0.06408500848989429, + "grad_norm": 1.4765625, + "learning_rate": 1.281489594742607e-05, + "loss": 1.2738, + "step": 585 + }, + { + "epoch": 0.06463274360519253, + "grad_norm": 1.421875, + "learning_rate": 1.2924424972617743e-05, + "loss": 1.2507, + "step": 590 + }, + { + "epoch": 0.06518047872049076, + "grad_norm": 1.421875, + "learning_rate": 1.303395399780942e-05, + "loss": 1.2554, + "step": 595 + }, + { + "epoch": 0.06572821383578901, + "grad_norm": 1.4453125, + "learning_rate": 1.3143483023001096e-05, + "loss": 1.2269, + "step": 600 + }, + { + "epoch": 0.06627594895108725, + "grad_norm": 1.3359375, + "learning_rate": 1.3253012048192772e-05, + "loss": 1.2278, + "step": 605 + }, + { + "epoch": 0.0668236840663855, + "grad_norm": 1.4453125, + "learning_rate": 1.3362541073384448e-05, + "loss": 1.197, + "step": 610 + }, + { + "epoch": 0.06737141918168374, + "grad_norm": 1.484375, + "learning_rate": 1.3472070098576125e-05, + "loss": 1.2462, + "step": 615 + }, + { + "epoch": 0.06791915429698198, + "grad_norm": 1.4453125, + "learning_rate": 1.35815991237678e-05, + "loss": 1.1692, + "step": 620 + }, + { + "epoch": 0.06846688941228023, + "grad_norm": 1.3828125, + "learning_rate": 1.3691128148959477e-05, + "loss": 1.2331, + "step": 625 + }, + { + "epoch": 0.06901462452757846, + "grad_norm": 1.375, + "learning_rate": 1.380065717415115e-05, + "loss": 1.2679, + "step": 630 + }, + { + "epoch": 0.0695623596428767, + "grad_norm": 1.4140625, + "learning_rate": 1.3910186199342827e-05, + "loss": 1.212, + "step": 635 + }, + { + "epoch": 0.07011009475817495, + "grad_norm": 1.375, + "learning_rate": 1.4019715224534502e-05, + "loss": 1.2085, + "step": 640 + }, + { + "epoch": 0.07065782987347319, + "grad_norm": 1.3828125, + "learning_rate": 1.412924424972618e-05, + "loss": 1.2214, + "step": 645 + }, + { + "epoch": 0.07120556498877142, + "grad_norm": 1.375, + "learning_rate": 1.4238773274917854e-05, + "loss": 1.2937, + "step": 650 + }, + { + "epoch": 0.07175330010406968, + "grad_norm": 1.4375, + "learning_rate": 1.4348302300109531e-05, + "loss": 1.2745, + "step": 655 + }, + { + "epoch": 0.07230103521936791, + "grad_norm": 1.359375, + "learning_rate": 1.4457831325301207e-05, + "loss": 1.1891, + "step": 660 + }, + { + "epoch": 0.07284877033466615, + "grad_norm": 1.3203125, + "learning_rate": 1.4567360350492883e-05, + "loss": 1.2059, + "step": 665 + }, + { + "epoch": 0.0733965054499644, + "grad_norm": 1.359375, + "learning_rate": 1.4676889375684557e-05, + "loss": 1.2321, + "step": 670 + }, + { + "epoch": 0.07394424056526264, + "grad_norm": 1.375, + "learning_rate": 1.4786418400876232e-05, + "loss": 1.2003, + "step": 675 + }, + { + "epoch": 0.07449197568056089, + "grad_norm": 1.40625, + "learning_rate": 1.4895947426067909e-05, + "loss": 1.2113, + "step": 680 + }, + { + "epoch": 0.07503971079585912, + "grad_norm": 1.34375, + "learning_rate": 1.5005476451259584e-05, + "loss": 1.2612, + "step": 685 + }, + { + "epoch": 0.07558744591115736, + "grad_norm": 1.328125, + "learning_rate": 1.5115005476451261e-05, + "loss": 1.2022, + "step": 690 + }, + { + "epoch": 0.07613518102645561, + "grad_norm": 1.421875, + "learning_rate": 1.5224534501642936e-05, + "loss": 1.1903, + "step": 695 + }, + { + "epoch": 0.07668291614175385, + "grad_norm": 1.3828125, + "learning_rate": 1.5334063526834613e-05, + "loss": 1.1864, + "step": 700 + }, + { + "epoch": 0.07723065125705209, + "grad_norm": 1.390625, + "learning_rate": 1.5443592552026287e-05, + "loss": 1.1988, + "step": 705 + }, + { + "epoch": 0.07777838637235034, + "grad_norm": 1.34375, + "learning_rate": 1.5553121577217964e-05, + "loss": 1.2798, + "step": 710 + }, + { + "epoch": 0.07832612148764857, + "grad_norm": 1.359375, + "learning_rate": 1.566265060240964e-05, + "loss": 1.2141, + "step": 715 + }, + { + "epoch": 0.07887385660294681, + "grad_norm": 1.46875, + "learning_rate": 1.5772179627601317e-05, + "loss": 1.265, + "step": 720 + }, + { + "epoch": 0.07942159171824506, + "grad_norm": 1.3515625, + "learning_rate": 1.588170865279299e-05, + "loss": 1.2063, + "step": 725 + }, + { + "epoch": 0.0799693268335433, + "grad_norm": 1.3828125, + "learning_rate": 1.5991237677984668e-05, + "loss": 1.192, + "step": 730 + }, + { + "epoch": 0.08051706194884153, + "grad_norm": 1.3828125, + "learning_rate": 1.610076670317634e-05, + "loss": 1.2135, + "step": 735 + }, + { + "epoch": 0.08106479706413978, + "grad_norm": 1.3203125, + "learning_rate": 1.6210295728368018e-05, + "loss": 1.1986, + "step": 740 + }, + { + "epoch": 0.08161253217943802, + "grad_norm": 1.25, + "learning_rate": 1.6319824753559695e-05, + "loss": 1.2055, + "step": 745 + }, + { + "epoch": 0.08216026729473627, + "grad_norm": 1.3359375, + "learning_rate": 1.6429353778751372e-05, + "loss": 1.2332, + "step": 750 + }, + { + "epoch": 0.08270800241003451, + "grad_norm": 1.375, + "learning_rate": 1.6538882803943046e-05, + "loss": 1.2425, + "step": 755 + }, + { + "epoch": 0.08325573752533275, + "grad_norm": 1.3046875, + "learning_rate": 1.6648411829134722e-05, + "loss": 1.2201, + "step": 760 + }, + { + "epoch": 0.083803472640631, + "grad_norm": 1.3515625, + "learning_rate": 1.67579408543264e-05, + "loss": 1.2385, + "step": 765 + }, + { + "epoch": 0.08435120775592923, + "grad_norm": 1.3125, + "learning_rate": 1.6867469879518076e-05, + "loss": 1.1299, + "step": 770 + }, + { + "epoch": 0.08489894287122747, + "grad_norm": 1.3515625, + "learning_rate": 1.697699890470975e-05, + "loss": 1.212, + "step": 775 + }, + { + "epoch": 0.08544667798652572, + "grad_norm": 1.3515625, + "learning_rate": 1.7086527929901423e-05, + "loss": 1.2242, + "step": 780 + }, + { + "epoch": 0.08599441310182396, + "grad_norm": 1.46875, + "learning_rate": 1.71960569550931e-05, + "loss": 1.2346, + "step": 785 + }, + { + "epoch": 0.0865421482171222, + "grad_norm": 1.328125, + "learning_rate": 1.7305585980284777e-05, + "loss": 1.2316, + "step": 790 + }, + { + "epoch": 0.08708988333242045, + "grad_norm": 1.3359375, + "learning_rate": 1.7415115005476454e-05, + "loss": 1.1931, + "step": 795 + }, + { + "epoch": 0.08763761844771868, + "grad_norm": 1.328125, + "learning_rate": 1.7524644030668127e-05, + "loss": 1.2037, + "step": 800 + }, + { + "epoch": 0.08818535356301692, + "grad_norm": 1.375, + "learning_rate": 1.7634173055859804e-05, + "loss": 1.2317, + "step": 805 + }, + { + "epoch": 0.08873308867831517, + "grad_norm": 1.3046875, + "learning_rate": 1.7743702081051478e-05, + "loss": 1.2425, + "step": 810 + }, + { + "epoch": 0.0892808237936134, + "grad_norm": 1.328125, + "learning_rate": 1.7853231106243155e-05, + "loss": 1.2056, + "step": 815 + }, + { + "epoch": 0.08982855890891166, + "grad_norm": 1.3046875, + "learning_rate": 1.796276013143483e-05, + "loss": 1.1954, + "step": 820 + }, + { + "epoch": 0.0903762940242099, + "grad_norm": 1.3203125, + "learning_rate": 1.807228915662651e-05, + "loss": 1.2034, + "step": 825 + }, + { + "epoch": 0.09092402913950813, + "grad_norm": 1.3828125, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.1815, + "step": 830 + }, + { + "epoch": 0.09147176425480638, + "grad_norm": 1.328125, + "learning_rate": 1.829134720700986e-05, + "loss": 1.2272, + "step": 835 + }, + { + "epoch": 0.09201949937010462, + "grad_norm": 1.2890625, + "learning_rate": 1.8400876232201536e-05, + "loss": 1.2151, + "step": 840 + }, + { + "epoch": 0.09256723448540286, + "grad_norm": 1.296875, + "learning_rate": 1.8510405257393213e-05, + "loss": 1.2056, + "step": 845 + }, + { + "epoch": 0.0931149696007011, + "grad_norm": 1.34375, + "learning_rate": 1.8619934282584886e-05, + "loss": 1.2219, + "step": 850 + }, + { + "epoch": 0.09366270471599934, + "grad_norm": 1.640625, + "learning_rate": 1.8729463307776563e-05, + "loss": 1.1675, + "step": 855 + }, + { + "epoch": 0.09421043983129758, + "grad_norm": 1.2890625, + "learning_rate": 1.8838992332968237e-05, + "loss": 1.2177, + "step": 860 + }, + { + "epoch": 0.09475817494659583, + "grad_norm": 1.3671875, + "learning_rate": 1.8948521358159914e-05, + "loss": 1.2518, + "step": 865 + }, + { + "epoch": 0.09530591006189407, + "grad_norm": 1.265625, + "learning_rate": 1.905805038335159e-05, + "loss": 1.2463, + "step": 870 + }, + { + "epoch": 0.0958536451771923, + "grad_norm": 1.2109375, + "learning_rate": 1.9167579408543267e-05, + "loss": 1.1985, + "step": 875 + }, + { + "epoch": 0.09640138029249055, + "grad_norm": 1.3125, + "learning_rate": 1.927710843373494e-05, + "loss": 1.2317, + "step": 880 + }, + { + "epoch": 0.09694911540778879, + "grad_norm": 1.3046875, + "learning_rate": 1.9386637458926618e-05, + "loss": 1.2383, + "step": 885 + }, + { + "epoch": 0.09749685052308704, + "grad_norm": 1.2421875, + "learning_rate": 1.949616648411829e-05, + "loss": 1.1178, + "step": 890 + }, + { + "epoch": 0.09804458563838528, + "grad_norm": 1.5390625, + "learning_rate": 1.9605695509309968e-05, + "loss": 1.2037, + "step": 895 + }, + { + "epoch": 0.09859232075368352, + "grad_norm": 1.296875, + "learning_rate": 1.9715224534501645e-05, + "loss": 1.1711, + "step": 900 + }, + { + "epoch": 0.09914005586898177, + "grad_norm": 1.25, + "learning_rate": 1.982475355969332e-05, + "loss": 1.223, + "step": 905 + }, + { + "epoch": 0.09968779098428, + "grad_norm": 1.28125, + "learning_rate": 1.9934282584884995e-05, + "loss": 1.2393, + "step": 910 + }, + { + "epoch": 0.10023552609957824, + "grad_norm": 1.296875, + "learning_rate": 1.9999997075076013e-05, + "loss": 1.1597, + "step": 915 + }, + { + "epoch": 0.10078326121487649, + "grad_norm": 1.2734375, + "learning_rate": 1.999996416970079e-05, + "loss": 1.2222, + "step": 920 + }, + { + "epoch": 0.10133099633017473, + "grad_norm": 1.28125, + "learning_rate": 1.9999894702916073e-05, + "loss": 1.1993, + "step": 925 + }, + { + "epoch": 0.10187873144547296, + "grad_norm": 1.328125, + "learning_rate": 1.9999788674975834e-05, + "loss": 1.2389, + "step": 930 + }, + { + "epoch": 0.10242646656077121, + "grad_norm": 1.3515625, + "learning_rate": 1.9999646086267734e-05, + "loss": 1.1934, + "step": 935 + }, + { + "epoch": 0.10297420167606945, + "grad_norm": 1.2421875, + "learning_rate": 1.9999466937313098e-05, + "loss": 1.2261, + "step": 940 + }, + { + "epoch": 0.10352193679136769, + "grad_norm": 1.2734375, + "learning_rate": 1.9999251228766922e-05, + "loss": 1.1829, + "step": 945 + }, + { + "epoch": 0.10406967190666594, + "grad_norm": 1.265625, + "learning_rate": 1.999899896141787e-05, + "loss": 1.2095, + "step": 950 + }, + { + "epoch": 0.10461740702196418, + "grad_norm": 1.265625, + "learning_rate": 1.9998710136188267e-05, + "loss": 1.1984, + "step": 955 + }, + { + "epoch": 0.10516514213726243, + "grad_norm": 1.265625, + "learning_rate": 1.999838475413411e-05, + "loss": 1.1844, + "step": 960 + }, + { + "epoch": 0.10571287725256066, + "grad_norm": 1.2734375, + "learning_rate": 1.9998022816445037e-05, + "loss": 1.2157, + "step": 965 + }, + { + "epoch": 0.1062606123678589, + "grad_norm": 1.2421875, + "learning_rate": 1.999762432444435e-05, + "loss": 1.1304, + "step": 970 + }, + { + "epoch": 0.10680834748315715, + "grad_norm": 1.2578125, + "learning_rate": 1.9997189279589003e-05, + "loss": 1.163, + "step": 975 + }, + { + "epoch": 0.10735608259845539, + "grad_norm": 1.2890625, + "learning_rate": 1.9996717683469582e-05, + "loss": 1.1851, + "step": 980 + }, + { + "epoch": 0.10790381771375362, + "grad_norm": 1.2890625, + "learning_rate": 1.9996209537810317e-05, + "loss": 1.211, + "step": 985 + }, + { + "epoch": 0.10845155282905188, + "grad_norm": 1.390625, + "learning_rate": 1.9995664844469064e-05, + "loss": 1.1801, + "step": 990 + }, + { + "epoch": 0.10899928794435011, + "grad_norm": 1.2890625, + "learning_rate": 1.9995083605437312e-05, + "loss": 1.1569, + "step": 995 + }, + { + "epoch": 0.10954702305964835, + "grad_norm": 1.2421875, + "learning_rate": 1.9994465822840152e-05, + "loss": 1.2466, + "step": 1000 + }, + { + "epoch": 0.1100947581749466, + "grad_norm": 1.34375, + "learning_rate": 1.99938114989363e-05, + "loss": 1.1697, + "step": 1005 + }, + { + "epoch": 0.11064249329024484, + "grad_norm": 1.2890625, + "learning_rate": 1.9993120636118055e-05, + "loss": 1.2242, + "step": 1010 + }, + { + "epoch": 0.11119022840554307, + "grad_norm": 1.28125, + "learning_rate": 1.999239323691133e-05, + "loss": 1.1978, + "step": 1015 + }, + { + "epoch": 0.11173796352084132, + "grad_norm": 1.2734375, + "learning_rate": 1.99916293039756e-05, + "loss": 1.2025, + "step": 1020 + }, + { + "epoch": 0.11228569863613956, + "grad_norm": 1.296875, + "learning_rate": 1.999082884010393e-05, + "loss": 1.2639, + "step": 1025 + }, + { + "epoch": 0.1128334337514378, + "grad_norm": 1.2734375, + "learning_rate": 1.998999184822293e-05, + "loss": 1.1972, + "step": 1030 + }, + { + "epoch": 0.11338116886673605, + "grad_norm": 1.2734375, + "learning_rate": 1.9989118331392775e-05, + "loss": 1.2215, + "step": 1035 + }, + { + "epoch": 0.11392890398203429, + "grad_norm": 1.296875, + "learning_rate": 1.998820829280718e-05, + "loss": 1.2178, + "step": 1040 + }, + { + "epoch": 0.11447663909733254, + "grad_norm": 1.2421875, + "learning_rate": 1.998726173579338e-05, + "loss": 1.1748, + "step": 1045 + }, + { + "epoch": 0.11502437421263077, + "grad_norm": 1.28125, + "learning_rate": 1.9986278663812137e-05, + "loss": 1.212, + "step": 1050 + }, + { + "epoch": 0.11557210932792901, + "grad_norm": 1.1796875, + "learning_rate": 1.998525908045771e-05, + "loss": 1.1583, + "step": 1055 + }, + { + "epoch": 0.11611984444322726, + "grad_norm": 1.296875, + "learning_rate": 1.998420298945786e-05, + "loss": 1.1714, + "step": 1060 + }, + { + "epoch": 0.1166675795585255, + "grad_norm": 1.2421875, + "learning_rate": 1.998311039467382e-05, + "loss": 1.2036, + "step": 1065 + }, + { + "epoch": 0.11721531467382373, + "grad_norm": 1.2734375, + "learning_rate": 1.9981981300100267e-05, + "loss": 1.1606, + "step": 1070 + }, + { + "epoch": 0.11776304978912198, + "grad_norm": 1.265625, + "learning_rate": 1.9980815709865365e-05, + "loss": 1.1495, + "step": 1075 + }, + { + "epoch": 0.11831078490442022, + "grad_norm": 1.2578125, + "learning_rate": 1.9979613628230683e-05, + "loss": 1.1872, + "step": 1080 + }, + { + "epoch": 0.11885852001971846, + "grad_norm": 1.3046875, + "learning_rate": 1.9978375059591214e-05, + "loss": 1.2167, + "step": 1085 + }, + { + "epoch": 0.11940625513501671, + "grad_norm": 1.296875, + "learning_rate": 1.997710000847536e-05, + "loss": 1.1988, + "step": 1090 + }, + { + "epoch": 0.11995399025031495, + "grad_norm": 1.2578125, + "learning_rate": 1.9975788479544908e-05, + "loss": 1.1961, + "step": 1095 + }, + { + "epoch": 0.12050172536561318, + "grad_norm": 1.3203125, + "learning_rate": 1.9974440477595e-05, + "loss": 1.2294, + "step": 1100 + }, + { + "epoch": 0.12104946048091143, + "grad_norm": 1.2109375, + "learning_rate": 1.9973056007554145e-05, + "loss": 1.1643, + "step": 1105 + }, + { + "epoch": 0.12159719559620967, + "grad_norm": 1.3515625, + "learning_rate": 1.9971635074484174e-05, + "loss": 1.2341, + "step": 1110 + }, + { + "epoch": 0.12214493071150792, + "grad_norm": 1.25, + "learning_rate": 1.9970177683580245e-05, + "loss": 1.2422, + "step": 1115 + }, + { + "epoch": 0.12269266582680616, + "grad_norm": 1.2578125, + "learning_rate": 1.99686838401708e-05, + "loss": 1.1809, + "step": 1120 + }, + { + "epoch": 0.1232404009421044, + "grad_norm": 1.265625, + "learning_rate": 1.996715354971755e-05, + "loss": 1.2402, + "step": 1125 + }, + { + "epoch": 0.12378813605740265, + "grad_norm": 1.25, + "learning_rate": 1.9965586817815494e-05, + "loss": 1.1718, + "step": 1130 + }, + { + "epoch": 0.12433587117270088, + "grad_norm": 1.234375, + "learning_rate": 1.996398365019283e-05, + "loss": 1.1678, + "step": 1135 + }, + { + "epoch": 0.12488360628799912, + "grad_norm": 1.3046875, + "learning_rate": 1.996234405271099e-05, + "loss": 1.2031, + "step": 1140 + }, + { + "epoch": 0.12543134140329737, + "grad_norm": 1.25, + "learning_rate": 1.9960668031364593e-05, + "loss": 1.2042, + "step": 1145 + }, + { + "epoch": 0.1259790765185956, + "grad_norm": 1.25, + "learning_rate": 1.9958955592281436e-05, + "loss": 1.2292, + "step": 1150 + }, + { + "epoch": 0.12652681163389384, + "grad_norm": 1.375, + "learning_rate": 1.9957206741722455e-05, + "loss": 1.2361, + "step": 1155 + }, + { + "epoch": 0.12707454674919208, + "grad_norm": 1.2421875, + "learning_rate": 1.9955421486081718e-05, + "loss": 1.1373, + "step": 1160 + }, + { + "epoch": 0.12762228186449034, + "grad_norm": 1.328125, + "learning_rate": 1.9953599831886398e-05, + "loss": 1.1941, + "step": 1165 + }, + { + "epoch": 0.12817001697978858, + "grad_norm": 1.28125, + "learning_rate": 1.9951741785796737e-05, + "loss": 1.1622, + "step": 1170 + }, + { + "epoch": 0.12871775209508682, + "grad_norm": 1.2578125, + "learning_rate": 1.9949847354606046e-05, + "loss": 1.1535, + "step": 1175 + }, + { + "epoch": 0.12926548721038505, + "grad_norm": 1.28125, + "learning_rate": 1.994791654524065e-05, + "loss": 1.1981, + "step": 1180 + }, + { + "epoch": 0.1298132223256833, + "grad_norm": 1.21875, + "learning_rate": 1.9945949364759887e-05, + "loss": 1.1849, + "step": 1185 + }, + { + "epoch": 0.13036095744098153, + "grad_norm": 1.234375, + "learning_rate": 1.9943945820356075e-05, + "loss": 1.1899, + "step": 1190 + }, + { + "epoch": 0.1309086925562798, + "grad_norm": 1.28125, + "learning_rate": 1.994190591935448e-05, + "loss": 1.2066, + "step": 1195 + }, + { + "epoch": 0.13145642767157803, + "grad_norm": 1.3046875, + "learning_rate": 1.9939829669213296e-05, + "loss": 1.1682, + "step": 1200 + }, + { + "epoch": 0.13200416278687627, + "grad_norm": 1.25, + "learning_rate": 1.9937717077523607e-05, + "loss": 1.1963, + "step": 1205 + }, + { + "epoch": 0.1325518979021745, + "grad_norm": 1.234375, + "learning_rate": 1.9935568152009378e-05, + "loss": 1.177, + "step": 1210 + }, + { + "epoch": 0.13309963301747274, + "grad_norm": 1.34375, + "learning_rate": 1.9933382900527413e-05, + "loss": 1.1594, + "step": 1215 + }, + { + "epoch": 0.133647368132771, + "grad_norm": 1.3125, + "learning_rate": 1.9931161331067327e-05, + "loss": 1.183, + "step": 1220 + }, + { + "epoch": 0.13419510324806924, + "grad_norm": 1.21875, + "learning_rate": 1.9928903451751517e-05, + "loss": 1.21, + "step": 1225 + }, + { + "epoch": 0.13474283836336748, + "grad_norm": 1.3515625, + "learning_rate": 1.992660927083514e-05, + "loss": 1.2326, + "step": 1230 + }, + { + "epoch": 0.13529057347866572, + "grad_norm": 1.265625, + "learning_rate": 1.992427879670608e-05, + "loss": 1.1703, + "step": 1235 + }, + { + "epoch": 0.13583830859396395, + "grad_norm": 1.2265625, + "learning_rate": 1.99219120378849e-05, + "loss": 1.2061, + "step": 1240 + }, + { + "epoch": 0.1363860437092622, + "grad_norm": 1.2421875, + "learning_rate": 1.991950900302484e-05, + "loss": 1.2041, + "step": 1245 + }, + { + "epoch": 0.13693377882456045, + "grad_norm": 1.265625, + "learning_rate": 1.9917069700911766e-05, + "loss": 1.1703, + "step": 1250 + }, + { + "epoch": 0.1374815139398587, + "grad_norm": 1.2578125, + "learning_rate": 1.991459414046414e-05, + "loss": 1.1598, + "step": 1255 + }, + { + "epoch": 0.13802924905515693, + "grad_norm": 1.234375, + "learning_rate": 1.9912082330732992e-05, + "loss": 1.1528, + "step": 1260 + }, + { + "epoch": 0.13857698417045516, + "grad_norm": 1.2109375, + "learning_rate": 1.990953428090189e-05, + "loss": 1.1381, + "step": 1265 + }, + { + "epoch": 0.1391247192857534, + "grad_norm": 1.203125, + "learning_rate": 1.9906950000286894e-05, + "loss": 1.1351, + "step": 1270 + }, + { + "epoch": 0.13967245440105164, + "grad_norm": 1.328125, + "learning_rate": 1.990432949833653e-05, + "loss": 1.1824, + "step": 1275 + }, + { + "epoch": 0.1402201895163499, + "grad_norm": 1.2890625, + "learning_rate": 1.990167278463176e-05, + "loss": 1.1525, + "step": 1280 + }, + { + "epoch": 0.14076792463164814, + "grad_norm": 1.2421875, + "learning_rate": 1.9898979868885933e-05, + "loss": 1.1933, + "step": 1285 + }, + { + "epoch": 0.14131565974694638, + "grad_norm": 1.2109375, + "learning_rate": 1.989625076094477e-05, + "loss": 1.1801, + "step": 1290 + }, + { + "epoch": 0.1418633948622446, + "grad_norm": 1.25, + "learning_rate": 1.9893485470786307e-05, + "loss": 1.2175, + "step": 1295 + }, + { + "epoch": 0.14241112997754285, + "grad_norm": 1.2578125, + "learning_rate": 1.9890684008520872e-05, + "loss": 1.1855, + "step": 1300 + }, + { + "epoch": 0.14295886509284111, + "grad_norm": 1.2578125, + "learning_rate": 1.9887846384391048e-05, + "loss": 1.1998, + "step": 1305 + }, + { + "epoch": 0.14350660020813935, + "grad_norm": 1.234375, + "learning_rate": 1.9884972608771612e-05, + "loss": 1.1501, + "step": 1310 + }, + { + "epoch": 0.1440543353234376, + "grad_norm": 1.2109375, + "learning_rate": 1.9882062692169544e-05, + "loss": 1.2048, + "step": 1315 + }, + { + "epoch": 0.14460207043873582, + "grad_norm": 1.25, + "learning_rate": 1.987911664522394e-05, + "loss": 1.1473, + "step": 1320 + }, + { + "epoch": 0.14514980555403406, + "grad_norm": 1.3203125, + "learning_rate": 1.9876134478706004e-05, + "loss": 1.1571, + "step": 1325 + }, + { + "epoch": 0.1456975406693323, + "grad_norm": 1.2890625, + "learning_rate": 1.9873116203518997e-05, + "loss": 1.2003, + "step": 1330 + }, + { + "epoch": 0.14624527578463056, + "grad_norm": 1.2578125, + "learning_rate": 1.9870061830698196e-05, + "loss": 1.1797, + "step": 1335 + }, + { + "epoch": 0.1467930108999288, + "grad_norm": 1.2109375, + "learning_rate": 1.9866971371410858e-05, + "loss": 1.1441, + "step": 1340 + }, + { + "epoch": 0.14734074601522704, + "grad_norm": 1.3046875, + "learning_rate": 1.9863844836956177e-05, + "loss": 1.1912, + "step": 1345 + }, + { + "epoch": 0.14788848113052527, + "grad_norm": 1.203125, + "learning_rate": 1.986068223876525e-05, + "loss": 1.171, + "step": 1350 + }, + { + "epoch": 0.1484362162458235, + "grad_norm": 1.296875, + "learning_rate": 1.9857483588401023e-05, + "loss": 1.1628, + "step": 1355 + }, + { + "epoch": 0.14898395136112177, + "grad_norm": 1.234375, + "learning_rate": 1.9854248897558247e-05, + "loss": 1.1522, + "step": 1360 + }, + { + "epoch": 0.14953168647642, + "grad_norm": 1.2109375, + "learning_rate": 1.985097817806346e-05, + "loss": 1.1907, + "step": 1365 + }, + { + "epoch": 0.15007942159171825, + "grad_norm": 1.28125, + "learning_rate": 1.9847671441874907e-05, + "loss": 1.1937, + "step": 1370 + }, + { + "epoch": 0.15062715670701649, + "grad_norm": 1.28125, + "learning_rate": 1.9844328701082532e-05, + "loss": 1.1997, + "step": 1375 + }, + { + "epoch": 0.15117489182231472, + "grad_norm": 1.2265625, + "learning_rate": 1.9840949967907906e-05, + "loss": 1.1507, + "step": 1380 + }, + { + "epoch": 0.15172262693761296, + "grad_norm": 1.2265625, + "learning_rate": 1.98375352547042e-05, + "loss": 1.1952, + "step": 1385 + }, + { + "epoch": 0.15227036205291122, + "grad_norm": 1.203125, + "learning_rate": 1.983408457395613e-05, + "loss": 1.1519, + "step": 1390 + }, + { + "epoch": 0.15281809716820946, + "grad_norm": 1.390625, + "learning_rate": 1.9830597938279915e-05, + "loss": 1.2213, + "step": 1395 + }, + { + "epoch": 0.1533658322835077, + "grad_norm": 1.2421875, + "learning_rate": 1.9827075360423236e-05, + "loss": 1.168, + "step": 1400 + }, + { + "epoch": 0.15391356739880593, + "grad_norm": 1.265625, + "learning_rate": 1.982351685326518e-05, + "loss": 1.1586, + "step": 1405 + }, + { + "epoch": 0.15446130251410417, + "grad_norm": 1.25, + "learning_rate": 1.9819922429816193e-05, + "loss": 1.1292, + "step": 1410 + }, + { + "epoch": 0.1550090376294024, + "grad_norm": 1.28125, + "learning_rate": 1.981629210321805e-05, + "loss": 1.2092, + "step": 1415 + }, + { + "epoch": 0.15555677274470067, + "grad_norm": 1.2421875, + "learning_rate": 1.9812625886743775e-05, + "loss": 1.1997, + "step": 1420 + }, + { + "epoch": 0.1561045078599989, + "grad_norm": 1.2890625, + "learning_rate": 1.980892379379762e-05, + "loss": 1.1885, + "step": 1425 + }, + { + "epoch": 0.15665224297529715, + "grad_norm": 1.203125, + "learning_rate": 1.9805185837915014e-05, + "loss": 1.1733, + "step": 1430 + }, + { + "epoch": 0.15719997809059538, + "grad_norm": 1.2734375, + "learning_rate": 1.9801412032762495e-05, + "loss": 1.1742, + "step": 1435 + }, + { + "epoch": 0.15774771320589362, + "grad_norm": 1.21875, + "learning_rate": 1.9797602392137678e-05, + "loss": 1.1946, + "step": 1440 + }, + { + "epoch": 0.15829544832119188, + "grad_norm": 1.25, + "learning_rate": 1.9793756929969195e-05, + "loss": 1.1446, + "step": 1445 + }, + { + "epoch": 0.15884318343649012, + "grad_norm": 1.2421875, + "learning_rate": 1.978987566031665e-05, + "loss": 1.2024, + "step": 1450 + }, + { + "epoch": 0.15939091855178836, + "grad_norm": 1.2421875, + "learning_rate": 1.9785958597370557e-05, + "loss": 1.194, + "step": 1455 + }, + { + "epoch": 0.1599386536670866, + "grad_norm": 1.2578125, + "learning_rate": 1.9782005755452306e-05, + "loss": 1.1704, + "step": 1460 + }, + { + "epoch": 0.16048638878238483, + "grad_norm": 1.21875, + "learning_rate": 1.9778017149014098e-05, + "loss": 1.2247, + "step": 1465 + }, + { + "epoch": 0.16103412389768307, + "grad_norm": 1.2109375, + "learning_rate": 1.977399279263889e-05, + "loss": 1.1891, + "step": 1470 + }, + { + "epoch": 0.16158185901298133, + "grad_norm": 1.4140625, + "learning_rate": 1.9769932701040343e-05, + "loss": 1.1585, + "step": 1475 + }, + { + "epoch": 0.16212959412827957, + "grad_norm": 1.203125, + "learning_rate": 1.9765836889062788e-05, + "loss": 1.1668, + "step": 1480 + }, + { + "epoch": 0.1626773292435778, + "grad_norm": 1.2578125, + "learning_rate": 1.9761705371681138e-05, + "loss": 1.2052, + "step": 1485 + }, + { + "epoch": 0.16322506435887604, + "grad_norm": 1.4296875, + "learning_rate": 1.975753816400086e-05, + "loss": 1.1191, + "step": 1490 + }, + { + "epoch": 0.16377279947417428, + "grad_norm": 1.328125, + "learning_rate": 1.975333528125791e-05, + "loss": 1.1966, + "step": 1495 + }, + { + "epoch": 0.16432053458947254, + "grad_norm": 1.203125, + "learning_rate": 1.9749096738818663e-05, + "loss": 1.1594, + "step": 1500 + }, + { + "epoch": 0.16486826970477078, + "grad_norm": 1.3359375, + "learning_rate": 1.9744822552179895e-05, + "loss": 1.2079, + "step": 1505 + }, + { + "epoch": 0.16541600482006902, + "grad_norm": 1.3046875, + "learning_rate": 1.9740512736968688e-05, + "loss": 1.1655, + "step": 1510 + }, + { + "epoch": 0.16596373993536725, + "grad_norm": 1.21875, + "learning_rate": 1.9736167308942385e-05, + "loss": 1.1866, + "step": 1515 + }, + { + "epoch": 0.1665114750506655, + "grad_norm": 1.1953125, + "learning_rate": 1.973178628398855e-05, + "loss": 1.1686, + "step": 1520 + }, + { + "epoch": 0.16705921016596373, + "grad_norm": 1.2265625, + "learning_rate": 1.9727369678124876e-05, + "loss": 1.1725, + "step": 1525 + }, + { + "epoch": 0.167606945281262, + "grad_norm": 1.296875, + "learning_rate": 1.9722917507499154e-05, + "loss": 1.2279, + "step": 1530 + }, + { + "epoch": 0.16815468039656023, + "grad_norm": 1.2421875, + "learning_rate": 1.9718429788389214e-05, + "loss": 1.2022, + "step": 1535 + }, + { + "epoch": 0.16870241551185847, + "grad_norm": 1.2109375, + "learning_rate": 1.971390653720284e-05, + "loss": 1.2493, + "step": 1540 + }, + { + "epoch": 0.1692501506271567, + "grad_norm": 1.2890625, + "learning_rate": 1.9709347770477743e-05, + "loss": 1.1685, + "step": 1545 + }, + { + "epoch": 0.16979788574245494, + "grad_norm": 1.2578125, + "learning_rate": 1.9704753504881476e-05, + "loss": 1.2464, + "step": 1550 + }, + { + "epoch": 0.17034562085775318, + "grad_norm": 1.1875, + "learning_rate": 1.9700123757211372e-05, + "loss": 1.1843, + "step": 1555 + }, + { + "epoch": 0.17089335597305144, + "grad_norm": 1.28125, + "learning_rate": 1.969545854439451e-05, + "loss": 1.1613, + "step": 1560 + }, + { + "epoch": 0.17144109108834968, + "grad_norm": 1.2421875, + "learning_rate": 1.9690757883487626e-05, + "loss": 1.174, + "step": 1565 + }, + { + "epoch": 0.17198882620364792, + "grad_norm": 1.28125, + "learning_rate": 1.9686021791677055e-05, + "loss": 1.2428, + "step": 1570 + }, + { + "epoch": 0.17253656131894615, + "grad_norm": 1.1796875, + "learning_rate": 1.9681250286278685e-05, + "loss": 1.1463, + "step": 1575 + }, + { + "epoch": 0.1730842964342444, + "grad_norm": 1.265625, + "learning_rate": 1.9676443384737873e-05, + "loss": 1.2007, + "step": 1580 + }, + { + "epoch": 0.17363203154954265, + "grad_norm": 1.234375, + "learning_rate": 1.9671601104629388e-05, + "loss": 1.1912, + "step": 1585 + }, + { + "epoch": 0.1741797666648409, + "grad_norm": 1.1953125, + "learning_rate": 1.9666723463657357e-05, + "loss": 1.2221, + "step": 1590 + }, + { + "epoch": 0.17472750178013913, + "grad_norm": 1.3203125, + "learning_rate": 1.9661810479655184e-05, + "loss": 1.2012, + "step": 1595 + }, + { + "epoch": 0.17527523689543736, + "grad_norm": 1.1953125, + "learning_rate": 1.9656862170585494e-05, + "loss": 1.1746, + "step": 1600 + }, + { + "epoch": 0.1758229720107356, + "grad_norm": 1.203125, + "learning_rate": 1.965187855454007e-05, + "loss": 1.1739, + "step": 1605 + }, + { + "epoch": 0.17637070712603384, + "grad_norm": 1.265625, + "learning_rate": 1.964685964973978e-05, + "loss": 1.175, + "step": 1610 + }, + { + "epoch": 0.1769184422413321, + "grad_norm": 1.21875, + "learning_rate": 1.9641805474534514e-05, + "loss": 1.1344, + "step": 1615 + }, + { + "epoch": 0.17746617735663034, + "grad_norm": 1.2265625, + "learning_rate": 1.963671604740311e-05, + "loss": 1.1242, + "step": 1620 + }, + { + "epoch": 0.17801391247192858, + "grad_norm": 1.3359375, + "learning_rate": 1.9631591386953303e-05, + "loss": 1.1899, + "step": 1625 + }, + { + "epoch": 0.1785616475872268, + "grad_norm": 1.25, + "learning_rate": 1.9626431511921638e-05, + "loss": 1.2024, + "step": 1630 + }, + { + "epoch": 0.17910938270252505, + "grad_norm": 1.3125, + "learning_rate": 1.9621236441173414e-05, + "loss": 1.1714, + "step": 1635 + }, + { + "epoch": 0.17965711781782331, + "grad_norm": 1.2421875, + "learning_rate": 1.9616006193702608e-05, + "loss": 1.1386, + "step": 1640 + }, + { + "epoch": 0.18020485293312155, + "grad_norm": 1.2265625, + "learning_rate": 1.9610740788631816e-05, + "loss": 1.1612, + "step": 1645 + }, + { + "epoch": 0.1807525880484198, + "grad_norm": 1.2109375, + "learning_rate": 1.9605440245212165e-05, + "loss": 1.1568, + "step": 1650 + }, + { + "epoch": 0.18130032316371802, + "grad_norm": 1.1484375, + "learning_rate": 1.960010458282326e-05, + "loss": 1.145, + "step": 1655 + }, + { + "epoch": 0.18184805827901626, + "grad_norm": 1.2578125, + "learning_rate": 1.9594733820973105e-05, + "loss": 1.1805, + "step": 1660 + }, + { + "epoch": 0.1823957933943145, + "grad_norm": 1.1875, + "learning_rate": 1.958932797929803e-05, + "loss": 1.1542, + "step": 1665 + }, + { + "epoch": 0.18294352850961276, + "grad_norm": 1.21875, + "learning_rate": 1.958388707756263e-05, + "loss": 1.1288, + "step": 1670 + }, + { + "epoch": 0.183491263624911, + "grad_norm": 1.171875, + "learning_rate": 1.957841113565967e-05, + "loss": 1.2251, + "step": 1675 + }, + { + "epoch": 0.18403899874020924, + "grad_norm": 1.1953125, + "learning_rate": 1.957290017361005e-05, + "loss": 1.166, + "step": 1680 + }, + { + "epoch": 0.18458673385550747, + "grad_norm": 1.2265625, + "learning_rate": 1.9567354211562693e-05, + "loss": 1.179, + "step": 1685 + }, + { + "epoch": 0.1851344689708057, + "grad_norm": 1.21875, + "learning_rate": 1.9561773269794484e-05, + "loss": 1.1975, + "step": 1690 + }, + { + "epoch": 0.18568220408610395, + "grad_norm": 1.265625, + "learning_rate": 1.955615736871021e-05, + "loss": 1.188, + "step": 1695 + }, + { + "epoch": 0.1862299392014022, + "grad_norm": 1.265625, + "learning_rate": 1.955050652884247e-05, + "loss": 1.2116, + "step": 1700 + }, + { + "epoch": 0.18677767431670045, + "grad_norm": 1.2578125, + "learning_rate": 1.9544820770851608e-05, + "loss": 1.1793, + "step": 1705 + }, + { + "epoch": 0.18732540943199869, + "grad_norm": 1.234375, + "learning_rate": 1.9539100115525625e-05, + "loss": 1.1777, + "step": 1710 + }, + { + "epoch": 0.18787314454729692, + "grad_norm": 1.203125, + "learning_rate": 1.9533344583780124e-05, + "loss": 1.1605, + "step": 1715 + }, + { + "epoch": 0.18842087966259516, + "grad_norm": 1.2890625, + "learning_rate": 1.952755419665821e-05, + "loss": 1.2486, + "step": 1720 + }, + { + "epoch": 0.18896861477789342, + "grad_norm": 1.2578125, + "learning_rate": 1.9521728975330436e-05, + "loss": 1.216, + "step": 1725 + }, + { + "epoch": 0.18951634989319166, + "grad_norm": 1.203125, + "learning_rate": 1.951586894109471e-05, + "loss": 1.1769, + "step": 1730 + }, + { + "epoch": 0.1900640850084899, + "grad_norm": 1.2265625, + "learning_rate": 1.9509974115376208e-05, + "loss": 1.1962, + "step": 1735 + }, + { + "epoch": 0.19061182012378813, + "grad_norm": 1.234375, + "learning_rate": 1.9504044519727333e-05, + "loss": 1.1603, + "step": 1740 + }, + { + "epoch": 0.19115955523908637, + "grad_norm": 1.25, + "learning_rate": 1.9498080175827598e-05, + "loss": 1.1867, + "step": 1745 + }, + { + "epoch": 0.1917072903543846, + "grad_norm": 1.2578125, + "learning_rate": 1.949208110548356e-05, + "loss": 1.1531, + "step": 1750 + }, + { + "epoch": 0.19225502546968287, + "grad_norm": 1.2265625, + "learning_rate": 1.9486047330628745e-05, + "loss": 1.1891, + "step": 1755 + }, + { + "epoch": 0.1928027605849811, + "grad_norm": 1.1796875, + "learning_rate": 1.9479978873323565e-05, + "loss": 1.1366, + "step": 1760 + }, + { + "epoch": 0.19335049570027935, + "grad_norm": 1.3046875, + "learning_rate": 1.9473875755755235e-05, + "loss": 1.1604, + "step": 1765 + }, + { + "epoch": 0.19389823081557758, + "grad_norm": 1.28125, + "learning_rate": 1.9467738000237685e-05, + "loss": 1.1663, + "step": 1770 + }, + { + "epoch": 0.19444596593087582, + "grad_norm": 1.2734375, + "learning_rate": 1.946156562921151e-05, + "loss": 1.1747, + "step": 1775 + }, + { + "epoch": 0.19499370104617408, + "grad_norm": 1.2578125, + "learning_rate": 1.9455358665243836e-05, + "loss": 1.1973, + "step": 1780 + }, + { + "epoch": 0.19554143616147232, + "grad_norm": 1.203125, + "learning_rate": 1.9449117131028282e-05, + "loss": 1.1809, + "step": 1785 + }, + { + "epoch": 0.19608917127677056, + "grad_norm": 1.203125, + "learning_rate": 1.9442841049384865e-05, + "loss": 1.2348, + "step": 1790 + }, + { + "epoch": 0.1966369063920688, + "grad_norm": 1.25, + "learning_rate": 1.94365304432599e-05, + "loss": 1.1703, + "step": 1795 + }, + { + "epoch": 0.19718464150736703, + "grad_norm": 1.2265625, + "learning_rate": 1.9430185335725942e-05, + "loss": 1.1491, + "step": 1800 + }, + { + "epoch": 0.19773237662266527, + "grad_norm": 1.1796875, + "learning_rate": 1.9423805749981673e-05, + "loss": 1.143, + "step": 1805 + }, + { + "epoch": 0.19828011173796353, + "grad_norm": 1.2578125, + "learning_rate": 1.9417391709351845e-05, + "loss": 1.1962, + "step": 1810 + }, + { + "epoch": 0.19882784685326177, + "grad_norm": 1.2890625, + "learning_rate": 1.9410943237287178e-05, + "loss": 1.2093, + "step": 1815 + }, + { + "epoch": 0.19937558196856, + "grad_norm": 1.203125, + "learning_rate": 1.9404460357364282e-05, + "loss": 1.165, + "step": 1820 + }, + { + "epoch": 0.19992331708385824, + "grad_norm": 1.2265625, + "learning_rate": 1.939794309328556e-05, + "loss": 1.1203, + "step": 1825 + }, + { + "epoch": 0.20047105219915648, + "grad_norm": 1.2578125, + "learning_rate": 1.939139146887914e-05, + "loss": 1.1476, + "step": 1830 + }, + { + "epoch": 0.20101878731445472, + "grad_norm": 1.25, + "learning_rate": 1.9384805508098763e-05, + "loss": 1.1903, + "step": 1835 + }, + { + "epoch": 0.20156652242975298, + "grad_norm": 1.2265625, + "learning_rate": 1.937818523502372e-05, + "loss": 1.21, + "step": 1840 + }, + { + "epoch": 0.20211425754505122, + "grad_norm": 1.2890625, + "learning_rate": 1.9371530673858753e-05, + "loss": 1.1682, + "step": 1845 + }, + { + "epoch": 0.20266199266034945, + "grad_norm": 1.265625, + "learning_rate": 1.936484184893395e-05, + "loss": 1.1534, + "step": 1850 + }, + { + "epoch": 0.2032097277756477, + "grad_norm": 1.15625, + "learning_rate": 1.93581187847047e-05, + "loss": 1.136, + "step": 1855 + }, + { + "epoch": 0.20375746289094593, + "grad_norm": 1.328125, + "learning_rate": 1.9351361505751554e-05, + "loss": 1.1976, + "step": 1860 + }, + { + "epoch": 0.2043051980062442, + "grad_norm": 1.21875, + "learning_rate": 1.9344570036780165e-05, + "loss": 1.1293, + "step": 1865 + }, + { + "epoch": 0.20485293312154243, + "grad_norm": 1.2265625, + "learning_rate": 1.9337744402621194e-05, + "loss": 1.1543, + "step": 1870 + }, + { + "epoch": 0.20540066823684067, + "grad_norm": 1.2421875, + "learning_rate": 1.9330884628230203e-05, + "loss": 1.2295, + "step": 1875 + }, + { + "epoch": 0.2059484033521389, + "grad_norm": 1.359375, + "learning_rate": 1.932399073868759e-05, + "loss": 1.1531, + "step": 1880 + }, + { + "epoch": 0.20649613846743714, + "grad_norm": 1.2578125, + "learning_rate": 1.9317062759198478e-05, + "loss": 1.1634, + "step": 1885 + }, + { + "epoch": 0.20704387358273538, + "grad_norm": 1.3046875, + "learning_rate": 1.9310100715092624e-05, + "loss": 1.1359, + "step": 1890 + }, + { + "epoch": 0.20759160869803364, + "grad_norm": 1.2265625, + "learning_rate": 1.9303104631824338e-05, + "loss": 1.1439, + "step": 1895 + }, + { + "epoch": 0.20813934381333188, + "grad_norm": 1.2578125, + "learning_rate": 1.929607453497237e-05, + "loss": 1.16, + "step": 1900 + }, + { + "epoch": 0.20868707892863012, + "grad_norm": 1.234375, + "learning_rate": 1.9289010450239843e-05, + "loss": 1.1588, + "step": 1905 + }, + { + "epoch": 0.20923481404392835, + "grad_norm": 1.21875, + "learning_rate": 1.928191240345414e-05, + "loss": 1.1687, + "step": 1910 + }, + { + "epoch": 0.2097825491592266, + "grad_norm": 1.25, + "learning_rate": 1.9274780420566813e-05, + "loss": 1.1872, + "step": 1915 + }, + { + "epoch": 0.21033028427452485, + "grad_norm": 1.1953125, + "learning_rate": 1.926761452765349e-05, + "loss": 1.1762, + "step": 1920 + }, + { + "epoch": 0.2108780193898231, + "grad_norm": 1.2265625, + "learning_rate": 1.926041475091377e-05, + "loss": 1.1131, + "step": 1925 + }, + { + "epoch": 0.21142575450512133, + "grad_norm": 1.1953125, + "learning_rate": 1.925318111667116e-05, + "loss": 1.1835, + "step": 1930 + }, + { + "epoch": 0.21197348962041956, + "grad_norm": 1.21875, + "learning_rate": 1.9245913651372935e-05, + "loss": 1.2117, + "step": 1935 + }, + { + "epoch": 0.2125212247357178, + "grad_norm": 1.2265625, + "learning_rate": 1.923861238159007e-05, + "loss": 1.1396, + "step": 1940 + }, + { + "epoch": 0.21306895985101604, + "grad_norm": 1.2421875, + "learning_rate": 1.9231277334017126e-05, + "loss": 1.1932, + "step": 1945 + }, + { + "epoch": 0.2136166949663143, + "grad_norm": 1.21875, + "learning_rate": 1.9223908535472172e-05, + "loss": 1.1839, + "step": 1950 + }, + { + "epoch": 0.21416443008161254, + "grad_norm": 1.2890625, + "learning_rate": 1.921650601289667e-05, + "loss": 1.1628, + "step": 1955 + }, + { + "epoch": 0.21471216519691078, + "grad_norm": 1.2109375, + "learning_rate": 1.9209069793355382e-05, + "loss": 1.1491, + "step": 1960 + }, + { + "epoch": 0.215259900312209, + "grad_norm": 1.2421875, + "learning_rate": 1.9201599904036276e-05, + "loss": 1.186, + "step": 1965 + }, + { + "epoch": 0.21580763542750725, + "grad_norm": 1.2734375, + "learning_rate": 1.919409637225041e-05, + "loss": 1.1556, + "step": 1970 + }, + { + "epoch": 0.2163553705428055, + "grad_norm": 1.1953125, + "learning_rate": 1.9186559225431857e-05, + "loss": 1.1598, + "step": 1975 + }, + { + "epoch": 0.21690310565810375, + "grad_norm": 1.1953125, + "learning_rate": 1.917898849113759e-05, + "loss": 1.2076, + "step": 1980 + }, + { + "epoch": 0.217450840773402, + "grad_norm": 1.234375, + "learning_rate": 1.9171384197047376e-05, + "loss": 1.1219, + "step": 1985 + }, + { + "epoch": 0.21799857588870022, + "grad_norm": 1.2109375, + "learning_rate": 1.9163746370963687e-05, + "loss": 1.1347, + "step": 1990 + }, + { + "epoch": 0.21854631100399846, + "grad_norm": 1.234375, + "learning_rate": 1.9156075040811585e-05, + "loss": 1.1717, + "step": 1995 + }, + { + "epoch": 0.2190940461192967, + "grad_norm": 1.2265625, + "learning_rate": 1.9148370234638646e-05, + "loss": 1.1705, + "step": 2000 + }, + { + "epoch": 0.21964178123459496, + "grad_norm": 1.2265625, + "learning_rate": 1.914063198061482e-05, + "loss": 1.1666, + "step": 2005 + }, + { + "epoch": 0.2201895163498932, + "grad_norm": 1.25, + "learning_rate": 1.9132860307032362e-05, + "loss": 1.1711, + "step": 2010 + }, + { + "epoch": 0.22073725146519144, + "grad_norm": 1.21875, + "learning_rate": 1.9125055242305702e-05, + "loss": 1.1176, + "step": 2015 + }, + { + "epoch": 0.22128498658048967, + "grad_norm": 1.2578125, + "learning_rate": 1.9117216814971365e-05, + "loss": 1.1716, + "step": 2020 + }, + { + "epoch": 0.2218327216957879, + "grad_norm": 1.2265625, + "learning_rate": 1.9109345053687845e-05, + "loss": 1.1684, + "step": 2025 + }, + { + "epoch": 0.22238045681108615, + "grad_norm": 1.3125, + "learning_rate": 1.9101439987235514e-05, + "loss": 1.1766, + "step": 2030 + }, + { + "epoch": 0.2229281919263844, + "grad_norm": 1.1953125, + "learning_rate": 1.909350164451651e-05, + "loss": 1.21, + "step": 2035 + }, + { + "epoch": 0.22347592704168265, + "grad_norm": 1.28125, + "learning_rate": 1.9085530054554642e-05, + "loss": 1.2172, + "step": 2040 + }, + { + "epoch": 0.22402366215698089, + "grad_norm": 1.21875, + "learning_rate": 1.907752524649527e-05, + "loss": 1.1885, + "step": 2045 + }, + { + "epoch": 0.22457139727227912, + "grad_norm": 1.25, + "learning_rate": 1.9069487249605202e-05, + "loss": 1.1708, + "step": 2050 + }, + { + "epoch": 0.22511913238757736, + "grad_norm": 1.1953125, + "learning_rate": 1.90614160932726e-05, + "loss": 1.1645, + "step": 2055 + }, + { + "epoch": 0.2256668675028756, + "grad_norm": 1.25, + "learning_rate": 1.9053311807006845e-05, + "loss": 1.1632, + "step": 2060 + }, + { + "epoch": 0.22621460261817386, + "grad_norm": 1.21875, + "learning_rate": 1.9045174420438465e-05, + "loss": 1.1619, + "step": 2065 + }, + { + "epoch": 0.2267623377334721, + "grad_norm": 1.25, + "learning_rate": 1.9037003963318993e-05, + "loss": 1.1658, + "step": 2070 + }, + { + "epoch": 0.22731007284877033, + "grad_norm": 1.34375, + "learning_rate": 1.9028800465520878e-05, + "loss": 1.1941, + "step": 2075 + }, + { + "epoch": 0.22785780796406857, + "grad_norm": 1.203125, + "learning_rate": 1.9020563957037378e-05, + "loss": 1.1463, + "step": 2080 + }, + { + "epoch": 0.2284055430793668, + "grad_norm": 1.2421875, + "learning_rate": 1.901229446798243e-05, + "loss": 1.16, + "step": 2085 + }, + { + "epoch": 0.22895327819466507, + "grad_norm": 1.234375, + "learning_rate": 1.9003992028590568e-05, + "loss": 1.1567, + "step": 2090 + }, + { + "epoch": 0.2295010133099633, + "grad_norm": 1.265625, + "learning_rate": 1.8995656669216786e-05, + "loss": 1.2039, + "step": 2095 + }, + { + "epoch": 0.23004874842526155, + "grad_norm": 1.28125, + "learning_rate": 1.898728842033644e-05, + "loss": 1.1733, + "step": 2100 + }, + { + "epoch": 0.23059648354055978, + "grad_norm": 1.2421875, + "learning_rate": 1.8978887312545135e-05, + "loss": 1.1702, + "step": 2105 + }, + { + "epoch": 0.23114421865585802, + "grad_norm": 1.2109375, + "learning_rate": 1.897045337655862e-05, + "loss": 1.2218, + "step": 2110 + }, + { + "epoch": 0.23169195377115626, + "grad_norm": 1.2265625, + "learning_rate": 1.8961986643212664e-05, + "loss": 1.2084, + "step": 2115 + }, + { + "epoch": 0.23223968888645452, + "grad_norm": 1.2109375, + "learning_rate": 1.895348714346294e-05, + "loss": 1.1865, + "step": 2120 + }, + { + "epoch": 0.23278742400175276, + "grad_norm": 1.265625, + "learning_rate": 1.894495490838494e-05, + "loss": 1.2198, + "step": 2125 + }, + { + "epoch": 0.233335159117051, + "grad_norm": 1.1953125, + "learning_rate": 1.8936389969173825e-05, + "loss": 1.134, + "step": 2130 + }, + { + "epoch": 0.23388289423234923, + "grad_norm": 1.2421875, + "learning_rate": 1.892779235714433e-05, + "loss": 1.1399, + "step": 2135 + }, + { + "epoch": 0.23443062934764747, + "grad_norm": 1.21875, + "learning_rate": 1.8919162103730656e-05, + "loss": 1.1123, + "step": 2140 + }, + { + "epoch": 0.23497836446294573, + "grad_norm": 1.25, + "learning_rate": 1.891049924048633e-05, + "loss": 1.1695, + "step": 2145 + }, + { + "epoch": 0.23552609957824397, + "grad_norm": 1.2578125, + "learning_rate": 1.8901803799084117e-05, + "loss": 1.1359, + "step": 2150 + }, + { + "epoch": 0.2360738346935422, + "grad_norm": 1.3046875, + "learning_rate": 1.8893075811315894e-05, + "loss": 1.2111, + "step": 2155 + }, + { + "epoch": 0.23662156980884044, + "grad_norm": 1.2265625, + "learning_rate": 1.888431530909253e-05, + "loss": 1.1088, + "step": 2160 + }, + { + "epoch": 0.23716930492413868, + "grad_norm": 1.2734375, + "learning_rate": 1.8875522324443762e-05, + "loss": 1.1838, + "step": 2165 + }, + { + "epoch": 0.23771704003943692, + "grad_norm": 1.2109375, + "learning_rate": 1.8866696889518107e-05, + "loss": 1.1323, + "step": 2170 + }, + { + "epoch": 0.23826477515473518, + "grad_norm": 1.2578125, + "learning_rate": 1.8857839036582707e-05, + "loss": 1.1296, + "step": 2175 + }, + { + "epoch": 0.23881251027003342, + "grad_norm": 1.1953125, + "learning_rate": 1.8848948798023238e-05, + "loss": 1.1617, + "step": 2180 + }, + { + "epoch": 0.23936024538533165, + "grad_norm": 1.203125, + "learning_rate": 1.8840026206343786e-05, + "loss": 1.1627, + "step": 2185 + }, + { + "epoch": 0.2399079805006299, + "grad_norm": 1.2265625, + "learning_rate": 1.8831071294166717e-05, + "loss": 1.1173, + "step": 2190 + }, + { + "epoch": 0.24045571561592813, + "grad_norm": 1.28125, + "learning_rate": 1.882208409423257e-05, + "loss": 1.212, + "step": 2195 + }, + { + "epoch": 0.24100345073122637, + "grad_norm": 1.28125, + "learning_rate": 1.8813064639399933e-05, + "loss": 1.1543, + "step": 2200 + }, + { + "epoch": 0.24155118584652463, + "grad_norm": 1.2578125, + "learning_rate": 1.8804012962645322e-05, + "loss": 1.1847, + "step": 2205 + }, + { + "epoch": 0.24209892096182287, + "grad_norm": 1.2109375, + "learning_rate": 1.8794929097063062e-05, + "loss": 1.2248, + "step": 2210 + }, + { + "epoch": 0.2426466560771211, + "grad_norm": 1.234375, + "learning_rate": 1.8785813075865164e-05, + "loss": 1.2115, + "step": 2215 + }, + { + "epoch": 0.24319439119241934, + "grad_norm": 1.2265625, + "learning_rate": 1.8776664932381208e-05, + "loss": 1.1039, + "step": 2220 + }, + { + "epoch": 0.24374212630771758, + "grad_norm": 1.203125, + "learning_rate": 1.8767484700058212e-05, + "loss": 1.1608, + "step": 2225 + }, + { + "epoch": 0.24428986142301584, + "grad_norm": 1.2578125, + "learning_rate": 1.875827241246052e-05, + "loss": 1.1998, + "step": 2230 + }, + { + "epoch": 0.24483759653831408, + "grad_norm": 1.296875, + "learning_rate": 1.874902810326968e-05, + "loss": 1.2347, + "step": 2235 + }, + { + "epoch": 0.24538533165361232, + "grad_norm": 1.3203125, + "learning_rate": 1.873975180628431e-05, + "loss": 1.149, + "step": 2240 + }, + { + "epoch": 0.24593306676891055, + "grad_norm": 1.15625, + "learning_rate": 1.873044355541997e-05, + "loss": 1.1524, + "step": 2245 + }, + { + "epoch": 0.2464808018842088, + "grad_norm": 1.21875, + "learning_rate": 1.872110338470907e-05, + "loss": 1.1628, + "step": 2250 + }, + { + "epoch": 0.24702853699950703, + "grad_norm": 1.1796875, + "learning_rate": 1.8711731328300715e-05, + "loss": 1.201, + "step": 2255 + }, + { + "epoch": 0.2475762721148053, + "grad_norm": 1.2421875, + "learning_rate": 1.8702327420460575e-05, + "loss": 1.1418, + "step": 2260 + }, + { + "epoch": 0.24812400723010353, + "grad_norm": 1.171875, + "learning_rate": 1.8692891695570795e-05, + "loss": 1.1357, + "step": 2265 + }, + { + "epoch": 0.24867174234540176, + "grad_norm": 1.34375, + "learning_rate": 1.8683424188129836e-05, + "loss": 1.1994, + "step": 2270 + }, + { + "epoch": 0.2492194774607, + "grad_norm": 1.21875, + "learning_rate": 1.8673924932752366e-05, + "loss": 1.1592, + "step": 2275 + }, + { + "epoch": 0.24976721257599824, + "grad_norm": 1.2578125, + "learning_rate": 1.8664393964169125e-05, + "loss": 1.1777, + "step": 2280 + }, + { + "epoch": 0.2503149476912965, + "grad_norm": 1.2734375, + "learning_rate": 1.86548313172268e-05, + "loss": 1.1569, + "step": 2285 + }, + { + "epoch": 0.25086268280659474, + "grad_norm": 1.1796875, + "learning_rate": 1.8645237026887896e-05, + "loss": 1.162, + "step": 2290 + }, + { + "epoch": 0.25141041792189295, + "grad_norm": 1.2265625, + "learning_rate": 1.8635611128230632e-05, + "loss": 1.1158, + "step": 2295 + }, + { + "epoch": 0.2519581530371912, + "grad_norm": 1.2109375, + "learning_rate": 1.8625953656448764e-05, + "loss": 1.1439, + "step": 2300 + }, + { + "epoch": 0.2525058881524895, + "grad_norm": 1.1953125, + "learning_rate": 1.86162646468515e-05, + "loss": 1.1504, + "step": 2305 + }, + { + "epoch": 0.2530536232677877, + "grad_norm": 1.1875, + "learning_rate": 1.8606544134863355e-05, + "loss": 1.1322, + "step": 2310 + }, + { + "epoch": 0.25360135838308595, + "grad_norm": 1.203125, + "learning_rate": 1.8596792156024014e-05, + "loss": 1.2144, + "step": 2315 + }, + { + "epoch": 0.25414909349838416, + "grad_norm": 1.1875, + "learning_rate": 1.858700874598821e-05, + "loss": 1.1363, + "step": 2320 + }, + { + "epoch": 0.2546968286136824, + "grad_norm": 1.1953125, + "learning_rate": 1.8577193940525608e-05, + "loss": 1.1982, + "step": 2325 + }, + { + "epoch": 0.2552445637289807, + "grad_norm": 1.25, + "learning_rate": 1.8567347775520642e-05, + "loss": 1.1562, + "step": 2330 + }, + { + "epoch": 0.2557922988442789, + "grad_norm": 1.25, + "learning_rate": 1.8557470286972405e-05, + "loss": 1.1552, + "step": 2335 + }, + { + "epoch": 0.25634003395957716, + "grad_norm": 1.1484375, + "learning_rate": 1.8547561510994525e-05, + "loss": 1.1051, + "step": 2340 + }, + { + "epoch": 0.25688776907487537, + "grad_norm": 1.15625, + "learning_rate": 1.853762148381501e-05, + "loss": 1.1806, + "step": 2345 + }, + { + "epoch": 0.25743550419017364, + "grad_norm": 1.28125, + "learning_rate": 1.8527650241776128e-05, + "loss": 1.1662, + "step": 2350 + }, + { + "epoch": 0.2579832393054719, + "grad_norm": 1.2265625, + "learning_rate": 1.8517647821334278e-05, + "loss": 1.2243, + "step": 2355 + }, + { + "epoch": 0.2585309744207701, + "grad_norm": 1.2890625, + "learning_rate": 1.8507614259059847e-05, + "loss": 1.1888, + "step": 2360 + }, + { + "epoch": 0.2590787095360684, + "grad_norm": 1.234375, + "learning_rate": 1.849754959163709e-05, + "loss": 1.1542, + "step": 2365 + }, + { + "epoch": 0.2596264446513666, + "grad_norm": 1.203125, + "learning_rate": 1.848745385586398e-05, + "loss": 1.1973, + "step": 2370 + }, + { + "epoch": 0.26017417976666485, + "grad_norm": 1.1953125, + "learning_rate": 1.8477327088652078e-05, + "loss": 1.1869, + "step": 2375 + }, + { + "epoch": 0.26072191488196306, + "grad_norm": 1.1953125, + "learning_rate": 1.846716932702641e-05, + "loss": 1.1914, + "step": 2380 + }, + { + "epoch": 0.2612696499972613, + "grad_norm": 1.2109375, + "learning_rate": 1.8456980608125317e-05, + "loss": 1.1582, + "step": 2385 + }, + { + "epoch": 0.2618173851125596, + "grad_norm": 1.2109375, + "learning_rate": 1.8446760969200318e-05, + "loss": 1.1917, + "step": 2390 + }, + { + "epoch": 0.2623651202278578, + "grad_norm": 1.2109375, + "learning_rate": 1.8436510447615997e-05, + "loss": 1.1456, + "step": 2395 + }, + { + "epoch": 0.26291285534315606, + "grad_norm": 1.21875, + "learning_rate": 1.8426229080849833e-05, + "loss": 1.1333, + "step": 2400 + }, + { + "epoch": 0.26346059045845427, + "grad_norm": 1.234375, + "learning_rate": 1.8415916906492093e-05, + "loss": 1.1286, + "step": 2405 + }, + { + "epoch": 0.26400832557375253, + "grad_norm": 1.2265625, + "learning_rate": 1.8405573962245666e-05, + "loss": 1.1542, + "step": 2410 + }, + { + "epoch": 0.2645560606890508, + "grad_norm": 1.2109375, + "learning_rate": 1.839520028592596e-05, + "loss": 1.1858, + "step": 2415 + }, + { + "epoch": 0.265103795804349, + "grad_norm": 1.2109375, + "learning_rate": 1.838479591546072e-05, + "loss": 1.1588, + "step": 2420 + }, + { + "epoch": 0.26565153091964727, + "grad_norm": 1.21875, + "learning_rate": 1.8374360888889943e-05, + "loss": 1.174, + "step": 2425 + }, + { + "epoch": 0.2661992660349455, + "grad_norm": 1.234375, + "learning_rate": 1.8363895244365687e-05, + "loss": 1.2234, + "step": 2430 + }, + { + "epoch": 0.26674700115024375, + "grad_norm": 1.1796875, + "learning_rate": 1.8353399020151954e-05, + "loss": 1.1582, + "step": 2435 + }, + { + "epoch": 0.267294736265542, + "grad_norm": 1.2421875, + "learning_rate": 1.8342872254624565e-05, + "loss": 1.1537, + "step": 2440 + }, + { + "epoch": 0.2678424713808402, + "grad_norm": 1.1953125, + "learning_rate": 1.8332314986270994e-05, + "loss": 1.154, + "step": 2445 + }, + { + "epoch": 0.2683902064961385, + "grad_norm": 1.15625, + "learning_rate": 1.832172725369024e-05, + "loss": 1.1304, + "step": 2450 + }, + { + "epoch": 0.2689379416114367, + "grad_norm": 1.234375, + "learning_rate": 1.831110909559269e-05, + "loss": 1.1849, + "step": 2455 + }, + { + "epoch": 0.26948567672673496, + "grad_norm": 1.203125, + "learning_rate": 1.830046055079995e-05, + "loss": 1.1697, + "step": 2460 + }, + { + "epoch": 0.27003341184203317, + "grad_norm": 1.15625, + "learning_rate": 1.8289781658244757e-05, + "loss": 1.1658, + "step": 2465 + }, + { + "epoch": 0.27058114695733143, + "grad_norm": 1.1953125, + "learning_rate": 1.827907245697078e-05, + "loss": 1.1697, + "step": 2470 + }, + { + "epoch": 0.2711288820726297, + "grad_norm": 1.1875, + "learning_rate": 1.826833298613251e-05, + "loss": 1.1042, + "step": 2475 + }, + { + "epoch": 0.2716766171879279, + "grad_norm": 1.265625, + "learning_rate": 1.825756328499511e-05, + "loss": 1.2129, + "step": 2480 + }, + { + "epoch": 0.27222435230322617, + "grad_norm": 1.453125, + "learning_rate": 1.8246763392934256e-05, + "loss": 1.2177, + "step": 2485 + }, + { + "epoch": 0.2727720874185244, + "grad_norm": 1.2265625, + "learning_rate": 1.8235933349436025e-05, + "loss": 1.134, + "step": 2490 + }, + { + "epoch": 0.27331982253382264, + "grad_norm": 1.234375, + "learning_rate": 1.8225073194096728e-05, + "loss": 1.1721, + "step": 2495 + }, + { + "epoch": 0.2738675576491209, + "grad_norm": 1.25, + "learning_rate": 1.8214182966622758e-05, + "loss": 1.2035, + "step": 2500 + }, + { + "epoch": 0.2744152927644191, + "grad_norm": 1.234375, + "learning_rate": 1.8203262706830467e-05, + "loss": 1.1489, + "step": 2505 + }, + { + "epoch": 0.2749630278797174, + "grad_norm": 1.21875, + "learning_rate": 1.8192312454646007e-05, + "loss": 1.1163, + "step": 2510 + }, + { + "epoch": 0.2755107629950156, + "grad_norm": 1.2578125, + "learning_rate": 1.818133225010519e-05, + "loss": 1.1847, + "step": 2515 + }, + { + "epoch": 0.27605849811031385, + "grad_norm": 1.2265625, + "learning_rate": 1.8170322133353328e-05, + "loss": 1.2177, + "step": 2520 + }, + { + "epoch": 0.2766062332256121, + "grad_norm": 1.21875, + "learning_rate": 1.815928214464511e-05, + "loss": 1.1929, + "step": 2525 + }, + { + "epoch": 0.27715396834091033, + "grad_norm": 1.2109375, + "learning_rate": 1.814821232434444e-05, + "loss": 1.1931, + "step": 2530 + }, + { + "epoch": 0.2777017034562086, + "grad_norm": 1.234375, + "learning_rate": 1.8137112712924273e-05, + "loss": 1.1286, + "step": 2535 + }, + { + "epoch": 0.2782494385715068, + "grad_norm": 1.2421875, + "learning_rate": 1.812598335096651e-05, + "loss": 1.1489, + "step": 2540 + }, + { + "epoch": 0.27879717368680507, + "grad_norm": 1.1875, + "learning_rate": 1.8114824279161806e-05, + "loss": 1.1457, + "step": 2545 + }, + { + "epoch": 0.2793449088021033, + "grad_norm": 1.2578125, + "learning_rate": 1.810363553830945e-05, + "loss": 1.1604, + "step": 2550 + }, + { + "epoch": 0.27989264391740154, + "grad_norm": 1.2578125, + "learning_rate": 1.8092417169317203e-05, + "loss": 1.2167, + "step": 2555 + }, + { + "epoch": 0.2804403790326998, + "grad_norm": 1.234375, + "learning_rate": 1.8081169213201145e-05, + "loss": 1.1854, + "step": 2560 + }, + { + "epoch": 0.280988114147998, + "grad_norm": 1.1953125, + "learning_rate": 1.806989171108554e-05, + "loss": 1.1755, + "step": 2565 + }, + { + "epoch": 0.2815358492632963, + "grad_norm": 1.2421875, + "learning_rate": 1.8058584704202674e-05, + "loss": 1.1673, + "step": 2570 + }, + { + "epoch": 0.2820835843785945, + "grad_norm": 1.28125, + "learning_rate": 1.8047248233892706e-05, + "loss": 1.2167, + "step": 2575 + }, + { + "epoch": 0.28263131949389275, + "grad_norm": 1.203125, + "learning_rate": 1.8035882341603518e-05, + "loss": 1.1441, + "step": 2580 + }, + { + "epoch": 0.283179054609191, + "grad_norm": 1.2265625, + "learning_rate": 1.8024487068890556e-05, + "loss": 1.1872, + "step": 2585 + }, + { + "epoch": 0.2837267897244892, + "grad_norm": 1.265625, + "learning_rate": 1.8013062457416702e-05, + "loss": 1.1503, + "step": 2590 + }, + { + "epoch": 0.2842745248397875, + "grad_norm": 1.21875, + "learning_rate": 1.800160854895209e-05, + "loss": 1.1635, + "step": 2595 + }, + { + "epoch": 0.2848222599550857, + "grad_norm": 1.171875, + "learning_rate": 1.7990125385373978e-05, + "loss": 1.1482, + "step": 2600 + }, + { + "epoch": 0.28536999507038396, + "grad_norm": 1.2890625, + "learning_rate": 1.7978613008666577e-05, + "loss": 1.1801, + "step": 2605 + }, + { + "epoch": 0.28591773018568223, + "grad_norm": 1.21875, + "learning_rate": 1.796707146092091e-05, + "loss": 1.1781, + "step": 2610 + }, + { + "epoch": 0.28646546530098044, + "grad_norm": 1.2109375, + "learning_rate": 1.7955500784334655e-05, + "loss": 1.1593, + "step": 2615 + }, + { + "epoch": 0.2870132004162787, + "grad_norm": 1.203125, + "learning_rate": 1.794390102121199e-05, + "loss": 1.1746, + "step": 2620 + }, + { + "epoch": 0.2875609355315769, + "grad_norm": 1.203125, + "learning_rate": 1.7932272213963434e-05, + "loss": 1.1758, + "step": 2625 + }, + { + "epoch": 0.2881086706468752, + "grad_norm": 1.2578125, + "learning_rate": 1.7920614405105695e-05, + "loss": 1.2296, + "step": 2630 + }, + { + "epoch": 0.28865640576217344, + "grad_norm": 1.1875, + "learning_rate": 1.7908927637261522e-05, + "loss": 1.1928, + "step": 2635 + }, + { + "epoch": 0.28920414087747165, + "grad_norm": 1.3203125, + "learning_rate": 1.789721195315954e-05, + "loss": 1.1237, + "step": 2640 + }, + { + "epoch": 0.2897518759927699, + "grad_norm": 1.2109375, + "learning_rate": 1.7885467395634087e-05, + "loss": 1.1734, + "step": 2645 + }, + { + "epoch": 0.2902996111080681, + "grad_norm": 1.2421875, + "learning_rate": 1.7873694007625084e-05, + "loss": 1.1656, + "step": 2650 + }, + { + "epoch": 0.2908473462233664, + "grad_norm": 1.234375, + "learning_rate": 1.786189183217784e-05, + "loss": 1.1558, + "step": 2655 + }, + { + "epoch": 0.2913950813386646, + "grad_norm": 1.1875, + "learning_rate": 1.785006091244294e-05, + "loss": 1.1529, + "step": 2660 + }, + { + "epoch": 0.29194281645396286, + "grad_norm": 1.1875, + "learning_rate": 1.7838201291676037e-05, + "loss": 1.1561, + "step": 2665 + }, + { + "epoch": 0.2924905515692611, + "grad_norm": 1.21875, + "learning_rate": 1.7826313013237744e-05, + "loss": 1.1463, + "step": 2670 + }, + { + "epoch": 0.29303828668455933, + "grad_norm": 1.2109375, + "learning_rate": 1.7814396120593428e-05, + "loss": 1.1611, + "step": 2675 + }, + { + "epoch": 0.2935860217998576, + "grad_norm": 1.2109375, + "learning_rate": 1.7802450657313086e-05, + "loss": 1.1687, + "step": 2680 + }, + { + "epoch": 0.2941337569151558, + "grad_norm": 1.1953125, + "learning_rate": 1.7790476667071175e-05, + "loss": 1.1818, + "step": 2685 + }, + { + "epoch": 0.2946814920304541, + "grad_norm": 1.2109375, + "learning_rate": 1.7778474193646448e-05, + "loss": 1.1589, + "step": 2690 + }, + { + "epoch": 0.29522922714575234, + "grad_norm": 1.171875, + "learning_rate": 1.776644328092179e-05, + "loss": 1.1448, + "step": 2695 + }, + { + "epoch": 0.29577696226105055, + "grad_norm": 1.25, + "learning_rate": 1.7754383972884084e-05, + "loss": 1.1354, + "step": 2700 + }, + { + "epoch": 0.2963246973763488, + "grad_norm": 1.1796875, + "learning_rate": 1.7742296313624005e-05, + "loss": 1.1309, + "step": 2705 + }, + { + "epoch": 0.296872432491647, + "grad_norm": 1.1875, + "learning_rate": 1.773018034733591e-05, + "loss": 1.1921, + "step": 2710 + }, + { + "epoch": 0.2974201676069453, + "grad_norm": 1.2265625, + "learning_rate": 1.771803611831762e-05, + "loss": 1.1484, + "step": 2715 + }, + { + "epoch": 0.29796790272224355, + "grad_norm": 1.1953125, + "learning_rate": 1.7705863670970327e-05, + "loss": 1.2024, + "step": 2720 + }, + { + "epoch": 0.29851563783754176, + "grad_norm": 1.234375, + "learning_rate": 1.7693663049798363e-05, + "loss": 1.1696, + "step": 2725 + }, + { + "epoch": 0.29906337295284, + "grad_norm": 1.21875, + "learning_rate": 1.7681434299409077e-05, + "loss": 1.1191, + "step": 2730 + }, + { + "epoch": 0.29961110806813823, + "grad_norm": 1.2265625, + "learning_rate": 1.766917746451267e-05, + "loss": 1.1662, + "step": 2735 + }, + { + "epoch": 0.3001588431834365, + "grad_norm": 1.203125, + "learning_rate": 1.7656892589922017e-05, + "loss": 1.1653, + "step": 2740 + }, + { + "epoch": 0.3007065782987347, + "grad_norm": 1.2578125, + "learning_rate": 1.7644579720552513e-05, + "loss": 1.1145, + "step": 2745 + }, + { + "epoch": 0.30125431341403297, + "grad_norm": 1.21875, + "learning_rate": 1.76322389014219e-05, + "loss": 1.1201, + "step": 2750 + }, + { + "epoch": 0.30180204852933123, + "grad_norm": 1.1953125, + "learning_rate": 1.761987017765012e-05, + "loss": 1.1349, + "step": 2755 + }, + { + "epoch": 0.30234978364462944, + "grad_norm": 1.2421875, + "learning_rate": 1.7607473594459127e-05, + "loss": 1.1937, + "step": 2760 + }, + { + "epoch": 0.3028975187599277, + "grad_norm": 1.28125, + "learning_rate": 1.7595049197172744e-05, + "loss": 1.1489, + "step": 2765 + }, + { + "epoch": 0.3034452538752259, + "grad_norm": 1.265625, + "learning_rate": 1.7582597031216476e-05, + "loss": 1.1559, + "step": 2770 + }, + { + "epoch": 0.3039929889905242, + "grad_norm": 1.203125, + "learning_rate": 1.7570117142117366e-05, + "loss": 1.1413, + "step": 2775 + }, + { + "epoch": 0.30454072410582245, + "grad_norm": 1.2109375, + "learning_rate": 1.7557609575503808e-05, + "loss": 1.1424, + "step": 2780 + }, + { + "epoch": 0.30508845922112066, + "grad_norm": 1.2265625, + "learning_rate": 1.754507437710539e-05, + "loss": 1.1737, + "step": 2785 + }, + { + "epoch": 0.3056361943364189, + "grad_norm": 1.1953125, + "learning_rate": 1.7532511592752734e-05, + "loss": 1.1835, + "step": 2790 + }, + { + "epoch": 0.30618392945171713, + "grad_norm": 1.2421875, + "learning_rate": 1.751992126837731e-05, + "loss": 1.2163, + "step": 2795 + }, + { + "epoch": 0.3067316645670154, + "grad_norm": 1.2109375, + "learning_rate": 1.7507303450011287e-05, + "loss": 1.1411, + "step": 2800 + }, + { + "epoch": 0.30727939968231366, + "grad_norm": 1.2109375, + "learning_rate": 1.7494658183787344e-05, + "loss": 1.2053, + "step": 2805 + }, + { + "epoch": 0.30782713479761187, + "grad_norm": 1.21875, + "learning_rate": 1.7481985515938538e-05, + "loss": 1.1699, + "step": 2810 + }, + { + "epoch": 0.30837486991291013, + "grad_norm": 1.296875, + "learning_rate": 1.746928549279808e-05, + "loss": 1.1804, + "step": 2815 + }, + { + "epoch": 0.30892260502820834, + "grad_norm": 1.25, + "learning_rate": 1.745655816079922e-05, + "loss": 1.1437, + "step": 2820 + }, + { + "epoch": 0.3094703401435066, + "grad_norm": 1.2421875, + "learning_rate": 1.744380356647504e-05, + "loss": 1.13, + "step": 2825 + }, + { + "epoch": 0.3100180752588048, + "grad_norm": 1.265625, + "learning_rate": 1.743102175645831e-05, + "loss": 1.1917, + "step": 2830 + }, + { + "epoch": 0.3105658103741031, + "grad_norm": 1.234375, + "learning_rate": 1.741821277748128e-05, + "loss": 1.1613, + "step": 2835 + }, + { + "epoch": 0.31111354548940134, + "grad_norm": 1.2421875, + "learning_rate": 1.7405376676375567e-05, + "loss": 1.1408, + "step": 2840 + }, + { + "epoch": 0.31166128060469955, + "grad_norm": 1.296875, + "learning_rate": 1.7392513500071927e-05, + "loss": 1.1696, + "step": 2845 + }, + { + "epoch": 0.3122090157199978, + "grad_norm": 1.1640625, + "learning_rate": 1.737962329560011e-05, + "loss": 1.1046, + "step": 2850 + }, + { + "epoch": 0.312756750835296, + "grad_norm": 1.296875, + "learning_rate": 1.7366706110088697e-05, + "loss": 1.1929, + "step": 2855 + }, + { + "epoch": 0.3133044859505943, + "grad_norm": 1.2109375, + "learning_rate": 1.73537619907649e-05, + "loss": 1.1473, + "step": 2860 + }, + { + "epoch": 0.31385222106589256, + "grad_norm": 1.171875, + "learning_rate": 1.7340790984954425e-05, + "loss": 1.098, + "step": 2865 + }, + { + "epoch": 0.31439995618119077, + "grad_norm": 1.234375, + "learning_rate": 1.7327793140081256e-05, + "loss": 1.1105, + "step": 2870 + }, + { + "epoch": 0.31494769129648903, + "grad_norm": 1.1796875, + "learning_rate": 1.731476850366752e-05, + "loss": 1.186, + "step": 2875 + }, + { + "epoch": 0.31549542641178724, + "grad_norm": 1.328125, + "learning_rate": 1.7301717123333297e-05, + "loss": 1.2037, + "step": 2880 + }, + { + "epoch": 0.3160431615270855, + "grad_norm": 1.25, + "learning_rate": 1.7288639046796442e-05, + "loss": 1.2007, + "step": 2885 + }, + { + "epoch": 0.31659089664238377, + "grad_norm": 1.1875, + "learning_rate": 1.7275534321872415e-05, + "loss": 1.1487, + "step": 2890 + }, + { + "epoch": 0.317138631757682, + "grad_norm": 1.1875, + "learning_rate": 1.726240299647411e-05, + "loss": 1.1364, + "step": 2895 + }, + { + "epoch": 0.31768636687298024, + "grad_norm": 1.2265625, + "learning_rate": 1.7249245118611684e-05, + "loss": 1.1387, + "step": 2900 + }, + { + "epoch": 0.31823410198827845, + "grad_norm": 1.234375, + "learning_rate": 1.723606073639235e-05, + "loss": 1.1701, + "step": 2905 + }, + { + "epoch": 0.3187818371035767, + "grad_norm": 1.2109375, + "learning_rate": 1.7222849898020256e-05, + "loss": 1.1947, + "step": 2910 + }, + { + "epoch": 0.319329572218875, + "grad_norm": 1.234375, + "learning_rate": 1.720961265179625e-05, + "loss": 1.1591, + "step": 2915 + }, + { + "epoch": 0.3198773073341732, + "grad_norm": 1.265625, + "learning_rate": 1.7196349046117753e-05, + "loss": 1.1816, + "step": 2920 + }, + { + "epoch": 0.32042504244947145, + "grad_norm": 1.2265625, + "learning_rate": 1.7183059129478543e-05, + "loss": 1.1463, + "step": 2925 + }, + { + "epoch": 0.32097277756476966, + "grad_norm": 1.265625, + "learning_rate": 1.7169742950468607e-05, + "loss": 1.0909, + "step": 2930 + }, + { + "epoch": 0.3215205126800679, + "grad_norm": 1.1953125, + "learning_rate": 1.7156400557773944e-05, + "loss": 1.1657, + "step": 2935 + }, + { + "epoch": 0.32206824779536614, + "grad_norm": 1.171875, + "learning_rate": 1.71430320001764e-05, + "loss": 1.1314, + "step": 2940 + }, + { + "epoch": 0.3226159829106644, + "grad_norm": 1.2421875, + "learning_rate": 1.712963732655348e-05, + "loss": 1.1076, + "step": 2945 + }, + { + "epoch": 0.32316371802596267, + "grad_norm": 1.1875, + "learning_rate": 1.711621658587817e-05, + "loss": 1.1591, + "step": 2950 + }, + { + "epoch": 0.3237114531412609, + "grad_norm": 1.2890625, + "learning_rate": 1.7102769827218773e-05, + "loss": 1.1353, + "step": 2955 + }, + { + "epoch": 0.32425918825655914, + "grad_norm": 1.234375, + "learning_rate": 1.7089297099738703e-05, + "loss": 1.186, + "step": 2960 + }, + { + "epoch": 0.32480692337185735, + "grad_norm": 1.2421875, + "learning_rate": 1.7075798452696333e-05, + "loss": 1.1698, + "step": 2965 + }, + { + "epoch": 0.3253546584871556, + "grad_norm": 1.2421875, + "learning_rate": 1.706227393544479e-05, + "loss": 1.1474, + "step": 2970 + }, + { + "epoch": 0.3259023936024539, + "grad_norm": 1.2265625, + "learning_rate": 1.7048723597431804e-05, + "loss": 1.1462, + "step": 2975 + }, + { + "epoch": 0.3264501287177521, + "grad_norm": 1.25, + "learning_rate": 1.703514748819948e-05, + "loss": 1.1458, + "step": 2980 + }, + { + "epoch": 0.32699786383305035, + "grad_norm": 1.3203125, + "learning_rate": 1.702154565738418e-05, + "loss": 1.1661, + "step": 2985 + }, + { + "epoch": 0.32754559894834856, + "grad_norm": 1.15625, + "learning_rate": 1.7007918154716286e-05, + "loss": 1.1273, + "step": 2990 + }, + { + "epoch": 0.3280933340636468, + "grad_norm": 1.1796875, + "learning_rate": 1.6994265030020056e-05, + "loss": 1.1597, + "step": 2995 + }, + { + "epoch": 0.3286410691789451, + "grad_norm": 1.2109375, + "learning_rate": 1.69805863332134e-05, + "loss": 1.1346, + "step": 3000 + }, + { + "epoch": 0.3291888042942433, + "grad_norm": 1.28125, + "learning_rate": 1.6966882114307756e-05, + "loss": 1.2173, + "step": 3005 + }, + { + "epoch": 0.32973653940954156, + "grad_norm": 1.25, + "learning_rate": 1.695315242340785e-05, + "loss": 1.1515, + "step": 3010 + }, + { + "epoch": 0.33028427452483977, + "grad_norm": 1.1953125, + "learning_rate": 1.6939397310711557e-05, + "loss": 1.1896, + "step": 3015 + }, + { + "epoch": 0.33083200964013804, + "grad_norm": 1.171875, + "learning_rate": 1.6925616826509678e-05, + "loss": 1.16, + "step": 3020 + }, + { + "epoch": 0.33137974475543625, + "grad_norm": 1.2421875, + "learning_rate": 1.6911811021185795e-05, + "loss": 1.1782, + "step": 3025 + }, + { + "epoch": 0.3319274798707345, + "grad_norm": 1.2109375, + "learning_rate": 1.689797994521605e-05, + "loss": 1.1129, + "step": 3030 + }, + { + "epoch": 0.3324752149860328, + "grad_norm": 1.2265625, + "learning_rate": 1.688412364916899e-05, + "loss": 1.1878, + "step": 3035 + }, + { + "epoch": 0.333022950101331, + "grad_norm": 1.2421875, + "learning_rate": 1.6870242183705374e-05, + "loss": 1.1491, + "step": 3040 + }, + { + "epoch": 0.33357068521662925, + "grad_norm": 1.203125, + "learning_rate": 1.6856335599577973e-05, + "loss": 1.2087, + "step": 3045 + }, + { + "epoch": 0.33411842033192746, + "grad_norm": 1.25, + "learning_rate": 1.68424039476314e-05, + "loss": 1.1259, + "step": 3050 + }, + { + "epoch": 0.3346661554472257, + "grad_norm": 1.2734375, + "learning_rate": 1.6828447278801923e-05, + "loss": 1.1633, + "step": 3055 + }, + { + "epoch": 0.335213890562524, + "grad_norm": 1.2265625, + "learning_rate": 1.6814465644117274e-05, + "loss": 1.1689, + "step": 3060 + }, + { + "epoch": 0.3357616256778222, + "grad_norm": 1.1484375, + "learning_rate": 1.6800459094696458e-05, + "loss": 1.2037, + "step": 3065 + }, + { + "epoch": 0.33630936079312046, + "grad_norm": 1.3046875, + "learning_rate": 1.6786427681749585e-05, + "loss": 1.166, + "step": 3070 + }, + { + "epoch": 0.33685709590841867, + "grad_norm": 1.1875, + "learning_rate": 1.677237145657766e-05, + "loss": 1.1427, + "step": 3075 + }, + { + "epoch": 0.33740483102371693, + "grad_norm": 1.375, + "learning_rate": 1.6758290470572404e-05, + "loss": 1.1598, + "step": 3080 + }, + { + "epoch": 0.3379525661390152, + "grad_norm": 1.21875, + "learning_rate": 1.674418477521607e-05, + "loss": 1.1667, + "step": 3085 + }, + { + "epoch": 0.3385003012543134, + "grad_norm": 1.3203125, + "learning_rate": 1.673005442208126e-05, + "loss": 1.1389, + "step": 3090 + }, + { + "epoch": 0.33904803636961167, + "grad_norm": 1.265625, + "learning_rate": 1.6715899462830716e-05, + "loss": 1.1899, + "step": 3095 + }, + { + "epoch": 0.3395957714849099, + "grad_norm": 1.2109375, + "learning_rate": 1.6701719949217154e-05, + "loss": 1.1621, + "step": 3100 + }, + { + "epoch": 0.34014350660020815, + "grad_norm": 1.203125, + "learning_rate": 1.6687515933083055e-05, + "loss": 1.1612, + "step": 3105 + }, + { + "epoch": 0.34069124171550635, + "grad_norm": 1.21875, + "learning_rate": 1.66732874663605e-05, + "loss": 1.1507, + "step": 3110 + }, + { + "epoch": 0.3412389768308046, + "grad_norm": 1.21875, + "learning_rate": 1.665903460107094e-05, + "loss": 1.1442, + "step": 3115 + }, + { + "epoch": 0.3417867119461029, + "grad_norm": 1.2109375, + "learning_rate": 1.664475738932506e-05, + "loss": 1.1422, + "step": 3120 + }, + { + "epoch": 0.3423344470614011, + "grad_norm": 1.21875, + "learning_rate": 1.6630455883322535e-05, + "loss": 1.1343, + "step": 3125 + }, + { + "epoch": 0.34288218217669936, + "grad_norm": 1.2265625, + "learning_rate": 1.6616130135351884e-05, + "loss": 1.1564, + "step": 3130 + }, + { + "epoch": 0.34342991729199757, + "grad_norm": 1.234375, + "learning_rate": 1.660178019779024e-05, + "loss": 1.1465, + "step": 3135 + }, + { + "epoch": 0.34397765240729583, + "grad_norm": 1.203125, + "learning_rate": 1.6587406123103194e-05, + "loss": 1.1779, + "step": 3140 + }, + { + "epoch": 0.3445253875225941, + "grad_norm": 1.2265625, + "learning_rate": 1.657300796384457e-05, + "loss": 1.1625, + "step": 3145 + }, + { + "epoch": 0.3450731226378923, + "grad_norm": 1.203125, + "learning_rate": 1.6558585772656255e-05, + "loss": 1.1463, + "step": 3150 + }, + { + "epoch": 0.34562085775319057, + "grad_norm": 1.1875, + "learning_rate": 1.6544139602268008e-05, + "loss": 1.1371, + "step": 3155 + }, + { + "epoch": 0.3461685928684888, + "grad_norm": 1.2265625, + "learning_rate": 1.652966950549725e-05, + "loss": 1.179, + "step": 3160 + }, + { + "epoch": 0.34671632798378704, + "grad_norm": 1.1953125, + "learning_rate": 1.6515175535248883e-05, + "loss": 1.1811, + "step": 3165 + }, + { + "epoch": 0.3472640630990853, + "grad_norm": 1.2109375, + "learning_rate": 1.6500657744515098e-05, + "loss": 1.1595, + "step": 3170 + }, + { + "epoch": 0.3478117982143835, + "grad_norm": 1.25, + "learning_rate": 1.6486116186375175e-05, + "loss": 1.1437, + "step": 3175 + }, + { + "epoch": 0.3483595333296818, + "grad_norm": 1.2421875, + "learning_rate": 1.6471550913995286e-05, + "loss": 1.1407, + "step": 3180 + }, + { + "epoch": 0.34890726844498, + "grad_norm": 1.2421875, + "learning_rate": 1.6456961980628317e-05, + "loss": 1.1064, + "step": 3185 + }, + { + "epoch": 0.34945500356027825, + "grad_norm": 1.1875, + "learning_rate": 1.6442349439613648e-05, + "loss": 1.1789, + "step": 3190 + }, + { + "epoch": 0.35000273867557646, + "grad_norm": 1.265625, + "learning_rate": 1.6427713344376987e-05, + "loss": 1.138, + "step": 3195 + }, + { + "epoch": 0.35055047379087473, + "grad_norm": 1.1640625, + "learning_rate": 1.6413053748430145e-05, + "loss": 1.1574, + "step": 3200 + }, + { + "epoch": 0.351098208906173, + "grad_norm": 1.2890625, + "learning_rate": 1.639837070537087e-05, + "loss": 1.1448, + "step": 3205 + }, + { + "epoch": 0.3516459440214712, + "grad_norm": 1.2421875, + "learning_rate": 1.6383664268882632e-05, + "loss": 1.151, + "step": 3210 + }, + { + "epoch": 0.35219367913676947, + "grad_norm": 1.2421875, + "learning_rate": 1.636893449273442e-05, + "loss": 1.1196, + "step": 3215 + }, + { + "epoch": 0.3527414142520677, + "grad_norm": 1.1640625, + "learning_rate": 1.635418143078057e-05, + "loss": 1.1732, + "step": 3220 + }, + { + "epoch": 0.35328914936736594, + "grad_norm": 1.25, + "learning_rate": 1.6339405136960544e-05, + "loss": 1.1382, + "step": 3225 + }, + { + "epoch": 0.3538368844826642, + "grad_norm": 1.2734375, + "learning_rate": 1.6324605665298755e-05, + "loss": 1.1572, + "step": 3230 + }, + { + "epoch": 0.3543846195979624, + "grad_norm": 1.234375, + "learning_rate": 1.630978306990435e-05, + "loss": 1.1503, + "step": 3235 + }, + { + "epoch": 0.3549323547132607, + "grad_norm": 1.203125, + "learning_rate": 1.6294937404971016e-05, + "loss": 1.1678, + "step": 3240 + }, + { + "epoch": 0.3554800898285589, + "grad_norm": 1.203125, + "learning_rate": 1.6280068724776795e-05, + "loss": 1.1897, + "step": 3245 + }, + { + "epoch": 0.35602782494385715, + "grad_norm": 1.21875, + "learning_rate": 1.6265177083683875e-05, + "loss": 1.1742, + "step": 3250 + }, + { + "epoch": 0.3565755600591554, + "grad_norm": 1.265625, + "learning_rate": 1.6250262536138383e-05, + "loss": 1.1802, + "step": 3255 + }, + { + "epoch": 0.3571232951744536, + "grad_norm": 1.1953125, + "learning_rate": 1.6235325136670208e-05, + "loss": 1.1628, + "step": 3260 + }, + { + "epoch": 0.3576710302897519, + "grad_norm": 1.2421875, + "learning_rate": 1.6220364939892783e-05, + "loss": 1.1598, + "step": 3265 + }, + { + "epoch": 0.3582187654050501, + "grad_norm": 1.2578125, + "learning_rate": 1.6205382000502887e-05, + "loss": 1.1226, + "step": 3270 + }, + { + "epoch": 0.35876650052034836, + "grad_norm": 1.2265625, + "learning_rate": 1.619037637328046e-05, + "loss": 1.1599, + "step": 3275 + }, + { + "epoch": 0.35931423563564663, + "grad_norm": 1.21875, + "learning_rate": 1.617534811308839e-05, + "loss": 1.1609, + "step": 3280 + }, + { + "epoch": 0.35986197075094484, + "grad_norm": 1.2109375, + "learning_rate": 1.61602972748723e-05, + "loss": 1.1611, + "step": 3285 + }, + { + "epoch": 0.3604097058662431, + "grad_norm": 1.203125, + "learning_rate": 1.6145223913660378e-05, + "loss": 1.1171, + "step": 3290 + }, + { + "epoch": 0.3609574409815413, + "grad_norm": 1.21875, + "learning_rate": 1.613012808456316e-05, + "loss": 1.1123, + "step": 3295 + }, + { + "epoch": 0.3615051760968396, + "grad_norm": 1.2890625, + "learning_rate": 1.6115009842773322e-05, + "loss": 1.1953, + "step": 3300 + }, + { + "epoch": 0.3620529112121378, + "grad_norm": 1.2265625, + "learning_rate": 1.609986924356548e-05, + "loss": 1.0976, + "step": 3305 + }, + { + "epoch": 0.36260064632743605, + "grad_norm": 1.15625, + "learning_rate": 1.6084706342295994e-05, + "loss": 1.1681, + "step": 3310 + }, + { + "epoch": 0.3631483814427343, + "grad_norm": 1.234375, + "learning_rate": 1.6069521194402776e-05, + "loss": 1.1529, + "step": 3315 + }, + { + "epoch": 0.3636961165580325, + "grad_norm": 1.2109375, + "learning_rate": 1.605431385540506e-05, + "loss": 1.1988, + "step": 3320 + }, + { + "epoch": 0.3642438516733308, + "grad_norm": 1.265625, + "learning_rate": 1.6039084380903222e-05, + "loss": 1.1394, + "step": 3325 + }, + { + "epoch": 0.364791586788629, + "grad_norm": 1.2109375, + "learning_rate": 1.602383282657857e-05, + "loss": 1.1255, + "step": 3330 + }, + { + "epoch": 0.36533932190392726, + "grad_norm": 1.1875, + "learning_rate": 1.6008559248193128e-05, + "loss": 1.1545, + "step": 3335 + }, + { + "epoch": 0.3658870570192255, + "grad_norm": 1.25, + "learning_rate": 1.599326370158946e-05, + "loss": 1.1661, + "step": 3340 + }, + { + "epoch": 0.36643479213452373, + "grad_norm": 1.2421875, + "learning_rate": 1.597794624269043e-05, + "loss": 1.1714, + "step": 3345 + }, + { + "epoch": 0.366982527249822, + "grad_norm": 1.1953125, + "learning_rate": 1.5962606927499043e-05, + "loss": 1.1919, + "step": 3350 + }, + { + "epoch": 0.3675302623651202, + "grad_norm": 1.2109375, + "learning_rate": 1.5947245812098186e-05, + "loss": 1.1691, + "step": 3355 + }, + { + "epoch": 0.3680779974804185, + "grad_norm": 1.1796875, + "learning_rate": 1.5931862952650466e-05, + "loss": 1.1454, + "step": 3360 + }, + { + "epoch": 0.36862573259571674, + "grad_norm": 1.296875, + "learning_rate": 1.591645840539799e-05, + "loss": 1.1235, + "step": 3365 + }, + { + "epoch": 0.36917346771101495, + "grad_norm": 1.125, + "learning_rate": 1.5901032226662154e-05, + "loss": 1.1144, + "step": 3370 + }, + { + "epoch": 0.3697212028263132, + "grad_norm": 1.21875, + "learning_rate": 1.588558447284344e-05, + "loss": 1.0976, + "step": 3375 + }, + { + "epoch": 0.3702689379416114, + "grad_norm": 1.2265625, + "learning_rate": 1.5870115200421223e-05, + "loss": 1.1143, + "step": 3380 + }, + { + "epoch": 0.3708166730569097, + "grad_norm": 1.1953125, + "learning_rate": 1.5854624465953537e-05, + "loss": 1.1874, + "step": 3385 + }, + { + "epoch": 0.3713644081722079, + "grad_norm": 1.203125, + "learning_rate": 1.5839112326076905e-05, + "loss": 1.1331, + "step": 3390 + }, + { + "epoch": 0.37191214328750616, + "grad_norm": 1.25, + "learning_rate": 1.582357883750609e-05, + "loss": 1.2034, + "step": 3395 + }, + { + "epoch": 0.3724598784028044, + "grad_norm": 1.203125, + "learning_rate": 1.5808024057033927e-05, + "loss": 1.1387, + "step": 3400 + }, + { + "epoch": 0.37300761351810263, + "grad_norm": 1.21875, + "learning_rate": 1.579244804153108e-05, + "loss": 1.1715, + "step": 3405 + }, + { + "epoch": 0.3735553486334009, + "grad_norm": 1.21875, + "learning_rate": 1.5776850847945867e-05, + "loss": 1.1801, + "step": 3410 + }, + { + "epoch": 0.3741030837486991, + "grad_norm": 1.2578125, + "learning_rate": 1.5761232533304034e-05, + "loss": 1.092, + "step": 3415 + }, + { + "epoch": 0.37465081886399737, + "grad_norm": 1.2421875, + "learning_rate": 1.5745593154708543e-05, + "loss": 1.1536, + "step": 3420 + }, + { + "epoch": 0.37519855397929563, + "grad_norm": 1.1796875, + "learning_rate": 1.5729932769339366e-05, + "loss": 1.1283, + "step": 3425 + }, + { + "epoch": 0.37574628909459384, + "grad_norm": 1.203125, + "learning_rate": 1.571425143445329e-05, + "loss": 1.1645, + "step": 3430 + }, + { + "epoch": 0.3762940242098921, + "grad_norm": 1.2578125, + "learning_rate": 1.5698549207383687e-05, + "loss": 1.1203, + "step": 3435 + }, + { + "epoch": 0.3768417593251903, + "grad_norm": 1.234375, + "learning_rate": 1.5682826145540324e-05, + "loss": 1.1555, + "step": 3440 + }, + { + "epoch": 0.3773894944404886, + "grad_norm": 1.3046875, + "learning_rate": 1.566708230640913e-05, + "loss": 1.1614, + "step": 3445 + }, + { + "epoch": 0.37793722955578685, + "grad_norm": 1.203125, + "learning_rate": 1.5651317747552014e-05, + "loss": 1.1987, + "step": 3450 + }, + { + "epoch": 0.37848496467108506, + "grad_norm": 1.28125, + "learning_rate": 1.5635532526606625e-05, + "loss": 1.1825, + "step": 3455 + }, + { + "epoch": 0.3790326997863833, + "grad_norm": 1.25, + "learning_rate": 1.5619726701286167e-05, + "loss": 1.1229, + "step": 3460 + }, + { + "epoch": 0.37958043490168153, + "grad_norm": 1.21875, + "learning_rate": 1.5603900329379168e-05, + "loss": 1.1509, + "step": 3465 + }, + { + "epoch": 0.3801281700169798, + "grad_norm": 1.203125, + "learning_rate": 1.5588053468749285e-05, + "loss": 1.185, + "step": 3470 + }, + { + "epoch": 0.380675905132278, + "grad_norm": 1.2265625, + "learning_rate": 1.5572186177335084e-05, + "loss": 1.1458, + "step": 3475 + }, + { + "epoch": 0.38122364024757627, + "grad_norm": 1.3359375, + "learning_rate": 1.555629851314982e-05, + "loss": 1.1651, + "step": 3480 + }, + { + "epoch": 0.38177137536287453, + "grad_norm": 1.234375, + "learning_rate": 1.5540390534281245e-05, + "loss": 1.1313, + "step": 3485 + }, + { + "epoch": 0.38231911047817274, + "grad_norm": 1.1875, + "learning_rate": 1.5524462298891377e-05, + "loss": 1.2005, + "step": 3490 + }, + { + "epoch": 0.382866845593471, + "grad_norm": 1.1875, + "learning_rate": 1.5508513865216306e-05, + "loss": 1.1227, + "step": 3495 + }, + { + "epoch": 0.3834145807087692, + "grad_norm": 1.2421875, + "learning_rate": 1.5492545291565953e-05, + "loss": 1.1439, + "step": 3500 + }, + { + "epoch": 0.3839623158240675, + "grad_norm": 1.21875, + "learning_rate": 1.5476556636323893e-05, + "loss": 1.1816, + "step": 3505 + }, + { + "epoch": 0.38451005093936574, + "grad_norm": 1.234375, + "learning_rate": 1.5460547957947105e-05, + "loss": 1.124, + "step": 3510 + }, + { + "epoch": 0.38505778605466395, + "grad_norm": 1.2890625, + "learning_rate": 1.5444519314965782e-05, + "loss": 1.1489, + "step": 3515 + }, + { + "epoch": 0.3856055211699622, + "grad_norm": 1.2734375, + "learning_rate": 1.542847076598312e-05, + "loss": 1.1407, + "step": 3520 + }, + { + "epoch": 0.3861532562852604, + "grad_norm": 1.1796875, + "learning_rate": 1.5412402369675082e-05, + "loss": 1.2599, + "step": 3525 + }, + { + "epoch": 0.3867009914005587, + "grad_norm": 1.234375, + "learning_rate": 1.5396314184790194e-05, + "loss": 1.1772, + "step": 3530 + }, + { + "epoch": 0.38724872651585696, + "grad_norm": 1.1953125, + "learning_rate": 1.5380206270149353e-05, + "loss": 1.1525, + "step": 3535 + }, + { + "epoch": 0.38779646163115516, + "grad_norm": 1.1640625, + "learning_rate": 1.536407868464556e-05, + "loss": 1.1601, + "step": 3540 + }, + { + "epoch": 0.38834419674645343, + "grad_norm": 1.265625, + "learning_rate": 1.534793148724376e-05, + "loss": 1.1402, + "step": 3545 + }, + { + "epoch": 0.38889193186175164, + "grad_norm": 1.2109375, + "learning_rate": 1.5331764736980602e-05, + "loss": 1.1789, + "step": 3550 + }, + { + "epoch": 0.3894396669770499, + "grad_norm": 1.2578125, + "learning_rate": 1.5315578492964203e-05, + "loss": 1.1397, + "step": 3555 + }, + { + "epoch": 0.38998740209234817, + "grad_norm": 1.234375, + "learning_rate": 1.5299372814373967e-05, + "loss": 1.183, + "step": 3560 + }, + { + "epoch": 0.3905351372076464, + "grad_norm": 1.1875, + "learning_rate": 1.5283147760460354e-05, + "loss": 1.1203, + "step": 3565 + }, + { + "epoch": 0.39108287232294464, + "grad_norm": 1.1328125, + "learning_rate": 1.5266903390544662e-05, + "loss": 1.0783, + "step": 3570 + }, + { + "epoch": 0.39163060743824285, + "grad_norm": 1.234375, + "learning_rate": 1.5250639764018807e-05, + "loss": 1.1258, + "step": 3575 + }, + { + "epoch": 0.3921783425535411, + "grad_norm": 1.2109375, + "learning_rate": 1.5234356940345115e-05, + "loss": 1.158, + "step": 3580 + }, + { + "epoch": 0.3927260776688393, + "grad_norm": 1.203125, + "learning_rate": 1.5218054979056093e-05, + "loss": 1.1331, + "step": 3585 + }, + { + "epoch": 0.3932738127841376, + "grad_norm": 1.21875, + "learning_rate": 1.5201733939754228e-05, + "loss": 1.2043, + "step": 3590 + }, + { + "epoch": 0.39382154789943585, + "grad_norm": 1.265625, + "learning_rate": 1.5185393882111756e-05, + "loss": 1.1403, + "step": 3595 + }, + { + "epoch": 0.39436928301473406, + "grad_norm": 1.21875, + "learning_rate": 1.5169034865870438e-05, + "loss": 1.1392, + "step": 3600 + }, + { + "epoch": 0.3949170181300323, + "grad_norm": 1.2578125, + "learning_rate": 1.5152656950841361e-05, + "loss": 1.1757, + "step": 3605 + }, + { + "epoch": 0.39546475324533054, + "grad_norm": 1.203125, + "learning_rate": 1.5136260196904704e-05, + "loss": 1.1485, + "step": 3610 + }, + { + "epoch": 0.3960124883606288, + "grad_norm": 1.21875, + "learning_rate": 1.5119844664009526e-05, + "loss": 1.1333, + "step": 3615 + }, + { + "epoch": 0.39656022347592707, + "grad_norm": 1.28125, + "learning_rate": 1.510341041217355e-05, + "loss": 1.1758, + "step": 3620 + }, + { + "epoch": 0.3971079585912253, + "grad_norm": 1.1875, + "learning_rate": 1.508695750148292e-05, + "loss": 1.1336, + "step": 3625 + }, + { + "epoch": 0.39765569370652354, + "grad_norm": 1.2421875, + "learning_rate": 1.5070485992092023e-05, + "loss": 1.1332, + "step": 3630 + }, + { + "epoch": 0.39820342882182175, + "grad_norm": 1.25, + "learning_rate": 1.505399594422323e-05, + "loss": 1.2085, + "step": 3635 + }, + { + "epoch": 0.39875116393712, + "grad_norm": 1.21875, + "learning_rate": 1.5037487418166696e-05, + "loss": 1.1608, + "step": 3640 + }, + { + "epoch": 0.3992988990524183, + "grad_norm": 1.2890625, + "learning_rate": 1.5020960474280137e-05, + "loss": 1.1443, + "step": 3645 + }, + { + "epoch": 0.3998466341677165, + "grad_norm": 1.2109375, + "learning_rate": 1.5004415172988606e-05, + "loss": 1.1886, + "step": 3650 + }, + { + "epoch": 0.40039436928301475, + "grad_norm": 1.3125, + "learning_rate": 1.4987851574784271e-05, + "loss": 1.1719, + "step": 3655 + }, + { + "epoch": 0.40094210439831296, + "grad_norm": 1.1875, + "learning_rate": 1.4971269740226203e-05, + "loss": 1.1735, + "step": 3660 + }, + { + "epoch": 0.4014898395136112, + "grad_norm": 1.1953125, + "learning_rate": 1.4954669729940137e-05, + "loss": 1.141, + "step": 3665 + }, + { + "epoch": 0.40203757462890943, + "grad_norm": 1.2265625, + "learning_rate": 1.4938051604618275e-05, + "loss": 1.1463, + "step": 3670 + }, + { + "epoch": 0.4025853097442077, + "grad_norm": 1.1875, + "learning_rate": 1.4921415425019039e-05, + "loss": 1.1986, + "step": 3675 + }, + { + "epoch": 0.40313304485950596, + "grad_norm": 1.203125, + "learning_rate": 1.4904761251966864e-05, + "loss": 1.15, + "step": 3680 + }, + { + "epoch": 0.40368077997480417, + "grad_norm": 1.21875, + "learning_rate": 1.4888089146351971e-05, + "loss": 1.1433, + "step": 3685 + }, + { + "epoch": 0.40422851509010244, + "grad_norm": 1.265625, + "learning_rate": 1.4871399169130157e-05, + "loss": 1.1749, + "step": 3690 + }, + { + "epoch": 0.40477625020540065, + "grad_norm": 1.2265625, + "learning_rate": 1.4854691381322538e-05, + "loss": 1.1315, + "step": 3695 + }, + { + "epoch": 0.4053239853206989, + "grad_norm": 1.21875, + "learning_rate": 1.4837965844015359e-05, + "loss": 1.1184, + "step": 3700 + }, + { + "epoch": 0.4058717204359972, + "grad_norm": 1.203125, + "learning_rate": 1.4821222618359769e-05, + "loss": 1.1398, + "step": 3705 + }, + { + "epoch": 0.4064194555512954, + "grad_norm": 1.21875, + "learning_rate": 1.480446176557158e-05, + "loss": 1.1341, + "step": 3710 + }, + { + "epoch": 0.40696719066659365, + "grad_norm": 1.234375, + "learning_rate": 1.4787683346931047e-05, + "loss": 1.1129, + "step": 3715 + }, + { + "epoch": 0.40751492578189186, + "grad_norm": 1.2109375, + "learning_rate": 1.4770887423782654e-05, + "loss": 1.1532, + "step": 3720 + }, + { + "epoch": 0.4080626608971901, + "grad_norm": 1.171875, + "learning_rate": 1.4754074057534885e-05, + "loss": 1.0848, + "step": 3725 + }, + { + "epoch": 0.4086103960124884, + "grad_norm": 1.2421875, + "learning_rate": 1.4737243309659998e-05, + "loss": 1.1446, + "step": 3730 + }, + { + "epoch": 0.4091581311277866, + "grad_norm": 1.2265625, + "learning_rate": 1.4720395241693796e-05, + "loss": 1.1819, + "step": 3735 + }, + { + "epoch": 0.40970586624308486, + "grad_norm": 1.3046875, + "learning_rate": 1.4703529915235417e-05, + "loss": 1.1483, + "step": 3740 + }, + { + "epoch": 0.41025360135838307, + "grad_norm": 1.234375, + "learning_rate": 1.468664739194709e-05, + "loss": 1.1191, + "step": 3745 + }, + { + "epoch": 0.41080133647368133, + "grad_norm": 1.25, + "learning_rate": 1.4669747733553917e-05, + "loss": 1.1227, + "step": 3750 + }, + { + "epoch": 0.41134907158897954, + "grad_norm": 1.203125, + "learning_rate": 1.4652831001843656e-05, + "loss": 1.1884, + "step": 3755 + }, + { + "epoch": 0.4118968067042778, + "grad_norm": 1.2421875, + "learning_rate": 1.4635897258666484e-05, + "loss": 1.1812, + "step": 3760 + }, + { + "epoch": 0.41244454181957607, + "grad_norm": 1.21875, + "learning_rate": 1.4618946565934775e-05, + "loss": 1.1986, + "step": 3765 + }, + { + "epoch": 0.4129922769348743, + "grad_norm": 1.21875, + "learning_rate": 1.4601978985622874e-05, + "loss": 1.2179, + "step": 3770 + }, + { + "epoch": 0.41354001205017255, + "grad_norm": 1.1640625, + "learning_rate": 1.4584994579766865e-05, + "loss": 1.1142, + "step": 3775 + }, + { + "epoch": 0.41408774716547075, + "grad_norm": 1.21875, + "learning_rate": 1.4567993410464354e-05, + "loss": 1.1743, + "step": 3780 + }, + { + "epoch": 0.414635482280769, + "grad_norm": 1.2265625, + "learning_rate": 1.4550975539874233e-05, + "loss": 1.2144, + "step": 3785 + }, + { + "epoch": 0.4151832173960673, + "grad_norm": 1.3125, + "learning_rate": 1.4533941030216466e-05, + "loss": 1.1351, + "step": 3790 + }, + { + "epoch": 0.4157309525113655, + "grad_norm": 1.21875, + "learning_rate": 1.4516889943771835e-05, + "loss": 1.1299, + "step": 3795 + }, + { + "epoch": 0.41627868762666376, + "grad_norm": 1.203125, + "learning_rate": 1.4499822342881744e-05, + "loss": 1.1562, + "step": 3800 + }, + { + "epoch": 0.41682642274196197, + "grad_norm": 1.1953125, + "learning_rate": 1.4482738289947968e-05, + "loss": 1.1797, + "step": 3805 + }, + { + "epoch": 0.41737415785726023, + "grad_norm": 1.203125, + "learning_rate": 1.4465637847432444e-05, + "loss": 1.1477, + "step": 3810 + }, + { + "epoch": 0.4179218929725585, + "grad_norm": 1.1875, + "learning_rate": 1.4448521077857013e-05, + "loss": 1.1662, + "step": 3815 + }, + { + "epoch": 0.4184696280878567, + "grad_norm": 1.203125, + "learning_rate": 1.4431388043803227e-05, + "loss": 1.1298, + "step": 3820 + }, + { + "epoch": 0.41901736320315497, + "grad_norm": 1.234375, + "learning_rate": 1.44142388079121e-05, + "loss": 1.1517, + "step": 3825 + }, + { + "epoch": 0.4195650983184532, + "grad_norm": 1.25, + "learning_rate": 1.439707343288388e-05, + "loss": 1.1491, + "step": 3830 + }, + { + "epoch": 0.42011283343375144, + "grad_norm": 1.1953125, + "learning_rate": 1.4379891981477816e-05, + "loss": 1.1864, + "step": 3835 + }, + { + "epoch": 0.4206605685490497, + "grad_norm": 1.2109375, + "learning_rate": 1.4362694516511946e-05, + "loss": 1.1392, + "step": 3840 + }, + { + "epoch": 0.4212083036643479, + "grad_norm": 1.390625, + "learning_rate": 1.434548110086285e-05, + "loss": 1.1222, + "step": 3845 + }, + { + "epoch": 0.4217560387796462, + "grad_norm": 1.328125, + "learning_rate": 1.4328251797465434e-05, + "loss": 1.1979, + "step": 3850 + }, + { + "epoch": 0.4223037738949444, + "grad_norm": 1.2421875, + "learning_rate": 1.4311006669312672e-05, + "loss": 1.2049, + "step": 3855 + }, + { + "epoch": 0.42285150901024265, + "grad_norm": 1.234375, + "learning_rate": 1.4293745779455418e-05, + "loss": 1.1786, + "step": 3860 + }, + { + "epoch": 0.42339924412554086, + "grad_norm": 1.2265625, + "learning_rate": 1.4276469191002147e-05, + "loss": 1.1306, + "step": 3865 + }, + { + "epoch": 0.42394697924083913, + "grad_norm": 1.25, + "learning_rate": 1.4259176967118719e-05, + "loss": 1.1525, + "step": 3870 + }, + { + "epoch": 0.4244947143561374, + "grad_norm": 1.265625, + "learning_rate": 1.4241869171028178e-05, + "loss": 1.153, + "step": 3875 + }, + { + "epoch": 0.4250424494714356, + "grad_norm": 1.1953125, + "learning_rate": 1.4224545866010484e-05, + "loss": 1.1271, + "step": 3880 + }, + { + "epoch": 0.42559018458673387, + "grad_norm": 1.2109375, + "learning_rate": 1.4207207115402316e-05, + "loss": 1.1419, + "step": 3885 + }, + { + "epoch": 0.4261379197020321, + "grad_norm": 1.171875, + "learning_rate": 1.4189852982596813e-05, + "loss": 1.1326, + "step": 3890 + }, + { + "epoch": 0.42668565481733034, + "grad_norm": 1.203125, + "learning_rate": 1.4172483531043358e-05, + "loss": 1.1815, + "step": 3895 + }, + { + "epoch": 0.4272333899326286, + "grad_norm": 1.1796875, + "learning_rate": 1.4155098824247341e-05, + "loss": 1.1368, + "step": 3900 + }, + { + "epoch": 0.4277811250479268, + "grad_norm": 1.234375, + "learning_rate": 1.4137698925769931e-05, + "loss": 1.218, + "step": 3905 + }, + { + "epoch": 0.4283288601632251, + "grad_norm": 1.25, + "learning_rate": 1.4120283899227839e-05, + "loss": 1.1762, + "step": 3910 + }, + { + "epoch": 0.4288765952785233, + "grad_norm": 1.2109375, + "learning_rate": 1.4102853808293077e-05, + "loss": 1.15, + "step": 3915 + }, + { + "epoch": 0.42942433039382155, + "grad_norm": 1.203125, + "learning_rate": 1.408540871669275e-05, + "loss": 1.1668, + "step": 3920 + }, + { + "epoch": 0.4299720655091198, + "grad_norm": 1.1796875, + "learning_rate": 1.4067948688208799e-05, + "loss": 1.1276, + "step": 3925 + }, + { + "epoch": 0.430519800624418, + "grad_norm": 1.203125, + "learning_rate": 1.4050473786677774e-05, + "loss": 1.189, + "step": 3930 + }, + { + "epoch": 0.4310675357397163, + "grad_norm": 1.234375, + "learning_rate": 1.403298407599061e-05, + "loss": 1.1595, + "step": 3935 + }, + { + "epoch": 0.4316152708550145, + "grad_norm": 1.2578125, + "learning_rate": 1.4015479620092383e-05, + "loss": 1.1478, + "step": 3940 + }, + { + "epoch": 0.43216300597031276, + "grad_norm": 1.1640625, + "learning_rate": 1.3997960482982082e-05, + "loss": 1.1392, + "step": 3945 + }, + { + "epoch": 0.432710741085611, + "grad_norm": 1.265625, + "learning_rate": 1.3980426728712369e-05, + "loss": 1.1706, + "step": 3950 + }, + { + "epoch": 0.43325847620090924, + "grad_norm": 1.2421875, + "learning_rate": 1.396287842138935e-05, + "loss": 1.1913, + "step": 3955 + }, + { + "epoch": 0.4338062113162075, + "grad_norm": 1.328125, + "learning_rate": 1.394531562517234e-05, + "loss": 1.1721, + "step": 3960 + }, + { + "epoch": 0.4343539464315057, + "grad_norm": 1.3984375, + "learning_rate": 1.3927738404273634e-05, + "loss": 1.2269, + "step": 3965 + }, + { + "epoch": 0.434901681546804, + "grad_norm": 1.2109375, + "learning_rate": 1.391014682295825e-05, + "loss": 1.1459, + "step": 3970 + }, + { + "epoch": 0.4354494166621022, + "grad_norm": 1.25, + "learning_rate": 1.3892540945543722e-05, + "loss": 1.117, + "step": 3975 + }, + { + "epoch": 0.43599715177740045, + "grad_norm": 1.2265625, + "learning_rate": 1.3874920836399854e-05, + "loss": 1.1366, + "step": 3980 + }, + { + "epoch": 0.4365448868926987, + "grad_norm": 1.2578125, + "learning_rate": 1.3857286559948476e-05, + "loss": 1.178, + "step": 3985 + }, + { + "epoch": 0.4370926220079969, + "grad_norm": 1.1796875, + "learning_rate": 1.383963818066322e-05, + "loss": 1.0893, + "step": 3990 + }, + { + "epoch": 0.4376403571232952, + "grad_norm": 1.2734375, + "learning_rate": 1.3821975763069279e-05, + "loss": 1.1656, + "step": 3995 + }, + { + "epoch": 0.4381880922385934, + "grad_norm": 1.2109375, + "learning_rate": 1.3804299371743174e-05, + "loss": 1.1316, + "step": 4000 + }, + { + "epoch": 0.43873582735389166, + "grad_norm": 1.203125, + "learning_rate": 1.3786609071312511e-05, + "loss": 1.1865, + "step": 4005 + }, + { + "epoch": 0.4392835624691899, + "grad_norm": 1.203125, + "learning_rate": 1.376890492645576e-05, + "loss": 1.1552, + "step": 4010 + }, + { + "epoch": 0.43983129758448813, + "grad_norm": 1.21875, + "learning_rate": 1.3751187001901995e-05, + "loss": 1.1758, + "step": 4015 + }, + { + "epoch": 0.4403790326997864, + "grad_norm": 1.203125, + "learning_rate": 1.3733455362430684e-05, + "loss": 1.1494, + "step": 4020 + }, + { + "epoch": 0.4409267678150846, + "grad_norm": 1.21875, + "learning_rate": 1.3715710072871426e-05, + "loss": 1.1465, + "step": 4025 + }, + { + "epoch": 0.4414745029303829, + "grad_norm": 1.2265625, + "learning_rate": 1.369795119810374e-05, + "loss": 1.1678, + "step": 4030 + }, + { + "epoch": 0.4420222380456811, + "grad_norm": 1.2421875, + "learning_rate": 1.3680178803056802e-05, + "loss": 1.1544, + "step": 4035 + }, + { + "epoch": 0.44256997316097935, + "grad_norm": 1.2421875, + "learning_rate": 1.366239295270923e-05, + "loss": 1.1633, + "step": 4040 + }, + { + "epoch": 0.4431177082762776, + "grad_norm": 1.1875, + "learning_rate": 1.3644593712088829e-05, + "loss": 1.1644, + "step": 4045 + }, + { + "epoch": 0.4436654433915758, + "grad_norm": 1.2265625, + "learning_rate": 1.3626781146272369e-05, + "loss": 1.1366, + "step": 4050 + }, + { + "epoch": 0.4442131785068741, + "grad_norm": 1.2265625, + "learning_rate": 1.3608955320385333e-05, + "loss": 1.1761, + "step": 4055 + }, + { + "epoch": 0.4447609136221723, + "grad_norm": 1.203125, + "learning_rate": 1.3591116299601684e-05, + "loss": 1.1194, + "step": 4060 + }, + { + "epoch": 0.44530864873747056, + "grad_norm": 1.1953125, + "learning_rate": 1.3573264149143636e-05, + "loss": 1.1779, + "step": 4065 + }, + { + "epoch": 0.4458563838527688, + "grad_norm": 1.140625, + "learning_rate": 1.3555398934281397e-05, + "loss": 1.1487, + "step": 4070 + }, + { + "epoch": 0.44640411896806703, + "grad_norm": 1.1953125, + "learning_rate": 1.3537520720332943e-05, + "loss": 1.1687, + "step": 4075 + }, + { + "epoch": 0.4469518540833653, + "grad_norm": 1.2421875, + "learning_rate": 1.351962957266378e-05, + "loss": 1.1544, + "step": 4080 + }, + { + "epoch": 0.4474995891986635, + "grad_norm": 1.234375, + "learning_rate": 1.3501725556686702e-05, + "loss": 1.1358, + "step": 4085 + }, + { + "epoch": 0.44804732431396177, + "grad_norm": 1.265625, + "learning_rate": 1.3483808737861547e-05, + "loss": 1.1723, + "step": 4090 + }, + { + "epoch": 0.44859505942926003, + "grad_norm": 1.1875, + "learning_rate": 1.3465879181694966e-05, + "loss": 1.1463, + "step": 4095 + }, + { + "epoch": 0.44914279454455824, + "grad_norm": 1.390625, + "learning_rate": 1.344793695374018e-05, + "loss": 1.15, + "step": 4100 + }, + { + "epoch": 0.4496905296598565, + "grad_norm": 1.1953125, + "learning_rate": 1.3429982119596737e-05, + "loss": 1.2038, + "step": 4105 + }, + { + "epoch": 0.4502382647751547, + "grad_norm": 1.234375, + "learning_rate": 1.341201474491027e-05, + "loss": 1.1767, + "step": 4110 + }, + { + "epoch": 0.450785999890453, + "grad_norm": 1.2421875, + "learning_rate": 1.3394034895372281e-05, + "loss": 1.173, + "step": 4115 + }, + { + "epoch": 0.4513337350057512, + "grad_norm": 1.234375, + "learning_rate": 1.337604263671986e-05, + "loss": 1.183, + "step": 4120 + }, + { + "epoch": 0.45188147012104946, + "grad_norm": 1.21875, + "learning_rate": 1.3358038034735485e-05, + "loss": 1.1281, + "step": 4125 + }, + { + "epoch": 0.4524292052363477, + "grad_norm": 1.1640625, + "learning_rate": 1.3340021155246745e-05, + "loss": 1.187, + "step": 4130 + }, + { + "epoch": 0.45297694035164593, + "grad_norm": 1.25, + "learning_rate": 1.3321992064126132e-05, + "loss": 1.1543, + "step": 4135 + }, + { + "epoch": 0.4535246754669442, + "grad_norm": 1.2421875, + "learning_rate": 1.3303950827290781e-05, + "loss": 1.191, + "step": 4140 + }, + { + "epoch": 0.4540724105822424, + "grad_norm": 1.2109375, + "learning_rate": 1.328589751070223e-05, + "loss": 1.2142, + "step": 4145 + }, + { + "epoch": 0.45462014569754067, + "grad_norm": 1.171875, + "learning_rate": 1.3267832180366189e-05, + "loss": 1.1444, + "step": 4150 + }, + { + "epoch": 0.45516788081283893, + "grad_norm": 1.1953125, + "learning_rate": 1.3249754902332285e-05, + "loss": 1.1635, + "step": 4155 + }, + { + "epoch": 0.45571561592813714, + "grad_norm": 1.2421875, + "learning_rate": 1.3231665742693838e-05, + "loss": 1.156, + "step": 4160 + }, + { + "epoch": 0.4562633510434354, + "grad_norm": 1.2890625, + "learning_rate": 1.3213564767587594e-05, + "loss": 1.2229, + "step": 4165 + }, + { + "epoch": 0.4568110861587336, + "grad_norm": 1.2265625, + "learning_rate": 1.3195452043193511e-05, + "loss": 1.1456, + "step": 4170 + }, + { + "epoch": 0.4573588212740319, + "grad_norm": 1.25, + "learning_rate": 1.3177327635734497e-05, + "loss": 1.1082, + "step": 4175 + }, + { + "epoch": 0.45790655638933014, + "grad_norm": 1.2421875, + "learning_rate": 1.3159191611476183e-05, + "loss": 1.1428, + "step": 4180 + }, + { + "epoch": 0.45845429150462835, + "grad_norm": 1.234375, + "learning_rate": 1.3141044036726663e-05, + "loss": 1.1755, + "step": 4185 + }, + { + "epoch": 0.4590020266199266, + "grad_norm": 1.2421875, + "learning_rate": 1.3122884977836265e-05, + "loss": 1.1714, + "step": 4190 + }, + { + "epoch": 0.4595497617352248, + "grad_norm": 1.3671875, + "learning_rate": 1.3104714501197308e-05, + "loss": 1.1976, + "step": 4195 + }, + { + "epoch": 0.4600974968505231, + "grad_norm": 1.1796875, + "learning_rate": 1.3086532673243855e-05, + "loss": 1.1892, + "step": 4200 + }, + { + "epoch": 0.46064523196582136, + "grad_norm": 1.1875, + "learning_rate": 1.306833956045147e-05, + "loss": 1.1116, + "step": 4205 + }, + { + "epoch": 0.46119296708111956, + "grad_norm": 1.296875, + "learning_rate": 1.3050135229336974e-05, + "loss": 1.1555, + "step": 4210 + }, + { + "epoch": 0.46174070219641783, + "grad_norm": 1.3046875, + "learning_rate": 1.3031919746458202e-05, + "loss": 1.1401, + "step": 4215 + }, + { + "epoch": 0.46228843731171604, + "grad_norm": 1.21875, + "learning_rate": 1.3013693178413773e-05, + "loss": 1.144, + "step": 4220 + }, + { + "epoch": 0.4628361724270143, + "grad_norm": 1.2109375, + "learning_rate": 1.2995455591842825e-05, + "loss": 1.1727, + "step": 4225 + }, + { + "epoch": 0.4633839075423125, + "grad_norm": 1.28125, + "learning_rate": 1.2977207053424781e-05, + "loss": 1.2165, + "step": 4230 + }, + { + "epoch": 0.4639316426576108, + "grad_norm": 1.2109375, + "learning_rate": 1.2958947629879113e-05, + "loss": 1.1501, + "step": 4235 + }, + { + "epoch": 0.46447937777290904, + "grad_norm": 1.21875, + "learning_rate": 1.2940677387965083e-05, + "loss": 1.244, + "step": 4240 + }, + { + "epoch": 0.46502711288820725, + "grad_norm": 1.1796875, + "learning_rate": 1.2922396394481513e-05, + "loss": 1.1096, + "step": 4245 + }, + { + "epoch": 0.4655748480035055, + "grad_norm": 1.265625, + "learning_rate": 1.2904104716266529e-05, + "loss": 1.145, + "step": 4250 + }, + { + "epoch": 0.4661225831188037, + "grad_norm": 1.1796875, + "learning_rate": 1.2885802420197324e-05, + "loss": 1.1838, + "step": 4255 + }, + { + "epoch": 0.466670318234102, + "grad_norm": 1.296875, + "learning_rate": 1.2867489573189916e-05, + "loss": 1.1849, + "step": 4260 + }, + { + "epoch": 0.46721805334940025, + "grad_norm": 1.2265625, + "learning_rate": 1.2849166242198887e-05, + "loss": 1.1366, + "step": 4265 + }, + { + "epoch": 0.46776578846469846, + "grad_norm": 1.2578125, + "learning_rate": 1.2830832494217167e-05, + "loss": 1.1374, + "step": 4270 + }, + { + "epoch": 0.4683135235799967, + "grad_norm": 1.2578125, + "learning_rate": 1.2812488396275757e-05, + "loss": 1.1513, + "step": 4275 + }, + { + "epoch": 0.46886125869529494, + "grad_norm": 1.2421875, + "learning_rate": 1.2794134015443508e-05, + "loss": 1.1682, + "step": 4280 + }, + { + "epoch": 0.4694089938105932, + "grad_norm": 1.3125, + "learning_rate": 1.2775769418826858e-05, + "loss": 1.1919, + "step": 4285 + }, + { + "epoch": 0.46995672892589146, + "grad_norm": 1.1328125, + "learning_rate": 1.2757394673569608e-05, + "loss": 1.1142, + "step": 4290 + }, + { + "epoch": 0.4705044640411897, + "grad_norm": 1.203125, + "learning_rate": 1.2739009846852655e-05, + "loss": 1.1921, + "step": 4295 + }, + { + "epoch": 0.47105219915648794, + "grad_norm": 1.1953125, + "learning_rate": 1.2720615005893758e-05, + "loss": 1.154, + "step": 4300 + }, + { + "epoch": 0.47159993427178615, + "grad_norm": 1.2421875, + "learning_rate": 1.2702210217947289e-05, + "loss": 1.1875, + "step": 4305 + }, + { + "epoch": 0.4721476693870844, + "grad_norm": 1.203125, + "learning_rate": 1.2683795550303985e-05, + "loss": 1.1543, + "step": 4310 + }, + { + "epoch": 0.4726954045023826, + "grad_norm": 1.1875, + "learning_rate": 1.266537107029071e-05, + "loss": 1.1906, + "step": 4315 + }, + { + "epoch": 0.4732431396176809, + "grad_norm": 1.203125, + "learning_rate": 1.2646936845270208e-05, + "loss": 1.0545, + "step": 4320 + }, + { + "epoch": 0.47379087473297915, + "grad_norm": 1.21875, + "learning_rate": 1.2628492942640835e-05, + "loss": 1.1769, + "step": 4325 + }, + { + "epoch": 0.47433860984827736, + "grad_norm": 1.2265625, + "learning_rate": 1.2610039429836345e-05, + "loss": 1.1138, + "step": 4330 + }, + { + "epoch": 0.4748863449635756, + "grad_norm": 1.203125, + "learning_rate": 1.2591576374325622e-05, + "loss": 1.1499, + "step": 4335 + }, + { + "epoch": 0.47543408007887383, + "grad_norm": 1.2421875, + "learning_rate": 1.2573103843612447e-05, + "loss": 1.1579, + "step": 4340 + }, + { + "epoch": 0.4759818151941721, + "grad_norm": 1.203125, + "learning_rate": 1.2554621905235226e-05, + "loss": 1.1361, + "step": 4345 + }, + { + "epoch": 0.47652955030947036, + "grad_norm": 1.2265625, + "learning_rate": 1.2536130626766783e-05, + "loss": 1.2137, + "step": 4350 + }, + { + "epoch": 0.47707728542476857, + "grad_norm": 1.234375, + "learning_rate": 1.2517630075814078e-05, + "loss": 1.1736, + "step": 4355 + }, + { + "epoch": 0.47762502054006684, + "grad_norm": 1.234375, + "learning_rate": 1.2499120320017977e-05, + "loss": 1.1649, + "step": 4360 + }, + { + "epoch": 0.47817275565536504, + "grad_norm": 1.296875, + "learning_rate": 1.2480601427052994e-05, + "loss": 1.1655, + "step": 4365 + }, + { + "epoch": 0.4787204907706633, + "grad_norm": 1.265625, + "learning_rate": 1.2462073464627058e-05, + "loss": 1.1312, + "step": 4370 + }, + { + "epoch": 0.4792682258859616, + "grad_norm": 1.1953125, + "learning_rate": 1.2443536500481252e-05, + "loss": 1.1196, + "step": 4375 + }, + { + "epoch": 0.4798159610012598, + "grad_norm": 1.21875, + "learning_rate": 1.2424990602389578e-05, + "loss": 1.1134, + "step": 4380 + }, + { + "epoch": 0.48036369611655805, + "grad_norm": 1.25, + "learning_rate": 1.2406435838158686e-05, + "loss": 1.2028, + "step": 4385 + }, + { + "epoch": 0.48091143123185626, + "grad_norm": 1.171875, + "learning_rate": 1.2387872275627659e-05, + "loss": 1.0878, + "step": 4390 + }, + { + "epoch": 0.4814591663471545, + "grad_norm": 1.2578125, + "learning_rate": 1.2369299982667744e-05, + "loss": 1.1652, + "step": 4395 + }, + { + "epoch": 0.48200690146245273, + "grad_norm": 1.21875, + "learning_rate": 1.23507190271821e-05, + "loss": 1.1133, + "step": 4400 + }, + { + "epoch": 0.482554636577751, + "grad_norm": 1.375, + "learning_rate": 1.2332129477105562e-05, + "loss": 1.1953, + "step": 4405 + }, + { + "epoch": 0.48310237169304926, + "grad_norm": 1.21875, + "learning_rate": 1.2313531400404397e-05, + "loss": 1.1242, + "step": 4410 + }, + { + "epoch": 0.48365010680834747, + "grad_norm": 1.21875, + "learning_rate": 1.2294924865076029e-05, + "loss": 1.1628, + "step": 4415 + }, + { + "epoch": 0.48419784192364573, + "grad_norm": 1.1796875, + "learning_rate": 1.227630993914882e-05, + "loss": 1.1063, + "step": 4420 + }, + { + "epoch": 0.48474557703894394, + "grad_norm": 1.2265625, + "learning_rate": 1.2257686690681812e-05, + "loss": 1.1294, + "step": 4425 + }, + { + "epoch": 0.4852933121542422, + "grad_norm": 1.25, + "learning_rate": 1.2239055187764463e-05, + "loss": 1.1627, + "step": 4430 + }, + { + "epoch": 0.48584104726954047, + "grad_norm": 1.2578125, + "learning_rate": 1.222041549851642e-05, + "loss": 1.1864, + "step": 4435 + }, + { + "epoch": 0.4863887823848387, + "grad_norm": 1.2578125, + "learning_rate": 1.2201767691087265e-05, + "loss": 1.113, + "step": 4440 + }, + { + "epoch": 0.48693651750013695, + "grad_norm": 1.2265625, + "learning_rate": 1.2183111833656244e-05, + "loss": 1.1931, + "step": 4445 + }, + { + "epoch": 0.48748425261543515, + "grad_norm": 1.21875, + "learning_rate": 1.2164447994432054e-05, + "loss": 1.1258, + "step": 4450 + }, + { + "epoch": 0.4880319877307334, + "grad_norm": 1.2265625, + "learning_rate": 1.214577624165256e-05, + "loss": 1.1602, + "step": 4455 + }, + { + "epoch": 0.4885797228460317, + "grad_norm": 1.21875, + "learning_rate": 1.2127096643584573e-05, + "loss": 1.1441, + "step": 4460 + }, + { + "epoch": 0.4891274579613299, + "grad_norm": 1.2421875, + "learning_rate": 1.2108409268523578e-05, + "loss": 1.1582, + "step": 4465 + }, + { + "epoch": 0.48967519307662816, + "grad_norm": 1.25, + "learning_rate": 1.2089714184793492e-05, + "loss": 1.1229, + "step": 4470 + }, + { + "epoch": 0.49022292819192637, + "grad_norm": 1.2109375, + "learning_rate": 1.207101146074643e-05, + "loss": 1.1002, + "step": 4475 + }, + { + "epoch": 0.49077066330722463, + "grad_norm": 1.3203125, + "learning_rate": 1.205230116476243e-05, + "loss": 1.1865, + "step": 4480 + }, + { + "epoch": 0.4913183984225229, + "grad_norm": 1.3046875, + "learning_rate": 1.203358336524921e-05, + "loss": 1.2106, + "step": 4485 + }, + { + "epoch": 0.4918661335378211, + "grad_norm": 1.2421875, + "learning_rate": 1.2014858130641936e-05, + "loss": 1.1417, + "step": 4490 + }, + { + "epoch": 0.49241386865311937, + "grad_norm": 1.1953125, + "learning_rate": 1.1996125529402946e-05, + "loss": 1.1356, + "step": 4495 + }, + { + "epoch": 0.4929616037684176, + "grad_norm": 1.25, + "learning_rate": 1.1977385630021523e-05, + "loss": 1.1409, + "step": 4500 + }, + { + "epoch": 0.49350933888371584, + "grad_norm": 1.2421875, + "learning_rate": 1.1958638501013613e-05, + "loss": 1.129, + "step": 4505 + }, + { + "epoch": 0.49405707399901405, + "grad_norm": 1.265625, + "learning_rate": 1.1939884210921618e-05, + "loss": 1.1589, + "step": 4510 + }, + { + "epoch": 0.4946048091143123, + "grad_norm": 1.2578125, + "learning_rate": 1.1921122828314109e-05, + "loss": 1.0934, + "step": 4515 + }, + { + "epoch": 0.4951525442296106, + "grad_norm": 1.1875, + "learning_rate": 1.1902354421785591e-05, + "loss": 1.1568, + "step": 4520 + }, + { + "epoch": 0.4957002793449088, + "grad_norm": 1.328125, + "learning_rate": 1.188357905995625e-05, + "loss": 1.151, + "step": 4525 + }, + { + "epoch": 0.49624801446020705, + "grad_norm": 1.265625, + "learning_rate": 1.18647968114717e-05, + "loss": 1.1709, + "step": 4530 + }, + { + "epoch": 0.49679574957550526, + "grad_norm": 1.296875, + "learning_rate": 1.1846007745002734e-05, + "loss": 1.1514, + "step": 4535 + }, + { + "epoch": 0.49734348469080353, + "grad_norm": 1.2265625, + "learning_rate": 1.1827211929245075e-05, + "loss": 1.1487, + "step": 4540 + }, + { + "epoch": 0.4978912198061018, + "grad_norm": 1.1953125, + "learning_rate": 1.1808409432919124e-05, + "loss": 1.1874, + "step": 4545 + }, + { + "epoch": 0.4984389549214, + "grad_norm": 1.234375, + "learning_rate": 1.1789600324769696e-05, + "loss": 1.1591, + "step": 4550 + }, + { + "epoch": 0.49898669003669827, + "grad_norm": 1.1875, + "learning_rate": 1.1770784673565796e-05, + "loss": 1.1598, + "step": 4555 + }, + { + "epoch": 0.4995344251519965, + "grad_norm": 1.1953125, + "learning_rate": 1.1751962548100339e-05, + "loss": 1.1619, + "step": 4560 + }, + { + "epoch": 0.5000821602672947, + "grad_norm": 1.21875, + "learning_rate": 1.1733134017189918e-05, + "loss": 1.1255, + "step": 4565 + }, + { + "epoch": 0.500629895382593, + "grad_norm": 1.2421875, + "learning_rate": 1.1714299149674538e-05, + "loss": 1.136, + "step": 4570 + }, + { + "epoch": 0.5011776304978912, + "grad_norm": 1.2265625, + "learning_rate": 1.1695458014417382e-05, + "loss": 1.1483, + "step": 4575 + }, + { + "epoch": 0.5017253656131895, + "grad_norm": 1.28125, + "learning_rate": 1.1676610680304539e-05, + "loss": 1.1182, + "step": 4580 + }, + { + "epoch": 0.5022731007284877, + "grad_norm": 1.25, + "learning_rate": 1.1657757216244767e-05, + "loss": 1.118, + "step": 4585 + }, + { + "epoch": 0.5028208358437859, + "grad_norm": 1.203125, + "learning_rate": 1.163889769116923e-05, + "loss": 1.1345, + "step": 4590 + }, + { + "epoch": 0.5033685709590842, + "grad_norm": 1.1640625, + "learning_rate": 1.162003217403127e-05, + "loss": 1.1254, + "step": 4595 + }, + { + "epoch": 0.5039163060743824, + "grad_norm": 1.234375, + "learning_rate": 1.1601160733806113e-05, + "loss": 1.125, + "step": 4600 + }, + { + "epoch": 0.5044640411896807, + "grad_norm": 1.3359375, + "learning_rate": 1.1582283439490652e-05, + "loss": 1.2014, + "step": 4605 + }, + { + "epoch": 0.505011776304979, + "grad_norm": 1.1953125, + "learning_rate": 1.1563400360103188e-05, + "loss": 1.1424, + "step": 4610 + }, + { + "epoch": 0.5055595114202771, + "grad_norm": 1.2109375, + "learning_rate": 1.1544511564683165e-05, + "loss": 1.1401, + "step": 4615 + }, + { + "epoch": 0.5061072465355754, + "grad_norm": 1.28125, + "learning_rate": 1.152561712229093e-05, + "loss": 1.1839, + "step": 4620 + }, + { + "epoch": 0.5066549816508736, + "grad_norm": 1.2578125, + "learning_rate": 1.1506717102007474e-05, + "loss": 1.1592, + "step": 4625 + }, + { + "epoch": 0.5072027167661719, + "grad_norm": 1.203125, + "learning_rate": 1.1487811572934184e-05, + "loss": 1.1217, + "step": 4630 + }, + { + "epoch": 0.5077504518814702, + "grad_norm": 1.234375, + "learning_rate": 1.1468900604192585e-05, + "loss": 1.1532, + "step": 4635 + }, + { + "epoch": 0.5082981869967683, + "grad_norm": 1.234375, + "learning_rate": 1.1449984264924094e-05, + "loss": 1.1942, + "step": 4640 + }, + { + "epoch": 0.5088459221120666, + "grad_norm": 1.25, + "learning_rate": 1.143106262428976e-05, + "loss": 1.0977, + "step": 4645 + }, + { + "epoch": 0.5093936572273648, + "grad_norm": 1.21875, + "learning_rate": 1.1412135751470017e-05, + "loss": 1.1754, + "step": 4650 + }, + { + "epoch": 0.5099413923426631, + "grad_norm": 1.1796875, + "learning_rate": 1.1393203715664427e-05, + "loss": 1.131, + "step": 4655 + }, + { + "epoch": 0.5104891274579614, + "grad_norm": 1.1953125, + "learning_rate": 1.137426658609143e-05, + "loss": 1.1093, + "step": 4660 + }, + { + "epoch": 0.5110368625732595, + "grad_norm": 1.2109375, + "learning_rate": 1.1355324431988086e-05, + "loss": 1.1413, + "step": 4665 + }, + { + "epoch": 0.5115845976885578, + "grad_norm": 1.1875, + "learning_rate": 1.1336377322609832e-05, + "loss": 1.1897, + "step": 4670 + }, + { + "epoch": 0.5121323328038561, + "grad_norm": 1.2890625, + "learning_rate": 1.131742532723022e-05, + "loss": 1.1527, + "step": 4675 + }, + { + "epoch": 0.5126800679191543, + "grad_norm": 1.25, + "learning_rate": 1.1298468515140662e-05, + "loss": 1.1567, + "step": 4680 + }, + { + "epoch": 0.5132278030344526, + "grad_norm": 1.234375, + "learning_rate": 1.1279506955650182e-05, + "loss": 1.1145, + "step": 4685 + }, + { + "epoch": 0.5137755381497507, + "grad_norm": 1.2109375, + "learning_rate": 1.1260540718085162e-05, + "loss": 1.1819, + "step": 4690 + }, + { + "epoch": 0.514323273265049, + "grad_norm": 1.25, + "learning_rate": 1.1241569871789096e-05, + "loss": 1.1306, + "step": 4695 + }, + { + "epoch": 0.5148710083803473, + "grad_norm": 1.1953125, + "learning_rate": 1.1222594486122312e-05, + "loss": 1.1663, + "step": 4700 + }, + { + "epoch": 0.5154187434956455, + "grad_norm": 1.203125, + "learning_rate": 1.1203614630461746e-05, + "loss": 1.1199, + "step": 4705 + }, + { + "epoch": 0.5159664786109438, + "grad_norm": 1.2109375, + "learning_rate": 1.118463037420067e-05, + "loss": 1.1542, + "step": 4710 + }, + { + "epoch": 0.516514213726242, + "grad_norm": 1.28125, + "learning_rate": 1.116564178674846e-05, + "loss": 1.1904, + "step": 4715 + }, + { + "epoch": 0.5170619488415402, + "grad_norm": 1.265625, + "learning_rate": 1.1146648937530309e-05, + "loss": 1.1787, + "step": 4720 + }, + { + "epoch": 0.5176096839568385, + "grad_norm": 1.265625, + "learning_rate": 1.1127651895986999e-05, + "loss": 1.163, + "step": 4725 + }, + { + "epoch": 0.5181574190721367, + "grad_norm": 1.171875, + "learning_rate": 1.1108650731574644e-05, + "loss": 1.0855, + "step": 4730 + }, + { + "epoch": 0.5187051541874349, + "grad_norm": 1.171875, + "learning_rate": 1.1089645513764429e-05, + "loss": 1.1962, + "step": 4735 + }, + { + "epoch": 0.5192528893027332, + "grad_norm": 1.2578125, + "learning_rate": 1.1070636312042352e-05, + "loss": 1.2023, + "step": 4740 + }, + { + "epoch": 0.5198006244180314, + "grad_norm": 1.265625, + "learning_rate": 1.1051623195908987e-05, + "loss": 1.1834, + "step": 4745 + }, + { + "epoch": 0.5203483595333297, + "grad_norm": 1.21875, + "learning_rate": 1.1032606234879217e-05, + "loss": 1.172, + "step": 4750 + }, + { + "epoch": 0.520896094648628, + "grad_norm": 1.2109375, + "learning_rate": 1.1013585498481983e-05, + "loss": 1.1487, + "step": 4755 + }, + { + "epoch": 0.5214438297639261, + "grad_norm": 1.2578125, + "learning_rate": 1.0994561056260016e-05, + "loss": 1.2174, + "step": 4760 + }, + { + "epoch": 0.5219915648792244, + "grad_norm": 1.28125, + "learning_rate": 1.0975532977769619e-05, + "loss": 1.1955, + "step": 4765 + }, + { + "epoch": 0.5225392999945226, + "grad_norm": 1.21875, + "learning_rate": 1.0956501332580375e-05, + "loss": 1.0925, + "step": 4770 + }, + { + "epoch": 0.5230870351098209, + "grad_norm": 1.1875, + "learning_rate": 1.093746619027491e-05, + "loss": 1.181, + "step": 4775 + }, + { + "epoch": 0.5236347702251192, + "grad_norm": 1.2265625, + "learning_rate": 1.0918427620448635e-05, + "loss": 1.1976, + "step": 4780 + }, + { + "epoch": 0.5241825053404173, + "grad_norm": 1.2109375, + "learning_rate": 1.0899385692709499e-05, + "loss": 1.1783, + "step": 4785 + }, + { + "epoch": 0.5247302404557156, + "grad_norm": 1.2109375, + "learning_rate": 1.0880340476677718e-05, + "loss": 1.0751, + "step": 4790 + }, + { + "epoch": 0.5252779755710139, + "grad_norm": 1.21875, + "learning_rate": 1.0861292041985538e-05, + "loss": 1.1422, + "step": 4795 + }, + { + "epoch": 0.5258257106863121, + "grad_norm": 1.3203125, + "learning_rate": 1.084224045827697e-05, + "loss": 1.1123, + "step": 4800 + }, + { + "epoch": 0.5263734458016104, + "grad_norm": 1.234375, + "learning_rate": 1.082318579520754e-05, + "loss": 1.1201, + "step": 4805 + }, + { + "epoch": 0.5269211809169085, + "grad_norm": 1.2265625, + "learning_rate": 1.080412812244403e-05, + "loss": 1.1319, + "step": 4810 + }, + { + "epoch": 0.5274689160322068, + "grad_norm": 1.2734375, + "learning_rate": 1.0785067509664231e-05, + "loss": 1.1475, + "step": 4815 + }, + { + "epoch": 0.5280166511475051, + "grad_norm": 1.2578125, + "learning_rate": 1.0766004026556676e-05, + "loss": 1.1697, + "step": 4820 + }, + { + "epoch": 0.5285643862628033, + "grad_norm": 1.21875, + "learning_rate": 1.0746937742820397e-05, + "loss": 1.1328, + "step": 4825 + }, + { + "epoch": 0.5291121213781016, + "grad_norm": 1.2109375, + "learning_rate": 1.072786872816466e-05, + "loss": 1.1438, + "step": 4830 + }, + { + "epoch": 0.5296598564933997, + "grad_norm": 1.2265625, + "learning_rate": 1.070879705230873e-05, + "loss": 1.1648, + "step": 4835 + }, + { + "epoch": 0.530207591608698, + "grad_norm": 1.1640625, + "learning_rate": 1.0689722784981586e-05, + "loss": 1.113, + "step": 4840 + }, + { + "epoch": 0.5307553267239963, + "grad_norm": 1.2265625, + "learning_rate": 1.0670645995921687e-05, + "loss": 1.1427, + "step": 4845 + }, + { + "epoch": 0.5313030618392945, + "grad_norm": 1.234375, + "learning_rate": 1.0651566754876715e-05, + "loss": 1.1771, + "step": 4850 + }, + { + "epoch": 0.5318507969545928, + "grad_norm": 1.2109375, + "learning_rate": 1.0632485131603313e-05, + "loss": 1.1629, + "step": 4855 + }, + { + "epoch": 0.532398532069891, + "grad_norm": 1.203125, + "learning_rate": 1.0613401195866835e-05, + "loss": 1.2022, + "step": 4860 + }, + { + "epoch": 0.5329462671851892, + "grad_norm": 1.28125, + "learning_rate": 1.059431501744109e-05, + "loss": 1.1548, + "step": 4865 + }, + { + "epoch": 0.5334940023004875, + "grad_norm": 1.25, + "learning_rate": 1.0575226666108086e-05, + "loss": 1.1427, + "step": 4870 + }, + { + "epoch": 0.5340417374157858, + "grad_norm": 1.1875, + "learning_rate": 1.0556136211657784e-05, + "loss": 1.1265, + "step": 4875 + }, + { + "epoch": 0.534589472531084, + "grad_norm": 1.203125, + "learning_rate": 1.0537043723887811e-05, + "loss": 1.1737, + "step": 4880 + }, + { + "epoch": 0.5351372076463822, + "grad_norm": 1.1875, + "learning_rate": 1.0517949272603257e-05, + "loss": 1.081, + "step": 4885 + }, + { + "epoch": 0.5356849427616804, + "grad_norm": 1.2734375, + "learning_rate": 1.049885292761637e-05, + "loss": 1.1054, + "step": 4890 + }, + { + "epoch": 0.5362326778769787, + "grad_norm": 1.28125, + "learning_rate": 1.0479754758746332e-05, + "loss": 1.1456, + "step": 4895 + }, + { + "epoch": 0.536780412992277, + "grad_norm": 1.2265625, + "learning_rate": 1.0460654835818989e-05, + "loss": 1.1402, + "step": 4900 + }, + { + "epoch": 0.5373281481075752, + "grad_norm": 1.1953125, + "learning_rate": 1.0441553228666603e-05, + "loss": 1.1627, + "step": 4905 + }, + { + "epoch": 0.5378758832228734, + "grad_norm": 1.2734375, + "learning_rate": 1.0422450007127591e-05, + "loss": 1.1311, + "step": 4910 + }, + { + "epoch": 0.5384236183381717, + "grad_norm": 1.234375, + "learning_rate": 1.0403345241046277e-05, + "loss": 1.215, + "step": 4915 + }, + { + "epoch": 0.5389713534534699, + "grad_norm": 1.1796875, + "learning_rate": 1.0384239000272624e-05, + "loss": 1.1007, + "step": 4920 + }, + { + "epoch": 0.5395190885687682, + "grad_norm": 1.1953125, + "learning_rate": 1.0365131354661995e-05, + "loss": 1.0988, + "step": 4925 + }, + { + "epoch": 0.5400668236840663, + "grad_norm": 1.203125, + "learning_rate": 1.0346022374074885e-05, + "loss": 1.1654, + "step": 4930 + }, + { + "epoch": 0.5406145587993646, + "grad_norm": 1.1953125, + "learning_rate": 1.032691212837667e-05, + "loss": 1.1612, + "step": 4935 + }, + { + "epoch": 0.5411622939146629, + "grad_norm": 1.3046875, + "learning_rate": 1.0307800687437352e-05, + "loss": 1.1963, + "step": 4940 + }, + { + "epoch": 0.5417100290299611, + "grad_norm": 1.234375, + "learning_rate": 1.0288688121131308e-05, + "loss": 1.1222, + "step": 4945 + }, + { + "epoch": 0.5422577641452594, + "grad_norm": 1.2734375, + "learning_rate": 1.0269574499337016e-05, + "loss": 1.1935, + "step": 4950 + }, + { + "epoch": 0.5428054992605575, + "grad_norm": 1.234375, + "learning_rate": 1.0250459891936831e-05, + "loss": 1.1486, + "step": 4955 + }, + { + "epoch": 0.5433532343758558, + "grad_norm": 1.1953125, + "learning_rate": 1.0231344368816694e-05, + "loss": 1.0891, + "step": 4960 + }, + { + "epoch": 0.5439009694911541, + "grad_norm": 1.2421875, + "learning_rate": 1.0212227999865905e-05, + "loss": 1.1315, + "step": 4965 + }, + { + "epoch": 0.5444487046064523, + "grad_norm": 1.1875, + "learning_rate": 1.0193110854976859e-05, + "loss": 1.1676, + "step": 4970 + }, + { + "epoch": 0.5449964397217506, + "grad_norm": 1.25, + "learning_rate": 1.0173993004044777e-05, + "loss": 1.1696, + "step": 4975 + }, + { + "epoch": 0.5455441748370488, + "grad_norm": 1.1953125, + "learning_rate": 1.0154874516967466e-05, + "loss": 1.1742, + "step": 4980 + }, + { + "epoch": 0.546091909952347, + "grad_norm": 1.328125, + "learning_rate": 1.0135755463645065e-05, + "loss": 1.1852, + "step": 4985 + }, + { + "epoch": 0.5466396450676453, + "grad_norm": 1.4296875, + "learning_rate": 1.0116635913979778e-05, + "loss": 1.164, + "step": 4990 + }, + { + "epoch": 0.5471873801829436, + "grad_norm": 1.2734375, + "learning_rate": 1.0097515937875619e-05, + "loss": 1.1373, + "step": 4995 + }, + { + "epoch": 0.5477351152982418, + "grad_norm": 1.203125, + "learning_rate": 1.0078395605238168e-05, + "loss": 1.1487, + "step": 5000 + }, + { + "epoch": 0.54828285041354, + "grad_norm": 1.21875, + "learning_rate": 1.0059274985974305e-05, + "loss": 1.1163, + "step": 5005 + }, + { + "epoch": 0.5488305855288382, + "grad_norm": 1.2265625, + "learning_rate": 1.004015414999197e-05, + "loss": 1.1109, + "step": 5010 + }, + { + "epoch": 0.5493783206441365, + "grad_norm": 1.171875, + "learning_rate": 1.002103316719987e-05, + "loss": 1.1546, + "step": 5015 + }, + { + "epoch": 0.5499260557594348, + "grad_norm": 1.3125, + "learning_rate": 1.0001912107507273e-05, + "loss": 1.1904, + "step": 5020 + }, + { + "epoch": 0.550473790874733, + "grad_norm": 1.2421875, + "learning_rate": 9.982791040823714e-06, + "loss": 1.1877, + "step": 5025 + }, + { + "epoch": 0.5510215259900312, + "grad_norm": 1.1875, + "learning_rate": 9.963670037058764e-06, + "loss": 1.1568, + "step": 5030 + }, + { + "epoch": 0.5515692611053294, + "grad_norm": 1.234375, + "learning_rate": 9.944549166121753e-06, + "loss": 1.146, + "step": 5035 + }, + { + "epoch": 0.5521169962206277, + "grad_norm": 1.1875, + "learning_rate": 9.925428497921533e-06, + "loss": 1.1435, + "step": 5040 + }, + { + "epoch": 0.552664731335926, + "grad_norm": 1.1953125, + "learning_rate": 9.906308102366216e-06, + "loss": 1.1093, + "step": 5045 + }, + { + "epoch": 0.5532124664512242, + "grad_norm": 1.25, + "learning_rate": 9.887188049362906e-06, + "loss": 1.1546, + "step": 5050 + }, + { + "epoch": 0.5537602015665224, + "grad_norm": 1.1796875, + "learning_rate": 9.86806840881747e-06, + "loss": 1.1049, + "step": 5055 + }, + { + "epoch": 0.5543079366818207, + "grad_norm": 1.2734375, + "learning_rate": 9.848949250634253e-06, + "loss": 1.1575, + "step": 5060 + }, + { + "epoch": 0.5548556717971189, + "grad_norm": 1.21875, + "learning_rate": 9.82983064471584e-06, + "loss": 1.0936, + "step": 5065 + }, + { + "epoch": 0.5554034069124172, + "grad_norm": 1.2265625, + "learning_rate": 9.810712660962813e-06, + "loss": 1.1147, + "step": 5070 + }, + { + "epoch": 0.5559511420277155, + "grad_norm": 1.21875, + "learning_rate": 9.791595369273454e-06, + "loss": 1.1929, + "step": 5075 + }, + { + "epoch": 0.5564988771430136, + "grad_norm": 1.25, + "learning_rate": 9.772478839543526e-06, + "loss": 1.1763, + "step": 5080 + }, + { + "epoch": 0.5570466122583119, + "grad_norm": 1.2265625, + "learning_rate": 9.753363141666017e-06, + "loss": 1.1068, + "step": 5085 + }, + { + "epoch": 0.5575943473736101, + "grad_norm": 1.2578125, + "learning_rate": 9.734248345530854e-06, + "loss": 1.1285, + "step": 5090 + }, + { + "epoch": 0.5581420824889084, + "grad_norm": 1.2578125, + "learning_rate": 9.715134521024675e-06, + "loss": 1.1655, + "step": 5095 + }, + { + "epoch": 0.5586898176042066, + "grad_norm": 1.296875, + "learning_rate": 9.696021738030575e-06, + "loss": 1.1562, + "step": 5100 + }, + { + "epoch": 0.5592375527195048, + "grad_norm": 1.2265625, + "learning_rate": 9.676910066427825e-06, + "loss": 1.2125, + "step": 5105 + }, + { + "epoch": 0.5597852878348031, + "grad_norm": 1.2421875, + "learning_rate": 9.657799576091646e-06, + "loss": 1.1052, + "step": 5110 + }, + { + "epoch": 0.5603330229501013, + "grad_norm": 1.2265625, + "learning_rate": 9.638690336892936e-06, + "loss": 1.1403, + "step": 5115 + }, + { + "epoch": 0.5608807580653996, + "grad_norm": 1.2265625, + "learning_rate": 9.619582418698009e-06, + "loss": 1.115, + "step": 5120 + }, + { + "epoch": 0.5614284931806978, + "grad_norm": 1.234375, + "learning_rate": 9.600475891368365e-06, + "loss": 1.1525, + "step": 5125 + }, + { + "epoch": 0.561976228295996, + "grad_norm": 1.25, + "learning_rate": 9.58137082476041e-06, + "loss": 1.1344, + "step": 5130 + }, + { + "epoch": 0.5625239634112943, + "grad_norm": 1.28125, + "learning_rate": 9.562267288725205e-06, + "loss": 1.1938, + "step": 5135 + }, + { + "epoch": 0.5630716985265926, + "grad_norm": 1.2421875, + "learning_rate": 9.543165353108232e-06, + "loss": 1.1521, + "step": 5140 + }, + { + "epoch": 0.5636194336418908, + "grad_norm": 1.2265625, + "learning_rate": 9.524065087749097e-06, + "loss": 1.1257, + "step": 5145 + }, + { + "epoch": 0.564167168757189, + "grad_norm": 1.265625, + "learning_rate": 9.50496656248132e-06, + "loss": 1.1702, + "step": 5150 + }, + { + "epoch": 0.5647149038724872, + "grad_norm": 1.2578125, + "learning_rate": 9.485869847132055e-06, + "loss": 1.141, + "step": 5155 + }, + { + "epoch": 0.5652626389877855, + "grad_norm": 1.2421875, + "learning_rate": 9.466775011521825e-06, + "loss": 1.1146, + "step": 5160 + }, + { + "epoch": 0.5658103741030838, + "grad_norm": 1.1953125, + "learning_rate": 9.447682125464299e-06, + "loss": 1.1636, + "step": 5165 + }, + { + "epoch": 0.566358109218382, + "grad_norm": 1.203125, + "learning_rate": 9.42859125876601e-06, + "loss": 1.1135, + "step": 5170 + }, + { + "epoch": 0.5669058443336802, + "grad_norm": 1.2890625, + "learning_rate": 9.409502481226098e-06, + "loss": 1.1315, + "step": 5175 + }, + { + "epoch": 0.5674535794489785, + "grad_norm": 1.203125, + "learning_rate": 9.39041586263608e-06, + "loss": 1.1671, + "step": 5180 + }, + { + "epoch": 0.5680013145642767, + "grad_norm": 1.15625, + "learning_rate": 9.37133147277958e-06, + "loss": 1.1187, + "step": 5185 + }, + { + "epoch": 0.568549049679575, + "grad_norm": 1.21875, + "learning_rate": 9.35224938143206e-06, + "loss": 1.1596, + "step": 5190 + }, + { + "epoch": 0.5690967847948732, + "grad_norm": 1.21875, + "learning_rate": 9.333169658360588e-06, + "loss": 1.1323, + "step": 5195 + }, + { + "epoch": 0.5696445199101714, + "grad_norm": 1.2109375, + "learning_rate": 9.31409237332357e-06, + "loss": 1.1726, + "step": 5200 + }, + { + "epoch": 0.5701922550254697, + "grad_norm": 1.25, + "learning_rate": 9.2950175960705e-06, + "loss": 1.1918, + "step": 5205 + }, + { + "epoch": 0.5707399901407679, + "grad_norm": 1.296875, + "learning_rate": 9.275945396341704e-06, + "loss": 1.1976, + "step": 5210 + }, + { + "epoch": 0.5712877252560662, + "grad_norm": 1.265625, + "learning_rate": 9.256875843868072e-06, + "loss": 1.1623, + "step": 5215 + }, + { + "epoch": 0.5718354603713645, + "grad_norm": 1.21875, + "learning_rate": 9.237809008370836e-06, + "loss": 1.1411, + "step": 5220 + }, + { + "epoch": 0.5723831954866626, + "grad_norm": 1.203125, + "learning_rate": 9.218744959561285e-06, + "loss": 1.1414, + "step": 5225 + }, + { + "epoch": 0.5729309306019609, + "grad_norm": 1.2265625, + "learning_rate": 9.199683767140511e-06, + "loss": 1.124, + "step": 5230 + }, + { + "epoch": 0.5734786657172591, + "grad_norm": 1.25, + "learning_rate": 9.18062550079917e-06, + "loss": 1.1288, + "step": 5235 + }, + { + "epoch": 0.5740264008325574, + "grad_norm": 1.25, + "learning_rate": 9.161570230217228e-06, + "loss": 1.179, + "step": 5240 + }, + { + "epoch": 0.5745741359478557, + "grad_norm": 1.2265625, + "learning_rate": 9.14251802506368e-06, + "loss": 1.1691, + "step": 5245 + }, + { + "epoch": 0.5751218710631538, + "grad_norm": 1.2265625, + "learning_rate": 9.12346895499633e-06, + "loss": 1.1064, + "step": 5250 + }, + { + "epoch": 0.5756696061784521, + "grad_norm": 1.21875, + "learning_rate": 9.104423089661512e-06, + "loss": 1.154, + "step": 5255 + }, + { + "epoch": 0.5762173412937504, + "grad_norm": 1.2734375, + "learning_rate": 9.085380498693837e-06, + "loss": 1.1888, + "step": 5260 + }, + { + "epoch": 0.5767650764090486, + "grad_norm": 1.1953125, + "learning_rate": 9.066341251715958e-06, + "loss": 1.1586, + "step": 5265 + }, + { + "epoch": 0.5773128115243469, + "grad_norm": 1.265625, + "learning_rate": 9.047305418338295e-06, + "loss": 1.1831, + "step": 5270 + }, + { + "epoch": 0.577860546639645, + "grad_norm": 1.203125, + "learning_rate": 9.028273068158782e-06, + "loss": 1.2184, + "step": 5275 + }, + { + "epoch": 0.5784082817549433, + "grad_norm": 1.265625, + "learning_rate": 9.009244270762625e-06, + "loss": 1.1374, + "step": 5280 + }, + { + "epoch": 0.5789560168702416, + "grad_norm": 1.2265625, + "learning_rate": 8.990219095722044e-06, + "loss": 1.1599, + "step": 5285 + }, + { + "epoch": 0.5795037519855398, + "grad_norm": 1.46875, + "learning_rate": 8.971197612596006e-06, + "loss": 1.0858, + "step": 5290 + }, + { + "epoch": 0.580051487100838, + "grad_norm": 1.3125, + "learning_rate": 8.952179890929982e-06, + "loss": 1.1553, + "step": 5295 + }, + { + "epoch": 0.5805992222161362, + "grad_norm": 1.21875, + "learning_rate": 8.933166000255692e-06, + "loss": 1.1676, + "step": 5300 + }, + { + "epoch": 0.5811469573314345, + "grad_norm": 1.4921875, + "learning_rate": 8.914156010090852e-06, + "loss": 1.1262, + "step": 5305 + }, + { + "epoch": 0.5816946924467328, + "grad_norm": 1.2421875, + "learning_rate": 8.895149989938915e-06, + "loss": 1.1667, + "step": 5310 + }, + { + "epoch": 0.582242427562031, + "grad_norm": 1.2421875, + "learning_rate": 8.876148009288813e-06, + "loss": 1.1792, + "step": 5315 + }, + { + "epoch": 0.5827901626773292, + "grad_norm": 1.265625, + "learning_rate": 8.857150137614718e-06, + "loss": 1.1894, + "step": 5320 + }, + { + "epoch": 0.5833378977926275, + "grad_norm": 1.1875, + "learning_rate": 8.83815644437578e-06, + "loss": 1.148, + "step": 5325 + }, + { + "epoch": 0.5838856329079257, + "grad_norm": 1.2265625, + "learning_rate": 8.819166999015863e-06, + "loss": 1.1339, + "step": 5330 + }, + { + "epoch": 0.584433368023224, + "grad_norm": 1.1875, + "learning_rate": 8.8001818709633e-06, + "loss": 1.0615, + "step": 5335 + }, + { + "epoch": 0.5849811031385223, + "grad_norm": 1.1875, + "learning_rate": 8.781201129630655e-06, + "loss": 1.1195, + "step": 5340 + }, + { + "epoch": 0.5855288382538204, + "grad_norm": 1.1484375, + "learning_rate": 8.762224844414433e-06, + "loss": 1.1996, + "step": 5345 + }, + { + "epoch": 0.5860765733691187, + "grad_norm": 1.203125, + "learning_rate": 8.743253084694858e-06, + "loss": 1.1773, + "step": 5350 + }, + { + "epoch": 0.5866243084844169, + "grad_norm": 1.21875, + "learning_rate": 8.724285919835611e-06, + "loss": 1.1409, + "step": 5355 + }, + { + "epoch": 0.5871720435997152, + "grad_norm": 1.1640625, + "learning_rate": 8.70532341918356e-06, + "loss": 1.1911, + "step": 5360 + }, + { + "epoch": 0.5877197787150135, + "grad_norm": 1.2734375, + "learning_rate": 8.686365652068536e-06, + "loss": 1.1553, + "step": 5365 + }, + { + "epoch": 0.5882675138303116, + "grad_norm": 1.1953125, + "learning_rate": 8.667412687803054e-06, + "loss": 1.1326, + "step": 5370 + }, + { + "epoch": 0.5888152489456099, + "grad_norm": 1.21875, + "learning_rate": 8.648464595682068e-06, + "loss": 1.1896, + "step": 5375 + }, + { + "epoch": 0.5893629840609081, + "grad_norm": 1.1953125, + "learning_rate": 8.629521444982726e-06, + "loss": 1.1159, + "step": 5380 + }, + { + "epoch": 0.5899107191762064, + "grad_norm": 1.265625, + "learning_rate": 8.610583304964105e-06, + "loss": 1.1625, + "step": 5385 + }, + { + "epoch": 0.5904584542915047, + "grad_norm": 1.234375, + "learning_rate": 8.591650244866957e-06, + "loss": 1.1655, + "step": 5390 + }, + { + "epoch": 0.5910061894068028, + "grad_norm": 1.1875, + "learning_rate": 8.572722333913473e-06, + "loss": 1.1404, + "step": 5395 + }, + { + "epoch": 0.5915539245221011, + "grad_norm": 1.203125, + "learning_rate": 8.553799641307003e-06, + "loss": 1.1366, + "step": 5400 + }, + { + "epoch": 0.5921016596373994, + "grad_norm": 1.25, + "learning_rate": 8.534882236231837e-06, + "loss": 1.1413, + "step": 5405 + }, + { + "epoch": 0.5926493947526976, + "grad_norm": 1.2421875, + "learning_rate": 8.515970187852916e-06, + "loss": 1.1671, + "step": 5410 + }, + { + "epoch": 0.5931971298679959, + "grad_norm": 1.2265625, + "learning_rate": 8.4970635653156e-06, + "loss": 1.1465, + "step": 5415 + }, + { + "epoch": 0.593744864983294, + "grad_norm": 1.2109375, + "learning_rate": 8.478162437745418e-06, + "loss": 1.1198, + "step": 5420 + }, + { + "epoch": 0.5942926000985923, + "grad_norm": 1.25, + "learning_rate": 8.459266874247805e-06, + "loss": 1.1324, + "step": 5425 + }, + { + "epoch": 0.5948403352138906, + "grad_norm": 1.15625, + "learning_rate": 8.440376943907847e-06, + "loss": 1.1416, + "step": 5430 + }, + { + "epoch": 0.5953880703291888, + "grad_norm": 1.2421875, + "learning_rate": 8.42149271579004e-06, + "loss": 1.1581, + "step": 5435 + }, + { + "epoch": 0.5959358054444871, + "grad_norm": 1.2265625, + "learning_rate": 8.402614258938038e-06, + "loss": 1.1468, + "step": 5440 + }, + { + "epoch": 0.5964835405597853, + "grad_norm": 1.25, + "learning_rate": 8.38374164237438e-06, + "loss": 1.1138, + "step": 5445 + }, + { + "epoch": 0.5970312756750835, + "grad_norm": 1.21875, + "learning_rate": 8.36487493510026e-06, + "loss": 1.1697, + "step": 5450 + }, + { + "epoch": 0.5975790107903818, + "grad_norm": 1.2421875, + "learning_rate": 8.346014206095272e-06, + "loss": 1.1659, + "step": 5455 + }, + { + "epoch": 0.59812674590568, + "grad_norm": 1.2265625, + "learning_rate": 8.32715952431714e-06, + "loss": 1.163, + "step": 5460 + }, + { + "epoch": 0.5986744810209782, + "grad_norm": 1.28125, + "learning_rate": 8.308310958701492e-06, + "loss": 1.1482, + "step": 5465 + }, + { + "epoch": 0.5992222161362765, + "grad_norm": 1.21875, + "learning_rate": 8.289468578161581e-06, + "loss": 1.1804, + "step": 5470 + }, + { + "epoch": 0.5997699512515747, + "grad_norm": 1.2734375, + "learning_rate": 8.270632451588053e-06, + "loss": 1.143, + "step": 5475 + }, + { + "epoch": 0.600317686366873, + "grad_norm": 1.1796875, + "learning_rate": 8.251802647848696e-06, + "loss": 1.1491, + "step": 5480 + }, + { + "epoch": 0.6008654214821713, + "grad_norm": 1.28125, + "learning_rate": 8.232979235788167e-06, + "loss": 1.2016, + "step": 5485 + }, + { + "epoch": 0.6014131565974694, + "grad_norm": 1.203125, + "learning_rate": 8.214162284227758e-06, + "loss": 1.1132, + "step": 5490 + }, + { + "epoch": 0.6019608917127677, + "grad_norm": 1.28125, + "learning_rate": 8.195351861965151e-06, + "loss": 1.1075, + "step": 5495 + }, + { + "epoch": 0.6025086268280659, + "grad_norm": 1.25, + "learning_rate": 8.176548037774137e-06, + "loss": 1.152, + "step": 5500 + }, + { + "epoch": 0.6030563619433642, + "grad_norm": 1.234375, + "learning_rate": 8.157750880404402e-06, + "loss": 1.1574, + "step": 5505 + }, + { + "epoch": 0.6036040970586625, + "grad_norm": 1.21875, + "learning_rate": 8.13896045858125e-06, + "loss": 1.1635, + "step": 5510 + }, + { + "epoch": 0.6041518321739606, + "grad_norm": 1.203125, + "learning_rate": 8.12017684100535e-06, + "loss": 1.1093, + "step": 5515 + }, + { + "epoch": 0.6046995672892589, + "grad_norm": 1.2421875, + "learning_rate": 8.101400096352508e-06, + "loss": 1.1926, + "step": 5520 + }, + { + "epoch": 0.6052473024045572, + "grad_norm": 1.2578125, + "learning_rate": 8.082630293273394e-06, + "loss": 1.1289, + "step": 5525 + }, + { + "epoch": 0.6057950375198554, + "grad_norm": 1.2578125, + "learning_rate": 8.063867500393296e-06, + "loss": 1.1551, + "step": 5530 + }, + { + "epoch": 0.6063427726351537, + "grad_norm": 1.2421875, + "learning_rate": 8.045111786311878e-06, + "loss": 1.1592, + "step": 5535 + }, + { + "epoch": 0.6068905077504518, + "grad_norm": 1.25, + "learning_rate": 8.026363219602921e-06, + "loss": 1.2031, + "step": 5540 + }, + { + "epoch": 0.6074382428657501, + "grad_norm": 1.3046875, + "learning_rate": 8.007621868814073e-06, + "loss": 1.2304, + "step": 5545 + }, + { + "epoch": 0.6079859779810484, + "grad_norm": 1.203125, + "learning_rate": 7.9888878024666e-06, + "loss": 1.1297, + "step": 5550 + }, + { + "epoch": 0.6085337130963466, + "grad_norm": 1.25, + "learning_rate": 7.970161089055127e-06, + "loss": 1.1341, + "step": 5555 + }, + { + "epoch": 0.6090814482116449, + "grad_norm": 1.171875, + "learning_rate": 7.951441797047412e-06, + "loss": 1.0772, + "step": 5560 + }, + { + "epoch": 0.609629183326943, + "grad_norm": 1.2421875, + "learning_rate": 7.93272999488407e-06, + "loss": 1.141, + "step": 5565 + }, + { + "epoch": 0.6101769184422413, + "grad_norm": 1.1953125, + "learning_rate": 7.914025750978324e-06, + "loss": 1.1046, + "step": 5570 + }, + { + "epoch": 0.6107246535575396, + "grad_norm": 1.2578125, + "learning_rate": 7.895329133715779e-06, + "loss": 1.1575, + "step": 5575 + }, + { + "epoch": 0.6112723886728378, + "grad_norm": 1.265625, + "learning_rate": 7.876640211454148e-06, + "loss": 1.1169, + "step": 5580 + }, + { + "epoch": 0.6118201237881361, + "grad_norm": 1.28125, + "learning_rate": 7.857959052523005e-06, + "loss": 1.157, + "step": 5585 + }, + { + "epoch": 0.6123678589034343, + "grad_norm": 1.2421875, + "learning_rate": 7.839285725223545e-06, + "loss": 1.1819, + "step": 5590 + }, + { + "epoch": 0.6129155940187325, + "grad_norm": 1.1875, + "learning_rate": 7.820620297828337e-06, + "loss": 1.16, + "step": 5595 + }, + { + "epoch": 0.6134633291340308, + "grad_norm": 1.1953125, + "learning_rate": 7.801962838581051e-06, + "loss": 1.1279, + "step": 5600 + }, + { + "epoch": 0.614011064249329, + "grad_norm": 1.21875, + "learning_rate": 7.783313415696231e-06, + "loss": 1.1946, + "step": 5605 + }, + { + "epoch": 0.6145587993646273, + "grad_norm": 1.2734375, + "learning_rate": 7.76467209735905e-06, + "loss": 1.1686, + "step": 5610 + }, + { + "epoch": 0.6151065344799255, + "grad_norm": 1.21875, + "learning_rate": 7.74603895172503e-06, + "loss": 1.206, + "step": 5615 + }, + { + "epoch": 0.6156542695952237, + "grad_norm": 1.28125, + "learning_rate": 7.727414046919825e-06, + "loss": 1.1534, + "step": 5620 + }, + { + "epoch": 0.616202004710522, + "grad_norm": 1.2421875, + "learning_rate": 7.70879745103896e-06, + "loss": 1.1479, + "step": 5625 + }, + { + "epoch": 0.6167497398258203, + "grad_norm": 1.203125, + "learning_rate": 7.690189232147566e-06, + "loss": 1.2199, + "step": 5630 + }, + { + "epoch": 0.6172974749411185, + "grad_norm": 1.2265625, + "learning_rate": 7.671589458280172e-06, + "loss": 1.1312, + "step": 5635 + }, + { + "epoch": 0.6178452100564167, + "grad_norm": 1.234375, + "learning_rate": 7.65299819744041e-06, + "loss": 1.1763, + "step": 5640 + }, + { + "epoch": 0.618392945171715, + "grad_norm": 1.2265625, + "learning_rate": 7.634415517600789e-06, + "loss": 1.1124, + "step": 5645 + }, + { + "epoch": 0.6189406802870132, + "grad_norm": 1.2421875, + "learning_rate": 7.61584148670246e-06, + "loss": 1.1387, + "step": 5650 + }, + { + "epoch": 0.6194884154023115, + "grad_norm": 1.171875, + "learning_rate": 7.59727617265493e-06, + "loss": 1.1751, + "step": 5655 + }, + { + "epoch": 0.6200361505176096, + "grad_norm": 1.2734375, + "learning_rate": 7.578719643335854e-06, + "loss": 1.1762, + "step": 5660 + }, + { + "epoch": 0.6205838856329079, + "grad_norm": 1.2890625, + "learning_rate": 7.560171966590762e-06, + "loss": 1.1825, + "step": 5665 + }, + { + "epoch": 0.6211316207482062, + "grad_norm": 1.2578125, + "learning_rate": 7.541633210232812e-06, + "loss": 1.161, + "step": 5670 + }, + { + "epoch": 0.6216793558635044, + "grad_norm": 1.1875, + "learning_rate": 7.523103442042556e-06, + "loss": 1.097, + "step": 5675 + }, + { + "epoch": 0.6222270909788027, + "grad_norm": 1.234375, + "learning_rate": 7.504582729767687e-06, + "loss": 1.1358, + "step": 5680 + }, + { + "epoch": 0.6227748260941008, + "grad_norm": 1.28125, + "learning_rate": 7.486071141122774e-06, + "loss": 1.2019, + "step": 5685 + }, + { + "epoch": 0.6233225612093991, + "grad_norm": 1.21875, + "learning_rate": 7.4675687437890375e-06, + "loss": 1.1657, + "step": 5690 + }, + { + "epoch": 0.6238702963246974, + "grad_norm": 1.1953125, + "learning_rate": 7.4490756054141e-06, + "loss": 1.1705, + "step": 5695 + }, + { + "epoch": 0.6244180314399956, + "grad_norm": 1.2109375, + "learning_rate": 7.430591793611715e-06, + "loss": 1.1294, + "step": 5700 + }, + { + "epoch": 0.6249657665552939, + "grad_norm": 1.234375, + "learning_rate": 7.412117375961554e-06, + "loss": 1.1325, + "step": 5705 + }, + { + "epoch": 0.625513501670592, + "grad_norm": 1.203125, + "learning_rate": 7.393652420008923e-06, + "loss": 1.1276, + "step": 5710 + }, + { + "epoch": 0.6260612367858903, + "grad_norm": 1.1875, + "learning_rate": 7.375196993264555e-06, + "loss": 1.1718, + "step": 5715 + }, + { + "epoch": 0.6266089719011886, + "grad_norm": 1.203125, + "learning_rate": 7.356751163204332e-06, + "loss": 1.11, + "step": 5720 + }, + { + "epoch": 0.6271567070164868, + "grad_norm": 1.265625, + "learning_rate": 7.33831499726905e-06, + "loss": 1.1356, + "step": 5725 + }, + { + "epoch": 0.6277044421317851, + "grad_norm": 1.1796875, + "learning_rate": 7.319888562864165e-06, + "loss": 1.1446, + "step": 5730 + }, + { + "epoch": 0.6282521772470833, + "grad_norm": 1.1953125, + "learning_rate": 7.301471927359571e-06, + "loss": 1.1267, + "step": 5735 + }, + { + "epoch": 0.6287999123623815, + "grad_norm": 1.3359375, + "learning_rate": 7.283065158089315e-06, + "loss": 1.1637, + "step": 5740 + }, + { + "epoch": 0.6293476474776798, + "grad_norm": 1.3046875, + "learning_rate": 7.264668322351385e-06, + "loss": 1.1372, + "step": 5745 + }, + { + "epoch": 0.6298953825929781, + "grad_norm": 1.203125, + "learning_rate": 7.24628148740745e-06, + "loss": 1.1375, + "step": 5750 + }, + { + "epoch": 0.6304431177082763, + "grad_norm": 1.265625, + "learning_rate": 7.227904720482601e-06, + "loss": 1.117, + "step": 5755 + }, + { + "epoch": 0.6309908528235745, + "grad_norm": 1.203125, + "learning_rate": 7.20953808876514e-06, + "loss": 1.1562, + "step": 5760 + }, + { + "epoch": 0.6315385879388727, + "grad_norm": 1.234375, + "learning_rate": 7.191181659406297e-06, + "loss": 1.12, + "step": 5765 + }, + { + "epoch": 0.632086323054171, + "grad_norm": 1.265625, + "learning_rate": 7.172835499520002e-06, + "loss": 1.1717, + "step": 5770 + }, + { + "epoch": 0.6326340581694693, + "grad_norm": 1.203125, + "learning_rate": 7.154499676182648e-06, + "loss": 1.1618, + "step": 5775 + }, + { + "epoch": 0.6331817932847675, + "grad_norm": 1.265625, + "learning_rate": 7.136174256432828e-06, + "loss": 1.1518, + "step": 5780 + }, + { + "epoch": 0.6337295284000657, + "grad_norm": 1.3046875, + "learning_rate": 7.117859307271095e-06, + "loss": 1.1344, + "step": 5785 + }, + { + "epoch": 0.634277263515364, + "grad_norm": 1.2265625, + "learning_rate": 7.099554895659734e-06, + "loss": 1.1029, + "step": 5790 + }, + { + "epoch": 0.6348249986306622, + "grad_norm": 1.2578125, + "learning_rate": 7.081261088522482e-06, + "loss": 1.1324, + "step": 5795 + }, + { + "epoch": 0.6353727337459605, + "grad_norm": 1.2265625, + "learning_rate": 7.062977952744326e-06, + "loss": 1.0928, + "step": 5800 + }, + { + "epoch": 0.6359204688612587, + "grad_norm": 1.234375, + "learning_rate": 7.044705555171223e-06, + "loss": 1.1352, + "step": 5805 + }, + { + "epoch": 0.6364682039765569, + "grad_norm": 1.1953125, + "learning_rate": 7.02644396260987e-06, + "loss": 1.1719, + "step": 5810 + }, + { + "epoch": 0.6370159390918552, + "grad_norm": 1.2265625, + "learning_rate": 7.008193241827468e-06, + "loss": 1.1325, + "step": 5815 + }, + { + "epoch": 0.6375636742071534, + "grad_norm": 1.203125, + "learning_rate": 6.989953459551463e-06, + "loss": 1.1598, + "step": 5820 + }, + { + "epoch": 0.6381114093224517, + "grad_norm": 1.28125, + "learning_rate": 6.971724682469303e-06, + "loss": 1.1432, + "step": 5825 + }, + { + "epoch": 0.63865914443775, + "grad_norm": 1.1953125, + "learning_rate": 6.953506977228211e-06, + "loss": 1.1366, + "step": 5830 + }, + { + "epoch": 0.6392068795530481, + "grad_norm": 1.1953125, + "learning_rate": 6.935300410434927e-06, + "loss": 1.1742, + "step": 5835 + }, + { + "epoch": 0.6397546146683464, + "grad_norm": 1.28125, + "learning_rate": 6.917105048655454e-06, + "loss": 1.2324, + "step": 5840 + }, + { + "epoch": 0.6403023497836446, + "grad_norm": 1.2421875, + "learning_rate": 6.898920958414843e-06, + "loss": 1.1062, + "step": 5845 + }, + { + "epoch": 0.6408500848989429, + "grad_norm": 1.2421875, + "learning_rate": 6.880748206196934e-06, + "loss": 1.1941, + "step": 5850 + }, + { + "epoch": 0.6413978200142411, + "grad_norm": 1.171875, + "learning_rate": 6.862586858444102e-06, + "loss": 1.1457, + "step": 5855 + }, + { + "epoch": 0.6419455551295393, + "grad_norm": 1.203125, + "learning_rate": 6.84443698155703e-06, + "loss": 1.1622, + "step": 5860 + }, + { + "epoch": 0.6424932902448376, + "grad_norm": 1.2421875, + "learning_rate": 6.826298641894473e-06, + "loss": 1.1634, + "step": 5865 + }, + { + "epoch": 0.6430410253601359, + "grad_norm": 1.234375, + "learning_rate": 6.808171905772986e-06, + "loss": 1.1232, + "step": 5870 + }, + { + "epoch": 0.6435887604754341, + "grad_norm": 1.1953125, + "learning_rate": 6.790056839466715e-06, + "loss": 1.164, + "step": 5875 + }, + { + "epoch": 0.6441364955907323, + "grad_norm": 1.21875, + "learning_rate": 6.7719535092071296e-06, + "loss": 1.1156, + "step": 5880 + }, + { + "epoch": 0.6446842307060305, + "grad_norm": 1.25, + "learning_rate": 6.753861981182793e-06, + "loss": 1.1748, + "step": 5885 + }, + { + "epoch": 0.6452319658213288, + "grad_norm": 1.25, + "learning_rate": 6.735782321539125e-06, + "loss": 1.105, + "step": 5890 + }, + { + "epoch": 0.6457797009366271, + "grad_norm": 1.2734375, + "learning_rate": 6.717714596378138e-06, + "loss": 1.1865, + "step": 5895 + }, + { + "epoch": 0.6463274360519253, + "grad_norm": 1.140625, + "learning_rate": 6.699658871758223e-06, + "loss": 1.1225, + "step": 5900 + }, + { + "epoch": 0.6468751711672235, + "grad_norm": 1.296875, + "learning_rate": 6.681615213693895e-06, + "loss": 1.0707, + "step": 5905 + }, + { + "epoch": 0.6474229062825217, + "grad_norm": 1.2109375, + "learning_rate": 6.6635836881555374e-06, + "loss": 1.1432, + "step": 5910 + }, + { + "epoch": 0.64797064139782, + "grad_norm": 1.21875, + "learning_rate": 6.645564361069193e-06, + "loss": 1.1497, + "step": 5915 + }, + { + "epoch": 0.6485183765131183, + "grad_norm": 1.203125, + "learning_rate": 6.627557298316296e-06, + "loss": 1.1894, + "step": 5920 + }, + { + "epoch": 0.6490661116284165, + "grad_norm": 1.2890625, + "learning_rate": 6.609562565733438e-06, + "loss": 1.1338, + "step": 5925 + }, + { + "epoch": 0.6496138467437147, + "grad_norm": 1.2109375, + "learning_rate": 6.591580229112135e-06, + "loss": 1.0914, + "step": 5930 + }, + { + "epoch": 0.650161581859013, + "grad_norm": 1.1875, + "learning_rate": 6.573610354198587e-06, + "loss": 1.1414, + "step": 5935 + }, + { + "epoch": 0.6507093169743112, + "grad_norm": 1.265625, + "learning_rate": 6.555653006693417e-06, + "loss": 1.1661, + "step": 5940 + }, + { + "epoch": 0.6512570520896095, + "grad_norm": 1.1875, + "learning_rate": 6.537708252251454e-06, + "loss": 1.2106, + "step": 5945 + }, + { + "epoch": 0.6518047872049078, + "grad_norm": 1.2421875, + "learning_rate": 6.519776156481492e-06, + "loss": 1.127, + "step": 5950 + }, + { + "epoch": 0.6523525223202059, + "grad_norm": 1.1953125, + "learning_rate": 6.501856784946027e-06, + "loss": 1.1524, + "step": 5955 + }, + { + "epoch": 0.6529002574355042, + "grad_norm": 1.25, + "learning_rate": 6.483950203161047e-06, + "loss": 1.1249, + "step": 5960 + }, + { + "epoch": 0.6534479925508024, + "grad_norm": 1.2265625, + "learning_rate": 6.466056476595768e-06, + "loss": 1.1287, + "step": 5965 + }, + { + "epoch": 0.6539957276661007, + "grad_norm": 1.21875, + "learning_rate": 6.448175670672413e-06, + "loss": 1.1386, + "step": 5970 + }, + { + "epoch": 0.654543462781399, + "grad_norm": 1.2421875, + "learning_rate": 6.43030785076597e-06, + "loss": 1.21, + "step": 5975 + }, + { + "epoch": 0.6550911978966971, + "grad_norm": 1.6484375, + "learning_rate": 6.412453082203933e-06, + "loss": 1.1321, + "step": 5980 + }, + { + "epoch": 0.6556389330119954, + "grad_norm": 1.25, + "learning_rate": 6.394611430266087e-06, + "loss": 1.1684, + "step": 5985 + }, + { + "epoch": 0.6561866681272936, + "grad_norm": 1.234375, + "learning_rate": 6.376782960184266e-06, + "loss": 1.1898, + "step": 5990 + }, + { + "epoch": 0.6567344032425919, + "grad_norm": 1.2421875, + "learning_rate": 6.358967737142096e-06, + "loss": 1.1738, + "step": 5995 + }, + { + "epoch": 0.6572821383578902, + "grad_norm": 1.28125, + "learning_rate": 6.34116582627478e-06, + "loss": 1.1621, + "step": 6000 + }, + { + "epoch": 0.6578298734731883, + "grad_norm": 1.2265625, + "learning_rate": 6.323377292668848e-06, + "loss": 1.1685, + "step": 6005 + }, + { + "epoch": 0.6583776085884866, + "grad_norm": 1.2578125, + "learning_rate": 6.305602201361915e-06, + "loss": 1.1973, + "step": 6010 + }, + { + "epoch": 0.6589253437037849, + "grad_norm": 1.1875, + "learning_rate": 6.2878406173424576e-06, + "loss": 1.0964, + "step": 6015 + }, + { + "epoch": 0.6594730788190831, + "grad_norm": 1.1875, + "learning_rate": 6.270092605549564e-06, + "loss": 1.1605, + "step": 6020 + }, + { + "epoch": 0.6600208139343813, + "grad_norm": 1.2265625, + "learning_rate": 6.25235823087269e-06, + "loss": 1.1705, + "step": 6025 + }, + { + "epoch": 0.6605685490496795, + "grad_norm": 1.234375, + "learning_rate": 6.2346375581514525e-06, + "loss": 1.1656, + "step": 6030 + }, + { + "epoch": 0.6611162841649778, + "grad_norm": 1.2421875, + "learning_rate": 6.216930652175358e-06, + "loss": 1.1893, + "step": 6035 + }, + { + "epoch": 0.6616640192802761, + "grad_norm": 1.21875, + "learning_rate": 6.199237577683577e-06, + "loss": 1.1163, + "step": 6040 + }, + { + "epoch": 0.6622117543955743, + "grad_norm": 1.1875, + "learning_rate": 6.181558399364722e-06, + "loss": 1.1637, + "step": 6045 + }, + { + "epoch": 0.6627594895108725, + "grad_norm": 1.3359375, + "learning_rate": 6.163893181856586e-06, + "loss": 1.2075, + "step": 6050 + }, + { + "epoch": 0.6633072246261708, + "grad_norm": 1.2109375, + "learning_rate": 6.146241989745932e-06, + "loss": 1.1517, + "step": 6055 + }, + { + "epoch": 0.663854959741469, + "grad_norm": 1.2109375, + "learning_rate": 6.128604887568237e-06, + "loss": 1.113, + "step": 6060 + }, + { + "epoch": 0.6644026948567673, + "grad_norm": 1.203125, + "learning_rate": 6.1109819398074564e-06, + "loss": 1.1239, + "step": 6065 + }, + { + "epoch": 0.6649504299720655, + "grad_norm": 1.2734375, + "learning_rate": 6.09337321089581e-06, + "loss": 1.1289, + "step": 6070 + }, + { + "epoch": 0.6654981650873637, + "grad_norm": 1.203125, + "learning_rate": 6.0757787652135195e-06, + "loss": 1.1207, + "step": 6075 + }, + { + "epoch": 0.666045900202662, + "grad_norm": 1.2109375, + "learning_rate": 6.05819866708859e-06, + "loss": 1.1825, + "step": 6080 + }, + { + "epoch": 0.6665936353179602, + "grad_norm": 1.2109375, + "learning_rate": 6.040632980796566e-06, + "loss": 1.1496, + "step": 6085 + }, + { + "epoch": 0.6671413704332585, + "grad_norm": 1.1796875, + "learning_rate": 6.023081770560307e-06, + "loss": 1.1455, + "step": 6090 + }, + { + "epoch": 0.6676891055485568, + "grad_norm": 1.25, + "learning_rate": 6.005545100549739e-06, + "loss": 1.1437, + "step": 6095 + }, + { + "epoch": 0.6682368406638549, + "grad_norm": 1.2109375, + "learning_rate": 5.9880230348816245e-06, + "loss": 1.1507, + "step": 6100 + }, + { + "epoch": 0.6687845757791532, + "grad_norm": 1.1875, + "learning_rate": 5.970515637619346e-06, + "loss": 1.096, + "step": 6105 + }, + { + "epoch": 0.6693323108944514, + "grad_norm": 1.1796875, + "learning_rate": 5.953022972772633e-06, + "loss": 1.1749, + "step": 6110 + }, + { + "epoch": 0.6698800460097497, + "grad_norm": 1.203125, + "learning_rate": 5.935545104297373e-06, + "loss": 1.1504, + "step": 6115 + }, + { + "epoch": 0.670427781125048, + "grad_norm": 1.2265625, + "learning_rate": 5.918082096095339e-06, + "loss": 1.1512, + "step": 6120 + }, + { + "epoch": 0.6709755162403461, + "grad_norm": 1.203125, + "learning_rate": 5.900634012013977e-06, + "loss": 1.1532, + "step": 6125 + }, + { + "epoch": 0.6715232513556444, + "grad_norm": 1.2109375, + "learning_rate": 5.883200915846181e-06, + "loss": 1.0785, + "step": 6130 + }, + { + "epoch": 0.6720709864709427, + "grad_norm": 1.15625, + "learning_rate": 5.865782871330026e-06, + "loss": 1.1757, + "step": 6135 + }, + { + "epoch": 0.6726187215862409, + "grad_norm": 1.2734375, + "learning_rate": 5.848379942148568e-06, + "loss": 1.2096, + "step": 6140 + }, + { + "epoch": 0.6731664567015392, + "grad_norm": 1.203125, + "learning_rate": 5.830992191929603e-06, + "loss": 1.1297, + "step": 6145 + }, + { + "epoch": 0.6737141918168373, + "grad_norm": 1.1875, + "learning_rate": 5.813619684245413e-06, + "loss": 1.1174, + "step": 6150 + }, + { + "epoch": 0.6742619269321356, + "grad_norm": 1.25, + "learning_rate": 5.7962624826125725e-06, + "loss": 1.1527, + "step": 6155 + }, + { + "epoch": 0.6748096620474339, + "grad_norm": 1.28125, + "learning_rate": 5.7789206504916815e-06, + "loss": 1.1952, + "step": 6160 + }, + { + "epoch": 0.6753573971627321, + "grad_norm": 1.2265625, + "learning_rate": 5.761594251287149e-06, + "loss": 1.1212, + "step": 6165 + }, + { + "epoch": 0.6759051322780304, + "grad_norm": 1.1796875, + "learning_rate": 5.74428334834696e-06, + "loss": 1.142, + "step": 6170 + }, + { + "epoch": 0.6764528673933285, + "grad_norm": 1.2109375, + "learning_rate": 5.726988004962443e-06, + "loss": 1.137, + "step": 6175 + }, + { + "epoch": 0.6770006025086268, + "grad_norm": 1.265625, + "learning_rate": 5.709708284368038e-06, + "loss": 1.1304, + "step": 6180 + }, + { + "epoch": 0.6775483376239251, + "grad_norm": 1.171875, + "learning_rate": 5.692444249741064e-06, + "loss": 1.101, + "step": 6185 + }, + { + "epoch": 0.6780960727392233, + "grad_norm": 1.2109375, + "learning_rate": 5.675195964201492e-06, + "loss": 1.1385, + "step": 6190 + }, + { + "epoch": 0.6786438078545216, + "grad_norm": 1.2578125, + "learning_rate": 5.65796349081171e-06, + "loss": 1.1179, + "step": 6195 + }, + { + "epoch": 0.6791915429698198, + "grad_norm": 1.2109375, + "learning_rate": 5.640746892576296e-06, + "loss": 1.1782, + "step": 6200 + }, + { + "epoch": 0.679739278085118, + "grad_norm": 1.234375, + "learning_rate": 5.623546232441786e-06, + "loss": 1.1495, + "step": 6205 + }, + { + "epoch": 0.6802870132004163, + "grad_norm": 1.2109375, + "learning_rate": 5.6063615732964425e-06, + "loss": 1.1382, + "step": 6210 + }, + { + "epoch": 0.6808347483157146, + "grad_norm": 1.234375, + "learning_rate": 5.589192977970028e-06, + "loss": 1.0981, + "step": 6215 + }, + { + "epoch": 0.6813824834310127, + "grad_norm": 1.2109375, + "learning_rate": 5.57204050923357e-06, + "loss": 1.1724, + "step": 6220 + }, + { + "epoch": 0.681930218546311, + "grad_norm": 1.2265625, + "learning_rate": 5.5549042297991366e-06, + "loss": 1.1736, + "step": 6225 + }, + { + "epoch": 0.6824779536616092, + "grad_norm": 1.25, + "learning_rate": 5.537784202319607e-06, + "loss": 1.1174, + "step": 6230 + }, + { + "epoch": 0.6830256887769075, + "grad_norm": 1.2421875, + "learning_rate": 5.520680489388437e-06, + "loss": 1.1457, + "step": 6235 + }, + { + "epoch": 0.6835734238922058, + "grad_norm": 1.234375, + "learning_rate": 5.503593153539437e-06, + "loss": 1.1336, + "step": 6240 + }, + { + "epoch": 0.6841211590075039, + "grad_norm": 1.2890625, + "learning_rate": 5.486522257246538e-06, + "loss": 1.2187, + "step": 6245 + }, + { + "epoch": 0.6846688941228022, + "grad_norm": 1.2890625, + "learning_rate": 5.4694678629235696e-06, + "loss": 1.1642, + "step": 6250 + }, + { + "epoch": 0.6852166292381004, + "grad_norm": 1.234375, + "learning_rate": 5.452430032924017e-06, + "loss": 1.1529, + "step": 6255 + }, + { + "epoch": 0.6857643643533987, + "grad_norm": 1.21875, + "learning_rate": 5.4354088295408265e-06, + "loss": 1.1982, + "step": 6260 + }, + { + "epoch": 0.686312099468697, + "grad_norm": 1.2890625, + "learning_rate": 5.418404315006125e-06, + "loss": 1.0957, + "step": 6265 + }, + { + "epoch": 0.6868598345839951, + "grad_norm": 1.2421875, + "learning_rate": 5.401416551491039e-06, + "loss": 1.1719, + "step": 6270 + }, + { + "epoch": 0.6874075696992934, + "grad_norm": 1.1953125, + "learning_rate": 5.384445601105462e-06, + "loss": 1.0921, + "step": 6275 + }, + { + "epoch": 0.6879553048145917, + "grad_norm": 1.25, + "learning_rate": 5.367491525897787e-06, + "loss": 1.1153, + "step": 6280 + }, + { + "epoch": 0.6885030399298899, + "grad_norm": 1.2578125, + "learning_rate": 5.3505543878547315e-06, + "loss": 1.1524, + "step": 6285 + }, + { + "epoch": 0.6890507750451882, + "grad_norm": 1.2421875, + "learning_rate": 5.33363424890108e-06, + "loss": 1.1685, + "step": 6290 + }, + { + "epoch": 0.6895985101604863, + "grad_norm": 1.2421875, + "learning_rate": 5.316731170899467e-06, + "loss": 1.1593, + "step": 6295 + }, + { + "epoch": 0.6901462452757846, + "grad_norm": 1.328125, + "learning_rate": 5.299845215650157e-06, + "loss": 1.1513, + "step": 6300 + }, + { + "epoch": 0.6906939803910829, + "grad_norm": 1.21875, + "learning_rate": 5.282976444890793e-06, + "loss": 1.1787, + "step": 6305 + }, + { + "epoch": 0.6912417155063811, + "grad_norm": 1.2734375, + "learning_rate": 5.2661249202962e-06, + "loss": 1.125, + "step": 6310 + }, + { + "epoch": 0.6917894506216794, + "grad_norm": 1.2265625, + "learning_rate": 5.249290703478158e-06, + "loss": 1.1662, + "step": 6315 + }, + { + "epoch": 0.6923371857369776, + "grad_norm": 1.2109375, + "learning_rate": 5.232473855985149e-06, + "loss": 1.1612, + "step": 6320 + }, + { + "epoch": 0.6928849208522758, + "grad_norm": 1.234375, + "learning_rate": 5.215674439302155e-06, + "loss": 1.1552, + "step": 6325 + }, + { + "epoch": 0.6934326559675741, + "grad_norm": 1.234375, + "learning_rate": 5.198892514850444e-06, + "loss": 1.1097, + "step": 6330 + }, + { + "epoch": 0.6939803910828723, + "grad_norm": 1.203125, + "learning_rate": 5.182128143987302e-06, + "loss": 1.1337, + "step": 6335 + }, + { + "epoch": 0.6945281261981706, + "grad_norm": 1.1953125, + "learning_rate": 5.165381388005861e-06, + "loss": 1.148, + "step": 6340 + }, + { + "epoch": 0.6950758613134688, + "grad_norm": 1.34375, + "learning_rate": 5.148652308134844e-06, + "loss": 1.2329, + "step": 6345 + }, + { + "epoch": 0.695623596428767, + "grad_norm": 1.328125, + "learning_rate": 5.131940965538329e-06, + "loss": 1.1538, + "step": 6350 + }, + { + "epoch": 0.6961713315440653, + "grad_norm": 1.234375, + "learning_rate": 5.115247421315572e-06, + "loss": 1.2042, + "step": 6355 + }, + { + "epoch": 0.6967190666593636, + "grad_norm": 1.4296875, + "learning_rate": 5.098571736500744e-06, + "loss": 1.1985, + "step": 6360 + }, + { + "epoch": 0.6972668017746618, + "grad_norm": 1.1953125, + "learning_rate": 5.081913972062704e-06, + "loss": 1.1183, + "step": 6365 + }, + { + "epoch": 0.69781453688996, + "grad_norm": 1.1875, + "learning_rate": 5.0652741889048205e-06, + "loss": 1.179, + "step": 6370 + }, + { + "epoch": 0.6983622720052582, + "grad_norm": 1.21875, + "learning_rate": 5.048652447864689e-06, + "loss": 1.1346, + "step": 6375 + }, + { + "epoch": 0.6989100071205565, + "grad_norm": 1.1875, + "learning_rate": 5.032048809713967e-06, + "loss": 1.1022, + "step": 6380 + }, + { + "epoch": 0.6994577422358548, + "grad_norm": 1.21875, + "learning_rate": 5.015463335158114e-06, + "loss": 1.179, + "step": 6385 + }, + { + "epoch": 0.7000054773511529, + "grad_norm": 1.265625, + "learning_rate": 4.998896084836167e-06, + "loss": 1.1692, + "step": 6390 + }, + { + "epoch": 0.7005532124664512, + "grad_norm": 1.2265625, + "learning_rate": 4.982347119320557e-06, + "loss": 1.1716, + "step": 6395 + }, + { + "epoch": 0.7011009475817495, + "grad_norm": 1.203125, + "learning_rate": 4.965816499116849e-06, + "loss": 1.1171, + "step": 6400 + }, + { + "epoch": 0.7016486826970477, + "grad_norm": 1.2734375, + "learning_rate": 4.949304284663535e-06, + "loss": 1.1577, + "step": 6405 + }, + { + "epoch": 0.702196417812346, + "grad_norm": 1.1640625, + "learning_rate": 4.932810536331817e-06, + "loss": 1.1562, + "step": 6410 + }, + { + "epoch": 0.7027441529276441, + "grad_norm": 1.171875, + "learning_rate": 4.916335314425376e-06, + "loss": 1.1456, + "step": 6415 + }, + { + "epoch": 0.7032918880429424, + "grad_norm": 1.1875, + "learning_rate": 4.899878679180167e-06, + "loss": 1.1799, + "step": 6420 + }, + { + "epoch": 0.7038396231582407, + "grad_norm": 1.25, + "learning_rate": 4.8834406907641784e-06, + "loss": 1.1339, + "step": 6425 + }, + { + "epoch": 0.7043873582735389, + "grad_norm": 1.1796875, + "learning_rate": 4.8670214092772316e-06, + "loss": 1.1551, + "step": 6430 + }, + { + "epoch": 0.7049350933888372, + "grad_norm": 1.234375, + "learning_rate": 4.850620894750746e-06, + "loss": 1.1504, + "step": 6435 + }, + { + "epoch": 0.7054828285041354, + "grad_norm": 1.234375, + "learning_rate": 4.834239207147532e-06, + "loss": 1.2009, + "step": 6440 + }, + { + "epoch": 0.7060305636194336, + "grad_norm": 1.2578125, + "learning_rate": 4.817876406361561e-06, + "loss": 1.1192, + "step": 6445 + }, + { + "epoch": 0.7065782987347319, + "grad_norm": 1.2265625, + "learning_rate": 4.801532552217756e-06, + "loss": 1.1502, + "step": 6450 + }, + { + "epoch": 0.7071260338500301, + "grad_norm": 1.1953125, + "learning_rate": 4.785207704471763e-06, + "loss": 1.142, + "step": 6455 + }, + { + "epoch": 0.7076737689653284, + "grad_norm": 1.203125, + "learning_rate": 4.76890192280974e-06, + "loss": 1.1281, + "step": 6460 + }, + { + "epoch": 0.7082215040806266, + "grad_norm": 1.203125, + "learning_rate": 4.7526152668481385e-06, + "loss": 1.1053, + "step": 6465 + }, + { + "epoch": 0.7087692391959248, + "grad_norm": 1.2109375, + "learning_rate": 4.736347796133481e-06, + "loss": 1.163, + "step": 6470 + }, + { + "epoch": 0.7093169743112231, + "grad_norm": 1.2421875, + "learning_rate": 4.7200995701421455e-06, + "loss": 1.2034, + "step": 6475 + }, + { + "epoch": 0.7098647094265214, + "grad_norm": 1.234375, + "learning_rate": 4.703870648280151e-06, + "loss": 1.1658, + "step": 6480 + }, + { + "epoch": 0.7104124445418196, + "grad_norm": 1.2109375, + "learning_rate": 4.687661089882934e-06, + "loss": 1.1676, + "step": 6485 + }, + { + "epoch": 0.7109601796571178, + "grad_norm": 1.203125, + "learning_rate": 4.671470954215139e-06, + "loss": 1.1919, + "step": 6490 + }, + { + "epoch": 0.711507914772416, + "grad_norm": 1.2109375, + "learning_rate": 4.655300300470395e-06, + "loss": 1.1288, + "step": 6495 + }, + { + "epoch": 0.7120556498877143, + "grad_norm": 1.21875, + "learning_rate": 4.639149187771102e-06, + "loss": 1.1863, + "step": 6500 + }, + { + "epoch": 0.7126033850030126, + "grad_norm": 1.2578125, + "learning_rate": 4.6230176751682185e-06, + "loss": 1.1942, + "step": 6505 + }, + { + "epoch": 0.7131511201183108, + "grad_norm": 1.2890625, + "learning_rate": 4.606905821641036e-06, + "loss": 1.1683, + "step": 6510 + }, + { + "epoch": 0.713698855233609, + "grad_norm": 1.2421875, + "learning_rate": 4.590813686096981e-06, + "loss": 1.0678, + "step": 6515 + }, + { + "epoch": 0.7142465903489073, + "grad_norm": 1.203125, + "learning_rate": 4.5747413273713715e-06, + "loss": 1.1841, + "step": 6520 + }, + { + "epoch": 0.7147943254642055, + "grad_norm": 1.2578125, + "learning_rate": 4.558688804227229e-06, + "loss": 1.1208, + "step": 6525 + }, + { + "epoch": 0.7153420605795038, + "grad_norm": 1.296875, + "learning_rate": 4.542656175355054e-06, + "loss": 1.1631, + "step": 6530 + }, + { + "epoch": 0.715889795694802, + "grad_norm": 1.234375, + "learning_rate": 4.526643499372604e-06, + "loss": 1.1604, + "step": 6535 + }, + { + "epoch": 0.7164375308101002, + "grad_norm": 1.328125, + "learning_rate": 4.510650834824692e-06, + "loss": 1.1719, + "step": 6540 + }, + { + "epoch": 0.7169852659253985, + "grad_norm": 1.2421875, + "learning_rate": 4.49467824018296e-06, + "loss": 1.1601, + "step": 6545 + }, + { + "epoch": 0.7175330010406967, + "grad_norm": 1.2734375, + "learning_rate": 4.4787257738456745e-06, + "loss": 1.1557, + "step": 6550 + }, + { + "epoch": 0.718080736155995, + "grad_norm": 1.1796875, + "learning_rate": 4.4627934941375185e-06, + "loss": 1.102, + "step": 6555 + }, + { + "epoch": 0.7186284712712933, + "grad_norm": 1.21875, + "learning_rate": 4.446881459309351e-06, + "loss": 1.0873, + "step": 6560 + }, + { + "epoch": 0.7191762063865914, + "grad_norm": 1.2421875, + "learning_rate": 4.430989727538023e-06, + "loss": 1.1641, + "step": 6565 + }, + { + "epoch": 0.7197239415018897, + "grad_norm": 1.234375, + "learning_rate": 4.415118356926163e-06, + "loss": 1.1565, + "step": 6570 + }, + { + "epoch": 0.7202716766171879, + "grad_norm": 1.2265625, + "learning_rate": 4.399267405501938e-06, + "loss": 1.174, + "step": 6575 + }, + { + "epoch": 0.7208194117324862, + "grad_norm": 1.2734375, + "learning_rate": 4.383436931218867e-06, + "loss": 1.1657, + "step": 6580 + }, + { + "epoch": 0.7213671468477844, + "grad_norm": 1.265625, + "learning_rate": 4.367626991955615e-06, + "loss": 1.1566, + "step": 6585 + }, + { + "epoch": 0.7219148819630826, + "grad_norm": 1.2578125, + "learning_rate": 4.351837645515739e-06, + "loss": 1.1382, + "step": 6590 + }, + { + "epoch": 0.7224626170783809, + "grad_norm": 1.234375, + "learning_rate": 4.336068949627534e-06, + "loss": 1.1729, + "step": 6595 + }, + { + "epoch": 0.7230103521936792, + "grad_norm": 1.3046875, + "learning_rate": 4.320320961943781e-06, + "loss": 1.1313, + "step": 6600 + }, + { + "epoch": 0.7235580873089774, + "grad_norm": 1.203125, + "learning_rate": 4.304593740041539e-06, + "loss": 1.1615, + "step": 6605 + }, + { + "epoch": 0.7241058224242756, + "grad_norm": 1.21875, + "learning_rate": 4.2888873414219675e-06, + "loss": 1.1193, + "step": 6610 + }, + { + "epoch": 0.7246535575395738, + "grad_norm": 1.234375, + "learning_rate": 4.273201823510066e-06, + "loss": 1.1548, + "step": 6615 + }, + { + "epoch": 0.7252012926548721, + "grad_norm": 1.25, + "learning_rate": 4.2575372436545125e-06, + "loss": 1.1503, + "step": 6620 + }, + { + "epoch": 0.7257490277701704, + "grad_norm": 1.2109375, + "learning_rate": 4.241893659127428e-06, + "loss": 1.0939, + "step": 6625 + }, + { + "epoch": 0.7262967628854686, + "grad_norm": 1.21875, + "learning_rate": 4.226271127124153e-06, + "loss": 1.1613, + "step": 6630 + }, + { + "epoch": 0.7268444980007668, + "grad_norm": 1.21875, + "learning_rate": 4.210669704763084e-06, + "loss": 1.1476, + "step": 6635 + }, + { + "epoch": 0.727392233116065, + "grad_norm": 1.34375, + "learning_rate": 4.195089449085424e-06, + "loss": 1.1295, + "step": 6640 + }, + { + "epoch": 0.7279399682313633, + "grad_norm": 1.1953125, + "learning_rate": 4.1795304170549784e-06, + "loss": 1.1707, + "step": 6645 + }, + { + "epoch": 0.7284877033466616, + "grad_norm": 1.2265625, + "learning_rate": 4.163992665557975e-06, + "loss": 1.1444, + "step": 6650 + }, + { + "epoch": 0.7290354384619598, + "grad_norm": 1.265625, + "learning_rate": 4.148476251402821e-06, + "loss": 1.2146, + "step": 6655 + }, + { + "epoch": 0.729583173577258, + "grad_norm": 1.1953125, + "learning_rate": 4.1329812313199195e-06, + "loss": 1.1645, + "step": 6660 + }, + { + "epoch": 0.7301309086925563, + "grad_norm": 1.2265625, + "learning_rate": 4.117507661961451e-06, + "loss": 1.1362, + "step": 6665 + }, + { + "epoch": 0.7306786438078545, + "grad_norm": 1.2578125, + "learning_rate": 4.102055599901168e-06, + "loss": 1.1424, + "step": 6670 + }, + { + "epoch": 0.7312263789231528, + "grad_norm": 1.25, + "learning_rate": 4.08662510163419e-06, + "loss": 1.1322, + "step": 6675 + }, + { + "epoch": 0.731774114038451, + "grad_norm": 1.203125, + "learning_rate": 4.071216223576795e-06, + "loss": 1.1155, + "step": 6680 + }, + { + "epoch": 0.7323218491537492, + "grad_norm": 1.21875, + "learning_rate": 4.055829022066216e-06, + "loss": 1.126, + "step": 6685 + }, + { + "epoch": 0.7328695842690475, + "grad_norm": 1.234375, + "learning_rate": 4.040463553360431e-06, + "loss": 1.1659, + "step": 6690 + }, + { + "epoch": 0.7334173193843457, + "grad_norm": 1.2109375, + "learning_rate": 4.025119873637962e-06, + "loss": 1.1248, + "step": 6695 + }, + { + "epoch": 0.733965054499644, + "grad_norm": 1.234375, + "learning_rate": 4.009798038997664e-06, + "loss": 1.1073, + "step": 6700 + }, + { + "epoch": 0.7345127896149423, + "grad_norm": 1.1796875, + "learning_rate": 3.9944981054585276e-06, + "loss": 1.1886, + "step": 6705 + }, + { + "epoch": 0.7350605247302404, + "grad_norm": 1.234375, + "learning_rate": 3.979220128959463e-06, + "loss": 1.1453, + "step": 6710 + }, + { + "epoch": 0.7356082598455387, + "grad_norm": 1.1640625, + "learning_rate": 3.96396416535911e-06, + "loss": 1.1076, + "step": 6715 + }, + { + "epoch": 0.736155994960837, + "grad_norm": 1.2109375, + "learning_rate": 3.948730270435618e-06, + "loss": 1.1978, + "step": 6720 + }, + { + "epoch": 0.7367037300761352, + "grad_norm": 1.296875, + "learning_rate": 3.933518499886455e-06, + "loss": 1.1705, + "step": 6725 + }, + { + "epoch": 0.7372514651914335, + "grad_norm": 1.203125, + "learning_rate": 3.918328909328198e-06, + "loss": 1.1433, + "step": 6730 + }, + { + "epoch": 0.7377992003067316, + "grad_norm": 1.21875, + "learning_rate": 3.9031615542963305e-06, + "loss": 1.1205, + "step": 6735 + }, + { + "epoch": 0.7383469354220299, + "grad_norm": 1.34375, + "learning_rate": 3.8880164902450375e-06, + "loss": 1.2021, + "step": 6740 + }, + { + "epoch": 0.7388946705373282, + "grad_norm": 1.2109375, + "learning_rate": 3.8728937725470084e-06, + "loss": 1.1087, + "step": 6745 + }, + { + "epoch": 0.7394424056526264, + "grad_norm": 1.203125, + "learning_rate": 3.857793456493226e-06, + "loss": 1.1444, + "step": 6750 + }, + { + "epoch": 0.7399901407679247, + "grad_norm": 1.21875, + "learning_rate": 3.842715597292773e-06, + "loss": 1.1848, + "step": 6755 + }, + { + "epoch": 0.7405378758832228, + "grad_norm": 1.2734375, + "learning_rate": 3.8276602500726265e-06, + "loss": 1.1686, + "step": 6760 + }, + { + "epoch": 0.7410856109985211, + "grad_norm": 1.1953125, + "learning_rate": 3.8126274698774513e-06, + "loss": 1.1225, + "step": 6765 + }, + { + "epoch": 0.7416333461138194, + "grad_norm": 1.2265625, + "learning_rate": 3.797617311669417e-06, + "loss": 1.2015, + "step": 6770 + }, + { + "epoch": 0.7421810812291176, + "grad_norm": 1.2734375, + "learning_rate": 3.7826298303279652e-06, + "loss": 1.1643, + "step": 6775 + }, + { + "epoch": 0.7427288163444158, + "grad_norm": 1.234375, + "learning_rate": 3.7676650806496386e-06, + "loss": 1.1988, + "step": 6780 + }, + { + "epoch": 0.743276551459714, + "grad_norm": 1.2265625, + "learning_rate": 3.7527231173478684e-06, + "loss": 1.1904, + "step": 6785 + }, + { + "epoch": 0.7438242865750123, + "grad_norm": 1.1796875, + "learning_rate": 3.7378039950527733e-06, + "loss": 1.1832, + "step": 6790 + }, + { + "epoch": 0.7443720216903106, + "grad_norm": 1.2265625, + "learning_rate": 3.722907768310965e-06, + "loss": 1.1459, + "step": 6795 + }, + { + "epoch": 0.7449197568056088, + "grad_norm": 1.296875, + "learning_rate": 3.7080344915853406e-06, + "loss": 1.1343, + "step": 6800 + }, + { + "epoch": 0.745467491920907, + "grad_norm": 1.3359375, + "learning_rate": 3.6931842192548884e-06, + "loss": 1.161, + "step": 6805 + }, + { + "epoch": 0.7460152270362053, + "grad_norm": 1.2109375, + "learning_rate": 3.6783570056145025e-06, + "loss": 1.089, + "step": 6810 + }, + { + "epoch": 0.7465629621515035, + "grad_norm": 1.2109375, + "learning_rate": 3.6635529048747498e-06, + "loss": 1.1435, + "step": 6815 + }, + { + "epoch": 0.7471106972668018, + "grad_norm": 1.203125, + "learning_rate": 3.648771971161702e-06, + "loss": 1.1859, + "step": 6820 + }, + { + "epoch": 0.7476584323821001, + "grad_norm": 1.2265625, + "learning_rate": 3.6340142585167415e-06, + "loss": 1.1555, + "step": 6825 + }, + { + "epoch": 0.7482061674973982, + "grad_norm": 1.171875, + "learning_rate": 3.6192798208963255e-06, + "loss": 1.1323, + "step": 6830 + }, + { + "epoch": 0.7487539026126965, + "grad_norm": 1.2109375, + "learning_rate": 3.6045687121718365e-06, + "loss": 1.1747, + "step": 6835 + }, + { + "epoch": 0.7493016377279947, + "grad_norm": 1.2421875, + "learning_rate": 3.589880986129356e-06, + "loss": 1.1286, + "step": 6840 + }, + { + "epoch": 0.749849372843293, + "grad_norm": 1.2734375, + "learning_rate": 3.5752166964694613e-06, + "loss": 1.1288, + "step": 6845 + }, + { + "epoch": 0.7503971079585913, + "grad_norm": 1.21875, + "learning_rate": 3.5605758968070658e-06, + "loss": 1.2077, + "step": 6850 + }, + { + "epoch": 0.7509448430738894, + "grad_norm": 1.140625, + "learning_rate": 3.5459586406711875e-06, + "loss": 1.0527, + "step": 6855 + }, + { + "epoch": 0.7514925781891877, + "grad_norm": 1.234375, + "learning_rate": 3.531364981504758e-06, + "loss": 1.1637, + "step": 6860 + }, + { + "epoch": 0.752040313304486, + "grad_norm": 1.3125, + "learning_rate": 3.5167949726644545e-06, + "loss": 1.1552, + "step": 6865 + }, + { + "epoch": 0.7525880484197842, + "grad_norm": 1.203125, + "learning_rate": 3.5022486674204626e-06, + "loss": 1.1109, + "step": 6870 + }, + { + "epoch": 0.7531357835350825, + "grad_norm": 1.234375, + "learning_rate": 3.4877261189563237e-06, + "loss": 1.1444, + "step": 6875 + }, + { + "epoch": 0.7536835186503806, + "grad_norm": 1.265625, + "learning_rate": 3.4732273803687133e-06, + "loss": 1.1577, + "step": 6880 + }, + { + "epoch": 0.7542312537656789, + "grad_norm": 1.234375, + "learning_rate": 3.4587525046672433e-06, + "loss": 1.1359, + "step": 6885 + }, + { + "epoch": 0.7547789888809772, + "grad_norm": 1.25, + "learning_rate": 3.4443015447742977e-06, + "loss": 1.1344, + "step": 6890 + }, + { + "epoch": 0.7553267239962754, + "grad_norm": 1.1953125, + "learning_rate": 3.4298745535248135e-06, + "loss": 1.1513, + "step": 6895 + }, + { + "epoch": 0.7558744591115737, + "grad_norm": 1.2109375, + "learning_rate": 3.4154715836660814e-06, + "loss": 1.1569, + "step": 6900 + }, + { + "epoch": 0.7564221942268718, + "grad_norm": 1.2265625, + "learning_rate": 3.40109268785759e-06, + "loss": 1.1421, + "step": 6905 + }, + { + "epoch": 0.7569699293421701, + "grad_norm": 1.2578125, + "learning_rate": 3.3867379186707927e-06, + "loss": 1.1312, + "step": 6910 + }, + { + "epoch": 0.7575176644574684, + "grad_norm": 1.28125, + "learning_rate": 3.372407328588938e-06, + "loss": 1.1694, + "step": 6915 + }, + { + "epoch": 0.7580653995727666, + "grad_norm": 1.2578125, + "learning_rate": 3.35810097000687e-06, + "loss": 1.1709, + "step": 6920 + }, + { + "epoch": 0.7586131346880649, + "grad_norm": 1.203125, + "learning_rate": 3.3438188952308403e-06, + "loss": 1.15, + "step": 6925 + }, + { + "epoch": 0.7591608698033631, + "grad_norm": 1.2265625, + "learning_rate": 3.3295611564783136e-06, + "loss": 1.0951, + "step": 6930 + }, + { + "epoch": 0.7597086049186613, + "grad_norm": 1.2578125, + "learning_rate": 3.315327805877784e-06, + "loss": 1.2345, + "step": 6935 + }, + { + "epoch": 0.7602563400339596, + "grad_norm": 1.2890625, + "learning_rate": 3.3011188954685626e-06, + "loss": 1.1564, + "step": 6940 + }, + { + "epoch": 0.7608040751492579, + "grad_norm": 1.265625, + "learning_rate": 3.286934477200624e-06, + "loss": 1.1598, + "step": 6945 + }, + { + "epoch": 0.761351810264556, + "grad_norm": 1.2109375, + "learning_rate": 3.2727746029343856e-06, + "loss": 1.1203, + "step": 6950 + }, + { + "epoch": 0.7618995453798543, + "grad_norm": 1.2421875, + "learning_rate": 3.258639324440527e-06, + "loss": 1.1721, + "step": 6955 + }, + { + "epoch": 0.7624472804951525, + "grad_norm": 1.234375, + "learning_rate": 3.2445286933998044e-06, + "loss": 1.1528, + "step": 6960 + }, + { + "epoch": 0.7629950156104508, + "grad_norm": 1.21875, + "learning_rate": 3.23044276140286e-06, + "loss": 1.124, + "step": 6965 + }, + { + "epoch": 0.7635427507257491, + "grad_norm": 1.234375, + "learning_rate": 3.2163815799500288e-06, + "loss": 1.1527, + "step": 6970 + }, + { + "epoch": 0.7640904858410472, + "grad_norm": 1.2109375, + "learning_rate": 3.202345200451158e-06, + "loss": 1.103, + "step": 6975 + }, + { + "epoch": 0.7646382209563455, + "grad_norm": 1.234375, + "learning_rate": 3.1883336742254124e-06, + "loss": 1.16, + "step": 6980 + }, + { + "epoch": 0.7651859560716437, + "grad_norm": 1.1953125, + "learning_rate": 3.174347052501091e-06, + "loss": 1.1707, + "step": 6985 + }, + { + "epoch": 0.765733691186942, + "grad_norm": 1.25, + "learning_rate": 3.160385386415438e-06, + "loss": 1.1882, + "step": 6990 + }, + { + "epoch": 0.7662814263022403, + "grad_norm": 1.2578125, + "learning_rate": 3.146448727014454e-06, + "loss": 1.1877, + "step": 6995 + }, + { + "epoch": 0.7668291614175384, + "grad_norm": 1.2109375, + "learning_rate": 3.1325371252527127e-06, + "loss": 1.1277, + "step": 7000 + }, + { + "epoch": 0.7673768965328367, + "grad_norm": 1.2890625, + "learning_rate": 3.1186506319931754e-06, + "loss": 1.1748, + "step": 7005 + }, + { + "epoch": 0.767924631648135, + "grad_norm": 1.2578125, + "learning_rate": 3.1047892980069984e-06, + "loss": 1.2315, + "step": 7010 + }, + { + "epoch": 0.7684723667634332, + "grad_norm": 1.328125, + "learning_rate": 3.0909531739733544e-06, + "loss": 1.2003, + "step": 7015 + }, + { + "epoch": 0.7690201018787315, + "grad_norm": 1.2265625, + "learning_rate": 3.0771423104792454e-06, + "loss": 1.1535, + "step": 7020 + }, + { + "epoch": 0.7695678369940296, + "grad_norm": 1.2578125, + "learning_rate": 3.0633567580193147e-06, + "loss": 1.1765, + "step": 7025 + }, + { + "epoch": 0.7701155721093279, + "grad_norm": 1.234375, + "learning_rate": 3.049596566995666e-06, + "loss": 1.1403, + "step": 7030 + }, + { + "epoch": 0.7706633072246262, + "grad_norm": 1.25, + "learning_rate": 3.0358617877176787e-06, + "loss": 1.1534, + "step": 7035 + }, + { + "epoch": 0.7712110423399244, + "grad_norm": 1.203125, + "learning_rate": 3.0221524704018212e-06, + "loss": 1.1534, + "step": 7040 + }, + { + "epoch": 0.7717587774552227, + "grad_norm": 1.25, + "learning_rate": 3.0084686651714647e-06, + "loss": 1.1507, + "step": 7045 + }, + { + "epoch": 0.7723065125705209, + "grad_norm": 1.1640625, + "learning_rate": 2.99481042205672e-06, + "loss": 1.1906, + "step": 7050 + }, + { + "epoch": 0.7728542476858191, + "grad_norm": 1.2421875, + "learning_rate": 2.9811777909942185e-06, + "loss": 1.1704, + "step": 7055 + }, + { + "epoch": 0.7734019828011174, + "grad_norm": 1.328125, + "learning_rate": 2.9675708218269596e-06, + "loss": 1.1865, + "step": 7060 + }, + { + "epoch": 0.7739497179164156, + "grad_norm": 1.25, + "learning_rate": 2.9539895643041283e-06, + "loss": 1.1695, + "step": 7065 + }, + { + "epoch": 0.7744974530317139, + "grad_norm": 1.25, + "learning_rate": 2.940434068080883e-06, + "loss": 1.1652, + "step": 7070 + }, + { + "epoch": 0.7750451881470121, + "grad_norm": 1.234375, + "learning_rate": 2.9269043827182084e-06, + "loss": 1.1423, + "step": 7075 + }, + { + "epoch": 0.7755929232623103, + "grad_norm": 1.265625, + "learning_rate": 2.9134005576827273e-06, + "loss": 1.1913, + "step": 7080 + }, + { + "epoch": 0.7761406583776086, + "grad_norm": 1.25, + "learning_rate": 2.899922642346491e-06, + "loss": 1.1596, + "step": 7085 + }, + { + "epoch": 0.7766883934929069, + "grad_norm": 1.21875, + "learning_rate": 2.8864706859868443e-06, + "loss": 1.1581, + "step": 7090 + }, + { + "epoch": 0.7772361286082051, + "grad_norm": 1.265625, + "learning_rate": 2.8730447377862116e-06, + "loss": 1.1263, + "step": 7095 + }, + { + "epoch": 0.7777838637235033, + "grad_norm": 1.3125, + "learning_rate": 2.859644846831918e-06, + "loss": 1.1653, + "step": 7100 + }, + { + "epoch": 0.7783315988388015, + "grad_norm": 1.21875, + "learning_rate": 2.8462710621160427e-06, + "loss": 1.1609, + "step": 7105 + }, + { + "epoch": 0.7788793339540998, + "grad_norm": 1.265625, + "learning_rate": 2.8329234325351917e-06, + "loss": 1.0972, + "step": 7110 + }, + { + "epoch": 0.7794270690693981, + "grad_norm": 1.234375, + "learning_rate": 2.8196020068903574e-06, + "loss": 1.1194, + "step": 7115 + }, + { + "epoch": 0.7799748041846963, + "grad_norm": 1.21875, + "learning_rate": 2.8063068338867315e-06, + "loss": 1.17, + "step": 7120 + }, + { + "epoch": 0.7805225392999945, + "grad_norm": 1.203125, + "learning_rate": 2.7930379621335035e-06, + "loss": 1.1294, + "step": 7125 + }, + { + "epoch": 0.7810702744152928, + "grad_norm": 1.1875, + "learning_rate": 2.779795440143721e-06, + "loss": 1.1411, + "step": 7130 + }, + { + "epoch": 0.781618009530591, + "grad_norm": 1.2890625, + "learning_rate": 2.766579316334085e-06, + "loss": 1.2037, + "step": 7135 + }, + { + "epoch": 0.7821657446458893, + "grad_norm": 1.234375, + "learning_rate": 2.7533896390247706e-06, + "loss": 1.1298, + "step": 7140 + }, + { + "epoch": 0.7827134797611874, + "grad_norm": 1.171875, + "learning_rate": 2.7402264564392788e-06, + "loss": 1.1657, + "step": 7145 + }, + { + "epoch": 0.7832612148764857, + "grad_norm": 1.25, + "learning_rate": 2.7270898167042325e-06, + "loss": 1.1313, + "step": 7150 + }, + { + "epoch": 0.783808949991784, + "grad_norm": 1.2109375, + "learning_rate": 2.7139797678492032e-06, + "loss": 1.1042, + "step": 7155 + }, + { + "epoch": 0.7843566851070822, + "grad_norm": 1.203125, + "learning_rate": 2.700896357806555e-06, + "loss": 1.2117, + "step": 7160 + }, + { + "epoch": 0.7849044202223805, + "grad_norm": 1.2578125, + "learning_rate": 2.6878396344112513e-06, + "loss": 1.1121, + "step": 7165 + }, + { + "epoch": 0.7854521553376786, + "grad_norm": 1.234375, + "learning_rate": 2.6748096454006812e-06, + "loss": 1.1624, + "step": 7170 + }, + { + "epoch": 0.7859998904529769, + "grad_norm": 1.203125, + "learning_rate": 2.6618064384144925e-06, + "loss": 1.0864, + "step": 7175 + }, + { + "epoch": 0.7865476255682752, + "grad_norm": 1.2109375, + "learning_rate": 2.648830060994414e-06, + "loss": 1.1666, + "step": 7180 + }, + { + "epoch": 0.7870953606835734, + "grad_norm": 1.2109375, + "learning_rate": 2.63588056058408e-06, + "loss": 1.0627, + "step": 7185 + }, + { + "epoch": 0.7876430957988717, + "grad_norm": 1.2265625, + "learning_rate": 2.622957984528861e-06, + "loss": 1.1774, + "step": 7190 + }, + { + "epoch": 0.7881908309141699, + "grad_norm": 1.1953125, + "learning_rate": 2.6100623800756787e-06, + "loss": 1.117, + "step": 7195 + }, + { + "epoch": 0.7887385660294681, + "grad_norm": 1.2578125, + "learning_rate": 2.5971937943728563e-06, + "loss": 1.1719, + "step": 7200 + }, + { + "epoch": 0.7892863011447664, + "grad_norm": 1.1875, + "learning_rate": 2.5843522744699246e-06, + "loss": 1.1124, + "step": 7205 + }, + { + "epoch": 0.7898340362600647, + "grad_norm": 1.1640625, + "learning_rate": 2.5715378673174575e-06, + "loss": 1.1658, + "step": 7210 + }, + { + "epoch": 0.7903817713753629, + "grad_norm": 1.2578125, + "learning_rate": 2.558750619766902e-06, + "loss": 1.1308, + "step": 7215 + }, + { + "epoch": 0.7909295064906611, + "grad_norm": 1.1953125, + "learning_rate": 2.545990578570404e-06, + "loss": 1.1377, + "step": 7220 + }, + { + "epoch": 0.7914772416059593, + "grad_norm": 1.203125, + "learning_rate": 2.533257790380642e-06, + "loss": 1.1657, + "step": 7225 + }, + { + "epoch": 0.7920249767212576, + "grad_norm": 1.1953125, + "learning_rate": 2.5205523017506496e-06, + "loss": 1.1398, + "step": 7230 + }, + { + "epoch": 0.7925727118365559, + "grad_norm": 1.3203125, + "learning_rate": 2.5078741591336496e-06, + "loss": 1.193, + "step": 7235 + }, + { + "epoch": 0.7931204469518541, + "grad_norm": 1.21875, + "learning_rate": 2.495223408882886e-06, + "loss": 1.168, + "step": 7240 + }, + { + "epoch": 0.7936681820671523, + "grad_norm": 1.265625, + "learning_rate": 2.4826000972514498e-06, + "loss": 1.1467, + "step": 7245 + }, + { + "epoch": 0.7942159171824505, + "grad_norm": 1.2109375, + "learning_rate": 2.4700042703921132e-06, + "loss": 1.154, + "step": 7250 + }, + { + "epoch": 0.7947636522977488, + "grad_norm": 1.234375, + "learning_rate": 2.4574359743571606e-06, + "loss": 1.1555, + "step": 7255 + }, + { + "epoch": 0.7953113874130471, + "grad_norm": 1.28125, + "learning_rate": 2.4448952550982142e-06, + "loss": 1.1517, + "step": 7260 + }, + { + "epoch": 0.7958591225283453, + "grad_norm": 1.2734375, + "learning_rate": 2.4323821584660846e-06, + "loss": 1.1558, + "step": 7265 + }, + { + "epoch": 0.7964068576436435, + "grad_norm": 1.21875, + "learning_rate": 2.4198967302105712e-06, + "loss": 1.106, + "step": 7270 + }, + { + "epoch": 0.7969545927589418, + "grad_norm": 1.265625, + "learning_rate": 2.4074390159803273e-06, + "loss": 1.17, + "step": 7275 + }, + { + "epoch": 0.79750232787424, + "grad_norm": 1.203125, + "learning_rate": 2.395009061322675e-06, + "loss": 1.1446, + "step": 7280 + }, + { + "epoch": 0.7980500629895383, + "grad_norm": 1.28125, + "learning_rate": 2.3826069116834426e-06, + "loss": 1.1757, + "step": 7285 + }, + { + "epoch": 0.7985977981048366, + "grad_norm": 1.171875, + "learning_rate": 2.370232612406801e-06, + "loss": 1.1398, + "step": 7290 + }, + { + "epoch": 0.7991455332201347, + "grad_norm": 1.3203125, + "learning_rate": 2.3578862087350927e-06, + "loss": 1.1763, + "step": 7295 + }, + { + "epoch": 0.799693268335433, + "grad_norm": 1.234375, + "learning_rate": 2.3455677458086702e-06, + "loss": 1.1386, + "step": 7300 + }, + { + "epoch": 0.8002410034507312, + "grad_norm": 1.203125, + "learning_rate": 2.3332772686657402e-06, + "loss": 1.1302, + "step": 7305 + }, + { + "epoch": 0.8007887385660295, + "grad_norm": 1.1875, + "learning_rate": 2.3210148222421715e-06, + "loss": 1.1224, + "step": 7310 + }, + { + "epoch": 0.8013364736813277, + "grad_norm": 1.21875, + "learning_rate": 2.3087804513713563e-06, + "loss": 1.1453, + "step": 7315 + }, + { + "epoch": 0.8018842087966259, + "grad_norm": 1.265625, + "learning_rate": 2.29657420078405e-06, + "loss": 1.1204, + "step": 7320 + }, + { + "epoch": 0.8024319439119242, + "grad_norm": 1.3359375, + "learning_rate": 2.284396115108174e-06, + "loss": 1.1726, + "step": 7325 + }, + { + "epoch": 0.8029796790272224, + "grad_norm": 1.21875, + "learning_rate": 2.272246238868687e-06, + "loss": 1.1466, + "step": 7330 + }, + { + "epoch": 0.8035274141425207, + "grad_norm": 1.21875, + "learning_rate": 2.2601246164874168e-06, + "loss": 1.1598, + "step": 7335 + }, + { + "epoch": 0.8040751492578189, + "grad_norm": 1.2578125, + "learning_rate": 2.2480312922828717e-06, + "loss": 1.1559, + "step": 7340 + }, + { + "epoch": 0.8046228843731171, + "grad_norm": 1.2734375, + "learning_rate": 2.2359663104701133e-06, + "loss": 1.1813, + "step": 7345 + }, + { + "epoch": 0.8051706194884154, + "grad_norm": 1.234375, + "learning_rate": 2.2239297151605774e-06, + "loss": 1.1431, + "step": 7350 + }, + { + "epoch": 0.8057183546037137, + "grad_norm": 1.2109375, + "learning_rate": 2.2119215503618995e-06, + "loss": 1.1598, + "step": 7355 + }, + { + "epoch": 0.8062660897190119, + "grad_norm": 1.1796875, + "learning_rate": 2.1999418599777935e-06, + "loss": 1.1539, + "step": 7360 + }, + { + "epoch": 0.8068138248343101, + "grad_norm": 1.1796875, + "learning_rate": 2.1879906878078427e-06, + "loss": 1.1623, + "step": 7365 + }, + { + "epoch": 0.8073615599496083, + "grad_norm": 1.1875, + "learning_rate": 2.1760680775473742e-06, + "loss": 1.108, + "step": 7370 + }, + { + "epoch": 0.8079092950649066, + "grad_norm": 1.21875, + "learning_rate": 2.1641740727872963e-06, + "loss": 1.1486, + "step": 7375 + }, + { + "epoch": 0.8084570301802049, + "grad_norm": 1.2265625, + "learning_rate": 2.152308717013911e-06, + "loss": 1.1697, + "step": 7380 + }, + { + "epoch": 0.8090047652955031, + "grad_norm": 1.2109375, + "learning_rate": 2.1404720536087954e-06, + "loss": 1.1243, + "step": 7385 + }, + { + "epoch": 0.8095525004108013, + "grad_norm": 1.2109375, + "learning_rate": 2.128664125848615e-06, + "loss": 1.1442, + "step": 7390 + }, + { + "epoch": 0.8101002355260996, + "grad_norm": 1.21875, + "learning_rate": 2.116884976904965e-06, + "loss": 1.0855, + "step": 7395 + }, + { + "epoch": 0.8106479706413978, + "grad_norm": 1.25, + "learning_rate": 2.105134649844235e-06, + "loss": 1.1801, + "step": 7400 + }, + { + "epoch": 0.8111957057566961, + "grad_norm": 1.1796875, + "learning_rate": 2.093413187627431e-06, + "loss": 1.0925, + "step": 7405 + }, + { + "epoch": 0.8117434408719943, + "grad_norm": 1.2578125, + "learning_rate": 2.081720633110017e-06, + "loss": 1.1548, + "step": 7410 + }, + { + "epoch": 0.8122911759872925, + "grad_norm": 1.2109375, + "learning_rate": 2.070057029041783e-06, + "loss": 1.1049, + "step": 7415 + }, + { + "epoch": 0.8128389111025908, + "grad_norm": 1.2734375, + "learning_rate": 2.0584224180666567e-06, + "loss": 1.184, + "step": 7420 + }, + { + "epoch": 0.813386646217889, + "grad_norm": 1.2109375, + "learning_rate": 2.046816842722571e-06, + "loss": 1.1299, + "step": 7425 + }, + { + "epoch": 0.8139343813331873, + "grad_norm": 1.1796875, + "learning_rate": 2.0352403454412984e-06, + "loss": 1.1397, + "step": 7430 + }, + { + "epoch": 0.8144821164484856, + "grad_norm": 1.1953125, + "learning_rate": 2.0236929685482875e-06, + "loss": 1.1364, + "step": 7435 + }, + { + "epoch": 0.8150298515637837, + "grad_norm": 1.171875, + "learning_rate": 2.012174754262537e-06, + "loss": 1.1209, + "step": 7440 + }, + { + "epoch": 0.815577586679082, + "grad_norm": 1.2421875, + "learning_rate": 2.000685744696409e-06, + "loss": 1.1272, + "step": 7445 + }, + { + "epoch": 0.8161253217943802, + "grad_norm": 1.28125, + "learning_rate": 1.9892259818554915e-06, + "loss": 1.2119, + "step": 7450 + }, + { + "epoch": 0.8166730569096785, + "grad_norm": 1.25, + "learning_rate": 1.9777955076384446e-06, + "loss": 1.1568, + "step": 7455 + }, + { + "epoch": 0.8172207920249768, + "grad_norm": 1.2109375, + "learning_rate": 1.966394363836842e-06, + "loss": 1.1831, + "step": 7460 + }, + { + "epoch": 0.8177685271402749, + "grad_norm": 1.2578125, + "learning_rate": 1.955022592135022e-06, + "loss": 1.1637, + "step": 7465 + }, + { + "epoch": 0.8183162622555732, + "grad_norm": 1.25, + "learning_rate": 1.9436802341099337e-06, + "loss": 1.1363, + "step": 7470 + }, + { + "epoch": 0.8188639973708715, + "grad_norm": 1.234375, + "learning_rate": 1.9323673312309856e-06, + "loss": 1.1293, + "step": 7475 + }, + { + "epoch": 0.8194117324861697, + "grad_norm": 1.21875, + "learning_rate": 1.9210839248598924e-06, + "loss": 1.1667, + "step": 7480 + }, + { + "epoch": 0.819959467601468, + "grad_norm": 1.4140625, + "learning_rate": 1.9098300562505266e-06, + "loss": 1.2437, + "step": 7485 + }, + { + "epoch": 0.8205072027167661, + "grad_norm": 1.1875, + "learning_rate": 1.8986057665487656e-06, + "loss": 1.1136, + "step": 7490 + }, + { + "epoch": 0.8210549378320644, + "grad_norm": 1.2890625, + "learning_rate": 1.887411096792342e-06, + "loss": 1.1329, + "step": 7495 + }, + { + "epoch": 0.8216026729473627, + "grad_norm": 1.2109375, + "learning_rate": 1.8762460879106925e-06, + "loss": 1.1745, + "step": 7500 + }, + { + "epoch": 0.8221504080626609, + "grad_norm": 1.2421875, + "learning_rate": 1.8651107807248091e-06, + "loss": 1.1553, + "step": 7505 + }, + { + "epoch": 0.8226981431779591, + "grad_norm": 1.25, + "learning_rate": 1.8540052159470912e-06, + "loss": 1.1032, + "step": 7510 + }, + { + "epoch": 0.8232458782932573, + "grad_norm": 1.28125, + "learning_rate": 1.8429294341811933e-06, + "loss": 1.1256, + "step": 7515 + }, + { + "epoch": 0.8237936134085556, + "grad_norm": 1.28125, + "learning_rate": 1.8318834759218817e-06, + "loss": 1.1521, + "step": 7520 + }, + { + "epoch": 0.8243413485238539, + "grad_norm": 1.21875, + "learning_rate": 1.8208673815548794e-06, + "loss": 1.1444, + "step": 7525 + }, + { + "epoch": 0.8248890836391521, + "grad_norm": 1.1953125, + "learning_rate": 1.8098811913567248e-06, + "loss": 1.0999, + "step": 7530 + }, + { + "epoch": 0.8254368187544503, + "grad_norm": 1.2109375, + "learning_rate": 1.7989249454946234e-06, + "loss": 1.1726, + "step": 7535 + }, + { + "epoch": 0.8259845538697486, + "grad_norm": 1.3125, + "learning_rate": 1.7879986840262953e-06, + "loss": 1.1836, + "step": 7540 + }, + { + "epoch": 0.8265322889850468, + "grad_norm": 1.3046875, + "learning_rate": 1.7771024468998377e-06, + "loss": 1.1058, + "step": 7545 + }, + { + "epoch": 0.8270800241003451, + "grad_norm": 1.2734375, + "learning_rate": 1.766236273953571e-06, + "loss": 1.1559, + "step": 7550 + }, + { + "epoch": 0.8276277592156434, + "grad_norm": 1.25, + "learning_rate": 1.7554002049158947e-06, + "loss": 1.1613, + "step": 7555 + }, + { + "epoch": 0.8281754943309415, + "grad_norm": 1.2265625, + "learning_rate": 1.7445942794051552e-06, + "loss": 1.1669, + "step": 7560 + }, + { + "epoch": 0.8287232294462398, + "grad_norm": 1.2109375, + "learning_rate": 1.7338185369294725e-06, + "loss": 1.1707, + "step": 7565 + }, + { + "epoch": 0.829270964561538, + "grad_norm": 1.2421875, + "learning_rate": 1.7230730168866216e-06, + "loss": 1.1434, + "step": 7570 + }, + { + "epoch": 0.8298186996768363, + "grad_norm": 1.15625, + "learning_rate": 1.7123577585638861e-06, + "loss": 1.1345, + "step": 7575 + }, + { + "epoch": 0.8303664347921346, + "grad_norm": 1.2578125, + "learning_rate": 1.7016728011378936e-06, + "loss": 1.1141, + "step": 7580 + }, + { + "epoch": 0.8309141699074327, + "grad_norm": 1.3125, + "learning_rate": 1.6910181836744955e-06, + "loss": 1.1606, + "step": 7585 + }, + { + "epoch": 0.831461905022731, + "grad_norm": 1.2890625, + "learning_rate": 1.68039394512862e-06, + "loss": 1.1388, + "step": 7590 + }, + { + "epoch": 0.8320096401380292, + "grad_norm": 1.2421875, + "learning_rate": 1.669800124344112e-06, + "loss": 1.1396, + "step": 7595 + }, + { + "epoch": 0.8325573752533275, + "grad_norm": 1.1796875, + "learning_rate": 1.6592367600536209e-06, + "loss": 1.1241, + "step": 7600 + }, + { + "epoch": 0.8331051103686258, + "grad_norm": 1.2578125, + "learning_rate": 1.648703890878427e-06, + "loss": 1.1524, + "step": 7605 + }, + { + "epoch": 0.8336528454839239, + "grad_norm": 1.3203125, + "learning_rate": 1.6382015553283215e-06, + "loss": 1.1413, + "step": 7610 + }, + { + "epoch": 0.8342005805992222, + "grad_norm": 1.25, + "learning_rate": 1.6277297918014711e-06, + "loss": 1.1485, + "step": 7615 + }, + { + "epoch": 0.8347483157145205, + "grad_norm": 1.2109375, + "learning_rate": 1.6172886385842457e-06, + "loss": 1.1215, + "step": 7620 + }, + { + "epoch": 0.8352960508298187, + "grad_norm": 1.2421875, + "learning_rate": 1.6068781338511131e-06, + "loss": 1.1049, + "step": 7625 + }, + { + "epoch": 0.835843785945117, + "grad_norm": 1.265625, + "learning_rate": 1.5964983156644876e-06, + "loss": 1.1723, + "step": 7630 + }, + { + "epoch": 0.8363915210604151, + "grad_norm": 1.1796875, + "learning_rate": 1.5861492219745733e-06, + "loss": 1.1166, + "step": 7635 + }, + { + "epoch": 0.8369392561757134, + "grad_norm": 1.25, + "learning_rate": 1.575830890619261e-06, + "loss": 1.1591, + "step": 7640 + }, + { + "epoch": 0.8374869912910117, + "grad_norm": 1.234375, + "learning_rate": 1.5655433593239566e-06, + "loss": 1.1433, + "step": 7645 + }, + { + "epoch": 0.8380347264063099, + "grad_norm": 1.2734375, + "learning_rate": 1.5552866657014542e-06, + "loss": 1.1351, + "step": 7650 + }, + { + "epoch": 0.8385824615216082, + "grad_norm": 1.203125, + "learning_rate": 1.5450608472518115e-06, + "loss": 1.1423, + "step": 7655 + }, + { + "epoch": 0.8391301966369064, + "grad_norm": 1.1796875, + "learning_rate": 1.5348659413621946e-06, + "loss": 1.1316, + "step": 7660 + }, + { + "epoch": 0.8396779317522046, + "grad_norm": 1.203125, + "learning_rate": 1.5247019853067501e-06, + "loss": 1.1513, + "step": 7665 + }, + { + "epoch": 0.8402256668675029, + "grad_norm": 1.1953125, + "learning_rate": 1.5145690162464666e-06, + "loss": 1.1162, + "step": 7670 + }, + { + "epoch": 0.8407734019828011, + "grad_norm": 1.1875, + "learning_rate": 1.5044670712290399e-06, + "loss": 1.1247, + "step": 7675 + }, + { + "epoch": 0.8413211370980994, + "grad_norm": 1.2421875, + "learning_rate": 1.4943961871887368e-06, + "loss": 1.1506, + "step": 7680 + }, + { + "epoch": 0.8418688722133976, + "grad_norm": 1.25, + "learning_rate": 1.484356400946264e-06, + "loss": 1.1769, + "step": 7685 + }, + { + "epoch": 0.8424166073286958, + "grad_norm": 1.3125, + "learning_rate": 1.47434774920862e-06, + "loss": 1.1816, + "step": 7690 + }, + { + "epoch": 0.8429643424439941, + "grad_norm": 1.203125, + "learning_rate": 1.4643702685689832e-06, + "loss": 1.1576, + "step": 7695 + }, + { + "epoch": 0.8435120775592924, + "grad_norm": 1.171875, + "learning_rate": 1.4544239955065576e-06, + "loss": 1.109, + "step": 7700 + }, + { + "epoch": 0.8440598126745905, + "grad_norm": 1.2578125, + "learning_rate": 1.444508966386451e-06, + "loss": 1.2185, + "step": 7705 + }, + { + "epoch": 0.8446075477898888, + "grad_norm": 1.2421875, + "learning_rate": 1.4346252174595354e-06, + "loss": 1.1538, + "step": 7710 + }, + { + "epoch": 0.845155282905187, + "grad_norm": 1.265625, + "learning_rate": 1.4247727848623205e-06, + "loss": 1.1246, + "step": 7715 + }, + { + "epoch": 0.8457030180204853, + "grad_norm": 1.265625, + "learning_rate": 1.4149517046168182e-06, + "loss": 1.1427, + "step": 7720 + }, + { + "epoch": 0.8462507531357836, + "grad_norm": 1.265625, + "learning_rate": 1.40516201263041e-06, + "loss": 1.1379, + "step": 7725 + }, + { + "epoch": 0.8467984882510817, + "grad_norm": 1.21875, + "learning_rate": 1.395403744695717e-06, + "loss": 1.119, + "step": 7730 + }, + { + "epoch": 0.84734622336638, + "grad_norm": 1.171875, + "learning_rate": 1.385676936490472e-06, + "loss": 1.15, + "step": 7735 + }, + { + "epoch": 0.8478939584816783, + "grad_norm": 1.2421875, + "learning_rate": 1.3759816235773838e-06, + "loss": 1.1581, + "step": 7740 + }, + { + "epoch": 0.8484416935969765, + "grad_norm": 1.203125, + "learning_rate": 1.366317841404009e-06, + "loss": 1.1532, + "step": 7745 + }, + { + "epoch": 0.8489894287122748, + "grad_norm": 1.21875, + "learning_rate": 1.356685625302625e-06, + "loss": 1.1954, + "step": 7750 + }, + { + "epoch": 0.8495371638275729, + "grad_norm": 1.2265625, + "learning_rate": 1.3470850104900967e-06, + "loss": 1.1654, + "step": 7755 + }, + { + "epoch": 0.8500848989428712, + "grad_norm": 1.3203125, + "learning_rate": 1.3375160320677517e-06, + "loss": 1.2019, + "step": 7760 + }, + { + "epoch": 0.8506326340581695, + "grad_norm": 1.265625, + "learning_rate": 1.3279787250212483e-06, + "loss": 1.132, + "step": 7765 + }, + { + "epoch": 0.8511803691734677, + "grad_norm": 1.2109375, + "learning_rate": 1.3184731242204508e-06, + "loss": 1.2017, + "step": 7770 + }, + { + "epoch": 0.851728104288766, + "grad_norm": 1.2109375, + "learning_rate": 1.3089992644193005e-06, + "loss": 1.1279, + "step": 7775 + }, + { + "epoch": 0.8522758394040642, + "grad_norm": 1.2734375, + "learning_rate": 1.2995571802556872e-06, + "loss": 1.1969, + "step": 7780 + }, + { + "epoch": 0.8528235745193624, + "grad_norm": 1.21875, + "learning_rate": 1.2901469062513262e-06, + "loss": 1.1247, + "step": 7785 + }, + { + "epoch": 0.8533713096346607, + "grad_norm": 1.1875, + "learning_rate": 1.2807684768116292e-06, + "loss": 1.1916, + "step": 7790 + }, + { + "epoch": 0.8539190447499589, + "grad_norm": 1.265625, + "learning_rate": 1.2714219262255777e-06, + "loss": 1.1319, + "step": 7795 + }, + { + "epoch": 0.8544667798652572, + "grad_norm": 1.21875, + "learning_rate": 1.2621072886656005e-06, + "loss": 1.1725, + "step": 7800 + }, + { + "epoch": 0.8550145149805554, + "grad_norm": 1.2265625, + "learning_rate": 1.2528245981874488e-06, + "loss": 1.124, + "step": 7805 + }, + { + "epoch": 0.8555622500958536, + "grad_norm": 1.25, + "learning_rate": 1.2435738887300653e-06, + "loss": 1.1971, + "step": 7810 + }, + { + "epoch": 0.8561099852111519, + "grad_norm": 1.234375, + "learning_rate": 1.2343551941154763e-06, + "loss": 1.1879, + "step": 7815 + }, + { + "epoch": 0.8566577203264502, + "grad_norm": 1.1875, + "learning_rate": 1.225168548048643e-06, + "loss": 1.1339, + "step": 7820 + }, + { + "epoch": 0.8572054554417484, + "grad_norm": 1.2890625, + "learning_rate": 1.2160139841173579e-06, + "loss": 1.195, + "step": 7825 + }, + { + "epoch": 0.8577531905570466, + "grad_norm": 1.203125, + "learning_rate": 1.206891535792124e-06, + "loss": 1.1395, + "step": 7830 + }, + { + "epoch": 0.8583009256723448, + "grad_norm": 1.203125, + "learning_rate": 1.1978012364260116e-06, + "loss": 1.1273, + "step": 7835 + }, + { + "epoch": 0.8588486607876431, + "grad_norm": 1.265625, + "learning_rate": 1.1887431192545573e-06, + "loss": 1.1573, + "step": 7840 + }, + { + "epoch": 0.8593963959029414, + "grad_norm": 1.2578125, + "learning_rate": 1.1797172173956328e-06, + "loss": 1.1382, + "step": 7845 + }, + { + "epoch": 0.8599441310182396, + "grad_norm": 1.28125, + "learning_rate": 1.170723563849323e-06, + "loss": 1.1812, + "step": 7850 + }, + { + "epoch": 0.8604918661335378, + "grad_norm": 1.1953125, + "learning_rate": 1.1617621914978184e-06, + "loss": 1.1322, + "step": 7855 + }, + { + "epoch": 0.861039601248836, + "grad_norm": 1.2109375, + "learning_rate": 1.1528331331052688e-06, + "loss": 1.2111, + "step": 7860 + }, + { + "epoch": 0.8615873363641343, + "grad_norm": 1.2265625, + "learning_rate": 1.1439364213176895e-06, + "loss": 1.114, + "step": 7865 + }, + { + "epoch": 0.8621350714794326, + "grad_norm": 1.2421875, + "learning_rate": 1.135072088662833e-06, + "loss": 1.1716, + "step": 7870 + }, + { + "epoch": 0.8626828065947307, + "grad_norm": 1.1953125, + "learning_rate": 1.126240167550059e-06, + "loss": 1.1152, + "step": 7875 + }, + { + "epoch": 0.863230541710029, + "grad_norm": 1.203125, + "learning_rate": 1.1174406902702362e-06, + "loss": 1.1209, + "step": 7880 + }, + { + "epoch": 0.8637782768253273, + "grad_norm": 1.25, + "learning_rate": 1.1086736889956107e-06, + "loss": 1.1278, + "step": 7885 + }, + { + "epoch": 0.8643260119406255, + "grad_norm": 1.2109375, + "learning_rate": 1.0999391957796824e-06, + "loss": 1.1796, + "step": 7890 + }, + { + "epoch": 0.8648737470559238, + "grad_norm": 1.1796875, + "learning_rate": 1.0912372425571095e-06, + "loss": 1.1365, + "step": 7895 + }, + { + "epoch": 0.865421482171222, + "grad_norm": 1.375, + "learning_rate": 1.0825678611435753e-06, + "loss": 1.1597, + "step": 7900 + }, + { + "epoch": 0.8659692172865202, + "grad_norm": 1.234375, + "learning_rate": 1.0739310832356664e-06, + "loss": 1.1346, + "step": 7905 + }, + { + "epoch": 0.8665169524018185, + "grad_norm": 1.234375, + "learning_rate": 1.0653269404107802e-06, + "loss": 1.124, + "step": 7910 + }, + { + "epoch": 0.8670646875171167, + "grad_norm": 1.2890625, + "learning_rate": 1.056755464126985e-06, + "loss": 1.1831, + "step": 7915 + }, + { + "epoch": 0.867612422632415, + "grad_norm": 1.3046875, + "learning_rate": 1.0482166857229204e-06, + "loss": 1.1719, + "step": 7920 + }, + { + "epoch": 0.8681601577477132, + "grad_norm": 1.21875, + "learning_rate": 1.0397106364176768e-06, + "loss": 1.1345, + "step": 7925 + }, + { + "epoch": 0.8687078928630114, + "grad_norm": 1.28125, + "learning_rate": 1.0312373473106741e-06, + "loss": 1.1847, + "step": 7930 + }, + { + "epoch": 0.8692556279783097, + "grad_norm": 1.2265625, + "learning_rate": 1.0227968493815698e-06, + "loss": 1.1735, + "step": 7935 + }, + { + "epoch": 0.869803363093608, + "grad_norm": 1.25, + "learning_rate": 1.0143891734901235e-06, + "loss": 1.1974, + "step": 7940 + }, + { + "epoch": 0.8703510982089062, + "grad_norm": 1.21875, + "learning_rate": 1.0060143503760877e-06, + "loss": 1.1209, + "step": 7945 + }, + { + "epoch": 0.8708988333242044, + "grad_norm": 1.234375, + "learning_rate": 9.976724106591128e-07, + "loss": 1.1643, + "step": 7950 + }, + { + "epoch": 0.8714465684395026, + "grad_norm": 1.234375, + "learning_rate": 9.89363384838613e-07, + "loss": 1.1561, + "step": 7955 + }, + { + "epoch": 0.8719943035548009, + "grad_norm": 1.203125, + "learning_rate": 9.810873032936653e-07, + "loss": 1.154, + "step": 7960 + }, + { + "epoch": 0.8725420386700992, + "grad_norm": 1.328125, + "learning_rate": 9.728441962829006e-07, + "loss": 1.1731, + "step": 7965 + }, + { + "epoch": 0.8730897737853974, + "grad_norm": 1.2109375, + "learning_rate": 9.646340939443877e-07, + "loss": 1.1772, + "step": 7970 + }, + { + "epoch": 0.8736375089006956, + "grad_norm": 1.234375, + "learning_rate": 9.564570262955252e-07, + "loss": 1.1721, + "step": 7975 + }, + { + "epoch": 0.8741852440159938, + "grad_norm": 1.2109375, + "learning_rate": 9.483130232329307e-07, + "loss": 1.1366, + "step": 7980 + }, + { + "epoch": 0.8747329791312921, + "grad_norm": 1.1953125, + "learning_rate": 9.40202114532337e-07, + "loss": 1.1504, + "step": 7985 + }, + { + "epoch": 0.8752807142465904, + "grad_norm": 1.234375, + "learning_rate": 9.321243298484728e-07, + "loss": 1.1317, + "step": 7990 + }, + { + "epoch": 0.8758284493618886, + "grad_norm": 1.234375, + "learning_rate": 9.240796987149658e-07, + "loss": 1.1397, + "step": 7995 + }, + { + "epoch": 0.8763761844771868, + "grad_norm": 1.203125, + "learning_rate": 9.160682505442242e-07, + "loss": 1.15, + "step": 8000 + }, + { + "epoch": 0.8769239195924851, + "grad_norm": 1.28125, + "learning_rate": 9.080900146273386e-07, + "loss": 1.1657, + "step": 8005 + }, + { + "epoch": 0.8774716547077833, + "grad_norm": 1.28125, + "learning_rate": 9.001450201339679e-07, + "loss": 1.1981, + "step": 8010 + }, + { + "epoch": 0.8780193898230816, + "grad_norm": 1.25, + "learning_rate": 8.92233296112236e-07, + "loss": 1.1738, + "step": 8015 + }, + { + "epoch": 0.8785671249383799, + "grad_norm": 1.3828125, + "learning_rate": 8.843548714886252e-07, + "loss": 1.0631, + "step": 8020 + }, + { + "epoch": 0.879114860053678, + "grad_norm": 1.2265625, + "learning_rate": 8.765097750678675e-07, + "loss": 1.1533, + "step": 8025 + }, + { + "epoch": 0.8796625951689763, + "grad_norm": 1.21875, + "learning_rate": 8.686980355328467e-07, + "loss": 1.1551, + "step": 8030 + }, + { + "epoch": 0.8802103302842745, + "grad_norm": 1.2734375, + "learning_rate": 8.609196814444843e-07, + "loss": 1.1485, + "step": 8035 + }, + { + "epoch": 0.8807580653995728, + "grad_norm": 1.2421875, + "learning_rate": 8.531747412416424e-07, + "loss": 1.14, + "step": 8040 + }, + { + "epoch": 0.8813058005148711, + "grad_norm": 1.21875, + "learning_rate": 8.454632432410137e-07, + "loss": 1.1257, + "step": 8045 + }, + { + "epoch": 0.8818535356301692, + "grad_norm": 1.21875, + "learning_rate": 8.377852156370236e-07, + "loss": 1.1692, + "step": 8050 + }, + { + "epoch": 0.8824012707454675, + "grad_norm": 1.296875, + "learning_rate": 8.301406865017247e-07, + "loss": 1.1838, + "step": 8055 + }, + { + "epoch": 0.8829490058607657, + "grad_norm": 1.234375, + "learning_rate": 8.225296837846919e-07, + "loss": 1.1881, + "step": 8060 + }, + { + "epoch": 0.883496740976064, + "grad_norm": 1.2265625, + "learning_rate": 8.149522353129224e-07, + "loss": 1.1945, + "step": 8065 + }, + { + "epoch": 0.8840444760913622, + "grad_norm": 1.265625, + "learning_rate": 8.074083687907408e-07, + "loss": 1.1544, + "step": 8070 + }, + { + "epoch": 0.8845922112066604, + "grad_norm": 1.34375, + "learning_rate": 7.998981117996796e-07, + "loss": 1.1729, + "step": 8075 + }, + { + "epoch": 0.8851399463219587, + "grad_norm": 1.3203125, + "learning_rate": 7.924214917983952e-07, + "loss": 1.1486, + "step": 8080 + }, + { + "epoch": 0.885687681437257, + "grad_norm": 1.2109375, + "learning_rate": 7.84978536122567e-07, + "loss": 1.2001, + "step": 8085 + }, + { + "epoch": 0.8862354165525552, + "grad_norm": 1.234375, + "learning_rate": 7.775692719847816e-07, + "loss": 1.1905, + "step": 8090 + }, + { + "epoch": 0.8867831516678534, + "grad_norm": 1.1953125, + "learning_rate": 7.701937264744564e-07, + "loss": 1.1848, + "step": 8095 + }, + { + "epoch": 0.8873308867831516, + "grad_norm": 1.2421875, + "learning_rate": 7.628519265577162e-07, + "loss": 1.1559, + "step": 8100 + }, + { + "epoch": 0.8878786218984499, + "grad_norm": 1.3125, + "learning_rate": 7.555438990773134e-07, + "loss": 1.1799, + "step": 8105 + }, + { + "epoch": 0.8884263570137482, + "grad_norm": 1.234375, + "learning_rate": 7.482696707525272e-07, + "loss": 1.2088, + "step": 8110 + }, + { + "epoch": 0.8889740921290464, + "grad_norm": 1.1875, + "learning_rate": 7.41029268179052e-07, + "loss": 1.1298, + "step": 8115 + }, + { + "epoch": 0.8895218272443446, + "grad_norm": 1.296875, + "learning_rate": 7.338227178289148e-07, + "loss": 1.1782, + "step": 8120 + }, + { + "epoch": 0.8900695623596429, + "grad_norm": 1.2578125, + "learning_rate": 7.266500460503778e-07, + "loss": 1.2125, + "step": 8125 + }, + { + "epoch": 0.8906172974749411, + "grad_norm": 1.21875, + "learning_rate": 7.195112790678293e-07, + "loss": 1.1491, + "step": 8130 + }, + { + "epoch": 0.8911650325902394, + "grad_norm": 1.21875, + "learning_rate": 7.124064429817057e-07, + "loss": 1.1438, + "step": 8135 + }, + { + "epoch": 0.8917127677055376, + "grad_norm": 1.2734375, + "learning_rate": 7.053355637683801e-07, + "loss": 1.1473, + "step": 8140 + }, + { + "epoch": 0.8922605028208358, + "grad_norm": 1.203125, + "learning_rate": 6.98298667280074e-07, + "loss": 1.2112, + "step": 8145 + }, + { + "epoch": 0.8928082379361341, + "grad_norm": 1.1953125, + "learning_rate": 6.912957792447683e-07, + "loss": 1.1886, + "step": 8150 + }, + { + "epoch": 0.8933559730514323, + "grad_norm": 1.2734375, + "learning_rate": 6.843269252661011e-07, + "loss": 1.1485, + "step": 8155 + }, + { + "epoch": 0.8939037081667306, + "grad_norm": 1.2265625, + "learning_rate": 6.773921308232701e-07, + "loss": 1.1473, + "step": 8160 + }, + { + "epoch": 0.8944514432820289, + "grad_norm": 1.2578125, + "learning_rate": 6.70491421270959e-07, + "loss": 1.1345, + "step": 8165 + }, + { + "epoch": 0.894999178397327, + "grad_norm": 1.2265625, + "learning_rate": 6.636248218392216e-07, + "loss": 1.1708, + "step": 8170 + }, + { + "epoch": 0.8955469135126253, + "grad_norm": 1.265625, + "learning_rate": 6.567923576334034e-07, + "loss": 1.1519, + "step": 8175 + }, + { + "epoch": 0.8960946486279235, + "grad_norm": 1.21875, + "learning_rate": 6.499940536340488e-07, + "loss": 1.1515, + "step": 8180 + }, + { + "epoch": 0.8966423837432218, + "grad_norm": 1.3125, + "learning_rate": 6.432299346967996e-07, + "loss": 1.1465, + "step": 8185 + }, + { + "epoch": 0.8971901188585201, + "grad_norm": 1.28125, + "learning_rate": 6.365000255523202e-07, + "loss": 1.1477, + "step": 8190 + }, + { + "epoch": 0.8977378539738182, + "grad_norm": 1.21875, + "learning_rate": 6.298043508061946e-07, + "loss": 1.163, + "step": 8195 + }, + { + "epoch": 0.8982855890891165, + "grad_norm": 1.203125, + "learning_rate": 6.231429349388396e-07, + "loss": 1.1559, + "step": 8200 + }, + { + "epoch": 0.8988333242044148, + "grad_norm": 1.1875, + "learning_rate": 6.165158023054196e-07, + "loss": 1.1445, + "step": 8205 + }, + { + "epoch": 0.899381059319713, + "grad_norm": 1.25, + "learning_rate": 6.099229771357517e-07, + "loss": 1.1405, + "step": 8210 + }, + { + "epoch": 0.8999287944350113, + "grad_norm": 1.234375, + "learning_rate": 6.0336448353422e-07, + "loss": 1.1287, + "step": 8215 + }, + { + "epoch": 0.9004765295503094, + "grad_norm": 1.234375, + "learning_rate": 5.968403454796889e-07, + "loss": 1.1489, + "step": 8220 + }, + { + "epoch": 0.9010242646656077, + "grad_norm": 1.2421875, + "learning_rate": 5.9035058682541e-07, + "loss": 1.1511, + "step": 8225 + }, + { + "epoch": 0.901571999780906, + "grad_norm": 1.2421875, + "learning_rate": 5.838952312989432e-07, + "loss": 1.1326, + "step": 8230 + }, + { + "epoch": 0.9021197348962042, + "grad_norm": 1.265625, + "learning_rate": 5.774743025020602e-07, + "loss": 1.1529, + "step": 8235 + }, + { + "epoch": 0.9026674700115024, + "grad_norm": 1.234375, + "learning_rate": 5.710878239106677e-07, + "loss": 1.1237, + "step": 8240 + }, + { + "epoch": 0.9032152051268006, + "grad_norm": 1.2265625, + "learning_rate": 5.647358188747143e-07, + "loss": 1.1941, + "step": 8245 + }, + { + "epoch": 0.9037629402420989, + "grad_norm": 1.28125, + "learning_rate": 5.584183106181085e-07, + "loss": 1.1752, + "step": 8250 + }, + { + "epoch": 0.9043106753573972, + "grad_norm": 1.1796875, + "learning_rate": 5.521353222386361e-07, + "loss": 1.1237, + "step": 8255 + }, + { + "epoch": 0.9048584104726954, + "grad_norm": 1.1953125, + "learning_rate": 5.458868767078673e-07, + "loss": 1.0955, + "step": 8260 + }, + { + "epoch": 0.9054061455879936, + "grad_norm": 1.171875, + "learning_rate": 5.396729968710835e-07, + "loss": 1.1359, + "step": 8265 + }, + { + "epoch": 0.9059538807032919, + "grad_norm": 1.25, + "learning_rate": 5.334937054471867e-07, + "loss": 1.1522, + "step": 8270 + }, + { + "epoch": 0.9065016158185901, + "grad_norm": 1.2578125, + "learning_rate": 5.273490250286173e-07, + "loss": 1.1596, + "step": 8275 + }, + { + "epoch": 0.9070493509338884, + "grad_norm": 1.2421875, + "learning_rate": 5.212389780812733e-07, + "loss": 1.156, + "step": 8280 + }, + { + "epoch": 0.9075970860491867, + "grad_norm": 1.2734375, + "learning_rate": 5.151635869444293e-07, + "loss": 1.1344, + "step": 8285 + }, + { + "epoch": 0.9081448211644848, + "grad_norm": 1.2421875, + "learning_rate": 5.091228738306497e-07, + "loss": 1.1873, + "step": 8290 + }, + { + "epoch": 0.9086925562797831, + "grad_norm": 1.375, + "learning_rate": 5.03116860825712e-07, + "loss": 1.1077, + "step": 8295 + }, + { + "epoch": 0.9092402913950813, + "grad_norm": 1.2890625, + "learning_rate": 4.971455698885263e-07, + "loss": 1.1709, + "step": 8300 + }, + { + "epoch": 0.9097880265103796, + "grad_norm": 1.1953125, + "learning_rate": 4.912090228510502e-07, + "loss": 1.1968, + "step": 8305 + }, + { + "epoch": 0.9103357616256779, + "grad_norm": 1.203125, + "learning_rate": 4.85307241418218e-07, + "loss": 1.1573, + "step": 8310 + }, + { + "epoch": 0.910883496740976, + "grad_norm": 1.21875, + "learning_rate": 4.794402471678483e-07, + "loss": 1.2055, + "step": 8315 + }, + { + "epoch": 0.9114312318562743, + "grad_norm": 1.21875, + "learning_rate": 4.7360806155057557e-07, + "loss": 1.1618, + "step": 8320 + }, + { + "epoch": 0.9119789669715725, + "grad_norm": 1.1953125, + "learning_rate": 4.6781070588977187e-07, + "loss": 1.1254, + "step": 8325 + }, + { + "epoch": 0.9125267020868708, + "grad_norm": 1.203125, + "learning_rate": 4.620482013814609e-07, + "loss": 1.121, + "step": 8330 + }, + { + "epoch": 0.9130744372021691, + "grad_norm": 1.25, + "learning_rate": 4.5632056909424517e-07, + "loss": 1.1182, + "step": 8335 + }, + { + "epoch": 0.9136221723174672, + "grad_norm": 1.2265625, + "learning_rate": 4.506278299692335e-07, + "loss": 1.167, + "step": 8340 + }, + { + "epoch": 0.9141699074327655, + "grad_norm": 1.296875, + "learning_rate": 4.449700048199546e-07, + "loss": 1.1577, + "step": 8345 + }, + { + "epoch": 0.9147176425480638, + "grad_norm": 1.2109375, + "learning_rate": 4.393471143322925e-07, + "loss": 1.1233, + "step": 8350 + }, + { + "epoch": 0.915265377663362, + "grad_norm": 1.2265625, + "learning_rate": 4.337591790643969e-07, + "loss": 1.1454, + "step": 8355 + }, + { + "epoch": 0.9158131127786603, + "grad_norm": 1.1953125, + "learning_rate": 4.2820621944662077e-07, + "loss": 1.1687, + "step": 8360 + }, + { + "epoch": 0.9163608478939584, + "grad_norm": 1.2109375, + "learning_rate": 4.226882557814438e-07, + "loss": 1.1294, + "step": 8365 + }, + { + "epoch": 0.9169085830092567, + "grad_norm": 1.2265625, + "learning_rate": 4.172053082433858e-07, + "loss": 1.1536, + "step": 8370 + }, + { + "epoch": 0.917456318124555, + "grad_norm": 1.234375, + "learning_rate": 4.117573968789501e-07, + "loss": 1.1602, + "step": 8375 + }, + { + "epoch": 0.9180040532398532, + "grad_norm": 1.21875, + "learning_rate": 4.063445416065415e-07, + "loss": 1.1011, + "step": 8380 + }, + { + "epoch": 0.9185517883551515, + "grad_norm": 1.2578125, + "learning_rate": 4.009667622163882e-07, + "loss": 1.2093, + "step": 8385 + }, + { + "epoch": 0.9190995234704497, + "grad_norm": 1.28125, + "learning_rate": 3.9562407837048566e-07, + "loss": 1.1483, + "step": 8390 + }, + { + "epoch": 0.9196472585857479, + "grad_norm": 1.1953125, + "learning_rate": 3.9031650960250635e-07, + "loss": 1.1597, + "step": 8395 + }, + { + "epoch": 0.9201949937010462, + "grad_norm": 1.1484375, + "learning_rate": 3.850440753177376e-07, + "loss": 1.089, + "step": 8400 + }, + { + "epoch": 0.9207427288163444, + "grad_norm": 1.2578125, + "learning_rate": 3.79806794793014e-07, + "loss": 1.1409, + "step": 8405 + }, + { + "epoch": 0.9212904639316427, + "grad_norm": 1.2578125, + "learning_rate": 3.7460468717663955e-07, + "loss": 1.1505, + "step": 8410 + }, + { + "epoch": 0.9218381990469409, + "grad_norm": 1.234375, + "learning_rate": 3.6943777148831907e-07, + "loss": 1.1877, + "step": 8415 + }, + { + "epoch": 0.9223859341622391, + "grad_norm": 1.2109375, + "learning_rate": 3.6430606661909673e-07, + "loss": 1.1677, + "step": 8420 + }, + { + "epoch": 0.9229336692775374, + "grad_norm": 1.234375, + "learning_rate": 3.5920959133126987e-07, + "loss": 1.1571, + "step": 8425 + }, + { + "epoch": 0.9234814043928357, + "grad_norm": 1.2890625, + "learning_rate": 3.541483642583421e-07, + "loss": 1.1979, + "step": 8430 + }, + { + "epoch": 0.9240291395081338, + "grad_norm": 1.234375, + "learning_rate": 3.491224039049379e-07, + "loss": 1.1641, + "step": 8435 + }, + { + "epoch": 0.9245768746234321, + "grad_norm": 1.21875, + "learning_rate": 3.441317286467416e-07, + "loss": 1.1128, + "step": 8440 + }, + { + "epoch": 0.9251246097387303, + "grad_norm": 1.2109375, + "learning_rate": 3.3917635673043183e-07, + "loss": 1.0991, + "step": 8445 + }, + { + "epoch": 0.9256723448540286, + "grad_norm": 1.2109375, + "learning_rate": 3.3425630627361263e-07, + "loss": 1.2116, + "step": 8450 + }, + { + "epoch": 0.9262200799693269, + "grad_norm": 1.296875, + "learning_rate": 3.293715952647425e-07, + "loss": 1.1897, + "step": 8455 + }, + { + "epoch": 0.926767815084625, + "grad_norm": 1.234375, + "learning_rate": 3.245222415630822e-07, + "loss": 1.1805, + "step": 8460 + }, + { + "epoch": 0.9273155501999233, + "grad_norm": 1.2265625, + "learning_rate": 3.197082628986126e-07, + "loss": 1.1074, + "step": 8465 + }, + { + "epoch": 0.9278632853152216, + "grad_norm": 1.1796875, + "learning_rate": 3.149296768719834e-07, + "loss": 1.0734, + "step": 8470 + }, + { + "epoch": 0.9284110204305198, + "grad_norm": 1.2109375, + "learning_rate": 3.101865009544391e-07, + "loss": 1.1775, + "step": 8475 + }, + { + "epoch": 0.9289587555458181, + "grad_norm": 1.2109375, + "learning_rate": 3.054787524877645e-07, + "loss": 1.1536, + "step": 8480 + }, + { + "epoch": 0.9295064906611162, + "grad_norm": 1.2265625, + "learning_rate": 3.0080644868420996e-07, + "loss": 1.1351, + "step": 8485 + }, + { + "epoch": 0.9300542257764145, + "grad_norm": 1.2265625, + "learning_rate": 2.9616960662643967e-07, + "loss": 1.222, + "step": 8490 + }, + { + "epoch": 0.9306019608917128, + "grad_norm": 1.2421875, + "learning_rate": 2.915682432674627e-07, + "loss": 1.1448, + "step": 8495 + }, + { + "epoch": 0.931149696007011, + "grad_norm": 1.1953125, + "learning_rate": 2.8700237543057173e-07, + "loss": 1.1303, + "step": 8500 + }, + { + "epoch": 0.9316974311223093, + "grad_norm": 1.25, + "learning_rate": 2.824720198092834e-07, + "loss": 1.1381, + "step": 8505 + }, + { + "epoch": 0.9322451662376074, + "grad_norm": 1.2109375, + "learning_rate": 2.7797719296727476e-07, + "loss": 1.1853, + "step": 8510 + }, + { + "epoch": 0.9327929013529057, + "grad_norm": 1.203125, + "learning_rate": 2.7351791133832685e-07, + "loss": 1.1622, + "step": 8515 + }, + { + "epoch": 0.933340636468204, + "grad_norm": 1.2265625, + "learning_rate": 2.69094191226259e-07, + "loss": 1.1323, + "step": 8520 + }, + { + "epoch": 0.9338883715835022, + "grad_norm": 1.2265625, + "learning_rate": 2.647060488048736e-07, + "loss": 1.1172, + "step": 8525 + }, + { + "epoch": 0.9344361066988005, + "grad_norm": 1.2109375, + "learning_rate": 2.603535001178947e-07, + "loss": 1.1416, + "step": 8530 + }, + { + "epoch": 0.9349838418140987, + "grad_norm": 1.234375, + "learning_rate": 2.5603656107891285e-07, + "loss": 1.224, + "step": 8535 + }, + { + "epoch": 0.9355315769293969, + "grad_norm": 1.21875, + "learning_rate": 2.517552474713203e-07, + "loss": 1.1628, + "step": 8540 + }, + { + "epoch": 0.9360793120446952, + "grad_norm": 1.1875, + "learning_rate": 2.4750957494826033e-07, + "loss": 1.1395, + "step": 8545 + }, + { + "epoch": 0.9366270471599935, + "grad_norm": 1.2109375, + "learning_rate": 2.4329955903256376e-07, + "loss": 1.169, + "step": 8550 + }, + { + "epoch": 0.9371747822752917, + "grad_norm": 1.2421875, + "learning_rate": 2.391252151167001e-07, + "loss": 1.1216, + "step": 8555 + }, + { + "epoch": 0.9377225173905899, + "grad_norm": 1.21875, + "learning_rate": 2.349865584627109e-07, + "loss": 1.1221, + "step": 8560 + }, + { + "epoch": 0.9382702525058881, + "grad_norm": 1.2109375, + "learning_rate": 2.308836042021656e-07, + "loss": 1.1322, + "step": 8565 + }, + { + "epoch": 0.9388179876211864, + "grad_norm": 1.2109375, + "learning_rate": 2.2681636733609457e-07, + "loss": 1.1842, + "step": 8570 + }, + { + "epoch": 0.9393657227364847, + "grad_norm": 1.265625, + "learning_rate": 2.2278486273494272e-07, + "loss": 1.1955, + "step": 8575 + }, + { + "epoch": 0.9399134578517829, + "grad_norm": 1.21875, + "learning_rate": 2.1878910513851381e-07, + "loss": 1.2065, + "step": 8580 + }, + { + "epoch": 0.9404611929670811, + "grad_norm": 1.25, + "learning_rate": 2.148291091559107e-07, + "loss": 1.141, + "step": 8585 + }, + { + "epoch": 0.9410089280823793, + "grad_norm": 1.234375, + "learning_rate": 2.1090488926548968e-07, + "loss": 1.145, + "step": 8590 + }, + { + "epoch": 0.9415566631976776, + "grad_norm": 1.2109375, + "learning_rate": 2.070164598148039e-07, + "loss": 1.1382, + "step": 8595 + }, + { + "epoch": 0.9421043983129759, + "grad_norm": 1.2265625, + "learning_rate": 2.0316383502054782e-07, + "loss": 1.1571, + "step": 8600 + }, + { + "epoch": 0.9426521334282741, + "grad_norm": 1.2265625, + "learning_rate": 1.993470289685162e-07, + "loss": 1.1382, + "step": 8605 + }, + { + "epoch": 0.9431998685435723, + "grad_norm": 1.15625, + "learning_rate": 1.9556605561353525e-07, + "loss": 1.146, + "step": 8610 + }, + { + "epoch": 0.9437476036588706, + "grad_norm": 1.25, + "learning_rate": 1.9182092877942705e-07, + "loss": 1.0946, + "step": 8615 + }, + { + "epoch": 0.9442953387741688, + "grad_norm": 1.203125, + "learning_rate": 1.8811166215895405e-07, + "loss": 1.0913, + "step": 8620 + }, + { + "epoch": 0.9448430738894671, + "grad_norm": 1.234375, + "learning_rate": 1.8443826931376474e-07, + "loss": 1.0888, + "step": 8625 + }, + { + "epoch": 0.9453908090047652, + "grad_norm": 1.2421875, + "learning_rate": 1.8080076367434918e-07, + "loss": 1.1671, + "step": 8630 + }, + { + "epoch": 0.9459385441200635, + "grad_norm": 1.2109375, + "learning_rate": 1.7719915853999014e-07, + "loss": 1.1341, + "step": 8635 + }, + { + "epoch": 0.9464862792353618, + "grad_norm": 1.25, + "learning_rate": 1.7363346707870877e-07, + "loss": 1.1377, + "step": 8640 + }, + { + "epoch": 0.94703401435066, + "grad_norm": 1.1953125, + "learning_rate": 1.701037023272234e-07, + "loss": 1.1857, + "step": 8645 + }, + { + "epoch": 0.9475817494659583, + "grad_norm": 1.2265625, + "learning_rate": 1.666098771908986e-07, + "loss": 1.1515, + "step": 8650 + }, + { + "epoch": 0.9481294845812565, + "grad_norm": 1.2265625, + "learning_rate": 1.6315200444369406e-07, + "loss": 1.1915, + "step": 8655 + }, + { + "epoch": 0.9486772196965547, + "grad_norm": 1.2421875, + "learning_rate": 1.59730096728129e-07, + "loss": 1.1233, + "step": 8660 + }, + { + "epoch": 0.949224954811853, + "grad_norm": 1.2265625, + "learning_rate": 1.5634416655522343e-07, + "loss": 1.1587, + "step": 8665 + }, + { + "epoch": 0.9497726899271512, + "grad_norm": 1.1953125, + "learning_rate": 1.5299422630445816e-07, + "loss": 1.1867, + "step": 8670 + }, + { + "epoch": 0.9503204250424495, + "grad_norm": 1.2265625, + "learning_rate": 1.4968028822373471e-07, + "loss": 1.2192, + "step": 8675 + }, + { + "epoch": 0.9508681601577477, + "grad_norm": 1.234375, + "learning_rate": 1.4640236442931665e-07, + "loss": 1.2009, + "step": 8680 + }, + { + "epoch": 0.9514158952730459, + "grad_norm": 1.203125, + "learning_rate": 1.4316046690580178e-07, + "loss": 1.1951, + "step": 8685 + }, + { + "epoch": 0.9519636303883442, + "grad_norm": 1.234375, + "learning_rate": 1.399546075060665e-07, + "loss": 1.1862, + "step": 8690 + }, + { + "epoch": 0.9525113655036425, + "grad_norm": 1.265625, + "learning_rate": 1.367847979512238e-07, + "loss": 1.1432, + "step": 8695 + }, + { + "epoch": 0.9530591006189407, + "grad_norm": 1.2265625, + "learning_rate": 1.3365104983058873e-07, + "loss": 1.1178, + "step": 8700 + }, + { + "epoch": 0.9536068357342389, + "grad_norm": 1.328125, + "learning_rate": 1.3055337460162632e-07, + "loss": 1.1738, + "step": 8705 + }, + { + "epoch": 0.9541545708495371, + "grad_norm": 1.2109375, + "learning_rate": 1.2749178358991477e-07, + "loss": 1.16, + "step": 8710 + }, + { + "epoch": 0.9547023059648354, + "grad_norm": 1.21875, + "learning_rate": 1.244662879891012e-07, + "loss": 1.1101, + "step": 8715 + }, + { + "epoch": 0.9552500410801337, + "grad_norm": 1.2578125, + "learning_rate": 1.21476898860865e-07, + "loss": 1.2003, + "step": 8720 + }, + { + "epoch": 0.9557977761954319, + "grad_norm": 1.2265625, + "learning_rate": 1.185236271348722e-07, + "loss": 1.2025, + "step": 8725 + }, + { + "epoch": 0.9563455113107301, + "grad_norm": 1.21875, + "learning_rate": 1.1560648360874005e-07, + "loss": 1.1886, + "step": 8730 + }, + { + "epoch": 0.9568932464260284, + "grad_norm": 1.3046875, + "learning_rate": 1.1272547894799369e-07, + "loss": 1.1359, + "step": 8735 + }, + { + "epoch": 0.9574409815413266, + "grad_norm": 1.2421875, + "learning_rate": 1.0988062368603059e-07, + "loss": 1.2033, + "step": 8740 + }, + { + "epoch": 0.9579887166566249, + "grad_norm": 1.234375, + "learning_rate": 1.0707192822408063e-07, + "loss": 1.2268, + "step": 8745 + }, + { + "epoch": 0.9585364517719231, + "grad_norm": 1.234375, + "learning_rate": 1.0429940283116613e-07, + "loss": 1.1409, + "step": 8750 + }, + { + "epoch": 0.9590841868872213, + "grad_norm": 1.21875, + "learning_rate": 1.0156305764406627e-07, + "loss": 1.1499, + "step": 8755 + }, + { + "epoch": 0.9596319220025196, + "grad_norm": 1.265625, + "learning_rate": 9.886290266728271e-08, + "loss": 1.1508, + "step": 8760 + }, + { + "epoch": 0.9601796571178178, + "grad_norm": 1.2265625, + "learning_rate": 9.619894777299632e-08, + "loss": 1.168, + "step": 8765 + }, + { + "epoch": 0.9607273922331161, + "grad_norm": 1.15625, + "learning_rate": 9.357120270103715e-08, + "loss": 1.1012, + "step": 8770 + }, + { + "epoch": 0.9612751273484144, + "grad_norm": 1.265625, + "learning_rate": 9.097967705884558e-08, + "loss": 1.1533, + "step": 8775 + }, + { + "epoch": 0.9618228624637125, + "grad_norm": 1.1875, + "learning_rate": 8.842438032143908e-08, + "loss": 1.1602, + "step": 8780 + }, + { + "epoch": 0.9623705975790108, + "grad_norm": 1.2578125, + "learning_rate": 8.590532183137656e-08, + "loss": 1.1479, + "step": 8785 + }, + { + "epoch": 0.962918332694309, + "grad_norm": 1.1953125, + "learning_rate": 8.3422510798723e-08, + "loss": 1.1496, + "step": 8790 + }, + { + "epoch": 0.9634660678096073, + "grad_norm": 1.2734375, + "learning_rate": 8.097595630101818e-08, + "loss": 1.146, + "step": 8795 + }, + { + "epoch": 0.9640138029249055, + "grad_norm": 1.1640625, + "learning_rate": 7.856566728324244e-08, + "loss": 1.0557, + "step": 8800 + }, + { + "epoch": 0.9645615380402037, + "grad_norm": 1.25, + "learning_rate": 7.619165255778327e-08, + "loss": 1.1975, + "step": 8805 + }, + { + "epoch": 0.965109273155502, + "grad_norm": 1.234375, + "learning_rate": 7.385392080440535e-08, + "loss": 1.1624, + "step": 8810 + }, + { + "epoch": 0.9656570082708003, + "grad_norm": 1.1953125, + "learning_rate": 7.155248057021502e-08, + "loss": 1.1225, + "step": 8815 + }, + { + "epoch": 0.9662047433860985, + "grad_norm": 1.3046875, + "learning_rate": 6.928734026963258e-08, + "loss": 1.1341, + "step": 8820 + }, + { + "epoch": 0.9667524785013967, + "grad_norm": 1.25, + "learning_rate": 6.705850818436111e-08, + "loss": 1.1975, + "step": 8825 + }, + { + "epoch": 0.9673002136166949, + "grad_norm": 1.2421875, + "learning_rate": 6.486599246335212e-08, + "loss": 1.1769, + "step": 8830 + }, + { + "epoch": 0.9678479487319932, + "grad_norm": 1.21875, + "learning_rate": 6.270980112278113e-08, + "loss": 1.2048, + "step": 8835 + }, + { + "epoch": 0.9683956838472915, + "grad_norm": 1.2578125, + "learning_rate": 6.058994204601765e-08, + "loss": 1.0985, + "step": 8840 + }, + { + "epoch": 0.9689434189625897, + "grad_norm": 1.21875, + "learning_rate": 5.850642298359188e-08, + "loss": 1.173, + "step": 8845 + }, + { + "epoch": 0.9694911540778879, + "grad_norm": 1.2109375, + "learning_rate": 5.6459251553169226e-08, + "loss": 1.104, + "step": 8850 + }, + { + "epoch": 0.9700388891931861, + "grad_norm": 1.2265625, + "learning_rate": 5.444843523952581e-08, + "loss": 1.158, + "step": 8855 + }, + { + "epoch": 0.9705866243084844, + "grad_norm": 1.2265625, + "learning_rate": 5.2473981394515205e-08, + "loss": 1.1602, + "step": 8860 + }, + { + "epoch": 0.9711343594237827, + "grad_norm": 1.171875, + "learning_rate": 5.0535897237044e-08, + "loss": 1.0855, + "step": 8865 + }, + { + "epoch": 0.9716820945390809, + "grad_norm": 1.328125, + "learning_rate": 4.863418985304735e-08, + "loss": 1.157, + "step": 8870 + }, + { + "epoch": 0.9722298296543791, + "grad_norm": 1.2734375, + "learning_rate": 4.6768866195460175e-08, + "loss": 1.2024, + "step": 8875 + }, + { + "epoch": 0.9727775647696774, + "grad_norm": 1.2265625, + "learning_rate": 4.4939933084192646e-08, + "loss": 1.1865, + "step": 8880 + }, + { + "epoch": 0.9733252998849756, + "grad_norm": 1.203125, + "learning_rate": 4.3147397206106945e-08, + "loss": 1.1473, + "step": 8885 + }, + { + "epoch": 0.9738730350002739, + "grad_norm": 1.1953125, + "learning_rate": 4.1391265114990584e-08, + "loss": 1.1184, + "step": 8890 + }, + { + "epoch": 0.9744207701155722, + "grad_norm": 1.3046875, + "learning_rate": 3.967154323153089e-08, + "loss": 1.1739, + "step": 8895 + }, + { + "epoch": 0.9749685052308703, + "grad_norm": 1.21875, + "learning_rate": 3.79882378432983e-08, + "loss": 1.1479, + "step": 8900 + }, + { + "epoch": 0.9755162403461686, + "grad_norm": 1.2109375, + "learning_rate": 3.634135510471537e-08, + "loss": 1.0821, + "step": 8905 + }, + { + "epoch": 0.9760639754614668, + "grad_norm": 1.25, + "learning_rate": 3.473090103703891e-08, + "loss": 1.1344, + "step": 8910 + }, + { + "epoch": 0.9766117105767651, + "grad_norm": 1.2109375, + "learning_rate": 3.315688152833896e-08, + "loss": 1.1173, + "step": 8915 + }, + { + "epoch": 0.9771594456920634, + "grad_norm": 1.21875, + "learning_rate": 3.161930233347099e-08, + "loss": 1.1412, + "step": 8920 + }, + { + "epoch": 0.9777071808073615, + "grad_norm": 1.2265625, + "learning_rate": 3.0118169074061507e-08, + "loss": 1.1319, + "step": 8925 + }, + { + "epoch": 0.9782549159226598, + "grad_norm": 1.2265625, + "learning_rate": 2.8653487238488044e-08, + "loss": 1.1994, + "step": 8930 + }, + { + "epoch": 0.978802651037958, + "grad_norm": 1.2578125, + "learning_rate": 2.7225262181849177e-08, + "loss": 1.1638, + "step": 8935 + }, + { + "epoch": 0.9793503861532563, + "grad_norm": 1.3203125, + "learning_rate": 2.5833499125957896e-08, + "loss": 1.1377, + "step": 8940 + }, + { + "epoch": 0.9798981212685546, + "grad_norm": 1.2890625, + "learning_rate": 2.447820315931382e-08, + "loss": 1.125, + "step": 8945 + }, + { + "epoch": 0.9804458563838527, + "grad_norm": 1.328125, + "learning_rate": 2.3159379237087666e-08, + "loss": 1.1463, + "step": 8950 + }, + { + "epoch": 0.980993591499151, + "grad_norm": 1.2890625, + "learning_rate": 2.1877032181102376e-08, + "loss": 1.1219, + "step": 8955 + }, + { + "epoch": 0.9815413266144493, + "grad_norm": 1.2578125, + "learning_rate": 2.063116667981757e-08, + "loss": 1.1693, + "step": 8960 + }, + { + "epoch": 0.9820890617297475, + "grad_norm": 1.1484375, + "learning_rate": 1.942178728830957e-08, + "loss": 1.1101, + "step": 8965 + }, + { + "epoch": 0.9826367968450458, + "grad_norm": 1.2734375, + "learning_rate": 1.8248898428253615e-08, + "loss": 1.1785, + "step": 8970 + }, + { + "epoch": 0.9831845319603439, + "grad_norm": 1.234375, + "learning_rate": 1.71125043879139e-08, + "loss": 1.1803, + "step": 8975 + }, + { + "epoch": 0.9837322670756422, + "grad_norm": 1.2578125, + "learning_rate": 1.601260932212245e-08, + "loss": 1.172, + "step": 8980 + }, + { + "epoch": 0.9842800021909405, + "grad_norm": 1.2421875, + "learning_rate": 1.4949217252262505e-08, + "loss": 1.1578, + "step": 8985 + }, + { + "epoch": 0.9848277373062387, + "grad_norm": 1.2578125, + "learning_rate": 1.3922332066262923e-08, + "loss": 1.1933, + "step": 8990 + }, + { + "epoch": 0.9853754724215369, + "grad_norm": 1.203125, + "learning_rate": 1.2931957518570459e-08, + "loss": 1.2329, + "step": 8995 + }, + { + "epoch": 0.9859232075368352, + "grad_norm": 1.25, + "learning_rate": 1.1978097230149755e-08, + "loss": 1.2313, + "step": 9000 + }, + { + "epoch": 0.9864709426521334, + "grad_norm": 1.21875, + "learning_rate": 1.1060754688460018e-08, + "loss": 1.1615, + "step": 9005 + }, + { + "epoch": 0.9870186777674317, + "grad_norm": 1.1953125, + "learning_rate": 1.017993324744615e-08, + "loss": 1.1141, + "step": 9010 + }, + { + "epoch": 0.98756641288273, + "grad_norm": 1.25, + "learning_rate": 9.335636127528746e-09, + "loss": 1.182, + "step": 9015 + }, + { + "epoch": 0.9881141479980281, + "grad_norm": 1.1875, + "learning_rate": 8.527866415586338e-09, + "loss": 1.129, + "step": 9020 + }, + { + "epoch": 0.9886618831133264, + "grad_norm": 1.1640625, + "learning_rate": 7.75662706495095e-09, + "loss": 1.1161, + "step": 9025 + }, + { + "epoch": 0.9892096182286246, + "grad_norm": 1.21875, + "learning_rate": 7.021920895391443e-09, + "loss": 1.1489, + "step": 9030 + }, + { + "epoch": 0.9897573533439229, + "grad_norm": 1.265625, + "learning_rate": 6.323750593106859e-09, + "loss": 1.1763, + "step": 9035 + }, + { + "epoch": 0.9903050884592212, + "grad_norm": 1.1953125, + "learning_rate": 5.6621187107153145e-09, + "loss": 1.1108, + "step": 9040 + }, + { + "epoch": 0.9908528235745193, + "grad_norm": 1.234375, + "learning_rate": 5.037027667246231e-09, + "loss": 1.1569, + "step": 9045 + }, + { + "epoch": 0.9914005586898176, + "grad_norm": 1.25, + "learning_rate": 4.44847974812701e-09, + "loss": 1.1746, + "step": 9050 + }, + { + "epoch": 0.9919482938051158, + "grad_norm": 1.1875, + "learning_rate": 3.896477105179708e-09, + "loss": 1.1355, + "step": 9055 + }, + { + "epoch": 0.9924960289204141, + "grad_norm": 1.1953125, + "learning_rate": 3.381021756612146e-09, + "loss": 1.1379, + "step": 9060 + }, + { + "epoch": 0.9930437640357124, + "grad_norm": 1.390625, + "learning_rate": 2.9021155870079255e-09, + "loss": 1.1436, + "step": 9065 + }, + { + "epoch": 0.9935914991510105, + "grad_norm": 1.25, + "learning_rate": 2.459760347320872e-09, + "loss": 1.181, + "step": 9070 + }, + { + "epoch": 0.9941392342663088, + "grad_norm": 1.1796875, + "learning_rate": 2.053957654871708e-09, + "loss": 1.1355, + "step": 9075 + }, + { + "epoch": 0.9946869693816071, + "grad_norm": 1.1953125, + "learning_rate": 1.6847089933358373e-09, + "loss": 1.1293, + "step": 9080 + }, + { + "epoch": 0.9952347044969053, + "grad_norm": 1.2265625, + "learning_rate": 1.3520157127444589e-09, + "loss": 1.1847, + "step": 9085 + }, + { + "epoch": 0.9957824396122036, + "grad_norm": 1.234375, + "learning_rate": 1.0558790294745713e-09, + "loss": 1.1692, + "step": 9090 + }, + { + "epoch": 0.9963301747275017, + "grad_norm": 1.28125, + "learning_rate": 7.96300026248975e-10, + "loss": 1.1458, + "step": 9095 + }, + { + "epoch": 0.9968779098428, + "grad_norm": 1.25, + "learning_rate": 5.732796521262796e-10, + "loss": 1.1276, + "step": 9100 + }, + { + "epoch": 0.9974256449580983, + "grad_norm": 1.21875, + "learning_rate": 3.8681872250534437e-10, + "loss": 1.1752, + "step": 9105 + }, + { + "epoch": 0.9979733800733965, + "grad_norm": 1.203125, + "learning_rate": 2.369179191141768e-10, + "loss": 1.1254, + "step": 9110 + }, + { + "epoch": 0.9985211151886948, + "grad_norm": 1.2734375, + "learning_rate": 1.235777900154833e-10, + "loss": 1.1485, + "step": 9115 + }, + { + "epoch": 0.999068850303993, + "grad_norm": 1.25, + "learning_rate": 4.679874959556685e-11, + "loss": 1.1661, + "step": 9120 + }, + { + "epoch": 0.9996165854192912, + "grad_norm": 1.28125, + "learning_rate": 6.5810785732089985e-12, + "loss": 1.1323, + "step": 9125 + }, + { + "epoch": 0.9999452264884702, + "eval_loss": 1.1566358804702759, + "eval_runtime": 1068.2466, + "eval_samples_per_second": 15.133, + "eval_steps_per_second": 1.892, + "step": 9128 + }, + { + "epoch": 0.9999452264884702, + "step": 9128, + "total_flos": 1.8565750537228124e+18, + "train_loss": 1.1714502769954156, + "train_runtime": 42415.0484, + "train_samples_per_second": 3.443, + "train_steps_per_second": 0.215 + } + ], + "logging_steps": 5, + "max_steps": 9128, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8565750537228124e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}