{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999466719486831, "eval_steps": 500, "global_step": 16407, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006094634436208223, "grad_norm": 9.033647537231445, "learning_rate": 9.7442143727162e-07, "loss": 6.6451, "num_input_tokens_seen": 1401772, "step": 10 }, { "epoch": 0.0012189268872416447, "grad_norm": 8.373307228088379, "learning_rate": 1.94884287454324e-06, "loss": 6.4199, "num_input_tokens_seen": 2825364, "step": 20 }, { "epoch": 0.001828390330862467, "grad_norm": 8.205723762512207, "learning_rate": 2.92326431181486e-06, "loss": 5.9935, "num_input_tokens_seen": 4259836, "step": 30 }, { "epoch": 0.0024378537744832894, "grad_norm": 8.017410278320312, "learning_rate": 3.89768574908648e-06, "loss": 5.8172, "num_input_tokens_seen": 5653988, "step": 40 }, { "epoch": 0.0030473172181041115, "grad_norm": 8.248656272888184, "learning_rate": 4.8721071863581e-06, "loss": 5.4176, "num_input_tokens_seen": 7026620, "step": 50 }, { "epoch": 0.003656780661724934, "grad_norm": 6.726569175720215, "learning_rate": 5.84652862362972e-06, "loss": 5.0672, "num_input_tokens_seen": 8474412, "step": 60 }, { "epoch": 0.004266244105345756, "grad_norm": 7.437803268432617, "learning_rate": 6.8209500609013406e-06, "loss": 4.6725, "num_input_tokens_seen": 9888436, "step": 70 }, { "epoch": 0.004875707548966579, "grad_norm": 7.783193111419678, "learning_rate": 7.79537149817296e-06, "loss": 4.0473, "num_input_tokens_seen": 11322860, "step": 80 }, { "epoch": 0.0054851709925874004, "grad_norm": 1.6730059385299683, "learning_rate": 8.769792935444581e-06, "loss": 3.3354, "num_input_tokens_seen": 12764288, "step": 90 }, { "epoch": 0.006094634436208223, "grad_norm": 1.6392016410827637, "learning_rate": 9.7442143727162e-06, "loss": 2.9862, "num_input_tokens_seen": 14208968, "step": 100 }, { "epoch": 0.0067040978798290456, "grad_norm": 1.278815507888794, "learning_rate": 1.071863580998782e-05, "loss": 2.7488, "num_input_tokens_seen": 15575528, "step": 110 }, { "epoch": 0.007313561323449868, "grad_norm": 1.2898283004760742, "learning_rate": 1.169305724725944e-05, "loss": 2.6214, "num_input_tokens_seen": 16993668, "step": 120 }, { "epoch": 0.00792302476707069, "grad_norm": 1.0845414400100708, "learning_rate": 1.266747868453106e-05, "loss": 2.5408, "num_input_tokens_seen": 18424664, "step": 130 }, { "epoch": 0.008532488210691512, "grad_norm": 1.1549726724624634, "learning_rate": 1.3641900121802681e-05, "loss": 2.4494, "num_input_tokens_seen": 19842364, "step": 140 }, { "epoch": 0.009141951654312334, "grad_norm": 1.2677397727966309, "learning_rate": 1.46163215590743e-05, "loss": 2.4496, "num_input_tokens_seen": 21287212, "step": 150 }, { "epoch": 0.009751415097933157, "grad_norm": 1.2440944910049438, "learning_rate": 1.559074299634592e-05, "loss": 2.4503, "num_input_tokens_seen": 22706312, "step": 160 }, { "epoch": 0.01036087854155398, "grad_norm": 1.2314107418060303, "learning_rate": 1.656516443361754e-05, "loss": 2.3764, "num_input_tokens_seen": 24127816, "step": 170 }, { "epoch": 0.010970341985174801, "grad_norm": 1.0929701328277588, "learning_rate": 1.7539585870889162e-05, "loss": 2.4311, "num_input_tokens_seen": 25495676, "step": 180 }, { "epoch": 0.011579805428795624, "grad_norm": 1.0920017957687378, "learning_rate": 1.8514007308160783e-05, "loss": 2.3656, "num_input_tokens_seen": 26913848, "step": 190 }, { "epoch": 0.012189268872416446, "grad_norm": 1.259750485420227, "learning_rate": 1.94884287454324e-05, "loss": 2.2739, "num_input_tokens_seen": 28326716, "step": 200 }, { "epoch": 0.01279873231603727, "grad_norm": 1.0931645631790161, "learning_rate": 2.046285018270402e-05, "loss": 2.231, "num_input_tokens_seen": 29699424, "step": 210 }, { "epoch": 0.013408195759658091, "grad_norm": 1.0463930368423462, "learning_rate": 2.143727161997564e-05, "loss": 2.216, "num_input_tokens_seen": 31109904, "step": 220 }, { "epoch": 0.014017659203278913, "grad_norm": 1.0597470998764038, "learning_rate": 2.241169305724726e-05, "loss": 2.1803, "num_input_tokens_seen": 32547736, "step": 230 }, { "epoch": 0.014627122646899736, "grad_norm": 1.1559436321258545, "learning_rate": 2.338611449451888e-05, "loss": 2.1441, "num_input_tokens_seen": 33972620, "step": 240 }, { "epoch": 0.015236586090520558, "grad_norm": 1.2502082586288452, "learning_rate": 2.4360535931790504e-05, "loss": 2.1537, "num_input_tokens_seen": 35404252, "step": 250 }, { "epoch": 0.01584604953414138, "grad_norm": 1.096165418624878, "learning_rate": 2.533495736906212e-05, "loss": 2.1193, "num_input_tokens_seen": 36804296, "step": 260 }, { "epoch": 0.016455512977762203, "grad_norm": 1.0119866132736206, "learning_rate": 2.630937880633374e-05, "loss": 2.1115, "num_input_tokens_seen": 38185184, "step": 270 }, { "epoch": 0.017064976421383025, "grad_norm": 1.1104751825332642, "learning_rate": 2.7283800243605362e-05, "loss": 2.2292, "num_input_tokens_seen": 39600904, "step": 280 }, { "epoch": 0.017674439865003846, "grad_norm": 0.9693301320075989, "learning_rate": 2.8258221680876983e-05, "loss": 2.1193, "num_input_tokens_seen": 41014344, "step": 290 }, { "epoch": 0.018283903308624668, "grad_norm": 0.9671382308006287, "learning_rate": 2.92326431181486e-05, "loss": 2.052, "num_input_tokens_seen": 42441088, "step": 300 }, { "epoch": 0.018893366752245493, "grad_norm": 1.053604006767273, "learning_rate": 3.020706455542022e-05, "loss": 2.081, "num_input_tokens_seen": 43835948, "step": 310 }, { "epoch": 0.019502830195866315, "grad_norm": 0.9839246869087219, "learning_rate": 3.118148599269184e-05, "loss": 2.0446, "num_input_tokens_seen": 45267668, "step": 320 }, { "epoch": 0.020112293639487137, "grad_norm": 1.0461212396621704, "learning_rate": 3.215590742996346e-05, "loss": 2.0046, "num_input_tokens_seen": 46734856, "step": 330 }, { "epoch": 0.02072175708310796, "grad_norm": 0.9702940583229065, "learning_rate": 3.313032886723508e-05, "loss": 2.0476, "num_input_tokens_seen": 48137552, "step": 340 }, { "epoch": 0.02133122052672878, "grad_norm": 1.0886714458465576, "learning_rate": 3.41047503045067e-05, "loss": 2.0536, "num_input_tokens_seen": 49555456, "step": 350 }, { "epoch": 0.021940683970349602, "grad_norm": 1.0513168573379517, "learning_rate": 3.5079171741778324e-05, "loss": 2.0185, "num_input_tokens_seen": 50949976, "step": 360 }, { "epoch": 0.022550147413970427, "grad_norm": 1.0875654220581055, "learning_rate": 3.605359317904994e-05, "loss": 1.9939, "num_input_tokens_seen": 52330820, "step": 370 }, { "epoch": 0.02315961085759125, "grad_norm": 0.9643621444702148, "learning_rate": 3.7028014616321566e-05, "loss": 2.0392, "num_input_tokens_seen": 53764024, "step": 380 }, { "epoch": 0.02376907430121207, "grad_norm": 1.0301045179367065, "learning_rate": 3.800243605359318e-05, "loss": 2.0448, "num_input_tokens_seen": 55201816, "step": 390 }, { "epoch": 0.024378537744832892, "grad_norm": 1.0076828002929688, "learning_rate": 3.89768574908648e-05, "loss": 1.9415, "num_input_tokens_seen": 56622004, "step": 400 }, { "epoch": 0.024988001188453714, "grad_norm": 0.9446476101875305, "learning_rate": 3.9951278928136424e-05, "loss": 1.9752, "num_input_tokens_seen": 58038112, "step": 410 }, { "epoch": 0.02559746463207454, "grad_norm": 0.952552318572998, "learning_rate": 4.092570036540804e-05, "loss": 1.9806, "num_input_tokens_seen": 59484616, "step": 420 }, { "epoch": 0.02620692807569536, "grad_norm": 0.8900318145751953, "learning_rate": 4.1900121802679666e-05, "loss": 1.9547, "num_input_tokens_seen": 60889256, "step": 430 }, { "epoch": 0.026816391519316182, "grad_norm": 0.9296801090240479, "learning_rate": 4.287454323995128e-05, "loss": 2.0001, "num_input_tokens_seen": 62274740, "step": 440 }, { "epoch": 0.027425854962937004, "grad_norm": 0.9372000098228455, "learning_rate": 4.38489646772229e-05, "loss": 2.001, "num_input_tokens_seen": 63719728, "step": 450 }, { "epoch": 0.028035318406557826, "grad_norm": 1.1481891870498657, "learning_rate": 4.482338611449452e-05, "loss": 2.0047, "num_input_tokens_seen": 65120052, "step": 460 }, { "epoch": 0.028644781850178647, "grad_norm": 0.9526330232620239, "learning_rate": 4.579780755176614e-05, "loss": 1.8862, "num_input_tokens_seen": 66516240, "step": 470 }, { "epoch": 0.029254245293799472, "grad_norm": 0.9846788048744202, "learning_rate": 4.677222898903776e-05, "loss": 1.8632, "num_input_tokens_seen": 67966636, "step": 480 }, { "epoch": 0.029863708737420294, "grad_norm": 0.9888688921928406, "learning_rate": 4.774665042630938e-05, "loss": 1.7964, "num_input_tokens_seen": 69405656, "step": 490 }, { "epoch": 0.030473172181041116, "grad_norm": 0.9095290303230286, "learning_rate": 4.872107186358101e-05, "loss": 1.9531, "num_input_tokens_seen": 70820452, "step": 500 }, { "epoch": 0.031082635624661938, "grad_norm": 0.8255355358123779, "learning_rate": 4.9695493300852625e-05, "loss": 1.951, "num_input_tokens_seen": 72275948, "step": 510 }, { "epoch": 0.03169209906828276, "grad_norm": 0.8973667025566101, "learning_rate": 5.066991473812424e-05, "loss": 1.9297, "num_input_tokens_seen": 73697960, "step": 520 }, { "epoch": 0.03230156251190358, "grad_norm": 1.0261414051055908, "learning_rate": 5.164433617539586e-05, "loss": 1.8723, "num_input_tokens_seen": 75117608, "step": 530 }, { "epoch": 0.032911025955524406, "grad_norm": 0.8425599336624146, "learning_rate": 5.261875761266748e-05, "loss": 1.8414, "num_input_tokens_seen": 76461224, "step": 540 }, { "epoch": 0.033520489399145224, "grad_norm": 0.9196986556053162, "learning_rate": 5.35931790499391e-05, "loss": 1.9135, "num_input_tokens_seen": 77904064, "step": 550 }, { "epoch": 0.03412995284276605, "grad_norm": 0.8252116441726685, "learning_rate": 5.4567600487210725e-05, "loss": 1.8992, "num_input_tokens_seen": 79294400, "step": 560 }, { "epoch": 0.034739416286386875, "grad_norm": 1.140308141708374, "learning_rate": 5.554202192448234e-05, "loss": 1.8564, "num_input_tokens_seen": 80694984, "step": 570 }, { "epoch": 0.03534887973000769, "grad_norm": 0.772463321685791, "learning_rate": 5.6516443361753966e-05, "loss": 1.9372, "num_input_tokens_seen": 82116012, "step": 580 }, { "epoch": 0.03595834317362852, "grad_norm": 0.8610614538192749, "learning_rate": 5.749086479902558e-05, "loss": 1.8236, "num_input_tokens_seen": 83500604, "step": 590 }, { "epoch": 0.036567806617249336, "grad_norm": 1.0011142492294312, "learning_rate": 5.84652862362972e-05, "loss": 1.875, "num_input_tokens_seen": 84936208, "step": 600 }, { "epoch": 0.03717727006087016, "grad_norm": 0.9461501836776733, "learning_rate": 5.9439707673568825e-05, "loss": 1.8397, "num_input_tokens_seen": 86347300, "step": 610 }, { "epoch": 0.03778673350449099, "grad_norm": 0.8287586569786072, "learning_rate": 6.041412911084044e-05, "loss": 1.8283, "num_input_tokens_seen": 87787620, "step": 620 }, { "epoch": 0.038396196948111805, "grad_norm": 0.9248697757720947, "learning_rate": 6.138855054811207e-05, "loss": 1.8192, "num_input_tokens_seen": 89217528, "step": 630 }, { "epoch": 0.03900566039173263, "grad_norm": 0.8168230056762695, "learning_rate": 6.236297198538368e-05, "loss": 1.8232, "num_input_tokens_seen": 90602460, "step": 640 }, { "epoch": 0.03961512383535345, "grad_norm": 0.8372480273246765, "learning_rate": 6.33373934226553e-05, "loss": 1.9267, "num_input_tokens_seen": 91995380, "step": 650 }, { "epoch": 0.04022458727897427, "grad_norm": 1.0177308320999146, "learning_rate": 6.431181485992692e-05, "loss": 1.8319, "num_input_tokens_seen": 93396324, "step": 660 }, { "epoch": 0.0408340507225951, "grad_norm": 0.9811750054359436, "learning_rate": 6.528623629719854e-05, "loss": 1.8442, "num_input_tokens_seen": 94835448, "step": 670 }, { "epoch": 0.04144351416621592, "grad_norm": 0.8578136563301086, "learning_rate": 6.626065773447017e-05, "loss": 1.8373, "num_input_tokens_seen": 96247680, "step": 680 }, { "epoch": 0.04205297760983674, "grad_norm": 0.7805215120315552, "learning_rate": 6.723507917174178e-05, "loss": 1.8717, "num_input_tokens_seen": 97649748, "step": 690 }, { "epoch": 0.04266244105345756, "grad_norm": 0.7658279538154602, "learning_rate": 6.82095006090134e-05, "loss": 1.8215, "num_input_tokens_seen": 99072716, "step": 700 }, { "epoch": 0.043271904497078385, "grad_norm": 0.8456059098243713, "learning_rate": 6.918392204628502e-05, "loss": 1.9152, "num_input_tokens_seen": 100460184, "step": 710 }, { "epoch": 0.043881367940699204, "grad_norm": 0.7689954042434692, "learning_rate": 7.015834348355665e-05, "loss": 1.8299, "num_input_tokens_seen": 101833788, "step": 720 }, { "epoch": 0.04449083138432003, "grad_norm": 0.769614577293396, "learning_rate": 7.113276492082827e-05, "loss": 1.7905, "num_input_tokens_seen": 103233860, "step": 730 }, { "epoch": 0.045100294827940854, "grad_norm": 0.984970211982727, "learning_rate": 7.210718635809988e-05, "loss": 1.7542, "num_input_tokens_seen": 104649404, "step": 740 }, { "epoch": 0.04570975827156167, "grad_norm": 0.898064136505127, "learning_rate": 7.30816077953715e-05, "loss": 1.8333, "num_input_tokens_seen": 106019108, "step": 750 }, { "epoch": 0.0463192217151825, "grad_norm": 0.8178762197494507, "learning_rate": 7.405602923264313e-05, "loss": 1.7696, "num_input_tokens_seen": 107412140, "step": 760 }, { "epoch": 0.046928685158803315, "grad_norm": 0.8419963121414185, "learning_rate": 7.503045066991475e-05, "loss": 1.8582, "num_input_tokens_seen": 108814020, "step": 770 }, { "epoch": 0.04753814860242414, "grad_norm": 0.8736382126808167, "learning_rate": 7.600487210718637e-05, "loss": 1.8505, "num_input_tokens_seen": 110243636, "step": 780 }, { "epoch": 0.048147612046044966, "grad_norm": 0.8330615758895874, "learning_rate": 7.697929354445798e-05, "loss": 1.8439, "num_input_tokens_seen": 111661300, "step": 790 }, { "epoch": 0.048757075489665784, "grad_norm": 0.8429380059242249, "learning_rate": 7.79537149817296e-05, "loss": 1.7612, "num_input_tokens_seen": 113084304, "step": 800 }, { "epoch": 0.04936653893328661, "grad_norm": 0.7914267182350159, "learning_rate": 7.892813641900122e-05, "loss": 1.8355, "num_input_tokens_seen": 114514328, "step": 810 }, { "epoch": 0.04997600237690743, "grad_norm": 0.8499755263328552, "learning_rate": 7.990255785627285e-05, "loss": 1.762, "num_input_tokens_seen": 115939680, "step": 820 }, { "epoch": 0.05058546582052825, "grad_norm": 0.8349440693855286, "learning_rate": 8e-05, "loss": 1.7935, "num_input_tokens_seen": 117339404, "step": 830 }, { "epoch": 0.05119492926414908, "grad_norm": 0.8505790829658508, "learning_rate": 8e-05, "loss": 1.8398, "num_input_tokens_seen": 118763072, "step": 840 }, { "epoch": 0.051804392707769896, "grad_norm": 0.761175274848938, "learning_rate": 8e-05, "loss": 1.7851, "num_input_tokens_seen": 120156740, "step": 850 }, { "epoch": 0.05241385615139072, "grad_norm": 0.8605666756629944, "learning_rate": 8e-05, "loss": 1.8358, "num_input_tokens_seen": 121570328, "step": 860 }, { "epoch": 0.05302331959501154, "grad_norm": 0.648381769657135, "learning_rate": 8e-05, "loss": 1.7502, "num_input_tokens_seen": 122939696, "step": 870 }, { "epoch": 0.053632783038632365, "grad_norm": 0.9061549305915833, "learning_rate": 8e-05, "loss": 1.7837, "num_input_tokens_seen": 124365264, "step": 880 }, { "epoch": 0.05424224648225319, "grad_norm": 0.8057026267051697, "learning_rate": 8e-05, "loss": 1.7609, "num_input_tokens_seen": 125746168, "step": 890 }, { "epoch": 0.05485170992587401, "grad_norm": 0.8674312829971313, "learning_rate": 8e-05, "loss": 1.7415, "num_input_tokens_seen": 127137780, "step": 900 }, { "epoch": 0.05546117336949483, "grad_norm": 0.8366326689720154, "learning_rate": 8e-05, "loss": 1.8147, "num_input_tokens_seen": 128546252, "step": 910 }, { "epoch": 0.05607063681311565, "grad_norm": 0.8231328129768372, "learning_rate": 8e-05, "loss": 1.8652, "num_input_tokens_seen": 129950036, "step": 920 }, { "epoch": 0.056680100256736476, "grad_norm": 0.7696998715400696, "learning_rate": 8e-05, "loss": 1.8445, "num_input_tokens_seen": 131396708, "step": 930 }, { "epoch": 0.057289563700357295, "grad_norm": 0.8545131683349609, "learning_rate": 8e-05, "loss": 1.8091, "num_input_tokens_seen": 132834744, "step": 940 }, { "epoch": 0.05789902714397812, "grad_norm": 0.8772911429405212, "learning_rate": 8e-05, "loss": 1.822, "num_input_tokens_seen": 134197864, "step": 950 }, { "epoch": 0.058508490587598945, "grad_norm": 0.7780118584632874, "learning_rate": 8e-05, "loss": 1.6964, "num_input_tokens_seen": 135587736, "step": 960 }, { "epoch": 0.05911795403121976, "grad_norm": 0.8001295328140259, "learning_rate": 8e-05, "loss": 1.6976, "num_input_tokens_seen": 136998024, "step": 970 }, { "epoch": 0.05972741747484059, "grad_norm": 0.8727300763130188, "learning_rate": 8e-05, "loss": 1.6902, "num_input_tokens_seen": 138396964, "step": 980 }, { "epoch": 0.06033688091846141, "grad_norm": 0.9872994422912598, "learning_rate": 8e-05, "loss": 1.7319, "num_input_tokens_seen": 139815224, "step": 990 }, { "epoch": 0.06094634436208223, "grad_norm": 0.8009439706802368, "learning_rate": 8e-05, "loss": 1.6803, "num_input_tokens_seen": 141251384, "step": 1000 }, { "epoch": 0.06155580780570306, "grad_norm": 0.7269445657730103, "learning_rate": 8e-05, "loss": 1.7466, "num_input_tokens_seen": 142666948, "step": 1010 }, { "epoch": 0.062165271249323875, "grad_norm": 0.7160496711730957, "learning_rate": 8e-05, "loss": 1.7092, "num_input_tokens_seen": 144099664, "step": 1020 }, { "epoch": 0.0627747346929447, "grad_norm": 0.8215118646621704, "learning_rate": 8e-05, "loss": 1.8191, "num_input_tokens_seen": 145505712, "step": 1030 }, { "epoch": 0.06338419813656553, "grad_norm": 0.7651961445808411, "learning_rate": 8e-05, "loss": 1.7255, "num_input_tokens_seen": 146921148, "step": 1040 }, { "epoch": 0.06399366158018634, "grad_norm": 0.8512997031211853, "learning_rate": 8e-05, "loss": 1.7603, "num_input_tokens_seen": 148324952, "step": 1050 }, { "epoch": 0.06460312502380716, "grad_norm": 0.7710238695144653, "learning_rate": 8e-05, "loss": 1.7784, "num_input_tokens_seen": 149742540, "step": 1060 }, { "epoch": 0.06521258846742799, "grad_norm": 0.7975362539291382, "learning_rate": 8e-05, "loss": 1.7519, "num_input_tokens_seen": 151140136, "step": 1070 }, { "epoch": 0.06582205191104881, "grad_norm": 0.8341178894042969, "learning_rate": 8e-05, "loss": 1.7708, "num_input_tokens_seen": 152574668, "step": 1080 }, { "epoch": 0.06643151535466964, "grad_norm": 0.81528240442276, "learning_rate": 8e-05, "loss": 1.7906, "num_input_tokens_seen": 153945316, "step": 1090 }, { "epoch": 0.06704097879829045, "grad_norm": 0.7310307621955872, "learning_rate": 8e-05, "loss": 1.7695, "num_input_tokens_seen": 155334976, "step": 1100 }, { "epoch": 0.06765044224191127, "grad_norm": 0.7746317982673645, "learning_rate": 8e-05, "loss": 1.7351, "num_input_tokens_seen": 156703560, "step": 1110 }, { "epoch": 0.0682599056855321, "grad_norm": 0.7563662528991699, "learning_rate": 8e-05, "loss": 1.8099, "num_input_tokens_seen": 158068384, "step": 1120 }, { "epoch": 0.06886936912915292, "grad_norm": 0.7843930721282959, "learning_rate": 8e-05, "loss": 1.6761, "num_input_tokens_seen": 159493532, "step": 1130 }, { "epoch": 0.06947883257277375, "grad_norm": 0.8035304546356201, "learning_rate": 8e-05, "loss": 1.6405, "num_input_tokens_seen": 160906460, "step": 1140 }, { "epoch": 0.07008829601639456, "grad_norm": 0.6847174763679504, "learning_rate": 8e-05, "loss": 1.7033, "num_input_tokens_seen": 162330960, "step": 1150 }, { "epoch": 0.07069775946001539, "grad_norm": 0.7156793475151062, "learning_rate": 8e-05, "loss": 1.7531, "num_input_tokens_seen": 163763200, "step": 1160 }, { "epoch": 0.07130722290363621, "grad_norm": 0.7055004239082336, "learning_rate": 8e-05, "loss": 1.6826, "num_input_tokens_seen": 165128772, "step": 1170 }, { "epoch": 0.07191668634725704, "grad_norm": 0.689820408821106, "learning_rate": 8e-05, "loss": 1.7022, "num_input_tokens_seen": 166506640, "step": 1180 }, { "epoch": 0.07252614979087786, "grad_norm": 0.7453446388244629, "learning_rate": 8e-05, "loss": 1.7654, "num_input_tokens_seen": 167908440, "step": 1190 }, { "epoch": 0.07313561323449867, "grad_norm": 0.768242597579956, "learning_rate": 8e-05, "loss": 1.732, "num_input_tokens_seen": 169276732, "step": 1200 }, { "epoch": 0.0737450766781195, "grad_norm": 0.7208901047706604, "learning_rate": 8e-05, "loss": 1.754, "num_input_tokens_seen": 170678632, "step": 1210 }, { "epoch": 0.07435454012174032, "grad_norm": 0.7076956629753113, "learning_rate": 8e-05, "loss": 1.6955, "num_input_tokens_seen": 172093484, "step": 1220 }, { "epoch": 0.07496400356536115, "grad_norm": 0.7701781392097473, "learning_rate": 8e-05, "loss": 1.7475, "num_input_tokens_seen": 173482152, "step": 1230 }, { "epoch": 0.07557346700898197, "grad_norm": 0.6883390545845032, "learning_rate": 8e-05, "loss": 1.7302, "num_input_tokens_seen": 174939192, "step": 1240 }, { "epoch": 0.07618293045260278, "grad_norm": 0.6942645311355591, "learning_rate": 8e-05, "loss": 1.771, "num_input_tokens_seen": 176393408, "step": 1250 }, { "epoch": 0.07679239389622361, "grad_norm": 0.6907592415809631, "learning_rate": 8e-05, "loss": 1.8247, "num_input_tokens_seen": 177820192, "step": 1260 }, { "epoch": 0.07740185733984443, "grad_norm": 0.6885057687759399, "learning_rate": 8e-05, "loss": 1.7403, "num_input_tokens_seen": 179198048, "step": 1270 }, { "epoch": 0.07801132078346526, "grad_norm": 0.7358853816986084, "learning_rate": 8e-05, "loss": 1.716, "num_input_tokens_seen": 180638000, "step": 1280 }, { "epoch": 0.07862078422708609, "grad_norm": 0.8144451379776001, "learning_rate": 8e-05, "loss": 1.6946, "num_input_tokens_seen": 182017752, "step": 1290 }, { "epoch": 0.0792302476707069, "grad_norm": 0.7160412669181824, "learning_rate": 8e-05, "loss": 1.7141, "num_input_tokens_seen": 183405184, "step": 1300 }, { "epoch": 0.07983971111432772, "grad_norm": 0.8351532220840454, "learning_rate": 8e-05, "loss": 1.6821, "num_input_tokens_seen": 184819236, "step": 1310 }, { "epoch": 0.08044917455794855, "grad_norm": 0.6285978555679321, "learning_rate": 8e-05, "loss": 1.7392, "num_input_tokens_seen": 186241748, "step": 1320 }, { "epoch": 0.08105863800156937, "grad_norm": 0.6861914992332458, "learning_rate": 8e-05, "loss": 1.6637, "num_input_tokens_seen": 187651796, "step": 1330 }, { "epoch": 0.0816681014451902, "grad_norm": 0.7372764945030212, "learning_rate": 8e-05, "loss": 1.6151, "num_input_tokens_seen": 189052704, "step": 1340 }, { "epoch": 0.08227756488881101, "grad_norm": 0.754943311214447, "learning_rate": 8e-05, "loss": 1.6771, "num_input_tokens_seen": 190457440, "step": 1350 }, { "epoch": 0.08288702833243183, "grad_norm": 0.7047508358955383, "learning_rate": 8e-05, "loss": 1.7488, "num_input_tokens_seen": 191855300, "step": 1360 }, { "epoch": 0.08349649177605266, "grad_norm": 0.7752687931060791, "learning_rate": 8e-05, "loss": 1.7385, "num_input_tokens_seen": 193237128, "step": 1370 }, { "epoch": 0.08410595521967348, "grad_norm": 0.7014668583869934, "learning_rate": 8e-05, "loss": 1.767, "num_input_tokens_seen": 194589060, "step": 1380 }, { "epoch": 0.08471541866329431, "grad_norm": 0.6854328513145447, "learning_rate": 8e-05, "loss": 1.7614, "num_input_tokens_seen": 196034960, "step": 1390 }, { "epoch": 0.08532488210691512, "grad_norm": 0.7649319171905518, "learning_rate": 8e-05, "loss": 1.7496, "num_input_tokens_seen": 197441788, "step": 1400 }, { "epoch": 0.08593434555053595, "grad_norm": 0.6990752220153809, "learning_rate": 8e-05, "loss": 1.7276, "num_input_tokens_seen": 198816188, "step": 1410 }, { "epoch": 0.08654380899415677, "grad_norm": 0.6721024513244629, "learning_rate": 8e-05, "loss": 1.675, "num_input_tokens_seen": 200227720, "step": 1420 }, { "epoch": 0.0871532724377776, "grad_norm": 0.7723363637924194, "learning_rate": 8e-05, "loss": 1.6968, "num_input_tokens_seen": 201640184, "step": 1430 }, { "epoch": 0.08776273588139841, "grad_norm": 0.7399781942367554, "learning_rate": 8e-05, "loss": 1.6968, "num_input_tokens_seen": 203082784, "step": 1440 }, { "epoch": 0.08837219932501923, "grad_norm": 0.6369470953941345, "learning_rate": 8e-05, "loss": 1.7369, "num_input_tokens_seen": 204507764, "step": 1450 }, { "epoch": 0.08898166276864006, "grad_norm": 0.6783959865570068, "learning_rate": 8e-05, "loss": 1.7148, "num_input_tokens_seen": 205926692, "step": 1460 }, { "epoch": 0.08959112621226088, "grad_norm": 0.6937606930732727, "learning_rate": 8e-05, "loss": 1.6742, "num_input_tokens_seen": 207292676, "step": 1470 }, { "epoch": 0.09020058965588171, "grad_norm": 0.7870405316352844, "learning_rate": 8e-05, "loss": 1.6477, "num_input_tokens_seen": 208696852, "step": 1480 }, { "epoch": 0.09081005309950252, "grad_norm": 0.7263059020042419, "learning_rate": 8e-05, "loss": 1.6267, "num_input_tokens_seen": 210121428, "step": 1490 }, { "epoch": 0.09141951654312334, "grad_norm": 0.753978967666626, "learning_rate": 8e-05, "loss": 1.6423, "num_input_tokens_seen": 211550144, "step": 1500 }, { "epoch": 0.09202897998674417, "grad_norm": 0.6854983568191528, "learning_rate": 8e-05, "loss": 1.6713, "num_input_tokens_seen": 212992572, "step": 1510 }, { "epoch": 0.092638443430365, "grad_norm": 0.7292976975440979, "learning_rate": 8e-05, "loss": 1.6481, "num_input_tokens_seen": 214390412, "step": 1520 }, { "epoch": 0.09324790687398582, "grad_norm": 0.6837400197982788, "learning_rate": 8e-05, "loss": 1.5692, "num_input_tokens_seen": 215806552, "step": 1530 }, { "epoch": 0.09385737031760663, "grad_norm": 0.7413853406906128, "learning_rate": 8e-05, "loss": 1.6803, "num_input_tokens_seen": 217210216, "step": 1540 }, { "epoch": 0.09446683376122746, "grad_norm": 0.7433369755744934, "learning_rate": 8e-05, "loss": 1.6934, "num_input_tokens_seen": 218598456, "step": 1550 }, { "epoch": 0.09507629720484828, "grad_norm": 0.7748942375183105, "learning_rate": 8e-05, "loss": 1.6741, "num_input_tokens_seen": 219976860, "step": 1560 }, { "epoch": 0.0956857606484691, "grad_norm": 0.7293086647987366, "learning_rate": 8e-05, "loss": 1.7087, "num_input_tokens_seen": 221355396, "step": 1570 }, { "epoch": 0.09629522409208993, "grad_norm": 0.862250566482544, "learning_rate": 8e-05, "loss": 1.6832, "num_input_tokens_seen": 222791252, "step": 1580 }, { "epoch": 0.09690468753571074, "grad_norm": 0.7801169157028198, "learning_rate": 8e-05, "loss": 1.5818, "num_input_tokens_seen": 224182892, "step": 1590 }, { "epoch": 0.09751415097933157, "grad_norm": 0.7509076595306396, "learning_rate": 8e-05, "loss": 1.6964, "num_input_tokens_seen": 225596820, "step": 1600 }, { "epoch": 0.0981236144229524, "grad_norm": 0.6174609661102295, "learning_rate": 8e-05, "loss": 1.6795, "num_input_tokens_seen": 227002192, "step": 1610 }, { "epoch": 0.09873307786657322, "grad_norm": 0.7319700717926025, "learning_rate": 8e-05, "loss": 1.5898, "num_input_tokens_seen": 228388248, "step": 1620 }, { "epoch": 0.09934254131019404, "grad_norm": 0.7365676164627075, "learning_rate": 8e-05, "loss": 1.6146, "num_input_tokens_seen": 229791408, "step": 1630 }, { "epoch": 0.09995200475381485, "grad_norm": 0.7724565267562866, "learning_rate": 8e-05, "loss": 1.638, "num_input_tokens_seen": 231187424, "step": 1640 }, { "epoch": 0.10056146819743568, "grad_norm": 0.7245753407478333, "learning_rate": 8e-05, "loss": 1.712, "num_input_tokens_seen": 232600616, "step": 1650 }, { "epoch": 0.1011709316410565, "grad_norm": 0.6499654650688171, "learning_rate": 8e-05, "loss": 1.6265, "num_input_tokens_seen": 234006344, "step": 1660 }, { "epoch": 0.10178039508467733, "grad_norm": 0.6563583612442017, "learning_rate": 8e-05, "loss": 1.664, "num_input_tokens_seen": 235416748, "step": 1670 }, { "epoch": 0.10238985852829816, "grad_norm": 0.7042690515518188, "learning_rate": 8e-05, "loss": 1.6468, "num_input_tokens_seen": 236830896, "step": 1680 }, { "epoch": 0.10299932197191897, "grad_norm": 0.7631860375404358, "learning_rate": 8e-05, "loss": 1.6325, "num_input_tokens_seen": 238262140, "step": 1690 }, { "epoch": 0.10360878541553979, "grad_norm": 0.6432304382324219, "learning_rate": 8e-05, "loss": 1.6756, "num_input_tokens_seen": 239674040, "step": 1700 }, { "epoch": 0.10421824885916062, "grad_norm": 0.7273523807525635, "learning_rate": 8e-05, "loss": 1.6661, "num_input_tokens_seen": 241093432, "step": 1710 }, { "epoch": 0.10482771230278144, "grad_norm": 0.781891942024231, "learning_rate": 8e-05, "loss": 1.7187, "num_input_tokens_seen": 242530300, "step": 1720 }, { "epoch": 0.10543717574640227, "grad_norm": 0.6699258089065552, "learning_rate": 8e-05, "loss": 1.6269, "num_input_tokens_seen": 243920956, "step": 1730 }, { "epoch": 0.10604663919002308, "grad_norm": 0.7054083943367004, "learning_rate": 8e-05, "loss": 1.6728, "num_input_tokens_seen": 245343376, "step": 1740 }, { "epoch": 0.1066561026336439, "grad_norm": 0.7309428453445435, "learning_rate": 8e-05, "loss": 1.6168, "num_input_tokens_seen": 246742064, "step": 1750 }, { "epoch": 0.10726556607726473, "grad_norm": 0.6241644620895386, "learning_rate": 8e-05, "loss": 1.6413, "num_input_tokens_seen": 248121468, "step": 1760 }, { "epoch": 0.10787502952088555, "grad_norm": 0.7994136214256287, "learning_rate": 8e-05, "loss": 1.6438, "num_input_tokens_seen": 249536668, "step": 1770 }, { "epoch": 0.10848449296450638, "grad_norm": 0.6934658885002136, "learning_rate": 8e-05, "loss": 1.7127, "num_input_tokens_seen": 250943296, "step": 1780 }, { "epoch": 0.10909395640812719, "grad_norm": 0.7179552912712097, "learning_rate": 8e-05, "loss": 1.6016, "num_input_tokens_seen": 252370756, "step": 1790 }, { "epoch": 0.10970341985174802, "grad_norm": 0.7161432504653931, "learning_rate": 8e-05, "loss": 1.6472, "num_input_tokens_seen": 253768292, "step": 1800 }, { "epoch": 0.11031288329536884, "grad_norm": 0.749668300151825, "learning_rate": 8e-05, "loss": 1.645, "num_input_tokens_seen": 255228292, "step": 1810 }, { "epoch": 0.11092234673898967, "grad_norm": 0.6765767335891724, "learning_rate": 8e-05, "loss": 1.6538, "num_input_tokens_seen": 256611280, "step": 1820 }, { "epoch": 0.11153181018261048, "grad_norm": 0.6850834488868713, "learning_rate": 8e-05, "loss": 1.7177, "num_input_tokens_seen": 258027728, "step": 1830 }, { "epoch": 0.1121412736262313, "grad_norm": 0.7252477407455444, "learning_rate": 8e-05, "loss": 1.6964, "num_input_tokens_seen": 259428460, "step": 1840 }, { "epoch": 0.11275073706985213, "grad_norm": 0.7121752500534058, "learning_rate": 8e-05, "loss": 1.6462, "num_input_tokens_seen": 260851868, "step": 1850 }, { "epoch": 0.11336020051347295, "grad_norm": 0.6859217882156372, "learning_rate": 8e-05, "loss": 1.6794, "num_input_tokens_seen": 262264644, "step": 1860 }, { "epoch": 0.11396966395709378, "grad_norm": 0.8705602884292603, "learning_rate": 8e-05, "loss": 1.6131, "num_input_tokens_seen": 263652388, "step": 1870 }, { "epoch": 0.11457912740071459, "grad_norm": 0.6738168001174927, "learning_rate": 8e-05, "loss": 1.6295, "num_input_tokens_seen": 265074652, "step": 1880 }, { "epoch": 0.11518859084433541, "grad_norm": 0.7672312259674072, "learning_rate": 8e-05, "loss": 1.7278, "num_input_tokens_seen": 266487960, "step": 1890 }, { "epoch": 0.11579805428795624, "grad_norm": 0.7259830236434937, "learning_rate": 8e-05, "loss": 1.5629, "num_input_tokens_seen": 267907564, "step": 1900 }, { "epoch": 0.11640751773157706, "grad_norm": 0.7508255243301392, "learning_rate": 8e-05, "loss": 1.6847, "num_input_tokens_seen": 269339832, "step": 1910 }, { "epoch": 0.11701698117519789, "grad_norm": 0.5813190340995789, "learning_rate": 8e-05, "loss": 1.6564, "num_input_tokens_seen": 270751000, "step": 1920 }, { "epoch": 0.1176264446188187, "grad_norm": 0.7213466167449951, "learning_rate": 8e-05, "loss": 1.6302, "num_input_tokens_seen": 272212216, "step": 1930 }, { "epoch": 0.11823590806243953, "grad_norm": 0.6883914470672607, "learning_rate": 8e-05, "loss": 1.5737, "num_input_tokens_seen": 273691760, "step": 1940 }, { "epoch": 0.11884537150606035, "grad_norm": 0.7095319628715515, "learning_rate": 8e-05, "loss": 1.6719, "num_input_tokens_seen": 275114348, "step": 1950 }, { "epoch": 0.11945483494968118, "grad_norm": 0.6590485572814941, "learning_rate": 8e-05, "loss": 1.6855, "num_input_tokens_seen": 276525856, "step": 1960 }, { "epoch": 0.120064298393302, "grad_norm": 0.7270433306694031, "learning_rate": 8e-05, "loss": 1.6401, "num_input_tokens_seen": 277930536, "step": 1970 }, { "epoch": 0.12067376183692281, "grad_norm": 0.6123723387718201, "learning_rate": 8e-05, "loss": 1.6706, "num_input_tokens_seen": 279346748, "step": 1980 }, { "epoch": 0.12128322528054364, "grad_norm": 0.7456077337265015, "learning_rate": 8e-05, "loss": 1.7432, "num_input_tokens_seen": 280734376, "step": 1990 }, { "epoch": 0.12189268872416446, "grad_norm": 0.6831845641136169, "learning_rate": 8e-05, "loss": 1.6291, "num_input_tokens_seen": 282132752, "step": 2000 }, { "epoch": 0.12250215216778529, "grad_norm": 0.6901050209999084, "learning_rate": 8e-05, "loss": 1.6807, "num_input_tokens_seen": 283584816, "step": 2010 }, { "epoch": 0.12311161561140611, "grad_norm": 0.7861285209655762, "learning_rate": 8e-05, "loss": 1.7389, "num_input_tokens_seen": 285064504, "step": 2020 }, { "epoch": 0.12372107905502693, "grad_norm": 0.5943942070007324, "learning_rate": 8e-05, "loss": 1.6045, "num_input_tokens_seen": 286464552, "step": 2030 }, { "epoch": 0.12433054249864775, "grad_norm": 0.7200583815574646, "learning_rate": 8e-05, "loss": 1.6617, "num_input_tokens_seen": 287842704, "step": 2040 }, { "epoch": 0.12494000594226858, "grad_norm": 0.6980604529380798, "learning_rate": 8e-05, "loss": 1.5976, "num_input_tokens_seen": 289280200, "step": 2050 }, { "epoch": 0.1255494693858894, "grad_norm": 0.6354398727416992, "learning_rate": 8e-05, "loss": 1.6191, "num_input_tokens_seen": 290665088, "step": 2060 }, { "epoch": 0.1261589328295102, "grad_norm": 0.654530942440033, "learning_rate": 8e-05, "loss": 1.6089, "num_input_tokens_seen": 292110316, "step": 2070 }, { "epoch": 0.12676839627313105, "grad_norm": 0.6449682712554932, "learning_rate": 8e-05, "loss": 1.6043, "num_input_tokens_seen": 293537852, "step": 2080 }, { "epoch": 0.12737785971675186, "grad_norm": 0.7298957705497742, "learning_rate": 8e-05, "loss": 1.6753, "num_input_tokens_seen": 294958232, "step": 2090 }, { "epoch": 0.12798732316037267, "grad_norm": 0.7615550756454468, "learning_rate": 8e-05, "loss": 1.5889, "num_input_tokens_seen": 296341104, "step": 2100 }, { "epoch": 0.1285967866039935, "grad_norm": 0.6716745495796204, "learning_rate": 8e-05, "loss": 1.6452, "num_input_tokens_seen": 297751192, "step": 2110 }, { "epoch": 0.12920625004761432, "grad_norm": 0.6735673546791077, "learning_rate": 8e-05, "loss": 1.5333, "num_input_tokens_seen": 299157160, "step": 2120 }, { "epoch": 0.12981571349123516, "grad_norm": 0.6383569836616516, "learning_rate": 8e-05, "loss": 1.7071, "num_input_tokens_seen": 300556440, "step": 2130 }, { "epoch": 0.13042517693485597, "grad_norm": 0.6580716967582703, "learning_rate": 8e-05, "loss": 1.5882, "num_input_tokens_seen": 301950588, "step": 2140 }, { "epoch": 0.13103464037847679, "grad_norm": 0.7268714904785156, "learning_rate": 8e-05, "loss": 1.6112, "num_input_tokens_seen": 303350336, "step": 2150 }, { "epoch": 0.13164410382209762, "grad_norm": 0.6901399493217468, "learning_rate": 8e-05, "loss": 1.572, "num_input_tokens_seen": 304781136, "step": 2160 }, { "epoch": 0.13225356726571844, "grad_norm": 0.6628725528717041, "learning_rate": 8e-05, "loss": 1.6983, "num_input_tokens_seen": 306176884, "step": 2170 }, { "epoch": 0.13286303070933927, "grad_norm": 0.688841700553894, "learning_rate": 8e-05, "loss": 1.6409, "num_input_tokens_seen": 307601844, "step": 2180 }, { "epoch": 0.1334724941529601, "grad_norm": 0.6499971747398376, "learning_rate": 8e-05, "loss": 1.5889, "num_input_tokens_seen": 308995452, "step": 2190 }, { "epoch": 0.1340819575965809, "grad_norm": 0.6463174223899841, "learning_rate": 8e-05, "loss": 1.6411, "num_input_tokens_seen": 310417804, "step": 2200 }, { "epoch": 0.13469142104020174, "grad_norm": 0.6206100583076477, "learning_rate": 8e-05, "loss": 1.59, "num_input_tokens_seen": 311846032, "step": 2210 }, { "epoch": 0.13530088448382255, "grad_norm": 0.6116955876350403, "learning_rate": 8e-05, "loss": 1.6057, "num_input_tokens_seen": 313234256, "step": 2220 }, { "epoch": 0.1359103479274434, "grad_norm": 0.7880204916000366, "learning_rate": 8e-05, "loss": 1.6058, "num_input_tokens_seen": 314639172, "step": 2230 }, { "epoch": 0.1365198113710642, "grad_norm": 0.6306136846542358, "learning_rate": 8e-05, "loss": 1.665, "num_input_tokens_seen": 316031736, "step": 2240 }, { "epoch": 0.137129274814685, "grad_norm": 0.6810283660888672, "learning_rate": 8e-05, "loss": 1.6544, "num_input_tokens_seen": 317438412, "step": 2250 }, { "epoch": 0.13773873825830585, "grad_norm": 0.671161949634552, "learning_rate": 8e-05, "loss": 1.6091, "num_input_tokens_seen": 318863568, "step": 2260 }, { "epoch": 0.13834820170192666, "grad_norm": 0.6195483803749084, "learning_rate": 8e-05, "loss": 1.5676, "num_input_tokens_seen": 320256312, "step": 2270 }, { "epoch": 0.1389576651455475, "grad_norm": 0.6664822101593018, "learning_rate": 8e-05, "loss": 1.6302, "num_input_tokens_seen": 321670092, "step": 2280 }, { "epoch": 0.1395671285891683, "grad_norm": 0.6218630075454712, "learning_rate": 8e-05, "loss": 1.6288, "num_input_tokens_seen": 323080412, "step": 2290 }, { "epoch": 0.14017659203278912, "grad_norm": 0.676080048084259, "learning_rate": 8e-05, "loss": 1.5945, "num_input_tokens_seen": 324469600, "step": 2300 }, { "epoch": 0.14078605547640996, "grad_norm": 0.5576323866844177, "learning_rate": 8e-05, "loss": 1.5904, "num_input_tokens_seen": 325857624, "step": 2310 }, { "epoch": 0.14139551892003077, "grad_norm": 0.6566000580787659, "learning_rate": 8e-05, "loss": 1.6492, "num_input_tokens_seen": 327258636, "step": 2320 }, { "epoch": 0.1420049823636516, "grad_norm": 0.6909166574478149, "learning_rate": 8e-05, "loss": 1.6071, "num_input_tokens_seen": 328702764, "step": 2330 }, { "epoch": 0.14261444580727242, "grad_norm": 0.647297739982605, "learning_rate": 8e-05, "loss": 1.5577, "num_input_tokens_seen": 330100304, "step": 2340 }, { "epoch": 0.14322390925089323, "grad_norm": 0.6550387740135193, "learning_rate": 8e-05, "loss": 1.6879, "num_input_tokens_seen": 331549276, "step": 2350 }, { "epoch": 0.14383337269451407, "grad_norm": 0.6743698716163635, "learning_rate": 8e-05, "loss": 1.7019, "num_input_tokens_seen": 332916584, "step": 2360 }, { "epoch": 0.14444283613813488, "grad_norm": 0.581924557685852, "learning_rate": 8e-05, "loss": 1.6591, "num_input_tokens_seen": 334299004, "step": 2370 }, { "epoch": 0.14505229958175572, "grad_norm": 0.6334020495414734, "learning_rate": 8e-05, "loss": 1.6316, "num_input_tokens_seen": 335702312, "step": 2380 }, { "epoch": 0.14566176302537653, "grad_norm": 0.5923337340354919, "learning_rate": 8e-05, "loss": 1.5888, "num_input_tokens_seen": 337127304, "step": 2390 }, { "epoch": 0.14627122646899735, "grad_norm": 0.6918947696685791, "learning_rate": 8e-05, "loss": 1.6433, "num_input_tokens_seen": 338550764, "step": 2400 }, { "epoch": 0.14688068991261818, "grad_norm": 0.7656240463256836, "learning_rate": 8e-05, "loss": 1.5949, "num_input_tokens_seen": 339962352, "step": 2410 }, { "epoch": 0.147490153356239, "grad_norm": 0.7092509269714355, "learning_rate": 8e-05, "loss": 1.6059, "num_input_tokens_seen": 341363788, "step": 2420 }, { "epoch": 0.14809961679985983, "grad_norm": 0.6121082901954651, "learning_rate": 8e-05, "loss": 1.5637, "num_input_tokens_seen": 342800568, "step": 2430 }, { "epoch": 0.14870908024348065, "grad_norm": 0.5944322347640991, "learning_rate": 8e-05, "loss": 1.6384, "num_input_tokens_seen": 344226896, "step": 2440 }, { "epoch": 0.14931854368710146, "grad_norm": 0.6050378084182739, "learning_rate": 8e-05, "loss": 1.6027, "num_input_tokens_seen": 345639348, "step": 2450 }, { "epoch": 0.1499280071307223, "grad_norm": 0.7982641458511353, "learning_rate": 8e-05, "loss": 1.5957, "num_input_tokens_seen": 347053896, "step": 2460 }, { "epoch": 0.1505374705743431, "grad_norm": 0.6125765442848206, "learning_rate": 8e-05, "loss": 1.576, "num_input_tokens_seen": 348438848, "step": 2470 }, { "epoch": 0.15114693401796395, "grad_norm": 0.728951096534729, "learning_rate": 8e-05, "loss": 1.6711, "num_input_tokens_seen": 349823624, "step": 2480 }, { "epoch": 0.15175639746158476, "grad_norm": 0.5719558596611023, "learning_rate": 8e-05, "loss": 1.6289, "num_input_tokens_seen": 351265596, "step": 2490 }, { "epoch": 0.15236586090520557, "grad_norm": 0.5965576767921448, "learning_rate": 8e-05, "loss": 1.6114, "num_input_tokens_seen": 352677476, "step": 2500 }, { "epoch": 0.1529753243488264, "grad_norm": 0.6597477197647095, "learning_rate": 8e-05, "loss": 1.5929, "num_input_tokens_seen": 354079432, "step": 2510 }, { "epoch": 0.15358478779244722, "grad_norm": 0.633777379989624, "learning_rate": 8e-05, "loss": 1.6062, "num_input_tokens_seen": 355500376, "step": 2520 }, { "epoch": 0.15419425123606806, "grad_norm": 0.6986291408538818, "learning_rate": 8e-05, "loss": 1.5994, "num_input_tokens_seen": 356938480, "step": 2530 }, { "epoch": 0.15480371467968887, "grad_norm": 0.6283817291259766, "learning_rate": 8e-05, "loss": 1.6403, "num_input_tokens_seen": 358347276, "step": 2540 }, { "epoch": 0.15541317812330968, "grad_norm": 0.6436857581138611, "learning_rate": 8e-05, "loss": 1.5707, "num_input_tokens_seen": 359743248, "step": 2550 }, { "epoch": 0.15602264156693052, "grad_norm": 0.6602137684822083, "learning_rate": 8e-05, "loss": 1.6227, "num_input_tokens_seen": 361133912, "step": 2560 }, { "epoch": 0.15663210501055133, "grad_norm": 0.7311964631080627, "learning_rate": 8e-05, "loss": 1.5496, "num_input_tokens_seen": 362551344, "step": 2570 }, { "epoch": 0.15724156845417217, "grad_norm": 0.6504777073860168, "learning_rate": 8e-05, "loss": 1.595, "num_input_tokens_seen": 363943344, "step": 2580 }, { "epoch": 0.15785103189779298, "grad_norm": 0.6566677093505859, "learning_rate": 8e-05, "loss": 1.5807, "num_input_tokens_seen": 365347512, "step": 2590 }, { "epoch": 0.1584604953414138, "grad_norm": 0.6459395885467529, "learning_rate": 8e-05, "loss": 1.6358, "num_input_tokens_seen": 366720924, "step": 2600 }, { "epoch": 0.15906995878503463, "grad_norm": 0.6944287419319153, "learning_rate": 8e-05, "loss": 1.586, "num_input_tokens_seen": 368150964, "step": 2610 }, { "epoch": 0.15967942222865544, "grad_norm": 0.5620209574699402, "learning_rate": 8e-05, "loss": 1.5993, "num_input_tokens_seen": 369511972, "step": 2620 }, { "epoch": 0.16028888567227628, "grad_norm": 0.6273583173751831, "learning_rate": 8e-05, "loss": 1.5833, "num_input_tokens_seen": 370942616, "step": 2630 }, { "epoch": 0.1608983491158971, "grad_norm": 0.669262707233429, "learning_rate": 8e-05, "loss": 1.5427, "num_input_tokens_seen": 372347592, "step": 2640 }, { "epoch": 0.1615078125595179, "grad_norm": 0.5964462161064148, "learning_rate": 8e-05, "loss": 1.6693, "num_input_tokens_seen": 373729360, "step": 2650 }, { "epoch": 0.16211727600313874, "grad_norm": 0.608135461807251, "learning_rate": 8e-05, "loss": 1.5437, "num_input_tokens_seen": 375181280, "step": 2660 }, { "epoch": 0.16272673944675956, "grad_norm": 0.6479006409645081, "learning_rate": 8e-05, "loss": 1.6328, "num_input_tokens_seen": 376596372, "step": 2670 }, { "epoch": 0.1633362028903804, "grad_norm": 0.6235690712928772, "learning_rate": 8e-05, "loss": 1.6596, "num_input_tokens_seen": 378020808, "step": 2680 }, { "epoch": 0.1639456663340012, "grad_norm": 0.6381850838661194, "learning_rate": 8e-05, "loss": 1.5794, "num_input_tokens_seen": 379380024, "step": 2690 }, { "epoch": 0.16455512977762202, "grad_norm": 0.6151752471923828, "learning_rate": 8e-05, "loss": 1.5792, "num_input_tokens_seen": 380803184, "step": 2700 }, { "epoch": 0.16516459322124286, "grad_norm": 0.6571025252342224, "learning_rate": 8e-05, "loss": 1.5779, "num_input_tokens_seen": 382188904, "step": 2710 }, { "epoch": 0.16577405666486367, "grad_norm": 0.694426417350769, "learning_rate": 8e-05, "loss": 1.6054, "num_input_tokens_seen": 383612672, "step": 2720 }, { "epoch": 0.1663835201084845, "grad_norm": 0.6907577514648438, "learning_rate": 8e-05, "loss": 1.5726, "num_input_tokens_seen": 385033960, "step": 2730 }, { "epoch": 0.16699298355210532, "grad_norm": 0.7059677243232727, "learning_rate": 8e-05, "loss": 1.5798, "num_input_tokens_seen": 386416964, "step": 2740 }, { "epoch": 0.16760244699572613, "grad_norm": 0.6744717359542847, "learning_rate": 8e-05, "loss": 1.6229, "num_input_tokens_seen": 387838080, "step": 2750 }, { "epoch": 0.16821191043934697, "grad_norm": 0.6473004817962646, "learning_rate": 8e-05, "loss": 1.5853, "num_input_tokens_seen": 389250364, "step": 2760 }, { "epoch": 0.16882137388296778, "grad_norm": 0.7390977144241333, "learning_rate": 8e-05, "loss": 1.5684, "num_input_tokens_seen": 390679648, "step": 2770 }, { "epoch": 0.16943083732658862, "grad_norm": 0.7005506753921509, "learning_rate": 8e-05, "loss": 1.6613, "num_input_tokens_seen": 392061336, "step": 2780 }, { "epoch": 0.17004030077020943, "grad_norm": 0.6911351084709167, "learning_rate": 8e-05, "loss": 1.5572, "num_input_tokens_seen": 393461980, "step": 2790 }, { "epoch": 0.17064976421383024, "grad_norm": 0.5468894243240356, "learning_rate": 8e-05, "loss": 1.589, "num_input_tokens_seen": 394847452, "step": 2800 }, { "epoch": 0.17125922765745108, "grad_norm": 0.5998040437698364, "learning_rate": 8e-05, "loss": 1.5748, "num_input_tokens_seen": 396233508, "step": 2810 }, { "epoch": 0.1718686911010719, "grad_norm": 0.7814628481864929, "learning_rate": 8e-05, "loss": 1.5206, "num_input_tokens_seen": 397650544, "step": 2820 }, { "epoch": 0.17247815454469273, "grad_norm": 0.6909236311912537, "learning_rate": 8e-05, "loss": 1.5474, "num_input_tokens_seen": 399046560, "step": 2830 }, { "epoch": 0.17308761798831354, "grad_norm": 0.7126919627189636, "learning_rate": 8e-05, "loss": 1.5615, "num_input_tokens_seen": 400408700, "step": 2840 }, { "epoch": 0.17369708143193435, "grad_norm": 0.6222878098487854, "learning_rate": 8e-05, "loss": 1.5642, "num_input_tokens_seen": 401806840, "step": 2850 }, { "epoch": 0.1743065448755552, "grad_norm": 0.6794285774230957, "learning_rate": 8e-05, "loss": 1.5793, "num_input_tokens_seen": 403203952, "step": 2860 }, { "epoch": 0.174916008319176, "grad_norm": 0.609735906124115, "learning_rate": 8e-05, "loss": 1.6068, "num_input_tokens_seen": 404591240, "step": 2870 }, { "epoch": 0.17552547176279681, "grad_norm": 0.6210402250289917, "learning_rate": 8e-05, "loss": 1.6045, "num_input_tokens_seen": 405971452, "step": 2880 }, { "epoch": 0.17613493520641765, "grad_norm": 0.6104692816734314, "learning_rate": 8e-05, "loss": 1.5263, "num_input_tokens_seen": 407388256, "step": 2890 }, { "epoch": 0.17674439865003846, "grad_norm": 0.6604185700416565, "learning_rate": 8e-05, "loss": 1.6198, "num_input_tokens_seen": 408813112, "step": 2900 }, { "epoch": 0.1773538620936593, "grad_norm": 0.6400834321975708, "learning_rate": 8e-05, "loss": 1.5672, "num_input_tokens_seen": 410235500, "step": 2910 }, { "epoch": 0.17796332553728011, "grad_norm": 0.6400529742240906, "learning_rate": 8e-05, "loss": 1.5761, "num_input_tokens_seen": 411639812, "step": 2920 }, { "epoch": 0.17857278898090093, "grad_norm": 0.603792130947113, "learning_rate": 8e-05, "loss": 1.6047, "num_input_tokens_seen": 413089172, "step": 2930 }, { "epoch": 0.17918225242452177, "grad_norm": 0.6779669523239136, "learning_rate": 8e-05, "loss": 1.5808, "num_input_tokens_seen": 414491152, "step": 2940 }, { "epoch": 0.17979171586814258, "grad_norm": 0.6752368807792664, "learning_rate": 8e-05, "loss": 1.5736, "num_input_tokens_seen": 415907296, "step": 2950 }, { "epoch": 0.18040117931176342, "grad_norm": 0.6246203184127808, "learning_rate": 8e-05, "loss": 1.5875, "num_input_tokens_seen": 417303208, "step": 2960 }, { "epoch": 0.18101064275538423, "grad_norm": 0.6521744728088379, "learning_rate": 8e-05, "loss": 1.586, "num_input_tokens_seen": 418730556, "step": 2970 }, { "epoch": 0.18162010619900504, "grad_norm": 0.6168652176856995, "learning_rate": 8e-05, "loss": 1.541, "num_input_tokens_seen": 420143632, "step": 2980 }, { "epoch": 0.18222956964262588, "grad_norm": 0.6491835713386536, "learning_rate": 8e-05, "loss": 1.551, "num_input_tokens_seen": 421602604, "step": 2990 }, { "epoch": 0.1828390330862467, "grad_norm": 0.8011573553085327, "learning_rate": 8e-05, "loss": 1.6447, "num_input_tokens_seen": 423013584, "step": 3000 }, { "epoch": 0.18344849652986753, "grad_norm": 0.641477644443512, "learning_rate": 8e-05, "loss": 1.5983, "num_input_tokens_seen": 424408696, "step": 3010 }, { "epoch": 0.18405795997348834, "grad_norm": 0.6406200528144836, "learning_rate": 8e-05, "loss": 1.6416, "num_input_tokens_seen": 425827168, "step": 3020 }, { "epoch": 0.18466742341710915, "grad_norm": 0.6057642102241516, "learning_rate": 8e-05, "loss": 1.5256, "num_input_tokens_seen": 427245472, "step": 3030 }, { "epoch": 0.18527688686073, "grad_norm": 0.6526055932044983, "learning_rate": 8e-05, "loss": 1.4959, "num_input_tokens_seen": 428628168, "step": 3040 }, { "epoch": 0.1858863503043508, "grad_norm": 0.6561000943183899, "learning_rate": 8e-05, "loss": 1.5155, "num_input_tokens_seen": 430033608, "step": 3050 }, { "epoch": 0.18649581374797164, "grad_norm": 0.6516884565353394, "learning_rate": 8e-05, "loss": 1.6234, "num_input_tokens_seen": 431432912, "step": 3060 }, { "epoch": 0.18710527719159245, "grad_norm": 0.6904422044754028, "learning_rate": 8e-05, "loss": 1.4835, "num_input_tokens_seen": 432800096, "step": 3070 }, { "epoch": 0.18771474063521326, "grad_norm": 0.5921774506568909, "learning_rate": 8e-05, "loss": 1.5685, "num_input_tokens_seen": 434190712, "step": 3080 }, { "epoch": 0.1883242040788341, "grad_norm": 0.5747608542442322, "learning_rate": 8e-05, "loss": 1.6084, "num_input_tokens_seen": 435587468, "step": 3090 }, { "epoch": 0.1889336675224549, "grad_norm": 0.6614587306976318, "learning_rate": 8e-05, "loss": 1.5626, "num_input_tokens_seen": 437003988, "step": 3100 }, { "epoch": 0.18954313096607575, "grad_norm": 0.5542994141578674, "learning_rate": 8e-05, "loss": 1.6015, "num_input_tokens_seen": 438403756, "step": 3110 }, { "epoch": 0.19015259440969656, "grad_norm": 0.6427204608917236, "learning_rate": 8e-05, "loss": 1.529, "num_input_tokens_seen": 439826144, "step": 3120 }, { "epoch": 0.19076205785331737, "grad_norm": 0.560806930065155, "learning_rate": 8e-05, "loss": 1.5518, "num_input_tokens_seen": 441217644, "step": 3130 }, { "epoch": 0.1913715212969382, "grad_norm": 0.6509169340133667, "learning_rate": 8e-05, "loss": 1.5269, "num_input_tokens_seen": 442617344, "step": 3140 }, { "epoch": 0.19198098474055902, "grad_norm": 0.6311179995536804, "learning_rate": 8e-05, "loss": 1.5414, "num_input_tokens_seen": 444028916, "step": 3150 }, { "epoch": 0.19259044818417986, "grad_norm": 0.6070147752761841, "learning_rate": 8e-05, "loss": 1.5521, "num_input_tokens_seen": 445413608, "step": 3160 }, { "epoch": 0.19319991162780067, "grad_norm": 0.6126248836517334, "learning_rate": 8e-05, "loss": 1.6203, "num_input_tokens_seen": 446824748, "step": 3170 }, { "epoch": 0.19380937507142149, "grad_norm": 0.6572065949440002, "learning_rate": 8e-05, "loss": 1.56, "num_input_tokens_seen": 448217780, "step": 3180 }, { "epoch": 0.19441883851504232, "grad_norm": 0.6731168627738953, "learning_rate": 8e-05, "loss": 1.6071, "num_input_tokens_seen": 449669388, "step": 3190 }, { "epoch": 0.19502830195866314, "grad_norm": 0.5746667385101318, "learning_rate": 8e-05, "loss": 1.6396, "num_input_tokens_seen": 451076256, "step": 3200 }, { "epoch": 0.19563776540228398, "grad_norm": 0.5883861184120178, "learning_rate": 8e-05, "loss": 1.5441, "num_input_tokens_seen": 452433176, "step": 3210 }, { "epoch": 0.1962472288459048, "grad_norm": 0.6213635206222534, "learning_rate": 8e-05, "loss": 1.5408, "num_input_tokens_seen": 453864744, "step": 3220 }, { "epoch": 0.1968566922895256, "grad_norm": 0.5844454169273376, "learning_rate": 8e-05, "loss": 1.5892, "num_input_tokens_seen": 455266916, "step": 3230 }, { "epoch": 0.19746615573314644, "grad_norm": 0.7327582240104675, "learning_rate": 8e-05, "loss": 1.5541, "num_input_tokens_seen": 456655360, "step": 3240 }, { "epoch": 0.19807561917676725, "grad_norm": 0.7098453044891357, "learning_rate": 8e-05, "loss": 1.5219, "num_input_tokens_seen": 458100268, "step": 3250 }, { "epoch": 0.1986850826203881, "grad_norm": 0.6221525073051453, "learning_rate": 8e-05, "loss": 1.5587, "num_input_tokens_seen": 459523316, "step": 3260 }, { "epoch": 0.1992945460640089, "grad_norm": 0.646511971950531, "learning_rate": 8e-05, "loss": 1.6495, "num_input_tokens_seen": 460960464, "step": 3270 }, { "epoch": 0.1999040095076297, "grad_norm": 0.6622752547264099, "learning_rate": 8e-05, "loss": 1.5559, "num_input_tokens_seen": 462371980, "step": 3280 }, { "epoch": 0.20051347295125055, "grad_norm": 0.5687728524208069, "learning_rate": 8e-05, "loss": 1.5691, "num_input_tokens_seen": 463756352, "step": 3290 }, { "epoch": 0.20112293639487136, "grad_norm": 0.6098371148109436, "learning_rate": 8e-05, "loss": 1.5524, "num_input_tokens_seen": 465193884, "step": 3300 }, { "epoch": 0.2017323998384922, "grad_norm": 0.681311309337616, "learning_rate": 8e-05, "loss": 1.5381, "num_input_tokens_seen": 466639236, "step": 3310 }, { "epoch": 0.202341863282113, "grad_norm": 0.5747373104095459, "learning_rate": 8e-05, "loss": 1.5177, "num_input_tokens_seen": 468072688, "step": 3320 }, { "epoch": 0.20295132672573382, "grad_norm": 0.5559400320053101, "learning_rate": 8e-05, "loss": 1.5873, "num_input_tokens_seen": 469442532, "step": 3330 }, { "epoch": 0.20356079016935466, "grad_norm": 0.6499312520027161, "learning_rate": 8e-05, "loss": 1.6033, "num_input_tokens_seen": 470904716, "step": 3340 }, { "epoch": 0.20417025361297547, "grad_norm": 0.5671247839927673, "learning_rate": 8e-05, "loss": 1.546, "num_input_tokens_seen": 472320820, "step": 3350 }, { "epoch": 0.2047797170565963, "grad_norm": 0.605981171131134, "learning_rate": 8e-05, "loss": 1.5692, "num_input_tokens_seen": 473731260, "step": 3360 }, { "epoch": 0.20538918050021712, "grad_norm": 0.7082146406173706, "learning_rate": 8e-05, "loss": 1.502, "num_input_tokens_seen": 475164164, "step": 3370 }, { "epoch": 0.20599864394383793, "grad_norm": 0.6158855557441711, "learning_rate": 8e-05, "loss": 1.5276, "num_input_tokens_seen": 476603328, "step": 3380 }, { "epoch": 0.20660810738745877, "grad_norm": 0.701566755771637, "learning_rate": 8e-05, "loss": 1.451, "num_input_tokens_seen": 477996612, "step": 3390 }, { "epoch": 0.20721757083107958, "grad_norm": 0.7635865211486816, "learning_rate": 8e-05, "loss": 1.5975, "num_input_tokens_seen": 479446800, "step": 3400 }, { "epoch": 0.20782703427470042, "grad_norm": 0.5621991753578186, "learning_rate": 8e-05, "loss": 1.5216, "num_input_tokens_seen": 480863616, "step": 3410 }, { "epoch": 0.20843649771832123, "grad_norm": 0.6029215455055237, "learning_rate": 8e-05, "loss": 1.671, "num_input_tokens_seen": 482273428, "step": 3420 }, { "epoch": 0.20904596116194205, "grad_norm": 0.6263468265533447, "learning_rate": 8e-05, "loss": 1.6108, "num_input_tokens_seen": 483698556, "step": 3430 }, { "epoch": 0.20965542460556288, "grad_norm": 0.6057296991348267, "learning_rate": 8e-05, "loss": 1.5797, "num_input_tokens_seen": 485164152, "step": 3440 }, { "epoch": 0.2102648880491837, "grad_norm": 0.639839231967926, "learning_rate": 8e-05, "loss": 1.5568, "num_input_tokens_seen": 486564516, "step": 3450 }, { "epoch": 0.21087435149280453, "grad_norm": 0.6368933320045471, "learning_rate": 8e-05, "loss": 1.5162, "num_input_tokens_seen": 487982336, "step": 3460 }, { "epoch": 0.21148381493642535, "grad_norm": 0.6618413925170898, "learning_rate": 8e-05, "loss": 1.5737, "num_input_tokens_seen": 489402684, "step": 3470 }, { "epoch": 0.21209327838004616, "grad_norm": 0.7187027335166931, "learning_rate": 8e-05, "loss": 1.5436, "num_input_tokens_seen": 490822360, "step": 3480 }, { "epoch": 0.212702741823667, "grad_norm": 0.6799234747886658, "learning_rate": 8e-05, "loss": 1.6655, "num_input_tokens_seen": 492270836, "step": 3490 }, { "epoch": 0.2133122052672878, "grad_norm": 0.6822348833084106, "learning_rate": 8e-05, "loss": 1.5637, "num_input_tokens_seen": 493667632, "step": 3500 }, { "epoch": 0.21392166871090865, "grad_norm": 0.5559425950050354, "learning_rate": 8e-05, "loss": 1.6041, "num_input_tokens_seen": 495092028, "step": 3510 }, { "epoch": 0.21453113215452946, "grad_norm": 0.6507359147071838, "learning_rate": 8e-05, "loss": 1.5193, "num_input_tokens_seen": 496485460, "step": 3520 }, { "epoch": 0.21514059559815027, "grad_norm": 0.6291355490684509, "learning_rate": 8e-05, "loss": 1.5885, "num_input_tokens_seen": 497903368, "step": 3530 }, { "epoch": 0.2157500590417711, "grad_norm": 0.6117660999298096, "learning_rate": 8e-05, "loss": 1.583, "num_input_tokens_seen": 499318444, "step": 3540 }, { "epoch": 0.21635952248539192, "grad_norm": 0.6362354159355164, "learning_rate": 8e-05, "loss": 1.5588, "num_input_tokens_seen": 500692116, "step": 3550 }, { "epoch": 0.21696898592901276, "grad_norm": 0.7075550556182861, "learning_rate": 8e-05, "loss": 1.6182, "num_input_tokens_seen": 502070960, "step": 3560 }, { "epoch": 0.21757844937263357, "grad_norm": 0.5841058492660522, "learning_rate": 8e-05, "loss": 1.5713, "num_input_tokens_seen": 503481416, "step": 3570 }, { "epoch": 0.21818791281625438, "grad_norm": 0.6521745920181274, "learning_rate": 8e-05, "loss": 1.5641, "num_input_tokens_seen": 504920860, "step": 3580 }, { "epoch": 0.21879737625987522, "grad_norm": 0.527927041053772, "learning_rate": 8e-05, "loss": 1.5431, "num_input_tokens_seen": 506389836, "step": 3590 }, { "epoch": 0.21940683970349603, "grad_norm": 0.6047568321228027, "learning_rate": 8e-05, "loss": 1.5653, "num_input_tokens_seen": 507827788, "step": 3600 }, { "epoch": 0.22001630314711687, "grad_norm": 0.5986194610595703, "learning_rate": 8e-05, "loss": 1.5669, "num_input_tokens_seen": 509254352, "step": 3610 }, { "epoch": 0.22062576659073768, "grad_norm": 0.7015334367752075, "learning_rate": 8e-05, "loss": 1.5154, "num_input_tokens_seen": 510665164, "step": 3620 }, { "epoch": 0.2212352300343585, "grad_norm": 0.6172800064086914, "learning_rate": 8e-05, "loss": 1.5364, "num_input_tokens_seen": 512088312, "step": 3630 }, { "epoch": 0.22184469347797933, "grad_norm": 0.7089115381240845, "learning_rate": 8e-05, "loss": 1.644, "num_input_tokens_seen": 513489328, "step": 3640 }, { "epoch": 0.22245415692160014, "grad_norm": 0.5689125657081604, "learning_rate": 8e-05, "loss": 1.5437, "num_input_tokens_seen": 514882072, "step": 3650 }, { "epoch": 0.22306362036522095, "grad_norm": 0.610406756401062, "learning_rate": 8e-05, "loss": 1.5451, "num_input_tokens_seen": 516281980, "step": 3660 }, { "epoch": 0.2236730838088418, "grad_norm": 0.5259641408920288, "learning_rate": 8e-05, "loss": 1.5613, "num_input_tokens_seen": 517696272, "step": 3670 }, { "epoch": 0.2242825472524626, "grad_norm": 0.6053969264030457, "learning_rate": 8e-05, "loss": 1.5655, "num_input_tokens_seen": 519099176, "step": 3680 }, { "epoch": 0.22489201069608344, "grad_norm": 0.6376533508300781, "learning_rate": 8e-05, "loss": 1.5481, "num_input_tokens_seen": 520548316, "step": 3690 }, { "epoch": 0.22550147413970426, "grad_norm": 0.6387814879417419, "learning_rate": 8e-05, "loss": 1.5541, "num_input_tokens_seen": 521977404, "step": 3700 }, { "epoch": 0.22611093758332507, "grad_norm": 0.5563859343528748, "learning_rate": 8e-05, "loss": 1.5862, "num_input_tokens_seen": 523431332, "step": 3710 }, { "epoch": 0.2267204010269459, "grad_norm": 0.5293943285942078, "learning_rate": 8e-05, "loss": 1.5249, "num_input_tokens_seen": 524857860, "step": 3720 }, { "epoch": 0.22732986447056672, "grad_norm": 0.6297414898872375, "learning_rate": 8e-05, "loss": 1.5081, "num_input_tokens_seen": 526236604, "step": 3730 }, { "epoch": 0.22793932791418756, "grad_norm": 0.7299327850341797, "learning_rate": 8e-05, "loss": 1.5876, "num_input_tokens_seen": 527654036, "step": 3740 }, { "epoch": 0.22854879135780837, "grad_norm": 0.6514659523963928, "learning_rate": 8e-05, "loss": 1.5381, "num_input_tokens_seen": 529042780, "step": 3750 }, { "epoch": 0.22915825480142918, "grad_norm": 0.5427886247634888, "learning_rate": 8e-05, "loss": 1.5652, "num_input_tokens_seen": 530471304, "step": 3760 }, { "epoch": 0.22976771824505002, "grad_norm": 0.6196932196617126, "learning_rate": 8e-05, "loss": 1.5239, "num_input_tokens_seen": 531891692, "step": 3770 }, { "epoch": 0.23037718168867083, "grad_norm": 0.628355860710144, "learning_rate": 8e-05, "loss": 1.4701, "num_input_tokens_seen": 533308800, "step": 3780 }, { "epoch": 0.23098664513229167, "grad_norm": 0.5529937744140625, "learning_rate": 8e-05, "loss": 1.5185, "num_input_tokens_seen": 534696500, "step": 3790 }, { "epoch": 0.23159610857591248, "grad_norm": 0.5507948994636536, "learning_rate": 8e-05, "loss": 1.5384, "num_input_tokens_seen": 536107284, "step": 3800 }, { "epoch": 0.2322055720195333, "grad_norm": 0.6283483505249023, "learning_rate": 8e-05, "loss": 1.6055, "num_input_tokens_seen": 537501972, "step": 3810 }, { "epoch": 0.23281503546315413, "grad_norm": 0.6029003858566284, "learning_rate": 8e-05, "loss": 1.4906, "num_input_tokens_seen": 538873180, "step": 3820 }, { "epoch": 0.23342449890677494, "grad_norm": 0.6179420351982117, "learning_rate": 8e-05, "loss": 1.5218, "num_input_tokens_seen": 540283024, "step": 3830 }, { "epoch": 0.23403396235039578, "grad_norm": 0.6198109984397888, "learning_rate": 8e-05, "loss": 1.5806, "num_input_tokens_seen": 541699176, "step": 3840 }, { "epoch": 0.2346434257940166, "grad_norm": 0.6719751954078674, "learning_rate": 8e-05, "loss": 1.4983, "num_input_tokens_seen": 543113424, "step": 3850 }, { "epoch": 0.2352528892376374, "grad_norm": 0.7113502025604248, "learning_rate": 8e-05, "loss": 1.5353, "num_input_tokens_seen": 544510612, "step": 3860 }, { "epoch": 0.23586235268125824, "grad_norm": 0.6690021753311157, "learning_rate": 8e-05, "loss": 1.5478, "num_input_tokens_seen": 545973208, "step": 3870 }, { "epoch": 0.23647181612487905, "grad_norm": 0.5268478989601135, "learning_rate": 8e-05, "loss": 1.5472, "num_input_tokens_seen": 547339972, "step": 3880 }, { "epoch": 0.2370812795684999, "grad_norm": 0.5470036864280701, "learning_rate": 8e-05, "loss": 1.6029, "num_input_tokens_seen": 548743084, "step": 3890 }, { "epoch": 0.2376907430121207, "grad_norm": 0.5641947388648987, "learning_rate": 8e-05, "loss": 1.5503, "num_input_tokens_seen": 550127048, "step": 3900 }, { "epoch": 0.23830020645574151, "grad_norm": 0.6231464147567749, "learning_rate": 8e-05, "loss": 1.518, "num_input_tokens_seen": 551531300, "step": 3910 }, { "epoch": 0.23890966989936235, "grad_norm": 0.5082821846008301, "learning_rate": 8e-05, "loss": 1.4749, "num_input_tokens_seen": 552910292, "step": 3920 }, { "epoch": 0.23951913334298316, "grad_norm": 0.7179940938949585, "learning_rate": 8e-05, "loss": 1.511, "num_input_tokens_seen": 554318712, "step": 3930 }, { "epoch": 0.240128596786604, "grad_norm": 0.6112212538719177, "learning_rate": 8e-05, "loss": 1.4962, "num_input_tokens_seen": 555780652, "step": 3940 }, { "epoch": 0.24073806023022482, "grad_norm": 0.7789644598960876, "learning_rate": 8e-05, "loss": 1.5523, "num_input_tokens_seen": 557205392, "step": 3950 }, { "epoch": 0.24134752367384563, "grad_norm": 0.559933066368103, "learning_rate": 8e-05, "loss": 1.5004, "num_input_tokens_seen": 558642004, "step": 3960 }, { "epoch": 0.24195698711746647, "grad_norm": 0.6398128867149353, "learning_rate": 8e-05, "loss": 1.5336, "num_input_tokens_seen": 560061460, "step": 3970 }, { "epoch": 0.24256645056108728, "grad_norm": 0.5948666334152222, "learning_rate": 8e-05, "loss": 1.6063, "num_input_tokens_seen": 561476852, "step": 3980 }, { "epoch": 0.24317591400470812, "grad_norm": 0.5978219509124756, "learning_rate": 8e-05, "loss": 1.535, "num_input_tokens_seen": 562885480, "step": 3990 }, { "epoch": 0.24378537744832893, "grad_norm": 0.5732299089431763, "learning_rate": 8e-05, "loss": 1.6014, "num_input_tokens_seen": 564296708, "step": 4000 }, { "epoch": 0.24439484089194974, "grad_norm": 0.5508894324302673, "learning_rate": 8e-05, "loss": 1.5665, "num_input_tokens_seen": 565716316, "step": 4010 }, { "epoch": 0.24500430433557058, "grad_norm": 0.6127896904945374, "learning_rate": 8e-05, "loss": 1.5016, "num_input_tokens_seen": 567148104, "step": 4020 }, { "epoch": 0.2456137677791914, "grad_norm": 0.6429306864738464, "learning_rate": 8e-05, "loss": 1.5325, "num_input_tokens_seen": 568546676, "step": 4030 }, { "epoch": 0.24622323122281223, "grad_norm": 0.5604584813117981, "learning_rate": 8e-05, "loss": 1.5726, "num_input_tokens_seen": 569955020, "step": 4040 }, { "epoch": 0.24683269466643304, "grad_norm": 0.5747659206390381, "learning_rate": 8e-05, "loss": 1.613, "num_input_tokens_seen": 571360824, "step": 4050 }, { "epoch": 0.24744215811005385, "grad_norm": 0.6003303527832031, "learning_rate": 8e-05, "loss": 1.4661, "num_input_tokens_seen": 572783056, "step": 4060 }, { "epoch": 0.2480516215536747, "grad_norm": 0.5869039297103882, "learning_rate": 8e-05, "loss": 1.5553, "num_input_tokens_seen": 574182888, "step": 4070 }, { "epoch": 0.2486610849972955, "grad_norm": 0.6359077095985413, "learning_rate": 8e-05, "loss": 1.5035, "num_input_tokens_seen": 575590724, "step": 4080 }, { "epoch": 0.24927054844091634, "grad_norm": 0.6442738175392151, "learning_rate": 8e-05, "loss": 1.5434, "num_input_tokens_seen": 577002740, "step": 4090 }, { "epoch": 0.24988001188453715, "grad_norm": 0.5803224444389343, "learning_rate": 8e-05, "loss": 1.5408, "num_input_tokens_seen": 578408172, "step": 4100 }, { "epoch": 0.25048947532815796, "grad_norm": 0.5301817059516907, "learning_rate": 8e-05, "loss": 1.5587, "num_input_tokens_seen": 579811960, "step": 4110 }, { "epoch": 0.2510989387717788, "grad_norm": 0.5798521637916565, "learning_rate": 8e-05, "loss": 1.5368, "num_input_tokens_seen": 581216492, "step": 4120 }, { "epoch": 0.25170840221539964, "grad_norm": 0.6303583979606628, "learning_rate": 8e-05, "loss": 1.5264, "num_input_tokens_seen": 582601144, "step": 4130 }, { "epoch": 0.2523178656590204, "grad_norm": 0.5813618898391724, "learning_rate": 8e-05, "loss": 1.5848, "num_input_tokens_seen": 583981308, "step": 4140 }, { "epoch": 0.25292732910264126, "grad_norm": 0.6459015607833862, "learning_rate": 8e-05, "loss": 1.5018, "num_input_tokens_seen": 585375584, "step": 4150 }, { "epoch": 0.2535367925462621, "grad_norm": 0.5455829501152039, "learning_rate": 8e-05, "loss": 1.5464, "num_input_tokens_seen": 586802916, "step": 4160 }, { "epoch": 0.2541462559898829, "grad_norm": 0.6944795250892639, "learning_rate": 8e-05, "loss": 1.548, "num_input_tokens_seen": 588224368, "step": 4170 }, { "epoch": 0.2547557194335037, "grad_norm": 0.49821239709854126, "learning_rate": 8e-05, "loss": 1.5018, "num_input_tokens_seen": 589678916, "step": 4180 }, { "epoch": 0.25536518287712456, "grad_norm": 0.6219947338104248, "learning_rate": 8e-05, "loss": 1.4944, "num_input_tokens_seen": 591107332, "step": 4190 }, { "epoch": 0.25597464632074535, "grad_norm": 0.6208930611610413, "learning_rate": 8e-05, "loss": 1.5005, "num_input_tokens_seen": 592503952, "step": 4200 }, { "epoch": 0.2565841097643662, "grad_norm": 0.5741164684295654, "learning_rate": 8e-05, "loss": 1.551, "num_input_tokens_seen": 593961892, "step": 4210 }, { "epoch": 0.257193573207987, "grad_norm": 0.6542494893074036, "learning_rate": 8e-05, "loss": 1.4703, "num_input_tokens_seen": 595372500, "step": 4220 }, { "epoch": 0.25780303665160786, "grad_norm": 0.5730968117713928, "learning_rate": 8e-05, "loss": 1.5682, "num_input_tokens_seen": 596779600, "step": 4230 }, { "epoch": 0.25841250009522865, "grad_norm": 0.683051347732544, "learning_rate": 8e-05, "loss": 1.4536, "num_input_tokens_seen": 598184840, "step": 4240 }, { "epoch": 0.2590219635388495, "grad_norm": 0.7352629899978638, "learning_rate": 8e-05, "loss": 1.5573, "num_input_tokens_seen": 599586716, "step": 4250 }, { "epoch": 0.2596314269824703, "grad_norm": 0.5975140929222107, "learning_rate": 8e-05, "loss": 1.4868, "num_input_tokens_seen": 600969532, "step": 4260 }, { "epoch": 0.2602408904260911, "grad_norm": 0.5328396558761597, "learning_rate": 8e-05, "loss": 1.4971, "num_input_tokens_seen": 602381612, "step": 4270 }, { "epoch": 0.26085035386971195, "grad_norm": 0.832878053188324, "learning_rate": 8e-05, "loss": 1.5396, "num_input_tokens_seen": 603796776, "step": 4280 }, { "epoch": 0.2614598173133328, "grad_norm": 0.5859492421150208, "learning_rate": 8e-05, "loss": 1.5363, "num_input_tokens_seen": 605221644, "step": 4290 }, { "epoch": 0.26206928075695357, "grad_norm": 0.5248042345046997, "learning_rate": 8e-05, "loss": 1.5086, "num_input_tokens_seen": 606642144, "step": 4300 }, { "epoch": 0.2626787442005744, "grad_norm": 0.5584684014320374, "learning_rate": 8e-05, "loss": 1.5288, "num_input_tokens_seen": 608083892, "step": 4310 }, { "epoch": 0.26328820764419525, "grad_norm": 0.5786579251289368, "learning_rate": 8e-05, "loss": 1.5388, "num_input_tokens_seen": 609490492, "step": 4320 }, { "epoch": 0.2638976710878161, "grad_norm": 0.6442510485649109, "learning_rate": 8e-05, "loss": 1.5189, "num_input_tokens_seen": 610881576, "step": 4330 }, { "epoch": 0.26450713453143687, "grad_norm": 0.5844077467918396, "learning_rate": 8e-05, "loss": 1.4962, "num_input_tokens_seen": 612271752, "step": 4340 }, { "epoch": 0.2651165979750577, "grad_norm": 0.5478509664535522, "learning_rate": 8e-05, "loss": 1.479, "num_input_tokens_seen": 613667596, "step": 4350 }, { "epoch": 0.26572606141867855, "grad_norm": 0.5829759836196899, "learning_rate": 8e-05, "loss": 1.5768, "num_input_tokens_seen": 615067476, "step": 4360 }, { "epoch": 0.26633552486229933, "grad_norm": 0.6037372350692749, "learning_rate": 8e-05, "loss": 1.5158, "num_input_tokens_seen": 616456876, "step": 4370 }, { "epoch": 0.2669449883059202, "grad_norm": 0.7143189311027527, "learning_rate": 8e-05, "loss": 1.5373, "num_input_tokens_seen": 617885136, "step": 4380 }, { "epoch": 0.267554451749541, "grad_norm": 0.5982239246368408, "learning_rate": 8e-05, "loss": 1.5993, "num_input_tokens_seen": 619310072, "step": 4390 }, { "epoch": 0.2681639151931618, "grad_norm": 0.5947147011756897, "learning_rate": 8e-05, "loss": 1.5218, "num_input_tokens_seen": 620729588, "step": 4400 }, { "epoch": 0.26877337863678263, "grad_norm": 0.6012521386146545, "learning_rate": 8e-05, "loss": 1.4877, "num_input_tokens_seen": 622107468, "step": 4410 }, { "epoch": 0.2693828420804035, "grad_norm": 0.8092216849327087, "learning_rate": 8e-05, "loss": 1.5039, "num_input_tokens_seen": 623481492, "step": 4420 }, { "epoch": 0.2699923055240243, "grad_norm": 0.5891050100326538, "learning_rate": 8e-05, "loss": 1.6217, "num_input_tokens_seen": 624886240, "step": 4430 }, { "epoch": 0.2706017689676451, "grad_norm": 0.5771660804748535, "learning_rate": 8e-05, "loss": 1.464, "num_input_tokens_seen": 626297120, "step": 4440 }, { "epoch": 0.27121123241126593, "grad_norm": 0.6003730893135071, "learning_rate": 8e-05, "loss": 1.5165, "num_input_tokens_seen": 627731736, "step": 4450 }, { "epoch": 0.2718206958548868, "grad_norm": 0.545914351940155, "learning_rate": 8e-05, "loss": 1.5915, "num_input_tokens_seen": 629175788, "step": 4460 }, { "epoch": 0.27243015929850756, "grad_norm": 0.5131253004074097, "learning_rate": 8e-05, "loss": 1.519, "num_input_tokens_seen": 630590732, "step": 4470 }, { "epoch": 0.2730396227421284, "grad_norm": 0.6029266715049744, "learning_rate": 8e-05, "loss": 1.5594, "num_input_tokens_seen": 632034684, "step": 4480 }, { "epoch": 0.27364908618574924, "grad_norm": 0.7176979780197144, "learning_rate": 8e-05, "loss": 1.4626, "num_input_tokens_seen": 633439316, "step": 4490 }, { "epoch": 0.27425854962937, "grad_norm": 0.5998932719230652, "learning_rate": 8e-05, "loss": 1.4572, "num_input_tokens_seen": 634866124, "step": 4500 }, { "epoch": 0.27486801307299086, "grad_norm": 0.6117346882820129, "learning_rate": 8e-05, "loss": 1.4799, "num_input_tokens_seen": 636287180, "step": 4510 }, { "epoch": 0.2754774765166117, "grad_norm": 0.6476706862449646, "learning_rate": 8e-05, "loss": 1.4876, "num_input_tokens_seen": 637660604, "step": 4520 }, { "epoch": 0.27608693996023254, "grad_norm": 0.5890971422195435, "learning_rate": 8e-05, "loss": 1.497, "num_input_tokens_seen": 639105088, "step": 4530 }, { "epoch": 0.2766964034038533, "grad_norm": 0.5373267531394958, "learning_rate": 8e-05, "loss": 1.4828, "num_input_tokens_seen": 640515580, "step": 4540 }, { "epoch": 0.27730586684747416, "grad_norm": 0.5229721665382385, "learning_rate": 8e-05, "loss": 1.4927, "num_input_tokens_seen": 641960740, "step": 4550 }, { "epoch": 0.277915330291095, "grad_norm": 0.5390836000442505, "learning_rate": 8e-05, "loss": 1.5461, "num_input_tokens_seen": 643367736, "step": 4560 }, { "epoch": 0.2785247937347158, "grad_norm": 0.5476319789886475, "learning_rate": 8e-05, "loss": 1.5791, "num_input_tokens_seen": 644815132, "step": 4570 }, { "epoch": 0.2791342571783366, "grad_norm": 0.5691061615943909, "learning_rate": 8e-05, "loss": 1.5169, "num_input_tokens_seen": 646189852, "step": 4580 }, { "epoch": 0.27974372062195746, "grad_norm": 0.5501654744148254, "learning_rate": 8e-05, "loss": 1.4543, "num_input_tokens_seen": 647603012, "step": 4590 }, { "epoch": 0.28035318406557824, "grad_norm": 0.6424432992935181, "learning_rate": 8e-05, "loss": 1.5923, "num_input_tokens_seen": 649042496, "step": 4600 }, { "epoch": 0.2809626475091991, "grad_norm": 0.5200062394142151, "learning_rate": 8e-05, "loss": 1.5062, "num_input_tokens_seen": 650458192, "step": 4610 }, { "epoch": 0.2815721109528199, "grad_norm": 0.6190493702888489, "learning_rate": 8e-05, "loss": 1.5041, "num_input_tokens_seen": 651891436, "step": 4620 }, { "epoch": 0.28218157439644076, "grad_norm": 0.5610544681549072, "learning_rate": 8e-05, "loss": 1.4763, "num_input_tokens_seen": 653287132, "step": 4630 }, { "epoch": 0.28279103784006154, "grad_norm": 0.6020642518997192, "learning_rate": 8e-05, "loss": 1.4961, "num_input_tokens_seen": 654715988, "step": 4640 }, { "epoch": 0.2834005012836824, "grad_norm": 0.5459628701210022, "learning_rate": 8e-05, "loss": 1.5022, "num_input_tokens_seen": 656116916, "step": 4650 }, { "epoch": 0.2840099647273032, "grad_norm": 0.53467857837677, "learning_rate": 8e-05, "loss": 1.5723, "num_input_tokens_seen": 657533764, "step": 4660 }, { "epoch": 0.284619428170924, "grad_norm": 0.5662572383880615, "learning_rate": 8e-05, "loss": 1.4852, "num_input_tokens_seen": 658933892, "step": 4670 }, { "epoch": 0.28522889161454484, "grad_norm": 0.6344622373580933, "learning_rate": 8e-05, "loss": 1.5427, "num_input_tokens_seen": 660343924, "step": 4680 }, { "epoch": 0.2858383550581657, "grad_norm": 0.6415901184082031, "learning_rate": 8e-05, "loss": 1.4782, "num_input_tokens_seen": 661722560, "step": 4690 }, { "epoch": 0.28644781850178647, "grad_norm": 0.6006565690040588, "learning_rate": 8e-05, "loss": 1.5184, "num_input_tokens_seen": 663126892, "step": 4700 }, { "epoch": 0.2870572819454073, "grad_norm": 0.5979477763175964, "learning_rate": 8e-05, "loss": 1.4963, "num_input_tokens_seen": 664549752, "step": 4710 }, { "epoch": 0.28766674538902814, "grad_norm": 0.560646653175354, "learning_rate": 8e-05, "loss": 1.4522, "num_input_tokens_seen": 665976108, "step": 4720 }, { "epoch": 0.288276208832649, "grad_norm": 0.4898284673690796, "learning_rate": 8e-05, "loss": 1.4917, "num_input_tokens_seen": 667423832, "step": 4730 }, { "epoch": 0.28888567227626977, "grad_norm": 0.5207868218421936, "learning_rate": 8e-05, "loss": 1.4298, "num_input_tokens_seen": 668840704, "step": 4740 }, { "epoch": 0.2894951357198906, "grad_norm": 0.5953548550605774, "learning_rate": 8e-05, "loss": 1.5316, "num_input_tokens_seen": 670225524, "step": 4750 }, { "epoch": 0.29010459916351145, "grad_norm": 0.6386962532997131, "learning_rate": 8e-05, "loss": 1.5264, "num_input_tokens_seen": 671644060, "step": 4760 }, { "epoch": 0.29071406260713223, "grad_norm": 0.581168532371521, "learning_rate": 8e-05, "loss": 1.4726, "num_input_tokens_seen": 673055808, "step": 4770 }, { "epoch": 0.29132352605075307, "grad_norm": 0.7128572463989258, "learning_rate": 8e-05, "loss": 1.4414, "num_input_tokens_seen": 674498056, "step": 4780 }, { "epoch": 0.2919329894943739, "grad_norm": 0.5856772065162659, "learning_rate": 8e-05, "loss": 1.507, "num_input_tokens_seen": 675874572, "step": 4790 }, { "epoch": 0.2925424529379947, "grad_norm": 0.6448741555213928, "learning_rate": 8e-05, "loss": 1.488, "num_input_tokens_seen": 677325828, "step": 4800 }, { "epoch": 0.29315191638161553, "grad_norm": 0.6047000885009766, "learning_rate": 8e-05, "loss": 1.4665, "num_input_tokens_seen": 678720324, "step": 4810 }, { "epoch": 0.29376137982523637, "grad_norm": 0.5227587819099426, "learning_rate": 8e-05, "loss": 1.5083, "num_input_tokens_seen": 680144452, "step": 4820 }, { "epoch": 0.2943708432688572, "grad_norm": 0.5058688521385193, "learning_rate": 8e-05, "loss": 1.5108, "num_input_tokens_seen": 681537236, "step": 4830 }, { "epoch": 0.294980306712478, "grad_norm": 0.6823422312736511, "learning_rate": 8e-05, "loss": 1.5768, "num_input_tokens_seen": 682917996, "step": 4840 }, { "epoch": 0.29558977015609883, "grad_norm": 0.6330751180648804, "learning_rate": 8e-05, "loss": 1.5083, "num_input_tokens_seen": 684353484, "step": 4850 }, { "epoch": 0.29619923359971967, "grad_norm": 0.6086271405220032, "learning_rate": 8e-05, "loss": 1.5022, "num_input_tokens_seen": 685747296, "step": 4860 }, { "epoch": 0.29680869704334045, "grad_norm": 0.5951144695281982, "learning_rate": 8e-05, "loss": 1.583, "num_input_tokens_seen": 687170156, "step": 4870 }, { "epoch": 0.2974181604869613, "grad_norm": 0.5558424592018127, "learning_rate": 8e-05, "loss": 1.4283, "num_input_tokens_seen": 688574796, "step": 4880 }, { "epoch": 0.29802762393058213, "grad_norm": 0.6256549954414368, "learning_rate": 8e-05, "loss": 1.6313, "num_input_tokens_seen": 689998376, "step": 4890 }, { "epoch": 0.2986370873742029, "grad_norm": 0.5750453472137451, "learning_rate": 8e-05, "loss": 1.4544, "num_input_tokens_seen": 691383556, "step": 4900 }, { "epoch": 0.29924655081782375, "grad_norm": 0.69186931848526, "learning_rate": 8e-05, "loss": 1.5238, "num_input_tokens_seen": 692797940, "step": 4910 }, { "epoch": 0.2998560142614446, "grad_norm": 0.5753540992736816, "learning_rate": 8e-05, "loss": 1.5077, "num_input_tokens_seen": 694209928, "step": 4920 }, { "epoch": 0.3004654777050654, "grad_norm": 0.6008949279785156, "learning_rate": 8e-05, "loss": 1.5407, "num_input_tokens_seen": 695637656, "step": 4930 }, { "epoch": 0.3010749411486862, "grad_norm": 0.5696120262145996, "learning_rate": 8e-05, "loss": 1.4826, "num_input_tokens_seen": 697072516, "step": 4940 }, { "epoch": 0.30168440459230705, "grad_norm": 0.5745052099227905, "learning_rate": 8e-05, "loss": 1.5266, "num_input_tokens_seen": 698505536, "step": 4950 }, { "epoch": 0.3022938680359279, "grad_norm": 0.5735147595405579, "learning_rate": 8e-05, "loss": 1.4663, "num_input_tokens_seen": 699890632, "step": 4960 }, { "epoch": 0.3029033314795487, "grad_norm": 0.5933482050895691, "learning_rate": 8e-05, "loss": 1.521, "num_input_tokens_seen": 701311308, "step": 4970 }, { "epoch": 0.3035127949231695, "grad_norm": 0.5017425417900085, "learning_rate": 8e-05, "loss": 1.4664, "num_input_tokens_seen": 702715200, "step": 4980 }, { "epoch": 0.30412225836679035, "grad_norm": 0.6305922865867615, "learning_rate": 8e-05, "loss": 1.5193, "num_input_tokens_seen": 704117168, "step": 4990 }, { "epoch": 0.30473172181041114, "grad_norm": 0.5459288954734802, "learning_rate": 8e-05, "loss": 1.4912, "num_input_tokens_seen": 705551000, "step": 5000 }, { "epoch": 0.305341185254032, "grad_norm": 0.5966220498085022, "learning_rate": 8e-05, "loss": 1.5046, "num_input_tokens_seen": 706973412, "step": 5010 }, { "epoch": 0.3059506486976528, "grad_norm": 0.5664955377578735, "learning_rate": 8e-05, "loss": 1.5611, "num_input_tokens_seen": 708316344, "step": 5020 }, { "epoch": 0.3065601121412736, "grad_norm": 0.5635570883750916, "learning_rate": 8e-05, "loss": 1.4977, "num_input_tokens_seen": 709764736, "step": 5030 }, { "epoch": 0.30716957558489444, "grad_norm": 0.555467963218689, "learning_rate": 8e-05, "loss": 1.5142, "num_input_tokens_seen": 711149252, "step": 5040 }, { "epoch": 0.3077790390285153, "grad_norm": 0.5750370025634766, "learning_rate": 8e-05, "loss": 1.4936, "num_input_tokens_seen": 712543440, "step": 5050 }, { "epoch": 0.3083885024721361, "grad_norm": 0.4970649778842926, "learning_rate": 8e-05, "loss": 1.4638, "num_input_tokens_seen": 713913020, "step": 5060 }, { "epoch": 0.3089979659157569, "grad_norm": 0.510879635810852, "learning_rate": 8e-05, "loss": 1.455, "num_input_tokens_seen": 715350768, "step": 5070 }, { "epoch": 0.30960742935937774, "grad_norm": 0.5770237445831299, "learning_rate": 8e-05, "loss": 1.5248, "num_input_tokens_seen": 716774184, "step": 5080 }, { "epoch": 0.3102168928029986, "grad_norm": 0.508195698261261, "learning_rate": 8e-05, "loss": 1.4908, "num_input_tokens_seen": 718209544, "step": 5090 }, { "epoch": 0.31082635624661936, "grad_norm": 0.501331627368927, "learning_rate": 8e-05, "loss": 1.4317, "num_input_tokens_seen": 719606848, "step": 5100 }, { "epoch": 0.3114358196902402, "grad_norm": 0.5919586420059204, "learning_rate": 8e-05, "loss": 1.4905, "num_input_tokens_seen": 721020264, "step": 5110 }, { "epoch": 0.31204528313386104, "grad_norm": 0.51059490442276, "learning_rate": 8e-05, "loss": 1.4769, "num_input_tokens_seen": 722433924, "step": 5120 }, { "epoch": 0.3126547465774818, "grad_norm": 0.5880390405654907, "learning_rate": 8e-05, "loss": 1.4865, "num_input_tokens_seen": 723841812, "step": 5130 }, { "epoch": 0.31326421002110266, "grad_norm": 0.5752800107002258, "learning_rate": 8e-05, "loss": 1.5344, "num_input_tokens_seen": 725237092, "step": 5140 }, { "epoch": 0.3138736734647235, "grad_norm": 0.5985324382781982, "learning_rate": 8e-05, "loss": 1.4905, "num_input_tokens_seen": 726645824, "step": 5150 }, { "epoch": 0.31448313690834434, "grad_norm": 0.5916955471038818, "learning_rate": 8e-05, "loss": 1.4696, "num_input_tokens_seen": 728082536, "step": 5160 }, { "epoch": 0.3150926003519651, "grad_norm": 0.5135784149169922, "learning_rate": 8e-05, "loss": 1.4112, "num_input_tokens_seen": 729471752, "step": 5170 }, { "epoch": 0.31570206379558596, "grad_norm": 0.597804069519043, "learning_rate": 8e-05, "loss": 1.4754, "num_input_tokens_seen": 730906264, "step": 5180 }, { "epoch": 0.3163115272392068, "grad_norm": 0.48136067390441895, "learning_rate": 8e-05, "loss": 1.5421, "num_input_tokens_seen": 732311764, "step": 5190 }, { "epoch": 0.3169209906828276, "grad_norm": 0.5294743180274963, "learning_rate": 8e-05, "loss": 1.42, "num_input_tokens_seen": 733705960, "step": 5200 }, { "epoch": 0.3175304541264484, "grad_norm": 0.49918803572654724, "learning_rate": 8e-05, "loss": 1.5201, "num_input_tokens_seen": 735136208, "step": 5210 }, { "epoch": 0.31813991757006926, "grad_norm": 0.6225274205207825, "learning_rate": 8e-05, "loss": 1.4671, "num_input_tokens_seen": 736548572, "step": 5220 }, { "epoch": 0.31874938101369005, "grad_norm": 0.6126905083656311, "learning_rate": 8e-05, "loss": 1.5022, "num_input_tokens_seen": 737948492, "step": 5230 }, { "epoch": 0.3193588444573109, "grad_norm": 0.6735352873802185, "learning_rate": 8e-05, "loss": 1.4824, "num_input_tokens_seen": 739360472, "step": 5240 }, { "epoch": 0.3199683079009317, "grad_norm": 0.5282402634620667, "learning_rate": 8e-05, "loss": 1.3916, "num_input_tokens_seen": 740751936, "step": 5250 }, { "epoch": 0.32057777134455256, "grad_norm": 0.5800075531005859, "learning_rate": 8e-05, "loss": 1.4639, "num_input_tokens_seen": 742134900, "step": 5260 }, { "epoch": 0.32118723478817335, "grad_norm": 0.7085527777671814, "learning_rate": 8e-05, "loss": 1.5392, "num_input_tokens_seen": 743507672, "step": 5270 }, { "epoch": 0.3217966982317942, "grad_norm": 0.5233438611030579, "learning_rate": 8e-05, "loss": 1.4817, "num_input_tokens_seen": 744911012, "step": 5280 }, { "epoch": 0.322406161675415, "grad_norm": 0.5503348112106323, "learning_rate": 8e-05, "loss": 1.4695, "num_input_tokens_seen": 746348456, "step": 5290 }, { "epoch": 0.3230156251190358, "grad_norm": 0.5066059827804565, "learning_rate": 8e-05, "loss": 1.5024, "num_input_tokens_seen": 747760896, "step": 5300 }, { "epoch": 0.32362508856265665, "grad_norm": 0.6413135528564453, "learning_rate": 8e-05, "loss": 1.4737, "num_input_tokens_seen": 749182700, "step": 5310 }, { "epoch": 0.3242345520062775, "grad_norm": 0.6394685506820679, "learning_rate": 8e-05, "loss": 1.497, "num_input_tokens_seen": 750569816, "step": 5320 }, { "epoch": 0.32484401544989827, "grad_norm": 0.6730980277061462, "learning_rate": 8e-05, "loss": 1.5137, "num_input_tokens_seen": 752049528, "step": 5330 }, { "epoch": 0.3254534788935191, "grad_norm": 0.5059208273887634, "learning_rate": 8e-05, "loss": 1.4047, "num_input_tokens_seen": 753469828, "step": 5340 }, { "epoch": 0.32606294233713995, "grad_norm": 0.6344226002693176, "learning_rate": 8e-05, "loss": 1.4388, "num_input_tokens_seen": 754893840, "step": 5350 }, { "epoch": 0.3266724057807608, "grad_norm": 0.5653851628303528, "learning_rate": 8e-05, "loss": 1.5341, "num_input_tokens_seen": 756345148, "step": 5360 }, { "epoch": 0.32728186922438157, "grad_norm": 0.5219058394432068, "learning_rate": 8e-05, "loss": 1.4945, "num_input_tokens_seen": 757782128, "step": 5370 }, { "epoch": 0.3278913326680024, "grad_norm": 0.6265976428985596, "learning_rate": 8e-05, "loss": 1.5015, "num_input_tokens_seen": 759205500, "step": 5380 }, { "epoch": 0.32850079611162325, "grad_norm": 0.5684333443641663, "learning_rate": 8e-05, "loss": 1.4409, "num_input_tokens_seen": 760595412, "step": 5390 }, { "epoch": 0.32911025955524403, "grad_norm": 0.5176586508750916, "learning_rate": 8e-05, "loss": 1.4624, "num_input_tokens_seen": 761988752, "step": 5400 }, { "epoch": 0.3297197229988649, "grad_norm": 0.6107752323150635, "learning_rate": 8e-05, "loss": 1.4213, "num_input_tokens_seen": 763379640, "step": 5410 }, { "epoch": 0.3303291864424857, "grad_norm": 0.7825304865837097, "learning_rate": 8e-05, "loss": 1.5461, "num_input_tokens_seen": 764777136, "step": 5420 }, { "epoch": 0.3309386498861065, "grad_norm": 0.5260487794876099, "learning_rate": 8e-05, "loss": 1.4863, "num_input_tokens_seen": 766185628, "step": 5430 }, { "epoch": 0.33154811332972733, "grad_norm": 0.5938482284545898, "learning_rate": 8e-05, "loss": 1.5117, "num_input_tokens_seen": 767555088, "step": 5440 }, { "epoch": 0.3321575767733482, "grad_norm": 0.5161702632904053, "learning_rate": 8e-05, "loss": 1.5179, "num_input_tokens_seen": 768994976, "step": 5450 }, { "epoch": 0.332767040216969, "grad_norm": 0.5811363458633423, "learning_rate": 8e-05, "loss": 1.496, "num_input_tokens_seen": 770367576, "step": 5460 }, { "epoch": 0.3333765036605898, "grad_norm": 0.5560015439987183, "learning_rate": 8e-05, "loss": 1.5018, "num_input_tokens_seen": 771776868, "step": 5470 }, { "epoch": 0.33398596710421063, "grad_norm": 0.5517913103103638, "learning_rate": 8e-05, "loss": 1.5197, "num_input_tokens_seen": 773199948, "step": 5480 }, { "epoch": 0.3345954305478315, "grad_norm": 0.5763364434242249, "learning_rate": 8e-05, "loss": 1.4531, "num_input_tokens_seen": 774635788, "step": 5490 }, { "epoch": 0.33520489399145226, "grad_norm": 0.5679718852043152, "learning_rate": 8e-05, "loss": 1.5311, "num_input_tokens_seen": 776038608, "step": 5500 }, { "epoch": 0.3358143574350731, "grad_norm": 0.6076738238334656, "learning_rate": 8e-05, "loss": 1.5176, "num_input_tokens_seen": 777447108, "step": 5510 }, { "epoch": 0.33642382087869394, "grad_norm": 0.5834559798240662, "learning_rate": 8e-05, "loss": 1.5009, "num_input_tokens_seen": 778861076, "step": 5520 }, { "epoch": 0.3370332843223147, "grad_norm": 0.5051541328430176, "learning_rate": 8e-05, "loss": 1.4895, "num_input_tokens_seen": 780263956, "step": 5530 }, { "epoch": 0.33764274776593556, "grad_norm": 0.5240879058837891, "learning_rate": 8e-05, "loss": 1.5555, "num_input_tokens_seen": 781682280, "step": 5540 }, { "epoch": 0.3382522112095564, "grad_norm": 0.4911370873451233, "learning_rate": 8e-05, "loss": 1.4297, "num_input_tokens_seen": 783125984, "step": 5550 }, { "epoch": 0.33886167465317724, "grad_norm": 0.5864706635475159, "learning_rate": 8e-05, "loss": 1.5062, "num_input_tokens_seen": 784568312, "step": 5560 }, { "epoch": 0.339471138096798, "grad_norm": 0.5688824653625488, "learning_rate": 8e-05, "loss": 1.4459, "num_input_tokens_seen": 785985256, "step": 5570 }, { "epoch": 0.34008060154041886, "grad_norm": 0.521834671497345, "learning_rate": 8e-05, "loss": 1.4781, "num_input_tokens_seen": 787388484, "step": 5580 }, { "epoch": 0.3406900649840397, "grad_norm": 0.5668113827705383, "learning_rate": 8e-05, "loss": 1.4792, "num_input_tokens_seen": 788815088, "step": 5590 }, { "epoch": 0.3412995284276605, "grad_norm": 0.5735951662063599, "learning_rate": 8e-05, "loss": 1.482, "num_input_tokens_seen": 790242832, "step": 5600 }, { "epoch": 0.3419089918712813, "grad_norm": 0.5737040042877197, "learning_rate": 8e-05, "loss": 1.4741, "num_input_tokens_seen": 791648756, "step": 5610 }, { "epoch": 0.34251845531490216, "grad_norm": 0.5083377361297607, "learning_rate": 8e-05, "loss": 1.4006, "num_input_tokens_seen": 793038600, "step": 5620 }, { "epoch": 0.34312791875852294, "grad_norm": 0.48619720339775085, "learning_rate": 8e-05, "loss": 1.4241, "num_input_tokens_seen": 794466268, "step": 5630 }, { "epoch": 0.3437373822021438, "grad_norm": 0.5126612782478333, "learning_rate": 8e-05, "loss": 1.472, "num_input_tokens_seen": 795840520, "step": 5640 }, { "epoch": 0.3443468456457646, "grad_norm": 0.553537905216217, "learning_rate": 8e-05, "loss": 1.4802, "num_input_tokens_seen": 797249844, "step": 5650 }, { "epoch": 0.34495630908938546, "grad_norm": 0.576506495475769, "learning_rate": 8e-05, "loss": 1.4802, "num_input_tokens_seen": 798662736, "step": 5660 }, { "epoch": 0.34556577253300624, "grad_norm": 0.5432490706443787, "learning_rate": 8e-05, "loss": 1.5314, "num_input_tokens_seen": 800085376, "step": 5670 }, { "epoch": 0.3461752359766271, "grad_norm": 0.5892384648323059, "learning_rate": 8e-05, "loss": 1.4365, "num_input_tokens_seen": 801519988, "step": 5680 }, { "epoch": 0.3467846994202479, "grad_norm": 0.550477147102356, "learning_rate": 8e-05, "loss": 1.4772, "num_input_tokens_seen": 802930932, "step": 5690 }, { "epoch": 0.3473941628638687, "grad_norm": 0.46749433875083923, "learning_rate": 8e-05, "loss": 1.4563, "num_input_tokens_seen": 804355416, "step": 5700 }, { "epoch": 0.34800362630748954, "grad_norm": 0.6174665689468384, "learning_rate": 8e-05, "loss": 1.4796, "num_input_tokens_seen": 805809472, "step": 5710 }, { "epoch": 0.3486130897511104, "grad_norm": 0.5981230139732361, "learning_rate": 8e-05, "loss": 1.4873, "num_input_tokens_seen": 807216568, "step": 5720 }, { "epoch": 0.34922255319473117, "grad_norm": 0.8428714275360107, "learning_rate": 8e-05, "loss": 1.5775, "num_input_tokens_seen": 808599216, "step": 5730 }, { "epoch": 0.349832016638352, "grad_norm": 0.5814129710197449, "learning_rate": 8e-05, "loss": 1.4566, "num_input_tokens_seen": 810003948, "step": 5740 }, { "epoch": 0.35044148008197284, "grad_norm": 0.6042664647102356, "learning_rate": 8e-05, "loss": 1.5147, "num_input_tokens_seen": 811385708, "step": 5750 }, { "epoch": 0.35105094352559363, "grad_norm": 0.5238978862762451, "learning_rate": 8e-05, "loss": 1.477, "num_input_tokens_seen": 812784196, "step": 5760 }, { "epoch": 0.35166040696921447, "grad_norm": 0.5527967810630798, "learning_rate": 8e-05, "loss": 1.5258, "num_input_tokens_seen": 814197040, "step": 5770 }, { "epoch": 0.3522698704128353, "grad_norm": 0.539685070514679, "learning_rate": 8e-05, "loss": 1.505, "num_input_tokens_seen": 815610940, "step": 5780 }, { "epoch": 0.35287933385645615, "grad_norm": 0.5669323801994324, "learning_rate": 8e-05, "loss": 1.4015, "num_input_tokens_seen": 817022692, "step": 5790 }, { "epoch": 0.35348879730007693, "grad_norm": 0.6599160432815552, "learning_rate": 8e-05, "loss": 1.456, "num_input_tokens_seen": 818400184, "step": 5800 }, { "epoch": 0.35409826074369777, "grad_norm": 0.6508094668388367, "learning_rate": 8e-05, "loss": 1.432, "num_input_tokens_seen": 819819924, "step": 5810 }, { "epoch": 0.3547077241873186, "grad_norm": 0.45001381635665894, "learning_rate": 8e-05, "loss": 1.4672, "num_input_tokens_seen": 821219128, "step": 5820 }, { "epoch": 0.3553171876309394, "grad_norm": 0.5801172256469727, "learning_rate": 8e-05, "loss": 1.4242, "num_input_tokens_seen": 822620580, "step": 5830 }, { "epoch": 0.35592665107456023, "grad_norm": 0.588651716709137, "learning_rate": 8e-05, "loss": 1.5112, "num_input_tokens_seen": 824046412, "step": 5840 }, { "epoch": 0.35653611451818107, "grad_norm": 0.5115150213241577, "learning_rate": 8e-05, "loss": 1.4171, "num_input_tokens_seen": 825453892, "step": 5850 }, { "epoch": 0.35714557796180185, "grad_norm": 0.6320896744728088, "learning_rate": 8e-05, "loss": 1.5575, "num_input_tokens_seen": 826866976, "step": 5860 }, { "epoch": 0.3577550414054227, "grad_norm": 0.5521752834320068, "learning_rate": 8e-05, "loss": 1.4981, "num_input_tokens_seen": 828276444, "step": 5870 }, { "epoch": 0.35836450484904353, "grad_norm": 0.4956464469432831, "learning_rate": 8e-05, "loss": 1.4582, "num_input_tokens_seen": 829717872, "step": 5880 }, { "epoch": 0.35897396829266437, "grad_norm": 0.6160319447517395, "learning_rate": 8e-05, "loss": 1.4735, "num_input_tokens_seen": 831124236, "step": 5890 }, { "epoch": 0.35958343173628515, "grad_norm": 0.5546999573707581, "learning_rate": 8e-05, "loss": 1.5093, "num_input_tokens_seen": 832544524, "step": 5900 }, { "epoch": 0.360192895179906, "grad_norm": 0.5691624879837036, "learning_rate": 8e-05, "loss": 1.4528, "num_input_tokens_seen": 833976044, "step": 5910 }, { "epoch": 0.36080235862352683, "grad_norm": 0.5685278177261353, "learning_rate": 8e-05, "loss": 1.4905, "num_input_tokens_seen": 835420540, "step": 5920 }, { "epoch": 0.3614118220671476, "grad_norm": 0.5637968182563782, "learning_rate": 8e-05, "loss": 1.438, "num_input_tokens_seen": 836844744, "step": 5930 }, { "epoch": 0.36202128551076845, "grad_norm": 0.5134168267250061, "learning_rate": 8e-05, "loss": 1.4268, "num_input_tokens_seen": 838204660, "step": 5940 }, { "epoch": 0.3626307489543893, "grad_norm": 0.6179930567741394, "learning_rate": 8e-05, "loss": 1.4223, "num_input_tokens_seen": 839639804, "step": 5950 }, { "epoch": 0.3632402123980101, "grad_norm": 0.6156236529350281, "learning_rate": 8e-05, "loss": 1.4914, "num_input_tokens_seen": 841052816, "step": 5960 }, { "epoch": 0.3638496758416309, "grad_norm": 0.5875065922737122, "learning_rate": 8e-05, "loss": 1.4722, "num_input_tokens_seen": 842451068, "step": 5970 }, { "epoch": 0.36445913928525175, "grad_norm": 0.6806820631027222, "learning_rate": 8e-05, "loss": 1.4535, "num_input_tokens_seen": 843881212, "step": 5980 }, { "epoch": 0.3650686027288726, "grad_norm": 0.5545870065689087, "learning_rate": 8e-05, "loss": 1.4698, "num_input_tokens_seen": 845263884, "step": 5990 }, { "epoch": 0.3656780661724934, "grad_norm": 0.4307273328304291, "learning_rate": 8e-05, "loss": 1.5082, "num_input_tokens_seen": 846667064, "step": 6000 }, { "epoch": 0.3662875296161142, "grad_norm": 0.58298659324646, "learning_rate": 8e-05, "loss": 1.4679, "num_input_tokens_seen": 848068848, "step": 6010 }, { "epoch": 0.36689699305973505, "grad_norm": 0.45190927386283875, "learning_rate": 8e-05, "loss": 1.4729, "num_input_tokens_seen": 849463044, "step": 6020 }, { "epoch": 0.36750645650335584, "grad_norm": 0.5742015838623047, "learning_rate": 8e-05, "loss": 1.4787, "num_input_tokens_seen": 850877448, "step": 6030 }, { "epoch": 0.3681159199469767, "grad_norm": 0.5748038291931152, "learning_rate": 8e-05, "loss": 1.4364, "num_input_tokens_seen": 852315284, "step": 6040 }, { "epoch": 0.3687253833905975, "grad_norm": 0.5301962494850159, "learning_rate": 8e-05, "loss": 1.4812, "num_input_tokens_seen": 853729908, "step": 6050 }, { "epoch": 0.3693348468342183, "grad_norm": 0.589756965637207, "learning_rate": 8e-05, "loss": 1.4993, "num_input_tokens_seen": 855128224, "step": 6060 }, { "epoch": 0.36994431027783914, "grad_norm": 0.5072458982467651, "learning_rate": 8e-05, "loss": 1.5019, "num_input_tokens_seen": 856560260, "step": 6070 }, { "epoch": 0.37055377372146, "grad_norm": 0.532714307308197, "learning_rate": 8e-05, "loss": 1.4981, "num_input_tokens_seen": 857947816, "step": 6080 }, { "epoch": 0.3711632371650808, "grad_norm": 0.5182844400405884, "learning_rate": 8e-05, "loss": 1.4869, "num_input_tokens_seen": 859354540, "step": 6090 }, { "epoch": 0.3717727006087016, "grad_norm": 0.5242214798927307, "learning_rate": 8e-05, "loss": 1.498, "num_input_tokens_seen": 860784760, "step": 6100 }, { "epoch": 0.37238216405232244, "grad_norm": 0.5378220677375793, "learning_rate": 8e-05, "loss": 1.4231, "num_input_tokens_seen": 862165624, "step": 6110 }, { "epoch": 0.3729916274959433, "grad_norm": 0.5064565539360046, "learning_rate": 8e-05, "loss": 1.4333, "num_input_tokens_seen": 863576044, "step": 6120 }, { "epoch": 0.37360109093956406, "grad_norm": 0.6327691078186035, "learning_rate": 8e-05, "loss": 1.4521, "num_input_tokens_seen": 864950492, "step": 6130 }, { "epoch": 0.3742105543831849, "grad_norm": 0.6391866207122803, "learning_rate": 8e-05, "loss": 1.4526, "num_input_tokens_seen": 866373744, "step": 6140 }, { "epoch": 0.37482001782680574, "grad_norm": 0.5664558410644531, "learning_rate": 8e-05, "loss": 1.5089, "num_input_tokens_seen": 867827248, "step": 6150 }, { "epoch": 0.3754294812704265, "grad_norm": 0.6216340065002441, "learning_rate": 8e-05, "loss": 1.4829, "num_input_tokens_seen": 869255428, "step": 6160 }, { "epoch": 0.37603894471404736, "grad_norm": 0.612844705581665, "learning_rate": 8e-05, "loss": 1.4621, "num_input_tokens_seen": 870649988, "step": 6170 }, { "epoch": 0.3766484081576682, "grad_norm": 0.5435082912445068, "learning_rate": 8e-05, "loss": 1.3834, "num_input_tokens_seen": 872079800, "step": 6180 }, { "epoch": 0.37725787160128904, "grad_norm": 0.6071653366088867, "learning_rate": 8e-05, "loss": 1.4301, "num_input_tokens_seen": 873466904, "step": 6190 }, { "epoch": 0.3778673350449098, "grad_norm": 0.5115202069282532, "learning_rate": 8e-05, "loss": 1.4156, "num_input_tokens_seen": 874891528, "step": 6200 }, { "epoch": 0.37847679848853066, "grad_norm": 0.5494471788406372, "learning_rate": 8e-05, "loss": 1.4658, "num_input_tokens_seen": 876284912, "step": 6210 }, { "epoch": 0.3790862619321515, "grad_norm": 0.4871610701084137, "learning_rate": 8e-05, "loss": 1.4497, "num_input_tokens_seen": 877703108, "step": 6220 }, { "epoch": 0.3796957253757723, "grad_norm": 0.6249947547912598, "learning_rate": 8e-05, "loss": 1.4456, "num_input_tokens_seen": 879123340, "step": 6230 }, { "epoch": 0.3803051888193931, "grad_norm": 0.5746230483055115, "learning_rate": 8e-05, "loss": 1.5171, "num_input_tokens_seen": 880553048, "step": 6240 }, { "epoch": 0.38091465226301396, "grad_norm": 0.5843877792358398, "learning_rate": 8e-05, "loss": 1.5046, "num_input_tokens_seen": 881984776, "step": 6250 }, { "epoch": 0.38152411570663475, "grad_norm": 0.5240760445594788, "learning_rate": 8e-05, "loss": 1.4689, "num_input_tokens_seen": 883371668, "step": 6260 }, { "epoch": 0.3821335791502556, "grad_norm": 0.5815708041191101, "learning_rate": 8e-05, "loss": 1.5634, "num_input_tokens_seen": 884792444, "step": 6270 }, { "epoch": 0.3827430425938764, "grad_norm": 0.6558341383934021, "learning_rate": 8e-05, "loss": 1.4519, "num_input_tokens_seen": 886181612, "step": 6280 }, { "epoch": 0.38335250603749726, "grad_norm": 0.5229777097702026, "learning_rate": 8e-05, "loss": 1.4538, "num_input_tokens_seen": 887601452, "step": 6290 }, { "epoch": 0.38396196948111805, "grad_norm": 0.5209792852401733, "learning_rate": 8e-05, "loss": 1.4415, "num_input_tokens_seen": 889034508, "step": 6300 }, { "epoch": 0.3845714329247389, "grad_norm": 0.5684558749198914, "learning_rate": 8e-05, "loss": 1.4671, "num_input_tokens_seen": 890425468, "step": 6310 }, { "epoch": 0.3851808963683597, "grad_norm": 0.578091025352478, "learning_rate": 8e-05, "loss": 1.4735, "num_input_tokens_seen": 891826040, "step": 6320 }, { "epoch": 0.3857903598119805, "grad_norm": 0.5623794198036194, "learning_rate": 8e-05, "loss": 1.4926, "num_input_tokens_seen": 893262592, "step": 6330 }, { "epoch": 0.38639982325560135, "grad_norm": 0.5411602854728699, "learning_rate": 8e-05, "loss": 1.4318, "num_input_tokens_seen": 894682080, "step": 6340 }, { "epoch": 0.3870092866992222, "grad_norm": 0.5540904998779297, "learning_rate": 8e-05, "loss": 1.4068, "num_input_tokens_seen": 896099164, "step": 6350 }, { "epoch": 0.38761875014284297, "grad_norm": 0.5864216685295105, "learning_rate": 8e-05, "loss": 1.399, "num_input_tokens_seen": 897476228, "step": 6360 }, { "epoch": 0.3882282135864638, "grad_norm": 0.6339991092681885, "learning_rate": 8e-05, "loss": 1.467, "num_input_tokens_seen": 898854080, "step": 6370 }, { "epoch": 0.38883767703008465, "grad_norm": 0.555851399898529, "learning_rate": 8e-05, "loss": 1.4563, "num_input_tokens_seen": 900256152, "step": 6380 }, { "epoch": 0.3894471404737055, "grad_norm": 0.5713673233985901, "learning_rate": 8e-05, "loss": 1.5061, "num_input_tokens_seen": 901627728, "step": 6390 }, { "epoch": 0.39005660391732627, "grad_norm": 0.5585047006607056, "learning_rate": 8e-05, "loss": 1.4458, "num_input_tokens_seen": 902992920, "step": 6400 }, { "epoch": 0.3906660673609471, "grad_norm": 0.4973145127296448, "learning_rate": 8e-05, "loss": 1.4566, "num_input_tokens_seen": 904318848, "step": 6410 }, { "epoch": 0.39127553080456795, "grad_norm": 0.5424864888191223, "learning_rate": 8e-05, "loss": 1.4537, "num_input_tokens_seen": 905700004, "step": 6420 }, { "epoch": 0.39188499424818873, "grad_norm": 0.5117954611778259, "learning_rate": 8e-05, "loss": 1.4243, "num_input_tokens_seen": 907103420, "step": 6430 }, { "epoch": 0.3924944576918096, "grad_norm": 0.5525716543197632, "learning_rate": 8e-05, "loss": 1.4567, "num_input_tokens_seen": 908554396, "step": 6440 }, { "epoch": 0.3931039211354304, "grad_norm": 0.5715943574905396, "learning_rate": 8e-05, "loss": 1.44, "num_input_tokens_seen": 909956772, "step": 6450 }, { "epoch": 0.3937133845790512, "grad_norm": 0.5808925628662109, "learning_rate": 8e-05, "loss": 1.4815, "num_input_tokens_seen": 911395988, "step": 6460 }, { "epoch": 0.39432284802267203, "grad_norm": 0.6523997187614441, "learning_rate": 8e-05, "loss": 1.4218, "num_input_tokens_seen": 912784372, "step": 6470 }, { "epoch": 0.3949323114662929, "grad_norm": 0.6092135310173035, "learning_rate": 8e-05, "loss": 1.4576, "num_input_tokens_seen": 914207364, "step": 6480 }, { "epoch": 0.39554177490991366, "grad_norm": 0.5254921317100525, "learning_rate": 8e-05, "loss": 1.4644, "num_input_tokens_seen": 915636420, "step": 6490 }, { "epoch": 0.3961512383535345, "grad_norm": 0.6733999252319336, "learning_rate": 8e-05, "loss": 1.4234, "num_input_tokens_seen": 917065908, "step": 6500 }, { "epoch": 0.39676070179715534, "grad_norm": 0.5586018562316895, "learning_rate": 8e-05, "loss": 1.5169, "num_input_tokens_seen": 918451632, "step": 6510 }, { "epoch": 0.3973701652407762, "grad_norm": 0.5588156580924988, "learning_rate": 8e-05, "loss": 1.3798, "num_input_tokens_seen": 919829072, "step": 6520 }, { "epoch": 0.39797962868439696, "grad_norm": 0.5383242964744568, "learning_rate": 8e-05, "loss": 1.468, "num_input_tokens_seen": 921242500, "step": 6530 }, { "epoch": 0.3985890921280178, "grad_norm": 0.516828715801239, "learning_rate": 8e-05, "loss": 1.4953, "num_input_tokens_seen": 922668428, "step": 6540 }, { "epoch": 0.39919855557163864, "grad_norm": 0.5715090036392212, "learning_rate": 8e-05, "loss": 1.4606, "num_input_tokens_seen": 924115712, "step": 6550 }, { "epoch": 0.3998080190152594, "grad_norm": 0.5680440664291382, "learning_rate": 8e-05, "loss": 1.498, "num_input_tokens_seen": 925525280, "step": 6560 }, { "epoch": 0.40041748245888026, "grad_norm": 0.5580743551254272, "learning_rate": 8e-05, "loss": 1.4742, "num_input_tokens_seen": 926948888, "step": 6570 }, { "epoch": 0.4010269459025011, "grad_norm": 0.5276727676391602, "learning_rate": 8e-05, "loss": 1.4808, "num_input_tokens_seen": 928345036, "step": 6580 }, { "epoch": 0.4016364093461219, "grad_norm": 0.5557569265365601, "learning_rate": 8e-05, "loss": 1.5104, "num_input_tokens_seen": 929788420, "step": 6590 }, { "epoch": 0.4022458727897427, "grad_norm": 0.5946210026741028, "learning_rate": 8e-05, "loss": 1.4409, "num_input_tokens_seen": 931238968, "step": 6600 }, { "epoch": 0.40285533623336356, "grad_norm": 0.5625696778297424, "learning_rate": 8e-05, "loss": 1.4899, "num_input_tokens_seen": 932685800, "step": 6610 }, { "epoch": 0.4034647996769844, "grad_norm": 0.5148335695266724, "learning_rate": 8e-05, "loss": 1.4781, "num_input_tokens_seen": 934066584, "step": 6620 }, { "epoch": 0.4040742631206052, "grad_norm": 0.5163837671279907, "learning_rate": 8e-05, "loss": 1.3748, "num_input_tokens_seen": 935478628, "step": 6630 }, { "epoch": 0.404683726564226, "grad_norm": 0.5167310237884521, "learning_rate": 8e-05, "loss": 1.4414, "num_input_tokens_seen": 936901648, "step": 6640 }, { "epoch": 0.40529319000784686, "grad_norm": 0.5219048261642456, "learning_rate": 8e-05, "loss": 1.4501, "num_input_tokens_seen": 938297212, "step": 6650 }, { "epoch": 0.40590265345146764, "grad_norm": 0.4949333369731903, "learning_rate": 8e-05, "loss": 1.4314, "num_input_tokens_seen": 939716088, "step": 6660 }, { "epoch": 0.4065121168950885, "grad_norm": 0.6273387670516968, "learning_rate": 8e-05, "loss": 1.3889, "num_input_tokens_seen": 941144088, "step": 6670 }, { "epoch": 0.4071215803387093, "grad_norm": 0.7101430296897888, "learning_rate": 8e-05, "loss": 1.488, "num_input_tokens_seen": 942563156, "step": 6680 }, { "epoch": 0.4077310437823301, "grad_norm": 0.5433675646781921, "learning_rate": 8e-05, "loss": 1.419, "num_input_tokens_seen": 943957008, "step": 6690 }, { "epoch": 0.40834050722595094, "grad_norm": 0.5392646789550781, "learning_rate": 8e-05, "loss": 1.4103, "num_input_tokens_seen": 945372644, "step": 6700 }, { "epoch": 0.4089499706695718, "grad_norm": 0.537996768951416, "learning_rate": 8e-05, "loss": 1.4489, "num_input_tokens_seen": 946820652, "step": 6710 }, { "epoch": 0.4095594341131926, "grad_norm": 0.625306248664856, "learning_rate": 8e-05, "loss": 1.4446, "num_input_tokens_seen": 948202592, "step": 6720 }, { "epoch": 0.4101688975568134, "grad_norm": 0.5231274962425232, "learning_rate": 8e-05, "loss": 1.4562, "num_input_tokens_seen": 949622864, "step": 6730 }, { "epoch": 0.41077836100043424, "grad_norm": 0.5137037634849548, "learning_rate": 8e-05, "loss": 1.518, "num_input_tokens_seen": 951016388, "step": 6740 }, { "epoch": 0.4113878244440551, "grad_norm": 0.5600166320800781, "learning_rate": 8e-05, "loss": 1.3742, "num_input_tokens_seen": 952467456, "step": 6750 }, { "epoch": 0.41199728788767587, "grad_norm": 0.5758654475212097, "learning_rate": 8e-05, "loss": 1.4311, "num_input_tokens_seen": 953914724, "step": 6760 }, { "epoch": 0.4126067513312967, "grad_norm": 0.5547349452972412, "learning_rate": 8e-05, "loss": 1.4261, "num_input_tokens_seen": 955295720, "step": 6770 }, { "epoch": 0.41321621477491755, "grad_norm": 0.5143230557441711, "learning_rate": 8e-05, "loss": 1.4667, "num_input_tokens_seen": 956681368, "step": 6780 }, { "epoch": 0.41382567821853833, "grad_norm": 0.6212142109870911, "learning_rate": 8e-05, "loss": 1.5469, "num_input_tokens_seen": 958080316, "step": 6790 }, { "epoch": 0.41443514166215917, "grad_norm": 0.6437855362892151, "learning_rate": 8e-05, "loss": 1.4665, "num_input_tokens_seen": 959483132, "step": 6800 }, { "epoch": 0.41504460510578, "grad_norm": 0.6089012622833252, "learning_rate": 8e-05, "loss": 1.4368, "num_input_tokens_seen": 960872924, "step": 6810 }, { "epoch": 0.41565406854940085, "grad_norm": 0.5278199315071106, "learning_rate": 8e-05, "loss": 1.4487, "num_input_tokens_seen": 962282748, "step": 6820 }, { "epoch": 0.41626353199302163, "grad_norm": 0.5420494079589844, "learning_rate": 8e-05, "loss": 1.4431, "num_input_tokens_seen": 963724892, "step": 6830 }, { "epoch": 0.41687299543664247, "grad_norm": 0.5393320322036743, "learning_rate": 8e-05, "loss": 1.462, "num_input_tokens_seen": 965116844, "step": 6840 }, { "epoch": 0.4174824588802633, "grad_norm": 0.4785369634628296, "learning_rate": 8e-05, "loss": 1.4703, "num_input_tokens_seen": 966494980, "step": 6850 }, { "epoch": 0.4180919223238841, "grad_norm": 0.5344218611717224, "learning_rate": 8e-05, "loss": 1.4295, "num_input_tokens_seen": 967908316, "step": 6860 }, { "epoch": 0.41870138576750493, "grad_norm": 0.5722907781600952, "learning_rate": 8e-05, "loss": 1.5415, "num_input_tokens_seen": 969300604, "step": 6870 }, { "epoch": 0.41931084921112577, "grad_norm": 0.6001546382904053, "learning_rate": 8e-05, "loss": 1.4469, "num_input_tokens_seen": 970686572, "step": 6880 }, { "epoch": 0.41992031265474655, "grad_norm": 0.48998522758483887, "learning_rate": 8e-05, "loss": 1.497, "num_input_tokens_seen": 972128596, "step": 6890 }, { "epoch": 0.4205297760983674, "grad_norm": 0.5235675573348999, "learning_rate": 8e-05, "loss": 1.4051, "num_input_tokens_seen": 973505548, "step": 6900 }, { "epoch": 0.42113923954198823, "grad_norm": 0.45352238416671753, "learning_rate": 8e-05, "loss": 1.5063, "num_input_tokens_seen": 974916112, "step": 6910 }, { "epoch": 0.42174870298560907, "grad_norm": 0.6097639203071594, "learning_rate": 8e-05, "loss": 1.4945, "num_input_tokens_seen": 976344436, "step": 6920 }, { "epoch": 0.42235816642922985, "grad_norm": 0.6069915294647217, "learning_rate": 8e-05, "loss": 1.4796, "num_input_tokens_seen": 977726860, "step": 6930 }, { "epoch": 0.4229676298728507, "grad_norm": 0.555101752281189, "learning_rate": 8e-05, "loss": 1.465, "num_input_tokens_seen": 979148684, "step": 6940 }, { "epoch": 0.42357709331647153, "grad_norm": 0.5358740091323853, "learning_rate": 8e-05, "loss": 1.4698, "num_input_tokens_seen": 980570088, "step": 6950 }, { "epoch": 0.4241865567600923, "grad_norm": 0.5622055530548096, "learning_rate": 8e-05, "loss": 1.4742, "num_input_tokens_seen": 982007352, "step": 6960 }, { "epoch": 0.42479602020371315, "grad_norm": 0.548179566860199, "learning_rate": 8e-05, "loss": 1.3727, "num_input_tokens_seen": 983386076, "step": 6970 }, { "epoch": 0.425405483647334, "grad_norm": 0.49980705976486206, "learning_rate": 8e-05, "loss": 1.4423, "num_input_tokens_seen": 984801120, "step": 6980 }, { "epoch": 0.4260149470909548, "grad_norm": 0.5310669541358948, "learning_rate": 8e-05, "loss": 1.4295, "num_input_tokens_seen": 986186972, "step": 6990 }, { "epoch": 0.4266244105345756, "grad_norm": 0.5813121199607849, "learning_rate": 8e-05, "loss": 1.4334, "num_input_tokens_seen": 987570284, "step": 7000 }, { "epoch": 0.42723387397819645, "grad_norm": 0.5761700868606567, "learning_rate": 8e-05, "loss": 1.444, "num_input_tokens_seen": 988955796, "step": 7010 }, { "epoch": 0.4278433374218173, "grad_norm": 0.5148670673370361, "learning_rate": 8e-05, "loss": 1.4178, "num_input_tokens_seen": 990350148, "step": 7020 }, { "epoch": 0.4284528008654381, "grad_norm": 0.5637139678001404, "learning_rate": 8e-05, "loss": 1.452, "num_input_tokens_seen": 991771360, "step": 7030 }, { "epoch": 0.4290622643090589, "grad_norm": 0.5411230325698853, "learning_rate": 8e-05, "loss": 1.4548, "num_input_tokens_seen": 993232832, "step": 7040 }, { "epoch": 0.42967172775267976, "grad_norm": 0.5264154076576233, "learning_rate": 8e-05, "loss": 1.4627, "num_input_tokens_seen": 994638464, "step": 7050 }, { "epoch": 0.43028119119630054, "grad_norm": 0.6401540637016296, "learning_rate": 8e-05, "loss": 1.4346, "num_input_tokens_seen": 996023220, "step": 7060 }, { "epoch": 0.4308906546399214, "grad_norm": 0.5791023373603821, "learning_rate": 8e-05, "loss": 1.419, "num_input_tokens_seen": 997433208, "step": 7070 }, { "epoch": 0.4315001180835422, "grad_norm": 0.5640988945960999, "learning_rate": 8e-05, "loss": 1.4145, "num_input_tokens_seen": 998853492, "step": 7080 }, { "epoch": 0.432109581527163, "grad_norm": 0.6895711421966553, "learning_rate": 8e-05, "loss": 1.5127, "num_input_tokens_seen": 1000269424, "step": 7090 }, { "epoch": 0.43271904497078384, "grad_norm": 0.5173920392990112, "learning_rate": 8e-05, "loss": 1.4035, "num_input_tokens_seen": 1001659736, "step": 7100 }, { "epoch": 0.4333285084144047, "grad_norm": 0.518268346786499, "learning_rate": 8e-05, "loss": 1.4023, "num_input_tokens_seen": 1003061084, "step": 7110 }, { "epoch": 0.4339379718580255, "grad_norm": 0.4936695992946625, "learning_rate": 8e-05, "loss": 1.4437, "num_input_tokens_seen": 1004496060, "step": 7120 }, { "epoch": 0.4345474353016463, "grad_norm": 0.5039248466491699, "learning_rate": 8e-05, "loss": 1.4591, "num_input_tokens_seen": 1005928784, "step": 7130 }, { "epoch": 0.43515689874526714, "grad_norm": 0.5840840339660645, "learning_rate": 8e-05, "loss": 1.4275, "num_input_tokens_seen": 1007361236, "step": 7140 }, { "epoch": 0.435766362188888, "grad_norm": 0.5575258135795593, "learning_rate": 8e-05, "loss": 1.4881, "num_input_tokens_seen": 1008766028, "step": 7150 }, { "epoch": 0.43637582563250876, "grad_norm": 0.49963733553886414, "learning_rate": 8e-05, "loss": 1.4783, "num_input_tokens_seen": 1010202448, "step": 7160 }, { "epoch": 0.4369852890761296, "grad_norm": 0.4767080843448639, "learning_rate": 8e-05, "loss": 1.4741, "num_input_tokens_seen": 1011606648, "step": 7170 }, { "epoch": 0.43759475251975044, "grad_norm": 0.5568097829818726, "learning_rate": 8e-05, "loss": 1.5022, "num_input_tokens_seen": 1013003276, "step": 7180 }, { "epoch": 0.4382042159633712, "grad_norm": 0.5164647102355957, "learning_rate": 8e-05, "loss": 1.3946, "num_input_tokens_seen": 1014431492, "step": 7190 }, { "epoch": 0.43881367940699206, "grad_norm": 0.48684781789779663, "learning_rate": 8e-05, "loss": 1.4464, "num_input_tokens_seen": 1015812592, "step": 7200 }, { "epoch": 0.4394231428506129, "grad_norm": 0.4711934030056, "learning_rate": 8e-05, "loss": 1.4647, "num_input_tokens_seen": 1017217112, "step": 7210 }, { "epoch": 0.44003260629423374, "grad_norm": 0.5720543265342712, "learning_rate": 8e-05, "loss": 1.4723, "num_input_tokens_seen": 1018655572, "step": 7220 }, { "epoch": 0.4406420697378545, "grad_norm": 0.5549769997596741, "learning_rate": 8e-05, "loss": 1.4384, "num_input_tokens_seen": 1020040320, "step": 7230 }, { "epoch": 0.44125153318147536, "grad_norm": 0.5608406066894531, "learning_rate": 8e-05, "loss": 1.4143, "num_input_tokens_seen": 1021483904, "step": 7240 }, { "epoch": 0.4418609966250962, "grad_norm": 0.5251893401145935, "learning_rate": 8e-05, "loss": 1.3365, "num_input_tokens_seen": 1022886696, "step": 7250 }, { "epoch": 0.442470460068717, "grad_norm": 0.50985187292099, "learning_rate": 8e-05, "loss": 1.3904, "num_input_tokens_seen": 1024290644, "step": 7260 }, { "epoch": 0.4430799235123378, "grad_norm": 0.673911988735199, "learning_rate": 8e-05, "loss": 1.4286, "num_input_tokens_seen": 1025736504, "step": 7270 }, { "epoch": 0.44368938695595866, "grad_norm": 0.6181952357292175, "learning_rate": 8e-05, "loss": 1.4864, "num_input_tokens_seen": 1027167200, "step": 7280 }, { "epoch": 0.44429885039957945, "grad_norm": 0.6667662262916565, "learning_rate": 8e-05, "loss": 1.4002, "num_input_tokens_seen": 1028578528, "step": 7290 }, { "epoch": 0.4449083138432003, "grad_norm": 0.5670000910758972, "learning_rate": 8e-05, "loss": 1.4228, "num_input_tokens_seen": 1029984048, "step": 7300 }, { "epoch": 0.4455177772868211, "grad_norm": 0.527633786201477, "learning_rate": 8e-05, "loss": 1.423, "num_input_tokens_seen": 1031348748, "step": 7310 }, { "epoch": 0.4461272407304419, "grad_norm": 0.6152751445770264, "learning_rate": 8e-05, "loss": 1.4067, "num_input_tokens_seen": 1032747992, "step": 7320 }, { "epoch": 0.44673670417406275, "grad_norm": 0.6434319615364075, "learning_rate": 8e-05, "loss": 1.5126, "num_input_tokens_seen": 1034148668, "step": 7330 }, { "epoch": 0.4473461676176836, "grad_norm": 0.5347893238067627, "learning_rate": 8e-05, "loss": 1.5054, "num_input_tokens_seen": 1035562136, "step": 7340 }, { "epoch": 0.4479556310613044, "grad_norm": 0.6078560948371887, "learning_rate": 8e-05, "loss": 1.4428, "num_input_tokens_seen": 1036939864, "step": 7350 }, { "epoch": 0.4485650945049252, "grad_norm": 0.5783689618110657, "learning_rate": 8e-05, "loss": 1.4361, "num_input_tokens_seen": 1038323952, "step": 7360 }, { "epoch": 0.44917455794854605, "grad_norm": 0.5024809837341309, "learning_rate": 8e-05, "loss": 1.4971, "num_input_tokens_seen": 1039722864, "step": 7370 }, { "epoch": 0.4497840213921669, "grad_norm": 0.524659276008606, "learning_rate": 8e-05, "loss": 1.4948, "num_input_tokens_seen": 1041139980, "step": 7380 }, { "epoch": 0.45039348483578767, "grad_norm": 0.46882256865501404, "learning_rate": 8e-05, "loss": 1.4066, "num_input_tokens_seen": 1042543716, "step": 7390 }, { "epoch": 0.4510029482794085, "grad_norm": 1.1085089445114136, "learning_rate": 8e-05, "loss": 1.4335, "num_input_tokens_seen": 1043957848, "step": 7400 }, { "epoch": 0.45161241172302935, "grad_norm": 0.4720146059989929, "learning_rate": 8e-05, "loss": 1.5418, "num_input_tokens_seen": 1045388008, "step": 7410 }, { "epoch": 0.45222187516665013, "grad_norm": 0.47802358865737915, "learning_rate": 8e-05, "loss": 1.46, "num_input_tokens_seen": 1046809520, "step": 7420 }, { "epoch": 0.452831338610271, "grad_norm": 0.5443661212921143, "learning_rate": 8e-05, "loss": 1.4775, "num_input_tokens_seen": 1048246316, "step": 7430 }, { "epoch": 0.4534408020538918, "grad_norm": 0.5573719143867493, "learning_rate": 8e-05, "loss": 1.5155, "num_input_tokens_seen": 1049699188, "step": 7440 }, { "epoch": 0.45405026549751265, "grad_norm": 0.4769093692302704, "learning_rate": 8e-05, "loss": 1.3774, "num_input_tokens_seen": 1051064820, "step": 7450 }, { "epoch": 0.45465972894113343, "grad_norm": 0.5798795819282532, "learning_rate": 8e-05, "loss": 1.4688, "num_input_tokens_seen": 1052484612, "step": 7460 }, { "epoch": 0.4552691923847543, "grad_norm": 0.6087391376495361, "learning_rate": 8e-05, "loss": 1.416, "num_input_tokens_seen": 1053910944, "step": 7470 }, { "epoch": 0.4558786558283751, "grad_norm": 0.567535400390625, "learning_rate": 8e-05, "loss": 1.4634, "num_input_tokens_seen": 1055332428, "step": 7480 }, { "epoch": 0.4564881192719959, "grad_norm": 0.6195465922355652, "learning_rate": 8e-05, "loss": 1.3092, "num_input_tokens_seen": 1056726892, "step": 7490 }, { "epoch": 0.45709758271561673, "grad_norm": 0.5984314680099487, "learning_rate": 8e-05, "loss": 1.5001, "num_input_tokens_seen": 1058136916, "step": 7500 }, { "epoch": 0.4577070461592376, "grad_norm": 0.48998868465423584, "learning_rate": 8e-05, "loss": 1.4218, "num_input_tokens_seen": 1059580108, "step": 7510 }, { "epoch": 0.45831650960285836, "grad_norm": 0.5266983509063721, "learning_rate": 8e-05, "loss": 1.4329, "num_input_tokens_seen": 1060958932, "step": 7520 }, { "epoch": 0.4589259730464792, "grad_norm": 0.5505419969558716, "learning_rate": 8e-05, "loss": 1.4541, "num_input_tokens_seen": 1062394804, "step": 7530 }, { "epoch": 0.45953543649010004, "grad_norm": 0.5599749684333801, "learning_rate": 8e-05, "loss": 1.3939, "num_input_tokens_seen": 1063805320, "step": 7540 }, { "epoch": 0.4601448999337209, "grad_norm": 0.561083972454071, "learning_rate": 8e-05, "loss": 1.3821, "num_input_tokens_seen": 1065164348, "step": 7550 }, { "epoch": 0.46075436337734166, "grad_norm": 0.5383641719818115, "learning_rate": 8e-05, "loss": 1.5057, "num_input_tokens_seen": 1066572824, "step": 7560 }, { "epoch": 0.4613638268209625, "grad_norm": 0.4498058259487152, "learning_rate": 8e-05, "loss": 1.4079, "num_input_tokens_seen": 1067999532, "step": 7570 }, { "epoch": 0.46197329026458334, "grad_norm": 0.5523561239242554, "learning_rate": 8e-05, "loss": 1.4182, "num_input_tokens_seen": 1069413536, "step": 7580 }, { "epoch": 0.4625827537082041, "grad_norm": 0.5212712287902832, "learning_rate": 8e-05, "loss": 1.4475, "num_input_tokens_seen": 1070846708, "step": 7590 }, { "epoch": 0.46319221715182496, "grad_norm": 0.5242462158203125, "learning_rate": 8e-05, "loss": 1.427, "num_input_tokens_seen": 1072226000, "step": 7600 }, { "epoch": 0.4638016805954458, "grad_norm": 0.5377146601676941, "learning_rate": 8e-05, "loss": 1.4384, "num_input_tokens_seen": 1073640188, "step": 7610 }, { "epoch": 0.4644111440390666, "grad_norm": 0.5246424674987793, "learning_rate": 8e-05, "loss": 1.4643, "num_input_tokens_seen": 1075053120, "step": 7620 }, { "epoch": 0.4650206074826874, "grad_norm": 0.49351179599761963, "learning_rate": 8e-05, "loss": 1.4539, "num_input_tokens_seen": 1076464372, "step": 7630 }, { "epoch": 0.46563007092630826, "grad_norm": 0.4650513827800751, "learning_rate": 8e-05, "loss": 1.4067, "num_input_tokens_seen": 1077887580, "step": 7640 }, { "epoch": 0.4662395343699291, "grad_norm": 0.5255919694900513, "learning_rate": 8e-05, "loss": 1.4461, "num_input_tokens_seen": 1079314396, "step": 7650 }, { "epoch": 0.4668489978135499, "grad_norm": 0.45597824454307556, "learning_rate": 8e-05, "loss": 1.4076, "num_input_tokens_seen": 1080722664, "step": 7660 }, { "epoch": 0.4674584612571707, "grad_norm": 0.5342453718185425, "learning_rate": 8e-05, "loss": 1.3912, "num_input_tokens_seen": 1082104756, "step": 7670 }, { "epoch": 0.46806792470079156, "grad_norm": 0.5388374328613281, "learning_rate": 8e-05, "loss": 1.4392, "num_input_tokens_seen": 1083515704, "step": 7680 }, { "epoch": 0.46867738814441234, "grad_norm": 0.5510755777359009, "learning_rate": 8e-05, "loss": 1.4074, "num_input_tokens_seen": 1084952776, "step": 7690 }, { "epoch": 0.4692868515880332, "grad_norm": 0.5435276627540588, "learning_rate": 8e-05, "loss": 1.4402, "num_input_tokens_seen": 1086371416, "step": 7700 }, { "epoch": 0.469896315031654, "grad_norm": 0.4792904853820801, "learning_rate": 8e-05, "loss": 1.3989, "num_input_tokens_seen": 1087779788, "step": 7710 }, { "epoch": 0.4705057784752748, "grad_norm": 0.4804374575614929, "learning_rate": 8e-05, "loss": 1.3157, "num_input_tokens_seen": 1089174612, "step": 7720 }, { "epoch": 0.47111524191889564, "grad_norm": 0.47840744256973267, "learning_rate": 8e-05, "loss": 1.3208, "num_input_tokens_seen": 1090588704, "step": 7730 }, { "epoch": 0.4717247053625165, "grad_norm": 0.5292723178863525, "learning_rate": 8e-05, "loss": 1.4032, "num_input_tokens_seen": 1091995860, "step": 7740 }, { "epoch": 0.4723341688061373, "grad_norm": 0.5833985209465027, "learning_rate": 8e-05, "loss": 1.4286, "num_input_tokens_seen": 1093395952, "step": 7750 }, { "epoch": 0.4729436322497581, "grad_norm": 0.48498985171318054, "learning_rate": 8e-05, "loss": 1.4765, "num_input_tokens_seen": 1094805108, "step": 7760 }, { "epoch": 0.47355309569337894, "grad_norm": 0.5416792631149292, "learning_rate": 8e-05, "loss": 1.4559, "num_input_tokens_seen": 1096191520, "step": 7770 }, { "epoch": 0.4741625591369998, "grad_norm": 0.5118204951286316, "learning_rate": 8e-05, "loss": 1.4098, "num_input_tokens_seen": 1097626836, "step": 7780 }, { "epoch": 0.47477202258062057, "grad_norm": 0.5373117327690125, "learning_rate": 8e-05, "loss": 1.4222, "num_input_tokens_seen": 1099008448, "step": 7790 }, { "epoch": 0.4753814860242414, "grad_norm": 0.634472668170929, "learning_rate": 8e-05, "loss": 1.4626, "num_input_tokens_seen": 1100430712, "step": 7800 }, { "epoch": 0.47599094946786225, "grad_norm": 0.5606759190559387, "learning_rate": 8e-05, "loss": 1.4181, "num_input_tokens_seen": 1101864384, "step": 7810 }, { "epoch": 0.47660041291148303, "grad_norm": 0.6342650651931763, "learning_rate": 8e-05, "loss": 1.3675, "num_input_tokens_seen": 1103281588, "step": 7820 }, { "epoch": 0.47720987635510387, "grad_norm": 0.5315597057342529, "learning_rate": 8e-05, "loss": 1.4346, "num_input_tokens_seen": 1104678480, "step": 7830 }, { "epoch": 0.4778193397987247, "grad_norm": 0.6562740802764893, "learning_rate": 8e-05, "loss": 1.4669, "num_input_tokens_seen": 1106087552, "step": 7840 }, { "epoch": 0.47842880324234555, "grad_norm": 0.5265791416168213, "learning_rate": 8e-05, "loss": 1.4921, "num_input_tokens_seen": 1107533152, "step": 7850 }, { "epoch": 0.47903826668596633, "grad_norm": 0.4799894094467163, "learning_rate": 8e-05, "loss": 1.4559, "num_input_tokens_seen": 1108933276, "step": 7860 }, { "epoch": 0.47964773012958717, "grad_norm": 0.5308011174201965, "learning_rate": 8e-05, "loss": 1.3988, "num_input_tokens_seen": 1110344220, "step": 7870 }, { "epoch": 0.480257193573208, "grad_norm": 0.5046650171279907, "learning_rate": 8e-05, "loss": 1.4366, "num_input_tokens_seen": 1111735944, "step": 7880 }, { "epoch": 0.4808666570168288, "grad_norm": 0.5083670616149902, "learning_rate": 8e-05, "loss": 1.4219, "num_input_tokens_seen": 1113172872, "step": 7890 }, { "epoch": 0.48147612046044963, "grad_norm": 0.5572594404220581, "learning_rate": 8e-05, "loss": 1.4198, "num_input_tokens_seen": 1114579892, "step": 7900 }, { "epoch": 0.48208558390407047, "grad_norm": 0.5634285807609558, "learning_rate": 8e-05, "loss": 1.468, "num_input_tokens_seen": 1115978076, "step": 7910 }, { "epoch": 0.48269504734769125, "grad_norm": 0.4851561486721039, "learning_rate": 8e-05, "loss": 1.3592, "num_input_tokens_seen": 1117361664, "step": 7920 }, { "epoch": 0.4833045107913121, "grad_norm": 0.4718756079673767, "learning_rate": 8e-05, "loss": 1.439, "num_input_tokens_seen": 1118816252, "step": 7930 }, { "epoch": 0.48391397423493293, "grad_norm": 0.5959479808807373, "learning_rate": 8e-05, "loss": 1.4517, "num_input_tokens_seen": 1120264108, "step": 7940 }, { "epoch": 0.48452343767855377, "grad_norm": 0.5836747288703918, "learning_rate": 8e-05, "loss": 1.4244, "num_input_tokens_seen": 1121650124, "step": 7950 }, { "epoch": 0.48513290112217455, "grad_norm": 0.5627513527870178, "learning_rate": 8e-05, "loss": 1.4043, "num_input_tokens_seen": 1123080060, "step": 7960 }, { "epoch": 0.4857423645657954, "grad_norm": 0.5587437152862549, "learning_rate": 8e-05, "loss": 1.4019, "num_input_tokens_seen": 1124511240, "step": 7970 }, { "epoch": 0.48635182800941623, "grad_norm": 0.6421571969985962, "learning_rate": 8e-05, "loss": 1.4026, "num_input_tokens_seen": 1125934080, "step": 7980 }, { "epoch": 0.486961291453037, "grad_norm": 0.4620668888092041, "learning_rate": 8e-05, "loss": 1.4142, "num_input_tokens_seen": 1127349476, "step": 7990 }, { "epoch": 0.48757075489665785, "grad_norm": 0.5190436840057373, "learning_rate": 8e-05, "loss": 1.4625, "num_input_tokens_seen": 1128787572, "step": 8000 }, { "epoch": 0.4881802183402787, "grad_norm": 0.5595162510871887, "learning_rate": 8e-05, "loss": 1.4101, "num_input_tokens_seen": 1130217996, "step": 8010 }, { "epoch": 0.4887896817838995, "grad_norm": 0.4808787703514099, "learning_rate": 8e-05, "loss": 1.4088, "num_input_tokens_seen": 1131625032, "step": 8020 }, { "epoch": 0.4893991452275203, "grad_norm": 0.5082031488418579, "learning_rate": 8e-05, "loss": 1.4811, "num_input_tokens_seen": 1133074808, "step": 8030 }, { "epoch": 0.49000860867114115, "grad_norm": 0.7117087841033936, "learning_rate": 8e-05, "loss": 1.3978, "num_input_tokens_seen": 1134471984, "step": 8040 }, { "epoch": 0.490618072114762, "grad_norm": 0.4917582869529724, "learning_rate": 8e-05, "loss": 1.4326, "num_input_tokens_seen": 1135871844, "step": 8050 }, { "epoch": 0.4912275355583828, "grad_norm": 0.5196986794471741, "learning_rate": 8e-05, "loss": 1.4705, "num_input_tokens_seen": 1137251528, "step": 8060 }, { "epoch": 0.4918369990020036, "grad_norm": 0.45626333355903625, "learning_rate": 8e-05, "loss": 1.4173, "num_input_tokens_seen": 1138668772, "step": 8070 }, { "epoch": 0.49244646244562446, "grad_norm": 0.7028423547744751, "learning_rate": 8e-05, "loss": 1.4415, "num_input_tokens_seen": 1140072992, "step": 8080 }, { "epoch": 0.49305592588924524, "grad_norm": 0.4503718912601471, "learning_rate": 8e-05, "loss": 1.4979, "num_input_tokens_seen": 1141473780, "step": 8090 }, { "epoch": 0.4936653893328661, "grad_norm": 0.49622249603271484, "learning_rate": 8e-05, "loss": 1.4959, "num_input_tokens_seen": 1142896620, "step": 8100 }, { "epoch": 0.4942748527764869, "grad_norm": 0.45446139574050903, "learning_rate": 8e-05, "loss": 1.4125, "num_input_tokens_seen": 1144295328, "step": 8110 }, { "epoch": 0.4948843162201077, "grad_norm": 0.4794887602329254, "learning_rate": 8e-05, "loss": 1.4337, "num_input_tokens_seen": 1145761996, "step": 8120 }, { "epoch": 0.49549377966372854, "grad_norm": 0.535640299320221, "learning_rate": 8e-05, "loss": 1.4061, "num_input_tokens_seen": 1147167508, "step": 8130 }, { "epoch": 0.4961032431073494, "grad_norm": 0.5999556183815002, "learning_rate": 8e-05, "loss": 1.3923, "num_input_tokens_seen": 1148590576, "step": 8140 }, { "epoch": 0.49671270655097016, "grad_norm": 0.4626942574977875, "learning_rate": 8e-05, "loss": 1.3859, "num_input_tokens_seen": 1149968016, "step": 8150 }, { "epoch": 0.497322169994591, "grad_norm": 0.5697856545448303, "learning_rate": 8e-05, "loss": 1.4115, "num_input_tokens_seen": 1151379092, "step": 8160 }, { "epoch": 0.49793163343821184, "grad_norm": 0.5212153196334839, "learning_rate": 8e-05, "loss": 1.4006, "num_input_tokens_seen": 1152789164, "step": 8170 }, { "epoch": 0.4985410968818327, "grad_norm": 0.5307455062866211, "learning_rate": 8e-05, "loss": 1.4147, "num_input_tokens_seen": 1154215056, "step": 8180 }, { "epoch": 0.49915056032545346, "grad_norm": 0.49318286776542664, "learning_rate": 8e-05, "loss": 1.3641, "num_input_tokens_seen": 1155560728, "step": 8190 }, { "epoch": 0.4997600237690743, "grad_norm": 0.5201637148857117, "learning_rate": 8e-05, "loss": 1.4909, "num_input_tokens_seen": 1156956988, "step": 8200 }, { "epoch": 0.5003694872126951, "grad_norm": 0.5208278894424438, "learning_rate": 8e-05, "loss": 1.4412, "num_input_tokens_seen": 1158393764, "step": 8210 }, { "epoch": 0.5009789506563159, "grad_norm": 0.5729420185089111, "learning_rate": 8e-05, "loss": 1.3848, "num_input_tokens_seen": 1159824056, "step": 8220 }, { "epoch": 0.5015884140999368, "grad_norm": 0.4863448739051819, "learning_rate": 8e-05, "loss": 1.3707, "num_input_tokens_seen": 1161241976, "step": 8230 }, { "epoch": 0.5021978775435576, "grad_norm": 0.544262707233429, "learning_rate": 8e-05, "loss": 1.452, "num_input_tokens_seen": 1162622704, "step": 8240 }, { "epoch": 0.5028073409871784, "grad_norm": 0.5542010068893433, "learning_rate": 8e-05, "loss": 1.4808, "num_input_tokens_seen": 1164019924, "step": 8250 }, { "epoch": 0.5034168044307993, "grad_norm": 0.466775506734848, "learning_rate": 8e-05, "loss": 1.4303, "num_input_tokens_seen": 1165465436, "step": 8260 }, { "epoch": 0.5040262678744201, "grad_norm": 0.6469402313232422, "learning_rate": 8e-05, "loss": 1.4665, "num_input_tokens_seen": 1166886136, "step": 8270 }, { "epoch": 0.5046357313180408, "grad_norm": 0.6200391054153442, "learning_rate": 8e-05, "loss": 1.4426, "num_input_tokens_seen": 1168299392, "step": 8280 }, { "epoch": 0.5052451947616617, "grad_norm": 0.5519371628761292, "learning_rate": 8e-05, "loss": 1.4484, "num_input_tokens_seen": 1169721572, "step": 8290 }, { "epoch": 0.5058546582052825, "grad_norm": 0.5275024771690369, "learning_rate": 8e-05, "loss": 1.3321, "num_input_tokens_seen": 1171103648, "step": 8300 }, { "epoch": 0.5064641216489033, "grad_norm": 0.522892951965332, "learning_rate": 8e-05, "loss": 1.3864, "num_input_tokens_seen": 1172502288, "step": 8310 }, { "epoch": 0.5070735850925242, "grad_norm": 0.47808873653411865, "learning_rate": 8e-05, "loss": 1.3831, "num_input_tokens_seen": 1173938752, "step": 8320 }, { "epoch": 0.507683048536145, "grad_norm": 0.6184147596359253, "learning_rate": 8e-05, "loss": 1.4538, "num_input_tokens_seen": 1175364784, "step": 8330 }, { "epoch": 0.5082925119797658, "grad_norm": 0.5707573890686035, "learning_rate": 8e-05, "loss": 1.3267, "num_input_tokens_seen": 1176787692, "step": 8340 }, { "epoch": 0.5089019754233867, "grad_norm": 0.5967716574668884, "learning_rate": 8e-05, "loss": 1.4053, "num_input_tokens_seen": 1178179564, "step": 8350 }, { "epoch": 0.5095114388670074, "grad_norm": 0.5708630681037903, "learning_rate": 8e-05, "loss": 1.478, "num_input_tokens_seen": 1179580276, "step": 8360 }, { "epoch": 0.5101209023106282, "grad_norm": 0.5356885194778442, "learning_rate": 8e-05, "loss": 1.4038, "num_input_tokens_seen": 1180980160, "step": 8370 }, { "epoch": 0.5107303657542491, "grad_norm": 0.6001008749008179, "learning_rate": 8e-05, "loss": 1.4924, "num_input_tokens_seen": 1182394556, "step": 8380 }, { "epoch": 0.5113398291978699, "grad_norm": 0.4999195635318756, "learning_rate": 8e-05, "loss": 1.3733, "num_input_tokens_seen": 1183836592, "step": 8390 }, { "epoch": 0.5119492926414907, "grad_norm": 0.5139046311378479, "learning_rate": 8e-05, "loss": 1.399, "num_input_tokens_seen": 1185238392, "step": 8400 }, { "epoch": 0.5125587560851116, "grad_norm": 0.614023745059967, "learning_rate": 8e-05, "loss": 1.426, "num_input_tokens_seen": 1186634124, "step": 8410 }, { "epoch": 0.5131682195287324, "grad_norm": 0.5585746765136719, "learning_rate": 8e-05, "loss": 1.4441, "num_input_tokens_seen": 1188016924, "step": 8420 }, { "epoch": 0.5137776829723533, "grad_norm": 0.6003695726394653, "learning_rate": 8e-05, "loss": 1.4042, "num_input_tokens_seen": 1189415668, "step": 8430 }, { "epoch": 0.514387146415974, "grad_norm": 0.6207943558692932, "learning_rate": 8e-05, "loss": 1.3925, "num_input_tokens_seen": 1190833848, "step": 8440 }, { "epoch": 0.5149966098595948, "grad_norm": 0.5384374260902405, "learning_rate": 8e-05, "loss": 1.3952, "num_input_tokens_seen": 1192256296, "step": 8450 }, { "epoch": 0.5156060733032157, "grad_norm": 0.5247148275375366, "learning_rate": 8e-05, "loss": 1.471, "num_input_tokens_seen": 1193697300, "step": 8460 }, { "epoch": 0.5162155367468365, "grad_norm": 0.5606392621994019, "learning_rate": 8e-05, "loss": 1.5178, "num_input_tokens_seen": 1195149596, "step": 8470 }, { "epoch": 0.5168250001904573, "grad_norm": 0.5033918619155884, "learning_rate": 8e-05, "loss": 1.3954, "num_input_tokens_seen": 1196543328, "step": 8480 }, { "epoch": 0.5174344636340782, "grad_norm": 0.5382705330848694, "learning_rate": 8e-05, "loss": 1.3954, "num_input_tokens_seen": 1197953020, "step": 8490 }, { "epoch": 0.518043927077699, "grad_norm": 0.508390486240387, "learning_rate": 8e-05, "loss": 1.448, "num_input_tokens_seen": 1199356544, "step": 8500 }, { "epoch": 0.5186533905213198, "grad_norm": 0.49335700273513794, "learning_rate": 8e-05, "loss": 1.4365, "num_input_tokens_seen": 1200772144, "step": 8510 }, { "epoch": 0.5192628539649407, "grad_norm": 0.511746883392334, "learning_rate": 8e-05, "loss": 1.4413, "num_input_tokens_seen": 1202172256, "step": 8520 }, { "epoch": 0.5198723174085614, "grad_norm": 0.7011821269989014, "learning_rate": 8e-05, "loss": 1.4147, "num_input_tokens_seen": 1203597072, "step": 8530 }, { "epoch": 0.5204817808521822, "grad_norm": 0.612265408039093, "learning_rate": 8e-05, "loss": 1.4681, "num_input_tokens_seen": 1204985968, "step": 8540 }, { "epoch": 0.5210912442958031, "grad_norm": 0.510905385017395, "learning_rate": 8e-05, "loss": 1.4615, "num_input_tokens_seen": 1206391220, "step": 8550 }, { "epoch": 0.5217007077394239, "grad_norm": 0.6800599694252014, "learning_rate": 8e-05, "loss": 1.4496, "num_input_tokens_seen": 1207786772, "step": 8560 }, { "epoch": 0.5223101711830447, "grad_norm": 0.6727950572967529, "learning_rate": 8e-05, "loss": 1.4528, "num_input_tokens_seen": 1209206112, "step": 8570 }, { "epoch": 0.5229196346266656, "grad_norm": 0.6258131265640259, "learning_rate": 8e-05, "loss": 1.3601, "num_input_tokens_seen": 1210620340, "step": 8580 }, { "epoch": 0.5235290980702864, "grad_norm": 0.5519835948944092, "learning_rate": 8e-05, "loss": 1.3857, "num_input_tokens_seen": 1212035396, "step": 8590 }, { "epoch": 0.5241385615139071, "grad_norm": 0.49297013878822327, "learning_rate": 8e-05, "loss": 1.4223, "num_input_tokens_seen": 1213445184, "step": 8600 }, { "epoch": 0.524748024957528, "grad_norm": 0.47780099511146545, "learning_rate": 8e-05, "loss": 1.378, "num_input_tokens_seen": 1214854316, "step": 8610 }, { "epoch": 0.5253574884011488, "grad_norm": 0.5096369981765747, "learning_rate": 8e-05, "loss": 1.4467, "num_input_tokens_seen": 1216250744, "step": 8620 }, { "epoch": 0.5259669518447697, "grad_norm": 0.5387448072433472, "learning_rate": 8e-05, "loss": 1.4201, "num_input_tokens_seen": 1217686440, "step": 8630 }, { "epoch": 0.5265764152883905, "grad_norm": 0.5782696604728699, "learning_rate": 8e-05, "loss": 1.4183, "num_input_tokens_seen": 1219090264, "step": 8640 }, { "epoch": 0.5271858787320113, "grad_norm": 0.4248006343841553, "learning_rate": 8e-05, "loss": 1.5077, "num_input_tokens_seen": 1220493872, "step": 8650 }, { "epoch": 0.5277953421756322, "grad_norm": 0.5155441164970398, "learning_rate": 8e-05, "loss": 1.461, "num_input_tokens_seen": 1221904396, "step": 8660 }, { "epoch": 0.528404805619253, "grad_norm": 0.7794909477233887, "learning_rate": 8e-05, "loss": 1.359, "num_input_tokens_seen": 1223272932, "step": 8670 }, { "epoch": 0.5290142690628737, "grad_norm": 0.5529311299324036, "learning_rate": 8e-05, "loss": 1.4378, "num_input_tokens_seen": 1224665304, "step": 8680 }, { "epoch": 0.5296237325064946, "grad_norm": 0.5060685276985168, "learning_rate": 8e-05, "loss": 1.4173, "num_input_tokens_seen": 1226062180, "step": 8690 }, { "epoch": 0.5302331959501154, "grad_norm": 0.4951160252094269, "learning_rate": 8e-05, "loss": 1.4475, "num_input_tokens_seen": 1227516508, "step": 8700 }, { "epoch": 0.5308426593937362, "grad_norm": 0.48395630717277527, "learning_rate": 8e-05, "loss": 1.374, "num_input_tokens_seen": 1228928164, "step": 8710 }, { "epoch": 0.5314521228373571, "grad_norm": 0.5231735110282898, "learning_rate": 8e-05, "loss": 1.3518, "num_input_tokens_seen": 1230332108, "step": 8720 }, { "epoch": 0.5320615862809779, "grad_norm": 0.6426222324371338, "learning_rate": 8e-05, "loss": 1.3299, "num_input_tokens_seen": 1231769688, "step": 8730 }, { "epoch": 0.5326710497245987, "grad_norm": 0.496803343296051, "learning_rate": 8e-05, "loss": 1.3993, "num_input_tokens_seen": 1233210764, "step": 8740 }, { "epoch": 0.5332805131682196, "grad_norm": 0.4480886459350586, "learning_rate": 8e-05, "loss": 1.4256, "num_input_tokens_seen": 1234671524, "step": 8750 }, { "epoch": 0.5338899766118403, "grad_norm": 0.48204657435417175, "learning_rate": 8e-05, "loss": 1.3814, "num_input_tokens_seen": 1236085908, "step": 8760 }, { "epoch": 0.5344994400554611, "grad_norm": 0.5191913843154907, "learning_rate": 8e-05, "loss": 1.3768, "num_input_tokens_seen": 1237467536, "step": 8770 }, { "epoch": 0.535108903499082, "grad_norm": 0.5462549924850464, "learning_rate": 8e-05, "loss": 1.415, "num_input_tokens_seen": 1238886584, "step": 8780 }, { "epoch": 0.5357183669427028, "grad_norm": 0.574504554271698, "learning_rate": 8e-05, "loss": 1.4029, "num_input_tokens_seen": 1240281716, "step": 8790 }, { "epoch": 0.5363278303863236, "grad_norm": 0.5356009602546692, "learning_rate": 8e-05, "loss": 1.4261, "num_input_tokens_seen": 1241675192, "step": 8800 }, { "epoch": 0.5369372938299445, "grad_norm": 0.5325603485107422, "learning_rate": 8e-05, "loss": 1.3992, "num_input_tokens_seen": 1243056036, "step": 8810 }, { "epoch": 0.5375467572735653, "grad_norm": 0.47047409415245056, "learning_rate": 8e-05, "loss": 1.3463, "num_input_tokens_seen": 1244426652, "step": 8820 }, { "epoch": 0.5381562207171862, "grad_norm": 0.5718605518341064, "learning_rate": 8e-05, "loss": 1.4106, "num_input_tokens_seen": 1245827804, "step": 8830 }, { "epoch": 0.538765684160807, "grad_norm": 0.5218635201454163, "learning_rate": 8e-05, "loss": 1.4177, "num_input_tokens_seen": 1247204320, "step": 8840 }, { "epoch": 0.5393751476044277, "grad_norm": 0.6211861968040466, "learning_rate": 8e-05, "loss": 1.5218, "num_input_tokens_seen": 1248618312, "step": 8850 }, { "epoch": 0.5399846110480486, "grad_norm": 0.49085572361946106, "learning_rate": 8e-05, "loss": 1.4029, "num_input_tokens_seen": 1250024496, "step": 8860 }, { "epoch": 0.5405940744916694, "grad_norm": 0.5032560229301453, "learning_rate": 8e-05, "loss": 1.4333, "num_input_tokens_seen": 1251433236, "step": 8870 }, { "epoch": 0.5412035379352902, "grad_norm": 0.7042189240455627, "learning_rate": 8e-05, "loss": 1.3757, "num_input_tokens_seen": 1252853548, "step": 8880 }, { "epoch": 0.5418130013789111, "grad_norm": 0.5665264129638672, "learning_rate": 8e-05, "loss": 1.413, "num_input_tokens_seen": 1254255272, "step": 8890 }, { "epoch": 0.5424224648225319, "grad_norm": 0.5010056495666504, "learning_rate": 8e-05, "loss": 1.328, "num_input_tokens_seen": 1255651628, "step": 8900 }, { "epoch": 0.5430319282661527, "grad_norm": 0.5278030037879944, "learning_rate": 8e-05, "loss": 1.4238, "num_input_tokens_seen": 1257075628, "step": 8910 }, { "epoch": 0.5436413917097735, "grad_norm": 0.5137773156166077, "learning_rate": 8e-05, "loss": 1.4509, "num_input_tokens_seen": 1258492128, "step": 8920 }, { "epoch": 0.5442508551533943, "grad_norm": 0.49045154452323914, "learning_rate": 8e-05, "loss": 1.473, "num_input_tokens_seen": 1259921876, "step": 8930 }, { "epoch": 0.5448603185970151, "grad_norm": 0.5632349848747253, "learning_rate": 8e-05, "loss": 1.4155, "num_input_tokens_seen": 1261282328, "step": 8940 }, { "epoch": 0.545469782040636, "grad_norm": 0.4805395007133484, "learning_rate": 8e-05, "loss": 1.445, "num_input_tokens_seen": 1262718360, "step": 8950 }, { "epoch": 0.5460792454842568, "grad_norm": 0.5406620502471924, "learning_rate": 8e-05, "loss": 1.3724, "num_input_tokens_seen": 1264130680, "step": 8960 }, { "epoch": 0.5466887089278776, "grad_norm": 0.5531741976737976, "learning_rate": 8e-05, "loss": 1.4404, "num_input_tokens_seen": 1265513744, "step": 8970 }, { "epoch": 0.5472981723714985, "grad_norm": 0.5422140955924988, "learning_rate": 8e-05, "loss": 1.3957, "num_input_tokens_seen": 1266903584, "step": 8980 }, { "epoch": 0.5479076358151193, "grad_norm": 0.5059279203414917, "learning_rate": 8e-05, "loss": 1.36, "num_input_tokens_seen": 1268300852, "step": 8990 }, { "epoch": 0.54851709925874, "grad_norm": 0.4933377504348755, "learning_rate": 8e-05, "loss": 1.3714, "num_input_tokens_seen": 1269717552, "step": 9000 }, { "epoch": 0.5491265627023609, "grad_norm": 0.6613243818283081, "learning_rate": 8e-05, "loss": 1.4211, "num_input_tokens_seen": 1271115644, "step": 9010 }, { "epoch": 0.5497360261459817, "grad_norm": 0.49182480573654175, "learning_rate": 8e-05, "loss": 1.352, "num_input_tokens_seen": 1272504180, "step": 9020 }, { "epoch": 0.5503454895896025, "grad_norm": 0.5405098795890808, "learning_rate": 8e-05, "loss": 1.4031, "num_input_tokens_seen": 1273932024, "step": 9030 }, { "epoch": 0.5509549530332234, "grad_norm": 0.5463997721672058, "learning_rate": 8e-05, "loss": 1.5083, "num_input_tokens_seen": 1275332648, "step": 9040 }, { "epoch": 0.5515644164768442, "grad_norm": 0.5602612495422363, "learning_rate": 8e-05, "loss": 1.3726, "num_input_tokens_seen": 1276735532, "step": 9050 }, { "epoch": 0.5521738799204651, "grad_norm": 0.5101543068885803, "learning_rate": 8e-05, "loss": 1.3778, "num_input_tokens_seen": 1278146092, "step": 9060 }, { "epoch": 0.5527833433640859, "grad_norm": 0.5481278896331787, "learning_rate": 8e-05, "loss": 1.4419, "num_input_tokens_seen": 1279583276, "step": 9070 }, { "epoch": 0.5533928068077066, "grad_norm": 0.4749618172645569, "learning_rate": 8e-05, "loss": 1.4162, "num_input_tokens_seen": 1281000036, "step": 9080 }, { "epoch": 0.5540022702513275, "grad_norm": 0.5409462451934814, "learning_rate": 8e-05, "loss": 1.3868, "num_input_tokens_seen": 1282445072, "step": 9090 }, { "epoch": 0.5546117336949483, "grad_norm": 0.5022688508033752, "learning_rate": 8e-05, "loss": 1.4162, "num_input_tokens_seen": 1283856564, "step": 9100 }, { "epoch": 0.5552211971385691, "grad_norm": 0.4955657124519348, "learning_rate": 8e-05, "loss": 1.41, "num_input_tokens_seen": 1285249116, "step": 9110 }, { "epoch": 0.55583066058219, "grad_norm": 0.4351109564304352, "learning_rate": 8e-05, "loss": 1.4364, "num_input_tokens_seen": 1286611828, "step": 9120 }, { "epoch": 0.5564401240258108, "grad_norm": 0.47502756118774414, "learning_rate": 8e-05, "loss": 1.3688, "num_input_tokens_seen": 1288059680, "step": 9130 }, { "epoch": 0.5570495874694316, "grad_norm": 0.5219740867614746, "learning_rate": 8e-05, "loss": 1.3995, "num_input_tokens_seen": 1289506464, "step": 9140 }, { "epoch": 0.5576590509130525, "grad_norm": 0.5647391676902771, "learning_rate": 8e-05, "loss": 1.3867, "num_input_tokens_seen": 1290890052, "step": 9150 }, { "epoch": 0.5582685143566732, "grad_norm": 0.5420753359794617, "learning_rate": 8e-05, "loss": 1.4256, "num_input_tokens_seen": 1292311544, "step": 9160 }, { "epoch": 0.558877977800294, "grad_norm": 0.629084050655365, "learning_rate": 8e-05, "loss": 1.422, "num_input_tokens_seen": 1293705252, "step": 9170 }, { "epoch": 0.5594874412439149, "grad_norm": 0.5770242214202881, "learning_rate": 8e-05, "loss": 1.3623, "num_input_tokens_seen": 1295093572, "step": 9180 }, { "epoch": 0.5600969046875357, "grad_norm": 0.5409170985221863, "learning_rate": 8e-05, "loss": 1.4737, "num_input_tokens_seen": 1296519364, "step": 9190 }, { "epoch": 0.5607063681311565, "grad_norm": 0.5729120373725891, "learning_rate": 8e-05, "loss": 1.4459, "num_input_tokens_seen": 1297897980, "step": 9200 }, { "epoch": 0.5613158315747774, "grad_norm": 0.5346562266349792, "learning_rate": 8e-05, "loss": 1.4169, "num_input_tokens_seen": 1299282408, "step": 9210 }, { "epoch": 0.5619252950183982, "grad_norm": 0.6186701655387878, "learning_rate": 8e-05, "loss": 1.3818, "num_input_tokens_seen": 1300736024, "step": 9220 }, { "epoch": 0.562534758462019, "grad_norm": 0.5668373107910156, "learning_rate": 8e-05, "loss": 1.4676, "num_input_tokens_seen": 1302156036, "step": 9230 }, { "epoch": 0.5631442219056398, "grad_norm": 0.603315532207489, "learning_rate": 8e-05, "loss": 1.3285, "num_input_tokens_seen": 1303545448, "step": 9240 }, { "epoch": 0.5637536853492606, "grad_norm": 0.5525988936424255, "learning_rate": 8e-05, "loss": 1.4286, "num_input_tokens_seen": 1304987840, "step": 9250 }, { "epoch": 0.5643631487928815, "grad_norm": 0.5281280279159546, "learning_rate": 8e-05, "loss": 1.4153, "num_input_tokens_seen": 1306349096, "step": 9260 }, { "epoch": 0.5649726122365023, "grad_norm": 0.539382815361023, "learning_rate": 8e-05, "loss": 1.4315, "num_input_tokens_seen": 1307741544, "step": 9270 }, { "epoch": 0.5655820756801231, "grad_norm": 0.49413955211639404, "learning_rate": 8e-05, "loss": 1.422, "num_input_tokens_seen": 1309121068, "step": 9280 }, { "epoch": 0.566191539123744, "grad_norm": 0.7388852834701538, "learning_rate": 8e-05, "loss": 1.3662, "num_input_tokens_seen": 1310517248, "step": 9290 }, { "epoch": 0.5668010025673648, "grad_norm": 0.5051785707473755, "learning_rate": 8e-05, "loss": 1.3247, "num_input_tokens_seen": 1311906244, "step": 9300 }, { "epoch": 0.5674104660109855, "grad_norm": 0.5112394094467163, "learning_rate": 8e-05, "loss": 1.3472, "num_input_tokens_seen": 1313330152, "step": 9310 }, { "epoch": 0.5680199294546064, "grad_norm": 0.5090803503990173, "learning_rate": 8e-05, "loss": 1.3978, "num_input_tokens_seen": 1314772320, "step": 9320 }, { "epoch": 0.5686293928982272, "grad_norm": 0.4785972237586975, "learning_rate": 8e-05, "loss": 1.3579, "num_input_tokens_seen": 1316168788, "step": 9330 }, { "epoch": 0.569238856341848, "grad_norm": 0.565299928188324, "learning_rate": 8e-05, "loss": 1.386, "num_input_tokens_seen": 1317603684, "step": 9340 }, { "epoch": 0.5698483197854689, "grad_norm": 0.5050956010818481, "learning_rate": 8e-05, "loss": 1.4233, "num_input_tokens_seen": 1319033440, "step": 9350 }, { "epoch": 0.5704577832290897, "grad_norm": 0.5199946761131287, "learning_rate": 8e-05, "loss": 1.4047, "num_input_tokens_seen": 1320462816, "step": 9360 }, { "epoch": 0.5710672466727105, "grad_norm": 0.47691354155540466, "learning_rate": 8e-05, "loss": 1.3643, "num_input_tokens_seen": 1321889000, "step": 9370 }, { "epoch": 0.5716767101163314, "grad_norm": 0.5591989159584045, "learning_rate": 8e-05, "loss": 1.4623, "num_input_tokens_seen": 1323308556, "step": 9380 }, { "epoch": 0.5722861735599521, "grad_norm": 0.516518771648407, "learning_rate": 8e-05, "loss": 1.4165, "num_input_tokens_seen": 1324702200, "step": 9390 }, { "epoch": 0.5728956370035729, "grad_norm": 0.5647040009498596, "learning_rate": 8e-05, "loss": 1.3805, "num_input_tokens_seen": 1326076652, "step": 9400 }, { "epoch": 0.5735051004471938, "grad_norm": 0.5209704637527466, "learning_rate": 8e-05, "loss": 1.3816, "num_input_tokens_seen": 1327492928, "step": 9410 }, { "epoch": 0.5741145638908146, "grad_norm": 0.6285092234611511, "learning_rate": 8e-05, "loss": 1.3143, "num_input_tokens_seen": 1328873088, "step": 9420 }, { "epoch": 0.5747240273344354, "grad_norm": 0.42731142044067383, "learning_rate": 8e-05, "loss": 1.3784, "num_input_tokens_seen": 1330316768, "step": 9430 }, { "epoch": 0.5753334907780563, "grad_norm": 0.500942051410675, "learning_rate": 8e-05, "loss": 1.3928, "num_input_tokens_seen": 1331718868, "step": 9440 }, { "epoch": 0.5759429542216771, "grad_norm": 0.5587007403373718, "learning_rate": 8e-05, "loss": 1.423, "num_input_tokens_seen": 1333111484, "step": 9450 }, { "epoch": 0.576552417665298, "grad_norm": 0.48135367035865784, "learning_rate": 8e-05, "loss": 1.4287, "num_input_tokens_seen": 1334496056, "step": 9460 }, { "epoch": 0.5771618811089188, "grad_norm": 0.5771949887275696, "learning_rate": 8e-05, "loss": 1.4181, "num_input_tokens_seen": 1335922768, "step": 9470 }, { "epoch": 0.5777713445525395, "grad_norm": 0.5315853953361511, "learning_rate": 8e-05, "loss": 1.4274, "num_input_tokens_seen": 1337302964, "step": 9480 }, { "epoch": 0.5783808079961604, "grad_norm": 0.5134252309799194, "learning_rate": 8e-05, "loss": 1.4307, "num_input_tokens_seen": 1338719692, "step": 9490 }, { "epoch": 0.5789902714397812, "grad_norm": 0.5051198601722717, "learning_rate": 8e-05, "loss": 1.496, "num_input_tokens_seen": 1340134348, "step": 9500 }, { "epoch": 0.579599734883402, "grad_norm": 0.549379289150238, "learning_rate": 8e-05, "loss": 1.4031, "num_input_tokens_seen": 1341522456, "step": 9510 }, { "epoch": 0.5802091983270229, "grad_norm": 0.56511390209198, "learning_rate": 8e-05, "loss": 1.3683, "num_input_tokens_seen": 1342933524, "step": 9520 }, { "epoch": 0.5808186617706437, "grad_norm": 0.55156409740448, "learning_rate": 8e-05, "loss": 1.4012, "num_input_tokens_seen": 1344352168, "step": 9530 }, { "epoch": 0.5814281252142645, "grad_norm": 0.4940817654132843, "learning_rate": 8e-05, "loss": 1.4177, "num_input_tokens_seen": 1345777572, "step": 9540 }, { "epoch": 0.5820375886578854, "grad_norm": 0.4556514620780945, "learning_rate": 8e-05, "loss": 1.3985, "num_input_tokens_seen": 1347204108, "step": 9550 }, { "epoch": 0.5826470521015061, "grad_norm": 0.5312452912330627, "learning_rate": 8e-05, "loss": 1.3973, "num_input_tokens_seen": 1348650808, "step": 9560 }, { "epoch": 0.5832565155451269, "grad_norm": 0.5042407512664795, "learning_rate": 8e-05, "loss": 1.4426, "num_input_tokens_seen": 1350015484, "step": 9570 }, { "epoch": 0.5838659789887478, "grad_norm": 0.549752950668335, "learning_rate": 8e-05, "loss": 1.4271, "num_input_tokens_seen": 1351431476, "step": 9580 }, { "epoch": 0.5844754424323686, "grad_norm": 0.5132181644439697, "learning_rate": 8e-05, "loss": 1.3289, "num_input_tokens_seen": 1352869096, "step": 9590 }, { "epoch": 0.5850849058759894, "grad_norm": 0.6704673767089844, "learning_rate": 8e-05, "loss": 1.3606, "num_input_tokens_seen": 1354273284, "step": 9600 }, { "epoch": 0.5856943693196103, "grad_norm": 0.5381009578704834, "learning_rate": 8e-05, "loss": 1.3736, "num_input_tokens_seen": 1355701548, "step": 9610 }, { "epoch": 0.5863038327632311, "grad_norm": 0.4848478138446808, "learning_rate": 8e-05, "loss": 1.4624, "num_input_tokens_seen": 1357098184, "step": 9620 }, { "epoch": 0.5869132962068518, "grad_norm": 0.5218216180801392, "learning_rate": 8e-05, "loss": 1.3911, "num_input_tokens_seen": 1358496652, "step": 9630 }, { "epoch": 0.5875227596504727, "grad_norm": 0.5365182757377625, "learning_rate": 8e-05, "loss": 1.4491, "num_input_tokens_seen": 1359933264, "step": 9640 }, { "epoch": 0.5881322230940935, "grad_norm": 0.5306704044342041, "learning_rate": 8e-05, "loss": 1.4075, "num_input_tokens_seen": 1361319892, "step": 9650 }, { "epoch": 0.5887416865377144, "grad_norm": 0.5075612664222717, "learning_rate": 8e-05, "loss": 1.3719, "num_input_tokens_seen": 1362739532, "step": 9660 }, { "epoch": 0.5893511499813352, "grad_norm": 0.5192306041717529, "learning_rate": 8e-05, "loss": 1.3277, "num_input_tokens_seen": 1364184264, "step": 9670 }, { "epoch": 0.589960613424956, "grad_norm": 0.5479230880737305, "learning_rate": 8e-05, "loss": 1.4084, "num_input_tokens_seen": 1365614432, "step": 9680 }, { "epoch": 0.5905700768685769, "grad_norm": 0.6008402109146118, "learning_rate": 8e-05, "loss": 1.4572, "num_input_tokens_seen": 1367052460, "step": 9690 }, { "epoch": 0.5911795403121977, "grad_norm": 0.42700648307800293, "learning_rate": 8e-05, "loss": 1.3478, "num_input_tokens_seen": 1368451128, "step": 9700 }, { "epoch": 0.5917890037558184, "grad_norm": 0.5453005433082581, "learning_rate": 8e-05, "loss": 1.4283, "num_input_tokens_seen": 1369849936, "step": 9710 }, { "epoch": 0.5923984671994393, "grad_norm": 0.5360944271087646, "learning_rate": 8e-05, "loss": 1.4105, "num_input_tokens_seen": 1371263976, "step": 9720 }, { "epoch": 0.5930079306430601, "grad_norm": 0.5020470023155212, "learning_rate": 8e-05, "loss": 1.39, "num_input_tokens_seen": 1372649640, "step": 9730 }, { "epoch": 0.5936173940866809, "grad_norm": 0.6608033180236816, "learning_rate": 8e-05, "loss": 1.3897, "num_input_tokens_seen": 1374020584, "step": 9740 }, { "epoch": 0.5942268575303018, "grad_norm": 0.4657529294490814, "learning_rate": 8e-05, "loss": 1.3444, "num_input_tokens_seen": 1375393228, "step": 9750 }, { "epoch": 0.5948363209739226, "grad_norm": 0.6202511191368103, "learning_rate": 8e-05, "loss": 1.4421, "num_input_tokens_seen": 1376785448, "step": 9760 }, { "epoch": 0.5954457844175434, "grad_norm": 0.48330217599868774, "learning_rate": 8e-05, "loss": 1.3878, "num_input_tokens_seen": 1378221156, "step": 9770 }, { "epoch": 0.5960552478611643, "grad_norm": 0.5171040296554565, "learning_rate": 8e-05, "loss": 1.406, "num_input_tokens_seen": 1379642012, "step": 9780 }, { "epoch": 0.596664711304785, "grad_norm": 0.5539959669113159, "learning_rate": 8e-05, "loss": 1.3386, "num_input_tokens_seen": 1381034376, "step": 9790 }, { "epoch": 0.5972741747484058, "grad_norm": 0.5682281851768494, "learning_rate": 8e-05, "loss": 1.3227, "num_input_tokens_seen": 1382464908, "step": 9800 }, { "epoch": 0.5978836381920267, "grad_norm": 0.5348504781723022, "learning_rate": 8e-05, "loss": 1.3212, "num_input_tokens_seen": 1383873968, "step": 9810 }, { "epoch": 0.5984931016356475, "grad_norm": 0.5483260750770569, "learning_rate": 8e-05, "loss": 1.4158, "num_input_tokens_seen": 1385281380, "step": 9820 }, { "epoch": 0.5991025650792683, "grad_norm": 0.4727989137172699, "learning_rate": 8e-05, "loss": 1.3502, "num_input_tokens_seen": 1386679784, "step": 9830 }, { "epoch": 0.5997120285228892, "grad_norm": 0.5440919399261475, "learning_rate": 8e-05, "loss": 1.5048, "num_input_tokens_seen": 1388088844, "step": 9840 }, { "epoch": 0.60032149196651, "grad_norm": 0.5022852420806885, "learning_rate": 8e-05, "loss": 1.3446, "num_input_tokens_seen": 1389562772, "step": 9850 }, { "epoch": 0.6009309554101308, "grad_norm": 0.5559846758842468, "learning_rate": 8e-05, "loss": 1.4179, "num_input_tokens_seen": 1390956260, "step": 9860 }, { "epoch": 0.6015404188537516, "grad_norm": 0.5346333384513855, "learning_rate": 8e-05, "loss": 1.3865, "num_input_tokens_seen": 1392385600, "step": 9870 }, { "epoch": 0.6021498822973724, "grad_norm": 0.5209230780601501, "learning_rate": 8e-05, "loss": 1.3285, "num_input_tokens_seen": 1393782632, "step": 9880 }, { "epoch": 0.6027593457409933, "grad_norm": 0.5146684646606445, "learning_rate": 8e-05, "loss": 1.419, "num_input_tokens_seen": 1395203648, "step": 9890 }, { "epoch": 0.6033688091846141, "grad_norm": 0.49187934398651123, "learning_rate": 8e-05, "loss": 1.3834, "num_input_tokens_seen": 1396636232, "step": 9900 }, { "epoch": 0.6039782726282349, "grad_norm": 0.5156924724578857, "learning_rate": 8e-05, "loss": 1.3366, "num_input_tokens_seen": 1398070672, "step": 9910 }, { "epoch": 0.6045877360718558, "grad_norm": 0.5331023931503296, "learning_rate": 8e-05, "loss": 1.3713, "num_input_tokens_seen": 1399473452, "step": 9920 }, { "epoch": 0.6051971995154766, "grad_norm": 0.6120520234107971, "learning_rate": 8e-05, "loss": 1.4259, "num_input_tokens_seen": 1400826964, "step": 9930 }, { "epoch": 0.6058066629590974, "grad_norm": 0.4598010182380676, "learning_rate": 8e-05, "loss": 1.3581, "num_input_tokens_seen": 1402235776, "step": 9940 }, { "epoch": 0.6064161264027182, "grad_norm": 0.4815860688686371, "learning_rate": 8e-05, "loss": 1.3952, "num_input_tokens_seen": 1403663484, "step": 9950 }, { "epoch": 0.607025589846339, "grad_norm": 0.5356654524803162, "learning_rate": 8e-05, "loss": 1.4294, "num_input_tokens_seen": 1405093388, "step": 9960 }, { "epoch": 0.6076350532899598, "grad_norm": 0.4952469766139984, "learning_rate": 8e-05, "loss": 1.4177, "num_input_tokens_seen": 1406509716, "step": 9970 }, { "epoch": 0.6082445167335807, "grad_norm": 0.497787207365036, "learning_rate": 8e-05, "loss": 1.39, "num_input_tokens_seen": 1407883080, "step": 9980 }, { "epoch": 0.6088539801772015, "grad_norm": 0.41468948125839233, "learning_rate": 8e-05, "loss": 1.3927, "num_input_tokens_seen": 1409283056, "step": 9990 }, { "epoch": 0.6094634436208223, "grad_norm": 0.45570316910743713, "learning_rate": 8e-05, "loss": 1.289, "num_input_tokens_seen": 1410699192, "step": 10000 }, { "epoch": 0.6100729070644432, "grad_norm": 0.5052545666694641, "learning_rate": 8e-05, "loss": 1.3546, "num_input_tokens_seen": 1412106116, "step": 10010 }, { "epoch": 0.610682370508064, "grad_norm": 0.5244643688201904, "learning_rate": 8e-05, "loss": 1.3356, "num_input_tokens_seen": 1413547016, "step": 10020 }, { "epoch": 0.6112918339516847, "grad_norm": 0.5760179758071899, "learning_rate": 8e-05, "loss": 1.389, "num_input_tokens_seen": 1414973076, "step": 10030 }, { "epoch": 0.6119012973953056, "grad_norm": 0.5389821529388428, "learning_rate": 8e-05, "loss": 1.3859, "num_input_tokens_seen": 1416396436, "step": 10040 }, { "epoch": 0.6125107608389264, "grad_norm": 0.5720279812812805, "learning_rate": 8e-05, "loss": 1.349, "num_input_tokens_seen": 1417828128, "step": 10050 }, { "epoch": 0.6131202242825472, "grad_norm": 0.5402315258979797, "learning_rate": 8e-05, "loss": 1.4391, "num_input_tokens_seen": 1419232568, "step": 10060 }, { "epoch": 0.6137296877261681, "grad_norm": 0.5044508576393127, "learning_rate": 8e-05, "loss": 1.3909, "num_input_tokens_seen": 1420627952, "step": 10070 }, { "epoch": 0.6143391511697889, "grad_norm": 0.48339608311653137, "learning_rate": 8e-05, "loss": 1.3168, "num_input_tokens_seen": 1422007340, "step": 10080 }, { "epoch": 0.6149486146134098, "grad_norm": 0.4976171553134918, "learning_rate": 8e-05, "loss": 1.3754, "num_input_tokens_seen": 1423445552, "step": 10090 }, { "epoch": 0.6155580780570306, "grad_norm": 0.49609145522117615, "learning_rate": 8e-05, "loss": 1.3285, "num_input_tokens_seen": 1424851720, "step": 10100 }, { "epoch": 0.6161675415006513, "grad_norm": 0.5570063591003418, "learning_rate": 8e-05, "loss": 1.4115, "num_input_tokens_seen": 1426280464, "step": 10110 }, { "epoch": 0.6167770049442722, "grad_norm": 0.511796236038208, "learning_rate": 8e-05, "loss": 1.391, "num_input_tokens_seen": 1427688620, "step": 10120 }, { "epoch": 0.617386468387893, "grad_norm": 0.5466093420982361, "learning_rate": 8e-05, "loss": 1.3736, "num_input_tokens_seen": 1429118608, "step": 10130 }, { "epoch": 0.6179959318315138, "grad_norm": 0.5146467685699463, "learning_rate": 8e-05, "loss": 1.4094, "num_input_tokens_seen": 1430487804, "step": 10140 }, { "epoch": 0.6186053952751347, "grad_norm": 0.5151812434196472, "learning_rate": 8e-05, "loss": 1.3828, "num_input_tokens_seen": 1431876676, "step": 10150 }, { "epoch": 0.6192148587187555, "grad_norm": 0.5586668848991394, "learning_rate": 8e-05, "loss": 1.3261, "num_input_tokens_seen": 1433285472, "step": 10160 }, { "epoch": 0.6198243221623763, "grad_norm": 0.5817645788192749, "learning_rate": 8e-05, "loss": 1.4316, "num_input_tokens_seen": 1434693624, "step": 10170 }, { "epoch": 0.6204337856059972, "grad_norm": 0.6884422302246094, "learning_rate": 8e-05, "loss": 1.4434, "num_input_tokens_seen": 1436113752, "step": 10180 }, { "epoch": 0.6210432490496179, "grad_norm": 0.5640652179718018, "learning_rate": 8e-05, "loss": 1.3756, "num_input_tokens_seen": 1437519516, "step": 10190 }, { "epoch": 0.6216527124932387, "grad_norm": 0.49332916736602783, "learning_rate": 8e-05, "loss": 1.3701, "num_input_tokens_seen": 1438919604, "step": 10200 }, { "epoch": 0.6222621759368596, "grad_norm": 0.5096072554588318, "learning_rate": 8e-05, "loss": 1.3413, "num_input_tokens_seen": 1440347500, "step": 10210 }, { "epoch": 0.6228716393804804, "grad_norm": 0.44264650344848633, "learning_rate": 8e-05, "loss": 1.4, "num_input_tokens_seen": 1441772520, "step": 10220 }, { "epoch": 0.6234811028241012, "grad_norm": 0.4569202661514282, "learning_rate": 8e-05, "loss": 1.3895, "num_input_tokens_seen": 1443207380, "step": 10230 }, { "epoch": 0.6240905662677221, "grad_norm": 0.5005388855934143, "learning_rate": 8e-05, "loss": 1.3637, "num_input_tokens_seen": 1444575216, "step": 10240 }, { "epoch": 0.6247000297113429, "grad_norm": 0.5004163384437561, "learning_rate": 8e-05, "loss": 1.4176, "num_input_tokens_seen": 1445983064, "step": 10250 }, { "epoch": 0.6253094931549636, "grad_norm": 0.5014557242393494, "learning_rate": 8e-05, "loss": 1.3762, "num_input_tokens_seen": 1447412308, "step": 10260 }, { "epoch": 0.6259189565985845, "grad_norm": 0.5607008337974548, "learning_rate": 8e-05, "loss": 1.3564, "num_input_tokens_seen": 1448812476, "step": 10270 }, { "epoch": 0.6265284200422053, "grad_norm": 0.5385088324546814, "learning_rate": 8e-05, "loss": 1.324, "num_input_tokens_seen": 1450237216, "step": 10280 }, { "epoch": 0.6271378834858262, "grad_norm": 0.5262163281440735, "learning_rate": 8e-05, "loss": 1.4133, "num_input_tokens_seen": 1451658832, "step": 10290 }, { "epoch": 0.627747346929447, "grad_norm": 0.7675474286079407, "learning_rate": 8e-05, "loss": 1.3457, "num_input_tokens_seen": 1453044480, "step": 10300 }, { "epoch": 0.6283568103730678, "grad_norm": 0.48304829001426697, "learning_rate": 8e-05, "loss": 1.3758, "num_input_tokens_seen": 1454441984, "step": 10310 }, { "epoch": 0.6289662738166887, "grad_norm": 0.49265825748443604, "learning_rate": 8e-05, "loss": 1.3955, "num_input_tokens_seen": 1455862852, "step": 10320 }, { "epoch": 0.6295757372603095, "grad_norm": 0.4803888499736786, "learning_rate": 8e-05, "loss": 1.3637, "num_input_tokens_seen": 1457252176, "step": 10330 }, { "epoch": 0.6301852007039302, "grad_norm": 0.4618512690067291, "learning_rate": 8e-05, "loss": 1.3878, "num_input_tokens_seen": 1458690980, "step": 10340 }, { "epoch": 0.6307946641475511, "grad_norm": 0.486068457365036, "learning_rate": 8e-05, "loss": 1.3597, "num_input_tokens_seen": 1460126928, "step": 10350 }, { "epoch": 0.6314041275911719, "grad_norm": 0.482714980840683, "learning_rate": 8e-05, "loss": 1.3995, "num_input_tokens_seen": 1461522124, "step": 10360 }, { "epoch": 0.6320135910347927, "grad_norm": 0.5192288756370544, "learning_rate": 8e-05, "loss": 1.4576, "num_input_tokens_seen": 1462943012, "step": 10370 }, { "epoch": 0.6326230544784136, "grad_norm": 0.4682476818561554, "learning_rate": 8e-05, "loss": 1.3796, "num_input_tokens_seen": 1464333524, "step": 10380 }, { "epoch": 0.6332325179220344, "grad_norm": 0.5397130250930786, "learning_rate": 8e-05, "loss": 1.3822, "num_input_tokens_seen": 1465743076, "step": 10390 }, { "epoch": 0.6338419813656552, "grad_norm": 0.5656686425209045, "learning_rate": 8e-05, "loss": 1.3642, "num_input_tokens_seen": 1467142196, "step": 10400 }, { "epoch": 0.6344514448092761, "grad_norm": 0.6183952689170837, "learning_rate": 8e-05, "loss": 1.3855, "num_input_tokens_seen": 1468562740, "step": 10410 }, { "epoch": 0.6350609082528968, "grad_norm": 0.4923710227012634, "learning_rate": 8e-05, "loss": 1.3049, "num_input_tokens_seen": 1469960856, "step": 10420 }, { "epoch": 0.6356703716965176, "grad_norm": 0.6067305207252502, "learning_rate": 8e-05, "loss": 1.3892, "num_input_tokens_seen": 1471400408, "step": 10430 }, { "epoch": 0.6362798351401385, "grad_norm": 0.5619065761566162, "learning_rate": 8e-05, "loss": 1.4251, "num_input_tokens_seen": 1472802868, "step": 10440 }, { "epoch": 0.6368892985837593, "grad_norm": 0.5561144351959229, "learning_rate": 8e-05, "loss": 1.4278, "num_input_tokens_seen": 1474193064, "step": 10450 }, { "epoch": 0.6374987620273801, "grad_norm": 0.5016257762908936, "learning_rate": 8e-05, "loss": 1.3927, "num_input_tokens_seen": 1475612660, "step": 10460 }, { "epoch": 0.638108225471001, "grad_norm": 0.4828028380870819, "learning_rate": 8e-05, "loss": 1.3506, "num_input_tokens_seen": 1477059616, "step": 10470 }, { "epoch": 0.6387176889146218, "grad_norm": 0.7187060713768005, "learning_rate": 8e-05, "loss": 1.3665, "num_input_tokens_seen": 1478402568, "step": 10480 }, { "epoch": 0.6393271523582427, "grad_norm": 0.6509134769439697, "learning_rate": 8e-05, "loss": 1.4104, "num_input_tokens_seen": 1479782212, "step": 10490 }, { "epoch": 0.6399366158018635, "grad_norm": 0.5177718997001648, "learning_rate": 8e-05, "loss": 1.2858, "num_input_tokens_seen": 1481194272, "step": 10500 }, { "epoch": 0.6405460792454842, "grad_norm": 0.542962908744812, "learning_rate": 8e-05, "loss": 1.4324, "num_input_tokens_seen": 1482567368, "step": 10510 }, { "epoch": 0.6411555426891051, "grad_norm": 0.5583025217056274, "learning_rate": 8e-05, "loss": 1.3911, "num_input_tokens_seen": 1484014248, "step": 10520 }, { "epoch": 0.6417650061327259, "grad_norm": 0.4803900718688965, "learning_rate": 8e-05, "loss": 1.4556, "num_input_tokens_seen": 1485396060, "step": 10530 }, { "epoch": 0.6423744695763467, "grad_norm": 0.4976584017276764, "learning_rate": 8e-05, "loss": 1.3407, "num_input_tokens_seen": 1486803768, "step": 10540 }, { "epoch": 0.6429839330199676, "grad_norm": 0.6051338911056519, "learning_rate": 8e-05, "loss": 1.4611, "num_input_tokens_seen": 1488205756, "step": 10550 }, { "epoch": 0.6435933964635884, "grad_norm": 0.5262444615364075, "learning_rate": 8e-05, "loss": 1.3961, "num_input_tokens_seen": 1489615612, "step": 10560 }, { "epoch": 0.6442028599072092, "grad_norm": 0.484791100025177, "learning_rate": 8e-05, "loss": 1.4121, "num_input_tokens_seen": 1491011084, "step": 10570 }, { "epoch": 0.64481232335083, "grad_norm": 0.5317909717559814, "learning_rate": 8e-05, "loss": 1.3567, "num_input_tokens_seen": 1492429136, "step": 10580 }, { "epoch": 0.6454217867944508, "grad_norm": 0.5404983162879944, "learning_rate": 8e-05, "loss": 1.3642, "num_input_tokens_seen": 1493834740, "step": 10590 }, { "epoch": 0.6460312502380716, "grad_norm": 0.538868248462677, "learning_rate": 8e-05, "loss": 1.3155, "num_input_tokens_seen": 1495264904, "step": 10600 }, { "epoch": 0.6466407136816925, "grad_norm": 0.5658362507820129, "learning_rate": 8e-05, "loss": 1.3228, "num_input_tokens_seen": 1496659280, "step": 10610 }, { "epoch": 0.6472501771253133, "grad_norm": 0.5738557577133179, "learning_rate": 8e-05, "loss": 1.3978, "num_input_tokens_seen": 1498064828, "step": 10620 }, { "epoch": 0.6478596405689341, "grad_norm": 0.5166726112365723, "learning_rate": 8e-05, "loss": 1.4355, "num_input_tokens_seen": 1499487812, "step": 10630 }, { "epoch": 0.648469104012555, "grad_norm": 0.4428934156894684, "learning_rate": 8e-05, "loss": 1.3573, "num_input_tokens_seen": 1500892436, "step": 10640 }, { "epoch": 0.6490785674561758, "grad_norm": 0.559181809425354, "learning_rate": 8e-05, "loss": 1.4064, "num_input_tokens_seen": 1502337536, "step": 10650 }, { "epoch": 0.6496880308997965, "grad_norm": 0.46578314900398254, "learning_rate": 8e-05, "loss": 1.3296, "num_input_tokens_seen": 1503735980, "step": 10660 }, { "epoch": 0.6502974943434174, "grad_norm": 0.5593947768211365, "learning_rate": 8e-05, "loss": 1.444, "num_input_tokens_seen": 1505164384, "step": 10670 }, { "epoch": 0.6509069577870382, "grad_norm": 2.141988754272461, "learning_rate": 8e-05, "loss": 1.3533, "num_input_tokens_seen": 1506537700, "step": 10680 }, { "epoch": 0.651516421230659, "grad_norm": 0.6614646315574646, "learning_rate": 8e-05, "loss": 1.4131, "num_input_tokens_seen": 1507983352, "step": 10690 }, { "epoch": 0.6521258846742799, "grad_norm": 0.49016857147216797, "learning_rate": 8e-05, "loss": 1.389, "num_input_tokens_seen": 1509361728, "step": 10700 }, { "epoch": 0.6527353481179007, "grad_norm": 0.5343895554542542, "learning_rate": 8e-05, "loss": 1.4211, "num_input_tokens_seen": 1510712804, "step": 10710 }, { "epoch": 0.6533448115615216, "grad_norm": 0.44679155945777893, "learning_rate": 8e-05, "loss": 1.3696, "num_input_tokens_seen": 1512120068, "step": 10720 }, { "epoch": 0.6539542750051424, "grad_norm": 0.5805819630622864, "learning_rate": 8e-05, "loss": 1.3976, "num_input_tokens_seen": 1513546256, "step": 10730 }, { "epoch": 0.6545637384487631, "grad_norm": 0.5583277940750122, "learning_rate": 8e-05, "loss": 1.3012, "num_input_tokens_seen": 1514925448, "step": 10740 }, { "epoch": 0.655173201892384, "grad_norm": 0.49840471148490906, "learning_rate": 8e-05, "loss": 1.462, "num_input_tokens_seen": 1516367584, "step": 10750 }, { "epoch": 0.6557826653360048, "grad_norm": 0.4973022937774658, "learning_rate": 8e-05, "loss": 1.3537, "num_input_tokens_seen": 1517757988, "step": 10760 }, { "epoch": 0.6563921287796256, "grad_norm": 0.5231379866600037, "learning_rate": 8e-05, "loss": 1.3669, "num_input_tokens_seen": 1519164136, "step": 10770 }, { "epoch": 0.6570015922232465, "grad_norm": 0.5220736861228943, "learning_rate": 8e-05, "loss": 1.3633, "num_input_tokens_seen": 1520588564, "step": 10780 }, { "epoch": 0.6576110556668673, "grad_norm": 0.5354141592979431, "learning_rate": 8e-05, "loss": 1.4131, "num_input_tokens_seen": 1522010348, "step": 10790 }, { "epoch": 0.6582205191104881, "grad_norm": 0.5864176154136658, "learning_rate": 8e-05, "loss": 1.3681, "num_input_tokens_seen": 1523415004, "step": 10800 }, { "epoch": 0.658829982554109, "grad_norm": 0.5721861720085144, "learning_rate": 8e-05, "loss": 1.308, "num_input_tokens_seen": 1524815832, "step": 10810 }, { "epoch": 0.6594394459977297, "grad_norm": 0.6083350777626038, "learning_rate": 8e-05, "loss": 1.3567, "num_input_tokens_seen": 1526230472, "step": 10820 }, { "epoch": 0.6600489094413505, "grad_norm": 0.46110281348228455, "learning_rate": 8e-05, "loss": 1.4186, "num_input_tokens_seen": 1527644032, "step": 10830 }, { "epoch": 0.6606583728849714, "grad_norm": 0.5084540843963623, "learning_rate": 8e-05, "loss": 1.32, "num_input_tokens_seen": 1529057312, "step": 10840 }, { "epoch": 0.6612678363285922, "grad_norm": 0.5809466242790222, "learning_rate": 8e-05, "loss": 1.3788, "num_input_tokens_seen": 1530445328, "step": 10850 }, { "epoch": 0.661877299772213, "grad_norm": 0.46439307928085327, "learning_rate": 8e-05, "loss": 1.3833, "num_input_tokens_seen": 1531850316, "step": 10860 }, { "epoch": 0.6624867632158339, "grad_norm": 0.5628945231437683, "learning_rate": 8e-05, "loss": 1.3472, "num_input_tokens_seen": 1533252868, "step": 10870 }, { "epoch": 0.6630962266594547, "grad_norm": 0.6179889440536499, "learning_rate": 8e-05, "loss": 1.4127, "num_input_tokens_seen": 1534660880, "step": 10880 }, { "epoch": 0.6637056901030755, "grad_norm": 0.5281222462654114, "learning_rate": 8e-05, "loss": 1.4029, "num_input_tokens_seen": 1536068992, "step": 10890 }, { "epoch": 0.6643151535466963, "grad_norm": 0.5171144008636475, "learning_rate": 8e-05, "loss": 1.4088, "num_input_tokens_seen": 1537464796, "step": 10900 }, { "epoch": 0.6649246169903171, "grad_norm": 0.529052197933197, "learning_rate": 8e-05, "loss": 1.3975, "num_input_tokens_seen": 1538875160, "step": 10910 }, { "epoch": 0.665534080433938, "grad_norm": 0.5157914757728577, "learning_rate": 8e-05, "loss": 1.3578, "num_input_tokens_seen": 1540279084, "step": 10920 }, { "epoch": 0.6661435438775588, "grad_norm": 0.5008856058120728, "learning_rate": 8e-05, "loss": 1.3453, "num_input_tokens_seen": 1541714696, "step": 10930 }, { "epoch": 0.6667530073211796, "grad_norm": 0.45337000489234924, "learning_rate": 8e-05, "loss": 1.4766, "num_input_tokens_seen": 1543170740, "step": 10940 }, { "epoch": 0.6673624707648005, "grad_norm": 0.5083340406417847, "learning_rate": 8e-05, "loss": 1.4436, "num_input_tokens_seen": 1544592796, "step": 10950 }, { "epoch": 0.6679719342084213, "grad_norm": 0.48253244161605835, "learning_rate": 8e-05, "loss": 1.3303, "num_input_tokens_seen": 1545997384, "step": 10960 }, { "epoch": 0.668581397652042, "grad_norm": 0.4723127484321594, "learning_rate": 8e-05, "loss": 1.3656, "num_input_tokens_seen": 1547390872, "step": 10970 }, { "epoch": 0.669190861095663, "grad_norm": 0.539252758026123, "learning_rate": 8e-05, "loss": 1.3333, "num_input_tokens_seen": 1548803244, "step": 10980 }, { "epoch": 0.6698003245392837, "grad_norm": 0.5402015447616577, "learning_rate": 8e-05, "loss": 1.434, "num_input_tokens_seen": 1550175492, "step": 10990 }, { "epoch": 0.6704097879829045, "grad_norm": 0.6111288070678711, "learning_rate": 8e-05, "loss": 1.3002, "num_input_tokens_seen": 1551566972, "step": 11000 }, { "epoch": 0.6710192514265254, "grad_norm": 0.5599400997161865, "learning_rate": 8e-05, "loss": 1.3756, "num_input_tokens_seen": 1552990300, "step": 11010 }, { "epoch": 0.6716287148701462, "grad_norm": 0.4863987863063812, "learning_rate": 8e-05, "loss": 1.3602, "num_input_tokens_seen": 1554395040, "step": 11020 }, { "epoch": 0.672238178313767, "grad_norm": 0.5732718706130981, "learning_rate": 8e-05, "loss": 1.3012, "num_input_tokens_seen": 1555814488, "step": 11030 }, { "epoch": 0.6728476417573879, "grad_norm": 0.48249438405036926, "learning_rate": 8e-05, "loss": 1.3947, "num_input_tokens_seen": 1557230564, "step": 11040 }, { "epoch": 0.6734571052010087, "grad_norm": 0.5319753289222717, "learning_rate": 8e-05, "loss": 1.3969, "num_input_tokens_seen": 1558697588, "step": 11050 }, { "epoch": 0.6740665686446294, "grad_norm": 0.4940206706523895, "learning_rate": 8e-05, "loss": 1.3153, "num_input_tokens_seen": 1560096556, "step": 11060 }, { "epoch": 0.6746760320882503, "grad_norm": 0.5515936017036438, "learning_rate": 8e-05, "loss": 1.3674, "num_input_tokens_seen": 1561517816, "step": 11070 }, { "epoch": 0.6752854955318711, "grad_norm": 0.4390547573566437, "learning_rate": 8e-05, "loss": 1.3617, "num_input_tokens_seen": 1562934520, "step": 11080 }, { "epoch": 0.6758949589754919, "grad_norm": 0.5946500897407532, "learning_rate": 8e-05, "loss": 1.4425, "num_input_tokens_seen": 1564382372, "step": 11090 }, { "epoch": 0.6765044224191128, "grad_norm": 0.46804702281951904, "learning_rate": 8e-05, "loss": 1.295, "num_input_tokens_seen": 1565750404, "step": 11100 }, { "epoch": 0.6771138858627336, "grad_norm": 0.5131279826164246, "learning_rate": 8e-05, "loss": 1.4145, "num_input_tokens_seen": 1567168144, "step": 11110 }, { "epoch": 0.6777233493063545, "grad_norm": 0.5205957293510437, "learning_rate": 8e-05, "loss": 1.3669, "num_input_tokens_seen": 1568518060, "step": 11120 }, { "epoch": 0.6783328127499753, "grad_norm": 0.4875277876853943, "learning_rate": 8e-05, "loss": 1.4563, "num_input_tokens_seen": 1569937112, "step": 11130 }, { "epoch": 0.678942276193596, "grad_norm": 0.5259339809417725, "learning_rate": 8e-05, "loss": 1.3498, "num_input_tokens_seen": 1571385304, "step": 11140 }, { "epoch": 0.6795517396372169, "grad_norm": 0.48126786947250366, "learning_rate": 8e-05, "loss": 1.3787, "num_input_tokens_seen": 1572780332, "step": 11150 }, { "epoch": 0.6801612030808377, "grad_norm": 0.45843496918678284, "learning_rate": 8e-05, "loss": 1.3826, "num_input_tokens_seen": 1574176036, "step": 11160 }, { "epoch": 0.6807706665244585, "grad_norm": 0.5293328762054443, "learning_rate": 8e-05, "loss": 1.3625, "num_input_tokens_seen": 1575584344, "step": 11170 }, { "epoch": 0.6813801299680794, "grad_norm": 0.4788746237754822, "learning_rate": 8e-05, "loss": 1.401, "num_input_tokens_seen": 1577029088, "step": 11180 }, { "epoch": 0.6819895934117002, "grad_norm": 0.5486621260643005, "learning_rate": 8e-05, "loss": 1.4309, "num_input_tokens_seen": 1578437036, "step": 11190 }, { "epoch": 0.682599056855321, "grad_norm": 0.5115844011306763, "learning_rate": 8e-05, "loss": 1.417, "num_input_tokens_seen": 1579839792, "step": 11200 }, { "epoch": 0.6832085202989419, "grad_norm": 0.46927279233932495, "learning_rate": 8e-05, "loss": 1.3891, "num_input_tokens_seen": 1581258564, "step": 11210 }, { "epoch": 0.6838179837425626, "grad_norm": 0.4486519694328308, "learning_rate": 8e-05, "loss": 1.3672, "num_input_tokens_seen": 1582668792, "step": 11220 }, { "epoch": 0.6844274471861834, "grad_norm": 0.5005569458007812, "learning_rate": 8e-05, "loss": 1.421, "num_input_tokens_seen": 1584050876, "step": 11230 }, { "epoch": 0.6850369106298043, "grad_norm": 0.605232834815979, "learning_rate": 8e-05, "loss": 1.3457, "num_input_tokens_seen": 1585466308, "step": 11240 }, { "epoch": 0.6856463740734251, "grad_norm": 0.5172209739685059, "learning_rate": 8e-05, "loss": 1.3576, "num_input_tokens_seen": 1586888240, "step": 11250 }, { "epoch": 0.6862558375170459, "grad_norm": 0.47500714659690857, "learning_rate": 8e-05, "loss": 1.2991, "num_input_tokens_seen": 1588319356, "step": 11260 }, { "epoch": 0.6868653009606668, "grad_norm": 0.48329707980155945, "learning_rate": 8e-05, "loss": 1.3285, "num_input_tokens_seen": 1589717748, "step": 11270 }, { "epoch": 0.6874747644042876, "grad_norm": 0.5334200859069824, "learning_rate": 8e-05, "loss": 1.3861, "num_input_tokens_seen": 1591160404, "step": 11280 }, { "epoch": 0.6880842278479083, "grad_norm": 0.5618347525596619, "learning_rate": 8e-05, "loss": 1.3787, "num_input_tokens_seen": 1592539424, "step": 11290 }, { "epoch": 0.6886936912915292, "grad_norm": 0.47227922081947327, "learning_rate": 8e-05, "loss": 1.3481, "num_input_tokens_seen": 1593941668, "step": 11300 }, { "epoch": 0.68930315473515, "grad_norm": 0.4863712191581726, "learning_rate": 8e-05, "loss": 1.3482, "num_input_tokens_seen": 1595349516, "step": 11310 }, { "epoch": 0.6899126181787709, "grad_norm": 0.4943729341030121, "learning_rate": 8e-05, "loss": 1.3168, "num_input_tokens_seen": 1596768640, "step": 11320 }, { "epoch": 0.6905220816223917, "grad_norm": 0.5097691416740417, "learning_rate": 8e-05, "loss": 1.3259, "num_input_tokens_seen": 1598213372, "step": 11330 }, { "epoch": 0.6911315450660125, "grad_norm": 0.5228952169418335, "learning_rate": 8e-05, "loss": 1.4481, "num_input_tokens_seen": 1599627692, "step": 11340 }, { "epoch": 0.6917410085096334, "grad_norm": 0.4985556900501251, "learning_rate": 8e-05, "loss": 1.3566, "num_input_tokens_seen": 1601010164, "step": 11350 }, { "epoch": 0.6923504719532542, "grad_norm": 0.4366033673286438, "learning_rate": 8e-05, "loss": 1.27, "num_input_tokens_seen": 1602418184, "step": 11360 }, { "epoch": 0.692959935396875, "grad_norm": 0.43515148758888245, "learning_rate": 8e-05, "loss": 1.3676, "num_input_tokens_seen": 1603824332, "step": 11370 }, { "epoch": 0.6935693988404958, "grad_norm": 0.5262019634246826, "learning_rate": 8e-05, "loss": 1.3643, "num_input_tokens_seen": 1605236940, "step": 11380 }, { "epoch": 0.6941788622841166, "grad_norm": 0.5392087697982788, "learning_rate": 8e-05, "loss": 1.4025, "num_input_tokens_seen": 1606675864, "step": 11390 }, { "epoch": 0.6947883257277374, "grad_norm": 0.6022461652755737, "learning_rate": 8e-05, "loss": 1.408, "num_input_tokens_seen": 1608094576, "step": 11400 }, { "epoch": 0.6953977891713583, "grad_norm": 0.6072622537612915, "learning_rate": 8e-05, "loss": 1.3024, "num_input_tokens_seen": 1609523344, "step": 11410 }, { "epoch": 0.6960072526149791, "grad_norm": 0.5051203370094299, "learning_rate": 8e-05, "loss": 1.3671, "num_input_tokens_seen": 1610934128, "step": 11420 }, { "epoch": 0.6966167160585999, "grad_norm": 0.5395517349243164, "learning_rate": 8e-05, "loss": 1.3802, "num_input_tokens_seen": 1612402656, "step": 11430 }, { "epoch": 0.6972261795022208, "grad_norm": 0.5100081562995911, "learning_rate": 8e-05, "loss": 1.4357, "num_input_tokens_seen": 1613828080, "step": 11440 }, { "epoch": 0.6978356429458415, "grad_norm": 0.5012816190719604, "learning_rate": 8e-05, "loss": 1.3875, "num_input_tokens_seen": 1615257472, "step": 11450 }, { "epoch": 0.6984451063894623, "grad_norm": 0.5558944344520569, "learning_rate": 8e-05, "loss": 1.3962, "num_input_tokens_seen": 1616629904, "step": 11460 }, { "epoch": 0.6990545698330832, "grad_norm": 0.5226157307624817, "learning_rate": 8e-05, "loss": 1.3542, "num_input_tokens_seen": 1618030980, "step": 11470 }, { "epoch": 0.699664033276704, "grad_norm": 0.5259307622909546, "learning_rate": 8e-05, "loss": 1.3689, "num_input_tokens_seen": 1619450276, "step": 11480 }, { "epoch": 0.7002734967203248, "grad_norm": 0.5086653232574463, "learning_rate": 8e-05, "loss": 1.3484, "num_input_tokens_seen": 1620874576, "step": 11490 }, { "epoch": 0.7008829601639457, "grad_norm": 0.5540332198143005, "learning_rate": 8e-05, "loss": 1.3293, "num_input_tokens_seen": 1622283328, "step": 11500 }, { "epoch": 0.7014924236075665, "grad_norm": 0.5071319341659546, "learning_rate": 8e-05, "loss": 1.3199, "num_input_tokens_seen": 1623641908, "step": 11510 }, { "epoch": 0.7021018870511873, "grad_norm": 0.5358554124832153, "learning_rate": 8e-05, "loss": 1.4036, "num_input_tokens_seen": 1625041264, "step": 11520 }, { "epoch": 0.7027113504948082, "grad_norm": 0.5506737232208252, "learning_rate": 8e-05, "loss": 1.34, "num_input_tokens_seen": 1626404844, "step": 11530 }, { "epoch": 0.7033208139384289, "grad_norm": 0.44404277205467224, "learning_rate": 8e-05, "loss": 1.34, "num_input_tokens_seen": 1627863172, "step": 11540 }, { "epoch": 0.7039302773820498, "grad_norm": 0.46990063786506653, "learning_rate": 8e-05, "loss": 1.3722, "num_input_tokens_seen": 1629274832, "step": 11550 }, { "epoch": 0.7045397408256706, "grad_norm": 0.4636783301830292, "learning_rate": 8e-05, "loss": 1.3307, "num_input_tokens_seen": 1630697752, "step": 11560 }, { "epoch": 0.7051492042692914, "grad_norm": 0.43071427941322327, "learning_rate": 8e-05, "loss": 1.3574, "num_input_tokens_seen": 1632095920, "step": 11570 }, { "epoch": 0.7057586677129123, "grad_norm": 0.5207687616348267, "learning_rate": 8e-05, "loss": 1.3084, "num_input_tokens_seen": 1633509780, "step": 11580 }, { "epoch": 0.7063681311565331, "grad_norm": 0.4385841190814972, "learning_rate": 8e-05, "loss": 1.3712, "num_input_tokens_seen": 1634958136, "step": 11590 }, { "epoch": 0.7069775946001539, "grad_norm": 0.5395679473876953, "learning_rate": 8e-05, "loss": 1.3802, "num_input_tokens_seen": 1636385408, "step": 11600 }, { "epoch": 0.7075870580437748, "grad_norm": 0.605144202709198, "learning_rate": 8e-05, "loss": 1.3814, "num_input_tokens_seen": 1637775280, "step": 11610 }, { "epoch": 0.7081965214873955, "grad_norm": 0.46157678961753845, "learning_rate": 8e-05, "loss": 1.378, "num_input_tokens_seen": 1639212620, "step": 11620 }, { "epoch": 0.7088059849310163, "grad_norm": 0.4572867751121521, "learning_rate": 8e-05, "loss": 1.3809, "num_input_tokens_seen": 1640607368, "step": 11630 }, { "epoch": 0.7094154483746372, "grad_norm": 0.5079594254493713, "learning_rate": 8e-05, "loss": 1.2521, "num_input_tokens_seen": 1642027784, "step": 11640 }, { "epoch": 0.710024911818258, "grad_norm": 0.7235115766525269, "learning_rate": 8e-05, "loss": 1.3339, "num_input_tokens_seen": 1643408672, "step": 11650 }, { "epoch": 0.7106343752618788, "grad_norm": 0.459494411945343, "learning_rate": 8e-05, "loss": 1.3244, "num_input_tokens_seen": 1644816008, "step": 11660 }, { "epoch": 0.7112438387054997, "grad_norm": 0.4589853584766388, "learning_rate": 8e-05, "loss": 1.3231, "num_input_tokens_seen": 1646220640, "step": 11670 }, { "epoch": 0.7118533021491205, "grad_norm": 0.496324360370636, "learning_rate": 8e-05, "loss": 1.3495, "num_input_tokens_seen": 1647628668, "step": 11680 }, { "epoch": 0.7124627655927412, "grad_norm": 0.5813272595405579, "learning_rate": 8e-05, "loss": 1.3994, "num_input_tokens_seen": 1649031944, "step": 11690 }, { "epoch": 0.7130722290363621, "grad_norm": 0.528102695941925, "learning_rate": 8e-05, "loss": 1.4662, "num_input_tokens_seen": 1650445412, "step": 11700 }, { "epoch": 0.7136816924799829, "grad_norm": 0.5562940835952759, "learning_rate": 8e-05, "loss": 1.3603, "num_input_tokens_seen": 1651821844, "step": 11710 }, { "epoch": 0.7142911559236037, "grad_norm": 0.4890764057636261, "learning_rate": 8e-05, "loss": 1.3721, "num_input_tokens_seen": 1653199044, "step": 11720 }, { "epoch": 0.7149006193672246, "grad_norm": 0.5230799913406372, "learning_rate": 8e-05, "loss": 1.3574, "num_input_tokens_seen": 1654565212, "step": 11730 }, { "epoch": 0.7155100828108454, "grad_norm": 0.5011894106864929, "learning_rate": 8e-05, "loss": 1.3551, "num_input_tokens_seen": 1655952188, "step": 11740 }, { "epoch": 0.7161195462544663, "grad_norm": 0.48514172434806824, "learning_rate": 8e-05, "loss": 1.3334, "num_input_tokens_seen": 1657357812, "step": 11750 }, { "epoch": 0.7167290096980871, "grad_norm": 0.5416278839111328, "learning_rate": 8e-05, "loss": 1.3142, "num_input_tokens_seen": 1658753864, "step": 11760 }, { "epoch": 0.7173384731417078, "grad_norm": 0.5038822889328003, "learning_rate": 8e-05, "loss": 1.3044, "num_input_tokens_seen": 1660171056, "step": 11770 }, { "epoch": 0.7179479365853287, "grad_norm": 0.6000845432281494, "learning_rate": 8e-05, "loss": 1.3487, "num_input_tokens_seen": 1661578792, "step": 11780 }, { "epoch": 0.7185574000289495, "grad_norm": 0.4599187672138214, "learning_rate": 8e-05, "loss": 1.3169, "num_input_tokens_seen": 1662961256, "step": 11790 }, { "epoch": 0.7191668634725703, "grad_norm": 0.5955362915992737, "learning_rate": 8e-05, "loss": 1.4204, "num_input_tokens_seen": 1664363192, "step": 11800 }, { "epoch": 0.7197763269161912, "grad_norm": 0.5447018146514893, "learning_rate": 8e-05, "loss": 1.3821, "num_input_tokens_seen": 1665776148, "step": 11810 }, { "epoch": 0.720385790359812, "grad_norm": 0.49061569571495056, "learning_rate": 8e-05, "loss": 1.3661, "num_input_tokens_seen": 1667199220, "step": 11820 }, { "epoch": 0.7209952538034328, "grad_norm": 0.5659445524215698, "learning_rate": 8e-05, "loss": 1.3536, "num_input_tokens_seen": 1668633580, "step": 11830 }, { "epoch": 0.7216047172470537, "grad_norm": 0.44457441568374634, "learning_rate": 8e-05, "loss": 1.3176, "num_input_tokens_seen": 1670048000, "step": 11840 }, { "epoch": 0.7222141806906744, "grad_norm": 0.5029119849205017, "learning_rate": 8e-05, "loss": 1.4107, "num_input_tokens_seen": 1671464340, "step": 11850 }, { "epoch": 0.7228236441342952, "grad_norm": 0.4228610694408417, "learning_rate": 8e-05, "loss": 1.4419, "num_input_tokens_seen": 1672855328, "step": 11860 }, { "epoch": 0.7234331075779161, "grad_norm": 0.5061122179031372, "learning_rate": 8e-05, "loss": 1.3356, "num_input_tokens_seen": 1674291744, "step": 11870 }, { "epoch": 0.7240425710215369, "grad_norm": 0.4675373136997223, "learning_rate": 8e-05, "loss": 1.4002, "num_input_tokens_seen": 1675717044, "step": 11880 }, { "epoch": 0.7246520344651577, "grad_norm": 0.40757638216018677, "learning_rate": 8e-05, "loss": 1.3313, "num_input_tokens_seen": 1677101540, "step": 11890 }, { "epoch": 0.7252614979087786, "grad_norm": 0.5157292485237122, "learning_rate": 8e-05, "loss": 1.4145, "num_input_tokens_seen": 1678515036, "step": 11900 }, { "epoch": 0.7258709613523994, "grad_norm": 0.6204097270965576, "learning_rate": 8e-05, "loss": 1.3039, "num_input_tokens_seen": 1679921248, "step": 11910 }, { "epoch": 0.7264804247960202, "grad_norm": 0.44126880168914795, "learning_rate": 8e-05, "loss": 1.3353, "num_input_tokens_seen": 1681306344, "step": 11920 }, { "epoch": 0.727089888239641, "grad_norm": 0.6292856931686401, "learning_rate": 8e-05, "loss": 1.3355, "num_input_tokens_seen": 1682719392, "step": 11930 }, { "epoch": 0.7276993516832618, "grad_norm": 0.4729043245315552, "learning_rate": 8e-05, "loss": 1.2812, "num_input_tokens_seen": 1684071768, "step": 11940 }, { "epoch": 0.7283088151268827, "grad_norm": 0.484418660402298, "learning_rate": 8e-05, "loss": 1.3574, "num_input_tokens_seen": 1685486372, "step": 11950 }, { "epoch": 0.7289182785705035, "grad_norm": 0.48124608397483826, "learning_rate": 8e-05, "loss": 1.3325, "num_input_tokens_seen": 1686917640, "step": 11960 }, { "epoch": 0.7295277420141243, "grad_norm": 0.5151872634887695, "learning_rate": 8e-05, "loss": 1.3391, "num_input_tokens_seen": 1688341640, "step": 11970 }, { "epoch": 0.7301372054577452, "grad_norm": 0.5516366958618164, "learning_rate": 8e-05, "loss": 1.3006, "num_input_tokens_seen": 1689765020, "step": 11980 }, { "epoch": 0.730746668901366, "grad_norm": 0.5208513140678406, "learning_rate": 8e-05, "loss": 1.3503, "num_input_tokens_seen": 1691179912, "step": 11990 }, { "epoch": 0.7313561323449868, "grad_norm": 0.5036225914955139, "learning_rate": 8e-05, "loss": 1.3422, "num_input_tokens_seen": 1692541944, "step": 12000 }, { "epoch": 0.7319655957886076, "grad_norm": 0.44400182366371155, "learning_rate": 8e-05, "loss": 1.3675, "num_input_tokens_seen": 1693937504, "step": 12010 }, { "epoch": 0.7325750592322284, "grad_norm": 0.4907507598400116, "learning_rate": 8e-05, "loss": 1.3248, "num_input_tokens_seen": 1695314680, "step": 12020 }, { "epoch": 0.7331845226758492, "grad_norm": 0.5090814828872681, "learning_rate": 8e-05, "loss": 1.3548, "num_input_tokens_seen": 1696725392, "step": 12030 }, { "epoch": 0.7337939861194701, "grad_norm": 0.5077126622200012, "learning_rate": 8e-05, "loss": 1.3663, "num_input_tokens_seen": 1698124780, "step": 12040 }, { "epoch": 0.7344034495630909, "grad_norm": 0.5142135620117188, "learning_rate": 8e-05, "loss": 1.3235, "num_input_tokens_seen": 1699533240, "step": 12050 }, { "epoch": 0.7350129130067117, "grad_norm": 0.4585704207420349, "learning_rate": 8e-05, "loss": 1.3834, "num_input_tokens_seen": 1700921096, "step": 12060 }, { "epoch": 0.7356223764503326, "grad_norm": 0.6136194467544556, "learning_rate": 8e-05, "loss": 1.3458, "num_input_tokens_seen": 1702319564, "step": 12070 }, { "epoch": 0.7362318398939534, "grad_norm": 0.5828477740287781, "learning_rate": 8e-05, "loss": 1.3506, "num_input_tokens_seen": 1703697712, "step": 12080 }, { "epoch": 0.7368413033375741, "grad_norm": 0.5217992663383484, "learning_rate": 8e-05, "loss": 1.3363, "num_input_tokens_seen": 1705110788, "step": 12090 }, { "epoch": 0.737450766781195, "grad_norm": 0.5703504681587219, "learning_rate": 8e-05, "loss": 1.3916, "num_input_tokens_seen": 1706529936, "step": 12100 }, { "epoch": 0.7380602302248158, "grad_norm": 0.49754780530929565, "learning_rate": 8e-05, "loss": 1.3259, "num_input_tokens_seen": 1707949384, "step": 12110 }, { "epoch": 0.7386696936684366, "grad_norm": 0.46623051166534424, "learning_rate": 8e-05, "loss": 1.3169, "num_input_tokens_seen": 1709343816, "step": 12120 }, { "epoch": 0.7392791571120575, "grad_norm": 0.4529217481613159, "learning_rate": 8e-05, "loss": 1.3445, "num_input_tokens_seen": 1710727904, "step": 12130 }, { "epoch": 0.7398886205556783, "grad_norm": 0.5647047758102417, "learning_rate": 8e-05, "loss": 1.359, "num_input_tokens_seen": 1712104092, "step": 12140 }, { "epoch": 0.7404980839992991, "grad_norm": 0.4240126609802246, "learning_rate": 8e-05, "loss": 1.3276, "num_input_tokens_seen": 1713494984, "step": 12150 }, { "epoch": 0.74110754744292, "grad_norm": 0.4749116003513336, "learning_rate": 8e-05, "loss": 1.3626, "num_input_tokens_seen": 1714923848, "step": 12160 }, { "epoch": 0.7417170108865407, "grad_norm": 0.4912160038948059, "learning_rate": 8e-05, "loss": 1.3342, "num_input_tokens_seen": 1716289192, "step": 12170 }, { "epoch": 0.7423264743301616, "grad_norm": 0.47284796833992004, "learning_rate": 8e-05, "loss": 1.4014, "num_input_tokens_seen": 1717724568, "step": 12180 }, { "epoch": 0.7429359377737824, "grad_norm": 0.520675539970398, "learning_rate": 8e-05, "loss": 1.3207, "num_input_tokens_seen": 1719167576, "step": 12190 }, { "epoch": 0.7435454012174032, "grad_norm": 0.48047661781311035, "learning_rate": 8e-05, "loss": 1.288, "num_input_tokens_seen": 1720576612, "step": 12200 }, { "epoch": 0.7441548646610241, "grad_norm": 0.49878841638565063, "learning_rate": 8e-05, "loss": 1.3228, "num_input_tokens_seen": 1721972364, "step": 12210 }, { "epoch": 0.7447643281046449, "grad_norm": 0.4979226589202881, "learning_rate": 8e-05, "loss": 1.2814, "num_input_tokens_seen": 1723431204, "step": 12220 }, { "epoch": 0.7453737915482657, "grad_norm": 0.5070583820343018, "learning_rate": 8e-05, "loss": 1.3283, "num_input_tokens_seen": 1724849792, "step": 12230 }, { "epoch": 0.7459832549918866, "grad_norm": 0.4756496250629425, "learning_rate": 8e-05, "loss": 1.2717, "num_input_tokens_seen": 1726305880, "step": 12240 }, { "epoch": 0.7465927184355073, "grad_norm": 0.6402299404144287, "learning_rate": 8e-05, "loss": 1.3142, "num_input_tokens_seen": 1727716648, "step": 12250 }, { "epoch": 0.7472021818791281, "grad_norm": 0.5206665396690369, "learning_rate": 8e-05, "loss": 1.2944, "num_input_tokens_seen": 1729132756, "step": 12260 }, { "epoch": 0.747811645322749, "grad_norm": 0.5346994996070862, "learning_rate": 8e-05, "loss": 1.3813, "num_input_tokens_seen": 1730565924, "step": 12270 }, { "epoch": 0.7484211087663698, "grad_norm": 0.6076343059539795, "learning_rate": 8e-05, "loss": 1.3387, "num_input_tokens_seen": 1731972796, "step": 12280 }, { "epoch": 0.7490305722099906, "grad_norm": 0.5422996878623962, "learning_rate": 8e-05, "loss": 1.3192, "num_input_tokens_seen": 1733365796, "step": 12290 }, { "epoch": 0.7496400356536115, "grad_norm": 0.5899409055709839, "learning_rate": 8e-05, "loss": 1.3258, "num_input_tokens_seen": 1734719368, "step": 12300 }, { "epoch": 0.7502494990972323, "grad_norm": 0.4870734214782715, "learning_rate": 8e-05, "loss": 1.3172, "num_input_tokens_seen": 1736156068, "step": 12310 }, { "epoch": 0.750858962540853, "grad_norm": 0.44740596413612366, "learning_rate": 8e-05, "loss": 1.3336, "num_input_tokens_seen": 1737561880, "step": 12320 }, { "epoch": 0.7514684259844739, "grad_norm": 0.5884395241737366, "learning_rate": 8e-05, "loss": 1.3935, "num_input_tokens_seen": 1738988796, "step": 12330 }, { "epoch": 0.7520778894280947, "grad_norm": 0.5457208156585693, "learning_rate": 8e-05, "loss": 1.338, "num_input_tokens_seen": 1740413288, "step": 12340 }, { "epoch": 0.7526873528717155, "grad_norm": 0.4471927881240845, "learning_rate": 8e-05, "loss": 1.3132, "num_input_tokens_seen": 1741852120, "step": 12350 }, { "epoch": 0.7532968163153364, "grad_norm": 0.5655962228775024, "learning_rate": 8e-05, "loss": 1.265, "num_input_tokens_seen": 1743221408, "step": 12360 }, { "epoch": 0.7539062797589572, "grad_norm": 0.5733407735824585, "learning_rate": 8e-05, "loss": 1.3406, "num_input_tokens_seen": 1744609012, "step": 12370 }, { "epoch": 0.7545157432025781, "grad_norm": 0.5545304417610168, "learning_rate": 8e-05, "loss": 1.411, "num_input_tokens_seen": 1746007428, "step": 12380 }, { "epoch": 0.7551252066461989, "grad_norm": 0.5492582321166992, "learning_rate": 8e-05, "loss": 1.3644, "num_input_tokens_seen": 1747422652, "step": 12390 }, { "epoch": 0.7557346700898196, "grad_norm": 0.5435560345649719, "learning_rate": 8e-05, "loss": 1.37, "num_input_tokens_seen": 1748808948, "step": 12400 }, { "epoch": 0.7563441335334405, "grad_norm": 0.5100287795066833, "learning_rate": 8e-05, "loss": 1.3401, "num_input_tokens_seen": 1750211396, "step": 12410 }, { "epoch": 0.7569535969770613, "grad_norm": 0.5885925889015198, "learning_rate": 8e-05, "loss": 1.2552, "num_input_tokens_seen": 1751641492, "step": 12420 }, { "epoch": 0.7575630604206821, "grad_norm": 0.5710425972938538, "learning_rate": 8e-05, "loss": 1.3211, "num_input_tokens_seen": 1753033724, "step": 12430 }, { "epoch": 0.758172523864303, "grad_norm": 0.4631437659263611, "learning_rate": 8e-05, "loss": 1.3819, "num_input_tokens_seen": 1754426200, "step": 12440 }, { "epoch": 0.7587819873079238, "grad_norm": 0.5151511430740356, "learning_rate": 8e-05, "loss": 1.2802, "num_input_tokens_seen": 1755811048, "step": 12450 }, { "epoch": 0.7593914507515446, "grad_norm": 0.5333474278450012, "learning_rate": 8e-05, "loss": 1.3127, "num_input_tokens_seen": 1757194620, "step": 12460 }, { "epoch": 0.7600009141951655, "grad_norm": 0.49914056062698364, "learning_rate": 8e-05, "loss": 1.3185, "num_input_tokens_seen": 1758622592, "step": 12470 }, { "epoch": 0.7606103776387863, "grad_norm": 0.41091620922088623, "learning_rate": 8e-05, "loss": 1.2696, "num_input_tokens_seen": 1759999208, "step": 12480 }, { "epoch": 0.761219841082407, "grad_norm": 0.5769081115722656, "learning_rate": 8e-05, "loss": 1.3431, "num_input_tokens_seen": 1761407552, "step": 12490 }, { "epoch": 0.7618293045260279, "grad_norm": 0.5206863880157471, "learning_rate": 8e-05, "loss": 1.3317, "num_input_tokens_seen": 1762800588, "step": 12500 }, { "epoch": 0.7624387679696487, "grad_norm": 0.5158757567405701, "learning_rate": 8e-05, "loss": 1.3526, "num_input_tokens_seen": 1764187880, "step": 12510 }, { "epoch": 0.7630482314132695, "grad_norm": 0.5738706588745117, "learning_rate": 8e-05, "loss": 1.3989, "num_input_tokens_seen": 1765586032, "step": 12520 }, { "epoch": 0.7636576948568904, "grad_norm": 0.6022568941116333, "learning_rate": 8e-05, "loss": 1.2905, "num_input_tokens_seen": 1767057448, "step": 12530 }, { "epoch": 0.7642671583005112, "grad_norm": 0.5038744211196899, "learning_rate": 8e-05, "loss": 1.3278, "num_input_tokens_seen": 1768443564, "step": 12540 }, { "epoch": 0.764876621744132, "grad_norm": 0.5244400501251221, "learning_rate": 8e-05, "loss": 1.3228, "num_input_tokens_seen": 1769849304, "step": 12550 }, { "epoch": 0.7654860851877529, "grad_norm": 0.5055208802223206, "learning_rate": 8e-05, "loss": 1.3335, "num_input_tokens_seen": 1771248796, "step": 12560 }, { "epoch": 0.7660955486313736, "grad_norm": 0.48623979091644287, "learning_rate": 8e-05, "loss": 1.3765, "num_input_tokens_seen": 1772625404, "step": 12570 }, { "epoch": 0.7667050120749945, "grad_norm": 0.5019470453262329, "learning_rate": 8e-05, "loss": 1.3249, "num_input_tokens_seen": 1774037708, "step": 12580 }, { "epoch": 0.7673144755186153, "grad_norm": 0.5067726373672485, "learning_rate": 8e-05, "loss": 1.3194, "num_input_tokens_seen": 1775423152, "step": 12590 }, { "epoch": 0.7679239389622361, "grad_norm": 0.4977276027202606, "learning_rate": 8e-05, "loss": 1.3447, "num_input_tokens_seen": 1776840448, "step": 12600 }, { "epoch": 0.768533402405857, "grad_norm": 0.5764220356941223, "learning_rate": 8e-05, "loss": 1.3149, "num_input_tokens_seen": 1778260160, "step": 12610 }, { "epoch": 0.7691428658494778, "grad_norm": 0.5561099648475647, "learning_rate": 8e-05, "loss": 1.416, "num_input_tokens_seen": 1779627344, "step": 12620 }, { "epoch": 0.7697523292930986, "grad_norm": 0.4993090331554413, "learning_rate": 8e-05, "loss": 1.4345, "num_input_tokens_seen": 1781020624, "step": 12630 }, { "epoch": 0.7703617927367195, "grad_norm": 0.537917971611023, "learning_rate": 8e-05, "loss": 1.2816, "num_input_tokens_seen": 1782457444, "step": 12640 }, { "epoch": 0.7709712561803402, "grad_norm": 0.5599529147148132, "learning_rate": 8e-05, "loss": 1.3272, "num_input_tokens_seen": 1783910788, "step": 12650 }, { "epoch": 0.771580719623961, "grad_norm": 0.523169755935669, "learning_rate": 8e-05, "loss": 1.3633, "num_input_tokens_seen": 1785320612, "step": 12660 }, { "epoch": 0.7721901830675819, "grad_norm": 0.5435786247253418, "learning_rate": 8e-05, "loss": 1.2906, "num_input_tokens_seen": 1786698656, "step": 12670 }, { "epoch": 0.7727996465112027, "grad_norm": 0.489886999130249, "learning_rate": 8e-05, "loss": 1.365, "num_input_tokens_seen": 1788067732, "step": 12680 }, { "epoch": 0.7734091099548235, "grad_norm": 0.43583112955093384, "learning_rate": 8e-05, "loss": 1.2961, "num_input_tokens_seen": 1789457292, "step": 12690 }, { "epoch": 0.7740185733984444, "grad_norm": 0.4963041841983795, "learning_rate": 8e-05, "loss": 1.3571, "num_input_tokens_seen": 1790862480, "step": 12700 }, { "epoch": 0.7746280368420652, "grad_norm": 0.5106602907180786, "learning_rate": 8e-05, "loss": 1.4132, "num_input_tokens_seen": 1792265412, "step": 12710 }, { "epoch": 0.7752375002856859, "grad_norm": 0.47308048605918884, "learning_rate": 8e-05, "loss": 1.2768, "num_input_tokens_seen": 1793681304, "step": 12720 }, { "epoch": 0.7758469637293068, "grad_norm": 0.5513538718223572, "learning_rate": 8e-05, "loss": 1.4489, "num_input_tokens_seen": 1795090636, "step": 12730 }, { "epoch": 0.7764564271729276, "grad_norm": 0.5152673125267029, "learning_rate": 8e-05, "loss": 1.3806, "num_input_tokens_seen": 1796505112, "step": 12740 }, { "epoch": 0.7770658906165484, "grad_norm": 0.5017542243003845, "learning_rate": 8e-05, "loss": 1.2905, "num_input_tokens_seen": 1797913208, "step": 12750 }, { "epoch": 0.7776753540601693, "grad_norm": 0.513664186000824, "learning_rate": 8e-05, "loss": 1.4124, "num_input_tokens_seen": 1799342844, "step": 12760 }, { "epoch": 0.7782848175037901, "grad_norm": 0.47624465823173523, "learning_rate": 8e-05, "loss": 1.3233, "num_input_tokens_seen": 1800772036, "step": 12770 }, { "epoch": 0.778894280947411, "grad_norm": 0.5275976061820984, "learning_rate": 8e-05, "loss": 1.3211, "num_input_tokens_seen": 1802188180, "step": 12780 }, { "epoch": 0.7795037443910318, "grad_norm": 0.5230554938316345, "learning_rate": 8e-05, "loss": 1.3797, "num_input_tokens_seen": 1803600956, "step": 12790 }, { "epoch": 0.7801132078346525, "grad_norm": 0.6262668371200562, "learning_rate": 8e-05, "loss": 1.285, "num_input_tokens_seen": 1805012032, "step": 12800 }, { "epoch": 0.7807226712782734, "grad_norm": 0.5023617148399353, "learning_rate": 8e-05, "loss": 1.3513, "num_input_tokens_seen": 1806435284, "step": 12810 }, { "epoch": 0.7813321347218942, "grad_norm": 0.7828114032745361, "learning_rate": 8e-05, "loss": 1.3308, "num_input_tokens_seen": 1807822968, "step": 12820 }, { "epoch": 0.781941598165515, "grad_norm": 0.4700995981693268, "learning_rate": 8e-05, "loss": 1.4061, "num_input_tokens_seen": 1809275984, "step": 12830 }, { "epoch": 0.7825510616091359, "grad_norm": 0.48993438482284546, "learning_rate": 8e-05, "loss": 1.2856, "num_input_tokens_seen": 1810719656, "step": 12840 }, { "epoch": 0.7831605250527567, "grad_norm": 0.5454090237617493, "learning_rate": 8e-05, "loss": 1.3035, "num_input_tokens_seen": 1812148332, "step": 12850 }, { "epoch": 0.7837699884963775, "grad_norm": 0.4457705318927765, "learning_rate": 8e-05, "loss": 1.3257, "num_input_tokens_seen": 1813587512, "step": 12860 }, { "epoch": 0.7843794519399984, "grad_norm": 0.5165936350822449, "learning_rate": 8e-05, "loss": 1.3082, "num_input_tokens_seen": 1814998812, "step": 12870 }, { "epoch": 0.7849889153836191, "grad_norm": 0.5486757755279541, "learning_rate": 8e-05, "loss": 1.3164, "num_input_tokens_seen": 1816431448, "step": 12880 }, { "epoch": 0.7855983788272399, "grad_norm": 0.5352789163589478, "learning_rate": 8e-05, "loss": 1.333, "num_input_tokens_seen": 1817905740, "step": 12890 }, { "epoch": 0.7862078422708608, "grad_norm": 0.5381478071212769, "learning_rate": 8e-05, "loss": 1.3422, "num_input_tokens_seen": 1819312544, "step": 12900 }, { "epoch": 0.7868173057144816, "grad_norm": 0.4825122058391571, "learning_rate": 8e-05, "loss": 1.3764, "num_input_tokens_seen": 1820735580, "step": 12910 }, { "epoch": 0.7874267691581024, "grad_norm": 0.49656012654304504, "learning_rate": 8e-05, "loss": 1.3513, "num_input_tokens_seen": 1822160700, "step": 12920 }, { "epoch": 0.7880362326017233, "grad_norm": 0.4445992112159729, "learning_rate": 8e-05, "loss": 1.2595, "num_input_tokens_seen": 1823572968, "step": 12930 }, { "epoch": 0.7886456960453441, "grad_norm": 0.5262532234191895, "learning_rate": 8e-05, "loss": 1.3655, "num_input_tokens_seen": 1824957320, "step": 12940 }, { "epoch": 0.7892551594889649, "grad_norm": 0.5006152987480164, "learning_rate": 8e-05, "loss": 1.3366, "num_input_tokens_seen": 1826389520, "step": 12950 }, { "epoch": 0.7898646229325857, "grad_norm": 0.5019990801811218, "learning_rate": 8e-05, "loss": 1.3276, "num_input_tokens_seen": 1827851336, "step": 12960 }, { "epoch": 0.7904740863762065, "grad_norm": 0.5868325233459473, "learning_rate": 8e-05, "loss": 1.2845, "num_input_tokens_seen": 1829273684, "step": 12970 }, { "epoch": 0.7910835498198273, "grad_norm": 0.5358790159225464, "learning_rate": 8e-05, "loss": 1.2664, "num_input_tokens_seen": 1830681708, "step": 12980 }, { "epoch": 0.7916930132634482, "grad_norm": 0.5080293416976929, "learning_rate": 8e-05, "loss": 1.3323, "num_input_tokens_seen": 1832051744, "step": 12990 }, { "epoch": 0.792302476707069, "grad_norm": 0.48632749915122986, "learning_rate": 8e-05, "loss": 1.3402, "num_input_tokens_seen": 1833451224, "step": 13000 }, { "epoch": 0.7929119401506899, "grad_norm": 0.5741437673568726, "learning_rate": 8e-05, "loss": 1.3162, "num_input_tokens_seen": 1834868992, "step": 13010 }, { "epoch": 0.7935214035943107, "grad_norm": 0.5048441886901855, "learning_rate": 8e-05, "loss": 1.4296, "num_input_tokens_seen": 1836321756, "step": 13020 }, { "epoch": 0.7941308670379315, "grad_norm": 0.5186116695404053, "learning_rate": 8e-05, "loss": 1.3066, "num_input_tokens_seen": 1837704368, "step": 13030 }, { "epoch": 0.7947403304815523, "grad_norm": 0.5407278537750244, "learning_rate": 8e-05, "loss": 1.3587, "num_input_tokens_seen": 1839125744, "step": 13040 }, { "epoch": 0.7953497939251731, "grad_norm": 0.5114259719848633, "learning_rate": 8e-05, "loss": 1.2733, "num_input_tokens_seen": 1840565796, "step": 13050 }, { "epoch": 0.7959592573687939, "grad_norm": 0.5528411269187927, "learning_rate": 8e-05, "loss": 1.2579, "num_input_tokens_seen": 1841969076, "step": 13060 }, { "epoch": 0.7965687208124148, "grad_norm": 0.4976850748062134, "learning_rate": 8e-05, "loss": 1.3238, "num_input_tokens_seen": 1843405600, "step": 13070 }, { "epoch": 0.7971781842560356, "grad_norm": 0.48887088894844055, "learning_rate": 8e-05, "loss": 1.322, "num_input_tokens_seen": 1844856608, "step": 13080 }, { "epoch": 0.7977876476996564, "grad_norm": 0.44426125288009644, "learning_rate": 8e-05, "loss": 1.2833, "num_input_tokens_seen": 1846275428, "step": 13090 }, { "epoch": 0.7983971111432773, "grad_norm": 0.5178174376487732, "learning_rate": 8e-05, "loss": 1.3407, "num_input_tokens_seen": 1847671452, "step": 13100 }, { "epoch": 0.799006574586898, "grad_norm": 0.445492148399353, "learning_rate": 8e-05, "loss": 1.3597, "num_input_tokens_seen": 1849090208, "step": 13110 }, { "epoch": 0.7996160380305188, "grad_norm": 0.547702968120575, "learning_rate": 8e-05, "loss": 1.3509, "num_input_tokens_seen": 1850510744, "step": 13120 }, { "epoch": 0.8002255014741397, "grad_norm": 0.531181275844574, "learning_rate": 8e-05, "loss": 1.2971, "num_input_tokens_seen": 1851915908, "step": 13130 }, { "epoch": 0.8008349649177605, "grad_norm": 0.5193353891372681, "learning_rate": 8e-05, "loss": 1.2707, "num_input_tokens_seen": 1853305048, "step": 13140 }, { "epoch": 0.8014444283613813, "grad_norm": 0.5197256207466125, "learning_rate": 8e-05, "loss": 1.3619, "num_input_tokens_seen": 1854735100, "step": 13150 }, { "epoch": 0.8020538918050022, "grad_norm": 0.5376043319702148, "learning_rate": 8e-05, "loss": 1.2955, "num_input_tokens_seen": 1856147500, "step": 13160 }, { "epoch": 0.802663355248623, "grad_norm": 0.45468467473983765, "learning_rate": 8e-05, "loss": 1.375, "num_input_tokens_seen": 1857560596, "step": 13170 }, { "epoch": 0.8032728186922438, "grad_norm": 0.49229443073272705, "learning_rate": 8e-05, "loss": 1.3398, "num_input_tokens_seen": 1858960492, "step": 13180 }, { "epoch": 0.8038822821358647, "grad_norm": 0.49927592277526855, "learning_rate": 8e-05, "loss": 1.2409, "num_input_tokens_seen": 1860333604, "step": 13190 }, { "epoch": 0.8044917455794854, "grad_norm": 0.533245861530304, "learning_rate": 8e-05, "loss": 1.4103, "num_input_tokens_seen": 1861704208, "step": 13200 }, { "epoch": 0.8051012090231063, "grad_norm": 0.4365921914577484, "learning_rate": 8e-05, "loss": 1.3687, "num_input_tokens_seen": 1863099408, "step": 13210 }, { "epoch": 0.8057106724667271, "grad_norm": 0.48126909136772156, "learning_rate": 8e-05, "loss": 1.4127, "num_input_tokens_seen": 1864469736, "step": 13220 }, { "epoch": 0.8063201359103479, "grad_norm": 0.5094852447509766, "learning_rate": 8e-05, "loss": 1.3602, "num_input_tokens_seen": 1865862852, "step": 13230 }, { "epoch": 0.8069295993539688, "grad_norm": 0.45473966002464294, "learning_rate": 8e-05, "loss": 1.3174, "num_input_tokens_seen": 1867276496, "step": 13240 }, { "epoch": 0.8075390627975896, "grad_norm": 0.465964674949646, "learning_rate": 8e-05, "loss": 1.3023, "num_input_tokens_seen": 1868676544, "step": 13250 }, { "epoch": 0.8081485262412104, "grad_norm": 0.4686562716960907, "learning_rate": 8e-05, "loss": 1.3148, "num_input_tokens_seen": 1870089728, "step": 13260 }, { "epoch": 0.8087579896848313, "grad_norm": 0.510066568851471, "learning_rate": 8e-05, "loss": 1.3571, "num_input_tokens_seen": 1871505476, "step": 13270 }, { "epoch": 0.809367453128452, "grad_norm": 0.5353869199752808, "learning_rate": 8e-05, "loss": 1.3631, "num_input_tokens_seen": 1872945652, "step": 13280 }, { "epoch": 0.8099769165720728, "grad_norm": 0.46690231561660767, "learning_rate": 8e-05, "loss": 1.4069, "num_input_tokens_seen": 1874334148, "step": 13290 }, { "epoch": 0.8105863800156937, "grad_norm": 0.46225330233573914, "learning_rate": 8e-05, "loss": 1.2693, "num_input_tokens_seen": 1875742380, "step": 13300 }, { "epoch": 0.8111958434593145, "grad_norm": 0.39511632919311523, "learning_rate": 8e-05, "loss": 1.4481, "num_input_tokens_seen": 1877145116, "step": 13310 }, { "epoch": 0.8118053069029353, "grad_norm": 0.53965163230896, "learning_rate": 8e-05, "loss": 1.3699, "num_input_tokens_seen": 1878564336, "step": 13320 }, { "epoch": 0.8124147703465562, "grad_norm": 0.47026219964027405, "learning_rate": 8e-05, "loss": 1.2953, "num_input_tokens_seen": 1879957336, "step": 13330 }, { "epoch": 0.813024233790177, "grad_norm": 0.45782703161239624, "learning_rate": 8e-05, "loss": 1.3148, "num_input_tokens_seen": 1881375308, "step": 13340 }, { "epoch": 0.8136336972337977, "grad_norm": 0.5887371301651001, "learning_rate": 8e-05, "loss": 1.3117, "num_input_tokens_seen": 1882757276, "step": 13350 }, { "epoch": 0.8142431606774186, "grad_norm": 0.46690821647644043, "learning_rate": 8e-05, "loss": 1.3411, "num_input_tokens_seen": 1884181132, "step": 13360 }, { "epoch": 0.8148526241210394, "grad_norm": 0.44570302963256836, "learning_rate": 8e-05, "loss": 1.3642, "num_input_tokens_seen": 1885627092, "step": 13370 }, { "epoch": 0.8154620875646602, "grad_norm": 0.46674537658691406, "learning_rate": 8e-05, "loss": 1.3329, "num_input_tokens_seen": 1887042028, "step": 13380 }, { "epoch": 0.8160715510082811, "grad_norm": 0.4996930956840515, "learning_rate": 8e-05, "loss": 1.3414, "num_input_tokens_seen": 1888444844, "step": 13390 }, { "epoch": 0.8166810144519019, "grad_norm": 0.5288018584251404, "learning_rate": 8e-05, "loss": 1.3824, "num_input_tokens_seen": 1889802940, "step": 13400 }, { "epoch": 0.8172904778955228, "grad_norm": 0.5384576320648193, "learning_rate": 8e-05, "loss": 1.2916, "num_input_tokens_seen": 1891232692, "step": 13410 }, { "epoch": 0.8178999413391436, "grad_norm": 0.5390682816505432, "learning_rate": 8e-05, "loss": 1.334, "num_input_tokens_seen": 1892682584, "step": 13420 }, { "epoch": 0.8185094047827643, "grad_norm": 0.4713698923587799, "learning_rate": 8e-05, "loss": 1.3697, "num_input_tokens_seen": 1894098956, "step": 13430 }, { "epoch": 0.8191188682263852, "grad_norm": 0.4790286421775818, "learning_rate": 8e-05, "loss": 1.3651, "num_input_tokens_seen": 1895517364, "step": 13440 }, { "epoch": 0.819728331670006, "grad_norm": 0.5080155730247498, "learning_rate": 8e-05, "loss": 1.3155, "num_input_tokens_seen": 1896935040, "step": 13450 }, { "epoch": 0.8203377951136268, "grad_norm": 0.4799495339393616, "learning_rate": 8e-05, "loss": 1.3478, "num_input_tokens_seen": 1898327544, "step": 13460 }, { "epoch": 0.8209472585572477, "grad_norm": 0.5344340205192566, "learning_rate": 8e-05, "loss": 1.3948, "num_input_tokens_seen": 1899741048, "step": 13470 }, { "epoch": 0.8215567220008685, "grad_norm": 0.5373334288597107, "learning_rate": 8e-05, "loss": 1.376, "num_input_tokens_seen": 1901150512, "step": 13480 }, { "epoch": 0.8221661854444893, "grad_norm": 0.511161208152771, "learning_rate": 8e-05, "loss": 1.333, "num_input_tokens_seen": 1902523144, "step": 13490 }, { "epoch": 0.8227756488881102, "grad_norm": 0.4798104763031006, "learning_rate": 8e-05, "loss": 1.4043, "num_input_tokens_seen": 1903981052, "step": 13500 }, { "epoch": 0.823385112331731, "grad_norm": 0.5256845355033875, "learning_rate": 8e-05, "loss": 1.2564, "num_input_tokens_seen": 1905352460, "step": 13510 }, { "epoch": 0.8239945757753517, "grad_norm": 0.43500033020973206, "learning_rate": 8e-05, "loss": 1.3265, "num_input_tokens_seen": 1906776468, "step": 13520 }, { "epoch": 0.8246040392189726, "grad_norm": 0.47271063923835754, "learning_rate": 8e-05, "loss": 1.3805, "num_input_tokens_seen": 1908155040, "step": 13530 }, { "epoch": 0.8252135026625934, "grad_norm": 0.5768705606460571, "learning_rate": 8e-05, "loss": 1.3352, "num_input_tokens_seen": 1909572824, "step": 13540 }, { "epoch": 0.8258229661062142, "grad_norm": 0.496417373418808, "learning_rate": 8e-05, "loss": 1.33, "num_input_tokens_seen": 1911018176, "step": 13550 }, { "epoch": 0.8264324295498351, "grad_norm": 0.4653494954109192, "learning_rate": 8e-05, "loss": 1.3169, "num_input_tokens_seen": 1912427256, "step": 13560 }, { "epoch": 0.8270418929934559, "grad_norm": 0.4730968475341797, "learning_rate": 8e-05, "loss": 1.3272, "num_input_tokens_seen": 1913827684, "step": 13570 }, { "epoch": 0.8276513564370767, "grad_norm": 0.4840553104877472, "learning_rate": 8e-05, "loss": 1.3281, "num_input_tokens_seen": 1915245556, "step": 13580 }, { "epoch": 0.8282608198806976, "grad_norm": 0.4903997480869293, "learning_rate": 8e-05, "loss": 1.297, "num_input_tokens_seen": 1916646964, "step": 13590 }, { "epoch": 0.8288702833243183, "grad_norm": 0.539023756980896, "learning_rate": 8e-05, "loss": 1.3459, "num_input_tokens_seen": 1918123372, "step": 13600 }, { "epoch": 0.8294797467679392, "grad_norm": 0.5554112195968628, "learning_rate": 8e-05, "loss": 1.296, "num_input_tokens_seen": 1919534892, "step": 13610 }, { "epoch": 0.83008921021156, "grad_norm": 0.4873456656932831, "learning_rate": 8e-05, "loss": 1.3225, "num_input_tokens_seen": 1920943776, "step": 13620 }, { "epoch": 0.8306986736551808, "grad_norm": 0.5194123387336731, "learning_rate": 8e-05, "loss": 1.3441, "num_input_tokens_seen": 1922312644, "step": 13630 }, { "epoch": 0.8313081370988017, "grad_norm": 0.476192444562912, "learning_rate": 8e-05, "loss": 1.3575, "num_input_tokens_seen": 1923732804, "step": 13640 }, { "epoch": 0.8319176005424225, "grad_norm": 0.540345311164856, "learning_rate": 8e-05, "loss": 1.3457, "num_input_tokens_seen": 1925149948, "step": 13650 }, { "epoch": 0.8325270639860433, "grad_norm": 0.5137238502502441, "learning_rate": 8e-05, "loss": 1.3691, "num_input_tokens_seen": 1926573700, "step": 13660 }, { "epoch": 0.8331365274296642, "grad_norm": 0.5790938138961792, "learning_rate": 8e-05, "loss": 1.3639, "num_input_tokens_seen": 1927971992, "step": 13670 }, { "epoch": 0.8337459908732849, "grad_norm": 0.509369432926178, "learning_rate": 8e-05, "loss": 1.2713, "num_input_tokens_seen": 1929379708, "step": 13680 }, { "epoch": 0.8343554543169057, "grad_norm": 0.523574948310852, "learning_rate": 8e-05, "loss": 1.3607, "num_input_tokens_seen": 1930737992, "step": 13690 }, { "epoch": 0.8349649177605266, "grad_norm": 0.5223262310028076, "learning_rate": 8e-05, "loss": 1.34, "num_input_tokens_seen": 1932126924, "step": 13700 }, { "epoch": 0.8355743812041474, "grad_norm": 0.5009987354278564, "learning_rate": 8e-05, "loss": 1.3189, "num_input_tokens_seen": 1933541832, "step": 13710 }, { "epoch": 0.8361838446477682, "grad_norm": 0.4302278459072113, "learning_rate": 8e-05, "loss": 1.2678, "num_input_tokens_seen": 1934971524, "step": 13720 }, { "epoch": 0.8367933080913891, "grad_norm": 0.5610336661338806, "learning_rate": 8e-05, "loss": 1.3011, "num_input_tokens_seen": 1936364524, "step": 13730 }, { "epoch": 0.8374027715350099, "grad_norm": 0.5906192660331726, "learning_rate": 8e-05, "loss": 1.3344, "num_input_tokens_seen": 1937758596, "step": 13740 }, { "epoch": 0.8380122349786306, "grad_norm": 0.5197456479072571, "learning_rate": 8e-05, "loss": 1.4107, "num_input_tokens_seen": 1939192700, "step": 13750 }, { "epoch": 0.8386216984222515, "grad_norm": 0.5945485830307007, "learning_rate": 8e-05, "loss": 1.31, "num_input_tokens_seen": 1940593480, "step": 13760 }, { "epoch": 0.8392311618658723, "grad_norm": 0.4964558184146881, "learning_rate": 8e-05, "loss": 1.3275, "num_input_tokens_seen": 1941979224, "step": 13770 }, { "epoch": 0.8398406253094931, "grad_norm": 0.540803074836731, "learning_rate": 8e-05, "loss": 1.3242, "num_input_tokens_seen": 1943362060, "step": 13780 }, { "epoch": 0.840450088753114, "grad_norm": 0.5920431017875671, "learning_rate": 8e-05, "loss": 1.2768, "num_input_tokens_seen": 1944773336, "step": 13790 }, { "epoch": 0.8410595521967348, "grad_norm": 0.5019993185997009, "learning_rate": 8e-05, "loss": 1.3502, "num_input_tokens_seen": 1946151796, "step": 13800 }, { "epoch": 0.8416690156403556, "grad_norm": 0.44807812571525574, "learning_rate": 8e-05, "loss": 1.2786, "num_input_tokens_seen": 1947532968, "step": 13810 }, { "epoch": 0.8422784790839765, "grad_norm": 0.4444776773452759, "learning_rate": 8e-05, "loss": 1.2909, "num_input_tokens_seen": 1948944796, "step": 13820 }, { "epoch": 0.8428879425275972, "grad_norm": 0.5267406702041626, "learning_rate": 8e-05, "loss": 1.4207, "num_input_tokens_seen": 1950341252, "step": 13830 }, { "epoch": 0.8434974059712181, "grad_norm": 0.5206403732299805, "learning_rate": 8e-05, "loss": 1.3525, "num_input_tokens_seen": 1951752800, "step": 13840 }, { "epoch": 0.8441068694148389, "grad_norm": 0.4875728487968445, "learning_rate": 8e-05, "loss": 1.2995, "num_input_tokens_seen": 1953159488, "step": 13850 }, { "epoch": 0.8447163328584597, "grad_norm": 0.49969223141670227, "learning_rate": 8e-05, "loss": 1.3496, "num_input_tokens_seen": 1954546568, "step": 13860 }, { "epoch": 0.8453257963020806, "grad_norm": 0.44394856691360474, "learning_rate": 8e-05, "loss": 1.3643, "num_input_tokens_seen": 1955979156, "step": 13870 }, { "epoch": 0.8459352597457014, "grad_norm": 0.4543069303035736, "learning_rate": 8e-05, "loss": 1.2678, "num_input_tokens_seen": 1957369568, "step": 13880 }, { "epoch": 0.8465447231893222, "grad_norm": 0.5392143726348877, "learning_rate": 8e-05, "loss": 1.2553, "num_input_tokens_seen": 1958773732, "step": 13890 }, { "epoch": 0.8471541866329431, "grad_norm": 0.48405200242996216, "learning_rate": 8e-05, "loss": 1.3331, "num_input_tokens_seen": 1960203416, "step": 13900 }, { "epoch": 0.8477636500765638, "grad_norm": 0.44527801871299744, "learning_rate": 8e-05, "loss": 1.3401, "num_input_tokens_seen": 1961626884, "step": 13910 }, { "epoch": 0.8483731135201846, "grad_norm": 0.5325603485107422, "learning_rate": 8e-05, "loss": 1.3624, "num_input_tokens_seen": 1963044460, "step": 13920 }, { "epoch": 0.8489825769638055, "grad_norm": 0.531408429145813, "learning_rate": 8e-05, "loss": 1.345, "num_input_tokens_seen": 1964494136, "step": 13930 }, { "epoch": 0.8495920404074263, "grad_norm": 0.49144402146339417, "learning_rate": 8e-05, "loss": 1.3735, "num_input_tokens_seen": 1965888548, "step": 13940 }, { "epoch": 0.8502015038510471, "grad_norm": 0.5098019242286682, "learning_rate": 8e-05, "loss": 1.3549, "num_input_tokens_seen": 1967290036, "step": 13950 }, { "epoch": 0.850810967294668, "grad_norm": 0.5794479250907898, "learning_rate": 8e-05, "loss": 1.3227, "num_input_tokens_seen": 1968705020, "step": 13960 }, { "epoch": 0.8514204307382888, "grad_norm": 0.4568016529083252, "learning_rate": 8e-05, "loss": 1.26, "num_input_tokens_seen": 1970114156, "step": 13970 }, { "epoch": 0.8520298941819096, "grad_norm": 0.5260335206985474, "learning_rate": 8e-05, "loss": 1.356, "num_input_tokens_seen": 1971510264, "step": 13980 }, { "epoch": 0.8526393576255304, "grad_norm": 0.45923447608947754, "learning_rate": 8e-05, "loss": 1.3072, "num_input_tokens_seen": 1972916944, "step": 13990 }, { "epoch": 0.8532488210691512, "grad_norm": 0.6013538837432861, "learning_rate": 8e-05, "loss": 1.3758, "num_input_tokens_seen": 1974342624, "step": 14000 }, { "epoch": 0.853858284512772, "grad_norm": 0.5479147434234619, "learning_rate": 8e-05, "loss": 1.2949, "num_input_tokens_seen": 1975743100, "step": 14010 }, { "epoch": 0.8544677479563929, "grad_norm": 0.47358494997024536, "learning_rate": 8e-05, "loss": 1.3132, "num_input_tokens_seen": 1977148904, "step": 14020 }, { "epoch": 0.8550772114000137, "grad_norm": 0.5430836081504822, "learning_rate": 8e-05, "loss": 1.3641, "num_input_tokens_seen": 1978557708, "step": 14030 }, { "epoch": 0.8556866748436346, "grad_norm": 0.5587384700775146, "learning_rate": 8e-05, "loss": 1.2717, "num_input_tokens_seen": 1979948296, "step": 14040 }, { "epoch": 0.8562961382872554, "grad_norm": 0.5155408978462219, "learning_rate": 8e-05, "loss": 1.2849, "num_input_tokens_seen": 1981315776, "step": 14050 }, { "epoch": 0.8569056017308762, "grad_norm": 0.4776671528816223, "learning_rate": 8e-05, "loss": 1.3477, "num_input_tokens_seen": 1982710884, "step": 14060 }, { "epoch": 0.857515065174497, "grad_norm": 0.5615227222442627, "learning_rate": 8e-05, "loss": 1.3669, "num_input_tokens_seen": 1984128224, "step": 14070 }, { "epoch": 0.8581245286181178, "grad_norm": 0.5377513766288757, "learning_rate": 8e-05, "loss": 1.2847, "num_input_tokens_seen": 1985557920, "step": 14080 }, { "epoch": 0.8587339920617386, "grad_norm": 0.4939589202404022, "learning_rate": 8e-05, "loss": 1.3306, "num_input_tokens_seen": 1986944272, "step": 14090 }, { "epoch": 0.8593434555053595, "grad_norm": 0.5589674115180969, "learning_rate": 8e-05, "loss": 1.2835, "num_input_tokens_seen": 1988374288, "step": 14100 }, { "epoch": 0.8599529189489803, "grad_norm": 0.5040762424468994, "learning_rate": 8e-05, "loss": 1.2794, "num_input_tokens_seen": 1989791720, "step": 14110 }, { "epoch": 0.8605623823926011, "grad_norm": 0.40734902024269104, "learning_rate": 8e-05, "loss": 1.2372, "num_input_tokens_seen": 1991197028, "step": 14120 }, { "epoch": 0.861171845836222, "grad_norm": 0.49047330021858215, "learning_rate": 8e-05, "loss": 1.3511, "num_input_tokens_seen": 1992584332, "step": 14130 }, { "epoch": 0.8617813092798428, "grad_norm": 0.5605955719947815, "learning_rate": 8e-05, "loss": 1.2739, "num_input_tokens_seen": 1993990784, "step": 14140 }, { "epoch": 0.8623907727234635, "grad_norm": 0.4485302269458771, "learning_rate": 8e-05, "loss": 1.3499, "num_input_tokens_seen": 1995433372, "step": 14150 }, { "epoch": 0.8630002361670844, "grad_norm": 0.5024114847183228, "learning_rate": 8e-05, "loss": 1.3728, "num_input_tokens_seen": 1996822904, "step": 14160 }, { "epoch": 0.8636096996107052, "grad_norm": 0.5688835978507996, "learning_rate": 8e-05, "loss": 1.3265, "num_input_tokens_seen": 1998261028, "step": 14170 }, { "epoch": 0.864219163054326, "grad_norm": 0.5042539834976196, "learning_rate": 8e-05, "loss": 1.3042, "num_input_tokens_seen": 1999646144, "step": 14180 }, { "epoch": 0.8648286264979469, "grad_norm": 0.4236133098602295, "learning_rate": 8e-05, "loss": 1.3615, "num_input_tokens_seen": 2001077220, "step": 14190 }, { "epoch": 0.8654380899415677, "grad_norm": 0.5273798108100891, "learning_rate": 8e-05, "loss": 1.3866, "num_input_tokens_seen": 2002474020, "step": 14200 }, { "epoch": 0.8660475533851885, "grad_norm": 0.5134631991386414, "learning_rate": 8e-05, "loss": 1.3165, "num_input_tokens_seen": 2003842724, "step": 14210 }, { "epoch": 0.8666570168288094, "grad_norm": 0.6235056519508362, "learning_rate": 8e-05, "loss": 1.3191, "num_input_tokens_seen": 2005293516, "step": 14220 }, { "epoch": 0.8672664802724301, "grad_norm": 0.6013771295547485, "learning_rate": 8e-05, "loss": 1.2849, "num_input_tokens_seen": 2006708784, "step": 14230 }, { "epoch": 0.867875943716051, "grad_norm": 0.5487123131752014, "learning_rate": 8e-05, "loss": 1.2884, "num_input_tokens_seen": 2008066768, "step": 14240 }, { "epoch": 0.8684854071596718, "grad_norm": 0.5143485069274902, "learning_rate": 8e-05, "loss": 1.3233, "num_input_tokens_seen": 2009474988, "step": 14250 }, { "epoch": 0.8690948706032926, "grad_norm": 0.6184853911399841, "learning_rate": 8e-05, "loss": 1.406, "num_input_tokens_seen": 2010864984, "step": 14260 }, { "epoch": 0.8697043340469135, "grad_norm": 0.43288683891296387, "learning_rate": 8e-05, "loss": 1.3188, "num_input_tokens_seen": 2012273664, "step": 14270 }, { "epoch": 0.8703137974905343, "grad_norm": 0.5103833079338074, "learning_rate": 8e-05, "loss": 1.3006, "num_input_tokens_seen": 2013731756, "step": 14280 }, { "epoch": 0.8709232609341551, "grad_norm": 0.46680596470832825, "learning_rate": 8e-05, "loss": 1.306, "num_input_tokens_seen": 2015108620, "step": 14290 }, { "epoch": 0.871532724377776, "grad_norm": 0.46874338388442993, "learning_rate": 8e-05, "loss": 1.2844, "num_input_tokens_seen": 2016514404, "step": 14300 }, { "epoch": 0.8721421878213967, "grad_norm": 0.4423303008079529, "learning_rate": 8e-05, "loss": 1.3593, "num_input_tokens_seen": 2017955308, "step": 14310 }, { "epoch": 0.8727516512650175, "grad_norm": 0.507398784160614, "learning_rate": 8e-05, "loss": 1.2901, "num_input_tokens_seen": 2019400760, "step": 14320 }, { "epoch": 0.8733611147086384, "grad_norm": 0.46312689781188965, "learning_rate": 8e-05, "loss": 1.2778, "num_input_tokens_seen": 2020825292, "step": 14330 }, { "epoch": 0.8739705781522592, "grad_norm": 0.45070067048072815, "learning_rate": 8e-05, "loss": 1.2942, "num_input_tokens_seen": 2022226416, "step": 14340 }, { "epoch": 0.87458004159588, "grad_norm": 0.458053857088089, "learning_rate": 8e-05, "loss": 1.3677, "num_input_tokens_seen": 2023626112, "step": 14350 }, { "epoch": 0.8751895050395009, "grad_norm": 0.47325587272644043, "learning_rate": 8e-05, "loss": 1.3597, "num_input_tokens_seen": 2024990412, "step": 14360 }, { "epoch": 0.8757989684831217, "grad_norm": 0.5431790947914124, "learning_rate": 8e-05, "loss": 1.3117, "num_input_tokens_seen": 2026355792, "step": 14370 }, { "epoch": 0.8764084319267424, "grad_norm": 0.45192548632621765, "learning_rate": 8e-05, "loss": 1.3317, "num_input_tokens_seen": 2027764676, "step": 14380 }, { "epoch": 0.8770178953703633, "grad_norm": 0.541114866733551, "learning_rate": 8e-05, "loss": 1.2806, "num_input_tokens_seen": 2029153492, "step": 14390 }, { "epoch": 0.8776273588139841, "grad_norm": 0.4860060513019562, "learning_rate": 8e-05, "loss": 1.4211, "num_input_tokens_seen": 2030588204, "step": 14400 }, { "epoch": 0.8782368222576049, "grad_norm": 0.4681771695613861, "learning_rate": 8e-05, "loss": 1.3697, "num_input_tokens_seen": 2032011484, "step": 14410 }, { "epoch": 0.8788462857012258, "grad_norm": 0.4607865810394287, "learning_rate": 8e-05, "loss": 1.3697, "num_input_tokens_seen": 2033420856, "step": 14420 }, { "epoch": 0.8794557491448466, "grad_norm": 0.6460906863212585, "learning_rate": 8e-05, "loss": 1.3781, "num_input_tokens_seen": 2034852316, "step": 14430 }, { "epoch": 0.8800652125884675, "grad_norm": 0.5076463222503662, "learning_rate": 8e-05, "loss": 1.3139, "num_input_tokens_seen": 2036238376, "step": 14440 }, { "epoch": 0.8806746760320883, "grad_norm": 0.4806708097457886, "learning_rate": 8e-05, "loss": 1.3171, "num_input_tokens_seen": 2037656080, "step": 14450 }, { "epoch": 0.881284139475709, "grad_norm": 0.6310375332832336, "learning_rate": 8e-05, "loss": 1.2628, "num_input_tokens_seen": 2039101276, "step": 14460 }, { "epoch": 0.8818936029193299, "grad_norm": 0.5229341387748718, "learning_rate": 8e-05, "loss": 1.368, "num_input_tokens_seen": 2040493272, "step": 14470 }, { "epoch": 0.8825030663629507, "grad_norm": 0.5170985460281372, "learning_rate": 8e-05, "loss": 1.3502, "num_input_tokens_seen": 2041931884, "step": 14480 }, { "epoch": 0.8831125298065715, "grad_norm": 0.5730608105659485, "learning_rate": 8e-05, "loss": 1.3101, "num_input_tokens_seen": 2043339212, "step": 14490 }, { "epoch": 0.8837219932501924, "grad_norm": 0.4748365879058838, "learning_rate": 8e-05, "loss": 1.3242, "num_input_tokens_seen": 2044736972, "step": 14500 }, { "epoch": 0.8843314566938132, "grad_norm": 0.4900893270969391, "learning_rate": 8e-05, "loss": 1.2727, "num_input_tokens_seen": 2046111152, "step": 14510 }, { "epoch": 0.884940920137434, "grad_norm": 0.5116603374481201, "learning_rate": 8e-05, "loss": 1.298, "num_input_tokens_seen": 2047511888, "step": 14520 }, { "epoch": 0.8855503835810549, "grad_norm": 0.4778316617012024, "learning_rate": 8e-05, "loss": 1.2757, "num_input_tokens_seen": 2048942212, "step": 14530 }, { "epoch": 0.8861598470246757, "grad_norm": 0.4536020755767822, "learning_rate": 8e-05, "loss": 1.3739, "num_input_tokens_seen": 2050337400, "step": 14540 }, { "epoch": 0.8867693104682964, "grad_norm": 0.4740245044231415, "learning_rate": 8e-05, "loss": 1.2835, "num_input_tokens_seen": 2051768816, "step": 14550 }, { "epoch": 0.8873787739119173, "grad_norm": 0.4206888675689697, "learning_rate": 8e-05, "loss": 1.3625, "num_input_tokens_seen": 2053167920, "step": 14560 }, { "epoch": 0.8879882373555381, "grad_norm": 0.5243297815322876, "learning_rate": 8e-05, "loss": 1.338, "num_input_tokens_seen": 2054565312, "step": 14570 }, { "epoch": 0.8885977007991589, "grad_norm": 0.4735357165336609, "learning_rate": 8e-05, "loss": 1.3339, "num_input_tokens_seen": 2056014664, "step": 14580 }, { "epoch": 0.8892071642427798, "grad_norm": 0.43609538674354553, "learning_rate": 8e-05, "loss": 1.318, "num_input_tokens_seen": 2057450624, "step": 14590 }, { "epoch": 0.8898166276864006, "grad_norm": 0.5434485673904419, "learning_rate": 8e-05, "loss": 1.2862, "num_input_tokens_seen": 2058888776, "step": 14600 }, { "epoch": 0.8904260911300214, "grad_norm": 0.516051709651947, "learning_rate": 8e-05, "loss": 1.2762, "num_input_tokens_seen": 2060339988, "step": 14610 }, { "epoch": 0.8910355545736423, "grad_norm": 0.4696006178855896, "learning_rate": 8e-05, "loss": 1.3784, "num_input_tokens_seen": 2061765008, "step": 14620 }, { "epoch": 0.891645018017263, "grad_norm": 0.5371540188789368, "learning_rate": 8e-05, "loss": 1.2606, "num_input_tokens_seen": 2063161640, "step": 14630 }, { "epoch": 0.8922544814608838, "grad_norm": 0.463405042886734, "learning_rate": 8e-05, "loss": 1.3241, "num_input_tokens_seen": 2064557948, "step": 14640 }, { "epoch": 0.8928639449045047, "grad_norm": 0.48568493127822876, "learning_rate": 8e-05, "loss": 1.3389, "num_input_tokens_seen": 2065990172, "step": 14650 }, { "epoch": 0.8934734083481255, "grad_norm": 0.5302020311355591, "learning_rate": 8e-05, "loss": 1.3282, "num_input_tokens_seen": 2067415172, "step": 14660 }, { "epoch": 0.8940828717917464, "grad_norm": 0.4722622036933899, "learning_rate": 8e-05, "loss": 1.3124, "num_input_tokens_seen": 2068826280, "step": 14670 }, { "epoch": 0.8946923352353672, "grad_norm": 0.5686400532722473, "learning_rate": 8e-05, "loss": 1.2964, "num_input_tokens_seen": 2070285916, "step": 14680 }, { "epoch": 0.895301798678988, "grad_norm": 0.5032939314842224, "learning_rate": 8e-05, "loss": 1.3056, "num_input_tokens_seen": 2071687308, "step": 14690 }, { "epoch": 0.8959112621226089, "grad_norm": 0.43025267124176025, "learning_rate": 8e-05, "loss": 1.2774, "num_input_tokens_seen": 2073089308, "step": 14700 }, { "epoch": 0.8965207255662296, "grad_norm": 0.5931614637374878, "learning_rate": 8e-05, "loss": 1.2295, "num_input_tokens_seen": 2074487708, "step": 14710 }, { "epoch": 0.8971301890098504, "grad_norm": 0.5101529359817505, "learning_rate": 8e-05, "loss": 1.3418, "num_input_tokens_seen": 2075901036, "step": 14720 }, { "epoch": 0.8977396524534713, "grad_norm": 0.46563971042633057, "learning_rate": 8e-05, "loss": 1.2945, "num_input_tokens_seen": 2077316540, "step": 14730 }, { "epoch": 0.8983491158970921, "grad_norm": 0.535887598991394, "learning_rate": 8e-05, "loss": 1.3194, "num_input_tokens_seen": 2078729452, "step": 14740 }, { "epoch": 0.8989585793407129, "grad_norm": 0.5296595096588135, "learning_rate": 8e-05, "loss": 1.3476, "num_input_tokens_seen": 2080099128, "step": 14750 }, { "epoch": 0.8995680427843338, "grad_norm": 0.5821929574012756, "learning_rate": 8e-05, "loss": 1.3395, "num_input_tokens_seen": 2081541188, "step": 14760 }, { "epoch": 0.9001775062279546, "grad_norm": 0.5599145293235779, "learning_rate": 8e-05, "loss": 1.4025, "num_input_tokens_seen": 2082915756, "step": 14770 }, { "epoch": 0.9007869696715753, "grad_norm": 0.548435389995575, "learning_rate": 8e-05, "loss": 1.2736, "num_input_tokens_seen": 2084374868, "step": 14780 }, { "epoch": 0.9013964331151962, "grad_norm": 0.5481672286987305, "learning_rate": 8e-05, "loss": 1.3607, "num_input_tokens_seen": 2085774468, "step": 14790 }, { "epoch": 0.902005896558817, "grad_norm": 0.5384652018547058, "learning_rate": 8e-05, "loss": 1.2555, "num_input_tokens_seen": 2087183164, "step": 14800 }, { "epoch": 0.9026153600024378, "grad_norm": 0.546398937702179, "learning_rate": 8e-05, "loss": 1.3217, "num_input_tokens_seen": 2088598132, "step": 14810 }, { "epoch": 0.9032248234460587, "grad_norm": 0.4481549561023712, "learning_rate": 8e-05, "loss": 1.2731, "num_input_tokens_seen": 2089956332, "step": 14820 }, { "epoch": 0.9038342868896795, "grad_norm": 0.4861372709274292, "learning_rate": 8e-05, "loss": 1.3456, "num_input_tokens_seen": 2091342172, "step": 14830 }, { "epoch": 0.9044437503333003, "grad_norm": 0.5424238443374634, "learning_rate": 8e-05, "loss": 1.3, "num_input_tokens_seen": 2092764024, "step": 14840 }, { "epoch": 0.9050532137769212, "grad_norm": 0.4749182462692261, "learning_rate": 8e-05, "loss": 1.2833, "num_input_tokens_seen": 2094144480, "step": 14850 }, { "epoch": 0.905662677220542, "grad_norm": 0.4868445098400116, "learning_rate": 8e-05, "loss": 1.3079, "num_input_tokens_seen": 2095575144, "step": 14860 }, { "epoch": 0.9062721406641628, "grad_norm": 0.48875224590301514, "learning_rate": 8e-05, "loss": 1.3415, "num_input_tokens_seen": 2096968984, "step": 14870 }, { "epoch": 0.9068816041077836, "grad_norm": 0.6021813750267029, "learning_rate": 8e-05, "loss": 1.3031, "num_input_tokens_seen": 2098395432, "step": 14880 }, { "epoch": 0.9074910675514044, "grad_norm": 0.5280013084411621, "learning_rate": 8e-05, "loss": 1.2774, "num_input_tokens_seen": 2099787376, "step": 14890 }, { "epoch": 0.9081005309950253, "grad_norm": 0.44756680727005005, "learning_rate": 8e-05, "loss": 1.415, "num_input_tokens_seen": 2101178184, "step": 14900 }, { "epoch": 0.9087099944386461, "grad_norm": 0.5173446536064148, "learning_rate": 8e-05, "loss": 1.2678, "num_input_tokens_seen": 2102550216, "step": 14910 }, { "epoch": 0.9093194578822669, "grad_norm": 0.5057575106620789, "learning_rate": 8e-05, "loss": 1.2572, "num_input_tokens_seen": 2103991892, "step": 14920 }, { "epoch": 0.9099289213258878, "grad_norm": 0.4518465995788574, "learning_rate": 8e-05, "loss": 1.2135, "num_input_tokens_seen": 2105393196, "step": 14930 }, { "epoch": 0.9105383847695085, "grad_norm": 0.5331540107727051, "learning_rate": 8e-05, "loss": 1.225, "num_input_tokens_seen": 2106853740, "step": 14940 }, { "epoch": 0.9111478482131293, "grad_norm": 0.567085862159729, "learning_rate": 8e-05, "loss": 1.3343, "num_input_tokens_seen": 2108256916, "step": 14950 }, { "epoch": 0.9117573116567502, "grad_norm": 0.4467783570289612, "learning_rate": 8e-05, "loss": 1.3246, "num_input_tokens_seen": 2109676832, "step": 14960 }, { "epoch": 0.912366775100371, "grad_norm": 0.4943903684616089, "learning_rate": 8e-05, "loss": 1.3044, "num_input_tokens_seen": 2111041288, "step": 14970 }, { "epoch": 0.9129762385439918, "grad_norm": 0.5459105372428894, "learning_rate": 8e-05, "loss": 1.3449, "num_input_tokens_seen": 2112428456, "step": 14980 }, { "epoch": 0.9135857019876127, "grad_norm": 0.6656093597412109, "learning_rate": 8e-05, "loss": 1.3407, "num_input_tokens_seen": 2113839204, "step": 14990 }, { "epoch": 0.9141951654312335, "grad_norm": 0.5155654549598694, "learning_rate": 8e-05, "loss": 1.3348, "num_input_tokens_seen": 2115240548, "step": 15000 }, { "epoch": 0.9148046288748543, "grad_norm": 0.5123623013496399, "learning_rate": 8e-05, "loss": 1.2716, "num_input_tokens_seen": 2116649340, "step": 15010 }, { "epoch": 0.9154140923184751, "grad_norm": 0.46583035588264465, "learning_rate": 8e-05, "loss": 1.2999, "num_input_tokens_seen": 2118049608, "step": 15020 }, { "epoch": 0.9160235557620959, "grad_norm": 0.5250749588012695, "learning_rate": 8e-05, "loss": 1.3072, "num_input_tokens_seen": 2119496856, "step": 15030 }, { "epoch": 0.9166330192057167, "grad_norm": 0.4794974625110626, "learning_rate": 8e-05, "loss": 1.3763, "num_input_tokens_seen": 2120930156, "step": 15040 }, { "epoch": 0.9172424826493376, "grad_norm": 0.4998897314071655, "learning_rate": 8e-05, "loss": 1.3722, "num_input_tokens_seen": 2122320520, "step": 15050 }, { "epoch": 0.9178519460929584, "grad_norm": 0.4267031252384186, "learning_rate": 8e-05, "loss": 1.3729, "num_input_tokens_seen": 2123749292, "step": 15060 }, { "epoch": 0.9184614095365793, "grad_norm": 0.5466004610061646, "learning_rate": 8e-05, "loss": 1.2821, "num_input_tokens_seen": 2125150732, "step": 15070 }, { "epoch": 0.9190708729802001, "grad_norm": 0.4843430519104004, "learning_rate": 8e-05, "loss": 1.3095, "num_input_tokens_seen": 2126553680, "step": 15080 }, { "epoch": 0.9196803364238209, "grad_norm": 0.4891015887260437, "learning_rate": 8e-05, "loss": 1.2743, "num_input_tokens_seen": 2127954848, "step": 15090 }, { "epoch": 0.9202897998674417, "grad_norm": 0.4860542416572571, "learning_rate": 8e-05, "loss": 1.2681, "num_input_tokens_seen": 2129337336, "step": 15100 }, { "epoch": 0.9208992633110625, "grad_norm": 0.5063233375549316, "learning_rate": 8e-05, "loss": 1.2805, "num_input_tokens_seen": 2130776264, "step": 15110 }, { "epoch": 0.9215087267546833, "grad_norm": 0.513878345489502, "learning_rate": 8e-05, "loss": 1.3024, "num_input_tokens_seen": 2132183500, "step": 15120 }, { "epoch": 0.9221181901983042, "grad_norm": 0.6026753783226013, "learning_rate": 8e-05, "loss": 1.3195, "num_input_tokens_seen": 2133618296, "step": 15130 }, { "epoch": 0.922727653641925, "grad_norm": 0.6205632090568542, "learning_rate": 8e-05, "loss": 1.2973, "num_input_tokens_seen": 2135035140, "step": 15140 }, { "epoch": 0.9233371170855458, "grad_norm": 0.5202974677085876, "learning_rate": 8e-05, "loss": 1.3003, "num_input_tokens_seen": 2136445656, "step": 15150 }, { "epoch": 0.9239465805291667, "grad_norm": 0.4942576587200165, "learning_rate": 8e-05, "loss": 1.3261, "num_input_tokens_seen": 2137871872, "step": 15160 }, { "epoch": 0.9245560439727875, "grad_norm": 0.4791765511035919, "learning_rate": 8e-05, "loss": 1.3159, "num_input_tokens_seen": 2139281680, "step": 15170 }, { "epoch": 0.9251655074164082, "grad_norm": 0.5186665654182434, "learning_rate": 8e-05, "loss": 1.2736, "num_input_tokens_seen": 2140719180, "step": 15180 }, { "epoch": 0.9257749708600291, "grad_norm": 0.5142242312431335, "learning_rate": 8e-05, "loss": 1.2888, "num_input_tokens_seen": 2142122972, "step": 15190 }, { "epoch": 0.9263844343036499, "grad_norm": 0.4749925136566162, "learning_rate": 8e-05, "loss": 1.3422, "num_input_tokens_seen": 2143543944, "step": 15200 }, { "epoch": 0.9269938977472707, "grad_norm": 0.4524226784706116, "learning_rate": 8e-05, "loss": 1.3178, "num_input_tokens_seen": 2144948304, "step": 15210 }, { "epoch": 0.9276033611908916, "grad_norm": 0.5263559222221375, "learning_rate": 8e-05, "loss": 1.2454, "num_input_tokens_seen": 2146366520, "step": 15220 }, { "epoch": 0.9282128246345124, "grad_norm": 0.571079432964325, "learning_rate": 8e-05, "loss": 1.3552, "num_input_tokens_seen": 2147785372, "step": 15230 }, { "epoch": 0.9288222880781332, "grad_norm": 0.5034895539283752, "learning_rate": 8e-05, "loss": 1.325, "num_input_tokens_seen": 2149217276, "step": 15240 }, { "epoch": 0.9294317515217541, "grad_norm": 0.45907917618751526, "learning_rate": 8e-05, "loss": 1.2877, "num_input_tokens_seen": 2150613728, "step": 15250 }, { "epoch": 0.9300412149653748, "grad_norm": 0.4312871992588043, "learning_rate": 8e-05, "loss": 1.3095, "num_input_tokens_seen": 2151987964, "step": 15260 }, { "epoch": 0.9306506784089957, "grad_norm": 0.5046421885490417, "learning_rate": 8e-05, "loss": 1.3279, "num_input_tokens_seen": 2153424756, "step": 15270 }, { "epoch": 0.9312601418526165, "grad_norm": 0.5745195150375366, "learning_rate": 8e-05, "loss": 1.2899, "num_input_tokens_seen": 2154852656, "step": 15280 }, { "epoch": 0.9318696052962373, "grad_norm": 0.4521108865737915, "learning_rate": 8e-05, "loss": 1.2494, "num_input_tokens_seen": 2156242808, "step": 15290 }, { "epoch": 0.9324790687398582, "grad_norm": 0.43681252002716064, "learning_rate": 8e-05, "loss": 1.2328, "num_input_tokens_seen": 2157698388, "step": 15300 }, { "epoch": 0.933088532183479, "grad_norm": 0.43972909450531006, "learning_rate": 8e-05, "loss": 1.3337, "num_input_tokens_seen": 2159079792, "step": 15310 }, { "epoch": 0.9336979956270998, "grad_norm": 0.4632760286331177, "learning_rate": 8e-05, "loss": 1.2879, "num_input_tokens_seen": 2160506712, "step": 15320 }, { "epoch": 0.9343074590707207, "grad_norm": 0.4858049154281616, "learning_rate": 8e-05, "loss": 1.3715, "num_input_tokens_seen": 2161935332, "step": 15330 }, { "epoch": 0.9349169225143414, "grad_norm": 0.502547025680542, "learning_rate": 8e-05, "loss": 1.2783, "num_input_tokens_seen": 2163308116, "step": 15340 }, { "epoch": 0.9355263859579622, "grad_norm": 0.5862429141998291, "learning_rate": 8e-05, "loss": 1.3081, "num_input_tokens_seen": 2164680144, "step": 15350 }, { "epoch": 0.9361358494015831, "grad_norm": 0.5465579032897949, "learning_rate": 8e-05, "loss": 1.3302, "num_input_tokens_seen": 2166084000, "step": 15360 }, { "epoch": 0.9367453128452039, "grad_norm": 0.4558219909667969, "learning_rate": 8e-05, "loss": 1.3208, "num_input_tokens_seen": 2167539648, "step": 15370 }, { "epoch": 0.9373547762888247, "grad_norm": 0.4179956614971161, "learning_rate": 8e-05, "loss": 1.2822, "num_input_tokens_seen": 2168925220, "step": 15380 }, { "epoch": 0.9379642397324456, "grad_norm": 0.5408328175544739, "learning_rate": 8e-05, "loss": 1.311, "num_input_tokens_seen": 2170353320, "step": 15390 }, { "epoch": 0.9385737031760664, "grad_norm": 0.45750945806503296, "learning_rate": 8e-05, "loss": 1.3182, "num_input_tokens_seen": 2171767016, "step": 15400 }, { "epoch": 0.9391831666196871, "grad_norm": 0.5293141007423401, "learning_rate": 8e-05, "loss": 1.3439, "num_input_tokens_seen": 2173170916, "step": 15410 }, { "epoch": 0.939792630063308, "grad_norm": 0.4048777222633362, "learning_rate": 8e-05, "loss": 1.2898, "num_input_tokens_seen": 2174582080, "step": 15420 }, { "epoch": 0.9404020935069288, "grad_norm": 0.5126138925552368, "learning_rate": 8e-05, "loss": 1.3273, "num_input_tokens_seen": 2176029604, "step": 15430 }, { "epoch": 0.9410115569505496, "grad_norm": 0.5170230865478516, "learning_rate": 8e-05, "loss": 1.2763, "num_input_tokens_seen": 2177398412, "step": 15440 }, { "epoch": 0.9416210203941705, "grad_norm": 0.47761601209640503, "learning_rate": 8e-05, "loss": 1.3317, "num_input_tokens_seen": 2178777544, "step": 15450 }, { "epoch": 0.9422304838377913, "grad_norm": 0.4851064682006836, "learning_rate": 8e-05, "loss": 1.2528, "num_input_tokens_seen": 2180193028, "step": 15460 }, { "epoch": 0.9428399472814121, "grad_norm": 0.445537805557251, "learning_rate": 8e-05, "loss": 1.3249, "num_input_tokens_seen": 2181639340, "step": 15470 }, { "epoch": 0.943449410725033, "grad_norm": 0.4982694983482361, "learning_rate": 8e-05, "loss": 1.257, "num_input_tokens_seen": 2183099028, "step": 15480 }, { "epoch": 0.9440588741686537, "grad_norm": 0.4679562747478485, "learning_rate": 8e-05, "loss": 1.2946, "num_input_tokens_seen": 2184447260, "step": 15490 }, { "epoch": 0.9446683376122746, "grad_norm": 0.48683637380599976, "learning_rate": 8e-05, "loss": 1.3197, "num_input_tokens_seen": 2185818896, "step": 15500 }, { "epoch": 0.9452778010558954, "grad_norm": 0.6176589131355286, "learning_rate": 8e-05, "loss": 1.3705, "num_input_tokens_seen": 2187242616, "step": 15510 }, { "epoch": 0.9458872644995162, "grad_norm": 0.3919903635978699, "learning_rate": 8e-05, "loss": 1.2758, "num_input_tokens_seen": 2188635656, "step": 15520 }, { "epoch": 0.9464967279431371, "grad_norm": 0.48860692977905273, "learning_rate": 8e-05, "loss": 1.2734, "num_input_tokens_seen": 2190004480, "step": 15530 }, { "epoch": 0.9471061913867579, "grad_norm": 0.547088623046875, "learning_rate": 8e-05, "loss": 1.2956, "num_input_tokens_seen": 2191428232, "step": 15540 }, { "epoch": 0.9477156548303787, "grad_norm": 0.46737053990364075, "learning_rate": 8e-05, "loss": 1.3075, "num_input_tokens_seen": 2192866976, "step": 15550 }, { "epoch": 0.9483251182739996, "grad_norm": 0.5125867128372192, "learning_rate": 8e-05, "loss": 1.3354, "num_input_tokens_seen": 2194303344, "step": 15560 }, { "epoch": 0.9489345817176204, "grad_norm": 0.45754584670066833, "learning_rate": 8e-05, "loss": 1.3479, "num_input_tokens_seen": 2195684020, "step": 15570 }, { "epoch": 0.9495440451612411, "grad_norm": 0.4521014094352722, "learning_rate": 8e-05, "loss": 1.3423, "num_input_tokens_seen": 2197122540, "step": 15580 }, { "epoch": 0.950153508604862, "grad_norm": 0.44286853075027466, "learning_rate": 8e-05, "loss": 1.3658, "num_input_tokens_seen": 2198552600, "step": 15590 }, { "epoch": 0.9507629720484828, "grad_norm": 0.489595502614975, "learning_rate": 8e-05, "loss": 1.3535, "num_input_tokens_seen": 2200004540, "step": 15600 }, { "epoch": 0.9513724354921036, "grad_norm": 0.5072488188743591, "learning_rate": 8e-05, "loss": 1.2492, "num_input_tokens_seen": 2201446580, "step": 15610 }, { "epoch": 0.9519818989357245, "grad_norm": 0.5579765439033508, "learning_rate": 8e-05, "loss": 1.3371, "num_input_tokens_seen": 2202855424, "step": 15620 }, { "epoch": 0.9525913623793453, "grad_norm": 0.49142131209373474, "learning_rate": 8e-05, "loss": 1.3132, "num_input_tokens_seen": 2204239896, "step": 15630 }, { "epoch": 0.9532008258229661, "grad_norm": 0.49964556097984314, "learning_rate": 8e-05, "loss": 1.3217, "num_input_tokens_seen": 2205656712, "step": 15640 }, { "epoch": 0.953810289266587, "grad_norm": 0.493123322725296, "learning_rate": 8e-05, "loss": 1.2907, "num_input_tokens_seen": 2207060424, "step": 15650 }, { "epoch": 0.9544197527102077, "grad_norm": 0.5087452530860901, "learning_rate": 8e-05, "loss": 1.3345, "num_input_tokens_seen": 2208448188, "step": 15660 }, { "epoch": 0.9550292161538285, "grad_norm": 0.5435436367988586, "learning_rate": 8e-05, "loss": 1.3228, "num_input_tokens_seen": 2209862512, "step": 15670 }, { "epoch": 0.9556386795974494, "grad_norm": 0.5179306268692017, "learning_rate": 8e-05, "loss": 1.3164, "num_input_tokens_seen": 2211270724, "step": 15680 }, { "epoch": 0.9562481430410702, "grad_norm": 0.4974176287651062, "learning_rate": 8e-05, "loss": 1.2702, "num_input_tokens_seen": 2212669556, "step": 15690 }, { "epoch": 0.9568576064846911, "grad_norm": 0.5859740972518921, "learning_rate": 8e-05, "loss": 1.2829, "num_input_tokens_seen": 2214058020, "step": 15700 }, { "epoch": 0.9574670699283119, "grad_norm": 0.5075061321258545, "learning_rate": 8e-05, "loss": 1.32, "num_input_tokens_seen": 2215513552, "step": 15710 }, { "epoch": 0.9580765333719327, "grad_norm": 0.5155842304229736, "learning_rate": 8e-05, "loss": 1.2568, "num_input_tokens_seen": 2216879340, "step": 15720 }, { "epoch": 0.9586859968155536, "grad_norm": 0.5818564295768738, "learning_rate": 8e-05, "loss": 1.3082, "num_input_tokens_seen": 2218291808, "step": 15730 }, { "epoch": 0.9592954602591743, "grad_norm": 0.534583568572998, "learning_rate": 8e-05, "loss": 1.3433, "num_input_tokens_seen": 2219696636, "step": 15740 }, { "epoch": 0.9599049237027951, "grad_norm": 0.5481460690498352, "learning_rate": 8e-05, "loss": 1.2681, "num_input_tokens_seen": 2221124380, "step": 15750 }, { "epoch": 0.960514387146416, "grad_norm": 0.4679185450077057, "learning_rate": 8e-05, "loss": 1.3098, "num_input_tokens_seen": 2222547488, "step": 15760 }, { "epoch": 0.9611238505900368, "grad_norm": 0.576784074306488, "learning_rate": 8e-05, "loss": 1.3268, "num_input_tokens_seen": 2223963556, "step": 15770 }, { "epoch": 0.9617333140336576, "grad_norm": 0.48329129815101624, "learning_rate": 8e-05, "loss": 1.2158, "num_input_tokens_seen": 2225362452, "step": 15780 }, { "epoch": 0.9623427774772785, "grad_norm": 0.5717592835426331, "learning_rate": 8e-05, "loss": 1.3256, "num_input_tokens_seen": 2226820220, "step": 15790 }, { "epoch": 0.9629522409208993, "grad_norm": 0.4110204875469208, "learning_rate": 8e-05, "loss": 1.3205, "num_input_tokens_seen": 2228219256, "step": 15800 }, { "epoch": 0.96356170436452, "grad_norm": 0.5517961382865906, "learning_rate": 8e-05, "loss": 1.3158, "num_input_tokens_seen": 2229620876, "step": 15810 }, { "epoch": 0.9641711678081409, "grad_norm": 0.6182602047920227, "learning_rate": 8e-05, "loss": 1.3214, "num_input_tokens_seen": 2231002716, "step": 15820 }, { "epoch": 0.9647806312517617, "grad_norm": 0.5628819465637207, "learning_rate": 8e-05, "loss": 1.3393, "num_input_tokens_seen": 2232410556, "step": 15830 }, { "epoch": 0.9653900946953825, "grad_norm": 0.4238695502281189, "learning_rate": 8e-05, "loss": 1.2732, "num_input_tokens_seen": 2233815396, "step": 15840 }, { "epoch": 0.9659995581390034, "grad_norm": 0.5082485675811768, "learning_rate": 8e-05, "loss": 1.3654, "num_input_tokens_seen": 2235185328, "step": 15850 }, { "epoch": 0.9666090215826242, "grad_norm": 0.550703227519989, "learning_rate": 8e-05, "loss": 1.3748, "num_input_tokens_seen": 2236596760, "step": 15860 }, { "epoch": 0.967218485026245, "grad_norm": 0.47868087887763977, "learning_rate": 8e-05, "loss": 1.3216, "num_input_tokens_seen": 2238020480, "step": 15870 }, { "epoch": 0.9678279484698659, "grad_norm": 0.42936983704566956, "learning_rate": 8e-05, "loss": 1.2932, "num_input_tokens_seen": 2239422468, "step": 15880 }, { "epoch": 0.9684374119134866, "grad_norm": 0.5375044941902161, "learning_rate": 8e-05, "loss": 1.2576, "num_input_tokens_seen": 2240839504, "step": 15890 }, { "epoch": 0.9690468753571075, "grad_norm": 0.44315773248672485, "learning_rate": 8e-05, "loss": 1.2574, "num_input_tokens_seen": 2242293840, "step": 15900 }, { "epoch": 0.9696563388007283, "grad_norm": 0.550308346748352, "learning_rate": 8e-05, "loss": 1.2451, "num_input_tokens_seen": 2243656524, "step": 15910 }, { "epoch": 0.9702658022443491, "grad_norm": 0.47337761521339417, "learning_rate": 8e-05, "loss": 1.3502, "num_input_tokens_seen": 2245041628, "step": 15920 }, { "epoch": 0.97087526568797, "grad_norm": 0.49625223875045776, "learning_rate": 8e-05, "loss": 1.2356, "num_input_tokens_seen": 2246441132, "step": 15930 }, { "epoch": 0.9714847291315908, "grad_norm": 0.5873345732688904, "learning_rate": 8e-05, "loss": 1.3175, "num_input_tokens_seen": 2247890244, "step": 15940 }, { "epoch": 0.9720941925752116, "grad_norm": 0.5385427474975586, "learning_rate": 8e-05, "loss": 1.3387, "num_input_tokens_seen": 2249271220, "step": 15950 }, { "epoch": 0.9727036560188325, "grad_norm": 0.6146961450576782, "learning_rate": 8e-05, "loss": 1.3295, "num_input_tokens_seen": 2250717056, "step": 15960 }, { "epoch": 0.9733131194624532, "grad_norm": 0.4284912347793579, "learning_rate": 8e-05, "loss": 1.3742, "num_input_tokens_seen": 2252132980, "step": 15970 }, { "epoch": 0.973922582906074, "grad_norm": 0.5247157216072083, "learning_rate": 8e-05, "loss": 1.391, "num_input_tokens_seen": 2253553588, "step": 15980 }, { "epoch": 0.9745320463496949, "grad_norm": 0.5365486741065979, "learning_rate": 8e-05, "loss": 1.3256, "num_input_tokens_seen": 2254999908, "step": 15990 }, { "epoch": 0.9751415097933157, "grad_norm": 0.5320517420768738, "learning_rate": 8e-05, "loss": 1.3291, "num_input_tokens_seen": 2256440196, "step": 16000 }, { "epoch": 0.9757509732369365, "grad_norm": 0.5396883487701416, "learning_rate": 8e-05, "loss": 1.2364, "num_input_tokens_seen": 2257840344, "step": 16010 }, { "epoch": 0.9763604366805574, "grad_norm": 0.5101752281188965, "learning_rate": 8e-05, "loss": 1.2918, "num_input_tokens_seen": 2259258160, "step": 16020 }, { "epoch": 0.9769699001241782, "grad_norm": 0.46001750230789185, "learning_rate": 8e-05, "loss": 1.3095, "num_input_tokens_seen": 2260655960, "step": 16030 }, { "epoch": 0.977579363567799, "grad_norm": 0.5177302360534668, "learning_rate": 8e-05, "loss": 1.2653, "num_input_tokens_seen": 2262030540, "step": 16040 }, { "epoch": 0.9781888270114198, "grad_norm": 0.5023877620697021, "learning_rate": 8e-05, "loss": 1.3001, "num_input_tokens_seen": 2263437256, "step": 16050 }, { "epoch": 0.9787982904550406, "grad_norm": 0.5150201320648193, "learning_rate": 8e-05, "loss": 1.2895, "num_input_tokens_seen": 2264836292, "step": 16060 }, { "epoch": 0.9794077538986614, "grad_norm": 0.5193420052528381, "learning_rate": 8e-05, "loss": 1.2073, "num_input_tokens_seen": 2266236256, "step": 16070 }, { "epoch": 0.9800172173422823, "grad_norm": 0.5201960802078247, "learning_rate": 8e-05, "loss": 1.3091, "num_input_tokens_seen": 2267626184, "step": 16080 }, { "epoch": 0.9806266807859031, "grad_norm": 0.5707736015319824, "learning_rate": 8e-05, "loss": 1.2807, "num_input_tokens_seen": 2269065944, "step": 16090 }, { "epoch": 0.981236144229524, "grad_norm": 0.5105067491531372, "learning_rate": 8e-05, "loss": 1.3169, "num_input_tokens_seen": 2270432792, "step": 16100 }, { "epoch": 0.9818456076731448, "grad_norm": 0.5395560264587402, "learning_rate": 8e-05, "loss": 1.3133, "num_input_tokens_seen": 2271812640, "step": 16110 }, { "epoch": 0.9824550711167656, "grad_norm": 0.43431755900382996, "learning_rate": 8e-05, "loss": 1.3047, "num_input_tokens_seen": 2273222960, "step": 16120 }, { "epoch": 0.9830645345603864, "grad_norm": 0.5516846179962158, "learning_rate": 8e-05, "loss": 1.2922, "num_input_tokens_seen": 2274609328, "step": 16130 }, { "epoch": 0.9836739980040072, "grad_norm": 0.46326059103012085, "learning_rate": 8e-05, "loss": 1.2601, "num_input_tokens_seen": 2276053984, "step": 16140 }, { "epoch": 0.984283461447628, "grad_norm": 0.5828720331192017, "learning_rate": 8e-05, "loss": 1.2963, "num_input_tokens_seen": 2277490232, "step": 16150 }, { "epoch": 0.9848929248912489, "grad_norm": 0.5485450029373169, "learning_rate": 8e-05, "loss": 1.2844, "num_input_tokens_seen": 2278878972, "step": 16160 }, { "epoch": 0.9855023883348697, "grad_norm": 0.4985129237174988, "learning_rate": 8e-05, "loss": 1.271, "num_input_tokens_seen": 2280327752, "step": 16170 }, { "epoch": 0.9861118517784905, "grad_norm": 0.47291767597198486, "learning_rate": 8e-05, "loss": 1.2697, "num_input_tokens_seen": 2281761044, "step": 16180 }, { "epoch": 0.9867213152221114, "grad_norm": 0.43932777643203735, "learning_rate": 8e-05, "loss": 1.283, "num_input_tokens_seen": 2283171296, "step": 16190 }, { "epoch": 0.9873307786657322, "grad_norm": 0.5212329030036926, "learning_rate": 8e-05, "loss": 1.2226, "num_input_tokens_seen": 2284634396, "step": 16200 }, { "epoch": 0.9879402421093529, "grad_norm": 0.468487024307251, "learning_rate": 8e-05, "loss": 1.4401, "num_input_tokens_seen": 2286043496, "step": 16210 }, { "epoch": 0.9885497055529738, "grad_norm": 0.6297067403793335, "learning_rate": 8e-05, "loss": 1.313, "num_input_tokens_seen": 2287450120, "step": 16220 }, { "epoch": 0.9891591689965946, "grad_norm": 0.44979387521743774, "learning_rate": 8e-05, "loss": 1.2554, "num_input_tokens_seen": 2288841740, "step": 16230 }, { "epoch": 0.9897686324402154, "grad_norm": 0.4660492241382599, "learning_rate": 8e-05, "loss": 1.2291, "num_input_tokens_seen": 2290243272, "step": 16240 }, { "epoch": 0.9903780958838363, "grad_norm": 0.4804689884185791, "learning_rate": 8e-05, "loss": 1.3183, "num_input_tokens_seen": 2291631656, "step": 16250 }, { "epoch": 0.9909875593274571, "grad_norm": 0.5002725124359131, "learning_rate": 8e-05, "loss": 1.2312, "num_input_tokens_seen": 2293014940, "step": 16260 }, { "epoch": 0.9915970227710779, "grad_norm": 0.49899348616600037, "learning_rate": 8e-05, "loss": 1.1467, "num_input_tokens_seen": 2294401564, "step": 16270 }, { "epoch": 0.9922064862146988, "grad_norm": 0.4643469750881195, "learning_rate": 8e-05, "loss": 1.3575, "num_input_tokens_seen": 2295835048, "step": 16280 }, { "epoch": 0.9928159496583195, "grad_norm": 0.5447330474853516, "learning_rate": 8e-05, "loss": 1.2759, "num_input_tokens_seen": 2297276424, "step": 16290 }, { "epoch": 0.9934254131019403, "grad_norm": 0.556800901889801, "learning_rate": 8e-05, "loss": 1.2833, "num_input_tokens_seen": 2298712204, "step": 16300 }, { "epoch": 0.9940348765455612, "grad_norm": 0.5475782752037048, "learning_rate": 8e-05, "loss": 1.346, "num_input_tokens_seen": 2300152748, "step": 16310 }, { "epoch": 0.994644339989182, "grad_norm": 0.47940051555633545, "learning_rate": 8e-05, "loss": 1.2262, "num_input_tokens_seen": 2301556176, "step": 16320 }, { "epoch": 0.9952538034328029, "grad_norm": 0.45238494873046875, "learning_rate": 8e-05, "loss": 1.3266, "num_input_tokens_seen": 2303010552, "step": 16330 }, { "epoch": 0.9958632668764237, "grad_norm": 0.5712950825691223, "learning_rate": 8e-05, "loss": 1.3051, "num_input_tokens_seen": 2304415944, "step": 16340 }, { "epoch": 0.9964727303200445, "grad_norm": 0.5340079069137573, "learning_rate": 8e-05, "loss": 1.2415, "num_input_tokens_seen": 2305805212, "step": 16350 }, { "epoch": 0.9970821937636654, "grad_norm": 0.5300690531730652, "learning_rate": 8e-05, "loss": 1.295, "num_input_tokens_seen": 2307230964, "step": 16360 }, { "epoch": 0.9976916572072861, "grad_norm": 0.5516910552978516, "learning_rate": 8e-05, "loss": 1.2448, "num_input_tokens_seen": 2308634668, "step": 16370 }, { "epoch": 0.9983011206509069, "grad_norm": 0.48180052638053894, "learning_rate": 8e-05, "loss": 1.2914, "num_input_tokens_seen": 2310022092, "step": 16380 }, { "epoch": 0.9989105840945278, "grad_norm": 0.48212724924087524, "learning_rate": 8e-05, "loss": 1.3808, "num_input_tokens_seen": 2311466628, "step": 16390 }, { "epoch": 0.9995200475381486, "grad_norm": 0.435754656791687, "learning_rate": 8e-05, "loss": 1.3024, "num_input_tokens_seen": 2312932112, "step": 16400 }, { "epoch": 0.9999466719486831, "num_input_tokens_seen": 2313940996, "step": 16407, "total_flos": 9.029436409798197e+18, "train_loss": 0.7368571982491552, "train_runtime": 130509.2663, "train_samples_per_second": 32.185, "train_steps_per_second": 0.126 } ], "logging_steps": 10, "max_steps": 16407, "num_input_tokens_seen": 2313940996, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.029436409798197e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }