diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13163 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999466719486831, + "eval_steps": 500, + "global_step": 16407, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006094634436208223, + "grad_norm": 9.033647537231445, + "learning_rate": 9.7442143727162e-07, + "loss": 6.6451, + "num_input_tokens_seen": 1401772, + "step": 10 + }, + { + "epoch": 0.0012189268872416447, + "grad_norm": 8.373307228088379, + "learning_rate": 1.94884287454324e-06, + "loss": 6.4199, + "num_input_tokens_seen": 2825364, + "step": 20 + }, + { + "epoch": 0.001828390330862467, + "grad_norm": 8.205723762512207, + "learning_rate": 2.92326431181486e-06, + "loss": 5.9935, + "num_input_tokens_seen": 4259836, + "step": 30 + }, + { + "epoch": 0.0024378537744832894, + "grad_norm": 8.017410278320312, + "learning_rate": 3.89768574908648e-06, + "loss": 5.8172, + "num_input_tokens_seen": 5653988, + "step": 40 + }, + { + "epoch": 0.0030473172181041115, + "grad_norm": 8.248656272888184, + "learning_rate": 4.8721071863581e-06, + "loss": 5.4176, + "num_input_tokens_seen": 7026620, + "step": 50 + }, + { + "epoch": 0.003656780661724934, + "grad_norm": 6.726569175720215, + "learning_rate": 5.84652862362972e-06, + "loss": 5.0672, + "num_input_tokens_seen": 8474412, + "step": 60 + }, + { + "epoch": 0.004266244105345756, + "grad_norm": 7.437803268432617, + "learning_rate": 6.8209500609013406e-06, + "loss": 4.6725, + "num_input_tokens_seen": 9888436, + "step": 70 + }, + { + "epoch": 0.004875707548966579, + "grad_norm": 7.783193111419678, + "learning_rate": 7.79537149817296e-06, + "loss": 4.0473, + "num_input_tokens_seen": 11322860, + "step": 80 + }, + { + "epoch": 0.0054851709925874004, + "grad_norm": 1.6730059385299683, + "learning_rate": 8.769792935444581e-06, + "loss": 3.3354, + "num_input_tokens_seen": 12764288, + "step": 90 + }, + { + "epoch": 0.006094634436208223, + "grad_norm": 1.6392016410827637, + "learning_rate": 9.7442143727162e-06, + "loss": 2.9862, + "num_input_tokens_seen": 14208968, + "step": 100 + }, + { + "epoch": 0.0067040978798290456, + "grad_norm": 1.278815507888794, + "learning_rate": 1.071863580998782e-05, + "loss": 2.7488, + "num_input_tokens_seen": 15575528, + "step": 110 + }, + { + "epoch": 0.007313561323449868, + "grad_norm": 1.2898283004760742, + "learning_rate": 1.169305724725944e-05, + "loss": 2.6214, + "num_input_tokens_seen": 16993668, + "step": 120 + }, + { + "epoch": 0.00792302476707069, + "grad_norm": 1.0845414400100708, + "learning_rate": 1.266747868453106e-05, + "loss": 2.5408, + "num_input_tokens_seen": 18424664, + "step": 130 + }, + { + "epoch": 0.008532488210691512, + "grad_norm": 1.1549726724624634, + "learning_rate": 1.3641900121802681e-05, + "loss": 2.4494, + "num_input_tokens_seen": 19842364, + "step": 140 + }, + { + "epoch": 0.009141951654312334, + "grad_norm": 1.2677397727966309, + "learning_rate": 1.46163215590743e-05, + "loss": 2.4496, + "num_input_tokens_seen": 21287212, + "step": 150 + }, + { + "epoch": 0.009751415097933157, + "grad_norm": 1.2440944910049438, + "learning_rate": 1.559074299634592e-05, + "loss": 2.4503, + "num_input_tokens_seen": 22706312, + "step": 160 + }, + { + "epoch": 0.01036087854155398, + "grad_norm": 1.2314107418060303, + "learning_rate": 1.656516443361754e-05, + "loss": 2.3764, + "num_input_tokens_seen": 24127816, + "step": 170 + }, + { + "epoch": 0.010970341985174801, + "grad_norm": 1.0929701328277588, + "learning_rate": 1.7539585870889162e-05, + "loss": 2.4311, + "num_input_tokens_seen": 25495676, + "step": 180 + }, + { + "epoch": 0.011579805428795624, + "grad_norm": 1.0920017957687378, + "learning_rate": 1.8514007308160783e-05, + "loss": 2.3656, + "num_input_tokens_seen": 26913848, + "step": 190 + }, + { + "epoch": 0.012189268872416446, + "grad_norm": 1.259750485420227, + "learning_rate": 1.94884287454324e-05, + "loss": 2.2739, + "num_input_tokens_seen": 28326716, + "step": 200 + }, + { + "epoch": 0.01279873231603727, + "grad_norm": 1.0931645631790161, + "learning_rate": 2.046285018270402e-05, + "loss": 2.231, + "num_input_tokens_seen": 29699424, + "step": 210 + }, + { + "epoch": 0.013408195759658091, + "grad_norm": 1.0463930368423462, + "learning_rate": 2.143727161997564e-05, + "loss": 2.216, + "num_input_tokens_seen": 31109904, + "step": 220 + }, + { + "epoch": 0.014017659203278913, + "grad_norm": 1.0597470998764038, + "learning_rate": 2.241169305724726e-05, + "loss": 2.1803, + "num_input_tokens_seen": 32547736, + "step": 230 + }, + { + "epoch": 0.014627122646899736, + "grad_norm": 1.1559436321258545, + "learning_rate": 2.338611449451888e-05, + "loss": 2.1441, + "num_input_tokens_seen": 33972620, + "step": 240 + }, + { + "epoch": 0.015236586090520558, + "grad_norm": 1.2502082586288452, + "learning_rate": 2.4360535931790504e-05, + "loss": 2.1537, + "num_input_tokens_seen": 35404252, + "step": 250 + }, + { + "epoch": 0.01584604953414138, + "grad_norm": 1.096165418624878, + "learning_rate": 2.533495736906212e-05, + "loss": 2.1193, + "num_input_tokens_seen": 36804296, + "step": 260 + }, + { + "epoch": 0.016455512977762203, + "grad_norm": 1.0119866132736206, + "learning_rate": 2.630937880633374e-05, + "loss": 2.1115, + "num_input_tokens_seen": 38185184, + "step": 270 + }, + { + "epoch": 0.017064976421383025, + "grad_norm": 1.1104751825332642, + "learning_rate": 2.7283800243605362e-05, + "loss": 2.2292, + "num_input_tokens_seen": 39600904, + "step": 280 + }, + { + "epoch": 0.017674439865003846, + "grad_norm": 0.9693301320075989, + "learning_rate": 2.8258221680876983e-05, + "loss": 2.1193, + "num_input_tokens_seen": 41014344, + "step": 290 + }, + { + "epoch": 0.018283903308624668, + "grad_norm": 0.9671382308006287, + "learning_rate": 2.92326431181486e-05, + "loss": 2.052, + "num_input_tokens_seen": 42441088, + "step": 300 + }, + { + "epoch": 0.018893366752245493, + "grad_norm": 1.053604006767273, + "learning_rate": 3.020706455542022e-05, + "loss": 2.081, + "num_input_tokens_seen": 43835948, + "step": 310 + }, + { + "epoch": 0.019502830195866315, + "grad_norm": 0.9839246869087219, + "learning_rate": 3.118148599269184e-05, + "loss": 2.0446, + "num_input_tokens_seen": 45267668, + "step": 320 + }, + { + "epoch": 0.020112293639487137, + "grad_norm": 1.0461212396621704, + "learning_rate": 3.215590742996346e-05, + "loss": 2.0046, + "num_input_tokens_seen": 46734856, + "step": 330 + }, + { + "epoch": 0.02072175708310796, + "grad_norm": 0.9702940583229065, + "learning_rate": 3.313032886723508e-05, + "loss": 2.0476, + "num_input_tokens_seen": 48137552, + "step": 340 + }, + { + "epoch": 0.02133122052672878, + "grad_norm": 1.0886714458465576, + "learning_rate": 3.41047503045067e-05, + "loss": 2.0536, + "num_input_tokens_seen": 49555456, + "step": 350 + }, + { + "epoch": 0.021940683970349602, + "grad_norm": 1.0513168573379517, + "learning_rate": 3.5079171741778324e-05, + "loss": 2.0185, + "num_input_tokens_seen": 50949976, + "step": 360 + }, + { + "epoch": 0.022550147413970427, + "grad_norm": 1.0875654220581055, + "learning_rate": 3.605359317904994e-05, + "loss": 1.9939, + "num_input_tokens_seen": 52330820, + "step": 370 + }, + { + "epoch": 0.02315961085759125, + "grad_norm": 0.9643621444702148, + "learning_rate": 3.7028014616321566e-05, + "loss": 2.0392, + "num_input_tokens_seen": 53764024, + "step": 380 + }, + { + "epoch": 0.02376907430121207, + "grad_norm": 1.0301045179367065, + "learning_rate": 3.800243605359318e-05, + "loss": 2.0448, + "num_input_tokens_seen": 55201816, + "step": 390 + }, + { + "epoch": 0.024378537744832892, + "grad_norm": 1.0076828002929688, + "learning_rate": 3.89768574908648e-05, + "loss": 1.9415, + "num_input_tokens_seen": 56622004, + "step": 400 + }, + { + "epoch": 0.024988001188453714, + "grad_norm": 0.9446476101875305, + "learning_rate": 3.9951278928136424e-05, + "loss": 1.9752, + "num_input_tokens_seen": 58038112, + "step": 410 + }, + { + "epoch": 0.02559746463207454, + "grad_norm": 0.952552318572998, + "learning_rate": 4.092570036540804e-05, + "loss": 1.9806, + "num_input_tokens_seen": 59484616, + "step": 420 + }, + { + "epoch": 0.02620692807569536, + "grad_norm": 0.8900318145751953, + "learning_rate": 4.1900121802679666e-05, + "loss": 1.9547, + "num_input_tokens_seen": 60889256, + "step": 430 + }, + { + "epoch": 0.026816391519316182, + "grad_norm": 0.9296801090240479, + "learning_rate": 4.287454323995128e-05, + "loss": 2.0001, + "num_input_tokens_seen": 62274740, + "step": 440 + }, + { + "epoch": 0.027425854962937004, + "grad_norm": 0.9372000098228455, + "learning_rate": 4.38489646772229e-05, + "loss": 2.001, + "num_input_tokens_seen": 63719728, + "step": 450 + }, + { + "epoch": 0.028035318406557826, + "grad_norm": 1.1481891870498657, + "learning_rate": 4.482338611449452e-05, + "loss": 2.0047, + "num_input_tokens_seen": 65120052, + "step": 460 + }, + { + "epoch": 0.028644781850178647, + "grad_norm": 0.9526330232620239, + "learning_rate": 4.579780755176614e-05, + "loss": 1.8862, + "num_input_tokens_seen": 66516240, + "step": 470 + }, + { + "epoch": 0.029254245293799472, + "grad_norm": 0.9846788048744202, + "learning_rate": 4.677222898903776e-05, + "loss": 1.8632, + "num_input_tokens_seen": 67966636, + "step": 480 + }, + { + "epoch": 0.029863708737420294, + "grad_norm": 0.9888688921928406, + "learning_rate": 4.774665042630938e-05, + "loss": 1.7964, + "num_input_tokens_seen": 69405656, + "step": 490 + }, + { + "epoch": 0.030473172181041116, + "grad_norm": 0.9095290303230286, + "learning_rate": 4.872107186358101e-05, + "loss": 1.9531, + "num_input_tokens_seen": 70820452, + "step": 500 + }, + { + "epoch": 0.031082635624661938, + "grad_norm": 0.8255355358123779, + "learning_rate": 4.9695493300852625e-05, + "loss": 1.951, + "num_input_tokens_seen": 72275948, + "step": 510 + }, + { + "epoch": 0.03169209906828276, + "grad_norm": 0.8973667025566101, + "learning_rate": 5.066991473812424e-05, + "loss": 1.9297, + "num_input_tokens_seen": 73697960, + "step": 520 + }, + { + "epoch": 0.03230156251190358, + "grad_norm": 1.0261414051055908, + "learning_rate": 5.164433617539586e-05, + "loss": 1.8723, + "num_input_tokens_seen": 75117608, + "step": 530 + }, + { + "epoch": 0.032911025955524406, + "grad_norm": 0.8425599336624146, + "learning_rate": 5.261875761266748e-05, + "loss": 1.8414, + "num_input_tokens_seen": 76461224, + "step": 540 + }, + { + "epoch": 0.033520489399145224, + "grad_norm": 0.9196986556053162, + "learning_rate": 5.35931790499391e-05, + "loss": 1.9135, + "num_input_tokens_seen": 77904064, + "step": 550 + }, + { + "epoch": 0.03412995284276605, + "grad_norm": 0.8252116441726685, + "learning_rate": 5.4567600487210725e-05, + "loss": 1.8992, + "num_input_tokens_seen": 79294400, + "step": 560 + }, + { + "epoch": 0.034739416286386875, + "grad_norm": 1.140308141708374, + "learning_rate": 5.554202192448234e-05, + "loss": 1.8564, + "num_input_tokens_seen": 80694984, + "step": 570 + }, + { + "epoch": 0.03534887973000769, + "grad_norm": 0.772463321685791, + "learning_rate": 5.6516443361753966e-05, + "loss": 1.9372, + "num_input_tokens_seen": 82116012, + "step": 580 + }, + { + "epoch": 0.03595834317362852, + "grad_norm": 0.8610614538192749, + "learning_rate": 5.749086479902558e-05, + "loss": 1.8236, + "num_input_tokens_seen": 83500604, + "step": 590 + }, + { + "epoch": 0.036567806617249336, + "grad_norm": 1.0011142492294312, + "learning_rate": 5.84652862362972e-05, + "loss": 1.875, + "num_input_tokens_seen": 84936208, + "step": 600 + }, + { + "epoch": 0.03717727006087016, + "grad_norm": 0.9461501836776733, + "learning_rate": 5.9439707673568825e-05, + "loss": 1.8397, + "num_input_tokens_seen": 86347300, + "step": 610 + }, + { + "epoch": 0.03778673350449099, + "grad_norm": 0.8287586569786072, + "learning_rate": 6.041412911084044e-05, + "loss": 1.8283, + "num_input_tokens_seen": 87787620, + "step": 620 + }, + { + "epoch": 0.038396196948111805, + "grad_norm": 0.9248697757720947, + "learning_rate": 6.138855054811207e-05, + "loss": 1.8192, + "num_input_tokens_seen": 89217528, + "step": 630 + }, + { + "epoch": 0.03900566039173263, + "grad_norm": 0.8168230056762695, + "learning_rate": 6.236297198538368e-05, + "loss": 1.8232, + "num_input_tokens_seen": 90602460, + "step": 640 + }, + { + "epoch": 0.03961512383535345, + "grad_norm": 0.8372480273246765, + "learning_rate": 6.33373934226553e-05, + "loss": 1.9267, + "num_input_tokens_seen": 91995380, + "step": 650 + }, + { + "epoch": 0.04022458727897427, + "grad_norm": 1.0177308320999146, + "learning_rate": 6.431181485992692e-05, + "loss": 1.8319, + "num_input_tokens_seen": 93396324, + "step": 660 + }, + { + "epoch": 0.0408340507225951, + "grad_norm": 0.9811750054359436, + "learning_rate": 6.528623629719854e-05, + "loss": 1.8442, + "num_input_tokens_seen": 94835448, + "step": 670 + }, + { + "epoch": 0.04144351416621592, + "grad_norm": 0.8578136563301086, + "learning_rate": 6.626065773447017e-05, + "loss": 1.8373, + "num_input_tokens_seen": 96247680, + "step": 680 + }, + { + "epoch": 0.04205297760983674, + "grad_norm": 0.7805215120315552, + "learning_rate": 6.723507917174178e-05, + "loss": 1.8717, + "num_input_tokens_seen": 97649748, + "step": 690 + }, + { + "epoch": 0.04266244105345756, + "grad_norm": 0.7658279538154602, + "learning_rate": 6.82095006090134e-05, + "loss": 1.8215, + "num_input_tokens_seen": 99072716, + "step": 700 + }, + { + "epoch": 0.043271904497078385, + "grad_norm": 0.8456059098243713, + "learning_rate": 6.918392204628502e-05, + "loss": 1.9152, + "num_input_tokens_seen": 100460184, + "step": 710 + }, + { + "epoch": 0.043881367940699204, + "grad_norm": 0.7689954042434692, + "learning_rate": 7.015834348355665e-05, + "loss": 1.8299, + "num_input_tokens_seen": 101833788, + "step": 720 + }, + { + "epoch": 0.04449083138432003, + "grad_norm": 0.769614577293396, + "learning_rate": 7.113276492082827e-05, + "loss": 1.7905, + "num_input_tokens_seen": 103233860, + "step": 730 + }, + { + "epoch": 0.045100294827940854, + "grad_norm": 0.984970211982727, + "learning_rate": 7.210718635809988e-05, + "loss": 1.7542, + "num_input_tokens_seen": 104649404, + "step": 740 + }, + { + "epoch": 0.04570975827156167, + "grad_norm": 0.898064136505127, + "learning_rate": 7.30816077953715e-05, + "loss": 1.8333, + "num_input_tokens_seen": 106019108, + "step": 750 + }, + { + "epoch": 0.0463192217151825, + "grad_norm": 0.8178762197494507, + "learning_rate": 7.405602923264313e-05, + "loss": 1.7696, + "num_input_tokens_seen": 107412140, + "step": 760 + }, + { + "epoch": 0.046928685158803315, + "grad_norm": 0.8419963121414185, + "learning_rate": 7.503045066991475e-05, + "loss": 1.8582, + "num_input_tokens_seen": 108814020, + "step": 770 + }, + { + "epoch": 0.04753814860242414, + "grad_norm": 0.8736382126808167, + "learning_rate": 7.600487210718637e-05, + "loss": 1.8505, + "num_input_tokens_seen": 110243636, + "step": 780 + }, + { + "epoch": 0.048147612046044966, + "grad_norm": 0.8330615758895874, + "learning_rate": 7.697929354445798e-05, + "loss": 1.8439, + "num_input_tokens_seen": 111661300, + "step": 790 + }, + { + "epoch": 0.048757075489665784, + "grad_norm": 0.8429380059242249, + "learning_rate": 7.79537149817296e-05, + "loss": 1.7612, + "num_input_tokens_seen": 113084304, + "step": 800 + }, + { + "epoch": 0.04936653893328661, + "grad_norm": 0.7914267182350159, + "learning_rate": 7.892813641900122e-05, + "loss": 1.8355, + "num_input_tokens_seen": 114514328, + "step": 810 + }, + { + "epoch": 0.04997600237690743, + "grad_norm": 0.8499755263328552, + "learning_rate": 7.990255785627285e-05, + "loss": 1.762, + "num_input_tokens_seen": 115939680, + "step": 820 + }, + { + "epoch": 0.05058546582052825, + "grad_norm": 0.8349440693855286, + "learning_rate": 8e-05, + "loss": 1.7935, + "num_input_tokens_seen": 117339404, + "step": 830 + }, + { + "epoch": 0.05119492926414908, + "grad_norm": 0.8505790829658508, + "learning_rate": 8e-05, + "loss": 1.8398, + "num_input_tokens_seen": 118763072, + "step": 840 + }, + { + "epoch": 0.051804392707769896, + "grad_norm": 0.761175274848938, + "learning_rate": 8e-05, + "loss": 1.7851, + "num_input_tokens_seen": 120156740, + "step": 850 + }, + { + "epoch": 0.05241385615139072, + "grad_norm": 0.8605666756629944, + "learning_rate": 8e-05, + "loss": 1.8358, + "num_input_tokens_seen": 121570328, + "step": 860 + }, + { + "epoch": 0.05302331959501154, + "grad_norm": 0.648381769657135, + "learning_rate": 8e-05, + "loss": 1.7502, + "num_input_tokens_seen": 122939696, + "step": 870 + }, + { + "epoch": 0.053632783038632365, + "grad_norm": 0.9061549305915833, + "learning_rate": 8e-05, + "loss": 1.7837, + "num_input_tokens_seen": 124365264, + "step": 880 + }, + { + "epoch": 0.05424224648225319, + "grad_norm": 0.8057026267051697, + "learning_rate": 8e-05, + "loss": 1.7609, + "num_input_tokens_seen": 125746168, + "step": 890 + }, + { + "epoch": 0.05485170992587401, + "grad_norm": 0.8674312829971313, + "learning_rate": 8e-05, + "loss": 1.7415, + "num_input_tokens_seen": 127137780, + "step": 900 + }, + { + "epoch": 0.05546117336949483, + "grad_norm": 0.8366326689720154, + "learning_rate": 8e-05, + "loss": 1.8147, + "num_input_tokens_seen": 128546252, + "step": 910 + }, + { + "epoch": 0.05607063681311565, + "grad_norm": 0.8231328129768372, + "learning_rate": 8e-05, + "loss": 1.8652, + "num_input_tokens_seen": 129950036, + "step": 920 + }, + { + "epoch": 0.056680100256736476, + "grad_norm": 0.7696998715400696, + "learning_rate": 8e-05, + "loss": 1.8445, + "num_input_tokens_seen": 131396708, + "step": 930 + }, + { + "epoch": 0.057289563700357295, + "grad_norm": 0.8545131683349609, + "learning_rate": 8e-05, + "loss": 1.8091, + "num_input_tokens_seen": 132834744, + "step": 940 + }, + { + "epoch": 0.05789902714397812, + "grad_norm": 0.8772911429405212, + "learning_rate": 8e-05, + "loss": 1.822, + "num_input_tokens_seen": 134197864, + "step": 950 + }, + { + "epoch": 0.058508490587598945, + "grad_norm": 0.7780118584632874, + "learning_rate": 8e-05, + "loss": 1.6964, + "num_input_tokens_seen": 135587736, + "step": 960 + }, + { + "epoch": 0.05911795403121976, + "grad_norm": 0.8001295328140259, + "learning_rate": 8e-05, + "loss": 1.6976, + "num_input_tokens_seen": 136998024, + "step": 970 + }, + { + "epoch": 0.05972741747484059, + "grad_norm": 0.8727300763130188, + "learning_rate": 8e-05, + "loss": 1.6902, + "num_input_tokens_seen": 138396964, + "step": 980 + }, + { + "epoch": 0.06033688091846141, + "grad_norm": 0.9872994422912598, + "learning_rate": 8e-05, + "loss": 1.7319, + "num_input_tokens_seen": 139815224, + "step": 990 + }, + { + "epoch": 0.06094634436208223, + "grad_norm": 0.8009439706802368, + "learning_rate": 8e-05, + "loss": 1.6803, + "num_input_tokens_seen": 141251384, + "step": 1000 + }, + { + "epoch": 0.06155580780570306, + "grad_norm": 0.7269445657730103, + "learning_rate": 8e-05, + "loss": 1.7466, + "num_input_tokens_seen": 142666948, + "step": 1010 + }, + { + "epoch": 0.062165271249323875, + "grad_norm": 0.7160496711730957, + "learning_rate": 8e-05, + "loss": 1.7092, + "num_input_tokens_seen": 144099664, + "step": 1020 + }, + { + "epoch": 0.0627747346929447, + "grad_norm": 0.8215118646621704, + "learning_rate": 8e-05, + "loss": 1.8191, + "num_input_tokens_seen": 145505712, + "step": 1030 + }, + { + "epoch": 0.06338419813656553, + "grad_norm": 0.7651961445808411, + "learning_rate": 8e-05, + "loss": 1.7255, + "num_input_tokens_seen": 146921148, + "step": 1040 + }, + { + "epoch": 0.06399366158018634, + "grad_norm": 0.8512997031211853, + "learning_rate": 8e-05, + "loss": 1.7603, + "num_input_tokens_seen": 148324952, + "step": 1050 + }, + { + "epoch": 0.06460312502380716, + "grad_norm": 0.7710238695144653, + "learning_rate": 8e-05, + "loss": 1.7784, + "num_input_tokens_seen": 149742540, + "step": 1060 + }, + { + "epoch": 0.06521258846742799, + "grad_norm": 0.7975362539291382, + "learning_rate": 8e-05, + "loss": 1.7519, + "num_input_tokens_seen": 151140136, + "step": 1070 + }, + { + "epoch": 0.06582205191104881, + "grad_norm": 0.8341178894042969, + "learning_rate": 8e-05, + "loss": 1.7708, + "num_input_tokens_seen": 152574668, + "step": 1080 + }, + { + "epoch": 0.06643151535466964, + "grad_norm": 0.81528240442276, + "learning_rate": 8e-05, + "loss": 1.7906, + "num_input_tokens_seen": 153945316, + "step": 1090 + }, + { + "epoch": 0.06704097879829045, + "grad_norm": 0.7310307621955872, + "learning_rate": 8e-05, + "loss": 1.7695, + "num_input_tokens_seen": 155334976, + "step": 1100 + }, + { + "epoch": 0.06765044224191127, + "grad_norm": 0.7746317982673645, + "learning_rate": 8e-05, + "loss": 1.7351, + "num_input_tokens_seen": 156703560, + "step": 1110 + }, + { + "epoch": 0.0682599056855321, + "grad_norm": 0.7563662528991699, + "learning_rate": 8e-05, + "loss": 1.8099, + "num_input_tokens_seen": 158068384, + "step": 1120 + }, + { + "epoch": 0.06886936912915292, + "grad_norm": 0.7843930721282959, + "learning_rate": 8e-05, + "loss": 1.6761, + "num_input_tokens_seen": 159493532, + "step": 1130 + }, + { + "epoch": 0.06947883257277375, + "grad_norm": 0.8035304546356201, + "learning_rate": 8e-05, + "loss": 1.6405, + "num_input_tokens_seen": 160906460, + "step": 1140 + }, + { + "epoch": 0.07008829601639456, + "grad_norm": 0.6847174763679504, + "learning_rate": 8e-05, + "loss": 1.7033, + "num_input_tokens_seen": 162330960, + "step": 1150 + }, + { + "epoch": 0.07069775946001539, + "grad_norm": 0.7156793475151062, + "learning_rate": 8e-05, + "loss": 1.7531, + "num_input_tokens_seen": 163763200, + "step": 1160 + }, + { + "epoch": 0.07130722290363621, + "grad_norm": 0.7055004239082336, + "learning_rate": 8e-05, + "loss": 1.6826, + "num_input_tokens_seen": 165128772, + "step": 1170 + }, + { + "epoch": 0.07191668634725704, + "grad_norm": 0.689820408821106, + "learning_rate": 8e-05, + "loss": 1.7022, + "num_input_tokens_seen": 166506640, + "step": 1180 + }, + { + "epoch": 0.07252614979087786, + "grad_norm": 0.7453446388244629, + "learning_rate": 8e-05, + "loss": 1.7654, + "num_input_tokens_seen": 167908440, + "step": 1190 + }, + { + "epoch": 0.07313561323449867, + "grad_norm": 0.768242597579956, + "learning_rate": 8e-05, + "loss": 1.732, + "num_input_tokens_seen": 169276732, + "step": 1200 + }, + { + "epoch": 0.0737450766781195, + "grad_norm": 0.7208901047706604, + "learning_rate": 8e-05, + "loss": 1.754, + "num_input_tokens_seen": 170678632, + "step": 1210 + }, + { + "epoch": 0.07435454012174032, + "grad_norm": 0.7076956629753113, + "learning_rate": 8e-05, + "loss": 1.6955, + "num_input_tokens_seen": 172093484, + "step": 1220 + }, + { + "epoch": 0.07496400356536115, + "grad_norm": 0.7701781392097473, + "learning_rate": 8e-05, + "loss": 1.7475, + "num_input_tokens_seen": 173482152, + "step": 1230 + }, + { + "epoch": 0.07557346700898197, + "grad_norm": 0.6883390545845032, + "learning_rate": 8e-05, + "loss": 1.7302, + "num_input_tokens_seen": 174939192, + "step": 1240 + }, + { + "epoch": 0.07618293045260278, + "grad_norm": 0.6942645311355591, + "learning_rate": 8e-05, + "loss": 1.771, + "num_input_tokens_seen": 176393408, + "step": 1250 + }, + { + "epoch": 0.07679239389622361, + "grad_norm": 0.6907592415809631, + "learning_rate": 8e-05, + "loss": 1.8247, + "num_input_tokens_seen": 177820192, + "step": 1260 + }, + { + "epoch": 0.07740185733984443, + "grad_norm": 0.6885057687759399, + "learning_rate": 8e-05, + "loss": 1.7403, + "num_input_tokens_seen": 179198048, + "step": 1270 + }, + { + "epoch": 0.07801132078346526, + "grad_norm": 0.7358853816986084, + "learning_rate": 8e-05, + "loss": 1.716, + "num_input_tokens_seen": 180638000, + "step": 1280 + }, + { + "epoch": 0.07862078422708609, + "grad_norm": 0.8144451379776001, + "learning_rate": 8e-05, + "loss": 1.6946, + "num_input_tokens_seen": 182017752, + "step": 1290 + }, + { + "epoch": 0.0792302476707069, + "grad_norm": 0.7160412669181824, + "learning_rate": 8e-05, + "loss": 1.7141, + "num_input_tokens_seen": 183405184, + "step": 1300 + }, + { + "epoch": 0.07983971111432772, + "grad_norm": 0.8351532220840454, + "learning_rate": 8e-05, + "loss": 1.6821, + "num_input_tokens_seen": 184819236, + "step": 1310 + }, + { + "epoch": 0.08044917455794855, + "grad_norm": 0.6285978555679321, + "learning_rate": 8e-05, + "loss": 1.7392, + "num_input_tokens_seen": 186241748, + "step": 1320 + }, + { + "epoch": 0.08105863800156937, + "grad_norm": 0.6861914992332458, + "learning_rate": 8e-05, + "loss": 1.6637, + "num_input_tokens_seen": 187651796, + "step": 1330 + }, + { + "epoch": 0.0816681014451902, + "grad_norm": 0.7372764945030212, + "learning_rate": 8e-05, + "loss": 1.6151, + "num_input_tokens_seen": 189052704, + "step": 1340 + }, + { + "epoch": 0.08227756488881101, + "grad_norm": 0.754943311214447, + "learning_rate": 8e-05, + "loss": 1.6771, + "num_input_tokens_seen": 190457440, + "step": 1350 + }, + { + "epoch": 0.08288702833243183, + "grad_norm": 0.7047508358955383, + "learning_rate": 8e-05, + "loss": 1.7488, + "num_input_tokens_seen": 191855300, + "step": 1360 + }, + { + "epoch": 0.08349649177605266, + "grad_norm": 0.7752687931060791, + "learning_rate": 8e-05, + "loss": 1.7385, + "num_input_tokens_seen": 193237128, + "step": 1370 + }, + { + "epoch": 0.08410595521967348, + "grad_norm": 0.7014668583869934, + "learning_rate": 8e-05, + "loss": 1.767, + "num_input_tokens_seen": 194589060, + "step": 1380 + }, + { + "epoch": 0.08471541866329431, + "grad_norm": 0.6854328513145447, + "learning_rate": 8e-05, + "loss": 1.7614, + "num_input_tokens_seen": 196034960, + "step": 1390 + }, + { + "epoch": 0.08532488210691512, + "grad_norm": 0.7649319171905518, + "learning_rate": 8e-05, + "loss": 1.7496, + "num_input_tokens_seen": 197441788, + "step": 1400 + }, + { + "epoch": 0.08593434555053595, + "grad_norm": 0.6990752220153809, + "learning_rate": 8e-05, + "loss": 1.7276, + "num_input_tokens_seen": 198816188, + "step": 1410 + }, + { + "epoch": 0.08654380899415677, + "grad_norm": 0.6721024513244629, + "learning_rate": 8e-05, + "loss": 1.675, + "num_input_tokens_seen": 200227720, + "step": 1420 + }, + { + "epoch": 0.0871532724377776, + "grad_norm": 0.7723363637924194, + "learning_rate": 8e-05, + "loss": 1.6968, + "num_input_tokens_seen": 201640184, + "step": 1430 + }, + { + "epoch": 0.08776273588139841, + "grad_norm": 0.7399781942367554, + "learning_rate": 8e-05, + "loss": 1.6968, + "num_input_tokens_seen": 203082784, + "step": 1440 + }, + { + "epoch": 0.08837219932501923, + "grad_norm": 0.6369470953941345, + "learning_rate": 8e-05, + "loss": 1.7369, + "num_input_tokens_seen": 204507764, + "step": 1450 + }, + { + "epoch": 0.08898166276864006, + "grad_norm": 0.6783959865570068, + "learning_rate": 8e-05, + "loss": 1.7148, + "num_input_tokens_seen": 205926692, + "step": 1460 + }, + { + "epoch": 0.08959112621226088, + "grad_norm": 0.6937606930732727, + "learning_rate": 8e-05, + "loss": 1.6742, + "num_input_tokens_seen": 207292676, + "step": 1470 + }, + { + "epoch": 0.09020058965588171, + "grad_norm": 0.7870405316352844, + "learning_rate": 8e-05, + "loss": 1.6477, + "num_input_tokens_seen": 208696852, + "step": 1480 + }, + { + "epoch": 0.09081005309950252, + "grad_norm": 0.7263059020042419, + "learning_rate": 8e-05, + "loss": 1.6267, + "num_input_tokens_seen": 210121428, + "step": 1490 + }, + { + "epoch": 0.09141951654312334, + "grad_norm": 0.753978967666626, + "learning_rate": 8e-05, + "loss": 1.6423, + "num_input_tokens_seen": 211550144, + "step": 1500 + }, + { + "epoch": 0.09202897998674417, + "grad_norm": 0.6854983568191528, + "learning_rate": 8e-05, + "loss": 1.6713, + "num_input_tokens_seen": 212992572, + "step": 1510 + }, + { + "epoch": 0.092638443430365, + "grad_norm": 0.7292976975440979, + "learning_rate": 8e-05, + "loss": 1.6481, + "num_input_tokens_seen": 214390412, + "step": 1520 + }, + { + "epoch": 0.09324790687398582, + "grad_norm": 0.6837400197982788, + "learning_rate": 8e-05, + "loss": 1.5692, + "num_input_tokens_seen": 215806552, + "step": 1530 + }, + { + "epoch": 0.09385737031760663, + "grad_norm": 0.7413853406906128, + "learning_rate": 8e-05, + "loss": 1.6803, + "num_input_tokens_seen": 217210216, + "step": 1540 + }, + { + "epoch": 0.09446683376122746, + "grad_norm": 0.7433369755744934, + "learning_rate": 8e-05, + "loss": 1.6934, + "num_input_tokens_seen": 218598456, + "step": 1550 + }, + { + "epoch": 0.09507629720484828, + "grad_norm": 0.7748942375183105, + "learning_rate": 8e-05, + "loss": 1.6741, + "num_input_tokens_seen": 219976860, + "step": 1560 + }, + { + "epoch": 0.0956857606484691, + "grad_norm": 0.7293086647987366, + "learning_rate": 8e-05, + "loss": 1.7087, + "num_input_tokens_seen": 221355396, + "step": 1570 + }, + { + "epoch": 0.09629522409208993, + "grad_norm": 0.862250566482544, + "learning_rate": 8e-05, + "loss": 1.6832, + "num_input_tokens_seen": 222791252, + "step": 1580 + }, + { + "epoch": 0.09690468753571074, + "grad_norm": 0.7801169157028198, + "learning_rate": 8e-05, + "loss": 1.5818, + "num_input_tokens_seen": 224182892, + "step": 1590 + }, + { + "epoch": 0.09751415097933157, + "grad_norm": 0.7509076595306396, + "learning_rate": 8e-05, + "loss": 1.6964, + "num_input_tokens_seen": 225596820, + "step": 1600 + }, + { + "epoch": 0.0981236144229524, + "grad_norm": 0.6174609661102295, + "learning_rate": 8e-05, + "loss": 1.6795, + "num_input_tokens_seen": 227002192, + "step": 1610 + }, + { + "epoch": 0.09873307786657322, + "grad_norm": 0.7319700717926025, + "learning_rate": 8e-05, + "loss": 1.5898, + "num_input_tokens_seen": 228388248, + "step": 1620 + }, + { + "epoch": 0.09934254131019404, + "grad_norm": 0.7365676164627075, + "learning_rate": 8e-05, + "loss": 1.6146, + "num_input_tokens_seen": 229791408, + "step": 1630 + }, + { + "epoch": 0.09995200475381485, + "grad_norm": 0.7724565267562866, + "learning_rate": 8e-05, + "loss": 1.638, + "num_input_tokens_seen": 231187424, + "step": 1640 + }, + { + "epoch": 0.10056146819743568, + "grad_norm": 0.7245753407478333, + "learning_rate": 8e-05, + "loss": 1.712, + "num_input_tokens_seen": 232600616, + "step": 1650 + }, + { + "epoch": 0.1011709316410565, + "grad_norm": 0.6499654650688171, + "learning_rate": 8e-05, + "loss": 1.6265, + "num_input_tokens_seen": 234006344, + "step": 1660 + }, + { + "epoch": 0.10178039508467733, + "grad_norm": 0.6563583612442017, + "learning_rate": 8e-05, + "loss": 1.664, + "num_input_tokens_seen": 235416748, + "step": 1670 + }, + { + "epoch": 0.10238985852829816, + "grad_norm": 0.7042690515518188, + "learning_rate": 8e-05, + "loss": 1.6468, + "num_input_tokens_seen": 236830896, + "step": 1680 + }, + { + "epoch": 0.10299932197191897, + "grad_norm": 0.7631860375404358, + "learning_rate": 8e-05, + "loss": 1.6325, + "num_input_tokens_seen": 238262140, + "step": 1690 + }, + { + "epoch": 0.10360878541553979, + "grad_norm": 0.6432304382324219, + "learning_rate": 8e-05, + "loss": 1.6756, + "num_input_tokens_seen": 239674040, + "step": 1700 + }, + { + "epoch": 0.10421824885916062, + "grad_norm": 0.7273523807525635, + "learning_rate": 8e-05, + "loss": 1.6661, + "num_input_tokens_seen": 241093432, + "step": 1710 + }, + { + "epoch": 0.10482771230278144, + "grad_norm": 0.781891942024231, + "learning_rate": 8e-05, + "loss": 1.7187, + "num_input_tokens_seen": 242530300, + "step": 1720 + }, + { + "epoch": 0.10543717574640227, + "grad_norm": 0.6699258089065552, + "learning_rate": 8e-05, + "loss": 1.6269, + "num_input_tokens_seen": 243920956, + "step": 1730 + }, + { + "epoch": 0.10604663919002308, + "grad_norm": 0.7054083943367004, + "learning_rate": 8e-05, + "loss": 1.6728, + "num_input_tokens_seen": 245343376, + "step": 1740 + }, + { + "epoch": 0.1066561026336439, + "grad_norm": 0.7309428453445435, + "learning_rate": 8e-05, + "loss": 1.6168, + "num_input_tokens_seen": 246742064, + "step": 1750 + }, + { + "epoch": 0.10726556607726473, + "grad_norm": 0.6241644620895386, + "learning_rate": 8e-05, + "loss": 1.6413, + "num_input_tokens_seen": 248121468, + "step": 1760 + }, + { + "epoch": 0.10787502952088555, + "grad_norm": 0.7994136214256287, + "learning_rate": 8e-05, + "loss": 1.6438, + "num_input_tokens_seen": 249536668, + "step": 1770 + }, + { + "epoch": 0.10848449296450638, + "grad_norm": 0.6934658885002136, + "learning_rate": 8e-05, + "loss": 1.7127, + "num_input_tokens_seen": 250943296, + "step": 1780 + }, + { + "epoch": 0.10909395640812719, + "grad_norm": 0.7179552912712097, + "learning_rate": 8e-05, + "loss": 1.6016, + "num_input_tokens_seen": 252370756, + "step": 1790 + }, + { + "epoch": 0.10970341985174802, + "grad_norm": 0.7161432504653931, + "learning_rate": 8e-05, + "loss": 1.6472, + "num_input_tokens_seen": 253768292, + "step": 1800 + }, + { + "epoch": 0.11031288329536884, + "grad_norm": 0.749668300151825, + "learning_rate": 8e-05, + "loss": 1.645, + "num_input_tokens_seen": 255228292, + "step": 1810 + }, + { + "epoch": 0.11092234673898967, + "grad_norm": 0.6765767335891724, + "learning_rate": 8e-05, + "loss": 1.6538, + "num_input_tokens_seen": 256611280, + "step": 1820 + }, + { + "epoch": 0.11153181018261048, + "grad_norm": 0.6850834488868713, + "learning_rate": 8e-05, + "loss": 1.7177, + "num_input_tokens_seen": 258027728, + "step": 1830 + }, + { + "epoch": 0.1121412736262313, + "grad_norm": 0.7252477407455444, + "learning_rate": 8e-05, + "loss": 1.6964, + "num_input_tokens_seen": 259428460, + "step": 1840 + }, + { + "epoch": 0.11275073706985213, + "grad_norm": 0.7121752500534058, + "learning_rate": 8e-05, + "loss": 1.6462, + "num_input_tokens_seen": 260851868, + "step": 1850 + }, + { + "epoch": 0.11336020051347295, + "grad_norm": 0.6859217882156372, + "learning_rate": 8e-05, + "loss": 1.6794, + "num_input_tokens_seen": 262264644, + "step": 1860 + }, + { + "epoch": 0.11396966395709378, + "grad_norm": 0.8705602884292603, + "learning_rate": 8e-05, + "loss": 1.6131, + "num_input_tokens_seen": 263652388, + "step": 1870 + }, + { + "epoch": 0.11457912740071459, + "grad_norm": 0.6738168001174927, + "learning_rate": 8e-05, + "loss": 1.6295, + "num_input_tokens_seen": 265074652, + "step": 1880 + }, + { + "epoch": 0.11518859084433541, + "grad_norm": 0.7672312259674072, + "learning_rate": 8e-05, + "loss": 1.7278, + "num_input_tokens_seen": 266487960, + "step": 1890 + }, + { + "epoch": 0.11579805428795624, + "grad_norm": 0.7259830236434937, + "learning_rate": 8e-05, + "loss": 1.5629, + "num_input_tokens_seen": 267907564, + "step": 1900 + }, + { + "epoch": 0.11640751773157706, + "grad_norm": 0.7508255243301392, + "learning_rate": 8e-05, + "loss": 1.6847, + "num_input_tokens_seen": 269339832, + "step": 1910 + }, + { + "epoch": 0.11701698117519789, + "grad_norm": 0.5813190340995789, + "learning_rate": 8e-05, + "loss": 1.6564, + "num_input_tokens_seen": 270751000, + "step": 1920 + }, + { + "epoch": 0.1176264446188187, + "grad_norm": 0.7213466167449951, + "learning_rate": 8e-05, + "loss": 1.6302, + "num_input_tokens_seen": 272212216, + "step": 1930 + }, + { + "epoch": 0.11823590806243953, + "grad_norm": 0.6883914470672607, + "learning_rate": 8e-05, + "loss": 1.5737, + "num_input_tokens_seen": 273691760, + "step": 1940 + }, + { + "epoch": 0.11884537150606035, + "grad_norm": 0.7095319628715515, + "learning_rate": 8e-05, + "loss": 1.6719, + "num_input_tokens_seen": 275114348, + "step": 1950 + }, + { + "epoch": 0.11945483494968118, + "grad_norm": 0.6590485572814941, + "learning_rate": 8e-05, + "loss": 1.6855, + "num_input_tokens_seen": 276525856, + "step": 1960 + }, + { + "epoch": 0.120064298393302, + "grad_norm": 0.7270433306694031, + "learning_rate": 8e-05, + "loss": 1.6401, + "num_input_tokens_seen": 277930536, + "step": 1970 + }, + { + "epoch": 0.12067376183692281, + "grad_norm": 0.6123723387718201, + "learning_rate": 8e-05, + "loss": 1.6706, + "num_input_tokens_seen": 279346748, + "step": 1980 + }, + { + "epoch": 0.12128322528054364, + "grad_norm": 0.7456077337265015, + "learning_rate": 8e-05, + "loss": 1.7432, + "num_input_tokens_seen": 280734376, + "step": 1990 + }, + { + "epoch": 0.12189268872416446, + "grad_norm": 0.6831845641136169, + "learning_rate": 8e-05, + "loss": 1.6291, + "num_input_tokens_seen": 282132752, + "step": 2000 + }, + { + "epoch": 0.12250215216778529, + "grad_norm": 0.6901050209999084, + "learning_rate": 8e-05, + "loss": 1.6807, + "num_input_tokens_seen": 283584816, + "step": 2010 + }, + { + "epoch": 0.12311161561140611, + "grad_norm": 0.7861285209655762, + "learning_rate": 8e-05, + "loss": 1.7389, + "num_input_tokens_seen": 285064504, + "step": 2020 + }, + { + "epoch": 0.12372107905502693, + "grad_norm": 0.5943942070007324, + "learning_rate": 8e-05, + "loss": 1.6045, + "num_input_tokens_seen": 286464552, + "step": 2030 + }, + { + "epoch": 0.12433054249864775, + "grad_norm": 0.7200583815574646, + "learning_rate": 8e-05, + "loss": 1.6617, + "num_input_tokens_seen": 287842704, + "step": 2040 + }, + { + "epoch": 0.12494000594226858, + "grad_norm": 0.6980604529380798, + "learning_rate": 8e-05, + "loss": 1.5976, + "num_input_tokens_seen": 289280200, + "step": 2050 + }, + { + "epoch": 0.1255494693858894, + "grad_norm": 0.6354398727416992, + "learning_rate": 8e-05, + "loss": 1.6191, + "num_input_tokens_seen": 290665088, + "step": 2060 + }, + { + "epoch": 0.1261589328295102, + "grad_norm": 0.654530942440033, + "learning_rate": 8e-05, + "loss": 1.6089, + "num_input_tokens_seen": 292110316, + "step": 2070 + }, + { + "epoch": 0.12676839627313105, + "grad_norm": 0.6449682712554932, + "learning_rate": 8e-05, + "loss": 1.6043, + "num_input_tokens_seen": 293537852, + "step": 2080 + }, + { + "epoch": 0.12737785971675186, + "grad_norm": 0.7298957705497742, + "learning_rate": 8e-05, + "loss": 1.6753, + "num_input_tokens_seen": 294958232, + "step": 2090 + }, + { + "epoch": 0.12798732316037267, + "grad_norm": 0.7615550756454468, + "learning_rate": 8e-05, + "loss": 1.5889, + "num_input_tokens_seen": 296341104, + "step": 2100 + }, + { + "epoch": 0.1285967866039935, + "grad_norm": 0.6716745495796204, + "learning_rate": 8e-05, + "loss": 1.6452, + "num_input_tokens_seen": 297751192, + "step": 2110 + }, + { + "epoch": 0.12920625004761432, + "grad_norm": 0.6735673546791077, + "learning_rate": 8e-05, + "loss": 1.5333, + "num_input_tokens_seen": 299157160, + "step": 2120 + }, + { + "epoch": 0.12981571349123516, + "grad_norm": 0.6383569836616516, + "learning_rate": 8e-05, + "loss": 1.7071, + "num_input_tokens_seen": 300556440, + "step": 2130 + }, + { + "epoch": 0.13042517693485597, + "grad_norm": 0.6580716967582703, + "learning_rate": 8e-05, + "loss": 1.5882, + "num_input_tokens_seen": 301950588, + "step": 2140 + }, + { + "epoch": 0.13103464037847679, + "grad_norm": 0.7268714904785156, + "learning_rate": 8e-05, + "loss": 1.6112, + "num_input_tokens_seen": 303350336, + "step": 2150 + }, + { + "epoch": 0.13164410382209762, + "grad_norm": 0.6901399493217468, + "learning_rate": 8e-05, + "loss": 1.572, + "num_input_tokens_seen": 304781136, + "step": 2160 + }, + { + "epoch": 0.13225356726571844, + "grad_norm": 0.6628725528717041, + "learning_rate": 8e-05, + "loss": 1.6983, + "num_input_tokens_seen": 306176884, + "step": 2170 + }, + { + "epoch": 0.13286303070933927, + "grad_norm": 0.688841700553894, + "learning_rate": 8e-05, + "loss": 1.6409, + "num_input_tokens_seen": 307601844, + "step": 2180 + }, + { + "epoch": 0.1334724941529601, + "grad_norm": 0.6499971747398376, + "learning_rate": 8e-05, + "loss": 1.5889, + "num_input_tokens_seen": 308995452, + "step": 2190 + }, + { + "epoch": 0.1340819575965809, + "grad_norm": 0.6463174223899841, + "learning_rate": 8e-05, + "loss": 1.6411, + "num_input_tokens_seen": 310417804, + "step": 2200 + }, + { + "epoch": 0.13469142104020174, + "grad_norm": 0.6206100583076477, + "learning_rate": 8e-05, + "loss": 1.59, + "num_input_tokens_seen": 311846032, + "step": 2210 + }, + { + "epoch": 0.13530088448382255, + "grad_norm": 0.6116955876350403, + "learning_rate": 8e-05, + "loss": 1.6057, + "num_input_tokens_seen": 313234256, + "step": 2220 + }, + { + "epoch": 0.1359103479274434, + "grad_norm": 0.7880204916000366, + "learning_rate": 8e-05, + "loss": 1.6058, + "num_input_tokens_seen": 314639172, + "step": 2230 + }, + { + "epoch": 0.1365198113710642, + "grad_norm": 0.6306136846542358, + "learning_rate": 8e-05, + "loss": 1.665, + "num_input_tokens_seen": 316031736, + "step": 2240 + }, + { + "epoch": 0.137129274814685, + "grad_norm": 0.6810283660888672, + "learning_rate": 8e-05, + "loss": 1.6544, + "num_input_tokens_seen": 317438412, + "step": 2250 + }, + { + "epoch": 0.13773873825830585, + "grad_norm": 0.671161949634552, + "learning_rate": 8e-05, + "loss": 1.6091, + "num_input_tokens_seen": 318863568, + "step": 2260 + }, + { + "epoch": 0.13834820170192666, + "grad_norm": 0.6195483803749084, + "learning_rate": 8e-05, + "loss": 1.5676, + "num_input_tokens_seen": 320256312, + "step": 2270 + }, + { + "epoch": 0.1389576651455475, + "grad_norm": 0.6664822101593018, + "learning_rate": 8e-05, + "loss": 1.6302, + "num_input_tokens_seen": 321670092, + "step": 2280 + }, + { + "epoch": 0.1395671285891683, + "grad_norm": 0.6218630075454712, + "learning_rate": 8e-05, + "loss": 1.6288, + "num_input_tokens_seen": 323080412, + "step": 2290 + }, + { + "epoch": 0.14017659203278912, + "grad_norm": 0.676080048084259, + "learning_rate": 8e-05, + "loss": 1.5945, + "num_input_tokens_seen": 324469600, + "step": 2300 + }, + { + "epoch": 0.14078605547640996, + "grad_norm": 0.5576323866844177, + "learning_rate": 8e-05, + "loss": 1.5904, + "num_input_tokens_seen": 325857624, + "step": 2310 + }, + { + "epoch": 0.14139551892003077, + "grad_norm": 0.6566000580787659, + "learning_rate": 8e-05, + "loss": 1.6492, + "num_input_tokens_seen": 327258636, + "step": 2320 + }, + { + "epoch": 0.1420049823636516, + "grad_norm": 0.6909166574478149, + "learning_rate": 8e-05, + "loss": 1.6071, + "num_input_tokens_seen": 328702764, + "step": 2330 + }, + { + "epoch": 0.14261444580727242, + "grad_norm": 0.647297739982605, + "learning_rate": 8e-05, + "loss": 1.5577, + "num_input_tokens_seen": 330100304, + "step": 2340 + }, + { + "epoch": 0.14322390925089323, + "grad_norm": 0.6550387740135193, + "learning_rate": 8e-05, + "loss": 1.6879, + "num_input_tokens_seen": 331549276, + "step": 2350 + }, + { + "epoch": 0.14383337269451407, + "grad_norm": 0.6743698716163635, + "learning_rate": 8e-05, + "loss": 1.7019, + "num_input_tokens_seen": 332916584, + "step": 2360 + }, + { + "epoch": 0.14444283613813488, + "grad_norm": 0.581924557685852, + "learning_rate": 8e-05, + "loss": 1.6591, + "num_input_tokens_seen": 334299004, + "step": 2370 + }, + { + "epoch": 0.14505229958175572, + "grad_norm": 0.6334020495414734, + "learning_rate": 8e-05, + "loss": 1.6316, + "num_input_tokens_seen": 335702312, + "step": 2380 + }, + { + "epoch": 0.14566176302537653, + "grad_norm": 0.5923337340354919, + "learning_rate": 8e-05, + "loss": 1.5888, + "num_input_tokens_seen": 337127304, + "step": 2390 + }, + { + "epoch": 0.14627122646899735, + "grad_norm": 0.6918947696685791, + "learning_rate": 8e-05, + "loss": 1.6433, + "num_input_tokens_seen": 338550764, + "step": 2400 + }, + { + "epoch": 0.14688068991261818, + "grad_norm": 0.7656240463256836, + "learning_rate": 8e-05, + "loss": 1.5949, + "num_input_tokens_seen": 339962352, + "step": 2410 + }, + { + "epoch": 0.147490153356239, + "grad_norm": 0.7092509269714355, + "learning_rate": 8e-05, + "loss": 1.6059, + "num_input_tokens_seen": 341363788, + "step": 2420 + }, + { + "epoch": 0.14809961679985983, + "grad_norm": 0.6121082901954651, + "learning_rate": 8e-05, + "loss": 1.5637, + "num_input_tokens_seen": 342800568, + "step": 2430 + }, + { + "epoch": 0.14870908024348065, + "grad_norm": 0.5944322347640991, + "learning_rate": 8e-05, + "loss": 1.6384, + "num_input_tokens_seen": 344226896, + "step": 2440 + }, + { + "epoch": 0.14931854368710146, + "grad_norm": 0.6050378084182739, + "learning_rate": 8e-05, + "loss": 1.6027, + "num_input_tokens_seen": 345639348, + "step": 2450 + }, + { + "epoch": 0.1499280071307223, + "grad_norm": 0.7982641458511353, + "learning_rate": 8e-05, + "loss": 1.5957, + "num_input_tokens_seen": 347053896, + "step": 2460 + }, + { + "epoch": 0.1505374705743431, + "grad_norm": 0.6125765442848206, + "learning_rate": 8e-05, + "loss": 1.576, + "num_input_tokens_seen": 348438848, + "step": 2470 + }, + { + "epoch": 0.15114693401796395, + "grad_norm": 0.728951096534729, + "learning_rate": 8e-05, + "loss": 1.6711, + "num_input_tokens_seen": 349823624, + "step": 2480 + }, + { + "epoch": 0.15175639746158476, + "grad_norm": 0.5719558596611023, + "learning_rate": 8e-05, + "loss": 1.6289, + "num_input_tokens_seen": 351265596, + "step": 2490 + }, + { + "epoch": 0.15236586090520557, + "grad_norm": 0.5965576767921448, + "learning_rate": 8e-05, + "loss": 1.6114, + "num_input_tokens_seen": 352677476, + "step": 2500 + }, + { + "epoch": 0.1529753243488264, + "grad_norm": 0.6597477197647095, + "learning_rate": 8e-05, + "loss": 1.5929, + "num_input_tokens_seen": 354079432, + "step": 2510 + }, + { + "epoch": 0.15358478779244722, + "grad_norm": 0.633777379989624, + "learning_rate": 8e-05, + "loss": 1.6062, + "num_input_tokens_seen": 355500376, + "step": 2520 + }, + { + "epoch": 0.15419425123606806, + "grad_norm": 0.6986291408538818, + "learning_rate": 8e-05, + "loss": 1.5994, + "num_input_tokens_seen": 356938480, + "step": 2530 + }, + { + "epoch": 0.15480371467968887, + "grad_norm": 0.6283817291259766, + "learning_rate": 8e-05, + "loss": 1.6403, + "num_input_tokens_seen": 358347276, + "step": 2540 + }, + { + "epoch": 0.15541317812330968, + "grad_norm": 0.6436857581138611, + "learning_rate": 8e-05, + "loss": 1.5707, + "num_input_tokens_seen": 359743248, + "step": 2550 + }, + { + "epoch": 0.15602264156693052, + "grad_norm": 0.6602137684822083, + "learning_rate": 8e-05, + "loss": 1.6227, + "num_input_tokens_seen": 361133912, + "step": 2560 + }, + { + "epoch": 0.15663210501055133, + "grad_norm": 0.7311964631080627, + "learning_rate": 8e-05, + "loss": 1.5496, + "num_input_tokens_seen": 362551344, + "step": 2570 + }, + { + "epoch": 0.15724156845417217, + "grad_norm": 0.6504777073860168, + "learning_rate": 8e-05, + "loss": 1.595, + "num_input_tokens_seen": 363943344, + "step": 2580 + }, + { + "epoch": 0.15785103189779298, + "grad_norm": 0.6566677093505859, + "learning_rate": 8e-05, + "loss": 1.5807, + "num_input_tokens_seen": 365347512, + "step": 2590 + }, + { + "epoch": 0.1584604953414138, + "grad_norm": 0.6459395885467529, + "learning_rate": 8e-05, + "loss": 1.6358, + "num_input_tokens_seen": 366720924, + "step": 2600 + }, + { + "epoch": 0.15906995878503463, + "grad_norm": 0.6944287419319153, + "learning_rate": 8e-05, + "loss": 1.586, + "num_input_tokens_seen": 368150964, + "step": 2610 + }, + { + "epoch": 0.15967942222865544, + "grad_norm": 0.5620209574699402, + "learning_rate": 8e-05, + "loss": 1.5993, + "num_input_tokens_seen": 369511972, + "step": 2620 + }, + { + "epoch": 0.16028888567227628, + "grad_norm": 0.6273583173751831, + "learning_rate": 8e-05, + "loss": 1.5833, + "num_input_tokens_seen": 370942616, + "step": 2630 + }, + { + "epoch": 0.1608983491158971, + "grad_norm": 0.669262707233429, + "learning_rate": 8e-05, + "loss": 1.5427, + "num_input_tokens_seen": 372347592, + "step": 2640 + }, + { + "epoch": 0.1615078125595179, + "grad_norm": 0.5964462161064148, + "learning_rate": 8e-05, + "loss": 1.6693, + "num_input_tokens_seen": 373729360, + "step": 2650 + }, + { + "epoch": 0.16211727600313874, + "grad_norm": 0.608135461807251, + "learning_rate": 8e-05, + "loss": 1.5437, + "num_input_tokens_seen": 375181280, + "step": 2660 + }, + { + "epoch": 0.16272673944675956, + "grad_norm": 0.6479006409645081, + "learning_rate": 8e-05, + "loss": 1.6328, + "num_input_tokens_seen": 376596372, + "step": 2670 + }, + { + "epoch": 0.1633362028903804, + "grad_norm": 0.6235690712928772, + "learning_rate": 8e-05, + "loss": 1.6596, + "num_input_tokens_seen": 378020808, + "step": 2680 + }, + { + "epoch": 0.1639456663340012, + "grad_norm": 0.6381850838661194, + "learning_rate": 8e-05, + "loss": 1.5794, + "num_input_tokens_seen": 379380024, + "step": 2690 + }, + { + "epoch": 0.16455512977762202, + "grad_norm": 0.6151752471923828, + "learning_rate": 8e-05, + "loss": 1.5792, + "num_input_tokens_seen": 380803184, + "step": 2700 + }, + { + "epoch": 0.16516459322124286, + "grad_norm": 0.6571025252342224, + "learning_rate": 8e-05, + "loss": 1.5779, + "num_input_tokens_seen": 382188904, + "step": 2710 + }, + { + "epoch": 0.16577405666486367, + "grad_norm": 0.694426417350769, + "learning_rate": 8e-05, + "loss": 1.6054, + "num_input_tokens_seen": 383612672, + "step": 2720 + }, + { + "epoch": 0.1663835201084845, + "grad_norm": 0.6907577514648438, + "learning_rate": 8e-05, + "loss": 1.5726, + "num_input_tokens_seen": 385033960, + "step": 2730 + }, + { + "epoch": 0.16699298355210532, + "grad_norm": 0.7059677243232727, + "learning_rate": 8e-05, + "loss": 1.5798, + "num_input_tokens_seen": 386416964, + "step": 2740 + }, + { + "epoch": 0.16760244699572613, + "grad_norm": 0.6744717359542847, + "learning_rate": 8e-05, + "loss": 1.6229, + "num_input_tokens_seen": 387838080, + "step": 2750 + }, + { + "epoch": 0.16821191043934697, + "grad_norm": 0.6473004817962646, + "learning_rate": 8e-05, + "loss": 1.5853, + "num_input_tokens_seen": 389250364, + "step": 2760 + }, + { + "epoch": 0.16882137388296778, + "grad_norm": 0.7390977144241333, + "learning_rate": 8e-05, + "loss": 1.5684, + "num_input_tokens_seen": 390679648, + "step": 2770 + }, + { + "epoch": 0.16943083732658862, + "grad_norm": 0.7005506753921509, + "learning_rate": 8e-05, + "loss": 1.6613, + "num_input_tokens_seen": 392061336, + "step": 2780 + }, + { + "epoch": 0.17004030077020943, + "grad_norm": 0.6911351084709167, + "learning_rate": 8e-05, + "loss": 1.5572, + "num_input_tokens_seen": 393461980, + "step": 2790 + }, + { + "epoch": 0.17064976421383024, + "grad_norm": 0.5468894243240356, + "learning_rate": 8e-05, + "loss": 1.589, + "num_input_tokens_seen": 394847452, + "step": 2800 + }, + { + "epoch": 0.17125922765745108, + "grad_norm": 0.5998040437698364, + "learning_rate": 8e-05, + "loss": 1.5748, + "num_input_tokens_seen": 396233508, + "step": 2810 + }, + { + "epoch": 0.1718686911010719, + "grad_norm": 0.7814628481864929, + "learning_rate": 8e-05, + "loss": 1.5206, + "num_input_tokens_seen": 397650544, + "step": 2820 + }, + { + "epoch": 0.17247815454469273, + "grad_norm": 0.6909236311912537, + "learning_rate": 8e-05, + "loss": 1.5474, + "num_input_tokens_seen": 399046560, + "step": 2830 + }, + { + "epoch": 0.17308761798831354, + "grad_norm": 0.7126919627189636, + "learning_rate": 8e-05, + "loss": 1.5615, + "num_input_tokens_seen": 400408700, + "step": 2840 + }, + { + "epoch": 0.17369708143193435, + "grad_norm": 0.6222878098487854, + "learning_rate": 8e-05, + "loss": 1.5642, + "num_input_tokens_seen": 401806840, + "step": 2850 + }, + { + "epoch": 0.1743065448755552, + "grad_norm": 0.6794285774230957, + "learning_rate": 8e-05, + "loss": 1.5793, + "num_input_tokens_seen": 403203952, + "step": 2860 + }, + { + "epoch": 0.174916008319176, + "grad_norm": 0.609735906124115, + "learning_rate": 8e-05, + "loss": 1.6068, + "num_input_tokens_seen": 404591240, + "step": 2870 + }, + { + "epoch": 0.17552547176279681, + "grad_norm": 0.6210402250289917, + "learning_rate": 8e-05, + "loss": 1.6045, + "num_input_tokens_seen": 405971452, + "step": 2880 + }, + { + "epoch": 0.17613493520641765, + "grad_norm": 0.6104692816734314, + "learning_rate": 8e-05, + "loss": 1.5263, + "num_input_tokens_seen": 407388256, + "step": 2890 + }, + { + "epoch": 0.17674439865003846, + "grad_norm": 0.6604185700416565, + "learning_rate": 8e-05, + "loss": 1.6198, + "num_input_tokens_seen": 408813112, + "step": 2900 + }, + { + "epoch": 0.1773538620936593, + "grad_norm": 0.6400834321975708, + "learning_rate": 8e-05, + "loss": 1.5672, + "num_input_tokens_seen": 410235500, + "step": 2910 + }, + { + "epoch": 0.17796332553728011, + "grad_norm": 0.6400529742240906, + "learning_rate": 8e-05, + "loss": 1.5761, + "num_input_tokens_seen": 411639812, + "step": 2920 + }, + { + "epoch": 0.17857278898090093, + "grad_norm": 0.603792130947113, + "learning_rate": 8e-05, + "loss": 1.6047, + "num_input_tokens_seen": 413089172, + "step": 2930 + }, + { + "epoch": 0.17918225242452177, + "grad_norm": 0.6779669523239136, + "learning_rate": 8e-05, + "loss": 1.5808, + "num_input_tokens_seen": 414491152, + "step": 2940 + }, + { + "epoch": 0.17979171586814258, + "grad_norm": 0.6752368807792664, + "learning_rate": 8e-05, + "loss": 1.5736, + "num_input_tokens_seen": 415907296, + "step": 2950 + }, + { + "epoch": 0.18040117931176342, + "grad_norm": 0.6246203184127808, + "learning_rate": 8e-05, + "loss": 1.5875, + "num_input_tokens_seen": 417303208, + "step": 2960 + }, + { + "epoch": 0.18101064275538423, + "grad_norm": 0.6521744728088379, + "learning_rate": 8e-05, + "loss": 1.586, + "num_input_tokens_seen": 418730556, + "step": 2970 + }, + { + "epoch": 0.18162010619900504, + "grad_norm": 0.6168652176856995, + "learning_rate": 8e-05, + "loss": 1.541, + "num_input_tokens_seen": 420143632, + "step": 2980 + }, + { + "epoch": 0.18222956964262588, + "grad_norm": 0.6491835713386536, + "learning_rate": 8e-05, + "loss": 1.551, + "num_input_tokens_seen": 421602604, + "step": 2990 + }, + { + "epoch": 0.1828390330862467, + "grad_norm": 0.8011573553085327, + "learning_rate": 8e-05, + "loss": 1.6447, + "num_input_tokens_seen": 423013584, + "step": 3000 + }, + { + "epoch": 0.18344849652986753, + "grad_norm": 0.641477644443512, + "learning_rate": 8e-05, + "loss": 1.5983, + "num_input_tokens_seen": 424408696, + "step": 3010 + }, + { + "epoch": 0.18405795997348834, + "grad_norm": 0.6406200528144836, + "learning_rate": 8e-05, + "loss": 1.6416, + "num_input_tokens_seen": 425827168, + "step": 3020 + }, + { + "epoch": 0.18466742341710915, + "grad_norm": 0.6057642102241516, + "learning_rate": 8e-05, + "loss": 1.5256, + "num_input_tokens_seen": 427245472, + "step": 3030 + }, + { + "epoch": 0.18527688686073, + "grad_norm": 0.6526055932044983, + "learning_rate": 8e-05, + "loss": 1.4959, + "num_input_tokens_seen": 428628168, + "step": 3040 + }, + { + "epoch": 0.1858863503043508, + "grad_norm": 0.6561000943183899, + "learning_rate": 8e-05, + "loss": 1.5155, + "num_input_tokens_seen": 430033608, + "step": 3050 + }, + { + "epoch": 0.18649581374797164, + "grad_norm": 0.6516884565353394, + "learning_rate": 8e-05, + "loss": 1.6234, + "num_input_tokens_seen": 431432912, + "step": 3060 + }, + { + "epoch": 0.18710527719159245, + "grad_norm": 0.6904422044754028, + "learning_rate": 8e-05, + "loss": 1.4835, + "num_input_tokens_seen": 432800096, + "step": 3070 + }, + { + "epoch": 0.18771474063521326, + "grad_norm": 0.5921774506568909, + "learning_rate": 8e-05, + "loss": 1.5685, + "num_input_tokens_seen": 434190712, + "step": 3080 + }, + { + "epoch": 0.1883242040788341, + "grad_norm": 0.5747608542442322, + "learning_rate": 8e-05, + "loss": 1.6084, + "num_input_tokens_seen": 435587468, + "step": 3090 + }, + { + "epoch": 0.1889336675224549, + "grad_norm": 0.6614587306976318, + "learning_rate": 8e-05, + "loss": 1.5626, + "num_input_tokens_seen": 437003988, + "step": 3100 + }, + { + "epoch": 0.18954313096607575, + "grad_norm": 0.5542994141578674, + "learning_rate": 8e-05, + "loss": 1.6015, + "num_input_tokens_seen": 438403756, + "step": 3110 + }, + { + "epoch": 0.19015259440969656, + "grad_norm": 0.6427204608917236, + "learning_rate": 8e-05, + "loss": 1.529, + "num_input_tokens_seen": 439826144, + "step": 3120 + }, + { + "epoch": 0.19076205785331737, + "grad_norm": 0.560806930065155, + "learning_rate": 8e-05, + "loss": 1.5518, + "num_input_tokens_seen": 441217644, + "step": 3130 + }, + { + "epoch": 0.1913715212969382, + "grad_norm": 0.6509169340133667, + "learning_rate": 8e-05, + "loss": 1.5269, + "num_input_tokens_seen": 442617344, + "step": 3140 + }, + { + "epoch": 0.19198098474055902, + "grad_norm": 0.6311179995536804, + "learning_rate": 8e-05, + "loss": 1.5414, + "num_input_tokens_seen": 444028916, + "step": 3150 + }, + { + "epoch": 0.19259044818417986, + "grad_norm": 0.6070147752761841, + "learning_rate": 8e-05, + "loss": 1.5521, + "num_input_tokens_seen": 445413608, + "step": 3160 + }, + { + "epoch": 0.19319991162780067, + "grad_norm": 0.6126248836517334, + "learning_rate": 8e-05, + "loss": 1.6203, + "num_input_tokens_seen": 446824748, + "step": 3170 + }, + { + "epoch": 0.19380937507142149, + "grad_norm": 0.6572065949440002, + "learning_rate": 8e-05, + "loss": 1.56, + "num_input_tokens_seen": 448217780, + "step": 3180 + }, + { + "epoch": 0.19441883851504232, + "grad_norm": 0.6731168627738953, + "learning_rate": 8e-05, + "loss": 1.6071, + "num_input_tokens_seen": 449669388, + "step": 3190 + }, + { + "epoch": 0.19502830195866314, + "grad_norm": 0.5746667385101318, + "learning_rate": 8e-05, + "loss": 1.6396, + "num_input_tokens_seen": 451076256, + "step": 3200 + }, + { + "epoch": 0.19563776540228398, + "grad_norm": 0.5883861184120178, + "learning_rate": 8e-05, + "loss": 1.5441, + "num_input_tokens_seen": 452433176, + "step": 3210 + }, + { + "epoch": 0.1962472288459048, + "grad_norm": 0.6213635206222534, + "learning_rate": 8e-05, + "loss": 1.5408, + "num_input_tokens_seen": 453864744, + "step": 3220 + }, + { + "epoch": 0.1968566922895256, + "grad_norm": 0.5844454169273376, + "learning_rate": 8e-05, + "loss": 1.5892, + "num_input_tokens_seen": 455266916, + "step": 3230 + }, + { + "epoch": 0.19746615573314644, + "grad_norm": 0.7327582240104675, + "learning_rate": 8e-05, + "loss": 1.5541, + "num_input_tokens_seen": 456655360, + "step": 3240 + }, + { + "epoch": 0.19807561917676725, + "grad_norm": 0.7098453044891357, + "learning_rate": 8e-05, + "loss": 1.5219, + "num_input_tokens_seen": 458100268, + "step": 3250 + }, + { + "epoch": 0.1986850826203881, + "grad_norm": 0.6221525073051453, + "learning_rate": 8e-05, + "loss": 1.5587, + "num_input_tokens_seen": 459523316, + "step": 3260 + }, + { + "epoch": 0.1992945460640089, + "grad_norm": 0.646511971950531, + "learning_rate": 8e-05, + "loss": 1.6495, + "num_input_tokens_seen": 460960464, + "step": 3270 + }, + { + "epoch": 0.1999040095076297, + "grad_norm": 0.6622752547264099, + "learning_rate": 8e-05, + "loss": 1.5559, + "num_input_tokens_seen": 462371980, + "step": 3280 + }, + { + "epoch": 0.20051347295125055, + "grad_norm": 0.5687728524208069, + "learning_rate": 8e-05, + "loss": 1.5691, + "num_input_tokens_seen": 463756352, + "step": 3290 + }, + { + "epoch": 0.20112293639487136, + "grad_norm": 0.6098371148109436, + "learning_rate": 8e-05, + "loss": 1.5524, + "num_input_tokens_seen": 465193884, + "step": 3300 + }, + { + "epoch": 0.2017323998384922, + "grad_norm": 0.681311309337616, + "learning_rate": 8e-05, + "loss": 1.5381, + "num_input_tokens_seen": 466639236, + "step": 3310 + }, + { + "epoch": 0.202341863282113, + "grad_norm": 0.5747373104095459, + "learning_rate": 8e-05, + "loss": 1.5177, + "num_input_tokens_seen": 468072688, + "step": 3320 + }, + { + "epoch": 0.20295132672573382, + "grad_norm": 0.5559400320053101, + "learning_rate": 8e-05, + "loss": 1.5873, + "num_input_tokens_seen": 469442532, + "step": 3330 + }, + { + "epoch": 0.20356079016935466, + "grad_norm": 0.6499312520027161, + "learning_rate": 8e-05, + "loss": 1.6033, + "num_input_tokens_seen": 470904716, + "step": 3340 + }, + { + "epoch": 0.20417025361297547, + "grad_norm": 0.5671247839927673, + "learning_rate": 8e-05, + "loss": 1.546, + "num_input_tokens_seen": 472320820, + "step": 3350 + }, + { + "epoch": 0.2047797170565963, + "grad_norm": 0.605981171131134, + "learning_rate": 8e-05, + "loss": 1.5692, + "num_input_tokens_seen": 473731260, + "step": 3360 + }, + { + "epoch": 0.20538918050021712, + "grad_norm": 0.7082146406173706, + "learning_rate": 8e-05, + "loss": 1.502, + "num_input_tokens_seen": 475164164, + "step": 3370 + }, + { + "epoch": 0.20599864394383793, + "grad_norm": 0.6158855557441711, + "learning_rate": 8e-05, + "loss": 1.5276, + "num_input_tokens_seen": 476603328, + "step": 3380 + }, + { + "epoch": 0.20660810738745877, + "grad_norm": 0.701566755771637, + "learning_rate": 8e-05, + "loss": 1.451, + "num_input_tokens_seen": 477996612, + "step": 3390 + }, + { + "epoch": 0.20721757083107958, + "grad_norm": 0.7635865211486816, + "learning_rate": 8e-05, + "loss": 1.5975, + "num_input_tokens_seen": 479446800, + "step": 3400 + }, + { + "epoch": 0.20782703427470042, + "grad_norm": 0.5621991753578186, + "learning_rate": 8e-05, + "loss": 1.5216, + "num_input_tokens_seen": 480863616, + "step": 3410 + }, + { + "epoch": 0.20843649771832123, + "grad_norm": 0.6029215455055237, + "learning_rate": 8e-05, + "loss": 1.671, + "num_input_tokens_seen": 482273428, + "step": 3420 + }, + { + "epoch": 0.20904596116194205, + "grad_norm": 0.6263468265533447, + "learning_rate": 8e-05, + "loss": 1.6108, + "num_input_tokens_seen": 483698556, + "step": 3430 + }, + { + "epoch": 0.20965542460556288, + "grad_norm": 0.6057296991348267, + "learning_rate": 8e-05, + "loss": 1.5797, + "num_input_tokens_seen": 485164152, + "step": 3440 + }, + { + "epoch": 0.2102648880491837, + "grad_norm": 0.639839231967926, + "learning_rate": 8e-05, + "loss": 1.5568, + "num_input_tokens_seen": 486564516, + "step": 3450 + }, + { + "epoch": 0.21087435149280453, + "grad_norm": 0.6368933320045471, + "learning_rate": 8e-05, + "loss": 1.5162, + "num_input_tokens_seen": 487982336, + "step": 3460 + }, + { + "epoch": 0.21148381493642535, + "grad_norm": 0.6618413925170898, + "learning_rate": 8e-05, + "loss": 1.5737, + "num_input_tokens_seen": 489402684, + "step": 3470 + }, + { + "epoch": 0.21209327838004616, + "grad_norm": 0.7187027335166931, + "learning_rate": 8e-05, + "loss": 1.5436, + "num_input_tokens_seen": 490822360, + "step": 3480 + }, + { + "epoch": 0.212702741823667, + "grad_norm": 0.6799234747886658, + "learning_rate": 8e-05, + "loss": 1.6655, + "num_input_tokens_seen": 492270836, + "step": 3490 + }, + { + "epoch": 0.2133122052672878, + "grad_norm": 0.6822348833084106, + "learning_rate": 8e-05, + "loss": 1.5637, + "num_input_tokens_seen": 493667632, + "step": 3500 + }, + { + "epoch": 0.21392166871090865, + "grad_norm": 0.5559425950050354, + "learning_rate": 8e-05, + "loss": 1.6041, + "num_input_tokens_seen": 495092028, + "step": 3510 + }, + { + "epoch": 0.21453113215452946, + "grad_norm": 0.6507359147071838, + "learning_rate": 8e-05, + "loss": 1.5193, + "num_input_tokens_seen": 496485460, + "step": 3520 + }, + { + "epoch": 0.21514059559815027, + "grad_norm": 0.6291355490684509, + "learning_rate": 8e-05, + "loss": 1.5885, + "num_input_tokens_seen": 497903368, + "step": 3530 + }, + { + "epoch": 0.2157500590417711, + "grad_norm": 0.6117660999298096, + "learning_rate": 8e-05, + "loss": 1.583, + "num_input_tokens_seen": 499318444, + "step": 3540 + }, + { + "epoch": 0.21635952248539192, + "grad_norm": 0.6362354159355164, + "learning_rate": 8e-05, + "loss": 1.5588, + "num_input_tokens_seen": 500692116, + "step": 3550 + }, + { + "epoch": 0.21696898592901276, + "grad_norm": 0.7075550556182861, + "learning_rate": 8e-05, + "loss": 1.6182, + "num_input_tokens_seen": 502070960, + "step": 3560 + }, + { + "epoch": 0.21757844937263357, + "grad_norm": 0.5841058492660522, + "learning_rate": 8e-05, + "loss": 1.5713, + "num_input_tokens_seen": 503481416, + "step": 3570 + }, + { + "epoch": 0.21818791281625438, + "grad_norm": 0.6521745920181274, + "learning_rate": 8e-05, + "loss": 1.5641, + "num_input_tokens_seen": 504920860, + "step": 3580 + }, + { + "epoch": 0.21879737625987522, + "grad_norm": 0.527927041053772, + "learning_rate": 8e-05, + "loss": 1.5431, + "num_input_tokens_seen": 506389836, + "step": 3590 + }, + { + "epoch": 0.21940683970349603, + "grad_norm": 0.6047568321228027, + "learning_rate": 8e-05, + "loss": 1.5653, + "num_input_tokens_seen": 507827788, + "step": 3600 + }, + { + "epoch": 0.22001630314711687, + "grad_norm": 0.5986194610595703, + "learning_rate": 8e-05, + "loss": 1.5669, + "num_input_tokens_seen": 509254352, + "step": 3610 + }, + { + "epoch": 0.22062576659073768, + "grad_norm": 0.7015334367752075, + "learning_rate": 8e-05, + "loss": 1.5154, + "num_input_tokens_seen": 510665164, + "step": 3620 + }, + { + "epoch": 0.2212352300343585, + "grad_norm": 0.6172800064086914, + "learning_rate": 8e-05, + "loss": 1.5364, + "num_input_tokens_seen": 512088312, + "step": 3630 + }, + { + "epoch": 0.22184469347797933, + "grad_norm": 0.7089115381240845, + "learning_rate": 8e-05, + "loss": 1.644, + "num_input_tokens_seen": 513489328, + "step": 3640 + }, + { + "epoch": 0.22245415692160014, + "grad_norm": 0.5689125657081604, + "learning_rate": 8e-05, + "loss": 1.5437, + "num_input_tokens_seen": 514882072, + "step": 3650 + }, + { + "epoch": 0.22306362036522095, + "grad_norm": 0.610406756401062, + "learning_rate": 8e-05, + "loss": 1.5451, + "num_input_tokens_seen": 516281980, + "step": 3660 + }, + { + "epoch": 0.2236730838088418, + "grad_norm": 0.5259641408920288, + "learning_rate": 8e-05, + "loss": 1.5613, + "num_input_tokens_seen": 517696272, + "step": 3670 + }, + { + "epoch": 0.2242825472524626, + "grad_norm": 0.6053969264030457, + "learning_rate": 8e-05, + "loss": 1.5655, + "num_input_tokens_seen": 519099176, + "step": 3680 + }, + { + "epoch": 0.22489201069608344, + "grad_norm": 0.6376533508300781, + "learning_rate": 8e-05, + "loss": 1.5481, + "num_input_tokens_seen": 520548316, + "step": 3690 + }, + { + "epoch": 0.22550147413970426, + "grad_norm": 0.6387814879417419, + "learning_rate": 8e-05, + "loss": 1.5541, + "num_input_tokens_seen": 521977404, + "step": 3700 + }, + { + "epoch": 0.22611093758332507, + "grad_norm": 0.5563859343528748, + "learning_rate": 8e-05, + "loss": 1.5862, + "num_input_tokens_seen": 523431332, + "step": 3710 + }, + { + "epoch": 0.2267204010269459, + "grad_norm": 0.5293943285942078, + "learning_rate": 8e-05, + "loss": 1.5249, + "num_input_tokens_seen": 524857860, + "step": 3720 + }, + { + "epoch": 0.22732986447056672, + "grad_norm": 0.6297414898872375, + "learning_rate": 8e-05, + "loss": 1.5081, + "num_input_tokens_seen": 526236604, + "step": 3730 + }, + { + "epoch": 0.22793932791418756, + "grad_norm": 0.7299327850341797, + "learning_rate": 8e-05, + "loss": 1.5876, + "num_input_tokens_seen": 527654036, + "step": 3740 + }, + { + "epoch": 0.22854879135780837, + "grad_norm": 0.6514659523963928, + "learning_rate": 8e-05, + "loss": 1.5381, + "num_input_tokens_seen": 529042780, + "step": 3750 + }, + { + "epoch": 0.22915825480142918, + "grad_norm": 0.5427886247634888, + "learning_rate": 8e-05, + "loss": 1.5652, + "num_input_tokens_seen": 530471304, + "step": 3760 + }, + { + "epoch": 0.22976771824505002, + "grad_norm": 0.6196932196617126, + "learning_rate": 8e-05, + "loss": 1.5239, + "num_input_tokens_seen": 531891692, + "step": 3770 + }, + { + "epoch": 0.23037718168867083, + "grad_norm": 0.628355860710144, + "learning_rate": 8e-05, + "loss": 1.4701, + "num_input_tokens_seen": 533308800, + "step": 3780 + }, + { + "epoch": 0.23098664513229167, + "grad_norm": 0.5529937744140625, + "learning_rate": 8e-05, + "loss": 1.5185, + "num_input_tokens_seen": 534696500, + "step": 3790 + }, + { + "epoch": 0.23159610857591248, + "grad_norm": 0.5507948994636536, + "learning_rate": 8e-05, + "loss": 1.5384, + "num_input_tokens_seen": 536107284, + "step": 3800 + }, + { + "epoch": 0.2322055720195333, + "grad_norm": 0.6283483505249023, + "learning_rate": 8e-05, + "loss": 1.6055, + "num_input_tokens_seen": 537501972, + "step": 3810 + }, + { + "epoch": 0.23281503546315413, + "grad_norm": 0.6029003858566284, + "learning_rate": 8e-05, + "loss": 1.4906, + "num_input_tokens_seen": 538873180, + "step": 3820 + }, + { + "epoch": 0.23342449890677494, + "grad_norm": 0.6179420351982117, + "learning_rate": 8e-05, + "loss": 1.5218, + "num_input_tokens_seen": 540283024, + "step": 3830 + }, + { + "epoch": 0.23403396235039578, + "grad_norm": 0.6198109984397888, + "learning_rate": 8e-05, + "loss": 1.5806, + "num_input_tokens_seen": 541699176, + "step": 3840 + }, + { + "epoch": 0.2346434257940166, + "grad_norm": 0.6719751954078674, + "learning_rate": 8e-05, + "loss": 1.4983, + "num_input_tokens_seen": 543113424, + "step": 3850 + }, + { + "epoch": 0.2352528892376374, + "grad_norm": 0.7113502025604248, + "learning_rate": 8e-05, + "loss": 1.5353, + "num_input_tokens_seen": 544510612, + "step": 3860 + }, + { + "epoch": 0.23586235268125824, + "grad_norm": 0.6690021753311157, + "learning_rate": 8e-05, + "loss": 1.5478, + "num_input_tokens_seen": 545973208, + "step": 3870 + }, + { + "epoch": 0.23647181612487905, + "grad_norm": 0.5268478989601135, + "learning_rate": 8e-05, + "loss": 1.5472, + "num_input_tokens_seen": 547339972, + "step": 3880 + }, + { + "epoch": 0.2370812795684999, + "grad_norm": 0.5470036864280701, + "learning_rate": 8e-05, + "loss": 1.6029, + "num_input_tokens_seen": 548743084, + "step": 3890 + }, + { + "epoch": 0.2376907430121207, + "grad_norm": 0.5641947388648987, + "learning_rate": 8e-05, + "loss": 1.5503, + "num_input_tokens_seen": 550127048, + "step": 3900 + }, + { + "epoch": 0.23830020645574151, + "grad_norm": 0.6231464147567749, + "learning_rate": 8e-05, + "loss": 1.518, + "num_input_tokens_seen": 551531300, + "step": 3910 + }, + { + "epoch": 0.23890966989936235, + "grad_norm": 0.5082821846008301, + "learning_rate": 8e-05, + "loss": 1.4749, + "num_input_tokens_seen": 552910292, + "step": 3920 + }, + { + "epoch": 0.23951913334298316, + "grad_norm": 0.7179940938949585, + "learning_rate": 8e-05, + "loss": 1.511, + "num_input_tokens_seen": 554318712, + "step": 3930 + }, + { + "epoch": 0.240128596786604, + "grad_norm": 0.6112212538719177, + "learning_rate": 8e-05, + "loss": 1.4962, + "num_input_tokens_seen": 555780652, + "step": 3940 + }, + { + "epoch": 0.24073806023022482, + "grad_norm": 0.7789644598960876, + "learning_rate": 8e-05, + "loss": 1.5523, + "num_input_tokens_seen": 557205392, + "step": 3950 + }, + { + "epoch": 0.24134752367384563, + "grad_norm": 0.559933066368103, + "learning_rate": 8e-05, + "loss": 1.5004, + "num_input_tokens_seen": 558642004, + "step": 3960 + }, + { + "epoch": 0.24195698711746647, + "grad_norm": 0.6398128867149353, + "learning_rate": 8e-05, + "loss": 1.5336, + "num_input_tokens_seen": 560061460, + "step": 3970 + }, + { + "epoch": 0.24256645056108728, + "grad_norm": 0.5948666334152222, + "learning_rate": 8e-05, + "loss": 1.6063, + "num_input_tokens_seen": 561476852, + "step": 3980 + }, + { + "epoch": 0.24317591400470812, + "grad_norm": 0.5978219509124756, + "learning_rate": 8e-05, + "loss": 1.535, + "num_input_tokens_seen": 562885480, + "step": 3990 + }, + { + "epoch": 0.24378537744832893, + "grad_norm": 0.5732299089431763, + "learning_rate": 8e-05, + "loss": 1.6014, + "num_input_tokens_seen": 564296708, + "step": 4000 + }, + { + "epoch": 0.24439484089194974, + "grad_norm": 0.5508894324302673, + "learning_rate": 8e-05, + "loss": 1.5665, + "num_input_tokens_seen": 565716316, + "step": 4010 + }, + { + "epoch": 0.24500430433557058, + "grad_norm": 0.6127896904945374, + "learning_rate": 8e-05, + "loss": 1.5016, + "num_input_tokens_seen": 567148104, + "step": 4020 + }, + { + "epoch": 0.2456137677791914, + "grad_norm": 0.6429306864738464, + "learning_rate": 8e-05, + "loss": 1.5325, + "num_input_tokens_seen": 568546676, + "step": 4030 + }, + { + "epoch": 0.24622323122281223, + "grad_norm": 0.5604584813117981, + "learning_rate": 8e-05, + "loss": 1.5726, + "num_input_tokens_seen": 569955020, + "step": 4040 + }, + { + "epoch": 0.24683269466643304, + "grad_norm": 0.5747659206390381, + "learning_rate": 8e-05, + "loss": 1.613, + "num_input_tokens_seen": 571360824, + "step": 4050 + }, + { + "epoch": 0.24744215811005385, + "grad_norm": 0.6003303527832031, + "learning_rate": 8e-05, + "loss": 1.4661, + "num_input_tokens_seen": 572783056, + "step": 4060 + }, + { + "epoch": 0.2480516215536747, + "grad_norm": 0.5869039297103882, + "learning_rate": 8e-05, + "loss": 1.5553, + "num_input_tokens_seen": 574182888, + "step": 4070 + }, + { + "epoch": 0.2486610849972955, + "grad_norm": 0.6359077095985413, + "learning_rate": 8e-05, + "loss": 1.5035, + "num_input_tokens_seen": 575590724, + "step": 4080 + }, + { + "epoch": 0.24927054844091634, + "grad_norm": 0.6442738175392151, + "learning_rate": 8e-05, + "loss": 1.5434, + "num_input_tokens_seen": 577002740, + "step": 4090 + }, + { + "epoch": 0.24988001188453715, + "grad_norm": 0.5803224444389343, + "learning_rate": 8e-05, + "loss": 1.5408, + "num_input_tokens_seen": 578408172, + "step": 4100 + }, + { + "epoch": 0.25048947532815796, + "grad_norm": 0.5301817059516907, + "learning_rate": 8e-05, + "loss": 1.5587, + "num_input_tokens_seen": 579811960, + "step": 4110 + }, + { + "epoch": 0.2510989387717788, + "grad_norm": 0.5798521637916565, + "learning_rate": 8e-05, + "loss": 1.5368, + "num_input_tokens_seen": 581216492, + "step": 4120 + }, + { + "epoch": 0.25170840221539964, + "grad_norm": 0.6303583979606628, + "learning_rate": 8e-05, + "loss": 1.5264, + "num_input_tokens_seen": 582601144, + "step": 4130 + }, + { + "epoch": 0.2523178656590204, + "grad_norm": 0.5813618898391724, + "learning_rate": 8e-05, + "loss": 1.5848, + "num_input_tokens_seen": 583981308, + "step": 4140 + }, + { + "epoch": 0.25292732910264126, + "grad_norm": 0.6459015607833862, + "learning_rate": 8e-05, + "loss": 1.5018, + "num_input_tokens_seen": 585375584, + "step": 4150 + }, + { + "epoch": 0.2535367925462621, + "grad_norm": 0.5455829501152039, + "learning_rate": 8e-05, + "loss": 1.5464, + "num_input_tokens_seen": 586802916, + "step": 4160 + }, + { + "epoch": 0.2541462559898829, + "grad_norm": 0.6944795250892639, + "learning_rate": 8e-05, + "loss": 1.548, + "num_input_tokens_seen": 588224368, + "step": 4170 + }, + { + "epoch": 0.2547557194335037, + "grad_norm": 0.49821239709854126, + "learning_rate": 8e-05, + "loss": 1.5018, + "num_input_tokens_seen": 589678916, + "step": 4180 + }, + { + "epoch": 0.25536518287712456, + "grad_norm": 0.6219947338104248, + "learning_rate": 8e-05, + "loss": 1.4944, + "num_input_tokens_seen": 591107332, + "step": 4190 + }, + { + "epoch": 0.25597464632074535, + "grad_norm": 0.6208930611610413, + "learning_rate": 8e-05, + "loss": 1.5005, + "num_input_tokens_seen": 592503952, + "step": 4200 + }, + { + "epoch": 0.2565841097643662, + "grad_norm": 0.5741164684295654, + "learning_rate": 8e-05, + "loss": 1.551, + "num_input_tokens_seen": 593961892, + "step": 4210 + }, + { + "epoch": 0.257193573207987, + "grad_norm": 0.6542494893074036, + "learning_rate": 8e-05, + "loss": 1.4703, + "num_input_tokens_seen": 595372500, + "step": 4220 + }, + { + "epoch": 0.25780303665160786, + "grad_norm": 0.5730968117713928, + "learning_rate": 8e-05, + "loss": 1.5682, + "num_input_tokens_seen": 596779600, + "step": 4230 + }, + { + "epoch": 0.25841250009522865, + "grad_norm": 0.683051347732544, + "learning_rate": 8e-05, + "loss": 1.4536, + "num_input_tokens_seen": 598184840, + "step": 4240 + }, + { + "epoch": 0.2590219635388495, + "grad_norm": 0.7352629899978638, + "learning_rate": 8e-05, + "loss": 1.5573, + "num_input_tokens_seen": 599586716, + "step": 4250 + }, + { + "epoch": 0.2596314269824703, + "grad_norm": 0.5975140929222107, + "learning_rate": 8e-05, + "loss": 1.4868, + "num_input_tokens_seen": 600969532, + "step": 4260 + }, + { + "epoch": 0.2602408904260911, + "grad_norm": 0.5328396558761597, + "learning_rate": 8e-05, + "loss": 1.4971, + "num_input_tokens_seen": 602381612, + "step": 4270 + }, + { + "epoch": 0.26085035386971195, + "grad_norm": 0.832878053188324, + "learning_rate": 8e-05, + "loss": 1.5396, + "num_input_tokens_seen": 603796776, + "step": 4280 + }, + { + "epoch": 0.2614598173133328, + "grad_norm": 0.5859492421150208, + "learning_rate": 8e-05, + "loss": 1.5363, + "num_input_tokens_seen": 605221644, + "step": 4290 + }, + { + "epoch": 0.26206928075695357, + "grad_norm": 0.5248042345046997, + "learning_rate": 8e-05, + "loss": 1.5086, + "num_input_tokens_seen": 606642144, + "step": 4300 + }, + { + "epoch": 0.2626787442005744, + "grad_norm": 0.5584684014320374, + "learning_rate": 8e-05, + "loss": 1.5288, + "num_input_tokens_seen": 608083892, + "step": 4310 + }, + { + "epoch": 0.26328820764419525, + "grad_norm": 0.5786579251289368, + "learning_rate": 8e-05, + "loss": 1.5388, + "num_input_tokens_seen": 609490492, + "step": 4320 + }, + { + "epoch": 0.2638976710878161, + "grad_norm": 0.6442510485649109, + "learning_rate": 8e-05, + "loss": 1.5189, + "num_input_tokens_seen": 610881576, + "step": 4330 + }, + { + "epoch": 0.26450713453143687, + "grad_norm": 0.5844077467918396, + "learning_rate": 8e-05, + "loss": 1.4962, + "num_input_tokens_seen": 612271752, + "step": 4340 + }, + { + "epoch": 0.2651165979750577, + "grad_norm": 0.5478509664535522, + "learning_rate": 8e-05, + "loss": 1.479, + "num_input_tokens_seen": 613667596, + "step": 4350 + }, + { + "epoch": 0.26572606141867855, + "grad_norm": 0.5829759836196899, + "learning_rate": 8e-05, + "loss": 1.5768, + "num_input_tokens_seen": 615067476, + "step": 4360 + }, + { + "epoch": 0.26633552486229933, + "grad_norm": 0.6037372350692749, + "learning_rate": 8e-05, + "loss": 1.5158, + "num_input_tokens_seen": 616456876, + "step": 4370 + }, + { + "epoch": 0.2669449883059202, + "grad_norm": 0.7143189311027527, + "learning_rate": 8e-05, + "loss": 1.5373, + "num_input_tokens_seen": 617885136, + "step": 4380 + }, + { + "epoch": 0.267554451749541, + "grad_norm": 0.5982239246368408, + "learning_rate": 8e-05, + "loss": 1.5993, + "num_input_tokens_seen": 619310072, + "step": 4390 + }, + { + "epoch": 0.2681639151931618, + "grad_norm": 0.5947147011756897, + "learning_rate": 8e-05, + "loss": 1.5218, + "num_input_tokens_seen": 620729588, + "step": 4400 + }, + { + "epoch": 0.26877337863678263, + "grad_norm": 0.6012521386146545, + "learning_rate": 8e-05, + "loss": 1.4877, + "num_input_tokens_seen": 622107468, + "step": 4410 + }, + { + "epoch": 0.2693828420804035, + "grad_norm": 0.8092216849327087, + "learning_rate": 8e-05, + "loss": 1.5039, + "num_input_tokens_seen": 623481492, + "step": 4420 + }, + { + "epoch": 0.2699923055240243, + "grad_norm": 0.5891050100326538, + "learning_rate": 8e-05, + "loss": 1.6217, + "num_input_tokens_seen": 624886240, + "step": 4430 + }, + { + "epoch": 0.2706017689676451, + "grad_norm": 0.5771660804748535, + "learning_rate": 8e-05, + "loss": 1.464, + "num_input_tokens_seen": 626297120, + "step": 4440 + }, + { + "epoch": 0.27121123241126593, + "grad_norm": 0.6003730893135071, + "learning_rate": 8e-05, + "loss": 1.5165, + "num_input_tokens_seen": 627731736, + "step": 4450 + }, + { + "epoch": 0.2718206958548868, + "grad_norm": 0.545914351940155, + "learning_rate": 8e-05, + "loss": 1.5915, + "num_input_tokens_seen": 629175788, + "step": 4460 + }, + { + "epoch": 0.27243015929850756, + "grad_norm": 0.5131253004074097, + "learning_rate": 8e-05, + "loss": 1.519, + "num_input_tokens_seen": 630590732, + "step": 4470 + }, + { + "epoch": 0.2730396227421284, + "grad_norm": 0.6029266715049744, + "learning_rate": 8e-05, + "loss": 1.5594, + "num_input_tokens_seen": 632034684, + "step": 4480 + }, + { + "epoch": 0.27364908618574924, + "grad_norm": 0.7176979780197144, + "learning_rate": 8e-05, + "loss": 1.4626, + "num_input_tokens_seen": 633439316, + "step": 4490 + }, + { + "epoch": 0.27425854962937, + "grad_norm": 0.5998932719230652, + "learning_rate": 8e-05, + "loss": 1.4572, + "num_input_tokens_seen": 634866124, + "step": 4500 + }, + { + "epoch": 0.27486801307299086, + "grad_norm": 0.6117346882820129, + "learning_rate": 8e-05, + "loss": 1.4799, + "num_input_tokens_seen": 636287180, + "step": 4510 + }, + { + "epoch": 0.2754774765166117, + "grad_norm": 0.6476706862449646, + "learning_rate": 8e-05, + "loss": 1.4876, + "num_input_tokens_seen": 637660604, + "step": 4520 + }, + { + "epoch": 0.27608693996023254, + "grad_norm": 0.5890971422195435, + "learning_rate": 8e-05, + "loss": 1.497, + "num_input_tokens_seen": 639105088, + "step": 4530 + }, + { + "epoch": 0.2766964034038533, + "grad_norm": 0.5373267531394958, + "learning_rate": 8e-05, + "loss": 1.4828, + "num_input_tokens_seen": 640515580, + "step": 4540 + }, + { + "epoch": 0.27730586684747416, + "grad_norm": 0.5229721665382385, + "learning_rate": 8e-05, + "loss": 1.4927, + "num_input_tokens_seen": 641960740, + "step": 4550 + }, + { + "epoch": 0.277915330291095, + "grad_norm": 0.5390836000442505, + "learning_rate": 8e-05, + "loss": 1.5461, + "num_input_tokens_seen": 643367736, + "step": 4560 + }, + { + "epoch": 0.2785247937347158, + "grad_norm": 0.5476319789886475, + "learning_rate": 8e-05, + "loss": 1.5791, + "num_input_tokens_seen": 644815132, + "step": 4570 + }, + { + "epoch": 0.2791342571783366, + "grad_norm": 0.5691061615943909, + "learning_rate": 8e-05, + "loss": 1.5169, + "num_input_tokens_seen": 646189852, + "step": 4580 + }, + { + "epoch": 0.27974372062195746, + "grad_norm": 0.5501654744148254, + "learning_rate": 8e-05, + "loss": 1.4543, + "num_input_tokens_seen": 647603012, + "step": 4590 + }, + { + "epoch": 0.28035318406557824, + "grad_norm": 0.6424432992935181, + "learning_rate": 8e-05, + "loss": 1.5923, + "num_input_tokens_seen": 649042496, + "step": 4600 + }, + { + "epoch": 0.2809626475091991, + "grad_norm": 0.5200062394142151, + "learning_rate": 8e-05, + "loss": 1.5062, + "num_input_tokens_seen": 650458192, + "step": 4610 + }, + { + "epoch": 0.2815721109528199, + "grad_norm": 0.6190493702888489, + "learning_rate": 8e-05, + "loss": 1.5041, + "num_input_tokens_seen": 651891436, + "step": 4620 + }, + { + "epoch": 0.28218157439644076, + "grad_norm": 0.5610544681549072, + "learning_rate": 8e-05, + "loss": 1.4763, + "num_input_tokens_seen": 653287132, + "step": 4630 + }, + { + "epoch": 0.28279103784006154, + "grad_norm": 0.6020642518997192, + "learning_rate": 8e-05, + "loss": 1.4961, + "num_input_tokens_seen": 654715988, + "step": 4640 + }, + { + "epoch": 0.2834005012836824, + "grad_norm": 0.5459628701210022, + "learning_rate": 8e-05, + "loss": 1.5022, + "num_input_tokens_seen": 656116916, + "step": 4650 + }, + { + "epoch": 0.2840099647273032, + "grad_norm": 0.53467857837677, + "learning_rate": 8e-05, + "loss": 1.5723, + "num_input_tokens_seen": 657533764, + "step": 4660 + }, + { + "epoch": 0.284619428170924, + "grad_norm": 0.5662572383880615, + "learning_rate": 8e-05, + "loss": 1.4852, + "num_input_tokens_seen": 658933892, + "step": 4670 + }, + { + "epoch": 0.28522889161454484, + "grad_norm": 0.6344622373580933, + "learning_rate": 8e-05, + "loss": 1.5427, + "num_input_tokens_seen": 660343924, + "step": 4680 + }, + { + "epoch": 0.2858383550581657, + "grad_norm": 0.6415901184082031, + "learning_rate": 8e-05, + "loss": 1.4782, + "num_input_tokens_seen": 661722560, + "step": 4690 + }, + { + "epoch": 0.28644781850178647, + "grad_norm": 0.6006565690040588, + "learning_rate": 8e-05, + "loss": 1.5184, + "num_input_tokens_seen": 663126892, + "step": 4700 + }, + { + "epoch": 0.2870572819454073, + "grad_norm": 0.5979477763175964, + "learning_rate": 8e-05, + "loss": 1.4963, + "num_input_tokens_seen": 664549752, + "step": 4710 + }, + { + "epoch": 0.28766674538902814, + "grad_norm": 0.560646653175354, + "learning_rate": 8e-05, + "loss": 1.4522, + "num_input_tokens_seen": 665976108, + "step": 4720 + }, + { + "epoch": 0.288276208832649, + "grad_norm": 0.4898284673690796, + "learning_rate": 8e-05, + "loss": 1.4917, + "num_input_tokens_seen": 667423832, + "step": 4730 + }, + { + "epoch": 0.28888567227626977, + "grad_norm": 0.5207868218421936, + "learning_rate": 8e-05, + "loss": 1.4298, + "num_input_tokens_seen": 668840704, + "step": 4740 + }, + { + "epoch": 0.2894951357198906, + "grad_norm": 0.5953548550605774, + "learning_rate": 8e-05, + "loss": 1.5316, + "num_input_tokens_seen": 670225524, + "step": 4750 + }, + { + "epoch": 0.29010459916351145, + "grad_norm": 0.6386962532997131, + "learning_rate": 8e-05, + "loss": 1.5264, + "num_input_tokens_seen": 671644060, + "step": 4760 + }, + { + "epoch": 0.29071406260713223, + "grad_norm": 0.581168532371521, + "learning_rate": 8e-05, + "loss": 1.4726, + "num_input_tokens_seen": 673055808, + "step": 4770 + }, + { + "epoch": 0.29132352605075307, + "grad_norm": 0.7128572463989258, + "learning_rate": 8e-05, + "loss": 1.4414, + "num_input_tokens_seen": 674498056, + "step": 4780 + }, + { + "epoch": 0.2919329894943739, + "grad_norm": 0.5856772065162659, + "learning_rate": 8e-05, + "loss": 1.507, + "num_input_tokens_seen": 675874572, + "step": 4790 + }, + { + "epoch": 0.2925424529379947, + "grad_norm": 0.6448741555213928, + "learning_rate": 8e-05, + "loss": 1.488, + "num_input_tokens_seen": 677325828, + "step": 4800 + }, + { + "epoch": 0.29315191638161553, + "grad_norm": 0.6047000885009766, + "learning_rate": 8e-05, + "loss": 1.4665, + "num_input_tokens_seen": 678720324, + "step": 4810 + }, + { + "epoch": 0.29376137982523637, + "grad_norm": 0.5227587819099426, + "learning_rate": 8e-05, + "loss": 1.5083, + "num_input_tokens_seen": 680144452, + "step": 4820 + }, + { + "epoch": 0.2943708432688572, + "grad_norm": 0.5058688521385193, + "learning_rate": 8e-05, + "loss": 1.5108, + "num_input_tokens_seen": 681537236, + "step": 4830 + }, + { + "epoch": 0.294980306712478, + "grad_norm": 0.6823422312736511, + "learning_rate": 8e-05, + "loss": 1.5768, + "num_input_tokens_seen": 682917996, + "step": 4840 + }, + { + "epoch": 0.29558977015609883, + "grad_norm": 0.6330751180648804, + "learning_rate": 8e-05, + "loss": 1.5083, + "num_input_tokens_seen": 684353484, + "step": 4850 + }, + { + "epoch": 0.29619923359971967, + "grad_norm": 0.6086271405220032, + "learning_rate": 8e-05, + "loss": 1.5022, + "num_input_tokens_seen": 685747296, + "step": 4860 + }, + { + "epoch": 0.29680869704334045, + "grad_norm": 0.5951144695281982, + "learning_rate": 8e-05, + "loss": 1.583, + "num_input_tokens_seen": 687170156, + "step": 4870 + }, + { + "epoch": 0.2974181604869613, + "grad_norm": 0.5558424592018127, + "learning_rate": 8e-05, + "loss": 1.4283, + "num_input_tokens_seen": 688574796, + "step": 4880 + }, + { + "epoch": 0.29802762393058213, + "grad_norm": 0.6256549954414368, + "learning_rate": 8e-05, + "loss": 1.6313, + "num_input_tokens_seen": 689998376, + "step": 4890 + }, + { + "epoch": 0.2986370873742029, + "grad_norm": 0.5750453472137451, + "learning_rate": 8e-05, + "loss": 1.4544, + "num_input_tokens_seen": 691383556, + "step": 4900 + }, + { + "epoch": 0.29924655081782375, + "grad_norm": 0.69186931848526, + "learning_rate": 8e-05, + "loss": 1.5238, + "num_input_tokens_seen": 692797940, + "step": 4910 + }, + { + "epoch": 0.2998560142614446, + "grad_norm": 0.5753540992736816, + "learning_rate": 8e-05, + "loss": 1.5077, + "num_input_tokens_seen": 694209928, + "step": 4920 + }, + { + "epoch": 0.3004654777050654, + "grad_norm": 0.6008949279785156, + "learning_rate": 8e-05, + "loss": 1.5407, + "num_input_tokens_seen": 695637656, + "step": 4930 + }, + { + "epoch": 0.3010749411486862, + "grad_norm": 0.5696120262145996, + "learning_rate": 8e-05, + "loss": 1.4826, + "num_input_tokens_seen": 697072516, + "step": 4940 + }, + { + "epoch": 0.30168440459230705, + "grad_norm": 0.5745052099227905, + "learning_rate": 8e-05, + "loss": 1.5266, + "num_input_tokens_seen": 698505536, + "step": 4950 + }, + { + "epoch": 0.3022938680359279, + "grad_norm": 0.5735147595405579, + "learning_rate": 8e-05, + "loss": 1.4663, + "num_input_tokens_seen": 699890632, + "step": 4960 + }, + { + "epoch": 0.3029033314795487, + "grad_norm": 0.5933482050895691, + "learning_rate": 8e-05, + "loss": 1.521, + "num_input_tokens_seen": 701311308, + "step": 4970 + }, + { + "epoch": 0.3035127949231695, + "grad_norm": 0.5017425417900085, + "learning_rate": 8e-05, + "loss": 1.4664, + "num_input_tokens_seen": 702715200, + "step": 4980 + }, + { + "epoch": 0.30412225836679035, + "grad_norm": 0.6305922865867615, + "learning_rate": 8e-05, + "loss": 1.5193, + "num_input_tokens_seen": 704117168, + "step": 4990 + }, + { + "epoch": 0.30473172181041114, + "grad_norm": 0.5459288954734802, + "learning_rate": 8e-05, + "loss": 1.4912, + "num_input_tokens_seen": 705551000, + "step": 5000 + }, + { + "epoch": 0.305341185254032, + "grad_norm": 0.5966220498085022, + "learning_rate": 8e-05, + "loss": 1.5046, + "num_input_tokens_seen": 706973412, + "step": 5010 + }, + { + "epoch": 0.3059506486976528, + "grad_norm": 0.5664955377578735, + "learning_rate": 8e-05, + "loss": 1.5611, + "num_input_tokens_seen": 708316344, + "step": 5020 + }, + { + "epoch": 0.3065601121412736, + "grad_norm": 0.5635570883750916, + "learning_rate": 8e-05, + "loss": 1.4977, + "num_input_tokens_seen": 709764736, + "step": 5030 + }, + { + "epoch": 0.30716957558489444, + "grad_norm": 0.555467963218689, + "learning_rate": 8e-05, + "loss": 1.5142, + "num_input_tokens_seen": 711149252, + "step": 5040 + }, + { + "epoch": 0.3077790390285153, + "grad_norm": 0.5750370025634766, + "learning_rate": 8e-05, + "loss": 1.4936, + "num_input_tokens_seen": 712543440, + "step": 5050 + }, + { + "epoch": 0.3083885024721361, + "grad_norm": 0.4970649778842926, + "learning_rate": 8e-05, + "loss": 1.4638, + "num_input_tokens_seen": 713913020, + "step": 5060 + }, + { + "epoch": 0.3089979659157569, + "grad_norm": 0.510879635810852, + "learning_rate": 8e-05, + "loss": 1.455, + "num_input_tokens_seen": 715350768, + "step": 5070 + }, + { + "epoch": 0.30960742935937774, + "grad_norm": 0.5770237445831299, + "learning_rate": 8e-05, + "loss": 1.5248, + "num_input_tokens_seen": 716774184, + "step": 5080 + }, + { + "epoch": 0.3102168928029986, + "grad_norm": 0.508195698261261, + "learning_rate": 8e-05, + "loss": 1.4908, + "num_input_tokens_seen": 718209544, + "step": 5090 + }, + { + "epoch": 0.31082635624661936, + "grad_norm": 0.501331627368927, + "learning_rate": 8e-05, + "loss": 1.4317, + "num_input_tokens_seen": 719606848, + "step": 5100 + }, + { + "epoch": 0.3114358196902402, + "grad_norm": 0.5919586420059204, + "learning_rate": 8e-05, + "loss": 1.4905, + "num_input_tokens_seen": 721020264, + "step": 5110 + }, + { + "epoch": 0.31204528313386104, + "grad_norm": 0.51059490442276, + "learning_rate": 8e-05, + "loss": 1.4769, + "num_input_tokens_seen": 722433924, + "step": 5120 + }, + { + "epoch": 0.3126547465774818, + "grad_norm": 0.5880390405654907, + "learning_rate": 8e-05, + "loss": 1.4865, + "num_input_tokens_seen": 723841812, + "step": 5130 + }, + { + "epoch": 0.31326421002110266, + "grad_norm": 0.5752800107002258, + "learning_rate": 8e-05, + "loss": 1.5344, + "num_input_tokens_seen": 725237092, + "step": 5140 + }, + { + "epoch": 0.3138736734647235, + "grad_norm": 0.5985324382781982, + "learning_rate": 8e-05, + "loss": 1.4905, + "num_input_tokens_seen": 726645824, + "step": 5150 + }, + { + "epoch": 0.31448313690834434, + "grad_norm": 0.5916955471038818, + "learning_rate": 8e-05, + "loss": 1.4696, + "num_input_tokens_seen": 728082536, + "step": 5160 + }, + { + "epoch": 0.3150926003519651, + "grad_norm": 0.5135784149169922, + "learning_rate": 8e-05, + "loss": 1.4112, + "num_input_tokens_seen": 729471752, + "step": 5170 + }, + { + "epoch": 0.31570206379558596, + "grad_norm": 0.597804069519043, + "learning_rate": 8e-05, + "loss": 1.4754, + "num_input_tokens_seen": 730906264, + "step": 5180 + }, + { + "epoch": 0.3163115272392068, + "grad_norm": 0.48136067390441895, + "learning_rate": 8e-05, + "loss": 1.5421, + "num_input_tokens_seen": 732311764, + "step": 5190 + }, + { + "epoch": 0.3169209906828276, + "grad_norm": 0.5294743180274963, + "learning_rate": 8e-05, + "loss": 1.42, + "num_input_tokens_seen": 733705960, + "step": 5200 + }, + { + "epoch": 0.3175304541264484, + "grad_norm": 0.49918803572654724, + "learning_rate": 8e-05, + "loss": 1.5201, + "num_input_tokens_seen": 735136208, + "step": 5210 + }, + { + "epoch": 0.31813991757006926, + "grad_norm": 0.6225274205207825, + "learning_rate": 8e-05, + "loss": 1.4671, + "num_input_tokens_seen": 736548572, + "step": 5220 + }, + { + "epoch": 0.31874938101369005, + "grad_norm": 0.6126905083656311, + "learning_rate": 8e-05, + "loss": 1.5022, + "num_input_tokens_seen": 737948492, + "step": 5230 + }, + { + "epoch": 0.3193588444573109, + "grad_norm": 0.6735352873802185, + "learning_rate": 8e-05, + "loss": 1.4824, + "num_input_tokens_seen": 739360472, + "step": 5240 + }, + { + "epoch": 0.3199683079009317, + "grad_norm": 0.5282402634620667, + "learning_rate": 8e-05, + "loss": 1.3916, + "num_input_tokens_seen": 740751936, + "step": 5250 + }, + { + "epoch": 0.32057777134455256, + "grad_norm": 0.5800075531005859, + "learning_rate": 8e-05, + "loss": 1.4639, + "num_input_tokens_seen": 742134900, + "step": 5260 + }, + { + "epoch": 0.32118723478817335, + "grad_norm": 0.7085527777671814, + "learning_rate": 8e-05, + "loss": 1.5392, + "num_input_tokens_seen": 743507672, + "step": 5270 + }, + { + "epoch": 0.3217966982317942, + "grad_norm": 0.5233438611030579, + "learning_rate": 8e-05, + "loss": 1.4817, + "num_input_tokens_seen": 744911012, + "step": 5280 + }, + { + "epoch": 0.322406161675415, + "grad_norm": 0.5503348112106323, + "learning_rate": 8e-05, + "loss": 1.4695, + "num_input_tokens_seen": 746348456, + "step": 5290 + }, + { + "epoch": 0.3230156251190358, + "grad_norm": 0.5066059827804565, + "learning_rate": 8e-05, + "loss": 1.5024, + "num_input_tokens_seen": 747760896, + "step": 5300 + }, + { + "epoch": 0.32362508856265665, + "grad_norm": 0.6413135528564453, + "learning_rate": 8e-05, + "loss": 1.4737, + "num_input_tokens_seen": 749182700, + "step": 5310 + }, + { + "epoch": 0.3242345520062775, + "grad_norm": 0.6394685506820679, + "learning_rate": 8e-05, + "loss": 1.497, + "num_input_tokens_seen": 750569816, + "step": 5320 + }, + { + "epoch": 0.32484401544989827, + "grad_norm": 0.6730980277061462, + "learning_rate": 8e-05, + "loss": 1.5137, + "num_input_tokens_seen": 752049528, + "step": 5330 + }, + { + "epoch": 0.3254534788935191, + "grad_norm": 0.5059208273887634, + "learning_rate": 8e-05, + "loss": 1.4047, + "num_input_tokens_seen": 753469828, + "step": 5340 + }, + { + "epoch": 0.32606294233713995, + "grad_norm": 0.6344226002693176, + "learning_rate": 8e-05, + "loss": 1.4388, + "num_input_tokens_seen": 754893840, + "step": 5350 + }, + { + "epoch": 0.3266724057807608, + "grad_norm": 0.5653851628303528, + "learning_rate": 8e-05, + "loss": 1.5341, + "num_input_tokens_seen": 756345148, + "step": 5360 + }, + { + "epoch": 0.32728186922438157, + "grad_norm": 0.5219058394432068, + "learning_rate": 8e-05, + "loss": 1.4945, + "num_input_tokens_seen": 757782128, + "step": 5370 + }, + { + "epoch": 0.3278913326680024, + "grad_norm": 0.6265976428985596, + "learning_rate": 8e-05, + "loss": 1.5015, + "num_input_tokens_seen": 759205500, + "step": 5380 + }, + { + "epoch": 0.32850079611162325, + "grad_norm": 0.5684333443641663, + "learning_rate": 8e-05, + "loss": 1.4409, + "num_input_tokens_seen": 760595412, + "step": 5390 + }, + { + "epoch": 0.32911025955524403, + "grad_norm": 0.5176586508750916, + "learning_rate": 8e-05, + "loss": 1.4624, + "num_input_tokens_seen": 761988752, + "step": 5400 + }, + { + "epoch": 0.3297197229988649, + "grad_norm": 0.6107752323150635, + "learning_rate": 8e-05, + "loss": 1.4213, + "num_input_tokens_seen": 763379640, + "step": 5410 + }, + { + "epoch": 0.3303291864424857, + "grad_norm": 0.7825304865837097, + "learning_rate": 8e-05, + "loss": 1.5461, + "num_input_tokens_seen": 764777136, + "step": 5420 + }, + { + "epoch": 0.3309386498861065, + "grad_norm": 0.5260487794876099, + "learning_rate": 8e-05, + "loss": 1.4863, + "num_input_tokens_seen": 766185628, + "step": 5430 + }, + { + "epoch": 0.33154811332972733, + "grad_norm": 0.5938482284545898, + "learning_rate": 8e-05, + "loss": 1.5117, + "num_input_tokens_seen": 767555088, + "step": 5440 + }, + { + "epoch": 0.3321575767733482, + "grad_norm": 0.5161702632904053, + "learning_rate": 8e-05, + "loss": 1.5179, + "num_input_tokens_seen": 768994976, + "step": 5450 + }, + { + "epoch": 0.332767040216969, + "grad_norm": 0.5811363458633423, + "learning_rate": 8e-05, + "loss": 1.496, + "num_input_tokens_seen": 770367576, + "step": 5460 + }, + { + "epoch": 0.3333765036605898, + "grad_norm": 0.5560015439987183, + "learning_rate": 8e-05, + "loss": 1.5018, + "num_input_tokens_seen": 771776868, + "step": 5470 + }, + { + "epoch": 0.33398596710421063, + "grad_norm": 0.5517913103103638, + "learning_rate": 8e-05, + "loss": 1.5197, + "num_input_tokens_seen": 773199948, + "step": 5480 + }, + { + "epoch": 0.3345954305478315, + "grad_norm": 0.5763364434242249, + "learning_rate": 8e-05, + "loss": 1.4531, + "num_input_tokens_seen": 774635788, + "step": 5490 + }, + { + "epoch": 0.33520489399145226, + "grad_norm": 0.5679718852043152, + "learning_rate": 8e-05, + "loss": 1.5311, + "num_input_tokens_seen": 776038608, + "step": 5500 + }, + { + "epoch": 0.3358143574350731, + "grad_norm": 0.6076738238334656, + "learning_rate": 8e-05, + "loss": 1.5176, + "num_input_tokens_seen": 777447108, + "step": 5510 + }, + { + "epoch": 0.33642382087869394, + "grad_norm": 0.5834559798240662, + "learning_rate": 8e-05, + "loss": 1.5009, + "num_input_tokens_seen": 778861076, + "step": 5520 + }, + { + "epoch": 0.3370332843223147, + "grad_norm": 0.5051541328430176, + "learning_rate": 8e-05, + "loss": 1.4895, + "num_input_tokens_seen": 780263956, + "step": 5530 + }, + { + "epoch": 0.33764274776593556, + "grad_norm": 0.5240879058837891, + "learning_rate": 8e-05, + "loss": 1.5555, + "num_input_tokens_seen": 781682280, + "step": 5540 + }, + { + "epoch": 0.3382522112095564, + "grad_norm": 0.4911370873451233, + "learning_rate": 8e-05, + "loss": 1.4297, + "num_input_tokens_seen": 783125984, + "step": 5550 + }, + { + "epoch": 0.33886167465317724, + "grad_norm": 0.5864706635475159, + "learning_rate": 8e-05, + "loss": 1.5062, + "num_input_tokens_seen": 784568312, + "step": 5560 + }, + { + "epoch": 0.339471138096798, + "grad_norm": 0.5688824653625488, + "learning_rate": 8e-05, + "loss": 1.4459, + "num_input_tokens_seen": 785985256, + "step": 5570 + }, + { + "epoch": 0.34008060154041886, + "grad_norm": 0.521834671497345, + "learning_rate": 8e-05, + "loss": 1.4781, + "num_input_tokens_seen": 787388484, + "step": 5580 + }, + { + "epoch": 0.3406900649840397, + "grad_norm": 0.5668113827705383, + "learning_rate": 8e-05, + "loss": 1.4792, + "num_input_tokens_seen": 788815088, + "step": 5590 + }, + { + "epoch": 0.3412995284276605, + "grad_norm": 0.5735951662063599, + "learning_rate": 8e-05, + "loss": 1.482, + "num_input_tokens_seen": 790242832, + "step": 5600 + }, + { + "epoch": 0.3419089918712813, + "grad_norm": 0.5737040042877197, + "learning_rate": 8e-05, + "loss": 1.4741, + "num_input_tokens_seen": 791648756, + "step": 5610 + }, + { + "epoch": 0.34251845531490216, + "grad_norm": 0.5083377361297607, + "learning_rate": 8e-05, + "loss": 1.4006, + "num_input_tokens_seen": 793038600, + "step": 5620 + }, + { + "epoch": 0.34312791875852294, + "grad_norm": 0.48619720339775085, + "learning_rate": 8e-05, + "loss": 1.4241, + "num_input_tokens_seen": 794466268, + "step": 5630 + }, + { + "epoch": 0.3437373822021438, + "grad_norm": 0.5126612782478333, + "learning_rate": 8e-05, + "loss": 1.472, + "num_input_tokens_seen": 795840520, + "step": 5640 + }, + { + "epoch": 0.3443468456457646, + "grad_norm": 0.553537905216217, + "learning_rate": 8e-05, + "loss": 1.4802, + "num_input_tokens_seen": 797249844, + "step": 5650 + }, + { + "epoch": 0.34495630908938546, + "grad_norm": 0.576506495475769, + "learning_rate": 8e-05, + "loss": 1.4802, + "num_input_tokens_seen": 798662736, + "step": 5660 + }, + { + "epoch": 0.34556577253300624, + "grad_norm": 0.5432490706443787, + "learning_rate": 8e-05, + "loss": 1.5314, + "num_input_tokens_seen": 800085376, + "step": 5670 + }, + { + "epoch": 0.3461752359766271, + "grad_norm": 0.5892384648323059, + "learning_rate": 8e-05, + "loss": 1.4365, + "num_input_tokens_seen": 801519988, + "step": 5680 + }, + { + "epoch": 0.3467846994202479, + "grad_norm": 0.550477147102356, + "learning_rate": 8e-05, + "loss": 1.4772, + "num_input_tokens_seen": 802930932, + "step": 5690 + }, + { + "epoch": 0.3473941628638687, + "grad_norm": 0.46749433875083923, + "learning_rate": 8e-05, + "loss": 1.4563, + "num_input_tokens_seen": 804355416, + "step": 5700 + }, + { + "epoch": 0.34800362630748954, + "grad_norm": 0.6174665689468384, + "learning_rate": 8e-05, + "loss": 1.4796, + "num_input_tokens_seen": 805809472, + "step": 5710 + }, + { + "epoch": 0.3486130897511104, + "grad_norm": 0.5981230139732361, + "learning_rate": 8e-05, + "loss": 1.4873, + "num_input_tokens_seen": 807216568, + "step": 5720 + }, + { + "epoch": 0.34922255319473117, + "grad_norm": 0.8428714275360107, + "learning_rate": 8e-05, + "loss": 1.5775, + "num_input_tokens_seen": 808599216, + "step": 5730 + }, + { + "epoch": 0.349832016638352, + "grad_norm": 0.5814129710197449, + "learning_rate": 8e-05, + "loss": 1.4566, + "num_input_tokens_seen": 810003948, + "step": 5740 + }, + { + "epoch": 0.35044148008197284, + "grad_norm": 0.6042664647102356, + "learning_rate": 8e-05, + "loss": 1.5147, + "num_input_tokens_seen": 811385708, + "step": 5750 + }, + { + "epoch": 0.35105094352559363, + "grad_norm": 0.5238978862762451, + "learning_rate": 8e-05, + "loss": 1.477, + "num_input_tokens_seen": 812784196, + "step": 5760 + }, + { + "epoch": 0.35166040696921447, + "grad_norm": 0.5527967810630798, + "learning_rate": 8e-05, + "loss": 1.5258, + "num_input_tokens_seen": 814197040, + "step": 5770 + }, + { + "epoch": 0.3522698704128353, + "grad_norm": 0.539685070514679, + "learning_rate": 8e-05, + "loss": 1.505, + "num_input_tokens_seen": 815610940, + "step": 5780 + }, + { + "epoch": 0.35287933385645615, + "grad_norm": 0.5669323801994324, + "learning_rate": 8e-05, + "loss": 1.4015, + "num_input_tokens_seen": 817022692, + "step": 5790 + }, + { + "epoch": 0.35348879730007693, + "grad_norm": 0.6599160432815552, + "learning_rate": 8e-05, + "loss": 1.456, + "num_input_tokens_seen": 818400184, + "step": 5800 + }, + { + "epoch": 0.35409826074369777, + "grad_norm": 0.6508094668388367, + "learning_rate": 8e-05, + "loss": 1.432, + "num_input_tokens_seen": 819819924, + "step": 5810 + }, + { + "epoch": 0.3547077241873186, + "grad_norm": 0.45001381635665894, + "learning_rate": 8e-05, + "loss": 1.4672, + "num_input_tokens_seen": 821219128, + "step": 5820 + }, + { + "epoch": 0.3553171876309394, + "grad_norm": 0.5801172256469727, + "learning_rate": 8e-05, + "loss": 1.4242, + "num_input_tokens_seen": 822620580, + "step": 5830 + }, + { + "epoch": 0.35592665107456023, + "grad_norm": 0.588651716709137, + "learning_rate": 8e-05, + "loss": 1.5112, + "num_input_tokens_seen": 824046412, + "step": 5840 + }, + { + "epoch": 0.35653611451818107, + "grad_norm": 0.5115150213241577, + "learning_rate": 8e-05, + "loss": 1.4171, + "num_input_tokens_seen": 825453892, + "step": 5850 + }, + { + "epoch": 0.35714557796180185, + "grad_norm": 0.6320896744728088, + "learning_rate": 8e-05, + "loss": 1.5575, + "num_input_tokens_seen": 826866976, + "step": 5860 + }, + { + "epoch": 0.3577550414054227, + "grad_norm": 0.5521752834320068, + "learning_rate": 8e-05, + "loss": 1.4981, + "num_input_tokens_seen": 828276444, + "step": 5870 + }, + { + "epoch": 0.35836450484904353, + "grad_norm": 0.4956464469432831, + "learning_rate": 8e-05, + "loss": 1.4582, + "num_input_tokens_seen": 829717872, + "step": 5880 + }, + { + "epoch": 0.35897396829266437, + "grad_norm": 0.6160319447517395, + "learning_rate": 8e-05, + "loss": 1.4735, + "num_input_tokens_seen": 831124236, + "step": 5890 + }, + { + "epoch": 0.35958343173628515, + "grad_norm": 0.5546999573707581, + "learning_rate": 8e-05, + "loss": 1.5093, + "num_input_tokens_seen": 832544524, + "step": 5900 + }, + { + "epoch": 0.360192895179906, + "grad_norm": 0.5691624879837036, + "learning_rate": 8e-05, + "loss": 1.4528, + "num_input_tokens_seen": 833976044, + "step": 5910 + }, + { + "epoch": 0.36080235862352683, + "grad_norm": 0.5685278177261353, + "learning_rate": 8e-05, + "loss": 1.4905, + "num_input_tokens_seen": 835420540, + "step": 5920 + }, + { + "epoch": 0.3614118220671476, + "grad_norm": 0.5637968182563782, + "learning_rate": 8e-05, + "loss": 1.438, + "num_input_tokens_seen": 836844744, + "step": 5930 + }, + { + "epoch": 0.36202128551076845, + "grad_norm": 0.5134168267250061, + "learning_rate": 8e-05, + "loss": 1.4268, + "num_input_tokens_seen": 838204660, + "step": 5940 + }, + { + "epoch": 0.3626307489543893, + "grad_norm": 0.6179930567741394, + "learning_rate": 8e-05, + "loss": 1.4223, + "num_input_tokens_seen": 839639804, + "step": 5950 + }, + { + "epoch": 0.3632402123980101, + "grad_norm": 0.6156236529350281, + "learning_rate": 8e-05, + "loss": 1.4914, + "num_input_tokens_seen": 841052816, + "step": 5960 + }, + { + "epoch": 0.3638496758416309, + "grad_norm": 0.5875065922737122, + "learning_rate": 8e-05, + "loss": 1.4722, + "num_input_tokens_seen": 842451068, + "step": 5970 + }, + { + "epoch": 0.36445913928525175, + "grad_norm": 0.6806820631027222, + "learning_rate": 8e-05, + "loss": 1.4535, + "num_input_tokens_seen": 843881212, + "step": 5980 + }, + { + "epoch": 0.3650686027288726, + "grad_norm": 0.5545870065689087, + "learning_rate": 8e-05, + "loss": 1.4698, + "num_input_tokens_seen": 845263884, + "step": 5990 + }, + { + "epoch": 0.3656780661724934, + "grad_norm": 0.4307273328304291, + "learning_rate": 8e-05, + "loss": 1.5082, + "num_input_tokens_seen": 846667064, + "step": 6000 + }, + { + "epoch": 0.3662875296161142, + "grad_norm": 0.58298659324646, + "learning_rate": 8e-05, + "loss": 1.4679, + "num_input_tokens_seen": 848068848, + "step": 6010 + }, + { + "epoch": 0.36689699305973505, + "grad_norm": 0.45190927386283875, + "learning_rate": 8e-05, + "loss": 1.4729, + "num_input_tokens_seen": 849463044, + "step": 6020 + }, + { + "epoch": 0.36750645650335584, + "grad_norm": 0.5742015838623047, + "learning_rate": 8e-05, + "loss": 1.4787, + "num_input_tokens_seen": 850877448, + "step": 6030 + }, + { + "epoch": 0.3681159199469767, + "grad_norm": 0.5748038291931152, + "learning_rate": 8e-05, + "loss": 1.4364, + "num_input_tokens_seen": 852315284, + "step": 6040 + }, + { + "epoch": 0.3687253833905975, + "grad_norm": 0.5301962494850159, + "learning_rate": 8e-05, + "loss": 1.4812, + "num_input_tokens_seen": 853729908, + "step": 6050 + }, + { + "epoch": 0.3693348468342183, + "grad_norm": 0.589756965637207, + "learning_rate": 8e-05, + "loss": 1.4993, + "num_input_tokens_seen": 855128224, + "step": 6060 + }, + { + "epoch": 0.36994431027783914, + "grad_norm": 0.5072458982467651, + "learning_rate": 8e-05, + "loss": 1.5019, + "num_input_tokens_seen": 856560260, + "step": 6070 + }, + { + "epoch": 0.37055377372146, + "grad_norm": 0.532714307308197, + "learning_rate": 8e-05, + "loss": 1.4981, + "num_input_tokens_seen": 857947816, + "step": 6080 + }, + { + "epoch": 0.3711632371650808, + "grad_norm": 0.5182844400405884, + "learning_rate": 8e-05, + "loss": 1.4869, + "num_input_tokens_seen": 859354540, + "step": 6090 + }, + { + "epoch": 0.3717727006087016, + "grad_norm": 0.5242214798927307, + "learning_rate": 8e-05, + "loss": 1.498, + "num_input_tokens_seen": 860784760, + "step": 6100 + }, + { + "epoch": 0.37238216405232244, + "grad_norm": 0.5378220677375793, + "learning_rate": 8e-05, + "loss": 1.4231, + "num_input_tokens_seen": 862165624, + "step": 6110 + }, + { + "epoch": 0.3729916274959433, + "grad_norm": 0.5064565539360046, + "learning_rate": 8e-05, + "loss": 1.4333, + "num_input_tokens_seen": 863576044, + "step": 6120 + }, + { + "epoch": 0.37360109093956406, + "grad_norm": 0.6327691078186035, + "learning_rate": 8e-05, + "loss": 1.4521, + "num_input_tokens_seen": 864950492, + "step": 6130 + }, + { + "epoch": 0.3742105543831849, + "grad_norm": 0.6391866207122803, + "learning_rate": 8e-05, + "loss": 1.4526, + "num_input_tokens_seen": 866373744, + "step": 6140 + }, + { + "epoch": 0.37482001782680574, + "grad_norm": 0.5664558410644531, + "learning_rate": 8e-05, + "loss": 1.5089, + "num_input_tokens_seen": 867827248, + "step": 6150 + }, + { + "epoch": 0.3754294812704265, + "grad_norm": 0.6216340065002441, + "learning_rate": 8e-05, + "loss": 1.4829, + "num_input_tokens_seen": 869255428, + "step": 6160 + }, + { + "epoch": 0.37603894471404736, + "grad_norm": 0.612844705581665, + "learning_rate": 8e-05, + "loss": 1.4621, + "num_input_tokens_seen": 870649988, + "step": 6170 + }, + { + "epoch": 0.3766484081576682, + "grad_norm": 0.5435082912445068, + "learning_rate": 8e-05, + "loss": 1.3834, + "num_input_tokens_seen": 872079800, + "step": 6180 + }, + { + "epoch": 0.37725787160128904, + "grad_norm": 0.6071653366088867, + "learning_rate": 8e-05, + "loss": 1.4301, + "num_input_tokens_seen": 873466904, + "step": 6190 + }, + { + "epoch": 0.3778673350449098, + "grad_norm": 0.5115202069282532, + "learning_rate": 8e-05, + "loss": 1.4156, + "num_input_tokens_seen": 874891528, + "step": 6200 + }, + { + "epoch": 0.37847679848853066, + "grad_norm": 0.5494471788406372, + "learning_rate": 8e-05, + "loss": 1.4658, + "num_input_tokens_seen": 876284912, + "step": 6210 + }, + { + "epoch": 0.3790862619321515, + "grad_norm": 0.4871610701084137, + "learning_rate": 8e-05, + "loss": 1.4497, + "num_input_tokens_seen": 877703108, + "step": 6220 + }, + { + "epoch": 0.3796957253757723, + "grad_norm": 0.6249947547912598, + "learning_rate": 8e-05, + "loss": 1.4456, + "num_input_tokens_seen": 879123340, + "step": 6230 + }, + { + "epoch": 0.3803051888193931, + "grad_norm": 0.5746230483055115, + "learning_rate": 8e-05, + "loss": 1.5171, + "num_input_tokens_seen": 880553048, + "step": 6240 + }, + { + "epoch": 0.38091465226301396, + "grad_norm": 0.5843877792358398, + "learning_rate": 8e-05, + "loss": 1.5046, + "num_input_tokens_seen": 881984776, + "step": 6250 + }, + { + "epoch": 0.38152411570663475, + "grad_norm": 0.5240760445594788, + "learning_rate": 8e-05, + "loss": 1.4689, + "num_input_tokens_seen": 883371668, + "step": 6260 + }, + { + "epoch": 0.3821335791502556, + "grad_norm": 0.5815708041191101, + "learning_rate": 8e-05, + "loss": 1.5634, + "num_input_tokens_seen": 884792444, + "step": 6270 + }, + { + "epoch": 0.3827430425938764, + "grad_norm": 0.6558341383934021, + "learning_rate": 8e-05, + "loss": 1.4519, + "num_input_tokens_seen": 886181612, + "step": 6280 + }, + { + "epoch": 0.38335250603749726, + "grad_norm": 0.5229777097702026, + "learning_rate": 8e-05, + "loss": 1.4538, + "num_input_tokens_seen": 887601452, + "step": 6290 + }, + { + "epoch": 0.38396196948111805, + "grad_norm": 0.5209792852401733, + "learning_rate": 8e-05, + "loss": 1.4415, + "num_input_tokens_seen": 889034508, + "step": 6300 + }, + { + "epoch": 0.3845714329247389, + "grad_norm": 0.5684558749198914, + "learning_rate": 8e-05, + "loss": 1.4671, + "num_input_tokens_seen": 890425468, + "step": 6310 + }, + { + "epoch": 0.3851808963683597, + "grad_norm": 0.578091025352478, + "learning_rate": 8e-05, + "loss": 1.4735, + "num_input_tokens_seen": 891826040, + "step": 6320 + }, + { + "epoch": 0.3857903598119805, + "grad_norm": 0.5623794198036194, + "learning_rate": 8e-05, + "loss": 1.4926, + "num_input_tokens_seen": 893262592, + "step": 6330 + }, + { + "epoch": 0.38639982325560135, + "grad_norm": 0.5411602854728699, + "learning_rate": 8e-05, + "loss": 1.4318, + "num_input_tokens_seen": 894682080, + "step": 6340 + }, + { + "epoch": 0.3870092866992222, + "grad_norm": 0.5540904998779297, + "learning_rate": 8e-05, + "loss": 1.4068, + "num_input_tokens_seen": 896099164, + "step": 6350 + }, + { + "epoch": 0.38761875014284297, + "grad_norm": 0.5864216685295105, + "learning_rate": 8e-05, + "loss": 1.399, + "num_input_tokens_seen": 897476228, + "step": 6360 + }, + { + "epoch": 0.3882282135864638, + "grad_norm": 0.6339991092681885, + "learning_rate": 8e-05, + "loss": 1.467, + "num_input_tokens_seen": 898854080, + "step": 6370 + }, + { + "epoch": 0.38883767703008465, + "grad_norm": 0.555851399898529, + "learning_rate": 8e-05, + "loss": 1.4563, + "num_input_tokens_seen": 900256152, + "step": 6380 + }, + { + "epoch": 0.3894471404737055, + "grad_norm": 0.5713673233985901, + "learning_rate": 8e-05, + "loss": 1.5061, + "num_input_tokens_seen": 901627728, + "step": 6390 + }, + { + "epoch": 0.39005660391732627, + "grad_norm": 0.5585047006607056, + "learning_rate": 8e-05, + "loss": 1.4458, + "num_input_tokens_seen": 902992920, + "step": 6400 + }, + { + "epoch": 0.3906660673609471, + "grad_norm": 0.4973145127296448, + "learning_rate": 8e-05, + "loss": 1.4566, + "num_input_tokens_seen": 904318848, + "step": 6410 + }, + { + "epoch": 0.39127553080456795, + "grad_norm": 0.5424864888191223, + "learning_rate": 8e-05, + "loss": 1.4537, + "num_input_tokens_seen": 905700004, + "step": 6420 + }, + { + "epoch": 0.39188499424818873, + "grad_norm": 0.5117954611778259, + "learning_rate": 8e-05, + "loss": 1.4243, + "num_input_tokens_seen": 907103420, + "step": 6430 + }, + { + "epoch": 0.3924944576918096, + "grad_norm": 0.5525716543197632, + "learning_rate": 8e-05, + "loss": 1.4567, + "num_input_tokens_seen": 908554396, + "step": 6440 + }, + { + "epoch": 0.3931039211354304, + "grad_norm": 0.5715943574905396, + "learning_rate": 8e-05, + "loss": 1.44, + "num_input_tokens_seen": 909956772, + "step": 6450 + }, + { + "epoch": 0.3937133845790512, + "grad_norm": 0.5808925628662109, + "learning_rate": 8e-05, + "loss": 1.4815, + "num_input_tokens_seen": 911395988, + "step": 6460 + }, + { + "epoch": 0.39432284802267203, + "grad_norm": 0.6523997187614441, + "learning_rate": 8e-05, + "loss": 1.4218, + "num_input_tokens_seen": 912784372, + "step": 6470 + }, + { + "epoch": 0.3949323114662929, + "grad_norm": 0.6092135310173035, + "learning_rate": 8e-05, + "loss": 1.4576, + "num_input_tokens_seen": 914207364, + "step": 6480 + }, + { + "epoch": 0.39554177490991366, + "grad_norm": 0.5254921317100525, + "learning_rate": 8e-05, + "loss": 1.4644, + "num_input_tokens_seen": 915636420, + "step": 6490 + }, + { + "epoch": 0.3961512383535345, + "grad_norm": 0.6733999252319336, + "learning_rate": 8e-05, + "loss": 1.4234, + "num_input_tokens_seen": 917065908, + "step": 6500 + }, + { + "epoch": 0.39676070179715534, + "grad_norm": 0.5586018562316895, + "learning_rate": 8e-05, + "loss": 1.5169, + "num_input_tokens_seen": 918451632, + "step": 6510 + }, + { + "epoch": 0.3973701652407762, + "grad_norm": 0.5588156580924988, + "learning_rate": 8e-05, + "loss": 1.3798, + "num_input_tokens_seen": 919829072, + "step": 6520 + }, + { + "epoch": 0.39797962868439696, + "grad_norm": 0.5383242964744568, + "learning_rate": 8e-05, + "loss": 1.468, + "num_input_tokens_seen": 921242500, + "step": 6530 + }, + { + "epoch": 0.3985890921280178, + "grad_norm": 0.516828715801239, + "learning_rate": 8e-05, + "loss": 1.4953, + "num_input_tokens_seen": 922668428, + "step": 6540 + }, + { + "epoch": 0.39919855557163864, + "grad_norm": 0.5715090036392212, + "learning_rate": 8e-05, + "loss": 1.4606, + "num_input_tokens_seen": 924115712, + "step": 6550 + }, + { + "epoch": 0.3998080190152594, + "grad_norm": 0.5680440664291382, + "learning_rate": 8e-05, + "loss": 1.498, + "num_input_tokens_seen": 925525280, + "step": 6560 + }, + { + "epoch": 0.40041748245888026, + "grad_norm": 0.5580743551254272, + "learning_rate": 8e-05, + "loss": 1.4742, + "num_input_tokens_seen": 926948888, + "step": 6570 + }, + { + "epoch": 0.4010269459025011, + "grad_norm": 0.5276727676391602, + "learning_rate": 8e-05, + "loss": 1.4808, + "num_input_tokens_seen": 928345036, + "step": 6580 + }, + { + "epoch": 0.4016364093461219, + "grad_norm": 0.5557569265365601, + "learning_rate": 8e-05, + "loss": 1.5104, + "num_input_tokens_seen": 929788420, + "step": 6590 + }, + { + "epoch": 0.4022458727897427, + "grad_norm": 0.5946210026741028, + "learning_rate": 8e-05, + "loss": 1.4409, + "num_input_tokens_seen": 931238968, + "step": 6600 + }, + { + "epoch": 0.40285533623336356, + "grad_norm": 0.5625696778297424, + "learning_rate": 8e-05, + "loss": 1.4899, + "num_input_tokens_seen": 932685800, + "step": 6610 + }, + { + "epoch": 0.4034647996769844, + "grad_norm": 0.5148335695266724, + "learning_rate": 8e-05, + "loss": 1.4781, + "num_input_tokens_seen": 934066584, + "step": 6620 + }, + { + "epoch": 0.4040742631206052, + "grad_norm": 0.5163837671279907, + "learning_rate": 8e-05, + "loss": 1.3748, + "num_input_tokens_seen": 935478628, + "step": 6630 + }, + { + "epoch": 0.404683726564226, + "grad_norm": 0.5167310237884521, + "learning_rate": 8e-05, + "loss": 1.4414, + "num_input_tokens_seen": 936901648, + "step": 6640 + }, + { + "epoch": 0.40529319000784686, + "grad_norm": 0.5219048261642456, + "learning_rate": 8e-05, + "loss": 1.4501, + "num_input_tokens_seen": 938297212, + "step": 6650 + }, + { + "epoch": 0.40590265345146764, + "grad_norm": 0.4949333369731903, + "learning_rate": 8e-05, + "loss": 1.4314, + "num_input_tokens_seen": 939716088, + "step": 6660 + }, + { + "epoch": 0.4065121168950885, + "grad_norm": 0.6273387670516968, + "learning_rate": 8e-05, + "loss": 1.3889, + "num_input_tokens_seen": 941144088, + "step": 6670 + }, + { + "epoch": 0.4071215803387093, + "grad_norm": 0.7101430296897888, + "learning_rate": 8e-05, + "loss": 1.488, + "num_input_tokens_seen": 942563156, + "step": 6680 + }, + { + "epoch": 0.4077310437823301, + "grad_norm": 0.5433675646781921, + "learning_rate": 8e-05, + "loss": 1.419, + "num_input_tokens_seen": 943957008, + "step": 6690 + }, + { + "epoch": 0.40834050722595094, + "grad_norm": 0.5392646789550781, + "learning_rate": 8e-05, + "loss": 1.4103, + "num_input_tokens_seen": 945372644, + "step": 6700 + }, + { + "epoch": 0.4089499706695718, + "grad_norm": 0.537996768951416, + "learning_rate": 8e-05, + "loss": 1.4489, + "num_input_tokens_seen": 946820652, + "step": 6710 + }, + { + "epoch": 0.4095594341131926, + "grad_norm": 0.625306248664856, + "learning_rate": 8e-05, + "loss": 1.4446, + "num_input_tokens_seen": 948202592, + "step": 6720 + }, + { + "epoch": 0.4101688975568134, + "grad_norm": 0.5231274962425232, + "learning_rate": 8e-05, + "loss": 1.4562, + "num_input_tokens_seen": 949622864, + "step": 6730 + }, + { + "epoch": 0.41077836100043424, + "grad_norm": 0.5137037634849548, + "learning_rate": 8e-05, + "loss": 1.518, + "num_input_tokens_seen": 951016388, + "step": 6740 + }, + { + "epoch": 0.4113878244440551, + "grad_norm": 0.5600166320800781, + "learning_rate": 8e-05, + "loss": 1.3742, + "num_input_tokens_seen": 952467456, + "step": 6750 + }, + { + "epoch": 0.41199728788767587, + "grad_norm": 0.5758654475212097, + "learning_rate": 8e-05, + "loss": 1.4311, + "num_input_tokens_seen": 953914724, + "step": 6760 + }, + { + "epoch": 0.4126067513312967, + "grad_norm": 0.5547349452972412, + "learning_rate": 8e-05, + "loss": 1.4261, + "num_input_tokens_seen": 955295720, + "step": 6770 + }, + { + "epoch": 0.41321621477491755, + "grad_norm": 0.5143230557441711, + "learning_rate": 8e-05, + "loss": 1.4667, + "num_input_tokens_seen": 956681368, + "step": 6780 + }, + { + "epoch": 0.41382567821853833, + "grad_norm": 0.6212142109870911, + "learning_rate": 8e-05, + "loss": 1.5469, + "num_input_tokens_seen": 958080316, + "step": 6790 + }, + { + "epoch": 0.41443514166215917, + "grad_norm": 0.6437855362892151, + "learning_rate": 8e-05, + "loss": 1.4665, + "num_input_tokens_seen": 959483132, + "step": 6800 + }, + { + "epoch": 0.41504460510578, + "grad_norm": 0.6089012622833252, + "learning_rate": 8e-05, + "loss": 1.4368, + "num_input_tokens_seen": 960872924, + "step": 6810 + }, + { + "epoch": 0.41565406854940085, + "grad_norm": 0.5278199315071106, + "learning_rate": 8e-05, + "loss": 1.4487, + "num_input_tokens_seen": 962282748, + "step": 6820 + }, + { + "epoch": 0.41626353199302163, + "grad_norm": 0.5420494079589844, + "learning_rate": 8e-05, + "loss": 1.4431, + "num_input_tokens_seen": 963724892, + "step": 6830 + }, + { + "epoch": 0.41687299543664247, + "grad_norm": 0.5393320322036743, + "learning_rate": 8e-05, + "loss": 1.462, + "num_input_tokens_seen": 965116844, + "step": 6840 + }, + { + "epoch": 0.4174824588802633, + "grad_norm": 0.4785369634628296, + "learning_rate": 8e-05, + "loss": 1.4703, + "num_input_tokens_seen": 966494980, + "step": 6850 + }, + { + "epoch": 0.4180919223238841, + "grad_norm": 0.5344218611717224, + "learning_rate": 8e-05, + "loss": 1.4295, + "num_input_tokens_seen": 967908316, + "step": 6860 + }, + { + "epoch": 0.41870138576750493, + "grad_norm": 0.5722907781600952, + "learning_rate": 8e-05, + "loss": 1.5415, + "num_input_tokens_seen": 969300604, + "step": 6870 + }, + { + "epoch": 0.41931084921112577, + "grad_norm": 0.6001546382904053, + "learning_rate": 8e-05, + "loss": 1.4469, + "num_input_tokens_seen": 970686572, + "step": 6880 + }, + { + "epoch": 0.41992031265474655, + "grad_norm": 0.48998522758483887, + "learning_rate": 8e-05, + "loss": 1.497, + "num_input_tokens_seen": 972128596, + "step": 6890 + }, + { + "epoch": 0.4205297760983674, + "grad_norm": 0.5235675573348999, + "learning_rate": 8e-05, + "loss": 1.4051, + "num_input_tokens_seen": 973505548, + "step": 6900 + }, + { + "epoch": 0.42113923954198823, + "grad_norm": 0.45352238416671753, + "learning_rate": 8e-05, + "loss": 1.5063, + "num_input_tokens_seen": 974916112, + "step": 6910 + }, + { + "epoch": 0.42174870298560907, + "grad_norm": 0.6097639203071594, + "learning_rate": 8e-05, + "loss": 1.4945, + "num_input_tokens_seen": 976344436, + "step": 6920 + }, + { + "epoch": 0.42235816642922985, + "grad_norm": 0.6069915294647217, + "learning_rate": 8e-05, + "loss": 1.4796, + "num_input_tokens_seen": 977726860, + "step": 6930 + }, + { + "epoch": 0.4229676298728507, + "grad_norm": 0.555101752281189, + "learning_rate": 8e-05, + "loss": 1.465, + "num_input_tokens_seen": 979148684, + "step": 6940 + }, + { + "epoch": 0.42357709331647153, + "grad_norm": 0.5358740091323853, + "learning_rate": 8e-05, + "loss": 1.4698, + "num_input_tokens_seen": 980570088, + "step": 6950 + }, + { + "epoch": 0.4241865567600923, + "grad_norm": 0.5622055530548096, + "learning_rate": 8e-05, + "loss": 1.4742, + "num_input_tokens_seen": 982007352, + "step": 6960 + }, + { + "epoch": 0.42479602020371315, + "grad_norm": 0.548179566860199, + "learning_rate": 8e-05, + "loss": 1.3727, + "num_input_tokens_seen": 983386076, + "step": 6970 + }, + { + "epoch": 0.425405483647334, + "grad_norm": 0.49980705976486206, + "learning_rate": 8e-05, + "loss": 1.4423, + "num_input_tokens_seen": 984801120, + "step": 6980 + }, + { + "epoch": 0.4260149470909548, + "grad_norm": 0.5310669541358948, + "learning_rate": 8e-05, + "loss": 1.4295, + "num_input_tokens_seen": 986186972, + "step": 6990 + }, + { + "epoch": 0.4266244105345756, + "grad_norm": 0.5813121199607849, + "learning_rate": 8e-05, + "loss": 1.4334, + "num_input_tokens_seen": 987570284, + "step": 7000 + }, + { + "epoch": 0.42723387397819645, + "grad_norm": 0.5761700868606567, + "learning_rate": 8e-05, + "loss": 1.444, + "num_input_tokens_seen": 988955796, + "step": 7010 + }, + { + "epoch": 0.4278433374218173, + "grad_norm": 0.5148670673370361, + "learning_rate": 8e-05, + "loss": 1.4178, + "num_input_tokens_seen": 990350148, + "step": 7020 + }, + { + "epoch": 0.4284528008654381, + "grad_norm": 0.5637139678001404, + "learning_rate": 8e-05, + "loss": 1.452, + "num_input_tokens_seen": 991771360, + "step": 7030 + }, + { + "epoch": 0.4290622643090589, + "grad_norm": 0.5411230325698853, + "learning_rate": 8e-05, + "loss": 1.4548, + "num_input_tokens_seen": 993232832, + "step": 7040 + }, + { + "epoch": 0.42967172775267976, + "grad_norm": 0.5264154076576233, + "learning_rate": 8e-05, + "loss": 1.4627, + "num_input_tokens_seen": 994638464, + "step": 7050 + }, + { + "epoch": 0.43028119119630054, + "grad_norm": 0.6401540637016296, + "learning_rate": 8e-05, + "loss": 1.4346, + "num_input_tokens_seen": 996023220, + "step": 7060 + }, + { + "epoch": 0.4308906546399214, + "grad_norm": 0.5791023373603821, + "learning_rate": 8e-05, + "loss": 1.419, + "num_input_tokens_seen": 997433208, + "step": 7070 + }, + { + "epoch": 0.4315001180835422, + "grad_norm": 0.5640988945960999, + "learning_rate": 8e-05, + "loss": 1.4145, + "num_input_tokens_seen": 998853492, + "step": 7080 + }, + { + "epoch": 0.432109581527163, + "grad_norm": 0.6895711421966553, + "learning_rate": 8e-05, + "loss": 1.5127, + "num_input_tokens_seen": 1000269424, + "step": 7090 + }, + { + "epoch": 0.43271904497078384, + "grad_norm": 0.5173920392990112, + "learning_rate": 8e-05, + "loss": 1.4035, + "num_input_tokens_seen": 1001659736, + "step": 7100 + }, + { + "epoch": 0.4333285084144047, + "grad_norm": 0.518268346786499, + "learning_rate": 8e-05, + "loss": 1.4023, + "num_input_tokens_seen": 1003061084, + "step": 7110 + }, + { + "epoch": 0.4339379718580255, + "grad_norm": 0.4936695992946625, + "learning_rate": 8e-05, + "loss": 1.4437, + "num_input_tokens_seen": 1004496060, + "step": 7120 + }, + { + "epoch": 0.4345474353016463, + "grad_norm": 0.5039248466491699, + "learning_rate": 8e-05, + "loss": 1.4591, + "num_input_tokens_seen": 1005928784, + "step": 7130 + }, + { + "epoch": 0.43515689874526714, + "grad_norm": 0.5840840339660645, + "learning_rate": 8e-05, + "loss": 1.4275, + "num_input_tokens_seen": 1007361236, + "step": 7140 + }, + { + "epoch": 0.435766362188888, + "grad_norm": 0.5575258135795593, + "learning_rate": 8e-05, + "loss": 1.4881, + "num_input_tokens_seen": 1008766028, + "step": 7150 + }, + { + "epoch": 0.43637582563250876, + "grad_norm": 0.49963733553886414, + "learning_rate": 8e-05, + "loss": 1.4783, + "num_input_tokens_seen": 1010202448, + "step": 7160 + }, + { + "epoch": 0.4369852890761296, + "grad_norm": 0.4767080843448639, + "learning_rate": 8e-05, + "loss": 1.4741, + "num_input_tokens_seen": 1011606648, + "step": 7170 + }, + { + "epoch": 0.43759475251975044, + "grad_norm": 0.5568097829818726, + "learning_rate": 8e-05, + "loss": 1.5022, + "num_input_tokens_seen": 1013003276, + "step": 7180 + }, + { + "epoch": 0.4382042159633712, + "grad_norm": 0.5164647102355957, + "learning_rate": 8e-05, + "loss": 1.3946, + "num_input_tokens_seen": 1014431492, + "step": 7190 + }, + { + "epoch": 0.43881367940699206, + "grad_norm": 0.48684781789779663, + "learning_rate": 8e-05, + "loss": 1.4464, + "num_input_tokens_seen": 1015812592, + "step": 7200 + }, + { + "epoch": 0.4394231428506129, + "grad_norm": 0.4711934030056, + "learning_rate": 8e-05, + "loss": 1.4647, + "num_input_tokens_seen": 1017217112, + "step": 7210 + }, + { + "epoch": 0.44003260629423374, + "grad_norm": 0.5720543265342712, + "learning_rate": 8e-05, + "loss": 1.4723, + "num_input_tokens_seen": 1018655572, + "step": 7220 + }, + { + "epoch": 0.4406420697378545, + "grad_norm": 0.5549769997596741, + "learning_rate": 8e-05, + "loss": 1.4384, + "num_input_tokens_seen": 1020040320, + "step": 7230 + }, + { + "epoch": 0.44125153318147536, + "grad_norm": 0.5608406066894531, + "learning_rate": 8e-05, + "loss": 1.4143, + "num_input_tokens_seen": 1021483904, + "step": 7240 + }, + { + "epoch": 0.4418609966250962, + "grad_norm": 0.5251893401145935, + "learning_rate": 8e-05, + "loss": 1.3365, + "num_input_tokens_seen": 1022886696, + "step": 7250 + }, + { + "epoch": 0.442470460068717, + "grad_norm": 0.50985187292099, + "learning_rate": 8e-05, + "loss": 1.3904, + "num_input_tokens_seen": 1024290644, + "step": 7260 + }, + { + "epoch": 0.4430799235123378, + "grad_norm": 0.673911988735199, + "learning_rate": 8e-05, + "loss": 1.4286, + "num_input_tokens_seen": 1025736504, + "step": 7270 + }, + { + "epoch": 0.44368938695595866, + "grad_norm": 0.6181952357292175, + "learning_rate": 8e-05, + "loss": 1.4864, + "num_input_tokens_seen": 1027167200, + "step": 7280 + }, + { + "epoch": 0.44429885039957945, + "grad_norm": 0.6667662262916565, + "learning_rate": 8e-05, + "loss": 1.4002, + "num_input_tokens_seen": 1028578528, + "step": 7290 + }, + { + "epoch": 0.4449083138432003, + "grad_norm": 0.5670000910758972, + "learning_rate": 8e-05, + "loss": 1.4228, + "num_input_tokens_seen": 1029984048, + "step": 7300 + }, + { + "epoch": 0.4455177772868211, + "grad_norm": 0.527633786201477, + "learning_rate": 8e-05, + "loss": 1.423, + "num_input_tokens_seen": 1031348748, + "step": 7310 + }, + { + "epoch": 0.4461272407304419, + "grad_norm": 0.6152751445770264, + "learning_rate": 8e-05, + "loss": 1.4067, + "num_input_tokens_seen": 1032747992, + "step": 7320 + }, + { + "epoch": 0.44673670417406275, + "grad_norm": 0.6434319615364075, + "learning_rate": 8e-05, + "loss": 1.5126, + "num_input_tokens_seen": 1034148668, + "step": 7330 + }, + { + "epoch": 0.4473461676176836, + "grad_norm": 0.5347893238067627, + "learning_rate": 8e-05, + "loss": 1.5054, + "num_input_tokens_seen": 1035562136, + "step": 7340 + }, + { + "epoch": 0.4479556310613044, + "grad_norm": 0.6078560948371887, + "learning_rate": 8e-05, + "loss": 1.4428, + "num_input_tokens_seen": 1036939864, + "step": 7350 + }, + { + "epoch": 0.4485650945049252, + "grad_norm": 0.5783689618110657, + "learning_rate": 8e-05, + "loss": 1.4361, + "num_input_tokens_seen": 1038323952, + "step": 7360 + }, + { + "epoch": 0.44917455794854605, + "grad_norm": 0.5024809837341309, + "learning_rate": 8e-05, + "loss": 1.4971, + "num_input_tokens_seen": 1039722864, + "step": 7370 + }, + { + "epoch": 0.4497840213921669, + "grad_norm": 0.524659276008606, + "learning_rate": 8e-05, + "loss": 1.4948, + "num_input_tokens_seen": 1041139980, + "step": 7380 + }, + { + "epoch": 0.45039348483578767, + "grad_norm": 0.46882256865501404, + "learning_rate": 8e-05, + "loss": 1.4066, + "num_input_tokens_seen": 1042543716, + "step": 7390 + }, + { + "epoch": 0.4510029482794085, + "grad_norm": 1.1085089445114136, + "learning_rate": 8e-05, + "loss": 1.4335, + "num_input_tokens_seen": 1043957848, + "step": 7400 + }, + { + "epoch": 0.45161241172302935, + "grad_norm": 0.4720146059989929, + "learning_rate": 8e-05, + "loss": 1.5418, + "num_input_tokens_seen": 1045388008, + "step": 7410 + }, + { + "epoch": 0.45222187516665013, + "grad_norm": 0.47802358865737915, + "learning_rate": 8e-05, + "loss": 1.46, + "num_input_tokens_seen": 1046809520, + "step": 7420 + }, + { + "epoch": 0.452831338610271, + "grad_norm": 0.5443661212921143, + "learning_rate": 8e-05, + "loss": 1.4775, + "num_input_tokens_seen": 1048246316, + "step": 7430 + }, + { + "epoch": 0.4534408020538918, + "grad_norm": 0.5573719143867493, + "learning_rate": 8e-05, + "loss": 1.5155, + "num_input_tokens_seen": 1049699188, + "step": 7440 + }, + { + "epoch": 0.45405026549751265, + "grad_norm": 0.4769093692302704, + "learning_rate": 8e-05, + "loss": 1.3774, + "num_input_tokens_seen": 1051064820, + "step": 7450 + }, + { + "epoch": 0.45465972894113343, + "grad_norm": 0.5798795819282532, + "learning_rate": 8e-05, + "loss": 1.4688, + "num_input_tokens_seen": 1052484612, + "step": 7460 + }, + { + "epoch": 0.4552691923847543, + "grad_norm": 0.6087391376495361, + "learning_rate": 8e-05, + "loss": 1.416, + "num_input_tokens_seen": 1053910944, + "step": 7470 + }, + { + "epoch": 0.4558786558283751, + "grad_norm": 0.567535400390625, + "learning_rate": 8e-05, + "loss": 1.4634, + "num_input_tokens_seen": 1055332428, + "step": 7480 + }, + { + "epoch": 0.4564881192719959, + "grad_norm": 0.6195465922355652, + "learning_rate": 8e-05, + "loss": 1.3092, + "num_input_tokens_seen": 1056726892, + "step": 7490 + }, + { + "epoch": 0.45709758271561673, + "grad_norm": 0.5984314680099487, + "learning_rate": 8e-05, + "loss": 1.5001, + "num_input_tokens_seen": 1058136916, + "step": 7500 + }, + { + "epoch": 0.4577070461592376, + "grad_norm": 0.48998868465423584, + "learning_rate": 8e-05, + "loss": 1.4218, + "num_input_tokens_seen": 1059580108, + "step": 7510 + }, + { + "epoch": 0.45831650960285836, + "grad_norm": 0.5266983509063721, + "learning_rate": 8e-05, + "loss": 1.4329, + "num_input_tokens_seen": 1060958932, + "step": 7520 + }, + { + "epoch": 0.4589259730464792, + "grad_norm": 0.5505419969558716, + "learning_rate": 8e-05, + "loss": 1.4541, + "num_input_tokens_seen": 1062394804, + "step": 7530 + }, + { + "epoch": 0.45953543649010004, + "grad_norm": 0.5599749684333801, + "learning_rate": 8e-05, + "loss": 1.3939, + "num_input_tokens_seen": 1063805320, + "step": 7540 + }, + { + "epoch": 0.4601448999337209, + "grad_norm": 0.561083972454071, + "learning_rate": 8e-05, + "loss": 1.3821, + "num_input_tokens_seen": 1065164348, + "step": 7550 + }, + { + "epoch": 0.46075436337734166, + "grad_norm": 0.5383641719818115, + "learning_rate": 8e-05, + "loss": 1.5057, + "num_input_tokens_seen": 1066572824, + "step": 7560 + }, + { + "epoch": 0.4613638268209625, + "grad_norm": 0.4498058259487152, + "learning_rate": 8e-05, + "loss": 1.4079, + "num_input_tokens_seen": 1067999532, + "step": 7570 + }, + { + "epoch": 0.46197329026458334, + "grad_norm": 0.5523561239242554, + "learning_rate": 8e-05, + "loss": 1.4182, + "num_input_tokens_seen": 1069413536, + "step": 7580 + }, + { + "epoch": 0.4625827537082041, + "grad_norm": 0.5212712287902832, + "learning_rate": 8e-05, + "loss": 1.4475, + "num_input_tokens_seen": 1070846708, + "step": 7590 + }, + { + "epoch": 0.46319221715182496, + "grad_norm": 0.5242462158203125, + "learning_rate": 8e-05, + "loss": 1.427, + "num_input_tokens_seen": 1072226000, + "step": 7600 + }, + { + "epoch": 0.4638016805954458, + "grad_norm": 0.5377146601676941, + "learning_rate": 8e-05, + "loss": 1.4384, + "num_input_tokens_seen": 1073640188, + "step": 7610 + }, + { + "epoch": 0.4644111440390666, + "grad_norm": 0.5246424674987793, + "learning_rate": 8e-05, + "loss": 1.4643, + "num_input_tokens_seen": 1075053120, + "step": 7620 + }, + { + "epoch": 0.4650206074826874, + "grad_norm": 0.49351179599761963, + "learning_rate": 8e-05, + "loss": 1.4539, + "num_input_tokens_seen": 1076464372, + "step": 7630 + }, + { + "epoch": 0.46563007092630826, + "grad_norm": 0.4650513827800751, + "learning_rate": 8e-05, + "loss": 1.4067, + "num_input_tokens_seen": 1077887580, + "step": 7640 + }, + { + "epoch": 0.4662395343699291, + "grad_norm": 0.5255919694900513, + "learning_rate": 8e-05, + "loss": 1.4461, + "num_input_tokens_seen": 1079314396, + "step": 7650 + }, + { + "epoch": 0.4668489978135499, + "grad_norm": 0.45597824454307556, + "learning_rate": 8e-05, + "loss": 1.4076, + "num_input_tokens_seen": 1080722664, + "step": 7660 + }, + { + "epoch": 0.4674584612571707, + "grad_norm": 0.5342453718185425, + "learning_rate": 8e-05, + "loss": 1.3912, + "num_input_tokens_seen": 1082104756, + "step": 7670 + }, + { + "epoch": 0.46806792470079156, + "grad_norm": 0.5388374328613281, + "learning_rate": 8e-05, + "loss": 1.4392, + "num_input_tokens_seen": 1083515704, + "step": 7680 + }, + { + "epoch": 0.46867738814441234, + "grad_norm": 0.5510755777359009, + "learning_rate": 8e-05, + "loss": 1.4074, + "num_input_tokens_seen": 1084952776, + "step": 7690 + }, + { + "epoch": 0.4692868515880332, + "grad_norm": 0.5435276627540588, + "learning_rate": 8e-05, + "loss": 1.4402, + "num_input_tokens_seen": 1086371416, + "step": 7700 + }, + { + "epoch": 0.469896315031654, + "grad_norm": 0.4792904853820801, + "learning_rate": 8e-05, + "loss": 1.3989, + "num_input_tokens_seen": 1087779788, + "step": 7710 + }, + { + "epoch": 0.4705057784752748, + "grad_norm": 0.4804374575614929, + "learning_rate": 8e-05, + "loss": 1.3157, + "num_input_tokens_seen": 1089174612, + "step": 7720 + }, + { + "epoch": 0.47111524191889564, + "grad_norm": 0.47840744256973267, + "learning_rate": 8e-05, + "loss": 1.3208, + "num_input_tokens_seen": 1090588704, + "step": 7730 + }, + { + "epoch": 0.4717247053625165, + "grad_norm": 0.5292723178863525, + "learning_rate": 8e-05, + "loss": 1.4032, + "num_input_tokens_seen": 1091995860, + "step": 7740 + }, + { + "epoch": 0.4723341688061373, + "grad_norm": 0.5833985209465027, + "learning_rate": 8e-05, + "loss": 1.4286, + "num_input_tokens_seen": 1093395952, + "step": 7750 + }, + { + "epoch": 0.4729436322497581, + "grad_norm": 0.48498985171318054, + "learning_rate": 8e-05, + "loss": 1.4765, + "num_input_tokens_seen": 1094805108, + "step": 7760 + }, + { + "epoch": 0.47355309569337894, + "grad_norm": 0.5416792631149292, + "learning_rate": 8e-05, + "loss": 1.4559, + "num_input_tokens_seen": 1096191520, + "step": 7770 + }, + { + "epoch": 0.4741625591369998, + "grad_norm": 0.5118204951286316, + "learning_rate": 8e-05, + "loss": 1.4098, + "num_input_tokens_seen": 1097626836, + "step": 7780 + }, + { + "epoch": 0.47477202258062057, + "grad_norm": 0.5373117327690125, + "learning_rate": 8e-05, + "loss": 1.4222, + "num_input_tokens_seen": 1099008448, + "step": 7790 + }, + { + "epoch": 0.4753814860242414, + "grad_norm": 0.634472668170929, + "learning_rate": 8e-05, + "loss": 1.4626, + "num_input_tokens_seen": 1100430712, + "step": 7800 + }, + { + "epoch": 0.47599094946786225, + "grad_norm": 0.5606759190559387, + "learning_rate": 8e-05, + "loss": 1.4181, + "num_input_tokens_seen": 1101864384, + "step": 7810 + }, + { + "epoch": 0.47660041291148303, + "grad_norm": 0.6342650651931763, + "learning_rate": 8e-05, + "loss": 1.3675, + "num_input_tokens_seen": 1103281588, + "step": 7820 + }, + { + "epoch": 0.47720987635510387, + "grad_norm": 0.5315597057342529, + "learning_rate": 8e-05, + "loss": 1.4346, + "num_input_tokens_seen": 1104678480, + "step": 7830 + }, + { + "epoch": 0.4778193397987247, + "grad_norm": 0.6562740802764893, + "learning_rate": 8e-05, + "loss": 1.4669, + "num_input_tokens_seen": 1106087552, + "step": 7840 + }, + { + "epoch": 0.47842880324234555, + "grad_norm": 0.5265791416168213, + "learning_rate": 8e-05, + "loss": 1.4921, + "num_input_tokens_seen": 1107533152, + "step": 7850 + }, + { + "epoch": 0.47903826668596633, + "grad_norm": 0.4799894094467163, + "learning_rate": 8e-05, + "loss": 1.4559, + "num_input_tokens_seen": 1108933276, + "step": 7860 + }, + { + "epoch": 0.47964773012958717, + "grad_norm": 0.5308011174201965, + "learning_rate": 8e-05, + "loss": 1.3988, + "num_input_tokens_seen": 1110344220, + "step": 7870 + }, + { + "epoch": 0.480257193573208, + "grad_norm": 0.5046650171279907, + "learning_rate": 8e-05, + "loss": 1.4366, + "num_input_tokens_seen": 1111735944, + "step": 7880 + }, + { + "epoch": 0.4808666570168288, + "grad_norm": 0.5083670616149902, + "learning_rate": 8e-05, + "loss": 1.4219, + "num_input_tokens_seen": 1113172872, + "step": 7890 + }, + { + "epoch": 0.48147612046044963, + "grad_norm": 0.5572594404220581, + "learning_rate": 8e-05, + "loss": 1.4198, + "num_input_tokens_seen": 1114579892, + "step": 7900 + }, + { + "epoch": 0.48208558390407047, + "grad_norm": 0.5634285807609558, + "learning_rate": 8e-05, + "loss": 1.468, + "num_input_tokens_seen": 1115978076, + "step": 7910 + }, + { + "epoch": 0.48269504734769125, + "grad_norm": 0.4851561486721039, + "learning_rate": 8e-05, + "loss": 1.3592, + "num_input_tokens_seen": 1117361664, + "step": 7920 + }, + { + "epoch": 0.4833045107913121, + "grad_norm": 0.4718756079673767, + "learning_rate": 8e-05, + "loss": 1.439, + "num_input_tokens_seen": 1118816252, + "step": 7930 + }, + { + "epoch": 0.48391397423493293, + "grad_norm": 0.5959479808807373, + "learning_rate": 8e-05, + "loss": 1.4517, + "num_input_tokens_seen": 1120264108, + "step": 7940 + }, + { + "epoch": 0.48452343767855377, + "grad_norm": 0.5836747288703918, + "learning_rate": 8e-05, + "loss": 1.4244, + "num_input_tokens_seen": 1121650124, + "step": 7950 + }, + { + "epoch": 0.48513290112217455, + "grad_norm": 0.5627513527870178, + "learning_rate": 8e-05, + "loss": 1.4043, + "num_input_tokens_seen": 1123080060, + "step": 7960 + }, + { + "epoch": 0.4857423645657954, + "grad_norm": 0.5587437152862549, + "learning_rate": 8e-05, + "loss": 1.4019, + "num_input_tokens_seen": 1124511240, + "step": 7970 + }, + { + "epoch": 0.48635182800941623, + "grad_norm": 0.6421571969985962, + "learning_rate": 8e-05, + "loss": 1.4026, + "num_input_tokens_seen": 1125934080, + "step": 7980 + }, + { + "epoch": 0.486961291453037, + "grad_norm": 0.4620668888092041, + "learning_rate": 8e-05, + "loss": 1.4142, + "num_input_tokens_seen": 1127349476, + "step": 7990 + }, + { + "epoch": 0.48757075489665785, + "grad_norm": 0.5190436840057373, + "learning_rate": 8e-05, + "loss": 1.4625, + "num_input_tokens_seen": 1128787572, + "step": 8000 + }, + { + "epoch": 0.4881802183402787, + "grad_norm": 0.5595162510871887, + "learning_rate": 8e-05, + "loss": 1.4101, + "num_input_tokens_seen": 1130217996, + "step": 8010 + }, + { + "epoch": 0.4887896817838995, + "grad_norm": 0.4808787703514099, + "learning_rate": 8e-05, + "loss": 1.4088, + "num_input_tokens_seen": 1131625032, + "step": 8020 + }, + { + "epoch": 0.4893991452275203, + "grad_norm": 0.5082031488418579, + "learning_rate": 8e-05, + "loss": 1.4811, + "num_input_tokens_seen": 1133074808, + "step": 8030 + }, + { + "epoch": 0.49000860867114115, + "grad_norm": 0.7117087841033936, + "learning_rate": 8e-05, + "loss": 1.3978, + "num_input_tokens_seen": 1134471984, + "step": 8040 + }, + { + "epoch": 0.490618072114762, + "grad_norm": 0.4917582869529724, + "learning_rate": 8e-05, + "loss": 1.4326, + "num_input_tokens_seen": 1135871844, + "step": 8050 + }, + { + "epoch": 0.4912275355583828, + "grad_norm": 0.5196986794471741, + "learning_rate": 8e-05, + "loss": 1.4705, + "num_input_tokens_seen": 1137251528, + "step": 8060 + }, + { + "epoch": 0.4918369990020036, + "grad_norm": 0.45626333355903625, + "learning_rate": 8e-05, + "loss": 1.4173, + "num_input_tokens_seen": 1138668772, + "step": 8070 + }, + { + "epoch": 0.49244646244562446, + "grad_norm": 0.7028423547744751, + "learning_rate": 8e-05, + "loss": 1.4415, + "num_input_tokens_seen": 1140072992, + "step": 8080 + }, + { + "epoch": 0.49305592588924524, + "grad_norm": 0.4503718912601471, + "learning_rate": 8e-05, + "loss": 1.4979, + "num_input_tokens_seen": 1141473780, + "step": 8090 + }, + { + "epoch": 0.4936653893328661, + "grad_norm": 0.49622249603271484, + "learning_rate": 8e-05, + "loss": 1.4959, + "num_input_tokens_seen": 1142896620, + "step": 8100 + }, + { + "epoch": 0.4942748527764869, + "grad_norm": 0.45446139574050903, + "learning_rate": 8e-05, + "loss": 1.4125, + "num_input_tokens_seen": 1144295328, + "step": 8110 + }, + { + "epoch": 0.4948843162201077, + "grad_norm": 0.4794887602329254, + "learning_rate": 8e-05, + "loss": 1.4337, + "num_input_tokens_seen": 1145761996, + "step": 8120 + }, + { + "epoch": 0.49549377966372854, + "grad_norm": 0.535640299320221, + "learning_rate": 8e-05, + "loss": 1.4061, + "num_input_tokens_seen": 1147167508, + "step": 8130 + }, + { + "epoch": 0.4961032431073494, + "grad_norm": 0.5999556183815002, + "learning_rate": 8e-05, + "loss": 1.3923, + "num_input_tokens_seen": 1148590576, + "step": 8140 + }, + { + "epoch": 0.49671270655097016, + "grad_norm": 0.4626942574977875, + "learning_rate": 8e-05, + "loss": 1.3859, + "num_input_tokens_seen": 1149968016, + "step": 8150 + }, + { + "epoch": 0.497322169994591, + "grad_norm": 0.5697856545448303, + "learning_rate": 8e-05, + "loss": 1.4115, + "num_input_tokens_seen": 1151379092, + "step": 8160 + }, + { + "epoch": 0.49793163343821184, + "grad_norm": 0.5212153196334839, + "learning_rate": 8e-05, + "loss": 1.4006, + "num_input_tokens_seen": 1152789164, + "step": 8170 + }, + { + "epoch": 0.4985410968818327, + "grad_norm": 0.5307455062866211, + "learning_rate": 8e-05, + "loss": 1.4147, + "num_input_tokens_seen": 1154215056, + "step": 8180 + }, + { + "epoch": 0.49915056032545346, + "grad_norm": 0.49318286776542664, + "learning_rate": 8e-05, + "loss": 1.3641, + "num_input_tokens_seen": 1155560728, + "step": 8190 + }, + { + "epoch": 0.4997600237690743, + "grad_norm": 0.5201637148857117, + "learning_rate": 8e-05, + "loss": 1.4909, + "num_input_tokens_seen": 1156956988, + "step": 8200 + }, + { + "epoch": 0.5003694872126951, + "grad_norm": 0.5208278894424438, + "learning_rate": 8e-05, + "loss": 1.4412, + "num_input_tokens_seen": 1158393764, + "step": 8210 + }, + { + "epoch": 0.5009789506563159, + "grad_norm": 0.5729420185089111, + "learning_rate": 8e-05, + "loss": 1.3848, + "num_input_tokens_seen": 1159824056, + "step": 8220 + }, + { + "epoch": 0.5015884140999368, + "grad_norm": 0.4863448739051819, + "learning_rate": 8e-05, + "loss": 1.3707, + "num_input_tokens_seen": 1161241976, + "step": 8230 + }, + { + "epoch": 0.5021978775435576, + "grad_norm": 0.544262707233429, + "learning_rate": 8e-05, + "loss": 1.452, + "num_input_tokens_seen": 1162622704, + "step": 8240 + }, + { + "epoch": 0.5028073409871784, + "grad_norm": 0.5542010068893433, + "learning_rate": 8e-05, + "loss": 1.4808, + "num_input_tokens_seen": 1164019924, + "step": 8250 + }, + { + "epoch": 0.5034168044307993, + "grad_norm": 0.466775506734848, + "learning_rate": 8e-05, + "loss": 1.4303, + "num_input_tokens_seen": 1165465436, + "step": 8260 + }, + { + "epoch": 0.5040262678744201, + "grad_norm": 0.6469402313232422, + "learning_rate": 8e-05, + "loss": 1.4665, + "num_input_tokens_seen": 1166886136, + "step": 8270 + }, + { + "epoch": 0.5046357313180408, + "grad_norm": 0.6200391054153442, + "learning_rate": 8e-05, + "loss": 1.4426, + "num_input_tokens_seen": 1168299392, + "step": 8280 + }, + { + "epoch": 0.5052451947616617, + "grad_norm": 0.5519371628761292, + "learning_rate": 8e-05, + "loss": 1.4484, + "num_input_tokens_seen": 1169721572, + "step": 8290 + }, + { + "epoch": 0.5058546582052825, + "grad_norm": 0.5275024771690369, + "learning_rate": 8e-05, + "loss": 1.3321, + "num_input_tokens_seen": 1171103648, + "step": 8300 + }, + { + "epoch": 0.5064641216489033, + "grad_norm": 0.522892951965332, + "learning_rate": 8e-05, + "loss": 1.3864, + "num_input_tokens_seen": 1172502288, + "step": 8310 + }, + { + "epoch": 0.5070735850925242, + "grad_norm": 0.47808873653411865, + "learning_rate": 8e-05, + "loss": 1.3831, + "num_input_tokens_seen": 1173938752, + "step": 8320 + }, + { + "epoch": 0.507683048536145, + "grad_norm": 0.6184147596359253, + "learning_rate": 8e-05, + "loss": 1.4538, + "num_input_tokens_seen": 1175364784, + "step": 8330 + }, + { + "epoch": 0.5082925119797658, + "grad_norm": 0.5707573890686035, + "learning_rate": 8e-05, + "loss": 1.3267, + "num_input_tokens_seen": 1176787692, + "step": 8340 + }, + { + "epoch": 0.5089019754233867, + "grad_norm": 0.5967716574668884, + "learning_rate": 8e-05, + "loss": 1.4053, + "num_input_tokens_seen": 1178179564, + "step": 8350 + }, + { + "epoch": 0.5095114388670074, + "grad_norm": 0.5708630681037903, + "learning_rate": 8e-05, + "loss": 1.478, + "num_input_tokens_seen": 1179580276, + "step": 8360 + }, + { + "epoch": 0.5101209023106282, + "grad_norm": 0.5356885194778442, + "learning_rate": 8e-05, + "loss": 1.4038, + "num_input_tokens_seen": 1180980160, + "step": 8370 + }, + { + "epoch": 0.5107303657542491, + "grad_norm": 0.6001008749008179, + "learning_rate": 8e-05, + "loss": 1.4924, + "num_input_tokens_seen": 1182394556, + "step": 8380 + }, + { + "epoch": 0.5113398291978699, + "grad_norm": 0.4999195635318756, + "learning_rate": 8e-05, + "loss": 1.3733, + "num_input_tokens_seen": 1183836592, + "step": 8390 + }, + { + "epoch": 0.5119492926414907, + "grad_norm": 0.5139046311378479, + "learning_rate": 8e-05, + "loss": 1.399, + "num_input_tokens_seen": 1185238392, + "step": 8400 + }, + { + "epoch": 0.5125587560851116, + "grad_norm": 0.614023745059967, + "learning_rate": 8e-05, + "loss": 1.426, + "num_input_tokens_seen": 1186634124, + "step": 8410 + }, + { + "epoch": 0.5131682195287324, + "grad_norm": 0.5585746765136719, + "learning_rate": 8e-05, + "loss": 1.4441, + "num_input_tokens_seen": 1188016924, + "step": 8420 + }, + { + "epoch": 0.5137776829723533, + "grad_norm": 0.6003695726394653, + "learning_rate": 8e-05, + "loss": 1.4042, + "num_input_tokens_seen": 1189415668, + "step": 8430 + }, + { + "epoch": 0.514387146415974, + "grad_norm": 0.6207943558692932, + "learning_rate": 8e-05, + "loss": 1.3925, + "num_input_tokens_seen": 1190833848, + "step": 8440 + }, + { + "epoch": 0.5149966098595948, + "grad_norm": 0.5384374260902405, + "learning_rate": 8e-05, + "loss": 1.3952, + "num_input_tokens_seen": 1192256296, + "step": 8450 + }, + { + "epoch": 0.5156060733032157, + "grad_norm": 0.5247148275375366, + "learning_rate": 8e-05, + "loss": 1.471, + "num_input_tokens_seen": 1193697300, + "step": 8460 + }, + { + "epoch": 0.5162155367468365, + "grad_norm": 0.5606392621994019, + "learning_rate": 8e-05, + "loss": 1.5178, + "num_input_tokens_seen": 1195149596, + "step": 8470 + }, + { + "epoch": 0.5168250001904573, + "grad_norm": 0.5033918619155884, + "learning_rate": 8e-05, + "loss": 1.3954, + "num_input_tokens_seen": 1196543328, + "step": 8480 + }, + { + "epoch": 0.5174344636340782, + "grad_norm": 0.5382705330848694, + "learning_rate": 8e-05, + "loss": 1.3954, + "num_input_tokens_seen": 1197953020, + "step": 8490 + }, + { + "epoch": 0.518043927077699, + "grad_norm": 0.508390486240387, + "learning_rate": 8e-05, + "loss": 1.448, + "num_input_tokens_seen": 1199356544, + "step": 8500 + }, + { + "epoch": 0.5186533905213198, + "grad_norm": 0.49335700273513794, + "learning_rate": 8e-05, + "loss": 1.4365, + "num_input_tokens_seen": 1200772144, + "step": 8510 + }, + { + "epoch": 0.5192628539649407, + "grad_norm": 0.511746883392334, + "learning_rate": 8e-05, + "loss": 1.4413, + "num_input_tokens_seen": 1202172256, + "step": 8520 + }, + { + "epoch": 0.5198723174085614, + "grad_norm": 0.7011821269989014, + "learning_rate": 8e-05, + "loss": 1.4147, + "num_input_tokens_seen": 1203597072, + "step": 8530 + }, + { + "epoch": 0.5204817808521822, + "grad_norm": 0.612265408039093, + "learning_rate": 8e-05, + "loss": 1.4681, + "num_input_tokens_seen": 1204985968, + "step": 8540 + }, + { + "epoch": 0.5210912442958031, + "grad_norm": 0.510905385017395, + "learning_rate": 8e-05, + "loss": 1.4615, + "num_input_tokens_seen": 1206391220, + "step": 8550 + }, + { + "epoch": 0.5217007077394239, + "grad_norm": 0.6800599694252014, + "learning_rate": 8e-05, + "loss": 1.4496, + "num_input_tokens_seen": 1207786772, + "step": 8560 + }, + { + "epoch": 0.5223101711830447, + "grad_norm": 0.6727950572967529, + "learning_rate": 8e-05, + "loss": 1.4528, + "num_input_tokens_seen": 1209206112, + "step": 8570 + }, + { + "epoch": 0.5229196346266656, + "grad_norm": 0.6258131265640259, + "learning_rate": 8e-05, + "loss": 1.3601, + "num_input_tokens_seen": 1210620340, + "step": 8580 + }, + { + "epoch": 0.5235290980702864, + "grad_norm": 0.5519835948944092, + "learning_rate": 8e-05, + "loss": 1.3857, + "num_input_tokens_seen": 1212035396, + "step": 8590 + }, + { + "epoch": 0.5241385615139071, + "grad_norm": 0.49297013878822327, + "learning_rate": 8e-05, + "loss": 1.4223, + "num_input_tokens_seen": 1213445184, + "step": 8600 + }, + { + "epoch": 0.524748024957528, + "grad_norm": 0.47780099511146545, + "learning_rate": 8e-05, + "loss": 1.378, + "num_input_tokens_seen": 1214854316, + "step": 8610 + }, + { + "epoch": 0.5253574884011488, + "grad_norm": 0.5096369981765747, + "learning_rate": 8e-05, + "loss": 1.4467, + "num_input_tokens_seen": 1216250744, + "step": 8620 + }, + { + "epoch": 0.5259669518447697, + "grad_norm": 0.5387448072433472, + "learning_rate": 8e-05, + "loss": 1.4201, + "num_input_tokens_seen": 1217686440, + "step": 8630 + }, + { + "epoch": 0.5265764152883905, + "grad_norm": 0.5782696604728699, + "learning_rate": 8e-05, + "loss": 1.4183, + "num_input_tokens_seen": 1219090264, + "step": 8640 + }, + { + "epoch": 0.5271858787320113, + "grad_norm": 0.4248006343841553, + "learning_rate": 8e-05, + "loss": 1.5077, + "num_input_tokens_seen": 1220493872, + "step": 8650 + }, + { + "epoch": 0.5277953421756322, + "grad_norm": 0.5155441164970398, + "learning_rate": 8e-05, + "loss": 1.461, + "num_input_tokens_seen": 1221904396, + "step": 8660 + }, + { + "epoch": 0.528404805619253, + "grad_norm": 0.7794909477233887, + "learning_rate": 8e-05, + "loss": 1.359, + "num_input_tokens_seen": 1223272932, + "step": 8670 + }, + { + "epoch": 0.5290142690628737, + "grad_norm": 0.5529311299324036, + "learning_rate": 8e-05, + "loss": 1.4378, + "num_input_tokens_seen": 1224665304, + "step": 8680 + }, + { + "epoch": 0.5296237325064946, + "grad_norm": 0.5060685276985168, + "learning_rate": 8e-05, + "loss": 1.4173, + "num_input_tokens_seen": 1226062180, + "step": 8690 + }, + { + "epoch": 0.5302331959501154, + "grad_norm": 0.4951160252094269, + "learning_rate": 8e-05, + "loss": 1.4475, + "num_input_tokens_seen": 1227516508, + "step": 8700 + }, + { + "epoch": 0.5308426593937362, + "grad_norm": 0.48395630717277527, + "learning_rate": 8e-05, + "loss": 1.374, + "num_input_tokens_seen": 1228928164, + "step": 8710 + }, + { + "epoch": 0.5314521228373571, + "grad_norm": 0.5231735110282898, + "learning_rate": 8e-05, + "loss": 1.3518, + "num_input_tokens_seen": 1230332108, + "step": 8720 + }, + { + "epoch": 0.5320615862809779, + "grad_norm": 0.6426222324371338, + "learning_rate": 8e-05, + "loss": 1.3299, + "num_input_tokens_seen": 1231769688, + "step": 8730 + }, + { + "epoch": 0.5326710497245987, + "grad_norm": 0.496803343296051, + "learning_rate": 8e-05, + "loss": 1.3993, + "num_input_tokens_seen": 1233210764, + "step": 8740 + }, + { + "epoch": 0.5332805131682196, + "grad_norm": 0.4480886459350586, + "learning_rate": 8e-05, + "loss": 1.4256, + "num_input_tokens_seen": 1234671524, + "step": 8750 + }, + { + "epoch": 0.5338899766118403, + "grad_norm": 0.48204657435417175, + "learning_rate": 8e-05, + "loss": 1.3814, + "num_input_tokens_seen": 1236085908, + "step": 8760 + }, + { + "epoch": 0.5344994400554611, + "grad_norm": 0.5191913843154907, + "learning_rate": 8e-05, + "loss": 1.3768, + "num_input_tokens_seen": 1237467536, + "step": 8770 + }, + { + "epoch": 0.535108903499082, + "grad_norm": 0.5462549924850464, + "learning_rate": 8e-05, + "loss": 1.415, + "num_input_tokens_seen": 1238886584, + "step": 8780 + }, + { + "epoch": 0.5357183669427028, + "grad_norm": 0.574504554271698, + "learning_rate": 8e-05, + "loss": 1.4029, + "num_input_tokens_seen": 1240281716, + "step": 8790 + }, + { + "epoch": 0.5363278303863236, + "grad_norm": 0.5356009602546692, + "learning_rate": 8e-05, + "loss": 1.4261, + "num_input_tokens_seen": 1241675192, + "step": 8800 + }, + { + "epoch": 0.5369372938299445, + "grad_norm": 0.5325603485107422, + "learning_rate": 8e-05, + "loss": 1.3992, + "num_input_tokens_seen": 1243056036, + "step": 8810 + }, + { + "epoch": 0.5375467572735653, + "grad_norm": 0.47047409415245056, + "learning_rate": 8e-05, + "loss": 1.3463, + "num_input_tokens_seen": 1244426652, + "step": 8820 + }, + { + "epoch": 0.5381562207171862, + "grad_norm": 0.5718605518341064, + "learning_rate": 8e-05, + "loss": 1.4106, + "num_input_tokens_seen": 1245827804, + "step": 8830 + }, + { + "epoch": 0.538765684160807, + "grad_norm": 0.5218635201454163, + "learning_rate": 8e-05, + "loss": 1.4177, + "num_input_tokens_seen": 1247204320, + "step": 8840 + }, + { + "epoch": 0.5393751476044277, + "grad_norm": 0.6211861968040466, + "learning_rate": 8e-05, + "loss": 1.5218, + "num_input_tokens_seen": 1248618312, + "step": 8850 + }, + { + "epoch": 0.5399846110480486, + "grad_norm": 0.49085572361946106, + "learning_rate": 8e-05, + "loss": 1.4029, + "num_input_tokens_seen": 1250024496, + "step": 8860 + }, + { + "epoch": 0.5405940744916694, + "grad_norm": 0.5032560229301453, + "learning_rate": 8e-05, + "loss": 1.4333, + "num_input_tokens_seen": 1251433236, + "step": 8870 + }, + { + "epoch": 0.5412035379352902, + "grad_norm": 0.7042189240455627, + "learning_rate": 8e-05, + "loss": 1.3757, + "num_input_tokens_seen": 1252853548, + "step": 8880 + }, + { + "epoch": 0.5418130013789111, + "grad_norm": 0.5665264129638672, + "learning_rate": 8e-05, + "loss": 1.413, + "num_input_tokens_seen": 1254255272, + "step": 8890 + }, + { + "epoch": 0.5424224648225319, + "grad_norm": 0.5010056495666504, + "learning_rate": 8e-05, + "loss": 1.328, + "num_input_tokens_seen": 1255651628, + "step": 8900 + }, + { + "epoch": 0.5430319282661527, + "grad_norm": 0.5278030037879944, + "learning_rate": 8e-05, + "loss": 1.4238, + "num_input_tokens_seen": 1257075628, + "step": 8910 + }, + { + "epoch": 0.5436413917097735, + "grad_norm": 0.5137773156166077, + "learning_rate": 8e-05, + "loss": 1.4509, + "num_input_tokens_seen": 1258492128, + "step": 8920 + }, + { + "epoch": 0.5442508551533943, + "grad_norm": 0.49045154452323914, + "learning_rate": 8e-05, + "loss": 1.473, + "num_input_tokens_seen": 1259921876, + "step": 8930 + }, + { + "epoch": 0.5448603185970151, + "grad_norm": 0.5632349848747253, + "learning_rate": 8e-05, + "loss": 1.4155, + "num_input_tokens_seen": 1261282328, + "step": 8940 + }, + { + "epoch": 0.545469782040636, + "grad_norm": 0.4805395007133484, + "learning_rate": 8e-05, + "loss": 1.445, + "num_input_tokens_seen": 1262718360, + "step": 8950 + }, + { + "epoch": 0.5460792454842568, + "grad_norm": 0.5406620502471924, + "learning_rate": 8e-05, + "loss": 1.3724, + "num_input_tokens_seen": 1264130680, + "step": 8960 + }, + { + "epoch": 0.5466887089278776, + "grad_norm": 0.5531741976737976, + "learning_rate": 8e-05, + "loss": 1.4404, + "num_input_tokens_seen": 1265513744, + "step": 8970 + }, + { + "epoch": 0.5472981723714985, + "grad_norm": 0.5422140955924988, + "learning_rate": 8e-05, + "loss": 1.3957, + "num_input_tokens_seen": 1266903584, + "step": 8980 + }, + { + "epoch": 0.5479076358151193, + "grad_norm": 0.5059279203414917, + "learning_rate": 8e-05, + "loss": 1.36, + "num_input_tokens_seen": 1268300852, + "step": 8990 + }, + { + "epoch": 0.54851709925874, + "grad_norm": 0.4933377504348755, + "learning_rate": 8e-05, + "loss": 1.3714, + "num_input_tokens_seen": 1269717552, + "step": 9000 + }, + { + "epoch": 0.5491265627023609, + "grad_norm": 0.6613243818283081, + "learning_rate": 8e-05, + "loss": 1.4211, + "num_input_tokens_seen": 1271115644, + "step": 9010 + }, + { + "epoch": 0.5497360261459817, + "grad_norm": 0.49182480573654175, + "learning_rate": 8e-05, + "loss": 1.352, + "num_input_tokens_seen": 1272504180, + "step": 9020 + }, + { + "epoch": 0.5503454895896025, + "grad_norm": 0.5405098795890808, + "learning_rate": 8e-05, + "loss": 1.4031, + "num_input_tokens_seen": 1273932024, + "step": 9030 + }, + { + "epoch": 0.5509549530332234, + "grad_norm": 0.5463997721672058, + "learning_rate": 8e-05, + "loss": 1.5083, + "num_input_tokens_seen": 1275332648, + "step": 9040 + }, + { + "epoch": 0.5515644164768442, + "grad_norm": 0.5602612495422363, + "learning_rate": 8e-05, + "loss": 1.3726, + "num_input_tokens_seen": 1276735532, + "step": 9050 + }, + { + "epoch": 0.5521738799204651, + "grad_norm": 0.5101543068885803, + "learning_rate": 8e-05, + "loss": 1.3778, + "num_input_tokens_seen": 1278146092, + "step": 9060 + }, + { + "epoch": 0.5527833433640859, + "grad_norm": 0.5481278896331787, + "learning_rate": 8e-05, + "loss": 1.4419, + "num_input_tokens_seen": 1279583276, + "step": 9070 + }, + { + "epoch": 0.5533928068077066, + "grad_norm": 0.4749618172645569, + "learning_rate": 8e-05, + "loss": 1.4162, + "num_input_tokens_seen": 1281000036, + "step": 9080 + }, + { + "epoch": 0.5540022702513275, + "grad_norm": 0.5409462451934814, + "learning_rate": 8e-05, + "loss": 1.3868, + "num_input_tokens_seen": 1282445072, + "step": 9090 + }, + { + "epoch": 0.5546117336949483, + "grad_norm": 0.5022688508033752, + "learning_rate": 8e-05, + "loss": 1.4162, + "num_input_tokens_seen": 1283856564, + "step": 9100 + }, + { + "epoch": 0.5552211971385691, + "grad_norm": 0.4955657124519348, + "learning_rate": 8e-05, + "loss": 1.41, + "num_input_tokens_seen": 1285249116, + "step": 9110 + }, + { + "epoch": 0.55583066058219, + "grad_norm": 0.4351109564304352, + "learning_rate": 8e-05, + "loss": 1.4364, + "num_input_tokens_seen": 1286611828, + "step": 9120 + }, + { + "epoch": 0.5564401240258108, + "grad_norm": 0.47502756118774414, + "learning_rate": 8e-05, + "loss": 1.3688, + "num_input_tokens_seen": 1288059680, + "step": 9130 + }, + { + "epoch": 0.5570495874694316, + "grad_norm": 0.5219740867614746, + "learning_rate": 8e-05, + "loss": 1.3995, + "num_input_tokens_seen": 1289506464, + "step": 9140 + }, + { + "epoch": 0.5576590509130525, + "grad_norm": 0.5647391676902771, + "learning_rate": 8e-05, + "loss": 1.3867, + "num_input_tokens_seen": 1290890052, + "step": 9150 + }, + { + "epoch": 0.5582685143566732, + "grad_norm": 0.5420753359794617, + "learning_rate": 8e-05, + "loss": 1.4256, + "num_input_tokens_seen": 1292311544, + "step": 9160 + }, + { + "epoch": 0.558877977800294, + "grad_norm": 0.629084050655365, + "learning_rate": 8e-05, + "loss": 1.422, + "num_input_tokens_seen": 1293705252, + "step": 9170 + }, + { + "epoch": 0.5594874412439149, + "grad_norm": 0.5770242214202881, + "learning_rate": 8e-05, + "loss": 1.3623, + "num_input_tokens_seen": 1295093572, + "step": 9180 + }, + { + "epoch": 0.5600969046875357, + "grad_norm": 0.5409170985221863, + "learning_rate": 8e-05, + "loss": 1.4737, + "num_input_tokens_seen": 1296519364, + "step": 9190 + }, + { + "epoch": 0.5607063681311565, + "grad_norm": 0.5729120373725891, + "learning_rate": 8e-05, + "loss": 1.4459, + "num_input_tokens_seen": 1297897980, + "step": 9200 + }, + { + "epoch": 0.5613158315747774, + "grad_norm": 0.5346562266349792, + "learning_rate": 8e-05, + "loss": 1.4169, + "num_input_tokens_seen": 1299282408, + "step": 9210 + }, + { + "epoch": 0.5619252950183982, + "grad_norm": 0.6186701655387878, + "learning_rate": 8e-05, + "loss": 1.3818, + "num_input_tokens_seen": 1300736024, + "step": 9220 + }, + { + "epoch": 0.562534758462019, + "grad_norm": 0.5668373107910156, + "learning_rate": 8e-05, + "loss": 1.4676, + "num_input_tokens_seen": 1302156036, + "step": 9230 + }, + { + "epoch": 0.5631442219056398, + "grad_norm": 0.603315532207489, + "learning_rate": 8e-05, + "loss": 1.3285, + "num_input_tokens_seen": 1303545448, + "step": 9240 + }, + { + "epoch": 0.5637536853492606, + "grad_norm": 0.5525988936424255, + "learning_rate": 8e-05, + "loss": 1.4286, + "num_input_tokens_seen": 1304987840, + "step": 9250 + }, + { + "epoch": 0.5643631487928815, + "grad_norm": 0.5281280279159546, + "learning_rate": 8e-05, + "loss": 1.4153, + "num_input_tokens_seen": 1306349096, + "step": 9260 + }, + { + "epoch": 0.5649726122365023, + "grad_norm": 0.539382815361023, + "learning_rate": 8e-05, + "loss": 1.4315, + "num_input_tokens_seen": 1307741544, + "step": 9270 + }, + { + "epoch": 0.5655820756801231, + "grad_norm": 0.49413955211639404, + "learning_rate": 8e-05, + "loss": 1.422, + "num_input_tokens_seen": 1309121068, + "step": 9280 + }, + { + "epoch": 0.566191539123744, + "grad_norm": 0.7388852834701538, + "learning_rate": 8e-05, + "loss": 1.3662, + "num_input_tokens_seen": 1310517248, + "step": 9290 + }, + { + "epoch": 0.5668010025673648, + "grad_norm": 0.5051785707473755, + "learning_rate": 8e-05, + "loss": 1.3247, + "num_input_tokens_seen": 1311906244, + "step": 9300 + }, + { + "epoch": 0.5674104660109855, + "grad_norm": 0.5112394094467163, + "learning_rate": 8e-05, + "loss": 1.3472, + "num_input_tokens_seen": 1313330152, + "step": 9310 + }, + { + "epoch": 0.5680199294546064, + "grad_norm": 0.5090803503990173, + "learning_rate": 8e-05, + "loss": 1.3978, + "num_input_tokens_seen": 1314772320, + "step": 9320 + }, + { + "epoch": 0.5686293928982272, + "grad_norm": 0.4785972237586975, + "learning_rate": 8e-05, + "loss": 1.3579, + "num_input_tokens_seen": 1316168788, + "step": 9330 + }, + { + "epoch": 0.569238856341848, + "grad_norm": 0.565299928188324, + "learning_rate": 8e-05, + "loss": 1.386, + "num_input_tokens_seen": 1317603684, + "step": 9340 + }, + { + "epoch": 0.5698483197854689, + "grad_norm": 0.5050956010818481, + "learning_rate": 8e-05, + "loss": 1.4233, + "num_input_tokens_seen": 1319033440, + "step": 9350 + }, + { + "epoch": 0.5704577832290897, + "grad_norm": 0.5199946761131287, + "learning_rate": 8e-05, + "loss": 1.4047, + "num_input_tokens_seen": 1320462816, + "step": 9360 + }, + { + "epoch": 0.5710672466727105, + "grad_norm": 0.47691354155540466, + "learning_rate": 8e-05, + "loss": 1.3643, + "num_input_tokens_seen": 1321889000, + "step": 9370 + }, + { + "epoch": 0.5716767101163314, + "grad_norm": 0.5591989159584045, + "learning_rate": 8e-05, + "loss": 1.4623, + "num_input_tokens_seen": 1323308556, + "step": 9380 + }, + { + "epoch": 0.5722861735599521, + "grad_norm": 0.516518771648407, + "learning_rate": 8e-05, + "loss": 1.4165, + "num_input_tokens_seen": 1324702200, + "step": 9390 + }, + { + "epoch": 0.5728956370035729, + "grad_norm": 0.5647040009498596, + "learning_rate": 8e-05, + "loss": 1.3805, + "num_input_tokens_seen": 1326076652, + "step": 9400 + }, + { + "epoch": 0.5735051004471938, + "grad_norm": 0.5209704637527466, + "learning_rate": 8e-05, + "loss": 1.3816, + "num_input_tokens_seen": 1327492928, + "step": 9410 + }, + { + "epoch": 0.5741145638908146, + "grad_norm": 0.6285092234611511, + "learning_rate": 8e-05, + "loss": 1.3143, + "num_input_tokens_seen": 1328873088, + "step": 9420 + }, + { + "epoch": 0.5747240273344354, + "grad_norm": 0.42731142044067383, + "learning_rate": 8e-05, + "loss": 1.3784, + "num_input_tokens_seen": 1330316768, + "step": 9430 + }, + { + "epoch": 0.5753334907780563, + "grad_norm": 0.500942051410675, + "learning_rate": 8e-05, + "loss": 1.3928, + "num_input_tokens_seen": 1331718868, + "step": 9440 + }, + { + "epoch": 0.5759429542216771, + "grad_norm": 0.5587007403373718, + "learning_rate": 8e-05, + "loss": 1.423, + "num_input_tokens_seen": 1333111484, + "step": 9450 + }, + { + "epoch": 0.576552417665298, + "grad_norm": 0.48135367035865784, + "learning_rate": 8e-05, + "loss": 1.4287, + "num_input_tokens_seen": 1334496056, + "step": 9460 + }, + { + "epoch": 0.5771618811089188, + "grad_norm": 0.5771949887275696, + "learning_rate": 8e-05, + "loss": 1.4181, + "num_input_tokens_seen": 1335922768, + "step": 9470 + }, + { + "epoch": 0.5777713445525395, + "grad_norm": 0.5315853953361511, + "learning_rate": 8e-05, + "loss": 1.4274, + "num_input_tokens_seen": 1337302964, + "step": 9480 + }, + { + "epoch": 0.5783808079961604, + "grad_norm": 0.5134252309799194, + "learning_rate": 8e-05, + "loss": 1.4307, + "num_input_tokens_seen": 1338719692, + "step": 9490 + }, + { + "epoch": 0.5789902714397812, + "grad_norm": 0.5051198601722717, + "learning_rate": 8e-05, + "loss": 1.496, + "num_input_tokens_seen": 1340134348, + "step": 9500 + }, + { + "epoch": 0.579599734883402, + "grad_norm": 0.549379289150238, + "learning_rate": 8e-05, + "loss": 1.4031, + "num_input_tokens_seen": 1341522456, + "step": 9510 + }, + { + "epoch": 0.5802091983270229, + "grad_norm": 0.56511390209198, + "learning_rate": 8e-05, + "loss": 1.3683, + "num_input_tokens_seen": 1342933524, + "step": 9520 + }, + { + "epoch": 0.5808186617706437, + "grad_norm": 0.55156409740448, + "learning_rate": 8e-05, + "loss": 1.4012, + "num_input_tokens_seen": 1344352168, + "step": 9530 + }, + { + "epoch": 0.5814281252142645, + "grad_norm": 0.4940817654132843, + "learning_rate": 8e-05, + "loss": 1.4177, + "num_input_tokens_seen": 1345777572, + "step": 9540 + }, + { + "epoch": 0.5820375886578854, + "grad_norm": 0.4556514620780945, + "learning_rate": 8e-05, + "loss": 1.3985, + "num_input_tokens_seen": 1347204108, + "step": 9550 + }, + { + "epoch": 0.5826470521015061, + "grad_norm": 0.5312452912330627, + "learning_rate": 8e-05, + "loss": 1.3973, + "num_input_tokens_seen": 1348650808, + "step": 9560 + }, + { + "epoch": 0.5832565155451269, + "grad_norm": 0.5042407512664795, + "learning_rate": 8e-05, + "loss": 1.4426, + "num_input_tokens_seen": 1350015484, + "step": 9570 + }, + { + "epoch": 0.5838659789887478, + "grad_norm": 0.549752950668335, + "learning_rate": 8e-05, + "loss": 1.4271, + "num_input_tokens_seen": 1351431476, + "step": 9580 + }, + { + "epoch": 0.5844754424323686, + "grad_norm": 0.5132181644439697, + "learning_rate": 8e-05, + "loss": 1.3289, + "num_input_tokens_seen": 1352869096, + "step": 9590 + }, + { + "epoch": 0.5850849058759894, + "grad_norm": 0.6704673767089844, + "learning_rate": 8e-05, + "loss": 1.3606, + "num_input_tokens_seen": 1354273284, + "step": 9600 + }, + { + "epoch": 0.5856943693196103, + "grad_norm": 0.5381009578704834, + "learning_rate": 8e-05, + "loss": 1.3736, + "num_input_tokens_seen": 1355701548, + "step": 9610 + }, + { + "epoch": 0.5863038327632311, + "grad_norm": 0.4848478138446808, + "learning_rate": 8e-05, + "loss": 1.4624, + "num_input_tokens_seen": 1357098184, + "step": 9620 + }, + { + "epoch": 0.5869132962068518, + "grad_norm": 0.5218216180801392, + "learning_rate": 8e-05, + "loss": 1.3911, + "num_input_tokens_seen": 1358496652, + "step": 9630 + }, + { + "epoch": 0.5875227596504727, + "grad_norm": 0.5365182757377625, + "learning_rate": 8e-05, + "loss": 1.4491, + "num_input_tokens_seen": 1359933264, + "step": 9640 + }, + { + "epoch": 0.5881322230940935, + "grad_norm": 0.5306704044342041, + "learning_rate": 8e-05, + "loss": 1.4075, + "num_input_tokens_seen": 1361319892, + "step": 9650 + }, + { + "epoch": 0.5887416865377144, + "grad_norm": 0.5075612664222717, + "learning_rate": 8e-05, + "loss": 1.3719, + "num_input_tokens_seen": 1362739532, + "step": 9660 + }, + { + "epoch": 0.5893511499813352, + "grad_norm": 0.5192306041717529, + "learning_rate": 8e-05, + "loss": 1.3277, + "num_input_tokens_seen": 1364184264, + "step": 9670 + }, + { + "epoch": 0.589960613424956, + "grad_norm": 0.5479230880737305, + "learning_rate": 8e-05, + "loss": 1.4084, + "num_input_tokens_seen": 1365614432, + "step": 9680 + }, + { + "epoch": 0.5905700768685769, + "grad_norm": 0.6008402109146118, + "learning_rate": 8e-05, + "loss": 1.4572, + "num_input_tokens_seen": 1367052460, + "step": 9690 + }, + { + "epoch": 0.5911795403121977, + "grad_norm": 0.42700648307800293, + "learning_rate": 8e-05, + "loss": 1.3478, + "num_input_tokens_seen": 1368451128, + "step": 9700 + }, + { + "epoch": 0.5917890037558184, + "grad_norm": 0.5453005433082581, + "learning_rate": 8e-05, + "loss": 1.4283, + "num_input_tokens_seen": 1369849936, + "step": 9710 + }, + { + "epoch": 0.5923984671994393, + "grad_norm": 0.5360944271087646, + "learning_rate": 8e-05, + "loss": 1.4105, + "num_input_tokens_seen": 1371263976, + "step": 9720 + }, + { + "epoch": 0.5930079306430601, + "grad_norm": 0.5020470023155212, + "learning_rate": 8e-05, + "loss": 1.39, + "num_input_tokens_seen": 1372649640, + "step": 9730 + }, + { + "epoch": 0.5936173940866809, + "grad_norm": 0.6608033180236816, + "learning_rate": 8e-05, + "loss": 1.3897, + "num_input_tokens_seen": 1374020584, + "step": 9740 + }, + { + "epoch": 0.5942268575303018, + "grad_norm": 0.4657529294490814, + "learning_rate": 8e-05, + "loss": 1.3444, + "num_input_tokens_seen": 1375393228, + "step": 9750 + }, + { + "epoch": 0.5948363209739226, + "grad_norm": 0.6202511191368103, + "learning_rate": 8e-05, + "loss": 1.4421, + "num_input_tokens_seen": 1376785448, + "step": 9760 + }, + { + "epoch": 0.5954457844175434, + "grad_norm": 0.48330217599868774, + "learning_rate": 8e-05, + "loss": 1.3878, + "num_input_tokens_seen": 1378221156, + "step": 9770 + }, + { + "epoch": 0.5960552478611643, + "grad_norm": 0.5171040296554565, + "learning_rate": 8e-05, + "loss": 1.406, + "num_input_tokens_seen": 1379642012, + "step": 9780 + }, + { + "epoch": 0.596664711304785, + "grad_norm": 0.5539959669113159, + "learning_rate": 8e-05, + "loss": 1.3386, + "num_input_tokens_seen": 1381034376, + "step": 9790 + }, + { + "epoch": 0.5972741747484058, + "grad_norm": 0.5682281851768494, + "learning_rate": 8e-05, + "loss": 1.3227, + "num_input_tokens_seen": 1382464908, + "step": 9800 + }, + { + "epoch": 0.5978836381920267, + "grad_norm": 0.5348504781723022, + "learning_rate": 8e-05, + "loss": 1.3212, + "num_input_tokens_seen": 1383873968, + "step": 9810 + }, + { + "epoch": 0.5984931016356475, + "grad_norm": 0.5483260750770569, + "learning_rate": 8e-05, + "loss": 1.4158, + "num_input_tokens_seen": 1385281380, + "step": 9820 + }, + { + "epoch": 0.5991025650792683, + "grad_norm": 0.4727989137172699, + "learning_rate": 8e-05, + "loss": 1.3502, + "num_input_tokens_seen": 1386679784, + "step": 9830 + }, + { + "epoch": 0.5997120285228892, + "grad_norm": 0.5440919399261475, + "learning_rate": 8e-05, + "loss": 1.5048, + "num_input_tokens_seen": 1388088844, + "step": 9840 + }, + { + "epoch": 0.60032149196651, + "grad_norm": 0.5022852420806885, + "learning_rate": 8e-05, + "loss": 1.3446, + "num_input_tokens_seen": 1389562772, + "step": 9850 + }, + { + "epoch": 0.6009309554101308, + "grad_norm": 0.5559846758842468, + "learning_rate": 8e-05, + "loss": 1.4179, + "num_input_tokens_seen": 1390956260, + "step": 9860 + }, + { + "epoch": 0.6015404188537516, + "grad_norm": 0.5346333384513855, + "learning_rate": 8e-05, + "loss": 1.3865, + "num_input_tokens_seen": 1392385600, + "step": 9870 + }, + { + "epoch": 0.6021498822973724, + "grad_norm": 0.5209230780601501, + "learning_rate": 8e-05, + "loss": 1.3285, + "num_input_tokens_seen": 1393782632, + "step": 9880 + }, + { + "epoch": 0.6027593457409933, + "grad_norm": 0.5146684646606445, + "learning_rate": 8e-05, + "loss": 1.419, + "num_input_tokens_seen": 1395203648, + "step": 9890 + }, + { + "epoch": 0.6033688091846141, + "grad_norm": 0.49187934398651123, + "learning_rate": 8e-05, + "loss": 1.3834, + "num_input_tokens_seen": 1396636232, + "step": 9900 + }, + { + "epoch": 0.6039782726282349, + "grad_norm": 0.5156924724578857, + "learning_rate": 8e-05, + "loss": 1.3366, + "num_input_tokens_seen": 1398070672, + "step": 9910 + }, + { + "epoch": 0.6045877360718558, + "grad_norm": 0.5331023931503296, + "learning_rate": 8e-05, + "loss": 1.3713, + "num_input_tokens_seen": 1399473452, + "step": 9920 + }, + { + "epoch": 0.6051971995154766, + "grad_norm": 0.6120520234107971, + "learning_rate": 8e-05, + "loss": 1.4259, + "num_input_tokens_seen": 1400826964, + "step": 9930 + }, + { + "epoch": 0.6058066629590974, + "grad_norm": 0.4598010182380676, + "learning_rate": 8e-05, + "loss": 1.3581, + "num_input_tokens_seen": 1402235776, + "step": 9940 + }, + { + "epoch": 0.6064161264027182, + "grad_norm": 0.4815860688686371, + "learning_rate": 8e-05, + "loss": 1.3952, + "num_input_tokens_seen": 1403663484, + "step": 9950 + }, + { + "epoch": 0.607025589846339, + "grad_norm": 0.5356654524803162, + "learning_rate": 8e-05, + "loss": 1.4294, + "num_input_tokens_seen": 1405093388, + "step": 9960 + }, + { + "epoch": 0.6076350532899598, + "grad_norm": 0.4952469766139984, + "learning_rate": 8e-05, + "loss": 1.4177, + "num_input_tokens_seen": 1406509716, + "step": 9970 + }, + { + "epoch": 0.6082445167335807, + "grad_norm": 0.497787207365036, + "learning_rate": 8e-05, + "loss": 1.39, + "num_input_tokens_seen": 1407883080, + "step": 9980 + }, + { + "epoch": 0.6088539801772015, + "grad_norm": 0.41468948125839233, + "learning_rate": 8e-05, + "loss": 1.3927, + "num_input_tokens_seen": 1409283056, + "step": 9990 + }, + { + "epoch": 0.6094634436208223, + "grad_norm": 0.45570316910743713, + "learning_rate": 8e-05, + "loss": 1.289, + "num_input_tokens_seen": 1410699192, + "step": 10000 + }, + { + "epoch": 0.6100729070644432, + "grad_norm": 0.5052545666694641, + "learning_rate": 8e-05, + "loss": 1.3546, + "num_input_tokens_seen": 1412106116, + "step": 10010 + }, + { + "epoch": 0.610682370508064, + "grad_norm": 0.5244643688201904, + "learning_rate": 8e-05, + "loss": 1.3356, + "num_input_tokens_seen": 1413547016, + "step": 10020 + }, + { + "epoch": 0.6112918339516847, + "grad_norm": 0.5760179758071899, + "learning_rate": 8e-05, + "loss": 1.389, + "num_input_tokens_seen": 1414973076, + "step": 10030 + }, + { + "epoch": 0.6119012973953056, + "grad_norm": 0.5389821529388428, + "learning_rate": 8e-05, + "loss": 1.3859, + "num_input_tokens_seen": 1416396436, + "step": 10040 + }, + { + "epoch": 0.6125107608389264, + "grad_norm": 0.5720279812812805, + "learning_rate": 8e-05, + "loss": 1.349, + "num_input_tokens_seen": 1417828128, + "step": 10050 + }, + { + "epoch": 0.6131202242825472, + "grad_norm": 0.5402315258979797, + "learning_rate": 8e-05, + "loss": 1.4391, + "num_input_tokens_seen": 1419232568, + "step": 10060 + }, + { + "epoch": 0.6137296877261681, + "grad_norm": 0.5044508576393127, + "learning_rate": 8e-05, + "loss": 1.3909, + "num_input_tokens_seen": 1420627952, + "step": 10070 + }, + { + "epoch": 0.6143391511697889, + "grad_norm": 0.48339608311653137, + "learning_rate": 8e-05, + "loss": 1.3168, + "num_input_tokens_seen": 1422007340, + "step": 10080 + }, + { + "epoch": 0.6149486146134098, + "grad_norm": 0.4976171553134918, + "learning_rate": 8e-05, + "loss": 1.3754, + "num_input_tokens_seen": 1423445552, + "step": 10090 + }, + { + "epoch": 0.6155580780570306, + "grad_norm": 0.49609145522117615, + "learning_rate": 8e-05, + "loss": 1.3285, + "num_input_tokens_seen": 1424851720, + "step": 10100 + }, + { + "epoch": 0.6161675415006513, + "grad_norm": 0.5570063591003418, + "learning_rate": 8e-05, + "loss": 1.4115, + "num_input_tokens_seen": 1426280464, + "step": 10110 + }, + { + "epoch": 0.6167770049442722, + "grad_norm": 0.511796236038208, + "learning_rate": 8e-05, + "loss": 1.391, + "num_input_tokens_seen": 1427688620, + "step": 10120 + }, + { + "epoch": 0.617386468387893, + "grad_norm": 0.5466093420982361, + "learning_rate": 8e-05, + "loss": 1.3736, + "num_input_tokens_seen": 1429118608, + "step": 10130 + }, + { + "epoch": 0.6179959318315138, + "grad_norm": 0.5146467685699463, + "learning_rate": 8e-05, + "loss": 1.4094, + "num_input_tokens_seen": 1430487804, + "step": 10140 + }, + { + "epoch": 0.6186053952751347, + "grad_norm": 0.5151812434196472, + "learning_rate": 8e-05, + "loss": 1.3828, + "num_input_tokens_seen": 1431876676, + "step": 10150 + }, + { + "epoch": 0.6192148587187555, + "grad_norm": 0.5586668848991394, + "learning_rate": 8e-05, + "loss": 1.3261, + "num_input_tokens_seen": 1433285472, + "step": 10160 + }, + { + "epoch": 0.6198243221623763, + "grad_norm": 0.5817645788192749, + "learning_rate": 8e-05, + "loss": 1.4316, + "num_input_tokens_seen": 1434693624, + "step": 10170 + }, + { + "epoch": 0.6204337856059972, + "grad_norm": 0.6884422302246094, + "learning_rate": 8e-05, + "loss": 1.4434, + "num_input_tokens_seen": 1436113752, + "step": 10180 + }, + { + "epoch": 0.6210432490496179, + "grad_norm": 0.5640652179718018, + "learning_rate": 8e-05, + "loss": 1.3756, + "num_input_tokens_seen": 1437519516, + "step": 10190 + }, + { + "epoch": 0.6216527124932387, + "grad_norm": 0.49332916736602783, + "learning_rate": 8e-05, + "loss": 1.3701, + "num_input_tokens_seen": 1438919604, + "step": 10200 + }, + { + "epoch": 0.6222621759368596, + "grad_norm": 0.5096072554588318, + "learning_rate": 8e-05, + "loss": 1.3413, + "num_input_tokens_seen": 1440347500, + "step": 10210 + }, + { + "epoch": 0.6228716393804804, + "grad_norm": 0.44264650344848633, + "learning_rate": 8e-05, + "loss": 1.4, + "num_input_tokens_seen": 1441772520, + "step": 10220 + }, + { + "epoch": 0.6234811028241012, + "grad_norm": 0.4569202661514282, + "learning_rate": 8e-05, + "loss": 1.3895, + "num_input_tokens_seen": 1443207380, + "step": 10230 + }, + { + "epoch": 0.6240905662677221, + "grad_norm": 0.5005388855934143, + "learning_rate": 8e-05, + "loss": 1.3637, + "num_input_tokens_seen": 1444575216, + "step": 10240 + }, + { + "epoch": 0.6247000297113429, + "grad_norm": 0.5004163384437561, + "learning_rate": 8e-05, + "loss": 1.4176, + "num_input_tokens_seen": 1445983064, + "step": 10250 + }, + { + "epoch": 0.6253094931549636, + "grad_norm": 0.5014557242393494, + "learning_rate": 8e-05, + "loss": 1.3762, + "num_input_tokens_seen": 1447412308, + "step": 10260 + }, + { + "epoch": 0.6259189565985845, + "grad_norm": 0.5607008337974548, + "learning_rate": 8e-05, + "loss": 1.3564, + "num_input_tokens_seen": 1448812476, + "step": 10270 + }, + { + "epoch": 0.6265284200422053, + "grad_norm": 0.5385088324546814, + "learning_rate": 8e-05, + "loss": 1.324, + "num_input_tokens_seen": 1450237216, + "step": 10280 + }, + { + "epoch": 0.6271378834858262, + "grad_norm": 0.5262163281440735, + "learning_rate": 8e-05, + "loss": 1.4133, + "num_input_tokens_seen": 1451658832, + "step": 10290 + }, + { + "epoch": 0.627747346929447, + "grad_norm": 0.7675474286079407, + "learning_rate": 8e-05, + "loss": 1.3457, + "num_input_tokens_seen": 1453044480, + "step": 10300 + }, + { + "epoch": 0.6283568103730678, + "grad_norm": 0.48304829001426697, + "learning_rate": 8e-05, + "loss": 1.3758, + "num_input_tokens_seen": 1454441984, + "step": 10310 + }, + { + "epoch": 0.6289662738166887, + "grad_norm": 0.49265825748443604, + "learning_rate": 8e-05, + "loss": 1.3955, + "num_input_tokens_seen": 1455862852, + "step": 10320 + }, + { + "epoch": 0.6295757372603095, + "grad_norm": 0.4803888499736786, + "learning_rate": 8e-05, + "loss": 1.3637, + "num_input_tokens_seen": 1457252176, + "step": 10330 + }, + { + "epoch": 0.6301852007039302, + "grad_norm": 0.4618512690067291, + "learning_rate": 8e-05, + "loss": 1.3878, + "num_input_tokens_seen": 1458690980, + "step": 10340 + }, + { + "epoch": 0.6307946641475511, + "grad_norm": 0.486068457365036, + "learning_rate": 8e-05, + "loss": 1.3597, + "num_input_tokens_seen": 1460126928, + "step": 10350 + }, + { + "epoch": 0.6314041275911719, + "grad_norm": 0.482714980840683, + "learning_rate": 8e-05, + "loss": 1.3995, + "num_input_tokens_seen": 1461522124, + "step": 10360 + }, + { + "epoch": 0.6320135910347927, + "grad_norm": 0.5192288756370544, + "learning_rate": 8e-05, + "loss": 1.4576, + "num_input_tokens_seen": 1462943012, + "step": 10370 + }, + { + "epoch": 0.6326230544784136, + "grad_norm": 0.4682476818561554, + "learning_rate": 8e-05, + "loss": 1.3796, + "num_input_tokens_seen": 1464333524, + "step": 10380 + }, + { + "epoch": 0.6332325179220344, + "grad_norm": 0.5397130250930786, + "learning_rate": 8e-05, + "loss": 1.3822, + "num_input_tokens_seen": 1465743076, + "step": 10390 + }, + { + "epoch": 0.6338419813656552, + "grad_norm": 0.5656686425209045, + "learning_rate": 8e-05, + "loss": 1.3642, + "num_input_tokens_seen": 1467142196, + "step": 10400 + }, + { + "epoch": 0.6344514448092761, + "grad_norm": 0.6183952689170837, + "learning_rate": 8e-05, + "loss": 1.3855, + "num_input_tokens_seen": 1468562740, + "step": 10410 + }, + { + "epoch": 0.6350609082528968, + "grad_norm": 0.4923710227012634, + "learning_rate": 8e-05, + "loss": 1.3049, + "num_input_tokens_seen": 1469960856, + "step": 10420 + }, + { + "epoch": 0.6356703716965176, + "grad_norm": 0.6067305207252502, + "learning_rate": 8e-05, + "loss": 1.3892, + "num_input_tokens_seen": 1471400408, + "step": 10430 + }, + { + "epoch": 0.6362798351401385, + "grad_norm": 0.5619065761566162, + "learning_rate": 8e-05, + "loss": 1.4251, + "num_input_tokens_seen": 1472802868, + "step": 10440 + }, + { + "epoch": 0.6368892985837593, + "grad_norm": 0.5561144351959229, + "learning_rate": 8e-05, + "loss": 1.4278, + "num_input_tokens_seen": 1474193064, + "step": 10450 + }, + { + "epoch": 0.6374987620273801, + "grad_norm": 0.5016257762908936, + "learning_rate": 8e-05, + "loss": 1.3927, + "num_input_tokens_seen": 1475612660, + "step": 10460 + }, + { + "epoch": 0.638108225471001, + "grad_norm": 0.4828028380870819, + "learning_rate": 8e-05, + "loss": 1.3506, + "num_input_tokens_seen": 1477059616, + "step": 10470 + }, + { + "epoch": 0.6387176889146218, + "grad_norm": 0.7187060713768005, + "learning_rate": 8e-05, + "loss": 1.3665, + "num_input_tokens_seen": 1478402568, + "step": 10480 + }, + { + "epoch": 0.6393271523582427, + "grad_norm": 0.6509134769439697, + "learning_rate": 8e-05, + "loss": 1.4104, + "num_input_tokens_seen": 1479782212, + "step": 10490 + }, + { + "epoch": 0.6399366158018635, + "grad_norm": 0.5177718997001648, + "learning_rate": 8e-05, + "loss": 1.2858, + "num_input_tokens_seen": 1481194272, + "step": 10500 + }, + { + "epoch": 0.6405460792454842, + "grad_norm": 0.542962908744812, + "learning_rate": 8e-05, + "loss": 1.4324, + "num_input_tokens_seen": 1482567368, + "step": 10510 + }, + { + "epoch": 0.6411555426891051, + "grad_norm": 0.5583025217056274, + "learning_rate": 8e-05, + "loss": 1.3911, + "num_input_tokens_seen": 1484014248, + "step": 10520 + }, + { + "epoch": 0.6417650061327259, + "grad_norm": 0.4803900718688965, + "learning_rate": 8e-05, + "loss": 1.4556, + "num_input_tokens_seen": 1485396060, + "step": 10530 + }, + { + "epoch": 0.6423744695763467, + "grad_norm": 0.4976584017276764, + "learning_rate": 8e-05, + "loss": 1.3407, + "num_input_tokens_seen": 1486803768, + "step": 10540 + }, + { + "epoch": 0.6429839330199676, + "grad_norm": 0.6051338911056519, + "learning_rate": 8e-05, + "loss": 1.4611, + "num_input_tokens_seen": 1488205756, + "step": 10550 + }, + { + "epoch": 0.6435933964635884, + "grad_norm": 0.5262444615364075, + "learning_rate": 8e-05, + "loss": 1.3961, + "num_input_tokens_seen": 1489615612, + "step": 10560 + }, + { + "epoch": 0.6442028599072092, + "grad_norm": 0.484791100025177, + "learning_rate": 8e-05, + "loss": 1.4121, + "num_input_tokens_seen": 1491011084, + "step": 10570 + }, + { + "epoch": 0.64481232335083, + "grad_norm": 0.5317909717559814, + "learning_rate": 8e-05, + "loss": 1.3567, + "num_input_tokens_seen": 1492429136, + "step": 10580 + }, + { + "epoch": 0.6454217867944508, + "grad_norm": 0.5404983162879944, + "learning_rate": 8e-05, + "loss": 1.3642, + "num_input_tokens_seen": 1493834740, + "step": 10590 + }, + { + "epoch": 0.6460312502380716, + "grad_norm": 0.538868248462677, + "learning_rate": 8e-05, + "loss": 1.3155, + "num_input_tokens_seen": 1495264904, + "step": 10600 + }, + { + "epoch": 0.6466407136816925, + "grad_norm": 0.5658362507820129, + "learning_rate": 8e-05, + "loss": 1.3228, + "num_input_tokens_seen": 1496659280, + "step": 10610 + }, + { + "epoch": 0.6472501771253133, + "grad_norm": 0.5738557577133179, + "learning_rate": 8e-05, + "loss": 1.3978, + "num_input_tokens_seen": 1498064828, + "step": 10620 + }, + { + "epoch": 0.6478596405689341, + "grad_norm": 0.5166726112365723, + "learning_rate": 8e-05, + "loss": 1.4355, + "num_input_tokens_seen": 1499487812, + "step": 10630 + }, + { + "epoch": 0.648469104012555, + "grad_norm": 0.4428934156894684, + "learning_rate": 8e-05, + "loss": 1.3573, + "num_input_tokens_seen": 1500892436, + "step": 10640 + }, + { + "epoch": 0.6490785674561758, + "grad_norm": 0.559181809425354, + "learning_rate": 8e-05, + "loss": 1.4064, + "num_input_tokens_seen": 1502337536, + "step": 10650 + }, + { + "epoch": 0.6496880308997965, + "grad_norm": 0.46578314900398254, + "learning_rate": 8e-05, + "loss": 1.3296, + "num_input_tokens_seen": 1503735980, + "step": 10660 + }, + { + "epoch": 0.6502974943434174, + "grad_norm": 0.5593947768211365, + "learning_rate": 8e-05, + "loss": 1.444, + "num_input_tokens_seen": 1505164384, + "step": 10670 + }, + { + "epoch": 0.6509069577870382, + "grad_norm": 2.141988754272461, + "learning_rate": 8e-05, + "loss": 1.3533, + "num_input_tokens_seen": 1506537700, + "step": 10680 + }, + { + "epoch": 0.651516421230659, + "grad_norm": 0.6614646315574646, + "learning_rate": 8e-05, + "loss": 1.4131, + "num_input_tokens_seen": 1507983352, + "step": 10690 + }, + { + "epoch": 0.6521258846742799, + "grad_norm": 0.49016857147216797, + "learning_rate": 8e-05, + "loss": 1.389, + "num_input_tokens_seen": 1509361728, + "step": 10700 + }, + { + "epoch": 0.6527353481179007, + "grad_norm": 0.5343895554542542, + "learning_rate": 8e-05, + "loss": 1.4211, + "num_input_tokens_seen": 1510712804, + "step": 10710 + }, + { + "epoch": 0.6533448115615216, + "grad_norm": 0.44679155945777893, + "learning_rate": 8e-05, + "loss": 1.3696, + "num_input_tokens_seen": 1512120068, + "step": 10720 + }, + { + "epoch": 0.6539542750051424, + "grad_norm": 0.5805819630622864, + "learning_rate": 8e-05, + "loss": 1.3976, + "num_input_tokens_seen": 1513546256, + "step": 10730 + }, + { + "epoch": 0.6545637384487631, + "grad_norm": 0.5583277940750122, + "learning_rate": 8e-05, + "loss": 1.3012, + "num_input_tokens_seen": 1514925448, + "step": 10740 + }, + { + "epoch": 0.655173201892384, + "grad_norm": 0.49840471148490906, + "learning_rate": 8e-05, + "loss": 1.462, + "num_input_tokens_seen": 1516367584, + "step": 10750 + }, + { + "epoch": 0.6557826653360048, + "grad_norm": 0.4973022937774658, + "learning_rate": 8e-05, + "loss": 1.3537, + "num_input_tokens_seen": 1517757988, + "step": 10760 + }, + { + "epoch": 0.6563921287796256, + "grad_norm": 0.5231379866600037, + "learning_rate": 8e-05, + "loss": 1.3669, + "num_input_tokens_seen": 1519164136, + "step": 10770 + }, + { + "epoch": 0.6570015922232465, + "grad_norm": 0.5220736861228943, + "learning_rate": 8e-05, + "loss": 1.3633, + "num_input_tokens_seen": 1520588564, + "step": 10780 + }, + { + "epoch": 0.6576110556668673, + "grad_norm": 0.5354141592979431, + "learning_rate": 8e-05, + "loss": 1.4131, + "num_input_tokens_seen": 1522010348, + "step": 10790 + }, + { + "epoch": 0.6582205191104881, + "grad_norm": 0.5864176154136658, + "learning_rate": 8e-05, + "loss": 1.3681, + "num_input_tokens_seen": 1523415004, + "step": 10800 + }, + { + "epoch": 0.658829982554109, + "grad_norm": 0.5721861720085144, + "learning_rate": 8e-05, + "loss": 1.308, + "num_input_tokens_seen": 1524815832, + "step": 10810 + }, + { + "epoch": 0.6594394459977297, + "grad_norm": 0.6083350777626038, + "learning_rate": 8e-05, + "loss": 1.3567, + "num_input_tokens_seen": 1526230472, + "step": 10820 + }, + { + "epoch": 0.6600489094413505, + "grad_norm": 0.46110281348228455, + "learning_rate": 8e-05, + "loss": 1.4186, + "num_input_tokens_seen": 1527644032, + "step": 10830 + }, + { + "epoch": 0.6606583728849714, + "grad_norm": 0.5084540843963623, + "learning_rate": 8e-05, + "loss": 1.32, + "num_input_tokens_seen": 1529057312, + "step": 10840 + }, + { + "epoch": 0.6612678363285922, + "grad_norm": 0.5809466242790222, + "learning_rate": 8e-05, + "loss": 1.3788, + "num_input_tokens_seen": 1530445328, + "step": 10850 + }, + { + "epoch": 0.661877299772213, + "grad_norm": 0.46439307928085327, + "learning_rate": 8e-05, + "loss": 1.3833, + "num_input_tokens_seen": 1531850316, + "step": 10860 + }, + { + "epoch": 0.6624867632158339, + "grad_norm": 0.5628945231437683, + "learning_rate": 8e-05, + "loss": 1.3472, + "num_input_tokens_seen": 1533252868, + "step": 10870 + }, + { + "epoch": 0.6630962266594547, + "grad_norm": 0.6179889440536499, + "learning_rate": 8e-05, + "loss": 1.4127, + "num_input_tokens_seen": 1534660880, + "step": 10880 + }, + { + "epoch": 0.6637056901030755, + "grad_norm": 0.5281222462654114, + "learning_rate": 8e-05, + "loss": 1.4029, + "num_input_tokens_seen": 1536068992, + "step": 10890 + }, + { + "epoch": 0.6643151535466963, + "grad_norm": 0.5171144008636475, + "learning_rate": 8e-05, + "loss": 1.4088, + "num_input_tokens_seen": 1537464796, + "step": 10900 + }, + { + "epoch": 0.6649246169903171, + "grad_norm": 0.529052197933197, + "learning_rate": 8e-05, + "loss": 1.3975, + "num_input_tokens_seen": 1538875160, + "step": 10910 + }, + { + "epoch": 0.665534080433938, + "grad_norm": 0.5157914757728577, + "learning_rate": 8e-05, + "loss": 1.3578, + "num_input_tokens_seen": 1540279084, + "step": 10920 + }, + { + "epoch": 0.6661435438775588, + "grad_norm": 0.5008856058120728, + "learning_rate": 8e-05, + "loss": 1.3453, + "num_input_tokens_seen": 1541714696, + "step": 10930 + }, + { + "epoch": 0.6667530073211796, + "grad_norm": 0.45337000489234924, + "learning_rate": 8e-05, + "loss": 1.4766, + "num_input_tokens_seen": 1543170740, + "step": 10940 + }, + { + "epoch": 0.6673624707648005, + "grad_norm": 0.5083340406417847, + "learning_rate": 8e-05, + "loss": 1.4436, + "num_input_tokens_seen": 1544592796, + "step": 10950 + }, + { + "epoch": 0.6679719342084213, + "grad_norm": 0.48253244161605835, + "learning_rate": 8e-05, + "loss": 1.3303, + "num_input_tokens_seen": 1545997384, + "step": 10960 + }, + { + "epoch": 0.668581397652042, + "grad_norm": 0.4723127484321594, + "learning_rate": 8e-05, + "loss": 1.3656, + "num_input_tokens_seen": 1547390872, + "step": 10970 + }, + { + "epoch": 0.669190861095663, + "grad_norm": 0.539252758026123, + "learning_rate": 8e-05, + "loss": 1.3333, + "num_input_tokens_seen": 1548803244, + "step": 10980 + }, + { + "epoch": 0.6698003245392837, + "grad_norm": 0.5402015447616577, + "learning_rate": 8e-05, + "loss": 1.434, + "num_input_tokens_seen": 1550175492, + "step": 10990 + }, + { + "epoch": 0.6704097879829045, + "grad_norm": 0.6111288070678711, + "learning_rate": 8e-05, + "loss": 1.3002, + "num_input_tokens_seen": 1551566972, + "step": 11000 + }, + { + "epoch": 0.6710192514265254, + "grad_norm": 0.5599400997161865, + "learning_rate": 8e-05, + "loss": 1.3756, + "num_input_tokens_seen": 1552990300, + "step": 11010 + }, + { + "epoch": 0.6716287148701462, + "grad_norm": 0.4863987863063812, + "learning_rate": 8e-05, + "loss": 1.3602, + "num_input_tokens_seen": 1554395040, + "step": 11020 + }, + { + "epoch": 0.672238178313767, + "grad_norm": 0.5732718706130981, + "learning_rate": 8e-05, + "loss": 1.3012, + "num_input_tokens_seen": 1555814488, + "step": 11030 + }, + { + "epoch": 0.6728476417573879, + "grad_norm": 0.48249438405036926, + "learning_rate": 8e-05, + "loss": 1.3947, + "num_input_tokens_seen": 1557230564, + "step": 11040 + }, + { + "epoch": 0.6734571052010087, + "grad_norm": 0.5319753289222717, + "learning_rate": 8e-05, + "loss": 1.3969, + "num_input_tokens_seen": 1558697588, + "step": 11050 + }, + { + "epoch": 0.6740665686446294, + "grad_norm": 0.4940206706523895, + "learning_rate": 8e-05, + "loss": 1.3153, + "num_input_tokens_seen": 1560096556, + "step": 11060 + }, + { + "epoch": 0.6746760320882503, + "grad_norm": 0.5515936017036438, + "learning_rate": 8e-05, + "loss": 1.3674, + "num_input_tokens_seen": 1561517816, + "step": 11070 + }, + { + "epoch": 0.6752854955318711, + "grad_norm": 0.4390547573566437, + "learning_rate": 8e-05, + "loss": 1.3617, + "num_input_tokens_seen": 1562934520, + "step": 11080 + }, + { + "epoch": 0.6758949589754919, + "grad_norm": 0.5946500897407532, + "learning_rate": 8e-05, + "loss": 1.4425, + "num_input_tokens_seen": 1564382372, + "step": 11090 + }, + { + "epoch": 0.6765044224191128, + "grad_norm": 0.46804702281951904, + "learning_rate": 8e-05, + "loss": 1.295, + "num_input_tokens_seen": 1565750404, + "step": 11100 + }, + { + "epoch": 0.6771138858627336, + "grad_norm": 0.5131279826164246, + "learning_rate": 8e-05, + "loss": 1.4145, + "num_input_tokens_seen": 1567168144, + "step": 11110 + }, + { + "epoch": 0.6777233493063545, + "grad_norm": 0.5205957293510437, + "learning_rate": 8e-05, + "loss": 1.3669, + "num_input_tokens_seen": 1568518060, + "step": 11120 + }, + { + "epoch": 0.6783328127499753, + "grad_norm": 0.4875277876853943, + "learning_rate": 8e-05, + "loss": 1.4563, + "num_input_tokens_seen": 1569937112, + "step": 11130 + }, + { + "epoch": 0.678942276193596, + "grad_norm": 0.5259339809417725, + "learning_rate": 8e-05, + "loss": 1.3498, + "num_input_tokens_seen": 1571385304, + "step": 11140 + }, + { + "epoch": 0.6795517396372169, + "grad_norm": 0.48126786947250366, + "learning_rate": 8e-05, + "loss": 1.3787, + "num_input_tokens_seen": 1572780332, + "step": 11150 + }, + { + "epoch": 0.6801612030808377, + "grad_norm": 0.45843496918678284, + "learning_rate": 8e-05, + "loss": 1.3826, + "num_input_tokens_seen": 1574176036, + "step": 11160 + }, + { + "epoch": 0.6807706665244585, + "grad_norm": 0.5293328762054443, + "learning_rate": 8e-05, + "loss": 1.3625, + "num_input_tokens_seen": 1575584344, + "step": 11170 + }, + { + "epoch": 0.6813801299680794, + "grad_norm": 0.4788746237754822, + "learning_rate": 8e-05, + "loss": 1.401, + "num_input_tokens_seen": 1577029088, + "step": 11180 + }, + { + "epoch": 0.6819895934117002, + "grad_norm": 0.5486621260643005, + "learning_rate": 8e-05, + "loss": 1.4309, + "num_input_tokens_seen": 1578437036, + "step": 11190 + }, + { + "epoch": 0.682599056855321, + "grad_norm": 0.5115844011306763, + "learning_rate": 8e-05, + "loss": 1.417, + "num_input_tokens_seen": 1579839792, + "step": 11200 + }, + { + "epoch": 0.6832085202989419, + "grad_norm": 0.46927279233932495, + "learning_rate": 8e-05, + "loss": 1.3891, + "num_input_tokens_seen": 1581258564, + "step": 11210 + }, + { + "epoch": 0.6838179837425626, + "grad_norm": 0.4486519694328308, + "learning_rate": 8e-05, + "loss": 1.3672, + "num_input_tokens_seen": 1582668792, + "step": 11220 + }, + { + "epoch": 0.6844274471861834, + "grad_norm": 0.5005569458007812, + "learning_rate": 8e-05, + "loss": 1.421, + "num_input_tokens_seen": 1584050876, + "step": 11230 + }, + { + "epoch": 0.6850369106298043, + "grad_norm": 0.605232834815979, + "learning_rate": 8e-05, + "loss": 1.3457, + "num_input_tokens_seen": 1585466308, + "step": 11240 + }, + { + "epoch": 0.6856463740734251, + "grad_norm": 0.5172209739685059, + "learning_rate": 8e-05, + "loss": 1.3576, + "num_input_tokens_seen": 1586888240, + "step": 11250 + }, + { + "epoch": 0.6862558375170459, + "grad_norm": 0.47500714659690857, + "learning_rate": 8e-05, + "loss": 1.2991, + "num_input_tokens_seen": 1588319356, + "step": 11260 + }, + { + "epoch": 0.6868653009606668, + "grad_norm": 0.48329707980155945, + "learning_rate": 8e-05, + "loss": 1.3285, + "num_input_tokens_seen": 1589717748, + "step": 11270 + }, + { + "epoch": 0.6874747644042876, + "grad_norm": 0.5334200859069824, + "learning_rate": 8e-05, + "loss": 1.3861, + "num_input_tokens_seen": 1591160404, + "step": 11280 + }, + { + "epoch": 0.6880842278479083, + "grad_norm": 0.5618347525596619, + "learning_rate": 8e-05, + "loss": 1.3787, + "num_input_tokens_seen": 1592539424, + "step": 11290 + }, + { + "epoch": 0.6886936912915292, + "grad_norm": 0.47227922081947327, + "learning_rate": 8e-05, + "loss": 1.3481, + "num_input_tokens_seen": 1593941668, + "step": 11300 + }, + { + "epoch": 0.68930315473515, + "grad_norm": 0.4863712191581726, + "learning_rate": 8e-05, + "loss": 1.3482, + "num_input_tokens_seen": 1595349516, + "step": 11310 + }, + { + "epoch": 0.6899126181787709, + "grad_norm": 0.4943729341030121, + "learning_rate": 8e-05, + "loss": 1.3168, + "num_input_tokens_seen": 1596768640, + "step": 11320 + }, + { + "epoch": 0.6905220816223917, + "grad_norm": 0.5097691416740417, + "learning_rate": 8e-05, + "loss": 1.3259, + "num_input_tokens_seen": 1598213372, + "step": 11330 + }, + { + "epoch": 0.6911315450660125, + "grad_norm": 0.5228952169418335, + "learning_rate": 8e-05, + "loss": 1.4481, + "num_input_tokens_seen": 1599627692, + "step": 11340 + }, + { + "epoch": 0.6917410085096334, + "grad_norm": 0.4985556900501251, + "learning_rate": 8e-05, + "loss": 1.3566, + "num_input_tokens_seen": 1601010164, + "step": 11350 + }, + { + "epoch": 0.6923504719532542, + "grad_norm": 0.4366033673286438, + "learning_rate": 8e-05, + "loss": 1.27, + "num_input_tokens_seen": 1602418184, + "step": 11360 + }, + { + "epoch": 0.692959935396875, + "grad_norm": 0.43515148758888245, + "learning_rate": 8e-05, + "loss": 1.3676, + "num_input_tokens_seen": 1603824332, + "step": 11370 + }, + { + "epoch": 0.6935693988404958, + "grad_norm": 0.5262019634246826, + "learning_rate": 8e-05, + "loss": 1.3643, + "num_input_tokens_seen": 1605236940, + "step": 11380 + }, + { + "epoch": 0.6941788622841166, + "grad_norm": 0.5392087697982788, + "learning_rate": 8e-05, + "loss": 1.4025, + "num_input_tokens_seen": 1606675864, + "step": 11390 + }, + { + "epoch": 0.6947883257277374, + "grad_norm": 0.6022461652755737, + "learning_rate": 8e-05, + "loss": 1.408, + "num_input_tokens_seen": 1608094576, + "step": 11400 + }, + { + "epoch": 0.6953977891713583, + "grad_norm": 0.6072622537612915, + "learning_rate": 8e-05, + "loss": 1.3024, + "num_input_tokens_seen": 1609523344, + "step": 11410 + }, + { + "epoch": 0.6960072526149791, + "grad_norm": 0.5051203370094299, + "learning_rate": 8e-05, + "loss": 1.3671, + "num_input_tokens_seen": 1610934128, + "step": 11420 + }, + { + "epoch": 0.6966167160585999, + "grad_norm": 0.5395517349243164, + "learning_rate": 8e-05, + "loss": 1.3802, + "num_input_tokens_seen": 1612402656, + "step": 11430 + }, + { + "epoch": 0.6972261795022208, + "grad_norm": 0.5100081562995911, + "learning_rate": 8e-05, + "loss": 1.4357, + "num_input_tokens_seen": 1613828080, + "step": 11440 + }, + { + "epoch": 0.6978356429458415, + "grad_norm": 0.5012816190719604, + "learning_rate": 8e-05, + "loss": 1.3875, + "num_input_tokens_seen": 1615257472, + "step": 11450 + }, + { + "epoch": 0.6984451063894623, + "grad_norm": 0.5558944344520569, + "learning_rate": 8e-05, + "loss": 1.3962, + "num_input_tokens_seen": 1616629904, + "step": 11460 + }, + { + "epoch": 0.6990545698330832, + "grad_norm": 0.5226157307624817, + "learning_rate": 8e-05, + "loss": 1.3542, + "num_input_tokens_seen": 1618030980, + "step": 11470 + }, + { + "epoch": 0.699664033276704, + "grad_norm": 0.5259307622909546, + "learning_rate": 8e-05, + "loss": 1.3689, + "num_input_tokens_seen": 1619450276, + "step": 11480 + }, + { + "epoch": 0.7002734967203248, + "grad_norm": 0.5086653232574463, + "learning_rate": 8e-05, + "loss": 1.3484, + "num_input_tokens_seen": 1620874576, + "step": 11490 + }, + { + "epoch": 0.7008829601639457, + "grad_norm": 0.5540332198143005, + "learning_rate": 8e-05, + "loss": 1.3293, + "num_input_tokens_seen": 1622283328, + "step": 11500 + }, + { + "epoch": 0.7014924236075665, + "grad_norm": 0.5071319341659546, + "learning_rate": 8e-05, + "loss": 1.3199, + "num_input_tokens_seen": 1623641908, + "step": 11510 + }, + { + "epoch": 0.7021018870511873, + "grad_norm": 0.5358554124832153, + "learning_rate": 8e-05, + "loss": 1.4036, + "num_input_tokens_seen": 1625041264, + "step": 11520 + }, + { + "epoch": 0.7027113504948082, + "grad_norm": 0.5506737232208252, + "learning_rate": 8e-05, + "loss": 1.34, + "num_input_tokens_seen": 1626404844, + "step": 11530 + }, + { + "epoch": 0.7033208139384289, + "grad_norm": 0.44404277205467224, + "learning_rate": 8e-05, + "loss": 1.34, + "num_input_tokens_seen": 1627863172, + "step": 11540 + }, + { + "epoch": 0.7039302773820498, + "grad_norm": 0.46990063786506653, + "learning_rate": 8e-05, + "loss": 1.3722, + "num_input_tokens_seen": 1629274832, + "step": 11550 + }, + { + "epoch": 0.7045397408256706, + "grad_norm": 0.4636783301830292, + "learning_rate": 8e-05, + "loss": 1.3307, + "num_input_tokens_seen": 1630697752, + "step": 11560 + }, + { + "epoch": 0.7051492042692914, + "grad_norm": 0.43071427941322327, + "learning_rate": 8e-05, + "loss": 1.3574, + "num_input_tokens_seen": 1632095920, + "step": 11570 + }, + { + "epoch": 0.7057586677129123, + "grad_norm": 0.5207687616348267, + "learning_rate": 8e-05, + "loss": 1.3084, + "num_input_tokens_seen": 1633509780, + "step": 11580 + }, + { + "epoch": 0.7063681311565331, + "grad_norm": 0.4385841190814972, + "learning_rate": 8e-05, + "loss": 1.3712, + "num_input_tokens_seen": 1634958136, + "step": 11590 + }, + { + "epoch": 0.7069775946001539, + "grad_norm": 0.5395679473876953, + "learning_rate": 8e-05, + "loss": 1.3802, + "num_input_tokens_seen": 1636385408, + "step": 11600 + }, + { + "epoch": 0.7075870580437748, + "grad_norm": 0.605144202709198, + "learning_rate": 8e-05, + "loss": 1.3814, + "num_input_tokens_seen": 1637775280, + "step": 11610 + }, + { + "epoch": 0.7081965214873955, + "grad_norm": 0.46157678961753845, + "learning_rate": 8e-05, + "loss": 1.378, + "num_input_tokens_seen": 1639212620, + "step": 11620 + }, + { + "epoch": 0.7088059849310163, + "grad_norm": 0.4572867751121521, + "learning_rate": 8e-05, + "loss": 1.3809, + "num_input_tokens_seen": 1640607368, + "step": 11630 + }, + { + "epoch": 0.7094154483746372, + "grad_norm": 0.5079594254493713, + "learning_rate": 8e-05, + "loss": 1.2521, + "num_input_tokens_seen": 1642027784, + "step": 11640 + }, + { + "epoch": 0.710024911818258, + "grad_norm": 0.7235115766525269, + "learning_rate": 8e-05, + "loss": 1.3339, + "num_input_tokens_seen": 1643408672, + "step": 11650 + }, + { + "epoch": 0.7106343752618788, + "grad_norm": 0.459494411945343, + "learning_rate": 8e-05, + "loss": 1.3244, + "num_input_tokens_seen": 1644816008, + "step": 11660 + }, + { + "epoch": 0.7112438387054997, + "grad_norm": 0.4589853584766388, + "learning_rate": 8e-05, + "loss": 1.3231, + "num_input_tokens_seen": 1646220640, + "step": 11670 + }, + { + "epoch": 0.7118533021491205, + "grad_norm": 0.496324360370636, + "learning_rate": 8e-05, + "loss": 1.3495, + "num_input_tokens_seen": 1647628668, + "step": 11680 + }, + { + "epoch": 0.7124627655927412, + "grad_norm": 0.5813272595405579, + "learning_rate": 8e-05, + "loss": 1.3994, + "num_input_tokens_seen": 1649031944, + "step": 11690 + }, + { + "epoch": 0.7130722290363621, + "grad_norm": 0.528102695941925, + "learning_rate": 8e-05, + "loss": 1.4662, + "num_input_tokens_seen": 1650445412, + "step": 11700 + }, + { + "epoch": 0.7136816924799829, + "grad_norm": 0.5562940835952759, + "learning_rate": 8e-05, + "loss": 1.3603, + "num_input_tokens_seen": 1651821844, + "step": 11710 + }, + { + "epoch": 0.7142911559236037, + "grad_norm": 0.4890764057636261, + "learning_rate": 8e-05, + "loss": 1.3721, + "num_input_tokens_seen": 1653199044, + "step": 11720 + }, + { + "epoch": 0.7149006193672246, + "grad_norm": 0.5230799913406372, + "learning_rate": 8e-05, + "loss": 1.3574, + "num_input_tokens_seen": 1654565212, + "step": 11730 + }, + { + "epoch": 0.7155100828108454, + "grad_norm": 0.5011894106864929, + "learning_rate": 8e-05, + "loss": 1.3551, + "num_input_tokens_seen": 1655952188, + "step": 11740 + }, + { + "epoch": 0.7161195462544663, + "grad_norm": 0.48514172434806824, + "learning_rate": 8e-05, + "loss": 1.3334, + "num_input_tokens_seen": 1657357812, + "step": 11750 + }, + { + "epoch": 0.7167290096980871, + "grad_norm": 0.5416278839111328, + "learning_rate": 8e-05, + "loss": 1.3142, + "num_input_tokens_seen": 1658753864, + "step": 11760 + }, + { + "epoch": 0.7173384731417078, + "grad_norm": 0.5038822889328003, + "learning_rate": 8e-05, + "loss": 1.3044, + "num_input_tokens_seen": 1660171056, + "step": 11770 + }, + { + "epoch": 0.7179479365853287, + "grad_norm": 0.6000845432281494, + "learning_rate": 8e-05, + "loss": 1.3487, + "num_input_tokens_seen": 1661578792, + "step": 11780 + }, + { + "epoch": 0.7185574000289495, + "grad_norm": 0.4599187672138214, + "learning_rate": 8e-05, + "loss": 1.3169, + "num_input_tokens_seen": 1662961256, + "step": 11790 + }, + { + "epoch": 0.7191668634725703, + "grad_norm": 0.5955362915992737, + "learning_rate": 8e-05, + "loss": 1.4204, + "num_input_tokens_seen": 1664363192, + "step": 11800 + }, + { + "epoch": 0.7197763269161912, + "grad_norm": 0.5447018146514893, + "learning_rate": 8e-05, + "loss": 1.3821, + "num_input_tokens_seen": 1665776148, + "step": 11810 + }, + { + "epoch": 0.720385790359812, + "grad_norm": 0.49061569571495056, + "learning_rate": 8e-05, + "loss": 1.3661, + "num_input_tokens_seen": 1667199220, + "step": 11820 + }, + { + "epoch": 0.7209952538034328, + "grad_norm": 0.5659445524215698, + "learning_rate": 8e-05, + "loss": 1.3536, + "num_input_tokens_seen": 1668633580, + "step": 11830 + }, + { + "epoch": 0.7216047172470537, + "grad_norm": 0.44457441568374634, + "learning_rate": 8e-05, + "loss": 1.3176, + "num_input_tokens_seen": 1670048000, + "step": 11840 + }, + { + "epoch": 0.7222141806906744, + "grad_norm": 0.5029119849205017, + "learning_rate": 8e-05, + "loss": 1.4107, + "num_input_tokens_seen": 1671464340, + "step": 11850 + }, + { + "epoch": 0.7228236441342952, + "grad_norm": 0.4228610694408417, + "learning_rate": 8e-05, + "loss": 1.4419, + "num_input_tokens_seen": 1672855328, + "step": 11860 + }, + { + "epoch": 0.7234331075779161, + "grad_norm": 0.5061122179031372, + "learning_rate": 8e-05, + "loss": 1.3356, + "num_input_tokens_seen": 1674291744, + "step": 11870 + }, + { + "epoch": 0.7240425710215369, + "grad_norm": 0.4675373136997223, + "learning_rate": 8e-05, + "loss": 1.4002, + "num_input_tokens_seen": 1675717044, + "step": 11880 + }, + { + "epoch": 0.7246520344651577, + "grad_norm": 0.40757638216018677, + "learning_rate": 8e-05, + "loss": 1.3313, + "num_input_tokens_seen": 1677101540, + "step": 11890 + }, + { + "epoch": 0.7252614979087786, + "grad_norm": 0.5157292485237122, + "learning_rate": 8e-05, + "loss": 1.4145, + "num_input_tokens_seen": 1678515036, + "step": 11900 + }, + { + "epoch": 0.7258709613523994, + "grad_norm": 0.6204097270965576, + "learning_rate": 8e-05, + "loss": 1.3039, + "num_input_tokens_seen": 1679921248, + "step": 11910 + }, + { + "epoch": 0.7264804247960202, + "grad_norm": 0.44126880168914795, + "learning_rate": 8e-05, + "loss": 1.3353, + "num_input_tokens_seen": 1681306344, + "step": 11920 + }, + { + "epoch": 0.727089888239641, + "grad_norm": 0.6292856931686401, + "learning_rate": 8e-05, + "loss": 1.3355, + "num_input_tokens_seen": 1682719392, + "step": 11930 + }, + { + "epoch": 0.7276993516832618, + "grad_norm": 0.4729043245315552, + "learning_rate": 8e-05, + "loss": 1.2812, + "num_input_tokens_seen": 1684071768, + "step": 11940 + }, + { + "epoch": 0.7283088151268827, + "grad_norm": 0.484418660402298, + "learning_rate": 8e-05, + "loss": 1.3574, + "num_input_tokens_seen": 1685486372, + "step": 11950 + }, + { + "epoch": 0.7289182785705035, + "grad_norm": 0.48124608397483826, + "learning_rate": 8e-05, + "loss": 1.3325, + "num_input_tokens_seen": 1686917640, + "step": 11960 + }, + { + "epoch": 0.7295277420141243, + "grad_norm": 0.5151872634887695, + "learning_rate": 8e-05, + "loss": 1.3391, + "num_input_tokens_seen": 1688341640, + "step": 11970 + }, + { + "epoch": 0.7301372054577452, + "grad_norm": 0.5516366958618164, + "learning_rate": 8e-05, + "loss": 1.3006, + "num_input_tokens_seen": 1689765020, + "step": 11980 + }, + { + "epoch": 0.730746668901366, + "grad_norm": 0.5208513140678406, + "learning_rate": 8e-05, + "loss": 1.3503, + "num_input_tokens_seen": 1691179912, + "step": 11990 + }, + { + "epoch": 0.7313561323449868, + "grad_norm": 0.5036225914955139, + "learning_rate": 8e-05, + "loss": 1.3422, + "num_input_tokens_seen": 1692541944, + "step": 12000 + }, + { + "epoch": 0.7319655957886076, + "grad_norm": 0.44400182366371155, + "learning_rate": 8e-05, + "loss": 1.3675, + "num_input_tokens_seen": 1693937504, + "step": 12010 + }, + { + "epoch": 0.7325750592322284, + "grad_norm": 0.4907507598400116, + "learning_rate": 8e-05, + "loss": 1.3248, + "num_input_tokens_seen": 1695314680, + "step": 12020 + }, + { + "epoch": 0.7331845226758492, + "grad_norm": 0.5090814828872681, + "learning_rate": 8e-05, + "loss": 1.3548, + "num_input_tokens_seen": 1696725392, + "step": 12030 + }, + { + "epoch": 0.7337939861194701, + "grad_norm": 0.5077126622200012, + "learning_rate": 8e-05, + "loss": 1.3663, + "num_input_tokens_seen": 1698124780, + "step": 12040 + }, + { + "epoch": 0.7344034495630909, + "grad_norm": 0.5142135620117188, + "learning_rate": 8e-05, + "loss": 1.3235, + "num_input_tokens_seen": 1699533240, + "step": 12050 + }, + { + "epoch": 0.7350129130067117, + "grad_norm": 0.4585704207420349, + "learning_rate": 8e-05, + "loss": 1.3834, + "num_input_tokens_seen": 1700921096, + "step": 12060 + }, + { + "epoch": 0.7356223764503326, + "grad_norm": 0.6136194467544556, + "learning_rate": 8e-05, + "loss": 1.3458, + "num_input_tokens_seen": 1702319564, + "step": 12070 + }, + { + "epoch": 0.7362318398939534, + "grad_norm": 0.5828477740287781, + "learning_rate": 8e-05, + "loss": 1.3506, + "num_input_tokens_seen": 1703697712, + "step": 12080 + }, + { + "epoch": 0.7368413033375741, + "grad_norm": 0.5217992663383484, + "learning_rate": 8e-05, + "loss": 1.3363, + "num_input_tokens_seen": 1705110788, + "step": 12090 + }, + { + "epoch": 0.737450766781195, + "grad_norm": 0.5703504681587219, + "learning_rate": 8e-05, + "loss": 1.3916, + "num_input_tokens_seen": 1706529936, + "step": 12100 + }, + { + "epoch": 0.7380602302248158, + "grad_norm": 0.49754780530929565, + "learning_rate": 8e-05, + "loss": 1.3259, + "num_input_tokens_seen": 1707949384, + "step": 12110 + }, + { + "epoch": 0.7386696936684366, + "grad_norm": 0.46623051166534424, + "learning_rate": 8e-05, + "loss": 1.3169, + "num_input_tokens_seen": 1709343816, + "step": 12120 + }, + { + "epoch": 0.7392791571120575, + "grad_norm": 0.4529217481613159, + "learning_rate": 8e-05, + "loss": 1.3445, + "num_input_tokens_seen": 1710727904, + "step": 12130 + }, + { + "epoch": 0.7398886205556783, + "grad_norm": 0.5647047758102417, + "learning_rate": 8e-05, + "loss": 1.359, + "num_input_tokens_seen": 1712104092, + "step": 12140 + }, + { + "epoch": 0.7404980839992991, + "grad_norm": 0.4240126609802246, + "learning_rate": 8e-05, + "loss": 1.3276, + "num_input_tokens_seen": 1713494984, + "step": 12150 + }, + { + "epoch": 0.74110754744292, + "grad_norm": 0.4749116003513336, + "learning_rate": 8e-05, + "loss": 1.3626, + "num_input_tokens_seen": 1714923848, + "step": 12160 + }, + { + "epoch": 0.7417170108865407, + "grad_norm": 0.4912160038948059, + "learning_rate": 8e-05, + "loss": 1.3342, + "num_input_tokens_seen": 1716289192, + "step": 12170 + }, + { + "epoch": 0.7423264743301616, + "grad_norm": 0.47284796833992004, + "learning_rate": 8e-05, + "loss": 1.4014, + "num_input_tokens_seen": 1717724568, + "step": 12180 + }, + { + "epoch": 0.7429359377737824, + "grad_norm": 0.520675539970398, + "learning_rate": 8e-05, + "loss": 1.3207, + "num_input_tokens_seen": 1719167576, + "step": 12190 + }, + { + "epoch": 0.7435454012174032, + "grad_norm": 0.48047661781311035, + "learning_rate": 8e-05, + "loss": 1.288, + "num_input_tokens_seen": 1720576612, + "step": 12200 + }, + { + "epoch": 0.7441548646610241, + "grad_norm": 0.49878841638565063, + "learning_rate": 8e-05, + "loss": 1.3228, + "num_input_tokens_seen": 1721972364, + "step": 12210 + }, + { + "epoch": 0.7447643281046449, + "grad_norm": 0.4979226589202881, + "learning_rate": 8e-05, + "loss": 1.2814, + "num_input_tokens_seen": 1723431204, + "step": 12220 + }, + { + "epoch": 0.7453737915482657, + "grad_norm": 0.5070583820343018, + "learning_rate": 8e-05, + "loss": 1.3283, + "num_input_tokens_seen": 1724849792, + "step": 12230 + }, + { + "epoch": 0.7459832549918866, + "grad_norm": 0.4756496250629425, + "learning_rate": 8e-05, + "loss": 1.2717, + "num_input_tokens_seen": 1726305880, + "step": 12240 + }, + { + "epoch": 0.7465927184355073, + "grad_norm": 0.6402299404144287, + "learning_rate": 8e-05, + "loss": 1.3142, + "num_input_tokens_seen": 1727716648, + "step": 12250 + }, + { + "epoch": 0.7472021818791281, + "grad_norm": 0.5206665396690369, + "learning_rate": 8e-05, + "loss": 1.2944, + "num_input_tokens_seen": 1729132756, + "step": 12260 + }, + { + "epoch": 0.747811645322749, + "grad_norm": 0.5346994996070862, + "learning_rate": 8e-05, + "loss": 1.3813, + "num_input_tokens_seen": 1730565924, + "step": 12270 + }, + { + "epoch": 0.7484211087663698, + "grad_norm": 0.6076343059539795, + "learning_rate": 8e-05, + "loss": 1.3387, + "num_input_tokens_seen": 1731972796, + "step": 12280 + }, + { + "epoch": 0.7490305722099906, + "grad_norm": 0.5422996878623962, + "learning_rate": 8e-05, + "loss": 1.3192, + "num_input_tokens_seen": 1733365796, + "step": 12290 + }, + { + "epoch": 0.7496400356536115, + "grad_norm": 0.5899409055709839, + "learning_rate": 8e-05, + "loss": 1.3258, + "num_input_tokens_seen": 1734719368, + "step": 12300 + }, + { + "epoch": 0.7502494990972323, + "grad_norm": 0.4870734214782715, + "learning_rate": 8e-05, + "loss": 1.3172, + "num_input_tokens_seen": 1736156068, + "step": 12310 + }, + { + "epoch": 0.750858962540853, + "grad_norm": 0.44740596413612366, + "learning_rate": 8e-05, + "loss": 1.3336, + "num_input_tokens_seen": 1737561880, + "step": 12320 + }, + { + "epoch": 0.7514684259844739, + "grad_norm": 0.5884395241737366, + "learning_rate": 8e-05, + "loss": 1.3935, + "num_input_tokens_seen": 1738988796, + "step": 12330 + }, + { + "epoch": 0.7520778894280947, + "grad_norm": 0.5457208156585693, + "learning_rate": 8e-05, + "loss": 1.338, + "num_input_tokens_seen": 1740413288, + "step": 12340 + }, + { + "epoch": 0.7526873528717155, + "grad_norm": 0.4471927881240845, + "learning_rate": 8e-05, + "loss": 1.3132, + "num_input_tokens_seen": 1741852120, + "step": 12350 + }, + { + "epoch": 0.7532968163153364, + "grad_norm": 0.5655962228775024, + "learning_rate": 8e-05, + "loss": 1.265, + "num_input_tokens_seen": 1743221408, + "step": 12360 + }, + { + "epoch": 0.7539062797589572, + "grad_norm": 0.5733407735824585, + "learning_rate": 8e-05, + "loss": 1.3406, + "num_input_tokens_seen": 1744609012, + "step": 12370 + }, + { + "epoch": 0.7545157432025781, + "grad_norm": 0.5545304417610168, + "learning_rate": 8e-05, + "loss": 1.411, + "num_input_tokens_seen": 1746007428, + "step": 12380 + }, + { + "epoch": 0.7551252066461989, + "grad_norm": 0.5492582321166992, + "learning_rate": 8e-05, + "loss": 1.3644, + "num_input_tokens_seen": 1747422652, + "step": 12390 + }, + { + "epoch": 0.7557346700898196, + "grad_norm": 0.5435560345649719, + "learning_rate": 8e-05, + "loss": 1.37, + "num_input_tokens_seen": 1748808948, + "step": 12400 + }, + { + "epoch": 0.7563441335334405, + "grad_norm": 0.5100287795066833, + "learning_rate": 8e-05, + "loss": 1.3401, + "num_input_tokens_seen": 1750211396, + "step": 12410 + }, + { + "epoch": 0.7569535969770613, + "grad_norm": 0.5885925889015198, + "learning_rate": 8e-05, + "loss": 1.2552, + "num_input_tokens_seen": 1751641492, + "step": 12420 + }, + { + "epoch": 0.7575630604206821, + "grad_norm": 0.5710425972938538, + "learning_rate": 8e-05, + "loss": 1.3211, + "num_input_tokens_seen": 1753033724, + "step": 12430 + }, + { + "epoch": 0.758172523864303, + "grad_norm": 0.4631437659263611, + "learning_rate": 8e-05, + "loss": 1.3819, + "num_input_tokens_seen": 1754426200, + "step": 12440 + }, + { + "epoch": 0.7587819873079238, + "grad_norm": 0.5151511430740356, + "learning_rate": 8e-05, + "loss": 1.2802, + "num_input_tokens_seen": 1755811048, + "step": 12450 + }, + { + "epoch": 0.7593914507515446, + "grad_norm": 0.5333474278450012, + "learning_rate": 8e-05, + "loss": 1.3127, + "num_input_tokens_seen": 1757194620, + "step": 12460 + }, + { + "epoch": 0.7600009141951655, + "grad_norm": 0.49914056062698364, + "learning_rate": 8e-05, + "loss": 1.3185, + "num_input_tokens_seen": 1758622592, + "step": 12470 + }, + { + "epoch": 0.7606103776387863, + "grad_norm": 0.41091620922088623, + "learning_rate": 8e-05, + "loss": 1.2696, + "num_input_tokens_seen": 1759999208, + "step": 12480 + }, + { + "epoch": 0.761219841082407, + "grad_norm": 0.5769081115722656, + "learning_rate": 8e-05, + "loss": 1.3431, + "num_input_tokens_seen": 1761407552, + "step": 12490 + }, + { + "epoch": 0.7618293045260279, + "grad_norm": 0.5206863880157471, + "learning_rate": 8e-05, + "loss": 1.3317, + "num_input_tokens_seen": 1762800588, + "step": 12500 + }, + { + "epoch": 0.7624387679696487, + "grad_norm": 0.5158757567405701, + "learning_rate": 8e-05, + "loss": 1.3526, + "num_input_tokens_seen": 1764187880, + "step": 12510 + }, + { + "epoch": 0.7630482314132695, + "grad_norm": 0.5738706588745117, + "learning_rate": 8e-05, + "loss": 1.3989, + "num_input_tokens_seen": 1765586032, + "step": 12520 + }, + { + "epoch": 0.7636576948568904, + "grad_norm": 0.6022568941116333, + "learning_rate": 8e-05, + "loss": 1.2905, + "num_input_tokens_seen": 1767057448, + "step": 12530 + }, + { + "epoch": 0.7642671583005112, + "grad_norm": 0.5038744211196899, + "learning_rate": 8e-05, + "loss": 1.3278, + "num_input_tokens_seen": 1768443564, + "step": 12540 + }, + { + "epoch": 0.764876621744132, + "grad_norm": 0.5244400501251221, + "learning_rate": 8e-05, + "loss": 1.3228, + "num_input_tokens_seen": 1769849304, + "step": 12550 + }, + { + "epoch": 0.7654860851877529, + "grad_norm": 0.5055208802223206, + "learning_rate": 8e-05, + "loss": 1.3335, + "num_input_tokens_seen": 1771248796, + "step": 12560 + }, + { + "epoch": 0.7660955486313736, + "grad_norm": 0.48623979091644287, + "learning_rate": 8e-05, + "loss": 1.3765, + "num_input_tokens_seen": 1772625404, + "step": 12570 + }, + { + "epoch": 0.7667050120749945, + "grad_norm": 0.5019470453262329, + "learning_rate": 8e-05, + "loss": 1.3249, + "num_input_tokens_seen": 1774037708, + "step": 12580 + }, + { + "epoch": 0.7673144755186153, + "grad_norm": 0.5067726373672485, + "learning_rate": 8e-05, + "loss": 1.3194, + "num_input_tokens_seen": 1775423152, + "step": 12590 + }, + { + "epoch": 0.7679239389622361, + "grad_norm": 0.4977276027202606, + "learning_rate": 8e-05, + "loss": 1.3447, + "num_input_tokens_seen": 1776840448, + "step": 12600 + }, + { + "epoch": 0.768533402405857, + "grad_norm": 0.5764220356941223, + "learning_rate": 8e-05, + "loss": 1.3149, + "num_input_tokens_seen": 1778260160, + "step": 12610 + }, + { + "epoch": 0.7691428658494778, + "grad_norm": 0.5561099648475647, + "learning_rate": 8e-05, + "loss": 1.416, + "num_input_tokens_seen": 1779627344, + "step": 12620 + }, + { + "epoch": 0.7697523292930986, + "grad_norm": 0.4993090331554413, + "learning_rate": 8e-05, + "loss": 1.4345, + "num_input_tokens_seen": 1781020624, + "step": 12630 + }, + { + "epoch": 0.7703617927367195, + "grad_norm": 0.537917971611023, + "learning_rate": 8e-05, + "loss": 1.2816, + "num_input_tokens_seen": 1782457444, + "step": 12640 + }, + { + "epoch": 0.7709712561803402, + "grad_norm": 0.5599529147148132, + "learning_rate": 8e-05, + "loss": 1.3272, + "num_input_tokens_seen": 1783910788, + "step": 12650 + }, + { + "epoch": 0.771580719623961, + "grad_norm": 0.523169755935669, + "learning_rate": 8e-05, + "loss": 1.3633, + "num_input_tokens_seen": 1785320612, + "step": 12660 + }, + { + "epoch": 0.7721901830675819, + "grad_norm": 0.5435786247253418, + "learning_rate": 8e-05, + "loss": 1.2906, + "num_input_tokens_seen": 1786698656, + "step": 12670 + }, + { + "epoch": 0.7727996465112027, + "grad_norm": 0.489886999130249, + "learning_rate": 8e-05, + "loss": 1.365, + "num_input_tokens_seen": 1788067732, + "step": 12680 + }, + { + "epoch": 0.7734091099548235, + "grad_norm": 0.43583112955093384, + "learning_rate": 8e-05, + "loss": 1.2961, + "num_input_tokens_seen": 1789457292, + "step": 12690 + }, + { + "epoch": 0.7740185733984444, + "grad_norm": 0.4963041841983795, + "learning_rate": 8e-05, + "loss": 1.3571, + "num_input_tokens_seen": 1790862480, + "step": 12700 + }, + { + "epoch": 0.7746280368420652, + "grad_norm": 0.5106602907180786, + "learning_rate": 8e-05, + "loss": 1.4132, + "num_input_tokens_seen": 1792265412, + "step": 12710 + }, + { + "epoch": 0.7752375002856859, + "grad_norm": 0.47308048605918884, + "learning_rate": 8e-05, + "loss": 1.2768, + "num_input_tokens_seen": 1793681304, + "step": 12720 + }, + { + "epoch": 0.7758469637293068, + "grad_norm": 0.5513538718223572, + "learning_rate": 8e-05, + "loss": 1.4489, + "num_input_tokens_seen": 1795090636, + "step": 12730 + }, + { + "epoch": 0.7764564271729276, + "grad_norm": 0.5152673125267029, + "learning_rate": 8e-05, + "loss": 1.3806, + "num_input_tokens_seen": 1796505112, + "step": 12740 + }, + { + "epoch": 0.7770658906165484, + "grad_norm": 0.5017542243003845, + "learning_rate": 8e-05, + "loss": 1.2905, + "num_input_tokens_seen": 1797913208, + "step": 12750 + }, + { + "epoch": 0.7776753540601693, + "grad_norm": 0.513664186000824, + "learning_rate": 8e-05, + "loss": 1.4124, + "num_input_tokens_seen": 1799342844, + "step": 12760 + }, + { + "epoch": 0.7782848175037901, + "grad_norm": 0.47624465823173523, + "learning_rate": 8e-05, + "loss": 1.3233, + "num_input_tokens_seen": 1800772036, + "step": 12770 + }, + { + "epoch": 0.778894280947411, + "grad_norm": 0.5275976061820984, + "learning_rate": 8e-05, + "loss": 1.3211, + "num_input_tokens_seen": 1802188180, + "step": 12780 + }, + { + "epoch": 0.7795037443910318, + "grad_norm": 0.5230554938316345, + "learning_rate": 8e-05, + "loss": 1.3797, + "num_input_tokens_seen": 1803600956, + "step": 12790 + }, + { + "epoch": 0.7801132078346525, + "grad_norm": 0.6262668371200562, + "learning_rate": 8e-05, + "loss": 1.285, + "num_input_tokens_seen": 1805012032, + "step": 12800 + }, + { + "epoch": 0.7807226712782734, + "grad_norm": 0.5023617148399353, + "learning_rate": 8e-05, + "loss": 1.3513, + "num_input_tokens_seen": 1806435284, + "step": 12810 + }, + { + "epoch": 0.7813321347218942, + "grad_norm": 0.7828114032745361, + "learning_rate": 8e-05, + "loss": 1.3308, + "num_input_tokens_seen": 1807822968, + "step": 12820 + }, + { + "epoch": 0.781941598165515, + "grad_norm": 0.4700995981693268, + "learning_rate": 8e-05, + "loss": 1.4061, + "num_input_tokens_seen": 1809275984, + "step": 12830 + }, + { + "epoch": 0.7825510616091359, + "grad_norm": 0.48993438482284546, + "learning_rate": 8e-05, + "loss": 1.2856, + "num_input_tokens_seen": 1810719656, + "step": 12840 + }, + { + "epoch": 0.7831605250527567, + "grad_norm": 0.5454090237617493, + "learning_rate": 8e-05, + "loss": 1.3035, + "num_input_tokens_seen": 1812148332, + "step": 12850 + }, + { + "epoch": 0.7837699884963775, + "grad_norm": 0.4457705318927765, + "learning_rate": 8e-05, + "loss": 1.3257, + "num_input_tokens_seen": 1813587512, + "step": 12860 + }, + { + "epoch": 0.7843794519399984, + "grad_norm": 0.5165936350822449, + "learning_rate": 8e-05, + "loss": 1.3082, + "num_input_tokens_seen": 1814998812, + "step": 12870 + }, + { + "epoch": 0.7849889153836191, + "grad_norm": 0.5486757755279541, + "learning_rate": 8e-05, + "loss": 1.3164, + "num_input_tokens_seen": 1816431448, + "step": 12880 + }, + { + "epoch": 0.7855983788272399, + "grad_norm": 0.5352789163589478, + "learning_rate": 8e-05, + "loss": 1.333, + "num_input_tokens_seen": 1817905740, + "step": 12890 + }, + { + "epoch": 0.7862078422708608, + "grad_norm": 0.5381478071212769, + "learning_rate": 8e-05, + "loss": 1.3422, + "num_input_tokens_seen": 1819312544, + "step": 12900 + }, + { + "epoch": 0.7868173057144816, + "grad_norm": 0.4825122058391571, + "learning_rate": 8e-05, + "loss": 1.3764, + "num_input_tokens_seen": 1820735580, + "step": 12910 + }, + { + "epoch": 0.7874267691581024, + "grad_norm": 0.49656012654304504, + "learning_rate": 8e-05, + "loss": 1.3513, + "num_input_tokens_seen": 1822160700, + "step": 12920 + }, + { + "epoch": 0.7880362326017233, + "grad_norm": 0.4445992112159729, + "learning_rate": 8e-05, + "loss": 1.2595, + "num_input_tokens_seen": 1823572968, + "step": 12930 + }, + { + "epoch": 0.7886456960453441, + "grad_norm": 0.5262532234191895, + "learning_rate": 8e-05, + "loss": 1.3655, + "num_input_tokens_seen": 1824957320, + "step": 12940 + }, + { + "epoch": 0.7892551594889649, + "grad_norm": 0.5006152987480164, + "learning_rate": 8e-05, + "loss": 1.3366, + "num_input_tokens_seen": 1826389520, + "step": 12950 + }, + { + "epoch": 0.7898646229325857, + "grad_norm": 0.5019990801811218, + "learning_rate": 8e-05, + "loss": 1.3276, + "num_input_tokens_seen": 1827851336, + "step": 12960 + }, + { + "epoch": 0.7904740863762065, + "grad_norm": 0.5868325233459473, + "learning_rate": 8e-05, + "loss": 1.2845, + "num_input_tokens_seen": 1829273684, + "step": 12970 + }, + { + "epoch": 0.7910835498198273, + "grad_norm": 0.5358790159225464, + "learning_rate": 8e-05, + "loss": 1.2664, + "num_input_tokens_seen": 1830681708, + "step": 12980 + }, + { + "epoch": 0.7916930132634482, + "grad_norm": 0.5080293416976929, + "learning_rate": 8e-05, + "loss": 1.3323, + "num_input_tokens_seen": 1832051744, + "step": 12990 + }, + { + "epoch": 0.792302476707069, + "grad_norm": 0.48632749915122986, + "learning_rate": 8e-05, + "loss": 1.3402, + "num_input_tokens_seen": 1833451224, + "step": 13000 + }, + { + "epoch": 0.7929119401506899, + "grad_norm": 0.5741437673568726, + "learning_rate": 8e-05, + "loss": 1.3162, + "num_input_tokens_seen": 1834868992, + "step": 13010 + }, + { + "epoch": 0.7935214035943107, + "grad_norm": 0.5048441886901855, + "learning_rate": 8e-05, + "loss": 1.4296, + "num_input_tokens_seen": 1836321756, + "step": 13020 + }, + { + "epoch": 0.7941308670379315, + "grad_norm": 0.5186116695404053, + "learning_rate": 8e-05, + "loss": 1.3066, + "num_input_tokens_seen": 1837704368, + "step": 13030 + }, + { + "epoch": 0.7947403304815523, + "grad_norm": 0.5407278537750244, + "learning_rate": 8e-05, + "loss": 1.3587, + "num_input_tokens_seen": 1839125744, + "step": 13040 + }, + { + "epoch": 0.7953497939251731, + "grad_norm": 0.5114259719848633, + "learning_rate": 8e-05, + "loss": 1.2733, + "num_input_tokens_seen": 1840565796, + "step": 13050 + }, + { + "epoch": 0.7959592573687939, + "grad_norm": 0.5528411269187927, + "learning_rate": 8e-05, + "loss": 1.2579, + "num_input_tokens_seen": 1841969076, + "step": 13060 + }, + { + "epoch": 0.7965687208124148, + "grad_norm": 0.4976850748062134, + "learning_rate": 8e-05, + "loss": 1.3238, + "num_input_tokens_seen": 1843405600, + "step": 13070 + }, + { + "epoch": 0.7971781842560356, + "grad_norm": 0.48887088894844055, + "learning_rate": 8e-05, + "loss": 1.322, + "num_input_tokens_seen": 1844856608, + "step": 13080 + }, + { + "epoch": 0.7977876476996564, + "grad_norm": 0.44426125288009644, + "learning_rate": 8e-05, + "loss": 1.2833, + "num_input_tokens_seen": 1846275428, + "step": 13090 + }, + { + "epoch": 0.7983971111432773, + "grad_norm": 0.5178174376487732, + "learning_rate": 8e-05, + "loss": 1.3407, + "num_input_tokens_seen": 1847671452, + "step": 13100 + }, + { + "epoch": 0.799006574586898, + "grad_norm": 0.445492148399353, + "learning_rate": 8e-05, + "loss": 1.3597, + "num_input_tokens_seen": 1849090208, + "step": 13110 + }, + { + "epoch": 0.7996160380305188, + "grad_norm": 0.547702968120575, + "learning_rate": 8e-05, + "loss": 1.3509, + "num_input_tokens_seen": 1850510744, + "step": 13120 + }, + { + "epoch": 0.8002255014741397, + "grad_norm": 0.531181275844574, + "learning_rate": 8e-05, + "loss": 1.2971, + "num_input_tokens_seen": 1851915908, + "step": 13130 + }, + { + "epoch": 0.8008349649177605, + "grad_norm": 0.5193353891372681, + "learning_rate": 8e-05, + "loss": 1.2707, + "num_input_tokens_seen": 1853305048, + "step": 13140 + }, + { + "epoch": 0.8014444283613813, + "grad_norm": 0.5197256207466125, + "learning_rate": 8e-05, + "loss": 1.3619, + "num_input_tokens_seen": 1854735100, + "step": 13150 + }, + { + "epoch": 0.8020538918050022, + "grad_norm": 0.5376043319702148, + "learning_rate": 8e-05, + "loss": 1.2955, + "num_input_tokens_seen": 1856147500, + "step": 13160 + }, + { + "epoch": 0.802663355248623, + "grad_norm": 0.45468467473983765, + "learning_rate": 8e-05, + "loss": 1.375, + "num_input_tokens_seen": 1857560596, + "step": 13170 + }, + { + "epoch": 0.8032728186922438, + "grad_norm": 0.49229443073272705, + "learning_rate": 8e-05, + "loss": 1.3398, + "num_input_tokens_seen": 1858960492, + "step": 13180 + }, + { + "epoch": 0.8038822821358647, + "grad_norm": 0.49927592277526855, + "learning_rate": 8e-05, + "loss": 1.2409, + "num_input_tokens_seen": 1860333604, + "step": 13190 + }, + { + "epoch": 0.8044917455794854, + "grad_norm": 0.533245861530304, + "learning_rate": 8e-05, + "loss": 1.4103, + "num_input_tokens_seen": 1861704208, + "step": 13200 + }, + { + "epoch": 0.8051012090231063, + "grad_norm": 0.4365921914577484, + "learning_rate": 8e-05, + "loss": 1.3687, + "num_input_tokens_seen": 1863099408, + "step": 13210 + }, + { + "epoch": 0.8057106724667271, + "grad_norm": 0.48126909136772156, + "learning_rate": 8e-05, + "loss": 1.4127, + "num_input_tokens_seen": 1864469736, + "step": 13220 + }, + { + "epoch": 0.8063201359103479, + "grad_norm": 0.5094852447509766, + "learning_rate": 8e-05, + "loss": 1.3602, + "num_input_tokens_seen": 1865862852, + "step": 13230 + }, + { + "epoch": 0.8069295993539688, + "grad_norm": 0.45473966002464294, + "learning_rate": 8e-05, + "loss": 1.3174, + "num_input_tokens_seen": 1867276496, + "step": 13240 + }, + { + "epoch": 0.8075390627975896, + "grad_norm": 0.465964674949646, + "learning_rate": 8e-05, + "loss": 1.3023, + "num_input_tokens_seen": 1868676544, + "step": 13250 + }, + { + "epoch": 0.8081485262412104, + "grad_norm": 0.4686562716960907, + "learning_rate": 8e-05, + "loss": 1.3148, + "num_input_tokens_seen": 1870089728, + "step": 13260 + }, + { + "epoch": 0.8087579896848313, + "grad_norm": 0.510066568851471, + "learning_rate": 8e-05, + "loss": 1.3571, + "num_input_tokens_seen": 1871505476, + "step": 13270 + }, + { + "epoch": 0.809367453128452, + "grad_norm": 0.5353869199752808, + "learning_rate": 8e-05, + "loss": 1.3631, + "num_input_tokens_seen": 1872945652, + "step": 13280 + }, + { + "epoch": 0.8099769165720728, + "grad_norm": 0.46690231561660767, + "learning_rate": 8e-05, + "loss": 1.4069, + "num_input_tokens_seen": 1874334148, + "step": 13290 + }, + { + "epoch": 0.8105863800156937, + "grad_norm": 0.46225330233573914, + "learning_rate": 8e-05, + "loss": 1.2693, + "num_input_tokens_seen": 1875742380, + "step": 13300 + }, + { + "epoch": 0.8111958434593145, + "grad_norm": 0.39511632919311523, + "learning_rate": 8e-05, + "loss": 1.4481, + "num_input_tokens_seen": 1877145116, + "step": 13310 + }, + { + "epoch": 0.8118053069029353, + "grad_norm": 0.53965163230896, + "learning_rate": 8e-05, + "loss": 1.3699, + "num_input_tokens_seen": 1878564336, + "step": 13320 + }, + { + "epoch": 0.8124147703465562, + "grad_norm": 0.47026219964027405, + "learning_rate": 8e-05, + "loss": 1.2953, + "num_input_tokens_seen": 1879957336, + "step": 13330 + }, + { + "epoch": 0.813024233790177, + "grad_norm": 0.45782703161239624, + "learning_rate": 8e-05, + "loss": 1.3148, + "num_input_tokens_seen": 1881375308, + "step": 13340 + }, + { + "epoch": 0.8136336972337977, + "grad_norm": 0.5887371301651001, + "learning_rate": 8e-05, + "loss": 1.3117, + "num_input_tokens_seen": 1882757276, + "step": 13350 + }, + { + "epoch": 0.8142431606774186, + "grad_norm": 0.46690821647644043, + "learning_rate": 8e-05, + "loss": 1.3411, + "num_input_tokens_seen": 1884181132, + "step": 13360 + }, + { + "epoch": 0.8148526241210394, + "grad_norm": 0.44570302963256836, + "learning_rate": 8e-05, + "loss": 1.3642, + "num_input_tokens_seen": 1885627092, + "step": 13370 + }, + { + "epoch": 0.8154620875646602, + "grad_norm": 0.46674537658691406, + "learning_rate": 8e-05, + "loss": 1.3329, + "num_input_tokens_seen": 1887042028, + "step": 13380 + }, + { + "epoch": 0.8160715510082811, + "grad_norm": 0.4996930956840515, + "learning_rate": 8e-05, + "loss": 1.3414, + "num_input_tokens_seen": 1888444844, + "step": 13390 + }, + { + "epoch": 0.8166810144519019, + "grad_norm": 0.5288018584251404, + "learning_rate": 8e-05, + "loss": 1.3824, + "num_input_tokens_seen": 1889802940, + "step": 13400 + }, + { + "epoch": 0.8172904778955228, + "grad_norm": 0.5384576320648193, + "learning_rate": 8e-05, + "loss": 1.2916, + "num_input_tokens_seen": 1891232692, + "step": 13410 + }, + { + "epoch": 0.8178999413391436, + "grad_norm": 0.5390682816505432, + "learning_rate": 8e-05, + "loss": 1.334, + "num_input_tokens_seen": 1892682584, + "step": 13420 + }, + { + "epoch": 0.8185094047827643, + "grad_norm": 0.4713698923587799, + "learning_rate": 8e-05, + "loss": 1.3697, + "num_input_tokens_seen": 1894098956, + "step": 13430 + }, + { + "epoch": 0.8191188682263852, + "grad_norm": 0.4790286421775818, + "learning_rate": 8e-05, + "loss": 1.3651, + "num_input_tokens_seen": 1895517364, + "step": 13440 + }, + { + "epoch": 0.819728331670006, + "grad_norm": 0.5080155730247498, + "learning_rate": 8e-05, + "loss": 1.3155, + "num_input_tokens_seen": 1896935040, + "step": 13450 + }, + { + "epoch": 0.8203377951136268, + "grad_norm": 0.4799495339393616, + "learning_rate": 8e-05, + "loss": 1.3478, + "num_input_tokens_seen": 1898327544, + "step": 13460 + }, + { + "epoch": 0.8209472585572477, + "grad_norm": 0.5344340205192566, + "learning_rate": 8e-05, + "loss": 1.3948, + "num_input_tokens_seen": 1899741048, + "step": 13470 + }, + { + "epoch": 0.8215567220008685, + "grad_norm": 0.5373334288597107, + "learning_rate": 8e-05, + "loss": 1.376, + "num_input_tokens_seen": 1901150512, + "step": 13480 + }, + { + "epoch": 0.8221661854444893, + "grad_norm": 0.511161208152771, + "learning_rate": 8e-05, + "loss": 1.333, + "num_input_tokens_seen": 1902523144, + "step": 13490 + }, + { + "epoch": 0.8227756488881102, + "grad_norm": 0.4798104763031006, + "learning_rate": 8e-05, + "loss": 1.4043, + "num_input_tokens_seen": 1903981052, + "step": 13500 + }, + { + "epoch": 0.823385112331731, + "grad_norm": 0.5256845355033875, + "learning_rate": 8e-05, + "loss": 1.2564, + "num_input_tokens_seen": 1905352460, + "step": 13510 + }, + { + "epoch": 0.8239945757753517, + "grad_norm": 0.43500033020973206, + "learning_rate": 8e-05, + "loss": 1.3265, + "num_input_tokens_seen": 1906776468, + "step": 13520 + }, + { + "epoch": 0.8246040392189726, + "grad_norm": 0.47271063923835754, + "learning_rate": 8e-05, + "loss": 1.3805, + "num_input_tokens_seen": 1908155040, + "step": 13530 + }, + { + "epoch": 0.8252135026625934, + "grad_norm": 0.5768705606460571, + "learning_rate": 8e-05, + "loss": 1.3352, + "num_input_tokens_seen": 1909572824, + "step": 13540 + }, + { + "epoch": 0.8258229661062142, + "grad_norm": 0.496417373418808, + "learning_rate": 8e-05, + "loss": 1.33, + "num_input_tokens_seen": 1911018176, + "step": 13550 + }, + { + "epoch": 0.8264324295498351, + "grad_norm": 0.4653494954109192, + "learning_rate": 8e-05, + "loss": 1.3169, + "num_input_tokens_seen": 1912427256, + "step": 13560 + }, + { + "epoch": 0.8270418929934559, + "grad_norm": 0.4730968475341797, + "learning_rate": 8e-05, + "loss": 1.3272, + "num_input_tokens_seen": 1913827684, + "step": 13570 + }, + { + "epoch": 0.8276513564370767, + "grad_norm": 0.4840553104877472, + "learning_rate": 8e-05, + "loss": 1.3281, + "num_input_tokens_seen": 1915245556, + "step": 13580 + }, + { + "epoch": 0.8282608198806976, + "grad_norm": 0.4903997480869293, + "learning_rate": 8e-05, + "loss": 1.297, + "num_input_tokens_seen": 1916646964, + "step": 13590 + }, + { + "epoch": 0.8288702833243183, + "grad_norm": 0.539023756980896, + "learning_rate": 8e-05, + "loss": 1.3459, + "num_input_tokens_seen": 1918123372, + "step": 13600 + }, + { + "epoch": 0.8294797467679392, + "grad_norm": 0.5554112195968628, + "learning_rate": 8e-05, + "loss": 1.296, + "num_input_tokens_seen": 1919534892, + "step": 13610 + }, + { + "epoch": 0.83008921021156, + "grad_norm": 0.4873456656932831, + "learning_rate": 8e-05, + "loss": 1.3225, + "num_input_tokens_seen": 1920943776, + "step": 13620 + }, + { + "epoch": 0.8306986736551808, + "grad_norm": 0.5194123387336731, + "learning_rate": 8e-05, + "loss": 1.3441, + "num_input_tokens_seen": 1922312644, + "step": 13630 + }, + { + "epoch": 0.8313081370988017, + "grad_norm": 0.476192444562912, + "learning_rate": 8e-05, + "loss": 1.3575, + "num_input_tokens_seen": 1923732804, + "step": 13640 + }, + { + "epoch": 0.8319176005424225, + "grad_norm": 0.540345311164856, + "learning_rate": 8e-05, + "loss": 1.3457, + "num_input_tokens_seen": 1925149948, + "step": 13650 + }, + { + "epoch": 0.8325270639860433, + "grad_norm": 0.5137238502502441, + "learning_rate": 8e-05, + "loss": 1.3691, + "num_input_tokens_seen": 1926573700, + "step": 13660 + }, + { + "epoch": 0.8331365274296642, + "grad_norm": 0.5790938138961792, + "learning_rate": 8e-05, + "loss": 1.3639, + "num_input_tokens_seen": 1927971992, + "step": 13670 + }, + { + "epoch": 0.8337459908732849, + "grad_norm": 0.509369432926178, + "learning_rate": 8e-05, + "loss": 1.2713, + "num_input_tokens_seen": 1929379708, + "step": 13680 + }, + { + "epoch": 0.8343554543169057, + "grad_norm": 0.523574948310852, + "learning_rate": 8e-05, + "loss": 1.3607, + "num_input_tokens_seen": 1930737992, + "step": 13690 + }, + { + "epoch": 0.8349649177605266, + "grad_norm": 0.5223262310028076, + "learning_rate": 8e-05, + "loss": 1.34, + "num_input_tokens_seen": 1932126924, + "step": 13700 + }, + { + "epoch": 0.8355743812041474, + "grad_norm": 0.5009987354278564, + "learning_rate": 8e-05, + "loss": 1.3189, + "num_input_tokens_seen": 1933541832, + "step": 13710 + }, + { + "epoch": 0.8361838446477682, + "grad_norm": 0.4302278459072113, + "learning_rate": 8e-05, + "loss": 1.2678, + "num_input_tokens_seen": 1934971524, + "step": 13720 + }, + { + "epoch": 0.8367933080913891, + "grad_norm": 0.5610336661338806, + "learning_rate": 8e-05, + "loss": 1.3011, + "num_input_tokens_seen": 1936364524, + "step": 13730 + }, + { + "epoch": 0.8374027715350099, + "grad_norm": 0.5906192660331726, + "learning_rate": 8e-05, + "loss": 1.3344, + "num_input_tokens_seen": 1937758596, + "step": 13740 + }, + { + "epoch": 0.8380122349786306, + "grad_norm": 0.5197456479072571, + "learning_rate": 8e-05, + "loss": 1.4107, + "num_input_tokens_seen": 1939192700, + "step": 13750 + }, + { + "epoch": 0.8386216984222515, + "grad_norm": 0.5945485830307007, + "learning_rate": 8e-05, + "loss": 1.31, + "num_input_tokens_seen": 1940593480, + "step": 13760 + }, + { + "epoch": 0.8392311618658723, + "grad_norm": 0.4964558184146881, + "learning_rate": 8e-05, + "loss": 1.3275, + "num_input_tokens_seen": 1941979224, + "step": 13770 + }, + { + "epoch": 0.8398406253094931, + "grad_norm": 0.540803074836731, + "learning_rate": 8e-05, + "loss": 1.3242, + "num_input_tokens_seen": 1943362060, + "step": 13780 + }, + { + "epoch": 0.840450088753114, + "grad_norm": 0.5920431017875671, + "learning_rate": 8e-05, + "loss": 1.2768, + "num_input_tokens_seen": 1944773336, + "step": 13790 + }, + { + "epoch": 0.8410595521967348, + "grad_norm": 0.5019993185997009, + "learning_rate": 8e-05, + "loss": 1.3502, + "num_input_tokens_seen": 1946151796, + "step": 13800 + }, + { + "epoch": 0.8416690156403556, + "grad_norm": 0.44807812571525574, + "learning_rate": 8e-05, + "loss": 1.2786, + "num_input_tokens_seen": 1947532968, + "step": 13810 + }, + { + "epoch": 0.8422784790839765, + "grad_norm": 0.4444776773452759, + "learning_rate": 8e-05, + "loss": 1.2909, + "num_input_tokens_seen": 1948944796, + "step": 13820 + }, + { + "epoch": 0.8428879425275972, + "grad_norm": 0.5267406702041626, + "learning_rate": 8e-05, + "loss": 1.4207, + "num_input_tokens_seen": 1950341252, + "step": 13830 + }, + { + "epoch": 0.8434974059712181, + "grad_norm": 0.5206403732299805, + "learning_rate": 8e-05, + "loss": 1.3525, + "num_input_tokens_seen": 1951752800, + "step": 13840 + }, + { + "epoch": 0.8441068694148389, + "grad_norm": 0.4875728487968445, + "learning_rate": 8e-05, + "loss": 1.2995, + "num_input_tokens_seen": 1953159488, + "step": 13850 + }, + { + "epoch": 0.8447163328584597, + "grad_norm": 0.49969223141670227, + "learning_rate": 8e-05, + "loss": 1.3496, + "num_input_tokens_seen": 1954546568, + "step": 13860 + }, + { + "epoch": 0.8453257963020806, + "grad_norm": 0.44394856691360474, + "learning_rate": 8e-05, + "loss": 1.3643, + "num_input_tokens_seen": 1955979156, + "step": 13870 + }, + { + "epoch": 0.8459352597457014, + "grad_norm": 0.4543069303035736, + "learning_rate": 8e-05, + "loss": 1.2678, + "num_input_tokens_seen": 1957369568, + "step": 13880 + }, + { + "epoch": 0.8465447231893222, + "grad_norm": 0.5392143726348877, + "learning_rate": 8e-05, + "loss": 1.2553, + "num_input_tokens_seen": 1958773732, + "step": 13890 + }, + { + "epoch": 0.8471541866329431, + "grad_norm": 0.48405200242996216, + "learning_rate": 8e-05, + "loss": 1.3331, + "num_input_tokens_seen": 1960203416, + "step": 13900 + }, + { + "epoch": 0.8477636500765638, + "grad_norm": 0.44527801871299744, + "learning_rate": 8e-05, + "loss": 1.3401, + "num_input_tokens_seen": 1961626884, + "step": 13910 + }, + { + "epoch": 0.8483731135201846, + "grad_norm": 0.5325603485107422, + "learning_rate": 8e-05, + "loss": 1.3624, + "num_input_tokens_seen": 1963044460, + "step": 13920 + }, + { + "epoch": 0.8489825769638055, + "grad_norm": 0.531408429145813, + "learning_rate": 8e-05, + "loss": 1.345, + "num_input_tokens_seen": 1964494136, + "step": 13930 + }, + { + "epoch": 0.8495920404074263, + "grad_norm": 0.49144402146339417, + "learning_rate": 8e-05, + "loss": 1.3735, + "num_input_tokens_seen": 1965888548, + "step": 13940 + }, + { + "epoch": 0.8502015038510471, + "grad_norm": 0.5098019242286682, + "learning_rate": 8e-05, + "loss": 1.3549, + "num_input_tokens_seen": 1967290036, + "step": 13950 + }, + { + "epoch": 0.850810967294668, + "grad_norm": 0.5794479250907898, + "learning_rate": 8e-05, + "loss": 1.3227, + "num_input_tokens_seen": 1968705020, + "step": 13960 + }, + { + "epoch": 0.8514204307382888, + "grad_norm": 0.4568016529083252, + "learning_rate": 8e-05, + "loss": 1.26, + "num_input_tokens_seen": 1970114156, + "step": 13970 + }, + { + "epoch": 0.8520298941819096, + "grad_norm": 0.5260335206985474, + "learning_rate": 8e-05, + "loss": 1.356, + "num_input_tokens_seen": 1971510264, + "step": 13980 + }, + { + "epoch": 0.8526393576255304, + "grad_norm": 0.45923447608947754, + "learning_rate": 8e-05, + "loss": 1.3072, + "num_input_tokens_seen": 1972916944, + "step": 13990 + }, + { + "epoch": 0.8532488210691512, + "grad_norm": 0.6013538837432861, + "learning_rate": 8e-05, + "loss": 1.3758, + "num_input_tokens_seen": 1974342624, + "step": 14000 + }, + { + "epoch": 0.853858284512772, + "grad_norm": 0.5479147434234619, + "learning_rate": 8e-05, + "loss": 1.2949, + "num_input_tokens_seen": 1975743100, + "step": 14010 + }, + { + "epoch": 0.8544677479563929, + "grad_norm": 0.47358494997024536, + "learning_rate": 8e-05, + "loss": 1.3132, + "num_input_tokens_seen": 1977148904, + "step": 14020 + }, + { + "epoch": 0.8550772114000137, + "grad_norm": 0.5430836081504822, + "learning_rate": 8e-05, + "loss": 1.3641, + "num_input_tokens_seen": 1978557708, + "step": 14030 + }, + { + "epoch": 0.8556866748436346, + "grad_norm": 0.5587384700775146, + "learning_rate": 8e-05, + "loss": 1.2717, + "num_input_tokens_seen": 1979948296, + "step": 14040 + }, + { + "epoch": 0.8562961382872554, + "grad_norm": 0.5155408978462219, + "learning_rate": 8e-05, + "loss": 1.2849, + "num_input_tokens_seen": 1981315776, + "step": 14050 + }, + { + "epoch": 0.8569056017308762, + "grad_norm": 0.4776671528816223, + "learning_rate": 8e-05, + "loss": 1.3477, + "num_input_tokens_seen": 1982710884, + "step": 14060 + }, + { + "epoch": 0.857515065174497, + "grad_norm": 0.5615227222442627, + "learning_rate": 8e-05, + "loss": 1.3669, + "num_input_tokens_seen": 1984128224, + "step": 14070 + }, + { + "epoch": 0.8581245286181178, + "grad_norm": 0.5377513766288757, + "learning_rate": 8e-05, + "loss": 1.2847, + "num_input_tokens_seen": 1985557920, + "step": 14080 + }, + { + "epoch": 0.8587339920617386, + "grad_norm": 0.4939589202404022, + "learning_rate": 8e-05, + "loss": 1.3306, + "num_input_tokens_seen": 1986944272, + "step": 14090 + }, + { + "epoch": 0.8593434555053595, + "grad_norm": 0.5589674115180969, + "learning_rate": 8e-05, + "loss": 1.2835, + "num_input_tokens_seen": 1988374288, + "step": 14100 + }, + { + "epoch": 0.8599529189489803, + "grad_norm": 0.5040762424468994, + "learning_rate": 8e-05, + "loss": 1.2794, + "num_input_tokens_seen": 1989791720, + "step": 14110 + }, + { + "epoch": 0.8605623823926011, + "grad_norm": 0.40734902024269104, + "learning_rate": 8e-05, + "loss": 1.2372, + "num_input_tokens_seen": 1991197028, + "step": 14120 + }, + { + "epoch": 0.861171845836222, + "grad_norm": 0.49047330021858215, + "learning_rate": 8e-05, + "loss": 1.3511, + "num_input_tokens_seen": 1992584332, + "step": 14130 + }, + { + "epoch": 0.8617813092798428, + "grad_norm": 0.5605955719947815, + "learning_rate": 8e-05, + "loss": 1.2739, + "num_input_tokens_seen": 1993990784, + "step": 14140 + }, + { + "epoch": 0.8623907727234635, + "grad_norm": 0.4485302269458771, + "learning_rate": 8e-05, + "loss": 1.3499, + "num_input_tokens_seen": 1995433372, + "step": 14150 + }, + { + "epoch": 0.8630002361670844, + "grad_norm": 0.5024114847183228, + "learning_rate": 8e-05, + "loss": 1.3728, + "num_input_tokens_seen": 1996822904, + "step": 14160 + }, + { + "epoch": 0.8636096996107052, + "grad_norm": 0.5688835978507996, + "learning_rate": 8e-05, + "loss": 1.3265, + "num_input_tokens_seen": 1998261028, + "step": 14170 + }, + { + "epoch": 0.864219163054326, + "grad_norm": 0.5042539834976196, + "learning_rate": 8e-05, + "loss": 1.3042, + "num_input_tokens_seen": 1999646144, + "step": 14180 + }, + { + "epoch": 0.8648286264979469, + "grad_norm": 0.4236133098602295, + "learning_rate": 8e-05, + "loss": 1.3615, + "num_input_tokens_seen": 2001077220, + "step": 14190 + }, + { + "epoch": 0.8654380899415677, + "grad_norm": 0.5273798108100891, + "learning_rate": 8e-05, + "loss": 1.3866, + "num_input_tokens_seen": 2002474020, + "step": 14200 + }, + { + "epoch": 0.8660475533851885, + "grad_norm": 0.5134631991386414, + "learning_rate": 8e-05, + "loss": 1.3165, + "num_input_tokens_seen": 2003842724, + "step": 14210 + }, + { + "epoch": 0.8666570168288094, + "grad_norm": 0.6235056519508362, + "learning_rate": 8e-05, + "loss": 1.3191, + "num_input_tokens_seen": 2005293516, + "step": 14220 + }, + { + "epoch": 0.8672664802724301, + "grad_norm": 0.6013771295547485, + "learning_rate": 8e-05, + "loss": 1.2849, + "num_input_tokens_seen": 2006708784, + "step": 14230 + }, + { + "epoch": 0.867875943716051, + "grad_norm": 0.5487123131752014, + "learning_rate": 8e-05, + "loss": 1.2884, + "num_input_tokens_seen": 2008066768, + "step": 14240 + }, + { + "epoch": 0.8684854071596718, + "grad_norm": 0.5143485069274902, + "learning_rate": 8e-05, + "loss": 1.3233, + "num_input_tokens_seen": 2009474988, + "step": 14250 + }, + { + "epoch": 0.8690948706032926, + "grad_norm": 0.6184853911399841, + "learning_rate": 8e-05, + "loss": 1.406, + "num_input_tokens_seen": 2010864984, + "step": 14260 + }, + { + "epoch": 0.8697043340469135, + "grad_norm": 0.43288683891296387, + "learning_rate": 8e-05, + "loss": 1.3188, + "num_input_tokens_seen": 2012273664, + "step": 14270 + }, + { + "epoch": 0.8703137974905343, + "grad_norm": 0.5103833079338074, + "learning_rate": 8e-05, + "loss": 1.3006, + "num_input_tokens_seen": 2013731756, + "step": 14280 + }, + { + "epoch": 0.8709232609341551, + "grad_norm": 0.46680596470832825, + "learning_rate": 8e-05, + "loss": 1.306, + "num_input_tokens_seen": 2015108620, + "step": 14290 + }, + { + "epoch": 0.871532724377776, + "grad_norm": 0.46874338388442993, + "learning_rate": 8e-05, + "loss": 1.2844, + "num_input_tokens_seen": 2016514404, + "step": 14300 + }, + { + "epoch": 0.8721421878213967, + "grad_norm": 0.4423303008079529, + "learning_rate": 8e-05, + "loss": 1.3593, + "num_input_tokens_seen": 2017955308, + "step": 14310 + }, + { + "epoch": 0.8727516512650175, + "grad_norm": 0.507398784160614, + "learning_rate": 8e-05, + "loss": 1.2901, + "num_input_tokens_seen": 2019400760, + "step": 14320 + }, + { + "epoch": 0.8733611147086384, + "grad_norm": 0.46312689781188965, + "learning_rate": 8e-05, + "loss": 1.2778, + "num_input_tokens_seen": 2020825292, + "step": 14330 + }, + { + "epoch": 0.8739705781522592, + "grad_norm": 0.45070067048072815, + "learning_rate": 8e-05, + "loss": 1.2942, + "num_input_tokens_seen": 2022226416, + "step": 14340 + }, + { + "epoch": 0.87458004159588, + "grad_norm": 0.458053857088089, + "learning_rate": 8e-05, + "loss": 1.3677, + "num_input_tokens_seen": 2023626112, + "step": 14350 + }, + { + "epoch": 0.8751895050395009, + "grad_norm": 0.47325587272644043, + "learning_rate": 8e-05, + "loss": 1.3597, + "num_input_tokens_seen": 2024990412, + "step": 14360 + }, + { + "epoch": 0.8757989684831217, + "grad_norm": 0.5431790947914124, + "learning_rate": 8e-05, + "loss": 1.3117, + "num_input_tokens_seen": 2026355792, + "step": 14370 + }, + { + "epoch": 0.8764084319267424, + "grad_norm": 0.45192548632621765, + "learning_rate": 8e-05, + "loss": 1.3317, + "num_input_tokens_seen": 2027764676, + "step": 14380 + }, + { + "epoch": 0.8770178953703633, + "grad_norm": 0.541114866733551, + "learning_rate": 8e-05, + "loss": 1.2806, + "num_input_tokens_seen": 2029153492, + "step": 14390 + }, + { + "epoch": 0.8776273588139841, + "grad_norm": 0.4860060513019562, + "learning_rate": 8e-05, + "loss": 1.4211, + "num_input_tokens_seen": 2030588204, + "step": 14400 + }, + { + "epoch": 0.8782368222576049, + "grad_norm": 0.4681771695613861, + "learning_rate": 8e-05, + "loss": 1.3697, + "num_input_tokens_seen": 2032011484, + "step": 14410 + }, + { + "epoch": 0.8788462857012258, + "grad_norm": 0.4607865810394287, + "learning_rate": 8e-05, + "loss": 1.3697, + "num_input_tokens_seen": 2033420856, + "step": 14420 + }, + { + "epoch": 0.8794557491448466, + "grad_norm": 0.6460906863212585, + "learning_rate": 8e-05, + "loss": 1.3781, + "num_input_tokens_seen": 2034852316, + "step": 14430 + }, + { + "epoch": 0.8800652125884675, + "grad_norm": 0.5076463222503662, + "learning_rate": 8e-05, + "loss": 1.3139, + "num_input_tokens_seen": 2036238376, + "step": 14440 + }, + { + "epoch": 0.8806746760320883, + "grad_norm": 0.4806708097457886, + "learning_rate": 8e-05, + "loss": 1.3171, + "num_input_tokens_seen": 2037656080, + "step": 14450 + }, + { + "epoch": 0.881284139475709, + "grad_norm": 0.6310375332832336, + "learning_rate": 8e-05, + "loss": 1.2628, + "num_input_tokens_seen": 2039101276, + "step": 14460 + }, + { + "epoch": 0.8818936029193299, + "grad_norm": 0.5229341387748718, + "learning_rate": 8e-05, + "loss": 1.368, + "num_input_tokens_seen": 2040493272, + "step": 14470 + }, + { + "epoch": 0.8825030663629507, + "grad_norm": 0.5170985460281372, + "learning_rate": 8e-05, + "loss": 1.3502, + "num_input_tokens_seen": 2041931884, + "step": 14480 + }, + { + "epoch": 0.8831125298065715, + "grad_norm": 0.5730608105659485, + "learning_rate": 8e-05, + "loss": 1.3101, + "num_input_tokens_seen": 2043339212, + "step": 14490 + }, + { + "epoch": 0.8837219932501924, + "grad_norm": 0.4748365879058838, + "learning_rate": 8e-05, + "loss": 1.3242, + "num_input_tokens_seen": 2044736972, + "step": 14500 + }, + { + "epoch": 0.8843314566938132, + "grad_norm": 0.4900893270969391, + "learning_rate": 8e-05, + "loss": 1.2727, + "num_input_tokens_seen": 2046111152, + "step": 14510 + }, + { + "epoch": 0.884940920137434, + "grad_norm": 0.5116603374481201, + "learning_rate": 8e-05, + "loss": 1.298, + "num_input_tokens_seen": 2047511888, + "step": 14520 + }, + { + "epoch": 0.8855503835810549, + "grad_norm": 0.4778316617012024, + "learning_rate": 8e-05, + "loss": 1.2757, + "num_input_tokens_seen": 2048942212, + "step": 14530 + }, + { + "epoch": 0.8861598470246757, + "grad_norm": 0.4536020755767822, + "learning_rate": 8e-05, + "loss": 1.3739, + "num_input_tokens_seen": 2050337400, + "step": 14540 + }, + { + "epoch": 0.8867693104682964, + "grad_norm": 0.4740245044231415, + "learning_rate": 8e-05, + "loss": 1.2835, + "num_input_tokens_seen": 2051768816, + "step": 14550 + }, + { + "epoch": 0.8873787739119173, + "grad_norm": 0.4206888675689697, + "learning_rate": 8e-05, + "loss": 1.3625, + "num_input_tokens_seen": 2053167920, + "step": 14560 + }, + { + "epoch": 0.8879882373555381, + "grad_norm": 0.5243297815322876, + "learning_rate": 8e-05, + "loss": 1.338, + "num_input_tokens_seen": 2054565312, + "step": 14570 + }, + { + "epoch": 0.8885977007991589, + "grad_norm": 0.4735357165336609, + "learning_rate": 8e-05, + "loss": 1.3339, + "num_input_tokens_seen": 2056014664, + "step": 14580 + }, + { + "epoch": 0.8892071642427798, + "grad_norm": 0.43609538674354553, + "learning_rate": 8e-05, + "loss": 1.318, + "num_input_tokens_seen": 2057450624, + "step": 14590 + }, + { + "epoch": 0.8898166276864006, + "grad_norm": 0.5434485673904419, + "learning_rate": 8e-05, + "loss": 1.2862, + "num_input_tokens_seen": 2058888776, + "step": 14600 + }, + { + "epoch": 0.8904260911300214, + "grad_norm": 0.516051709651947, + "learning_rate": 8e-05, + "loss": 1.2762, + "num_input_tokens_seen": 2060339988, + "step": 14610 + }, + { + "epoch": 0.8910355545736423, + "grad_norm": 0.4696006178855896, + "learning_rate": 8e-05, + "loss": 1.3784, + "num_input_tokens_seen": 2061765008, + "step": 14620 + }, + { + "epoch": 0.891645018017263, + "grad_norm": 0.5371540188789368, + "learning_rate": 8e-05, + "loss": 1.2606, + "num_input_tokens_seen": 2063161640, + "step": 14630 + }, + { + "epoch": 0.8922544814608838, + "grad_norm": 0.463405042886734, + "learning_rate": 8e-05, + "loss": 1.3241, + "num_input_tokens_seen": 2064557948, + "step": 14640 + }, + { + "epoch": 0.8928639449045047, + "grad_norm": 0.48568493127822876, + "learning_rate": 8e-05, + "loss": 1.3389, + "num_input_tokens_seen": 2065990172, + "step": 14650 + }, + { + "epoch": 0.8934734083481255, + "grad_norm": 0.5302020311355591, + "learning_rate": 8e-05, + "loss": 1.3282, + "num_input_tokens_seen": 2067415172, + "step": 14660 + }, + { + "epoch": 0.8940828717917464, + "grad_norm": 0.4722622036933899, + "learning_rate": 8e-05, + "loss": 1.3124, + "num_input_tokens_seen": 2068826280, + "step": 14670 + }, + { + "epoch": 0.8946923352353672, + "grad_norm": 0.5686400532722473, + "learning_rate": 8e-05, + "loss": 1.2964, + "num_input_tokens_seen": 2070285916, + "step": 14680 + }, + { + "epoch": 0.895301798678988, + "grad_norm": 0.5032939314842224, + "learning_rate": 8e-05, + "loss": 1.3056, + "num_input_tokens_seen": 2071687308, + "step": 14690 + }, + { + "epoch": 0.8959112621226089, + "grad_norm": 0.43025267124176025, + "learning_rate": 8e-05, + "loss": 1.2774, + "num_input_tokens_seen": 2073089308, + "step": 14700 + }, + { + "epoch": 0.8965207255662296, + "grad_norm": 0.5931614637374878, + "learning_rate": 8e-05, + "loss": 1.2295, + "num_input_tokens_seen": 2074487708, + "step": 14710 + }, + { + "epoch": 0.8971301890098504, + "grad_norm": 0.5101529359817505, + "learning_rate": 8e-05, + "loss": 1.3418, + "num_input_tokens_seen": 2075901036, + "step": 14720 + }, + { + "epoch": 0.8977396524534713, + "grad_norm": 0.46563971042633057, + "learning_rate": 8e-05, + "loss": 1.2945, + "num_input_tokens_seen": 2077316540, + "step": 14730 + }, + { + "epoch": 0.8983491158970921, + "grad_norm": 0.535887598991394, + "learning_rate": 8e-05, + "loss": 1.3194, + "num_input_tokens_seen": 2078729452, + "step": 14740 + }, + { + "epoch": 0.8989585793407129, + "grad_norm": 0.5296595096588135, + "learning_rate": 8e-05, + "loss": 1.3476, + "num_input_tokens_seen": 2080099128, + "step": 14750 + }, + { + "epoch": 0.8995680427843338, + "grad_norm": 0.5821929574012756, + "learning_rate": 8e-05, + "loss": 1.3395, + "num_input_tokens_seen": 2081541188, + "step": 14760 + }, + { + "epoch": 0.9001775062279546, + "grad_norm": 0.5599145293235779, + "learning_rate": 8e-05, + "loss": 1.4025, + "num_input_tokens_seen": 2082915756, + "step": 14770 + }, + { + "epoch": 0.9007869696715753, + "grad_norm": 0.548435389995575, + "learning_rate": 8e-05, + "loss": 1.2736, + "num_input_tokens_seen": 2084374868, + "step": 14780 + }, + { + "epoch": 0.9013964331151962, + "grad_norm": 0.5481672286987305, + "learning_rate": 8e-05, + "loss": 1.3607, + "num_input_tokens_seen": 2085774468, + "step": 14790 + }, + { + "epoch": 0.902005896558817, + "grad_norm": 0.5384652018547058, + "learning_rate": 8e-05, + "loss": 1.2555, + "num_input_tokens_seen": 2087183164, + "step": 14800 + }, + { + "epoch": 0.9026153600024378, + "grad_norm": 0.546398937702179, + "learning_rate": 8e-05, + "loss": 1.3217, + "num_input_tokens_seen": 2088598132, + "step": 14810 + }, + { + "epoch": 0.9032248234460587, + "grad_norm": 0.4481549561023712, + "learning_rate": 8e-05, + "loss": 1.2731, + "num_input_tokens_seen": 2089956332, + "step": 14820 + }, + { + "epoch": 0.9038342868896795, + "grad_norm": 0.4861372709274292, + "learning_rate": 8e-05, + "loss": 1.3456, + "num_input_tokens_seen": 2091342172, + "step": 14830 + }, + { + "epoch": 0.9044437503333003, + "grad_norm": 0.5424238443374634, + "learning_rate": 8e-05, + "loss": 1.3, + "num_input_tokens_seen": 2092764024, + "step": 14840 + }, + { + "epoch": 0.9050532137769212, + "grad_norm": 0.4749182462692261, + "learning_rate": 8e-05, + "loss": 1.2833, + "num_input_tokens_seen": 2094144480, + "step": 14850 + }, + { + "epoch": 0.905662677220542, + "grad_norm": 0.4868445098400116, + "learning_rate": 8e-05, + "loss": 1.3079, + "num_input_tokens_seen": 2095575144, + "step": 14860 + }, + { + "epoch": 0.9062721406641628, + "grad_norm": 0.48875224590301514, + "learning_rate": 8e-05, + "loss": 1.3415, + "num_input_tokens_seen": 2096968984, + "step": 14870 + }, + { + "epoch": 0.9068816041077836, + "grad_norm": 0.6021813750267029, + "learning_rate": 8e-05, + "loss": 1.3031, + "num_input_tokens_seen": 2098395432, + "step": 14880 + }, + { + "epoch": 0.9074910675514044, + "grad_norm": 0.5280013084411621, + "learning_rate": 8e-05, + "loss": 1.2774, + "num_input_tokens_seen": 2099787376, + "step": 14890 + }, + { + "epoch": 0.9081005309950253, + "grad_norm": 0.44756680727005005, + "learning_rate": 8e-05, + "loss": 1.415, + "num_input_tokens_seen": 2101178184, + "step": 14900 + }, + { + "epoch": 0.9087099944386461, + "grad_norm": 0.5173446536064148, + "learning_rate": 8e-05, + "loss": 1.2678, + "num_input_tokens_seen": 2102550216, + "step": 14910 + }, + { + "epoch": 0.9093194578822669, + "grad_norm": 0.5057575106620789, + "learning_rate": 8e-05, + "loss": 1.2572, + "num_input_tokens_seen": 2103991892, + "step": 14920 + }, + { + "epoch": 0.9099289213258878, + "grad_norm": 0.4518465995788574, + "learning_rate": 8e-05, + "loss": 1.2135, + "num_input_tokens_seen": 2105393196, + "step": 14930 + }, + { + "epoch": 0.9105383847695085, + "grad_norm": 0.5331540107727051, + "learning_rate": 8e-05, + "loss": 1.225, + "num_input_tokens_seen": 2106853740, + "step": 14940 + }, + { + "epoch": 0.9111478482131293, + "grad_norm": 0.567085862159729, + "learning_rate": 8e-05, + "loss": 1.3343, + "num_input_tokens_seen": 2108256916, + "step": 14950 + }, + { + "epoch": 0.9117573116567502, + "grad_norm": 0.4467783570289612, + "learning_rate": 8e-05, + "loss": 1.3246, + "num_input_tokens_seen": 2109676832, + "step": 14960 + }, + { + "epoch": 0.912366775100371, + "grad_norm": 0.4943903684616089, + "learning_rate": 8e-05, + "loss": 1.3044, + "num_input_tokens_seen": 2111041288, + "step": 14970 + }, + { + "epoch": 0.9129762385439918, + "grad_norm": 0.5459105372428894, + "learning_rate": 8e-05, + "loss": 1.3449, + "num_input_tokens_seen": 2112428456, + "step": 14980 + }, + { + "epoch": 0.9135857019876127, + "grad_norm": 0.6656093597412109, + "learning_rate": 8e-05, + "loss": 1.3407, + "num_input_tokens_seen": 2113839204, + "step": 14990 + }, + { + "epoch": 0.9141951654312335, + "grad_norm": 0.5155654549598694, + "learning_rate": 8e-05, + "loss": 1.3348, + "num_input_tokens_seen": 2115240548, + "step": 15000 + }, + { + "epoch": 0.9148046288748543, + "grad_norm": 0.5123623013496399, + "learning_rate": 8e-05, + "loss": 1.2716, + "num_input_tokens_seen": 2116649340, + "step": 15010 + }, + { + "epoch": 0.9154140923184751, + "grad_norm": 0.46583035588264465, + "learning_rate": 8e-05, + "loss": 1.2999, + "num_input_tokens_seen": 2118049608, + "step": 15020 + }, + { + "epoch": 0.9160235557620959, + "grad_norm": 0.5250749588012695, + "learning_rate": 8e-05, + "loss": 1.3072, + "num_input_tokens_seen": 2119496856, + "step": 15030 + }, + { + "epoch": 0.9166330192057167, + "grad_norm": 0.4794974625110626, + "learning_rate": 8e-05, + "loss": 1.3763, + "num_input_tokens_seen": 2120930156, + "step": 15040 + }, + { + "epoch": 0.9172424826493376, + "grad_norm": 0.4998897314071655, + "learning_rate": 8e-05, + "loss": 1.3722, + "num_input_tokens_seen": 2122320520, + "step": 15050 + }, + { + "epoch": 0.9178519460929584, + "grad_norm": 0.4267031252384186, + "learning_rate": 8e-05, + "loss": 1.3729, + "num_input_tokens_seen": 2123749292, + "step": 15060 + }, + { + "epoch": 0.9184614095365793, + "grad_norm": 0.5466004610061646, + "learning_rate": 8e-05, + "loss": 1.2821, + "num_input_tokens_seen": 2125150732, + "step": 15070 + }, + { + "epoch": 0.9190708729802001, + "grad_norm": 0.4843430519104004, + "learning_rate": 8e-05, + "loss": 1.3095, + "num_input_tokens_seen": 2126553680, + "step": 15080 + }, + { + "epoch": 0.9196803364238209, + "grad_norm": 0.4891015887260437, + "learning_rate": 8e-05, + "loss": 1.2743, + "num_input_tokens_seen": 2127954848, + "step": 15090 + }, + { + "epoch": 0.9202897998674417, + "grad_norm": 0.4860542416572571, + "learning_rate": 8e-05, + "loss": 1.2681, + "num_input_tokens_seen": 2129337336, + "step": 15100 + }, + { + "epoch": 0.9208992633110625, + "grad_norm": 0.5063233375549316, + "learning_rate": 8e-05, + "loss": 1.2805, + "num_input_tokens_seen": 2130776264, + "step": 15110 + }, + { + "epoch": 0.9215087267546833, + "grad_norm": 0.513878345489502, + "learning_rate": 8e-05, + "loss": 1.3024, + "num_input_tokens_seen": 2132183500, + "step": 15120 + }, + { + "epoch": 0.9221181901983042, + "grad_norm": 0.6026753783226013, + "learning_rate": 8e-05, + "loss": 1.3195, + "num_input_tokens_seen": 2133618296, + "step": 15130 + }, + { + "epoch": 0.922727653641925, + "grad_norm": 0.6205632090568542, + "learning_rate": 8e-05, + "loss": 1.2973, + "num_input_tokens_seen": 2135035140, + "step": 15140 + }, + { + "epoch": 0.9233371170855458, + "grad_norm": 0.5202974677085876, + "learning_rate": 8e-05, + "loss": 1.3003, + "num_input_tokens_seen": 2136445656, + "step": 15150 + }, + { + "epoch": 0.9239465805291667, + "grad_norm": 0.4942576587200165, + "learning_rate": 8e-05, + "loss": 1.3261, + "num_input_tokens_seen": 2137871872, + "step": 15160 + }, + { + "epoch": 0.9245560439727875, + "grad_norm": 0.4791765511035919, + "learning_rate": 8e-05, + "loss": 1.3159, + "num_input_tokens_seen": 2139281680, + "step": 15170 + }, + { + "epoch": 0.9251655074164082, + "grad_norm": 0.5186665654182434, + "learning_rate": 8e-05, + "loss": 1.2736, + "num_input_tokens_seen": 2140719180, + "step": 15180 + }, + { + "epoch": 0.9257749708600291, + "grad_norm": 0.5142242312431335, + "learning_rate": 8e-05, + "loss": 1.2888, + "num_input_tokens_seen": 2142122972, + "step": 15190 + }, + { + "epoch": 0.9263844343036499, + "grad_norm": 0.4749925136566162, + "learning_rate": 8e-05, + "loss": 1.3422, + "num_input_tokens_seen": 2143543944, + "step": 15200 + }, + { + "epoch": 0.9269938977472707, + "grad_norm": 0.4524226784706116, + "learning_rate": 8e-05, + "loss": 1.3178, + "num_input_tokens_seen": 2144948304, + "step": 15210 + }, + { + "epoch": 0.9276033611908916, + "grad_norm": 0.5263559222221375, + "learning_rate": 8e-05, + "loss": 1.2454, + "num_input_tokens_seen": 2146366520, + "step": 15220 + }, + { + "epoch": 0.9282128246345124, + "grad_norm": 0.571079432964325, + "learning_rate": 8e-05, + "loss": 1.3552, + "num_input_tokens_seen": 2147785372, + "step": 15230 + }, + { + "epoch": 0.9288222880781332, + "grad_norm": 0.5034895539283752, + "learning_rate": 8e-05, + "loss": 1.325, + "num_input_tokens_seen": 2149217276, + "step": 15240 + }, + { + "epoch": 0.9294317515217541, + "grad_norm": 0.45907917618751526, + "learning_rate": 8e-05, + "loss": 1.2877, + "num_input_tokens_seen": 2150613728, + "step": 15250 + }, + { + "epoch": 0.9300412149653748, + "grad_norm": 0.4312871992588043, + "learning_rate": 8e-05, + "loss": 1.3095, + "num_input_tokens_seen": 2151987964, + "step": 15260 + }, + { + "epoch": 0.9306506784089957, + "grad_norm": 0.5046421885490417, + "learning_rate": 8e-05, + "loss": 1.3279, + "num_input_tokens_seen": 2153424756, + "step": 15270 + }, + { + "epoch": 0.9312601418526165, + "grad_norm": 0.5745195150375366, + "learning_rate": 8e-05, + "loss": 1.2899, + "num_input_tokens_seen": 2154852656, + "step": 15280 + }, + { + "epoch": 0.9318696052962373, + "grad_norm": 0.4521108865737915, + "learning_rate": 8e-05, + "loss": 1.2494, + "num_input_tokens_seen": 2156242808, + "step": 15290 + }, + { + "epoch": 0.9324790687398582, + "grad_norm": 0.43681252002716064, + "learning_rate": 8e-05, + "loss": 1.2328, + "num_input_tokens_seen": 2157698388, + "step": 15300 + }, + { + "epoch": 0.933088532183479, + "grad_norm": 0.43972909450531006, + "learning_rate": 8e-05, + "loss": 1.3337, + "num_input_tokens_seen": 2159079792, + "step": 15310 + }, + { + "epoch": 0.9336979956270998, + "grad_norm": 0.4632760286331177, + "learning_rate": 8e-05, + "loss": 1.2879, + "num_input_tokens_seen": 2160506712, + "step": 15320 + }, + { + "epoch": 0.9343074590707207, + "grad_norm": 0.4858049154281616, + "learning_rate": 8e-05, + "loss": 1.3715, + "num_input_tokens_seen": 2161935332, + "step": 15330 + }, + { + "epoch": 0.9349169225143414, + "grad_norm": 0.502547025680542, + "learning_rate": 8e-05, + "loss": 1.2783, + "num_input_tokens_seen": 2163308116, + "step": 15340 + }, + { + "epoch": 0.9355263859579622, + "grad_norm": 0.5862429141998291, + "learning_rate": 8e-05, + "loss": 1.3081, + "num_input_tokens_seen": 2164680144, + "step": 15350 + }, + { + "epoch": 0.9361358494015831, + "grad_norm": 0.5465579032897949, + "learning_rate": 8e-05, + "loss": 1.3302, + "num_input_tokens_seen": 2166084000, + "step": 15360 + }, + { + "epoch": 0.9367453128452039, + "grad_norm": 0.4558219909667969, + "learning_rate": 8e-05, + "loss": 1.3208, + "num_input_tokens_seen": 2167539648, + "step": 15370 + }, + { + "epoch": 0.9373547762888247, + "grad_norm": 0.4179956614971161, + "learning_rate": 8e-05, + "loss": 1.2822, + "num_input_tokens_seen": 2168925220, + "step": 15380 + }, + { + "epoch": 0.9379642397324456, + "grad_norm": 0.5408328175544739, + "learning_rate": 8e-05, + "loss": 1.311, + "num_input_tokens_seen": 2170353320, + "step": 15390 + }, + { + "epoch": 0.9385737031760664, + "grad_norm": 0.45750945806503296, + "learning_rate": 8e-05, + "loss": 1.3182, + "num_input_tokens_seen": 2171767016, + "step": 15400 + }, + { + "epoch": 0.9391831666196871, + "grad_norm": 0.5293141007423401, + "learning_rate": 8e-05, + "loss": 1.3439, + "num_input_tokens_seen": 2173170916, + "step": 15410 + }, + { + "epoch": 0.939792630063308, + "grad_norm": 0.4048777222633362, + "learning_rate": 8e-05, + "loss": 1.2898, + "num_input_tokens_seen": 2174582080, + "step": 15420 + }, + { + "epoch": 0.9404020935069288, + "grad_norm": 0.5126138925552368, + "learning_rate": 8e-05, + "loss": 1.3273, + "num_input_tokens_seen": 2176029604, + "step": 15430 + }, + { + "epoch": 0.9410115569505496, + "grad_norm": 0.5170230865478516, + "learning_rate": 8e-05, + "loss": 1.2763, + "num_input_tokens_seen": 2177398412, + "step": 15440 + }, + { + "epoch": 0.9416210203941705, + "grad_norm": 0.47761601209640503, + "learning_rate": 8e-05, + "loss": 1.3317, + "num_input_tokens_seen": 2178777544, + "step": 15450 + }, + { + "epoch": 0.9422304838377913, + "grad_norm": 0.4851064682006836, + "learning_rate": 8e-05, + "loss": 1.2528, + "num_input_tokens_seen": 2180193028, + "step": 15460 + }, + { + "epoch": 0.9428399472814121, + "grad_norm": 0.445537805557251, + "learning_rate": 8e-05, + "loss": 1.3249, + "num_input_tokens_seen": 2181639340, + "step": 15470 + }, + { + "epoch": 0.943449410725033, + "grad_norm": 0.4982694983482361, + "learning_rate": 8e-05, + "loss": 1.257, + "num_input_tokens_seen": 2183099028, + "step": 15480 + }, + { + "epoch": 0.9440588741686537, + "grad_norm": 0.4679562747478485, + "learning_rate": 8e-05, + "loss": 1.2946, + "num_input_tokens_seen": 2184447260, + "step": 15490 + }, + { + "epoch": 0.9446683376122746, + "grad_norm": 0.48683637380599976, + "learning_rate": 8e-05, + "loss": 1.3197, + "num_input_tokens_seen": 2185818896, + "step": 15500 + }, + { + "epoch": 0.9452778010558954, + "grad_norm": 0.6176589131355286, + "learning_rate": 8e-05, + "loss": 1.3705, + "num_input_tokens_seen": 2187242616, + "step": 15510 + }, + { + "epoch": 0.9458872644995162, + "grad_norm": 0.3919903635978699, + "learning_rate": 8e-05, + "loss": 1.2758, + "num_input_tokens_seen": 2188635656, + "step": 15520 + }, + { + "epoch": 0.9464967279431371, + "grad_norm": 0.48860692977905273, + "learning_rate": 8e-05, + "loss": 1.2734, + "num_input_tokens_seen": 2190004480, + "step": 15530 + }, + { + "epoch": 0.9471061913867579, + "grad_norm": 0.547088623046875, + "learning_rate": 8e-05, + "loss": 1.2956, + "num_input_tokens_seen": 2191428232, + "step": 15540 + }, + { + "epoch": 0.9477156548303787, + "grad_norm": 0.46737053990364075, + "learning_rate": 8e-05, + "loss": 1.3075, + "num_input_tokens_seen": 2192866976, + "step": 15550 + }, + { + "epoch": 0.9483251182739996, + "grad_norm": 0.5125867128372192, + "learning_rate": 8e-05, + "loss": 1.3354, + "num_input_tokens_seen": 2194303344, + "step": 15560 + }, + { + "epoch": 0.9489345817176204, + "grad_norm": 0.45754584670066833, + "learning_rate": 8e-05, + "loss": 1.3479, + "num_input_tokens_seen": 2195684020, + "step": 15570 + }, + { + "epoch": 0.9495440451612411, + "grad_norm": 0.4521014094352722, + "learning_rate": 8e-05, + "loss": 1.3423, + "num_input_tokens_seen": 2197122540, + "step": 15580 + }, + { + "epoch": 0.950153508604862, + "grad_norm": 0.44286853075027466, + "learning_rate": 8e-05, + "loss": 1.3658, + "num_input_tokens_seen": 2198552600, + "step": 15590 + }, + { + "epoch": 0.9507629720484828, + "grad_norm": 0.489595502614975, + "learning_rate": 8e-05, + "loss": 1.3535, + "num_input_tokens_seen": 2200004540, + "step": 15600 + }, + { + "epoch": 0.9513724354921036, + "grad_norm": 0.5072488188743591, + "learning_rate": 8e-05, + "loss": 1.2492, + "num_input_tokens_seen": 2201446580, + "step": 15610 + }, + { + "epoch": 0.9519818989357245, + "grad_norm": 0.5579765439033508, + "learning_rate": 8e-05, + "loss": 1.3371, + "num_input_tokens_seen": 2202855424, + "step": 15620 + }, + { + "epoch": 0.9525913623793453, + "grad_norm": 0.49142131209373474, + "learning_rate": 8e-05, + "loss": 1.3132, + "num_input_tokens_seen": 2204239896, + "step": 15630 + }, + { + "epoch": 0.9532008258229661, + "grad_norm": 0.49964556097984314, + "learning_rate": 8e-05, + "loss": 1.3217, + "num_input_tokens_seen": 2205656712, + "step": 15640 + }, + { + "epoch": 0.953810289266587, + "grad_norm": 0.493123322725296, + "learning_rate": 8e-05, + "loss": 1.2907, + "num_input_tokens_seen": 2207060424, + "step": 15650 + }, + { + "epoch": 0.9544197527102077, + "grad_norm": 0.5087452530860901, + "learning_rate": 8e-05, + "loss": 1.3345, + "num_input_tokens_seen": 2208448188, + "step": 15660 + }, + { + "epoch": 0.9550292161538285, + "grad_norm": 0.5435436367988586, + "learning_rate": 8e-05, + "loss": 1.3228, + "num_input_tokens_seen": 2209862512, + "step": 15670 + }, + { + "epoch": 0.9556386795974494, + "grad_norm": 0.5179306268692017, + "learning_rate": 8e-05, + "loss": 1.3164, + "num_input_tokens_seen": 2211270724, + "step": 15680 + }, + { + "epoch": 0.9562481430410702, + "grad_norm": 0.4974176287651062, + "learning_rate": 8e-05, + "loss": 1.2702, + "num_input_tokens_seen": 2212669556, + "step": 15690 + }, + { + "epoch": 0.9568576064846911, + "grad_norm": 0.5859740972518921, + "learning_rate": 8e-05, + "loss": 1.2829, + "num_input_tokens_seen": 2214058020, + "step": 15700 + }, + { + "epoch": 0.9574670699283119, + "grad_norm": 0.5075061321258545, + "learning_rate": 8e-05, + "loss": 1.32, + "num_input_tokens_seen": 2215513552, + "step": 15710 + }, + { + "epoch": 0.9580765333719327, + "grad_norm": 0.5155842304229736, + "learning_rate": 8e-05, + "loss": 1.2568, + "num_input_tokens_seen": 2216879340, + "step": 15720 + }, + { + "epoch": 0.9586859968155536, + "grad_norm": 0.5818564295768738, + "learning_rate": 8e-05, + "loss": 1.3082, + "num_input_tokens_seen": 2218291808, + "step": 15730 + }, + { + "epoch": 0.9592954602591743, + "grad_norm": 0.534583568572998, + "learning_rate": 8e-05, + "loss": 1.3433, + "num_input_tokens_seen": 2219696636, + "step": 15740 + }, + { + "epoch": 0.9599049237027951, + "grad_norm": 0.5481460690498352, + "learning_rate": 8e-05, + "loss": 1.2681, + "num_input_tokens_seen": 2221124380, + "step": 15750 + }, + { + "epoch": 0.960514387146416, + "grad_norm": 0.4679185450077057, + "learning_rate": 8e-05, + "loss": 1.3098, + "num_input_tokens_seen": 2222547488, + "step": 15760 + }, + { + "epoch": 0.9611238505900368, + "grad_norm": 0.576784074306488, + "learning_rate": 8e-05, + "loss": 1.3268, + "num_input_tokens_seen": 2223963556, + "step": 15770 + }, + { + "epoch": 0.9617333140336576, + "grad_norm": 0.48329129815101624, + "learning_rate": 8e-05, + "loss": 1.2158, + "num_input_tokens_seen": 2225362452, + "step": 15780 + }, + { + "epoch": 0.9623427774772785, + "grad_norm": 0.5717592835426331, + "learning_rate": 8e-05, + "loss": 1.3256, + "num_input_tokens_seen": 2226820220, + "step": 15790 + }, + { + "epoch": 0.9629522409208993, + "grad_norm": 0.4110204875469208, + "learning_rate": 8e-05, + "loss": 1.3205, + "num_input_tokens_seen": 2228219256, + "step": 15800 + }, + { + "epoch": 0.96356170436452, + "grad_norm": 0.5517961382865906, + "learning_rate": 8e-05, + "loss": 1.3158, + "num_input_tokens_seen": 2229620876, + "step": 15810 + }, + { + "epoch": 0.9641711678081409, + "grad_norm": 0.6182602047920227, + "learning_rate": 8e-05, + "loss": 1.3214, + "num_input_tokens_seen": 2231002716, + "step": 15820 + }, + { + "epoch": 0.9647806312517617, + "grad_norm": 0.5628819465637207, + "learning_rate": 8e-05, + "loss": 1.3393, + "num_input_tokens_seen": 2232410556, + "step": 15830 + }, + { + "epoch": 0.9653900946953825, + "grad_norm": 0.4238695502281189, + "learning_rate": 8e-05, + "loss": 1.2732, + "num_input_tokens_seen": 2233815396, + "step": 15840 + }, + { + "epoch": 0.9659995581390034, + "grad_norm": 0.5082485675811768, + "learning_rate": 8e-05, + "loss": 1.3654, + "num_input_tokens_seen": 2235185328, + "step": 15850 + }, + { + "epoch": 0.9666090215826242, + "grad_norm": 0.550703227519989, + "learning_rate": 8e-05, + "loss": 1.3748, + "num_input_tokens_seen": 2236596760, + "step": 15860 + }, + { + "epoch": 0.967218485026245, + "grad_norm": 0.47868087887763977, + "learning_rate": 8e-05, + "loss": 1.3216, + "num_input_tokens_seen": 2238020480, + "step": 15870 + }, + { + "epoch": 0.9678279484698659, + "grad_norm": 0.42936983704566956, + "learning_rate": 8e-05, + "loss": 1.2932, + "num_input_tokens_seen": 2239422468, + "step": 15880 + }, + { + "epoch": 0.9684374119134866, + "grad_norm": 0.5375044941902161, + "learning_rate": 8e-05, + "loss": 1.2576, + "num_input_tokens_seen": 2240839504, + "step": 15890 + }, + { + "epoch": 0.9690468753571075, + "grad_norm": 0.44315773248672485, + "learning_rate": 8e-05, + "loss": 1.2574, + "num_input_tokens_seen": 2242293840, + "step": 15900 + }, + { + "epoch": 0.9696563388007283, + "grad_norm": 0.550308346748352, + "learning_rate": 8e-05, + "loss": 1.2451, + "num_input_tokens_seen": 2243656524, + "step": 15910 + }, + { + "epoch": 0.9702658022443491, + "grad_norm": 0.47337761521339417, + "learning_rate": 8e-05, + "loss": 1.3502, + "num_input_tokens_seen": 2245041628, + "step": 15920 + }, + { + "epoch": 0.97087526568797, + "grad_norm": 0.49625223875045776, + "learning_rate": 8e-05, + "loss": 1.2356, + "num_input_tokens_seen": 2246441132, + "step": 15930 + }, + { + "epoch": 0.9714847291315908, + "grad_norm": 0.5873345732688904, + "learning_rate": 8e-05, + "loss": 1.3175, + "num_input_tokens_seen": 2247890244, + "step": 15940 + }, + { + "epoch": 0.9720941925752116, + "grad_norm": 0.5385427474975586, + "learning_rate": 8e-05, + "loss": 1.3387, + "num_input_tokens_seen": 2249271220, + "step": 15950 + }, + { + "epoch": 0.9727036560188325, + "grad_norm": 0.6146961450576782, + "learning_rate": 8e-05, + "loss": 1.3295, + "num_input_tokens_seen": 2250717056, + "step": 15960 + }, + { + "epoch": 0.9733131194624532, + "grad_norm": 0.4284912347793579, + "learning_rate": 8e-05, + "loss": 1.3742, + "num_input_tokens_seen": 2252132980, + "step": 15970 + }, + { + "epoch": 0.973922582906074, + "grad_norm": 0.5247157216072083, + "learning_rate": 8e-05, + "loss": 1.391, + "num_input_tokens_seen": 2253553588, + "step": 15980 + }, + { + "epoch": 0.9745320463496949, + "grad_norm": 0.5365486741065979, + "learning_rate": 8e-05, + "loss": 1.3256, + "num_input_tokens_seen": 2254999908, + "step": 15990 + }, + { + "epoch": 0.9751415097933157, + "grad_norm": 0.5320517420768738, + "learning_rate": 8e-05, + "loss": 1.3291, + "num_input_tokens_seen": 2256440196, + "step": 16000 + }, + { + "epoch": 0.9757509732369365, + "grad_norm": 0.5396883487701416, + "learning_rate": 8e-05, + "loss": 1.2364, + "num_input_tokens_seen": 2257840344, + "step": 16010 + }, + { + "epoch": 0.9763604366805574, + "grad_norm": 0.5101752281188965, + "learning_rate": 8e-05, + "loss": 1.2918, + "num_input_tokens_seen": 2259258160, + "step": 16020 + }, + { + "epoch": 0.9769699001241782, + "grad_norm": 0.46001750230789185, + "learning_rate": 8e-05, + "loss": 1.3095, + "num_input_tokens_seen": 2260655960, + "step": 16030 + }, + { + "epoch": 0.977579363567799, + "grad_norm": 0.5177302360534668, + "learning_rate": 8e-05, + "loss": 1.2653, + "num_input_tokens_seen": 2262030540, + "step": 16040 + }, + { + "epoch": 0.9781888270114198, + "grad_norm": 0.5023877620697021, + "learning_rate": 8e-05, + "loss": 1.3001, + "num_input_tokens_seen": 2263437256, + "step": 16050 + }, + { + "epoch": 0.9787982904550406, + "grad_norm": 0.5150201320648193, + "learning_rate": 8e-05, + "loss": 1.2895, + "num_input_tokens_seen": 2264836292, + "step": 16060 + }, + { + "epoch": 0.9794077538986614, + "grad_norm": 0.5193420052528381, + "learning_rate": 8e-05, + "loss": 1.2073, + "num_input_tokens_seen": 2266236256, + "step": 16070 + }, + { + "epoch": 0.9800172173422823, + "grad_norm": 0.5201960802078247, + "learning_rate": 8e-05, + "loss": 1.3091, + "num_input_tokens_seen": 2267626184, + "step": 16080 + }, + { + "epoch": 0.9806266807859031, + "grad_norm": 0.5707736015319824, + "learning_rate": 8e-05, + "loss": 1.2807, + "num_input_tokens_seen": 2269065944, + "step": 16090 + }, + { + "epoch": 0.981236144229524, + "grad_norm": 0.5105067491531372, + "learning_rate": 8e-05, + "loss": 1.3169, + "num_input_tokens_seen": 2270432792, + "step": 16100 + }, + { + "epoch": 0.9818456076731448, + "grad_norm": 0.5395560264587402, + "learning_rate": 8e-05, + "loss": 1.3133, + "num_input_tokens_seen": 2271812640, + "step": 16110 + }, + { + "epoch": 0.9824550711167656, + "grad_norm": 0.43431755900382996, + "learning_rate": 8e-05, + "loss": 1.3047, + "num_input_tokens_seen": 2273222960, + "step": 16120 + }, + { + "epoch": 0.9830645345603864, + "grad_norm": 0.5516846179962158, + "learning_rate": 8e-05, + "loss": 1.2922, + "num_input_tokens_seen": 2274609328, + "step": 16130 + }, + { + "epoch": 0.9836739980040072, + "grad_norm": 0.46326059103012085, + "learning_rate": 8e-05, + "loss": 1.2601, + "num_input_tokens_seen": 2276053984, + "step": 16140 + }, + { + "epoch": 0.984283461447628, + "grad_norm": 0.5828720331192017, + "learning_rate": 8e-05, + "loss": 1.2963, + "num_input_tokens_seen": 2277490232, + "step": 16150 + }, + { + "epoch": 0.9848929248912489, + "grad_norm": 0.5485450029373169, + "learning_rate": 8e-05, + "loss": 1.2844, + "num_input_tokens_seen": 2278878972, + "step": 16160 + }, + { + "epoch": 0.9855023883348697, + "grad_norm": 0.4985129237174988, + "learning_rate": 8e-05, + "loss": 1.271, + "num_input_tokens_seen": 2280327752, + "step": 16170 + }, + { + "epoch": 0.9861118517784905, + "grad_norm": 0.47291767597198486, + "learning_rate": 8e-05, + "loss": 1.2697, + "num_input_tokens_seen": 2281761044, + "step": 16180 + }, + { + "epoch": 0.9867213152221114, + "grad_norm": 0.43932777643203735, + "learning_rate": 8e-05, + "loss": 1.283, + "num_input_tokens_seen": 2283171296, + "step": 16190 + }, + { + "epoch": 0.9873307786657322, + "grad_norm": 0.5212329030036926, + "learning_rate": 8e-05, + "loss": 1.2226, + "num_input_tokens_seen": 2284634396, + "step": 16200 + }, + { + "epoch": 0.9879402421093529, + "grad_norm": 0.468487024307251, + "learning_rate": 8e-05, + "loss": 1.4401, + "num_input_tokens_seen": 2286043496, + "step": 16210 + }, + { + "epoch": 0.9885497055529738, + "grad_norm": 0.6297067403793335, + "learning_rate": 8e-05, + "loss": 1.313, + "num_input_tokens_seen": 2287450120, + "step": 16220 + }, + { + "epoch": 0.9891591689965946, + "grad_norm": 0.44979387521743774, + "learning_rate": 8e-05, + "loss": 1.2554, + "num_input_tokens_seen": 2288841740, + "step": 16230 + }, + { + "epoch": 0.9897686324402154, + "grad_norm": 0.4660492241382599, + "learning_rate": 8e-05, + "loss": 1.2291, + "num_input_tokens_seen": 2290243272, + "step": 16240 + }, + { + "epoch": 0.9903780958838363, + "grad_norm": 0.4804689884185791, + "learning_rate": 8e-05, + "loss": 1.3183, + "num_input_tokens_seen": 2291631656, + "step": 16250 + }, + { + "epoch": 0.9909875593274571, + "grad_norm": 0.5002725124359131, + "learning_rate": 8e-05, + "loss": 1.2312, + "num_input_tokens_seen": 2293014940, + "step": 16260 + }, + { + "epoch": 0.9915970227710779, + "grad_norm": 0.49899348616600037, + "learning_rate": 8e-05, + "loss": 1.1467, + "num_input_tokens_seen": 2294401564, + "step": 16270 + }, + { + "epoch": 0.9922064862146988, + "grad_norm": 0.4643469750881195, + "learning_rate": 8e-05, + "loss": 1.3575, + "num_input_tokens_seen": 2295835048, + "step": 16280 + }, + { + "epoch": 0.9928159496583195, + "grad_norm": 0.5447330474853516, + "learning_rate": 8e-05, + "loss": 1.2759, + "num_input_tokens_seen": 2297276424, + "step": 16290 + }, + { + "epoch": 0.9934254131019403, + "grad_norm": 0.556800901889801, + "learning_rate": 8e-05, + "loss": 1.2833, + "num_input_tokens_seen": 2298712204, + "step": 16300 + }, + { + "epoch": 0.9940348765455612, + "grad_norm": 0.5475782752037048, + "learning_rate": 8e-05, + "loss": 1.346, + "num_input_tokens_seen": 2300152748, + "step": 16310 + }, + { + "epoch": 0.994644339989182, + "grad_norm": 0.47940051555633545, + "learning_rate": 8e-05, + "loss": 1.2262, + "num_input_tokens_seen": 2301556176, + "step": 16320 + }, + { + "epoch": 0.9952538034328029, + "grad_norm": 0.45238494873046875, + "learning_rate": 8e-05, + "loss": 1.3266, + "num_input_tokens_seen": 2303010552, + "step": 16330 + }, + { + "epoch": 0.9958632668764237, + "grad_norm": 0.5712950825691223, + "learning_rate": 8e-05, + "loss": 1.3051, + "num_input_tokens_seen": 2304415944, + "step": 16340 + }, + { + "epoch": 0.9964727303200445, + "grad_norm": 0.5340079069137573, + "learning_rate": 8e-05, + "loss": 1.2415, + "num_input_tokens_seen": 2305805212, + "step": 16350 + }, + { + "epoch": 0.9970821937636654, + "grad_norm": 0.5300690531730652, + "learning_rate": 8e-05, + "loss": 1.295, + "num_input_tokens_seen": 2307230964, + "step": 16360 + }, + { + "epoch": 0.9976916572072861, + "grad_norm": 0.5516910552978516, + "learning_rate": 8e-05, + "loss": 1.2448, + "num_input_tokens_seen": 2308634668, + "step": 16370 + }, + { + "epoch": 0.9983011206509069, + "grad_norm": 0.48180052638053894, + "learning_rate": 8e-05, + "loss": 1.2914, + "num_input_tokens_seen": 2310022092, + "step": 16380 + }, + { + "epoch": 0.9989105840945278, + "grad_norm": 0.48212724924087524, + "learning_rate": 8e-05, + "loss": 1.3808, + "num_input_tokens_seen": 2311466628, + "step": 16390 + }, + { + "epoch": 0.9995200475381486, + "grad_norm": 0.435754656791687, + "learning_rate": 8e-05, + "loss": 1.3024, + "num_input_tokens_seen": 2312932112, + "step": 16400 + }, + { + "epoch": 0.9999466719486831, + "num_input_tokens_seen": 2313940996, + "step": 16407, + "total_flos": 9.029436409798197e+18, + "train_loss": 0.7368571982491552, + "train_runtime": 130509.2663, + "train_samples_per_second": 32.185, + "train_steps_per_second": 0.126 + } + ], + "logging_steps": 10, + "max_steps": 16407, + "num_input_tokens_seen": 2313940996, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.029436409798197e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}