{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 93654, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003203280158882696, "grad_norm": 0.9955105781555176, "learning_rate": 5e-06, "loss": 1.2173, "num_input_tokens_seen": 819200, "step": 100 }, { "epoch": 0.006406560317765392, "grad_norm": 8.62949275970459, "learning_rate": 1e-05, "loss": 1.1953, "num_input_tokens_seen": 1638400, "step": 200 }, { "epoch": 0.009609840476648087, "grad_norm": 1.0293811559677124, "learning_rate": 1.5e-05, "loss": 1.1905, "num_input_tokens_seen": 2457600, "step": 300 }, { "epoch": 0.012813120635530783, "grad_norm": 6.295543193817139, "learning_rate": 2e-05, "loss": 1.1391, "num_input_tokens_seen": 3276800, "step": 400 }, { "epoch": 0.01601640079441348, "grad_norm": 3.0551528930664062, "learning_rate": 2.5e-05, "loss": 1.1383, "num_input_tokens_seen": 4096000, "step": 500 }, { "epoch": 0.019219680953296174, "grad_norm": 0.8111634850502014, "learning_rate": 3e-05, "loss": 1.1022, "num_input_tokens_seen": 4915200, "step": 600 }, { "epoch": 0.022422961112178872, "grad_norm": 0.77763432264328, "learning_rate": 3.5e-05, "loss": 1.0805, "num_input_tokens_seen": 5734400, "step": 700 }, { "epoch": 0.025626241271061567, "grad_norm": 1.9141496419906616, "learning_rate": 4e-05, "loss": 1.0755, "num_input_tokens_seen": 6553600, "step": 800 }, { "epoch": 0.028829521429944265, "grad_norm": 0.8061490058898926, "learning_rate": 4.5e-05, "loss": 1.0995, "num_input_tokens_seen": 7372800, "step": 900 }, { "epoch": 0.03203280158882696, "grad_norm": 0.6671661734580994, "learning_rate": 5e-05, "loss": 1.0835, "num_input_tokens_seen": 8192000, "step": 1000 }, { "epoch": 0.035236081747709654, "grad_norm": 2.4559221267700195, "learning_rate": 4.9999856291983216e-05, "loss": 1.0848, "num_input_tokens_seen": 9011200, "step": 1100 }, { "epoch": 0.03843936190659235, "grad_norm": 0.6218218803405762, "learning_rate": 4.9999425169585025e-05, "loss": 1.0621, "num_input_tokens_seen": 9830400, "step": 1200 }, { "epoch": 0.04164264206547505, "grad_norm": 1.1977851390838623, "learning_rate": 4.999870663776188e-05, "loss": 1.0774, "num_input_tokens_seen": 10649600, "step": 1300 }, { "epoch": 0.044845922224357744, "grad_norm": 0.581513524055481, "learning_rate": 4.99977007047745e-05, "loss": 1.0204, "num_input_tokens_seen": 11468800, "step": 1400 }, { "epoch": 0.04804920238324044, "grad_norm": 0.6710864901542664, "learning_rate": 4.999640738218772e-05, "loss": 1.0509, "num_input_tokens_seen": 12288000, "step": 1500 }, { "epoch": 0.05125248254212313, "grad_norm": 2.048499345779419, "learning_rate": 4.99948266848704e-05, "loss": 1.1401, "num_input_tokens_seen": 13107200, "step": 1600 }, { "epoch": 0.05445576270100583, "grad_norm": 0.6593829989433289, "learning_rate": 4.999295863099528e-05, "loss": 1.042, "num_input_tokens_seen": 13926400, "step": 1700 }, { "epoch": 0.05765904285988853, "grad_norm": 0.5166763663291931, "learning_rate": 4.999080324203867e-05, "loss": 1.1398, "num_input_tokens_seen": 14745600, "step": 1800 }, { "epoch": 0.060862323018771224, "grad_norm": 0.4539300203323364, "learning_rate": 4.9988360542780333e-05, "loss": 1.0759, "num_input_tokens_seen": 15564800, "step": 1900 }, { "epoch": 0.06406560317765392, "grad_norm": 0.7282894253730774, "learning_rate": 4.998563056130308e-05, "loss": 1.0988, "num_input_tokens_seen": 16384000, "step": 2000 }, { "epoch": 0.06726888333653662, "grad_norm": 0.6337546706199646, "learning_rate": 4.998261332899255e-05, "loss": 1.0642, "num_input_tokens_seen": 17203200, "step": 2100 }, { "epoch": 0.07047216349541931, "grad_norm": 0.6283242702484131, "learning_rate": 4.997930888053677e-05, "loss": 1.076, "num_input_tokens_seen": 18022400, "step": 2200 }, { "epoch": 0.07367544365430201, "grad_norm": 0.6066380739212036, "learning_rate": 4.99757172539258e-05, "loss": 1.0616, "num_input_tokens_seen": 18841600, "step": 2300 }, { "epoch": 0.0768787238131847, "grad_norm": 0.506839394569397, "learning_rate": 4.997183849045129e-05, "loss": 1.0691, "num_input_tokens_seen": 19660800, "step": 2400 }, { "epoch": 0.0800820039720674, "grad_norm": 0.6370711922645569, "learning_rate": 4.996767263470599e-05, "loss": 1.0463, "num_input_tokens_seen": 20480000, "step": 2500 }, { "epoch": 0.0832852841309501, "grad_norm": 2.0462234020233154, "learning_rate": 4.996321973458325e-05, "loss": 1.0703, "num_input_tokens_seen": 21299200, "step": 2600 }, { "epoch": 0.08648856428983279, "grad_norm": 0.6036199331283569, "learning_rate": 4.9958479841276446e-05, "loss": 1.0397, "num_input_tokens_seen": 22118400, "step": 2700 }, { "epoch": 0.08969184444871549, "grad_norm": 0.6303982138633728, "learning_rate": 4.995345300927845e-05, "loss": 1.0837, "num_input_tokens_seen": 22937600, "step": 2800 }, { "epoch": 0.09289512460759818, "grad_norm": 0.5572041869163513, "learning_rate": 4.994813929638096e-05, "loss": 1.0399, "num_input_tokens_seen": 23756800, "step": 2900 }, { "epoch": 0.09609840476648088, "grad_norm": 0.6958311200141907, "learning_rate": 4.9942538763673794e-05, "loss": 1.0634, "num_input_tokens_seen": 24576000, "step": 3000 }, { "epoch": 0.09930168492536358, "grad_norm": 0.583613395690918, "learning_rate": 4.993665147554429e-05, "loss": 1.0472, "num_input_tokens_seen": 25395200, "step": 3100 }, { "epoch": 0.10250496508424627, "grad_norm": 0.5093560814857483, "learning_rate": 4.9930477499676495e-05, "loss": 1.0774, "num_input_tokens_seen": 26214400, "step": 3200 }, { "epoch": 0.10570824524312897, "grad_norm": 1.930864691734314, "learning_rate": 4.992401690705038e-05, "loss": 1.0402, "num_input_tokens_seen": 27033600, "step": 3300 }, { "epoch": 0.10891152540201166, "grad_norm": 0.6102778911590576, "learning_rate": 4.9917269771941056e-05, "loss": 1.0353, "num_input_tokens_seen": 27852800, "step": 3400 }, { "epoch": 0.11211480556089436, "grad_norm": 0.5592427849769592, "learning_rate": 4.991023617191792e-05, "loss": 1.0776, "num_input_tokens_seen": 28672000, "step": 3500 }, { "epoch": 0.11531808571977706, "grad_norm": 0.6671651005744934, "learning_rate": 4.990291618784377e-05, "loss": 1.1083, "num_input_tokens_seen": 29491200, "step": 3600 }, { "epoch": 0.11852136587865975, "grad_norm": 1.4246577024459839, "learning_rate": 4.989530990387381e-05, "loss": 1.0262, "num_input_tokens_seen": 30310400, "step": 3700 }, { "epoch": 0.12172464603754245, "grad_norm": 2.4318628311157227, "learning_rate": 4.988741740745477e-05, "loss": 1.0441, "num_input_tokens_seen": 31129600, "step": 3800 }, { "epoch": 0.12492792619642513, "grad_norm": 2.1933786869049072, "learning_rate": 4.987923878932386e-05, "loss": 1.0375, "num_input_tokens_seen": 31948800, "step": 3900 }, { "epoch": 0.12813120635530784, "grad_norm": 0.5265761017799377, "learning_rate": 4.9870774143507696e-05, "loss": 1.0041, "num_input_tokens_seen": 32768000, "step": 4000 }, { "epoch": 0.13133448651419052, "grad_norm": 0.6378248929977417, "learning_rate": 4.98620235673213e-05, "loss": 1.0798, "num_input_tokens_seen": 33587200, "step": 4100 }, { "epoch": 0.13453776667307324, "grad_norm": 0.5426807999610901, "learning_rate": 4.9852987161366895e-05, "loss": 1.1014, "num_input_tokens_seen": 34406400, "step": 4200 }, { "epoch": 0.13774104683195593, "grad_norm": 0.587978720664978, "learning_rate": 4.9843665029532796e-05, "loss": 1.0321, "num_input_tokens_seen": 35225600, "step": 4300 }, { "epoch": 0.14094432699083861, "grad_norm": 0.8025338649749756, "learning_rate": 4.983405727899221e-05, "loss": 0.9954, "num_input_tokens_seen": 36044800, "step": 4400 }, { "epoch": 0.1441476071497213, "grad_norm": 0.5788518786430359, "learning_rate": 4.982416402020201e-05, "loss": 1.0049, "num_input_tokens_seen": 36864000, "step": 4500 }, { "epoch": 0.14735088730860402, "grad_norm": 0.629861056804657, "learning_rate": 4.9813985366901435e-05, "loss": 1.0586, "num_input_tokens_seen": 37683200, "step": 4600 }, { "epoch": 0.1505541674674867, "grad_norm": 0.5835918188095093, "learning_rate": 4.980352143611081e-05, "loss": 1.0949, "num_input_tokens_seen": 38502400, "step": 4700 }, { "epoch": 0.1537574476263694, "grad_norm": 0.5552580952644348, "learning_rate": 4.979277234813021e-05, "loss": 1.0374, "num_input_tokens_seen": 39321600, "step": 4800 }, { "epoch": 0.1569607277852521, "grad_norm": 0.7137876749038696, "learning_rate": 4.978173822653802e-05, "loss": 1.0195, "num_input_tokens_seen": 40140800, "step": 4900 }, { "epoch": 0.1601640079441348, "grad_norm": 0.6314465403556824, "learning_rate": 4.9770419198189595e-05, "loss": 1.0661, "num_input_tokens_seen": 40960000, "step": 5000 }, { "epoch": 0.16336728810301748, "grad_norm": 0.5494422316551208, "learning_rate": 4.975881539321574e-05, "loss": 1.0168, "num_input_tokens_seen": 41779200, "step": 5100 }, { "epoch": 0.1665705682619002, "grad_norm": 2.2284624576568604, "learning_rate": 4.974692694502123e-05, "loss": 1.0523, "num_input_tokens_seen": 42598400, "step": 5200 }, { "epoch": 0.16977384842078289, "grad_norm": 0.5189602375030518, "learning_rate": 4.973475399028331e-05, "loss": 1.0294, "num_input_tokens_seen": 43417600, "step": 5300 }, { "epoch": 0.17297712857966557, "grad_norm": 2.1537561416625977, "learning_rate": 4.972229666895006e-05, "loss": 0.9866, "num_input_tokens_seen": 44236800, "step": 5400 }, { "epoch": 0.17618040873854826, "grad_norm": 0.5834473967552185, "learning_rate": 4.970955512423884e-05, "loss": 0.99, "num_input_tokens_seen": 45056000, "step": 5500 }, { "epoch": 0.17938368889743098, "grad_norm": 0.6151788830757141, "learning_rate": 4.969652950263462e-05, "loss": 1.0292, "num_input_tokens_seen": 45875200, "step": 5600 }, { "epoch": 0.18258696905631366, "grad_norm": 0.641342043876648, "learning_rate": 4.96832199538883e-05, "loss": 1.0712, "num_input_tokens_seen": 46694400, "step": 5700 }, { "epoch": 0.18579024921519635, "grad_norm": 0.7882746458053589, "learning_rate": 4.966962663101499e-05, "loss": 1.0279, "num_input_tokens_seen": 47513600, "step": 5800 }, { "epoch": 0.18899352937407907, "grad_norm": 0.633734405040741, "learning_rate": 4.965574969029223e-05, "loss": 1.0448, "num_input_tokens_seen": 48332800, "step": 5900 }, { "epoch": 0.19219680953296175, "grad_norm": 1.5470919609069824, "learning_rate": 4.9641589291258255e-05, "loss": 1.0492, "num_input_tokens_seen": 49152000, "step": 6000 }, { "epoch": 0.19540008969184444, "grad_norm": 1.6563118696212769, "learning_rate": 4.962714559671008e-05, "loss": 1.0593, "num_input_tokens_seen": 49971200, "step": 6100 }, { "epoch": 0.19860336985072716, "grad_norm": 0.6741557717323303, "learning_rate": 4.961241877270169e-05, "loss": 1.0054, "num_input_tokens_seen": 50790400, "step": 6200 }, { "epoch": 0.20180665000960984, "grad_norm": 0.6842678785324097, "learning_rate": 4.9597408988542096e-05, "loss": 0.9865, "num_input_tokens_seen": 51609600, "step": 6300 }, { "epoch": 0.20500993016849253, "grad_norm": 8.189310073852539, "learning_rate": 4.958211641679339e-05, "loss": 1.0529, "num_input_tokens_seen": 52428800, "step": 6400 }, { "epoch": 0.20821321032737522, "grad_norm": 0.8904711604118347, "learning_rate": 4.956654123326881e-05, "loss": 1.0272, "num_input_tokens_seen": 53248000, "step": 6500 }, { "epoch": 0.21141649048625794, "grad_norm": 0.7857553362846375, "learning_rate": 4.9550683617030624e-05, "loss": 1.0295, "num_input_tokens_seen": 54067200, "step": 6600 }, { "epoch": 0.21461977064514062, "grad_norm": 0.6658555865287781, "learning_rate": 4.9534543750388185e-05, "loss": 0.9849, "num_input_tokens_seen": 54886400, "step": 6700 }, { "epoch": 0.2178230508040233, "grad_norm": 0.6390406489372253, "learning_rate": 4.951812181889573e-05, "loss": 0.9597, "num_input_tokens_seen": 55705600, "step": 6800 }, { "epoch": 0.22102633096290603, "grad_norm": 0.5161400437355042, "learning_rate": 4.950141801135034e-05, "loss": 1.0008, "num_input_tokens_seen": 56524800, "step": 6900 }, { "epoch": 0.2242296111217887, "grad_norm": 0.7651511430740356, "learning_rate": 4.948443251978968e-05, "loss": 0.9889, "num_input_tokens_seen": 57344000, "step": 7000 }, { "epoch": 0.2274328912806714, "grad_norm": 0.5069282054901123, "learning_rate": 4.946716553948987e-05, "loss": 0.9869, "num_input_tokens_seen": 58163200, "step": 7100 }, { "epoch": 0.23063617143955412, "grad_norm": 0.5041384696960449, "learning_rate": 4.9449617268963164e-05, "loss": 0.9669, "num_input_tokens_seen": 58982400, "step": 7200 }, { "epoch": 0.2338394515984368, "grad_norm": 1.7203638553619385, "learning_rate": 4.943178790995576e-05, "loss": 1.0426, "num_input_tokens_seen": 59801600, "step": 7300 }, { "epoch": 0.2370427317573195, "grad_norm": 0.8364699482917786, "learning_rate": 4.941367766744539e-05, "loss": 0.9894, "num_input_tokens_seen": 60620800, "step": 7400 }, { "epoch": 0.24024601191620218, "grad_norm": 0.42120370268821716, "learning_rate": 4.939528674963902e-05, "loss": 0.996, "num_input_tokens_seen": 61440000, "step": 7500 }, { "epoch": 0.2434492920750849, "grad_norm": 4.017838001251221, "learning_rate": 4.937661536797044e-05, "loss": 1.0557, "num_input_tokens_seen": 62259200, "step": 7600 }, { "epoch": 0.24665257223396758, "grad_norm": 0.7951923608779907, "learning_rate": 4.9357663737097824e-05, "loss": 1.0614, "num_input_tokens_seen": 63078400, "step": 7700 }, { "epoch": 0.24985585239285027, "grad_norm": 0.7139900922775269, "learning_rate": 4.9338432074901276e-05, "loss": 1.0525, "num_input_tokens_seen": 63897600, "step": 7800 }, { "epoch": 0.25305913255173296, "grad_norm": 0.6686214208602905, "learning_rate": 4.931892060248032e-05, "loss": 1.0947, "num_input_tokens_seen": 64716800, "step": 7900 }, { "epoch": 0.2562624127106157, "grad_norm": 0.737429678440094, "learning_rate": 4.929912954415135e-05, "loss": 0.9886, "num_input_tokens_seen": 65536000, "step": 8000 }, { "epoch": 0.2594656928694984, "grad_norm": 0.49794241786003113, "learning_rate": 4.9279059127445074e-05, "loss": 1.0407, "num_input_tokens_seen": 66355200, "step": 8100 }, { "epoch": 0.26266897302838105, "grad_norm": 0.6615239977836609, "learning_rate": 4.925870958310388e-05, "loss": 1.021, "num_input_tokens_seen": 67174400, "step": 8200 }, { "epoch": 0.26587225318726376, "grad_norm": 1.568616509437561, "learning_rate": 4.923808114507916e-05, "loss": 1.027, "num_input_tokens_seen": 67993600, "step": 8300 }, { "epoch": 0.2690755333461465, "grad_norm": 0.6627603769302368, "learning_rate": 4.921717405052868e-05, "loss": 1.0552, "num_input_tokens_seen": 68812800, "step": 8400 }, { "epoch": 0.27227881350502914, "grad_norm": 0.5849776864051819, "learning_rate": 4.9195988539813814e-05, "loss": 1.0552, "num_input_tokens_seen": 69632000, "step": 8500 }, { "epoch": 0.27548209366391185, "grad_norm": 1.6558514833450317, "learning_rate": 4.917452485649677e-05, "loss": 1.0516, "num_input_tokens_seen": 70451200, "step": 8600 }, { "epoch": 0.27868537382279457, "grad_norm": 0.5784972310066223, "learning_rate": 4.9152783247337823e-05, "loss": 1.0425, "num_input_tokens_seen": 71270400, "step": 8700 }, { "epoch": 0.28188865398167723, "grad_norm": 0.713585376739502, "learning_rate": 4.9130763962292453e-05, "loss": 1.0633, "num_input_tokens_seen": 72089600, "step": 8800 }, { "epoch": 0.28509193414055994, "grad_norm": 0.678617000579834, "learning_rate": 4.9108467254508487e-05, "loss": 1.0208, "num_input_tokens_seen": 72908800, "step": 8900 }, { "epoch": 0.2882952142994426, "grad_norm": 0.6494852900505066, "learning_rate": 4.908589338032316e-05, "loss": 1.0193, "num_input_tokens_seen": 73728000, "step": 9000 }, { "epoch": 0.2914984944583253, "grad_norm": 0.6913178563117981, "learning_rate": 4.9063042599260234e-05, "loss": 0.9783, "num_input_tokens_seen": 74547200, "step": 9100 }, { "epoch": 0.29470177461720803, "grad_norm": 0.6419298648834229, "learning_rate": 4.9039915174026916e-05, "loss": 1.0251, "num_input_tokens_seen": 75366400, "step": 9200 }, { "epoch": 0.2979050547760907, "grad_norm": 0.6663874983787537, "learning_rate": 4.9016511370510945e-05, "loss": 1.009, "num_input_tokens_seen": 76185600, "step": 9300 }, { "epoch": 0.3011083349349734, "grad_norm": 0.5730396509170532, "learning_rate": 4.8992831457777446e-05, "loss": 1.0154, "num_input_tokens_seen": 77004800, "step": 9400 }, { "epoch": 0.3043116150938561, "grad_norm": 0.5048360824584961, "learning_rate": 4.896887570806588e-05, "loss": 1.0498, "num_input_tokens_seen": 77824000, "step": 9500 }, { "epoch": 0.3075148952527388, "grad_norm": 1.7296109199523926, "learning_rate": 4.89446443967869e-05, "loss": 1.0426, "num_input_tokens_seen": 78643200, "step": 9600 }, { "epoch": 0.3107181754116215, "grad_norm": 0.8863735198974609, "learning_rate": 4.892013780251922e-05, "loss": 0.9947, "num_input_tokens_seen": 79462400, "step": 9700 }, { "epoch": 0.3139214555705042, "grad_norm": 2.7898573875427246, "learning_rate": 4.889535620700635e-05, "loss": 1.0301, "num_input_tokens_seen": 80281600, "step": 9800 }, { "epoch": 0.3171247357293869, "grad_norm": 0.5569226741790771, "learning_rate": 4.887029989515341e-05, "loss": 0.976, "num_input_tokens_seen": 81100800, "step": 9900 }, { "epoch": 0.3203280158882696, "grad_norm": 0.46732258796691895, "learning_rate": 4.884496915502385e-05, "loss": 1.0477, "num_input_tokens_seen": 81920000, "step": 10000 }, { "epoch": 0.3235312960471523, "grad_norm": 0.45553821325302124, "learning_rate": 4.881936427783607e-05, "loss": 1.0019, "num_input_tokens_seen": 82739200, "step": 10100 }, { "epoch": 0.32673457620603497, "grad_norm": 0.7193503379821777, "learning_rate": 4.879348555796018e-05, "loss": 0.997, "num_input_tokens_seen": 83558400, "step": 10200 }, { "epoch": 0.3299378563649177, "grad_norm": 0.6309390664100647, "learning_rate": 4.8767333292914544e-05, "loss": 0.9891, "num_input_tokens_seen": 84377600, "step": 10300 }, { "epoch": 0.3331411365238004, "grad_norm": 0.555618166923523, "learning_rate": 4.874090778336235e-05, "loss": 1.0175, "num_input_tokens_seen": 85196800, "step": 10400 }, { "epoch": 0.33634441668268306, "grad_norm": 1.5369619131088257, "learning_rate": 4.8714209333108236e-05, "loss": 1.0151, "num_input_tokens_seen": 86016000, "step": 10500 }, { "epoch": 0.33954769684156577, "grad_norm": 0.5254389047622681, "learning_rate": 4.868723824909469e-05, "loss": 1.025, "num_input_tokens_seen": 86835200, "step": 10600 }, { "epoch": 0.3427509770004485, "grad_norm": 0.5323970913887024, "learning_rate": 4.8659994841398594e-05, "loss": 1.0334, "num_input_tokens_seen": 87654400, "step": 10700 }, { "epoch": 0.34595425715933115, "grad_norm": 0.602602481842041, "learning_rate": 4.863247942322764e-05, "loss": 1.0237, "num_input_tokens_seen": 88473600, "step": 10800 }, { "epoch": 0.34915753731821386, "grad_norm": 2.1106760501861572, "learning_rate": 4.860469231091671e-05, "loss": 1.0181, "num_input_tokens_seen": 89292800, "step": 10900 }, { "epoch": 0.3523608174770965, "grad_norm": 0.6294669508934021, "learning_rate": 4.857663382392428e-05, "loss": 1.0289, "num_input_tokens_seen": 90112000, "step": 11000 }, { "epoch": 0.35556409763597924, "grad_norm": 0.5473527908325195, "learning_rate": 4.854830428482871e-05, "loss": 1.0296, "num_input_tokens_seen": 90931200, "step": 11100 }, { "epoch": 0.35876737779486195, "grad_norm": 0.5963702201843262, "learning_rate": 4.851970401932454e-05, "loss": 0.9784, "num_input_tokens_seen": 91750400, "step": 11200 }, { "epoch": 0.3619706579537446, "grad_norm": 1.5987745523452759, "learning_rate": 4.849083335621878e-05, "loss": 1.0842, "num_input_tokens_seen": 92569600, "step": 11300 }, { "epoch": 0.3651739381126273, "grad_norm": 1.9906154870986938, "learning_rate": 4.846169262742709e-05, "loss": 1.0196, "num_input_tokens_seen": 93388800, "step": 11400 }, { "epoch": 0.36837721827151004, "grad_norm": 0.7897935509681702, "learning_rate": 4.843228216796996e-05, "loss": 1.0103, "num_input_tokens_seen": 94208000, "step": 11500 }, { "epoch": 0.3715804984303927, "grad_norm": 0.6737790107727051, "learning_rate": 4.8402602315968905e-05, "loss": 1.0551, "num_input_tokens_seen": 95027200, "step": 11600 }, { "epoch": 0.3747837785892754, "grad_norm": 0.5573664307594299, "learning_rate": 4.837265341264253e-05, "loss": 1.0221, "num_input_tokens_seen": 95846400, "step": 11700 }, { "epoch": 0.37798705874815813, "grad_norm": 0.6558005809783936, "learning_rate": 4.834243580230266e-05, "loss": 0.975, "num_input_tokens_seen": 96665600, "step": 11800 }, { "epoch": 0.3811903389070408, "grad_norm": 0.7646604776382446, "learning_rate": 4.831194983235029e-05, "loss": 1.0152, "num_input_tokens_seen": 97484800, "step": 11900 }, { "epoch": 0.3843936190659235, "grad_norm": 0.5662313103675842, "learning_rate": 4.82811958532717e-05, "loss": 0.9909, "num_input_tokens_seen": 98304000, "step": 12000 }, { "epoch": 0.3875968992248062, "grad_norm": 0.5597667098045349, "learning_rate": 4.825017421863436e-05, "loss": 1.0208, "num_input_tokens_seen": 99123200, "step": 12100 }, { "epoch": 0.3908001793836889, "grad_norm": 0.5832675099372864, "learning_rate": 4.821888528508287e-05, "loss": 1.0189, "num_input_tokens_seen": 99942400, "step": 12200 }, { "epoch": 0.3940034595425716, "grad_norm": 1.6424989700317383, "learning_rate": 4.8187329412334884e-05, "loss": 1.055, "num_input_tokens_seen": 100761600, "step": 12300 }, { "epoch": 0.3972067397014543, "grad_norm": 0.4590611755847931, "learning_rate": 4.815550696317695e-05, "loss": 1.0586, "num_input_tokens_seen": 101580800, "step": 12400 }, { "epoch": 0.400410019860337, "grad_norm": 0.5123792290687561, "learning_rate": 4.812341830346035e-05, "loss": 1.0073, "num_input_tokens_seen": 102400000, "step": 12500 }, { "epoch": 0.4036133000192197, "grad_norm": 1.7758103609085083, "learning_rate": 4.80910638020969e-05, "loss": 1.0012, "num_input_tokens_seen": 103219200, "step": 12600 }, { "epoch": 0.40681658017810235, "grad_norm": 0.6465420722961426, "learning_rate": 4.805844383105469e-05, "loss": 0.9919, "num_input_tokens_seen": 104038400, "step": 12700 }, { "epoch": 0.41001986033698506, "grad_norm": 0.6052021980285645, "learning_rate": 4.802555876535383e-05, "loss": 1.0369, "num_input_tokens_seen": 104857600, "step": 12800 }, { "epoch": 0.4132231404958678, "grad_norm": 0.5069152116775513, "learning_rate": 4.799240898306214e-05, "loss": 1.0105, "num_input_tokens_seen": 105676800, "step": 12900 }, { "epoch": 0.41642642065475044, "grad_norm": 0.6421388387680054, "learning_rate": 4.7958994865290766e-05, "loss": 0.9861, "num_input_tokens_seen": 106496000, "step": 13000 }, { "epoch": 0.41962970081363316, "grad_norm": 0.6774849891662598, "learning_rate": 4.7925316796189826e-05, "loss": 0.9771, "num_input_tokens_seen": 107315200, "step": 13100 }, { "epoch": 0.42283298097251587, "grad_norm": 2.159661293029785, "learning_rate": 4.789137516294402e-05, "loss": 1.0182, "num_input_tokens_seen": 108134400, "step": 13200 }, { "epoch": 0.42603626113139853, "grad_norm": 0.6035510301589966, "learning_rate": 4.785717035576812e-05, "loss": 1.036, "num_input_tokens_seen": 108953600, "step": 13300 }, { "epoch": 0.42923954129028125, "grad_norm": 1.6665889024734497, "learning_rate": 4.782270276790254e-05, "loss": 1.0713, "num_input_tokens_seen": 109772800, "step": 13400 }, { "epoch": 0.43244282144916396, "grad_norm": 0.702918291091919, "learning_rate": 4.778797279560876e-05, "loss": 0.9708, "num_input_tokens_seen": 110592000, "step": 13500 }, { "epoch": 0.4356461016080466, "grad_norm": 0.6358348727226257, "learning_rate": 4.775298083816482e-05, "loss": 0.9967, "num_input_tokens_seen": 111411200, "step": 13600 }, { "epoch": 0.43884938176692934, "grad_norm": 0.652087390422821, "learning_rate": 4.77177272978607e-05, "loss": 1.0333, "num_input_tokens_seen": 112230400, "step": 13700 }, { "epoch": 0.44205266192581205, "grad_norm": 0.6892516016960144, "learning_rate": 4.768221257999373e-05, "loss": 1.0308, "num_input_tokens_seen": 113049600, "step": 13800 }, { "epoch": 0.4452559420846947, "grad_norm": 0.6279174089431763, "learning_rate": 4.764643709286386e-05, "loss": 1.057, "num_input_tokens_seen": 113868800, "step": 13900 }, { "epoch": 0.4484592222435774, "grad_norm": 0.6180372834205627, "learning_rate": 4.761040124776904e-05, "loss": 1.0059, "num_input_tokens_seen": 114688000, "step": 14000 }, { "epoch": 0.45166250240246014, "grad_norm": 0.6153070330619812, "learning_rate": 4.757410545900047e-05, "loss": 1.0717, "num_input_tokens_seen": 115507200, "step": 14100 }, { "epoch": 0.4548657825613428, "grad_norm": 0.5821653604507446, "learning_rate": 4.7537550143837796e-05, "loss": 1.0313, "num_input_tokens_seen": 116326400, "step": 14200 }, { "epoch": 0.4580690627202255, "grad_norm": 0.5773714780807495, "learning_rate": 4.750073572254438e-05, "loss": 1.0296, "num_input_tokens_seen": 117145600, "step": 14300 }, { "epoch": 0.46127234287910823, "grad_norm": 0.7084370255470276, "learning_rate": 4.746366261836242e-05, "loss": 0.9977, "num_input_tokens_seen": 117964800, "step": 14400 }, { "epoch": 0.4644756230379909, "grad_norm": 0.719439685344696, "learning_rate": 4.742633125750808e-05, "loss": 0.9753, "num_input_tokens_seen": 118784000, "step": 14500 }, { "epoch": 0.4676789031968736, "grad_norm": 0.6266898512840271, "learning_rate": 4.738874206916665e-05, "loss": 0.9722, "num_input_tokens_seen": 119603200, "step": 14600 }, { "epoch": 0.47088218335575627, "grad_norm": 0.6483869552612305, "learning_rate": 4.7350895485487526e-05, "loss": 1.066, "num_input_tokens_seen": 120422400, "step": 14700 }, { "epoch": 0.474085463514639, "grad_norm": 0.5138384699821472, "learning_rate": 4.731279194157933e-05, "loss": 0.973, "num_input_tokens_seen": 121241600, "step": 14800 }, { "epoch": 0.4772887436735217, "grad_norm": 0.6580103039741516, "learning_rate": 4.727443187550481e-05, "loss": 0.9922, "num_input_tokens_seen": 122060800, "step": 14900 }, { "epoch": 0.48049202383240436, "grad_norm": 0.6680930852890015, "learning_rate": 4.723581572827592e-05, "loss": 0.9851, "num_input_tokens_seen": 122880000, "step": 15000 }, { "epoch": 0.4836953039912871, "grad_norm": 2.329383373260498, "learning_rate": 4.719694394384863e-05, "loss": 1.0284, "num_input_tokens_seen": 123699200, "step": 15100 }, { "epoch": 0.4868985841501698, "grad_norm": 0.7416221499443054, "learning_rate": 4.715781696911792e-05, "loss": 0.9828, "num_input_tokens_seen": 124518400, "step": 15200 }, { "epoch": 0.49010186430905245, "grad_norm": 0.5373809337615967, "learning_rate": 4.7118435253912575e-05, "loss": 0.9621, "num_input_tokens_seen": 125337600, "step": 15300 }, { "epoch": 0.49330514446793516, "grad_norm": 0.5429302453994751, "learning_rate": 4.7078799250990056e-05, "loss": 1.013, "num_input_tokens_seen": 126156800, "step": 15400 }, { "epoch": 0.4965084246268179, "grad_norm": 0.5449560284614563, "learning_rate": 4.7038909416031276e-05, "loss": 1.0564, "num_input_tokens_seen": 126976000, "step": 15500 }, { "epoch": 0.49971170478570054, "grad_norm": 0.6629030704498291, "learning_rate": 4.699876620763535e-05, "loss": 0.9828, "num_input_tokens_seen": 127795200, "step": 15600 }, { "epoch": 0.5029149849445832, "grad_norm": 0.6022646427154541, "learning_rate": 4.6958370087314344e-05, "loss": 1.0435, "num_input_tokens_seen": 128614400, "step": 15700 }, { "epoch": 0.5061182651034659, "grad_norm": 1.8832833766937256, "learning_rate": 4.691772151948799e-05, "loss": 0.9438, "num_input_tokens_seen": 129433600, "step": 15800 }, { "epoch": 0.5093215452623486, "grad_norm": 0.7114049196243286, "learning_rate": 4.687682097147826e-05, "loss": 0.947, "num_input_tokens_seen": 130252800, "step": 15900 }, { "epoch": 0.5125248254212313, "grad_norm": 1.7428299188613892, "learning_rate": 4.683566891350412e-05, "loss": 0.9461, "num_input_tokens_seen": 131072000, "step": 16000 }, { "epoch": 0.5157281055801141, "grad_norm": 0.7306798100471497, "learning_rate": 4.679426581867599e-05, "loss": 0.9964, "num_input_tokens_seen": 131891200, "step": 16100 }, { "epoch": 0.5189313857389968, "grad_norm": 0.6088542938232422, "learning_rate": 4.675261216299042e-05, "loss": 0.9499, "num_input_tokens_seen": 132710400, "step": 16200 }, { "epoch": 0.5221346658978794, "grad_norm": 1.0487473011016846, "learning_rate": 4.6710708425324545e-05, "loss": 1.0205, "num_input_tokens_seen": 133529600, "step": 16300 }, { "epoch": 0.5253379460567621, "grad_norm": 0.4886884093284607, "learning_rate": 4.6668555087430605e-05, "loss": 0.9996, "num_input_tokens_seen": 134348800, "step": 16400 }, { "epoch": 0.5285412262156448, "grad_norm": 0.8639355301856995, "learning_rate": 4.662615263393041e-05, "loss": 1.0013, "num_input_tokens_seen": 135168000, "step": 16500 }, { "epoch": 0.5317445063745275, "grad_norm": 2.132063865661621, "learning_rate": 4.658350155230976e-05, "loss": 1.0437, "num_input_tokens_seen": 135987200, "step": 16600 }, { "epoch": 0.5349477865334102, "grad_norm": 0.5800316333770752, "learning_rate": 4.6540602332912854e-05, "loss": 1.0094, "num_input_tokens_seen": 136806400, "step": 16700 }, { "epoch": 0.538151066692293, "grad_norm": 0.48361486196517944, "learning_rate": 4.6497455468936606e-05, "loss": 1.0141, "num_input_tokens_seen": 137625600, "step": 16800 }, { "epoch": 0.5413543468511756, "grad_norm": 0.5760986804962158, "learning_rate": 4.645406145642506e-05, "loss": 1.0359, "num_input_tokens_seen": 138444800, "step": 16900 }, { "epoch": 0.5445576270100583, "grad_norm": 0.42741426825523376, "learning_rate": 4.64104207942636e-05, "loss": 0.9605, "num_input_tokens_seen": 139264000, "step": 17000 }, { "epoch": 0.547760907168941, "grad_norm": 0.6151024103164673, "learning_rate": 4.6366533984173274e-05, "loss": 0.9502, "num_input_tokens_seen": 140083200, "step": 17100 }, { "epoch": 0.5509641873278237, "grad_norm": 5.775717735290527, "learning_rate": 4.6322401530704995e-05, "loss": 1.016, "num_input_tokens_seen": 140902400, "step": 17200 }, { "epoch": 0.5541674674867064, "grad_norm": 0.5886793732643127, "learning_rate": 4.627802394123375e-05, "loss": 1.0039, "num_input_tokens_seen": 141721600, "step": 17300 }, { "epoch": 0.5573707476455891, "grad_norm": 2.4064829349517822, "learning_rate": 4.623340172595277e-05, "loss": 0.9972, "num_input_tokens_seen": 142540800, "step": 17400 }, { "epoch": 0.5605740278044717, "grad_norm": 0.5964205861091614, "learning_rate": 4.6188535397867675e-05, "loss": 0.9894, "num_input_tokens_seen": 143360000, "step": 17500 }, { "epoch": 0.5637773079633545, "grad_norm": 0.5683798789978027, "learning_rate": 4.614342547279052e-05, "loss": 1.0721, "num_input_tokens_seen": 144179200, "step": 17600 }, { "epoch": 0.5669805881222372, "grad_norm": 0.5441416501998901, "learning_rate": 4.609807246933395e-05, "loss": 1.0183, "num_input_tokens_seen": 144998400, "step": 17700 }, { "epoch": 0.5701838682811199, "grad_norm": 2.547898530960083, "learning_rate": 4.605247690890518e-05, "loss": 1.0083, "num_input_tokens_seen": 145817600, "step": 17800 }, { "epoch": 0.5733871484400026, "grad_norm": 0.7640330791473389, "learning_rate": 4.600663931570001e-05, "loss": 0.9927, "num_input_tokens_seen": 146636800, "step": 17900 }, { "epoch": 0.5765904285988852, "grad_norm": 0.6045035123825073, "learning_rate": 4.596056021669681e-05, "loss": 1.0144, "num_input_tokens_seen": 147456000, "step": 18000 }, { "epoch": 0.5797937087577679, "grad_norm": 0.5718028545379639, "learning_rate": 4.591424014165047e-05, "loss": 1.0417, "num_input_tokens_seen": 148275200, "step": 18100 }, { "epoch": 0.5829969889166506, "grad_norm": 0.49183499813079834, "learning_rate": 4.586767962308625e-05, "loss": 1.0124, "num_input_tokens_seen": 149094400, "step": 18200 }, { "epoch": 0.5862002690755334, "grad_norm": 0.5138664841651917, "learning_rate": 4.5820879196293756e-05, "loss": 0.9961, "num_input_tokens_seen": 149913600, "step": 18300 }, { "epoch": 0.5894035492344161, "grad_norm": 0.6507889628410339, "learning_rate": 4.577383939932069e-05, "loss": 1.0066, "num_input_tokens_seen": 150732800, "step": 18400 }, { "epoch": 0.5926068293932988, "grad_norm": 0.48219242691993713, "learning_rate": 4.572656077296676e-05, "loss": 1.0422, "num_input_tokens_seen": 151552000, "step": 18500 }, { "epoch": 0.5958101095521814, "grad_norm": 2.981851100921631, "learning_rate": 4.567904386077734e-05, "loss": 1.0647, "num_input_tokens_seen": 152371200, "step": 18600 }, { "epoch": 0.5990133897110641, "grad_norm": 1.6492716073989868, "learning_rate": 4.563128920903735e-05, "loss": 1.0465, "num_input_tokens_seen": 153190400, "step": 18700 }, { "epoch": 0.6022166698699468, "grad_norm": 0.6568962335586548, "learning_rate": 4.558329736676488e-05, "loss": 1.0505, "num_input_tokens_seen": 154009600, "step": 18800 }, { "epoch": 0.6054199500288295, "grad_norm": 0.77339768409729, "learning_rate": 4.553506888570494e-05, "loss": 1.0287, "num_input_tokens_seen": 154828800, "step": 18900 }, { "epoch": 0.6086232301877122, "grad_norm": 0.6354805827140808, "learning_rate": 4.548660432032307e-05, "loss": 0.9675, "num_input_tokens_seen": 155648000, "step": 19000 }, { "epoch": 0.611826510346595, "grad_norm": 0.6528341770172119, "learning_rate": 4.5437904227799e-05, "loss": 1.0027, "num_input_tokens_seen": 156467200, "step": 19100 }, { "epoch": 0.6150297905054776, "grad_norm": 0.7518653273582458, "learning_rate": 4.538896916802023e-05, "loss": 1.0002, "num_input_tokens_seen": 157286400, "step": 19200 }, { "epoch": 0.6182330706643603, "grad_norm": 1.2601783275604248, "learning_rate": 4.533979970357558e-05, "loss": 1.0698, "num_input_tokens_seen": 158105600, "step": 19300 }, { "epoch": 0.621436350823243, "grad_norm": 0.7242873311042786, "learning_rate": 4.529039639974876e-05, "loss": 0.9834, "num_input_tokens_seen": 158924800, "step": 19400 }, { "epoch": 0.6246396309821257, "grad_norm": 2.0396833419799805, "learning_rate": 4.524075982451183e-05, "loss": 0.9634, "num_input_tokens_seen": 159744000, "step": 19500 }, { "epoch": 0.6278429111410084, "grad_norm": 2.7037477493286133, "learning_rate": 4.5190890548518696e-05, "loss": 1.0221, "num_input_tokens_seen": 160563200, "step": 19600 }, { "epoch": 0.631046191299891, "grad_norm": 1.6231496334075928, "learning_rate": 4.5140789145098536e-05, "loss": 1.0582, "num_input_tokens_seen": 161382400, "step": 19700 }, { "epoch": 0.6342494714587738, "grad_norm": 0.6004766225814819, "learning_rate": 4.509045619024921e-05, "loss": 1.0112, "num_input_tokens_seen": 162201600, "step": 19800 }, { "epoch": 0.6374527516176565, "grad_norm": 12.123788833618164, "learning_rate": 4.5039892262630656e-05, "loss": 1.0078, "num_input_tokens_seen": 163020800, "step": 19900 }, { "epoch": 0.6406560317765392, "grad_norm": 3.2375683784484863, "learning_rate": 4.498909794355821e-05, "loss": 1.0239, "num_input_tokens_seen": 163840000, "step": 20000 }, { "epoch": 0.6438593119354219, "grad_norm": 0.8260817527770996, "learning_rate": 4.493807381699595e-05, "loss": 1.009, "num_input_tokens_seen": 164659200, "step": 20100 }, { "epoch": 0.6470625920943046, "grad_norm": 0.7712699174880981, "learning_rate": 4.488682046954994e-05, "loss": 0.9565, "num_input_tokens_seen": 165478400, "step": 20200 }, { "epoch": 0.6502658722531872, "grad_norm": 0.5889214277267456, "learning_rate": 4.483533849046155e-05, "loss": 1.0225, "num_input_tokens_seen": 166297600, "step": 20300 }, { "epoch": 0.6534691524120699, "grad_norm": 1.2388112545013428, "learning_rate": 4.4783628471600636e-05, "loss": 1.0642, "num_input_tokens_seen": 167116800, "step": 20400 }, { "epoch": 0.6566724325709526, "grad_norm": 0.6664971709251404, "learning_rate": 4.473169100745871e-05, "loss": 0.9598, "num_input_tokens_seen": 167936000, "step": 20500 }, { "epoch": 0.6598757127298354, "grad_norm": 0.5350831151008606, "learning_rate": 4.4679526695142195e-05, "loss": 1.0391, "num_input_tokens_seen": 168755200, "step": 20600 }, { "epoch": 0.6630789928887181, "grad_norm": 0.6643035411834717, "learning_rate": 4.4627136134365463e-05, "loss": 0.998, "num_input_tokens_seen": 169574400, "step": 20700 }, { "epoch": 0.6662822730476008, "grad_norm": 0.5972053408622742, "learning_rate": 4.457451992744402e-05, "loss": 1.0335, "num_input_tokens_seen": 170393600, "step": 20800 }, { "epoch": 0.6694855532064834, "grad_norm": 0.5102434754371643, "learning_rate": 4.452167867928751e-05, "loss": 1.0459, "num_input_tokens_seen": 171212800, "step": 20900 }, { "epoch": 0.6726888333653661, "grad_norm": 0.5346103310585022, "learning_rate": 4.4468612997392824e-05, "loss": 0.9922, "num_input_tokens_seen": 172032000, "step": 21000 }, { "epoch": 0.6758921135242488, "grad_norm": 0.5129193663597107, "learning_rate": 4.441532349183706e-05, "loss": 1.0024, "num_input_tokens_seen": 172851200, "step": 21100 }, { "epoch": 0.6790953936831315, "grad_norm": 0.5462967753410339, "learning_rate": 4.4361810775270554e-05, "loss": 0.994, "num_input_tokens_seen": 173670400, "step": 21200 }, { "epoch": 0.6822986738420143, "grad_norm": 1.2343724966049194, "learning_rate": 4.430807546290982e-05, "loss": 0.9669, "num_input_tokens_seen": 174489600, "step": 21300 }, { "epoch": 0.685501954000897, "grad_norm": 0.653947651386261, "learning_rate": 4.425411817253048e-05, "loss": 1.0029, "num_input_tokens_seen": 175308800, "step": 21400 }, { "epoch": 0.6887052341597796, "grad_norm": 2.948323965072632, "learning_rate": 4.419993952446013e-05, "loss": 1.0158, "num_input_tokens_seen": 176128000, "step": 21500 }, { "epoch": 0.6919085143186623, "grad_norm": 1.577588438987732, "learning_rate": 4.414554014157127e-05, "loss": 1.0571, "num_input_tokens_seen": 176947200, "step": 21600 }, { "epoch": 0.695111794477545, "grad_norm": 1.0136100053787231, "learning_rate": 4.4090920649274095e-05, "loss": 0.9647, "num_input_tokens_seen": 177766400, "step": 21700 }, { "epoch": 0.6983150746364277, "grad_norm": 0.5571495294570923, "learning_rate": 4.40360816755093e-05, "loss": 0.9609, "num_input_tokens_seen": 178585600, "step": 21800 }, { "epoch": 0.7015183547953104, "grad_norm": 0.5548049211502075, "learning_rate": 4.3981023850740926e-05, "loss": 0.9524, "num_input_tokens_seen": 179404800, "step": 21900 }, { "epoch": 0.704721634954193, "grad_norm": 0.9693801999092102, "learning_rate": 4.392574780794901e-05, "loss": 0.9641, "num_input_tokens_seen": 180224000, "step": 22000 }, { "epoch": 0.7079249151130758, "grad_norm": 0.6628372669219971, "learning_rate": 4.387025418262242e-05, "loss": 0.9838, "num_input_tokens_seen": 181043200, "step": 22100 }, { "epoch": 0.7111281952719585, "grad_norm": 0.5312179923057556, "learning_rate": 4.381454361275143e-05, "loss": 1.0309, "num_input_tokens_seen": 181862400, "step": 22200 }, { "epoch": 0.7143314754308412, "grad_norm": 0.6137087941169739, "learning_rate": 4.3758616738820506e-05, "loss": 1.0029, "num_input_tokens_seen": 182681600, "step": 22300 }, { "epoch": 0.7175347555897239, "grad_norm": 1.6591495275497437, "learning_rate": 4.370247420380085e-05, "loss": 0.9842, "num_input_tokens_seen": 183500800, "step": 22400 }, { "epoch": 0.7207380357486066, "grad_norm": 0.677762508392334, "learning_rate": 4.3646116653143046e-05, "loss": 0.9606, "num_input_tokens_seen": 184320000, "step": 22500 }, { "epoch": 0.7239413159074892, "grad_norm": 0.602687418460846, "learning_rate": 4.358954473476965e-05, "loss": 0.9781, "num_input_tokens_seen": 185139200, "step": 22600 }, { "epoch": 0.7271445960663719, "grad_norm": 0.5638014674186707, "learning_rate": 4.353275909906772e-05, "loss": 0.9823, "num_input_tokens_seen": 185958400, "step": 22700 }, { "epoch": 0.7303478762252547, "grad_norm": 1.6680676937103271, "learning_rate": 4.3475760398881325e-05, "loss": 0.988, "num_input_tokens_seen": 186777600, "step": 22800 }, { "epoch": 0.7335511563841374, "grad_norm": 0.6449896097183228, "learning_rate": 4.3418549289504096e-05, "loss": 0.9878, "num_input_tokens_seen": 187596800, "step": 22900 }, { "epoch": 0.7367544365430201, "grad_norm": 2.6768717765808105, "learning_rate": 4.3361126428671636e-05, "loss": 1.0091, "num_input_tokens_seen": 188416000, "step": 23000 }, { "epoch": 0.7399577167019028, "grad_norm": 1.079026460647583, "learning_rate": 4.330349247655398e-05, "loss": 1.0383, "num_input_tokens_seen": 189235200, "step": 23100 }, { "epoch": 0.7431609968607854, "grad_norm": 0.6426740288734436, "learning_rate": 4.324564809574799e-05, "loss": 0.9801, "num_input_tokens_seen": 190054400, "step": 23200 }, { "epoch": 0.7463642770196681, "grad_norm": 0.8264270424842834, "learning_rate": 4.318759395126979e-05, "loss": 1.0095, "num_input_tokens_seen": 190873600, "step": 23300 }, { "epoch": 0.7495675571785508, "grad_norm": 0.5160927176475525, "learning_rate": 4.3129330710547035e-05, "loss": 0.9601, "num_input_tokens_seen": 191692800, "step": 23400 }, { "epoch": 0.7527708373374336, "grad_norm": 0.6011959910392761, "learning_rate": 4.307085904341133e-05, "loss": 0.9837, "num_input_tokens_seen": 192512000, "step": 23500 }, { "epoch": 0.7559741174963163, "grad_norm": 0.5961838960647583, "learning_rate": 4.3012179622090436e-05, "loss": 0.9647, "num_input_tokens_seen": 193331200, "step": 23600 }, { "epoch": 0.7591773976551989, "grad_norm": 0.8201313614845276, "learning_rate": 4.295329312120063e-05, "loss": 0.9439, "num_input_tokens_seen": 194150400, "step": 23700 }, { "epoch": 0.7623806778140816, "grad_norm": 0.5474829077720642, "learning_rate": 4.289420021773889e-05, "loss": 0.9708, "num_input_tokens_seen": 194969600, "step": 23800 }, { "epoch": 0.7655839579729643, "grad_norm": 0.5124524235725403, "learning_rate": 4.283490159107513e-05, "loss": 1.0109, "num_input_tokens_seen": 195788800, "step": 23900 }, { "epoch": 0.768787238131847, "grad_norm": 0.6800445318222046, "learning_rate": 4.27753979229444e-05, "loss": 1.0119, "num_input_tokens_seen": 196608000, "step": 24000 }, { "epoch": 0.7719905182907297, "grad_norm": 0.5350146889686584, "learning_rate": 4.271568989743903e-05, "loss": 0.9659, "num_input_tokens_seen": 197427200, "step": 24100 }, { "epoch": 0.7751937984496124, "grad_norm": 0.6650831699371338, "learning_rate": 4.265577820100076e-05, "loss": 0.9729, "num_input_tokens_seen": 198246400, "step": 24200 }, { "epoch": 0.778397078608495, "grad_norm": 0.5228304862976074, "learning_rate": 4.2595663522412884e-05, "loss": 0.9633, "num_input_tokens_seen": 199065600, "step": 24300 }, { "epoch": 0.7816003587673778, "grad_norm": 0.532375693321228, "learning_rate": 4.253534655279232e-05, "loss": 0.9687, "num_input_tokens_seen": 199884800, "step": 24400 }, { "epoch": 0.7848036389262605, "grad_norm": 0.8860092759132385, "learning_rate": 4.247482798558161e-05, "loss": 1.0017, "num_input_tokens_seen": 200704000, "step": 24500 }, { "epoch": 0.7880069190851432, "grad_norm": 2.975177526473999, "learning_rate": 4.241410851654102e-05, "loss": 0.9905, "num_input_tokens_seen": 201523200, "step": 24600 }, { "epoch": 0.7912101992440259, "grad_norm": 0.622031033039093, "learning_rate": 4.235318884374051e-05, "loss": 1.0358, "num_input_tokens_seen": 202342400, "step": 24700 }, { "epoch": 0.7944134794029086, "grad_norm": 1.7574553489685059, "learning_rate": 4.229206966755172e-05, "loss": 1.0105, "num_input_tokens_seen": 203161600, "step": 24800 }, { "epoch": 0.7976167595617912, "grad_norm": 0.7439371347427368, "learning_rate": 4.223075169063989e-05, "loss": 0.9345, "num_input_tokens_seen": 203980800, "step": 24900 }, { "epoch": 0.800820039720674, "grad_norm": 0.5452560782432556, "learning_rate": 4.21692356179558e-05, "loss": 0.9655, "num_input_tokens_seen": 204800000, "step": 25000 }, { "epoch": 0.8040233198795567, "grad_norm": 0.5876986384391785, "learning_rate": 4.210752215672769e-05, "loss": 0.949, "num_input_tokens_seen": 205619200, "step": 25100 }, { "epoch": 0.8072266000384394, "grad_norm": 2.6809980869293213, "learning_rate": 4.204561201645307e-05, "loss": 1.0082, "num_input_tokens_seen": 206438400, "step": 25200 }, { "epoch": 0.8104298801973221, "grad_norm": 0.647762656211853, "learning_rate": 4.198350590889064e-05, "loss": 1.0074, "num_input_tokens_seen": 207257600, "step": 25300 }, { "epoch": 0.8136331603562047, "grad_norm": 0.4822922945022583, "learning_rate": 4.192120454805203e-05, "loss": 0.9638, "num_input_tokens_seen": 208076800, "step": 25400 }, { "epoch": 0.8168364405150874, "grad_norm": 9.964862823486328, "learning_rate": 4.185870865019364e-05, "loss": 0.9793, "num_input_tokens_seen": 208896000, "step": 25500 }, { "epoch": 0.8200397206739701, "grad_norm": 0.6270651817321777, "learning_rate": 4.17960189338084e-05, "loss": 0.9515, "num_input_tokens_seen": 209715200, "step": 25600 }, { "epoch": 0.8232430008328528, "grad_norm": 0.5813098549842834, "learning_rate": 4.17331361196175e-05, "loss": 0.9659, "num_input_tokens_seen": 210534400, "step": 25700 }, { "epoch": 0.8264462809917356, "grad_norm": 0.5864317417144775, "learning_rate": 4.167006093056209e-05, "loss": 1.0496, "num_input_tokens_seen": 211353600, "step": 25800 }, { "epoch": 0.8296495611506183, "grad_norm": 2.7955405712127686, "learning_rate": 4.1606794091795e-05, "loss": 0.9466, "num_input_tokens_seen": 212172800, "step": 25900 }, { "epoch": 0.8328528413095009, "grad_norm": 0.5431935787200928, "learning_rate": 4.154333633067238e-05, "loss": 0.9308, "num_input_tokens_seen": 212992000, "step": 26000 }, { "epoch": 0.8360561214683836, "grad_norm": 2.313504934310913, "learning_rate": 4.147968837674535e-05, "loss": 0.9996, "num_input_tokens_seen": 213811200, "step": 26100 }, { "epoch": 0.8392594016272663, "grad_norm": 0.6028672456741333, "learning_rate": 4.141585096175162e-05, "loss": 0.9862, "num_input_tokens_seen": 214630400, "step": 26200 }, { "epoch": 0.842462681786149, "grad_norm": 1.6038614511489868, "learning_rate": 4.1351824819607056e-05, "loss": 1.0175, "num_input_tokens_seen": 215449600, "step": 26300 }, { "epoch": 0.8456659619450317, "grad_norm": 0.6132040619850159, "learning_rate": 4.128761068639723e-05, "loss": 0.9903, "num_input_tokens_seen": 216268800, "step": 26400 }, { "epoch": 0.8488692421039145, "grad_norm": 1.7026666402816772, "learning_rate": 4.122320930036902e-05, "loss": 1.0261, "num_input_tokens_seen": 217088000, "step": 26500 }, { "epoch": 0.8520725222627971, "grad_norm": 0.6355572938919067, "learning_rate": 4.1158621401922046e-05, "loss": 1.0048, "num_input_tokens_seen": 217907200, "step": 26600 }, { "epoch": 0.8552758024216798, "grad_norm": 0.683513879776001, "learning_rate": 4.109384773360023e-05, "loss": 0.9659, "num_input_tokens_seen": 218726400, "step": 26700 }, { "epoch": 0.8584790825805625, "grad_norm": 0.6867396831512451, "learning_rate": 4.10288890400832e-05, "loss": 1.0134, "num_input_tokens_seen": 219545600, "step": 26800 }, { "epoch": 0.8616823627394452, "grad_norm": 0.4578529894351959, "learning_rate": 4.0963746068177744e-05, "loss": 1.0011, "num_input_tokens_seen": 220364800, "step": 26900 }, { "epoch": 0.8648856428983279, "grad_norm": 0.5275700688362122, "learning_rate": 4.089841956680927e-05, "loss": 1.0777, "num_input_tokens_seen": 221184000, "step": 27000 }, { "epoch": 0.8680889230572106, "grad_norm": 0.5704593658447266, "learning_rate": 4.08329102870131e-05, "loss": 1.0113, "num_input_tokens_seen": 222003200, "step": 27100 }, { "epoch": 0.8712922032160932, "grad_norm": 0.5546739101409912, "learning_rate": 4.076721898192597e-05, "loss": 1.0181, "num_input_tokens_seen": 222822400, "step": 27200 }, { "epoch": 0.874495483374976, "grad_norm": 0.4796381890773773, "learning_rate": 4.070134640677722e-05, "loss": 0.9882, "num_input_tokens_seen": 223641600, "step": 27300 }, { "epoch": 0.8776987635338587, "grad_norm": 8.13311767578125, "learning_rate": 4.063529331888024e-05, "loss": 0.9378, "num_input_tokens_seen": 224460800, "step": 27400 }, { "epoch": 0.8809020436927414, "grad_norm": 0.4969484806060791, "learning_rate": 4.056906047762368e-05, "loss": 0.9867, "num_input_tokens_seen": 225280000, "step": 27500 }, { "epoch": 0.8841053238516241, "grad_norm": 3.9572601318359375, "learning_rate": 4.0502648644462774e-05, "loss": 0.9645, "num_input_tokens_seen": 226099200, "step": 27600 }, { "epoch": 0.8873086040105067, "grad_norm": 2.1928722858428955, "learning_rate": 4.043605858291053e-05, "loss": 0.9678, "num_input_tokens_seen": 226918400, "step": 27700 }, { "epoch": 0.8905118841693894, "grad_norm": 0.7099782824516296, "learning_rate": 4.036929105852901e-05, "loss": 1.0127, "num_input_tokens_seen": 227737600, "step": 27800 }, { "epoch": 0.8937151643282721, "grad_norm": 0.6126459836959839, "learning_rate": 4.0302346838920514e-05, "loss": 1.0439, "num_input_tokens_seen": 228556800, "step": 27900 }, { "epoch": 0.8969184444871549, "grad_norm": 0.6163774728775024, "learning_rate": 4.02352266937187e-05, "loss": 0.9393, "num_input_tokens_seen": 229376000, "step": 28000 }, { "epoch": 0.9001217246460376, "grad_norm": 0.6306945085525513, "learning_rate": 4.016793139457982e-05, "loss": 0.8966, "num_input_tokens_seen": 230195200, "step": 28100 }, { "epoch": 0.9033250048049203, "grad_norm": 0.6520447134971619, "learning_rate": 4.0100461715173777e-05, "loss": 0.9861, "num_input_tokens_seen": 231014400, "step": 28200 }, { "epoch": 0.9065282849638029, "grad_norm": 0.5960193276405334, "learning_rate": 4.003281843117528e-05, "loss": 1.0012, "num_input_tokens_seen": 231833600, "step": 28300 }, { "epoch": 0.9097315651226856, "grad_norm": 0.6080912947654724, "learning_rate": 3.9965002320254924e-05, "loss": 0.9602, "num_input_tokens_seen": 232652800, "step": 28400 }, { "epoch": 0.9129348452815683, "grad_norm": 0.6659435033798218, "learning_rate": 3.989701416207019e-05, "loss": 0.988, "num_input_tokens_seen": 233472000, "step": 28500 }, { "epoch": 0.916138125440451, "grad_norm": 2.5207667350769043, "learning_rate": 3.9828854738256564e-05, "loss": 1.0339, "num_input_tokens_seen": 234291200, "step": 28600 }, { "epoch": 0.9193414055993337, "grad_norm": 2.4952239990234375, "learning_rate": 3.976052483241849e-05, "loss": 1.0025, "num_input_tokens_seen": 235110400, "step": 28700 }, { "epoch": 0.9225446857582165, "grad_norm": 0.6766204237937927, "learning_rate": 3.969202523012038e-05, "loss": 1.0335, "num_input_tokens_seen": 235929600, "step": 28800 }, { "epoch": 0.9257479659170991, "grad_norm": 0.666861891746521, "learning_rate": 3.9623356718877605e-05, "loss": 0.9721, "num_input_tokens_seen": 236748800, "step": 28900 }, { "epoch": 0.9289512460759818, "grad_norm": 0.5322718620300293, "learning_rate": 3.955452008814741e-05, "loss": 0.9866, "num_input_tokens_seen": 237568000, "step": 29000 }, { "epoch": 0.9321545262348645, "grad_norm": 0.6603706479072571, "learning_rate": 3.9485516129319844e-05, "loss": 0.9863, "num_input_tokens_seen": 238387200, "step": 29100 }, { "epoch": 0.9353578063937472, "grad_norm": 0.6650800704956055, "learning_rate": 3.9416345635708676e-05, "loss": 0.9902, "num_input_tokens_seen": 239206400, "step": 29200 }, { "epoch": 0.9385610865526299, "grad_norm": 2.477098226547241, "learning_rate": 3.9347009402542256e-05, "loss": 0.991, "num_input_tokens_seen": 240025600, "step": 29300 }, { "epoch": 0.9417643667115125, "grad_norm": 0.6523051261901855, "learning_rate": 3.9277508226954394e-05, "loss": 0.9851, "num_input_tokens_seen": 240844800, "step": 29400 }, { "epoch": 0.9449676468703953, "grad_norm": 0.7197608351707458, "learning_rate": 3.920784290797519e-05, "loss": 1.0144, "num_input_tokens_seen": 241664000, "step": 29500 }, { "epoch": 0.948170927029278, "grad_norm": 0.6857073903083801, "learning_rate": 3.9138014246521806e-05, "loss": 0.9529, "num_input_tokens_seen": 242483200, "step": 29600 }, { "epoch": 0.9513742071881607, "grad_norm": 0.616074800491333, "learning_rate": 3.906802304538935e-05, "loss": 0.9949, "num_input_tokens_seen": 243302400, "step": 29700 }, { "epoch": 0.9545774873470434, "grad_norm": 0.5982092022895813, "learning_rate": 3.899787010924152e-05, "loss": 0.9596, "num_input_tokens_seen": 244121600, "step": 29800 }, { "epoch": 0.9577807675059261, "grad_norm": 0.6943311095237732, "learning_rate": 3.8927556244601495e-05, "loss": 0.9813, "num_input_tokens_seen": 244940800, "step": 29900 }, { "epoch": 0.9609840476648087, "grad_norm": 0.7715808153152466, "learning_rate": 3.885708225984254e-05, "loss": 0.9747, "num_input_tokens_seen": 245760000, "step": 30000 }, { "epoch": 0.9641873278236914, "grad_norm": 0.6129135489463806, "learning_rate": 3.878644896517879e-05, "loss": 0.9933, "num_input_tokens_seen": 246579200, "step": 30100 }, { "epoch": 0.9673906079825741, "grad_norm": 0.7009174227714539, "learning_rate": 3.87156571726559e-05, "loss": 0.964, "num_input_tokens_seen": 247398400, "step": 30200 }, { "epoch": 0.9705938881414569, "grad_norm": 0.7255650758743286, "learning_rate": 3.8644707696141704e-05, "loss": 0.9784, "num_input_tokens_seen": 248217600, "step": 30300 }, { "epoch": 0.9737971683003396, "grad_norm": 4.299106597900391, "learning_rate": 3.857360135131691e-05, "loss": 1.0191, "num_input_tokens_seen": 249036800, "step": 30400 }, { "epoch": 0.9770004484592223, "grad_norm": 0.5924736261367798, "learning_rate": 3.8502338955665644e-05, "loss": 0.9769, "num_input_tokens_seen": 249856000, "step": 30500 }, { "epoch": 0.9802037286181049, "grad_norm": 0.7270549535751343, "learning_rate": 3.843092132846613e-05, "loss": 1.0179, "num_input_tokens_seen": 250675200, "step": 30600 }, { "epoch": 0.9834070087769876, "grad_norm": 0.7704394459724426, "learning_rate": 3.835934929078119e-05, "loss": 0.9206, "num_input_tokens_seen": 251494400, "step": 30700 }, { "epoch": 0.9866102889358703, "grad_norm": 0.612688422203064, "learning_rate": 3.828762366544888e-05, "loss": 0.9686, "num_input_tokens_seen": 252313600, "step": 30800 }, { "epoch": 0.989813569094753, "grad_norm": 0.5262284278869629, "learning_rate": 3.8215745277073e-05, "loss": 0.9694, "num_input_tokens_seen": 253132800, "step": 30900 }, { "epoch": 0.9930168492536358, "grad_norm": 0.5798372626304626, "learning_rate": 3.8143714952013584e-05, "loss": 0.8879, "num_input_tokens_seen": 253952000, "step": 31000 }, { "epoch": 0.9962201294125185, "grad_norm": 0.5605859756469727, "learning_rate": 3.807153351837746e-05, "loss": 0.9948, "num_input_tokens_seen": 254771200, "step": 31100 }, { "epoch": 0.9994234095714011, "grad_norm": 1.9532912969589233, "learning_rate": 3.799920180600868e-05, "loss": 1.027, "num_input_tokens_seen": 255590400, "step": 31200 }, { "epoch": 1.0026266897302838, "grad_norm": 0.6683017611503601, "learning_rate": 3.792672064647898e-05, "loss": 0.9665, "num_input_tokens_seen": 256409600, "step": 31300 }, { "epoch": 1.0058299698891664, "grad_norm": 0.5574291348457336, "learning_rate": 3.785409087307828e-05, "loss": 0.8671, "num_input_tokens_seen": 257228800, "step": 31400 }, { "epoch": 1.0090332500480492, "grad_norm": 0.6487427949905396, "learning_rate": 3.778131332080503e-05, "loss": 0.9356, "num_input_tokens_seen": 258048000, "step": 31500 }, { "epoch": 1.0122365302069318, "grad_norm": 0.6974719166755676, "learning_rate": 3.7708388826356636e-05, "loss": 0.9751, "num_input_tokens_seen": 258867200, "step": 31600 }, { "epoch": 1.0154398103658147, "grad_norm": 0.6754201054573059, "learning_rate": 3.763531822811986e-05, "loss": 0.8963, "num_input_tokens_seen": 259686400, "step": 31700 }, { "epoch": 1.0186430905246973, "grad_norm": 0.5839199423789978, "learning_rate": 3.756210236616117e-05, "loss": 0.9021, "num_input_tokens_seen": 260505600, "step": 31800 }, { "epoch": 1.02184637068358, "grad_norm": 0.5535345673561096, "learning_rate": 3.7488742082217064e-05, "loss": 0.947, "num_input_tokens_seen": 261324800, "step": 31900 }, { "epoch": 1.0250496508424627, "grad_norm": 1.948480248451233, "learning_rate": 3.741523821968441e-05, "loss": 0.9314, "num_input_tokens_seen": 262144000, "step": 32000 }, { "epoch": 1.0282529310013453, "grad_norm": 0.8400202393531799, "learning_rate": 3.734159162361077e-05, "loss": 0.9523, "num_input_tokens_seen": 262963200, "step": 32100 }, { "epoch": 1.0314562111602281, "grad_norm": 0.7016623020172119, "learning_rate": 3.7267803140684635e-05, "loss": 0.9119, "num_input_tokens_seen": 263782400, "step": 32200 }, { "epoch": 1.0346594913191107, "grad_norm": 0.6084064841270447, "learning_rate": 3.719387361922573e-05, "loss": 0.9027, "num_input_tokens_seen": 264601600, "step": 32300 }, { "epoch": 1.0378627714779936, "grad_norm": 1.551859736442566, "learning_rate": 3.711980390917523e-05, "loss": 0.9126, "num_input_tokens_seen": 265420800, "step": 32400 }, { "epoch": 1.0410660516368762, "grad_norm": 0.6663823127746582, "learning_rate": 3.7045594862086065e-05, "loss": 0.909, "num_input_tokens_seen": 266240000, "step": 32500 }, { "epoch": 1.0442693317957588, "grad_norm": 0.6280916333198547, "learning_rate": 3.697124733111299e-05, "loss": 0.8809, "num_input_tokens_seen": 267059200, "step": 32600 }, { "epoch": 1.0474726119546416, "grad_norm": 0.7370727062225342, "learning_rate": 3.689676217100293e-05, "loss": 0.9155, "num_input_tokens_seen": 267878400, "step": 32700 }, { "epoch": 1.0506758921135242, "grad_norm": 0.5798324942588806, "learning_rate": 3.682214023808506e-05, "loss": 0.9514, "num_input_tokens_seen": 268697600, "step": 32800 }, { "epoch": 1.053879172272407, "grad_norm": 0.6621294021606445, "learning_rate": 3.674738239026097e-05, "loss": 0.9057, "num_input_tokens_seen": 269516800, "step": 32900 }, { "epoch": 1.0570824524312896, "grad_norm": 0.9696263074874878, "learning_rate": 3.667248948699482e-05, "loss": 0.9083, "num_input_tokens_seen": 270336000, "step": 33000 }, { "epoch": 1.0602857325901724, "grad_norm": 1.3327863216400146, "learning_rate": 3.659746238930345e-05, "loss": 0.9211, "num_input_tokens_seen": 271155200, "step": 33100 }, { "epoch": 1.063489012749055, "grad_norm": 0.7066917419433594, "learning_rate": 3.6522301959746514e-05, "loss": 0.9384, "num_input_tokens_seen": 271974400, "step": 33200 }, { "epoch": 1.0666922929079377, "grad_norm": 0.6944926977157593, "learning_rate": 3.6447009062416506e-05, "loss": 0.9296, "num_input_tokens_seen": 272793600, "step": 33300 }, { "epoch": 1.0698955730668205, "grad_norm": 2.94767165184021, "learning_rate": 3.637158456292885e-05, "loss": 0.8913, "num_input_tokens_seen": 273612800, "step": 33400 }, { "epoch": 1.073098853225703, "grad_norm": 0.671801745891571, "learning_rate": 3.629602932841199e-05, "loss": 0.9251, "num_input_tokens_seen": 274432000, "step": 33500 }, { "epoch": 1.076302133384586, "grad_norm": 0.6639389991760254, "learning_rate": 3.622034422749734e-05, "loss": 0.9024, "num_input_tokens_seen": 275251200, "step": 33600 }, { "epoch": 1.0795054135434685, "grad_norm": 0.6131206154823303, "learning_rate": 3.614453013030936e-05, "loss": 0.8965, "num_input_tokens_seen": 276070400, "step": 33700 }, { "epoch": 1.0827086937023511, "grad_norm": 2.824341058731079, "learning_rate": 3.606858790845555e-05, "loss": 0.9058, "num_input_tokens_seen": 276889600, "step": 33800 }, { "epoch": 1.085911973861234, "grad_norm": 0.4830228388309479, "learning_rate": 3.5992518435016376e-05, "loss": 0.9052, "num_input_tokens_seen": 277708800, "step": 33900 }, { "epoch": 1.0891152540201166, "grad_norm": 0.49670127034187317, "learning_rate": 3.59163225845353e-05, "loss": 0.9027, "num_input_tokens_seen": 278528000, "step": 34000 }, { "epoch": 1.0923185341789994, "grad_norm": 0.7440226674079895, "learning_rate": 3.584000123300869e-05, "loss": 0.8947, "num_input_tokens_seen": 279347200, "step": 34100 }, { "epoch": 1.095521814337882, "grad_norm": 0.515023410320282, "learning_rate": 3.576355525787576e-05, "loss": 0.8998, "num_input_tokens_seen": 280166400, "step": 34200 }, { "epoch": 1.0987250944967646, "grad_norm": 0.8011521100997925, "learning_rate": 3.5686985538008445e-05, "loss": 0.8951, "num_input_tokens_seen": 280985600, "step": 34300 }, { "epoch": 1.1019283746556474, "grad_norm": 0.5452113151550293, "learning_rate": 3.561029295370138e-05, "loss": 0.9009, "num_input_tokens_seen": 281804800, "step": 34400 }, { "epoch": 1.10513165481453, "grad_norm": 0.8674356937408447, "learning_rate": 3.5533478386661665e-05, "loss": 0.9592, "num_input_tokens_seen": 282624000, "step": 34500 }, { "epoch": 1.1083349349734128, "grad_norm": 0.653605043888092, "learning_rate": 3.545654271999886e-05, "loss": 0.8587, "num_input_tokens_seen": 283443200, "step": 34600 }, { "epoch": 1.1115382151322954, "grad_norm": 0.5951905846595764, "learning_rate": 3.5379486838214715e-05, "loss": 0.906, "num_input_tokens_seen": 284262400, "step": 34700 }, { "epoch": 1.1147414952911783, "grad_norm": 0.6143243908882141, "learning_rate": 3.530231162719307e-05, "loss": 0.8925, "num_input_tokens_seen": 285081600, "step": 34800 }, { "epoch": 1.1179447754500609, "grad_norm": 0.569734513759613, "learning_rate": 3.5225017974189644e-05, "loss": 0.8922, "num_input_tokens_seen": 285900800, "step": 34900 }, { "epoch": 1.1211480556089435, "grad_norm": 1.6546896696090698, "learning_rate": 3.5147606767821846e-05, "loss": 0.884, "num_input_tokens_seen": 286720000, "step": 35000 }, { "epoch": 1.1243513357678263, "grad_norm": 0.7131773829460144, "learning_rate": 3.507007889805856e-05, "loss": 0.8941, "num_input_tokens_seen": 287539200, "step": 35100 }, { "epoch": 1.127554615926709, "grad_norm": 1.8620835542678833, "learning_rate": 3.499243525620988e-05, "loss": 0.9209, "num_input_tokens_seen": 288358400, "step": 35200 }, { "epoch": 1.1307578960855917, "grad_norm": 1.936231017112732, "learning_rate": 3.491467673491692e-05, "loss": 0.9284, "num_input_tokens_seen": 289177600, "step": 35300 }, { "epoch": 1.1339611762444743, "grad_norm": 0.5847631096839905, "learning_rate": 3.483680422814152e-05, "loss": 0.9036, "num_input_tokens_seen": 289996800, "step": 35400 }, { "epoch": 1.137164456403357, "grad_norm": 0.6272117495536804, "learning_rate": 3.4758818631155934e-05, "loss": 0.8766, "num_input_tokens_seen": 290816000, "step": 35500 }, { "epoch": 1.1403677365622398, "grad_norm": 0.50895756483078, "learning_rate": 3.4680720840532636e-05, "loss": 0.8996, "num_input_tokens_seen": 291635200, "step": 35600 }, { "epoch": 1.1435710167211224, "grad_norm": 0.8421196341514587, "learning_rate": 3.460251175413388e-05, "loss": 0.932, "num_input_tokens_seen": 292454400, "step": 35700 }, { "epoch": 1.1467742968800052, "grad_norm": 1.1610244512557983, "learning_rate": 3.452419227110151e-05, "loss": 0.9095, "num_input_tokens_seen": 293273600, "step": 35800 }, { "epoch": 1.1499775770388878, "grad_norm": 0.5575504302978516, "learning_rate": 3.444576329184651e-05, "loss": 0.9166, "num_input_tokens_seen": 294092800, "step": 35900 }, { "epoch": 1.1531808571977704, "grad_norm": 0.5330684781074524, "learning_rate": 3.436722571803874e-05, "loss": 0.9445, "num_input_tokens_seen": 294912000, "step": 36000 }, { "epoch": 1.1563841373566532, "grad_norm": 0.7490949630737305, "learning_rate": 3.428858045259652e-05, "loss": 0.8947, "num_input_tokens_seen": 295731200, "step": 36100 }, { "epoch": 1.1595874175155358, "grad_norm": 1.870923399925232, "learning_rate": 3.420982839967624e-05, "loss": 0.9532, "num_input_tokens_seen": 296550400, "step": 36200 }, { "epoch": 1.1627906976744187, "grad_norm": 3.164524555206299, "learning_rate": 3.413097046466203e-05, "loss": 0.9716, "num_input_tokens_seen": 297369600, "step": 36300 }, { "epoch": 1.1659939778333013, "grad_norm": 1.375303864479065, "learning_rate": 3.405200755415527e-05, "loss": 0.9364, "num_input_tokens_seen": 298188800, "step": 36400 }, { "epoch": 1.169197257992184, "grad_norm": 2.2876625061035156, "learning_rate": 3.397294057596424e-05, "loss": 0.8933, "num_input_tokens_seen": 299008000, "step": 36500 }, { "epoch": 1.1724005381510667, "grad_norm": 0.5776546597480774, "learning_rate": 3.389377043909361e-05, "loss": 0.8916, "num_input_tokens_seen": 299827200, "step": 36600 }, { "epoch": 1.1756038183099493, "grad_norm": 0.7254892587661743, "learning_rate": 3.381449805373406e-05, "loss": 0.922, "num_input_tokens_seen": 300646400, "step": 36700 }, { "epoch": 1.1788070984688321, "grad_norm": 0.7244319319725037, "learning_rate": 3.3735124331251764e-05, "loss": 0.9093, "num_input_tokens_seen": 301465600, "step": 36800 }, { "epoch": 1.1820103786277147, "grad_norm": 0.5166808366775513, "learning_rate": 3.3655650184177957e-05, "loss": 0.9553, "num_input_tokens_seen": 302284800, "step": 36900 }, { "epoch": 1.1852136587865976, "grad_norm": 1.6987115144729614, "learning_rate": 3.357607652619839e-05, "loss": 0.8768, "num_input_tokens_seen": 303104000, "step": 37000 }, { "epoch": 1.1884169389454802, "grad_norm": 0.8271929621696472, "learning_rate": 3.349640427214287e-05, "loss": 0.9632, "num_input_tokens_seen": 303923200, "step": 37100 }, { "epoch": 1.1916202191043628, "grad_norm": 0.7163927555084229, "learning_rate": 3.341663433797474e-05, "loss": 0.8682, "num_input_tokens_seen": 304742400, "step": 37200 }, { "epoch": 1.1948234992632456, "grad_norm": 0.6233458518981934, "learning_rate": 3.33367676407803e-05, "loss": 0.9334, "num_input_tokens_seen": 305561600, "step": 37300 }, { "epoch": 1.1980267794221282, "grad_norm": 1.0882517099380493, "learning_rate": 3.3256805098758346e-05, "loss": 0.9073, "num_input_tokens_seen": 306380800, "step": 37400 }, { "epoch": 1.201230059581011, "grad_norm": 0.8322218656539917, "learning_rate": 3.3176747631209534e-05, "loss": 0.9343, "num_input_tokens_seen": 307200000, "step": 37500 }, { "epoch": 1.2044333397398936, "grad_norm": 1.4540088176727295, "learning_rate": 3.309659615852586e-05, "loss": 0.8541, "num_input_tokens_seen": 308019200, "step": 37600 }, { "epoch": 1.2076366198987762, "grad_norm": 0.6830178499221802, "learning_rate": 3.301635160218005e-05, "loss": 0.8889, "num_input_tokens_seen": 308838400, "step": 37700 }, { "epoch": 1.210839900057659, "grad_norm": 1.9847421646118164, "learning_rate": 3.293601488471499e-05, "loss": 0.883, "num_input_tokens_seen": 309657600, "step": 37800 }, { "epoch": 1.2140431802165417, "grad_norm": 0.8129870891571045, "learning_rate": 3.285558692973312e-05, "loss": 0.9474, "num_input_tokens_seen": 310476800, "step": 37900 }, { "epoch": 1.2172464603754245, "grad_norm": 0.6733205914497375, "learning_rate": 3.277506866188577e-05, "loss": 0.904, "num_input_tokens_seen": 311296000, "step": 38000 }, { "epoch": 1.220449740534307, "grad_norm": 1.2211860418319702, "learning_rate": 3.269446100686261e-05, "loss": 0.8879, "num_input_tokens_seen": 312115200, "step": 38100 }, { "epoch": 1.22365302069319, "grad_norm": 0.7225973010063171, "learning_rate": 3.261376489138092e-05, "loss": 0.9139, "num_input_tokens_seen": 312934400, "step": 38200 }, { "epoch": 1.2268563008520725, "grad_norm": 0.7631468772888184, "learning_rate": 3.253298124317502e-05, "loss": 0.959, "num_input_tokens_seen": 313753600, "step": 38300 }, { "epoch": 1.2300595810109551, "grad_norm": 0.6244317889213562, "learning_rate": 3.245211099098551e-05, "loss": 0.9155, "num_input_tokens_seen": 314572800, "step": 38400 }, { "epoch": 1.233262861169838, "grad_norm": 0.5164452791213989, "learning_rate": 3.237115506454869e-05, "loss": 0.8758, "num_input_tokens_seen": 315392000, "step": 38500 }, { "epoch": 1.2364661413287206, "grad_norm": 0.7463127970695496, "learning_rate": 3.2290114394585815e-05, "loss": 0.9116, "num_input_tokens_seen": 316211200, "step": 38600 }, { "epoch": 1.2396694214876034, "grad_norm": 0.697425901889801, "learning_rate": 3.22089899127924e-05, "loss": 0.8743, "num_input_tokens_seen": 317030400, "step": 38700 }, { "epoch": 1.242872701646486, "grad_norm": 0.6725397706031799, "learning_rate": 3.212778255182752e-05, "loss": 0.9507, "num_input_tokens_seen": 317849600, "step": 38800 }, { "epoch": 1.2460759818053686, "grad_norm": 0.5633911490440369, "learning_rate": 3.2046493245303066e-05, "loss": 0.9114, "num_input_tokens_seen": 318668800, "step": 38900 }, { "epoch": 1.2492792619642514, "grad_norm": 0.4953620135784149, "learning_rate": 3.196512292777305e-05, "loss": 0.9392, "num_input_tokens_seen": 319488000, "step": 39000 }, { "epoch": 1.252482542123134, "grad_norm": 0.5511077642440796, "learning_rate": 3.1883672534722824e-05, "loss": 0.9277, "num_input_tokens_seen": 320307200, "step": 39100 }, { "epoch": 1.2556858222820169, "grad_norm": 1.671002745628357, "learning_rate": 3.180214300255834e-05, "loss": 0.8868, "num_input_tokens_seen": 321126400, "step": 39200 }, { "epoch": 1.2588891024408995, "grad_norm": 0.47333982586860657, "learning_rate": 3.1720535268595406e-05, "loss": 0.9129, "num_input_tokens_seen": 321945600, "step": 39300 }, { "epoch": 1.262092382599782, "grad_norm": 0.6256750226020813, "learning_rate": 3.1638850271048845e-05, "loss": 0.9237, "num_input_tokens_seen": 322764800, "step": 39400 }, { "epoch": 1.265295662758665, "grad_norm": 1.6359134912490845, "learning_rate": 3.15570889490218e-05, "loss": 0.8913, "num_input_tokens_seen": 323584000, "step": 39500 }, { "epoch": 1.2684989429175475, "grad_norm": 0.7079516649246216, "learning_rate": 3.1475252242494855e-05, "loss": 0.9312, "num_input_tokens_seen": 324403200, "step": 39600 }, { "epoch": 1.2717022230764303, "grad_norm": 0.5469818711280823, "learning_rate": 3.139334109231527e-05, "loss": 0.8776, "num_input_tokens_seen": 325222400, "step": 39700 }, { "epoch": 1.274905503235313, "grad_norm": 0.6753129959106445, "learning_rate": 3.131135644018617e-05, "loss": 0.9715, "num_input_tokens_seen": 326041600, "step": 39800 }, { "epoch": 1.2781087833941958, "grad_norm": 1.3139586448669434, "learning_rate": 3.1229299228655683e-05, "loss": 0.9268, "num_input_tokens_seen": 326860800, "step": 39900 }, { "epoch": 1.2813120635530784, "grad_norm": 0.6371886730194092, "learning_rate": 3.1147170401106154e-05, "loss": 0.9286, "num_input_tokens_seen": 327680000, "step": 40000 }, { "epoch": 1.284515343711961, "grad_norm": 0.9212737083435059, "learning_rate": 3.106497090174325e-05, "loss": 0.9317, "num_input_tokens_seen": 328499200, "step": 40100 }, { "epoch": 1.2877186238708438, "grad_norm": 0.6135571002960205, "learning_rate": 3.098270167558514e-05, "loss": 0.9152, "num_input_tokens_seen": 329318400, "step": 40200 }, { "epoch": 1.2909219040297264, "grad_norm": 0.6993789076805115, "learning_rate": 3.09003636684516e-05, "loss": 0.9283, "num_input_tokens_seen": 330137600, "step": 40300 }, { "epoch": 1.294125184188609, "grad_norm": 0.7431827783584595, "learning_rate": 3.081795782695317e-05, "loss": 0.9307, "num_input_tokens_seen": 330956800, "step": 40400 }, { "epoch": 1.2973284643474918, "grad_norm": 0.9774760603904724, "learning_rate": 3.0735485098480255e-05, "loss": 0.8917, "num_input_tokens_seen": 331776000, "step": 40500 }, { "epoch": 1.3005317445063747, "grad_norm": 0.5644115209579468, "learning_rate": 3.0652946431192244e-05, "loss": 0.9321, "num_input_tokens_seen": 332595200, "step": 40600 }, { "epoch": 1.3037350246652573, "grad_norm": 2.2749266624450684, "learning_rate": 3.057034277400658e-05, "loss": 0.9211, "num_input_tokens_seen": 333414400, "step": 40700 }, { "epoch": 1.3069383048241399, "grad_norm": 0.6312987804412842, "learning_rate": 3.048767507658788e-05, "loss": 0.913, "num_input_tokens_seen": 334233600, "step": 40800 }, { "epoch": 1.3101415849830227, "grad_norm": 0.5494056344032288, "learning_rate": 3.0404944289337034e-05, "loss": 0.9423, "num_input_tokens_seen": 335052800, "step": 40900 }, { "epoch": 1.3133448651419053, "grad_norm": 1.3932960033416748, "learning_rate": 3.0322151363380202e-05, "loss": 0.9409, "num_input_tokens_seen": 335872000, "step": 41000 }, { "epoch": 1.316548145300788, "grad_norm": 0.7711178660392761, "learning_rate": 3.023929725055798e-05, "loss": 0.9187, "num_input_tokens_seen": 336691200, "step": 41100 }, { "epoch": 1.3197514254596707, "grad_norm": 0.9086521863937378, "learning_rate": 3.0156382903414383e-05, "loss": 1.0063, "num_input_tokens_seen": 337510400, "step": 41200 }, { "epoch": 1.3229547056185533, "grad_norm": 0.6938414573669434, "learning_rate": 3.007340927518591e-05, "loss": 0.8821, "num_input_tokens_seen": 338329600, "step": 41300 }, { "epoch": 1.3261579857774362, "grad_norm": 0.5269713401794434, "learning_rate": 2.999037731979063e-05, "loss": 0.8968, "num_input_tokens_seen": 339148800, "step": 41400 }, { "epoch": 1.3293612659363188, "grad_norm": 0.69822096824646, "learning_rate": 2.9907287991817128e-05, "loss": 0.955, "num_input_tokens_seen": 339968000, "step": 41500 }, { "epoch": 1.3325645460952016, "grad_norm": 1.9268356561660767, "learning_rate": 2.9824142246513624e-05, "loss": 0.9096, "num_input_tokens_seen": 340787200, "step": 41600 }, { "epoch": 1.3357678262540842, "grad_norm": 0.5475559234619141, "learning_rate": 2.9740941039776925e-05, "loss": 0.8828, "num_input_tokens_seen": 341606400, "step": 41700 }, { "epoch": 1.3389711064129668, "grad_norm": 1.9515366554260254, "learning_rate": 2.9657685328141466e-05, "loss": 0.9614, "num_input_tokens_seen": 342425600, "step": 41800 }, { "epoch": 1.3421743865718496, "grad_norm": 0.6959076523780823, "learning_rate": 2.95743760687683e-05, "loss": 0.8739, "num_input_tokens_seen": 343244800, "step": 41900 }, { "epoch": 1.3453776667307322, "grad_norm": 0.761962890625, "learning_rate": 2.9491014219434105e-05, "loss": 0.9595, "num_input_tokens_seen": 344064000, "step": 42000 }, { "epoch": 1.3485809468896148, "grad_norm": 0.6127232909202576, "learning_rate": 2.9407600738520162e-05, "loss": 0.9026, "num_input_tokens_seen": 344883200, "step": 42100 }, { "epoch": 1.3517842270484977, "grad_norm": 0.6869720220565796, "learning_rate": 2.9324136585001348e-05, "loss": 0.9488, "num_input_tokens_seen": 345702400, "step": 42200 }, { "epoch": 1.3549875072073805, "grad_norm": 0.7109299898147583, "learning_rate": 2.9240622718435107e-05, "loss": 0.9433, "num_input_tokens_seen": 346521600, "step": 42300 }, { "epoch": 1.358190787366263, "grad_norm": 0.6879071593284607, "learning_rate": 2.9157060098950395e-05, "loss": 0.8783, "num_input_tokens_seen": 347340800, "step": 42400 }, { "epoch": 1.3613940675251457, "grad_norm": 0.5623328685760498, "learning_rate": 2.9073449687236688e-05, "loss": 0.8925, "num_input_tokens_seen": 348160000, "step": 42500 }, { "epoch": 1.3645973476840285, "grad_norm": 0.9881012439727783, "learning_rate": 2.8989792444532892e-05, "loss": 0.9417, "num_input_tokens_seen": 348979200, "step": 42600 }, { "epoch": 1.3678006278429111, "grad_norm": 0.6569281816482544, "learning_rate": 2.890608933261633e-05, "loss": 0.9262, "num_input_tokens_seen": 349798400, "step": 42700 }, { "epoch": 1.3710039080017937, "grad_norm": 0.9453611969947815, "learning_rate": 2.882234131379167e-05, "loss": 0.9022, "num_input_tokens_seen": 350617600, "step": 42800 }, { "epoch": 1.3742071881606766, "grad_norm": 0.5668920874595642, "learning_rate": 2.8738549350879824e-05, "loss": 0.9306, "num_input_tokens_seen": 351436800, "step": 42900 }, { "epoch": 1.3774104683195592, "grad_norm": 0.8056479692459106, "learning_rate": 2.8654714407206956e-05, "loss": 0.8878, "num_input_tokens_seen": 352256000, "step": 43000 }, { "epoch": 1.380613748478442, "grad_norm": 0.863929271697998, "learning_rate": 2.8570837446593336e-05, "loss": 0.9391, "num_input_tokens_seen": 353075200, "step": 43100 }, { "epoch": 1.3838170286373246, "grad_norm": 0.5808566808700562, "learning_rate": 2.8486919433342295e-05, "loss": 0.9061, "num_input_tokens_seen": 353894400, "step": 43200 }, { "epoch": 1.3870203087962074, "grad_norm": 0.8920639157295227, "learning_rate": 2.8402961332229143e-05, "loss": 0.8854, "num_input_tokens_seen": 354713600, "step": 43300 }, { "epoch": 1.39022358895509, "grad_norm": 0.6987112760543823, "learning_rate": 2.831896410849005e-05, "loss": 0.893, "num_input_tokens_seen": 355532800, "step": 43400 }, { "epoch": 1.3934268691139726, "grad_norm": 0.6486085653305054, "learning_rate": 2.823492872781098e-05, "loss": 0.9166, "num_input_tokens_seen": 356352000, "step": 43500 }, { "epoch": 1.3966301492728554, "grad_norm": 1.6597498655319214, "learning_rate": 2.815085615631654e-05, "loss": 0.9473, "num_input_tokens_seen": 357171200, "step": 43600 }, { "epoch": 1.399833429431738, "grad_norm": 0.598414957523346, "learning_rate": 2.8066747360558966e-05, "loss": 0.9046, "num_input_tokens_seen": 357990400, "step": 43700 }, { "epoch": 1.4030367095906209, "grad_norm": 2.125504732131958, "learning_rate": 2.798260330750689e-05, "loss": 0.9325, "num_input_tokens_seen": 358809600, "step": 43800 }, { "epoch": 1.4062399897495035, "grad_norm": 0.798989474773407, "learning_rate": 2.789842496453432e-05, "loss": 0.9057, "num_input_tokens_seen": 359628800, "step": 43900 }, { "epoch": 1.4094432699083863, "grad_norm": 0.8189502954483032, "learning_rate": 2.7814213299409475e-05, "loss": 0.923, "num_input_tokens_seen": 360448000, "step": 44000 }, { "epoch": 1.412646550067269, "grad_norm": 0.5460119247436523, "learning_rate": 2.7729969280283662e-05, "loss": 0.8764, "num_input_tokens_seen": 361267200, "step": 44100 }, { "epoch": 1.4158498302261515, "grad_norm": 0.6900705695152283, "learning_rate": 2.7645693875680163e-05, "loss": 0.9295, "num_input_tokens_seen": 362086400, "step": 44200 }, { "epoch": 1.4190531103850343, "grad_norm": 0.7309842705726624, "learning_rate": 2.7561388054483074e-05, "loss": 0.8883, "num_input_tokens_seen": 362905600, "step": 44300 }, { "epoch": 1.422256390543917, "grad_norm": 0.9340581297874451, "learning_rate": 2.7477052785926178e-05, "loss": 0.8784, "num_input_tokens_seen": 363724800, "step": 44400 }, { "epoch": 1.4254596707027996, "grad_norm": 0.6001551151275635, "learning_rate": 2.7392689039581815e-05, "loss": 0.949, "num_input_tokens_seen": 364544000, "step": 44500 }, { "epoch": 1.4286629508616824, "grad_norm": 0.5180249810218811, "learning_rate": 2.7308297785349724e-05, "loss": 0.8738, "num_input_tokens_seen": 365363200, "step": 44600 }, { "epoch": 1.431866231020565, "grad_norm": 0.6243082284927368, "learning_rate": 2.7223879993445873e-05, "loss": 0.9074, "num_input_tokens_seen": 366182400, "step": 44700 }, { "epoch": 1.4350695111794478, "grad_norm": 0.6807756423950195, "learning_rate": 2.713943663439135e-05, "loss": 0.953, "num_input_tokens_seen": 367001600, "step": 44800 }, { "epoch": 1.4382727913383304, "grad_norm": 0.6057282090187073, "learning_rate": 2.7054968679001174e-05, "loss": 0.8736, "num_input_tokens_seen": 367820800, "step": 44900 }, { "epoch": 1.4414760714972132, "grad_norm": 0.593506395816803, "learning_rate": 2.697047709837312e-05, "loss": 0.8405, "num_input_tokens_seen": 368640000, "step": 45000 }, { "epoch": 1.4446793516560958, "grad_norm": 0.7090416550636292, "learning_rate": 2.6885962863876596e-05, "loss": 0.8852, "num_input_tokens_seen": 369459200, "step": 45100 }, { "epoch": 1.4478826318149784, "grad_norm": 0.5391395092010498, "learning_rate": 2.6801426947141435e-05, "loss": 0.9029, "num_input_tokens_seen": 370278400, "step": 45200 }, { "epoch": 1.4510859119738613, "grad_norm": 0.5424131751060486, "learning_rate": 2.671687032004676e-05, "loss": 0.8751, "num_input_tokens_seen": 371097600, "step": 45300 }, { "epoch": 1.4542891921327439, "grad_norm": 0.5781705975532532, "learning_rate": 2.6632293954709785e-05, "loss": 0.9417, "num_input_tokens_seen": 371916800, "step": 45400 }, { "epoch": 1.4574924722916267, "grad_norm": 0.5788801312446594, "learning_rate": 2.654769882347464e-05, "loss": 0.9022, "num_input_tokens_seen": 372736000, "step": 45500 }, { "epoch": 1.4606957524505093, "grad_norm": 0.6637430787086487, "learning_rate": 2.646308589890123e-05, "loss": 0.9017, "num_input_tokens_seen": 373555200, "step": 45600 }, { "epoch": 1.4638990326093921, "grad_norm": 0.7034772634506226, "learning_rate": 2.637845615375397e-05, "loss": 0.883, "num_input_tokens_seen": 374374400, "step": 45700 }, { "epoch": 1.4671023127682747, "grad_norm": 0.6476500630378723, "learning_rate": 2.629381056099071e-05, "loss": 0.9469, "num_input_tokens_seen": 375193600, "step": 45800 }, { "epoch": 1.4703055929271573, "grad_norm": 0.560495913028717, "learning_rate": 2.6209150093751473e-05, "loss": 0.885, "num_input_tokens_seen": 376012800, "step": 45900 }, { "epoch": 1.4735088730860402, "grad_norm": 1.9203239679336548, "learning_rate": 2.612447572534727e-05, "loss": 0.9248, "num_input_tokens_seen": 376832000, "step": 46000 }, { "epoch": 1.4767121532449228, "grad_norm": 2.3468987941741943, "learning_rate": 2.6039788429248957e-05, "loss": 0.9041, "num_input_tokens_seen": 377651200, "step": 46100 }, { "epoch": 1.4799154334038054, "grad_norm": 0.6502100825309753, "learning_rate": 2.5955089179075997e-05, "loss": 0.9431, "num_input_tokens_seen": 378470400, "step": 46200 }, { "epoch": 1.4831187135626882, "grad_norm": 3.609816551208496, "learning_rate": 2.5870378948585295e-05, "loss": 0.8893, "num_input_tokens_seen": 379289600, "step": 46300 }, { "epoch": 1.4863219937215708, "grad_norm": 0.58833247423172, "learning_rate": 2.5785658711659987e-05, "loss": 0.9181, "num_input_tokens_seen": 380108800, "step": 46400 }, { "epoch": 1.4895252738804536, "grad_norm": 1.7303794622421265, "learning_rate": 2.570092944229826e-05, "loss": 0.8921, "num_input_tokens_seen": 380928000, "step": 46500 }, { "epoch": 1.4927285540393362, "grad_norm": 0.7278485894203186, "learning_rate": 2.5616192114602127e-05, "loss": 0.8693, "num_input_tokens_seen": 381747200, "step": 46600 }, { "epoch": 1.495931834198219, "grad_norm": 0.7616570591926575, "learning_rate": 2.5531447702766254e-05, "loss": 0.9397, "num_input_tokens_seen": 382566400, "step": 46700 }, { "epoch": 1.4991351143571017, "grad_norm": 0.11684958636760712, "learning_rate": 2.5446697181066747e-05, "loss": 0.8526, "num_input_tokens_seen": 383385600, "step": 46800 }, { "epoch": 1.5023383945159843, "grad_norm": 0.7726488709449768, "learning_rate": 2.536194152384997e-05, "loss": 0.9122, "num_input_tokens_seen": 384204800, "step": 46900 }, { "epoch": 1.505541674674867, "grad_norm": 0.7091355323791504, "learning_rate": 2.527718170552129e-05, "loss": 0.8666, "num_input_tokens_seen": 385024000, "step": 47000 }, { "epoch": 1.5087449548337497, "grad_norm": 2.5142340660095215, "learning_rate": 2.519241870053396e-05, "loss": 0.911, "num_input_tokens_seen": 385843200, "step": 47100 }, { "epoch": 1.5119482349926323, "grad_norm": 0.6862989664077759, "learning_rate": 2.5107653483377852e-05, "loss": 0.974, "num_input_tokens_seen": 386662400, "step": 47200 }, { "epoch": 1.5151515151515151, "grad_norm": 2.351198196411133, "learning_rate": 2.502288702856824e-05, "loss": 0.8986, "num_input_tokens_seen": 387481600, "step": 47300 }, { "epoch": 1.518354795310398, "grad_norm": 0.7517640590667725, "learning_rate": 2.4938120310634682e-05, "loss": 0.8549, "num_input_tokens_seen": 388300800, "step": 47400 }, { "epoch": 1.5215580754692806, "grad_norm": 2.709975004196167, "learning_rate": 2.485335430410972e-05, "loss": 0.899, "num_input_tokens_seen": 389120000, "step": 47500 }, { "epoch": 1.5247613556281632, "grad_norm": 0.7952636480331421, "learning_rate": 2.4768589983517716e-05, "loss": 0.8622, "num_input_tokens_seen": 389939200, "step": 47600 }, { "epoch": 1.527964635787046, "grad_norm": 0.7378533482551575, "learning_rate": 2.4683828323363687e-05, "loss": 0.8334, "num_input_tokens_seen": 390758400, "step": 47700 }, { "epoch": 1.5311679159459286, "grad_norm": 2.5980470180511475, "learning_rate": 2.459907029812203e-05, "loss": 0.9028, "num_input_tokens_seen": 391577600, "step": 47800 }, { "epoch": 1.5343711961048112, "grad_norm": 0.6807860732078552, "learning_rate": 2.4514316882225347e-05, "loss": 0.9259, "num_input_tokens_seen": 392396800, "step": 47900 }, { "epoch": 1.537574476263694, "grad_norm": 2.3691670894622803, "learning_rate": 2.442956905005328e-05, "loss": 0.8639, "num_input_tokens_seen": 393216000, "step": 48000 }, { "epoch": 1.5407777564225769, "grad_norm": 0.7466169595718384, "learning_rate": 2.434482777592125e-05, "loss": 0.8828, "num_input_tokens_seen": 394035200, "step": 48100 }, { "epoch": 1.5439810365814595, "grad_norm": 0.5329868793487549, "learning_rate": 2.426009403406931e-05, "loss": 0.8802, "num_input_tokens_seen": 394854400, "step": 48200 }, { "epoch": 1.547184316740342, "grad_norm": 0.6394245028495789, "learning_rate": 2.4175368798650884e-05, "loss": 0.8811, "num_input_tokens_seen": 395673600, "step": 48300 }, { "epoch": 1.550387596899225, "grad_norm": 0.9404513239860535, "learning_rate": 2.4090653043721612e-05, "loss": 0.8663, "num_input_tokens_seen": 396492800, "step": 48400 }, { "epoch": 1.5535908770581075, "grad_norm": 0.7973567843437195, "learning_rate": 2.4005947743228157e-05, "loss": 0.9452, "num_input_tokens_seen": 397312000, "step": 48500 }, { "epoch": 1.55679415721699, "grad_norm": 1.8970893621444702, "learning_rate": 2.3921253870996972e-05, "loss": 0.8968, "num_input_tokens_seen": 398131200, "step": 48600 }, { "epoch": 1.559997437375873, "grad_norm": 0.7782315015792847, "learning_rate": 2.383657240072314e-05, "loss": 0.9475, "num_input_tokens_seen": 398950400, "step": 48700 }, { "epoch": 1.5632007175347555, "grad_norm": 0.72723788022995, "learning_rate": 2.375190430595914e-05, "loss": 0.9347, "num_input_tokens_seen": 399769600, "step": 48800 }, { "epoch": 1.5664039976936381, "grad_norm": 0.5238316655158997, "learning_rate": 2.366725056010369e-05, "loss": 0.8969, "num_input_tokens_seen": 400588800, "step": 48900 }, { "epoch": 1.569607277852521, "grad_norm": 0.7676683664321899, "learning_rate": 2.3582612136390556e-05, "loss": 0.8926, "num_input_tokens_seen": 401408000, "step": 49000 }, { "epoch": 1.5728105580114038, "grad_norm": 1.64457106590271, "learning_rate": 2.349799000787733e-05, "loss": 0.9027, "num_input_tokens_seen": 402227200, "step": 49100 }, { "epoch": 1.5760138381702864, "grad_norm": 0.5461480617523193, "learning_rate": 2.3413385147434285e-05, "loss": 0.8651, "num_input_tokens_seen": 403046400, "step": 49200 }, { "epoch": 1.579217118329169, "grad_norm": 0.527300238609314, "learning_rate": 2.332879852773314e-05, "loss": 0.8354, "num_input_tokens_seen": 403865600, "step": 49300 }, { "epoch": 1.5824203984880518, "grad_norm": 0.8455817699432373, "learning_rate": 2.3244231121235936e-05, "loss": 0.903, "num_input_tokens_seen": 404684800, "step": 49400 }, { "epoch": 1.5856236786469344, "grad_norm": 0.8457258939743042, "learning_rate": 2.3159683900183812e-05, "loss": 0.9085, "num_input_tokens_seen": 405504000, "step": 49500 }, { "epoch": 1.588826958805817, "grad_norm": 0.7063552141189575, "learning_rate": 2.3075157836585854e-05, "loss": 0.9002, "num_input_tokens_seen": 406323200, "step": 49600 }, { "epoch": 1.5920302389646999, "grad_norm": 0.6034948229789734, "learning_rate": 2.2990653902207875e-05, "loss": 0.8665, "num_input_tokens_seen": 407142400, "step": 49700 }, { "epoch": 1.5952335191235827, "grad_norm": 0.6883265972137451, "learning_rate": 2.2906173068561324e-05, "loss": 0.9031, "num_input_tokens_seen": 407961600, "step": 49800 }, { "epoch": 1.5984367992824653, "grad_norm": 0.6610883474349976, "learning_rate": 2.282171630689203e-05, "loss": 0.9153, "num_input_tokens_seen": 408780800, "step": 49900 }, { "epoch": 1.601640079441348, "grad_norm": 1.8148962259292603, "learning_rate": 2.2737284588169107e-05, "loss": 0.8904, "num_input_tokens_seen": 409600000, "step": 50000 }, { "epoch": 1.6048433596002307, "grad_norm": 0.8317341804504395, "learning_rate": 2.2652878883073736e-05, "loss": 0.8847, "num_input_tokens_seen": 410419200, "step": 50100 }, { "epoch": 1.6080466397591133, "grad_norm": 0.5359209179878235, "learning_rate": 2.2568500161988023e-05, "loss": 0.8983, "num_input_tokens_seen": 411238400, "step": 50200 }, { "epoch": 1.611249919917996, "grad_norm": 0.6819952726364136, "learning_rate": 2.2484149394983882e-05, "loss": 0.9138, "num_input_tokens_seen": 412057600, "step": 50300 }, { "epoch": 1.6144532000768788, "grad_norm": 0.8475795984268188, "learning_rate": 2.239982755181181e-05, "loss": 0.8536, "num_input_tokens_seen": 412876800, "step": 50400 }, { "epoch": 1.6176564802357616, "grad_norm": 1.1045705080032349, "learning_rate": 2.2315535601889814e-05, "loss": 0.9137, "num_input_tokens_seen": 413696000, "step": 50500 }, { "epoch": 1.620859760394644, "grad_norm": 0.6131917834281921, "learning_rate": 2.2231274514292196e-05, "loss": 0.8992, "num_input_tokens_seen": 414515200, "step": 50600 }, { "epoch": 1.6240630405535268, "grad_norm": 0.6096556186676025, "learning_rate": 2.214704525773846e-05, "loss": 0.9211, "num_input_tokens_seen": 415334400, "step": 50700 }, { "epoch": 1.6272663207124096, "grad_norm": 0.5279362797737122, "learning_rate": 2.2062848800582168e-05, "loss": 0.9231, "num_input_tokens_seen": 416153600, "step": 50800 }, { "epoch": 1.6304696008712922, "grad_norm": 0.5645897388458252, "learning_rate": 2.197868611079978e-05, "loss": 0.8579, "num_input_tokens_seen": 416972800, "step": 50900 }, { "epoch": 1.6336728810301748, "grad_norm": 0.5469439029693604, "learning_rate": 2.189455815597957e-05, "loss": 0.8802, "num_input_tokens_seen": 417792000, "step": 51000 }, { "epoch": 1.6368761611890577, "grad_norm": 0.7165865898132324, "learning_rate": 2.1810465903310445e-05, "loss": 0.897, "num_input_tokens_seen": 418611200, "step": 51100 }, { "epoch": 1.6400794413479403, "grad_norm": 0.49263107776641846, "learning_rate": 2.1726410319570874e-05, "loss": 0.9145, "num_input_tokens_seen": 419430400, "step": 51200 }, { "epoch": 1.6432827215068229, "grad_norm": 0.7984305620193481, "learning_rate": 2.164239237111776e-05, "loss": 0.9656, "num_input_tokens_seen": 420249600, "step": 51300 }, { "epoch": 1.6464860016657057, "grad_norm": 0.6783995628356934, "learning_rate": 2.1558413023875334e-05, "loss": 0.8937, "num_input_tokens_seen": 421068800, "step": 51400 }, { "epoch": 1.6496892818245885, "grad_norm": 0.6700116395950317, "learning_rate": 2.147447324332403e-05, "loss": 0.8966, "num_input_tokens_seen": 421888000, "step": 51500 }, { "epoch": 1.6528925619834711, "grad_norm": 2.6840033531188965, "learning_rate": 2.1390573994489377e-05, "loss": 0.9922, "num_input_tokens_seen": 422707200, "step": 51600 }, { "epoch": 1.6560958421423537, "grad_norm": 0.6062913537025452, "learning_rate": 2.1306716241930968e-05, "loss": 0.9201, "num_input_tokens_seen": 423526400, "step": 51700 }, { "epoch": 1.6592991223012366, "grad_norm": 0.7637689113616943, "learning_rate": 2.1222900949731297e-05, "loss": 0.9039, "num_input_tokens_seen": 424345600, "step": 51800 }, { "epoch": 1.6625024024601192, "grad_norm": 3.154482841491699, "learning_rate": 2.1139129081484734e-05, "loss": 0.968, "num_input_tokens_seen": 425164800, "step": 51900 }, { "epoch": 1.6657056826190018, "grad_norm": 1.900366187095642, "learning_rate": 2.1055401600286386e-05, "loss": 0.9064, "num_input_tokens_seen": 425984000, "step": 52000 }, { "epoch": 1.6689089627778846, "grad_norm": 0.6276770830154419, "learning_rate": 2.0971719468721077e-05, "loss": 0.8786, "num_input_tokens_seen": 426803200, "step": 52100 }, { "epoch": 1.6721122429367674, "grad_norm": 0.7337915301322937, "learning_rate": 2.0888083648852267e-05, "loss": 0.9213, "num_input_tokens_seen": 427622400, "step": 52200 }, { "epoch": 1.6753155230956498, "grad_norm": 0.6604040861129761, "learning_rate": 2.0804495102210975e-05, "loss": 0.944, "num_input_tokens_seen": 428441600, "step": 52300 }, { "epoch": 1.6785188032545326, "grad_norm": 0.6165716648101807, "learning_rate": 2.0720954789784753e-05, "loss": 0.8767, "num_input_tokens_seen": 429260800, "step": 52400 }, { "epoch": 1.6817220834134154, "grad_norm": 1.7939884662628174, "learning_rate": 2.0637463672006595e-05, "loss": 0.9095, "num_input_tokens_seen": 430080000, "step": 52500 }, { "epoch": 1.684925363572298, "grad_norm": 0.6687926054000854, "learning_rate": 2.0554022708743943e-05, "loss": 0.8976, "num_input_tokens_seen": 430899200, "step": 52600 }, { "epoch": 1.6881286437311807, "grad_norm": 0.7300702929496765, "learning_rate": 2.0470632859287628e-05, "loss": 0.9377, "num_input_tokens_seen": 431718400, "step": 52700 }, { "epoch": 1.6913319238900635, "grad_norm": 0.590376615524292, "learning_rate": 2.0387295082340835e-05, "loss": 0.8911, "num_input_tokens_seen": 432537600, "step": 52800 }, { "epoch": 1.694535204048946, "grad_norm": 0.556515097618103, "learning_rate": 2.0304010336008112e-05, "loss": 0.8771, "num_input_tokens_seen": 433356800, "step": 52900 }, { "epoch": 1.6977384842078287, "grad_norm": 0.6625654101371765, "learning_rate": 2.0220779577784298e-05, "loss": 0.9529, "num_input_tokens_seen": 434176000, "step": 53000 }, { "epoch": 1.7009417643667115, "grad_norm": 0.5537979602813721, "learning_rate": 2.0137603764543573e-05, "loss": 0.8813, "num_input_tokens_seen": 434995200, "step": 53100 }, { "epoch": 1.7041450445255943, "grad_norm": 0.49151819944381714, "learning_rate": 2.0054483852528435e-05, "loss": 0.8268, "num_input_tokens_seen": 435814400, "step": 53200 }, { "epoch": 1.707348324684477, "grad_norm": 0.6030770540237427, "learning_rate": 1.9971420797338708e-05, "loss": 0.9116, "num_input_tokens_seen": 436633600, "step": 53300 }, { "epoch": 1.7105516048433596, "grad_norm": 0.872156023979187, "learning_rate": 1.9888415553920525e-05, "loss": 0.8564, "num_input_tokens_seen": 437452800, "step": 53400 }, { "epoch": 1.7137548850022424, "grad_norm": 0.608736515045166, "learning_rate": 1.9805469076555418e-05, "loss": 0.8656, "num_input_tokens_seen": 438272000, "step": 53500 }, { "epoch": 1.716958165161125, "grad_norm": 0.6439238786697388, "learning_rate": 1.9722582318849274e-05, "loss": 0.8819, "num_input_tokens_seen": 439091200, "step": 53600 }, { "epoch": 1.7201614453200076, "grad_norm": 0.5254938006401062, "learning_rate": 1.9639756233721433e-05, "loss": 0.9118, "num_input_tokens_seen": 439910400, "step": 53700 }, { "epoch": 1.7233647254788904, "grad_norm": 0.6956652998924255, "learning_rate": 1.9556991773393686e-05, "loss": 0.8578, "num_input_tokens_seen": 440729600, "step": 53800 }, { "epoch": 1.7265680056377732, "grad_norm": 0.5322553515434265, "learning_rate": 1.9474289889379334e-05, "loss": 0.8907, "num_input_tokens_seen": 441548800, "step": 53900 }, { "epoch": 1.7297712857966556, "grad_norm": 0.706683874130249, "learning_rate": 1.9391651532472296e-05, "loss": 0.8853, "num_input_tokens_seen": 442368000, "step": 54000 }, { "epoch": 1.7329745659555384, "grad_norm": 1.7393512725830078, "learning_rate": 1.930907765273611e-05, "loss": 0.8942, "num_input_tokens_seen": 443187200, "step": 54100 }, { "epoch": 1.7361778461144213, "grad_norm": 0.6126461029052734, "learning_rate": 1.922656919949306e-05, "loss": 0.861, "num_input_tokens_seen": 444006400, "step": 54200 }, { "epoch": 1.7393811262733039, "grad_norm": 15.058053016662598, "learning_rate": 1.914412712131325e-05, "loss": 0.8764, "num_input_tokens_seen": 444825600, "step": 54300 }, { "epoch": 1.7425844064321865, "grad_norm": 1.590517520904541, "learning_rate": 1.906175236600366e-05, "loss": 0.9054, "num_input_tokens_seen": 445644800, "step": 54400 }, { "epoch": 1.7457876865910693, "grad_norm": 2.823185920715332, "learning_rate": 1.8979445880597332e-05, "loss": 0.9166, "num_input_tokens_seen": 446464000, "step": 54500 }, { "epoch": 1.748990966749952, "grad_norm": 0.6295785903930664, "learning_rate": 1.8897208611342392e-05, "loss": 0.893, "num_input_tokens_seen": 447283200, "step": 54600 }, { "epoch": 1.7521942469088345, "grad_norm": 2.9604554176330566, "learning_rate": 1.881504150369125e-05, "loss": 0.8883, "num_input_tokens_seen": 448102400, "step": 54700 }, { "epoch": 1.7553975270677173, "grad_norm": 0.12940554320812225, "learning_rate": 1.873294550228965e-05, "loss": 0.9114, "num_input_tokens_seen": 448921600, "step": 54800 }, { "epoch": 1.7586008072266002, "grad_norm": 0.6710172891616821, "learning_rate": 1.8650921550965884e-05, "loss": 0.9675, "num_input_tokens_seen": 449740800, "step": 54900 }, { "epoch": 1.7618040873854828, "grad_norm": 0.5467862486839294, "learning_rate": 1.8568970592719903e-05, "loss": 0.9055, "num_input_tokens_seen": 450560000, "step": 55000 }, { "epoch": 1.7650073675443654, "grad_norm": 1.6943007707595825, "learning_rate": 1.8487093569712482e-05, "loss": 0.8754, "num_input_tokens_seen": 451379200, "step": 55100 }, { "epoch": 1.7682106477032482, "grad_norm": 0.6068347692489624, "learning_rate": 1.84052914232544e-05, "loss": 0.9695, "num_input_tokens_seen": 452198400, "step": 55200 }, { "epoch": 1.7714139278621308, "grad_norm": 2.650592565536499, "learning_rate": 1.8323565093795576e-05, "loss": 0.8756, "num_input_tokens_seen": 453017600, "step": 55300 }, { "epoch": 1.7746172080210134, "grad_norm": 2.3554019927978516, "learning_rate": 1.824191552091431e-05, "loss": 0.8884, "num_input_tokens_seen": 453836800, "step": 55400 }, { "epoch": 1.7778204881798962, "grad_norm": 0.5100352764129639, "learning_rate": 1.8160343643306467e-05, "loss": 0.901, "num_input_tokens_seen": 454656000, "step": 55500 }, { "epoch": 1.781023768338779, "grad_norm": 2.276134490966797, "learning_rate": 1.8078850398774666e-05, "loss": 0.8653, "num_input_tokens_seen": 455475200, "step": 55600 }, { "epoch": 1.7842270484976614, "grad_norm": 0.6568858027458191, "learning_rate": 1.7997436724217517e-05, "loss": 0.9307, "num_input_tokens_seen": 456294400, "step": 55700 }, { "epoch": 1.7874303286565443, "grad_norm": 0.5729939341545105, "learning_rate": 1.7916103555618818e-05, "loss": 0.8938, "num_input_tokens_seen": 457113600, "step": 55800 }, { "epoch": 1.790633608815427, "grad_norm": 0.4960566759109497, "learning_rate": 1.7834851828036855e-05, "loss": 0.8622, "num_input_tokens_seen": 457932800, "step": 55900 }, { "epoch": 1.7938368889743097, "grad_norm": 0.6195512413978577, "learning_rate": 1.7753682475593587e-05, "loss": 0.9165, "num_input_tokens_seen": 458752000, "step": 56000 }, { "epoch": 1.7970401691331923, "grad_norm": 0.7224614024162292, "learning_rate": 1.7672596431463963e-05, "loss": 0.9159, "num_input_tokens_seen": 459571200, "step": 56100 }, { "epoch": 1.8002434492920751, "grad_norm": 0.683172881603241, "learning_rate": 1.7591594627865134e-05, "loss": 0.928, "num_input_tokens_seen": 460390400, "step": 56200 }, { "epoch": 1.8034467294509577, "grad_norm": 0.6346443891525269, "learning_rate": 1.7510677996045787e-05, "loss": 0.8891, "num_input_tokens_seen": 461209600, "step": 56300 }, { "epoch": 1.8066500096098403, "grad_norm": 0.5797076225280762, "learning_rate": 1.7429847466275424e-05, "loss": 0.9163, "num_input_tokens_seen": 462028800, "step": 56400 }, { "epoch": 1.8098532897687232, "grad_norm": 1.201037883758545, "learning_rate": 1.734910396783364e-05, "loss": 0.9401, "num_input_tokens_seen": 462848000, "step": 56500 }, { "epoch": 1.813056569927606, "grad_norm": 0.6015352606773376, "learning_rate": 1.7268448428999508e-05, "loss": 0.9391, "num_input_tokens_seen": 463667200, "step": 56600 }, { "epoch": 1.8162598500864886, "grad_norm": 0.6725329756736755, "learning_rate": 1.71878817770408e-05, "loss": 0.8751, "num_input_tokens_seen": 464486400, "step": 56700 }, { "epoch": 1.8194631302453712, "grad_norm": 0.7582192420959473, "learning_rate": 1.7107404938203422e-05, "loss": 0.9578, "num_input_tokens_seen": 465305600, "step": 56800 }, { "epoch": 1.822666410404254, "grad_norm": 0.5181425213813782, "learning_rate": 1.702701883770074e-05, "loss": 0.9462, "num_input_tokens_seen": 466124800, "step": 56900 }, { "epoch": 1.8258696905631366, "grad_norm": 0.672991931438446, "learning_rate": 1.6946724399702905e-05, "loss": 0.8676, "num_input_tokens_seen": 466944000, "step": 57000 }, { "epoch": 1.8290729707220192, "grad_norm": 2.6324303150177, "learning_rate": 1.6866522547326292e-05, "loss": 0.9282, "num_input_tokens_seen": 467763200, "step": 57100 }, { "epoch": 1.832276250880902, "grad_norm": 0.5964205861091614, "learning_rate": 1.6786414202622818e-05, "loss": 0.8611, "num_input_tokens_seen": 468582400, "step": 57200 }, { "epoch": 1.835479531039785, "grad_norm": 1.6168113946914673, "learning_rate": 1.670640028656939e-05, "loss": 0.8977, "num_input_tokens_seen": 469401600, "step": 57300 }, { "epoch": 1.8386828111986673, "grad_norm": 0.5584040284156799, "learning_rate": 1.662648171905731e-05, "loss": 0.9157, "num_input_tokens_seen": 470220800, "step": 57400 }, { "epoch": 1.84188609135755, "grad_norm": 0.6906948685646057, "learning_rate": 1.654665941888169e-05, "loss": 0.8808, "num_input_tokens_seen": 471040000, "step": 57500 }, { "epoch": 1.845089371516433, "grad_norm": 0.8261626958847046, "learning_rate": 1.6466934303730866e-05, "loss": 0.9322, "num_input_tokens_seen": 471859200, "step": 57600 }, { "epoch": 1.8482926516753155, "grad_norm": 0.5074647068977356, "learning_rate": 1.6387307290175914e-05, "loss": 0.9141, "num_input_tokens_seen": 472678400, "step": 57700 }, { "epoch": 1.8514959318341981, "grad_norm": 1.8539708852767944, "learning_rate": 1.6307779293660034e-05, "loss": 0.8777, "num_input_tokens_seen": 473497600, "step": 57800 }, { "epoch": 1.854699211993081, "grad_norm": 2.2079038619995117, "learning_rate": 1.622835122848809e-05, "loss": 0.8596, "num_input_tokens_seen": 474316800, "step": 57900 }, { "epoch": 1.8579024921519636, "grad_norm": 0.670155942440033, "learning_rate": 1.6149024007816067e-05, "loss": 0.9112, "num_input_tokens_seen": 475136000, "step": 58000 }, { "epoch": 1.8611057723108462, "grad_norm": 0.8173292875289917, "learning_rate": 1.6069798543640543e-05, "loss": 0.9513, "num_input_tokens_seen": 475955200, "step": 58100 }, { "epoch": 1.864309052469729, "grad_norm": 0.5929046273231506, "learning_rate": 1.599067574678829e-05, "loss": 0.8633, "num_input_tokens_seen": 476774400, "step": 58200 }, { "epoch": 1.8675123326286118, "grad_norm": 0.6177115440368652, "learning_rate": 1.591165652690571e-05, "loss": 0.8829, "num_input_tokens_seen": 477593600, "step": 58300 }, { "epoch": 1.8707156127874944, "grad_norm": 5.405032157897949, "learning_rate": 1.5832741792448447e-05, "loss": 0.853, "num_input_tokens_seen": 478412800, "step": 58400 }, { "epoch": 1.873918892946377, "grad_norm": 0.8819538950920105, "learning_rate": 1.5753932450670892e-05, "loss": 0.8632, "num_input_tokens_seen": 479232000, "step": 58500 }, { "epoch": 1.8771221731052599, "grad_norm": 0.7577266693115234, "learning_rate": 1.5675229407615773e-05, "loss": 0.8691, "num_input_tokens_seen": 480051200, "step": 58600 }, { "epoch": 1.8803254532641425, "grad_norm": 0.5581927299499512, "learning_rate": 1.5596633568103764e-05, "loss": 0.8898, "num_input_tokens_seen": 480870400, "step": 58700 }, { "epoch": 1.883528733423025, "grad_norm": 1.5271930694580078, "learning_rate": 1.5518145835723034e-05, "loss": 0.9001, "num_input_tokens_seen": 481689600, "step": 58800 }, { "epoch": 1.886732013581908, "grad_norm": 0.594035804271698, "learning_rate": 1.54397671128189e-05, "loss": 0.8988, "num_input_tokens_seen": 482508800, "step": 58900 }, { "epoch": 1.8899352937407907, "grad_norm": 0.778454601764679, "learning_rate": 1.5361498300483423e-05, "loss": 0.8744, "num_input_tokens_seen": 483328000, "step": 59000 }, { "epoch": 1.893138573899673, "grad_norm": 0.6719622611999512, "learning_rate": 1.5283340298545056e-05, "loss": 0.9189, "num_input_tokens_seen": 484147200, "step": 59100 }, { "epoch": 1.896341854058556, "grad_norm": 0.7632321119308472, "learning_rate": 1.5205294005558335e-05, "loss": 0.9133, "num_input_tokens_seen": 484966400, "step": 59200 }, { "epoch": 1.8995451342174388, "grad_norm": 2.033229112625122, "learning_rate": 1.5127360318793481e-05, "loss": 0.8913, "num_input_tokens_seen": 485785600, "step": 59300 }, { "epoch": 1.9027484143763214, "grad_norm": 0.598871648311615, "learning_rate": 1.5049540134226158e-05, "loss": 0.8857, "num_input_tokens_seen": 486604800, "step": 59400 }, { "epoch": 1.905951694535204, "grad_norm": 1.5140035152435303, "learning_rate": 1.4971834346527102e-05, "loss": 0.9104, "num_input_tokens_seen": 487424000, "step": 59500 }, { "epoch": 1.9091549746940868, "grad_norm": 1.2196921110153198, "learning_rate": 1.4894243849051889e-05, "loss": 0.8936, "num_input_tokens_seen": 488243200, "step": 59600 }, { "epoch": 1.9123582548529694, "grad_norm": 0.6041728854179382, "learning_rate": 1.4816769533830638e-05, "loss": 0.9233, "num_input_tokens_seen": 489062400, "step": 59700 }, { "epoch": 1.915561535011852, "grad_norm": 0.585239589214325, "learning_rate": 1.4739412291557774e-05, "loss": 0.893, "num_input_tokens_seen": 489881600, "step": 59800 }, { "epoch": 1.9187648151707348, "grad_norm": 0.5198357701301575, "learning_rate": 1.4662173011581757e-05, "loss": 0.8643, "num_input_tokens_seen": 490700800, "step": 59900 }, { "epoch": 1.9219680953296177, "grad_norm": 1.5068873167037964, "learning_rate": 1.4585052581894881e-05, "loss": 0.9376, "num_input_tokens_seen": 491520000, "step": 60000 }, { "epoch": 1.9251713754885003, "grad_norm": 1.573378562927246, "learning_rate": 1.4508051889123075e-05, "loss": 0.9354, "num_input_tokens_seen": 492339200, "step": 60100 }, { "epoch": 1.9283746556473829, "grad_norm": 0.7995052933692932, "learning_rate": 1.4431171818515698e-05, "loss": 0.8201, "num_input_tokens_seen": 493158400, "step": 60200 }, { "epoch": 1.9315779358062657, "grad_norm": 0.7116925716400146, "learning_rate": 1.4354413253935336e-05, "loss": 0.8322, "num_input_tokens_seen": 493977600, "step": 60300 }, { "epoch": 1.9347812159651483, "grad_norm": 0.714451253414154, "learning_rate": 1.4277777077847665e-05, "loss": 0.9181, "num_input_tokens_seen": 494796800, "step": 60400 }, { "epoch": 1.937984496124031, "grad_norm": 0.7062659859657288, "learning_rate": 1.420126417131133e-05, "loss": 0.8783, "num_input_tokens_seen": 495616000, "step": 60500 }, { "epoch": 1.9411877762829137, "grad_norm": 0.5767313838005066, "learning_rate": 1.4124875413967767e-05, "loss": 0.9239, "num_input_tokens_seen": 496435200, "step": 60600 }, { "epoch": 1.9443910564417966, "grad_norm": 0.7007090449333191, "learning_rate": 1.4048611684031138e-05, "loss": 0.8908, "num_input_tokens_seen": 497254400, "step": 60700 }, { "epoch": 1.947594336600679, "grad_norm": 0.663779079914093, "learning_rate": 1.3972473858278184e-05, "loss": 0.8845, "num_input_tokens_seen": 498073600, "step": 60800 }, { "epoch": 1.9507976167595618, "grad_norm": 1.9937938451766968, "learning_rate": 1.3896462812038168e-05, "loss": 0.8902, "num_input_tokens_seen": 498892800, "step": 60900 }, { "epoch": 1.9540008969184446, "grad_norm": 0.5911014676094055, "learning_rate": 1.3820579419182838e-05, "loss": 0.9283, "num_input_tokens_seen": 499712000, "step": 61000 }, { "epoch": 1.9572041770773272, "grad_norm": 0.680264949798584, "learning_rate": 1.3744824552116343e-05, "loss": 0.9166, "num_input_tokens_seen": 500531200, "step": 61100 }, { "epoch": 1.9604074572362098, "grad_norm": 0.5298569202423096, "learning_rate": 1.3669199081765232e-05, "loss": 0.9069, "num_input_tokens_seen": 501350400, "step": 61200 }, { "epoch": 1.9636107373950926, "grad_norm": 2.5101547241210938, "learning_rate": 1.3593703877568407e-05, "loss": 0.9138, "num_input_tokens_seen": 502169600, "step": 61300 }, { "epoch": 1.9668140175539752, "grad_norm": 1.6266756057739258, "learning_rate": 1.3518339807467138e-05, "loss": 0.8311, "num_input_tokens_seen": 502988800, "step": 61400 }, { "epoch": 1.9700172977128578, "grad_norm": 0.6949862241744995, "learning_rate": 1.3443107737895121e-05, "loss": 0.9508, "num_input_tokens_seen": 503808000, "step": 61500 }, { "epoch": 1.9732205778717407, "grad_norm": 1.9142687320709229, "learning_rate": 1.3368008533768478e-05, "loss": 0.8986, "num_input_tokens_seen": 504627200, "step": 61600 }, { "epoch": 1.9764238580306235, "grad_norm": 1.5811573266983032, "learning_rate": 1.3293043058475835e-05, "loss": 0.8775, "num_input_tokens_seen": 505446400, "step": 61700 }, { "epoch": 1.979627138189506, "grad_norm": 0.5435724258422852, "learning_rate": 1.321821217386836e-05, "loss": 0.8588, "num_input_tokens_seen": 506265600, "step": 61800 }, { "epoch": 1.9828304183483887, "grad_norm": 0.5689346194267273, "learning_rate": 1.314351674024989e-05, "loss": 0.9, "num_input_tokens_seen": 507084800, "step": 61900 }, { "epoch": 1.9860336985072715, "grad_norm": 0.5658956170082092, "learning_rate": 1.3068957616367045e-05, "loss": 0.8931, "num_input_tokens_seen": 507904000, "step": 62000 }, { "epoch": 1.9892369786661541, "grad_norm": 0.6352538466453552, "learning_rate": 1.2994535659399327e-05, "loss": 0.9254, "num_input_tokens_seen": 508723200, "step": 62100 }, { "epoch": 1.9924402588250367, "grad_norm": 1.6909618377685547, "learning_rate": 1.2920251724949296e-05, "loss": 0.8628, "num_input_tokens_seen": 509542400, "step": 62200 }, { "epoch": 1.9956435389839196, "grad_norm": 0.6590949892997742, "learning_rate": 1.2846106667032693e-05, "loss": 0.8509, "num_input_tokens_seen": 510361600, "step": 62300 }, { "epoch": 1.9988468191428024, "grad_norm": 2.059828042984009, "learning_rate": 1.2772101338068649e-05, "loss": 0.8547, "num_input_tokens_seen": 511180800, "step": 62400 }, { "epoch": 2.0020500993016848, "grad_norm": 0.8146264553070068, "learning_rate": 1.2698236588869894e-05, "loss": 0.8274, "num_input_tokens_seen": 512000000, "step": 62500 }, { "epoch": 2.0052533794605676, "grad_norm": 0.5894434452056885, "learning_rate": 1.2624513268632967e-05, "loss": 0.8213, "num_input_tokens_seen": 512819200, "step": 62600 }, { "epoch": 2.0084566596194504, "grad_norm": 1.9424681663513184, "learning_rate": 1.2550932224928425e-05, "loss": 0.8608, "num_input_tokens_seen": 513638400, "step": 62700 }, { "epoch": 2.011659939778333, "grad_norm": 0.6579126715660095, "learning_rate": 1.2477494303691157e-05, "loss": 0.836, "num_input_tokens_seen": 514457600, "step": 62800 }, { "epoch": 2.0148632199372156, "grad_norm": 0.5051004886627197, "learning_rate": 1.2404200349210577e-05, "loss": 0.8208, "num_input_tokens_seen": 515276800, "step": 62900 }, { "epoch": 2.0180665000960984, "grad_norm": 0.6397780179977417, "learning_rate": 1.2331051204121009e-05, "loss": 0.8293, "num_input_tokens_seen": 516096000, "step": 63000 }, { "epoch": 2.0212697802549813, "grad_norm": 0.7705442309379578, "learning_rate": 1.2258047709391945e-05, "loss": 0.8663, "num_input_tokens_seen": 516915200, "step": 63100 }, { "epoch": 2.0244730604138637, "grad_norm": 0.711100697517395, "learning_rate": 1.218519070431836e-05, "loss": 0.8186, "num_input_tokens_seen": 517734400, "step": 63200 }, { "epoch": 2.0276763405727465, "grad_norm": 0.6769080758094788, "learning_rate": 1.2112481026511138e-05, "loss": 0.8468, "num_input_tokens_seen": 518553600, "step": 63300 }, { "epoch": 2.0308796207316293, "grad_norm": 0.7686530351638794, "learning_rate": 1.2039919511887338e-05, "loss": 0.7955, "num_input_tokens_seen": 519372800, "step": 63400 }, { "epoch": 2.0340829008905117, "grad_norm": 0.826252281665802, "learning_rate": 1.1967506994660685e-05, "loss": 0.8313, "num_input_tokens_seen": 520192000, "step": 63500 }, { "epoch": 2.0372861810493945, "grad_norm": 1.5545631647109985, "learning_rate": 1.1895244307331923e-05, "loss": 0.8387, "num_input_tokens_seen": 521011200, "step": 63600 }, { "epoch": 2.0404894612082773, "grad_norm": 2.142545461654663, "learning_rate": 1.1823132280679235e-05, "loss": 0.8087, "num_input_tokens_seen": 521830400, "step": 63700 }, { "epoch": 2.04369274136716, "grad_norm": 1.7032113075256348, "learning_rate": 1.1751171743748737e-05, "loss": 0.8357, "num_input_tokens_seen": 522649600, "step": 63800 }, { "epoch": 2.0468960215260426, "grad_norm": 0.6579723358154297, "learning_rate": 1.1679363523844918e-05, "loss": 0.8435, "num_input_tokens_seen": 523468800, "step": 63900 }, { "epoch": 2.0500993016849254, "grad_norm": 0.6495528817176819, "learning_rate": 1.1607708446521125e-05, "loss": 0.8702, "num_input_tokens_seen": 524288000, "step": 64000 }, { "epoch": 2.053302581843808, "grad_norm": 0.5699741840362549, "learning_rate": 1.153620733557007e-05, "loss": 0.8436, "num_input_tokens_seen": 525107200, "step": 64100 }, { "epoch": 2.0565058620026906, "grad_norm": 0.5475245118141174, "learning_rate": 1.1464861013014391e-05, "loss": 0.825, "num_input_tokens_seen": 525926400, "step": 64200 }, { "epoch": 2.0597091421615734, "grad_norm": 2.3118770122528076, "learning_rate": 1.139367029909717e-05, "loss": 0.8469, "num_input_tokens_seen": 526745600, "step": 64300 }, { "epoch": 2.0629124223204562, "grad_norm": 0.7807962894439697, "learning_rate": 1.1322636012272517e-05, "loss": 0.8397, "num_input_tokens_seen": 527564800, "step": 64400 }, { "epoch": 2.0661157024793386, "grad_norm": 1.0216293334960938, "learning_rate": 1.1251758969196147e-05, "loss": 0.7898, "num_input_tokens_seen": 528384000, "step": 64500 }, { "epoch": 2.0693189826382214, "grad_norm": 0.7191298604011536, "learning_rate": 1.1181039984715991e-05, "loss": 0.8449, "num_input_tokens_seen": 529203200, "step": 64600 }, { "epoch": 2.0725222627971043, "grad_norm": 0.4787365198135376, "learning_rate": 1.1110479871862862e-05, "loss": 0.7879, "num_input_tokens_seen": 530022400, "step": 64700 }, { "epoch": 2.075725542955987, "grad_norm": 0.7449747323989868, "learning_rate": 1.1040079441841065e-05, "loss": 0.866, "num_input_tokens_seen": 530841600, "step": 64800 }, { "epoch": 2.0789288231148695, "grad_norm": 0.7580021619796753, "learning_rate": 1.0969839504019108e-05, "loss": 0.851, "num_input_tokens_seen": 531660800, "step": 64900 }, { "epoch": 2.0821321032737523, "grad_norm": 0.6036601662635803, "learning_rate": 1.0899760865920355e-05, "loss": 0.814, "num_input_tokens_seen": 532480000, "step": 65000 }, { "epoch": 2.085335383432635, "grad_norm": 0.553875207901001, "learning_rate": 1.0829844333213766e-05, "loss": 0.8307, "num_input_tokens_seen": 533299200, "step": 65100 }, { "epoch": 2.0885386635915175, "grad_norm": 0.6239012479782104, "learning_rate": 1.0760090709704642e-05, "loss": 0.8406, "num_input_tokens_seen": 534118400, "step": 65200 }, { "epoch": 2.0917419437504003, "grad_norm": 0.8101912140846252, "learning_rate": 1.0690500797325387e-05, "loss": 0.8263, "num_input_tokens_seen": 534937600, "step": 65300 }, { "epoch": 2.094945223909283, "grad_norm": 0.827496349811554, "learning_rate": 1.0621075396126265e-05, "loss": 0.7959, "num_input_tokens_seen": 535756800, "step": 65400 }, { "epoch": 2.098148504068166, "grad_norm": 0.7722252607345581, "learning_rate": 1.055181530426621e-05, "loss": 0.8417, "num_input_tokens_seen": 536576000, "step": 65500 }, { "epoch": 2.1013517842270484, "grad_norm": 0.8276936411857605, "learning_rate": 1.0482721318003644e-05, "loss": 0.8267, "num_input_tokens_seen": 537395200, "step": 65600 }, { "epoch": 2.104555064385931, "grad_norm": 0.5818492770195007, "learning_rate": 1.0413794231687357e-05, "loss": 0.811, "num_input_tokens_seen": 538214400, "step": 65700 }, { "epoch": 2.107758344544814, "grad_norm": 1.9946190118789673, "learning_rate": 1.0345034837747342e-05, "loss": 0.8376, "num_input_tokens_seen": 539033600, "step": 65800 }, { "epoch": 2.1109616247036964, "grad_norm": 0.5959033370018005, "learning_rate": 1.0276443926685694e-05, "loss": 0.8641, "num_input_tokens_seen": 539852800, "step": 65900 }, { "epoch": 2.1141649048625792, "grad_norm": 0.9433934092521667, "learning_rate": 1.0208022287067509e-05, "loss": 0.8445, "num_input_tokens_seen": 540672000, "step": 66000 }, { "epoch": 2.117368185021462, "grad_norm": 1.3814393281936646, "learning_rate": 1.0139770705511833e-05, "loss": 0.8783, "num_input_tokens_seen": 541491200, "step": 66100 }, { "epoch": 2.120571465180345, "grad_norm": 0.5552910566329956, "learning_rate": 1.0071689966682623e-05, "loss": 0.7836, "num_input_tokens_seen": 542310400, "step": 66200 }, { "epoch": 2.1237747453392273, "grad_norm": 0.6831013560295105, "learning_rate": 1.0003780853279732e-05, "loss": 0.8143, "num_input_tokens_seen": 543129600, "step": 66300 }, { "epoch": 2.12697802549811, "grad_norm": 1.8912497758865356, "learning_rate": 9.936044146029855e-06, "loss": 0.8582, "num_input_tokens_seen": 543948800, "step": 66400 }, { "epoch": 2.130181305656993, "grad_norm": 0.6759600639343262, "learning_rate": 9.868480623677643e-06, "loss": 0.8295, "num_input_tokens_seen": 544768000, "step": 66500 }, { "epoch": 2.1333845858158753, "grad_norm": 0.6555814146995544, "learning_rate": 9.801091062976665e-06, "loss": 0.7856, "num_input_tokens_seen": 545587200, "step": 66600 }, { "epoch": 2.136587865974758, "grad_norm": 0.7342298626899719, "learning_rate": 9.733876238680531e-06, "loss": 0.8144, "num_input_tokens_seen": 546406400, "step": 66700 }, { "epoch": 2.139791146133641, "grad_norm": 1.6135506629943848, "learning_rate": 9.666836923533987e-06, "loss": 0.7658, "num_input_tokens_seen": 547225600, "step": 66800 }, { "epoch": 2.1429944262925233, "grad_norm": 0.6479013562202454, "learning_rate": 9.599973888263972e-06, "loss": 0.7818, "num_input_tokens_seen": 548044800, "step": 66900 }, { "epoch": 2.146197706451406, "grad_norm": 0.8639338612556458, "learning_rate": 9.533287901570843e-06, "loss": 0.8259, "num_input_tokens_seen": 548864000, "step": 67000 }, { "epoch": 2.149400986610289, "grad_norm": 0.852070152759552, "learning_rate": 9.466779730119449e-06, "loss": 0.84, "num_input_tokens_seen": 549683200, "step": 67100 }, { "epoch": 2.152604266769172, "grad_norm": 0.8585788607597351, "learning_rate": 9.400450138530394e-06, "loss": 0.8595, "num_input_tokens_seen": 550502400, "step": 67200 }, { "epoch": 2.155807546928054, "grad_norm": 2.652194023132324, "learning_rate": 9.334299889371217e-06, "loss": 0.8404, "num_input_tokens_seen": 551321600, "step": 67300 }, { "epoch": 2.159010827086937, "grad_norm": 0.6588045954704285, "learning_rate": 9.268329743147583e-06, "loss": 0.7933, "num_input_tokens_seen": 552140800, "step": 67400 }, { "epoch": 2.16221410724582, "grad_norm": 2.807159423828125, "learning_rate": 9.202540458294623e-06, "loss": 0.8066, "num_input_tokens_seen": 552960000, "step": 67500 }, { "epoch": 2.1654173874047022, "grad_norm": 0.7351047396659851, "learning_rate": 9.136932791168132e-06, "loss": 0.8831, "num_input_tokens_seen": 553779200, "step": 67600 }, { "epoch": 2.168620667563585, "grad_norm": 0.6064037084579468, "learning_rate": 9.071507496035943e-06, "loss": 0.7602, "num_input_tokens_seen": 554598400, "step": 67700 }, { "epoch": 2.171823947722468, "grad_norm": 0.6641263365745544, "learning_rate": 9.006265325069197e-06, "loss": 0.7984, "num_input_tokens_seen": 555417600, "step": 67800 }, { "epoch": 2.1750272278813503, "grad_norm": 0.6006192564964294, "learning_rate": 8.941207028333737e-06, "loss": 0.7831, "num_input_tokens_seen": 556236800, "step": 67900 }, { "epoch": 2.178230508040233, "grad_norm": 0.6849149465560913, "learning_rate": 8.876333353781468e-06, "loss": 0.829, "num_input_tokens_seen": 557056000, "step": 68000 }, { "epoch": 2.181433788199116, "grad_norm": 0.7569016218185425, "learning_rate": 8.811645047241767e-06, "loss": 0.8623, "num_input_tokens_seen": 557875200, "step": 68100 }, { "epoch": 2.1846370683579988, "grad_norm": 0.7035521268844604, "learning_rate": 8.74714285241289e-06, "loss": 0.8444, "num_input_tokens_seen": 558694400, "step": 68200 }, { "epoch": 2.187840348516881, "grad_norm": 0.7252819538116455, "learning_rate": 8.682827510853426e-06, "loss": 0.8287, "num_input_tokens_seen": 559513600, "step": 68300 }, { "epoch": 2.191043628675764, "grad_norm": 0.5455666780471802, "learning_rate": 8.618699761973792e-06, "loss": 0.7785, "num_input_tokens_seen": 560332800, "step": 68400 }, { "epoch": 2.194246908834647, "grad_norm": 0.8008429408073425, "learning_rate": 8.554760343027724e-06, "loss": 0.8595, "num_input_tokens_seen": 561152000, "step": 68500 }, { "epoch": 2.197450188993529, "grad_norm": 0.755208432674408, "learning_rate": 8.491009989103796e-06, "loss": 0.8538, "num_input_tokens_seen": 561971200, "step": 68600 }, { "epoch": 2.200653469152412, "grad_norm": 0.5776748657226562, "learning_rate": 8.427449433116952e-06, "loss": 0.8333, "num_input_tokens_seen": 562790400, "step": 68700 }, { "epoch": 2.203856749311295, "grad_norm": 0.6535948514938354, "learning_rate": 8.364079405800105e-06, "loss": 0.8281, "num_input_tokens_seen": 563609600, "step": 68800 }, { "epoch": 2.2070600294701777, "grad_norm": 0.5949485898017883, "learning_rate": 8.30090063569573e-06, "loss": 0.7887, "num_input_tokens_seen": 564428800, "step": 68900 }, { "epoch": 2.21026330962906, "grad_norm": 3.0284650325775146, "learning_rate": 8.237913849147497e-06, "loss": 0.8451, "num_input_tokens_seen": 565248000, "step": 69000 }, { "epoch": 2.213466589787943, "grad_norm": 0.5593298673629761, "learning_rate": 8.1751197702919e-06, "loss": 0.8596, "num_input_tokens_seen": 566067200, "step": 69100 }, { "epoch": 2.2166698699468257, "grad_norm": 0.670230507850647, "learning_rate": 8.112519121049942e-06, "loss": 0.8584, "num_input_tokens_seen": 566886400, "step": 69200 }, { "epoch": 2.219873150105708, "grad_norm": 1.34910249710083, "learning_rate": 8.050112621118822e-06, "loss": 0.8518, "num_input_tokens_seen": 567705600, "step": 69300 }, { "epoch": 2.223076430264591, "grad_norm": 0.6535902619361877, "learning_rate": 7.987900987963695e-06, "loss": 0.8544, "num_input_tokens_seen": 568524800, "step": 69400 }, { "epoch": 2.2262797104234737, "grad_norm": 0.594032883644104, "learning_rate": 7.925884936809396e-06, "loss": 0.8395, "num_input_tokens_seen": 569344000, "step": 69500 }, { "epoch": 2.2294829905823565, "grad_norm": 0.6679059863090515, "learning_rate": 7.864065180632233e-06, "loss": 0.8681, "num_input_tokens_seen": 570163200, "step": 69600 }, { "epoch": 2.232686270741239, "grad_norm": 0.5853981375694275, "learning_rate": 7.802442430151757e-06, "loss": 0.7735, "num_input_tokens_seen": 570982400, "step": 69700 }, { "epoch": 2.2358895509001218, "grad_norm": 1.4077626466751099, "learning_rate": 7.741017393822628e-06, "loss": 0.7853, "num_input_tokens_seen": 571801600, "step": 69800 }, { "epoch": 2.2390928310590046, "grad_norm": 0.6583539247512817, "learning_rate": 7.679790777826459e-06, "loss": 0.8403, "num_input_tokens_seen": 572620800, "step": 69900 }, { "epoch": 2.242296111217887, "grad_norm": 0.8946901559829712, "learning_rate": 7.618763286063698e-06, "loss": 0.8336, "num_input_tokens_seen": 573440000, "step": 70000 }, { "epoch": 2.24549939137677, "grad_norm": 0.7540560364723206, "learning_rate": 7.55793562014554e-06, "loss": 0.7682, "num_input_tokens_seen": 574259200, "step": 70100 }, { "epoch": 2.2487026715356526, "grad_norm": 0.7601240873336792, "learning_rate": 7.497308479385831e-06, "loss": 0.8367, "num_input_tokens_seen": 575078400, "step": 70200 }, { "epoch": 2.2519059516945354, "grad_norm": 0.7198605537414551, "learning_rate": 7.43688256079306e-06, "loss": 0.8119, "num_input_tokens_seen": 575897600, "step": 70300 }, { "epoch": 2.255109231853418, "grad_norm": 0.7405291199684143, "learning_rate": 7.376658559062349e-06, "loss": 0.8231, "num_input_tokens_seen": 576716800, "step": 70400 }, { "epoch": 2.2583125120123007, "grad_norm": 0.6844334602355957, "learning_rate": 7.31663716656745e-06, "loss": 0.852, "num_input_tokens_seen": 577536000, "step": 70500 }, { "epoch": 2.2615157921711835, "grad_norm": 3.182279348373413, "learning_rate": 7.256819073352775e-06, "loss": 0.82, "num_input_tokens_seen": 578355200, "step": 70600 }, { "epoch": 2.264719072330066, "grad_norm": 0.7010332345962524, "learning_rate": 7.197204967125498e-06, "loss": 0.8417, "num_input_tokens_seen": 579174400, "step": 70700 }, { "epoch": 2.2679223524889487, "grad_norm": 3.276526927947998, "learning_rate": 7.137795533247604e-06, "loss": 0.8252, "num_input_tokens_seen": 579993600, "step": 70800 }, { "epoch": 2.2711256326478315, "grad_norm": 0.6692455410957336, "learning_rate": 7.078591454728056e-06, "loss": 0.8195, "num_input_tokens_seen": 580812800, "step": 70900 }, { "epoch": 2.274328912806714, "grad_norm": 0.6837947368621826, "learning_rate": 7.019593412214914e-06, "loss": 0.8012, "num_input_tokens_seen": 581632000, "step": 71000 }, { "epoch": 2.2775321929655967, "grad_norm": 0.8453261256217957, "learning_rate": 6.960802083987503e-06, "loss": 0.8097, "num_input_tokens_seen": 582451200, "step": 71100 }, { "epoch": 2.2807354731244796, "grad_norm": 0.7615090608596802, "learning_rate": 6.902218145948647e-06, "loss": 0.8216, "num_input_tokens_seen": 583270400, "step": 71200 }, { "epoch": 2.283938753283362, "grad_norm": 2.4880526065826416, "learning_rate": 6.8438422716168595e-06, "loss": 0.829, "num_input_tokens_seen": 584089600, "step": 71300 }, { "epoch": 2.2871420334422448, "grad_norm": 2.184436798095703, "learning_rate": 6.785675132118638e-06, "loss": 0.8557, "num_input_tokens_seen": 584908800, "step": 71400 }, { "epoch": 2.2903453136011276, "grad_norm": 0.6513957977294922, "learning_rate": 6.72771739618073e-06, "loss": 0.8199, "num_input_tokens_seen": 585728000, "step": 71500 }, { "epoch": 2.2935485937600104, "grad_norm": 2.187042713165283, "learning_rate": 6.6699697301224214e-06, "loss": 0.876, "num_input_tokens_seen": 586547200, "step": 71600 }, { "epoch": 2.296751873918893, "grad_norm": 0.6848201751708984, "learning_rate": 6.612432797847937e-06, "loss": 0.8013, "num_input_tokens_seen": 587366400, "step": 71700 }, { "epoch": 2.2999551540777756, "grad_norm": 0.9538524150848389, "learning_rate": 6.55510726083873e-06, "loss": 0.7922, "num_input_tokens_seen": 588185600, "step": 71800 }, { "epoch": 2.3031584342366584, "grad_norm": 0.6234622597694397, "learning_rate": 6.4979937781459586e-06, "loss": 0.7617, "num_input_tokens_seen": 589004800, "step": 71900 }, { "epoch": 2.306361714395541, "grad_norm": 0.7952730655670166, "learning_rate": 6.441093006382831e-06, "loss": 0.8744, "num_input_tokens_seen": 589824000, "step": 72000 }, { "epoch": 2.3095649945544237, "grad_norm": 0.6471823453903198, "learning_rate": 6.384405599717125e-06, "loss": 0.7952, "num_input_tokens_seen": 590643200, "step": 72100 }, { "epoch": 2.3127682747133065, "grad_norm": 0.713498592376709, "learning_rate": 6.327932209863618e-06, "loss": 0.817, "num_input_tokens_seen": 591462400, "step": 72200 }, { "epoch": 2.3159715548721893, "grad_norm": 0.8223375678062439, "learning_rate": 6.271673486076629e-06, "loss": 0.8127, "num_input_tokens_seen": 592281600, "step": 72300 }, { "epoch": 2.3191748350310717, "grad_norm": 2.696056842803955, "learning_rate": 6.215630075142523e-06, "loss": 0.8191, "num_input_tokens_seen": 593100800, "step": 72400 }, { "epoch": 2.3223781151899545, "grad_norm": 0.6731551885604858, "learning_rate": 6.159802621372279e-06, "loss": 0.831, "num_input_tokens_seen": 593920000, "step": 72500 }, { "epoch": 2.3255813953488373, "grad_norm": 0.6898087859153748, "learning_rate": 6.1041917665941275e-06, "loss": 0.8249, "num_input_tokens_seen": 594739200, "step": 72600 }, { "epoch": 2.3287846755077197, "grad_norm": 0.6532519459724426, "learning_rate": 6.048798150146112e-06, "loss": 0.7416, "num_input_tokens_seen": 595558400, "step": 72700 }, { "epoch": 2.3319879556666026, "grad_norm": 0.6760110259056091, "learning_rate": 5.993622408868788e-06, "loss": 0.8451, "num_input_tokens_seen": 596377600, "step": 72800 }, { "epoch": 2.3351912358254854, "grad_norm": 2.732374668121338, "learning_rate": 5.9386651770978516e-06, "loss": 0.8654, "num_input_tokens_seen": 597196800, "step": 72900 }, { "epoch": 2.338394515984368, "grad_norm": 0.6297926306724548, "learning_rate": 5.8839270866568816e-06, "loss": 0.8397, "num_input_tokens_seen": 598016000, "step": 73000 }, { "epoch": 2.3415977961432506, "grad_norm": 0.5178629755973816, "learning_rate": 5.829408766850078e-06, "loss": 0.833, "num_input_tokens_seen": 598835200, "step": 73100 }, { "epoch": 2.3448010763021334, "grad_norm": 0.5522879958152771, "learning_rate": 5.7751108444550066e-06, "loss": 0.8174, "num_input_tokens_seen": 599654400, "step": 73200 }, { "epoch": 2.3480043564610162, "grad_norm": 0.6307721734046936, "learning_rate": 5.7210339437154175e-06, "loss": 0.7809, "num_input_tokens_seen": 600473600, "step": 73300 }, { "epoch": 2.3512076366198986, "grad_norm": 0.6830965876579285, "learning_rate": 5.667178686334037e-06, "loss": 0.8243, "num_input_tokens_seen": 601292800, "step": 73400 }, { "epoch": 2.3544109167787814, "grad_norm": 2.0725910663604736, "learning_rate": 5.613545691465438e-06, "loss": 0.7868, "num_input_tokens_seen": 602112000, "step": 73500 }, { "epoch": 2.3576141969376643, "grad_norm": 0.994819700717926, "learning_rate": 5.560135575708927e-06, "loss": 0.8176, "num_input_tokens_seen": 602931200, "step": 73600 }, { "epoch": 2.360817477096547, "grad_norm": 0.7025684714317322, "learning_rate": 5.506948953101454e-06, "loss": 0.8417, "num_input_tokens_seen": 603750400, "step": 73700 }, { "epoch": 2.3640207572554295, "grad_norm": 0.6975109577178955, "learning_rate": 5.45398643511055e-06, "loss": 0.8552, "num_input_tokens_seen": 604569600, "step": 73800 }, { "epoch": 2.3672240374143123, "grad_norm": 0.6180407404899597, "learning_rate": 5.401248630627282e-06, "loss": 0.8423, "num_input_tokens_seen": 605388800, "step": 73900 }, { "epoch": 2.370427317573195, "grad_norm": 0.8194453716278076, "learning_rate": 5.3487361459592626e-06, "loss": 0.8278, "num_input_tokens_seen": 606208000, "step": 74000 }, { "epoch": 2.3736305977320775, "grad_norm": 0.6039137244224548, "learning_rate": 5.296449584823707e-06, "loss": 0.8354, "num_input_tokens_seen": 607027200, "step": 74100 }, { "epoch": 2.3768338778909603, "grad_norm": 0.6407757997512817, "learning_rate": 5.244389548340456e-06, "loss": 0.8292, "num_input_tokens_seen": 607846400, "step": 74200 }, { "epoch": 2.380037158049843, "grad_norm": 1.9735205173492432, "learning_rate": 5.19255663502507e-06, "loss": 0.8604, "num_input_tokens_seen": 608665600, "step": 74300 }, { "epoch": 2.3832404382087256, "grad_norm": 0.7297560572624207, "learning_rate": 5.1409514407819745e-06, "loss": 0.8464, "num_input_tokens_seen": 609484800, "step": 74400 }, { "epoch": 2.3864437183676084, "grad_norm": 0.641272246837616, "learning_rate": 5.089574558897564e-06, "loss": 0.8711, "num_input_tokens_seen": 610304000, "step": 74500 }, { "epoch": 2.389646998526491, "grad_norm": 0.5732747316360474, "learning_rate": 5.038426580033431e-06, "loss": 0.8357, "num_input_tokens_seen": 611123200, "step": 74600 }, { "epoch": 2.3928502786853736, "grad_norm": 0.7175111770629883, "learning_rate": 4.98750809221955e-06, "loss": 0.8782, "num_input_tokens_seen": 611942400, "step": 74700 }, { "epoch": 2.3960535588442564, "grad_norm": 0.6939539909362793, "learning_rate": 4.936819680847499e-06, "loss": 0.8051, "num_input_tokens_seen": 612761600, "step": 74800 }, { "epoch": 2.3992568390031392, "grad_norm": 0.9897929430007935, "learning_rate": 4.886361928663779e-06, "loss": 0.8208, "num_input_tokens_seen": 613580800, "step": 74900 }, { "epoch": 2.402460119162022, "grad_norm": 1.3492214679718018, "learning_rate": 4.836135415763054e-06, "loss": 0.8081, "num_input_tokens_seen": 614400000, "step": 75000 }, { "epoch": 2.4056633993209044, "grad_norm": 0.6165256500244141, "learning_rate": 4.786140719581539e-06, "loss": 0.8612, "num_input_tokens_seen": 615219200, "step": 75100 }, { "epoch": 2.4088666794797873, "grad_norm": 0.7315238118171692, "learning_rate": 4.73637841489033e-06, "loss": 0.8201, "num_input_tokens_seen": 616038400, "step": 75200 }, { "epoch": 2.41206995963867, "grad_norm": 0.5693472027778625, "learning_rate": 4.686849073788782e-06, "loss": 0.8319, "num_input_tokens_seen": 616857600, "step": 75300 }, { "epoch": 2.4152732397975525, "grad_norm": 1.28626549243927, "learning_rate": 4.637553265697978e-06, "loss": 0.8012, "num_input_tokens_seen": 617676800, "step": 75400 }, { "epoch": 2.4184765199564353, "grad_norm": 3.020348072052002, "learning_rate": 4.5884915573541326e-06, "loss": 0.8216, "num_input_tokens_seen": 618496000, "step": 75500 }, { "epoch": 2.421679800115318, "grad_norm": 1.7923747301101685, "learning_rate": 4.539664512802125e-06, "loss": 0.8269, "num_input_tokens_seen": 619315200, "step": 75600 }, { "epoch": 2.424883080274201, "grad_norm": 0.6749047636985779, "learning_rate": 4.491072693388957e-06, "loss": 0.7949, "num_input_tokens_seen": 620134400, "step": 75700 }, { "epoch": 2.4280863604330833, "grad_norm": 0.8918429613113403, "learning_rate": 4.442716657757354e-06, "loss": 0.8153, "num_input_tokens_seen": 620953600, "step": 75800 }, { "epoch": 2.431289640591966, "grad_norm": 0.8165135383605957, "learning_rate": 4.3945969618393255e-06, "loss": 0.8063, "num_input_tokens_seen": 621772800, "step": 75900 }, { "epoch": 2.434492920750849, "grad_norm": 2.7509946823120117, "learning_rate": 4.346714158849744e-06, "loss": 0.7779, "num_input_tokens_seen": 622592000, "step": 76000 }, { "epoch": 2.4376962009097314, "grad_norm": 1.2128119468688965, "learning_rate": 4.299068799280032e-06, "loss": 0.8322, "num_input_tokens_seen": 623411200, "step": 76100 }, { "epoch": 2.440899481068614, "grad_norm": 1.1851086616516113, "learning_rate": 4.251661430891787e-06, "loss": 0.8294, "num_input_tokens_seen": 624230400, "step": 76200 }, { "epoch": 2.444102761227497, "grad_norm": 0.7874124646186829, "learning_rate": 4.20449259871053e-06, "loss": 0.819, "num_input_tokens_seen": 625049600, "step": 76300 }, { "epoch": 2.44730604138638, "grad_norm": 0.6558551788330078, "learning_rate": 4.157562845019405e-06, "loss": 0.7969, "num_input_tokens_seen": 625868800, "step": 76400 }, { "epoch": 2.4505093215452622, "grad_norm": 0.7723847031593323, "learning_rate": 4.1108727093529644e-06, "loss": 0.8516, "num_input_tokens_seen": 626688000, "step": 76500 }, { "epoch": 2.453712601704145, "grad_norm": 0.6779108047485352, "learning_rate": 4.064422728490946e-06, "loss": 0.8471, "num_input_tokens_seen": 627507200, "step": 76600 }, { "epoch": 2.456915881863028, "grad_norm": 0.5954208970069885, "learning_rate": 4.018213436452117e-06, "loss": 0.84, "num_input_tokens_seen": 628326400, "step": 76700 }, { "epoch": 2.4601191620219103, "grad_norm": 2.6484439373016357, "learning_rate": 3.972245364488136e-06, "loss": 0.8224, "num_input_tokens_seen": 629145600, "step": 76800 }, { "epoch": 2.463322442180793, "grad_norm": 0.6489027142524719, "learning_rate": 3.926519041077445e-06, "loss": 0.8476, "num_input_tokens_seen": 629964800, "step": 76900 }, { "epoch": 2.466525722339676, "grad_norm": 2.0896570682525635, "learning_rate": 3.8810349919191825e-06, "loss": 0.8256, "num_input_tokens_seen": 630784000, "step": 77000 }, { "epoch": 2.4697290024985588, "grad_norm": 0.8174818158149719, "learning_rate": 3.835793739927151e-06, "loss": 0.8493, "num_input_tokens_seen": 631603200, "step": 77100 }, { "epoch": 2.472932282657441, "grad_norm": 0.7576190829277039, "learning_rate": 3.7907958052237875e-06, "loss": 0.8275, "num_input_tokens_seen": 632422400, "step": 77200 }, { "epoch": 2.476135562816324, "grad_norm": 1.7763944864273071, "learning_rate": 3.746041705134215e-06, "loss": 0.8628, "num_input_tokens_seen": 633241600, "step": 77300 }, { "epoch": 2.479338842975207, "grad_norm": 0.8131124973297119, "learning_rate": 3.7015319541802708e-06, "loss": 0.8246, "num_input_tokens_seen": 634060800, "step": 77400 }, { "epoch": 2.482542123134089, "grad_norm": 0.9916465282440186, "learning_rate": 3.657267064074607e-06, "loss": 0.806, "num_input_tokens_seen": 634880000, "step": 77500 }, { "epoch": 2.485745403292972, "grad_norm": 1.6239954233169556, "learning_rate": 3.613247543714779e-06, "loss": 0.8068, "num_input_tokens_seen": 635699200, "step": 77600 }, { "epoch": 2.488948683451855, "grad_norm": 1.0215014219284058, "learning_rate": 3.5694738991774197e-06, "loss": 0.7704, "num_input_tokens_seen": 636518400, "step": 77700 }, { "epoch": 2.492151963610737, "grad_norm": 0.6939218044281006, "learning_rate": 3.5259466337124293e-06, "loss": 0.8625, "num_input_tokens_seen": 637337600, "step": 77800 }, { "epoch": 2.49535524376962, "grad_norm": 0.7442044615745544, "learning_rate": 3.4826662477371624e-06, "loss": 0.8093, "num_input_tokens_seen": 638156800, "step": 77900 }, { "epoch": 2.498558523928503, "grad_norm": 0.5725979208946228, "learning_rate": 3.4396332388307057e-06, "loss": 0.8533, "num_input_tokens_seen": 638976000, "step": 78000 }, { "epoch": 2.5017618040873852, "grad_norm": 2.239358425140381, "learning_rate": 3.3968481017281173e-06, "loss": 0.8254, "num_input_tokens_seen": 639795200, "step": 78100 }, { "epoch": 2.504965084246268, "grad_norm": 0.6777194142341614, "learning_rate": 3.3543113283147687e-06, "loss": 0.8311, "num_input_tokens_seen": 640614400, "step": 78200 }, { "epoch": 2.508168364405151, "grad_norm": 0.9692057371139526, "learning_rate": 3.3120234076206987e-06, "loss": 0.8285, "num_input_tokens_seen": 641433600, "step": 78300 }, { "epoch": 2.5113716445640337, "grad_norm": 0.8157410621643066, "learning_rate": 3.2699848258149617e-06, "loss": 0.8276, "num_input_tokens_seen": 642252800, "step": 78400 }, { "epoch": 2.514574924722916, "grad_norm": 1.9688010215759277, "learning_rate": 3.228196066200051e-06, "loss": 0.7989, "num_input_tokens_seen": 643072000, "step": 78500 }, { "epoch": 2.517778204881799, "grad_norm": 2.142247200012207, "learning_rate": 3.186657609206353e-06, "loss": 0.8165, "num_input_tokens_seen": 643891200, "step": 78600 }, { "epoch": 2.5209814850406818, "grad_norm": 0.7529670596122742, "learning_rate": 3.1453699323866047e-06, "loss": 0.8476, "num_input_tokens_seen": 644710400, "step": 78700 }, { "epoch": 2.524184765199564, "grad_norm": 0.5978514552116394, "learning_rate": 3.1043335104104233e-06, "loss": 0.8386, "num_input_tokens_seen": 645529600, "step": 78800 }, { "epoch": 2.527388045358447, "grad_norm": 0.7615718841552734, "learning_rate": 3.0635488150588338e-06, "loss": 0.8198, "num_input_tokens_seen": 646348800, "step": 78900 }, { "epoch": 2.53059132551733, "grad_norm": 0.7568325400352478, "learning_rate": 3.0230163152188463e-06, "loss": 0.8364, "num_input_tokens_seen": 647168000, "step": 79000 }, { "epoch": 2.5337946056762126, "grad_norm": 0.5773870944976807, "learning_rate": 2.9827364768780814e-06, "loss": 0.7922, "num_input_tokens_seen": 647987200, "step": 79100 }, { "epoch": 2.536997885835095, "grad_norm": 4.734196662902832, "learning_rate": 2.942709763119386e-06, "loss": 0.7829, "num_input_tokens_seen": 648806400, "step": 79200 }, { "epoch": 2.540201165993978, "grad_norm": 0.7763670682907104, "learning_rate": 2.9029366341155356e-06, "loss": 0.8196, "num_input_tokens_seen": 649625600, "step": 79300 }, { "epoch": 2.5434044461528607, "grad_norm": 0.6776308417320251, "learning_rate": 2.863417547123934e-06, "loss": 0.788, "num_input_tokens_seen": 650444800, "step": 79400 }, { "epoch": 2.546607726311743, "grad_norm": 0.7068803906440735, "learning_rate": 2.8241529564813434e-06, "loss": 0.8413, "num_input_tokens_seen": 651264000, "step": 79500 }, { "epoch": 2.549811006470626, "grad_norm": 1.1894068717956543, "learning_rate": 2.7851433135986843e-06, "loss": 0.851, "num_input_tokens_seen": 652083200, "step": 79600 }, { "epoch": 2.5530142866295087, "grad_norm": 1.9698837995529175, "learning_rate": 2.7463890669558263e-06, "loss": 0.8379, "num_input_tokens_seen": 652902400, "step": 79700 }, { "epoch": 2.5562175667883915, "grad_norm": 1.8066941499710083, "learning_rate": 2.707890662096452e-06, "loss": 0.7906, "num_input_tokens_seen": 653721600, "step": 79800 }, { "epoch": 2.559420846947274, "grad_norm": 0.824046790599823, "learning_rate": 2.6696485416228987e-06, "loss": 0.8011, "num_input_tokens_seen": 654540800, "step": 79900 }, { "epoch": 2.5626241271061567, "grad_norm": 0.7096015214920044, "learning_rate": 2.6316631451911213e-06, "loss": 0.8328, "num_input_tokens_seen": 655360000, "step": 80000 }, { "epoch": 2.5658274072650396, "grad_norm": 0.5634686350822449, "learning_rate": 2.593934909505602e-06, "loss": 0.8896, "num_input_tokens_seen": 656179200, "step": 80100 }, { "epoch": 2.569030687423922, "grad_norm": 0.7022582292556763, "learning_rate": 2.5564642683143263e-06, "loss": 0.8405, "num_input_tokens_seen": 656998400, "step": 80200 }, { "epoch": 2.5722339675828048, "grad_norm": 0.010020343586802483, "learning_rate": 2.51925165240382e-06, "loss": 0.8639, "num_input_tokens_seen": 657817600, "step": 80300 }, { "epoch": 2.5754372477416876, "grad_norm": 0.7010151147842407, "learning_rate": 2.482297489594182e-06, "loss": 0.813, "num_input_tokens_seen": 658636800, "step": 80400 }, { "epoch": 2.5786405279005704, "grad_norm": 1.0606889724731445, "learning_rate": 2.4456022047341653e-06, "loss": 0.8494, "num_input_tokens_seen": 659456000, "step": 80500 }, { "epoch": 2.581843808059453, "grad_norm": 0.5736305713653564, "learning_rate": 2.4091662196963014e-06, "loss": 0.8748, "num_input_tokens_seen": 660275200, "step": 80600 }, { "epoch": 2.5850470882183356, "grad_norm": 0.6299107074737549, "learning_rate": 2.3729899533720485e-06, "loss": 0.8254, "num_input_tokens_seen": 661094400, "step": 80700 }, { "epoch": 2.588250368377218, "grad_norm": 0.8091995120048523, "learning_rate": 2.3370738216669574e-06, "loss": 0.8373, "num_input_tokens_seen": 661913600, "step": 80800 }, { "epoch": 2.591453648536101, "grad_norm": 0.7887117862701416, "learning_rate": 2.3014182374959116e-06, "loss": 0.7675, "num_input_tokens_seen": 662732800, "step": 80900 }, { "epoch": 2.5946569286949837, "grad_norm": 0.7341217994689941, "learning_rate": 2.2660236107783783e-06, "loss": 0.8264, "num_input_tokens_seen": 663552000, "step": 81000 }, { "epoch": 2.5978602088538665, "grad_norm": 0.7887162566184998, "learning_rate": 2.230890348433684e-06, "loss": 0.8579, "num_input_tokens_seen": 664371200, "step": 81100 }, { "epoch": 2.6010634890127493, "grad_norm": 0.8627157807350159, "learning_rate": 2.1960188543763526e-06, "loss": 0.8412, "num_input_tokens_seen": 665190400, "step": 81200 }, { "epoch": 2.6042667691716317, "grad_norm": 2.6676676273345947, "learning_rate": 2.161409529511438e-06, "loss": 0.7985, "num_input_tokens_seen": 666009600, "step": 81300 }, { "epoch": 2.6074700493305145, "grad_norm": 0.6035804748535156, "learning_rate": 2.127062771729929e-06, "loss": 0.8033, "num_input_tokens_seen": 666828800, "step": 81400 }, { "epoch": 2.610673329489397, "grad_norm": 2.14854097366333, "learning_rate": 2.092978975904189e-06, "loss": 0.8538, "num_input_tokens_seen": 667648000, "step": 81500 }, { "epoch": 2.6138766096482797, "grad_norm": 1.651636004447937, "learning_rate": 2.059158533883393e-06, "loss": 0.8805, "num_input_tokens_seen": 668467200, "step": 81600 }, { "epoch": 2.6170798898071626, "grad_norm": 2.1014175415039062, "learning_rate": 2.025601834489038e-06, "loss": 0.8837, "num_input_tokens_seen": 669286400, "step": 81700 }, { "epoch": 2.6202831699660454, "grad_norm": 0.741468071937561, "learning_rate": 1.9923092635104557e-06, "loss": 0.7892, "num_input_tokens_seen": 670105600, "step": 81800 }, { "epoch": 2.6234864501249278, "grad_norm": 1.3246105909347534, "learning_rate": 1.9592812037003918e-06, "loss": 0.774, "num_input_tokens_seen": 670924800, "step": 81900 }, { "epoch": 2.6266897302838106, "grad_norm": 0.6697006225585938, "learning_rate": 1.9265180347706053e-06, "loss": 0.8393, "num_input_tokens_seen": 671744000, "step": 82000 }, { "epoch": 2.6298930104426934, "grad_norm": 0.5421914458274841, "learning_rate": 1.894020133387503e-06, "loss": 0.8398, "num_input_tokens_seen": 672563200, "step": 82100 }, { "epoch": 2.633096290601576, "grad_norm": 2.6112563610076904, "learning_rate": 1.8617878731678e-06, "loss": 0.8031, "num_input_tokens_seen": 673382400, "step": 82200 }, { "epoch": 2.6362995707604586, "grad_norm": 0.7507239580154419, "learning_rate": 1.8298216246742329e-06, "loss": 0.831, "num_input_tokens_seen": 674201600, "step": 82300 }, { "epoch": 2.6395028509193414, "grad_norm": 2.156158685684204, "learning_rate": 1.798121755411289e-06, "loss": 0.8778, "num_input_tokens_seen": 675020800, "step": 82400 }, { "epoch": 2.6427061310782243, "grad_norm": 0.5693337917327881, "learning_rate": 1.7666886298210006e-06, "loss": 0.7904, "num_input_tokens_seen": 675840000, "step": 82500 }, { "epoch": 2.6459094112371067, "grad_norm": 0.9597682356834412, "learning_rate": 1.735522609278742e-06, "loss": 0.8547, "num_input_tokens_seen": 676659200, "step": 82600 }, { "epoch": 2.6491126913959895, "grad_norm": 0.8956586122512817, "learning_rate": 1.7046240520890655e-06, "loss": 0.8395, "num_input_tokens_seen": 677478400, "step": 82700 }, { "epoch": 2.6523159715548723, "grad_norm": 0.918878436088562, "learning_rate": 1.6739933134816117e-06, "loss": 0.8106, "num_input_tokens_seen": 678297600, "step": 82800 }, { "epoch": 2.6555192517137547, "grad_norm": 0.6460690498352051, "learning_rate": 1.6436307456069832e-06, "loss": 0.8427, "num_input_tokens_seen": 679116800, "step": 82900 }, { "epoch": 2.6587225318726375, "grad_norm": 0.7876623868942261, "learning_rate": 1.6135366975327442e-06, "loss": 0.8306, "num_input_tokens_seen": 679936000, "step": 83000 }, { "epoch": 2.6619258120315203, "grad_norm": 0.7109478712081909, "learning_rate": 1.5837115152393695e-06, "loss": 0.8785, "num_input_tokens_seen": 680755200, "step": 83100 }, { "epoch": 2.665129092190403, "grad_norm": 0.6864702701568604, "learning_rate": 1.5541555416162784e-06, "loss": 0.7719, "num_input_tokens_seen": 681574400, "step": 83200 }, { "epoch": 2.6683323723492856, "grad_norm": 0.5490867495536804, "learning_rate": 1.5248691164579054e-06, "loss": 0.7945, "num_input_tokens_seen": 682393600, "step": 83300 }, { "epoch": 2.6715356525081684, "grad_norm": 0.7371602654457092, "learning_rate": 1.4958525764597719e-06, "loss": 0.8751, "num_input_tokens_seen": 683212800, "step": 83400 }, { "epoch": 2.674738932667051, "grad_norm": 3.058120012283325, "learning_rate": 1.4671062552146342e-06, "loss": 0.807, "num_input_tokens_seen": 684032000, "step": 83500 }, { "epoch": 2.6779422128259336, "grad_norm": 2.8297903537750244, "learning_rate": 1.4386304832086333e-06, "loss": 0.8519, "num_input_tokens_seen": 684851200, "step": 83600 }, { "epoch": 2.6811454929848164, "grad_norm": 0.5840158462524414, "learning_rate": 1.4104255878175099e-06, "loss": 0.7911, "num_input_tokens_seen": 685670400, "step": 83700 }, { "epoch": 2.6843487731436992, "grad_norm": 0.5358206629753113, "learning_rate": 1.382491893302837e-06, "loss": 0.85, "num_input_tokens_seen": 686489600, "step": 83800 }, { "epoch": 2.687552053302582, "grad_norm": 0.5446909666061401, "learning_rate": 1.3548297208082678e-06, "loss": 0.7469, "num_input_tokens_seen": 687308800, "step": 83900 }, { "epoch": 2.6907553334614644, "grad_norm": 0.7376157641410828, "learning_rate": 1.3274393883558916e-06, "loss": 0.815, "num_input_tokens_seen": 688128000, "step": 84000 }, { "epoch": 2.6939586136203473, "grad_norm": 2.3603358268737793, "learning_rate": 1.3003212108425256e-06, "loss": 0.8195, "num_input_tokens_seen": 688947200, "step": 84100 }, { "epoch": 2.6971618937792297, "grad_norm": 2.3444812297821045, "learning_rate": 1.2734755000361393e-06, "loss": 0.8265, "num_input_tokens_seen": 689766400, "step": 84200 }, { "epoch": 2.7003651739381125, "grad_norm": 0.7536035776138306, "learning_rate": 1.2469025645722333e-06, "loss": 0.8382, "num_input_tokens_seen": 690585600, "step": 84300 }, { "epoch": 2.7035684540969953, "grad_norm": 0.7054631114006042, "learning_rate": 1.2206027099503275e-06, "loss": 0.7791, "num_input_tokens_seen": 691404800, "step": 84400 }, { "epoch": 2.706771734255878, "grad_norm": 0.7819291353225708, "learning_rate": 1.1945762385304122e-06, "loss": 0.8321, "num_input_tokens_seen": 692224000, "step": 84500 }, { "epoch": 2.709975014414761, "grad_norm": 0.7501091361045837, "learning_rate": 1.168823449529488e-06, "loss": 0.8494, "num_input_tokens_seen": 693043200, "step": 84600 }, { "epoch": 2.7131782945736433, "grad_norm": 0.566743016242981, "learning_rate": 1.1433446390181402e-06, "loss": 0.8685, "num_input_tokens_seen": 693862400, "step": 84700 }, { "epoch": 2.716381574732526, "grad_norm": 2.204374313354492, "learning_rate": 1.1181400999171144e-06, "loss": 0.8147, "num_input_tokens_seen": 694681600, "step": 84800 }, { "epoch": 2.7195848548914086, "grad_norm": 2.641223192214966, "learning_rate": 1.0932101219939594e-06, "loss": 0.8259, "num_input_tokens_seen": 695500800, "step": 84900 }, { "epoch": 2.7227881350502914, "grad_norm": 0.747035562992096, "learning_rate": 1.0685549918596882e-06, "loss": 0.8737, "num_input_tokens_seen": 696320000, "step": 85000 }, { "epoch": 2.725991415209174, "grad_norm": 0.9778177738189697, "learning_rate": 1.0441749929654827e-06, "loss": 0.8358, "num_input_tokens_seen": 697139200, "step": 85100 }, { "epoch": 2.729194695368057, "grad_norm": 2.0086069107055664, "learning_rate": 1.0200704055994548e-06, "loss": 0.8231, "num_input_tokens_seen": 697958400, "step": 85200 }, { "epoch": 2.73239797552694, "grad_norm": 0.7290952801704407, "learning_rate": 9.962415068833968e-07, "loss": 0.8211, "num_input_tokens_seen": 698777600, "step": 85300 }, { "epoch": 2.7356012556858222, "grad_norm": 0.6520437598228455, "learning_rate": 9.726885707696114e-07, "loss": 0.8776, "num_input_tokens_seen": 699596800, "step": 85400 }, { "epoch": 2.738804535844705, "grad_norm": 0.5633389353752136, "learning_rate": 9.494118680377612e-07, "loss": 0.8198, "num_input_tokens_seen": 700416000, "step": 85500 }, { "epoch": 2.7420078160035875, "grad_norm": 0.8410841822624207, "learning_rate": 9.264116662917405e-07, "loss": 0.8894, "num_input_tokens_seen": 701235200, "step": 85600 }, { "epoch": 2.7452110961624703, "grad_norm": 2.9148612022399902, "learning_rate": 9.036882299566229e-07, "loss": 0.8259, "num_input_tokens_seen": 702054400, "step": 85700 }, { "epoch": 2.748414376321353, "grad_norm": 0.5637199878692627, "learning_rate": 8.812418202756107e-07, "loss": 0.7636, "num_input_tokens_seen": 702873600, "step": 85800 }, { "epoch": 2.751617656480236, "grad_norm": 0.5929956436157227, "learning_rate": 8.590726953070228e-07, "loss": 0.8448, "num_input_tokens_seen": 703692800, "step": 85900 }, { "epoch": 2.7548209366391183, "grad_norm": 0.5491350889205933, "learning_rate": 8.371811099213394e-07, "loss": 0.8467, "num_input_tokens_seen": 704512000, "step": 86000 }, { "epoch": 2.758024216798001, "grad_norm": 1.0223699808120728, "learning_rate": 8.155673157982601e-07, "loss": 0.8133, "num_input_tokens_seen": 705331200, "step": 86100 }, { "epoch": 2.761227496956884, "grad_norm": 1.5225611925125122, "learning_rate": 7.942315614238277e-07, "loss": 0.8109, "num_input_tokens_seen": 706150400, "step": 86200 }, { "epoch": 2.7644307771157663, "grad_norm": 0.8148054480552673, "learning_rate": 7.731740920875613e-07, "loss": 0.821, "num_input_tokens_seen": 706969600, "step": 86300 }, { "epoch": 2.767634057274649, "grad_norm": 0.7864372730255127, "learning_rate": 7.523951498796283e-07, "loss": 0.8135, "num_input_tokens_seen": 707788800, "step": 86400 }, { "epoch": 2.770837337433532, "grad_norm": 2.5619330406188965, "learning_rate": 7.318949736880798e-07, "loss": 0.7905, "num_input_tokens_seen": 708608000, "step": 86500 }, { "epoch": 2.774040617592415, "grad_norm": 1.5780519247055054, "learning_rate": 7.116737991960831e-07, "loss": 0.8608, "num_input_tokens_seen": 709427200, "step": 86600 }, { "epoch": 2.777243897751297, "grad_norm": 0.666118323802948, "learning_rate": 6.917318588792299e-07, "loss": 0.8586, "num_input_tokens_seen": 710246400, "step": 86700 }, { "epoch": 2.78044717791018, "grad_norm": 0.5050229430198669, "learning_rate": 6.720693820028629e-07, "loss": 0.8473, "num_input_tokens_seen": 711065600, "step": 86800 }, { "epoch": 2.783650458069063, "grad_norm": 0.5586540699005127, "learning_rate": 6.526865946194172e-07, "loss": 0.8182, "num_input_tokens_seen": 711884800, "step": 86900 }, { "epoch": 2.7868537382279452, "grad_norm": 0.6938973665237427, "learning_rate": 6.335837195658528e-07, "loss": 0.8493, "num_input_tokens_seen": 712704000, "step": 87000 }, { "epoch": 2.790057018386828, "grad_norm": 0.8710479736328125, "learning_rate": 6.147609764610707e-07, "loss": 0.8134, "num_input_tokens_seen": 713523200, "step": 87100 }, { "epoch": 2.793260298545711, "grad_norm": 2.5295767784118652, "learning_rate": 5.962185817034005e-07, "loss": 0.7893, "num_input_tokens_seen": 714342400, "step": 87200 }, { "epoch": 2.7964635787045937, "grad_norm": 0.5434448719024658, "learning_rate": 5.779567484681032e-07, "loss": 0.7896, "num_input_tokens_seen": 715161600, "step": 87300 }, { "epoch": 2.799666858863476, "grad_norm": 2.833872079849243, "learning_rate": 5.599756867049221e-07, "loss": 0.8185, "num_input_tokens_seen": 715980800, "step": 87400 }, { "epoch": 2.802870139022359, "grad_norm": 0.5753843784332275, "learning_rate": 5.422756031356779e-07, "loss": 0.8188, "num_input_tokens_seen": 716800000, "step": 87500 }, { "epoch": 2.8060734191812418, "grad_norm": 0.6721400022506714, "learning_rate": 5.248567012518857e-07, "loss": 0.8303, "num_input_tokens_seen": 717619200, "step": 87600 }, { "epoch": 2.809276699340124, "grad_norm": 0.7175859808921814, "learning_rate": 5.077191813124105e-07, "loss": 0.7866, "num_input_tokens_seen": 718438400, "step": 87700 }, { "epoch": 2.812479979499007, "grad_norm": 0.9649165868759155, "learning_rate": 4.90863240341169e-07, "loss": 0.8269, "num_input_tokens_seen": 719257600, "step": 87800 }, { "epoch": 2.81568325965789, "grad_norm": 0.5693693161010742, "learning_rate": 4.742890721248755e-07, "loss": 0.7737, "num_input_tokens_seen": 720076800, "step": 87900 }, { "epoch": 2.8188865398167726, "grad_norm": 0.6442407369613647, "learning_rate": 4.579968672107943e-07, "loss": 0.8196, "num_input_tokens_seen": 720896000, "step": 88000 }, { "epoch": 2.822089819975655, "grad_norm": 0.72199547290802, "learning_rate": 4.419868129045629e-07, "loss": 0.7998, "num_input_tokens_seen": 721715200, "step": 88100 }, { "epoch": 2.825293100134538, "grad_norm": 1.2243154048919678, "learning_rate": 4.2625909326803325e-07, "loss": 0.8534, "num_input_tokens_seen": 722534400, "step": 88200 }, { "epoch": 2.82849638029342, "grad_norm": 0.8224316835403442, "learning_rate": 4.1081388911715645e-07, "loss": 0.8262, "num_input_tokens_seen": 723353600, "step": 88300 }, { "epoch": 2.831699660452303, "grad_norm": 0.7001350522041321, "learning_rate": 3.9565137801990395e-07, "loss": 0.8323, "num_input_tokens_seen": 724172800, "step": 88400 }, { "epoch": 2.834902940611186, "grad_norm": 0.7441889643669128, "learning_rate": 3.807717342942302e-07, "loss": 0.8116, "num_input_tokens_seen": 724992000, "step": 88500 }, { "epoch": 2.8381062207700687, "grad_norm": 0.6325407028198242, "learning_rate": 3.661751290060633e-07, "loss": 0.8481, "num_input_tokens_seen": 725811200, "step": 88600 }, { "epoch": 2.8413095009289515, "grad_norm": 0.9763919711112976, "learning_rate": 3.5186172996733714e-07, "loss": 0.8084, "num_input_tokens_seen": 726630400, "step": 88700 }, { "epoch": 2.844512781087834, "grad_norm": 0.6528813242912292, "learning_rate": 3.3783170173406764e-07, "loss": 0.7923, "num_input_tokens_seen": 727449600, "step": 88800 }, { "epoch": 2.8477160612467167, "grad_norm": 0.8190716505050659, "learning_rate": 3.2408520560445463e-07, "loss": 0.8397, "num_input_tokens_seen": 728268800, "step": 88900 }, { "epoch": 2.850919341405599, "grad_norm": 0.6821821928024292, "learning_rate": 3.10622399617036e-07, "loss": 0.7856, "num_input_tokens_seen": 729088000, "step": 89000 }, { "epoch": 2.854122621564482, "grad_norm": 0.9017992615699768, "learning_rate": 2.9744343854886393e-07, "loss": 0.8271, "num_input_tokens_seen": 729907200, "step": 89100 }, { "epoch": 2.8573259017233648, "grad_norm": 0.6816012263298035, "learning_rate": 2.8454847391372886e-07, "loss": 0.8334, "num_input_tokens_seen": 730726400, "step": 89200 }, { "epoch": 2.8605291818822476, "grad_norm": 1.0822001695632935, "learning_rate": 2.719376539604107e-07, "loss": 0.8198, "num_input_tokens_seen": 731545600, "step": 89300 }, { "epoch": 2.86373246204113, "grad_norm": 0.782041072845459, "learning_rate": 2.5961112367098306e-07, "loss": 0.8199, "num_input_tokens_seen": 732364800, "step": 89400 }, { "epoch": 2.866935742200013, "grad_norm": 1.8875998258590698, "learning_rate": 2.4756902475914777e-07, "loss": 0.7963, "num_input_tokens_seen": 733184000, "step": 89500 }, { "epoch": 2.8701390223588956, "grad_norm": 0.549452543258667, "learning_rate": 2.358114956685975e-07, "loss": 0.8353, "num_input_tokens_seen": 734003200, "step": 89600 }, { "epoch": 2.873342302517778, "grad_norm": 1.3322216272354126, "learning_rate": 2.243386715714224e-07, "loss": 0.8547, "num_input_tokens_seen": 734822400, "step": 89700 }, { "epoch": 2.876545582676661, "grad_norm": 0.8102174997329712, "learning_rate": 2.1315068436656983e-07, "loss": 0.8233, "num_input_tokens_seen": 735641600, "step": 89800 }, { "epoch": 2.8797488628355437, "grad_norm": 0.6969431042671204, "learning_rate": 2.0224766267831207e-07, "loss": 0.8622, "num_input_tokens_seen": 736460800, "step": 89900 }, { "epoch": 2.8829521429944265, "grad_norm": 1.4771400690078735, "learning_rate": 1.9162973185478383e-07, "loss": 0.789, "num_input_tokens_seen": 737280000, "step": 90000 }, { "epoch": 2.886155423153309, "grad_norm": 0.6978898048400879, "learning_rate": 1.8129701396652487e-07, "loss": 0.8723, "num_input_tokens_seen": 738099200, "step": 90100 }, { "epoch": 2.8893587033121917, "grad_norm": 0.838759183883667, "learning_rate": 1.7124962780508957e-07, "loss": 0.8136, "num_input_tokens_seen": 738918400, "step": 90200 }, { "epoch": 2.8925619834710745, "grad_norm": 0.6396787762641907, "learning_rate": 1.6148768888166744e-07, "loss": 0.8263, "num_input_tokens_seen": 739737600, "step": 90300 }, { "epoch": 2.895765263629957, "grad_norm": 0.7068443298339844, "learning_rate": 1.5201130942577578e-07, "loss": 0.8388, "num_input_tokens_seen": 740556800, "step": 90400 }, { "epoch": 2.8989685437888397, "grad_norm": 0.5743166208267212, "learning_rate": 1.4282059838394701e-07, "loss": 0.8284, "num_input_tokens_seen": 741376000, "step": 90500 }, { "epoch": 2.9021718239477226, "grad_norm": 0.5627537369728088, "learning_rate": 1.3391566141848778e-07, "loss": 0.834, "num_input_tokens_seen": 742195200, "step": 90600 }, { "epoch": 2.9053751041066054, "grad_norm": 2.069951057434082, "learning_rate": 1.2529660090626894e-07, "loss": 0.8798, "num_input_tokens_seen": 743014400, "step": 90700 }, { "epoch": 2.9085783842654878, "grad_norm": 0.5723984241485596, "learning_rate": 1.1696351593753485e-07, "loss": 0.8443, "num_input_tokens_seen": 743833600, "step": 90800 }, { "epoch": 2.9117816644243706, "grad_norm": 0.5584101676940918, "learning_rate": 1.0891650231477646e-07, "loss": 0.7991, "num_input_tokens_seen": 744652800, "step": 90900 }, { "epoch": 2.9149849445832534, "grad_norm": 0.8929557800292969, "learning_rate": 1.0115565255162107e-07, "loss": 0.8134, "num_input_tokens_seen": 745472000, "step": 91000 }, { "epoch": 2.918188224742136, "grad_norm": 0.5613967776298523, "learning_rate": 9.368105587177767e-08, "loss": 0.855, "num_input_tokens_seen": 746291200, "step": 91100 }, { "epoch": 2.9213915049010186, "grad_norm": 0.5235220193862915, "learning_rate": 8.649279820800161e-08, "loss": 0.7894, "num_input_tokens_seen": 747110400, "step": 91200 }, { "epoch": 2.9245947850599014, "grad_norm": 2.220933198928833, "learning_rate": 7.959096220111206e-08, "loss": 0.8311, "num_input_tokens_seen": 747929600, "step": 91300 }, { "epoch": 2.9277980652187843, "grad_norm": 2.264698028564453, "learning_rate": 7.297562719904561e-08, "loss": 0.7856, "num_input_tokens_seen": 748748800, "step": 91400 }, { "epoch": 2.9310013453776667, "grad_norm": 0.6808698773384094, "learning_rate": 6.664686925593188e-08, "loss": 0.8379, "num_input_tokens_seen": 749568000, "step": 91500 }, { "epoch": 2.9342046255365495, "grad_norm": 2.1781809329986572, "learning_rate": 6.060476113123603e-08, "loss": 0.7529, "num_input_tokens_seen": 750387200, "step": 91600 }, { "epoch": 2.937407905695432, "grad_norm": 0.6591463685035706, "learning_rate": 5.4849372288903744e-08, "loss": 0.8836, "num_input_tokens_seen": 751206400, "step": 91700 }, { "epoch": 2.9406111858543147, "grad_norm": 0.5385074019432068, "learning_rate": 4.9380768896578614e-08, "loss": 0.8253, "num_input_tokens_seen": 752025600, "step": 91800 }, { "epoch": 2.9438144660131975, "grad_norm": 0.7810553312301636, "learning_rate": 4.419901382483327e-08, "loss": 0.7867, "num_input_tokens_seen": 752844800, "step": 91900 }, { "epoch": 2.9470177461720803, "grad_norm": 1.6066702604293823, "learning_rate": 3.930416664644498e-08, "loss": 0.8089, "num_input_tokens_seen": 753664000, "step": 92000 }, { "epoch": 2.950221026330963, "grad_norm": 0.8969001173973083, "learning_rate": 3.469628363571564e-08, "loss": 0.8324, "num_input_tokens_seen": 754483200, "step": 92100 }, { "epoch": 2.9534243064898456, "grad_norm": 0.6381150484085083, "learning_rate": 3.037541776782782e-08, "loss": 0.8199, "num_input_tokens_seen": 755302400, "step": 92200 }, { "epoch": 2.9566275866487284, "grad_norm": 0.8189881443977356, "learning_rate": 2.6341618718223048e-08, "loss": 0.8282, "num_input_tokens_seen": 756121600, "step": 92300 }, { "epoch": 2.9598308668076108, "grad_norm": 0.744215190410614, "learning_rate": 2.2594932862041173e-08, "loss": 0.823, "num_input_tokens_seen": 756940800, "step": 92400 }, { "epoch": 2.9630341469664936, "grad_norm": 0.6979692578315735, "learning_rate": 1.91354032735902e-08, "loss": 0.7854, "num_input_tokens_seen": 757760000, "step": 92500 }, { "epoch": 2.9662374271253764, "grad_norm": 0.6506592035293579, "learning_rate": 1.5963069725838385e-08, "loss": 0.8654, "num_input_tokens_seen": 758579200, "step": 92600 }, { "epoch": 2.9694407072842592, "grad_norm": 0.7221033573150635, "learning_rate": 1.3077968689964582e-08, "loss": 0.7966, "num_input_tokens_seen": 759398400, "step": 92700 }, { "epoch": 2.9726439874431416, "grad_norm": 0.5663209557533264, "learning_rate": 1.0480133334947462e-08, "loss": 0.8375, "num_input_tokens_seen": 760217600, "step": 92800 }, { "epoch": 2.9758472676020244, "grad_norm": 0.7616459131240845, "learning_rate": 8.169593527160291e-09, "loss": 0.8056, "num_input_tokens_seen": 761036800, "step": 92900 }, { "epoch": 2.9790505477609073, "grad_norm": 0.7259778380393982, "learning_rate": 6.146375830054507e-09, "loss": 0.8026, "num_input_tokens_seen": 761856000, "step": 93000 }, { "epoch": 2.9822538279197897, "grad_norm": 0.6411218643188477, "learning_rate": 4.410503503840535e-09, "loss": 0.8472, "num_input_tokens_seen": 762675200, "step": 93100 }, { "epoch": 2.9854571080786725, "grad_norm": 0.6619647741317749, "learning_rate": 2.961996505213005e-09, "loss": 0.8558, "num_input_tokens_seen": 763494400, "step": 93200 }, { "epoch": 2.9886603882375553, "grad_norm": 0.7283292412757874, "learning_rate": 1.8008714871453613e-09, "loss": 0.8321, "num_input_tokens_seen": 764313600, "step": 93300 }, { "epoch": 2.991863668396438, "grad_norm": 0.7489187717437744, "learning_rate": 9.271417986705943e-10, "loss": 0.8264, "num_input_tokens_seen": 765132800, "step": 93400 }, { "epoch": 2.9950669485553205, "grad_norm": 2.186750888824463, "learning_rate": 3.408174847480128e-10, "loss": 0.7796, "num_input_tokens_seen": 765952000, "step": 93500 }, { "epoch": 2.9982702287142033, "grad_norm": 2.5423426628112793, "learning_rate": 4.1905286135568434e-11, "loss": 0.7863, "num_input_tokens_seen": 766771200, "step": 93600 }, { "epoch": 3.0, "num_input_tokens_seen": 767213568, "step": 93654, "total_flos": 3.49334314435121e+19, "train_loss": 0.04966252789047391, "train_runtime": 28761.9651, "train_samples_per_second": 3.256, "train_steps_per_second": 3.256 } ], "logging_steps": 100, "max_steps": 93654, "num_input_tokens_seen": 767213568, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.49334314435121e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }