{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 344, "global_step": 5510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014519056261343012, "grad_norm": 6.9375, "learning_rate": 1.3333333333333333e-05, "loss": 0.6156, "num_input_tokens_seen": 499226, "step": 8 }, { "epoch": 0.0029038112522686023, "grad_norm": 1.203125, "learning_rate": 2.6666666666666667e-05, "loss": 0.0994, "num_input_tokens_seen": 1014244, "step": 16 }, { "epoch": 0.004355716878402904, "grad_norm": 2.125, "learning_rate": 4e-05, "loss": 0.0849, "num_input_tokens_seen": 1528464, "step": 24 }, { "epoch": 0.005807622504537205, "grad_norm": 1.4453125, "learning_rate": 3.999979012178918e-05, "loss": 0.12, "num_input_tokens_seen": 2041011, "step": 32 }, { "epoch": 0.007259528130671506, "grad_norm": 2.609375, "learning_rate": 3.9999160491561583e-05, "loss": 0.1437, "num_input_tokens_seen": 2530185, "step": 40 }, { "epoch": 0.008711433756805808, "grad_norm": 1.4140625, "learning_rate": 3.9998111122531796e-05, "loss": 0.0898, "num_input_tokens_seen": 3017273, "step": 48 }, { "epoch": 0.010163339382940109, "grad_norm": 1.921875, "learning_rate": 3.999664203672378e-05, "loss": 0.1247, "num_input_tokens_seen": 3507672, "step": 56 }, { "epoch": 0.01161524500907441, "grad_norm": 1.0859375, "learning_rate": 3.999475326497044e-05, "loss": 0.0819, "num_input_tokens_seen": 4018539, "step": 64 }, { "epoch": 0.013067150635208712, "grad_norm": 1.6171875, "learning_rate": 3.999244484691299e-05, "loss": 0.1078, "num_input_tokens_seen": 4525857, "step": 72 }, { "epoch": 0.014519056261343012, "grad_norm": 1.3671875, "learning_rate": 3.998971683100009e-05, "loss": 0.099, "num_input_tokens_seen": 5023032, "step": 80 }, { "epoch": 0.015970961887477313, "grad_norm": 1.625, "learning_rate": 3.9986569274486843e-05, "loss": 0.0855, "num_input_tokens_seen": 5524113, "step": 88 }, { "epoch": 0.017422867513611617, "grad_norm": 1.734375, "learning_rate": 3.9983002243433615e-05, "loss": 0.1026, "num_input_tokens_seen": 5999882, "step": 96 }, { "epoch": 0.018874773139745917, "grad_norm": 3.5625, "learning_rate": 3.9979015812704605e-05, "loss": 0.0843, "num_input_tokens_seen": 6471878, "step": 104 }, { "epoch": 0.020326678765880218, "grad_norm": 1.0625, "learning_rate": 3.997461006596631e-05, "loss": 0.0841, "num_input_tokens_seen": 6944973, "step": 112 }, { "epoch": 0.021778584392014518, "grad_norm": 1.0625, "learning_rate": 3.9969785095685765e-05, "loss": 0.0982, "num_input_tokens_seen": 7460215, "step": 120 }, { "epoch": 0.02323049001814882, "grad_norm": 1.0859375, "learning_rate": 3.996454100312857e-05, "loss": 0.0971, "num_input_tokens_seen": 7942417, "step": 128 }, { "epoch": 0.024682395644283123, "grad_norm": 82.0, "learning_rate": 3.9958877898356806e-05, "loss": 0.2563, "num_input_tokens_seen": 8454243, "step": 136 }, { "epoch": 0.026134301270417423, "grad_norm": 2.21875, "learning_rate": 3.99527959002267e-05, "loss": 0.1566, "num_input_tokens_seen": 8973734, "step": 144 }, { "epoch": 0.027586206896551724, "grad_norm": 2.40625, "learning_rate": 3.994629513638614e-05, "loss": 0.1109, "num_input_tokens_seen": 9497439, "step": 152 }, { "epoch": 0.029038112522686024, "grad_norm": 3.65625, "learning_rate": 3.993937574327201e-05, "loss": 0.1353, "num_input_tokens_seen": 9988636, "step": 160 }, { "epoch": 0.030490018148820328, "grad_norm": 1.578125, "learning_rate": 3.993203786610727e-05, "loss": 0.1002, "num_input_tokens_seen": 10460548, "step": 168 }, { "epoch": 0.031941923774954625, "grad_norm": 1.1015625, "learning_rate": 3.992428165889799e-05, "loss": 0.0952, "num_input_tokens_seen": 10983644, "step": 176 }, { "epoch": 0.033393829401088926, "grad_norm": 2.515625, "learning_rate": 3.991610728443006e-05, "loss": 0.1082, "num_input_tokens_seen": 11485663, "step": 184 }, { "epoch": 0.03484573502722323, "grad_norm": 1.53125, "learning_rate": 3.9907514914265776e-05, "loss": 0.0907, "num_input_tokens_seen": 11981340, "step": 192 }, { "epoch": 0.036297640653357534, "grad_norm": 12.0625, "learning_rate": 3.989850472874027e-05, "loss": 0.0704, "num_input_tokens_seen": 12482463, "step": 200 }, { "epoch": 0.037749546279491834, "grad_norm": 1.078125, "learning_rate": 3.988907691695771e-05, "loss": 0.0847, "num_input_tokens_seen": 12968571, "step": 208 }, { "epoch": 0.039201451905626135, "grad_norm": 1.2578125, "learning_rate": 3.987923167678732e-05, "loss": 0.0968, "num_input_tokens_seen": 13451536, "step": 216 }, { "epoch": 0.040653357531760435, "grad_norm": 2.484375, "learning_rate": 3.986896921485924e-05, "loss": 0.1026, "num_input_tokens_seen": 13949131, "step": 224 }, { "epoch": 0.042105263157894736, "grad_norm": 2.453125, "learning_rate": 3.9858289746560183e-05, "loss": 0.1126, "num_input_tokens_seen": 14447251, "step": 232 }, { "epoch": 0.043557168784029036, "grad_norm": 1.2265625, "learning_rate": 3.984719349602892e-05, "loss": 0.0934, "num_input_tokens_seen": 14937783, "step": 240 }, { "epoch": 0.04500907441016334, "grad_norm": 1.75, "learning_rate": 3.983568069615157e-05, "loss": 0.0936, "num_input_tokens_seen": 15429323, "step": 248 }, { "epoch": 0.04646098003629764, "grad_norm": 1.2109375, "learning_rate": 3.982375158855672e-05, "loss": 0.0749, "num_input_tokens_seen": 15920688, "step": 256 }, { "epoch": 0.047912885662431945, "grad_norm": 1.2578125, "learning_rate": 3.981140642361034e-05, "loss": 0.0868, "num_input_tokens_seen": 16393398, "step": 264 }, { "epoch": 0.049364791288566245, "grad_norm": 1.171875, "learning_rate": 3.9798645460410544e-05, "loss": 0.0997, "num_input_tokens_seen": 16894283, "step": 272 }, { "epoch": 0.050816696914700546, "grad_norm": 0.99609375, "learning_rate": 3.9785468966782155e-05, "loss": 0.0849, "num_input_tokens_seen": 17371830, "step": 280 }, { "epoch": 0.052268602540834846, "grad_norm": 1.15625, "learning_rate": 3.9771877219271055e-05, "loss": 0.0925, "num_input_tokens_seen": 17893827, "step": 288 }, { "epoch": 0.05372050816696915, "grad_norm": 0.8125, "learning_rate": 3.975787050313841e-05, "loss": 0.0822, "num_input_tokens_seen": 18380621, "step": 296 }, { "epoch": 0.05517241379310345, "grad_norm": 1.6484375, "learning_rate": 3.9743449112354676e-05, "loss": 0.1172, "num_input_tokens_seen": 18905348, "step": 304 }, { "epoch": 0.05662431941923775, "grad_norm": 1.2734375, "learning_rate": 3.9728613349593415e-05, "loss": 0.1075, "num_input_tokens_seen": 19399905, "step": 312 }, { "epoch": 0.05807622504537205, "grad_norm": 18.25, "learning_rate": 3.971336352622496e-05, "loss": 0.1882, "num_input_tokens_seen": 19921923, "step": 320 }, { "epoch": 0.05952813067150635, "grad_norm": 1.8359375, "learning_rate": 3.969769996230989e-05, "loss": 0.1074, "num_input_tokens_seen": 20436822, "step": 328 }, { "epoch": 0.060980036297640657, "grad_norm": 1.3828125, "learning_rate": 3.968162298659227e-05, "loss": 0.1112, "num_input_tokens_seen": 20943888, "step": 336 }, { "epoch": 0.06243194192377496, "grad_norm": 1.3125, "learning_rate": 3.9665132936492794e-05, "loss": 0.1519, "num_input_tokens_seen": 21418243, "step": 344 }, { "epoch": 0.06243194192377496, "eval_loss": 0.11010845005512238, "eval_runtime": 2622.9951, "eval_samples_per_second": 1.188, "eval_steps_per_second": 0.149, "num_input_tokens_seen": 21418243, "step": 344 }, { "epoch": 0.06388384754990925, "grad_norm": 3.640625, "learning_rate": 3.9648230158101674e-05, "loss": 0.123, "num_input_tokens_seen": 21924518, "step": 352 }, { "epoch": 0.06533575317604355, "grad_norm": 1.5625, "learning_rate": 3.9630915006171416e-05, "loss": 0.1086, "num_input_tokens_seen": 22403227, "step": 360 }, { "epoch": 0.06678765880217785, "grad_norm": 3.09375, "learning_rate": 3.961318784410932e-05, "loss": 0.1068, "num_input_tokens_seen": 22901361, "step": 368 }, { "epoch": 0.06823956442831217, "grad_norm": 0.9375, "learning_rate": 3.95950490439699e-05, "loss": 0.0931, "num_input_tokens_seen": 23408098, "step": 376 }, { "epoch": 0.06969147005444647, "grad_norm": 0.9296875, "learning_rate": 3.9576498986447026e-05, "loss": 0.0817, "num_input_tokens_seen": 23890867, "step": 384 }, { "epoch": 0.07114337568058077, "grad_norm": 1.2109375, "learning_rate": 3.9557538060866005e-05, "loss": 0.0917, "num_input_tokens_seen": 24393313, "step": 392 }, { "epoch": 0.07259528130671507, "grad_norm": 1.0078125, "learning_rate": 3.9538166665175354e-05, "loss": 0.0865, "num_input_tokens_seen": 24894282, "step": 400 }, { "epoch": 0.07404718693284937, "grad_norm": 1.640625, "learning_rate": 3.9518385205938446e-05, "loss": 0.1222, "num_input_tokens_seen": 25397169, "step": 408 }, { "epoch": 0.07549909255898367, "grad_norm": 1.5859375, "learning_rate": 3.949819409832502e-05, "loss": 0.0899, "num_input_tokens_seen": 25894407, "step": 416 }, { "epoch": 0.07695099818511797, "grad_norm": 1.1640625, "learning_rate": 3.947759376610242e-05, "loss": 0.0716, "num_input_tokens_seen": 26375741, "step": 424 }, { "epoch": 0.07840290381125227, "grad_norm": 2.15625, "learning_rate": 3.945658464162674e-05, "loss": 0.1094, "num_input_tokens_seen": 26881148, "step": 432 }, { "epoch": 0.07985480943738657, "grad_norm": 1.265625, "learning_rate": 3.9435167165833724e-05, "loss": 0.1517, "num_input_tokens_seen": 27373108, "step": 440 }, { "epoch": 0.08130671506352087, "grad_norm": 7.84375, "learning_rate": 3.9413341788229524e-05, "loss": 0.0959, "num_input_tokens_seen": 27852888, "step": 448 }, { "epoch": 0.08275862068965517, "grad_norm": 2.828125, "learning_rate": 3.939110896688126e-05, "loss": 0.0824, "num_input_tokens_seen": 28338065, "step": 456 }, { "epoch": 0.08421052631578947, "grad_norm": 5.5625, "learning_rate": 3.93684691684074e-05, "loss": 0.1234, "num_input_tokens_seen": 28842856, "step": 464 }, { "epoch": 0.08566243194192377, "grad_norm": 1.8515625, "learning_rate": 3.9345422867967995e-05, "loss": 0.1118, "num_input_tokens_seen": 29349096, "step": 472 }, { "epoch": 0.08711433756805807, "grad_norm": 1.421875, "learning_rate": 3.9321970549254664e-05, "loss": 0.1055, "num_input_tokens_seen": 29826034, "step": 480 }, { "epoch": 0.08856624319419237, "grad_norm": 18.75, "learning_rate": 3.929811270448049e-05, "loss": 0.1166, "num_input_tokens_seen": 30321718, "step": 488 }, { "epoch": 0.09001814882032667, "grad_norm": 3.46875, "learning_rate": 3.927384983436964e-05, "loss": 0.1134, "num_input_tokens_seen": 30812607, "step": 496 }, { "epoch": 0.09147005444646097, "grad_norm": 1.0390625, "learning_rate": 3.924918244814689e-05, "loss": 0.0805, "num_input_tokens_seen": 31304931, "step": 504 }, { "epoch": 0.09292196007259527, "grad_norm": 1.1015625, "learning_rate": 3.922411106352694e-05, "loss": 0.0849, "num_input_tokens_seen": 31792831, "step": 512 }, { "epoch": 0.09437386569872959, "grad_norm": 1.375, "learning_rate": 3.9198636206703516e-05, "loss": 0.0919, "num_input_tokens_seen": 32286282, "step": 520 }, { "epoch": 0.09582577132486389, "grad_norm": 1.40625, "learning_rate": 3.9172758412338346e-05, "loss": 0.0896, "num_input_tokens_seen": 32770941, "step": 528 }, { "epoch": 0.09727767695099819, "grad_norm": 4.8125, "learning_rate": 3.9146478223549974e-05, "loss": 0.0925, "num_input_tokens_seen": 33253136, "step": 536 }, { "epoch": 0.09872958257713249, "grad_norm": 1.1796875, "learning_rate": 3.9119796191902274e-05, "loss": 0.0656, "num_input_tokens_seen": 33760146, "step": 544 }, { "epoch": 0.10018148820326679, "grad_norm": 3.640625, "learning_rate": 3.9092712877392965e-05, "loss": 0.1162, "num_input_tokens_seen": 34251987, "step": 552 }, { "epoch": 0.10163339382940109, "grad_norm": 2.03125, "learning_rate": 3.906522884844181e-05, "loss": 0.1153, "num_input_tokens_seen": 34730598, "step": 560 }, { "epoch": 0.10308529945553539, "grad_norm": 1.390625, "learning_rate": 3.903734468187868e-05, "loss": 0.0731, "num_input_tokens_seen": 35215481, "step": 568 }, { "epoch": 0.10453720508166969, "grad_norm": 2.515625, "learning_rate": 3.900906096293148e-05, "loss": 0.0992, "num_input_tokens_seen": 35691971, "step": 576 }, { "epoch": 0.105989110707804, "grad_norm": 0.765625, "learning_rate": 3.8980378285213846e-05, "loss": 0.1058, "num_input_tokens_seen": 36191442, "step": 584 }, { "epoch": 0.1074410163339383, "grad_norm": 1.0078125, "learning_rate": 3.895129725071268e-05, "loss": 0.0841, "num_input_tokens_seen": 36677760, "step": 592 }, { "epoch": 0.1088929219600726, "grad_norm": 1.1015625, "learning_rate": 3.892181846977553e-05, "loss": 0.096, "num_input_tokens_seen": 37169594, "step": 600 }, { "epoch": 0.1103448275862069, "grad_norm": 1.0078125, "learning_rate": 3.8891942561097787e-05, "loss": 0.0865, "num_input_tokens_seen": 37658243, "step": 608 }, { "epoch": 0.1117967332123412, "grad_norm": 3.40625, "learning_rate": 3.8861670151709664e-05, "loss": 0.0926, "num_input_tokens_seen": 38172841, "step": 616 }, { "epoch": 0.1132486388384755, "grad_norm": 1.9296875, "learning_rate": 3.883100187696308e-05, "loss": 0.0844, "num_input_tokens_seen": 38680418, "step": 624 }, { "epoch": 0.1147005444646098, "grad_norm": 0.921875, "learning_rate": 3.87999383805183e-05, "loss": 0.0889, "num_input_tokens_seen": 39168241, "step": 632 }, { "epoch": 0.1161524500907441, "grad_norm": 0.9375, "learning_rate": 3.876848031433042e-05, "loss": 0.0931, "num_input_tokens_seen": 39636702, "step": 640 }, { "epoch": 0.1176043557168784, "grad_norm": 1.03125, "learning_rate": 3.8736628338635716e-05, "loss": 0.0638, "num_input_tokens_seen": 40118232, "step": 648 }, { "epoch": 0.1190562613430127, "grad_norm": 1.4140625, "learning_rate": 3.870438312193774e-05, "loss": 0.0775, "num_input_tokens_seen": 40614511, "step": 656 }, { "epoch": 0.120508166969147, "grad_norm": 1.2734375, "learning_rate": 3.8671745340993354e-05, "loss": 0.0902, "num_input_tokens_seen": 41136221, "step": 664 }, { "epoch": 0.12196007259528131, "grad_norm": 2.140625, "learning_rate": 3.863871568079845e-05, "loss": 0.1083, "num_input_tokens_seen": 41626515, "step": 672 }, { "epoch": 0.12341197822141561, "grad_norm": 1.265625, "learning_rate": 3.860529483457362e-05, "loss": 0.0914, "num_input_tokens_seen": 42128107, "step": 680 }, { "epoch": 0.12486388384754991, "grad_norm": 1.921875, "learning_rate": 3.8571483503749625e-05, "loss": 0.1172, "num_input_tokens_seen": 42626752, "step": 688 }, { "epoch": 0.12486388384754991, "eval_loss": 0.08887020498514175, "eval_runtime": 2566.1938, "eval_samples_per_second": 1.215, "eval_steps_per_second": 0.152, "num_input_tokens_seen": 42626752, "step": 688 }, { "epoch": 0.12631578947368421, "grad_norm": 1.1875, "learning_rate": 3.8537282397952604e-05, "loss": 0.0873, "num_input_tokens_seen": 43128274, "step": 696 }, { "epoch": 0.1277676950998185, "grad_norm": 0.92578125, "learning_rate": 3.8502692234989265e-05, "loss": 0.0807, "num_input_tokens_seen": 43630580, "step": 704 }, { "epoch": 0.12921960072595282, "grad_norm": 0.59375, "learning_rate": 3.846771374083175e-05, "loss": 0.0792, "num_input_tokens_seen": 44143904, "step": 712 }, { "epoch": 0.1306715063520871, "grad_norm": 1.015625, "learning_rate": 3.843234764960244e-05, "loss": 0.0808, "num_input_tokens_seen": 44635682, "step": 720 }, { "epoch": 0.13212341197822142, "grad_norm": 0.84375, "learning_rate": 3.839659470355853e-05, "loss": 0.0902, "num_input_tokens_seen": 45110870, "step": 728 }, { "epoch": 0.1335753176043557, "grad_norm": 0.96875, "learning_rate": 3.8360455653076446e-05, "loss": 0.0872, "num_input_tokens_seen": 45620246, "step": 736 }, { "epoch": 0.13502722323049002, "grad_norm": 0.79296875, "learning_rate": 3.832393125663613e-05, "loss": 0.1095, "num_input_tokens_seen": 46106634, "step": 744 }, { "epoch": 0.13647912885662433, "grad_norm": 1.1875, "learning_rate": 3.8287022280805064e-05, "loss": 0.1008, "num_input_tokens_seen": 46599497, "step": 752 }, { "epoch": 0.13793103448275862, "grad_norm": 0.80859375, "learning_rate": 3.824972950022224e-05, "loss": 0.0761, "num_input_tokens_seen": 47098121, "step": 760 }, { "epoch": 0.13938294010889293, "grad_norm": 0.75390625, "learning_rate": 3.8212053697581855e-05, "loss": 0.0864, "num_input_tokens_seen": 47599433, "step": 768 }, { "epoch": 0.14083484573502722, "grad_norm": 0.77734375, "learning_rate": 3.817399566361692e-05, "loss": 0.0756, "num_input_tokens_seen": 48099996, "step": 776 }, { "epoch": 0.14228675136116153, "grad_norm": 0.8203125, "learning_rate": 3.8135556197082647e-05, "loss": 0.0991, "num_input_tokens_seen": 48591151, "step": 784 }, { "epoch": 0.14373865698729582, "grad_norm": 1.1875, "learning_rate": 3.809673610473967e-05, "loss": 0.0859, "num_input_tokens_seen": 49119581, "step": 792 }, { "epoch": 0.14519056261343014, "grad_norm": 0.99609375, "learning_rate": 3.805753620133715e-05, "loss": 0.0938, "num_input_tokens_seen": 49589057, "step": 800 }, { "epoch": 0.14664246823956442, "grad_norm": 1.8828125, "learning_rate": 3.801795730959565e-05, "loss": 0.0657, "num_input_tokens_seen": 50091363, "step": 808 }, { "epoch": 0.14809437386569874, "grad_norm": 1.5, "learning_rate": 3.7978000260189854e-05, "loss": 0.1124, "num_input_tokens_seen": 50595440, "step": 816 }, { "epoch": 0.14954627949183302, "grad_norm": 1.046875, "learning_rate": 3.793766589173117e-05, "loss": 0.0969, "num_input_tokens_seen": 51097536, "step": 824 }, { "epoch": 0.15099818511796734, "grad_norm": 1.2421875, "learning_rate": 3.789695505075013e-05, "loss": 0.0815, "num_input_tokens_seen": 51592933, "step": 832 }, { "epoch": 0.15245009074410162, "grad_norm": 0.640625, "learning_rate": 3.785586859167855e-05, "loss": 0.0806, "num_input_tokens_seen": 52089163, "step": 840 }, { "epoch": 0.15390199637023594, "grad_norm": 0.87109375, "learning_rate": 3.78144073768317e-05, "loss": 0.0628, "num_input_tokens_seen": 52591035, "step": 848 }, { "epoch": 0.15535390199637023, "grad_norm": 0.890625, "learning_rate": 3.7772572276390125e-05, "loss": 0.1, "num_input_tokens_seen": 53108139, "step": 856 }, { "epoch": 0.15680580762250454, "grad_norm": 1.3046875, "learning_rate": 3.7730364168381444e-05, "loss": 0.1083, "num_input_tokens_seen": 53612734, "step": 864 }, { "epoch": 0.15825771324863883, "grad_norm": 1.28125, "learning_rate": 3.768778393866186e-05, "loss": 0.0782, "num_input_tokens_seen": 54104981, "step": 872 }, { "epoch": 0.15970961887477314, "grad_norm": 1.1484375, "learning_rate": 3.764483248089763e-05, "loss": 0.1166, "num_input_tokens_seen": 54591628, "step": 880 }, { "epoch": 0.16116152450090745, "grad_norm": 0.89453125, "learning_rate": 3.760151069654626e-05, "loss": 0.0958, "num_input_tokens_seen": 55092240, "step": 888 }, { "epoch": 0.16261343012704174, "grad_norm": 1.0546875, "learning_rate": 3.75578194948376e-05, "loss": 0.0904, "num_input_tokens_seen": 55596058, "step": 896 }, { "epoch": 0.16406533575317606, "grad_norm": 0.76953125, "learning_rate": 3.751375979275479e-05, "loss": 0.0816, "num_input_tokens_seen": 56065485, "step": 904 }, { "epoch": 0.16551724137931034, "grad_norm": 1.0078125, "learning_rate": 3.746933251501497e-05, "loss": 0.0729, "num_input_tokens_seen": 56559741, "step": 912 }, { "epoch": 0.16696914700544466, "grad_norm": 0.875, "learning_rate": 3.7424538594049886e-05, "loss": 0.0626, "num_input_tokens_seen": 57042468, "step": 920 }, { "epoch": 0.16842105263157894, "grad_norm": 1.2109375, "learning_rate": 3.737937896998634e-05, "loss": 0.0872, "num_input_tokens_seen": 57530081, "step": 928 }, { "epoch": 0.16987295825771326, "grad_norm": 0.9296875, "learning_rate": 3.733385459062645e-05, "loss": 0.0863, "num_input_tokens_seen": 58052036, "step": 936 }, { "epoch": 0.17132486388384754, "grad_norm": 0.80078125, "learning_rate": 3.728796641142775e-05, "loss": 0.0747, "num_input_tokens_seen": 58558654, "step": 944 }, { "epoch": 0.17277676950998186, "grad_norm": 1.46875, "learning_rate": 3.724171539548311e-05, "loss": 0.0946, "num_input_tokens_seen": 59069780, "step": 952 }, { "epoch": 0.17422867513611615, "grad_norm": 0.94921875, "learning_rate": 3.71951025135006e-05, "loss": 0.0707, "num_input_tokens_seen": 59546270, "step": 960 }, { "epoch": 0.17568058076225046, "grad_norm": 1.1875, "learning_rate": 3.714812874378305e-05, "loss": 0.0796, "num_input_tokens_seen": 60050879, "step": 968 }, { "epoch": 0.17713248638838475, "grad_norm": 0.71484375, "learning_rate": 3.710079507220751e-05, "loss": 0.0908, "num_input_tokens_seen": 60542881, "step": 976 }, { "epoch": 0.17858439201451906, "grad_norm": 0.64453125, "learning_rate": 3.705310249220463e-05, "loss": 0.0799, "num_input_tokens_seen": 61009270, "step": 984 }, { "epoch": 0.18003629764065335, "grad_norm": 2.265625, "learning_rate": 3.700505200473774e-05, "loss": 0.0937, "num_input_tokens_seen": 61499242, "step": 992 }, { "epoch": 0.18148820326678766, "grad_norm": 2.78125, "learning_rate": 3.695664461828187e-05, "loss": 0.0913, "num_input_tokens_seen": 61987954, "step": 1000 }, { "epoch": 0.18294010889292195, "grad_norm": 0.78125, "learning_rate": 3.69078813488026e-05, "loss": 0.0546, "num_input_tokens_seen": 62482644, "step": 1008 }, { "epoch": 0.18439201451905626, "grad_norm": 1.609375, "learning_rate": 3.68587632197347e-05, "loss": 0.0788, "num_input_tokens_seen": 62950426, "step": 1016 }, { "epoch": 0.18584392014519055, "grad_norm": 0.90234375, "learning_rate": 3.6809291261960655e-05, "loss": 0.0865, "num_input_tokens_seen": 63454867, "step": 1024 }, { "epoch": 0.18729582577132486, "grad_norm": 1.390625, "learning_rate": 3.675946651378909e-05, "loss": 0.0832, "num_input_tokens_seen": 63980224, "step": 1032 }, { "epoch": 0.18729582577132486, "eval_loss": 0.07875645905733109, "eval_runtime": 2702.6122, "eval_samples_per_second": 1.153, "eval_steps_per_second": 0.144, "num_input_tokens_seen": 63980224, "step": 1032 }, { "epoch": 0.18874773139745918, "grad_norm": 0.9296875, "learning_rate": 3.67092900209329e-05, "loss": 0.0831, "num_input_tokens_seen": 64445080, "step": 1040 }, { "epoch": 0.19019963702359347, "grad_norm": 1.25, "learning_rate": 3.665876283648732e-05, "loss": 0.0697, "num_input_tokens_seen": 64941877, "step": 1048 }, { "epoch": 0.19165154264972778, "grad_norm": 0.62890625, "learning_rate": 3.660788602090788e-05, "loss": 0.0845, "num_input_tokens_seen": 65451057, "step": 1056 }, { "epoch": 0.19310344827586207, "grad_norm": 1.2265625, "learning_rate": 3.655666064198807e-05, "loss": 0.0822, "num_input_tokens_seen": 65944830, "step": 1064 }, { "epoch": 0.19455535390199638, "grad_norm": 1.125, "learning_rate": 3.6505087774836977e-05, "loss": 0.0974, "num_input_tokens_seen": 66458462, "step": 1072 }, { "epoch": 0.19600725952813067, "grad_norm": 0.7578125, "learning_rate": 3.645316850185672e-05, "loss": 0.0907, "num_input_tokens_seen": 66955532, "step": 1080 }, { "epoch": 0.19745916515426498, "grad_norm": 1.390625, "learning_rate": 3.6400903912719696e-05, "loss": 0.0791, "num_input_tokens_seen": 67453162, "step": 1088 }, { "epoch": 0.19891107078039927, "grad_norm": 0.859375, "learning_rate": 3.6348295104345764e-05, "loss": 0.0593, "num_input_tokens_seen": 67939256, "step": 1096 }, { "epoch": 0.20036297640653358, "grad_norm": 1.0546875, "learning_rate": 3.629534318087918e-05, "loss": 0.1024, "num_input_tokens_seen": 68457767, "step": 1104 }, { "epoch": 0.20181488203266787, "grad_norm": 1.0234375, "learning_rate": 3.624204925366543e-05, "loss": 0.0621, "num_input_tokens_seen": 68964063, "step": 1112 }, { "epoch": 0.20326678765880218, "grad_norm": 0.72265625, "learning_rate": 3.618841444122794e-05, "loss": 0.0685, "num_input_tokens_seen": 69443542, "step": 1120 }, { "epoch": 0.20471869328493647, "grad_norm": 0.83203125, "learning_rate": 3.613443986924455e-05, "loss": 0.0866, "num_input_tokens_seen": 69941074, "step": 1128 }, { "epoch": 0.20617059891107078, "grad_norm": 0.890625, "learning_rate": 3.60801266705239e-05, "loss": 0.0873, "num_input_tokens_seen": 70410725, "step": 1136 }, { "epoch": 0.20762250453720507, "grad_norm": 0.68359375, "learning_rate": 3.6025475984981716e-05, "loss": 0.0767, "num_input_tokens_seen": 70885703, "step": 1144 }, { "epoch": 0.20907441016333939, "grad_norm": 0.8515625, "learning_rate": 3.59704889596168e-05, "loss": 0.08, "num_input_tokens_seen": 71379385, "step": 1152 }, { "epoch": 0.21052631578947367, "grad_norm": 3.34375, "learning_rate": 3.5915166748486984e-05, "loss": 0.0974, "num_input_tokens_seen": 71863351, "step": 1160 }, { "epoch": 0.211978221415608, "grad_norm": 0.82421875, "learning_rate": 3.585951051268496e-05, "loss": 0.0799, "num_input_tokens_seen": 72351447, "step": 1168 }, { "epoch": 0.21343012704174227, "grad_norm": 0.76953125, "learning_rate": 3.5803521420313836e-05, "loss": 0.0598, "num_input_tokens_seen": 72853284, "step": 1176 }, { "epoch": 0.2148820326678766, "grad_norm": 0.92578125, "learning_rate": 3.574720064646267e-05, "loss": 0.1021, "num_input_tokens_seen": 73354953, "step": 1184 }, { "epoch": 0.2163339382940109, "grad_norm": 0.87890625, "learning_rate": 3.5690549373181785e-05, "loss": 0.0749, "num_input_tokens_seen": 73851645, "step": 1192 }, { "epoch": 0.2177858439201452, "grad_norm": 1.1015625, "learning_rate": 3.563356878945797e-05, "loss": 0.0677, "num_input_tokens_seen": 74351802, "step": 1200 }, { "epoch": 0.2192377495462795, "grad_norm": 0.84765625, "learning_rate": 3.557626009118951e-05, "loss": 0.0632, "num_input_tokens_seen": 74849173, "step": 1208 }, { "epoch": 0.2206896551724138, "grad_norm": 0.67578125, "learning_rate": 3.551862448116113e-05, "loss": 0.1037, "num_input_tokens_seen": 75333244, "step": 1216 }, { "epoch": 0.2221415607985481, "grad_norm": 1.0234375, "learning_rate": 3.546066316901869e-05, "loss": 0.0675, "num_input_tokens_seen": 75799822, "step": 1224 }, { "epoch": 0.2235934664246824, "grad_norm": 0.89453125, "learning_rate": 3.540237737124384e-05, "loss": 0.0684, "num_input_tokens_seen": 76300896, "step": 1232 }, { "epoch": 0.2250453720508167, "grad_norm": 1.1328125, "learning_rate": 3.534376831112848e-05, "loss": 0.0757, "num_input_tokens_seen": 76787655, "step": 1240 }, { "epoch": 0.226497277676951, "grad_norm": 0.8203125, "learning_rate": 3.528483721874907e-05, "loss": 0.0651, "num_input_tokens_seen": 77298718, "step": 1248 }, { "epoch": 0.2279491833030853, "grad_norm": 1.4921875, "learning_rate": 3.522558533094084e-05, "loss": 0.0863, "num_input_tokens_seen": 77797727, "step": 1256 }, { "epoch": 0.2294010889292196, "grad_norm": 0.5859375, "learning_rate": 3.51660138912718e-05, "loss": 0.0885, "num_input_tokens_seen": 78292669, "step": 1264 }, { "epoch": 0.2308529945553539, "grad_norm": 0.58203125, "learning_rate": 3.510612415001668e-05, "loss": 0.0892, "num_input_tokens_seen": 78800617, "step": 1272 }, { "epoch": 0.2323049001814882, "grad_norm": 0.88671875, "learning_rate": 3.5045917364130644e-05, "loss": 0.0527, "num_input_tokens_seen": 79317483, "step": 1280 }, { "epoch": 0.2337568058076225, "grad_norm": 0.61328125, "learning_rate": 3.4985394797222954e-05, "loss": 0.0587, "num_input_tokens_seen": 79807917, "step": 1288 }, { "epoch": 0.2352087114337568, "grad_norm": 1.3515625, "learning_rate": 3.49245577195304e-05, "loss": 0.0546, "num_input_tokens_seen": 80289419, "step": 1296 }, { "epoch": 0.2366606170598911, "grad_norm": 2.1875, "learning_rate": 3.4863407407890696e-05, "loss": 0.0982, "num_input_tokens_seen": 80784249, "step": 1304 }, { "epoch": 0.2381125226860254, "grad_norm": 2.59375, "learning_rate": 3.480194514571564e-05, "loss": 0.0965, "num_input_tokens_seen": 81278666, "step": 1312 }, { "epoch": 0.2395644283121597, "grad_norm": 1.2109375, "learning_rate": 3.474017222296419e-05, "loss": 0.0984, "num_input_tokens_seen": 81786558, "step": 1320 }, { "epoch": 0.241016333938294, "grad_norm": 0.6328125, "learning_rate": 3.4678089936115395e-05, "loss": 0.1122, "num_input_tokens_seen": 82281843, "step": 1328 }, { "epoch": 0.2424682395644283, "grad_norm": 2.765625, "learning_rate": 3.461569958814119e-05, "loss": 0.0745, "num_input_tokens_seen": 82776869, "step": 1336 }, { "epoch": 0.24392014519056263, "grad_norm": 0.984375, "learning_rate": 3.455300248847903e-05, "loss": 0.1094, "num_input_tokens_seen": 83275171, "step": 1344 }, { "epoch": 0.2453720508166969, "grad_norm": 1.03125, "learning_rate": 3.448999995300443e-05, "loss": 0.0663, "num_input_tokens_seen": 83755833, "step": 1352 }, { "epoch": 0.24682395644283123, "grad_norm": 1.5078125, "learning_rate": 3.4426693304003324e-05, "loss": 0.0879, "num_input_tokens_seen": 84237888, "step": 1360 }, { "epoch": 0.2482758620689655, "grad_norm": 1.0859375, "learning_rate": 3.4363083870144346e-05, "loss": 0.0661, "num_input_tokens_seen": 84739837, "step": 1368 }, { "epoch": 0.24972776769509983, "grad_norm": 1.3046875, "learning_rate": 3.4299172986450906e-05, "loss": 0.0764, "num_input_tokens_seen": 85221444, "step": 1376 }, { "epoch": 0.24972776769509983, "eval_loss": 0.08076217025518417, "eval_runtime": 2579.1691, "eval_samples_per_second": 1.209, "eval_steps_per_second": 0.151, "num_input_tokens_seen": 85221444, "step": 1376 }, { "epoch": 0.25117967332123414, "grad_norm": 1.0078125, "learning_rate": 3.4234961994273206e-05, "loss": 0.0714, "num_input_tokens_seen": 85711647, "step": 1384 }, { "epoch": 0.25263157894736843, "grad_norm": 0.62109375, "learning_rate": 3.417045224126004e-05, "loss": 0.0774, "num_input_tokens_seen": 86223550, "step": 1392 }, { "epoch": 0.2540834845735027, "grad_norm": 1.265625, "learning_rate": 3.410564508133058e-05, "loss": 0.0872, "num_input_tokens_seen": 86721404, "step": 1400 }, { "epoch": 0.255535390199637, "grad_norm": 1.3046875, "learning_rate": 3.40405418746459e-05, "loss": 0.0729, "num_input_tokens_seen": 87180793, "step": 1408 }, { "epoch": 0.25698729582577134, "grad_norm": 0.8984375, "learning_rate": 3.397514398758046e-05, "loss": 0.0732, "num_input_tokens_seen": 87680677, "step": 1416 }, { "epoch": 0.25843920145190563, "grad_norm": 0.5703125, "learning_rate": 3.39094527926934e-05, "loss": 0.0765, "num_input_tokens_seen": 88187512, "step": 1424 }, { "epoch": 0.2598911070780399, "grad_norm": 1.0546875, "learning_rate": 3.384346966869976e-05, "loss": 0.0684, "num_input_tokens_seen": 88692751, "step": 1432 }, { "epoch": 0.2613430127041742, "grad_norm": 2.34375, "learning_rate": 3.377719600044156e-05, "loss": 0.0878, "num_input_tokens_seen": 89183444, "step": 1440 }, { "epoch": 0.26279491833030855, "grad_norm": 0.5234375, "learning_rate": 3.371063317885868e-05, "loss": 0.0738, "num_input_tokens_seen": 89681459, "step": 1448 }, { "epoch": 0.26424682395644283, "grad_norm": 0.8046875, "learning_rate": 3.364378260095972e-05, "loss": 0.075, "num_input_tokens_seen": 90168008, "step": 1456 }, { "epoch": 0.2656987295825771, "grad_norm": 0.984375, "learning_rate": 3.3576645669792634e-05, "loss": 0.0606, "num_input_tokens_seen": 90654438, "step": 1464 }, { "epoch": 0.2671506352087114, "grad_norm": 1.1796875, "learning_rate": 3.350922379441534e-05, "loss": 0.0853, "num_input_tokens_seen": 91167951, "step": 1472 }, { "epoch": 0.26860254083484575, "grad_norm": 0.8828125, "learning_rate": 3.3441518389866075e-05, "loss": 0.0518, "num_input_tokens_seen": 91650643, "step": 1480 }, { "epoch": 0.27005444646098004, "grad_norm": 0.80859375, "learning_rate": 3.3373530877133764e-05, "loss": 0.0749, "num_input_tokens_seen": 92155336, "step": 1488 }, { "epoch": 0.2715063520871143, "grad_norm": 0.75390625, "learning_rate": 3.330526268312817e-05, "loss": 0.0583, "num_input_tokens_seen": 92628298, "step": 1496 }, { "epoch": 0.27295825771324866, "grad_norm": 0.8203125, "learning_rate": 3.323671524064992e-05, "loss": 0.0885, "num_input_tokens_seen": 93154901, "step": 1504 }, { "epoch": 0.27441016333938295, "grad_norm": 0.77734375, "learning_rate": 3.316788998836048e-05, "loss": 0.0583, "num_input_tokens_seen": 93650095, "step": 1512 }, { "epoch": 0.27586206896551724, "grad_norm": 4.5625, "learning_rate": 3.309878837075193e-05, "loss": 0.0764, "num_input_tokens_seen": 94136210, "step": 1520 }, { "epoch": 0.2773139745916515, "grad_norm": 0.80078125, "learning_rate": 3.3029411838116654e-05, "loss": 0.0638, "num_input_tokens_seen": 94624523, "step": 1528 }, { "epoch": 0.27876588021778587, "grad_norm": 1.078125, "learning_rate": 3.295976184651691e-05, "loss": 0.0685, "num_input_tokens_seen": 95110498, "step": 1536 }, { "epoch": 0.28021778584392015, "grad_norm": 0.76171875, "learning_rate": 3.288983985775426e-05, "loss": 0.0853, "num_input_tokens_seen": 95620511, "step": 1544 }, { "epoch": 0.28166969147005444, "grad_norm": 0.73046875, "learning_rate": 3.281964733933889e-05, "loss": 0.0779, "num_input_tokens_seen": 96130692, "step": 1552 }, { "epoch": 0.2831215970961887, "grad_norm": 0.80078125, "learning_rate": 3.274918576445882e-05, "loss": 0.0713, "num_input_tokens_seen": 96638367, "step": 1560 }, { "epoch": 0.28457350272232307, "grad_norm": 0.80859375, "learning_rate": 3.267845661194898e-05, "loss": 0.0653, "num_input_tokens_seen": 97154890, "step": 1568 }, { "epoch": 0.28602540834845736, "grad_norm": 0.87890625, "learning_rate": 3.260746136626016e-05, "loss": 0.0522, "num_input_tokens_seen": 97650182, "step": 1576 }, { "epoch": 0.28747731397459164, "grad_norm": 0.734375, "learning_rate": 3.253620151742788e-05, "loss": 0.0868, "num_input_tokens_seen": 98121695, "step": 1584 }, { "epoch": 0.28892921960072593, "grad_norm": 0.484375, "learning_rate": 3.24646785610411e-05, "loss": 0.0844, "num_input_tokens_seen": 98595616, "step": 1592 }, { "epoch": 0.29038112522686027, "grad_norm": 0.984375, "learning_rate": 3.239289399821083e-05, "loss": 0.0668, "num_input_tokens_seen": 99105755, "step": 1600 }, { "epoch": 0.29183303085299456, "grad_norm": 0.9765625, "learning_rate": 3.2320849335538636e-05, "loss": 0.0699, "num_input_tokens_seen": 99595258, "step": 1608 }, { "epoch": 0.29328493647912884, "grad_norm": 1.6328125, "learning_rate": 3.2248546085084995e-05, "loss": 0.0903, "num_input_tokens_seen": 100106643, "step": 1616 }, { "epoch": 0.29473684210526313, "grad_norm": 1.40625, "learning_rate": 3.21759857643376e-05, "loss": 0.0826, "num_input_tokens_seen": 100593045, "step": 1624 }, { "epoch": 0.2961887477313975, "grad_norm": 0.81640625, "learning_rate": 3.2103169896179476e-05, "loss": 0.084, "num_input_tokens_seen": 101094273, "step": 1632 }, { "epoch": 0.29764065335753176, "grad_norm": 1.046875, "learning_rate": 3.203010000885704e-05, "loss": 0.0742, "num_input_tokens_seen": 101593296, "step": 1640 }, { "epoch": 0.29909255898366605, "grad_norm": 0.75390625, "learning_rate": 3.1956777635948016e-05, "loss": 0.064, "num_input_tokens_seen": 102074203, "step": 1648 }, { "epoch": 0.3005444646098004, "grad_norm": 0.5703125, "learning_rate": 3.188320431632924e-05, "loss": 0.0569, "num_input_tokens_seen": 102576481, "step": 1656 }, { "epoch": 0.3019963702359347, "grad_norm": 0.61328125, "learning_rate": 3.180938159414439e-05, "loss": 0.0932, "num_input_tokens_seen": 103070807, "step": 1664 }, { "epoch": 0.30344827586206896, "grad_norm": 1.03125, "learning_rate": 3.173531101877155e-05, "loss": 0.0621, "num_input_tokens_seen": 103568290, "step": 1672 }, { "epoch": 0.30490018148820325, "grad_norm": 0.7734375, "learning_rate": 3.166099414479069e-05, "loss": 0.0579, "num_input_tokens_seen": 104059494, "step": 1680 }, { "epoch": 0.3063520871143376, "grad_norm": 1.1640625, "learning_rate": 3.158643253195108e-05, "loss": 0.0695, "num_input_tokens_seen": 104556886, "step": 1688 }, { "epoch": 0.3078039927404719, "grad_norm": 0.90625, "learning_rate": 3.15116277451385e-05, "loss": 0.0723, "num_input_tokens_seen": 105058562, "step": 1696 }, { "epoch": 0.30925589836660616, "grad_norm": 0.8203125, "learning_rate": 3.143658135434244e-05, "loss": 0.0652, "num_input_tokens_seen": 105536081, "step": 1704 }, { "epoch": 0.31070780399274045, "grad_norm": 0.80859375, "learning_rate": 3.136129493462312e-05, "loss": 0.0748, "num_input_tokens_seen": 106037792, "step": 1712 }, { "epoch": 0.3121597096188748, "grad_norm": 0.8203125, "learning_rate": 3.1285770066078445e-05, "loss": 0.072, "num_input_tokens_seen": 106546503, "step": 1720 }, { "epoch": 0.3121597096188748, "eval_loss": 0.06825637072324753, "eval_runtime": 2711.2246, "eval_samples_per_second": 1.15, "eval_steps_per_second": 0.144, "num_input_tokens_seen": 106546503, "step": 1720 }, { "epoch": 0.3136116152450091, "grad_norm": 1.3984375, "learning_rate": 3.121000833381084e-05, "loss": 0.0737, "num_input_tokens_seen": 107037952, "step": 1728 }, { "epoch": 0.31506352087114337, "grad_norm": 0.828125, "learning_rate": 3.113401132789399e-05, "loss": 0.0712, "num_input_tokens_seen": 107540349, "step": 1736 }, { "epoch": 0.31651542649727765, "grad_norm": 0.8515625, "learning_rate": 3.1057780643339465e-05, "loss": 0.0685, "num_input_tokens_seen": 108034983, "step": 1744 }, { "epoch": 0.317967332123412, "grad_norm": 0.80859375, "learning_rate": 3.098131788006322e-05, "loss": 0.0718, "num_input_tokens_seen": 108503192, "step": 1752 }, { "epoch": 0.3194192377495463, "grad_norm": 0.4921875, "learning_rate": 3.0904624642852065e-05, "loss": 0.076, "num_input_tokens_seen": 109019554, "step": 1760 }, { "epoch": 0.32087114337568057, "grad_norm": 1.265625, "learning_rate": 3.082770254132993e-05, "loss": 0.0549, "num_input_tokens_seen": 109504850, "step": 1768 }, { "epoch": 0.3223230490018149, "grad_norm": 0.66796875, "learning_rate": 3.075055318992412e-05, "loss": 0.068, "num_input_tokens_seen": 110008850, "step": 1776 }, { "epoch": 0.3237749546279492, "grad_norm": 0.78125, "learning_rate": 3.067317820783143e-05, "loss": 0.0676, "num_input_tokens_seen": 110528376, "step": 1784 }, { "epoch": 0.3252268602540835, "grad_norm": 0.62890625, "learning_rate": 3.0595579218984124e-05, "loss": 0.0862, "num_input_tokens_seen": 111026349, "step": 1792 }, { "epoch": 0.32667876588021777, "grad_norm": 0.71484375, "learning_rate": 3.05177578520159e-05, "loss": 0.0561, "num_input_tokens_seen": 111515922, "step": 1800 }, { "epoch": 0.3281306715063521, "grad_norm": 0.76171875, "learning_rate": 3.04397157402277e-05, "loss": 0.0599, "num_input_tokens_seen": 112007455, "step": 1808 }, { "epoch": 0.3295825771324864, "grad_norm": 0.60546875, "learning_rate": 3.0361454521553383e-05, "loss": 0.0856, "num_input_tokens_seen": 112491694, "step": 1816 }, { "epoch": 0.3310344827586207, "grad_norm": 0.69140625, "learning_rate": 3.028297583852541e-05, "loss": 0.055, "num_input_tokens_seen": 112968009, "step": 1824 }, { "epoch": 0.33248638838475497, "grad_norm": 1.2265625, "learning_rate": 3.020428133824035e-05, "loss": 0.0495, "num_input_tokens_seen": 113462356, "step": 1832 }, { "epoch": 0.3339382940108893, "grad_norm": 0.9140625, "learning_rate": 3.0125372672324285e-05, "loss": 0.0765, "num_input_tokens_seen": 113976443, "step": 1840 }, { "epoch": 0.3353901996370236, "grad_norm": 0.60546875, "learning_rate": 3.0046251496898177e-05, "loss": 0.0521, "num_input_tokens_seen": 114445408, "step": 1848 }, { "epoch": 0.3368421052631579, "grad_norm": 1.0, "learning_rate": 2.9966919472543098e-05, "loss": 0.0659, "num_input_tokens_seen": 114933077, "step": 1856 }, { "epoch": 0.3382940108892922, "grad_norm": 0.8203125, "learning_rate": 2.9887378264265387e-05, "loss": 0.0853, "num_input_tokens_seen": 115416098, "step": 1864 }, { "epoch": 0.3397459165154265, "grad_norm": 0.640625, "learning_rate": 2.9807629541461693e-05, "loss": 0.0611, "num_input_tokens_seen": 115937997, "step": 1872 }, { "epoch": 0.3411978221415608, "grad_norm": 0.76953125, "learning_rate": 2.972767497788393e-05, "loss": 0.048, "num_input_tokens_seen": 116441850, "step": 1880 }, { "epoch": 0.3426497277676951, "grad_norm": 1.046875, "learning_rate": 2.9647516251604192e-05, "loss": 0.0777, "num_input_tokens_seen": 116937086, "step": 1888 }, { "epoch": 0.3441016333938294, "grad_norm": 0.81640625, "learning_rate": 2.9567155044979466e-05, "loss": 0.0598, "num_input_tokens_seen": 117443956, "step": 1896 }, { "epoch": 0.3455535390199637, "grad_norm": 1.40625, "learning_rate": 2.9486593044616394e-05, "loss": 0.0686, "num_input_tokens_seen": 117937379, "step": 1904 }, { "epoch": 0.347005444646098, "grad_norm": 0.72265625, "learning_rate": 2.9405831941335816e-05, "loss": 0.053, "num_input_tokens_seen": 118423431, "step": 1912 }, { "epoch": 0.3484573502722323, "grad_norm": 0.5625, "learning_rate": 2.932487343013732e-05, "loss": 0.0485, "num_input_tokens_seen": 118938547, "step": 1920 }, { "epoch": 0.34990925589836663, "grad_norm": 0.7265625, "learning_rate": 2.9243719210163654e-05, "loss": 0.076, "num_input_tokens_seen": 119414827, "step": 1928 }, { "epoch": 0.3513611615245009, "grad_norm": 0.62890625, "learning_rate": 2.916237098466507e-05, "loss": 0.037, "num_input_tokens_seen": 119906010, "step": 1936 }, { "epoch": 0.3528130671506352, "grad_norm": 0.66015625, "learning_rate": 2.9080830460963563e-05, "loss": 0.0561, "num_input_tokens_seen": 120390508, "step": 1944 }, { "epoch": 0.3542649727767695, "grad_norm": 0.87890625, "learning_rate": 2.8999099350417065e-05, "loss": 0.0846, "num_input_tokens_seen": 120863309, "step": 1952 }, { "epoch": 0.35571687840290384, "grad_norm": 0.73046875, "learning_rate": 2.8917179368383493e-05, "loss": 0.0403, "num_input_tokens_seen": 121339176, "step": 1960 }, { "epoch": 0.3571687840290381, "grad_norm": 0.453125, "learning_rate": 2.883507223418478e-05, "loss": 0.0645, "num_input_tokens_seen": 121867501, "step": 1968 }, { "epoch": 0.3586206896551724, "grad_norm": 1.21875, "learning_rate": 2.875277967107076e-05, "loss": 0.0911, "num_input_tokens_seen": 122375421, "step": 1976 }, { "epoch": 0.3600725952813067, "grad_norm": 0.90234375, "learning_rate": 2.867030340618303e-05, "loss": 0.0454, "num_input_tokens_seen": 122856601, "step": 1984 }, { "epoch": 0.36152450090744104, "grad_norm": 0.546875, "learning_rate": 2.858764517051868e-05, "loss": 0.0615, "num_input_tokens_seen": 123347371, "step": 1992 }, { "epoch": 0.3629764065335753, "grad_norm": 0.369140625, "learning_rate": 2.850480669889397e-05, "loss": 0.0536, "num_input_tokens_seen": 123846779, "step": 2000 }, { "epoch": 0.3644283121597096, "grad_norm": 1.875, "learning_rate": 2.8421789729907928e-05, "loss": 0.0499, "num_input_tokens_seen": 124332390, "step": 2008 }, { "epoch": 0.3658802177858439, "grad_norm": 0.53125, "learning_rate": 2.833859600590583e-05, "loss": 0.076, "num_input_tokens_seen": 124806640, "step": 2016 }, { "epoch": 0.36733212341197824, "grad_norm": 0.98828125, "learning_rate": 2.825522727294268e-05, "loss": 0.0347, "num_input_tokens_seen": 125289556, "step": 2024 }, { "epoch": 0.3687840290381125, "grad_norm": 0.765625, "learning_rate": 2.817168528074654e-05, "loss": 0.0854, "num_input_tokens_seen": 125783042, "step": 2032 }, { "epoch": 0.3702359346642468, "grad_norm": 0.7109375, "learning_rate": 2.8087971782681774e-05, "loss": 0.0731, "num_input_tokens_seen": 126277662, "step": 2040 }, { "epoch": 0.3716878402903811, "grad_norm": 0.7265625, "learning_rate": 2.8004088535712315e-05, "loss": 0.0833, "num_input_tokens_seen": 126770182, "step": 2048 }, { "epoch": 0.37313974591651544, "grad_norm": 0.84375, "learning_rate": 2.7920037300364746e-05, "loss": 0.0752, "num_input_tokens_seen": 127265873, "step": 2056 }, { "epoch": 0.37459165154264973, "grad_norm": 1.046875, "learning_rate": 2.783581984069134e-05, "loss": 0.0652, "num_input_tokens_seen": 127767598, "step": 2064 }, { "epoch": 0.37459165154264973, "eval_loss": 0.06295192986726761, "eval_runtime": 2754.9055, "eval_samples_per_second": 1.131, "eval_steps_per_second": 0.142, "num_input_tokens_seen": 127767598, "step": 2064 }, { "epoch": 0.376043557168784, "grad_norm": 1.9609375, "learning_rate": 2.7751437924233093e-05, "loss": 0.06, "num_input_tokens_seen": 128256289, "step": 2072 }, { "epoch": 0.37749546279491836, "grad_norm": 1.421875, "learning_rate": 2.7666893321982548e-05, "loss": 0.0714, "num_input_tokens_seen": 128789423, "step": 2080 }, { "epoch": 0.37894736842105264, "grad_norm": 0.7265625, "learning_rate": 2.758218780834671e-05, "loss": 0.0608, "num_input_tokens_seen": 129283910, "step": 2088 }, { "epoch": 0.38039927404718693, "grad_norm": 0.87109375, "learning_rate": 2.7497323161109734e-05, "loss": 0.0567, "num_input_tokens_seen": 129762227, "step": 2096 }, { "epoch": 0.3818511796733212, "grad_norm": 0.71484375, "learning_rate": 2.741230116139565e-05, "loss": 0.0822, "num_input_tokens_seen": 130260949, "step": 2104 }, { "epoch": 0.38330308529945556, "grad_norm": 1.328125, "learning_rate": 2.7327123593630984e-05, "loss": 0.0744, "num_input_tokens_seen": 130738461, "step": 2112 }, { "epoch": 0.38475499092558985, "grad_norm": 0.70703125, "learning_rate": 2.7241792245507284e-05, "loss": 0.0428, "num_input_tokens_seen": 131250070, "step": 2120 }, { "epoch": 0.38620689655172413, "grad_norm": 1.0234375, "learning_rate": 2.715630890794362e-05, "loss": 0.0764, "num_input_tokens_seen": 131731607, "step": 2128 }, { "epoch": 0.3876588021778584, "grad_norm": 0.92578125, "learning_rate": 2.7070675375048984e-05, "loss": 0.0464, "num_input_tokens_seen": 132241144, "step": 2136 }, { "epoch": 0.38911070780399276, "grad_norm": 0.83984375, "learning_rate": 2.698489344408464e-05, "loss": 0.0598, "num_input_tokens_seen": 132728134, "step": 2144 }, { "epoch": 0.39056261343012705, "grad_norm": 1.1953125, "learning_rate": 2.689896491542642e-05, "loss": 0.0897, "num_input_tokens_seen": 133209860, "step": 2152 }, { "epoch": 0.39201451905626133, "grad_norm": 1.1015625, "learning_rate": 2.681289159252689e-05, "loss": 0.0525, "num_input_tokens_seen": 133711627, "step": 2160 }, { "epoch": 0.3934664246823956, "grad_norm": 0.65625, "learning_rate": 2.6726675281877567e-05, "loss": 0.0602, "num_input_tokens_seen": 134198176, "step": 2168 }, { "epoch": 0.39491833030852996, "grad_norm": 0.69921875, "learning_rate": 2.6640317792970947e-05, "loss": 0.0562, "num_input_tokens_seen": 134689114, "step": 2176 }, { "epoch": 0.39637023593466425, "grad_norm": 0.72265625, "learning_rate": 2.6553820938262557e-05, "loss": 0.0341, "num_input_tokens_seen": 135179499, "step": 2184 }, { "epoch": 0.39782214156079854, "grad_norm": 1.0234375, "learning_rate": 2.6467186533132906e-05, "loss": 0.0783, "num_input_tokens_seen": 135700208, "step": 2192 }, { "epoch": 0.3992740471869328, "grad_norm": 0.58984375, "learning_rate": 2.638041639584939e-05, "loss": 0.0604, "num_input_tokens_seen": 136212202, "step": 2200 }, { "epoch": 0.40072595281306717, "grad_norm": 0.55859375, "learning_rate": 2.6293512347528122e-05, "loss": 0.0591, "num_input_tokens_seen": 136698380, "step": 2208 }, { "epoch": 0.40217785843920145, "grad_norm": 0.66796875, "learning_rate": 2.6206476212095734e-05, "loss": 0.0743, "num_input_tokens_seen": 137191271, "step": 2216 }, { "epoch": 0.40362976406533574, "grad_norm": 0.5859375, "learning_rate": 2.6119309816251042e-05, "loss": 0.0437, "num_input_tokens_seen": 137660173, "step": 2224 }, { "epoch": 0.4050816696914701, "grad_norm": 0.8671875, "learning_rate": 2.6032014989426784e-05, "loss": 0.0597, "num_input_tokens_seen": 138165909, "step": 2232 }, { "epoch": 0.40653357531760437, "grad_norm": 0.7734375, "learning_rate": 2.594459356375116e-05, "loss": 0.0504, "num_input_tokens_seen": 138631528, "step": 2240 }, { "epoch": 0.40798548094373865, "grad_norm": 0.71484375, "learning_rate": 2.585704737400941e-05, "loss": 0.0611, "num_input_tokens_seen": 139130348, "step": 2248 }, { "epoch": 0.40943738656987294, "grad_norm": 0.6640625, "learning_rate": 2.57693782576053e-05, "loss": 0.0461, "num_input_tokens_seen": 139617268, "step": 2256 }, { "epoch": 0.4108892921960073, "grad_norm": 0.67578125, "learning_rate": 2.568158805452256e-05, "loss": 0.062, "num_input_tokens_seen": 140121646, "step": 2264 }, { "epoch": 0.41234119782214157, "grad_norm": 0.73828125, "learning_rate": 2.559367860728627e-05, "loss": 0.0506, "num_input_tokens_seen": 140625443, "step": 2272 }, { "epoch": 0.41379310344827586, "grad_norm": 0.703125, "learning_rate": 2.5505651760924182e-05, "loss": 0.0757, "num_input_tokens_seen": 141135512, "step": 2280 }, { "epoch": 0.41524500907441014, "grad_norm": 0.56640625, "learning_rate": 2.5417509362927986e-05, "loss": 0.078, "num_input_tokens_seen": 141614186, "step": 2288 }, { "epoch": 0.4166969147005445, "grad_norm": 0.98828125, "learning_rate": 2.5329253263214573e-05, "loss": 0.0549, "num_input_tokens_seen": 142126285, "step": 2296 }, { "epoch": 0.41814882032667877, "grad_norm": 0.49609375, "learning_rate": 2.5240885314087162e-05, "loss": 0.0592, "num_input_tokens_seen": 142609607, "step": 2304 }, { "epoch": 0.41960072595281306, "grad_norm": 0.890625, "learning_rate": 2.5152407370196467e-05, "loss": 0.0477, "num_input_tokens_seen": 143090080, "step": 2312 }, { "epoch": 0.42105263157894735, "grad_norm": 0.77734375, "learning_rate": 2.5063821288501746e-05, "loss": 0.0576, "num_input_tokens_seen": 143576776, "step": 2320 }, { "epoch": 0.4225045372050817, "grad_norm": 0.5859375, "learning_rate": 2.4975128928231823e-05, "loss": 0.0671, "num_input_tokens_seen": 144070311, "step": 2328 }, { "epoch": 0.423956442831216, "grad_norm": 0.97265625, "learning_rate": 2.4886332150846092e-05, "loss": 0.0637, "num_input_tokens_seen": 144581612, "step": 2336 }, { "epoch": 0.42540834845735026, "grad_norm": 0.55078125, "learning_rate": 2.4797432819995427e-05, "loss": 0.0496, "num_input_tokens_seen": 145085129, "step": 2344 }, { "epoch": 0.42686025408348455, "grad_norm": 0.8046875, "learning_rate": 2.4708432801483086e-05, "loss": 0.0662, "num_input_tokens_seen": 145568633, "step": 2352 }, { "epoch": 0.4283121597096189, "grad_norm": 0.84375, "learning_rate": 2.4619333963225525e-05, "loss": 0.059, "num_input_tokens_seen": 146076350, "step": 2360 }, { "epoch": 0.4297640653357532, "grad_norm": 1.1015625, "learning_rate": 2.4530138175213222e-05, "loss": 0.1076, "num_input_tokens_seen": 146577893, "step": 2368 }, { "epoch": 0.43121597096188746, "grad_norm": 0.89453125, "learning_rate": 2.4440847309471422e-05, "loss": 0.0794, "num_input_tokens_seen": 147074725, "step": 2376 }, { "epoch": 0.4326678765880218, "grad_norm": 0.9375, "learning_rate": 2.435146324002083e-05, "loss": 0.0537, "num_input_tokens_seen": 147559139, "step": 2384 }, { "epoch": 0.4341197822141561, "grad_norm": 0.59765625, "learning_rate": 2.426198784283831e-05, "loss": 0.0429, "num_input_tokens_seen": 148055859, "step": 2392 }, { "epoch": 0.4355716878402904, "grad_norm": 0.369140625, "learning_rate": 2.4172422995817496e-05, "loss": 0.0583, "num_input_tokens_seen": 148559803, "step": 2400 }, { "epoch": 0.43702359346642466, "grad_norm": 1.515625, "learning_rate": 2.408277057872936e-05, "loss": 0.0693, "num_input_tokens_seen": 149047633, "step": 2408 }, { "epoch": 0.43702359346642466, "eval_loss": 0.05809076130390167, "eval_runtime": 2813.328, "eval_samples_per_second": 1.108, "eval_steps_per_second": 0.139, "num_input_tokens_seen": 149047633, "step": 2408 }, { "epoch": 0.438475499092559, "grad_norm": 0.7265625, "learning_rate": 2.3993032473182796e-05, "loss": 0.0627, "num_input_tokens_seen": 149553600, "step": 2416 }, { "epoch": 0.4399274047186933, "grad_norm": 0.70703125, "learning_rate": 2.390321056258511e-05, "loss": 0.0518, "num_input_tokens_seen": 150031007, "step": 2424 }, { "epoch": 0.4413793103448276, "grad_norm": 0.6640625, "learning_rate": 2.3813306732102483e-05, "loss": 0.0564, "num_input_tokens_seen": 150506503, "step": 2432 }, { "epoch": 0.44283121597096187, "grad_norm": 0.75390625, "learning_rate": 2.3723322868620436e-05, "loss": 0.0728, "num_input_tokens_seen": 151018070, "step": 2440 }, { "epoch": 0.4442831215970962, "grad_norm": 0.453125, "learning_rate": 2.3633260860704188e-05, "loss": 0.0428, "num_input_tokens_seen": 151507916, "step": 2448 }, { "epoch": 0.4457350272232305, "grad_norm": 0.93359375, "learning_rate": 2.3543122598559053e-05, "loss": 0.0458, "num_input_tokens_seen": 151999967, "step": 2456 }, { "epoch": 0.4471869328493648, "grad_norm": 1.609375, "learning_rate": 2.345290997399074e-05, "loss": 0.051, "num_input_tokens_seen": 152499025, "step": 2464 }, { "epoch": 0.44863883847549907, "grad_norm": 1.3984375, "learning_rate": 2.3362624880365677e-05, "loss": 0.0713, "num_input_tokens_seen": 152984867, "step": 2472 }, { "epoch": 0.4500907441016334, "grad_norm": 0.91796875, "learning_rate": 2.3272269212571262e-05, "loss": 0.0627, "num_input_tokens_seen": 153473082, "step": 2480 }, { "epoch": 0.4515426497277677, "grad_norm": 0.55859375, "learning_rate": 2.3181844866976076e-05, "loss": 0.048, "num_input_tokens_seen": 153951602, "step": 2488 }, { "epoch": 0.452994555353902, "grad_norm": 0.46875, "learning_rate": 2.3091353741390116e-05, "loss": 0.0476, "num_input_tokens_seen": 154432971, "step": 2496 }, { "epoch": 0.45444646098003627, "grad_norm": 0.97265625, "learning_rate": 2.3000797735024922e-05, "loss": 0.049, "num_input_tokens_seen": 154912331, "step": 2504 }, { "epoch": 0.4558983666061706, "grad_norm": 0.94140625, "learning_rate": 2.2910178748453765e-05, "loss": 0.0544, "num_input_tokens_seen": 155385055, "step": 2512 }, { "epoch": 0.4573502722323049, "grad_norm": 0.76953125, "learning_rate": 2.2819498683571718e-05, "loss": 0.0494, "num_input_tokens_seen": 155892191, "step": 2520 }, { "epoch": 0.4588021778584392, "grad_norm": 0.625, "learning_rate": 2.272875944355575e-05, "loss": 0.066, "num_input_tokens_seen": 156405102, "step": 2528 }, { "epoch": 0.46025408348457353, "grad_norm": 0.7734375, "learning_rate": 2.2637962932824803e-05, "loss": 0.0605, "num_input_tokens_seen": 156909466, "step": 2536 }, { "epoch": 0.4617059891107078, "grad_norm": 0.6640625, "learning_rate": 2.2547111056999808e-05, "loss": 0.0394, "num_input_tokens_seen": 157391122, "step": 2544 }, { "epoch": 0.4631578947368421, "grad_norm": 0.361328125, "learning_rate": 2.245620572286366e-05, "loss": 0.0525, "num_input_tokens_seen": 157880121, "step": 2552 }, { "epoch": 0.4646098003629764, "grad_norm": 0.494140625, "learning_rate": 2.2365248838321273e-05, "loss": 0.0491, "num_input_tokens_seen": 158360167, "step": 2560 }, { "epoch": 0.46606170598911073, "grad_norm": 0.52734375, "learning_rate": 2.2274242312359445e-05, "loss": 0.0528, "num_input_tokens_seen": 158867422, "step": 2568 }, { "epoch": 0.467513611615245, "grad_norm": 0.671875, "learning_rate": 2.2183188055006867e-05, "loss": 0.0679, "num_input_tokens_seen": 159364296, "step": 2576 }, { "epoch": 0.4689655172413793, "grad_norm": 0.59375, "learning_rate": 2.2092087977294e-05, "loss": 0.0744, "num_input_tokens_seen": 159890619, "step": 2584 }, { "epoch": 0.4704174228675136, "grad_norm": 0.68359375, "learning_rate": 2.2000943991212977e-05, "loss": 0.0419, "num_input_tokens_seen": 160398651, "step": 2592 }, { "epoch": 0.47186932849364793, "grad_norm": 0.73828125, "learning_rate": 2.190975800967747e-05, "loss": 0.0616, "num_input_tokens_seen": 160922909, "step": 2600 }, { "epoch": 0.4733212341197822, "grad_norm": 0.5390625, "learning_rate": 2.1818531946482543e-05, "loss": 0.0442, "num_input_tokens_seen": 161419902, "step": 2608 }, { "epoch": 0.4747731397459165, "grad_norm": 0.625, "learning_rate": 2.172726771626449e-05, "loss": 0.0469, "num_input_tokens_seen": 161929180, "step": 2616 }, { "epoch": 0.4762250453720508, "grad_norm": 0.63671875, "learning_rate": 2.163596723446065e-05, "loss": 0.0573, "num_input_tokens_seen": 162437709, "step": 2624 }, { "epoch": 0.47767695099818513, "grad_norm": 1.046875, "learning_rate": 2.1544632417269194e-05, "loss": 0.052, "num_input_tokens_seen": 162950151, "step": 2632 }, { "epoch": 0.4791288566243194, "grad_norm": 0.90234375, "learning_rate": 2.145326518160893e-05, "loss": 0.0576, "num_input_tokens_seen": 163429462, "step": 2640 }, { "epoch": 0.4805807622504537, "grad_norm": 0.578125, "learning_rate": 2.136186744507904e-05, "loss": 0.0577, "num_input_tokens_seen": 163939160, "step": 2648 }, { "epoch": 0.482032667876588, "grad_norm": 0.4921875, "learning_rate": 2.1270441125918882e-05, "loss": 0.051, "num_input_tokens_seen": 164446079, "step": 2656 }, { "epoch": 0.48348457350272234, "grad_norm": 0.58984375, "learning_rate": 2.1178988142967678e-05, "loss": 0.0489, "num_input_tokens_seen": 164936233, "step": 2664 }, { "epoch": 0.4849364791288566, "grad_norm": 0.91015625, "learning_rate": 2.108751041562427e-05, "loss": 0.0622, "num_input_tokens_seen": 165409965, "step": 2672 }, { "epoch": 0.4863883847549909, "grad_norm": 0.5234375, "learning_rate": 2.0996009863806834e-05, "loss": 0.0578, "num_input_tokens_seen": 165901841, "step": 2680 }, { "epoch": 0.48784029038112525, "grad_norm": 0.88671875, "learning_rate": 2.0904488407912575e-05, "loss": 0.0389, "num_input_tokens_seen": 166384603, "step": 2688 }, { "epoch": 0.48929219600725954, "grad_norm": 0.34375, "learning_rate": 2.0812947968777437e-05, "loss": 0.0432, "num_input_tokens_seen": 166889709, "step": 2696 }, { "epoch": 0.4907441016333938, "grad_norm": 0.9296875, "learning_rate": 2.0721390467635788e-05, "loss": 0.0453, "num_input_tokens_seen": 167372121, "step": 2704 }, { "epoch": 0.4921960072595281, "grad_norm": 0.4609375, "learning_rate": 2.0629817826080073e-05, "loss": 0.0447, "num_input_tokens_seen": 167871991, "step": 2712 }, { "epoch": 0.49364791288566245, "grad_norm": 0.953125, "learning_rate": 2.053823196602051e-05, "loss": 0.0543, "num_input_tokens_seen": 168369985, "step": 2720 }, { "epoch": 0.49509981851179674, "grad_norm": 0.58203125, "learning_rate": 2.044663480964474e-05, "loss": 0.0416, "num_input_tokens_seen": 168846412, "step": 2728 }, { "epoch": 0.496551724137931, "grad_norm": 0.6171875, "learning_rate": 2.0355028279377498e-05, "loss": 0.0467, "num_input_tokens_seen": 169335334, "step": 2736 }, { "epoch": 0.4980036297640653, "grad_norm": 0.67578125, "learning_rate": 2.026341429784025e-05, "loss": 0.0724, "num_input_tokens_seen": 169830612, "step": 2744 }, { "epoch": 0.49945553539019966, "grad_norm": 0.53125, "learning_rate": 2.0171794787810842e-05, "loss": 0.0723, "num_input_tokens_seen": 170349739, "step": 2752 }, { "epoch": 0.49945553539019966, "eval_loss": 0.054387591779232025, "eval_runtime": 2838.6975, "eval_samples_per_second": 1.098, "eval_steps_per_second": 0.137, "num_input_tokens_seen": 170349739, "step": 2752 }, { "epoch": 0.5009074410163339, "grad_norm": 0.5390625, "learning_rate": 2.008017167218317e-05, "loss": 0.0365, "num_input_tokens_seen": 170843316, "step": 2760 }, { "epoch": 0.5023593466424683, "grad_norm": 0.6640625, "learning_rate": 1.9988546873926788e-05, "loss": 0.0456, "num_input_tokens_seen": 171324496, "step": 2768 }, { "epoch": 0.5038112522686026, "grad_norm": 0.71875, "learning_rate": 1.9896922316046562e-05, "loss": 0.0416, "num_input_tokens_seen": 171829665, "step": 2776 }, { "epoch": 0.5052631578947369, "grad_norm": 0.5625, "learning_rate": 1.980529992154233e-05, "loss": 0.0395, "num_input_tokens_seen": 172325874, "step": 2784 }, { "epoch": 0.5067150635208711, "grad_norm": 0.490234375, "learning_rate": 1.9713681613368506e-05, "loss": 0.0536, "num_input_tokens_seen": 172832464, "step": 2792 }, { "epoch": 0.5081669691470054, "grad_norm": 0.78125, "learning_rate": 1.9622069314393753e-05, "loss": 0.0505, "num_input_tokens_seen": 173320567, "step": 2800 }, { "epoch": 0.5096188747731397, "grad_norm": 0.75390625, "learning_rate": 1.9530464947360615e-05, "loss": 0.0528, "num_input_tokens_seen": 173816293, "step": 2808 }, { "epoch": 0.511070780399274, "grad_norm": 0.74609375, "learning_rate": 1.943887043484515e-05, "loss": 0.0766, "num_input_tokens_seen": 174302982, "step": 2816 }, { "epoch": 0.5125226860254084, "grad_norm": 0.80859375, "learning_rate": 1.9347287699216602e-05, "loss": 0.0574, "num_input_tokens_seen": 174807598, "step": 2824 }, { "epoch": 0.5139745916515427, "grad_norm": 1.1875, "learning_rate": 1.9255718662597044e-05, "loss": 0.0667, "num_input_tokens_seen": 175302323, "step": 2832 }, { "epoch": 0.515426497277677, "grad_norm": 0.59765625, "learning_rate": 1.9164165246821026e-05, "loss": 0.0434, "num_input_tokens_seen": 175782712, "step": 2840 }, { "epoch": 0.5168784029038113, "grad_norm": 0.6328125, "learning_rate": 1.9072629373395268e-05, "loss": 0.0573, "num_input_tokens_seen": 176252965, "step": 2848 }, { "epoch": 0.5183303085299455, "grad_norm": 0.7109375, "learning_rate": 1.8981112963458293e-05, "loss": 0.0541, "num_input_tokens_seen": 176746353, "step": 2856 }, { "epoch": 0.5197822141560798, "grad_norm": 1.0859375, "learning_rate": 1.8889617937740146e-05, "loss": 0.0457, "num_input_tokens_seen": 177252614, "step": 2864 }, { "epoch": 0.5212341197822141, "grad_norm": 0.73828125, "learning_rate": 1.879814621652206e-05, "loss": 0.0588, "num_input_tokens_seen": 177752505, "step": 2872 }, { "epoch": 0.5226860254083484, "grad_norm": 0.83984375, "learning_rate": 1.8706699719596138e-05, "loss": 0.0717, "num_input_tokens_seen": 178248588, "step": 2880 }, { "epoch": 0.5241379310344828, "grad_norm": 0.95703125, "learning_rate": 1.8615280366225113e-05, "loss": 0.0634, "num_input_tokens_seen": 178746624, "step": 2888 }, { "epoch": 0.5255898366606171, "grad_norm": 0.703125, "learning_rate": 1.852389007510201e-05, "loss": 0.0573, "num_input_tokens_seen": 179239200, "step": 2896 }, { "epoch": 0.5270417422867514, "grad_norm": 0.96484375, "learning_rate": 1.8432530764309916e-05, "loss": 0.0574, "num_input_tokens_seen": 179731398, "step": 2904 }, { "epoch": 0.5284936479128857, "grad_norm": 0.58203125, "learning_rate": 1.8341204351281684e-05, "loss": 0.0786, "num_input_tokens_seen": 180216141, "step": 2912 }, { "epoch": 0.52994555353902, "grad_norm": 0.4765625, "learning_rate": 1.8249912752759748e-05, "loss": 0.0481, "num_input_tokens_seen": 180719896, "step": 2920 }, { "epoch": 0.5313974591651542, "grad_norm": 0.64453125, "learning_rate": 1.8158657884755832e-05, "loss": 0.0595, "num_input_tokens_seen": 181215874, "step": 2928 }, { "epoch": 0.5328493647912885, "grad_norm": 0.6953125, "learning_rate": 1.8067441662510782e-05, "loss": 0.0495, "num_input_tokens_seen": 181715660, "step": 2936 }, { "epoch": 0.5343012704174228, "grad_norm": 0.53515625, "learning_rate": 1.797626600045435e-05, "loss": 0.0507, "num_input_tokens_seen": 182189644, "step": 2944 }, { "epoch": 0.5357531760435572, "grad_norm": 0.88671875, "learning_rate": 1.7885132812165022e-05, "loss": 0.0457, "num_input_tokens_seen": 182692258, "step": 2952 }, { "epoch": 0.5372050816696915, "grad_norm": 0.48828125, "learning_rate": 1.7794044010329844e-05, "loss": 0.0454, "num_input_tokens_seen": 183173683, "step": 2960 }, { "epoch": 0.5386569872958258, "grad_norm": 1.015625, "learning_rate": 1.7703001506704297e-05, "loss": 0.0612, "num_input_tokens_seen": 183670207, "step": 2968 }, { "epoch": 0.5401088929219601, "grad_norm": 0.6796875, "learning_rate": 1.761200721207215e-05, "loss": 0.0559, "num_input_tokens_seen": 184191448, "step": 2976 }, { "epoch": 0.5415607985480944, "grad_norm": 0.65625, "learning_rate": 1.7521063036205383e-05, "loss": 0.032, "num_input_tokens_seen": 184672691, "step": 2984 }, { "epoch": 0.5430127041742286, "grad_norm": 0.625, "learning_rate": 1.7430170887824088e-05, "loss": 0.0597, "num_input_tokens_seen": 185179876, "step": 2992 }, { "epoch": 0.5444646098003629, "grad_norm": 0.734375, "learning_rate": 1.7339332674556408e-05, "loss": 0.0566, "num_input_tokens_seen": 185659670, "step": 3000 }, { "epoch": 0.5459165154264973, "grad_norm": 0.279296875, "learning_rate": 1.724855030289852e-05, "loss": 0.028, "num_input_tokens_seen": 186148613, "step": 3008 }, { "epoch": 0.5473684210526316, "grad_norm": 0.87109375, "learning_rate": 1.715782567817459e-05, "loss": 0.0567, "num_input_tokens_seen": 186651171, "step": 3016 }, { "epoch": 0.5488203266787659, "grad_norm": 0.71484375, "learning_rate": 1.7067160704496817e-05, "loss": 0.0584, "num_input_tokens_seen": 187155654, "step": 3024 }, { "epoch": 0.5502722323049002, "grad_norm": 1.078125, "learning_rate": 1.6976557284725434e-05, "loss": 0.0554, "num_input_tokens_seen": 187631290, "step": 3032 }, { "epoch": 0.5517241379310345, "grad_norm": 0.5390625, "learning_rate": 1.6886017320428817e-05, "loss": 0.0654, "num_input_tokens_seen": 188114682, "step": 3040 }, { "epoch": 0.5531760435571688, "grad_norm": 0.7734375, "learning_rate": 1.6795542711843535e-05, "loss": 0.0489, "num_input_tokens_seen": 188586657, "step": 3048 }, { "epoch": 0.554627949183303, "grad_norm": 0.8515625, "learning_rate": 1.670513535783448e-05, "loss": 0.0432, "num_input_tokens_seen": 189073577, "step": 3056 }, { "epoch": 0.5560798548094373, "grad_norm": 0.95703125, "learning_rate": 1.661479715585503e-05, "loss": 0.0559, "num_input_tokens_seen": 189536844, "step": 3064 }, { "epoch": 0.5575317604355717, "grad_norm": 0.90234375, "learning_rate": 1.6524530001907196e-05, "loss": 0.0552, "num_input_tokens_seen": 190005564, "step": 3072 }, { "epoch": 0.558983666061706, "grad_norm": 0.7265625, "learning_rate": 1.643433579050186e-05, "loss": 0.0479, "num_input_tokens_seen": 190494115, "step": 3080 }, { "epoch": 0.5604355716878403, "grad_norm": 0.7265625, "learning_rate": 1.6344216414618998e-05, "loss": 0.0558, "num_input_tokens_seen": 190997100, "step": 3088 }, { "epoch": 0.5618874773139746, "grad_norm": 0.6875, "learning_rate": 1.625417376566794e-05, "loss": 0.0854, "num_input_tokens_seen": 191513399, "step": 3096 }, { "epoch": 0.5618874773139746, "eval_loss": 0.0525849312543869, "eval_runtime": 2614.8433, "eval_samples_per_second": 1.192, "eval_steps_per_second": 0.149, "num_input_tokens_seen": 191513399, "step": 3096 }, { "epoch": 0.5633393829401089, "grad_norm": 0.435546875, "learning_rate": 1.616420973344769e-05, "loss": 0.0467, "num_input_tokens_seen": 191995923, "step": 3104 }, { "epoch": 0.5647912885662432, "grad_norm": 0.67578125, "learning_rate": 1.607432620610727e-05, "loss": 0.0564, "num_input_tokens_seen": 192465595, "step": 3112 }, { "epoch": 0.5662431941923775, "grad_norm": 0.88671875, "learning_rate": 1.5984525070106065e-05, "loss": 0.0507, "num_input_tokens_seen": 192958871, "step": 3120 }, { "epoch": 0.5676950998185119, "grad_norm": 0.53515625, "learning_rate": 1.5894808210174252e-05, "loss": 0.0574, "num_input_tokens_seen": 193430762, "step": 3128 }, { "epoch": 0.5691470054446461, "grad_norm": 0.50390625, "learning_rate": 1.5805177509273226e-05, "loss": 0.0545, "num_input_tokens_seen": 193908960, "step": 3136 }, { "epoch": 0.5705989110707804, "grad_norm": 0.78515625, "learning_rate": 1.571563484855611e-05, "loss": 0.0532, "num_input_tokens_seen": 194435990, "step": 3144 }, { "epoch": 0.5720508166969147, "grad_norm": 0.60546875, "learning_rate": 1.5626182107328253e-05, "loss": 0.0402, "num_input_tokens_seen": 194945870, "step": 3152 }, { "epoch": 0.573502722323049, "grad_norm": 1.1640625, "learning_rate": 1.5536821163007768e-05, "loss": 0.0728, "num_input_tokens_seen": 195449492, "step": 3160 }, { "epoch": 0.5749546279491833, "grad_norm": 0.5703125, "learning_rate": 1.5447553891086178e-05, "loss": 0.0457, "num_input_tokens_seen": 195943237, "step": 3168 }, { "epoch": 0.5764065335753176, "grad_norm": 0.79296875, "learning_rate": 1.5358382165089008e-05, "loss": 0.0612, "num_input_tokens_seen": 196442834, "step": 3176 }, { "epoch": 0.5778584392014519, "grad_norm": 0.81640625, "learning_rate": 1.5269307856536486e-05, "loss": 0.0533, "num_input_tokens_seen": 196964754, "step": 3184 }, { "epoch": 0.5793103448275863, "grad_norm": 0.625, "learning_rate": 1.5180332834904276e-05, "loss": 0.0331, "num_input_tokens_seen": 197500093, "step": 3192 }, { "epoch": 0.5807622504537205, "grad_norm": 0.73046875, "learning_rate": 1.5091458967584199e-05, "loss": 0.0689, "num_input_tokens_seen": 197994930, "step": 3200 }, { "epoch": 0.5822141560798548, "grad_norm": 4.5, "learning_rate": 1.5002688119845086e-05, "loss": 0.0541, "num_input_tokens_seen": 198501247, "step": 3208 }, { "epoch": 0.5836660617059891, "grad_norm": 0.67578125, "learning_rate": 1.4914022154793613e-05, "loss": 0.0435, "num_input_tokens_seen": 199000501, "step": 3216 }, { "epoch": 0.5851179673321234, "grad_norm": 0.8984375, "learning_rate": 1.482546293333518e-05, "loss": 0.0557, "num_input_tokens_seen": 199479084, "step": 3224 }, { "epoch": 0.5865698729582577, "grad_norm": 0.62109375, "learning_rate": 1.473701231413489e-05, "loss": 0.0382, "num_input_tokens_seen": 200003062, "step": 3232 }, { "epoch": 0.588021778584392, "grad_norm": 0.5078125, "learning_rate": 1.464867215357851e-05, "loss": 0.0529, "num_input_tokens_seen": 200510961, "step": 3240 }, { "epoch": 0.5894736842105263, "grad_norm": 0.7421875, "learning_rate": 1.4560444305733521e-05, "loss": 0.0628, "num_input_tokens_seen": 201013169, "step": 3248 }, { "epoch": 0.5909255898366607, "grad_norm": 0.72265625, "learning_rate": 1.447233062231022e-05, "loss": 0.0322, "num_input_tokens_seen": 201480209, "step": 3256 }, { "epoch": 0.592377495462795, "grad_norm": 0.57421875, "learning_rate": 1.4384332952622815e-05, "loss": 0.0567, "num_input_tokens_seen": 201973667, "step": 3264 }, { "epoch": 0.5938294010889292, "grad_norm": 2.15625, "learning_rate": 1.4296453143550664e-05, "loss": 0.0463, "num_input_tokens_seen": 202453986, "step": 3272 }, { "epoch": 0.5952813067150635, "grad_norm": 0.56640625, "learning_rate": 1.4208693039499468e-05, "loss": 0.0425, "num_input_tokens_seen": 202952414, "step": 3280 }, { "epoch": 0.5967332123411978, "grad_norm": 1.125, "learning_rate": 1.4121054482362592e-05, "loss": 0.048, "num_input_tokens_seen": 203470869, "step": 3288 }, { "epoch": 0.5981851179673321, "grad_norm": 0.671875, "learning_rate": 1.4033539311482403e-05, "loss": 0.0449, "num_input_tokens_seen": 203946575, "step": 3296 }, { "epoch": 0.5996370235934664, "grad_norm": 1.0, "learning_rate": 1.3946149363611631e-05, "loss": 0.0579, "num_input_tokens_seen": 204443918, "step": 3304 }, { "epoch": 0.6010889292196008, "grad_norm": 0.5703125, "learning_rate": 1.3858886472874881e-05, "loss": 0.1074, "num_input_tokens_seen": 204950872, "step": 3312 }, { "epoch": 0.6025408348457351, "grad_norm": 0.5390625, "learning_rate": 1.3771752470730078e-05, "loss": 0.0591, "num_input_tokens_seen": 205454235, "step": 3320 }, { "epoch": 0.6039927404718693, "grad_norm": 0.73046875, "learning_rate": 1.3684749185930088e-05, "loss": 0.055, "num_input_tokens_seen": 205939041, "step": 3328 }, { "epoch": 0.6054446460980036, "grad_norm": 0.77734375, "learning_rate": 1.3597878444484272e-05, "loss": 0.0483, "num_input_tokens_seen": 206431197, "step": 3336 }, { "epoch": 0.6068965517241379, "grad_norm": 0.416015625, "learning_rate": 1.351114206962021e-05, "loss": 0.0568, "num_input_tokens_seen": 206925320, "step": 3344 }, { "epoch": 0.6083484573502722, "grad_norm": 0.490234375, "learning_rate": 1.3424541881745425e-05, "loss": 0.0553, "num_input_tokens_seen": 207406668, "step": 3352 }, { "epoch": 0.6098003629764065, "grad_norm": 0.5859375, "learning_rate": 1.333807969840916e-05, "loss": 0.0517, "num_input_tokens_seen": 207877782, "step": 3360 }, { "epoch": 0.6112522686025408, "grad_norm": 0.546875, "learning_rate": 1.3251757334264253e-05, "loss": 0.04, "num_input_tokens_seen": 208344318, "step": 3368 }, { "epoch": 0.6127041742286752, "grad_norm": 1.109375, "learning_rate": 1.316557660102903e-05, "loss": 0.0488, "num_input_tokens_seen": 208814858, "step": 3376 }, { "epoch": 0.6141560798548095, "grad_norm": 0.5, "learning_rate": 1.3079539307449311e-05, "loss": 0.044, "num_input_tokens_seen": 209297102, "step": 3384 }, { "epoch": 0.6156079854809438, "grad_norm": 0.5390625, "learning_rate": 1.2993647259260418e-05, "loss": 0.0469, "num_input_tokens_seen": 209774677, "step": 3392 }, { "epoch": 0.617059891107078, "grad_norm": 1.3359375, "learning_rate": 1.2907902259149287e-05, "loss": 0.0694, "num_input_tokens_seen": 210275870, "step": 3400 }, { "epoch": 0.6185117967332123, "grad_norm": 0.5625, "learning_rate": 1.2822306106716645e-05, "loss": 0.0595, "num_input_tokens_seen": 210797636, "step": 3408 }, { "epoch": 0.6199637023593466, "grad_norm": 0.578125, "learning_rate": 1.2736860598439215e-05, "loss": 0.0665, "num_input_tokens_seen": 211287706, "step": 3416 }, { "epoch": 0.6214156079854809, "grad_norm": 0.83203125, "learning_rate": 1.2651567527632045e-05, "loss": 0.0698, "num_input_tokens_seen": 211773156, "step": 3424 }, { "epoch": 0.6228675136116153, "grad_norm": 0.5390625, "learning_rate": 1.2566428684410843e-05, "loss": 0.0348, "num_input_tokens_seen": 212277142, "step": 3432 }, { "epoch": 0.6243194192377496, "grad_norm": 0.5625, "learning_rate": 1.2481445855654415e-05, "loss": 0.0474, "num_input_tokens_seen": 212767513, "step": 3440 }, { "epoch": 0.6243194192377496, "eval_loss": 0.05037084221839905, "eval_runtime": 2739.6179, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.142, "num_input_tokens_seen": 212767513, "step": 3440 }, { "epoch": 0.6257713248638839, "grad_norm": 0.71875, "learning_rate": 1.2396620824967169e-05, "loss": 0.1043, "num_input_tokens_seen": 213273298, "step": 3448 }, { "epoch": 0.6272232304900182, "grad_norm": 0.53125, "learning_rate": 1.2311955372641674e-05, "loss": 0.0779, "num_input_tokens_seen": 213743600, "step": 3456 }, { "epoch": 0.6286751361161524, "grad_norm": 0.96875, "learning_rate": 1.222745127562129e-05, "loss": 0.0474, "num_input_tokens_seen": 214249105, "step": 3464 }, { "epoch": 0.6301270417422867, "grad_norm": 0.6171875, "learning_rate": 1.2143110307462892e-05, "loss": 0.0914, "num_input_tokens_seen": 214743732, "step": 3472 }, { "epoch": 0.631578947368421, "grad_norm": 0.58203125, "learning_rate": 1.2058934238299625e-05, "loss": 0.0333, "num_input_tokens_seen": 215240214, "step": 3480 }, { "epoch": 0.6330308529945553, "grad_norm": 0.94140625, "learning_rate": 1.1974924834803765e-05, "loss": 0.0477, "num_input_tokens_seen": 215752215, "step": 3488 }, { "epoch": 0.6344827586206897, "grad_norm": 0.859375, "learning_rate": 1.1891083860149653e-05, "loss": 0.0456, "num_input_tokens_seen": 216218681, "step": 3496 }, { "epoch": 0.635934664246824, "grad_norm": 0.9921875, "learning_rate": 1.1807413073976655e-05, "loss": 0.0537, "num_input_tokens_seen": 216717186, "step": 3504 }, { "epoch": 0.6373865698729583, "grad_norm": 0.5078125, "learning_rate": 1.1723914232352265e-05, "loss": 0.0543, "num_input_tokens_seen": 217224763, "step": 3512 }, { "epoch": 0.6388384754990926, "grad_norm": 0.80078125, "learning_rate": 1.1640589087735222e-05, "loss": 0.053, "num_input_tokens_seen": 217712978, "step": 3520 }, { "epoch": 0.6402903811252268, "grad_norm": 0.5234375, "learning_rate": 1.1557439388938772e-05, "loss": 0.0464, "num_input_tokens_seen": 218177197, "step": 3528 }, { "epoch": 0.6417422867513611, "grad_norm": 0.6796875, "learning_rate": 1.1474466881093904e-05, "loss": 0.0679, "num_input_tokens_seen": 218664950, "step": 3536 }, { "epoch": 0.6431941923774954, "grad_norm": 0.640625, "learning_rate": 1.139167330561277e-05, "loss": 0.0551, "num_input_tokens_seen": 219190307, "step": 3544 }, { "epoch": 0.6446460980036298, "grad_norm": 0.58984375, "learning_rate": 1.130906040015211e-05, "loss": 0.045, "num_input_tokens_seen": 219656276, "step": 3552 }, { "epoch": 0.6460980036297641, "grad_norm": 0.51953125, "learning_rate": 1.1226629898576818e-05, "loss": 0.0516, "num_input_tokens_seen": 220153311, "step": 3560 }, { "epoch": 0.6475499092558984, "grad_norm": 1.125, "learning_rate": 1.1144383530923505e-05, "loss": 0.04, "num_input_tokens_seen": 220641855, "step": 3568 }, { "epoch": 0.6490018148820327, "grad_norm": 1.1328125, "learning_rate": 1.1062323023364217e-05, "loss": 0.0566, "num_input_tokens_seen": 221165742, "step": 3576 }, { "epoch": 0.650453720508167, "grad_norm": 1.3515625, "learning_rate": 1.0980450098170211e-05, "loss": 0.0598, "num_input_tokens_seen": 221645634, "step": 3584 }, { "epoch": 0.6519056261343013, "grad_norm": 0.5390625, "learning_rate": 1.0898766473675795e-05, "loss": 0.0582, "num_input_tokens_seen": 222128368, "step": 3592 }, { "epoch": 0.6533575317604355, "grad_norm": 0.78125, "learning_rate": 1.081727386424225e-05, "loss": 0.0637, "num_input_tokens_seen": 222630366, "step": 3600 }, { "epoch": 0.6548094373865698, "grad_norm": 1.15625, "learning_rate": 1.0735973980221898e-05, "loss": 0.0319, "num_input_tokens_seen": 223132889, "step": 3608 }, { "epoch": 0.6562613430127042, "grad_norm": 0.72265625, "learning_rate": 1.0654868527922157e-05, "loss": 0.0605, "num_input_tokens_seen": 223620866, "step": 3616 }, { "epoch": 0.6577132486388385, "grad_norm": 0.9296875, "learning_rate": 1.0573959209569736e-05, "loss": 0.0563, "num_input_tokens_seen": 224112161, "step": 3624 }, { "epoch": 0.6591651542649728, "grad_norm": 0.5625, "learning_rate": 1.0493247723274949e-05, "loss": 0.0637, "num_input_tokens_seen": 224615692, "step": 3632 }, { "epoch": 0.6606170598911071, "grad_norm": 0.59375, "learning_rate": 1.0412735762996022e-05, "loss": 0.0525, "num_input_tokens_seen": 225123661, "step": 3640 }, { "epoch": 0.6620689655172414, "grad_norm": 0.423828125, "learning_rate": 1.0332425018503573e-05, "loss": 0.0448, "num_input_tokens_seen": 225606843, "step": 3648 }, { "epoch": 0.6635208711433757, "grad_norm": 0.78515625, "learning_rate": 1.025231717534513e-05, "loss": 0.0511, "num_input_tokens_seen": 226083858, "step": 3656 }, { "epoch": 0.6649727767695099, "grad_norm": 0.625, "learning_rate": 1.0172413914809791e-05, "loss": 0.0297, "num_input_tokens_seen": 226586157, "step": 3664 }, { "epoch": 0.6664246823956442, "grad_norm": 0.6484375, "learning_rate": 1.0092716913892878e-05, "loss": 0.0542, "num_input_tokens_seen": 227090262, "step": 3672 }, { "epoch": 0.6678765880217786, "grad_norm": 0.66796875, "learning_rate": 1.0013227845260785e-05, "loss": 0.0496, "num_input_tokens_seen": 227568348, "step": 3680 }, { "epoch": 0.6693284936479129, "grad_norm": 0.431640625, "learning_rate": 9.933948377215873e-06, "loss": 0.0474, "num_input_tokens_seen": 228069156, "step": 3688 }, { "epoch": 0.6707803992740472, "grad_norm": 0.5078125, "learning_rate": 9.85488017366143e-06, "loss": 0.0276, "num_input_tokens_seen": 228546696, "step": 3696 }, { "epoch": 0.6722323049001815, "grad_norm": 0.60546875, "learning_rate": 9.776024894066755e-06, "loss": 0.0413, "num_input_tokens_seen": 229039860, "step": 3704 }, { "epoch": 0.6736842105263158, "grad_norm": 0.69921875, "learning_rate": 9.697384193432365e-06, "loss": 0.0398, "num_input_tokens_seen": 229524911, "step": 3712 }, { "epoch": 0.6751361161524501, "grad_norm": 1.46875, "learning_rate": 9.618959722255204e-06, "loss": 0.0448, "num_input_tokens_seen": 230032334, "step": 3720 }, { "epoch": 0.6765880217785843, "grad_norm": 0.765625, "learning_rate": 9.540753126494035e-06, "loss": 0.0746, "num_input_tokens_seen": 230518610, "step": 3728 }, { "epoch": 0.6780399274047187, "grad_norm": 0.98828125, "learning_rate": 9.462766047534915e-06, "loss": 0.0463, "num_input_tokens_seen": 231010962, "step": 3736 }, { "epoch": 0.679491833030853, "grad_norm": 0.67578125, "learning_rate": 9.385000122156695e-06, "loss": 0.0675, "num_input_tokens_seen": 231515592, "step": 3744 }, { "epoch": 0.6809437386569873, "grad_norm": 0.63671875, "learning_rate": 9.3074569824967e-06, "loss": 0.0627, "num_input_tokens_seen": 232031254, "step": 3752 }, { "epoch": 0.6823956442831216, "grad_norm": 0.578125, "learning_rate": 9.230138256016461e-06, "loss": 0.0601, "num_input_tokens_seen": 232525195, "step": 3760 }, { "epoch": 0.6838475499092559, "grad_norm": 0.7734375, "learning_rate": 9.153045565467605e-06, "loss": 0.0587, "num_input_tokens_seen": 232999291, "step": 3768 }, { "epoch": 0.6852994555353902, "grad_norm": 2.046875, "learning_rate": 9.076180528857709e-06, "loss": 0.0536, "num_input_tokens_seen": 233490579, "step": 3776 }, { "epoch": 0.6867513611615245, "grad_norm": 0.60546875, "learning_rate": 8.999544759416413e-06, "loss": 0.0346, "num_input_tokens_seen": 234000641, "step": 3784 }, { "epoch": 0.6867513611615245, "eval_loss": 0.04955988749861717, "eval_runtime": 2842.036, "eval_samples_per_second": 1.097, "eval_steps_per_second": 0.137, "num_input_tokens_seen": 234000641, "step": 3784 }, { "epoch": 0.6882032667876588, "grad_norm": 0.369140625, "learning_rate": 8.923139865561525e-06, "loss": 0.0568, "num_input_tokens_seen": 234523989, "step": 3792 }, { "epoch": 0.6896551724137931, "grad_norm": 0.5703125, "learning_rate": 8.846967450865302e-06, "loss": 0.0471, "num_input_tokens_seen": 234995824, "step": 3800 }, { "epoch": 0.6911070780399274, "grad_norm": 0.58203125, "learning_rate": 8.77102911402075e-06, "loss": 0.0396, "num_input_tokens_seen": 235480070, "step": 3808 }, { "epoch": 0.6925589836660617, "grad_norm": 1.1875, "learning_rate": 8.695326448808089e-06, "loss": 0.0427, "num_input_tokens_seen": 235969468, "step": 3816 }, { "epoch": 0.694010889292196, "grad_norm": 0.55078125, "learning_rate": 8.61986104406132e-06, "loss": 0.0468, "num_input_tokens_seen": 236457438, "step": 3824 }, { "epoch": 0.6954627949183303, "grad_norm": 0.72265625, "learning_rate": 8.544634483634855e-06, "loss": 0.07, "num_input_tokens_seen": 236964483, "step": 3832 }, { "epoch": 0.6969147005444646, "grad_norm": 0.734375, "learning_rate": 8.469648346370275e-06, "loss": 0.0681, "num_input_tokens_seen": 237478465, "step": 3840 }, { "epoch": 0.6983666061705989, "grad_norm": 0.69921875, "learning_rate": 8.39490420606323e-06, "loss": 0.0486, "num_input_tokens_seen": 237972518, "step": 3848 }, { "epoch": 0.6998185117967333, "grad_norm": 0.70703125, "learning_rate": 8.320403631430352e-06, "loss": 0.0398, "num_input_tokens_seen": 238453985, "step": 3856 }, { "epoch": 0.7012704174228676, "grad_norm": 0.66796875, "learning_rate": 8.246148186076367e-06, "loss": 0.0565, "num_input_tokens_seen": 238956557, "step": 3864 }, { "epoch": 0.7027223230490018, "grad_norm": 1.125, "learning_rate": 8.172139428461292e-06, "loss": 0.0699, "num_input_tokens_seen": 239428560, "step": 3872 }, { "epoch": 0.7041742286751361, "grad_norm": 0.98046875, "learning_rate": 8.098378911867682e-06, "loss": 0.0595, "num_input_tokens_seen": 239904462, "step": 3880 }, { "epoch": 0.7056261343012704, "grad_norm": 2.03125, "learning_rate": 8.02486818436806e-06, "loss": 0.0696, "num_input_tokens_seen": 240404479, "step": 3888 }, { "epoch": 0.7070780399274047, "grad_norm": 1.0234375, "learning_rate": 7.95160878879242e-06, "loss": 0.0534, "num_input_tokens_seen": 240926945, "step": 3896 }, { "epoch": 0.708529945553539, "grad_norm": 0.9140625, "learning_rate": 7.87860226269586e-06, "loss": 0.0596, "num_input_tokens_seen": 241440836, "step": 3904 }, { "epoch": 0.7099818511796733, "grad_norm": 0.8984375, "learning_rate": 7.805850138326282e-06, "loss": 0.035, "num_input_tokens_seen": 241942169, "step": 3912 }, { "epoch": 0.7114337568058077, "grad_norm": 0.87109375, "learning_rate": 7.733353942592246e-06, "loss": 0.0501, "num_input_tokens_seen": 242419037, "step": 3920 }, { "epoch": 0.712885662431942, "grad_norm": 0.69140625, "learning_rate": 7.661115197030954e-06, "loss": 0.0576, "num_input_tokens_seen": 242917759, "step": 3928 }, { "epoch": 0.7143375680580762, "grad_norm": 0.5859375, "learning_rate": 7.589135417776266e-06, "loss": 0.0394, "num_input_tokens_seen": 243411063, "step": 3936 }, { "epoch": 0.7157894736842105, "grad_norm": 0.76171875, "learning_rate": 7.517416115526901e-06, "loss": 0.0485, "num_input_tokens_seen": 243885516, "step": 3944 }, { "epoch": 0.7172413793103448, "grad_norm": 0.5390625, "learning_rate": 7.445958795514761e-06, "loss": 0.0642, "num_input_tokens_seen": 244397104, "step": 3952 }, { "epoch": 0.7186932849364791, "grad_norm": 0.80859375, "learning_rate": 7.374764957473281e-06, "loss": 0.0486, "num_input_tokens_seen": 244892690, "step": 3960 }, { "epoch": 0.7201451905626134, "grad_norm": 1.0703125, "learning_rate": 7.303836095605994e-06, "loss": 0.0532, "num_input_tokens_seen": 245418852, "step": 3968 }, { "epoch": 0.7215970961887477, "grad_norm": 0.455078125, "learning_rate": 7.233173698555174e-06, "loss": 0.0389, "num_input_tokens_seen": 245925757, "step": 3976 }, { "epoch": 0.7230490018148821, "grad_norm": 0.73046875, "learning_rate": 7.16277924937056e-06, "loss": 0.0514, "num_input_tokens_seen": 246421511, "step": 3984 }, { "epoch": 0.7245009074410164, "grad_norm": 0.5625, "learning_rate": 7.092654225478257e-06, "loss": 0.041, "num_input_tokens_seen": 246952363, "step": 3992 }, { "epoch": 0.7259528130671506, "grad_norm": 0.5625, "learning_rate": 7.022800098649716e-06, "loss": 0.0446, "num_input_tokens_seen": 247450049, "step": 4000 }, { "epoch": 0.7274047186932849, "grad_norm": 0.921875, "learning_rate": 6.953218334970861e-06, "loss": 0.0379, "num_input_tokens_seen": 247943269, "step": 4008 }, { "epoch": 0.7288566243194192, "grad_norm": 0.69921875, "learning_rate": 6.8839103948113e-06, "loss": 0.0394, "num_input_tokens_seen": 248447780, "step": 4016 }, { "epoch": 0.7303085299455535, "grad_norm": 0.27734375, "learning_rate": 6.814877732793663e-06, "loss": 0.0401, "num_input_tokens_seen": 248921260, "step": 4024 }, { "epoch": 0.7317604355716878, "grad_norm": 0.61328125, "learning_rate": 6.7461217977631325e-06, "loss": 0.0447, "num_input_tokens_seen": 249435130, "step": 4032 }, { "epoch": 0.7332123411978222, "grad_norm": 0.76953125, "learning_rate": 6.67764403275696e-06, "loss": 0.0457, "num_input_tokens_seen": 249913307, "step": 4040 }, { "epoch": 0.7346642468239565, "grad_norm": 0.94921875, "learning_rate": 6.609445874974218e-06, "loss": 0.066, "num_input_tokens_seen": 250435878, "step": 4048 }, { "epoch": 0.7361161524500908, "grad_norm": 0.40625, "learning_rate": 6.5415287557456585e-06, "loss": 0.0509, "num_input_tokens_seen": 250946234, "step": 4056 }, { "epoch": 0.737568058076225, "grad_norm": 0.48046875, "learning_rate": 6.473894100503615e-06, "loss": 0.0553, "num_input_tokens_seen": 251435205, "step": 4064 }, { "epoch": 0.7390199637023593, "grad_norm": 0.8671875, "learning_rate": 6.4065433287521306e-06, "loss": 0.0445, "num_input_tokens_seen": 251949775, "step": 4072 }, { "epoch": 0.7404718693284936, "grad_norm": 0.94921875, "learning_rate": 6.33947785403716e-06, "loss": 0.0626, "num_input_tokens_seen": 252447111, "step": 4080 }, { "epoch": 0.7419237749546279, "grad_norm": 0.58984375, "learning_rate": 6.272699083916885e-06, "loss": 0.0685, "num_input_tokens_seen": 252958790, "step": 4088 }, { "epoch": 0.7433756805807622, "grad_norm": 0.61328125, "learning_rate": 6.20620841993218e-06, "loss": 0.0705, "num_input_tokens_seen": 253436330, "step": 4096 }, { "epoch": 0.7448275862068966, "grad_norm": 0.9921875, "learning_rate": 6.1400072575772056e-06, "loss": 0.0599, "num_input_tokens_seen": 253927128, "step": 4104 }, { "epoch": 0.7462794918330309, "grad_norm": 0.5859375, "learning_rate": 6.0740969862701195e-06, "loss": 0.0407, "num_input_tokens_seen": 254426830, "step": 4112 }, { "epoch": 0.7477313974591652, "grad_norm": 0.87890625, "learning_rate": 6.008478989323898e-06, "loss": 0.0566, "num_input_tokens_seen": 254922990, "step": 4120 }, { "epoch": 0.7491833030852995, "grad_norm": 0.9375, "learning_rate": 5.943154643917315e-06, "loss": 0.0498, "num_input_tokens_seen": 255423630, "step": 4128 }, { "epoch": 0.7491833030852995, "eval_loss": 0.049039360135793686, "eval_runtime": 2629.7216, "eval_samples_per_second": 1.185, "eval_steps_per_second": 0.148, "num_input_tokens_seen": 255423630, "step": 4128 }, { "epoch": 0.7506352087114337, "grad_norm": 0.4921875, "learning_rate": 5.87812532106606e-06, "loss": 0.0614, "num_input_tokens_seen": 255929632, "step": 4136 }, { "epoch": 0.752087114337568, "grad_norm": 0.7109375, "learning_rate": 5.813392385593915e-06, "loss": 0.0651, "num_input_tokens_seen": 256430965, "step": 4144 }, { "epoch": 0.7535390199637023, "grad_norm": 0.5625, "learning_rate": 5.7489571961041415e-06, "loss": 0.0618, "num_input_tokens_seen": 256934909, "step": 4152 }, { "epoch": 0.7549909255898367, "grad_norm": 0.84375, "learning_rate": 5.684821104950984e-06, "loss": 0.0604, "num_input_tokens_seen": 257421654, "step": 4160 }, { "epoch": 0.756442831215971, "grad_norm": 0.60546875, "learning_rate": 5.620985458211241e-06, "loss": 0.0516, "num_input_tokens_seen": 257913684, "step": 4168 }, { "epoch": 0.7578947368421053, "grad_norm": 0.9453125, "learning_rate": 5.55745159565604e-06, "loss": 0.0418, "num_input_tokens_seen": 258400849, "step": 4176 }, { "epoch": 0.7593466424682396, "grad_norm": 0.734375, "learning_rate": 5.494220850722729e-06, "loss": 0.062, "num_input_tokens_seen": 258878333, "step": 4184 }, { "epoch": 0.7607985480943739, "grad_norm": 1.203125, "learning_rate": 5.431294550486869e-06, "loss": 0.0615, "num_input_tokens_seen": 259369068, "step": 4192 }, { "epoch": 0.7622504537205081, "grad_norm": 0.71875, "learning_rate": 5.3686740156343805e-06, "loss": 0.0584, "num_input_tokens_seen": 259870513, "step": 4200 }, { "epoch": 0.7637023593466424, "grad_norm": 0.66796875, "learning_rate": 5.306360560433854e-06, "loss": 0.0419, "num_input_tokens_seen": 260370376, "step": 4208 }, { "epoch": 0.7651542649727767, "grad_norm": 1.015625, "learning_rate": 5.244355492708941e-06, "loss": 0.0582, "num_input_tokens_seen": 260881761, "step": 4216 }, { "epoch": 0.7666061705989111, "grad_norm": 0.70703125, "learning_rate": 5.182660113810907e-06, "loss": 0.0468, "num_input_tokens_seen": 261402673, "step": 4224 }, { "epoch": 0.7680580762250454, "grad_norm": 1.3671875, "learning_rate": 5.121275718591321e-06, "loss": 0.0686, "num_input_tokens_seen": 261898525, "step": 4232 }, { "epoch": 0.7695099818511797, "grad_norm": 0.51953125, "learning_rate": 5.0602035953748865e-06, "loss": 0.0624, "num_input_tokens_seen": 262392396, "step": 4240 }, { "epoch": 0.770961887477314, "grad_norm": 0.56640625, "learning_rate": 4.999445025932408e-06, "loss": 0.0429, "num_input_tokens_seen": 262882816, "step": 4248 }, { "epoch": 0.7724137931034483, "grad_norm": 1.2734375, "learning_rate": 4.939001285453864e-06, "loss": 0.0372, "num_input_tokens_seen": 263383267, "step": 4256 }, { "epoch": 0.7738656987295826, "grad_norm": 0.54296875, "learning_rate": 4.8788736425216595e-06, "loss": 0.0343, "num_input_tokens_seen": 263858756, "step": 4264 }, { "epoch": 0.7753176043557168, "grad_norm": 0.5859375, "learning_rate": 4.81906335908402e-06, "loss": 0.048, "num_input_tokens_seen": 264345998, "step": 4272 }, { "epoch": 0.7767695099818511, "grad_norm": 2.4375, "learning_rate": 4.759571690428464e-06, "loss": 0.0595, "num_input_tokens_seen": 264834486, "step": 4280 }, { "epoch": 0.7782214156079855, "grad_norm": 0.482421875, "learning_rate": 4.700399885155487e-06, "loss": 0.0456, "num_input_tokens_seen": 265331269, "step": 4288 }, { "epoch": 0.7796733212341198, "grad_norm": 0.90625, "learning_rate": 4.641549185152359e-06, "loss": 0.0374, "num_input_tokens_seen": 265836347, "step": 4296 }, { "epoch": 0.7811252268602541, "grad_norm": 0.828125, "learning_rate": 4.583020825567039e-06, "loss": 0.0359, "num_input_tokens_seen": 266324737, "step": 4304 }, { "epoch": 0.7825771324863884, "grad_norm": 0.451171875, "learning_rate": 4.524816034782263e-06, "loss": 0.0575, "num_input_tokens_seen": 266808164, "step": 4312 }, { "epoch": 0.7840290381125227, "grad_norm": 1.34375, "learning_rate": 4.46693603438977e-06, "loss": 0.0502, "num_input_tokens_seen": 267324813, "step": 4320 }, { "epoch": 0.785480943738657, "grad_norm": 0.60546875, "learning_rate": 4.409382039164653e-06, "loss": 0.063, "num_input_tokens_seen": 267822646, "step": 4328 }, { "epoch": 0.7869328493647912, "grad_norm": 0.6484375, "learning_rate": 4.352155257039865e-06, "loss": 0.0736, "num_input_tokens_seen": 268320339, "step": 4336 }, { "epoch": 0.7883847549909256, "grad_norm": 0.8828125, "learning_rate": 4.295256889080865e-06, "loss": 0.0568, "num_input_tokens_seen": 268805229, "step": 4344 }, { "epoch": 0.7898366606170599, "grad_norm": 0.84375, "learning_rate": 4.238688129460431e-06, "loss": 0.0398, "num_input_tokens_seen": 269290686, "step": 4352 }, { "epoch": 0.7912885662431942, "grad_norm": 0.78515625, "learning_rate": 4.18245016543356e-06, "loss": 0.0468, "num_input_tokens_seen": 269771817, "step": 4360 }, { "epoch": 0.7927404718693285, "grad_norm": 0.53515625, "learning_rate": 4.126544177312577e-06, "loss": 0.0497, "num_input_tokens_seen": 270261530, "step": 4368 }, { "epoch": 0.7941923774954628, "grad_norm": 1.4375, "learning_rate": 4.0709713384423685e-06, "loss": 0.0356, "num_input_tokens_seen": 270769688, "step": 4376 }, { "epoch": 0.7956442831215971, "grad_norm": 2.0, "learning_rate": 4.015732815175728e-06, "loss": 0.0573, "num_input_tokens_seen": 271284923, "step": 4384 }, { "epoch": 0.7970961887477314, "grad_norm": 0.52734375, "learning_rate": 3.960829766848893e-06, "loss": 0.056, "num_input_tokens_seen": 271756884, "step": 4392 }, { "epoch": 0.7985480943738656, "grad_norm": 0.5234375, "learning_rate": 3.906263345757231e-06, "loss": 0.0309, "num_input_tokens_seen": 272248473, "step": 4400 }, { "epoch": 0.8, "grad_norm": 0.828125, "learning_rate": 3.852034697131015e-06, "loss": 0.0447, "num_input_tokens_seen": 272755455, "step": 4408 }, { "epoch": 0.8014519056261343, "grad_norm": 0.7421875, "learning_rate": 3.7981449591114207e-06, "loss": 0.0459, "num_input_tokens_seen": 273244979, "step": 4416 }, { "epoch": 0.8029038112522686, "grad_norm": 0.73046875, "learning_rate": 3.7445952627266336e-06, "loss": 0.0642, "num_input_tokens_seen": 273749266, "step": 4424 }, { "epoch": 0.8043557168784029, "grad_norm": 0.80078125, "learning_rate": 3.6913867318680984e-06, "loss": 0.0455, "num_input_tokens_seen": 274271081, "step": 4432 }, { "epoch": 0.8058076225045372, "grad_norm": 0.859375, "learning_rate": 3.6385204832669385e-06, "loss": 0.0414, "num_input_tokens_seen": 274770517, "step": 4440 }, { "epoch": 0.8072595281306715, "grad_norm": 0.703125, "learning_rate": 3.585997626470519e-06, "loss": 0.0426, "num_input_tokens_seen": 275248505, "step": 4448 }, { "epoch": 0.8087114337568058, "grad_norm": 0.63671875, "learning_rate": 3.533819263819167e-06, "loss": 0.0498, "num_input_tokens_seen": 275748095, "step": 4456 }, { "epoch": 0.8101633393829402, "grad_norm": 0.46484375, "learning_rate": 3.4819864904230195e-06, "loss": 0.0508, "num_input_tokens_seen": 276242421, "step": 4464 }, { "epoch": 0.8116152450090744, "grad_norm": 0.7734375, "learning_rate": 3.4305003941390468e-06, "loss": 0.0605, "num_input_tokens_seen": 276731693, "step": 4472 }, { "epoch": 0.8116152450090744, "eval_loss": 0.04871319234371185, "eval_runtime": 2768.9798, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.141, "num_input_tokens_seen": 276731693, "step": 4472 }, { "epoch": 0.8130671506352087, "grad_norm": 0.69921875, "learning_rate": 3.3793620555482322e-06, "loss": 0.053, "num_input_tokens_seen": 277218277, "step": 4480 }, { "epoch": 0.814519056261343, "grad_norm": 0.6484375, "learning_rate": 3.3285725479328757e-06, "loss": 0.0582, "num_input_tokens_seen": 277705169, "step": 4488 }, { "epoch": 0.8159709618874773, "grad_norm": 0.74609375, "learning_rate": 3.2781329372540683e-06, "loss": 0.0618, "num_input_tokens_seen": 278213285, "step": 4496 }, { "epoch": 0.8174228675136116, "grad_norm": 0.42578125, "learning_rate": 3.2280442821293455e-06, "loss": 0.0556, "num_input_tokens_seen": 278697097, "step": 4504 }, { "epoch": 0.8188747731397459, "grad_norm": 0.53515625, "learning_rate": 3.178307633810436e-06, "loss": 0.0526, "num_input_tokens_seen": 279193929, "step": 4512 }, { "epoch": 0.8203266787658802, "grad_norm": 0.890625, "learning_rate": 3.128924036161207e-06, "loss": 0.0411, "num_input_tokens_seen": 279698041, "step": 4520 }, { "epoch": 0.8217785843920146, "grad_norm": 0.453125, "learning_rate": 3.079894525635783e-06, "loss": 0.0505, "num_input_tokens_seen": 280182805, "step": 4528 }, { "epoch": 0.8232304900181489, "grad_norm": 0.431640625, "learning_rate": 3.0312201312567536e-06, "loss": 0.04, "num_input_tokens_seen": 280651028, "step": 4536 }, { "epoch": 0.8246823956442831, "grad_norm": 0.94140625, "learning_rate": 2.982901874593598e-06, "loss": 0.0696, "num_input_tokens_seen": 281162798, "step": 4544 }, { "epoch": 0.8261343012704174, "grad_norm": 0.458984375, "learning_rate": 2.934940769741239e-06, "loss": 0.0356, "num_input_tokens_seen": 281658265, "step": 4552 }, { "epoch": 0.8275862068965517, "grad_norm": 3.09375, "learning_rate": 2.8873378232987726e-06, "loss": 0.0503, "num_input_tokens_seen": 282170245, "step": 4560 }, { "epoch": 0.829038112522686, "grad_norm": 0.99609375, "learning_rate": 2.840094034348315e-06, "loss": 0.0471, "num_input_tokens_seen": 282655198, "step": 4568 }, { "epoch": 0.8304900181488203, "grad_norm": 0.46875, "learning_rate": 2.793210394434056e-06, "loss": 0.0615, "num_input_tokens_seen": 283132416, "step": 4576 }, { "epoch": 0.8319419237749546, "grad_norm": 1.1875, "learning_rate": 2.746687887541448e-06, "loss": 0.0537, "num_input_tokens_seen": 283628667, "step": 4584 }, { "epoch": 0.833393829401089, "grad_norm": 0.6328125, "learning_rate": 2.700527490076539e-06, "loss": 0.0375, "num_input_tokens_seen": 284146751, "step": 4592 }, { "epoch": 0.8348457350272233, "grad_norm": 1.1796875, "learning_rate": 2.6547301708454877e-06, "loss": 0.041, "num_input_tokens_seen": 284643128, "step": 4600 }, { "epoch": 0.8362976406533575, "grad_norm": 0.578125, "learning_rate": 2.609296891034241e-06, "loss": 0.0473, "num_input_tokens_seen": 285145371, "step": 4608 }, { "epoch": 0.8377495462794918, "grad_norm": 0.78125, "learning_rate": 2.5642286041883458e-06, "loss": 0.0472, "num_input_tokens_seen": 285639963, "step": 4616 }, { "epoch": 0.8392014519056261, "grad_norm": 0.5078125, "learning_rate": 2.519526256192939e-06, "loss": 0.0493, "num_input_tokens_seen": 286128983, "step": 4624 }, { "epoch": 0.8406533575317604, "grad_norm": 0.45703125, "learning_rate": 2.47519078525291e-06, "loss": 0.0726, "num_input_tokens_seen": 286625920, "step": 4632 }, { "epoch": 0.8421052631578947, "grad_norm": 0.7578125, "learning_rate": 2.431223121873183e-06, "loss": 0.0465, "num_input_tokens_seen": 287119525, "step": 4640 }, { "epoch": 0.8435571687840291, "grad_norm": 0.5390625, "learning_rate": 2.3876241888392173e-06, "loss": 0.0553, "num_input_tokens_seen": 287610722, "step": 4648 }, { "epoch": 0.8450090744101634, "grad_norm": 0.53125, "learning_rate": 2.3443949011976107e-06, "loss": 0.0428, "num_input_tokens_seen": 288097243, "step": 4656 }, { "epoch": 0.8464609800362977, "grad_norm": 1.359375, "learning_rate": 2.301536166236926e-06, "loss": 0.048, "num_input_tokens_seen": 288598177, "step": 4664 }, { "epoch": 0.847912885662432, "grad_norm": 0.53515625, "learning_rate": 2.259048883468622e-06, "loss": 0.0436, "num_input_tokens_seen": 289095940, "step": 4672 }, { "epoch": 0.8493647912885662, "grad_norm": 0.9453125, "learning_rate": 2.216933944608184e-06, "loss": 0.0525, "num_input_tokens_seen": 289579822, "step": 4680 }, { "epoch": 0.8508166969147005, "grad_norm": 1.1875, "learning_rate": 2.1751922335564134e-06, "loss": 0.0752, "num_input_tokens_seen": 290090500, "step": 4688 }, { "epoch": 0.8522686025408348, "grad_norm": 0.7421875, "learning_rate": 2.13382462638088e-06, "loss": 0.0348, "num_input_tokens_seen": 290583181, "step": 4696 }, { "epoch": 0.8537205081669691, "grad_norm": 0.80078125, "learning_rate": 2.0928319912975193e-06, "loss": 0.063, "num_input_tokens_seen": 291086649, "step": 4704 }, { "epoch": 0.8551724137931035, "grad_norm": 0.53125, "learning_rate": 2.0522151886524153e-06, "loss": 0.0492, "num_input_tokens_seen": 291577384, "step": 4712 }, { "epoch": 0.8566243194192378, "grad_norm": 0.68359375, "learning_rate": 2.0119750709037646e-06, "loss": 0.0428, "num_input_tokens_seen": 292058725, "step": 4720 }, { "epoch": 0.8580762250453721, "grad_norm": 0.640625, "learning_rate": 1.972112482603954e-06, "loss": 0.074, "num_input_tokens_seen": 292542677, "step": 4728 }, { "epoch": 0.8595281306715064, "grad_norm": 0.40234375, "learning_rate": 1.9326282603818526e-06, "loss": 0.0493, "num_input_tokens_seen": 293025201, "step": 4736 }, { "epoch": 0.8609800362976406, "grad_norm": 0.84375, "learning_rate": 1.8935232329252585e-06, "loss": 0.0431, "num_input_tokens_seen": 293508845, "step": 4744 }, { "epoch": 0.8624319419237749, "grad_norm": 0.88671875, "learning_rate": 1.854798220963485e-06, "loss": 0.0356, "num_input_tokens_seen": 293995884, "step": 4752 }, { "epoch": 0.8638838475499092, "grad_norm": 0.6953125, "learning_rate": 1.816454037250155e-06, "loss": 0.0548, "num_input_tokens_seen": 294512519, "step": 4760 }, { "epoch": 0.8653357531760436, "grad_norm": 0.8203125, "learning_rate": 1.778491486546141e-06, "loss": 0.0409, "num_input_tokens_seen": 295012760, "step": 4768 }, { "epoch": 0.8667876588021779, "grad_norm": 0.57421875, "learning_rate": 1.7409113656026643e-06, "loss": 0.0336, "num_input_tokens_seen": 295509942, "step": 4776 }, { "epoch": 0.8682395644283122, "grad_norm": 0.74609375, "learning_rate": 1.7037144631445745e-06, "loss": 0.0413, "num_input_tokens_seen": 296013081, "step": 4784 }, { "epoch": 0.8696914700544465, "grad_norm": 0.54296875, "learning_rate": 1.666901559853804e-06, "loss": 0.0387, "num_input_tokens_seen": 296492427, "step": 4792 }, { "epoch": 0.8711433756805808, "grad_norm": 0.50390625, "learning_rate": 1.63047342835299e-06, "loss": 0.0468, "num_input_tokens_seen": 297011120, "step": 4800 }, { "epoch": 0.872595281306715, "grad_norm": 1.0625, "learning_rate": 1.594430833189231e-06, "loss": 0.0518, "num_input_tokens_seen": 297502338, "step": 4808 }, { "epoch": 0.8740471869328493, "grad_norm": 0.51953125, "learning_rate": 1.5587745308180656e-06, "loss": 0.055, "num_input_tokens_seen": 298011343, "step": 4816 }, { "epoch": 0.8740471869328493, "eval_loss": 0.04861417040228844, "eval_runtime": 2715.815, "eval_samples_per_second": 1.148, "eval_steps_per_second": 0.144, "num_input_tokens_seen": 298011343, "step": 4816 }, { "epoch": 0.8754990925589836, "grad_norm": 0.5390625, "learning_rate": 1.523505269587595e-06, "loss": 0.0366, "num_input_tokens_seen": 298524933, "step": 4824 }, { "epoch": 0.876950998185118, "grad_norm": 0.76953125, "learning_rate": 1.4886237897227584e-06, "loss": 0.0466, "num_input_tokens_seen": 299031985, "step": 4832 }, { "epoch": 0.8784029038112523, "grad_norm": 0.58203125, "learning_rate": 1.4541308233098117e-06, "loss": 0.0472, "num_input_tokens_seen": 299512381, "step": 4840 }, { "epoch": 0.8798548094373866, "grad_norm": 0.345703125, "learning_rate": 1.420027094280969e-06, "loss": 0.0585, "num_input_tokens_seen": 300023962, "step": 4848 }, { "epoch": 0.8813067150635209, "grad_norm": 0.76953125, "learning_rate": 1.3863133183991905e-06, "loss": 0.0455, "num_input_tokens_seen": 300499402, "step": 4856 }, { "epoch": 0.8827586206896552, "grad_norm": 0.8828125, "learning_rate": 1.3529902032431698e-06, "loss": 0.0572, "num_input_tokens_seen": 301015365, "step": 4864 }, { "epoch": 0.8842105263157894, "grad_norm": 0.515625, "learning_rate": 1.3200584481924915e-06, "loss": 0.054, "num_input_tokens_seen": 301509565, "step": 4872 }, { "epoch": 0.8856624319419237, "grad_norm": 0.8125, "learning_rate": 1.2875187444129366e-06, "loss": 0.0505, "num_input_tokens_seen": 302023484, "step": 4880 }, { "epoch": 0.8871143375680581, "grad_norm": 1.0234375, "learning_rate": 1.2553717748419846e-06, "loss": 0.0426, "num_input_tokens_seen": 302520603, "step": 4888 }, { "epoch": 0.8885662431941924, "grad_norm": 0.5703125, "learning_rate": 1.2236182141744757e-06, "loss": 0.0495, "num_input_tokens_seen": 303012766, "step": 4896 }, { "epoch": 0.8900181488203267, "grad_norm": 0.400390625, "learning_rate": 1.192258728848472e-06, "loss": 0.0561, "num_input_tokens_seen": 303502416, "step": 4904 }, { "epoch": 0.891470054446461, "grad_norm": 0.5078125, "learning_rate": 1.1612939770312325e-06, "loss": 0.0365, "num_input_tokens_seen": 304003546, "step": 4912 }, { "epoch": 0.8929219600725953, "grad_norm": 0.609375, "learning_rate": 1.130724608605427e-06, "loss": 0.05, "num_input_tokens_seen": 304494827, "step": 4920 }, { "epoch": 0.8943738656987296, "grad_norm": 0.60546875, "learning_rate": 1.1005512651554983e-06, "loss": 0.0365, "num_input_tokens_seen": 304962434, "step": 4928 }, { "epoch": 0.8958257713248639, "grad_norm": 0.3984375, "learning_rate": 1.0707745799541748e-06, "loss": 0.0505, "num_input_tokens_seen": 305453792, "step": 4936 }, { "epoch": 0.8972776769509981, "grad_norm": 0.703125, "learning_rate": 1.041395177949196e-06, "loss": 0.0371, "num_input_tokens_seen": 305940285, "step": 4944 }, { "epoch": 0.8987295825771325, "grad_norm": 0.5703125, "learning_rate": 1.0124136757502012e-06, "loss": 0.0523, "num_input_tokens_seen": 306438405, "step": 4952 }, { "epoch": 0.9001814882032668, "grad_norm": 0.7734375, "learning_rate": 9.838306816157695e-07, "loss": 0.0405, "num_input_tokens_seen": 306937715, "step": 4960 }, { "epoch": 0.9016333938294011, "grad_norm": 0.60546875, "learning_rate": 9.556467954406634e-07, "loss": 0.0742, "num_input_tokens_seen": 307458431, "step": 4968 }, { "epoch": 0.9030852994555354, "grad_norm": 0.69921875, "learning_rate": 9.278626087432529e-07, "loss": 0.049, "num_input_tokens_seen": 307956789, "step": 4976 }, { "epoch": 0.9045372050816697, "grad_norm": 0.48828125, "learning_rate": 9.004787046530694e-07, "loss": 0.0432, "num_input_tokens_seen": 308463995, "step": 4984 }, { "epoch": 0.905989110707804, "grad_norm": 0.546875, "learning_rate": 8.734956578985976e-07, "loss": 0.057, "num_input_tokens_seen": 308971509, "step": 4992 }, { "epoch": 0.9074410163339383, "grad_norm": 0.953125, "learning_rate": 8.469140347951898e-07, "loss": 0.0461, "num_input_tokens_seen": 309453074, "step": 5000 }, { "epoch": 0.9088929219600725, "grad_norm": 0.703125, "learning_rate": 8.207343932332023e-07, "loss": 0.042, "num_input_tokens_seen": 309930257, "step": 5008 }, { "epoch": 0.9103448275862069, "grad_norm": 1.234375, "learning_rate": 7.949572826662622e-07, "loss": 0.077, "num_input_tokens_seen": 310432591, "step": 5016 }, { "epoch": 0.9117967332123412, "grad_norm": 0.54296875, "learning_rate": 7.695832440997563e-07, "loss": 0.0504, "num_input_tokens_seen": 310899484, "step": 5024 }, { "epoch": 0.9132486388384755, "grad_norm": 0.89453125, "learning_rate": 7.44612810079468e-07, "loss": 0.0577, "num_input_tokens_seen": 311385620, "step": 5032 }, { "epoch": 0.9147005444646098, "grad_norm": 1.4375, "learning_rate": 7.200465046803984e-07, "loss": 0.065, "num_input_tokens_seen": 311886953, "step": 5040 }, { "epoch": 0.9161524500907441, "grad_norm": 1.09375, "learning_rate": 6.958848434957643e-07, "loss": 0.0473, "num_input_tokens_seen": 312387145, "step": 5048 }, { "epoch": 0.9176043557168784, "grad_norm": 1.015625, "learning_rate": 6.721283336261964e-07, "loss": 0.0464, "num_input_tokens_seen": 312865084, "step": 5056 }, { "epoch": 0.9190562613430127, "grad_norm": 0.7421875, "learning_rate": 6.487774736690688e-07, "loss": 0.0462, "num_input_tokens_seen": 313342169, "step": 5064 }, { "epoch": 0.9205081669691471, "grad_norm": 0.671875, "learning_rate": 6.258327537080488e-07, "loss": 0.0407, "num_input_tokens_seen": 313820850, "step": 5072 }, { "epoch": 0.9219600725952813, "grad_norm": 0.703125, "learning_rate": 6.032946553028196e-07, "loss": 0.048, "num_input_tokens_seen": 314294169, "step": 5080 }, { "epoch": 0.9234119782214156, "grad_norm": 0.89453125, "learning_rate": 5.811636514789598e-07, "loss": 0.0393, "num_input_tokens_seen": 314789090, "step": 5088 }, { "epoch": 0.9248638838475499, "grad_norm": 0.5, "learning_rate": 5.594402067180116e-07, "loss": 0.0466, "num_input_tokens_seen": 315317576, "step": 5096 }, { "epoch": 0.9263157894736842, "grad_norm": 0.51171875, "learning_rate": 5.381247769477504e-07, "loss": 0.0336, "num_input_tokens_seen": 315804951, "step": 5104 }, { "epoch": 0.9277676950998185, "grad_norm": 0.63671875, "learning_rate": 5.172178095326019e-07, "loss": 0.0515, "num_input_tokens_seen": 316286642, "step": 5112 }, { "epoch": 0.9292196007259528, "grad_norm": 0.8515625, "learning_rate": 4.967197432642579e-07, "loss": 0.079, "num_input_tokens_seen": 316792651, "step": 5120 }, { "epoch": 0.9306715063520871, "grad_norm": 0.8671875, "learning_rate": 4.7663100835246614e-07, "loss": 0.0423, "num_input_tokens_seen": 317277912, "step": 5128 }, { "epoch": 0.9321234119782215, "grad_norm": 0.359375, "learning_rate": 4.569520264159977e-07, "loss": 0.0307, "num_input_tokens_seen": 317761276, "step": 5136 }, { "epoch": 0.9335753176043557, "grad_norm": 0.65625, "learning_rate": 4.3768321047380936e-07, "loss": 0.0443, "num_input_tokens_seen": 318275629, "step": 5144 }, { "epoch": 0.93502722323049, "grad_norm": 0.66796875, "learning_rate": 4.188249649363596e-07, "loss": 0.037, "num_input_tokens_seen": 318764138, "step": 5152 }, { "epoch": 0.9364791288566243, "grad_norm": 0.5078125, "learning_rate": 4.0037768559712864e-07, "loss": 0.0398, "num_input_tokens_seen": 319237492, "step": 5160 }, { "epoch": 0.9364791288566243, "eval_loss": 0.04859951138496399, "eval_runtime": 2495.2416, "eval_samples_per_second": 1.249, "eval_steps_per_second": 0.156, "num_input_tokens_seen": 319237492, "step": 5160 }, { "epoch": 0.9379310344827586, "grad_norm": 0.50390625, "learning_rate": 3.8234175962432284e-07, "loss": 0.0643, "num_input_tokens_seen": 319726771, "step": 5168 }, { "epoch": 0.9393829401088929, "grad_norm": 0.46484375, "learning_rate": 3.647175655527235e-07, "loss": 0.0545, "num_input_tokens_seen": 320207370, "step": 5176 }, { "epoch": 0.9408348457350272, "grad_norm": 1.09375, "learning_rate": 3.4750547327576434e-07, "loss": 0.0645, "num_input_tokens_seen": 320689649, "step": 5184 }, { "epoch": 0.9422867513611616, "grad_norm": 0.5234375, "learning_rate": 3.3070584403775754e-07, "loss": 0.0368, "num_input_tokens_seen": 321189372, "step": 5192 }, { "epoch": 0.9437386569872959, "grad_norm": 0.578125, "learning_rate": 3.143190304263177e-07, "loss": 0.0461, "num_input_tokens_seen": 321681717, "step": 5200 }, { "epoch": 0.9451905626134302, "grad_norm": 0.6640625, "learning_rate": 2.9834537636495466e-07, "loss": 0.0348, "num_input_tokens_seen": 322172599, "step": 5208 }, { "epoch": 0.9466424682395644, "grad_norm": 0.578125, "learning_rate": 2.8278521710586315e-07, "loss": 0.0484, "num_input_tokens_seen": 322668094, "step": 5216 }, { "epoch": 0.9480943738656987, "grad_norm": 0.8125, "learning_rate": 2.6763887922288236e-07, "loss": 0.0589, "num_input_tokens_seen": 323137080, "step": 5224 }, { "epoch": 0.949546279491833, "grad_norm": 0.451171875, "learning_rate": 2.5290668060464095e-07, "loss": 0.0323, "num_input_tokens_seen": 323645462, "step": 5232 }, { "epoch": 0.9509981851179673, "grad_norm": 0.6171875, "learning_rate": 2.385889304478872e-07, "loss": 0.05, "num_input_tokens_seen": 324137149, "step": 5240 }, { "epoch": 0.9524500907441016, "grad_norm": 0.421875, "learning_rate": 2.2468592925100062e-07, "loss": 0.0392, "num_input_tokens_seen": 324621626, "step": 5248 }, { "epoch": 0.953901996370236, "grad_norm": 0.51171875, "learning_rate": 2.1119796880768374e-07, "loss": 0.0468, "num_input_tokens_seen": 325115784, "step": 5256 }, { "epoch": 0.9553539019963703, "grad_norm": 0.51953125, "learning_rate": 1.9812533220083362e-07, "loss": 0.0679, "num_input_tokens_seen": 325614737, "step": 5264 }, { "epoch": 0.9568058076225046, "grad_norm": 0.5078125, "learning_rate": 1.8546829379661125e-07, "loss": 0.07, "num_input_tokens_seen": 326095021, "step": 5272 }, { "epoch": 0.9582577132486388, "grad_norm": 0.423828125, "learning_rate": 1.7322711923867475e-07, "loss": 0.0609, "num_input_tokens_seen": 326613882, "step": 5280 }, { "epoch": 0.9597096188747731, "grad_norm": 0.8125, "learning_rate": 1.6140206544260407e-07, "loss": 0.0323, "num_input_tokens_seen": 327087152, "step": 5288 }, { "epoch": 0.9611615245009074, "grad_norm": 0.44921875, "learning_rate": 1.4999338059051184e-07, "loss": 0.0431, "num_input_tokens_seen": 327601813, "step": 5296 }, { "epoch": 0.9626134301270417, "grad_norm": 0.7578125, "learning_rate": 1.3900130412583646e-07, "loss": 0.0378, "num_input_tokens_seen": 328093647, "step": 5304 }, { "epoch": 0.964065335753176, "grad_norm": 1.2890625, "learning_rate": 1.2842606674831058e-07, "loss": 0.0777, "num_input_tokens_seen": 328588015, "step": 5312 }, { "epoch": 0.9655172413793104, "grad_norm": 0.71484375, "learning_rate": 1.1826789040912723e-07, "loss": 0.0603, "num_input_tokens_seen": 329080878, "step": 5320 }, { "epoch": 0.9669691470054447, "grad_norm": 0.6171875, "learning_rate": 1.0852698830627007e-07, "loss": 0.0433, "num_input_tokens_seen": 329543543, "step": 5328 }, { "epoch": 0.968421052631579, "grad_norm": 0.640625, "learning_rate": 9.920356488005045e-08, "loss": 0.0625, "num_input_tokens_seen": 330031499, "step": 5336 }, { "epoch": 0.9698729582577132, "grad_norm": 0.359375, "learning_rate": 9.029781580881081e-08, "loss": 0.0408, "num_input_tokens_seen": 330508472, "step": 5344 }, { "epoch": 0.9713248638838475, "grad_norm": 0.60546875, "learning_rate": 8.180992800482124e-08, "loss": 0.0362, "num_input_tokens_seen": 330999368, "step": 5352 }, { "epoch": 0.9727767695099818, "grad_norm": 0.80859375, "learning_rate": 7.374007961035157e-08, "loss": 0.0372, "num_input_tokens_seen": 331494527, "step": 5360 }, { "epoch": 0.9742286751361161, "grad_norm": 0.90625, "learning_rate": 6.608843999393655e-08, "loss": 0.0544, "num_input_tokens_seen": 331992801, "step": 5368 }, { "epoch": 0.9756805807622505, "grad_norm": 0.486328125, "learning_rate": 5.885516974681871e-08, "loss": 0.0434, "num_input_tokens_seen": 332484019, "step": 5376 }, { "epoch": 0.9771324863883848, "grad_norm": 0.58984375, "learning_rate": 5.2040420679577706e-08, "loss": 0.0463, "num_input_tokens_seen": 332971275, "step": 5384 }, { "epoch": 0.9785843920145191, "grad_norm": 0.365234375, "learning_rate": 4.564433581895067e-08, "loss": 0.0291, "num_input_tokens_seen": 333465979, "step": 5392 }, { "epoch": 0.9800362976406534, "grad_norm": 0.5546875, "learning_rate": 3.966704940482347e-08, "loss": 0.0428, "num_input_tokens_seen": 333965786, "step": 5400 }, { "epoch": 0.9814882032667877, "grad_norm": 0.478515625, "learning_rate": 3.4108686887408537e-08, "loss": 0.0382, "num_input_tokens_seen": 334462422, "step": 5408 }, { "epoch": 0.9829401088929219, "grad_norm": 0.58203125, "learning_rate": 2.8969364924629205e-08, "loss": 0.0335, "num_input_tokens_seen": 334957763, "step": 5416 }, { "epoch": 0.9843920145190562, "grad_norm": 0.49609375, "learning_rate": 2.424919137965276e-08, "loss": 0.0386, "num_input_tokens_seen": 335453503, "step": 5424 }, { "epoch": 0.9858439201451905, "grad_norm": 0.6640625, "learning_rate": 1.9948265318638915e-08, "loss": 0.0471, "num_input_tokens_seen": 335956152, "step": 5432 }, { "epoch": 0.9872958257713249, "grad_norm": 0.5078125, "learning_rate": 1.606667700865261e-08, "loss": 0.0428, "num_input_tokens_seen": 336428666, "step": 5440 }, { "epoch": 0.9887477313974592, "grad_norm": 0.7421875, "learning_rate": 1.2604507915774389e-08, "loss": 0.0409, "num_input_tokens_seen": 336955164, "step": 5448 }, { "epoch": 0.9901996370235935, "grad_norm": 0.80859375, "learning_rate": 9.561830703390673e-09, "loss": 0.0468, "num_input_tokens_seen": 337481648, "step": 5456 }, { "epoch": 0.9916515426497278, "grad_norm": 1.6953125, "learning_rate": 6.938709230666085e-09, "loss": 0.0517, "num_input_tokens_seen": 337980342, "step": 5464 }, { "epoch": 0.993103448275862, "grad_norm": 0.60546875, "learning_rate": 4.7351985512067435e-09, "loss": 0.0586, "num_input_tokens_seen": 338476887, "step": 5472 }, { "epoch": 0.9945553539019963, "grad_norm": 0.63671875, "learning_rate": 2.9513449118967475e-09, "loss": 0.0758, "num_input_tokens_seen": 338954735, "step": 5480 }, { "epoch": 0.9960072595281306, "grad_norm": 0.486328125, "learning_rate": 1.5871857519411671e-09, "loss": 0.0453, "num_input_tokens_seen": 339472532, "step": 5488 }, { "epoch": 0.997459165154265, "grad_norm": 0.52734375, "learning_rate": 6.427497020644602e-10, "loss": 0.0365, "num_input_tokens_seen": 339948028, "step": 5496 }, { "epoch": 0.9989110707803993, "grad_norm": 0.4609375, "learning_rate": 1.1805658392427533e-10, "loss": 0.0511, "num_input_tokens_seen": 340437678, "step": 5504 }, { "epoch": 0.9989110707803993, "eval_loss": 0.04862402379512787, "eval_runtime": 2527.5451, "eval_samples_per_second": 1.233, "eval_steps_per_second": 0.154, "num_input_tokens_seen": 340437678, "step": 5504 }, { "epoch": 1.0, "num_input_tokens_seen": 340779614, "step": 5510, "total_flos": 1.7763887818171482e+19, "train_loss": 0.06540190598601221, "train_runtime": 392745.8674, "train_samples_per_second": 0.786, "train_steps_per_second": 0.014, "train_tokens_per_second": 108.825 } ], "logging_steps": 8, "max_steps": 5510, "num_input_tokens_seen": 340779614, "num_train_epochs": 1, "save_steps": 688, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7763887818171482e+19, "train_batch_size": 7, "trial_name": null, "trial_params": null }