{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9995261786306564, "eval_steps": 22, "global_step": 2110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009476427386875149, "grad_norm": 20.756122057594123, "learning_rate": 1.5625e-07, "loss": 1.6431, "step": 1 }, { "epoch": 0.0018952854773750297, "grad_norm": 17.596939119407057, "learning_rate": 3.125e-07, "loss": 1.5603, "step": 2 }, { "epoch": 0.0028429282160625444, "grad_norm": 24.550176754105873, "learning_rate": 4.6875000000000006e-07, "loss": 1.695, "step": 3 }, { "epoch": 0.0037905709547500594, "grad_norm": 23.98623237019658, "learning_rate": 6.25e-07, "loss": 1.7187, "step": 4 }, { "epoch": 0.004738213693437574, "grad_norm": 40.536528584444476, "learning_rate": 7.8125e-07, "loss": 1.6799, "step": 5 }, { "epoch": 0.005685856432125089, "grad_norm": 30.04208597312355, "learning_rate": 9.375000000000001e-07, "loss": 1.7002, "step": 6 }, { "epoch": 0.006633499170812604, "grad_norm": 18.07515828827753, "learning_rate": 1.0937500000000001e-06, "loss": 1.5672, "step": 7 }, { "epoch": 0.007581141909500119, "grad_norm": 33.28181380236923, "learning_rate": 1.25e-06, "loss": 1.7278, "step": 8 }, { "epoch": 0.008528784648187633, "grad_norm": 16.829151356577462, "learning_rate": 1.40625e-06, "loss": 1.7151, "step": 9 }, { "epoch": 0.009476427386875147, "grad_norm": 17.773159683100857, "learning_rate": 1.5625e-06, "loss": 1.5353, "step": 10 }, { "epoch": 0.010424070125562663, "grad_norm": 18.419114948105452, "learning_rate": 1.71875e-06, "loss": 1.6073, "step": 11 }, { "epoch": 0.011371712864250177, "grad_norm": 13.6273956773613, "learning_rate": 1.8750000000000003e-06, "loss": 1.5124, "step": 12 }, { "epoch": 0.012319355602937692, "grad_norm": 14.235281012395534, "learning_rate": 2.0312500000000002e-06, "loss": 1.4477, "step": 13 }, { "epoch": 0.013266998341625208, "grad_norm": 8.20052746448492, "learning_rate": 2.1875000000000002e-06, "loss": 1.4623, "step": 14 }, { "epoch": 0.014214641080312722, "grad_norm": 11.658169078611403, "learning_rate": 2.3437500000000002e-06, "loss": 1.4695, "step": 15 }, { "epoch": 0.015162283819000238, "grad_norm": 6.911597435569106, "learning_rate": 2.5e-06, "loss": 1.4164, "step": 16 }, { "epoch": 0.01610992655768775, "grad_norm": 6.514587154523329, "learning_rate": 2.65625e-06, "loss": 1.3714, "step": 17 }, { "epoch": 0.017057569296375266, "grad_norm": 7.470333408959338, "learning_rate": 2.8125e-06, "loss": 1.4672, "step": 18 }, { "epoch": 0.018005212035062782, "grad_norm": 11.59118534986973, "learning_rate": 2.96875e-06, "loss": 1.3828, "step": 19 }, { "epoch": 0.018952854773750295, "grad_norm": 3.9301638513755988, "learning_rate": 3.125e-06, "loss": 1.4137, "step": 20 }, { "epoch": 0.01990049751243781, "grad_norm": 4.6357203859292495, "learning_rate": 3.28125e-06, "loss": 1.3915, "step": 21 }, { "epoch": 0.020848140251125327, "grad_norm": 4.530214142494876, "learning_rate": 3.4375e-06, "loss": 1.3658, "step": 22 }, { "epoch": 0.020848140251125327, "eval_loss": 1.2158129215240479, "eval_runtime": 60.2576, "eval_samples_per_second": 45.272, "eval_steps_per_second": 0.714, "step": 22 }, { "epoch": 0.02179578298981284, "grad_norm": 3.7036600794639396, "learning_rate": 3.59375e-06, "loss": 1.3058, "step": 23 }, { "epoch": 0.022743425728500355, "grad_norm": 4.229128089681862, "learning_rate": 3.7500000000000005e-06, "loss": 1.2303, "step": 24 }, { "epoch": 0.02369106846718787, "grad_norm": 5.376319236600739, "learning_rate": 3.90625e-06, "loss": 1.2898, "step": 25 }, { "epoch": 0.024638711205875383, "grad_norm": 3.621037119144236, "learning_rate": 4.0625000000000005e-06, "loss": 1.2614, "step": 26 }, { "epoch": 0.0255863539445629, "grad_norm": 3.7708968075921865, "learning_rate": 4.21875e-06, "loss": 1.3618, "step": 27 }, { "epoch": 0.026533996683250415, "grad_norm": 4.5895243093772535, "learning_rate": 4.3750000000000005e-06, "loss": 1.2304, "step": 28 }, { "epoch": 0.027481639421937928, "grad_norm": 10.299410261479563, "learning_rate": 4.53125e-06, "loss": 1.2345, "step": 29 }, { "epoch": 0.028429282160625444, "grad_norm": 3.8644074526148184, "learning_rate": 4.6875000000000004e-06, "loss": 1.2523, "step": 30 }, { "epoch": 0.02937692489931296, "grad_norm": 3.547834365401974, "learning_rate": 4.84375e-06, "loss": 1.2344, "step": 31 }, { "epoch": 0.030324567638000476, "grad_norm": 3.578798181550234, "learning_rate": 5e-06, "loss": 1.2818, "step": 32 }, { "epoch": 0.03127221037668799, "grad_norm": 3.711700954572684, "learning_rate": 5.156250000000001e-06, "loss": 1.2483, "step": 33 }, { "epoch": 0.0322198531153755, "grad_norm": 3.5746276017801537, "learning_rate": 5.3125e-06, "loss": 1.2565, "step": 34 }, { "epoch": 0.03316749585406302, "grad_norm": 3.7129697591402016, "learning_rate": 5.468750000000001e-06, "loss": 1.2261, "step": 35 }, { "epoch": 0.03411513859275053, "grad_norm": 3.1701938959510656, "learning_rate": 5.625e-06, "loss": 1.1836, "step": 36 }, { "epoch": 0.035062781331438045, "grad_norm": 3.081675212149766, "learning_rate": 5.781250000000001e-06, "loss": 1.1683, "step": 37 }, { "epoch": 0.036010424070125564, "grad_norm": 4.351693679221342, "learning_rate": 5.9375e-06, "loss": 1.2133, "step": 38 }, { "epoch": 0.03695806680881308, "grad_norm": 3.1336691817253204, "learning_rate": 6.093750000000001e-06, "loss": 1.1948, "step": 39 }, { "epoch": 0.03790570954750059, "grad_norm": 2.703982626093151, "learning_rate": 6.25e-06, "loss": 1.1376, "step": 40 }, { "epoch": 0.03885335228618811, "grad_norm": 3.003118804501732, "learning_rate": 6.406250000000001e-06, "loss": 1.2059, "step": 41 }, { "epoch": 0.03980099502487562, "grad_norm": 3.3721112860961577, "learning_rate": 6.5625e-06, "loss": 1.2294, "step": 42 }, { "epoch": 0.040748637763563134, "grad_norm": 2.935148387991293, "learning_rate": 6.718750000000001e-06, "loss": 1.2948, "step": 43 }, { "epoch": 0.04169628050225065, "grad_norm": 2.7546718703597, "learning_rate": 6.875e-06, "loss": 1.1058, "step": 44 }, { "epoch": 0.04169628050225065, "eval_loss": 1.1239995956420898, "eval_runtime": 62.5263, "eval_samples_per_second": 43.63, "eval_steps_per_second": 0.688, "step": 44 }, { "epoch": 0.042643923240938165, "grad_norm": 3.2372160639143885, "learning_rate": 7.031250000000001e-06, "loss": 1.187, "step": 45 }, { "epoch": 0.04359156597962568, "grad_norm": 3.3104832856910233, "learning_rate": 7.1875e-06, "loss": 1.1547, "step": 46 }, { "epoch": 0.0445392087183132, "grad_norm": 2.9630493187419096, "learning_rate": 7.343750000000001e-06, "loss": 1.205, "step": 47 }, { "epoch": 0.04548685145700071, "grad_norm": 2.8169766087618537, "learning_rate": 7.500000000000001e-06, "loss": 1.1583, "step": 48 }, { "epoch": 0.04643449419568822, "grad_norm": 3.0223679686127736, "learning_rate": 7.656250000000001e-06, "loss": 1.1546, "step": 49 }, { "epoch": 0.04738213693437574, "grad_norm": 2.9245386601496417, "learning_rate": 7.8125e-06, "loss": 1.0963, "step": 50 }, { "epoch": 0.048329779673063254, "grad_norm": 3.3416755825594207, "learning_rate": 7.96875e-06, "loss": 1.0911, "step": 51 }, { "epoch": 0.04927742241175077, "grad_norm": 3.2146723217948754, "learning_rate": 8.125000000000001e-06, "loss": 1.1444, "step": 52 }, { "epoch": 0.050225065150438286, "grad_norm": 3.7591880901694688, "learning_rate": 8.281250000000001e-06, "loss": 1.1644, "step": 53 }, { "epoch": 0.0511727078891258, "grad_norm": 3.597908493062599, "learning_rate": 8.4375e-06, "loss": 1.1245, "step": 54 }, { "epoch": 0.05212035062781331, "grad_norm": 3.499015413751106, "learning_rate": 8.59375e-06, "loss": 1.172, "step": 55 }, { "epoch": 0.05306799336650083, "grad_norm": 3.309932625198402, "learning_rate": 8.750000000000001e-06, "loss": 1.1184, "step": 56 }, { "epoch": 0.05401563610518834, "grad_norm": 3.2523198848476125, "learning_rate": 8.906250000000001e-06, "loss": 1.1571, "step": 57 }, { "epoch": 0.054963278843875855, "grad_norm": 3.2980910638210545, "learning_rate": 9.0625e-06, "loss": 1.132, "step": 58 }, { "epoch": 0.055910921582563375, "grad_norm": 3.1322189199908053, "learning_rate": 9.21875e-06, "loss": 1.0936, "step": 59 }, { "epoch": 0.05685856432125089, "grad_norm": 3.4181316461149884, "learning_rate": 9.375000000000001e-06, "loss": 1.1518, "step": 60 }, { "epoch": 0.0578062070599384, "grad_norm": 3.238262923289073, "learning_rate": 9.531250000000001e-06, "loss": 1.0584, "step": 61 }, { "epoch": 0.05875384979862592, "grad_norm": 3.1080324386613274, "learning_rate": 9.6875e-06, "loss": 1.1286, "step": 62 }, { "epoch": 0.05970149253731343, "grad_norm": 3.181648226621564, "learning_rate": 9.84375e-06, "loss": 1.0876, "step": 63 }, { "epoch": 0.06064913527600095, "grad_norm": 2.937644097353525, "learning_rate": 1e-05, "loss": 1.1379, "step": 64 }, { "epoch": 0.061596778014688464, "grad_norm": 2.958067762883821, "learning_rate": 9.99999410575193e-06, "loss": 1.1133, "step": 65 }, { "epoch": 0.06254442075337598, "grad_norm": 2.96685444746321, "learning_rate": 9.999976423021617e-06, "loss": 1.1313, "step": 66 }, { "epoch": 0.06254442075337598, "eval_loss": 1.0586260557174683, "eval_runtime": 62.0231, "eval_samples_per_second": 43.984, "eval_steps_per_second": 0.693, "step": 66 }, { "epoch": 0.06349206349206349, "grad_norm": 3.566526642424616, "learning_rate": 9.99994695185075e-06, "loss": 1.1156, "step": 67 }, { "epoch": 0.064439706230751, "grad_norm": 3.220513610820652, "learning_rate": 9.999905692308813e-06, "loss": 1.0942, "step": 68 }, { "epoch": 0.06538734896943853, "grad_norm": 2.545710751356861, "learning_rate": 9.999852644493086e-06, "loss": 1.0751, "step": 69 }, { "epoch": 0.06633499170812604, "grad_norm": 3.3933596790334772, "learning_rate": 9.999787808528639e-06, "loss": 1.1213, "step": 70 }, { "epoch": 0.06728263444681355, "grad_norm": 2.9414176412578303, "learning_rate": 9.999711184568334e-06, "loss": 1.0759, "step": 71 }, { "epoch": 0.06823027718550106, "grad_norm": 2.8170462064724937, "learning_rate": 9.999622772792829e-06, "loss": 1.0679, "step": 72 }, { "epoch": 0.06917791992418858, "grad_norm": 2.82044984183835, "learning_rate": 9.99952257341057e-06, "loss": 1.0539, "step": 73 }, { "epoch": 0.07012556266287609, "grad_norm": 2.6332325002107333, "learning_rate": 9.999410586657801e-06, "loss": 1.0523, "step": 74 }, { "epoch": 0.07107320540156362, "grad_norm": 3.0223476132960276, "learning_rate": 9.99928681279855e-06, "loss": 1.0355, "step": 75 }, { "epoch": 0.07202084814025113, "grad_norm": 3.8896388319954753, "learning_rate": 9.999151252124639e-06, "loss": 1.1244, "step": 76 }, { "epoch": 0.07296849087893864, "grad_norm": 2.542497788954799, "learning_rate": 9.99900390495568e-06, "loss": 0.9883, "step": 77 }, { "epoch": 0.07391613361762615, "grad_norm": 2.75973540600498, "learning_rate": 9.998844771639073e-06, "loss": 1.0339, "step": 78 }, { "epoch": 0.07486377635631367, "grad_norm": 12.963000608570283, "learning_rate": 9.998673852550007e-06, "loss": 1.0512, "step": 79 }, { "epoch": 0.07581141909500118, "grad_norm": 2.3419089920048184, "learning_rate": 9.998491148091457e-06, "loss": 1.0479, "step": 80 }, { "epoch": 0.0767590618336887, "grad_norm": 2.53974833474546, "learning_rate": 9.998296658694185e-06, "loss": 0.9376, "step": 81 }, { "epoch": 0.07770670457237622, "grad_norm": 2.455905437125952, "learning_rate": 9.99809038481674e-06, "loss": 0.9294, "step": 82 }, { "epoch": 0.07865434731106373, "grad_norm": 2.7870267445347343, "learning_rate": 9.997872326945452e-06, "loss": 1.0241, "step": 83 }, { "epoch": 0.07960199004975124, "grad_norm": 2.3034635295759256, "learning_rate": 9.997642485594436e-06, "loss": 1.0372, "step": 84 }, { "epoch": 0.08054963278843875, "grad_norm": 2.243933826068351, "learning_rate": 9.99740086130559e-06, "loss": 1.0229, "step": 85 }, { "epoch": 0.08149727552712627, "grad_norm": 2.1942261875771, "learning_rate": 9.99714745464859e-06, "loss": 1.0116, "step": 86 }, { "epoch": 0.0824449182658138, "grad_norm": 1.9009368734428258, "learning_rate": 9.996882266220895e-06, "loss": 0.9982, "step": 87 }, { "epoch": 0.0833925610045013, "grad_norm": 2.197821917224577, "learning_rate": 9.996605296647737e-06, "loss": 1.0379, "step": 88 }, { "epoch": 0.0833925610045013, "eval_loss": 1.0077329874038696, "eval_runtime": 69.6734, "eval_samples_per_second": 39.154, "eval_steps_per_second": 0.617, "step": 88 }, { "epoch": 0.08434020374318882, "grad_norm": 1.844506043554688, "learning_rate": 9.99631654658213e-06, "loss": 0.8956, "step": 89 }, { "epoch": 0.08528784648187633, "grad_norm": 3.4121023273390283, "learning_rate": 9.996016016704854e-06, "loss": 0.9807, "step": 90 }, { "epoch": 0.08623548922056384, "grad_norm": 1.629161161843959, "learning_rate": 9.995703707724474e-06, "loss": 0.9534, "step": 91 }, { "epoch": 0.08718313195925136, "grad_norm": 2.1015114519988893, "learning_rate": 9.995379620377319e-06, "loss": 0.9817, "step": 92 }, { "epoch": 0.08813077469793888, "grad_norm": 1.6484379590895988, "learning_rate": 9.995043755427487e-06, "loss": 0.9181, "step": 93 }, { "epoch": 0.0890784174366264, "grad_norm": 1.5470369951292255, "learning_rate": 9.99469611366685e-06, "loss": 0.9655, "step": 94 }, { "epoch": 0.09002606017531391, "grad_norm": 1.7131607780023708, "learning_rate": 9.994336695915041e-06, "loss": 0.9522, "step": 95 }, { "epoch": 0.09097370291400142, "grad_norm": 3.4039383281352684, "learning_rate": 9.993965503019457e-06, "loss": 0.9977, "step": 96 }, { "epoch": 0.09192134565268893, "grad_norm": 3.2140501680385536, "learning_rate": 9.993582535855265e-06, "loss": 0.8933, "step": 97 }, { "epoch": 0.09286898839137644, "grad_norm": 1.9418785282824027, "learning_rate": 9.993187795325381e-06, "loss": 1.0526, "step": 98 }, { "epoch": 0.09381663113006397, "grad_norm": 1.4531258804082263, "learning_rate": 9.992781282360486e-06, "loss": 0.9921, "step": 99 }, { "epoch": 0.09476427386875148, "grad_norm": 2.0708416718589273, "learning_rate": 9.992362997919016e-06, "loss": 0.9248, "step": 100 }, { "epoch": 0.095711916607439, "grad_norm": 1.7208209294560886, "learning_rate": 9.99193294298716e-06, "loss": 0.9778, "step": 101 }, { "epoch": 0.09665955934612651, "grad_norm": 1.573086346708121, "learning_rate": 9.991491118578856e-06, "loss": 0.9369, "step": 102 }, { "epoch": 0.09760720208481402, "grad_norm": 1.716766032223459, "learning_rate": 9.991037525735794e-06, "loss": 0.9718, "step": 103 }, { "epoch": 0.09855484482350153, "grad_norm": 1.4683346581590864, "learning_rate": 9.990572165527413e-06, "loss": 1.0322, "step": 104 }, { "epoch": 0.09950248756218906, "grad_norm": 1.6854495739199276, "learning_rate": 9.990095039050886e-06, "loss": 1.0259, "step": 105 }, { "epoch": 0.10045013030087657, "grad_norm": 1.5007691822796942, "learning_rate": 9.98960614743114e-06, "loss": 1.012, "step": 106 }, { "epoch": 0.10139777303956408, "grad_norm": 1.688237803394784, "learning_rate": 9.98910549182083e-06, "loss": 0.9611, "step": 107 }, { "epoch": 0.1023454157782516, "grad_norm": 1.2412298690878776, "learning_rate": 9.988593073400354e-06, "loss": 0.9543, "step": 108 }, { "epoch": 0.10329305851693911, "grad_norm": 1.4280638521919857, "learning_rate": 9.988068893377841e-06, "loss": 1.0555, "step": 109 }, { "epoch": 0.10424070125562662, "grad_norm": 1.6064972212595918, "learning_rate": 9.987532952989145e-06, "loss": 0.947, "step": 110 }, { "epoch": 0.10424070125562662, "eval_loss": 0.9927965998649597, "eval_runtime": 64.552, "eval_samples_per_second": 42.261, "eval_steps_per_second": 0.666, "step": 110 }, { "epoch": 0.10518834399431415, "grad_norm": 1.3706857779346073, "learning_rate": 9.986985253497859e-06, "loss": 0.958, "step": 111 }, { "epoch": 0.10613598673300166, "grad_norm": 1.1044678274982156, "learning_rate": 9.986425796195287e-06, "loss": 0.9613, "step": 112 }, { "epoch": 0.10708362947168917, "grad_norm": 1.2204032164258463, "learning_rate": 9.985854582400465e-06, "loss": 0.9637, "step": 113 }, { "epoch": 0.10803127221037669, "grad_norm": 1.387543526294841, "learning_rate": 9.985271613460144e-06, "loss": 0.988, "step": 114 }, { "epoch": 0.1089789149490642, "grad_norm": 1.5020093056571016, "learning_rate": 9.984676890748787e-06, "loss": 0.986, "step": 115 }, { "epoch": 0.10992655768775171, "grad_norm": 1.503672693192212, "learning_rate": 9.984070415668574e-06, "loss": 0.9858, "step": 116 }, { "epoch": 0.11087420042643924, "grad_norm": 1.4535398323237316, "learning_rate": 9.983452189649388e-06, "loss": 0.9324, "step": 117 }, { "epoch": 0.11182184316512675, "grad_norm": 1.4452814142099772, "learning_rate": 9.98282221414882e-06, "loss": 0.9353, "step": 118 }, { "epoch": 0.11276948590381426, "grad_norm": 1.2444175503325663, "learning_rate": 9.982180490652165e-06, "loss": 0.9864, "step": 119 }, { "epoch": 0.11371712864250177, "grad_norm": 1.484366794735356, "learning_rate": 9.981527020672413e-06, "loss": 0.9683, "step": 120 }, { "epoch": 0.11466477138118929, "grad_norm": 1.6287273073283466, "learning_rate": 9.98086180575025e-06, "loss": 0.9295, "step": 121 }, { "epoch": 0.1156124141198768, "grad_norm": 2.1645074117977363, "learning_rate": 9.980184847454052e-06, "loss": 0.9474, "step": 122 }, { "epoch": 0.11656005685856433, "grad_norm": 1.4428592940450689, "learning_rate": 9.979496147379883e-06, "loss": 1.0116, "step": 123 }, { "epoch": 0.11750769959725184, "grad_norm": 1.442916857159405, "learning_rate": 9.978795707151492e-06, "loss": 0.9565, "step": 124 }, { "epoch": 0.11845534233593935, "grad_norm": 1.3024301965632774, "learning_rate": 9.978083528420303e-06, "loss": 0.9471, "step": 125 }, { "epoch": 0.11940298507462686, "grad_norm": 1.2357348655586111, "learning_rate": 9.977359612865424e-06, "loss": 0.9684, "step": 126 }, { "epoch": 0.12035062781331438, "grad_norm": 1.4890088752220003, "learning_rate": 9.976623962193627e-06, "loss": 0.9535, "step": 127 }, { "epoch": 0.1212982705520019, "grad_norm": 1.3469393114862873, "learning_rate": 9.975876578139355e-06, "loss": 0.986, "step": 128 }, { "epoch": 0.12224591329068941, "grad_norm": 1.5755457587661565, "learning_rate": 9.975117462464716e-06, "loss": 1.019, "step": 129 }, { "epoch": 0.12319355602937693, "grad_norm": 1.4168899024501311, "learning_rate": 9.974346616959476e-06, "loss": 0.9368, "step": 130 }, { "epoch": 0.12414119876806444, "grad_norm": 1.4122617861876974, "learning_rate": 9.973564043441057e-06, "loss": 0.9563, "step": 131 }, { "epoch": 0.12508884150675195, "grad_norm": 1.3777217200154346, "learning_rate": 9.972769743754532e-06, "loss": 1.0045, "step": 132 }, { "epoch": 0.12508884150675195, "eval_loss": 0.9894475340843201, "eval_runtime": 67.9151, "eval_samples_per_second": 40.168, "eval_steps_per_second": 0.633, "step": 132 }, { "epoch": 0.12603648424543948, "grad_norm": 1.3796166756668033, "learning_rate": 9.971963719772621e-06, "loss": 0.9492, "step": 133 }, { "epoch": 0.12698412698412698, "grad_norm": 1.8484880273102298, "learning_rate": 9.971145973395685e-06, "loss": 0.9844, "step": 134 }, { "epoch": 0.1279317697228145, "grad_norm": 1.3341411571681234, "learning_rate": 9.970316506551726e-06, "loss": 0.9752, "step": 135 }, { "epoch": 0.128879412461502, "grad_norm": 1.6561053976261508, "learning_rate": 9.969475321196374e-06, "loss": 0.9745, "step": 136 }, { "epoch": 0.12982705520018953, "grad_norm": 1.4801234453473886, "learning_rate": 9.968622419312895e-06, "loss": 0.983, "step": 137 }, { "epoch": 0.13077469793887705, "grad_norm": 1.650605001197027, "learning_rate": 9.967757802912172e-06, "loss": 0.9226, "step": 138 }, { "epoch": 0.13172234067756455, "grad_norm": 1.5926202309459188, "learning_rate": 9.966881474032711e-06, "loss": 0.9754, "step": 139 }, { "epoch": 0.13266998341625208, "grad_norm": 1.601896363818333, "learning_rate": 9.965993434740634e-06, "loss": 0.9812, "step": 140 }, { "epoch": 0.13361762615493958, "grad_norm": 1.2173182907186852, "learning_rate": 9.965093687129669e-06, "loss": 0.987, "step": 141 }, { "epoch": 0.1345652688936271, "grad_norm": 1.1914620826791744, "learning_rate": 9.96418223332115e-06, "loss": 0.9049, "step": 142 }, { "epoch": 0.13551291163231463, "grad_norm": 1.1718797680017474, "learning_rate": 9.963259075464011e-06, "loss": 1.0314, "step": 143 }, { "epoch": 0.13646055437100213, "grad_norm": 1.1921400894385041, "learning_rate": 9.962324215734782e-06, "loss": 0.9804, "step": 144 }, { "epoch": 0.13740819710968966, "grad_norm": 1.483815329510506, "learning_rate": 9.961377656337579e-06, "loss": 0.9371, "step": 145 }, { "epoch": 0.13835583984837715, "grad_norm": 1.174598142994827, "learning_rate": 9.960419399504107e-06, "loss": 0.9357, "step": 146 }, { "epoch": 0.13930348258706468, "grad_norm": 1.768580101280523, "learning_rate": 9.959449447493643e-06, "loss": 0.9801, "step": 147 }, { "epoch": 0.14025112532575218, "grad_norm": 1.8249892638670866, "learning_rate": 9.958467802593046e-06, "loss": 0.9553, "step": 148 }, { "epoch": 0.1411987680644397, "grad_norm": 1.464158444501908, "learning_rate": 9.957474467116739e-06, "loss": 0.9816, "step": 149 }, { "epoch": 0.14214641080312723, "grad_norm": 1.4006303093968397, "learning_rate": 9.956469443406707e-06, "loss": 0.959, "step": 150 }, { "epoch": 0.14309405354181473, "grad_norm": 1.2677409714516314, "learning_rate": 9.955452733832493e-06, "loss": 0.9901, "step": 151 }, { "epoch": 0.14404169628050226, "grad_norm": 1.616294750537421, "learning_rate": 9.954424340791195e-06, "loss": 0.9611, "step": 152 }, { "epoch": 0.14498933901918976, "grad_norm": 1.2762321668275929, "learning_rate": 9.953384266707453e-06, "loss": 0.9971, "step": 153 }, { "epoch": 0.14593698175787728, "grad_norm": 1.243174536133587, "learning_rate": 9.952332514033449e-06, "loss": 0.9545, "step": 154 }, { "epoch": 0.14593698175787728, "eval_loss": 0.982397735118866, "eval_runtime": 66.0802, "eval_samples_per_second": 41.283, "eval_steps_per_second": 0.651, "step": 154 }, { "epoch": 0.1468846244965648, "grad_norm": 1.3382329641420807, "learning_rate": 9.951269085248898e-06, "loss": 0.9934, "step": 155 }, { "epoch": 0.1478322672352523, "grad_norm": 1.2257802109789377, "learning_rate": 9.950193982861048e-06, "loss": 0.9528, "step": 156 }, { "epoch": 0.14877990997393983, "grad_norm": 1.232838273393549, "learning_rate": 9.949107209404664e-06, "loss": 0.9719, "step": 157 }, { "epoch": 0.14972755271262733, "grad_norm": 1.6289871091304047, "learning_rate": 9.948008767442034e-06, "loss": 0.9634, "step": 158 }, { "epoch": 0.15067519545131486, "grad_norm": 1.486191374802309, "learning_rate": 9.94689865956295e-06, "loss": 0.9457, "step": 159 }, { "epoch": 0.15162283819000236, "grad_norm": 1.2145460306596223, "learning_rate": 9.94577688838472e-06, "loss": 0.9841, "step": 160 }, { "epoch": 0.15257048092868988, "grad_norm": 1.2094755905610057, "learning_rate": 9.944643456552133e-06, "loss": 0.9577, "step": 161 }, { "epoch": 0.1535181236673774, "grad_norm": 1.2312169745263408, "learning_rate": 9.943498366737487e-06, "loss": 0.935, "step": 162 }, { "epoch": 0.1544657664060649, "grad_norm": 1.996437404759428, "learning_rate": 9.942341621640558e-06, "loss": 0.9949, "step": 163 }, { "epoch": 0.15541340914475243, "grad_norm": 1.434919063522936, "learning_rate": 9.941173223988603e-06, "loss": 0.961, "step": 164 }, { "epoch": 0.15636105188343993, "grad_norm": 1.5694305163048035, "learning_rate": 9.93999317653635e-06, "loss": 1.0382, "step": 165 }, { "epoch": 0.15730869462212746, "grad_norm": 1.4810485545937977, "learning_rate": 9.938801482065998e-06, "loss": 0.9782, "step": 166 }, { "epoch": 0.15825633736081499, "grad_norm": 1.2852835752717688, "learning_rate": 9.937598143387207e-06, "loss": 0.9012, "step": 167 }, { "epoch": 0.15920398009950248, "grad_norm": 1.3425076199539143, "learning_rate": 9.93638316333708e-06, "loss": 0.92, "step": 168 }, { "epoch": 0.16015162283819, "grad_norm": 1.1023252456779573, "learning_rate": 9.935156544780183e-06, "loss": 0.9383, "step": 169 }, { "epoch": 0.1610992655768775, "grad_norm": 1.4060044099112272, "learning_rate": 9.93391829060851e-06, "loss": 0.9764, "step": 170 }, { "epoch": 0.16204690831556504, "grad_norm": 1.2799421690962227, "learning_rate": 9.932668403741488e-06, "loss": 0.8693, "step": 171 }, { "epoch": 0.16299455105425253, "grad_norm": 1.139116134527199, "learning_rate": 9.93140688712598e-06, "loss": 0.9494, "step": 172 }, { "epoch": 0.16394219379294006, "grad_norm": 1.217839709509947, "learning_rate": 9.930133743736261e-06, "loss": 0.8957, "step": 173 }, { "epoch": 0.1648898365316276, "grad_norm": 1.301115684104673, "learning_rate": 9.92884897657402e-06, "loss": 0.9477, "step": 174 }, { "epoch": 0.16583747927031509, "grad_norm": 1.2295422076552296, "learning_rate": 9.92755258866835e-06, "loss": 0.9441, "step": 175 }, { "epoch": 0.1667851220090026, "grad_norm": 1.2198046764803545, "learning_rate": 9.926244583075748e-06, "loss": 0.9556, "step": 176 }, { "epoch": 0.1667851220090026, "eval_loss": 0.9768843054771423, "eval_runtime": 65.0055, "eval_samples_per_second": 41.966, "eval_steps_per_second": 0.661, "step": 176 }, { "epoch": 0.1677327647476901, "grad_norm": 1.2348345090307584, "learning_rate": 9.924924962880093e-06, "loss": 0.9633, "step": 177 }, { "epoch": 0.16868040748637764, "grad_norm": 1.2606209599886706, "learning_rate": 9.923593731192655e-06, "loss": 0.98, "step": 178 }, { "epoch": 0.16962805022506516, "grad_norm": 2.6831467782092995, "learning_rate": 9.922250891152078e-06, "loss": 0.994, "step": 179 }, { "epoch": 0.17057569296375266, "grad_norm": 1.2766929465017953, "learning_rate": 9.920896445924372e-06, "loss": 0.9753, "step": 180 }, { "epoch": 0.1715233357024402, "grad_norm": 1.1262505199746897, "learning_rate": 9.919530398702917e-06, "loss": 0.9641, "step": 181 }, { "epoch": 0.1724709784411277, "grad_norm": 1.442803425071792, "learning_rate": 9.918152752708437e-06, "loss": 0.9601, "step": 182 }, { "epoch": 0.1734186211798152, "grad_norm": 1.2137409175608287, "learning_rate": 9.916763511189009e-06, "loss": 0.9747, "step": 183 }, { "epoch": 0.1743662639185027, "grad_norm": 1.3271533812680127, "learning_rate": 9.915362677420045e-06, "loss": 0.9384, "step": 184 }, { "epoch": 0.17531390665719024, "grad_norm": 1.2077500566101147, "learning_rate": 9.913950254704291e-06, "loss": 0.9372, "step": 185 }, { "epoch": 0.17626154939587776, "grad_norm": 1.1297753271104558, "learning_rate": 9.912526246371815e-06, "loss": 0.8775, "step": 186 }, { "epoch": 0.17720919213456526, "grad_norm": 1.2198507607935039, "learning_rate": 9.911090655779997e-06, "loss": 1.0036, "step": 187 }, { "epoch": 0.1781568348732528, "grad_norm": 1.305484389615825, "learning_rate": 9.909643486313533e-06, "loss": 0.9687, "step": 188 }, { "epoch": 0.1791044776119403, "grad_norm": 1.527203085727602, "learning_rate": 9.908184741384412e-06, "loss": 0.9225, "step": 189 }, { "epoch": 0.18005212035062781, "grad_norm": 1.1568401663216765, "learning_rate": 9.906714424431914e-06, "loss": 0.9112, "step": 190 }, { "epoch": 0.18099976308931534, "grad_norm": 1.2426671937737235, "learning_rate": 9.905232538922604e-06, "loss": 0.9509, "step": 191 }, { "epoch": 0.18194740582800284, "grad_norm": 1.535223723588726, "learning_rate": 9.903739088350325e-06, "loss": 0.8984, "step": 192 }, { "epoch": 0.18289504856669037, "grad_norm": 1.5431131775034228, "learning_rate": 9.902234076236182e-06, "loss": 0.9602, "step": 193 }, { "epoch": 0.18384269130537786, "grad_norm": 1.182953828246788, "learning_rate": 9.90071750612854e-06, "loss": 0.887, "step": 194 }, { "epoch": 0.1847903340440654, "grad_norm": 1.4338081609253326, "learning_rate": 9.899189381603018e-06, "loss": 0.9818, "step": 195 }, { "epoch": 0.1857379767827529, "grad_norm": 1.4971956239027924, "learning_rate": 9.897649706262474e-06, "loss": 0.9455, "step": 196 }, { "epoch": 0.18668561952144042, "grad_norm": 1.2771616220713862, "learning_rate": 9.896098483736995e-06, "loss": 0.9563, "step": 197 }, { "epoch": 0.18763326226012794, "grad_norm": 1.1927091790410977, "learning_rate": 9.894535717683902e-06, "loss": 0.9376, "step": 198 }, { "epoch": 0.18763326226012794, "eval_loss": 0.9750568270683289, "eval_runtime": 68.0671, "eval_samples_per_second": 40.078, "eval_steps_per_second": 0.632, "step": 198 }, { "epoch": 0.18858090499881544, "grad_norm": 1.447983084882731, "learning_rate": 9.892961411787725e-06, "loss": 0.941, "step": 199 }, { "epoch": 0.18952854773750297, "grad_norm": 1.179964004851603, "learning_rate": 9.891375569760205e-06, "loss": 1.0044, "step": 200 }, { "epoch": 0.19047619047619047, "grad_norm": 1.1672903614747536, "learning_rate": 9.88977819534028e-06, "loss": 0.9087, "step": 201 }, { "epoch": 0.191423833214878, "grad_norm": 1.2604577059340927, "learning_rate": 9.888169292294077e-06, "loss": 0.97, "step": 202 }, { "epoch": 0.19237147595356552, "grad_norm": 1.4285797440582975, "learning_rate": 9.886548864414906e-06, "loss": 0.9296, "step": 203 }, { "epoch": 0.19331911869225302, "grad_norm": 1.4094308770717812, "learning_rate": 9.88491691552325e-06, "loss": 1.0148, "step": 204 }, { "epoch": 0.19426676143094054, "grad_norm": 1.5152496759647966, "learning_rate": 9.883273449466755e-06, "loss": 0.9839, "step": 205 }, { "epoch": 0.19521440416962804, "grad_norm": 1.4100497762615254, "learning_rate": 9.881618470120216e-06, "loss": 0.9112, "step": 206 }, { "epoch": 0.19616204690831557, "grad_norm": 1.2060557963303735, "learning_rate": 9.879951981385577e-06, "loss": 1.0107, "step": 207 }, { "epoch": 0.19710968964700307, "grad_norm": 1.1817847604275118, "learning_rate": 9.87827398719192e-06, "loss": 0.9401, "step": 208 }, { "epoch": 0.1980573323856906, "grad_norm": 4.640069295683942, "learning_rate": 9.876584491495448e-06, "loss": 0.9453, "step": 209 }, { "epoch": 0.19900497512437812, "grad_norm": 1.3678287853797575, "learning_rate": 9.874883498279485e-06, "loss": 0.9139, "step": 210 }, { "epoch": 0.19995261786306562, "grad_norm": 1.2020105753823802, "learning_rate": 9.87317101155446e-06, "loss": 0.8995, "step": 211 }, { "epoch": 0.20090026060175314, "grad_norm": 1.560649904766898, "learning_rate": 9.871447035357903e-06, "loss": 0.9953, "step": 212 }, { "epoch": 0.20184790334044064, "grad_norm": 1.5587492681660762, "learning_rate": 9.869711573754433e-06, "loss": 0.9954, "step": 213 }, { "epoch": 0.20279554607912817, "grad_norm": 1.1589889744586952, "learning_rate": 9.867964630835742e-06, "loss": 0.9664, "step": 214 }, { "epoch": 0.2037431888178157, "grad_norm": 1.4941711737316694, "learning_rate": 9.8662062107206e-06, "loss": 0.9087, "step": 215 }, { "epoch": 0.2046908315565032, "grad_norm": 1.1922425845332252, "learning_rate": 9.86443631755483e-06, "loss": 1.0093, "step": 216 }, { "epoch": 0.20563847429519072, "grad_norm": 1.236697642847563, "learning_rate": 9.862654955511309e-06, "loss": 0.9649, "step": 217 }, { "epoch": 0.20658611703387822, "grad_norm": 1.2350057563906354, "learning_rate": 9.860862128789954e-06, "loss": 0.9714, "step": 218 }, { "epoch": 0.20753375977256575, "grad_norm": 1.4642161662286084, "learning_rate": 9.859057841617709e-06, "loss": 0.951, "step": 219 }, { "epoch": 0.20848140251125324, "grad_norm": 1.1189678628969209, "learning_rate": 9.857242098248543e-06, "loss": 0.9097, "step": 220 }, { "epoch": 0.20848140251125324, "eval_loss": 0.9686124324798584, "eval_runtime": 68.177, "eval_samples_per_second": 40.013, "eval_steps_per_second": 0.631, "step": 220 }, { "epoch": 0.20942904524994077, "grad_norm": 1.1409361807030405, "learning_rate": 9.85541490296343e-06, "loss": 0.913, "step": 221 }, { "epoch": 0.2103766879886283, "grad_norm": 1.4175269201432499, "learning_rate": 9.853576260070348e-06, "loss": 0.956, "step": 222 }, { "epoch": 0.2113243307273158, "grad_norm": 1.202975487777318, "learning_rate": 9.851726173904264e-06, "loss": 0.9681, "step": 223 }, { "epoch": 0.21227197346600332, "grad_norm": 1.2528114366347458, "learning_rate": 9.849864648827126e-06, "loss": 0.9339, "step": 224 }, { "epoch": 0.21321961620469082, "grad_norm": 1.5633193545585717, "learning_rate": 9.847991689227848e-06, "loss": 0.9481, "step": 225 }, { "epoch": 0.21416725894337835, "grad_norm": 1.3036681318560188, "learning_rate": 9.846107299522305e-06, "loss": 0.9669, "step": 226 }, { "epoch": 0.21511490168206587, "grad_norm": 1.276332389348374, "learning_rate": 9.844211484153326e-06, "loss": 1.0051, "step": 227 }, { "epoch": 0.21606254442075337, "grad_norm": 1.3574477388118054, "learning_rate": 9.842304247590668e-06, "loss": 0.9185, "step": 228 }, { "epoch": 0.2170101871594409, "grad_norm": 1.2290424692902366, "learning_rate": 9.840385594331022e-06, "loss": 0.9402, "step": 229 }, { "epoch": 0.2179578298981284, "grad_norm": 1.3663071377926381, "learning_rate": 9.838455528897998e-06, "loss": 0.9303, "step": 230 }, { "epoch": 0.21890547263681592, "grad_norm": 1.1297310850238833, "learning_rate": 9.836514055842109e-06, "loss": 0.8715, "step": 231 }, { "epoch": 0.21985311537550342, "grad_norm": 1.1981756396394987, "learning_rate": 9.834561179740763e-06, "loss": 0.9603, "step": 232 }, { "epoch": 0.22080075811419095, "grad_norm": 1.0960664647793084, "learning_rate": 9.832596905198255e-06, "loss": 0.9352, "step": 233 }, { "epoch": 0.22174840085287847, "grad_norm": 1.2698198526002429, "learning_rate": 9.830621236845755e-06, "loss": 0.9044, "step": 234 }, { "epoch": 0.22269604359156597, "grad_norm": 1.4209652174245544, "learning_rate": 9.828634179341292e-06, "loss": 0.9839, "step": 235 }, { "epoch": 0.2236436863302535, "grad_norm": 1.5896834703549265, "learning_rate": 9.826635737369752e-06, "loss": 0.9479, "step": 236 }, { "epoch": 0.224591329068941, "grad_norm": 1.118663687959167, "learning_rate": 9.82462591564286e-06, "loss": 0.9568, "step": 237 }, { "epoch": 0.22553897180762852, "grad_norm": 1.081723075754863, "learning_rate": 9.82260471889917e-06, "loss": 1.0009, "step": 238 }, { "epoch": 0.22648661454631605, "grad_norm": 1.3816847638469698, "learning_rate": 9.82057215190406e-06, "loss": 0.9565, "step": 239 }, { "epoch": 0.22743425728500355, "grad_norm": 1.3650320361676973, "learning_rate": 9.818528219449705e-06, "loss": 0.9435, "step": 240 }, { "epoch": 0.22838190002369108, "grad_norm": 1.1163028465916651, "learning_rate": 9.816472926355087e-06, "loss": 0.9926, "step": 241 }, { "epoch": 0.22932954276237857, "grad_norm": 1.1783321971909724, "learning_rate": 9.814406277465969e-06, "loss": 0.9908, "step": 242 }, { "epoch": 0.22932954276237857, "eval_loss": 0.9650764465332031, "eval_runtime": 63.7155, "eval_samples_per_second": 42.815, "eval_steps_per_second": 0.675, "step": 242 }, { "epoch": 0.2302771855010661, "grad_norm": 1.078825580859753, "learning_rate": 9.812328277654889e-06, "loss": 0.9395, "step": 243 }, { "epoch": 0.2312248282397536, "grad_norm": 1.1093483786757967, "learning_rate": 9.810238931821139e-06, "loss": 0.9178, "step": 244 }, { "epoch": 0.23217247097844113, "grad_norm": 1.3499071449657545, "learning_rate": 9.808138244890775e-06, "loss": 0.952, "step": 245 }, { "epoch": 0.23312011371712865, "grad_norm": 1.1761313846911488, "learning_rate": 9.806026221816582e-06, "loss": 0.9497, "step": 246 }, { "epoch": 0.23406775645581615, "grad_norm": 1.2110375794344939, "learning_rate": 9.803902867578075e-06, "loss": 0.944, "step": 247 }, { "epoch": 0.23501539919450368, "grad_norm": 1.2034987469557872, "learning_rate": 9.801768187181487e-06, "loss": 0.986, "step": 248 }, { "epoch": 0.23596304193319118, "grad_norm": 1.3058009296406379, "learning_rate": 9.799622185659748e-06, "loss": 0.967, "step": 249 }, { "epoch": 0.2369106846718787, "grad_norm": 1.1123429020549715, "learning_rate": 9.797464868072489e-06, "loss": 0.9217, "step": 250 }, { "epoch": 0.23785832741056623, "grad_norm": 1.089125109757041, "learning_rate": 9.795296239506011e-06, "loss": 0.8866, "step": 251 }, { "epoch": 0.23880597014925373, "grad_norm": 1.2123667069466009, "learning_rate": 9.793116305073292e-06, "loss": 0.9307, "step": 252 }, { "epoch": 0.23975361288794125, "grad_norm": 1.4622869606703903, "learning_rate": 9.790925069913962e-06, "loss": 0.9538, "step": 253 }, { "epoch": 0.24070125562662875, "grad_norm": 1.5523797111635822, "learning_rate": 9.788722539194291e-06, "loss": 0.969, "step": 254 }, { "epoch": 0.24164889836531628, "grad_norm": 1.1827311652398949, "learning_rate": 9.786508718107184e-06, "loss": 0.9849, "step": 255 }, { "epoch": 0.2425965411040038, "grad_norm": 1.2881186217827927, "learning_rate": 9.78428361187217e-06, "loss": 0.9295, "step": 256 }, { "epoch": 0.2435441838426913, "grad_norm": 1.474451652001404, "learning_rate": 9.782047225735376e-06, "loss": 0.9576, "step": 257 }, { "epoch": 0.24449182658137883, "grad_norm": 1.2287731326656932, "learning_rate": 9.77979956496953e-06, "loss": 0.9485, "step": 258 }, { "epoch": 0.24543946932006633, "grad_norm": 1.3059618909257746, "learning_rate": 9.777540634873939e-06, "loss": 0.9961, "step": 259 }, { "epoch": 0.24638711205875385, "grad_norm": 1.25801433279188, "learning_rate": 9.775270440774481e-06, "loss": 0.9374, "step": 260 }, { "epoch": 0.24733475479744135, "grad_norm": 1.4594944714968974, "learning_rate": 9.772988988023589e-06, "loss": 0.9714, "step": 261 }, { "epoch": 0.24828239753612888, "grad_norm": 1.1788267508576873, "learning_rate": 9.770696282000245e-06, "loss": 0.9251, "step": 262 }, { "epoch": 0.2492300402748164, "grad_norm": 1.2489815438864824, "learning_rate": 9.76839232810996e-06, "loss": 0.9126, "step": 263 }, { "epoch": 0.2501776830135039, "grad_norm": 1.3083502635920439, "learning_rate": 9.766077131784764e-06, "loss": 0.94, "step": 264 }, { "epoch": 0.2501776830135039, "eval_loss": 0.9628852605819702, "eval_runtime": 65.8683, "eval_samples_per_second": 41.416, "eval_steps_per_second": 0.653, "step": 264 }, { "epoch": 0.25112532575219143, "grad_norm": 1.2876315572259667, "learning_rate": 9.763750698483192e-06, "loss": 0.9824, "step": 265 }, { "epoch": 0.25207296849087896, "grad_norm": 1.4509050672400128, "learning_rate": 9.761413033690276e-06, "loss": 1.01, "step": 266 }, { "epoch": 0.25302061122956643, "grad_norm": 1.2615386437049756, "learning_rate": 9.759064142917526e-06, "loss": 0.9336, "step": 267 }, { "epoch": 0.25396825396825395, "grad_norm": 1.1835961624076299, "learning_rate": 9.756704031702919e-06, "loss": 0.9462, "step": 268 }, { "epoch": 0.2549158967069415, "grad_norm": 1.2900537501658034, "learning_rate": 9.75433270561089e-06, "loss": 0.9071, "step": 269 }, { "epoch": 0.255863539445629, "grad_norm": 1.138429016575903, "learning_rate": 9.75195017023231e-06, "loss": 0.8544, "step": 270 }, { "epoch": 0.25681118218431653, "grad_norm": 1.1853801438439096, "learning_rate": 9.74955643118448e-06, "loss": 0.92, "step": 271 }, { "epoch": 0.257758824923004, "grad_norm": 1.2610744856343499, "learning_rate": 9.74715149411112e-06, "loss": 0.9012, "step": 272 }, { "epoch": 0.25870646766169153, "grad_norm": 1.4706709692456896, "learning_rate": 9.744735364682347e-06, "loss": 0.9476, "step": 273 }, { "epoch": 0.25965411040037906, "grad_norm": 1.4148479481637295, "learning_rate": 9.742308048594665e-06, "loss": 0.9121, "step": 274 }, { "epoch": 0.2606017531390666, "grad_norm": 1.236422033348515, "learning_rate": 9.73986955157096e-06, "loss": 0.9135, "step": 275 }, { "epoch": 0.2615493958777541, "grad_norm": 1.1477317083396126, "learning_rate": 9.737419879360471e-06, "loss": 0.9516, "step": 276 }, { "epoch": 0.2624970386164416, "grad_norm": 2.5546186723319373, "learning_rate": 9.734959037738788e-06, "loss": 0.9422, "step": 277 }, { "epoch": 0.2634446813551291, "grad_norm": 1.3564480695771186, "learning_rate": 9.732487032507837e-06, "loss": 0.8961, "step": 278 }, { "epoch": 0.26439232409381663, "grad_norm": 1.4878738583178996, "learning_rate": 9.730003869495863e-06, "loss": 0.9457, "step": 279 }, { "epoch": 0.26533996683250416, "grad_norm": 1.1351790275971436, "learning_rate": 9.727509554557416e-06, "loss": 0.8766, "step": 280 }, { "epoch": 0.2662876095711917, "grad_norm": 1.3900072874584015, "learning_rate": 9.725004093573343e-06, "loss": 0.8972, "step": 281 }, { "epoch": 0.26723525230987916, "grad_norm": 1.1866023759013848, "learning_rate": 9.722487492450764e-06, "loss": 0.9335, "step": 282 }, { "epoch": 0.2681828950485667, "grad_norm": 1.2381217486697587, "learning_rate": 9.719959757123073e-06, "loss": 0.9083, "step": 283 }, { "epoch": 0.2691305377872542, "grad_norm": 1.6107373228302189, "learning_rate": 9.717420893549902e-06, "loss": 0.9913, "step": 284 }, { "epoch": 0.27007818052594174, "grad_norm": 1.3012559103471528, "learning_rate": 9.714870907717134e-06, "loss": 0.9384, "step": 285 }, { "epoch": 0.27102582326462926, "grad_norm": 1.3512266977948462, "learning_rate": 9.712309805636863e-06, "loss": 0.9738, "step": 286 }, { "epoch": 0.27102582326462926, "eval_loss": 0.9620270729064941, "eval_runtime": 59.315, "eval_samples_per_second": 45.992, "eval_steps_per_second": 0.725, "step": 286 }, { "epoch": 0.27197346600331673, "grad_norm": 1.1737111003693583, "learning_rate": 9.709737593347404e-06, "loss": 0.9669, "step": 287 }, { "epoch": 0.27292110874200426, "grad_norm": 1.158891062157781, "learning_rate": 9.707154276913255e-06, "loss": 0.9724, "step": 288 }, { "epoch": 0.2738687514806918, "grad_norm": 1.1818539669598636, "learning_rate": 9.704559862425101e-06, "loss": 0.9411, "step": 289 }, { "epoch": 0.2748163942193793, "grad_norm": 1.317223158403057, "learning_rate": 9.701954355999791e-06, "loss": 0.8897, "step": 290 }, { "epoch": 0.2757640369580668, "grad_norm": 1.2827511719089313, "learning_rate": 9.699337763780325e-06, "loss": 0.9062, "step": 291 }, { "epoch": 0.2767116796967543, "grad_norm": 1.28805108052852, "learning_rate": 9.696710091935842e-06, "loss": 0.9176, "step": 292 }, { "epoch": 0.27765932243544184, "grad_norm": 1.3367234242878245, "learning_rate": 9.6940713466616e-06, "loss": 0.9009, "step": 293 }, { "epoch": 0.27860696517412936, "grad_norm": 1.2541386047985268, "learning_rate": 9.691421534178966e-06, "loss": 0.9109, "step": 294 }, { "epoch": 0.2795546079128169, "grad_norm": 1.5026012491650225, "learning_rate": 9.688760660735403e-06, "loss": 0.9709, "step": 295 }, { "epoch": 0.28050225065150436, "grad_norm": 1.2922689184697398, "learning_rate": 9.68608873260445e-06, "loss": 0.8457, "step": 296 }, { "epoch": 0.2814498933901919, "grad_norm": 1.1843338944530994, "learning_rate": 9.683405756085708e-06, "loss": 0.9313, "step": 297 }, { "epoch": 0.2823975361288794, "grad_norm": 1.315466417029974, "learning_rate": 9.680711737504832e-06, "loss": 1.019, "step": 298 }, { "epoch": 0.28334517886756694, "grad_norm": 1.0199556490757884, "learning_rate": 9.678006683213503e-06, "loss": 0.8922, "step": 299 }, { "epoch": 0.28429282160625446, "grad_norm": 1.1400934246384171, "learning_rate": 9.675290599589429e-06, "loss": 0.908, "step": 300 }, { "epoch": 0.28524046434494194, "grad_norm": 1.8423074242848725, "learning_rate": 9.672563493036318e-06, "loss": 1.0065, "step": 301 }, { "epoch": 0.28618810708362946, "grad_norm": 1.1796939423622033, "learning_rate": 9.669825369983865e-06, "loss": 0.9303, "step": 302 }, { "epoch": 0.287135749822317, "grad_norm": 1.2479579843600068, "learning_rate": 9.667076236887743e-06, "loss": 1.0198, "step": 303 }, { "epoch": 0.2880833925610045, "grad_norm": 1.229386161002158, "learning_rate": 9.664316100229578e-06, "loss": 0.9328, "step": 304 }, { "epoch": 0.28903103529969204, "grad_norm": 1.354608076441114, "learning_rate": 9.661544966516945e-06, "loss": 0.8865, "step": 305 }, { "epoch": 0.2899786780383795, "grad_norm": 1.2733991556068809, "learning_rate": 9.658762842283343e-06, "loss": 0.9805, "step": 306 }, { "epoch": 0.29092632077706704, "grad_norm": 1.2495713583949597, "learning_rate": 9.655969734088184e-06, "loss": 0.9302, "step": 307 }, { "epoch": 0.29187396351575456, "grad_norm": 1.2103907414095358, "learning_rate": 9.653165648516777e-06, "loss": 0.885, "step": 308 }, { "epoch": 0.29187396351575456, "eval_loss": 0.9591483473777771, "eval_runtime": 68.3896, "eval_samples_per_second": 39.889, "eval_steps_per_second": 0.629, "step": 308 }, { "epoch": 0.2928216062544421, "grad_norm": 1.1956016894279018, "learning_rate": 9.650350592180312e-06, "loss": 0.9577, "step": 309 }, { "epoch": 0.2937692489931296, "grad_norm": 1.140247620602589, "learning_rate": 9.647524571715843e-06, "loss": 0.9264, "step": 310 }, { "epoch": 0.2947168917318171, "grad_norm": 1.2006266683263125, "learning_rate": 9.644687593786282e-06, "loss": 0.9792, "step": 311 }, { "epoch": 0.2956645344705046, "grad_norm": 1.2812673838645852, "learning_rate": 9.641839665080363e-06, "loss": 0.954, "step": 312 }, { "epoch": 0.29661217720919214, "grad_norm": 1.010846565968867, "learning_rate": 9.638980792312651e-06, "loss": 0.9515, "step": 313 }, { "epoch": 0.29755981994787967, "grad_norm": 1.508846485133625, "learning_rate": 9.636110982223505e-06, "loss": 0.9611, "step": 314 }, { "epoch": 0.29850746268656714, "grad_norm": 1.2091515162070219, "learning_rate": 9.633230241579075e-06, "loss": 0.8803, "step": 315 }, { "epoch": 0.29945510542525466, "grad_norm": 1.251566988747115, "learning_rate": 9.630338577171282e-06, "loss": 0.9102, "step": 316 }, { "epoch": 0.3004027481639422, "grad_norm": 1.4368558329637313, "learning_rate": 9.627435995817799e-06, "loss": 0.9681, "step": 317 }, { "epoch": 0.3013503909026297, "grad_norm": 1.2724580288581318, "learning_rate": 9.624522504362039e-06, "loss": 0.9714, "step": 318 }, { "epoch": 0.30229803364131724, "grad_norm": 1.2457801062593066, "learning_rate": 9.621598109673142e-06, "loss": 0.9663, "step": 319 }, { "epoch": 0.3032456763800047, "grad_norm": 1.5450412575397683, "learning_rate": 9.618662818645949e-06, "loss": 0.973, "step": 320 }, { "epoch": 0.30419331911869224, "grad_norm": 1.3301347899029445, "learning_rate": 9.615716638200993e-06, "loss": 0.9292, "step": 321 }, { "epoch": 0.30514096185737977, "grad_norm": 1.5045379413960773, "learning_rate": 9.612759575284483e-06, "loss": 0.9943, "step": 322 }, { "epoch": 0.3060886045960673, "grad_norm": 1.2146706034284283, "learning_rate": 9.60979163686828e-06, "loss": 0.8828, "step": 323 }, { "epoch": 0.3070362473347548, "grad_norm": 1.1864956541845377, "learning_rate": 9.606812829949896e-06, "loss": 0.92, "step": 324 }, { "epoch": 0.3079838900734423, "grad_norm": 1.41143117586689, "learning_rate": 9.603823161552459e-06, "loss": 0.9539, "step": 325 }, { "epoch": 0.3089315328121298, "grad_norm": 2.5914491078059796, "learning_rate": 9.600822638724704e-06, "loss": 0.9211, "step": 326 }, { "epoch": 0.30987917555081734, "grad_norm": 1.104156076330228, "learning_rate": 9.597811268540969e-06, "loss": 0.9148, "step": 327 }, { "epoch": 0.31082681828950487, "grad_norm": 1.1472423105684746, "learning_rate": 9.594789058101154e-06, "loss": 0.9518, "step": 328 }, { "epoch": 0.3117744610281924, "grad_norm": 1.1393816701130914, "learning_rate": 9.591756014530723e-06, "loss": 1.0076, "step": 329 }, { "epoch": 0.31272210376687987, "grad_norm": 1.2776861681261165, "learning_rate": 9.588712144980681e-06, "loss": 0.8784, "step": 330 }, { "epoch": 0.31272210376687987, "eval_loss": 0.9570937156677246, "eval_runtime": 68.898, "eval_samples_per_second": 39.595, "eval_steps_per_second": 0.624, "step": 330 }, { "epoch": 0.3136697465055674, "grad_norm": 1.192795131650072, "learning_rate": 9.585657456627557e-06, "loss": 0.9045, "step": 331 }, { "epoch": 0.3146173892442549, "grad_norm": 1.2042562619274322, "learning_rate": 9.582591956673387e-06, "loss": 0.9683, "step": 332 }, { "epoch": 0.31556503198294245, "grad_norm": 1.1444088880890944, "learning_rate": 9.579515652345699e-06, "loss": 0.8678, "step": 333 }, { "epoch": 0.31651267472162997, "grad_norm": 1.0769104211549974, "learning_rate": 9.57642855089749e-06, "loss": 0.9175, "step": 334 }, { "epoch": 0.31746031746031744, "grad_norm": 1.2380307581631063, "learning_rate": 9.57333065960722e-06, "loss": 0.9351, "step": 335 }, { "epoch": 0.31840796019900497, "grad_norm": 1.071043290520968, "learning_rate": 9.570221985778785e-06, "loss": 0.8855, "step": 336 }, { "epoch": 0.3193556029376925, "grad_norm": 1.1849521886922723, "learning_rate": 9.567102536741501e-06, "loss": 0.917, "step": 337 }, { "epoch": 0.32030324567638, "grad_norm": 1.20214216361167, "learning_rate": 9.563972319850092e-06, "loss": 0.9147, "step": 338 }, { "epoch": 0.3212508884150675, "grad_norm": 1.266949477776236, "learning_rate": 9.560831342484668e-06, "loss": 0.9383, "step": 339 }, { "epoch": 0.322198531153755, "grad_norm": 1.5670977324953559, "learning_rate": 9.557679612050708e-06, "loss": 1.0023, "step": 340 }, { "epoch": 0.32314617389244255, "grad_norm": 1.237648169383608, "learning_rate": 9.554517135979044e-06, "loss": 0.9671, "step": 341 }, { "epoch": 0.32409381663113007, "grad_norm": 1.0260918280422053, "learning_rate": 9.551343921725844e-06, "loss": 0.879, "step": 342 }, { "epoch": 0.3250414593698176, "grad_norm": 1.155124445578137, "learning_rate": 9.548159976772593e-06, "loss": 0.9416, "step": 343 }, { "epoch": 0.32598910210850507, "grad_norm": 1.1950689084580686, "learning_rate": 9.544965308626075e-06, "loss": 0.9, "step": 344 }, { "epoch": 0.3269367448471926, "grad_norm": 1.2849959856276705, "learning_rate": 9.541759924818358e-06, "loss": 0.9332, "step": 345 }, { "epoch": 0.3278843875858801, "grad_norm": 1.0302992790409418, "learning_rate": 9.538543832906773e-06, "loss": 0.9051, "step": 346 }, { "epoch": 0.32883203032456765, "grad_norm": 1.2345608543091064, "learning_rate": 9.535317040473895e-06, "loss": 0.9806, "step": 347 }, { "epoch": 0.3297796730632552, "grad_norm": 1.1665835041880899, "learning_rate": 9.532079555127532e-06, "loss": 0.9433, "step": 348 }, { "epoch": 0.33072731580194265, "grad_norm": 1.265860203994782, "learning_rate": 9.528831384500699e-06, "loss": 0.9776, "step": 349 }, { "epoch": 0.33167495854063017, "grad_norm": 1.293238505576827, "learning_rate": 9.525572536251608e-06, "loss": 1.0388, "step": 350 }, { "epoch": 0.3326226012793177, "grad_norm": 1.2363591052870795, "learning_rate": 9.52230301806364e-06, "loss": 0.9252, "step": 351 }, { "epoch": 0.3335702440180052, "grad_norm": 1.3748905848676085, "learning_rate": 9.519022837645337e-06, "loss": 0.8923, "step": 352 }, { "epoch": 0.3335702440180052, "eval_loss": 0.9540281891822815, "eval_runtime": 62.2753, "eval_samples_per_second": 43.805, "eval_steps_per_second": 0.69, "step": 352 }, { "epoch": 0.33451788675669275, "grad_norm": 1.1703557022342401, "learning_rate": 9.51573200273038e-06, "loss": 0.9791, "step": 353 }, { "epoch": 0.3354655294953802, "grad_norm": 1.3163659131319334, "learning_rate": 9.512430521077565e-06, "loss": 0.8974, "step": 354 }, { "epoch": 0.33641317223406775, "grad_norm": 1.1823387827110081, "learning_rate": 9.509118400470792e-06, "loss": 0.8668, "step": 355 }, { "epoch": 0.3373608149727553, "grad_norm": 1.0471543968324866, "learning_rate": 9.505795648719049e-06, "loss": 0.9248, "step": 356 }, { "epoch": 0.3383084577114428, "grad_norm": 1.2873543382804975, "learning_rate": 9.502462273656381e-06, "loss": 0.8897, "step": 357 }, { "epoch": 0.3392561004501303, "grad_norm": 1.2157109813891434, "learning_rate": 9.499118283141887e-06, "loss": 0.9304, "step": 358 }, { "epoch": 0.3402037431888178, "grad_norm": 1.093181377661525, "learning_rate": 9.495763685059689e-06, "loss": 0.9237, "step": 359 }, { "epoch": 0.3411513859275053, "grad_norm": 1.095774592001467, "learning_rate": 9.492398487318922e-06, "loss": 0.8669, "step": 360 }, { "epoch": 0.34209902866619285, "grad_norm": 1.1676179176818222, "learning_rate": 9.48902269785371e-06, "loss": 0.9338, "step": 361 }, { "epoch": 0.3430466714048804, "grad_norm": 1.082117155119373, "learning_rate": 9.485636324623147e-06, "loss": 0.9301, "step": 362 }, { "epoch": 0.34399431414356785, "grad_norm": 1.5869790381600608, "learning_rate": 9.482239375611282e-06, "loss": 0.8566, "step": 363 }, { "epoch": 0.3449419568822554, "grad_norm": 2.1300888287293436, "learning_rate": 9.478831858827105e-06, "loss": 0.9462, "step": 364 }, { "epoch": 0.3458895996209429, "grad_norm": 1.329321349965101, "learning_rate": 9.475413782304509e-06, "loss": 0.9344, "step": 365 }, { "epoch": 0.3468372423596304, "grad_norm": 3.4098937413401678, "learning_rate": 9.471985154102292e-06, "loss": 0.881, "step": 366 }, { "epoch": 0.34778488509831795, "grad_norm": 1.374583167993129, "learning_rate": 9.468545982304132e-06, "loss": 0.8899, "step": 367 }, { "epoch": 0.3487325278370054, "grad_norm": 1.2132880433358602, "learning_rate": 9.465096275018556e-06, "loss": 0.9016, "step": 368 }, { "epoch": 0.34968017057569295, "grad_norm": 1.132880559404501, "learning_rate": 9.461636040378941e-06, "loss": 0.9424, "step": 369 }, { "epoch": 0.3506278133143805, "grad_norm": 1.573588626293436, "learning_rate": 9.458165286543477e-06, "loss": 0.9758, "step": 370 }, { "epoch": 0.351575456053068, "grad_norm": 1.0016737529772646, "learning_rate": 9.454684021695157e-06, "loss": 0.9522, "step": 371 }, { "epoch": 0.35252309879175553, "grad_norm": 1.2060571666651005, "learning_rate": 9.451192254041759e-06, "loss": 0.8995, "step": 372 }, { "epoch": 0.353470741530443, "grad_norm": 1.5491588961886638, "learning_rate": 9.447689991815819e-06, "loss": 0.9497, "step": 373 }, { "epoch": 0.3544183842691305, "grad_norm": 2.323597523498367, "learning_rate": 9.444177243274619e-06, "loss": 0.9483, "step": 374 }, { "epoch": 0.3544183842691305, "eval_loss": 0.9546486139297485, "eval_runtime": 67.7741, "eval_samples_per_second": 40.251, "eval_steps_per_second": 0.634, "step": 374 }, { "epoch": 0.35536602700781805, "grad_norm": 1.1957751593867816, "learning_rate": 9.440654016700161e-06, "loss": 0.9069, "step": 375 }, { "epoch": 0.3563136697465056, "grad_norm": 1.3480198545553501, "learning_rate": 9.437120320399158e-06, "loss": 0.9206, "step": 376 }, { "epoch": 0.3572613124851931, "grad_norm": 1.1240947731641266, "learning_rate": 9.433576162703e-06, "loss": 0.9686, "step": 377 }, { "epoch": 0.3582089552238806, "grad_norm": 1.258961853327028, "learning_rate": 9.430021551967745e-06, "loss": 0.9156, "step": 378 }, { "epoch": 0.3591565979625681, "grad_norm": 1.1465674821214438, "learning_rate": 9.426456496574095e-06, "loss": 0.9027, "step": 379 }, { "epoch": 0.36010424070125563, "grad_norm": 1.334135631113088, "learning_rate": 9.422881004927383e-06, "loss": 0.9215, "step": 380 }, { "epoch": 0.36105188343994316, "grad_norm": 1.052076097463688, "learning_rate": 9.419295085457536e-06, "loss": 0.8708, "step": 381 }, { "epoch": 0.3619995261786307, "grad_norm": 1.3069872390381696, "learning_rate": 9.41569874661908e-06, "loss": 0.9392, "step": 382 }, { "epoch": 0.36294716891731815, "grad_norm": 1.1946541917496492, "learning_rate": 9.412091996891097e-06, "loss": 0.9186, "step": 383 }, { "epoch": 0.3638948116560057, "grad_norm": 1.130570319952377, "learning_rate": 9.408474844777218e-06, "loss": 0.9231, "step": 384 }, { "epoch": 0.3648424543946932, "grad_norm": 1.230122090333074, "learning_rate": 9.4048472988056e-06, "loss": 1.0082, "step": 385 }, { "epoch": 0.36579009713338073, "grad_norm": 1.0720696634128188, "learning_rate": 9.401209367528907e-06, "loss": 0.9291, "step": 386 }, { "epoch": 0.36673773987206826, "grad_norm": 1.1723709465115237, "learning_rate": 9.397561059524285e-06, "loss": 0.9175, "step": 387 }, { "epoch": 0.36768538261075573, "grad_norm": 1.5238004908651446, "learning_rate": 9.393902383393347e-06, "loss": 0.9621, "step": 388 }, { "epoch": 0.36863302534944326, "grad_norm": 1.0814097944853873, "learning_rate": 9.39023334776215e-06, "loss": 0.9293, "step": 389 }, { "epoch": 0.3695806680881308, "grad_norm": 1.1650689858883694, "learning_rate": 9.386553961281179e-06, "loss": 0.9582, "step": 390 }, { "epoch": 0.3705283108268183, "grad_norm": 1.2458078695599824, "learning_rate": 9.382864232625321e-06, "loss": 0.9581, "step": 391 }, { "epoch": 0.3714759535655058, "grad_norm": 1.339036266204836, "learning_rate": 9.379164170493844e-06, "loss": 0.8931, "step": 392 }, { "epoch": 0.3724235963041933, "grad_norm": 1.0125589713218854, "learning_rate": 9.375453783610381e-06, "loss": 0.9012, "step": 393 }, { "epoch": 0.37337123904288083, "grad_norm": 1.0329885700845731, "learning_rate": 9.371733080722911e-06, "loss": 0.8628, "step": 394 }, { "epoch": 0.37431888178156836, "grad_norm": 1.439005100467098, "learning_rate": 9.368002070603731e-06, "loss": 0.8827, "step": 395 }, { "epoch": 0.3752665245202559, "grad_norm": 1.0085800308385358, "learning_rate": 9.36426076204944e-06, "loss": 0.8743, "step": 396 }, { "epoch": 0.3752665245202559, "eval_loss": 0.9504217505455017, "eval_runtime": 66.3641, "eval_samples_per_second": 41.107, "eval_steps_per_second": 0.648, "step": 396 }, { "epoch": 0.37621416725894335, "grad_norm": 1.3876480177466899, "learning_rate": 9.36050916388092e-06, "loss": 0.9472, "step": 397 }, { "epoch": 0.3771618099976309, "grad_norm": 1.1708472397542733, "learning_rate": 9.35674728494331e-06, "loss": 0.9283, "step": 398 }, { "epoch": 0.3781094527363184, "grad_norm": 1.0918645378867784, "learning_rate": 9.35297513410599e-06, "loss": 0.8862, "step": 399 }, { "epoch": 0.37905709547500593, "grad_norm": 0.9955698293935606, "learning_rate": 9.349192720262556e-06, "loss": 0.8965, "step": 400 }, { "epoch": 0.38000473821369346, "grad_norm": 1.305075253905476, "learning_rate": 9.345400052330802e-06, "loss": 0.8806, "step": 401 }, { "epoch": 0.38095238095238093, "grad_norm": 0.99053214435014, "learning_rate": 9.341597139252698e-06, "loss": 1.0084, "step": 402 }, { "epoch": 0.38190002369106846, "grad_norm": 1.3393098226066853, "learning_rate": 9.337783989994371e-06, "loss": 0.9356, "step": 403 }, { "epoch": 0.382847666429756, "grad_norm": 1.01675988520605, "learning_rate": 9.333960613546079e-06, "loss": 0.8987, "step": 404 }, { "epoch": 0.3837953091684435, "grad_norm": 1.1624028341398043, "learning_rate": 9.330127018922195e-06, "loss": 0.8895, "step": 405 }, { "epoch": 0.38474295190713104, "grad_norm": 1.2237993995607808, "learning_rate": 9.326283215161177e-06, "loss": 0.8879, "step": 406 }, { "epoch": 0.3856905946458185, "grad_norm": 1.2652821842567468, "learning_rate": 9.322429211325567e-06, "loss": 0.8893, "step": 407 }, { "epoch": 0.38663823738450603, "grad_norm": 1.1025981014314234, "learning_rate": 9.31856501650194e-06, "loss": 0.9746, "step": 408 }, { "epoch": 0.38758588012319356, "grad_norm": 3.9246963935175763, "learning_rate": 9.314690639800906e-06, "loss": 0.9352, "step": 409 }, { "epoch": 0.3885335228618811, "grad_norm": 1.1259828261006313, "learning_rate": 9.310806090357083e-06, "loss": 0.9083, "step": 410 }, { "epoch": 0.3894811656005686, "grad_norm": 1.2708123609203328, "learning_rate": 9.306911377329067e-06, "loss": 0.9167, "step": 411 }, { "epoch": 0.3904288083392561, "grad_norm": 2.090787985556849, "learning_rate": 9.30300650989942e-06, "loss": 0.9976, "step": 412 }, { "epoch": 0.3913764510779436, "grad_norm": 1.036822985622544, "learning_rate": 9.299091497274647e-06, "loss": 1.002, "step": 413 }, { "epoch": 0.39232409381663114, "grad_norm": 1.08027979908674, "learning_rate": 9.295166348685169e-06, "loss": 0.883, "step": 414 }, { "epoch": 0.39327173655531866, "grad_norm": 1.3152353131345889, "learning_rate": 9.291231073385306e-06, "loss": 0.9368, "step": 415 }, { "epoch": 0.39421937929400613, "grad_norm": 1.3362457501149774, "learning_rate": 9.287285680653254e-06, "loss": 0.9923, "step": 416 }, { "epoch": 0.39516702203269366, "grad_norm": 1.1378299427204326, "learning_rate": 9.283330179791063e-06, "loss": 0.9013, "step": 417 }, { "epoch": 0.3961146647713812, "grad_norm": 0.9698571591778787, "learning_rate": 9.279364580124615e-06, "loss": 0.8294, "step": 418 }, { "epoch": 0.3961146647713812, "eval_loss": 0.9494832754135132, "eval_runtime": 62.1923, "eval_samples_per_second": 43.864, "eval_steps_per_second": 0.691, "step": 418 }, { "epoch": 0.3970623075100687, "grad_norm": 1.2329603475368258, "learning_rate": 9.275388891003596e-06, "loss": 0.9132, "step": 419 }, { "epoch": 0.39800995024875624, "grad_norm": 1.0253483109899053, "learning_rate": 9.271403121801492e-06, "loss": 0.9966, "step": 420 }, { "epoch": 0.3989575929874437, "grad_norm": 1.1122937106526114, "learning_rate": 9.267407281915541e-06, "loss": 0.8949, "step": 421 }, { "epoch": 0.39990523572613124, "grad_norm": 1.0623316599321453, "learning_rate": 9.263401380766739e-06, "loss": 0.9192, "step": 422 }, { "epoch": 0.40085287846481876, "grad_norm": 1.109212522270619, "learning_rate": 9.25938542779979e-06, "loss": 0.9212, "step": 423 }, { "epoch": 0.4018005212035063, "grad_norm": 1.1148931056175715, "learning_rate": 9.255359432483106e-06, "loss": 0.8824, "step": 424 }, { "epoch": 0.4027481639421938, "grad_norm": 1.469688611294437, "learning_rate": 9.251323404308774e-06, "loss": 0.8941, "step": 425 }, { "epoch": 0.4036958066808813, "grad_norm": 1.1366864229617593, "learning_rate": 9.247277352792534e-06, "loss": 0.9542, "step": 426 }, { "epoch": 0.4046434494195688, "grad_norm": 1.2380214332997066, "learning_rate": 9.243221287473755e-06, "loss": 0.9417, "step": 427 }, { "epoch": 0.40559109215825634, "grad_norm": 1.292587978067118, "learning_rate": 9.239155217915422e-06, "loss": 0.9531, "step": 428 }, { "epoch": 0.40653873489694387, "grad_norm": 1.1996181211866257, "learning_rate": 9.235079153704108e-06, "loss": 0.993, "step": 429 }, { "epoch": 0.4074863776356314, "grad_norm": 1.7152618861500344, "learning_rate": 9.23099310444994e-06, "loss": 0.88, "step": 430 }, { "epoch": 0.40843402037431886, "grad_norm": 1.236710405113469, "learning_rate": 9.226897079786594e-06, "loss": 0.8924, "step": 431 }, { "epoch": 0.4093816631130064, "grad_norm": 1.026683565261258, "learning_rate": 9.222791089371266e-06, "loss": 0.8627, "step": 432 }, { "epoch": 0.4103293058516939, "grad_norm": 1.0752239634813958, "learning_rate": 9.218675142884648e-06, "loss": 0.9457, "step": 433 }, { "epoch": 0.41127694859038144, "grad_norm": 1.1942159706425186, "learning_rate": 9.214549250030899e-06, "loss": 0.9697, "step": 434 }, { "epoch": 0.41222459132906897, "grad_norm": 1.302875719838314, "learning_rate": 9.210413420537638e-06, "loss": 0.9266, "step": 435 }, { "epoch": 0.41317223406775644, "grad_norm": 1.2858086492476544, "learning_rate": 9.206267664155906e-06, "loss": 0.8556, "step": 436 }, { "epoch": 0.41411987680644397, "grad_norm": 1.2092507326298383, "learning_rate": 9.20211199066015e-06, "loss": 0.8873, "step": 437 }, { "epoch": 0.4150675195451315, "grad_norm": 1.0641345729826912, "learning_rate": 9.197946409848196e-06, "loss": 0.927, "step": 438 }, { "epoch": 0.416015162283819, "grad_norm": 0.9922730475025484, "learning_rate": 9.19377093154123e-06, "loss": 0.8922, "step": 439 }, { "epoch": 0.4169628050225065, "grad_norm": 1.1994954411324383, "learning_rate": 9.189585565583779e-06, "loss": 0.934, "step": 440 }, { "epoch": 0.4169628050225065, "eval_loss": 0.9466658234596252, "eval_runtime": 64.5961, "eval_samples_per_second": 42.232, "eval_steps_per_second": 0.666, "step": 440 }, { "epoch": 0.417910447761194, "grad_norm": 1.2490962663664558, "learning_rate": 9.185390321843673e-06, "loss": 0.901, "step": 441 }, { "epoch": 0.41885809049988154, "grad_norm": 1.015254380962658, "learning_rate": 9.181185210212034e-06, "loss": 0.9519, "step": 442 }, { "epoch": 0.41980573323856907, "grad_norm": 1.1895181384960887, "learning_rate": 9.176970240603253e-06, "loss": 0.8807, "step": 443 }, { "epoch": 0.4207533759772566, "grad_norm": 1.3706219828971085, "learning_rate": 9.172745422954961e-06, "loss": 0.9148, "step": 444 }, { "epoch": 0.42170101871594406, "grad_norm": 1.0379378858579145, "learning_rate": 9.168510767228008e-06, "loss": 0.9468, "step": 445 }, { "epoch": 0.4226486614546316, "grad_norm": 1.2178466709823097, "learning_rate": 9.164266283406433e-06, "loss": 0.9242, "step": 446 }, { "epoch": 0.4235963041933191, "grad_norm": 1.2808190385423623, "learning_rate": 9.160011981497458e-06, "loss": 0.8654, "step": 447 }, { "epoch": 0.42454394693200664, "grad_norm": 1.250260948302257, "learning_rate": 9.155747871531444e-06, "loss": 0.9284, "step": 448 }, { "epoch": 0.42549158967069417, "grad_norm": 1.2672376071125921, "learning_rate": 9.151473963561884e-06, "loss": 0.9568, "step": 449 }, { "epoch": 0.42643923240938164, "grad_norm": 1.0461013649789057, "learning_rate": 9.147190267665361e-06, "loss": 0.8883, "step": 450 }, { "epoch": 0.42738687514806917, "grad_norm": 1.1516556206793171, "learning_rate": 9.142896793941546e-06, "loss": 0.9596, "step": 451 }, { "epoch": 0.4283345178867567, "grad_norm": 1.1510780017093964, "learning_rate": 9.13859355251316e-06, "loss": 0.9444, "step": 452 }, { "epoch": 0.4292821606254442, "grad_norm": 0.9978574311141366, "learning_rate": 9.134280553525946e-06, "loss": 0.8698, "step": 453 }, { "epoch": 0.43022980336413175, "grad_norm": 1.0518208149889676, "learning_rate": 9.129957807148666e-06, "loss": 0.8508, "step": 454 }, { "epoch": 0.4311774461028192, "grad_norm": 1.0777071914790497, "learning_rate": 9.12562532357305e-06, "loss": 0.9219, "step": 455 }, { "epoch": 0.43212508884150674, "grad_norm": 1.3003109116219143, "learning_rate": 9.121283113013794e-06, "loss": 0.9354, "step": 456 }, { "epoch": 0.43307273158019427, "grad_norm": 1.231896880939342, "learning_rate": 9.116931185708523e-06, "loss": 0.8797, "step": 457 }, { "epoch": 0.4340203743188818, "grad_norm": 1.167418023483012, "learning_rate": 9.112569551917773e-06, "loss": 0.9122, "step": 458 }, { "epoch": 0.4349680170575693, "grad_norm": 1.2433163300824168, "learning_rate": 9.108198221924966e-06, "loss": 0.9241, "step": 459 }, { "epoch": 0.4359156597962568, "grad_norm": 1.2957389966436808, "learning_rate": 9.103817206036383e-06, "loss": 0.9653, "step": 460 }, { "epoch": 0.4368633025349443, "grad_norm": 1.1967614874308203, "learning_rate": 9.09942651458114e-06, "loss": 0.9555, "step": 461 }, { "epoch": 0.43781094527363185, "grad_norm": 1.0311787596301678, "learning_rate": 9.095026157911166e-06, "loss": 0.8532, "step": 462 }, { "epoch": 0.43781094527363185, "eval_loss": 0.9448354840278625, "eval_runtime": 63.5673, "eval_samples_per_second": 42.915, "eval_steps_per_second": 0.676, "step": 462 }, { "epoch": 0.4387585880123194, "grad_norm": 1.1614684984564378, "learning_rate": 9.090616146401183e-06, "loss": 0.911, "step": 463 }, { "epoch": 0.43970623075100684, "grad_norm": 1.1848933141897011, "learning_rate": 9.086196490448668e-06, "loss": 0.8495, "step": 464 }, { "epoch": 0.44065387348969437, "grad_norm": 1.0920125977106059, "learning_rate": 9.081767200473842e-06, "loss": 0.9195, "step": 465 }, { "epoch": 0.4416015162283819, "grad_norm": 1.0487746428767522, "learning_rate": 9.077328286919638e-06, "loss": 0.8775, "step": 466 }, { "epoch": 0.4425491589670694, "grad_norm": 1.0480719750913268, "learning_rate": 9.07287976025168e-06, "loss": 0.8879, "step": 467 }, { "epoch": 0.44349680170575695, "grad_norm": 1.156105288349571, "learning_rate": 9.068421630958254e-06, "loss": 0.9004, "step": 468 }, { "epoch": 0.4444444444444444, "grad_norm": 1.1479660233621711, "learning_rate": 9.063953909550289e-06, "loss": 0.9652, "step": 469 }, { "epoch": 0.44539208718313195, "grad_norm": 1.158618048287916, "learning_rate": 9.059476606561328e-06, "loss": 0.8643, "step": 470 }, { "epoch": 0.4463397299218195, "grad_norm": 1.1045055506935484, "learning_rate": 9.054989732547507e-06, "loss": 0.8307, "step": 471 }, { "epoch": 0.447287372660507, "grad_norm": 1.189869710423804, "learning_rate": 9.050493298087523e-06, "loss": 0.8693, "step": 472 }, { "epoch": 0.4482350153991945, "grad_norm": 1.5017065849353626, "learning_rate": 9.045987313782616e-06, "loss": 0.8868, "step": 473 }, { "epoch": 0.449182658137882, "grad_norm": 1.288348522111584, "learning_rate": 9.041471790256543e-06, "loss": 0.9984, "step": 474 }, { "epoch": 0.4501303008765695, "grad_norm": 1.3428427133159277, "learning_rate": 9.036946738155548e-06, "loss": 0.9328, "step": 475 }, { "epoch": 0.45107794361525705, "grad_norm": 0.9887938032536074, "learning_rate": 9.032412168148345e-06, "loss": 0.9483, "step": 476 }, { "epoch": 0.4520255863539446, "grad_norm": 1.0713968856815155, "learning_rate": 9.027868090926088e-06, "loss": 0.8861, "step": 477 }, { "epoch": 0.4529732290926321, "grad_norm": 1.162032207786328, "learning_rate": 9.023314517202341e-06, "loss": 0.9014, "step": 478 }, { "epoch": 0.4539208718313196, "grad_norm": 1.135173292644661, "learning_rate": 9.018751457713062e-06, "loss": 0.882, "step": 479 }, { "epoch": 0.4548685145700071, "grad_norm": 1.2191006204661359, "learning_rate": 9.014178923216572e-06, "loss": 0.8936, "step": 480 }, { "epoch": 0.4558161573086946, "grad_norm": 1.1422417367554563, "learning_rate": 9.009596924493536e-06, "loss": 0.9046, "step": 481 }, { "epoch": 0.45676380004738215, "grad_norm": 1.0960107607325966, "learning_rate": 9.005005472346923e-06, "loss": 0.8608, "step": 482 }, { "epoch": 0.4577114427860697, "grad_norm": 1.2860608689094808, "learning_rate": 9.000404577602003e-06, "loss": 0.92, "step": 483 }, { "epoch": 0.45865908552475715, "grad_norm": 1.148989084195761, "learning_rate": 8.995794251106295e-06, "loss": 0.9675, "step": 484 }, { "epoch": 0.45865908552475715, "eval_loss": 0.9426133632659912, "eval_runtime": 62.3901, "eval_samples_per_second": 43.725, "eval_steps_per_second": 0.689, "step": 484 }, { "epoch": 0.4596067282634447, "grad_norm": 1.1715395816498915, "learning_rate": 8.991174503729567e-06, "loss": 0.9505, "step": 485 }, { "epoch": 0.4605543710021322, "grad_norm": 1.1418428811721806, "learning_rate": 8.986545346363792e-06, "loss": 0.9194, "step": 486 }, { "epoch": 0.4615020137408197, "grad_norm": 1.2704284828900592, "learning_rate": 8.98190678992313e-06, "loss": 0.9404, "step": 487 }, { "epoch": 0.4624496564795072, "grad_norm": 1.4180260493906214, "learning_rate": 8.977258845343904e-06, "loss": 0.8881, "step": 488 }, { "epoch": 0.4633972992181947, "grad_norm": 1.4745602251152343, "learning_rate": 8.97260152358457e-06, "loss": 0.8991, "step": 489 }, { "epoch": 0.46434494195688225, "grad_norm": 1.5516611931425326, "learning_rate": 8.96793483562569e-06, "loss": 0.8868, "step": 490 }, { "epoch": 0.4652925846955698, "grad_norm": 1.1672873798559753, "learning_rate": 8.963258792469908e-06, "loss": 0.9032, "step": 491 }, { "epoch": 0.4662402274342573, "grad_norm": 0.9800479447492024, "learning_rate": 8.958573405141932e-06, "loss": 0.8875, "step": 492 }, { "epoch": 0.4671878701729448, "grad_norm": 1.3344568834573243, "learning_rate": 8.953878684688492e-06, "loss": 0.8834, "step": 493 }, { "epoch": 0.4681355129116323, "grad_norm": 1.0491821400957775, "learning_rate": 8.949174642178333e-06, "loss": 0.9002, "step": 494 }, { "epoch": 0.4690831556503198, "grad_norm": 1.237676770681135, "learning_rate": 8.944461288702166e-06, "loss": 0.8832, "step": 495 }, { "epoch": 0.47003079838900735, "grad_norm": 1.2759707423338387, "learning_rate": 8.939738635372664e-06, "loss": 0.8949, "step": 496 }, { "epoch": 0.4709784411276949, "grad_norm": 1.1263638492681127, "learning_rate": 8.935006693324423e-06, "loss": 0.8969, "step": 497 }, { "epoch": 0.47192608386638235, "grad_norm": 1.154527093025846, "learning_rate": 8.930265473713939e-06, "loss": 0.8759, "step": 498 }, { "epoch": 0.4728737266050699, "grad_norm": 1.2033690454214934, "learning_rate": 8.92551498771958e-06, "loss": 0.9447, "step": 499 }, { "epoch": 0.4738213693437574, "grad_norm": 1.188345342085479, "learning_rate": 8.920755246541563e-06, "loss": 0.9698, "step": 500 }, { "epoch": 0.47476901208244493, "grad_norm": 1.1460258736111513, "learning_rate": 8.91598626140193e-06, "loss": 0.8861, "step": 501 }, { "epoch": 0.47571665482113246, "grad_norm": 1.0983544593959635, "learning_rate": 8.911208043544513e-06, "loss": 0.9099, "step": 502 }, { "epoch": 0.4766642975598199, "grad_norm": 1.2526221170984964, "learning_rate": 8.906420604234908e-06, "loss": 0.9153, "step": 503 }, { "epoch": 0.47761194029850745, "grad_norm": 1.4378576625792787, "learning_rate": 8.90162395476046e-06, "loss": 0.9098, "step": 504 }, { "epoch": 0.478559583037195, "grad_norm": 1.021190259086082, "learning_rate": 8.896818106430225e-06, "loss": 0.9201, "step": 505 }, { "epoch": 0.4795072257758825, "grad_norm": 1.2166947590641954, "learning_rate": 8.89200307057495e-06, "loss": 0.9498, "step": 506 }, { "epoch": 0.4795072257758825, "eval_loss": 0.9415593147277832, "eval_runtime": 63.0823, "eval_samples_per_second": 43.245, "eval_steps_per_second": 0.682, "step": 506 }, { "epoch": 0.48045486851457003, "grad_norm": 1.099897475733057, "learning_rate": 8.887178858547039e-06, "loss": 0.8785, "step": 507 }, { "epoch": 0.4814025112532575, "grad_norm": 1.1053789477176734, "learning_rate": 8.882345481720533e-06, "loss": 0.9781, "step": 508 }, { "epoch": 0.48235015399194503, "grad_norm": 1.2550219679746741, "learning_rate": 8.877502951491083e-06, "loss": 0.9175, "step": 509 }, { "epoch": 0.48329779673063256, "grad_norm": 1.035777482131784, "learning_rate": 8.872651279275917e-06, "loss": 0.9394, "step": 510 }, { "epoch": 0.4842454394693201, "grad_norm": 1.1823889985534881, "learning_rate": 8.867790476513818e-06, "loss": 0.8619, "step": 511 }, { "epoch": 0.4851930822080076, "grad_norm": 1.0806837978842365, "learning_rate": 8.862920554665098e-06, "loss": 0.8847, "step": 512 }, { "epoch": 0.4861407249466951, "grad_norm": 1.1417084903673171, "learning_rate": 8.858041525211569e-06, "loss": 0.8984, "step": 513 }, { "epoch": 0.4870883676853826, "grad_norm": 1.046685136616654, "learning_rate": 8.853153399656513e-06, "loss": 0.9343, "step": 514 }, { "epoch": 0.48803601042407013, "grad_norm": 1.1600934932807847, "learning_rate": 8.848256189524661e-06, "loss": 0.903, "step": 515 }, { "epoch": 0.48898365316275766, "grad_norm": 0.9999805389325372, "learning_rate": 8.843349906362163e-06, "loss": 0.9087, "step": 516 }, { "epoch": 0.48993129590144513, "grad_norm": 1.1693797728638526, "learning_rate": 8.838434561736556e-06, "loss": 0.9083, "step": 517 }, { "epoch": 0.49087893864013266, "grad_norm": 1.1372932570585796, "learning_rate": 8.833510167236747e-06, "loss": 0.9713, "step": 518 }, { "epoch": 0.4918265813788202, "grad_norm": 1.0947618440390705, "learning_rate": 8.828576734472975e-06, "loss": 0.8689, "step": 519 }, { "epoch": 0.4927742241175077, "grad_norm": 1.1318492632095214, "learning_rate": 8.823634275076792e-06, "loss": 0.8625, "step": 520 }, { "epoch": 0.49372186685619524, "grad_norm": 1.3142475847243504, "learning_rate": 8.818682800701028e-06, "loss": 0.8914, "step": 521 }, { "epoch": 0.4946695095948827, "grad_norm": 1.0542269379359606, "learning_rate": 8.813722323019774e-06, "loss": 0.9204, "step": 522 }, { "epoch": 0.49561715233357023, "grad_norm": 1.2759846986205978, "learning_rate": 8.808752853728341e-06, "loss": 0.9044, "step": 523 }, { "epoch": 0.49656479507225776, "grad_norm": 1.0846144562638056, "learning_rate": 8.803774404543246e-06, "loss": 0.9123, "step": 524 }, { "epoch": 0.4975124378109453, "grad_norm": 1.1086474451297028, "learning_rate": 8.798786987202175e-06, "loss": 0.9293, "step": 525 }, { "epoch": 0.4984600805496328, "grad_norm": 0.9413825393223179, "learning_rate": 8.793790613463956e-06, "loss": 0.8654, "step": 526 }, { "epoch": 0.4994077232883203, "grad_norm": 1.1832807749456735, "learning_rate": 8.788785295108536e-06, "loss": 0.8636, "step": 527 }, { "epoch": 0.5003553660270078, "grad_norm": 1.0977629074376605, "learning_rate": 8.783771043936949e-06, "loss": 0.8765, "step": 528 }, { "epoch": 0.5003553660270078, "eval_loss": 0.941301167011261, "eval_runtime": 61.1844, "eval_samples_per_second": 44.587, "eval_steps_per_second": 0.703, "step": 528 }, { "epoch": 0.5013030087656953, "grad_norm": 1.146767921801711, "learning_rate": 8.778747871771293e-06, "loss": 0.8989, "step": 529 }, { "epoch": 0.5022506515043829, "grad_norm": 1.2639703543113263, "learning_rate": 8.773715790454695e-06, "loss": 0.9151, "step": 530 }, { "epoch": 0.5031982942430704, "grad_norm": 1.113218960186029, "learning_rate": 8.768674811851293e-06, "loss": 0.8692, "step": 531 }, { "epoch": 0.5041459369817579, "grad_norm": 0.9991478843453905, "learning_rate": 8.763624947846195e-06, "loss": 0.8764, "step": 532 }, { "epoch": 0.5050935797204454, "grad_norm": 1.1051839359484277, "learning_rate": 8.758566210345464e-06, "loss": 0.9142, "step": 533 }, { "epoch": 0.5060412224591329, "grad_norm": 1.5864593937619376, "learning_rate": 8.75349861127608e-06, "loss": 0.9167, "step": 534 }, { "epoch": 0.5069888651978204, "grad_norm": 1.0055893256047008, "learning_rate": 8.748422162585915e-06, "loss": 0.9583, "step": 535 }, { "epoch": 0.5079365079365079, "grad_norm": 1.1419438277764564, "learning_rate": 8.743336876243712e-06, "loss": 0.8847, "step": 536 }, { "epoch": 0.5088841506751954, "grad_norm": 1.1093929329894858, "learning_rate": 8.738242764239046e-06, "loss": 0.9657, "step": 537 }, { "epoch": 0.509831793413883, "grad_norm": 1.0924153336293334, "learning_rate": 8.733139838582299e-06, "loss": 0.9452, "step": 538 }, { "epoch": 0.5107794361525705, "grad_norm": 1.100904420569305, "learning_rate": 8.728028111304639e-06, "loss": 0.8705, "step": 539 }, { "epoch": 0.511727078891258, "grad_norm": 1.0377959902393181, "learning_rate": 8.722907594457975e-06, "loss": 0.9021, "step": 540 }, { "epoch": 0.5126747216299455, "grad_norm": 1.3028881798201601, "learning_rate": 8.717778300114952e-06, "loss": 0.9004, "step": 541 }, { "epoch": 0.5136223643686331, "grad_norm": 1.2219633113574593, "learning_rate": 8.712640240368899e-06, "loss": 0.9146, "step": 542 }, { "epoch": 0.5145700071073206, "grad_norm": 1.16735139559823, "learning_rate": 8.707493427333817e-06, "loss": 0.9336, "step": 543 }, { "epoch": 0.515517649846008, "grad_norm": 1.1223934613953974, "learning_rate": 8.702337873144343e-06, "loss": 0.8959, "step": 544 }, { "epoch": 0.5164652925846955, "grad_norm": 1.0381379384688154, "learning_rate": 8.697173589955724e-06, "loss": 0.9147, "step": 545 }, { "epoch": 0.5174129353233831, "grad_norm": 1.071551123667491, "learning_rate": 8.692000589943785e-06, "loss": 0.8713, "step": 546 }, { "epoch": 0.5183605780620706, "grad_norm": 1.1572023966023732, "learning_rate": 8.686818885304907e-06, "loss": 0.9468, "step": 547 }, { "epoch": 0.5193082208007581, "grad_norm": 1.0966755633051661, "learning_rate": 8.681628488255986e-06, "loss": 0.9746, "step": 548 }, { "epoch": 0.5202558635394456, "grad_norm": 1.0054623347539213, "learning_rate": 8.676429411034423e-06, "loss": 0.889, "step": 549 }, { "epoch": 0.5212035062781332, "grad_norm": 1.0688410228225136, "learning_rate": 8.671221665898074e-06, "loss": 0.8986, "step": 550 }, { "epoch": 0.5212035062781332, "eval_loss": 0.9385759234428406, "eval_runtime": 60.7338, "eval_samples_per_second": 44.917, "eval_steps_per_second": 0.708, "step": 550 }, { "epoch": 0.5221511490168207, "grad_norm": 1.1912290308637075, "learning_rate": 8.666005265125238e-06, "loss": 0.9032, "step": 551 }, { "epoch": 0.5230987917555082, "grad_norm": 1.0819840961903495, "learning_rate": 8.660780221014617e-06, "loss": 0.9549, "step": 552 }, { "epoch": 0.5240464344941956, "grad_norm": 1.584365865940181, "learning_rate": 8.655546545885294e-06, "loss": 0.9895, "step": 553 }, { "epoch": 0.5249940772328832, "grad_norm": 1.5230791449620116, "learning_rate": 8.650304252076704e-06, "loss": 0.9359, "step": 554 }, { "epoch": 0.5259417199715707, "grad_norm": 1.2812899118028946, "learning_rate": 8.645053351948594e-06, "loss": 0.8863, "step": 555 }, { "epoch": 0.5268893627102582, "grad_norm": 1.1090479481617728, "learning_rate": 8.63979385788101e-06, "loss": 0.9549, "step": 556 }, { "epoch": 0.5278370054489457, "grad_norm": 1.0243309497173194, "learning_rate": 8.63452578227426e-06, "loss": 0.8837, "step": 557 }, { "epoch": 0.5287846481876333, "grad_norm": 1.1652281440552321, "learning_rate": 8.629249137548873e-06, "loss": 0.8833, "step": 558 }, { "epoch": 0.5297322909263208, "grad_norm": 1.0941817825792766, "learning_rate": 8.6239639361456e-06, "loss": 0.9423, "step": 559 }, { "epoch": 0.5306799336650083, "grad_norm": 1.2574492154883083, "learning_rate": 8.61867019052535e-06, "loss": 0.9524, "step": 560 }, { "epoch": 0.5316275764036958, "grad_norm": 1.1528975788038949, "learning_rate": 8.613367913169188e-06, "loss": 0.8843, "step": 561 }, { "epoch": 0.5325752191423834, "grad_norm": 1.260334993982276, "learning_rate": 8.608057116578283e-06, "loss": 0.9527, "step": 562 }, { "epoch": 0.5335228618810708, "grad_norm": 1.0336321970328701, "learning_rate": 8.602737813273901e-06, "loss": 0.885, "step": 563 }, { "epoch": 0.5344705046197583, "grad_norm": 1.4071128107796536, "learning_rate": 8.597410015797358e-06, "loss": 0.9056, "step": 564 }, { "epoch": 0.5354181473584458, "grad_norm": 1.3243499763253614, "learning_rate": 8.592073736709996e-06, "loss": 0.9816, "step": 565 }, { "epoch": 0.5363657900971334, "grad_norm": 1.0252110946238864, "learning_rate": 8.586728988593158e-06, "loss": 0.8939, "step": 566 }, { "epoch": 0.5373134328358209, "grad_norm": 1.5674480203253196, "learning_rate": 8.581375784048154e-06, "loss": 0.8716, "step": 567 }, { "epoch": 0.5382610755745084, "grad_norm": 1.3373495536241256, "learning_rate": 8.576014135696227e-06, "loss": 0.9189, "step": 568 }, { "epoch": 0.539208718313196, "grad_norm": 1.0083923948069164, "learning_rate": 8.570644056178533e-06, "loss": 0.8696, "step": 569 }, { "epoch": 0.5401563610518835, "grad_norm": 1.134010279426964, "learning_rate": 8.565265558156101e-06, "loss": 0.9171, "step": 570 }, { "epoch": 0.541104003790571, "grad_norm": 1.0122940996397913, "learning_rate": 8.559878654309818e-06, "loss": 0.8536, "step": 571 }, { "epoch": 0.5420516465292585, "grad_norm": 1.0417709805855406, "learning_rate": 8.554483357340379e-06, "loss": 0.8757, "step": 572 }, { "epoch": 0.5420516465292585, "eval_loss": 0.9370559453964233, "eval_runtime": 65.2018, "eval_samples_per_second": 41.839, "eval_steps_per_second": 0.659, "step": 572 }, { "epoch": 0.5429992892679459, "grad_norm": 1.098518656213201, "learning_rate": 8.549079679968272e-06, "loss": 0.8879, "step": 573 }, { "epoch": 0.5439469320066335, "grad_norm": 1.212951157381051, "learning_rate": 8.543667634933743e-06, "loss": 0.8697, "step": 574 }, { "epoch": 0.544894574745321, "grad_norm": 1.3330907351600239, "learning_rate": 8.538247234996766e-06, "loss": 0.8615, "step": 575 }, { "epoch": 0.5458422174840085, "grad_norm": 1.2057308113799874, "learning_rate": 8.532818492937014e-06, "loss": 0.9033, "step": 576 }, { "epoch": 0.546789860222696, "grad_norm": 1.1709128709827088, "learning_rate": 8.52738142155383e-06, "loss": 0.9136, "step": 577 }, { "epoch": 0.5477375029613836, "grad_norm": 1.1465991381882117, "learning_rate": 8.521936033666187e-06, "loss": 0.9102, "step": 578 }, { "epoch": 0.5486851457000711, "grad_norm": 1.4618014976340794, "learning_rate": 8.51648234211268e-06, "loss": 0.8733, "step": 579 }, { "epoch": 0.5496327884387586, "grad_norm": 1.449685521781311, "learning_rate": 8.511020359751467e-06, "loss": 0.9106, "step": 580 }, { "epoch": 0.5505804311774462, "grad_norm": 1.0171758381766154, "learning_rate": 8.505550099460264e-06, "loss": 0.9353, "step": 581 }, { "epoch": 0.5515280739161336, "grad_norm": 1.290290565129861, "learning_rate": 8.500071574136297e-06, "loss": 0.837, "step": 582 }, { "epoch": 0.5524757166548211, "grad_norm": 1.1275094814541378, "learning_rate": 8.49458479669628e-06, "loss": 0.9316, "step": 583 }, { "epoch": 0.5534233593935086, "grad_norm": 1.762720464593278, "learning_rate": 8.489089780076387e-06, "loss": 0.9394, "step": 584 }, { "epoch": 0.5543710021321961, "grad_norm": 1.227259697952017, "learning_rate": 8.483586537232212e-06, "loss": 0.8798, "step": 585 }, { "epoch": 0.5553186448708837, "grad_norm": 1.1785938474090234, "learning_rate": 8.478075081138746e-06, "loss": 0.9288, "step": 586 }, { "epoch": 0.5562662876095712, "grad_norm": 1.1067839714490098, "learning_rate": 8.472555424790348e-06, "loss": 0.833, "step": 587 }, { "epoch": 0.5572139303482587, "grad_norm": 1.1232716949263366, "learning_rate": 8.467027581200702e-06, "loss": 0.9166, "step": 588 }, { "epoch": 0.5581615730869462, "grad_norm": 1.261715047880492, "learning_rate": 8.461491563402807e-06, "loss": 0.9618, "step": 589 }, { "epoch": 0.5591092158256338, "grad_norm": 1.1832942718518242, "learning_rate": 8.455947384448926e-06, "loss": 0.8843, "step": 590 }, { "epoch": 0.5600568585643213, "grad_norm": 1.1707357848301445, "learning_rate": 8.450395057410561e-06, "loss": 0.8667, "step": 591 }, { "epoch": 0.5610045013030087, "grad_norm": 1.051280206948217, "learning_rate": 8.444834595378434e-06, "loss": 0.9182, "step": 592 }, { "epoch": 0.5619521440416962, "grad_norm": 1.5197971665007415, "learning_rate": 8.43926601146244e-06, "loss": 0.9023, "step": 593 }, { "epoch": 0.5628997867803838, "grad_norm": 1.2707540574566858, "learning_rate": 8.433689318791628e-06, "loss": 0.936, "step": 594 }, { "epoch": 0.5628997867803838, "eval_loss": 0.9368069767951965, "eval_runtime": 59.0081, "eval_samples_per_second": 46.231, "eval_steps_per_second": 0.729, "step": 594 }, { "epoch": 0.5638474295190713, "grad_norm": 1.1108224917689546, "learning_rate": 8.428104530514156e-06, "loss": 0.853, "step": 595 }, { "epoch": 0.5647950722577588, "grad_norm": 1.039118804478871, "learning_rate": 8.42251165979728e-06, "loss": 0.9154, "step": 596 }, { "epoch": 0.5657427149964463, "grad_norm": 1.0970139269789967, "learning_rate": 8.416910719827304e-06, "loss": 0.9166, "step": 597 }, { "epoch": 0.5666903577351339, "grad_norm": 1.0306693005295113, "learning_rate": 8.411301723809563e-06, "loss": 0.9061, "step": 598 }, { "epoch": 0.5676380004738214, "grad_norm": 2.3153529152284746, "learning_rate": 8.405684684968383e-06, "loss": 0.9242, "step": 599 }, { "epoch": 0.5685856432125089, "grad_norm": 1.1053912735757487, "learning_rate": 8.400059616547046e-06, "loss": 0.8394, "step": 600 }, { "epoch": 0.5695332859511963, "grad_norm": 1.2091214685314218, "learning_rate": 8.394426531807777e-06, "loss": 0.9289, "step": 601 }, { "epoch": 0.5704809286898839, "grad_norm": 1.1879706774303542, "learning_rate": 8.388785444031695e-06, "loss": 0.9362, "step": 602 }, { "epoch": 0.5714285714285714, "grad_norm": 1.2386582317865258, "learning_rate": 8.383136366518788e-06, "loss": 0.9061, "step": 603 }, { "epoch": 0.5723762141672589, "grad_norm": 1.027254780148272, "learning_rate": 8.37747931258788e-06, "loss": 0.9291, "step": 604 }, { "epoch": 0.5733238569059464, "grad_norm": 1.2061736249322361, "learning_rate": 8.371814295576604e-06, "loss": 0.9435, "step": 605 }, { "epoch": 0.574271499644634, "grad_norm": 1.1051297934960431, "learning_rate": 8.366141328841367e-06, "loss": 0.9444, "step": 606 }, { "epoch": 0.5752191423833215, "grad_norm": 1.0492890936420853, "learning_rate": 8.360460425757316e-06, "loss": 0.8896, "step": 607 }, { "epoch": 0.576166785122009, "grad_norm": 1.1855288112590538, "learning_rate": 8.354771599718313e-06, "loss": 0.9024, "step": 608 }, { "epoch": 0.5771144278606966, "grad_norm": 1.0894896483096521, "learning_rate": 8.349074864136897e-06, "loss": 0.8718, "step": 609 }, { "epoch": 0.5780620705993841, "grad_norm": 1.1673204787473177, "learning_rate": 8.34337023244426e-06, "loss": 0.9477, "step": 610 }, { "epoch": 0.5790097133380715, "grad_norm": 1.1746428108459406, "learning_rate": 8.33765771809021e-06, "loss": 0.9633, "step": 611 }, { "epoch": 0.579957356076759, "grad_norm": 1.6815219702121096, "learning_rate": 8.331937334543132e-06, "loss": 0.9357, "step": 612 }, { "epoch": 0.5809049988154465, "grad_norm": 1.284563514540576, "learning_rate": 8.326209095289973e-06, "loss": 0.9576, "step": 613 }, { "epoch": 0.5818526415541341, "grad_norm": 1.1141153791855245, "learning_rate": 8.320473013836197e-06, "loss": 0.9207, "step": 614 }, { "epoch": 0.5828002842928216, "grad_norm": 1.0820567139576633, "learning_rate": 8.314729103705758e-06, "loss": 0.8984, "step": 615 }, { "epoch": 0.5837479270315091, "grad_norm": 1.0636345740480533, "learning_rate": 8.308977378441072e-06, "loss": 0.9086, "step": 616 }, { "epoch": 0.5837479270315091, "eval_loss": 0.9341678619384766, "eval_runtime": 65.7189, "eval_samples_per_second": 41.51, "eval_steps_per_second": 0.654, "step": 616 }, { "epoch": 0.5846955697701967, "grad_norm": 1.3632356445316784, "learning_rate": 8.303217851602973e-06, "loss": 0.8918, "step": 617 }, { "epoch": 0.5856432125088842, "grad_norm": 1.1417039643528692, "learning_rate": 8.297450536770697e-06, "loss": 0.8531, "step": 618 }, { "epoch": 0.5865908552475717, "grad_norm": 1.03859128947666, "learning_rate": 8.291675447541834e-06, "loss": 0.8609, "step": 619 }, { "epoch": 0.5875384979862592, "grad_norm": 1.2256793137128281, "learning_rate": 8.285892597532311e-06, "loss": 0.9384, "step": 620 }, { "epoch": 0.5884861407249466, "grad_norm": 1.1848786072557997, "learning_rate": 8.280102000376346e-06, "loss": 0.8621, "step": 621 }, { "epoch": 0.5894337834636342, "grad_norm": 1.0897670274263946, "learning_rate": 8.274303669726427e-06, "loss": 0.8895, "step": 622 }, { "epoch": 0.5903814262023217, "grad_norm": 1.2338961521515757, "learning_rate": 8.268497619253273e-06, "loss": 0.9397, "step": 623 }, { "epoch": 0.5913290689410092, "grad_norm": 1.1260558549955006, "learning_rate": 8.262683862645804e-06, "loss": 0.8779, "step": 624 }, { "epoch": 0.5922767116796968, "grad_norm": 1.0331575412446614, "learning_rate": 8.256862413611113e-06, "loss": 0.912, "step": 625 }, { "epoch": 0.5932243544183843, "grad_norm": 1.192716956765876, "learning_rate": 8.25103328587442e-06, "loss": 0.8503, "step": 626 }, { "epoch": 0.5941719971570718, "grad_norm": 1.1264420514270421, "learning_rate": 8.245196493179061e-06, "loss": 0.968, "step": 627 }, { "epoch": 0.5951196398957593, "grad_norm": 1.1664248935291284, "learning_rate": 8.239352049286435e-06, "loss": 0.9293, "step": 628 }, { "epoch": 0.5960672826344469, "grad_norm": 1.165344238639824, "learning_rate": 8.233499967975981e-06, "loss": 0.9285, "step": 629 }, { "epoch": 0.5970149253731343, "grad_norm": 1.1132735526032906, "learning_rate": 8.22764026304515e-06, "loss": 0.8583, "step": 630 }, { "epoch": 0.5979625681118218, "grad_norm": 1.2263330088822129, "learning_rate": 8.221772948309363e-06, "loss": 0.8848, "step": 631 }, { "epoch": 0.5989102108505093, "grad_norm": 1.242835973780116, "learning_rate": 8.215898037601981e-06, "loss": 0.9078, "step": 632 }, { "epoch": 0.5998578535891969, "grad_norm": 1.0611322995754056, "learning_rate": 8.210015544774279e-06, "loss": 0.9158, "step": 633 }, { "epoch": 0.6008054963278844, "grad_norm": 1.0776982828638144, "learning_rate": 8.204125483695403e-06, "loss": 0.8951, "step": 634 }, { "epoch": 0.6017531390665719, "grad_norm": 1.1010692683885481, "learning_rate": 8.198227868252348e-06, "loss": 0.8796, "step": 635 }, { "epoch": 0.6027007818052594, "grad_norm": 1.1791589543105867, "learning_rate": 8.192322712349917e-06, "loss": 0.8649, "step": 636 }, { "epoch": 0.603648424543947, "grad_norm": 1.0601001331133804, "learning_rate": 8.186410029910694e-06, "loss": 0.9523, "step": 637 }, { "epoch": 0.6045960672826345, "grad_norm": 1.1485122140349338, "learning_rate": 8.180489834875e-06, "loss": 0.9796, "step": 638 }, { "epoch": 0.6045960672826345, "eval_loss": 0.9337397813796997, "eval_runtime": 64.0008, "eval_samples_per_second": 42.624, "eval_steps_per_second": 0.672, "step": 638 }, { "epoch": 0.605543710021322, "grad_norm": 1.164013089234412, "learning_rate": 8.174562141200878e-06, "loss": 0.8544, "step": 639 }, { "epoch": 0.6064913527600094, "grad_norm": 0.9760399939930886, "learning_rate": 8.168626962864045e-06, "loss": 0.9098, "step": 640 }, { "epoch": 0.607438995498697, "grad_norm": 1.2834957539805654, "learning_rate": 8.162684313857869e-06, "loss": 0.9297, "step": 641 }, { "epoch": 0.6083866382373845, "grad_norm": 1.1309875805256084, "learning_rate": 8.156734208193327e-06, "loss": 0.8415, "step": 642 }, { "epoch": 0.609334280976072, "grad_norm": 1.1022824206738733, "learning_rate": 8.15077665989898e-06, "loss": 0.89, "step": 643 }, { "epoch": 0.6102819237147595, "grad_norm": 1.024845261979357, "learning_rate": 8.144811683020932e-06, "loss": 0.9135, "step": 644 }, { "epoch": 0.6112295664534471, "grad_norm": 1.1297286487540084, "learning_rate": 8.138839291622807e-06, "loss": 0.9178, "step": 645 }, { "epoch": 0.6121772091921346, "grad_norm": 1.0969400851261308, "learning_rate": 8.132859499785708e-06, "loss": 0.8944, "step": 646 }, { "epoch": 0.6131248519308221, "grad_norm": 1.1361902149740226, "learning_rate": 8.126872321608185e-06, "loss": 0.8428, "step": 647 }, { "epoch": 0.6140724946695096, "grad_norm": 4.987461142356876, "learning_rate": 8.120877771206201e-06, "loss": 0.9267, "step": 648 }, { "epoch": 0.6150201374081972, "grad_norm": 1.1003270497805429, "learning_rate": 8.114875862713107e-06, "loss": 0.9126, "step": 649 }, { "epoch": 0.6159677801468846, "grad_norm": 1.1145863010291335, "learning_rate": 8.108866610279595e-06, "loss": 0.9069, "step": 650 }, { "epoch": 0.6169154228855721, "grad_norm": 1.070440031809025, "learning_rate": 8.102850028073674e-06, "loss": 0.9805, "step": 651 }, { "epoch": 0.6178630656242596, "grad_norm": 1.1878084074243875, "learning_rate": 8.09682613028064e-06, "loss": 0.8608, "step": 652 }, { "epoch": 0.6188107083629472, "grad_norm": 1.1666517787266597, "learning_rate": 8.090794931103026e-06, "loss": 0.8649, "step": 653 }, { "epoch": 0.6197583511016347, "grad_norm": 1.1109949887709072, "learning_rate": 8.08475644476059e-06, "loss": 0.8555, "step": 654 }, { "epoch": 0.6207059938403222, "grad_norm": 1.1293611851504917, "learning_rate": 8.078710685490266e-06, "loss": 0.9048, "step": 655 }, { "epoch": 0.6216536365790097, "grad_norm": 1.0517383761314782, "learning_rate": 8.072657667546136e-06, "loss": 0.8665, "step": 656 }, { "epoch": 0.6226012793176973, "grad_norm": 1.143650330668765, "learning_rate": 8.066597405199393e-06, "loss": 0.8833, "step": 657 }, { "epoch": 0.6235489220563848, "grad_norm": 1.2088279253982825, "learning_rate": 8.060529912738316e-06, "loss": 0.9369, "step": 658 }, { "epoch": 0.6244965647950722, "grad_norm": 1.1771192147073226, "learning_rate": 8.054455204468225e-06, "loss": 0.8912, "step": 659 }, { "epoch": 0.6254442075337597, "grad_norm": 0.9872215985965054, "learning_rate": 8.048373294711455e-06, "loss": 0.8272, "step": 660 }, { "epoch": 0.6254442075337597, "eval_loss": 0.9312112927436829, "eval_runtime": 61.3917, "eval_samples_per_second": 44.436, "eval_steps_per_second": 0.7, "step": 660 }, { "epoch": 0.6263918502724473, "grad_norm": 1.112849369485224, "learning_rate": 8.042284197807323e-06, "loss": 0.8914, "step": 661 }, { "epoch": 0.6273394930111348, "grad_norm": 1.1777170728187258, "learning_rate": 8.036187928112087e-06, "loss": 0.8983, "step": 662 }, { "epoch": 0.6282871357498223, "grad_norm": 1.1537880977099835, "learning_rate": 8.030084499998916e-06, "loss": 0.8823, "step": 663 }, { "epoch": 0.6292347784885098, "grad_norm": 1.1620930961053082, "learning_rate": 8.023973927857857e-06, "loss": 0.9361, "step": 664 }, { "epoch": 0.6301824212271974, "grad_norm": 1.3868544628160782, "learning_rate": 8.017856226095804e-06, "loss": 0.9183, "step": 665 }, { "epoch": 0.6311300639658849, "grad_norm": 1.1115391562362278, "learning_rate": 8.011731409136454e-06, "loss": 0.8678, "step": 666 }, { "epoch": 0.6320777067045724, "grad_norm": 1.1189548409555135, "learning_rate": 8.005599491420288e-06, "loss": 0.9562, "step": 667 }, { "epoch": 0.6330253494432599, "grad_norm": 1.1551587897622888, "learning_rate": 7.99946048740452e-06, "loss": 0.9742, "step": 668 }, { "epoch": 0.6339729921819474, "grad_norm": 0.9985847295223576, "learning_rate": 7.993314411563075e-06, "loss": 0.8763, "step": 669 }, { "epoch": 0.6349206349206349, "grad_norm": 0.9938954837958673, "learning_rate": 7.987161278386555e-06, "loss": 0.8941, "step": 670 }, { "epoch": 0.6358682776593224, "grad_norm": 1.2517564440987765, "learning_rate": 7.981001102382192e-06, "loss": 0.8922, "step": 671 }, { "epoch": 0.6368159203980099, "grad_norm": 1.669042851630183, "learning_rate": 7.974833898073832e-06, "loss": 0.8734, "step": 672 }, { "epoch": 0.6377635631366975, "grad_norm": 1.733742728719525, "learning_rate": 7.968659680001887e-06, "loss": 0.9224, "step": 673 }, { "epoch": 0.638711205875385, "grad_norm": 1.4086875008087318, "learning_rate": 7.962478462723306e-06, "loss": 0.8862, "step": 674 }, { "epoch": 0.6396588486140725, "grad_norm": 1.118275938120274, "learning_rate": 7.95629026081154e-06, "loss": 0.9075, "step": 675 }, { "epoch": 0.64060649135276, "grad_norm": 1.2853033943409442, "learning_rate": 7.950095088856509e-06, "loss": 0.857, "step": 676 }, { "epoch": 0.6415541340914476, "grad_norm": 1.0400548366155555, "learning_rate": 7.943892961464562e-06, "loss": 0.9434, "step": 677 }, { "epoch": 0.642501776830135, "grad_norm": 1.3041391262819717, "learning_rate": 7.937683893258454e-06, "loss": 0.9685, "step": 678 }, { "epoch": 0.6434494195688225, "grad_norm": 1.1598673981069736, "learning_rate": 7.931467898877298e-06, "loss": 0.8632, "step": 679 }, { "epoch": 0.64439706230751, "grad_norm": 1.0009937654408843, "learning_rate": 7.925244992976538e-06, "loss": 0.8824, "step": 680 }, { "epoch": 0.6453447050461976, "grad_norm": 1.017837058069719, "learning_rate": 7.919015190227919e-06, "loss": 0.8505, "step": 681 }, { "epoch": 0.6462923477848851, "grad_norm": 1.1641241757138032, "learning_rate": 7.912778505319436e-06, "loss": 0.8432, "step": 682 }, { "epoch": 0.6462923477848851, "eval_loss": 0.9309917688369751, "eval_runtime": 60.3302, "eval_samples_per_second": 45.218, "eval_steps_per_second": 0.713, "step": 682 }, { "epoch": 0.6472399905235726, "grad_norm": 1.1169534397607939, "learning_rate": 7.906534952955321e-06, "loss": 0.9085, "step": 683 }, { "epoch": 0.6481876332622601, "grad_norm": 1.0900655365751404, "learning_rate": 7.900284547855992e-06, "loss": 0.9411, "step": 684 }, { "epoch": 0.6491352760009477, "grad_norm": 1.2161831558733007, "learning_rate": 7.894027304758023e-06, "loss": 0.8769, "step": 685 }, { "epoch": 0.6500829187396352, "grad_norm": 1.044138736537594, "learning_rate": 7.88776323841411e-06, "loss": 0.9436, "step": 686 }, { "epoch": 0.6510305614783227, "grad_norm": 1.0705430979469939, "learning_rate": 7.88149236359304e-06, "loss": 0.8941, "step": 687 }, { "epoch": 0.6519782042170101, "grad_norm": 1.3845056323680385, "learning_rate": 7.875214695079647e-06, "loss": 0.9501, "step": 688 }, { "epoch": 0.6529258469556977, "grad_norm": 1.0170616350914143, "learning_rate": 7.868930247674787e-06, "loss": 0.9, "step": 689 }, { "epoch": 0.6538734896943852, "grad_norm": 1.0921009934181993, "learning_rate": 7.862639036195298e-06, "loss": 0.9174, "step": 690 }, { "epoch": 0.6548211324330727, "grad_norm": 1.218634642701156, "learning_rate": 7.856341075473963e-06, "loss": 0.9376, "step": 691 }, { "epoch": 0.6557687751717602, "grad_norm": 0.9907566710047155, "learning_rate": 7.850036380359479e-06, "loss": 0.8849, "step": 692 }, { "epoch": 0.6567164179104478, "grad_norm": 1.0543716934739653, "learning_rate": 7.843724965716419e-06, "loss": 0.9345, "step": 693 }, { "epoch": 0.6576640606491353, "grad_norm": 0.9814925522801817, "learning_rate": 7.837406846425205e-06, "loss": 0.8675, "step": 694 }, { "epoch": 0.6586117033878228, "grad_norm": 1.192089061098573, "learning_rate": 7.831082037382057e-06, "loss": 0.9501, "step": 695 }, { "epoch": 0.6595593461265103, "grad_norm": 1.1020254975949058, "learning_rate": 7.824750553498977e-06, "loss": 0.9811, "step": 696 }, { "epoch": 0.6605069888651979, "grad_norm": 1.1111842691891292, "learning_rate": 7.818412409703695e-06, "loss": 0.9328, "step": 697 }, { "epoch": 0.6614546316038853, "grad_norm": 1.1347234782175453, "learning_rate": 7.812067620939653e-06, "loss": 0.9614, "step": 698 }, { "epoch": 0.6624022743425728, "grad_norm": 1.065352923892995, "learning_rate": 7.805716202165949e-06, "loss": 0.8818, "step": 699 }, { "epoch": 0.6633499170812603, "grad_norm": 1.1030004540487208, "learning_rate": 7.799358168357323e-06, "loss": 0.8465, "step": 700 }, { "epoch": 0.6642975598199479, "grad_norm": 0.9089874996068653, "learning_rate": 7.792993534504103e-06, "loss": 0.9243, "step": 701 }, { "epoch": 0.6652452025586354, "grad_norm": 1.2054269196194143, "learning_rate": 7.786622315612182e-06, "loss": 0.8688, "step": 702 }, { "epoch": 0.6661928452973229, "grad_norm": 1.2746014427627002, "learning_rate": 7.78024452670298e-06, "loss": 0.9181, "step": 703 }, { "epoch": 0.6671404880360104, "grad_norm": 1.077600022154621, "learning_rate": 7.773860182813404e-06, "loss": 0.8492, "step": 704 }, { "epoch": 0.6671404880360104, "eval_loss": 0.928534984588623, "eval_runtime": 64.6468, "eval_samples_per_second": 42.199, "eval_steps_per_second": 0.665, "step": 704 }, { "epoch": 0.668088130774698, "grad_norm": 1.033501298641634, "learning_rate": 7.767469298995813e-06, "loss": 0.8854, "step": 705 }, { "epoch": 0.6690357735133855, "grad_norm": 0.9867061772593498, "learning_rate": 7.761071890317994e-06, "loss": 0.8431, "step": 706 }, { "epoch": 0.6699834162520729, "grad_norm": 1.1600752814036814, "learning_rate": 7.754667971863112e-06, "loss": 0.9133, "step": 707 }, { "epoch": 0.6709310589907604, "grad_norm": 1.1443042381662363, "learning_rate": 7.748257558729677e-06, "loss": 0.9184, "step": 708 }, { "epoch": 0.671878701729448, "grad_norm": 1.1577240117048646, "learning_rate": 7.741840666031517e-06, "loss": 0.8738, "step": 709 }, { "epoch": 0.6728263444681355, "grad_norm": 1.308361165359508, "learning_rate": 7.735417308897737e-06, "loss": 0.8414, "step": 710 }, { "epoch": 0.673773987206823, "grad_norm": 1.1739658962601087, "learning_rate": 7.728987502472678e-06, "loss": 0.8551, "step": 711 }, { "epoch": 0.6747216299455105, "grad_norm": 1.0699527544498066, "learning_rate": 7.72255126191589e-06, "loss": 0.8514, "step": 712 }, { "epoch": 0.6756692726841981, "grad_norm": 1.481959370807152, "learning_rate": 7.716108602402094e-06, "loss": 0.8944, "step": 713 }, { "epoch": 0.6766169154228856, "grad_norm": 1.1240913687798249, "learning_rate": 7.709659539121144e-06, "loss": 0.8578, "step": 714 }, { "epoch": 0.6775645581615731, "grad_norm": 1.1176326008647557, "learning_rate": 7.703204087277989e-06, "loss": 0.9374, "step": 715 }, { "epoch": 0.6785122009002607, "grad_norm": 1.2863389842208885, "learning_rate": 7.696742262092643e-06, "loss": 0.8846, "step": 716 }, { "epoch": 0.6794598436389481, "grad_norm": 1.3631041557393186, "learning_rate": 7.690274078800148e-06, "loss": 0.8766, "step": 717 }, { "epoch": 0.6804074863776356, "grad_norm": 1.4470714060969805, "learning_rate": 7.683799552650534e-06, "loss": 0.9231, "step": 718 }, { "epoch": 0.6813551291163231, "grad_norm": 1.1150660700565789, "learning_rate": 7.677318698908788e-06, "loss": 0.8391, "step": 719 }, { "epoch": 0.6823027718550106, "grad_norm": 1.1713456925403845, "learning_rate": 7.670831532854811e-06, "loss": 0.9214, "step": 720 }, { "epoch": 0.6832504145936982, "grad_norm": 1.219437200101105, "learning_rate": 7.66433806978339e-06, "loss": 0.8944, "step": 721 }, { "epoch": 0.6841980573323857, "grad_norm": 1.2287162936444869, "learning_rate": 7.65783832500416e-06, "loss": 0.8854, "step": 722 }, { "epoch": 0.6851457000710732, "grad_norm": 1.344476187588291, "learning_rate": 7.651332313841562e-06, "loss": 0.8488, "step": 723 }, { "epoch": 0.6860933428097608, "grad_norm": 1.0371186151252827, "learning_rate": 7.644820051634813e-06, "loss": 0.8473, "step": 724 }, { "epoch": 0.6870409855484483, "grad_norm": 1.1863536224030726, "learning_rate": 7.638301553737871e-06, "loss": 0.9155, "step": 725 }, { "epoch": 0.6879886282871357, "grad_norm": 1.200025733783506, "learning_rate": 7.63177683551939e-06, "loss": 0.8828, "step": 726 }, { "epoch": 0.6879886282871357, "eval_loss": 0.9281173944473267, "eval_runtime": 65.2556, "eval_samples_per_second": 41.805, "eval_steps_per_second": 0.659, "step": 726 }, { "epoch": 0.6889362710258232, "grad_norm": 1.1992637929718537, "learning_rate": 7.625245912362699e-06, "loss": 0.87, "step": 727 }, { "epoch": 0.6898839137645107, "grad_norm": 1.0955940037433252, "learning_rate": 7.618708799665745e-06, "loss": 0.8636, "step": 728 }, { "epoch": 0.6908315565031983, "grad_norm": 1.0564308744295217, "learning_rate": 7.612165512841076e-06, "loss": 0.9153, "step": 729 }, { "epoch": 0.6917791992418858, "grad_norm": 1.2470763038912207, "learning_rate": 7.605616067315793e-06, "loss": 0.9199, "step": 730 }, { "epoch": 0.6927268419805733, "grad_norm": 1.2122791323172828, "learning_rate": 7.599060478531519e-06, "loss": 0.9248, "step": 731 }, { "epoch": 0.6936744847192609, "grad_norm": 1.097709504783866, "learning_rate": 7.592498761944363e-06, "loss": 0.8689, "step": 732 }, { "epoch": 0.6946221274579484, "grad_norm": 1.230662404513117, "learning_rate": 7.585930933024874e-06, "loss": 0.9021, "step": 733 }, { "epoch": 0.6955697701966359, "grad_norm": 1.0146506247183977, "learning_rate": 7.579357007258022e-06, "loss": 0.9065, "step": 734 }, { "epoch": 0.6965174129353234, "grad_norm": 1.1522681830781554, "learning_rate": 7.572777000143145e-06, "loss": 0.8689, "step": 735 }, { "epoch": 0.6974650556740108, "grad_norm": 1.1876718763825516, "learning_rate": 7.56619092719392e-06, "loss": 0.8553, "step": 736 }, { "epoch": 0.6984126984126984, "grad_norm": 1.254710090679274, "learning_rate": 7.559598803938328e-06, "loss": 0.8994, "step": 737 }, { "epoch": 0.6993603411513859, "grad_norm": 1.3772630607131078, "learning_rate": 7.5530006459186115e-06, "loss": 0.9072, "step": 738 }, { "epoch": 0.7003079838900734, "grad_norm": 1.2658717437982785, "learning_rate": 7.546396468691241e-06, "loss": 0.8704, "step": 739 }, { "epoch": 0.701255626628761, "grad_norm": 1.051563030074167, "learning_rate": 7.539786287826885e-06, "loss": 0.9211, "step": 740 }, { "epoch": 0.7022032693674485, "grad_norm": 1.0078569587527633, "learning_rate": 7.533170118910356e-06, "loss": 0.8865, "step": 741 }, { "epoch": 0.703150912106136, "grad_norm": 1.2344336456961045, "learning_rate": 7.526547977540592e-06, "loss": 0.9072, "step": 742 }, { "epoch": 0.7040985548448235, "grad_norm": 1.1416986384728778, "learning_rate": 7.5199198793306135e-06, "loss": 0.873, "step": 743 }, { "epoch": 0.7050461975835111, "grad_norm": 1.0678982977792042, "learning_rate": 7.51328583990748e-06, "loss": 0.913, "step": 744 }, { "epoch": 0.7059938403221986, "grad_norm": 1.1437093215592007, "learning_rate": 7.506645874912264e-06, "loss": 0.9799, "step": 745 }, { "epoch": 0.706941483060886, "grad_norm": 1.0498302149495389, "learning_rate": 7.500000000000001e-06, "loss": 0.9027, "step": 746 }, { "epoch": 0.7078891257995735, "grad_norm": 2.056830676476333, "learning_rate": 7.4933482308396686e-06, "loss": 0.8287, "step": 747 }, { "epoch": 0.708836768538261, "grad_norm": 1.053677680621179, "learning_rate": 7.486690583114137e-06, "loss": 0.9333, "step": 748 }, { "epoch": 0.708836768538261, "eval_loss": 0.92720627784729, "eval_runtime": 65.9587, "eval_samples_per_second": 41.359, "eval_steps_per_second": 0.652, "step": 748 }, { "epoch": 0.7097844112769486, "grad_norm": 1.5701143667704405, "learning_rate": 7.480027072520137e-06, "loss": 0.8974, "step": 749 }, { "epoch": 0.7107320540156361, "grad_norm": 1.235022340393013, "learning_rate": 7.473357714768222e-06, "loss": 0.9207, "step": 750 }, { "epoch": 0.7116796967543236, "grad_norm": 1.1141250146076045, "learning_rate": 7.466682525582732e-06, "loss": 0.8674, "step": 751 }, { "epoch": 0.7126273394930112, "grad_norm": 1.1886209864097594, "learning_rate": 7.460001520701756e-06, "loss": 0.8858, "step": 752 }, { "epoch": 0.7135749822316987, "grad_norm": 1.1322038451733678, "learning_rate": 7.453314715877094e-06, "loss": 0.843, "step": 753 }, { "epoch": 0.7145226249703862, "grad_norm": 1.0033173758679002, "learning_rate": 7.446622126874219e-06, "loss": 0.8674, "step": 754 }, { "epoch": 0.7154702677090736, "grad_norm": 1.1688006539249634, "learning_rate": 7.439923769472244e-06, "loss": 0.8825, "step": 755 }, { "epoch": 0.7164179104477612, "grad_norm": 1.306296172244326, "learning_rate": 7.4332196594638815e-06, "loss": 0.9753, "step": 756 }, { "epoch": 0.7173655531864487, "grad_norm": 0.9730531759417155, "learning_rate": 7.4265098126554065e-06, "loss": 0.8617, "step": 757 }, { "epoch": 0.7183131959251362, "grad_norm": 1.385852128592582, "learning_rate": 7.419794244866619e-06, "loss": 0.8978, "step": 758 }, { "epoch": 0.7192608386638237, "grad_norm": 1.0569360851945016, "learning_rate": 7.413072971930807e-06, "loss": 0.9421, "step": 759 }, { "epoch": 0.7202084814025113, "grad_norm": 0.9964478050464701, "learning_rate": 7.406346009694713e-06, "loss": 0.85, "step": 760 }, { "epoch": 0.7211561241411988, "grad_norm": 1.193990300075423, "learning_rate": 7.39961337401849e-06, "loss": 0.8751, "step": 761 }, { "epoch": 0.7221037668798863, "grad_norm": 2.0570268630743156, "learning_rate": 7.3928750807756656e-06, "loss": 0.9026, "step": 762 }, { "epoch": 0.7230514096185738, "grad_norm": 40.56532136308673, "learning_rate": 7.386131145853111e-06, "loss": 0.858, "step": 763 }, { "epoch": 0.7239990523572614, "grad_norm": 1.7276587552930305, "learning_rate": 7.379381585150997e-06, "loss": 0.8743, "step": 764 }, { "epoch": 0.7249466950959488, "grad_norm": 1.1917953134999557, "learning_rate": 7.372626414582754e-06, "loss": 0.9486, "step": 765 }, { "epoch": 0.7258943378346363, "grad_norm": 1.1432814350506717, "learning_rate": 7.365865650075046e-06, "loss": 0.9477, "step": 766 }, { "epoch": 0.7268419805733238, "grad_norm": 1.09550223259366, "learning_rate": 7.359099307567721e-06, "loss": 0.9092, "step": 767 }, { "epoch": 0.7277896233120114, "grad_norm": 1.150998457087419, "learning_rate": 7.352327403013779e-06, "loss": 0.8982, "step": 768 }, { "epoch": 0.7287372660506989, "grad_norm": 1.108499327493803, "learning_rate": 7.345549952379334e-06, "loss": 0.9682, "step": 769 }, { "epoch": 0.7296849087893864, "grad_norm": 1.153466896232557, "learning_rate": 7.338766971643579e-06, "loss": 0.8988, "step": 770 }, { "epoch": 0.7296849087893864, "eval_loss": 0.926575779914856, "eval_runtime": 68.8528, "eval_samples_per_second": 39.621, "eval_steps_per_second": 0.625, "step": 770 }, { "epoch": 0.7306325515280739, "grad_norm": 1.0796573599853159, "learning_rate": 7.331978476798738e-06, "loss": 0.8149, "step": 771 }, { "epoch": 0.7315801942667615, "grad_norm": 1.0931539077757213, "learning_rate": 7.325184483850043e-06, "loss": 0.9123, "step": 772 }, { "epoch": 0.732527837005449, "grad_norm": 1.4182919844521371, "learning_rate": 7.318385008815686e-06, "loss": 0.95, "step": 773 }, { "epoch": 0.7334754797441365, "grad_norm": 1.2970117028124304, "learning_rate": 7.311580067726783e-06, "loss": 0.8689, "step": 774 }, { "epoch": 0.7344231224828239, "grad_norm": 1.0927364681801075, "learning_rate": 7.304769676627339e-06, "loss": 0.8769, "step": 775 }, { "epoch": 0.7353707652215115, "grad_norm": 1.0964389214826673, "learning_rate": 7.297953851574207e-06, "loss": 0.9555, "step": 776 }, { "epoch": 0.736318407960199, "grad_norm": 1.0832681682992362, "learning_rate": 7.291132608637053e-06, "loss": 0.9345, "step": 777 }, { "epoch": 0.7372660506988865, "grad_norm": 1.227070674353931, "learning_rate": 7.284305963898315e-06, "loss": 0.8685, "step": 778 }, { "epoch": 0.738213693437574, "grad_norm": 3.9684909478672976, "learning_rate": 7.27747393345317e-06, "loss": 0.8362, "step": 779 }, { "epoch": 0.7391613361762616, "grad_norm": 1.4889622029850116, "learning_rate": 7.270636533409491e-06, "loss": 0.9391, "step": 780 }, { "epoch": 0.7401089789149491, "grad_norm": 1.1829326412392815, "learning_rate": 7.2637937798878085e-06, "loss": 0.9182, "step": 781 }, { "epoch": 0.7410566216536366, "grad_norm": 1.190511352024864, "learning_rate": 7.25694568902128e-06, "loss": 0.8477, "step": 782 }, { "epoch": 0.7420042643923241, "grad_norm": 1.3407968776745007, "learning_rate": 7.250092276955642e-06, "loss": 0.8861, "step": 783 }, { "epoch": 0.7429519071310116, "grad_norm": 1.0306033495447815, "learning_rate": 7.243233559849179e-06, "loss": 0.8723, "step": 784 }, { "epoch": 0.7438995498696991, "grad_norm": 1.032943199516836, "learning_rate": 7.236369553872684e-06, "loss": 0.8848, "step": 785 }, { "epoch": 0.7448471926083866, "grad_norm": 1.1806035892508238, "learning_rate": 7.229500275209418e-06, "loss": 0.9254, "step": 786 }, { "epoch": 0.7457948353470741, "grad_norm": 1.1412881455081196, "learning_rate": 7.222625740055072e-06, "loss": 0.8766, "step": 787 }, { "epoch": 0.7467424780857617, "grad_norm": 1.1293924713464558, "learning_rate": 7.215745964617737e-06, "loss": 0.8932, "step": 788 }, { "epoch": 0.7476901208244492, "grad_norm": 1.0502148008128274, "learning_rate": 7.2088609651178505e-06, "loss": 0.8693, "step": 789 }, { "epoch": 0.7486377635631367, "grad_norm": 1.1691866376909303, "learning_rate": 7.201970757788172e-06, "loss": 0.9133, "step": 790 }, { "epoch": 0.7495854063018242, "grad_norm": 1.673809188183371, "learning_rate": 7.195075358873738e-06, "loss": 0.8997, "step": 791 }, { "epoch": 0.7505330490405118, "grad_norm": 1.136212481513878, "learning_rate": 7.188174784631824e-06, "loss": 0.8343, "step": 792 }, { "epoch": 0.7505330490405118, "eval_loss": 0.925286591053009, "eval_runtime": 67.6399, "eval_samples_per_second": 40.331, "eval_steps_per_second": 0.636, "step": 792 }, { "epoch": 0.7514806917791993, "grad_norm": 1.17472555336564, "learning_rate": 7.18126905133191e-06, "loss": 0.8889, "step": 793 }, { "epoch": 0.7524283345178867, "grad_norm": 1.1001788390092748, "learning_rate": 7.174358175255636e-06, "loss": 0.8534, "step": 794 }, { "epoch": 0.7533759772565742, "grad_norm": 1.465010157671844, "learning_rate": 7.1674421726967704e-06, "loss": 0.8603, "step": 795 }, { "epoch": 0.7543236199952618, "grad_norm": 1.1953256640932477, "learning_rate": 7.160521059961169e-06, "loss": 0.8345, "step": 796 }, { "epoch": 0.7552712627339493, "grad_norm": 3.8996540248644043, "learning_rate": 7.153594853366731e-06, "loss": 0.8398, "step": 797 }, { "epoch": 0.7562189054726368, "grad_norm": 1.245025773648245, "learning_rate": 7.14666356924337e-06, "loss": 0.9068, "step": 798 }, { "epoch": 0.7571665482113243, "grad_norm": 1.084110814960542, "learning_rate": 7.1397272239329684e-06, "loss": 0.881, "step": 799 }, { "epoch": 0.7581141909500119, "grad_norm": 1.4178645946238013, "learning_rate": 7.132785833789344e-06, "loss": 0.9458, "step": 800 }, { "epoch": 0.7590618336886994, "grad_norm": 1.0724390633923235, "learning_rate": 7.125839415178204e-06, "loss": 0.8477, "step": 801 }, { "epoch": 0.7600094764273869, "grad_norm": 1.1299800999994787, "learning_rate": 7.118887984477116e-06, "loss": 0.8842, "step": 802 }, { "epoch": 0.7609571191660743, "grad_norm": 1.1509151559671802, "learning_rate": 7.111931558075465e-06, "loss": 0.8459, "step": 803 }, { "epoch": 0.7619047619047619, "grad_norm": 1.154455319378924, "learning_rate": 7.104970152374405e-06, "loss": 0.9082, "step": 804 }, { "epoch": 0.7628524046434494, "grad_norm": 1.15539942235384, "learning_rate": 7.098003783786844e-06, "loss": 0.9114, "step": 805 }, { "epoch": 0.7638000473821369, "grad_norm": 1.1800047790109787, "learning_rate": 7.091032468737382e-06, "loss": 0.8608, "step": 806 }, { "epoch": 0.7647476901208244, "grad_norm": 1.1477444619292674, "learning_rate": 7.084056223662282e-06, "loss": 0.8842, "step": 807 }, { "epoch": 0.765695332859512, "grad_norm": 1.1737093276570716, "learning_rate": 7.0770750650094335e-06, "loss": 0.9007, "step": 808 }, { "epoch": 0.7666429755981995, "grad_norm": 1.0202933754062562, "learning_rate": 7.070089009238306e-06, "loss": 0.9234, "step": 809 }, { "epoch": 0.767590618336887, "grad_norm": 1.0550807990536413, "learning_rate": 7.063098072819919e-06, "loss": 0.8696, "step": 810 }, { "epoch": 0.7685382610755745, "grad_norm": 1.0531655916431704, "learning_rate": 7.056102272236799e-06, "loss": 0.8853, "step": 811 }, { "epoch": 0.7694859038142621, "grad_norm": 1.2354614191985032, "learning_rate": 7.049101623982938e-06, "loss": 0.883, "step": 812 }, { "epoch": 0.7704335465529495, "grad_norm": 0.9726437070709893, "learning_rate": 7.04209614456376e-06, "loss": 0.9153, "step": 813 }, { "epoch": 0.771381189291637, "grad_norm": 2.241432047297512, "learning_rate": 7.035085850496079e-06, "loss": 0.94, "step": 814 }, { "epoch": 0.771381189291637, "eval_loss": 0.9247336387634277, "eval_runtime": 60.7578, "eval_samples_per_second": 44.9, "eval_steps_per_second": 0.708, "step": 814 }, { "epoch": 0.7723288320303245, "grad_norm": 1.1356145449628114, "learning_rate": 7.028070758308059e-06, "loss": 0.8219, "step": 815 }, { "epoch": 0.7732764747690121, "grad_norm": 1.1079321851319959, "learning_rate": 7.021050884539178e-06, "loss": 0.8588, "step": 816 }, { "epoch": 0.7742241175076996, "grad_norm": 1.3040268308315734, "learning_rate": 7.014026245740185e-06, "loss": 0.8419, "step": 817 }, { "epoch": 0.7751717602463871, "grad_norm": 1.230458455518736, "learning_rate": 7.006996858473068e-06, "loss": 0.9624, "step": 818 }, { "epoch": 0.7761194029850746, "grad_norm": 0.9567806690635783, "learning_rate": 6.999962739311008e-06, "loss": 0.8194, "step": 819 }, { "epoch": 0.7770670457237622, "grad_norm": 1.243710347278979, "learning_rate": 6.992923904838341e-06, "loss": 0.8955, "step": 820 }, { "epoch": 0.7780146884624497, "grad_norm": 1.0335878204965474, "learning_rate": 6.98588037165052e-06, "loss": 0.9045, "step": 821 }, { "epoch": 0.7789623312011372, "grad_norm": 1.0941302857809432, "learning_rate": 6.97883215635408e-06, "loss": 0.8902, "step": 822 }, { "epoch": 0.7799099739398246, "grad_norm": 1.123840857900373, "learning_rate": 6.971779275566593e-06, "loss": 0.8913, "step": 823 }, { "epoch": 0.7808576166785122, "grad_norm": 1.1195400281687917, "learning_rate": 6.96472174591663e-06, "loss": 0.8474, "step": 824 }, { "epoch": 0.7818052594171997, "grad_norm": 1.3420603070760255, "learning_rate": 6.957659584043724e-06, "loss": 0.9077, "step": 825 }, { "epoch": 0.7827529021558872, "grad_norm": 1.0041194424246165, "learning_rate": 6.9505928065983275e-06, "loss": 0.9597, "step": 826 }, { "epoch": 0.7837005448945747, "grad_norm": 1.0270969195404063, "learning_rate": 6.943521430241777e-06, "loss": 0.8403, "step": 827 }, { "epoch": 0.7846481876332623, "grad_norm": 1.1552904131971864, "learning_rate": 6.936445471646249e-06, "loss": 0.9044, "step": 828 }, { "epoch": 0.7855958303719498, "grad_norm": 1.304060055980435, "learning_rate": 6.929364947494729e-06, "loss": 0.9, "step": 829 }, { "epoch": 0.7865434731106373, "grad_norm": 1.210030944302157, "learning_rate": 6.922279874480959e-06, "loss": 0.9113, "step": 830 }, { "epoch": 0.7874911158493249, "grad_norm": 1.025883162070808, "learning_rate": 6.915190269309416e-06, "loss": 0.9074, "step": 831 }, { "epoch": 0.7884387585880123, "grad_norm": 1.1256642999783826, "learning_rate": 6.908096148695251e-06, "loss": 0.9119, "step": 832 }, { "epoch": 0.7893864013266998, "grad_norm": 1.0860563647829231, "learning_rate": 6.900997529364269e-06, "loss": 0.9093, "step": 833 }, { "epoch": 0.7903340440653873, "grad_norm": 1.0010442291598631, "learning_rate": 6.893894428052881e-06, "loss": 0.8858, "step": 834 }, { "epoch": 0.7912816868040748, "grad_norm": 1.0574820455803635, "learning_rate": 6.886786861508061e-06, "loss": 0.8924, "step": 835 }, { "epoch": 0.7922293295427624, "grad_norm": 1.2017601371365032, "learning_rate": 6.879674846487314e-06, "loss": 0.8959, "step": 836 }, { "epoch": 0.7922293295427624, "eval_loss": 0.9229027628898621, "eval_runtime": 61.6949, "eval_samples_per_second": 44.218, "eval_steps_per_second": 0.697, "step": 836 }, { "epoch": 0.7931769722814499, "grad_norm": 1.0725982823665645, "learning_rate": 6.872558399758633e-06, "loss": 0.8485, "step": 837 }, { "epoch": 0.7941246150201374, "grad_norm": 1.0560214413221218, "learning_rate": 6.865437538100456e-06, "loss": 0.8418, "step": 838 }, { "epoch": 0.795072257758825, "grad_norm": 1.1398098092881728, "learning_rate": 6.858312278301638e-06, "loss": 0.8506, "step": 839 }, { "epoch": 0.7960199004975125, "grad_norm": 1.1966491026312496, "learning_rate": 6.8511826371613955e-06, "loss": 0.9207, "step": 840 }, { "epoch": 0.7969675432362, "grad_norm": 1.100229416684982, "learning_rate": 6.8440486314892775e-06, "loss": 0.8327, "step": 841 }, { "epoch": 0.7979151859748874, "grad_norm": 1.0044010076247918, "learning_rate": 6.836910278105124e-06, "loss": 0.823, "step": 842 }, { "epoch": 0.798862828713575, "grad_norm": 1.0289580305146189, "learning_rate": 6.8297675938390275e-06, "loss": 0.8566, "step": 843 }, { "epoch": 0.7998104714522625, "grad_norm": 1.701693817598629, "learning_rate": 6.822620595531286e-06, "loss": 0.9532, "step": 844 }, { "epoch": 0.80075811419095, "grad_norm": 1.1349425721498254, "learning_rate": 6.815469300032374e-06, "loss": 0.914, "step": 845 }, { "epoch": 0.8017057569296375, "grad_norm": 1.139104764424171, "learning_rate": 6.808313724202894e-06, "loss": 0.9461, "step": 846 }, { "epoch": 0.802653399668325, "grad_norm": 1.4336731001103296, "learning_rate": 6.801153884913541e-06, "loss": 0.8307, "step": 847 }, { "epoch": 0.8036010424070126, "grad_norm": 1.0889586206123247, "learning_rate": 6.793989799045067e-06, "loss": 0.9337, "step": 848 }, { "epoch": 0.8045486851457001, "grad_norm": 1.1232729735134666, "learning_rate": 6.7868214834882265e-06, "loss": 0.9321, "step": 849 }, { "epoch": 0.8054963278843876, "grad_norm": 1.1373251435631513, "learning_rate": 6.779648955143754e-06, "loss": 0.8665, "step": 850 }, { "epoch": 0.806443970623075, "grad_norm": 1.2552274257774585, "learning_rate": 6.772472230922313e-06, "loss": 0.8871, "step": 851 }, { "epoch": 0.8073916133617626, "grad_norm": 1.0665372846687498, "learning_rate": 6.765291327744463e-06, "loss": 0.8943, "step": 852 }, { "epoch": 0.8083392561004501, "grad_norm": 2.382365978345813, "learning_rate": 6.758106262540611e-06, "loss": 0.96, "step": 853 }, { "epoch": 0.8092868988391376, "grad_norm": 1.2804659583348557, "learning_rate": 6.750917052250981e-06, "loss": 0.9211, "step": 854 }, { "epoch": 0.8102345415778252, "grad_norm": 1.1113739193035552, "learning_rate": 6.7437237138255686e-06, "loss": 0.9385, "step": 855 }, { "epoch": 0.8111821843165127, "grad_norm": 1.1014129494663154, "learning_rate": 6.736526264224101e-06, "loss": 0.8579, "step": 856 }, { "epoch": 0.8121298270552002, "grad_norm": 0.9349690740722351, "learning_rate": 6.7293247204160024e-06, "loss": 0.8415, "step": 857 }, { "epoch": 0.8130774697938877, "grad_norm": 1.0528528824598096, "learning_rate": 6.722119099380345e-06, "loss": 0.9034, "step": 858 }, { "epoch": 0.8130774697938877, "eval_loss": 0.9221681952476501, "eval_runtime": 66.5105, "eval_samples_per_second": 41.016, "eval_steps_per_second": 0.647, "step": 858 }, { "epoch": 0.8140251125325753, "grad_norm": 1.1189012672893626, "learning_rate": 6.714909418105816e-06, "loss": 0.8928, "step": 859 }, { "epoch": 0.8149727552712628, "grad_norm": 1.0491667848828412, "learning_rate": 6.7076956935906756e-06, "loss": 0.8846, "step": 860 }, { "epoch": 0.8159203980099502, "grad_norm": 1.1800217977478704, "learning_rate": 6.700477942842717e-06, "loss": 0.8467, "step": 861 }, { "epoch": 0.8168680407486377, "grad_norm": 1.3635685933149135, "learning_rate": 6.693256182879224e-06, "loss": 0.8875, "step": 862 }, { "epoch": 0.8178156834873253, "grad_norm": 0.9738480603904128, "learning_rate": 6.686030430726938e-06, "loss": 0.8611, "step": 863 }, { "epoch": 0.8187633262260128, "grad_norm": 1.0646192263592438, "learning_rate": 6.678800703422004e-06, "loss": 0.9442, "step": 864 }, { "epoch": 0.8197109689647003, "grad_norm": 1.133138450105503, "learning_rate": 6.671567018009948e-06, "loss": 0.8936, "step": 865 }, { "epoch": 0.8206586117033878, "grad_norm": 1.1779803833294133, "learning_rate": 6.664329391545625e-06, "loss": 0.8945, "step": 866 }, { "epoch": 0.8216062544420754, "grad_norm": 1.14213552665385, "learning_rate": 6.657087841093179e-06, "loss": 0.8796, "step": 867 }, { "epoch": 0.8225538971807629, "grad_norm": 1.0919659396363655, "learning_rate": 6.649842383726011e-06, "loss": 0.9093, "step": 868 }, { "epoch": 0.8235015399194504, "grad_norm": 1.1540405069787951, "learning_rate": 6.642593036526728e-06, "loss": 0.8398, "step": 869 }, { "epoch": 0.8244491826581379, "grad_norm": 0.9630828315476891, "learning_rate": 6.635339816587109e-06, "loss": 0.8616, "step": 870 }, { "epoch": 0.8253968253968254, "grad_norm": 1.138452507177444, "learning_rate": 6.628082741008068e-06, "loss": 0.9328, "step": 871 }, { "epoch": 0.8263444681355129, "grad_norm": 1.1595340664384783, "learning_rate": 6.620821826899606e-06, "loss": 0.8951, "step": 872 }, { "epoch": 0.8272921108742004, "grad_norm": 1.0403640766877473, "learning_rate": 6.613557091380771e-06, "loss": 0.8403, "step": 873 }, { "epoch": 0.8282397536128879, "grad_norm": 1.0491318496593032, "learning_rate": 6.606288551579629e-06, "loss": 0.8726, "step": 874 }, { "epoch": 0.8291873963515755, "grad_norm": 1.0770673557649837, "learning_rate": 6.599016224633209e-06, "loss": 0.8777, "step": 875 }, { "epoch": 0.830135039090263, "grad_norm": 1.2586661831963606, "learning_rate": 6.59174012768747e-06, "loss": 0.9406, "step": 876 }, { "epoch": 0.8310826818289505, "grad_norm": 0.9975414535990195, "learning_rate": 6.584460277897262e-06, "loss": 0.9178, "step": 877 }, { "epoch": 0.832030324567638, "grad_norm": 1.1950907387823217, "learning_rate": 6.5771766924262795e-06, "loss": 0.8673, "step": 878 }, { "epoch": 0.8329779673063256, "grad_norm": 1.026575911873265, "learning_rate": 6.569889388447025e-06, "loss": 0.8515, "step": 879 }, { "epoch": 0.833925610045013, "grad_norm": 1.1555312684415828, "learning_rate": 6.562598383140773e-06, "loss": 0.9227, "step": 880 }, { "epoch": 0.833925610045013, "eval_loss": 0.9205263257026672, "eval_runtime": 65.5841, "eval_samples_per_second": 41.595, "eval_steps_per_second": 0.656, "step": 880 }, { "epoch": 0.8348732527837005, "grad_norm": 1.0410870491482918, "learning_rate": 6.555303693697517e-06, "loss": 0.8879, "step": 881 }, { "epoch": 0.835820895522388, "grad_norm": 1.2046733679595938, "learning_rate": 6.548005337315943e-06, "loss": 0.9327, "step": 882 }, { "epoch": 0.8367685382610756, "grad_norm": 1.4002394037690635, "learning_rate": 6.540703331203382e-06, "loss": 0.8616, "step": 883 }, { "epoch": 0.8377161809997631, "grad_norm": 1.0820553537637987, "learning_rate": 6.533397692575766e-06, "loss": 0.8599, "step": 884 }, { "epoch": 0.8386638237384506, "grad_norm": 1.1367287841963307, "learning_rate": 6.526088438657594e-06, "loss": 0.9047, "step": 885 }, { "epoch": 0.8396114664771381, "grad_norm": 1.1941704415773284, "learning_rate": 6.518775586681887e-06, "loss": 0.8552, "step": 886 }, { "epoch": 0.8405591092158257, "grad_norm": 1.059195584331362, "learning_rate": 6.511459153890152e-06, "loss": 0.9146, "step": 887 }, { "epoch": 0.8415067519545132, "grad_norm": 1.105675547858621, "learning_rate": 6.504139157532338e-06, "loss": 0.8386, "step": 888 }, { "epoch": 0.8424543946932007, "grad_norm": 1.176419395830192, "learning_rate": 6.496815614866792e-06, "loss": 0.851, "step": 889 }, { "epoch": 0.8434020374318881, "grad_norm": 1.4493743507868786, "learning_rate": 6.489488543160225e-06, "loss": 0.9137, "step": 890 }, { "epoch": 0.8443496801705757, "grad_norm": 0.9826791105867756, "learning_rate": 6.4821579596876705e-06, "loss": 0.9117, "step": 891 }, { "epoch": 0.8452973229092632, "grad_norm": 1.0251650573055395, "learning_rate": 6.4748238817324395e-06, "loss": 0.9214, "step": 892 }, { "epoch": 0.8462449656479507, "grad_norm": 1.025330926521032, "learning_rate": 6.46748632658608e-06, "loss": 0.8434, "step": 893 }, { "epoch": 0.8471926083866382, "grad_norm": 1.0829549887894614, "learning_rate": 6.460145311548341e-06, "loss": 0.9142, "step": 894 }, { "epoch": 0.8481402511253258, "grad_norm": 1.0949066487399224, "learning_rate": 6.452800853927128e-06, "loss": 0.8257, "step": 895 }, { "epoch": 0.8490878938640133, "grad_norm": 1.153959748533555, "learning_rate": 6.445452971038464e-06, "loss": 0.9253, "step": 896 }, { "epoch": 0.8500355366027008, "grad_norm": 1.193405218423659, "learning_rate": 6.438101680206444e-06, "loss": 0.9291, "step": 897 }, { "epoch": 0.8509831793413883, "grad_norm": 1.2544535828853964, "learning_rate": 6.430746998763204e-06, "loss": 0.9173, "step": 898 }, { "epoch": 0.8519308220800759, "grad_norm": 0.9715729053189112, "learning_rate": 6.42338894404887e-06, "loss": 0.8385, "step": 899 }, { "epoch": 0.8528784648187633, "grad_norm": 1.0084105628200437, "learning_rate": 6.41602753341152e-06, "loss": 0.886, "step": 900 }, { "epoch": 0.8538261075574508, "grad_norm": 1.0846787519964416, "learning_rate": 6.408662784207149e-06, "loss": 0.8862, "step": 901 }, { "epoch": 0.8547737502961383, "grad_norm": 1.553862723805949, "learning_rate": 6.4012947137996175e-06, "loss": 0.9481, "step": 902 }, { "epoch": 0.8547737502961383, "eval_loss": 0.9192849397659302, "eval_runtime": 61.9949, "eval_samples_per_second": 44.004, "eval_steps_per_second": 0.694, "step": 902 }, { "epoch": 0.8557213930348259, "grad_norm": 1.190837748503028, "learning_rate": 6.393923339560621e-06, "loss": 0.9056, "step": 903 }, { "epoch": 0.8566690357735134, "grad_norm": 1.0240140450014306, "learning_rate": 6.386548678869644e-06, "loss": 0.8862, "step": 904 }, { "epoch": 0.8576166785122009, "grad_norm": 1.1684091911040653, "learning_rate": 6.379170749113918e-06, "loss": 0.9077, "step": 905 }, { "epoch": 0.8585643212508884, "grad_norm": 1.2458608937767364, "learning_rate": 6.37178956768838e-06, "loss": 0.8883, "step": 906 }, { "epoch": 0.859511963989576, "grad_norm": 1.0899866220356935, "learning_rate": 6.3644051519956366e-06, "loss": 0.8953, "step": 907 }, { "epoch": 0.8604596067282635, "grad_norm": 1.0159031729823804, "learning_rate": 6.3570175194459205e-06, "loss": 0.8946, "step": 908 }, { "epoch": 0.8614072494669509, "grad_norm": 1.0422661890872638, "learning_rate": 6.349626687457045e-06, "loss": 0.9217, "step": 909 }, { "epoch": 0.8623548922056384, "grad_norm": 1.0650534348419232, "learning_rate": 6.342232673454371e-06, "loss": 0.8993, "step": 910 }, { "epoch": 0.863302534944326, "grad_norm": 1.162095463386776, "learning_rate": 6.334835494870759e-06, "loss": 0.9435, "step": 911 }, { "epoch": 0.8642501776830135, "grad_norm": 0.9740991606947988, "learning_rate": 6.3274351691465305e-06, "loss": 0.8874, "step": 912 }, { "epoch": 0.865197820421701, "grad_norm": 1.0752595651605357, "learning_rate": 6.320031713729429e-06, "loss": 0.8733, "step": 913 }, { "epoch": 0.8661454631603885, "grad_norm": 1.003012232982504, "learning_rate": 6.312625146074574e-06, "loss": 0.8997, "step": 914 }, { "epoch": 0.8670931058990761, "grad_norm": 1.2050664252308885, "learning_rate": 6.305215483644427e-06, "loss": 0.9121, "step": 915 }, { "epoch": 0.8680407486377636, "grad_norm": 1.0509143532390477, "learning_rate": 6.2978027439087405e-06, "loss": 0.9215, "step": 916 }, { "epoch": 0.8689883913764511, "grad_norm": 1.1000294092216198, "learning_rate": 6.290386944344527e-06, "loss": 0.8209, "step": 917 }, { "epoch": 0.8699360341151386, "grad_norm": 1.4381680474891718, "learning_rate": 6.28296810243601e-06, "loss": 0.9485, "step": 918 }, { "epoch": 0.8708836768538261, "grad_norm": 1.0986281816759764, "learning_rate": 6.2755462356745885e-06, "loss": 0.8677, "step": 919 }, { "epoch": 0.8718313195925136, "grad_norm": 1.1944430191428006, "learning_rate": 6.268121361558792e-06, "loss": 0.932, "step": 920 }, { "epoch": 0.8727789623312011, "grad_norm": 30.495921119331683, "learning_rate": 6.2606934975942415e-06, "loss": 0.9977, "step": 921 }, { "epoch": 0.8737266050698886, "grad_norm": 1.0629329656422315, "learning_rate": 6.2532626612936035e-06, "loss": 0.9012, "step": 922 }, { "epoch": 0.8746742478085762, "grad_norm": 1.212840019137115, "learning_rate": 6.245828870176557e-06, "loss": 0.8842, "step": 923 }, { "epoch": 0.8756218905472637, "grad_norm": 1.1166380211338318, "learning_rate": 6.238392141769743e-06, "loss": 0.8853, "step": 924 }, { "epoch": 0.8756218905472637, "eval_loss": 0.9188343286514282, "eval_runtime": 71.0766, "eval_samples_per_second": 38.381, "eval_steps_per_second": 0.605, "step": 924 }, { "epoch": 0.8765695332859512, "grad_norm": 1.0374942288524216, "learning_rate": 6.2309524936067344e-06, "loss": 0.8285, "step": 925 }, { "epoch": 0.8775171760246387, "grad_norm": 1.294054885875714, "learning_rate": 6.22350994322798e-06, "loss": 0.9001, "step": 926 }, { "epoch": 0.8784648187633263, "grad_norm": 1.0637784431739705, "learning_rate": 6.216064508180778e-06, "loss": 0.8865, "step": 927 }, { "epoch": 0.8794124615020137, "grad_norm": 1.0389026144557505, "learning_rate": 6.208616206019225e-06, "loss": 0.8368, "step": 928 }, { "epoch": 0.8803601042407012, "grad_norm": 1.008199300013402, "learning_rate": 6.2011650543041734e-06, "loss": 0.8638, "step": 929 }, { "epoch": 0.8813077469793887, "grad_norm": 1.0169656783697818, "learning_rate": 6.193711070603202e-06, "loss": 0.8854, "step": 930 }, { "epoch": 0.8822553897180763, "grad_norm": 1.3473009889737735, "learning_rate": 6.1862542724905605e-06, "loss": 0.8851, "step": 931 }, { "epoch": 0.8832030324567638, "grad_norm": 0.9515757094852794, "learning_rate": 6.178794677547138e-06, "loss": 0.8049, "step": 932 }, { "epoch": 0.8841506751954513, "grad_norm": 1.0333010505925766, "learning_rate": 6.171332303360411e-06, "loss": 0.8989, "step": 933 }, { "epoch": 0.8850983179341388, "grad_norm": 1.0350926221697927, "learning_rate": 6.163867167524419e-06, "loss": 0.8401, "step": 934 }, { "epoch": 0.8860459606728264, "grad_norm": 1.0721173411729321, "learning_rate": 6.156399287639703e-06, "loss": 0.9309, "step": 935 }, { "epoch": 0.8869936034115139, "grad_norm": 1.3120269262792263, "learning_rate": 6.14892868131328e-06, "loss": 0.8991, "step": 936 }, { "epoch": 0.8879412461502014, "grad_norm": 1.1865006482607656, "learning_rate": 6.1414553661585905e-06, "loss": 0.8683, "step": 937 }, { "epoch": 0.8888888888888888, "grad_norm": 1.2132247708632098, "learning_rate": 6.1339793597954675e-06, "loss": 0.8569, "step": 938 }, { "epoch": 0.8898365316275764, "grad_norm": 1.356211439062163, "learning_rate": 6.126500679850082e-06, "loss": 0.8543, "step": 939 }, { "epoch": 0.8907841743662639, "grad_norm": 1.0301726611069624, "learning_rate": 6.119019343954914e-06, "loss": 0.8244, "step": 940 }, { "epoch": 0.8917318171049514, "grad_norm": 1.2284199784911718, "learning_rate": 6.111535369748702e-06, "loss": 0.9085, "step": 941 }, { "epoch": 0.892679459843639, "grad_norm": 1.0917917658926368, "learning_rate": 6.104048774876407e-06, "loss": 0.9026, "step": 942 }, { "epoch": 0.8936271025823265, "grad_norm": 1.0918457932637566, "learning_rate": 6.096559576989166e-06, "loss": 0.8416, "step": 943 }, { "epoch": 0.894574745321014, "grad_norm": 1.090778224144194, "learning_rate": 6.089067793744258e-06, "loss": 0.9331, "step": 944 }, { "epoch": 0.8955223880597015, "grad_norm": 1.1895282418746094, "learning_rate": 6.0815734428050535e-06, "loss": 0.9023, "step": 945 }, { "epoch": 0.896470030798389, "grad_norm": 1.1429336151352636, "learning_rate": 6.074076541840978e-06, "loss": 0.8708, "step": 946 }, { "epoch": 0.896470030798389, "eval_loss": 0.9174872636795044, "eval_runtime": 64.8721, "eval_samples_per_second": 42.052, "eval_steps_per_second": 0.663, "step": 946 }, { "epoch": 0.8974176735370766, "grad_norm": 1.0370632105036575, "learning_rate": 6.066577108527469e-06, "loss": 0.8657, "step": 947 }, { "epoch": 0.898365316275764, "grad_norm": 1.0186442974981031, "learning_rate": 6.059075160545933e-06, "loss": 0.8767, "step": 948 }, { "epoch": 0.8993129590144515, "grad_norm": 1.1803999185158343, "learning_rate": 6.05157071558371e-06, "loss": 0.9213, "step": 949 }, { "epoch": 0.900260601753139, "grad_norm": 1.2808021169879162, "learning_rate": 6.044063791334023e-06, "loss": 0.7969, "step": 950 }, { "epoch": 0.9012082444918266, "grad_norm": 1.100561129817383, "learning_rate": 6.03655440549594e-06, "loss": 0.9169, "step": 951 }, { "epoch": 0.9021558872305141, "grad_norm": 1.0079024114327502, "learning_rate": 6.029042575774334e-06, "loss": 0.8063, "step": 952 }, { "epoch": 0.9031035299692016, "grad_norm": 1.188940267604656, "learning_rate": 6.021528319879843e-06, "loss": 0.8283, "step": 953 }, { "epoch": 0.9040511727078892, "grad_norm": 1.4178205131395747, "learning_rate": 6.01401165552882e-06, "loss": 0.9282, "step": 954 }, { "epoch": 0.9049988154465767, "grad_norm": 1.0472375312060485, "learning_rate": 6.006492600443301e-06, "loss": 0.8396, "step": 955 }, { "epoch": 0.9059464581852642, "grad_norm": 1.1298052814130883, "learning_rate": 5.998971172350953e-06, "loss": 0.8898, "step": 956 }, { "epoch": 0.9068941009239516, "grad_norm": 1.1111085147456654, "learning_rate": 5.991447388985045e-06, "loss": 0.8682, "step": 957 }, { "epoch": 0.9078417436626391, "grad_norm": 1.4266031924717084, "learning_rate": 5.9839212680843925e-06, "loss": 0.8415, "step": 958 }, { "epoch": 0.9087893864013267, "grad_norm": 1.3571311779441146, "learning_rate": 5.976392827393326e-06, "loss": 0.9395, "step": 959 }, { "epoch": 0.9097370291400142, "grad_norm": 1.0252667256604506, "learning_rate": 5.968862084661643e-06, "loss": 0.8144, "step": 960 }, { "epoch": 0.9106846718787017, "grad_norm": 1.0648709592997376, "learning_rate": 5.961329057644571e-06, "loss": 0.8239, "step": 961 }, { "epoch": 0.9116323146173892, "grad_norm": 1.1431114083793006, "learning_rate": 5.9537937641027225e-06, "loss": 0.8986, "step": 962 }, { "epoch": 0.9125799573560768, "grad_norm": 1.051205202397749, "learning_rate": 5.946256221802052e-06, "loss": 0.8686, "step": 963 }, { "epoch": 0.9135276000947643, "grad_norm": 1.1593903949141868, "learning_rate": 5.938716448513819e-06, "loss": 0.8353, "step": 964 }, { "epoch": 0.9144752428334518, "grad_norm": 1.1838771825939993, "learning_rate": 5.931174462014538e-06, "loss": 0.9348, "step": 965 }, { "epoch": 0.9154228855721394, "grad_norm": 1.0369730284903498, "learning_rate": 5.923630280085948e-06, "loss": 0.8309, "step": 966 }, { "epoch": 0.9163705283108268, "grad_norm": 1.1424198652925397, "learning_rate": 5.916083920514959e-06, "loss": 0.8653, "step": 967 }, { "epoch": 0.9173181710495143, "grad_norm": 1.345398860745349, "learning_rate": 5.908535401093618e-06, "loss": 0.871, "step": 968 }, { "epoch": 0.9173181710495143, "eval_loss": 0.9164453744888306, "eval_runtime": 70.7681, "eval_samples_per_second": 38.548, "eval_steps_per_second": 0.608, "step": 968 }, { "epoch": 0.9182658137882018, "grad_norm": 1.2793924011477549, "learning_rate": 5.900984739619062e-06, "loss": 0.9352, "step": 969 }, { "epoch": 0.9192134565268893, "grad_norm": 1.069579103341338, "learning_rate": 5.893431953893483e-06, "loss": 0.8886, "step": 970 }, { "epoch": 0.9201610992655769, "grad_norm": 0.9577007372371958, "learning_rate": 5.885877061724075e-06, "loss": 0.9196, "step": 971 }, { "epoch": 0.9211087420042644, "grad_norm": 1.1384165277972425, "learning_rate": 5.878320080923001e-06, "loss": 0.8944, "step": 972 }, { "epoch": 0.9220563847429519, "grad_norm": 1.0993789420249829, "learning_rate": 5.8707610293073524e-06, "loss": 0.8718, "step": 973 }, { "epoch": 0.9230040274816395, "grad_norm": 0.990859843943125, "learning_rate": 5.8631999246990954e-06, "loss": 0.8815, "step": 974 }, { "epoch": 0.923951670220327, "grad_norm": 0.9596254067235986, "learning_rate": 5.855636784925044e-06, "loss": 0.8873, "step": 975 }, { "epoch": 0.9248993129590144, "grad_norm": 1.1971458477938866, "learning_rate": 5.848071627816804e-06, "loss": 0.9301, "step": 976 }, { "epoch": 0.9258469556977019, "grad_norm": 1.293695132429081, "learning_rate": 5.840504471210742e-06, "loss": 0.8826, "step": 977 }, { "epoch": 0.9267945984363894, "grad_norm": 1.0637632989021661, "learning_rate": 5.832935332947937e-06, "loss": 0.8744, "step": 978 }, { "epoch": 0.927742241175077, "grad_norm": 1.0066024634519515, "learning_rate": 5.82536423087414e-06, "loss": 0.8836, "step": 979 }, { "epoch": 0.9286898839137645, "grad_norm": 1.114286515649878, "learning_rate": 5.817791182839734e-06, "loss": 0.8973, "step": 980 }, { "epoch": 0.929637526652452, "grad_norm": 1.2393549521144707, "learning_rate": 5.810216206699686e-06, "loss": 0.9605, "step": 981 }, { "epoch": 0.9305851693911396, "grad_norm": 1.0160087719698536, "learning_rate": 5.8026393203135145e-06, "loss": 0.9383, "step": 982 }, { "epoch": 0.9315328121298271, "grad_norm": 1.1104693027389803, "learning_rate": 5.7950605415452365e-06, "loss": 0.8697, "step": 983 }, { "epoch": 0.9324804548685146, "grad_norm": 1.0139561721996317, "learning_rate": 5.787479888263333e-06, "loss": 0.8634, "step": 984 }, { "epoch": 0.9334280976072021, "grad_norm": 0.9149964477354714, "learning_rate": 5.779897378340705e-06, "loss": 0.8692, "step": 985 }, { "epoch": 0.9343757403458895, "grad_norm": 1.081719984477777, "learning_rate": 5.772313029654631e-06, "loss": 0.8752, "step": 986 }, { "epoch": 0.9353233830845771, "grad_norm": 0.9686190602510739, "learning_rate": 5.76472686008672e-06, "loss": 0.9374, "step": 987 }, { "epoch": 0.9362710258232646, "grad_norm": 1.1186604360938692, "learning_rate": 5.757138887522884e-06, "loss": 0.9454, "step": 988 }, { "epoch": 0.9372186685619521, "grad_norm": 1.1474863690068453, "learning_rate": 5.749549129853277e-06, "loss": 0.9526, "step": 989 }, { "epoch": 0.9381663113006397, "grad_norm": 1.2065827627859584, "learning_rate": 5.741957604972264e-06, "loss": 0.9015, "step": 990 }, { "epoch": 0.9381663113006397, "eval_loss": 0.9157132506370544, "eval_runtime": 63.106, "eval_samples_per_second": 43.229, "eval_steps_per_second": 0.681, "step": 990 }, { "epoch": 0.9391139540393272, "grad_norm": 1.0667864777077534, "learning_rate": 5.734364330778381e-06, "loss": 0.9115, "step": 991 }, { "epoch": 0.9400615967780147, "grad_norm": 1.0077841132377081, "learning_rate": 5.726769325174279e-06, "loss": 0.8647, "step": 992 }, { "epoch": 0.9410092395167022, "grad_norm": 1.0542855609546453, "learning_rate": 5.719172606066703e-06, "loss": 0.871, "step": 993 }, { "epoch": 0.9419568822553898, "grad_norm": 1.0723234157404569, "learning_rate": 5.711574191366427e-06, "loss": 0.8652, "step": 994 }, { "epoch": 0.9429045249940773, "grad_norm": 1.0614000028665043, "learning_rate": 5.703974098988229e-06, "loss": 0.8693, "step": 995 }, { "epoch": 0.9438521677327647, "grad_norm": 1.184972220370901, "learning_rate": 5.696372346850842e-06, "loss": 0.8748, "step": 996 }, { "epoch": 0.9447998104714522, "grad_norm": 3.0693319889655357, "learning_rate": 5.68876895287691e-06, "loss": 0.9179, "step": 997 }, { "epoch": 0.9457474532101398, "grad_norm": 1.364979369089619, "learning_rate": 5.68116393499295e-06, "loss": 0.9287, "step": 998 }, { "epoch": 0.9466950959488273, "grad_norm": 1.138619446576553, "learning_rate": 5.673557311129306e-06, "loss": 0.9642, "step": 999 }, { "epoch": 0.9476427386875148, "grad_norm": 1.1855283956687006, "learning_rate": 5.66594909922011e-06, "loss": 0.8375, "step": 1000 }, { "epoch": 0.9485903814262023, "grad_norm": 1.1632815768908809, "learning_rate": 5.658339317203235e-06, "loss": 0.8411, "step": 1001 }, { "epoch": 0.9495380241648899, "grad_norm": 1.2165292478376695, "learning_rate": 5.650727983020262e-06, "loss": 0.802, "step": 1002 }, { "epoch": 0.9504856669035774, "grad_norm": 1.1272166319806751, "learning_rate": 5.6431151146164255e-06, "loss": 0.8764, "step": 1003 }, { "epoch": 0.9514333096422649, "grad_norm": 1.179748089906736, "learning_rate": 5.635500729940578e-06, "loss": 0.8728, "step": 1004 }, { "epoch": 0.9523809523809523, "grad_norm": 1.1413912763569145, "learning_rate": 5.627884846945151e-06, "loss": 0.8815, "step": 1005 }, { "epoch": 0.9533285951196399, "grad_norm": 1.1198726067886378, "learning_rate": 5.6202674835861045e-06, "loss": 0.8549, "step": 1006 }, { "epoch": 0.9542762378583274, "grad_norm": 1.0106130328632965, "learning_rate": 5.6126486578228926e-06, "loss": 0.8785, "step": 1007 }, { "epoch": 0.9552238805970149, "grad_norm": 1.092869267593721, "learning_rate": 5.605028387618412e-06, "loss": 0.9306, "step": 1008 }, { "epoch": 0.9561715233357024, "grad_norm": 1.2094913165105556, "learning_rate": 5.597406690938969e-06, "loss": 0.8963, "step": 1009 }, { "epoch": 0.95711916607439, "grad_norm": 1.1750720019167373, "learning_rate": 5.5897835857542315e-06, "loss": 0.8666, "step": 1010 }, { "epoch": 0.9580668088130775, "grad_norm": 1.3245562535134334, "learning_rate": 5.582159090037189e-06, "loss": 0.8291, "step": 1011 }, { "epoch": 0.959014451551765, "grad_norm": 1.2042706760957538, "learning_rate": 5.574533221764109e-06, "loss": 0.8684, "step": 1012 }, { "epoch": 0.959014451551765, "eval_loss": 0.9151268601417542, "eval_runtime": 62.5892, "eval_samples_per_second": 43.586, "eval_steps_per_second": 0.687, "step": 1012 }, { "epoch": 0.9599620942904525, "grad_norm": 1.3006236475762307, "learning_rate": 5.566905998914496e-06, "loss": 0.8668, "step": 1013 }, { "epoch": 0.9609097370291401, "grad_norm": 1.0257069903720804, "learning_rate": 5.559277439471047e-06, "loss": 0.8478, "step": 1014 }, { "epoch": 0.9618573797678275, "grad_norm": 1.511736931943181, "learning_rate": 5.551647561419611e-06, "loss": 0.8859, "step": 1015 }, { "epoch": 0.962805022506515, "grad_norm": 1.0720441919793242, "learning_rate": 5.544016382749146e-06, "loss": 0.8665, "step": 1016 }, { "epoch": 0.9637526652452025, "grad_norm": 1.5150355984208133, "learning_rate": 5.536383921451673e-06, "loss": 0.8628, "step": 1017 }, { "epoch": 0.9647003079838901, "grad_norm": 1.2280290508409115, "learning_rate": 5.528750195522244e-06, "loss": 0.8873, "step": 1018 }, { "epoch": 0.9656479507225776, "grad_norm": 1.0449090226929965, "learning_rate": 5.521115222958889e-06, "loss": 0.9395, "step": 1019 }, { "epoch": 0.9665955934612651, "grad_norm": 1.286647376114692, "learning_rate": 5.513479021762573e-06, "loss": 0.8706, "step": 1020 }, { "epoch": 0.9675432361999526, "grad_norm": 1.2238701214843728, "learning_rate": 5.505841609937162e-06, "loss": 0.85, "step": 1021 }, { "epoch": 0.9684908789386402, "grad_norm": 1.08714516330967, "learning_rate": 5.498203005489378e-06, "loss": 0.8235, "step": 1022 }, { "epoch": 0.9694385216773277, "grad_norm": 0.9837819321269746, "learning_rate": 5.490563226428756e-06, "loss": 0.824, "step": 1023 }, { "epoch": 0.9703861644160152, "grad_norm": 1.0960448192217682, "learning_rate": 5.4829222907675895e-06, "loss": 0.8735, "step": 1024 }, { "epoch": 0.9713338071547026, "grad_norm": 1.0733602660245445, "learning_rate": 5.475280216520913e-06, "loss": 0.846, "step": 1025 }, { "epoch": 0.9722814498933902, "grad_norm": 1.0672536201083378, "learning_rate": 5.467637021706438e-06, "loss": 0.8457, "step": 1026 }, { "epoch": 0.9732290926320777, "grad_norm": 1.2208010933061697, "learning_rate": 5.459992724344516e-06, "loss": 0.8684, "step": 1027 }, { "epoch": 0.9741767353707652, "grad_norm": 1.0289674507090458, "learning_rate": 5.4523473424581045e-06, "loss": 0.8768, "step": 1028 }, { "epoch": 0.9751243781094527, "grad_norm": 1.1688616748496152, "learning_rate": 5.444700894072712e-06, "loss": 0.8708, "step": 1029 }, { "epoch": 0.9760720208481403, "grad_norm": 1.0710612536808917, "learning_rate": 5.437053397216364e-06, "loss": 0.9093, "step": 1030 }, { "epoch": 0.9770196635868278, "grad_norm": 1.210378896652165, "learning_rate": 5.429404869919559e-06, "loss": 0.788, "step": 1031 }, { "epoch": 0.9779673063255153, "grad_norm": 1.0438767939858762, "learning_rate": 5.421755330215223e-06, "loss": 0.95, "step": 1032 }, { "epoch": 0.9789149490642028, "grad_norm": 1.071781685166561, "learning_rate": 5.4141047961386724e-06, "loss": 0.8668, "step": 1033 }, { "epoch": 0.9798625918028903, "grad_norm": 1.0966538771750634, "learning_rate": 5.4064532857275645e-06, "loss": 0.9063, "step": 1034 }, { "epoch": 0.9798625918028903, "eval_loss": 0.9142357707023621, "eval_runtime": 67.863, "eval_samples_per_second": 40.199, "eval_steps_per_second": 0.634, "step": 1034 }, { "epoch": 0.9808102345415778, "grad_norm": 1.0629520496350149, "learning_rate": 5.398800817021857e-06, "loss": 0.9179, "step": 1035 }, { "epoch": 0.9817578772802653, "grad_norm": 1.315356991713657, "learning_rate": 5.3911474080637705e-06, "loss": 0.862, "step": 1036 }, { "epoch": 0.9827055200189528, "grad_norm": 1.5792297604082457, "learning_rate": 5.383493076897742e-06, "loss": 0.8413, "step": 1037 }, { "epoch": 0.9836531627576404, "grad_norm": 1.4810853298698745, "learning_rate": 5.3758378415703825e-06, "loss": 0.845, "step": 1038 }, { "epoch": 0.9846008054963279, "grad_norm": 1.4008685021146463, "learning_rate": 5.368181720130434e-06, "loss": 0.8588, "step": 1039 }, { "epoch": 0.9855484482350154, "grad_norm": 1.0609355637628701, "learning_rate": 5.3605247306287275e-06, "loss": 0.8704, "step": 1040 }, { "epoch": 0.9864960909737029, "grad_norm": 1.147604350423996, "learning_rate": 5.352866891118143e-06, "loss": 0.8918, "step": 1041 }, { "epoch": 0.9874437337123905, "grad_norm": 1.0985464941272645, "learning_rate": 5.345208219653562e-06, "loss": 0.9016, "step": 1042 }, { "epoch": 0.988391376451078, "grad_norm": 1.05526373628293, "learning_rate": 5.337548734291827e-06, "loss": 0.8496, "step": 1043 }, { "epoch": 0.9893390191897654, "grad_norm": 0.9639147858281681, "learning_rate": 5.329888453091701e-06, "loss": 0.8429, "step": 1044 }, { "epoch": 0.9902866619284529, "grad_norm": 1.0367677286445547, "learning_rate": 5.322227394113826e-06, "loss": 0.9336, "step": 1045 }, { "epoch": 0.9912343046671405, "grad_norm": 1.2202980010199613, "learning_rate": 5.314565575420671e-06, "loss": 0.8396, "step": 1046 }, { "epoch": 0.992181947405828, "grad_norm": 1.043650160849041, "learning_rate": 5.306903015076502e-06, "loss": 0.9273, "step": 1047 }, { "epoch": 0.9931295901445155, "grad_norm": 1.2196944237357306, "learning_rate": 5.299239731147332e-06, "loss": 0.882, "step": 1048 }, { "epoch": 0.994077232883203, "grad_norm": 1.160879798524974, "learning_rate": 5.291575741700878e-06, "loss": 0.8874, "step": 1049 }, { "epoch": 0.9950248756218906, "grad_norm": 1.1650239134630531, "learning_rate": 5.283911064806522e-06, "loss": 0.8936, "step": 1050 }, { "epoch": 0.9959725183605781, "grad_norm": 1.0978030906121916, "learning_rate": 5.2762457185352685e-06, "loss": 0.8426, "step": 1051 }, { "epoch": 0.9969201610992656, "grad_norm": 1.1662853004003075, "learning_rate": 5.268579720959698e-06, "loss": 0.8447, "step": 1052 }, { "epoch": 0.997867803837953, "grad_norm": 1.0212044797920623, "learning_rate": 5.260913090153928e-06, "loss": 0.8577, "step": 1053 }, { "epoch": 0.9988154465766406, "grad_norm": 0.9166563097634649, "learning_rate": 5.253245844193564e-06, "loss": 0.8203, "step": 1054 }, { "epoch": 0.9997630893153281, "grad_norm": 1.2814929737482674, "learning_rate": 5.24557800115567e-06, "loss": 0.878, "step": 1055 }, { "epoch": 1.0007107320540156, "grad_norm": 1.029394422791808, "learning_rate": 5.237909579118713e-06, "loss": 0.7881, "step": 1056 }, { "epoch": 1.0007107320540156, "eval_loss": 0.9143710732460022, "eval_runtime": 61.9673, "eval_samples_per_second": 44.023, "eval_steps_per_second": 0.694, "step": 1056 }, { "epoch": 1.0016583747927033, "grad_norm": 1.0123693619172263, "learning_rate": 5.2302405961625225e-06, "loss": 0.7238, "step": 1057 }, { "epoch": 1.0026060175313907, "grad_norm": 0.9727230093690866, "learning_rate": 5.222571070368258e-06, "loss": 0.7209, "step": 1058 }, { "epoch": 1.003553660270078, "grad_norm": 0.972226576136118, "learning_rate": 5.214901019818353e-06, "loss": 0.7445, "step": 1059 }, { "epoch": 1.0045013030087657, "grad_norm": 1.0690844562597608, "learning_rate": 5.2072304625964785e-06, "loss": 0.721, "step": 1060 }, { "epoch": 1.0054489457474531, "grad_norm": 0.892297063646139, "learning_rate": 5.199559416787503e-06, "loss": 0.7467, "step": 1061 }, { "epoch": 1.0063965884861408, "grad_norm": 0.8630887724649647, "learning_rate": 5.191887900477444e-06, "loss": 0.7242, "step": 1062 }, { "epoch": 1.0073442312248282, "grad_norm": 0.8982089162859859, "learning_rate": 5.1842159317534304e-06, "loss": 0.6937, "step": 1063 }, { "epoch": 1.0082918739635158, "grad_norm": 0.9672944077294386, "learning_rate": 5.176543528703657e-06, "loss": 0.7022, "step": 1064 }, { "epoch": 1.0092395167022032, "grad_norm": 0.8839557121691414, "learning_rate": 5.168870709417342e-06, "loss": 0.7057, "step": 1065 }, { "epoch": 1.0101871594408909, "grad_norm": 0.9782692878540812, "learning_rate": 5.161197491984684e-06, "loss": 0.7163, "step": 1066 }, { "epoch": 1.0111348021795783, "grad_norm": 0.886106192089488, "learning_rate": 5.153523894496826e-06, "loss": 0.7415, "step": 1067 }, { "epoch": 1.0120824449182657, "grad_norm": 0.9342675578725627, "learning_rate": 5.1458499350458e-06, "loss": 0.7005, "step": 1068 }, { "epoch": 1.0130300876569533, "grad_norm": 0.9845076442772586, "learning_rate": 5.138175631724495e-06, "loss": 0.679, "step": 1069 }, { "epoch": 1.0139777303956408, "grad_norm": 1.0058057680221788, "learning_rate": 5.130501002626609e-06, "loss": 0.7382, "step": 1070 }, { "epoch": 1.0149253731343284, "grad_norm": 1.0402911368548144, "learning_rate": 5.12282606584661e-06, "loss": 0.7102, "step": 1071 }, { "epoch": 1.0158730158730158, "grad_norm": 0.9653271354076595, "learning_rate": 5.11515083947969e-06, "loss": 0.7487, "step": 1072 }, { "epoch": 1.0168206586117035, "grad_norm": 1.0059830380040296, "learning_rate": 5.107475341621726e-06, "loss": 0.697, "step": 1073 }, { "epoch": 1.0177683013503909, "grad_norm": 0.9592083024886124, "learning_rate": 5.099799590369231e-06, "loss": 0.7111, "step": 1074 }, { "epoch": 1.0187159440890785, "grad_norm": 0.9113464173920633, "learning_rate": 5.092123603819318e-06, "loss": 0.6739, "step": 1075 }, { "epoch": 1.019663586827766, "grad_norm": 0.9658198205738235, "learning_rate": 5.084447400069656e-06, "loss": 0.672, "step": 1076 }, { "epoch": 1.0206112295664536, "grad_norm": 0.9288542434671724, "learning_rate": 5.076770997218424e-06, "loss": 0.7281, "step": 1077 }, { "epoch": 1.021558872305141, "grad_norm": 0.9793083780049829, "learning_rate": 5.069094413364272e-06, "loss": 0.6441, "step": 1078 }, { "epoch": 1.021558872305141, "eval_loss": 0.9252648949623108, "eval_runtime": 63.0278, "eval_samples_per_second": 43.283, "eval_steps_per_second": 0.682, "step": 1078 }, { "epoch": 1.0225065150438284, "grad_norm": 0.9273171413752292, "learning_rate": 5.061417666606274e-06, "loss": 0.6967, "step": 1079 }, { "epoch": 1.023454157782516, "grad_norm": 1.5839472443961204, "learning_rate": 5.053740775043891e-06, "loss": 0.7093, "step": 1080 }, { "epoch": 1.0244018005212034, "grad_norm": 0.9765979302224368, "learning_rate": 5.046063756776926e-06, "loss": 0.6671, "step": 1081 }, { "epoch": 1.025349443259891, "grad_norm": 0.9457182691315833, "learning_rate": 5.038386629905475e-06, "loss": 0.7088, "step": 1082 }, { "epoch": 1.0262970859985785, "grad_norm": 1.0209484041244559, "learning_rate": 5.030709412529896e-06, "loss": 0.6753, "step": 1083 }, { "epoch": 1.0272447287372661, "grad_norm": 1.0234656179924495, "learning_rate": 5.0230321227507595e-06, "loss": 0.7002, "step": 1084 }, { "epoch": 1.0281923714759535, "grad_norm": 0.9799867444681427, "learning_rate": 5.015354778668805e-06, "loss": 0.6913, "step": 1085 }, { "epoch": 1.0291400142146412, "grad_norm": 1.0327323089162292, "learning_rate": 5.007677398384902e-06, "loss": 0.7102, "step": 1086 }, { "epoch": 1.0300876569533286, "grad_norm": 0.8542430531938927, "learning_rate": 5e-06, "loss": 0.7311, "step": 1087 }, { "epoch": 1.031035299692016, "grad_norm": 0.9551198273884293, "learning_rate": 4.992322601615101e-06, "loss": 0.8065, "step": 1088 }, { "epoch": 1.0319829424307037, "grad_norm": 1.2082682361729435, "learning_rate": 4.984645221331196e-06, "loss": 0.7087, "step": 1089 }, { "epoch": 1.032930585169391, "grad_norm": 1.0470114950375882, "learning_rate": 4.976967877249242e-06, "loss": 0.694, "step": 1090 }, { "epoch": 1.0338782279080787, "grad_norm": 0.8968991535832135, "learning_rate": 4.969290587470106e-06, "loss": 0.6542, "step": 1091 }, { "epoch": 1.0348258706467661, "grad_norm": 0.9569176042470755, "learning_rate": 4.961613370094526e-06, "loss": 0.7053, "step": 1092 }, { "epoch": 1.0357735133854538, "grad_norm": 0.956739530222217, "learning_rate": 4.953936243223077e-06, "loss": 0.7299, "step": 1093 }, { "epoch": 1.0367211561241412, "grad_norm": 0.9576460725333583, "learning_rate": 4.9462592249561095e-06, "loss": 0.7516, "step": 1094 }, { "epoch": 1.0376687988628288, "grad_norm": 1.314007292694408, "learning_rate": 4.938582333393727e-06, "loss": 0.7014, "step": 1095 }, { "epoch": 1.0386164416015162, "grad_norm": 0.9732347243923025, "learning_rate": 4.93090558663573e-06, "loss": 0.6131, "step": 1096 }, { "epoch": 1.0395640843402036, "grad_norm": 0.902465289858781, "learning_rate": 4.923229002781577e-06, "loss": 0.7244, "step": 1097 }, { "epoch": 1.0405117270788913, "grad_norm": 0.9138525283319687, "learning_rate": 4.915552599930345e-06, "loss": 0.7447, "step": 1098 }, { "epoch": 1.0414593698175787, "grad_norm": 0.8667199024588413, "learning_rate": 4.907876396180684e-06, "loss": 0.731, "step": 1099 }, { "epoch": 1.0424070125562663, "grad_norm": 1.0023840202075536, "learning_rate": 4.900200409630771e-06, "loss": 0.7, "step": 1100 }, { "epoch": 1.0424070125562663, "eval_loss": 0.9256648421287537, "eval_runtime": 65.8282, "eval_samples_per_second": 41.441, "eval_steps_per_second": 0.653, "step": 1100 }, { "epoch": 1.0433546552949537, "grad_norm": 1.005189996616475, "learning_rate": 4.892524658378276e-06, "loss": 0.662, "step": 1101 }, { "epoch": 1.0443022980336414, "grad_norm": 1.0552789716104816, "learning_rate": 4.884849160520311e-06, "loss": 0.7296, "step": 1102 }, { "epoch": 1.0452499407723288, "grad_norm": 0.9361357742033367, "learning_rate": 4.877173934153392e-06, "loss": 0.7036, "step": 1103 }, { "epoch": 1.0461975835110164, "grad_norm": 0.9030236691224809, "learning_rate": 4.869498997373393e-06, "loss": 0.6941, "step": 1104 }, { "epoch": 1.0471452262497039, "grad_norm": 1.0038838078279626, "learning_rate": 4.861824368275508e-06, "loss": 0.7321, "step": 1105 }, { "epoch": 1.0480928689883915, "grad_norm": 0.9603184361433643, "learning_rate": 4.854150064954201e-06, "loss": 0.6711, "step": 1106 }, { "epoch": 1.049040511727079, "grad_norm": 0.9225275167901936, "learning_rate": 4.846476105503176e-06, "loss": 0.6717, "step": 1107 }, { "epoch": 1.0499881544657663, "grad_norm": 1.2404804952292134, "learning_rate": 4.838802508015316e-06, "loss": 0.7472, "step": 1108 }, { "epoch": 1.050935797204454, "grad_norm": 0.9689214802559678, "learning_rate": 4.83112929058266e-06, "loss": 0.6947, "step": 1109 }, { "epoch": 1.0518834399431414, "grad_norm": 1.1456662341258836, "learning_rate": 4.8234564712963445e-06, "loss": 0.7316, "step": 1110 }, { "epoch": 1.052831082681829, "grad_norm": 0.9768673269000111, "learning_rate": 4.815784068246571e-06, "loss": 0.7487, "step": 1111 }, { "epoch": 1.0537787254205164, "grad_norm": 0.9716968504214143, "learning_rate": 4.808112099522558e-06, "loss": 0.7056, "step": 1112 }, { "epoch": 1.054726368159204, "grad_norm": 0.974665466729692, "learning_rate": 4.800440583212499e-06, "loss": 0.6911, "step": 1113 }, { "epoch": 1.0556740108978915, "grad_norm": 0.9167885828760524, "learning_rate": 4.792769537403523e-06, "loss": 0.7107, "step": 1114 }, { "epoch": 1.0566216536365791, "grad_norm": 1.0329746061334233, "learning_rate": 4.785098980181649e-06, "loss": 0.7229, "step": 1115 }, { "epoch": 1.0575692963752665, "grad_norm": 0.9737952606280224, "learning_rate": 4.777428929631743e-06, "loss": 0.7777, "step": 1116 }, { "epoch": 1.058516939113954, "grad_norm": 1.1038154299902683, "learning_rate": 4.769759403837479e-06, "loss": 0.6809, "step": 1117 }, { "epoch": 1.0594645818526416, "grad_norm": 1.08554606134142, "learning_rate": 4.762090420881289e-06, "loss": 0.6669, "step": 1118 }, { "epoch": 1.060412224591329, "grad_norm": 1.0920359773635873, "learning_rate": 4.754421998844331e-06, "loss": 0.6871, "step": 1119 }, { "epoch": 1.0613598673300166, "grad_norm": 0.9825699774740995, "learning_rate": 4.746754155806437e-06, "loss": 0.727, "step": 1120 }, { "epoch": 1.062307510068704, "grad_norm": 0.9674769463454311, "learning_rate": 4.739086909846075e-06, "loss": 0.7189, "step": 1121 }, { "epoch": 1.0632551528073917, "grad_norm": 1.0091809961919986, "learning_rate": 4.731420279040303e-06, "loss": 0.7278, "step": 1122 }, { "epoch": 1.0632551528073917, "eval_loss": 0.9248070120811462, "eval_runtime": 64.5875, "eval_samples_per_second": 42.237, "eval_steps_per_second": 0.666, "step": 1122 }, { "epoch": 1.064202795546079, "grad_norm": 1.073809323574034, "learning_rate": 4.723754281464732e-06, "loss": 0.7729, "step": 1123 }, { "epoch": 1.0651504382847667, "grad_norm": 0.9206055880032425, "learning_rate": 4.716088935193479e-06, "loss": 0.6833, "step": 1124 }, { "epoch": 1.0660980810234542, "grad_norm": 0.9675985199743727, "learning_rate": 4.708424258299125e-06, "loss": 0.7201, "step": 1125 }, { "epoch": 1.0670457237621416, "grad_norm": 0.970501113273963, "learning_rate": 4.700760268852669e-06, "loss": 0.6957, "step": 1126 }, { "epoch": 1.0679933665008292, "grad_norm": 1.0025167130129373, "learning_rate": 4.693096984923499e-06, "loss": 0.7329, "step": 1127 }, { "epoch": 1.0689410092395166, "grad_norm": 1.2193095917303223, "learning_rate": 4.68543442457933e-06, "loss": 0.7177, "step": 1128 }, { "epoch": 1.0698886519782043, "grad_norm": 1.0294113926873432, "learning_rate": 4.677772605886175e-06, "loss": 0.6829, "step": 1129 }, { "epoch": 1.0708362947168917, "grad_norm": 0.9136644811068081, "learning_rate": 4.670111546908299e-06, "loss": 0.697, "step": 1130 }, { "epoch": 1.0717839374555793, "grad_norm": 0.9443556825485072, "learning_rate": 4.662451265708174e-06, "loss": 0.6735, "step": 1131 }, { "epoch": 1.0727315801942667, "grad_norm": 0.9561425995186141, "learning_rate": 4.65479178034644e-06, "loss": 0.7189, "step": 1132 }, { "epoch": 1.0736792229329544, "grad_norm": 0.9931041504777264, "learning_rate": 4.647133108881858e-06, "loss": 0.6587, "step": 1133 }, { "epoch": 1.0746268656716418, "grad_norm": 0.925346382362746, "learning_rate": 4.639475269371273e-06, "loss": 0.7157, "step": 1134 }, { "epoch": 1.0755745084103294, "grad_norm": 1.031232406635297, "learning_rate": 4.631818279869567e-06, "loss": 0.7325, "step": 1135 }, { "epoch": 1.0765221511490168, "grad_norm": 1.0831354355205014, "learning_rate": 4.624162158429618e-06, "loss": 0.703, "step": 1136 }, { "epoch": 1.0774697938877043, "grad_norm": 0.9705781822246375, "learning_rate": 4.616506923102259e-06, "loss": 0.6238, "step": 1137 }, { "epoch": 1.078417436626392, "grad_norm": 1.1371488724998442, "learning_rate": 4.608852591936231e-06, "loss": 0.7601, "step": 1138 }, { "epoch": 1.0793650793650793, "grad_norm": 1.064474848705152, "learning_rate": 4.601199182978146e-06, "loss": 0.6468, "step": 1139 }, { "epoch": 1.080312722103767, "grad_norm": 0.9450727325021634, "learning_rate": 4.593546714272438e-06, "loss": 0.7266, "step": 1140 }, { "epoch": 1.0812603648424544, "grad_norm": 0.9799112965049801, "learning_rate": 4.585895203861328e-06, "loss": 0.7317, "step": 1141 }, { "epoch": 1.082208007581142, "grad_norm": 0.9097169877735888, "learning_rate": 4.5782446697847775e-06, "loss": 0.746, "step": 1142 }, { "epoch": 1.0831556503198294, "grad_norm": 1.054415192907425, "learning_rate": 4.5705951300804425e-06, "loss": 0.726, "step": 1143 }, { "epoch": 1.0841032930585168, "grad_norm": 0.903372224445911, "learning_rate": 4.562946602783637e-06, "loss": 0.7171, "step": 1144 }, { "epoch": 1.0841032930585168, "eval_loss": 0.9241182804107666, "eval_runtime": 67.5348, "eval_samples_per_second": 40.394, "eval_steps_per_second": 0.637, "step": 1144 }, { "epoch": 1.0850509357972045, "grad_norm": 1.0495310770662147, "learning_rate": 4.55529910592729e-06, "loss": 0.6606, "step": 1145 }, { "epoch": 1.0859985785358919, "grad_norm": 1.3054046428477601, "learning_rate": 4.547652657541897e-06, "loss": 0.7109, "step": 1146 }, { "epoch": 1.0869462212745795, "grad_norm": 0.9385889950812906, "learning_rate": 4.540007275655485e-06, "loss": 0.7101, "step": 1147 }, { "epoch": 1.087893864013267, "grad_norm": 1.0307846935200982, "learning_rate": 4.532362978293564e-06, "loss": 0.7025, "step": 1148 }, { "epoch": 1.0888415067519546, "grad_norm": 1.0094586805349344, "learning_rate": 4.524719783479088e-06, "loss": 0.7341, "step": 1149 }, { "epoch": 1.089789149490642, "grad_norm": 1.0080024003104493, "learning_rate": 4.517077709232411e-06, "loss": 0.7125, "step": 1150 }, { "epoch": 1.0907367922293296, "grad_norm": 0.9839429831089125, "learning_rate": 4.509436773571247e-06, "loss": 0.7263, "step": 1151 }, { "epoch": 1.091684434968017, "grad_norm": 0.9578891424409663, "learning_rate": 4.5017969945106225e-06, "loss": 0.7049, "step": 1152 }, { "epoch": 1.0926320777067047, "grad_norm": 1.589523374002844, "learning_rate": 4.49415839006284e-06, "loss": 0.7045, "step": 1153 }, { "epoch": 1.093579720445392, "grad_norm": 1.1691430951977255, "learning_rate": 4.486520978237431e-06, "loss": 0.6681, "step": 1154 }, { "epoch": 1.0945273631840795, "grad_norm": 1.020265471952243, "learning_rate": 4.478884777041115e-06, "loss": 0.7003, "step": 1155 }, { "epoch": 1.0954750059227671, "grad_norm": 0.9179136320195528, "learning_rate": 4.471249804477758e-06, "loss": 0.7077, "step": 1156 }, { "epoch": 1.0964226486614546, "grad_norm": 0.9635788033380907, "learning_rate": 4.4636160785483285e-06, "loss": 0.7151, "step": 1157 }, { "epoch": 1.0973702914001422, "grad_norm": 2.647967184274327, "learning_rate": 4.455983617250857e-06, "loss": 0.7341, "step": 1158 }, { "epoch": 1.0983179341388296, "grad_norm": 1.0131020212308353, "learning_rate": 4.448352438580391e-06, "loss": 0.6905, "step": 1159 }, { "epoch": 1.0992655768775172, "grad_norm": 1.0530355520145287, "learning_rate": 4.440722560528955e-06, "loss": 0.6387, "step": 1160 }, { "epoch": 1.1002132196162047, "grad_norm": 0.9509811853766807, "learning_rate": 4.433094001085505e-06, "loss": 0.7466, "step": 1161 }, { "epoch": 1.1011608623548923, "grad_norm": 1.0138369505840823, "learning_rate": 4.4254667782358925e-06, "loss": 0.679, "step": 1162 }, { "epoch": 1.1021085050935797, "grad_norm": 1.0773914127698383, "learning_rate": 4.417840909962813e-06, "loss": 0.7367, "step": 1163 }, { "epoch": 1.1030561478322674, "grad_norm": 1.3528732394706713, "learning_rate": 4.410216414245771e-06, "loss": 0.7166, "step": 1164 }, { "epoch": 1.1040037905709548, "grad_norm": 1.0120462677381739, "learning_rate": 4.402593309061034e-06, "loss": 0.6599, "step": 1165 }, { "epoch": 1.1049514333096422, "grad_norm": 0.9783229280065361, "learning_rate": 4.394971612381591e-06, "loss": 0.7053, "step": 1166 }, { "epoch": 1.1049514333096422, "eval_loss": 0.9224104285240173, "eval_runtime": 63.0367, "eval_samples_per_second": 43.276, "eval_steps_per_second": 0.682, "step": 1166 }, { "epoch": 1.1058990760483298, "grad_norm": 0.9564604779753705, "learning_rate": 4.38735134217711e-06, "loss": 0.7522, "step": 1167 }, { "epoch": 1.1068467187870172, "grad_norm": 0.9360222777010359, "learning_rate": 4.379732516413897e-06, "loss": 0.6734, "step": 1168 }, { "epoch": 1.1077943615257049, "grad_norm": 0.8943163287031561, "learning_rate": 4.372115153054851e-06, "loss": 0.7118, "step": 1169 }, { "epoch": 1.1087420042643923, "grad_norm": 1.179413315968657, "learning_rate": 4.364499270059423e-06, "loss": 0.6538, "step": 1170 }, { "epoch": 1.10968964700308, "grad_norm": 0.9833678397573673, "learning_rate": 4.356884885383578e-06, "loss": 0.7024, "step": 1171 }, { "epoch": 1.1106372897417673, "grad_norm": 1.1165040405330118, "learning_rate": 4.34927201697974e-06, "loss": 0.7223, "step": 1172 }, { "epoch": 1.1115849324804548, "grad_norm": 1.0836563622250095, "learning_rate": 4.341660682796766e-06, "loss": 0.7432, "step": 1173 }, { "epoch": 1.1125325752191424, "grad_norm": 0.9956369650089748, "learning_rate": 4.334050900779893e-06, "loss": 0.6979, "step": 1174 }, { "epoch": 1.1134802179578298, "grad_norm": 0.9041906906929965, "learning_rate": 4.326442688870697e-06, "loss": 0.7818, "step": 1175 }, { "epoch": 1.1144278606965174, "grad_norm": 1.0233141405037254, "learning_rate": 4.318836065007052e-06, "loss": 0.6802, "step": 1176 }, { "epoch": 1.1153755034352049, "grad_norm": 1.0925280426338722, "learning_rate": 4.3112310471230925e-06, "loss": 0.7202, "step": 1177 }, { "epoch": 1.1163231461738925, "grad_norm": 1.0258996034471566, "learning_rate": 4.303627653149159e-06, "loss": 0.7173, "step": 1178 }, { "epoch": 1.11727078891258, "grad_norm": 1.0595188906999566, "learning_rate": 4.296025901011773e-06, "loss": 0.7402, "step": 1179 }, { "epoch": 1.1182184316512676, "grad_norm": 0.9566532678110401, "learning_rate": 4.2884258086335755e-06, "loss": 0.6911, "step": 1180 }, { "epoch": 1.119166074389955, "grad_norm": 0.9954131257021107, "learning_rate": 4.2808273939333e-06, "loss": 0.6893, "step": 1181 }, { "epoch": 1.1201137171286426, "grad_norm": 0.9760861787884846, "learning_rate": 4.2732306748257226e-06, "loss": 0.6839, "step": 1182 }, { "epoch": 1.12106135986733, "grad_norm": 1.116236364521447, "learning_rate": 4.265635669221622e-06, "loss": 0.7272, "step": 1183 }, { "epoch": 1.1220090026060174, "grad_norm": 0.9772190754750057, "learning_rate": 4.258042395027738e-06, "loss": 0.7048, "step": 1184 }, { "epoch": 1.122956645344705, "grad_norm": 0.9990968345465719, "learning_rate": 4.250450870146726e-06, "loss": 0.6661, "step": 1185 }, { "epoch": 1.1239042880833925, "grad_norm": 1.004582020487418, "learning_rate": 4.2428611124771184e-06, "loss": 0.7158, "step": 1186 }, { "epoch": 1.1248519308220801, "grad_norm": 1.0285222277895798, "learning_rate": 4.235273139913281e-06, "loss": 0.6759, "step": 1187 }, { "epoch": 1.1257995735607675, "grad_norm": 1.026042016166187, "learning_rate": 4.227686970345373e-06, "loss": 0.6767, "step": 1188 }, { "epoch": 1.1257995735607675, "eval_loss": 0.9233511090278625, "eval_runtime": 63.5378, "eval_samples_per_second": 42.935, "eval_steps_per_second": 0.677, "step": 1188 }, { "epoch": 1.1267472162994552, "grad_norm": 0.9839710491649496, "learning_rate": 4.220102621659298e-06, "loss": 0.698, "step": 1189 }, { "epoch": 1.1276948590381426, "grad_norm": 1.3599383760269543, "learning_rate": 4.21252011173667e-06, "loss": 0.7257, "step": 1190 }, { "epoch": 1.1286425017768302, "grad_norm": 1.1366178207656392, "learning_rate": 4.204939458454767e-06, "loss": 0.7008, "step": 1191 }, { "epoch": 1.1295901445155176, "grad_norm": 0.95168166219681, "learning_rate": 4.197360679686489e-06, "loss": 0.6956, "step": 1192 }, { "epoch": 1.1305377872542053, "grad_norm": 1.0580531496952468, "learning_rate": 4.1897837933003165e-06, "loss": 0.6555, "step": 1193 }, { "epoch": 1.1314854299928927, "grad_norm": 1.3388307797907961, "learning_rate": 4.182208817160269e-06, "loss": 0.7038, "step": 1194 }, { "epoch": 1.1324330727315801, "grad_norm": 1.263598798657549, "learning_rate": 4.174635769125862e-06, "loss": 0.6939, "step": 1195 }, { "epoch": 1.1333807154702678, "grad_norm": 0.9897430245234835, "learning_rate": 4.1670646670520656e-06, "loss": 0.6949, "step": 1196 }, { "epoch": 1.1343283582089552, "grad_norm": 1.5009873894314265, "learning_rate": 4.15949552878926e-06, "loss": 0.663, "step": 1197 }, { "epoch": 1.1352760009476428, "grad_norm": 1.022385852836757, "learning_rate": 4.151928372183198e-06, "loss": 0.7124, "step": 1198 }, { "epoch": 1.1362236436863302, "grad_norm": 1.1789551448297066, "learning_rate": 4.144363215074959e-06, "loss": 0.6713, "step": 1199 }, { "epoch": 1.1371712864250179, "grad_norm": 1.0079132927023848, "learning_rate": 4.136800075300906e-06, "loss": 0.6997, "step": 1200 }, { "epoch": 1.1381189291637053, "grad_norm": 0.9647107031990494, "learning_rate": 4.129238970692651e-06, "loss": 0.6968, "step": 1201 }, { "epoch": 1.1390665719023927, "grad_norm": 1.0120852534707783, "learning_rate": 4.121679919077001e-06, "loss": 0.7705, "step": 1202 }, { "epoch": 1.1400142146410803, "grad_norm": 3.766970964988497, "learning_rate": 4.114122938275929e-06, "loss": 0.664, "step": 1203 }, { "epoch": 1.1409618573797677, "grad_norm": 1.0763801437474894, "learning_rate": 4.10656804610652e-06, "loss": 0.7236, "step": 1204 }, { "epoch": 1.1419095001184554, "grad_norm": 1.0272206175047942, "learning_rate": 4.0990152603809394e-06, "loss": 0.7017, "step": 1205 }, { "epoch": 1.1428571428571428, "grad_norm": 1.0151383640578906, "learning_rate": 4.091464598906385e-06, "loss": 0.7475, "step": 1206 }, { "epoch": 1.1438047855958304, "grad_norm": 1.0605465057418049, "learning_rate": 4.083916079485044e-06, "loss": 0.7003, "step": 1207 }, { "epoch": 1.1447524283345178, "grad_norm": 0.9663496293680089, "learning_rate": 4.076369719914055e-06, "loss": 0.7059, "step": 1208 }, { "epoch": 1.1457000710732055, "grad_norm": 1.5121184694790173, "learning_rate": 4.068825537985465e-06, "loss": 0.7403, "step": 1209 }, { "epoch": 1.146647713811893, "grad_norm": 0.999707638130472, "learning_rate": 4.061283551486185e-06, "loss": 0.6822, "step": 1210 }, { "epoch": 1.146647713811893, "eval_loss": 0.9220083355903625, "eval_runtime": 64.0249, "eval_samples_per_second": 42.608, "eval_steps_per_second": 0.672, "step": 1210 }, { "epoch": 1.1475953565505805, "grad_norm": 0.9638255107668675, "learning_rate": 4.053743778197951e-06, "loss": 0.6955, "step": 1211 }, { "epoch": 1.148542999289268, "grad_norm": 1.092006543225386, "learning_rate": 4.04620623589728e-06, "loss": 0.7363, "step": 1212 }, { "epoch": 1.1494906420279554, "grad_norm": 0.9984016047331302, "learning_rate": 4.038670942355431e-06, "loss": 0.6918, "step": 1213 }, { "epoch": 1.150438284766643, "grad_norm": 1.0035214440616442, "learning_rate": 4.03113791533836e-06, "loss": 0.6924, "step": 1214 }, { "epoch": 1.1513859275053304, "grad_norm": 1.0110914807155829, "learning_rate": 4.023607172606676e-06, "loss": 0.6946, "step": 1215 }, { "epoch": 1.152333570244018, "grad_norm": 0.8736055096829543, "learning_rate": 4.016078731915608e-06, "loss": 0.775, "step": 1216 }, { "epoch": 1.1532812129827055, "grad_norm": 0.9939438831479498, "learning_rate": 4.008552611014955e-06, "loss": 0.6888, "step": 1217 }, { "epoch": 1.154228855721393, "grad_norm": 1.049570796703869, "learning_rate": 4.001028827649046e-06, "loss": 0.7094, "step": 1218 }, { "epoch": 1.1551764984600805, "grad_norm": 1.148462409040153, "learning_rate": 3.993507399556699e-06, "loss": 0.6845, "step": 1219 }, { "epoch": 1.156124141198768, "grad_norm": 0.9773344806508405, "learning_rate": 3.9859883444711795e-06, "loss": 0.6948, "step": 1220 }, { "epoch": 1.1570717839374556, "grad_norm": 1.0928186343002937, "learning_rate": 3.978471680120157e-06, "loss": 0.7538, "step": 1221 }, { "epoch": 1.1580194266761432, "grad_norm": 1.193743573791038, "learning_rate": 3.970957424225666e-06, "loss": 0.7024, "step": 1222 }, { "epoch": 1.1589670694148306, "grad_norm": 1.1120074499425576, "learning_rate": 3.963445594504062e-06, "loss": 0.6627, "step": 1223 }, { "epoch": 1.159914712153518, "grad_norm": 1.2788177552822944, "learning_rate": 3.955936208665979e-06, "loss": 0.6673, "step": 1224 }, { "epoch": 1.1608623548922057, "grad_norm": 1.0546418537225764, "learning_rate": 3.9484292844162905e-06, "loss": 0.6398, "step": 1225 }, { "epoch": 1.161809997630893, "grad_norm": 0.9380567432599234, "learning_rate": 3.940924839454067e-06, "loss": 0.6736, "step": 1226 }, { "epoch": 1.1627576403695807, "grad_norm": 1.0622788802891603, "learning_rate": 3.933422891472532e-06, "loss": 0.6881, "step": 1227 }, { "epoch": 1.1637052831082682, "grad_norm": 1.0284270221411218, "learning_rate": 3.925923458159023e-06, "loss": 0.6836, "step": 1228 }, { "epoch": 1.1646529258469558, "grad_norm": 1.135279890250597, "learning_rate": 3.918426557194947e-06, "loss": 0.7027, "step": 1229 }, { "epoch": 1.1656005685856432, "grad_norm": 1.0051047734363374, "learning_rate": 3.910932206255742e-06, "loss": 0.6571, "step": 1230 }, { "epoch": 1.1665482113243306, "grad_norm": 1.0386743766132205, "learning_rate": 3.903440423010835e-06, "loss": 0.7293, "step": 1231 }, { "epoch": 1.1674958540630183, "grad_norm": 0.9974944591028375, "learning_rate": 3.895951225123595e-06, "loss": 0.7061, "step": 1232 }, { "epoch": 1.1674958540630183, "eval_loss": 0.92330402135849, "eval_runtime": 61.1167, "eval_samples_per_second": 44.636, "eval_steps_per_second": 0.704, "step": 1232 }, { "epoch": 1.1684434968017057, "grad_norm": 1.0523489123192147, "learning_rate": 3.8884646302512985e-06, "loss": 0.6744, "step": 1233 }, { "epoch": 1.1693911395403933, "grad_norm": 1.0581086896752634, "learning_rate": 3.880980656045087e-06, "loss": 0.7234, "step": 1234 }, { "epoch": 1.1703387822790807, "grad_norm": 1.081206532147213, "learning_rate": 3.873499320149918e-06, "loss": 0.7075, "step": 1235 }, { "epoch": 1.1712864250177684, "grad_norm": 1.0406772938616795, "learning_rate": 3.866020640204533e-06, "loss": 0.6703, "step": 1236 }, { "epoch": 1.1722340677564558, "grad_norm": 1.0457307864336358, "learning_rate": 3.858544633841409e-06, "loss": 0.6763, "step": 1237 }, { "epoch": 1.1731817104951434, "grad_norm": 0.9946842072910351, "learning_rate": 3.851071318686721e-06, "loss": 0.6393, "step": 1238 }, { "epoch": 1.1741293532338308, "grad_norm": 0.9269746374884014, "learning_rate": 3.843600712360298e-06, "loss": 0.729, "step": 1239 }, { "epoch": 1.1750769959725185, "grad_norm": 0.9972213948459809, "learning_rate": 3.836132832475583e-06, "loss": 0.6714, "step": 1240 }, { "epoch": 1.1760246387112059, "grad_norm": 1.1281466636664788, "learning_rate": 3.8286676966395895e-06, "loss": 0.7375, "step": 1241 }, { "epoch": 1.1769722814498933, "grad_norm": 1.0775487768465717, "learning_rate": 3.821205322452863e-06, "loss": 0.7771, "step": 1242 }, { "epoch": 1.177919924188581, "grad_norm": 0.9769500849217977, "learning_rate": 3.813745727509439e-06, "loss": 0.7238, "step": 1243 }, { "epoch": 1.1788675669272684, "grad_norm": 0.9685742121387568, "learning_rate": 3.806288929396798e-06, "loss": 0.7081, "step": 1244 }, { "epoch": 1.179815209665956, "grad_norm": 1.096837571136743, "learning_rate": 3.798834945695826e-06, "loss": 0.6977, "step": 1245 }, { "epoch": 1.1807628524046434, "grad_norm": 1.1006333163134505, "learning_rate": 3.7913837939807763e-06, "loss": 0.6762, "step": 1246 }, { "epoch": 1.181710495143331, "grad_norm": 0.9658169918126236, "learning_rate": 3.783935491819222e-06, "loss": 0.6904, "step": 1247 }, { "epoch": 1.1826581378820185, "grad_norm": 1.1429805038475467, "learning_rate": 3.77649005677202e-06, "loss": 0.7098, "step": 1248 }, { "epoch": 1.1836057806207059, "grad_norm": 1.1176775156483372, "learning_rate": 3.769047506393267e-06, "loss": 0.6764, "step": 1249 }, { "epoch": 1.1845534233593935, "grad_norm": 1.0183368197142009, "learning_rate": 3.7616078582302575e-06, "loss": 0.731, "step": 1250 }, { "epoch": 1.1855010660980811, "grad_norm": 0.9859557354856052, "learning_rate": 3.754171129823444e-06, "loss": 0.7222, "step": 1251 }, { "epoch": 1.1864487088367686, "grad_norm": 0.9582987322265264, "learning_rate": 3.7467373387063973e-06, "loss": 0.6739, "step": 1252 }, { "epoch": 1.187396351575456, "grad_norm": 0.9031196186509544, "learning_rate": 3.7393065024057597e-06, "loss": 0.7282, "step": 1253 }, { "epoch": 1.1883439943141436, "grad_norm": 1.0684330693325141, "learning_rate": 3.7318786384412076e-06, "loss": 0.6953, "step": 1254 }, { "epoch": 1.1883439943141436, "eval_loss": 0.920585036277771, "eval_runtime": 65.1767, "eval_samples_per_second": 41.855, "eval_steps_per_second": 0.66, "step": 1254 }, { "epoch": 1.189291637052831, "grad_norm": 1.148345248080178, "learning_rate": 3.7244537643254115e-06, "loss": 0.7035, "step": 1255 }, { "epoch": 1.1902392797915187, "grad_norm": 1.0249604355926194, "learning_rate": 3.7170318975639902e-06, "loss": 0.7582, "step": 1256 }, { "epoch": 1.191186922530206, "grad_norm": 1.179036054066612, "learning_rate": 3.7096130556554744e-06, "loss": 0.697, "step": 1257 }, { "epoch": 1.1921345652688937, "grad_norm": 1.036930121403606, "learning_rate": 3.70219725609126e-06, "loss": 0.7452, "step": 1258 }, { "epoch": 1.1930822080075811, "grad_norm": 0.9553861484853223, "learning_rate": 3.694784516355573e-06, "loss": 0.7419, "step": 1259 }, { "epoch": 1.1940298507462686, "grad_norm": 0.9117393301073062, "learning_rate": 3.687374853925425e-06, "loss": 0.6818, "step": 1260 }, { "epoch": 1.1949774934849562, "grad_norm": 1.109221558375404, "learning_rate": 3.679968286270571e-06, "loss": 0.6819, "step": 1261 }, { "epoch": 1.1959251362236436, "grad_norm": 1.020240570463157, "learning_rate": 3.67256483085347e-06, "loss": 0.7115, "step": 1262 }, { "epoch": 1.1968727789623312, "grad_norm": 1.0960139903595318, "learning_rate": 3.6651645051292415e-06, "loss": 0.7298, "step": 1263 }, { "epoch": 1.1978204217010187, "grad_norm": 0.8730491568921783, "learning_rate": 3.6577673265456296e-06, "loss": 0.6626, "step": 1264 }, { "epoch": 1.1987680644397063, "grad_norm": 1.0528341736215752, "learning_rate": 3.6503733125429557e-06, "loss": 0.7439, "step": 1265 }, { "epoch": 1.1997157071783937, "grad_norm": 1.0899721179963884, "learning_rate": 3.6429824805540816e-06, "loss": 0.6907, "step": 1266 }, { "epoch": 1.2006633499170813, "grad_norm": 1.0591609285941943, "learning_rate": 3.6355948480043647e-06, "loss": 0.6818, "step": 1267 }, { "epoch": 1.2016109926557688, "grad_norm": 1.091729744316094, "learning_rate": 3.628210432311621e-06, "loss": 0.7118, "step": 1268 }, { "epoch": 1.2025586353944564, "grad_norm": 1.1106751888187103, "learning_rate": 3.620829250886083e-06, "loss": 0.7496, "step": 1269 }, { "epoch": 1.2035062781331438, "grad_norm": 0.9071769388935164, "learning_rate": 3.6134513211303555e-06, "loss": 0.6996, "step": 1270 }, { "epoch": 1.2044539208718312, "grad_norm": 0.9816514708234372, "learning_rate": 3.606076660439378e-06, "loss": 0.7154, "step": 1271 }, { "epoch": 1.2054015636105189, "grad_norm": 0.9314656323457674, "learning_rate": 3.5987052862003824e-06, "loss": 0.7288, "step": 1272 }, { "epoch": 1.2063492063492063, "grad_norm": 1.0352321626353647, "learning_rate": 3.5913372157928515e-06, "loss": 0.6235, "step": 1273 }, { "epoch": 1.207296849087894, "grad_norm": 1.3247779592651683, "learning_rate": 3.58397246658848e-06, "loss": 0.7273, "step": 1274 }, { "epoch": 1.2082444918265813, "grad_norm": 0.9925803897741295, "learning_rate": 3.5766110559511313e-06, "loss": 0.749, "step": 1275 }, { "epoch": 1.209192134565269, "grad_norm": 0.9515222056049948, "learning_rate": 3.569253001236795e-06, "loss": 0.7559, "step": 1276 }, { "epoch": 1.209192134565269, "eval_loss": 0.9210101366043091, "eval_runtime": 60.5067, "eval_samples_per_second": 45.086, "eval_steps_per_second": 0.711, "step": 1276 }, { "epoch": 1.2101397773039564, "grad_norm": 1.0505259866511463, "learning_rate": 3.561898319793555e-06, "loss": 0.6777, "step": 1277 }, { "epoch": 1.2110874200426438, "grad_norm": 1.0470739963568594, "learning_rate": 3.554547028961537e-06, "loss": 0.6687, "step": 1278 }, { "epoch": 1.2120350627813314, "grad_norm": 0.9372200609494287, "learning_rate": 3.5471991460728725e-06, "loss": 0.7364, "step": 1279 }, { "epoch": 1.212982705520019, "grad_norm": 1.0677225546085984, "learning_rate": 3.5398546884516606e-06, "loss": 0.6946, "step": 1280 }, { "epoch": 1.2139303482587065, "grad_norm": 1.2432772070818632, "learning_rate": 3.5325136734139213e-06, "loss": 0.7216, "step": 1281 }, { "epoch": 1.214877990997394, "grad_norm": 0.9586638500297385, "learning_rate": 3.5251761182675626e-06, "loss": 0.6836, "step": 1282 }, { "epoch": 1.2158256337360815, "grad_norm": 1.0193646692337255, "learning_rate": 3.5178420403123307e-06, "loss": 0.7499, "step": 1283 }, { "epoch": 1.216773276474769, "grad_norm": 0.9934250718994667, "learning_rate": 3.510511456839777e-06, "loss": 0.7127, "step": 1284 }, { "epoch": 1.2177209192134566, "grad_norm": 1.018374538431729, "learning_rate": 3.5031843851332105e-06, "loss": 0.7211, "step": 1285 }, { "epoch": 1.218668561952144, "grad_norm": 1.0345735993851575, "learning_rate": 3.495860842467664e-06, "loss": 0.7196, "step": 1286 }, { "epoch": 1.2196162046908317, "grad_norm": 1.053615402301811, "learning_rate": 3.488540846109849e-06, "loss": 0.6648, "step": 1287 }, { "epoch": 1.220563847429519, "grad_norm": 1.0164841506591313, "learning_rate": 3.481224413318114e-06, "loss": 0.6602, "step": 1288 }, { "epoch": 1.2215114901682065, "grad_norm": 1.0367484711368176, "learning_rate": 3.4739115613424078e-06, "loss": 0.7115, "step": 1289 }, { "epoch": 1.2224591329068941, "grad_norm": 1.038323393287383, "learning_rate": 3.4666023074242356e-06, "loss": 0.6587, "step": 1290 }, { "epoch": 1.2234067756455815, "grad_norm": 0.9327274906523454, "learning_rate": 3.459296668796619e-06, "loss": 0.6846, "step": 1291 }, { "epoch": 1.2243544183842692, "grad_norm": 1.0084456960618864, "learning_rate": 3.451994662684057e-06, "loss": 0.7076, "step": 1292 }, { "epoch": 1.2253020611229566, "grad_norm": 1.0275585469763515, "learning_rate": 3.4446963063024854e-06, "loss": 0.691, "step": 1293 }, { "epoch": 1.2262497038616442, "grad_norm": 1.125902212799892, "learning_rate": 3.4374016168592296e-06, "loss": 0.7251, "step": 1294 }, { "epoch": 1.2271973466003316, "grad_norm": 1.0322864795599813, "learning_rate": 3.4301106115529766e-06, "loss": 0.7284, "step": 1295 }, { "epoch": 1.2281449893390193, "grad_norm": 0.9989097221420168, "learning_rate": 3.4228233075737225e-06, "loss": 0.7035, "step": 1296 }, { "epoch": 1.2290926320777067, "grad_norm": 0.9373804854464124, "learning_rate": 3.4155397221027396e-06, "loss": 0.7139, "step": 1297 }, { "epoch": 1.2300402748163943, "grad_norm": 1.0343160679933974, "learning_rate": 3.4082598723125303e-06, "loss": 0.6859, "step": 1298 }, { "epoch": 1.2300402748163943, "eval_loss": 0.9219260215759277, "eval_runtime": 64.2211, "eval_samples_per_second": 42.478, "eval_steps_per_second": 0.67, "step": 1298 }, { "epoch": 1.2309879175550817, "grad_norm": 1.0041258596912708, "learning_rate": 3.4009837753667918e-06, "loss": 0.6752, "step": 1299 }, { "epoch": 1.2319355602937692, "grad_norm": 1.062141619360519, "learning_rate": 3.393711448420372e-06, "loss": 0.7558, "step": 1300 }, { "epoch": 1.2328832030324568, "grad_norm": 1.1486226243514412, "learning_rate": 3.3864429086192295e-06, "loss": 0.6976, "step": 1301 }, { "epoch": 1.2338308457711442, "grad_norm": 1.0162684385678484, "learning_rate": 3.379178173100396e-06, "loss": 0.6503, "step": 1302 }, { "epoch": 1.2347784885098319, "grad_norm": 1.0535985388570441, "learning_rate": 3.371917258991933e-06, "loss": 0.7014, "step": 1303 }, { "epoch": 1.2357261312485193, "grad_norm": 0.9476340249103999, "learning_rate": 3.3646601834128924e-06, "loss": 0.7243, "step": 1304 }, { "epoch": 1.236673773987207, "grad_norm": 0.9722138093221362, "learning_rate": 3.3574069634732744e-06, "loss": 0.6936, "step": 1305 }, { "epoch": 1.2376214167258943, "grad_norm": 0.960280885803354, "learning_rate": 3.3501576162739903e-06, "loss": 0.7258, "step": 1306 }, { "epoch": 1.2385690594645817, "grad_norm": 1.011953193368571, "learning_rate": 3.3429121589068213e-06, "loss": 0.7573, "step": 1307 }, { "epoch": 1.2395167022032694, "grad_norm": 1.035071350641064, "learning_rate": 3.3356706084543766e-06, "loss": 0.7303, "step": 1308 }, { "epoch": 1.2404643449419568, "grad_norm": 1.0243269638647712, "learning_rate": 3.328432981990053e-06, "loss": 0.7117, "step": 1309 }, { "epoch": 1.2414119876806444, "grad_norm": 1.00968974551286, "learning_rate": 3.3211992965779984e-06, "loss": 0.6356, "step": 1310 }, { "epoch": 1.2423596304193318, "grad_norm": 0.9221223902659512, "learning_rate": 3.3139695692730644e-06, "loss": 0.6582, "step": 1311 }, { "epoch": 1.2433072731580195, "grad_norm": 1.063526317074848, "learning_rate": 3.306743817120777e-06, "loss": 0.6458, "step": 1312 }, { "epoch": 1.244254915896707, "grad_norm": 1.0099993891673738, "learning_rate": 3.2995220571572845e-06, "loss": 0.6945, "step": 1313 }, { "epoch": 1.2452025586353945, "grad_norm": 1.2682014688129715, "learning_rate": 3.2923043064093252e-06, "loss": 0.7106, "step": 1314 }, { "epoch": 1.246150201374082, "grad_norm": 0.9957192198152068, "learning_rate": 3.2850905818941853e-06, "loss": 0.7159, "step": 1315 }, { "epoch": 1.2470978441127696, "grad_norm": 0.970490636667617, "learning_rate": 3.2778809006196564e-06, "loss": 0.7628, "step": 1316 }, { "epoch": 1.248045486851457, "grad_norm": 1.049773961103418, "learning_rate": 3.2706752795839984e-06, "loss": 0.7065, "step": 1317 }, { "epoch": 1.2489931295901444, "grad_norm": 1.0171220181984746, "learning_rate": 3.2634737357758994e-06, "loss": 0.6594, "step": 1318 }, { "epoch": 1.249940772328832, "grad_norm": 1.0689789937287877, "learning_rate": 3.256276286174433e-06, "loss": 0.72, "step": 1319 }, { "epoch": 1.2508884150675195, "grad_norm": 0.9750893399192391, "learning_rate": 3.2490829477490194e-06, "loss": 0.7237, "step": 1320 }, { "epoch": 1.2508884150675195, "eval_loss": 0.9202948808670044, "eval_runtime": 64.6354, "eval_samples_per_second": 42.206, "eval_steps_per_second": 0.665, "step": 1320 }, { "epoch": 1.251836057806207, "grad_norm": 1.0670485098725635, "learning_rate": 3.2418937374593895e-06, "loss": 0.7168, "step": 1321 }, { "epoch": 1.2527837005448945, "grad_norm": 0.9432261527116877, "learning_rate": 3.2347086722555382e-06, "loss": 0.741, "step": 1322 }, { "epoch": 1.2537313432835822, "grad_norm": 0.9750755053529253, "learning_rate": 3.2275277690776876e-06, "loss": 0.6547, "step": 1323 }, { "epoch": 1.2546789860222696, "grad_norm": 0.9301758889182347, "learning_rate": 3.220351044856247e-06, "loss": 0.7478, "step": 1324 }, { "epoch": 1.255626628760957, "grad_norm": 1.0318451433516014, "learning_rate": 3.2131785165117748e-06, "loss": 0.6562, "step": 1325 }, { "epoch": 1.2565742714996446, "grad_norm": 0.889824279332266, "learning_rate": 3.206010200954935e-06, "loss": 0.6682, "step": 1326 }, { "epoch": 1.2575219142383323, "grad_norm": 0.9419702907671557, "learning_rate": 3.198846115086459e-06, "loss": 0.6833, "step": 1327 }, { "epoch": 1.2584695569770197, "grad_norm": 1.0527246614550414, "learning_rate": 3.191686275797107e-06, "loss": 0.7099, "step": 1328 }, { "epoch": 1.259417199715707, "grad_norm": 1.055473610231369, "learning_rate": 3.1845306999676274e-06, "loss": 0.6996, "step": 1329 }, { "epoch": 1.2603648424543947, "grad_norm": 0.9595441733482751, "learning_rate": 3.177379404468715e-06, "loss": 0.6818, "step": 1330 }, { "epoch": 1.2613124851930821, "grad_norm": 1.0295006726261346, "learning_rate": 3.170232406160974e-06, "loss": 0.6539, "step": 1331 }, { "epoch": 1.2622601279317698, "grad_norm": 1.124771581323331, "learning_rate": 3.1630897218948765e-06, "loss": 0.6911, "step": 1332 }, { "epoch": 1.2632077706704572, "grad_norm": 0.9768323438673063, "learning_rate": 3.1559513685107233e-06, "loss": 0.7021, "step": 1333 }, { "epoch": 1.2641554134091448, "grad_norm": 0.9702345738300058, "learning_rate": 3.1488173628386066e-06, "loss": 0.7039, "step": 1334 }, { "epoch": 1.2651030561478323, "grad_norm": 0.9740566736999792, "learning_rate": 3.141687721698363e-06, "loss": 0.7201, "step": 1335 }, { "epoch": 1.2660506988865197, "grad_norm": 1.0341764156676143, "learning_rate": 3.1345624618995444e-06, "loss": 0.6815, "step": 1336 }, { "epoch": 1.2669983416252073, "grad_norm": 1.0038570117243435, "learning_rate": 3.127441600241369e-06, "loss": 0.6874, "step": 1337 }, { "epoch": 1.267945984363895, "grad_norm": 0.9492110824297334, "learning_rate": 3.1203251535126867e-06, "loss": 0.6973, "step": 1338 }, { "epoch": 1.2688936271025824, "grad_norm": 1.2890789269673384, "learning_rate": 3.11321313849194e-06, "loss": 0.7239, "step": 1339 }, { "epoch": 1.2698412698412698, "grad_norm": 0.9728809474646835, "learning_rate": 3.10610557194712e-06, "loss": 0.6764, "step": 1340 }, { "epoch": 1.2707889125799574, "grad_norm": 1.1635688288087744, "learning_rate": 3.0990024706357314e-06, "loss": 0.6918, "step": 1341 }, { "epoch": 1.2717365553186448, "grad_norm": 0.9704804371870352, "learning_rate": 3.0919038513047507e-06, "loss": 0.7398, "step": 1342 }, { "epoch": 1.2717365553186448, "eval_loss": 0.9205412864685059, "eval_runtime": 64.3962, "eval_samples_per_second": 42.363, "eval_steps_per_second": 0.668, "step": 1342 }, { "epoch": 1.2726841980573325, "grad_norm": 0.9623053497453619, "learning_rate": 3.084809730690587e-06, "loss": 0.7125, "step": 1343 }, { "epoch": 1.2736318407960199, "grad_norm": 1.0918361707437365, "learning_rate": 3.077720125519042e-06, "loss": 0.6929, "step": 1344 }, { "epoch": 1.2745794835347075, "grad_norm": 1.0920621674730842, "learning_rate": 3.070635052505273e-06, "loss": 0.736, "step": 1345 }, { "epoch": 1.275527126273395, "grad_norm": 0.9952373114086388, "learning_rate": 3.0635545283537523e-06, "loss": 0.687, "step": 1346 }, { "epoch": 1.2764747690120823, "grad_norm": 1.2534896758069607, "learning_rate": 3.056478569758225e-06, "loss": 0.7381, "step": 1347 }, { "epoch": 1.27742241175077, "grad_norm": 1.0662944647834347, "learning_rate": 3.0494071934016737e-06, "loss": 0.7478, "step": 1348 }, { "epoch": 1.2783700544894574, "grad_norm": 1.1088922876494405, "learning_rate": 3.0423404159562776e-06, "loss": 0.7582, "step": 1349 }, { "epoch": 1.279317697228145, "grad_norm": 1.176182333691132, "learning_rate": 3.03527825408337e-06, "loss": 0.7656, "step": 1350 }, { "epoch": 1.2802653399668324, "grad_norm": 0.9658766613029017, "learning_rate": 3.0282207244334084e-06, "loss": 0.724, "step": 1351 }, { "epoch": 1.28121298270552, "grad_norm": 1.0827099691926507, "learning_rate": 3.0211678436459214e-06, "loss": 0.6916, "step": 1352 }, { "epoch": 1.2821606254442075, "grad_norm": 1.0338922828145432, "learning_rate": 3.014119628349482e-06, "loss": 0.6895, "step": 1353 }, { "epoch": 1.283108268182895, "grad_norm": 0.9767332618971463, "learning_rate": 3.007076095161662e-06, "loss": 0.6949, "step": 1354 }, { "epoch": 1.2840559109215826, "grad_norm": 1.0930600930773744, "learning_rate": 3.0000372606889937e-06, "loss": 0.7021, "step": 1355 }, { "epoch": 1.2850035536602702, "grad_norm": 1.1419255045928394, "learning_rate": 2.9930031415269327e-06, "loss": 0.6816, "step": 1356 }, { "epoch": 1.2859511963989576, "grad_norm": 1.0003644858641374, "learning_rate": 2.9859737542598157e-06, "loss": 0.7194, "step": 1357 }, { "epoch": 1.286898839137645, "grad_norm": 0.9900004048868847, "learning_rate": 2.978949115460824e-06, "loss": 0.6978, "step": 1358 }, { "epoch": 1.2878464818763327, "grad_norm": 1.0245392546866654, "learning_rate": 2.971929241691942e-06, "loss": 0.7067, "step": 1359 }, { "epoch": 1.28879412461502, "grad_norm": 1.1770400753015886, "learning_rate": 2.9649141495039225e-06, "loss": 0.6811, "step": 1360 }, { "epoch": 1.2897417673537077, "grad_norm": 0.9716668472533, "learning_rate": 2.9579038554362412e-06, "loss": 0.6944, "step": 1361 }, { "epoch": 1.2906894100923951, "grad_norm": 0.9803805263564218, "learning_rate": 2.950898376017064e-06, "loss": 0.6599, "step": 1362 }, { "epoch": 1.2916370528310828, "grad_norm": 1.1276831233974194, "learning_rate": 2.943897727763202e-06, "loss": 0.7439, "step": 1363 }, { "epoch": 1.2925846955697702, "grad_norm": 0.9936565556974072, "learning_rate": 2.9369019271800827e-06, "loss": 0.7139, "step": 1364 }, { "epoch": 1.2925846955697702, "eval_loss": 0.9199525117874146, "eval_runtime": 65.9605, "eval_samples_per_second": 41.358, "eval_steps_per_second": 0.652, "step": 1364 }, { "epoch": 1.2935323383084576, "grad_norm": 1.0225633529982556, "learning_rate": 2.9299109907616956e-06, "loss": 0.7169, "step": 1365 }, { "epoch": 1.2944799810471452, "grad_norm": 1.1496102662994758, "learning_rate": 2.9229249349905686e-06, "loss": 0.7257, "step": 1366 }, { "epoch": 1.2954276237858329, "grad_norm": 1.0411288552368323, "learning_rate": 2.9159437763377187e-06, "loss": 0.7057, "step": 1367 }, { "epoch": 1.2963752665245203, "grad_norm": 1.1200448529620028, "learning_rate": 2.908967531262618e-06, "loss": 0.7623, "step": 1368 }, { "epoch": 1.2973229092632077, "grad_norm": 1.0477643207405842, "learning_rate": 2.9019962162131564e-06, "loss": 0.6169, "step": 1369 }, { "epoch": 1.2982705520018953, "grad_norm": 1.008136764087855, "learning_rate": 2.895029847625595e-06, "loss": 0.6862, "step": 1370 }, { "epoch": 1.2992181947405828, "grad_norm": 1.0600688588701463, "learning_rate": 2.8880684419245387e-06, "loss": 0.7149, "step": 1371 }, { "epoch": 1.3001658374792704, "grad_norm": 1.0027128694027476, "learning_rate": 2.8811120155228843e-06, "loss": 0.7366, "step": 1372 }, { "epoch": 1.3011134802179578, "grad_norm": 1.4069108299597186, "learning_rate": 2.874160584821798e-06, "loss": 0.7393, "step": 1373 }, { "epoch": 1.3020611229566454, "grad_norm": 1.0338612942301875, "learning_rate": 2.8672141662106577e-06, "loss": 0.7036, "step": 1374 }, { "epoch": 1.3030087656953329, "grad_norm": 1.0688910868363117, "learning_rate": 2.8602727760670336e-06, "loss": 0.7306, "step": 1375 }, { "epoch": 1.3039564084340203, "grad_norm": 1.242518786141904, "learning_rate": 2.8533364307566313e-06, "loss": 0.6862, "step": 1376 }, { "epoch": 1.304904051172708, "grad_norm": 1.1911216173581962, "learning_rate": 2.846405146633269e-06, "loss": 0.7568, "step": 1377 }, { "epoch": 1.3058516939113953, "grad_norm": 1.0039693235836689, "learning_rate": 2.839478940038833e-06, "loss": 0.6523, "step": 1378 }, { "epoch": 1.306799336650083, "grad_norm": 1.072057083327897, "learning_rate": 2.8325578273032295e-06, "loss": 0.7036, "step": 1379 }, { "epoch": 1.3077469793887704, "grad_norm": 0.9993390017662981, "learning_rate": 2.8256418247443664e-06, "loss": 0.6887, "step": 1380 }, { "epoch": 1.308694622127458, "grad_norm": 0.9454808776048585, "learning_rate": 2.8187309486680924e-06, "loss": 0.7237, "step": 1381 }, { "epoch": 1.3096422648661454, "grad_norm": 0.9371803052679708, "learning_rate": 2.811825215368179e-06, "loss": 0.7279, "step": 1382 }, { "epoch": 1.3105899076048328, "grad_norm": 1.2827676657928362, "learning_rate": 2.804924641126264e-06, "loss": 0.6878, "step": 1383 }, { "epoch": 1.3115375503435205, "grad_norm": 1.1459955089807479, "learning_rate": 2.7980292422118282e-06, "loss": 0.7615, "step": 1384 }, { "epoch": 1.3124851930822081, "grad_norm": 1.0020748955412062, "learning_rate": 2.791139034882151e-06, "loss": 0.7376, "step": 1385 }, { "epoch": 1.3134328358208955, "grad_norm": 1.277527875810019, "learning_rate": 2.7842540353822634e-06, "loss": 0.7209, "step": 1386 }, { "epoch": 1.3134328358208955, "eval_loss": 0.91898512840271, "eval_runtime": 64.7739, "eval_samples_per_second": 42.116, "eval_steps_per_second": 0.664, "step": 1386 }, { "epoch": 1.314380478559583, "grad_norm": 1.1181626221192054, "learning_rate": 2.777374259944929e-06, "loss": 0.7057, "step": 1387 }, { "epoch": 1.3153281212982706, "grad_norm": 0.9400452950748897, "learning_rate": 2.770499724790584e-06, "loss": 0.721, "step": 1388 }, { "epoch": 1.316275764036958, "grad_norm": 1.0147521664502355, "learning_rate": 2.763630446127319e-06, "loss": 0.7199, "step": 1389 }, { "epoch": 1.3172234067756456, "grad_norm": 0.9881704878801858, "learning_rate": 2.7567664401508225e-06, "loss": 0.7116, "step": 1390 }, { "epoch": 1.318171049514333, "grad_norm": 0.9795454208939292, "learning_rate": 2.7499077230443607e-06, "loss": 0.6953, "step": 1391 }, { "epoch": 1.3191186922530207, "grad_norm": 1.0069354879696941, "learning_rate": 2.743054310978722e-06, "loss": 0.7098, "step": 1392 }, { "epoch": 1.3200663349917081, "grad_norm": 0.9429203635412998, "learning_rate": 2.736206220112192e-06, "loss": 0.7004, "step": 1393 }, { "epoch": 1.3210139777303955, "grad_norm": 1.0682307544212157, "learning_rate": 2.729363466590511e-06, "loss": 0.6745, "step": 1394 }, { "epoch": 1.3219616204690832, "grad_norm": 1.0079367947967734, "learning_rate": 2.72252606654683e-06, "loss": 0.6362, "step": 1395 }, { "epoch": 1.3229092632077708, "grad_norm": 0.936909554532062, "learning_rate": 2.7156940361016864e-06, "loss": 0.7282, "step": 1396 }, { "epoch": 1.3238569059464582, "grad_norm": 1.1619360098958085, "learning_rate": 2.708867391362948e-06, "loss": 0.759, "step": 1397 }, { "epoch": 1.3248045486851456, "grad_norm": 1.0332898627783107, "learning_rate": 2.7020461484257952e-06, "loss": 0.7224, "step": 1398 }, { "epoch": 1.3257521914238333, "grad_norm": 1.0120053743179587, "learning_rate": 2.6952303233726628e-06, "loss": 0.7007, "step": 1399 }, { "epoch": 1.3266998341625207, "grad_norm": 1.0182348658415017, "learning_rate": 2.6884199322732192e-06, "loss": 0.7364, "step": 1400 }, { "epoch": 1.3276474769012083, "grad_norm": 1.0133603532948297, "learning_rate": 2.681614991184315e-06, "loss": 0.743, "step": 1401 }, { "epoch": 1.3285951196398957, "grad_norm": 0.9924480633218891, "learning_rate": 2.6748155161499568e-06, "loss": 0.6545, "step": 1402 }, { "epoch": 1.3295427623785834, "grad_norm": 1.127155396416284, "learning_rate": 2.668021523201263e-06, "loss": 0.7471, "step": 1403 }, { "epoch": 1.3304904051172708, "grad_norm": 1.0775922171839278, "learning_rate": 2.6612330283564226e-06, "loss": 0.6713, "step": 1404 }, { "epoch": 1.3314380478559582, "grad_norm": 0.9947061326854107, "learning_rate": 2.6544500476206675e-06, "loss": 0.6725, "step": 1405 }, { "epoch": 1.3323856905946458, "grad_norm": 1.0208055783943726, "learning_rate": 2.6476725969862227e-06, "loss": 0.7577, "step": 1406 }, { "epoch": 1.3333333333333333, "grad_norm": 0.9200584970050147, "learning_rate": 2.6409006924322824e-06, "loss": 0.7277, "step": 1407 }, { "epoch": 1.334280976072021, "grad_norm": 1.1839327711762426, "learning_rate": 2.634134349924956e-06, "loss": 0.7352, "step": 1408 }, { "epoch": 1.334280976072021, "eval_loss": 0.9188514351844788, "eval_runtime": 65.4529, "eval_samples_per_second": 41.679, "eval_steps_per_second": 0.657, "step": 1408 }, { "epoch": 1.3352286188107083, "grad_norm": 1.0311881529332272, "learning_rate": 2.6273735854172487e-06, "loss": 0.6348, "step": 1409 }, { "epoch": 1.336176261549396, "grad_norm": 0.869437188326139, "learning_rate": 2.6206184148490066e-06, "loss": 0.6783, "step": 1410 }, { "epoch": 1.3371239042880834, "grad_norm": 0.9896388796869628, "learning_rate": 2.6138688541468903e-06, "loss": 0.6565, "step": 1411 }, { "epoch": 1.3380715470267708, "grad_norm": 0.9975523350097485, "learning_rate": 2.6071249192243365e-06, "loss": 0.7388, "step": 1412 }, { "epoch": 1.3390191897654584, "grad_norm": 3.814992564369548, "learning_rate": 2.6003866259815123e-06, "loss": 0.7403, "step": 1413 }, { "epoch": 1.339966832504146, "grad_norm": 1.1226980770763029, "learning_rate": 2.5936539903052893e-06, "loss": 0.7311, "step": 1414 }, { "epoch": 1.3409144752428335, "grad_norm": 1.0149265229287798, "learning_rate": 2.5869270280691945e-06, "loss": 0.6922, "step": 1415 }, { "epoch": 1.3418621179815209, "grad_norm": 1.1097881040738804, "learning_rate": 2.580205755133384e-06, "loss": 0.6867, "step": 1416 }, { "epoch": 1.3428097607202085, "grad_norm": 1.0437635962821175, "learning_rate": 2.573490187344596e-06, "loss": 0.6817, "step": 1417 }, { "epoch": 1.343757403458896, "grad_norm": 1.0360266239174718, "learning_rate": 2.5667803405361214e-06, "loss": 0.7413, "step": 1418 }, { "epoch": 1.3447050461975836, "grad_norm": 0.9107405093250232, "learning_rate": 2.560076230527758e-06, "loss": 0.6722, "step": 1419 }, { "epoch": 1.345652688936271, "grad_norm": 0.9408273813849782, "learning_rate": 2.5533778731257824e-06, "loss": 0.7198, "step": 1420 }, { "epoch": 1.3466003316749586, "grad_norm": 1.0137206933457714, "learning_rate": 2.546685284122909e-06, "loss": 0.6862, "step": 1421 }, { "epoch": 1.347547974413646, "grad_norm": 1.207183923292191, "learning_rate": 2.5399984792982457e-06, "loss": 0.7163, "step": 1422 }, { "epoch": 1.3484956171523335, "grad_norm": 0.9580259639674358, "learning_rate": 2.5333174744172705e-06, "loss": 0.7006, "step": 1423 }, { "epoch": 1.349443259891021, "grad_norm": 1.0387469510787142, "learning_rate": 2.5266422852317796e-06, "loss": 0.66, "step": 1424 }, { "epoch": 1.3503909026297087, "grad_norm": 1.0656383066236028, "learning_rate": 2.5199729274798664e-06, "loss": 0.7036, "step": 1425 }, { "epoch": 1.3513385453683961, "grad_norm": 1.0545231585290438, "learning_rate": 2.513309416885865e-06, "loss": 0.643, "step": 1426 }, { "epoch": 1.3522861881070836, "grad_norm": 1.1886784682255809, "learning_rate": 2.5066517691603327e-06, "loss": 0.6968, "step": 1427 }, { "epoch": 1.3532338308457712, "grad_norm": 1.0686385881546185, "learning_rate": 2.5000000000000015e-06, "loss": 0.6948, "step": 1428 }, { "epoch": 1.3541814735844586, "grad_norm": 1.022360298648431, "learning_rate": 2.493354125087738e-06, "loss": 0.6812, "step": 1429 }, { "epoch": 1.3551291163231463, "grad_norm": 1.057656689299949, "learning_rate": 2.4867141600925214e-06, "loss": 0.7209, "step": 1430 }, { "epoch": 1.3551291163231463, "eval_loss": 0.9181008338928223, "eval_runtime": 61.8166, "eval_samples_per_second": 44.131, "eval_steps_per_second": 0.696, "step": 1430 }, { "epoch": 1.3560767590618337, "grad_norm": 1.0044797087222561, "learning_rate": 2.4800801206693873e-06, "loss": 0.6994, "step": 1431 }, { "epoch": 1.3570244018005213, "grad_norm": 1.1363501611998355, "learning_rate": 2.4734520224594094e-06, "loss": 0.6967, "step": 1432 }, { "epoch": 1.3579720445392087, "grad_norm": 1.0735619665932887, "learning_rate": 2.4668298810896463e-06, "loss": 0.6615, "step": 1433 }, { "epoch": 1.3589196872778961, "grad_norm": 1.2512092821838807, "learning_rate": 2.4602137121731195e-06, "loss": 0.7226, "step": 1434 }, { "epoch": 1.3598673300165838, "grad_norm": 1.0104639217781053, "learning_rate": 2.4536035313087603e-06, "loss": 0.7748, "step": 1435 }, { "epoch": 1.3608149727552712, "grad_norm": 0.945616005794055, "learning_rate": 2.44699935408139e-06, "loss": 0.7169, "step": 1436 }, { "epoch": 1.3617626154939588, "grad_norm": 1.1241609779341695, "learning_rate": 2.4404011960616747e-06, "loss": 0.6734, "step": 1437 }, { "epoch": 1.3627102582326462, "grad_norm": 1.321111844351736, "learning_rate": 2.4338090728060808e-06, "loss": 0.7567, "step": 1438 }, { "epoch": 1.3636579009713339, "grad_norm": 0.9913817986983521, "learning_rate": 2.4272229998568576e-06, "loss": 0.6312, "step": 1439 }, { "epoch": 1.3646055437100213, "grad_norm": 1.0071709428393925, "learning_rate": 2.4206429927419795e-06, "loss": 0.6763, "step": 1440 }, { "epoch": 1.3655531864487087, "grad_norm": 1.0356649949239918, "learning_rate": 2.414069066975128e-06, "loss": 0.6461, "step": 1441 }, { "epoch": 1.3665008291873963, "grad_norm": 0.9723682213152954, "learning_rate": 2.40750123805564e-06, "loss": 0.6884, "step": 1442 }, { "epoch": 1.367448471926084, "grad_norm": 0.9948531066652241, "learning_rate": 2.400939521468484e-06, "loss": 0.7155, "step": 1443 }, { "epoch": 1.3683961146647714, "grad_norm": 0.9940870834143998, "learning_rate": 2.3943839326842096e-06, "loss": 0.6657, "step": 1444 }, { "epoch": 1.3693437574034588, "grad_norm": 1.1771867828127296, "learning_rate": 2.387834487158926e-06, "loss": 0.7088, "step": 1445 }, { "epoch": 1.3702914001421465, "grad_norm": 1.0503694972372863, "learning_rate": 2.381291200334257e-06, "loss": 0.7379, "step": 1446 }, { "epoch": 1.3712390428808339, "grad_norm": 1.0357654532991112, "learning_rate": 2.3747540876373026e-06, "loss": 0.6843, "step": 1447 }, { "epoch": 1.3721866856195215, "grad_norm": 0.9841256078266499, "learning_rate": 2.368223164480611e-06, "loss": 0.7251, "step": 1448 }, { "epoch": 1.373134328358209, "grad_norm": 0.943494428045495, "learning_rate": 2.3616984462621307e-06, "loss": 0.756, "step": 1449 }, { "epoch": 1.3740819710968966, "grad_norm": 0.9561466461280033, "learning_rate": 2.3551799483651894e-06, "loss": 0.6926, "step": 1450 }, { "epoch": 1.375029613835584, "grad_norm": 1.2692497517950982, "learning_rate": 2.348667686158441e-06, "loss": 0.6878, "step": 1451 }, { "epoch": 1.3759772565742714, "grad_norm": 1.2205664546373765, "learning_rate": 2.342161674995843e-06, "loss": 0.7187, "step": 1452 }, { "epoch": 1.3759772565742714, "eval_loss": 0.9179805517196655, "eval_runtime": 66.4952, "eval_samples_per_second": 41.026, "eval_steps_per_second": 0.647, "step": 1452 }, { "epoch": 1.376924899312959, "grad_norm": 0.9875355272859734, "learning_rate": 2.335661930216611e-06, "loss": 0.6266, "step": 1453 }, { "epoch": 1.3778725420516467, "grad_norm": 1.0518188044284036, "learning_rate": 2.3291684671451905e-06, "loss": 0.6734, "step": 1454 }, { "epoch": 1.378820184790334, "grad_norm": 1.0155553726238449, "learning_rate": 2.322681301091214e-06, "loss": 0.6737, "step": 1455 }, { "epoch": 1.3797678275290215, "grad_norm": 1.127782826546028, "learning_rate": 2.316200447349466e-06, "loss": 0.7146, "step": 1456 }, { "epoch": 1.3807154702677091, "grad_norm": 1.0784960552341187, "learning_rate": 2.3097259211998536e-06, "loss": 0.7501, "step": 1457 }, { "epoch": 1.3816631130063965, "grad_norm": 0.9949725127797444, "learning_rate": 2.3032577379073577e-06, "loss": 0.7015, "step": 1458 }, { "epoch": 1.3826107557450842, "grad_norm": 0.9785164181620558, "learning_rate": 2.296795912722014e-06, "loss": 0.7015, "step": 1459 }, { "epoch": 1.3835583984837716, "grad_norm": 1.126981467512306, "learning_rate": 2.2903404608788582e-06, "loss": 0.6766, "step": 1460 }, { "epoch": 1.3845060412224592, "grad_norm": 1.0927804032807178, "learning_rate": 2.283891397597908e-06, "loss": 0.6693, "step": 1461 }, { "epoch": 1.3854536839611467, "grad_norm": 1.0258483620038565, "learning_rate": 2.2774487380841116e-06, "loss": 0.6607, "step": 1462 }, { "epoch": 1.386401326699834, "grad_norm": 1.0139207350796542, "learning_rate": 2.2710124975273236e-06, "loss": 0.7301, "step": 1463 }, { "epoch": 1.3873489694385217, "grad_norm": 1.2623014945186244, "learning_rate": 2.2645826911022656e-06, "loss": 0.6878, "step": 1464 }, { "epoch": 1.3882966121772091, "grad_norm": 1.0697558784167414, "learning_rate": 2.258159333968484e-06, "loss": 0.7058, "step": 1465 }, { "epoch": 1.3892442549158968, "grad_norm": 1.1679882747027204, "learning_rate": 2.2517424412703256e-06, "loss": 0.7337, "step": 1466 }, { "epoch": 1.3901918976545842, "grad_norm": 1.0802296481928269, "learning_rate": 2.2453320281368903e-06, "loss": 0.686, "step": 1467 }, { "epoch": 1.3911395403932718, "grad_norm": 0.9794786330880751, "learning_rate": 2.2389281096820077e-06, "loss": 0.7638, "step": 1468 }, { "epoch": 1.3920871831319592, "grad_norm": 0.9835019275382246, "learning_rate": 2.2325307010041874e-06, "loss": 0.7598, "step": 1469 }, { "epoch": 1.3930348258706466, "grad_norm": 1.239471343857703, "learning_rate": 2.2261398171865976e-06, "loss": 0.6944, "step": 1470 }, { "epoch": 1.3939824686093343, "grad_norm": 1.0053386117085916, "learning_rate": 2.21975547329702e-06, "loss": 0.6756, "step": 1471 }, { "epoch": 1.394930111348022, "grad_norm": 1.0689464802675128, "learning_rate": 2.2133776843878185e-06, "loss": 0.7674, "step": 1472 }, { "epoch": 1.3958777540867093, "grad_norm": 1.345586768860224, "learning_rate": 2.207006465495898e-06, "loss": 0.6936, "step": 1473 }, { "epoch": 1.3968253968253967, "grad_norm": 1.155638754133062, "learning_rate": 2.2006418316426773e-06, "loss": 0.6912, "step": 1474 }, { "epoch": 1.3968253968253967, "eval_loss": 0.9174560308456421, "eval_runtime": 65.4877, "eval_samples_per_second": 41.657, "eval_steps_per_second": 0.657, "step": 1474 }, { "epoch": 1.3977730395640844, "grad_norm": 1.7458254846447263, "learning_rate": 2.1942837978340516e-06, "loss": 0.7289, "step": 1475 }, { "epoch": 1.3987206823027718, "grad_norm": 1.104499286780302, "learning_rate": 2.187932379060348e-06, "loss": 0.6773, "step": 1476 }, { "epoch": 1.3996683250414594, "grad_norm": 1.0183691386946678, "learning_rate": 2.1815875902963058e-06, "loss": 0.7138, "step": 1477 }, { "epoch": 1.4006159677801469, "grad_norm": 0.9953027863427754, "learning_rate": 2.175249446501024e-06, "loss": 0.6644, "step": 1478 }, { "epoch": 1.4015636105188345, "grad_norm": 1.119903318819432, "learning_rate": 2.1689179626179442e-06, "loss": 0.673, "step": 1479 }, { "epoch": 1.402511253257522, "grad_norm": 1.01161813324734, "learning_rate": 2.1625931535747964e-06, "loss": 0.7104, "step": 1480 }, { "epoch": 1.4034588959962093, "grad_norm": 0.953534813718956, "learning_rate": 2.1562750342835827e-06, "loss": 0.7277, "step": 1481 }, { "epoch": 1.404406538734897, "grad_norm": 1.1396651805754336, "learning_rate": 2.1499636196405225e-06, "loss": 0.7227, "step": 1482 }, { "epoch": 1.4053541814735846, "grad_norm": 1.5386081307384067, "learning_rate": 2.1436589245260375e-06, "loss": 0.668, "step": 1483 }, { "epoch": 1.406301824212272, "grad_norm": 0.9995908645331962, "learning_rate": 2.1373609638047033e-06, "loss": 0.7043, "step": 1484 }, { "epoch": 1.4072494669509594, "grad_norm": 1.10383243717245, "learning_rate": 2.1310697523252126e-06, "loss": 0.7026, "step": 1485 }, { "epoch": 1.408197109689647, "grad_norm": 1.0267277972723963, "learning_rate": 2.1247853049203543e-06, "loss": 0.7082, "step": 1486 }, { "epoch": 1.4091447524283345, "grad_norm": 1.0496913097987108, "learning_rate": 2.118507636406962e-06, "loss": 0.6481, "step": 1487 }, { "epoch": 1.410092395167022, "grad_norm": 1.1897337956700562, "learning_rate": 2.112236761585892e-06, "loss": 0.7089, "step": 1488 }, { "epoch": 1.4110400379057095, "grad_norm": 0.9785065766450801, "learning_rate": 2.1059726952419782e-06, "loss": 0.7485, "step": 1489 }, { "epoch": 1.4119876806443972, "grad_norm": 1.0121891810944206, "learning_rate": 2.09971545214401e-06, "loss": 0.7133, "step": 1490 }, { "epoch": 1.4129353233830846, "grad_norm": 1.292470404393716, "learning_rate": 2.0934650470446788e-06, "loss": 0.6978, "step": 1491 }, { "epoch": 1.413882966121772, "grad_norm": 0.9544540655235114, "learning_rate": 2.087221494680563e-06, "loss": 0.7313, "step": 1492 }, { "epoch": 1.4148306088604596, "grad_norm": 1.1659180349275824, "learning_rate": 2.0809848097720823e-06, "loss": 0.6451, "step": 1493 }, { "epoch": 1.415778251599147, "grad_norm": 1.037380174383666, "learning_rate": 2.074755007023461e-06, "loss": 0.7405, "step": 1494 }, { "epoch": 1.4167258943378347, "grad_norm": 1.0242249417898568, "learning_rate": 2.068532101122704e-06, "loss": 0.6708, "step": 1495 }, { "epoch": 1.417673537076522, "grad_norm": 0.9620726481692868, "learning_rate": 2.0623161067415463e-06, "loss": 0.6707, "step": 1496 }, { "epoch": 1.417673537076522, "eval_loss": 0.9181029200553894, "eval_runtime": 64.509, "eval_samples_per_second": 42.289, "eval_steps_per_second": 0.667, "step": 1496 }, { "epoch": 1.4186211798152097, "grad_norm": 1.005417588718906, "learning_rate": 2.0561070385354388e-06, "loss": 0.6731, "step": 1497 }, { "epoch": 1.4195688225538972, "grad_norm": 1.1896671861120682, "learning_rate": 2.0499049111434922e-06, "loss": 0.7227, "step": 1498 }, { "epoch": 1.4205164652925846, "grad_norm": 2.0023777404039786, "learning_rate": 2.0437097391884613e-06, "loss": 0.6868, "step": 1499 }, { "epoch": 1.4214641080312722, "grad_norm": 1.0629425622129343, "learning_rate": 2.0375215372766944e-06, "loss": 0.6846, "step": 1500 }, { "epoch": 1.4224117507699598, "grad_norm": 0.9445974760544792, "learning_rate": 2.0313403199981125e-06, "loss": 0.7394, "step": 1501 }, { "epoch": 1.4233593935086473, "grad_norm": 0.9843138879504003, "learning_rate": 2.025166101926168e-06, "loss": 0.7182, "step": 1502 }, { "epoch": 1.4243070362473347, "grad_norm": 0.9243254028077557, "learning_rate": 2.018998897617808e-06, "loss": 0.6837, "step": 1503 }, { "epoch": 1.4252546789860223, "grad_norm": 1.0253863539461858, "learning_rate": 2.012838721613447e-06, "loss": 0.667, "step": 1504 }, { "epoch": 1.4262023217247097, "grad_norm": 1.327749397929995, "learning_rate": 2.0066855884369246e-06, "loss": 0.7177, "step": 1505 }, { "epoch": 1.4271499644633974, "grad_norm": 1.1076042384170708, "learning_rate": 2.0005395125954814e-06, "loss": 0.7841, "step": 1506 }, { "epoch": 1.4280976072020848, "grad_norm": 1.037107571912964, "learning_rate": 1.9944005085797124e-06, "loss": 0.6346, "step": 1507 }, { "epoch": 1.4290452499407724, "grad_norm": 1.0616935978252389, "learning_rate": 1.988268590863546e-06, "loss": 0.7287, "step": 1508 }, { "epoch": 1.4299928926794598, "grad_norm": 1.1394211171878255, "learning_rate": 1.982143773904197e-06, "loss": 0.7026, "step": 1509 }, { "epoch": 1.4309405354181473, "grad_norm": 1.0108800469903032, "learning_rate": 1.9760260721421426e-06, "loss": 0.722, "step": 1510 }, { "epoch": 1.431888178156835, "grad_norm": 1.0306788000447762, "learning_rate": 1.9699155000010853e-06, "loss": 0.6762, "step": 1511 }, { "epoch": 1.4328358208955223, "grad_norm": 0.963103281240134, "learning_rate": 1.9638120718879133e-06, "loss": 0.7084, "step": 1512 }, { "epoch": 1.43378346363421, "grad_norm": 0.9474369197260795, "learning_rate": 1.9577158021926774e-06, "loss": 0.6745, "step": 1513 }, { "epoch": 1.4347311063728974, "grad_norm": 1.9939781034695376, "learning_rate": 1.951626705288544e-06, "loss": 0.7011, "step": 1514 }, { "epoch": 1.435678749111585, "grad_norm": 0.9725792085154051, "learning_rate": 1.945544795531777e-06, "loss": 0.7155, "step": 1515 }, { "epoch": 1.4366263918502724, "grad_norm": 1.1197678082060893, "learning_rate": 1.9394700872616856e-06, "loss": 0.6595, "step": 1516 }, { "epoch": 1.4375740345889598, "grad_norm": 1.0301457084495482, "learning_rate": 1.9334025948006074e-06, "loss": 0.6955, "step": 1517 }, { "epoch": 1.4385216773276475, "grad_norm": 0.9348949706000595, "learning_rate": 1.927342332453866e-06, "loss": 0.6047, "step": 1518 }, { "epoch": 1.4385216773276475, "eval_loss": 0.9176700115203857, "eval_runtime": 67.6785, "eval_samples_per_second": 40.308, "eval_steps_per_second": 0.635, "step": 1518 }, { "epoch": 1.439469320066335, "grad_norm": 1.0644335037153672, "learning_rate": 1.921289314509734e-06, "loss": 0.7127, "step": 1519 }, { "epoch": 1.4404169628050225, "grad_norm": 1.0834219671738452, "learning_rate": 1.9152435552394105e-06, "loss": 0.7215, "step": 1520 }, { "epoch": 1.44136460554371, "grad_norm": 1.0745604461477756, "learning_rate": 1.9092050688969736e-06, "loss": 0.678, "step": 1521 }, { "epoch": 1.4423122482823976, "grad_norm": 1.0371641556496103, "learning_rate": 1.9031738697193618e-06, "loss": 0.633, "step": 1522 }, { "epoch": 1.443259891021085, "grad_norm": 1.2536801282599777, "learning_rate": 1.8971499719263253e-06, "loss": 0.6985, "step": 1523 }, { "epoch": 1.4442075337597726, "grad_norm": 0.9951235851029153, "learning_rate": 1.8911333897204071e-06, "loss": 0.7719, "step": 1524 }, { "epoch": 1.44515517649846, "grad_norm": 1.0116038011102524, "learning_rate": 1.8851241372868938e-06, "loss": 0.6848, "step": 1525 }, { "epoch": 1.4461028192371477, "grad_norm": 1.4604330607664564, "learning_rate": 1.8791222287937983e-06, "loss": 0.7657, "step": 1526 }, { "epoch": 1.447050461975835, "grad_norm": 1.0830259379657234, "learning_rate": 1.8731276783918162e-06, "loss": 0.6805, "step": 1527 }, { "epoch": 1.4479981047145225, "grad_norm": 1.0267731594205933, "learning_rate": 1.8671405002142918e-06, "loss": 0.6707, "step": 1528 }, { "epoch": 1.4489457474532101, "grad_norm": 1.0461880905401142, "learning_rate": 1.8611607083771931e-06, "loss": 0.7222, "step": 1529 }, { "epoch": 1.4498933901918978, "grad_norm": 1.5078994577302969, "learning_rate": 1.855188316979068e-06, "loss": 0.7749, "step": 1530 }, { "epoch": 1.4508410329305852, "grad_norm": 0.9580998087747682, "learning_rate": 1.8492233401010218e-06, "loss": 0.6656, "step": 1531 }, { "epoch": 1.4517886756692726, "grad_norm": 1.030790506291452, "learning_rate": 1.8432657918066732e-06, "loss": 0.6938, "step": 1532 }, { "epoch": 1.4527363184079602, "grad_norm": 1.0172225305837883, "learning_rate": 1.8373156861421327e-06, "loss": 0.6944, "step": 1533 }, { "epoch": 1.4536839611466477, "grad_norm": 1.0683718679060756, "learning_rate": 1.831373037135955e-06, "loss": 0.6548, "step": 1534 }, { "epoch": 1.4546316038853353, "grad_norm": 1.0948584996468758, "learning_rate": 1.8254378587991229e-06, "loss": 0.7163, "step": 1535 }, { "epoch": 1.4555792466240227, "grad_norm": 0.9144325805834296, "learning_rate": 1.819510165125002e-06, "loss": 0.6897, "step": 1536 }, { "epoch": 1.4565268893627104, "grad_norm": 1.0047087823053469, "learning_rate": 1.813589970089308e-06, "loss": 0.682, "step": 1537 }, { "epoch": 1.4574745321013978, "grad_norm": 1.0404416866339528, "learning_rate": 1.8076772876500831e-06, "loss": 0.7615, "step": 1538 }, { "epoch": 1.4584221748400852, "grad_norm": 0.9884869531835868, "learning_rate": 1.8017721317476517e-06, "loss": 0.7436, "step": 1539 }, { "epoch": 1.4593698175787728, "grad_norm": 1.1394802881055484, "learning_rate": 1.7958745163045987e-06, "loss": 0.6969, "step": 1540 }, { "epoch": 1.4593698175787728, "eval_loss": 0.9182996153831482, "eval_runtime": 64.6749, "eval_samples_per_second": 42.18, "eval_steps_per_second": 0.665, "step": 1540 }, { "epoch": 1.4603174603174602, "grad_norm": 0.9439166549447204, "learning_rate": 1.7899844552257233e-06, "loss": 0.6422, "step": 1541 }, { "epoch": 1.4612651030561479, "grad_norm": 1.0656288471157296, "learning_rate": 1.7841019623980215e-06, "loss": 0.7706, "step": 1542 }, { "epoch": 1.4622127457948353, "grad_norm": 1.1222434957696432, "learning_rate": 1.778227051690639e-06, "loss": 0.7507, "step": 1543 }, { "epoch": 1.463160388533523, "grad_norm": 0.983832268950965, "learning_rate": 1.77235973695485e-06, "loss": 0.6955, "step": 1544 }, { "epoch": 1.4641080312722103, "grad_norm": 1.2587712766095054, "learning_rate": 1.76650003202402e-06, "loss": 0.6477, "step": 1545 }, { "epoch": 1.4650556740108978, "grad_norm": 1.0066028742449087, "learning_rate": 1.760647950713566e-06, "loss": 0.7544, "step": 1546 }, { "epoch": 1.4660033167495854, "grad_norm": 1.7078692165257152, "learning_rate": 1.7548035068209402e-06, "loss": 0.6756, "step": 1547 }, { "epoch": 1.466950959488273, "grad_norm": 1.166304208347539, "learning_rate": 1.7489667141255801e-06, "loss": 0.7093, "step": 1548 }, { "epoch": 1.4678986022269604, "grad_norm": 1.1980159481547823, "learning_rate": 1.74313758638889e-06, "loss": 0.6749, "step": 1549 }, { "epoch": 1.4688462449656479, "grad_norm": 1.1944578051622323, "learning_rate": 1.7373161373541968e-06, "loss": 0.7281, "step": 1550 }, { "epoch": 1.4697938877043355, "grad_norm": 0.9485809631995211, "learning_rate": 1.7315023807467297e-06, "loss": 0.7248, "step": 1551 }, { "epoch": 1.470741530443023, "grad_norm": 1.0931088608413588, "learning_rate": 1.7256963302735752e-06, "loss": 0.7378, "step": 1552 }, { "epoch": 1.4716891731817106, "grad_norm": 1.060786787419658, "learning_rate": 1.7198979996236548e-06, "loss": 0.7155, "step": 1553 }, { "epoch": 1.472636815920398, "grad_norm": 1.0652060238495478, "learning_rate": 1.7141074024676913e-06, "loss": 0.7045, "step": 1554 }, { "epoch": 1.4735844586590856, "grad_norm": 1.0281307395838142, "learning_rate": 1.7083245524581666e-06, "loss": 0.6337, "step": 1555 }, { "epoch": 1.474532101397773, "grad_norm": 1.0438612628712518, "learning_rate": 1.702549463229305e-06, "loss": 0.71, "step": 1556 }, { "epoch": 1.4754797441364604, "grad_norm": 1.1014417394767868, "learning_rate": 1.6967821483970277e-06, "loss": 0.7179, "step": 1557 }, { "epoch": 1.476427386875148, "grad_norm": 1.111247079863271, "learning_rate": 1.6910226215589303e-06, "loss": 0.7377, "step": 1558 }, { "epoch": 1.4773750296138357, "grad_norm": 0.9295211830798729, "learning_rate": 1.6852708962942426e-06, "loss": 0.6809, "step": 1559 }, { "epoch": 1.4783226723525231, "grad_norm": 0.962644045007234, "learning_rate": 1.6795269861638041e-06, "loss": 0.6314, "step": 1560 }, { "epoch": 1.4792703150912105, "grad_norm": 1.0292591372583686, "learning_rate": 1.6737909047100292e-06, "loss": 0.6838, "step": 1561 }, { "epoch": 1.4802179578298982, "grad_norm": 0.933451941605439, "learning_rate": 1.6680626654568688e-06, "loss": 0.6608, "step": 1562 }, { "epoch": 1.4802179578298982, "eval_loss": 0.9174679517745972, "eval_runtime": 65.8561, "eval_samples_per_second": 41.424, "eval_steps_per_second": 0.653, "step": 1562 }, { "epoch": 1.4811656005685856, "grad_norm": 1.2955810557077383, "learning_rate": 1.6623422819097916e-06, "loss": 0.6458, "step": 1563 }, { "epoch": 1.4821132433072732, "grad_norm": 1.0310551387137157, "learning_rate": 1.6566297675557392e-06, "loss": 0.6919, "step": 1564 }, { "epoch": 1.4830608860459606, "grad_norm": 1.0315297687360494, "learning_rate": 1.650925135863104e-06, "loss": 0.7086, "step": 1565 }, { "epoch": 1.4840085287846483, "grad_norm": 1.0726991322217692, "learning_rate": 1.6452284002816893e-06, "loss": 0.7162, "step": 1566 }, { "epoch": 1.4849561715233357, "grad_norm": 0.9497387623431127, "learning_rate": 1.6395395742426873e-06, "loss": 0.7216, "step": 1567 }, { "epoch": 1.4859038142620231, "grad_norm": 1.2380479809887557, "learning_rate": 1.6338586711586358e-06, "loss": 0.7606, "step": 1568 }, { "epoch": 1.4868514570007108, "grad_norm": 1.0796844065547042, "learning_rate": 1.6281857044233968e-06, "loss": 0.7319, "step": 1569 }, { "epoch": 1.4877990997393982, "grad_norm": 1.1057313142497514, "learning_rate": 1.6225206874121219e-06, "loss": 0.6829, "step": 1570 }, { "epoch": 1.4887467424780858, "grad_norm": 0.9734912790760845, "learning_rate": 1.6168636334812126e-06, "loss": 0.7407, "step": 1571 }, { "epoch": 1.4896943852167732, "grad_norm": 1.0421614980005374, "learning_rate": 1.6112145559683057e-06, "loss": 0.7287, "step": 1572 }, { "epoch": 1.4906420279554609, "grad_norm": 1.04980460946954, "learning_rate": 1.6055734681922225e-06, "loss": 0.7045, "step": 1573 }, { "epoch": 1.4915896706941483, "grad_norm": 0.9906252177237804, "learning_rate": 1.5999403834529549e-06, "loss": 0.7192, "step": 1574 }, { "epoch": 1.4925373134328357, "grad_norm": 0.8864390607015001, "learning_rate": 1.5943153150316192e-06, "loss": 0.6814, "step": 1575 }, { "epoch": 1.4934849561715233, "grad_norm": 1.0203127741430889, "learning_rate": 1.588698276190438e-06, "loss": 0.6962, "step": 1576 }, { "epoch": 1.494432598910211, "grad_norm": 1.1100506177133262, "learning_rate": 1.583089280172696e-06, "loss": 0.7852, "step": 1577 }, { "epoch": 1.4953802416488984, "grad_norm": 1.0162831638512881, "learning_rate": 1.5774883402027208e-06, "loss": 0.7059, "step": 1578 }, { "epoch": 1.4963278843875858, "grad_norm": 0.9936052407213679, "learning_rate": 1.5718954694858457e-06, "loss": 0.6858, "step": 1579 }, { "epoch": 1.4972755271262734, "grad_norm": 0.9088159832146572, "learning_rate": 1.5663106812083746e-06, "loss": 0.75, "step": 1580 }, { "epoch": 1.4982231698649608, "grad_norm": 1.078879512723169, "learning_rate": 1.5607339885375616e-06, "loss": 0.7139, "step": 1581 }, { "epoch": 1.4991708126036485, "grad_norm": 1.096073398162632, "learning_rate": 1.555165404621567e-06, "loss": 0.7086, "step": 1582 }, { "epoch": 1.500118455342336, "grad_norm": 0.9334643534638518, "learning_rate": 1.549604942589441e-06, "loss": 0.7479, "step": 1583 }, { "epoch": 1.5010660980810235, "grad_norm": 1.0170090962142344, "learning_rate": 1.5440526155510766e-06, "loss": 0.7369, "step": 1584 }, { "epoch": 1.5010660980810235, "eval_loss": 0.9167425036430359, "eval_runtime": 65.8794, "eval_samples_per_second": 41.409, "eval_steps_per_second": 0.653, "step": 1584 }, { "epoch": 1.502013740819711, "grad_norm": 0.9547990185885695, "learning_rate": 1.5385084365971947e-06, "loss": 0.6959, "step": 1585 }, { "epoch": 1.5029613835583984, "grad_norm": 1.0361542203838576, "learning_rate": 1.5329724187992983e-06, "loss": 0.7212, "step": 1586 }, { "epoch": 1.503909026297086, "grad_norm": 0.9844297983728068, "learning_rate": 1.527444575209654e-06, "loss": 0.6246, "step": 1587 }, { "epoch": 1.5048566690357736, "grad_norm": 1.1774971138059982, "learning_rate": 1.5219249188612556e-06, "loss": 0.7104, "step": 1588 }, { "epoch": 1.505804311774461, "grad_norm": 1.0258042621080319, "learning_rate": 1.5164134627677895e-06, "loss": 0.8074, "step": 1589 }, { "epoch": 1.5067519545131485, "grad_norm": 1.0211430093660283, "learning_rate": 1.5109102199236152e-06, "loss": 0.7133, "step": 1590 }, { "epoch": 1.507699597251836, "grad_norm": 0.9827554066548617, "learning_rate": 1.5054152033037206e-06, "loss": 0.6725, "step": 1591 }, { "epoch": 1.5086472399905235, "grad_norm": 1.039393638590845, "learning_rate": 1.4999284258637054e-06, "loss": 0.7353, "step": 1592 }, { "epoch": 1.509594882729211, "grad_norm": 0.9523927797678573, "learning_rate": 1.4944499005397372e-06, "loss": 0.7171, "step": 1593 }, { "epoch": 1.5105425254678986, "grad_norm": 1.1643227627630548, "learning_rate": 1.488979640248534e-06, "loss": 0.7277, "step": 1594 }, { "epoch": 1.5114901682065862, "grad_norm": 1.3140724772089227, "learning_rate": 1.483517657887321e-06, "loss": 0.6989, "step": 1595 }, { "epoch": 1.5124378109452736, "grad_norm": 1.1677523249682737, "learning_rate": 1.4780639663338125e-06, "loss": 0.6442, "step": 1596 }, { "epoch": 1.513385453683961, "grad_norm": 1.054535982093413, "learning_rate": 1.4726185784461726e-06, "loss": 0.7267, "step": 1597 }, { "epoch": 1.5143330964226487, "grad_norm": 1.0699556405778263, "learning_rate": 1.467181507062987e-06, "loss": 0.7513, "step": 1598 }, { "epoch": 1.5152807391613363, "grad_norm": 1.0277582958250449, "learning_rate": 1.4617527650032359e-06, "loss": 0.7007, "step": 1599 }, { "epoch": 1.5162283819000237, "grad_norm": 0.9386085483384984, "learning_rate": 1.4563323650662586e-06, "loss": 0.6309, "step": 1600 }, { "epoch": 1.5171760246387112, "grad_norm": 0.9866666932951867, "learning_rate": 1.45092032003173e-06, "loss": 0.7236, "step": 1601 }, { "epoch": 1.5181236673773988, "grad_norm": 1.1091828859845587, "learning_rate": 1.4455166426596222e-06, "loss": 0.6645, "step": 1602 }, { "epoch": 1.5190713101160862, "grad_norm": 1.0719630621382366, "learning_rate": 1.440121345690182e-06, "loss": 0.6967, "step": 1603 }, { "epoch": 1.5200189528547736, "grad_norm": 1.2153721393164998, "learning_rate": 1.434734441843899e-06, "loss": 0.6897, "step": 1604 }, { "epoch": 1.5209665955934613, "grad_norm": 1.4318770673483734, "learning_rate": 1.4293559438214688e-06, "loss": 0.6556, "step": 1605 }, { "epoch": 1.521914238332149, "grad_norm": 1.2817723869505593, "learning_rate": 1.4239858643037753e-06, "loss": 0.714, "step": 1606 }, { "epoch": 1.521914238332149, "eval_loss": 0.917027473449707, "eval_runtime": 64.2379, "eval_samples_per_second": 42.467, "eval_steps_per_second": 0.669, "step": 1606 }, { "epoch": 1.5228618810708363, "grad_norm": 0.8577851406821353, "learning_rate": 1.4186242159518477e-06, "loss": 0.7231, "step": 1607 }, { "epoch": 1.5238095238095237, "grad_norm": 1.3693963934030193, "learning_rate": 1.4132710114068427e-06, "loss": 0.7009, "step": 1608 }, { "epoch": 1.5247571665482114, "grad_norm": 1.0677528890480996, "learning_rate": 1.4079262632900048e-06, "loss": 0.7038, "step": 1609 }, { "epoch": 1.525704809286899, "grad_norm": 1.0765183801869453, "learning_rate": 1.4025899842026442e-06, "loss": 0.6736, "step": 1610 }, { "epoch": 1.5266524520255862, "grad_norm": 0.913748130871297, "learning_rate": 1.3972621867261e-06, "loss": 0.7614, "step": 1611 }, { "epoch": 1.5276000947642738, "grad_norm": 0.9806034371545538, "learning_rate": 1.3919428834217163e-06, "loss": 0.7362, "step": 1612 }, { "epoch": 1.5285477375029615, "grad_norm": 0.9176939063393724, "learning_rate": 1.3866320868308137e-06, "loss": 0.7242, "step": 1613 }, { "epoch": 1.5294953802416489, "grad_norm": 1.078657631575495, "learning_rate": 1.3813298094746491e-06, "loss": 0.7036, "step": 1614 }, { "epoch": 1.5304430229803363, "grad_norm": 0.9600837682866002, "learning_rate": 1.3760360638544012e-06, "loss": 0.6084, "step": 1615 }, { "epoch": 1.531390665719024, "grad_norm": 0.9717695630583644, "learning_rate": 1.3707508624511263e-06, "loss": 0.7243, "step": 1616 }, { "epoch": 1.5323383084577116, "grad_norm": 1.00289902715343, "learning_rate": 1.3654742177257436e-06, "loss": 0.7266, "step": 1617 }, { "epoch": 1.533285951196399, "grad_norm": 0.9836974323677292, "learning_rate": 1.3602061421189899e-06, "loss": 0.6669, "step": 1618 }, { "epoch": 1.5342335939350864, "grad_norm": 0.9697616210673388, "learning_rate": 1.3549466480514079e-06, "loss": 0.6768, "step": 1619 }, { "epoch": 1.535181236673774, "grad_norm": 1.0209728170363237, "learning_rate": 1.349695747923298e-06, "loss": 0.68, "step": 1620 }, { "epoch": 1.5361288794124615, "grad_norm": 1.1317276468030417, "learning_rate": 1.3444534541147058e-06, "loss": 0.6391, "step": 1621 }, { "epoch": 1.5370765221511489, "grad_norm": 0.9314842383246464, "learning_rate": 1.339219778985385e-06, "loss": 0.7117, "step": 1622 }, { "epoch": 1.5380241648898365, "grad_norm": 1.1323771307347132, "learning_rate": 1.3339947348747633e-06, "loss": 0.7511, "step": 1623 }, { "epoch": 1.5389718076285241, "grad_norm": 1.0881241560945962, "learning_rate": 1.3287783341019278e-06, "loss": 0.6818, "step": 1624 }, { "epoch": 1.5399194503672116, "grad_norm": 1.0586689056144305, "learning_rate": 1.3235705889655781e-06, "loss": 0.7126, "step": 1625 }, { "epoch": 1.540867093105899, "grad_norm": 0.9786185399753199, "learning_rate": 1.3183715117440143e-06, "loss": 0.704, "step": 1626 }, { "epoch": 1.5418147358445866, "grad_norm": 1.003277545254164, "learning_rate": 1.3131811146950946e-06, "loss": 0.7513, "step": 1627 }, { "epoch": 1.5427623785832743, "grad_norm": 0.9826596333095564, "learning_rate": 1.307999410056216e-06, "loss": 0.7253, "step": 1628 }, { "epoch": 1.5427623785832743, "eval_loss": 0.9158708453178406, "eval_runtime": 63.8817, "eval_samples_per_second": 42.704, "eval_steps_per_second": 0.673, "step": 1628 }, { "epoch": 1.5437100213219617, "grad_norm": 1.0364913585561064, "learning_rate": 1.3028264100442773e-06, "loss": 0.7177, "step": 1629 }, { "epoch": 1.544657664060649, "grad_norm": 1.0314613393589664, "learning_rate": 1.2976621268556571e-06, "loss": 0.6822, "step": 1630 }, { "epoch": 1.5456053067993367, "grad_norm": 1.15231691952742, "learning_rate": 1.2925065726661845e-06, "loss": 0.6954, "step": 1631 }, { "epoch": 1.5465529495380241, "grad_norm": 1.130048978044205, "learning_rate": 1.2873597596311026e-06, "loss": 0.6895, "step": 1632 }, { "epoch": 1.5475005922767116, "grad_norm": 0.9910577741606257, "learning_rate": 1.2822216998850506e-06, "loss": 0.7672, "step": 1633 }, { "epoch": 1.5484482350153992, "grad_norm": 1.1500833811393358, "learning_rate": 1.2770924055420258e-06, "loss": 0.6813, "step": 1634 }, { "epoch": 1.5493958777540868, "grad_norm": 1.0195004139070214, "learning_rate": 1.2719718886953647e-06, "loss": 0.6438, "step": 1635 }, { "epoch": 1.5503435204927742, "grad_norm": 1.0569551060744469, "learning_rate": 1.2668601614177017e-06, "loss": 0.678, "step": 1636 }, { "epoch": 1.5512911632314617, "grad_norm": 1.043875396257435, "learning_rate": 1.2617572357609565e-06, "loss": 0.7044, "step": 1637 }, { "epoch": 1.5522388059701493, "grad_norm": 3.1818927893039124, "learning_rate": 1.2566631237562894e-06, "loss": 0.682, "step": 1638 }, { "epoch": 1.553186448708837, "grad_norm": 0.9283273042702088, "learning_rate": 1.2515778374140858e-06, "loss": 0.688, "step": 1639 }, { "epoch": 1.5541340914475241, "grad_norm": 0.9528198487095788, "learning_rate": 1.246501388723923e-06, "loss": 0.7322, "step": 1640 }, { "epoch": 1.5550817341862118, "grad_norm": 0.9973994825972451, "learning_rate": 1.2414337896545375e-06, "loss": 0.666, "step": 1641 }, { "epoch": 1.5560293769248994, "grad_norm": 0.9902699851910854, "learning_rate": 1.2363750521538064e-06, "loss": 0.6851, "step": 1642 }, { "epoch": 1.5569770196635868, "grad_norm": 0.9650904944333506, "learning_rate": 1.2313251881487081e-06, "loss": 0.6672, "step": 1643 }, { "epoch": 1.5579246624022742, "grad_norm": 1.0589094342875154, "learning_rate": 1.2262842095453065e-06, "loss": 0.7416, "step": 1644 }, { "epoch": 1.5588723051409619, "grad_norm": 0.9386856438191878, "learning_rate": 1.2212521282287093e-06, "loss": 0.6483, "step": 1645 }, { "epoch": 1.5598199478796495, "grad_norm": 1.0392724309573975, "learning_rate": 1.2162289560630524e-06, "loss": 0.647, "step": 1646 }, { "epoch": 1.560767590618337, "grad_norm": 0.9961962290639487, "learning_rate": 1.211214704891467e-06, "loss": 0.6501, "step": 1647 }, { "epoch": 1.5617152333570243, "grad_norm": 0.9147134754595742, "learning_rate": 1.2062093865360458e-06, "loss": 0.6753, "step": 1648 }, { "epoch": 1.562662876095712, "grad_norm": 1.1879710589928145, "learning_rate": 1.2012130127978267e-06, "loss": 0.7233, "step": 1649 }, { "epoch": 1.5636105188343994, "grad_norm": 0.931640687866233, "learning_rate": 1.1962255954567537e-06, "loss": 0.6783, "step": 1650 }, { "epoch": 1.5636105188343994, "eval_loss": 0.9168549180030823, "eval_runtime": 67.249, "eval_samples_per_second": 40.566, "eval_steps_per_second": 0.639, "step": 1650 }, { "epoch": 1.5645581615730868, "grad_norm": 1.0551992767048257, "learning_rate": 1.1912471462716596e-06, "loss": 0.7034, "step": 1651 }, { "epoch": 1.5655058043117744, "grad_norm": 1.0948003208344899, "learning_rate": 1.1862776769802275e-06, "loss": 0.7325, "step": 1652 }, { "epoch": 1.566453447050462, "grad_norm": 0.9671073218744537, "learning_rate": 1.181317199298974e-06, "loss": 0.6658, "step": 1653 }, { "epoch": 1.5674010897891495, "grad_norm": 1.0286649464862914, "learning_rate": 1.1763657249232107e-06, "loss": 0.696, "step": 1654 }, { "epoch": 1.568348732527837, "grad_norm": 1.0682369368320455, "learning_rate": 1.1714232655270264e-06, "loss": 0.6833, "step": 1655 }, { "epoch": 1.5692963752665245, "grad_norm": 0.9350398037136525, "learning_rate": 1.1664898327632552e-06, "loss": 0.6133, "step": 1656 }, { "epoch": 1.5702440180052122, "grad_norm": 1.0464212808521496, "learning_rate": 1.1615654382634444e-06, "loss": 0.6935, "step": 1657 }, { "epoch": 1.5711916607438996, "grad_norm": 1.1421842706854686, "learning_rate": 1.1566500936378389e-06, "loss": 0.6562, "step": 1658 }, { "epoch": 1.572139303482587, "grad_norm": 1.0126937824702735, "learning_rate": 1.1517438104753386e-06, "loss": 0.7224, "step": 1659 }, { "epoch": 1.5730869462212747, "grad_norm": 1.033911213832933, "learning_rate": 1.146846600343488e-06, "loss": 0.7106, "step": 1660 }, { "epoch": 1.574034588959962, "grad_norm": 1.1179259174465113, "learning_rate": 1.1419584747884322e-06, "loss": 0.6983, "step": 1661 }, { "epoch": 1.5749822316986495, "grad_norm": 1.1682227334057596, "learning_rate": 1.1370794453349039e-06, "loss": 0.7165, "step": 1662 }, { "epoch": 1.5759298744373371, "grad_norm": 0.977082622032907, "learning_rate": 1.132209523486184e-06, "loss": 0.6902, "step": 1663 }, { "epoch": 1.5768775171760248, "grad_norm": 1.023609290507064, "learning_rate": 1.1273487207240845e-06, "loss": 0.6784, "step": 1664 }, { "epoch": 1.5778251599147122, "grad_norm": 1.0160057466393255, "learning_rate": 1.1224970485089193e-06, "loss": 0.6993, "step": 1665 }, { "epoch": 1.5787728026533996, "grad_norm": 0.9873881250473061, "learning_rate": 1.1176545182794674e-06, "loss": 0.7175, "step": 1666 }, { "epoch": 1.5797204453920872, "grad_norm": 1.3197902151472374, "learning_rate": 1.1128211414529626e-06, "loss": 0.6993, "step": 1667 }, { "epoch": 1.5806680881307749, "grad_norm": 1.3071909280159977, "learning_rate": 1.1079969294250515e-06, "loss": 0.7093, "step": 1668 }, { "epoch": 1.581615730869462, "grad_norm": 1.0541564785946917, "learning_rate": 1.1031818935697763e-06, "loss": 0.7186, "step": 1669 }, { "epoch": 1.5825633736081497, "grad_norm": 0.9430987398425215, "learning_rate": 1.0983760452395415e-06, "loss": 0.6589, "step": 1670 }, { "epoch": 1.5835110163468373, "grad_norm": 0.990119702459614, "learning_rate": 1.0935793957650947e-06, "loss": 0.6329, "step": 1671 }, { "epoch": 1.5844586590855247, "grad_norm": 1.0566708748575848, "learning_rate": 1.0887919564554893e-06, "loss": 0.7004, "step": 1672 }, { "epoch": 1.5844586590855247, "eval_loss": 0.91568523645401, "eval_runtime": 65.9386, "eval_samples_per_second": 41.372, "eval_steps_per_second": 0.652, "step": 1672 }, { "epoch": 1.5854063018242122, "grad_norm": 0.9610068854225464, "learning_rate": 1.0840137385980698e-06, "loss": 0.6791, "step": 1673 }, { "epoch": 1.5863539445628998, "grad_norm": 1.0544927310701904, "learning_rate": 1.079244753458437e-06, "loss": 0.6809, "step": 1674 }, { "epoch": 1.5873015873015874, "grad_norm": 0.9544448982368492, "learning_rate": 1.0744850122804218e-06, "loss": 0.6979, "step": 1675 }, { "epoch": 1.5882492300402749, "grad_norm": 1.2640977900333505, "learning_rate": 1.0697345262860638e-06, "loss": 0.6599, "step": 1676 }, { "epoch": 1.5891968727789623, "grad_norm": 1.0083657972577804, "learning_rate": 1.064993306675578e-06, "loss": 0.6699, "step": 1677 }, { "epoch": 1.59014451551765, "grad_norm": 1.0076578894558765, "learning_rate": 1.0602613646273374e-06, "loss": 0.6576, "step": 1678 }, { "epoch": 1.5910921582563373, "grad_norm": 1.0296009587033885, "learning_rate": 1.055538711297835e-06, "loss": 0.6642, "step": 1679 }, { "epoch": 1.5920398009950247, "grad_norm": 1.031189178917333, "learning_rate": 1.0508253578216693e-06, "loss": 0.5869, "step": 1680 }, { "epoch": 1.5929874437337124, "grad_norm": 1.0167017867708008, "learning_rate": 1.046121315311508e-06, "loss": 0.699, "step": 1681 }, { "epoch": 1.5939350864724, "grad_norm": 1.7410568540334943, "learning_rate": 1.0414265948580694e-06, "loss": 0.7248, "step": 1682 }, { "epoch": 1.5948827292110874, "grad_norm": 1.0198403446768327, "learning_rate": 1.0367412075300942e-06, "loss": 0.7163, "step": 1683 }, { "epoch": 1.5958303719497748, "grad_norm": 0.9447519143939068, "learning_rate": 1.0320651643743128e-06, "loss": 0.6455, "step": 1684 }, { "epoch": 1.5967780146884625, "grad_norm": 0.9675157899047281, "learning_rate": 1.0273984764154327e-06, "loss": 0.6627, "step": 1685 }, { "epoch": 1.5977256574271501, "grad_norm": 1.099687430563817, "learning_rate": 1.0227411546560962e-06, "loss": 0.6868, "step": 1686 }, { "epoch": 1.5986733001658375, "grad_norm": 1.0425813440851017, "learning_rate": 1.0180932100768714e-06, "loss": 0.7263, "step": 1687 }, { "epoch": 1.599620942904525, "grad_norm": 1.0467623766365717, "learning_rate": 1.0134546536362099e-06, "loss": 0.7087, "step": 1688 }, { "epoch": 1.6005685856432126, "grad_norm": 1.1080721212745201, "learning_rate": 1.008825496270434e-06, "loss": 0.708, "step": 1689 }, { "epoch": 1.6015162283819, "grad_norm": 1.18766161221473, "learning_rate": 1.0042057488937067e-06, "loss": 0.6998, "step": 1690 }, { "epoch": 1.6024638711205874, "grad_norm": 1.0510182256034266, "learning_rate": 9.995954223979992e-07, "loss": 0.6989, "step": 1691 }, { "epoch": 1.603411513859275, "grad_norm": 1.3860613450249764, "learning_rate": 9.949945276530782e-07, "loss": 0.7097, "step": 1692 }, { "epoch": 1.6043591565979627, "grad_norm": 1.2179486037581968, "learning_rate": 9.904030755064659e-07, "loss": 0.6978, "step": 1693 }, { "epoch": 1.60530679933665, "grad_norm": 0.964495472412476, "learning_rate": 9.858210767834292e-07, "loss": 0.6589, "step": 1694 }, { "epoch": 1.60530679933665, "eval_loss": 0.9162237644195557, "eval_runtime": 66.3491, "eval_samples_per_second": 41.116, "eval_steps_per_second": 0.648, "step": 1694 }, { "epoch": 1.6062544420753375, "grad_norm": 0.9604758470801715, "learning_rate": 9.8124854228694e-07, "loss": 0.6931, "step": 1695 }, { "epoch": 1.6072020848140252, "grad_norm": 1.1003612306707224, "learning_rate": 9.76685482797662e-07, "loss": 0.7113, "step": 1696 }, { "epoch": 1.6081497275527128, "grad_norm": 1.0752314491639252, "learning_rate": 9.72131909073914e-07, "loss": 0.6719, "step": 1697 }, { "epoch": 1.6090973702914, "grad_norm": 1.0414996201454678, "learning_rate": 9.675878318516546e-07, "loss": 0.7659, "step": 1698 }, { "epoch": 1.6100450130300876, "grad_norm": 1.1239598855506638, "learning_rate": 9.630532618444532e-07, "loss": 0.6927, "step": 1699 }, { "epoch": 1.6109926557687753, "grad_norm": 1.0046753329350298, "learning_rate": 9.58528209743459e-07, "loss": 0.7055, "step": 1700 }, { "epoch": 1.6119402985074627, "grad_norm": 1.0189187609725145, "learning_rate": 9.540126862173865e-07, "loss": 0.7139, "step": 1701 }, { "epoch": 1.61288794124615, "grad_norm": 1.719157547437819, "learning_rate": 9.495067019124793e-07, "loss": 0.7117, "step": 1702 }, { "epoch": 1.6138355839848377, "grad_norm": 1.01602314797504, "learning_rate": 9.450102674524952e-07, "loss": 0.7244, "step": 1703 }, { "epoch": 1.6147832267235254, "grad_norm": 1.0640275443844036, "learning_rate": 9.405233934386726e-07, "loss": 0.6851, "step": 1704 }, { "epoch": 1.6157308694622128, "grad_norm": 0.9979744262453907, "learning_rate": 9.360460904497132e-07, "loss": 0.712, "step": 1705 }, { "epoch": 1.6166785122009002, "grad_norm": 0.9521292764998414, "learning_rate": 9.315783690417479e-07, "loss": 0.6478, "step": 1706 }, { "epoch": 1.6176261549395878, "grad_norm": 1.0329943572255877, "learning_rate": 9.271202397483214e-07, "loss": 0.692, "step": 1707 }, { "epoch": 1.6185737976782753, "grad_norm": 0.9688251186485236, "learning_rate": 9.226717130803636e-07, "loss": 0.7099, "step": 1708 }, { "epoch": 1.6195214404169627, "grad_norm": 1.0095855927099802, "learning_rate": 9.182327995261592e-07, "loss": 0.6799, "step": 1709 }, { "epoch": 1.6204690831556503, "grad_norm": 0.9336916292986674, "learning_rate": 9.138035095513337e-07, "loss": 0.7118, "step": 1710 }, { "epoch": 1.621416725894338, "grad_norm": 0.9262678528281649, "learning_rate": 9.093838535988181e-07, "loss": 0.7048, "step": 1711 }, { "epoch": 1.6223643686330254, "grad_norm": 1.010842317395484, "learning_rate": 9.049738420888349e-07, "loss": 0.6302, "step": 1712 }, { "epoch": 1.6233120113717128, "grad_norm": 1.0054077234262655, "learning_rate": 9.005734854188625e-07, "loss": 0.7457, "step": 1713 }, { "epoch": 1.6242596541104004, "grad_norm": 1.1774522725658818, "learning_rate": 8.961827939636198e-07, "loss": 0.6728, "step": 1714 }, { "epoch": 1.625207296849088, "grad_norm": 1.0498467807029983, "learning_rate": 8.918017780750349e-07, "loss": 0.7334, "step": 1715 }, { "epoch": 1.6261549395877755, "grad_norm": 0.9901280155483644, "learning_rate": 8.874304480822271e-07, "loss": 0.7517, "step": 1716 }, { "epoch": 1.6261549395877755, "eval_loss": 0.9154621958732605, "eval_runtime": 65.8149, "eval_samples_per_second": 41.45, "eval_steps_per_second": 0.653, "step": 1716 }, { "epoch": 1.6271025823264629, "grad_norm": 1.1089800038542108, "learning_rate": 8.830688142914783e-07, "loss": 0.6657, "step": 1717 }, { "epoch": 1.6280502250651505, "grad_norm": 1.009309929039825, "learning_rate": 8.787168869862067e-07, "loss": 0.6259, "step": 1718 }, { "epoch": 1.628997867803838, "grad_norm": 0.9117346360474112, "learning_rate": 8.743746764269512e-07, "loss": 0.6988, "step": 1719 }, { "epoch": 1.6299455105425253, "grad_norm": 2.22961599294848, "learning_rate": 8.700421928513353e-07, "loss": 0.653, "step": 1720 }, { "epoch": 1.630893153281213, "grad_norm": 0.9850710565653991, "learning_rate": 8.657194464740542e-07, "loss": 0.737, "step": 1721 }, { "epoch": 1.6318407960199006, "grad_norm": 1.0081901085844467, "learning_rate": 8.614064474868423e-07, "loss": 0.6789, "step": 1722 }, { "epoch": 1.632788438758588, "grad_norm": 1.0631665749702537, "learning_rate": 8.571032060584555e-07, "loss": 0.7087, "step": 1723 }, { "epoch": 1.6337360814972754, "grad_norm": 1.0118721550869085, "learning_rate": 8.528097323346408e-07, "loss": 0.6821, "step": 1724 }, { "epoch": 1.634683724235963, "grad_norm": 1.274936977561588, "learning_rate": 8.485260364381187e-07, "loss": 0.6716, "step": 1725 }, { "epoch": 1.6356313669746505, "grad_norm": 1.0982132650644516, "learning_rate": 8.442521284685573e-07, "loss": 0.6765, "step": 1726 }, { "epoch": 1.636579009713338, "grad_norm": 1.1367575465419841, "learning_rate": 8.399880185025439e-07, "loss": 0.6864, "step": 1727 }, { "epoch": 1.6375266524520256, "grad_norm": 0.9279424119252897, "learning_rate": 8.357337165935675e-07, "loss": 0.7321, "step": 1728 }, { "epoch": 1.6384742951907132, "grad_norm": 1.0676158883165283, "learning_rate": 8.314892327719937e-07, "loss": 0.7418, "step": 1729 }, { "epoch": 1.6394219379294006, "grad_norm": 1.014952809890761, "learning_rate": 8.27254577045039e-07, "loss": 0.7356, "step": 1730 }, { "epoch": 1.640369580668088, "grad_norm": 1.0553676092641795, "learning_rate": 8.230297593967463e-07, "loss": 0.6572, "step": 1731 }, { "epoch": 1.6413172234067757, "grad_norm": 1.0596022428705136, "learning_rate": 8.188147897879667e-07, "loss": 0.6834, "step": 1732 }, { "epoch": 1.6422648661454633, "grad_norm": 1.0760233096652914, "learning_rate": 8.146096781563284e-07, "loss": 0.6732, "step": 1733 }, { "epoch": 1.6432125088841507, "grad_norm": 1.0435516775906648, "learning_rate": 8.104144344162229e-07, "loss": 0.7147, "step": 1734 }, { "epoch": 1.6441601516228381, "grad_norm": 0.9126453263477479, "learning_rate": 8.062290684587698e-07, "loss": 0.7066, "step": 1735 }, { "epoch": 1.6451077943615258, "grad_norm": 1.136390471558309, "learning_rate": 8.02053590151805e-07, "loss": 0.675, "step": 1736 }, { "epoch": 1.6460554371002132, "grad_norm": 0.9443277333193851, "learning_rate": 7.978880093398517e-07, "loss": 0.6556, "step": 1737 }, { "epoch": 1.6470030798389006, "grad_norm": 0.9782892174224657, "learning_rate": 7.937323358440935e-07, "loss": 0.6771, "step": 1738 }, { "epoch": 1.6470030798389006, "eval_loss": 0.9155307412147522, "eval_runtime": 69.7327, "eval_samples_per_second": 39.121, "eval_steps_per_second": 0.617, "step": 1738 }, { "epoch": 1.6479507225775882, "grad_norm": 0.9461043845473808, "learning_rate": 7.89586579462362e-07, "loss": 0.6145, "step": 1739 }, { "epoch": 1.6488983653162759, "grad_norm": 1.0760119198442148, "learning_rate": 7.854507499691006e-07, "loss": 0.6764, "step": 1740 }, { "epoch": 1.6498460080549633, "grad_norm": 1.047284386935676, "learning_rate": 7.813248571153542e-07, "loss": 0.7229, "step": 1741 }, { "epoch": 1.6507936507936507, "grad_norm": 1.0394813864374197, "learning_rate": 7.772089106287345e-07, "loss": 0.7326, "step": 1742 }, { "epoch": 1.6517412935323383, "grad_norm": 1.045028572772028, "learning_rate": 7.731029202134077e-07, "loss": 0.7167, "step": 1743 }, { "epoch": 1.652688936271026, "grad_norm": 1.006834434991697, "learning_rate": 7.690068955500623e-07, "loss": 0.705, "step": 1744 }, { "epoch": 1.6536365790097134, "grad_norm": 1.0207105597171535, "learning_rate": 7.649208462958935e-07, "loss": 0.7293, "step": 1745 }, { "epoch": 1.6545842217484008, "grad_norm": 0.9666760369440525, "learning_rate": 7.608447820845771e-07, "loss": 0.6882, "step": 1746 }, { "epoch": 1.6555318644870884, "grad_norm": 0.983892799303487, "learning_rate": 7.567787125262449e-07, "loss": 0.6787, "step": 1747 }, { "epoch": 1.6564795072257759, "grad_norm": 1.0469953263447322, "learning_rate": 7.527226472074678e-07, "loss": 0.7717, "step": 1748 }, { "epoch": 1.6574271499644633, "grad_norm": 0.9326611142849608, "learning_rate": 7.486765956912261e-07, "loss": 0.6829, "step": 1749 }, { "epoch": 1.658374792703151, "grad_norm": 0.9698239262438025, "learning_rate": 7.446405675168938e-07, "loss": 0.6417, "step": 1750 }, { "epoch": 1.6593224354418386, "grad_norm": 1.0388054521896983, "learning_rate": 7.406145722002101e-07, "loss": 0.661, "step": 1751 }, { "epoch": 1.660270078180526, "grad_norm": 1.0284488705513752, "learning_rate": 7.365986192332624e-07, "loss": 0.6885, "step": 1752 }, { "epoch": 1.6612177209192134, "grad_norm": 1.0447857805111578, "learning_rate": 7.325927180844589e-07, "loss": 0.754, "step": 1753 }, { "epoch": 1.662165363657901, "grad_norm": 1.0316358381680006, "learning_rate": 7.285968781985093e-07, "loss": 0.7376, "step": 1754 }, { "epoch": 1.6631130063965884, "grad_norm": 0.9473765886693883, "learning_rate": 7.246111089964042e-07, "loss": 0.7222, "step": 1755 }, { "epoch": 1.6640606491352758, "grad_norm": 1.0459905890378751, "learning_rate": 7.206354198753862e-07, "loss": 0.7092, "step": 1756 }, { "epoch": 1.6650082918739635, "grad_norm": 1.1687326620774066, "learning_rate": 7.166698202089367e-07, "loss": 0.6543, "step": 1757 }, { "epoch": 1.6659559346126511, "grad_norm": 0.9534227603353764, "learning_rate": 7.127143193467445e-07, "loss": 0.6816, "step": 1758 }, { "epoch": 1.6669035773513385, "grad_norm": 1.6473488276372368, "learning_rate": 7.087689266146935e-07, "loss": 0.609, "step": 1759 }, { "epoch": 1.667851220090026, "grad_norm": 1.0168119921182184, "learning_rate": 7.048336513148307e-07, "loss": 0.7228, "step": 1760 }, { "epoch": 1.667851220090026, "eval_loss": 0.9152177572250366, "eval_runtime": 64.5647, "eval_samples_per_second": 42.252, "eval_steps_per_second": 0.666, "step": 1760 }, { "epoch": 1.6687988628287136, "grad_norm": 1.091512400561692, "learning_rate": 7.009085027253543e-07, "loss": 0.7229, "step": 1761 }, { "epoch": 1.6697465055674012, "grad_norm": 0.9169936965491188, "learning_rate": 6.969934901005809e-07, "loss": 0.6622, "step": 1762 }, { "epoch": 1.6706941483060886, "grad_norm": 0.9220318537884853, "learning_rate": 6.930886226709344e-07, "loss": 0.6763, "step": 1763 }, { "epoch": 1.671641791044776, "grad_norm": 1.4840032839975184, "learning_rate": 6.89193909642919e-07, "loss": 0.7216, "step": 1764 }, { "epoch": 1.6725894337834637, "grad_norm": 1.0184604456491309, "learning_rate": 6.853093601990946e-07, "loss": 0.7152, "step": 1765 }, { "epoch": 1.6735370765221511, "grad_norm": 0.9526356849525173, "learning_rate": 6.814349834980622e-07, "loss": 0.6673, "step": 1766 }, { "epoch": 1.6744847192608385, "grad_norm": 1.0461521335211754, "learning_rate": 6.775707886744343e-07, "loss": 0.7344, "step": 1767 }, { "epoch": 1.6754323619995262, "grad_norm": 0.9731694459589282, "learning_rate": 6.737167848388227e-07, "loss": 0.6401, "step": 1768 }, { "epoch": 1.6763800047382138, "grad_norm": 1.033021359037975, "learning_rate": 6.698729810778065e-07, "loss": 0.6726, "step": 1769 }, { "epoch": 1.6773276474769012, "grad_norm": 1.0233932401805885, "learning_rate": 6.660393864539222e-07, "loss": 0.746, "step": 1770 }, { "epoch": 1.6782752902155886, "grad_norm": 1.1404272415247523, "learning_rate": 6.622160100056296e-07, "loss": 0.7257, "step": 1771 }, { "epoch": 1.6792229329542763, "grad_norm": 0.9500280589832726, "learning_rate": 6.584028607473019e-07, "loss": 0.6845, "step": 1772 }, { "epoch": 1.680170575692964, "grad_norm": 1.1155349368000151, "learning_rate": 6.545999476691994e-07, "loss": 0.7388, "step": 1773 }, { "epoch": 1.681118218431651, "grad_norm": 1.0605447278963818, "learning_rate": 6.508072797374454e-07, "loss": 0.7103, "step": 1774 }, { "epoch": 1.6820658611703387, "grad_norm": 1.2288918607070451, "learning_rate": 6.470248658940115e-07, "loss": 0.7631, "step": 1775 }, { "epoch": 1.6830135039090264, "grad_norm": 1.5101120147098823, "learning_rate": 6.432527150566903e-07, "loss": 0.6687, "step": 1776 }, { "epoch": 1.6839611466477138, "grad_norm": 1.0065803962933693, "learning_rate": 6.394908361190804e-07, "loss": 0.6794, "step": 1777 }, { "epoch": 1.6849087893864012, "grad_norm": 0.9998197093667816, "learning_rate": 6.3573923795056e-07, "loss": 0.7064, "step": 1778 }, { "epoch": 1.6858564321250888, "grad_norm": 0.9471515572718026, "learning_rate": 6.319979293962692e-07, "loss": 0.6864, "step": 1779 }, { "epoch": 1.6868040748637765, "grad_norm": 1.1324852784222434, "learning_rate": 6.282669192770896e-07, "loss": 0.6993, "step": 1780 }, { "epoch": 1.687751717602464, "grad_norm": 1.1270168615840963, "learning_rate": 6.245462163896188e-07, "loss": 0.6916, "step": 1781 }, { "epoch": 1.6886993603411513, "grad_norm": 1.0074883404182633, "learning_rate": 6.208358295061572e-07, "loss": 0.6657, "step": 1782 }, { "epoch": 1.6886993603411513, "eval_loss": 0.9154070615768433, "eval_runtime": 66.96, "eval_samples_per_second": 40.741, "eval_steps_per_second": 0.642, "step": 1782 }, { "epoch": 1.689647003079839, "grad_norm": 1.054498287010899, "learning_rate": 6.171357673746798e-07, "loss": 0.6781, "step": 1783 }, { "epoch": 1.6905946458185264, "grad_norm": 1.1173569579127263, "learning_rate": 6.134460387188207e-07, "loss": 0.7066, "step": 1784 }, { "epoch": 1.6915422885572138, "grad_norm": 1.1694100658433553, "learning_rate": 6.097666522378498e-07, "loss": 0.7334, "step": 1785 }, { "epoch": 1.6924899312959014, "grad_norm": 1.0277071128785418, "learning_rate": 6.060976166066546e-07, "loss": 0.653, "step": 1786 }, { "epoch": 1.693437574034589, "grad_norm": 0.9433860569393974, "learning_rate": 6.024389404757164e-07, "loss": 0.7334, "step": 1787 }, { "epoch": 1.6943852167732765, "grad_norm": 1.1834900094280103, "learning_rate": 5.98790632471094e-07, "loss": 0.6869, "step": 1788 }, { "epoch": 1.6953328595119639, "grad_norm": 1.0548703068294034, "learning_rate": 5.951527011944008e-07, "loss": 0.6971, "step": 1789 }, { "epoch": 1.6962805022506515, "grad_norm": 1.04211743004335, "learning_rate": 5.91525155222783e-07, "loss": 0.6956, "step": 1790 }, { "epoch": 1.6972281449893392, "grad_norm": 0.9772649558392219, "learning_rate": 5.879080031089047e-07, "loss": 0.6854, "step": 1791 }, { "epoch": 1.6981757877280266, "grad_norm": 1.0335742882872847, "learning_rate": 5.843012533809211e-07, "loss": 0.6413, "step": 1792 }, { "epoch": 1.699123430466714, "grad_norm": 1.0424548733114698, "learning_rate": 5.807049145424648e-07, "loss": 0.6913, "step": 1793 }, { "epoch": 1.7000710732054016, "grad_norm": 0.9133463333235484, "learning_rate": 5.771189950726191e-07, "loss": 0.7096, "step": 1794 }, { "epoch": 1.701018715944089, "grad_norm": 1.052652054283284, "learning_rate": 5.735435034259057e-07, "loss": 0.6999, "step": 1795 }, { "epoch": 1.7019663586827765, "grad_norm": 0.9804718340800169, "learning_rate": 5.699784480322568e-07, "loss": 0.7222, "step": 1796 }, { "epoch": 1.702914001421464, "grad_norm": 1.0457563081701153, "learning_rate": 5.664238372970016e-07, "loss": 0.7255, "step": 1797 }, { "epoch": 1.7038616441601517, "grad_norm": 1.032471323678011, "learning_rate": 5.628796796008435e-07, "loss": 0.7157, "step": 1798 }, { "epoch": 1.7048092868988391, "grad_norm": 1.060728898729736, "learning_rate": 5.593459832998388e-07, "loss": 0.7115, "step": 1799 }, { "epoch": 1.7057569296375266, "grad_norm": 0.9198580084720938, "learning_rate": 5.558227567253832e-07, "loss": 0.6637, "step": 1800 }, { "epoch": 1.7067045723762142, "grad_norm": 1.1801823503070277, "learning_rate": 5.52310008184182e-07, "loss": 0.6761, "step": 1801 }, { "epoch": 1.7076522151149018, "grad_norm": 0.9488900309898747, "learning_rate": 5.488077459582425e-07, "loss": 0.6881, "step": 1802 }, { "epoch": 1.708599857853589, "grad_norm": 1.0453768970538024, "learning_rate": 5.453159783048434e-07, "loss": 0.6938, "step": 1803 }, { "epoch": 1.7095475005922767, "grad_norm": 0.8783306154356906, "learning_rate": 5.418347134565249e-07, "loss": 0.7375, "step": 1804 }, { "epoch": 1.7095475005922767, "eval_loss": 0.9155663847923279, "eval_runtime": 61.2948, "eval_samples_per_second": 44.506, "eval_steps_per_second": 0.702, "step": 1804 }, { "epoch": 1.7104951433309643, "grad_norm": 0.9783675322880301, "learning_rate": 5.383639596210605e-07, "loss": 0.7133, "step": 1805 }, { "epoch": 1.7114427860696517, "grad_norm": 1.071623947559265, "learning_rate": 5.349037249814443e-07, "loss": 0.717, "step": 1806 }, { "epoch": 1.7123904288083391, "grad_norm": 0.9742720184084963, "learning_rate": 5.314540176958699e-07, "loss": 0.6707, "step": 1807 }, { "epoch": 1.7133380715470268, "grad_norm": 1.0237973061069328, "learning_rate": 5.28014845897708e-07, "loss": 0.6885, "step": 1808 }, { "epoch": 1.7142857142857144, "grad_norm": 1.1521501110479333, "learning_rate": 5.24586217695493e-07, "loss": 0.6501, "step": 1809 }, { "epoch": 1.7152333570244018, "grad_norm": 1.103148137037962, "learning_rate": 5.211681411728969e-07, "loss": 0.7074, "step": 1810 }, { "epoch": 1.7161809997630892, "grad_norm": 1.0912149190601321, "learning_rate": 5.177606243887184e-07, "loss": 0.6816, "step": 1811 }, { "epoch": 1.7171286425017769, "grad_norm": 1.1479638151703746, "learning_rate": 5.14363675376855e-07, "loss": 0.6941, "step": 1812 }, { "epoch": 1.7180762852404643, "grad_norm": 1.0169229566279356, "learning_rate": 5.109773021462921e-07, "loss": 0.6869, "step": 1813 }, { "epoch": 1.7190239279791517, "grad_norm": 1.0911518292945759, "learning_rate": 5.076015126810784e-07, "loss": 0.6936, "step": 1814 }, { "epoch": 1.7199715707178393, "grad_norm": 1.0143944326536474, "learning_rate": 5.042363149403106e-07, "loss": 0.6826, "step": 1815 }, { "epoch": 1.720919213456527, "grad_norm": 1.0145646424072496, "learning_rate": 5.008817168581137e-07, "loss": 0.738, "step": 1816 }, { "epoch": 1.7218668561952144, "grad_norm": 0.9897806551140146, "learning_rate": 4.975377263436193e-07, "loss": 0.702, "step": 1817 }, { "epoch": 1.7228144989339018, "grad_norm": 0.9854817267582501, "learning_rate": 4.94204351280953e-07, "loss": 0.7192, "step": 1818 }, { "epoch": 1.7237621416725895, "grad_norm": 1.5230326854807534, "learning_rate": 4.908815995292082e-07, "loss": 0.7293, "step": 1819 }, { "epoch": 1.724709784411277, "grad_norm": 1.2847042251851852, "learning_rate": 4.875694789224372e-07, "loss": 0.6911, "step": 1820 }, { "epoch": 1.7256574271499645, "grad_norm": 1.026112774551678, "learning_rate": 4.842679972696213e-07, "loss": 0.6836, "step": 1821 }, { "epoch": 1.726605069888652, "grad_norm": 1.0314353073227083, "learning_rate": 4.809771623546627e-07, "loss": 0.6813, "step": 1822 }, { "epoch": 1.7275527126273396, "grad_norm": 0.9609581306542341, "learning_rate": 4.776969819363614e-07, "loss": 0.7, "step": 1823 }, { "epoch": 1.728500355366027, "grad_norm": 1.0594554399314495, "learning_rate": 4.7442746374839363e-07, "loss": 0.6848, "step": 1824 }, { "epoch": 1.7294479981047144, "grad_norm": 1.0911056243232513, "learning_rate": 4.711686154993028e-07, "loss": 0.6629, "step": 1825 }, { "epoch": 1.730395640843402, "grad_norm": 1.0318006859508377, "learning_rate": 4.6792044487247003e-07, "loss": 0.6968, "step": 1826 }, { "epoch": 1.730395640843402, "eval_loss": 0.9146263003349304, "eval_runtime": 63.3709, "eval_samples_per_second": 43.048, "eval_steps_per_second": 0.679, "step": 1826 }, { "epoch": 1.7313432835820897, "grad_norm": 0.9085218539728935, "learning_rate": 4.646829595261071e-07, "loss": 0.6937, "step": 1827 }, { "epoch": 1.732290926320777, "grad_norm": 1.1715576080637178, "learning_rate": 4.614561670932288e-07, "loss": 0.7269, "step": 1828 }, { "epoch": 1.7332385690594645, "grad_norm": 1.1027251721128186, "learning_rate": 4.582400751816435e-07, "loss": 0.7023, "step": 1829 }, { "epoch": 1.7341862117981521, "grad_norm": 1.110203701042969, "learning_rate": 4.5503469137392565e-07, "loss": 0.6782, "step": 1830 }, { "epoch": 1.7351338545368398, "grad_norm": 1.208247198988513, "learning_rate": 4.5184002322740784e-07, "loss": 0.7379, "step": 1831 }, { "epoch": 1.736081497275527, "grad_norm": 0.9596889918326993, "learning_rate": 4.486560782741578e-07, "loss": 0.7485, "step": 1832 }, { "epoch": 1.7370291400142146, "grad_norm": 0.9856266219344122, "learning_rate": 4.454828640209574e-07, "loss": 0.7021, "step": 1833 }, { "epoch": 1.7379767827529022, "grad_norm": 1.0066120094983713, "learning_rate": 4.423203879492943e-07, "loss": 0.6334, "step": 1834 }, { "epoch": 1.7389244254915897, "grad_norm": 1.0174910309231802, "learning_rate": 4.3916865751533313e-07, "loss": 0.6737, "step": 1835 }, { "epoch": 1.739872068230277, "grad_norm": 1.090557902374621, "learning_rate": 4.360276801499086e-07, "loss": 0.6986, "step": 1836 }, { "epoch": 1.7408197109689647, "grad_norm": 0.9525400709898934, "learning_rate": 4.3289746325849924e-07, "loss": 0.6387, "step": 1837 }, { "epoch": 1.7417673537076523, "grad_norm": 0.9714172712407362, "learning_rate": 4.29778014221216e-07, "loss": 0.7426, "step": 1838 }, { "epoch": 1.7427149964463398, "grad_norm": 1.011041205364556, "learning_rate": 4.2666934039278017e-07, "loss": 0.7251, "step": 1839 }, { "epoch": 1.7436626391850272, "grad_norm": 1.0835244258679044, "learning_rate": 4.2357144910251003e-07, "loss": 0.7394, "step": 1840 }, { "epoch": 1.7446102819237148, "grad_norm": 0.9136527438313549, "learning_rate": 4.20484347654303e-07, "loss": 0.6833, "step": 1841 }, { "epoch": 1.7455579246624022, "grad_norm": 0.9394753418742889, "learning_rate": 4.1740804332661365e-07, "loss": 0.7183, "step": 1842 }, { "epoch": 1.7465055674010896, "grad_norm": 1.1163656927143548, "learning_rate": 4.1434254337244404e-07, "loss": 0.6688, "step": 1843 }, { "epoch": 1.7474532101397773, "grad_norm": 1.0767321655039874, "learning_rate": 4.1128785501931947e-07, "loss": 0.7301, "step": 1844 }, { "epoch": 1.748400852878465, "grad_norm": 1.091800385892591, "learning_rate": 4.0824398546927823e-07, "loss": 0.7628, "step": 1845 }, { "epoch": 1.7493484956171523, "grad_norm": 1.1875176453886316, "learning_rate": 4.05210941898847e-07, "loss": 0.7532, "step": 1846 }, { "epoch": 1.7502961383558397, "grad_norm": 0.9616131150389049, "learning_rate": 4.021887314590323e-07, "loss": 0.7407, "step": 1847 }, { "epoch": 1.7512437810945274, "grad_norm": 1.03865176835429, "learning_rate": 3.9917736127529525e-07, "loss": 0.7331, "step": 1848 }, { "epoch": 1.7512437810945274, "eval_loss": 0.914626955986023, "eval_runtime": 61.9304, "eval_samples_per_second": 44.049, "eval_steps_per_second": 0.694, "step": 1848 }, { "epoch": 1.752191423833215, "grad_norm": 1.023686798198027, "learning_rate": 3.9617683844754284e-07, "loss": 0.7311, "step": 1849 }, { "epoch": 1.7531390665719024, "grad_norm": 0.9592372092729335, "learning_rate": 3.9318717005010496e-07, "loss": 0.7405, "step": 1850 }, { "epoch": 1.7540867093105899, "grad_norm": 1.0255950736345492, "learning_rate": 3.902083631317194e-07, "loss": 0.6882, "step": 1851 }, { "epoch": 1.7550343520492775, "grad_norm": 1.0334498715269957, "learning_rate": 3.8724042471551925e-07, "loss": 0.6409, "step": 1852 }, { "epoch": 1.755981994787965, "grad_norm": 1.187230644417929, "learning_rate": 3.8428336179900773e-07, "loss": 0.687, "step": 1853 }, { "epoch": 1.7569296375266523, "grad_norm": 1.0889047565138557, "learning_rate": 3.8133718135405283e-07, "loss": 0.713, "step": 1854 }, { "epoch": 1.75787728026534, "grad_norm": 0.9442970597032486, "learning_rate": 3.784018903268588e-07, "loss": 0.6456, "step": 1855 }, { "epoch": 1.7588249230040276, "grad_norm": 0.9511254579043233, "learning_rate": 3.7547749563796144e-07, "loss": 0.675, "step": 1856 }, { "epoch": 1.759772565742715, "grad_norm": 1.07549297963586, "learning_rate": 3.725640041822026e-07, "loss": 0.7639, "step": 1857 }, { "epoch": 1.7607202084814024, "grad_norm": 1.2232858699290627, "learning_rate": 3.6966142282871873e-07, "loss": 0.738, "step": 1858 }, { "epoch": 1.76166785122009, "grad_norm": 0.977573869114317, "learning_rate": 3.667697584209251e-07, "loss": 0.6537, "step": 1859 }, { "epoch": 1.7626154939587777, "grad_norm": 1.0901080710066895, "learning_rate": 3.638890177764948e-07, "loss": 0.6607, "step": 1860 }, { "epoch": 1.763563136697465, "grad_norm": 1.0047668118955564, "learning_rate": 3.610192076873498e-07, "loss": 0.6992, "step": 1861 }, { "epoch": 1.7645107794361525, "grad_norm": 1.0810551846393102, "learning_rate": 3.581603349196372e-07, "loss": 0.7749, "step": 1862 }, { "epoch": 1.7654584221748402, "grad_norm": 1.2746674219972365, "learning_rate": 3.553124062137203e-07, "loss": 0.697, "step": 1863 }, { "epoch": 1.7664060649135276, "grad_norm": 1.0162449684395298, "learning_rate": 3.524754282841575e-07, "loss": 0.741, "step": 1864 }, { "epoch": 1.767353707652215, "grad_norm": 0.9261105845228721, "learning_rate": 3.49649407819691e-07, "loss": 0.6527, "step": 1865 }, { "epoch": 1.7683013503909026, "grad_norm": 1.0754817790075664, "learning_rate": 3.468343514832251e-07, "loss": 0.6518, "step": 1866 }, { "epoch": 1.7692489931295903, "grad_norm": 0.9487301149199158, "learning_rate": 3.440302659118172e-07, "loss": 0.7055, "step": 1867 }, { "epoch": 1.7701966358682777, "grad_norm": 0.9988153915506769, "learning_rate": 3.4123715771665786e-07, "loss": 0.6693, "step": 1868 }, { "epoch": 1.771144278606965, "grad_norm": 1.1376902119777852, "learning_rate": 3.3845503348305554e-07, "loss": 0.6901, "step": 1869 }, { "epoch": 1.7720919213456527, "grad_norm": 0.9271287842743362, "learning_rate": 3.356838997704226e-07, "loss": 0.6715, "step": 1870 }, { "epoch": 1.7720919213456527, "eval_loss": 0.9146553874015808, "eval_runtime": 65.9577, "eval_samples_per_second": 41.36, "eval_steps_per_second": 0.652, "step": 1870 }, { "epoch": 1.7730395640843402, "grad_norm": 1.114859990972608, "learning_rate": 3.3292376311225837e-07, "loss": 0.7206, "step": 1871 }, { "epoch": 1.7739872068230276, "grad_norm": 0.9957104184290693, "learning_rate": 3.3017463001613625e-07, "loss": 0.7175, "step": 1872 }, { "epoch": 1.7749348495617152, "grad_norm": 1.085353272133234, "learning_rate": 3.274365069636831e-07, "loss": 0.7183, "step": 1873 }, { "epoch": 1.7758824923004028, "grad_norm": 0.9620062890136332, "learning_rate": 3.247094004105711e-07, "loss": 0.6941, "step": 1874 }, { "epoch": 1.7768301350390903, "grad_norm": 1.0208277814345714, "learning_rate": 3.2199331678649804e-07, "loss": 0.6735, "step": 1875 }, { "epoch": 1.7777777777777777, "grad_norm": 1.085385982876263, "learning_rate": 3.1928826249516984e-07, "loss": 0.7081, "step": 1876 }, { "epoch": 1.7787254205164653, "grad_norm": 0.9829670594766973, "learning_rate": 3.165942439142927e-07, "loss": 0.6604, "step": 1877 }, { "epoch": 1.779673063255153, "grad_norm": 1.0546523994379018, "learning_rate": 3.1391126739555134e-07, "loss": 0.6916, "step": 1878 }, { "epoch": 1.7806207059938404, "grad_norm": 1.3474084051089132, "learning_rate": 3.112393392645985e-07, "loss": 0.7241, "step": 1879 }, { "epoch": 1.7815683487325278, "grad_norm": 1.2195465607727498, "learning_rate": 3.0857846582103504e-07, "loss": 0.7133, "step": 1880 }, { "epoch": 1.7825159914712154, "grad_norm": 1.0574688930127023, "learning_rate": 3.059286533384021e-07, "loss": 0.6827, "step": 1881 }, { "epoch": 1.7834636342099028, "grad_norm": 0.9882282740053548, "learning_rate": 3.0328990806415935e-07, "loss": 0.6634, "step": 1882 }, { "epoch": 1.7844112769485903, "grad_norm": 1.6186322230221908, "learning_rate": 3.006622362196748e-07, "loss": 0.681, "step": 1883 }, { "epoch": 1.785358919687278, "grad_norm": 0.9919895774024126, "learning_rate": 2.9804564400021e-07, "loss": 0.6462, "step": 1884 }, { "epoch": 1.7863065624259655, "grad_norm": 1.023431960722124, "learning_rate": 2.9544013757489944e-07, "loss": 0.6782, "step": 1885 }, { "epoch": 1.787254205164653, "grad_norm": 1.4703356457868497, "learning_rate": 2.92845723086746e-07, "loss": 0.7125, "step": 1886 }, { "epoch": 1.7882018479033404, "grad_norm": 1.044872020512602, "learning_rate": 2.9026240665259717e-07, "loss": 0.6705, "step": 1887 }, { "epoch": 1.789149490642028, "grad_norm": 1.0114171562637975, "learning_rate": 2.876901943631372e-07, "loss": 0.7051, "step": 1888 }, { "epoch": 1.7900971333807156, "grad_norm": 0.9319783340627764, "learning_rate": 2.8512909228286814e-07, "loss": 0.6933, "step": 1889 }, { "epoch": 1.7910447761194028, "grad_norm": 1.1744614079136493, "learning_rate": 2.8257910645009935e-07, "loss": 0.6957, "step": 1890 }, { "epoch": 1.7919924188580905, "grad_norm": 0.9991417606881348, "learning_rate": 2.8004024287692944e-07, "loss": 0.7323, "step": 1891 }, { "epoch": 1.792940061596778, "grad_norm": 1.0062578000109592, "learning_rate": 2.7751250754923574e-07, "loss": 0.6934, "step": 1892 }, { "epoch": 1.792940061596778, "eval_loss": 0.9148725867271423, "eval_runtime": 60.4722, "eval_samples_per_second": 45.112, "eval_steps_per_second": 0.711, "step": 1892 }, { "epoch": 1.7938877043354655, "grad_norm": 1.0646128012425906, "learning_rate": 2.7499590642665773e-07, "loss": 0.6725, "step": 1893 }, { "epoch": 1.794835347074153, "grad_norm": 0.9591060924803608, "learning_rate": 2.724904454425836e-07, "loss": 0.7088, "step": 1894 }, { "epoch": 1.7957829898128406, "grad_norm": 1.0631806918984852, "learning_rate": 2.699961305041382e-07, "loss": 0.698, "step": 1895 }, { "epoch": 1.7967306325515282, "grad_norm": 1.082828132414062, "learning_rate": 2.6751296749216395e-07, "loss": 0.6522, "step": 1896 }, { "epoch": 1.7976782752902156, "grad_norm": 1.3644267899875255, "learning_rate": 2.650409622612138e-07, "loss": 0.6988, "step": 1897 }, { "epoch": 1.798625918028903, "grad_norm": 1.1184995105345898, "learning_rate": 2.625801206395312e-07, "loss": 0.6482, "step": 1898 }, { "epoch": 1.7995735607675907, "grad_norm": 1.1815217570033627, "learning_rate": 2.6013044842904233e-07, "loss": 0.6416, "step": 1899 }, { "epoch": 1.800521203506278, "grad_norm": 0.9571223280821131, "learning_rate": 2.5769195140533556e-07, "loss": 0.7289, "step": 1900 }, { "epoch": 1.8014688462449655, "grad_norm": 0.9341994296803589, "learning_rate": 2.5526463531765467e-07, "loss": 0.6686, "step": 1901 }, { "epoch": 1.8024164889836531, "grad_norm": 0.9035497177092859, "learning_rate": 2.528485058888813e-07, "loss": 0.7046, "step": 1902 }, { "epoch": 1.8033641317223408, "grad_norm": 1.0743252934046903, "learning_rate": 2.5044356881552045e-07, "loss": 0.7197, "step": 1903 }, { "epoch": 1.8043117744610282, "grad_norm": 1.0079857675814266, "learning_rate": 2.4804982976769197e-07, "loss": 0.6867, "step": 1904 }, { "epoch": 1.8052594171997156, "grad_norm": 1.059517658637514, "learning_rate": 2.456672943891114e-07, "loss": 0.6749, "step": 1905 }, { "epoch": 1.8062070599384032, "grad_norm": 1.114382931333417, "learning_rate": 2.4329596829708145e-07, "loss": 0.6778, "step": 1906 }, { "epoch": 1.8071547026770909, "grad_norm": 1.0664257741407888, "learning_rate": 2.409358570824749e-07, "loss": 0.7155, "step": 1907 }, { "epoch": 1.8081023454157783, "grad_norm": 1.1085710156266713, "learning_rate": 2.385869663097251e-07, "loss": 0.712, "step": 1908 }, { "epoch": 1.8090499881544657, "grad_norm": 1.0500505818112218, "learning_rate": 2.362493015168088e-07, "loss": 0.657, "step": 1909 }, { "epoch": 1.8099976308931534, "grad_norm": 1.1320348644595801, "learning_rate": 2.3392286821523723e-07, "loss": 0.7458, "step": 1910 }, { "epoch": 1.8109452736318408, "grad_norm": 1.1363033352104308, "learning_rate": 2.316076718900412e-07, "loss": 0.7449, "step": 1911 }, { "epoch": 1.8118929163705282, "grad_norm": 0.9155284671345113, "learning_rate": 2.2930371799975593e-07, "loss": 0.6543, "step": 1912 }, { "epoch": 1.8128405591092158, "grad_norm": 1.0126916247727809, "learning_rate": 2.270110119764124e-07, "loss": 0.7248, "step": 1913 }, { "epoch": 1.8137882018479035, "grad_norm": 1.0272524735780668, "learning_rate": 2.2472955922552164e-07, "loss": 0.7114, "step": 1914 }, { "epoch": 1.8137882018479035, "eval_loss": 0.9147893786430359, "eval_runtime": 67.7847, "eval_samples_per_second": 40.245, "eval_steps_per_second": 0.634, "step": 1914 }, { "epoch": 1.8147358445865909, "grad_norm": 1.0985166320474906, "learning_rate": 2.2245936512606314e-07, "loss": 0.6455, "step": 1915 }, { "epoch": 1.8156834873252783, "grad_norm": 1.0448066285286555, "learning_rate": 2.202004350304715e-07, "loss": 0.6757, "step": 1916 }, { "epoch": 1.816631130063966, "grad_norm": 1.2157625524349163, "learning_rate": 2.179527742646248e-07, "loss": 0.6647, "step": 1917 }, { "epoch": 1.8175787728026536, "grad_norm": 0.9580609539987758, "learning_rate": 2.1571638812783125e-07, "loss": 0.6307, "step": 1918 }, { "epoch": 1.8185264155413408, "grad_norm": 1.0743648151551368, "learning_rate": 2.1349128189281587e-07, "loss": 0.7276, "step": 1919 }, { "epoch": 1.8194740582800284, "grad_norm": 1.1869777760221731, "learning_rate": 2.112774608057111e-07, "loss": 0.7087, "step": 1920 }, { "epoch": 1.820421701018716, "grad_norm": 1.0016495300786694, "learning_rate": 2.0907493008604007e-07, "loss": 0.6908, "step": 1921 }, { "epoch": 1.8213693437574034, "grad_norm": 1.1603016404098847, "learning_rate": 2.068836949267089e-07, "loss": 0.6766, "step": 1922 }, { "epoch": 1.8223169864960909, "grad_norm": 1.0187010201658582, "learning_rate": 2.0470376049398944e-07, "loss": 0.7093, "step": 1923 }, { "epoch": 1.8232646292347785, "grad_norm": 1.3203086559264363, "learning_rate": 2.0253513192751374e-07, "loss": 0.6744, "step": 1924 }, { "epoch": 1.8242122719734661, "grad_norm": 0.9700557122361928, "learning_rate": 2.003778143402535e-07, "loss": 0.6905, "step": 1925 }, { "epoch": 1.8251599147121536, "grad_norm": 1.3517239993246017, "learning_rate": 1.9823181281851513e-07, "loss": 0.6834, "step": 1926 }, { "epoch": 1.826107557450841, "grad_norm": 1.02906230765169, "learning_rate": 1.960971324219263e-07, "loss": 0.7265, "step": 1927 }, { "epoch": 1.8270552001895286, "grad_norm": 1.0585910384244444, "learning_rate": 1.9397377818341945e-07, "loss": 0.6877, "step": 1928 }, { "epoch": 1.828002842928216, "grad_norm": 0.9475218916837848, "learning_rate": 1.9186175510922666e-07, "loss": 0.7416, "step": 1929 }, { "epoch": 1.8289504856669034, "grad_norm": 1.0203159431279483, "learning_rate": 1.8976106817886197e-07, "loss": 0.714, "step": 1930 }, { "epoch": 1.829898128405591, "grad_norm": 1.127014657550726, "learning_rate": 1.876717223451141e-07, "loss": 0.7112, "step": 1931 }, { "epoch": 1.8308457711442787, "grad_norm": 1.353971401648652, "learning_rate": 1.8559372253403152e-07, "loss": 0.714, "step": 1932 }, { "epoch": 1.8317934138829661, "grad_norm": 1.0446112826008636, "learning_rate": 1.8352707364491352e-07, "loss": 0.6958, "step": 1933 }, { "epoch": 1.8327410566216535, "grad_norm": 1.0577797650889291, "learning_rate": 1.814717805502958e-07, "loss": 0.736, "step": 1934 }, { "epoch": 1.8336886993603412, "grad_norm": 1.2894072021725473, "learning_rate": 1.794278480959416e-07, "loss": 0.7035, "step": 1935 }, { "epoch": 1.8346363420990288, "grad_norm": 1.0041014130507704, "learning_rate": 1.7739528110083003e-07, "loss": 0.6661, "step": 1936 }, { "epoch": 1.8346363420990288, "eval_loss": 0.9146416783332825, "eval_runtime": 66.1092, "eval_samples_per_second": 41.265, "eval_steps_per_second": 0.65, "step": 1936 }, { "epoch": 1.8355839848377162, "grad_norm": 1.2009141696461565, "learning_rate": 1.7537408435714054e-07, "loss": 0.698, "step": 1937 }, { "epoch": 1.8365316275764036, "grad_norm": 1.0557423818039546, "learning_rate": 1.7336426263024896e-07, "loss": 0.6599, "step": 1938 }, { "epoch": 1.8374792703150913, "grad_norm": 1.1355253989528187, "learning_rate": 1.7136582065870876e-07, "loss": 0.7389, "step": 1939 }, { "epoch": 1.8384269130537787, "grad_norm": 0.947567680408928, "learning_rate": 1.6937876315424707e-07, "loss": 0.6902, "step": 1940 }, { "epoch": 1.8393745557924661, "grad_norm": 1.0469689776321616, "learning_rate": 1.6740309480174633e-07, "loss": 0.6955, "step": 1941 }, { "epoch": 1.8403221985311538, "grad_norm": 1.0956794304972344, "learning_rate": 1.6543882025923884e-07, "loss": 0.7019, "step": 1942 }, { "epoch": 1.8412698412698414, "grad_norm": 0.9499513224862992, "learning_rate": 1.6348594415789286e-07, "loss": 0.7197, "step": 1943 }, { "epoch": 1.8422174840085288, "grad_norm": 1.0290878955410225, "learning_rate": 1.6154447110200256e-07, "loss": 0.6963, "step": 1944 }, { "epoch": 1.8431651267472162, "grad_norm": 1.084819106183627, "learning_rate": 1.5961440566897913e-07, "loss": 0.6618, "step": 1945 }, { "epoch": 1.8441127694859039, "grad_norm": 0.9712359809714644, "learning_rate": 1.5769575240933422e-07, "loss": 0.7188, "step": 1946 }, { "epoch": 1.8450604122245915, "grad_norm": 1.053958386842615, "learning_rate": 1.5578851584667654e-07, "loss": 0.6487, "step": 1947 }, { "epoch": 1.8460080549632787, "grad_norm": 1.0254427828467692, "learning_rate": 1.5389270047769578e-07, "loss": 0.7443, "step": 1948 }, { "epoch": 1.8469556977019663, "grad_norm": 1.0100347718010834, "learning_rate": 1.520083107721543e-07, "loss": 0.7099, "step": 1949 }, { "epoch": 1.847903340440654, "grad_norm": 1.0591527642673004, "learning_rate": 1.5013535117287648e-07, "loss": 0.7101, "step": 1950 }, { "epoch": 1.8488509831793414, "grad_norm": 0.9196055199586443, "learning_rate": 1.482738260957378e-07, "loss": 0.6968, "step": 1951 }, { "epoch": 1.8497986259180288, "grad_norm": 1.082437628745268, "learning_rate": 1.4642373992965365e-07, "loss": 0.6848, "step": 1952 }, { "epoch": 1.8507462686567164, "grad_norm": 1.020089742858143, "learning_rate": 1.4458509703657197e-07, "loss": 0.7327, "step": 1953 }, { "epoch": 1.851693911395404, "grad_norm": 1.138043400664436, "learning_rate": 1.427579017514591e-07, "loss": 0.7166, "step": 1954 }, { "epoch": 1.8526415541340915, "grad_norm": 1.034045450725446, "learning_rate": 1.4094215838229176e-07, "loss": 0.7585, "step": 1955 }, { "epoch": 1.853589196872779, "grad_norm": 1.7484699081096364, "learning_rate": 1.3913787121004717e-07, "loss": 0.6699, "step": 1956 }, { "epoch": 1.8545368396114665, "grad_norm": 1.1518516482534196, "learning_rate": 1.3734504448869147e-07, "loss": 0.7528, "step": 1957 }, { "epoch": 1.855484482350154, "grad_norm": 1.2727867307662963, "learning_rate": 1.3556368244517116e-07, "loss": 0.7042, "step": 1958 }, { "epoch": 1.855484482350154, "eval_loss": 0.9145249128341675, "eval_runtime": 68.0932, "eval_samples_per_second": 40.063, "eval_steps_per_second": 0.631, "step": 1958 }, { "epoch": 1.8564321250888414, "grad_norm": 0.9279042520126958, "learning_rate": 1.3379378927940167e-07, "loss": 0.7096, "step": 1959 }, { "epoch": 1.857379767827529, "grad_norm": 0.8860547537873784, "learning_rate": 1.3203536916425842e-07, "loss": 0.6665, "step": 1960 }, { "epoch": 1.8583274105662166, "grad_norm": 0.9885824244176509, "learning_rate": 1.3028842624556893e-07, "loss": 0.6769, "step": 1961 }, { "epoch": 1.859275053304904, "grad_norm": 1.1914468020222337, "learning_rate": 1.2855296464209687e-07, "loss": 0.6548, "step": 1962 }, { "epoch": 1.8602226960435915, "grad_norm": 1.2643990216476162, "learning_rate": 1.2682898844554093e-07, "loss": 0.7257, "step": 1963 }, { "epoch": 1.861170338782279, "grad_norm": 0.9779836857682124, "learning_rate": 1.2511650172051636e-07, "loss": 0.6888, "step": 1964 }, { "epoch": 1.8621179815209667, "grad_norm": 0.9438092146007916, "learning_rate": 1.2341550850455353e-07, "loss": 0.6962, "step": 1965 }, { "epoch": 1.8630656242596542, "grad_norm": 1.24403617918052, "learning_rate": 1.217260128080816e-07, "loss": 0.733, "step": 1966 }, { "epoch": 1.8640132669983416, "grad_norm": 0.9517426709373272, "learning_rate": 1.2004801861442373e-07, "loss": 0.7037, "step": 1967 }, { "epoch": 1.8649609097370292, "grad_norm": 1.1778147961281975, "learning_rate": 1.183815298797858e-07, "loss": 0.7429, "step": 1968 }, { "epoch": 1.8659085524757166, "grad_norm": 1.104468634647898, "learning_rate": 1.1672655053324655e-07, "loss": 0.712, "step": 1969 }, { "epoch": 1.866856195214404, "grad_norm": 1.0425031543126493, "learning_rate": 1.1508308447674977e-07, "loss": 0.7324, "step": 1970 }, { "epoch": 1.8678038379530917, "grad_norm": 1.0608934372919678, "learning_rate": 1.1345113558509424e-07, "loss": 0.7224, "step": 1971 }, { "epoch": 1.8687514806917793, "grad_norm": 0.9785171863226142, "learning_rate": 1.1183070770592442e-07, "loss": 0.7362, "step": 1972 }, { "epoch": 1.8696991234304667, "grad_norm": 1.0041114112851635, "learning_rate": 1.1022180465972198e-07, "loss": 0.7145, "step": 1973 }, { "epoch": 1.8706467661691542, "grad_norm": 1.0344356033639341, "learning_rate": 1.0862443023979651e-07, "loss": 0.6638, "step": 1974 }, { "epoch": 1.8715944089078418, "grad_norm": 1.1440137970112205, "learning_rate": 1.0703858821227541e-07, "loss": 0.72, "step": 1975 }, { "epoch": 1.8725420516465292, "grad_norm": 1.011318166969755, "learning_rate": 1.0546428231609896e-07, "loss": 0.7001, "step": 1976 }, { "epoch": 1.8734896943852166, "grad_norm": 1.1534751759952744, "learning_rate": 1.0390151626300527e-07, "loss": 0.7046, "step": 1977 }, { "epoch": 1.8744373371239043, "grad_norm": 1.090411632798464, "learning_rate": 1.0235029373752758e-07, "loss": 0.6901, "step": 1978 }, { "epoch": 1.875384979862592, "grad_norm": 1.07336526868862, "learning_rate": 1.0081061839698259e-07, "loss": 0.723, "step": 1979 }, { "epoch": 1.8763326226012793, "grad_norm": 1.0079673427741604, "learning_rate": 9.928249387145983e-08, "loss": 0.6381, "step": 1980 }, { "epoch": 1.8763326226012793, "eval_loss": 0.9143509268760681, "eval_runtime": 64.728, "eval_samples_per_second": 42.146, "eval_steps_per_second": 0.664, "step": 1980 }, { "epoch": 1.8772802653399667, "grad_norm": 1.1089070458655792, "learning_rate": 9.776592376381955e-08, "loss": 0.684, "step": 1981 }, { "epoch": 1.8782279080786544, "grad_norm": 1.0295711500308624, "learning_rate": 9.626091164967599e-08, "loss": 0.6981, "step": 1982 }, { "epoch": 1.879175550817342, "grad_norm": 1.1306401873193062, "learning_rate": 9.476746107739577e-08, "loss": 0.6419, "step": 1983 }, { "epoch": 1.8801231935560294, "grad_norm": 1.0143892851114753, "learning_rate": 9.32855755680867e-08, "loss": 0.7298, "step": 1984 }, { "epoch": 1.8810708362947168, "grad_norm": 1.200139953828078, "learning_rate": 9.181525861558849e-08, "loss": 0.7129, "step": 1985 }, { "epoch": 1.8820184790334045, "grad_norm": 1.1828188126049186, "learning_rate": 9.035651368646647e-08, "loss": 0.6156, "step": 1986 }, { "epoch": 1.8829661217720919, "grad_norm": 0.9893546670383357, "learning_rate": 8.89093442200023e-08, "loss": 0.6687, "step": 1987 }, { "epoch": 1.8839137645107793, "grad_norm": 1.3094641319150206, "learning_rate": 8.747375362818667e-08, "loss": 0.6492, "step": 1988 }, { "epoch": 1.884861407249467, "grad_norm": 0.9994122322268034, "learning_rate": 8.604974529571042e-08, "loss": 0.6722, "step": 1989 }, { "epoch": 1.8858090499881546, "grad_norm": 1.0426017376290013, "learning_rate": 8.463732257995571e-08, "loss": 0.727, "step": 1990 }, { "epoch": 1.886756692726842, "grad_norm": 1.2180348697029497, "learning_rate": 8.323648881099211e-08, "loss": 0.7231, "step": 1991 }, { "epoch": 1.8877043354655294, "grad_norm": 1.0642272839244786, "learning_rate": 8.184724729156379e-08, "loss": 0.7435, "step": 1992 }, { "epoch": 1.888651978204217, "grad_norm": 0.9269037955466151, "learning_rate": 8.046960129708348e-08, "loss": 0.6886, "step": 1993 }, { "epoch": 1.8895996209429047, "grad_norm": 0.9229485938696481, "learning_rate": 7.910355407562742e-08, "loss": 0.7321, "step": 1994 }, { "epoch": 1.890547263681592, "grad_norm": 1.0976428013397825, "learning_rate": 7.774910884792319e-08, "loss": 0.7073, "step": 1995 }, { "epoch": 1.8914949064202795, "grad_norm": 0.9509585098963864, "learning_rate": 7.640626880734581e-08, "loss": 0.6638, "step": 1996 }, { "epoch": 1.8924425491589671, "grad_norm": 1.0689281848692378, "learning_rate": 7.507503711990771e-08, "loss": 0.6994, "step": 1997 }, { "epoch": 1.8933901918976546, "grad_norm": 0.9440594653903914, "learning_rate": 7.375541692425325e-08, "loss": 0.7163, "step": 1998 }, { "epoch": 1.894337834636342, "grad_norm": 1.231877807532716, "learning_rate": 7.244741133164979e-08, "loss": 0.7402, "step": 1999 }, { "epoch": 1.8952854773750296, "grad_norm": 1.021884925509466, "learning_rate": 7.115102342598101e-08, "loss": 0.7227, "step": 2000 }, { "epoch": 1.8962331201137173, "grad_norm": 1.003242355792749, "learning_rate": 6.986625626373978e-08, "loss": 0.7284, "step": 2001 }, { "epoch": 1.8971807628524047, "grad_norm": 1.0311894967892232, "learning_rate": 6.859311287402081e-08, "loss": 0.745, "step": 2002 }, { "epoch": 1.8971807628524047, "eval_loss": 0.9144185185432434, "eval_runtime": 63.7067, "eval_samples_per_second": 42.821, "eval_steps_per_second": 0.675, "step": 2002 }, { "epoch": 1.898128405591092, "grad_norm": 0.933859188399113, "learning_rate": 6.733159625851304e-08, "loss": 0.7088, "step": 2003 }, { "epoch": 1.8990760483297797, "grad_norm": 0.9503318061850076, "learning_rate": 6.608170939149283e-08, "loss": 0.666, "step": 2004 }, { "epoch": 1.9000236910684671, "grad_norm": 1.028500893291808, "learning_rate": 6.48434552198185e-08, "loss": 0.646, "step": 2005 }, { "epoch": 1.9009713338071546, "grad_norm": 1.1161543459809187, "learning_rate": 6.361683666291973e-08, "loss": 0.6776, "step": 2006 }, { "epoch": 1.9019189765458422, "grad_norm": 1.0620526873003586, "learning_rate": 6.240185661279541e-08, "loss": 0.7249, "step": 2007 }, { "epoch": 1.9028666192845298, "grad_norm": 1.209070343804771, "learning_rate": 6.119851793400188e-08, "loss": 0.6793, "step": 2008 }, { "epoch": 1.9038142620232172, "grad_norm": 0.9596501498455559, "learning_rate": 6.000682346365084e-08, "loss": 0.6838, "step": 2009 }, { "epoch": 1.9047619047619047, "grad_norm": 1.0459464382958554, "learning_rate": 5.882677601139919e-08, "loss": 0.6883, "step": 2010 }, { "epoch": 1.9057095475005923, "grad_norm": 1.3695482020370489, "learning_rate": 5.7658378359443104e-08, "loss": 0.721, "step": 2011 }, { "epoch": 1.90665719023928, "grad_norm": 1.056846758068541, "learning_rate": 5.6501633262513454e-08, "loss": 0.6852, "step": 2012 }, { "epoch": 1.9076048329779673, "grad_norm": 1.0176824003025227, "learning_rate": 5.535654344786756e-08, "loss": 0.6672, "step": 2013 }, { "epoch": 1.9085524757166548, "grad_norm": 1.186973736079735, "learning_rate": 5.4223111615281935e-08, "loss": 0.7098, "step": 2014 }, { "epoch": 1.9095001184553424, "grad_norm": 1.0942365940476848, "learning_rate": 5.310134043704895e-08, "loss": 0.7302, "step": 2015 }, { "epoch": 1.9104477611940298, "grad_norm": 1.1024694043360106, "learning_rate": 5.1991232557966344e-08, "loss": 0.689, "step": 2016 }, { "epoch": 1.9113954039327172, "grad_norm": 0.9659220513288693, "learning_rate": 5.089279059533658e-08, "loss": 0.6806, "step": 2017 }, { "epoch": 1.9123430466714049, "grad_norm": 1.0448744385892244, "learning_rate": 4.9806017138953053e-08, "loss": 0.7005, "step": 2018 }, { "epoch": 1.9132906894100925, "grad_norm": 1.2099073891780832, "learning_rate": 4.873091475110281e-08, "loss": 0.701, "step": 2019 }, { "epoch": 1.91423833214878, "grad_norm": 1.144622221873965, "learning_rate": 4.766748596655268e-08, "loss": 0.7041, "step": 2020 }, { "epoch": 1.9151859748874673, "grad_norm": 1.0628184982525557, "learning_rate": 4.66157332925482e-08, "loss": 0.6452, "step": 2021 }, { "epoch": 1.916133617626155, "grad_norm": 1.0240120660236798, "learning_rate": 4.55756592088058e-08, "loss": 0.6759, "step": 2022 }, { "epoch": 1.9170812603648426, "grad_norm": 1.0346055793091427, "learning_rate": 4.4547266167507264e-08, "loss": 0.6916, "step": 2023 }, { "epoch": 1.9180289031035298, "grad_norm": 1.0007416620575995, "learning_rate": 4.3530556593294194e-08, "loss": 0.704, "step": 2024 }, { "epoch": 1.9180289031035298, "eval_loss": 0.9143268465995789, "eval_runtime": 66.9808, "eval_samples_per_second": 40.728, "eval_steps_per_second": 0.642, "step": 2024 }, { "epoch": 1.9189765458422174, "grad_norm": 1.1078567701211084, "learning_rate": 4.2525532883261886e-08, "loss": 0.6868, "step": 2025 }, { "epoch": 1.919924188580905, "grad_norm": 1.107371658240934, "learning_rate": 4.1532197406954357e-08, "loss": 0.6794, "step": 2026 }, { "epoch": 1.9208718313195925, "grad_norm": 1.1152223254534468, "learning_rate": 4.0550552506357646e-08, "loss": 0.6733, "step": 2027 }, { "epoch": 1.92181947405828, "grad_norm": 0.9814552628881259, "learning_rate": 3.958060049589485e-08, "loss": 0.6746, "step": 2028 }, { "epoch": 1.9227671167969675, "grad_norm": 1.1148524114158356, "learning_rate": 3.862234366242168e-08, "loss": 0.6809, "step": 2029 }, { "epoch": 1.9237147595356552, "grad_norm": 1.1264125552667186, "learning_rate": 3.767578426521923e-08, "loss": 0.6624, "step": 2030 }, { "epoch": 1.9246624022743426, "grad_norm": 0.8821749884652544, "learning_rate": 3.674092453598954e-08, "loss": 0.7492, "step": 2031 }, { "epoch": 1.92561004501303, "grad_norm": 0.970645505659204, "learning_rate": 3.581776667885062e-08, "loss": 0.6561, "step": 2032 }, { "epoch": 1.9265576877517177, "grad_norm": 1.1049159464316436, "learning_rate": 3.4906312870331973e-08, "loss": 0.7361, "step": 2033 }, { "epoch": 1.927505330490405, "grad_norm": 1.0950647602834038, "learning_rate": 3.40065652593663e-08, "loss": 0.7006, "step": 2034 }, { "epoch": 1.9284529732290925, "grad_norm": 1.0452185514606323, "learning_rate": 3.311852596728948e-08, "loss": 0.7276, "step": 2035 }, { "epoch": 1.9294006159677801, "grad_norm": 1.1747706021025806, "learning_rate": 3.2242197087828944e-08, "loss": 0.691, "step": 2036 }, { "epoch": 1.9303482587064678, "grad_norm": 1.2178046074293865, "learning_rate": 3.137758068710694e-08, "loss": 0.6611, "step": 2037 }, { "epoch": 1.9312959014451552, "grad_norm": 0.9619015695423282, "learning_rate": 3.052467880362675e-08, "loss": 0.6696, "step": 2038 }, { "epoch": 1.9322435441838426, "grad_norm": 1.157318529310348, "learning_rate": 2.9683493448275925e-08, "loss": 0.681, "step": 2039 }, { "epoch": 1.9331911869225302, "grad_norm": 1.0061721079680708, "learning_rate": 2.8854026604315798e-08, "loss": 0.6822, "step": 2040 }, { "epoch": 1.9341388296612179, "grad_norm": 1.0526406685824896, "learning_rate": 2.8036280227379808e-08, "loss": 0.6901, "step": 2041 }, { "epoch": 1.9350864723999053, "grad_norm": 0.968442328631709, "learning_rate": 2.723025624546849e-08, "loss": 0.6801, "step": 2042 }, { "epoch": 1.9360341151385927, "grad_norm": 0.9496600506704314, "learning_rate": 2.6435956558943375e-08, "loss": 0.7094, "step": 2043 }, { "epoch": 1.9369817578772803, "grad_norm": 0.9299456148661794, "learning_rate": 2.5653383040524228e-08, "loss": 0.6135, "step": 2044 }, { "epoch": 1.9379294006159677, "grad_norm": 0.9516242679649526, "learning_rate": 2.488253753528458e-08, "loss": 0.7323, "step": 2045 }, { "epoch": 1.9388770433546552, "grad_norm": 1.076415703445146, "learning_rate": 2.4123421860645646e-08, "loss": 0.7032, "step": 2046 }, { "epoch": 1.9388770433546552, "eval_loss": 0.9144385457038879, "eval_runtime": 67.2, "eval_samples_per_second": 40.595, "eval_steps_per_second": 0.64, "step": 2046 }, { "epoch": 1.9398246860933428, "grad_norm": 2.5186162393004623, "learning_rate": 2.3376037806374097e-08, "loss": 0.714, "step": 2047 }, { "epoch": 1.9407723288320304, "grad_norm": 1.008486947731255, "learning_rate": 2.264038713457706e-08, "loss": 0.6927, "step": 2048 }, { "epoch": 1.9417199715707179, "grad_norm": 0.9617322705423318, "learning_rate": 2.1916471579697117e-08, "loss": 0.691, "step": 2049 }, { "epoch": 1.9426676143094053, "grad_norm": 0.9939767442168306, "learning_rate": 2.1204292848509557e-08, "loss": 0.7493, "step": 2050 }, { "epoch": 1.943615257048093, "grad_norm": 0.9853441684157483, "learning_rate": 2.050385262011789e-08, "loss": 0.6878, "step": 2051 }, { "epoch": 1.9445628997867805, "grad_norm": 1.0326884095472177, "learning_rate": 1.98151525459489e-08, "loss": 0.7353, "step": 2052 }, { "epoch": 1.9455105425254677, "grad_norm": 1.0951236722722288, "learning_rate": 1.9138194249750386e-08, "loss": 0.6593, "step": 2053 }, { "epoch": 1.9464581852641554, "grad_norm": 1.1769494445073123, "learning_rate": 1.8472979327587292e-08, "loss": 0.6544, "step": 2054 }, { "epoch": 1.947405828002843, "grad_norm": 1.0395960653177918, "learning_rate": 1.781950934783505e-08, "loss": 0.7335, "step": 2055 }, { "epoch": 1.9483534707415304, "grad_norm": 1.0211404503318975, "learning_rate": 1.7177785851180127e-08, "loss": 0.7096, "step": 2056 }, { "epoch": 1.9493011134802178, "grad_norm": 1.2278758807843733, "learning_rate": 1.654781035061337e-08, "loss": 0.6919, "step": 2057 }, { "epoch": 1.9502487562189055, "grad_norm": 1.1176872580144226, "learning_rate": 1.5929584331427218e-08, "loss": 0.6904, "step": 2058 }, { "epoch": 1.9511963989575931, "grad_norm": 1.2525283554104196, "learning_rate": 1.532310925121294e-08, "loss": 0.733, "step": 2059 }, { "epoch": 1.9521440416962805, "grad_norm": 1.0083196464834074, "learning_rate": 1.4728386539856754e-08, "loss": 0.6684, "step": 2060 }, { "epoch": 1.953091684434968, "grad_norm": 0.9670996702496595, "learning_rate": 1.4145417599534805e-08, "loss": 0.6882, "step": 2061 }, { "epoch": 1.9540393271736556, "grad_norm": 2.2195094052226754, "learning_rate": 1.3574203804713748e-08, "loss": 0.7289, "step": 2062 }, { "epoch": 1.954986969912343, "grad_norm": 1.0493072981480793, "learning_rate": 1.3014746502142962e-08, "loss": 0.7345, "step": 2063 }, { "epoch": 1.9559346126510304, "grad_norm": 1.1402015478284233, "learning_rate": 1.2467047010855659e-08, "loss": 0.7084, "step": 2064 }, { "epoch": 1.956882255389718, "grad_norm": 1.028808624981634, "learning_rate": 1.1931106622161127e-08, "loss": 0.7665, "step": 2065 }, { "epoch": 1.9578298981284057, "grad_norm": 0.9887477552469968, "learning_rate": 1.1406926599646373e-08, "loss": 0.7191, "step": 2066 }, { "epoch": 1.958777540867093, "grad_norm": 1.093798325618811, "learning_rate": 1.0894508179170038e-08, "loss": 0.6972, "step": 2067 }, { "epoch": 1.9597251836057805, "grad_norm": 0.9804042235247051, "learning_rate": 1.0393852568860718e-08, "loss": 0.7328, "step": 2068 }, { "epoch": 1.9597251836057805, "eval_loss": 0.914358913898468, "eval_runtime": 68.5205, "eval_samples_per_second": 39.813, "eval_steps_per_second": 0.628, "step": 2068 }, { "epoch": 1.9606728263444682, "grad_norm": 1.2343380039639555, "learning_rate": 9.904960949114195e-09, "loss": 0.7172, "step": 2069 }, { "epoch": 1.9616204690831558, "grad_norm": 1.0811426709912628, "learning_rate": 9.427834472588992e-09, "loss": 0.721, "step": 2070 }, { "epoch": 1.9625681118218432, "grad_norm": 1.3595974625306733, "learning_rate": 8.962474264206378e-09, "loss": 0.7008, "step": 2071 }, { "epoch": 1.9635157545605306, "grad_norm": 1.0348382536874432, "learning_rate": 8.508881421145366e-09, "loss": 0.7039, "step": 2072 }, { "epoch": 1.9644633972992183, "grad_norm": 1.2153800966637154, "learning_rate": 8.067057012842161e-09, "loss": 0.7865, "step": 2073 }, { "epoch": 1.9654110400379057, "grad_norm": 0.9289644211293113, "learning_rate": 7.637002080985167e-09, "loss": 0.7485, "step": 2074 }, { "epoch": 1.966358682776593, "grad_norm": 0.9661916287589752, "learning_rate": 7.218717639514983e-09, "loss": 0.666, "step": 2075 }, { "epoch": 1.9673063255152807, "grad_norm": 0.9787686202940714, "learning_rate": 6.81220467461996e-09, "loss": 0.7334, "step": 2076 }, { "epoch": 1.9682539682539684, "grad_norm": 0.9964620209319142, "learning_rate": 6.417464144736208e-09, "loss": 0.67, "step": 2077 }, { "epoch": 1.9692016109926558, "grad_norm": 1.030464390987384, "learning_rate": 6.034496980542037e-09, "loss": 0.705, "step": 2078 }, { "epoch": 1.9701492537313432, "grad_norm": 1.1369647103723564, "learning_rate": 5.6633040849601865e-09, "loss": 0.7031, "step": 2079 }, { "epoch": 1.9710968964700308, "grad_norm": 1.17782549137388, "learning_rate": 5.303886333151154e-09, "loss": 0.7033, "step": 2080 }, { "epoch": 1.9720445392087185, "grad_norm": 1.123167425449723, "learning_rate": 4.956244572513203e-09, "loss": 0.7173, "step": 2081 }, { "epoch": 1.9729921819474057, "grad_norm": 0.9688837536293665, "learning_rate": 4.620379622682358e-09, "loss": 0.7403, "step": 2082 }, { "epoch": 1.9739398246860933, "grad_norm": 1.234375631478768, "learning_rate": 4.296292275526859e-09, "loss": 0.6677, "step": 2083 }, { "epoch": 1.974887467424781, "grad_norm": 1.0712274738868512, "learning_rate": 3.983983295146599e-09, "loss": 0.68, "step": 2084 }, { "epoch": 1.9758351101634684, "grad_norm": 1.0128865650855263, "learning_rate": 3.6834534178725734e-09, "loss": 0.7062, "step": 2085 }, { "epoch": 1.9767827529021558, "grad_norm": 1.1056609912623454, "learning_rate": 3.394703352263551e-09, "loss": 0.6977, "step": 2086 }, { "epoch": 1.9777303956408434, "grad_norm": 0.9556835224953595, "learning_rate": 3.117733779105514e-09, "loss": 0.7046, "step": 2087 }, { "epoch": 1.978678038379531, "grad_norm": 0.9749594888727722, "learning_rate": 2.8525453514099966e-09, "loss": 0.6734, "step": 2088 }, { "epoch": 1.9796256811182185, "grad_norm": 1.0169544561219785, "learning_rate": 2.5991386944107524e-09, "loss": 0.7337, "step": 2089 }, { "epoch": 1.9805733238569059, "grad_norm": 1.0181317147660969, "learning_rate": 2.3575144055643094e-09, "loss": 0.6938, "step": 2090 }, { "epoch": 1.9805733238569059, "eval_loss": 0.914404034614563, "eval_runtime": 63.6123, "eval_samples_per_second": 42.885, "eval_steps_per_second": 0.676, "step": 2090 }, { "epoch": 1.9815209665955935, "grad_norm": 1.1555804337504532, "learning_rate": 2.1276730545488623e-09, "loss": 0.6347, "step": 2091 }, { "epoch": 1.982468609334281, "grad_norm": 1.1894163816979972, "learning_rate": 1.9096151832609378e-09, "loss": 0.7038, "step": 2092 }, { "epoch": 1.9834162520729683, "grad_norm": 1.045940279368652, "learning_rate": 1.703341305815398e-09, "loss": 0.7356, "step": 2093 }, { "epoch": 1.984363894811656, "grad_norm": 1.016015756301957, "learning_rate": 1.5088519085437736e-09, "loss": 0.6858, "step": 2094 }, { "epoch": 1.9853115375503436, "grad_norm": 1.0947801703603017, "learning_rate": 1.326147449993709e-09, "loss": 0.6883, "step": 2095 }, { "epoch": 1.986259180289031, "grad_norm": 0.960911476029797, "learning_rate": 1.1552283609272962e-09, "loss": 0.6925, "step": 2096 }, { "epoch": 1.9872068230277184, "grad_norm": 0.9695849235179069, "learning_rate": 9.96095044320522e-10, "loss": 0.7557, "step": 2097 }, { "epoch": 1.988154465766406, "grad_norm": 0.9926261452237545, "learning_rate": 8.487478753615997e-10, "loss": 0.6702, "step": 2098 }, { "epoch": 1.9891021085050937, "grad_norm": 0.9680825511217606, "learning_rate": 7.131872014509711e-10, "loss": 0.6212, "step": 2099 }, { "epoch": 1.9900497512437811, "grad_norm": 1.1257192398766538, "learning_rate": 5.894133422001957e-10, "loss": 0.7271, "step": 2100 }, { "epoch": 1.9909973939824686, "grad_norm": 1.0549297797609254, "learning_rate": 4.774265894302854e-10, "loss": 0.6968, "step": 2101 }, { "epoch": 1.9919450367211562, "grad_norm": 1.0063870271651199, "learning_rate": 3.772272071722594e-10, "loss": 0.7127, "step": 2102 }, { "epoch": 1.9928926794598436, "grad_norm": 0.9955451047912606, "learning_rate": 2.888154316671443e-10, "loss": 0.6535, "step": 2103 }, { "epoch": 1.993840322198531, "grad_norm": 1.0303767113264195, "learning_rate": 2.1219147136264383e-10, "loss": 0.7685, "step": 2104 }, { "epoch": 1.9947879649372187, "grad_norm": 0.9819925019424903, "learning_rate": 1.473555069148036e-10, "loss": 0.6488, "step": 2105 }, { "epoch": 1.9957356076759063, "grad_norm": 1.7129723621352664, "learning_rate": 9.43076911874563e-11, "loss": 0.7282, "step": 2106 }, { "epoch": 1.9966832504145937, "grad_norm": 1.0109896100493034, "learning_rate": 5.3048149251111456e-11, "loss": 0.6369, "step": 2107 }, { "epoch": 1.9976308931532811, "grad_norm": 0.9652839280997483, "learning_rate": 2.3576978384065585e-11, "loss": 0.731, "step": 2108 }, { "epoch": 1.9985785358919688, "grad_norm": 1.1017969508811174, "learning_rate": 5.8942480701817965e-12, "loss": 0.6985, "step": 2109 }, { "epoch": 1.9995261786306564, "grad_norm": 1.030266403397907, "learning_rate": 0.0, "loss": 0.6768, "step": 2110 }, { "epoch": 1.9995261786306564, "step": 2110, "total_flos": 7068238416445440.0, "train_loss": 0.820082382003278, "train_runtime": 57795.8559, "train_samples_per_second": 9.348, "train_steps_per_second": 0.037 } ], "logging_steps": 1.0, "max_steps": 2110, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 7068238416445440.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }