diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15560 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9995261786306564, + "eval_steps": 22, + "global_step": 2110, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009476427386875149, + "grad_norm": 20.756122057594123, + "learning_rate": 1.5625e-07, + "loss": 1.6431, + "step": 1 + }, + { + "epoch": 0.0018952854773750297, + "grad_norm": 17.596939119407057, + "learning_rate": 3.125e-07, + "loss": 1.5603, + "step": 2 + }, + { + "epoch": 0.0028429282160625444, + "grad_norm": 24.550176754105873, + "learning_rate": 4.6875000000000006e-07, + "loss": 1.695, + "step": 3 + }, + { + "epoch": 0.0037905709547500594, + "grad_norm": 23.98623237019658, + "learning_rate": 6.25e-07, + "loss": 1.7187, + "step": 4 + }, + { + "epoch": 0.004738213693437574, + "grad_norm": 40.536528584444476, + "learning_rate": 7.8125e-07, + "loss": 1.6799, + "step": 5 + }, + { + "epoch": 0.005685856432125089, + "grad_norm": 30.04208597312355, + "learning_rate": 9.375000000000001e-07, + "loss": 1.7002, + "step": 6 + }, + { + "epoch": 0.006633499170812604, + "grad_norm": 18.07515828827753, + "learning_rate": 1.0937500000000001e-06, + "loss": 1.5672, + "step": 7 + }, + { + "epoch": 0.007581141909500119, + "grad_norm": 33.28181380236923, + "learning_rate": 1.25e-06, + "loss": 1.7278, + "step": 8 + }, + { + "epoch": 0.008528784648187633, + "grad_norm": 16.829151356577462, + "learning_rate": 1.40625e-06, + "loss": 1.7151, + "step": 9 + }, + { + "epoch": 0.009476427386875147, + "grad_norm": 17.773159683100857, + "learning_rate": 1.5625e-06, + "loss": 1.5353, + "step": 10 + }, + { + "epoch": 0.010424070125562663, + "grad_norm": 18.419114948105452, + "learning_rate": 1.71875e-06, + "loss": 1.6073, + "step": 11 + }, + { + "epoch": 0.011371712864250177, + "grad_norm": 13.6273956773613, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.5124, + "step": 12 + }, + { + "epoch": 0.012319355602937692, + "grad_norm": 14.235281012395534, + "learning_rate": 2.0312500000000002e-06, + "loss": 1.4477, + "step": 13 + }, + { + "epoch": 0.013266998341625208, + "grad_norm": 8.20052746448492, + "learning_rate": 2.1875000000000002e-06, + "loss": 1.4623, + "step": 14 + }, + { + "epoch": 0.014214641080312722, + "grad_norm": 11.658169078611403, + "learning_rate": 2.3437500000000002e-06, + "loss": 1.4695, + "step": 15 + }, + { + "epoch": 0.015162283819000238, + "grad_norm": 6.911597435569106, + "learning_rate": 2.5e-06, + "loss": 1.4164, + "step": 16 + }, + { + "epoch": 0.01610992655768775, + "grad_norm": 6.514587154523329, + "learning_rate": 2.65625e-06, + "loss": 1.3714, + "step": 17 + }, + { + "epoch": 0.017057569296375266, + "grad_norm": 7.470333408959338, + "learning_rate": 2.8125e-06, + "loss": 1.4672, + "step": 18 + }, + { + "epoch": 0.018005212035062782, + "grad_norm": 11.59118534986973, + "learning_rate": 2.96875e-06, + "loss": 1.3828, + "step": 19 + }, + { + "epoch": 0.018952854773750295, + "grad_norm": 3.9301638513755988, + "learning_rate": 3.125e-06, + "loss": 1.4137, + "step": 20 + }, + { + "epoch": 0.01990049751243781, + "grad_norm": 4.6357203859292495, + "learning_rate": 3.28125e-06, + "loss": 1.3915, + "step": 21 + }, + { + "epoch": 0.020848140251125327, + "grad_norm": 4.530214142494876, + "learning_rate": 3.4375e-06, + "loss": 1.3658, + "step": 22 + }, + { + "epoch": 0.020848140251125327, + "eval_loss": 1.2158129215240479, + "eval_runtime": 60.2576, + "eval_samples_per_second": 45.272, + "eval_steps_per_second": 0.714, + "step": 22 + }, + { + "epoch": 0.02179578298981284, + "grad_norm": 3.7036600794639396, + "learning_rate": 3.59375e-06, + "loss": 1.3058, + "step": 23 + }, + { + "epoch": 0.022743425728500355, + "grad_norm": 4.229128089681862, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.2303, + "step": 24 + }, + { + "epoch": 0.02369106846718787, + "grad_norm": 5.376319236600739, + "learning_rate": 3.90625e-06, + "loss": 1.2898, + "step": 25 + }, + { + "epoch": 0.024638711205875383, + "grad_norm": 3.621037119144236, + "learning_rate": 4.0625000000000005e-06, + "loss": 1.2614, + "step": 26 + }, + { + "epoch": 0.0255863539445629, + "grad_norm": 3.7708968075921865, + "learning_rate": 4.21875e-06, + "loss": 1.3618, + "step": 27 + }, + { + "epoch": 0.026533996683250415, + "grad_norm": 4.5895243093772535, + "learning_rate": 4.3750000000000005e-06, + "loss": 1.2304, + "step": 28 + }, + { + "epoch": 0.027481639421937928, + "grad_norm": 10.299410261479563, + "learning_rate": 4.53125e-06, + "loss": 1.2345, + "step": 29 + }, + { + "epoch": 0.028429282160625444, + "grad_norm": 3.8644074526148184, + "learning_rate": 4.6875000000000004e-06, + "loss": 1.2523, + "step": 30 + }, + { + "epoch": 0.02937692489931296, + "grad_norm": 3.547834365401974, + "learning_rate": 4.84375e-06, + "loss": 1.2344, + "step": 31 + }, + { + "epoch": 0.030324567638000476, + "grad_norm": 3.578798181550234, + "learning_rate": 5e-06, + "loss": 1.2818, + "step": 32 + }, + { + "epoch": 0.03127221037668799, + "grad_norm": 3.711700954572684, + "learning_rate": 5.156250000000001e-06, + "loss": 1.2483, + "step": 33 + }, + { + "epoch": 0.0322198531153755, + "grad_norm": 3.5746276017801537, + "learning_rate": 5.3125e-06, + "loss": 1.2565, + "step": 34 + }, + { + "epoch": 0.03316749585406302, + "grad_norm": 3.7129697591402016, + "learning_rate": 5.468750000000001e-06, + "loss": 1.2261, + "step": 35 + }, + { + "epoch": 0.03411513859275053, + "grad_norm": 3.1701938959510656, + "learning_rate": 5.625e-06, + "loss": 1.1836, + "step": 36 + }, + { + "epoch": 0.035062781331438045, + "grad_norm": 3.081675212149766, + "learning_rate": 5.781250000000001e-06, + "loss": 1.1683, + "step": 37 + }, + { + "epoch": 0.036010424070125564, + "grad_norm": 4.351693679221342, + "learning_rate": 5.9375e-06, + "loss": 1.2133, + "step": 38 + }, + { + "epoch": 0.03695806680881308, + "grad_norm": 3.1336691817253204, + "learning_rate": 6.093750000000001e-06, + "loss": 1.1948, + "step": 39 + }, + { + "epoch": 0.03790570954750059, + "grad_norm": 2.703982626093151, + "learning_rate": 6.25e-06, + "loss": 1.1376, + "step": 40 + }, + { + "epoch": 0.03885335228618811, + "grad_norm": 3.003118804501732, + "learning_rate": 6.406250000000001e-06, + "loss": 1.2059, + "step": 41 + }, + { + "epoch": 0.03980099502487562, + "grad_norm": 3.3721112860961577, + "learning_rate": 6.5625e-06, + "loss": 1.2294, + "step": 42 + }, + { + "epoch": 0.040748637763563134, + "grad_norm": 2.935148387991293, + "learning_rate": 6.718750000000001e-06, + "loss": 1.2948, + "step": 43 + }, + { + "epoch": 0.04169628050225065, + "grad_norm": 2.7546718703597, + "learning_rate": 6.875e-06, + "loss": 1.1058, + "step": 44 + }, + { + "epoch": 0.04169628050225065, + "eval_loss": 1.1239995956420898, + "eval_runtime": 62.5263, + "eval_samples_per_second": 43.63, + "eval_steps_per_second": 0.688, + "step": 44 + }, + { + "epoch": 0.042643923240938165, + "grad_norm": 3.2372160639143885, + "learning_rate": 7.031250000000001e-06, + "loss": 1.187, + "step": 45 + }, + { + "epoch": 0.04359156597962568, + "grad_norm": 3.3104832856910233, + "learning_rate": 7.1875e-06, + "loss": 1.1547, + "step": 46 + }, + { + "epoch": 0.0445392087183132, + "grad_norm": 2.9630493187419096, + "learning_rate": 7.343750000000001e-06, + "loss": 1.205, + "step": 47 + }, + { + "epoch": 0.04548685145700071, + "grad_norm": 2.8169766087618537, + "learning_rate": 7.500000000000001e-06, + "loss": 1.1583, + "step": 48 + }, + { + "epoch": 0.04643449419568822, + "grad_norm": 3.0223679686127736, + "learning_rate": 7.656250000000001e-06, + "loss": 1.1546, + "step": 49 + }, + { + "epoch": 0.04738213693437574, + "grad_norm": 2.9245386601496417, + "learning_rate": 7.8125e-06, + "loss": 1.0963, + "step": 50 + }, + { + "epoch": 0.048329779673063254, + "grad_norm": 3.3416755825594207, + "learning_rate": 7.96875e-06, + "loss": 1.0911, + "step": 51 + }, + { + "epoch": 0.04927742241175077, + "grad_norm": 3.2146723217948754, + "learning_rate": 8.125000000000001e-06, + "loss": 1.1444, + "step": 52 + }, + { + "epoch": 0.050225065150438286, + "grad_norm": 3.7591880901694688, + "learning_rate": 8.281250000000001e-06, + "loss": 1.1644, + "step": 53 + }, + { + "epoch": 0.0511727078891258, + "grad_norm": 3.597908493062599, + "learning_rate": 8.4375e-06, + "loss": 1.1245, + "step": 54 + }, + { + "epoch": 0.05212035062781331, + "grad_norm": 3.499015413751106, + "learning_rate": 8.59375e-06, + "loss": 1.172, + "step": 55 + }, + { + "epoch": 0.05306799336650083, + "grad_norm": 3.309932625198402, + "learning_rate": 8.750000000000001e-06, + "loss": 1.1184, + "step": 56 + }, + { + "epoch": 0.05401563610518834, + "grad_norm": 3.2523198848476125, + "learning_rate": 8.906250000000001e-06, + "loss": 1.1571, + "step": 57 + }, + { + "epoch": 0.054963278843875855, + "grad_norm": 3.2980910638210545, + "learning_rate": 9.0625e-06, + "loss": 1.132, + "step": 58 + }, + { + "epoch": 0.055910921582563375, + "grad_norm": 3.1322189199908053, + "learning_rate": 9.21875e-06, + "loss": 1.0936, + "step": 59 + }, + { + "epoch": 0.05685856432125089, + "grad_norm": 3.4181316461149884, + "learning_rate": 9.375000000000001e-06, + "loss": 1.1518, + "step": 60 + }, + { + "epoch": 0.0578062070599384, + "grad_norm": 3.238262923289073, + "learning_rate": 9.531250000000001e-06, + "loss": 1.0584, + "step": 61 + }, + { + "epoch": 0.05875384979862592, + "grad_norm": 3.1080324386613274, + "learning_rate": 9.6875e-06, + "loss": 1.1286, + "step": 62 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 3.181648226621564, + "learning_rate": 9.84375e-06, + "loss": 1.0876, + "step": 63 + }, + { + "epoch": 0.06064913527600095, + "grad_norm": 2.937644097353525, + "learning_rate": 1e-05, + "loss": 1.1379, + "step": 64 + }, + { + "epoch": 0.061596778014688464, + "grad_norm": 2.958067762883821, + "learning_rate": 9.99999410575193e-06, + "loss": 1.1133, + "step": 65 + }, + { + "epoch": 0.06254442075337598, + "grad_norm": 2.96685444746321, + "learning_rate": 9.999976423021617e-06, + "loss": 1.1313, + "step": 66 + }, + { + "epoch": 0.06254442075337598, + "eval_loss": 1.0586260557174683, + "eval_runtime": 62.0231, + "eval_samples_per_second": 43.984, + "eval_steps_per_second": 0.693, + "step": 66 + }, + { + "epoch": 0.06349206349206349, + "grad_norm": 3.566526642424616, + "learning_rate": 9.99994695185075e-06, + "loss": 1.1156, + "step": 67 + }, + { + "epoch": 0.064439706230751, + "grad_norm": 3.220513610820652, + "learning_rate": 9.999905692308813e-06, + "loss": 1.0942, + "step": 68 + }, + { + "epoch": 0.06538734896943853, + "grad_norm": 2.545710751356861, + "learning_rate": 9.999852644493086e-06, + "loss": 1.0751, + "step": 69 + }, + { + "epoch": 0.06633499170812604, + "grad_norm": 3.3933596790334772, + "learning_rate": 9.999787808528639e-06, + "loss": 1.1213, + "step": 70 + }, + { + "epoch": 0.06728263444681355, + "grad_norm": 2.9414176412578303, + "learning_rate": 9.999711184568334e-06, + "loss": 1.0759, + "step": 71 + }, + { + "epoch": 0.06823027718550106, + "grad_norm": 2.8170462064724937, + "learning_rate": 9.999622772792829e-06, + "loss": 1.0679, + "step": 72 + }, + { + "epoch": 0.06917791992418858, + "grad_norm": 2.82044984183835, + "learning_rate": 9.99952257341057e-06, + "loss": 1.0539, + "step": 73 + }, + { + "epoch": 0.07012556266287609, + "grad_norm": 2.6332325002107333, + "learning_rate": 9.999410586657801e-06, + "loss": 1.0523, + "step": 74 + }, + { + "epoch": 0.07107320540156362, + "grad_norm": 3.0223476132960276, + "learning_rate": 9.99928681279855e-06, + "loss": 1.0355, + "step": 75 + }, + { + "epoch": 0.07202084814025113, + "grad_norm": 3.8896388319954753, + "learning_rate": 9.999151252124639e-06, + "loss": 1.1244, + "step": 76 + }, + { + "epoch": 0.07296849087893864, + "grad_norm": 2.542497788954799, + "learning_rate": 9.99900390495568e-06, + "loss": 0.9883, + "step": 77 + }, + { + "epoch": 0.07391613361762615, + "grad_norm": 2.75973540600498, + "learning_rate": 9.998844771639073e-06, + "loss": 1.0339, + "step": 78 + }, + { + "epoch": 0.07486377635631367, + "grad_norm": 12.963000608570283, + "learning_rate": 9.998673852550007e-06, + "loss": 1.0512, + "step": 79 + }, + { + "epoch": 0.07581141909500118, + "grad_norm": 2.3419089920048184, + "learning_rate": 9.998491148091457e-06, + "loss": 1.0479, + "step": 80 + }, + { + "epoch": 0.0767590618336887, + "grad_norm": 2.53974833474546, + "learning_rate": 9.998296658694185e-06, + "loss": 0.9376, + "step": 81 + }, + { + "epoch": 0.07770670457237622, + "grad_norm": 2.455905437125952, + "learning_rate": 9.99809038481674e-06, + "loss": 0.9294, + "step": 82 + }, + { + "epoch": 0.07865434731106373, + "grad_norm": 2.7870267445347343, + "learning_rate": 9.997872326945452e-06, + "loss": 1.0241, + "step": 83 + }, + { + "epoch": 0.07960199004975124, + "grad_norm": 2.3034635295759256, + "learning_rate": 9.997642485594436e-06, + "loss": 1.0372, + "step": 84 + }, + { + "epoch": 0.08054963278843875, + "grad_norm": 2.243933826068351, + "learning_rate": 9.99740086130559e-06, + "loss": 1.0229, + "step": 85 + }, + { + "epoch": 0.08149727552712627, + "grad_norm": 2.1942261875771, + "learning_rate": 9.99714745464859e-06, + "loss": 1.0116, + "step": 86 + }, + { + "epoch": 0.0824449182658138, + "grad_norm": 1.9009368734428258, + "learning_rate": 9.996882266220895e-06, + "loss": 0.9982, + "step": 87 + }, + { + "epoch": 0.0833925610045013, + "grad_norm": 2.197821917224577, + "learning_rate": 9.996605296647737e-06, + "loss": 1.0379, + "step": 88 + }, + { + "epoch": 0.0833925610045013, + "eval_loss": 1.0077329874038696, + "eval_runtime": 69.6734, + "eval_samples_per_second": 39.154, + "eval_steps_per_second": 0.617, + "step": 88 + }, + { + "epoch": 0.08434020374318882, + "grad_norm": 1.844506043554688, + "learning_rate": 9.99631654658213e-06, + "loss": 0.8956, + "step": 89 + }, + { + "epoch": 0.08528784648187633, + "grad_norm": 3.4121023273390283, + "learning_rate": 9.996016016704854e-06, + "loss": 0.9807, + "step": 90 + }, + { + "epoch": 0.08623548922056384, + "grad_norm": 1.629161161843959, + "learning_rate": 9.995703707724474e-06, + "loss": 0.9534, + "step": 91 + }, + { + "epoch": 0.08718313195925136, + "grad_norm": 2.1015114519988893, + "learning_rate": 9.995379620377319e-06, + "loss": 0.9817, + "step": 92 + }, + { + "epoch": 0.08813077469793888, + "grad_norm": 1.6484379590895988, + "learning_rate": 9.995043755427487e-06, + "loss": 0.9181, + "step": 93 + }, + { + "epoch": 0.0890784174366264, + "grad_norm": 1.5470369951292255, + "learning_rate": 9.99469611366685e-06, + "loss": 0.9655, + "step": 94 + }, + { + "epoch": 0.09002606017531391, + "grad_norm": 1.7131607780023708, + "learning_rate": 9.994336695915041e-06, + "loss": 0.9522, + "step": 95 + }, + { + "epoch": 0.09097370291400142, + "grad_norm": 3.4039383281352684, + "learning_rate": 9.993965503019457e-06, + "loss": 0.9977, + "step": 96 + }, + { + "epoch": 0.09192134565268893, + "grad_norm": 3.2140501680385536, + "learning_rate": 9.993582535855265e-06, + "loss": 0.8933, + "step": 97 + }, + { + "epoch": 0.09286898839137644, + "grad_norm": 1.9418785282824027, + "learning_rate": 9.993187795325381e-06, + "loss": 1.0526, + "step": 98 + }, + { + "epoch": 0.09381663113006397, + "grad_norm": 1.4531258804082263, + "learning_rate": 9.992781282360486e-06, + "loss": 0.9921, + "step": 99 + }, + { + "epoch": 0.09476427386875148, + "grad_norm": 2.0708416718589273, + "learning_rate": 9.992362997919016e-06, + "loss": 0.9248, + "step": 100 + }, + { + "epoch": 0.095711916607439, + "grad_norm": 1.7208209294560886, + "learning_rate": 9.99193294298716e-06, + "loss": 0.9778, + "step": 101 + }, + { + "epoch": 0.09665955934612651, + "grad_norm": 1.573086346708121, + "learning_rate": 9.991491118578856e-06, + "loss": 0.9369, + "step": 102 + }, + { + "epoch": 0.09760720208481402, + "grad_norm": 1.716766032223459, + "learning_rate": 9.991037525735794e-06, + "loss": 0.9718, + "step": 103 + }, + { + "epoch": 0.09855484482350153, + "grad_norm": 1.4683346581590864, + "learning_rate": 9.990572165527413e-06, + "loss": 1.0322, + "step": 104 + }, + { + "epoch": 0.09950248756218906, + "grad_norm": 1.6854495739199276, + "learning_rate": 9.990095039050886e-06, + "loss": 1.0259, + "step": 105 + }, + { + "epoch": 0.10045013030087657, + "grad_norm": 1.5007691822796942, + "learning_rate": 9.98960614743114e-06, + "loss": 1.012, + "step": 106 + }, + { + "epoch": 0.10139777303956408, + "grad_norm": 1.688237803394784, + "learning_rate": 9.98910549182083e-06, + "loss": 0.9611, + "step": 107 + }, + { + "epoch": 0.1023454157782516, + "grad_norm": 1.2412298690878776, + "learning_rate": 9.988593073400354e-06, + "loss": 0.9543, + "step": 108 + }, + { + "epoch": 0.10329305851693911, + "grad_norm": 1.4280638521919857, + "learning_rate": 9.988068893377841e-06, + "loss": 1.0555, + "step": 109 + }, + { + "epoch": 0.10424070125562662, + "grad_norm": 1.6064972212595918, + "learning_rate": 9.987532952989145e-06, + "loss": 0.947, + "step": 110 + }, + { + "epoch": 0.10424070125562662, + "eval_loss": 0.9927965998649597, + "eval_runtime": 64.552, + "eval_samples_per_second": 42.261, + "eval_steps_per_second": 0.666, + "step": 110 + }, + { + "epoch": 0.10518834399431415, + "grad_norm": 1.3706857779346073, + "learning_rate": 9.986985253497859e-06, + "loss": 0.958, + "step": 111 + }, + { + "epoch": 0.10613598673300166, + "grad_norm": 1.1044678274982156, + "learning_rate": 9.986425796195287e-06, + "loss": 0.9613, + "step": 112 + }, + { + "epoch": 0.10708362947168917, + "grad_norm": 1.2204032164258463, + "learning_rate": 9.985854582400465e-06, + "loss": 0.9637, + "step": 113 + }, + { + "epoch": 0.10803127221037669, + "grad_norm": 1.387543526294841, + "learning_rate": 9.985271613460144e-06, + "loss": 0.988, + "step": 114 + }, + { + "epoch": 0.1089789149490642, + "grad_norm": 1.5020093056571016, + "learning_rate": 9.984676890748787e-06, + "loss": 0.986, + "step": 115 + }, + { + "epoch": 0.10992655768775171, + "grad_norm": 1.503672693192212, + "learning_rate": 9.984070415668574e-06, + "loss": 0.9858, + "step": 116 + }, + { + "epoch": 0.11087420042643924, + "grad_norm": 1.4535398323237316, + "learning_rate": 9.983452189649388e-06, + "loss": 0.9324, + "step": 117 + }, + { + "epoch": 0.11182184316512675, + "grad_norm": 1.4452814142099772, + "learning_rate": 9.98282221414882e-06, + "loss": 0.9353, + "step": 118 + }, + { + "epoch": 0.11276948590381426, + "grad_norm": 1.2444175503325663, + "learning_rate": 9.982180490652165e-06, + "loss": 0.9864, + "step": 119 + }, + { + "epoch": 0.11371712864250177, + "grad_norm": 1.484366794735356, + "learning_rate": 9.981527020672413e-06, + "loss": 0.9683, + "step": 120 + }, + { + "epoch": 0.11466477138118929, + "grad_norm": 1.6287273073283466, + "learning_rate": 9.98086180575025e-06, + "loss": 0.9295, + "step": 121 + }, + { + "epoch": 0.1156124141198768, + "grad_norm": 2.1645074117977363, + "learning_rate": 9.980184847454052e-06, + "loss": 0.9474, + "step": 122 + }, + { + "epoch": 0.11656005685856433, + "grad_norm": 1.4428592940450689, + "learning_rate": 9.979496147379883e-06, + "loss": 1.0116, + "step": 123 + }, + { + "epoch": 0.11750769959725184, + "grad_norm": 1.442916857159405, + "learning_rate": 9.978795707151492e-06, + "loss": 0.9565, + "step": 124 + }, + { + "epoch": 0.11845534233593935, + "grad_norm": 1.3024301965632774, + "learning_rate": 9.978083528420303e-06, + "loss": 0.9471, + "step": 125 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 1.2357348655586111, + "learning_rate": 9.977359612865424e-06, + "loss": 0.9684, + "step": 126 + }, + { + "epoch": 0.12035062781331438, + "grad_norm": 1.4890088752220003, + "learning_rate": 9.976623962193627e-06, + "loss": 0.9535, + "step": 127 + }, + { + "epoch": 0.1212982705520019, + "grad_norm": 1.3469393114862873, + "learning_rate": 9.975876578139355e-06, + "loss": 0.986, + "step": 128 + }, + { + "epoch": 0.12224591329068941, + "grad_norm": 1.5755457587661565, + "learning_rate": 9.975117462464716e-06, + "loss": 1.019, + "step": 129 + }, + { + "epoch": 0.12319355602937693, + "grad_norm": 1.4168899024501311, + "learning_rate": 9.974346616959476e-06, + "loss": 0.9368, + "step": 130 + }, + { + "epoch": 0.12414119876806444, + "grad_norm": 1.4122617861876974, + "learning_rate": 9.973564043441057e-06, + "loss": 0.9563, + "step": 131 + }, + { + "epoch": 0.12508884150675195, + "grad_norm": 1.3777217200154346, + "learning_rate": 9.972769743754532e-06, + "loss": 1.0045, + "step": 132 + }, + { + "epoch": 0.12508884150675195, + "eval_loss": 0.9894475340843201, + "eval_runtime": 67.9151, + "eval_samples_per_second": 40.168, + "eval_steps_per_second": 0.633, + "step": 132 + }, + { + "epoch": 0.12603648424543948, + "grad_norm": 1.3796166756668033, + "learning_rate": 9.971963719772621e-06, + "loss": 0.9492, + "step": 133 + }, + { + "epoch": 0.12698412698412698, + "grad_norm": 1.8484880273102298, + "learning_rate": 9.971145973395685e-06, + "loss": 0.9844, + "step": 134 + }, + { + "epoch": 0.1279317697228145, + "grad_norm": 1.3341411571681234, + "learning_rate": 9.970316506551726e-06, + "loss": 0.9752, + "step": 135 + }, + { + "epoch": 0.128879412461502, + "grad_norm": 1.6561053976261508, + "learning_rate": 9.969475321196374e-06, + "loss": 0.9745, + "step": 136 + }, + { + "epoch": 0.12982705520018953, + "grad_norm": 1.4801234453473886, + "learning_rate": 9.968622419312895e-06, + "loss": 0.983, + "step": 137 + }, + { + "epoch": 0.13077469793887705, + "grad_norm": 1.650605001197027, + "learning_rate": 9.967757802912172e-06, + "loss": 0.9226, + "step": 138 + }, + { + "epoch": 0.13172234067756455, + "grad_norm": 1.5926202309459188, + "learning_rate": 9.966881474032711e-06, + "loss": 0.9754, + "step": 139 + }, + { + "epoch": 0.13266998341625208, + "grad_norm": 1.601896363818333, + "learning_rate": 9.965993434740634e-06, + "loss": 0.9812, + "step": 140 + }, + { + "epoch": 0.13361762615493958, + "grad_norm": 1.2173182907186852, + "learning_rate": 9.965093687129669e-06, + "loss": 0.987, + "step": 141 + }, + { + "epoch": 0.1345652688936271, + "grad_norm": 1.1914620826791744, + "learning_rate": 9.96418223332115e-06, + "loss": 0.9049, + "step": 142 + }, + { + "epoch": 0.13551291163231463, + "grad_norm": 1.1718797680017474, + "learning_rate": 9.963259075464011e-06, + "loss": 1.0314, + "step": 143 + }, + { + "epoch": 0.13646055437100213, + "grad_norm": 1.1921400894385041, + "learning_rate": 9.962324215734782e-06, + "loss": 0.9804, + "step": 144 + }, + { + "epoch": 0.13740819710968966, + "grad_norm": 1.483815329510506, + "learning_rate": 9.961377656337579e-06, + "loss": 0.9371, + "step": 145 + }, + { + "epoch": 0.13835583984837715, + "grad_norm": 1.174598142994827, + "learning_rate": 9.960419399504107e-06, + "loss": 0.9357, + "step": 146 + }, + { + "epoch": 0.13930348258706468, + "grad_norm": 1.768580101280523, + "learning_rate": 9.959449447493643e-06, + "loss": 0.9801, + "step": 147 + }, + { + "epoch": 0.14025112532575218, + "grad_norm": 1.8249892638670866, + "learning_rate": 9.958467802593046e-06, + "loss": 0.9553, + "step": 148 + }, + { + "epoch": 0.1411987680644397, + "grad_norm": 1.464158444501908, + "learning_rate": 9.957474467116739e-06, + "loss": 0.9816, + "step": 149 + }, + { + "epoch": 0.14214641080312723, + "grad_norm": 1.4006303093968397, + "learning_rate": 9.956469443406707e-06, + "loss": 0.959, + "step": 150 + }, + { + "epoch": 0.14309405354181473, + "grad_norm": 1.2677409714516314, + "learning_rate": 9.955452733832493e-06, + "loss": 0.9901, + "step": 151 + }, + { + "epoch": 0.14404169628050226, + "grad_norm": 1.616294750537421, + "learning_rate": 9.954424340791195e-06, + "loss": 0.9611, + "step": 152 + }, + { + "epoch": 0.14498933901918976, + "grad_norm": 1.2762321668275929, + "learning_rate": 9.953384266707453e-06, + "loss": 0.9971, + "step": 153 + }, + { + "epoch": 0.14593698175787728, + "grad_norm": 1.243174536133587, + "learning_rate": 9.952332514033449e-06, + "loss": 0.9545, + "step": 154 + }, + { + "epoch": 0.14593698175787728, + "eval_loss": 0.982397735118866, + "eval_runtime": 66.0802, + "eval_samples_per_second": 41.283, + "eval_steps_per_second": 0.651, + "step": 154 + }, + { + "epoch": 0.1468846244965648, + "grad_norm": 1.3382329641420807, + "learning_rate": 9.951269085248898e-06, + "loss": 0.9934, + "step": 155 + }, + { + "epoch": 0.1478322672352523, + "grad_norm": 1.2257802109789377, + "learning_rate": 9.950193982861048e-06, + "loss": 0.9528, + "step": 156 + }, + { + "epoch": 0.14877990997393983, + "grad_norm": 1.232838273393549, + "learning_rate": 9.949107209404664e-06, + "loss": 0.9719, + "step": 157 + }, + { + "epoch": 0.14972755271262733, + "grad_norm": 1.6289871091304047, + "learning_rate": 9.948008767442034e-06, + "loss": 0.9634, + "step": 158 + }, + { + "epoch": 0.15067519545131486, + "grad_norm": 1.486191374802309, + "learning_rate": 9.94689865956295e-06, + "loss": 0.9457, + "step": 159 + }, + { + "epoch": 0.15162283819000236, + "grad_norm": 1.2145460306596223, + "learning_rate": 9.94577688838472e-06, + "loss": 0.9841, + "step": 160 + }, + { + "epoch": 0.15257048092868988, + "grad_norm": 1.2094755905610057, + "learning_rate": 9.944643456552133e-06, + "loss": 0.9577, + "step": 161 + }, + { + "epoch": 0.1535181236673774, + "grad_norm": 1.2312169745263408, + "learning_rate": 9.943498366737487e-06, + "loss": 0.935, + "step": 162 + }, + { + "epoch": 0.1544657664060649, + "grad_norm": 1.996437404759428, + "learning_rate": 9.942341621640558e-06, + "loss": 0.9949, + "step": 163 + }, + { + "epoch": 0.15541340914475243, + "grad_norm": 1.434919063522936, + "learning_rate": 9.941173223988603e-06, + "loss": 0.961, + "step": 164 + }, + { + "epoch": 0.15636105188343993, + "grad_norm": 1.5694305163048035, + "learning_rate": 9.93999317653635e-06, + "loss": 1.0382, + "step": 165 + }, + { + "epoch": 0.15730869462212746, + "grad_norm": 1.4810485545937977, + "learning_rate": 9.938801482065998e-06, + "loss": 0.9782, + "step": 166 + }, + { + "epoch": 0.15825633736081499, + "grad_norm": 1.2852835752717688, + "learning_rate": 9.937598143387207e-06, + "loss": 0.9012, + "step": 167 + }, + { + "epoch": 0.15920398009950248, + "grad_norm": 1.3425076199539143, + "learning_rate": 9.93638316333708e-06, + "loss": 0.92, + "step": 168 + }, + { + "epoch": 0.16015162283819, + "grad_norm": 1.1023252456779573, + "learning_rate": 9.935156544780183e-06, + "loss": 0.9383, + "step": 169 + }, + { + "epoch": 0.1610992655768775, + "grad_norm": 1.4060044099112272, + "learning_rate": 9.93391829060851e-06, + "loss": 0.9764, + "step": 170 + }, + { + "epoch": 0.16204690831556504, + "grad_norm": 1.2799421690962227, + "learning_rate": 9.932668403741488e-06, + "loss": 0.8693, + "step": 171 + }, + { + "epoch": 0.16299455105425253, + "grad_norm": 1.139116134527199, + "learning_rate": 9.93140688712598e-06, + "loss": 0.9494, + "step": 172 + }, + { + "epoch": 0.16394219379294006, + "grad_norm": 1.217839709509947, + "learning_rate": 9.930133743736261e-06, + "loss": 0.8957, + "step": 173 + }, + { + "epoch": 0.1648898365316276, + "grad_norm": 1.301115684104673, + "learning_rate": 9.92884897657402e-06, + "loss": 0.9477, + "step": 174 + }, + { + "epoch": 0.16583747927031509, + "grad_norm": 1.2295422076552296, + "learning_rate": 9.92755258866835e-06, + "loss": 0.9441, + "step": 175 + }, + { + "epoch": 0.1667851220090026, + "grad_norm": 1.2198046764803545, + "learning_rate": 9.926244583075748e-06, + "loss": 0.9556, + "step": 176 + }, + { + "epoch": 0.1667851220090026, + "eval_loss": 0.9768843054771423, + "eval_runtime": 65.0055, + "eval_samples_per_second": 41.966, + "eval_steps_per_second": 0.661, + "step": 176 + }, + { + "epoch": 0.1677327647476901, + "grad_norm": 1.2348345090307584, + "learning_rate": 9.924924962880093e-06, + "loss": 0.9633, + "step": 177 + }, + { + "epoch": 0.16868040748637764, + "grad_norm": 1.2606209599886706, + "learning_rate": 9.923593731192655e-06, + "loss": 0.98, + "step": 178 + }, + { + "epoch": 0.16962805022506516, + "grad_norm": 2.6831467782092995, + "learning_rate": 9.922250891152078e-06, + "loss": 0.994, + "step": 179 + }, + { + "epoch": 0.17057569296375266, + "grad_norm": 1.2766929465017953, + "learning_rate": 9.920896445924372e-06, + "loss": 0.9753, + "step": 180 + }, + { + "epoch": 0.1715233357024402, + "grad_norm": 1.1262505199746897, + "learning_rate": 9.919530398702917e-06, + "loss": 0.9641, + "step": 181 + }, + { + "epoch": 0.1724709784411277, + "grad_norm": 1.442803425071792, + "learning_rate": 9.918152752708437e-06, + "loss": 0.9601, + "step": 182 + }, + { + "epoch": 0.1734186211798152, + "grad_norm": 1.2137409175608287, + "learning_rate": 9.916763511189009e-06, + "loss": 0.9747, + "step": 183 + }, + { + "epoch": 0.1743662639185027, + "grad_norm": 1.3271533812680127, + "learning_rate": 9.915362677420045e-06, + "loss": 0.9384, + "step": 184 + }, + { + "epoch": 0.17531390665719024, + "grad_norm": 1.2077500566101147, + "learning_rate": 9.913950254704291e-06, + "loss": 0.9372, + "step": 185 + }, + { + "epoch": 0.17626154939587776, + "grad_norm": 1.1297753271104558, + "learning_rate": 9.912526246371815e-06, + "loss": 0.8775, + "step": 186 + }, + { + "epoch": 0.17720919213456526, + "grad_norm": 1.2198507607935039, + "learning_rate": 9.911090655779997e-06, + "loss": 1.0036, + "step": 187 + }, + { + "epoch": 0.1781568348732528, + "grad_norm": 1.305484389615825, + "learning_rate": 9.909643486313533e-06, + "loss": 0.9687, + "step": 188 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 1.527203085727602, + "learning_rate": 9.908184741384412e-06, + "loss": 0.9225, + "step": 189 + }, + { + "epoch": 0.18005212035062781, + "grad_norm": 1.1568401663216765, + "learning_rate": 9.906714424431914e-06, + "loss": 0.9112, + "step": 190 + }, + { + "epoch": 0.18099976308931534, + "grad_norm": 1.2426671937737235, + "learning_rate": 9.905232538922604e-06, + "loss": 0.9509, + "step": 191 + }, + { + "epoch": 0.18194740582800284, + "grad_norm": 1.535223723588726, + "learning_rate": 9.903739088350325e-06, + "loss": 0.8984, + "step": 192 + }, + { + "epoch": 0.18289504856669037, + "grad_norm": 1.5431131775034228, + "learning_rate": 9.902234076236182e-06, + "loss": 0.9602, + "step": 193 + }, + { + "epoch": 0.18384269130537786, + "grad_norm": 1.182953828246788, + "learning_rate": 9.90071750612854e-06, + "loss": 0.887, + "step": 194 + }, + { + "epoch": 0.1847903340440654, + "grad_norm": 1.4338081609253326, + "learning_rate": 9.899189381603018e-06, + "loss": 0.9818, + "step": 195 + }, + { + "epoch": 0.1857379767827529, + "grad_norm": 1.4971956239027924, + "learning_rate": 9.897649706262474e-06, + "loss": 0.9455, + "step": 196 + }, + { + "epoch": 0.18668561952144042, + "grad_norm": 1.2771616220713862, + "learning_rate": 9.896098483736995e-06, + "loss": 0.9563, + "step": 197 + }, + { + "epoch": 0.18763326226012794, + "grad_norm": 1.1927091790410977, + "learning_rate": 9.894535717683902e-06, + "loss": 0.9376, + "step": 198 + }, + { + "epoch": 0.18763326226012794, + "eval_loss": 0.9750568270683289, + "eval_runtime": 68.0671, + "eval_samples_per_second": 40.078, + "eval_steps_per_second": 0.632, + "step": 198 + }, + { + "epoch": 0.18858090499881544, + "grad_norm": 1.447983084882731, + "learning_rate": 9.892961411787725e-06, + "loss": 0.941, + "step": 199 + }, + { + "epoch": 0.18952854773750297, + "grad_norm": 1.179964004851603, + "learning_rate": 9.891375569760205e-06, + "loss": 1.0044, + "step": 200 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 1.1672903614747536, + "learning_rate": 9.88977819534028e-06, + "loss": 0.9087, + "step": 201 + }, + { + "epoch": 0.191423833214878, + "grad_norm": 1.2604577059340927, + "learning_rate": 9.888169292294077e-06, + "loss": 0.97, + "step": 202 + }, + { + "epoch": 0.19237147595356552, + "grad_norm": 1.4285797440582975, + "learning_rate": 9.886548864414906e-06, + "loss": 0.9296, + "step": 203 + }, + { + "epoch": 0.19331911869225302, + "grad_norm": 1.4094308770717812, + "learning_rate": 9.88491691552325e-06, + "loss": 1.0148, + "step": 204 + }, + { + "epoch": 0.19426676143094054, + "grad_norm": 1.5152496759647966, + "learning_rate": 9.883273449466755e-06, + "loss": 0.9839, + "step": 205 + }, + { + "epoch": 0.19521440416962804, + "grad_norm": 1.4100497762615254, + "learning_rate": 9.881618470120216e-06, + "loss": 0.9112, + "step": 206 + }, + { + "epoch": 0.19616204690831557, + "grad_norm": 1.2060557963303735, + "learning_rate": 9.879951981385577e-06, + "loss": 1.0107, + "step": 207 + }, + { + "epoch": 0.19710968964700307, + "grad_norm": 1.1817847604275118, + "learning_rate": 9.87827398719192e-06, + "loss": 0.9401, + "step": 208 + }, + { + "epoch": 0.1980573323856906, + "grad_norm": 4.640069295683942, + "learning_rate": 9.876584491495448e-06, + "loss": 0.9453, + "step": 209 + }, + { + "epoch": 0.19900497512437812, + "grad_norm": 1.3678287853797575, + "learning_rate": 9.874883498279485e-06, + "loss": 0.9139, + "step": 210 + }, + { + "epoch": 0.19995261786306562, + "grad_norm": 1.2020105753823802, + "learning_rate": 9.87317101155446e-06, + "loss": 0.8995, + "step": 211 + }, + { + "epoch": 0.20090026060175314, + "grad_norm": 1.560649904766898, + "learning_rate": 9.871447035357903e-06, + "loss": 0.9953, + "step": 212 + }, + { + "epoch": 0.20184790334044064, + "grad_norm": 1.5587492681660762, + "learning_rate": 9.869711573754433e-06, + "loss": 0.9954, + "step": 213 + }, + { + "epoch": 0.20279554607912817, + "grad_norm": 1.1589889744586952, + "learning_rate": 9.867964630835742e-06, + "loss": 0.9664, + "step": 214 + }, + { + "epoch": 0.2037431888178157, + "grad_norm": 1.4941711737316694, + "learning_rate": 9.8662062107206e-06, + "loss": 0.9087, + "step": 215 + }, + { + "epoch": 0.2046908315565032, + "grad_norm": 1.1922425845332252, + "learning_rate": 9.86443631755483e-06, + "loss": 1.0093, + "step": 216 + }, + { + "epoch": 0.20563847429519072, + "grad_norm": 1.236697642847563, + "learning_rate": 9.862654955511309e-06, + "loss": 0.9649, + "step": 217 + }, + { + "epoch": 0.20658611703387822, + "grad_norm": 1.2350057563906354, + "learning_rate": 9.860862128789954e-06, + "loss": 0.9714, + "step": 218 + }, + { + "epoch": 0.20753375977256575, + "grad_norm": 1.4642161662286084, + "learning_rate": 9.859057841617709e-06, + "loss": 0.951, + "step": 219 + }, + { + "epoch": 0.20848140251125324, + "grad_norm": 1.1189678628969209, + "learning_rate": 9.857242098248543e-06, + "loss": 0.9097, + "step": 220 + }, + { + "epoch": 0.20848140251125324, + "eval_loss": 0.9686124324798584, + "eval_runtime": 68.177, + "eval_samples_per_second": 40.013, + "eval_steps_per_second": 0.631, + "step": 220 + }, + { + "epoch": 0.20942904524994077, + "grad_norm": 1.1409361807030405, + "learning_rate": 9.85541490296343e-06, + "loss": 0.913, + "step": 221 + }, + { + "epoch": 0.2103766879886283, + "grad_norm": 1.4175269201432499, + "learning_rate": 9.853576260070348e-06, + "loss": 0.956, + "step": 222 + }, + { + "epoch": 0.2113243307273158, + "grad_norm": 1.202975487777318, + "learning_rate": 9.851726173904264e-06, + "loss": 0.9681, + "step": 223 + }, + { + "epoch": 0.21227197346600332, + "grad_norm": 1.2528114366347458, + "learning_rate": 9.849864648827126e-06, + "loss": 0.9339, + "step": 224 + }, + { + "epoch": 0.21321961620469082, + "grad_norm": 1.5633193545585717, + "learning_rate": 9.847991689227848e-06, + "loss": 0.9481, + "step": 225 + }, + { + "epoch": 0.21416725894337835, + "grad_norm": 1.3036681318560188, + "learning_rate": 9.846107299522305e-06, + "loss": 0.9669, + "step": 226 + }, + { + "epoch": 0.21511490168206587, + "grad_norm": 1.276332389348374, + "learning_rate": 9.844211484153326e-06, + "loss": 1.0051, + "step": 227 + }, + { + "epoch": 0.21606254442075337, + "grad_norm": 1.3574477388118054, + "learning_rate": 9.842304247590668e-06, + "loss": 0.9185, + "step": 228 + }, + { + "epoch": 0.2170101871594409, + "grad_norm": 1.2290424692902366, + "learning_rate": 9.840385594331022e-06, + "loss": 0.9402, + "step": 229 + }, + { + "epoch": 0.2179578298981284, + "grad_norm": 1.3663071377926381, + "learning_rate": 9.838455528897998e-06, + "loss": 0.9303, + "step": 230 + }, + { + "epoch": 0.21890547263681592, + "grad_norm": 1.1297310850238833, + "learning_rate": 9.836514055842109e-06, + "loss": 0.8715, + "step": 231 + }, + { + "epoch": 0.21985311537550342, + "grad_norm": 1.1981756396394987, + "learning_rate": 9.834561179740763e-06, + "loss": 0.9603, + "step": 232 + }, + { + "epoch": 0.22080075811419095, + "grad_norm": 1.0960664647793084, + "learning_rate": 9.832596905198255e-06, + "loss": 0.9352, + "step": 233 + }, + { + "epoch": 0.22174840085287847, + "grad_norm": 1.2698198526002429, + "learning_rate": 9.830621236845755e-06, + "loss": 0.9044, + "step": 234 + }, + { + "epoch": 0.22269604359156597, + "grad_norm": 1.4209652174245544, + "learning_rate": 9.828634179341292e-06, + "loss": 0.9839, + "step": 235 + }, + { + "epoch": 0.2236436863302535, + "grad_norm": 1.5896834703549265, + "learning_rate": 9.826635737369752e-06, + "loss": 0.9479, + "step": 236 + }, + { + "epoch": 0.224591329068941, + "grad_norm": 1.118663687959167, + "learning_rate": 9.82462591564286e-06, + "loss": 0.9568, + "step": 237 + }, + { + "epoch": 0.22553897180762852, + "grad_norm": 1.081723075754863, + "learning_rate": 9.82260471889917e-06, + "loss": 1.0009, + "step": 238 + }, + { + "epoch": 0.22648661454631605, + "grad_norm": 1.3816847638469698, + "learning_rate": 9.82057215190406e-06, + "loss": 0.9565, + "step": 239 + }, + { + "epoch": 0.22743425728500355, + "grad_norm": 1.3650320361676973, + "learning_rate": 9.818528219449705e-06, + "loss": 0.9435, + "step": 240 + }, + { + "epoch": 0.22838190002369108, + "grad_norm": 1.1163028465916651, + "learning_rate": 9.816472926355087e-06, + "loss": 0.9926, + "step": 241 + }, + { + "epoch": 0.22932954276237857, + "grad_norm": 1.1783321971909724, + "learning_rate": 9.814406277465969e-06, + "loss": 0.9908, + "step": 242 + }, + { + "epoch": 0.22932954276237857, + "eval_loss": 0.9650764465332031, + "eval_runtime": 63.7155, + "eval_samples_per_second": 42.815, + "eval_steps_per_second": 0.675, + "step": 242 + }, + { + "epoch": 0.2302771855010661, + "grad_norm": 1.078825580859753, + "learning_rate": 9.812328277654889e-06, + "loss": 0.9395, + "step": 243 + }, + { + "epoch": 0.2312248282397536, + "grad_norm": 1.1093483786757967, + "learning_rate": 9.810238931821139e-06, + "loss": 0.9178, + "step": 244 + }, + { + "epoch": 0.23217247097844113, + "grad_norm": 1.3499071449657545, + "learning_rate": 9.808138244890775e-06, + "loss": 0.952, + "step": 245 + }, + { + "epoch": 0.23312011371712865, + "grad_norm": 1.1761313846911488, + "learning_rate": 9.806026221816582e-06, + "loss": 0.9497, + "step": 246 + }, + { + "epoch": 0.23406775645581615, + "grad_norm": 1.2110375794344939, + "learning_rate": 9.803902867578075e-06, + "loss": 0.944, + "step": 247 + }, + { + "epoch": 0.23501539919450368, + "grad_norm": 1.2034987469557872, + "learning_rate": 9.801768187181487e-06, + "loss": 0.986, + "step": 248 + }, + { + "epoch": 0.23596304193319118, + "grad_norm": 1.3058009296406379, + "learning_rate": 9.799622185659748e-06, + "loss": 0.967, + "step": 249 + }, + { + "epoch": 0.2369106846718787, + "grad_norm": 1.1123429020549715, + "learning_rate": 9.797464868072489e-06, + "loss": 0.9217, + "step": 250 + }, + { + "epoch": 0.23785832741056623, + "grad_norm": 1.089125109757041, + "learning_rate": 9.795296239506011e-06, + "loss": 0.8866, + "step": 251 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 1.2123667069466009, + "learning_rate": 9.793116305073292e-06, + "loss": 0.9307, + "step": 252 + }, + { + "epoch": 0.23975361288794125, + "grad_norm": 1.4622869606703903, + "learning_rate": 9.790925069913962e-06, + "loss": 0.9538, + "step": 253 + }, + { + "epoch": 0.24070125562662875, + "grad_norm": 1.5523797111635822, + "learning_rate": 9.788722539194291e-06, + "loss": 0.969, + "step": 254 + }, + { + "epoch": 0.24164889836531628, + "grad_norm": 1.1827311652398949, + "learning_rate": 9.786508718107184e-06, + "loss": 0.9849, + "step": 255 + }, + { + "epoch": 0.2425965411040038, + "grad_norm": 1.2881186217827927, + "learning_rate": 9.78428361187217e-06, + "loss": 0.9295, + "step": 256 + }, + { + "epoch": 0.2435441838426913, + "grad_norm": 1.474451652001404, + "learning_rate": 9.782047225735376e-06, + "loss": 0.9576, + "step": 257 + }, + { + "epoch": 0.24449182658137883, + "grad_norm": 1.2287731326656932, + "learning_rate": 9.77979956496953e-06, + "loss": 0.9485, + "step": 258 + }, + { + "epoch": 0.24543946932006633, + "grad_norm": 1.3059618909257746, + "learning_rate": 9.777540634873939e-06, + "loss": 0.9961, + "step": 259 + }, + { + "epoch": 0.24638711205875385, + "grad_norm": 1.25801433279188, + "learning_rate": 9.775270440774481e-06, + "loss": 0.9374, + "step": 260 + }, + { + "epoch": 0.24733475479744135, + "grad_norm": 1.4594944714968974, + "learning_rate": 9.772988988023589e-06, + "loss": 0.9714, + "step": 261 + }, + { + "epoch": 0.24828239753612888, + "grad_norm": 1.1788267508576873, + "learning_rate": 9.770696282000245e-06, + "loss": 0.9251, + "step": 262 + }, + { + "epoch": 0.2492300402748164, + "grad_norm": 1.2489815438864824, + "learning_rate": 9.76839232810996e-06, + "loss": 0.9126, + "step": 263 + }, + { + "epoch": 0.2501776830135039, + "grad_norm": 1.3083502635920439, + "learning_rate": 9.766077131784764e-06, + "loss": 0.94, + "step": 264 + }, + { + "epoch": 0.2501776830135039, + "eval_loss": 0.9628852605819702, + "eval_runtime": 65.8683, + "eval_samples_per_second": 41.416, + "eval_steps_per_second": 0.653, + "step": 264 + }, + { + "epoch": 0.25112532575219143, + "grad_norm": 1.2876315572259667, + "learning_rate": 9.763750698483192e-06, + "loss": 0.9824, + "step": 265 + }, + { + "epoch": 0.25207296849087896, + "grad_norm": 1.4509050672400128, + "learning_rate": 9.761413033690276e-06, + "loss": 1.01, + "step": 266 + }, + { + "epoch": 0.25302061122956643, + "grad_norm": 1.2615386437049756, + "learning_rate": 9.759064142917526e-06, + "loss": 0.9336, + "step": 267 + }, + { + "epoch": 0.25396825396825395, + "grad_norm": 1.1835961624076299, + "learning_rate": 9.756704031702919e-06, + "loss": 0.9462, + "step": 268 + }, + { + "epoch": 0.2549158967069415, + "grad_norm": 1.2900537501658034, + "learning_rate": 9.75433270561089e-06, + "loss": 0.9071, + "step": 269 + }, + { + "epoch": 0.255863539445629, + "grad_norm": 1.138429016575903, + "learning_rate": 9.75195017023231e-06, + "loss": 0.8544, + "step": 270 + }, + { + "epoch": 0.25681118218431653, + "grad_norm": 1.1853801438439096, + "learning_rate": 9.74955643118448e-06, + "loss": 0.92, + "step": 271 + }, + { + "epoch": 0.257758824923004, + "grad_norm": 1.2610744856343499, + "learning_rate": 9.74715149411112e-06, + "loss": 0.9012, + "step": 272 + }, + { + "epoch": 0.25870646766169153, + "grad_norm": 1.4706709692456896, + "learning_rate": 9.744735364682347e-06, + "loss": 0.9476, + "step": 273 + }, + { + "epoch": 0.25965411040037906, + "grad_norm": 1.4148479481637295, + "learning_rate": 9.742308048594665e-06, + "loss": 0.9121, + "step": 274 + }, + { + "epoch": 0.2606017531390666, + "grad_norm": 1.236422033348515, + "learning_rate": 9.73986955157096e-06, + "loss": 0.9135, + "step": 275 + }, + { + "epoch": 0.2615493958777541, + "grad_norm": 1.1477317083396126, + "learning_rate": 9.737419879360471e-06, + "loss": 0.9516, + "step": 276 + }, + { + "epoch": 0.2624970386164416, + "grad_norm": 2.5546186723319373, + "learning_rate": 9.734959037738788e-06, + "loss": 0.9422, + "step": 277 + }, + { + "epoch": 0.2634446813551291, + "grad_norm": 1.3564480695771186, + "learning_rate": 9.732487032507837e-06, + "loss": 0.8961, + "step": 278 + }, + { + "epoch": 0.26439232409381663, + "grad_norm": 1.4878738583178996, + "learning_rate": 9.730003869495863e-06, + "loss": 0.9457, + "step": 279 + }, + { + "epoch": 0.26533996683250416, + "grad_norm": 1.1351790275971436, + "learning_rate": 9.727509554557416e-06, + "loss": 0.8766, + "step": 280 + }, + { + "epoch": 0.2662876095711917, + "grad_norm": 1.3900072874584015, + "learning_rate": 9.725004093573343e-06, + "loss": 0.8972, + "step": 281 + }, + { + "epoch": 0.26723525230987916, + "grad_norm": 1.1866023759013848, + "learning_rate": 9.722487492450764e-06, + "loss": 0.9335, + "step": 282 + }, + { + "epoch": 0.2681828950485667, + "grad_norm": 1.2381217486697587, + "learning_rate": 9.719959757123073e-06, + "loss": 0.9083, + "step": 283 + }, + { + "epoch": 0.2691305377872542, + "grad_norm": 1.6107373228302189, + "learning_rate": 9.717420893549902e-06, + "loss": 0.9913, + "step": 284 + }, + { + "epoch": 0.27007818052594174, + "grad_norm": 1.3012559103471528, + "learning_rate": 9.714870907717134e-06, + "loss": 0.9384, + "step": 285 + }, + { + "epoch": 0.27102582326462926, + "grad_norm": 1.3512266977948462, + "learning_rate": 9.712309805636863e-06, + "loss": 0.9738, + "step": 286 + }, + { + "epoch": 0.27102582326462926, + "eval_loss": 0.9620270729064941, + "eval_runtime": 59.315, + "eval_samples_per_second": 45.992, + "eval_steps_per_second": 0.725, + "step": 286 + }, + { + "epoch": 0.27197346600331673, + "grad_norm": 1.1737111003693583, + "learning_rate": 9.709737593347404e-06, + "loss": 0.9669, + "step": 287 + }, + { + "epoch": 0.27292110874200426, + "grad_norm": 1.158891062157781, + "learning_rate": 9.707154276913255e-06, + "loss": 0.9724, + "step": 288 + }, + { + "epoch": 0.2738687514806918, + "grad_norm": 1.1818539669598636, + "learning_rate": 9.704559862425101e-06, + "loss": 0.9411, + "step": 289 + }, + { + "epoch": 0.2748163942193793, + "grad_norm": 1.317223158403057, + "learning_rate": 9.701954355999791e-06, + "loss": 0.8897, + "step": 290 + }, + { + "epoch": 0.2757640369580668, + "grad_norm": 1.2827511719089313, + "learning_rate": 9.699337763780325e-06, + "loss": 0.9062, + "step": 291 + }, + { + "epoch": 0.2767116796967543, + "grad_norm": 1.28805108052852, + "learning_rate": 9.696710091935842e-06, + "loss": 0.9176, + "step": 292 + }, + { + "epoch": 0.27765932243544184, + "grad_norm": 1.3367234242878245, + "learning_rate": 9.6940713466616e-06, + "loss": 0.9009, + "step": 293 + }, + { + "epoch": 0.27860696517412936, + "grad_norm": 1.2541386047985268, + "learning_rate": 9.691421534178966e-06, + "loss": 0.9109, + "step": 294 + }, + { + "epoch": 0.2795546079128169, + "grad_norm": 1.5026012491650225, + "learning_rate": 9.688760660735403e-06, + "loss": 0.9709, + "step": 295 + }, + { + "epoch": 0.28050225065150436, + "grad_norm": 1.2922689184697398, + "learning_rate": 9.68608873260445e-06, + "loss": 0.8457, + "step": 296 + }, + { + "epoch": 0.2814498933901919, + "grad_norm": 1.1843338944530994, + "learning_rate": 9.683405756085708e-06, + "loss": 0.9313, + "step": 297 + }, + { + "epoch": 0.2823975361288794, + "grad_norm": 1.315466417029974, + "learning_rate": 9.680711737504832e-06, + "loss": 1.019, + "step": 298 + }, + { + "epoch": 0.28334517886756694, + "grad_norm": 1.0199556490757884, + "learning_rate": 9.678006683213503e-06, + "loss": 0.8922, + "step": 299 + }, + { + "epoch": 0.28429282160625446, + "grad_norm": 1.1400934246384171, + "learning_rate": 9.675290599589429e-06, + "loss": 0.908, + "step": 300 + }, + { + "epoch": 0.28524046434494194, + "grad_norm": 1.8423074242848725, + "learning_rate": 9.672563493036318e-06, + "loss": 1.0065, + "step": 301 + }, + { + "epoch": 0.28618810708362946, + "grad_norm": 1.1796939423622033, + "learning_rate": 9.669825369983865e-06, + "loss": 0.9303, + "step": 302 + }, + { + "epoch": 0.287135749822317, + "grad_norm": 1.2479579843600068, + "learning_rate": 9.667076236887743e-06, + "loss": 1.0198, + "step": 303 + }, + { + "epoch": 0.2880833925610045, + "grad_norm": 1.229386161002158, + "learning_rate": 9.664316100229578e-06, + "loss": 0.9328, + "step": 304 + }, + { + "epoch": 0.28903103529969204, + "grad_norm": 1.354608076441114, + "learning_rate": 9.661544966516945e-06, + "loss": 0.8865, + "step": 305 + }, + { + "epoch": 0.2899786780383795, + "grad_norm": 1.2733991556068809, + "learning_rate": 9.658762842283343e-06, + "loss": 0.9805, + "step": 306 + }, + { + "epoch": 0.29092632077706704, + "grad_norm": 1.2495713583949597, + "learning_rate": 9.655969734088184e-06, + "loss": 0.9302, + "step": 307 + }, + { + "epoch": 0.29187396351575456, + "grad_norm": 1.2103907414095358, + "learning_rate": 9.653165648516777e-06, + "loss": 0.885, + "step": 308 + }, + { + "epoch": 0.29187396351575456, + "eval_loss": 0.9591483473777771, + "eval_runtime": 68.3896, + "eval_samples_per_second": 39.889, + "eval_steps_per_second": 0.629, + "step": 308 + }, + { + "epoch": 0.2928216062544421, + "grad_norm": 1.1956016894279018, + "learning_rate": 9.650350592180312e-06, + "loss": 0.9577, + "step": 309 + }, + { + "epoch": 0.2937692489931296, + "grad_norm": 1.140247620602589, + "learning_rate": 9.647524571715843e-06, + "loss": 0.9264, + "step": 310 + }, + { + "epoch": 0.2947168917318171, + "grad_norm": 1.2006266683263125, + "learning_rate": 9.644687593786282e-06, + "loss": 0.9792, + "step": 311 + }, + { + "epoch": 0.2956645344705046, + "grad_norm": 1.2812673838645852, + "learning_rate": 9.641839665080363e-06, + "loss": 0.954, + "step": 312 + }, + { + "epoch": 0.29661217720919214, + "grad_norm": 1.010846565968867, + "learning_rate": 9.638980792312651e-06, + "loss": 0.9515, + "step": 313 + }, + { + "epoch": 0.29755981994787967, + "grad_norm": 1.508846485133625, + "learning_rate": 9.636110982223505e-06, + "loss": 0.9611, + "step": 314 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 1.2091515162070219, + "learning_rate": 9.633230241579075e-06, + "loss": 0.8803, + "step": 315 + }, + { + "epoch": 0.29945510542525466, + "grad_norm": 1.251566988747115, + "learning_rate": 9.630338577171282e-06, + "loss": 0.9102, + "step": 316 + }, + { + "epoch": 0.3004027481639422, + "grad_norm": 1.4368558329637313, + "learning_rate": 9.627435995817799e-06, + "loss": 0.9681, + "step": 317 + }, + { + "epoch": 0.3013503909026297, + "grad_norm": 1.2724580288581318, + "learning_rate": 9.624522504362039e-06, + "loss": 0.9714, + "step": 318 + }, + { + "epoch": 0.30229803364131724, + "grad_norm": 1.2457801062593066, + "learning_rate": 9.621598109673142e-06, + "loss": 0.9663, + "step": 319 + }, + { + "epoch": 0.3032456763800047, + "grad_norm": 1.5450412575397683, + "learning_rate": 9.618662818645949e-06, + "loss": 0.973, + "step": 320 + }, + { + "epoch": 0.30419331911869224, + "grad_norm": 1.3301347899029445, + "learning_rate": 9.615716638200993e-06, + "loss": 0.9292, + "step": 321 + }, + { + "epoch": 0.30514096185737977, + "grad_norm": 1.5045379413960773, + "learning_rate": 9.612759575284483e-06, + "loss": 0.9943, + "step": 322 + }, + { + "epoch": 0.3060886045960673, + "grad_norm": 1.2146706034284283, + "learning_rate": 9.60979163686828e-06, + "loss": 0.8828, + "step": 323 + }, + { + "epoch": 0.3070362473347548, + "grad_norm": 1.1864956541845377, + "learning_rate": 9.606812829949896e-06, + "loss": 0.92, + "step": 324 + }, + { + "epoch": 0.3079838900734423, + "grad_norm": 1.41143117586689, + "learning_rate": 9.603823161552459e-06, + "loss": 0.9539, + "step": 325 + }, + { + "epoch": 0.3089315328121298, + "grad_norm": 2.5914491078059796, + "learning_rate": 9.600822638724704e-06, + "loss": 0.9211, + "step": 326 + }, + { + "epoch": 0.30987917555081734, + "grad_norm": 1.104156076330228, + "learning_rate": 9.597811268540969e-06, + "loss": 0.9148, + "step": 327 + }, + { + "epoch": 0.31082681828950487, + "grad_norm": 1.1472423105684746, + "learning_rate": 9.594789058101154e-06, + "loss": 0.9518, + "step": 328 + }, + { + "epoch": 0.3117744610281924, + "grad_norm": 1.1393816701130914, + "learning_rate": 9.591756014530723e-06, + "loss": 1.0076, + "step": 329 + }, + { + "epoch": 0.31272210376687987, + "grad_norm": 1.2776861681261165, + "learning_rate": 9.588712144980681e-06, + "loss": 0.8784, + "step": 330 + }, + { + "epoch": 0.31272210376687987, + "eval_loss": 0.9570937156677246, + "eval_runtime": 68.898, + "eval_samples_per_second": 39.595, + "eval_steps_per_second": 0.624, + "step": 330 + }, + { + "epoch": 0.3136697465055674, + "grad_norm": 1.192795131650072, + "learning_rate": 9.585657456627557e-06, + "loss": 0.9045, + "step": 331 + }, + { + "epoch": 0.3146173892442549, + "grad_norm": 1.2042562619274322, + "learning_rate": 9.582591956673387e-06, + "loss": 0.9683, + "step": 332 + }, + { + "epoch": 0.31556503198294245, + "grad_norm": 1.1444088880890944, + "learning_rate": 9.579515652345699e-06, + "loss": 0.8678, + "step": 333 + }, + { + "epoch": 0.31651267472162997, + "grad_norm": 1.0769104211549974, + "learning_rate": 9.57642855089749e-06, + "loss": 0.9175, + "step": 334 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 1.2380307581631063, + "learning_rate": 9.57333065960722e-06, + "loss": 0.9351, + "step": 335 + }, + { + "epoch": 0.31840796019900497, + "grad_norm": 1.071043290520968, + "learning_rate": 9.570221985778785e-06, + "loss": 0.8855, + "step": 336 + }, + { + "epoch": 0.3193556029376925, + "grad_norm": 1.1849521886922723, + "learning_rate": 9.567102536741501e-06, + "loss": 0.917, + "step": 337 + }, + { + "epoch": 0.32030324567638, + "grad_norm": 1.20214216361167, + "learning_rate": 9.563972319850092e-06, + "loss": 0.9147, + "step": 338 + }, + { + "epoch": 0.3212508884150675, + "grad_norm": 1.266949477776236, + "learning_rate": 9.560831342484668e-06, + "loss": 0.9383, + "step": 339 + }, + { + "epoch": 0.322198531153755, + "grad_norm": 1.5670977324953559, + "learning_rate": 9.557679612050708e-06, + "loss": 1.0023, + "step": 340 + }, + { + "epoch": 0.32314617389244255, + "grad_norm": 1.237648169383608, + "learning_rate": 9.554517135979044e-06, + "loss": 0.9671, + "step": 341 + }, + { + "epoch": 0.32409381663113007, + "grad_norm": 1.0260918280422053, + "learning_rate": 9.551343921725844e-06, + "loss": 0.879, + "step": 342 + }, + { + "epoch": 0.3250414593698176, + "grad_norm": 1.155124445578137, + "learning_rate": 9.548159976772593e-06, + "loss": 0.9416, + "step": 343 + }, + { + "epoch": 0.32598910210850507, + "grad_norm": 1.1950689084580686, + "learning_rate": 9.544965308626075e-06, + "loss": 0.9, + "step": 344 + }, + { + "epoch": 0.3269367448471926, + "grad_norm": 1.2849959856276705, + "learning_rate": 9.541759924818358e-06, + "loss": 0.9332, + "step": 345 + }, + { + "epoch": 0.3278843875858801, + "grad_norm": 1.0302992790409418, + "learning_rate": 9.538543832906773e-06, + "loss": 0.9051, + "step": 346 + }, + { + "epoch": 0.32883203032456765, + "grad_norm": 1.2345608543091064, + "learning_rate": 9.535317040473895e-06, + "loss": 0.9806, + "step": 347 + }, + { + "epoch": 0.3297796730632552, + "grad_norm": 1.1665835041880899, + "learning_rate": 9.532079555127532e-06, + "loss": 0.9433, + "step": 348 + }, + { + "epoch": 0.33072731580194265, + "grad_norm": 1.265860203994782, + "learning_rate": 9.528831384500699e-06, + "loss": 0.9776, + "step": 349 + }, + { + "epoch": 0.33167495854063017, + "grad_norm": 1.293238505576827, + "learning_rate": 9.525572536251608e-06, + "loss": 1.0388, + "step": 350 + }, + { + "epoch": 0.3326226012793177, + "grad_norm": 1.2363591052870795, + "learning_rate": 9.52230301806364e-06, + "loss": 0.9252, + "step": 351 + }, + { + "epoch": 0.3335702440180052, + "grad_norm": 1.3748905848676085, + "learning_rate": 9.519022837645337e-06, + "loss": 0.8923, + "step": 352 + }, + { + "epoch": 0.3335702440180052, + "eval_loss": 0.9540281891822815, + "eval_runtime": 62.2753, + "eval_samples_per_second": 43.805, + "eval_steps_per_second": 0.69, + "step": 352 + }, + { + "epoch": 0.33451788675669275, + "grad_norm": 1.1703557022342401, + "learning_rate": 9.51573200273038e-06, + "loss": 0.9791, + "step": 353 + }, + { + "epoch": 0.3354655294953802, + "grad_norm": 1.3163659131319334, + "learning_rate": 9.512430521077565e-06, + "loss": 0.8974, + "step": 354 + }, + { + "epoch": 0.33641317223406775, + "grad_norm": 1.1823387827110081, + "learning_rate": 9.509118400470792e-06, + "loss": 0.8668, + "step": 355 + }, + { + "epoch": 0.3373608149727553, + "grad_norm": 1.0471543968324866, + "learning_rate": 9.505795648719049e-06, + "loss": 0.9248, + "step": 356 + }, + { + "epoch": 0.3383084577114428, + "grad_norm": 1.2873543382804975, + "learning_rate": 9.502462273656381e-06, + "loss": 0.8897, + "step": 357 + }, + { + "epoch": 0.3392561004501303, + "grad_norm": 1.2157109813891434, + "learning_rate": 9.499118283141887e-06, + "loss": 0.9304, + "step": 358 + }, + { + "epoch": 0.3402037431888178, + "grad_norm": 1.093181377661525, + "learning_rate": 9.495763685059689e-06, + "loss": 0.9237, + "step": 359 + }, + { + "epoch": 0.3411513859275053, + "grad_norm": 1.095774592001467, + "learning_rate": 9.492398487318922e-06, + "loss": 0.8669, + "step": 360 + }, + { + "epoch": 0.34209902866619285, + "grad_norm": 1.1676179176818222, + "learning_rate": 9.48902269785371e-06, + "loss": 0.9338, + "step": 361 + }, + { + "epoch": 0.3430466714048804, + "grad_norm": 1.082117155119373, + "learning_rate": 9.485636324623147e-06, + "loss": 0.9301, + "step": 362 + }, + { + "epoch": 0.34399431414356785, + "grad_norm": 1.5869790381600608, + "learning_rate": 9.482239375611282e-06, + "loss": 0.8566, + "step": 363 + }, + { + "epoch": 0.3449419568822554, + "grad_norm": 2.1300888287293436, + "learning_rate": 9.478831858827105e-06, + "loss": 0.9462, + "step": 364 + }, + { + "epoch": 0.3458895996209429, + "grad_norm": 1.329321349965101, + "learning_rate": 9.475413782304509e-06, + "loss": 0.9344, + "step": 365 + }, + { + "epoch": 0.3468372423596304, + "grad_norm": 3.4098937413401678, + "learning_rate": 9.471985154102292e-06, + "loss": 0.881, + "step": 366 + }, + { + "epoch": 0.34778488509831795, + "grad_norm": 1.374583167993129, + "learning_rate": 9.468545982304132e-06, + "loss": 0.8899, + "step": 367 + }, + { + "epoch": 0.3487325278370054, + "grad_norm": 1.2132880433358602, + "learning_rate": 9.465096275018556e-06, + "loss": 0.9016, + "step": 368 + }, + { + "epoch": 0.34968017057569295, + "grad_norm": 1.132880559404501, + "learning_rate": 9.461636040378941e-06, + "loss": 0.9424, + "step": 369 + }, + { + "epoch": 0.3506278133143805, + "grad_norm": 1.573588626293436, + "learning_rate": 9.458165286543477e-06, + "loss": 0.9758, + "step": 370 + }, + { + "epoch": 0.351575456053068, + "grad_norm": 1.0016737529772646, + "learning_rate": 9.454684021695157e-06, + "loss": 0.9522, + "step": 371 + }, + { + "epoch": 0.35252309879175553, + "grad_norm": 1.2060571666651005, + "learning_rate": 9.451192254041759e-06, + "loss": 0.8995, + "step": 372 + }, + { + "epoch": 0.353470741530443, + "grad_norm": 1.5491588961886638, + "learning_rate": 9.447689991815819e-06, + "loss": 0.9497, + "step": 373 + }, + { + "epoch": 0.3544183842691305, + "grad_norm": 2.323597523498367, + "learning_rate": 9.444177243274619e-06, + "loss": 0.9483, + "step": 374 + }, + { + "epoch": 0.3544183842691305, + "eval_loss": 0.9546486139297485, + "eval_runtime": 67.7741, + "eval_samples_per_second": 40.251, + "eval_steps_per_second": 0.634, + "step": 374 + }, + { + "epoch": 0.35536602700781805, + "grad_norm": 1.1957751593867816, + "learning_rate": 9.440654016700161e-06, + "loss": 0.9069, + "step": 375 + }, + { + "epoch": 0.3563136697465056, + "grad_norm": 1.3480198545553501, + "learning_rate": 9.437120320399158e-06, + "loss": 0.9206, + "step": 376 + }, + { + "epoch": 0.3572613124851931, + "grad_norm": 1.1240947731641266, + "learning_rate": 9.433576162703e-06, + "loss": 0.9686, + "step": 377 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 1.258961853327028, + "learning_rate": 9.430021551967745e-06, + "loss": 0.9156, + "step": 378 + }, + { + "epoch": 0.3591565979625681, + "grad_norm": 1.1465674821214438, + "learning_rate": 9.426456496574095e-06, + "loss": 0.9027, + "step": 379 + }, + { + "epoch": 0.36010424070125563, + "grad_norm": 1.334135631113088, + "learning_rate": 9.422881004927383e-06, + "loss": 0.9215, + "step": 380 + }, + { + "epoch": 0.36105188343994316, + "grad_norm": 1.052076097463688, + "learning_rate": 9.419295085457536e-06, + "loss": 0.8708, + "step": 381 + }, + { + "epoch": 0.3619995261786307, + "grad_norm": 1.3069872390381696, + "learning_rate": 9.41569874661908e-06, + "loss": 0.9392, + "step": 382 + }, + { + "epoch": 0.36294716891731815, + "grad_norm": 1.1946541917496492, + "learning_rate": 9.412091996891097e-06, + "loss": 0.9186, + "step": 383 + }, + { + "epoch": 0.3638948116560057, + "grad_norm": 1.130570319952377, + "learning_rate": 9.408474844777218e-06, + "loss": 0.9231, + "step": 384 + }, + { + "epoch": 0.3648424543946932, + "grad_norm": 1.230122090333074, + "learning_rate": 9.4048472988056e-06, + "loss": 1.0082, + "step": 385 + }, + { + "epoch": 0.36579009713338073, + "grad_norm": 1.0720696634128188, + "learning_rate": 9.401209367528907e-06, + "loss": 0.9291, + "step": 386 + }, + { + "epoch": 0.36673773987206826, + "grad_norm": 1.1723709465115237, + "learning_rate": 9.397561059524285e-06, + "loss": 0.9175, + "step": 387 + }, + { + "epoch": 0.36768538261075573, + "grad_norm": 1.5238004908651446, + "learning_rate": 9.393902383393347e-06, + "loss": 0.9621, + "step": 388 + }, + { + "epoch": 0.36863302534944326, + "grad_norm": 1.0814097944853873, + "learning_rate": 9.39023334776215e-06, + "loss": 0.9293, + "step": 389 + }, + { + "epoch": 0.3695806680881308, + "grad_norm": 1.1650689858883694, + "learning_rate": 9.386553961281179e-06, + "loss": 0.9582, + "step": 390 + }, + { + "epoch": 0.3705283108268183, + "grad_norm": 1.2458078695599824, + "learning_rate": 9.382864232625321e-06, + "loss": 0.9581, + "step": 391 + }, + { + "epoch": 0.3714759535655058, + "grad_norm": 1.339036266204836, + "learning_rate": 9.379164170493844e-06, + "loss": 0.8931, + "step": 392 + }, + { + "epoch": 0.3724235963041933, + "grad_norm": 1.0125589713218854, + "learning_rate": 9.375453783610381e-06, + "loss": 0.9012, + "step": 393 + }, + { + "epoch": 0.37337123904288083, + "grad_norm": 1.0329885700845731, + "learning_rate": 9.371733080722911e-06, + "loss": 0.8628, + "step": 394 + }, + { + "epoch": 0.37431888178156836, + "grad_norm": 1.439005100467098, + "learning_rate": 9.368002070603731e-06, + "loss": 0.8827, + "step": 395 + }, + { + "epoch": 0.3752665245202559, + "grad_norm": 1.0085800308385358, + "learning_rate": 9.36426076204944e-06, + "loss": 0.8743, + "step": 396 + }, + { + "epoch": 0.3752665245202559, + "eval_loss": 0.9504217505455017, + "eval_runtime": 66.3641, + "eval_samples_per_second": 41.107, + "eval_steps_per_second": 0.648, + "step": 396 + }, + { + "epoch": 0.37621416725894335, + "grad_norm": 1.3876480177466899, + "learning_rate": 9.36050916388092e-06, + "loss": 0.9472, + "step": 397 + }, + { + "epoch": 0.3771618099976309, + "grad_norm": 1.1708472397542733, + "learning_rate": 9.35674728494331e-06, + "loss": 0.9283, + "step": 398 + }, + { + "epoch": 0.3781094527363184, + "grad_norm": 1.0918645378867784, + "learning_rate": 9.35297513410599e-06, + "loss": 0.8862, + "step": 399 + }, + { + "epoch": 0.37905709547500593, + "grad_norm": 0.9955698293935606, + "learning_rate": 9.349192720262556e-06, + "loss": 0.8965, + "step": 400 + }, + { + "epoch": 0.38000473821369346, + "grad_norm": 1.305075253905476, + "learning_rate": 9.345400052330802e-06, + "loss": 0.8806, + "step": 401 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.99053214435014, + "learning_rate": 9.341597139252698e-06, + "loss": 1.0084, + "step": 402 + }, + { + "epoch": 0.38190002369106846, + "grad_norm": 1.3393098226066853, + "learning_rate": 9.337783989994371e-06, + "loss": 0.9356, + "step": 403 + }, + { + "epoch": 0.382847666429756, + "grad_norm": 1.01675988520605, + "learning_rate": 9.333960613546079e-06, + "loss": 0.8987, + "step": 404 + }, + { + "epoch": 0.3837953091684435, + "grad_norm": 1.1624028341398043, + "learning_rate": 9.330127018922195e-06, + "loss": 0.8895, + "step": 405 + }, + { + "epoch": 0.38474295190713104, + "grad_norm": 1.2237993995607808, + "learning_rate": 9.326283215161177e-06, + "loss": 0.8879, + "step": 406 + }, + { + "epoch": 0.3856905946458185, + "grad_norm": 1.2652821842567468, + "learning_rate": 9.322429211325567e-06, + "loss": 0.8893, + "step": 407 + }, + { + "epoch": 0.38663823738450603, + "grad_norm": 1.1025981014314234, + "learning_rate": 9.31856501650194e-06, + "loss": 0.9746, + "step": 408 + }, + { + "epoch": 0.38758588012319356, + "grad_norm": 3.9246963935175763, + "learning_rate": 9.314690639800906e-06, + "loss": 0.9352, + "step": 409 + }, + { + "epoch": 0.3885335228618811, + "grad_norm": 1.1259828261006313, + "learning_rate": 9.310806090357083e-06, + "loss": 0.9083, + "step": 410 + }, + { + "epoch": 0.3894811656005686, + "grad_norm": 1.2708123609203328, + "learning_rate": 9.306911377329067e-06, + "loss": 0.9167, + "step": 411 + }, + { + "epoch": 0.3904288083392561, + "grad_norm": 2.090787985556849, + "learning_rate": 9.30300650989942e-06, + "loss": 0.9976, + "step": 412 + }, + { + "epoch": 0.3913764510779436, + "grad_norm": 1.036822985622544, + "learning_rate": 9.299091497274647e-06, + "loss": 1.002, + "step": 413 + }, + { + "epoch": 0.39232409381663114, + "grad_norm": 1.08027979908674, + "learning_rate": 9.295166348685169e-06, + "loss": 0.883, + "step": 414 + }, + { + "epoch": 0.39327173655531866, + "grad_norm": 1.3152353131345889, + "learning_rate": 9.291231073385306e-06, + "loss": 0.9368, + "step": 415 + }, + { + "epoch": 0.39421937929400613, + "grad_norm": 1.3362457501149774, + "learning_rate": 9.287285680653254e-06, + "loss": 0.9923, + "step": 416 + }, + { + "epoch": 0.39516702203269366, + "grad_norm": 1.1378299427204326, + "learning_rate": 9.283330179791063e-06, + "loss": 0.9013, + "step": 417 + }, + { + "epoch": 0.3961146647713812, + "grad_norm": 0.9698571591778787, + "learning_rate": 9.279364580124615e-06, + "loss": 0.8294, + "step": 418 + }, + { + "epoch": 0.3961146647713812, + "eval_loss": 0.9494832754135132, + "eval_runtime": 62.1923, + "eval_samples_per_second": 43.864, + "eval_steps_per_second": 0.691, + "step": 418 + }, + { + "epoch": 0.3970623075100687, + "grad_norm": 1.2329603475368258, + "learning_rate": 9.275388891003596e-06, + "loss": 0.9132, + "step": 419 + }, + { + "epoch": 0.39800995024875624, + "grad_norm": 1.0253483109899053, + "learning_rate": 9.271403121801492e-06, + "loss": 0.9966, + "step": 420 + }, + { + "epoch": 0.3989575929874437, + "grad_norm": 1.1122937106526114, + "learning_rate": 9.267407281915541e-06, + "loss": 0.8949, + "step": 421 + }, + { + "epoch": 0.39990523572613124, + "grad_norm": 1.0623316599321453, + "learning_rate": 9.263401380766739e-06, + "loss": 0.9192, + "step": 422 + }, + { + "epoch": 0.40085287846481876, + "grad_norm": 1.109212522270619, + "learning_rate": 9.25938542779979e-06, + "loss": 0.9212, + "step": 423 + }, + { + "epoch": 0.4018005212035063, + "grad_norm": 1.1148931056175715, + "learning_rate": 9.255359432483106e-06, + "loss": 0.8824, + "step": 424 + }, + { + "epoch": 0.4027481639421938, + "grad_norm": 1.469688611294437, + "learning_rate": 9.251323404308774e-06, + "loss": 0.8941, + "step": 425 + }, + { + "epoch": 0.4036958066808813, + "grad_norm": 1.1366864229617593, + "learning_rate": 9.247277352792534e-06, + "loss": 0.9542, + "step": 426 + }, + { + "epoch": 0.4046434494195688, + "grad_norm": 1.2380214332997066, + "learning_rate": 9.243221287473755e-06, + "loss": 0.9417, + "step": 427 + }, + { + "epoch": 0.40559109215825634, + "grad_norm": 1.292587978067118, + "learning_rate": 9.239155217915422e-06, + "loss": 0.9531, + "step": 428 + }, + { + "epoch": 0.40653873489694387, + "grad_norm": 1.1996181211866257, + "learning_rate": 9.235079153704108e-06, + "loss": 0.993, + "step": 429 + }, + { + "epoch": 0.4074863776356314, + "grad_norm": 1.7152618861500344, + "learning_rate": 9.23099310444994e-06, + "loss": 0.88, + "step": 430 + }, + { + "epoch": 0.40843402037431886, + "grad_norm": 1.236710405113469, + "learning_rate": 9.226897079786594e-06, + "loss": 0.8924, + "step": 431 + }, + { + "epoch": 0.4093816631130064, + "grad_norm": 1.026683565261258, + "learning_rate": 9.222791089371266e-06, + "loss": 0.8627, + "step": 432 + }, + { + "epoch": 0.4103293058516939, + "grad_norm": 1.0752239634813958, + "learning_rate": 9.218675142884648e-06, + "loss": 0.9457, + "step": 433 + }, + { + "epoch": 0.41127694859038144, + "grad_norm": 1.1942159706425186, + "learning_rate": 9.214549250030899e-06, + "loss": 0.9697, + "step": 434 + }, + { + "epoch": 0.41222459132906897, + "grad_norm": 1.302875719838314, + "learning_rate": 9.210413420537638e-06, + "loss": 0.9266, + "step": 435 + }, + { + "epoch": 0.41317223406775644, + "grad_norm": 1.2858086492476544, + "learning_rate": 9.206267664155906e-06, + "loss": 0.8556, + "step": 436 + }, + { + "epoch": 0.41411987680644397, + "grad_norm": 1.2092507326298383, + "learning_rate": 9.20211199066015e-06, + "loss": 0.8873, + "step": 437 + }, + { + "epoch": 0.4150675195451315, + "grad_norm": 1.0641345729826912, + "learning_rate": 9.197946409848196e-06, + "loss": 0.927, + "step": 438 + }, + { + "epoch": 0.416015162283819, + "grad_norm": 0.9922730475025484, + "learning_rate": 9.19377093154123e-06, + "loss": 0.8922, + "step": 439 + }, + { + "epoch": 0.4169628050225065, + "grad_norm": 1.1994954411324383, + "learning_rate": 9.189585565583779e-06, + "loss": 0.934, + "step": 440 + }, + { + "epoch": 0.4169628050225065, + "eval_loss": 0.9466658234596252, + "eval_runtime": 64.5961, + "eval_samples_per_second": 42.232, + "eval_steps_per_second": 0.666, + "step": 440 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 1.2490962663664558, + "learning_rate": 9.185390321843673e-06, + "loss": 0.901, + "step": 441 + }, + { + "epoch": 0.41885809049988154, + "grad_norm": 1.015254380962658, + "learning_rate": 9.181185210212034e-06, + "loss": 0.9519, + "step": 442 + }, + { + "epoch": 0.41980573323856907, + "grad_norm": 1.1895181384960887, + "learning_rate": 9.176970240603253e-06, + "loss": 0.8807, + "step": 443 + }, + { + "epoch": 0.4207533759772566, + "grad_norm": 1.3706219828971085, + "learning_rate": 9.172745422954961e-06, + "loss": 0.9148, + "step": 444 + }, + { + "epoch": 0.42170101871594406, + "grad_norm": 1.0379378858579145, + "learning_rate": 9.168510767228008e-06, + "loss": 0.9468, + "step": 445 + }, + { + "epoch": 0.4226486614546316, + "grad_norm": 1.2178466709823097, + "learning_rate": 9.164266283406433e-06, + "loss": 0.9242, + "step": 446 + }, + { + "epoch": 0.4235963041933191, + "grad_norm": 1.2808190385423623, + "learning_rate": 9.160011981497458e-06, + "loss": 0.8654, + "step": 447 + }, + { + "epoch": 0.42454394693200664, + "grad_norm": 1.250260948302257, + "learning_rate": 9.155747871531444e-06, + "loss": 0.9284, + "step": 448 + }, + { + "epoch": 0.42549158967069417, + "grad_norm": 1.2672376071125921, + "learning_rate": 9.151473963561884e-06, + "loss": 0.9568, + "step": 449 + }, + { + "epoch": 0.42643923240938164, + "grad_norm": 1.0461013649789057, + "learning_rate": 9.147190267665361e-06, + "loss": 0.8883, + "step": 450 + }, + { + "epoch": 0.42738687514806917, + "grad_norm": 1.1516556206793171, + "learning_rate": 9.142896793941546e-06, + "loss": 0.9596, + "step": 451 + }, + { + "epoch": 0.4283345178867567, + "grad_norm": 1.1510780017093964, + "learning_rate": 9.13859355251316e-06, + "loss": 0.9444, + "step": 452 + }, + { + "epoch": 0.4292821606254442, + "grad_norm": 0.9978574311141366, + "learning_rate": 9.134280553525946e-06, + "loss": 0.8698, + "step": 453 + }, + { + "epoch": 0.43022980336413175, + "grad_norm": 1.0518208149889676, + "learning_rate": 9.129957807148666e-06, + "loss": 0.8508, + "step": 454 + }, + { + "epoch": 0.4311774461028192, + "grad_norm": 1.0777071914790497, + "learning_rate": 9.12562532357305e-06, + "loss": 0.9219, + "step": 455 + }, + { + "epoch": 0.43212508884150674, + "grad_norm": 1.3003109116219143, + "learning_rate": 9.121283113013794e-06, + "loss": 0.9354, + "step": 456 + }, + { + "epoch": 0.43307273158019427, + "grad_norm": 1.231896880939342, + "learning_rate": 9.116931185708523e-06, + "loss": 0.8797, + "step": 457 + }, + { + "epoch": 0.4340203743188818, + "grad_norm": 1.167418023483012, + "learning_rate": 9.112569551917773e-06, + "loss": 0.9122, + "step": 458 + }, + { + "epoch": 0.4349680170575693, + "grad_norm": 1.2433163300824168, + "learning_rate": 9.108198221924966e-06, + "loss": 0.9241, + "step": 459 + }, + { + "epoch": 0.4359156597962568, + "grad_norm": 1.2957389966436808, + "learning_rate": 9.103817206036383e-06, + "loss": 0.9653, + "step": 460 + }, + { + "epoch": 0.4368633025349443, + "grad_norm": 1.1967614874308203, + "learning_rate": 9.09942651458114e-06, + "loss": 0.9555, + "step": 461 + }, + { + "epoch": 0.43781094527363185, + "grad_norm": 1.0311787596301678, + "learning_rate": 9.095026157911166e-06, + "loss": 0.8532, + "step": 462 + }, + { + "epoch": 0.43781094527363185, + "eval_loss": 0.9448354840278625, + "eval_runtime": 63.5673, + "eval_samples_per_second": 42.915, + "eval_steps_per_second": 0.676, + "step": 462 + }, + { + "epoch": 0.4387585880123194, + "grad_norm": 1.1614684984564378, + "learning_rate": 9.090616146401183e-06, + "loss": 0.911, + "step": 463 + }, + { + "epoch": 0.43970623075100684, + "grad_norm": 1.1848933141897011, + "learning_rate": 9.086196490448668e-06, + "loss": 0.8495, + "step": 464 + }, + { + "epoch": 0.44065387348969437, + "grad_norm": 1.0920125977106059, + "learning_rate": 9.081767200473842e-06, + "loss": 0.9195, + "step": 465 + }, + { + "epoch": 0.4416015162283819, + "grad_norm": 1.0487746428767522, + "learning_rate": 9.077328286919638e-06, + "loss": 0.8775, + "step": 466 + }, + { + "epoch": 0.4425491589670694, + "grad_norm": 1.0480719750913268, + "learning_rate": 9.07287976025168e-06, + "loss": 0.8879, + "step": 467 + }, + { + "epoch": 0.44349680170575695, + "grad_norm": 1.156105288349571, + "learning_rate": 9.068421630958254e-06, + "loss": 0.9004, + "step": 468 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.1479660233621711, + "learning_rate": 9.063953909550289e-06, + "loss": 0.9652, + "step": 469 + }, + { + "epoch": 0.44539208718313195, + "grad_norm": 1.158618048287916, + "learning_rate": 9.059476606561328e-06, + "loss": 0.8643, + "step": 470 + }, + { + "epoch": 0.4463397299218195, + "grad_norm": 1.1045055506935484, + "learning_rate": 9.054989732547507e-06, + "loss": 0.8307, + "step": 471 + }, + { + "epoch": 0.447287372660507, + "grad_norm": 1.189869710423804, + "learning_rate": 9.050493298087523e-06, + "loss": 0.8693, + "step": 472 + }, + { + "epoch": 0.4482350153991945, + "grad_norm": 1.5017065849353626, + "learning_rate": 9.045987313782616e-06, + "loss": 0.8868, + "step": 473 + }, + { + "epoch": 0.449182658137882, + "grad_norm": 1.288348522111584, + "learning_rate": 9.041471790256543e-06, + "loss": 0.9984, + "step": 474 + }, + { + "epoch": 0.4501303008765695, + "grad_norm": 1.3428427133159277, + "learning_rate": 9.036946738155548e-06, + "loss": 0.9328, + "step": 475 + }, + { + "epoch": 0.45107794361525705, + "grad_norm": 0.9887938032536074, + "learning_rate": 9.032412168148345e-06, + "loss": 0.9483, + "step": 476 + }, + { + "epoch": 0.4520255863539446, + "grad_norm": 1.0713968856815155, + "learning_rate": 9.027868090926088e-06, + "loss": 0.8861, + "step": 477 + }, + { + "epoch": 0.4529732290926321, + "grad_norm": 1.162032207786328, + "learning_rate": 9.023314517202341e-06, + "loss": 0.9014, + "step": 478 + }, + { + "epoch": 0.4539208718313196, + "grad_norm": 1.135173292644661, + "learning_rate": 9.018751457713062e-06, + "loss": 0.882, + "step": 479 + }, + { + "epoch": 0.4548685145700071, + "grad_norm": 1.2191006204661359, + "learning_rate": 9.014178923216572e-06, + "loss": 0.8936, + "step": 480 + }, + { + "epoch": 0.4558161573086946, + "grad_norm": 1.1422417367554563, + "learning_rate": 9.009596924493536e-06, + "loss": 0.9046, + "step": 481 + }, + { + "epoch": 0.45676380004738215, + "grad_norm": 1.0960107607325966, + "learning_rate": 9.005005472346923e-06, + "loss": 0.8608, + "step": 482 + }, + { + "epoch": 0.4577114427860697, + "grad_norm": 1.2860608689094808, + "learning_rate": 9.000404577602003e-06, + "loss": 0.92, + "step": 483 + }, + { + "epoch": 0.45865908552475715, + "grad_norm": 1.148989084195761, + "learning_rate": 8.995794251106295e-06, + "loss": 0.9675, + "step": 484 + }, + { + "epoch": 0.45865908552475715, + "eval_loss": 0.9426133632659912, + "eval_runtime": 62.3901, + "eval_samples_per_second": 43.725, + "eval_steps_per_second": 0.689, + "step": 484 + }, + { + "epoch": 0.4596067282634447, + "grad_norm": 1.1715395816498915, + "learning_rate": 8.991174503729567e-06, + "loss": 0.9505, + "step": 485 + }, + { + "epoch": 0.4605543710021322, + "grad_norm": 1.1418428811721806, + "learning_rate": 8.986545346363792e-06, + "loss": 0.9194, + "step": 486 + }, + { + "epoch": 0.4615020137408197, + "grad_norm": 1.2704284828900592, + "learning_rate": 8.98190678992313e-06, + "loss": 0.9404, + "step": 487 + }, + { + "epoch": 0.4624496564795072, + "grad_norm": 1.4180260493906214, + "learning_rate": 8.977258845343904e-06, + "loss": 0.8881, + "step": 488 + }, + { + "epoch": 0.4633972992181947, + "grad_norm": 1.4745602251152343, + "learning_rate": 8.97260152358457e-06, + "loss": 0.8991, + "step": 489 + }, + { + "epoch": 0.46434494195688225, + "grad_norm": 1.5516611931425326, + "learning_rate": 8.96793483562569e-06, + "loss": 0.8868, + "step": 490 + }, + { + "epoch": 0.4652925846955698, + "grad_norm": 1.1672873798559753, + "learning_rate": 8.963258792469908e-06, + "loss": 0.9032, + "step": 491 + }, + { + "epoch": 0.4662402274342573, + "grad_norm": 0.9800479447492024, + "learning_rate": 8.958573405141932e-06, + "loss": 0.8875, + "step": 492 + }, + { + "epoch": 0.4671878701729448, + "grad_norm": 1.3344568834573243, + "learning_rate": 8.953878684688492e-06, + "loss": 0.8834, + "step": 493 + }, + { + "epoch": 0.4681355129116323, + "grad_norm": 1.0491821400957775, + "learning_rate": 8.949174642178333e-06, + "loss": 0.9002, + "step": 494 + }, + { + "epoch": 0.4690831556503198, + "grad_norm": 1.237676770681135, + "learning_rate": 8.944461288702166e-06, + "loss": 0.8832, + "step": 495 + }, + { + "epoch": 0.47003079838900735, + "grad_norm": 1.2759707423338387, + "learning_rate": 8.939738635372664e-06, + "loss": 0.8949, + "step": 496 + }, + { + "epoch": 0.4709784411276949, + "grad_norm": 1.1263638492681127, + "learning_rate": 8.935006693324423e-06, + "loss": 0.8969, + "step": 497 + }, + { + "epoch": 0.47192608386638235, + "grad_norm": 1.154527093025846, + "learning_rate": 8.930265473713939e-06, + "loss": 0.8759, + "step": 498 + }, + { + "epoch": 0.4728737266050699, + "grad_norm": 1.2033690454214934, + "learning_rate": 8.92551498771958e-06, + "loss": 0.9447, + "step": 499 + }, + { + "epoch": 0.4738213693437574, + "grad_norm": 1.188345342085479, + "learning_rate": 8.920755246541563e-06, + "loss": 0.9698, + "step": 500 + }, + { + "epoch": 0.47476901208244493, + "grad_norm": 1.1460258736111513, + "learning_rate": 8.91598626140193e-06, + "loss": 0.8861, + "step": 501 + }, + { + "epoch": 0.47571665482113246, + "grad_norm": 1.0983544593959635, + "learning_rate": 8.911208043544513e-06, + "loss": 0.9099, + "step": 502 + }, + { + "epoch": 0.4766642975598199, + "grad_norm": 1.2526221170984964, + "learning_rate": 8.906420604234908e-06, + "loss": 0.9153, + "step": 503 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 1.4378576625792787, + "learning_rate": 8.90162395476046e-06, + "loss": 0.9098, + "step": 504 + }, + { + "epoch": 0.478559583037195, + "grad_norm": 1.021190259086082, + "learning_rate": 8.896818106430225e-06, + "loss": 0.9201, + "step": 505 + }, + { + "epoch": 0.4795072257758825, + "grad_norm": 1.2166947590641954, + "learning_rate": 8.89200307057495e-06, + "loss": 0.9498, + "step": 506 + }, + { + "epoch": 0.4795072257758825, + "eval_loss": 0.9415593147277832, + "eval_runtime": 63.0823, + "eval_samples_per_second": 43.245, + "eval_steps_per_second": 0.682, + "step": 506 + }, + { + "epoch": 0.48045486851457003, + "grad_norm": 1.099897475733057, + "learning_rate": 8.887178858547039e-06, + "loss": 0.8785, + "step": 507 + }, + { + "epoch": 0.4814025112532575, + "grad_norm": 1.1053789477176734, + "learning_rate": 8.882345481720533e-06, + "loss": 0.9781, + "step": 508 + }, + { + "epoch": 0.48235015399194503, + "grad_norm": 1.2550219679746741, + "learning_rate": 8.877502951491083e-06, + "loss": 0.9175, + "step": 509 + }, + { + "epoch": 0.48329779673063256, + "grad_norm": 1.035777482131784, + "learning_rate": 8.872651279275917e-06, + "loss": 0.9394, + "step": 510 + }, + { + "epoch": 0.4842454394693201, + "grad_norm": 1.1823889985534881, + "learning_rate": 8.867790476513818e-06, + "loss": 0.8619, + "step": 511 + }, + { + "epoch": 0.4851930822080076, + "grad_norm": 1.0806837978842365, + "learning_rate": 8.862920554665098e-06, + "loss": 0.8847, + "step": 512 + }, + { + "epoch": 0.4861407249466951, + "grad_norm": 1.1417084903673171, + "learning_rate": 8.858041525211569e-06, + "loss": 0.8984, + "step": 513 + }, + { + "epoch": 0.4870883676853826, + "grad_norm": 1.046685136616654, + "learning_rate": 8.853153399656513e-06, + "loss": 0.9343, + "step": 514 + }, + { + "epoch": 0.48803601042407013, + "grad_norm": 1.1600934932807847, + "learning_rate": 8.848256189524661e-06, + "loss": 0.903, + "step": 515 + }, + { + "epoch": 0.48898365316275766, + "grad_norm": 0.9999805389325372, + "learning_rate": 8.843349906362163e-06, + "loss": 0.9087, + "step": 516 + }, + { + "epoch": 0.48993129590144513, + "grad_norm": 1.1693797728638526, + "learning_rate": 8.838434561736556e-06, + "loss": 0.9083, + "step": 517 + }, + { + "epoch": 0.49087893864013266, + "grad_norm": 1.1372932570585796, + "learning_rate": 8.833510167236747e-06, + "loss": 0.9713, + "step": 518 + }, + { + "epoch": 0.4918265813788202, + "grad_norm": 1.0947618440390705, + "learning_rate": 8.828576734472975e-06, + "loss": 0.8689, + "step": 519 + }, + { + "epoch": 0.4927742241175077, + "grad_norm": 1.1318492632095214, + "learning_rate": 8.823634275076792e-06, + "loss": 0.8625, + "step": 520 + }, + { + "epoch": 0.49372186685619524, + "grad_norm": 1.3142475847243504, + "learning_rate": 8.818682800701028e-06, + "loss": 0.8914, + "step": 521 + }, + { + "epoch": 0.4946695095948827, + "grad_norm": 1.0542269379359606, + "learning_rate": 8.813722323019774e-06, + "loss": 0.9204, + "step": 522 + }, + { + "epoch": 0.49561715233357023, + "grad_norm": 1.2759846986205978, + "learning_rate": 8.808752853728341e-06, + "loss": 0.9044, + "step": 523 + }, + { + "epoch": 0.49656479507225776, + "grad_norm": 1.0846144562638056, + "learning_rate": 8.803774404543246e-06, + "loss": 0.9123, + "step": 524 + }, + { + "epoch": 0.4975124378109453, + "grad_norm": 1.1086474451297028, + "learning_rate": 8.798786987202175e-06, + "loss": 0.9293, + "step": 525 + }, + { + "epoch": 0.4984600805496328, + "grad_norm": 0.9413825393223179, + "learning_rate": 8.793790613463956e-06, + "loss": 0.8654, + "step": 526 + }, + { + "epoch": 0.4994077232883203, + "grad_norm": 1.1832807749456735, + "learning_rate": 8.788785295108536e-06, + "loss": 0.8636, + "step": 527 + }, + { + "epoch": 0.5003553660270078, + "grad_norm": 1.0977629074376605, + "learning_rate": 8.783771043936949e-06, + "loss": 0.8765, + "step": 528 + }, + { + "epoch": 0.5003553660270078, + "eval_loss": 0.941301167011261, + "eval_runtime": 61.1844, + "eval_samples_per_second": 44.587, + "eval_steps_per_second": 0.703, + "step": 528 + }, + { + "epoch": 0.5013030087656953, + "grad_norm": 1.146767921801711, + "learning_rate": 8.778747871771293e-06, + "loss": 0.8989, + "step": 529 + }, + { + "epoch": 0.5022506515043829, + "grad_norm": 1.2639703543113263, + "learning_rate": 8.773715790454695e-06, + "loss": 0.9151, + "step": 530 + }, + { + "epoch": 0.5031982942430704, + "grad_norm": 1.113218960186029, + "learning_rate": 8.768674811851293e-06, + "loss": 0.8692, + "step": 531 + }, + { + "epoch": 0.5041459369817579, + "grad_norm": 0.9991478843453905, + "learning_rate": 8.763624947846195e-06, + "loss": 0.8764, + "step": 532 + }, + { + "epoch": 0.5050935797204454, + "grad_norm": 1.1051839359484277, + "learning_rate": 8.758566210345464e-06, + "loss": 0.9142, + "step": 533 + }, + { + "epoch": 0.5060412224591329, + "grad_norm": 1.5864593937619376, + "learning_rate": 8.75349861127608e-06, + "loss": 0.9167, + "step": 534 + }, + { + "epoch": 0.5069888651978204, + "grad_norm": 1.0055893256047008, + "learning_rate": 8.748422162585915e-06, + "loss": 0.9583, + "step": 535 + }, + { + "epoch": 0.5079365079365079, + "grad_norm": 1.1419438277764564, + "learning_rate": 8.743336876243712e-06, + "loss": 0.8847, + "step": 536 + }, + { + "epoch": 0.5088841506751954, + "grad_norm": 1.1093929329894858, + "learning_rate": 8.738242764239046e-06, + "loss": 0.9657, + "step": 537 + }, + { + "epoch": 0.509831793413883, + "grad_norm": 1.0924153336293334, + "learning_rate": 8.733139838582299e-06, + "loss": 0.9452, + "step": 538 + }, + { + "epoch": 0.5107794361525705, + "grad_norm": 1.100904420569305, + "learning_rate": 8.728028111304639e-06, + "loss": 0.8705, + "step": 539 + }, + { + "epoch": 0.511727078891258, + "grad_norm": 1.0377959902393181, + "learning_rate": 8.722907594457975e-06, + "loss": 0.9021, + "step": 540 + }, + { + "epoch": 0.5126747216299455, + "grad_norm": 1.3028881798201601, + "learning_rate": 8.717778300114952e-06, + "loss": 0.9004, + "step": 541 + }, + { + "epoch": 0.5136223643686331, + "grad_norm": 1.2219633113574593, + "learning_rate": 8.712640240368899e-06, + "loss": 0.9146, + "step": 542 + }, + { + "epoch": 0.5145700071073206, + "grad_norm": 1.16735139559823, + "learning_rate": 8.707493427333817e-06, + "loss": 0.9336, + "step": 543 + }, + { + "epoch": 0.515517649846008, + "grad_norm": 1.1223934613953974, + "learning_rate": 8.702337873144343e-06, + "loss": 0.8959, + "step": 544 + }, + { + "epoch": 0.5164652925846955, + "grad_norm": 1.0381379384688154, + "learning_rate": 8.697173589955724e-06, + "loss": 0.9147, + "step": 545 + }, + { + "epoch": 0.5174129353233831, + "grad_norm": 1.071551123667491, + "learning_rate": 8.692000589943785e-06, + "loss": 0.8713, + "step": 546 + }, + { + "epoch": 0.5183605780620706, + "grad_norm": 1.1572023966023732, + "learning_rate": 8.686818885304907e-06, + "loss": 0.9468, + "step": 547 + }, + { + "epoch": 0.5193082208007581, + "grad_norm": 1.0966755633051661, + "learning_rate": 8.681628488255986e-06, + "loss": 0.9746, + "step": 548 + }, + { + "epoch": 0.5202558635394456, + "grad_norm": 1.0054623347539213, + "learning_rate": 8.676429411034423e-06, + "loss": 0.889, + "step": 549 + }, + { + "epoch": 0.5212035062781332, + "grad_norm": 1.0688410228225136, + "learning_rate": 8.671221665898074e-06, + "loss": 0.8986, + "step": 550 + }, + { + "epoch": 0.5212035062781332, + "eval_loss": 0.9385759234428406, + "eval_runtime": 60.7338, + "eval_samples_per_second": 44.917, + "eval_steps_per_second": 0.708, + "step": 550 + }, + { + "epoch": 0.5221511490168207, + "grad_norm": 1.1912290308637075, + "learning_rate": 8.666005265125238e-06, + "loss": 0.9032, + "step": 551 + }, + { + "epoch": 0.5230987917555082, + "grad_norm": 1.0819840961903495, + "learning_rate": 8.660780221014617e-06, + "loss": 0.9549, + "step": 552 + }, + { + "epoch": 0.5240464344941956, + "grad_norm": 1.584365865940181, + "learning_rate": 8.655546545885294e-06, + "loss": 0.9895, + "step": 553 + }, + { + "epoch": 0.5249940772328832, + "grad_norm": 1.5230791449620116, + "learning_rate": 8.650304252076704e-06, + "loss": 0.9359, + "step": 554 + }, + { + "epoch": 0.5259417199715707, + "grad_norm": 1.2812899118028946, + "learning_rate": 8.645053351948594e-06, + "loss": 0.8863, + "step": 555 + }, + { + "epoch": 0.5268893627102582, + "grad_norm": 1.1090479481617728, + "learning_rate": 8.63979385788101e-06, + "loss": 0.9549, + "step": 556 + }, + { + "epoch": 0.5278370054489457, + "grad_norm": 1.0243309497173194, + "learning_rate": 8.63452578227426e-06, + "loss": 0.8837, + "step": 557 + }, + { + "epoch": 0.5287846481876333, + "grad_norm": 1.1652281440552321, + "learning_rate": 8.629249137548873e-06, + "loss": 0.8833, + "step": 558 + }, + { + "epoch": 0.5297322909263208, + "grad_norm": 1.0941817825792766, + "learning_rate": 8.6239639361456e-06, + "loss": 0.9423, + "step": 559 + }, + { + "epoch": 0.5306799336650083, + "grad_norm": 1.2574492154883083, + "learning_rate": 8.61867019052535e-06, + "loss": 0.9524, + "step": 560 + }, + { + "epoch": 0.5316275764036958, + "grad_norm": 1.1528975788038949, + "learning_rate": 8.613367913169188e-06, + "loss": 0.8843, + "step": 561 + }, + { + "epoch": 0.5325752191423834, + "grad_norm": 1.260334993982276, + "learning_rate": 8.608057116578283e-06, + "loss": 0.9527, + "step": 562 + }, + { + "epoch": 0.5335228618810708, + "grad_norm": 1.0336321970328701, + "learning_rate": 8.602737813273901e-06, + "loss": 0.885, + "step": 563 + }, + { + "epoch": 0.5344705046197583, + "grad_norm": 1.4071128107796536, + "learning_rate": 8.597410015797358e-06, + "loss": 0.9056, + "step": 564 + }, + { + "epoch": 0.5354181473584458, + "grad_norm": 1.3243499763253614, + "learning_rate": 8.592073736709996e-06, + "loss": 0.9816, + "step": 565 + }, + { + "epoch": 0.5363657900971334, + "grad_norm": 1.0252110946238864, + "learning_rate": 8.586728988593158e-06, + "loss": 0.8939, + "step": 566 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 1.5674480203253196, + "learning_rate": 8.581375784048154e-06, + "loss": 0.8716, + "step": 567 + }, + { + "epoch": 0.5382610755745084, + "grad_norm": 1.3373495536241256, + "learning_rate": 8.576014135696227e-06, + "loss": 0.9189, + "step": 568 + }, + { + "epoch": 0.539208718313196, + "grad_norm": 1.0083923948069164, + "learning_rate": 8.570644056178533e-06, + "loss": 0.8696, + "step": 569 + }, + { + "epoch": 0.5401563610518835, + "grad_norm": 1.134010279426964, + "learning_rate": 8.565265558156101e-06, + "loss": 0.9171, + "step": 570 + }, + { + "epoch": 0.541104003790571, + "grad_norm": 1.0122940996397913, + "learning_rate": 8.559878654309818e-06, + "loss": 0.8536, + "step": 571 + }, + { + "epoch": 0.5420516465292585, + "grad_norm": 1.0417709805855406, + "learning_rate": 8.554483357340379e-06, + "loss": 0.8757, + "step": 572 + }, + { + "epoch": 0.5420516465292585, + "eval_loss": 0.9370559453964233, + "eval_runtime": 65.2018, + "eval_samples_per_second": 41.839, + "eval_steps_per_second": 0.659, + "step": 572 + }, + { + "epoch": 0.5429992892679459, + "grad_norm": 1.098518656213201, + "learning_rate": 8.549079679968272e-06, + "loss": 0.8879, + "step": 573 + }, + { + "epoch": 0.5439469320066335, + "grad_norm": 1.212951157381051, + "learning_rate": 8.543667634933743e-06, + "loss": 0.8697, + "step": 574 + }, + { + "epoch": 0.544894574745321, + "grad_norm": 1.3330907351600239, + "learning_rate": 8.538247234996766e-06, + "loss": 0.8615, + "step": 575 + }, + { + "epoch": 0.5458422174840085, + "grad_norm": 1.2057308113799874, + "learning_rate": 8.532818492937014e-06, + "loss": 0.9033, + "step": 576 + }, + { + "epoch": 0.546789860222696, + "grad_norm": 1.1709128709827088, + "learning_rate": 8.52738142155383e-06, + "loss": 0.9136, + "step": 577 + }, + { + "epoch": 0.5477375029613836, + "grad_norm": 1.1465991381882117, + "learning_rate": 8.521936033666187e-06, + "loss": 0.9102, + "step": 578 + }, + { + "epoch": 0.5486851457000711, + "grad_norm": 1.4618014976340794, + "learning_rate": 8.51648234211268e-06, + "loss": 0.8733, + "step": 579 + }, + { + "epoch": 0.5496327884387586, + "grad_norm": 1.449685521781311, + "learning_rate": 8.511020359751467e-06, + "loss": 0.9106, + "step": 580 + }, + { + "epoch": 0.5505804311774462, + "grad_norm": 1.0171758381766154, + "learning_rate": 8.505550099460264e-06, + "loss": 0.9353, + "step": 581 + }, + { + "epoch": 0.5515280739161336, + "grad_norm": 1.290290565129861, + "learning_rate": 8.500071574136297e-06, + "loss": 0.837, + "step": 582 + }, + { + "epoch": 0.5524757166548211, + "grad_norm": 1.1275094814541378, + "learning_rate": 8.49458479669628e-06, + "loss": 0.9316, + "step": 583 + }, + { + "epoch": 0.5534233593935086, + "grad_norm": 1.762720464593278, + "learning_rate": 8.489089780076387e-06, + "loss": 0.9394, + "step": 584 + }, + { + "epoch": 0.5543710021321961, + "grad_norm": 1.227259697952017, + "learning_rate": 8.483586537232212e-06, + "loss": 0.8798, + "step": 585 + }, + { + "epoch": 0.5553186448708837, + "grad_norm": 1.1785938474090234, + "learning_rate": 8.478075081138746e-06, + "loss": 0.9288, + "step": 586 + }, + { + "epoch": 0.5562662876095712, + "grad_norm": 1.1067839714490098, + "learning_rate": 8.472555424790348e-06, + "loss": 0.833, + "step": 587 + }, + { + "epoch": 0.5572139303482587, + "grad_norm": 1.1232716949263366, + "learning_rate": 8.467027581200702e-06, + "loss": 0.9166, + "step": 588 + }, + { + "epoch": 0.5581615730869462, + "grad_norm": 1.261715047880492, + "learning_rate": 8.461491563402807e-06, + "loss": 0.9618, + "step": 589 + }, + { + "epoch": 0.5591092158256338, + "grad_norm": 1.1832942718518242, + "learning_rate": 8.455947384448926e-06, + "loss": 0.8843, + "step": 590 + }, + { + "epoch": 0.5600568585643213, + "grad_norm": 1.1707357848301445, + "learning_rate": 8.450395057410561e-06, + "loss": 0.8667, + "step": 591 + }, + { + "epoch": 0.5610045013030087, + "grad_norm": 1.051280206948217, + "learning_rate": 8.444834595378434e-06, + "loss": 0.9182, + "step": 592 + }, + { + "epoch": 0.5619521440416962, + "grad_norm": 1.5197971665007415, + "learning_rate": 8.43926601146244e-06, + "loss": 0.9023, + "step": 593 + }, + { + "epoch": 0.5628997867803838, + "grad_norm": 1.2707540574566858, + "learning_rate": 8.433689318791628e-06, + "loss": 0.936, + "step": 594 + }, + { + "epoch": 0.5628997867803838, + "eval_loss": 0.9368069767951965, + "eval_runtime": 59.0081, + "eval_samples_per_second": 46.231, + "eval_steps_per_second": 0.729, + "step": 594 + }, + { + "epoch": 0.5638474295190713, + "grad_norm": 1.1108224917689546, + "learning_rate": 8.428104530514156e-06, + "loss": 0.853, + "step": 595 + }, + { + "epoch": 0.5647950722577588, + "grad_norm": 1.039118804478871, + "learning_rate": 8.42251165979728e-06, + "loss": 0.9154, + "step": 596 + }, + { + "epoch": 0.5657427149964463, + "grad_norm": 1.0970139269789967, + "learning_rate": 8.416910719827304e-06, + "loss": 0.9166, + "step": 597 + }, + { + "epoch": 0.5666903577351339, + "grad_norm": 1.0306693005295113, + "learning_rate": 8.411301723809563e-06, + "loss": 0.9061, + "step": 598 + }, + { + "epoch": 0.5676380004738214, + "grad_norm": 2.3153529152284746, + "learning_rate": 8.405684684968383e-06, + "loss": 0.9242, + "step": 599 + }, + { + "epoch": 0.5685856432125089, + "grad_norm": 1.1053912735757487, + "learning_rate": 8.400059616547046e-06, + "loss": 0.8394, + "step": 600 + }, + { + "epoch": 0.5695332859511963, + "grad_norm": 1.2091214685314218, + "learning_rate": 8.394426531807777e-06, + "loss": 0.9289, + "step": 601 + }, + { + "epoch": 0.5704809286898839, + "grad_norm": 1.1879706774303542, + "learning_rate": 8.388785444031695e-06, + "loss": 0.9362, + "step": 602 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.2386582317865258, + "learning_rate": 8.383136366518788e-06, + "loss": 0.9061, + "step": 603 + }, + { + "epoch": 0.5723762141672589, + "grad_norm": 1.027254780148272, + "learning_rate": 8.37747931258788e-06, + "loss": 0.9291, + "step": 604 + }, + { + "epoch": 0.5733238569059464, + "grad_norm": 1.2061736249322361, + "learning_rate": 8.371814295576604e-06, + "loss": 0.9435, + "step": 605 + }, + { + "epoch": 0.574271499644634, + "grad_norm": 1.1051297934960431, + "learning_rate": 8.366141328841367e-06, + "loss": 0.9444, + "step": 606 + }, + { + "epoch": 0.5752191423833215, + "grad_norm": 1.0492890936420853, + "learning_rate": 8.360460425757316e-06, + "loss": 0.8896, + "step": 607 + }, + { + "epoch": 0.576166785122009, + "grad_norm": 1.1855288112590538, + "learning_rate": 8.354771599718313e-06, + "loss": 0.9024, + "step": 608 + }, + { + "epoch": 0.5771144278606966, + "grad_norm": 1.0894896483096521, + "learning_rate": 8.349074864136897e-06, + "loss": 0.8718, + "step": 609 + }, + { + "epoch": 0.5780620705993841, + "grad_norm": 1.1673204787473177, + "learning_rate": 8.34337023244426e-06, + "loss": 0.9477, + "step": 610 + }, + { + "epoch": 0.5790097133380715, + "grad_norm": 1.1746428108459406, + "learning_rate": 8.33765771809021e-06, + "loss": 0.9633, + "step": 611 + }, + { + "epoch": 0.579957356076759, + "grad_norm": 1.6815219702121096, + "learning_rate": 8.331937334543132e-06, + "loss": 0.9357, + "step": 612 + }, + { + "epoch": 0.5809049988154465, + "grad_norm": 1.284563514540576, + "learning_rate": 8.326209095289973e-06, + "loss": 0.9576, + "step": 613 + }, + { + "epoch": 0.5818526415541341, + "grad_norm": 1.1141153791855245, + "learning_rate": 8.320473013836197e-06, + "loss": 0.9207, + "step": 614 + }, + { + "epoch": 0.5828002842928216, + "grad_norm": 1.0820567139576633, + "learning_rate": 8.314729103705758e-06, + "loss": 0.8984, + "step": 615 + }, + { + "epoch": 0.5837479270315091, + "grad_norm": 1.0636345740480533, + "learning_rate": 8.308977378441072e-06, + "loss": 0.9086, + "step": 616 + }, + { + "epoch": 0.5837479270315091, + "eval_loss": 0.9341678619384766, + "eval_runtime": 65.7189, + "eval_samples_per_second": 41.51, + "eval_steps_per_second": 0.654, + "step": 616 + }, + { + "epoch": 0.5846955697701967, + "grad_norm": 1.3632356445316784, + "learning_rate": 8.303217851602973e-06, + "loss": 0.8918, + "step": 617 + }, + { + "epoch": 0.5856432125088842, + "grad_norm": 1.1417039643528692, + "learning_rate": 8.297450536770697e-06, + "loss": 0.8531, + "step": 618 + }, + { + "epoch": 0.5865908552475717, + "grad_norm": 1.03859128947666, + "learning_rate": 8.291675447541834e-06, + "loss": 0.8609, + "step": 619 + }, + { + "epoch": 0.5875384979862592, + "grad_norm": 1.2256793137128281, + "learning_rate": 8.285892597532311e-06, + "loss": 0.9384, + "step": 620 + }, + { + "epoch": 0.5884861407249466, + "grad_norm": 1.1848786072557997, + "learning_rate": 8.280102000376346e-06, + "loss": 0.8621, + "step": 621 + }, + { + "epoch": 0.5894337834636342, + "grad_norm": 1.0897670274263946, + "learning_rate": 8.274303669726427e-06, + "loss": 0.8895, + "step": 622 + }, + { + "epoch": 0.5903814262023217, + "grad_norm": 1.2338961521515757, + "learning_rate": 8.268497619253273e-06, + "loss": 0.9397, + "step": 623 + }, + { + "epoch": 0.5913290689410092, + "grad_norm": 1.1260558549955006, + "learning_rate": 8.262683862645804e-06, + "loss": 0.8779, + "step": 624 + }, + { + "epoch": 0.5922767116796968, + "grad_norm": 1.0331575412446614, + "learning_rate": 8.256862413611113e-06, + "loss": 0.912, + "step": 625 + }, + { + "epoch": 0.5932243544183843, + "grad_norm": 1.192716956765876, + "learning_rate": 8.25103328587442e-06, + "loss": 0.8503, + "step": 626 + }, + { + "epoch": 0.5941719971570718, + "grad_norm": 1.1264420514270421, + "learning_rate": 8.245196493179061e-06, + "loss": 0.968, + "step": 627 + }, + { + "epoch": 0.5951196398957593, + "grad_norm": 1.1664248935291284, + "learning_rate": 8.239352049286435e-06, + "loss": 0.9293, + "step": 628 + }, + { + "epoch": 0.5960672826344469, + "grad_norm": 1.165344238639824, + "learning_rate": 8.233499967975981e-06, + "loss": 0.9285, + "step": 629 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 1.1132735526032906, + "learning_rate": 8.22764026304515e-06, + "loss": 0.8583, + "step": 630 + }, + { + "epoch": 0.5979625681118218, + "grad_norm": 1.2263330088822129, + "learning_rate": 8.221772948309363e-06, + "loss": 0.8848, + "step": 631 + }, + { + "epoch": 0.5989102108505093, + "grad_norm": 1.242835973780116, + "learning_rate": 8.215898037601981e-06, + "loss": 0.9078, + "step": 632 + }, + { + "epoch": 0.5998578535891969, + "grad_norm": 1.0611322995754056, + "learning_rate": 8.210015544774279e-06, + "loss": 0.9158, + "step": 633 + }, + { + "epoch": 0.6008054963278844, + "grad_norm": 1.0776982828638144, + "learning_rate": 8.204125483695403e-06, + "loss": 0.8951, + "step": 634 + }, + { + "epoch": 0.6017531390665719, + "grad_norm": 1.1010692683885481, + "learning_rate": 8.198227868252348e-06, + "loss": 0.8796, + "step": 635 + }, + { + "epoch": 0.6027007818052594, + "grad_norm": 1.1791589543105867, + "learning_rate": 8.192322712349917e-06, + "loss": 0.8649, + "step": 636 + }, + { + "epoch": 0.603648424543947, + "grad_norm": 1.0601001331133804, + "learning_rate": 8.186410029910694e-06, + "loss": 0.9523, + "step": 637 + }, + { + "epoch": 0.6045960672826345, + "grad_norm": 1.1485122140349338, + "learning_rate": 8.180489834875e-06, + "loss": 0.9796, + "step": 638 + }, + { + "epoch": 0.6045960672826345, + "eval_loss": 0.9337397813796997, + "eval_runtime": 64.0008, + "eval_samples_per_second": 42.624, + "eval_steps_per_second": 0.672, + "step": 638 + }, + { + "epoch": 0.605543710021322, + "grad_norm": 1.164013089234412, + "learning_rate": 8.174562141200878e-06, + "loss": 0.8544, + "step": 639 + }, + { + "epoch": 0.6064913527600094, + "grad_norm": 0.9760399939930886, + "learning_rate": 8.168626962864045e-06, + "loss": 0.9098, + "step": 640 + }, + { + "epoch": 0.607438995498697, + "grad_norm": 1.2834957539805654, + "learning_rate": 8.162684313857869e-06, + "loss": 0.9297, + "step": 641 + }, + { + "epoch": 0.6083866382373845, + "grad_norm": 1.1309875805256084, + "learning_rate": 8.156734208193327e-06, + "loss": 0.8415, + "step": 642 + }, + { + "epoch": 0.609334280976072, + "grad_norm": 1.1022824206738733, + "learning_rate": 8.15077665989898e-06, + "loss": 0.89, + "step": 643 + }, + { + "epoch": 0.6102819237147595, + "grad_norm": 1.024845261979357, + "learning_rate": 8.144811683020932e-06, + "loss": 0.9135, + "step": 644 + }, + { + "epoch": 0.6112295664534471, + "grad_norm": 1.1297286487540084, + "learning_rate": 8.138839291622807e-06, + "loss": 0.9178, + "step": 645 + }, + { + "epoch": 0.6121772091921346, + "grad_norm": 1.0969400851261308, + "learning_rate": 8.132859499785708e-06, + "loss": 0.8944, + "step": 646 + }, + { + "epoch": 0.6131248519308221, + "grad_norm": 1.1361902149740226, + "learning_rate": 8.126872321608185e-06, + "loss": 0.8428, + "step": 647 + }, + { + "epoch": 0.6140724946695096, + "grad_norm": 4.987461142356876, + "learning_rate": 8.120877771206201e-06, + "loss": 0.9267, + "step": 648 + }, + { + "epoch": 0.6150201374081972, + "grad_norm": 1.1003270497805429, + "learning_rate": 8.114875862713107e-06, + "loss": 0.9126, + "step": 649 + }, + { + "epoch": 0.6159677801468846, + "grad_norm": 1.1145863010291335, + "learning_rate": 8.108866610279595e-06, + "loss": 0.9069, + "step": 650 + }, + { + "epoch": 0.6169154228855721, + "grad_norm": 1.070440031809025, + "learning_rate": 8.102850028073674e-06, + "loss": 0.9805, + "step": 651 + }, + { + "epoch": 0.6178630656242596, + "grad_norm": 1.1878084074243875, + "learning_rate": 8.09682613028064e-06, + "loss": 0.8608, + "step": 652 + }, + { + "epoch": 0.6188107083629472, + "grad_norm": 1.1666517787266597, + "learning_rate": 8.090794931103026e-06, + "loss": 0.8649, + "step": 653 + }, + { + "epoch": 0.6197583511016347, + "grad_norm": 1.1109949887709072, + "learning_rate": 8.08475644476059e-06, + "loss": 0.8555, + "step": 654 + }, + { + "epoch": 0.6207059938403222, + "grad_norm": 1.1293611851504917, + "learning_rate": 8.078710685490266e-06, + "loss": 0.9048, + "step": 655 + }, + { + "epoch": 0.6216536365790097, + "grad_norm": 1.0517383761314782, + "learning_rate": 8.072657667546136e-06, + "loss": 0.8665, + "step": 656 + }, + { + "epoch": 0.6226012793176973, + "grad_norm": 1.143650330668765, + "learning_rate": 8.066597405199393e-06, + "loss": 0.8833, + "step": 657 + }, + { + "epoch": 0.6235489220563848, + "grad_norm": 1.2088279253982825, + "learning_rate": 8.060529912738316e-06, + "loss": 0.9369, + "step": 658 + }, + { + "epoch": 0.6244965647950722, + "grad_norm": 1.1771192147073226, + "learning_rate": 8.054455204468225e-06, + "loss": 0.8912, + "step": 659 + }, + { + "epoch": 0.6254442075337597, + "grad_norm": 0.9872215985965054, + "learning_rate": 8.048373294711455e-06, + "loss": 0.8272, + "step": 660 + }, + { + "epoch": 0.6254442075337597, + "eval_loss": 0.9312112927436829, + "eval_runtime": 61.3917, + "eval_samples_per_second": 44.436, + "eval_steps_per_second": 0.7, + "step": 660 + }, + { + "epoch": 0.6263918502724473, + "grad_norm": 1.112849369485224, + "learning_rate": 8.042284197807323e-06, + "loss": 0.8914, + "step": 661 + }, + { + "epoch": 0.6273394930111348, + "grad_norm": 1.1777170728187258, + "learning_rate": 8.036187928112087e-06, + "loss": 0.8983, + "step": 662 + }, + { + "epoch": 0.6282871357498223, + "grad_norm": 1.1537880977099835, + "learning_rate": 8.030084499998916e-06, + "loss": 0.8823, + "step": 663 + }, + { + "epoch": 0.6292347784885098, + "grad_norm": 1.1620930961053082, + "learning_rate": 8.023973927857857e-06, + "loss": 0.9361, + "step": 664 + }, + { + "epoch": 0.6301824212271974, + "grad_norm": 1.3868544628160782, + "learning_rate": 8.017856226095804e-06, + "loss": 0.9183, + "step": 665 + }, + { + "epoch": 0.6311300639658849, + "grad_norm": 1.1115391562362278, + "learning_rate": 8.011731409136454e-06, + "loss": 0.8678, + "step": 666 + }, + { + "epoch": 0.6320777067045724, + "grad_norm": 1.1189548409555135, + "learning_rate": 8.005599491420288e-06, + "loss": 0.9562, + "step": 667 + }, + { + "epoch": 0.6330253494432599, + "grad_norm": 1.1551587897622888, + "learning_rate": 7.99946048740452e-06, + "loss": 0.9742, + "step": 668 + }, + { + "epoch": 0.6339729921819474, + "grad_norm": 0.9985847295223576, + "learning_rate": 7.993314411563075e-06, + "loss": 0.8763, + "step": 669 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.9938954837958673, + "learning_rate": 7.987161278386555e-06, + "loss": 0.8941, + "step": 670 + }, + { + "epoch": 0.6358682776593224, + "grad_norm": 1.2517564440987765, + "learning_rate": 7.981001102382192e-06, + "loss": 0.8922, + "step": 671 + }, + { + "epoch": 0.6368159203980099, + "grad_norm": 1.669042851630183, + "learning_rate": 7.974833898073832e-06, + "loss": 0.8734, + "step": 672 + }, + { + "epoch": 0.6377635631366975, + "grad_norm": 1.733742728719525, + "learning_rate": 7.968659680001887e-06, + "loss": 0.9224, + "step": 673 + }, + { + "epoch": 0.638711205875385, + "grad_norm": 1.4086875008087318, + "learning_rate": 7.962478462723306e-06, + "loss": 0.8862, + "step": 674 + }, + { + "epoch": 0.6396588486140725, + "grad_norm": 1.118275938120274, + "learning_rate": 7.95629026081154e-06, + "loss": 0.9075, + "step": 675 + }, + { + "epoch": 0.64060649135276, + "grad_norm": 1.2853033943409442, + "learning_rate": 7.950095088856509e-06, + "loss": 0.857, + "step": 676 + }, + { + "epoch": 0.6415541340914476, + "grad_norm": 1.0400548366155555, + "learning_rate": 7.943892961464562e-06, + "loss": 0.9434, + "step": 677 + }, + { + "epoch": 0.642501776830135, + "grad_norm": 1.3041391262819717, + "learning_rate": 7.937683893258454e-06, + "loss": 0.9685, + "step": 678 + }, + { + "epoch": 0.6434494195688225, + "grad_norm": 1.1598673981069736, + "learning_rate": 7.931467898877298e-06, + "loss": 0.8632, + "step": 679 + }, + { + "epoch": 0.64439706230751, + "grad_norm": 1.0009937654408843, + "learning_rate": 7.925244992976538e-06, + "loss": 0.8824, + "step": 680 + }, + { + "epoch": 0.6453447050461976, + "grad_norm": 1.017837058069719, + "learning_rate": 7.919015190227919e-06, + "loss": 0.8505, + "step": 681 + }, + { + "epoch": 0.6462923477848851, + "grad_norm": 1.1641241757138032, + "learning_rate": 7.912778505319436e-06, + "loss": 0.8432, + "step": 682 + }, + { + "epoch": 0.6462923477848851, + "eval_loss": 0.9309917688369751, + "eval_runtime": 60.3302, + "eval_samples_per_second": 45.218, + "eval_steps_per_second": 0.713, + "step": 682 + }, + { + "epoch": 0.6472399905235726, + "grad_norm": 1.1169534397607939, + "learning_rate": 7.906534952955321e-06, + "loss": 0.9085, + "step": 683 + }, + { + "epoch": 0.6481876332622601, + "grad_norm": 1.0900655365751404, + "learning_rate": 7.900284547855992e-06, + "loss": 0.9411, + "step": 684 + }, + { + "epoch": 0.6491352760009477, + "grad_norm": 1.2161831558733007, + "learning_rate": 7.894027304758023e-06, + "loss": 0.8769, + "step": 685 + }, + { + "epoch": 0.6500829187396352, + "grad_norm": 1.044138736537594, + "learning_rate": 7.88776323841411e-06, + "loss": 0.9436, + "step": 686 + }, + { + "epoch": 0.6510305614783227, + "grad_norm": 1.0705430979469939, + "learning_rate": 7.88149236359304e-06, + "loss": 0.8941, + "step": 687 + }, + { + "epoch": 0.6519782042170101, + "grad_norm": 1.3845056323680385, + "learning_rate": 7.875214695079647e-06, + "loss": 0.9501, + "step": 688 + }, + { + "epoch": 0.6529258469556977, + "grad_norm": 1.0170616350914143, + "learning_rate": 7.868930247674787e-06, + "loss": 0.9, + "step": 689 + }, + { + "epoch": 0.6538734896943852, + "grad_norm": 1.0921009934181993, + "learning_rate": 7.862639036195298e-06, + "loss": 0.9174, + "step": 690 + }, + { + "epoch": 0.6548211324330727, + "grad_norm": 1.218634642701156, + "learning_rate": 7.856341075473963e-06, + "loss": 0.9376, + "step": 691 + }, + { + "epoch": 0.6557687751717602, + "grad_norm": 0.9907566710047155, + "learning_rate": 7.850036380359479e-06, + "loss": 0.8849, + "step": 692 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 1.0543716934739653, + "learning_rate": 7.843724965716419e-06, + "loss": 0.9345, + "step": 693 + }, + { + "epoch": 0.6576640606491353, + "grad_norm": 0.9814925522801817, + "learning_rate": 7.837406846425205e-06, + "loss": 0.8675, + "step": 694 + }, + { + "epoch": 0.6586117033878228, + "grad_norm": 1.192089061098573, + "learning_rate": 7.831082037382057e-06, + "loss": 0.9501, + "step": 695 + }, + { + "epoch": 0.6595593461265103, + "grad_norm": 1.1020254975949058, + "learning_rate": 7.824750553498977e-06, + "loss": 0.9811, + "step": 696 + }, + { + "epoch": 0.6605069888651979, + "grad_norm": 1.1111842691891292, + "learning_rate": 7.818412409703695e-06, + "loss": 0.9328, + "step": 697 + }, + { + "epoch": 0.6614546316038853, + "grad_norm": 1.1347234782175453, + "learning_rate": 7.812067620939653e-06, + "loss": 0.9614, + "step": 698 + }, + { + "epoch": 0.6624022743425728, + "grad_norm": 1.065352923892995, + "learning_rate": 7.805716202165949e-06, + "loss": 0.8818, + "step": 699 + }, + { + "epoch": 0.6633499170812603, + "grad_norm": 1.1030004540487208, + "learning_rate": 7.799358168357323e-06, + "loss": 0.8465, + "step": 700 + }, + { + "epoch": 0.6642975598199479, + "grad_norm": 0.9089874996068653, + "learning_rate": 7.792993534504103e-06, + "loss": 0.9243, + "step": 701 + }, + { + "epoch": 0.6652452025586354, + "grad_norm": 1.2054269196194143, + "learning_rate": 7.786622315612182e-06, + "loss": 0.8688, + "step": 702 + }, + { + "epoch": 0.6661928452973229, + "grad_norm": 1.2746014427627002, + "learning_rate": 7.78024452670298e-06, + "loss": 0.9181, + "step": 703 + }, + { + "epoch": 0.6671404880360104, + "grad_norm": 1.077600022154621, + "learning_rate": 7.773860182813404e-06, + "loss": 0.8492, + "step": 704 + }, + { + "epoch": 0.6671404880360104, + "eval_loss": 0.928534984588623, + "eval_runtime": 64.6468, + "eval_samples_per_second": 42.199, + "eval_steps_per_second": 0.665, + "step": 704 + }, + { + "epoch": 0.668088130774698, + "grad_norm": 1.033501298641634, + "learning_rate": 7.767469298995813e-06, + "loss": 0.8854, + "step": 705 + }, + { + "epoch": 0.6690357735133855, + "grad_norm": 0.9867061772593498, + "learning_rate": 7.761071890317994e-06, + "loss": 0.8431, + "step": 706 + }, + { + "epoch": 0.6699834162520729, + "grad_norm": 1.1600752814036814, + "learning_rate": 7.754667971863112e-06, + "loss": 0.9133, + "step": 707 + }, + { + "epoch": 0.6709310589907604, + "grad_norm": 1.1443042381662363, + "learning_rate": 7.748257558729677e-06, + "loss": 0.9184, + "step": 708 + }, + { + "epoch": 0.671878701729448, + "grad_norm": 1.1577240117048646, + "learning_rate": 7.741840666031517e-06, + "loss": 0.8738, + "step": 709 + }, + { + "epoch": 0.6728263444681355, + "grad_norm": 1.308361165359508, + "learning_rate": 7.735417308897737e-06, + "loss": 0.8414, + "step": 710 + }, + { + "epoch": 0.673773987206823, + "grad_norm": 1.1739658962601087, + "learning_rate": 7.728987502472678e-06, + "loss": 0.8551, + "step": 711 + }, + { + "epoch": 0.6747216299455105, + "grad_norm": 1.0699527544498066, + "learning_rate": 7.72255126191589e-06, + "loss": 0.8514, + "step": 712 + }, + { + "epoch": 0.6756692726841981, + "grad_norm": 1.481959370807152, + "learning_rate": 7.716108602402094e-06, + "loss": 0.8944, + "step": 713 + }, + { + "epoch": 0.6766169154228856, + "grad_norm": 1.1240913687798249, + "learning_rate": 7.709659539121144e-06, + "loss": 0.8578, + "step": 714 + }, + { + "epoch": 0.6775645581615731, + "grad_norm": 1.1176326008647557, + "learning_rate": 7.703204087277989e-06, + "loss": 0.9374, + "step": 715 + }, + { + "epoch": 0.6785122009002607, + "grad_norm": 1.2863389842208885, + "learning_rate": 7.696742262092643e-06, + "loss": 0.8846, + "step": 716 + }, + { + "epoch": 0.6794598436389481, + "grad_norm": 1.3631041557393186, + "learning_rate": 7.690274078800148e-06, + "loss": 0.8766, + "step": 717 + }, + { + "epoch": 0.6804074863776356, + "grad_norm": 1.4470714060969805, + "learning_rate": 7.683799552650534e-06, + "loss": 0.9231, + "step": 718 + }, + { + "epoch": 0.6813551291163231, + "grad_norm": 1.1150660700565789, + "learning_rate": 7.677318698908788e-06, + "loss": 0.8391, + "step": 719 + }, + { + "epoch": 0.6823027718550106, + "grad_norm": 1.1713456925403845, + "learning_rate": 7.670831532854811e-06, + "loss": 0.9214, + "step": 720 + }, + { + "epoch": 0.6832504145936982, + "grad_norm": 1.219437200101105, + "learning_rate": 7.66433806978339e-06, + "loss": 0.8944, + "step": 721 + }, + { + "epoch": 0.6841980573323857, + "grad_norm": 1.2287162936444869, + "learning_rate": 7.65783832500416e-06, + "loss": 0.8854, + "step": 722 + }, + { + "epoch": 0.6851457000710732, + "grad_norm": 1.344476187588291, + "learning_rate": 7.651332313841562e-06, + "loss": 0.8488, + "step": 723 + }, + { + "epoch": 0.6860933428097608, + "grad_norm": 1.0371186151252827, + "learning_rate": 7.644820051634813e-06, + "loss": 0.8473, + "step": 724 + }, + { + "epoch": 0.6870409855484483, + "grad_norm": 1.1863536224030726, + "learning_rate": 7.638301553737871e-06, + "loss": 0.9155, + "step": 725 + }, + { + "epoch": 0.6879886282871357, + "grad_norm": 1.200025733783506, + "learning_rate": 7.63177683551939e-06, + "loss": 0.8828, + "step": 726 + }, + { + "epoch": 0.6879886282871357, + "eval_loss": 0.9281173944473267, + "eval_runtime": 65.2556, + "eval_samples_per_second": 41.805, + "eval_steps_per_second": 0.659, + "step": 726 + }, + { + "epoch": 0.6889362710258232, + "grad_norm": 1.1992637929718537, + "learning_rate": 7.625245912362699e-06, + "loss": 0.87, + "step": 727 + }, + { + "epoch": 0.6898839137645107, + "grad_norm": 1.0955940037433252, + "learning_rate": 7.618708799665745e-06, + "loss": 0.8636, + "step": 728 + }, + { + "epoch": 0.6908315565031983, + "grad_norm": 1.0564308744295217, + "learning_rate": 7.612165512841076e-06, + "loss": 0.9153, + "step": 729 + }, + { + "epoch": 0.6917791992418858, + "grad_norm": 1.2470763038912207, + "learning_rate": 7.605616067315793e-06, + "loss": 0.9199, + "step": 730 + }, + { + "epoch": 0.6927268419805733, + "grad_norm": 1.2122791323172828, + "learning_rate": 7.599060478531519e-06, + "loss": 0.9248, + "step": 731 + }, + { + "epoch": 0.6936744847192609, + "grad_norm": 1.097709504783866, + "learning_rate": 7.592498761944363e-06, + "loss": 0.8689, + "step": 732 + }, + { + "epoch": 0.6946221274579484, + "grad_norm": 1.230662404513117, + "learning_rate": 7.585930933024874e-06, + "loss": 0.9021, + "step": 733 + }, + { + "epoch": 0.6955697701966359, + "grad_norm": 1.0146506247183977, + "learning_rate": 7.579357007258022e-06, + "loss": 0.9065, + "step": 734 + }, + { + "epoch": 0.6965174129353234, + "grad_norm": 1.1522681830781554, + "learning_rate": 7.572777000143145e-06, + "loss": 0.8689, + "step": 735 + }, + { + "epoch": 0.6974650556740108, + "grad_norm": 1.1876718763825516, + "learning_rate": 7.56619092719392e-06, + "loss": 0.8553, + "step": 736 + }, + { + "epoch": 0.6984126984126984, + "grad_norm": 1.254710090679274, + "learning_rate": 7.559598803938328e-06, + "loss": 0.8994, + "step": 737 + }, + { + "epoch": 0.6993603411513859, + "grad_norm": 1.3772630607131078, + "learning_rate": 7.5530006459186115e-06, + "loss": 0.9072, + "step": 738 + }, + { + "epoch": 0.7003079838900734, + "grad_norm": 1.2658717437982785, + "learning_rate": 7.546396468691241e-06, + "loss": 0.8704, + "step": 739 + }, + { + "epoch": 0.701255626628761, + "grad_norm": 1.051563030074167, + "learning_rate": 7.539786287826885e-06, + "loss": 0.9211, + "step": 740 + }, + { + "epoch": 0.7022032693674485, + "grad_norm": 1.0078569587527633, + "learning_rate": 7.533170118910356e-06, + "loss": 0.8865, + "step": 741 + }, + { + "epoch": 0.703150912106136, + "grad_norm": 1.2344336456961045, + "learning_rate": 7.526547977540592e-06, + "loss": 0.9072, + "step": 742 + }, + { + "epoch": 0.7040985548448235, + "grad_norm": 1.1416986384728778, + "learning_rate": 7.5199198793306135e-06, + "loss": 0.873, + "step": 743 + }, + { + "epoch": 0.7050461975835111, + "grad_norm": 1.0678982977792042, + "learning_rate": 7.51328583990748e-06, + "loss": 0.913, + "step": 744 + }, + { + "epoch": 0.7059938403221986, + "grad_norm": 1.1437093215592007, + "learning_rate": 7.506645874912264e-06, + "loss": 0.9799, + "step": 745 + }, + { + "epoch": 0.706941483060886, + "grad_norm": 1.0498302149495389, + "learning_rate": 7.500000000000001e-06, + "loss": 0.9027, + "step": 746 + }, + { + "epoch": 0.7078891257995735, + "grad_norm": 2.056830676476333, + "learning_rate": 7.4933482308396686e-06, + "loss": 0.8287, + "step": 747 + }, + { + "epoch": 0.708836768538261, + "grad_norm": 1.053677680621179, + "learning_rate": 7.486690583114137e-06, + "loss": 0.9333, + "step": 748 + }, + { + "epoch": 0.708836768538261, + "eval_loss": 0.92720627784729, + "eval_runtime": 65.9587, + "eval_samples_per_second": 41.359, + "eval_steps_per_second": 0.652, + "step": 748 + }, + { + "epoch": 0.7097844112769486, + "grad_norm": 1.5701143667704405, + "learning_rate": 7.480027072520137e-06, + "loss": 0.8974, + "step": 749 + }, + { + "epoch": 0.7107320540156361, + "grad_norm": 1.235022340393013, + "learning_rate": 7.473357714768222e-06, + "loss": 0.9207, + "step": 750 + }, + { + "epoch": 0.7116796967543236, + "grad_norm": 1.1141250146076045, + "learning_rate": 7.466682525582732e-06, + "loss": 0.8674, + "step": 751 + }, + { + "epoch": 0.7126273394930112, + "grad_norm": 1.1886209864097594, + "learning_rate": 7.460001520701756e-06, + "loss": 0.8858, + "step": 752 + }, + { + "epoch": 0.7135749822316987, + "grad_norm": 1.1322038451733678, + "learning_rate": 7.453314715877094e-06, + "loss": 0.843, + "step": 753 + }, + { + "epoch": 0.7145226249703862, + "grad_norm": 1.0033173758679002, + "learning_rate": 7.446622126874219e-06, + "loss": 0.8674, + "step": 754 + }, + { + "epoch": 0.7154702677090736, + "grad_norm": 1.1688006539249634, + "learning_rate": 7.439923769472244e-06, + "loss": 0.8825, + "step": 755 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 1.306296172244326, + "learning_rate": 7.4332196594638815e-06, + "loss": 0.9753, + "step": 756 + }, + { + "epoch": 0.7173655531864487, + "grad_norm": 0.9730531759417155, + "learning_rate": 7.4265098126554065e-06, + "loss": 0.8617, + "step": 757 + }, + { + "epoch": 0.7183131959251362, + "grad_norm": 1.385852128592582, + "learning_rate": 7.419794244866619e-06, + "loss": 0.8978, + "step": 758 + }, + { + "epoch": 0.7192608386638237, + "grad_norm": 1.0569360851945016, + "learning_rate": 7.413072971930807e-06, + "loss": 0.9421, + "step": 759 + }, + { + "epoch": 0.7202084814025113, + "grad_norm": 0.9964478050464701, + "learning_rate": 7.406346009694713e-06, + "loss": 0.85, + "step": 760 + }, + { + "epoch": 0.7211561241411988, + "grad_norm": 1.193990300075423, + "learning_rate": 7.39961337401849e-06, + "loss": 0.8751, + "step": 761 + }, + { + "epoch": 0.7221037668798863, + "grad_norm": 2.0570268630743156, + "learning_rate": 7.3928750807756656e-06, + "loss": 0.9026, + "step": 762 + }, + { + "epoch": 0.7230514096185738, + "grad_norm": 40.56532136308673, + "learning_rate": 7.386131145853111e-06, + "loss": 0.858, + "step": 763 + }, + { + "epoch": 0.7239990523572614, + "grad_norm": 1.7276587552930305, + "learning_rate": 7.379381585150997e-06, + "loss": 0.8743, + "step": 764 + }, + { + "epoch": 0.7249466950959488, + "grad_norm": 1.1917953134999557, + "learning_rate": 7.372626414582754e-06, + "loss": 0.9486, + "step": 765 + }, + { + "epoch": 0.7258943378346363, + "grad_norm": 1.1432814350506717, + "learning_rate": 7.365865650075046e-06, + "loss": 0.9477, + "step": 766 + }, + { + "epoch": 0.7268419805733238, + "grad_norm": 1.09550223259366, + "learning_rate": 7.359099307567721e-06, + "loss": 0.9092, + "step": 767 + }, + { + "epoch": 0.7277896233120114, + "grad_norm": 1.150998457087419, + "learning_rate": 7.352327403013779e-06, + "loss": 0.8982, + "step": 768 + }, + { + "epoch": 0.7287372660506989, + "grad_norm": 1.108499327493803, + "learning_rate": 7.345549952379334e-06, + "loss": 0.9682, + "step": 769 + }, + { + "epoch": 0.7296849087893864, + "grad_norm": 1.153466896232557, + "learning_rate": 7.338766971643579e-06, + "loss": 0.8988, + "step": 770 + }, + { + "epoch": 0.7296849087893864, + "eval_loss": 0.926575779914856, + "eval_runtime": 68.8528, + "eval_samples_per_second": 39.621, + "eval_steps_per_second": 0.625, + "step": 770 + }, + { + "epoch": 0.7306325515280739, + "grad_norm": 1.0796573599853159, + "learning_rate": 7.331978476798738e-06, + "loss": 0.8149, + "step": 771 + }, + { + "epoch": 0.7315801942667615, + "grad_norm": 1.0931539077757213, + "learning_rate": 7.325184483850043e-06, + "loss": 0.9123, + "step": 772 + }, + { + "epoch": 0.732527837005449, + "grad_norm": 1.4182919844521371, + "learning_rate": 7.318385008815686e-06, + "loss": 0.95, + "step": 773 + }, + { + "epoch": 0.7334754797441365, + "grad_norm": 1.2970117028124304, + "learning_rate": 7.311580067726783e-06, + "loss": 0.8689, + "step": 774 + }, + { + "epoch": 0.7344231224828239, + "grad_norm": 1.0927364681801075, + "learning_rate": 7.304769676627339e-06, + "loss": 0.8769, + "step": 775 + }, + { + "epoch": 0.7353707652215115, + "grad_norm": 1.0964389214826673, + "learning_rate": 7.297953851574207e-06, + "loss": 0.9555, + "step": 776 + }, + { + "epoch": 0.736318407960199, + "grad_norm": 1.0832681682992362, + "learning_rate": 7.291132608637053e-06, + "loss": 0.9345, + "step": 777 + }, + { + "epoch": 0.7372660506988865, + "grad_norm": 1.227070674353931, + "learning_rate": 7.284305963898315e-06, + "loss": 0.8685, + "step": 778 + }, + { + "epoch": 0.738213693437574, + "grad_norm": 3.9684909478672976, + "learning_rate": 7.27747393345317e-06, + "loss": 0.8362, + "step": 779 + }, + { + "epoch": 0.7391613361762616, + "grad_norm": 1.4889622029850116, + "learning_rate": 7.270636533409491e-06, + "loss": 0.9391, + "step": 780 + }, + { + "epoch": 0.7401089789149491, + "grad_norm": 1.1829326412392815, + "learning_rate": 7.2637937798878085e-06, + "loss": 0.9182, + "step": 781 + }, + { + "epoch": 0.7410566216536366, + "grad_norm": 1.190511352024864, + "learning_rate": 7.25694568902128e-06, + "loss": 0.8477, + "step": 782 + }, + { + "epoch": 0.7420042643923241, + "grad_norm": 1.3407968776745007, + "learning_rate": 7.250092276955642e-06, + "loss": 0.8861, + "step": 783 + }, + { + "epoch": 0.7429519071310116, + "grad_norm": 1.0306033495447815, + "learning_rate": 7.243233559849179e-06, + "loss": 0.8723, + "step": 784 + }, + { + "epoch": 0.7438995498696991, + "grad_norm": 1.032943199516836, + "learning_rate": 7.236369553872684e-06, + "loss": 0.8848, + "step": 785 + }, + { + "epoch": 0.7448471926083866, + "grad_norm": 1.1806035892508238, + "learning_rate": 7.229500275209418e-06, + "loss": 0.9254, + "step": 786 + }, + { + "epoch": 0.7457948353470741, + "grad_norm": 1.1412881455081196, + "learning_rate": 7.222625740055072e-06, + "loss": 0.8766, + "step": 787 + }, + { + "epoch": 0.7467424780857617, + "grad_norm": 1.1293924713464558, + "learning_rate": 7.215745964617737e-06, + "loss": 0.8932, + "step": 788 + }, + { + "epoch": 0.7476901208244492, + "grad_norm": 1.0502148008128274, + "learning_rate": 7.2088609651178505e-06, + "loss": 0.8693, + "step": 789 + }, + { + "epoch": 0.7486377635631367, + "grad_norm": 1.1691866376909303, + "learning_rate": 7.201970757788172e-06, + "loss": 0.9133, + "step": 790 + }, + { + "epoch": 0.7495854063018242, + "grad_norm": 1.673809188183371, + "learning_rate": 7.195075358873738e-06, + "loss": 0.8997, + "step": 791 + }, + { + "epoch": 0.7505330490405118, + "grad_norm": 1.136212481513878, + "learning_rate": 7.188174784631824e-06, + "loss": 0.8343, + "step": 792 + }, + { + "epoch": 0.7505330490405118, + "eval_loss": 0.925286591053009, + "eval_runtime": 67.6399, + "eval_samples_per_second": 40.331, + "eval_steps_per_second": 0.636, + "step": 792 + }, + { + "epoch": 0.7514806917791993, + "grad_norm": 1.17472555336564, + "learning_rate": 7.18126905133191e-06, + "loss": 0.8889, + "step": 793 + }, + { + "epoch": 0.7524283345178867, + "grad_norm": 1.1001788390092748, + "learning_rate": 7.174358175255636e-06, + "loss": 0.8534, + "step": 794 + }, + { + "epoch": 0.7533759772565742, + "grad_norm": 1.465010157671844, + "learning_rate": 7.1674421726967704e-06, + "loss": 0.8603, + "step": 795 + }, + { + "epoch": 0.7543236199952618, + "grad_norm": 1.1953256640932477, + "learning_rate": 7.160521059961169e-06, + "loss": 0.8345, + "step": 796 + }, + { + "epoch": 0.7552712627339493, + "grad_norm": 3.8996540248644043, + "learning_rate": 7.153594853366731e-06, + "loss": 0.8398, + "step": 797 + }, + { + "epoch": 0.7562189054726368, + "grad_norm": 1.245025773648245, + "learning_rate": 7.14666356924337e-06, + "loss": 0.9068, + "step": 798 + }, + { + "epoch": 0.7571665482113243, + "grad_norm": 1.084110814960542, + "learning_rate": 7.1397272239329684e-06, + "loss": 0.881, + "step": 799 + }, + { + "epoch": 0.7581141909500119, + "grad_norm": 1.4178645946238013, + "learning_rate": 7.132785833789344e-06, + "loss": 0.9458, + "step": 800 + }, + { + "epoch": 0.7590618336886994, + "grad_norm": 1.0724390633923235, + "learning_rate": 7.125839415178204e-06, + "loss": 0.8477, + "step": 801 + }, + { + "epoch": 0.7600094764273869, + "grad_norm": 1.1299800999994787, + "learning_rate": 7.118887984477116e-06, + "loss": 0.8842, + "step": 802 + }, + { + "epoch": 0.7609571191660743, + "grad_norm": 1.1509151559671802, + "learning_rate": 7.111931558075465e-06, + "loss": 0.8459, + "step": 803 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 1.154455319378924, + "learning_rate": 7.104970152374405e-06, + "loss": 0.9082, + "step": 804 + }, + { + "epoch": 0.7628524046434494, + "grad_norm": 1.15539942235384, + "learning_rate": 7.098003783786844e-06, + "loss": 0.9114, + "step": 805 + }, + { + "epoch": 0.7638000473821369, + "grad_norm": 1.1800047790109787, + "learning_rate": 7.091032468737382e-06, + "loss": 0.8608, + "step": 806 + }, + { + "epoch": 0.7647476901208244, + "grad_norm": 1.1477444619292674, + "learning_rate": 7.084056223662282e-06, + "loss": 0.8842, + "step": 807 + }, + { + "epoch": 0.765695332859512, + "grad_norm": 1.1737093276570716, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.9007, + "step": 808 + }, + { + "epoch": 0.7666429755981995, + "grad_norm": 1.0202933754062562, + "learning_rate": 7.070089009238306e-06, + "loss": 0.9234, + "step": 809 + }, + { + "epoch": 0.767590618336887, + "grad_norm": 1.0550807990536413, + "learning_rate": 7.063098072819919e-06, + "loss": 0.8696, + "step": 810 + }, + { + "epoch": 0.7685382610755745, + "grad_norm": 1.0531655916431704, + "learning_rate": 7.056102272236799e-06, + "loss": 0.8853, + "step": 811 + }, + { + "epoch": 0.7694859038142621, + "grad_norm": 1.2354614191985032, + "learning_rate": 7.049101623982938e-06, + "loss": 0.883, + "step": 812 + }, + { + "epoch": 0.7704335465529495, + "grad_norm": 0.9726437070709893, + "learning_rate": 7.04209614456376e-06, + "loss": 0.9153, + "step": 813 + }, + { + "epoch": 0.771381189291637, + "grad_norm": 2.241432047297512, + "learning_rate": 7.035085850496079e-06, + "loss": 0.94, + "step": 814 + }, + { + "epoch": 0.771381189291637, + "eval_loss": 0.9247336387634277, + "eval_runtime": 60.7578, + "eval_samples_per_second": 44.9, + "eval_steps_per_second": 0.708, + "step": 814 + }, + { + "epoch": 0.7723288320303245, + "grad_norm": 1.1356145449628114, + "learning_rate": 7.028070758308059e-06, + "loss": 0.8219, + "step": 815 + }, + { + "epoch": 0.7732764747690121, + "grad_norm": 1.1079321851319959, + "learning_rate": 7.021050884539178e-06, + "loss": 0.8588, + "step": 816 + }, + { + "epoch": 0.7742241175076996, + "grad_norm": 1.3040268308315734, + "learning_rate": 7.014026245740185e-06, + "loss": 0.8419, + "step": 817 + }, + { + "epoch": 0.7751717602463871, + "grad_norm": 1.230458455518736, + "learning_rate": 7.006996858473068e-06, + "loss": 0.9624, + "step": 818 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 0.9567806690635783, + "learning_rate": 6.999962739311008e-06, + "loss": 0.8194, + "step": 819 + }, + { + "epoch": 0.7770670457237622, + "grad_norm": 1.243710347278979, + "learning_rate": 6.992923904838341e-06, + "loss": 0.8955, + "step": 820 + }, + { + "epoch": 0.7780146884624497, + "grad_norm": 1.0335878204965474, + "learning_rate": 6.98588037165052e-06, + "loss": 0.9045, + "step": 821 + }, + { + "epoch": 0.7789623312011372, + "grad_norm": 1.0941302857809432, + "learning_rate": 6.97883215635408e-06, + "loss": 0.8902, + "step": 822 + }, + { + "epoch": 0.7799099739398246, + "grad_norm": 1.123840857900373, + "learning_rate": 6.971779275566593e-06, + "loss": 0.8913, + "step": 823 + }, + { + "epoch": 0.7808576166785122, + "grad_norm": 1.1195400281687917, + "learning_rate": 6.96472174591663e-06, + "loss": 0.8474, + "step": 824 + }, + { + "epoch": 0.7818052594171997, + "grad_norm": 1.3420603070760255, + "learning_rate": 6.957659584043724e-06, + "loss": 0.9077, + "step": 825 + }, + { + "epoch": 0.7827529021558872, + "grad_norm": 1.0041194424246165, + "learning_rate": 6.9505928065983275e-06, + "loss": 0.9597, + "step": 826 + }, + { + "epoch": 0.7837005448945747, + "grad_norm": 1.0270969195404063, + "learning_rate": 6.943521430241777e-06, + "loss": 0.8403, + "step": 827 + }, + { + "epoch": 0.7846481876332623, + "grad_norm": 1.1552904131971864, + "learning_rate": 6.936445471646249e-06, + "loss": 0.9044, + "step": 828 + }, + { + "epoch": 0.7855958303719498, + "grad_norm": 1.304060055980435, + "learning_rate": 6.929364947494729e-06, + "loss": 0.9, + "step": 829 + }, + { + "epoch": 0.7865434731106373, + "grad_norm": 1.210030944302157, + "learning_rate": 6.922279874480959e-06, + "loss": 0.9113, + "step": 830 + }, + { + "epoch": 0.7874911158493249, + "grad_norm": 1.025883162070808, + "learning_rate": 6.915190269309416e-06, + "loss": 0.9074, + "step": 831 + }, + { + "epoch": 0.7884387585880123, + "grad_norm": 1.1256642999783826, + "learning_rate": 6.908096148695251e-06, + "loss": 0.9119, + "step": 832 + }, + { + "epoch": 0.7893864013266998, + "grad_norm": 1.0860563647829231, + "learning_rate": 6.900997529364269e-06, + "loss": 0.9093, + "step": 833 + }, + { + "epoch": 0.7903340440653873, + "grad_norm": 1.0010442291598631, + "learning_rate": 6.893894428052881e-06, + "loss": 0.8858, + "step": 834 + }, + { + "epoch": 0.7912816868040748, + "grad_norm": 1.0574820455803635, + "learning_rate": 6.886786861508061e-06, + "loss": 0.8924, + "step": 835 + }, + { + "epoch": 0.7922293295427624, + "grad_norm": 1.2017601371365032, + "learning_rate": 6.879674846487314e-06, + "loss": 0.8959, + "step": 836 + }, + { + "epoch": 0.7922293295427624, + "eval_loss": 0.9229027628898621, + "eval_runtime": 61.6949, + "eval_samples_per_second": 44.218, + "eval_steps_per_second": 0.697, + "step": 836 + }, + { + "epoch": 0.7931769722814499, + "grad_norm": 1.0725982823665645, + "learning_rate": 6.872558399758633e-06, + "loss": 0.8485, + "step": 837 + }, + { + "epoch": 0.7941246150201374, + "grad_norm": 1.0560214413221218, + "learning_rate": 6.865437538100456e-06, + "loss": 0.8418, + "step": 838 + }, + { + "epoch": 0.795072257758825, + "grad_norm": 1.1398098092881728, + "learning_rate": 6.858312278301638e-06, + "loss": 0.8506, + "step": 839 + }, + { + "epoch": 0.7960199004975125, + "grad_norm": 1.1966491026312496, + "learning_rate": 6.8511826371613955e-06, + "loss": 0.9207, + "step": 840 + }, + { + "epoch": 0.7969675432362, + "grad_norm": 1.100229416684982, + "learning_rate": 6.8440486314892775e-06, + "loss": 0.8327, + "step": 841 + }, + { + "epoch": 0.7979151859748874, + "grad_norm": 1.0044010076247918, + "learning_rate": 6.836910278105124e-06, + "loss": 0.823, + "step": 842 + }, + { + "epoch": 0.798862828713575, + "grad_norm": 1.0289580305146189, + "learning_rate": 6.8297675938390275e-06, + "loss": 0.8566, + "step": 843 + }, + { + "epoch": 0.7998104714522625, + "grad_norm": 1.701693817598629, + "learning_rate": 6.822620595531286e-06, + "loss": 0.9532, + "step": 844 + }, + { + "epoch": 0.80075811419095, + "grad_norm": 1.1349425721498254, + "learning_rate": 6.815469300032374e-06, + "loss": 0.914, + "step": 845 + }, + { + "epoch": 0.8017057569296375, + "grad_norm": 1.139104764424171, + "learning_rate": 6.808313724202894e-06, + "loss": 0.9461, + "step": 846 + }, + { + "epoch": 0.802653399668325, + "grad_norm": 1.4336731001103296, + "learning_rate": 6.801153884913541e-06, + "loss": 0.8307, + "step": 847 + }, + { + "epoch": 0.8036010424070126, + "grad_norm": 1.0889586206123247, + "learning_rate": 6.793989799045067e-06, + "loss": 0.9337, + "step": 848 + }, + { + "epoch": 0.8045486851457001, + "grad_norm": 1.1232729735134666, + "learning_rate": 6.7868214834882265e-06, + "loss": 0.9321, + "step": 849 + }, + { + "epoch": 0.8054963278843876, + "grad_norm": 1.1373251435631513, + "learning_rate": 6.779648955143754e-06, + "loss": 0.8665, + "step": 850 + }, + { + "epoch": 0.806443970623075, + "grad_norm": 1.2552274257774585, + "learning_rate": 6.772472230922313e-06, + "loss": 0.8871, + "step": 851 + }, + { + "epoch": 0.8073916133617626, + "grad_norm": 1.0665372846687498, + "learning_rate": 6.765291327744463e-06, + "loss": 0.8943, + "step": 852 + }, + { + "epoch": 0.8083392561004501, + "grad_norm": 2.382365978345813, + "learning_rate": 6.758106262540611e-06, + "loss": 0.96, + "step": 853 + }, + { + "epoch": 0.8092868988391376, + "grad_norm": 1.2804659583348557, + "learning_rate": 6.750917052250981e-06, + "loss": 0.9211, + "step": 854 + }, + { + "epoch": 0.8102345415778252, + "grad_norm": 1.1113739193035552, + "learning_rate": 6.7437237138255686e-06, + "loss": 0.9385, + "step": 855 + }, + { + "epoch": 0.8111821843165127, + "grad_norm": 1.1014129494663154, + "learning_rate": 6.736526264224101e-06, + "loss": 0.8579, + "step": 856 + }, + { + "epoch": 0.8121298270552002, + "grad_norm": 0.9349690740722351, + "learning_rate": 6.7293247204160024e-06, + "loss": 0.8415, + "step": 857 + }, + { + "epoch": 0.8130774697938877, + "grad_norm": 1.0528528824598096, + "learning_rate": 6.722119099380345e-06, + "loss": 0.9034, + "step": 858 + }, + { + "epoch": 0.8130774697938877, + "eval_loss": 0.9221681952476501, + "eval_runtime": 66.5105, + "eval_samples_per_second": 41.016, + "eval_steps_per_second": 0.647, + "step": 858 + }, + { + "epoch": 0.8140251125325753, + "grad_norm": 1.1189012672893626, + "learning_rate": 6.714909418105816e-06, + "loss": 0.8928, + "step": 859 + }, + { + "epoch": 0.8149727552712628, + "grad_norm": 1.0491667848828412, + "learning_rate": 6.7076956935906756e-06, + "loss": 0.8846, + "step": 860 + }, + { + "epoch": 0.8159203980099502, + "grad_norm": 1.1800217977478704, + "learning_rate": 6.700477942842717e-06, + "loss": 0.8467, + "step": 861 + }, + { + "epoch": 0.8168680407486377, + "grad_norm": 1.3635685933149135, + "learning_rate": 6.693256182879224e-06, + "loss": 0.8875, + "step": 862 + }, + { + "epoch": 0.8178156834873253, + "grad_norm": 0.9738480603904128, + "learning_rate": 6.686030430726938e-06, + "loss": 0.8611, + "step": 863 + }, + { + "epoch": 0.8187633262260128, + "grad_norm": 1.0646192263592438, + "learning_rate": 6.678800703422004e-06, + "loss": 0.9442, + "step": 864 + }, + { + "epoch": 0.8197109689647003, + "grad_norm": 1.133138450105503, + "learning_rate": 6.671567018009948e-06, + "loss": 0.8936, + "step": 865 + }, + { + "epoch": 0.8206586117033878, + "grad_norm": 1.1779803833294133, + "learning_rate": 6.664329391545625e-06, + "loss": 0.8945, + "step": 866 + }, + { + "epoch": 0.8216062544420754, + "grad_norm": 1.14213552665385, + "learning_rate": 6.657087841093179e-06, + "loss": 0.8796, + "step": 867 + }, + { + "epoch": 0.8225538971807629, + "grad_norm": 1.0919659396363655, + "learning_rate": 6.649842383726011e-06, + "loss": 0.9093, + "step": 868 + }, + { + "epoch": 0.8235015399194504, + "grad_norm": 1.1540405069787951, + "learning_rate": 6.642593036526728e-06, + "loss": 0.8398, + "step": 869 + }, + { + "epoch": 0.8244491826581379, + "grad_norm": 0.9630828315476891, + "learning_rate": 6.635339816587109e-06, + "loss": 0.8616, + "step": 870 + }, + { + "epoch": 0.8253968253968254, + "grad_norm": 1.138452507177444, + "learning_rate": 6.628082741008068e-06, + "loss": 0.9328, + "step": 871 + }, + { + "epoch": 0.8263444681355129, + "grad_norm": 1.1595340664384783, + "learning_rate": 6.620821826899606e-06, + "loss": 0.8951, + "step": 872 + }, + { + "epoch": 0.8272921108742004, + "grad_norm": 1.0403640766877473, + "learning_rate": 6.613557091380771e-06, + "loss": 0.8403, + "step": 873 + }, + { + "epoch": 0.8282397536128879, + "grad_norm": 1.0491318496593032, + "learning_rate": 6.606288551579629e-06, + "loss": 0.8726, + "step": 874 + }, + { + "epoch": 0.8291873963515755, + "grad_norm": 1.0770673557649837, + "learning_rate": 6.599016224633209e-06, + "loss": 0.8777, + "step": 875 + }, + { + "epoch": 0.830135039090263, + "grad_norm": 1.2586661831963606, + "learning_rate": 6.59174012768747e-06, + "loss": 0.9406, + "step": 876 + }, + { + "epoch": 0.8310826818289505, + "grad_norm": 0.9975414535990195, + "learning_rate": 6.584460277897262e-06, + "loss": 0.9178, + "step": 877 + }, + { + "epoch": 0.832030324567638, + "grad_norm": 1.1950907387823217, + "learning_rate": 6.5771766924262795e-06, + "loss": 0.8673, + "step": 878 + }, + { + "epoch": 0.8329779673063256, + "grad_norm": 1.026575911873265, + "learning_rate": 6.569889388447025e-06, + "loss": 0.8515, + "step": 879 + }, + { + "epoch": 0.833925610045013, + "grad_norm": 1.1555312684415828, + "learning_rate": 6.562598383140773e-06, + "loss": 0.9227, + "step": 880 + }, + { + "epoch": 0.833925610045013, + "eval_loss": 0.9205263257026672, + "eval_runtime": 65.5841, + "eval_samples_per_second": 41.595, + "eval_steps_per_second": 0.656, + "step": 880 + }, + { + "epoch": 0.8348732527837005, + "grad_norm": 1.0410870491482918, + "learning_rate": 6.555303693697517e-06, + "loss": 0.8879, + "step": 881 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 1.2046733679595938, + "learning_rate": 6.548005337315943e-06, + "loss": 0.9327, + "step": 882 + }, + { + "epoch": 0.8367685382610756, + "grad_norm": 1.4002394037690635, + "learning_rate": 6.540703331203382e-06, + "loss": 0.8616, + "step": 883 + }, + { + "epoch": 0.8377161809997631, + "grad_norm": 1.0820553537637987, + "learning_rate": 6.533397692575766e-06, + "loss": 0.8599, + "step": 884 + }, + { + "epoch": 0.8386638237384506, + "grad_norm": 1.1367287841963307, + "learning_rate": 6.526088438657594e-06, + "loss": 0.9047, + "step": 885 + }, + { + "epoch": 0.8396114664771381, + "grad_norm": 1.1941704415773284, + "learning_rate": 6.518775586681887e-06, + "loss": 0.8552, + "step": 886 + }, + { + "epoch": 0.8405591092158257, + "grad_norm": 1.059195584331362, + "learning_rate": 6.511459153890152e-06, + "loss": 0.9146, + "step": 887 + }, + { + "epoch": 0.8415067519545132, + "grad_norm": 1.105675547858621, + "learning_rate": 6.504139157532338e-06, + "loss": 0.8386, + "step": 888 + }, + { + "epoch": 0.8424543946932007, + "grad_norm": 1.176419395830192, + "learning_rate": 6.496815614866792e-06, + "loss": 0.851, + "step": 889 + }, + { + "epoch": 0.8434020374318881, + "grad_norm": 1.4493743507868786, + "learning_rate": 6.489488543160225e-06, + "loss": 0.9137, + "step": 890 + }, + { + "epoch": 0.8443496801705757, + "grad_norm": 0.9826791105867756, + "learning_rate": 6.4821579596876705e-06, + "loss": 0.9117, + "step": 891 + }, + { + "epoch": 0.8452973229092632, + "grad_norm": 1.0251650573055395, + "learning_rate": 6.4748238817324395e-06, + "loss": 0.9214, + "step": 892 + }, + { + "epoch": 0.8462449656479507, + "grad_norm": 1.025330926521032, + "learning_rate": 6.46748632658608e-06, + "loss": 0.8434, + "step": 893 + }, + { + "epoch": 0.8471926083866382, + "grad_norm": 1.0829549887894614, + "learning_rate": 6.460145311548341e-06, + "loss": 0.9142, + "step": 894 + }, + { + "epoch": 0.8481402511253258, + "grad_norm": 1.0949066487399224, + "learning_rate": 6.452800853927128e-06, + "loss": 0.8257, + "step": 895 + }, + { + "epoch": 0.8490878938640133, + "grad_norm": 1.153959748533555, + "learning_rate": 6.445452971038464e-06, + "loss": 0.9253, + "step": 896 + }, + { + "epoch": 0.8500355366027008, + "grad_norm": 1.193405218423659, + "learning_rate": 6.438101680206444e-06, + "loss": 0.9291, + "step": 897 + }, + { + "epoch": 0.8509831793413883, + "grad_norm": 1.2544535828853964, + "learning_rate": 6.430746998763204e-06, + "loss": 0.9173, + "step": 898 + }, + { + "epoch": 0.8519308220800759, + "grad_norm": 0.9715729053189112, + "learning_rate": 6.42338894404887e-06, + "loss": 0.8385, + "step": 899 + }, + { + "epoch": 0.8528784648187633, + "grad_norm": 1.0084105628200437, + "learning_rate": 6.41602753341152e-06, + "loss": 0.886, + "step": 900 + }, + { + "epoch": 0.8538261075574508, + "grad_norm": 1.0846787519964416, + "learning_rate": 6.408662784207149e-06, + "loss": 0.8862, + "step": 901 + }, + { + "epoch": 0.8547737502961383, + "grad_norm": 1.553862723805949, + "learning_rate": 6.4012947137996175e-06, + "loss": 0.9481, + "step": 902 + }, + { + "epoch": 0.8547737502961383, + "eval_loss": 0.9192849397659302, + "eval_runtime": 61.9949, + "eval_samples_per_second": 44.004, + "eval_steps_per_second": 0.694, + "step": 902 + }, + { + "epoch": 0.8557213930348259, + "grad_norm": 1.190837748503028, + "learning_rate": 6.393923339560621e-06, + "loss": 0.9056, + "step": 903 + }, + { + "epoch": 0.8566690357735134, + "grad_norm": 1.0240140450014306, + "learning_rate": 6.386548678869644e-06, + "loss": 0.8862, + "step": 904 + }, + { + "epoch": 0.8576166785122009, + "grad_norm": 1.1684091911040653, + "learning_rate": 6.379170749113918e-06, + "loss": 0.9077, + "step": 905 + }, + { + "epoch": 0.8585643212508884, + "grad_norm": 1.2458608937767364, + "learning_rate": 6.37178956768838e-06, + "loss": 0.8883, + "step": 906 + }, + { + "epoch": 0.859511963989576, + "grad_norm": 1.0899866220356935, + "learning_rate": 6.3644051519956366e-06, + "loss": 0.8953, + "step": 907 + }, + { + "epoch": 0.8604596067282635, + "grad_norm": 1.0159031729823804, + "learning_rate": 6.3570175194459205e-06, + "loss": 0.8946, + "step": 908 + }, + { + "epoch": 0.8614072494669509, + "grad_norm": 1.0422661890872638, + "learning_rate": 6.349626687457045e-06, + "loss": 0.9217, + "step": 909 + }, + { + "epoch": 0.8623548922056384, + "grad_norm": 1.0650534348419232, + "learning_rate": 6.342232673454371e-06, + "loss": 0.8993, + "step": 910 + }, + { + "epoch": 0.863302534944326, + "grad_norm": 1.162095463386776, + "learning_rate": 6.334835494870759e-06, + "loss": 0.9435, + "step": 911 + }, + { + "epoch": 0.8642501776830135, + "grad_norm": 0.9740991606947988, + "learning_rate": 6.3274351691465305e-06, + "loss": 0.8874, + "step": 912 + }, + { + "epoch": 0.865197820421701, + "grad_norm": 1.0752595651605357, + "learning_rate": 6.320031713729429e-06, + "loss": 0.8733, + "step": 913 + }, + { + "epoch": 0.8661454631603885, + "grad_norm": 1.003012232982504, + "learning_rate": 6.312625146074574e-06, + "loss": 0.8997, + "step": 914 + }, + { + "epoch": 0.8670931058990761, + "grad_norm": 1.2050664252308885, + "learning_rate": 6.305215483644427e-06, + "loss": 0.9121, + "step": 915 + }, + { + "epoch": 0.8680407486377636, + "grad_norm": 1.0509143532390477, + "learning_rate": 6.2978027439087405e-06, + "loss": 0.9215, + "step": 916 + }, + { + "epoch": 0.8689883913764511, + "grad_norm": 1.1000294092216198, + "learning_rate": 6.290386944344527e-06, + "loss": 0.8209, + "step": 917 + }, + { + "epoch": 0.8699360341151386, + "grad_norm": 1.4381680474891718, + "learning_rate": 6.28296810243601e-06, + "loss": 0.9485, + "step": 918 + }, + { + "epoch": 0.8708836768538261, + "grad_norm": 1.0986281816759764, + "learning_rate": 6.2755462356745885e-06, + "loss": 0.8677, + "step": 919 + }, + { + "epoch": 0.8718313195925136, + "grad_norm": 1.1944430191428006, + "learning_rate": 6.268121361558792e-06, + "loss": 0.932, + "step": 920 + }, + { + "epoch": 0.8727789623312011, + "grad_norm": 30.495921119331683, + "learning_rate": 6.2606934975942415e-06, + "loss": 0.9977, + "step": 921 + }, + { + "epoch": 0.8737266050698886, + "grad_norm": 1.0629329656422315, + "learning_rate": 6.2532626612936035e-06, + "loss": 0.9012, + "step": 922 + }, + { + "epoch": 0.8746742478085762, + "grad_norm": 1.212840019137115, + "learning_rate": 6.245828870176557e-06, + "loss": 0.8842, + "step": 923 + }, + { + "epoch": 0.8756218905472637, + "grad_norm": 1.1166380211338318, + "learning_rate": 6.238392141769743e-06, + "loss": 0.8853, + "step": 924 + }, + { + "epoch": 0.8756218905472637, + "eval_loss": 0.9188343286514282, + "eval_runtime": 71.0766, + "eval_samples_per_second": 38.381, + "eval_steps_per_second": 0.605, + "step": 924 + }, + { + "epoch": 0.8765695332859512, + "grad_norm": 1.0374942288524216, + "learning_rate": 6.2309524936067344e-06, + "loss": 0.8285, + "step": 925 + }, + { + "epoch": 0.8775171760246387, + "grad_norm": 1.294054885875714, + "learning_rate": 6.22350994322798e-06, + "loss": 0.9001, + "step": 926 + }, + { + "epoch": 0.8784648187633263, + "grad_norm": 1.0637784431739705, + "learning_rate": 6.216064508180778e-06, + "loss": 0.8865, + "step": 927 + }, + { + "epoch": 0.8794124615020137, + "grad_norm": 1.0389026144557505, + "learning_rate": 6.208616206019225e-06, + "loss": 0.8368, + "step": 928 + }, + { + "epoch": 0.8803601042407012, + "grad_norm": 1.008199300013402, + "learning_rate": 6.2011650543041734e-06, + "loss": 0.8638, + "step": 929 + }, + { + "epoch": 0.8813077469793887, + "grad_norm": 1.0169656783697818, + "learning_rate": 6.193711070603202e-06, + "loss": 0.8854, + "step": 930 + }, + { + "epoch": 0.8822553897180763, + "grad_norm": 1.3473009889737735, + "learning_rate": 6.1862542724905605e-06, + "loss": 0.8851, + "step": 931 + }, + { + "epoch": 0.8832030324567638, + "grad_norm": 0.9515757094852794, + "learning_rate": 6.178794677547138e-06, + "loss": 0.8049, + "step": 932 + }, + { + "epoch": 0.8841506751954513, + "grad_norm": 1.0333010505925766, + "learning_rate": 6.171332303360411e-06, + "loss": 0.8989, + "step": 933 + }, + { + "epoch": 0.8850983179341388, + "grad_norm": 1.0350926221697927, + "learning_rate": 6.163867167524419e-06, + "loss": 0.8401, + "step": 934 + }, + { + "epoch": 0.8860459606728264, + "grad_norm": 1.0721173411729321, + "learning_rate": 6.156399287639703e-06, + "loss": 0.9309, + "step": 935 + }, + { + "epoch": 0.8869936034115139, + "grad_norm": 1.3120269262792263, + "learning_rate": 6.14892868131328e-06, + "loss": 0.8991, + "step": 936 + }, + { + "epoch": 0.8879412461502014, + "grad_norm": 1.1865006482607656, + "learning_rate": 6.1414553661585905e-06, + "loss": 0.8683, + "step": 937 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.2132247708632098, + "learning_rate": 6.1339793597954675e-06, + "loss": 0.8569, + "step": 938 + }, + { + "epoch": 0.8898365316275764, + "grad_norm": 1.356211439062163, + "learning_rate": 6.126500679850082e-06, + "loss": 0.8543, + "step": 939 + }, + { + "epoch": 0.8907841743662639, + "grad_norm": 1.0301726611069624, + "learning_rate": 6.119019343954914e-06, + "loss": 0.8244, + "step": 940 + }, + { + "epoch": 0.8917318171049514, + "grad_norm": 1.2284199784911718, + "learning_rate": 6.111535369748702e-06, + "loss": 0.9085, + "step": 941 + }, + { + "epoch": 0.892679459843639, + "grad_norm": 1.0917917658926368, + "learning_rate": 6.104048774876407e-06, + "loss": 0.9026, + "step": 942 + }, + { + "epoch": 0.8936271025823265, + "grad_norm": 1.0918457932637566, + "learning_rate": 6.096559576989166e-06, + "loss": 0.8416, + "step": 943 + }, + { + "epoch": 0.894574745321014, + "grad_norm": 1.090778224144194, + "learning_rate": 6.089067793744258e-06, + "loss": 0.9331, + "step": 944 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 1.1895282418746094, + "learning_rate": 6.0815734428050535e-06, + "loss": 0.9023, + "step": 945 + }, + { + "epoch": 0.896470030798389, + "grad_norm": 1.1429336151352636, + "learning_rate": 6.074076541840978e-06, + "loss": 0.8708, + "step": 946 + }, + { + "epoch": 0.896470030798389, + "eval_loss": 0.9174872636795044, + "eval_runtime": 64.8721, + "eval_samples_per_second": 42.052, + "eval_steps_per_second": 0.663, + "step": 946 + }, + { + "epoch": 0.8974176735370766, + "grad_norm": 1.0370632105036575, + "learning_rate": 6.066577108527469e-06, + "loss": 0.8657, + "step": 947 + }, + { + "epoch": 0.898365316275764, + "grad_norm": 1.0186442974981031, + "learning_rate": 6.059075160545933e-06, + "loss": 0.8767, + "step": 948 + }, + { + "epoch": 0.8993129590144515, + "grad_norm": 1.1803999185158343, + "learning_rate": 6.05157071558371e-06, + "loss": 0.9213, + "step": 949 + }, + { + "epoch": 0.900260601753139, + "grad_norm": 1.2808021169879162, + "learning_rate": 6.044063791334023e-06, + "loss": 0.7969, + "step": 950 + }, + { + "epoch": 0.9012082444918266, + "grad_norm": 1.100561129817383, + "learning_rate": 6.03655440549594e-06, + "loss": 0.9169, + "step": 951 + }, + { + "epoch": 0.9021558872305141, + "grad_norm": 1.0079024114327502, + "learning_rate": 6.029042575774334e-06, + "loss": 0.8063, + "step": 952 + }, + { + "epoch": 0.9031035299692016, + "grad_norm": 1.188940267604656, + "learning_rate": 6.021528319879843e-06, + "loss": 0.8283, + "step": 953 + }, + { + "epoch": 0.9040511727078892, + "grad_norm": 1.4178205131395747, + "learning_rate": 6.01401165552882e-06, + "loss": 0.9282, + "step": 954 + }, + { + "epoch": 0.9049988154465767, + "grad_norm": 1.0472375312060485, + "learning_rate": 6.006492600443301e-06, + "loss": 0.8396, + "step": 955 + }, + { + "epoch": 0.9059464581852642, + "grad_norm": 1.1298052814130883, + "learning_rate": 5.998971172350953e-06, + "loss": 0.8898, + "step": 956 + }, + { + "epoch": 0.9068941009239516, + "grad_norm": 1.1111085147456654, + "learning_rate": 5.991447388985045e-06, + "loss": 0.8682, + "step": 957 + }, + { + "epoch": 0.9078417436626391, + "grad_norm": 1.4266031924717084, + "learning_rate": 5.9839212680843925e-06, + "loss": 0.8415, + "step": 958 + }, + { + "epoch": 0.9087893864013267, + "grad_norm": 1.3571311779441146, + "learning_rate": 5.976392827393326e-06, + "loss": 0.9395, + "step": 959 + }, + { + "epoch": 0.9097370291400142, + "grad_norm": 1.0252667256604506, + "learning_rate": 5.968862084661643e-06, + "loss": 0.8144, + "step": 960 + }, + { + "epoch": 0.9106846718787017, + "grad_norm": 1.0648709592997376, + "learning_rate": 5.961329057644571e-06, + "loss": 0.8239, + "step": 961 + }, + { + "epoch": 0.9116323146173892, + "grad_norm": 1.1431114083793006, + "learning_rate": 5.9537937641027225e-06, + "loss": 0.8986, + "step": 962 + }, + { + "epoch": 0.9125799573560768, + "grad_norm": 1.051205202397749, + "learning_rate": 5.946256221802052e-06, + "loss": 0.8686, + "step": 963 + }, + { + "epoch": 0.9135276000947643, + "grad_norm": 1.1593903949141868, + "learning_rate": 5.938716448513819e-06, + "loss": 0.8353, + "step": 964 + }, + { + "epoch": 0.9144752428334518, + "grad_norm": 1.1838771825939993, + "learning_rate": 5.931174462014538e-06, + "loss": 0.9348, + "step": 965 + }, + { + "epoch": 0.9154228855721394, + "grad_norm": 1.0369730284903498, + "learning_rate": 5.923630280085948e-06, + "loss": 0.8309, + "step": 966 + }, + { + "epoch": 0.9163705283108268, + "grad_norm": 1.1424198652925397, + "learning_rate": 5.916083920514959e-06, + "loss": 0.8653, + "step": 967 + }, + { + "epoch": 0.9173181710495143, + "grad_norm": 1.345398860745349, + "learning_rate": 5.908535401093618e-06, + "loss": 0.871, + "step": 968 + }, + { + "epoch": 0.9173181710495143, + "eval_loss": 0.9164453744888306, + "eval_runtime": 70.7681, + "eval_samples_per_second": 38.548, + "eval_steps_per_second": 0.608, + "step": 968 + }, + { + "epoch": 0.9182658137882018, + "grad_norm": 1.2793924011477549, + "learning_rate": 5.900984739619062e-06, + "loss": 0.9352, + "step": 969 + }, + { + "epoch": 0.9192134565268893, + "grad_norm": 1.069579103341338, + "learning_rate": 5.893431953893483e-06, + "loss": 0.8886, + "step": 970 + }, + { + "epoch": 0.9201610992655769, + "grad_norm": 0.9577007372371958, + "learning_rate": 5.885877061724075e-06, + "loss": 0.9196, + "step": 971 + }, + { + "epoch": 0.9211087420042644, + "grad_norm": 1.1384165277972425, + "learning_rate": 5.878320080923001e-06, + "loss": 0.8944, + "step": 972 + }, + { + "epoch": 0.9220563847429519, + "grad_norm": 1.0993789420249829, + "learning_rate": 5.8707610293073524e-06, + "loss": 0.8718, + "step": 973 + }, + { + "epoch": 0.9230040274816395, + "grad_norm": 0.990859843943125, + "learning_rate": 5.8631999246990954e-06, + "loss": 0.8815, + "step": 974 + }, + { + "epoch": 0.923951670220327, + "grad_norm": 0.9596254067235986, + "learning_rate": 5.855636784925044e-06, + "loss": 0.8873, + "step": 975 + }, + { + "epoch": 0.9248993129590144, + "grad_norm": 1.1971458477938866, + "learning_rate": 5.848071627816804e-06, + "loss": 0.9301, + "step": 976 + }, + { + "epoch": 0.9258469556977019, + "grad_norm": 1.293695132429081, + "learning_rate": 5.840504471210742e-06, + "loss": 0.8826, + "step": 977 + }, + { + "epoch": 0.9267945984363894, + "grad_norm": 1.0637632989021661, + "learning_rate": 5.832935332947937e-06, + "loss": 0.8744, + "step": 978 + }, + { + "epoch": 0.927742241175077, + "grad_norm": 1.0066024634519515, + "learning_rate": 5.82536423087414e-06, + "loss": 0.8836, + "step": 979 + }, + { + "epoch": 0.9286898839137645, + "grad_norm": 1.114286515649878, + "learning_rate": 5.817791182839734e-06, + "loss": 0.8973, + "step": 980 + }, + { + "epoch": 0.929637526652452, + "grad_norm": 1.2393549521144707, + "learning_rate": 5.810216206699686e-06, + "loss": 0.9605, + "step": 981 + }, + { + "epoch": 0.9305851693911396, + "grad_norm": 1.0160087719698536, + "learning_rate": 5.8026393203135145e-06, + "loss": 0.9383, + "step": 982 + }, + { + "epoch": 0.9315328121298271, + "grad_norm": 1.1104693027389803, + "learning_rate": 5.7950605415452365e-06, + "loss": 0.8697, + "step": 983 + }, + { + "epoch": 0.9324804548685146, + "grad_norm": 1.0139561721996317, + "learning_rate": 5.787479888263333e-06, + "loss": 0.8634, + "step": 984 + }, + { + "epoch": 0.9334280976072021, + "grad_norm": 0.9149964477354714, + "learning_rate": 5.779897378340705e-06, + "loss": 0.8692, + "step": 985 + }, + { + "epoch": 0.9343757403458895, + "grad_norm": 1.081719984477777, + "learning_rate": 5.772313029654631e-06, + "loss": 0.8752, + "step": 986 + }, + { + "epoch": 0.9353233830845771, + "grad_norm": 0.9686190602510739, + "learning_rate": 5.76472686008672e-06, + "loss": 0.9374, + "step": 987 + }, + { + "epoch": 0.9362710258232646, + "grad_norm": 1.1186604360938692, + "learning_rate": 5.757138887522884e-06, + "loss": 0.9454, + "step": 988 + }, + { + "epoch": 0.9372186685619521, + "grad_norm": 1.1474863690068453, + "learning_rate": 5.749549129853277e-06, + "loss": 0.9526, + "step": 989 + }, + { + "epoch": 0.9381663113006397, + "grad_norm": 1.2065827627859584, + "learning_rate": 5.741957604972264e-06, + "loss": 0.9015, + "step": 990 + }, + { + "epoch": 0.9381663113006397, + "eval_loss": 0.9157132506370544, + "eval_runtime": 63.106, + "eval_samples_per_second": 43.229, + "eval_steps_per_second": 0.681, + "step": 990 + }, + { + "epoch": 0.9391139540393272, + "grad_norm": 1.0667864777077534, + "learning_rate": 5.734364330778381e-06, + "loss": 0.9115, + "step": 991 + }, + { + "epoch": 0.9400615967780147, + "grad_norm": 1.0077841132377081, + "learning_rate": 5.726769325174279e-06, + "loss": 0.8647, + "step": 992 + }, + { + "epoch": 0.9410092395167022, + "grad_norm": 1.0542855609546453, + "learning_rate": 5.719172606066703e-06, + "loss": 0.871, + "step": 993 + }, + { + "epoch": 0.9419568822553898, + "grad_norm": 1.0723234157404569, + "learning_rate": 5.711574191366427e-06, + "loss": 0.8652, + "step": 994 + }, + { + "epoch": 0.9429045249940773, + "grad_norm": 1.0614000028665043, + "learning_rate": 5.703974098988229e-06, + "loss": 0.8693, + "step": 995 + }, + { + "epoch": 0.9438521677327647, + "grad_norm": 1.184972220370901, + "learning_rate": 5.696372346850842e-06, + "loss": 0.8748, + "step": 996 + }, + { + "epoch": 0.9447998104714522, + "grad_norm": 3.0693319889655357, + "learning_rate": 5.68876895287691e-06, + "loss": 0.9179, + "step": 997 + }, + { + "epoch": 0.9457474532101398, + "grad_norm": 1.364979369089619, + "learning_rate": 5.68116393499295e-06, + "loss": 0.9287, + "step": 998 + }, + { + "epoch": 0.9466950959488273, + "grad_norm": 1.138619446576553, + "learning_rate": 5.673557311129306e-06, + "loss": 0.9642, + "step": 999 + }, + { + "epoch": 0.9476427386875148, + "grad_norm": 1.1855283956687006, + "learning_rate": 5.66594909922011e-06, + "loss": 0.8375, + "step": 1000 + }, + { + "epoch": 0.9485903814262023, + "grad_norm": 1.1632815768908809, + "learning_rate": 5.658339317203235e-06, + "loss": 0.8411, + "step": 1001 + }, + { + "epoch": 0.9495380241648899, + "grad_norm": 1.2165292478376695, + "learning_rate": 5.650727983020262e-06, + "loss": 0.802, + "step": 1002 + }, + { + "epoch": 0.9504856669035774, + "grad_norm": 1.1272166319806751, + "learning_rate": 5.6431151146164255e-06, + "loss": 0.8764, + "step": 1003 + }, + { + "epoch": 0.9514333096422649, + "grad_norm": 1.179748089906736, + "learning_rate": 5.635500729940578e-06, + "loss": 0.8728, + "step": 1004 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 1.1413912763569145, + "learning_rate": 5.627884846945151e-06, + "loss": 0.8815, + "step": 1005 + }, + { + "epoch": 0.9533285951196399, + "grad_norm": 1.1198726067886378, + "learning_rate": 5.6202674835861045e-06, + "loss": 0.8549, + "step": 1006 + }, + { + "epoch": 0.9542762378583274, + "grad_norm": 1.0106130328632965, + "learning_rate": 5.6126486578228926e-06, + "loss": 0.8785, + "step": 1007 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 1.092869267593721, + "learning_rate": 5.605028387618412e-06, + "loss": 0.9306, + "step": 1008 + }, + { + "epoch": 0.9561715233357024, + "grad_norm": 1.2094913165105556, + "learning_rate": 5.597406690938969e-06, + "loss": 0.8963, + "step": 1009 + }, + { + "epoch": 0.95711916607439, + "grad_norm": 1.1750720019167373, + "learning_rate": 5.5897835857542315e-06, + "loss": 0.8666, + "step": 1010 + }, + { + "epoch": 0.9580668088130775, + "grad_norm": 1.3245562535134334, + "learning_rate": 5.582159090037189e-06, + "loss": 0.8291, + "step": 1011 + }, + { + "epoch": 0.959014451551765, + "grad_norm": 1.2042706760957538, + "learning_rate": 5.574533221764109e-06, + "loss": 0.8684, + "step": 1012 + }, + { + "epoch": 0.959014451551765, + "eval_loss": 0.9151268601417542, + "eval_runtime": 62.5892, + "eval_samples_per_second": 43.586, + "eval_steps_per_second": 0.687, + "step": 1012 + }, + { + "epoch": 0.9599620942904525, + "grad_norm": 1.3006236475762307, + "learning_rate": 5.566905998914496e-06, + "loss": 0.8668, + "step": 1013 + }, + { + "epoch": 0.9609097370291401, + "grad_norm": 1.0257069903720804, + "learning_rate": 5.559277439471047e-06, + "loss": 0.8478, + "step": 1014 + }, + { + "epoch": 0.9618573797678275, + "grad_norm": 1.511736931943181, + "learning_rate": 5.551647561419611e-06, + "loss": 0.8859, + "step": 1015 + }, + { + "epoch": 0.962805022506515, + "grad_norm": 1.0720441919793242, + "learning_rate": 5.544016382749146e-06, + "loss": 0.8665, + "step": 1016 + }, + { + "epoch": 0.9637526652452025, + "grad_norm": 1.5150355984208133, + "learning_rate": 5.536383921451673e-06, + "loss": 0.8628, + "step": 1017 + }, + { + "epoch": 0.9647003079838901, + "grad_norm": 1.2280290508409115, + "learning_rate": 5.528750195522244e-06, + "loss": 0.8873, + "step": 1018 + }, + { + "epoch": 0.9656479507225776, + "grad_norm": 1.0449090226929965, + "learning_rate": 5.521115222958889e-06, + "loss": 0.9395, + "step": 1019 + }, + { + "epoch": 0.9665955934612651, + "grad_norm": 1.286647376114692, + "learning_rate": 5.513479021762573e-06, + "loss": 0.8706, + "step": 1020 + }, + { + "epoch": 0.9675432361999526, + "grad_norm": 1.2238701214843728, + "learning_rate": 5.505841609937162e-06, + "loss": 0.85, + "step": 1021 + }, + { + "epoch": 0.9684908789386402, + "grad_norm": 1.08714516330967, + "learning_rate": 5.498203005489378e-06, + "loss": 0.8235, + "step": 1022 + }, + { + "epoch": 0.9694385216773277, + "grad_norm": 0.9837819321269746, + "learning_rate": 5.490563226428756e-06, + "loss": 0.824, + "step": 1023 + }, + { + "epoch": 0.9703861644160152, + "grad_norm": 1.0960448192217682, + "learning_rate": 5.4829222907675895e-06, + "loss": 0.8735, + "step": 1024 + }, + { + "epoch": 0.9713338071547026, + "grad_norm": 1.0733602660245445, + "learning_rate": 5.475280216520913e-06, + "loss": 0.846, + "step": 1025 + }, + { + "epoch": 0.9722814498933902, + "grad_norm": 1.0672536201083378, + "learning_rate": 5.467637021706438e-06, + "loss": 0.8457, + "step": 1026 + }, + { + "epoch": 0.9732290926320777, + "grad_norm": 1.2208010933061697, + "learning_rate": 5.459992724344516e-06, + "loss": 0.8684, + "step": 1027 + }, + { + "epoch": 0.9741767353707652, + "grad_norm": 1.0289674507090458, + "learning_rate": 5.4523473424581045e-06, + "loss": 0.8768, + "step": 1028 + }, + { + "epoch": 0.9751243781094527, + "grad_norm": 1.1688616748496152, + "learning_rate": 5.444700894072712e-06, + "loss": 0.8708, + "step": 1029 + }, + { + "epoch": 0.9760720208481403, + "grad_norm": 1.0710612536808917, + "learning_rate": 5.437053397216364e-06, + "loss": 0.9093, + "step": 1030 + }, + { + "epoch": 0.9770196635868278, + "grad_norm": 1.210378896652165, + "learning_rate": 5.429404869919559e-06, + "loss": 0.788, + "step": 1031 + }, + { + "epoch": 0.9779673063255153, + "grad_norm": 1.0438767939858762, + "learning_rate": 5.421755330215223e-06, + "loss": 0.95, + "step": 1032 + }, + { + "epoch": 0.9789149490642028, + "grad_norm": 1.071781685166561, + "learning_rate": 5.4141047961386724e-06, + "loss": 0.8668, + "step": 1033 + }, + { + "epoch": 0.9798625918028903, + "grad_norm": 1.0966538771750634, + "learning_rate": 5.4064532857275645e-06, + "loss": 0.9063, + "step": 1034 + }, + { + "epoch": 0.9798625918028903, + "eval_loss": 0.9142357707023621, + "eval_runtime": 67.863, + "eval_samples_per_second": 40.199, + "eval_steps_per_second": 0.634, + "step": 1034 + }, + { + "epoch": 0.9808102345415778, + "grad_norm": 1.0629520496350149, + "learning_rate": 5.398800817021857e-06, + "loss": 0.9179, + "step": 1035 + }, + { + "epoch": 0.9817578772802653, + "grad_norm": 1.315356991713657, + "learning_rate": 5.3911474080637705e-06, + "loss": 0.862, + "step": 1036 + }, + { + "epoch": 0.9827055200189528, + "grad_norm": 1.5792297604082457, + "learning_rate": 5.383493076897742e-06, + "loss": 0.8413, + "step": 1037 + }, + { + "epoch": 0.9836531627576404, + "grad_norm": 1.4810853298698745, + "learning_rate": 5.3758378415703825e-06, + "loss": 0.845, + "step": 1038 + }, + { + "epoch": 0.9846008054963279, + "grad_norm": 1.4008685021146463, + "learning_rate": 5.368181720130434e-06, + "loss": 0.8588, + "step": 1039 + }, + { + "epoch": 0.9855484482350154, + "grad_norm": 1.0609355637628701, + "learning_rate": 5.3605247306287275e-06, + "loss": 0.8704, + "step": 1040 + }, + { + "epoch": 0.9864960909737029, + "grad_norm": 1.147604350423996, + "learning_rate": 5.352866891118143e-06, + "loss": 0.8918, + "step": 1041 + }, + { + "epoch": 0.9874437337123905, + "grad_norm": 1.0985464941272645, + "learning_rate": 5.345208219653562e-06, + "loss": 0.9016, + "step": 1042 + }, + { + "epoch": 0.988391376451078, + "grad_norm": 1.05526373628293, + "learning_rate": 5.337548734291827e-06, + "loss": 0.8496, + "step": 1043 + }, + { + "epoch": 0.9893390191897654, + "grad_norm": 0.9639147858281681, + "learning_rate": 5.329888453091701e-06, + "loss": 0.8429, + "step": 1044 + }, + { + "epoch": 0.9902866619284529, + "grad_norm": 1.0367677286445547, + "learning_rate": 5.322227394113826e-06, + "loss": 0.9336, + "step": 1045 + }, + { + "epoch": 0.9912343046671405, + "grad_norm": 1.2202980010199613, + "learning_rate": 5.314565575420671e-06, + "loss": 0.8396, + "step": 1046 + }, + { + "epoch": 0.992181947405828, + "grad_norm": 1.043650160849041, + "learning_rate": 5.306903015076502e-06, + "loss": 0.9273, + "step": 1047 + }, + { + "epoch": 0.9931295901445155, + "grad_norm": 1.2196944237357306, + "learning_rate": 5.299239731147332e-06, + "loss": 0.882, + "step": 1048 + }, + { + "epoch": 0.994077232883203, + "grad_norm": 1.160879798524974, + "learning_rate": 5.291575741700878e-06, + "loss": 0.8874, + "step": 1049 + }, + { + "epoch": 0.9950248756218906, + "grad_norm": 1.1650239134630531, + "learning_rate": 5.283911064806522e-06, + "loss": 0.8936, + "step": 1050 + }, + { + "epoch": 0.9959725183605781, + "grad_norm": 1.0978030906121916, + "learning_rate": 5.2762457185352685e-06, + "loss": 0.8426, + "step": 1051 + }, + { + "epoch": 0.9969201610992656, + "grad_norm": 1.1662853004003075, + "learning_rate": 5.268579720959698e-06, + "loss": 0.8447, + "step": 1052 + }, + { + "epoch": 0.997867803837953, + "grad_norm": 1.0212044797920623, + "learning_rate": 5.260913090153928e-06, + "loss": 0.8577, + "step": 1053 + }, + { + "epoch": 0.9988154465766406, + "grad_norm": 0.9166563097634649, + "learning_rate": 5.253245844193564e-06, + "loss": 0.8203, + "step": 1054 + }, + { + "epoch": 0.9997630893153281, + "grad_norm": 1.2814929737482674, + "learning_rate": 5.24557800115567e-06, + "loss": 0.878, + "step": 1055 + }, + { + "epoch": 1.0007107320540156, + "grad_norm": 1.029394422791808, + "learning_rate": 5.237909579118713e-06, + "loss": 0.7881, + "step": 1056 + }, + { + "epoch": 1.0007107320540156, + "eval_loss": 0.9143710732460022, + "eval_runtime": 61.9673, + "eval_samples_per_second": 44.023, + "eval_steps_per_second": 0.694, + "step": 1056 + }, + { + "epoch": 1.0016583747927033, + "grad_norm": 1.0123693619172263, + "learning_rate": 5.2302405961625225e-06, + "loss": 0.7238, + "step": 1057 + }, + { + "epoch": 1.0026060175313907, + "grad_norm": 0.9727230093690866, + "learning_rate": 5.222571070368258e-06, + "loss": 0.7209, + "step": 1058 + }, + { + "epoch": 1.003553660270078, + "grad_norm": 0.972226576136118, + "learning_rate": 5.214901019818353e-06, + "loss": 0.7445, + "step": 1059 + }, + { + "epoch": 1.0045013030087657, + "grad_norm": 1.0690844562597608, + "learning_rate": 5.2072304625964785e-06, + "loss": 0.721, + "step": 1060 + }, + { + "epoch": 1.0054489457474531, + "grad_norm": 0.892297063646139, + "learning_rate": 5.199559416787503e-06, + "loss": 0.7467, + "step": 1061 + }, + { + "epoch": 1.0063965884861408, + "grad_norm": 0.8630887724649647, + "learning_rate": 5.191887900477444e-06, + "loss": 0.7242, + "step": 1062 + }, + { + "epoch": 1.0073442312248282, + "grad_norm": 0.8982089162859859, + "learning_rate": 5.1842159317534304e-06, + "loss": 0.6937, + "step": 1063 + }, + { + "epoch": 1.0082918739635158, + "grad_norm": 0.9672944077294386, + "learning_rate": 5.176543528703657e-06, + "loss": 0.7022, + "step": 1064 + }, + { + "epoch": 1.0092395167022032, + "grad_norm": 0.8839557121691414, + "learning_rate": 5.168870709417342e-06, + "loss": 0.7057, + "step": 1065 + }, + { + "epoch": 1.0101871594408909, + "grad_norm": 0.9782692878540812, + "learning_rate": 5.161197491984684e-06, + "loss": 0.7163, + "step": 1066 + }, + { + "epoch": 1.0111348021795783, + "grad_norm": 0.886106192089488, + "learning_rate": 5.153523894496826e-06, + "loss": 0.7415, + "step": 1067 + }, + { + "epoch": 1.0120824449182657, + "grad_norm": 0.9342675578725627, + "learning_rate": 5.1458499350458e-06, + "loss": 0.7005, + "step": 1068 + }, + { + "epoch": 1.0130300876569533, + "grad_norm": 0.9845076442772586, + "learning_rate": 5.138175631724495e-06, + "loss": 0.679, + "step": 1069 + }, + { + "epoch": 1.0139777303956408, + "grad_norm": 1.0058057680221788, + "learning_rate": 5.130501002626609e-06, + "loss": 0.7382, + "step": 1070 + }, + { + "epoch": 1.0149253731343284, + "grad_norm": 1.0402911368548144, + "learning_rate": 5.12282606584661e-06, + "loss": 0.7102, + "step": 1071 + }, + { + "epoch": 1.0158730158730158, + "grad_norm": 0.9653271354076595, + "learning_rate": 5.11515083947969e-06, + "loss": 0.7487, + "step": 1072 + }, + { + "epoch": 1.0168206586117035, + "grad_norm": 1.0059830380040296, + "learning_rate": 5.107475341621726e-06, + "loss": 0.697, + "step": 1073 + }, + { + "epoch": 1.0177683013503909, + "grad_norm": 0.9592083024886124, + "learning_rate": 5.099799590369231e-06, + "loss": 0.7111, + "step": 1074 + }, + { + "epoch": 1.0187159440890785, + "grad_norm": 0.9113464173920633, + "learning_rate": 5.092123603819318e-06, + "loss": 0.6739, + "step": 1075 + }, + { + "epoch": 1.019663586827766, + "grad_norm": 0.9658198205738235, + "learning_rate": 5.084447400069656e-06, + "loss": 0.672, + "step": 1076 + }, + { + "epoch": 1.0206112295664536, + "grad_norm": 0.9288542434671724, + "learning_rate": 5.076770997218424e-06, + "loss": 0.7281, + "step": 1077 + }, + { + "epoch": 1.021558872305141, + "grad_norm": 0.9793083780049829, + "learning_rate": 5.069094413364272e-06, + "loss": 0.6441, + "step": 1078 + }, + { + "epoch": 1.021558872305141, + "eval_loss": 0.9252648949623108, + "eval_runtime": 63.0278, + "eval_samples_per_second": 43.283, + "eval_steps_per_second": 0.682, + "step": 1078 + }, + { + "epoch": 1.0225065150438284, + "grad_norm": 0.9273171413752292, + "learning_rate": 5.061417666606274e-06, + "loss": 0.6967, + "step": 1079 + }, + { + "epoch": 1.023454157782516, + "grad_norm": 1.5839472443961204, + "learning_rate": 5.053740775043891e-06, + "loss": 0.7093, + "step": 1080 + }, + { + "epoch": 1.0244018005212034, + "grad_norm": 0.9765979302224368, + "learning_rate": 5.046063756776926e-06, + "loss": 0.6671, + "step": 1081 + }, + { + "epoch": 1.025349443259891, + "grad_norm": 0.9457182691315833, + "learning_rate": 5.038386629905475e-06, + "loss": 0.7088, + "step": 1082 + }, + { + "epoch": 1.0262970859985785, + "grad_norm": 1.0209484041244559, + "learning_rate": 5.030709412529896e-06, + "loss": 0.6753, + "step": 1083 + }, + { + "epoch": 1.0272447287372661, + "grad_norm": 1.0234656179924495, + "learning_rate": 5.0230321227507595e-06, + "loss": 0.7002, + "step": 1084 + }, + { + "epoch": 1.0281923714759535, + "grad_norm": 0.9799867444681427, + "learning_rate": 5.015354778668805e-06, + "loss": 0.6913, + "step": 1085 + }, + { + "epoch": 1.0291400142146412, + "grad_norm": 1.0327323089162292, + "learning_rate": 5.007677398384902e-06, + "loss": 0.7102, + "step": 1086 + }, + { + "epoch": 1.0300876569533286, + "grad_norm": 0.8542430531938927, + "learning_rate": 5e-06, + "loss": 0.7311, + "step": 1087 + }, + { + "epoch": 1.031035299692016, + "grad_norm": 0.9551198273884293, + "learning_rate": 4.992322601615101e-06, + "loss": 0.8065, + "step": 1088 + }, + { + "epoch": 1.0319829424307037, + "grad_norm": 1.2082682361729435, + "learning_rate": 4.984645221331196e-06, + "loss": 0.7087, + "step": 1089 + }, + { + "epoch": 1.032930585169391, + "grad_norm": 1.0470114950375882, + "learning_rate": 4.976967877249242e-06, + "loss": 0.694, + "step": 1090 + }, + { + "epoch": 1.0338782279080787, + "grad_norm": 0.8968991535832135, + "learning_rate": 4.969290587470106e-06, + "loss": 0.6542, + "step": 1091 + }, + { + "epoch": 1.0348258706467661, + "grad_norm": 0.9569176042470755, + "learning_rate": 4.961613370094526e-06, + "loss": 0.7053, + "step": 1092 + }, + { + "epoch": 1.0357735133854538, + "grad_norm": 0.956739530222217, + "learning_rate": 4.953936243223077e-06, + "loss": 0.7299, + "step": 1093 + }, + { + "epoch": 1.0367211561241412, + "grad_norm": 0.9576460725333583, + "learning_rate": 4.9462592249561095e-06, + "loss": 0.7516, + "step": 1094 + }, + { + "epoch": 1.0376687988628288, + "grad_norm": 1.314007292694408, + "learning_rate": 4.938582333393727e-06, + "loss": 0.7014, + "step": 1095 + }, + { + "epoch": 1.0386164416015162, + "grad_norm": 0.9732347243923025, + "learning_rate": 4.93090558663573e-06, + "loss": 0.6131, + "step": 1096 + }, + { + "epoch": 1.0395640843402036, + "grad_norm": 0.902465289858781, + "learning_rate": 4.923229002781577e-06, + "loss": 0.7244, + "step": 1097 + }, + { + "epoch": 1.0405117270788913, + "grad_norm": 0.9138525283319687, + "learning_rate": 4.915552599930345e-06, + "loss": 0.7447, + "step": 1098 + }, + { + "epoch": 1.0414593698175787, + "grad_norm": 0.8667199024588413, + "learning_rate": 4.907876396180684e-06, + "loss": 0.731, + "step": 1099 + }, + { + "epoch": 1.0424070125562663, + "grad_norm": 1.0023840202075536, + "learning_rate": 4.900200409630771e-06, + "loss": 0.7, + "step": 1100 + }, + { + "epoch": 1.0424070125562663, + "eval_loss": 0.9256648421287537, + "eval_runtime": 65.8282, + "eval_samples_per_second": 41.441, + "eval_steps_per_second": 0.653, + "step": 1100 + }, + { + "epoch": 1.0433546552949537, + "grad_norm": 1.005189996616475, + "learning_rate": 4.892524658378276e-06, + "loss": 0.662, + "step": 1101 + }, + { + "epoch": 1.0443022980336414, + "grad_norm": 1.0552789716104816, + "learning_rate": 4.884849160520311e-06, + "loss": 0.7296, + "step": 1102 + }, + { + "epoch": 1.0452499407723288, + "grad_norm": 0.9361357742033367, + "learning_rate": 4.877173934153392e-06, + "loss": 0.7036, + "step": 1103 + }, + { + "epoch": 1.0461975835110164, + "grad_norm": 0.9030236691224809, + "learning_rate": 4.869498997373393e-06, + "loss": 0.6941, + "step": 1104 + }, + { + "epoch": 1.0471452262497039, + "grad_norm": 1.0038838078279626, + "learning_rate": 4.861824368275508e-06, + "loss": 0.7321, + "step": 1105 + }, + { + "epoch": 1.0480928689883915, + "grad_norm": 0.9603184361433643, + "learning_rate": 4.854150064954201e-06, + "loss": 0.6711, + "step": 1106 + }, + { + "epoch": 1.049040511727079, + "grad_norm": 0.9225275167901936, + "learning_rate": 4.846476105503176e-06, + "loss": 0.6717, + "step": 1107 + }, + { + "epoch": 1.0499881544657663, + "grad_norm": 1.2404804952292134, + "learning_rate": 4.838802508015316e-06, + "loss": 0.7472, + "step": 1108 + }, + { + "epoch": 1.050935797204454, + "grad_norm": 0.9689214802559678, + "learning_rate": 4.83112929058266e-06, + "loss": 0.6947, + "step": 1109 + }, + { + "epoch": 1.0518834399431414, + "grad_norm": 1.1456662341258836, + "learning_rate": 4.8234564712963445e-06, + "loss": 0.7316, + "step": 1110 + }, + { + "epoch": 1.052831082681829, + "grad_norm": 0.9768673269000111, + "learning_rate": 4.815784068246571e-06, + "loss": 0.7487, + "step": 1111 + }, + { + "epoch": 1.0537787254205164, + "grad_norm": 0.9716968504214143, + "learning_rate": 4.808112099522558e-06, + "loss": 0.7056, + "step": 1112 + }, + { + "epoch": 1.054726368159204, + "grad_norm": 0.974665466729692, + "learning_rate": 4.800440583212499e-06, + "loss": 0.6911, + "step": 1113 + }, + { + "epoch": 1.0556740108978915, + "grad_norm": 0.9167885828760524, + "learning_rate": 4.792769537403523e-06, + "loss": 0.7107, + "step": 1114 + }, + { + "epoch": 1.0566216536365791, + "grad_norm": 1.0329746061334233, + "learning_rate": 4.785098980181649e-06, + "loss": 0.7229, + "step": 1115 + }, + { + "epoch": 1.0575692963752665, + "grad_norm": 0.9737952606280224, + "learning_rate": 4.777428929631743e-06, + "loss": 0.7777, + "step": 1116 + }, + { + "epoch": 1.058516939113954, + "grad_norm": 1.1038154299902683, + "learning_rate": 4.769759403837479e-06, + "loss": 0.6809, + "step": 1117 + }, + { + "epoch": 1.0594645818526416, + "grad_norm": 1.08554606134142, + "learning_rate": 4.762090420881289e-06, + "loss": 0.6669, + "step": 1118 + }, + { + "epoch": 1.060412224591329, + "grad_norm": 1.0920359773635873, + "learning_rate": 4.754421998844331e-06, + "loss": 0.6871, + "step": 1119 + }, + { + "epoch": 1.0613598673300166, + "grad_norm": 0.9825699774740995, + "learning_rate": 4.746754155806437e-06, + "loss": 0.727, + "step": 1120 + }, + { + "epoch": 1.062307510068704, + "grad_norm": 0.9674769463454311, + "learning_rate": 4.739086909846075e-06, + "loss": 0.7189, + "step": 1121 + }, + { + "epoch": 1.0632551528073917, + "grad_norm": 1.0091809961919986, + "learning_rate": 4.731420279040303e-06, + "loss": 0.7278, + "step": 1122 + }, + { + "epoch": 1.0632551528073917, + "eval_loss": 0.9248070120811462, + "eval_runtime": 64.5875, + "eval_samples_per_second": 42.237, + "eval_steps_per_second": 0.666, + "step": 1122 + }, + { + "epoch": 1.064202795546079, + "grad_norm": 1.073809323574034, + "learning_rate": 4.723754281464732e-06, + "loss": 0.7729, + "step": 1123 + }, + { + "epoch": 1.0651504382847667, + "grad_norm": 0.9206055880032425, + "learning_rate": 4.716088935193479e-06, + "loss": 0.6833, + "step": 1124 + }, + { + "epoch": 1.0660980810234542, + "grad_norm": 0.9675985199743727, + "learning_rate": 4.708424258299125e-06, + "loss": 0.7201, + "step": 1125 + }, + { + "epoch": 1.0670457237621416, + "grad_norm": 0.970501113273963, + "learning_rate": 4.700760268852669e-06, + "loss": 0.6957, + "step": 1126 + }, + { + "epoch": 1.0679933665008292, + "grad_norm": 1.0025167130129373, + "learning_rate": 4.693096984923499e-06, + "loss": 0.7329, + "step": 1127 + }, + { + "epoch": 1.0689410092395166, + "grad_norm": 1.2193095917303223, + "learning_rate": 4.68543442457933e-06, + "loss": 0.7177, + "step": 1128 + }, + { + "epoch": 1.0698886519782043, + "grad_norm": 1.0294113926873432, + "learning_rate": 4.677772605886175e-06, + "loss": 0.6829, + "step": 1129 + }, + { + "epoch": 1.0708362947168917, + "grad_norm": 0.9136644811068081, + "learning_rate": 4.670111546908299e-06, + "loss": 0.697, + "step": 1130 + }, + { + "epoch": 1.0717839374555793, + "grad_norm": 0.9443556825485072, + "learning_rate": 4.662451265708174e-06, + "loss": 0.6735, + "step": 1131 + }, + { + "epoch": 1.0727315801942667, + "grad_norm": 0.9561425995186141, + "learning_rate": 4.65479178034644e-06, + "loss": 0.7189, + "step": 1132 + }, + { + "epoch": 1.0736792229329544, + "grad_norm": 0.9931041504777264, + "learning_rate": 4.647133108881858e-06, + "loss": 0.6587, + "step": 1133 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 0.925346382362746, + "learning_rate": 4.639475269371273e-06, + "loss": 0.7157, + "step": 1134 + }, + { + "epoch": 1.0755745084103294, + "grad_norm": 1.031232406635297, + "learning_rate": 4.631818279869567e-06, + "loss": 0.7325, + "step": 1135 + }, + { + "epoch": 1.0765221511490168, + "grad_norm": 1.0831354355205014, + "learning_rate": 4.624162158429618e-06, + "loss": 0.703, + "step": 1136 + }, + { + "epoch": 1.0774697938877043, + "grad_norm": 0.9705781822246375, + "learning_rate": 4.616506923102259e-06, + "loss": 0.6238, + "step": 1137 + }, + { + "epoch": 1.078417436626392, + "grad_norm": 1.1371488724998442, + "learning_rate": 4.608852591936231e-06, + "loss": 0.7601, + "step": 1138 + }, + { + "epoch": 1.0793650793650793, + "grad_norm": 1.064474848705152, + "learning_rate": 4.601199182978146e-06, + "loss": 0.6468, + "step": 1139 + }, + { + "epoch": 1.080312722103767, + "grad_norm": 0.9450727325021634, + "learning_rate": 4.593546714272438e-06, + "loss": 0.7266, + "step": 1140 + }, + { + "epoch": 1.0812603648424544, + "grad_norm": 0.9799112965049801, + "learning_rate": 4.585895203861328e-06, + "loss": 0.7317, + "step": 1141 + }, + { + "epoch": 1.082208007581142, + "grad_norm": 0.9097169877735888, + "learning_rate": 4.5782446697847775e-06, + "loss": 0.746, + "step": 1142 + }, + { + "epoch": 1.0831556503198294, + "grad_norm": 1.054415192907425, + "learning_rate": 4.5705951300804425e-06, + "loss": 0.726, + "step": 1143 + }, + { + "epoch": 1.0841032930585168, + "grad_norm": 0.903372224445911, + "learning_rate": 4.562946602783637e-06, + "loss": 0.7171, + "step": 1144 + }, + { + "epoch": 1.0841032930585168, + "eval_loss": 0.9241182804107666, + "eval_runtime": 67.5348, + "eval_samples_per_second": 40.394, + "eval_steps_per_second": 0.637, + "step": 1144 + }, + { + "epoch": 1.0850509357972045, + "grad_norm": 1.0495310770662147, + "learning_rate": 4.55529910592729e-06, + "loss": 0.6606, + "step": 1145 + }, + { + "epoch": 1.0859985785358919, + "grad_norm": 1.3054046428477601, + "learning_rate": 4.547652657541897e-06, + "loss": 0.7109, + "step": 1146 + }, + { + "epoch": 1.0869462212745795, + "grad_norm": 0.9385889950812906, + "learning_rate": 4.540007275655485e-06, + "loss": 0.7101, + "step": 1147 + }, + { + "epoch": 1.087893864013267, + "grad_norm": 1.0307846935200982, + "learning_rate": 4.532362978293564e-06, + "loss": 0.7025, + "step": 1148 + }, + { + "epoch": 1.0888415067519546, + "grad_norm": 1.0094586805349344, + "learning_rate": 4.524719783479088e-06, + "loss": 0.7341, + "step": 1149 + }, + { + "epoch": 1.089789149490642, + "grad_norm": 1.0080024003104493, + "learning_rate": 4.517077709232411e-06, + "loss": 0.7125, + "step": 1150 + }, + { + "epoch": 1.0907367922293296, + "grad_norm": 0.9839429831089125, + "learning_rate": 4.509436773571247e-06, + "loss": 0.7263, + "step": 1151 + }, + { + "epoch": 1.091684434968017, + "grad_norm": 0.9578891424409663, + "learning_rate": 4.5017969945106225e-06, + "loss": 0.7049, + "step": 1152 + }, + { + "epoch": 1.0926320777067047, + "grad_norm": 1.589523374002844, + "learning_rate": 4.49415839006284e-06, + "loss": 0.7045, + "step": 1153 + }, + { + "epoch": 1.093579720445392, + "grad_norm": 1.1691430951977255, + "learning_rate": 4.486520978237431e-06, + "loss": 0.6681, + "step": 1154 + }, + { + "epoch": 1.0945273631840795, + "grad_norm": 1.020265471952243, + "learning_rate": 4.478884777041115e-06, + "loss": 0.7003, + "step": 1155 + }, + { + "epoch": 1.0954750059227671, + "grad_norm": 0.9179136320195528, + "learning_rate": 4.471249804477758e-06, + "loss": 0.7077, + "step": 1156 + }, + { + "epoch": 1.0964226486614546, + "grad_norm": 0.9635788033380907, + "learning_rate": 4.4636160785483285e-06, + "loss": 0.7151, + "step": 1157 + }, + { + "epoch": 1.0973702914001422, + "grad_norm": 2.647967184274327, + "learning_rate": 4.455983617250857e-06, + "loss": 0.7341, + "step": 1158 + }, + { + "epoch": 1.0983179341388296, + "grad_norm": 1.0131020212308353, + "learning_rate": 4.448352438580391e-06, + "loss": 0.6905, + "step": 1159 + }, + { + "epoch": 1.0992655768775172, + "grad_norm": 1.0530355520145287, + "learning_rate": 4.440722560528955e-06, + "loss": 0.6387, + "step": 1160 + }, + { + "epoch": 1.1002132196162047, + "grad_norm": 0.9509811853766807, + "learning_rate": 4.433094001085505e-06, + "loss": 0.7466, + "step": 1161 + }, + { + "epoch": 1.1011608623548923, + "grad_norm": 1.0138369505840823, + "learning_rate": 4.4254667782358925e-06, + "loss": 0.679, + "step": 1162 + }, + { + "epoch": 1.1021085050935797, + "grad_norm": 1.0773914127698383, + "learning_rate": 4.417840909962813e-06, + "loss": 0.7367, + "step": 1163 + }, + { + "epoch": 1.1030561478322674, + "grad_norm": 1.3528732394706713, + "learning_rate": 4.410216414245771e-06, + "loss": 0.7166, + "step": 1164 + }, + { + "epoch": 1.1040037905709548, + "grad_norm": 1.0120462677381739, + "learning_rate": 4.402593309061034e-06, + "loss": 0.6599, + "step": 1165 + }, + { + "epoch": 1.1049514333096422, + "grad_norm": 0.9783229280065361, + "learning_rate": 4.394971612381591e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 1.1049514333096422, + "eval_loss": 0.9224104285240173, + "eval_runtime": 63.0367, + "eval_samples_per_second": 43.276, + "eval_steps_per_second": 0.682, + "step": 1166 + }, + { + "epoch": 1.1058990760483298, + "grad_norm": 0.9564604779753705, + "learning_rate": 4.38735134217711e-06, + "loss": 0.7522, + "step": 1167 + }, + { + "epoch": 1.1068467187870172, + "grad_norm": 0.9360222777010359, + "learning_rate": 4.379732516413897e-06, + "loss": 0.6734, + "step": 1168 + }, + { + "epoch": 1.1077943615257049, + "grad_norm": 0.8943163287031561, + "learning_rate": 4.372115153054851e-06, + "loss": 0.7118, + "step": 1169 + }, + { + "epoch": 1.1087420042643923, + "grad_norm": 1.179413315968657, + "learning_rate": 4.364499270059423e-06, + "loss": 0.6538, + "step": 1170 + }, + { + "epoch": 1.10968964700308, + "grad_norm": 0.9833678397573673, + "learning_rate": 4.356884885383578e-06, + "loss": 0.7024, + "step": 1171 + }, + { + "epoch": 1.1106372897417673, + "grad_norm": 1.1165040405330118, + "learning_rate": 4.34927201697974e-06, + "loss": 0.7223, + "step": 1172 + }, + { + "epoch": 1.1115849324804548, + "grad_norm": 1.0836563622250095, + "learning_rate": 4.341660682796766e-06, + "loss": 0.7432, + "step": 1173 + }, + { + "epoch": 1.1125325752191424, + "grad_norm": 0.9956369650089748, + "learning_rate": 4.334050900779893e-06, + "loss": 0.6979, + "step": 1174 + }, + { + "epoch": 1.1134802179578298, + "grad_norm": 0.9041906906929965, + "learning_rate": 4.326442688870697e-06, + "loss": 0.7818, + "step": 1175 + }, + { + "epoch": 1.1144278606965174, + "grad_norm": 1.0233141405037254, + "learning_rate": 4.318836065007052e-06, + "loss": 0.6802, + "step": 1176 + }, + { + "epoch": 1.1153755034352049, + "grad_norm": 1.0925280426338722, + "learning_rate": 4.3112310471230925e-06, + "loss": 0.7202, + "step": 1177 + }, + { + "epoch": 1.1163231461738925, + "grad_norm": 1.0258996034471566, + "learning_rate": 4.303627653149159e-06, + "loss": 0.7173, + "step": 1178 + }, + { + "epoch": 1.11727078891258, + "grad_norm": 1.0595188906999566, + "learning_rate": 4.296025901011773e-06, + "loss": 0.7402, + "step": 1179 + }, + { + "epoch": 1.1182184316512676, + "grad_norm": 0.9566532678110401, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.6911, + "step": 1180 + }, + { + "epoch": 1.119166074389955, + "grad_norm": 0.9954131257021107, + "learning_rate": 4.2808273939333e-06, + "loss": 0.6893, + "step": 1181 + }, + { + "epoch": 1.1201137171286426, + "grad_norm": 0.9760861787884846, + "learning_rate": 4.2732306748257226e-06, + "loss": 0.6839, + "step": 1182 + }, + { + "epoch": 1.12106135986733, + "grad_norm": 1.116236364521447, + "learning_rate": 4.265635669221622e-06, + "loss": 0.7272, + "step": 1183 + }, + { + "epoch": 1.1220090026060174, + "grad_norm": 0.9772190754750057, + "learning_rate": 4.258042395027738e-06, + "loss": 0.7048, + "step": 1184 + }, + { + "epoch": 1.122956645344705, + "grad_norm": 0.9990968345465719, + "learning_rate": 4.250450870146726e-06, + "loss": 0.6661, + "step": 1185 + }, + { + "epoch": 1.1239042880833925, + "grad_norm": 1.004582020487418, + "learning_rate": 4.2428611124771184e-06, + "loss": 0.7158, + "step": 1186 + }, + { + "epoch": 1.1248519308220801, + "grad_norm": 1.0285222277895798, + "learning_rate": 4.235273139913281e-06, + "loss": 0.6759, + "step": 1187 + }, + { + "epoch": 1.1257995735607675, + "grad_norm": 1.026042016166187, + "learning_rate": 4.227686970345373e-06, + "loss": 0.6767, + "step": 1188 + }, + { + "epoch": 1.1257995735607675, + "eval_loss": 0.9233511090278625, + "eval_runtime": 63.5378, + "eval_samples_per_second": 42.935, + "eval_steps_per_second": 0.677, + "step": 1188 + }, + { + "epoch": 1.1267472162994552, + "grad_norm": 0.9839710491649496, + "learning_rate": 4.220102621659298e-06, + "loss": 0.698, + "step": 1189 + }, + { + "epoch": 1.1276948590381426, + "grad_norm": 1.3599383760269543, + "learning_rate": 4.21252011173667e-06, + "loss": 0.7257, + "step": 1190 + }, + { + "epoch": 1.1286425017768302, + "grad_norm": 1.1366178207656392, + "learning_rate": 4.204939458454767e-06, + "loss": 0.7008, + "step": 1191 + }, + { + "epoch": 1.1295901445155176, + "grad_norm": 0.95168166219681, + "learning_rate": 4.197360679686489e-06, + "loss": 0.6956, + "step": 1192 + }, + { + "epoch": 1.1305377872542053, + "grad_norm": 1.0580531496952468, + "learning_rate": 4.1897837933003165e-06, + "loss": 0.6555, + "step": 1193 + }, + { + "epoch": 1.1314854299928927, + "grad_norm": 1.3388307797907961, + "learning_rate": 4.182208817160269e-06, + "loss": 0.7038, + "step": 1194 + }, + { + "epoch": 1.1324330727315801, + "grad_norm": 1.263598798657549, + "learning_rate": 4.174635769125862e-06, + "loss": 0.6939, + "step": 1195 + }, + { + "epoch": 1.1333807154702678, + "grad_norm": 0.9897430245234835, + "learning_rate": 4.1670646670520656e-06, + "loss": 0.6949, + "step": 1196 + }, + { + "epoch": 1.1343283582089552, + "grad_norm": 1.5009873894314265, + "learning_rate": 4.15949552878926e-06, + "loss": 0.663, + "step": 1197 + }, + { + "epoch": 1.1352760009476428, + "grad_norm": 1.022385852836757, + "learning_rate": 4.151928372183198e-06, + "loss": 0.7124, + "step": 1198 + }, + { + "epoch": 1.1362236436863302, + "grad_norm": 1.1789551448297066, + "learning_rate": 4.144363215074959e-06, + "loss": 0.6713, + "step": 1199 + }, + { + "epoch": 1.1371712864250179, + "grad_norm": 1.0079132927023848, + "learning_rate": 4.136800075300906e-06, + "loss": 0.6997, + "step": 1200 + }, + { + "epoch": 1.1381189291637053, + "grad_norm": 0.9647107031990494, + "learning_rate": 4.129238970692651e-06, + "loss": 0.6968, + "step": 1201 + }, + { + "epoch": 1.1390665719023927, + "grad_norm": 1.0120852534707783, + "learning_rate": 4.121679919077001e-06, + "loss": 0.7705, + "step": 1202 + }, + { + "epoch": 1.1400142146410803, + "grad_norm": 3.766970964988497, + "learning_rate": 4.114122938275929e-06, + "loss": 0.664, + "step": 1203 + }, + { + "epoch": 1.1409618573797677, + "grad_norm": 1.0763801437474894, + "learning_rate": 4.10656804610652e-06, + "loss": 0.7236, + "step": 1204 + }, + { + "epoch": 1.1419095001184554, + "grad_norm": 1.0272206175047942, + "learning_rate": 4.0990152603809394e-06, + "loss": 0.7017, + "step": 1205 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0151383640578906, + "learning_rate": 4.091464598906385e-06, + "loss": 0.7475, + "step": 1206 + }, + { + "epoch": 1.1438047855958304, + "grad_norm": 1.0605465057418049, + "learning_rate": 4.083916079485044e-06, + "loss": 0.7003, + "step": 1207 + }, + { + "epoch": 1.1447524283345178, + "grad_norm": 0.9663496293680089, + "learning_rate": 4.076369719914055e-06, + "loss": 0.7059, + "step": 1208 + }, + { + "epoch": 1.1457000710732055, + "grad_norm": 1.5121184694790173, + "learning_rate": 4.068825537985465e-06, + "loss": 0.7403, + "step": 1209 + }, + { + "epoch": 1.146647713811893, + "grad_norm": 0.999707638130472, + "learning_rate": 4.061283551486185e-06, + "loss": 0.6822, + "step": 1210 + }, + { + "epoch": 1.146647713811893, + "eval_loss": 0.9220083355903625, + "eval_runtime": 64.0249, + "eval_samples_per_second": 42.608, + "eval_steps_per_second": 0.672, + "step": 1210 + }, + { + "epoch": 1.1475953565505805, + "grad_norm": 0.9638255107668675, + "learning_rate": 4.053743778197951e-06, + "loss": 0.6955, + "step": 1211 + }, + { + "epoch": 1.148542999289268, + "grad_norm": 1.092006543225386, + "learning_rate": 4.04620623589728e-06, + "loss": 0.7363, + "step": 1212 + }, + { + "epoch": 1.1494906420279554, + "grad_norm": 0.9984016047331302, + "learning_rate": 4.038670942355431e-06, + "loss": 0.6918, + "step": 1213 + }, + { + "epoch": 1.150438284766643, + "grad_norm": 1.0035214440616442, + "learning_rate": 4.03113791533836e-06, + "loss": 0.6924, + "step": 1214 + }, + { + "epoch": 1.1513859275053304, + "grad_norm": 1.0110914807155829, + "learning_rate": 4.023607172606676e-06, + "loss": 0.6946, + "step": 1215 + }, + { + "epoch": 1.152333570244018, + "grad_norm": 0.8736055096829543, + "learning_rate": 4.016078731915608e-06, + "loss": 0.775, + "step": 1216 + }, + { + "epoch": 1.1532812129827055, + "grad_norm": 0.9939438831479498, + "learning_rate": 4.008552611014955e-06, + "loss": 0.6888, + "step": 1217 + }, + { + "epoch": 1.154228855721393, + "grad_norm": 1.049570796703869, + "learning_rate": 4.001028827649046e-06, + "loss": 0.7094, + "step": 1218 + }, + { + "epoch": 1.1551764984600805, + "grad_norm": 1.148462409040153, + "learning_rate": 3.993507399556699e-06, + "loss": 0.6845, + "step": 1219 + }, + { + "epoch": 1.156124141198768, + "grad_norm": 0.9773344806508405, + "learning_rate": 3.9859883444711795e-06, + "loss": 0.6948, + "step": 1220 + }, + { + "epoch": 1.1570717839374556, + "grad_norm": 1.0928186343002937, + "learning_rate": 3.978471680120157e-06, + "loss": 0.7538, + "step": 1221 + }, + { + "epoch": 1.1580194266761432, + "grad_norm": 1.193743573791038, + "learning_rate": 3.970957424225666e-06, + "loss": 0.7024, + "step": 1222 + }, + { + "epoch": 1.1589670694148306, + "grad_norm": 1.1120074499425576, + "learning_rate": 3.963445594504062e-06, + "loss": 0.6627, + "step": 1223 + }, + { + "epoch": 1.159914712153518, + "grad_norm": 1.2788177552822944, + "learning_rate": 3.955936208665979e-06, + "loss": 0.6673, + "step": 1224 + }, + { + "epoch": 1.1608623548922057, + "grad_norm": 1.0546418537225764, + "learning_rate": 3.9484292844162905e-06, + "loss": 0.6398, + "step": 1225 + }, + { + "epoch": 1.161809997630893, + "grad_norm": 0.9380567432599234, + "learning_rate": 3.940924839454067e-06, + "loss": 0.6736, + "step": 1226 + }, + { + "epoch": 1.1627576403695807, + "grad_norm": 1.0622788802891603, + "learning_rate": 3.933422891472532e-06, + "loss": 0.6881, + "step": 1227 + }, + { + "epoch": 1.1637052831082682, + "grad_norm": 1.0284270221411218, + "learning_rate": 3.925923458159023e-06, + "loss": 0.6836, + "step": 1228 + }, + { + "epoch": 1.1646529258469558, + "grad_norm": 1.135279890250597, + "learning_rate": 3.918426557194947e-06, + "loss": 0.7027, + "step": 1229 + }, + { + "epoch": 1.1656005685856432, + "grad_norm": 1.0051047734363374, + "learning_rate": 3.910932206255742e-06, + "loss": 0.6571, + "step": 1230 + }, + { + "epoch": 1.1665482113243306, + "grad_norm": 1.0386743766132205, + "learning_rate": 3.903440423010835e-06, + "loss": 0.7293, + "step": 1231 + }, + { + "epoch": 1.1674958540630183, + "grad_norm": 0.9974944591028375, + "learning_rate": 3.895951225123595e-06, + "loss": 0.7061, + "step": 1232 + }, + { + "epoch": 1.1674958540630183, + "eval_loss": 0.92330402135849, + "eval_runtime": 61.1167, + "eval_samples_per_second": 44.636, + "eval_steps_per_second": 0.704, + "step": 1232 + }, + { + "epoch": 1.1684434968017057, + "grad_norm": 1.0523489123192147, + "learning_rate": 3.8884646302512985e-06, + "loss": 0.6744, + "step": 1233 + }, + { + "epoch": 1.1693911395403933, + "grad_norm": 1.0581086896752634, + "learning_rate": 3.880980656045087e-06, + "loss": 0.7234, + "step": 1234 + }, + { + "epoch": 1.1703387822790807, + "grad_norm": 1.081206532147213, + "learning_rate": 3.873499320149918e-06, + "loss": 0.7075, + "step": 1235 + }, + { + "epoch": 1.1712864250177684, + "grad_norm": 1.0406772938616795, + "learning_rate": 3.866020640204533e-06, + "loss": 0.6703, + "step": 1236 + }, + { + "epoch": 1.1722340677564558, + "grad_norm": 1.0457307864336358, + "learning_rate": 3.858544633841409e-06, + "loss": 0.6763, + "step": 1237 + }, + { + "epoch": 1.1731817104951434, + "grad_norm": 0.9946842072910351, + "learning_rate": 3.851071318686721e-06, + "loss": 0.6393, + "step": 1238 + }, + { + "epoch": 1.1741293532338308, + "grad_norm": 0.9269746374884014, + "learning_rate": 3.843600712360298e-06, + "loss": 0.729, + "step": 1239 + }, + { + "epoch": 1.1750769959725185, + "grad_norm": 0.9972213948459809, + "learning_rate": 3.836132832475583e-06, + "loss": 0.6714, + "step": 1240 + }, + { + "epoch": 1.1760246387112059, + "grad_norm": 1.1281466636664788, + "learning_rate": 3.8286676966395895e-06, + "loss": 0.7375, + "step": 1241 + }, + { + "epoch": 1.1769722814498933, + "grad_norm": 1.0775487768465717, + "learning_rate": 3.821205322452863e-06, + "loss": 0.7771, + "step": 1242 + }, + { + "epoch": 1.177919924188581, + "grad_norm": 0.9769500849217977, + "learning_rate": 3.813745727509439e-06, + "loss": 0.7238, + "step": 1243 + }, + { + "epoch": 1.1788675669272684, + "grad_norm": 0.9685742121387568, + "learning_rate": 3.806288929396798e-06, + "loss": 0.7081, + "step": 1244 + }, + { + "epoch": 1.179815209665956, + "grad_norm": 1.096837571136743, + "learning_rate": 3.798834945695826e-06, + "loss": 0.6977, + "step": 1245 + }, + { + "epoch": 1.1807628524046434, + "grad_norm": 1.1006333163134505, + "learning_rate": 3.7913837939807763e-06, + "loss": 0.6762, + "step": 1246 + }, + { + "epoch": 1.181710495143331, + "grad_norm": 0.9658169918126236, + "learning_rate": 3.783935491819222e-06, + "loss": 0.6904, + "step": 1247 + }, + { + "epoch": 1.1826581378820185, + "grad_norm": 1.1429805038475467, + "learning_rate": 3.77649005677202e-06, + "loss": 0.7098, + "step": 1248 + }, + { + "epoch": 1.1836057806207059, + "grad_norm": 1.1176775156483372, + "learning_rate": 3.769047506393267e-06, + "loss": 0.6764, + "step": 1249 + }, + { + "epoch": 1.1845534233593935, + "grad_norm": 1.0183368197142009, + "learning_rate": 3.7616078582302575e-06, + "loss": 0.731, + "step": 1250 + }, + { + "epoch": 1.1855010660980811, + "grad_norm": 0.9859557354856052, + "learning_rate": 3.754171129823444e-06, + "loss": 0.7222, + "step": 1251 + }, + { + "epoch": 1.1864487088367686, + "grad_norm": 0.9582987322265264, + "learning_rate": 3.7467373387063973e-06, + "loss": 0.6739, + "step": 1252 + }, + { + "epoch": 1.187396351575456, + "grad_norm": 0.9031196186509544, + "learning_rate": 3.7393065024057597e-06, + "loss": 0.7282, + "step": 1253 + }, + { + "epoch": 1.1883439943141436, + "grad_norm": 1.0684330693325141, + "learning_rate": 3.7318786384412076e-06, + "loss": 0.6953, + "step": 1254 + }, + { + "epoch": 1.1883439943141436, + "eval_loss": 0.920585036277771, + "eval_runtime": 65.1767, + "eval_samples_per_second": 41.855, + "eval_steps_per_second": 0.66, + "step": 1254 + }, + { + "epoch": 1.189291637052831, + "grad_norm": 1.148345248080178, + "learning_rate": 3.7244537643254115e-06, + "loss": 0.7035, + "step": 1255 + }, + { + "epoch": 1.1902392797915187, + "grad_norm": 1.0249604355926194, + "learning_rate": 3.7170318975639902e-06, + "loss": 0.7582, + "step": 1256 + }, + { + "epoch": 1.191186922530206, + "grad_norm": 1.179036054066612, + "learning_rate": 3.7096130556554744e-06, + "loss": 0.697, + "step": 1257 + }, + { + "epoch": 1.1921345652688937, + "grad_norm": 1.036930121403606, + "learning_rate": 3.70219725609126e-06, + "loss": 0.7452, + "step": 1258 + }, + { + "epoch": 1.1930822080075811, + "grad_norm": 0.9553861484853223, + "learning_rate": 3.694784516355573e-06, + "loss": 0.7419, + "step": 1259 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.9117393301073062, + "learning_rate": 3.687374853925425e-06, + "loss": 0.6818, + "step": 1260 + }, + { + "epoch": 1.1949774934849562, + "grad_norm": 1.109221558375404, + "learning_rate": 3.679968286270571e-06, + "loss": 0.6819, + "step": 1261 + }, + { + "epoch": 1.1959251362236436, + "grad_norm": 1.020240570463157, + "learning_rate": 3.67256483085347e-06, + "loss": 0.7115, + "step": 1262 + }, + { + "epoch": 1.1968727789623312, + "grad_norm": 1.0960139903595318, + "learning_rate": 3.6651645051292415e-06, + "loss": 0.7298, + "step": 1263 + }, + { + "epoch": 1.1978204217010187, + "grad_norm": 0.8730491568921783, + "learning_rate": 3.6577673265456296e-06, + "loss": 0.6626, + "step": 1264 + }, + { + "epoch": 1.1987680644397063, + "grad_norm": 1.0528341736215752, + "learning_rate": 3.6503733125429557e-06, + "loss": 0.7439, + "step": 1265 + }, + { + "epoch": 1.1997157071783937, + "grad_norm": 1.0899721179963884, + "learning_rate": 3.6429824805540816e-06, + "loss": 0.6907, + "step": 1266 + }, + { + "epoch": 1.2006633499170813, + "grad_norm": 1.0591609285941943, + "learning_rate": 3.6355948480043647e-06, + "loss": 0.6818, + "step": 1267 + }, + { + "epoch": 1.2016109926557688, + "grad_norm": 1.091729744316094, + "learning_rate": 3.628210432311621e-06, + "loss": 0.7118, + "step": 1268 + }, + { + "epoch": 1.2025586353944564, + "grad_norm": 1.1106751888187103, + "learning_rate": 3.620829250886083e-06, + "loss": 0.7496, + "step": 1269 + }, + { + "epoch": 1.2035062781331438, + "grad_norm": 0.9071769388935164, + "learning_rate": 3.6134513211303555e-06, + "loss": 0.6996, + "step": 1270 + }, + { + "epoch": 1.2044539208718312, + "grad_norm": 0.9816514708234372, + "learning_rate": 3.606076660439378e-06, + "loss": 0.7154, + "step": 1271 + }, + { + "epoch": 1.2054015636105189, + "grad_norm": 0.9314656323457674, + "learning_rate": 3.5987052862003824e-06, + "loss": 0.7288, + "step": 1272 + }, + { + "epoch": 1.2063492063492063, + "grad_norm": 1.0352321626353647, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.6235, + "step": 1273 + }, + { + "epoch": 1.207296849087894, + "grad_norm": 1.3247779592651683, + "learning_rate": 3.58397246658848e-06, + "loss": 0.7273, + "step": 1274 + }, + { + "epoch": 1.2082444918265813, + "grad_norm": 0.9925803897741295, + "learning_rate": 3.5766110559511313e-06, + "loss": 0.749, + "step": 1275 + }, + { + "epoch": 1.209192134565269, + "grad_norm": 0.9515222056049948, + "learning_rate": 3.569253001236795e-06, + "loss": 0.7559, + "step": 1276 + }, + { + "epoch": 1.209192134565269, + "eval_loss": 0.9210101366043091, + "eval_runtime": 60.5067, + "eval_samples_per_second": 45.086, + "eval_steps_per_second": 0.711, + "step": 1276 + }, + { + "epoch": 1.2101397773039564, + "grad_norm": 1.0505259866511463, + "learning_rate": 3.561898319793555e-06, + "loss": 0.6777, + "step": 1277 + }, + { + "epoch": 1.2110874200426438, + "grad_norm": 1.0470739963568594, + "learning_rate": 3.554547028961537e-06, + "loss": 0.6687, + "step": 1278 + }, + { + "epoch": 1.2120350627813314, + "grad_norm": 0.9372200609494287, + "learning_rate": 3.5471991460728725e-06, + "loss": 0.7364, + "step": 1279 + }, + { + "epoch": 1.212982705520019, + "grad_norm": 1.0677225546085984, + "learning_rate": 3.5398546884516606e-06, + "loss": 0.6946, + "step": 1280 + }, + { + "epoch": 1.2139303482587065, + "grad_norm": 1.2432772070818632, + "learning_rate": 3.5325136734139213e-06, + "loss": 0.7216, + "step": 1281 + }, + { + "epoch": 1.214877990997394, + "grad_norm": 0.9586638500297385, + "learning_rate": 3.5251761182675626e-06, + "loss": 0.6836, + "step": 1282 + }, + { + "epoch": 1.2158256337360815, + "grad_norm": 1.0193646692337255, + "learning_rate": 3.5178420403123307e-06, + "loss": 0.7499, + "step": 1283 + }, + { + "epoch": 1.216773276474769, + "grad_norm": 0.9934250718994667, + "learning_rate": 3.510511456839777e-06, + "loss": 0.7127, + "step": 1284 + }, + { + "epoch": 1.2177209192134566, + "grad_norm": 1.018374538431729, + "learning_rate": 3.5031843851332105e-06, + "loss": 0.7211, + "step": 1285 + }, + { + "epoch": 1.218668561952144, + "grad_norm": 1.0345735993851575, + "learning_rate": 3.495860842467664e-06, + "loss": 0.7196, + "step": 1286 + }, + { + "epoch": 1.2196162046908317, + "grad_norm": 1.053615402301811, + "learning_rate": 3.488540846109849e-06, + "loss": 0.6648, + "step": 1287 + }, + { + "epoch": 1.220563847429519, + "grad_norm": 1.0164841506591313, + "learning_rate": 3.481224413318114e-06, + "loss": 0.6602, + "step": 1288 + }, + { + "epoch": 1.2215114901682065, + "grad_norm": 1.0367484711368176, + "learning_rate": 3.4739115613424078e-06, + "loss": 0.7115, + "step": 1289 + }, + { + "epoch": 1.2224591329068941, + "grad_norm": 1.038323393287383, + "learning_rate": 3.4666023074242356e-06, + "loss": 0.6587, + "step": 1290 + }, + { + "epoch": 1.2234067756455815, + "grad_norm": 0.9327274906523454, + "learning_rate": 3.459296668796619e-06, + "loss": 0.6846, + "step": 1291 + }, + { + "epoch": 1.2243544183842692, + "grad_norm": 1.0084456960618864, + "learning_rate": 3.451994662684057e-06, + "loss": 0.7076, + "step": 1292 + }, + { + "epoch": 1.2253020611229566, + "grad_norm": 1.0275585469763515, + "learning_rate": 3.4446963063024854e-06, + "loss": 0.691, + "step": 1293 + }, + { + "epoch": 1.2262497038616442, + "grad_norm": 1.125902212799892, + "learning_rate": 3.4374016168592296e-06, + "loss": 0.7251, + "step": 1294 + }, + { + "epoch": 1.2271973466003316, + "grad_norm": 1.0322864795599813, + "learning_rate": 3.4301106115529766e-06, + "loss": 0.7284, + "step": 1295 + }, + { + "epoch": 1.2281449893390193, + "grad_norm": 0.9989097221420168, + "learning_rate": 3.4228233075737225e-06, + "loss": 0.7035, + "step": 1296 + }, + { + "epoch": 1.2290926320777067, + "grad_norm": 0.9373804854464124, + "learning_rate": 3.4155397221027396e-06, + "loss": 0.7139, + "step": 1297 + }, + { + "epoch": 1.2300402748163943, + "grad_norm": 1.0343160679933974, + "learning_rate": 3.4082598723125303e-06, + "loss": 0.6859, + "step": 1298 + }, + { + "epoch": 1.2300402748163943, + "eval_loss": 0.9219260215759277, + "eval_runtime": 64.2211, + "eval_samples_per_second": 42.478, + "eval_steps_per_second": 0.67, + "step": 1298 + }, + { + "epoch": 1.2309879175550817, + "grad_norm": 1.0041258596912708, + "learning_rate": 3.4009837753667918e-06, + "loss": 0.6752, + "step": 1299 + }, + { + "epoch": 1.2319355602937692, + "grad_norm": 1.062141619360519, + "learning_rate": 3.393711448420372e-06, + "loss": 0.7558, + "step": 1300 + }, + { + "epoch": 1.2328832030324568, + "grad_norm": 1.1486226243514412, + "learning_rate": 3.3864429086192295e-06, + "loss": 0.6976, + "step": 1301 + }, + { + "epoch": 1.2338308457711442, + "grad_norm": 1.0162684385678484, + "learning_rate": 3.379178173100396e-06, + "loss": 0.6503, + "step": 1302 + }, + { + "epoch": 1.2347784885098319, + "grad_norm": 1.0535985388570441, + "learning_rate": 3.371917258991933e-06, + "loss": 0.7014, + "step": 1303 + }, + { + "epoch": 1.2357261312485193, + "grad_norm": 0.9476340249103999, + "learning_rate": 3.3646601834128924e-06, + "loss": 0.7243, + "step": 1304 + }, + { + "epoch": 1.236673773987207, + "grad_norm": 0.9722138093221362, + "learning_rate": 3.3574069634732744e-06, + "loss": 0.6936, + "step": 1305 + }, + { + "epoch": 1.2376214167258943, + "grad_norm": 0.960280885803354, + "learning_rate": 3.3501576162739903e-06, + "loss": 0.7258, + "step": 1306 + }, + { + "epoch": 1.2385690594645817, + "grad_norm": 1.011953193368571, + "learning_rate": 3.3429121589068213e-06, + "loss": 0.7573, + "step": 1307 + }, + { + "epoch": 1.2395167022032694, + "grad_norm": 1.035071350641064, + "learning_rate": 3.3356706084543766e-06, + "loss": 0.7303, + "step": 1308 + }, + { + "epoch": 1.2404643449419568, + "grad_norm": 1.0243269638647712, + "learning_rate": 3.328432981990053e-06, + "loss": 0.7117, + "step": 1309 + }, + { + "epoch": 1.2414119876806444, + "grad_norm": 1.00968974551286, + "learning_rate": 3.3211992965779984e-06, + "loss": 0.6356, + "step": 1310 + }, + { + "epoch": 1.2423596304193318, + "grad_norm": 0.9221223902659512, + "learning_rate": 3.3139695692730644e-06, + "loss": 0.6582, + "step": 1311 + }, + { + "epoch": 1.2433072731580195, + "grad_norm": 1.063526317074848, + "learning_rate": 3.306743817120777e-06, + "loss": 0.6458, + "step": 1312 + }, + { + "epoch": 1.244254915896707, + "grad_norm": 1.0099993891673738, + "learning_rate": 3.2995220571572845e-06, + "loss": 0.6945, + "step": 1313 + }, + { + "epoch": 1.2452025586353945, + "grad_norm": 1.2682014688129715, + "learning_rate": 3.2923043064093252e-06, + "loss": 0.7106, + "step": 1314 + }, + { + "epoch": 1.246150201374082, + "grad_norm": 0.9957192198152068, + "learning_rate": 3.2850905818941853e-06, + "loss": 0.7159, + "step": 1315 + }, + { + "epoch": 1.2470978441127696, + "grad_norm": 0.970490636667617, + "learning_rate": 3.2778809006196564e-06, + "loss": 0.7628, + "step": 1316 + }, + { + "epoch": 1.248045486851457, + "grad_norm": 1.049773961103418, + "learning_rate": 3.2706752795839984e-06, + "loss": 0.7065, + "step": 1317 + }, + { + "epoch": 1.2489931295901444, + "grad_norm": 1.0171220181984746, + "learning_rate": 3.2634737357758994e-06, + "loss": 0.6594, + "step": 1318 + }, + { + "epoch": 1.249940772328832, + "grad_norm": 1.0689789937287877, + "learning_rate": 3.256276286174433e-06, + "loss": 0.72, + "step": 1319 + }, + { + "epoch": 1.2508884150675195, + "grad_norm": 0.9750893399192391, + "learning_rate": 3.2490829477490194e-06, + "loss": 0.7237, + "step": 1320 + }, + { + "epoch": 1.2508884150675195, + "eval_loss": 0.9202948808670044, + "eval_runtime": 64.6354, + "eval_samples_per_second": 42.206, + "eval_steps_per_second": 0.665, + "step": 1320 + }, + { + "epoch": 1.251836057806207, + "grad_norm": 1.0670485098725635, + "learning_rate": 3.2418937374593895e-06, + "loss": 0.7168, + "step": 1321 + }, + { + "epoch": 1.2527837005448945, + "grad_norm": 0.9432261527116877, + "learning_rate": 3.2347086722555382e-06, + "loss": 0.741, + "step": 1322 + }, + { + "epoch": 1.2537313432835822, + "grad_norm": 0.9750755053529253, + "learning_rate": 3.2275277690776876e-06, + "loss": 0.6547, + "step": 1323 + }, + { + "epoch": 1.2546789860222696, + "grad_norm": 0.9301758889182347, + "learning_rate": 3.220351044856247e-06, + "loss": 0.7478, + "step": 1324 + }, + { + "epoch": 1.255626628760957, + "grad_norm": 1.0318451433516014, + "learning_rate": 3.2131785165117748e-06, + "loss": 0.6562, + "step": 1325 + }, + { + "epoch": 1.2565742714996446, + "grad_norm": 0.889824279332266, + "learning_rate": 3.206010200954935e-06, + "loss": 0.6682, + "step": 1326 + }, + { + "epoch": 1.2575219142383323, + "grad_norm": 0.9419702907671557, + "learning_rate": 3.198846115086459e-06, + "loss": 0.6833, + "step": 1327 + }, + { + "epoch": 1.2584695569770197, + "grad_norm": 1.0527246614550414, + "learning_rate": 3.191686275797107e-06, + "loss": 0.7099, + "step": 1328 + }, + { + "epoch": 1.259417199715707, + "grad_norm": 1.055473610231369, + "learning_rate": 3.1845306999676274e-06, + "loss": 0.6996, + "step": 1329 + }, + { + "epoch": 1.2603648424543947, + "grad_norm": 0.9595441733482751, + "learning_rate": 3.177379404468715e-06, + "loss": 0.6818, + "step": 1330 + }, + { + "epoch": 1.2613124851930821, + "grad_norm": 1.0295006726261346, + "learning_rate": 3.170232406160974e-06, + "loss": 0.6539, + "step": 1331 + }, + { + "epoch": 1.2622601279317698, + "grad_norm": 1.124771581323331, + "learning_rate": 3.1630897218948765e-06, + "loss": 0.6911, + "step": 1332 + }, + { + "epoch": 1.2632077706704572, + "grad_norm": 0.9768323438673063, + "learning_rate": 3.1559513685107233e-06, + "loss": 0.7021, + "step": 1333 + }, + { + "epoch": 1.2641554134091448, + "grad_norm": 0.9702345738300058, + "learning_rate": 3.1488173628386066e-06, + "loss": 0.7039, + "step": 1334 + }, + { + "epoch": 1.2651030561478323, + "grad_norm": 0.9740566736999792, + "learning_rate": 3.141687721698363e-06, + "loss": 0.7201, + "step": 1335 + }, + { + "epoch": 1.2660506988865197, + "grad_norm": 1.0341764156676143, + "learning_rate": 3.1345624618995444e-06, + "loss": 0.6815, + "step": 1336 + }, + { + "epoch": 1.2669983416252073, + "grad_norm": 1.0038570117243435, + "learning_rate": 3.127441600241369e-06, + "loss": 0.6874, + "step": 1337 + }, + { + "epoch": 1.267945984363895, + "grad_norm": 0.9492110824297334, + "learning_rate": 3.1203251535126867e-06, + "loss": 0.6973, + "step": 1338 + }, + { + "epoch": 1.2688936271025824, + "grad_norm": 1.2890789269673384, + "learning_rate": 3.11321313849194e-06, + "loss": 0.7239, + "step": 1339 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.9728809474646835, + "learning_rate": 3.10610557194712e-06, + "loss": 0.6764, + "step": 1340 + }, + { + "epoch": 1.2707889125799574, + "grad_norm": 1.1635688288087744, + "learning_rate": 3.0990024706357314e-06, + "loss": 0.6918, + "step": 1341 + }, + { + "epoch": 1.2717365553186448, + "grad_norm": 0.9704804371870352, + "learning_rate": 3.0919038513047507e-06, + "loss": 0.7398, + "step": 1342 + }, + { + "epoch": 1.2717365553186448, + "eval_loss": 0.9205412864685059, + "eval_runtime": 64.3962, + "eval_samples_per_second": 42.363, + "eval_steps_per_second": 0.668, + "step": 1342 + }, + { + "epoch": 1.2726841980573325, + "grad_norm": 0.9623053497453619, + "learning_rate": 3.084809730690587e-06, + "loss": 0.7125, + "step": 1343 + }, + { + "epoch": 1.2736318407960199, + "grad_norm": 1.0918361707437365, + "learning_rate": 3.077720125519042e-06, + "loss": 0.6929, + "step": 1344 + }, + { + "epoch": 1.2745794835347075, + "grad_norm": 1.0920621674730842, + "learning_rate": 3.070635052505273e-06, + "loss": 0.736, + "step": 1345 + }, + { + "epoch": 1.275527126273395, + "grad_norm": 0.9952373114086388, + "learning_rate": 3.0635545283537523e-06, + "loss": 0.687, + "step": 1346 + }, + { + "epoch": 1.2764747690120823, + "grad_norm": 1.2534896758069607, + "learning_rate": 3.056478569758225e-06, + "loss": 0.7381, + "step": 1347 + }, + { + "epoch": 1.27742241175077, + "grad_norm": 1.0662944647834347, + "learning_rate": 3.0494071934016737e-06, + "loss": 0.7478, + "step": 1348 + }, + { + "epoch": 1.2783700544894574, + "grad_norm": 1.1088922876494405, + "learning_rate": 3.0423404159562776e-06, + "loss": 0.7582, + "step": 1349 + }, + { + "epoch": 1.279317697228145, + "grad_norm": 1.176182333691132, + "learning_rate": 3.03527825408337e-06, + "loss": 0.7656, + "step": 1350 + }, + { + "epoch": 1.2802653399668324, + "grad_norm": 0.9658766613029017, + "learning_rate": 3.0282207244334084e-06, + "loss": 0.724, + "step": 1351 + }, + { + "epoch": 1.28121298270552, + "grad_norm": 1.0827099691926507, + "learning_rate": 3.0211678436459214e-06, + "loss": 0.6916, + "step": 1352 + }, + { + "epoch": 1.2821606254442075, + "grad_norm": 1.0338922828145432, + "learning_rate": 3.014119628349482e-06, + "loss": 0.6895, + "step": 1353 + }, + { + "epoch": 1.283108268182895, + "grad_norm": 0.9767332618971463, + "learning_rate": 3.007076095161662e-06, + "loss": 0.6949, + "step": 1354 + }, + { + "epoch": 1.2840559109215826, + "grad_norm": 1.0930600930773744, + "learning_rate": 3.0000372606889937e-06, + "loss": 0.7021, + "step": 1355 + }, + { + "epoch": 1.2850035536602702, + "grad_norm": 1.1419255045928394, + "learning_rate": 2.9930031415269327e-06, + "loss": 0.6816, + "step": 1356 + }, + { + "epoch": 1.2859511963989576, + "grad_norm": 1.0003644858641374, + "learning_rate": 2.9859737542598157e-06, + "loss": 0.7194, + "step": 1357 + }, + { + "epoch": 1.286898839137645, + "grad_norm": 0.9900004048868847, + "learning_rate": 2.978949115460824e-06, + "loss": 0.6978, + "step": 1358 + }, + { + "epoch": 1.2878464818763327, + "grad_norm": 1.0245392546866654, + "learning_rate": 2.971929241691942e-06, + "loss": 0.7067, + "step": 1359 + }, + { + "epoch": 1.28879412461502, + "grad_norm": 1.1770400753015886, + "learning_rate": 2.9649141495039225e-06, + "loss": 0.6811, + "step": 1360 + }, + { + "epoch": 1.2897417673537077, + "grad_norm": 0.9716668472533, + "learning_rate": 2.9579038554362412e-06, + "loss": 0.6944, + "step": 1361 + }, + { + "epoch": 1.2906894100923951, + "grad_norm": 0.9803805263564218, + "learning_rate": 2.950898376017064e-06, + "loss": 0.6599, + "step": 1362 + }, + { + "epoch": 1.2916370528310828, + "grad_norm": 1.1276831233974194, + "learning_rate": 2.943897727763202e-06, + "loss": 0.7439, + "step": 1363 + }, + { + "epoch": 1.2925846955697702, + "grad_norm": 0.9936565556974072, + "learning_rate": 2.9369019271800827e-06, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 1.2925846955697702, + "eval_loss": 0.9199525117874146, + "eval_runtime": 65.9605, + "eval_samples_per_second": 41.358, + "eval_steps_per_second": 0.652, + "step": 1364 + }, + { + "epoch": 1.2935323383084576, + "grad_norm": 1.0225633529982556, + "learning_rate": 2.9299109907616956e-06, + "loss": 0.7169, + "step": 1365 + }, + { + "epoch": 1.2944799810471452, + "grad_norm": 1.1496102662994758, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.7257, + "step": 1366 + }, + { + "epoch": 1.2954276237858329, + "grad_norm": 1.0411288552368323, + "learning_rate": 2.9159437763377187e-06, + "loss": 0.7057, + "step": 1367 + }, + { + "epoch": 1.2963752665245203, + "grad_norm": 1.1200448529620028, + "learning_rate": 2.908967531262618e-06, + "loss": 0.7623, + "step": 1368 + }, + { + "epoch": 1.2973229092632077, + "grad_norm": 1.0477643207405842, + "learning_rate": 2.9019962162131564e-06, + "loss": 0.6169, + "step": 1369 + }, + { + "epoch": 1.2982705520018953, + "grad_norm": 1.008136764087855, + "learning_rate": 2.895029847625595e-06, + "loss": 0.6862, + "step": 1370 + }, + { + "epoch": 1.2992181947405828, + "grad_norm": 1.0600688588701463, + "learning_rate": 2.8880684419245387e-06, + "loss": 0.7149, + "step": 1371 + }, + { + "epoch": 1.3001658374792704, + "grad_norm": 1.0027128694027476, + "learning_rate": 2.8811120155228843e-06, + "loss": 0.7366, + "step": 1372 + }, + { + "epoch": 1.3011134802179578, + "grad_norm": 1.4069108299597186, + "learning_rate": 2.874160584821798e-06, + "loss": 0.7393, + "step": 1373 + }, + { + "epoch": 1.3020611229566454, + "grad_norm": 1.0338612942301875, + "learning_rate": 2.8672141662106577e-06, + "loss": 0.7036, + "step": 1374 + }, + { + "epoch": 1.3030087656953329, + "grad_norm": 1.0688910868363117, + "learning_rate": 2.8602727760670336e-06, + "loss": 0.7306, + "step": 1375 + }, + { + "epoch": 1.3039564084340203, + "grad_norm": 1.242518786141904, + "learning_rate": 2.8533364307566313e-06, + "loss": 0.6862, + "step": 1376 + }, + { + "epoch": 1.304904051172708, + "grad_norm": 1.1911216173581962, + "learning_rate": 2.846405146633269e-06, + "loss": 0.7568, + "step": 1377 + }, + { + "epoch": 1.3058516939113953, + "grad_norm": 1.0039693235836689, + "learning_rate": 2.839478940038833e-06, + "loss": 0.6523, + "step": 1378 + }, + { + "epoch": 1.306799336650083, + "grad_norm": 1.072057083327897, + "learning_rate": 2.8325578273032295e-06, + "loss": 0.7036, + "step": 1379 + }, + { + "epoch": 1.3077469793887704, + "grad_norm": 0.9993390017662981, + "learning_rate": 2.8256418247443664e-06, + "loss": 0.6887, + "step": 1380 + }, + { + "epoch": 1.308694622127458, + "grad_norm": 0.9454808776048585, + "learning_rate": 2.8187309486680924e-06, + "loss": 0.7237, + "step": 1381 + }, + { + "epoch": 1.3096422648661454, + "grad_norm": 0.9371803052679708, + "learning_rate": 2.811825215368179e-06, + "loss": 0.7279, + "step": 1382 + }, + { + "epoch": 1.3105899076048328, + "grad_norm": 1.2827676657928362, + "learning_rate": 2.804924641126264e-06, + "loss": 0.6878, + "step": 1383 + }, + { + "epoch": 1.3115375503435205, + "grad_norm": 1.1459955089807479, + "learning_rate": 2.7980292422118282e-06, + "loss": 0.7615, + "step": 1384 + }, + { + "epoch": 1.3124851930822081, + "grad_norm": 1.0020748955412062, + "learning_rate": 2.791139034882151e-06, + "loss": 0.7376, + "step": 1385 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 1.277527875810019, + "learning_rate": 2.7842540353822634e-06, + "loss": 0.7209, + "step": 1386 + }, + { + "epoch": 1.3134328358208955, + "eval_loss": 0.91898512840271, + "eval_runtime": 64.7739, + "eval_samples_per_second": 42.116, + "eval_steps_per_second": 0.664, + "step": 1386 + }, + { + "epoch": 1.314380478559583, + "grad_norm": 1.1181626221192054, + "learning_rate": 2.777374259944929e-06, + "loss": 0.7057, + "step": 1387 + }, + { + "epoch": 1.3153281212982706, + "grad_norm": 0.9400452950748897, + "learning_rate": 2.770499724790584e-06, + "loss": 0.721, + "step": 1388 + }, + { + "epoch": 1.316275764036958, + "grad_norm": 1.0147521664502355, + "learning_rate": 2.763630446127319e-06, + "loss": 0.7199, + "step": 1389 + }, + { + "epoch": 1.3172234067756456, + "grad_norm": 0.9881704878801858, + "learning_rate": 2.7567664401508225e-06, + "loss": 0.7116, + "step": 1390 + }, + { + "epoch": 1.318171049514333, + "grad_norm": 0.9795454208939292, + "learning_rate": 2.7499077230443607e-06, + "loss": 0.6953, + "step": 1391 + }, + { + "epoch": 1.3191186922530207, + "grad_norm": 1.0069354879696941, + "learning_rate": 2.743054310978722e-06, + "loss": 0.7098, + "step": 1392 + }, + { + "epoch": 1.3200663349917081, + "grad_norm": 0.9429203635412998, + "learning_rate": 2.736206220112192e-06, + "loss": 0.7004, + "step": 1393 + }, + { + "epoch": 1.3210139777303955, + "grad_norm": 1.0682307544212157, + "learning_rate": 2.729363466590511e-06, + "loss": 0.6745, + "step": 1394 + }, + { + "epoch": 1.3219616204690832, + "grad_norm": 1.0079367947967734, + "learning_rate": 2.72252606654683e-06, + "loss": 0.6362, + "step": 1395 + }, + { + "epoch": 1.3229092632077708, + "grad_norm": 0.936909554532062, + "learning_rate": 2.7156940361016864e-06, + "loss": 0.7282, + "step": 1396 + }, + { + "epoch": 1.3238569059464582, + "grad_norm": 1.1619360098958085, + "learning_rate": 2.708867391362948e-06, + "loss": 0.759, + "step": 1397 + }, + { + "epoch": 1.3248045486851456, + "grad_norm": 1.0332898627783107, + "learning_rate": 2.7020461484257952e-06, + "loss": 0.7224, + "step": 1398 + }, + { + "epoch": 1.3257521914238333, + "grad_norm": 1.0120053743179587, + "learning_rate": 2.6952303233726628e-06, + "loss": 0.7007, + "step": 1399 + }, + { + "epoch": 1.3266998341625207, + "grad_norm": 1.0182348658415017, + "learning_rate": 2.6884199322732192e-06, + "loss": 0.7364, + "step": 1400 + }, + { + "epoch": 1.3276474769012083, + "grad_norm": 1.0133603532948297, + "learning_rate": 2.681614991184315e-06, + "loss": 0.743, + "step": 1401 + }, + { + "epoch": 1.3285951196398957, + "grad_norm": 0.9924480633218891, + "learning_rate": 2.6748155161499568e-06, + "loss": 0.6545, + "step": 1402 + }, + { + "epoch": 1.3295427623785834, + "grad_norm": 1.127155396416284, + "learning_rate": 2.668021523201263e-06, + "loss": 0.7471, + "step": 1403 + }, + { + "epoch": 1.3304904051172708, + "grad_norm": 1.0775922171839278, + "learning_rate": 2.6612330283564226e-06, + "loss": 0.6713, + "step": 1404 + }, + { + "epoch": 1.3314380478559582, + "grad_norm": 0.9947061326854107, + "learning_rate": 2.6544500476206675e-06, + "loss": 0.6725, + "step": 1405 + }, + { + "epoch": 1.3323856905946458, + "grad_norm": 1.0208055783943726, + "learning_rate": 2.6476725969862227e-06, + "loss": 0.7577, + "step": 1406 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.9200584970050147, + "learning_rate": 2.6409006924322824e-06, + "loss": 0.7277, + "step": 1407 + }, + { + "epoch": 1.334280976072021, + "grad_norm": 1.1839327711762426, + "learning_rate": 2.634134349924956e-06, + "loss": 0.7352, + "step": 1408 + }, + { + "epoch": 1.334280976072021, + "eval_loss": 0.9188514351844788, + "eval_runtime": 65.4529, + "eval_samples_per_second": 41.679, + "eval_steps_per_second": 0.657, + "step": 1408 + }, + { + "epoch": 1.3352286188107083, + "grad_norm": 1.0311881529332272, + "learning_rate": 2.6273735854172487e-06, + "loss": 0.6348, + "step": 1409 + }, + { + "epoch": 1.336176261549396, + "grad_norm": 0.869437188326139, + "learning_rate": 2.6206184148490066e-06, + "loss": 0.6783, + "step": 1410 + }, + { + "epoch": 1.3371239042880834, + "grad_norm": 0.9896388796869628, + "learning_rate": 2.6138688541468903e-06, + "loss": 0.6565, + "step": 1411 + }, + { + "epoch": 1.3380715470267708, + "grad_norm": 0.9975523350097485, + "learning_rate": 2.6071249192243365e-06, + "loss": 0.7388, + "step": 1412 + }, + { + "epoch": 1.3390191897654584, + "grad_norm": 3.814992564369548, + "learning_rate": 2.6003866259815123e-06, + "loss": 0.7403, + "step": 1413 + }, + { + "epoch": 1.339966832504146, + "grad_norm": 1.1226980770763029, + "learning_rate": 2.5936539903052893e-06, + "loss": 0.7311, + "step": 1414 + }, + { + "epoch": 1.3409144752428335, + "grad_norm": 1.0149265229287798, + "learning_rate": 2.5869270280691945e-06, + "loss": 0.6922, + "step": 1415 + }, + { + "epoch": 1.3418621179815209, + "grad_norm": 1.1097881040738804, + "learning_rate": 2.580205755133384e-06, + "loss": 0.6867, + "step": 1416 + }, + { + "epoch": 1.3428097607202085, + "grad_norm": 1.0437635962821175, + "learning_rate": 2.573490187344596e-06, + "loss": 0.6817, + "step": 1417 + }, + { + "epoch": 1.343757403458896, + "grad_norm": 1.0360266239174718, + "learning_rate": 2.5667803405361214e-06, + "loss": 0.7413, + "step": 1418 + }, + { + "epoch": 1.3447050461975836, + "grad_norm": 0.9107405093250232, + "learning_rate": 2.560076230527758e-06, + "loss": 0.6722, + "step": 1419 + }, + { + "epoch": 1.345652688936271, + "grad_norm": 0.9408273813849782, + "learning_rate": 2.5533778731257824e-06, + "loss": 0.7198, + "step": 1420 + }, + { + "epoch": 1.3466003316749586, + "grad_norm": 1.0137206933457714, + "learning_rate": 2.546685284122909e-06, + "loss": 0.6862, + "step": 1421 + }, + { + "epoch": 1.347547974413646, + "grad_norm": 1.207183923292191, + "learning_rate": 2.5399984792982457e-06, + "loss": 0.7163, + "step": 1422 + }, + { + "epoch": 1.3484956171523335, + "grad_norm": 0.9580259639674358, + "learning_rate": 2.5333174744172705e-06, + "loss": 0.7006, + "step": 1423 + }, + { + "epoch": 1.349443259891021, + "grad_norm": 1.0387469510787142, + "learning_rate": 2.5266422852317796e-06, + "loss": 0.66, + "step": 1424 + }, + { + "epoch": 1.3503909026297087, + "grad_norm": 1.0656383066236028, + "learning_rate": 2.5199729274798664e-06, + "loss": 0.7036, + "step": 1425 + }, + { + "epoch": 1.3513385453683961, + "grad_norm": 1.0545231585290438, + "learning_rate": 2.513309416885865e-06, + "loss": 0.643, + "step": 1426 + }, + { + "epoch": 1.3522861881070836, + "grad_norm": 1.1886784682255809, + "learning_rate": 2.5066517691603327e-06, + "loss": 0.6968, + "step": 1427 + }, + { + "epoch": 1.3532338308457712, + "grad_norm": 1.0686385881546185, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.6948, + "step": 1428 + }, + { + "epoch": 1.3541814735844586, + "grad_norm": 1.022360298648431, + "learning_rate": 2.493354125087738e-06, + "loss": 0.6812, + "step": 1429 + }, + { + "epoch": 1.3551291163231463, + "grad_norm": 1.057656689299949, + "learning_rate": 2.4867141600925214e-06, + "loss": 0.7209, + "step": 1430 + }, + { + "epoch": 1.3551291163231463, + "eval_loss": 0.9181008338928223, + "eval_runtime": 61.8166, + "eval_samples_per_second": 44.131, + "eval_steps_per_second": 0.696, + "step": 1430 + }, + { + "epoch": 1.3560767590618337, + "grad_norm": 1.0044797087222561, + "learning_rate": 2.4800801206693873e-06, + "loss": 0.6994, + "step": 1431 + }, + { + "epoch": 1.3570244018005213, + "grad_norm": 1.1363501611998355, + "learning_rate": 2.4734520224594094e-06, + "loss": 0.6967, + "step": 1432 + }, + { + "epoch": 1.3579720445392087, + "grad_norm": 1.0735619665932887, + "learning_rate": 2.4668298810896463e-06, + "loss": 0.6615, + "step": 1433 + }, + { + "epoch": 1.3589196872778961, + "grad_norm": 1.2512092821838807, + "learning_rate": 2.4602137121731195e-06, + "loss": 0.7226, + "step": 1434 + }, + { + "epoch": 1.3598673300165838, + "grad_norm": 1.0104639217781053, + "learning_rate": 2.4536035313087603e-06, + "loss": 0.7748, + "step": 1435 + }, + { + "epoch": 1.3608149727552712, + "grad_norm": 0.945616005794055, + "learning_rate": 2.44699935408139e-06, + "loss": 0.7169, + "step": 1436 + }, + { + "epoch": 1.3617626154939588, + "grad_norm": 1.1241609779341695, + "learning_rate": 2.4404011960616747e-06, + "loss": 0.6734, + "step": 1437 + }, + { + "epoch": 1.3627102582326462, + "grad_norm": 1.321111844351736, + "learning_rate": 2.4338090728060808e-06, + "loss": 0.7567, + "step": 1438 + }, + { + "epoch": 1.3636579009713339, + "grad_norm": 0.9913817986983521, + "learning_rate": 2.4272229998568576e-06, + "loss": 0.6312, + "step": 1439 + }, + { + "epoch": 1.3646055437100213, + "grad_norm": 1.0071709428393925, + "learning_rate": 2.4206429927419795e-06, + "loss": 0.6763, + "step": 1440 + }, + { + "epoch": 1.3655531864487087, + "grad_norm": 1.0356649949239918, + "learning_rate": 2.414069066975128e-06, + "loss": 0.6461, + "step": 1441 + }, + { + "epoch": 1.3665008291873963, + "grad_norm": 0.9723682213152954, + "learning_rate": 2.40750123805564e-06, + "loss": 0.6884, + "step": 1442 + }, + { + "epoch": 1.367448471926084, + "grad_norm": 0.9948531066652241, + "learning_rate": 2.400939521468484e-06, + "loss": 0.7155, + "step": 1443 + }, + { + "epoch": 1.3683961146647714, + "grad_norm": 0.9940870834143998, + "learning_rate": 2.3943839326842096e-06, + "loss": 0.6657, + "step": 1444 + }, + { + "epoch": 1.3693437574034588, + "grad_norm": 1.1771867828127296, + "learning_rate": 2.387834487158926e-06, + "loss": 0.7088, + "step": 1445 + }, + { + "epoch": 1.3702914001421465, + "grad_norm": 1.0503694972372863, + "learning_rate": 2.381291200334257e-06, + "loss": 0.7379, + "step": 1446 + }, + { + "epoch": 1.3712390428808339, + "grad_norm": 1.0357654532991112, + "learning_rate": 2.3747540876373026e-06, + "loss": 0.6843, + "step": 1447 + }, + { + "epoch": 1.3721866856195215, + "grad_norm": 0.9841256078266499, + "learning_rate": 2.368223164480611e-06, + "loss": 0.7251, + "step": 1448 + }, + { + "epoch": 1.373134328358209, + "grad_norm": 0.943494428045495, + "learning_rate": 2.3616984462621307e-06, + "loss": 0.756, + "step": 1449 + }, + { + "epoch": 1.3740819710968966, + "grad_norm": 0.9561466461280033, + "learning_rate": 2.3551799483651894e-06, + "loss": 0.6926, + "step": 1450 + }, + { + "epoch": 1.375029613835584, + "grad_norm": 1.2692497517950982, + "learning_rate": 2.348667686158441e-06, + "loss": 0.6878, + "step": 1451 + }, + { + "epoch": 1.3759772565742714, + "grad_norm": 1.2205664546373765, + "learning_rate": 2.342161674995843e-06, + "loss": 0.7187, + "step": 1452 + }, + { + "epoch": 1.3759772565742714, + "eval_loss": 0.9179805517196655, + "eval_runtime": 66.4952, + "eval_samples_per_second": 41.026, + "eval_steps_per_second": 0.647, + "step": 1452 + }, + { + "epoch": 1.376924899312959, + "grad_norm": 0.9875355272859734, + "learning_rate": 2.335661930216611e-06, + "loss": 0.6266, + "step": 1453 + }, + { + "epoch": 1.3778725420516467, + "grad_norm": 1.0518188044284036, + "learning_rate": 2.3291684671451905e-06, + "loss": 0.6734, + "step": 1454 + }, + { + "epoch": 1.378820184790334, + "grad_norm": 1.0155553726238449, + "learning_rate": 2.322681301091214e-06, + "loss": 0.6737, + "step": 1455 + }, + { + "epoch": 1.3797678275290215, + "grad_norm": 1.127782826546028, + "learning_rate": 2.316200447349466e-06, + "loss": 0.7146, + "step": 1456 + }, + { + "epoch": 1.3807154702677091, + "grad_norm": 1.0784960552341187, + "learning_rate": 2.3097259211998536e-06, + "loss": 0.7501, + "step": 1457 + }, + { + "epoch": 1.3816631130063965, + "grad_norm": 0.9949725127797444, + "learning_rate": 2.3032577379073577e-06, + "loss": 0.7015, + "step": 1458 + }, + { + "epoch": 1.3826107557450842, + "grad_norm": 0.9785164181620558, + "learning_rate": 2.296795912722014e-06, + "loss": 0.7015, + "step": 1459 + }, + { + "epoch": 1.3835583984837716, + "grad_norm": 1.126981467512306, + "learning_rate": 2.2903404608788582e-06, + "loss": 0.6766, + "step": 1460 + }, + { + "epoch": 1.3845060412224592, + "grad_norm": 1.0927804032807178, + "learning_rate": 2.283891397597908e-06, + "loss": 0.6693, + "step": 1461 + }, + { + "epoch": 1.3854536839611467, + "grad_norm": 1.0258483620038565, + "learning_rate": 2.2774487380841116e-06, + "loss": 0.6607, + "step": 1462 + }, + { + "epoch": 1.386401326699834, + "grad_norm": 1.0139207350796542, + "learning_rate": 2.2710124975273236e-06, + "loss": 0.7301, + "step": 1463 + }, + { + "epoch": 1.3873489694385217, + "grad_norm": 1.2623014945186244, + "learning_rate": 2.2645826911022656e-06, + "loss": 0.6878, + "step": 1464 + }, + { + "epoch": 1.3882966121772091, + "grad_norm": 1.0697558784167414, + "learning_rate": 2.258159333968484e-06, + "loss": 0.7058, + "step": 1465 + }, + { + "epoch": 1.3892442549158968, + "grad_norm": 1.1679882747027204, + "learning_rate": 2.2517424412703256e-06, + "loss": 0.7337, + "step": 1466 + }, + { + "epoch": 1.3901918976545842, + "grad_norm": 1.0802296481928269, + "learning_rate": 2.2453320281368903e-06, + "loss": 0.686, + "step": 1467 + }, + { + "epoch": 1.3911395403932718, + "grad_norm": 0.9794786330880751, + "learning_rate": 2.2389281096820077e-06, + "loss": 0.7638, + "step": 1468 + }, + { + "epoch": 1.3920871831319592, + "grad_norm": 0.9835019275382246, + "learning_rate": 2.2325307010041874e-06, + "loss": 0.7598, + "step": 1469 + }, + { + "epoch": 1.3930348258706466, + "grad_norm": 1.239471343857703, + "learning_rate": 2.2261398171865976e-06, + "loss": 0.6944, + "step": 1470 + }, + { + "epoch": 1.3939824686093343, + "grad_norm": 1.0053386117085916, + "learning_rate": 2.21975547329702e-06, + "loss": 0.6756, + "step": 1471 + }, + { + "epoch": 1.394930111348022, + "grad_norm": 1.0689464802675128, + "learning_rate": 2.2133776843878185e-06, + "loss": 0.7674, + "step": 1472 + }, + { + "epoch": 1.3958777540867093, + "grad_norm": 1.345586768860224, + "learning_rate": 2.207006465495898e-06, + "loss": 0.6936, + "step": 1473 + }, + { + "epoch": 1.3968253968253967, + "grad_norm": 1.155638754133062, + "learning_rate": 2.2006418316426773e-06, + "loss": 0.6912, + "step": 1474 + }, + { + "epoch": 1.3968253968253967, + "eval_loss": 0.9174560308456421, + "eval_runtime": 65.4877, + "eval_samples_per_second": 41.657, + "eval_steps_per_second": 0.657, + "step": 1474 + }, + { + "epoch": 1.3977730395640844, + "grad_norm": 1.7458254846447263, + "learning_rate": 2.1942837978340516e-06, + "loss": 0.7289, + "step": 1475 + }, + { + "epoch": 1.3987206823027718, + "grad_norm": 1.104499286780302, + "learning_rate": 2.187932379060348e-06, + "loss": 0.6773, + "step": 1476 + }, + { + "epoch": 1.3996683250414594, + "grad_norm": 1.0183691386946678, + "learning_rate": 2.1815875902963058e-06, + "loss": 0.7138, + "step": 1477 + }, + { + "epoch": 1.4006159677801469, + "grad_norm": 0.9953027863427754, + "learning_rate": 2.175249446501024e-06, + "loss": 0.6644, + "step": 1478 + }, + { + "epoch": 1.4015636105188345, + "grad_norm": 1.119903318819432, + "learning_rate": 2.1689179626179442e-06, + "loss": 0.673, + "step": 1479 + }, + { + "epoch": 1.402511253257522, + "grad_norm": 1.01161813324734, + "learning_rate": 2.1625931535747964e-06, + "loss": 0.7104, + "step": 1480 + }, + { + "epoch": 1.4034588959962093, + "grad_norm": 0.953534813718956, + "learning_rate": 2.1562750342835827e-06, + "loss": 0.7277, + "step": 1481 + }, + { + "epoch": 1.404406538734897, + "grad_norm": 1.1396651805754336, + "learning_rate": 2.1499636196405225e-06, + "loss": 0.7227, + "step": 1482 + }, + { + "epoch": 1.4053541814735846, + "grad_norm": 1.5386081307384067, + "learning_rate": 2.1436589245260375e-06, + "loss": 0.668, + "step": 1483 + }, + { + "epoch": 1.406301824212272, + "grad_norm": 0.9995908645331962, + "learning_rate": 2.1373609638047033e-06, + "loss": 0.7043, + "step": 1484 + }, + { + "epoch": 1.4072494669509594, + "grad_norm": 1.10383243717245, + "learning_rate": 2.1310697523252126e-06, + "loss": 0.7026, + "step": 1485 + }, + { + "epoch": 1.408197109689647, + "grad_norm": 1.0267277972723963, + "learning_rate": 2.1247853049203543e-06, + "loss": 0.7082, + "step": 1486 + }, + { + "epoch": 1.4091447524283345, + "grad_norm": 1.0496913097987108, + "learning_rate": 2.118507636406962e-06, + "loss": 0.6481, + "step": 1487 + }, + { + "epoch": 1.410092395167022, + "grad_norm": 1.1897337956700562, + "learning_rate": 2.112236761585892e-06, + "loss": 0.7089, + "step": 1488 + }, + { + "epoch": 1.4110400379057095, + "grad_norm": 0.9785065766450801, + "learning_rate": 2.1059726952419782e-06, + "loss": 0.7485, + "step": 1489 + }, + { + "epoch": 1.4119876806443972, + "grad_norm": 1.0121891810944206, + "learning_rate": 2.09971545214401e-06, + "loss": 0.7133, + "step": 1490 + }, + { + "epoch": 1.4129353233830846, + "grad_norm": 1.292470404393716, + "learning_rate": 2.0934650470446788e-06, + "loss": 0.6978, + "step": 1491 + }, + { + "epoch": 1.413882966121772, + "grad_norm": 0.9544540655235114, + "learning_rate": 2.087221494680563e-06, + "loss": 0.7313, + "step": 1492 + }, + { + "epoch": 1.4148306088604596, + "grad_norm": 1.1659180349275824, + "learning_rate": 2.0809848097720823e-06, + "loss": 0.6451, + "step": 1493 + }, + { + "epoch": 1.415778251599147, + "grad_norm": 1.037380174383666, + "learning_rate": 2.074755007023461e-06, + "loss": 0.7405, + "step": 1494 + }, + { + "epoch": 1.4167258943378347, + "grad_norm": 1.0242249417898568, + "learning_rate": 2.068532101122704e-06, + "loss": 0.6708, + "step": 1495 + }, + { + "epoch": 1.417673537076522, + "grad_norm": 0.9620726481692868, + "learning_rate": 2.0623161067415463e-06, + "loss": 0.6707, + "step": 1496 + }, + { + "epoch": 1.417673537076522, + "eval_loss": 0.9181029200553894, + "eval_runtime": 64.509, + "eval_samples_per_second": 42.289, + "eval_steps_per_second": 0.667, + "step": 1496 + }, + { + "epoch": 1.4186211798152097, + "grad_norm": 1.005417588718906, + "learning_rate": 2.0561070385354388e-06, + "loss": 0.6731, + "step": 1497 + }, + { + "epoch": 1.4195688225538972, + "grad_norm": 1.1896671861120682, + "learning_rate": 2.0499049111434922e-06, + "loss": 0.7227, + "step": 1498 + }, + { + "epoch": 1.4205164652925846, + "grad_norm": 2.0023777404039786, + "learning_rate": 2.0437097391884613e-06, + "loss": 0.6868, + "step": 1499 + }, + { + "epoch": 1.4214641080312722, + "grad_norm": 1.0629425622129343, + "learning_rate": 2.0375215372766944e-06, + "loss": 0.6846, + "step": 1500 + }, + { + "epoch": 1.4224117507699598, + "grad_norm": 0.9445974760544792, + "learning_rate": 2.0313403199981125e-06, + "loss": 0.7394, + "step": 1501 + }, + { + "epoch": 1.4233593935086473, + "grad_norm": 0.9843138879504003, + "learning_rate": 2.025166101926168e-06, + "loss": 0.7182, + "step": 1502 + }, + { + "epoch": 1.4243070362473347, + "grad_norm": 0.9243254028077557, + "learning_rate": 2.018998897617808e-06, + "loss": 0.6837, + "step": 1503 + }, + { + "epoch": 1.4252546789860223, + "grad_norm": 1.0253863539461858, + "learning_rate": 2.012838721613447e-06, + "loss": 0.667, + "step": 1504 + }, + { + "epoch": 1.4262023217247097, + "grad_norm": 1.327749397929995, + "learning_rate": 2.0066855884369246e-06, + "loss": 0.7177, + "step": 1505 + }, + { + "epoch": 1.4271499644633974, + "grad_norm": 1.1076042384170708, + "learning_rate": 2.0005395125954814e-06, + "loss": 0.7841, + "step": 1506 + }, + { + "epoch": 1.4280976072020848, + "grad_norm": 1.037107571912964, + "learning_rate": 1.9944005085797124e-06, + "loss": 0.6346, + "step": 1507 + }, + { + "epoch": 1.4290452499407724, + "grad_norm": 1.0616935978252389, + "learning_rate": 1.988268590863546e-06, + "loss": 0.7287, + "step": 1508 + }, + { + "epoch": 1.4299928926794598, + "grad_norm": 1.1394211171878255, + "learning_rate": 1.982143773904197e-06, + "loss": 0.7026, + "step": 1509 + }, + { + "epoch": 1.4309405354181473, + "grad_norm": 1.0108800469903032, + "learning_rate": 1.9760260721421426e-06, + "loss": 0.722, + "step": 1510 + }, + { + "epoch": 1.431888178156835, + "grad_norm": 1.0306788000447762, + "learning_rate": 1.9699155000010853e-06, + "loss": 0.6762, + "step": 1511 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 0.963103281240134, + "learning_rate": 1.9638120718879133e-06, + "loss": 0.7084, + "step": 1512 + }, + { + "epoch": 1.43378346363421, + "grad_norm": 0.9474369197260795, + "learning_rate": 1.9577158021926774e-06, + "loss": 0.6745, + "step": 1513 + }, + { + "epoch": 1.4347311063728974, + "grad_norm": 1.9939781034695376, + "learning_rate": 1.951626705288544e-06, + "loss": 0.7011, + "step": 1514 + }, + { + "epoch": 1.435678749111585, + "grad_norm": 0.9725792085154051, + "learning_rate": 1.945544795531777e-06, + "loss": 0.7155, + "step": 1515 + }, + { + "epoch": 1.4366263918502724, + "grad_norm": 1.1197678082060893, + "learning_rate": 1.9394700872616856e-06, + "loss": 0.6595, + "step": 1516 + }, + { + "epoch": 1.4375740345889598, + "grad_norm": 1.0301457084495482, + "learning_rate": 1.9334025948006074e-06, + "loss": 0.6955, + "step": 1517 + }, + { + "epoch": 1.4385216773276475, + "grad_norm": 0.9348949706000595, + "learning_rate": 1.927342332453866e-06, + "loss": 0.6047, + "step": 1518 + }, + { + "epoch": 1.4385216773276475, + "eval_loss": 0.9176700115203857, + "eval_runtime": 67.6785, + "eval_samples_per_second": 40.308, + "eval_steps_per_second": 0.635, + "step": 1518 + }, + { + "epoch": 1.439469320066335, + "grad_norm": 1.0644335037153672, + "learning_rate": 1.921289314509734e-06, + "loss": 0.7127, + "step": 1519 + }, + { + "epoch": 1.4404169628050225, + "grad_norm": 1.0834219671738452, + "learning_rate": 1.9152435552394105e-06, + "loss": 0.7215, + "step": 1520 + }, + { + "epoch": 1.44136460554371, + "grad_norm": 1.0745604461477756, + "learning_rate": 1.9092050688969736e-06, + "loss": 0.678, + "step": 1521 + }, + { + "epoch": 1.4423122482823976, + "grad_norm": 1.0371641556496103, + "learning_rate": 1.9031738697193618e-06, + "loss": 0.633, + "step": 1522 + }, + { + "epoch": 1.443259891021085, + "grad_norm": 1.2536801282599777, + "learning_rate": 1.8971499719263253e-06, + "loss": 0.6985, + "step": 1523 + }, + { + "epoch": 1.4442075337597726, + "grad_norm": 0.9951235851029153, + "learning_rate": 1.8911333897204071e-06, + "loss": 0.7719, + "step": 1524 + }, + { + "epoch": 1.44515517649846, + "grad_norm": 1.0116038011102524, + "learning_rate": 1.8851241372868938e-06, + "loss": 0.6848, + "step": 1525 + }, + { + "epoch": 1.4461028192371477, + "grad_norm": 1.4604330607664564, + "learning_rate": 1.8791222287937983e-06, + "loss": 0.7657, + "step": 1526 + }, + { + "epoch": 1.447050461975835, + "grad_norm": 1.0830259379657234, + "learning_rate": 1.8731276783918162e-06, + "loss": 0.6805, + "step": 1527 + }, + { + "epoch": 1.4479981047145225, + "grad_norm": 1.0267731594205933, + "learning_rate": 1.8671405002142918e-06, + "loss": 0.6707, + "step": 1528 + }, + { + "epoch": 1.4489457474532101, + "grad_norm": 1.0461880905401142, + "learning_rate": 1.8611607083771931e-06, + "loss": 0.7222, + "step": 1529 + }, + { + "epoch": 1.4498933901918978, + "grad_norm": 1.5078994577302969, + "learning_rate": 1.855188316979068e-06, + "loss": 0.7749, + "step": 1530 + }, + { + "epoch": 1.4508410329305852, + "grad_norm": 0.9580998087747682, + "learning_rate": 1.8492233401010218e-06, + "loss": 0.6656, + "step": 1531 + }, + { + "epoch": 1.4517886756692726, + "grad_norm": 1.030790506291452, + "learning_rate": 1.8432657918066732e-06, + "loss": 0.6938, + "step": 1532 + }, + { + "epoch": 1.4527363184079602, + "grad_norm": 1.0172225305837883, + "learning_rate": 1.8373156861421327e-06, + "loss": 0.6944, + "step": 1533 + }, + { + "epoch": 1.4536839611466477, + "grad_norm": 1.0683718679060756, + "learning_rate": 1.831373037135955e-06, + "loss": 0.6548, + "step": 1534 + }, + { + "epoch": 1.4546316038853353, + "grad_norm": 1.0948584996468758, + "learning_rate": 1.8254378587991229e-06, + "loss": 0.7163, + "step": 1535 + }, + { + "epoch": 1.4555792466240227, + "grad_norm": 0.9144325805834296, + "learning_rate": 1.819510165125002e-06, + "loss": 0.6897, + "step": 1536 + }, + { + "epoch": 1.4565268893627104, + "grad_norm": 1.0047087823053469, + "learning_rate": 1.813589970089308e-06, + "loss": 0.682, + "step": 1537 + }, + { + "epoch": 1.4574745321013978, + "grad_norm": 1.0404416866339528, + "learning_rate": 1.8076772876500831e-06, + "loss": 0.7615, + "step": 1538 + }, + { + "epoch": 1.4584221748400852, + "grad_norm": 0.9884869531835868, + "learning_rate": 1.8017721317476517e-06, + "loss": 0.7436, + "step": 1539 + }, + { + "epoch": 1.4593698175787728, + "grad_norm": 1.1394802881055484, + "learning_rate": 1.7958745163045987e-06, + "loss": 0.6969, + "step": 1540 + }, + { + "epoch": 1.4593698175787728, + "eval_loss": 0.9182996153831482, + "eval_runtime": 64.6749, + "eval_samples_per_second": 42.18, + "eval_steps_per_second": 0.665, + "step": 1540 + }, + { + "epoch": 1.4603174603174602, + "grad_norm": 0.9439166549447204, + "learning_rate": 1.7899844552257233e-06, + "loss": 0.6422, + "step": 1541 + }, + { + "epoch": 1.4612651030561479, + "grad_norm": 1.0656288471157296, + "learning_rate": 1.7841019623980215e-06, + "loss": 0.7706, + "step": 1542 + }, + { + "epoch": 1.4622127457948353, + "grad_norm": 1.1222434957696432, + "learning_rate": 1.778227051690639e-06, + "loss": 0.7507, + "step": 1543 + }, + { + "epoch": 1.463160388533523, + "grad_norm": 0.983832268950965, + "learning_rate": 1.77235973695485e-06, + "loss": 0.6955, + "step": 1544 + }, + { + "epoch": 1.4641080312722103, + "grad_norm": 1.2587712766095054, + "learning_rate": 1.76650003202402e-06, + "loss": 0.6477, + "step": 1545 + }, + { + "epoch": 1.4650556740108978, + "grad_norm": 1.0066028742449087, + "learning_rate": 1.760647950713566e-06, + "loss": 0.7544, + "step": 1546 + }, + { + "epoch": 1.4660033167495854, + "grad_norm": 1.7078692165257152, + "learning_rate": 1.7548035068209402e-06, + "loss": 0.6756, + "step": 1547 + }, + { + "epoch": 1.466950959488273, + "grad_norm": 1.166304208347539, + "learning_rate": 1.7489667141255801e-06, + "loss": 0.7093, + "step": 1548 + }, + { + "epoch": 1.4678986022269604, + "grad_norm": 1.1980159481547823, + "learning_rate": 1.74313758638889e-06, + "loss": 0.6749, + "step": 1549 + }, + { + "epoch": 1.4688462449656479, + "grad_norm": 1.1944578051622323, + "learning_rate": 1.7373161373541968e-06, + "loss": 0.7281, + "step": 1550 + }, + { + "epoch": 1.4697938877043355, + "grad_norm": 0.9485809631995211, + "learning_rate": 1.7315023807467297e-06, + "loss": 0.7248, + "step": 1551 + }, + { + "epoch": 1.470741530443023, + "grad_norm": 1.0931088608413588, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.7378, + "step": 1552 + }, + { + "epoch": 1.4716891731817106, + "grad_norm": 1.060786787419658, + "learning_rate": 1.7198979996236548e-06, + "loss": 0.7155, + "step": 1553 + }, + { + "epoch": 1.472636815920398, + "grad_norm": 1.0652060238495478, + "learning_rate": 1.7141074024676913e-06, + "loss": 0.7045, + "step": 1554 + }, + { + "epoch": 1.4735844586590856, + "grad_norm": 1.0281307395838142, + "learning_rate": 1.7083245524581666e-06, + "loss": 0.6337, + "step": 1555 + }, + { + "epoch": 1.474532101397773, + "grad_norm": 1.0438612628712518, + "learning_rate": 1.702549463229305e-06, + "loss": 0.71, + "step": 1556 + }, + { + "epoch": 1.4754797441364604, + "grad_norm": 1.1014417394767868, + "learning_rate": 1.6967821483970277e-06, + "loss": 0.7179, + "step": 1557 + }, + { + "epoch": 1.476427386875148, + "grad_norm": 1.111247079863271, + "learning_rate": 1.6910226215589303e-06, + "loss": 0.7377, + "step": 1558 + }, + { + "epoch": 1.4773750296138357, + "grad_norm": 0.9295211830798729, + "learning_rate": 1.6852708962942426e-06, + "loss": 0.6809, + "step": 1559 + }, + { + "epoch": 1.4783226723525231, + "grad_norm": 0.962644045007234, + "learning_rate": 1.6795269861638041e-06, + "loss": 0.6314, + "step": 1560 + }, + { + "epoch": 1.4792703150912105, + "grad_norm": 1.0292591372583686, + "learning_rate": 1.6737909047100292e-06, + "loss": 0.6838, + "step": 1561 + }, + { + "epoch": 1.4802179578298982, + "grad_norm": 0.933451941605439, + "learning_rate": 1.6680626654568688e-06, + "loss": 0.6608, + "step": 1562 + }, + { + "epoch": 1.4802179578298982, + "eval_loss": 0.9174679517745972, + "eval_runtime": 65.8561, + "eval_samples_per_second": 41.424, + "eval_steps_per_second": 0.653, + "step": 1562 + }, + { + "epoch": 1.4811656005685856, + "grad_norm": 1.2955810557077383, + "learning_rate": 1.6623422819097916e-06, + "loss": 0.6458, + "step": 1563 + }, + { + "epoch": 1.4821132433072732, + "grad_norm": 1.0310551387137157, + "learning_rate": 1.6566297675557392e-06, + "loss": 0.6919, + "step": 1564 + }, + { + "epoch": 1.4830608860459606, + "grad_norm": 1.0315297687360494, + "learning_rate": 1.650925135863104e-06, + "loss": 0.7086, + "step": 1565 + }, + { + "epoch": 1.4840085287846483, + "grad_norm": 1.0726991322217692, + "learning_rate": 1.6452284002816893e-06, + "loss": 0.7162, + "step": 1566 + }, + { + "epoch": 1.4849561715233357, + "grad_norm": 0.9497387623431127, + "learning_rate": 1.6395395742426873e-06, + "loss": 0.7216, + "step": 1567 + }, + { + "epoch": 1.4859038142620231, + "grad_norm": 1.2380479809887557, + "learning_rate": 1.6338586711586358e-06, + "loss": 0.7606, + "step": 1568 + }, + { + "epoch": 1.4868514570007108, + "grad_norm": 1.0796844065547042, + "learning_rate": 1.6281857044233968e-06, + "loss": 0.7319, + "step": 1569 + }, + { + "epoch": 1.4877990997393982, + "grad_norm": 1.1057313142497514, + "learning_rate": 1.6225206874121219e-06, + "loss": 0.6829, + "step": 1570 + }, + { + "epoch": 1.4887467424780858, + "grad_norm": 0.9734912790760845, + "learning_rate": 1.6168636334812126e-06, + "loss": 0.7407, + "step": 1571 + }, + { + "epoch": 1.4896943852167732, + "grad_norm": 1.0421614980005374, + "learning_rate": 1.6112145559683057e-06, + "loss": 0.7287, + "step": 1572 + }, + { + "epoch": 1.4906420279554609, + "grad_norm": 1.04980460946954, + "learning_rate": 1.6055734681922225e-06, + "loss": 0.7045, + "step": 1573 + }, + { + "epoch": 1.4915896706941483, + "grad_norm": 0.9906252177237804, + "learning_rate": 1.5999403834529549e-06, + "loss": 0.7192, + "step": 1574 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.8864390607015001, + "learning_rate": 1.5943153150316192e-06, + "loss": 0.6814, + "step": 1575 + }, + { + "epoch": 1.4934849561715233, + "grad_norm": 1.0203127741430889, + "learning_rate": 1.588698276190438e-06, + "loss": 0.6962, + "step": 1576 + }, + { + "epoch": 1.494432598910211, + "grad_norm": 1.1100506177133262, + "learning_rate": 1.583089280172696e-06, + "loss": 0.7852, + "step": 1577 + }, + { + "epoch": 1.4953802416488984, + "grad_norm": 1.0162831638512881, + "learning_rate": 1.5774883402027208e-06, + "loss": 0.7059, + "step": 1578 + }, + { + "epoch": 1.4963278843875858, + "grad_norm": 0.9936052407213679, + "learning_rate": 1.5718954694858457e-06, + "loss": 0.6858, + "step": 1579 + }, + { + "epoch": 1.4972755271262734, + "grad_norm": 0.9088159832146572, + "learning_rate": 1.5663106812083746e-06, + "loss": 0.75, + "step": 1580 + }, + { + "epoch": 1.4982231698649608, + "grad_norm": 1.078879512723169, + "learning_rate": 1.5607339885375616e-06, + "loss": 0.7139, + "step": 1581 + }, + { + "epoch": 1.4991708126036485, + "grad_norm": 1.096073398162632, + "learning_rate": 1.555165404621567e-06, + "loss": 0.7086, + "step": 1582 + }, + { + "epoch": 1.500118455342336, + "grad_norm": 0.9334643534638518, + "learning_rate": 1.549604942589441e-06, + "loss": 0.7479, + "step": 1583 + }, + { + "epoch": 1.5010660980810235, + "grad_norm": 1.0170090962142344, + "learning_rate": 1.5440526155510766e-06, + "loss": 0.7369, + "step": 1584 + }, + { + "epoch": 1.5010660980810235, + "eval_loss": 0.9167425036430359, + "eval_runtime": 65.8794, + "eval_samples_per_second": 41.409, + "eval_steps_per_second": 0.653, + "step": 1584 + }, + { + "epoch": 1.502013740819711, + "grad_norm": 0.9547990185885695, + "learning_rate": 1.5385084365971947e-06, + "loss": 0.6959, + "step": 1585 + }, + { + "epoch": 1.5029613835583984, + "grad_norm": 1.0361542203838576, + "learning_rate": 1.5329724187992983e-06, + "loss": 0.7212, + "step": 1586 + }, + { + "epoch": 1.503909026297086, + "grad_norm": 0.9844297983728068, + "learning_rate": 1.527444575209654e-06, + "loss": 0.6246, + "step": 1587 + }, + { + "epoch": 1.5048566690357736, + "grad_norm": 1.1774971138059982, + "learning_rate": 1.5219249188612556e-06, + "loss": 0.7104, + "step": 1588 + }, + { + "epoch": 1.505804311774461, + "grad_norm": 1.0258042621080319, + "learning_rate": 1.5164134627677895e-06, + "loss": 0.8074, + "step": 1589 + }, + { + "epoch": 1.5067519545131485, + "grad_norm": 1.0211430093660283, + "learning_rate": 1.5109102199236152e-06, + "loss": 0.7133, + "step": 1590 + }, + { + "epoch": 1.507699597251836, + "grad_norm": 0.9827554066548617, + "learning_rate": 1.5054152033037206e-06, + "loss": 0.6725, + "step": 1591 + }, + { + "epoch": 1.5086472399905235, + "grad_norm": 1.039393638590845, + "learning_rate": 1.4999284258637054e-06, + "loss": 0.7353, + "step": 1592 + }, + { + "epoch": 1.509594882729211, + "grad_norm": 0.9523927797678573, + "learning_rate": 1.4944499005397372e-06, + "loss": 0.7171, + "step": 1593 + }, + { + "epoch": 1.5105425254678986, + "grad_norm": 1.1643227627630548, + "learning_rate": 1.488979640248534e-06, + "loss": 0.7277, + "step": 1594 + }, + { + "epoch": 1.5114901682065862, + "grad_norm": 1.3140724772089227, + "learning_rate": 1.483517657887321e-06, + "loss": 0.6989, + "step": 1595 + }, + { + "epoch": 1.5124378109452736, + "grad_norm": 1.1677523249682737, + "learning_rate": 1.4780639663338125e-06, + "loss": 0.6442, + "step": 1596 + }, + { + "epoch": 1.513385453683961, + "grad_norm": 1.054535982093413, + "learning_rate": 1.4726185784461726e-06, + "loss": 0.7267, + "step": 1597 + }, + { + "epoch": 1.5143330964226487, + "grad_norm": 1.0699556405778263, + "learning_rate": 1.467181507062987e-06, + "loss": 0.7513, + "step": 1598 + }, + { + "epoch": 1.5152807391613363, + "grad_norm": 1.0277582958250449, + "learning_rate": 1.4617527650032359e-06, + "loss": 0.7007, + "step": 1599 + }, + { + "epoch": 1.5162283819000237, + "grad_norm": 0.9386085483384984, + "learning_rate": 1.4563323650662586e-06, + "loss": 0.6309, + "step": 1600 + }, + { + "epoch": 1.5171760246387112, + "grad_norm": 0.9866666932951867, + "learning_rate": 1.45092032003173e-06, + "loss": 0.7236, + "step": 1601 + }, + { + "epoch": 1.5181236673773988, + "grad_norm": 1.1091828859845587, + "learning_rate": 1.4455166426596222e-06, + "loss": 0.6645, + "step": 1602 + }, + { + "epoch": 1.5190713101160862, + "grad_norm": 1.0719630621382366, + "learning_rate": 1.440121345690182e-06, + "loss": 0.6967, + "step": 1603 + }, + { + "epoch": 1.5200189528547736, + "grad_norm": 1.2153721393164998, + "learning_rate": 1.434734441843899e-06, + "loss": 0.6897, + "step": 1604 + }, + { + "epoch": 1.5209665955934613, + "grad_norm": 1.4318770673483734, + "learning_rate": 1.4293559438214688e-06, + "loss": 0.6556, + "step": 1605 + }, + { + "epoch": 1.521914238332149, + "grad_norm": 1.2817723869505593, + "learning_rate": 1.4239858643037753e-06, + "loss": 0.714, + "step": 1606 + }, + { + "epoch": 1.521914238332149, + "eval_loss": 0.917027473449707, + "eval_runtime": 64.2379, + "eval_samples_per_second": 42.467, + "eval_steps_per_second": 0.669, + "step": 1606 + }, + { + "epoch": 1.5228618810708363, + "grad_norm": 0.8577851406821353, + "learning_rate": 1.4186242159518477e-06, + "loss": 0.7231, + "step": 1607 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 1.3693963934030193, + "learning_rate": 1.4132710114068427e-06, + "loss": 0.7009, + "step": 1608 + }, + { + "epoch": 1.5247571665482114, + "grad_norm": 1.0677528890480996, + "learning_rate": 1.4079262632900048e-06, + "loss": 0.7038, + "step": 1609 + }, + { + "epoch": 1.525704809286899, + "grad_norm": 1.0765183801869453, + "learning_rate": 1.4025899842026442e-06, + "loss": 0.6736, + "step": 1610 + }, + { + "epoch": 1.5266524520255862, + "grad_norm": 0.913748130871297, + "learning_rate": 1.3972621867261e-06, + "loss": 0.7614, + "step": 1611 + }, + { + "epoch": 1.5276000947642738, + "grad_norm": 0.9806034371545538, + "learning_rate": 1.3919428834217163e-06, + "loss": 0.7362, + "step": 1612 + }, + { + "epoch": 1.5285477375029615, + "grad_norm": 0.9176939063393724, + "learning_rate": 1.3866320868308137e-06, + "loss": 0.7242, + "step": 1613 + }, + { + "epoch": 1.5294953802416489, + "grad_norm": 1.078657631575495, + "learning_rate": 1.3813298094746491e-06, + "loss": 0.7036, + "step": 1614 + }, + { + "epoch": 1.5304430229803363, + "grad_norm": 0.9600837682866002, + "learning_rate": 1.3760360638544012e-06, + "loss": 0.6084, + "step": 1615 + }, + { + "epoch": 1.531390665719024, + "grad_norm": 0.9717695630583644, + "learning_rate": 1.3707508624511263e-06, + "loss": 0.7243, + "step": 1616 + }, + { + "epoch": 1.5323383084577116, + "grad_norm": 1.00289902715343, + "learning_rate": 1.3654742177257436e-06, + "loss": 0.7266, + "step": 1617 + }, + { + "epoch": 1.533285951196399, + "grad_norm": 0.9836974323677292, + "learning_rate": 1.3602061421189899e-06, + "loss": 0.6669, + "step": 1618 + }, + { + "epoch": 1.5342335939350864, + "grad_norm": 0.9697616210673388, + "learning_rate": 1.3549466480514079e-06, + "loss": 0.6768, + "step": 1619 + }, + { + "epoch": 1.535181236673774, + "grad_norm": 1.0209728170363237, + "learning_rate": 1.349695747923298e-06, + "loss": 0.68, + "step": 1620 + }, + { + "epoch": 1.5361288794124615, + "grad_norm": 1.1317276468030417, + "learning_rate": 1.3444534541147058e-06, + "loss": 0.6391, + "step": 1621 + }, + { + "epoch": 1.5370765221511489, + "grad_norm": 0.9314842383246464, + "learning_rate": 1.339219778985385e-06, + "loss": 0.7117, + "step": 1622 + }, + { + "epoch": 1.5380241648898365, + "grad_norm": 1.1323771307347132, + "learning_rate": 1.3339947348747633e-06, + "loss": 0.7511, + "step": 1623 + }, + { + "epoch": 1.5389718076285241, + "grad_norm": 1.0881241560945962, + "learning_rate": 1.3287783341019278e-06, + "loss": 0.6818, + "step": 1624 + }, + { + "epoch": 1.5399194503672116, + "grad_norm": 1.0586689056144305, + "learning_rate": 1.3235705889655781e-06, + "loss": 0.7126, + "step": 1625 + }, + { + "epoch": 1.540867093105899, + "grad_norm": 0.9786185399753199, + "learning_rate": 1.3183715117440143e-06, + "loss": 0.704, + "step": 1626 + }, + { + "epoch": 1.5418147358445866, + "grad_norm": 1.003277545254164, + "learning_rate": 1.3131811146950946e-06, + "loss": 0.7513, + "step": 1627 + }, + { + "epoch": 1.5427623785832743, + "grad_norm": 0.9826596333095564, + "learning_rate": 1.307999410056216e-06, + "loss": 0.7253, + "step": 1628 + }, + { + "epoch": 1.5427623785832743, + "eval_loss": 0.9158708453178406, + "eval_runtime": 63.8817, + "eval_samples_per_second": 42.704, + "eval_steps_per_second": 0.673, + "step": 1628 + }, + { + "epoch": 1.5437100213219617, + "grad_norm": 1.0364913585561064, + "learning_rate": 1.3028264100442773e-06, + "loss": 0.7177, + "step": 1629 + }, + { + "epoch": 1.544657664060649, + "grad_norm": 1.0314613393589664, + "learning_rate": 1.2976621268556571e-06, + "loss": 0.6822, + "step": 1630 + }, + { + "epoch": 1.5456053067993367, + "grad_norm": 1.15231691952742, + "learning_rate": 1.2925065726661845e-06, + "loss": 0.6954, + "step": 1631 + }, + { + "epoch": 1.5465529495380241, + "grad_norm": 1.130048978044205, + "learning_rate": 1.2873597596311026e-06, + "loss": 0.6895, + "step": 1632 + }, + { + "epoch": 1.5475005922767116, + "grad_norm": 0.9910577741606257, + "learning_rate": 1.2822216998850506e-06, + "loss": 0.7672, + "step": 1633 + }, + { + "epoch": 1.5484482350153992, + "grad_norm": 1.1500833811393358, + "learning_rate": 1.2770924055420258e-06, + "loss": 0.6813, + "step": 1634 + }, + { + "epoch": 1.5493958777540868, + "grad_norm": 1.0195004139070214, + "learning_rate": 1.2719718886953647e-06, + "loss": 0.6438, + "step": 1635 + }, + { + "epoch": 1.5503435204927742, + "grad_norm": 1.0569551060744469, + "learning_rate": 1.2668601614177017e-06, + "loss": 0.678, + "step": 1636 + }, + { + "epoch": 1.5512911632314617, + "grad_norm": 1.043875396257435, + "learning_rate": 1.2617572357609565e-06, + "loss": 0.7044, + "step": 1637 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 3.1818927893039124, + "learning_rate": 1.2566631237562894e-06, + "loss": 0.682, + "step": 1638 + }, + { + "epoch": 1.553186448708837, + "grad_norm": 0.9283273042702088, + "learning_rate": 1.2515778374140858e-06, + "loss": 0.688, + "step": 1639 + }, + { + "epoch": 1.5541340914475241, + "grad_norm": 0.9528198487095788, + "learning_rate": 1.246501388723923e-06, + "loss": 0.7322, + "step": 1640 + }, + { + "epoch": 1.5550817341862118, + "grad_norm": 0.9973994825972451, + "learning_rate": 1.2414337896545375e-06, + "loss": 0.666, + "step": 1641 + }, + { + "epoch": 1.5560293769248994, + "grad_norm": 0.9902699851910854, + "learning_rate": 1.2363750521538064e-06, + "loss": 0.6851, + "step": 1642 + }, + { + "epoch": 1.5569770196635868, + "grad_norm": 0.9650904944333506, + "learning_rate": 1.2313251881487081e-06, + "loss": 0.6672, + "step": 1643 + }, + { + "epoch": 1.5579246624022742, + "grad_norm": 1.0589094342875154, + "learning_rate": 1.2262842095453065e-06, + "loss": 0.7416, + "step": 1644 + }, + { + "epoch": 1.5588723051409619, + "grad_norm": 0.9386856438191878, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.6483, + "step": 1645 + }, + { + "epoch": 1.5598199478796495, + "grad_norm": 1.0392724309573975, + "learning_rate": 1.2162289560630524e-06, + "loss": 0.647, + "step": 1646 + }, + { + "epoch": 1.560767590618337, + "grad_norm": 0.9961962290639487, + "learning_rate": 1.211214704891467e-06, + "loss": 0.6501, + "step": 1647 + }, + { + "epoch": 1.5617152333570243, + "grad_norm": 0.9147134754595742, + "learning_rate": 1.2062093865360458e-06, + "loss": 0.6753, + "step": 1648 + }, + { + "epoch": 1.562662876095712, + "grad_norm": 1.1879710589928145, + "learning_rate": 1.2012130127978267e-06, + "loss": 0.7233, + "step": 1649 + }, + { + "epoch": 1.5636105188343994, + "grad_norm": 0.931640687866233, + "learning_rate": 1.1962255954567537e-06, + "loss": 0.6783, + "step": 1650 + }, + { + "epoch": 1.5636105188343994, + "eval_loss": 0.9168549180030823, + "eval_runtime": 67.249, + "eval_samples_per_second": 40.566, + "eval_steps_per_second": 0.639, + "step": 1650 + }, + { + "epoch": 1.5645581615730868, + "grad_norm": 1.0551992767048257, + "learning_rate": 1.1912471462716596e-06, + "loss": 0.7034, + "step": 1651 + }, + { + "epoch": 1.5655058043117744, + "grad_norm": 1.0948003208344899, + "learning_rate": 1.1862776769802275e-06, + "loss": 0.7325, + "step": 1652 + }, + { + "epoch": 1.566453447050462, + "grad_norm": 0.9671073218744537, + "learning_rate": 1.181317199298974e-06, + "loss": 0.6658, + "step": 1653 + }, + { + "epoch": 1.5674010897891495, + "grad_norm": 1.0286649464862914, + "learning_rate": 1.1763657249232107e-06, + "loss": 0.696, + "step": 1654 + }, + { + "epoch": 1.568348732527837, + "grad_norm": 1.0682369368320455, + "learning_rate": 1.1714232655270264e-06, + "loss": 0.6833, + "step": 1655 + }, + { + "epoch": 1.5692963752665245, + "grad_norm": 0.9350398037136525, + "learning_rate": 1.1664898327632552e-06, + "loss": 0.6133, + "step": 1656 + }, + { + "epoch": 1.5702440180052122, + "grad_norm": 1.0464212808521496, + "learning_rate": 1.1615654382634444e-06, + "loss": 0.6935, + "step": 1657 + }, + { + "epoch": 1.5711916607438996, + "grad_norm": 1.1421842706854686, + "learning_rate": 1.1566500936378389e-06, + "loss": 0.6562, + "step": 1658 + }, + { + "epoch": 1.572139303482587, + "grad_norm": 1.0126937824702735, + "learning_rate": 1.1517438104753386e-06, + "loss": 0.7224, + "step": 1659 + }, + { + "epoch": 1.5730869462212747, + "grad_norm": 1.033911213832933, + "learning_rate": 1.146846600343488e-06, + "loss": 0.7106, + "step": 1660 + }, + { + "epoch": 1.574034588959962, + "grad_norm": 1.1179259174465113, + "learning_rate": 1.1419584747884322e-06, + "loss": 0.6983, + "step": 1661 + }, + { + "epoch": 1.5749822316986495, + "grad_norm": 1.1682227334057596, + "learning_rate": 1.1370794453349039e-06, + "loss": 0.7165, + "step": 1662 + }, + { + "epoch": 1.5759298744373371, + "grad_norm": 0.977082622032907, + "learning_rate": 1.132209523486184e-06, + "loss": 0.6902, + "step": 1663 + }, + { + "epoch": 1.5768775171760248, + "grad_norm": 1.023609290507064, + "learning_rate": 1.1273487207240845e-06, + "loss": 0.6784, + "step": 1664 + }, + { + "epoch": 1.5778251599147122, + "grad_norm": 1.0160057466393255, + "learning_rate": 1.1224970485089193e-06, + "loss": 0.6993, + "step": 1665 + }, + { + "epoch": 1.5787728026533996, + "grad_norm": 0.9873881250473061, + "learning_rate": 1.1176545182794674e-06, + "loss": 0.7175, + "step": 1666 + }, + { + "epoch": 1.5797204453920872, + "grad_norm": 1.3197902151472374, + "learning_rate": 1.1128211414529626e-06, + "loss": 0.6993, + "step": 1667 + }, + { + "epoch": 1.5806680881307749, + "grad_norm": 1.3071909280159977, + "learning_rate": 1.1079969294250515e-06, + "loss": 0.7093, + "step": 1668 + }, + { + "epoch": 1.581615730869462, + "grad_norm": 1.0541564785946917, + "learning_rate": 1.1031818935697763e-06, + "loss": 0.7186, + "step": 1669 + }, + { + "epoch": 1.5825633736081497, + "grad_norm": 0.9430987398425215, + "learning_rate": 1.0983760452395415e-06, + "loss": 0.6589, + "step": 1670 + }, + { + "epoch": 1.5835110163468373, + "grad_norm": 0.990119702459614, + "learning_rate": 1.0935793957650947e-06, + "loss": 0.6329, + "step": 1671 + }, + { + "epoch": 1.5844586590855247, + "grad_norm": 1.0566708748575848, + "learning_rate": 1.0887919564554893e-06, + "loss": 0.7004, + "step": 1672 + }, + { + "epoch": 1.5844586590855247, + "eval_loss": 0.91568523645401, + "eval_runtime": 65.9386, + "eval_samples_per_second": 41.372, + "eval_steps_per_second": 0.652, + "step": 1672 + }, + { + "epoch": 1.5854063018242122, + "grad_norm": 0.9610068854225464, + "learning_rate": 1.0840137385980698e-06, + "loss": 0.6791, + "step": 1673 + }, + { + "epoch": 1.5863539445628998, + "grad_norm": 1.0544927310701904, + "learning_rate": 1.079244753458437e-06, + "loss": 0.6809, + "step": 1674 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.9544448982368492, + "learning_rate": 1.0744850122804218e-06, + "loss": 0.6979, + "step": 1675 + }, + { + "epoch": 1.5882492300402749, + "grad_norm": 1.2640977900333505, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.6599, + "step": 1676 + }, + { + "epoch": 1.5891968727789623, + "grad_norm": 1.0083657972577804, + "learning_rate": 1.064993306675578e-06, + "loss": 0.6699, + "step": 1677 + }, + { + "epoch": 1.59014451551765, + "grad_norm": 1.0076578894558765, + "learning_rate": 1.0602613646273374e-06, + "loss": 0.6576, + "step": 1678 + }, + { + "epoch": 1.5910921582563373, + "grad_norm": 1.0296009587033885, + "learning_rate": 1.055538711297835e-06, + "loss": 0.6642, + "step": 1679 + }, + { + "epoch": 1.5920398009950247, + "grad_norm": 1.031189178917333, + "learning_rate": 1.0508253578216693e-06, + "loss": 0.5869, + "step": 1680 + }, + { + "epoch": 1.5929874437337124, + "grad_norm": 1.0167017867708008, + "learning_rate": 1.046121315311508e-06, + "loss": 0.699, + "step": 1681 + }, + { + "epoch": 1.5939350864724, + "grad_norm": 1.7410568540334943, + "learning_rate": 1.0414265948580694e-06, + "loss": 0.7248, + "step": 1682 + }, + { + "epoch": 1.5948827292110874, + "grad_norm": 1.0198403446768327, + "learning_rate": 1.0367412075300942e-06, + "loss": 0.7163, + "step": 1683 + }, + { + "epoch": 1.5958303719497748, + "grad_norm": 0.9447519143939068, + "learning_rate": 1.0320651643743128e-06, + "loss": 0.6455, + "step": 1684 + }, + { + "epoch": 1.5967780146884625, + "grad_norm": 0.9675157899047281, + "learning_rate": 1.0273984764154327e-06, + "loss": 0.6627, + "step": 1685 + }, + { + "epoch": 1.5977256574271501, + "grad_norm": 1.099687430563817, + "learning_rate": 1.0227411546560962e-06, + "loss": 0.6868, + "step": 1686 + }, + { + "epoch": 1.5986733001658375, + "grad_norm": 1.0425813440851017, + "learning_rate": 1.0180932100768714e-06, + "loss": 0.7263, + "step": 1687 + }, + { + "epoch": 1.599620942904525, + "grad_norm": 1.0467623766365717, + "learning_rate": 1.0134546536362099e-06, + "loss": 0.7087, + "step": 1688 + }, + { + "epoch": 1.6005685856432126, + "grad_norm": 1.1080721212745201, + "learning_rate": 1.008825496270434e-06, + "loss": 0.708, + "step": 1689 + }, + { + "epoch": 1.6015162283819, + "grad_norm": 1.18766161221473, + "learning_rate": 1.0042057488937067e-06, + "loss": 0.6998, + "step": 1690 + }, + { + "epoch": 1.6024638711205874, + "grad_norm": 1.0510182256034266, + "learning_rate": 9.995954223979992e-07, + "loss": 0.6989, + "step": 1691 + }, + { + "epoch": 1.603411513859275, + "grad_norm": 1.3860613450249764, + "learning_rate": 9.949945276530782e-07, + "loss": 0.7097, + "step": 1692 + }, + { + "epoch": 1.6043591565979627, + "grad_norm": 1.2179486037581968, + "learning_rate": 9.904030755064659e-07, + "loss": 0.6978, + "step": 1693 + }, + { + "epoch": 1.60530679933665, + "grad_norm": 0.964495472412476, + "learning_rate": 9.858210767834292e-07, + "loss": 0.6589, + "step": 1694 + }, + { + "epoch": 1.60530679933665, + "eval_loss": 0.9162237644195557, + "eval_runtime": 66.3491, + "eval_samples_per_second": 41.116, + "eval_steps_per_second": 0.648, + "step": 1694 + }, + { + "epoch": 1.6062544420753375, + "grad_norm": 0.9604758470801715, + "learning_rate": 9.8124854228694e-07, + "loss": 0.6931, + "step": 1695 + }, + { + "epoch": 1.6072020848140252, + "grad_norm": 1.1003612306707224, + "learning_rate": 9.76685482797662e-07, + "loss": 0.7113, + "step": 1696 + }, + { + "epoch": 1.6081497275527128, + "grad_norm": 1.0752314491639252, + "learning_rate": 9.72131909073914e-07, + "loss": 0.6719, + "step": 1697 + }, + { + "epoch": 1.6090973702914, + "grad_norm": 1.0414996201454678, + "learning_rate": 9.675878318516546e-07, + "loss": 0.7659, + "step": 1698 + }, + { + "epoch": 1.6100450130300876, + "grad_norm": 1.1239598855506638, + "learning_rate": 9.630532618444532e-07, + "loss": 0.6927, + "step": 1699 + }, + { + "epoch": 1.6109926557687753, + "grad_norm": 1.0046753329350298, + "learning_rate": 9.58528209743459e-07, + "loss": 0.7055, + "step": 1700 + }, + { + "epoch": 1.6119402985074627, + "grad_norm": 1.0189187609725145, + "learning_rate": 9.540126862173865e-07, + "loss": 0.7139, + "step": 1701 + }, + { + "epoch": 1.61288794124615, + "grad_norm": 1.719157547437819, + "learning_rate": 9.495067019124793e-07, + "loss": 0.7117, + "step": 1702 + }, + { + "epoch": 1.6138355839848377, + "grad_norm": 1.01602314797504, + "learning_rate": 9.450102674524952e-07, + "loss": 0.7244, + "step": 1703 + }, + { + "epoch": 1.6147832267235254, + "grad_norm": 1.0640275443844036, + "learning_rate": 9.405233934386726e-07, + "loss": 0.6851, + "step": 1704 + }, + { + "epoch": 1.6157308694622128, + "grad_norm": 0.9979744262453907, + "learning_rate": 9.360460904497132e-07, + "loss": 0.712, + "step": 1705 + }, + { + "epoch": 1.6166785122009002, + "grad_norm": 0.9521292764998414, + "learning_rate": 9.315783690417479e-07, + "loss": 0.6478, + "step": 1706 + }, + { + "epoch": 1.6176261549395878, + "grad_norm": 1.0329943572255877, + "learning_rate": 9.271202397483214e-07, + "loss": 0.692, + "step": 1707 + }, + { + "epoch": 1.6185737976782753, + "grad_norm": 0.9688251186485236, + "learning_rate": 9.226717130803636e-07, + "loss": 0.7099, + "step": 1708 + }, + { + "epoch": 1.6195214404169627, + "grad_norm": 1.0095855927099802, + "learning_rate": 9.182327995261592e-07, + "loss": 0.6799, + "step": 1709 + }, + { + "epoch": 1.6204690831556503, + "grad_norm": 0.9336916292986674, + "learning_rate": 9.138035095513337e-07, + "loss": 0.7118, + "step": 1710 + }, + { + "epoch": 1.621416725894338, + "grad_norm": 0.9262678528281649, + "learning_rate": 9.093838535988181e-07, + "loss": 0.7048, + "step": 1711 + }, + { + "epoch": 1.6223643686330254, + "grad_norm": 1.010842317395484, + "learning_rate": 9.049738420888349e-07, + "loss": 0.6302, + "step": 1712 + }, + { + "epoch": 1.6233120113717128, + "grad_norm": 1.0054077234262655, + "learning_rate": 9.005734854188625e-07, + "loss": 0.7457, + "step": 1713 + }, + { + "epoch": 1.6242596541104004, + "grad_norm": 1.1774522725658818, + "learning_rate": 8.961827939636198e-07, + "loss": 0.6728, + "step": 1714 + }, + { + "epoch": 1.625207296849088, + "grad_norm": 1.0498467807029983, + "learning_rate": 8.918017780750349e-07, + "loss": 0.7334, + "step": 1715 + }, + { + "epoch": 1.6261549395877755, + "grad_norm": 0.9901280155483644, + "learning_rate": 8.874304480822271e-07, + "loss": 0.7517, + "step": 1716 + }, + { + "epoch": 1.6261549395877755, + "eval_loss": 0.9154621958732605, + "eval_runtime": 65.8149, + "eval_samples_per_second": 41.45, + "eval_steps_per_second": 0.653, + "step": 1716 + }, + { + "epoch": 1.6271025823264629, + "grad_norm": 1.1089800038542108, + "learning_rate": 8.830688142914783e-07, + "loss": 0.6657, + "step": 1717 + }, + { + "epoch": 1.6280502250651505, + "grad_norm": 1.009309929039825, + "learning_rate": 8.787168869862067e-07, + "loss": 0.6259, + "step": 1718 + }, + { + "epoch": 1.628997867803838, + "grad_norm": 0.9117346360474112, + "learning_rate": 8.743746764269512e-07, + "loss": 0.6988, + "step": 1719 + }, + { + "epoch": 1.6299455105425253, + "grad_norm": 2.22961599294848, + "learning_rate": 8.700421928513353e-07, + "loss": 0.653, + "step": 1720 + }, + { + "epoch": 1.630893153281213, + "grad_norm": 0.9850710565653991, + "learning_rate": 8.657194464740542e-07, + "loss": 0.737, + "step": 1721 + }, + { + "epoch": 1.6318407960199006, + "grad_norm": 1.0081901085844467, + "learning_rate": 8.614064474868423e-07, + "loss": 0.6789, + "step": 1722 + }, + { + "epoch": 1.632788438758588, + "grad_norm": 1.0631665749702537, + "learning_rate": 8.571032060584555e-07, + "loss": 0.7087, + "step": 1723 + }, + { + "epoch": 1.6337360814972754, + "grad_norm": 1.0118721550869085, + "learning_rate": 8.528097323346408e-07, + "loss": 0.6821, + "step": 1724 + }, + { + "epoch": 1.634683724235963, + "grad_norm": 1.274936977561588, + "learning_rate": 8.485260364381187e-07, + "loss": 0.6716, + "step": 1725 + }, + { + "epoch": 1.6356313669746505, + "grad_norm": 1.0982132650644516, + "learning_rate": 8.442521284685573e-07, + "loss": 0.6765, + "step": 1726 + }, + { + "epoch": 1.636579009713338, + "grad_norm": 1.1367575465419841, + "learning_rate": 8.399880185025439e-07, + "loss": 0.6864, + "step": 1727 + }, + { + "epoch": 1.6375266524520256, + "grad_norm": 0.9279424119252897, + "learning_rate": 8.357337165935675e-07, + "loss": 0.7321, + "step": 1728 + }, + { + "epoch": 1.6384742951907132, + "grad_norm": 1.0676158883165283, + "learning_rate": 8.314892327719937e-07, + "loss": 0.7418, + "step": 1729 + }, + { + "epoch": 1.6394219379294006, + "grad_norm": 1.014952809890761, + "learning_rate": 8.27254577045039e-07, + "loss": 0.7356, + "step": 1730 + }, + { + "epoch": 1.640369580668088, + "grad_norm": 1.0553676092641795, + "learning_rate": 8.230297593967463e-07, + "loss": 0.6572, + "step": 1731 + }, + { + "epoch": 1.6413172234067757, + "grad_norm": 1.0596022428705136, + "learning_rate": 8.188147897879667e-07, + "loss": 0.6834, + "step": 1732 + }, + { + "epoch": 1.6422648661454633, + "grad_norm": 1.0760233096652914, + "learning_rate": 8.146096781563284e-07, + "loss": 0.6732, + "step": 1733 + }, + { + "epoch": 1.6432125088841507, + "grad_norm": 1.0435516775906648, + "learning_rate": 8.104144344162229e-07, + "loss": 0.7147, + "step": 1734 + }, + { + "epoch": 1.6441601516228381, + "grad_norm": 0.9126453263477479, + "learning_rate": 8.062290684587698e-07, + "loss": 0.7066, + "step": 1735 + }, + { + "epoch": 1.6451077943615258, + "grad_norm": 1.136390471558309, + "learning_rate": 8.02053590151805e-07, + "loss": 0.675, + "step": 1736 + }, + { + "epoch": 1.6460554371002132, + "grad_norm": 0.9443277333193851, + "learning_rate": 7.978880093398517e-07, + "loss": 0.6556, + "step": 1737 + }, + { + "epoch": 1.6470030798389006, + "grad_norm": 0.9782892174224657, + "learning_rate": 7.937323358440935e-07, + "loss": 0.6771, + "step": 1738 + }, + { + "epoch": 1.6470030798389006, + "eval_loss": 0.9155307412147522, + "eval_runtime": 69.7327, + "eval_samples_per_second": 39.121, + "eval_steps_per_second": 0.617, + "step": 1738 + }, + { + "epoch": 1.6479507225775882, + "grad_norm": 0.9461043845473808, + "learning_rate": 7.89586579462362e-07, + "loss": 0.6145, + "step": 1739 + }, + { + "epoch": 1.6488983653162759, + "grad_norm": 1.0760119198442148, + "learning_rate": 7.854507499691006e-07, + "loss": 0.6764, + "step": 1740 + }, + { + "epoch": 1.6498460080549633, + "grad_norm": 1.047284386935676, + "learning_rate": 7.813248571153542e-07, + "loss": 0.7229, + "step": 1741 + }, + { + "epoch": 1.6507936507936507, + "grad_norm": 1.0394813864374197, + "learning_rate": 7.772089106287345e-07, + "loss": 0.7326, + "step": 1742 + }, + { + "epoch": 1.6517412935323383, + "grad_norm": 1.045028572772028, + "learning_rate": 7.731029202134077e-07, + "loss": 0.7167, + "step": 1743 + }, + { + "epoch": 1.652688936271026, + "grad_norm": 1.006834434991697, + "learning_rate": 7.690068955500623e-07, + "loss": 0.705, + "step": 1744 + }, + { + "epoch": 1.6536365790097134, + "grad_norm": 1.0207105597171535, + "learning_rate": 7.649208462958935e-07, + "loss": 0.7293, + "step": 1745 + }, + { + "epoch": 1.6545842217484008, + "grad_norm": 0.9666760369440525, + "learning_rate": 7.608447820845771e-07, + "loss": 0.6882, + "step": 1746 + }, + { + "epoch": 1.6555318644870884, + "grad_norm": 0.983892799303487, + "learning_rate": 7.567787125262449e-07, + "loss": 0.6787, + "step": 1747 + }, + { + "epoch": 1.6564795072257759, + "grad_norm": 1.0469953263447322, + "learning_rate": 7.527226472074678e-07, + "loss": 0.7717, + "step": 1748 + }, + { + "epoch": 1.6574271499644633, + "grad_norm": 0.9326611142849608, + "learning_rate": 7.486765956912261e-07, + "loss": 0.6829, + "step": 1749 + }, + { + "epoch": 1.658374792703151, + "grad_norm": 0.9698239262438025, + "learning_rate": 7.446405675168938e-07, + "loss": 0.6417, + "step": 1750 + }, + { + "epoch": 1.6593224354418386, + "grad_norm": 1.0388054521896983, + "learning_rate": 7.406145722002101e-07, + "loss": 0.661, + "step": 1751 + }, + { + "epoch": 1.660270078180526, + "grad_norm": 1.0284488705513752, + "learning_rate": 7.365986192332624e-07, + "loss": 0.6885, + "step": 1752 + }, + { + "epoch": 1.6612177209192134, + "grad_norm": 1.0447857805111578, + "learning_rate": 7.325927180844589e-07, + "loss": 0.754, + "step": 1753 + }, + { + "epoch": 1.662165363657901, + "grad_norm": 1.0316358381680006, + "learning_rate": 7.285968781985093e-07, + "loss": 0.7376, + "step": 1754 + }, + { + "epoch": 1.6631130063965884, + "grad_norm": 0.9473765886693883, + "learning_rate": 7.246111089964042e-07, + "loss": 0.7222, + "step": 1755 + }, + { + "epoch": 1.6640606491352758, + "grad_norm": 1.0459905890378751, + "learning_rate": 7.206354198753862e-07, + "loss": 0.7092, + "step": 1756 + }, + { + "epoch": 1.6650082918739635, + "grad_norm": 1.1687326620774066, + "learning_rate": 7.166698202089367e-07, + "loss": 0.6543, + "step": 1757 + }, + { + "epoch": 1.6659559346126511, + "grad_norm": 0.9534227603353764, + "learning_rate": 7.127143193467445e-07, + "loss": 0.6816, + "step": 1758 + }, + { + "epoch": 1.6669035773513385, + "grad_norm": 1.6473488276372368, + "learning_rate": 7.087689266146935e-07, + "loss": 0.609, + "step": 1759 + }, + { + "epoch": 1.667851220090026, + "grad_norm": 1.0168119921182184, + "learning_rate": 7.048336513148307e-07, + "loss": 0.7228, + "step": 1760 + }, + { + "epoch": 1.667851220090026, + "eval_loss": 0.9152177572250366, + "eval_runtime": 64.5647, + "eval_samples_per_second": 42.252, + "eval_steps_per_second": 0.666, + "step": 1760 + }, + { + "epoch": 1.6687988628287136, + "grad_norm": 1.091512400561692, + "learning_rate": 7.009085027253543e-07, + "loss": 0.7229, + "step": 1761 + }, + { + "epoch": 1.6697465055674012, + "grad_norm": 0.9169936965491188, + "learning_rate": 6.969934901005809e-07, + "loss": 0.6622, + "step": 1762 + }, + { + "epoch": 1.6706941483060886, + "grad_norm": 0.9220318537884853, + "learning_rate": 6.930886226709344e-07, + "loss": 0.6763, + "step": 1763 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 1.4840032839975184, + "learning_rate": 6.89193909642919e-07, + "loss": 0.7216, + "step": 1764 + }, + { + "epoch": 1.6725894337834637, + "grad_norm": 1.0184604456491309, + "learning_rate": 6.853093601990946e-07, + "loss": 0.7152, + "step": 1765 + }, + { + "epoch": 1.6735370765221511, + "grad_norm": 0.9526356849525173, + "learning_rate": 6.814349834980622e-07, + "loss": 0.6673, + "step": 1766 + }, + { + "epoch": 1.6744847192608385, + "grad_norm": 1.0461521335211754, + "learning_rate": 6.775707886744343e-07, + "loss": 0.7344, + "step": 1767 + }, + { + "epoch": 1.6754323619995262, + "grad_norm": 0.9731694459589282, + "learning_rate": 6.737167848388227e-07, + "loss": 0.6401, + "step": 1768 + }, + { + "epoch": 1.6763800047382138, + "grad_norm": 1.033021359037975, + "learning_rate": 6.698729810778065e-07, + "loss": 0.6726, + "step": 1769 + }, + { + "epoch": 1.6773276474769012, + "grad_norm": 1.0233932401805885, + "learning_rate": 6.660393864539222e-07, + "loss": 0.746, + "step": 1770 + }, + { + "epoch": 1.6782752902155886, + "grad_norm": 1.1404272415247523, + "learning_rate": 6.622160100056296e-07, + "loss": 0.7257, + "step": 1771 + }, + { + "epoch": 1.6792229329542763, + "grad_norm": 0.9500280589832726, + "learning_rate": 6.584028607473019e-07, + "loss": 0.6845, + "step": 1772 + }, + { + "epoch": 1.680170575692964, + "grad_norm": 1.1155349368000151, + "learning_rate": 6.545999476691994e-07, + "loss": 0.7388, + "step": 1773 + }, + { + "epoch": 1.681118218431651, + "grad_norm": 1.0605447278963818, + "learning_rate": 6.508072797374454e-07, + "loss": 0.7103, + "step": 1774 + }, + { + "epoch": 1.6820658611703387, + "grad_norm": 1.2288918607070451, + "learning_rate": 6.470248658940115e-07, + "loss": 0.7631, + "step": 1775 + }, + { + "epoch": 1.6830135039090264, + "grad_norm": 1.5101120147098823, + "learning_rate": 6.432527150566903e-07, + "loss": 0.6687, + "step": 1776 + }, + { + "epoch": 1.6839611466477138, + "grad_norm": 1.0065803962933693, + "learning_rate": 6.394908361190804e-07, + "loss": 0.6794, + "step": 1777 + }, + { + "epoch": 1.6849087893864012, + "grad_norm": 0.9998197093667816, + "learning_rate": 6.3573923795056e-07, + "loss": 0.7064, + "step": 1778 + }, + { + "epoch": 1.6858564321250888, + "grad_norm": 0.9471515572718026, + "learning_rate": 6.319979293962692e-07, + "loss": 0.6864, + "step": 1779 + }, + { + "epoch": 1.6868040748637765, + "grad_norm": 1.1324852784222434, + "learning_rate": 6.282669192770896e-07, + "loss": 0.6993, + "step": 1780 + }, + { + "epoch": 1.687751717602464, + "grad_norm": 1.1270168615840963, + "learning_rate": 6.245462163896188e-07, + "loss": 0.6916, + "step": 1781 + }, + { + "epoch": 1.6886993603411513, + "grad_norm": 1.0074883404182633, + "learning_rate": 6.208358295061572e-07, + "loss": 0.6657, + "step": 1782 + }, + { + "epoch": 1.6886993603411513, + "eval_loss": 0.9154070615768433, + "eval_runtime": 66.96, + "eval_samples_per_second": 40.741, + "eval_steps_per_second": 0.642, + "step": 1782 + }, + { + "epoch": 1.689647003079839, + "grad_norm": 1.054498287010899, + "learning_rate": 6.171357673746798e-07, + "loss": 0.6781, + "step": 1783 + }, + { + "epoch": 1.6905946458185264, + "grad_norm": 1.1173569579127263, + "learning_rate": 6.134460387188207e-07, + "loss": 0.7066, + "step": 1784 + }, + { + "epoch": 1.6915422885572138, + "grad_norm": 1.1694100658433553, + "learning_rate": 6.097666522378498e-07, + "loss": 0.7334, + "step": 1785 + }, + { + "epoch": 1.6924899312959014, + "grad_norm": 1.0277071128785418, + "learning_rate": 6.060976166066546e-07, + "loss": 0.653, + "step": 1786 + }, + { + "epoch": 1.693437574034589, + "grad_norm": 0.9433860569393974, + "learning_rate": 6.024389404757164e-07, + "loss": 0.7334, + "step": 1787 + }, + { + "epoch": 1.6943852167732765, + "grad_norm": 1.1834900094280103, + "learning_rate": 5.98790632471094e-07, + "loss": 0.6869, + "step": 1788 + }, + { + "epoch": 1.6953328595119639, + "grad_norm": 1.0548703068294034, + "learning_rate": 5.951527011944008e-07, + "loss": 0.6971, + "step": 1789 + }, + { + "epoch": 1.6962805022506515, + "grad_norm": 1.04211743004335, + "learning_rate": 5.91525155222783e-07, + "loss": 0.6956, + "step": 1790 + }, + { + "epoch": 1.6972281449893392, + "grad_norm": 0.9772649558392219, + "learning_rate": 5.879080031089047e-07, + "loss": 0.6854, + "step": 1791 + }, + { + "epoch": 1.6981757877280266, + "grad_norm": 1.0335742882872847, + "learning_rate": 5.843012533809211e-07, + "loss": 0.6413, + "step": 1792 + }, + { + "epoch": 1.699123430466714, + "grad_norm": 1.0424548733114698, + "learning_rate": 5.807049145424648e-07, + "loss": 0.6913, + "step": 1793 + }, + { + "epoch": 1.7000710732054016, + "grad_norm": 0.9133463333235484, + "learning_rate": 5.771189950726191e-07, + "loss": 0.7096, + "step": 1794 + }, + { + "epoch": 1.701018715944089, + "grad_norm": 1.052652054283284, + "learning_rate": 5.735435034259057e-07, + "loss": 0.6999, + "step": 1795 + }, + { + "epoch": 1.7019663586827765, + "grad_norm": 0.9804718340800169, + "learning_rate": 5.699784480322568e-07, + "loss": 0.7222, + "step": 1796 + }, + { + "epoch": 1.702914001421464, + "grad_norm": 1.0457563081701153, + "learning_rate": 5.664238372970016e-07, + "loss": 0.7255, + "step": 1797 + }, + { + "epoch": 1.7038616441601517, + "grad_norm": 1.032471323678011, + "learning_rate": 5.628796796008435e-07, + "loss": 0.7157, + "step": 1798 + }, + { + "epoch": 1.7048092868988391, + "grad_norm": 1.060728898729736, + "learning_rate": 5.593459832998388e-07, + "loss": 0.7115, + "step": 1799 + }, + { + "epoch": 1.7057569296375266, + "grad_norm": 0.9198580084720938, + "learning_rate": 5.558227567253832e-07, + "loss": 0.6637, + "step": 1800 + }, + { + "epoch": 1.7067045723762142, + "grad_norm": 1.1801823503070277, + "learning_rate": 5.52310008184182e-07, + "loss": 0.6761, + "step": 1801 + }, + { + "epoch": 1.7076522151149018, + "grad_norm": 0.9488900309898747, + "learning_rate": 5.488077459582425e-07, + "loss": 0.6881, + "step": 1802 + }, + { + "epoch": 1.708599857853589, + "grad_norm": 1.0453768970538024, + "learning_rate": 5.453159783048434e-07, + "loss": 0.6938, + "step": 1803 + }, + { + "epoch": 1.7095475005922767, + "grad_norm": 0.8783306154356906, + "learning_rate": 5.418347134565249e-07, + "loss": 0.7375, + "step": 1804 + }, + { + "epoch": 1.7095475005922767, + "eval_loss": 0.9155663847923279, + "eval_runtime": 61.2948, + "eval_samples_per_second": 44.506, + "eval_steps_per_second": 0.702, + "step": 1804 + }, + { + "epoch": 1.7104951433309643, + "grad_norm": 0.9783675322880301, + "learning_rate": 5.383639596210605e-07, + "loss": 0.7133, + "step": 1805 + }, + { + "epoch": 1.7114427860696517, + "grad_norm": 1.071623947559265, + "learning_rate": 5.349037249814443e-07, + "loss": 0.717, + "step": 1806 + }, + { + "epoch": 1.7123904288083391, + "grad_norm": 0.9742720184084963, + "learning_rate": 5.314540176958699e-07, + "loss": 0.6707, + "step": 1807 + }, + { + "epoch": 1.7133380715470268, + "grad_norm": 1.0237973061069328, + "learning_rate": 5.28014845897708e-07, + "loss": 0.6885, + "step": 1808 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1521501110479333, + "learning_rate": 5.24586217695493e-07, + "loss": 0.6501, + "step": 1809 + }, + { + "epoch": 1.7152333570244018, + "grad_norm": 1.103148137037962, + "learning_rate": 5.211681411728969e-07, + "loss": 0.7074, + "step": 1810 + }, + { + "epoch": 1.7161809997630892, + "grad_norm": 1.0912149190601321, + "learning_rate": 5.177606243887184e-07, + "loss": 0.6816, + "step": 1811 + }, + { + "epoch": 1.7171286425017769, + "grad_norm": 1.1479638151703746, + "learning_rate": 5.14363675376855e-07, + "loss": 0.6941, + "step": 1812 + }, + { + "epoch": 1.7180762852404643, + "grad_norm": 1.0169229566279356, + "learning_rate": 5.109773021462921e-07, + "loss": 0.6869, + "step": 1813 + }, + { + "epoch": 1.7190239279791517, + "grad_norm": 1.0911518292945759, + "learning_rate": 5.076015126810784e-07, + "loss": 0.6936, + "step": 1814 + }, + { + "epoch": 1.7199715707178393, + "grad_norm": 1.0143944326536474, + "learning_rate": 5.042363149403106e-07, + "loss": 0.6826, + "step": 1815 + }, + { + "epoch": 1.720919213456527, + "grad_norm": 1.0145646424072496, + "learning_rate": 5.008817168581137e-07, + "loss": 0.738, + "step": 1816 + }, + { + "epoch": 1.7218668561952144, + "grad_norm": 0.9897806551140146, + "learning_rate": 4.975377263436193e-07, + "loss": 0.702, + "step": 1817 + }, + { + "epoch": 1.7228144989339018, + "grad_norm": 0.9854817267582501, + "learning_rate": 4.94204351280953e-07, + "loss": 0.7192, + "step": 1818 + }, + { + "epoch": 1.7237621416725895, + "grad_norm": 1.5230326854807534, + "learning_rate": 4.908815995292082e-07, + "loss": 0.7293, + "step": 1819 + }, + { + "epoch": 1.724709784411277, + "grad_norm": 1.2847042251851852, + "learning_rate": 4.875694789224372e-07, + "loss": 0.6911, + "step": 1820 + }, + { + "epoch": 1.7256574271499645, + "grad_norm": 1.026112774551678, + "learning_rate": 4.842679972696213e-07, + "loss": 0.6836, + "step": 1821 + }, + { + "epoch": 1.726605069888652, + "grad_norm": 1.0314353073227083, + "learning_rate": 4.809771623546627e-07, + "loss": 0.6813, + "step": 1822 + }, + { + "epoch": 1.7275527126273396, + "grad_norm": 0.9609581306542341, + "learning_rate": 4.776969819363614e-07, + "loss": 0.7, + "step": 1823 + }, + { + "epoch": 1.728500355366027, + "grad_norm": 1.0594554399314495, + "learning_rate": 4.7442746374839363e-07, + "loss": 0.6848, + "step": 1824 + }, + { + "epoch": 1.7294479981047144, + "grad_norm": 1.0911056243232513, + "learning_rate": 4.711686154993028e-07, + "loss": 0.6629, + "step": 1825 + }, + { + "epoch": 1.730395640843402, + "grad_norm": 1.0318006859508377, + "learning_rate": 4.6792044487247003e-07, + "loss": 0.6968, + "step": 1826 + }, + { + "epoch": 1.730395640843402, + "eval_loss": 0.9146263003349304, + "eval_runtime": 63.3709, + "eval_samples_per_second": 43.048, + "eval_steps_per_second": 0.679, + "step": 1826 + }, + { + "epoch": 1.7313432835820897, + "grad_norm": 0.9085218539728935, + "learning_rate": 4.646829595261071e-07, + "loss": 0.6937, + "step": 1827 + }, + { + "epoch": 1.732290926320777, + "grad_norm": 1.1715576080637178, + "learning_rate": 4.614561670932288e-07, + "loss": 0.7269, + "step": 1828 + }, + { + "epoch": 1.7332385690594645, + "grad_norm": 1.1027251721128186, + "learning_rate": 4.582400751816435e-07, + "loss": 0.7023, + "step": 1829 + }, + { + "epoch": 1.7341862117981521, + "grad_norm": 1.110203701042969, + "learning_rate": 4.5503469137392565e-07, + "loss": 0.6782, + "step": 1830 + }, + { + "epoch": 1.7351338545368398, + "grad_norm": 1.208247198988513, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.7379, + "step": 1831 + }, + { + "epoch": 1.736081497275527, + "grad_norm": 0.9596889918326993, + "learning_rate": 4.486560782741578e-07, + "loss": 0.7485, + "step": 1832 + }, + { + "epoch": 1.7370291400142146, + "grad_norm": 0.9856266219344122, + "learning_rate": 4.454828640209574e-07, + "loss": 0.7021, + "step": 1833 + }, + { + "epoch": 1.7379767827529022, + "grad_norm": 1.0066120094983713, + "learning_rate": 4.423203879492943e-07, + "loss": 0.6334, + "step": 1834 + }, + { + "epoch": 1.7389244254915897, + "grad_norm": 1.0174910309231802, + "learning_rate": 4.3916865751533313e-07, + "loss": 0.6737, + "step": 1835 + }, + { + "epoch": 1.739872068230277, + "grad_norm": 1.090557902374621, + "learning_rate": 4.360276801499086e-07, + "loss": 0.6986, + "step": 1836 + }, + { + "epoch": 1.7408197109689647, + "grad_norm": 0.9525400709898934, + "learning_rate": 4.3289746325849924e-07, + "loss": 0.6387, + "step": 1837 + }, + { + "epoch": 1.7417673537076523, + "grad_norm": 0.9714172712407362, + "learning_rate": 4.29778014221216e-07, + "loss": 0.7426, + "step": 1838 + }, + { + "epoch": 1.7427149964463398, + "grad_norm": 1.011041205364556, + "learning_rate": 4.2666934039278017e-07, + "loss": 0.7251, + "step": 1839 + }, + { + "epoch": 1.7436626391850272, + "grad_norm": 1.0835244258679044, + "learning_rate": 4.2357144910251003e-07, + "loss": 0.7394, + "step": 1840 + }, + { + "epoch": 1.7446102819237148, + "grad_norm": 0.9136527438313549, + "learning_rate": 4.20484347654303e-07, + "loss": 0.6833, + "step": 1841 + }, + { + "epoch": 1.7455579246624022, + "grad_norm": 0.9394753418742889, + "learning_rate": 4.1740804332661365e-07, + "loss": 0.7183, + "step": 1842 + }, + { + "epoch": 1.7465055674010896, + "grad_norm": 1.1163656927143548, + "learning_rate": 4.1434254337244404e-07, + "loss": 0.6688, + "step": 1843 + }, + { + "epoch": 1.7474532101397773, + "grad_norm": 1.0767321655039874, + "learning_rate": 4.1128785501931947e-07, + "loss": 0.7301, + "step": 1844 + }, + { + "epoch": 1.748400852878465, + "grad_norm": 1.091800385892591, + "learning_rate": 4.0824398546927823e-07, + "loss": 0.7628, + "step": 1845 + }, + { + "epoch": 1.7493484956171523, + "grad_norm": 1.1875176453886316, + "learning_rate": 4.05210941898847e-07, + "loss": 0.7532, + "step": 1846 + }, + { + "epoch": 1.7502961383558397, + "grad_norm": 0.9616131150389049, + "learning_rate": 4.021887314590323e-07, + "loss": 0.7407, + "step": 1847 + }, + { + "epoch": 1.7512437810945274, + "grad_norm": 1.03865176835429, + "learning_rate": 3.9917736127529525e-07, + "loss": 0.7331, + "step": 1848 + }, + { + "epoch": 1.7512437810945274, + "eval_loss": 0.914626955986023, + "eval_runtime": 61.9304, + "eval_samples_per_second": 44.049, + "eval_steps_per_second": 0.694, + "step": 1848 + }, + { + "epoch": 1.752191423833215, + "grad_norm": 1.023686798198027, + "learning_rate": 3.9617683844754284e-07, + "loss": 0.7311, + "step": 1849 + }, + { + "epoch": 1.7531390665719024, + "grad_norm": 0.9592372092729335, + "learning_rate": 3.9318717005010496e-07, + "loss": 0.7405, + "step": 1850 + }, + { + "epoch": 1.7540867093105899, + "grad_norm": 1.0255950736345492, + "learning_rate": 3.902083631317194e-07, + "loss": 0.6882, + "step": 1851 + }, + { + "epoch": 1.7550343520492775, + "grad_norm": 1.0334498715269957, + "learning_rate": 3.8724042471551925e-07, + "loss": 0.6409, + "step": 1852 + }, + { + "epoch": 1.755981994787965, + "grad_norm": 1.187230644417929, + "learning_rate": 3.8428336179900773e-07, + "loss": 0.687, + "step": 1853 + }, + { + "epoch": 1.7569296375266523, + "grad_norm": 1.0889047565138557, + "learning_rate": 3.8133718135405283e-07, + "loss": 0.713, + "step": 1854 + }, + { + "epoch": 1.75787728026534, + "grad_norm": 0.9442970597032486, + "learning_rate": 3.784018903268588e-07, + "loss": 0.6456, + "step": 1855 + }, + { + "epoch": 1.7588249230040276, + "grad_norm": 0.9511254579043233, + "learning_rate": 3.7547749563796144e-07, + "loss": 0.675, + "step": 1856 + }, + { + "epoch": 1.759772565742715, + "grad_norm": 1.07549297963586, + "learning_rate": 3.725640041822026e-07, + "loss": 0.7639, + "step": 1857 + }, + { + "epoch": 1.7607202084814024, + "grad_norm": 1.2232858699290627, + "learning_rate": 3.6966142282871873e-07, + "loss": 0.738, + "step": 1858 + }, + { + "epoch": 1.76166785122009, + "grad_norm": 0.977573869114317, + "learning_rate": 3.667697584209251e-07, + "loss": 0.6537, + "step": 1859 + }, + { + "epoch": 1.7626154939587777, + "grad_norm": 1.0901080710066895, + "learning_rate": 3.638890177764948e-07, + "loss": 0.6607, + "step": 1860 + }, + { + "epoch": 1.763563136697465, + "grad_norm": 1.0047668118955564, + "learning_rate": 3.610192076873498e-07, + "loss": 0.6992, + "step": 1861 + }, + { + "epoch": 1.7645107794361525, + "grad_norm": 1.0810551846393102, + "learning_rate": 3.581603349196372e-07, + "loss": 0.7749, + "step": 1862 + }, + { + "epoch": 1.7654584221748402, + "grad_norm": 1.2746674219972365, + "learning_rate": 3.553124062137203e-07, + "loss": 0.697, + "step": 1863 + }, + { + "epoch": 1.7664060649135276, + "grad_norm": 1.0162449684395298, + "learning_rate": 3.524754282841575e-07, + "loss": 0.741, + "step": 1864 + }, + { + "epoch": 1.767353707652215, + "grad_norm": 0.9261105845228721, + "learning_rate": 3.49649407819691e-07, + "loss": 0.6527, + "step": 1865 + }, + { + "epoch": 1.7683013503909026, + "grad_norm": 1.0754817790075664, + "learning_rate": 3.468343514832251e-07, + "loss": 0.6518, + "step": 1866 + }, + { + "epoch": 1.7692489931295903, + "grad_norm": 0.9487301149199158, + "learning_rate": 3.440302659118172e-07, + "loss": 0.7055, + "step": 1867 + }, + { + "epoch": 1.7701966358682777, + "grad_norm": 0.9988153915506769, + "learning_rate": 3.4123715771665786e-07, + "loss": 0.6693, + "step": 1868 + }, + { + "epoch": 1.771144278606965, + "grad_norm": 1.1376902119777852, + "learning_rate": 3.3845503348305554e-07, + "loss": 0.6901, + "step": 1869 + }, + { + "epoch": 1.7720919213456527, + "grad_norm": 0.9271287842743362, + "learning_rate": 3.356838997704226e-07, + "loss": 0.6715, + "step": 1870 + }, + { + "epoch": 1.7720919213456527, + "eval_loss": 0.9146553874015808, + "eval_runtime": 65.9577, + "eval_samples_per_second": 41.36, + "eval_steps_per_second": 0.652, + "step": 1870 + }, + { + "epoch": 1.7730395640843402, + "grad_norm": 1.114859990972608, + "learning_rate": 3.3292376311225837e-07, + "loss": 0.7206, + "step": 1871 + }, + { + "epoch": 1.7739872068230276, + "grad_norm": 0.9957104184290693, + "learning_rate": 3.3017463001613625e-07, + "loss": 0.7175, + "step": 1872 + }, + { + "epoch": 1.7749348495617152, + "grad_norm": 1.085353272133234, + "learning_rate": 3.274365069636831e-07, + "loss": 0.7183, + "step": 1873 + }, + { + "epoch": 1.7758824923004028, + "grad_norm": 0.9620062890136332, + "learning_rate": 3.247094004105711e-07, + "loss": 0.6941, + "step": 1874 + }, + { + "epoch": 1.7768301350390903, + "grad_norm": 1.0208277814345714, + "learning_rate": 3.2199331678649804e-07, + "loss": 0.6735, + "step": 1875 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 1.085385982876263, + "learning_rate": 3.1928826249516984e-07, + "loss": 0.7081, + "step": 1876 + }, + { + "epoch": 1.7787254205164653, + "grad_norm": 0.9829670594766973, + "learning_rate": 3.165942439142927e-07, + "loss": 0.6604, + "step": 1877 + }, + { + "epoch": 1.779673063255153, + "grad_norm": 1.0546523994379018, + "learning_rate": 3.1391126739555134e-07, + "loss": 0.6916, + "step": 1878 + }, + { + "epoch": 1.7806207059938404, + "grad_norm": 1.3474084051089132, + "learning_rate": 3.112393392645985e-07, + "loss": 0.7241, + "step": 1879 + }, + { + "epoch": 1.7815683487325278, + "grad_norm": 1.2195465607727498, + "learning_rate": 3.0857846582103504e-07, + "loss": 0.7133, + "step": 1880 + }, + { + "epoch": 1.7825159914712154, + "grad_norm": 1.0574688930127023, + "learning_rate": 3.059286533384021e-07, + "loss": 0.6827, + "step": 1881 + }, + { + "epoch": 1.7834636342099028, + "grad_norm": 0.9882282740053548, + "learning_rate": 3.0328990806415935e-07, + "loss": 0.6634, + "step": 1882 + }, + { + "epoch": 1.7844112769485903, + "grad_norm": 1.6186322230221908, + "learning_rate": 3.006622362196748e-07, + "loss": 0.681, + "step": 1883 + }, + { + "epoch": 1.785358919687278, + "grad_norm": 0.9919895774024126, + "learning_rate": 2.9804564400021e-07, + "loss": 0.6462, + "step": 1884 + }, + { + "epoch": 1.7863065624259655, + "grad_norm": 1.023431960722124, + "learning_rate": 2.9544013757489944e-07, + "loss": 0.6782, + "step": 1885 + }, + { + "epoch": 1.787254205164653, + "grad_norm": 1.4703356457868497, + "learning_rate": 2.92845723086746e-07, + "loss": 0.7125, + "step": 1886 + }, + { + "epoch": 1.7882018479033404, + "grad_norm": 1.044872020512602, + "learning_rate": 2.9026240665259717e-07, + "loss": 0.6705, + "step": 1887 + }, + { + "epoch": 1.789149490642028, + "grad_norm": 1.0114171562637975, + "learning_rate": 2.876901943631372e-07, + "loss": 0.7051, + "step": 1888 + }, + { + "epoch": 1.7900971333807156, + "grad_norm": 0.9319783340627764, + "learning_rate": 2.8512909228286814e-07, + "loss": 0.6933, + "step": 1889 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 1.1744614079136493, + "learning_rate": 2.8257910645009935e-07, + "loss": 0.6957, + "step": 1890 + }, + { + "epoch": 1.7919924188580905, + "grad_norm": 0.9991417606881348, + "learning_rate": 2.8004024287692944e-07, + "loss": 0.7323, + "step": 1891 + }, + { + "epoch": 1.792940061596778, + "grad_norm": 1.0062578000109592, + "learning_rate": 2.7751250754923574e-07, + "loss": 0.6934, + "step": 1892 + }, + { + "epoch": 1.792940061596778, + "eval_loss": 0.9148725867271423, + "eval_runtime": 60.4722, + "eval_samples_per_second": 45.112, + "eval_steps_per_second": 0.711, + "step": 1892 + }, + { + "epoch": 1.7938877043354655, + "grad_norm": 1.0646128012425906, + "learning_rate": 2.7499590642665773e-07, + "loss": 0.6725, + "step": 1893 + }, + { + "epoch": 1.794835347074153, + "grad_norm": 0.9591060924803608, + "learning_rate": 2.724904454425836e-07, + "loss": 0.7088, + "step": 1894 + }, + { + "epoch": 1.7957829898128406, + "grad_norm": 1.0631806918984852, + "learning_rate": 2.699961305041382e-07, + "loss": 0.698, + "step": 1895 + }, + { + "epoch": 1.7967306325515282, + "grad_norm": 1.082828132414062, + "learning_rate": 2.6751296749216395e-07, + "loss": 0.6522, + "step": 1896 + }, + { + "epoch": 1.7976782752902156, + "grad_norm": 1.3644267899875255, + "learning_rate": 2.650409622612138e-07, + "loss": 0.6988, + "step": 1897 + }, + { + "epoch": 1.798625918028903, + "grad_norm": 1.1184995105345898, + "learning_rate": 2.625801206395312e-07, + "loss": 0.6482, + "step": 1898 + }, + { + "epoch": 1.7995735607675907, + "grad_norm": 1.1815217570033627, + "learning_rate": 2.6013044842904233e-07, + "loss": 0.6416, + "step": 1899 + }, + { + "epoch": 1.800521203506278, + "grad_norm": 0.9571223280821131, + "learning_rate": 2.5769195140533556e-07, + "loss": 0.7289, + "step": 1900 + }, + { + "epoch": 1.8014688462449655, + "grad_norm": 0.9341994296803589, + "learning_rate": 2.5526463531765467e-07, + "loss": 0.6686, + "step": 1901 + }, + { + "epoch": 1.8024164889836531, + "grad_norm": 0.9035497177092859, + "learning_rate": 2.528485058888813e-07, + "loss": 0.7046, + "step": 1902 + }, + { + "epoch": 1.8033641317223408, + "grad_norm": 1.0743252934046903, + "learning_rate": 2.5044356881552045e-07, + "loss": 0.7197, + "step": 1903 + }, + { + "epoch": 1.8043117744610282, + "grad_norm": 1.0079857675814266, + "learning_rate": 2.4804982976769197e-07, + "loss": 0.6867, + "step": 1904 + }, + { + "epoch": 1.8052594171997156, + "grad_norm": 1.059517658637514, + "learning_rate": 2.456672943891114e-07, + "loss": 0.6749, + "step": 1905 + }, + { + "epoch": 1.8062070599384032, + "grad_norm": 1.114382931333417, + "learning_rate": 2.4329596829708145e-07, + "loss": 0.6778, + "step": 1906 + }, + { + "epoch": 1.8071547026770909, + "grad_norm": 1.0664257741407888, + "learning_rate": 2.409358570824749e-07, + "loss": 0.7155, + "step": 1907 + }, + { + "epoch": 1.8081023454157783, + "grad_norm": 1.1085710156266713, + "learning_rate": 2.385869663097251e-07, + "loss": 0.712, + "step": 1908 + }, + { + "epoch": 1.8090499881544657, + "grad_norm": 1.0500505818112218, + "learning_rate": 2.362493015168088e-07, + "loss": 0.657, + "step": 1909 + }, + { + "epoch": 1.8099976308931534, + "grad_norm": 1.1320348644595801, + "learning_rate": 2.3392286821523723e-07, + "loss": 0.7458, + "step": 1910 + }, + { + "epoch": 1.8109452736318408, + "grad_norm": 1.1363033352104308, + "learning_rate": 2.316076718900412e-07, + "loss": 0.7449, + "step": 1911 + }, + { + "epoch": 1.8118929163705282, + "grad_norm": 0.9155284671345113, + "learning_rate": 2.2930371799975593e-07, + "loss": 0.6543, + "step": 1912 + }, + { + "epoch": 1.8128405591092158, + "grad_norm": 1.0126916247727809, + "learning_rate": 2.270110119764124e-07, + "loss": 0.7248, + "step": 1913 + }, + { + "epoch": 1.8137882018479035, + "grad_norm": 1.0272524735780668, + "learning_rate": 2.2472955922552164e-07, + "loss": 0.7114, + "step": 1914 + }, + { + "epoch": 1.8137882018479035, + "eval_loss": 0.9147893786430359, + "eval_runtime": 67.7847, + "eval_samples_per_second": 40.245, + "eval_steps_per_second": 0.634, + "step": 1914 + }, + { + "epoch": 1.8147358445865909, + "grad_norm": 1.0985166320474906, + "learning_rate": 2.2245936512606314e-07, + "loss": 0.6455, + "step": 1915 + }, + { + "epoch": 1.8156834873252783, + "grad_norm": 1.0448066285286555, + "learning_rate": 2.202004350304715e-07, + "loss": 0.6757, + "step": 1916 + }, + { + "epoch": 1.816631130063966, + "grad_norm": 1.2157625524349163, + "learning_rate": 2.179527742646248e-07, + "loss": 0.6647, + "step": 1917 + }, + { + "epoch": 1.8175787728026536, + "grad_norm": 0.9580609539987758, + "learning_rate": 2.1571638812783125e-07, + "loss": 0.6307, + "step": 1918 + }, + { + "epoch": 1.8185264155413408, + "grad_norm": 1.0743648151551368, + "learning_rate": 2.1349128189281587e-07, + "loss": 0.7276, + "step": 1919 + }, + { + "epoch": 1.8194740582800284, + "grad_norm": 1.1869777760221731, + "learning_rate": 2.112774608057111e-07, + "loss": 0.7087, + "step": 1920 + }, + { + "epoch": 1.820421701018716, + "grad_norm": 1.0016495300786694, + "learning_rate": 2.0907493008604007e-07, + "loss": 0.6908, + "step": 1921 + }, + { + "epoch": 1.8213693437574034, + "grad_norm": 1.1603016404098847, + "learning_rate": 2.068836949267089e-07, + "loss": 0.6766, + "step": 1922 + }, + { + "epoch": 1.8223169864960909, + "grad_norm": 1.0187010201658582, + "learning_rate": 2.0470376049398944e-07, + "loss": 0.7093, + "step": 1923 + }, + { + "epoch": 1.8232646292347785, + "grad_norm": 1.3203086559264363, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.6744, + "step": 1924 + }, + { + "epoch": 1.8242122719734661, + "grad_norm": 0.9700557122361928, + "learning_rate": 2.003778143402535e-07, + "loss": 0.6905, + "step": 1925 + }, + { + "epoch": 1.8251599147121536, + "grad_norm": 1.3517239993246017, + "learning_rate": 1.9823181281851513e-07, + "loss": 0.6834, + "step": 1926 + }, + { + "epoch": 1.826107557450841, + "grad_norm": 1.02906230765169, + "learning_rate": 1.960971324219263e-07, + "loss": 0.7265, + "step": 1927 + }, + { + "epoch": 1.8270552001895286, + "grad_norm": 1.0585910384244444, + "learning_rate": 1.9397377818341945e-07, + "loss": 0.6877, + "step": 1928 + }, + { + "epoch": 1.828002842928216, + "grad_norm": 0.9475218916837848, + "learning_rate": 1.9186175510922666e-07, + "loss": 0.7416, + "step": 1929 + }, + { + "epoch": 1.8289504856669034, + "grad_norm": 1.0203159431279483, + "learning_rate": 1.8976106817886197e-07, + "loss": 0.714, + "step": 1930 + }, + { + "epoch": 1.829898128405591, + "grad_norm": 1.127014657550726, + "learning_rate": 1.876717223451141e-07, + "loss": 0.7112, + "step": 1931 + }, + { + "epoch": 1.8308457711442787, + "grad_norm": 1.353971401648652, + "learning_rate": 1.8559372253403152e-07, + "loss": 0.714, + "step": 1932 + }, + { + "epoch": 1.8317934138829661, + "grad_norm": 1.0446112826008636, + "learning_rate": 1.8352707364491352e-07, + "loss": 0.6958, + "step": 1933 + }, + { + "epoch": 1.8327410566216535, + "grad_norm": 1.0577797650889291, + "learning_rate": 1.814717805502958e-07, + "loss": 0.736, + "step": 1934 + }, + { + "epoch": 1.8336886993603412, + "grad_norm": 1.2894072021725473, + "learning_rate": 1.794278480959416e-07, + "loss": 0.7035, + "step": 1935 + }, + { + "epoch": 1.8346363420990288, + "grad_norm": 1.0041014130507704, + "learning_rate": 1.7739528110083003e-07, + "loss": 0.6661, + "step": 1936 + }, + { + "epoch": 1.8346363420990288, + "eval_loss": 0.9146416783332825, + "eval_runtime": 66.1092, + "eval_samples_per_second": 41.265, + "eval_steps_per_second": 0.65, + "step": 1936 + }, + { + "epoch": 1.8355839848377162, + "grad_norm": 1.2009141696461565, + "learning_rate": 1.7537408435714054e-07, + "loss": 0.698, + "step": 1937 + }, + { + "epoch": 1.8365316275764036, + "grad_norm": 1.0557423818039546, + "learning_rate": 1.7336426263024896e-07, + "loss": 0.6599, + "step": 1938 + }, + { + "epoch": 1.8374792703150913, + "grad_norm": 1.1355253989528187, + "learning_rate": 1.7136582065870876e-07, + "loss": 0.7389, + "step": 1939 + }, + { + "epoch": 1.8384269130537787, + "grad_norm": 0.947567680408928, + "learning_rate": 1.6937876315424707e-07, + "loss": 0.6902, + "step": 1940 + }, + { + "epoch": 1.8393745557924661, + "grad_norm": 1.0469689776321616, + "learning_rate": 1.6740309480174633e-07, + "loss": 0.6955, + "step": 1941 + }, + { + "epoch": 1.8403221985311538, + "grad_norm": 1.0956794304972344, + "learning_rate": 1.6543882025923884e-07, + "loss": 0.7019, + "step": 1942 + }, + { + "epoch": 1.8412698412698414, + "grad_norm": 0.9499513224862992, + "learning_rate": 1.6348594415789286e-07, + "loss": 0.7197, + "step": 1943 + }, + { + "epoch": 1.8422174840085288, + "grad_norm": 1.0290878955410225, + "learning_rate": 1.6154447110200256e-07, + "loss": 0.6963, + "step": 1944 + }, + { + "epoch": 1.8431651267472162, + "grad_norm": 1.084819106183627, + "learning_rate": 1.5961440566897913e-07, + "loss": 0.6618, + "step": 1945 + }, + { + "epoch": 1.8441127694859039, + "grad_norm": 0.9712359809714644, + "learning_rate": 1.5769575240933422e-07, + "loss": 0.7188, + "step": 1946 + }, + { + "epoch": 1.8450604122245915, + "grad_norm": 1.053958386842615, + "learning_rate": 1.5578851584667654e-07, + "loss": 0.6487, + "step": 1947 + }, + { + "epoch": 1.8460080549632787, + "grad_norm": 1.0254427828467692, + "learning_rate": 1.5389270047769578e-07, + "loss": 0.7443, + "step": 1948 + }, + { + "epoch": 1.8469556977019663, + "grad_norm": 1.0100347718010834, + "learning_rate": 1.520083107721543e-07, + "loss": 0.7099, + "step": 1949 + }, + { + "epoch": 1.847903340440654, + "grad_norm": 1.0591527642673004, + "learning_rate": 1.5013535117287648e-07, + "loss": 0.7101, + "step": 1950 + }, + { + "epoch": 1.8488509831793414, + "grad_norm": 0.9196055199586443, + "learning_rate": 1.482738260957378e-07, + "loss": 0.6968, + "step": 1951 + }, + { + "epoch": 1.8497986259180288, + "grad_norm": 1.082437628745268, + "learning_rate": 1.4642373992965365e-07, + "loss": 0.6848, + "step": 1952 + }, + { + "epoch": 1.8507462686567164, + "grad_norm": 1.020089742858143, + "learning_rate": 1.4458509703657197e-07, + "loss": 0.7327, + "step": 1953 + }, + { + "epoch": 1.851693911395404, + "grad_norm": 1.138043400664436, + "learning_rate": 1.427579017514591e-07, + "loss": 0.7166, + "step": 1954 + }, + { + "epoch": 1.8526415541340915, + "grad_norm": 1.034045450725446, + "learning_rate": 1.4094215838229176e-07, + "loss": 0.7585, + "step": 1955 + }, + { + "epoch": 1.853589196872779, + "grad_norm": 1.7484699081096364, + "learning_rate": 1.3913787121004717e-07, + "loss": 0.6699, + "step": 1956 + }, + { + "epoch": 1.8545368396114665, + "grad_norm": 1.1518516482534196, + "learning_rate": 1.3734504448869147e-07, + "loss": 0.7528, + "step": 1957 + }, + { + "epoch": 1.855484482350154, + "grad_norm": 1.2727867307662963, + "learning_rate": 1.3556368244517116e-07, + "loss": 0.7042, + "step": 1958 + }, + { + "epoch": 1.855484482350154, + "eval_loss": 0.9145249128341675, + "eval_runtime": 68.0932, + "eval_samples_per_second": 40.063, + "eval_steps_per_second": 0.631, + "step": 1958 + }, + { + "epoch": 1.8564321250888414, + "grad_norm": 0.9279042520126958, + "learning_rate": 1.3379378927940167e-07, + "loss": 0.7096, + "step": 1959 + }, + { + "epoch": 1.857379767827529, + "grad_norm": 0.8860547537873784, + "learning_rate": 1.3203536916425842e-07, + "loss": 0.6665, + "step": 1960 + }, + { + "epoch": 1.8583274105662166, + "grad_norm": 0.9885824244176509, + "learning_rate": 1.3028842624556893e-07, + "loss": 0.6769, + "step": 1961 + }, + { + "epoch": 1.859275053304904, + "grad_norm": 1.1914468020222337, + "learning_rate": 1.2855296464209687e-07, + "loss": 0.6548, + "step": 1962 + }, + { + "epoch": 1.8602226960435915, + "grad_norm": 1.2643990216476162, + "learning_rate": 1.2682898844554093e-07, + "loss": 0.7257, + "step": 1963 + }, + { + "epoch": 1.861170338782279, + "grad_norm": 0.9779836857682124, + "learning_rate": 1.2511650172051636e-07, + "loss": 0.6888, + "step": 1964 + }, + { + "epoch": 1.8621179815209667, + "grad_norm": 0.9438092146007916, + "learning_rate": 1.2341550850455353e-07, + "loss": 0.6962, + "step": 1965 + }, + { + "epoch": 1.8630656242596542, + "grad_norm": 1.24403617918052, + "learning_rate": 1.217260128080816e-07, + "loss": 0.733, + "step": 1966 + }, + { + "epoch": 1.8640132669983416, + "grad_norm": 0.9517426709373272, + "learning_rate": 1.2004801861442373e-07, + "loss": 0.7037, + "step": 1967 + }, + { + "epoch": 1.8649609097370292, + "grad_norm": 1.1778147961281975, + "learning_rate": 1.183815298797858e-07, + "loss": 0.7429, + "step": 1968 + }, + { + "epoch": 1.8659085524757166, + "grad_norm": 1.104468634647898, + "learning_rate": 1.1672655053324655e-07, + "loss": 0.712, + "step": 1969 + }, + { + "epoch": 1.866856195214404, + "grad_norm": 1.0425031543126493, + "learning_rate": 1.1508308447674977e-07, + "loss": 0.7324, + "step": 1970 + }, + { + "epoch": 1.8678038379530917, + "grad_norm": 1.0608934372919678, + "learning_rate": 1.1345113558509424e-07, + "loss": 0.7224, + "step": 1971 + }, + { + "epoch": 1.8687514806917793, + "grad_norm": 0.9785171863226142, + "learning_rate": 1.1183070770592442e-07, + "loss": 0.7362, + "step": 1972 + }, + { + "epoch": 1.8696991234304667, + "grad_norm": 1.0041114112851635, + "learning_rate": 1.1022180465972198e-07, + "loss": 0.7145, + "step": 1973 + }, + { + "epoch": 1.8706467661691542, + "grad_norm": 1.0344356033639341, + "learning_rate": 1.0862443023979651e-07, + "loss": 0.6638, + "step": 1974 + }, + { + "epoch": 1.8715944089078418, + "grad_norm": 1.1440137970112205, + "learning_rate": 1.0703858821227541e-07, + "loss": 0.72, + "step": 1975 + }, + { + "epoch": 1.8725420516465292, + "grad_norm": 1.011318166969755, + "learning_rate": 1.0546428231609896e-07, + "loss": 0.7001, + "step": 1976 + }, + { + "epoch": 1.8734896943852166, + "grad_norm": 1.1534751759952744, + "learning_rate": 1.0390151626300527e-07, + "loss": 0.7046, + "step": 1977 + }, + { + "epoch": 1.8744373371239043, + "grad_norm": 1.090411632798464, + "learning_rate": 1.0235029373752758e-07, + "loss": 0.6901, + "step": 1978 + }, + { + "epoch": 1.875384979862592, + "grad_norm": 1.07336526868862, + "learning_rate": 1.0081061839698259e-07, + "loss": 0.723, + "step": 1979 + }, + { + "epoch": 1.8763326226012793, + "grad_norm": 1.0079673427741604, + "learning_rate": 9.928249387145983e-08, + "loss": 0.6381, + "step": 1980 + }, + { + "epoch": 1.8763326226012793, + "eval_loss": 0.9143509268760681, + "eval_runtime": 64.728, + "eval_samples_per_second": 42.146, + "eval_steps_per_second": 0.664, + "step": 1980 + }, + { + "epoch": 1.8772802653399667, + "grad_norm": 1.1089070458655792, + "learning_rate": 9.776592376381955e-08, + "loss": 0.684, + "step": 1981 + }, + { + "epoch": 1.8782279080786544, + "grad_norm": 1.0295711500308624, + "learning_rate": 9.626091164967599e-08, + "loss": 0.6981, + "step": 1982 + }, + { + "epoch": 1.879175550817342, + "grad_norm": 1.1306401873193062, + "learning_rate": 9.476746107739577e-08, + "loss": 0.6419, + "step": 1983 + }, + { + "epoch": 1.8801231935560294, + "grad_norm": 1.0143892851114753, + "learning_rate": 9.32855755680867e-08, + "loss": 0.7298, + "step": 1984 + }, + { + "epoch": 1.8810708362947168, + "grad_norm": 1.200139953828078, + "learning_rate": 9.181525861558849e-08, + "loss": 0.7129, + "step": 1985 + }, + { + "epoch": 1.8820184790334045, + "grad_norm": 1.1828188126049186, + "learning_rate": 9.035651368646647e-08, + "loss": 0.6156, + "step": 1986 + }, + { + "epoch": 1.8829661217720919, + "grad_norm": 0.9893546670383357, + "learning_rate": 8.89093442200023e-08, + "loss": 0.6687, + "step": 1987 + }, + { + "epoch": 1.8839137645107793, + "grad_norm": 1.3094641319150206, + "learning_rate": 8.747375362818667e-08, + "loss": 0.6492, + "step": 1988 + }, + { + "epoch": 1.884861407249467, + "grad_norm": 0.9994122322268034, + "learning_rate": 8.604974529571042e-08, + "loss": 0.6722, + "step": 1989 + }, + { + "epoch": 1.8858090499881546, + "grad_norm": 1.0426017376290013, + "learning_rate": 8.463732257995571e-08, + "loss": 0.727, + "step": 1990 + }, + { + "epoch": 1.886756692726842, + "grad_norm": 1.2180348697029497, + "learning_rate": 8.323648881099211e-08, + "loss": 0.7231, + "step": 1991 + }, + { + "epoch": 1.8877043354655294, + "grad_norm": 1.0642272839244786, + "learning_rate": 8.184724729156379e-08, + "loss": 0.7435, + "step": 1992 + }, + { + "epoch": 1.888651978204217, + "grad_norm": 0.9269037955466151, + "learning_rate": 8.046960129708348e-08, + "loss": 0.6886, + "step": 1993 + }, + { + "epoch": 1.8895996209429047, + "grad_norm": 0.9229485938696481, + "learning_rate": 7.910355407562742e-08, + "loss": 0.7321, + "step": 1994 + }, + { + "epoch": 1.890547263681592, + "grad_norm": 1.0976428013397825, + "learning_rate": 7.774910884792319e-08, + "loss": 0.7073, + "step": 1995 + }, + { + "epoch": 1.8914949064202795, + "grad_norm": 0.9509585098963864, + "learning_rate": 7.640626880734581e-08, + "loss": 0.6638, + "step": 1996 + }, + { + "epoch": 1.8924425491589671, + "grad_norm": 1.0689281848692378, + "learning_rate": 7.507503711990771e-08, + "loss": 0.6994, + "step": 1997 + }, + { + "epoch": 1.8933901918976546, + "grad_norm": 0.9440594653903914, + "learning_rate": 7.375541692425325e-08, + "loss": 0.7163, + "step": 1998 + }, + { + "epoch": 1.894337834636342, + "grad_norm": 1.231877807532716, + "learning_rate": 7.244741133164979e-08, + "loss": 0.7402, + "step": 1999 + }, + { + "epoch": 1.8952854773750296, + "grad_norm": 1.021884925509466, + "learning_rate": 7.115102342598101e-08, + "loss": 0.7227, + "step": 2000 + }, + { + "epoch": 1.8962331201137173, + "grad_norm": 1.003242355792749, + "learning_rate": 6.986625626373978e-08, + "loss": 0.7284, + "step": 2001 + }, + { + "epoch": 1.8971807628524047, + "grad_norm": 1.0311894967892232, + "learning_rate": 6.859311287402081e-08, + "loss": 0.745, + "step": 2002 + }, + { + "epoch": 1.8971807628524047, + "eval_loss": 0.9144185185432434, + "eval_runtime": 63.7067, + "eval_samples_per_second": 42.821, + "eval_steps_per_second": 0.675, + "step": 2002 + }, + { + "epoch": 1.898128405591092, + "grad_norm": 0.933859188399113, + "learning_rate": 6.733159625851304e-08, + "loss": 0.7088, + "step": 2003 + }, + { + "epoch": 1.8990760483297797, + "grad_norm": 0.9503318061850076, + "learning_rate": 6.608170939149283e-08, + "loss": 0.666, + "step": 2004 + }, + { + "epoch": 1.9000236910684671, + "grad_norm": 1.028500893291808, + "learning_rate": 6.48434552198185e-08, + "loss": 0.646, + "step": 2005 + }, + { + "epoch": 1.9009713338071546, + "grad_norm": 1.1161543459809187, + "learning_rate": 6.361683666291973e-08, + "loss": 0.6776, + "step": 2006 + }, + { + "epoch": 1.9019189765458422, + "grad_norm": 1.0620526873003586, + "learning_rate": 6.240185661279541e-08, + "loss": 0.7249, + "step": 2007 + }, + { + "epoch": 1.9028666192845298, + "grad_norm": 1.209070343804771, + "learning_rate": 6.119851793400188e-08, + "loss": 0.6793, + "step": 2008 + }, + { + "epoch": 1.9038142620232172, + "grad_norm": 0.9596501498455559, + "learning_rate": 6.000682346365084e-08, + "loss": 0.6838, + "step": 2009 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 1.0459464382958554, + "learning_rate": 5.882677601139919e-08, + "loss": 0.6883, + "step": 2010 + }, + { + "epoch": 1.9057095475005923, + "grad_norm": 1.3695482020370489, + "learning_rate": 5.7658378359443104e-08, + "loss": 0.721, + "step": 2011 + }, + { + "epoch": 1.90665719023928, + "grad_norm": 1.056846758068541, + "learning_rate": 5.6501633262513454e-08, + "loss": 0.6852, + "step": 2012 + }, + { + "epoch": 1.9076048329779673, + "grad_norm": 1.0176824003025227, + "learning_rate": 5.535654344786756e-08, + "loss": 0.6672, + "step": 2013 + }, + { + "epoch": 1.9085524757166548, + "grad_norm": 1.186973736079735, + "learning_rate": 5.4223111615281935e-08, + "loss": 0.7098, + "step": 2014 + }, + { + "epoch": 1.9095001184553424, + "grad_norm": 1.0942365940476848, + "learning_rate": 5.310134043704895e-08, + "loss": 0.7302, + "step": 2015 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 1.1024694043360106, + "learning_rate": 5.1991232557966344e-08, + "loss": 0.689, + "step": 2016 + }, + { + "epoch": 1.9113954039327172, + "grad_norm": 0.9659220513288693, + "learning_rate": 5.089279059533658e-08, + "loss": 0.6806, + "step": 2017 + }, + { + "epoch": 1.9123430466714049, + "grad_norm": 1.0448744385892244, + "learning_rate": 4.9806017138953053e-08, + "loss": 0.7005, + "step": 2018 + }, + { + "epoch": 1.9132906894100925, + "grad_norm": 1.2099073891780832, + "learning_rate": 4.873091475110281e-08, + "loss": 0.701, + "step": 2019 + }, + { + "epoch": 1.91423833214878, + "grad_norm": 1.144622221873965, + "learning_rate": 4.766748596655268e-08, + "loss": 0.7041, + "step": 2020 + }, + { + "epoch": 1.9151859748874673, + "grad_norm": 1.0628184982525557, + "learning_rate": 4.66157332925482e-08, + "loss": 0.6452, + "step": 2021 + }, + { + "epoch": 1.916133617626155, + "grad_norm": 1.0240120660236798, + "learning_rate": 4.55756592088058e-08, + "loss": 0.6759, + "step": 2022 + }, + { + "epoch": 1.9170812603648426, + "grad_norm": 1.0346055793091427, + "learning_rate": 4.4547266167507264e-08, + "loss": 0.6916, + "step": 2023 + }, + { + "epoch": 1.9180289031035298, + "grad_norm": 1.0007416620575995, + "learning_rate": 4.3530556593294194e-08, + "loss": 0.704, + "step": 2024 + }, + { + "epoch": 1.9180289031035298, + "eval_loss": 0.9143268465995789, + "eval_runtime": 66.9808, + "eval_samples_per_second": 40.728, + "eval_steps_per_second": 0.642, + "step": 2024 + }, + { + "epoch": 1.9189765458422174, + "grad_norm": 1.1078567701211084, + "learning_rate": 4.2525532883261886e-08, + "loss": 0.6868, + "step": 2025 + }, + { + "epoch": 1.919924188580905, + "grad_norm": 1.107371658240934, + "learning_rate": 4.1532197406954357e-08, + "loss": 0.6794, + "step": 2026 + }, + { + "epoch": 1.9208718313195925, + "grad_norm": 1.1152223254534468, + "learning_rate": 4.0550552506357646e-08, + "loss": 0.6733, + "step": 2027 + }, + { + "epoch": 1.92181947405828, + "grad_norm": 0.9814552628881259, + "learning_rate": 3.958060049589485e-08, + "loss": 0.6746, + "step": 2028 + }, + { + "epoch": 1.9227671167969675, + "grad_norm": 1.1148524114158356, + "learning_rate": 3.862234366242168e-08, + "loss": 0.6809, + "step": 2029 + }, + { + "epoch": 1.9237147595356552, + "grad_norm": 1.1264125552667186, + "learning_rate": 3.767578426521923e-08, + "loss": 0.6624, + "step": 2030 + }, + { + "epoch": 1.9246624022743426, + "grad_norm": 0.8821749884652544, + "learning_rate": 3.674092453598954e-08, + "loss": 0.7492, + "step": 2031 + }, + { + "epoch": 1.92561004501303, + "grad_norm": 0.970645505659204, + "learning_rate": 3.581776667885062e-08, + "loss": 0.6561, + "step": 2032 + }, + { + "epoch": 1.9265576877517177, + "grad_norm": 1.1049159464316436, + "learning_rate": 3.4906312870331973e-08, + "loss": 0.7361, + "step": 2033 + }, + { + "epoch": 1.927505330490405, + "grad_norm": 1.0950647602834038, + "learning_rate": 3.40065652593663e-08, + "loss": 0.7006, + "step": 2034 + }, + { + "epoch": 1.9284529732290925, + "grad_norm": 1.0452185514606323, + "learning_rate": 3.311852596728948e-08, + "loss": 0.7276, + "step": 2035 + }, + { + "epoch": 1.9294006159677801, + "grad_norm": 1.1747706021025806, + "learning_rate": 3.2242197087828944e-08, + "loss": 0.691, + "step": 2036 + }, + { + "epoch": 1.9303482587064678, + "grad_norm": 1.2178046074293865, + "learning_rate": 3.137758068710694e-08, + "loss": 0.6611, + "step": 2037 + }, + { + "epoch": 1.9312959014451552, + "grad_norm": 0.9619015695423282, + "learning_rate": 3.052467880362675e-08, + "loss": 0.6696, + "step": 2038 + }, + { + "epoch": 1.9322435441838426, + "grad_norm": 1.157318529310348, + "learning_rate": 2.9683493448275925e-08, + "loss": 0.681, + "step": 2039 + }, + { + "epoch": 1.9331911869225302, + "grad_norm": 1.0061721079680708, + "learning_rate": 2.8854026604315798e-08, + "loss": 0.6822, + "step": 2040 + }, + { + "epoch": 1.9341388296612179, + "grad_norm": 1.0526406685824896, + "learning_rate": 2.8036280227379808e-08, + "loss": 0.6901, + "step": 2041 + }, + { + "epoch": 1.9350864723999053, + "grad_norm": 0.968442328631709, + "learning_rate": 2.723025624546849e-08, + "loss": 0.6801, + "step": 2042 + }, + { + "epoch": 1.9360341151385927, + "grad_norm": 0.9496600506704314, + "learning_rate": 2.6435956558943375e-08, + "loss": 0.7094, + "step": 2043 + }, + { + "epoch": 1.9369817578772803, + "grad_norm": 0.9299456148661794, + "learning_rate": 2.5653383040524228e-08, + "loss": 0.6135, + "step": 2044 + }, + { + "epoch": 1.9379294006159677, + "grad_norm": 0.9516242679649526, + "learning_rate": 2.488253753528458e-08, + "loss": 0.7323, + "step": 2045 + }, + { + "epoch": 1.9388770433546552, + "grad_norm": 1.076415703445146, + "learning_rate": 2.4123421860645646e-08, + "loss": 0.7032, + "step": 2046 + }, + { + "epoch": 1.9388770433546552, + "eval_loss": 0.9144385457038879, + "eval_runtime": 67.2, + "eval_samples_per_second": 40.595, + "eval_steps_per_second": 0.64, + "step": 2046 + }, + { + "epoch": 1.9398246860933428, + "grad_norm": 2.5186162393004623, + "learning_rate": 2.3376037806374097e-08, + "loss": 0.714, + "step": 2047 + }, + { + "epoch": 1.9407723288320304, + "grad_norm": 1.008486947731255, + "learning_rate": 2.264038713457706e-08, + "loss": 0.6927, + "step": 2048 + }, + { + "epoch": 1.9417199715707179, + "grad_norm": 0.9617322705423318, + "learning_rate": 2.1916471579697117e-08, + "loss": 0.691, + "step": 2049 + }, + { + "epoch": 1.9426676143094053, + "grad_norm": 0.9939767442168306, + "learning_rate": 2.1204292848509557e-08, + "loss": 0.7493, + "step": 2050 + }, + { + "epoch": 1.943615257048093, + "grad_norm": 0.9853441684157483, + "learning_rate": 2.050385262011789e-08, + "loss": 0.6878, + "step": 2051 + }, + { + "epoch": 1.9445628997867805, + "grad_norm": 1.0326884095472177, + "learning_rate": 1.98151525459489e-08, + "loss": 0.7353, + "step": 2052 + }, + { + "epoch": 1.9455105425254677, + "grad_norm": 1.0951236722722288, + "learning_rate": 1.9138194249750386e-08, + "loss": 0.6593, + "step": 2053 + }, + { + "epoch": 1.9464581852641554, + "grad_norm": 1.1769494445073123, + "learning_rate": 1.8472979327587292e-08, + "loss": 0.6544, + "step": 2054 + }, + { + "epoch": 1.947405828002843, + "grad_norm": 1.0395960653177918, + "learning_rate": 1.781950934783505e-08, + "loss": 0.7335, + "step": 2055 + }, + { + "epoch": 1.9483534707415304, + "grad_norm": 1.0211404503318975, + "learning_rate": 1.7177785851180127e-08, + "loss": 0.7096, + "step": 2056 + }, + { + "epoch": 1.9493011134802178, + "grad_norm": 1.2278758807843733, + "learning_rate": 1.654781035061337e-08, + "loss": 0.6919, + "step": 2057 + }, + { + "epoch": 1.9502487562189055, + "grad_norm": 1.1176872580144226, + "learning_rate": 1.5929584331427218e-08, + "loss": 0.6904, + "step": 2058 + }, + { + "epoch": 1.9511963989575931, + "grad_norm": 1.2525283554104196, + "learning_rate": 1.532310925121294e-08, + "loss": 0.733, + "step": 2059 + }, + { + "epoch": 1.9521440416962805, + "grad_norm": 1.0083196464834074, + "learning_rate": 1.4728386539856754e-08, + "loss": 0.6684, + "step": 2060 + }, + { + "epoch": 1.953091684434968, + "grad_norm": 0.9670996702496595, + "learning_rate": 1.4145417599534805e-08, + "loss": 0.6882, + "step": 2061 + }, + { + "epoch": 1.9540393271736556, + "grad_norm": 2.2195094052226754, + "learning_rate": 1.3574203804713748e-08, + "loss": 0.7289, + "step": 2062 + }, + { + "epoch": 1.954986969912343, + "grad_norm": 1.0493072981480793, + "learning_rate": 1.3014746502142962e-08, + "loss": 0.7345, + "step": 2063 + }, + { + "epoch": 1.9559346126510304, + "grad_norm": 1.1402015478284233, + "learning_rate": 1.2467047010855659e-08, + "loss": 0.7084, + "step": 2064 + }, + { + "epoch": 1.956882255389718, + "grad_norm": 1.028808624981634, + "learning_rate": 1.1931106622161127e-08, + "loss": 0.7665, + "step": 2065 + }, + { + "epoch": 1.9578298981284057, + "grad_norm": 0.9887477552469968, + "learning_rate": 1.1406926599646373e-08, + "loss": 0.7191, + "step": 2066 + }, + { + "epoch": 1.958777540867093, + "grad_norm": 1.093798325618811, + "learning_rate": 1.0894508179170038e-08, + "loss": 0.6972, + "step": 2067 + }, + { + "epoch": 1.9597251836057805, + "grad_norm": 0.9804042235247051, + "learning_rate": 1.0393852568860718e-08, + "loss": 0.7328, + "step": 2068 + }, + { + "epoch": 1.9597251836057805, + "eval_loss": 0.914358913898468, + "eval_runtime": 68.5205, + "eval_samples_per_second": 39.813, + "eval_steps_per_second": 0.628, + "step": 2068 + }, + { + "epoch": 1.9606728263444682, + "grad_norm": 1.2343380039639555, + "learning_rate": 9.904960949114195e-09, + "loss": 0.7172, + "step": 2069 + }, + { + "epoch": 1.9616204690831558, + "grad_norm": 1.0811426709912628, + "learning_rate": 9.427834472588992e-09, + "loss": 0.721, + "step": 2070 + }, + { + "epoch": 1.9625681118218432, + "grad_norm": 1.3595974625306733, + "learning_rate": 8.962474264206378e-09, + "loss": 0.7008, + "step": 2071 + }, + { + "epoch": 1.9635157545605306, + "grad_norm": 1.0348382536874432, + "learning_rate": 8.508881421145366e-09, + "loss": 0.7039, + "step": 2072 + }, + { + "epoch": 1.9644633972992183, + "grad_norm": 1.2153800966637154, + "learning_rate": 8.067057012842161e-09, + "loss": 0.7865, + "step": 2073 + }, + { + "epoch": 1.9654110400379057, + "grad_norm": 0.9289644211293113, + "learning_rate": 7.637002080985167e-09, + "loss": 0.7485, + "step": 2074 + }, + { + "epoch": 1.966358682776593, + "grad_norm": 0.9661916287589752, + "learning_rate": 7.218717639514983e-09, + "loss": 0.666, + "step": 2075 + }, + { + "epoch": 1.9673063255152807, + "grad_norm": 0.9787686202940714, + "learning_rate": 6.81220467461996e-09, + "loss": 0.7334, + "step": 2076 + }, + { + "epoch": 1.9682539682539684, + "grad_norm": 0.9964620209319142, + "learning_rate": 6.417464144736208e-09, + "loss": 0.67, + "step": 2077 + }, + { + "epoch": 1.9692016109926558, + "grad_norm": 1.030464390987384, + "learning_rate": 6.034496980542037e-09, + "loss": 0.705, + "step": 2078 + }, + { + "epoch": 1.9701492537313432, + "grad_norm": 1.1369647103723564, + "learning_rate": 5.6633040849601865e-09, + "loss": 0.7031, + "step": 2079 + }, + { + "epoch": 1.9710968964700308, + "grad_norm": 1.17782549137388, + "learning_rate": 5.303886333151154e-09, + "loss": 0.7033, + "step": 2080 + }, + { + "epoch": 1.9720445392087185, + "grad_norm": 1.123167425449723, + "learning_rate": 4.956244572513203e-09, + "loss": 0.7173, + "step": 2081 + }, + { + "epoch": 1.9729921819474057, + "grad_norm": 0.9688837536293665, + "learning_rate": 4.620379622682358e-09, + "loss": 0.7403, + "step": 2082 + }, + { + "epoch": 1.9739398246860933, + "grad_norm": 1.234375631478768, + "learning_rate": 4.296292275526859e-09, + "loss": 0.6677, + "step": 2083 + }, + { + "epoch": 1.974887467424781, + "grad_norm": 1.0712274738868512, + "learning_rate": 3.983983295146599e-09, + "loss": 0.68, + "step": 2084 + }, + { + "epoch": 1.9758351101634684, + "grad_norm": 1.0128865650855263, + "learning_rate": 3.6834534178725734e-09, + "loss": 0.7062, + "step": 2085 + }, + { + "epoch": 1.9767827529021558, + "grad_norm": 1.1056609912623454, + "learning_rate": 3.394703352263551e-09, + "loss": 0.6977, + "step": 2086 + }, + { + "epoch": 1.9777303956408434, + "grad_norm": 0.9556835224953595, + "learning_rate": 3.117733779105514e-09, + "loss": 0.7046, + "step": 2087 + }, + { + "epoch": 1.978678038379531, + "grad_norm": 0.9749594888727722, + "learning_rate": 2.8525453514099966e-09, + "loss": 0.6734, + "step": 2088 + }, + { + "epoch": 1.9796256811182185, + "grad_norm": 1.0169544561219785, + "learning_rate": 2.5991386944107524e-09, + "loss": 0.7337, + "step": 2089 + }, + { + "epoch": 1.9805733238569059, + "grad_norm": 1.0181317147660969, + "learning_rate": 2.3575144055643094e-09, + "loss": 0.6938, + "step": 2090 + }, + { + "epoch": 1.9805733238569059, + "eval_loss": 0.914404034614563, + "eval_runtime": 63.6123, + "eval_samples_per_second": 42.885, + "eval_steps_per_second": 0.676, + "step": 2090 + }, + { + "epoch": 1.9815209665955935, + "grad_norm": 1.1555804337504532, + "learning_rate": 2.1276730545488623e-09, + "loss": 0.6347, + "step": 2091 + }, + { + "epoch": 1.982468609334281, + "grad_norm": 1.1894163816979972, + "learning_rate": 1.9096151832609378e-09, + "loss": 0.7038, + "step": 2092 + }, + { + "epoch": 1.9834162520729683, + "grad_norm": 1.045940279368652, + "learning_rate": 1.703341305815398e-09, + "loss": 0.7356, + "step": 2093 + }, + { + "epoch": 1.984363894811656, + "grad_norm": 1.016015756301957, + "learning_rate": 1.5088519085437736e-09, + "loss": 0.6858, + "step": 2094 + }, + { + "epoch": 1.9853115375503436, + "grad_norm": 1.0947801703603017, + "learning_rate": 1.326147449993709e-09, + "loss": 0.6883, + "step": 2095 + }, + { + "epoch": 1.986259180289031, + "grad_norm": 0.960911476029797, + "learning_rate": 1.1552283609272962e-09, + "loss": 0.6925, + "step": 2096 + }, + { + "epoch": 1.9872068230277184, + "grad_norm": 0.9695849235179069, + "learning_rate": 9.96095044320522e-10, + "loss": 0.7557, + "step": 2097 + }, + { + "epoch": 1.988154465766406, + "grad_norm": 0.9926261452237545, + "learning_rate": 8.487478753615997e-10, + "loss": 0.6702, + "step": 2098 + }, + { + "epoch": 1.9891021085050937, + "grad_norm": 0.9680825511217606, + "learning_rate": 7.131872014509711e-10, + "loss": 0.6212, + "step": 2099 + }, + { + "epoch": 1.9900497512437811, + "grad_norm": 1.1257192398766538, + "learning_rate": 5.894133422001957e-10, + "loss": 0.7271, + "step": 2100 + }, + { + "epoch": 1.9909973939824686, + "grad_norm": 1.0549297797609254, + "learning_rate": 4.774265894302854e-10, + "loss": 0.6968, + "step": 2101 + }, + { + "epoch": 1.9919450367211562, + "grad_norm": 1.0063870271651199, + "learning_rate": 3.772272071722594e-10, + "loss": 0.7127, + "step": 2102 + }, + { + "epoch": 1.9928926794598436, + "grad_norm": 0.9955451047912606, + "learning_rate": 2.888154316671443e-10, + "loss": 0.6535, + "step": 2103 + }, + { + "epoch": 1.993840322198531, + "grad_norm": 1.0303767113264195, + "learning_rate": 2.1219147136264383e-10, + "loss": 0.7685, + "step": 2104 + }, + { + "epoch": 1.9947879649372187, + "grad_norm": 0.9819925019424903, + "learning_rate": 1.473555069148036e-10, + "loss": 0.6488, + "step": 2105 + }, + { + "epoch": 1.9957356076759063, + "grad_norm": 1.7129723621352664, + "learning_rate": 9.43076911874563e-11, + "loss": 0.7282, + "step": 2106 + }, + { + "epoch": 1.9966832504145937, + "grad_norm": 1.0109896100493034, + "learning_rate": 5.3048149251111456e-11, + "loss": 0.6369, + "step": 2107 + }, + { + "epoch": 1.9976308931532811, + "grad_norm": 0.9652839280997483, + "learning_rate": 2.3576978384065585e-11, + "loss": 0.731, + "step": 2108 + }, + { + "epoch": 1.9985785358919688, + "grad_norm": 1.1017969508811174, + "learning_rate": 5.8942480701817965e-12, + "loss": 0.6985, + "step": 2109 + }, + { + "epoch": 1.9995261786306564, + "grad_norm": 1.030266403397907, + "learning_rate": 0.0, + "loss": 0.6768, + "step": 2110 + }, + { + "epoch": 1.9995261786306564, + "step": 2110, + "total_flos": 7068238416445440.0, + "train_loss": 0.820082382003278, + "train_runtime": 57795.8559, + "train_samples_per_second": 9.348, + "train_steps_per_second": 0.037 + } + ], + "logging_steps": 1.0, + "max_steps": 2110, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "total_flos": 7068238416445440.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}