{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.983277591973244, "eval_steps": 37, "global_step": 222, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013377926421404682, "grad_norm": 7.5698137283325195, "learning_rate": 1.3333333333333332e-06, "loss": 2.1804, "step": 1 }, { "epoch": 0.026755852842809364, "grad_norm": 7.3767266273498535, "learning_rate": 2.6666666666666664e-06, "loss": 2.1927, "step": 2 }, { "epoch": 0.04013377926421405, "grad_norm": 7.829778671264648, "learning_rate": 4e-06, "loss": 2.3279, "step": 3 }, { "epoch": 0.05351170568561873, "grad_norm": 2.793825626373291, "learning_rate": 5.333333333333333e-06, "loss": 1.9777, "step": 4 }, { "epoch": 0.06688963210702341, "grad_norm": 1.4661837816238403, "learning_rate": 6.666666666666667e-06, "loss": 1.8485, "step": 5 }, { "epoch": 0.0802675585284281, "grad_norm": 1.2292248010635376, "learning_rate": 8e-06, "loss": 1.9523, "step": 6 }, { "epoch": 0.09364548494983277, "grad_norm": 1.240803599357605, "learning_rate": 7.99957692770843e-06, "loss": 1.9104, "step": 7 }, { "epoch": 0.10702341137123746, "grad_norm": 0.8672861456871033, "learning_rate": 7.998307800328803e-06, "loss": 1.9006, "step": 8 }, { "epoch": 0.12040133779264214, "grad_norm": 0.7724849581718445, "learning_rate": 7.996192886327432e-06, "loss": 1.8721, "step": 9 }, { "epoch": 0.13377926421404682, "grad_norm": 0.6626549959182739, "learning_rate": 7.993232633085074e-06, "loss": 1.8403, "step": 10 }, { "epoch": 0.14715719063545152, "grad_norm": 0.7850075364112854, "learning_rate": 7.989427666802289e-06, "loss": 1.8972, "step": 11 }, { "epoch": 0.1605351170568562, "grad_norm": 0.6829317808151245, "learning_rate": 7.984778792366982e-06, "loss": 1.815, "step": 12 }, { "epoch": 0.17391304347826086, "grad_norm": 0.5757900476455688, "learning_rate": 7.979286993184132e-06, "loss": 1.7474, "step": 13 }, { "epoch": 0.18729096989966554, "grad_norm": 0.5840671062469482, "learning_rate": 7.972953430967771e-06, "loss": 1.872, "step": 14 }, { "epoch": 0.20066889632107024, "grad_norm": 0.6052594780921936, "learning_rate": 7.965779445495242e-06, "loss": 1.7793, "step": 15 }, { "epoch": 0.2140468227424749, "grad_norm": 0.5719857215881348, "learning_rate": 7.957766554323777e-06, "loss": 1.8001, "step": 16 }, { "epoch": 0.22742474916387959, "grad_norm": 0.6494969129562378, "learning_rate": 7.948916452469496e-06, "loss": 1.8784, "step": 17 }, { "epoch": 0.2408026755852843, "grad_norm": 0.5779961347579956, "learning_rate": 7.939231012048832e-06, "loss": 1.8493, "step": 18 }, { "epoch": 0.25418060200668896, "grad_norm": 0.519511342048645, "learning_rate": 7.928712281882523e-06, "loss": 1.8679, "step": 19 }, { "epoch": 0.26755852842809363, "grad_norm": 0.6307725310325623, "learning_rate": 7.917362487062206e-06, "loss": 1.8664, "step": 20 }, { "epoch": 0.2809364548494983, "grad_norm": 0.521139919757843, "learning_rate": 7.905184028479733e-06, "loss": 1.7756, "step": 21 }, { "epoch": 0.29431438127090304, "grad_norm": 0.5131444931030273, "learning_rate": 7.892179482319294e-06, "loss": 1.6563, "step": 22 }, { "epoch": 0.3076923076923077, "grad_norm": 0.563713014125824, "learning_rate": 7.878351599512464e-06, "loss": 1.852, "step": 23 }, { "epoch": 0.3210702341137124, "grad_norm": 0.5473136901855469, "learning_rate": 7.863703305156273e-06, "loss": 1.8271, "step": 24 }, { "epoch": 0.33444816053511706, "grad_norm": 0.49893084168434143, "learning_rate": 7.848237697894452e-06, "loss": 1.7639, "step": 25 }, { "epoch": 0.34782608695652173, "grad_norm": 0.47465410828590393, "learning_rate": 7.831958049261955e-06, "loss": 1.8612, "step": 26 }, { "epoch": 0.3612040133779264, "grad_norm": 0.5295856595039368, "learning_rate": 7.814867802992907e-06, "loss": 1.819, "step": 27 }, { "epoch": 0.3745819397993311, "grad_norm": 0.4863497316837311, "learning_rate": 7.796970574292136e-06, "loss": 1.7617, "step": 28 }, { "epoch": 0.3879598662207358, "grad_norm": 0.5433112978935242, "learning_rate": 7.778270149070419e-06, "loss": 1.7289, "step": 29 }, { "epoch": 0.4013377926421405, "grad_norm": 0.5282914638519287, "learning_rate": 7.758770483143633e-06, "loss": 1.8131, "step": 30 }, { "epoch": 0.41471571906354515, "grad_norm": 0.5243386030197144, "learning_rate": 7.738475701395954e-06, "loss": 1.8339, "step": 31 }, { "epoch": 0.4280936454849498, "grad_norm": 0.49295875430107117, "learning_rate": 7.717390096907289e-06, "loss": 1.8133, "step": 32 }, { "epoch": 0.4414715719063545, "grad_norm": 0.5231158137321472, "learning_rate": 7.695518130045147e-06, "loss": 1.8031, "step": 33 }, { "epoch": 0.45484949832775917, "grad_norm": 0.5049412250518799, "learning_rate": 7.672864427521097e-06, "loss": 1.7918, "step": 34 }, { "epoch": 0.4682274247491639, "grad_norm": 0.5039061903953552, "learning_rate": 7.649433781412057e-06, "loss": 1.741, "step": 35 }, { "epoch": 0.4816053511705686, "grad_norm": 0.5041850805282593, "learning_rate": 7.6252311481465996e-06, "loss": 1.7254, "step": 36 }, { "epoch": 0.49498327759197325, "grad_norm": 0.4633885622024536, "learning_rate": 7.600261647456484e-06, "loss": 1.8132, "step": 37 }, { "epoch": 0.49498327759197325, "eval_loss": 0.667718768119812, "eval_runtime": 13.4399, "eval_samples_per_second": 90.626, "eval_steps_per_second": 5.729, "step": 37 }, { "epoch": 0.5083612040133779, "grad_norm": 0.5181726217269897, "learning_rate": 7.574530561293649e-06, "loss": 1.882, "step": 38 }, { "epoch": 0.5217391304347826, "grad_norm": 0.5037977695465088, "learning_rate": 7.548043332712886e-06, "loss": 1.8253, "step": 39 }, { "epoch": 0.5351170568561873, "grad_norm": 0.4691613018512726, "learning_rate": 7.520805564720443e-06, "loss": 1.7016, "step": 40 }, { "epoch": 0.5484949832775919, "grad_norm": 0.4761461019515991, "learning_rate": 7.492823019088783e-06, "loss": 1.8041, "step": 41 }, { "epoch": 0.5618729096989966, "grad_norm": 0.4626379907131195, "learning_rate": 7.4641016151377545e-06, "loss": 1.7852, "step": 42 }, { "epoch": 0.5752508361204013, "grad_norm": 0.49921584129333496, "learning_rate": 7.434647428482453e-06, "loss": 1.7104, "step": 43 }, { "epoch": 0.5886287625418061, "grad_norm": 0.49447470903396606, "learning_rate": 7.4044666897479985e-06, "loss": 1.7973, "step": 44 }, { "epoch": 0.6020066889632107, "grad_norm": 0.4844650328159332, "learning_rate": 7.373565783251543e-06, "loss": 1.7678, "step": 45 }, { "epoch": 0.6153846153846154, "grad_norm": 0.49107274413108826, "learning_rate": 7.3419512456517455e-06, "loss": 1.718, "step": 46 }, { "epoch": 0.6287625418060201, "grad_norm": 0.49630844593048096, "learning_rate": 7.309629764566041e-06, "loss": 1.802, "step": 47 }, { "epoch": 0.6421404682274248, "grad_norm": 0.47247716784477234, "learning_rate": 7.276608177155967e-06, "loss": 1.7803, "step": 48 }, { "epoch": 0.6555183946488294, "grad_norm": 1.2681446075439453, "learning_rate": 7.242893468680849e-06, "loss": 1.768, "step": 49 }, { "epoch": 0.6688963210702341, "grad_norm": 0.5203042030334473, "learning_rate": 7.208492771020175e-06, "loss": 1.8885, "step": 50 }, { "epoch": 0.6822742474916388, "grad_norm": 0.5055291056632996, "learning_rate": 7.1734133611649405e-06, "loss": 1.812, "step": 51 }, { "epoch": 0.6956521739130435, "grad_norm": 0.5043210387229919, "learning_rate": 7.137662659678303e-06, "loss": 1.8291, "step": 52 }, { "epoch": 0.7090301003344481, "grad_norm": 0.5000115633010864, "learning_rate": 7.1012482291258626e-06, "loss": 1.7115, "step": 53 }, { "epoch": 0.7224080267558528, "grad_norm": 0.5015853643417358, "learning_rate": 7.064177772475912e-06, "loss": 1.8441, "step": 54 }, { "epoch": 0.7357859531772575, "grad_norm": 0.5204277038574219, "learning_rate": 7.026459131469972e-06, "loss": 1.8268, "step": 55 }, { "epoch": 0.7491638795986622, "grad_norm": 0.5002334117889404, "learning_rate": 6.9881002849639835e-06, "loss": 1.7633, "step": 56 }, { "epoch": 0.7625418060200669, "grad_norm": 0.47437921166419983, "learning_rate": 6.949109347240496e-06, "loss": 1.7573, "step": 57 }, { "epoch": 0.7759197324414716, "grad_norm": 0.46609270572662354, "learning_rate": 6.909494566292195e-06, "loss": 1.7671, "step": 58 }, { "epoch": 0.7892976588628763, "grad_norm": 0.464740514755249, "learning_rate": 6.869264322077157e-06, "loss": 1.735, "step": 59 }, { "epoch": 0.802675585284281, "grad_norm": 0.44125059247016907, "learning_rate": 6.82842712474619e-06, "loss": 1.6895, "step": 60 }, { "epoch": 0.8160535117056856, "grad_norm": 0.4398334324359894, "learning_rate": 6.786991612842619e-06, "loss": 1.6622, "step": 61 }, { "epoch": 0.8294314381270903, "grad_norm": 0.5073325037956238, "learning_rate": 6.744966551474935e-06, "loss": 1.7893, "step": 62 }, { "epoch": 0.842809364548495, "grad_norm": 0.4685400128364563, "learning_rate": 6.702360830462641e-06, "loss": 1.7377, "step": 63 }, { "epoch": 0.8561872909698997, "grad_norm": 0.5076963305473328, "learning_rate": 6.65918346245575e-06, "loss": 1.8125, "step": 64 }, { "epoch": 0.8695652173913043, "grad_norm": 0.510735273361206, "learning_rate": 6.615443581028279e-06, "loss": 1.8576, "step": 65 }, { "epoch": 0.882943143812709, "grad_norm": 0.489524781703949, "learning_rate": 6.571150438746157e-06, "loss": 1.6699, "step": 66 }, { "epoch": 0.8963210702341137, "grad_norm": 0.4763016402721405, "learning_rate": 6.5263134052099895e-06, "loss": 1.7561, "step": 67 }, { "epoch": 0.9096989966555183, "grad_norm": 0.4900319278240204, "learning_rate": 6.480941965073039e-06, "loss": 1.7364, "step": 68 }, { "epoch": 0.9230769230769231, "grad_norm": 0.4876985549926758, "learning_rate": 6.435045716034882e-06, "loss": 1.7202, "step": 69 }, { "epoch": 0.9364548494983278, "grad_norm": 0.5038361549377441, "learning_rate": 6.388634366811145e-06, "loss": 1.7732, "step": 70 }, { "epoch": 0.9498327759197325, "grad_norm": 0.5064495801925659, "learning_rate": 6.341717735079762e-06, "loss": 1.7221, "step": 71 }, { "epoch": 0.9632107023411371, "grad_norm": 0.48319771885871887, "learning_rate": 6.294305745404184e-06, "loss": 1.7169, "step": 72 }, { "epoch": 0.9765886287625418, "grad_norm": 0.4659038782119751, "learning_rate": 6.246408427133971e-06, "loss": 1.7655, "step": 73 }, { "epoch": 0.9899665551839465, "grad_norm": 0.5067590475082397, "learning_rate": 6.198035912283224e-06, "loss": 1.7013, "step": 74 }, { "epoch": 0.9899665551839465, "eval_loss": 0.6622208952903748, "eval_runtime": 13.4118, "eval_samples_per_second": 90.815, "eval_steps_per_second": 5.741, "step": 74 }, { "epoch": 1.0100334448160535, "grad_norm": 1.075007438659668, "learning_rate": 6.149198433387296e-06, "loss": 2.8527, "step": 75 }, { "epoch": 1.0234113712374582, "grad_norm": 0.5882135629653931, "learning_rate": 6.09990632133824e-06, "loss": 1.2937, "step": 76 }, { "epoch": 1.0367892976588629, "grad_norm": 0.5747379064559937, "learning_rate": 6.050170003199461e-06, "loss": 1.3659, "step": 77 }, { "epoch": 1.0501672240802675, "grad_norm": 0.5323824286460876, "learning_rate": 6e-06, "loss": 1.4013, "step": 78 }, { "epoch": 1.0635451505016722, "grad_norm": 0.5771108865737915, "learning_rate": 5.94940692450897e-06, "loss": 1.4312, "step": 79 }, { "epoch": 1.0769230769230769, "grad_norm": 0.6475838422775269, "learning_rate": 5.898401478990562e-06, "loss": 1.5826, "step": 80 }, { "epoch": 1.0903010033444815, "grad_norm": 0.592047929763794, "learning_rate": 5.846994452940136e-06, "loss": 1.4368, "step": 81 }, { "epoch": 1.1036789297658862, "grad_norm": 0.7535114884376526, "learning_rate": 5.795196720801849e-06, "loss": 1.5116, "step": 82 }, { "epoch": 1.117056856187291, "grad_norm": 0.586352527141571, "learning_rate": 5.743019239668317e-06, "loss": 1.4519, "step": 83 }, { "epoch": 1.1304347826086956, "grad_norm": 0.617953360080719, "learning_rate": 5.690473046962798e-06, "loss": 1.516, "step": 84 }, { "epoch": 1.1438127090301002, "grad_norm": 0.5882145166397095, "learning_rate": 5.63756925810437e-06, "loss": 1.3272, "step": 85 }, { "epoch": 1.1571906354515051, "grad_norm": 0.5454166531562805, "learning_rate": 5.584319064156627e-06, "loss": 1.4443, "step": 86 }, { "epoch": 1.1705685618729098, "grad_norm": 0.5162122249603271, "learning_rate": 5.530733729460359e-06, "loss": 1.409, "step": 87 }, { "epoch": 1.1839464882943145, "grad_norm": 0.5802710056304932, "learning_rate": 5.476824589250738e-06, "loss": 1.4925, "step": 88 }, { "epoch": 1.1973244147157192, "grad_norm": 0.5536505579948425, "learning_rate": 5.4226030472595064e-06, "loss": 1.3556, "step": 89 }, { "epoch": 1.2107023411371238, "grad_norm": 0.5804270505905151, "learning_rate": 5.368080573302675e-06, "loss": 1.4867, "step": 90 }, { "epoch": 1.2240802675585285, "grad_norm": 0.5988459587097168, "learning_rate": 5.3132687008542446e-06, "loss": 1.5466, "step": 91 }, { "epoch": 1.2374581939799332, "grad_norm": 0.5100234150886536, "learning_rate": 5.2581790246064545e-06, "loss": 1.3042, "step": 92 }, { "epoch": 1.2508361204013378, "grad_norm": 0.547820508480072, "learning_rate": 5.2028231980170915e-06, "loss": 1.4344, "step": 93 }, { "epoch": 1.2642140468227425, "grad_norm": 0.5059947967529297, "learning_rate": 5.147212930844361e-06, "loss": 1.3313, "step": 94 }, { "epoch": 1.2775919732441472, "grad_norm": 0.5130128860473633, "learning_rate": 5.091359986669844e-06, "loss": 1.3825, "step": 95 }, { "epoch": 1.2909698996655519, "grad_norm": 0.5295710563659668, "learning_rate": 5.035276180410083e-06, "loss": 1.3594, "step": 96 }, { "epoch": 1.3043478260869565, "grad_norm": 0.5060495734214783, "learning_rate": 4.978973375817295e-06, "loss": 1.3036, "step": 97 }, { "epoch": 1.3177257525083612, "grad_norm": 0.5988255143165588, "learning_rate": 4.922463482969761e-06, "loss": 1.5651, "step": 98 }, { "epoch": 1.3311036789297659, "grad_norm": 0.4946533739566803, "learning_rate": 4.8657584557524116e-06, "loss": 1.333, "step": 99 }, { "epoch": 1.3444816053511706, "grad_norm": 0.5674816966056824, "learning_rate": 4.808870289328152e-06, "loss": 1.4971, "step": 100 }, { "epoch": 1.3578595317725752, "grad_norm": 0.5148370265960693, "learning_rate": 4.751811017600447e-06, "loss": 1.4789, "step": 101 }, { "epoch": 1.37123745819398, "grad_norm": 0.5132340788841248, "learning_rate": 4.694592710667722e-06, "loss": 1.3163, "step": 102 }, { "epoch": 1.3846153846153846, "grad_norm": 0.6035396456718445, "learning_rate": 4.637227472270091e-06, "loss": 1.3789, "step": 103 }, { "epoch": 1.3979933110367893, "grad_norm": 0.550737738609314, "learning_rate": 4.579727437228986e-06, "loss": 1.4218, "step": 104 }, { "epoch": 1.411371237458194, "grad_norm": 0.4987228810787201, "learning_rate": 4.522104768880207e-06, "loss": 1.264, "step": 105 }, { "epoch": 1.4247491638795986, "grad_norm": 0.5271551609039307, "learning_rate": 4.4643716565009205e-06, "loss": 1.4445, "step": 106 }, { "epoch": 1.4381270903010033, "grad_norm": 0.5551120638847351, "learning_rate": 4.406540312731208e-06, "loss": 1.5199, "step": 107 }, { "epoch": 1.451505016722408, "grad_norm": 0.5053355097770691, "learning_rate": 4.348622970990633e-06, "loss": 1.3389, "step": 108 }, { "epoch": 1.4648829431438126, "grad_norm": 0.5177690386772156, "learning_rate": 4.290631882890443e-06, "loss": 1.4396, "step": 109 }, { "epoch": 1.4782608695652173, "grad_norm": 0.522657573223114, "learning_rate": 4.232579315641903e-06, "loss": 1.5001, "step": 110 }, { "epoch": 1.491638795986622, "grad_norm": 0.5218788981437683, "learning_rate": 4.174477549461344e-06, "loss": 1.3964, "step": 111 }, { "epoch": 1.491638795986622, "eval_loss": 0.6926424503326416, "eval_runtime": 13.4107, "eval_samples_per_second": 90.823, "eval_steps_per_second": 5.742, "step": 111 }, { "epoch": 1.5050167224080266, "grad_norm": 0.513609766960144, "learning_rate": 4.1163388749724456e-06, "loss": 1.3159, "step": 112 }, { "epoch": 1.5183946488294313, "grad_norm": 0.5356954336166382, "learning_rate": 4.058175590606331e-06, "loss": 1.4034, "step": 113 }, { "epoch": 1.531772575250836, "grad_norm": 0.5953348278999329, "learning_rate": 4e-06, "loss": 1.4772, "step": 114 }, { "epoch": 1.5451505016722407, "grad_norm": 0.5957881808280945, "learning_rate": 3.941824409393669e-06, "loss": 1.5237, "step": 115 }, { "epoch": 1.5585284280936453, "grad_norm": 0.4759249687194824, "learning_rate": 3.883661125027554e-06, "loss": 1.27, "step": 116 }, { "epoch": 1.57190635451505, "grad_norm": 0.5109943747520447, "learning_rate": 3.825522450538656e-06, "loss": 1.4649, "step": 117 }, { "epoch": 1.585284280936455, "grad_norm": 0.477067768573761, "learning_rate": 3.7674206843580965e-06, "loss": 1.3081, "step": 118 }, { "epoch": 1.5986622073578596, "grad_norm": 0.505376398563385, "learning_rate": 3.7093681171095572e-06, "loss": 1.3395, "step": 119 }, { "epoch": 1.6120401337792643, "grad_norm": 0.5487738251686096, "learning_rate": 3.651377029009367e-06, "loss": 1.529, "step": 120 }, { "epoch": 1.625418060200669, "grad_norm": 0.5177662968635559, "learning_rate": 3.5934596872687923e-06, "loss": 1.291, "step": 121 }, { "epoch": 1.6387959866220736, "grad_norm": 0.5494332909584045, "learning_rate": 3.5356283434990783e-06, "loss": 1.4541, "step": 122 }, { "epoch": 1.6521739130434783, "grad_norm": 0.526679277420044, "learning_rate": 3.4778952311197945e-06, "loss": 1.3409, "step": 123 }, { "epoch": 1.665551839464883, "grad_norm": 0.5405285954475403, "learning_rate": 3.4202725627710133e-06, "loss": 1.4129, "step": 124 }, { "epoch": 1.6789297658862876, "grad_norm": 0.5305699110031128, "learning_rate": 3.36277252772991e-06, "loss": 1.458, "step": 125 }, { "epoch": 1.6923076923076923, "grad_norm": 0.5097222924232483, "learning_rate": 3.3054072893322785e-06, "loss": 1.3555, "step": 126 }, { "epoch": 1.705685618729097, "grad_norm": 0.5460465550422668, "learning_rate": 3.2481889823995524e-06, "loss": 1.3272, "step": 127 }, { "epoch": 1.7190635451505016, "grad_norm": 0.5369409918785095, "learning_rate": 3.191129710671849e-06, "loss": 1.2991, "step": 128 }, { "epoch": 1.7324414715719063, "grad_norm": 0.4934872090816498, "learning_rate": 3.1342415442475885e-06, "loss": 1.319, "step": 129 }, { "epoch": 1.745819397993311, "grad_norm": 0.5528122186660767, "learning_rate": 3.077536517030239e-06, "loss": 1.4731, "step": 130 }, { "epoch": 1.7591973244147159, "grad_norm": 0.5233715176582336, "learning_rate": 3.0210266241827046e-06, "loss": 1.4089, "step": 131 }, { "epoch": 1.7725752508361206, "grad_norm": 0.5456512570381165, "learning_rate": 2.9647238195899164e-06, "loss": 1.4056, "step": 132 }, { "epoch": 1.7859531772575252, "grad_norm": 0.5461183190345764, "learning_rate": 2.908640013330157e-06, "loss": 1.4384, "step": 133 }, { "epoch": 1.79933110367893, "grad_norm": 0.5198376178741455, "learning_rate": 2.85278706915564e-06, "loss": 1.48, "step": 134 }, { "epoch": 1.8127090301003346, "grad_norm": 0.5073018670082092, "learning_rate": 2.7971768019829077e-06, "loss": 1.3335, "step": 135 }, { "epoch": 1.8260869565217392, "grad_norm": 0.5290614366531372, "learning_rate": 2.741820975393546e-06, "loss": 1.4239, "step": 136 }, { "epoch": 1.839464882943144, "grad_norm": 0.5525597333908081, "learning_rate": 2.686731299145756e-06, "loss": 1.4017, "step": 137 }, { "epoch": 1.8528428093645486, "grad_norm": 0.5554612874984741, "learning_rate": 2.631919426697325e-06, "loss": 1.5295, "step": 138 }, { "epoch": 1.8662207357859533, "grad_norm": 0.5276882648468018, "learning_rate": 2.5773969527404945e-06, "loss": 1.3571, "step": 139 }, { "epoch": 1.879598662207358, "grad_norm": 0.506417453289032, "learning_rate": 2.5231754107492627e-06, "loss": 1.3666, "step": 140 }, { "epoch": 1.8929765886287626, "grad_norm": 0.535830557346344, "learning_rate": 2.469266270539641e-06, "loss": 1.5119, "step": 141 }, { "epoch": 1.9063545150501673, "grad_norm": 0.5409619808197021, "learning_rate": 2.4156809358433725e-06, "loss": 1.4349, "step": 142 }, { "epoch": 1.919732441471572, "grad_norm": 0.5141175985336304, "learning_rate": 2.3624307418956294e-06, "loss": 1.3672, "step": 143 }, { "epoch": 1.9331103678929766, "grad_norm": 0.5471431612968445, "learning_rate": 2.309526953037203e-06, "loss": 1.3575, "step": 144 }, { "epoch": 1.9464882943143813, "grad_norm": 0.5435221195220947, "learning_rate": 2.256980760331683e-06, "loss": 1.4398, "step": 145 }, { "epoch": 1.959866220735786, "grad_norm": 0.5480269193649292, "learning_rate": 2.2048032791981513e-06, "loss": 1.279, "step": 146 }, { "epoch": 1.9732441471571907, "grad_norm": 0.5423163175582886, "learning_rate": 2.153005547059865e-06, "loss": 1.4763, "step": 147 }, { "epoch": 1.9866220735785953, "grad_norm": 0.5127543807029724, "learning_rate": 2.1015985210094384e-06, "loss": 1.3808, "step": 148 }, { "epoch": 1.9866220735785953, "eval_loss": 0.7046768069267273, "eval_runtime": 13.4302, "eval_samples_per_second": 90.691, "eval_steps_per_second": 5.733, "step": 148 }, { "epoch": 2.0066889632107023, "grad_norm": 1.2160006761550903, "learning_rate": 2.050593075491031e-06, "loss": 2.3417, "step": 149 }, { "epoch": 2.020066889632107, "grad_norm": 0.7894352674484253, "learning_rate": 2.0000000000000008e-06, "loss": 1.1381, "step": 150 }, { "epoch": 2.0334448160535117, "grad_norm": 0.790090799331665, "learning_rate": 1.9498299968005392e-06, "loss": 1.2723, "step": 151 }, { "epoch": 2.0468227424749164, "grad_norm": 0.6597563624382019, "learning_rate": 1.9000936786617597e-06, "loss": 1.1166, "step": 152 }, { "epoch": 2.060200668896321, "grad_norm": 0.6646838784217834, "learning_rate": 1.850801566612704e-06, "loss": 1.2064, "step": 153 }, { "epoch": 2.0735785953177257, "grad_norm": 0.616726815700531, "learning_rate": 1.801964087716776e-06, "loss": 1.1856, "step": 154 }, { "epoch": 2.0869565217391304, "grad_norm": 0.6225292682647705, "learning_rate": 1.7535915728660289e-06, "loss": 1.0631, "step": 155 }, { "epoch": 2.100334448160535, "grad_norm": 0.9003037214279175, "learning_rate": 1.7056942545958167e-06, "loss": 1.1332, "step": 156 }, { "epoch": 2.1137123745819397, "grad_norm": 1.1400443315505981, "learning_rate": 1.6582822649202379e-06, "loss": 1.0256, "step": 157 }, { "epoch": 2.1270903010033444, "grad_norm": 1.0252878665924072, "learning_rate": 1.611365633188856e-06, "loss": 1.0825, "step": 158 }, { "epoch": 2.140468227424749, "grad_norm": 0.785536527633667, "learning_rate": 1.5649542839651173e-06, "loss": 1.1539, "step": 159 }, { "epoch": 2.1538461538461537, "grad_norm": 0.6228716373443604, "learning_rate": 1.5190580349269603e-06, "loss": 1.1693, "step": 160 }, { "epoch": 2.1672240802675584, "grad_norm": 0.605522096157074, "learning_rate": 1.4736865947900103e-06, "loss": 1.0761, "step": 161 }, { "epoch": 2.180602006688963, "grad_norm": 0.6409484148025513, "learning_rate": 1.4288495612538425e-06, "loss": 1.1637, "step": 162 }, { "epoch": 2.1939799331103678, "grad_norm": 0.6087141036987305, "learning_rate": 1.3845564189717216e-06, "loss": 1.0937, "step": 163 }, { "epoch": 2.2073578595317724, "grad_norm": 0.7026439309120178, "learning_rate": 1.3408165375442484e-06, "loss": 1.2132, "step": 164 }, { "epoch": 2.220735785953177, "grad_norm": 0.6594187617301941, "learning_rate": 1.297639169537359e-06, "loss": 1.07, "step": 165 }, { "epoch": 2.234113712374582, "grad_norm": 0.6606442928314209, "learning_rate": 1.255033448525066e-06, "loss": 1.1694, "step": 166 }, { "epoch": 2.2474916387959865, "grad_norm": 0.6164308786392212, "learning_rate": 1.2130083871573812e-06, "loss": 1.0824, "step": 167 }, { "epoch": 2.260869565217391, "grad_norm": 0.5823544263839722, "learning_rate": 1.1715728752538101e-06, "loss": 1.1106, "step": 168 }, { "epoch": 2.274247491638796, "grad_norm": 0.5872677564620972, "learning_rate": 1.130735677922842e-06, "loss": 1.1056, "step": 169 }, { "epoch": 2.2876254180602005, "grad_norm": 0.6060868501663208, "learning_rate": 1.090505433707805e-06, "loss": 1.0993, "step": 170 }, { "epoch": 2.3010033444816056, "grad_norm": 0.7028762698173523, "learning_rate": 1.050890652759504e-06, "loss": 1.3655, "step": 171 }, { "epoch": 2.3143812709030103, "grad_norm": 0.6295290589332581, "learning_rate": 1.0118997150360166e-06, "loss": 1.0931, "step": 172 }, { "epoch": 2.327759197324415, "grad_norm": 0.6583987474441528, "learning_rate": 9.735408685300286e-07, "loss": 1.1103, "step": 173 }, { "epoch": 2.3411371237458196, "grad_norm": 0.7007333040237427, "learning_rate": 9.358222275240884e-07, "loss": 1.1794, "step": 174 }, { "epoch": 2.3545150501672243, "grad_norm": 0.6823887825012207, "learning_rate": 8.987517708741363e-07, "loss": 1.1575, "step": 175 }, { "epoch": 2.367892976588629, "grad_norm": 0.680305540561676, "learning_rate": 8.623373403216971e-07, "loss": 1.096, "step": 176 }, { "epoch": 2.3812709030100336, "grad_norm": 0.6234930157661438, "learning_rate": 8.265866388350598e-07, "loss": 1.0486, "step": 177 }, { "epoch": 2.3946488294314383, "grad_norm": 0.6473740339279175, "learning_rate": 7.915072289798246e-07, "loss": 1.1637, "step": 178 }, { "epoch": 2.408026755852843, "grad_norm": 0.6634021997451782, "learning_rate": 7.571065313191511e-07, "loss": 1.1053, "step": 179 }, { "epoch": 2.4214046822742477, "grad_norm": 0.6495158672332764, "learning_rate": 7.233918228440323e-07, "loss": 1.0907, "step": 180 }, { "epoch": 2.4347826086956523, "grad_norm": 0.6720609664916992, "learning_rate": 6.903702354339578e-07, "loss": 1.1751, "step": 181 }, { "epoch": 2.448160535117057, "grad_norm": 0.6688068509101868, "learning_rate": 6.580487543482549e-07, "loss": 1.1408, "step": 182 }, { "epoch": 2.4615384615384617, "grad_norm": 0.6397896409034729, "learning_rate": 6.26434216748458e-07, "loss": 1.2012, "step": 183 }, { "epoch": 2.4749163879598663, "grad_norm": 0.6841992735862732, "learning_rate": 5.955333102520011e-07, "loss": 1.2623, "step": 184 }, { "epoch": 2.488294314381271, "grad_norm": 0.6013959050178528, "learning_rate": 5.653525715175483e-07, "loss": 1.0792, "step": 185 }, { "epoch": 2.488294314381271, "eval_loss": 0.8052845597267151, "eval_runtime": 13.448, "eval_samples_per_second": 90.571, "eval_steps_per_second": 5.726, "step": 185 }, { "epoch": 2.5016722408026757, "grad_norm": 0.595483660697937, "learning_rate": 5.358983848622451e-07, "loss": 1.1536, "step": 186 }, { "epoch": 2.5150501672240804, "grad_norm": 0.6301653981208801, "learning_rate": 5.07176980911217e-07, "loss": 1.1543, "step": 187 }, { "epoch": 2.528428093645485, "grad_norm": 0.6083581447601318, "learning_rate": 4.791944352795561e-07, "loss": 1.131, "step": 188 }, { "epoch": 2.5418060200668897, "grad_norm": 0.6019948720932007, "learning_rate": 4.519566672871131e-07, "loss": 1.1022, "step": 189 }, { "epoch": 2.5551839464882944, "grad_norm": 0.5989395976066589, "learning_rate": 4.2546943870635135e-07, "loss": 1.1402, "step": 190 }, { "epoch": 2.568561872909699, "grad_norm": 0.575457751750946, "learning_rate": 3.997383525435154e-07, "loss": 1.0687, "step": 191 }, { "epoch": 2.5819397993311037, "grad_norm": 0.6226676106452942, "learning_rate": 3.7476885185340023e-07, "loss": 1.158, "step": 192 }, { "epoch": 2.5953177257525084, "grad_norm": 0.6265813112258911, "learning_rate": 3.5056621858794387e-07, "loss": 1.1689, "step": 193 }, { "epoch": 2.608695652173913, "grad_norm": 0.6149746775627136, "learning_rate": 3.2713557247890447e-07, "loss": 1.1482, "step": 194 }, { "epoch": 2.6220735785953178, "grad_norm": 0.555928647518158, "learning_rate": 3.0448186995485303e-07, "loss": 0.9814, "step": 195 }, { "epoch": 2.6354515050167224, "grad_norm": 0.6666916608810425, "learning_rate": 2.826099030927098e-07, "loss": 1.2773, "step": 196 }, { "epoch": 2.648829431438127, "grad_norm": 0.6038864850997925, "learning_rate": 2.6152429860404646e-07, "loss": 1.0263, "step": 197 }, { "epoch": 2.6622073578595318, "grad_norm": 0.6544002890586853, "learning_rate": 2.412295168563667e-07, "loss": 1.2501, "step": 198 }, { "epoch": 2.6755852842809364, "grad_norm": 0.5613058805465698, "learning_rate": 2.2172985092958128e-07, "loss": 1.0164, "step": 199 }, { "epoch": 2.688963210702341, "grad_norm": 0.6110493540763855, "learning_rate": 2.0302942570786442e-07, "loss": 1.142, "step": 200 }, { "epoch": 2.702341137123746, "grad_norm": 0.6497470140457153, "learning_rate": 1.851321970070927e-07, "loss": 1.1498, "step": 201 }, { "epoch": 2.7157190635451505, "grad_norm": 0.6384419202804565, "learning_rate": 1.680419507380444e-07, "loss": 1.1044, "step": 202 }, { "epoch": 2.729096989966555, "grad_norm": 0.6009129285812378, "learning_rate": 1.5176230210554742e-07, "loss": 1.13, "step": 203 }, { "epoch": 2.74247491638796, "grad_norm": 0.5934394001960754, "learning_rate": 1.3629669484372718e-07, "loss": 1.0401, "step": 204 }, { "epoch": 2.7558528428093645, "grad_norm": 0.669292151927948, "learning_rate": 1.21648400487536e-07, "loss": 1.2259, "step": 205 }, { "epoch": 2.769230769230769, "grad_norm": 0.6402983665466309, "learning_rate": 1.0782051768070477e-07, "loss": 1.1421, "step": 206 }, { "epoch": 2.782608695652174, "grad_norm": 0.6059122085571289, "learning_rate": 9.481597152026654e-08, "loss": 1.1015, "step": 207 }, { "epoch": 2.7959866220735785, "grad_norm": 0.6018065214157104, "learning_rate": 8.263751293779408e-08, "loss": 1.1427, "step": 208 }, { "epoch": 2.809364548494983, "grad_norm": 0.6087521910667419, "learning_rate": 7.128771811747736e-08, "loss": 1.1633, "step": 209 }, { "epoch": 2.822742474916388, "grad_norm": 0.6287218928337097, "learning_rate": 6.076898795116792e-08, "loss": 1.1612, "step": 210 }, { "epoch": 2.8361204013377925, "grad_norm": 0.6059502363204956, "learning_rate": 5.108354753050381e-08, "loss": 1.0879, "step": 211 }, { "epoch": 2.849498327759197, "grad_norm": 0.5889873504638672, "learning_rate": 4.2233445676222114e-08, "loss": 1.1121, "step": 212 }, { "epoch": 2.862876254180602, "grad_norm": 0.6066433787345886, "learning_rate": 3.422055450475847e-08, "loss": 1.102, "step": 213 }, { "epoch": 2.8762541806020065, "grad_norm": 0.6160590648651123, "learning_rate": 2.7046569032227905e-08, "loss": 1.1017, "step": 214 }, { "epoch": 2.8896321070234112, "grad_norm": 0.6136374473571777, "learning_rate": 2.0713006815868074e-08, "loss": 1.1346, "step": 215 }, { "epoch": 2.903010033444816, "grad_norm": 0.6150422692298889, "learning_rate": 1.522120763301782e-08, "loss": 1.1271, "step": 216 }, { "epoch": 2.9163879598662206, "grad_norm": 0.6284250617027283, "learning_rate": 1.0572333197711003e-08, "loss": 1.1855, "step": 217 }, { "epoch": 2.9297658862876252, "grad_norm": 0.5995332598686218, "learning_rate": 6.767366914927297e-09, "loss": 1.1039, "step": 218 }, { "epoch": 2.94314381270903, "grad_norm": 0.5566285252571106, "learning_rate": 3.807113672568807e-09, "loss": 1.0683, "step": 219 }, { "epoch": 2.9565217391304346, "grad_norm": 0.5810141563415527, "learning_rate": 1.6921996711976028e-09, "loss": 1.1098, "step": 220 }, { "epoch": 2.9698996655518393, "grad_norm": 0.6116142868995667, "learning_rate": 4.230722915701257e-10, "loss": 1.0441, "step": 221 }, { "epoch": 2.983277591973244, "grad_norm": 0.6009790301322937, "learning_rate": 0.0, "loss": 1.1404, "step": 222 }, { "epoch": 2.983277591973244, "eval_loss": 0.8089934587478638, "eval_runtime": 13.4287, "eval_samples_per_second": 90.701, "eval_steps_per_second": 5.734, "step": 222 } ], "logging_steps": 1, "max_steps": 222, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 37, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.24107627264947e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }