{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.937062937062937, "eval_steps": 18, "global_step": 355, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013986013986013986, "grad_norm": 6.746792793273926, "learning_rate": 3.3333333333333335e-07, "loss": 0.8294, "step": 1 }, { "epoch": 0.013986013986013986, "eval_loss": 0.8744672536849976, "eval_runtime": 36.967, "eval_samples_per_second": 17.367, "eval_steps_per_second": 2.191, "step": 1 }, { "epoch": 0.027972027972027972, "grad_norm": 6.9825944900512695, "learning_rate": 6.666666666666667e-07, "loss": 0.8694, "step": 2 }, { "epoch": 0.04195804195804196, "grad_norm": 7.01480770111084, "learning_rate": 1.0000000000000002e-06, "loss": 0.861, "step": 3 }, { "epoch": 0.055944055944055944, "grad_norm": 7.156968593597412, "learning_rate": 1.3333333333333334e-06, "loss": 0.9027, "step": 4 }, { "epoch": 0.06993006993006994, "grad_norm": 6.0878005027771, "learning_rate": 1.6666666666666667e-06, "loss": 0.8577, "step": 5 }, { "epoch": 0.08391608391608392, "grad_norm": 5.853216648101807, "learning_rate": 2.0000000000000003e-06, "loss": 0.8168, "step": 6 }, { "epoch": 0.0979020979020979, "grad_norm": 4.9973978996276855, "learning_rate": 2.3333333333333336e-06, "loss": 0.788, "step": 7 }, { "epoch": 0.11188811188811189, "grad_norm": 4.611128330230713, "learning_rate": 2.666666666666667e-06, "loss": 0.7959, "step": 8 }, { "epoch": 0.1258741258741259, "grad_norm": 3.1312103271484375, "learning_rate": 3e-06, "loss": 0.7374, "step": 9 }, { "epoch": 0.13986013986013987, "grad_norm": 2.9217381477355957, "learning_rate": 3.3333333333333333e-06, "loss": 0.7329, "step": 10 }, { "epoch": 0.15384615384615385, "grad_norm": 2.5225424766540527, "learning_rate": 3.6666666666666666e-06, "loss": 0.6905, "step": 11 }, { "epoch": 0.16783216783216784, "grad_norm": 2.8658440113067627, "learning_rate": 4.000000000000001e-06, "loss": 0.702, "step": 12 }, { "epoch": 0.18181818181818182, "grad_norm": 2.6459388732910156, "learning_rate": 4.333333333333334e-06, "loss": 0.6659, "step": 13 }, { "epoch": 0.1958041958041958, "grad_norm": 2.4082329273223877, "learning_rate": 4.666666666666667e-06, "loss": 0.6732, "step": 14 }, { "epoch": 0.2097902097902098, "grad_norm": 1.8969792127609253, "learning_rate": 5e-06, "loss": 0.626, "step": 15 }, { "epoch": 0.22377622377622378, "grad_norm": 1.705984354019165, "learning_rate": 5.333333333333334e-06, "loss": 0.6357, "step": 16 }, { "epoch": 0.23776223776223776, "grad_norm": 1.5265748500823975, "learning_rate": 5.666666666666667e-06, "loss": 0.6409, "step": 17 }, { "epoch": 0.2517482517482518, "grad_norm": 1.3590223789215088, "learning_rate": 6e-06, "loss": 0.6128, "step": 18 }, { "epoch": 0.2517482517482518, "eval_loss": 0.6171885132789612, "eval_runtime": 35.4252, "eval_samples_per_second": 18.123, "eval_steps_per_second": 2.287, "step": 18 }, { "epoch": 0.26573426573426573, "grad_norm": 1.3791933059692383, "learning_rate": 6.333333333333333e-06, "loss": 0.6181, "step": 19 }, { "epoch": 0.27972027972027974, "grad_norm": 1.398863434791565, "learning_rate": 6.666666666666667e-06, "loss": 0.593, "step": 20 }, { "epoch": 0.2937062937062937, "grad_norm": 1.1556097269058228, "learning_rate": 7e-06, "loss": 0.6274, "step": 21 }, { "epoch": 0.3076923076923077, "grad_norm": 1.094146728515625, "learning_rate": 7.333333333333333e-06, "loss": 0.6113, "step": 22 }, { "epoch": 0.32167832167832167, "grad_norm": 1.2191824913024902, "learning_rate": 7.666666666666667e-06, "loss": 0.6111, "step": 23 }, { "epoch": 0.3356643356643357, "grad_norm": 0.9371815323829651, "learning_rate": 8.000000000000001e-06, "loss": 0.5895, "step": 24 }, { "epoch": 0.34965034965034963, "grad_norm": 0.8173602223396301, "learning_rate": 8.333333333333334e-06, "loss": 0.6083, "step": 25 }, { "epoch": 0.36363636363636365, "grad_norm": 1.0984693765640259, "learning_rate": 8.666666666666668e-06, "loss": 0.6072, "step": 26 }, { "epoch": 0.3776223776223776, "grad_norm": 1.0279648303985596, "learning_rate": 9e-06, "loss": 0.6001, "step": 27 }, { "epoch": 0.3916083916083916, "grad_norm": 0.9129611253738403, "learning_rate": 9.333333333333334e-06, "loss": 0.5644, "step": 28 }, { "epoch": 0.40559440559440557, "grad_norm": 0.832744300365448, "learning_rate": 9.666666666666667e-06, "loss": 0.5716, "step": 29 }, { "epoch": 0.4195804195804196, "grad_norm": 0.8230701684951782, "learning_rate": 1e-05, "loss": 0.59, "step": 30 }, { "epoch": 0.43356643356643354, "grad_norm": 0.8343638181686401, "learning_rate": 9.999766401714795e-06, "loss": 0.5876, "step": 31 }, { "epoch": 0.44755244755244755, "grad_norm": 0.7421298623085022, "learning_rate": 9.999065628686439e-06, "loss": 0.5959, "step": 32 }, { "epoch": 0.46153846153846156, "grad_norm": 0.7471378445625305, "learning_rate": 9.997897746394684e-06, "loss": 0.5804, "step": 33 }, { "epoch": 0.4755244755244755, "grad_norm": 0.8300222754478455, "learning_rate": 9.996262863965651e-06, "loss": 0.5726, "step": 34 }, { "epoch": 0.48951048951048953, "grad_norm": 0.7753379940986633, "learning_rate": 9.994161134161635e-06, "loss": 0.6034, "step": 35 }, { "epoch": 0.5034965034965035, "grad_norm": 0.8331146240234375, "learning_rate": 9.991592753366822e-06, "loss": 0.5953, "step": 36 }, { "epoch": 0.5034965034965035, "eval_loss": 0.5805296897888184, "eval_runtime": 35.0435, "eval_samples_per_second": 18.32, "eval_steps_per_second": 2.311, "step": 36 }, { "epoch": 0.5174825174825175, "grad_norm": 0.7212592959403992, "learning_rate": 9.988557961568956e-06, "loss": 0.5639, "step": 37 }, { "epoch": 0.5314685314685315, "grad_norm": 0.796295166015625, "learning_rate": 9.985057042336898e-06, "loss": 0.5771, "step": 38 }, { "epoch": 0.5454545454545454, "grad_norm": 0.8607219457626343, "learning_rate": 9.981090322794145e-06, "loss": 0.5763, "step": 39 }, { "epoch": 0.5594405594405595, "grad_norm": 0.861869215965271, "learning_rate": 9.976658173588244e-06, "loss": 0.5729, "step": 40 }, { "epoch": 0.5734265734265734, "grad_norm": 0.7538414597511292, "learning_rate": 9.97176100885618e-06, "loss": 0.571, "step": 41 }, { "epoch": 0.5874125874125874, "grad_norm": 0.7197255492210388, "learning_rate": 9.966399286185666e-06, "loss": 0.5421, "step": 42 }, { "epoch": 0.6013986013986014, "grad_norm": 0.7522373199462891, "learning_rate": 9.960573506572391e-06, "loss": 0.5603, "step": 43 }, { "epoch": 0.6153846153846154, "grad_norm": 0.8054993152618408, "learning_rate": 9.954284214373204e-06, "loss": 0.5723, "step": 44 }, { "epoch": 0.6293706293706294, "grad_norm": 0.639057457447052, "learning_rate": 9.947531997255256e-06, "loss": 0.5483, "step": 45 }, { "epoch": 0.6433566433566433, "grad_norm": 0.6742891073226929, "learning_rate": 9.940317486141084e-06, "loss": 0.5845, "step": 46 }, { "epoch": 0.6573426573426573, "grad_norm": 0.6605424880981445, "learning_rate": 9.932641355149655e-06, "loss": 0.5639, "step": 47 }, { "epoch": 0.6713286713286714, "grad_norm": 0.7080878019332886, "learning_rate": 9.924504321533387e-06, "loss": 0.5851, "step": 48 }, { "epoch": 0.6853146853146853, "grad_norm": 0.6235523223876953, "learning_rate": 9.915907145611117e-06, "loss": 0.574, "step": 49 }, { "epoch": 0.6993006993006993, "grad_norm": 0.6567375063896179, "learning_rate": 9.906850630697068e-06, "loss": 0.5705, "step": 50 }, { "epoch": 0.7132867132867133, "grad_norm": 0.6011090278625488, "learning_rate": 9.89733562302578e-06, "loss": 0.574, "step": 51 }, { "epoch": 0.7272727272727273, "grad_norm": 0.6043576002120972, "learning_rate": 9.887363011673046e-06, "loss": 0.5849, "step": 52 }, { "epoch": 0.7412587412587412, "grad_norm": 0.7147118449211121, "learning_rate": 9.876933728472826e-06, "loss": 0.5584, "step": 53 }, { "epoch": 0.7552447552447552, "grad_norm": 0.6480064392089844, "learning_rate": 9.866048747930194e-06, "loss": 0.5494, "step": 54 }, { "epoch": 0.7552447552447552, "eval_loss": 0.5708758234977722, "eval_runtime": 34.9921, "eval_samples_per_second": 18.347, "eval_steps_per_second": 2.315, "step": 54 }, { "epoch": 0.7692307692307693, "grad_norm": 0.6563164591789246, "learning_rate": 9.854709087130261e-06, "loss": 0.5491, "step": 55 }, { "epoch": 0.7832167832167832, "grad_norm": 0.6024691462516785, "learning_rate": 9.842915805643156e-06, "loss": 0.5589, "step": 56 }, { "epoch": 0.7972027972027972, "grad_norm": 0.6186073422431946, "learning_rate": 9.830670005425012e-06, "loss": 0.5567, "step": 57 }, { "epoch": 0.8111888111888111, "grad_norm": 0.6993715763092041, "learning_rate": 9.817972830715003e-06, "loss": 0.5534, "step": 58 }, { "epoch": 0.8251748251748252, "grad_norm": 0.6327122449874878, "learning_rate": 9.804825467928423e-06, "loss": 0.5709, "step": 59 }, { "epoch": 0.8391608391608392, "grad_norm": 0.6156756281852722, "learning_rate": 9.791229145545832e-06, "loss": 0.5445, "step": 60 }, { "epoch": 0.8531468531468531, "grad_norm": 0.7704036235809326, "learning_rate": 9.777185133998268e-06, "loss": 0.5743, "step": 61 }, { "epoch": 0.8671328671328671, "grad_norm": 0.5839553475379944, "learning_rate": 9.76269474554854e-06, "loss": 0.5536, "step": 62 }, { "epoch": 0.8811188811188811, "grad_norm": 0.6872385144233704, "learning_rate": 9.747759334168602e-06, "loss": 0.5627, "step": 63 }, { "epoch": 0.8951048951048951, "grad_norm": 0.663074791431427, "learning_rate": 9.73238029541305e-06, "loss": 0.5643, "step": 64 }, { "epoch": 0.9090909090909091, "grad_norm": 0.7018933296203613, "learning_rate": 9.716559066288716e-06, "loss": 0.5729, "step": 65 }, { "epoch": 0.9230769230769231, "grad_norm": 0.7574678659439087, "learning_rate": 9.7002971251204e-06, "loss": 0.5813, "step": 66 }, { "epoch": 0.9370629370629371, "grad_norm": 0.6293357014656067, "learning_rate": 9.683595991412725e-06, "loss": 0.5819, "step": 67 }, { "epoch": 0.951048951048951, "grad_norm": 0.6524381041526794, "learning_rate": 9.666457225708175e-06, "loss": 0.5856, "step": 68 }, { "epoch": 0.965034965034965, "grad_norm": 0.8389201164245605, "learning_rate": 9.648882429441258e-06, "loss": 0.5587, "step": 69 }, { "epoch": 0.9790209790209791, "grad_norm": 0.6339119672775269, "learning_rate": 9.630873244788884e-06, "loss": 0.5655, "step": 70 }, { "epoch": 0.993006993006993, "grad_norm": 0.6689181923866272, "learning_rate": 9.612431354516912e-06, "loss": 0.574, "step": 71 }, { "epoch": 1.0, "grad_norm": 0.7970519661903381, "learning_rate": 9.593558481822923e-06, "loss": 0.5541, "step": 72 }, { "epoch": 1.0, "eval_loss": 0.5664608478546143, "eval_runtime": 34.9634, "eval_samples_per_second": 18.362, "eval_steps_per_second": 2.317, "step": 72 }, { "epoch": 1.013986013986014, "grad_norm": 0.6805382370948792, "learning_rate": 9.574256390175192e-06, "loss": 0.5175, "step": 73 }, { "epoch": 1.027972027972028, "grad_norm": 0.6378044486045837, "learning_rate": 9.554526883147926e-06, "loss": 0.5323, "step": 74 }, { "epoch": 1.0419580419580419, "grad_norm": 0.6296578645706177, "learning_rate": 9.534371804252727e-06, "loss": 0.5197, "step": 75 }, { "epoch": 1.055944055944056, "grad_norm": 0.6116400361061096, "learning_rate": 9.513793036766345e-06, "loss": 0.504, "step": 76 }, { "epoch": 1.06993006993007, "grad_norm": 0.6288114190101624, "learning_rate": 9.492792503554695e-06, "loss": 0.5314, "step": 77 }, { "epoch": 1.083916083916084, "grad_norm": 0.6576322913169861, "learning_rate": 9.4713721668932e-06, "loss": 0.5437, "step": 78 }, { "epoch": 1.097902097902098, "grad_norm": 0.5930177569389343, "learning_rate": 9.44953402828342e-06, "loss": 0.5213, "step": 79 }, { "epoch": 1.1118881118881119, "grad_norm": 0.7437406778335571, "learning_rate": 9.427280128266049e-06, "loss": 0.5441, "step": 80 }, { "epoch": 1.1258741258741258, "grad_norm": 0.7347025275230408, "learning_rate": 9.404612546230244e-06, "loss": 0.5078, "step": 81 }, { "epoch": 1.1398601398601398, "grad_norm": 0.6133800148963928, "learning_rate": 9.381533400219319e-06, "loss": 0.5129, "step": 82 }, { "epoch": 1.1538461538461537, "grad_norm": 0.8068645000457764, "learning_rate": 9.358044846732848e-06, "loss": 0.5252, "step": 83 }, { "epoch": 1.167832167832168, "grad_norm": 0.7470645904541016, "learning_rate": 9.334149080525154e-06, "loss": 0.5251, "step": 84 }, { "epoch": 1.1818181818181819, "grad_norm": 0.6085983514785767, "learning_rate": 9.309848334400247e-06, "loss": 0.5119, "step": 85 }, { "epoch": 1.1958041958041958, "grad_norm": 0.6427562236785889, "learning_rate": 9.285144879003173e-06, "loss": 0.5327, "step": 86 }, { "epoch": 1.2097902097902098, "grad_norm": 0.5992908477783203, "learning_rate": 9.26004102260786e-06, "loss": 0.5174, "step": 87 }, { "epoch": 1.2237762237762237, "grad_norm": 0.6650605201721191, "learning_rate": 9.23453911090143e-06, "loss": 0.541, "step": 88 }, { "epoch": 1.2377622377622377, "grad_norm": 0.6733765602111816, "learning_rate": 9.208641526765024e-06, "loss": 0.4968, "step": 89 }, { "epoch": 1.2517482517482517, "grad_norm": 0.5896586775779724, "learning_rate": 9.182350690051134e-06, "loss": 0.5111, "step": 90 }, { "epoch": 1.2517482517482517, "eval_loss": 0.5681217312812805, "eval_runtime": 34.9547, "eval_samples_per_second": 18.367, "eval_steps_per_second": 2.317, "step": 90 }, { "epoch": 1.2657342657342658, "grad_norm": 0.5879291892051697, "learning_rate": 9.155669057357515e-06, "loss": 0.5124, "step": 91 }, { "epoch": 1.2797202797202798, "grad_norm": 0.6704349517822266, "learning_rate": 9.12859912179762e-06, "loss": 0.5264, "step": 92 }, { "epoch": 1.2937062937062938, "grad_norm": 0.7005125284194946, "learning_rate": 9.101143412767665e-06, "loss": 0.5426, "step": 93 }, { "epoch": 1.3076923076923077, "grad_norm": 0.5738447904586792, "learning_rate": 9.073304495710267e-06, "loss": 0.5057, "step": 94 }, { "epoch": 1.3216783216783217, "grad_norm": 0.6039765477180481, "learning_rate": 9.045084971874738e-06, "loss": 0.5106, "step": 95 }, { "epoch": 1.3356643356643356, "grad_norm": 0.6626608967781067, "learning_rate": 9.016487478074032e-06, "loss": 0.5231, "step": 96 }, { "epoch": 1.3496503496503496, "grad_norm": 0.607319176197052, "learning_rate": 8.987514686438353e-06, "loss": 0.5373, "step": 97 }, { "epoch": 1.3636363636363638, "grad_norm": 0.6294829249382019, "learning_rate": 8.95816930416548e-06, "loss": 0.5478, "step": 98 }, { "epoch": 1.3776223776223775, "grad_norm": 0.5931101441383362, "learning_rate": 8.928454073267801e-06, "loss": 0.5183, "step": 99 }, { "epoch": 1.3916083916083917, "grad_norm": 0.5525672435760498, "learning_rate": 8.898371770316113e-06, "loss": 0.5049, "step": 100 }, { "epoch": 1.4055944055944056, "grad_norm": 0.5554185509681702, "learning_rate": 8.867925206180166e-06, "loss": 0.5329, "step": 101 }, { "epoch": 1.4195804195804196, "grad_norm": 0.6104192137718201, "learning_rate": 8.837117225766033e-06, "loss": 0.5421, "step": 102 }, { "epoch": 1.4335664335664335, "grad_norm": 0.5591093897819519, "learning_rate": 8.805950707750268e-06, "loss": 0.5434, "step": 103 }, { "epoch": 1.4475524475524475, "grad_norm": 0.5589428544044495, "learning_rate": 8.774428564310939e-06, "loss": 0.5159, "step": 104 }, { "epoch": 1.4615384615384617, "grad_norm": 0.580699622631073, "learning_rate": 8.742553740855507e-06, "loss": 0.5143, "step": 105 }, { "epoch": 1.4755244755244754, "grad_norm": 0.6007757186889648, "learning_rate": 8.710329215745612e-06, "loss": 0.5066, "step": 106 }, { "epoch": 1.4895104895104896, "grad_norm": 0.6713395118713379, "learning_rate": 8.677758000018777e-06, "loss": 0.5318, "step": 107 }, { "epoch": 1.5034965034965035, "grad_norm": 0.5536379814147949, "learning_rate": 8.644843137107058e-06, "loss": 0.5159, "step": 108 }, { "epoch": 1.5034965034965035, "eval_loss": 0.5661691427230835, "eval_runtime": 35.3668, "eval_samples_per_second": 18.153, "eval_steps_per_second": 2.29, "step": 108 }, { "epoch": 1.5174825174825175, "grad_norm": 0.645210325717926, "learning_rate": 8.61158770255267e-06, "loss": 0.5312, "step": 109 }, { "epoch": 1.5314685314685315, "grad_norm": 0.601094126701355, "learning_rate": 8.577994803720605e-06, "loss": 0.5394, "step": 110 }, { "epoch": 1.5454545454545454, "grad_norm": 0.5418203473091125, "learning_rate": 8.544067579508292e-06, "loss": 0.5264, "step": 111 }, { "epoch": 1.5594405594405596, "grad_norm": 0.5513077974319458, "learning_rate": 8.509809200052286e-06, "loss": 0.5269, "step": 112 }, { "epoch": 1.5734265734265733, "grad_norm": 0.6063372492790222, "learning_rate": 8.475222866432065e-06, "loss": 0.5199, "step": 113 }, { "epoch": 1.5874125874125875, "grad_norm": 0.5637122988700867, "learning_rate": 8.440311810370921e-06, "loss": 0.5342, "step": 114 }, { "epoch": 1.6013986013986012, "grad_norm": 0.5762498378753662, "learning_rate": 8.405079293933986e-06, "loss": 0.5419, "step": 115 }, { "epoch": 1.6153846153846154, "grad_norm": 0.557772159576416, "learning_rate": 8.36952860922343e-06, "loss": 0.5217, "step": 116 }, { "epoch": 1.6293706293706294, "grad_norm": 0.6382875442504883, "learning_rate": 8.333663078070845e-06, "loss": 0.5366, "step": 117 }, { "epoch": 1.6433566433566433, "grad_norm": 0.5209150910377502, "learning_rate": 8.297486051726864e-06, "loss": 0.5087, "step": 118 }, { "epoch": 1.6573426573426573, "grad_norm": 0.5415475964546204, "learning_rate": 8.26100091054801e-06, "loss": 0.5026, "step": 119 }, { "epoch": 1.6713286713286712, "grad_norm": 0.6667906641960144, "learning_rate": 8.224211063680854e-06, "loss": 0.5224, "step": 120 }, { "epoch": 1.6853146853146854, "grad_norm": 0.573965311050415, "learning_rate": 8.18711994874345e-06, "loss": 0.538, "step": 121 }, { "epoch": 1.6993006993006992, "grad_norm": 0.6206014156341553, "learning_rate": 8.149731031504136e-06, "loss": 0.5161, "step": 122 }, { "epoch": 1.7132867132867133, "grad_norm": 0.6324427127838135, "learning_rate": 8.112047805557693e-06, "loss": 0.5407, "step": 123 }, { "epoch": 1.7272727272727273, "grad_norm": 0.5460613965988159, "learning_rate": 8.074073791998907e-06, "loss": 0.5238, "step": 124 }, { "epoch": 1.7412587412587412, "grad_norm": 0.5684161186218262, "learning_rate": 8.035812539093557e-06, "loss": 0.5166, "step": 125 }, { "epoch": 1.7552447552447552, "grad_norm": 0.6114190816879272, "learning_rate": 7.997267621946871e-06, "loss": 0.5212, "step": 126 }, { "epoch": 1.7552447552447552, "eval_loss": 0.5644441843032837, "eval_runtime": 34.8941, "eval_samples_per_second": 18.399, "eval_steps_per_second": 2.321, "step": 126 }, { "epoch": 1.7692307692307692, "grad_norm": 0.5791452527046204, "learning_rate": 7.958442642169469e-06, "loss": 0.5219, "step": 127 }, { "epoch": 1.7832167832167833, "grad_norm": 0.5814895033836365, "learning_rate": 7.919341227540828e-06, "loss": 0.5492, "step": 128 }, { "epoch": 1.797202797202797, "grad_norm": 0.5562170147895813, "learning_rate": 7.879967031670313e-06, "loss": 0.5065, "step": 129 }, { "epoch": 1.8111888111888113, "grad_norm": 0.5666476488113403, "learning_rate": 7.84032373365578e-06, "loss": 0.508, "step": 130 }, { "epoch": 1.8251748251748252, "grad_norm": 0.6123917102813721, "learning_rate": 7.800415037739802e-06, "loss": 0.5245, "step": 131 }, { "epoch": 1.8391608391608392, "grad_norm": 0.6137180924415588, "learning_rate": 7.760244672963548e-06, "loss": 0.5281, "step": 132 }, { "epoch": 1.8531468531468531, "grad_norm": 0.5444206595420837, "learning_rate": 7.719816392818354e-06, "loss": 0.496, "step": 133 }, { "epoch": 1.867132867132867, "grad_norm": 0.5935954451560974, "learning_rate": 7.679133974894984e-06, "loss": 0.5164, "step": 134 }, { "epoch": 1.8811188811188813, "grad_norm": 0.568263828754425, "learning_rate": 7.638201220530664e-06, "loss": 0.509, "step": 135 }, { "epoch": 1.895104895104895, "grad_norm": 0.641503095626831, "learning_rate": 7.597021954453887e-06, "loss": 0.5389, "step": 136 }, { "epoch": 1.9090909090909092, "grad_norm": 0.5866712927818298, "learning_rate": 7.555600024427028e-06, "loss": 0.5163, "step": 137 }, { "epoch": 1.9230769230769231, "grad_norm": 0.559259831905365, "learning_rate": 7.513939300886816e-06, "loss": 0.5074, "step": 138 }, { "epoch": 1.937062937062937, "grad_norm": 0.5635555386543274, "learning_rate": 7.472043676582685e-06, "loss": 0.5184, "step": 139 }, { "epoch": 1.951048951048951, "grad_norm": 0.6236100196838379, "learning_rate": 7.42991706621303e-06, "loss": 0.5162, "step": 140 }, { "epoch": 1.965034965034965, "grad_norm": 0.60297691822052, "learning_rate": 7.387563406059433e-06, "loss": 0.5123, "step": 141 }, { "epoch": 1.9790209790209792, "grad_norm": 0.5734803080558777, "learning_rate": 7.344986653618844e-06, "loss": 0.5281, "step": 142 }, { "epoch": 1.993006993006993, "grad_norm": 0.561177134513855, "learning_rate": 7.302190787233808e-06, "loss": 0.5256, "step": 143 }, { "epoch": 2.0, "grad_norm": 0.6918484568595886, "learning_rate": 7.259179805720726e-06, "loss": 0.4956, "step": 144 }, { "epoch": 2.0, "eval_loss": 0.5634886622428894, "eval_runtime": 34.1505, "eval_samples_per_second": 18.799, "eval_steps_per_second": 2.372, "step": 144 }, { "epoch": 2.013986013986014, "grad_norm": 0.6467083096504211, "learning_rate": 7.215957727996208e-06, "loss": 0.4757, "step": 145 }, { "epoch": 2.027972027972028, "grad_norm": 0.628153920173645, "learning_rate": 7.17252859270155e-06, "loss": 0.4701, "step": 146 }, { "epoch": 2.041958041958042, "grad_norm": 0.6287585496902466, "learning_rate": 7.128896457825364e-06, "loss": 0.4334, "step": 147 }, { "epoch": 2.055944055944056, "grad_norm": 0.5704949498176575, "learning_rate": 7.085065400324407e-06, "loss": 0.4723, "step": 148 }, { "epoch": 2.06993006993007, "grad_norm": 0.6293634176254272, "learning_rate": 7.041039515742626e-06, "loss": 0.4875, "step": 149 }, { "epoch": 2.0839160839160837, "grad_norm": 0.7220337390899658, "learning_rate": 6.9968229178284775e-06, "loss": 0.4809, "step": 150 }, { "epoch": 2.097902097902098, "grad_norm": 0.5713090896606445, "learning_rate": 6.952419738150546e-06, "loss": 0.4998, "step": 151 }, { "epoch": 2.111888111888112, "grad_norm": 0.6713567972183228, "learning_rate": 6.9078341257114765e-06, "loss": 0.4837, "step": 152 }, { "epoch": 2.125874125874126, "grad_norm": 0.6542858481407166, "learning_rate": 6.863070246560319e-06, "loss": 0.4798, "step": 153 }, { "epoch": 2.13986013986014, "grad_norm": 0.5555688738822937, "learning_rate": 6.818132283403236e-06, "loss": 0.4593, "step": 154 }, { "epoch": 2.1538461538461537, "grad_norm": 0.5947204232215881, "learning_rate": 6.773024435212678e-06, "loss": 0.4831, "step": 155 }, { "epoch": 2.167832167832168, "grad_norm": 0.6230157613754272, "learning_rate": 6.7277509168350445e-06, "loss": 0.4634, "step": 156 }, { "epoch": 2.1818181818181817, "grad_norm": 0.5586286783218384, "learning_rate": 6.6823159585968355e-06, "loss": 0.4803, "step": 157 }, { "epoch": 2.195804195804196, "grad_norm": 0.5558333396911621, "learning_rate": 6.636723805909384e-06, "loss": 0.4734, "step": 158 }, { "epoch": 2.20979020979021, "grad_norm": 0.5960513949394226, "learning_rate": 6.590978718872166e-06, "loss": 0.4746, "step": 159 }, { "epoch": 2.2237762237762237, "grad_norm": 0.5779184103012085, "learning_rate": 6.545084971874738e-06, "loss": 0.4499, "step": 160 }, { "epoch": 2.237762237762238, "grad_norm": 0.5827864408493042, "learning_rate": 6.499046853197338e-06, "loss": 0.4826, "step": 161 }, { "epoch": 2.2517482517482517, "grad_norm": 0.6769295930862427, "learning_rate": 6.452868664610197e-06, "loss": 0.4797, "step": 162 }, { "epoch": 2.2517482517482517, "eval_loss": 0.5764052271842957, "eval_runtime": 34.051, "eval_samples_per_second": 18.854, "eval_steps_per_second": 2.379, "step": 162 }, { "epoch": 2.265734265734266, "grad_norm": 0.5850751996040344, "learning_rate": 6.406554720971583e-06, "loss": 0.4829, "step": 163 }, { "epoch": 2.2797202797202796, "grad_norm": 0.5925103425979614, "learning_rate": 6.3601093498246215e-06, "loss": 0.4936, "step": 164 }, { "epoch": 2.2937062937062938, "grad_norm": 0.5747277140617371, "learning_rate": 6.313536890992935e-06, "loss": 0.4686, "step": 165 }, { "epoch": 2.3076923076923075, "grad_norm": 0.6141413450241089, "learning_rate": 6.266841696175132e-06, "loss": 0.4659, "step": 166 }, { "epoch": 2.3216783216783217, "grad_norm": 0.5214844942092896, "learning_rate": 6.220028128538188e-06, "loss": 0.4714, "step": 167 }, { "epoch": 2.335664335664336, "grad_norm": 0.6260507106781006, "learning_rate": 6.173100562309751e-06, "loss": 0.4731, "step": 168 }, { "epoch": 2.3496503496503496, "grad_norm": 0.6246528625488281, "learning_rate": 6.1260633823694224e-06, "loss": 0.4575, "step": 169 }, { "epoch": 2.3636363636363638, "grad_norm": 0.5592030882835388, "learning_rate": 6.078920983839032e-06, "loss": 0.4293, "step": 170 }, { "epoch": 2.3776223776223775, "grad_norm": 0.5436908602714539, "learning_rate": 6.031677771671962e-06, "loss": 0.4821, "step": 171 }, { "epoch": 2.3916083916083917, "grad_norm": 0.5873638987541199, "learning_rate": 5.984338160241552e-06, "loss": 0.4755, "step": 172 }, { "epoch": 2.4055944055944054, "grad_norm": 0.6056978106498718, "learning_rate": 5.936906572928625e-06, "loss": 0.479, "step": 173 }, { "epoch": 2.4195804195804196, "grad_norm": 0.5452414751052856, "learning_rate": 5.889387441708162e-06, "loss": 0.4545, "step": 174 }, { "epoch": 2.4335664335664333, "grad_norm": 0.5708940625190735, "learning_rate": 5.841785206735192e-06, "loss": 0.4706, "step": 175 }, { "epoch": 2.4475524475524475, "grad_norm": 0.5819888114929199, "learning_rate": 5.794104315929904e-06, "loss": 0.4608, "step": 176 }, { "epoch": 2.4615384615384617, "grad_norm": 0.5468575358390808, "learning_rate": 5.746349224562021e-06, "loss": 0.4696, "step": 177 }, { "epoch": 2.4755244755244754, "grad_norm": 0.6171605587005615, "learning_rate": 5.698524394834531e-06, "loss": 0.4809, "step": 178 }, { "epoch": 2.4895104895104896, "grad_norm": 0.6046556234359741, "learning_rate": 5.650634295466717e-06, "loss": 0.4727, "step": 179 }, { "epoch": 2.5034965034965033, "grad_norm": 0.5517058968544006, "learning_rate": 5.6026834012766155e-06, "loss": 0.4728, "step": 180 }, { "epoch": 2.5034965034965033, "eval_loss": 0.5757314562797546, "eval_runtime": 34.5495, "eval_samples_per_second": 18.582, "eval_steps_per_second": 2.344, "step": 180 }, { "epoch": 2.5174825174825175, "grad_norm": 0.5916588306427002, "learning_rate": 5.554676192762891e-06, "loss": 0.4738, "step": 181 }, { "epoch": 2.5314685314685317, "grad_norm": 0.596782386302948, "learning_rate": 5.506617155686177e-06, "loss": 0.4725, "step": 182 }, { "epoch": 2.5454545454545454, "grad_norm": 0.5784814357757568, "learning_rate": 5.458510780649932e-06, "loss": 0.4743, "step": 183 }, { "epoch": 2.5594405594405596, "grad_norm": 0.5162186622619629, "learning_rate": 5.4103615626808426e-06, "loss": 0.4501, "step": 184 }, { "epoch": 2.5734265734265733, "grad_norm": 0.5629183053970337, "learning_rate": 5.362174000808813e-06, "loss": 0.4631, "step": 185 }, { "epoch": 2.5874125874125875, "grad_norm": 0.5455092191696167, "learning_rate": 5.3139525976465675e-06, "loss": 0.4839, "step": 186 }, { "epoch": 2.6013986013986012, "grad_norm": 0.6234388947486877, "learning_rate": 5.265701858968944e-06, "loss": 0.4729, "step": 187 }, { "epoch": 2.6153846153846154, "grad_norm": 0.5270193815231323, "learning_rate": 5.217426293291869e-06, "loss": 0.4767, "step": 188 }, { "epoch": 2.629370629370629, "grad_norm": 0.5291939973831177, "learning_rate": 5.169130411451083e-06, "loss": 0.4659, "step": 189 }, { "epoch": 2.6433566433566433, "grad_norm": 0.5210967063903809, "learning_rate": 5.120818726180662e-06, "loss": 0.4532, "step": 190 }, { "epoch": 2.6573426573426575, "grad_norm": 0.5697853565216064, "learning_rate": 5.072495751691338e-06, "loss": 0.4669, "step": 191 }, { "epoch": 2.6713286713286712, "grad_norm": 0.4967118203639984, "learning_rate": 5.024166003248703e-06, "loss": 0.4777, "step": 192 }, { "epoch": 2.6853146853146854, "grad_norm": 0.5514243245124817, "learning_rate": 4.9758339967512995e-06, "loss": 0.4689, "step": 193 }, { "epoch": 2.699300699300699, "grad_norm": 0.5476483702659607, "learning_rate": 4.927504248308663e-06, "loss": 0.4898, "step": 194 }, { "epoch": 2.7132867132867133, "grad_norm": 0.5073778033256531, "learning_rate": 4.87918127381934e-06, "loss": 0.4462, "step": 195 }, { "epoch": 2.7272727272727275, "grad_norm": 0.5061259865760803, "learning_rate": 4.830869588548918e-06, "loss": 0.4811, "step": 196 }, { "epoch": 2.7412587412587412, "grad_norm": 0.532632052898407, "learning_rate": 4.782573706708133e-06, "loss": 0.4514, "step": 197 }, { "epoch": 2.755244755244755, "grad_norm": 0.5079967379570007, "learning_rate": 4.734298141031057e-06, "loss": 0.4706, "step": 198 }, { "epoch": 2.755244755244755, "eval_loss": 0.5748186111450195, "eval_runtime": 34.6547, "eval_samples_per_second": 18.526, "eval_steps_per_second": 2.337, "step": 198 }, { "epoch": 2.769230769230769, "grad_norm": 0.5450592637062073, "learning_rate": 4.686047402353433e-06, "loss": 0.4717, "step": 199 }, { "epoch": 2.7832167832167833, "grad_norm": 0.4929758906364441, "learning_rate": 4.637825999191189e-06, "loss": 0.469, "step": 200 }, { "epoch": 2.797202797202797, "grad_norm": 0.514842689037323, "learning_rate": 4.589638437319157e-06, "loss": 0.4848, "step": 201 }, { "epoch": 2.8111888111888113, "grad_norm": 0.5259736776351929, "learning_rate": 4.541489219350069e-06, "loss": 0.4676, "step": 202 }, { "epoch": 2.825174825174825, "grad_norm": 0.571843683719635, "learning_rate": 4.493382844313826e-06, "loss": 0.482, "step": 203 }, { "epoch": 2.839160839160839, "grad_norm": 0.49216270446777344, "learning_rate": 4.445323807237112e-06, "loss": 0.479, "step": 204 }, { "epoch": 2.8531468531468533, "grad_norm": 0.5383098721504211, "learning_rate": 4.397316598723385e-06, "loss": 0.4517, "step": 205 }, { "epoch": 2.867132867132867, "grad_norm": 0.5011985898017883, "learning_rate": 4.349365704533285e-06, "loss": 0.4678, "step": 206 }, { "epoch": 2.8811188811188813, "grad_norm": 0.5291906595230103, "learning_rate": 4.301475605165471e-06, "loss": 0.4717, "step": 207 }, { "epoch": 2.895104895104895, "grad_norm": 0.5500873923301697, "learning_rate": 4.25365077543798e-06, "loss": 0.4572, "step": 208 }, { "epoch": 2.909090909090909, "grad_norm": 0.5690264105796814, "learning_rate": 4.205895684070099e-06, "loss": 0.4675, "step": 209 }, { "epoch": 2.9230769230769234, "grad_norm": 0.4746716022491455, "learning_rate": 4.158214793264808e-06, "loss": 0.4579, "step": 210 }, { "epoch": 2.937062937062937, "grad_norm": 0.5113067626953125, "learning_rate": 4.1106125582918385e-06, "loss": 0.5104, "step": 211 }, { "epoch": 2.951048951048951, "grad_norm": 0.5272907018661499, "learning_rate": 4.063093427071376e-06, "loss": 0.4532, "step": 212 }, { "epoch": 2.965034965034965, "grad_norm": 0.5059399008750916, "learning_rate": 4.01566183975845e-06, "loss": 0.4555, "step": 213 }, { "epoch": 2.979020979020979, "grad_norm": 0.4909096658229828, "learning_rate": 3.968322228328041e-06, "loss": 0.4785, "step": 214 }, { "epoch": 2.993006993006993, "grad_norm": 0.5192479491233826, "learning_rate": 3.92107901616097e-06, "loss": 0.4477, "step": 215 }, { "epoch": 3.0, "grad_norm": 0.7363195419311523, "learning_rate": 3.873936617630578e-06, "loss": 0.4927, "step": 216 }, { "epoch": 3.0, "eval_loss": 0.5740084052085876, "eval_runtime": 34.8551, "eval_samples_per_second": 18.419, "eval_steps_per_second": 2.324, "step": 216 }, { "epoch": 3.013986013986014, "grad_norm": 0.5987377762794495, "learning_rate": 3.82689943769025e-06, "loss": 0.4246, "step": 217 }, { "epoch": 3.027972027972028, "grad_norm": 0.589948832988739, "learning_rate": 3.779971871461813e-06, "loss": 0.4367, "step": 218 }, { "epoch": 3.041958041958042, "grad_norm": 0.5003005862236023, "learning_rate": 3.7331583038248688e-06, "loss": 0.4346, "step": 219 }, { "epoch": 3.055944055944056, "grad_norm": 0.528349757194519, "learning_rate": 3.6864631090070656e-06, "loss": 0.3993, "step": 220 }, { "epoch": 3.06993006993007, "grad_norm": 0.5285301208496094, "learning_rate": 3.639890650175379e-06, "loss": 0.419, "step": 221 }, { "epoch": 3.0839160839160837, "grad_norm": 0.5721102356910706, "learning_rate": 3.593445279028418e-06, "loss": 0.4328, "step": 222 }, { "epoch": 3.097902097902098, "grad_norm": 0.5271673202514648, "learning_rate": 3.5471313353898056e-06, "loss": 0.4252, "step": 223 }, { "epoch": 3.111888111888112, "grad_norm": 0.5354319214820862, "learning_rate": 3.5009531468026646e-06, "loss": 0.4367, "step": 224 }, { "epoch": 3.125874125874126, "grad_norm": 0.5849824547767639, "learning_rate": 3.4549150281252635e-06, "loss": 0.4263, "step": 225 }, { "epoch": 3.13986013986014, "grad_norm": 0.6300305128097534, "learning_rate": 3.409021281127835e-06, "loss": 0.4331, "step": 226 }, { "epoch": 3.1538461538461537, "grad_norm": 0.5985769033432007, "learning_rate": 3.3632761940906167e-06, "loss": 0.4316, "step": 227 }, { "epoch": 3.167832167832168, "grad_norm": 0.5028027296066284, "learning_rate": 3.3176840414031653e-06, "loss": 0.4243, "step": 228 }, { "epoch": 3.1818181818181817, "grad_norm": 0.5299258232116699, "learning_rate": 3.2722490831649568e-06, "loss": 0.4166, "step": 229 }, { "epoch": 3.195804195804196, "grad_norm": 0.5425248742103577, "learning_rate": 3.226975564787322e-06, "loss": 0.4389, "step": 230 }, { "epoch": 3.20979020979021, "grad_norm": 0.5929123759269714, "learning_rate": 3.181867716596765e-06, "loss": 0.4288, "step": 231 }, { "epoch": 3.2237762237762237, "grad_norm": 0.5462735891342163, "learning_rate": 3.1369297534396823e-06, "loss": 0.4434, "step": 232 }, { "epoch": 3.237762237762238, "grad_norm": 0.4862322211265564, "learning_rate": 3.092165874288525e-06, "loss": 0.4133, "step": 233 }, { "epoch": 3.2517482517482517, "grad_norm": 0.48885804414749146, "learning_rate": 3.0475802618494564e-06, "loss": 0.4426, "step": 234 }, { "epoch": 3.2517482517482517, "eval_loss": 0.5924859046936035, "eval_runtime": 34.7085, "eval_samples_per_second": 18.497, "eval_steps_per_second": 2.334, "step": 234 }, { "epoch": 3.265734265734266, "grad_norm": 0.4652189314365387, "learning_rate": 3.0031770821715233e-06, "loss": 0.4189, "step": 235 }, { "epoch": 3.2797202797202796, "grad_norm": 0.5389134883880615, "learning_rate": 2.9589604842573762e-06, "loss": 0.4226, "step": 236 }, { "epoch": 3.2937062937062938, "grad_norm": 0.507276177406311, "learning_rate": 2.914934599675594e-06, "loss": 0.4084, "step": 237 }, { "epoch": 3.3076923076923075, "grad_norm": 0.4876704216003418, "learning_rate": 2.871103542174637e-06, "loss": 0.4256, "step": 238 }, { "epoch": 3.3216783216783217, "grad_norm": 0.48441073298454285, "learning_rate": 2.827471407298451e-06, "loss": 0.4297, "step": 239 }, { "epoch": 3.335664335664336, "grad_norm": 0.4634881317615509, "learning_rate": 2.7840422720037943e-06, "loss": 0.4227, "step": 240 }, { "epoch": 3.3496503496503496, "grad_norm": 0.49520549178123474, "learning_rate": 2.7408201942792755e-06, "loss": 0.414, "step": 241 }, { "epoch": 3.3636363636363638, "grad_norm": 0.4892767369747162, "learning_rate": 2.697809212766195e-06, "loss": 0.4326, "step": 242 }, { "epoch": 3.3776223776223775, "grad_norm": 0.4968920052051544, "learning_rate": 2.655013346381158e-06, "loss": 0.4327, "step": 243 }, { "epoch": 3.3916083916083917, "grad_norm": 0.4823973476886749, "learning_rate": 2.612436593940568e-06, "loss": 0.4329, "step": 244 }, { "epoch": 3.4055944055944054, "grad_norm": 0.4838135540485382, "learning_rate": 2.57008293378697e-06, "loss": 0.4206, "step": 245 }, { "epoch": 3.4195804195804196, "grad_norm": 0.47422581911087036, "learning_rate": 2.5279563234173177e-06, "loss": 0.4336, "step": 246 }, { "epoch": 3.4335664335664333, "grad_norm": 0.4846055209636688, "learning_rate": 2.4860606991131857e-06, "loss": 0.4184, "step": 247 }, { "epoch": 3.4475524475524475, "grad_norm": 0.5305242538452148, "learning_rate": 2.444399975572974e-06, "loss": 0.4394, "step": 248 }, { "epoch": 3.4615384615384617, "grad_norm": 0.487332820892334, "learning_rate": 2.402978045546114e-06, "loss": 0.4033, "step": 249 }, { "epoch": 3.4755244755244754, "grad_norm": 0.4706343114376068, "learning_rate": 2.3617987794693358e-06, "loss": 0.4408, "step": 250 }, { "epoch": 3.4895104895104896, "grad_norm": 0.503103494644165, "learning_rate": 2.320866025105016e-06, "loss": 0.4166, "step": 251 }, { "epoch": 3.5034965034965033, "grad_norm": 0.5077600479125977, "learning_rate": 2.2801836071816476e-06, "loss": 0.4423, "step": 252 }, { "epoch": 3.5034965034965033, "eval_loss": 0.5952551364898682, "eval_runtime": 33.5546, "eval_samples_per_second": 19.133, "eval_steps_per_second": 2.414, "step": 252 }, { "epoch": 3.5174825174825175, "grad_norm": 0.48870253562927246, "learning_rate": 2.2397553270364546e-06, "loss": 0.4241, "step": 253 }, { "epoch": 3.5314685314685317, "grad_norm": 0.4966093897819519, "learning_rate": 2.1995849622602017e-06, "loss": 0.4396, "step": 254 }, { "epoch": 3.5454545454545454, "grad_norm": 0.4564977288246155, "learning_rate": 2.159676266344222e-06, "loss": 0.4223, "step": 255 }, { "epoch": 3.5594405594405596, "grad_norm": 0.46915507316589355, "learning_rate": 2.120032968329687e-06, "loss": 0.4283, "step": 256 }, { "epoch": 3.5734265734265733, "grad_norm": 0.49805694818496704, "learning_rate": 2.0806587724591725e-06, "loss": 0.4382, "step": 257 }, { "epoch": 3.5874125874125875, "grad_norm": 0.48657479882240295, "learning_rate": 2.0415573578305343e-06, "loss": 0.4378, "step": 258 }, { "epoch": 3.6013986013986012, "grad_norm": 0.46977299451828003, "learning_rate": 2.0027323780531312e-06, "loss": 0.4224, "step": 259 }, { "epoch": 3.6153846153846154, "grad_norm": 0.49343907833099365, "learning_rate": 1.9641874609064443e-06, "loss": 0.4088, "step": 260 }, { "epoch": 3.629370629370629, "grad_norm": 0.4801478385925293, "learning_rate": 1.9259262080010938e-06, "loss": 0.419, "step": 261 }, { "epoch": 3.6433566433566433, "grad_norm": 0.4632829427719116, "learning_rate": 1.887952194442309e-06, "loss": 0.4185, "step": 262 }, { "epoch": 3.6573426573426575, "grad_norm": 0.4722610414028168, "learning_rate": 1.8502689684958664e-06, "loss": 0.4223, "step": 263 }, { "epoch": 3.6713286713286712, "grad_norm": 0.46521317958831787, "learning_rate": 1.8128800512565514e-06, "loss": 0.4311, "step": 264 }, { "epoch": 3.6853146853146854, "grad_norm": 0.49360647797584534, "learning_rate": 1.7757889363191484e-06, "loss": 0.4336, "step": 265 }, { "epoch": 3.699300699300699, "grad_norm": 0.46490150690078735, "learning_rate": 1.738999089451991e-06, "loss": 0.41, "step": 266 }, { "epoch": 3.7132867132867133, "grad_norm": 0.47419989109039307, "learning_rate": 1.7025139482731385e-06, "loss": 0.4489, "step": 267 }, { "epoch": 3.7272727272727275, "grad_norm": 0.4471936821937561, "learning_rate": 1.6663369219291558e-06, "loss": 0.4075, "step": 268 }, { "epoch": 3.7412587412587412, "grad_norm": 0.4871998727321625, "learning_rate": 1.6304713907765713e-06, "loss": 0.4138, "step": 269 }, { "epoch": 3.755244755244755, "grad_norm": 0.4558921754360199, "learning_rate": 1.5949207060660138e-06, "loss": 0.4209, "step": 270 }, { "epoch": 3.755244755244755, "eval_loss": 0.5941651463508606, "eval_runtime": 34.8033, "eval_samples_per_second": 18.447, "eval_steps_per_second": 2.327, "step": 270 }, { "epoch": 3.769230769230769, "grad_norm": 0.43444135785102844, "learning_rate": 1.55968818962908e-06, "loss": 0.4186, "step": 271 }, { "epoch": 3.7832167832167833, "grad_norm": 0.47602659463882446, "learning_rate": 1.5247771335679372e-06, "loss": 0.4138, "step": 272 }, { "epoch": 3.797202797202797, "grad_norm": 0.4794568121433258, "learning_rate": 1.4901907999477167e-06, "loss": 0.4512, "step": 273 }, { "epoch": 3.8111888111888113, "grad_norm": 0.47370994091033936, "learning_rate": 1.4559324204917102e-06, "loss": 0.4446, "step": 274 }, { "epoch": 3.825174825174825, "grad_norm": 0.4493069052696228, "learning_rate": 1.4220051962793952e-06, "loss": 0.4316, "step": 275 }, { "epoch": 3.839160839160839, "grad_norm": 0.4439810812473297, "learning_rate": 1.3884122974473307e-06, "loss": 0.4276, "step": 276 }, { "epoch": 3.8531468531468533, "grad_norm": 0.44139519333839417, "learning_rate": 1.3551568628929434e-06, "loss": 0.427, "step": 277 }, { "epoch": 3.867132867132867, "grad_norm": 0.45054903626441956, "learning_rate": 1.3222419999812248e-06, "loss": 0.4356, "step": 278 }, { "epoch": 3.8811188811188813, "grad_norm": 0.44140151143074036, "learning_rate": 1.2896707842543898e-06, "loss": 0.4287, "step": 279 }, { "epoch": 3.895104895104895, "grad_norm": 0.4277818202972412, "learning_rate": 1.257446259144494e-06, "loss": 0.4298, "step": 280 }, { "epoch": 3.909090909090909, "grad_norm": 0.4403057098388672, "learning_rate": 1.225571435689062e-06, "loss": 0.4185, "step": 281 }, { "epoch": 3.9230769230769234, "grad_norm": 0.4724678099155426, "learning_rate": 1.1940492922497337e-06, "loss": 0.4465, "step": 282 }, { "epoch": 3.937062937062937, "grad_norm": 0.47128820419311523, "learning_rate": 1.1628827742339688e-06, "loss": 0.4126, "step": 283 }, { "epoch": 3.951048951048951, "grad_norm": 0.4331970512866974, "learning_rate": 1.1320747938198356e-06, "loss": 0.4105, "step": 284 }, { "epoch": 3.965034965034965, "grad_norm": 0.4537077844142914, "learning_rate": 1.1016282296838887e-06, "loss": 0.4257, "step": 285 }, { "epoch": 3.979020979020979, "grad_norm": 0.46981024742126465, "learning_rate": 1.0715459267321998e-06, "loss": 0.4336, "step": 286 }, { "epoch": 3.993006993006993, "grad_norm": 0.4497096538543701, "learning_rate": 1.0418306958345214e-06, "loss": 0.4326, "step": 287 }, { "epoch": 4.0, "grad_norm": 0.6176419258117676, "learning_rate": 1.0124853135616475e-06, "loss": 0.4261, "step": 288 }, { "epoch": 4.0, "eval_loss": 0.594137966632843, "eval_runtime": 35.3287, "eval_samples_per_second": 18.172, "eval_steps_per_second": 2.293, "step": 288 }, { "epoch": 4.013986013986014, "grad_norm": 0.48881927132606506, "learning_rate": 9.835125219259694e-07, "loss": 0.4126, "step": 289 }, { "epoch": 4.027972027972028, "grad_norm": 0.47744905948638916, "learning_rate": 9.549150281252633e-07, "loss": 0.3887, "step": 290 }, { "epoch": 4.041958041958042, "grad_norm": 0.4749980568885803, "learning_rate": 9.266955042897357e-07, "loss": 0.4085, "step": 291 }, { "epoch": 4.055944055944056, "grad_norm": 0.4653206169605255, "learning_rate": 8.988565872323362e-07, "loss": 0.3949, "step": 292 }, { "epoch": 4.06993006993007, "grad_norm": 0.44160446524620056, "learning_rate": 8.714008782023797e-07, "loss": 0.4049, "step": 293 }, { "epoch": 4.083916083916084, "grad_norm": 0.43797171115875244, "learning_rate": 8.443309426424862e-07, "loss": 0.4038, "step": 294 }, { "epoch": 4.0979020979020975, "grad_norm": 0.4569723904132843, "learning_rate": 8.176493099488664e-07, "loss": 0.3956, "step": 295 }, { "epoch": 4.111888111888112, "grad_norm": 0.47445249557495117, "learning_rate": 7.913584732349788e-07, "loss": 0.4107, "step": 296 }, { "epoch": 4.125874125874126, "grad_norm": 0.46384716033935547, "learning_rate": 7.654608890985709e-07, "loss": 0.3895, "step": 297 }, { "epoch": 4.13986013986014, "grad_norm": 0.47651711106300354, "learning_rate": 7.399589773921412e-07, "loss": 0.3859, "step": 298 }, { "epoch": 4.153846153846154, "grad_norm": 0.4623275697231293, "learning_rate": 7.148551209968279e-07, "loss": 0.394, "step": 299 }, { "epoch": 4.1678321678321675, "grad_norm": 0.4649985432624817, "learning_rate": 6.901516655997536e-07, "loss": 0.4108, "step": 300 }, { "epoch": 4.181818181818182, "grad_norm": 0.4691464304924011, "learning_rate": 6.658509194748463e-07, "loss": 0.3626, "step": 301 }, { "epoch": 4.195804195804196, "grad_norm": 0.48455217480659485, "learning_rate": 6.419551532671542e-07, "loss": 0.4172, "step": 302 }, { "epoch": 4.20979020979021, "grad_norm": 0.482030987739563, "learning_rate": 6.184665997806832e-07, "loss": 0.4038, "step": 303 }, { "epoch": 4.223776223776224, "grad_norm": 0.4398139715194702, "learning_rate": 5.953874537697573e-07, "loss": 0.4033, "step": 304 }, { "epoch": 4.2377622377622375, "grad_norm": 0.46925652027130127, "learning_rate": 5.727198717339511e-07, "loss": 0.4091, "step": 305 }, { "epoch": 4.251748251748252, "grad_norm": 0.46952134370803833, "learning_rate": 5.504659717165812e-07, "loss": 0.4111, "step": 306 }, { "epoch": 4.251748251748252, "eval_loss": 0.6070981025695801, "eval_runtime": 35.5097, "eval_samples_per_second": 18.08, "eval_steps_per_second": 2.281, "step": 306 }, { "epoch": 4.265734265734266, "grad_norm": 0.45535174012184143, "learning_rate": 5.286278331068018e-07, "loss": 0.4128, "step": 307 }, { "epoch": 4.27972027972028, "grad_norm": 0.4438033998012543, "learning_rate": 5.072074964453055e-07, "loss": 0.4052, "step": 308 }, { "epoch": 4.293706293706293, "grad_norm": 0.4887377917766571, "learning_rate": 4.862069632336558e-07, "loss": 0.3894, "step": 309 }, { "epoch": 4.3076923076923075, "grad_norm": 0.4616340100765228, "learning_rate": 4.6562819574727304e-07, "loss": 0.4242, "step": 310 }, { "epoch": 4.321678321678322, "grad_norm": 0.44037091732025146, "learning_rate": 4.454731168520754e-07, "loss": 0.4052, "step": 311 }, { "epoch": 4.335664335664336, "grad_norm": 0.4455097019672394, "learning_rate": 4.257436098248091e-07, "loss": 0.3882, "step": 312 }, { "epoch": 4.34965034965035, "grad_norm": 0.47457605600357056, "learning_rate": 4.064415181770787e-07, "loss": 0.4102, "step": 313 }, { "epoch": 4.363636363636363, "grad_norm": 0.4474296271800995, "learning_rate": 3.875686454830885e-07, "loss": 0.3866, "step": 314 }, { "epoch": 4.3776223776223775, "grad_norm": 0.44111815094947815, "learning_rate": 3.691267552111183e-07, "loss": 0.4091, "step": 315 }, { "epoch": 4.391608391608392, "grad_norm": 0.46066638827323914, "learning_rate": 3.511175705587433e-07, "loss": 0.422, "step": 316 }, { "epoch": 4.405594405594406, "grad_norm": 0.4345090389251709, "learning_rate": 3.3354277429182626e-07, "loss": 0.3882, "step": 317 }, { "epoch": 4.41958041958042, "grad_norm": 0.462768018245697, "learning_rate": 3.164040085872755e-07, "loss": 0.4125, "step": 318 }, { "epoch": 4.433566433566433, "grad_norm": 0.4575034976005554, "learning_rate": 2.997028748796016e-07, "loss": 0.4138, "step": 319 }, { "epoch": 4.4475524475524475, "grad_norm": 0.43728622794151306, "learning_rate": 2.834409337112842e-07, "loss": 0.4133, "step": 320 }, { "epoch": 4.461538461538462, "grad_norm": 0.4533195495605469, "learning_rate": 2.676197045869511e-07, "loss": 0.4067, "step": 321 }, { "epoch": 4.475524475524476, "grad_norm": 0.44842609763145447, "learning_rate": 2.522406658313997e-07, "loss": 0.4042, "step": 322 }, { "epoch": 4.489510489510489, "grad_norm": 0.4315699636936188, "learning_rate": 2.3730525445146146e-07, "loss": 0.3969, "step": 323 }, { "epoch": 4.503496503496503, "grad_norm": 0.43630900979042053, "learning_rate": 2.2281486600173207e-07, "loss": 0.3907, "step": 324 }, { "epoch": 4.503496503496503, "eval_loss": 0.6088654398918152, "eval_runtime": 35.0812, "eval_samples_per_second": 18.3, "eval_steps_per_second": 2.309, "step": 324 }, { "epoch": 4.5174825174825175, "grad_norm": 0.43661531805992126, "learning_rate": 2.0877085445416889e-07, "loss": 0.4079, "step": 325 }, { "epoch": 4.531468531468532, "grad_norm": 0.43984201550483704, "learning_rate": 1.9517453207157865e-07, "loss": 0.4071, "step": 326 }, { "epoch": 4.545454545454545, "grad_norm": 0.43304693698883057, "learning_rate": 1.8202716928499842e-07, "loss": 0.4, "step": 327 }, { "epoch": 4.559440559440559, "grad_norm": 0.44190627336502075, "learning_rate": 1.6932999457498823e-07, "loss": 0.3936, "step": 328 }, { "epoch": 4.573426573426573, "grad_norm": 0.46403783559799194, "learning_rate": 1.5708419435684463e-07, "loss": 0.4142, "step": 329 }, { "epoch": 4.5874125874125875, "grad_norm": 0.448397159576416, "learning_rate": 1.4529091286973994e-07, "loss": 0.411, "step": 330 }, { "epoch": 4.601398601398602, "grad_norm": 0.4263162910938263, "learning_rate": 1.3395125206980774e-07, "loss": 0.3991, "step": 331 }, { "epoch": 4.615384615384615, "grad_norm": 0.4367568790912628, "learning_rate": 1.230662715271741e-07, "loss": 0.4144, "step": 332 }, { "epoch": 4.629370629370629, "grad_norm": 0.4405047297477722, "learning_rate": 1.1263698832695513e-07, "loss": 0.3935, "step": 333 }, { "epoch": 4.643356643356643, "grad_norm": 0.4359452426433563, "learning_rate": 1.0266437697422026e-07, "loss": 0.3913, "step": 334 }, { "epoch": 4.6573426573426575, "grad_norm": 0.44500768184661865, "learning_rate": 9.314936930293283e-08, "loss": 0.4102, "step": 335 }, { "epoch": 4.671328671328672, "grad_norm": 0.46006131172180176, "learning_rate": 8.40928543888836e-08, "loss": 0.4138, "step": 336 }, { "epoch": 4.685314685314685, "grad_norm": 0.44435447454452515, "learning_rate": 7.549567846661388e-08, "loss": 0.4185, "step": 337 }, { "epoch": 4.699300699300699, "grad_norm": 0.43049922585487366, "learning_rate": 6.735864485034493e-08, "loss": 0.3946, "step": 338 }, { "epoch": 4.713286713286713, "grad_norm": 0.4270278513431549, "learning_rate": 5.968251385891744e-08, "loss": 0.3969, "step": 339 }, { "epoch": 4.7272727272727275, "grad_norm": 0.4480164647102356, "learning_rate": 5.246800274474439e-08, "loss": 0.4005, "step": 340 }, { "epoch": 4.741258741258742, "grad_norm": 0.4490266740322113, "learning_rate": 4.571578562679757e-08, "loss": 0.3884, "step": 341 }, { "epoch": 4.755244755244755, "grad_norm": 0.4623181223869324, "learning_rate": 3.9426493427611177e-08, "loss": 0.4169, "step": 342 }, { "epoch": 4.755244755244755, "eval_loss": 0.6084015965461731, "eval_runtime": 34.879, "eval_samples_per_second": 18.406, "eval_steps_per_second": 2.322, "step": 342 }, { "epoch": 4.769230769230769, "grad_norm": 0.4283956289291382, "learning_rate": 3.360071381433516e-08, "loss": 0.3969, "step": 343 }, { "epoch": 4.783216783216783, "grad_norm": 0.4356008470058441, "learning_rate": 2.823899114382078e-08, "loss": 0.4027, "step": 344 }, { "epoch": 4.7972027972027975, "grad_norm": 0.44547533988952637, "learning_rate": 2.3341826411756863e-08, "loss": 0.3987, "step": 345 }, { "epoch": 4.811188811188811, "grad_norm": 0.4299108386039734, "learning_rate": 1.8909677205856682e-08, "loss": 0.4017, "step": 346 }, { "epoch": 4.825174825174825, "grad_norm": 0.4200840890407562, "learning_rate": 1.494295766310161e-08, "loss": 0.3885, "step": 347 }, { "epoch": 4.839160839160839, "grad_norm": 0.43688181042671204, "learning_rate": 1.1442038431044856e-08, "loss": 0.4119, "step": 348 }, { "epoch": 4.853146853146853, "grad_norm": 0.4302099943161011, "learning_rate": 8.407246633178601e-09, "loss": 0.3843, "step": 349 }, { "epoch": 4.867132867132867, "grad_norm": 0.45412999391555786, "learning_rate": 5.838865838366792e-09, "loss": 0.4009, "step": 350 }, { "epoch": 4.881118881118881, "grad_norm": 0.43274399638175964, "learning_rate": 3.737136034349109e-09, "loss": 0.3951, "step": 351 }, { "epoch": 4.895104895104895, "grad_norm": 0.4244266450405121, "learning_rate": 2.102253605316684e-09, "loss": 0.4059, "step": 352 }, { "epoch": 4.909090909090909, "grad_norm": 0.4323265552520752, "learning_rate": 9.343713135623323e-10, "loss": 0.3963, "step": 353 }, { "epoch": 4.923076923076923, "grad_norm": 0.4487632215023041, "learning_rate": 2.335982852064156e-10, "loss": 0.3937, "step": 354 }, { "epoch": 4.937062937062937, "grad_norm": 0.4363052546977997, "learning_rate": 0.0, "loss": 0.405, "step": 355 } ], "logging_steps": 1, "max_steps": 355, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 36, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.28345287429718e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }