|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.937062937062937, |
|
"eval_steps": 18, |
|
"global_step": 355, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013986013986013986, |
|
"grad_norm": 6.746792793273926, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 0.8294, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013986013986013986, |
|
"eval_loss": 0.8744672536849976, |
|
"eval_runtime": 36.967, |
|
"eval_samples_per_second": 17.367, |
|
"eval_steps_per_second": 2.191, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.027972027972027972, |
|
"grad_norm": 6.9825944900512695, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.8694, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04195804195804196, |
|
"grad_norm": 7.01480770111084, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.861, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.055944055944055944, |
|
"grad_norm": 7.156968593597412, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.9027, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.06993006993006994, |
|
"grad_norm": 6.0878005027771, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.8577, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.08391608391608392, |
|
"grad_norm": 5.853216648101807, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.8168, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0979020979020979, |
|
"grad_norm": 4.9973978996276855, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 0.788, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.11188811188811189, |
|
"grad_norm": 4.611128330230713, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.7959, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1258741258741259, |
|
"grad_norm": 3.1312103271484375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.7374, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.13986013986013987, |
|
"grad_norm": 2.9217381477355957, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.7329, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 2.5225424766540527, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 0.6905, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.16783216783216784, |
|
"grad_norm": 2.8658440113067627, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.702, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 2.6459388732910156, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 0.6659, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.1958041958041958, |
|
"grad_norm": 2.4082329273223877, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.6732, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.2097902097902098, |
|
"grad_norm": 1.8969792127609253, |
|
"learning_rate": 5e-06, |
|
"loss": 0.626, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.22377622377622378, |
|
"grad_norm": 1.705984354019165, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 0.6357, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.23776223776223776, |
|
"grad_norm": 1.5265748500823975, |
|
"learning_rate": 5.666666666666667e-06, |
|
"loss": 0.6409, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.2517482517482518, |
|
"grad_norm": 1.3590223789215088, |
|
"learning_rate": 6e-06, |
|
"loss": 0.6128, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2517482517482518, |
|
"eval_loss": 0.6171885132789612, |
|
"eval_runtime": 35.4252, |
|
"eval_samples_per_second": 18.123, |
|
"eval_steps_per_second": 2.287, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.26573426573426573, |
|
"grad_norm": 1.3791933059692383, |
|
"learning_rate": 6.333333333333333e-06, |
|
"loss": 0.6181, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.27972027972027974, |
|
"grad_norm": 1.398863434791565, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.593, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2937062937062937, |
|
"grad_norm": 1.1556097269058228, |
|
"learning_rate": 7e-06, |
|
"loss": 0.6274, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 1.094146728515625, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 0.6113, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.32167832167832167, |
|
"grad_norm": 1.2191824913024902, |
|
"learning_rate": 7.666666666666667e-06, |
|
"loss": 0.6111, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.3356643356643357, |
|
"grad_norm": 0.9371815323829651, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5895, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.34965034965034963, |
|
"grad_norm": 0.8173602223396301, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.6083, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 1.0984693765640259, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 0.6072, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.3776223776223776, |
|
"grad_norm": 1.0279648303985596, |
|
"learning_rate": 9e-06, |
|
"loss": 0.6001, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.3916083916083916, |
|
"grad_norm": 0.9129611253738403, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.5644, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.40559440559440557, |
|
"grad_norm": 0.832744300365448, |
|
"learning_rate": 9.666666666666667e-06, |
|
"loss": 0.5716, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.4195804195804196, |
|
"grad_norm": 0.8230701684951782, |
|
"learning_rate": 1e-05, |
|
"loss": 0.59, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.43356643356643354, |
|
"grad_norm": 0.8343638181686401, |
|
"learning_rate": 9.999766401714795e-06, |
|
"loss": 0.5876, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.44755244755244755, |
|
"grad_norm": 0.7421298623085022, |
|
"learning_rate": 9.999065628686439e-06, |
|
"loss": 0.5959, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 0.7471378445625305, |
|
"learning_rate": 9.997897746394684e-06, |
|
"loss": 0.5804, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.4755244755244755, |
|
"grad_norm": 0.8300222754478455, |
|
"learning_rate": 9.996262863965651e-06, |
|
"loss": 0.5726, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.48951048951048953, |
|
"grad_norm": 0.7753379940986633, |
|
"learning_rate": 9.994161134161635e-06, |
|
"loss": 0.6034, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5034965034965035, |
|
"grad_norm": 0.8331146240234375, |
|
"learning_rate": 9.991592753366822e-06, |
|
"loss": 0.5953, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5034965034965035, |
|
"eval_loss": 0.5805296897888184, |
|
"eval_runtime": 35.0435, |
|
"eval_samples_per_second": 18.32, |
|
"eval_steps_per_second": 2.311, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5174825174825175, |
|
"grad_norm": 0.7212592959403992, |
|
"learning_rate": 9.988557961568956e-06, |
|
"loss": 0.5639, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5314685314685315, |
|
"grad_norm": 0.796295166015625, |
|
"learning_rate": 9.985057042336898e-06, |
|
"loss": 0.5771, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 0.8607219457626343, |
|
"learning_rate": 9.981090322794145e-06, |
|
"loss": 0.5763, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5594405594405595, |
|
"grad_norm": 0.861869215965271, |
|
"learning_rate": 9.976658173588244e-06, |
|
"loss": 0.5729, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5734265734265734, |
|
"grad_norm": 0.7538414597511292, |
|
"learning_rate": 9.97176100885618e-06, |
|
"loss": 0.571, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.5874125874125874, |
|
"grad_norm": 0.7197255492210388, |
|
"learning_rate": 9.966399286185666e-06, |
|
"loss": 0.5421, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6013986013986014, |
|
"grad_norm": 0.7522373199462891, |
|
"learning_rate": 9.960573506572391e-06, |
|
"loss": 0.5603, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.8054993152618408, |
|
"learning_rate": 9.954284214373204e-06, |
|
"loss": 0.5723, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6293706293706294, |
|
"grad_norm": 0.639057457447052, |
|
"learning_rate": 9.947531997255256e-06, |
|
"loss": 0.5483, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6433566433566433, |
|
"grad_norm": 0.6742891073226929, |
|
"learning_rate": 9.940317486141084e-06, |
|
"loss": 0.5845, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.6573426573426573, |
|
"grad_norm": 0.6605424880981445, |
|
"learning_rate": 9.932641355149655e-06, |
|
"loss": 0.5639, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.6713286713286714, |
|
"grad_norm": 0.7080878019332886, |
|
"learning_rate": 9.924504321533387e-06, |
|
"loss": 0.5851, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.6853146853146853, |
|
"grad_norm": 0.6235523223876953, |
|
"learning_rate": 9.915907145611117e-06, |
|
"loss": 0.574, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.6993006993006993, |
|
"grad_norm": 0.6567375063896179, |
|
"learning_rate": 9.906850630697068e-06, |
|
"loss": 0.5705, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7132867132867133, |
|
"grad_norm": 0.6011090278625488, |
|
"learning_rate": 9.89733562302578e-06, |
|
"loss": 0.574, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.6043576002120972, |
|
"learning_rate": 9.887363011673046e-06, |
|
"loss": 0.5849, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.7412587412587412, |
|
"grad_norm": 0.7147118449211121, |
|
"learning_rate": 9.876933728472826e-06, |
|
"loss": 0.5584, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.7552447552447552, |
|
"grad_norm": 0.6480064392089844, |
|
"learning_rate": 9.866048747930194e-06, |
|
"loss": 0.5494, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.7552447552447552, |
|
"eval_loss": 0.5708758234977722, |
|
"eval_runtime": 34.9921, |
|
"eval_samples_per_second": 18.347, |
|
"eval_steps_per_second": 2.315, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.6563164591789246, |
|
"learning_rate": 9.854709087130261e-06, |
|
"loss": 0.5491, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.7832167832167832, |
|
"grad_norm": 0.6024691462516785, |
|
"learning_rate": 9.842915805643156e-06, |
|
"loss": 0.5589, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.7972027972027972, |
|
"grad_norm": 0.6186073422431946, |
|
"learning_rate": 9.830670005425012e-06, |
|
"loss": 0.5567, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8111888111888111, |
|
"grad_norm": 0.6993715763092041, |
|
"learning_rate": 9.817972830715003e-06, |
|
"loss": 0.5534, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.8251748251748252, |
|
"grad_norm": 0.6327122449874878, |
|
"learning_rate": 9.804825467928423e-06, |
|
"loss": 0.5709, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.8391608391608392, |
|
"grad_norm": 0.6156756281852722, |
|
"learning_rate": 9.791229145545832e-06, |
|
"loss": 0.5445, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8531468531468531, |
|
"grad_norm": 0.7704036235809326, |
|
"learning_rate": 9.777185133998268e-06, |
|
"loss": 0.5743, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.8671328671328671, |
|
"grad_norm": 0.5839553475379944, |
|
"learning_rate": 9.76269474554854e-06, |
|
"loss": 0.5536, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.8811188811188811, |
|
"grad_norm": 0.6872385144233704, |
|
"learning_rate": 9.747759334168602e-06, |
|
"loss": 0.5627, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.8951048951048951, |
|
"grad_norm": 0.663074791431427, |
|
"learning_rate": 9.73238029541305e-06, |
|
"loss": 0.5643, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.7018933296203613, |
|
"learning_rate": 9.716559066288716e-06, |
|
"loss": 0.5729, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.7574678659439087, |
|
"learning_rate": 9.7002971251204e-06, |
|
"loss": 0.5813, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.9370629370629371, |
|
"grad_norm": 0.6293357014656067, |
|
"learning_rate": 9.683595991412725e-06, |
|
"loss": 0.5819, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.951048951048951, |
|
"grad_norm": 0.6524381041526794, |
|
"learning_rate": 9.666457225708175e-06, |
|
"loss": 0.5856, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.965034965034965, |
|
"grad_norm": 0.8389201164245605, |
|
"learning_rate": 9.648882429441258e-06, |
|
"loss": 0.5587, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.9790209790209791, |
|
"grad_norm": 0.6339119672775269, |
|
"learning_rate": 9.630873244788884e-06, |
|
"loss": 0.5655, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.993006993006993, |
|
"grad_norm": 0.6689181923866272, |
|
"learning_rate": 9.612431354516912e-06, |
|
"loss": 0.574, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.7970519661903381, |
|
"learning_rate": 9.593558481822923e-06, |
|
"loss": 0.5541, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.5664608478546143, |
|
"eval_runtime": 34.9634, |
|
"eval_samples_per_second": 18.362, |
|
"eval_steps_per_second": 2.317, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.013986013986014, |
|
"grad_norm": 0.6805382370948792, |
|
"learning_rate": 9.574256390175192e-06, |
|
"loss": 0.5175, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.027972027972028, |
|
"grad_norm": 0.6378044486045837, |
|
"learning_rate": 9.554526883147926e-06, |
|
"loss": 0.5323, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.0419580419580419, |
|
"grad_norm": 0.6296578645706177, |
|
"learning_rate": 9.534371804252727e-06, |
|
"loss": 0.5197, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.055944055944056, |
|
"grad_norm": 0.6116400361061096, |
|
"learning_rate": 9.513793036766345e-06, |
|
"loss": 0.504, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.06993006993007, |
|
"grad_norm": 0.6288114190101624, |
|
"learning_rate": 9.492792503554695e-06, |
|
"loss": 0.5314, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.083916083916084, |
|
"grad_norm": 0.6576322913169861, |
|
"learning_rate": 9.4713721668932e-06, |
|
"loss": 0.5437, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.097902097902098, |
|
"grad_norm": 0.5930177569389343, |
|
"learning_rate": 9.44953402828342e-06, |
|
"loss": 0.5213, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.1118881118881119, |
|
"grad_norm": 0.7437406778335571, |
|
"learning_rate": 9.427280128266049e-06, |
|
"loss": 0.5441, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.1258741258741258, |
|
"grad_norm": 0.7347025275230408, |
|
"learning_rate": 9.404612546230244e-06, |
|
"loss": 0.5078, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.1398601398601398, |
|
"grad_norm": 0.6133800148963928, |
|
"learning_rate": 9.381533400219319e-06, |
|
"loss": 0.5129, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 0.8068645000457764, |
|
"learning_rate": 9.358044846732848e-06, |
|
"loss": 0.5252, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.167832167832168, |
|
"grad_norm": 0.7470645904541016, |
|
"learning_rate": 9.334149080525154e-06, |
|
"loss": 0.5251, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.1818181818181819, |
|
"grad_norm": 0.6085983514785767, |
|
"learning_rate": 9.309848334400247e-06, |
|
"loss": 0.5119, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.1958041958041958, |
|
"grad_norm": 0.6427562236785889, |
|
"learning_rate": 9.285144879003173e-06, |
|
"loss": 0.5327, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.2097902097902098, |
|
"grad_norm": 0.5992908477783203, |
|
"learning_rate": 9.26004102260786e-06, |
|
"loss": 0.5174, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.2237762237762237, |
|
"grad_norm": 0.6650605201721191, |
|
"learning_rate": 9.23453911090143e-06, |
|
"loss": 0.541, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.2377622377622377, |
|
"grad_norm": 0.6733765602111816, |
|
"learning_rate": 9.208641526765024e-06, |
|
"loss": 0.4968, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.2517482517482517, |
|
"grad_norm": 0.5896586775779724, |
|
"learning_rate": 9.182350690051134e-06, |
|
"loss": 0.5111, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.2517482517482517, |
|
"eval_loss": 0.5681217312812805, |
|
"eval_runtime": 34.9547, |
|
"eval_samples_per_second": 18.367, |
|
"eval_steps_per_second": 2.317, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.2657342657342658, |
|
"grad_norm": 0.5879291892051697, |
|
"learning_rate": 9.155669057357515e-06, |
|
"loss": 0.5124, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.2797202797202798, |
|
"grad_norm": 0.6704349517822266, |
|
"learning_rate": 9.12859912179762e-06, |
|
"loss": 0.5264, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.2937062937062938, |
|
"grad_norm": 0.7005125284194946, |
|
"learning_rate": 9.101143412767665e-06, |
|
"loss": 0.5426, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 0.5738447904586792, |
|
"learning_rate": 9.073304495710267e-06, |
|
"loss": 0.5057, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.3216783216783217, |
|
"grad_norm": 0.6039765477180481, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.5106, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.3356643356643356, |
|
"grad_norm": 0.6626608967781067, |
|
"learning_rate": 9.016487478074032e-06, |
|
"loss": 0.5231, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.3496503496503496, |
|
"grad_norm": 0.607319176197052, |
|
"learning_rate": 8.987514686438353e-06, |
|
"loss": 0.5373, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.6294829249382019, |
|
"learning_rate": 8.95816930416548e-06, |
|
"loss": 0.5478, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.3776223776223775, |
|
"grad_norm": 0.5931101441383362, |
|
"learning_rate": 8.928454073267801e-06, |
|
"loss": 0.5183, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.3916083916083917, |
|
"grad_norm": 0.5525672435760498, |
|
"learning_rate": 8.898371770316113e-06, |
|
"loss": 0.5049, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.4055944055944056, |
|
"grad_norm": 0.5554185509681702, |
|
"learning_rate": 8.867925206180166e-06, |
|
"loss": 0.5329, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.4195804195804196, |
|
"grad_norm": 0.6104192137718201, |
|
"learning_rate": 8.837117225766033e-06, |
|
"loss": 0.5421, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.4335664335664335, |
|
"grad_norm": 0.5591093897819519, |
|
"learning_rate": 8.805950707750268e-06, |
|
"loss": 0.5434, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.4475524475524475, |
|
"grad_norm": 0.5589428544044495, |
|
"learning_rate": 8.774428564310939e-06, |
|
"loss": 0.5159, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"grad_norm": 0.580699622631073, |
|
"learning_rate": 8.742553740855507e-06, |
|
"loss": 0.5143, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.4755244755244754, |
|
"grad_norm": 0.6007757186889648, |
|
"learning_rate": 8.710329215745612e-06, |
|
"loss": 0.5066, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.4895104895104896, |
|
"grad_norm": 0.6713395118713379, |
|
"learning_rate": 8.677758000018777e-06, |
|
"loss": 0.5318, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.5034965034965035, |
|
"grad_norm": 0.5536379814147949, |
|
"learning_rate": 8.644843137107058e-06, |
|
"loss": 0.5159, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.5034965034965035, |
|
"eval_loss": 0.5661691427230835, |
|
"eval_runtime": 35.3668, |
|
"eval_samples_per_second": 18.153, |
|
"eval_steps_per_second": 2.29, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.5174825174825175, |
|
"grad_norm": 0.645210325717926, |
|
"learning_rate": 8.61158770255267e-06, |
|
"loss": 0.5312, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.5314685314685315, |
|
"grad_norm": 0.601094126701355, |
|
"learning_rate": 8.577994803720605e-06, |
|
"loss": 0.5394, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.5454545454545454, |
|
"grad_norm": 0.5418203473091125, |
|
"learning_rate": 8.544067579508292e-06, |
|
"loss": 0.5264, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.5594405594405596, |
|
"grad_norm": 0.5513077974319458, |
|
"learning_rate": 8.509809200052286e-06, |
|
"loss": 0.5269, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.5734265734265733, |
|
"grad_norm": 0.6063372492790222, |
|
"learning_rate": 8.475222866432065e-06, |
|
"loss": 0.5199, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.5874125874125875, |
|
"grad_norm": 0.5637122988700867, |
|
"learning_rate": 8.440311810370921e-06, |
|
"loss": 0.5342, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.6013986013986012, |
|
"grad_norm": 0.5762498378753662, |
|
"learning_rate": 8.405079293933986e-06, |
|
"loss": 0.5419, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.6153846153846154, |
|
"grad_norm": 0.557772159576416, |
|
"learning_rate": 8.36952860922343e-06, |
|
"loss": 0.5217, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.6293706293706294, |
|
"grad_norm": 0.6382875442504883, |
|
"learning_rate": 8.333663078070845e-06, |
|
"loss": 0.5366, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.6433566433566433, |
|
"grad_norm": 0.5209150910377502, |
|
"learning_rate": 8.297486051726864e-06, |
|
"loss": 0.5087, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.6573426573426573, |
|
"grad_norm": 0.5415475964546204, |
|
"learning_rate": 8.26100091054801e-06, |
|
"loss": 0.5026, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.6713286713286712, |
|
"grad_norm": 0.6667906641960144, |
|
"learning_rate": 8.224211063680854e-06, |
|
"loss": 0.5224, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.6853146853146854, |
|
"grad_norm": 0.573965311050415, |
|
"learning_rate": 8.18711994874345e-06, |
|
"loss": 0.538, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.6993006993006992, |
|
"grad_norm": 0.6206014156341553, |
|
"learning_rate": 8.149731031504136e-06, |
|
"loss": 0.5161, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.7132867132867133, |
|
"grad_norm": 0.6324427127838135, |
|
"learning_rate": 8.112047805557693e-06, |
|
"loss": 0.5407, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.7272727272727273, |
|
"grad_norm": 0.5460613965988159, |
|
"learning_rate": 8.074073791998907e-06, |
|
"loss": 0.5238, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.7412587412587412, |
|
"grad_norm": 0.5684161186218262, |
|
"learning_rate": 8.035812539093557e-06, |
|
"loss": 0.5166, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.7552447552447552, |
|
"grad_norm": 0.6114190816879272, |
|
"learning_rate": 7.997267621946871e-06, |
|
"loss": 0.5212, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.7552447552447552, |
|
"eval_loss": 0.5644441843032837, |
|
"eval_runtime": 34.8941, |
|
"eval_samples_per_second": 18.399, |
|
"eval_steps_per_second": 2.321, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.7692307692307692, |
|
"grad_norm": 0.5791452527046204, |
|
"learning_rate": 7.958442642169469e-06, |
|
"loss": 0.5219, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.7832167832167833, |
|
"grad_norm": 0.5814895033836365, |
|
"learning_rate": 7.919341227540828e-06, |
|
"loss": 0.5492, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.797202797202797, |
|
"grad_norm": 0.5562170147895813, |
|
"learning_rate": 7.879967031670313e-06, |
|
"loss": 0.5065, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.8111888111888113, |
|
"grad_norm": 0.5666476488113403, |
|
"learning_rate": 7.84032373365578e-06, |
|
"loss": 0.508, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.8251748251748252, |
|
"grad_norm": 0.6123917102813721, |
|
"learning_rate": 7.800415037739802e-06, |
|
"loss": 0.5245, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.8391608391608392, |
|
"grad_norm": 0.6137180924415588, |
|
"learning_rate": 7.760244672963548e-06, |
|
"loss": 0.5281, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.8531468531468531, |
|
"grad_norm": 0.5444206595420837, |
|
"learning_rate": 7.719816392818354e-06, |
|
"loss": 0.496, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.867132867132867, |
|
"grad_norm": 0.5935954451560974, |
|
"learning_rate": 7.679133974894984e-06, |
|
"loss": 0.5164, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.8811188811188813, |
|
"grad_norm": 0.568263828754425, |
|
"learning_rate": 7.638201220530664e-06, |
|
"loss": 0.509, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.895104895104895, |
|
"grad_norm": 0.641503095626831, |
|
"learning_rate": 7.597021954453887e-06, |
|
"loss": 0.5389, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.9090909090909092, |
|
"grad_norm": 0.5866712927818298, |
|
"learning_rate": 7.555600024427028e-06, |
|
"loss": 0.5163, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 0.559259831905365, |
|
"learning_rate": 7.513939300886816e-06, |
|
"loss": 0.5074, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.937062937062937, |
|
"grad_norm": 0.5635555386543274, |
|
"learning_rate": 7.472043676582685e-06, |
|
"loss": 0.5184, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.951048951048951, |
|
"grad_norm": 0.6236100196838379, |
|
"learning_rate": 7.42991706621303e-06, |
|
"loss": 0.5162, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.965034965034965, |
|
"grad_norm": 0.60297691822052, |
|
"learning_rate": 7.387563406059433e-06, |
|
"loss": 0.5123, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.9790209790209792, |
|
"grad_norm": 0.5734803080558777, |
|
"learning_rate": 7.344986653618844e-06, |
|
"loss": 0.5281, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.993006993006993, |
|
"grad_norm": 0.561177134513855, |
|
"learning_rate": 7.302190787233808e-06, |
|
"loss": 0.5256, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.6918484568595886, |
|
"learning_rate": 7.259179805720726e-06, |
|
"loss": 0.4956, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.5634886622428894, |
|
"eval_runtime": 34.1505, |
|
"eval_samples_per_second": 18.799, |
|
"eval_steps_per_second": 2.372, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.013986013986014, |
|
"grad_norm": 0.6467083096504211, |
|
"learning_rate": 7.215957727996208e-06, |
|
"loss": 0.4757, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.027972027972028, |
|
"grad_norm": 0.628153920173645, |
|
"learning_rate": 7.17252859270155e-06, |
|
"loss": 0.4701, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.041958041958042, |
|
"grad_norm": 0.6287585496902466, |
|
"learning_rate": 7.128896457825364e-06, |
|
"loss": 0.4334, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.055944055944056, |
|
"grad_norm": 0.5704949498176575, |
|
"learning_rate": 7.085065400324407e-06, |
|
"loss": 0.4723, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.06993006993007, |
|
"grad_norm": 0.6293634176254272, |
|
"learning_rate": 7.041039515742626e-06, |
|
"loss": 0.4875, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 2.0839160839160837, |
|
"grad_norm": 0.7220337390899658, |
|
"learning_rate": 6.9968229178284775e-06, |
|
"loss": 0.4809, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.097902097902098, |
|
"grad_norm": 0.5713090896606445, |
|
"learning_rate": 6.952419738150546e-06, |
|
"loss": 0.4998, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.111888111888112, |
|
"grad_norm": 0.6713567972183228, |
|
"learning_rate": 6.9078341257114765e-06, |
|
"loss": 0.4837, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.125874125874126, |
|
"grad_norm": 0.6542858481407166, |
|
"learning_rate": 6.863070246560319e-06, |
|
"loss": 0.4798, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.13986013986014, |
|
"grad_norm": 0.5555688738822937, |
|
"learning_rate": 6.818132283403236e-06, |
|
"loss": 0.4593, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.5947204232215881, |
|
"learning_rate": 6.773024435212678e-06, |
|
"loss": 0.4831, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.167832167832168, |
|
"grad_norm": 0.6230157613754272, |
|
"learning_rate": 6.7277509168350445e-06, |
|
"loss": 0.4634, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.1818181818181817, |
|
"grad_norm": 0.5586286783218384, |
|
"learning_rate": 6.6823159585968355e-06, |
|
"loss": 0.4803, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.195804195804196, |
|
"grad_norm": 0.5558333396911621, |
|
"learning_rate": 6.636723805909384e-06, |
|
"loss": 0.4734, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.20979020979021, |
|
"grad_norm": 0.5960513949394226, |
|
"learning_rate": 6.590978718872166e-06, |
|
"loss": 0.4746, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.2237762237762237, |
|
"grad_norm": 0.5779184103012085, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.4499, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.237762237762238, |
|
"grad_norm": 0.5827864408493042, |
|
"learning_rate": 6.499046853197338e-06, |
|
"loss": 0.4826, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.2517482517482517, |
|
"grad_norm": 0.6769295930862427, |
|
"learning_rate": 6.452868664610197e-06, |
|
"loss": 0.4797, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.2517482517482517, |
|
"eval_loss": 0.5764052271842957, |
|
"eval_runtime": 34.051, |
|
"eval_samples_per_second": 18.854, |
|
"eval_steps_per_second": 2.379, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.265734265734266, |
|
"grad_norm": 0.5850751996040344, |
|
"learning_rate": 6.406554720971583e-06, |
|
"loss": 0.4829, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.2797202797202796, |
|
"grad_norm": 0.5925103425979614, |
|
"learning_rate": 6.3601093498246215e-06, |
|
"loss": 0.4936, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.2937062937062938, |
|
"grad_norm": 0.5747277140617371, |
|
"learning_rate": 6.313536890992935e-06, |
|
"loss": 0.4686, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 0.6141413450241089, |
|
"learning_rate": 6.266841696175132e-06, |
|
"loss": 0.4659, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.3216783216783217, |
|
"grad_norm": 0.5214844942092896, |
|
"learning_rate": 6.220028128538188e-06, |
|
"loss": 0.4714, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.335664335664336, |
|
"grad_norm": 0.6260507106781006, |
|
"learning_rate": 6.173100562309751e-06, |
|
"loss": 0.4731, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.3496503496503496, |
|
"grad_norm": 0.6246528625488281, |
|
"learning_rate": 6.1260633823694224e-06, |
|
"loss": 0.4575, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.3636363636363638, |
|
"grad_norm": 0.5592030882835388, |
|
"learning_rate": 6.078920983839032e-06, |
|
"loss": 0.4293, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.3776223776223775, |
|
"grad_norm": 0.5436908602714539, |
|
"learning_rate": 6.031677771671962e-06, |
|
"loss": 0.4821, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.3916083916083917, |
|
"grad_norm": 0.5873638987541199, |
|
"learning_rate": 5.984338160241552e-06, |
|
"loss": 0.4755, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.4055944055944054, |
|
"grad_norm": 0.6056978106498718, |
|
"learning_rate": 5.936906572928625e-06, |
|
"loss": 0.479, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.4195804195804196, |
|
"grad_norm": 0.5452414751052856, |
|
"learning_rate": 5.889387441708162e-06, |
|
"loss": 0.4545, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.4335664335664333, |
|
"grad_norm": 0.5708940625190735, |
|
"learning_rate": 5.841785206735192e-06, |
|
"loss": 0.4706, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.4475524475524475, |
|
"grad_norm": 0.5819888114929199, |
|
"learning_rate": 5.794104315929904e-06, |
|
"loss": 0.4608, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.5468575358390808, |
|
"learning_rate": 5.746349224562021e-06, |
|
"loss": 0.4696, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.4755244755244754, |
|
"grad_norm": 0.6171605587005615, |
|
"learning_rate": 5.698524394834531e-06, |
|
"loss": 0.4809, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.4895104895104896, |
|
"grad_norm": 0.6046556234359741, |
|
"learning_rate": 5.650634295466717e-06, |
|
"loss": 0.4727, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.5034965034965033, |
|
"grad_norm": 0.5517058968544006, |
|
"learning_rate": 5.6026834012766155e-06, |
|
"loss": 0.4728, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.5034965034965033, |
|
"eval_loss": 0.5757314562797546, |
|
"eval_runtime": 34.5495, |
|
"eval_samples_per_second": 18.582, |
|
"eval_steps_per_second": 2.344, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.5174825174825175, |
|
"grad_norm": 0.5916588306427002, |
|
"learning_rate": 5.554676192762891e-06, |
|
"loss": 0.4738, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.5314685314685317, |
|
"grad_norm": 0.596782386302948, |
|
"learning_rate": 5.506617155686177e-06, |
|
"loss": 0.4725, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 0.5784814357757568, |
|
"learning_rate": 5.458510780649932e-06, |
|
"loss": 0.4743, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.5594405594405596, |
|
"grad_norm": 0.5162186622619629, |
|
"learning_rate": 5.4103615626808426e-06, |
|
"loss": 0.4501, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.5734265734265733, |
|
"grad_norm": 0.5629183053970337, |
|
"learning_rate": 5.362174000808813e-06, |
|
"loss": 0.4631, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.5874125874125875, |
|
"grad_norm": 0.5455092191696167, |
|
"learning_rate": 5.3139525976465675e-06, |
|
"loss": 0.4839, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.6013986013986012, |
|
"grad_norm": 0.6234388947486877, |
|
"learning_rate": 5.265701858968944e-06, |
|
"loss": 0.4729, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 0.5270193815231323, |
|
"learning_rate": 5.217426293291869e-06, |
|
"loss": 0.4767, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 2.629370629370629, |
|
"grad_norm": 0.5291939973831177, |
|
"learning_rate": 5.169130411451083e-06, |
|
"loss": 0.4659, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.6433566433566433, |
|
"grad_norm": 0.5210967063903809, |
|
"learning_rate": 5.120818726180662e-06, |
|
"loss": 0.4532, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.6573426573426575, |
|
"grad_norm": 0.5697853565216064, |
|
"learning_rate": 5.072495751691338e-06, |
|
"loss": 0.4669, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 2.6713286713286712, |
|
"grad_norm": 0.4967118203639984, |
|
"learning_rate": 5.024166003248703e-06, |
|
"loss": 0.4777, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.6853146853146854, |
|
"grad_norm": 0.5514243245124817, |
|
"learning_rate": 4.9758339967512995e-06, |
|
"loss": 0.4689, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.699300699300699, |
|
"grad_norm": 0.5476483702659607, |
|
"learning_rate": 4.927504248308663e-06, |
|
"loss": 0.4898, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.7132867132867133, |
|
"grad_norm": 0.5073778033256531, |
|
"learning_rate": 4.87918127381934e-06, |
|
"loss": 0.4462, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.5061259865760803, |
|
"learning_rate": 4.830869588548918e-06, |
|
"loss": 0.4811, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.7412587412587412, |
|
"grad_norm": 0.532632052898407, |
|
"learning_rate": 4.782573706708133e-06, |
|
"loss": 0.4514, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.755244755244755, |
|
"grad_norm": 0.5079967379570007, |
|
"learning_rate": 4.734298141031057e-06, |
|
"loss": 0.4706, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.755244755244755, |
|
"eval_loss": 0.5748186111450195, |
|
"eval_runtime": 34.6547, |
|
"eval_samples_per_second": 18.526, |
|
"eval_steps_per_second": 2.337, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 0.5450592637062073, |
|
"learning_rate": 4.686047402353433e-06, |
|
"loss": 0.4717, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.7832167832167833, |
|
"grad_norm": 0.4929758906364441, |
|
"learning_rate": 4.637825999191189e-06, |
|
"loss": 0.469, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.797202797202797, |
|
"grad_norm": 0.514842689037323, |
|
"learning_rate": 4.589638437319157e-06, |
|
"loss": 0.4848, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.8111888111888113, |
|
"grad_norm": 0.5259736776351929, |
|
"learning_rate": 4.541489219350069e-06, |
|
"loss": 0.4676, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.825174825174825, |
|
"grad_norm": 0.571843683719635, |
|
"learning_rate": 4.493382844313826e-06, |
|
"loss": 0.482, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.839160839160839, |
|
"grad_norm": 0.49216270446777344, |
|
"learning_rate": 4.445323807237112e-06, |
|
"loss": 0.479, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.8531468531468533, |
|
"grad_norm": 0.5383098721504211, |
|
"learning_rate": 4.397316598723385e-06, |
|
"loss": 0.4517, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.867132867132867, |
|
"grad_norm": 0.5011985898017883, |
|
"learning_rate": 4.349365704533285e-06, |
|
"loss": 0.4678, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.8811188811188813, |
|
"grad_norm": 0.5291906595230103, |
|
"learning_rate": 4.301475605165471e-06, |
|
"loss": 0.4717, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.895104895104895, |
|
"grad_norm": 0.5500873923301697, |
|
"learning_rate": 4.25365077543798e-06, |
|
"loss": 0.4572, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.909090909090909, |
|
"grad_norm": 0.5690264105796814, |
|
"learning_rate": 4.205895684070099e-06, |
|
"loss": 0.4675, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 0.4746716022491455, |
|
"learning_rate": 4.158214793264808e-06, |
|
"loss": 0.4579, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.937062937062937, |
|
"grad_norm": 0.5113067626953125, |
|
"learning_rate": 4.1106125582918385e-06, |
|
"loss": 0.5104, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.951048951048951, |
|
"grad_norm": 0.5272907018661499, |
|
"learning_rate": 4.063093427071376e-06, |
|
"loss": 0.4532, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.965034965034965, |
|
"grad_norm": 0.5059399008750916, |
|
"learning_rate": 4.01566183975845e-06, |
|
"loss": 0.4555, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.979020979020979, |
|
"grad_norm": 0.4909096658229828, |
|
"learning_rate": 3.968322228328041e-06, |
|
"loss": 0.4785, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.993006993006993, |
|
"grad_norm": 0.5192479491233826, |
|
"learning_rate": 3.92107901616097e-06, |
|
"loss": 0.4477, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.7363195419311523, |
|
"learning_rate": 3.873936617630578e-06, |
|
"loss": 0.4927, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.5740084052085876, |
|
"eval_runtime": 34.8551, |
|
"eval_samples_per_second": 18.419, |
|
"eval_steps_per_second": 2.324, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 3.013986013986014, |
|
"grad_norm": 0.5987377762794495, |
|
"learning_rate": 3.82689943769025e-06, |
|
"loss": 0.4246, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 3.027972027972028, |
|
"grad_norm": 0.589948832988739, |
|
"learning_rate": 3.779971871461813e-06, |
|
"loss": 0.4367, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 3.041958041958042, |
|
"grad_norm": 0.5003005862236023, |
|
"learning_rate": 3.7331583038248688e-06, |
|
"loss": 0.4346, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 3.055944055944056, |
|
"grad_norm": 0.528349757194519, |
|
"learning_rate": 3.6864631090070656e-06, |
|
"loss": 0.3993, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.06993006993007, |
|
"grad_norm": 0.5285301208496094, |
|
"learning_rate": 3.639890650175379e-06, |
|
"loss": 0.419, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 3.0839160839160837, |
|
"grad_norm": 0.5721102356910706, |
|
"learning_rate": 3.593445279028418e-06, |
|
"loss": 0.4328, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 3.097902097902098, |
|
"grad_norm": 0.5271673202514648, |
|
"learning_rate": 3.5471313353898056e-06, |
|
"loss": 0.4252, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 3.111888111888112, |
|
"grad_norm": 0.5354319214820862, |
|
"learning_rate": 3.5009531468026646e-06, |
|
"loss": 0.4367, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 3.125874125874126, |
|
"grad_norm": 0.5849824547767639, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.4263, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.13986013986014, |
|
"grad_norm": 0.6300305128097534, |
|
"learning_rate": 3.409021281127835e-06, |
|
"loss": 0.4331, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 3.1538461538461537, |
|
"grad_norm": 0.5985769033432007, |
|
"learning_rate": 3.3632761940906167e-06, |
|
"loss": 0.4316, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 3.167832167832168, |
|
"grad_norm": 0.5028027296066284, |
|
"learning_rate": 3.3176840414031653e-06, |
|
"loss": 0.4243, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 0.5299258232116699, |
|
"learning_rate": 3.2722490831649568e-06, |
|
"loss": 0.4166, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 3.195804195804196, |
|
"grad_norm": 0.5425248742103577, |
|
"learning_rate": 3.226975564787322e-06, |
|
"loss": 0.4389, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.20979020979021, |
|
"grad_norm": 0.5929123759269714, |
|
"learning_rate": 3.181867716596765e-06, |
|
"loss": 0.4288, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 3.2237762237762237, |
|
"grad_norm": 0.5462735891342163, |
|
"learning_rate": 3.1369297534396823e-06, |
|
"loss": 0.4434, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 3.237762237762238, |
|
"grad_norm": 0.4862322211265564, |
|
"learning_rate": 3.092165874288525e-06, |
|
"loss": 0.4133, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 3.2517482517482517, |
|
"grad_norm": 0.48885804414749146, |
|
"learning_rate": 3.0475802618494564e-06, |
|
"loss": 0.4426, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 3.2517482517482517, |
|
"eval_loss": 0.5924859046936035, |
|
"eval_runtime": 34.7085, |
|
"eval_samples_per_second": 18.497, |
|
"eval_steps_per_second": 2.334, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 3.265734265734266, |
|
"grad_norm": 0.4652189314365387, |
|
"learning_rate": 3.0031770821715233e-06, |
|
"loss": 0.4189, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.2797202797202796, |
|
"grad_norm": 0.5389134883880615, |
|
"learning_rate": 2.9589604842573762e-06, |
|
"loss": 0.4226, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 3.2937062937062938, |
|
"grad_norm": 0.507276177406311, |
|
"learning_rate": 2.914934599675594e-06, |
|
"loss": 0.4084, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 3.3076923076923075, |
|
"grad_norm": 0.4876704216003418, |
|
"learning_rate": 2.871103542174637e-06, |
|
"loss": 0.4256, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 3.3216783216783217, |
|
"grad_norm": 0.48441073298454285, |
|
"learning_rate": 2.827471407298451e-06, |
|
"loss": 0.4297, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 3.335664335664336, |
|
"grad_norm": 0.4634881317615509, |
|
"learning_rate": 2.7840422720037943e-06, |
|
"loss": 0.4227, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.3496503496503496, |
|
"grad_norm": 0.49520549178123474, |
|
"learning_rate": 2.7408201942792755e-06, |
|
"loss": 0.414, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 3.3636363636363638, |
|
"grad_norm": 0.4892767369747162, |
|
"learning_rate": 2.697809212766195e-06, |
|
"loss": 0.4326, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 3.3776223776223775, |
|
"grad_norm": 0.4968920052051544, |
|
"learning_rate": 2.655013346381158e-06, |
|
"loss": 0.4327, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 3.3916083916083917, |
|
"grad_norm": 0.4823973476886749, |
|
"learning_rate": 2.612436593940568e-06, |
|
"loss": 0.4329, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 3.4055944055944054, |
|
"grad_norm": 0.4838135540485382, |
|
"learning_rate": 2.57008293378697e-06, |
|
"loss": 0.4206, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.4195804195804196, |
|
"grad_norm": 0.47422581911087036, |
|
"learning_rate": 2.5279563234173177e-06, |
|
"loss": 0.4336, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 3.4335664335664333, |
|
"grad_norm": 0.4846055209636688, |
|
"learning_rate": 2.4860606991131857e-06, |
|
"loss": 0.4184, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 3.4475524475524475, |
|
"grad_norm": 0.5305242538452148, |
|
"learning_rate": 2.444399975572974e-06, |
|
"loss": 0.4394, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"grad_norm": 0.487332820892334, |
|
"learning_rate": 2.402978045546114e-06, |
|
"loss": 0.4033, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 3.4755244755244754, |
|
"grad_norm": 0.4706343114376068, |
|
"learning_rate": 2.3617987794693358e-06, |
|
"loss": 0.4408, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.4895104895104896, |
|
"grad_norm": 0.503103494644165, |
|
"learning_rate": 2.320866025105016e-06, |
|
"loss": 0.4166, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 3.5034965034965033, |
|
"grad_norm": 0.5077600479125977, |
|
"learning_rate": 2.2801836071816476e-06, |
|
"loss": 0.4423, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 3.5034965034965033, |
|
"eval_loss": 0.5952551364898682, |
|
"eval_runtime": 33.5546, |
|
"eval_samples_per_second": 19.133, |
|
"eval_steps_per_second": 2.414, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 3.5174825174825175, |
|
"grad_norm": 0.48870253562927246, |
|
"learning_rate": 2.2397553270364546e-06, |
|
"loss": 0.4241, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 3.5314685314685317, |
|
"grad_norm": 0.4966093897819519, |
|
"learning_rate": 2.1995849622602017e-06, |
|
"loss": 0.4396, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 3.5454545454545454, |
|
"grad_norm": 0.4564977288246155, |
|
"learning_rate": 2.159676266344222e-06, |
|
"loss": 0.4223, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 3.5594405594405596, |
|
"grad_norm": 0.46915507316589355, |
|
"learning_rate": 2.120032968329687e-06, |
|
"loss": 0.4283, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 3.5734265734265733, |
|
"grad_norm": 0.49805694818496704, |
|
"learning_rate": 2.0806587724591725e-06, |
|
"loss": 0.4382, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 3.5874125874125875, |
|
"grad_norm": 0.48657479882240295, |
|
"learning_rate": 2.0415573578305343e-06, |
|
"loss": 0.4378, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 3.6013986013986012, |
|
"grad_norm": 0.46977299451828003, |
|
"learning_rate": 2.0027323780531312e-06, |
|
"loss": 0.4224, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 3.6153846153846154, |
|
"grad_norm": 0.49343907833099365, |
|
"learning_rate": 1.9641874609064443e-06, |
|
"loss": 0.4088, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.629370629370629, |
|
"grad_norm": 0.4801478385925293, |
|
"learning_rate": 1.9259262080010938e-06, |
|
"loss": 0.419, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 3.6433566433566433, |
|
"grad_norm": 0.4632829427719116, |
|
"learning_rate": 1.887952194442309e-06, |
|
"loss": 0.4185, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 3.6573426573426575, |
|
"grad_norm": 0.4722610414028168, |
|
"learning_rate": 1.8502689684958664e-06, |
|
"loss": 0.4223, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 3.6713286713286712, |
|
"grad_norm": 0.46521317958831787, |
|
"learning_rate": 1.8128800512565514e-06, |
|
"loss": 0.4311, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 3.6853146853146854, |
|
"grad_norm": 0.49360647797584534, |
|
"learning_rate": 1.7757889363191484e-06, |
|
"loss": 0.4336, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 3.699300699300699, |
|
"grad_norm": 0.46490150690078735, |
|
"learning_rate": 1.738999089451991e-06, |
|
"loss": 0.41, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 3.7132867132867133, |
|
"grad_norm": 0.47419989109039307, |
|
"learning_rate": 1.7025139482731385e-06, |
|
"loss": 0.4489, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 3.7272727272727275, |
|
"grad_norm": 0.4471936821937561, |
|
"learning_rate": 1.6663369219291558e-06, |
|
"loss": 0.4075, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 3.7412587412587412, |
|
"grad_norm": 0.4871998727321625, |
|
"learning_rate": 1.6304713907765713e-06, |
|
"loss": 0.4138, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 3.755244755244755, |
|
"grad_norm": 0.4558921754360199, |
|
"learning_rate": 1.5949207060660138e-06, |
|
"loss": 0.4209, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.755244755244755, |
|
"eval_loss": 0.5941651463508606, |
|
"eval_runtime": 34.8033, |
|
"eval_samples_per_second": 18.447, |
|
"eval_steps_per_second": 2.327, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.769230769230769, |
|
"grad_norm": 0.43444135785102844, |
|
"learning_rate": 1.55968818962908e-06, |
|
"loss": 0.4186, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 3.7832167832167833, |
|
"grad_norm": 0.47602659463882446, |
|
"learning_rate": 1.5247771335679372e-06, |
|
"loss": 0.4138, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 3.797202797202797, |
|
"grad_norm": 0.4794568121433258, |
|
"learning_rate": 1.4901907999477167e-06, |
|
"loss": 0.4512, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 3.8111888111888113, |
|
"grad_norm": 0.47370994091033936, |
|
"learning_rate": 1.4559324204917102e-06, |
|
"loss": 0.4446, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 3.825174825174825, |
|
"grad_norm": 0.4493069052696228, |
|
"learning_rate": 1.4220051962793952e-06, |
|
"loss": 0.4316, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 3.839160839160839, |
|
"grad_norm": 0.4439810812473297, |
|
"learning_rate": 1.3884122974473307e-06, |
|
"loss": 0.4276, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 3.8531468531468533, |
|
"grad_norm": 0.44139519333839417, |
|
"learning_rate": 1.3551568628929434e-06, |
|
"loss": 0.427, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 3.867132867132867, |
|
"grad_norm": 0.45054903626441956, |
|
"learning_rate": 1.3222419999812248e-06, |
|
"loss": 0.4356, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 3.8811188811188813, |
|
"grad_norm": 0.44140151143074036, |
|
"learning_rate": 1.2896707842543898e-06, |
|
"loss": 0.4287, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 3.895104895104895, |
|
"grad_norm": 0.4277818202972412, |
|
"learning_rate": 1.257446259144494e-06, |
|
"loss": 0.4298, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.909090909090909, |
|
"grad_norm": 0.4403057098388672, |
|
"learning_rate": 1.225571435689062e-06, |
|
"loss": 0.4185, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 3.9230769230769234, |
|
"grad_norm": 0.4724678099155426, |
|
"learning_rate": 1.1940492922497337e-06, |
|
"loss": 0.4465, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 3.937062937062937, |
|
"grad_norm": 0.47128820419311523, |
|
"learning_rate": 1.1628827742339688e-06, |
|
"loss": 0.4126, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 3.951048951048951, |
|
"grad_norm": 0.4331970512866974, |
|
"learning_rate": 1.1320747938198356e-06, |
|
"loss": 0.4105, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 3.965034965034965, |
|
"grad_norm": 0.4537077844142914, |
|
"learning_rate": 1.1016282296838887e-06, |
|
"loss": 0.4257, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 3.979020979020979, |
|
"grad_norm": 0.46981024742126465, |
|
"learning_rate": 1.0715459267321998e-06, |
|
"loss": 0.4336, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 3.993006993006993, |
|
"grad_norm": 0.4497096538543701, |
|
"learning_rate": 1.0418306958345214e-06, |
|
"loss": 0.4326, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.6176419258117676, |
|
"learning_rate": 1.0124853135616475e-06, |
|
"loss": 0.4261, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.594137966632843, |
|
"eval_runtime": 35.3287, |
|
"eval_samples_per_second": 18.172, |
|
"eval_steps_per_second": 2.293, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 4.013986013986014, |
|
"grad_norm": 0.48881927132606506, |
|
"learning_rate": 9.835125219259694e-07, |
|
"loss": 0.4126, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 4.027972027972028, |
|
"grad_norm": 0.47744905948638916, |
|
"learning_rate": 9.549150281252633e-07, |
|
"loss": 0.3887, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.041958041958042, |
|
"grad_norm": 0.4749980568885803, |
|
"learning_rate": 9.266955042897357e-07, |
|
"loss": 0.4085, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 4.055944055944056, |
|
"grad_norm": 0.4653206169605255, |
|
"learning_rate": 8.988565872323362e-07, |
|
"loss": 0.3949, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 4.06993006993007, |
|
"grad_norm": 0.44160446524620056, |
|
"learning_rate": 8.714008782023797e-07, |
|
"loss": 0.4049, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 4.083916083916084, |
|
"grad_norm": 0.43797171115875244, |
|
"learning_rate": 8.443309426424862e-07, |
|
"loss": 0.4038, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 4.0979020979020975, |
|
"grad_norm": 0.4569723904132843, |
|
"learning_rate": 8.176493099488664e-07, |
|
"loss": 0.3956, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 4.111888111888112, |
|
"grad_norm": 0.47445249557495117, |
|
"learning_rate": 7.913584732349788e-07, |
|
"loss": 0.4107, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 4.125874125874126, |
|
"grad_norm": 0.46384716033935547, |
|
"learning_rate": 7.654608890985709e-07, |
|
"loss": 0.3895, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 4.13986013986014, |
|
"grad_norm": 0.47651711106300354, |
|
"learning_rate": 7.399589773921412e-07, |
|
"loss": 0.3859, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"grad_norm": 0.4623275697231293, |
|
"learning_rate": 7.148551209968279e-07, |
|
"loss": 0.394, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 4.1678321678321675, |
|
"grad_norm": 0.4649985432624817, |
|
"learning_rate": 6.901516655997536e-07, |
|
"loss": 0.4108, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.181818181818182, |
|
"grad_norm": 0.4691464304924011, |
|
"learning_rate": 6.658509194748463e-07, |
|
"loss": 0.3626, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 4.195804195804196, |
|
"grad_norm": 0.48455217480659485, |
|
"learning_rate": 6.419551532671542e-07, |
|
"loss": 0.4172, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 4.20979020979021, |
|
"grad_norm": 0.482030987739563, |
|
"learning_rate": 6.184665997806832e-07, |
|
"loss": 0.4038, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 4.223776223776224, |
|
"grad_norm": 0.4398139715194702, |
|
"learning_rate": 5.953874537697573e-07, |
|
"loss": 0.4033, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 4.2377622377622375, |
|
"grad_norm": 0.46925652027130127, |
|
"learning_rate": 5.727198717339511e-07, |
|
"loss": 0.4091, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 4.251748251748252, |
|
"grad_norm": 0.46952134370803833, |
|
"learning_rate": 5.504659717165812e-07, |
|
"loss": 0.4111, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 4.251748251748252, |
|
"eval_loss": 0.6070981025695801, |
|
"eval_runtime": 35.5097, |
|
"eval_samples_per_second": 18.08, |
|
"eval_steps_per_second": 2.281, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 4.265734265734266, |
|
"grad_norm": 0.45535174012184143, |
|
"learning_rate": 5.286278331068018e-07, |
|
"loss": 0.4128, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 4.27972027972028, |
|
"grad_norm": 0.4438033998012543, |
|
"learning_rate": 5.072074964453055e-07, |
|
"loss": 0.4052, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 4.293706293706293, |
|
"grad_norm": 0.4887377917766571, |
|
"learning_rate": 4.862069632336558e-07, |
|
"loss": 0.3894, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 0.4616340100765228, |
|
"learning_rate": 4.6562819574727304e-07, |
|
"loss": 0.4242, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.321678321678322, |
|
"grad_norm": 0.44037091732025146, |
|
"learning_rate": 4.454731168520754e-07, |
|
"loss": 0.4052, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 4.335664335664336, |
|
"grad_norm": 0.4455097019672394, |
|
"learning_rate": 4.257436098248091e-07, |
|
"loss": 0.3882, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 4.34965034965035, |
|
"grad_norm": 0.47457605600357056, |
|
"learning_rate": 4.064415181770787e-07, |
|
"loss": 0.4102, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 0.4474296271800995, |
|
"learning_rate": 3.875686454830885e-07, |
|
"loss": 0.3866, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 4.3776223776223775, |
|
"grad_norm": 0.44111815094947815, |
|
"learning_rate": 3.691267552111183e-07, |
|
"loss": 0.4091, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 4.391608391608392, |
|
"grad_norm": 0.46066638827323914, |
|
"learning_rate": 3.511175705587433e-07, |
|
"loss": 0.422, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 4.405594405594406, |
|
"grad_norm": 0.4345090389251709, |
|
"learning_rate": 3.3354277429182626e-07, |
|
"loss": 0.3882, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 4.41958041958042, |
|
"grad_norm": 0.462768018245697, |
|
"learning_rate": 3.164040085872755e-07, |
|
"loss": 0.4125, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 4.433566433566433, |
|
"grad_norm": 0.4575034976005554, |
|
"learning_rate": 2.997028748796016e-07, |
|
"loss": 0.4138, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 4.4475524475524475, |
|
"grad_norm": 0.43728622794151306, |
|
"learning_rate": 2.834409337112842e-07, |
|
"loss": 0.4133, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"grad_norm": 0.4533195495605469, |
|
"learning_rate": 2.676197045869511e-07, |
|
"loss": 0.4067, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 4.475524475524476, |
|
"grad_norm": 0.44842609763145447, |
|
"learning_rate": 2.522406658313997e-07, |
|
"loss": 0.4042, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 4.489510489510489, |
|
"grad_norm": 0.4315699636936188, |
|
"learning_rate": 2.3730525445146146e-07, |
|
"loss": 0.3969, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 4.503496503496503, |
|
"grad_norm": 0.43630900979042053, |
|
"learning_rate": 2.2281486600173207e-07, |
|
"loss": 0.3907, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 4.503496503496503, |
|
"eval_loss": 0.6088654398918152, |
|
"eval_runtime": 35.0812, |
|
"eval_samples_per_second": 18.3, |
|
"eval_steps_per_second": 2.309, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 4.5174825174825175, |
|
"grad_norm": 0.43661531805992126, |
|
"learning_rate": 2.0877085445416889e-07, |
|
"loss": 0.4079, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 4.531468531468532, |
|
"grad_norm": 0.43984201550483704, |
|
"learning_rate": 1.9517453207157865e-07, |
|
"loss": 0.4071, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 0.43304693698883057, |
|
"learning_rate": 1.8202716928499842e-07, |
|
"loss": 0.4, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 4.559440559440559, |
|
"grad_norm": 0.44190627336502075, |
|
"learning_rate": 1.6932999457498823e-07, |
|
"loss": 0.3936, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 4.573426573426573, |
|
"grad_norm": 0.46403783559799194, |
|
"learning_rate": 1.5708419435684463e-07, |
|
"loss": 0.4142, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 4.5874125874125875, |
|
"grad_norm": 0.448397159576416, |
|
"learning_rate": 1.4529091286973994e-07, |
|
"loss": 0.411, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.601398601398602, |
|
"grad_norm": 0.4263162910938263, |
|
"learning_rate": 1.3395125206980774e-07, |
|
"loss": 0.3991, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.4367568790912628, |
|
"learning_rate": 1.230662715271741e-07, |
|
"loss": 0.4144, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 4.629370629370629, |
|
"grad_norm": 0.4405047297477722, |
|
"learning_rate": 1.1263698832695513e-07, |
|
"loss": 0.3935, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 4.643356643356643, |
|
"grad_norm": 0.4359452426433563, |
|
"learning_rate": 1.0266437697422026e-07, |
|
"loss": 0.3913, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 4.6573426573426575, |
|
"grad_norm": 0.44500768184661865, |
|
"learning_rate": 9.314936930293283e-08, |
|
"loss": 0.4102, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 4.671328671328672, |
|
"grad_norm": 0.46006131172180176, |
|
"learning_rate": 8.40928543888836e-08, |
|
"loss": 0.4138, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 4.685314685314685, |
|
"grad_norm": 0.44435447454452515, |
|
"learning_rate": 7.549567846661388e-08, |
|
"loss": 0.4185, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 4.699300699300699, |
|
"grad_norm": 0.43049922585487366, |
|
"learning_rate": 6.735864485034493e-08, |
|
"loss": 0.3946, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 4.713286713286713, |
|
"grad_norm": 0.4270278513431549, |
|
"learning_rate": 5.968251385891744e-08, |
|
"loss": 0.3969, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 4.7272727272727275, |
|
"grad_norm": 0.4480164647102356, |
|
"learning_rate": 5.246800274474439e-08, |
|
"loss": 0.4005, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.741258741258742, |
|
"grad_norm": 0.4490266740322113, |
|
"learning_rate": 4.571578562679757e-08, |
|
"loss": 0.3884, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 4.755244755244755, |
|
"grad_norm": 0.4623181223869324, |
|
"learning_rate": 3.9426493427611177e-08, |
|
"loss": 0.4169, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 4.755244755244755, |
|
"eval_loss": 0.6084015965461731, |
|
"eval_runtime": 34.879, |
|
"eval_samples_per_second": 18.406, |
|
"eval_steps_per_second": 2.322, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"grad_norm": 0.4283956289291382, |
|
"learning_rate": 3.360071381433516e-08, |
|
"loss": 0.3969, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 4.783216783216783, |
|
"grad_norm": 0.4356008470058441, |
|
"learning_rate": 2.823899114382078e-08, |
|
"loss": 0.4027, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 4.7972027972027975, |
|
"grad_norm": 0.44547533988952637, |
|
"learning_rate": 2.3341826411756863e-08, |
|
"loss": 0.3987, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 4.811188811188811, |
|
"grad_norm": 0.4299108386039734, |
|
"learning_rate": 1.8909677205856682e-08, |
|
"loss": 0.4017, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 4.825174825174825, |
|
"grad_norm": 0.4200840890407562, |
|
"learning_rate": 1.494295766310161e-08, |
|
"loss": 0.3885, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 4.839160839160839, |
|
"grad_norm": 0.43688181042671204, |
|
"learning_rate": 1.1442038431044856e-08, |
|
"loss": 0.4119, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 4.853146853146853, |
|
"grad_norm": 0.4302099943161011, |
|
"learning_rate": 8.407246633178601e-09, |
|
"loss": 0.3843, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 4.867132867132867, |
|
"grad_norm": 0.45412999391555786, |
|
"learning_rate": 5.838865838366792e-09, |
|
"loss": 0.4009, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.881118881118881, |
|
"grad_norm": 0.43274399638175964, |
|
"learning_rate": 3.737136034349109e-09, |
|
"loss": 0.3951, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 4.895104895104895, |
|
"grad_norm": 0.4244266450405121, |
|
"learning_rate": 2.102253605316684e-09, |
|
"loss": 0.4059, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 4.909090909090909, |
|
"grad_norm": 0.4323265552520752, |
|
"learning_rate": 9.343713135623323e-10, |
|
"loss": 0.3963, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 0.4487632215023041, |
|
"learning_rate": 2.335982852064156e-10, |
|
"loss": 0.3937, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 4.937062937062937, |
|
"grad_norm": 0.4363052546977997, |
|
"learning_rate": 0.0, |
|
"loss": 0.405, |
|
"step": 355 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 355, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 36, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.28345287429718e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |