{ |
|
"best_metric": 2.169156074523926, |
|
"best_model_checkpoint": "/home/sunggeunan/data/ICL/outputs/lora/SKIML-ICL_mrqa_nq_v3/Meta-Llama-3-8B-Instruct-unanswerable-1Q-0U-0C-qa_first/checkpoint-297", |
|
"epoch": 0.9970625262274444, |
|
"eval_steps": 500, |
|
"global_step": 297, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003357112882920688, |
|
"grad_norm": 0.39633309841156006, |
|
"learning_rate": 1.111111111111111e-08, |
|
"loss": 2.1607, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006714225765841376, |
|
"grad_norm": 0.4136360287666321, |
|
"learning_rate": 2.222222222222222e-08, |
|
"loss": 2.2063, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.010071338648762064, |
|
"grad_norm": 0.40252378582954407, |
|
"learning_rate": 3.3333333333333334e-08, |
|
"loss": 2.1702, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.013428451531682753, |
|
"grad_norm": 0.3657248914241791, |
|
"learning_rate": 4.444444444444444e-08, |
|
"loss": 2.1677, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01678556441460344, |
|
"grad_norm": 0.38506612181663513, |
|
"learning_rate": 5.555555555555555e-08, |
|
"loss": 2.203, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.020142677297524128, |
|
"grad_norm": 0.39267775416374207, |
|
"learning_rate": 6.666666666666667e-08, |
|
"loss": 2.1989, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02349979018044482, |
|
"grad_norm": 0.41893133521080017, |
|
"learning_rate": 7.777777777777778e-08, |
|
"loss": 2.1882, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.026856903063365505, |
|
"grad_norm": 0.363130122423172, |
|
"learning_rate": 8.888888888888888e-08, |
|
"loss": 2.1636, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.030214015946286196, |
|
"grad_norm": 0.43022215366363525, |
|
"learning_rate": 1e-07, |
|
"loss": 2.1881, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03357112882920688, |
|
"grad_norm": 0.43208909034729004, |
|
"learning_rate": 1.111111111111111e-07, |
|
"loss": 2.1748, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03692824171212757, |
|
"grad_norm": 0.4211503267288208, |
|
"learning_rate": 1.2222222222222222e-07, |
|
"loss": 2.2072, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.040285354595048256, |
|
"grad_norm": 0.43464261293411255, |
|
"learning_rate": 1.3333333333333334e-07, |
|
"loss": 2.1711, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.043642467477968946, |
|
"grad_norm": 0.38066577911376953, |
|
"learning_rate": 1.4444444444444442e-07, |
|
"loss": 2.1946, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04699958036088964, |
|
"grad_norm": 0.3847394585609436, |
|
"learning_rate": 1.5555555555555556e-07, |
|
"loss": 2.1638, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.05035669324381032, |
|
"grad_norm": 0.40741828083992004, |
|
"learning_rate": 1.6666666666666665e-07, |
|
"loss": 2.2389, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05371380612673101, |
|
"grad_norm": 0.37301868200302124, |
|
"learning_rate": 1.7777777777777776e-07, |
|
"loss": 2.1762, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0570709190096517, |
|
"grad_norm": 0.4193646013736725, |
|
"learning_rate": 1.8888888888888888e-07, |
|
"loss": 2.2245, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.06042803189257239, |
|
"grad_norm": 0.4078114330768585, |
|
"learning_rate": 2e-07, |
|
"loss": 2.1756, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06378514477549307, |
|
"grad_norm": 0.40552276372909546, |
|
"learning_rate": 2.111111111111111e-07, |
|
"loss": 2.1302, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.06714225765841376, |
|
"grad_norm": 0.40120214223861694, |
|
"learning_rate": 2.222222222222222e-07, |
|
"loss": 2.1546, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07049937054133446, |
|
"grad_norm": 0.3937098979949951, |
|
"learning_rate": 2.3333333333333333e-07, |
|
"loss": 2.1822, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.07385648342425515, |
|
"grad_norm": 0.39223670959472656, |
|
"learning_rate": 2.4444444444444445e-07, |
|
"loss": 2.1548, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.07721359630717582, |
|
"grad_norm": 0.395595520734787, |
|
"learning_rate": 2.5555555555555553e-07, |
|
"loss": 2.1538, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08057070919009651, |
|
"grad_norm": 0.38706085085868835, |
|
"learning_rate": 2.6666666666666667e-07, |
|
"loss": 2.162, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0839278220730172, |
|
"grad_norm": 0.40628549456596375, |
|
"learning_rate": 2.7777777777777776e-07, |
|
"loss": 2.1493, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08728493495593789, |
|
"grad_norm": 0.3962867259979248, |
|
"learning_rate": 2.8888888888888885e-07, |
|
"loss": 2.1609, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.09064204783885858, |
|
"grad_norm": 0.36925041675567627, |
|
"learning_rate": 3e-07, |
|
"loss": 2.096, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.09399916072177927, |
|
"grad_norm": 0.38802072405815125, |
|
"learning_rate": 3.111111111111111e-07, |
|
"loss": 2.2456, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.09735627360469996, |
|
"grad_norm": 0.38850194215774536, |
|
"learning_rate": 3.222222222222222e-07, |
|
"loss": 2.1663, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.10071338648762064, |
|
"grad_norm": 0.38965868949890137, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 2.2527, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10407049937054133, |
|
"grad_norm": 1.802207112312317, |
|
"learning_rate": 3.4444444444444444e-07, |
|
"loss": 2.1718, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.10742761225346202, |
|
"grad_norm": 0.41264647245407104, |
|
"learning_rate": 3.5555555555555553e-07, |
|
"loss": 2.216, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.11078472513638271, |
|
"grad_norm": 0.38629451394081116, |
|
"learning_rate": 3.666666666666666e-07, |
|
"loss": 2.1767, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1141418380193034, |
|
"grad_norm": 0.38191673159599304, |
|
"learning_rate": 3.7777777777777775e-07, |
|
"loss": 2.1206, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.11749895090222409, |
|
"grad_norm": 0.3905788064002991, |
|
"learning_rate": 3.888888888888889e-07, |
|
"loss": 2.1053, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.12085606378514478, |
|
"grad_norm": 0.4043135941028595, |
|
"learning_rate": 4e-07, |
|
"loss": 2.1955, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.12421317666806546, |
|
"grad_norm": 0.446430504322052, |
|
"learning_rate": 4.1111111111111107e-07, |
|
"loss": 2.2385, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.12757028955098615, |
|
"grad_norm": 0.38461461663246155, |
|
"learning_rate": 4.222222222222222e-07, |
|
"loss": 2.1255, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.13092740243390685, |
|
"grad_norm": 0.4022009074687958, |
|
"learning_rate": 4.3333333333333335e-07, |
|
"loss": 2.1821, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.13428451531682753, |
|
"grad_norm": 0.40789297223091125, |
|
"learning_rate": 4.444444444444444e-07, |
|
"loss": 2.1387, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1376416281997482, |
|
"grad_norm": 0.4071018099784851, |
|
"learning_rate": 4.555555555555555e-07, |
|
"loss": 2.1077, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1409987410826689, |
|
"grad_norm": 0.42578282952308655, |
|
"learning_rate": 4.6666666666666666e-07, |
|
"loss": 2.1956, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1443558539655896, |
|
"grad_norm": 0.4121275842189789, |
|
"learning_rate": 4.777777777777778e-07, |
|
"loss": 2.1491, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.1477129668485103, |
|
"grad_norm": 0.3832322657108307, |
|
"learning_rate": 4.888888888888889e-07, |
|
"loss": 2.1465, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.15107007973143097, |
|
"grad_norm": 0.4325246214866638, |
|
"learning_rate": 5e-07, |
|
"loss": 2.2452, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.15442719261435164, |
|
"grad_norm": 0.38803404569625854, |
|
"learning_rate": 4.994089834515367e-07, |
|
"loss": 2.1442, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.15778430549727235, |
|
"grad_norm": 0.38622474670410156, |
|
"learning_rate": 4.988179669030732e-07, |
|
"loss": 2.1404, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.16114141838019302, |
|
"grad_norm": 0.365347683429718, |
|
"learning_rate": 4.982269503546099e-07, |
|
"loss": 2.1322, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.16449853126311373, |
|
"grad_norm": 0.3673339784145355, |
|
"learning_rate": 4.976359338061466e-07, |
|
"loss": 2.1242, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.1678556441460344, |
|
"grad_norm": 0.3915681838989258, |
|
"learning_rate": 4.970449172576833e-07, |
|
"loss": 2.188, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1712127570289551, |
|
"grad_norm": 0.4330926239490509, |
|
"learning_rate": 4.964539007092198e-07, |
|
"loss": 2.1702, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.17456986991187579, |
|
"grad_norm": 0.40760231018066406, |
|
"learning_rate": 4.958628841607565e-07, |
|
"loss": 2.218, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.17792698279479646, |
|
"grad_norm": 0.432960569858551, |
|
"learning_rate": 4.952718676122931e-07, |
|
"loss": 2.1804, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.18128409567771717, |
|
"grad_norm": 0.38337603211402893, |
|
"learning_rate": 4.946808510638298e-07, |
|
"loss": 2.1611, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.18464120856063784, |
|
"grad_norm": 0.4071826636791229, |
|
"learning_rate": 4.940898345153664e-07, |
|
"loss": 2.1569, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.18799832144355855, |
|
"grad_norm": 0.416966050863266, |
|
"learning_rate": 4.934988179669031e-07, |
|
"loss": 2.1722, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.19135543432647922, |
|
"grad_norm": 0.42446526885032654, |
|
"learning_rate": 4.929078014184397e-07, |
|
"loss": 2.1529, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.19471254720939993, |
|
"grad_norm": 0.41747376322746277, |
|
"learning_rate": 4.923167848699764e-07, |
|
"loss": 2.1191, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1980696600923206, |
|
"grad_norm": 0.44791901111602783, |
|
"learning_rate": 4.917257683215129e-07, |
|
"loss": 2.1183, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.20142677297524128, |
|
"grad_norm": 0.39679446816444397, |
|
"learning_rate": 4.911347517730496e-07, |
|
"loss": 2.161, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20478388585816198, |
|
"grad_norm": 0.38211897015571594, |
|
"learning_rate": 4.905437352245863e-07, |
|
"loss": 2.1334, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.20814099874108266, |
|
"grad_norm": 0.4393980801105499, |
|
"learning_rate": 4.89952718676123e-07, |
|
"loss": 2.2287, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.21149811162400337, |
|
"grad_norm": 0.40504157543182373, |
|
"learning_rate": 4.893617021276595e-07, |
|
"loss": 2.1986, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.21485522450692404, |
|
"grad_norm": 0.40123313665390015, |
|
"learning_rate": 4.887706855791962e-07, |
|
"loss": 2.2045, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.21821233738984475, |
|
"grad_norm": 0.4357161819934845, |
|
"learning_rate": 4.881796690307328e-07, |
|
"loss": 2.235, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.22156945027276542, |
|
"grad_norm": 0.39656224846839905, |
|
"learning_rate": 4.875886524822695e-07, |
|
"loss": 2.1712, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.2249265631556861, |
|
"grad_norm": 0.41355738043785095, |
|
"learning_rate": 4.869976359338061e-07, |
|
"loss": 2.1957, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2282836760386068, |
|
"grad_norm": 0.4384121298789978, |
|
"learning_rate": 4.864066193853428e-07, |
|
"loss": 2.1832, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.23164078892152748, |
|
"grad_norm": 0.4240085184574127, |
|
"learning_rate": 4.858156028368794e-07, |
|
"loss": 2.1331, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.23499790180444818, |
|
"grad_norm": 0.38766977190971375, |
|
"learning_rate": 4.852245862884161e-07, |
|
"loss": 2.1492, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23835501468736886, |
|
"grad_norm": 0.4235953390598297, |
|
"learning_rate": 4.846335697399526e-07, |
|
"loss": 2.2232, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.24171212757028956, |
|
"grad_norm": 0.41447708010673523, |
|
"learning_rate": 4.840425531914893e-07, |
|
"loss": 2.1978, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.24506924045321024, |
|
"grad_norm": 0.4142104685306549, |
|
"learning_rate": 4.83451536643026e-07, |
|
"loss": 2.1532, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.24842635333613092, |
|
"grad_norm": 0.4122621417045593, |
|
"learning_rate": 4.828605200945627e-07, |
|
"loss": 2.2096, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2517834662190516, |
|
"grad_norm": 0.4637957513332367, |
|
"learning_rate": 4.822695035460992e-07, |
|
"loss": 2.2043, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2551405791019723, |
|
"grad_norm": 0.4476231038570404, |
|
"learning_rate": 4.816784869976359e-07, |
|
"loss": 2.2264, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.258497691984893, |
|
"grad_norm": 0.40626445412635803, |
|
"learning_rate": 4.810874704491725e-07, |
|
"loss": 2.2184, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.2618548048678137, |
|
"grad_norm": 0.4468678832054138, |
|
"learning_rate": 4.804964539007092e-07, |
|
"loss": 2.2189, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2652119177507344, |
|
"grad_norm": 0.42194902896881104, |
|
"learning_rate": 4.799054373522458e-07, |
|
"loss": 2.1769, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.26856903063365506, |
|
"grad_norm": 0.4420163035392761, |
|
"learning_rate": 4.793144208037825e-07, |
|
"loss": 2.1329, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27192614351657574, |
|
"grad_norm": 0.4482937455177307, |
|
"learning_rate": 4.787234042553192e-07, |
|
"loss": 2.1954, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2752832563994964, |
|
"grad_norm": 0.41959258913993835, |
|
"learning_rate": 4.781323877068558e-07, |
|
"loss": 2.1776, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.27864036928241714, |
|
"grad_norm": 0.42231133580207825, |
|
"learning_rate": 4.775413711583924e-07, |
|
"loss": 2.139, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.2819974821653378, |
|
"grad_norm": 0.4405987560749054, |
|
"learning_rate": 4.76950354609929e-07, |
|
"loss": 2.2445, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2853545950482585, |
|
"grad_norm": 0.394240140914917, |
|
"learning_rate": 4.7635933806146573e-07, |
|
"loss": 2.1289, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2887117079311792, |
|
"grad_norm": 0.44175001978874207, |
|
"learning_rate": 4.7576832151300236e-07, |
|
"loss": 2.2419, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.29206882081409985, |
|
"grad_norm": 0.41716253757476807, |
|
"learning_rate": 4.75177304964539e-07, |
|
"loss": 2.2557, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2954259336970206, |
|
"grad_norm": 0.41680270433425903, |
|
"learning_rate": 4.745862884160756e-07, |
|
"loss": 2.1866, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.29878304657994126, |
|
"grad_norm": 0.4188416600227356, |
|
"learning_rate": 4.739952718676123e-07, |
|
"loss": 2.1909, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.30214015946286193, |
|
"grad_norm": 0.41669386625289917, |
|
"learning_rate": 4.734042553191489e-07, |
|
"loss": 2.1913, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3054972723457826, |
|
"grad_norm": 0.4323998689651489, |
|
"learning_rate": 4.728132387706856e-07, |
|
"loss": 2.1811, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.3088543852287033, |
|
"grad_norm": 0.431393027305603, |
|
"learning_rate": 4.722222222222222e-07, |
|
"loss": 2.1772, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.312211498111624, |
|
"grad_norm": 0.4159488081932068, |
|
"learning_rate": 4.716312056737589e-07, |
|
"loss": 2.1459, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3155686109945447, |
|
"grad_norm": 0.4011417329311371, |
|
"learning_rate": 4.7104018912529545e-07, |
|
"loss": 2.1657, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.3189257238774654, |
|
"grad_norm": 0.41295671463012695, |
|
"learning_rate": 4.7044917257683213e-07, |
|
"loss": 2.1217, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.32228283676038605, |
|
"grad_norm": 0.4088380038738251, |
|
"learning_rate": 4.6985815602836876e-07, |
|
"loss": 2.152, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3256399496433068, |
|
"grad_norm": 0.43500083684921265, |
|
"learning_rate": 4.6926713947990543e-07, |
|
"loss": 2.2021, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.32899706252622746, |
|
"grad_norm": 0.4200705587863922, |
|
"learning_rate": 4.6867612293144206e-07, |
|
"loss": 2.1357, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.33235417540914813, |
|
"grad_norm": 0.4516183137893677, |
|
"learning_rate": 4.6808510638297873e-07, |
|
"loss": 2.2323, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3357112882920688, |
|
"grad_norm": 0.49128514528274536, |
|
"learning_rate": 4.674940898345153e-07, |
|
"loss": 2.2364, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3390684011749895, |
|
"grad_norm": 0.4172728657722473, |
|
"learning_rate": 4.66903073286052e-07, |
|
"loss": 2.2085, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.3424255140579102, |
|
"grad_norm": 0.4487544000148773, |
|
"learning_rate": 4.663120567375886e-07, |
|
"loss": 2.1661, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3457826269408309, |
|
"grad_norm": 0.4443681538105011, |
|
"learning_rate": 4.657210401891253e-07, |
|
"loss": 2.2109, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.34913973982375157, |
|
"grad_norm": 0.4674079418182373, |
|
"learning_rate": 4.651300236406619e-07, |
|
"loss": 2.1596, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.35249685270667225, |
|
"grad_norm": 0.43013623356819153, |
|
"learning_rate": 4.645390070921986e-07, |
|
"loss": 2.1658, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3558539655895929, |
|
"grad_norm": 0.43104687333106995, |
|
"learning_rate": 4.6394799054373515e-07, |
|
"loss": 2.1686, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.35921107847251366, |
|
"grad_norm": 0.4218711853027344, |
|
"learning_rate": 4.6335697399527183e-07, |
|
"loss": 2.141, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.36256819135543433, |
|
"grad_norm": 0.45031747221946716, |
|
"learning_rate": 4.6276595744680846e-07, |
|
"loss": 2.1561, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.365925304238355, |
|
"grad_norm": 0.48128026723861694, |
|
"learning_rate": 4.6217494089834513e-07, |
|
"loss": 2.214, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3692824171212757, |
|
"grad_norm": 0.44868627190589905, |
|
"learning_rate": 4.6158392434988176e-07, |
|
"loss": 2.1488, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3726395300041964, |
|
"grad_norm": 0.44237226247787476, |
|
"learning_rate": 4.6099290780141843e-07, |
|
"loss": 2.1099, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3759966428871171, |
|
"grad_norm": 0.42734286189079285, |
|
"learning_rate": 4.604018912529551e-07, |
|
"loss": 2.2332, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.37935375577003777, |
|
"grad_norm": 0.45235806703567505, |
|
"learning_rate": 4.598108747044917e-07, |
|
"loss": 2.1925, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.38271086865295845, |
|
"grad_norm": 0.4485257863998413, |
|
"learning_rate": 4.5921985815602836e-07, |
|
"loss": 2.1786, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3860679815358791, |
|
"grad_norm": 0.45567062497138977, |
|
"learning_rate": 4.58628841607565e-07, |
|
"loss": 2.1386, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.38942509441879986, |
|
"grad_norm": 0.45261716842651367, |
|
"learning_rate": 4.5803782505910166e-07, |
|
"loss": 2.136, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.39278220730172053, |
|
"grad_norm": 0.4375866949558258, |
|
"learning_rate": 4.574468085106383e-07, |
|
"loss": 2.1422, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3961393201846412, |
|
"grad_norm": 0.46383175253868103, |
|
"learning_rate": 4.5685579196217496e-07, |
|
"loss": 2.1732, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3994964330675619, |
|
"grad_norm": 0.4010314345359802, |
|
"learning_rate": 4.5626477541371153e-07, |
|
"loss": 2.1329, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.40285354595048256, |
|
"grad_norm": 0.4446873068809509, |
|
"learning_rate": 4.556737588652482e-07, |
|
"loss": 2.1791, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4062106588334033, |
|
"grad_norm": 0.47618600726127625, |
|
"learning_rate": 4.5508274231678483e-07, |
|
"loss": 2.236, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.40956777171632397, |
|
"grad_norm": 0.4493118226528168, |
|
"learning_rate": 4.544917257683215e-07, |
|
"loss": 2.1575, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.41292488459924465, |
|
"grad_norm": 0.4111258387565613, |
|
"learning_rate": 4.5390070921985813e-07, |
|
"loss": 2.24, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4162819974821653, |
|
"grad_norm": 0.41655582189559937, |
|
"learning_rate": 4.533096926713948e-07, |
|
"loss": 2.0788, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.419639110365086, |
|
"grad_norm": 0.47266441583633423, |
|
"learning_rate": 4.5271867612293143e-07, |
|
"loss": 2.1774, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.42299622324800673, |
|
"grad_norm": 0.464999794960022, |
|
"learning_rate": 4.5212765957446806e-07, |
|
"loss": 2.143, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4263533361309274, |
|
"grad_norm": 0.44828522205352783, |
|
"learning_rate": 4.515366430260047e-07, |
|
"loss": 2.1363, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.4297104490138481, |
|
"grad_norm": 0.4714733362197876, |
|
"learning_rate": 4.5094562647754136e-07, |
|
"loss": 2.225, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.43306756189676876, |
|
"grad_norm": 0.42666733264923096, |
|
"learning_rate": 4.50354609929078e-07, |
|
"loss": 2.1774, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4364246747796895, |
|
"grad_norm": 0.46839290857315063, |
|
"learning_rate": 4.4976359338061466e-07, |
|
"loss": 2.1427, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.43978178766261017, |
|
"grad_norm": 0.48040419816970825, |
|
"learning_rate": 4.491725768321513e-07, |
|
"loss": 2.1909, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.44313890054553084, |
|
"grad_norm": 0.4932810962200165, |
|
"learning_rate": 4.485815602836879e-07, |
|
"loss": 2.1226, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4464960134284515, |
|
"grad_norm": 0.4730973541736603, |
|
"learning_rate": 4.4799054373522453e-07, |
|
"loss": 2.1844, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4498531263113722, |
|
"grad_norm": 0.44282010197639465, |
|
"learning_rate": 4.473995271867612e-07, |
|
"loss": 2.1503, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.45321023919429293, |
|
"grad_norm": 0.4495702087879181, |
|
"learning_rate": 4.4680851063829783e-07, |
|
"loss": 2.1599, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4565673520772136, |
|
"grad_norm": 0.44728878140449524, |
|
"learning_rate": 4.462174940898345e-07, |
|
"loss": 2.1479, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.4599244649601343, |
|
"grad_norm": 0.4495660960674286, |
|
"learning_rate": 4.4562647754137114e-07, |
|
"loss": 2.1272, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.46328157784305496, |
|
"grad_norm": 0.4553879499435425, |
|
"learning_rate": 4.4503546099290776e-07, |
|
"loss": 2.2729, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.46663869072597564, |
|
"grad_norm": 0.46510016918182373, |
|
"learning_rate": 4.444444444444444e-07, |
|
"loss": 2.2367, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.46999580360889637, |
|
"grad_norm": 0.4671325981616974, |
|
"learning_rate": 4.4385342789598106e-07, |
|
"loss": 2.1167, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.47335291649181704, |
|
"grad_norm": 0.4627954661846161, |
|
"learning_rate": 4.432624113475177e-07, |
|
"loss": 2.2521, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.4767100293747377, |
|
"grad_norm": 0.4297815263271332, |
|
"learning_rate": 4.4267139479905436e-07, |
|
"loss": 2.1935, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.4800671422576584, |
|
"grad_norm": 0.4634767770767212, |
|
"learning_rate": 4.4208037825059104e-07, |
|
"loss": 2.1128, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.48342425514057913, |
|
"grad_norm": 0.4689215421676636, |
|
"learning_rate": 4.4148936170212766e-07, |
|
"loss": 2.2286, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.4867813680234998, |
|
"grad_norm": 0.4813438355922699, |
|
"learning_rate": 4.408983451536643e-07, |
|
"loss": 2.1433, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4901384809064205, |
|
"grad_norm": 0.45745640993118286, |
|
"learning_rate": 4.403073286052009e-07, |
|
"loss": 2.1949, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.49349559378934116, |
|
"grad_norm": 0.4202418625354767, |
|
"learning_rate": 4.397163120567376e-07, |
|
"loss": 2.1028, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.49685270667226183, |
|
"grad_norm": 0.42282456159591675, |
|
"learning_rate": 4.391252955082742e-07, |
|
"loss": 2.1114, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5002098195551825, |
|
"grad_norm": 0.4623030424118042, |
|
"learning_rate": 4.385342789598109e-07, |
|
"loss": 2.1618, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5035669324381032, |
|
"grad_norm": 0.4584071934223175, |
|
"learning_rate": 4.379432624113475e-07, |
|
"loss": 2.2274, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5069240453210239, |
|
"grad_norm": 0.43828415870666504, |
|
"learning_rate": 4.3735224586288414e-07, |
|
"loss": 2.1807, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.5102811582039446, |
|
"grad_norm": 0.4550941288471222, |
|
"learning_rate": 4.3676122931442076e-07, |
|
"loss": 2.1355, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.5136382710868653, |
|
"grad_norm": 0.4852266013622284, |
|
"learning_rate": 4.3617021276595744e-07, |
|
"loss": 2.1623, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.516995383969786, |
|
"grad_norm": 0.450320303440094, |
|
"learning_rate": 4.3557919621749406e-07, |
|
"loss": 2.1963, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5203524968527067, |
|
"grad_norm": 0.4544139504432678, |
|
"learning_rate": 4.3498817966903074e-07, |
|
"loss": 2.1413, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5237096097356274, |
|
"grad_norm": 0.4609904885292053, |
|
"learning_rate": 4.3439716312056736e-07, |
|
"loss": 2.2289, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.527066722618548, |
|
"grad_norm": 0.46614569425582886, |
|
"learning_rate": 4.3380614657210404e-07, |
|
"loss": 2.1289, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.5304238355014688, |
|
"grad_norm": 0.4586597681045532, |
|
"learning_rate": 4.332151300236406e-07, |
|
"loss": 2.1033, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5337809483843894, |
|
"grad_norm": 0.4757809340953827, |
|
"learning_rate": 4.326241134751773e-07, |
|
"loss": 2.1708, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5371380612673101, |
|
"grad_norm": 0.45364031195640564, |
|
"learning_rate": 4.320330969267139e-07, |
|
"loss": 2.1473, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5404951741502309, |
|
"grad_norm": 0.45321136713027954, |
|
"learning_rate": 4.314420803782506e-07, |
|
"loss": 2.177, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5438522870331515, |
|
"grad_norm": 0.43466734886169434, |
|
"learning_rate": 4.308510638297872e-07, |
|
"loss": 2.1304, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5472093999160722, |
|
"grad_norm": 0.4303533732891083, |
|
"learning_rate": 4.302600472813239e-07, |
|
"loss": 2.0758, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5505665127989928, |
|
"grad_norm": 0.47530239820480347, |
|
"learning_rate": 4.2966903073286046e-07, |
|
"loss": 2.2194, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.5539236256819136, |
|
"grad_norm": 0.4379255175590515, |
|
"learning_rate": 4.2907801418439714e-07, |
|
"loss": 2.1497, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5572807385648343, |
|
"grad_norm": 0.4771229922771454, |
|
"learning_rate": 4.2848699763593376e-07, |
|
"loss": 2.1491, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.5606378514477549, |
|
"grad_norm": 0.4536450505256653, |
|
"learning_rate": 4.2789598108747044e-07, |
|
"loss": 2.2061, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.5639949643306756, |
|
"grad_norm": 0.46324947476387024, |
|
"learning_rate": 4.2730496453900706e-07, |
|
"loss": 2.1859, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.5673520772135963, |
|
"grad_norm": 0.4493923485279083, |
|
"learning_rate": 4.2671394799054374e-07, |
|
"loss": 2.0956, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.570709190096517, |
|
"grad_norm": 0.4963778853416443, |
|
"learning_rate": 4.261229314420803e-07, |
|
"loss": 2.1433, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5740663029794377, |
|
"grad_norm": 0.5063489675521851, |
|
"learning_rate": 4.25531914893617e-07, |
|
"loss": 2.1887, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.5774234158623583, |
|
"grad_norm": 0.4580891728401184, |
|
"learning_rate": 4.249408983451536e-07, |
|
"loss": 2.1164, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5807805287452791, |
|
"grad_norm": 0.4890580177307129, |
|
"learning_rate": 4.243498817966903e-07, |
|
"loss": 2.1647, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5841376416281997, |
|
"grad_norm": 0.45317739248275757, |
|
"learning_rate": 4.237588652482269e-07, |
|
"loss": 2.1837, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5874947545111204, |
|
"grad_norm": 0.4900612533092499, |
|
"learning_rate": 4.231678486997636e-07, |
|
"loss": 2.1558, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5908518673940412, |
|
"grad_norm": 0.47292637825012207, |
|
"learning_rate": 4.2257683215130027e-07, |
|
"loss": 2.163, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5942089802769618, |
|
"grad_norm": 0.4768417477607727, |
|
"learning_rate": 4.2198581560283684e-07, |
|
"loss": 2.1963, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.5975660931598825, |
|
"grad_norm": 0.4955364465713501, |
|
"learning_rate": 4.213947990543735e-07, |
|
"loss": 2.1402, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.6009232060428031, |
|
"grad_norm": 0.46482154726982117, |
|
"learning_rate": 4.2080378250591014e-07, |
|
"loss": 2.238, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.6042803189257239, |
|
"grad_norm": 0.47761717438697815, |
|
"learning_rate": 4.202127659574468e-07, |
|
"loss": 2.1089, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6076374318086446, |
|
"grad_norm": 0.48028987646102905, |
|
"learning_rate": 4.1962174940898344e-07, |
|
"loss": 2.1946, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.6109945446915652, |
|
"grad_norm": 0.4602825939655304, |
|
"learning_rate": 4.190307328605201e-07, |
|
"loss": 2.1304, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.614351657574486, |
|
"grad_norm": 0.4691866338253021, |
|
"learning_rate": 4.184397163120567e-07, |
|
"loss": 2.1363, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6177087704574066, |
|
"grad_norm": 0.46271318197250366, |
|
"learning_rate": 4.1784869976359336e-07, |
|
"loss": 2.1805, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.6210658833403273, |
|
"grad_norm": 0.48010194301605225, |
|
"learning_rate": 4.1725768321513e-07, |
|
"loss": 2.178, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.624422996223248, |
|
"grad_norm": 0.45885005593299866, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 2.1575, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6277801091061687, |
|
"grad_norm": 0.45524775981903076, |
|
"learning_rate": 4.160756501182033e-07, |
|
"loss": 2.1303, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.6311372219890894, |
|
"grad_norm": 0.4570733606815338, |
|
"learning_rate": 4.1548463356973997e-07, |
|
"loss": 2.1628, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.6344943348720101, |
|
"grad_norm": 0.489170640707016, |
|
"learning_rate": 4.148936170212766e-07, |
|
"loss": 2.1663, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6378514477549307, |
|
"grad_norm": 0.47888293862342834, |
|
"learning_rate": 4.143026004728132e-07, |
|
"loss": 2.1347, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6412085606378515, |
|
"grad_norm": 0.4729193449020386, |
|
"learning_rate": 4.1371158392434984e-07, |
|
"loss": 2.1394, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.6445656735207721, |
|
"grad_norm": 0.5049130320549011, |
|
"learning_rate": 4.131205673758865e-07, |
|
"loss": 2.2181, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.6479227864036928, |
|
"grad_norm": 0.44132182002067566, |
|
"learning_rate": 4.1252955082742314e-07, |
|
"loss": 2.1427, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.6512798992866136, |
|
"grad_norm": 0.49706417322158813, |
|
"learning_rate": 4.119385342789598e-07, |
|
"loss": 2.1993, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.6546370121695342, |
|
"grad_norm": 0.46416929364204407, |
|
"learning_rate": 4.1134751773049644e-07, |
|
"loss": 2.1108, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6579941250524549, |
|
"grad_norm": 0.4778405427932739, |
|
"learning_rate": 4.1075650118203306e-07, |
|
"loss": 2.1694, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.6613512379353755, |
|
"grad_norm": 0.46708041429519653, |
|
"learning_rate": 4.101654846335697e-07, |
|
"loss": 2.184, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.6647083508182963, |
|
"grad_norm": 0.48584261536598206, |
|
"learning_rate": 4.0957446808510637e-07, |
|
"loss": 2.1222, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.668065463701217, |
|
"grad_norm": 0.5111873745918274, |
|
"learning_rate": 4.08983451536643e-07, |
|
"loss": 2.1711, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.6714225765841376, |
|
"grad_norm": 0.4958716630935669, |
|
"learning_rate": 4.0839243498817967e-07, |
|
"loss": 2.1523, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6747796894670584, |
|
"grad_norm": 0.48708048462867737, |
|
"learning_rate": 4.078014184397163e-07, |
|
"loss": 2.1921, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.678136802349979, |
|
"grad_norm": 0.47986796498298645, |
|
"learning_rate": 4.0721040189125297e-07, |
|
"loss": 2.1619, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.6814939152328997, |
|
"grad_norm": 0.487250417470932, |
|
"learning_rate": 4.0661938534278954e-07, |
|
"loss": 2.2103, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.6848510281158204, |
|
"grad_norm": 0.5118921995162964, |
|
"learning_rate": 4.060283687943262e-07, |
|
"loss": 2.2549, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.6882081409987411, |
|
"grad_norm": 0.5187731981277466, |
|
"learning_rate": 4.0543735224586284e-07, |
|
"loss": 2.2144, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6915652538816618, |
|
"grad_norm": 0.4841180145740509, |
|
"learning_rate": 4.048463356973995e-07, |
|
"loss": 2.1528, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.6949223667645824, |
|
"grad_norm": 0.47858700156211853, |
|
"learning_rate": 4.0425531914893614e-07, |
|
"loss": 2.1743, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.6982794796475031, |
|
"grad_norm": 0.47898271679878235, |
|
"learning_rate": 4.036643026004728e-07, |
|
"loss": 2.1268, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.7016365925304239, |
|
"grad_norm": 0.4743264615535736, |
|
"learning_rate": 4.0307328605200944e-07, |
|
"loss": 2.1992, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.7049937054133445, |
|
"grad_norm": 0.5258775353431702, |
|
"learning_rate": 4.0248226950354607e-07, |
|
"loss": 2.1288, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7083508182962652, |
|
"grad_norm": 0.4403035044670105, |
|
"learning_rate": 4.0189125295508274e-07, |
|
"loss": 2.1033, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.7117079311791858, |
|
"grad_norm": 0.4601992666721344, |
|
"learning_rate": 4.0130023640661937e-07, |
|
"loss": 2.2051, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.7150650440621066, |
|
"grad_norm": 0.48560434579849243, |
|
"learning_rate": 4.0070921985815604e-07, |
|
"loss": 2.1469, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.7184221569450273, |
|
"grad_norm": 0.4823721945285797, |
|
"learning_rate": 4.0011820330969267e-07, |
|
"loss": 2.1988, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.7217792698279479, |
|
"grad_norm": 0.48195022344589233, |
|
"learning_rate": 3.995271867612293e-07, |
|
"loss": 2.1211, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7251363827108687, |
|
"grad_norm": 0.5148845314979553, |
|
"learning_rate": 3.989361702127659e-07, |
|
"loss": 2.1803, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.7284934955937893, |
|
"grad_norm": 0.4884459376335144, |
|
"learning_rate": 3.983451536643026e-07, |
|
"loss": 2.1332, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.73185060847671, |
|
"grad_norm": 0.5225220322608948, |
|
"learning_rate": 3.977541371158392e-07, |
|
"loss": 2.1582, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.7352077213596308, |
|
"grad_norm": 0.4897938668727875, |
|
"learning_rate": 3.971631205673759e-07, |
|
"loss": 2.1322, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.7385648342425514, |
|
"grad_norm": 0.502916693687439, |
|
"learning_rate": 3.965721040189125e-07, |
|
"loss": 2.1518, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7419219471254721, |
|
"grad_norm": 0.4693153202533722, |
|
"learning_rate": 3.959810874704492e-07, |
|
"loss": 2.1207, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.7452790600083928, |
|
"grad_norm": 0.4866141676902771, |
|
"learning_rate": 3.9539007092198577e-07, |
|
"loss": 2.1424, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.7486361728913135, |
|
"grad_norm": 0.48267892003059387, |
|
"learning_rate": 3.9479905437352244e-07, |
|
"loss": 2.1664, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.7519932857742342, |
|
"grad_norm": 0.505587637424469, |
|
"learning_rate": 3.9420803782505907e-07, |
|
"loss": 2.1723, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.7553503986571548, |
|
"grad_norm": 0.47869905829429626, |
|
"learning_rate": 3.9361702127659574e-07, |
|
"loss": 2.0781, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7587075115400755, |
|
"grad_norm": 0.487474650144577, |
|
"learning_rate": 3.9302600472813237e-07, |
|
"loss": 2.1889, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.7620646244229963, |
|
"grad_norm": 0.5115759968757629, |
|
"learning_rate": 3.9243498817966904e-07, |
|
"loss": 2.2055, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.7654217373059169, |
|
"grad_norm": 0.4802757203578949, |
|
"learning_rate": 3.918439716312056e-07, |
|
"loss": 2.1542, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.7687788501888376, |
|
"grad_norm": 0.48687273263931274, |
|
"learning_rate": 3.912529550827423e-07, |
|
"loss": 2.2195, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.7721359630717582, |
|
"grad_norm": 0.5212287902832031, |
|
"learning_rate": 3.906619385342789e-07, |
|
"loss": 2.2386, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.775493075954679, |
|
"grad_norm": 0.4856519401073456, |
|
"learning_rate": 3.900709219858156e-07, |
|
"loss": 2.1322, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.7788501888375997, |
|
"grad_norm": 0.4821922183036804, |
|
"learning_rate": 3.894799054373522e-07, |
|
"loss": 2.1594, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.7822073017205203, |
|
"grad_norm": 0.46911802887916565, |
|
"learning_rate": 3.888888888888889e-07, |
|
"loss": 2.1279, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.7855644146034411, |
|
"grad_norm": 0.5064778923988342, |
|
"learning_rate": 3.8829787234042547e-07, |
|
"loss": 2.1294, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.7889215274863617, |
|
"grad_norm": 0.5024438500404358, |
|
"learning_rate": 3.8770685579196214e-07, |
|
"loss": 2.1321, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7922786403692824, |
|
"grad_norm": 0.5185412168502808, |
|
"learning_rate": 3.8711583924349877e-07, |
|
"loss": 2.13, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.7956357532522031, |
|
"grad_norm": 0.5049921274185181, |
|
"learning_rate": 3.8652482269503544e-07, |
|
"loss": 2.165, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.7989928661351238, |
|
"grad_norm": 0.5252367258071899, |
|
"learning_rate": 3.8593380614657207e-07, |
|
"loss": 2.0951, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.8023499790180445, |
|
"grad_norm": 0.5152316093444824, |
|
"learning_rate": 3.8534278959810874e-07, |
|
"loss": 2.1517, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.8057070919009651, |
|
"grad_norm": 0.4972199499607086, |
|
"learning_rate": 3.8475177304964537e-07, |
|
"loss": 2.1474, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8090642047838859, |
|
"grad_norm": 0.5103582143783569, |
|
"learning_rate": 3.84160756501182e-07, |
|
"loss": 2.1318, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.8124213176668066, |
|
"grad_norm": 0.4988660216331482, |
|
"learning_rate": 3.8356973995271867e-07, |
|
"loss": 2.1228, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.8157784305497272, |
|
"grad_norm": 0.5007835030555725, |
|
"learning_rate": 3.829787234042553e-07, |
|
"loss": 2.1176, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.8191355434326479, |
|
"grad_norm": 0.4536113440990448, |
|
"learning_rate": 3.8238770685579197e-07, |
|
"loss": 2.1406, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.8224926563155686, |
|
"grad_norm": 0.5342024564743042, |
|
"learning_rate": 3.817966903073286e-07, |
|
"loss": 2.1462, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8258497691984893, |
|
"grad_norm": 0.48217201232910156, |
|
"learning_rate": 3.8120567375886527e-07, |
|
"loss": 2.1259, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.82920688208141, |
|
"grad_norm": 0.5227500200271606, |
|
"learning_rate": 3.8061465721040184e-07, |
|
"loss": 2.1516, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.8325639949643306, |
|
"grad_norm": 0.47303012013435364, |
|
"learning_rate": 3.800236406619385e-07, |
|
"loss": 2.1713, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.8359211078472514, |
|
"grad_norm": 0.512878954410553, |
|
"learning_rate": 3.7943262411347514e-07, |
|
"loss": 2.1799, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.839278220730172, |
|
"grad_norm": 0.5365780591964722, |
|
"learning_rate": 3.788416075650118e-07, |
|
"loss": 2.1795, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8426353336130927, |
|
"grad_norm": 0.5341731905937195, |
|
"learning_rate": 3.7825059101654844e-07, |
|
"loss": 2.2331, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.8459924464960135, |
|
"grad_norm": 0.4720432758331299, |
|
"learning_rate": 3.776595744680851e-07, |
|
"loss": 2.155, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.8493495593789341, |
|
"grad_norm": 0.5171768665313721, |
|
"learning_rate": 3.7706855791962175e-07, |
|
"loss": 2.1395, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.8527066722618548, |
|
"grad_norm": 0.5279157757759094, |
|
"learning_rate": 3.7647754137115837e-07, |
|
"loss": 2.1647, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.8560637851447755, |
|
"grad_norm": 0.5167645812034607, |
|
"learning_rate": 3.75886524822695e-07, |
|
"loss": 2.1915, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8594208980276962, |
|
"grad_norm": 0.4854820668697357, |
|
"learning_rate": 3.7529550827423167e-07, |
|
"loss": 2.1293, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.8627780109106169, |
|
"grad_norm": 0.5053945183753967, |
|
"learning_rate": 3.747044917257683e-07, |
|
"loss": 2.1776, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.8661351237935375, |
|
"grad_norm": 0.5340734720230103, |
|
"learning_rate": 3.7411347517730497e-07, |
|
"loss": 2.2158, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.8694922366764583, |
|
"grad_norm": 0.5089324116706848, |
|
"learning_rate": 3.735224586288416e-07, |
|
"loss": 2.1451, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.872849349559379, |
|
"grad_norm": 0.49475711584091187, |
|
"learning_rate": 3.729314420803782e-07, |
|
"loss": 2.1416, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8762064624422996, |
|
"grad_norm": 0.5191430449485779, |
|
"learning_rate": 3.7234042553191484e-07, |
|
"loss": 2.1815, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.8795635753252203, |
|
"grad_norm": 0.4857535660266876, |
|
"learning_rate": 3.717494089834515e-07, |
|
"loss": 2.1388, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.882920688208141, |
|
"grad_norm": 0.4946460425853729, |
|
"learning_rate": 3.7115839243498815e-07, |
|
"loss": 2.1562, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.8862778010910617, |
|
"grad_norm": 0.4693676233291626, |
|
"learning_rate": 3.705673758865248e-07, |
|
"loss": 2.1189, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.8896349139739824, |
|
"grad_norm": 0.5070816278457642, |
|
"learning_rate": 3.6997635933806145e-07, |
|
"loss": 2.1278, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.892992026856903, |
|
"grad_norm": 0.5286785960197449, |
|
"learning_rate": 3.693853427895981e-07, |
|
"loss": 2.1917, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.8963491397398238, |
|
"grad_norm": 0.48202502727508545, |
|
"learning_rate": 3.687943262411347e-07, |
|
"loss": 2.1224, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.8997062526227444, |
|
"grad_norm": 0.5092111825942993, |
|
"learning_rate": 3.6820330969267137e-07, |
|
"loss": 2.2141, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.9030633655056651, |
|
"grad_norm": 0.5308806300163269, |
|
"learning_rate": 3.67612293144208e-07, |
|
"loss": 2.151, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.9064204783885859, |
|
"grad_norm": 0.5302571058273315, |
|
"learning_rate": 3.6702127659574467e-07, |
|
"loss": 2.1899, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9097775912715065, |
|
"grad_norm": 0.489431768655777, |
|
"learning_rate": 3.664302600472813e-07, |
|
"loss": 2.1448, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.9131347041544272, |
|
"grad_norm": 0.47753775119781494, |
|
"learning_rate": 3.6583924349881797e-07, |
|
"loss": 2.1036, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.9164918170373478, |
|
"grad_norm": 0.49404028058052063, |
|
"learning_rate": 3.652482269503546e-07, |
|
"loss": 2.1422, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.9198489299202686, |
|
"grad_norm": 0.5034516453742981, |
|
"learning_rate": 3.646572104018912e-07, |
|
"loss": 2.152, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.9232060428031893, |
|
"grad_norm": 0.5550661683082581, |
|
"learning_rate": 3.640661938534279e-07, |
|
"loss": 2.1861, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9265631556861099, |
|
"grad_norm": 0.4908338487148285, |
|
"learning_rate": 3.634751773049645e-07, |
|
"loss": 2.1026, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.9299202685690307, |
|
"grad_norm": 0.5155569911003113, |
|
"learning_rate": 3.628841607565012e-07, |
|
"loss": 2.1006, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.9332773814519513, |
|
"grad_norm": 0.5384230613708496, |
|
"learning_rate": 3.622931442080378e-07, |
|
"loss": 2.2128, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.936634494334872, |
|
"grad_norm": 0.5264031291007996, |
|
"learning_rate": 3.617021276595745e-07, |
|
"loss": 2.1531, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.9399916072177927, |
|
"grad_norm": 0.5026865601539612, |
|
"learning_rate": 3.6111111111111107e-07, |
|
"loss": 2.1594, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9433487201007134, |
|
"grad_norm": 0.4906868040561676, |
|
"learning_rate": 3.6052009456264775e-07, |
|
"loss": 2.1489, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.9467058329836341, |
|
"grad_norm": 0.5679292678833008, |
|
"learning_rate": 3.5992907801418437e-07, |
|
"loss": 2.1501, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.9500629458665547, |
|
"grad_norm": 0.49988269805908203, |
|
"learning_rate": 3.5933806146572105e-07, |
|
"loss": 2.1413, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.9534200587494754, |
|
"grad_norm": 0.4949737787246704, |
|
"learning_rate": 3.5874704491725767e-07, |
|
"loss": 2.188, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.9567771716323962, |
|
"grad_norm": 0.4845784902572632, |
|
"learning_rate": 3.5815602836879435e-07, |
|
"loss": 2.08, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9601342845153168, |
|
"grad_norm": 0.5556589365005493, |
|
"learning_rate": 3.575650118203309e-07, |
|
"loss": 2.1766, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.9634913973982375, |
|
"grad_norm": 0.5051941871643066, |
|
"learning_rate": 3.569739952718676e-07, |
|
"loss": 2.1159, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.9668485102811583, |
|
"grad_norm": 0.5166348814964294, |
|
"learning_rate": 3.563829787234042e-07, |
|
"loss": 2.2121, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.9702056231640789, |
|
"grad_norm": 0.5659390091896057, |
|
"learning_rate": 3.557919621749409e-07, |
|
"loss": 2.1162, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.9735627360469996, |
|
"grad_norm": 0.5001223683357239, |
|
"learning_rate": 3.552009456264775e-07, |
|
"loss": 2.1424, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9769198489299202, |
|
"grad_norm": 0.4793240427970886, |
|
"learning_rate": 3.546099290780142e-07, |
|
"loss": 2.136, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.980276961812841, |
|
"grad_norm": 0.5031545162200928, |
|
"learning_rate": 3.5401891252955077e-07, |
|
"loss": 2.1264, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.9836340746957617, |
|
"grad_norm": 0.526989221572876, |
|
"learning_rate": 3.5342789598108745e-07, |
|
"loss": 2.2329, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.9869911875786823, |
|
"grad_norm": 0.5093796253204346, |
|
"learning_rate": 3.5283687943262407e-07, |
|
"loss": 2.1477, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.990348300461603, |
|
"grad_norm": 0.5002118945121765, |
|
"learning_rate": 3.5224586288416075e-07, |
|
"loss": 2.195, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.9937054133445237, |
|
"grad_norm": 0.5272600650787354, |
|
"learning_rate": 3.5165484633569737e-07, |
|
"loss": 2.1664, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.9970625262274444, |
|
"grad_norm": 0.48927053809165955, |
|
"learning_rate": 3.5106382978723405e-07, |
|
"loss": 2.1497, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.9970625262274444, |
|
"eval_loss": 2.169156074523926, |
|
"eval_runtime": 360.7188, |
|
"eval_samples_per_second": 1.004, |
|
"eval_steps_per_second": 0.252, |
|
"step": 297 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 891, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.794451043460055e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |