{ "best_metric": 2.169156074523926, "best_model_checkpoint": "/home/sunggeunan/data/ICL/outputs/lora/SKIML-ICL_mrqa_nq_v3/Meta-Llama-3-8B-Instruct-unanswerable-1Q-0U-0C-qa_first/checkpoint-297", "epoch": 0.9970625262274444, "eval_steps": 500, "global_step": 297, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003357112882920688, "grad_norm": 0.39633309841156006, "learning_rate": 1.111111111111111e-08, "loss": 2.1607, "step": 1 }, { "epoch": 0.006714225765841376, "grad_norm": 0.4136360287666321, "learning_rate": 2.222222222222222e-08, "loss": 2.2063, "step": 2 }, { "epoch": 0.010071338648762064, "grad_norm": 0.40252378582954407, "learning_rate": 3.3333333333333334e-08, "loss": 2.1702, "step": 3 }, { "epoch": 0.013428451531682753, "grad_norm": 0.3657248914241791, "learning_rate": 4.444444444444444e-08, "loss": 2.1677, "step": 4 }, { "epoch": 0.01678556441460344, "grad_norm": 0.38506612181663513, "learning_rate": 5.555555555555555e-08, "loss": 2.203, "step": 5 }, { "epoch": 0.020142677297524128, "grad_norm": 0.39267775416374207, "learning_rate": 6.666666666666667e-08, "loss": 2.1989, "step": 6 }, { "epoch": 0.02349979018044482, "grad_norm": 0.41893133521080017, "learning_rate": 7.777777777777778e-08, "loss": 2.1882, "step": 7 }, { "epoch": 0.026856903063365505, "grad_norm": 0.363130122423172, "learning_rate": 8.888888888888888e-08, "loss": 2.1636, "step": 8 }, { "epoch": 0.030214015946286196, "grad_norm": 0.43022215366363525, "learning_rate": 1e-07, "loss": 2.1881, "step": 9 }, { "epoch": 0.03357112882920688, "grad_norm": 0.43208909034729004, "learning_rate": 1.111111111111111e-07, "loss": 2.1748, "step": 10 }, { "epoch": 0.03692824171212757, "grad_norm": 0.4211503267288208, "learning_rate": 1.2222222222222222e-07, "loss": 2.2072, "step": 11 }, { "epoch": 0.040285354595048256, "grad_norm": 0.43464261293411255, "learning_rate": 1.3333333333333334e-07, "loss": 2.1711, "step": 12 }, { "epoch": 0.043642467477968946, "grad_norm": 0.38066577911376953, "learning_rate": 1.4444444444444442e-07, "loss": 2.1946, "step": 13 }, { "epoch": 0.04699958036088964, "grad_norm": 0.3847394585609436, "learning_rate": 1.5555555555555556e-07, "loss": 2.1638, "step": 14 }, { "epoch": 0.05035669324381032, "grad_norm": 0.40741828083992004, "learning_rate": 1.6666666666666665e-07, "loss": 2.2389, "step": 15 }, { "epoch": 0.05371380612673101, "grad_norm": 0.37301868200302124, "learning_rate": 1.7777777777777776e-07, "loss": 2.1762, "step": 16 }, { "epoch": 0.0570709190096517, "grad_norm": 0.4193646013736725, "learning_rate": 1.8888888888888888e-07, "loss": 2.2245, "step": 17 }, { "epoch": 0.06042803189257239, "grad_norm": 0.4078114330768585, "learning_rate": 2e-07, "loss": 2.1756, "step": 18 }, { "epoch": 0.06378514477549307, "grad_norm": 0.40552276372909546, "learning_rate": 2.111111111111111e-07, "loss": 2.1302, "step": 19 }, { "epoch": 0.06714225765841376, "grad_norm": 0.40120214223861694, "learning_rate": 2.222222222222222e-07, "loss": 2.1546, "step": 20 }, { "epoch": 0.07049937054133446, "grad_norm": 0.3937098979949951, "learning_rate": 2.3333333333333333e-07, "loss": 2.1822, "step": 21 }, { "epoch": 0.07385648342425515, "grad_norm": 0.39223670959472656, "learning_rate": 2.4444444444444445e-07, "loss": 2.1548, "step": 22 }, { "epoch": 0.07721359630717582, "grad_norm": 0.395595520734787, "learning_rate": 2.5555555555555553e-07, "loss": 2.1538, "step": 23 }, { "epoch": 0.08057070919009651, "grad_norm": 0.38706085085868835, "learning_rate": 2.6666666666666667e-07, "loss": 2.162, "step": 24 }, { "epoch": 0.0839278220730172, "grad_norm": 0.40628549456596375, "learning_rate": 2.7777777777777776e-07, "loss": 2.1493, "step": 25 }, { "epoch": 0.08728493495593789, "grad_norm": 0.3962867259979248, "learning_rate": 2.8888888888888885e-07, "loss": 2.1609, "step": 26 }, { "epoch": 0.09064204783885858, "grad_norm": 0.36925041675567627, "learning_rate": 3e-07, "loss": 2.096, "step": 27 }, { "epoch": 0.09399916072177927, "grad_norm": 0.38802072405815125, "learning_rate": 3.111111111111111e-07, "loss": 2.2456, "step": 28 }, { "epoch": 0.09735627360469996, "grad_norm": 0.38850194215774536, "learning_rate": 3.222222222222222e-07, "loss": 2.1663, "step": 29 }, { "epoch": 0.10071338648762064, "grad_norm": 0.38965868949890137, "learning_rate": 3.333333333333333e-07, "loss": 2.2527, "step": 30 }, { "epoch": 0.10407049937054133, "grad_norm": 1.802207112312317, "learning_rate": 3.4444444444444444e-07, "loss": 2.1718, "step": 31 }, { "epoch": 0.10742761225346202, "grad_norm": 0.41264647245407104, "learning_rate": 3.5555555555555553e-07, "loss": 2.216, "step": 32 }, { "epoch": 0.11078472513638271, "grad_norm": 0.38629451394081116, "learning_rate": 3.666666666666666e-07, "loss": 2.1767, "step": 33 }, { "epoch": 0.1141418380193034, "grad_norm": 0.38191673159599304, "learning_rate": 3.7777777777777775e-07, "loss": 2.1206, "step": 34 }, { "epoch": 0.11749895090222409, "grad_norm": 0.3905788064002991, "learning_rate": 3.888888888888889e-07, "loss": 2.1053, "step": 35 }, { "epoch": 0.12085606378514478, "grad_norm": 0.4043135941028595, "learning_rate": 4e-07, "loss": 2.1955, "step": 36 }, { "epoch": 0.12421317666806546, "grad_norm": 0.446430504322052, "learning_rate": 4.1111111111111107e-07, "loss": 2.2385, "step": 37 }, { "epoch": 0.12757028955098615, "grad_norm": 0.38461461663246155, "learning_rate": 4.222222222222222e-07, "loss": 2.1255, "step": 38 }, { "epoch": 0.13092740243390685, "grad_norm": 0.4022009074687958, "learning_rate": 4.3333333333333335e-07, "loss": 2.1821, "step": 39 }, { "epoch": 0.13428451531682753, "grad_norm": 0.40789297223091125, "learning_rate": 4.444444444444444e-07, "loss": 2.1387, "step": 40 }, { "epoch": 0.1376416281997482, "grad_norm": 0.4071018099784851, "learning_rate": 4.555555555555555e-07, "loss": 2.1077, "step": 41 }, { "epoch": 0.1409987410826689, "grad_norm": 0.42578282952308655, "learning_rate": 4.6666666666666666e-07, "loss": 2.1956, "step": 42 }, { "epoch": 0.1443558539655896, "grad_norm": 0.4121275842189789, "learning_rate": 4.777777777777778e-07, "loss": 2.1491, "step": 43 }, { "epoch": 0.1477129668485103, "grad_norm": 0.3832322657108307, "learning_rate": 4.888888888888889e-07, "loss": 2.1465, "step": 44 }, { "epoch": 0.15107007973143097, "grad_norm": 0.4325246214866638, "learning_rate": 5e-07, "loss": 2.2452, "step": 45 }, { "epoch": 0.15442719261435164, "grad_norm": 0.38803404569625854, "learning_rate": 4.994089834515367e-07, "loss": 2.1442, "step": 46 }, { "epoch": 0.15778430549727235, "grad_norm": 0.38622474670410156, "learning_rate": 4.988179669030732e-07, "loss": 2.1404, "step": 47 }, { "epoch": 0.16114141838019302, "grad_norm": 0.365347683429718, "learning_rate": 4.982269503546099e-07, "loss": 2.1322, "step": 48 }, { "epoch": 0.16449853126311373, "grad_norm": 0.3673339784145355, "learning_rate": 4.976359338061466e-07, "loss": 2.1242, "step": 49 }, { "epoch": 0.1678556441460344, "grad_norm": 0.3915681838989258, "learning_rate": 4.970449172576833e-07, "loss": 2.188, "step": 50 }, { "epoch": 0.1712127570289551, "grad_norm": 0.4330926239490509, "learning_rate": 4.964539007092198e-07, "loss": 2.1702, "step": 51 }, { "epoch": 0.17456986991187579, "grad_norm": 0.40760231018066406, "learning_rate": 4.958628841607565e-07, "loss": 2.218, "step": 52 }, { "epoch": 0.17792698279479646, "grad_norm": 0.432960569858551, "learning_rate": 4.952718676122931e-07, "loss": 2.1804, "step": 53 }, { "epoch": 0.18128409567771717, "grad_norm": 0.38337603211402893, "learning_rate": 4.946808510638298e-07, "loss": 2.1611, "step": 54 }, { "epoch": 0.18464120856063784, "grad_norm": 0.4071826636791229, "learning_rate": 4.940898345153664e-07, "loss": 2.1569, "step": 55 }, { "epoch": 0.18799832144355855, "grad_norm": 0.416966050863266, "learning_rate": 4.934988179669031e-07, "loss": 2.1722, "step": 56 }, { "epoch": 0.19135543432647922, "grad_norm": 0.42446526885032654, "learning_rate": 4.929078014184397e-07, "loss": 2.1529, "step": 57 }, { "epoch": 0.19471254720939993, "grad_norm": 0.41747376322746277, "learning_rate": 4.923167848699764e-07, "loss": 2.1191, "step": 58 }, { "epoch": 0.1980696600923206, "grad_norm": 0.44791901111602783, "learning_rate": 4.917257683215129e-07, "loss": 2.1183, "step": 59 }, { "epoch": 0.20142677297524128, "grad_norm": 0.39679446816444397, "learning_rate": 4.911347517730496e-07, "loss": 2.161, "step": 60 }, { "epoch": 0.20478388585816198, "grad_norm": 0.38211897015571594, "learning_rate": 4.905437352245863e-07, "loss": 2.1334, "step": 61 }, { "epoch": 0.20814099874108266, "grad_norm": 0.4393980801105499, "learning_rate": 4.89952718676123e-07, "loss": 2.2287, "step": 62 }, { "epoch": 0.21149811162400337, "grad_norm": 0.40504157543182373, "learning_rate": 4.893617021276595e-07, "loss": 2.1986, "step": 63 }, { "epoch": 0.21485522450692404, "grad_norm": 0.40123313665390015, "learning_rate": 4.887706855791962e-07, "loss": 2.2045, "step": 64 }, { "epoch": 0.21821233738984475, "grad_norm": 0.4357161819934845, "learning_rate": 4.881796690307328e-07, "loss": 2.235, "step": 65 }, { "epoch": 0.22156945027276542, "grad_norm": 0.39656224846839905, "learning_rate": 4.875886524822695e-07, "loss": 2.1712, "step": 66 }, { "epoch": 0.2249265631556861, "grad_norm": 0.41355738043785095, "learning_rate": 4.869976359338061e-07, "loss": 2.1957, "step": 67 }, { "epoch": 0.2282836760386068, "grad_norm": 0.4384121298789978, "learning_rate": 4.864066193853428e-07, "loss": 2.1832, "step": 68 }, { "epoch": 0.23164078892152748, "grad_norm": 0.4240085184574127, "learning_rate": 4.858156028368794e-07, "loss": 2.1331, "step": 69 }, { "epoch": 0.23499790180444818, "grad_norm": 0.38766977190971375, "learning_rate": 4.852245862884161e-07, "loss": 2.1492, "step": 70 }, { "epoch": 0.23835501468736886, "grad_norm": 0.4235953390598297, "learning_rate": 4.846335697399526e-07, "loss": 2.2232, "step": 71 }, { "epoch": 0.24171212757028956, "grad_norm": 0.41447708010673523, "learning_rate": 4.840425531914893e-07, "loss": 2.1978, "step": 72 }, { "epoch": 0.24506924045321024, "grad_norm": 0.4142104685306549, "learning_rate": 4.83451536643026e-07, "loss": 2.1532, "step": 73 }, { "epoch": 0.24842635333613092, "grad_norm": 0.4122621417045593, "learning_rate": 4.828605200945627e-07, "loss": 2.2096, "step": 74 }, { "epoch": 0.2517834662190516, "grad_norm": 0.4637957513332367, "learning_rate": 4.822695035460992e-07, "loss": 2.2043, "step": 75 }, { "epoch": 0.2551405791019723, "grad_norm": 0.4476231038570404, "learning_rate": 4.816784869976359e-07, "loss": 2.2264, "step": 76 }, { "epoch": 0.258497691984893, "grad_norm": 0.40626445412635803, "learning_rate": 4.810874704491725e-07, "loss": 2.2184, "step": 77 }, { "epoch": 0.2618548048678137, "grad_norm": 0.4468678832054138, "learning_rate": 4.804964539007092e-07, "loss": 2.2189, "step": 78 }, { "epoch": 0.2652119177507344, "grad_norm": 0.42194902896881104, "learning_rate": 4.799054373522458e-07, "loss": 2.1769, "step": 79 }, { "epoch": 0.26856903063365506, "grad_norm": 0.4420163035392761, "learning_rate": 4.793144208037825e-07, "loss": 2.1329, "step": 80 }, { "epoch": 0.27192614351657574, "grad_norm": 0.4482937455177307, "learning_rate": 4.787234042553192e-07, "loss": 2.1954, "step": 81 }, { "epoch": 0.2752832563994964, "grad_norm": 0.41959258913993835, "learning_rate": 4.781323877068558e-07, "loss": 2.1776, "step": 82 }, { "epoch": 0.27864036928241714, "grad_norm": 0.42231133580207825, "learning_rate": 4.775413711583924e-07, "loss": 2.139, "step": 83 }, { "epoch": 0.2819974821653378, "grad_norm": 0.4405987560749054, "learning_rate": 4.76950354609929e-07, "loss": 2.2445, "step": 84 }, { "epoch": 0.2853545950482585, "grad_norm": 0.394240140914917, "learning_rate": 4.7635933806146573e-07, "loss": 2.1289, "step": 85 }, { "epoch": 0.2887117079311792, "grad_norm": 0.44175001978874207, "learning_rate": 4.7576832151300236e-07, "loss": 2.2419, "step": 86 }, { "epoch": 0.29206882081409985, "grad_norm": 0.41716253757476807, "learning_rate": 4.75177304964539e-07, "loss": 2.2557, "step": 87 }, { "epoch": 0.2954259336970206, "grad_norm": 0.41680270433425903, "learning_rate": 4.745862884160756e-07, "loss": 2.1866, "step": 88 }, { "epoch": 0.29878304657994126, "grad_norm": 0.4188416600227356, "learning_rate": 4.739952718676123e-07, "loss": 2.1909, "step": 89 }, { "epoch": 0.30214015946286193, "grad_norm": 0.41669386625289917, "learning_rate": 4.734042553191489e-07, "loss": 2.1913, "step": 90 }, { "epoch": 0.3054972723457826, "grad_norm": 0.4323998689651489, "learning_rate": 4.728132387706856e-07, "loss": 2.1811, "step": 91 }, { "epoch": 0.3088543852287033, "grad_norm": 0.431393027305603, "learning_rate": 4.722222222222222e-07, "loss": 2.1772, "step": 92 }, { "epoch": 0.312211498111624, "grad_norm": 0.4159488081932068, "learning_rate": 4.716312056737589e-07, "loss": 2.1459, "step": 93 }, { "epoch": 0.3155686109945447, "grad_norm": 0.4011417329311371, "learning_rate": 4.7104018912529545e-07, "loss": 2.1657, "step": 94 }, { "epoch": 0.3189257238774654, "grad_norm": 0.41295671463012695, "learning_rate": 4.7044917257683213e-07, "loss": 2.1217, "step": 95 }, { "epoch": 0.32228283676038605, "grad_norm": 0.4088380038738251, "learning_rate": 4.6985815602836876e-07, "loss": 2.152, "step": 96 }, { "epoch": 0.3256399496433068, "grad_norm": 0.43500083684921265, "learning_rate": 4.6926713947990543e-07, "loss": 2.2021, "step": 97 }, { "epoch": 0.32899706252622746, "grad_norm": 0.4200705587863922, "learning_rate": 4.6867612293144206e-07, "loss": 2.1357, "step": 98 }, { "epoch": 0.33235417540914813, "grad_norm": 0.4516183137893677, "learning_rate": 4.6808510638297873e-07, "loss": 2.2323, "step": 99 }, { "epoch": 0.3357112882920688, "grad_norm": 0.49128514528274536, "learning_rate": 4.674940898345153e-07, "loss": 2.2364, "step": 100 }, { "epoch": 0.3390684011749895, "grad_norm": 0.4172728657722473, "learning_rate": 4.66903073286052e-07, "loss": 2.2085, "step": 101 }, { "epoch": 0.3424255140579102, "grad_norm": 0.4487544000148773, "learning_rate": 4.663120567375886e-07, "loss": 2.1661, "step": 102 }, { "epoch": 0.3457826269408309, "grad_norm": 0.4443681538105011, "learning_rate": 4.657210401891253e-07, "loss": 2.2109, "step": 103 }, { "epoch": 0.34913973982375157, "grad_norm": 0.4674079418182373, "learning_rate": 4.651300236406619e-07, "loss": 2.1596, "step": 104 }, { "epoch": 0.35249685270667225, "grad_norm": 0.43013623356819153, "learning_rate": 4.645390070921986e-07, "loss": 2.1658, "step": 105 }, { "epoch": 0.3558539655895929, "grad_norm": 0.43104687333106995, "learning_rate": 4.6394799054373515e-07, "loss": 2.1686, "step": 106 }, { "epoch": 0.35921107847251366, "grad_norm": 0.4218711853027344, "learning_rate": 4.6335697399527183e-07, "loss": 2.141, "step": 107 }, { "epoch": 0.36256819135543433, "grad_norm": 0.45031747221946716, "learning_rate": 4.6276595744680846e-07, "loss": 2.1561, "step": 108 }, { "epoch": 0.365925304238355, "grad_norm": 0.48128026723861694, "learning_rate": 4.6217494089834513e-07, "loss": 2.214, "step": 109 }, { "epoch": 0.3692824171212757, "grad_norm": 0.44868627190589905, "learning_rate": 4.6158392434988176e-07, "loss": 2.1488, "step": 110 }, { "epoch": 0.3726395300041964, "grad_norm": 0.44237226247787476, "learning_rate": 4.6099290780141843e-07, "loss": 2.1099, "step": 111 }, { "epoch": 0.3759966428871171, "grad_norm": 0.42734286189079285, "learning_rate": 4.604018912529551e-07, "loss": 2.2332, "step": 112 }, { "epoch": 0.37935375577003777, "grad_norm": 0.45235806703567505, "learning_rate": 4.598108747044917e-07, "loss": 2.1925, "step": 113 }, { "epoch": 0.38271086865295845, "grad_norm": 0.4485257863998413, "learning_rate": 4.5921985815602836e-07, "loss": 2.1786, "step": 114 }, { "epoch": 0.3860679815358791, "grad_norm": 0.45567062497138977, "learning_rate": 4.58628841607565e-07, "loss": 2.1386, "step": 115 }, { "epoch": 0.38942509441879986, "grad_norm": 0.45261716842651367, "learning_rate": 4.5803782505910166e-07, "loss": 2.136, "step": 116 }, { "epoch": 0.39278220730172053, "grad_norm": 0.4375866949558258, "learning_rate": 4.574468085106383e-07, "loss": 2.1422, "step": 117 }, { "epoch": 0.3961393201846412, "grad_norm": 0.46383175253868103, "learning_rate": 4.5685579196217496e-07, "loss": 2.1732, "step": 118 }, { "epoch": 0.3994964330675619, "grad_norm": 0.4010314345359802, "learning_rate": 4.5626477541371153e-07, "loss": 2.1329, "step": 119 }, { "epoch": 0.40285354595048256, "grad_norm": 0.4446873068809509, "learning_rate": 4.556737588652482e-07, "loss": 2.1791, "step": 120 }, { "epoch": 0.4062106588334033, "grad_norm": 0.47618600726127625, "learning_rate": 4.5508274231678483e-07, "loss": 2.236, "step": 121 }, { "epoch": 0.40956777171632397, "grad_norm": 0.4493118226528168, "learning_rate": 4.544917257683215e-07, "loss": 2.1575, "step": 122 }, { "epoch": 0.41292488459924465, "grad_norm": 0.4111258387565613, "learning_rate": 4.5390070921985813e-07, "loss": 2.24, "step": 123 }, { "epoch": 0.4162819974821653, "grad_norm": 0.41655582189559937, "learning_rate": 4.533096926713948e-07, "loss": 2.0788, "step": 124 }, { "epoch": 0.419639110365086, "grad_norm": 0.47266441583633423, "learning_rate": 4.5271867612293143e-07, "loss": 2.1774, "step": 125 }, { "epoch": 0.42299622324800673, "grad_norm": 0.464999794960022, "learning_rate": 4.5212765957446806e-07, "loss": 2.143, "step": 126 }, { "epoch": 0.4263533361309274, "grad_norm": 0.44828522205352783, "learning_rate": 4.515366430260047e-07, "loss": 2.1363, "step": 127 }, { "epoch": 0.4297104490138481, "grad_norm": 0.4714733362197876, "learning_rate": 4.5094562647754136e-07, "loss": 2.225, "step": 128 }, { "epoch": 0.43306756189676876, "grad_norm": 0.42666733264923096, "learning_rate": 4.50354609929078e-07, "loss": 2.1774, "step": 129 }, { "epoch": 0.4364246747796895, "grad_norm": 0.46839290857315063, "learning_rate": 4.4976359338061466e-07, "loss": 2.1427, "step": 130 }, { "epoch": 0.43978178766261017, "grad_norm": 0.48040419816970825, "learning_rate": 4.491725768321513e-07, "loss": 2.1909, "step": 131 }, { "epoch": 0.44313890054553084, "grad_norm": 0.4932810962200165, "learning_rate": 4.485815602836879e-07, "loss": 2.1226, "step": 132 }, { "epoch": 0.4464960134284515, "grad_norm": 0.4730973541736603, "learning_rate": 4.4799054373522453e-07, "loss": 2.1844, "step": 133 }, { "epoch": 0.4498531263113722, "grad_norm": 0.44282010197639465, "learning_rate": 4.473995271867612e-07, "loss": 2.1503, "step": 134 }, { "epoch": 0.45321023919429293, "grad_norm": 0.4495702087879181, "learning_rate": 4.4680851063829783e-07, "loss": 2.1599, "step": 135 }, { "epoch": 0.4565673520772136, "grad_norm": 0.44728878140449524, "learning_rate": 4.462174940898345e-07, "loss": 2.1479, "step": 136 }, { "epoch": 0.4599244649601343, "grad_norm": 0.4495660960674286, "learning_rate": 4.4562647754137114e-07, "loss": 2.1272, "step": 137 }, { "epoch": 0.46328157784305496, "grad_norm": 0.4553879499435425, "learning_rate": 4.4503546099290776e-07, "loss": 2.2729, "step": 138 }, { "epoch": 0.46663869072597564, "grad_norm": 0.46510016918182373, "learning_rate": 4.444444444444444e-07, "loss": 2.2367, "step": 139 }, { "epoch": 0.46999580360889637, "grad_norm": 0.4671325981616974, "learning_rate": 4.4385342789598106e-07, "loss": 2.1167, "step": 140 }, { "epoch": 0.47335291649181704, "grad_norm": 0.4627954661846161, "learning_rate": 4.432624113475177e-07, "loss": 2.2521, "step": 141 }, { "epoch": 0.4767100293747377, "grad_norm": 0.4297815263271332, "learning_rate": 4.4267139479905436e-07, "loss": 2.1935, "step": 142 }, { "epoch": 0.4800671422576584, "grad_norm": 0.4634767770767212, "learning_rate": 4.4208037825059104e-07, "loss": 2.1128, "step": 143 }, { "epoch": 0.48342425514057913, "grad_norm": 0.4689215421676636, "learning_rate": 4.4148936170212766e-07, "loss": 2.2286, "step": 144 }, { "epoch": 0.4867813680234998, "grad_norm": 0.4813438355922699, "learning_rate": 4.408983451536643e-07, "loss": 2.1433, "step": 145 }, { "epoch": 0.4901384809064205, "grad_norm": 0.45745640993118286, "learning_rate": 4.403073286052009e-07, "loss": 2.1949, "step": 146 }, { "epoch": 0.49349559378934116, "grad_norm": 0.4202418625354767, "learning_rate": 4.397163120567376e-07, "loss": 2.1028, "step": 147 }, { "epoch": 0.49685270667226183, "grad_norm": 0.42282456159591675, "learning_rate": 4.391252955082742e-07, "loss": 2.1114, "step": 148 }, { "epoch": 0.5002098195551825, "grad_norm": 0.4623030424118042, "learning_rate": 4.385342789598109e-07, "loss": 2.1618, "step": 149 }, { "epoch": 0.5035669324381032, "grad_norm": 0.4584071934223175, "learning_rate": 4.379432624113475e-07, "loss": 2.2274, "step": 150 }, { "epoch": 0.5069240453210239, "grad_norm": 0.43828415870666504, "learning_rate": 4.3735224586288414e-07, "loss": 2.1807, "step": 151 }, { "epoch": 0.5102811582039446, "grad_norm": 0.4550941288471222, "learning_rate": 4.3676122931442076e-07, "loss": 2.1355, "step": 152 }, { "epoch": 0.5136382710868653, "grad_norm": 0.4852266013622284, "learning_rate": 4.3617021276595744e-07, "loss": 2.1623, "step": 153 }, { "epoch": 0.516995383969786, "grad_norm": 0.450320303440094, "learning_rate": 4.3557919621749406e-07, "loss": 2.1963, "step": 154 }, { "epoch": 0.5203524968527067, "grad_norm": 0.4544139504432678, "learning_rate": 4.3498817966903074e-07, "loss": 2.1413, "step": 155 }, { "epoch": 0.5237096097356274, "grad_norm": 0.4609904885292053, "learning_rate": 4.3439716312056736e-07, "loss": 2.2289, "step": 156 }, { "epoch": 0.527066722618548, "grad_norm": 0.46614569425582886, "learning_rate": 4.3380614657210404e-07, "loss": 2.1289, "step": 157 }, { "epoch": 0.5304238355014688, "grad_norm": 0.4586597681045532, "learning_rate": 4.332151300236406e-07, "loss": 2.1033, "step": 158 }, { "epoch": 0.5337809483843894, "grad_norm": 0.4757809340953827, "learning_rate": 4.326241134751773e-07, "loss": 2.1708, "step": 159 }, { "epoch": 0.5371380612673101, "grad_norm": 0.45364031195640564, "learning_rate": 4.320330969267139e-07, "loss": 2.1473, "step": 160 }, { "epoch": 0.5404951741502309, "grad_norm": 0.45321136713027954, "learning_rate": 4.314420803782506e-07, "loss": 2.177, "step": 161 }, { "epoch": 0.5438522870331515, "grad_norm": 0.43466734886169434, "learning_rate": 4.308510638297872e-07, "loss": 2.1304, "step": 162 }, { "epoch": 0.5472093999160722, "grad_norm": 0.4303533732891083, "learning_rate": 4.302600472813239e-07, "loss": 2.0758, "step": 163 }, { "epoch": 0.5505665127989928, "grad_norm": 0.47530239820480347, "learning_rate": 4.2966903073286046e-07, "loss": 2.2194, "step": 164 }, { "epoch": 0.5539236256819136, "grad_norm": 0.4379255175590515, "learning_rate": 4.2907801418439714e-07, "loss": 2.1497, "step": 165 }, { "epoch": 0.5572807385648343, "grad_norm": 0.4771229922771454, "learning_rate": 4.2848699763593376e-07, "loss": 2.1491, "step": 166 }, { "epoch": 0.5606378514477549, "grad_norm": 0.4536450505256653, "learning_rate": 4.2789598108747044e-07, "loss": 2.2061, "step": 167 }, { "epoch": 0.5639949643306756, "grad_norm": 0.46324947476387024, "learning_rate": 4.2730496453900706e-07, "loss": 2.1859, "step": 168 }, { "epoch": 0.5673520772135963, "grad_norm": 0.4493923485279083, "learning_rate": 4.2671394799054374e-07, "loss": 2.0956, "step": 169 }, { "epoch": 0.570709190096517, "grad_norm": 0.4963778853416443, "learning_rate": 4.261229314420803e-07, "loss": 2.1433, "step": 170 }, { "epoch": 0.5740663029794377, "grad_norm": 0.5063489675521851, "learning_rate": 4.25531914893617e-07, "loss": 2.1887, "step": 171 }, { "epoch": 0.5774234158623583, "grad_norm": 0.4580891728401184, "learning_rate": 4.249408983451536e-07, "loss": 2.1164, "step": 172 }, { "epoch": 0.5807805287452791, "grad_norm": 0.4890580177307129, "learning_rate": 4.243498817966903e-07, "loss": 2.1647, "step": 173 }, { "epoch": 0.5841376416281997, "grad_norm": 0.45317739248275757, "learning_rate": 4.237588652482269e-07, "loss": 2.1837, "step": 174 }, { "epoch": 0.5874947545111204, "grad_norm": 0.4900612533092499, "learning_rate": 4.231678486997636e-07, "loss": 2.1558, "step": 175 }, { "epoch": 0.5908518673940412, "grad_norm": 0.47292637825012207, "learning_rate": 4.2257683215130027e-07, "loss": 2.163, "step": 176 }, { "epoch": 0.5942089802769618, "grad_norm": 0.4768417477607727, "learning_rate": 4.2198581560283684e-07, "loss": 2.1963, "step": 177 }, { "epoch": 0.5975660931598825, "grad_norm": 0.4955364465713501, "learning_rate": 4.213947990543735e-07, "loss": 2.1402, "step": 178 }, { "epoch": 0.6009232060428031, "grad_norm": 0.46482154726982117, "learning_rate": 4.2080378250591014e-07, "loss": 2.238, "step": 179 }, { "epoch": 0.6042803189257239, "grad_norm": 0.47761717438697815, "learning_rate": 4.202127659574468e-07, "loss": 2.1089, "step": 180 }, { "epoch": 0.6076374318086446, "grad_norm": 0.48028987646102905, "learning_rate": 4.1962174940898344e-07, "loss": 2.1946, "step": 181 }, { "epoch": 0.6109945446915652, "grad_norm": 0.4602825939655304, "learning_rate": 4.190307328605201e-07, "loss": 2.1304, "step": 182 }, { "epoch": 0.614351657574486, "grad_norm": 0.4691866338253021, "learning_rate": 4.184397163120567e-07, "loss": 2.1363, "step": 183 }, { "epoch": 0.6177087704574066, "grad_norm": 0.46271318197250366, "learning_rate": 4.1784869976359336e-07, "loss": 2.1805, "step": 184 }, { "epoch": 0.6210658833403273, "grad_norm": 0.48010194301605225, "learning_rate": 4.1725768321513e-07, "loss": 2.178, "step": 185 }, { "epoch": 0.624422996223248, "grad_norm": 0.45885005593299866, "learning_rate": 4.1666666666666667e-07, "loss": 2.1575, "step": 186 }, { "epoch": 0.6277801091061687, "grad_norm": 0.45524775981903076, "learning_rate": 4.160756501182033e-07, "loss": 2.1303, "step": 187 }, { "epoch": 0.6311372219890894, "grad_norm": 0.4570733606815338, "learning_rate": 4.1548463356973997e-07, "loss": 2.1628, "step": 188 }, { "epoch": 0.6344943348720101, "grad_norm": 0.489170640707016, "learning_rate": 4.148936170212766e-07, "loss": 2.1663, "step": 189 }, { "epoch": 0.6378514477549307, "grad_norm": 0.47888293862342834, "learning_rate": 4.143026004728132e-07, "loss": 2.1347, "step": 190 }, { "epoch": 0.6412085606378515, "grad_norm": 0.4729193449020386, "learning_rate": 4.1371158392434984e-07, "loss": 2.1394, "step": 191 }, { "epoch": 0.6445656735207721, "grad_norm": 0.5049130320549011, "learning_rate": 4.131205673758865e-07, "loss": 2.2181, "step": 192 }, { "epoch": 0.6479227864036928, "grad_norm": 0.44132182002067566, "learning_rate": 4.1252955082742314e-07, "loss": 2.1427, "step": 193 }, { "epoch": 0.6512798992866136, "grad_norm": 0.49706417322158813, "learning_rate": 4.119385342789598e-07, "loss": 2.1993, "step": 194 }, { "epoch": 0.6546370121695342, "grad_norm": 0.46416929364204407, "learning_rate": 4.1134751773049644e-07, "loss": 2.1108, "step": 195 }, { "epoch": 0.6579941250524549, "grad_norm": 0.4778405427932739, "learning_rate": 4.1075650118203306e-07, "loss": 2.1694, "step": 196 }, { "epoch": 0.6613512379353755, "grad_norm": 0.46708041429519653, "learning_rate": 4.101654846335697e-07, "loss": 2.184, "step": 197 }, { "epoch": 0.6647083508182963, "grad_norm": 0.48584261536598206, "learning_rate": 4.0957446808510637e-07, "loss": 2.1222, "step": 198 }, { "epoch": 0.668065463701217, "grad_norm": 0.5111873745918274, "learning_rate": 4.08983451536643e-07, "loss": 2.1711, "step": 199 }, { "epoch": 0.6714225765841376, "grad_norm": 0.4958716630935669, "learning_rate": 4.0839243498817967e-07, "loss": 2.1523, "step": 200 }, { "epoch": 0.6747796894670584, "grad_norm": 0.48708048462867737, "learning_rate": 4.078014184397163e-07, "loss": 2.1921, "step": 201 }, { "epoch": 0.678136802349979, "grad_norm": 0.47986796498298645, "learning_rate": 4.0721040189125297e-07, "loss": 2.1619, "step": 202 }, { "epoch": 0.6814939152328997, "grad_norm": 0.487250417470932, "learning_rate": 4.0661938534278954e-07, "loss": 2.2103, "step": 203 }, { "epoch": 0.6848510281158204, "grad_norm": 0.5118921995162964, "learning_rate": 4.060283687943262e-07, "loss": 2.2549, "step": 204 }, { "epoch": 0.6882081409987411, "grad_norm": 0.5187731981277466, "learning_rate": 4.0543735224586284e-07, "loss": 2.2144, "step": 205 }, { "epoch": 0.6915652538816618, "grad_norm": 0.4841180145740509, "learning_rate": 4.048463356973995e-07, "loss": 2.1528, "step": 206 }, { "epoch": 0.6949223667645824, "grad_norm": 0.47858700156211853, "learning_rate": 4.0425531914893614e-07, "loss": 2.1743, "step": 207 }, { "epoch": 0.6982794796475031, "grad_norm": 0.47898271679878235, "learning_rate": 4.036643026004728e-07, "loss": 2.1268, "step": 208 }, { "epoch": 0.7016365925304239, "grad_norm": 0.4743264615535736, "learning_rate": 4.0307328605200944e-07, "loss": 2.1992, "step": 209 }, { "epoch": 0.7049937054133445, "grad_norm": 0.5258775353431702, "learning_rate": 4.0248226950354607e-07, "loss": 2.1288, "step": 210 }, { "epoch": 0.7083508182962652, "grad_norm": 0.4403035044670105, "learning_rate": 4.0189125295508274e-07, "loss": 2.1033, "step": 211 }, { "epoch": 0.7117079311791858, "grad_norm": 0.4601992666721344, "learning_rate": 4.0130023640661937e-07, "loss": 2.2051, "step": 212 }, { "epoch": 0.7150650440621066, "grad_norm": 0.48560434579849243, "learning_rate": 4.0070921985815604e-07, "loss": 2.1469, "step": 213 }, { "epoch": 0.7184221569450273, "grad_norm": 0.4823721945285797, "learning_rate": 4.0011820330969267e-07, "loss": 2.1988, "step": 214 }, { "epoch": 0.7217792698279479, "grad_norm": 0.48195022344589233, "learning_rate": 3.995271867612293e-07, "loss": 2.1211, "step": 215 }, { "epoch": 0.7251363827108687, "grad_norm": 0.5148845314979553, "learning_rate": 3.989361702127659e-07, "loss": 2.1803, "step": 216 }, { "epoch": 0.7284934955937893, "grad_norm": 0.4884459376335144, "learning_rate": 3.983451536643026e-07, "loss": 2.1332, "step": 217 }, { "epoch": 0.73185060847671, "grad_norm": 0.5225220322608948, "learning_rate": 3.977541371158392e-07, "loss": 2.1582, "step": 218 }, { "epoch": 0.7352077213596308, "grad_norm": 0.4897938668727875, "learning_rate": 3.971631205673759e-07, "loss": 2.1322, "step": 219 }, { "epoch": 0.7385648342425514, "grad_norm": 0.502916693687439, "learning_rate": 3.965721040189125e-07, "loss": 2.1518, "step": 220 }, { "epoch": 0.7419219471254721, "grad_norm": 0.4693153202533722, "learning_rate": 3.959810874704492e-07, "loss": 2.1207, "step": 221 }, { "epoch": 0.7452790600083928, "grad_norm": 0.4866141676902771, "learning_rate": 3.9539007092198577e-07, "loss": 2.1424, "step": 222 }, { "epoch": 0.7486361728913135, "grad_norm": 0.48267892003059387, "learning_rate": 3.9479905437352244e-07, "loss": 2.1664, "step": 223 }, { "epoch": 0.7519932857742342, "grad_norm": 0.505587637424469, "learning_rate": 3.9420803782505907e-07, "loss": 2.1723, "step": 224 }, { "epoch": 0.7553503986571548, "grad_norm": 0.47869905829429626, "learning_rate": 3.9361702127659574e-07, "loss": 2.0781, "step": 225 }, { "epoch": 0.7587075115400755, "grad_norm": 0.487474650144577, "learning_rate": 3.9302600472813237e-07, "loss": 2.1889, "step": 226 }, { "epoch": 0.7620646244229963, "grad_norm": 0.5115759968757629, "learning_rate": 3.9243498817966904e-07, "loss": 2.2055, "step": 227 }, { "epoch": 0.7654217373059169, "grad_norm": 0.4802757203578949, "learning_rate": 3.918439716312056e-07, "loss": 2.1542, "step": 228 }, { "epoch": 0.7687788501888376, "grad_norm": 0.48687273263931274, "learning_rate": 3.912529550827423e-07, "loss": 2.2195, "step": 229 }, { "epoch": 0.7721359630717582, "grad_norm": 0.5212287902832031, "learning_rate": 3.906619385342789e-07, "loss": 2.2386, "step": 230 }, { "epoch": 0.775493075954679, "grad_norm": 0.4856519401073456, "learning_rate": 3.900709219858156e-07, "loss": 2.1322, "step": 231 }, { "epoch": 0.7788501888375997, "grad_norm": 0.4821922183036804, "learning_rate": 3.894799054373522e-07, "loss": 2.1594, "step": 232 }, { "epoch": 0.7822073017205203, "grad_norm": 0.46911802887916565, "learning_rate": 3.888888888888889e-07, "loss": 2.1279, "step": 233 }, { "epoch": 0.7855644146034411, "grad_norm": 0.5064778923988342, "learning_rate": 3.8829787234042547e-07, "loss": 2.1294, "step": 234 }, { "epoch": 0.7889215274863617, "grad_norm": 0.5024438500404358, "learning_rate": 3.8770685579196214e-07, "loss": 2.1321, "step": 235 }, { "epoch": 0.7922786403692824, "grad_norm": 0.5185412168502808, "learning_rate": 3.8711583924349877e-07, "loss": 2.13, "step": 236 }, { "epoch": 0.7956357532522031, "grad_norm": 0.5049921274185181, "learning_rate": 3.8652482269503544e-07, "loss": 2.165, "step": 237 }, { "epoch": 0.7989928661351238, "grad_norm": 0.5252367258071899, "learning_rate": 3.8593380614657207e-07, "loss": 2.0951, "step": 238 }, { "epoch": 0.8023499790180445, "grad_norm": 0.5152316093444824, "learning_rate": 3.8534278959810874e-07, "loss": 2.1517, "step": 239 }, { "epoch": 0.8057070919009651, "grad_norm": 0.4972199499607086, "learning_rate": 3.8475177304964537e-07, "loss": 2.1474, "step": 240 }, { "epoch": 0.8090642047838859, "grad_norm": 0.5103582143783569, "learning_rate": 3.84160756501182e-07, "loss": 2.1318, "step": 241 }, { "epoch": 0.8124213176668066, "grad_norm": 0.4988660216331482, "learning_rate": 3.8356973995271867e-07, "loss": 2.1228, "step": 242 }, { "epoch": 0.8157784305497272, "grad_norm": 0.5007835030555725, "learning_rate": 3.829787234042553e-07, "loss": 2.1176, "step": 243 }, { "epoch": 0.8191355434326479, "grad_norm": 0.4536113440990448, "learning_rate": 3.8238770685579197e-07, "loss": 2.1406, "step": 244 }, { "epoch": 0.8224926563155686, "grad_norm": 0.5342024564743042, "learning_rate": 3.817966903073286e-07, "loss": 2.1462, "step": 245 }, { "epoch": 0.8258497691984893, "grad_norm": 0.48217201232910156, "learning_rate": 3.8120567375886527e-07, "loss": 2.1259, "step": 246 }, { "epoch": 0.82920688208141, "grad_norm": 0.5227500200271606, "learning_rate": 3.8061465721040184e-07, "loss": 2.1516, "step": 247 }, { "epoch": 0.8325639949643306, "grad_norm": 0.47303012013435364, "learning_rate": 3.800236406619385e-07, "loss": 2.1713, "step": 248 }, { "epoch": 0.8359211078472514, "grad_norm": 0.512878954410553, "learning_rate": 3.7943262411347514e-07, "loss": 2.1799, "step": 249 }, { "epoch": 0.839278220730172, "grad_norm": 0.5365780591964722, "learning_rate": 3.788416075650118e-07, "loss": 2.1795, "step": 250 }, { "epoch": 0.8426353336130927, "grad_norm": 0.5341731905937195, "learning_rate": 3.7825059101654844e-07, "loss": 2.2331, "step": 251 }, { "epoch": 0.8459924464960135, "grad_norm": 0.4720432758331299, "learning_rate": 3.776595744680851e-07, "loss": 2.155, "step": 252 }, { "epoch": 0.8493495593789341, "grad_norm": 0.5171768665313721, "learning_rate": 3.7706855791962175e-07, "loss": 2.1395, "step": 253 }, { "epoch": 0.8527066722618548, "grad_norm": 0.5279157757759094, "learning_rate": 3.7647754137115837e-07, "loss": 2.1647, "step": 254 }, { "epoch": 0.8560637851447755, "grad_norm": 0.5167645812034607, "learning_rate": 3.75886524822695e-07, "loss": 2.1915, "step": 255 }, { "epoch": 0.8594208980276962, "grad_norm": 0.4854820668697357, "learning_rate": 3.7529550827423167e-07, "loss": 2.1293, "step": 256 }, { "epoch": 0.8627780109106169, "grad_norm": 0.5053945183753967, "learning_rate": 3.747044917257683e-07, "loss": 2.1776, "step": 257 }, { "epoch": 0.8661351237935375, "grad_norm": 0.5340734720230103, "learning_rate": 3.7411347517730497e-07, "loss": 2.2158, "step": 258 }, { "epoch": 0.8694922366764583, "grad_norm": 0.5089324116706848, "learning_rate": 3.735224586288416e-07, "loss": 2.1451, "step": 259 }, { "epoch": 0.872849349559379, "grad_norm": 0.49475711584091187, "learning_rate": 3.729314420803782e-07, "loss": 2.1416, "step": 260 }, { "epoch": 0.8762064624422996, "grad_norm": 0.5191430449485779, "learning_rate": 3.7234042553191484e-07, "loss": 2.1815, "step": 261 }, { "epoch": 0.8795635753252203, "grad_norm": 0.4857535660266876, "learning_rate": 3.717494089834515e-07, "loss": 2.1388, "step": 262 }, { "epoch": 0.882920688208141, "grad_norm": 0.4946460425853729, "learning_rate": 3.7115839243498815e-07, "loss": 2.1562, "step": 263 }, { "epoch": 0.8862778010910617, "grad_norm": 0.4693676233291626, "learning_rate": 3.705673758865248e-07, "loss": 2.1189, "step": 264 }, { "epoch": 0.8896349139739824, "grad_norm": 0.5070816278457642, "learning_rate": 3.6997635933806145e-07, "loss": 2.1278, "step": 265 }, { "epoch": 0.892992026856903, "grad_norm": 0.5286785960197449, "learning_rate": 3.693853427895981e-07, "loss": 2.1917, "step": 266 }, { "epoch": 0.8963491397398238, "grad_norm": 0.48202502727508545, "learning_rate": 3.687943262411347e-07, "loss": 2.1224, "step": 267 }, { "epoch": 0.8997062526227444, "grad_norm": 0.5092111825942993, "learning_rate": 3.6820330969267137e-07, "loss": 2.2141, "step": 268 }, { "epoch": 0.9030633655056651, "grad_norm": 0.5308806300163269, "learning_rate": 3.67612293144208e-07, "loss": 2.151, "step": 269 }, { "epoch": 0.9064204783885859, "grad_norm": 0.5302571058273315, "learning_rate": 3.6702127659574467e-07, "loss": 2.1899, "step": 270 }, { "epoch": 0.9097775912715065, "grad_norm": 0.489431768655777, "learning_rate": 3.664302600472813e-07, "loss": 2.1448, "step": 271 }, { "epoch": 0.9131347041544272, "grad_norm": 0.47753775119781494, "learning_rate": 3.6583924349881797e-07, "loss": 2.1036, "step": 272 }, { "epoch": 0.9164918170373478, "grad_norm": 0.49404028058052063, "learning_rate": 3.652482269503546e-07, "loss": 2.1422, "step": 273 }, { "epoch": 0.9198489299202686, "grad_norm": 0.5034516453742981, "learning_rate": 3.646572104018912e-07, "loss": 2.152, "step": 274 }, { "epoch": 0.9232060428031893, "grad_norm": 0.5550661683082581, "learning_rate": 3.640661938534279e-07, "loss": 2.1861, "step": 275 }, { "epoch": 0.9265631556861099, "grad_norm": 0.4908338487148285, "learning_rate": 3.634751773049645e-07, "loss": 2.1026, "step": 276 }, { "epoch": 0.9299202685690307, "grad_norm": 0.5155569911003113, "learning_rate": 3.628841607565012e-07, "loss": 2.1006, "step": 277 }, { "epoch": 0.9332773814519513, "grad_norm": 0.5384230613708496, "learning_rate": 3.622931442080378e-07, "loss": 2.2128, "step": 278 }, { "epoch": 0.936634494334872, "grad_norm": 0.5264031291007996, "learning_rate": 3.617021276595745e-07, "loss": 2.1531, "step": 279 }, { "epoch": 0.9399916072177927, "grad_norm": 0.5026865601539612, "learning_rate": 3.6111111111111107e-07, "loss": 2.1594, "step": 280 }, { "epoch": 0.9433487201007134, "grad_norm": 0.4906868040561676, "learning_rate": 3.6052009456264775e-07, "loss": 2.1489, "step": 281 }, { "epoch": 0.9467058329836341, "grad_norm": 0.5679292678833008, "learning_rate": 3.5992907801418437e-07, "loss": 2.1501, "step": 282 }, { "epoch": 0.9500629458665547, "grad_norm": 0.49988269805908203, "learning_rate": 3.5933806146572105e-07, "loss": 2.1413, "step": 283 }, { "epoch": 0.9534200587494754, "grad_norm": 0.4949737787246704, "learning_rate": 3.5874704491725767e-07, "loss": 2.188, "step": 284 }, { "epoch": 0.9567771716323962, "grad_norm": 0.4845784902572632, "learning_rate": 3.5815602836879435e-07, "loss": 2.08, "step": 285 }, { "epoch": 0.9601342845153168, "grad_norm": 0.5556589365005493, "learning_rate": 3.575650118203309e-07, "loss": 2.1766, "step": 286 }, { "epoch": 0.9634913973982375, "grad_norm": 0.5051941871643066, "learning_rate": 3.569739952718676e-07, "loss": 2.1159, "step": 287 }, { "epoch": 0.9668485102811583, "grad_norm": 0.5166348814964294, "learning_rate": 3.563829787234042e-07, "loss": 2.2121, "step": 288 }, { "epoch": 0.9702056231640789, "grad_norm": 0.5659390091896057, "learning_rate": 3.557919621749409e-07, "loss": 2.1162, "step": 289 }, { "epoch": 0.9735627360469996, "grad_norm": 0.5001223683357239, "learning_rate": 3.552009456264775e-07, "loss": 2.1424, "step": 290 }, { "epoch": 0.9769198489299202, "grad_norm": 0.4793240427970886, "learning_rate": 3.546099290780142e-07, "loss": 2.136, "step": 291 }, { "epoch": 0.980276961812841, "grad_norm": 0.5031545162200928, "learning_rate": 3.5401891252955077e-07, "loss": 2.1264, "step": 292 }, { "epoch": 0.9836340746957617, "grad_norm": 0.526989221572876, "learning_rate": 3.5342789598108745e-07, "loss": 2.2329, "step": 293 }, { "epoch": 0.9869911875786823, "grad_norm": 0.5093796253204346, "learning_rate": 3.5283687943262407e-07, "loss": 2.1477, "step": 294 }, { "epoch": 0.990348300461603, "grad_norm": 0.5002118945121765, "learning_rate": 3.5224586288416075e-07, "loss": 2.195, "step": 295 }, { "epoch": 0.9937054133445237, "grad_norm": 0.5272600650787354, "learning_rate": 3.5165484633569737e-07, "loss": 2.1664, "step": 296 }, { "epoch": 0.9970625262274444, "grad_norm": 0.48927053809165955, "learning_rate": 3.5106382978723405e-07, "loss": 2.1497, "step": 297 }, { "epoch": 0.9970625262274444, "eval_loss": 2.169156074523926, "eval_runtime": 360.7188, "eval_samples_per_second": 1.004, "eval_steps_per_second": 0.252, "step": 297 } ], "logging_steps": 1, "max_steps": 891, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.794451043460055e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }