{ "best_metric": 2.129138708114624, "best_model_checkpoint": "/home/sunggeunan/data/ICL/outputs/lora/SKIML-ICL_mrqa_nq_v3/Meta-Llama-3-8B-Instruct-unanswerable-2Q-1U-0C-qa_first/checkpoint-402", "epoch": 0.9996891513832763, "eval_steps": 500, "global_step": 402, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024867889337892445, "grad_norm": 0.4153629243373871, "learning_rate": 8.19672131147541e-09, "loss": 2.121, "step": 1 }, { "epoch": 0.004973577867578489, "grad_norm": 0.4008300006389618, "learning_rate": 1.639344262295082e-08, "loss": 2.1542, "step": 2 }, { "epoch": 0.007460366801367734, "grad_norm": 0.42680642008781433, "learning_rate": 2.459016393442623e-08, "loss": 2.1439, "step": 3 }, { "epoch": 0.009947155735156978, "grad_norm": 0.3831591308116913, "learning_rate": 3.278688524590164e-08, "loss": 2.1025, "step": 4 }, { "epoch": 0.012433944668946224, "grad_norm": 0.40637049078941345, "learning_rate": 4.0983606557377046e-08, "loss": 2.1374, "step": 5 }, { "epoch": 0.014920733602735468, "grad_norm": 0.3883218467235565, "learning_rate": 4.918032786885246e-08, "loss": 2.1412, "step": 6 }, { "epoch": 0.017407522536524712, "grad_norm": 0.42491665482521057, "learning_rate": 5.7377049180327866e-08, "loss": 2.205, "step": 7 }, { "epoch": 0.019894311470313956, "grad_norm": 0.40381714701652527, "learning_rate": 6.557377049180328e-08, "loss": 2.1575, "step": 8 }, { "epoch": 0.022381100404103203, "grad_norm": 0.3807780146598816, "learning_rate": 7.377049180327868e-08, "loss": 2.2206, "step": 9 }, { "epoch": 0.024867889337892447, "grad_norm": 0.3880959451198578, "learning_rate": 8.196721311475409e-08, "loss": 2.1772, "step": 10 }, { "epoch": 0.02735467827168169, "grad_norm": 0.36500561237335205, "learning_rate": 9.01639344262295e-08, "loss": 2.1232, "step": 11 }, { "epoch": 0.029841467205470935, "grad_norm": 0.3805394172668457, "learning_rate": 9.836065573770492e-08, "loss": 2.1446, "step": 12 }, { "epoch": 0.03232825613926018, "grad_norm": 0.38014543056488037, "learning_rate": 1.0655737704918032e-07, "loss": 2.1277, "step": 13 }, { "epoch": 0.034815045073049423, "grad_norm": 0.3875851333141327, "learning_rate": 1.1475409836065573e-07, "loss": 2.1064, "step": 14 }, { "epoch": 0.03730183400683867, "grad_norm": 0.39165419340133667, "learning_rate": 1.2295081967213113e-07, "loss": 2.1352, "step": 15 }, { "epoch": 0.03978862294062791, "grad_norm": 0.4067535102367401, "learning_rate": 1.3114754098360656e-07, "loss": 2.1601, "step": 16 }, { "epoch": 0.042275411874417156, "grad_norm": 0.41718506813049316, "learning_rate": 1.3934426229508196e-07, "loss": 2.1471, "step": 17 }, { "epoch": 0.04476220080820641, "grad_norm": 0.4221360981464386, "learning_rate": 1.4754098360655736e-07, "loss": 2.1499, "step": 18 }, { "epoch": 0.04724898974199565, "grad_norm": 0.39923396706581116, "learning_rate": 1.5573770491803278e-07, "loss": 2.1003, "step": 19 }, { "epoch": 0.049735778675784895, "grad_norm": 0.3728751242160797, "learning_rate": 1.6393442622950818e-07, "loss": 2.0842, "step": 20 }, { "epoch": 0.05222256760957414, "grad_norm": 0.3873041868209839, "learning_rate": 1.7213114754098358e-07, "loss": 2.1152, "step": 21 }, { "epoch": 0.05470935654336338, "grad_norm": 0.3714573383331299, "learning_rate": 1.80327868852459e-07, "loss": 2.1215, "step": 22 }, { "epoch": 0.05719614547715263, "grad_norm": 0.4204677939414978, "learning_rate": 1.885245901639344e-07, "loss": 2.1859, "step": 23 }, { "epoch": 0.05968293441094187, "grad_norm": 0.4137566089630127, "learning_rate": 1.9672131147540984e-07, "loss": 2.1317, "step": 24 }, { "epoch": 0.062169723344731115, "grad_norm": 0.3629921078681946, "learning_rate": 2.0491803278688524e-07, "loss": 2.0563, "step": 25 }, { "epoch": 0.06465651227852036, "grad_norm": 0.39072492718696594, "learning_rate": 2.1311475409836064e-07, "loss": 2.164, "step": 26 }, { "epoch": 0.0671433012123096, "grad_norm": 0.37331125140190125, "learning_rate": 2.2131147540983606e-07, "loss": 2.2048, "step": 27 }, { "epoch": 0.06963009014609885, "grad_norm": 0.3944483697414398, "learning_rate": 2.2950819672131146e-07, "loss": 2.1835, "step": 28 }, { "epoch": 0.07211687907988809, "grad_norm": 0.39379164576530457, "learning_rate": 2.3770491803278686e-07, "loss": 2.1465, "step": 29 }, { "epoch": 0.07460366801367734, "grad_norm": 0.3914564549922943, "learning_rate": 2.4590163934426226e-07, "loss": 2.091, "step": 30 }, { "epoch": 0.07709045694746658, "grad_norm": 0.4301564395427704, "learning_rate": 2.540983606557377e-07, "loss": 2.1183, "step": 31 }, { "epoch": 0.07957724588125582, "grad_norm": 0.40827327966690063, "learning_rate": 2.622950819672131e-07, "loss": 2.1588, "step": 32 }, { "epoch": 0.08206403481504507, "grad_norm": 0.3868783116340637, "learning_rate": 2.704918032786885e-07, "loss": 2.1667, "step": 33 }, { "epoch": 0.08455082374883431, "grad_norm": 0.40489786863327026, "learning_rate": 2.786885245901639e-07, "loss": 2.1857, "step": 34 }, { "epoch": 0.08703761268262357, "grad_norm": 0.3836217224597931, "learning_rate": 2.868852459016393e-07, "loss": 2.1313, "step": 35 }, { "epoch": 0.08952440161641281, "grad_norm": 0.4212404787540436, "learning_rate": 2.950819672131147e-07, "loss": 2.2024, "step": 36 }, { "epoch": 0.09201119055020206, "grad_norm": 0.395867258310318, "learning_rate": 3.0327868852459017e-07, "loss": 2.1355, "step": 37 }, { "epoch": 0.0944979794839913, "grad_norm": 0.3836336135864258, "learning_rate": 3.1147540983606557e-07, "loss": 2.1254, "step": 38 }, { "epoch": 0.09698476841778055, "grad_norm": 0.42212599515914917, "learning_rate": 3.1967213114754097e-07, "loss": 2.1262, "step": 39 }, { "epoch": 0.09947155735156979, "grad_norm": 0.43291711807250977, "learning_rate": 3.2786885245901637e-07, "loss": 2.1664, "step": 40 }, { "epoch": 0.10195834628535903, "grad_norm": 0.3876365125179291, "learning_rate": 3.3606557377049177e-07, "loss": 2.147, "step": 41 }, { "epoch": 0.10444513521914828, "grad_norm": 0.39074528217315674, "learning_rate": 3.4426229508196717e-07, "loss": 2.1113, "step": 42 }, { "epoch": 0.10693192415293752, "grad_norm": 0.4135940670967102, "learning_rate": 3.524590163934426e-07, "loss": 2.1834, "step": 43 }, { "epoch": 0.10941871308672677, "grad_norm": 0.4124310612678528, "learning_rate": 3.60655737704918e-07, "loss": 2.1019, "step": 44 }, { "epoch": 0.11190550202051601, "grad_norm": 0.3812576234340668, "learning_rate": 3.6885245901639347e-07, "loss": 2.1368, "step": 45 }, { "epoch": 0.11439229095430525, "grad_norm": 0.3919021189212799, "learning_rate": 3.770491803278688e-07, "loss": 2.1226, "step": 46 }, { "epoch": 0.1168790798880945, "grad_norm": 0.37712955474853516, "learning_rate": 3.852459016393442e-07, "loss": 2.1723, "step": 47 }, { "epoch": 0.11936586882188374, "grad_norm": 0.40433424711227417, "learning_rate": 3.9344262295081967e-07, "loss": 2.1469, "step": 48 }, { "epoch": 0.12185265775567299, "grad_norm": 0.4323996603488922, "learning_rate": 4.0163934426229507e-07, "loss": 2.1625, "step": 49 }, { "epoch": 0.12433944668946223, "grad_norm": 0.37467238306999207, "learning_rate": 4.0983606557377047e-07, "loss": 2.0877, "step": 50 }, { "epoch": 0.1268262356232515, "grad_norm": 0.3842613399028778, "learning_rate": 4.180327868852459e-07, "loss": 2.1367, "step": 51 }, { "epoch": 0.12931302455704072, "grad_norm": 0.41727927327156067, "learning_rate": 4.2622950819672127e-07, "loss": 2.158, "step": 52 }, { "epoch": 0.13179981349082998, "grad_norm": 0.427172988653183, "learning_rate": 4.3442622950819667e-07, "loss": 2.185, "step": 53 }, { "epoch": 0.1342866024246192, "grad_norm": 0.3944658041000366, "learning_rate": 4.426229508196721e-07, "loss": 2.1537, "step": 54 }, { "epoch": 0.13677339135840846, "grad_norm": 0.3892759382724762, "learning_rate": 4.508196721311475e-07, "loss": 2.1187, "step": 55 }, { "epoch": 0.1392601802921977, "grad_norm": 0.40089288353919983, "learning_rate": 4.590163934426229e-07, "loss": 2.1299, "step": 56 }, { "epoch": 0.14174696922598695, "grad_norm": 0.4039812982082367, "learning_rate": 4.672131147540984e-07, "loss": 2.1734, "step": 57 }, { "epoch": 0.14423375815977618, "grad_norm": 0.43650051951408386, "learning_rate": 4.754098360655737e-07, "loss": 2.1511, "step": 58 }, { "epoch": 0.14672054709356544, "grad_norm": 0.40934914350509644, "learning_rate": 4.836065573770492e-07, "loss": 2.1244, "step": 59 }, { "epoch": 0.14920733602735467, "grad_norm": 0.40043023228645325, "learning_rate": 4.918032786885245e-07, "loss": 2.1584, "step": 60 }, { "epoch": 0.15169412496114393, "grad_norm": 0.4129016697406769, "learning_rate": 5e-07, "loss": 2.158, "step": 61 }, { "epoch": 0.15418091389493316, "grad_norm": 0.38239961862564087, "learning_rate": 4.995633187772926e-07, "loss": 2.1423, "step": 62 }, { "epoch": 0.15666770282872242, "grad_norm": 0.4175527095794678, "learning_rate": 4.991266375545852e-07, "loss": 2.1224, "step": 63 }, { "epoch": 0.15915449176251165, "grad_norm": 0.4162661135196686, "learning_rate": 4.986899563318778e-07, "loss": 2.1816, "step": 64 }, { "epoch": 0.1616412806963009, "grad_norm": 0.4039028584957123, "learning_rate": 4.982532751091702e-07, "loss": 2.1734, "step": 65 }, { "epoch": 0.16412806963009013, "grad_norm": 0.4020048677921295, "learning_rate": 4.978165938864628e-07, "loss": 2.1546, "step": 66 }, { "epoch": 0.1666148585638794, "grad_norm": 0.4302126169204712, "learning_rate": 4.973799126637554e-07, "loss": 2.1906, "step": 67 }, { "epoch": 0.16910164749766862, "grad_norm": 0.4021979570388794, "learning_rate": 4.96943231441048e-07, "loss": 2.1297, "step": 68 }, { "epoch": 0.17158843643145788, "grad_norm": 0.3942105174064636, "learning_rate": 4.965065502183406e-07, "loss": 2.1292, "step": 69 }, { "epoch": 0.17407522536524714, "grad_norm": 0.4106265902519226, "learning_rate": 4.960698689956332e-07, "loss": 2.1454, "step": 70 }, { "epoch": 0.17656201429903637, "grad_norm": 0.4014648199081421, "learning_rate": 4.956331877729257e-07, "loss": 2.0864, "step": 71 }, { "epoch": 0.17904880323282563, "grad_norm": 0.41134366393089294, "learning_rate": 4.951965065502184e-07, "loss": 2.1021, "step": 72 }, { "epoch": 0.18153559216661486, "grad_norm": 0.40096017718315125, "learning_rate": 4.947598253275109e-07, "loss": 2.1465, "step": 73 }, { "epoch": 0.18402238110040411, "grad_norm": 0.41377922892570496, "learning_rate": 4.943231441048035e-07, "loss": 2.1694, "step": 74 }, { "epoch": 0.18650917003419334, "grad_norm": 0.39552953839302063, "learning_rate": 4.93886462882096e-07, "loss": 2.1748, "step": 75 }, { "epoch": 0.1889959589679826, "grad_norm": 0.44786471128463745, "learning_rate": 4.934497816593886e-07, "loss": 2.1785, "step": 76 }, { "epoch": 0.19148274790177183, "grad_norm": 0.42525768280029297, "learning_rate": 4.930131004366812e-07, "loss": 2.1825, "step": 77 }, { "epoch": 0.1939695368355611, "grad_norm": 0.427071750164032, "learning_rate": 4.925764192139738e-07, "loss": 2.1463, "step": 78 }, { "epoch": 0.19645632576935032, "grad_norm": 0.41076913475990295, "learning_rate": 4.921397379912663e-07, "loss": 2.124, "step": 79 }, { "epoch": 0.19894311470313958, "grad_norm": 0.4056430160999298, "learning_rate": 4.917030567685589e-07, "loss": 2.143, "step": 80 }, { "epoch": 0.2014299036369288, "grad_norm": 0.4058414101600647, "learning_rate": 4.912663755458515e-07, "loss": 2.1337, "step": 81 }, { "epoch": 0.20391669257071807, "grad_norm": 0.4427083730697632, "learning_rate": 4.908296943231441e-07, "loss": 2.1631, "step": 82 }, { "epoch": 0.2064034815045073, "grad_norm": 0.4002906084060669, "learning_rate": 4.903930131004367e-07, "loss": 2.0538, "step": 83 }, { "epoch": 0.20889027043829655, "grad_norm": 0.40065857768058777, "learning_rate": 4.899563318777293e-07, "loss": 2.1129, "step": 84 }, { "epoch": 0.21137705937208578, "grad_norm": 0.42688536643981934, "learning_rate": 4.895196506550219e-07, "loss": 2.139, "step": 85 }, { "epoch": 0.21386384830587504, "grad_norm": 0.4278879165649414, "learning_rate": 4.890829694323143e-07, "loss": 2.1395, "step": 86 }, { "epoch": 0.21635063723966427, "grad_norm": 0.43649378418922424, "learning_rate": 4.886462882096069e-07, "loss": 2.0751, "step": 87 }, { "epoch": 0.21883742617345353, "grad_norm": 0.3865818977355957, "learning_rate": 4.882096069868995e-07, "loss": 2.0862, "step": 88 }, { "epoch": 0.22132421510724276, "grad_norm": 0.42509347200393677, "learning_rate": 4.877729257641921e-07, "loss": 2.1478, "step": 89 }, { "epoch": 0.22381100404103202, "grad_norm": 0.4220832884311676, "learning_rate": 4.873362445414847e-07, "loss": 2.1609, "step": 90 }, { "epoch": 0.22629779297482125, "grad_norm": 0.40812230110168457, "learning_rate": 4.868995633187773e-07, "loss": 2.1551, "step": 91 }, { "epoch": 0.2287845819086105, "grad_norm": 0.4381932020187378, "learning_rate": 4.864628820960698e-07, "loss": 2.1594, "step": 92 }, { "epoch": 0.23127137084239976, "grad_norm": 0.4095819890499115, "learning_rate": 4.860262008733625e-07, "loss": 2.1396, "step": 93 }, { "epoch": 0.233758159776189, "grad_norm": 0.42679563164711, "learning_rate": 4.85589519650655e-07, "loss": 2.1771, "step": 94 }, { "epoch": 0.23624494870997825, "grad_norm": 0.44320186972618103, "learning_rate": 4.851528384279476e-07, "loss": 2.1842, "step": 95 }, { "epoch": 0.23873173764376748, "grad_norm": 0.39184531569480896, "learning_rate": 4.847161572052402e-07, "loss": 2.1378, "step": 96 }, { "epoch": 0.24121852657755674, "grad_norm": 0.4558006823062897, "learning_rate": 4.842794759825327e-07, "loss": 2.1706, "step": 97 }, { "epoch": 0.24370531551134597, "grad_norm": 0.4423806667327881, "learning_rate": 4.838427947598253e-07, "loss": 2.1841, "step": 98 }, { "epoch": 0.24619210444513523, "grad_norm": 0.4314688444137573, "learning_rate": 4.834061135371178e-07, "loss": 2.1154, "step": 99 }, { "epoch": 0.24867889337892446, "grad_norm": 0.44223618507385254, "learning_rate": 4.829694323144104e-07, "loss": 2.1124, "step": 100 }, { "epoch": 0.2511656823127137, "grad_norm": 0.44006800651550293, "learning_rate": 4.82532751091703e-07, "loss": 2.1937, "step": 101 }, { "epoch": 0.253652471246503, "grad_norm": 0.4089645445346832, "learning_rate": 4.820960698689956e-07, "loss": 2.1236, "step": 102 }, { "epoch": 0.2561392601802922, "grad_norm": 0.4407235383987427, "learning_rate": 4.816593886462882e-07, "loss": 2.1116, "step": 103 }, { "epoch": 0.25862604911408144, "grad_norm": 0.4155865013599396, "learning_rate": 4.812227074235808e-07, "loss": 2.1259, "step": 104 }, { "epoch": 0.26111283804787067, "grad_norm": 0.4286578893661499, "learning_rate": 4.807860262008734e-07, "loss": 2.1831, "step": 105 }, { "epoch": 0.26359962698165995, "grad_norm": 0.44605061411857605, "learning_rate": 4.80349344978166e-07, "loss": 2.1373, "step": 106 }, { "epoch": 0.2660864159154492, "grad_norm": 0.4532274007797241, "learning_rate": 4.799126637554585e-07, "loss": 2.1249, "step": 107 }, { "epoch": 0.2685732048492384, "grad_norm": 0.4327315092086792, "learning_rate": 4.79475982532751e-07, "loss": 2.1414, "step": 108 }, { "epoch": 0.27105999378302764, "grad_norm": 0.4438115358352661, "learning_rate": 4.790393013100436e-07, "loss": 2.0787, "step": 109 }, { "epoch": 0.2735467827168169, "grad_norm": 0.4239655137062073, "learning_rate": 4.786026200873362e-07, "loss": 2.1234, "step": 110 }, { "epoch": 0.27603357165060616, "grad_norm": 0.4541226327419281, "learning_rate": 4.781659388646288e-07, "loss": 2.2023, "step": 111 }, { "epoch": 0.2785203605843954, "grad_norm": 0.4275488555431366, "learning_rate": 4.777292576419214e-07, "loss": 2.1872, "step": 112 }, { "epoch": 0.2810071495181846, "grad_norm": 0.4427001476287842, "learning_rate": 4.772925764192139e-07, "loss": 2.1646, "step": 113 }, { "epoch": 0.2834939384519739, "grad_norm": 0.43924546241760254, "learning_rate": 4.768558951965065e-07, "loss": 2.1175, "step": 114 }, { "epoch": 0.28598072738576313, "grad_norm": 0.42863723635673523, "learning_rate": 4.764192139737991e-07, "loss": 2.1269, "step": 115 }, { "epoch": 0.28846751631955236, "grad_norm": 0.40726035833358765, "learning_rate": 4.759825327510917e-07, "loss": 2.071, "step": 116 }, { "epoch": 0.29095430525334165, "grad_norm": 0.4581323564052582, "learning_rate": 4.7554585152838427e-07, "loss": 2.1981, "step": 117 }, { "epoch": 0.2934410941871309, "grad_norm": 0.46849963068962097, "learning_rate": 4.751091703056768e-07, "loss": 2.1689, "step": 118 }, { "epoch": 0.2959278831209201, "grad_norm": 0.45309266448020935, "learning_rate": 4.746724890829694e-07, "loss": 2.1122, "step": 119 }, { "epoch": 0.29841467205470934, "grad_norm": 0.4709586203098297, "learning_rate": 4.7423580786026193e-07, "loss": 2.1774, "step": 120 }, { "epoch": 0.3009014609884986, "grad_norm": 0.4502153992652893, "learning_rate": 4.737991266375546e-07, "loss": 2.1357, "step": 121 }, { "epoch": 0.30338824992228786, "grad_norm": 0.4565674066543579, "learning_rate": 4.7336244541484717e-07, "loss": 2.1982, "step": 122 }, { "epoch": 0.3058750388560771, "grad_norm": 0.4817062020301819, "learning_rate": 4.729257641921397e-07, "loss": 2.1124, "step": 123 }, { "epoch": 0.3083618277898663, "grad_norm": 0.43636277318000793, "learning_rate": 4.724890829694323e-07, "loss": 2.1345, "step": 124 }, { "epoch": 0.3108486167236556, "grad_norm": 0.4348713159561157, "learning_rate": 4.720524017467249e-07, "loss": 2.1382, "step": 125 }, { "epoch": 0.31333540565744483, "grad_norm": 0.43462586402893066, "learning_rate": 4.7161572052401743e-07, "loss": 2.1708, "step": 126 }, { "epoch": 0.31582219459123406, "grad_norm": 0.4370459318161011, "learning_rate": 4.7117903930131e-07, "loss": 2.128, "step": 127 }, { "epoch": 0.3183089835250233, "grad_norm": 0.43912699818611145, "learning_rate": 4.7074235807860256e-07, "loss": 2.1016, "step": 128 }, { "epoch": 0.3207957724588126, "grad_norm": 0.4648686647415161, "learning_rate": 4.7030567685589515e-07, "loss": 2.073, "step": 129 }, { "epoch": 0.3232825613926018, "grad_norm": 0.45214056968688965, "learning_rate": 4.6986899563318775e-07, "loss": 2.1608, "step": 130 }, { "epoch": 0.32576935032639104, "grad_norm": 0.4569113850593567, "learning_rate": 4.6943231441048034e-07, "loss": 2.1456, "step": 131 }, { "epoch": 0.32825613926018027, "grad_norm": 0.4468495547771454, "learning_rate": 4.6899563318777293e-07, "loss": 2.104, "step": 132 }, { "epoch": 0.33074292819396955, "grad_norm": 0.4579126834869385, "learning_rate": 4.685589519650655e-07, "loss": 2.1415, "step": 133 }, { "epoch": 0.3332297171277588, "grad_norm": 0.41133925318717957, "learning_rate": 4.6812227074235806e-07, "loss": 2.0556, "step": 134 }, { "epoch": 0.335716506061548, "grad_norm": 0.45494189858436584, "learning_rate": 4.6768558951965065e-07, "loss": 2.1353, "step": 135 }, { "epoch": 0.33820329499533724, "grad_norm": 0.45598360896110535, "learning_rate": 4.672489082969432e-07, "loss": 2.1608, "step": 136 }, { "epoch": 0.34069008392912653, "grad_norm": 0.4804225564002991, "learning_rate": 4.668122270742358e-07, "loss": 2.125, "step": 137 }, { "epoch": 0.34317687286291576, "grad_norm": 0.45764321088790894, "learning_rate": 4.6637554585152837e-07, "loss": 2.1547, "step": 138 }, { "epoch": 0.345663661796705, "grad_norm": 0.469670832157135, "learning_rate": 4.659388646288209e-07, "loss": 2.1538, "step": 139 }, { "epoch": 0.3481504507304943, "grad_norm": 0.47154125571250916, "learning_rate": 4.655021834061135e-07, "loss": 2.1144, "step": 140 }, { "epoch": 0.3506372396642835, "grad_norm": 0.4631963074207306, "learning_rate": 4.6506550218340604e-07, "loss": 2.194, "step": 141 }, { "epoch": 0.35312402859807274, "grad_norm": 0.4393676519393921, "learning_rate": 4.646288209606987e-07, "loss": 2.0945, "step": 142 }, { "epoch": 0.35561081753186197, "grad_norm": 0.444672554731369, "learning_rate": 4.641921397379913e-07, "loss": 2.111, "step": 143 }, { "epoch": 0.35809760646565125, "grad_norm": 0.46494928002357483, "learning_rate": 4.637554585152838e-07, "loss": 2.1777, "step": 144 }, { "epoch": 0.3605843953994405, "grad_norm": 0.4453777074813843, "learning_rate": 4.633187772925764e-07, "loss": 2.1631, "step": 145 }, { "epoch": 0.3630711843332297, "grad_norm": 0.4813487231731415, "learning_rate": 4.62882096069869e-07, "loss": 2.1595, "step": 146 }, { "epoch": 0.36555797326701894, "grad_norm": 0.4742406904697418, "learning_rate": 4.6244541484716154e-07, "loss": 2.1576, "step": 147 }, { "epoch": 0.36804476220080823, "grad_norm": 0.45844781398773193, "learning_rate": 4.6200873362445413e-07, "loss": 2.1756, "step": 148 }, { "epoch": 0.37053155113459746, "grad_norm": 0.44323110580444336, "learning_rate": 4.6157205240174667e-07, "loss": 2.1483, "step": 149 }, { "epoch": 0.3730183400683867, "grad_norm": 0.49860548973083496, "learning_rate": 4.6113537117903926e-07, "loss": 2.2144, "step": 150 }, { "epoch": 0.3755051290021759, "grad_norm": 0.43856751918792725, "learning_rate": 4.6069868995633185e-07, "loss": 2.0581, "step": 151 }, { "epoch": 0.3779919179359652, "grad_norm": 0.4472333788871765, "learning_rate": 4.602620087336244e-07, "loss": 2.117, "step": 152 }, { "epoch": 0.38047870686975443, "grad_norm": 0.4927634298801422, "learning_rate": 4.5982532751091704e-07, "loss": 2.153, "step": 153 }, { "epoch": 0.38296549580354367, "grad_norm": 0.4599962532520294, "learning_rate": 4.5938864628820963e-07, "loss": 2.2226, "step": 154 }, { "epoch": 0.3854522847373329, "grad_norm": 0.45448487997055054, "learning_rate": 4.5895196506550217e-07, "loss": 2.1189, "step": 155 }, { "epoch": 0.3879390736711222, "grad_norm": 0.4686853587627411, "learning_rate": 4.5851528384279476e-07, "loss": 2.1257, "step": 156 }, { "epoch": 0.3904258626049114, "grad_norm": 0.5059552192687988, "learning_rate": 4.580786026200873e-07, "loss": 2.1586, "step": 157 }, { "epoch": 0.39291265153870064, "grad_norm": 0.4529350996017456, "learning_rate": 4.576419213973799e-07, "loss": 2.1436, "step": 158 }, { "epoch": 0.39539944047248987, "grad_norm": 0.44359931349754333, "learning_rate": 4.572052401746725e-07, "loss": 2.1086, "step": 159 }, { "epoch": 0.39788622940627916, "grad_norm": 0.4643580913543701, "learning_rate": 4.56768558951965e-07, "loss": 2.1566, "step": 160 }, { "epoch": 0.4003730183400684, "grad_norm": 0.47713202238082886, "learning_rate": 4.563318777292576e-07, "loss": 2.1494, "step": 161 }, { "epoch": 0.4028598072738576, "grad_norm": 0.4480564296245575, "learning_rate": 4.5589519650655015e-07, "loss": 2.1188, "step": 162 }, { "epoch": 0.4053465962076469, "grad_norm": 0.4450179636478424, "learning_rate": 4.554585152838428e-07, "loss": 2.1035, "step": 163 }, { "epoch": 0.40783338514143613, "grad_norm": 0.4772661328315735, "learning_rate": 4.550218340611354e-07, "loss": 2.1598, "step": 164 }, { "epoch": 0.41032017407522536, "grad_norm": 0.47646352648735046, "learning_rate": 4.545851528384279e-07, "loss": 2.1995, "step": 165 }, { "epoch": 0.4128069630090146, "grad_norm": 0.4821939468383789, "learning_rate": 4.541484716157205e-07, "loss": 2.1441, "step": 166 }, { "epoch": 0.4152937519428039, "grad_norm": 0.4545115828514099, "learning_rate": 4.537117903930131e-07, "loss": 2.1598, "step": 167 }, { "epoch": 0.4177805408765931, "grad_norm": 0.45019111037254333, "learning_rate": 4.5327510917030565e-07, "loss": 2.1148, "step": 168 }, { "epoch": 0.42026732981038234, "grad_norm": 0.4689873456954956, "learning_rate": 4.5283842794759824e-07, "loss": 2.1509, "step": 169 }, { "epoch": 0.42275411874417157, "grad_norm": 0.47073739767074585, "learning_rate": 4.524017467248908e-07, "loss": 2.1425, "step": 170 }, { "epoch": 0.42524090767796086, "grad_norm": 0.4607613980770111, "learning_rate": 4.5196506550218337e-07, "loss": 2.1226, "step": 171 }, { "epoch": 0.4277276966117501, "grad_norm": 0.46717172861099243, "learning_rate": 4.5152838427947596e-07, "loss": 2.1222, "step": 172 }, { "epoch": 0.4302144855455393, "grad_norm": 0.5047352313995361, "learning_rate": 4.510917030567685e-07, "loss": 2.2121, "step": 173 }, { "epoch": 0.43270127447932855, "grad_norm": 0.49366188049316406, "learning_rate": 4.5065502183406115e-07, "loss": 2.1806, "step": 174 }, { "epoch": 0.43518806341311783, "grad_norm": 0.4741223454475403, "learning_rate": 4.502183406113537e-07, "loss": 2.0808, "step": 175 }, { "epoch": 0.43767485234690706, "grad_norm": 0.4672994911670685, "learning_rate": 4.497816593886463e-07, "loss": 2.1021, "step": 176 }, { "epoch": 0.4401616412806963, "grad_norm": 0.4813832640647888, "learning_rate": 4.4934497816593887e-07, "loss": 2.139, "step": 177 }, { "epoch": 0.4426484302144855, "grad_norm": 0.4757406413555145, "learning_rate": 4.489082969432314e-07, "loss": 2.1245, "step": 178 }, { "epoch": 0.4451352191482748, "grad_norm": 0.4674074351787567, "learning_rate": 4.48471615720524e-07, "loss": 2.1619, "step": 179 }, { "epoch": 0.44762200808206404, "grad_norm": 0.4354044795036316, "learning_rate": 4.480349344978166e-07, "loss": 2.0624, "step": 180 }, { "epoch": 0.45010879701585327, "grad_norm": 0.4484567940235138, "learning_rate": 4.4759825327510913e-07, "loss": 2.1071, "step": 181 }, { "epoch": 0.4525955859496425, "grad_norm": 0.4580535292625427, "learning_rate": 4.471615720524017e-07, "loss": 2.1039, "step": 182 }, { "epoch": 0.4550823748834318, "grad_norm": 0.504393994808197, "learning_rate": 4.4672489082969426e-07, "loss": 2.1172, "step": 183 }, { "epoch": 0.457569163817221, "grad_norm": 0.4727741777896881, "learning_rate": 4.4628820960698685e-07, "loss": 2.1343, "step": 184 }, { "epoch": 0.46005595275101024, "grad_norm": 0.4549051523208618, "learning_rate": 4.458515283842795e-07, "loss": 2.1586, "step": 185 }, { "epoch": 0.46254274168479953, "grad_norm": 0.4877924919128418, "learning_rate": 4.4541484716157203e-07, "loss": 2.2136, "step": 186 }, { "epoch": 0.46502953061858876, "grad_norm": 0.47328630089759827, "learning_rate": 4.449781659388646e-07, "loss": 2.1065, "step": 187 }, { "epoch": 0.467516319552378, "grad_norm": 0.46814873814582825, "learning_rate": 4.445414847161572e-07, "loss": 2.0879, "step": 188 }, { "epoch": 0.4700031084861672, "grad_norm": 0.46883970499038696, "learning_rate": 4.4410480349344976e-07, "loss": 2.1495, "step": 189 }, { "epoch": 0.4724898974199565, "grad_norm": 0.5020297169685364, "learning_rate": 4.4366812227074235e-07, "loss": 2.161, "step": 190 }, { "epoch": 0.47497668635374574, "grad_norm": 0.47197675704956055, "learning_rate": 4.432314410480349e-07, "loss": 2.1354, "step": 191 }, { "epoch": 0.47746347528753497, "grad_norm": 0.47488582134246826, "learning_rate": 4.427947598253275e-07, "loss": 2.0913, "step": 192 }, { "epoch": 0.4799502642213242, "grad_norm": 0.49505242705345154, "learning_rate": 4.4235807860262007e-07, "loss": 2.1499, "step": 193 }, { "epoch": 0.4824370531551135, "grad_norm": 0.48239609599113464, "learning_rate": 4.419213973799126e-07, "loss": 2.1432, "step": 194 }, { "epoch": 0.4849238420889027, "grad_norm": 0.46357694268226624, "learning_rate": 4.4148471615720525e-07, "loss": 2.1352, "step": 195 }, { "epoch": 0.48741063102269194, "grad_norm": 0.4855436086654663, "learning_rate": 4.410480349344978e-07, "loss": 2.0876, "step": 196 }, { "epoch": 0.4898974199564812, "grad_norm": 0.5182051658630371, "learning_rate": 4.406113537117904e-07, "loss": 2.1872, "step": 197 }, { "epoch": 0.49238420889027046, "grad_norm": 0.4700855016708374, "learning_rate": 4.40174672489083e-07, "loss": 2.0742, "step": 198 }, { "epoch": 0.4948709978240597, "grad_norm": 0.4741506278514862, "learning_rate": 4.397379912663755e-07, "loss": 2.0981, "step": 199 }, { "epoch": 0.4973577867578489, "grad_norm": 0.4737417697906494, "learning_rate": 4.393013100436681e-07, "loss": 2.1206, "step": 200 }, { "epoch": 0.49984457569163815, "grad_norm": 0.4860036075115204, "learning_rate": 4.388646288209607e-07, "loss": 2.0539, "step": 201 }, { "epoch": 0.5023313646254274, "grad_norm": 0.4895828068256378, "learning_rate": 4.3842794759825324e-07, "loss": 2.2017, "step": 202 }, { "epoch": 0.5048181535592167, "grad_norm": 0.5107592940330505, "learning_rate": 4.3799126637554583e-07, "loss": 2.2033, "step": 203 }, { "epoch": 0.507304942493006, "grad_norm": 0.49359220266342163, "learning_rate": 4.3755458515283837e-07, "loss": 2.1245, "step": 204 }, { "epoch": 0.5097917314267951, "grad_norm": 0.43286237120628357, "learning_rate": 4.3711790393013096e-07, "loss": 2.1112, "step": 205 }, { "epoch": 0.5122785203605844, "grad_norm": 0.4412092864513397, "learning_rate": 4.366812227074236e-07, "loss": 2.032, "step": 206 }, { "epoch": 0.5147653092943736, "grad_norm": 0.5042114853858948, "learning_rate": 4.3624454148471614e-07, "loss": 2.1303, "step": 207 }, { "epoch": 0.5172520982281629, "grad_norm": 0.4746697247028351, "learning_rate": 4.3580786026200873e-07, "loss": 2.1337, "step": 208 }, { "epoch": 0.5197388871619522, "grad_norm": 0.4542432427406311, "learning_rate": 4.353711790393013e-07, "loss": 2.1181, "step": 209 }, { "epoch": 0.5222256760957413, "grad_norm": 0.5013236403465271, "learning_rate": 4.3493449781659386e-07, "loss": 2.1283, "step": 210 }, { "epoch": 0.5247124650295306, "grad_norm": 0.44694000482559204, "learning_rate": 4.3449781659388646e-07, "loss": 2.1317, "step": 211 }, { "epoch": 0.5271992539633199, "grad_norm": 0.5043014287948608, "learning_rate": 4.34061135371179e-07, "loss": 2.1751, "step": 212 }, { "epoch": 0.5296860428971091, "grad_norm": 0.45841050148010254, "learning_rate": 4.336244541484716e-07, "loss": 2.1164, "step": 213 }, { "epoch": 0.5321728318308984, "grad_norm": 0.4801214635372162, "learning_rate": 4.331877729257642e-07, "loss": 2.0855, "step": 214 }, { "epoch": 0.5346596207646876, "grad_norm": 0.5102494955062866, "learning_rate": 4.327510917030567e-07, "loss": 2.1819, "step": 215 }, { "epoch": 0.5371464096984768, "grad_norm": 0.5200817584991455, "learning_rate": 4.323144104803493e-07, "loss": 2.175, "step": 216 }, { "epoch": 0.5396331986322661, "grad_norm": 0.49951592087745667, "learning_rate": 4.318777292576419e-07, "loss": 2.1599, "step": 217 }, { "epoch": 0.5421199875660553, "grad_norm": 0.5104175209999084, "learning_rate": 4.314410480349345e-07, "loss": 2.1793, "step": 218 }, { "epoch": 0.5446067764998446, "grad_norm": 0.49184200167655945, "learning_rate": 4.310043668122271e-07, "loss": 2.1548, "step": 219 }, { "epoch": 0.5470935654336339, "grad_norm": 0.48827120661735535, "learning_rate": 4.305676855895196e-07, "loss": 2.1406, "step": 220 }, { "epoch": 0.549580354367423, "grad_norm": 0.47386690974235535, "learning_rate": 4.301310043668122e-07, "loss": 2.1014, "step": 221 }, { "epoch": 0.5520671433012123, "grad_norm": 0.47124195098876953, "learning_rate": 4.296943231441048e-07, "loss": 2.086, "step": 222 }, { "epoch": 0.5545539322350016, "grad_norm": 0.5211581587791443, "learning_rate": 4.2925764192139734e-07, "loss": 2.0998, "step": 223 }, { "epoch": 0.5570407211687908, "grad_norm": 0.4680314064025879, "learning_rate": 4.2882096069868994e-07, "loss": 2.0983, "step": 224 }, { "epoch": 0.5595275101025801, "grad_norm": 0.4839833080768585, "learning_rate": 4.283842794759825e-07, "loss": 2.1144, "step": 225 }, { "epoch": 0.5620142990363692, "grad_norm": 0.4539274275302887, "learning_rate": 4.2794759825327507e-07, "loss": 2.0693, "step": 226 }, { "epoch": 0.5645010879701585, "grad_norm": 0.4671003818511963, "learning_rate": 4.275109170305677e-07, "loss": 2.0779, "step": 227 }, { "epoch": 0.5669878769039478, "grad_norm": 0.5179879069328308, "learning_rate": 4.2707423580786025e-07, "loss": 2.1674, "step": 228 }, { "epoch": 0.569474665837737, "grad_norm": 0.4587318003177643, "learning_rate": 4.2663755458515284e-07, "loss": 2.1075, "step": 229 }, { "epoch": 0.5719614547715263, "grad_norm": 0.5107843279838562, "learning_rate": 4.262008733624454e-07, "loss": 2.1966, "step": 230 }, { "epoch": 0.5744482437053156, "grad_norm": 0.48125070333480835, "learning_rate": 4.2576419213973797e-07, "loss": 2.111, "step": 231 }, { "epoch": 0.5769350326391047, "grad_norm": 0.5291087031364441, "learning_rate": 4.2532751091703056e-07, "loss": 2.1401, "step": 232 }, { "epoch": 0.579421821572894, "grad_norm": 0.5241518020629883, "learning_rate": 4.248908296943231e-07, "loss": 2.1662, "step": 233 }, { "epoch": 0.5819086105066833, "grad_norm": 0.5210862755775452, "learning_rate": 4.244541484716157e-07, "loss": 2.2149, "step": 234 }, { "epoch": 0.5843953994404725, "grad_norm": 0.5254886150360107, "learning_rate": 4.240174672489083e-07, "loss": 2.2019, "step": 235 }, { "epoch": 0.5868821883742618, "grad_norm": 0.49172264337539673, "learning_rate": 4.235807860262008e-07, "loss": 2.1671, "step": 236 }, { "epoch": 0.5893689773080509, "grad_norm": 0.47738420963287354, "learning_rate": 4.231441048034934e-07, "loss": 2.0794, "step": 237 }, { "epoch": 0.5918557662418402, "grad_norm": 0.5100018978118896, "learning_rate": 4.22707423580786e-07, "loss": 2.1181, "step": 238 }, { "epoch": 0.5943425551756295, "grad_norm": 0.5403950810432434, "learning_rate": 4.222707423580786e-07, "loss": 2.1559, "step": 239 }, { "epoch": 0.5968293441094187, "grad_norm": 0.5101498961448669, "learning_rate": 4.218340611353712e-07, "loss": 2.2091, "step": 240 }, { "epoch": 0.599316133043208, "grad_norm": 0.5162122845649719, "learning_rate": 4.2139737991266373e-07, "loss": 2.1471, "step": 241 }, { "epoch": 0.6018029219769973, "grad_norm": 0.5074677467346191, "learning_rate": 4.209606986899563e-07, "loss": 2.0849, "step": 242 }, { "epoch": 0.6042897109107864, "grad_norm": 0.5043840408325195, "learning_rate": 4.205240174672489e-07, "loss": 2.109, "step": 243 }, { "epoch": 0.6067764998445757, "grad_norm": 0.49023503065109253, "learning_rate": 4.2008733624454145e-07, "loss": 2.157, "step": 244 }, { "epoch": 0.6092632887783649, "grad_norm": 0.5031821131706238, "learning_rate": 4.1965065502183404e-07, "loss": 2.1597, "step": 245 }, { "epoch": 0.6117500777121542, "grad_norm": 0.5147417783737183, "learning_rate": 4.192139737991266e-07, "loss": 2.1502, "step": 246 }, { "epoch": 0.6142368666459435, "grad_norm": 0.5135524272918701, "learning_rate": 4.187772925764192e-07, "loss": 2.1204, "step": 247 }, { "epoch": 0.6167236555797326, "grad_norm": 0.516242265701294, "learning_rate": 4.1834061135371177e-07, "loss": 2.1121, "step": 248 }, { "epoch": 0.6192104445135219, "grad_norm": 0.5270472764968872, "learning_rate": 4.1790393013100436e-07, "loss": 2.1296, "step": 249 }, { "epoch": 0.6216972334473112, "grad_norm": 0.5023481249809265, "learning_rate": 4.1746724890829695e-07, "loss": 2.1138, "step": 250 }, { "epoch": 0.6241840223811004, "grad_norm": 0.5072234869003296, "learning_rate": 4.170305676855895e-07, "loss": 2.1852, "step": 251 }, { "epoch": 0.6266708113148897, "grad_norm": 0.5261276364326477, "learning_rate": 4.165938864628821e-07, "loss": 2.1707, "step": 252 }, { "epoch": 0.6291576002486788, "grad_norm": 0.524861216545105, "learning_rate": 4.1615720524017467e-07, "loss": 2.1885, "step": 253 }, { "epoch": 0.6316443891824681, "grad_norm": 0.5122174620628357, "learning_rate": 4.157205240174672e-07, "loss": 2.1095, "step": 254 }, { "epoch": 0.6341311781162574, "grad_norm": 0.4976103603839874, "learning_rate": 4.152838427947598e-07, "loss": 2.1686, "step": 255 }, { "epoch": 0.6366179670500466, "grad_norm": 0.48984527587890625, "learning_rate": 4.148471615720524e-07, "loss": 2.0661, "step": 256 }, { "epoch": 0.6391047559838359, "grad_norm": 0.5184794664382935, "learning_rate": 4.1441048034934493e-07, "loss": 2.1323, "step": 257 }, { "epoch": 0.6415915449176252, "grad_norm": 0.4991200566291809, "learning_rate": 4.139737991266375e-07, "loss": 2.1294, "step": 258 }, { "epoch": 0.6440783338514143, "grad_norm": 0.5302152037620544, "learning_rate": 4.1353711790393006e-07, "loss": 2.0922, "step": 259 }, { "epoch": 0.6465651227852036, "grad_norm": 0.5143322348594666, "learning_rate": 4.131004366812227e-07, "loss": 2.1694, "step": 260 }, { "epoch": 0.6490519117189929, "grad_norm": 0.5043548941612244, "learning_rate": 4.126637554585153e-07, "loss": 2.0899, "step": 261 }, { "epoch": 0.6515387006527821, "grad_norm": 0.5160046815872192, "learning_rate": 4.1222707423580784e-07, "loss": 2.1185, "step": 262 }, { "epoch": 0.6540254895865714, "grad_norm": 0.5054792761802673, "learning_rate": 4.1179039301310043e-07, "loss": 2.1503, "step": 263 }, { "epoch": 0.6565122785203605, "grad_norm": 0.5056222677230835, "learning_rate": 4.11353711790393e-07, "loss": 2.1089, "step": 264 }, { "epoch": 0.6589990674541498, "grad_norm": 0.5285047292709351, "learning_rate": 4.1091703056768556e-07, "loss": 2.1556, "step": 265 }, { "epoch": 0.6614858563879391, "grad_norm": 0.48898041248321533, "learning_rate": 4.1048034934497815e-07, "loss": 2.1107, "step": 266 }, { "epoch": 0.6639726453217283, "grad_norm": 0.525590717792511, "learning_rate": 4.100436681222707e-07, "loss": 2.1374, "step": 267 }, { "epoch": 0.6664594342555176, "grad_norm": 0.5363737344741821, "learning_rate": 4.096069868995633e-07, "loss": 2.1198, "step": 268 }, { "epoch": 0.6689462231893069, "grad_norm": 0.5182633399963379, "learning_rate": 4.091703056768559e-07, "loss": 2.126, "step": 269 }, { "epoch": 0.671433012123096, "grad_norm": 0.4978923201560974, "learning_rate": 4.0873362445414847e-07, "loss": 2.0764, "step": 270 }, { "epoch": 0.6739198010568853, "grad_norm": 0.5094720125198364, "learning_rate": 4.0829694323144106e-07, "loss": 2.145, "step": 271 }, { "epoch": 0.6764065899906745, "grad_norm": 0.540023148059845, "learning_rate": 4.078602620087336e-07, "loss": 2.1327, "step": 272 }, { "epoch": 0.6788933789244638, "grad_norm": 0.5420276522636414, "learning_rate": 4.074235807860262e-07, "loss": 2.1707, "step": 273 }, { "epoch": 0.6813801678582531, "grad_norm": 0.5282043218612671, "learning_rate": 4.069868995633188e-07, "loss": 2.1473, "step": 274 }, { "epoch": 0.6838669567920422, "grad_norm": 0.5049037933349609, "learning_rate": 4.065502183406113e-07, "loss": 2.1335, "step": 275 }, { "epoch": 0.6863537457258315, "grad_norm": 0.5107303261756897, "learning_rate": 4.061135371179039e-07, "loss": 2.1349, "step": 276 }, { "epoch": 0.6888405346596208, "grad_norm": 0.4959608018398285, "learning_rate": 4.056768558951965e-07, "loss": 2.1044, "step": 277 }, { "epoch": 0.69132732359341, "grad_norm": 0.5125852227210999, "learning_rate": 4.0524017467248904e-07, "loss": 2.1428, "step": 278 }, { "epoch": 0.6938141125271993, "grad_norm": 0.511873185634613, "learning_rate": 4.0480349344978163e-07, "loss": 2.0763, "step": 279 }, { "epoch": 0.6963009014609886, "grad_norm": 0.5032888054847717, "learning_rate": 4.0436681222707417e-07, "loss": 2.122, "step": 280 }, { "epoch": 0.6987876903947777, "grad_norm": 0.5102598667144775, "learning_rate": 4.039301310043668e-07, "loss": 2.0693, "step": 281 }, { "epoch": 0.701274479328567, "grad_norm": 0.5118304491043091, "learning_rate": 4.034934497816594e-07, "loss": 2.11, "step": 282 }, { "epoch": 0.7037612682623562, "grad_norm": 0.5202342867851257, "learning_rate": 4.0305676855895195e-07, "loss": 2.1582, "step": 283 }, { "epoch": 0.7062480571961455, "grad_norm": 0.48433917760849, "learning_rate": 4.0262008733624454e-07, "loss": 2.103, "step": 284 }, { "epoch": 0.7087348461299348, "grad_norm": 0.4986036717891693, "learning_rate": 4.0218340611353713e-07, "loss": 2.1203, "step": 285 }, { "epoch": 0.7112216350637239, "grad_norm": 0.5467602014541626, "learning_rate": 4.0174672489082967e-07, "loss": 2.1204, "step": 286 }, { "epoch": 0.7137084239975132, "grad_norm": 0.5108657479286194, "learning_rate": 4.0131004366812226e-07, "loss": 2.1478, "step": 287 }, { "epoch": 0.7161952129313025, "grad_norm": 0.5145993232727051, "learning_rate": 4.008733624454148e-07, "loss": 2.1563, "step": 288 }, { "epoch": 0.7186820018650917, "grad_norm": 0.5134692788124084, "learning_rate": 4.004366812227074e-07, "loss": 2.1067, "step": 289 }, { "epoch": 0.721168790798881, "grad_norm": 0.5436774492263794, "learning_rate": 4e-07, "loss": 2.1369, "step": 290 }, { "epoch": 0.7236555797326701, "grad_norm": 0.5296205282211304, "learning_rate": 3.995633187772925e-07, "loss": 2.1452, "step": 291 }, { "epoch": 0.7261423686664594, "grad_norm": 0.4911108911037445, "learning_rate": 3.9912663755458517e-07, "loss": 2.1279, "step": 292 }, { "epoch": 0.7286291576002487, "grad_norm": 0.5625902414321899, "learning_rate": 3.986899563318777e-07, "loss": 2.169, "step": 293 }, { "epoch": 0.7311159465340379, "grad_norm": 0.5042857527732849, "learning_rate": 3.982532751091703e-07, "loss": 2.0692, "step": 294 }, { "epoch": 0.7336027354678272, "grad_norm": 0.5251498222351074, "learning_rate": 3.978165938864629e-07, "loss": 2.1261, "step": 295 }, { "epoch": 0.7360895244016165, "grad_norm": 0.5093502402305603, "learning_rate": 3.973799126637554e-07, "loss": 2.1175, "step": 296 }, { "epoch": 0.7385763133354056, "grad_norm": 0.49675241112709045, "learning_rate": 3.96943231441048e-07, "loss": 2.1679, "step": 297 }, { "epoch": 0.7410631022691949, "grad_norm": 0.523313045501709, "learning_rate": 3.965065502183406e-07, "loss": 2.1195, "step": 298 }, { "epoch": 0.7435498912029841, "grad_norm": 0.5194100737571716, "learning_rate": 3.9606986899563315e-07, "loss": 2.1431, "step": 299 }, { "epoch": 0.7460366801367734, "grad_norm": 0.5145063996315002, "learning_rate": 3.9563318777292574e-07, "loss": 2.0968, "step": 300 }, { "epoch": 0.7485234690705627, "grad_norm": 0.5165944695472717, "learning_rate": 3.951965065502183e-07, "loss": 2.1316, "step": 301 }, { "epoch": 0.7510102580043518, "grad_norm": 0.5502745509147644, "learning_rate": 3.947598253275109e-07, "loss": 2.1215, "step": 302 }, { "epoch": 0.7534970469381411, "grad_norm": 0.5752532482147217, "learning_rate": 3.943231441048035e-07, "loss": 2.165, "step": 303 }, { "epoch": 0.7559838358719304, "grad_norm": 0.5388760566711426, "learning_rate": 3.9388646288209605e-07, "loss": 2.136, "step": 304 }, { "epoch": 0.7584706248057196, "grad_norm": 0.5256951451301575, "learning_rate": 3.9344978165938865e-07, "loss": 2.0978, "step": 305 }, { "epoch": 0.7609574137395089, "grad_norm": 0.5247829556465149, "learning_rate": 3.930131004366812e-07, "loss": 2.1591, "step": 306 }, { "epoch": 0.7634442026732982, "grad_norm": 0.508574366569519, "learning_rate": 3.925764192139738e-07, "loss": 2.1542, "step": 307 }, { "epoch": 0.7659309916070873, "grad_norm": 0.49671751260757446, "learning_rate": 3.9213973799126637e-07, "loss": 2.1469, "step": 308 }, { "epoch": 0.7684177805408766, "grad_norm": 0.5673956274986267, "learning_rate": 3.917030567685589e-07, "loss": 2.2209, "step": 309 }, { "epoch": 0.7709045694746658, "grad_norm": 0.4881182610988617, "learning_rate": 3.912663755458515e-07, "loss": 2.1353, "step": 310 }, { "epoch": 0.7733913584084551, "grad_norm": 0.5374391078948975, "learning_rate": 3.908296943231441e-07, "loss": 2.2102, "step": 311 }, { "epoch": 0.7758781473422444, "grad_norm": 0.520723283290863, "learning_rate": 3.9039301310043663e-07, "loss": 2.1716, "step": 312 }, { "epoch": 0.7783649362760335, "grad_norm": 0.5542478561401367, "learning_rate": 3.8995633187772927e-07, "loss": 2.2128, "step": 313 }, { "epoch": 0.7808517252098228, "grad_norm": 0.5180374979972839, "learning_rate": 3.895196506550218e-07, "loss": 2.1245, "step": 314 }, { "epoch": 0.7833385141436121, "grad_norm": 0.5454829931259155, "learning_rate": 3.890829694323144e-07, "loss": 2.1609, "step": 315 }, { "epoch": 0.7858253030774013, "grad_norm": 0.568573534488678, "learning_rate": 3.88646288209607e-07, "loss": 2.175, "step": 316 }, { "epoch": 0.7883120920111906, "grad_norm": 0.5162298679351807, "learning_rate": 3.8820960698689953e-07, "loss": 2.1474, "step": 317 }, { "epoch": 0.7907988809449797, "grad_norm": 0.5148350596427917, "learning_rate": 3.877729257641921e-07, "loss": 2.0697, "step": 318 }, { "epoch": 0.793285669878769, "grad_norm": 0.5210283994674683, "learning_rate": 3.873362445414847e-07, "loss": 2.1587, "step": 319 }, { "epoch": 0.7957724588125583, "grad_norm": 0.4845898151397705, "learning_rate": 3.8689956331877726e-07, "loss": 2.0581, "step": 320 }, { "epoch": 0.7982592477463475, "grad_norm": 0.5222198367118835, "learning_rate": 3.8646288209606985e-07, "loss": 2.0504, "step": 321 }, { "epoch": 0.8007460366801368, "grad_norm": 0.5040515065193176, "learning_rate": 3.860262008733624e-07, "loss": 2.0898, "step": 322 }, { "epoch": 0.8032328256139261, "grad_norm": 0.501930296421051, "learning_rate": 3.85589519650655e-07, "loss": 2.1335, "step": 323 }, { "epoch": 0.8057196145477152, "grad_norm": 0.5378695726394653, "learning_rate": 3.851528384279476e-07, "loss": 2.0953, "step": 324 }, { "epoch": 0.8082064034815045, "grad_norm": 0.49689194560050964, "learning_rate": 3.8471615720524016e-07, "loss": 2.0837, "step": 325 }, { "epoch": 0.8106931924152938, "grad_norm": 0.5431040525436401, "learning_rate": 3.8427947598253275e-07, "loss": 2.1151, "step": 326 }, { "epoch": 0.813179981349083, "grad_norm": 0.510339081287384, "learning_rate": 3.838427947598253e-07, "loss": 2.1399, "step": 327 }, { "epoch": 0.8156667702828723, "grad_norm": 0.5451592206954956, "learning_rate": 3.834061135371179e-07, "loss": 2.1182, "step": 328 }, { "epoch": 0.8181535592166614, "grad_norm": 0.5272311568260193, "learning_rate": 3.829694323144105e-07, "loss": 2.139, "step": 329 }, { "epoch": 0.8206403481504507, "grad_norm": 0.5389718413352966, "learning_rate": 3.82532751091703e-07, "loss": 2.1086, "step": 330 }, { "epoch": 0.82312713708424, "grad_norm": 0.5228806138038635, "learning_rate": 3.820960698689956e-07, "loss": 2.1435, "step": 331 }, { "epoch": 0.8256139260180292, "grad_norm": 0.5186501741409302, "learning_rate": 3.816593886462882e-07, "loss": 2.095, "step": 332 }, { "epoch": 0.8281007149518185, "grad_norm": 0.5484049320220947, "learning_rate": 3.8122270742358074e-07, "loss": 2.1666, "step": 333 }, { "epoch": 0.8305875038856078, "grad_norm": 0.5375939607620239, "learning_rate": 3.807860262008734e-07, "loss": 2.125, "step": 334 }, { "epoch": 0.8330742928193969, "grad_norm": 0.5139255523681641, "learning_rate": 3.803493449781659e-07, "loss": 2.075, "step": 335 }, { "epoch": 0.8355610817531862, "grad_norm": 0.5235688090324402, "learning_rate": 3.799126637554585e-07, "loss": 2.0731, "step": 336 }, { "epoch": 0.8380478706869754, "grad_norm": 0.5630027651786804, "learning_rate": 3.794759825327511e-07, "loss": 2.1261, "step": 337 }, { "epoch": 0.8405346596207647, "grad_norm": 0.547572910785675, "learning_rate": 3.7903930131004364e-07, "loss": 2.1641, "step": 338 }, { "epoch": 0.843021448554554, "grad_norm": 0.5533425211906433, "learning_rate": 3.7860262008733623e-07, "loss": 2.1293, "step": 339 }, { "epoch": 0.8455082374883431, "grad_norm": 0.5256425738334656, "learning_rate": 3.781659388646288e-07, "loss": 2.1397, "step": 340 }, { "epoch": 0.8479950264221324, "grad_norm": 0.5411325693130493, "learning_rate": 3.7772925764192136e-07, "loss": 2.1631, "step": 341 }, { "epoch": 0.8504818153559217, "grad_norm": 0.5244682431221008, "learning_rate": 3.7729257641921396e-07, "loss": 2.1655, "step": 342 }, { "epoch": 0.8529686042897109, "grad_norm": 0.5120859742164612, "learning_rate": 3.768558951965065e-07, "loss": 2.0929, "step": 343 }, { "epoch": 0.8554553932235002, "grad_norm": 0.5486117601394653, "learning_rate": 3.764192139737991e-07, "loss": 2.1333, "step": 344 }, { "epoch": 0.8579421821572893, "grad_norm": 0.5485012531280518, "learning_rate": 3.7598253275109173e-07, "loss": 2.1497, "step": 345 }, { "epoch": 0.8604289710910786, "grad_norm": 0.5423093438148499, "learning_rate": 3.7554585152838427e-07, "loss": 2.2169, "step": 346 }, { "epoch": 0.8629157600248679, "grad_norm": 0.5003622770309448, "learning_rate": 3.7510917030567686e-07, "loss": 2.1818, "step": 347 }, { "epoch": 0.8654025489586571, "grad_norm": 0.5931081771850586, "learning_rate": 3.746724890829694e-07, "loss": 2.1631, "step": 348 }, { "epoch": 0.8678893378924464, "grad_norm": 0.5221492052078247, "learning_rate": 3.74235807860262e-07, "loss": 2.1087, "step": 349 }, { "epoch": 0.8703761268262357, "grad_norm": 0.5065641403198242, "learning_rate": 3.737991266375546e-07, "loss": 2.0622, "step": 350 }, { "epoch": 0.8728629157600248, "grad_norm": 0.5329532027244568, "learning_rate": 3.733624454148471e-07, "loss": 2.1275, "step": 351 }, { "epoch": 0.8753497046938141, "grad_norm": 0.5383079648017883, "learning_rate": 3.729257641921397e-07, "loss": 2.1342, "step": 352 }, { "epoch": 0.8778364936276034, "grad_norm": 0.49477216601371765, "learning_rate": 3.724890829694323e-07, "loss": 2.0404, "step": 353 }, { "epoch": 0.8803232825613926, "grad_norm": 0.5197799205780029, "learning_rate": 3.7205240174672484e-07, "loss": 2.1228, "step": 354 }, { "epoch": 0.8828100714951819, "grad_norm": 0.5122123956680298, "learning_rate": 3.7161572052401744e-07, "loss": 2.1329, "step": 355 }, { "epoch": 0.885296860428971, "grad_norm": 0.5379232168197632, "learning_rate": 3.7117903930131003e-07, "loss": 2.0743, "step": 356 }, { "epoch": 0.8877836493627603, "grad_norm": 0.5164668560028076, "learning_rate": 3.707423580786026e-07, "loss": 2.1474, "step": 357 }, { "epoch": 0.8902704382965496, "grad_norm": 0.518368661403656, "learning_rate": 3.703056768558952e-07, "loss": 2.1987, "step": 358 }, { "epoch": 0.8927572272303388, "grad_norm": 0.5662968754768372, "learning_rate": 3.6986899563318775e-07, "loss": 2.1301, "step": 359 }, { "epoch": 0.8952440161641281, "grad_norm": 0.5161558389663696, "learning_rate": 3.6943231441048034e-07, "loss": 2.1033, "step": 360 }, { "epoch": 0.8977308050979174, "grad_norm": 0.5516855120658875, "learning_rate": 3.6899563318777293e-07, "loss": 2.1003, "step": 361 }, { "epoch": 0.9002175940317065, "grad_norm": 0.5291304588317871, "learning_rate": 3.6855895196506547e-07, "loss": 2.0533, "step": 362 }, { "epoch": 0.9027043829654958, "grad_norm": 0.5586827397346497, "learning_rate": 3.6812227074235806e-07, "loss": 2.1052, "step": 363 }, { "epoch": 0.905191171899285, "grad_norm": 0.5328514575958252, "learning_rate": 3.676855895196506e-07, "loss": 2.1548, "step": 364 }, { "epoch": 0.9076779608330743, "grad_norm": 0.5259972810745239, "learning_rate": 3.672489082969432e-07, "loss": 2.1101, "step": 365 }, { "epoch": 0.9101647497668636, "grad_norm": 0.5482295751571655, "learning_rate": 3.6681222707423584e-07, "loss": 2.1678, "step": 366 }, { "epoch": 0.9126515387006527, "grad_norm": 0.5381218194961548, "learning_rate": 3.663755458515284e-07, "loss": 2.2098, "step": 367 }, { "epoch": 0.915138327634442, "grad_norm": 0.5494764447212219, "learning_rate": 3.6593886462882097e-07, "loss": 2.1338, "step": 368 }, { "epoch": 0.9176251165682313, "grad_norm": 0.5393621921539307, "learning_rate": 3.655021834061135e-07, "loss": 2.0952, "step": 369 }, { "epoch": 0.9201119055020205, "grad_norm": 0.5395556092262268, "learning_rate": 3.650655021834061e-07, "loss": 2.1402, "step": 370 }, { "epoch": 0.9225986944358098, "grad_norm": 0.5069707632064819, "learning_rate": 3.646288209606987e-07, "loss": 2.0925, "step": 371 }, { "epoch": 0.9250854833695991, "grad_norm": 0.5580669641494751, "learning_rate": 3.6419213973799123e-07, "loss": 2.1585, "step": 372 }, { "epoch": 0.9275722723033882, "grad_norm": 0.5407446026802063, "learning_rate": 3.637554585152838e-07, "loss": 2.1448, "step": 373 }, { "epoch": 0.9300590612371775, "grad_norm": 0.5261268019676208, "learning_rate": 3.633187772925764e-07, "loss": 2.1687, "step": 374 }, { "epoch": 0.9325458501709667, "grad_norm": 0.5728645920753479, "learning_rate": 3.6288209606986895e-07, "loss": 2.0929, "step": 375 }, { "epoch": 0.935032639104756, "grad_norm": 0.536983072757721, "learning_rate": 3.6244541484716154e-07, "loss": 2.1669, "step": 376 }, { "epoch": 0.9375194280385453, "grad_norm": 0.5492017269134521, "learning_rate": 3.6200873362445414e-07, "loss": 2.1449, "step": 377 }, { "epoch": 0.9400062169723344, "grad_norm": 0.5745022296905518, "learning_rate": 3.6157205240174673e-07, "loss": 2.1315, "step": 378 }, { "epoch": 0.9424930059061237, "grad_norm": 0.5852669477462769, "learning_rate": 3.611353711790393e-07, "loss": 2.1405, "step": 379 }, { "epoch": 0.944979794839913, "grad_norm": 0.5169341564178467, "learning_rate": 3.6069868995633186e-07, "loss": 2.0788, "step": 380 }, { "epoch": 0.9474665837737022, "grad_norm": 0.5499164462089539, "learning_rate": 3.6026200873362445e-07, "loss": 2.1843, "step": 381 }, { "epoch": 0.9499533727074915, "grad_norm": 0.5195809006690979, "learning_rate": 3.59825327510917e-07, "loss": 2.1045, "step": 382 }, { "epoch": 0.9524401616412806, "grad_norm": 0.5368107557296753, "learning_rate": 3.593886462882096e-07, "loss": 2.1261, "step": 383 }, { "epoch": 0.9549269505750699, "grad_norm": 0.5721762776374817, "learning_rate": 3.5895196506550217e-07, "loss": 2.1323, "step": 384 }, { "epoch": 0.9574137395088592, "grad_norm": 0.5255040526390076, "learning_rate": 3.585152838427947e-07, "loss": 2.13, "step": 385 }, { "epoch": 0.9599005284426484, "grad_norm": 0.5373786687850952, "learning_rate": 3.580786026200873e-07, "loss": 2.0763, "step": 386 }, { "epoch": 0.9623873173764377, "grad_norm": 0.5432249307632446, "learning_rate": 3.576419213973799e-07, "loss": 2.1305, "step": 387 }, { "epoch": 0.964874106310227, "grad_norm": 0.5505443811416626, "learning_rate": 3.572052401746725e-07, "loss": 2.1311, "step": 388 }, { "epoch": 0.9673608952440161, "grad_norm": 0.5119839906692505, "learning_rate": 3.567685589519651e-07, "loss": 2.1121, "step": 389 }, { "epoch": 0.9698476841778054, "grad_norm": 0.5414577126502991, "learning_rate": 3.563318777292576e-07, "loss": 2.1076, "step": 390 }, { "epoch": 0.9723344731115946, "grad_norm": 0.5283794403076172, "learning_rate": 3.558951965065502e-07, "loss": 2.1293, "step": 391 }, { "epoch": 0.9748212620453839, "grad_norm": 0.5475645065307617, "learning_rate": 3.554585152838428e-07, "loss": 2.1373, "step": 392 }, { "epoch": 0.9773080509791732, "grad_norm": 0.5172975063323975, "learning_rate": 3.5502183406113534e-07, "loss": 2.1378, "step": 393 }, { "epoch": 0.9797948399129623, "grad_norm": 0.5674493312835693, "learning_rate": 3.5458515283842793e-07, "loss": 2.0797, "step": 394 }, { "epoch": 0.9822816288467516, "grad_norm": 0.510979950428009, "learning_rate": 3.541484716157205e-07, "loss": 2.091, "step": 395 }, { "epoch": 0.9847684177805409, "grad_norm": 0.5517850518226624, "learning_rate": 3.5371179039301306e-07, "loss": 2.1703, "step": 396 }, { "epoch": 0.9872552067143301, "grad_norm": 0.5487313270568848, "learning_rate": 3.5327510917030565e-07, "loss": 2.1213, "step": 397 }, { "epoch": 0.9897419956481194, "grad_norm": 0.5256079435348511, "learning_rate": 3.528384279475982e-07, "loss": 2.1052, "step": 398 }, { "epoch": 0.9922287845819087, "grad_norm": 0.5553068518638611, "learning_rate": 3.5240174672489084e-07, "loss": 2.0818, "step": 399 }, { "epoch": 0.9947155735156978, "grad_norm": 0.5434982180595398, "learning_rate": 3.5196506550218343e-07, "loss": 2.0612, "step": 400 }, { "epoch": 0.9972023624494871, "grad_norm": 0.5237376689910889, "learning_rate": 3.5152838427947597e-07, "loss": 2.1489, "step": 401 }, { "epoch": 0.9996891513832763, "grad_norm": 0.5455615520477295, "learning_rate": 3.5109170305676856e-07, "loss": 2.0467, "step": 402 }, { "epoch": 0.9996891513832763, "eval_loss": 2.129138708114624, "eval_runtime": 458.2461, "eval_samples_per_second": 1.065, "eval_steps_per_second": 0.266, "step": 402 } ], "logging_steps": 1, "max_steps": 1206, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1872324383890473e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }