{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.411764705882353, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.9999999999999997e-05, "loss": 4.4008, "step": 1 }, { "epoch": 0.05, "learning_rate": 5.9999999999999995e-05, "loss": 4.1205, "step": 2 }, { "epoch": 0.07, "learning_rate": 8.999999999999999e-05, "loss": 3.856, "step": 3 }, { "epoch": 0.09, "learning_rate": 0.00011999999999999999, "loss": 4.2166, "step": 4 }, { "epoch": 0.12, "learning_rate": 0.00015, "loss": 3.853, "step": 5 }, { "epoch": 0.14, "learning_rate": 0.00017999999999999998, "loss": 3.8786, "step": 6 }, { "epoch": 0.16, "learning_rate": 0.00020999999999999998, "loss": 3.7177, "step": 7 }, { "epoch": 0.19, "learning_rate": 0.00023999999999999998, "loss": 4.0304, "step": 8 }, { "epoch": 0.21, "learning_rate": 0.00027, "loss": 3.8415, "step": 9 }, { "epoch": 0.24, "learning_rate": 0.0003, "loss": 3.8809, "step": 10 }, { "epoch": 0.26, "learning_rate": 0.000299995596569254, "loss": 3.8165, "step": 11 }, { "epoch": 0.28, "learning_rate": 0.0002999823865355522, "loss": 3.672, "step": 12 }, { "epoch": 0.31, "learning_rate": 0.0002999603706744874, "loss": 3.6258, "step": 13 }, { "epoch": 0.33, "learning_rate": 0.00029992955027866394, "loss": 3.7398, "step": 14 }, { "epoch": 0.35, "learning_rate": 0.00029988992715762147, "loss": 3.7463, "step": 15 }, { "epoch": 0.38, "learning_rate": 0.000299841503637729, "loss": 3.5098, "step": 16 }, { "epoch": 0.4, "learning_rate": 0.0002997842825620479, "loss": 3.6305, "step": 17 }, { "epoch": 0.42, "learning_rate": 0.0002997182672901657, "loss": 3.4217, "step": 18 }, { "epoch": 0.45, "learning_rate": 0.00029964346169799786, "loss": 3.2482, "step": 19 }, { "epoch": 0.47, "learning_rate": 0.000299559870177561, "loss": 3.0741, "step": 20 }, { "epoch": 0.49, "learning_rate": 0.0002994674976367149, "loss": 3.8112, "step": 21 }, { "epoch": 0.52, "learning_rate": 0.0002993663494988739, "loss": 3.3994, "step": 22 }, { "epoch": 0.54, "learning_rate": 0.0002992564317026891, "loss": 3.2294, "step": 23 }, { "epoch": 0.56, "learning_rate": 0.00029913775070169893, "loss": 3.0507, "step": 24 }, { "epoch": 0.59, "learning_rate": 0.0002990103134639512, "loss": 3.3722, "step": 25 }, { "epoch": 0.61, "learning_rate": 0.0002988741274715932, "loss": 3.2097, "step": 26 }, { "epoch": 0.64, "learning_rate": 0.00029872920072043275, "loss": 3.4514, "step": 27 }, { "epoch": 0.66, "learning_rate": 0.00029857554171946863, "loss": 3.7536, "step": 28 }, { "epoch": 0.68, "learning_rate": 0.00029841315949039114, "loss": 3.5086, "step": 29 }, { "epoch": 0.71, "learning_rate": 0.0002982420635670523, "loss": 3.6062, "step": 30 }, { "epoch": 0.73, "learning_rate": 0.000298062263994906, "loss": 3.5467, "step": 31 }, { "epoch": 0.75, "learning_rate": 0.0002978737713304185, "loss": 3.6129, "step": 32 }, { "epoch": 0.78, "learning_rate": 0.0002976765966404484, "loss": 3.6464, "step": 33 }, { "epoch": 0.8, "learning_rate": 0.0002974707515015969, "loss": 3.4577, "step": 34 }, { "epoch": 0.82, "learning_rate": 0.0002972562479995282, "loss": 3.3455, "step": 35 }, { "epoch": 0.85, "learning_rate": 0.0002970330987282599, "loss": 3.6202, "step": 36 }, { "epoch": 0.87, "learning_rate": 0.0002968013167894234, "loss": 3.1719, "step": 37 }, { "epoch": 0.89, "learning_rate": 0.00029656091579149485, "loss": 3.2336, "step": 38 }, { "epoch": 0.92, "learning_rate": 0.0002963119098489964, "loss": 3.7547, "step": 39 }, { "epoch": 0.94, "learning_rate": 0.00029605431358166684, "loss": 3.419, "step": 40 }, { "epoch": 0.96, "learning_rate": 0.00029578814211360393, "loss": 3.5237, "step": 41 }, { "epoch": 0.99, "learning_rate": 0.00029551341107237597, "loss": 2.983, "step": 42 }, { "epoch": 1.01, "learning_rate": 0.00029523013658810444, "loss": 2.863, "step": 43 }, { "epoch": 1.04, "learning_rate": 0.00029493833529251707, "loss": 2.4656, "step": 44 }, { "epoch": 1.06, "learning_rate": 0.00029463802431797115, "loss": 2.1783, "step": 45 }, { "epoch": 1.08, "learning_rate": 0.0002943292212964476, "loss": 2.4046, "step": 46 }, { "epoch": 1.11, "learning_rate": 0.00029401194435851614, "loss": 2.4341, "step": 47 }, { "epoch": 1.13, "learning_rate": 0.00029368621213227044, "loss": 2.2513, "step": 48 }, { "epoch": 1.15, "learning_rate": 0.00029335204374223437, "loss": 2.2957, "step": 49 }, { "epoch": 1.18, "learning_rate": 0.00029300945880823956, "loss": 2.6274, "step": 50 }, { "epoch": 1.2, "learning_rate": 0.00029265847744427303, "loss": 2.2075, "step": 51 }, { "epoch": 1.22, "learning_rate": 0.00029229912025729646, "loss": 2.2498, "step": 52 }, { "epoch": 1.25, "learning_rate": 0.00029193140834603645, "loss": 2.3298, "step": 53 }, { "epoch": 1.27, "learning_rate": 0.0002915553632997454, "loss": 1.9839, "step": 54 }, { "epoch": 1.29, "learning_rate": 0.0002911710071969342, "loss": 1.9872, "step": 55 }, { "epoch": 1.32, "learning_rate": 0.000290778362604076, "loss": 2.1115, "step": 56 }, { "epoch": 1.34, "learning_rate": 0.0002903774525742811, "loss": 2.3718, "step": 57 }, { "epoch": 1.36, "learning_rate": 0.00028996830064594335, "loss": 2.5018, "step": 58 }, { "epoch": 1.39, "learning_rate": 0.0002895509308413587, "loss": 2.5214, "step": 59 }, { "epoch": 1.41, "learning_rate": 0.0002891253676653142, "loss": 2.3296, "step": 60 }, { "epoch": 1.44, "learning_rate": 0.0002886916361036494, "loss": 2.575, "step": 61 }, { "epoch": 1.46, "learning_rate": 0.0002882497616217896, "loss": 2.5662, "step": 62 }, { "epoch": 1.48, "learning_rate": 0.0002877997701632505, "loss": 2.4488, "step": 63 }, { "epoch": 1.51, "learning_rate": 0.0002873416881481151, "loss": 2.6527, "step": 64 }, { "epoch": 1.53, "learning_rate": 0.00028687554247148247, "loss": 2.2993, "step": 65 }, { "epoch": 1.55, "learning_rate": 0.0002864013605018887, "loss": 2.2061, "step": 66 }, { "epoch": 1.58, "learning_rate": 0.00028591917007969993, "loss": 2.398, "step": 67 }, { "epoch": 1.6, "learning_rate": 0.00028542899951547793, "loss": 2.4864, "step": 68 }, { "epoch": 1.62, "learning_rate": 0.0002849308775883178, "loss": 2.0955, "step": 69 }, { "epoch": 1.65, "learning_rate": 0.0002844248335441583, "loss": 2.1742, "step": 70 }, { "epoch": 1.67, "learning_rate": 0.00028391089709406484, "loss": 2.4512, "step": 71 }, { "epoch": 1.69, "learning_rate": 0.00028338909841248497, "loss": 2.4987, "step": 72 }, { "epoch": 1.72, "learning_rate": 0.0002828594681354768, "loss": 2.3906, "step": 73 }, { "epoch": 1.74, "learning_rate": 0.00028232203735891023, "loss": 2.5329, "step": 74 }, { "epoch": 1.76, "learning_rate": 0.00028177683763664133, "loss": 2.4044, "step": 75 }, { "epoch": 1.79, "learning_rate": 0.0002812239009786597, "loss": 2.3984, "step": 76 }, { "epoch": 1.81, "learning_rate": 0.00028066325984920916, "loss": 2.4416, "step": 77 }, { "epoch": 1.84, "learning_rate": 0.00028009494716488146, "loss": 2.4914, "step": 78 }, { "epoch": 1.86, "learning_rate": 0.00027951899629268385, "loss": 2.2867, "step": 79 }, { "epoch": 1.88, "learning_rate": 0.0002789354410480802, "loss": 2.2848, "step": 80 }, { "epoch": 1.91, "learning_rate": 0.0002783443156930051, "loss": 2.2625, "step": 81 }, { "epoch": 1.93, "learning_rate": 0.00027774565493385273, "loss": 2.4286, "step": 82 }, { "epoch": 1.95, "learning_rate": 0.0002771394939194392, "loss": 2.4524, "step": 83 }, { "epoch": 1.98, "learning_rate": 0.0002765258682389382, "loss": 2.3957, "step": 84 }, { "epoch": 2.0, "learning_rate": 0.0002759048139197925, "loss": 2.3202, "step": 85 }, { "epoch": 2.02, "learning_rate": 0.0002752763674255977, "loss": 1.6208, "step": 86 }, { "epoch": 2.05, "learning_rate": 0.000274640565653962, "loss": 1.6103, "step": 87 }, { "epoch": 2.07, "learning_rate": 0.00027399744593433986, "loss": 1.2998, "step": 88 }, { "epoch": 2.09, "learning_rate": 0.0002733470460258397, "loss": 1.4186, "step": 89 }, { "epoch": 2.12, "learning_rate": 0.00027268940411500763, "loss": 1.221, "step": 90 }, { "epoch": 2.14, "learning_rate": 0.00027202455881358514, "loss": 1.3517, "step": 91 }, { "epoch": 2.16, "learning_rate": 0.0002713525491562421, "loss": 1.4388, "step": 92 }, { "epoch": 2.19, "learning_rate": 0.000270673414598285, "loss": 1.301, "step": 93 }, { "epoch": 2.21, "learning_rate": 0.0002699871950133404, "loss": 1.4386, "step": 94 }, { "epoch": 2.24, "learning_rate": 0.000269293930691014, "loss": 1.399, "step": 95 }, { "epoch": 2.26, "learning_rate": 0.0002685936623345247, "loss": 1.4381, "step": 96 }, { "epoch": 2.28, "learning_rate": 0.0002678864310583154, "loss": 1.2775, "step": 97 }, { "epoch": 2.31, "learning_rate": 0.0002671722783856388, "loss": 1.4458, "step": 98 }, { "epoch": 2.33, "learning_rate": 0.00026645124624611927, "loss": 1.3532, "step": 99 }, { "epoch": 2.35, "learning_rate": 0.00026572337697329144, "loss": 1.2288, "step": 100 }, { "epoch": 2.38, "learning_rate": 0.0002649887133021144, "loss": 1.4598, "step": 101 }, { "epoch": 2.4, "learning_rate": 0.0002642472983664628, "loss": 1.4812, "step": 102 }, { "epoch": 2.42, "learning_rate": 0.00026349917569659426, "loss": 1.3315, "step": 103 }, { "epoch": 2.45, "learning_rate": 0.0002627443892165937, "loss": 1.4532, "step": 104 }, { "epoch": 2.47, "learning_rate": 0.00026198298324179437, "loss": 1.3674, "step": 105 }, { "epoch": 2.49, "learning_rate": 0.000261215002476176, "loss": 1.2486, "step": 106 }, { "epoch": 2.52, "learning_rate": 0.00026044049200974004, "loss": 1.2998, "step": 107 }, { "epoch": 2.54, "learning_rate": 0.00025965949731586257, "loss": 1.4161, "step": 108 }, { "epoch": 2.56, "learning_rate": 0.0002588720642486242, "loss": 1.4535, "step": 109 }, { "epoch": 2.59, "learning_rate": 0.000258078239040118, "loss": 1.2735, "step": 110 }, { "epoch": 2.61, "learning_rate": 0.0002572780682977351, "loss": 1.2421, "step": 111 }, { "epoch": 2.64, "learning_rate": 0.000256471599001428, "loss": 1.4072, "step": 112 }, { "epoch": 2.66, "learning_rate": 0.0002556588785009528, "loss": 1.4291, "step": 113 }, { "epoch": 2.68, "learning_rate": 0.0002548399545130886, "loss": 1.3882, "step": 114 }, { "epoch": 2.71, "learning_rate": 0.0002540148751188362, "loss": 1.6521, "step": 115 }, { "epoch": 2.73, "learning_rate": 0.00025318368876059546, "loss": 1.3777, "step": 116 }, { "epoch": 2.75, "learning_rate": 0.0002523464442393204, "loss": 1.3828, "step": 117 }, { "epoch": 2.78, "learning_rate": 0.0002515031907116547, "loss": 1.5907, "step": 118 }, { "epoch": 2.8, "learning_rate": 0.0002506539776870451, "loss": 1.2091, "step": 119 }, { "epoch": 2.82, "learning_rate": 0.00024979885502483476, "loss": 1.3796, "step": 120 }, { "epoch": 2.85, "learning_rate": 0.0002489378729313361, "loss": 1.5818, "step": 121 }, { "epoch": 2.87, "learning_rate": 0.00024807108195688273, "loss": 1.4737, "step": 122 }, { "epoch": 2.89, "learning_rate": 0.0002471985329928617, "loss": 1.4355, "step": 123 }, { "epoch": 2.92, "learning_rate": 0.00024632027726872535, "loss": 1.2755, "step": 124 }, { "epoch": 2.94, "learning_rate": 0.00024543636634898394, "loss": 1.3296, "step": 125 }, { "epoch": 2.96, "learning_rate": 0.00024454685213017767, "loss": 1.3869, "step": 126 }, { "epoch": 2.99, "learning_rate": 0.00024365178683783008, "loss": 1.2251, "step": 127 }, { "epoch": 3.01, "learning_rate": 0.00024275122302338143, "loss": 1.2315, "step": 128 }, { "epoch": 3.04, "learning_rate": 0.00024184521356110367, "loss": 0.8248, "step": 129 }, { "epoch": 3.06, "learning_rate": 0.00024093381164499568, "loss": 0.6754, "step": 130 }, { "epoch": 3.08, "learning_rate": 0.0002400170707856605, "loss": 0.8092, "step": 131 }, { "epoch": 3.11, "learning_rate": 0.00023909504480716317, "loss": 0.8068, "step": 132 }, { "epoch": 3.13, "learning_rate": 0.00023816778784387094, "loss": 0.7371, "step": 133 }, { "epoch": 3.15, "learning_rate": 0.00023723535433727485, "loss": 0.8762, "step": 134 }, { "epoch": 3.18, "learning_rate": 0.0002362977990327931, "loss": 0.4843, "step": 135 }, { "epoch": 3.2, "learning_rate": 0.00023535517697655708, "loss": 0.662, "step": 136 }, { "epoch": 3.22, "learning_rate": 0.0002344075435121794, "loss": 0.7358, "step": 137 }, { "epoch": 3.25, "learning_rate": 0.0002334549542775045, "loss": 0.7616, "step": 138 }, { "epoch": 3.27, "learning_rate": 0.00023249746520134201, "loss": 0.6971, "step": 139 }, { "epoch": 3.29, "learning_rate": 0.00023153513250018316, "loss": 0.9456, "step": 140 }, { "epoch": 3.32, "learning_rate": 0.00023056801267489995, "loss": 0.7159, "step": 141 }, { "epoch": 3.34, "learning_rate": 0.000229596162507428, "loss": 0.7363, "step": 142 }, { "epoch": 3.36, "learning_rate": 0.00022861963905743281, "loss": 0.7258, "step": 143 }, { "epoch": 3.39, "learning_rate": 0.00022763849965895942, "loss": 0.6632, "step": 144 }, { "epoch": 3.41, "learning_rate": 0.00022665280191706653, "loss": 0.6326, "step": 145 }, { "epoch": 3.44, "learning_rate": 0.00022566260370444395, "loss": 0.6292, "step": 146 }, { "epoch": 3.46, "learning_rate": 0.00022466796315801508, "loss": 0.6591, "step": 147 }, { "epoch": 3.48, "learning_rate": 0.00022366893867552346, "loss": 0.7075, "step": 148 }, { "epoch": 3.51, "learning_rate": 0.00022266558891210402, "loss": 0.6943, "step": 149 }, { "epoch": 3.53, "learning_rate": 0.0002216579727768394, "loss": 0.6478, "step": 150 }, { "epoch": 3.55, "learning_rate": 0.00022064614942930122, "loss": 0.6094, "step": 151 }, { "epoch": 3.58, "learning_rate": 0.00021963017827607666, "loss": 0.6405, "step": 152 }, { "epoch": 3.6, "learning_rate": 0.00021861011896728052, "loss": 0.6633, "step": 153 }, { "epoch": 3.62, "learning_rate": 0.00021758603139305314, "loss": 0.6685, "step": 154 }, { "epoch": 3.65, "learning_rate": 0.00021655797568004395, "loss": 0.6076, "step": 155 }, { "epoch": 3.67, "learning_rate": 0.00021552601218788146, "loss": 0.5769, "step": 156 }, { "epoch": 3.69, "learning_rate": 0.00021449020150562928, "loss": 0.6439, "step": 157 }, { "epoch": 3.72, "learning_rate": 0.00021345060444822879, "loss": 0.6128, "step": 158 }, { "epoch": 3.74, "learning_rate": 0.00021240728205292863, "loss": 0.723, "step": 159 }, { "epoch": 3.76, "learning_rate": 0.000211360295575701, "loss": 0.5633, "step": 160 }, { "epoch": 3.79, "learning_rate": 0.00021030970648764505, "loss": 0.7373, "step": 161 }, { "epoch": 3.81, "learning_rate": 0.0002092555764713781, "loss": 0.5987, "step": 162 }, { "epoch": 3.84, "learning_rate": 0.00020819796741741375, "loss": 0.7485, "step": 163 }, { "epoch": 3.86, "learning_rate": 0.00020713694142052838, "loss": 0.8548, "step": 164 }, { "epoch": 3.88, "learning_rate": 0.00020607256077611528, "loss": 0.6598, "step": 165 }, { "epoch": 3.91, "learning_rate": 0.0002050048879765272, "loss": 0.676, "step": 166 }, { "epoch": 3.93, "learning_rate": 0.00020393398570740716, "loss": 0.6663, "step": 167 }, { "epoch": 3.95, "learning_rate": 0.00020285991684400827, "loss": 0.7429, "step": 168 }, { "epoch": 3.98, "learning_rate": 0.00020178274444750187, "loss": 0.6373, "step": 169 }, { "epoch": 4.0, "learning_rate": 0.0002007025317612754, "loss": 0.6915, "step": 170 }, { "epoch": 4.02, "learning_rate": 0.00019961934220721883, "loss": 0.322, "step": 171 }, { "epoch": 4.05, "learning_rate": 0.00019853323938200134, "loss": 0.333, "step": 172 }, { "epoch": 4.07, "learning_rate": 0.00019744428705333728, "loss": 0.3354, "step": 173 }, { "epoch": 4.09, "learning_rate": 0.0001963525491562421, "loss": 0.3078, "step": 174 }, { "epoch": 4.12, "learning_rate": 0.00019525808978927887, "loss": 0.3663, "step": 175 }, { "epoch": 4.14, "learning_rate": 0.00019416097321079448, "loss": 0.3445, "step": 176 }, { "epoch": 4.16, "learning_rate": 0.00019306126383514737, "loss": 0.3903, "step": 177 }, { "epoch": 4.19, "learning_rate": 0.00019195902622892518, "loss": 0.2701, "step": 178 }, { "epoch": 4.21, "learning_rate": 0.0001908543251071541, "loss": 0.3602, "step": 179 }, { "epoch": 4.24, "learning_rate": 0.00018974722532949927, "loss": 0.381, "step": 180 }, { "epoch": 4.26, "learning_rate": 0.00018863779189645666, "loss": 0.3546, "step": 181 }, { "epoch": 4.28, "learning_rate": 0.00018752608994553678, "loss": 0.3365, "step": 182 }, { "epoch": 4.31, "learning_rate": 0.00018641218474744039, "loss": 0.3302, "step": 183 }, { "epoch": 4.33, "learning_rate": 0.0001852961417022261, "loss": 0.2817, "step": 184 }, { "epoch": 4.35, "learning_rate": 0.00018417802633547065, "loss": 0.3544, "step": 185 }, { "epoch": 4.38, "learning_rate": 0.00018305790429442182, "loss": 0.3249, "step": 186 }, { "epoch": 4.4, "learning_rate": 0.0001819358413441441, "loss": 0.354, "step": 187 }, { "epoch": 4.42, "learning_rate": 0.00018081190336365744, "loss": 0.2666, "step": 188 }, { "epoch": 4.45, "learning_rate": 0.00017968615634206928, "loss": 0.2664, "step": 189 }, { "epoch": 4.47, "learning_rate": 0.00017855866637470023, "loss": 0.37, "step": 190 }, { "epoch": 4.49, "learning_rate": 0.0001774294996592035, "loss": 0.3381, "step": 191 }, { "epoch": 4.52, "learning_rate": 0.00017629872249167816, "loss": 0.3329, "step": 192 }, { "epoch": 4.54, "learning_rate": 0.0001751664012627768, "loss": 0.2704, "step": 193 }, { "epoch": 4.56, "learning_rate": 0.00017403260245380762, "loss": 0.3881, "step": 194 }, { "epoch": 4.59, "learning_rate": 0.00017289739263283115, "loss": 0.4599, "step": 195 }, { "epoch": 4.61, "learning_rate": 0.00017176083845075172, "loss": 0.2881, "step": 196 }, { "epoch": 4.64, "learning_rate": 0.0001706230066374044, "loss": 0.343, "step": 197 }, { "epoch": 4.66, "learning_rate": 0.00016948396399763704, "loss": 0.4128, "step": 198 }, { "epoch": 4.68, "learning_rate": 0.0001683437774073881, "loss": 0.2909, "step": 199 }, { "epoch": 4.71, "learning_rate": 0.00016720251380976007, "loss": 0.4494, "step": 200 }, { "epoch": 4.73, "learning_rate": 0.0001660602402110891, "loss": 0.2766, "step": 201 }, { "epoch": 4.75, "learning_rate": 0.00016491702367701103, "loss": 0.3148, "step": 202 }, { "epoch": 4.78, "learning_rate": 0.0001637729313285237, "loss": 0.295, "step": 203 }, { "epoch": 4.8, "learning_rate": 0.00016262803033804604, "loss": 0.3496, "step": 204 }, { "epoch": 4.82, "learning_rate": 0.0001614823879254744, "loss": 0.2802, "step": 205 }, { "epoch": 4.85, "learning_rate": 0.0001603360713542356, "loss": 0.349, "step": 206 }, { "epoch": 4.87, "learning_rate": 0.0001591891479273383, "loss": 0.5714, "step": 207 }, { "epoch": 4.89, "learning_rate": 0.00015804168498342083, "loss": 0.3354, "step": 208 }, { "epoch": 4.92, "learning_rate": 0.00015689374989279797, "loss": 0.3117, "step": 209 }, { "epoch": 4.94, "learning_rate": 0.0001557454100535053, "loss": 0.3284, "step": 210 }, { "epoch": 4.96, "learning_rate": 0.0001545967328873423, "loss": 0.4018, "step": 211 }, { "epoch": 4.99, "learning_rate": 0.00015344778583591356, "loss": 0.33, "step": 212 }, { "epoch": 5.01, "learning_rate": 0.00015229863635666944, "loss": 0.2153, "step": 213 }, { "epoch": 5.04, "learning_rate": 0.00015114935191894524, "loss": 0.1414, "step": 214 }, { "epoch": 5.06, "learning_rate": 0.00015, "loss": 0.1691, "step": 215 }, { "epoch": 5.08, "learning_rate": 0.00014885064808105476, "loss": 0.1569, "step": 216 }, { "epoch": 5.11, "learning_rate": 0.00014770136364333054, "loss": 0.1469, "step": 217 }, { "epoch": 5.13, "learning_rate": 0.00014655221416408644, "loss": 0.1712, "step": 218 }, { "epoch": 5.15, "learning_rate": 0.00014540326711265768, "loss": 0.1795, "step": 219 }, { "epoch": 5.18, "learning_rate": 0.0001442545899464947, "loss": 0.2716, "step": 220 }, { "epoch": 5.2, "learning_rate": 0.00014310625010720203, "loss": 0.112, "step": 221 }, { "epoch": 5.22, "learning_rate": 0.00014195831501657917, "loss": 0.2199, "step": 222 }, { "epoch": 5.25, "learning_rate": 0.0001408108520726617, "loss": 0.1946, "step": 223 }, { "epoch": 5.27, "learning_rate": 0.0001396639286457644, "loss": 0.1926, "step": 224 }, { "epoch": 5.29, "learning_rate": 0.00013851761207452564, "loss": 0.2176, "step": 225 }, { "epoch": 5.32, "learning_rate": 0.00013737196966195393, "loss": 0.1611, "step": 226 }, { "epoch": 5.34, "learning_rate": 0.00013622706867147627, "loss": 0.1369, "step": 227 }, { "epoch": 5.36, "learning_rate": 0.00013508297632298892, "loss": 0.1554, "step": 228 }, { "epoch": 5.39, "learning_rate": 0.00013393975978891087, "loss": 0.1581, "step": 229 }, { "epoch": 5.41, "learning_rate": 0.00013279748619023993, "loss": 0.1599, "step": 230 }, { "epoch": 5.44, "learning_rate": 0.00013165622259261187, "loss": 0.2309, "step": 231 }, { "epoch": 5.46, "learning_rate": 0.0001305160360023629, "loss": 0.1977, "step": 232 }, { "epoch": 5.48, "learning_rate": 0.00012937699336259555, "loss": 0.1367, "step": 233 }, { "epoch": 5.51, "learning_rate": 0.00012823916154924825, "loss": 0.1378, "step": 234 }, { "epoch": 5.53, "learning_rate": 0.0001271026073671688, "loss": 0.2091, "step": 235 }, { "epoch": 5.55, "learning_rate": 0.0001259673975461923, "loss": 0.1367, "step": 236 }, { "epoch": 5.58, "learning_rate": 0.0001248335987372232, "loss": 0.1847, "step": 237 }, { "epoch": 5.6, "learning_rate": 0.0001237012775083218, "loss": 0.1688, "step": 238 }, { "epoch": 5.62, "learning_rate": 0.00012257050034079645, "loss": 0.149, "step": 239 }, { "epoch": 5.65, "learning_rate": 0.00012144133362529971, "loss": 0.1458, "step": 240 }, { "epoch": 5.67, "learning_rate": 0.00012031384365793073, "loss": 0.1646, "step": 241 }, { "epoch": 5.69, "learning_rate": 0.00011918809663634253, "loss": 0.1478, "step": 242 }, { "epoch": 5.72, "learning_rate": 0.00011806415865585587, "loss": 0.1426, "step": 243 }, { "epoch": 5.74, "learning_rate": 0.0001169420957055782, "loss": 0.1442, "step": 244 }, { "epoch": 5.76, "learning_rate": 0.00011582197366452939, "loss": 0.1989, "step": 245 }, { "epoch": 5.79, "learning_rate": 0.00011470385829777393, "loss": 0.2061, "step": 246 }, { "epoch": 5.81, "learning_rate": 0.00011358781525255963, "loss": 0.1528, "step": 247 }, { "epoch": 5.84, "learning_rate": 0.00011247391005446323, "loss": 0.1925, "step": 248 }, { "epoch": 5.86, "learning_rate": 0.00011136220810354336, "loss": 0.252, "step": 249 }, { "epoch": 5.88, "learning_rate": 0.00011025277467050076, "loss": 0.1626, "step": 250 }, { "epoch": 5.91, "learning_rate": 0.00010914567489284591, "loss": 0.2026, "step": 251 }, { "epoch": 5.93, "learning_rate": 0.00010804097377107482, "loss": 0.1206, "step": 252 }, { "epoch": 5.95, "learning_rate": 0.00010693873616485264, "loss": 0.1395, "step": 253 }, { "epoch": 5.98, "learning_rate": 0.00010583902678920553, "loss": 0.1204, "step": 254 }, { "epoch": 6.0, "learning_rate": 0.00010474191021072114, "loss": 0.1886, "step": 255 }, { "epoch": 6.02, "learning_rate": 0.0001036474508437579, "loss": 0.0958, "step": 256 }, { "epoch": 6.05, "learning_rate": 0.00010255571294666272, "loss": 0.1227, "step": 257 }, { "epoch": 6.07, "learning_rate": 0.00010146676061799863, "loss": 0.1148, "step": 258 }, { "epoch": 6.09, "learning_rate": 0.00010038065779278117, "loss": 0.0816, "step": 259 }, { "epoch": 6.12, "learning_rate": 9.92974682387246e-05, "loss": 0.1013, "step": 260 }, { "epoch": 6.14, "learning_rate": 9.82172555524981e-05, "loss": 0.0973, "step": 261 }, { "epoch": 6.16, "learning_rate": 9.714008315599173e-05, "loss": 0.0757, "step": 262 }, { "epoch": 6.19, "learning_rate": 9.606601429259282e-05, "loss": 0.0977, "step": 263 }, { "epoch": 6.21, "learning_rate": 9.499511202347281e-05, "loss": 0.0735, "step": 264 }, { "epoch": 6.24, "learning_rate": 9.392743922388468e-05, "loss": 0.0987, "step": 265 }, { "epoch": 6.26, "learning_rate": 9.286305857947158e-05, "loss": 0.1316, "step": 266 }, { "epoch": 6.28, "learning_rate": 9.180203258258622e-05, "loss": 0.0973, "step": 267 }, { "epoch": 6.31, "learning_rate": 9.074442352862188e-05, "loss": 0.0753, "step": 268 }, { "epoch": 6.33, "learning_rate": 8.969029351235493e-05, "loss": 0.0806, "step": 269 }, { "epoch": 6.35, "learning_rate": 8.8639704424299e-05, "loss": 0.088, "step": 270 }, { "epoch": 6.38, "learning_rate": 8.759271794707134e-05, "loss": 0.0691, "step": 271 }, { "epoch": 6.4, "learning_rate": 8.654939555177119e-05, "loss": 0.0884, "step": 272 }, { "epoch": 6.42, "learning_rate": 8.550979849437068e-05, "loss": 0.0778, "step": 273 }, { "epoch": 6.45, "learning_rate": 8.447398781211854e-05, "loss": 0.0578, "step": 274 }, { "epoch": 6.47, "learning_rate": 8.344202431995602e-05, "loss": 0.0759, "step": 275 }, { "epoch": 6.49, "learning_rate": 8.24139686069469e-05, "loss": 0.1102, "step": 276 }, { "epoch": 6.52, "learning_rate": 8.138988103271947e-05, "loss": 0.0947, "step": 277 }, { "epoch": 6.54, "learning_rate": 8.03698217239233e-05, "loss": 0.1158, "step": 278 }, { "epoch": 6.56, "learning_rate": 7.935385057069874e-05, "loss": 0.0978, "step": 279 }, { "epoch": 6.59, "learning_rate": 7.834202722316054e-05, "loss": 0.0948, "step": 280 }, { "epoch": 6.61, "learning_rate": 7.733441108789596e-05, "loss": 0.073, "step": 281 }, { "epoch": 6.64, "learning_rate": 7.63310613244765e-05, "loss": 0.1076, "step": 282 }, { "epoch": 6.66, "learning_rate": 7.53320368419849e-05, "loss": 0.1503, "step": 283 }, { "epoch": 6.68, "learning_rate": 7.433739629555601e-05, "loss": 0.0844, "step": 284 }, { "epoch": 6.71, "learning_rate": 7.334719808293341e-05, "loss": 0.0836, "step": 285 }, { "epoch": 6.73, "learning_rate": 7.236150034104052e-05, "loss": 0.1257, "step": 286 }, { "epoch": 6.75, "learning_rate": 7.138036094256716e-05, "loss": 0.0872, "step": 287 }, { "epoch": 6.78, "learning_rate": 7.040383749257198e-05, "loss": 0.0813, "step": 288 }, { "epoch": 6.8, "learning_rate": 6.943198732510002e-05, "loss": 0.1105, "step": 289 }, { "epoch": 6.82, "learning_rate": 6.846486749981684e-05, "loss": 0.1228, "step": 290 }, { "epoch": 6.85, "learning_rate": 6.750253479865795e-05, "loss": 0.0875, "step": 291 }, { "epoch": 6.87, "learning_rate": 6.654504572249551e-05, "loss": 0.0858, "step": 292 }, { "epoch": 6.89, "learning_rate": 6.559245648782064e-05, "loss": 0.0841, "step": 293 }, { "epoch": 6.92, "learning_rate": 6.464482302344295e-05, "loss": 0.0964, "step": 294 }, { "epoch": 6.94, "learning_rate": 6.370220096720691e-05, "loss": 0.0731, "step": 295 }, { "epoch": 6.96, "learning_rate": 6.276464566272519e-05, "loss": 0.091, "step": 296 }, { "epoch": 6.99, "learning_rate": 6.183221215612904e-05, "loss": 0.0842, "step": 297 }, { "epoch": 7.01, "learning_rate": 6.0904955192836867e-05, "loss": 0.0818, "step": 298 }, { "epoch": 7.04, "learning_rate": 5.998292921433952e-05, "loss": 0.0631, "step": 299 }, { "epoch": 7.06, "learning_rate": 5.9066188355004337e-05, "loss": 0.0614, "step": 300 }, { "epoch": 7.08, "learning_rate": 5.815478643889635e-05, "loss": 0.0701, "step": 301 }, { "epoch": 7.11, "learning_rate": 5.724877697661855e-05, "loss": 0.0588, "step": 302 }, { "epoch": 7.13, "learning_rate": 5.634821316216995e-05, "loss": 0.0406, "step": 303 }, { "epoch": 7.15, "learning_rate": 5.545314786982229e-05, "loss": 0.0629, "step": 304 }, { "epoch": 7.18, "learning_rate": 5.4563633651016056e-05, "loss": 0.0495, "step": 305 }, { "epoch": 7.2, "learning_rate": 5.367972273127461e-05, "loss": 0.0789, "step": 306 }, { "epoch": 7.22, "learning_rate": 5.280146700713833e-05, "loss": 0.0552, "step": 307 }, { "epoch": 7.25, "learning_rate": 5.1928918043117236e-05, "loss": 0.0682, "step": 308 }, { "epoch": 7.27, "learning_rate": 5.106212706866384e-05, "loss": 0.0635, "step": 309 }, { "epoch": 7.29, "learning_rate": 5.020114497516521e-05, "loss": 0.0635, "step": 310 }, { "epoch": 7.32, "learning_rate": 4.9346022312954915e-05, "loss": 0.0823, "step": 311 }, { "epoch": 7.34, "learning_rate": 4.8496809288345314e-05, "loss": 0.0463, "step": 312 }, { "epoch": 7.36, "learning_rate": 4.7653555760679555e-05, "loss": 0.0828, "step": 313 }, { "epoch": 7.39, "learning_rate": 4.6816311239404556e-05, "loss": 0.0554, "step": 314 }, { "epoch": 7.41, "learning_rate": 4.5985124881163754e-05, "loss": 0.0655, "step": 315 }, { "epoch": 7.44, "learning_rate": 4.5160045486911405e-05, "loss": 0.0596, "step": 316 }, { "epoch": 7.46, "learning_rate": 4.434112149904721e-05, "loss": 0.048, "step": 317 }, { "epoch": 7.48, "learning_rate": 4.352840099857195e-05, "loss": 0.0675, "step": 318 }, { "epoch": 7.51, "learning_rate": 4.272193170226492e-05, "loss": 0.0507, "step": 319 }, { "epoch": 7.53, "learning_rate": 4.1921760959881954e-05, "loss": 0.0804, "step": 320 }, { "epoch": 7.55, "learning_rate": 4.112793575137575e-05, "loss": 0.0617, "step": 321 }, { "epoch": 7.58, "learning_rate": 4.0340502684137436e-05, "loss": 0.0573, "step": 322 }, { "epoch": 7.6, "learning_rate": 3.9559507990259956e-05, "loss": 0.0554, "step": 323 }, { "epoch": 7.62, "learning_rate": 3.878499752382404e-05, "loss": 0.0531, "step": 324 }, { "epoch": 7.65, "learning_rate": 3.8017016758205594e-05, "loss": 0.0409, "step": 325 }, { "epoch": 7.67, "learning_rate": 3.7255610783406275e-05, "loss": 0.0441, "step": 326 }, { "epoch": 7.69, "learning_rate": 3.6500824303405704e-05, "loss": 0.0663, "step": 327 }, { "epoch": 7.72, "learning_rate": 3.575270163353717e-05, "loss": 0.0656, "step": 328 }, { "epoch": 7.74, "learning_rate": 3.501128669788561e-05, "loss": 0.0688, "step": 329 }, { "epoch": 7.76, "learning_rate": 3.427662302670855e-05, "loss": 0.057, "step": 330 }, { "epoch": 7.79, "learning_rate": 3.3548753753880734e-05, "loss": 0.0624, "step": 331 }, { "epoch": 7.81, "learning_rate": 3.282772161436119e-05, "loss": 0.0657, "step": 332 }, { "epoch": 7.84, "learning_rate": 3.211356894168459e-05, "loss": 0.0617, "step": 333 }, { "epoch": 7.86, "learning_rate": 3.14063376654753e-05, "loss": 0.0823, "step": 334 }, { "epoch": 7.88, "learning_rate": 3.070606930898602e-05, "loss": 0.0407, "step": 335 }, { "epoch": 7.91, "learning_rate": 3.0012804986659565e-05, "loss": 0.0615, "step": 336 }, { "epoch": 7.93, "learning_rate": 2.9326585401714974e-05, "loss": 0.0636, "step": 337 }, { "epoch": 7.95, "learning_rate": 2.8647450843757897e-05, "loss": 0.076, "step": 338 }, { "epoch": 7.98, "learning_rate": 2.7975441186414834e-05, "loss": 0.0761, "step": 339 }, { "epoch": 8.0, "learning_rate": 2.7310595884992355e-05, "loss": 0.0528, "step": 340 }, { "epoch": 8.02, "learning_rate": 2.665295397416029e-05, "loss": 0.0412, "step": 341 }, { "epoch": 8.05, "learning_rate": 2.6002554065660098e-05, "loss": 0.0491, "step": 342 }, { "epoch": 8.07, "learning_rate": 2.5359434346037915e-05, "loss": 0.0361, "step": 343 }, { "epoch": 8.09, "learning_rate": 2.4723632574402317e-05, "loss": 0.0482, "step": 344 }, { "epoch": 8.12, "learning_rate": 2.4095186080207502e-05, "loss": 0.0622, "step": 345 }, { "epoch": 8.14, "learning_rate": 2.347413176106178e-05, "loss": 0.0431, "step": 346 }, { "epoch": 8.16, "learning_rate": 2.2860506080560835e-05, "loss": 0.0581, "step": 347 }, { "epoch": 8.19, "learning_rate": 2.2254345066147243e-05, "loss": 0.0477, "step": 348 }, { "epoch": 8.21, "learning_rate": 2.165568430699493e-05, "loss": 0.0791, "step": 349 }, { "epoch": 8.24, "learning_rate": 2.106455895191985e-05, "loss": 0.0478, "step": 350 }, { "epoch": 8.26, "learning_rate": 2.0481003707316134e-05, "loss": 0.0585, "step": 351 }, { "epoch": 8.28, "learning_rate": 1.9905052835118533e-05, "loss": 0.0575, "step": 352 }, { "epoch": 8.31, "learning_rate": 1.933674015079083e-05, "loss": 0.0403, "step": 353 }, { "epoch": 8.33, "learning_rate": 1.8776099021340245e-05, "loss": 0.0505, "step": 354 }, { "epoch": 8.35, "learning_rate": 1.8223162363358667e-05, "loss": 0.0468, "step": 355 }, { "epoch": 8.38, "learning_rate": 1.767796264108977e-05, "loss": 0.0353, "step": 356 }, { "epoch": 8.4, "learning_rate": 1.71405318645232e-05, "loss": 0.0687, "step": 357 }, { "epoch": 8.42, "learning_rate": 1.6610901587514995e-05, "loss": 0.0425, "step": 358 }, { "epoch": 8.45, "learning_rate": 1.6089102905935107e-05, "loss": 0.0593, "step": 359 }, { "epoch": 8.47, "learning_rate": 1.5575166455841677e-05, "loss": 0.055, "step": 360 }, { "epoch": 8.49, "learning_rate": 1.5069122411682188e-05, "loss": 0.0432, "step": 361 }, { "epoch": 8.52, "learning_rate": 1.4571000484522055e-05, "loss": 0.04, "step": 362 }, { "epoch": 8.54, "learning_rate": 1.4080829920300047e-05, "loss": 0.0392, "step": 363 }, { "epoch": 8.56, "learning_rate": 1.359863949811127e-05, "loss": 0.0387, "step": 364 }, { "epoch": 8.59, "learning_rate": 1.3124457528517502e-05, "loss": 0.0484, "step": 365 }, { "epoch": 8.61, "learning_rate": 1.265831185188489e-05, "loss": 0.0419, "step": 366 }, { "epoch": 8.64, "learning_rate": 1.220022983674952e-05, "loss": 0.0315, "step": 367 }, { "epoch": 8.66, "learning_rate": 1.1750238378210425e-05, "loss": 0.2463, "step": 368 }, { "epoch": 8.68, "learning_rate": 1.1308363896350625e-05, "loss": 0.0532, "step": 369 }, { "epoch": 8.71, "learning_rate": 1.0874632334685806e-05, "loss": 0.0404, "step": 370 }, { "epoch": 8.73, "learning_rate": 1.0449069158641238e-05, "loss": 0.0376, "step": 371 }, { "epoch": 8.75, "learning_rate": 1.0031699354056616e-05, "loss": 0.0484, "step": 372 }, { "epoch": 8.78, "learning_rate": 9.622547425718924e-06, "loss": 0.0421, "step": 373 }, { "epoch": 8.8, "learning_rate": 9.22163739592398e-06, "loss": 0.0523, "step": 374 }, { "epoch": 8.82, "learning_rate": 8.82899280306577e-06, "loss": 0.053, "step": 375 }, { "epoch": 8.85, "learning_rate": 8.444636700254598e-06, "loss": 0.054, "step": 376 }, { "epoch": 8.87, "learning_rate": 8.068591653963535e-06, "loss": 0.0476, "step": 377 }, { "epoch": 8.89, "learning_rate": 7.700879742703486e-06, "loss": 0.0482, "step": 378 }, { "epoch": 8.92, "learning_rate": 7.34152255572697e-06, "loss": 0.0542, "step": 379 }, { "epoch": 8.94, "learning_rate": 6.990541191760418e-06, "loss": 0.0458, "step": 380 }, { "epoch": 8.96, "learning_rate": 6.647956257765585e-06, "loss": 0.0402, "step": 381 }, { "epoch": 8.99, "learning_rate": 6.3137878677295306e-06, "loss": 0.0567, "step": 382 }, { "epoch": 9.01, "learning_rate": 5.988055641483796e-06, "loss": 0.0475, "step": 383 }, { "epoch": 9.04, "learning_rate": 5.670778703552348e-06, "loss": 0.0444, "step": 384 }, { "epoch": 9.06, "learning_rate": 5.361975682028852e-06, "loss": 0.0463, "step": 385 }, { "epoch": 9.08, "learning_rate": 5.061664707482904e-06, "loss": 0.0422, "step": 386 }, { "epoch": 9.11, "learning_rate": 4.769863411895514e-06, "loss": 0.0488, "step": 387 }, { "epoch": 9.13, "learning_rate": 4.486588927624046e-06, "loss": 0.0476, "step": 388 }, { "epoch": 9.15, "learning_rate": 4.211857886396064e-06, "loss": 0.057, "step": 389 }, { "epoch": 9.18, "learning_rate": 3.945686418333155e-06, "loss": 0.0316, "step": 390 }, { "epoch": 9.2, "learning_rate": 3.6880901510036086e-06, "loss": 0.0411, "step": 391 }, { "epoch": 9.22, "learning_rate": 3.4390842085051164e-06, "loss": 0.0452, "step": 392 }, { "epoch": 9.25, "learning_rate": 3.1986832105766467e-06, "loss": 0.0337, "step": 393 }, { "epoch": 9.27, "learning_rate": 2.9669012717401187e-06, "loss": 0.0462, "step": 394 }, { "epoch": 9.29, "learning_rate": 2.7437520004717608e-06, "loss": 0.0567, "step": 395 }, { "epoch": 9.32, "learning_rate": 2.5292484984030693e-06, "loss": 0.0427, "step": 396 }, { "epoch": 9.34, "learning_rate": 2.32340335955159e-06, "loss": 0.0401, "step": 397 }, { "epoch": 9.36, "learning_rate": 2.126228669581492e-06, "loss": 0.0414, "step": 398 }, { "epoch": 9.39, "learning_rate": 1.937736005094004e-06, "loss": 0.0408, "step": 399 }, { "epoch": 9.41, "learning_rate": 1.7579364329477375e-06, "loss": 0.0301, "step": 400 } ], "max_steps": 420, "num_train_epochs": 10, "total_flos": 6953968199270400.0, "trial_name": null, "trial_params": null }