{ "best_metric": 6.787229061126709, "best_model_checkpoint": "./results/models/checkpoint-106530", "epoch": 30.0, "eval_steps": 500, "global_step": 106530, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14080540692762603, "grad_norm": 0.2255859375, "learning_rate": 0.00398873556744579, "loss": 6.8932, "step": 500 }, { "epoch": 0.28161081385525205, "grad_norm": 0.9609375, "learning_rate": 0.0039774711348915795, "loss": 6.8292, "step": 1000 }, { "epoch": 0.42241622078287805, "grad_norm": 1.6015625, "learning_rate": 0.00396620670233737, "loss": 6.8237, "step": 1500 }, { "epoch": 0.5632216277105041, "grad_norm": 2.75, "learning_rate": 0.00395494226978316, "loss": 6.8763, "step": 2000 }, { "epoch": 0.7040270346381301, "grad_norm": 1.375, "learning_rate": 0.00394367783722895, "loss": 6.9046, "step": 2500 }, { "epoch": 0.8448324415657561, "grad_norm": 1.1171875, "learning_rate": 0.003932413404674739, "loss": 6.8564, "step": 3000 }, { "epoch": 0.9856378484933821, "grad_norm": 2.25, "learning_rate": 0.003921148972120529, "loss": 6.8481, "step": 3500 }, { "epoch": 1.0, "eval_loss": 6.840273380279541, "eval_runtime": 197.295, "eval_samples_per_second": 10.137, "eval_steps_per_second": 1.267, "step": 3551 }, { "epoch": 1.1264432554210082, "grad_norm": 6.875, "learning_rate": 0.003909884539566319, "loss": 6.8419, "step": 4000 }, { "epoch": 1.267248662348634, "grad_norm": 0.349609375, "learning_rate": 0.003898620107012109, "loss": 6.9106, "step": 4500 }, { "epoch": 1.4080540692762602, "grad_norm": 1.96875, "learning_rate": 0.003887355674457899, "loss": 6.8742, "step": 5000 }, { "epoch": 1.5488594762038863, "grad_norm": 1.734375, "learning_rate": 0.0038760912419036893, "loss": 6.8742, "step": 5500 }, { "epoch": 1.6896648831315122, "grad_norm": 4.125, "learning_rate": 0.003864826809349479, "loss": 6.8928, "step": 6000 }, { "epoch": 1.8304702900591383, "grad_norm": 2.125, "learning_rate": 0.003853562376795269, "loss": 6.8663, "step": 6500 }, { "epoch": 1.9712756969867642, "grad_norm": 3.640625, "learning_rate": 0.003842297944241059, "loss": 6.8515, "step": 7000 }, { "epoch": 2.0, "eval_loss": 6.85520601272583, "eval_runtime": 192.2095, "eval_samples_per_second": 10.405, "eval_steps_per_second": 1.301, "step": 7102 }, { "epoch": 2.1120811039143903, "grad_norm": 55.75, "learning_rate": 0.0038310335116868485, "loss": 6.8759, "step": 7500 }, { "epoch": 2.2528865108420164, "grad_norm": 15.0, "learning_rate": 0.0038197690791326386, "loss": 6.9025, "step": 8000 }, { "epoch": 2.3936919177696425, "grad_norm": 0.259765625, "learning_rate": 0.0038085046465784287, "loss": 6.9002, "step": 8500 }, { "epoch": 2.534497324697268, "grad_norm": 0.42578125, "learning_rate": 0.003797240214024219, "loss": 6.8857, "step": 9000 }, { "epoch": 2.6753027316248943, "grad_norm": 2.984375, "learning_rate": 0.0037859757814700085, "loss": 6.882, "step": 9500 }, { "epoch": 2.8161081385525204, "grad_norm": 1.953125, "learning_rate": 0.0037747113489157986, "loss": 6.8677, "step": 10000 }, { "epoch": 2.9569135454801465, "grad_norm": 1.484375, "learning_rate": 0.0037634469163615883, "loss": 6.8696, "step": 10500 }, { "epoch": 3.0, "eval_loss": 6.850285530090332, "eval_runtime": 190.0725, "eval_samples_per_second": 10.522, "eval_steps_per_second": 1.315, "step": 10653 }, { "epoch": 3.0977189524077726, "grad_norm": 0.94140625, "learning_rate": 0.003752182483807378, "loss": 6.8573, "step": 11000 }, { "epoch": 3.2385243593353983, "grad_norm": 11.1875, "learning_rate": 0.003740918051253168, "loss": 6.8511, "step": 11500 }, { "epoch": 3.3793297662630244, "grad_norm": 2.3125, "learning_rate": 0.0037296536186989583, "loss": 6.8586, "step": 12000 }, { "epoch": 3.5201351731906505, "grad_norm": 134.0, "learning_rate": 0.003718389186144748, "loss": 6.8683, "step": 12500 }, { "epoch": 3.6609405801182766, "grad_norm": 3.65625, "learning_rate": 0.003707124753590538, "loss": 6.8525, "step": 13000 }, { "epoch": 3.8017459870459027, "grad_norm": 1.78125, "learning_rate": 0.0036958603210363278, "loss": 6.8547, "step": 13500 }, { "epoch": 3.9425513939735284, "grad_norm": 14.1875, "learning_rate": 0.0036845958884821174, "loss": 6.8456, "step": 14000 }, { "epoch": 4.0, "eval_loss": 6.838648319244385, "eval_runtime": 187.9085, "eval_samples_per_second": 10.643, "eval_steps_per_second": 1.33, "step": 14204 }, { "epoch": 4.0833568009011545, "grad_norm": 1.5078125, "learning_rate": 0.0036733314559279076, "loss": 6.8422, "step": 14500 }, { "epoch": 4.224162207828781, "grad_norm": 2.484375, "learning_rate": 0.0036620670233736977, "loss": 6.859, "step": 15000 }, { "epoch": 4.364967614756407, "grad_norm": 6.65625, "learning_rate": 0.003650802590819488, "loss": 6.8541, "step": 15500 }, { "epoch": 4.505773021684033, "grad_norm": 6.59375, "learning_rate": 0.0036395381582652775, "loss": 6.8385, "step": 16000 }, { "epoch": 4.646578428611659, "grad_norm": 3.546875, "learning_rate": 0.003628273725711067, "loss": 6.8464, "step": 16500 }, { "epoch": 4.787383835539285, "grad_norm": 1.625, "learning_rate": 0.0036170092931568573, "loss": 6.8434, "step": 17000 }, { "epoch": 4.92818924246691, "grad_norm": 16.25, "learning_rate": 0.003605744860602647, "loss": 6.8525, "step": 17500 }, { "epoch": 5.0, "eval_loss": 6.835108280181885, "eval_runtime": 207.3832, "eval_samples_per_second": 9.644, "eval_steps_per_second": 1.205, "step": 17755 }, { "epoch": 5.068994649394536, "grad_norm": 2.5625, "learning_rate": 0.003594480428048437, "loss": 6.8404, "step": 18000 }, { "epoch": 5.2098000563221625, "grad_norm": 4.8125, "learning_rate": 0.0035832159954942273, "loss": 6.8484, "step": 18500 }, { "epoch": 5.350605463249789, "grad_norm": 3.8125, "learning_rate": 0.003571951562940017, "loss": 6.8502, "step": 19000 }, { "epoch": 5.491410870177415, "grad_norm": 2.34375, "learning_rate": 0.0035606871303858066, "loss": 6.8384, "step": 19500 }, { "epoch": 5.632216277105041, "grad_norm": 1.96875, "learning_rate": 0.0035494226978315968, "loss": 6.8478, "step": 20000 }, { "epoch": 5.773021684032667, "grad_norm": 3.375, "learning_rate": 0.003538158265277387, "loss": 6.8339, "step": 20500 }, { "epoch": 5.913827090960293, "grad_norm": 4.65625, "learning_rate": 0.0035268938327231766, "loss": 6.843, "step": 21000 }, { "epoch": 6.0, "eval_loss": 6.82509708404541, "eval_runtime": 199.4174, "eval_samples_per_second": 10.029, "eval_steps_per_second": 1.254, "step": 21306 }, { "epoch": 6.054632497887919, "grad_norm": 3.171875, "learning_rate": 0.0035156294001689667, "loss": 6.8405, "step": 21500 }, { "epoch": 6.195437904815545, "grad_norm": 10.9375, "learning_rate": 0.003504364967614757, "loss": 6.8416, "step": 22000 }, { "epoch": 6.336243311743171, "grad_norm": 7.3125, "learning_rate": 0.003493100535060546, "loss": 6.8379, "step": 22500 }, { "epoch": 6.477048718670797, "grad_norm": 4.65625, "learning_rate": 0.003481836102506336, "loss": 6.8452, "step": 23000 }, { "epoch": 6.617854125598423, "grad_norm": 1.7734375, "learning_rate": 0.0034705716699521263, "loss": 6.8463, "step": 23500 }, { "epoch": 6.758659532526049, "grad_norm": 11.25, "learning_rate": 0.003459307237397916, "loss": 6.8358, "step": 24000 }, { "epoch": 6.899464939453675, "grad_norm": 1.7421875, "learning_rate": 0.003448042804843706, "loss": 6.8361, "step": 24500 }, { "epoch": 7.0, "eval_loss": 6.824986457824707, "eval_runtime": 181.2581, "eval_samples_per_second": 11.034, "eval_steps_per_second": 1.379, "step": 24857 }, { "epoch": 7.040270346381301, "grad_norm": 3.84375, "learning_rate": 0.0034367783722894962, "loss": 6.8328, "step": 25000 }, { "epoch": 7.181075753308927, "grad_norm": 1.4296875, "learning_rate": 0.0034255139397352855, "loss": 6.8272, "step": 25500 }, { "epoch": 7.321881160236553, "grad_norm": 3.359375, "learning_rate": 0.0034142495071810756, "loss": 6.8164, "step": 26000 }, { "epoch": 7.462686567164179, "grad_norm": 8.875, "learning_rate": 0.0034029850746268657, "loss": 6.8276, "step": 26500 }, { "epoch": 7.603491974091805, "grad_norm": 2.078125, "learning_rate": 0.003391720642072656, "loss": 6.8248, "step": 27000 }, { "epoch": 7.744297381019431, "grad_norm": 8.8125, "learning_rate": 0.0033804562095184456, "loss": 6.8188, "step": 27500 }, { "epoch": 7.885102787947057, "grad_norm": 3.921875, "learning_rate": 0.0033691917769642357, "loss": 6.8232, "step": 28000 }, { "epoch": 8.0, "eval_loss": 6.8248701095581055, "eval_runtime": 170.7503, "eval_samples_per_second": 11.713, "eval_steps_per_second": 1.464, "step": 28408 }, { "epoch": 8.025908194874683, "grad_norm": 5.9375, "learning_rate": 0.0033579273444100254, "loss": 6.8241, "step": 28500 }, { "epoch": 8.166713601802309, "grad_norm": 5.8125, "learning_rate": 0.003346662911855815, "loss": 6.8311, "step": 29000 }, { "epoch": 8.307519008729935, "grad_norm": 3.328125, "learning_rate": 0.003335398479301605, "loss": 6.8203, "step": 29500 }, { "epoch": 8.448324415657561, "grad_norm": 7.46875, "learning_rate": 0.0033241340467473953, "loss": 6.8291, "step": 30000 }, { "epoch": 8.589129822585187, "grad_norm": 4.59375, "learning_rate": 0.003312869614193185, "loss": 6.8261, "step": 30500 }, { "epoch": 8.729935229512813, "grad_norm": 4.3125, "learning_rate": 0.003301605181638975, "loss": 6.8362, "step": 31000 }, { "epoch": 8.87074063644044, "grad_norm": 11.25, "learning_rate": 0.003290340749084765, "loss": 6.8353, "step": 31500 }, { "epoch": 9.0, "eval_loss": 6.821832656860352, "eval_runtime": 210.9202, "eval_samples_per_second": 9.482, "eval_steps_per_second": 1.185, "step": 31959 }, { "epoch": 9.011546043368066, "grad_norm": 4.46875, "learning_rate": 0.003279076316530555, "loss": 6.832, "step": 32000 }, { "epoch": 9.152351450295692, "grad_norm": 4.0625, "learning_rate": 0.0032678118839763446, "loss": 6.8292, "step": 32500 }, { "epoch": 9.293156857223318, "grad_norm": 1.984375, "learning_rate": 0.0032565474514221347, "loss": 6.8248, "step": 33000 }, { "epoch": 9.433962264150944, "grad_norm": 43.25, "learning_rate": 0.003245283018867925, "loss": 6.8277, "step": 33500 }, { "epoch": 9.57476767107857, "grad_norm": 1.7890625, "learning_rate": 0.0032340185863137146, "loss": 6.8229, "step": 34000 }, { "epoch": 9.715573078006196, "grad_norm": 3.296875, "learning_rate": 0.0032227541537595042, "loss": 6.8194, "step": 34500 }, { "epoch": 9.85637848493382, "grad_norm": 3.390625, "learning_rate": 0.0032114897212052944, "loss": 6.8225, "step": 35000 }, { "epoch": 9.997183891861447, "grad_norm": 22.375, "learning_rate": 0.003200225288651084, "loss": 6.8291, "step": 35500 }, { "epoch": 10.0, "eval_loss": 6.81277322769165, "eval_runtime": 239.1719, "eval_samples_per_second": 8.362, "eval_steps_per_second": 1.045, "step": 35510 }, { "epoch": 10.137989298789073, "grad_norm": 5.375, "learning_rate": 0.003188960856096874, "loss": 6.815, "step": 36000 }, { "epoch": 10.278794705716699, "grad_norm": 5.25, "learning_rate": 0.0031776964235426643, "loss": 6.8173, "step": 36500 }, { "epoch": 10.419600112644325, "grad_norm": 3.84375, "learning_rate": 0.003166431990988454, "loss": 6.8189, "step": 37000 }, { "epoch": 10.560405519571951, "grad_norm": 1.1875, "learning_rate": 0.0031551675584342437, "loss": 6.828, "step": 37500 }, { "epoch": 10.701210926499577, "grad_norm": 5.96875, "learning_rate": 0.003143903125880034, "loss": 6.8141, "step": 38000 }, { "epoch": 10.842016333427203, "grad_norm": 117.0, "learning_rate": 0.003132638693325824, "loss": 6.8258, "step": 38500 }, { "epoch": 10.98282174035483, "grad_norm": 3.40625, "learning_rate": 0.0031213742607716136, "loss": 6.8254, "step": 39000 }, { "epoch": 11.0, "eval_loss": 6.814772129058838, "eval_runtime": 228.9281, "eval_samples_per_second": 8.736, "eval_steps_per_second": 1.092, "step": 39061 }, { "epoch": 11.123627147282455, "grad_norm": 1.578125, "learning_rate": 0.0031101098282174037, "loss": 6.8192, "step": 39500 }, { "epoch": 11.264432554210082, "grad_norm": 2.6875, "learning_rate": 0.003098845395663194, "loss": 6.8217, "step": 40000 }, { "epoch": 11.405237961137708, "grad_norm": 35.0, "learning_rate": 0.003087580963108983, "loss": 6.8172, "step": 40500 }, { "epoch": 11.546043368065334, "grad_norm": 9.5625, "learning_rate": 0.0030763165305547732, "loss": 6.8307, "step": 41000 }, { "epoch": 11.68684877499296, "grad_norm": 9.3125, "learning_rate": 0.0030650520980005634, "loss": 6.8237, "step": 41500 }, { "epoch": 11.827654181920586, "grad_norm": 5.3125, "learning_rate": 0.003053787665446353, "loss": 6.8306, "step": 42000 }, { "epoch": 11.968459588848212, "grad_norm": 5.34375, "learning_rate": 0.003042523232892143, "loss": 6.8138, "step": 42500 }, { "epoch": 12.0, "eval_loss": 6.806704044342041, "eval_runtime": 228.3922, "eval_samples_per_second": 8.757, "eval_steps_per_second": 1.095, "step": 42612 }, { "epoch": 12.109264995775838, "grad_norm": 1.9296875, "learning_rate": 0.0030312588003379333, "loss": 6.8157, "step": 43000 }, { "epoch": 12.250070402703464, "grad_norm": 5.25, "learning_rate": 0.003019994367783723, "loss": 6.8189, "step": 43500 }, { "epoch": 12.39087580963109, "grad_norm": 3.078125, "learning_rate": 0.0030087299352295127, "loss": 6.8276, "step": 44000 }, { "epoch": 12.531681216558717, "grad_norm": 3.46875, "learning_rate": 0.002997465502675303, "loss": 6.8092, "step": 44500 }, { "epoch": 12.672486623486343, "grad_norm": 1.2421875, "learning_rate": 0.002986201070121093, "loss": 6.8152, "step": 45000 }, { "epoch": 12.813292030413969, "grad_norm": 3.078125, "learning_rate": 0.0029749366375668826, "loss": 6.8141, "step": 45500 }, { "epoch": 12.954097437341593, "grad_norm": 2.90625, "learning_rate": 0.0029636722050126727, "loss": 6.8209, "step": 46000 }, { "epoch": 13.0, "eval_loss": 6.805023670196533, "eval_runtime": 230.3566, "eval_samples_per_second": 8.682, "eval_steps_per_second": 1.085, "step": 46163 }, { "epoch": 13.09490284426922, "grad_norm": 2.90625, "learning_rate": 0.0029524077724584624, "loss": 6.8179, "step": 46500 }, { "epoch": 13.235708251196845, "grad_norm": 6.0, "learning_rate": 0.002941143339904252, "loss": 6.8193, "step": 47000 }, { "epoch": 13.376513658124471, "grad_norm": 4.03125, "learning_rate": 0.0029298789073500422, "loss": 6.8153, "step": 47500 }, { "epoch": 13.517319065052098, "grad_norm": 3.65625, "learning_rate": 0.0029186144747958324, "loss": 6.8046, "step": 48000 }, { "epoch": 13.658124471979724, "grad_norm": 52.0, "learning_rate": 0.002907350042241622, "loss": 6.8161, "step": 48500 }, { "epoch": 13.79892987890735, "grad_norm": 16.625, "learning_rate": 0.002896085609687412, "loss": 6.8094, "step": 49000 }, { "epoch": 13.939735285834976, "grad_norm": 1.6328125, "learning_rate": 0.002884821177133202, "loss": 6.8181, "step": 49500 }, { "epoch": 14.0, "eval_loss": 6.8096513748168945, "eval_runtime": 235.2835, "eval_samples_per_second": 8.5, "eval_steps_per_second": 1.063, "step": 49714 }, { "epoch": 14.080540692762602, "grad_norm": 1.140625, "learning_rate": 0.002873556744578992, "loss": 6.8197, "step": 50000 }, { "epoch": 14.221346099690228, "grad_norm": 1.265625, "learning_rate": 0.0028622923120247817, "loss": 6.8097, "step": 50500 }, { "epoch": 14.362151506617854, "grad_norm": 4.75, "learning_rate": 0.002851027879470572, "loss": 6.82, "step": 51000 }, { "epoch": 14.50295691354548, "grad_norm": 2.28125, "learning_rate": 0.002839763446916362, "loss": 6.8203, "step": 51500 }, { "epoch": 14.643762320473106, "grad_norm": 5.125, "learning_rate": 0.0028284990143621516, "loss": 6.807, "step": 52000 }, { "epoch": 14.784567727400733, "grad_norm": 2.046875, "learning_rate": 0.0028172345818079413, "loss": 6.8153, "step": 52500 }, { "epoch": 14.925373134328359, "grad_norm": 8.0, "learning_rate": 0.0028059701492537314, "loss": 6.8102, "step": 53000 }, { "epoch": 15.0, "eval_loss": 6.802714824676514, "eval_runtime": 235.2095, "eval_samples_per_second": 8.503, "eval_steps_per_second": 1.063, "step": 53265 }, { "epoch": 15.066178541255985, "grad_norm": 16.5, "learning_rate": 0.002794705716699521, "loss": 6.8108, "step": 53500 }, { "epoch": 15.20698394818361, "grad_norm": 2.015625, "learning_rate": 0.0027834412841453112, "loss": 6.8123, "step": 54000 }, { "epoch": 15.347789355111237, "grad_norm": 2.90625, "learning_rate": 0.0027721768515911013, "loss": 6.8105, "step": 54500 }, { "epoch": 15.488594762038863, "grad_norm": 8.375, "learning_rate": 0.0027609124190368915, "loss": 6.8163, "step": 55000 }, { "epoch": 15.629400168966487, "grad_norm": 3.34375, "learning_rate": 0.0027496479864826807, "loss": 6.8062, "step": 55500 }, { "epoch": 15.770205575894114, "grad_norm": 48.25, "learning_rate": 0.002738383553928471, "loss": 6.8065, "step": 56000 }, { "epoch": 15.91101098282174, "grad_norm": 2.9375, "learning_rate": 0.002727119121374261, "loss": 6.8066, "step": 56500 }, { "epoch": 16.0, "eval_loss": 6.799798011779785, "eval_runtime": 204.1083, "eval_samples_per_second": 9.799, "eval_steps_per_second": 1.225, "step": 56816 }, { "epoch": 16.051816389749366, "grad_norm": 136.0, "learning_rate": 0.0027158546888200507, "loss": 6.8064, "step": 57000 }, { "epoch": 16.192621796676992, "grad_norm": 2.75, "learning_rate": 0.002704590256265841, "loss": 6.8083, "step": 57500 }, { "epoch": 16.333427203604618, "grad_norm": 15.625, "learning_rate": 0.002693325823711631, "loss": 6.8038, "step": 58000 }, { "epoch": 16.474232610532244, "grad_norm": 20.875, "learning_rate": 0.00268206139115742, "loss": 6.802, "step": 58500 }, { "epoch": 16.61503801745987, "grad_norm": 15.3125, "learning_rate": 0.0026707969586032103, "loss": 6.8112, "step": 59000 }, { "epoch": 16.755843424387496, "grad_norm": 8.1875, "learning_rate": 0.0026595325260490004, "loss": 6.8144, "step": 59500 }, { "epoch": 16.896648831315122, "grad_norm": 5.03125, "learning_rate": 0.0026482680934947905, "loss": 6.8158, "step": 60000 }, { "epoch": 17.0, "eval_loss": 6.8019022941589355, "eval_runtime": 194.9738, "eval_samples_per_second": 10.258, "eval_steps_per_second": 1.282, "step": 60367 }, { "epoch": 17.03745423824275, "grad_norm": 5.34375, "learning_rate": 0.0026370036609405802, "loss": 6.8102, "step": 60500 }, { "epoch": 17.178259645170375, "grad_norm": 26.125, "learning_rate": 0.0026257392283863703, "loss": 6.8135, "step": 61000 }, { "epoch": 17.319065052098, "grad_norm": 6.71875, "learning_rate": 0.00261447479583216, "loss": 6.8102, "step": 61500 }, { "epoch": 17.459870459025627, "grad_norm": 30.875, "learning_rate": 0.0026032103632779497, "loss": 6.8099, "step": 62000 }, { "epoch": 17.600675865953253, "grad_norm": 78.0, "learning_rate": 0.00259194593072374, "loss": 6.8046, "step": 62500 }, { "epoch": 17.74148127288088, "grad_norm": 14.625, "learning_rate": 0.00258068149816953, "loss": 6.809, "step": 63000 }, { "epoch": 17.882286679808505, "grad_norm": 170.0, "learning_rate": 0.0025694170656153197, "loss": 6.8035, "step": 63500 }, { "epoch": 18.0, "eval_loss": 6.801079273223877, "eval_runtime": 232.8698, "eval_samples_per_second": 8.588, "eval_steps_per_second": 1.074, "step": 63918 }, { "epoch": 18.02309208673613, "grad_norm": 2.296875, "learning_rate": 0.0025581526330611098, "loss": 6.8134, "step": 64000 }, { "epoch": 18.163897493663757, "grad_norm": 87.0, "learning_rate": 0.0025468882005068995, "loss": 6.8024, "step": 64500 }, { "epoch": 18.304702900591383, "grad_norm": 1.3828125, "learning_rate": 0.002535623767952689, "loss": 6.8068, "step": 65000 }, { "epoch": 18.44550830751901, "grad_norm": 22.5, "learning_rate": 0.0025243593353984793, "loss": 6.8083, "step": 65500 }, { "epoch": 18.586313714446636, "grad_norm": 3.578125, "learning_rate": 0.0025130949028442694, "loss": 6.8071, "step": 66000 }, { "epoch": 18.727119121374262, "grad_norm": 22.125, "learning_rate": 0.0025018304702900595, "loss": 6.8059, "step": 66500 }, { "epoch": 18.867924528301888, "grad_norm": 16.75, "learning_rate": 0.002490566037735849, "loss": 6.8056, "step": 67000 }, { "epoch": 19.0, "eval_loss": 6.797260761260986, "eval_runtime": 216.6541, "eval_samples_per_second": 9.231, "eval_steps_per_second": 1.154, "step": 67469 }, { "epoch": 19.008729935229514, "grad_norm": 4.625, "learning_rate": 0.002479301605181639, "loss": 6.8015, "step": 67500 }, { "epoch": 19.14953534215714, "grad_norm": 2.734375, "learning_rate": 0.002468037172627429, "loss": 6.8118, "step": 68000 }, { "epoch": 19.290340749084766, "grad_norm": 45.0, "learning_rate": 0.0024567727400732187, "loss": 6.7995, "step": 68500 }, { "epoch": 19.431146156012392, "grad_norm": 46.0, "learning_rate": 0.002445508307519009, "loss": 6.8098, "step": 69000 }, { "epoch": 19.57195156294002, "grad_norm": 7.09375, "learning_rate": 0.002434243874964799, "loss": 6.7972, "step": 69500 }, { "epoch": 19.712756969867645, "grad_norm": 91.5, "learning_rate": 0.002422979442410588, "loss": 6.8085, "step": 70000 }, { "epoch": 19.853562376795267, "grad_norm": 1.7890625, "learning_rate": 0.0024117150098563783, "loss": 6.8034, "step": 70500 }, { "epoch": 19.994367783722893, "grad_norm": 4.21875, "learning_rate": 0.0024004505773021685, "loss": 6.8024, "step": 71000 }, { "epoch": 20.0, "eval_loss": 6.794472694396973, "eval_runtime": 212.8014, "eval_samples_per_second": 9.398, "eval_steps_per_second": 1.175, "step": 71020 }, { "epoch": 20.13517319065052, "grad_norm": 4.21875, "learning_rate": 0.0023891861447479586, "loss": 6.7964, "step": 71500 }, { "epoch": 20.275978597578145, "grad_norm": 27.5, "learning_rate": 0.0023779217121937483, "loss": 6.8033, "step": 72000 }, { "epoch": 20.41678400450577, "grad_norm": 4.84375, "learning_rate": 0.0023666572796395384, "loss": 6.8066, "step": 72500 }, { "epoch": 20.557589411433398, "grad_norm": 10.0625, "learning_rate": 0.0023553928470853285, "loss": 6.8076, "step": 73000 }, { "epoch": 20.698394818361024, "grad_norm": 60.75, "learning_rate": 0.0023441284145311178, "loss": 6.8114, "step": 73500 }, { "epoch": 20.83920022528865, "grad_norm": 47.0, "learning_rate": 0.002332863981976908, "loss": 6.8046, "step": 74000 }, { "epoch": 20.980005632216276, "grad_norm": 4.5, "learning_rate": 0.002321599549422698, "loss": 6.8088, "step": 74500 }, { "epoch": 21.0, "eval_loss": 6.799040794372559, "eval_runtime": 211.5781, "eval_samples_per_second": 9.453, "eval_steps_per_second": 1.182, "step": 74571 }, { "epoch": 21.120811039143902, "grad_norm": 1.7734375, "learning_rate": 0.0023103351168684877, "loss": 6.8154, "step": 75000 }, { "epoch": 21.26161644607153, "grad_norm": 9.75, "learning_rate": 0.002299070684314278, "loss": 6.8074, "step": 75500 }, { "epoch": 21.402421852999154, "grad_norm": 12.9375, "learning_rate": 0.0022878062517600675, "loss": 6.8097, "step": 76000 }, { "epoch": 21.54322725992678, "grad_norm": 2.703125, "learning_rate": 0.002276541819205857, "loss": 6.8016, "step": 76500 }, { "epoch": 21.684032666854407, "grad_norm": 2.875, "learning_rate": 0.0022652773866516473, "loss": 6.7996, "step": 77000 }, { "epoch": 21.824838073782033, "grad_norm": 4.84375, "learning_rate": 0.0022540129540974375, "loss": 6.8058, "step": 77500 }, { "epoch": 21.96564348070966, "grad_norm": 7.15625, "learning_rate": 0.0022427485215432276, "loss": 6.8024, "step": 78000 }, { "epoch": 22.0, "eval_loss": 6.7944440841674805, "eval_runtime": 187.9546, "eval_samples_per_second": 10.641, "eval_steps_per_second": 1.33, "step": 78122 }, { "epoch": 22.106448887637285, "grad_norm": 4.65625, "learning_rate": 0.0022314840889890173, "loss": 6.8025, "step": 78500 }, { "epoch": 22.24725429456491, "grad_norm": 8.9375, "learning_rate": 0.002220219656434807, "loss": 6.8027, "step": 79000 }, { "epoch": 22.388059701492537, "grad_norm": 8.75, "learning_rate": 0.002208955223880597, "loss": 6.8032, "step": 79500 }, { "epoch": 22.528865108420163, "grad_norm": 10.625, "learning_rate": 0.0021976907913263868, "loss": 6.7998, "step": 80000 }, { "epoch": 22.66967051534779, "grad_norm": 12.375, "learning_rate": 0.002186426358772177, "loss": 6.8102, "step": 80500 }, { "epoch": 22.810475922275415, "grad_norm": 3.625, "learning_rate": 0.002175161926217967, "loss": 6.7944, "step": 81000 }, { "epoch": 22.95128132920304, "grad_norm": 3.484375, "learning_rate": 0.0021638974936637567, "loss": 6.8079, "step": 81500 }, { "epoch": 23.0, "eval_loss": 6.793390274047852, "eval_runtime": 214.7382, "eval_samples_per_second": 9.314, "eval_steps_per_second": 1.164, "step": 81673 }, { "epoch": 23.092086736130668, "grad_norm": 5.46875, "learning_rate": 0.0021526330611095464, "loss": 6.8065, "step": 82000 }, { "epoch": 23.232892143058294, "grad_norm": 3.125, "learning_rate": 0.0021413686285553365, "loss": 6.8017, "step": 82500 }, { "epoch": 23.37369754998592, "grad_norm": 77.5, "learning_rate": 0.0021301041960011266, "loss": 6.8055, "step": 83000 }, { "epoch": 23.514502956913546, "grad_norm": 1.5390625, "learning_rate": 0.0021188397634469163, "loss": 6.8064, "step": 83500 }, { "epoch": 23.655308363841172, "grad_norm": 3.28125, "learning_rate": 0.0021075753308927065, "loss": 6.8029, "step": 84000 }, { "epoch": 23.796113770768798, "grad_norm": 4.5, "learning_rate": 0.0020963108983384966, "loss": 6.7993, "step": 84500 }, { "epoch": 23.936919177696424, "grad_norm": 3.078125, "learning_rate": 0.002085046465784286, "loss": 6.7938, "step": 85000 }, { "epoch": 24.0, "eval_loss": 6.793313503265381, "eval_runtime": 192.9768, "eval_samples_per_second": 10.364, "eval_steps_per_second": 1.295, "step": 85224 }, { "epoch": 24.07772458462405, "grad_norm": 16.875, "learning_rate": 0.002073782033230076, "loss": 6.8043, "step": 85500 }, { "epoch": 24.218529991551677, "grad_norm": 7.0, "learning_rate": 0.002062517600675866, "loss": 6.7964, "step": 86000 }, { "epoch": 24.359335398479303, "grad_norm": 4.46875, "learning_rate": 0.0020512531681216558, "loss": 6.8079, "step": 86500 }, { "epoch": 24.50014080540693, "grad_norm": 3.1875, "learning_rate": 0.002039988735567446, "loss": 6.8029, "step": 87000 }, { "epoch": 24.640946212334555, "grad_norm": 3.203125, "learning_rate": 0.002028724303013236, "loss": 6.7963, "step": 87500 }, { "epoch": 24.78175161926218, "grad_norm": 8.9375, "learning_rate": 0.0020174598704590253, "loss": 6.8056, "step": 88000 }, { "epoch": 24.922557026189807, "grad_norm": 2.859375, "learning_rate": 0.0020061954379048154, "loss": 6.8061, "step": 88500 }, { "epoch": 25.0, "eval_loss": 6.796795845031738, "eval_runtime": 181.8838, "eval_samples_per_second": 10.996, "eval_steps_per_second": 1.375, "step": 88775 }, { "epoch": 25.063362433117433, "grad_norm": 6.09375, "learning_rate": 0.0019949310053506055, "loss": 6.7973, "step": 89000 }, { "epoch": 25.20416784004506, "grad_norm": 1.09375, "learning_rate": 0.001983666572796395, "loss": 6.8059, "step": 89500 }, { "epoch": 25.344973246972685, "grad_norm": 24.375, "learning_rate": 0.0019724021402421853, "loss": 6.8048, "step": 90000 }, { "epoch": 25.48577865390031, "grad_norm": 3.171875, "learning_rate": 0.0019611377076879754, "loss": 6.7963, "step": 90500 }, { "epoch": 25.626584060827938, "grad_norm": 1.546875, "learning_rate": 0.0019498732751337651, "loss": 6.7984, "step": 91000 }, { "epoch": 25.76738946775556, "grad_norm": 24.375, "learning_rate": 0.0019386088425795553, "loss": 6.7948, "step": 91500 }, { "epoch": 25.908194874683186, "grad_norm": 29.125, "learning_rate": 0.001927344410025345, "loss": 6.8014, "step": 92000 }, { "epoch": 26.0, "eval_loss": 6.788823127746582, "eval_runtime": 205.9541, "eval_samples_per_second": 9.711, "eval_steps_per_second": 1.214, "step": 92326 }, { "epoch": 26.049000281610812, "grad_norm": 3.046875, "learning_rate": 0.0019160799774711349, "loss": 6.794, "step": 92500 }, { "epoch": 26.18980568853844, "grad_norm": 2.1875, "learning_rate": 0.001904815544916925, "loss": 6.7969, "step": 93000 }, { "epoch": 26.330611095466065, "grad_norm": 3.78125, "learning_rate": 0.0018935511123627147, "loss": 6.7898, "step": 93500 }, { "epoch": 26.47141650239369, "grad_norm": 20.875, "learning_rate": 0.0018822866798085048, "loss": 6.7979, "step": 94000 }, { "epoch": 26.612221909321317, "grad_norm": 3.21875, "learning_rate": 0.0018710222472542947, "loss": 6.7962, "step": 94500 }, { "epoch": 26.753027316248943, "grad_norm": 11.375, "learning_rate": 0.0018597578147000844, "loss": 6.7987, "step": 95000 }, { "epoch": 26.89383272317657, "grad_norm": 8.875, "learning_rate": 0.0018484933821458745, "loss": 6.801, "step": 95500 }, { "epoch": 27.0, "eval_loss": 6.789410591125488, "eval_runtime": 227.3793, "eval_samples_per_second": 8.796, "eval_steps_per_second": 1.099, "step": 95877 }, { "epoch": 27.034638130104195, "grad_norm": 1.46875, "learning_rate": 0.0018372289495916644, "loss": 6.7975, "step": 96000 }, { "epoch": 27.17544353703182, "grad_norm": 80.5, "learning_rate": 0.0018259645170374543, "loss": 6.7984, "step": 96500 }, { "epoch": 27.316248943959447, "grad_norm": 2.671875, "learning_rate": 0.0018147000844832442, "loss": 6.7878, "step": 97000 }, { "epoch": 27.457054350887073, "grad_norm": 9.0, "learning_rate": 0.0018034356519290341, "loss": 6.7973, "step": 97500 }, { "epoch": 27.5978597578147, "grad_norm": 2.1875, "learning_rate": 0.001792171219374824, "loss": 6.7993, "step": 98000 }, { "epoch": 27.738665164742326, "grad_norm": 5.65625, "learning_rate": 0.001780906786820614, "loss": 6.7979, "step": 98500 }, { "epoch": 27.879470571669952, "grad_norm": 6.53125, "learning_rate": 0.001769642354266404, "loss": 6.7953, "step": 99000 }, { "epoch": 28.0, "eval_loss": 6.787979602813721, "eval_runtime": 193.1872, "eval_samples_per_second": 10.353, "eval_steps_per_second": 1.294, "step": 99428 }, { "epoch": 28.020275978597578, "grad_norm": 8.5625, "learning_rate": 0.0017583779217121938, "loss": 6.7974, "step": 99500 }, { "epoch": 28.161081385525204, "grad_norm": 7.25, "learning_rate": 0.0017471134891579837, "loss": 6.7964, "step": 100000 }, { "epoch": 28.30188679245283, "grad_norm": 5.5625, "learning_rate": 0.0017358490566037738, "loss": 6.7967, "step": 100500 }, { "epoch": 28.442692199380456, "grad_norm": 3.34375, "learning_rate": 0.0017245846240495635, "loss": 6.7968, "step": 101000 }, { "epoch": 28.583497606308082, "grad_norm": 8.3125, "learning_rate": 0.0017133201914953536, "loss": 6.7985, "step": 101500 }, { "epoch": 28.72430301323571, "grad_norm": 32.25, "learning_rate": 0.0017020557589411435, "loss": 6.7975, "step": 102000 }, { "epoch": 28.865108420163335, "grad_norm": 5.25, "learning_rate": 0.0016907913263869332, "loss": 6.7887, "step": 102500 }, { "epoch": 29.0, "eval_loss": 6.7877984046936035, "eval_runtime": 234.4697, "eval_samples_per_second": 8.53, "eval_steps_per_second": 1.066, "step": 102979 }, { "epoch": 29.00591382709096, "grad_norm": 16.75, "learning_rate": 0.0016795268938327233, "loss": 6.7878, "step": 103000 }, { "epoch": 29.146719234018587, "grad_norm": 4.65625, "learning_rate": 0.0016682624612785132, "loss": 6.7852, "step": 103500 }, { "epoch": 29.287524640946213, "grad_norm": 8.0625, "learning_rate": 0.001656998028724303, "loss": 6.7974, "step": 104000 }, { "epoch": 29.42833004787384, "grad_norm": 7.3125, "learning_rate": 0.001645733596170093, "loss": 6.8057, "step": 104500 }, { "epoch": 29.569135454801465, "grad_norm": 1.0546875, "learning_rate": 0.001634469163615883, "loss": 6.7982, "step": 105000 }, { "epoch": 29.70994086172909, "grad_norm": 11.125, "learning_rate": 0.0016232047310616728, "loss": 6.7948, "step": 105500 }, { "epoch": 29.850746268656717, "grad_norm": 9.8125, "learning_rate": 0.0016119402985074627, "loss": 6.7843, "step": 106000 }, { "epoch": 29.991551675584343, "grad_norm": 4.8125, "learning_rate": 0.0016006758659532524, "loss": 6.8003, "step": 106500 }, { "epoch": 30.0, "eval_loss": 6.787229061126709, "eval_runtime": 216.4819, "eval_samples_per_second": 9.239, "eval_steps_per_second": 1.155, "step": 106530 } ], "logging_steps": 500, "max_steps": 177550, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.342179461275713e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }