diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,5437 +1,2749 @@ { - "best_metric": 1.4534417390823364, - "best_model_checkpoint": "./results/models/checkpoint-307580", - "epoch": 5.0, + "best_metric": 1.3305245637893677, + "best_model_checkpoint": "./results/models/checkpoint-182628", + "epoch": 19.0, "eval_steps": 500, - "global_step": 384475, + "global_step": 182628, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.006502373366278692, - "grad_norm": 1.359375, - "learning_rate": 0.001999739905065349, - "loss": 2.651, + "epoch": 0.05201831044527674, + "grad_norm": 1.3984375, + "learning_rate": 0.001997919267582189, + "loss": 2.3433, "step": 500 }, { - "epoch": 0.013004746732557384, - "grad_norm": 4.40625, - "learning_rate": 0.0019994798101306975, - "loss": 2.3581, + "epoch": 0.10403662089055347, + "grad_norm": 0.92578125, + "learning_rate": 0.001995838535164378, + "loss": 1.9524, "step": 1000 }, { - "epoch": 0.019507120098836074, - "grad_norm": 0.87890625, - "learning_rate": 0.0019992197151960465, - "loss": 2.2735, + "epoch": 0.1560549313358302, + "grad_norm": 0.6171875, + "learning_rate": 0.001993757802746567, + "loss": 1.8833, "step": 1500 }, { - "epoch": 0.026009493465114768, - "grad_norm": 0.60546875, - "learning_rate": 0.0019989596202613954, - "loss": 2.2235, + "epoch": 0.20807324178110695, + "grad_norm": 0.80859375, + "learning_rate": 0.0019916770703287557, + "loss": 1.8518, "step": 2000 }, { - "epoch": 0.03251186683139346, - "grad_norm": 1.75, - "learning_rate": 0.0019986995253267444, - "loss": 2.1651, + "epoch": 0.2600915522263837, + "grad_norm": 0.494140625, + "learning_rate": 0.0019895963379109446, + "loss": 1.8064, "step": 2500 }, { - "epoch": 0.03901424019767215, - "grad_norm": 0.68359375, - "learning_rate": 0.0019984394303920934, - "loss": 2.125, + "epoch": 0.3121098626716604, + "grad_norm": 0.61328125, + "learning_rate": 0.0019875156054931335, + "loss": 1.7881, "step": 3000 }, { - "epoch": 0.04551661356395084, - "grad_norm": 0.5703125, - "learning_rate": 0.001998179335457442, - "loss": 2.0895, + "epoch": 0.3641281731169372, + "grad_norm": 0.71484375, + "learning_rate": 0.0019854348730753224, + "loss": 1.7552, "step": 3500 }, { - "epoch": 0.052018986930229535, - "grad_norm": 0.6796875, - "learning_rate": 0.001997919240522791, - "loss": 2.0827, + "epoch": 0.4161464835622139, + "grad_norm": 0.34375, + "learning_rate": 0.0019833541406575114, + "loss": 1.7362, "step": 4000 }, { - "epoch": 0.05852136029650822, - "grad_norm": 0.78515625, - "learning_rate": 0.00199765914558814, - "loss": 2.0646, + "epoch": 0.4681647940074906, + "grad_norm": 0.310546875, + "learning_rate": 0.0019812734082397003, + "loss": 1.7302, "step": 4500 }, { - "epoch": 0.06502373366278692, - "grad_norm": 0.6953125, - "learning_rate": 0.0019973990506534883, - "loss": 2.039, + "epoch": 0.5201831044527674, + "grad_norm": 0.33203125, + "learning_rate": 0.0019791926758218896, + "loss": 1.6997, "step": 5000 }, { - "epoch": 0.0715261070290656, - "grad_norm": 0.83984375, - "learning_rate": 0.0019971389557188377, - "loss": 2.0101, + "epoch": 0.5722014148980441, + "grad_norm": 0.41796875, + "learning_rate": 0.001977111943404078, + "loss": 1.6694, "step": 5500 }, { - "epoch": 0.0780284803953443, - "grad_norm": 0.95703125, - "learning_rate": 0.0019968788607841862, - "loss": 2.0275, + "epoch": 0.6242197253433208, + "grad_norm": 0.71484375, + "learning_rate": 0.001975031210986267, + "loss": 1.6882, "step": 6000 }, { - "epoch": 0.08453085376162299, - "grad_norm": 0.68359375, - "learning_rate": 0.001996618765849535, - "loss": 1.9981, + "epoch": 0.6762380357885975, + "grad_norm": 0.40625, + "learning_rate": 0.0019729504785684564, + "loss": 1.6712, "step": 6500 }, { - "epoch": 0.09103322712790168, - "grad_norm": 0.69140625, - "learning_rate": 0.001996358670914884, - "loss": 1.9754, + "epoch": 0.7282563462338744, + "grad_norm": 1.21875, + "learning_rate": 0.0019708697461506453, + "loss": 1.6519, "step": 7000 }, { - "epoch": 0.09753560049418038, - "grad_norm": 0.55859375, - "learning_rate": 0.0019960985759802327, - "loss": 1.9366, + "epoch": 0.7802746566791511, + "grad_norm": 0.27734375, + "learning_rate": 0.0019687890137328337, + "loss": 1.6525, "step": 7500 }, { - "epoch": 0.10403797386045907, - "grad_norm": 0.8671875, - "learning_rate": 0.0019958384810455816, - "loss": 1.921, + "epoch": 0.8322929671244278, + "grad_norm": 0.408203125, + "learning_rate": 0.001966708281315023, + "loss": 1.6239, "step": 8000 }, { - "epoch": 0.11054034722673776, - "grad_norm": 0.455078125, - "learning_rate": 0.0019955783861109306, - "loss": 1.9106, + "epoch": 0.8843112775697045, + "grad_norm": 0.546875, + "learning_rate": 0.001964627548897212, + "loss": 1.6152, "step": 8500 }, { - "epoch": 0.11704272059301644, - "grad_norm": 0.57421875, - "learning_rate": 0.001995318291176279, - "loss": 1.9082, + "epoch": 0.9363295880149812, + "grad_norm": 0.609375, + "learning_rate": 0.0019625468164794005, + "loss": 1.6053, "step": 9000 }, { - "epoch": 0.12354509395929514, - "grad_norm": 0.58984375, - "learning_rate": 0.0019950581962416285, - "loss": 1.8952, + "epoch": 0.9883478984602581, + "grad_norm": 0.86328125, + "learning_rate": 0.00196046608406159, + "loss": 1.6078, "step": 9500 }, { - "epoch": 0.13004746732557385, - "grad_norm": 0.5234375, - "learning_rate": 0.001994798101306977, - "loss": 1.9024, + "epoch": 1.0, + "eval_loss": 1.5721051692962646, + "eval_runtime": 1.4853, + "eval_samples_per_second": 673.283, + "eval_steps_per_second": 0.673, + "step": 9612 + }, + { + "epoch": 1.0403662089055348, + "grad_norm": 0.31640625, + "learning_rate": 0.0019583853516437788, + "loss": 1.5953, "step": 10000 }, { - "epoch": 0.13654984069185253, - "grad_norm": 1.0859375, - "learning_rate": 0.001994538006372326, - "loss": 1.9417, + "epoch": 1.0923845193508115, + "grad_norm": 0.484375, + "learning_rate": 0.0019563046192259677, + "loss": 1.587, "step": 10500 }, { - "epoch": 0.1430522140581312, - "grad_norm": 0.609375, - "learning_rate": 0.001994277911437675, - "loss": 1.9231, + "epoch": 1.1444028297960882, + "grad_norm": 0.5625, + "learning_rate": 0.0019542238868081566, + "loss": 1.5759, "step": 11000 }, { - "epoch": 0.1495545874244099, - "grad_norm": 1.09375, - "learning_rate": 0.0019940178165030235, - "loss": 1.9009, + "epoch": 1.196421140241365, + "grad_norm": 0.6484375, + "learning_rate": 0.0019521431543903455, + "loss": 1.5785, "step": 11500 }, { - "epoch": 0.1560569607906886, - "grad_norm": 0.62109375, - "learning_rate": 0.0019937577215683724, - "loss": 1.9132, + "epoch": 1.2484394506866416, + "grad_norm": 0.3984375, + "learning_rate": 0.0019500624219725344, + "loss": 1.5734, "step": 12000 }, { - "epoch": 0.1625593341569673, - "grad_norm": 0.59765625, - "learning_rate": 0.0019934976266337214, - "loss": 1.8845, + "epoch": 1.3004577611319184, + "grad_norm": 0.3359375, + "learning_rate": 0.0019479816895547233, + "loss": 1.5593, "step": 12500 }, { - "epoch": 0.16906170752324598, - "grad_norm": 1.578125, - "learning_rate": 0.0019932375316990703, - "loss": 1.8613, + "epoch": 1.352476071577195, + "grad_norm": 0.408203125, + "learning_rate": 0.0019459009571369122, + "loss": 1.5589, "step": 13000 }, { - "epoch": 0.1755640808895247, - "grad_norm": 4.78125, - "learning_rate": 0.0019929774367644193, - "loss": 1.8528, + "epoch": 1.404494382022472, + "grad_norm": 0.4296875, + "learning_rate": 0.0019438202247191011, + "loss": 1.555, "step": 13500 }, { - "epoch": 0.18206645425580337, - "grad_norm": 0.56640625, - "learning_rate": 0.001992717341829768, - "loss": 1.8416, + "epoch": 1.4565126924677487, + "grad_norm": 0.310546875, + "learning_rate": 0.00194173949230129, + "loss": 1.548, "step": 14000 }, { - "epoch": 0.18856882762208205, - "grad_norm": 0.73828125, - "learning_rate": 0.0019924572468951168, - "loss": 1.8443, + "epoch": 1.5085310029130254, + "grad_norm": 0.5234375, + "learning_rate": 0.001939658759883479, + "loss": 1.5519, "step": 14500 }, { - "epoch": 0.19507120098836075, - "grad_norm": 0.53125, - "learning_rate": 0.0019921971519604657, - "loss": 1.8301, + "epoch": 1.5605493133583022, + "grad_norm": 0.81640625, + "learning_rate": 0.001937578027465668, + "loss": 1.5534, "step": 15000 }, { - "epoch": 0.20157357435463943, - "grad_norm": 0.98828125, - "learning_rate": 0.0019919370570258142, - "loss": 1.8077, + "epoch": 1.6125676238035789, + "grad_norm": 0.275390625, + "learning_rate": 0.0019354972950478568, + "loss": 1.5408, "step": 15500 }, { - "epoch": 0.20807594772091814, - "grad_norm": 1.53125, - "learning_rate": 0.001991676962091163, - "loss": 1.7963, + "epoch": 1.6645859342488556, + "grad_norm": 0.421875, + "learning_rate": 0.0019334165626300457, + "loss": 1.5339, "step": 16000 }, { - "epoch": 0.21457832108719682, - "grad_norm": 0.671875, - "learning_rate": 0.001991416867156512, - "loss": 1.7879, + "epoch": 1.7166042446941323, + "grad_norm": 0.337890625, + "learning_rate": 0.0019313358302122348, + "loss": 1.5238, "step": 16500 }, { - "epoch": 0.22108069445347553, - "grad_norm": 5.25, - "learning_rate": 0.001991156772221861, - "loss": 1.793, + "epoch": 1.768622555139409, + "grad_norm": 0.474609375, + "learning_rate": 0.0019292550977944235, + "loss": 1.527, "step": 17000 }, { - "epoch": 0.2275830678197542, - "grad_norm": 0.6171875, - "learning_rate": 0.00199089667728721, - "loss": 1.7897, + "epoch": 1.8206408655846857, + "grad_norm": 0.345703125, + "learning_rate": 0.0019271743653766125, + "loss": 1.5253, "step": 17500 }, { - "epoch": 0.2340854411860329, - "grad_norm": 0.8359375, - "learning_rate": 0.0019906365823525586, - "loss": 1.7831, + "epoch": 1.8726591760299627, + "grad_norm": 0.53125, + "learning_rate": 0.0019250936329588016, + "loss": 1.5217, "step": 18000 }, { - "epoch": 0.2405878145523116, - "grad_norm": 1.28125, - "learning_rate": 0.0019903764874179075, - "loss": 1.7912, + "epoch": 1.9246774864752392, + "grad_norm": 0.416015625, + "learning_rate": 0.0019230129005409905, + "loss": 1.5201, "step": 18500 }, { - "epoch": 0.24709018791859028, - "grad_norm": 0.79296875, - "learning_rate": 0.0019901163924832565, - "loss": 1.7988, + "epoch": 1.9766957969205161, + "grad_norm": 0.212890625, + "learning_rate": 0.0019209321681231794, + "loss": 1.5157, "step": 19000 }, { - "epoch": 0.25359256128486896, - "grad_norm": 0.5234375, - "learning_rate": 0.001989856297548605, - "loss": 1.8031, + "epoch": 2.0, + "eval_loss": 1.4967154264450073, + "eval_runtime": 1.4155, + "eval_samples_per_second": 706.48, + "eval_steps_per_second": 0.706, + "step": 19224 + }, + { + "epoch": 2.0287141073657926, + "grad_norm": 0.640625, + "learning_rate": 0.0019188514357053683, + "loss": 1.5093, "step": 19500 }, { - "epoch": 0.2600949346511477, - "grad_norm": 0.71484375, - "learning_rate": 0.0019895962026139544, - "loss": 1.8197, + "epoch": 2.0807324178110695, + "grad_norm": 0.349609375, + "learning_rate": 0.0019167707032875572, + "loss": 1.5081, "step": 20000 }, { - "epoch": 0.26659730801742637, - "grad_norm": 0.78515625, - "learning_rate": 0.001989336107679303, - "loss": 1.8048, + "epoch": 2.132750728256346, + "grad_norm": 0.447265625, + "learning_rate": 0.0019146899708697464, + "loss": 1.5137, "step": 20500 }, { - "epoch": 0.27309968138370505, - "grad_norm": 0.546875, - "learning_rate": 0.001989076012744652, - "loss": 1.7964, + "epoch": 2.184769038701623, + "grad_norm": 0.30078125, + "learning_rate": 0.001912609238451935, + "loss": 1.5052, "step": 21000 }, { - "epoch": 0.27960205474998373, - "grad_norm": 0.64453125, - "learning_rate": 0.001988815917810001, - "loss": 1.7898, + "epoch": 2.2367873491468995, + "grad_norm": 0.423828125, + "learning_rate": 0.001910528506034124, + "loss": 1.4989, "step": 21500 }, { - "epoch": 0.2861044281162624, - "grad_norm": 5.25, - "learning_rate": 0.0019885558228753494, - "loss": 1.7911, + "epoch": 2.2888056595921764, + "grad_norm": 0.322265625, + "learning_rate": 0.0019084477736163131, + "loss": 1.4933, "step": 22000 }, { - "epoch": 0.29260680148254115, - "grad_norm": 0.494140625, - "learning_rate": 0.0019882957279406983, - "loss": 1.7739, + "epoch": 2.3408239700374533, + "grad_norm": 0.287109375, + "learning_rate": 0.0019063670411985018, + "loss": 1.4908, "step": 22500 }, { - "epoch": 0.2991091748488198, - "grad_norm": 0.6015625, - "learning_rate": 0.0019880356330060473, - "loss": 1.7675, + "epoch": 2.39284228048273, + "grad_norm": 0.60546875, + "learning_rate": 0.0019042863087806907, + "loss": 1.483, "step": 23000 }, { - "epoch": 0.3056115482150985, - "grad_norm": 0.431640625, - "learning_rate": 0.001987775538071396, - "loss": 1.767, + "epoch": 2.444860590928007, + "grad_norm": 0.37890625, + "learning_rate": 0.0019022055763628799, + "loss": 1.4808, "step": 23500 }, { - "epoch": 0.3121139215813772, - "grad_norm": 0.75390625, - "learning_rate": 0.001987515443136745, - "loss": 1.7782, + "epoch": 2.4968789013732833, + "grad_norm": 0.28125, + "learning_rate": 0.0019001248439450688, + "loss": 1.4751, "step": 24000 }, { - "epoch": 0.3186162949476559, - "grad_norm": 0.51171875, - "learning_rate": 0.0019872553482020937, - "loss": 1.7878, + "epoch": 2.54889721181856, + "grad_norm": 0.26953125, + "learning_rate": 0.0018980441115272575, + "loss": 1.4713, "step": 24500 }, { - "epoch": 0.3251186683139346, - "grad_norm": 0.625, - "learning_rate": 0.0019869952532674427, - "loss": 1.7762, + "epoch": 2.6009155222638367, + "grad_norm": 0.2392578125, + "learning_rate": 0.0018959633791094466, + "loss": 1.4743, "step": 25000 }, { - "epoch": 0.3316210416802133, - "grad_norm": 6.15625, - "learning_rate": 0.0019867351583327916, - "loss": 1.744, + "epoch": 2.6529338327091136, + "grad_norm": 0.255859375, + "learning_rate": 0.0018938826466916355, + "loss": 1.4703, "step": 25500 }, { - "epoch": 0.33812341504649196, - "grad_norm": 0.82421875, - "learning_rate": 0.00198647506339814, - "loss": 1.7591, + "epoch": 2.70495214315439, + "grad_norm": 0.333984375, + "learning_rate": 0.0018918019142738244, + "loss": 1.4722, "step": 26000 }, { - "epoch": 0.34462578841277064, + "epoch": 2.756970453599667, "grad_norm": 0.87890625, - "learning_rate": 0.001986214968463489, - "loss": 1.7565, + "learning_rate": 0.0018897211818560133, + "loss": 1.4728, "step": 26500 }, { - "epoch": 0.3511281617790494, - "grad_norm": 0.5078125, - "learning_rate": 0.001985954873528838, - "loss": 1.7639, + "epoch": 2.808988764044944, + "grad_norm": 0.435546875, + "learning_rate": 0.0018876404494382023, + "loss": 1.4738, "step": 27000 }, { - "epoch": 0.35763053514532805, - "grad_norm": 0.609375, - "learning_rate": 0.001985694778594187, - "loss": 1.7547, + "epoch": 2.8610070744902205, + "grad_norm": 1.7265625, + "learning_rate": 0.0018855597170203914, + "loss": 1.4717, "step": 27500 }, { - "epoch": 0.36413290851160673, - "grad_norm": 0.84375, - "learning_rate": 0.001985434683659536, - "loss": 1.7356, + "epoch": 2.9130253849354975, + "grad_norm": 0.361328125, + "learning_rate": 0.00188347898460258, + "loss": 1.473, "step": 28000 }, { - "epoch": 0.3706352818778854, - "grad_norm": 0.66796875, - "learning_rate": 0.0019851745887248845, - "loss": 1.7366, + "epoch": 2.965043695380774, + "grad_norm": 0.3984375, + "learning_rate": 0.001881398252184769, + "loss": 1.472, "step": 28500 }, { - "epoch": 0.3771376552441641, - "grad_norm": 1.2265625, - "learning_rate": 0.0019849144937902335, - "loss": 1.7293, + "epoch": 3.0, + "eval_loss": 1.4684182405471802, + "eval_runtime": 1.4391, + "eval_samples_per_second": 694.9, + "eval_steps_per_second": 0.695, + "step": 28836 + }, + { + "epoch": 3.017062005826051, + "grad_norm": 0.23046875, + "learning_rate": 0.0018793175197669581, + "loss": 1.4727, "step": 29000 }, { - "epoch": 0.38364002861044283, - "grad_norm": 0.498046875, - "learning_rate": 0.0019846543988555824, - "loss": 1.7272, + "epoch": 3.0690803162713274, + "grad_norm": 0.73046875, + "learning_rate": 0.001877236787349147, + "loss": 1.4677, "step": 29500 }, { - "epoch": 0.3901424019767215, - "grad_norm": 0.55859375, - "learning_rate": 0.001984394303920931, - "loss": 1.7196, + "epoch": 3.1210986267166043, + "grad_norm": 0.470703125, + "learning_rate": 0.0018751560549313357, + "loss": 1.4668, "step": 30000 }, { - "epoch": 0.3966447753430002, - "grad_norm": 0.5078125, - "learning_rate": 0.00198413420898628, - "loss": 1.7293, + "epoch": 3.173116937161881, + "grad_norm": 1.2578125, + "learning_rate": 0.0018730753225135249, + "loss": 1.4653, "step": 30500 }, { - "epoch": 0.40314714870927887, - "grad_norm": 0.875, - "learning_rate": 0.001983874114051629, - "loss": 1.7171, + "epoch": 3.2251352476071578, + "grad_norm": 0.2890625, + "learning_rate": 0.0018709945900957138, + "loss": 1.4667, "step": 31000 }, { - "epoch": 0.4096495220755576, - "grad_norm": 1.296875, - "learning_rate": 0.001983614019116978, - "loss": 1.7131, + "epoch": 3.2771535580524347, + "grad_norm": 0.361328125, + "learning_rate": 0.0018689138576779025, + "loss": 1.4613, "step": 31500 }, { - "epoch": 0.4161518954418363, - "grad_norm": 2.546875, - "learning_rate": 0.001983353924182327, - "loss": 1.7128, + "epoch": 3.329171868497711, + "grad_norm": 0.462890625, + "learning_rate": 0.0018668331252600916, + "loss": 1.4604, "step": 32000 }, { - "epoch": 0.42265426880811496, - "grad_norm": 0.71484375, - "learning_rate": 0.0019830938292476753, - "loss": 1.7127, + "epoch": 3.381190178942988, + "grad_norm": 0.5546875, + "learning_rate": 0.0018647523928422805, + "loss": 1.4672, "step": 32500 }, { - "epoch": 0.42915664217439364, - "grad_norm": 0.546875, - "learning_rate": 0.0019828337343130243, - "loss": 1.7052, + "epoch": 3.4332084893882646, + "grad_norm": 2.0625, + "learning_rate": 0.0018626716604244697, + "loss": 1.465, "step": 33000 }, { - "epoch": 0.4356590155406723, - "grad_norm": 0.5625, - "learning_rate": 0.0019825736393783732, - "loss": 1.701, + "epoch": 3.4852267998335416, + "grad_norm": 0.39453125, + "learning_rate": 0.0018605909280066584, + "loss": 1.4604, "step": 33500 }, { - "epoch": 0.44216138890695106, - "grad_norm": 0.4609375, - "learning_rate": 0.0019823135444437217, - "loss": 1.6965, + "epoch": 3.537245110278818, + "grad_norm": 0.255859375, + "learning_rate": 0.0018585101955888473, + "loss": 1.4539, "step": 34000 }, { - "epoch": 0.44866376227322974, - "grad_norm": 0.5625, - "learning_rate": 0.001982053449509071, - "loss": 1.6987, + "epoch": 3.589263420724095, + "grad_norm": 0.59375, + "learning_rate": 0.0018564294631710364, + "loss": 1.4536, "step": 34500 }, { - "epoch": 0.4551661356395084, - "grad_norm": 1.4296875, - "learning_rate": 0.0019817933545744197, - "loss": 1.699, + "epoch": 3.6412817311693715, + "grad_norm": 0.30078125, + "learning_rate": 0.001854348730753225, + "loss": 1.4549, "step": 35000 }, { - "epoch": 0.4616685090057871, - "grad_norm": 0.490234375, - "learning_rate": 0.0019815332596397686, - "loss": 1.6868, + "epoch": 3.6933000416146484, + "grad_norm": 0.345703125, + "learning_rate": 0.001852267998335414, + "loss": 1.4571, "step": 35500 }, { - "epoch": 0.4681708823720658, - "grad_norm": 0.73828125, - "learning_rate": 0.0019812731647051176, - "loss": 1.6836, + "epoch": 3.7453183520599254, + "grad_norm": 0.376953125, + "learning_rate": 0.0018501872659176031, + "loss": 1.4584, "step": 36000 }, { - "epoch": 0.4746732557383445, - "grad_norm": 0.5234375, - "learning_rate": 0.001981013069770466, - "loss": 1.6805, + "epoch": 3.797336662505202, + "grad_norm": 0.53515625, + "learning_rate": 0.001848106533499792, + "loss": 1.4542, "step": 36500 }, { - "epoch": 0.4811756291046232, - "grad_norm": 0.49609375, - "learning_rate": 0.001980752974835815, - "loss": 1.6832, + "epoch": 3.8493549729504783, + "grad_norm": 0.3515625, + "learning_rate": 0.0018460258010819808, + "loss": 1.4588, "step": 37000 }, { - "epoch": 0.4876780024709019, - "grad_norm": 0.71484375, - "learning_rate": 0.001980492879901164, - "loss": 1.6827, + "epoch": 3.9013732833957553, + "grad_norm": 0.279296875, + "learning_rate": 0.0018439450686641699, + "loss": 1.456, "step": 37500 }, { - "epoch": 0.49418037583718055, - "grad_norm": 0.58984375, - "learning_rate": 0.0019802327849665125, - "loss": 1.6739, + "epoch": 3.9533915938410322, + "grad_norm": 0.31640625, + "learning_rate": 0.0018418643362463588, + "loss": 1.4596, "step": 38000 }, { - "epoch": 0.5006827492034592, - "grad_norm": 0.71875, - "learning_rate": 0.001979972690031862, - "loss": 1.676, + "epoch": 4.0, + "eval_loss": 1.4403541088104248, + "eval_runtime": 1.4124, + "eval_samples_per_second": 708.019, + "eval_steps_per_second": 0.708, + "step": 38448 + }, + { + "epoch": 4.005409904286309, + "grad_norm": 0.259765625, + "learning_rate": 0.0018397836038285475, + "loss": 1.449, "step": 38500 }, { - "epoch": 0.5071851225697379, - "grad_norm": 0.76171875, - "learning_rate": 0.0019797125950972105, - "loss": 1.6762, + "epoch": 4.057428214731585, + "grad_norm": 0.349609375, + "learning_rate": 0.0018377028714107366, + "loss": 1.445, "step": 39000 }, { - "epoch": 0.5136874959360166, - "grad_norm": 0.69140625, - "learning_rate": 0.0019794525001625594, - "loss": 1.6743, + "epoch": 4.109446525176862, + "grad_norm": 0.330078125, + "learning_rate": 0.0018356221389929255, + "loss": 1.4441, "step": 39500 }, { - "epoch": 0.5201898693022954, - "grad_norm": 0.69921875, - "learning_rate": 0.0019791924052279084, - "loss": 1.6713, + "epoch": 4.161464835622139, + "grad_norm": 0.283203125, + "learning_rate": 0.0018335414065751145, + "loss": 1.4423, "step": 40000 }, { - "epoch": 0.5266922426685741, - "grad_norm": 0.609375, - "learning_rate": 0.001978932310293257, - "loss": 1.668, + "epoch": 4.213483146067416, + "grad_norm": 0.54296875, + "learning_rate": 0.0018314606741573034, + "loss": 1.442, "step": 40500 }, { - "epoch": 0.5331946160348527, - "grad_norm": 0.55078125, - "learning_rate": 0.001978672215358606, - "loss": 1.665, + "epoch": 4.265501456512692, + "grad_norm": 0.23046875, + "learning_rate": 0.0018293799417394923, + "loss": 1.4348, "step": 41000 }, { - "epoch": 0.5396969894011314, - "grad_norm": 0.6171875, - "learning_rate": 0.001978412120423955, - "loss": 1.6673, + "epoch": 4.317519766957969, + "grad_norm": 0.50390625, + "learning_rate": 0.0018272992093216814, + "loss": 1.4352, "step": 41500 }, { - "epoch": 0.5461993627674101, - "grad_norm": 2.578125, - "learning_rate": 0.0019781520254893038, - "loss": 1.6608, + "epoch": 4.369538077403246, + "grad_norm": 0.4765625, + "learning_rate": 0.0018252184769038703, + "loss": 1.4344, "step": 42000 }, { - "epoch": 0.5527017361336888, - "grad_norm": 0.46484375, - "learning_rate": 0.0019778919305546527, - "loss": 1.6578, + "epoch": 4.421556387848523, + "grad_norm": 0.400390625, + "learning_rate": 0.001823137744486059, + "loss": 1.438, "step": 42500 }, { - "epoch": 0.5592041094999675, - "grad_norm": 0.57421875, - "learning_rate": 0.0019776318356200012, - "loss": 1.6561, + "epoch": 4.473574698293799, + "grad_norm": 0.42578125, + "learning_rate": 0.0018210570120682482, + "loss": 1.4417, "step": 43000 }, { - "epoch": 0.5657064828662461, - "grad_norm": 0.57421875, - "learning_rate": 0.00197737174068535, - "loss": 1.6527, + "epoch": 4.525593008739076, + "grad_norm": 1.2265625, + "learning_rate": 0.001818976279650437, + "loss": 1.436, "step": 43500 }, { - "epoch": 0.5722088562325248, - "grad_norm": 0.640625, - "learning_rate": 0.001977111645750699, - "loss": 1.6467, + "epoch": 4.577611319184353, + "grad_norm": 0.48828125, + "learning_rate": 0.0018168955472326258, + "loss": 1.4367, "step": 44000 }, { - "epoch": 0.5787112295988036, - "grad_norm": 0.6015625, - "learning_rate": 0.0019768515508160477, - "loss": 1.6501, + "epoch": 4.62962962962963, + "grad_norm": 0.37109375, + "learning_rate": 0.001814814814814815, + "loss": 1.4299, "step": 44500 }, { - "epoch": 0.5852136029650823, - "grad_norm": 0.55859375, - "learning_rate": 0.0019765914558813966, - "loss": 1.6533, + "epoch": 4.681647940074907, + "grad_norm": 0.31640625, + "learning_rate": 0.0018127340823970038, + "loss": 1.4332, "step": 45000 }, { - "epoch": 0.591715976331361, - "grad_norm": 0.78515625, - "learning_rate": 0.0019763313609467456, - "loss": 1.6543, + "epoch": 4.733666250520183, + "grad_norm": 0.455078125, + "learning_rate": 0.0018106533499791927, + "loss": 1.4269, "step": 45500 }, { - "epoch": 0.5982183496976397, - "grad_norm": 0.53125, - "learning_rate": 0.0019760712660120945, - "loss": 1.651, + "epoch": 4.78568456096546, + "grad_norm": 0.671875, + "learning_rate": 0.0018085726175613816, + "loss": 1.4276, "step": 46000 }, { - "epoch": 0.6047207230639183, - "grad_norm": 0.5703125, - "learning_rate": 0.0019758111710774435, - "loss": 1.6474, + "epoch": 4.837702871410737, + "grad_norm": 1.0078125, + "learning_rate": 0.0018064918851435705, + "loss": 1.4288, "step": 46500 }, { - "epoch": 0.611223096430197, - "grad_norm": 0.5, - "learning_rate": 0.001975551076142792, - "loss": 1.6363, + "epoch": 4.889721181856014, + "grad_norm": 1.34375, + "learning_rate": 0.0018044111527257595, + "loss": 1.4351, "step": 47000 }, { - "epoch": 0.6177254697964757, - "grad_norm": 0.48828125, - "learning_rate": 0.001975290981208141, - "loss": 1.6395, + "epoch": 4.94173949230129, + "grad_norm": 0.470703125, + "learning_rate": 0.0018023304203079484, + "loss": 1.4307, "step": 47500 }, { - "epoch": 0.6242278431627544, - "grad_norm": 1.0078125, - "learning_rate": 0.00197503088627349, - "loss": 1.6419, + "epoch": 4.9937578027465666, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018002496878901373, + "loss": 1.4257, "step": 48000 }, { - "epoch": 0.630730216529033, - "grad_norm": 0.53515625, - "learning_rate": 0.0019747707913388385, - "loss": 1.6377, + "epoch": 5.0, + "eval_loss": 1.4106667041778564, + "eval_runtime": 1.4218, + "eval_samples_per_second": 703.325, + "eval_steps_per_second": 0.703, + "step": 48060 + }, + { + "epoch": 5.0457761131918435, + "grad_norm": 0.345703125, + "learning_rate": 0.0017981689554723264, + "loss": 1.4193, "step": 48500 }, { - "epoch": 0.6372325898953118, - "grad_norm": 0.70703125, - "learning_rate": 0.001974510696404188, - "loss": 1.6428, + "epoch": 5.09779442363712, + "grad_norm": 0.322265625, + "learning_rate": 0.0017960882230545153, + "loss": 1.4176, "step": 49000 }, { - "epoch": 0.6437349632615905, - "grad_norm": 0.5859375, - "learning_rate": 0.0019742506014695364, - "loss": 1.6507, + "epoch": 5.149812734082397, + "grad_norm": 0.27734375, + "learning_rate": 0.001794007490636704, + "loss": 1.4157, "step": 49500 }, { - "epoch": 0.6502373366278692, - "grad_norm": 0.76953125, - "learning_rate": 0.0019739905065348853, - "loss": 1.6552, + "epoch": 5.201831044527673, + "grad_norm": 0.333984375, + "learning_rate": 0.0017919267582188932, + "loss": 1.4186, "step": 50000 }, { - "epoch": 0.6567397099941479, - "grad_norm": 2.09375, - "learning_rate": 0.0019737304116002343, - "loss": 1.6509, + "epoch": 5.25384935497295, + "grad_norm": 0.2734375, + "learning_rate": 0.001789846025801082, + "loss": 1.4155, "step": 50500 }, { - "epoch": 0.6632420833604266, - "grad_norm": 0.5390625, - "learning_rate": 0.001973470316665583, - "loss": 1.6545, + "epoch": 5.305867665418227, + "grad_norm": 2.640625, + "learning_rate": 0.0017877652933832708, + "loss": 1.4142, "step": 51000 }, { - "epoch": 0.6697444567267052, - "grad_norm": 0.51953125, - "learning_rate": 0.0019732102217309318, - "loss": 1.6451, + "epoch": 5.357885975863504, + "grad_norm": 0.2431640625, + "learning_rate": 0.00178568456096546, + "loss": 1.4155, "step": 51500 }, { - "epoch": 0.6762468300929839, - "grad_norm": 0.6953125, - "learning_rate": 0.0019729501267962807, - "loss": 1.6408, + "epoch": 5.40990428630878, + "grad_norm": 0.244140625, + "learning_rate": 0.0017836038285476488, + "loss": 1.4113, "step": 52000 }, { - "epoch": 0.6827492034592626, - "grad_norm": 0.5546875, - "learning_rate": 0.0019726900318616293, - "loss": 1.6378, + "epoch": 5.461922596754057, + "grad_norm": 0.21484375, + "learning_rate": 0.0017815230961298377, + "loss": 1.4145, "step": 52500 }, { - "epoch": 0.6892515768255413, - "grad_norm": 0.56640625, - "learning_rate": 0.0019724299369269786, - "loss": 1.637, + "epoch": 5.513940907199334, + "grad_norm": 0.42578125, + "learning_rate": 0.0017794423637120266, + "loss": 1.4132, "step": 53000 }, { - "epoch": 0.6957539501918201, - "grad_norm": 0.5625, - "learning_rate": 0.001972169841992327, - "loss": 1.6317, + "epoch": 5.565959217644611, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017773616312942156, + "loss": 1.4141, "step": 53500 }, { - "epoch": 0.7022563235580987, - "grad_norm": 0.83984375, - "learning_rate": 0.001971909747057676, - "loss": 1.6326, + "epoch": 5.617977528089888, + "grad_norm": 0.51171875, + "learning_rate": 0.0017752808988764045, + "loss": 1.4118, "step": 54000 }, { - "epoch": 0.7087586969243774, - "grad_norm": 0.609375, - "learning_rate": 0.001971649652123025, - "loss": 1.6285, + "epoch": 5.669995838535164, + "grad_norm": 0.73828125, + "learning_rate": 0.0017732001664585936, + "loss": 1.4094, "step": 54500 }, { - "epoch": 0.7152610702906561, - "grad_norm": 0.51953125, - "learning_rate": 0.0019713895571883736, - "loss": 1.6336, + "epoch": 5.722014148980441, + "grad_norm": 0.390625, + "learning_rate": 0.0017711194340407823, + "loss": 1.4088, "step": 55000 }, { - "epoch": 0.7217634436569348, - "grad_norm": 0.6015625, - "learning_rate": 0.0019711294622537226, - "loss": 1.636, + "epoch": 5.774032459425718, + "grad_norm": 2.34375, + "learning_rate": 0.0017690387016229714, + "loss": 1.4068, "step": 55500 }, { - "epoch": 0.7282658170232135, - "grad_norm": 0.5, - "learning_rate": 0.0019708693673190715, - "loss": 1.6426, + "epoch": 5.826050769870995, + "grad_norm": 0.265625, + "learning_rate": 0.0017669579692051603, + "loss": 1.4059, "step": 56000 }, { - "epoch": 0.7347681903894921, - "grad_norm": 0.69140625, - "learning_rate": 0.0019706092723844205, - "loss": 1.6444, + "epoch": 5.878069080316271, + "grad_norm": 0.283203125, + "learning_rate": 0.001764877236787349, + "loss": 1.4041, "step": 56500 }, { - "epoch": 0.7412705637557708, - "grad_norm": 2.140625, - "learning_rate": 0.0019703491774497694, - "loss": 1.6306, + "epoch": 5.930087390761548, + "grad_norm": 0.77734375, + "learning_rate": 0.0017627965043695382, + "loss": 1.4024, "step": 57000 }, { - "epoch": 0.7477729371220495, - "grad_norm": 0.5390625, - "learning_rate": 0.001970089082515118, - "loss": 1.6341, + "epoch": 5.982105701206825, + "grad_norm": 0.30859375, + "learning_rate": 0.001760715771951727, + "loss": 1.4006, "step": 57500 }, { - "epoch": 0.7542753104883282, - "grad_norm": 0.435546875, - "learning_rate": 0.001969828987580467, - "loss": 1.6356, + "epoch": 6.0, + "eval_loss": 1.3921489715576172, + "eval_runtime": 1.4219, + "eval_samples_per_second": 703.273, + "eval_steps_per_second": 0.703, + "step": 57672 + }, + { + "epoch": 6.034124011652102, + "grad_norm": 0.255859375, + "learning_rate": 0.001758635039533916, + "loss": 1.4001, "step": 58000 }, { - "epoch": 0.760777683854607, - "grad_norm": 0.546875, - "learning_rate": 0.001969568892645816, - "loss": 1.6326, + "epoch": 6.086142322097379, + "grad_norm": 0.2431640625, + "learning_rate": 0.001756554307116105, + "loss": 1.3981, "step": 58500 }, { - "epoch": 0.7672800572208857, - "grad_norm": 0.640625, - "learning_rate": 0.0019693087977111644, - "loss": 1.6272, + "epoch": 6.138160632542655, + "grad_norm": 0.4296875, + "learning_rate": 0.0017544735746982938, + "loss": 1.4018, "step": 59000 }, { - "epoch": 0.7737824305871643, - "grad_norm": 1.09375, - "learning_rate": 0.0019690487027765134, - "loss": 1.6223, + "epoch": 6.190178942987932, + "grad_norm": 0.85546875, + "learning_rate": 0.0017523928422804827, + "loss": 1.397, "step": 59500 }, { - "epoch": 0.780284803953443, - "grad_norm": 0.6015625, - "learning_rate": 0.0019687886078418623, - "loss": 1.6297, + "epoch": 6.242197253433209, + "grad_norm": 0.361328125, + "learning_rate": 0.0017503121098626717, + "loss": 1.3974, "step": 60000 }, { - "epoch": 0.7867871773197217, - "grad_norm": 0.427734375, - "learning_rate": 0.0019685285129072113, - "loss": 1.6278, + "epoch": 6.294215563878486, + "grad_norm": 3.578125, + "learning_rate": 0.0017482313774448606, + "loss": 1.397, "step": 60500 }, { - "epoch": 0.7932895506860004, - "grad_norm": 0.88671875, - "learning_rate": 0.0019682684179725602, - "loss": 1.632, + "epoch": 6.346233874323762, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017461506450270495, + "loss": 1.3971, "step": 61000 }, { - "epoch": 0.7997919240522791, - "grad_norm": 0.65625, - "learning_rate": 0.0019680083230379087, - "loss": 1.6304, + "epoch": 6.398252184769039, + "grad_norm": 0.53125, + "learning_rate": 0.0017440699126092386, + "loss": 1.3955, "step": 61500 }, { - "epoch": 0.8062942974185577, - "grad_norm": 0.6171875, - "learning_rate": 0.0019677482281032577, - "loss": 1.6262, + "epoch": 6.4502704952143155, + "grad_norm": 0.24609375, + "learning_rate": 0.0017419891801914273, + "loss": 1.3959, "step": 62000 }, { - "epoch": 0.8127966707848364, - "grad_norm": 0.5859375, - "learning_rate": 0.0019674881331686067, - "loss": 1.6304, + "epoch": 6.502288805659592, + "grad_norm": 0.390625, + "learning_rate": 0.0017399084477736164, + "loss": 1.3951, "step": 62500 }, { - "epoch": 0.8192990441511152, - "grad_norm": 1.1640625, - "learning_rate": 0.001967228038233955, - "loss": 1.6413, + "epoch": 6.554307116104869, + "grad_norm": 0.3125, + "learning_rate": 0.0017378277153558054, + "loss": 1.3977, "step": 63000 }, { - "epoch": 0.8258014175173939, - "grad_norm": 1.4921875, - "learning_rate": 0.0019669679432993046, - "loss": 1.6264, + "epoch": 6.606325426550145, + "grad_norm": 0.279296875, + "learning_rate": 0.001735746982937994, + "loss": 1.3976, "step": 63500 }, { - "epoch": 0.8323037908836726, - "grad_norm": 0.625, - "learning_rate": 0.001966707848364653, - "loss": 1.6243, + "epoch": 6.658343736995422, + "grad_norm": 0.83984375, + "learning_rate": 0.0017336662505201832, + "loss": 1.3964, "step": 64000 }, { - "epoch": 0.8388061642499512, - "grad_norm": 0.75, - "learning_rate": 0.001966447753430002, - "loss": 1.6298, + "epoch": 6.710362047440699, + "grad_norm": 0.357421875, + "learning_rate": 0.001731585518102372, + "loss": 1.396, "step": 64500 }, { - "epoch": 0.8453085376162299, - "grad_norm": 0.7265625, - "learning_rate": 0.001966187658495351, - "loss": 1.6207, + "epoch": 6.762380357885976, + "grad_norm": 0.27734375, + "learning_rate": 0.001729504785684561, + "loss": 1.3931, "step": 65000 }, { - "epoch": 0.8518109109825086, - "grad_norm": 0.515625, - "learning_rate": 0.0019659275635606995, - "loss": 1.6121, + "epoch": 6.814398668331252, + "grad_norm": 0.330078125, + "learning_rate": 0.00172742405326675, + "loss": 1.3931, "step": 65500 }, { - "epoch": 0.8583132843487873, - "grad_norm": 0.48046875, - "learning_rate": 0.0019656674686260485, - "loss": 1.6175, + "epoch": 6.866416978776529, + "grad_norm": 0.40625, + "learning_rate": 0.0017253433208489388, + "loss": 1.388, "step": 66000 }, { - "epoch": 0.864815657715066, - "grad_norm": 0.578125, - "learning_rate": 0.0019654073736913974, - "loss": 1.6142, + "epoch": 6.918435289221806, + "grad_norm": 0.2578125, + "learning_rate": 0.0017232625884311278, + "loss": 1.3862, "step": 66500 }, { - "epoch": 0.8713180310813446, - "grad_norm": 1.03125, - "learning_rate": 0.001965147278756746, - "loss": 1.614, + "epoch": 6.970453599667083, + "grad_norm": 0.3984375, + "learning_rate": 0.0017211818560133169, + "loss": 1.3865, "step": 67000 }, { - "epoch": 0.8778204044476234, - "grad_norm": 0.6640625, - "learning_rate": 0.0019648871838220954, - "loss": 1.6112, + "epoch": 7.0, + "eval_loss": 1.3738893270492554, + "eval_runtime": 1.4206, + "eval_samples_per_second": 703.936, + "eval_steps_per_second": 0.704, + "step": 67284 + }, + { + "epoch": 7.022471910112359, + "grad_norm": 0.298828125, + "learning_rate": 0.0017191011235955056, + "loss": 1.385, "step": 67500 }, { - "epoch": 0.8843227778139021, - "grad_norm": 0.62109375, - "learning_rate": 0.001964627088887444, - "loss": 1.6082, + "epoch": 7.074490220557636, + "grad_norm": 0.39453125, + "learning_rate": 0.0017170203911776945, + "loss": 1.3827, "step": 68000 }, { - "epoch": 0.8908251511801808, - "grad_norm": 0.93359375, - "learning_rate": 0.001964366993952793, - "loss": 1.6121, + "epoch": 7.126508531002913, + "grad_norm": 0.609375, + "learning_rate": 0.0017149396587598836, + "loss": 1.3864, "step": 68500 }, { - "epoch": 0.8973275245464595, - "grad_norm": 0.84375, - "learning_rate": 0.001964106899018142, - "loss": 1.6084, + "epoch": 7.17852684144819, + "grad_norm": 0.51953125, + "learning_rate": 0.0017128589263420723, + "loss": 1.3874, "step": 69000 }, { - "epoch": 0.9038298979127382, - "grad_norm": 0.50390625, - "learning_rate": 0.0019638468040834903, - "loss": 1.603, + "epoch": 7.230545151893467, + "grad_norm": 0.326171875, + "learning_rate": 0.0017107781939242615, + "loss": 1.3913, "step": 69500 }, { - "epoch": 0.9103322712790168, - "grad_norm": 0.486328125, - "learning_rate": 0.0019635867091488393, - "loss": 1.6033, + "epoch": 7.282563462338743, + "grad_norm": 0.69921875, + "learning_rate": 0.0017086974615064504, + "loss": 1.3892, "step": 70000 }, { - "epoch": 0.9168346446452955, - "grad_norm": 0.55859375, - "learning_rate": 0.0019633266142141882, - "loss": 1.5994, + "epoch": 7.33458177278402, + "grad_norm": 0.38671875, + "learning_rate": 0.0017066167290886393, + "loss": 1.3888, "step": 70500 }, { - "epoch": 0.9233370180115742, - "grad_norm": 0.66796875, - "learning_rate": 0.001963066519279537, - "loss": 1.5958, + "epoch": 7.386600083229297, + "grad_norm": 0.259765625, + "learning_rate": 0.0017045359966708282, + "loss": 1.3908, "step": 71000 }, { - "epoch": 0.9298393913778529, - "grad_norm": 0.5625, - "learning_rate": 0.001962806424344886, - "loss": 1.5963, + "epoch": 7.438618393674574, + "grad_norm": 0.419921875, + "learning_rate": 0.001702455264253017, + "loss": 1.3879, "step": 71500 }, { - "epoch": 0.9363417647441316, - "grad_norm": 0.5, - "learning_rate": 0.0019625463294102347, - "loss": 1.5962, + "epoch": 7.49063670411985, + "grad_norm": 0.2197265625, + "learning_rate": 0.001700374531835206, + "loss": 1.3873, "step": 72000 }, { - "epoch": 0.9428441381104103, - "grad_norm": 0.8046875, - "learning_rate": 0.0019622862344755836, - "loss": 1.5861, + "epoch": 7.542655014565127, + "grad_norm": 0.2119140625, + "learning_rate": 0.001698293799417395, + "loss": 1.3845, "step": 72500 }, { - "epoch": 0.949346511476689, - "grad_norm": 1.140625, - "learning_rate": 0.0019620261395409326, - "loss": 1.5928, + "epoch": 7.594673325010404, + "grad_norm": 0.34375, + "learning_rate": 0.0016962130669995838, + "loss": 1.3832, "step": 73000 }, { - "epoch": 0.9558488848429677, - "grad_norm": 0.5546875, - "learning_rate": 0.001961766044606281, - "loss": 1.592, + "epoch": 7.646691635455681, + "grad_norm": 0.55078125, + "learning_rate": 0.0016941323345817728, + "loss": 1.38, "step": 73500 }, { - "epoch": 0.9623512582092464, - "grad_norm": 0.46875, - "learning_rate": 0.00196150594967163, - "loss": 1.5859, + "epoch": 7.698709945900957, + "grad_norm": 0.240234375, + "learning_rate": 0.001692051602163962, + "loss": 1.3798, "step": 74000 }, { - "epoch": 0.9688536315755251, - "grad_norm": 0.59765625, - "learning_rate": 0.001961245854736979, - "loss": 1.5834, + "epoch": 7.750728256346234, + "grad_norm": 0.2490234375, + "learning_rate": 0.0016899708697461506, + "loss": 1.3775, "step": 74500 }, { - "epoch": 0.9753560049418037, + "epoch": 7.802746566791511, "grad_norm": 1.015625, - "learning_rate": 0.001960985759802328, - "loss": 1.5893, + "learning_rate": 0.0016878901373283395, + "loss": 1.3797, "step": 75000 }, { - "epoch": 0.9818583783080824, - "grad_norm": 0.9609375, - "learning_rate": 0.001960725664867677, - "loss": 1.5846, + "epoch": 7.8547648772367875, + "grad_norm": 0.263671875, + "learning_rate": 0.0016858094049105286, + "loss": 1.3813, "step": 75500 }, { - "epoch": 0.9883607516743611, - "grad_norm": 0.45703125, - "learning_rate": 0.0019604655699330255, - "loss": 1.5838, + "epoch": 7.9067831876820645, + "grad_norm": 0.271484375, + "learning_rate": 0.0016837286724927173, + "loss": 1.3779, "step": 76000 }, { - "epoch": 0.9948631250406398, - "grad_norm": 0.5390625, - "learning_rate": 0.0019602054749983744, - "loss": 1.5809, + "epoch": 7.9588014981273405, + "grad_norm": 0.6875, + "learning_rate": 0.0016816479400749065, + "loss": 1.3771, "step": 76500 }, { - "epoch": 1.0, - "eval_loss": 1.5596588850021362, - "eval_runtime": 0.8816, - "eval_samples_per_second": 1134.308, - "eval_steps_per_second": 9.074, - "step": 76895 + "epoch": 8.0, + "eval_loss": 1.3700777292251587, + "eval_runtime": 1.4197, + "eval_samples_per_second": 704.38, + "eval_steps_per_second": 0.704, + "step": 76896 }, { - "epoch": 1.0013654984069185, - "grad_norm": 0.50390625, - "learning_rate": 0.0019599453800637234, - "loss": 1.5838, + "epoch": 8.010819808572618, + "grad_norm": 0.384765625, + "learning_rate": 0.0016795672076570954, + "loss": 1.3753, "step": 77000 }, { - "epoch": 1.0078678717731973, - "grad_norm": 0.47265625, - "learning_rate": 0.001959685285129072, - "loss": 1.5829, + "epoch": 8.062838119017893, + "grad_norm": 0.376953125, + "learning_rate": 0.0016774864752392843, + "loss": 1.3763, "step": 77500 }, { - "epoch": 1.0143702451394758, - "grad_norm": 0.5078125, - "learning_rate": 0.0019594251901944213, - "loss": 1.5812, + "epoch": 8.11485642946317, + "grad_norm": 0.263671875, + "learning_rate": 0.0016754057428214732, + "loss": 1.3761, "step": 78000 }, { - "epoch": 1.0208726185057546, - "grad_norm": 0.51171875, - "learning_rate": 0.00195916509525977, - "loss": 1.5797, + "epoch": 8.166874739908447, + "grad_norm": 0.392578125, + "learning_rate": 0.0016733250104036621, + "loss": 1.3786, "step": 78500 }, { - "epoch": 1.0273749918720332, - "grad_norm": 0.66015625, - "learning_rate": 0.0019589050003251188, - "loss": 1.5785, + "epoch": 8.218893050353724, + "grad_norm": 0.287109375, + "learning_rate": 0.001671244277985851, + "loss": 1.3786, "step": 79000 }, { - "epoch": 1.033877365238312, - "grad_norm": 0.515625, - "learning_rate": 0.0019586449053904677, - "loss": 1.5752, + "epoch": 8.270911360799001, + "grad_norm": 0.298828125, + "learning_rate": 0.0016691635455680402, + "loss": 1.3797, "step": 79500 }, { - "epoch": 1.0403797386045908, - "grad_norm": 0.90625, - "learning_rate": 0.0019583848104558163, - "loss": 1.582, + "epoch": 8.322929671244278, + "grad_norm": 0.341796875, + "learning_rate": 0.0016670828131502289, + "loss": 1.3791, "step": 80000 }, { - "epoch": 1.0468821119708693, - "grad_norm": 0.60546875, - "learning_rate": 0.001958124715521165, - "loss": 1.5762, + "epoch": 8.374947981689555, + "grad_norm": 0.302734375, + "learning_rate": 0.0016650020807324178, + "loss": 1.3783, "step": 80500 }, { - "epoch": 1.0533844853371481, - "grad_norm": 0.8046875, - "learning_rate": 0.001957864620586514, - "loss": 1.577, + "epoch": 8.426966292134832, + "grad_norm": 0.318359375, + "learning_rate": 0.001662921348314607, + "loss": 1.376, "step": 81000 }, { - "epoch": 1.0598868587034267, - "grad_norm": 0.76171875, - "learning_rate": 0.0019576045256518627, - "loss": 1.5741, + "epoch": 8.478984602580109, + "grad_norm": 1.1796875, + "learning_rate": 0.0016608406158967956, + "loss": 1.3763, "step": 81500 }, { - "epoch": 1.0663892320697055, - "grad_norm": 0.59375, - "learning_rate": 0.001957344430717212, - "loss": 1.574, + "epoch": 8.531002913025384, + "grad_norm": 0.408203125, + "learning_rate": 0.0016587598834789845, + "loss": 1.3757, "step": 82000 }, { - "epoch": 1.072891605435984, - "grad_norm": 0.55859375, - "learning_rate": 0.0019570843357825606, - "loss": 1.5744, + "epoch": 8.583021223470661, + "grad_norm": 0.224609375, + "learning_rate": 0.0016566791510611736, + "loss": 1.377, "step": 82500 }, { - "epoch": 1.0793939788022628, - "grad_norm": 0.5703125, - "learning_rate": 0.0019568242408479096, - "loss": 1.5745, + "epoch": 8.635039533915938, + "grad_norm": 0.55859375, + "learning_rate": 0.0016545984186433626, + "loss": 1.3755, "step": 83000 }, { - "epoch": 1.0858963521685414, - "grad_norm": 0.5, - "learning_rate": 0.0019565641459132585, - "loss": 1.5758, + "epoch": 8.687057844361215, + "grad_norm": 0.23828125, + "learning_rate": 0.0016525176862255513, + "loss": 1.3739, "step": 83500 }, { - "epoch": 1.0923987255348202, - "grad_norm": 0.75, - "learning_rate": 0.001956304050978607, - "loss": 1.5698, + "epoch": 8.739076154806492, + "grad_norm": 0.33984375, + "learning_rate": 0.0016504369538077404, + "loss": 1.3744, "step": 84000 }, { - "epoch": 1.098901098901099, - "grad_norm": 0.447265625, - "learning_rate": 0.001956043956043956, - "loss": 1.576, + "epoch": 8.791094465251769, + "grad_norm": 0.36328125, + "learning_rate": 0.0016483562213899293, + "loss": 1.3733, "step": 84500 }, { - "epoch": 1.1054034722673776, - "grad_norm": 0.5703125, - "learning_rate": 0.001955783861109305, - "loss": 1.5778, + "epoch": 8.843112775697046, + "grad_norm": 0.263671875, + "learning_rate": 0.0016462754889721182, + "loss": 1.3735, "step": 85000 }, { - "epoch": 1.1119058456336564, + "epoch": 8.895131086142323, "grad_norm": 0.478515625, - "learning_rate": 0.001955523766174654, - "loss": 1.5739, + "learning_rate": 0.0016441947565543071, + "loss": 1.3754, "step": 85500 }, { - "epoch": 1.118408218999935, - "grad_norm": 0.61328125, - "learning_rate": 0.001955263671240003, - "loss": 1.5778, + "epoch": 8.947149396587598, + "grad_norm": 0.515625, + "learning_rate": 0.001642114024136496, + "loss": 1.3749, "step": 86000 }, { - "epoch": 1.1249105923662137, - "grad_norm": 0.5859375, - "learning_rate": 0.0019550035763053514, - "loss": 1.5713, + "epoch": 8.999167707032875, + "grad_norm": 0.5390625, + "learning_rate": 0.0016400332917186852, + "loss": 1.3739, "step": 86500 }, { - "epoch": 1.1314129657324923, - "grad_norm": 0.640625, - "learning_rate": 0.0019547434813707004, - "loss": 1.5767, + "epoch": 9.0, + "eval_loss": 1.3678644895553589, + "eval_runtime": 1.4359, + "eval_samples_per_second": 696.406, + "eval_steps_per_second": 0.696, + "step": 86508 + }, + { + "epoch": 9.051186017478152, + "grad_norm": 0.26171875, + "learning_rate": 0.0016379525593008739, + "loss": 1.3717, "step": 87000 }, { - "epoch": 1.137915339098771, - "grad_norm": 0.9296875, - "learning_rate": 0.0019544833864360493, - "loss": 1.5732, + "epoch": 9.103204327923429, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016358718268830628, + "loss": 1.3705, "step": 87500 }, { - "epoch": 1.1444177124650499, - "grad_norm": 0.6953125, - "learning_rate": 0.001954223291501398, - "loss": 1.5688, + "epoch": 9.155222638368706, + "grad_norm": 0.2578125, + "learning_rate": 0.001633791094465252, + "loss": 1.3719, "step": 88000 }, { - "epoch": 1.1509200858313284, - "grad_norm": 0.69921875, - "learning_rate": 0.001953963196566747, - "loss": 1.5739, + "epoch": 9.207240948813983, + "grad_norm": 0.32421875, + "learning_rate": 0.0016317103620474408, + "loss": 1.3727, "step": 88500 }, { - "epoch": 1.1574224591976072, - "grad_norm": 0.625, - "learning_rate": 0.0019537031016320957, - "loss": 1.5733, + "epoch": 9.25925925925926, + "grad_norm": 5.125, + "learning_rate": 0.0016296296296296295, + "loss": 1.3737, "step": 89000 }, { - "epoch": 1.1639248325638858, - "grad_norm": 0.73828125, - "learning_rate": 0.0019534430066974447, - "loss": 1.576, + "epoch": 9.311277569704536, + "grad_norm": 0.263671875, + "learning_rate": 0.0016275488972118187, + "loss": 1.3707, "step": 89500 }, { - "epoch": 1.1704272059301646, - "grad_norm": 0.61328125, - "learning_rate": 0.0019531829117627937, - "loss": 1.5668, + "epoch": 9.363295880149813, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016254681647940076, + "loss": 1.3694, "step": 90000 }, { - "epoch": 1.1769295792964432, - "grad_norm": 1.109375, - "learning_rate": 0.0019529228168281424, - "loss": 1.5677, + "epoch": 9.41531419059509, + "grad_norm": 0.353515625, + "learning_rate": 0.0016233874323761963, + "loss": 1.3687, "step": 90500 }, { - "epoch": 1.183431952662722, - "grad_norm": 2.03125, - "learning_rate": 0.0019526627218934911, - "loss": 1.5671, + "epoch": 9.467332501040365, + "grad_norm": 0.7578125, + "learning_rate": 0.0016213066999583854, + "loss": 1.368, "step": 91000 }, { - "epoch": 1.1899343260290005, - "grad_norm": 0.5234375, - "learning_rate": 0.0019524026269588399, - "loss": 1.5649, + "epoch": 9.519350811485642, + "grad_norm": 0.30078125, + "learning_rate": 0.0016192259675405743, + "loss": 1.3716, "step": 91500 }, { - "epoch": 1.1964366993952793, - "grad_norm": 0.56640625, - "learning_rate": 0.0019521425320241888, - "loss": 1.5645, + "epoch": 9.57136912193092, + "grad_norm": 0.23828125, + "learning_rate": 0.0016171452351227634, + "loss": 1.3697, "step": 92000 }, { - "epoch": 1.2029390727615579, - "grad_norm": 0.5546875, - "learning_rate": 0.0019518824370895378, - "loss": 1.5646, + "epoch": 9.623387432376196, + "grad_norm": 0.271484375, + "learning_rate": 0.0016150645027049521, + "loss": 1.3692, "step": 92500 }, { - "epoch": 1.2094414461278367, - "grad_norm": 2.421875, - "learning_rate": 0.0019516223421548868, - "loss": 1.5711, + "epoch": 9.675405742821473, + "grad_norm": 0.470703125, + "learning_rate": 0.001612983770287141, + "loss": 1.3682, "step": 93000 }, { - "epoch": 1.2159438194941155, - "grad_norm": 10.25, - "learning_rate": 0.0019513622472202355, - "loss": 1.5674, + "epoch": 9.72742405326675, + "grad_norm": 0.41015625, + "learning_rate": 0.0016109030378693302, + "loss": 1.3673, "step": 93500 }, { - "epoch": 1.222446192860394, - "grad_norm": 1.2421875, - "learning_rate": 0.0019511021522855842, - "loss": 1.5693, + "epoch": 9.779442363712027, + "grad_norm": 0.25, + "learning_rate": 0.0016088223054515189, + "loss": 1.3663, "step": 94000 }, { - "epoch": 1.2289485662266728, - "grad_norm": 0.50390625, - "learning_rate": 0.0019508420573509332, - "loss": 1.5683, + "epoch": 9.831460674157304, + "grad_norm": 0.2578125, + "learning_rate": 0.0016067415730337078, + "loss": 1.3662, "step": 94500 }, { - "epoch": 1.2354509395929514, - "grad_norm": 0.9765625, - "learning_rate": 0.001950581962416282, - "loss": 1.5786, + "epoch": 9.88347898460258, + "grad_norm": 0.361328125, + "learning_rate": 0.001604660840615897, + "loss": 1.37, "step": 95000 }, { - "epoch": 1.2419533129592302, - "grad_norm": 0.6796875, - "learning_rate": 0.0019503218674816307, - "loss": 1.5745, + "epoch": 9.935497295047856, + "grad_norm": 0.2578125, + "learning_rate": 0.0016025801081980858, + "loss": 1.372, "step": 95500 }, { - "epoch": 1.2484556863255087, - "grad_norm": 0.49609375, - "learning_rate": 0.0019500617725469796, - "loss": 1.5704, + "epoch": 9.987515605493133, + "grad_norm": 0.30859375, + "learning_rate": 0.0016004993757802745, + "loss": 1.3699, "step": 96000 }, { - "epoch": 1.2549580596917875, - "grad_norm": 0.56640625, - "learning_rate": 0.0019498016776123286, - "loss": 1.5753, + "epoch": 10.0, + "eval_loss": 1.3671537637710571, + "eval_runtime": 2.0428, + "eval_samples_per_second": 489.53, + "eval_steps_per_second": 0.49, + "step": 96120 + }, + { + "epoch": 10.03953391593841, + "grad_norm": 0.2578125, + "learning_rate": 0.0015984186433624637, + "loss": 1.3686, "step": 96500 }, { - "epoch": 1.2614604330580663, - "grad_norm": 1.2265625, - "learning_rate": 0.0019495415826776775, - "loss": 1.5765, + "epoch": 10.091552226383687, + "grad_norm": 1.484375, + "learning_rate": 0.0015963379109446526, + "loss": 1.3682, "step": 97000 }, { - "epoch": 1.267962806424345, - "grad_norm": 0.63671875, - "learning_rate": 0.0019492814877430263, - "loss": 1.5718, + "epoch": 10.143570536828964, + "grad_norm": 0.369140625, + "learning_rate": 0.0015942571785268413, + "loss": 1.3659, "step": 97500 }, { - "epoch": 1.2744651797906235, - "grad_norm": 0.8515625, - "learning_rate": 0.001949021392808375, - "loss": 1.5716, + "epoch": 10.19558884727424, + "grad_norm": 0.263671875, + "learning_rate": 0.0015921764461090304, + "loss": 1.3653, "step": 98000 }, { - "epoch": 1.2809675531569022, - "grad_norm": 0.67578125, - "learning_rate": 0.001948761297873724, - "loss": 1.5655, + "epoch": 10.247607157719518, + "grad_norm": 0.26171875, + "learning_rate": 0.0015900957136912193, + "loss": 1.3668, "step": 98500 }, { - "epoch": 1.287469926523181, - "grad_norm": 1.0390625, - "learning_rate": 0.0019485012029390727, - "loss": 1.5646, + "epoch": 10.299625468164795, + "grad_norm": 0.2373046875, + "learning_rate": 0.0015880149812734085, + "loss": 1.3695, "step": 99000 }, { - "epoch": 1.2939722998894596, - "grad_norm": 0.52734375, - "learning_rate": 0.0019482411080044215, - "loss": 1.57, + "epoch": 10.35164377861007, + "grad_norm": 0.2392578125, + "learning_rate": 0.0015859342488555972, + "loss": 1.3673, "step": 99500 }, { - "epoch": 1.3004746732557384, - "grad_norm": 0.62109375, - "learning_rate": 0.0019479810130697706, - "loss": 1.5607, + "epoch": 10.403662089055347, + "grad_norm": 0.248046875, + "learning_rate": 0.001583853516437786, + "loss": 1.3669, "step": 100000 }, { - "epoch": 1.306977046622017, - "grad_norm": 0.85546875, - "learning_rate": 0.0019477209181351194, - "loss": 1.5576, + "epoch": 10.455680399500624, + "grad_norm": 0.3359375, + "learning_rate": 0.0015817727840199752, + "loss": 1.367, "step": 100500 }, { - "epoch": 1.3134794199882958, - "grad_norm": 0.63671875, - "learning_rate": 0.0019474608232004683, - "loss": 1.5718, + "epoch": 10.5076987099459, + "grad_norm": 0.330078125, + "learning_rate": 0.0015796920516021641, + "loss": 1.3668, "step": 101000 }, { - "epoch": 1.3199817933545743, - "grad_norm": 0.72265625, - "learning_rate": 0.001947200728265817, - "loss": 1.562, + "epoch": 10.559717020391178, + "grad_norm": 0.3125, + "learning_rate": 0.0015776113191843528, + "loss": 1.3654, "step": 101500 }, { - "epoch": 1.3264841667208531, - "grad_norm": 0.55078125, - "learning_rate": 0.0019469406333311658, - "loss": 1.5603, + "epoch": 10.611735330836455, + "grad_norm": 0.45703125, + "learning_rate": 0.001575530586766542, + "loss": 1.3655, "step": 102000 }, { - "epoch": 1.332986540087132, - "grad_norm": 0.546875, - "learning_rate": 0.0019466805383965148, - "loss": 1.5631, + "epoch": 10.663753641281732, + "grad_norm": 0.361328125, + "learning_rate": 0.0015734498543487309, + "loss": 1.3673, "step": 102500 }, { - "epoch": 1.3394889134534105, - "grad_norm": 0.51953125, - "learning_rate": 0.0019464204434618635, - "loss": 1.5616, + "epoch": 10.715771951727008, + "grad_norm": 0.2734375, + "learning_rate": 0.0015713691219309195, + "loss": 1.3651, "step": 103000 }, { - "epoch": 1.345991286819689, - "grad_norm": 0.69921875, - "learning_rate": 0.0019461603485272127, - "loss": 1.5611, + "epoch": 10.767790262172285, + "grad_norm": 0.28125, + "learning_rate": 0.0015692883895131087, + "loss": 1.3652, "step": 103500 }, { - "epoch": 1.3524936601859678, - "grad_norm": 0.5, - "learning_rate": 0.0019459002535925614, - "loss": 1.5634, + "epoch": 10.81980857261756, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015672076570952976, + "loss": 1.366, "step": 104000 }, { - "epoch": 1.3589960335522466, - "grad_norm": 0.609375, - "learning_rate": 0.0019456401586579102, - "loss": 1.5648, + "epoch": 10.871826883062838, + "grad_norm": 0.416015625, + "learning_rate": 0.0015651269246774865, + "loss": 1.3624, "step": 104500 }, { - "epoch": 1.3654984069185252, - "grad_norm": 0.5078125, - "learning_rate": 0.0019453800637232591, - "loss": 1.5611, + "epoch": 10.923845193508114, + "grad_norm": 0.94921875, + "learning_rate": 0.0015630461922596754, + "loss": 1.3605, "step": 105000 }, { - "epoch": 1.372000780284804, - "grad_norm": 0.72265625, - "learning_rate": 0.0019451199687886079, - "loss": 1.5615, + "epoch": 10.975863503953391, + "grad_norm": 0.74609375, + "learning_rate": 0.0015609654598418643, + "loss": 1.36, "step": 105500 }, { - "epoch": 1.3785031536510828, - "grad_norm": 0.78515625, - "learning_rate": 0.0019448598738539566, - "loss": 1.5628, + "epoch": 11.0, + "eval_loss": 1.3506468534469604, + "eval_runtime": 1.4359, + "eval_samples_per_second": 696.43, + "eval_steps_per_second": 0.696, + "step": 105732 + }, + { + "epoch": 11.027881814398668, + "grad_norm": 0.30078125, + "learning_rate": 0.0015588847274240535, + "loss": 1.3598, "step": 106000 }, { - "epoch": 1.3850055270173613, - "grad_norm": 0.63671875, - "learning_rate": 0.0019445997789193056, - "loss": 1.5608, + "epoch": 11.079900124843945, + "grad_norm": 0.2177734375, + "learning_rate": 0.0015568039950062422, + "loss": 1.3614, "step": 106500 }, { - "epoch": 1.39150790038364, - "grad_norm": 0.65625, - "learning_rate": 0.0019443396839846543, - "loss": 1.5609, + "epoch": 11.131918435289222, + "grad_norm": 0.35546875, + "learning_rate": 0.001554723262588431, + "loss": 1.3587, "step": 107000 }, { - "epoch": 1.3980102737499187, - "grad_norm": 0.6640625, - "learning_rate": 0.0019440795890500035, - "loss": 1.5579, + "epoch": 11.1839367457345, + "grad_norm": 0.2109375, + "learning_rate": 0.0015526425301706202, + "loss": 1.3585, "step": 107500 }, { - "epoch": 1.4045126471161975, - "grad_norm": 0.46484375, - "learning_rate": 0.0019438194941153522, - "loss": 1.5526, + "epoch": 11.235955056179776, + "grad_norm": 1.2890625, + "learning_rate": 0.0015505617977528091, + "loss": 1.3577, "step": 108000 }, { - "epoch": 1.411015020482476, - "grad_norm": 0.62890625, - "learning_rate": 0.001943559399180701, - "loss": 1.554, + "epoch": 11.287973366625051, + "grad_norm": 0.2109375, + "learning_rate": 0.0015484810653349978, + "loss": 1.3581, "step": 108500 }, { - "epoch": 1.4175173938487549, - "grad_norm": 1.4453125, - "learning_rate": 0.00194329930424605, - "loss": 1.5554, + "epoch": 11.339991677070328, + "grad_norm": 0.298828125, + "learning_rate": 0.001546400332917187, + "loss": 1.3574, "step": 109000 }, { - "epoch": 1.4240197672150334, - "grad_norm": 0.9375, - "learning_rate": 0.0019430392093113986, - "loss": 1.5518, + "epoch": 11.392009987515605, + "grad_norm": 0.330078125, + "learning_rate": 0.0015443196004993759, + "loss": 1.3586, "step": 109500 }, { - "epoch": 1.4305221405813122, - "grad_norm": 0.53125, - "learning_rate": 0.0019427791143767474, - "loss": 1.5562, + "epoch": 11.444028297960882, + "grad_norm": 0.828125, + "learning_rate": 0.0015422388680815646, + "loss": 1.3582, "step": 110000 }, { - "epoch": 1.4370245139475908, - "grad_norm": 0.46875, - "learning_rate": 0.0019425190194420963, - "loss": 1.5568, + "epoch": 11.496046608406159, + "grad_norm": 0.255859375, + "learning_rate": 0.0015401581356637537, + "loss": 1.3573, "step": 110500 }, { - "epoch": 1.4435268873138696, - "grad_norm": 0.53125, - "learning_rate": 0.0019422589245074453, - "loss": 1.5476, + "epoch": 11.548064918851436, + "grad_norm": 0.23828125, + "learning_rate": 0.0015380774032459426, + "loss": 1.3592, "step": 111000 }, { - "epoch": 1.4500292606801484, - "grad_norm": 0.51171875, - "learning_rate": 0.0019419988295727943, - "loss": 1.558, + "epoch": 11.600083229296713, + "grad_norm": 0.244140625, + "learning_rate": 0.0015359966708281315, + "loss": 1.3558, "step": 111500 }, { - "epoch": 1.456531634046427, - "grad_norm": 0.455078125, - "learning_rate": 0.001941738734638143, - "loss": 1.5478, + "epoch": 11.65210153974199, + "grad_norm": 0.2314453125, + "learning_rate": 0.0015339159384103204, + "loss": 1.3578, "step": 112000 }, { - "epoch": 1.4630340074127055, - "grad_norm": 0.66796875, - "learning_rate": 0.0019414786397034917, - "loss": 1.553, + "epoch": 11.704119850187267, + "grad_norm": 0.220703125, + "learning_rate": 0.0015318352059925093, + "loss": 1.3562, "step": 112500 }, { - "epoch": 1.4695363807789843, - "grad_norm": 1.5390625, - "learning_rate": 0.0019412185447688407, - "loss": 1.5621, + "epoch": 11.756138160632542, + "grad_norm": 0.3828125, + "learning_rate": 0.0015297544735746985, + "loss": 1.3575, "step": 113000 }, { - "epoch": 1.476038754145263, - "grad_norm": 0.76953125, - "learning_rate": 0.0019409584498341894, - "loss": 1.5583, + "epoch": 11.808156471077819, + "grad_norm": 0.30078125, + "learning_rate": 0.0015276737411568874, + "loss": 1.3568, "step": 113500 }, { - "epoch": 1.4825411275115417, - "grad_norm": 0.6328125, - "learning_rate": 0.0019406983548995382, - "loss": 1.5567, + "epoch": 11.860174781523096, + "grad_norm": 0.310546875, + "learning_rate": 0.001525593008739076, + "loss": 1.3611, "step": 114000 }, { - "epoch": 1.4890435008778204, - "grad_norm": 0.8828125, - "learning_rate": 0.0019404382599648873, - "loss": 1.5509, + "epoch": 11.912193091968373, + "grad_norm": 0.2236328125, + "learning_rate": 0.0015235122763212652, + "loss": 1.3598, "step": 114500 }, { - "epoch": 1.495545874244099, - "grad_norm": 0.8671875, - "learning_rate": 0.001940178165030236, - "loss": 1.5542, + "epoch": 11.96421140241365, + "grad_norm": 4.28125, + "learning_rate": 0.0015214315439034541, + "loss": 1.3598, "step": 115000 }, { - "epoch": 1.5020482476103778, - "grad_norm": 0.6796875, - "learning_rate": 0.001939918070095585, - "loss": 1.554, + "epoch": 12.0, + "eval_loss": 1.3485276699066162, + "eval_runtime": 1.4101, + "eval_samples_per_second": 709.157, + "eval_steps_per_second": 0.709, + "step": 115344 + }, + { + "epoch": 12.016229712858927, + "grad_norm": 0.26171875, + "learning_rate": 0.0015193508114856428, + "loss": 1.3563, "step": 115500 }, { - "epoch": 1.5085506209766564, - "grad_norm": 0.875, - "learning_rate": 0.0019396579751609338, - "loss": 1.5505, + "epoch": 12.068248023304204, + "grad_norm": 0.255859375, + "learning_rate": 0.001517270079067832, + "loss": 1.3552, "step": 116000 }, { - "epoch": 1.5150529943429352, - "grad_norm": 0.50390625, - "learning_rate": 0.0019393978802262825, - "loss": 1.5542, + "epoch": 12.12026633374948, + "grad_norm": 0.32421875, + "learning_rate": 0.0015151893466500209, + "loss": 1.3552, "step": 116500 }, { - "epoch": 1.521555367709214, - "grad_norm": 0.53125, - "learning_rate": 0.0019391377852916315, - "loss": 1.5487, + "epoch": 12.172284644194757, + "grad_norm": 0.353515625, + "learning_rate": 0.0015131086142322098, + "loss": 1.3543, "step": 117000 }, { - "epoch": 1.5280577410754925, - "grad_norm": 1.171875, - "learning_rate": 0.0019388776903569802, - "loss": 1.5535, + "epoch": 12.224302954640033, + "grad_norm": 0.248046875, + "learning_rate": 0.0015110278818143987, + "loss": 1.3534, "step": 117500 }, { - "epoch": 1.534560114441771, - "grad_norm": 0.66015625, - "learning_rate": 0.0019386175954223294, - "loss": 1.5524, + "epoch": 12.27632126508531, + "grad_norm": 0.337890625, + "learning_rate": 0.0015089471493965876, + "loss": 1.3541, "step": 118000 }, { - "epoch": 1.5410624878080499, - "grad_norm": 0.54296875, - "learning_rate": 0.0019383575004876781, - "loss": 1.548, + "epoch": 12.328339575530586, + "grad_norm": 0.228515625, + "learning_rate": 0.0015068664169787765, + "loss": 1.3542, "step": 118500 }, { - "epoch": 1.5475648611743287, - "grad_norm": 0.50390625, - "learning_rate": 0.0019380974055530269, - "loss": 1.5434, + "epoch": 12.380357885975863, + "grad_norm": 0.208984375, + "learning_rate": 0.0015047856845609654, + "loss": 1.3561, "step": 119000 }, { - "epoch": 1.5540672345406072, - "grad_norm": 0.88671875, - "learning_rate": 0.0019378373106183758, - "loss": 1.5461, + "epoch": 12.43237619642114, + "grad_norm": 0.251953125, + "learning_rate": 0.0015027049521431544, + "loss": 1.3544, "step": 119500 }, { - "epoch": 1.560569607906886, - "grad_norm": 0.5, - "learning_rate": 0.0019375772156837246, - "loss": 1.5469, + "epoch": 12.484394506866417, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015006242197253433, + "loss": 1.3561, "step": 120000 }, { - "epoch": 1.5670719812731648, - "grad_norm": 0.455078125, - "learning_rate": 0.0019373171207490733, - "loss": 1.5484, + "epoch": 12.536412817311694, + "grad_norm": 0.25390625, + "learning_rate": 0.0014985434873075324, + "loss": 1.3533, "step": 120500 }, { - "epoch": 1.5735743546394434, - "grad_norm": 0.95703125, - "learning_rate": 0.0019370570258144223, - "loss": 1.5412, + "epoch": 12.588431127756971, + "grad_norm": 0.34765625, + "learning_rate": 0.001496462754889721, + "loss": 1.3546, "step": 121000 }, { - "epoch": 1.580076728005722, - "grad_norm": 0.55859375, - "learning_rate": 0.001936796930879771, - "loss": 1.5426, + "epoch": 12.640449438202246, + "grad_norm": 0.341796875, + "learning_rate": 0.0014943820224719102, + "loss": 1.3548, "step": 121500 }, { - "epoch": 1.5865791013720008, - "grad_norm": 0.89453125, - "learning_rate": 0.0019365368359451202, - "loss": 1.5482, + "epoch": 12.692467748647523, + "grad_norm": 0.234375, + "learning_rate": 0.0014923012900540991, + "loss": 1.3531, "step": 122000 }, { - "epoch": 1.5930814747382795, - "grad_norm": 0.83203125, - "learning_rate": 0.001936276741010469, - "loss": 1.5487, + "epoch": 12.7444860590928, + "grad_norm": 0.298828125, + "learning_rate": 0.0014902205576362878, + "loss": 1.3518, "step": 122500 }, { - "epoch": 1.5995838481045581, - "grad_norm": 0.68359375, - "learning_rate": 0.0019360166460758177, - "loss": 1.5529, + "epoch": 12.796504369538077, + "grad_norm": 0.326171875, + "learning_rate": 0.001488139825218477, + "loss": 1.3512, "step": 123000 }, { - "epoch": 1.606086221470837, - "grad_norm": 0.73046875, - "learning_rate": 0.0019357565511411666, - "loss": 1.5546, + "epoch": 12.848522679983354, + "grad_norm": 0.546875, + "learning_rate": 0.0014860590928006659, + "loss": 1.3506, "step": 123500 }, { - "epoch": 1.6125885948371157, - "grad_norm": 0.54296875, - "learning_rate": 0.0019354964562065154, - "loss": 1.544, + "epoch": 12.900540990428631, + "grad_norm": 0.28515625, + "learning_rate": 0.0014839783603828548, + "loss": 1.3539, "step": 124000 }, { - "epoch": 1.6190909682033943, - "grad_norm": 0.68359375, - "learning_rate": 0.001935236361271864, - "loss": 1.5507, + "epoch": 12.952559300873908, + "grad_norm": 0.240234375, + "learning_rate": 0.0014818976279650437, + "loss": 1.3581, "step": 124500 }, { - "epoch": 1.6255933415696728, - "grad_norm": 0.55078125, - "learning_rate": 0.001934976266337213, - "loss": 1.5515, + "epoch": 13.0, + "eval_loss": 1.3496302366256714, + "eval_runtime": 1.4282, + "eval_samples_per_second": 700.187, + "eval_steps_per_second": 0.7, + "step": 124956 + }, + { + "epoch": 13.004577611319185, + "grad_norm": 0.2578125, + "learning_rate": 0.0014798168955472326, + "loss": 1.3558, "step": 125000 }, { - "epoch": 1.6320957149359516, - "grad_norm": 0.98828125, - "learning_rate": 0.001934716171402562, - "loss": 1.5502, + "epoch": 13.056595921764462, + "grad_norm": 0.5390625, + "learning_rate": 0.0014777361631294215, + "loss": 1.3526, "step": 125500 }, { - "epoch": 1.6385980883022304, - "grad_norm": 1.3046875, - "learning_rate": 0.001934456076467911, - "loss": 1.5427, + "epoch": 13.108614232209737, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014756554307116107, + "loss": 1.3508, "step": 126000 }, { - "epoch": 1.645100461668509, - "grad_norm": 0.515625, - "learning_rate": 0.0019341959815332597, - "loss": 1.5471, + "epoch": 13.160632542655014, + "grad_norm": 0.361328125, + "learning_rate": 0.0014735746982937994, + "loss": 1.3507, "step": 126500 }, { - "epoch": 1.6516028350347876, - "grad_norm": 0.466796875, - "learning_rate": 0.0019339358865986085, - "loss": 1.5405, + "epoch": 13.21265085310029, + "grad_norm": 0.478515625, + "learning_rate": 0.0014714939658759883, + "loss": 1.3525, "step": 127000 }, { - "epoch": 1.6581052084010663, - "grad_norm": 0.49609375, - "learning_rate": 0.0019336757916639574, - "loss": 1.5431, + "epoch": 13.264669163545568, + "grad_norm": 0.2490234375, + "learning_rate": 0.0014694132334581774, + "loss": 1.3525, "step": 127500 }, { - "epoch": 1.6646075817673451, - "grad_norm": 1.7421875, - "learning_rate": 0.0019334156967293062, - "loss": 1.5484, + "epoch": 13.316687473990845, + "grad_norm": 0.25, + "learning_rate": 0.0014673325010403661, + "loss": 1.3529, "step": 128000 }, { - "epoch": 1.6711099551336237, - "grad_norm": 0.71875, - "learning_rate": 0.001933155601794655, - "loss": 1.5474, + "epoch": 13.368705784436122, + "grad_norm": 0.318359375, + "learning_rate": 0.0014652517686225552, + "loss": 1.3518, "step": 128500 }, { - "epoch": 1.6776123284999025, - "grad_norm": 0.65625, - "learning_rate": 0.001932895506860004, - "loss": 1.544, + "epoch": 13.420724094881399, + "grad_norm": 0.228515625, + "learning_rate": 0.0014631710362047442, + "loss": 1.3534, "step": 129000 }, { - "epoch": 1.6841147018661813, - "grad_norm": 0.62890625, - "learning_rate": 0.0019326354119253528, - "loss": 1.5443, + "epoch": 13.472742405326676, + "grad_norm": 0.23828125, + "learning_rate": 0.001461090303786933, + "loss": 1.3513, "step": 129500 }, { - "epoch": 1.6906170752324599, - "grad_norm": 0.5078125, - "learning_rate": 0.0019323753169907018, - "loss": 1.5468, + "epoch": 13.524760715771952, + "grad_norm": 0.2421875, + "learning_rate": 0.001459009571369122, + "loss": 1.3516, "step": 130000 }, { - "epoch": 1.6971194485987384, - "grad_norm": 0.9453125, - "learning_rate": 0.0019321152220560505, - "loss": 1.5416, + "epoch": 13.576779026217228, + "grad_norm": 0.21484375, + "learning_rate": 0.001456928838951311, + "loss": 1.3521, "step": 130500 }, { - "epoch": 1.7036218219650172, - "grad_norm": 0.515625, - "learning_rate": 0.0019318551271213992, - "loss": 1.5398, + "epoch": 13.628797336662505, + "grad_norm": 0.255859375, + "learning_rate": 0.0014548481065334998, + "loss": 1.353, "step": 131000 }, { - "epoch": 1.710124195331296, - "grad_norm": 0.55859375, - "learning_rate": 0.0019315950321867482, - "loss": 1.5375, + "epoch": 13.680815647107782, + "grad_norm": 0.341796875, + "learning_rate": 0.0014527673741156887, + "loss": 1.3525, "step": 131500 }, { - "epoch": 1.7166265686975746, - "grad_norm": 0.48046875, - "learning_rate": 0.001931334937252097, - "loss": 1.5381, + "epoch": 13.732833957553058, + "grad_norm": 0.287109375, + "learning_rate": 0.0014506866416978776, + "loss": 1.3501, "step": 132000 }, { - "epoch": 1.7231289420638531, - "grad_norm": 0.58984375, - "learning_rate": 0.0019310748423174461, - "loss": 1.5443, + "epoch": 13.784852267998335, + "grad_norm": 0.30859375, + "learning_rate": 0.0014486059092800666, + "loss": 1.3495, "step": 132500 }, { - "epoch": 1.7296313154301322, - "grad_norm": 0.58984375, - "learning_rate": 0.0019308147473827949, - "loss": 1.5485, + "epoch": 13.836870578443612, + "grad_norm": 0.28125, + "learning_rate": 0.0014465251768622557, + "loss": 1.3513, "step": 133000 }, { - "epoch": 1.7361336887964107, - "grad_norm": 0.56640625, - "learning_rate": 0.0019305546524481436, - "loss": 1.5439, + "epoch": 13.88888888888889, + "grad_norm": 0.353515625, + "learning_rate": 0.0014444444444444444, + "loss": 1.3516, "step": 133500 }, { - "epoch": 1.7426360621626893, - "grad_norm": 0.60546875, - "learning_rate": 0.0019302945575134926, - "loss": 1.5427, + "epoch": 13.940907199334166, + "grad_norm": 0.4921875, + "learning_rate": 0.0014423637120266333, + "loss": 1.3512, "step": 134000 }, { - "epoch": 1.749138435528968, - "grad_norm": 1.890625, - "learning_rate": 0.0019300344625788413, - "loss": 1.5403, + "epoch": 13.992925509779443, + "grad_norm": 0.287109375, + "learning_rate": 0.0014402829796088224, + "loss": 1.3493, "step": 134500 }, { - "epoch": 1.7556408088952469, - "grad_norm": 0.490234375, - "learning_rate": 0.00192977436764419, - "loss": 1.5394, + "epoch": 14.0, + "eval_loss": 1.3465324640274048, + "eval_runtime": 1.419, + "eval_samples_per_second": 704.714, + "eval_steps_per_second": 0.705, + "step": 134568 + }, + { + "epoch": 14.044943820224718, + "grad_norm": 0.400390625, + "learning_rate": 0.0014382022471910111, + "loss": 1.3478, "step": 135000 }, { - "epoch": 1.7621431822615254, - "grad_norm": 0.59765625, - "learning_rate": 0.001929514272709539, - "loss": 1.5378, + "epoch": 14.096962130669995, + "grad_norm": 0.203125, + "learning_rate": 0.0014361215147732003, + "loss": 1.3485, "step": 135500 }, { - "epoch": 1.768645555627804, - "grad_norm": 0.578125, - "learning_rate": 0.0019292541777748877, - "loss": 1.5362, + "epoch": 14.148980441115272, + "grad_norm": 0.322265625, + "learning_rate": 0.0014340407823553892, + "loss": 1.3473, "step": 136000 }, { - "epoch": 1.7751479289940828, - "grad_norm": 1.46875, - "learning_rate": 0.001928994082840237, - "loss": 1.5378, + "epoch": 14.20099875156055, + "grad_norm": 0.578125, + "learning_rate": 0.001431960049937578, + "loss": 1.3469, "step": 136500 }, { - "epoch": 1.7816503023603616, - "grad_norm": 1.1953125, - "learning_rate": 0.0019287339879055856, - "loss": 1.5368, + "epoch": 14.253017062005826, + "grad_norm": 0.205078125, + "learning_rate": 0.001429879317519767, + "loss": 1.3488, "step": 137000 }, { - "epoch": 1.7881526757266402, - "grad_norm": 0.5859375, - "learning_rate": 0.0019284738929709344, - "loss": 1.5322, + "epoch": 14.305035372451103, + "grad_norm": 0.28125, + "learning_rate": 0.001427798585101956, + "loss": 1.3482, "step": 137500 }, { - "epoch": 1.794655049092919, - "grad_norm": 0.6328125, - "learning_rate": 0.0019282137980362833, - "loss": 1.5335, + "epoch": 14.35705368289638, + "grad_norm": 0.85546875, + "learning_rate": 0.0014257178526841448, + "loss": 1.3485, "step": 138000 }, { - "epoch": 1.8011574224591977, - "grad_norm": 0.515625, - "learning_rate": 0.001927953703101632, - "loss": 1.5356, + "epoch": 14.409071993341657, + "grad_norm": 0.48828125, + "learning_rate": 0.001423637120266334, + "loss": 1.3479, "step": 138500 }, { - "epoch": 1.8076597958254763, - "grad_norm": 1.5234375, - "learning_rate": 0.0019276936081669808, - "loss": 1.5369, + "epoch": 14.461090303786934, + "grad_norm": 0.208984375, + "learning_rate": 0.0014215563878485226, + "loss": 1.3483, "step": 139000 }, { - "epoch": 1.8141621691917549, - "grad_norm": 0.64453125, - "learning_rate": 0.0019274335132323298, - "loss": 1.5375, + "epoch": 14.513108614232209, + "grad_norm": 0.4140625, + "learning_rate": 0.0014194756554307116, + "loss": 1.3471, "step": 139500 }, { - "epoch": 1.8206645425580337, - "grad_norm": 0.6640625, - "learning_rate": 0.0019271734182976787, - "loss": 1.5313, + "epoch": 14.565126924677486, + "grad_norm": 0.30078125, + "learning_rate": 0.0014173949230129007, + "loss": 1.3475, "step": 140000 }, { - "epoch": 1.8271669159243125, - "grad_norm": 0.55078125, - "learning_rate": 0.0019269133233630277, - "loss": 1.5302, + "epoch": 14.617145235122763, + "grad_norm": 0.4453125, + "learning_rate": 0.0014153141905950894, + "loss": 1.3467, "step": 140500 }, { - "epoch": 1.833669289290591, - "grad_norm": 0.5078125, - "learning_rate": 0.0019266532284283764, - "loss": 1.5262, + "epoch": 14.66916354556804, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014132334581772783, + "loss": 1.3462, "step": 141000 }, { - "epoch": 1.8401716626568696, - "grad_norm": 0.443359375, - "learning_rate": 0.0019263931334937252, - "loss": 1.5323, + "epoch": 14.721181856013317, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014111527257594674, + "loss": 1.3456, "step": 141500 }, { - "epoch": 1.8466740360231486, - "grad_norm": 0.765625, - "learning_rate": 0.0019261330385590741, - "loss": 1.5322, + "epoch": 14.773200166458594, + "grad_norm": 0.259765625, + "learning_rate": 0.0014090719933416563, + "loss": 1.3455, "step": 142000 }, { - "epoch": 1.8531764093894272, - "grad_norm": 0.4921875, - "learning_rate": 0.0019258729436244229, - "loss": 1.5286, + "epoch": 14.82521847690387, + "grad_norm": 0.2578125, + "learning_rate": 0.0014069912609238453, + "loss": 1.3447, "step": 142500 }, { - "epoch": 1.8596787827557058, - "grad_norm": 0.55859375, - "learning_rate": 0.0019256128486897716, - "loss": 1.5282, + "epoch": 14.877236787349148, + "grad_norm": 2.359375, + "learning_rate": 0.0014049105285060342, + "loss": 1.3454, "step": 143000 }, { - "epoch": 1.8661811561219845, - "grad_norm": 0.5234375, - "learning_rate": 0.0019253527537551208, - "loss": 1.5302, + "epoch": 14.929255097794425, + "grad_norm": 0.4296875, + "learning_rate": 0.001402829796088223, + "loss": 1.3461, "step": 143500 }, { - "epoch": 1.8726835294882633, - "grad_norm": 1.15625, - "learning_rate": 0.0019250926588204695, - "loss": 1.5259, + "epoch": 14.9812734082397, + "grad_norm": 0.33203125, + "learning_rate": 0.001400749063670412, + "loss": 1.3454, "step": 144000 }, { - "epoch": 1.879185902854542, - "grad_norm": 2.078125, - "learning_rate": 0.0019248325638858185, - "loss": 1.5295, + "epoch": 15.0, + "eval_loss": 1.3407135009765625, + "eval_runtime": 1.4239, + "eval_samples_per_second": 702.319, + "eval_steps_per_second": 0.702, + "step": 144180 + }, + { + "epoch": 15.033291718684977, + "grad_norm": 0.63671875, + "learning_rate": 0.001398668331252601, + "loss": 1.3444, "step": 144500 }, { - "epoch": 1.8856882762208205, - "grad_norm": 0.75, - "learning_rate": 0.0019245724689511672, - "loss": 1.5289, + "epoch": 15.085310029130254, + "grad_norm": 0.275390625, + "learning_rate": 0.0013965875988347898, + "loss": 1.3444, "step": 145000 }, { - "epoch": 1.8921906495870993, - "grad_norm": 0.5234375, - "learning_rate": 0.001924312374016516, - "loss": 1.5319, + "epoch": 15.13732833957553, + "grad_norm": 0.56640625, + "learning_rate": 0.001394506866416979, + "loss": 1.3437, "step": 145500 }, { - "epoch": 1.898693022953378, - "grad_norm": 0.462890625, - "learning_rate": 0.001924052279081865, - "loss": 1.5349, + "epoch": 15.189346650020807, + "grad_norm": 0.56640625, + "learning_rate": 0.0013924261339991677, + "loss": 1.3473, "step": 146000 }, { - "epoch": 1.9051953963196566, - "grad_norm": 3.109375, - "learning_rate": 0.0019237921841472137, - "loss": 1.5369, + "epoch": 15.241364960466084, + "grad_norm": 0.201171875, + "learning_rate": 0.0013903454015813566, + "loss": 1.3474, "step": 146500 }, { - "epoch": 1.9116977696859354, - "grad_norm": 0.58203125, - "learning_rate": 0.0019235320892125628, - "loss": 1.5371, + "epoch": 15.293383270911361, + "grad_norm": 0.369140625, + "learning_rate": 0.0013882646691635457, + "loss": 1.3472, "step": 147000 }, { - "epoch": 1.9182001430522142, - "grad_norm": 0.68359375, - "learning_rate": 0.0019232719942779116, - "loss": 1.5371, + "epoch": 15.345401581356638, + "grad_norm": 0.279296875, + "learning_rate": 0.0013861839367457346, + "loss": 1.3457, "step": 147500 }, { - "epoch": 1.9247025164184928, - "grad_norm": 1.265625, - "learning_rate": 0.0019230118993432603, - "loss": 1.531, + "epoch": 15.397419891801913, + "grad_norm": 0.314453125, + "learning_rate": 0.0013841032043279233, + "loss": 1.3438, "step": 148000 }, { - "epoch": 1.9312048897847713, - "grad_norm": 0.56640625, - "learning_rate": 0.0019227518044086093, - "loss": 1.5355, + "epoch": 15.44943820224719, + "grad_norm": 0.283203125, + "learning_rate": 0.0013820224719101124, + "loss": 1.3452, "step": 148500 }, { - "epoch": 1.9377072631510501, - "grad_norm": 0.796875, - "learning_rate": 0.001922491709473958, - "loss": 1.5308, + "epoch": 15.501456512692467, + "grad_norm": 0.236328125, + "learning_rate": 0.0013799417394923014, + "loss": 1.3461, "step": 149000 }, { - "epoch": 1.944209636517329, - "grad_norm": 0.52734375, - "learning_rate": 0.0019222316145393068, - "loss": 1.533, + "epoch": 15.553474823137744, + "grad_norm": 0.6640625, + "learning_rate": 0.0013778610070744903, + "loss": 1.344, "step": 149500 }, { - "epoch": 1.9507120098836075, - "grad_norm": 0.6953125, - "learning_rate": 0.0019219715196046557, - "loss": 1.5318, + "epoch": 15.605493133583021, + "grad_norm": 0.287109375, + "learning_rate": 0.0013757802746566792, + "loss": 1.3437, "step": 150000 }, { - "epoch": 1.957214383249886, - "grad_norm": 0.6953125, - "learning_rate": 0.0019217114246700044, - "loss": 1.5307, + "epoch": 15.657511444028298, + "grad_norm": 0.255859375, + "learning_rate": 0.001373699542238868, + "loss": 1.3468, "step": 150500 }, { - "epoch": 1.9637167566161648, - "grad_norm": 0.66015625, - "learning_rate": 0.0019214513297353536, - "loss": 1.5297, + "epoch": 15.709529754473575, + "grad_norm": 0.337890625, + "learning_rate": 0.0013716188098210572, + "loss": 1.3439, "step": 151000 }, { - "epoch": 1.9702191299824436, - "grad_norm": 1.046875, - "learning_rate": 0.0019211912348007024, - "loss": 1.5264, + "epoch": 15.761548064918852, + "grad_norm": 0.81640625, + "learning_rate": 0.001369538077403246, + "loss": 1.3435, "step": 151500 }, { - "epoch": 1.9767215033487222, - "grad_norm": 0.59375, - "learning_rate": 0.001920931139866051, - "loss": 1.5289, + "epoch": 15.813566375364129, + "grad_norm": 1.5078125, + "learning_rate": 0.0013674573449854348, + "loss": 1.3463, "step": 152000 }, { - "epoch": 1.983223876715001, - "grad_norm": 0.58984375, - "learning_rate": 0.0019206710449314, - "loss": 1.5292, + "epoch": 15.865584685809406, + "grad_norm": 0.2392578125, + "learning_rate": 0.001365376612567624, + "loss": 1.3467, "step": 152500 }, { - "epoch": 1.9897262500812798, - "grad_norm": 0.6015625, - "learning_rate": 0.0019204109499967488, - "loss": 1.5252, + "epoch": 15.917602996254681, + "grad_norm": 1.1015625, + "learning_rate": 0.0013632958801498127, + "loss": 1.3462, "step": 153000 }, { - "epoch": 1.9962286234475584, - "grad_norm": 0.5546875, - "learning_rate": 0.0019201508550620975, - "loss": 1.5272, + "epoch": 15.969621306699958, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013612151477320016, + "loss": 1.3461, "step": 153500 }, { - "epoch": 2.0, - "eval_loss": 1.502743124961853, - "eval_runtime": 7.2048, - "eval_samples_per_second": 138.797, - "eval_steps_per_second": 1.11, - "step": 153790 + "epoch": 16.0, + "eval_loss": 1.3369859457015991, + "eval_runtime": 1.4272, + "eval_samples_per_second": 700.674, + "eval_steps_per_second": 0.701, + "step": 153792 }, { - "epoch": 2.002730996813837, - "grad_norm": 0.6953125, - "learning_rate": 0.0019198907601274465, - "loss": 1.523, + "epoch": 16.021639617145237, + "grad_norm": 0.2294921875, + "learning_rate": 0.0013591344153141907, + "loss": 1.3446, "step": 154000 }, { - "epoch": 2.009233370180116, - "grad_norm": 0.4375, - "learning_rate": 0.0019196306651927955, - "loss": 1.5192, + "epoch": 16.073657927590514, + "grad_norm": 0.2890625, + "learning_rate": 0.0013570536828963796, + "loss": 1.3422, "step": 154500 }, { - "epoch": 2.0157357435463945, - "grad_norm": 0.515625, - "learning_rate": 0.0019193705702581444, - "loss": 1.5178, + "epoch": 16.125676238035787, + "grad_norm": 0.27734375, + "learning_rate": 0.0013549729504785683, + "loss": 1.3409, "step": 155000 }, { - "epoch": 2.022238116912673, - "grad_norm": 0.73828125, - "learning_rate": 0.0019191104753234932, - "loss": 1.5219, + "epoch": 16.177694548481064, + "grad_norm": 0.6796875, + "learning_rate": 0.0013528922180607575, + "loss": 1.3428, "step": 155500 }, { - "epoch": 2.0287404902789516, - "grad_norm": 0.58203125, - "learning_rate": 0.001918850380388842, - "loss": 1.5232, + "epoch": 16.22971285892634, + "grad_norm": 0.66796875, + "learning_rate": 0.0013508114856429464, + "loss": 1.3443, "step": 156000 }, { - "epoch": 2.0352428636452307, - "grad_norm": 0.439453125, - "learning_rate": 0.0019185902854541908, - "loss": 1.5235, + "epoch": 16.281731169371618, + "grad_norm": 0.376953125, + "learning_rate": 0.001348730753225135, + "loss": 1.3423, "step": 156500 }, { - "epoch": 2.0417452370115092, - "grad_norm": 0.478515625, - "learning_rate": 0.0019183301905195396, - "loss": 1.5259, + "epoch": 16.333749479816895, + "grad_norm": 0.486328125, + "learning_rate": 0.0013466500208073242, + "loss": 1.3408, "step": 157000 }, { - "epoch": 2.048247610377788, - "grad_norm": 0.458984375, - "learning_rate": 0.0019180700955848883, - "loss": 1.5214, + "epoch": 16.38576779026217, + "grad_norm": 0.55859375, + "learning_rate": 0.0013445692883895131, + "loss": 1.3421, "step": 157500 }, { - "epoch": 2.0547499837440664, - "grad_norm": 0.5390625, - "learning_rate": 0.0019178100006502375, - "loss": 1.5198, + "epoch": 16.43778610070745, + "grad_norm": 0.443359375, + "learning_rate": 0.0013424885559717022, + "loss": 1.3424, "step": 158000 }, { - "epoch": 2.0612523571103454, - "grad_norm": 0.5625, - "learning_rate": 0.0019175499057155862, - "loss": 1.52, + "epoch": 16.489804411152726, + "grad_norm": 0.2734375, + "learning_rate": 0.001340407823553891, + "loss": 1.3412, "step": 158500 }, { - "epoch": 2.067754730476624, - "grad_norm": 0.48828125, - "learning_rate": 0.0019172898107809352, - "loss": 1.5229, + "epoch": 16.541822721598002, + "grad_norm": 0.46875, + "learning_rate": 0.0013383270911360799, + "loss": 1.3419, "step": 159000 }, { - "epoch": 2.0742571038429025, - "grad_norm": 0.5703125, - "learning_rate": 0.001917029715846284, - "loss": 1.5198, + "epoch": 16.59384103204328, + "grad_norm": 0.267578125, + "learning_rate": 0.001336246358718269, + "loss": 1.3415, "step": 159500 }, { - "epoch": 2.0807594772091815, - "grad_norm": 0.4765625, - "learning_rate": 0.0019167696209116327, - "loss": 1.525, + "epoch": 16.645859342488556, + "grad_norm": 5.65625, + "learning_rate": 0.001334165626300458, + "loss": 1.3417, "step": 160000 }, { - "epoch": 2.08726185057546, - "grad_norm": 0.51171875, - "learning_rate": 0.0019165095259769816, - "loss": 1.5199, + "epoch": 16.697877652933833, + "grad_norm": 0.181640625, + "learning_rate": 0.0013320848938826466, + "loss": 1.3405, "step": 160500 }, { - "epoch": 2.0937642239417387, - "grad_norm": 0.65625, - "learning_rate": 0.0019162494310423304, - "loss": 1.5194, + "epoch": 16.74989596337911, + "grad_norm": 0.298828125, + "learning_rate": 0.0013300041614648357, + "loss": 1.339, "step": 161000 }, { - "epoch": 2.1002665973080172, - "grad_norm": 0.625, - "learning_rate": 0.0019159893361076796, - "loss": 1.5146, + "epoch": 16.801914273824387, + "grad_norm": 0.609375, + "learning_rate": 0.0013279234290470246, + "loss": 1.3415, "step": 161500 }, { - "epoch": 2.1067689706742962, - "grad_norm": 0.6796875, - "learning_rate": 0.0019157292411730283, - "loss": 1.5195, + "epoch": 16.853932584269664, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013258426966292133, + "loss": 1.341, "step": 162000 }, { - "epoch": 2.113271344040575, - "grad_norm": 0.6484375, - "learning_rate": 0.001915469146238377, - "loss": 1.5237, + "epoch": 16.90595089471494, + "grad_norm": 0.330078125, + "learning_rate": 0.0013237619642114025, + "loss": 1.3424, "step": 162500 }, { - "epoch": 2.1197737174068534, - "grad_norm": 0.5, - "learning_rate": 0.001915209051303726, - "loss": 1.5214, + "epoch": 16.957969205160218, + "grad_norm": 0.25390625, + "learning_rate": 0.0013216812317935914, + "loss": 1.3479, "step": 163000 }, { - "epoch": 2.1262760907731324, - "grad_norm": 2.1875, - "learning_rate": 0.0019149489563690747, - "loss": 1.517, + "epoch": 17.0, + "eval_loss": 1.339566707611084, + "eval_runtime": 1.4145, + "eval_samples_per_second": 706.982, + "eval_steps_per_second": 0.707, + "step": 163404 + }, + { + "epoch": 17.00998751560549, + "grad_norm": 0.43359375, + "learning_rate": 0.0013196004993757803, + "loss": 1.3445, "step": 163500 }, { - "epoch": 2.132778464139411, - "grad_norm": 0.83203125, - "learning_rate": 0.0019146888614344235, - "loss": 1.5163, + "epoch": 17.06200582605077, + "grad_norm": 0.314453125, + "learning_rate": 0.0013175197669579692, + "loss": 1.3435, "step": 164000 }, { - "epoch": 2.1392808375056895, - "grad_norm": 0.515625, - "learning_rate": 0.0019144287664997724, - "loss": 1.5162, + "epoch": 17.114024136496045, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013154390345401581, + "loss": 1.3448, "step": 164500 }, { - "epoch": 2.145783210871968, - "grad_norm": 0.75390625, - "learning_rate": 0.0019141686715651212, - "loss": 1.5184, + "epoch": 17.166042446941322, + "grad_norm": 0.294921875, + "learning_rate": 0.0013133583021223473, + "loss": 1.3436, "step": 165000 }, { - "epoch": 2.152285584238247, - "grad_norm": 0.609375, - "learning_rate": 0.0019139085766304703, - "loss": 1.5204, + "epoch": 17.2180607573866, + "grad_norm": 0.275390625, + "learning_rate": 0.001311277569704536, + "loss": 1.3445, "step": 165500 }, { - "epoch": 2.1587879576045257, - "grad_norm": 0.77734375, - "learning_rate": 0.001913648481695819, - "loss": 1.5225, + "epoch": 17.270079067831876, + "grad_norm": 1.7890625, + "learning_rate": 0.0013091968372867249, + "loss": 1.343, "step": 166000 }, { - "epoch": 2.1652903309708043, - "grad_norm": 0.84375, - "learning_rate": 0.0019133883867611678, - "loss": 1.5205, + "epoch": 17.322097378277153, + "grad_norm": 0.37109375, + "learning_rate": 0.001307116104868914, + "loss": 1.3429, "step": 166500 }, { - "epoch": 2.171792704337083, - "grad_norm": 0.55859375, - "learning_rate": 0.0019131282918265168, - "loss": 1.5217, + "epoch": 17.37411568872243, + "grad_norm": 0.240234375, + "learning_rate": 0.001305035372451103, + "loss": 1.342, "step": 167000 }, { - "epoch": 2.178295077703362, - "grad_norm": 0.609375, - "learning_rate": 0.0019128681968918655, - "loss": 1.5178, + "epoch": 17.426133999167707, + "grad_norm": 2.875, + "learning_rate": 0.0013029546400332916, + "loss": 1.3429, "step": 167500 }, { - "epoch": 2.1847974510696404, - "grad_norm": 0.515625, - "learning_rate": 0.0019126081019572143, - "loss": 1.5149, + "epoch": 17.478152309612984, + "grad_norm": 0.310546875, + "learning_rate": 0.0013008739076154807, + "loss": 1.3424, "step": 168000 }, { - "epoch": 2.191299824435919, - "grad_norm": 0.71484375, - "learning_rate": 0.0019123480070225632, - "loss": 1.516, + "epoch": 17.53017062005826, + "grad_norm": 0.4453125, + "learning_rate": 0.0012987931751976696, + "loss": 1.3422, "step": 168500 }, { - "epoch": 2.197802197802198, - "grad_norm": 0.5625, - "learning_rate": 0.0019120879120879122, - "loss": 1.5138, + "epoch": 17.582188930503538, + "grad_norm": 0.33203125, + "learning_rate": 0.0012967124427798583, + "loss": 1.3419, "step": 169000 }, { - "epoch": 2.2043045711684766, - "grad_norm": 0.58203125, - "learning_rate": 0.0019118278171532611, - "loss": 1.5141, + "epoch": 17.634207240948815, + "grad_norm": 0.3515625, + "learning_rate": 0.0012946317103620475, + "loss": 1.3419, "step": 169500 }, { - "epoch": 2.210806944534755, - "grad_norm": 0.63671875, - "learning_rate": 0.0019115677222186099, - "loss": 1.5111, + "epoch": 17.68622555139409, + "grad_norm": 0.21875, + "learning_rate": 0.0012925509779442364, + "loss": 1.3411, "step": 170000 }, { - "epoch": 2.2173093179010337, - "grad_norm": 0.875, - "learning_rate": 0.0019113076272839586, - "loss": 1.5136, + "epoch": 17.73824386183937, + "grad_norm": 0.240234375, + "learning_rate": 0.0012904702455264253, + "loss": 1.3409, "step": 170500 }, { - "epoch": 2.2238116912673127, - "grad_norm": 0.53515625, - "learning_rate": 0.0019110475323493076, - "loss": 1.5115, + "epoch": 17.790262172284645, + "grad_norm": 0.26953125, + "learning_rate": 0.0012883895131086142, + "loss": 1.3429, "step": 171000 }, { - "epoch": 2.2303140646335913, - "grad_norm": 0.55859375, - "learning_rate": 0.0019107874374146563, - "loss": 1.5159, + "epoch": 17.842280482729922, + "grad_norm": 0.28515625, + "learning_rate": 0.0012863087806908031, + "loss": 1.3426, "step": 171500 }, { - "epoch": 2.23681643799987, - "grad_norm": 0.57421875, - "learning_rate": 0.001910527342480005, - "loss": 1.514, + "epoch": 17.8942987931752, + "grad_norm": 0.248046875, + "learning_rate": 0.0012842280482729923, + "loss": 1.342, "step": 172000 }, { - "epoch": 2.243318811366149, - "grad_norm": 0.48046875, - "learning_rate": 0.0019102672475453542, - "loss": 1.5094, + "epoch": 17.946317103620473, + "grad_norm": 0.392578125, + "learning_rate": 0.0012821473158551812, + "loss": 1.3418, "step": 172500 }, { - "epoch": 2.2498211847324274, - "grad_norm": 0.498046875, - "learning_rate": 0.001910007152610703, - "loss": 1.5128, + "epoch": 17.99833541406575, + "grad_norm": 0.30078125, + "learning_rate": 0.0012800665834373699, + "loss": 1.3429, "step": 173000 }, { - "epoch": 2.256323558098706, - "grad_norm": 1.265625, - "learning_rate": 0.001909747057676052, - "loss": 1.5091, + "epoch": 18.0, + "eval_loss": 1.3371888399124146, + "eval_runtime": 1.4206, + "eval_samples_per_second": 703.942, + "eval_steps_per_second": 0.704, + "step": 173016 + }, + { + "epoch": 18.050353724511027, + "grad_norm": 0.51953125, + "learning_rate": 0.001277985851019559, + "loss": 1.3399, "step": 173500 }, { - "epoch": 2.2628259314649846, - "grad_norm": 0.86328125, - "learning_rate": 0.0019094869627414007, - "loss": 1.5094, + "epoch": 18.102372034956304, + "grad_norm": 0.26953125, + "learning_rate": 0.001275905118601748, + "loss": 1.3414, "step": 174000 }, { - "epoch": 2.2693283048312636, - "grad_norm": 0.51953125, - "learning_rate": 0.0019092268678067494, - "loss": 1.5072, + "epoch": 18.15439034540158, + "grad_norm": 0.2431640625, + "learning_rate": 0.0012738243861839366, + "loss": 1.3425, "step": 174500 }, { - "epoch": 2.275830678197542, - "grad_norm": 0.625, - "learning_rate": 0.0019089667728720984, - "loss": 1.5126, + "epoch": 18.206408655846857, + "grad_norm": 0.255859375, + "learning_rate": 0.0012717436537661257, + "loss": 1.339, "step": 175000 }, { - "epoch": 2.2823330515638207, - "grad_norm": 0.53125, - "learning_rate": 0.001908706677937447, - "loss": 1.5113, + "epoch": 18.258426966292134, + "grad_norm": 0.19921875, + "learning_rate": 0.0012696629213483147, + "loss": 1.3403, "step": 175500 }, { - "epoch": 2.2888354249300997, - "grad_norm": 0.64453125, - "learning_rate": 0.0019084465830027963, - "loss": 1.5066, + "epoch": 18.31044527673741, + "grad_norm": 0.328125, + "learning_rate": 0.0012675821889305036, + "loss": 1.3388, "step": 176000 }, { - "epoch": 2.2953377982963783, - "grad_norm": 0.46484375, - "learning_rate": 0.001908186488068145, - "loss": 1.5094, + "epoch": 18.36246358718269, + "grad_norm": 0.220703125, + "learning_rate": 0.0012655014565126925, + "loss": 1.3385, "step": 176500 }, { - "epoch": 2.301840171662657, - "grad_norm": 0.50390625, - "learning_rate": 0.0019079263931334938, - "loss": 1.5059, + "epoch": 18.414481897627965, + "grad_norm": 0.251953125, + "learning_rate": 0.0012634207240948814, + "loss": 1.3372, "step": 177000 }, { - "epoch": 2.3083425450289354, - "grad_norm": 0.66796875, - "learning_rate": 0.0019076662981988427, - "loss": 1.5127, + "epoch": 18.466500208073242, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012613399916770703, + "loss": 1.3384, "step": 177500 }, { - "epoch": 2.3148449183952144, - "grad_norm": 0.57421875, - "learning_rate": 0.0019074062032641914, - "loss": 1.5114, + "epoch": 18.51851851851852, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012592592592592592, + "loss": 1.3377, "step": 178000 }, { - "epoch": 2.321347291761493, - "grad_norm": 0.56640625, - "learning_rate": 0.0019071461083295402, - "loss": 1.5088, + "epoch": 18.570536828963796, + "grad_norm": 0.2109375, + "learning_rate": 0.0012571785268414481, + "loss": 1.3372, "step": 178500 }, { - "epoch": 2.3278496651277716, - "grad_norm": 0.5, - "learning_rate": 0.0019068860133948891, - "loss": 1.509, + "epoch": 18.622555139409073, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012550977944236373, + "loss": 1.3368, "step": 179000 }, { - "epoch": 2.33435203849405, - "grad_norm": 3.625, - "learning_rate": 0.0019066259184602379, - "loss": 1.5016, + "epoch": 18.67457344985435, + "grad_norm": 0.201171875, + "learning_rate": 0.0012530170620058262, + "loss": 1.3384, "step": 179500 }, { - "epoch": 2.340854411860329, - "grad_norm": 0.58203125, - "learning_rate": 0.001906365823525587, - "loss": 1.5006, + "epoch": 18.726591760299627, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012509363295880149, + "loss": 1.3374, "step": 180000 }, { - "epoch": 2.3473567852266077, - "grad_norm": 0.90625, - "learning_rate": 0.0019061057285909358, - "loss": 1.5052, + "epoch": 18.778610070744904, + "grad_norm": 0.431640625, + "learning_rate": 0.001248855597170204, + "loss": 1.3378, "step": 180500 }, { - "epoch": 2.3538591585928863, - "grad_norm": 0.6015625, - "learning_rate": 0.0019058456336562845, - "loss": 1.502, + "epoch": 18.83062838119018, + "grad_norm": 0.4765625, + "learning_rate": 0.001246774864752393, + "loss": 1.3397, "step": 181000 }, { - "epoch": 2.360361531959165, - "grad_norm": 0.4765625, - "learning_rate": 0.0019055855387216335, - "loss": 1.5067, + "epoch": 18.882646691635454, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012446941323345816, + "loss": 1.3379, "step": 181500 }, { - "epoch": 2.366863905325444, - "grad_norm": 0.73828125, - "learning_rate": 0.0019053254437869822, - "loss": 1.5013, + "epoch": 18.93466500208073, + "grad_norm": 0.55078125, + "learning_rate": 0.0012426133999167708, + "loss": 1.3377, "step": 182000 }, { - "epoch": 2.3733662786917225, - "grad_norm": 0.81640625, - "learning_rate": 0.001905065348852331, - "loss": 1.5033, + "epoch": 18.986683312526008, + "grad_norm": 0.337890625, + "learning_rate": 0.0012405326674989597, + "loss": 1.3379, "step": 182500 }, { - "epoch": 2.379868652058001, - "grad_norm": 0.51171875, - "learning_rate": 0.00190480525391768, - "loss": 1.5061, - "step": 183000 - }, - { - "epoch": 2.38637102542428, - "grad_norm": 2.90625, - "learning_rate": 0.0019045451589830289, - "loss": 1.508, - "step": 183500 - }, - { - "epoch": 2.3928733987905586, - "grad_norm": 0.640625, - "learning_rate": 0.0019042850640483778, - "loss": 1.5048, - "step": 184000 - }, - { - "epoch": 2.399375772156837, - "grad_norm": 0.58984375, - "learning_rate": 0.0019040249691137266, - "loss": 1.5025, - "step": 184500 - }, - { - "epoch": 2.4058781455231157, - "grad_norm": 0.59375, - "learning_rate": 0.0019037648741790753, - "loss": 1.4983, - "step": 185000 - }, - { - "epoch": 2.4123805188893948, - "grad_norm": 0.6484375, - "learning_rate": 0.0019035047792444243, - "loss": 1.5018, - "step": 185500 - }, - { - "epoch": 2.4188828922556733, - "grad_norm": 0.458984375, - "learning_rate": 0.001903244684309773, - "loss": 1.5034, - "step": 186000 - }, - { - "epoch": 2.425385265621952, - "grad_norm": 0.66015625, - "learning_rate": 0.0019029845893751218, - "loss": 1.5026, - "step": 186500 - }, - { - "epoch": 2.431887638988231, - "grad_norm": 0.86328125, - "learning_rate": 0.001902724494440471, - "loss": 1.5019, - "step": 187000 - }, - { - "epoch": 2.4383900123545095, - "grad_norm": 0.77734375, - "learning_rate": 0.0019024643995058197, - "loss": 1.5043, - "step": 187500 - }, - { - "epoch": 2.444892385720788, - "grad_norm": 0.65625, - "learning_rate": 0.0019022043045711686, - "loss": 1.5081, - "step": 188000 - }, - { - "epoch": 2.4513947590870666, - "grad_norm": 0.51953125, - "learning_rate": 0.0019019442096365174, - "loss": 1.5045, - "step": 188500 - }, - { - "epoch": 2.4578971324533456, - "grad_norm": 0.49609375, - "learning_rate": 0.0019016841147018661, - "loss": 1.5091, - "step": 189000 - }, - { - "epoch": 2.464399505819624, - "grad_norm": 0.51953125, - "learning_rate": 0.001901424019767215, - "loss": 1.5121, - "step": 189500 - }, - { - "epoch": 2.4709018791859028, - "grad_norm": 0.5078125, - "learning_rate": 0.0019011639248325638, - "loss": 1.5124, - "step": 190000 - }, - { - "epoch": 2.4774042525521818, - "grad_norm": 0.49609375, - "learning_rate": 0.001900903829897913, - "loss": 1.5159, - "step": 190500 - }, - { - "epoch": 2.4839066259184603, - "grad_norm": 0.4765625, - "learning_rate": 0.0019006437349632617, - "loss": 1.5078, - "step": 191000 - }, - { - "epoch": 2.490408999284739, - "grad_norm": 0.640625, - "learning_rate": 0.0019003836400286105, - "loss": 1.5112, - "step": 191500 - }, - { - "epoch": 2.4969113726510175, - "grad_norm": 0.5625, - "learning_rate": 0.0019001235450939594, - "loss": 1.5106, - "step": 192000 - }, - { - "epoch": 2.503413746017296, - "grad_norm": 0.478515625, - "learning_rate": 0.0018998634501593082, - "loss": 1.5032, - "step": 192500 - }, - { - "epoch": 2.509916119383575, - "grad_norm": 0.8203125, - "learning_rate": 0.001899603355224657, - "loss": 1.5047, - "step": 193000 - }, - { - "epoch": 2.5164184927498536, - "grad_norm": 0.57421875, - "learning_rate": 0.0018993432602900059, - "loss": 1.5036, - "step": 193500 - }, - { - "epoch": 2.5229208661161326, - "grad_norm": 0.66015625, - "learning_rate": 0.0018990831653553546, - "loss": 1.5028, - "step": 194000 - }, - { - "epoch": 2.529423239482411, - "grad_norm": 0.5546875, - "learning_rate": 0.0018988230704207038, - "loss": 1.5045, - "step": 194500 - }, - { - "epoch": 2.53592561284869, - "grad_norm": 0.52734375, - "learning_rate": 0.0018985629754860525, - "loss": 1.5016, - "step": 195000 - }, - { - "epoch": 2.5424279862149683, - "grad_norm": 0.54296875, - "learning_rate": 0.0018983028805514013, - "loss": 1.5057, - "step": 195500 - }, - { - "epoch": 2.548930359581247, - "grad_norm": 0.51171875, - "learning_rate": 0.0018980427856167502, - "loss": 1.5069, - "step": 196000 - }, - { - "epoch": 2.555432732947526, - "grad_norm": 0.58203125, - "learning_rate": 0.001897782690682099, - "loss": 1.5057, - "step": 196500 - }, - { - "epoch": 2.5619351063138045, - "grad_norm": 0.490234375, - "learning_rate": 0.0018975225957474477, - "loss": 1.5039, - "step": 197000 - }, - { - "epoch": 2.568437479680083, - "grad_norm": 0.625, - "learning_rate": 0.0018972625008127967, - "loss": 1.5041, - "step": 197500 - }, - { - "epoch": 2.574939853046362, - "grad_norm": 0.486328125, - "learning_rate": 0.0018970024058781456, - "loss": 1.5048, - "step": 198000 - }, - { - "epoch": 2.5814422264126407, - "grad_norm": 3.59375, - "learning_rate": 0.0018967423109434946, - "loss": 1.5011, - "step": 198500 - }, - { - "epoch": 2.587944599778919, - "grad_norm": 1.546875, - "learning_rate": 0.0018964822160088433, - "loss": 1.5014, - "step": 199000 - }, - { - "epoch": 2.594446973145198, - "grad_norm": 0.453125, - "learning_rate": 0.001896222121074192, - "loss": 1.4983, - "step": 199500 - }, - { - "epoch": 2.600949346511477, - "grad_norm": 0.5, - "learning_rate": 0.001895962026139541, - "loss": 1.5062, - "step": 200000 - }, - { - "epoch": 2.6074517198777554, - "grad_norm": 0.67578125, - "learning_rate": 0.0018957019312048897, - "loss": 1.5062, - "step": 200500 - }, - { - "epoch": 2.613954093244034, - "grad_norm": 0.56640625, - "learning_rate": 0.0018954418362702385, - "loss": 1.5011, - "step": 201000 - }, - { - "epoch": 2.620456466610313, - "grad_norm": 0.5625, - "learning_rate": 0.0018951817413355877, - "loss": 1.5029, - "step": 201500 - }, - { - "epoch": 2.6269588399765915, - "grad_norm": 1.0703125, - "learning_rate": 0.0018949216464009364, - "loss": 1.5017, - "step": 202000 - }, - { - "epoch": 2.63346121334287, - "grad_norm": 0.470703125, - "learning_rate": 0.0018946615514662854, - "loss": 1.5012, - "step": 202500 - }, - { - "epoch": 2.6399635867091487, - "grad_norm": 0.515625, - "learning_rate": 0.001894401456531634, - "loss": 1.5036, - "step": 203000 - }, - { - "epoch": 2.6464659600754277, - "grad_norm": 0.5859375, - "learning_rate": 0.0018941413615969828, - "loss": 1.5042, - "step": 203500 - }, - { - "epoch": 2.6529683334417062, - "grad_norm": 0.474609375, - "learning_rate": 0.0018938812666623318, - "loss": 1.5042, - "step": 204000 - }, - { - "epoch": 2.659470706807985, - "grad_norm": 1.046875, - "learning_rate": 0.0018936211717276805, - "loss": 1.5079, - "step": 204500 - }, - { - "epoch": 2.665973080174264, - "grad_norm": 0.51171875, - "learning_rate": 0.0018933610767930297, - "loss": 1.5093, - "step": 205000 - }, - { - "epoch": 2.6724754535405424, - "grad_norm": 0.50390625, - "learning_rate": 0.0018931009818583784, - "loss": 1.5066, - "step": 205500 - }, - { - "epoch": 2.678977826906821, - "grad_norm": 0.69140625, - "learning_rate": 0.0018928408869237272, - "loss": 1.5034, - "step": 206000 - }, - { - "epoch": 2.6854802002730995, - "grad_norm": 0.96875, - "learning_rate": 0.0018925807919890761, - "loss": 1.5045, - "step": 206500 - }, - { - "epoch": 2.691982573639378, - "grad_norm": 0.57421875, - "learning_rate": 0.0018923206970544249, - "loss": 1.5112, - "step": 207000 - }, - { - "epoch": 2.698484947005657, - "grad_norm": 0.6953125, - "learning_rate": 0.0018920606021197736, - "loss": 1.5055, - "step": 207500 - }, - { - "epoch": 2.7049873203719357, - "grad_norm": 2.5, - "learning_rate": 0.0018918005071851226, - "loss": 1.5023, - "step": 208000 - }, - { - "epoch": 2.7114896937382147, - "grad_norm": 0.423828125, - "learning_rate": 0.0018915404122504713, - "loss": 1.5052, - "step": 208500 - }, - { - "epoch": 2.7179920671044933, - "grad_norm": 1.5234375, - "learning_rate": 0.0018912803173158205, - "loss": 1.4958, - "step": 209000 - }, - { - "epoch": 2.724494440470772, - "grad_norm": 0.5859375, - "learning_rate": 0.0018910202223811692, - "loss": 1.5016, - "step": 209500 - }, - { - "epoch": 2.7309968138370504, - "grad_norm": 1.4375, - "learning_rate": 0.001890760127446518, - "loss": 1.4964, - "step": 210000 - }, - { - "epoch": 2.737499187203329, - "grad_norm": 1.3203125, - "learning_rate": 0.001890500032511867, - "loss": 1.5011, - "step": 210500 - }, - { - "epoch": 2.744001560569608, - "grad_norm": 0.423828125, - "learning_rate": 0.0018902399375772157, - "loss": 1.5008, - "step": 211000 - }, - { - "epoch": 2.7505039339358865, - "grad_norm": 0.478515625, - "learning_rate": 0.0018899798426425644, - "loss": 1.4991, - "step": 211500 - }, - { - "epoch": 2.7570063073021656, - "grad_norm": 1.09375, - "learning_rate": 0.0018897197477079134, - "loss": 1.4968, - "step": 212000 - }, - { - "epoch": 2.763508680668444, - "grad_norm": 1.453125, - "learning_rate": 0.0018894596527732623, - "loss": 1.4974, - "step": 212500 - }, - { - "epoch": 2.7700110540347227, - "grad_norm": 0.59765625, - "learning_rate": 0.0018891995578386113, - "loss": 1.4991, - "step": 213000 - }, - { - "epoch": 2.7765134274010013, - "grad_norm": 0.6796875, - "learning_rate": 0.00188893946290396, - "loss": 1.4964, - "step": 213500 - }, - { - "epoch": 2.78301580076728, - "grad_norm": 0.86328125, - "learning_rate": 0.0018886793679693088, - "loss": 1.4988, - "step": 214000 - }, - { - "epoch": 2.789518174133559, - "grad_norm": 0.703125, - "learning_rate": 0.0018884192730346577, - "loss": 1.4971, - "step": 214500 - }, - { - "epoch": 2.7960205474998374, - "grad_norm": 0.6328125, - "learning_rate": 0.0018881591781000065, - "loss": 1.4988, - "step": 215000 - }, - { - "epoch": 2.802522920866116, - "grad_norm": 0.578125, - "learning_rate": 0.0018878990831653552, - "loss": 1.5002, - "step": 215500 - }, - { - "epoch": 2.809025294232395, - "grad_norm": 0.51953125, - "learning_rate": 0.0018876389882307044, - "loss": 1.4966, - "step": 216000 - }, - { - "epoch": 2.8155276675986736, - "grad_norm": 0.62890625, - "learning_rate": 0.0018873788932960531, - "loss": 1.4988, - "step": 216500 - }, - { - "epoch": 2.822030040964952, - "grad_norm": 2.15625, - "learning_rate": 0.001887118798361402, - "loss": 1.5013, - "step": 217000 - }, - { - "epoch": 2.8285324143312307, - "grad_norm": 0.6640625, - "learning_rate": 0.0018868587034267508, - "loss": 1.4979, - "step": 217500 - }, - { - "epoch": 2.8350347876975097, - "grad_norm": 0.59765625, - "learning_rate": 0.0018865986084920996, - "loss": 1.4991, - "step": 218000 - }, - { - "epoch": 2.8415371610637883, - "grad_norm": 0.56640625, - "learning_rate": 0.0018863385135574485, - "loss": 1.4934, - "step": 218500 - }, - { - "epoch": 2.848039534430067, - "grad_norm": 0.53515625, - "learning_rate": 0.0018860784186227973, - "loss": 1.4923, - "step": 219000 - }, - { - "epoch": 2.854541907796346, - "grad_norm": 0.65625, - "learning_rate": 0.0018858183236881464, - "loss": 1.4957, - "step": 219500 - }, - { - "epoch": 2.8610442811626244, - "grad_norm": 0.49609375, - "learning_rate": 0.0018855582287534952, - "loss": 1.4958, - "step": 220000 - }, - { - "epoch": 2.867546654528903, - "grad_norm": 1.5546875, - "learning_rate": 0.001885298133818844, - "loss": 1.4962, - "step": 220500 - }, - { - "epoch": 2.8740490278951816, - "grad_norm": 0.64453125, - "learning_rate": 0.0018850380388841929, - "loss": 1.4942, - "step": 221000 - }, - { - "epoch": 2.8805514012614606, - "grad_norm": 0.66796875, - "learning_rate": 0.0018847779439495416, - "loss": 1.4962, - "step": 221500 - }, - { - "epoch": 2.887053774627739, - "grad_norm": 0.51953125, - "learning_rate": 0.0018845178490148903, - "loss": 1.4908, - "step": 222000 - }, - { - "epoch": 2.8935561479940177, - "grad_norm": 1.1328125, - "learning_rate": 0.0018842577540802393, - "loss": 1.4947, - "step": 222500 - }, - { - "epoch": 2.9000585213602967, - "grad_norm": 0.55078125, - "learning_rate": 0.001883997659145588, - "loss": 1.497, - "step": 223000 - }, - { - "epoch": 2.9065608947265753, - "grad_norm": 0.427734375, - "learning_rate": 0.0018837375642109372, - "loss": 1.4954, - "step": 223500 - }, - { - "epoch": 2.913063268092854, - "grad_norm": 0.6953125, - "learning_rate": 0.001883477469276286, - "loss": 1.4954, - "step": 224000 - }, - { - "epoch": 2.9195656414591324, - "grad_norm": 1.0859375, - "learning_rate": 0.0018832173743416347, - "loss": 1.4906, - "step": 224500 - }, - { - "epoch": 2.926068014825411, - "grad_norm": 1.8984375, - "learning_rate": 0.0018829572794069837, - "loss": 1.4947, - "step": 225000 - }, - { - "epoch": 2.93257038819169, - "grad_norm": 0.52734375, - "learning_rate": 0.0018826971844723324, - "loss": 1.5016, - "step": 225500 - }, - { - "epoch": 2.9390727615579686, - "grad_norm": 0.74609375, - "learning_rate": 0.0018824370895376811, - "loss": 1.4978, - "step": 226000 - }, - { - "epoch": 2.9455751349242476, - "grad_norm": 1.5, - "learning_rate": 0.00188217699460303, - "loss": 1.495, - "step": 226500 - }, - { - "epoch": 2.952077508290526, - "grad_norm": 0.8359375, - "learning_rate": 0.001881916899668379, - "loss": 1.4984, - "step": 227000 - }, - { - "epoch": 2.9585798816568047, - "grad_norm": 0.54296875, - "learning_rate": 0.001881656804733728, - "loss": 1.4918, - "step": 227500 - }, - { - "epoch": 2.9650822550230833, - "grad_norm": 0.44921875, - "learning_rate": 0.0018813967097990767, - "loss": 1.4907, - "step": 228000 - }, - { - "epoch": 2.971584628389362, - "grad_norm": 0.609375, - "learning_rate": 0.0018811366148644255, - "loss": 1.4954, - "step": 228500 - }, - { - "epoch": 2.978087001755641, - "grad_norm": 0.53515625, - "learning_rate": 0.0018808765199297744, - "loss": 1.4943, - "step": 229000 - }, - { - "epoch": 2.9845893751219195, - "grad_norm": 0.640625, - "learning_rate": 0.0018806164249951232, - "loss": 1.493, - "step": 229500 - }, - { - "epoch": 2.991091748488198, - "grad_norm": 0.734375, - "learning_rate": 0.001880356330060472, - "loss": 1.4926, - "step": 230000 - }, - { - "epoch": 2.997594121854477, - "grad_norm": 0.5, - "learning_rate": 0.001880096235125821, - "loss": 1.4914, - "step": 230500 - }, - { - "epoch": 3.0, - "eval_loss": 1.4690916538238525, - "eval_runtime": 0.9958, - "eval_samples_per_second": 1004.258, - "eval_steps_per_second": 8.034, - "step": 230685 - }, - { - "epoch": 3.0040964952207556, - "grad_norm": 0.6484375, - "learning_rate": 0.0018798361401911698, - "loss": 1.4899, - "step": 231000 - }, - { - "epoch": 3.010598868587034, - "grad_norm": 0.515625, - "learning_rate": 0.0018795760452565188, - "loss": 1.4889, - "step": 231500 - }, - { - "epoch": 3.0171012419533128, - "grad_norm": 0.9296875, - "learning_rate": 0.0018793159503218675, - "loss": 1.4908, - "step": 232000 - }, - { - "epoch": 3.0236036153195918, - "grad_norm": 1.75, - "learning_rate": 0.0018790558553872163, - "loss": 1.4906, - "step": 232500 - }, - { - "epoch": 3.0301059886858703, - "grad_norm": 0.5390625, - "learning_rate": 0.0018787957604525652, - "loss": 1.4915, - "step": 233000 - }, - { - "epoch": 3.036608362052149, - "grad_norm": 0.62890625, - "learning_rate": 0.001878535665517914, - "loss": 1.4917, - "step": 233500 - }, - { - "epoch": 3.043110735418428, - "grad_norm": 0.90625, - "learning_rate": 0.0018782755705832631, - "loss": 1.4915, - "step": 234000 - }, - { - "epoch": 3.0496131087847065, - "grad_norm": 0.5546875, - "learning_rate": 0.0018780154756486119, - "loss": 1.4875, - "step": 234500 - }, - { - "epoch": 3.056115482150985, - "grad_norm": 1.0703125, - "learning_rate": 0.0018777553807139606, - "loss": 1.4922, - "step": 235000 - }, - { - "epoch": 3.0626178555172636, - "grad_norm": 0.6328125, - "learning_rate": 0.0018774952857793096, - "loss": 1.4917, - "step": 235500 - }, - { - "epoch": 3.0691202288835426, - "grad_norm": 0.734375, - "learning_rate": 0.0018772351908446583, - "loss": 1.4925, - "step": 236000 - }, - { - "epoch": 3.075622602249821, - "grad_norm": 0.451171875, - "learning_rate": 0.001876975095910007, - "loss": 1.4906, - "step": 236500 - }, - { - "epoch": 3.0821249756160998, - "grad_norm": 1.2734375, - "learning_rate": 0.001876715000975356, - "loss": 1.4913, - "step": 237000 - }, - { - "epoch": 3.088627348982379, - "grad_norm": 0.8359375, - "learning_rate": 0.0018764549060407048, - "loss": 1.4908, - "step": 237500 - }, - { - "epoch": 3.0951297223486574, - "grad_norm": 1.625, - "learning_rate": 0.001876194811106054, - "loss": 1.4913, - "step": 238000 - }, - { - "epoch": 3.101632095714936, - "grad_norm": 0.984375, - "learning_rate": 0.0018759347161714027, - "loss": 1.4925, - "step": 238500 - }, - { - "epoch": 3.1081344690812145, - "grad_norm": 0.6015625, - "learning_rate": 0.0018756746212367514, - "loss": 1.4905, - "step": 239000 - }, - { - "epoch": 3.1146368424474935, - "grad_norm": 0.55859375, - "learning_rate": 0.0018754145263021004, - "loss": 1.4899, - "step": 239500 - }, - { - "epoch": 3.121139215813772, - "grad_norm": 0.640625, - "learning_rate": 0.0018751544313674491, - "loss": 1.4862, - "step": 240000 - }, - { - "epoch": 3.1276415891800506, - "grad_norm": 0.482421875, - "learning_rate": 0.0018748943364327978, - "loss": 1.4903, - "step": 240500 - }, - { - "epoch": 3.134143962546329, - "grad_norm": 0.4921875, - "learning_rate": 0.0018746342414981468, - "loss": 1.4926, - "step": 241000 - }, - { - "epoch": 3.1406463359126082, - "grad_norm": 0.48046875, - "learning_rate": 0.0018743741465634958, - "loss": 1.4893, - "step": 241500 - }, - { - "epoch": 3.147148709278887, - "grad_norm": 0.50390625, - "learning_rate": 0.0018741140516288447, - "loss": 1.4858, - "step": 242000 - }, - { - "epoch": 3.1536510826451654, - "grad_norm": 0.578125, - "learning_rate": 0.0018738539566941935, - "loss": 1.4875, - "step": 242500 - }, - { - "epoch": 3.1601534560114444, - "grad_norm": 0.5859375, - "learning_rate": 0.0018735938617595422, - "loss": 1.4881, - "step": 243000 - }, - { - "epoch": 3.166655829377723, - "grad_norm": 0.494140625, - "learning_rate": 0.0018733337668248912, - "loss": 1.4878, - "step": 243500 - }, - { - "epoch": 3.1731582027440015, - "grad_norm": 0.59375, - "learning_rate": 0.00187307367189024, - "loss": 1.4921, - "step": 244000 - }, - { - "epoch": 3.17966057611028, - "grad_norm": 0.76171875, - "learning_rate": 0.0018728135769555886, - "loss": 1.4891, - "step": 244500 - }, - { - "epoch": 3.186162949476559, - "grad_norm": 0.59375, - "learning_rate": 0.0018725534820209378, - "loss": 1.4898, - "step": 245000 - }, - { - "epoch": 3.1926653228428377, - "grad_norm": 1.1953125, - "learning_rate": 0.0018722933870862866, - "loss": 1.4902, - "step": 245500 - }, - { - "epoch": 3.1991676962091162, - "grad_norm": 1.546875, - "learning_rate": 0.0018720332921516355, - "loss": 1.4911, - "step": 246000 - }, - { - "epoch": 3.205670069575395, - "grad_norm": 1.015625, - "learning_rate": 0.0018717731972169842, - "loss": 1.4868, - "step": 246500 - }, - { - "epoch": 3.212172442941674, - "grad_norm": 0.60546875, - "learning_rate": 0.001871513102282333, - "loss": 1.4901, - "step": 247000 - }, - { - "epoch": 3.2186748163079524, - "grad_norm": 0.51171875, - "learning_rate": 0.001871253007347682, - "loss": 1.4866, - "step": 247500 - }, - { - "epoch": 3.225177189674231, - "grad_norm": 0.55078125, - "learning_rate": 0.0018709929124130307, - "loss": 1.489, - "step": 248000 - }, - { - "epoch": 3.23167956304051, - "grad_norm": 0.73828125, - "learning_rate": 0.0018707328174783799, - "loss": 1.4874, - "step": 248500 - }, - { - "epoch": 3.2381819364067885, - "grad_norm": 1.453125, - "learning_rate": 0.0018704727225437286, - "loss": 1.4909, - "step": 249000 - }, - { - "epoch": 3.244684309773067, - "grad_norm": 0.51171875, - "learning_rate": 0.0018702126276090773, - "loss": 1.487, - "step": 249500 - }, - { - "epoch": 3.2511866831393457, - "grad_norm": 0.54296875, - "learning_rate": 0.0018699525326744263, - "loss": 1.4912, - "step": 250000 - }, - { - "epoch": 3.2576890565056247, - "grad_norm": 0.86328125, - "learning_rate": 0.001869692437739775, - "loss": 1.4872, - "step": 250500 - }, - { - "epoch": 3.2641914298719032, - "grad_norm": 0.4609375, - "learning_rate": 0.0018694323428051238, - "loss": 1.4817, - "step": 251000 - }, - { - "epoch": 3.270693803238182, - "grad_norm": 0.49609375, - "learning_rate": 0.0018691722478704727, - "loss": 1.4821, - "step": 251500 - }, - { - "epoch": 3.277196176604461, - "grad_norm": 0.51171875, - "learning_rate": 0.0018689121529358215, - "loss": 1.4884, - "step": 252000 - }, - { - "epoch": 3.2836985499707394, - "grad_norm": 0.87109375, - "learning_rate": 0.0018686520580011706, - "loss": 1.4884, - "step": 252500 - }, - { - "epoch": 3.290200923337018, - "grad_norm": 0.71875, - "learning_rate": 0.0018683919630665194, - "loss": 1.4864, - "step": 253000 - }, - { - "epoch": 3.2967032967032965, - "grad_norm": 0.62109375, - "learning_rate": 0.0018681318681318681, - "loss": 1.4894, - "step": 253500 - }, - { - "epoch": 3.3032056700695756, - "grad_norm": 0.6484375, - "learning_rate": 0.001867871773197217, - "loss": 1.482, - "step": 254000 - }, - { - "epoch": 3.309708043435854, - "grad_norm": 0.50390625, - "learning_rate": 0.0018676116782625658, - "loss": 1.4864, - "step": 254500 - }, - { - "epoch": 3.3162104168021327, - "grad_norm": 0.490234375, - "learning_rate": 0.0018673515833279146, - "loss": 1.488, - "step": 255000 - }, - { - "epoch": 3.3227127901684117, - "grad_norm": 0.78125, - "learning_rate": 0.0018670914883932635, - "loss": 1.4858, - "step": 255500 - }, - { - "epoch": 3.3292151635346903, - "grad_norm": 1.0078125, - "learning_rate": 0.0018668313934586125, - "loss": 1.4894, - "step": 256000 - }, - { - "epoch": 3.335717536900969, - "grad_norm": 0.55078125, - "learning_rate": 0.0018665712985239614, - "loss": 1.4929, - "step": 256500 - }, - { - "epoch": 3.3422199102672474, - "grad_norm": 0.7734375, - "learning_rate": 0.0018663112035893102, - "loss": 1.4937, - "step": 257000 - }, - { - "epoch": 3.3487222836335264, - "grad_norm": 0.58203125, - "learning_rate": 0.001866051108654659, - "loss": 1.4913, - "step": 257500 - }, - { - "epoch": 3.355224656999805, - "grad_norm": 0.91015625, - "learning_rate": 0.0018657910137200079, - "loss": 1.4903, - "step": 258000 - }, - { - "epoch": 3.3617270303660836, - "grad_norm": 0.546875, - "learning_rate": 0.0018655309187853566, - "loss": 1.4862, - "step": 258500 - }, - { - "epoch": 3.368229403732362, - "grad_norm": 0.4453125, - "learning_rate": 0.0018652708238507054, - "loss": 1.4836, - "step": 259000 - }, - { - "epoch": 3.374731777098641, - "grad_norm": 0.490234375, - "learning_rate": 0.0018650107289160545, - "loss": 1.4841, - "step": 259500 - }, - { - "epoch": 3.3812341504649197, - "grad_norm": 0.4609375, - "learning_rate": 0.0018647506339814033, - "loss": 1.4857, - "step": 260000 - }, - { - "epoch": 3.3877365238311983, - "grad_norm": 0.71875, - "learning_rate": 0.0018644905390467522, - "loss": 1.4833, - "step": 260500 - }, - { - "epoch": 3.394238897197477, - "grad_norm": 1.375, - "learning_rate": 0.001864230444112101, - "loss": 1.487, - "step": 261000 - }, - { - "epoch": 3.400741270563756, - "grad_norm": 0.5078125, - "learning_rate": 0.0018639703491774497, - "loss": 1.4886, - "step": 261500 - }, - { - "epoch": 3.4072436439300344, - "grad_norm": 0.859375, - "learning_rate": 0.0018637102542427987, - "loss": 1.4843, - "step": 262000 - }, - { - "epoch": 3.413746017296313, - "grad_norm": 0.5859375, - "learning_rate": 0.0018634501593081474, - "loss": 1.4825, - "step": 262500 - }, - { - "epoch": 3.420248390662592, - "grad_norm": 0.59765625, - "learning_rate": 0.0018631900643734966, - "loss": 1.4808, - "step": 263000 - }, - { - "epoch": 3.4267507640288706, - "grad_norm": 0.66015625, - "learning_rate": 0.0018629299694388453, - "loss": 1.4814, - "step": 263500 - }, - { - "epoch": 3.433253137395149, - "grad_norm": 3.3125, - "learning_rate": 0.001862669874504194, - "loss": 1.4807, - "step": 264000 - }, - { - "epoch": 3.4397555107614277, - "grad_norm": 0.76171875, - "learning_rate": 0.001862409779569543, - "loss": 1.4817, - "step": 264500 - }, - { - "epoch": 3.4462578841277067, - "grad_norm": 0.609375, - "learning_rate": 0.0018621496846348918, - "loss": 1.4824, - "step": 265000 - }, - { - "epoch": 3.4527602574939853, - "grad_norm": 1.296875, - "learning_rate": 0.0018618895897002405, - "loss": 1.4813, - "step": 265500 - }, - { - "epoch": 3.459262630860264, - "grad_norm": 25.0, - "learning_rate": 0.0018616294947655895, - "loss": 1.4827, - "step": 266000 - }, - { - "epoch": 3.465765004226543, - "grad_norm": 0.5390625, - "learning_rate": 0.0018613693998309382, - "loss": 1.4771, - "step": 266500 - }, - { - "epoch": 3.4722673775928214, - "grad_norm": 0.54296875, - "learning_rate": 0.0018611093048962874, - "loss": 1.4795, - "step": 267000 - }, - { - "epoch": 3.4787697509591, - "grad_norm": 1.1953125, - "learning_rate": 0.001860849209961636, - "loss": 1.4776, - "step": 267500 - }, - { - "epoch": 3.4852721243253786, - "grad_norm": 0.5078125, - "learning_rate": 0.0018605891150269848, - "loss": 1.4828, - "step": 268000 - }, - { - "epoch": 3.4917744976916576, - "grad_norm": 0.474609375, - "learning_rate": 0.0018603290200923338, - "loss": 1.4827, - "step": 268500 - }, - { - "epoch": 3.498276871057936, - "grad_norm": 0.90625, - "learning_rate": 0.0018600689251576825, - "loss": 1.4794, - "step": 269000 - }, - { - "epoch": 3.5047792444242147, - "grad_norm": 2.8125, - "learning_rate": 0.0018598088302230313, - "loss": 1.4791, - "step": 269500 - }, - { - "epoch": 3.5112816177904937, - "grad_norm": 0.53515625, - "learning_rate": 0.0018595487352883802, - "loss": 1.4805, - "step": 270000 - }, - { - "epoch": 3.5177839911567723, - "grad_norm": 0.53515625, - "learning_rate": 0.0018592886403537292, - "loss": 1.4781, - "step": 270500 - }, - { - "epoch": 3.524286364523051, - "grad_norm": 0.53515625, - "learning_rate": 0.0018590285454190782, - "loss": 1.4817, - "step": 271000 - }, - { - "epoch": 3.5307887378893295, - "grad_norm": 0.4375, - "learning_rate": 0.001858768450484427, - "loss": 1.4873, - "step": 271500 - }, - { - "epoch": 3.537291111255608, - "grad_norm": 0.486328125, - "learning_rate": 0.0018585083555497756, - "loss": 1.4794, - "step": 272000 - }, - { - "epoch": 3.543793484621887, - "grad_norm": 0.453125, - "learning_rate": 0.0018582482606151246, - "loss": 1.4802, - "step": 272500 - }, - { - "epoch": 3.5502958579881656, - "grad_norm": 0.55859375, - "learning_rate": 0.0018579881656804733, - "loss": 1.4799, - "step": 273000 - }, - { - "epoch": 3.5567982313544446, - "grad_norm": 0.6640625, - "learning_rate": 0.001857728070745822, - "loss": 1.4772, - "step": 273500 - }, - { - "epoch": 3.563300604720723, - "grad_norm": 0.56640625, - "learning_rate": 0.0018574679758111712, - "loss": 1.4766, - "step": 274000 - }, - { - "epoch": 3.5698029780870018, - "grad_norm": 0.53515625, - "learning_rate": 0.00185720788087652, - "loss": 1.4788, - "step": 274500 - }, - { - "epoch": 3.5763053514532803, - "grad_norm": 0.5859375, - "learning_rate": 0.001856947785941869, - "loss": 1.4778, - "step": 275000 - }, - { - "epoch": 3.582807724819559, - "grad_norm": 0.6796875, - "learning_rate": 0.0018566876910072177, - "loss": 1.4819, - "step": 275500 - }, - { - "epoch": 3.589310098185838, - "grad_norm": 0.6328125, - "learning_rate": 0.0018564275960725664, - "loss": 1.4799, - "step": 276000 - }, - { - "epoch": 3.5958124715521165, - "grad_norm": 0.515625, - "learning_rate": 0.0018561675011379154, - "loss": 1.4789, - "step": 276500 - }, - { - "epoch": 3.6023148449183955, - "grad_norm": 0.55859375, - "learning_rate": 0.0018559074062032641, - "loss": 1.4805, - "step": 277000 - }, - { - "epoch": 3.608817218284674, - "grad_norm": 0.5078125, - "learning_rate": 0.0018556473112686133, - "loss": 1.4783, - "step": 277500 - }, - { - "epoch": 3.6153195916509526, - "grad_norm": 0.546875, - "learning_rate": 0.001855387216333962, - "loss": 1.4763, - "step": 278000 - }, - { - "epoch": 3.621821965017231, - "grad_norm": 0.65625, - "learning_rate": 0.0018551271213993108, - "loss": 1.4771, - "step": 278500 - }, - { - "epoch": 3.6283243383835098, - "grad_norm": 0.859375, - "learning_rate": 0.0018548670264646597, - "loss": 1.4786, - "step": 279000 - }, - { - "epoch": 3.6348267117497888, - "grad_norm": 3.390625, - "learning_rate": 0.0018546069315300085, - "loss": 1.4806, - "step": 279500 - }, - { - "epoch": 3.6413290851160673, - "grad_norm": 0.51953125, - "learning_rate": 0.0018543468365953572, - "loss": 1.4846, - "step": 280000 - }, - { - "epoch": 3.647831458482346, - "grad_norm": 0.455078125, - "learning_rate": 0.0018540867416607062, - "loss": 1.487, - "step": 280500 - }, - { - "epoch": 3.654333831848625, - "grad_norm": 0.55078125, - "learning_rate": 0.001853826646726055, - "loss": 1.4828, - "step": 281000 - }, - { - "epoch": 3.6608362052149035, - "grad_norm": 0.5234375, - "learning_rate": 0.001853566551791404, - "loss": 1.4859, - "step": 281500 - }, - { - "epoch": 3.667338578581182, - "grad_norm": 0.55859375, - "learning_rate": 0.0018533064568567528, - "loss": 1.4862, - "step": 282000 - }, - { - "epoch": 3.6738409519474606, - "grad_norm": 0.443359375, - "learning_rate": 0.0018530463619221016, - "loss": 1.4859, - "step": 282500 - }, - { - "epoch": 3.6803433253137396, - "grad_norm": 2.15625, - "learning_rate": 0.0018527862669874505, - "loss": 1.4826, - "step": 283000 - }, - { - "epoch": 3.686845698680018, - "grad_norm": 0.455078125, - "learning_rate": 0.0018525261720527993, - "loss": 1.4795, - "step": 283500 - }, - { - "epoch": 3.693348072046297, - "grad_norm": 0.62109375, - "learning_rate": 0.001852266077118148, - "loss": 1.4778, - "step": 284000 - }, - { - "epoch": 3.699850445412576, - "grad_norm": 0.451171875, - "learning_rate": 0.001852005982183497, - "loss": 1.4837, - "step": 284500 - }, - { - "epoch": 3.7063528187788544, - "grad_norm": 0.50390625, - "learning_rate": 0.001851745887248846, - "loss": 1.4808, - "step": 285000 - }, - { - "epoch": 3.712855192145133, - "grad_norm": 0.80859375, - "learning_rate": 0.0018514857923141949, - "loss": 1.4796, - "step": 285500 - }, - { - "epoch": 3.7193575655114115, - "grad_norm": 0.78515625, - "learning_rate": 0.0018512256973795436, - "loss": 1.477, - "step": 286000 - }, - { - "epoch": 3.7258599388776905, - "grad_norm": 0.52734375, - "learning_rate": 0.0018509656024448924, - "loss": 1.475, - "step": 286500 - }, - { - "epoch": 3.732362312243969, - "grad_norm": 0.6796875, - "learning_rate": 0.0018507055075102413, - "loss": 1.4817, - "step": 287000 - }, - { - "epoch": 3.7388646856102477, - "grad_norm": 0.54296875, - "learning_rate": 0.00185044541257559, - "loss": 1.4804, - "step": 287500 - }, - { - "epoch": 3.7453670589765267, - "grad_norm": 0.75, - "learning_rate": 0.0018501853176409388, - "loss": 1.4767, - "step": 288000 - }, - { - "epoch": 3.7518694323428052, - "grad_norm": 0.6328125, - "learning_rate": 0.001849925222706288, - "loss": 1.4779, - "step": 288500 - }, - { - "epoch": 3.758371805709084, - "grad_norm": 0.61328125, - "learning_rate": 0.0018496651277716367, - "loss": 1.4767, - "step": 289000 - }, - { - "epoch": 3.7648741790753624, - "grad_norm": 0.91015625, - "learning_rate": 0.0018494050328369857, - "loss": 1.4749, - "step": 289500 - }, - { - "epoch": 3.771376552441641, - "grad_norm": 0.4609375, - "learning_rate": 0.0018491449379023344, - "loss": 1.4803, - "step": 290000 - }, - { - "epoch": 3.77787892580792, - "grad_norm": 0.609375, - "learning_rate": 0.0018488848429676831, - "loss": 1.483, - "step": 290500 - }, - { - "epoch": 3.7843812991741985, - "grad_norm": 0.61328125, - "learning_rate": 0.001848624748033032, - "loss": 1.4785, - "step": 291000 - }, - { - "epoch": 3.7908836725404775, - "grad_norm": 0.59765625, - "learning_rate": 0.0018483646530983808, - "loss": 1.4722, - "step": 291500 - }, - { - "epoch": 3.797386045906756, - "grad_norm": 0.5390625, - "learning_rate": 0.00184810455816373, - "loss": 1.4768, - "step": 292000 - }, - { - "epoch": 3.8038884192730347, - "grad_norm": 1.1953125, - "learning_rate": 0.0018478444632290788, - "loss": 1.4764, - "step": 292500 - }, - { - "epoch": 3.8103907926393132, - "grad_norm": 0.484375, - "learning_rate": 0.0018475843682944275, - "loss": 1.4804, - "step": 293000 - }, - { - "epoch": 3.816893166005592, - "grad_norm": 0.625, - "learning_rate": 0.0018473242733597765, - "loss": 1.4773, - "step": 293500 - }, - { - "epoch": 3.823395539371871, - "grad_norm": 0.6484375, - "learning_rate": 0.0018470641784251252, - "loss": 1.4777, - "step": 294000 - }, - { - "epoch": 3.8298979127381494, - "grad_norm": 0.5078125, - "learning_rate": 0.001846804083490474, - "loss": 1.4838, - "step": 294500 - }, - { - "epoch": 3.836400286104428, - "grad_norm": 0.48046875, - "learning_rate": 0.0018465439885558229, - "loss": 1.4781, - "step": 295000 - }, - { - "epoch": 3.842902659470707, - "grad_norm": 0.51171875, - "learning_rate": 0.0018462838936211716, - "loss": 1.4859, - "step": 295500 - }, - { - "epoch": 3.8494050328369855, - "grad_norm": 1.25, - "learning_rate": 0.0018460237986865208, - "loss": 1.4856, - "step": 296000 - }, - { - "epoch": 3.855907406203264, - "grad_norm": 0.55859375, - "learning_rate": 0.0018457637037518695, - "loss": 1.4853, - "step": 296500 - }, - { - "epoch": 3.8624097795695427, - "grad_norm": 0.5546875, - "learning_rate": 0.0018455036088172183, - "loss": 1.4766, - "step": 297000 - }, - { - "epoch": 3.8689121529358217, - "grad_norm": 1.4765625, - "learning_rate": 0.0018452435138825672, - "loss": 1.4831, - "step": 297500 - }, - { - "epoch": 3.8754145263021003, - "grad_norm": 0.57421875, - "learning_rate": 0.001844983418947916, - "loss": 1.4798, - "step": 298000 - }, - { - "epoch": 3.881916899668379, - "grad_norm": 0.75, - "learning_rate": 0.0018447233240132647, - "loss": 1.4808, - "step": 298500 - }, - { - "epoch": 3.888419273034658, - "grad_norm": 0.53515625, - "learning_rate": 0.0018444632290786137, - "loss": 1.4808, - "step": 299000 - }, - { - "epoch": 3.8949216464009364, - "grad_norm": 0.61328125, - "learning_rate": 0.0018442031341439626, - "loss": 1.4744, - "step": 299500 - }, - { - "epoch": 3.901424019767215, - "grad_norm": 0.58984375, - "learning_rate": 0.0018439430392093116, - "loss": 1.4792, - "step": 300000 - }, - { - "epoch": 3.9079263931334935, - "grad_norm": 0.486328125, - "learning_rate": 0.0018436829442746603, - "loss": 1.4776, - "step": 300500 - }, - { - "epoch": 3.9144287664997726, - "grad_norm": 1.0859375, - "learning_rate": 0.001843422849340009, - "loss": 1.4733, - "step": 301000 - }, - { - "epoch": 3.920931139866051, - "grad_norm": 0.515625, - "learning_rate": 0.001843162754405358, - "loss": 1.4766, - "step": 301500 - }, - { - "epoch": 3.9274335132323297, - "grad_norm": 0.53515625, - "learning_rate": 0.0018429026594707068, - "loss": 1.4743, - "step": 302000 - }, - { - "epoch": 3.9339358865986087, - "grad_norm": 0.49609375, - "learning_rate": 0.0018426425645360555, - "loss": 1.4735, - "step": 302500 - }, - { - "epoch": 3.9404382599648873, - "grad_norm": 0.51953125, - "learning_rate": 0.0018423824696014047, - "loss": 1.4729, - "step": 303000 - }, - { - "epoch": 3.946940633331166, - "grad_norm": 0.4921875, - "learning_rate": 0.0018421223746667534, - "loss": 1.4776, - "step": 303500 - }, - { - "epoch": 3.9534430066974444, - "grad_norm": 0.625, - "learning_rate": 0.0018418622797321024, - "loss": 1.4805, - "step": 304000 - }, - { - "epoch": 3.959945380063723, - "grad_norm": 0.59375, - "learning_rate": 0.0018416021847974511, - "loss": 1.4744, - "step": 304500 - }, - { - "epoch": 3.966447753430002, - "grad_norm": 0.46875, - "learning_rate": 0.0018413420898627999, - "loss": 1.4781, - "step": 305000 - }, - { - "epoch": 3.9729501267962806, - "grad_norm": 0.53515625, - "learning_rate": 0.0018410819949281488, - "loss": 1.4763, - "step": 305500 - }, - { - "epoch": 3.9794525001625596, - "grad_norm": 0.734375, - "learning_rate": 0.0018408218999934976, - "loss": 1.4752, - "step": 306000 - }, - { - "epoch": 3.985954873528838, - "grad_norm": 1.4921875, - "learning_rate": 0.0018405618050588467, - "loss": 1.4749, - "step": 306500 - }, - { - "epoch": 3.9924572468951167, - "grad_norm": 0.66015625, - "learning_rate": 0.0018403017101241955, - "loss": 1.4747, - "step": 307000 - }, - { - "epoch": 3.9989596202613953, - "grad_norm": 1.0703125, - "learning_rate": 0.0018400416151895442, - "loss": 1.4753, - "step": 307500 - }, - { - "epoch": 4.0, - "eval_loss": 1.4534417390823364, - "eval_runtime": 0.9363, - "eval_samples_per_second": 1068.081, - "eval_steps_per_second": 8.545, - "step": 307580 - }, - { - "epoch": 4.005461993627674, - "grad_norm": 0.4609375, - "learning_rate": 0.0018397815202548932, - "loss": 1.4734, - "step": 308000 - }, - { - "epoch": 4.011964366993952, - "grad_norm": 0.5234375, - "learning_rate": 0.001839521425320242, - "loss": 1.4706, - "step": 308500 - }, - { - "epoch": 4.018466740360232, - "grad_norm": 0.96484375, - "learning_rate": 0.0018392613303855907, - "loss": 1.4735, - "step": 309000 - }, - { - "epoch": 4.0249691137265105, - "grad_norm": 0.5234375, - "learning_rate": 0.0018390012354509396, - "loss": 1.4777, - "step": 309500 - }, - { - "epoch": 4.031471487092789, - "grad_norm": 1.9765625, - "learning_rate": 0.0018387411405162883, - "loss": 1.4732, - "step": 310000 - }, - { - "epoch": 4.037973860459068, - "grad_norm": 0.498046875, - "learning_rate": 0.0018384810455816375, - "loss": 1.4726, - "step": 310500 - }, - { - "epoch": 4.044476233825346, - "grad_norm": 0.54296875, - "learning_rate": 0.0018382209506469863, - "loss": 1.4752, - "step": 311000 - }, - { - "epoch": 4.050978607191625, - "grad_norm": 0.90234375, - "learning_rate": 0.001837960855712335, - "loss": 1.478, - "step": 311500 - }, - { - "epoch": 4.057480980557903, - "grad_norm": 0.5078125, - "learning_rate": 0.001837700760777684, - "loss": 1.4736, - "step": 312000 - }, - { - "epoch": 4.063983353924183, - "grad_norm": 0.50390625, - "learning_rate": 0.0018374406658430327, - "loss": 1.4773, - "step": 312500 - }, - { - "epoch": 4.070485727290461, - "grad_norm": 0.79296875, - "learning_rate": 0.0018371805709083814, - "loss": 1.4747, - "step": 313000 - }, - { - "epoch": 4.07698810065674, - "grad_norm": 0.90234375, - "learning_rate": 0.0018369204759737304, - "loss": 1.4762, - "step": 313500 - }, - { - "epoch": 4.0834904740230185, - "grad_norm": 0.62109375, - "learning_rate": 0.0018366603810390794, - "loss": 1.4793, - "step": 314000 - }, - { - "epoch": 4.089992847389297, - "grad_norm": 3.640625, - "learning_rate": 0.0018364002861044283, - "loss": 1.4763, - "step": 314500 - }, - { - "epoch": 4.096495220755576, - "grad_norm": 0.5, - "learning_rate": 0.001836140191169777, - "loss": 1.4728, - "step": 315000 - }, - { - "epoch": 4.102997594121854, - "grad_norm": 1.4765625, - "learning_rate": 0.0018358800962351258, - "loss": 1.4715, - "step": 315500 - }, - { - "epoch": 4.109499967488133, - "grad_norm": 0.453125, - "learning_rate": 0.0018356200013004747, - "loss": 1.4714, - "step": 316000 - }, - { - "epoch": 4.116002340854412, - "grad_norm": 0.416015625, - "learning_rate": 0.0018353599063658235, - "loss": 1.4756, - "step": 316500 - }, - { - "epoch": 4.122504714220691, - "grad_norm": 0.54296875, - "learning_rate": 0.0018350998114311722, - "loss": 1.4792, - "step": 317000 - }, - { - "epoch": 4.129007087586969, - "grad_norm": 0.478515625, - "learning_rate": 0.0018348397164965214, - "loss": 1.4831, - "step": 317500 - }, - { - "epoch": 4.135509460953248, - "grad_norm": 0.546875, - "learning_rate": 0.0018345796215618701, - "loss": 1.4805, - "step": 318000 - }, - { - "epoch": 4.1420118343195265, - "grad_norm": 0.6484375, - "learning_rate": 0.001834319526627219, - "loss": 1.4802, - "step": 318500 - }, - { - "epoch": 4.148514207685805, - "grad_norm": 1.203125, - "learning_rate": 0.0018340594316925678, - "loss": 1.4795, - "step": 319000 - }, - { - "epoch": 4.155016581052084, - "grad_norm": 1.6484375, - "learning_rate": 0.0018337993367579166, - "loss": 1.481, - "step": 319500 - }, - { - "epoch": 4.161518954418363, - "grad_norm": 0.490234375, - "learning_rate": 0.0018335392418232655, - "loss": 1.4766, - "step": 320000 - }, - { - "epoch": 4.168021327784642, - "grad_norm": 0.6484375, - "learning_rate": 0.0018332791468886143, - "loss": 1.4774, - "step": 320500 - }, - { - "epoch": 4.17452370115092, - "grad_norm": 52.5, - "learning_rate": 0.001833019051953963, - "loss": 1.4821, - "step": 321000 - }, - { - "epoch": 4.181026074517199, - "grad_norm": 1.2421875, - "learning_rate": 0.0018327589570193122, - "loss": 1.4914, - "step": 321500 - }, - { - "epoch": 4.187528447883477, - "grad_norm": 0.50390625, - "learning_rate": 0.001832498862084661, - "loss": 1.4781, - "step": 322000 - }, - { - "epoch": 4.194030821249756, - "grad_norm": 7.96875, - "learning_rate": 0.0018322387671500099, - "loss": 1.4784, - "step": 322500 - }, - { - "epoch": 4.2005331946160345, - "grad_norm": 0.75390625, - "learning_rate": 0.0018319786722153586, - "loss": 1.4806, - "step": 323000 - }, - { - "epoch": 4.207035567982314, - "grad_norm": 0.55078125, - "learning_rate": 0.0018317185772807074, - "loss": 1.4829, - "step": 323500 - }, - { - "epoch": 4.2135379413485925, - "grad_norm": 0.7890625, - "learning_rate": 0.0018314584823460563, - "loss": 1.4921, - "step": 324000 - }, - { - "epoch": 4.220040314714871, - "grad_norm": 0.9609375, - "learning_rate": 0.001831198387411405, - "loss": 1.4937, - "step": 324500 - }, - { - "epoch": 4.22654268808115, - "grad_norm": 0.60546875, - "learning_rate": 0.0018309382924767542, - "loss": 1.495, - "step": 325000 - }, - { - "epoch": 4.233045061447428, - "grad_norm": 0.59765625, - "learning_rate": 0.001830678197542103, - "loss": 1.4985, - "step": 325500 - }, - { - "epoch": 4.239547434813707, - "grad_norm": 0.5078125, - "learning_rate": 0.0018304181026074517, - "loss": 1.5151, - "step": 326000 - }, - { - "epoch": 4.246049808179985, - "grad_norm": 1.0078125, - "learning_rate": 0.0018301580076728007, - "loss": 1.515, - "step": 326500 - }, - { - "epoch": 4.252552181546265, - "grad_norm": 0.5234375, - "learning_rate": 0.0018298979127381494, - "loss": 1.5118, - "step": 327000 - }, - { - "epoch": 4.259054554912543, - "grad_norm": 0.66796875, - "learning_rate": 0.0018296378178034982, - "loss": 1.5253, - "step": 327500 - }, - { - "epoch": 4.265556928278822, - "grad_norm": 0.69140625, - "learning_rate": 0.0018293777228688471, - "loss": 1.5161, - "step": 328000 - }, - { - "epoch": 4.2720593016451005, - "grad_norm": 0.53515625, - "learning_rate": 0.001829117627934196, - "loss": 1.4984, - "step": 328500 - }, - { - "epoch": 4.278561675011379, - "grad_norm": 0.55859375, - "learning_rate": 0.001828857532999545, - "loss": 1.4888, - "step": 329000 - }, - { - "epoch": 4.285064048377658, - "grad_norm": 3.078125, - "learning_rate": 0.0018285974380648938, - "loss": 1.4896, - "step": 329500 - }, - { - "epoch": 4.291566421743936, - "grad_norm": 0.462890625, - "learning_rate": 0.0018283373431302425, - "loss": 1.4805, - "step": 330000 - }, - { - "epoch": 4.298068795110215, - "grad_norm": 0.546875, - "learning_rate": 0.0018280772481955915, - "loss": 1.4812, - "step": 330500 - }, - { - "epoch": 4.304571168476494, - "grad_norm": 0.6015625, - "learning_rate": 0.0018278171532609402, - "loss": 1.4903, - "step": 331000 - }, - { - "epoch": 4.311073541842773, - "grad_norm": 1.34375, - "learning_rate": 0.001827557058326289, - "loss": 1.4923, - "step": 331500 - }, - { - "epoch": 4.317575915209051, - "grad_norm": 0.494140625, - "learning_rate": 0.0018272969633916381, - "loss": 1.4971, - "step": 332000 - }, - { - "epoch": 4.32407828857533, - "grad_norm": 0.44140625, - "learning_rate": 0.0018270368684569869, - "loss": 1.4985, - "step": 332500 - }, - { - "epoch": 4.3305806619416085, - "grad_norm": 1.0546875, - "learning_rate": 0.0018267767735223358, - "loss": 1.4864, - "step": 333000 - }, - { - "epoch": 4.337083035307887, - "grad_norm": 0.49609375, - "learning_rate": 0.0018265166785876846, - "loss": 1.4898, - "step": 333500 - }, - { - "epoch": 4.343585408674166, - "grad_norm": 1.640625, - "learning_rate": 0.0018262565836530333, - "loss": 1.4832, - "step": 334000 - }, - { - "epoch": 4.350087782040445, - "grad_norm": 0.51953125, - "learning_rate": 0.0018259964887183823, - "loss": 1.4832, - "step": 334500 - }, - { - "epoch": 4.356590155406724, - "grad_norm": 3.0, - "learning_rate": 0.001825736393783731, - "loss": 1.4868, - "step": 335000 - }, - { - "epoch": 4.363092528773002, - "grad_norm": 1.59375, - "learning_rate": 0.0018254762988490797, - "loss": 1.4895, - "step": 335500 - }, - { - "epoch": 4.369594902139281, - "grad_norm": 0.67578125, - "learning_rate": 0.001825216203914429, - "loss": 1.4896, - "step": 336000 - }, - { - "epoch": 4.376097275505559, - "grad_norm": 3.984375, - "learning_rate": 0.0018249561089797777, - "loss": 1.4895, - "step": 336500 - }, - { - "epoch": 4.382599648871838, - "grad_norm": 0.8828125, - "learning_rate": 0.0018246960140451266, - "loss": 1.4857, - "step": 337000 - }, - { - "epoch": 4.3891020222381165, - "grad_norm": 1.3515625, - "learning_rate": 0.0018244359191104753, - "loss": 1.4922, - "step": 337500 - }, - { - "epoch": 4.395604395604396, - "grad_norm": 0.52734375, - "learning_rate": 0.001824175824175824, - "loss": 1.492, - "step": 338000 - }, - { - "epoch": 4.4021067689706745, - "grad_norm": 0.53515625, - "learning_rate": 0.001823915729241173, - "loss": 1.4948, - "step": 338500 - }, - { - "epoch": 4.408609142336953, - "grad_norm": 0.458984375, - "learning_rate": 0.0018236556343065218, - "loss": 1.4994, - "step": 339000 - }, - { - "epoch": 4.415111515703232, - "grad_norm": 0.5859375, - "learning_rate": 0.001823395539371871, - "loss": 1.4972, - "step": 339500 - }, - { - "epoch": 4.42161388906951, - "grad_norm": 0.5234375, - "learning_rate": 0.0018231354444372197, - "loss": 1.4951, - "step": 340000 - }, - { - "epoch": 4.428116262435789, - "grad_norm": 0.55078125, - "learning_rate": 0.0018228753495025684, - "loss": 1.4894, - "step": 340500 - }, - { - "epoch": 4.434618635802067, - "grad_norm": 0.58203125, - "learning_rate": 0.0018226152545679174, - "loss": 1.4951, - "step": 341000 - }, - { - "epoch": 4.441121009168347, - "grad_norm": 0.50390625, - "learning_rate": 0.0018223551596332661, - "loss": 1.4847, - "step": 341500 - }, - { - "epoch": 4.447623382534625, - "grad_norm": 0.80859375, - "learning_rate": 0.0018220950646986149, - "loss": 1.4824, - "step": 342000 - }, - { - "epoch": 4.454125755900904, - "grad_norm": 0.5, - "learning_rate": 0.0018218349697639638, - "loss": 1.4808, - "step": 342500 - }, - { - "epoch": 4.4606281292671826, - "grad_norm": 0.5234375, - "learning_rate": 0.0018215748748293128, - "loss": 1.4828, - "step": 343000 - }, - { - "epoch": 4.467130502633461, - "grad_norm": 0.5234375, - "learning_rate": 0.0018213147798946617, - "loss": 1.4803, - "step": 343500 - }, - { - "epoch": 4.47363287599974, - "grad_norm": 0.5, - "learning_rate": 0.0018210546849600105, - "loss": 1.4776, - "step": 344000 - }, - { - "epoch": 4.480135249366018, - "grad_norm": 0.60546875, - "learning_rate": 0.0018207945900253592, - "loss": 1.4754, - "step": 344500 - }, - { - "epoch": 4.486637622732298, - "grad_norm": 0.69921875, - "learning_rate": 0.0018205344950907082, - "loss": 1.4711, - "step": 345000 - }, - { - "epoch": 4.493139996098576, - "grad_norm": 0.5, - "learning_rate": 0.001820274400156057, - "loss": 1.4715, - "step": 345500 - }, - { - "epoch": 4.499642369464855, - "grad_norm": 0.54296875, - "learning_rate": 0.0018200143052214057, - "loss": 1.4745, - "step": 346000 - }, - { - "epoch": 4.506144742831133, - "grad_norm": 0.423828125, - "learning_rate": 0.0018197542102867548, - "loss": 1.4725, - "step": 346500 - }, - { - "epoch": 4.512647116197412, - "grad_norm": 0.55859375, - "learning_rate": 0.0018194941153521036, - "loss": 1.4725, - "step": 347000 - }, - { - "epoch": 4.519149489563691, - "grad_norm": 0.53125, - "learning_rate": 0.0018192340204174525, - "loss": 1.4669, - "step": 347500 - }, - { - "epoch": 4.525651862929969, - "grad_norm": 0.5078125, - "learning_rate": 0.0018189739254828013, - "loss": 1.4681, - "step": 348000 - }, - { - "epoch": 4.532154236296249, - "grad_norm": 0.87109375, - "learning_rate": 0.00181871383054815, - "loss": 1.4667, - "step": 348500 - }, - { - "epoch": 4.538656609662527, - "grad_norm": 0.515625, - "learning_rate": 0.001818453735613499, - "loss": 1.468, - "step": 349000 - }, - { - "epoch": 4.545158983028806, - "grad_norm": 0.5703125, - "learning_rate": 0.0018181936406788477, - "loss": 1.4745, - "step": 349500 - }, - { - "epoch": 4.551661356395084, - "grad_norm": 0.46875, - "learning_rate": 0.0018179335457441965, - "loss": 1.4762, - "step": 350000 - }, - { - "epoch": 4.558163729761363, - "grad_norm": 0.72265625, - "learning_rate": 0.0018176734508095456, - "loss": 1.4789, - "step": 350500 - }, - { - "epoch": 4.564666103127641, - "grad_norm": 0.70703125, - "learning_rate": 0.0018174133558748944, - "loss": 1.475, - "step": 351000 - }, - { - "epoch": 4.57116847649392, - "grad_norm": 0.447265625, - "learning_rate": 0.0018171532609402433, - "loss": 1.4692, - "step": 351500 - }, - { - "epoch": 4.5776708498601995, - "grad_norm": 0.69140625, - "learning_rate": 0.001816893166005592, - "loss": 1.475, - "step": 352000 - }, - { - "epoch": 4.584173223226478, - "grad_norm": 0.67578125, - "learning_rate": 0.0018166330710709408, - "loss": 1.477, - "step": 352500 - }, - { - "epoch": 4.590675596592757, - "grad_norm": 0.462890625, - "learning_rate": 0.0018163729761362898, - "loss": 1.4727, - "step": 353000 - }, - { - "epoch": 4.597177969959035, - "grad_norm": 2.53125, - "learning_rate": 0.0018161128812016385, - "loss": 1.4759, - "step": 353500 - }, - { - "epoch": 4.603680343325314, - "grad_norm": 0.671875, - "learning_rate": 0.0018158527862669877, - "loss": 1.4758, - "step": 354000 - }, - { - "epoch": 4.610182716691592, - "grad_norm": 0.76171875, - "learning_rate": 0.0018155926913323364, - "loss": 1.4732, - "step": 354500 - }, - { - "epoch": 4.616685090057871, - "grad_norm": 0.4375, - "learning_rate": 0.0018153325963976852, - "loss": 1.4689, - "step": 355000 - }, - { - "epoch": 4.623187463424149, - "grad_norm": 0.765625, - "learning_rate": 0.0018150725014630341, - "loss": 1.4703, - "step": 355500 - }, - { - "epoch": 4.629689836790429, - "grad_norm": 0.51953125, - "learning_rate": 0.0018148124065283829, - "loss": 1.4674, - "step": 356000 - }, - { - "epoch": 4.6361922101567075, - "grad_norm": 1.1171875, - "learning_rate": 0.0018145523115937316, - "loss": 1.4734, - "step": 356500 - }, - { - "epoch": 4.642694583522986, - "grad_norm": 0.59375, - "learning_rate": 0.0018142922166590806, - "loss": 1.4702, - "step": 357000 - }, - { - "epoch": 4.649196956889265, - "grad_norm": 0.578125, - "learning_rate": 0.0018140321217244295, - "loss": 1.4727, - "step": 357500 - }, - { - "epoch": 4.655699330255543, - "grad_norm": 0.478515625, - "learning_rate": 0.0018137720267897785, - "loss": 1.4695, - "step": 358000 - }, - { - "epoch": 4.662201703621822, - "grad_norm": 0.51171875, - "learning_rate": 0.0018135119318551272, - "loss": 1.4736, - "step": 358500 - }, - { - "epoch": 4.6687040769881, - "grad_norm": 0.498046875, - "learning_rate": 0.001813251836920476, - "loss": 1.4751, - "step": 359000 - }, - { - "epoch": 4.675206450354379, - "grad_norm": 0.55078125, - "learning_rate": 0.001812991741985825, - "loss": 1.4731, - "step": 359500 - }, - { - "epoch": 4.681708823720658, - "grad_norm": 0.56640625, - "learning_rate": 0.0018127316470511736, - "loss": 1.4729, - "step": 360000 - }, - { - "epoch": 4.688211197086937, - "grad_norm": 0.67578125, - "learning_rate": 0.0018124715521165224, - "loss": 1.4717, - "step": 360500 - }, - { - "epoch": 4.6947135704532155, - "grad_norm": 0.48046875, - "learning_rate": 0.0018122114571818716, - "loss": 1.4695, - "step": 361000 - }, - { - "epoch": 4.701215943819494, - "grad_norm": 0.494140625, - "learning_rate": 0.0018119513622472203, - "loss": 1.4721, - "step": 361500 - }, - { - "epoch": 4.707718317185773, - "grad_norm": 0.62109375, - "learning_rate": 0.0018116912673125693, - "loss": 1.4716, - "step": 362000 - }, - { - "epoch": 4.714220690552051, - "grad_norm": 0.5703125, - "learning_rate": 0.001811431172377918, - "loss": 1.466, - "step": 362500 - }, - { - "epoch": 4.72072306391833, - "grad_norm": 3.59375, - "learning_rate": 0.0018111710774432667, - "loss": 1.4712, - "step": 363000 - }, - { - "epoch": 4.727225437284609, - "grad_norm": 0.62109375, - "learning_rate": 0.0018109109825086157, - "loss": 1.4683, - "step": 363500 - }, - { - "epoch": 4.733727810650888, - "grad_norm": 1.390625, - "learning_rate": 0.0018106508875739644, - "loss": 1.4724, - "step": 364000 - }, - { - "epoch": 4.740230184017166, - "grad_norm": 0.51953125, - "learning_rate": 0.0018103907926393132, - "loss": 1.4704, - "step": 364500 - }, - { - "epoch": 4.746732557383445, - "grad_norm": 0.578125, - "learning_rate": 0.0018101306977046623, - "loss": 1.4699, - "step": 365000 - }, - { - "epoch": 4.7532349307497235, - "grad_norm": 0.58203125, - "learning_rate": 0.001809870602770011, - "loss": 1.4741, - "step": 365500 - }, - { - "epoch": 4.759737304116002, - "grad_norm": 0.5546875, - "learning_rate": 0.00180961050783536, - "loss": 1.4728, - "step": 366000 - }, - { - "epoch": 4.766239677482281, - "grad_norm": 0.59765625, - "learning_rate": 0.0018093504129007088, - "loss": 1.4674, - "step": 366500 - }, - { - "epoch": 4.77274205084856, - "grad_norm": 1.03125, - "learning_rate": 0.0018090903179660575, - "loss": 1.4757, - "step": 367000 - }, - { - "epoch": 4.779244424214839, - "grad_norm": 0.65625, - "learning_rate": 0.0018088302230314065, - "loss": 1.4755, - "step": 367500 - }, - { - "epoch": 4.785746797581117, - "grad_norm": 0.6953125, - "learning_rate": 0.0018085701280967552, - "loss": 1.4721, - "step": 368000 - }, - { - "epoch": 4.792249170947396, - "grad_norm": 2.375, - "learning_rate": 0.0018083100331621044, - "loss": 1.473, - "step": 368500 - }, - { - "epoch": 4.798751544313674, - "grad_norm": 0.515625, - "learning_rate": 0.0018080499382274531, - "loss": 1.472, - "step": 369000 - }, - { - "epoch": 4.805253917679953, - "grad_norm": 0.443359375, - "learning_rate": 0.0018077898432928019, - "loss": 1.4735, - "step": 369500 - }, - { - "epoch": 4.8117562910462315, - "grad_norm": 0.515625, - "learning_rate": 0.0018075297483581508, - "loss": 1.4696, - "step": 370000 - }, - { - "epoch": 4.818258664412511, - "grad_norm": 0.466796875, - "learning_rate": 0.0018072696534234996, - "loss": 1.4682, - "step": 370500 - }, - { - "epoch": 4.8247610377787895, - "grad_norm": 0.53515625, - "learning_rate": 0.0018070095584888483, - "loss": 1.4683, - "step": 371000 - }, - { - "epoch": 4.831263411145068, - "grad_norm": 0.453125, - "learning_rate": 0.0018067494635541973, - "loss": 1.4678, - "step": 371500 - }, - { - "epoch": 4.837765784511347, - "grad_norm": 1.1171875, - "learning_rate": 0.0018064893686195462, - "loss": 1.4719, - "step": 372000 - }, - { - "epoch": 4.844268157877625, - "grad_norm": 0.55078125, - "learning_rate": 0.0018062292736848952, - "loss": 1.4704, - "step": 372500 - }, - { - "epoch": 4.850770531243904, - "grad_norm": 0.65234375, - "learning_rate": 0.001805969178750244, - "loss": 1.4744, - "step": 373000 - }, - { - "epoch": 4.857272904610182, - "grad_norm": 0.99609375, - "learning_rate": 0.0018057090838155927, - "loss": 1.4697, - "step": 373500 - }, - { - "epoch": 4.863775277976462, - "grad_norm": 0.52734375, - "learning_rate": 0.0018054489888809416, - "loss": 1.4715, - "step": 374000 - }, - { - "epoch": 4.87027765134274, - "grad_norm": 0.5078125, - "learning_rate": 0.0018051888939462904, - "loss": 1.4689, - "step": 374500 - }, - { - "epoch": 4.876780024709019, - "grad_norm": 0.80078125, - "learning_rate": 0.001804928799011639, - "loss": 1.4676, - "step": 375000 - }, - { - "epoch": 4.8832823980752975, - "grad_norm": 0.51171875, - "learning_rate": 0.0018046687040769883, - "loss": 1.4726, - "step": 375500 - }, - { - "epoch": 4.889784771441576, - "grad_norm": 0.87109375, - "learning_rate": 0.001804408609142337, - "loss": 1.4689, - "step": 376000 - }, - { - "epoch": 4.896287144807855, - "grad_norm": 0.55859375, - "learning_rate": 0.001804148514207686, - "loss": 1.4697, - "step": 376500 - }, - { - "epoch": 4.902789518174133, - "grad_norm": 0.4921875, - "learning_rate": 0.0018038884192730347, - "loss": 1.4706, - "step": 377000 - }, - { - "epoch": 4.909291891540413, - "grad_norm": 0.5859375, - "learning_rate": 0.0018036283243383835, - "loss": 1.4714, - "step": 377500 - }, - { - "epoch": 4.915794264906691, - "grad_norm": 0.46875, - "learning_rate": 0.0018033682294037324, - "loss": 1.466, - "step": 378000 - }, - { - "epoch": 4.92229663827297, - "grad_norm": 1.1875, - "learning_rate": 0.0018031081344690812, - "loss": 1.4719, - "step": 378500 - }, - { - "epoch": 4.928799011639248, - "grad_norm": 0.58984375, - "learning_rate": 0.0018028480395344299, - "loss": 1.4701, - "step": 379000 - }, - { - "epoch": 4.935301385005527, - "grad_norm": 5.46875, - "learning_rate": 0.001802587944599779, - "loss": 1.473, - "step": 379500 - }, - { - "epoch": 4.9418037583718055, - "grad_norm": 0.6796875, - "learning_rate": 0.0018023278496651278, - "loss": 1.4721, - "step": 380000 - }, - { - "epoch": 4.948306131738084, - "grad_norm": 0.59375, - "learning_rate": 0.0018020677547304768, - "loss": 1.474, - "step": 380500 - }, - { - "epoch": 4.9548085051043635, - "grad_norm": 0.546875, - "learning_rate": 0.0018018076597958255, - "loss": 1.473, - "step": 381000 - }, - { - "epoch": 4.961310878470642, - "grad_norm": 0.54296875, - "learning_rate": 0.0018015475648611742, - "loss": 1.4764, - "step": 381500 - }, - { - "epoch": 4.967813251836921, - "grad_norm": 1.9140625, - "learning_rate": 0.0018012874699265232, - "loss": 1.4717, - "step": 382000 - }, - { - "epoch": 4.974315625203199, - "grad_norm": 0.58203125, - "learning_rate": 0.001801027374991872, - "loss": 1.4759, - "step": 382500 - }, - { - "epoch": 4.980817998569478, - "grad_norm": 0.462890625, - "learning_rate": 0.0018007672800572211, - "loss": 1.4792, - "step": 383000 - }, - { - "epoch": 4.987320371935756, - "grad_norm": 0.84765625, - "learning_rate": 0.0018005071851225699, - "loss": 1.4764, - "step": 383500 - }, - { - "epoch": 4.993822745302035, - "grad_norm": 0.53515625, - "learning_rate": 0.0018002470901879186, - "loss": 1.4758, - "step": 384000 - }, - { - "epoch": 5.0, - "eval_loss": 1.4539484977722168, - "eval_runtime": 0.9039, - "eval_samples_per_second": 1106.3, - "eval_steps_per_second": 8.85, - "step": 384475 + "epoch": 19.0, + "eval_loss": 1.3305245637893677, + "eval_runtime": 1.4316, + "eval_samples_per_second": 698.511, + "eval_steps_per_second": 0.699, + "step": 182628 } ], "logging_steps": 500, - "max_steps": 3844750, + "max_steps": 480600, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, - "total_flos": 3.3089171938476826e+18, - "train_batch_size": 128, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3420668838410615e+19, + "train_batch_size": 1024, "trial_name": null, "trial_params": null }