|
{ |
|
"best_metric": 1.4534417390823364, |
|
"best_model_checkpoint": "./results/models/checkpoint-307580", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 384475, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006502373366278692, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001999739905065349, |
|
"loss": 2.651, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.013004746732557384, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 0.0019994798101306975, |
|
"loss": 2.3581, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.019507120098836074, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0019992197151960465, |
|
"loss": 2.2735, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.026009493465114768, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0019989596202613954, |
|
"loss": 2.2235, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.03251186683139346, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.0019986995253267444, |
|
"loss": 2.1651, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.03901424019767215, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0019984394303920934, |
|
"loss": 2.125, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.04551661356395084, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001998179335457442, |
|
"loss": 2.0895, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.052018986930229535, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.001997919240522791, |
|
"loss": 2.0827, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.05852136029650822, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00199765914558814, |
|
"loss": 2.0646, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.06502373366278692, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0019973990506534883, |
|
"loss": 2.039, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0715261070290656, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.0019971389557188377, |
|
"loss": 2.0101, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.0780284803953443, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.0019968788607841862, |
|
"loss": 2.0275, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.08453085376162299, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001996618765849535, |
|
"loss": 1.9981, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.09103322712790168, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.001996358670914884, |
|
"loss": 1.9754, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.09753560049418038, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019960985759802327, |
|
"loss": 1.9366, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.10403797386045907, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.0019958384810455816, |
|
"loss": 1.921, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.11054034722673776, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0019955783861109306, |
|
"loss": 1.9106, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.11704272059301644, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001995318291176279, |
|
"loss": 1.9082, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.12354509395929514, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0019950581962416285, |
|
"loss": 1.8952, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.13004746732557385, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001994798101306977, |
|
"loss": 1.9024, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.13654984069185253, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001994538006372326, |
|
"loss": 1.9417, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.1430522140581312, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001994277911437675, |
|
"loss": 1.9231, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.1495545874244099, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0019940178165030235, |
|
"loss": 1.9009, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.1560569607906886, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0019937577215683724, |
|
"loss": 1.9132, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.1625593341569673, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0019934976266337214, |
|
"loss": 1.8845, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.16906170752324598, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.0019932375316990703, |
|
"loss": 1.8613, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.1755640808895247, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 0.0019929774367644193, |
|
"loss": 1.8528, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.18206645425580337, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001992717341829768, |
|
"loss": 1.8416, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.18856882762208205, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0019924572468951168, |
|
"loss": 1.8443, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.19507120098836075, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0019921971519604657, |
|
"loss": 1.8301, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.20157357435463943, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.0019919370570258142, |
|
"loss": 1.8077, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.20807594772091814, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.001991676962091163, |
|
"loss": 1.7963, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.21457832108719682, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.001991416867156512, |
|
"loss": 1.7879, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.22108069445347553, |
|
"grad_norm": 5.25, |
|
"learning_rate": 0.001991156772221861, |
|
"loss": 1.793, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.2275830678197542, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00199089667728721, |
|
"loss": 1.7897, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.2340854411860329, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0019906365823525586, |
|
"loss": 1.7831, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.2405878145523116, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.0019903764874179075, |
|
"loss": 1.7912, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.24709018791859028, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0019901163924832565, |
|
"loss": 1.7988, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.25359256128486896, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001989856297548605, |
|
"loss": 1.8031, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.2600949346511477, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0019895962026139544, |
|
"loss": 1.8197, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.26659730801742637, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.001989336107679303, |
|
"loss": 1.8048, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.27309968138370505, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001989076012744652, |
|
"loss": 1.7964, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.27960205474998373, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001988815917810001, |
|
"loss": 1.7898, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.2861044281162624, |
|
"grad_norm": 5.25, |
|
"learning_rate": 0.0019885558228753494, |
|
"loss": 1.7911, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.29260680148254115, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0019882957279406983, |
|
"loss": 1.7739, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.2991091748488198, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0019880356330060473, |
|
"loss": 1.7675, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.3056115482150985, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.001987775538071396, |
|
"loss": 1.767, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.3121139215813772, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.001987515443136745, |
|
"loss": 1.7782, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.3186162949476559, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0019872553482020937, |
|
"loss": 1.7878, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.3251186683139346, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0019869952532674427, |
|
"loss": 1.7762, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.3316210416802133, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 0.0019867351583327916, |
|
"loss": 1.744, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.33812341504649196, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00198647506339814, |
|
"loss": 1.7591, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.34462578841277064, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.001986214968463489, |
|
"loss": 1.7565, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.3511281617790494, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001985954873528838, |
|
"loss": 1.7639, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.35763053514532805, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001985694778594187, |
|
"loss": 1.7547, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.36413290851160673, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.001985434683659536, |
|
"loss": 1.7356, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.3706352818778854, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0019851745887248845, |
|
"loss": 1.7366, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.3771376552441641, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.0019849144937902335, |
|
"loss": 1.7293, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.38364002861044283, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0019846543988555824, |
|
"loss": 1.7272, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.3901424019767215, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001984394303920931, |
|
"loss": 1.7196, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.3966447753430002, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00198413420898628, |
|
"loss": 1.7293, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.40314714870927887, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.001983874114051629, |
|
"loss": 1.7171, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.4096495220755576, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001983614019116978, |
|
"loss": 1.7131, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.4161518954418363, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 0.001983353924182327, |
|
"loss": 1.7128, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.42265426880811496, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0019830938292476753, |
|
"loss": 1.7127, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.42915664217439364, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0019828337343130243, |
|
"loss": 1.7052, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.4356590155406723, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0019825736393783732, |
|
"loss": 1.701, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.44216138890695106, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0019823135444437217, |
|
"loss": 1.6965, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.44866376227322974, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001982053449509071, |
|
"loss": 1.6987, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.4551661356395084, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.0019817933545744197, |
|
"loss": 1.699, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.4616685090057871, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0019815332596397686, |
|
"loss": 1.6868, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.4681708823720658, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0019812731647051176, |
|
"loss": 1.6836, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.4746732557383445, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001981013069770466, |
|
"loss": 1.6805, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.4811756291046232, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001980752974835815, |
|
"loss": 1.6832, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.4876780024709019, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.001980492879901164, |
|
"loss": 1.6827, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.49418037583718055, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0019802327849665125, |
|
"loss": 1.6739, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.5006827492034592, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.001979972690031862, |
|
"loss": 1.676, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.5071851225697379, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0019797125950972105, |
|
"loss": 1.6762, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.5136874959360166, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0019794525001625594, |
|
"loss": 1.6743, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.5201898693022954, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0019791924052279084, |
|
"loss": 1.6713, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.5266922426685741, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001978932310293257, |
|
"loss": 1.668, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.5331946160348527, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001978672215358606, |
|
"loss": 1.665, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.5396969894011314, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.001978412120423955, |
|
"loss": 1.6673, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.5461993627674101, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.0019781520254893038, |
|
"loss": 1.6608, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.5527017361336888, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0019778919305546527, |
|
"loss": 1.6578, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.5592041094999675, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0019776318356200012, |
|
"loss": 1.6561, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.5657064828662461, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00197737174068535, |
|
"loss": 1.6527, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.5722088562325248, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001977111645750699, |
|
"loss": 1.6467, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.5787112295988036, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0019768515508160477, |
|
"loss": 1.6501, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.5852136029650823, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019765914558813966, |
|
"loss": 1.6533, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0019763313609467456, |
|
"loss": 1.6543, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.5982183496976397, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0019760712660120945, |
|
"loss": 1.651, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.6047207230639183, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0019758111710774435, |
|
"loss": 1.6474, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.611223096430197, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001975551076142792, |
|
"loss": 1.6363, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.6177254697964757, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001975290981208141, |
|
"loss": 1.6395, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.6242278431627544, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00197503088627349, |
|
"loss": 1.6419, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.630730216529033, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0019747707913388385, |
|
"loss": 1.6377, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.6372325898953118, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.001974510696404188, |
|
"loss": 1.6428, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.6437349632615905, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0019742506014695364, |
|
"loss": 1.6507, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.6502373366278692, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0019739905065348853, |
|
"loss": 1.6552, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.6567397099941479, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.0019737304116002343, |
|
"loss": 1.6509, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.6632420833604266, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001973470316665583, |
|
"loss": 1.6545, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.6697444567267052, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0019732102217309318, |
|
"loss": 1.6451, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.6762468300929839, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0019729501267962807, |
|
"loss": 1.6408, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.6827492034592626, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0019726900318616293, |
|
"loss": 1.6378, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.6892515768255413, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0019724299369269786, |
|
"loss": 1.637, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.6957539501918201, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001972169841992327, |
|
"loss": 1.6317, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.7022563235580987, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.001971909747057676, |
|
"loss": 1.6326, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.7087586969243774, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001971649652123025, |
|
"loss": 1.6285, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.7152610702906561, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0019713895571883736, |
|
"loss": 1.6336, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.7217634436569348, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0019711294622537226, |
|
"loss": 1.636, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.7282658170232135, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0019708693673190715, |
|
"loss": 1.6426, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.7347681903894921, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0019706092723844205, |
|
"loss": 1.6444, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.7412705637557708, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 0.0019703491774497694, |
|
"loss": 1.6306, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.7477729371220495, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001970089082515118, |
|
"loss": 1.6341, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.7542753104883282, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.001969828987580467, |
|
"loss": 1.6356, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.760777683854607, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001969568892645816, |
|
"loss": 1.6326, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.7672800572208857, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0019693087977111644, |
|
"loss": 1.6272, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.7737824305871643, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0019690487027765134, |
|
"loss": 1.6223, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.780284803953443, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0019687886078418623, |
|
"loss": 1.6297, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.7867871773197217, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0019685285129072113, |
|
"loss": 1.6278, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.7932895506860004, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.0019682684179725602, |
|
"loss": 1.632, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.7997919240522791, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0019680083230379087, |
|
"loss": 1.6304, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.8062942974185577, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.0019677482281032577, |
|
"loss": 1.6262, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.8127966707848364, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0019674881331686067, |
|
"loss": 1.6304, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.8192990441511152, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001967228038233955, |
|
"loss": 1.6413, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.8258014175173939, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.0019669679432993046, |
|
"loss": 1.6264, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.8323037908836726, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001966707848364653, |
|
"loss": 1.6243, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.8388061642499512, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001966447753430002, |
|
"loss": 1.6298, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.8453085376162299, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.001966187658495351, |
|
"loss": 1.6207, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.8518109109825086, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0019659275635606995, |
|
"loss": 1.6121, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.8583132843487873, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0019656674686260485, |
|
"loss": 1.6175, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.864815657715066, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0019654073736913974, |
|
"loss": 1.6142, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.8713180310813446, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001965147278756746, |
|
"loss": 1.614, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.8778204044476234, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0019648871838220954, |
|
"loss": 1.6112, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.8843227778139021, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001964627088887444, |
|
"loss": 1.6082, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.8908251511801808, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.001964366993952793, |
|
"loss": 1.6121, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.8973275245464595, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.001964106899018142, |
|
"loss": 1.6084, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.9038298979127382, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0019638468040834903, |
|
"loss": 1.603, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.9103322712790168, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0019635867091488393, |
|
"loss": 1.6033, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.9168346446452955, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019633266142141882, |
|
"loss": 1.5994, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.9233370180115742, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.001963066519279537, |
|
"loss": 1.5958, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.9298393913778529, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001962806424344886, |
|
"loss": 1.5963, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.9363417647441316, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0019625463294102347, |
|
"loss": 1.5962, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.9428441381104103, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0019622862344755836, |
|
"loss": 1.5861, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.949346511476689, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.0019620261395409326, |
|
"loss": 1.5928, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.9558488848429677, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001961766044606281, |
|
"loss": 1.592, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.9623512582092464, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00196150594967163, |
|
"loss": 1.5859, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.9688536315755251, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001961245854736979, |
|
"loss": 1.5834, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.9753560049418037, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001960985759802328, |
|
"loss": 1.5893, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.9818583783080824, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.001960725664867677, |
|
"loss": 1.5846, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.9883607516743611, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.0019604655699330255, |
|
"loss": 1.5838, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.9948631250406398, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0019602054749983744, |
|
"loss": 1.5809, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.5596588850021362, |
|
"eval_runtime": 0.8816, |
|
"eval_samples_per_second": 1134.308, |
|
"eval_steps_per_second": 9.074, |
|
"step": 76895 |
|
}, |
|
{ |
|
"epoch": 1.0013654984069185, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0019599453800637234, |
|
"loss": 1.5838, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.0078678717731973, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001959685285129072, |
|
"loss": 1.5829, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.0143702451394758, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0019594251901944213, |
|
"loss": 1.5812, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.0208726185057546, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00195916509525977, |
|
"loss": 1.5797, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.0273749918720332, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0019589050003251188, |
|
"loss": 1.5785, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.033877365238312, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0019586449053904677, |
|
"loss": 1.5752, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.0403797386045908, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0019583848104558163, |
|
"loss": 1.582, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.0468821119708693, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001958124715521165, |
|
"loss": 1.5762, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 1.0533844853371481, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.001957864620586514, |
|
"loss": 1.577, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.0598868587034267, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0019576045256518627, |
|
"loss": 1.5741, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 1.0663892320697055, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001957344430717212, |
|
"loss": 1.574, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.072891605435984, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019570843357825606, |
|
"loss": 1.5744, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 1.0793939788022628, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0019568242408479096, |
|
"loss": 1.5745, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.0858963521685414, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0019565641459132585, |
|
"loss": 1.5758, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 1.0923987255348202, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001956304050978607, |
|
"loss": 1.5698, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.098901098901099, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.001956043956043956, |
|
"loss": 1.576, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 1.1054034722673776, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001955783861109305, |
|
"loss": 1.5778, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.1119058456336564, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.001955523766174654, |
|
"loss": 1.5739, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 1.118408218999935, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001955263671240003, |
|
"loss": 1.5778, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.1249105923662137, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0019550035763053514, |
|
"loss": 1.5713, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 1.1314129657324923, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0019547434813707004, |
|
"loss": 1.5767, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.137915339098771, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0019544833864360493, |
|
"loss": 1.5732, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 1.1444177124650499, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.001954223291501398, |
|
"loss": 1.5688, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.1509200858313284, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.001953963196566747, |
|
"loss": 1.5739, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 1.1574224591976072, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0019537031016320957, |
|
"loss": 1.5733, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 1.1639248325638858, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0019534430066974447, |
|
"loss": 1.576, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 1.1704272059301646, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.0019531829117627937, |
|
"loss": 1.5668, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 1.1769295792964432, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.0019529228168281424, |
|
"loss": 1.5677, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.0019526627218934911, |
|
"loss": 1.5671, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 1.1899343260290005, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0019524026269588399, |
|
"loss": 1.5649, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 1.1964366993952793, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0019521425320241888, |
|
"loss": 1.5645, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 1.2029390727615579, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0019518824370895378, |
|
"loss": 1.5646, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 1.2094414461278367, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.0019516223421548868, |
|
"loss": 1.5711, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 1.2159438194941155, |
|
"grad_norm": 10.25, |
|
"learning_rate": 0.0019513622472202355, |
|
"loss": 1.5674, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 1.222446192860394, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.0019511021522855842, |
|
"loss": 1.5693, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 1.2289485662266728, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0019508420573509332, |
|
"loss": 1.5683, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 1.2354509395929514, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.001950581962416282, |
|
"loss": 1.5786, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 1.2419533129592302, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0019503218674816307, |
|
"loss": 1.5745, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 1.2484556863255087, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0019500617725469796, |
|
"loss": 1.5704, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 1.2549580596917875, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0019498016776123286, |
|
"loss": 1.5753, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 1.2614604330580663, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.0019495415826776775, |
|
"loss": 1.5765, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 1.267962806424345, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0019492814877430263, |
|
"loss": 1.5718, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 1.2744651797906235, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.001949021392808375, |
|
"loss": 1.5716, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 1.2809675531569022, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.001948761297873724, |
|
"loss": 1.5655, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 1.287469926523181, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.0019485012029390727, |
|
"loss": 1.5646, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 1.2939722998894596, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0019482411080044215, |
|
"loss": 1.57, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 1.3004746732557384, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0019479810130697706, |
|
"loss": 1.5607, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 1.306977046622017, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.0019477209181351194, |
|
"loss": 1.5576, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 1.3134794199882958, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0019474608232004683, |
|
"loss": 1.5718, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 1.3199817933545743, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.001947200728265817, |
|
"loss": 1.562, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 1.3264841667208531, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0019469406333311658, |
|
"loss": 1.5603, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 1.332986540087132, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0019466805383965148, |
|
"loss": 1.5631, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 1.3394889134534105, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0019464204434618635, |
|
"loss": 1.5616, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 1.345991286819689, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0019461603485272127, |
|
"loss": 1.5611, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 1.3524936601859678, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0019459002535925614, |
|
"loss": 1.5634, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 1.3589960335522466, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0019456401586579102, |
|
"loss": 1.5648, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 1.3654984069185252, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0019453800637232591, |
|
"loss": 1.5611, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 1.372000780284804, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0019451199687886079, |
|
"loss": 1.5615, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 1.3785031536510828, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0019448598738539566, |
|
"loss": 1.5628, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 1.3850055270173613, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0019445997789193056, |
|
"loss": 1.5608, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 1.39150790038364, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0019443396839846543, |
|
"loss": 1.5609, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 1.3980102737499187, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0019440795890500035, |
|
"loss": 1.5579, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 1.4045126471161975, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0019438194941153522, |
|
"loss": 1.5526, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 1.411015020482476, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001943559399180701, |
|
"loss": 1.554, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 1.4175173938487549, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.00194329930424605, |
|
"loss": 1.5554, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 1.4240197672150334, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.0019430392093113986, |
|
"loss": 1.5518, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 1.4305221405813122, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0019427791143767474, |
|
"loss": 1.5562, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 1.4370245139475908, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0019425190194420963, |
|
"loss": 1.5568, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 1.4435268873138696, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0019422589245074453, |
|
"loss": 1.5476, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 1.4500292606801484, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0019419988295727943, |
|
"loss": 1.558, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 1.456531634046427, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.001941738734638143, |
|
"loss": 1.5478, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 1.4630340074127055, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0019414786397034917, |
|
"loss": 1.553, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 1.4695363807789843, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.0019412185447688407, |
|
"loss": 1.5621, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 1.476038754145263, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0019409584498341894, |
|
"loss": 1.5583, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 1.4825411275115417, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0019406983548995382, |
|
"loss": 1.5567, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 1.4890435008778204, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0019404382599648873, |
|
"loss": 1.5509, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 1.495545874244099, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.001940178165030236, |
|
"loss": 1.5542, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 1.5020482476103778, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.001939918070095585, |
|
"loss": 1.554, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 1.5085506209766564, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0019396579751609338, |
|
"loss": 1.5505, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 1.5150529943429352, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0019393978802262825, |
|
"loss": 1.5542, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 1.521555367709214, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0019391377852916315, |
|
"loss": 1.5487, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 1.5280577410754925, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.0019388776903569802, |
|
"loss": 1.5535, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 1.534560114441771, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0019386175954223294, |
|
"loss": 1.5524, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 1.5410624878080499, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0019383575004876781, |
|
"loss": 1.548, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 1.5475648611743287, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0019380974055530269, |
|
"loss": 1.5434, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 1.5540672345406072, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.0019378373106183758, |
|
"loss": 1.5461, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 1.560569607906886, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0019375772156837246, |
|
"loss": 1.5469, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 1.5670719812731648, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0019373171207490733, |
|
"loss": 1.5484, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 1.5735743546394434, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.0019370570258144223, |
|
"loss": 1.5412, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 1.580076728005722, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001936796930879771, |
|
"loss": 1.5426, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 1.5865791013720008, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.0019365368359451202, |
|
"loss": 1.5482, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 1.5930814747382795, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.001936276741010469, |
|
"loss": 1.5487, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 1.5995838481045581, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0019360166460758177, |
|
"loss": 1.5529, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 1.606086221470837, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0019357565511411666, |
|
"loss": 1.5546, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 1.6125885948371157, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0019354964562065154, |
|
"loss": 1.544, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 1.6190909682033943, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001935236361271864, |
|
"loss": 1.5507, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 1.6255933415696728, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001934976266337213, |
|
"loss": 1.5515, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 1.6320957149359516, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.001934716171402562, |
|
"loss": 1.5502, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 1.6385980883022304, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001934456076467911, |
|
"loss": 1.5427, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 1.645100461668509, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0019341959815332597, |
|
"loss": 1.5471, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 1.6516028350347876, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0019339358865986085, |
|
"loss": 1.5405, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 1.6581052084010663, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0019336757916639574, |
|
"loss": 1.5431, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 1.6646075817673451, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.0019334156967293062, |
|
"loss": 1.5484, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 1.6711099551336237, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.001933155601794655, |
|
"loss": 1.5474, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 1.6776123284999025, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001932895506860004, |
|
"loss": 1.544, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 1.6841147018661813, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0019326354119253528, |
|
"loss": 1.5443, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 1.6906170752324599, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0019323753169907018, |
|
"loss": 1.5468, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 1.6971194485987384, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.0019321152220560505, |
|
"loss": 1.5416, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 1.7036218219650172, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0019318551271213992, |
|
"loss": 1.5398, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 1.710124195331296, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019315950321867482, |
|
"loss": 1.5375, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 1.7166265686975746, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001931334937252097, |
|
"loss": 1.5381, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 1.7231289420638531, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0019310748423174461, |
|
"loss": 1.5443, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 1.7296313154301322, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0019308147473827949, |
|
"loss": 1.5485, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 1.7361336887964107, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0019305546524481436, |
|
"loss": 1.5439, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 1.7426360621626893, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0019302945575134926, |
|
"loss": 1.5427, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 1.749138435528968, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.0019300344625788413, |
|
"loss": 1.5403, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 1.7556408088952469, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00192977436764419, |
|
"loss": 1.5394, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 1.7621431822615254, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001929514272709539, |
|
"loss": 1.5378, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 1.768645555627804, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0019292541777748877, |
|
"loss": 1.5362, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001928994082840237, |
|
"loss": 1.5378, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 1.7816503023603616, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.0019287339879055856, |
|
"loss": 1.5368, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 1.7881526757266402, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0019284738929709344, |
|
"loss": 1.5322, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 1.794655049092919, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0019282137980362833, |
|
"loss": 1.5335, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 1.8011574224591977, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001927953703101632, |
|
"loss": 1.5356, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 1.8076597958254763, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.0019276936081669808, |
|
"loss": 1.5369, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 1.8141621691917549, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0019274335132323298, |
|
"loss": 1.5375, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 1.8206645425580337, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0019271734182976787, |
|
"loss": 1.5313, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 1.8271669159243125, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0019269133233630277, |
|
"loss": 1.5302, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 1.833669289290591, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0019266532284283764, |
|
"loss": 1.5262, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 1.8401716626568696, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0019263931334937252, |
|
"loss": 1.5323, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 1.8466740360231486, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0019261330385590741, |
|
"loss": 1.5322, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 1.8531764093894272, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0019258729436244229, |
|
"loss": 1.5286, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 1.8596787827557058, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019256128486897716, |
|
"loss": 1.5282, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 1.8661811561219845, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0019253527537551208, |
|
"loss": 1.5302, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 1.8726835294882633, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.0019250926588204695, |
|
"loss": 1.5259, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 1.879185902854542, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.0019248325638858185, |
|
"loss": 1.5295, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 1.8856882762208205, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0019245724689511672, |
|
"loss": 1.5289, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 1.8921906495870993, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001924312374016516, |
|
"loss": 1.5319, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 1.898693022953378, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.001924052279081865, |
|
"loss": 1.5349, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 1.9051953963196566, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 0.0019237921841472137, |
|
"loss": 1.5369, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 1.9116977696859354, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0019235320892125628, |
|
"loss": 1.5371, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 1.9182001430522142, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0019232719942779116, |
|
"loss": 1.5371, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 1.9247025164184928, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.0019230118993432603, |
|
"loss": 1.531, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 1.9312048897847713, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0019227518044086093, |
|
"loss": 1.5355, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 1.9377072631510501, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.001922491709473958, |
|
"loss": 1.5308, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 1.944209636517329, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0019222316145393068, |
|
"loss": 1.533, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 1.9507120098836075, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0019219715196046557, |
|
"loss": 1.5318, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 1.957214383249886, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0019217114246700044, |
|
"loss": 1.5307, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 1.9637167566161648, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0019214513297353536, |
|
"loss": 1.5297, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 1.9702191299824436, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0019211912348007024, |
|
"loss": 1.5264, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 1.9767215033487222, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001920931139866051, |
|
"loss": 1.5289, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 1.983223876715001, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0019206710449314, |
|
"loss": 1.5292, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 1.9897262500812798, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0019204109499967488, |
|
"loss": 1.5252, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 1.9962286234475584, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0019201508550620975, |
|
"loss": 1.5272, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.502743124961853, |
|
"eval_runtime": 7.2048, |
|
"eval_samples_per_second": 138.797, |
|
"eval_steps_per_second": 1.11, |
|
"step": 153790 |
|
}, |
|
{ |
|
"epoch": 2.002730996813837, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0019198907601274465, |
|
"loss": 1.523, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 2.009233370180116, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0019196306651927955, |
|
"loss": 1.5192, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 2.0157357435463945, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0019193705702581444, |
|
"loss": 1.5178, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 2.022238116912673, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0019191104753234932, |
|
"loss": 1.5219, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 2.0287404902789516, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001918850380388842, |
|
"loss": 1.5232, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 2.0352428636452307, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.0019185902854541908, |
|
"loss": 1.5235, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 2.0417452370115092, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0019183301905195396, |
|
"loss": 1.5259, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 2.048247610377788, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.0019180700955848883, |
|
"loss": 1.5214, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 2.0547499837440664, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0019178100006502375, |
|
"loss": 1.5198, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 2.0612523571103454, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0019175499057155862, |
|
"loss": 1.52, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 2.067754730476624, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0019172898107809352, |
|
"loss": 1.5229, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 2.0742571038429025, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001917029715846284, |
|
"loss": 1.5198, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 2.0807594772091815, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0019167696209116327, |
|
"loss": 1.525, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 2.08726185057546, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0019165095259769816, |
|
"loss": 1.5199, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 2.0937642239417387, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0019162494310423304, |
|
"loss": 1.5194, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 2.1002665973080172, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0019159893361076796, |
|
"loss": 1.5146, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 2.1067689706742962, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0019157292411730283, |
|
"loss": 1.5195, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 2.113271344040575, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001915469146238377, |
|
"loss": 1.5237, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 2.1197737174068534, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001915209051303726, |
|
"loss": 1.5214, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 2.1262760907731324, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 0.0019149489563690747, |
|
"loss": 1.517, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 2.132778464139411, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.0019146888614344235, |
|
"loss": 1.5163, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 2.1392808375056895, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0019144287664997724, |
|
"loss": 1.5162, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 2.145783210871968, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0019141686715651212, |
|
"loss": 1.5184, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 2.152285584238247, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0019139085766304703, |
|
"loss": 1.5204, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 2.1587879576045257, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.001913648481695819, |
|
"loss": 1.5225, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 2.1652903309708043, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0019133883867611678, |
|
"loss": 1.5205, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 2.171792704337083, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019131282918265168, |
|
"loss": 1.5217, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 2.178295077703362, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0019128681968918655, |
|
"loss": 1.5178, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 2.1847974510696404, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0019126081019572143, |
|
"loss": 1.5149, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 2.191299824435919, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0019123480070225632, |
|
"loss": 1.516, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 2.197802197802198, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0019120879120879122, |
|
"loss": 1.5138, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 2.2043045711684766, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0019118278171532611, |
|
"loss": 1.5141, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 2.210806944534755, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0019115677222186099, |
|
"loss": 1.5111, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 2.2173093179010337, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0019113076272839586, |
|
"loss": 1.5136, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 2.2238116912673127, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0019110475323493076, |
|
"loss": 1.5115, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 2.2303140646335913, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0019107874374146563, |
|
"loss": 1.5159, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 2.23681643799987, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001910527342480005, |
|
"loss": 1.514, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 2.243318811366149, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0019102672475453542, |
|
"loss": 1.5094, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 2.2498211847324274, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001910007152610703, |
|
"loss": 1.5128, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 2.256323558098706, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001909747057676052, |
|
"loss": 1.5091, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 2.2628259314649846, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.0019094869627414007, |
|
"loss": 1.5094, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 2.2693283048312636, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0019092268678067494, |
|
"loss": 1.5072, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 2.275830678197542, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0019089667728720984, |
|
"loss": 1.5126, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 2.2823330515638207, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001908706677937447, |
|
"loss": 1.5113, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 2.2888354249300997, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0019084465830027963, |
|
"loss": 1.5066, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 2.2953377982963783, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.001908186488068145, |
|
"loss": 1.5094, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 2.301840171662657, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0019079263931334938, |
|
"loss": 1.5059, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 2.3083425450289354, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0019076662981988427, |
|
"loss": 1.5127, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 2.3148449183952144, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0019074062032641914, |
|
"loss": 1.5114, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 2.321347291761493, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0019071461083295402, |
|
"loss": 1.5088, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 2.3278496651277716, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0019068860133948891, |
|
"loss": 1.509, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 2.33435203849405, |
|
"grad_norm": 3.625, |
|
"learning_rate": 0.0019066259184602379, |
|
"loss": 1.5016, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 2.340854411860329, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001906365823525587, |
|
"loss": 1.5006, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 2.3473567852266077, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0019061057285909358, |
|
"loss": 1.5052, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 2.3538591585928863, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0019058456336562845, |
|
"loss": 1.502, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 2.360361531959165, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0019055855387216335, |
|
"loss": 1.5067, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 2.366863905325444, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0019053254437869822, |
|
"loss": 1.5013, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 2.3733662786917225, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.001905065348852331, |
|
"loss": 1.5033, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 2.379868652058001, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00190480525391768, |
|
"loss": 1.5061, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 2.38637102542428, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 0.0019045451589830289, |
|
"loss": 1.508, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 2.3928733987905586, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0019042850640483778, |
|
"loss": 1.5048, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 2.399375772156837, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0019040249691137266, |
|
"loss": 1.5025, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 2.4058781455231157, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0019037648741790753, |
|
"loss": 1.4983, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 2.4123805188893948, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0019035047792444243, |
|
"loss": 1.5018, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 2.4188828922556733, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.001903244684309773, |
|
"loss": 1.5034, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 2.425385265621952, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0019029845893751218, |
|
"loss": 1.5026, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 2.431887638988231, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.001902724494440471, |
|
"loss": 1.5019, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 2.4383900123545095, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0019024643995058197, |
|
"loss": 1.5043, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 2.444892385720788, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0019022043045711686, |
|
"loss": 1.5081, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 2.4513947590870666, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0019019442096365174, |
|
"loss": 1.5045, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 2.4578971324533456, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0019016841147018661, |
|
"loss": 1.5091, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 2.464399505819624, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001901424019767215, |
|
"loss": 1.5121, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 2.4709018791859028, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0019011639248325638, |
|
"loss": 1.5124, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 2.4774042525521818, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001900903829897913, |
|
"loss": 1.5159, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 2.4839066259184603, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0019006437349632617, |
|
"loss": 1.5078, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 2.490408999284739, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0019003836400286105, |
|
"loss": 1.5112, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 2.4969113726510175, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0019001235450939594, |
|
"loss": 1.5106, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 2.503413746017296, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0018998634501593082, |
|
"loss": 1.5032, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 2.509916119383575, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.001899603355224657, |
|
"loss": 1.5047, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 2.5164184927498536, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0018993432602900059, |
|
"loss": 1.5036, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 2.5229208661161326, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0018990831653553546, |
|
"loss": 1.5028, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 2.529423239482411, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0018988230704207038, |
|
"loss": 1.5045, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 2.53592561284869, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0018985629754860525, |
|
"loss": 1.5016, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 2.5424279862149683, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0018983028805514013, |
|
"loss": 1.5057, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 2.548930359581247, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0018980427856167502, |
|
"loss": 1.5069, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 2.555432732947526, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001897782690682099, |
|
"loss": 1.5057, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 2.5619351063138045, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0018975225957474477, |
|
"loss": 1.5039, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 2.568437479680083, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0018972625008127967, |
|
"loss": 1.5041, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 2.574939853046362, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0018970024058781456, |
|
"loss": 1.5048, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 2.5814422264126407, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 0.0018967423109434946, |
|
"loss": 1.5011, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 2.587944599778919, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.0018964822160088433, |
|
"loss": 1.5014, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 2.594446973145198, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.001896222121074192, |
|
"loss": 1.4983, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 2.600949346511477, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001895962026139541, |
|
"loss": 1.5062, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 2.6074517198777554, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0018957019312048897, |
|
"loss": 1.5062, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 2.613954093244034, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0018954418362702385, |
|
"loss": 1.5011, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 2.620456466610313, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0018951817413355877, |
|
"loss": 1.5029, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 2.6269588399765915, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0018949216464009364, |
|
"loss": 1.5017, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 2.63346121334287, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0018946615514662854, |
|
"loss": 1.5012, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 2.6399635867091487, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001894401456531634, |
|
"loss": 1.5036, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 2.6464659600754277, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0018941413615969828, |
|
"loss": 1.5042, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 2.6529683334417062, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.0018938812666623318, |
|
"loss": 1.5042, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 2.659470706807985, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0018936211717276805, |
|
"loss": 1.5079, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 2.665973080174264, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0018933610767930297, |
|
"loss": 1.5093, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 2.6724754535405424, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0018931009818583784, |
|
"loss": 1.5066, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 2.678977826906821, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0018928408869237272, |
|
"loss": 1.5034, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 2.6854802002730995, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.0018925807919890761, |
|
"loss": 1.5045, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 2.691982573639378, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0018923206970544249, |
|
"loss": 1.5112, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 2.698484947005657, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0018920606021197736, |
|
"loss": 1.5055, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 2.7049873203719357, |
|
"grad_norm": 2.5, |
|
"learning_rate": 0.0018918005071851226, |
|
"loss": 1.5023, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 2.7114896937382147, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0018915404122504713, |
|
"loss": 1.5052, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 2.7179920671044933, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.0018912803173158205, |
|
"loss": 1.4958, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 2.724494440470772, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0018910202223811692, |
|
"loss": 1.5016, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 2.7309968138370504, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001890760127446518, |
|
"loss": 1.4964, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 2.737499187203329, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001890500032511867, |
|
"loss": 1.5011, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 2.744001560569608, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0018902399375772157, |
|
"loss": 1.5008, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 2.7505039339358865, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0018899798426425644, |
|
"loss": 1.4991, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 2.7570063073021656, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0018897197477079134, |
|
"loss": 1.4968, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 2.763508680668444, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.0018894596527732623, |
|
"loss": 1.4974, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 2.7700110540347227, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0018891995578386113, |
|
"loss": 1.4991, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 2.7765134274010013, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00188893946290396, |
|
"loss": 1.4964, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 2.78301580076728, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.0018886793679693088, |
|
"loss": 1.4988, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 2.789518174133559, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0018884192730346577, |
|
"loss": 1.4971, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 2.7960205474998374, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0018881591781000065, |
|
"loss": 1.4988, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 2.802522920866116, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0018878990831653552, |
|
"loss": 1.5002, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 2.809025294232395, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0018876389882307044, |
|
"loss": 1.4966, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 2.8155276675986736, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0018873788932960531, |
|
"loss": 1.4988, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 2.822030040964952, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 0.001887118798361402, |
|
"loss": 1.5013, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 2.8285324143312307, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0018868587034267508, |
|
"loss": 1.4979, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 2.8350347876975097, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0018865986084920996, |
|
"loss": 1.4991, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 2.8415371610637883, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0018863385135574485, |
|
"loss": 1.4934, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 2.848039534430067, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018860784186227973, |
|
"loss": 1.4923, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 2.854541907796346, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0018858183236881464, |
|
"loss": 1.4957, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 2.8610442811626244, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0018855582287534952, |
|
"loss": 1.4958, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 2.867546654528903, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001885298133818844, |
|
"loss": 1.4962, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 2.8740490278951816, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0018850380388841929, |
|
"loss": 1.4942, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 2.8805514012614606, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0018847779439495416, |
|
"loss": 1.4962, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 2.887053774627739, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0018845178490148903, |
|
"loss": 1.4908, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 2.8935561479940177, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.0018842577540802393, |
|
"loss": 1.4947, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 2.9000585213602967, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001883997659145588, |
|
"loss": 1.497, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 2.9065608947265753, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0018837375642109372, |
|
"loss": 1.4954, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 2.913063268092854, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.001883477469276286, |
|
"loss": 1.4954, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 2.9195656414591324, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0018832173743416347, |
|
"loss": 1.4906, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 2.926068014825411, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 0.0018829572794069837, |
|
"loss": 1.4947, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 2.93257038819169, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0018826971844723324, |
|
"loss": 1.5016, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 2.9390727615579686, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0018824370895376811, |
|
"loss": 1.4978, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 2.9455751349242476, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.00188217699460303, |
|
"loss": 1.495, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 2.952077508290526, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.001881916899668379, |
|
"loss": 1.4984, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 2.9585798816568047, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001881656804733728, |
|
"loss": 1.4918, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 2.9650822550230833, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.0018813967097990767, |
|
"loss": 1.4907, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 2.971584628389362, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0018811366148644255, |
|
"loss": 1.4954, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 2.978087001755641, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018808765199297744, |
|
"loss": 1.4943, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 2.9845893751219195, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0018806164249951232, |
|
"loss": 1.493, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 2.991091748488198, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.001880356330060472, |
|
"loss": 1.4926, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 2.997594121854477, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001880096235125821, |
|
"loss": 1.4914, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.4690916538238525, |
|
"eval_runtime": 0.9958, |
|
"eval_samples_per_second": 1004.258, |
|
"eval_steps_per_second": 8.034, |
|
"step": 230685 |
|
}, |
|
{ |
|
"epoch": 3.0040964952207556, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0018798361401911698, |
|
"loss": 1.4899, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 3.010598868587034, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0018795760452565188, |
|
"loss": 1.4889, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 3.0171012419533128, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0018793159503218675, |
|
"loss": 1.4908, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 3.0236036153195918, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.0018790558553872163, |
|
"loss": 1.4906, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 3.0301059886858703, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0018787957604525652, |
|
"loss": 1.4915, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 3.036608362052149, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001878535665517914, |
|
"loss": 1.4917, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 3.043110735418428, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0018782755705832631, |
|
"loss": 1.4915, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 3.0496131087847065, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0018780154756486119, |
|
"loss": 1.4875, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 3.056115482150985, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0018777553807139606, |
|
"loss": 1.4922, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 3.0626178555172636, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0018774952857793096, |
|
"loss": 1.4917, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 3.0691202288835426, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0018772351908446583, |
|
"loss": 1.4925, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 3.075622602249821, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001876975095910007, |
|
"loss": 1.4906, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 3.0821249756160998, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001876715000975356, |
|
"loss": 1.4913, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 3.088627348982379, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0018764549060407048, |
|
"loss": 1.4908, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 3.0951297223486574, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.001876194811106054, |
|
"loss": 1.4913, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 3.101632095714936, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.0018759347161714027, |
|
"loss": 1.4925, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 3.1081344690812145, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0018756746212367514, |
|
"loss": 1.4905, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 3.1146368424474935, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0018754145263021004, |
|
"loss": 1.4899, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 3.121139215813772, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0018751544313674491, |
|
"loss": 1.4862, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 3.1276415891800506, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.0018748943364327978, |
|
"loss": 1.4903, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 3.134143962546329, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0018746342414981468, |
|
"loss": 1.4926, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 3.1406463359126082, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0018743741465634958, |
|
"loss": 1.4893, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 3.147148709278887, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0018741140516288447, |
|
"loss": 1.4858, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 3.1536510826451654, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0018738539566941935, |
|
"loss": 1.4875, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 3.1601534560114444, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0018735938617595422, |
|
"loss": 1.4881, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 3.166655829377723, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0018733337668248912, |
|
"loss": 1.4878, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 3.1731582027440015, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00187307367189024, |
|
"loss": 1.4921, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 3.17966057611028, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0018728135769555886, |
|
"loss": 1.4891, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 3.186162949476559, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0018725534820209378, |
|
"loss": 1.4898, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 3.1926653228428377, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.0018722933870862866, |
|
"loss": 1.4902, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 3.1991676962091162, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.0018720332921516355, |
|
"loss": 1.4911, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 3.205670069575395, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.0018717731972169842, |
|
"loss": 1.4868, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 3.212172442941674, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001871513102282333, |
|
"loss": 1.4901, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 3.2186748163079524, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001871253007347682, |
|
"loss": 1.4866, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 3.225177189674231, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0018709929124130307, |
|
"loss": 1.489, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 3.23167956304051, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0018707328174783799, |
|
"loss": 1.4874, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 3.2381819364067885, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.0018704727225437286, |
|
"loss": 1.4909, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 3.244684309773067, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0018702126276090773, |
|
"loss": 1.487, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 3.2511866831393457, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0018699525326744263, |
|
"loss": 1.4912, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 3.2576890565056247, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.001869692437739775, |
|
"loss": 1.4872, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 3.2641914298719032, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0018694323428051238, |
|
"loss": 1.4817, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 3.270693803238182, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0018691722478704727, |
|
"loss": 1.4821, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 3.277196176604461, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0018689121529358215, |
|
"loss": 1.4884, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 3.2836985499707394, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.0018686520580011706, |
|
"loss": 1.4884, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 3.290200923337018, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0018683919630665194, |
|
"loss": 1.4864, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 3.2967032967032965, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0018681318681318681, |
|
"loss": 1.4894, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 3.3032056700695756, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001867871773197217, |
|
"loss": 1.482, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 3.309708043435854, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0018676116782625658, |
|
"loss": 1.4864, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 3.3162104168021327, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0018673515833279146, |
|
"loss": 1.488, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 3.3227127901684117, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.0018670914883932635, |
|
"loss": 1.4858, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 3.3292151635346903, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0018668313934586125, |
|
"loss": 1.4894, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 3.335717536900969, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0018665712985239614, |
|
"loss": 1.4929, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 3.3422199102672474, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0018663112035893102, |
|
"loss": 1.4937, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 3.3487222836335264, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001866051108654659, |
|
"loss": 1.4913, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 3.355224656999805, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.0018657910137200079, |
|
"loss": 1.4903, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 3.3617270303660836, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0018655309187853566, |
|
"loss": 1.4862, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 3.368229403732362, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.0018652708238507054, |
|
"loss": 1.4836, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 3.374731777098641, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0018650107289160545, |
|
"loss": 1.4841, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 3.3812341504649197, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0018647506339814033, |
|
"loss": 1.4857, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 3.3877365238311983, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0018644905390467522, |
|
"loss": 1.4833, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 3.394238897197477, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001864230444112101, |
|
"loss": 1.487, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 3.400741270563756, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0018639703491774497, |
|
"loss": 1.4886, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 3.4072436439300344, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.0018637102542427987, |
|
"loss": 1.4843, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 3.413746017296313, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0018634501593081474, |
|
"loss": 1.4825, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 3.420248390662592, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0018631900643734966, |
|
"loss": 1.4808, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 3.4267507640288706, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0018629299694388453, |
|
"loss": 1.4814, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 3.433253137395149, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 0.001862669874504194, |
|
"loss": 1.4807, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 3.4397555107614277, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.001862409779569543, |
|
"loss": 1.4817, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 3.4462578841277067, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0018621496846348918, |
|
"loss": 1.4824, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 3.4527602574939853, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.0018618895897002405, |
|
"loss": 1.4813, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 3.459262630860264, |
|
"grad_norm": 25.0, |
|
"learning_rate": 0.0018616294947655895, |
|
"loss": 1.4827, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 3.465765004226543, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0018613693998309382, |
|
"loss": 1.4771, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 3.4722673775928214, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0018611093048962874, |
|
"loss": 1.4795, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 3.4787697509591, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001860849209961636, |
|
"loss": 1.4776, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 3.4852721243253786, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0018605891150269848, |
|
"loss": 1.4828, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 3.4917744976916576, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.0018603290200923338, |
|
"loss": 1.4827, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 3.498276871057936, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0018600689251576825, |
|
"loss": 1.4794, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 3.5047792444242147, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 0.0018598088302230313, |
|
"loss": 1.4791, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 3.5112816177904937, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018595487352883802, |
|
"loss": 1.4805, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 3.5177839911567723, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018592886403537292, |
|
"loss": 1.4781, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 3.524286364523051, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018590285454190782, |
|
"loss": 1.4817, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 3.5307887378893295, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.001858768450484427, |
|
"loss": 1.4873, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 3.537291111255608, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0018585083555497756, |
|
"loss": 1.4794, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 3.543793484621887, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0018582482606151246, |
|
"loss": 1.4802, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 3.5502958579881656, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0018579881656804733, |
|
"loss": 1.4799, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 3.5567982313544446, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.001857728070745822, |
|
"loss": 1.4772, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 3.563300604720723, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0018574679758111712, |
|
"loss": 1.4766, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 3.5698029780870018, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00185720788087652, |
|
"loss": 1.4788, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 3.5763053514532803, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001856947785941869, |
|
"loss": 1.4778, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 3.582807724819559, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0018566876910072177, |
|
"loss": 1.4819, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 3.589310098185838, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0018564275960725664, |
|
"loss": 1.4799, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 3.5958124715521165, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0018561675011379154, |
|
"loss": 1.4789, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 3.6023148449183955, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0018559074062032641, |
|
"loss": 1.4805, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 3.608817218284674, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0018556473112686133, |
|
"loss": 1.4783, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 3.6153195916509526, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001855387216333962, |
|
"loss": 1.4763, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 3.621821965017231, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0018551271213993108, |
|
"loss": 1.4771, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 3.6283243383835098, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.0018548670264646597, |
|
"loss": 1.4786, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 3.6348267117497888, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 0.0018546069315300085, |
|
"loss": 1.4806, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 3.6413290851160673, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0018543468365953572, |
|
"loss": 1.4846, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 3.647831458482346, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0018540867416607062, |
|
"loss": 1.487, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 3.654333831848625, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001853826646726055, |
|
"loss": 1.4828, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 3.6608362052149035, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001853566551791404, |
|
"loss": 1.4859, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 3.667338578581182, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0018533064568567528, |
|
"loss": 1.4862, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 3.6738409519474606, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0018530463619221016, |
|
"loss": 1.4859, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 3.6803433253137396, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 0.0018527862669874505, |
|
"loss": 1.4826, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 3.686845698680018, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0018525261720527993, |
|
"loss": 1.4795, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 3.693348072046297, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001852266077118148, |
|
"loss": 1.4778, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 3.699850445412576, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001852005982183497, |
|
"loss": 1.4837, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 3.7063528187788544, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001851745887248846, |
|
"loss": 1.4808, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 3.712855192145133, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.0018514857923141949, |
|
"loss": 1.4796, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 3.7193575655114115, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0018512256973795436, |
|
"loss": 1.477, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 3.7258599388776905, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0018509656024448924, |
|
"loss": 1.475, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 3.732362312243969, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0018507055075102413, |
|
"loss": 1.4817, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 3.7388646856102477, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00185044541257559, |
|
"loss": 1.4804, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 3.7453670589765267, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0018501853176409388, |
|
"loss": 1.4767, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 3.7518694323428052, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001849925222706288, |
|
"loss": 1.4779, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 3.758371805709084, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.0018496651277716367, |
|
"loss": 1.4767, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 3.7648741790753624, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.0018494050328369857, |
|
"loss": 1.4749, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 3.771376552441641, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0018491449379023344, |
|
"loss": 1.4803, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 3.77787892580792, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0018488848429676831, |
|
"loss": 1.483, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 3.7843812991741985, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001848624748033032, |
|
"loss": 1.4785, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 3.7908836725404775, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0018483646530983808, |
|
"loss": 1.4722, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 3.797386045906756, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00184810455816373, |
|
"loss": 1.4768, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 3.8038884192730347, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.0018478444632290788, |
|
"loss": 1.4764, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 3.8103907926393132, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0018475843682944275, |
|
"loss": 1.4804, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 3.816893166005592, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0018473242733597765, |
|
"loss": 1.4773, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 3.823395539371871, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0018470641784251252, |
|
"loss": 1.4777, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 3.8298979127381494, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001846804083490474, |
|
"loss": 1.4838, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 3.836400286104428, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0018465439885558229, |
|
"loss": 1.4781, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 3.842902659470707, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0018462838936211716, |
|
"loss": 1.4859, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 3.8494050328369855, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.0018460237986865208, |
|
"loss": 1.4856, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 3.855907406203264, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0018457637037518695, |
|
"loss": 1.4853, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 3.8624097795695427, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0018455036088172183, |
|
"loss": 1.4766, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 3.8689121529358217, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.0018452435138825672, |
|
"loss": 1.4831, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 3.8754145263021003, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001844983418947916, |
|
"loss": 1.4798, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 3.881916899668379, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0018447233240132647, |
|
"loss": 1.4808, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 3.888419273034658, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018444632290786137, |
|
"loss": 1.4808, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 3.8949216464009364, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.0018442031341439626, |
|
"loss": 1.4744, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 3.901424019767215, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0018439430392093116, |
|
"loss": 1.4792, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 3.9079263931334935, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0018436829442746603, |
|
"loss": 1.4776, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 3.9144287664997726, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001843422849340009, |
|
"loss": 1.4733, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 3.920931139866051, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001843162754405358, |
|
"loss": 1.4766, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 3.9274335132323297, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018429026594707068, |
|
"loss": 1.4743, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 3.9339358865986087, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0018426425645360555, |
|
"loss": 1.4735, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 3.9404382599648873, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0018423824696014047, |
|
"loss": 1.4729, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 3.946940633331166, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0018421223746667534, |
|
"loss": 1.4776, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 3.9534430066974444, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0018418622797321024, |
|
"loss": 1.4805, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 3.959945380063723, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0018416021847974511, |
|
"loss": 1.4744, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 3.966447753430002, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0018413420898627999, |
|
"loss": 1.4781, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 3.9729501267962806, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018410819949281488, |
|
"loss": 1.4763, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 3.9794525001625596, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0018408218999934976, |
|
"loss": 1.4752, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 3.985954873528838, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.0018405618050588467, |
|
"loss": 1.4749, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 3.9924572468951167, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0018403017101241955, |
|
"loss": 1.4747, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 3.9989596202613953, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0018400416151895442, |
|
"loss": 1.4753, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.4534417390823364, |
|
"eval_runtime": 0.9363, |
|
"eval_samples_per_second": 1068.081, |
|
"eval_steps_per_second": 8.545, |
|
"step": 307580 |
|
}, |
|
{ |
|
"epoch": 4.005461993627674, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0018397815202548932, |
|
"loss": 1.4734, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 4.011964366993952, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001839521425320242, |
|
"loss": 1.4706, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 4.018466740360232, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.0018392613303855907, |
|
"loss": 1.4735, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 4.0249691137265105, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0018390012354509396, |
|
"loss": 1.4777, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 4.031471487092789, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 0.0018387411405162883, |
|
"loss": 1.4732, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 4.037973860459068, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0018384810455816375, |
|
"loss": 1.4726, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 4.044476233825346, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0018382209506469863, |
|
"loss": 1.4752, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 4.050978607191625, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.001837960855712335, |
|
"loss": 1.478, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 4.057480980557903, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001837700760777684, |
|
"loss": 1.4736, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 4.063983353924183, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0018374406658430327, |
|
"loss": 1.4773, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 4.070485727290461, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0018371805709083814, |
|
"loss": 1.4747, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 4.07698810065674, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.0018369204759737304, |
|
"loss": 1.4762, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 4.0834904740230185, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0018366603810390794, |
|
"loss": 1.4793, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 4.089992847389297, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 0.0018364002861044283, |
|
"loss": 1.4763, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 4.096495220755576, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001836140191169777, |
|
"loss": 1.4728, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 4.102997594121854, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.0018358800962351258, |
|
"loss": 1.4715, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 4.109499967488133, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0018356200013004747, |
|
"loss": 1.4714, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 4.116002340854412, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0018353599063658235, |
|
"loss": 1.4756, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 4.122504714220691, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0018350998114311722, |
|
"loss": 1.4792, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 4.129007087586969, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0018348397164965214, |
|
"loss": 1.4831, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 4.135509460953248, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0018345796215618701, |
|
"loss": 1.4805, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 4.1420118343195265, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001834319526627219, |
|
"loss": 1.4802, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 4.148514207685805, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.0018340594316925678, |
|
"loss": 1.4795, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 4.155016581052084, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.0018337993367579166, |
|
"loss": 1.481, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 4.161518954418363, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0018335392418232655, |
|
"loss": 1.4766, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 4.168021327784642, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0018332791468886143, |
|
"loss": 1.4774, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 4.17452370115092, |
|
"grad_norm": 52.5, |
|
"learning_rate": 0.001833019051953963, |
|
"loss": 1.4821, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 4.181026074517199, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.0018327589570193122, |
|
"loss": 1.4914, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 4.187528447883477, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001832498862084661, |
|
"loss": 1.4781, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 4.194030821249756, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 0.0018322387671500099, |
|
"loss": 1.4784, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 4.2005331946160345, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0018319786722153586, |
|
"loss": 1.4806, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 4.207035567982314, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0018317185772807074, |
|
"loss": 1.4829, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 4.2135379413485925, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.0018314584823460563, |
|
"loss": 1.4921, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 4.220040314714871, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.001831198387411405, |
|
"loss": 1.4937, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 4.22654268808115, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0018309382924767542, |
|
"loss": 1.495, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 4.233045061447428, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001830678197542103, |
|
"loss": 1.4985, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 4.239547434813707, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0018304181026074517, |
|
"loss": 1.5151, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 4.246049808179985, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0018301580076728007, |
|
"loss": 1.515, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 4.252552181546265, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0018298979127381494, |
|
"loss": 1.5118, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 4.259054554912543, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0018296378178034982, |
|
"loss": 1.5253, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 4.265556928278822, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0018293777228688471, |
|
"loss": 1.5161, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 4.2720593016451005, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001829117627934196, |
|
"loss": 1.4984, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 4.278561675011379, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001828857532999545, |
|
"loss": 1.4888, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 4.285064048377658, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 0.0018285974380648938, |
|
"loss": 1.4896, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 4.291566421743936, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.0018283373431302425, |
|
"loss": 1.4805, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 4.298068795110215, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0018280772481955915, |
|
"loss": 1.4812, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 4.304571168476494, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0018278171532609402, |
|
"loss": 1.4903, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 4.311073541842773, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001827557058326289, |
|
"loss": 1.4923, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 4.317575915209051, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0018272969633916381, |
|
"loss": 1.4971, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 4.32407828857533, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0018270368684569869, |
|
"loss": 1.4985, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 4.3305806619416085, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.0018267767735223358, |
|
"loss": 1.4864, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 4.337083035307887, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0018265166785876846, |
|
"loss": 1.4898, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 4.343585408674166, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.0018262565836530333, |
|
"loss": 1.4832, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 4.350087782040445, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0018259964887183823, |
|
"loss": 1.4832, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 4.356590155406724, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.001825736393783731, |
|
"loss": 1.4868, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 4.363092528773002, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.0018254762988490797, |
|
"loss": 1.4895, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 4.369594902139281, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.001825216203914429, |
|
"loss": 1.4896, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 4.376097275505559, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 0.0018249561089797777, |
|
"loss": 1.4895, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 4.382599648871838, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0018246960140451266, |
|
"loss": 1.4857, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 4.3891020222381165, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.0018244359191104753, |
|
"loss": 1.4922, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 4.395604395604396, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001824175824175824, |
|
"loss": 1.492, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 4.4021067689706745, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001823915729241173, |
|
"loss": 1.4948, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 4.408609142336953, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.0018236556343065218, |
|
"loss": 1.4994, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 4.415111515703232, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001823395539371871, |
|
"loss": 1.4972, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 4.42161388906951, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0018231354444372197, |
|
"loss": 1.4951, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 4.428116262435789, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0018228753495025684, |
|
"loss": 1.4894, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 4.434618635802067, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0018226152545679174, |
|
"loss": 1.4951, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 4.441121009168347, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0018223551596332661, |
|
"loss": 1.4847, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 4.447623382534625, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.0018220950646986149, |
|
"loss": 1.4824, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 4.454125755900904, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0018218349697639638, |
|
"loss": 1.4808, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 4.4606281292671826, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0018215748748293128, |
|
"loss": 1.4828, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 4.467130502633461, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0018213147798946617, |
|
"loss": 1.4803, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 4.47363287599974, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0018210546849600105, |
|
"loss": 1.4776, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 4.480135249366018, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0018207945900253592, |
|
"loss": 1.4754, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 4.486637622732298, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0018205344950907082, |
|
"loss": 1.4711, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 4.493139996098576, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001820274400156057, |
|
"loss": 1.4715, |
|
"step": 345500 |
|
}, |
|
{ |
|
"epoch": 4.499642369464855, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0018200143052214057, |
|
"loss": 1.4745, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 4.506144742831133, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0018197542102867548, |
|
"loss": 1.4725, |
|
"step": 346500 |
|
}, |
|
{ |
|
"epoch": 4.512647116197412, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0018194941153521036, |
|
"loss": 1.4725, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 4.519149489563691, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0018192340204174525, |
|
"loss": 1.4669, |
|
"step": 347500 |
|
}, |
|
{ |
|
"epoch": 4.525651862929969, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0018189739254828013, |
|
"loss": 1.4681, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 4.532154236296249, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00181871383054815, |
|
"loss": 1.4667, |
|
"step": 348500 |
|
}, |
|
{ |
|
"epoch": 4.538656609662527, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001818453735613499, |
|
"loss": 1.468, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 4.545158983028806, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0018181936406788477, |
|
"loss": 1.4745, |
|
"step": 349500 |
|
}, |
|
{ |
|
"epoch": 4.551661356395084, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0018179335457441965, |
|
"loss": 1.4762, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 4.558163729761363, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0018176734508095456, |
|
"loss": 1.4789, |
|
"step": 350500 |
|
}, |
|
{ |
|
"epoch": 4.564666103127641, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0018174133558748944, |
|
"loss": 1.475, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 4.57116847649392, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0018171532609402433, |
|
"loss": 1.4692, |
|
"step": 351500 |
|
}, |
|
{ |
|
"epoch": 4.5776708498601995, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.001816893166005592, |
|
"loss": 1.475, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 4.584173223226478, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0018166330710709408, |
|
"loss": 1.477, |
|
"step": 352500 |
|
}, |
|
{ |
|
"epoch": 4.590675596592757, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.0018163729761362898, |
|
"loss": 1.4727, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 4.597177969959035, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 0.0018161128812016385, |
|
"loss": 1.4759, |
|
"step": 353500 |
|
}, |
|
{ |
|
"epoch": 4.603680343325314, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0018158527862669877, |
|
"loss": 1.4758, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 4.610182716691592, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0018155926913323364, |
|
"loss": 1.4732, |
|
"step": 354500 |
|
}, |
|
{ |
|
"epoch": 4.616685090057871, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0018153325963976852, |
|
"loss": 1.4689, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 4.623187463424149, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0018150725014630341, |
|
"loss": 1.4703, |
|
"step": 355500 |
|
}, |
|
{ |
|
"epoch": 4.629689836790429, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0018148124065283829, |
|
"loss": 1.4674, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 4.6361922101567075, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.0018145523115937316, |
|
"loss": 1.4734, |
|
"step": 356500 |
|
}, |
|
{ |
|
"epoch": 4.642694583522986, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0018142922166590806, |
|
"loss": 1.4702, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 4.649196956889265, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0018140321217244295, |
|
"loss": 1.4727, |
|
"step": 357500 |
|
}, |
|
{ |
|
"epoch": 4.655699330255543, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0018137720267897785, |
|
"loss": 1.4695, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 4.662201703621822, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0018135119318551272, |
|
"loss": 1.4736, |
|
"step": 358500 |
|
}, |
|
{ |
|
"epoch": 4.6687040769881, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001813251836920476, |
|
"loss": 1.4751, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 4.675206450354379, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001812991741985825, |
|
"loss": 1.4731, |
|
"step": 359500 |
|
}, |
|
{ |
|
"epoch": 4.681708823720658, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0018127316470511736, |
|
"loss": 1.4729, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 4.688211197086937, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0018124715521165224, |
|
"loss": 1.4717, |
|
"step": 360500 |
|
}, |
|
{ |
|
"epoch": 4.6947135704532155, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0018122114571818716, |
|
"loss": 1.4695, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 4.701215943819494, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0018119513622472203, |
|
"loss": 1.4721, |
|
"step": 361500 |
|
}, |
|
{ |
|
"epoch": 4.707718317185773, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0018116912673125693, |
|
"loss": 1.4716, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 4.714220690552051, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001811431172377918, |
|
"loss": 1.466, |
|
"step": 362500 |
|
}, |
|
{ |
|
"epoch": 4.72072306391833, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 0.0018111710774432667, |
|
"loss": 1.4712, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 4.727225437284609, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0018109109825086157, |
|
"loss": 1.4683, |
|
"step": 363500 |
|
}, |
|
{ |
|
"epoch": 4.733727810650888, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.0018106508875739644, |
|
"loss": 1.4724, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 4.740230184017166, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0018103907926393132, |
|
"loss": 1.4704, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 4.746732557383445, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0018101306977046623, |
|
"loss": 1.4699, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 4.7532349307497235, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001809870602770011, |
|
"loss": 1.4741, |
|
"step": 365500 |
|
}, |
|
{ |
|
"epoch": 4.759737304116002, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00180961050783536, |
|
"loss": 1.4728, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 4.766239677482281, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0018093504129007088, |
|
"loss": 1.4674, |
|
"step": 366500 |
|
}, |
|
{ |
|
"epoch": 4.77274205084856, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.0018090903179660575, |
|
"loss": 1.4757, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 4.779244424214839, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0018088302230314065, |
|
"loss": 1.4755, |
|
"step": 367500 |
|
}, |
|
{ |
|
"epoch": 4.785746797581117, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0018085701280967552, |
|
"loss": 1.4721, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 4.792249170947396, |
|
"grad_norm": 2.375, |
|
"learning_rate": 0.0018083100331621044, |
|
"loss": 1.473, |
|
"step": 368500 |
|
}, |
|
{ |
|
"epoch": 4.798751544313674, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0018080499382274531, |
|
"loss": 1.472, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 4.805253917679953, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0018077898432928019, |
|
"loss": 1.4735, |
|
"step": 369500 |
|
}, |
|
{ |
|
"epoch": 4.8117562910462315, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0018075297483581508, |
|
"loss": 1.4696, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 4.818258664412511, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0018072696534234996, |
|
"loss": 1.4682, |
|
"step": 370500 |
|
}, |
|
{ |
|
"epoch": 4.8247610377787895, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018070095584888483, |
|
"loss": 1.4683, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 4.831263411145068, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0018067494635541973, |
|
"loss": 1.4678, |
|
"step": 371500 |
|
}, |
|
{ |
|
"epoch": 4.837765784511347, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.0018064893686195462, |
|
"loss": 1.4719, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 4.844268157877625, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0018062292736848952, |
|
"loss": 1.4704, |
|
"step": 372500 |
|
}, |
|
{ |
|
"epoch": 4.850770531243904, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.001805969178750244, |
|
"loss": 1.4744, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 4.857272904610182, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.0018057090838155927, |
|
"loss": 1.4697, |
|
"step": 373500 |
|
}, |
|
{ |
|
"epoch": 4.863775277976462, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0018054489888809416, |
|
"loss": 1.4715, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 4.87027765134274, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0018051888939462904, |
|
"loss": 1.4689, |
|
"step": 374500 |
|
}, |
|
{ |
|
"epoch": 4.876780024709019, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.001804928799011639, |
|
"loss": 1.4676, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 4.8832823980752975, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0018046687040769883, |
|
"loss": 1.4726, |
|
"step": 375500 |
|
}, |
|
{ |
|
"epoch": 4.889784771441576, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.001804408609142337, |
|
"loss": 1.4689, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 4.896287144807855, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001804148514207686, |
|
"loss": 1.4697, |
|
"step": 376500 |
|
}, |
|
{ |
|
"epoch": 4.902789518174133, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0018038884192730347, |
|
"loss": 1.4706, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 4.909291891540413, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0018036283243383835, |
|
"loss": 1.4714, |
|
"step": 377500 |
|
}, |
|
{ |
|
"epoch": 4.915794264906691, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0018033682294037324, |
|
"loss": 1.466, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 4.92229663827297, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.0018031081344690812, |
|
"loss": 1.4719, |
|
"step": 378500 |
|
}, |
|
{ |
|
"epoch": 4.928799011639248, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0018028480395344299, |
|
"loss": 1.4701, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 4.935301385005527, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 0.001802587944599779, |
|
"loss": 1.473, |
|
"step": 379500 |
|
}, |
|
{ |
|
"epoch": 4.9418037583718055, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0018023278496651278, |
|
"loss": 1.4721, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 4.948306131738084, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0018020677547304768, |
|
"loss": 1.474, |
|
"step": 380500 |
|
}, |
|
{ |
|
"epoch": 4.9548085051043635, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0018018076597958255, |
|
"loss": 1.473, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 4.961310878470642, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0018015475648611742, |
|
"loss": 1.4764, |
|
"step": 381500 |
|
}, |
|
{ |
|
"epoch": 4.967813251836921, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 0.0018012874699265232, |
|
"loss": 1.4717, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 4.974315625203199, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001801027374991872, |
|
"loss": 1.4759, |
|
"step": 382500 |
|
}, |
|
{ |
|
"epoch": 4.980817998569478, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.0018007672800572211, |
|
"loss": 1.4792, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 4.987320371935756, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.0018005071851225699, |
|
"loss": 1.4764, |
|
"step": 383500 |
|
}, |
|
{ |
|
"epoch": 4.993822745302035, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0018002470901879186, |
|
"loss": 1.4758, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.4539484977722168, |
|
"eval_runtime": 0.9039, |
|
"eval_samples_per_second": 1106.3, |
|
"eval_steps_per_second": 8.85, |
|
"step": 384475 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 3844750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 3.3089171938476826e+18, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|