diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1649 +1,3432 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.988913525498892, + "epoch": 10.0, "eval_steps": 500, - "global_step": 1125, + "global_step": 2370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.004434589800443459, - "grad_norm": 1.9132194519042969, - "learning_rate": 1.7699115044247788e-06, - "loss": 2.8127, + "epoch": 0.004219409282700422, + "grad_norm": 3.1438205242156982, + "learning_rate": 8.438818565400843e-07, + "loss": 2.5933, "step": 1 }, { - "epoch": 0.022172949002217297, - "grad_norm": 2.0755808353424072, - "learning_rate": 8.849557522123894e-06, - "loss": 2.8248, + "epoch": 0.02109704641350211, + "grad_norm": 3.45337176322937, + "learning_rate": 4.219409282700422e-06, + "loss": 2.5683, "step": 5 }, { - "epoch": 0.04434589800443459, - "grad_norm": 1.6995130777359009, - "learning_rate": 1.7699115044247787e-05, - "loss": 2.7381, + "epoch": 0.04219409282700422, + "grad_norm": 3.8354620933532715, + "learning_rate": 8.438818565400844e-06, + "loss": 2.567, "step": 10 }, { - "epoch": 0.06651884700665188, - "grad_norm": 1.6350675821304321, - "learning_rate": 2.6548672566371686e-05, - "loss": 2.6572, + "epoch": 0.06329113924050633, + "grad_norm": 3.2700271606445312, + "learning_rate": 1.2658227848101267e-05, + "loss": 2.4327, "step": 15 }, { - "epoch": 0.08869179600886919, - "grad_norm": 1.2029041051864624, - "learning_rate": 3.5398230088495574e-05, - "loss": 2.4881, + "epoch": 0.08438818565400844, + "grad_norm": 3.2498395442962646, + "learning_rate": 1.6877637130801688e-05, + "loss": 2.2197, "step": 20 }, { - "epoch": 0.11086474501108648, - "grad_norm": 0.7936518788337708, - "learning_rate": 4.4247787610619477e-05, - "loss": 2.3403, + "epoch": 0.10548523206751055, + "grad_norm": 2.3556034564971924, + "learning_rate": 2.1097046413502112e-05, + "loss": 1.8787, "step": 25 }, { - "epoch": 0.13303769401330376, - "grad_norm": 0.8098437190055847, - "learning_rate": 5.309734513274337e-05, - "loss": 2.2186, + "epoch": 0.12658227848101267, + "grad_norm": 1.5979266166687012, + "learning_rate": 2.5316455696202533e-05, + "loss": 1.5467, "step": 30 }, { - "epoch": 0.15521064301552107, - "grad_norm": 0.6613564491271973, - "learning_rate": 6.194690265486725e-05, - "loss": 2.1041, + "epoch": 0.14767932489451477, + "grad_norm": 1.260302186012268, + "learning_rate": 2.9535864978902954e-05, + "loss": 1.4303, "step": 35 }, { - "epoch": 0.17738359201773837, - "grad_norm": 0.6659165620803833, - "learning_rate": 7.079646017699115e-05, - "loss": 1.9928, + "epoch": 0.16877637130801687, + "grad_norm": 0.5591890811920166, + "learning_rate": 3.3755274261603375e-05, + "loss": 1.3705, "step": 40 }, { - "epoch": 0.19955654101995565, - "grad_norm": 0.559091329574585, - "learning_rate": 7.964601769911504e-05, - "loss": 1.9216, + "epoch": 0.189873417721519, + "grad_norm": 0.5186863541603088, + "learning_rate": 3.79746835443038e-05, + "loss": 1.328, "step": 45 }, { - "epoch": 0.22172949002217296, - "grad_norm": 0.4300543963909149, - "learning_rate": 8.849557522123895e-05, - "loss": 1.8657, + "epoch": 0.2109704641350211, + "grad_norm": 0.5181670188903809, + "learning_rate": 4.2194092827004224e-05, + "loss": 1.2614, "step": 50 }, { - "epoch": 0.24390243902439024, - "grad_norm": 0.34058740735054016, - "learning_rate": 9.734513274336283e-05, - "loss": 1.7914, + "epoch": 0.2320675105485232, + "grad_norm": 0.5109447240829468, + "learning_rate": 4.641350210970464e-05, + "loss": 1.2328, "step": 55 }, { - "epoch": 0.2660753880266075, - "grad_norm": 0.34278205037117004, - "learning_rate": 0.00010619469026548674, - "loss": 1.7762, + "epoch": 0.25316455696202533, + "grad_norm": 0.4200008809566498, + "learning_rate": 5.0632911392405066e-05, + "loss": 1.1882, "step": 60 }, { - "epoch": 0.28824833702882485, - "grad_norm": 0.31179192662239075, - "learning_rate": 0.00011504424778761063, - "loss": 1.7351, + "epoch": 0.2742616033755274, + "grad_norm": 0.37015053629875183, + "learning_rate": 5.4852320675105484e-05, + "loss": 1.1461, "step": 65 }, { - "epoch": 0.31042128603104213, - "grad_norm": 0.3422749936580658, - "learning_rate": 0.0001238938053097345, - "loss": 1.6966, + "epoch": 0.29535864978902954, + "grad_norm": 0.39964228868484497, + "learning_rate": 5.907172995780591e-05, + "loss": 1.1336, "step": 70 }, { - "epoch": 0.3325942350332594, - "grad_norm": 0.4129348397254944, - "learning_rate": 0.00013274336283185842, - "loss": 1.6817, + "epoch": 0.31645569620253167, + "grad_norm": 0.3632591962814331, + "learning_rate": 6.329113924050633e-05, + "loss": 1.1081, "step": 75 }, { - "epoch": 0.35476718403547675, - "grad_norm": 0.48307937383651733, - "learning_rate": 0.0001415929203539823, - "loss": 1.6672, + "epoch": 0.33755274261603374, + "grad_norm": 0.363908588886261, + "learning_rate": 6.751054852320675e-05, + "loss": 1.1042, "step": 80 }, { - "epoch": 0.376940133037694, - "grad_norm": 0.5420916676521301, - "learning_rate": 0.00015044247787610618, - "loss": 1.6174, + "epoch": 0.35864978902953587, + "grad_norm": 0.373738557100296, + "learning_rate": 7.172995780590718e-05, + "loss": 1.0802, "step": 85 }, { - "epoch": 0.3991130820399113, - "grad_norm": 0.41327813267707825, - "learning_rate": 0.0001592920353982301, - "loss": 1.5966, + "epoch": 0.379746835443038, + "grad_norm": 0.3337308168411255, + "learning_rate": 7.59493670886076e-05, + "loss": 1.0781, "step": 90 }, { - "epoch": 0.4212860310421286, - "grad_norm": 0.4299337565898895, - "learning_rate": 0.000168141592920354, - "loss": 1.569, + "epoch": 0.4008438818565401, + "grad_norm": 0.36707767844200134, + "learning_rate": 8.016877637130802e-05, + "loss": 1.07, "step": 95 }, { - "epoch": 0.4434589800443459, - "grad_norm": 0.3761642575263977, - "learning_rate": 0.0001769911504424779, - "loss": 1.5668, + "epoch": 0.4219409282700422, + "grad_norm": 0.40698128938674927, + "learning_rate": 8.438818565400845e-05, + "loss": 1.0672, "step": 100 }, { - "epoch": 0.4656319290465632, - "grad_norm": 0.41222497820854187, - "learning_rate": 0.0001858407079646018, - "loss": 1.5515, + "epoch": 0.4430379746835443, + "grad_norm": 0.4015476107597351, + "learning_rate": 8.860759493670887e-05, + "loss": 1.0508, "step": 105 }, { - "epoch": 0.4878048780487805, - "grad_norm": 0.4026060402393341, - "learning_rate": 0.00019469026548672567, - "loss": 1.5341, + "epoch": 0.4641350210970464, + "grad_norm": 0.3830510675907135, + "learning_rate": 9.282700421940928e-05, + "loss": 1.0396, "step": 110 }, { - "epoch": 0.5099778270509978, - "grad_norm": 0.36704277992248535, - "learning_rate": 0.00019999807262012045, - "loss": 1.536, + "epoch": 0.48523206751054854, + "grad_norm": 0.540158748626709, + "learning_rate": 9.704641350210972e-05, + "loss": 1.0356, "step": 115 }, { - "epoch": 0.532150776053215, - "grad_norm": 0.33943068981170654, - "learning_rate": 0.00019997639044970784, - "loss": 1.5084, + "epoch": 0.5063291139240507, + "grad_norm": 0.5870048999786377, + "learning_rate": 0.00010126582278481013, + "loss": 1.0363, "step": 120 }, { - "epoch": 0.5543237250554324, - "grad_norm": 0.42166343331336975, - "learning_rate": 0.00019993062212508053, - "loss": 1.5056, + "epoch": 0.5274261603375527, + "grad_norm": 0.6282536387443542, + "learning_rate": 0.00010548523206751055, + "loss": 1.0205, "step": 125 }, { - "epoch": 0.5764966740576497, - "grad_norm": 0.3949909806251526, - "learning_rate": 0.00019986077867267113, - "loss": 1.4979, + "epoch": 0.5485232067510548, + "grad_norm": 0.5962668061256409, + "learning_rate": 0.00010970464135021097, + "loss": 1.0327, "step": 130 }, { - "epoch": 0.5986696230598669, - "grad_norm": 0.47220396995544434, - "learning_rate": 0.00019976687691905393, - "loss": 1.4935, + "epoch": 0.569620253164557, + "grad_norm": 0.44145339727401733, + "learning_rate": 0.0001139240506329114, + "loss": 1.0036, "step": 135 }, { - "epoch": 0.6208425720620843, - "grad_norm": 0.4970516860485077, - "learning_rate": 0.00019964893948689122, - "loss": 1.5058, + "epoch": 0.5907172995780591, + "grad_norm": 0.3850124776363373, + "learning_rate": 0.00011814345991561182, + "loss": 0.9939, "step": 140 }, { - "epoch": 0.6430155210643016, - "grad_norm": 0.40719351172447205, - "learning_rate": 0.00019950699478948309, - "loss": 1.4937, + "epoch": 0.6118143459915611, + "grad_norm": 0.45476189255714417, + "learning_rate": 0.00012236286919831225, + "loss": 1.01, "step": 145 }, { - "epoch": 0.6651884700665188, - "grad_norm": 0.3738563358783722, - "learning_rate": 0.000199341077023922, - "loss": 1.4998, + "epoch": 0.6329113924050633, + "grad_norm": 0.4156922399997711, + "learning_rate": 0.00012658227848101267, + "loss": 1.0046, "step": 150 }, { - "epoch": 0.6873614190687362, - "grad_norm": 0.3738692104816437, - "learning_rate": 0.00019915122616285418, - "loss": 1.4868, + "epoch": 0.6540084388185654, + "grad_norm": 0.5297821760177612, + "learning_rate": 0.00013080168776371308, + "loss": 0.9885, "step": 155 }, { - "epoch": 0.7095343680709535, - "grad_norm": 0.3409136235713959, - "learning_rate": 0.00019893748794484948, - "loss": 1.4755, + "epoch": 0.6751054852320675, + "grad_norm": 0.4995609521865845, + "learning_rate": 0.0001350210970464135, + "loss": 0.9885, "step": 160 }, { - "epoch": 0.7317073170731707, - "grad_norm": 0.36116743087768555, - "learning_rate": 0.0001986999138633821, - "loss": 1.4555, + "epoch": 0.6962025316455697, + "grad_norm": 0.43320751190185547, + "learning_rate": 0.00013924050632911395, + "loss": 0.9818, "step": 165 }, { - "epoch": 0.753880266075388, - "grad_norm": 0.38903379440307617, - "learning_rate": 0.00019843856115442482, - "loss": 1.4628, + "epoch": 0.7172995780590717, + "grad_norm": 0.3719841539859772, + "learning_rate": 0.00014345991561181436, + "loss": 0.9739, "step": 170 }, { - "epoch": 0.7760532150776053, - "grad_norm": 0.42216047644615173, - "learning_rate": 0.00019815349278265988, - "loss": 1.4725, + "epoch": 0.7383966244725738, + "grad_norm": 0.40309059619903564, + "learning_rate": 0.00014767932489451478, + "loss": 0.9646, "step": 175 }, { - "epoch": 0.7982261640798226, - "grad_norm": 0.3521736264228821, - "learning_rate": 0.00019784477742630952, - "loss": 1.4695, + "epoch": 0.759493670886076, + "grad_norm": 0.41251224279403687, + "learning_rate": 0.0001518987341772152, + "loss": 0.9606, "step": 180 }, { - "epoch": 0.8203991130820399, - "grad_norm": 0.3384956419467926, - "learning_rate": 0.00019751248946059014, - "loss": 1.4512, + "epoch": 0.7805907172995781, + "grad_norm": 0.3959939181804657, + "learning_rate": 0.00015611814345991562, + "loss": 0.9585, "step": 185 }, { - "epoch": 0.8425720620842572, - "grad_norm": 0.35126349329948425, - "learning_rate": 0.00019715670893979414, - "loss": 1.4509, + "epoch": 0.8016877637130801, + "grad_norm": 0.5289701819419861, + "learning_rate": 0.00016033755274261603, + "loss": 0.9709, "step": 190 }, { - "epoch": 0.8647450110864745, - "grad_norm": 0.37433576583862305, - "learning_rate": 0.00019677752157800312, - "loss": 1.4386, + "epoch": 0.8227848101265823, + "grad_norm": 0.4239669740200043, + "learning_rate": 0.00016455696202531648, + "loss": 0.9577, "step": 195 }, { - "epoch": 0.8869179600886918, - "grad_norm": 0.39587318897247314, - "learning_rate": 0.0001963750187284379, - "loss": 1.4529, + "epoch": 0.8438818565400844, + "grad_norm": 0.5463127493858337, + "learning_rate": 0.0001687763713080169, + "loss": 0.9703, "step": 200 }, { - "epoch": 0.9090909090909091, - "grad_norm": 0.403945654630661, - "learning_rate": 0.00019594929736144976, - "loss": 1.4368, + "epoch": 0.8649789029535865, + "grad_norm": 0.4942500591278076, + "learning_rate": 0.00017299578059071731, + "loss": 0.956, "step": 205 }, { - "epoch": 0.9312638580931264, - "grad_norm": 0.3656717538833618, - "learning_rate": 0.0001955004600411586, - "loss": 1.4208, + "epoch": 0.8860759493670886, + "grad_norm": 0.361708402633667, + "learning_rate": 0.00017721518987341773, + "loss": 0.9598, "step": 210 }, { - "epoch": 0.9534368070953437, - "grad_norm": 0.409030556678772, - "learning_rate": 0.0001950286149007434, - "loss": 1.4151, + "epoch": 0.9071729957805907, + "grad_norm": 0.5146432518959045, + "learning_rate": 0.00018143459915611815, + "loss": 0.9606, "step": 215 }, { - "epoch": 0.975609756097561, - "grad_norm": 0.4138728678226471, - "learning_rate": 0.0001945338756163907, - "loss": 1.4441, + "epoch": 0.9282700421940928, + "grad_norm": 0.4746183156967163, + "learning_rate": 0.00018565400843881857, + "loss": 0.9255, "step": 220 }, { - "epoch": 0.9977827050997783, - "grad_norm": 0.4125191271305084, - "learning_rate": 0.00019401636137990816, - "loss": 1.4429, + "epoch": 0.9493670886075949, + "grad_norm": 0.35139352083206177, + "learning_rate": 0.00018987341772151899, + "loss": 0.9452, "step": 225 }, { - "epoch": 0.9977827050997783, - "eval_loss": 1.7484289407730103, - "eval_runtime": 0.374, - "eval_samples_per_second": 2.674, - "eval_steps_per_second": 2.674, - "step": 225 - }, - { - "epoch": 1.0199556541019956, - "grad_norm": 0.498823344707489, - "learning_rate": 0.00019347619687000892, - "loss": 1.4079, + "epoch": 0.9704641350210971, + "grad_norm": 0.3744509816169739, + "learning_rate": 0.00019409282700421943, + "loss": 0.9388, "step": 230 }, { - "epoch": 1.042128603104213, - "grad_norm": 0.3439851999282837, - "learning_rate": 0.00019291351222227432, - "loss": 1.4058, + "epoch": 0.9915611814345991, + "grad_norm": 0.409365177154541, + "learning_rate": 0.00019831223628691985, + "loss": 0.9183, "step": 235 }, { - "epoch": 1.06430155210643, - "grad_norm": 0.38840946555137634, - "learning_rate": 0.0001923284429978017, - "loss": 1.4204, + "epoch": 1.0, + "eval_loss": 1.6516629457473755, + "eval_runtime": 0.5551, + "eval_samples_per_second": 3.603, + "eval_steps_per_second": 1.802, + "step": 237 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 0.3671579658985138, + "learning_rate": 0.00019999902382014363, + "loss": 0.9181, "step": 240 }, { - "epoch": 1.0864745011086474, - "grad_norm": 0.34416335821151733, - "learning_rate": 0.00019172113015054532, - "loss": 1.4056, + "epoch": 1.0337552742616034, + "grad_norm": 0.37430304288864136, + "learning_rate": 0.0001999930583455953, + "loss": 0.9099, "step": 245 }, { - "epoch": 1.1086474501108647, - "grad_norm": 0.3941856324672699, - "learning_rate": 0.00019109171999335793, - "loss": 1.4249, + "epoch": 1.0548523206751055, + "grad_norm": 0.3971017599105835, + "learning_rate": 0.00019998167004176888, + "loss": 0.9187, "step": 250 }, { - "epoch": 1.130820399113082, - "grad_norm": 0.36946943402290344, - "learning_rate": 0.00019044036416274133, - "loss": 1.4083, + "epoch": 1.0759493670886076, + "grad_norm": 0.35043320059776306, + "learning_rate": 0.00019996485952627552, + "loss": 0.9063, "step": 255 }, { - "epoch": 1.1529933481152994, - "grad_norm": 0.3524357080459595, - "learning_rate": 0.00018976721958231438, - "loss": 1.3953, + "epoch": 1.0970464135021096, + "grad_norm": 0.34455016255378723, + "learning_rate": 0.00019994262771078406, + "loss": 0.9072, "step": 260 }, { - "epoch": 1.1751662971175167, - "grad_norm": 0.39715054631233215, - "learning_rate": 0.00018907244842500704, - "loss": 1.4021, + "epoch": 1.1181434599156117, + "grad_norm": 0.358531653881073, + "learning_rate": 0.0001999149758009716, + "loss": 0.916, "step": 265 }, { - "epoch": 1.1973392461197339, - "grad_norm": 0.43194007873535156, - "learning_rate": 0.00018835621807399016, - "loss": 1.4057, + "epoch": 1.139240506329114, + "grad_norm": 0.3874802589416504, + "learning_rate": 0.00019988190529645808, + "loss": 0.8913, "step": 270 }, { - "epoch": 1.2195121951219512, - "grad_norm": 0.38655465841293335, - "learning_rate": 0.0001876187010823496, - "loss": 1.3816, + "epoch": 1.160337552742616, + "grad_norm": 0.3963850736618042, + "learning_rate": 0.00019984341799072504, + "loss": 0.9033, "step": 275 }, { - "epoch": 1.2416851441241685, - "grad_norm": 0.330555260181427, - "learning_rate": 0.00018686007513151514, - "loss": 1.3913, + "epoch": 1.1814345991561181, + "grad_norm": 0.485408753156662, + "learning_rate": 0.0001997995159710182, + "loss": 0.8965, "step": 280 }, { - "epoch": 1.2638580931263859, - "grad_norm": 0.34880608320236206, - "learning_rate": 0.0001860805229884536, - "loss": 1.407, + "epoch": 1.2025316455696202, + "grad_norm": 0.383575975894928, + "learning_rate": 0.00019975020161823445, + "loss": 0.8919, "step": 285 }, { - "epoch": 1.2860310421286032, - "grad_norm": 0.38133811950683594, - "learning_rate": 0.00018528023246163717, - "loss": 1.3834, + "epoch": 1.2236286919831223, + "grad_norm": 0.3458747863769531, + "learning_rate": 0.00019969547760679258, + "loss": 0.8827, "step": 290 }, { - "epoch": 1.3082039911308203, - "grad_norm": 0.36675792932510376, - "learning_rate": 0.00018445939635579656, - "loss": 1.377, + "epoch": 1.2447257383966246, + "grad_norm": 0.36945974826812744, + "learning_rate": 0.00019963534690448835, + "loss": 0.8957, "step": 295 }, { - "epoch": 1.3303769401330376, - "grad_norm": 0.383324533700943, - "learning_rate": 0.0001836182124254711, - "loss": 1.3904, + "epoch": 1.2658227848101267, + "grad_norm": 0.39198634028434753, + "learning_rate": 0.0001995698127723334, + "loss": 0.879, "step": 300 }, { - "epoch": 1.352549889135255, - "grad_norm": 0.47266829013824463, - "learning_rate": 0.00018275688332736577, - "loss": 1.3839, + "epoch": 1.2869198312236287, + "grad_norm": 0.36909279227256775, + "learning_rate": 0.0001994988787643786, + "loss": 0.9014, "step": 305 }, { - "epoch": 1.3747228381374723, - "grad_norm": 0.36780834197998047, - "learning_rate": 0.00018187561657152757, - "loss": 1.3729, + "epoch": 1.3080168776371308, + "grad_norm": 0.3728243112564087, + "learning_rate": 0.00019942254872752112, + "loss": 0.891, "step": 310 }, { - "epoch": 1.3968957871396896, - "grad_norm": 0.35294458270072937, - "learning_rate": 0.00018097462447135273, - "loss": 1.386, + "epoch": 1.3291139240506329, + "grad_norm": 0.34412381052970886, + "learning_rate": 0.00019934082680129586, + "loss": 0.8744, "step": 315 }, { - "epoch": 1.4190687361419068, - "grad_norm": 0.36432820558547974, - "learning_rate": 0.00018005412409243606, - "loss": 1.3678, + "epoch": 1.350210970464135, + "grad_norm": 0.3310409486293793, + "learning_rate": 0.00019925371741765107, + "loss": 0.8788, "step": 320 }, { - "epoch": 1.441241685144124, - "grad_norm": 0.38247236609458923, - "learning_rate": 0.00017911433720027624, - "loss": 1.3612, + "epoch": 1.371308016877637, + "grad_norm": 0.3466126620769501, + "learning_rate": 0.00019916122530070783, + "loss": 0.8953, "step": 325 }, { - "epoch": 1.4634146341463414, - "grad_norm": 0.4539274275302887, - "learning_rate": 0.00017815549020684825, - "loss": 1.3778, + "epoch": 1.3924050632911391, + "grad_norm": 0.3331877291202545, + "learning_rate": 0.00019906335546650392, + "loss": 0.8703, "step": 330 }, { - "epoch": 1.4855875831485588, - "grad_norm": 0.39669978618621826, - "learning_rate": 0.0001771778141160566, - "loss": 1.38, + "epoch": 1.4135021097046414, + "grad_norm": 0.37082529067993164, + "learning_rate": 0.0001989601132227218, + "loss": 0.8951, "step": 335 }, { - "epoch": 1.507760532150776, - "grad_norm": 0.3531462550163269, - "learning_rate": 0.0001761815444680822, - "loss": 1.3805, + "epoch": 1.4345991561181435, + "grad_norm": 0.3338633179664612, + "learning_rate": 0.00019885150416840082, + "loss": 0.8826, "step": 340 }, { - "epoch": 1.5299334811529932, - "grad_norm": 0.3824242055416107, - "learning_rate": 0.00017516692128263648, - "loss": 1.3764, + "epoch": 1.4556962025316456, + "grad_norm": 0.32301968336105347, + "learning_rate": 0.00019873753419363336, + "loss": 0.8821, "step": 345 }, { - "epoch": 1.5521064301552108, - "grad_norm": 0.3946467936038971, - "learning_rate": 0.00017413418900113605, - "loss": 1.3678, + "epoch": 1.4767932489451476, + "grad_norm": 0.3255464732646942, + "learning_rate": 0.00019861820947924565, + "loss": 0.87, "step": 350 }, { - "epoch": 1.5742793791574279, - "grad_norm": 0.38375967741012573, - "learning_rate": 0.00017308359642781242, - "loss": 1.3767, + "epoch": 1.49789029535865, + "grad_norm": 0.33072352409362793, + "learning_rate": 0.0001984935364964625, + "loss": 0.8755, "step": 355 }, { - "epoch": 1.5964523281596452, - "grad_norm": 0.418390154838562, - "learning_rate": 0.00017201539666977043, - "loss": 1.3734, + "epoch": 1.518987341772152, + "grad_norm": 0.39160993695259094, + "learning_rate": 0.0001983635220065562, + "loss": 0.861, "step": 360 }, { - "epoch": 1.6186252771618626, - "grad_norm": 0.35475605726242065, - "learning_rate": 0.0001709298470760101, - "loss": 1.3733, + "epoch": 1.540084388185654, + "grad_norm": 0.35338205099105835, + "learning_rate": 0.00019822817306048006, + "loss": 0.864, "step": 365 }, { - "epoch": 1.6407982261640797, - "grad_norm": 0.35535967350006104, - "learning_rate": 0.0001698272091754264, - "loss": 1.3879, + "epoch": 1.5611814345991561, + "grad_norm": 0.30854344367980957, + "learning_rate": 0.00019808749699848593, + "loss": 0.8521, "step": 370 }, { - "epoch": 1.6629711751662972, - "grad_norm": 0.38182777166366577, - "learning_rate": 0.00016870774861380228, - "loss": 1.3673, + "epoch": 1.5822784810126582, + "grad_norm": 0.3593827188014984, + "learning_rate": 0.00019794150144972602, + "loss": 0.8738, "step": 375 }, { - "epoch": 1.6851441241685143, - "grad_norm": 0.4077872037887573, - "learning_rate": 0.00016757173508980965, - "loss": 1.3711, + "epoch": 1.6033755274261603, + "grad_norm": 0.30474522709846497, + "learning_rate": 0.0001977901943318393, + "loss": 0.8612, "step": 380 }, { - "epoch": 1.7073170731707317, - "grad_norm": 0.42192327976226807, - "learning_rate": 0.00016641944229003395, - "loss": 1.376, + "epoch": 1.6244725738396624, + "grad_norm": 0.3270438611507416, + "learning_rate": 0.0001976335838505221, + "loss": 0.8838, "step": 385 }, { - "epoch": 1.729490022172949, - "grad_norm": 0.36239758133888245, - "learning_rate": 0.00016525114782303807, - "loss": 1.3643, + "epoch": 1.6455696202531644, + "grad_norm": 0.3059784471988678, + "learning_rate": 0.00019747167849908304, + "loss": 0.8687, "step": 390 }, { - "epoch": 1.7516629711751663, - "grad_norm": 0.37795644998550415, - "learning_rate": 0.00016406713315248136, - "loss": 1.3608, + "epoch": 1.6666666666666665, + "grad_norm": 0.3195934295654297, + "learning_rate": 0.00019730448705798239, + "loss": 0.8639, "step": 395 }, { - "epoch": 1.7738359201773837, - "grad_norm": 0.33361098170280457, - "learning_rate": 0.00016286768352930973, - "loss": 1.3693, + "epoch": 1.6877637130801688, + "grad_norm": 0.31650060415267944, + "learning_rate": 0.00019713201859435602, + "loss": 0.8825, "step": 400 }, { - "epoch": 1.7960088691796008, - "grad_norm": 0.36207035183906555, - "learning_rate": 0.0001616530879230335, - "loss": 1.3709, + "epoch": 1.7088607594936709, + "grad_norm": 0.33506250381469727, + "learning_rate": 0.0001969542824615235, + "loss": 0.8663, "step": 405 }, { - "epoch": 1.8181818181818183, - "grad_norm": 0.34705451130867004, - "learning_rate": 0.00016042363895210946, - "loss": 1.3502, + "epoch": 1.729957805907173, + "grad_norm": 0.3536715805530548, + "learning_rate": 0.00019677128829848103, + "loss": 0.8498, "step": 410 }, { - "epoch": 1.8403547671840355, - "grad_norm": 0.339901864528656, - "learning_rate": 0.00015917963281344345, - "loss": 1.3591, + "epoch": 1.7510548523206753, + "grad_norm": 0.35253262519836426, + "learning_rate": 0.00019658304602937856, + "loss": 0.8614, "step": 415 }, { - "epoch": 1.8625277161862528, - "grad_norm": 0.3383360505104065, - "learning_rate": 0.00015792136921103124, - "loss": 1.3473, + "epoch": 1.7721518987341773, + "grad_norm": 0.3264296054840088, + "learning_rate": 0.0001963895658629816, + "loss": 0.8456, "step": 420 }, { - "epoch": 1.8847006651884701, - "grad_norm": 0.3463388979434967, - "learning_rate": 0.0001566491512837543, - "loss": 1.3723, + "epoch": 1.7932489451476794, + "grad_norm": 0.32499557733535767, + "learning_rate": 0.00019619085829211764, + "loss": 0.8435, "step": 425 }, { - "epoch": 1.9068736141906872, - "grad_norm": 0.36131027340888977, - "learning_rate": 0.00015536328553234792, - "loss": 1.3747, + "epoch": 1.8143459915611815, + "grad_norm": 0.31721100211143494, + "learning_rate": 0.00019598693409310708, + "loss": 0.8716, "step": 430 }, { - "epoch": 1.9290465631929048, - "grad_norm": 0.3317002058029175, - "learning_rate": 0.00015406408174555976, - "loss": 1.3497, + "epoch": 1.8354430379746836, + "grad_norm": 0.31418412923812866, + "learning_rate": 0.00019577780432517879, + "loss": 0.859, "step": 435 }, { - "epoch": 1.951219512195122, - "grad_norm": 0.39210090041160583, - "learning_rate": 0.00015275185292551585, - "loss": 1.3622, + "epoch": 1.8565400843881856, + "grad_norm": 0.3075924515724182, + "learning_rate": 0.0001955634803298703, + "loss": 0.8573, "step": 440 }, { - "epoch": 1.9733924611973392, - "grad_norm": 0.3842204213142395, - "learning_rate": 0.00015142691521231267, - "loss": 1.3575, + "epoch": 1.8776371308016877, + "grad_norm": 0.30187729001045227, + "learning_rate": 0.00019534397373041285, + "loss": 0.8381, "step": 445 }, { - "epoch": 1.9955654101995566, - "grad_norm": 0.3843972980976105, - "learning_rate": 0.0001500895878078532, - "loss": 1.3594, + "epoch": 1.8987341772151898, + "grad_norm": 0.3083683252334595, + "learning_rate": 0.00019511929643110097, + "loss": 0.8536, "step": 450 }, { - "epoch": 2.0, - "eval_loss": 1.77187979221344, - "eval_runtime": 0.3372, - "eval_samples_per_second": 2.966, - "eval_steps_per_second": 2.966, - "step": 451 - }, - { - "epoch": 2.0177383592017737, - "grad_norm": 0.3560327887535095, - "learning_rate": 0.00014874019289894537, - "loss": 1.3498, + "epoch": 1.9198312236286919, + "grad_norm": 0.2972455620765686, + "learning_rate": 0.0001948894606166468, + "loss": 0.8487, "step": 455 }, { - "epoch": 2.0399113082039912, - "grad_norm": 0.3512759804725647, - "learning_rate": 0.00014737905557968105, - "loss": 1.3208, + "epoch": 1.9409282700421941, + "grad_norm": 0.3294317424297333, + "learning_rate": 0.00019465447875151946, + "loss": 0.8485, "step": 460 }, { - "epoch": 2.0620842572062084, - "grad_norm": 0.35677239298820496, - "learning_rate": 0.00014600650377311522, - "loss": 1.3367, + "epoch": 1.9620253164556962, + "grad_norm": 0.28597962856292725, + "learning_rate": 0.00019441436357926892, + "loss": 0.8608, "step": 465 }, { - "epoch": 2.084257206208426, - "grad_norm": 0.3483263850212097, - "learning_rate": 0.00014462286815226314, - "loss": 1.3235, + "epoch": 1.9831223628691983, + "grad_norm": 0.30924198031425476, + "learning_rate": 0.00019416912812183498, + "loss": 0.8583, "step": 470 }, { - "epoch": 2.106430155210643, - "grad_norm": 0.3995297849178314, - "learning_rate": 0.00014322848206043505, - "loss": 1.3278, + "epoch": 2.0, + "eval_loss": 1.629499912261963, + "eval_runtime": 0.5557, + "eval_samples_per_second": 3.599, + "eval_steps_per_second": 1.8, + "step": 474 + }, + { + "epoch": 2.0042194092827006, + "grad_norm": 0.3041239380836487, + "learning_rate": 0.000193918785678841, + "loss": 0.8475, "step": 475 }, { - "epoch": 2.12860310421286, - "grad_norm": 0.34428730607032776, - "learning_rate": 0.00014182368143092768, - "loss": 1.3253, + "epoch": 2.0253164556962027, + "grad_norm": 0.31540679931640625, + "learning_rate": 0.0001936633498268728, + "loss": 0.8119, "step": 480 }, { - "epoch": 2.1507760532150777, - "grad_norm": 0.3784777820110321, - "learning_rate": 0.00014040880470609187, - "loss": 1.3062, + "epoch": 2.0464135021097047, + "grad_norm": 0.3115026652812958, + "learning_rate": 0.0001934028344187421, + "loss": 0.8259, "step": 485 }, { - "epoch": 2.172949002217295, - "grad_norm": 0.35997453331947327, - "learning_rate": 0.00013898419275579522, - "loss": 1.3055, + "epoch": 2.067510548523207, + "grad_norm": 0.39703112840652466, + "learning_rate": 0.00019313725358273548, + "loss": 0.8041, "step": 490 }, { - "epoch": 2.1951219512195124, - "grad_norm": 0.37477272748947144, - "learning_rate": 0.00013755018879530075, - "loss": 1.3283, + "epoch": 2.088607594936709, + "grad_norm": 0.3294001817703247, + "learning_rate": 0.00019286662172184808, + "loss": 0.8003, "step": 495 }, { - "epoch": 2.2172949002217295, - "grad_norm": 0.42467719316482544, - "learning_rate": 0.00013610713830257954, - "loss": 1.3244, + "epoch": 2.109704641350211, + "grad_norm": 0.3266647160053253, + "learning_rate": 0.00019259095351300252, + "loss": 0.8109, "step": 500 }, { - "epoch": 2.2394678492239466, - "grad_norm": 0.37103238701820374, - "learning_rate": 0.00013465538893507907, - "loss": 1.3338, + "epoch": 2.130801687763713, + "grad_norm": 0.31092244386672974, + "learning_rate": 0.0001923102639062529, + "loss": 0.8212, "step": 505 }, { - "epoch": 2.261640798226164, - "grad_norm": 0.3519739806652069, - "learning_rate": 0.00013319529044596593, - "loss": 1.3243, + "epoch": 2.151898734177215, + "grad_norm": 0.3094409704208374, + "learning_rate": 0.00019202456812397406, + "loss": 0.8187, "step": 510 }, { - "epoch": 2.2838137472283813, - "grad_norm": 0.366401731967926, - "learning_rate": 0.00013172719459986397, - "loss": 1.3188, + "epoch": 2.172995780590717, + "grad_norm": 0.32525700330734253, + "learning_rate": 0.00019173388166003613, + "loss": 0.8058, "step": 515 }, { - "epoch": 2.305986696230599, - "grad_norm": 0.39243802428245544, - "learning_rate": 0.0001302514550881076, - "loss": 1.3119, + "epoch": 2.1940928270042193, + "grad_norm": 0.296203076839447, + "learning_rate": 0.00019143822027896406, + "loss": 0.8037, "step": 520 }, { - "epoch": 2.328159645232816, - "grad_norm": 0.3964441418647766, - "learning_rate": 0.00012876842744353112, - "loss": 1.3176, + "epoch": 2.2151898734177213, + "grad_norm": 0.3076232373714447, + "learning_rate": 0.0001911376000150828, + "loss": 0.8208, "step": 525 }, { - "epoch": 2.3503325942350335, - "grad_norm": 0.3520803153514862, - "learning_rate": 0.00012727846895481434, - "loss": 1.3171, + "epoch": 2.2362869198312234, + "grad_norm": 0.2956830859184265, + "learning_rate": 0.0001908320371716478, + "loss": 0.788, "step": 530 }, { - "epoch": 2.3725055432372506, - "grad_norm": 0.35095489025115967, - "learning_rate": 0.00012578193858040507, - "loss": 1.3239, + "epoch": 2.257383966244726, + "grad_norm": 0.3284301161766052, + "learning_rate": 0.00019052154831996073, + "loss": 0.7986, "step": 535 }, { - "epoch": 2.3946784922394677, - "grad_norm": 0.3672475516796112, - "learning_rate": 0.0001242791968620394, - "loss": 1.3295, + "epoch": 2.278481012658228, + "grad_norm": 0.31620386242866516, + "learning_rate": 0.00019020615029847072, + "loss": 0.8049, "step": 540 }, { - "epoch": 2.4168514412416853, - "grad_norm": 0.3635219633579254, - "learning_rate": 0.00012277060583788064, - "loss": 1.3299, + "epoch": 2.29957805907173, + "grad_norm": 0.3297165632247925, + "learning_rate": 0.00018988586021186147, + "loss": 0.8309, "step": 545 }, { - "epoch": 2.4390243902439024, - "grad_norm": 0.3647380471229553, - "learning_rate": 0.00012125652895529766, - "loss": 1.3255, + "epoch": 2.320675105485232, + "grad_norm": 0.2959255874156952, + "learning_rate": 0.0001895606954301233, + "loss": 0.7965, "step": 550 }, { - "epoch": 2.4611973392461195, - "grad_norm": 0.40251094102859497, - "learning_rate": 0.00011973733098330368, - "loss": 1.324, + "epoch": 2.3417721518987342, + "grad_norm": 0.3089437782764435, + "learning_rate": 0.00018923067358761136, + "loss": 0.7946, "step": 555 }, { - "epoch": 2.483370288248337, - "grad_norm": 0.36837145686149597, - "learning_rate": 0.0001182133779246766, - "loss": 1.3061, + "epoch": 2.3628691983122363, + "grad_norm": 0.30109426379203796, + "learning_rate": 0.00018889581258208903, + "loss": 0.8123, "step": 560 }, { - "epoch": 2.505543237250554, - "grad_norm": 0.3746941387653351, - "learning_rate": 0.00011668503692778239, - "loss": 1.3239, + "epoch": 2.3839662447257384, + "grad_norm": 0.32586753368377686, + "learning_rate": 0.0001885561305737577, + "loss": 0.8162, "step": 565 }, { - "epoch": 2.5277161862527717, - "grad_norm": 0.3703044652938843, - "learning_rate": 0.00011515267619812214, - "loss": 1.3197, + "epoch": 2.4050632911392404, + "grad_norm": 0.3282499611377716, + "learning_rate": 0.00018821164598427145, + "loss": 0.8196, "step": 570 }, { - "epoch": 2.549889135254989, - "grad_norm": 0.3508300185203552, - "learning_rate": 0.00011361666490962468, - "loss": 1.3231, + "epoch": 2.4261603375527425, + "grad_norm": 0.3379076421260834, + "learning_rate": 0.00018786237749573837, + "loss": 0.816, "step": 575 }, { - "epoch": 2.5720620842572064, - "grad_norm": 0.36951854825019836, - "learning_rate": 0.00011207737311570559, - "loss": 1.3205, + "epoch": 2.4472573839662446, + "grad_norm": 0.30591997504234314, + "learning_rate": 0.00018750834404970718, + "loss": 0.8015, "step": 580 }, { - "epoch": 2.5942350332594235, - "grad_norm": 0.3559792637825012, - "learning_rate": 0.00011053517166011471, - "loss": 1.312, + "epoch": 2.4683544303797467, + "grad_norm": 0.3031338155269623, + "learning_rate": 0.00018714956484613995, + "loss": 0.817, "step": 585 }, { - "epoch": 2.6164079822616406, - "grad_norm": 0.37214240431785583, - "learning_rate": 0.00010899043208759305, - "loss": 1.3354, + "epoch": 2.489451476793249, + "grad_norm": 0.3209945261478424, + "learning_rate": 0.0001867860593423711, + "loss": 0.8134, "step": 590 }, { - "epoch": 2.638580931263858, - "grad_norm": 0.35989540815353394, - "learning_rate": 0.00010744352655436059, - "loss": 1.3408, + "epoch": 2.510548523206751, + "grad_norm": 0.31199783086776733, + "learning_rate": 0.000186417847252052, + "loss": 0.8053, "step": 595 }, { - "epoch": 2.6607538802660753, - "grad_norm": 0.35892507433891296, - "learning_rate": 0.00010589482773845727, - "loss": 1.3268, + "epoch": 2.5316455696202533, + "grad_norm": 0.29871612787246704, + "learning_rate": 0.00018604494854408178, + "loss": 0.804, "step": 600 }, { - "epoch": 2.682926829268293, - "grad_norm": 0.3559330105781555, - "learning_rate": 0.00010434470874995781, - "loss": 1.3102, + "epoch": 2.5527426160337554, + "grad_norm": 0.31065070629119873, + "learning_rate": 0.0001856673834415246, + "loss": 0.8033, "step": 605 }, { - "epoch": 2.70509977827051, - "grad_norm": 0.3716677725315094, - "learning_rate": 0.00010279354304108271, - "loss": 1.3015, + "epoch": 2.5738396624472575, + "grad_norm": 0.2946309447288513, + "learning_rate": 0.00018528517242051283, + "loss": 0.8006, "step": 610 }, { - "epoch": 2.7272727272727275, - "grad_norm": 0.3538273572921753, - "learning_rate": 0.0001012417043162266, - "loss": 1.3046, + "epoch": 2.5949367088607596, + "grad_norm": 0.2882119119167328, + "learning_rate": 0.00018489833620913642, + "loss": 0.8059, "step": 615 }, { - "epoch": 2.7494456762749446, - "grad_norm": 0.38784587383270264, - "learning_rate": 9.968956644192617e-05, - "loss": 1.3245, + "epoch": 2.6160337552742616, + "grad_norm": 0.31968575716018677, + "learning_rate": 0.00018450689578631898, + "loss": 0.8045, "step": 620 }, { - "epoch": 2.7716186252771617, - "grad_norm": 0.3511577248573303, - "learning_rate": 9.813750335678866e-05, - "loss": 1.3181, + "epoch": 2.6371308016877637, + "grad_norm": 0.3142683804035187, + "learning_rate": 0.00018411087238068003, + "loss": 0.8045, "step": 625 }, { - "epoch": 2.7937915742793793, - "grad_norm": 0.37702956795692444, - "learning_rate": 9.658588898140322e-05, - "loss": 1.3147, + "epoch": 2.6582278481012658, + "grad_norm": 0.3006449043750763, + "learning_rate": 0.0001837102874693836, + "loss": 0.8056, "step": 630 }, { - "epoch": 2.8159645232815964, - "grad_norm": 0.3686079680919647, - "learning_rate": 9.503509712825658e-05, - "loss": 1.311, + "epoch": 2.679324894514768, + "grad_norm": 0.3158734440803528, + "learning_rate": 0.0001833051627769736, + "loss": 0.8275, "step": 635 }, { - "epoch": 2.8381374722838135, - "grad_norm": 0.3567045331001282, - "learning_rate": 9.348550141167472e-05, - "loss": 1.2965, + "epoch": 2.70042194092827, + "grad_norm": 0.30407387018203735, + "learning_rate": 0.00018289552027419558, + "loss": 0.8133, "step": 640 }, { - "epoch": 2.860310421286031, - "grad_norm": 0.3552383780479431, - "learning_rate": 9.193747515781224e-05, - "loss": 1.3121, + "epoch": 2.721518987341772, + "grad_norm": 0.3022385835647583, + "learning_rate": 0.0001824813821768053, + "loss": 0.8026, "step": 645 }, { - "epoch": 2.882483370288248, - "grad_norm": 0.3610834777355194, - "learning_rate": 9.039139131471128e-05, - "loss": 1.3136, + "epoch": 2.742616033755274, + "grad_norm": 0.29126378893852234, + "learning_rate": 0.00018206277094436377, + "loss": 0.8075, "step": 650 }, { - "epoch": 2.9046563192904657, - "grad_norm": 0.36015474796295166, - "learning_rate": 8.884762236245145e-05, - "loss": 1.3131, + "epoch": 2.7637130801687766, + "grad_norm": 0.29224568605422974, + "learning_rate": 0.00018163970927901937, + "loss": 0.811, "step": 655 }, { - "epoch": 2.926829268292683, - "grad_norm": 0.3582945764064789, - "learning_rate": 8.730654022341256e-05, - "loss": 1.3237, + "epoch": 2.7848101265822782, + "grad_norm": 0.3017180860042572, + "learning_rate": 0.00018121222012427665, + "loss": 0.7945, "step": 660 }, { - "epoch": 2.9490022172949004, - "grad_norm": 0.34879669547080994, - "learning_rate": 8.57685161726715e-05, - "loss": 1.3295, + "epoch": 2.8059071729957807, + "grad_norm": 0.2974776029586792, + "learning_rate": 0.00018078032666375194, + "loss": 0.8078, "step": 665 }, { - "epoch": 2.9711751662971175, - "grad_norm": 0.3675631582736969, - "learning_rate": 8.423392074855545e-05, - "loss": 1.3066, + "epoch": 2.827004219409283, + "grad_norm": 0.2910807430744171, + "learning_rate": 0.0001803440523199162, + "loss": 0.7887, "step": 670 }, { - "epoch": 2.9933481152993346, - "grad_norm": 0.34397903084754944, - "learning_rate": 8.270312366337226e-05, - "loss": 1.3154, + "epoch": 2.848101265822785, + "grad_norm": 0.3062914311885834, + "learning_rate": 0.0001799034207528247, + "loss": 0.7928, "step": 675 }, { - "epoch": 2.9977827050997785, - "eval_loss": 1.7679176330566406, - "eval_runtime": 0.3532, - "eval_samples_per_second": 2.831, - "eval_steps_per_second": 2.831, - "step": 676 - }, - { - "epoch": 3.015521064301552, - "grad_norm": 0.38900941610336304, - "learning_rate": 8.117649371433994e-05, - "loss": 1.2977, + "epoch": 2.869198312236287, + "grad_norm": 0.29467758536338806, + "learning_rate": 0.0001794584558588338, + "loss": 0.8047, "step": 680 }, { - "epoch": 3.0376940133037693, - "grad_norm": 0.3746815025806427, - "learning_rate": 7.965439869473664e-05, - "loss": 1.2848, + "epoch": 2.890295358649789, + "grad_norm": 0.32635724544525146, + "learning_rate": 0.00017900918176930522, + "loss": 0.8144, "step": 685 }, { - "epoch": 3.059866962305987, - "grad_norm": 0.3739188313484192, - "learning_rate": 7.813720530529243e-05, - "loss": 1.29, + "epoch": 2.911392405063291, + "grad_norm": 0.31900787353515625, + "learning_rate": 0.00017855562284929718, + "loss": 0.8089, "step": 690 }, { - "epoch": 3.082039911308204, - "grad_norm": 0.3820560574531555, - "learning_rate": 7.66252790658445e-05, - "loss": 1.2729, + "epoch": 2.932489451476793, + "grad_norm": 0.3085595667362213, + "learning_rate": 0.00017809780369624302, + "loss": 0.8048, "step": 695 }, { - "epoch": 3.104212860310421, - "grad_norm": 0.41729655861854553, - "learning_rate": 7.511898422727642e-05, - "loss": 1.2701, + "epoch": 2.9535864978902953, + "grad_norm": 0.2946968078613281, + "learning_rate": 0.00017763574913861734, + "loss": 0.8157, "step": 700 }, { - "epoch": 3.1263858093126387, - "grad_norm": 0.3661589026451111, - "learning_rate": 7.361868368376364e-05, - "loss": 1.279, + "epoch": 2.9746835443037973, + "grad_norm": 0.29723235964775085, + "learning_rate": 0.00017716948423458938, + "loss": 0.796, "step": 705 }, { - "epoch": 3.1485587583148558, - "grad_norm": 0.3968789875507355, - "learning_rate": 7.212473888534546e-05, - "loss": 1.2863, + "epoch": 2.9957805907173, + "grad_norm": 0.2712932229042053, + "learning_rate": 0.00017669903427066424, + "loss": 0.8179, "step": 710 }, { - "epoch": 3.1707317073170733, - "grad_norm": 0.3675246238708496, - "learning_rate": 7.063750975084518e-05, - "loss": 1.2811, + "epoch": 3.0, + "eval_loss": 1.6558986902236938, + "eval_runtime": 0.5507, + "eval_samples_per_second": 3.632, + "eval_steps_per_second": 1.816, + "step": 711 + }, + { + "epoch": 3.0168776371308015, + "grad_norm": 0.3356448709964752, + "learning_rate": 0.0001762244247603113, + "loss": 0.7628, "step": 715 }, { - "epoch": 3.1929046563192904, - "grad_norm": 0.39409342408180237, - "learning_rate": 6.915735458115884e-05, - "loss": 1.2765, + "epoch": 3.037974683544304, + "grad_norm": 0.3006523847579956, + "learning_rate": 0.00017574568144258077, + "loss": 0.7558, "step": 720 }, { - "epoch": 3.2150776053215075, - "grad_norm": 0.3648887574672699, - "learning_rate": 6.768462997293413e-05, - "loss": 1.2882, + "epoch": 3.059071729957806, + "grad_norm": 0.30827251076698303, + "learning_rate": 0.00017526283028070777, + "loss": 0.7567, "step": 725 }, { - "epoch": 3.237250554323725, - "grad_norm": 0.3808637857437134, - "learning_rate": 6.62196907326595e-05, - "loss": 1.2973, + "epoch": 3.080168776371308, + "grad_norm": 0.3096933662891388, + "learning_rate": 0.00017477589746070417, + "loss": 0.7581, "step": 730 }, { - "epoch": 3.259423503325942, - "grad_norm": 0.388408362865448, - "learning_rate": 6.476288979118496e-05, - "loss": 1.3099, + "epoch": 3.1012658227848102, + "grad_norm": 0.32005831599235535, + "learning_rate": 0.00017428490938993862, + "loss": 0.7549, "step": 735 }, { - "epoch": 3.2815964523281598, - "grad_norm": 0.3849842846393585, - "learning_rate": 6.331457811869437e-05, - "loss": 1.2919, + "epoch": 3.1223628691983123, + "grad_norm": 0.30930569767951965, + "learning_rate": 0.00017378989269570437, + "loss": 0.7702, "step": 740 }, { - "epoch": 3.303769401330377, - "grad_norm": 0.3851810693740845, - "learning_rate": 6.187510464015022e-05, - "loss": 1.2793, + "epoch": 3.1434599156118144, + "grad_norm": 0.32762596011161804, + "learning_rate": 0.0001732908742237752, + "loss": 0.7471, "step": 745 }, { - "epoch": 3.3259423503325944, - "grad_norm": 0.3811032474040985, - "learning_rate": 6.0444816151231375e-05, - "loss": 1.2803, + "epoch": 3.1645569620253164, + "grad_norm": 0.32086798548698425, + "learning_rate": 0.00017278788103694943, + "loss": 0.7618, "step": 750 }, { - "epoch": 3.3481152993348116, - "grad_norm": 0.36981481313705444, - "learning_rate": 5.902405723478346e-05, - "loss": 1.2612, + "epoch": 3.1856540084388185, + "grad_norm": 0.3558262586593628, + "learning_rate": 0.00017228094041358248, + "loss": 0.7764, "step": 755 }, { - "epoch": 3.3702882483370287, - "grad_norm": 0.37967202067375183, - "learning_rate": 5.76131701778025e-05, - "loss": 1.2924, + "epoch": 3.2067510548523206, + "grad_norm": 0.3397001326084137, + "learning_rate": 0.0001717700798461074, + "loss": 0.753, "step": 760 }, { - "epoch": 3.3924611973392462, - "grad_norm": 0.39395394921302795, - "learning_rate": 5.621249488897176e-05, - "loss": 1.2714, + "epoch": 3.2278481012658227, + "grad_norm": 0.30650395154953003, + "learning_rate": 0.00017125532703954365, + "loss": 0.7595, "step": 765 }, { - "epoch": 3.4146341463414633, - "grad_norm": 0.38406023383140564, - "learning_rate": 5.4822368816771406e-05, - "loss": 1.2885, + "epoch": 3.2489451476793247, + "grad_norm": 0.317777156829834, + "learning_rate": 0.0001707367099099951, + "loss": 0.7546, "step": 770 }, { - "epoch": 3.436807095343681, - "grad_norm": 0.372646301984787, - "learning_rate": 5.344312686818106e-05, - "loss": 1.2791, + "epoch": 3.270042194092827, + "grad_norm": 0.3183245062828064, + "learning_rate": 0.00017021425658313565, + "loss": 0.7633, "step": 775 }, { - "epoch": 3.458980044345898, - "grad_norm": 0.3789440393447876, - "learning_rate": 5.207510132799436e-05, - "loss": 1.2918, + "epoch": 3.291139240506329, + "grad_norm": 0.3169344365596771, + "learning_rate": 0.00016968799539268407, + "loss": 0.7759, "step": 780 }, { - "epoch": 3.481152993348115, - "grad_norm": 0.3723030388355255, - "learning_rate": 5.0718621778765476e-05, - "loss": 1.2772, + "epoch": 3.3122362869198314, + "grad_norm": 0.30704858899116516, + "learning_rate": 0.00016915795487886746, + "loss": 0.7565, "step": 785 }, { - "epoch": 3.5033259423503327, - "grad_norm": 0.373038649559021, - "learning_rate": 4.9374015021406914e-05, - "loss": 1.2677, + "epoch": 3.3333333333333335, + "grad_norm": 0.3002530038356781, + "learning_rate": 0.0001686241637868734, + "loss": 0.7509, "step": 790 }, { - "epoch": 3.52549889135255, - "grad_norm": 0.3948681354522705, - "learning_rate": 4.804160499645667e-05, - "loss": 1.2791, + "epoch": 3.3544303797468356, + "grad_norm": 0.3143273591995239, + "learning_rate": 0.00016808665106529094, + "loss": 0.7482, "step": 795 }, { - "epoch": 3.5476718403547673, - "grad_norm": 0.3771447241306305, - "learning_rate": 4.6721712706035236e-05, - "loss": 1.2895, + "epoch": 3.3755274261603376, + "grad_norm": 0.30195352435112, + "learning_rate": 0.00016754544586454094, + "loss": 0.762, "step": 800 }, { - "epoch": 3.5698447893569845, - "grad_norm": 0.37859615683555603, - "learning_rate": 4.5414656136510334e-05, - "loss": 1.2841, + "epoch": 3.3966244725738397, + "grad_norm": 0.32630935311317444, + "learning_rate": 0.00016700057753529484, + "loss": 0.7637, "step": 805 }, { - "epoch": 3.5920177383592016, - "grad_norm": 0.3813944160938263, - "learning_rate": 4.412075018188805e-05, - "loss": 1.2785, + "epoch": 3.4177215189873418, + "grad_norm": 0.31649506092071533, + "learning_rate": 0.0001664520756268832, + "loss": 0.7577, "step": 810 }, { - "epoch": 3.614190687361419, - "grad_norm": 0.37438470125198364, - "learning_rate": 4.2840306567949076e-05, - "loss": 1.2803, + "epoch": 3.438818565400844, + "grad_norm": 0.3301686644554138, + "learning_rate": 0.0001658999698856929, + "loss": 0.7534, "step": 815 }, { - "epoch": 3.6363636363636362, - "grad_norm": 0.36801373958587646, - "learning_rate": 4.157363377714819e-05, - "loss": 1.3009, + "epoch": 3.459915611814346, + "grad_norm": 0.3230050802230835, + "learning_rate": 0.00016534429025355426, + "loss": 0.7567, "step": 820 }, { - "epoch": 3.658536585365854, - "grad_norm": 0.37997427582740784, - "learning_rate": 4.0321036974295156e-05, - "loss": 1.2833, + "epoch": 3.481012658227848, + "grad_norm": 0.30652645230293274, + "learning_rate": 0.00016478506686611697, + "loss": 0.757, "step": 825 }, { - "epoch": 3.680709534368071, - "grad_norm": 0.36499011516571045, - "learning_rate": 3.9082817933035134e-05, - "loss": 1.2836, + "epoch": 3.50210970464135, + "grad_norm": 0.32210221886634827, + "learning_rate": 0.0001642223300512158, + "loss": 0.7734, "step": 830 }, { - "epoch": 3.7028824833702885, - "grad_norm": 0.3749338984489441, - "learning_rate": 3.785927496314543e-05, - "loss": 1.2869, + "epoch": 3.523206751054852, + "grad_norm": 0.3032419681549072, + "learning_rate": 0.00016365611032722604, + "loss": 0.7519, "step": 835 }, { - "epoch": 3.7250554323725056, - "grad_norm": 0.3751271665096283, - "learning_rate": 3.6650702838667464e-05, - "loss": 1.2732, + "epoch": 3.5443037974683547, + "grad_norm": 0.2990473508834839, + "learning_rate": 0.00016308643840140828, + "loss": 0.7579, "step": 840 }, { - "epoch": 3.7472283813747227, - "grad_norm": 0.3683511018753052, - "learning_rate": 3.5457392726890236e-05, - "loss": 1.2787, + "epoch": 3.5654008438818563, + "grad_norm": 0.32090187072753906, + "learning_rate": 0.000162513345168243, + "loss": 0.7569, "step": 845 }, { - "epoch": 3.7694013303769403, - "grad_norm": 0.38828131556510925, - "learning_rate": 3.427963211820274e-05, - "loss": 1.2811, + "epoch": 3.586497890295359, + "grad_norm": 0.3112528920173645, + "learning_rate": 0.00016193686170775537, + "loss": 0.7752, "step": 850 }, { - "epoch": 3.7915742793791574, - "grad_norm": 0.37317919731140137, - "learning_rate": 3.3117704756832226e-05, - "loss": 1.289, + "epoch": 3.607594936708861, + "grad_norm": 0.311675488948822, + "learning_rate": 0.00016135701928382952, + "loss": 0.7523, "step": 855 }, { - "epoch": 3.8137472283813745, - "grad_norm": 0.3718849718570709, - "learning_rate": 3.197189057248491e-05, - "loss": 1.2764, + "epoch": 3.628691983122363, + "grad_norm": 0.316641628742218, + "learning_rate": 0.000160773849342513, + "loss": 0.7651, "step": 860 }, { - "epoch": 3.835920177383592, - "grad_norm": 0.3691292107105255, - "learning_rate": 3.0842465612905837e-05, - "loss": 1.2862, + "epoch": 3.649789029535865, + "grad_norm": 0.32175716757774353, + "learning_rate": 0.00016018738351031156, + "loss": 0.7646, "step": 865 }, { - "epoch": 3.858093126385809, - "grad_norm": 0.3802024722099304, - "learning_rate": 2.9729701977374035e-05, - "loss": 1.2838, + "epoch": 3.670886075949367, + "grad_norm": 0.30499377846717834, + "learning_rate": 0.00015959765359247388, + "loss": 0.7654, "step": 870 }, { - "epoch": 3.8802660753880267, - "grad_norm": 0.38017788529396057, - "learning_rate": 2.863386775114848e-05, - "loss": 1.2934, + "epoch": 3.691983122362869, + "grad_norm": 0.3078381419181824, + "learning_rate": 0.0001590046915712667, + "loss": 0.7682, "step": 875 }, { - "epoch": 3.902439024390244, - "grad_norm": 0.38141724467277527, - "learning_rate": 2.7555226940881583e-05, - "loss": 1.2829, + "epoch": 3.7130801687763713, + "grad_norm": 0.3422172963619232, + "learning_rate": 0.00015840852960424036, + "loss": 0.7504, "step": 880 }, { - "epoch": 3.9246119733924614, - "grad_norm": 0.38505685329437256, - "learning_rate": 2.6494039411015193e-05, - "loss": 1.3002, + "epoch": 3.7341772151898733, + "grad_norm": 0.30137816071510315, + "learning_rate": 0.00015780920002248484, + "loss": 0.75, "step": 885 }, { - "epoch": 3.9467849223946785, - "grad_norm": 0.3752408027648926, - "learning_rate": 2.545056082117433e-05, - "loss": 1.2971, + "epoch": 3.7552742616033754, + "grad_norm": 0.31054186820983887, + "learning_rate": 0.00015720673532887647, + "loss": 0.7511, "step": 890 }, { - "epoch": 3.9689578713968956, - "grad_norm": 0.3711921274662018, - "learning_rate": 2.4425042564574184e-05, - "loss": 1.282, + "epoch": 3.7763713080168775, + "grad_norm": 0.3199822008609772, + "learning_rate": 0.00015660116819631506, + "loss": 0.7659, "step": 895 }, { - "epoch": 3.991130820399113, - "grad_norm": 0.37682175636291504, - "learning_rate": 2.3417731707454737e-05, - "loss": 1.268, + "epoch": 3.7974683544303796, + "grad_norm": 0.30703869462013245, + "learning_rate": 0.0001559925314659521, + "loss": 0.7641, "step": 900 }, { - "epoch": 4.0, - "eval_loss": 1.796044945716858, - "eval_runtime": 0.3372, - "eval_samples_per_second": 2.965, - "eval_steps_per_second": 2.965, - "step": 902 - }, - { - "epoch": 4.013303769401331, - "grad_norm": 0.37139764428138733, - "learning_rate": 2.242887092955801e-05, - "loss": 1.2666, + "epoch": 3.818565400843882, + "grad_norm": 0.3145774006843567, + "learning_rate": 0.00015538085814540962, + "loss": 0.7589, "step": 905 }, { - "epoch": 4.035476718403547, - "grad_norm": 0.3784310221672058, - "learning_rate": 2.1458698465662187e-05, - "loss": 1.2518, + "epoch": 3.8396624472573837, + "grad_norm": 0.3188943862915039, + "learning_rate": 0.00015476618140699034, + "loss": 0.7615, "step": 910 }, { - "epoch": 4.057649667405765, - "grad_norm": 0.3832833766937256, - "learning_rate": 2.0507448048186208e-05, - "loss": 1.2522, + "epoch": 3.8607594936708862, + "grad_norm": 0.32847416400909424, + "learning_rate": 0.00015414853458587833, + "loss": 0.7569, "step": 915 }, { - "epoch": 4.0798226164079825, - "grad_norm": 0.38216501474380493, - "learning_rate": 1.957534885087944e-05, - "loss": 1.2616, + "epoch": 3.8818565400843883, + "grad_norm": 0.33269551396369934, + "learning_rate": 0.00015352795117833145, + "loss": 0.7539, "step": 920 }, { - "epoch": 4.101995565410199, - "grad_norm": 0.37775281071662903, - "learning_rate": 1.866262543360958e-05, - "loss": 1.2696, + "epoch": 3.9029535864978904, + "grad_norm": 0.30027341842651367, + "learning_rate": 0.00015290446483986472, + "loss": 0.76, "step": 925 }, { - "epoch": 4.124168514412417, - "grad_norm": 0.3922324478626251, - "learning_rate": 1.7769497688261973e-05, - "loss": 1.2572, + "epoch": 3.9240506329113924, + "grad_norm": 0.3010607063770294, + "learning_rate": 0.00015227810938342492, + "loss": 0.7574, "step": 930 }, { - "epoch": 4.146341463414634, - "grad_norm": 0.39753395318984985, - "learning_rate": 1.6896180785763593e-05, - "loss": 1.2574, + "epoch": 3.9451476793248945, + "grad_norm": 0.30083900690078735, + "learning_rate": 0.0001516489187775572, + "loss": 0.7556, "step": 935 }, { - "epoch": 4.168514412416852, - "grad_norm": 0.39192479848861694, - "learning_rate": 1.604288512424439e-05, - "loss": 1.2624, + "epoch": 3.9662447257383966, + "grad_norm": 0.30435124039649963, + "learning_rate": 0.00015101692714456259, + "loss": 0.7612, "step": 940 }, { - "epoch": 4.1906873614190685, - "grad_norm": 0.3945992588996887, - "learning_rate": 1.520981627834851e-05, - "loss": 1.2756, + "epoch": 3.9873417721518987, + "grad_norm": 0.31260964274406433, + "learning_rate": 0.00015038216875864756, + "loss": 0.7533, "step": 945 }, { - "epoch": 4.212860310421286, - "grad_norm": 0.37924352288246155, - "learning_rate": 1.4397174949707725e-05, - "loss": 1.257, + "epoch": 4.0, + "eval_loss": 1.6894222497940063, + "eval_runtime": 0.5552, + "eval_samples_per_second": 3.602, + "eval_steps_per_second": 1.801, + "step": 948 + }, + { + "epoch": 4.008438818565401, + "grad_norm": 0.3140685558319092, + "learning_rate": 0.00014974467804406533, + "loss": 0.749, "step": 950 }, { - "epoch": 4.235033259423504, - "grad_norm": 0.373811274766922, - "learning_rate": 1.3605156918588469e-05, - "loss": 1.2703, + "epoch": 4.029535864978903, + "grad_norm": 0.3326011896133423, + "learning_rate": 0.00014910448957324897, + "loss": 0.7177, "step": 955 }, { - "epoch": 4.25720620842572, - "grad_norm": 0.3824734389781952, - "learning_rate": 1.2833952996724863e-05, - "loss": 1.2635, + "epoch": 4.050632911392405, + "grad_norm": 0.32034996151924133, + "learning_rate": 0.00014846163806493627, + "loss": 0.7061, "step": 960 }, { - "epoch": 4.279379157427938, - "grad_norm": 0.3803390562534332, - "learning_rate": 1.208374898134883e-05, - "loss": 1.2715, + "epoch": 4.071729957805907, + "grad_norm": 0.31769704818725586, + "learning_rate": 0.00014781615838228715, + "loss": 0.6986, "step": 965 }, { - "epoch": 4.301552106430155, - "grad_norm": 0.38187548518180847, - "learning_rate": 1.1354725610427807e-05, - "loss": 1.2775, + "epoch": 4.0928270042194095, + "grad_norm": 0.35571393370628357, + "learning_rate": 0.00014716808553099286, + "loss": 0.7042, "step": 970 }, { - "epoch": 4.323725055432373, - "grad_norm": 0.3768659234046936, - "learning_rate": 1.0647058519121821e-05, - "loss": 1.2394, + "epoch": 4.113924050632911, + "grad_norm": 0.33056944608688354, + "learning_rate": 0.00014651745465737737, + "loss": 0.7195, "step": 975 }, { - "epoch": 4.34589800443459, - "grad_norm": 0.3807064890861511, - "learning_rate": 9.960918197469771e-06, - "loss": 1.2673, + "epoch": 4.135021097046414, + "grad_norm": 0.35726672410964966, + "learning_rate": 0.00014586430104649163, + "loss": 0.7245, "step": 980 }, { - "epoch": 4.368070953436807, - "grad_norm": 0.38065963983535767, - "learning_rate": 9.296469949315156e-06, - "loss": 1.2713, + "epoch": 4.156118143459915, + "grad_norm": 0.3273336887359619, + "learning_rate": 0.0001452086601201997, + "loss": 0.709, "step": 985 }, { - "epoch": 4.390243902439025, - "grad_norm": 0.3850005865097046, - "learning_rate": 8.653873852481364e-06, - "loss": 1.2468, + "epoch": 4.177215189873418, + "grad_norm": 0.33940553665161133, + "learning_rate": 0.00014455056743525792, + "loss": 0.7115, "step": 990 }, { - "epoch": 4.412416851441241, - "grad_norm": 0.3868677020072937, - "learning_rate": 8.033284720205946e-06, - "loss": 1.2654, + "epoch": 4.198312236286919, + "grad_norm": 0.34996211528778076, + "learning_rate": 0.00014389005868138658, + "loss": 0.7078, "step": 995 }, { - "epoch": 4.434589800443459, - "grad_norm": 0.37283623218536377, - "learning_rate": 7.434852063843278e-06, - "loss": 1.2616, + "epoch": 4.219409282700422, + "grad_norm": 0.33837664127349854, + "learning_rate": 0.00014322716967933428, + "loss": 0.7042, "step": 1000 }, { - "epoch": 4.4567627494456765, - "grad_norm": 0.3784988522529602, - "learning_rate": 6.858720056844614e-06, - "loss": 1.2602, + "epoch": 4.2405063291139244, + "grad_norm": 0.3329886198043823, + "learning_rate": 0.0001425619363789354, + "loss": 0.7212, "step": 1005 }, { - "epoch": 4.478935698447893, - "grad_norm": 0.389172226190567, - "learning_rate": 6.3050275000238414e-06, - "loss": 1.2671, + "epoch": 4.261603375527426, + "grad_norm": 0.35570377111434937, + "learning_rate": 0.00014189439485716053, + "loss": 0.7088, "step": 1010 }, { - "epoch": 4.501108647450111, - "grad_norm": 0.37874796986579895, - "learning_rate": 5.77390778811796e-06, - "loss": 1.2743, + "epoch": 4.282700421940929, + "grad_norm": 0.3659791648387909, + "learning_rate": 0.00014122458131615975, + "loss": 0.7023, "step": 1015 }, { - "epoch": 4.523281596452328, - "grad_norm": 0.39037781953811646, - "learning_rate": 5.265488877649816e-06, - "loss": 1.2684, + "epoch": 4.30379746835443, + "grad_norm": 0.3362638056278229, + "learning_rate": 0.00014055253208129938, + "loss": 0.7138, "step": 1020 }, { - "epoch": 4.545454545454545, - "grad_norm": 0.3832471966743469, - "learning_rate": 4.7798932561009865e-06, - "loss": 1.2674, + "epoch": 4.324894514767933, + "grad_norm": 0.3303203284740448, + "learning_rate": 0.00013987828359919222, + "loss": 0.7085, "step": 1025 }, { - "epoch": 4.5676274944567625, - "grad_norm": 0.3740783631801605, - "learning_rate": 4.317237912402316e-06, - "loss": 1.2686, + "epoch": 4.345991561181434, + "grad_norm": 0.32455962896347046, + "learning_rate": 0.00013920187243572057, + "loss": 0.7142, "step": 1030 }, { - "epoch": 4.58980044345898, - "grad_norm": 0.38245856761932373, - "learning_rate": 3.877634308749078e-06, - "loss": 1.2457, + "epoch": 4.367088607594937, + "grad_norm": 0.33820950984954834, + "learning_rate": 0.00013852333527405346, + "loss": 0.7198, "step": 1035 }, { - "epoch": 4.611973392461198, - "grad_norm": 0.38514626026153564, - "learning_rate": 3.461188353747702e-06, - "loss": 1.2631, + "epoch": 4.3881856540084385, + "grad_norm": 0.3443733751773834, + "learning_rate": 0.00013784270891265717, + "loss": 0.7281, "step": 1040 }, { - "epoch": 4.634146341463414, - "grad_norm": 0.38133639097213745, - "learning_rate": 3.068000376900515e-06, - "loss": 1.2775, + "epoch": 4.409282700421941, + "grad_norm": 0.3376203179359436, + "learning_rate": 0.00013716003026329965, + "loss": 0.7157, "step": 1045 }, { - "epoch": 4.656319290465632, - "grad_norm": 0.3845633268356323, - "learning_rate": 2.6981651044344024e-06, - "loss": 1.2642, + "epoch": 4.430379746835443, + "grad_norm": 0.3343973159790039, + "learning_rate": 0.0001364753363490485, + "loss": 0.7157, "step": 1050 }, { - "epoch": 4.678492239467849, - "grad_norm": 0.38162142038345337, - "learning_rate": 2.3517716364795385e-06, - "loss": 1.2692, + "epoch": 4.451476793248945, + "grad_norm": 0.32973435521125793, + "learning_rate": 0.00013578866430226342, + "loss": 0.7183, "step": 1055 }, { - "epoch": 4.700665188470067, - "grad_norm": 0.392133504152298, - "learning_rate": 2.028903425603612e-06, - "loss": 1.2628, + "epoch": 4.472573839662447, + "grad_norm": 0.3444620370864868, + "learning_rate": 0.00013510005136258227, + "loss": 0.7196, "step": 1060 }, { - "epoch": 4.722838137472284, - "grad_norm": 0.374054878950119, - "learning_rate": 1.7296382567064672e-06, - "loss": 1.2611, + "epoch": 4.493670886075949, + "grad_norm": 0.33004656434059143, + "learning_rate": 0.00013440953487490144, + "loss": 0.7139, "step": 1065 }, { - "epoch": 4.745011086474501, - "grad_norm": 0.38326382637023926, - "learning_rate": 1.4540482282803137e-06, - "loss": 1.2453, + "epoch": 4.514767932489452, + "grad_norm": 0.3244040608406067, + "learning_rate": 0.00013371715228735077, + "loss": 0.7144, "step": 1070 }, { - "epoch": 4.767184035476719, - "grad_norm": 0.39525189995765686, - "learning_rate": 1.2021997350399106e-06, - "loss": 1.2849, + "epoch": 4.5358649789029535, + "grad_norm": 0.3370364308357239, + "learning_rate": 0.0001330229411492625, + "loss": 0.7014, "step": 1075 }, { - "epoch": 4.789356984478935, - "grad_norm": 0.38262608647346497, - "learning_rate": 9.741534519267736e-07, - "loss": 1.2696, + "epoch": 4.556962025316456, + "grad_norm": 0.3164542317390442, + "learning_rate": 0.00013232693910913485, + "loss": 0.7124, "step": 1080 }, { - "epoch": 4.811529933481153, - "grad_norm": 0.38532087206840515, - "learning_rate": 7.699643194915784e-07, - "loss": 1.26, + "epoch": 4.578059071729958, + "grad_norm": 0.3478745222091675, + "learning_rate": 0.0001316291839125904, + "loss": 0.7253, "step": 1085 }, { - "epoch": 4.8337028824833705, - "grad_norm": 0.3865302503108978, - "learning_rate": 5.896815306578818e-07, - "loss": 1.2568, + "epoch": 4.59915611814346, + "grad_norm": 0.33551761507987976, + "learning_rate": 0.00013092971340032905, + "loss": 0.7237, "step": 1090 }, { - "epoch": 4.855875831485587, - "grad_norm": 0.3837885856628418, - "learning_rate": 4.333485188706576e-07, - "loss": 1.2644, + "epoch": 4.620253164556962, + "grad_norm": 0.3593490421772003, + "learning_rate": 0.00013022856550607572, + "loss": 0.7187, "step": 1095 }, { - "epoch": 4.878048780487805, - "grad_norm": 0.37372496724128723, - "learning_rate": 3.0100294763238946e-07, - "loss": 1.2764, + "epoch": 4.641350210970464, + "grad_norm": 0.33983170986175537, + "learning_rate": 0.0001295257782545233, + "loss": 0.715, "step": 1100 }, { - "epoch": 4.900221729490022, - "grad_norm": 0.38477322459220886, - "learning_rate": 1.9267670142926187e-07, - "loss": 1.2529, + "epoch": 4.662447257383966, + "grad_norm": 0.3238469064235687, + "learning_rate": 0.00012882138975927026, + "loss": 0.7024, "step": 1105 }, { - "epoch": 4.922394678492239, - "grad_norm": 0.3782412111759186, - "learning_rate": 1.0839587804954975e-07, - "loss": 1.2552, + "epoch": 4.6835443037974684, + "grad_norm": 0.3401734232902527, + "learning_rate": 0.00012811543822075397, + "loss": 0.7175, "step": 1110 }, { - "epoch": 4.9445676274944566, - "grad_norm": 0.3779396116733551, - "learning_rate": 4.818078229622547e-08, - "loss": 1.2773, + "epoch": 4.70464135021097, + "grad_norm": 0.35343295335769653, + "learning_rate": 0.00012740796192417875, + "loss": 0.7445, "step": 1115 }, { - "epoch": 4.966740576496674, - "grad_norm": 0.38919439911842346, - "learning_rate": 1.2045921095127366e-08, - "loss": 1.2647, + "epoch": 4.725738396624473, + "grad_norm": 0.3328774869441986, + "learning_rate": 0.00012669899923743968, + "loss": 0.7007, "step": 1120 }, { - "epoch": 4.988913525498892, - "grad_norm": 0.38752755522727966, - "learning_rate": 0.0, - "loss": 1.2695, + "epoch": 4.746835443037975, + "grad_norm": 0.341886967420578, + "learning_rate": 0.00012598858860904193, + "loss": 0.7275, "step": 1125 }, { - "epoch": 4.988913525498892, - "eval_loss": 1.8087279796600342, - "eval_runtime": 0.3391, - "eval_samples_per_second": 2.949, - "eval_steps_per_second": 2.949, - "step": 1125 + "epoch": 4.767932489451477, + "grad_norm": 0.33224210143089294, + "learning_rate": 0.00012527676856601542, + "loss": 0.7093, + "step": 1130 + }, + { + "epoch": 4.789029535864979, + "grad_norm": 0.3608289062976837, + "learning_rate": 0.0001245635777118256, + "loss": 0.7237, + "step": 1135 + }, + { + "epoch": 4.810126582278481, + "grad_norm": 0.3258775472640991, + "learning_rate": 0.00012384905472427975, + "loss": 0.7068, + "step": 1140 + }, + { + "epoch": 4.831223628691983, + "grad_norm": 0.3356561064720154, + "learning_rate": 0.0001231332383534296, + "loss": 0.7208, + "step": 1145 + }, + { + "epoch": 4.852320675105485, + "grad_norm": 0.32746249437332153, + "learning_rate": 0.00012241616741946962, + "loss": 0.7143, + "step": 1150 + }, + { + "epoch": 4.8734177215189876, + "grad_norm": 0.33153674006462097, + "learning_rate": 0.0001216978808106318, + "loss": 0.726, + "step": 1155 + }, + { + "epoch": 4.894514767932489, + "grad_norm": 0.33083775639533997, + "learning_rate": 0.00012097841748107681, + "loss": 0.7015, + "step": 1160 + }, + { + "epoch": 4.915611814345992, + "grad_norm": 0.3352629542350769, + "learning_rate": 0.00012025781644878118, + "loss": 0.7234, + "step": 1165 + }, + { + "epoch": 4.936708860759493, + "grad_norm": 0.3423599898815155, + "learning_rate": 0.00011953611679342143, + "loss": 0.733, + "step": 1170 + }, + { + "epoch": 4.957805907172996, + "grad_norm": 0.3402250409126282, + "learning_rate": 0.00011881335765425473, + "loss": 0.7187, + "step": 1175 + }, + { + "epoch": 4.978902953586498, + "grad_norm": 0.345759779214859, + "learning_rate": 0.00011808957822799614, + "loss": 0.7119, + "step": 1180 + }, + { + "epoch": 5.0, + "grad_norm": 0.3274582326412201, + "learning_rate": 0.00011736481776669306, + "loss": 0.716, + "step": 1185 + }, + { + "epoch": 5.0, + "eval_loss": 1.7251324653625488, + "eval_runtime": 0.554, + "eval_samples_per_second": 3.61, + "eval_steps_per_second": 1.805, + "step": 1185 + }, + { + "epoch": 5.0210970464135025, + "grad_norm": 0.36755794286727905, + "learning_rate": 0.0001166391155755964, + "loss": 0.6589, + "step": 1190 + }, + { + "epoch": 5.042194092827004, + "grad_norm": 0.3618139624595642, + "learning_rate": 0.00011591251101102906, + "loss": 0.6697, + "step": 1195 + }, + { + "epoch": 5.063291139240507, + "grad_norm": 0.38643401861190796, + "learning_rate": 0.00011518504347825145, + "loss": 0.661, + "step": 1200 + }, + { + "epoch": 5.084388185654008, + "grad_norm": 0.3515397012233734, + "learning_rate": 0.00011445675242932457, + "loss": 0.6455, + "step": 1205 + }, + { + "epoch": 5.105485232067511, + "grad_norm": 0.37624698877334595, + "learning_rate": 0.00011372767736097039, + "loss": 0.6628, + "step": 1210 + }, + { + "epoch": 5.1265822784810124, + "grad_norm": 0.3468095660209656, + "learning_rate": 0.00011299785781242982, + "loss": 0.6591, + "step": 1215 + }, + { + "epoch": 5.147679324894515, + "grad_norm": 0.3849187195301056, + "learning_rate": 0.00011226733336331855, + "loss": 0.6726, + "step": 1220 + }, + { + "epoch": 5.168776371308017, + "grad_norm": 0.36786338686943054, + "learning_rate": 0.00011153614363148032, + "loss": 0.6795, + "step": 1225 + }, + { + "epoch": 5.189873417721519, + "grad_norm": 0.35997211933135986, + "learning_rate": 0.00011080432827083873, + "loss": 0.676, + "step": 1230 + }, + { + "epoch": 5.210970464135021, + "grad_norm": 0.3702506721019745, + "learning_rate": 0.00011007192696924638, + "loss": 0.6734, + "step": 1235 + }, + { + "epoch": 5.232067510548523, + "grad_norm": 0.35727155208587646, + "learning_rate": 0.00010933897944633265, + "loss": 0.6719, + "step": 1240 + }, + { + "epoch": 5.253164556962025, + "grad_norm": 0.35158923268318176, + "learning_rate": 0.0001086055254513497, + "loss": 0.6572, + "step": 1245 + }, + { + "epoch": 5.274261603375527, + "grad_norm": 0.3676392734050751, + "learning_rate": 0.00010787160476101668, + "loss": 0.663, + "step": 1250 + }, + { + "epoch": 5.29535864978903, + "grad_norm": 0.35416457056999207, + "learning_rate": 0.00010713725717736254, + "loss": 0.6619, + "step": 1255 + }, + { + "epoch": 5.3164556962025316, + "grad_norm": 0.36827412247657776, + "learning_rate": 0.00010640252252556759, + "loss": 0.6861, + "step": 1260 + }, + { + "epoch": 5.337552742616034, + "grad_norm": 0.37270885705947876, + "learning_rate": 0.00010566744065180368, + "loss": 0.6842, + "step": 1265 + }, + { + "epoch": 5.358649789029536, + "grad_norm": 0.6396368741989136, + "learning_rate": 0.00010493205142107312, + "loss": 0.6648, + "step": 1270 + }, + { + "epoch": 5.379746835443038, + "grad_norm": 0.3901231288909912, + "learning_rate": 0.00010419639471504682, + "loss": 0.6682, + "step": 1275 + }, + { + "epoch": 5.40084388185654, + "grad_norm": 0.3932683765888214, + "learning_rate": 0.0001034605104299016, + "loss": 0.6715, + "step": 1280 + }, + { + "epoch": 5.421940928270042, + "grad_norm": 0.3795235753059387, + "learning_rate": 0.00010272443847415615, + "loss": 0.6826, + "step": 1285 + }, + { + "epoch": 5.443037974683544, + "grad_norm": 0.3844228982925415, + "learning_rate": 0.00010198821876650701, + "loss": 0.6624, + "step": 1290 + }, + { + "epoch": 5.4641350210970465, + "grad_norm": 0.37277084589004517, + "learning_rate": 0.00010125189123366368, + "loss": 0.6818, + "step": 1295 + }, + { + "epoch": 5.485232067510548, + "grad_norm": 0.3795084059238434, + "learning_rate": 0.0001005154958081831, + "loss": 0.6688, + "step": 1300 + }, + { + "epoch": 5.506329113924051, + "grad_norm": 0.37196341156959534, + "learning_rate": 9.977907242630426e-05, + "loss": 0.6627, + "step": 1305 + }, + { + "epoch": 5.527426160337553, + "grad_norm": 0.3792167603969574, + "learning_rate": 9.904266102578231e-05, + "loss": 0.6768, + "step": 1310 + }, + { + "epoch": 5.548523206751055, + "grad_norm": 0.3688276410102844, + "learning_rate": 9.830630154372252e-05, + "loss": 0.6663, + "step": 1315 + }, + { + "epoch": 5.569620253164557, + "grad_norm": 0.3876282870769501, + "learning_rate": 9.75700339144146e-05, + "loss": 0.6757, + "step": 1320 + }, + { + "epoch": 5.590717299578059, + "grad_norm": 0.35115256905555725, + "learning_rate": 9.68338980671669e-05, + "loss": 0.6846, + "step": 1325 + }, + { + "epoch": 5.6118143459915615, + "grad_norm": 0.3650346100330353, + "learning_rate": 9.609793392414086e-05, + "loss": 0.6948, + "step": 1330 + }, + { + "epoch": 5.632911392405063, + "grad_norm": 0.3864571750164032, + "learning_rate": 9.536218139818614e-05, + "loss": 0.6712, + "step": 1335 + }, + { + "epoch": 5.654008438818566, + "grad_norm": 0.36888009309768677, + "learning_rate": 9.462668039067602e-05, + "loss": 0.6705, + "step": 1340 + }, + { + "epoch": 5.675105485232067, + "grad_norm": 0.36247017979621887, + "learning_rate": 9.389147078934329e-05, + "loss": 0.6696, + "step": 1345 + }, + { + "epoch": 5.69620253164557, + "grad_norm": 0.3620111048221588, + "learning_rate": 9.31565924661172e-05, + "loss": 0.6686, + "step": 1350 + }, + { + "epoch": 5.717299578059071, + "grad_norm": 0.3552044630050659, + "learning_rate": 9.242208527496121e-05, + "loss": 0.6922, + "step": 1355 + }, + { + "epoch": 5.738396624472574, + "grad_norm": 0.36270490288734436, + "learning_rate": 9.168798904971143e-05, + "loss": 0.6625, + "step": 1360 + }, + { + "epoch": 5.759493670886076, + "grad_norm": 0.3620161712169647, + "learning_rate": 9.095434360191642e-05, + "loss": 0.6684, + "step": 1365 + }, + { + "epoch": 5.780590717299578, + "grad_norm": 0.37736937403678894, + "learning_rate": 9.02211887186783e-05, + "loss": 0.6896, + "step": 1370 + }, + { + "epoch": 5.80168776371308, + "grad_norm": 0.4165714979171753, + "learning_rate": 8.948856416049475e-05, + "loss": 0.6704, + "step": 1375 + }, + { + "epoch": 5.822784810126582, + "grad_norm": 0.3674893081188202, + "learning_rate": 8.875650965910279e-05, + "loss": 0.6871, + "step": 1380 + }, + { + "epoch": 5.843881856540085, + "grad_norm": 0.39419984817504883, + "learning_rate": 8.802506491532421e-05, + "loss": 0.6941, + "step": 1385 + }, + { + "epoch": 5.864978902953586, + "grad_norm": 0.3581133782863617, + "learning_rate": 8.72942695969123e-05, + "loss": 0.6815, + "step": 1390 + }, + { + "epoch": 5.886075949367089, + "grad_norm": 0.3663847744464874, + "learning_rate": 8.656416333640066e-05, + "loss": 0.6792, + "step": 1395 + }, + { + "epoch": 5.9071729957805905, + "grad_norm": 0.39068740606307983, + "learning_rate": 8.583478572895394e-05, + "loss": 0.6689, + "step": 1400 + }, + { + "epoch": 5.928270042194093, + "grad_norm": 0.3782387971878052, + "learning_rate": 8.510617633022044e-05, + "loss": 0.6825, + "step": 1405 + }, + { + "epoch": 5.949367088607595, + "grad_norm": 0.3802437484264374, + "learning_rate": 8.437837465418684e-05, + "loss": 0.669, + "step": 1410 + }, + { + "epoch": 5.970464135021097, + "grad_norm": 0.35812443494796753, + "learning_rate": 8.365142017103542e-05, + "loss": 0.6788, + "step": 1415 + }, + { + "epoch": 5.991561181434599, + "grad_norm": 0.36878547072410583, + "learning_rate": 8.292535230500342e-05, + "loss": 0.6876, + "step": 1420 + }, + { + "epoch": 6.0, + "eval_loss": 1.782979130744934, + "eval_runtime": 0.5539, + "eval_samples_per_second": 3.611, + "eval_steps_per_second": 1.805, + "step": 1422 + }, + { + "epoch": 6.012658227848101, + "grad_norm": 0.40747499465942383, + "learning_rate": 8.2200210432245e-05, + "loss": 0.6326, + "step": 1425 + }, + { + "epoch": 6.033755274261603, + "grad_norm": 0.40314236283302307, + "learning_rate": 8.147603387869582e-05, + "loss": 0.6234, + "step": 1430 + }, + { + "epoch": 6.0548523206751055, + "grad_norm": 0.3922586739063263, + "learning_rate": 8.075286191794025e-05, + "loss": 0.6238, + "step": 1435 + }, + { + "epoch": 6.075949367088608, + "grad_norm": 0.41105443239212036, + "learning_rate": 8.003073376908163e-05, + "loss": 0.6312, + "step": 1440 + }, + { + "epoch": 6.09704641350211, + "grad_norm": 0.3970966339111328, + "learning_rate": 7.930968859461516e-05, + "loss": 0.6233, + "step": 1445 + }, + { + "epoch": 6.118143459915612, + "grad_norm": 0.42427581548690796, + "learning_rate": 7.85897654983041e-05, + "loss": 0.6348, + "step": 1450 + }, + { + "epoch": 6.139240506329114, + "grad_norm": 0.38989487290382385, + "learning_rate": 7.787100352305908e-05, + "loss": 0.6237, + "step": 1455 + }, + { + "epoch": 6.160337552742616, + "grad_norm": 0.4042844772338867, + "learning_rate": 7.715344164882085e-05, + "loss": 0.6232, + "step": 1460 + }, + { + "epoch": 6.181434599156118, + "grad_norm": 0.40070950984954834, + "learning_rate": 7.643711879044612e-05, + "loss": 0.6173, + "step": 1465 + }, + { + "epoch": 6.2025316455696204, + "grad_norm": 0.40951260924339294, + "learning_rate": 7.572207379559721e-05, + "loss": 0.6369, + "step": 1470 + }, + { + "epoch": 6.223628691983122, + "grad_norm": 0.40946945548057556, + "learning_rate": 7.50083454426354e-05, + "loss": 0.6267, + "step": 1475 + }, + { + "epoch": 6.244725738396625, + "grad_norm": 0.40567830204963684, + "learning_rate": 7.429597243851764e-05, + "loss": 0.616, + "step": 1480 + }, + { + "epoch": 6.265822784810126, + "grad_norm": 0.4094925820827484, + "learning_rate": 7.358499341669756e-05, + "loss": 0.6231, + "step": 1485 + }, + { + "epoch": 6.286919831223629, + "grad_norm": 0.396982878446579, + "learning_rate": 7.287544693503028e-05, + "loss": 0.6263, + "step": 1490 + }, + { + "epoch": 6.308016877637131, + "grad_norm": 0.41034215688705444, + "learning_rate": 7.216737147368127e-05, + "loss": 0.6466, + "step": 1495 + }, + { + "epoch": 6.329113924050633, + "grad_norm": 0.4219072163105011, + "learning_rate": 7.146080543303965e-05, + "loss": 0.6479, + "step": 1500 + }, + { + "epoch": 6.350210970464135, + "grad_norm": 0.39759665727615356, + "learning_rate": 7.075578713163541e-05, + "loss": 0.6235, + "step": 1505 + }, + { + "epoch": 6.371308016877637, + "grad_norm": 0.4137880504131317, + "learning_rate": 7.00523548040616e-05, + "loss": 0.6221, + "step": 1510 + }, + { + "epoch": 6.3924050632911396, + "grad_norm": 0.4084639847278595, + "learning_rate": 6.935054659890052e-05, + "loss": 0.633, + "step": 1515 + }, + { + "epoch": 6.413502109704641, + "grad_norm": 0.39727863669395447, + "learning_rate": 6.865040057665506e-05, + "loss": 0.6356, + "step": 1520 + }, + { + "epoch": 6.434599156118144, + "grad_norm": 0.4197627007961273, + "learning_rate": 6.795195470768444e-05, + "loss": 0.6355, + "step": 1525 + }, + { + "epoch": 6.455696202531645, + "grad_norm": 0.4036734402179718, + "learning_rate": 6.725524687014514e-05, + "loss": 0.6367, + "step": 1530 + }, + { + "epoch": 6.476793248945148, + "grad_norm": 0.4073878526687622, + "learning_rate": 6.656031484793657e-05, + "loss": 0.6367, + "step": 1535 + }, + { + "epoch": 6.4978902953586495, + "grad_norm": 0.4095742702484131, + "learning_rate": 6.586719632865198e-05, + "loss": 0.6292, + "step": 1540 + }, + { + "epoch": 6.518987341772152, + "grad_norm": 0.408542662858963, + "learning_rate": 6.517592890153476e-05, + "loss": 0.6312, + "step": 1545 + }, + { + "epoch": 6.540084388185654, + "grad_norm": 0.4064979553222656, + "learning_rate": 6.448655005543969e-05, + "loss": 0.6373, + "step": 1550 + }, + { + "epoch": 6.561181434599156, + "grad_norm": 0.4208141565322876, + "learning_rate": 6.379909717679985e-05, + "loss": 0.6289, + "step": 1555 + }, + { + "epoch": 6.582278481012658, + "grad_norm": 0.4085118770599365, + "learning_rate": 6.311360754759923e-05, + "loss": 0.6289, + "step": 1560 + }, + { + "epoch": 6.60337552742616, + "grad_norm": 0.4019670784473419, + "learning_rate": 6.243011834335075e-05, + "loss": 0.639, + "step": 1565 + }, + { + "epoch": 6.624472573839663, + "grad_norm": 0.4115982949733734, + "learning_rate": 6.17486666310801e-05, + "loss": 0.6437, + "step": 1570 + }, + { + "epoch": 6.6455696202531644, + "grad_norm": 0.40410783886909485, + "learning_rate": 6.106928936731571e-05, + "loss": 0.6439, + "step": 1575 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.3954565227031708, + "learning_rate": 6.039202339608432e-05, + "loss": 0.6339, + "step": 1580 + }, + { + "epoch": 6.687763713080169, + "grad_norm": 0.40417176485061646, + "learning_rate": 5.971690544691294e-05, + "loss": 0.6238, + "step": 1585 + }, + { + "epoch": 6.708860759493671, + "grad_norm": 0.40106064081192017, + "learning_rate": 5.90439721328369e-05, + "loss": 0.6183, + "step": 1590 + }, + { + "epoch": 6.729957805907173, + "grad_norm": 0.3997708261013031, + "learning_rate": 5.837325994841434e-05, + "loss": 0.6349, + "step": 1595 + }, + { + "epoch": 6.751054852320675, + "grad_norm": 0.40423154830932617, + "learning_rate": 5.770480526774693e-05, + "loss": 0.6319, + "step": 1600 + }, + { + "epoch": 6.772151898734177, + "grad_norm": 0.39728954434394836, + "learning_rate": 5.7038644342507205e-05, + "loss": 0.6454, + "step": 1605 + }, + { + "epoch": 6.793248945147679, + "grad_norm": 0.4143037497997284, + "learning_rate": 5.6374813299972805e-05, + "loss": 0.6532, + "step": 1610 + }, + { + "epoch": 6.814345991561181, + "grad_norm": 0.4104886054992676, + "learning_rate": 5.571334814106681e-05, + "loss": 0.6375, + "step": 1615 + }, + { + "epoch": 6.8354430379746836, + "grad_norm": 0.41742509603500366, + "learning_rate": 5.505428473840576e-05, + "loss": 0.6443, + "step": 1620 + }, + { + "epoch": 6.856540084388186, + "grad_norm": 0.4019664227962494, + "learning_rate": 5.4397658834353895e-05, + "loss": 0.6207, + "step": 1625 + }, + { + "epoch": 6.877637130801688, + "grad_norm": 0.4325370490550995, + "learning_rate": 5.3743506039084913e-05, + "loss": 0.6357, + "step": 1630 + }, + { + "epoch": 6.89873417721519, + "grad_norm": 0.4043619632720947, + "learning_rate": 5.309186182865076e-05, + "loss": 0.646, + "step": 1635 + }, + { + "epoch": 6.919831223628692, + "grad_norm": 0.4148579239845276, + "learning_rate": 5.244276154305758e-05, + "loss": 0.6417, + "step": 1640 + }, + { + "epoch": 6.940928270042194, + "grad_norm": 0.41201284527778625, + "learning_rate": 5.179624038434938e-05, + "loss": 0.6396, + "step": 1645 + }, + { + "epoch": 6.962025316455696, + "grad_norm": 0.4112018644809723, + "learning_rate": 5.115233341469877e-05, + "loss": 0.6391, + "step": 1650 + }, + { + "epoch": 6.9831223628691985, + "grad_norm": 0.42246147990226746, + "learning_rate": 5.0511075554505426e-05, + "loss": 0.6344, + "step": 1655 + }, + { + "epoch": 7.0, + "eval_loss": 1.8556586503982544, + "eval_runtime": 0.5548, + "eval_samples_per_second": 3.605, + "eval_steps_per_second": 1.802, + "step": 1659 + }, + { + "epoch": 7.0042194092827, + "grad_norm": 0.38922393321990967, + "learning_rate": 4.987250158050244e-05, + "loss": 0.6267, + "step": 1660 + }, + { + "epoch": 7.025316455696203, + "grad_norm": 0.4556201100349426, + "learning_rate": 4.923664612387019e-05, + "loss": 0.5894, + "step": 1665 + }, + { + "epoch": 7.046413502109704, + "grad_norm": 0.4320254325866699, + "learning_rate": 4.860354366835825e-05, + "loss": 0.6007, + "step": 1670 + }, + { + "epoch": 7.067510548523207, + "grad_norm": 0.41525062918663025, + "learning_rate": 4.7973228548415385e-05, + "loss": 0.5944, + "step": 1675 + }, + { + "epoch": 7.0886075949367084, + "grad_norm": 0.46430733799934387, + "learning_rate": 4.734573494732735e-05, + "loss": 0.5945, + "step": 1680 + }, + { + "epoch": 7.109704641350211, + "grad_norm": 0.421763151884079, + "learning_rate": 4.6721096895363114e-05, + "loss": 0.583, + "step": 1685 + }, + { + "epoch": 7.1308016877637135, + "grad_norm": 0.44340547919273376, + "learning_rate": 4.6099348267929334e-05, + "loss": 0.6034, + "step": 1690 + }, + { + "epoch": 7.151898734177215, + "grad_norm": 0.4334201216697693, + "learning_rate": 4.548052278373327e-05, + "loss": 0.592, + "step": 1695 + }, + { + "epoch": 7.172995780590718, + "grad_norm": 0.4375658631324768, + "learning_rate": 4.486465400295404e-05, + "loss": 0.5942, + "step": 1700 + }, + { + "epoch": 7.194092827004219, + "grad_norm": 0.4318469762802124, + "learning_rate": 4.4251775325422795e-05, + "loss": 0.6079, + "step": 1705 + }, + { + "epoch": 7.215189873417722, + "grad_norm": 0.4487842619419098, + "learning_rate": 4.364191998881104e-05, + "loss": 0.5938, + "step": 1710 + }, + { + "epoch": 7.236286919831223, + "grad_norm": 0.4327734112739563, + "learning_rate": 4.303512106682849e-05, + "loss": 0.5965, + "step": 1715 + }, + { + "epoch": 7.257383966244726, + "grad_norm": 0.4447080194950104, + "learning_rate": 4.243141146742905e-05, + "loss": 0.5953, + "step": 1720 + }, + { + "epoch": 7.2784810126582276, + "grad_norm": 0.4422175884246826, + "learning_rate": 4.183082393102636e-05, + "loss": 0.5849, + "step": 1725 + }, + { + "epoch": 7.29957805907173, + "grad_norm": 0.4476224184036255, + "learning_rate": 4.1233391028718116e-05, + "loss": 0.5962, + "step": 1730 + }, + { + "epoch": 7.320675105485232, + "grad_norm": 0.4534938931465149, + "learning_rate": 4.063914516051984e-05, + "loss": 0.5838, + "step": 1735 + }, + { + "epoch": 7.341772151898734, + "grad_norm": 0.45842060446739197, + "learning_rate": 4.004811855360748e-05, + "loss": 0.6046, + "step": 1740 + }, + { + "epoch": 7.362869198312236, + "grad_norm": 0.43340378999710083, + "learning_rate": 3.9460343260569964e-05, + "loss": 0.5972, + "step": 1745 + }, + { + "epoch": 7.383966244725738, + "grad_norm": 0.4477992057800293, + "learning_rate": 3.887585115767068e-05, + "loss": 0.6067, + "step": 1750 + }, + { + "epoch": 7.405063291139241, + "grad_norm": 0.44521939754486084, + "learning_rate": 3.82946739431189e-05, + "loss": 0.5959, + "step": 1755 + }, + { + "epoch": 7.4261603375527425, + "grad_norm": 0.42936068773269653, + "learning_rate": 3.771684313535062e-05, + "loss": 0.5963, + "step": 1760 + }, + { + "epoch": 7.447257383966245, + "grad_norm": 0.45330098271369934, + "learning_rate": 3.7142390071319454e-05, + "loss": 0.6001, + "step": 1765 + }, + { + "epoch": 7.468354430379747, + "grad_norm": 0.451648473739624, + "learning_rate": 3.65713459047969e-05, + "loss": 0.6104, + "step": 1770 + }, + { + "epoch": 7.489451476793249, + "grad_norm": 0.4406780004501343, + "learning_rate": 3.60037416046829e-05, + "loss": 0.5942, + "step": 1775 + }, + { + "epoch": 7.510548523206751, + "grad_norm": 0.4443998634815216, + "learning_rate": 3.543960795332653e-05, + "loss": 0.5919, + "step": 1780 + }, + { + "epoch": 7.531645569620253, + "grad_norm": 0.45104894042015076, + "learning_rate": 3.487897554485628e-05, + "loss": 0.5995, + "step": 1785 + }, + { + "epoch": 7.552742616033755, + "grad_norm": 0.45314210653305054, + "learning_rate": 3.43218747835211e-05, + "loss": 0.587, + "step": 1790 + }, + { + "epoch": 7.5738396624472575, + "grad_norm": 0.4450884163379669, + "learning_rate": 3.376833588204148e-05, + "loss": 0.5879, + "step": 1795 + }, + { + "epoch": 7.594936708860759, + "grad_norm": 0.44848042726516724, + "learning_rate": 3.3218388859970875e-05, + "loss": 0.598, + "step": 1800 + }, + { + "epoch": 7.616033755274262, + "grad_norm": 0.45061829686164856, + "learning_rate": 3.2672063542067734e-05, + "loss": 0.6111, + "step": 1805 + }, + { + "epoch": 7.637130801687764, + "grad_norm": 0.43524765968322754, + "learning_rate": 3.2129389556678016e-05, + "loss": 0.6004, + "step": 1810 + }, + { + "epoch": 7.658227848101266, + "grad_norm": 0.46142658591270447, + "learning_rate": 3.15903963341285e-05, + "loss": 0.594, + "step": 1815 + }, + { + "epoch": 7.679324894514768, + "grad_norm": 0.45434656739234924, + "learning_rate": 3.1055113105130506e-05, + "loss": 0.6002, + "step": 1820 + }, + { + "epoch": 7.70042194092827, + "grad_norm": 0.45925071835517883, + "learning_rate": 3.052356889919489e-05, + "loss": 0.5914, + "step": 1825 + }, + { + "epoch": 7.7215189873417724, + "grad_norm": 0.44464772939682007, + "learning_rate": 2.9995792543057478e-05, + "loss": 0.6064, + "step": 1830 + }, + { + "epoch": 7.742616033755274, + "grad_norm": 0.44117605686187744, + "learning_rate": 2.9471812659115917e-05, + "loss": 0.5993, + "step": 1835 + }, + { + "epoch": 7.763713080168777, + "grad_norm": 0.4454299509525299, + "learning_rate": 2.895165766387733e-05, + "loss": 0.5957, + "step": 1840 + }, + { + "epoch": 7.784810126582278, + "grad_norm": 0.4484660029411316, + "learning_rate": 2.843535576641725e-05, + "loss": 0.5985, + "step": 1845 + }, + { + "epoch": 7.805907172995781, + "grad_norm": 0.4591384828090668, + "learning_rate": 2.7922934966849823e-05, + "loss": 0.6044, + "step": 1850 + }, + { + "epoch": 7.827004219409282, + "grad_norm": 0.4372834861278534, + "learning_rate": 2.7414423054809302e-05, + "loss": 0.5958, + "step": 1855 + }, + { + "epoch": 7.848101265822785, + "grad_norm": 0.44407814741134644, + "learning_rate": 2.690984760794284e-05, + "loss": 0.5965, + "step": 1860 + }, + { + "epoch": 7.869198312236287, + "grad_norm": 0.44278717041015625, + "learning_rate": 2.6409235990415026e-05, + "loss": 0.6062, + "step": 1865 + }, + { + "epoch": 7.890295358649789, + "grad_norm": 0.4526854455471039, + "learning_rate": 2.591261535142383e-05, + "loss": 0.6035, + "step": 1870 + }, + { + "epoch": 7.911392405063291, + "grad_norm": 0.4361235499382019, + "learning_rate": 2.5420012623728208e-05, + "loss": 0.6041, + "step": 1875 + }, + { + "epoch": 7.932489451476793, + "grad_norm": 0.4319293200969696, + "learning_rate": 2.4931454522187593e-05, + "loss": 0.6005, + "step": 1880 + }, + { + "epoch": 7.953586497890296, + "grad_norm": 0.44515419006347656, + "learning_rate": 2.4446967542313015e-05, + "loss": 0.614, + "step": 1885 + }, + { + "epoch": 7.974683544303797, + "grad_norm": 0.4468868672847748, + "learning_rate": 2.3966577958830128e-05, + "loss": 0.5999, + "step": 1890 + }, + { + "epoch": 7.9957805907173, + "grad_norm": 0.43502089381217957, + "learning_rate": 2.3490311824254386e-05, + "loss": 0.591, + "step": 1895 + }, + { + "epoch": 8.0, + "eval_loss": 1.9239612817764282, + "eval_runtime": 0.555, + "eval_samples_per_second": 3.604, + "eval_steps_per_second": 1.802, + "step": 1896 + }, + { + "epoch": 8.016877637130802, + "grad_norm": 0.43088486790657043, + "learning_rate": 2.3018194967478145e-05, + "loss": 0.5772, + "step": 1900 + }, + { + "epoch": 8.037974683544304, + "grad_norm": 0.4887051582336426, + "learning_rate": 2.2550252992369837e-05, + "loss": 0.5858, + "step": 1905 + }, + { + "epoch": 8.059071729957806, + "grad_norm": 0.46031367778778076, + "learning_rate": 2.2086511276385556e-05, + "loss": 0.5698, + "step": 1910 + }, + { + "epoch": 8.080168776371307, + "grad_norm": 0.44916045665740967, + "learning_rate": 2.1626994969192617e-05, + "loss": 0.5832, + "step": 1915 + }, + { + "epoch": 8.10126582278481, + "grad_norm": 0.45516934990882874, + "learning_rate": 2.1171728991305795e-05, + "loss": 0.5678, + "step": 1920 + }, + { + "epoch": 8.122362869198312, + "grad_norm": 0.4672262668609619, + "learning_rate": 2.072073803273572e-05, + "loss": 0.5609, + "step": 1925 + }, + { + "epoch": 8.143459915611814, + "grad_norm": 0.4732205271720886, + "learning_rate": 2.0274046551649918e-05, + "loss": 0.5748, + "step": 1930 + }, + { + "epoch": 8.164556962025316, + "grad_norm": 0.4523015022277832, + "learning_rate": 1.9831678773046424e-05, + "loss": 0.572, + "step": 1935 + }, + { + "epoch": 8.185654008438819, + "grad_norm": 0.46077847480773926, + "learning_rate": 1.9393658687439985e-05, + "loss": 0.5734, + "step": 1940 + }, + { + "epoch": 8.20675105485232, + "grad_norm": 0.48243677616119385, + "learning_rate": 1.8960010049561028e-05, + "loss": 0.5749, + "step": 1945 + }, + { + "epoch": 8.227848101265822, + "grad_norm": 0.45998242497444153, + "learning_rate": 1.8530756377067394e-05, + "loss": 0.5635, + "step": 1950 + }, + { + "epoch": 8.248945147679326, + "grad_norm": 0.4926050305366516, + "learning_rate": 1.8105920949268862e-05, + "loss": 0.5656, + "step": 1955 + }, + { + "epoch": 8.270042194092827, + "grad_norm": 0.44752731919288635, + "learning_rate": 1.7685526805864727e-05, + "loss": 0.5713, + "step": 1960 + }, + { + "epoch": 8.291139240506329, + "grad_norm": 0.4773804843425751, + "learning_rate": 1.7269596745694295e-05, + "loss": 0.5753, + "step": 1965 + }, + { + "epoch": 8.31223628691983, + "grad_norm": 0.4709602892398834, + "learning_rate": 1.6858153325500435e-05, + "loss": 0.5604, + "step": 1970 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.46927887201309204, + "learning_rate": 1.6451218858706374e-05, + "loss": 0.578, + "step": 1975 + }, + { + "epoch": 8.354430379746836, + "grad_norm": 0.467042475938797, + "learning_rate": 1.60488154142054e-05, + "loss": 0.5876, + "step": 1980 + }, + { + "epoch": 8.375527426160337, + "grad_norm": 0.4665224850177765, + "learning_rate": 1.565096481516427e-05, + "loss": 0.5727, + "step": 1985 + }, + { + "epoch": 8.396624472573839, + "grad_norm": 0.46915140748023987, + "learning_rate": 1.5257688637839484e-05, + "loss": 0.5744, + "step": 1990 + }, + { + "epoch": 8.417721518987342, + "grad_norm": 0.4630158841609955, + "learning_rate": 1.4869008210407243e-05, + "loss": 0.5609, + "step": 1995 + }, + { + "epoch": 8.438818565400844, + "grad_norm": 0.46690833568573, + "learning_rate": 1.4484944611806773e-05, + "loss": 0.5764, + "step": 2000 + }, + { + "epoch": 8.459915611814345, + "grad_norm": 0.46290239691734314, + "learning_rate": 1.410551867059724e-05, + "loss": 0.5817, + "step": 2005 + }, + { + "epoch": 8.481012658227849, + "grad_norm": 0.4760874807834625, + "learning_rate": 1.3730750963828032e-05, + "loss": 0.5704, + "step": 2010 + }, + { + "epoch": 8.50210970464135, + "grad_norm": 0.47287824749946594, + "learning_rate": 1.3360661815922903e-05, + "loss": 0.574, + "step": 2015 + }, + { + "epoch": 8.523206751054852, + "grad_norm": 0.46598172187805176, + "learning_rate": 1.2995271297577816e-05, + "loss": 0.5792, + "step": 2020 + }, + { + "epoch": 8.544303797468354, + "grad_norm": 0.4603840410709381, + "learning_rate": 1.2634599224672294e-05, + "loss": 0.5674, + "step": 2025 + }, + { + "epoch": 8.565400843881857, + "grad_norm": 0.48355668783187866, + "learning_rate": 1.227866515719489e-05, + "loss": 0.5676, + "step": 2030 + }, + { + "epoch": 8.586497890295359, + "grad_norm": 0.4648090898990631, + "learning_rate": 1.1927488398182395e-05, + "loss": 0.5595, + "step": 2035 + }, + { + "epoch": 8.60759493670886, + "grad_norm": 0.47335174679756165, + "learning_rate": 1.1581087992672935e-05, + "loss": 0.5743, + "step": 2040 + }, + { + "epoch": 8.628691983122362, + "grad_norm": 0.46940383315086365, + "learning_rate": 1.1239482726673201e-05, + "loss": 0.5719, + "step": 2045 + }, + { + "epoch": 8.649789029535865, + "grad_norm": 0.46548011898994446, + "learning_rate": 1.0902691126139542e-05, + "loss": 0.5722, + "step": 2050 + }, + { + "epoch": 8.670886075949367, + "grad_norm": 0.4601798355579376, + "learning_rate": 1.0570731455973414e-05, + "loss": 0.5752, + "step": 2055 + }, + { + "epoch": 8.691983122362869, + "grad_norm": 0.47531968355178833, + "learning_rate": 1.024362171903065e-05, + "loss": 0.5833, + "step": 2060 + }, + { + "epoch": 8.713080168776372, + "grad_norm": 0.46817246079444885, + "learning_rate": 9.921379655145313e-06, + "loss": 0.5716, + "step": 2065 + }, + { + "epoch": 8.734177215189874, + "grad_norm": 0.47469767928123474, + "learning_rate": 9.604022740167495e-06, + "loss": 0.5825, + "step": 2070 + }, + { + "epoch": 8.755274261603375, + "grad_norm": 0.4739144444465637, + "learning_rate": 9.29156818501561e-06, + "loss": 0.5669, + "step": 2075 + }, + { + "epoch": 8.776371308016877, + "grad_norm": 0.5070598721504211, + "learning_rate": 8.984032934743026e-06, + "loss": 0.5797, + "step": 2080 + }, + { + "epoch": 8.79746835443038, + "grad_norm": 0.4720567464828491, + "learning_rate": 8.681433667619065e-06, + "loss": 0.5635, + "step": 2085 + }, + { + "epoch": 8.818565400843882, + "grad_norm": 0.45980048179626465, + "learning_rate": 8.383786794224569e-06, + "loss": 0.5715, + "step": 2090 + }, + { + "epoch": 8.839662447257384, + "grad_norm": 0.4796925187110901, + "learning_rate": 8.09110845656187e-06, + "loss": 0.5785, + "step": 2095 + }, + { + "epoch": 8.860759493670885, + "grad_norm": 0.4801785349845886, + "learning_rate": 7.803414527179343e-06, + "loss": 0.5772, + "step": 2100 + }, + { + "epoch": 8.881856540084389, + "grad_norm": 0.4780319631099701, + "learning_rate": 7.520720608310683e-06, + "loss": 0.5726, + "step": 2105 + }, + { + "epoch": 8.90295358649789, + "grad_norm": 0.4621387720108032, + "learning_rate": 7.243042031028713e-06, + "loss": 0.5752, + "step": 2110 + }, + { + "epoch": 8.924050632911392, + "grad_norm": 0.4708462059497833, + "learning_rate": 6.9703938544139706e-06, + "loss": 0.5716, + "step": 2115 + }, + { + "epoch": 8.945147679324894, + "grad_norm": 0.4696125090122223, + "learning_rate": 6.702790864738018e-06, + "loss": 0.5666, + "step": 2120 + }, + { + "epoch": 8.966244725738397, + "grad_norm": 0.47694242000579834, + "learning_rate": 6.440247574661573e-06, + "loss": 0.568, + "step": 2125 + }, + { + "epoch": 8.987341772151899, + "grad_norm": 0.4875074326992035, + "learning_rate": 6.182778222447383e-06, + "loss": 0.5677, + "step": 2130 + }, + { + "epoch": 9.0, + "eval_loss": 1.984204888343811, + "eval_runtime": 0.5545, + "eval_samples_per_second": 3.607, + "eval_steps_per_second": 1.804, + "step": 2133 + }, + { + "epoch": 9.0084388185654, + "grad_norm": 0.4417002499103546, + "learning_rate": 5.930396771188129e-06, + "loss": 0.566, + "step": 2135 + }, + { + "epoch": 9.029535864978904, + "grad_norm": 0.4602307677268982, + "learning_rate": 5.683116908049168e-06, + "loss": 0.5625, + "step": 2140 + }, + { + "epoch": 9.050632911392405, + "grad_norm": 0.4665865898132324, + "learning_rate": 5.440952043526215e-06, + "loss": 0.5584, + "step": 2145 + }, + { + "epoch": 9.071729957805907, + "grad_norm": 0.47397249937057495, + "learning_rate": 5.203915310718099e-06, + "loss": 0.558, + "step": 2150 + }, + { + "epoch": 9.092827004219409, + "grad_norm": 0.47351840138435364, + "learning_rate": 4.972019564614539e-06, + "loss": 0.5516, + "step": 2155 + }, + { + "epoch": 9.113924050632912, + "grad_norm": 0.4746864438056946, + "learning_rate": 4.745277381398938e-06, + "loss": 0.5536, + "step": 2160 + }, + { + "epoch": 9.135021097046414, + "grad_norm": 0.4737743139266968, + "learning_rate": 4.523701057766361e-06, + "loss": 0.5577, + "step": 2165 + }, + { + "epoch": 9.156118143459915, + "grad_norm": 0.4778996706008911, + "learning_rate": 4.307302610256736e-06, + "loss": 0.5541, + "step": 2170 + }, + { + "epoch": 9.177215189873417, + "grad_norm": 0.4771966338157654, + "learning_rate": 4.0960937746030605e-06, + "loss": 0.552, + "step": 2175 + }, + { + "epoch": 9.19831223628692, + "grad_norm": 0.4708782732486725, + "learning_rate": 3.890086005095051e-06, + "loss": 0.5515, + "step": 2180 + }, + { + "epoch": 9.219409282700422, + "grad_norm": 0.49065274000167847, + "learning_rate": 3.6892904739578736e-06, + "loss": 0.5593, + "step": 2185 + }, + { + "epoch": 9.240506329113924, + "grad_norm": 0.47753557562828064, + "learning_rate": 3.493718070746299e-06, + "loss": 0.5558, + "step": 2190 + }, + { + "epoch": 9.261603375527427, + "grad_norm": 0.4750811755657196, + "learning_rate": 3.3033794017541254e-06, + "loss": 0.5588, + "step": 2195 + }, + { + "epoch": 9.282700421940929, + "grad_norm": 0.46534910798072815, + "learning_rate": 3.1182847894389634e-06, + "loss": 0.5567, + "step": 2200 + }, + { + "epoch": 9.30379746835443, + "grad_norm": 0.46732398867607117, + "learning_rate": 2.9384442718624395e-06, + "loss": 0.5712, + "step": 2205 + }, + { + "epoch": 9.324894514767932, + "grad_norm": 0.4703245759010315, + "learning_rate": 2.763867602145842e-06, + "loss": 0.5566, + "step": 2210 + }, + { + "epoch": 9.345991561181435, + "grad_norm": 0.4757389426231384, + "learning_rate": 2.5945642479411448e-06, + "loss": 0.5669, + "step": 2215 + }, + { + "epoch": 9.367088607594937, + "grad_norm": 0.49615395069122314, + "learning_rate": 2.430543390917539e-06, + "loss": 0.5771, + "step": 2220 + }, + { + "epoch": 9.388185654008439, + "grad_norm": 0.4786005914211273, + "learning_rate": 2.2718139262635775e-06, + "loss": 0.5581, + "step": 2225 + }, + { + "epoch": 9.40928270042194, + "grad_norm": 0.4749692380428314, + "learning_rate": 2.1183844622047034e-06, + "loss": 0.5566, + "step": 2230 + }, + { + "epoch": 9.430379746835444, + "grad_norm": 0.4892341196537018, + "learning_rate": 1.9702633195363917e-06, + "loss": 0.5577, + "step": 2235 + }, + { + "epoch": 9.451476793248945, + "grad_norm": 0.485775351524353, + "learning_rate": 1.8274585311729653e-06, + "loss": 0.5724, + "step": 2240 + }, + { + "epoch": 9.472573839662447, + "grad_norm": 0.47523242235183716, + "learning_rate": 1.6899778417118983e-06, + "loss": 0.5472, + "step": 2245 + }, + { + "epoch": 9.49367088607595, + "grad_norm": 0.45913201570510864, + "learning_rate": 1.557828707013831e-06, + "loss": 0.5576, + "step": 2250 + }, + { + "epoch": 9.514767932489452, + "grad_norm": 0.4752641022205353, + "learning_rate": 1.4310182937982141e-06, + "loss": 0.5605, + "step": 2255 + }, + { + "epoch": 9.535864978902953, + "grad_norm": 0.47292277216911316, + "learning_rate": 1.309553479254666e-06, + "loss": 0.5653, + "step": 2260 + }, + { + "epoch": 9.556962025316455, + "grad_norm": 0.4652714133262634, + "learning_rate": 1.1934408506699802e-06, + "loss": 0.5571, + "step": 2265 + }, + { + "epoch": 9.578059071729959, + "grad_norm": 0.4648183584213257, + "learning_rate": 1.0826867050708678e-06, + "loss": 0.5603, + "step": 2270 + }, + { + "epoch": 9.59915611814346, + "grad_norm": 0.4922165274620056, + "learning_rate": 9.772970488825417e-07, + "loss": 0.5627, + "step": 2275 + }, + { + "epoch": 9.620253164556962, + "grad_norm": 0.4720841348171234, + "learning_rate": 8.772775976028546e-07, + "loss": 0.5517, + "step": 2280 + }, + { + "epoch": 9.641350210970463, + "grad_norm": 0.510443389415741, + "learning_rate": 7.826337754924473e-07, + "loss": 0.5641, + "step": 2285 + }, + { + "epoch": 9.662447257383967, + "grad_norm": 0.46088987588882446, + "learning_rate": 6.933707152805058e-07, + "loss": 0.5595, + "step": 2290 + }, + { + "epoch": 9.683544303797468, + "grad_norm": 0.48828017711639404, + "learning_rate": 6.094932578864287e-07, + "loss": 0.565, + "step": 2295 + }, + { + "epoch": 9.70464135021097, + "grad_norm": 0.4722835421562195, + "learning_rate": 5.31005952157304e-07, + "loss": 0.5611, + "step": 2300 + }, + { + "epoch": 9.725738396624472, + "grad_norm": 0.4879083037376404, + "learning_rate": 4.5791305462120625e-07, + "loss": 0.5746, + "step": 2305 + }, + { + "epoch": 9.746835443037975, + "grad_norm": 0.4783223271369934, + "learning_rate": 3.902185292563365e-07, + "loss": 0.569, + "step": 2310 + }, + { + "epoch": 9.767932489451477, + "grad_norm": 0.4648023247718811, + "learning_rate": 3.2792604727608367e-07, + "loss": 0.5503, + "step": 2315 + }, + { + "epoch": 9.789029535864978, + "grad_norm": 0.47712016105651855, + "learning_rate": 2.710389869298946e-07, + "loss": 0.5522, + "step": 2320 + }, + { + "epoch": 9.810126582278482, + "grad_norm": 0.4652068614959717, + "learning_rate": 2.1956043332010955e-07, + "loss": 0.556, + "step": 2325 + }, + { + "epoch": 9.831223628691983, + "grad_norm": 0.48310577869415283, + "learning_rate": 1.7349317823459609e-07, + "loss": 0.5637, + "step": 2330 + }, + { + "epoch": 9.852320675105485, + "grad_norm": 0.5295414924621582, + "learning_rate": 1.3283971999537015e-07, + "loss": 0.5559, + "step": 2335 + }, + { + "epoch": 9.873417721518987, + "grad_norm": 0.4798893928527832, + "learning_rate": 9.76022633231155e-08, + "loss": 0.5543, + "step": 2340 + }, + { + "epoch": 9.89451476793249, + "grad_norm": 0.48038187623023987, + "learning_rate": 6.778271921760171e-08, + "loss": 0.5626, + "step": 2345 + }, + { + "epoch": 9.915611814345992, + "grad_norm": 0.47445496916770935, + "learning_rate": 4.338270485405582e-08, + "loss": 0.5545, + "step": 2350 + }, + { + "epoch": 9.936708860759493, + "grad_norm": 0.47432997822761536, + "learning_rate": 2.4403543495454818e-08, + "loss": 0.5651, + "step": 2355 + }, + { + "epoch": 9.957805907172995, + "grad_norm": 0.4740554094314575, + "learning_rate": 1.0846264420771857e-08, + "loss": 0.5637, + "step": 2360 + }, + { + "epoch": 9.978902953586498, + "grad_norm": 0.4762984812259674, + "learning_rate": 2.7116028691431817e-09, + "loss": 0.5611, + "step": 2365 + }, + { + "epoch": 10.0, + "grad_norm": 0.4564709961414337, + "learning_rate": 0.0, + "loss": 0.5648, + "step": 2370 + }, + { + "epoch": 10.0, + "eval_loss": 2.0032131671905518, + "eval_runtime": 0.5807, + "eval_samples_per_second": 3.444, + "eval_steps_per_second": 1.722, + "step": 2370 }, { - "epoch": 4.988913525498892, - "step": 1125, - "total_flos": 1.6629843858229821e+18, - "train_loss": 1.390608725865682, - "train_runtime": 3610.9267, - "train_samples_per_second": 9.977, - "train_steps_per_second": 0.312 + "epoch": 10.0, + "step": 2370, + "total_flos": 3.5097090775444357e+18, + "train_loss": 0.7390849222110797, + "train_runtime": 8188.9555, + "train_samples_per_second": 9.243, + "train_steps_per_second": 0.289 } ], "logging_steps": 5, - "max_steps": 1125, + "max_steps": 2370, "num_input_tokens_seen": 0, - "num_train_epochs": 5, + "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { @@ -1657,7 +3440,7 @@ "attributes": {} } }, - "total_flos": 1.6629843858229821e+18, + "total_flos": 3.5097090775444357e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null