diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3366 +1,5039 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 50.0, + "epoch": 30.0, "eval_steps": 500, - "global_step": 238700, + "global_step": 71610, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.10473397570171764, - "grad_norm": 1.1384308338165283, - "learning_rate": 4.989526602429829e-05, - "loss": 8.9672, - "step": 500 + "epoch": 0.04189359028068706, + "grad_norm": 1.1499052047729492, + "learning_rate": 4.993017734953219e-05, + "loss": 9.973, + "step": 100 }, { - "epoch": 0.20946795140343527, - "grad_norm": 0.9985808730125427, + "epoch": 0.08378718056137412, + "grad_norm": 1.1211209297180176, + "learning_rate": 4.986035469906438e-05, + "loss": 9.315, + "step": 200 + }, + { + "epoch": 0.12568077084206117, + "grad_norm": 1.0281990766525269, "learning_rate": 4.979053204859657e-05, - "loss": 7.7253, + "loss": 8.8239, + "step": 300 + }, + { + "epoch": 0.16757436112274823, + "grad_norm": 1.1534616947174072, + "learning_rate": 4.972070939812876e-05, + "loss": 8.4254, + "step": 400 + }, + { + "epoch": 0.20946795140343527, + "grad_norm": 0.9906216859817505, + "learning_rate": 4.9650886747660944e-05, + "loss": 8.1225, + "step": 500 + }, + { + "epoch": 0.25136154168412234, + "grad_norm": 0.9559820294380188, + "learning_rate": 4.958106409719313e-05, + "loss": 7.8557, + "step": 600 + }, + { + "epoch": 0.2932551319648094, + "grad_norm": 0.7887147665023804, + "learning_rate": 4.951124144672532e-05, + "loss": 7.7282, + "step": 700 + }, + { + "epoch": 0.33514872224549647, + "grad_norm": 0.6530473232269287, + "learning_rate": 4.944141879625751e-05, + "loss": 7.6586, + "step": 800 + }, + { + "epoch": 0.3770423125261835, + "grad_norm": 0.6546779870986938, + "learning_rate": 4.9371596145789694e-05, + "loss": 7.6438, + "step": 900 + }, + { + "epoch": 0.41893590280687054, + "grad_norm": 0.5660322308540344, + "learning_rate": 4.9301773495321885e-05, + "loss": 7.6237, "step": 1000 }, { - "epoch": 0.31420192710515293, - "grad_norm": 1.0521348714828491, - "learning_rate": 4.968579807289485e-05, - "loss": 7.5614, + "epoch": 0.4608294930875576, + "grad_norm": 0.6094719171524048, + "learning_rate": 4.9231950844854076e-05, + "loss": 7.5937, + "step": 1100 + }, + { + "epoch": 0.5027230833682447, + "grad_norm": 0.6101346015930176, + "learning_rate": 4.916212819438626e-05, + "loss": 7.5661, + "step": 1200 + }, + { + "epoch": 0.5446166736489317, + "grad_norm": 0.5943477749824524, + "learning_rate": 4.909230554391845e-05, + "loss": 7.6007, + "step": 1300 + }, + { + "epoch": 0.5865102639296188, + "grad_norm": 0.6604776382446289, + "learning_rate": 4.902248289345064e-05, + "loss": 7.568, + "step": 1400 + }, + { + "epoch": 0.6284038542103059, + "grad_norm": 0.6151777505874634, + "learning_rate": 4.8952660242982826e-05, + "loss": 7.5521, "step": 1500 }, { - "epoch": 0.41893590280687054, - "grad_norm": 1.0602227449417114, - "learning_rate": 4.958106409719313e-05, - "loss": 7.5037, + "epoch": 0.6702974444909929, + "grad_norm": 0.6381381750106812, + "learning_rate": 4.888283759251501e-05, + "loss": 7.5478, + "step": 1600 + }, + { + "epoch": 0.7121910347716799, + "grad_norm": 0.6552081108093262, + "learning_rate": 4.88130149420472e-05, + "loss": 7.5293, + "step": 1700 + }, + { + "epoch": 0.754084625052367, + "grad_norm": 0.6973659992218018, + "learning_rate": 4.874319229157939e-05, + "loss": 7.4983, + "step": 1800 
+ }, + { + "epoch": 0.795978215333054, + "grad_norm": 0.8130584955215454, + "learning_rate": 4.8673369641111576e-05, + "loss": 7.5213, + "step": 1900 + }, + { + "epoch": 0.8378718056137411, + "grad_norm": 0.8530446887016296, + "learning_rate": 4.860354699064377e-05, + "loss": 7.5105, "step": 2000 }, { - "epoch": 0.5236698785085881, - "grad_norm": 1.6102268695831299, - "learning_rate": 4.9476330121491414e-05, - "loss": 7.4595, + "epoch": 0.8797653958944281, + "grad_norm": 0.8059477210044861, + "learning_rate": 4.853372434017596e-05, + "loss": 7.508, + "step": 2100 + }, + { + "epoch": 0.9216589861751152, + "grad_norm": 0.7378331422805786, + "learning_rate": 4.846390168970814e-05, + "loss": 7.482, + "step": 2200 + }, + { + "epoch": 0.9635525764558023, + "grad_norm": 1.1823194026947021, + "learning_rate": 4.839407903924033e-05, + "loss": 7.4778, + "step": 2300 + }, + { + "epoch": 1.0054461667364893, + "grad_norm": 0.8547298908233643, + "learning_rate": 4.832425638877252e-05, + "loss": 7.4529, + "step": 2400 + }, + { + "epoch": 1.0473397570171763, + "grad_norm": 0.8535734415054321, + "learning_rate": 4.825443373830471e-05, + "loss": 7.4374, "step": 2500 }, { - "epoch": 0.6284038542103059, - "grad_norm": 1.335976004600525, - "learning_rate": 4.9371596145789694e-05, - "loss": 7.4267, + "epoch": 1.0892333472978635, + "grad_norm": 0.994597852230072, + "learning_rate": 4.818461108783689e-05, + "loss": 7.4306, + "step": 2600 + }, + { + "epoch": 1.1311269375785504, + "grad_norm": 1.2056490182876587, + "learning_rate": 4.8114788437369084e-05, + "loss": 7.4322, + "step": 2700 + }, + { + "epoch": 1.1730205278592376, + "grad_norm": 1.2451157569885254, + "learning_rate": 4.8044965786901275e-05, + "loss": 7.4205, + "step": 2800 + }, + { + "epoch": 1.2149141181399246, + "grad_norm": 0.9964780211448669, + "learning_rate": 4.797514313643346e-05, + "loss": 7.4031, + "step": 2900 + }, + { + "epoch": 1.2568077084206117, + "grad_norm": 0.8989804983139038, + "learning_rate": 4.790532048596565e-05, + "loss": 7.4043, "step": 3000 }, { - "epoch": 0.7331378299120235, - "grad_norm": 1.340728998184204, - "learning_rate": 4.926686217008798e-05, - "loss": 7.392, + "epoch": 1.2987012987012987, + "grad_norm": 1.1330469846725464, + "learning_rate": 4.783549783549784e-05, + "loss": 7.4031, + "step": 3100 + }, + { + "epoch": 1.3405948889819856, + "grad_norm": 0.9531299471855164, + "learning_rate": 4.7765675185030025e-05, + "loss": 7.3866, + "step": 3200 + }, + { + "epoch": 1.3824884792626728, + "grad_norm": 1.0342323780059814, + "learning_rate": 4.7695852534562216e-05, + "loss": 7.3477, + "step": 3300 + }, + { + "epoch": 1.42438206954336, + "grad_norm": 1.0523111820220947, + "learning_rate": 4.76260298840944e-05, + "loss": 7.3726, + "step": 3400 + }, + { + "epoch": 1.466275659824047, + "grad_norm": 1.298751711845398, + "learning_rate": 4.755620723362659e-05, + "loss": 7.3484, "step": 3500 }, { - "epoch": 0.8378718056137411, - "grad_norm": 1.4520059823989868, - "learning_rate": 4.916212819438626e-05, - "loss": 7.3253, + "epoch": 1.508169250104734, + "grad_norm": 1.0065233707427979, + "learning_rate": 4.7486384583158775e-05, + "loss": 7.3567, + "step": 3600 + }, + { + "epoch": 1.550062840385421, + "grad_norm": 1.2989579439163208, + "learning_rate": 4.7416561932690966e-05, + "loss": 7.3275, + "step": 3700 + }, + { + "epoch": 1.591956430666108, + "grad_norm": 1.0343406200408936, + "learning_rate": 4.734673928222316e-05, + "loss": 7.3353, + "step": 3800 + }, + { + "epoch": 1.6338500209467952, + "grad_norm": 0.9944115281105042, + 
"learning_rate": 4.727691663175534e-05, + "loss": 7.3304, + "step": 3900 + }, + { + "epoch": 1.6757436112274822, + "grad_norm": 1.102974534034729, + "learning_rate": 4.720709398128753e-05, + "loss": 7.3128, "step": 4000 }, { - "epoch": 0.9426057813154587, - "grad_norm": 1.7685532569885254, - "learning_rate": 4.905760368663595e-05, - "loss": 7.3204, - "step": 4500 + "epoch": 1.7176372015081691, + "grad_norm": 1.112282156944275, + "learning_rate": 4.713727133081972e-05, + "loss": 7.333, + "step": 4100 + }, + { + "epoch": 1.7595307917888563, + "grad_norm": 1.2143328189849854, + "learning_rate": 4.7067448680351914e-05, + "loss": 7.3342, + "step": 4200 + }, + { + "epoch": 1.8014243820695435, + "grad_norm": 1.1656922101974487, + "learning_rate": 4.69976260298841e-05, + "loss": 7.2995, + "step": 4300 + }, + { + "epoch": 1.8433179723502304, + "grad_norm": 1.2085694074630737, + "learning_rate": 4.692780337941628e-05, + "loss": 7.2939, + "step": 4400 + }, + { + "epoch": 1.8852115626309174, + "grad_norm": 1.1366217136383057, + "learning_rate": 4.685798072894847e-05, + "loss": 7.3079, + "step": 4500 + }, + { + "epoch": 1.9271051529116046, + "grad_norm": 1.5368098020553589, + "learning_rate": 4.678815807848066e-05, + "loss": 7.2682, + "step": 4600 + }, + { + "epoch": 1.9689987431922917, + "grad_norm": 1.1548763513565063, + "learning_rate": 4.671833542801285e-05, + "loss": 7.2708, + "step": 4700 + }, + { + "epoch": 2.0108923334729787, + "grad_norm": 1.1510928869247437, + "learning_rate": 4.664851277754504e-05, + "loss": 7.2794, + "step": 4800 + }, + { + "epoch": 2.0527859237536656, + "grad_norm": 1.103461503982544, + "learning_rate": 4.6578690127077224e-05, + "loss": 7.2612, + "step": 4900 + }, + { + "epoch": 2.0946795140343526, + "grad_norm": 1.2767215967178345, + "learning_rate": 4.6508867476609414e-05, + "loss": 7.2436, + "step": 5000 + }, + { + "epoch": 2.13657310431504, + "grad_norm": 1.3062710762023926, + "learning_rate": 4.6439044826141605e-05, + "loss": 7.2162, + "step": 5100 + }, + { + "epoch": 2.178466694595727, + "grad_norm": 1.2461299896240234, + "learning_rate": 4.6369222175673796e-05, + "loss": 7.2387, + "step": 5200 + }, + { + "epoch": 2.220360284876414, + "grad_norm": 1.5427358150482178, + "learning_rate": 4.629939952520598e-05, + "loss": 7.2004, + "step": 5300 + }, + { + "epoch": 2.262253875157101, + "grad_norm": 1.390331506729126, + "learning_rate": 4.6229576874738165e-05, + "loss": 7.2403, + "step": 5400 + }, + { + "epoch": 2.3041474654377883, + "grad_norm": 1.4087032079696655, + "learning_rate": 4.6159754224270356e-05, + "loss": 7.2157, + "step": 5500 + }, + { + "epoch": 2.346041055718475, + "grad_norm": 1.2417359352111816, + "learning_rate": 4.608993157380254e-05, + "loss": 7.2277, + "step": 5600 + }, + { + "epoch": 2.387934645999162, + "grad_norm": 1.4267281293869019, + "learning_rate": 4.602010892333473e-05, + "loss": 7.1999, + "step": 5700 + }, + { + "epoch": 2.429828236279849, + "grad_norm": 1.3897684812545776, + "learning_rate": 4.595028627286692e-05, + "loss": 7.2155, + "step": 5800 + }, + { + "epoch": 2.471721826560536, + "grad_norm": 1.326821208000183, + "learning_rate": 4.5880463622399106e-05, + "loss": 7.1705, + "step": 5900 + }, + { + "epoch": 2.5136154168412235, + "grad_norm": 1.2585749626159668, + "learning_rate": 4.58106409719313e-05, + "loss": 7.1787, + "step": 6000 + }, + { + "epoch": 2.5555090071219104, + "grad_norm": 1.4856244325637817, + "learning_rate": 4.574081832146349e-05, + "loss": 7.1923, + "step": 6100 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 
1.2883421182632446, + "learning_rate": 4.567099567099568e-05, + "loss": 7.1776, + "step": 6200 + }, + { + "epoch": 2.6392961876832843, + "grad_norm": 1.4935518503189087, + "learning_rate": 4.560117302052786e-05, + "loss": 7.1711, + "step": 6300 + }, + { + "epoch": 2.6811897779639713, + "grad_norm": 1.3920152187347412, + "learning_rate": 4.553135037006005e-05, + "loss": 7.1292, + "step": 6400 + }, + { + "epoch": 2.7230833682446587, + "grad_norm": 1.2802495956420898, + "learning_rate": 4.546152771959224e-05, + "loss": 7.1558, + "step": 6500 + }, + { + "epoch": 2.7649769585253456, + "grad_norm": 1.4111789464950562, + "learning_rate": 4.5392403295629106e-05, + "loss": 7.172, + "step": 6600 + }, + { + "epoch": 2.8068705488060326, + "grad_norm": 1.6390964984893799, + "learning_rate": 4.53225806451613e-05, + "loss": 7.1263, + "step": 6700 + }, + { + "epoch": 2.84876413908672, + "grad_norm": 1.4132812023162842, + "learning_rate": 4.525275799469348e-05, + "loss": 7.1259, + "step": 6800 + }, + { + "epoch": 2.890657729367407, + "grad_norm": 1.4943978786468506, + "learning_rate": 4.518293534422567e-05, + "loss": 7.1252, + "step": 6900 + }, + { + "epoch": 2.932551319648094, + "grad_norm": 1.3022414445877075, + "learning_rate": 4.5113112693757856e-05, + "loss": 7.107, + "step": 7000 + }, + { + "epoch": 2.974444909928781, + "grad_norm": 1.4270446300506592, + "learning_rate": 4.504329004329004e-05, + "loss": 7.1278, + "step": 7100 + }, + { + "epoch": 3.016338500209468, + "grad_norm": 1.3672137260437012, + "learning_rate": 4.497346739282223e-05, + "loss": 7.1282, + "step": 7200 + }, + { + "epoch": 3.058232090490155, + "grad_norm": 1.955368995666504, + "learning_rate": 4.490364474235442e-05, + "loss": 7.1123, + "step": 7300 + }, + { + "epoch": 3.100125680770842, + "grad_norm": 1.3990498781204224, + "learning_rate": 4.483382209188661e-05, + "loss": 7.1117, + "step": 7400 + }, + { + "epoch": 3.142019271051529, + "grad_norm": 1.6294671297073364, + "learning_rate": 4.47639994414188e-05, + "loss": 7.0721, + "step": 7500 + }, + { + "epoch": 3.183912861332216, + "grad_norm": 1.3939063549041748, + "learning_rate": 4.469417679095099e-05, + "loss": 7.0599, + "step": 7600 + }, + { + "epoch": 3.225806451612903, + "grad_norm": 1.918155312538147, + "learning_rate": 4.462435414048318e-05, + "loss": 7.0759, + "step": 7700 + }, + { + "epoch": 3.2677000418935904, + "grad_norm": 1.3072093725204468, + "learning_rate": 4.455453149001536e-05, + "loss": 7.0784, + "step": 7800 + }, + { + "epoch": 3.3095936321742774, + "grad_norm": 1.3794573545455933, + "learning_rate": 4.4484708839547554e-05, + "loss": 7.0726, + "step": 7900 + }, + { + "epoch": 3.3514872224549643, + "grad_norm": 1.4223533868789673, + "learning_rate": 4.441488618907974e-05, + "loss": 7.0513, + "step": 8000 + }, + { + "epoch": 3.3933808127356513, + "grad_norm": 1.695520043373108, + "learning_rate": 4.434506353861192e-05, + "loss": 7.0586, + "step": 8100 + }, + { + "epoch": 3.4352744030163387, + "grad_norm": 2.290275812149048, + "learning_rate": 4.4275240888144113e-05, + "loss": 7.0429, + "step": 8200 + }, + { + "epoch": 3.4771679932970256, + "grad_norm": 1.5322943925857544, + "learning_rate": 4.4205418237676304e-05, + "loss": 7.0196, + "step": 8300 + }, + { + "epoch": 3.5190615835777126, + "grad_norm": 1.4767639636993408, + "learning_rate": 4.4135595587208495e-05, + "loss": 7.0643, + "step": 8400 + }, + { + "epoch": 3.5609551738583995, + "grad_norm": 1.4343881607055664, + "learning_rate": 4.406577293674068e-05, + "loss": 7.0153, + "step": 8500 + }, + { + "epoch": 
3.602848764139087, + "grad_norm": 1.7641897201538086, + "learning_rate": 4.399664851277755e-05, + "loss": 7.013, + "step": 8600 + }, + { + "epoch": 3.644742354419774, + "grad_norm": 1.9688279628753662, + "learning_rate": 4.392682586230973e-05, + "loss": 6.9985, + "step": 8700 + }, + { + "epoch": 3.686635944700461, + "grad_norm": 1.7434871196746826, + "learning_rate": 4.385700321184192e-05, + "loss": 7.0142, + "step": 8800 + }, + { + "epoch": 3.728529534981148, + "grad_norm": 1.550470232963562, + "learning_rate": 4.378718056137411e-05, + "loss": 6.9975, + "step": 8900 + }, + { + "epoch": 3.7704231252618348, + "grad_norm": 1.759869933128357, + "learning_rate": 4.37173579109063e-05, + "loss": 6.9821, + "step": 9000 + }, + { + "epoch": 3.812316715542522, + "grad_norm": 2.052905797958374, + "learning_rate": 4.364753526043849e-05, + "loss": 6.9917, + "step": 9100 + }, + { + "epoch": 3.854210305823209, + "grad_norm": 1.8859872817993164, + "learning_rate": 4.357771260997068e-05, + "loss": 6.9934, + "step": 9200 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 1.8349354267120361, + "learning_rate": 4.3507889959502863e-05, + "loss": 6.9861, + "step": 9300 + }, + { + "epoch": 3.937997486384583, + "grad_norm": 2.519893169403076, + "learning_rate": 4.3438067309035054e-05, + "loss": 6.9611, + "step": 9400 + }, + { + "epoch": 3.97989107666527, + "grad_norm": 1.506072759628296, + "learning_rate": 4.336824465856724e-05, + "loss": 6.9688, + "step": 9500 + }, + { + "epoch": 4.021784666945957, + "grad_norm": 1.5342004299163818, + "learning_rate": 4.329842200809943e-05, + "loss": 6.9582, + "step": 9600 + }, + { + "epoch": 4.063678257226645, + "grad_norm": 1.6476861238479614, + "learning_rate": 4.3228599357631614e-05, + "loss": 6.9712, + "step": 9700 + }, + { + "epoch": 4.105571847507331, + "grad_norm": 2.112595319747925, + "learning_rate": 4.3158776707163805e-05, + "loss": 6.9568, + "step": 9800 + }, + { + "epoch": 4.147465437788019, + "grad_norm": 2.390101194381714, + "learning_rate": 4.3088954056695996e-05, + "loss": 6.9501, + "step": 9900 + }, + { + "epoch": 4.189359028068705, + "grad_norm": 1.900177240371704, + "learning_rate": 4.301913140622818e-05, + "loss": 6.935, + "step": 10000 + }, + { + "epoch": 4.231252618349393, + "grad_norm": 1.7032443284988403, + "learning_rate": 4.294930875576037e-05, + "loss": 6.9343, + "step": 10100 + }, + { + "epoch": 4.27314620863008, + "grad_norm": 1.8393847942352295, + "learning_rate": 4.287948610529256e-05, + "loss": 6.8927, + "step": 10200 + }, + { + "epoch": 4.3150397989107665, + "grad_norm": 2.046727180480957, + "learning_rate": 4.280966345482475e-05, + "loss": 6.9156, + "step": 10300 + }, + { + "epoch": 4.356933389191454, + "grad_norm": 1.832216501235962, + "learning_rate": 4.273984080435694e-05, + "loss": 6.89, + "step": 10400 + }, + { + "epoch": 4.39882697947214, + "grad_norm": 1.8682448863983154, + "learning_rate": 4.267001815388912e-05, + "loss": 6.8995, + "step": 10500 + }, + { + "epoch": 4.440720569752828, + "grad_norm": 2.0732340812683105, + "learning_rate": 4.260089372992599e-05, + "loss": 6.9094, + "step": 10600 + }, + { + "epoch": 4.482614160033515, + "grad_norm": 1.6016206741333008, + "learning_rate": 4.253107107945818e-05, + "loss": 6.913, + "step": 10700 + }, + { + "epoch": 4.524507750314202, + "grad_norm": 2.063062906265259, + "learning_rate": 4.246124842899037e-05, + "loss": 6.8943, + "step": 10800 + }, + { + "epoch": 4.566401340594889, + "grad_norm": 1.9563026428222656, + "learning_rate": 4.2391425778522555e-05, + "loss": 6.8986, + "step": 10900 
+ }, + { + "epoch": 4.6082949308755765, + "grad_norm": 1.8872498273849487, + "learning_rate": 4.2321603128054746e-05, + "loss": 6.8883, + "step": 11000 + }, + { + "epoch": 4.650188521156263, + "grad_norm": 2.1376144886016846, + "learning_rate": 4.225178047758693e-05, + "loss": 6.8716, + "step": 11100 + }, + { + "epoch": 4.69208211143695, + "grad_norm": 1.938679575920105, + "learning_rate": 4.218195782711912e-05, + "loss": 6.8836, + "step": 11200 + }, + { + "epoch": 4.733975701717637, + "grad_norm": 1.9372957944869995, + "learning_rate": 4.2112135176651305e-05, + "loss": 6.8925, + "step": 11300 + }, + { + "epoch": 4.775869291998324, + "grad_norm": 2.716827630996704, + "learning_rate": 4.2042312526183496e-05, + "loss": 6.8284, + "step": 11400 + }, + { + "epoch": 4.817762882279011, + "grad_norm": 1.942700743675232, + "learning_rate": 4.197248987571568e-05, + "loss": 6.8753, + "step": 11500 + }, + { + "epoch": 4.859656472559698, + "grad_norm": 2.026385545730591, + "learning_rate": 4.190266722524787e-05, + "loss": 6.8707, + "step": 11600 + }, + { + "epoch": 4.901550062840386, + "grad_norm": 1.7594517469406128, + "learning_rate": 4.183284457478006e-05, + "loss": 6.8427, + "step": 11700 + }, + { + "epoch": 4.943443653121072, + "grad_norm": 1.8161801099777222, + "learning_rate": 4.176302192431225e-05, + "loss": 6.8519, + "step": 11800 + }, + { + "epoch": 4.9853372434017595, + "grad_norm": 2.6034481525421143, + "learning_rate": 4.169319927384444e-05, + "loss": 6.8448, + "step": 11900 + }, + { + "epoch": 5.027230833682447, + "grad_norm": 1.93776535987854, + "learning_rate": 4.162337662337663e-05, + "loss": 6.8071, + "step": 12000 + }, + { + "epoch": 5.0691244239631335, + "grad_norm": 2.0754964351654053, + "learning_rate": 4.155355397290881e-05, + "loss": 6.8386, + "step": 12100 + }, + { + "epoch": 5.111018014243821, + "grad_norm": 2.0640342235565186, + "learning_rate": 4.1483731322440996e-05, + "loss": 6.8402, + "step": 12200 + }, + { + "epoch": 5.152911604524507, + "grad_norm": 1.8218064308166504, + "learning_rate": 4.141390867197319e-05, + "loss": 6.8153, + "step": 12300 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 2.0181634426116943, + "learning_rate": 4.134408602150538e-05, + "loss": 6.8104, + "step": 12400 + }, + { + "epoch": 5.236698785085882, + "grad_norm": 2.5224316120147705, + "learning_rate": 4.127426337103757e-05, + "loss": 6.8355, + "step": 12500 + }, + { + "epoch": 5.278592375366569, + "grad_norm": 3.1008002758026123, + "learning_rate": 4.120513894707444e-05, + "loss": 6.8384, + "step": 12600 + }, + { + "epoch": 5.320485965647256, + "grad_norm": 1.8872394561767578, + "learning_rate": 4.113531629660662e-05, + "loss": 6.8087, + "step": 12700 + }, + { + "epoch": 5.362379555927943, + "grad_norm": 2.109281063079834, + "learning_rate": 4.1065493646138805e-05, + "loss": 6.8161, + "step": 12800 + }, + { + "epoch": 5.40427314620863, + "grad_norm": 1.7881128787994385, + "learning_rate": 4.0995670995670996e-05, + "loss": 6.8215, + "step": 12900 + }, + { + "epoch": 5.446166736489317, + "grad_norm": 2.5179624557495117, + "learning_rate": 4.092584834520319e-05, + "loss": 6.7883, + "step": 13000 + }, + { + "epoch": 5.488060326770004, + "grad_norm": 2.4349751472473145, + "learning_rate": 4.085602569473537e-05, + "loss": 6.792, + "step": 13100 + }, + { + "epoch": 5.529953917050691, + "grad_norm": 2.011018991470337, + "learning_rate": 4.078620304426756e-05, + "loss": 6.7846, + "step": 13200 + }, + { + "epoch": 5.571847507331379, + "grad_norm": 2.519958019256592, + "learning_rate": 
4.071638039379975e-05, + "loss": 6.7887, + "step": 13300 + }, + { + "epoch": 5.613741097612065, + "grad_norm": 1.9241886138916016, + "learning_rate": 4.064655774333194e-05, + "loss": 6.7662, + "step": 13400 + }, + { + "epoch": 5.655634687892753, + "grad_norm": 1.8995391130447388, + "learning_rate": 4.057673509286413e-05, + "loss": 6.7672, + "step": 13500 + }, + { + "epoch": 5.697528278173439, + "grad_norm": 2.1511363983154297, + "learning_rate": 4.050691244239632e-05, + "loss": 6.7867, + "step": 13600 + }, + { + "epoch": 5.7394218684541265, + "grad_norm": 1.8995012044906616, + "learning_rate": 4.04370897919285e-05, + "loss": 6.7563, + "step": 13700 + }, + { + "epoch": 5.781315458734814, + "grad_norm": 1.83163321018219, + "learning_rate": 4.036726714146069e-05, + "loss": 6.7848, + "step": 13800 + }, + { + "epoch": 5.8232090490155, + "grad_norm": 2.2616159915924072, + "learning_rate": 4.029744449099288e-05, + "loss": 6.7896, + "step": 13900 + }, + { + "epoch": 5.865102639296188, + "grad_norm": 2.0548572540283203, + "learning_rate": 4.0228320067029746e-05, + "loss": 6.7633, + "step": 14000 + }, + { + "epoch": 5.906996229576874, + "grad_norm": 2.4749302864074707, + "learning_rate": 4.015849741656194e-05, + "loss": 6.7267, + "step": 14100 + }, + { + "epoch": 5.948889819857562, + "grad_norm": 1.906648874282837, + "learning_rate": 4.008867476609413e-05, + "loss": 6.7645, + "step": 14200 + }, + { + "epoch": 5.990783410138249, + "grad_norm": 2.0839619636535645, + "learning_rate": 4.001885211562631e-05, + "loss": 6.8082, + "step": 14300 + }, + { + "epoch": 6.032677000418936, + "grad_norm": 2.1202664375305176, + "learning_rate": 3.9949029465158496e-05, + "loss": 6.7625, + "step": 14400 + }, + { + "epoch": 6.074570590699623, + "grad_norm": 1.988951563835144, + "learning_rate": 3.987920681469069e-05, + "loss": 6.7413, + "step": 14500 + }, + { + "epoch": 6.11646418098031, + "grad_norm": 2.4327659606933594, + "learning_rate": 3.980938416422287e-05, + "loss": 6.7123, + "step": 14600 + }, + { + "epoch": 6.158357771260997, + "grad_norm": 2.07710599899292, + "learning_rate": 3.973956151375506e-05, + "loss": 6.7316, + "step": 14700 + }, + { + "epoch": 6.200251361541684, + "grad_norm": 1.9640876054763794, + "learning_rate": 3.966973886328725e-05, + "loss": 6.752, + "step": 14800 + }, + { + "epoch": 6.242144951822371, + "grad_norm": 2.3012888431549072, + "learning_rate": 3.959991621281944e-05, + "loss": 6.7188, + "step": 14900 + }, + { + "epoch": 6.284038542103058, + "grad_norm": 2.0262773036956787, + "learning_rate": 3.953009356235163e-05, + "loss": 6.7255, + "step": 15000 + }, + { + "epoch": 6.325932132383746, + "grad_norm": 1.8689815998077393, + "learning_rate": 3.946027091188382e-05, + "loss": 6.7132, + "step": 15100 + }, + { + "epoch": 6.367825722664432, + "grad_norm": 2.188612937927246, + "learning_rate": 3.939044826141601e-05, + "loss": 6.7407, + "step": 15200 + }, + { + "epoch": 6.4097193129451195, + "grad_norm": 2.0168368816375732, + "learning_rate": 3.9320625610948195e-05, + "loss": 6.7132, + "step": 15300 + }, + { + "epoch": 6.451612903225806, + "grad_norm": 2.496889352798462, + "learning_rate": 3.925080296048038e-05, + "loss": 6.7003, + "step": 15400 + }, + { + "epoch": 6.4935064935064934, + "grad_norm": 2.1601486206054688, + "learning_rate": 3.918098031001257e-05, + "loss": 6.7056, + "step": 15500 + }, + { + "epoch": 6.535400083787181, + "grad_norm": 2.300112009048462, + "learning_rate": 3.9111157659544754e-05, + "loss": 6.7314, + "step": 15600 + }, + { + "epoch": 6.577293674067867, + "grad_norm": 
2.321880578994751, + "learning_rate": 3.9041335009076945e-05, + "loss": 6.7166, + "step": 15700 + }, + { + "epoch": 6.619187264348555, + "grad_norm": 2.029465913772583, + "learning_rate": 3.8971512358609136e-05, + "loss": 6.6908, + "step": 15800 + }, + { + "epoch": 6.661080854629242, + "grad_norm": 2.258577585220337, + "learning_rate": 3.890168970814133e-05, + "loss": 6.7359, + "step": 15900 + }, + { + "epoch": 6.702974444909929, + "grad_norm": 2.3579437732696533, + "learning_rate": 3.883186705767351e-05, + "loss": 6.7021, + "step": 16000 + }, + { + "epoch": 6.744868035190616, + "grad_norm": 2.236828565597534, + "learning_rate": 3.87620444072057e-05, + "loss": 6.6897, + "step": 16100 + }, + { + "epoch": 6.786761625471303, + "grad_norm": 2.6255593299865723, + "learning_rate": 3.869222175673789e-05, + "loss": 6.6899, + "step": 16200 + }, + { + "epoch": 6.82865521575199, + "grad_norm": 2.297067880630493, + "learning_rate": 3.862239910627008e-05, + "loss": 6.7058, + "step": 16300 + }, + { + "epoch": 6.870548806032677, + "grad_norm": 2.440605640411377, + "learning_rate": 3.8553274682306945e-05, + "loss": 6.6559, + "step": 16400 + }, + { + "epoch": 6.912442396313364, + "grad_norm": 2.0427000522613525, + "learning_rate": 3.848345203183913e-05, + "loss": 6.6799, + "step": 16500 + }, + { + "epoch": 6.954335986594051, + "grad_norm": 2.0323081016540527, + "learning_rate": 3.841362938137132e-05, + "loss": 6.6863, + "step": 16600 + }, + { + "epoch": 6.996229576874738, + "grad_norm": 3.407731533050537, + "learning_rate": 3.834380673090351e-05, + "loss": 6.6767, + "step": 16700 + }, + { + "epoch": 7.038123167155425, + "grad_norm": 2.112870931625366, + "learning_rate": 3.8273984080435695e-05, + "loss": 6.682, + "step": 16800 + }, + { + "epoch": 7.080016757436113, + "grad_norm": 2.710810422897339, + "learning_rate": 3.8204161429967886e-05, + "loss": 6.7046, + "step": 16900 + }, + { + "epoch": 7.121910347716799, + "grad_norm": 2.0754942893981934, + "learning_rate": 3.813433877950007e-05, + "loss": 6.6511, + "step": 17000 + }, + { + "epoch": 7.1638039379974865, + "grad_norm": 3.1009552478790283, + "learning_rate": 3.8064516129032254e-05, + "loss": 6.666, + "step": 17100 + }, + { + "epoch": 7.205697528278174, + "grad_norm": 2.1582441329956055, + "learning_rate": 3.7994693478564445e-05, + "loss": 6.6574, + "step": 17200 + }, + { + "epoch": 7.24759111855886, + "grad_norm": 2.680147647857666, + "learning_rate": 3.7924870828096636e-05, + "loss": 6.6814, + "step": 17300 + }, + { + "epoch": 7.289484708839548, + "grad_norm": 2.0264320373535156, + "learning_rate": 3.785504817762883e-05, + "loss": 6.668, + "step": 17400 + }, + { + "epoch": 7.331378299120234, + "grad_norm": 2.032093048095703, + "learning_rate": 3.778522552716101e-05, + "loss": 6.6603, + "step": 17500 + }, + { + "epoch": 7.373271889400922, + "grad_norm": 2.4837894439697266, + "learning_rate": 3.77154028766932e-05, + "loss": 6.6817, + "step": 17600 + }, + { + "epoch": 7.415165479681609, + "grad_norm": 2.70166015625, + "learning_rate": 3.764558022622539e-05, + "loss": 6.6657, + "step": 17700 + }, + { + "epoch": 7.457059069962296, + "grad_norm": 2.3508477210998535, + "learning_rate": 3.757575757575758e-05, + "loss": 6.6314, + "step": 17800 + }, + { + "epoch": 7.498952660242983, + "grad_norm": 2.450437307357788, + "learning_rate": 3.750593492528977e-05, + "loss": 6.6551, + "step": 17900 + }, + { + "epoch": 7.5408462505236695, + "grad_norm": 1.9939864873886108, + "learning_rate": 3.743611227482195e-05, + "loss": 6.6128, + "step": 18000 + }, + { + "epoch": 
7.582739840804357, + "grad_norm": 2.470285177230835, + "learning_rate": 3.736628962435414e-05, + "loss": 6.6126, + "step": 18100 + }, + { + "epoch": 7.624633431085044, + "grad_norm": 2.5651469230651855, + "learning_rate": 3.729646697388633e-05, + "loss": 6.6694, + "step": 18200 + }, + { + "epoch": 7.666527021365731, + "grad_norm": 2.361785650253296, + "learning_rate": 3.722664432341852e-05, + "loss": 6.6349, + "step": 18300 + }, + { + "epoch": 7.708420611646418, + "grad_norm": 2.371994972229004, + "learning_rate": 3.715682167295071e-05, + "loss": 6.6483, + "step": 18400 + }, + { + "epoch": 7.750314201927106, + "grad_norm": 2.862107038497925, + "learning_rate": 3.708769724898758e-05, + "loss": 6.634, + "step": 18500 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 2.815486192703247, + "learning_rate": 3.701787459851976e-05, + "loss": 6.6324, + "step": 18600 + }, + { + "epoch": 7.8341013824884795, + "grad_norm": 1.930017352104187, + "learning_rate": 3.6948051948051945e-05, + "loss": 6.6275, + "step": 18700 + }, + { + "epoch": 7.875994972769166, + "grad_norm": 3.1758625507354736, + "learning_rate": 3.6878229297584136e-05, + "loss": 6.6529, + "step": 18800 + }, + { + "epoch": 7.9178885630498534, + "grad_norm": 2.1219429969787598, + "learning_rate": 3.680840664711633e-05, + "loss": 6.6085, + "step": 18900 + }, + { + "epoch": 7.95978215333054, + "grad_norm": 2.1965785026550293, + "learning_rate": 3.673858399664851e-05, + "loss": 6.6206, + "step": 19000 + }, + { + "epoch": 8.001675743611228, + "grad_norm": 2.489473581314087, + "learning_rate": 3.66687613461807e-05, + "loss": 6.6089, + "step": 19100 + }, + { + "epoch": 8.043569333891915, + "grad_norm": 2.3411850929260254, + "learning_rate": 3.659893869571289e-05, + "loss": 6.6286, + "step": 19200 + }, + { + "epoch": 8.085462924172601, + "grad_norm": 2.32071590423584, + "learning_rate": 3.6529116045245084e-05, + "loss": 6.5984, + "step": 19300 + }, + { + "epoch": 8.12735651445329, + "grad_norm": 2.402956247329712, + "learning_rate": 3.645929339477727e-05, + "loss": 6.5952, + "step": 19400 + }, + { + "epoch": 8.169250104733976, + "grad_norm": 2.6951029300689697, + "learning_rate": 3.638947074430946e-05, + "loss": 6.6106, + "step": 19500 + }, + { + "epoch": 8.211143695014663, + "grad_norm": 2.807187080383301, + "learning_rate": 3.6319648093841643e-05, + "loss": 6.6109, + "step": 19600 + }, + { + "epoch": 8.253037285295349, + "grad_norm": 2.798614025115967, + "learning_rate": 3.624982544337383e-05, + "loss": 6.6052, + "step": 19700 + }, + { + "epoch": 8.294930875576037, + "grad_norm": 4.015589237213135, + "learning_rate": 3.618000279290602e-05, + "loss": 6.5995, + "step": 19800 + }, + { + "epoch": 8.336824465856724, + "grad_norm": 2.6923959255218506, + "learning_rate": 3.611018014243821e-05, + "loss": 6.5855, + "step": 19900 + }, + { + "epoch": 8.37871805613741, + "grad_norm": 2.112994909286499, + "learning_rate": 3.6040357491970394e-05, + "loss": 6.5968, + "step": 20000 + }, + { + "epoch": 8.420611646418099, + "grad_norm": 2.8196451663970947, + "learning_rate": 3.5970534841502585e-05, + "loss": 6.5977, + "step": 20100 + }, + { + "epoch": 8.462505236698785, + "grad_norm": 2.2421326637268066, + "learning_rate": 3.5900712191034776e-05, + "loss": 6.5846, + "step": 20200 + }, + { + "epoch": 8.504398826979472, + "grad_norm": 2.634634256362915, + "learning_rate": 3.583088954056697e-05, + "loss": 6.5955, + "step": 20300 + }, + { + "epoch": 8.54629241726016, + "grad_norm": 2.101125955581665, + "learning_rate": 3.576106689009915e-05, + "loss": 6.6013, + 
"step": 20400 + }, + { + "epoch": 8.588186007540846, + "grad_norm": 2.719330072402954, + "learning_rate": 3.569194246613601e-05, + "loss": 6.5668, + "step": 20500 + }, + { + "epoch": 8.630079597821533, + "grad_norm": 2.283790349960327, + "learning_rate": 3.56221198156682e-05, + "loss": 6.6107, + "step": 20600 + }, + { + "epoch": 8.671973188102221, + "grad_norm": 2.1805171966552734, + "learning_rate": 3.5552297165200393e-05, + "loss": 6.5875, + "step": 20700 + }, + { + "epoch": 8.713866778382908, + "grad_norm": 2.6632487773895264, + "learning_rate": 3.5482474514732584e-05, + "loss": 6.613, + "step": 20800 + }, + { + "epoch": 8.755760368663594, + "grad_norm": 2.3296337127685547, + "learning_rate": 3.541265186426477e-05, + "loss": 6.5628, + "step": 20900 + }, + { + "epoch": 8.79765395894428, + "grad_norm": 2.8429343700408936, + "learning_rate": 3.534282921379696e-05, + "loss": 6.5823, + "step": 21000 + }, + { + "epoch": 8.839547549224969, + "grad_norm": 2.4361233711242676, + "learning_rate": 3.527300656332915e-05, + "loss": 6.5853, + "step": 21100 + }, + { + "epoch": 8.881441139505656, + "grad_norm": 2.5633111000061035, + "learning_rate": 3.5203183912861335e-05, + "loss": 6.5979, + "step": 21200 + }, + { + "epoch": 8.923334729786342, + "grad_norm": 2.350463628768921, + "learning_rate": 3.513336126239352e-05, + "loss": 6.5744, + "step": 21300 + }, + { + "epoch": 8.96522832006703, + "grad_norm": 2.456291675567627, + "learning_rate": 3.506353861192571e-05, + "loss": 6.57, + "step": 21400 + }, + { + "epoch": 9.007121910347717, + "grad_norm": 2.401036262512207, + "learning_rate": 3.49937159614579e-05, + "loss": 6.5614, + "step": 21500 + }, + { + "epoch": 9.049015500628403, + "grad_norm": 2.5537233352661133, + "learning_rate": 3.4923893310990085e-05, + "loss": 6.5836, + "step": 21600 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 2.6386375427246094, + "learning_rate": 3.4854070660522276e-05, + "loss": 6.6178, + "step": 21700 + }, + { + "epoch": 9.132802681189778, + "grad_norm": 2.508533477783203, + "learning_rate": 3.478424801005447e-05, + "loss": 6.5761, + "step": 21800 + }, + { + "epoch": 9.174696271470465, + "grad_norm": 3.1510419845581055, + "learning_rate": 3.471442535958665e-05, + "loss": 6.558, + "step": 21900 + }, + { + "epoch": 9.216589861751151, + "grad_norm": 2.6325526237487793, + "learning_rate": 3.464460270911884e-05, + "loss": 6.5661, + "step": 22000 + }, + { + "epoch": 9.25848345203184, + "grad_norm": 2.9870827198028564, + "learning_rate": 3.457478005865103e-05, + "loss": 6.5392, + "step": 22100 + }, + { + "epoch": 9.300377042312526, + "grad_norm": 2.4924209117889404, + "learning_rate": 3.450495740818322e-05, + "loss": 6.547, + "step": 22200 + }, + { + "epoch": 9.342270632593213, + "grad_norm": 2.3227298259735107, + "learning_rate": 3.44351347577154e-05, + "loss": 6.5306, + "step": 22300 + }, + { + "epoch": 9.3841642228739, + "grad_norm": 2.867182731628418, + "learning_rate": 3.436531210724759e-05, + "loss": 6.5628, + "step": 22400 + }, + { + "epoch": 9.426057813154587, + "grad_norm": 2.2619149684906006, + "learning_rate": 3.429548945677978e-05, + "loss": 6.5192, + "step": 22500 + }, + { + "epoch": 9.467951403435274, + "grad_norm": 2.232321262359619, + "learning_rate": 3.422636503281665e-05, + "loss": 6.56, + "step": 22600 + }, + { + "epoch": 9.509844993715962, + "grad_norm": 2.4485862255096436, + "learning_rate": 3.4156542382348835e-05, + "loss": 6.557, + "step": 22700 + }, + { + "epoch": 9.551738583996649, + "grad_norm": 2.4476943016052246, + "learning_rate": 
3.4086719731881026e-05, + "loss": 6.5314, + "step": 22800 + }, + { + "epoch": 9.593632174277335, + "grad_norm": 2.491731643676758, + "learning_rate": 3.401689708141321e-05, + "loss": 6.4952, + "step": 22900 + }, + { + "epoch": 9.635525764558023, + "grad_norm": 2.6474783420562744, + "learning_rate": 3.39470744309454e-05, + "loss": 6.5499, + "step": 23000 + }, + { + "epoch": 9.67741935483871, + "grad_norm": 2.5691514015197754, + "learning_rate": 3.3877251780477585e-05, + "loss": 6.5417, + "step": 23100 + }, + { + "epoch": 9.719312945119396, + "grad_norm": 2.601832151412964, + "learning_rate": 3.3807429130009776e-05, + "loss": 6.5584, + "step": 23200 + }, + { + "epoch": 9.761206535400083, + "grad_norm": 3.481239080429077, + "learning_rate": 3.373760647954197e-05, + "loss": 6.5403, + "step": 23300 + }, + { + "epoch": 9.803100125680771, + "grad_norm": 3.0747485160827637, + "learning_rate": 3.366778382907415e-05, + "loss": 6.5751, + "step": 23400 + }, + { + "epoch": 9.844993715961458, + "grad_norm": 2.2310988903045654, + "learning_rate": 3.359796117860634e-05, + "loss": 6.5046, + "step": 23500 + }, + { + "epoch": 9.886887306242144, + "grad_norm": 2.4555273056030273, + "learning_rate": 3.352813852813853e-05, + "loss": 6.5544, + "step": 23600 + }, + { + "epoch": 9.928780896522833, + "grad_norm": 3.1235666275024414, + "learning_rate": 3.345831587767072e-05, + "loss": 6.5396, + "step": 23700 + }, + { + "epoch": 9.970674486803519, + "grad_norm": 2.2766611576080322, + "learning_rate": 3.338849322720291e-05, + "loss": 6.5542, + "step": 23800 + }, + { + "epoch": 10.012568077084206, + "grad_norm": 3.0408995151519775, + "learning_rate": 3.331867057673509e-05, + "loss": 6.4978, + "step": 23900 + }, + { + "epoch": 10.054461667364894, + "grad_norm": 2.8702831268310547, + "learning_rate": 3.3248847926267283e-05, + "loss": 6.5264, + "step": 24000 + }, + { + "epoch": 10.09635525764558, + "grad_norm": 2.9117937088012695, + "learning_rate": 3.317902527579947e-05, + "loss": 6.5028, + "step": 24100 + }, + { + "epoch": 10.138248847926267, + "grad_norm": 2.925631046295166, + "learning_rate": 3.310920262533166e-05, + "loss": 6.5143, + "step": 24200 + }, + { + "epoch": 10.180142438206955, + "grad_norm": 2.6605536937713623, + "learning_rate": 3.303937997486385e-05, + "loss": 6.5394, + "step": 24300 + }, + { + "epoch": 10.222036028487642, + "grad_norm": 2.31357479095459, + "learning_rate": 3.2969557324396034e-05, + "loss": 6.5224, + "step": 24400 + }, + { + "epoch": 10.263929618768328, + "grad_norm": 2.6544747352600098, + "learning_rate": 3.2899734673928225e-05, + "loss": 6.5035, + "step": 24500 + }, + { + "epoch": 10.305823209049015, + "grad_norm": 2.5945372581481934, + "learning_rate": 3.2830610249965085e-05, + "loss": 6.4977, + "step": 24600 + }, + { + "epoch": 10.347716799329703, + "grad_norm": 3.120873212814331, + "learning_rate": 3.2760787599497276e-05, + "loss": 6.5399, + "step": 24700 + }, + { + "epoch": 10.38961038961039, + "grad_norm": 2.705008029937744, + "learning_rate": 3.269096494902947e-05, + "loss": 6.4938, + "step": 24800 }, { - "epoch": 1.0473397570171763, - "grad_norm": 1.396130084991455, - "learning_rate": 4.8952869710934226e-05, - "loss": 7.2682, - "step": 5000 + "epoch": 10.431503979891076, + "grad_norm": 2.2395503520965576, + "learning_rate": 3.262114229856166e-05, + "loss": 6.4854, + "step": 24900 }, { - "epoch": 1.1520737327188941, - "grad_norm": 1.5962079763412476, - "learning_rate": 4.884813573523251e-05, - "loss": 7.2441, - "step": 5500 + "epoch": 10.473397570171764, + "grad_norm": 
2.5891764163970947, + "learning_rate": 3.255131964809384e-05, + "loss": 6.5107, + "step": 25000 }, { - "epoch": 1.2568077084206117, - "grad_norm": 1.6328166723251343, - "learning_rate": 4.874340175953079e-05, - "loss": 7.2216, - "step": 6000 + "epoch": 10.51529116045245, + "grad_norm": 3.115931749343872, + "learning_rate": 3.248149699762603e-05, + "loss": 6.5389, + "step": 25100 }, { - "epoch": 1.3615416841223293, - "grad_norm": 1.8534362316131592, - "learning_rate": 4.863887725178048e-05, - "loss": 7.1957, - "step": 6500 + "epoch": 10.557184750733137, + "grad_norm": 2.264437675476074, + "learning_rate": 3.2411674347158224e-05, + "loss": 6.51, + "step": 25200 }, { - "epoch": 1.466275659824047, - "grad_norm": 1.4871692657470703, - "learning_rate": 4.8534143276078766e-05, - "loss": 7.1561, - "step": 7000 + "epoch": 10.599078341013826, + "grad_norm": 3.449631690979004, + "learning_rate": 3.234185169669041e-05, + "loss": 6.5161, + "step": 25300 }, { - "epoch": 1.5710096355257646, - "grad_norm": 1.8590672016143799, - "learning_rate": 4.8429409300377045e-05, - "loss": 7.1397, - "step": 7500 + "epoch": 10.640971931294512, + "grad_norm": 2.478337526321411, + "learning_rate": 3.227202904622259e-05, + "loss": 6.5019, + "step": 25400 }, { - "epoch": 1.6757436112274822, - "grad_norm": 1.7009446620941162, - "learning_rate": 4.8324675324675325e-05, - "loss": 7.1149, - "step": 8000 + "epoch": 10.682865521575199, + "grad_norm": 3.2756478786468506, + "learning_rate": 3.2202206395754784e-05, + "loss": 6.4869, + "step": 25500 }, { - "epoch": 1.7804775869291998, - "grad_norm": 1.9020010232925415, - "learning_rate": 4.822015081692501e-05, - "loss": 7.1215, - "step": 8500 + "epoch": 10.724759111855885, + "grad_norm": 2.7576985359191895, + "learning_rate": 3.213238374528697e-05, + "loss": 6.5206, + "step": 25600 }, { - "epoch": 1.8852115626309174, - "grad_norm": 2.912442445755005, - "learning_rate": 4.811541684122329e-05, - "loss": 7.0616, - "step": 9000 + "epoch": 10.766652702136573, + "grad_norm": 2.200963020324707, + "learning_rate": 3.206256109481916e-05, + "loss": 6.48, + "step": 25700 }, { - "epoch": 1.989945538332635, - "grad_norm": 2.654263496398926, - "learning_rate": 4.801068286552158e-05, - "loss": 7.0508, - "step": 9500 + "epoch": 10.80854629241726, + "grad_norm": 2.7358744144439697, + "learning_rate": 3.199273844435135e-05, + "loss": 6.5126, + "step": 25800 }, { - "epoch": 2.0946795140343526, - "grad_norm": 2.1642003059387207, - "learning_rate": 4.790594888981986e-05, - "loss": 7.0251, - "step": 10000 + "epoch": 10.850439882697946, + "grad_norm": 2.7179319858551025, + "learning_rate": 3.192291579388354e-05, + "loss": 6.4699, + "step": 25900 }, { - "epoch": 2.19941348973607, - "grad_norm": 1.9420874118804932, - "learning_rate": 4.7801424382069544e-05, - "loss": 6.9806, - "step": 10500 + "epoch": 10.892333472978635, + "grad_norm": 2.811340808868408, + "learning_rate": 3.1853093143415725e-05, + "loss": 6.5056, + "step": 26000 }, { - "epoch": 2.3041474654377883, - "grad_norm": 2.2306201457977295, - "learning_rate": 4.769669040636783e-05, - "loss": 6.9721, - "step": 11000 + "epoch": 10.934227063259321, + "grad_norm": 3.010690450668335, + "learning_rate": 3.1783270492947916e-05, + "loss": 6.5103, + "step": 26100 }, { - "epoch": 2.4088814411395054, - "grad_norm": 2.8180389404296875, - "learning_rate": 4.759195643066611e-05, - "loss": 6.9582, - "step": 11500 + "epoch": 10.976120653540008, + "grad_norm": 3.213487148284912, + "learning_rate": 3.171344784248011e-05, + "loss": 6.4874, + "step": 26200 }, { - 
"epoch": 2.5136154168412235, - "grad_norm": 2.387949228286743, - "learning_rate": 4.748722245496439e-05, - "loss": 6.9211, - "step": 12000 + "epoch": 11.018014243820696, + "grad_norm": 2.5710039138793945, + "learning_rate": 3.164362519201229e-05, + "loss": 6.4919, + "step": 26300 }, { - "epoch": 2.618349392542941, - "grad_norm": 3.3709394931793213, - "learning_rate": 4.7382697947214076e-05, - "loss": 6.9183, - "step": 12500 + "epoch": 11.059907834101383, + "grad_norm": 2.6933746337890625, + "learning_rate": 3.1573802541544475e-05, + "loss": 6.5284, + "step": 26400 }, { - "epoch": 2.7230833682446587, - "grad_norm": 2.567798376083374, - "learning_rate": 4.727796397151236e-05, - "loss": 6.8732, - "step": 13000 + "epoch": 11.101801424382069, + "grad_norm": 3.775012254714966, + "learning_rate": 3.1503979891076666e-05, + "loss": 6.4894, + "step": 26500 }, { - "epoch": 2.8278173439463763, - "grad_norm": 2.6373414993286133, - "learning_rate": 4.717322999581064e-05, - "loss": 6.8658, - "step": 13500 + "epoch": 11.143695014662757, + "grad_norm": 3.2401301860809326, + "learning_rate": 3.1434855467113534e-05, + "loss": 6.4721, + "step": 26600 }, { - "epoch": 2.932551319648094, - "grad_norm": 2.2950875759124756, - "learning_rate": 4.706849602010893e-05, - "loss": 6.8436, - "step": 14000 + "epoch": 11.185588604943444, + "grad_norm": 2.642794132232666, + "learning_rate": 3.1365032816645725e-05, + "loss": 6.4797, + "step": 26700 }, { - "epoch": 3.0372852953498115, - "grad_norm": 4.0021514892578125, - "learning_rate": 4.696397151235861e-05, - "loss": 6.8496, - "step": 14500 + "epoch": 11.22748219522413, + "grad_norm": 3.191567897796631, + "learning_rate": 3.129521016617791e-05, + "loss": 6.5022, + "step": 26800 }, { - "epoch": 3.142019271051529, - "grad_norm": 3.289193630218506, - "learning_rate": 4.685923753665689e-05, - "loss": 6.8047, - "step": 15000 + "epoch": 11.269375785504817, + "grad_norm": 2.816554307937622, + "learning_rate": 3.12253875157101e-05, + "loss": 6.4853, + "step": 26900 }, { - "epoch": 3.2467532467532467, - "grad_norm": 2.9973654747009277, - "learning_rate": 4.6754503560955175e-05, - "loss": 6.7651, - "step": 15500 + "epoch": 11.311269375785505, + "grad_norm": 2.8666136264801025, + "learning_rate": 3.1155564865242284e-05, + "loss": 6.4839, + "step": 27000 }, { - "epoch": 3.3514872224549643, - "grad_norm": 2.9979376792907715, - "learning_rate": 4.664976958525346e-05, - "loss": 6.7768, - "step": 16000 + "epoch": 11.353162966066192, + "grad_norm": 2.9831254482269287, + "learning_rate": 3.1085742214774475e-05, + "loss": 6.5082, + "step": 27100 }, { - "epoch": 3.456221198156682, - "grad_norm": 3.263784885406494, - "learning_rate": 4.654524507750315e-05, - "loss": 6.7617, - "step": 16500 + "epoch": 11.395056556346878, + "grad_norm": 2.7065083980560303, + "learning_rate": 3.101591956430666e-05, + "loss": 6.4412, + "step": 27200 }, { - "epoch": 3.5609551738583995, - "grad_norm": 3.330116033554077, - "learning_rate": 4.644051110180143e-05, - "loss": 6.7417, - "step": 17000 + "epoch": 11.436950146627566, + "grad_norm": 2.5580694675445557, + "learning_rate": 3.094609691383885e-05, + "loss": 6.4849, + "step": 27300 }, { - "epoch": 3.665689149560117, - "grad_norm": 3.224337339401245, - "learning_rate": 4.6335777126099714e-05, - "loss": 6.7028, - "step": 17500 + "epoch": 11.478843736908253, + "grad_norm": 2.571390390396118, + "learning_rate": 3.087627426337104e-05, + "loss": 6.4689, + "step": 27400 }, { - "epoch": 3.7704231252618348, - "grad_norm": 3.21891450881958, - "learning_rate": 
4.623104315039799e-05, - "loss": 6.7101, - "step": 18000 + "epoch": 11.52073732718894, + "grad_norm": 2.835906982421875, + "learning_rate": 3.0806451612903225e-05, + "loss": 6.4887, + "step": 27500 }, { - "epoch": 3.875157100963553, - "grad_norm": 2.3559324741363525, - "learning_rate": 4.6126518642647674e-05, - "loss": 6.6892, - "step": 18500 + "epoch": 11.562630917469628, + "grad_norm": 3.1355161666870117, + "learning_rate": 3.0736628962435416e-05, + "loss": 6.4568, + "step": 27600 }, { - "epoch": 3.97989107666527, - "grad_norm": 3.1527633666992188, - "learning_rate": 4.602178466694596e-05, - "loss": 6.6851, - "step": 19000 + "epoch": 11.604524507750314, + "grad_norm": 3.0155599117279053, + "learning_rate": 3.066680631196761e-05, + "loss": 6.4607, + "step": 27700 }, { - "epoch": 4.084625052366988, - "grad_norm": 2.9760189056396484, - "learning_rate": 4.591705069124424e-05, - "loss": 6.6704, - "step": 19500 + "epoch": 11.646418098031, + "grad_norm": 2.6346957683563232, + "learning_rate": 3.059698366149979e-05, + "loss": 6.4706, + "step": 27800 }, { - "epoch": 4.189359028068705, - "grad_norm": 2.8135318756103516, - "learning_rate": 4.5812316715542526e-05, - "loss": 6.6621, - "step": 20000 + "epoch": 11.688311688311689, + "grad_norm": 2.4353625774383545, + "learning_rate": 3.052716101103198e-05, + "loss": 6.482, + "step": 27900 }, { - "epoch": 4.294093003770423, - "grad_norm": 3.060316324234009, - "learning_rate": 4.570779220779221e-05, - "loss": 6.6298, - "step": 20500 + "epoch": 11.730205278592376, + "grad_norm": 3.29835844039917, + "learning_rate": 3.045733836056417e-05, + "loss": 6.4625, + "step": 28000 }, { - "epoch": 4.39882697947214, - "grad_norm": 2.7130279541015625, - "learning_rate": 4.560305823209049e-05, - "loss": 6.628, - "step": 21000 + "epoch": 11.772098868873062, + "grad_norm": 2.233579158782959, + "learning_rate": 3.038751571009636e-05, + "loss": 6.4727, + "step": 28100 }, { - "epoch": 4.5035609551738585, - "grad_norm": 3.156386613845825, - "learning_rate": 4.549853372434017e-05, - "loss": 6.6423, - "step": 21500 + "epoch": 11.813992459153749, + "grad_norm": 2.5708439350128174, + "learning_rate": 3.0317693059628545e-05, + "loss": 6.4751, + "step": 28200 }, { - "epoch": 4.6082949308755765, - "grad_norm": 3.039471387863159, - "learning_rate": 4.539379974863846e-05, - "loss": 6.6343, - "step": 22000 + "epoch": 11.855886049434437, + "grad_norm": 2.29488205909729, + "learning_rate": 3.0247870409160732e-05, + "loss": 6.4599, + "step": 28300 }, { - "epoch": 4.713028906577294, - "grad_norm": 3.976949453353882, - "learning_rate": 4.5289065772936745e-05, - "loss": 6.606, - "step": 22500 + "epoch": 11.897779639715123, + "grad_norm": 2.858208179473877, + "learning_rate": 3.0178047758692923e-05, + "loss": 6.469, + "step": 28400 }, { - "epoch": 4.817762882279011, - "grad_norm": 3.310382604598999, - "learning_rate": 4.5184331797235025e-05, - "loss": 6.5956, - "step": 23000 + "epoch": 11.93967322999581, + "grad_norm": 2.854923725128174, + "learning_rate": 3.0108225108225107e-05, + "loss": 6.4995, + "step": 28500 }, { - "epoch": 4.922496857980729, - "grad_norm": 3.5924322605133057, - "learning_rate": 4.507959782153331e-05, - "loss": 6.5965, - "step": 23500 + "epoch": 11.981566820276498, + "grad_norm": 2.590484857559204, + "learning_rate": 3.003910068426198e-05, + "loss": 6.4508, + "step": 28600 }, { - "epoch": 5.027230833682447, - "grad_norm": 2.616468667984009, - "learning_rate": 4.497486384583159e-05, - "loss": 6.5788, - "step": 24000 + "epoch": 12.023460410557185, + "grad_norm": 
3.3479676246643066, + "learning_rate": 2.9969278033794163e-05, + "loss": 6.4581, + "step": 28700 }, { - "epoch": 5.131964809384164, - "grad_norm": 3.3178062438964844, - "learning_rate": 4.487012987012987e-05, - "loss": 6.5684, - "step": 24500 + "epoch": 12.065354000837871, + "grad_norm": 2.7855923175811768, + "learning_rate": 2.9899455383326354e-05, + "loss": 6.4744, + "step": 28800 }, { - "epoch": 5.236698785085882, - "grad_norm": 3.7108089923858643, - "learning_rate": 4.476539589442815e-05, - "loss": 6.5756, - "step": 25000 + "epoch": 12.10724759111856, + "grad_norm": 3.2668962478637695, + "learning_rate": 2.982963273285854e-05, + "loss": 6.4731, + "step": 28900 }, { - "epoch": 5.341432760787599, - "grad_norm": 3.396498918533325, - "learning_rate": 4.466087138667784e-05, - "loss": 6.5678, - "step": 25500 + "epoch": 12.149141181399246, + "grad_norm": 2.850735664367676, + "learning_rate": 2.9759810082390725e-05, + "loss": 6.4788, + "step": 29000 }, { - "epoch": 5.446166736489317, - "grad_norm": 3.7245748043060303, - "learning_rate": 4.4556137410976123e-05, - "loss": 6.5578, - "step": 26000 + "epoch": 12.191034771679933, + "grad_norm": 2.9676952362060547, + "learning_rate": 2.9689987431922916e-05, + "loss": 6.4525, + "step": 29100 }, { - "epoch": 5.5509007121910345, - "grad_norm": 3.6525135040283203, - "learning_rate": 4.44514034352744e-05, - "loss": 6.5385, - "step": 26500 + "epoch": 12.23292836196062, + "grad_norm": 2.604408025741577, + "learning_rate": 2.9620164781455107e-05, + "loss": 6.4564, + "step": 29200 }, { - "epoch": 5.655634687892753, - "grad_norm": 3.4302523136138916, - "learning_rate": 4.434666945957269e-05, - "loss": 6.5143, - "step": 27000 + "epoch": 12.274821952241307, + "grad_norm": 2.974653482437134, + "learning_rate": 2.9550342130987295e-05, + "loss": 6.463, + "step": 29300 }, { - "epoch": 5.76036866359447, - "grad_norm": 3.762871503829956, - "learning_rate": 4.4242144951822376e-05, - "loss": 6.52, - "step": 27500 + "epoch": 12.316715542521994, + "grad_norm": 3.372664213180542, + "learning_rate": 2.9480519480519482e-05, + "loss": 6.464, + "step": 29400 }, { - "epoch": 5.865102639296188, - "grad_norm": 2.8195388317108154, - "learning_rate": 4.4137410976120656e-05, - "loss": 6.5213, - "step": 28000 + "epoch": 12.35860913280268, + "grad_norm": 2.6891355514526367, + "learning_rate": 2.941069683005167e-05, + "loss": 6.4674, + "step": 29500 }, { - "epoch": 5.969836614997905, - "grad_norm": 3.013187885284424, - "learning_rate": 4.4032677000418936e-05, - "loss": 6.5052, - "step": 28500 + "epoch": 12.400502723083369, + "grad_norm": 2.964113473892212, + "learning_rate": 2.934087417958386e-05, + "loss": 6.4539, + "step": 29600 }, { - "epoch": 6.074570590699623, - "grad_norm": 2.9772274494171143, - "learning_rate": 4.392794302471722e-05, - "loss": 6.502, - "step": 29000 + "epoch": 12.442396313364055, + "grad_norm": 2.7328097820281982, + "learning_rate": 2.9271051529116045e-05, + "loss": 6.4224, + "step": 29700 }, { - "epoch": 6.17930456640134, - "grad_norm": 3.2228713035583496, - "learning_rate": 4.38232090490155e-05, - "loss": 6.4889, - "step": 29500 + "epoch": 12.484289903644742, + "grad_norm": 2.6205203533172607, + "learning_rate": 2.9201228878648236e-05, + "loss": 6.4266, + "step": 29800 }, { - "epoch": 6.284038542103058, - "grad_norm": 3.824286937713623, - "learning_rate": 4.371847507331379e-05, - "loss": 6.4792, - "step": 30000 + "epoch": 12.52618349392543, + "grad_norm": 3.681053400039673, + "learning_rate": 2.9131406228180424e-05, + "loss": 6.4549, + "step": 29900 }, { - 
"epoch": 6.388772517804776, - "grad_norm": 3.100308656692505, - "learning_rate": 4.361374109761207e-05, - "loss": 6.4816, - "step": 30500 + "epoch": 12.568077084206116, + "grad_norm": 2.9732627868652344, + "learning_rate": 2.9061583577712608e-05, + "loss": 6.4466, + "step": 30000 }, { - "epoch": 6.4935064935064934, - "grad_norm": 3.4449245929718018, - "learning_rate": 4.350900712191035e-05, - "loss": 6.4786, - "step": 31000 + "epoch": 12.609970674486803, + "grad_norm": 3.47816801071167, + "learning_rate": 2.89917609272448e-05, + "loss": 6.4408, + "step": 30100 }, { - "epoch": 6.5982404692082115, - "grad_norm": 3.6803085803985596, - "learning_rate": 4.3404482614160034e-05, - "loss": 6.4778, - "step": 31500 + "epoch": 12.651864264767491, + "grad_norm": 2.70326566696167, + "learning_rate": 2.892193827677699e-05, + "loss": 6.4444, + "step": 30200 }, { - "epoch": 6.702974444909929, - "grad_norm": 3.6413722038269043, - "learning_rate": 4.329974863845832e-05, - "loss": 6.4782, - "step": 32000 + "epoch": 12.693757855048178, + "grad_norm": 2.9219532012939453, + "learning_rate": 2.8852115626309177e-05, + "loss": 6.4183, + "step": 30300 }, { - "epoch": 6.807708420611647, - "grad_norm": 3.482905626296997, - "learning_rate": 4.31950146627566e-05, - "loss": 6.4719, - "step": 32500 + "epoch": 12.735651445328864, + "grad_norm": 2.8546571731567383, + "learning_rate": 2.878229297584136e-05, + "loss": 6.4399, + "step": 30400 }, { - "epoch": 6.912442396313364, - "grad_norm": 3.4605376720428467, - "learning_rate": 4.3090280687054887e-05, - "loss": 6.4458, - "step": 33000 + "epoch": 12.777545035609553, + "grad_norm": 2.95047926902771, + "learning_rate": 2.8712470325373552e-05, + "loss": 6.4396, + "step": 30500 }, { - "epoch": 7.017176372015082, - "grad_norm": 3.814375877380371, - "learning_rate": 4.2985965647255974e-05, - "loss": 6.4539, - "step": 33500 + "epoch": 12.819438625890239, + "grad_norm": 3.397934675216675, + "learning_rate": 2.8643345901410416e-05, + "loss": 6.438, + "step": 30600 }, { - "epoch": 7.121910347716799, - "grad_norm": 4.238844871520996, - "learning_rate": 4.288123167155425e-05, - "loss": 6.4522, - "step": 34000 + "epoch": 12.861332216170926, + "grad_norm": 2.625852346420288, + "learning_rate": 2.8573523250942607e-05, + "loss": 6.4363, + "step": 30700 }, { - "epoch": 7.226644323418517, - "grad_norm": 2.8327670097351074, - "learning_rate": 4.277649769585253e-05, - "loss": 6.4383, - "step": 34500 + "epoch": 12.903225806451612, + "grad_norm": 2.5299527645111084, + "learning_rate": 2.85037006004748e-05, + "loss": 6.3952, + "step": 30800 }, { - "epoch": 7.331378299120234, - "grad_norm": 3.1451475620269775, - "learning_rate": 4.267176372015082e-05, - "loss": 6.4451, - "step": 35000 + "epoch": 12.9451193967323, + "grad_norm": 2.6445415019989014, + "learning_rate": 2.8433877950006983e-05, + "loss": 6.4559, + "step": 30900 }, { - "epoch": 7.436112274821952, - "grad_norm": 3.6858575344085693, - "learning_rate": 4.2567239212400506e-05, - "loss": 6.4313, - "step": 35500 + "epoch": 12.987012987012987, + "grad_norm": 2.9675769805908203, + "learning_rate": 2.836405529953917e-05, + "loss": 6.4447, + "step": 31000 }, { - "epoch": 7.5408462505236695, - "grad_norm": 4.258295059204102, - "learning_rate": 4.2462505236698786e-05, - "loss": 6.4143, - "step": 36000 + "epoch": 13.028906577293673, + "grad_norm": 2.607391119003296, + "learning_rate": 2.829423264907136e-05, + "loss": 6.446, + "step": 31100 }, { - "epoch": 7.645580226225388, - "grad_norm": 4.3574676513671875, - "learning_rate": 4.235777126099707e-05, 
- "loss": 6.4061, - "step": 36500 + "epoch": 13.070800167574362, + "grad_norm": 3.196765661239624, + "learning_rate": 2.8224409998603545e-05, + "loss": 6.4336, + "step": 31200 }, { - "epoch": 7.750314201927106, - "grad_norm": 3.8001816272735596, - "learning_rate": 4.225303728529535e-05, - "loss": 6.3994, - "step": 37000 + "epoch": 13.112693757855048, + "grad_norm": 5.778535842895508, + "learning_rate": 2.8154587348135736e-05, + "loss": 6.4339, + "step": 31300 }, { - "epoch": 7.855048177628823, - "grad_norm": 3.487893581390381, - "learning_rate": 4.214851277754504e-05, - "loss": 6.4192, - "step": 37500 + "epoch": 13.154587348135735, + "grad_norm": 3.0479419231414795, + "learning_rate": 2.8084764697667927e-05, + "loss": 6.4147, + "step": 31400 }, { - "epoch": 7.95978215333054, - "grad_norm": 3.9729723930358887, - "learning_rate": 4.204377880184332e-05, - "loss": 6.407, - "step": 38000 + "epoch": 13.196480938416423, + "grad_norm": 2.6787302494049072, + "learning_rate": 2.8014942047200115e-05, + "loss": 6.4312, + "step": 31500 }, { - "epoch": 8.064516129032258, - "grad_norm": 3.4465062618255615, - "learning_rate": 4.1939044826141604e-05, - "loss": 6.3867, - "step": 38500 + "epoch": 13.23837452869711, + "grad_norm": 2.7929670810699463, + "learning_rate": 2.79451193967323e-05, + "loss": 6.4224, + "step": 31600 }, { - "epoch": 8.169250104733976, - "grad_norm": 3.706404685974121, - "learning_rate": 4.1834310850439884e-05, - "loss": 6.3877, - "step": 39000 + "epoch": 13.280268118977796, + "grad_norm": 2.722101926803589, + "learning_rate": 2.787529674626449e-05, + "loss": 6.4247, + "step": 31700 }, { - "epoch": 8.273984080435694, - "grad_norm": 3.8204259872436523, - "learning_rate": 4.172957687473817e-05, - "loss": 6.3921, - "step": 39500 + "epoch": 13.322161709258484, + "grad_norm": 3.295348644256592, + "learning_rate": 2.780547409579668e-05, + "loss": 6.4435, + "step": 31800 }, { - "epoch": 8.37871805613741, - "grad_norm": 3.4868948459625244, - "learning_rate": 4.162505236698786e-05, - "loss": 6.3729, - "step": 40000 + "epoch": 13.36405529953917, + "grad_norm": 2.5780696868896484, + "learning_rate": 2.7735651445328865e-05, + "loss": 6.406, + "step": 31900 }, { - "epoch": 8.483452031839128, - "grad_norm": 3.5007081031799316, - "learning_rate": 4.152031839128614e-05, - "loss": 6.3886, - "step": 40500 + "epoch": 13.405948889819857, + "grad_norm": 2.955299139022827, + "learning_rate": 2.7665828794861053e-05, + "loss": 6.4633, + "step": 32000 }, { - "epoch": 8.588186007540846, - "grad_norm": 2.937894582748413, - "learning_rate": 4.1415584415584417e-05, - "loss": 6.3814, - "step": 41000 + "epoch": 13.447842480100544, + "grad_norm": 3.8027708530426025, + "learning_rate": 2.7596006144393244e-05, + "loss": 6.4445, + "step": 32100 }, { - "epoch": 8.692919983242565, - "grad_norm": 3.529237985610962, - "learning_rate": 4.1310850439882696e-05, - "loss": 6.3722, - "step": 41500 + "epoch": 13.489736070381232, + "grad_norm": 2.6895995140075684, + "learning_rate": 2.7526183493925428e-05, + "loss": 6.4015, + "step": 32200 }, { - "epoch": 8.79765395894428, - "grad_norm": 3.883575677871704, - "learning_rate": 4.120611646418098e-05, - "loss": 6.3655, - "step": 42000 + "epoch": 13.531629660661919, + "grad_norm": 2.6936516761779785, + "learning_rate": 2.745636084345762e-05, + "loss": 6.4211, + "step": 32300 }, { - "epoch": 8.902387934645999, - "grad_norm": 4.439103603363037, - "learning_rate": 4.110159195643067e-05, - "loss": 6.3673, - "step": 42500 + "epoch": 13.573523250942605, + "grad_norm": 2.948420763015747, + 
"learning_rate": 2.738653819298981e-05, + "loss": 6.4042, + "step": 32400 }, { - "epoch": 9.007121910347717, - "grad_norm": 4.103298664093018, - "learning_rate": 4.099685798072895e-05, - "loss": 6.3659, - "step": 43000 + "epoch": 13.615416841223293, + "grad_norm": 2.763885974884033, + "learning_rate": 2.7316715542521997e-05, + "loss": 6.393, + "step": 32500 }, { - "epoch": 9.111855886049435, - "grad_norm": 3.491204023361206, - "learning_rate": 4.0892124005027235e-05, - "loss": 6.3744, - "step": 43500 + "epoch": 13.65731043150398, + "grad_norm": 3.1601672172546387, + "learning_rate": 2.724759111855886e-05, + "loss": 6.4398, + "step": 32600 }, { - "epoch": 9.216589861751151, - "grad_norm": 3.441976547241211, - "learning_rate": 4.0787390029325515e-05, - "loss": 6.3573, - "step": 44000 + "epoch": 13.699204021784666, + "grad_norm": 2.4161715507507324, + "learning_rate": 2.7177768468091052e-05, + "loss": 6.401, + "step": 32700 }, { - "epoch": 9.32132383745287, - "grad_norm": 3.58134126663208, - "learning_rate": 4.0682656053623795e-05, - "loss": 6.3407, - "step": 44500 + "epoch": 13.741097612065355, + "grad_norm": 3.0796055793762207, + "learning_rate": 2.7107945817623236e-05, + "loss": 6.4265, + "step": 32800 }, { - "epoch": 9.426057813154587, - "grad_norm": 3.274592638015747, - "learning_rate": 4.057813154587348e-05, - "loss": 6.3373, - "step": 45000 + "epoch": 13.782991202346041, + "grad_norm": 3.6223697662353516, + "learning_rate": 2.7038123167155427e-05, + "loss": 6.4075, + "step": 32900 }, { - "epoch": 9.530791788856305, - "grad_norm": 4.296390533447266, - "learning_rate": 4.047339757017177e-05, - "loss": 6.3499, - "step": 45500 + "epoch": 13.824884792626728, + "grad_norm": 2.6991615295410156, + "learning_rate": 2.696830051668762e-05, + "loss": 6.3912, + "step": 33000 }, { - "epoch": 9.635525764558023, - "grad_norm": 3.5000336170196533, - "learning_rate": 4.036866359447005e-05, - "loss": 6.3199, - "step": 46000 + "epoch": 13.866778382907416, + "grad_norm": 3.1701860427856445, + "learning_rate": 2.6898477866219803e-05, + "loss": 6.4173, + "step": 33100 }, { - "epoch": 9.74025974025974, - "grad_norm": 3.4947054386138916, - "learning_rate": 4.0263929618768334e-05, - "loss": 6.3474, - "step": 46500 + "epoch": 13.908671973188103, + "grad_norm": 2.915432929992676, + "learning_rate": 2.682865521575199e-05, + "loss": 6.4179, + "step": 33200 }, { - "epoch": 9.844993715961458, - "grad_norm": 3.3658857345581055, - "learning_rate": 4.0159195643066614e-05, - "loss": 6.3296, - "step": 47000 + "epoch": 13.950565563468789, + "grad_norm": 3.155080795288086, + "learning_rate": 2.675883256528418e-05, + "loss": 6.3895, + "step": 33300 }, { - "epoch": 9.949727691663176, - "grad_norm": 2.9811642169952393, - "learning_rate": 4.0054671135316294e-05, - "loss": 6.345, - "step": 47500 + "epoch": 13.992459153749476, + "grad_norm": 3.3861114978790283, + "learning_rate": 2.6689009914816365e-05, + "loss": 6.4279, + "step": 33400 }, { - "epoch": 10.054461667364894, - "grad_norm": 4.165875434875488, - "learning_rate": 3.994993715961458e-05, - "loss": 6.3209, - "step": 48000 + "epoch": 14.034352744030164, + "grad_norm": 3.301805019378662, + "learning_rate": 2.6619187264348556e-05, + "loss": 6.4072, + "step": 33500 }, { - "epoch": 10.15919564306661, - "grad_norm": 3.6118202209472656, - "learning_rate": 3.9845203183912866e-05, - "loss": 6.3175, - "step": 48500 + "epoch": 14.07624633431085, + "grad_norm": 3.305147171020508, + "learning_rate": 2.6549364613880744e-05, + "loss": 6.3949, + "step": 33600 }, { - "epoch": 
10.263929618768328, - "grad_norm": 3.930669069290161, - "learning_rate": 3.9740469208211146e-05, - "loss": 6.3231, - "step": 49000 + "epoch": 14.118139924591537, + "grad_norm": 2.7602477073669434, + "learning_rate": 2.6479541963412935e-05, + "loss": 6.4048, + "step": 33700 }, { - "epoch": 10.368663594470046, - "grad_norm": 3.1688554286956787, - "learning_rate": 3.963594470046083e-05, - "loss": 6.309, - "step": 49500 + "epoch": 14.160033514872225, + "grad_norm": 2.5257952213287354, + "learning_rate": 2.640971931294512e-05, + "loss": 6.4033, + "step": 33800 }, { - "epoch": 10.473397570171764, - "grad_norm": 3.6746394634246826, - "learning_rate": 3.953121072475911e-05, - "loss": 6.3077, - "step": 50000 + "epoch": 14.201927105152912, + "grad_norm": 2.4649853706359863, + "learning_rate": 2.633989666247731e-05, + "loss": 6.374, + "step": 33900 }, { - "epoch": 10.57813154587348, - "grad_norm": 3.5134785175323486, - "learning_rate": 3.942647674905739e-05, - "loss": 6.3299, - "step": 50500 + "epoch": 14.243820695433598, + "grad_norm": 2.7136335372924805, + "learning_rate": 2.6270074012009497e-05, + "loss": 6.3993, + "step": 34000 }, { - "epoch": 10.682865521575199, - "grad_norm": 3.2903287410736084, - "learning_rate": 3.932174277335568e-05, - "loss": 6.3178, - "step": 51000 + "epoch": 14.285714285714286, + "grad_norm": 2.801712989807129, + "learning_rate": 2.6200251361541685e-05, + "loss": 6.4059, + "step": 34100 }, { - "epoch": 10.787599497276917, - "grad_norm": 3.5344769954681396, - "learning_rate": 3.921700879765396e-05, - "loss": 6.3139, - "step": 51500 + "epoch": 14.327607875994973, + "grad_norm": 2.7054030895233154, + "learning_rate": 2.6130428711073873e-05, + "loss": 6.431, + "step": 34200 }, { - "epoch": 10.892333472978635, - "grad_norm": 3.5710573196411133, - "learning_rate": 3.9112274821952245e-05, - "loss": 6.306, - "step": 52000 + "epoch": 14.36950146627566, + "grad_norm": 2.653932809829712, + "learning_rate": 2.6060606060606063e-05, + "loss": 6.4035, + "step": 34300 }, { - "epoch": 10.997067448680351, - "grad_norm": 3.4193336963653564, - "learning_rate": 3.9007540846250524e-05, - "loss": 6.3129, - "step": 52500 + "epoch": 14.411395056556348, + "grad_norm": 2.5450570583343506, + "learning_rate": 2.5990783410138248e-05, + "loss": 6.417, + "step": 34400 }, { - "epoch": 11.101801424382069, - "grad_norm": 3.683143377304077, - "learning_rate": 3.890301633850021e-05, - "loss": 6.3084, - "step": 53000 + "epoch": 14.453288646837034, + "grad_norm": 2.9578003883361816, + "learning_rate": 2.592096075967044e-05, + "loss": 6.4087, + "step": 34500 }, { - "epoch": 11.206535400083787, - "grad_norm": 3.214221239089966, - "learning_rate": 3.879828236279849e-05, - "loss": 6.302, - "step": 53500 + "epoch": 14.49518223711772, + "grad_norm": 2.9408493041992188, + "learning_rate": 2.5851836335707303e-05, + "loss": 6.3936, + "step": 34600 }, { - "epoch": 11.311269375785505, - "grad_norm": 3.5691747665405273, - "learning_rate": 3.869354838709678e-05, - "loss": 6.3164, - "step": 54000 + "epoch": 14.537075827398407, + "grad_norm": 2.756441116333008, + "learning_rate": 2.5782013685239494e-05, + "loss": 6.404, + "step": 34700 }, { - "epoch": 11.416003351487223, - "grad_norm": 3.2734036445617676, - "learning_rate": 3.858881441139506e-05, - "loss": 6.2889, - "step": 54500 + "epoch": 14.578969417679096, + "grad_norm": 3.685004711151123, + "learning_rate": 2.571219103477168e-05, + "loss": 6.3932, + "step": 34800 }, { - "epoch": 11.52073732718894, - "grad_norm": 4.049854278564453, - "learning_rate": 
3.848428990364474e-05, - "loss": 6.2957, - "step": 55000 + "epoch": 14.620863007959782, + "grad_norm": 2.670825719833374, + "learning_rate": 2.5642368384303872e-05, + "loss": 6.3839, + "step": 34900 }, { - "epoch": 11.625471302890658, - "grad_norm": 3.837921380996704, - "learning_rate": 3.837955592794303e-05, - "loss": 6.2806, - "step": 55500 + "epoch": 14.662756598240469, + "grad_norm": 3.0986082553863525, + "learning_rate": 2.5572545733836056e-05, + "loss": 6.3782, + "step": 35000 }, { - "epoch": 11.730205278592376, - "grad_norm": 3.4606828689575195, - "learning_rate": 3.827482195224131e-05, - "loss": 6.2896, - "step": 56000 + "epoch": 14.704650188521157, + "grad_norm": 3.003432273864746, + "learning_rate": 2.5502723083368247e-05, + "loss": 6.3775, + "step": 35100 }, { - "epoch": 11.834939254294094, - "grad_norm": 4.859198093414307, - "learning_rate": 3.8170087976539596e-05, - "loss": 6.273, - "step": 56500 + "epoch": 14.746543778801843, + "grad_norm": 2.752516269683838, + "learning_rate": 2.5432900432900435e-05, + "loss": 6.3731, + "step": 35200 }, { - "epoch": 11.93967322999581, - "grad_norm": 4.689023494720459, - "learning_rate": 3.806535400083787e-05, - "loss": 6.2776, - "step": 57000 + "epoch": 14.78843736908253, + "grad_norm": 2.7697649002075195, + "learning_rate": 2.536307778243262e-05, + "loss": 6.3701, + "step": 35300 }, { - "epoch": 12.044407205697528, - "grad_norm": 4.234752178192139, - "learning_rate": 3.7960829493087555e-05, - "loss": 6.282, - "step": 57500 + "epoch": 14.830330959363218, + "grad_norm": 3.0245521068573, + "learning_rate": 2.529325513196481e-05, + "loss": 6.3916, + "step": 35400 }, { - "epoch": 12.149141181399246, - "grad_norm": 3.950773239135742, - "learning_rate": 3.785609551738584e-05, - "loss": 6.283, - "step": 58000 + "epoch": 14.872224549643905, + "grad_norm": 3.1849350929260254, + "learning_rate": 2.5223432481497e-05, + "loss": 6.3993, + "step": 35500 }, { - "epoch": 12.253875157100964, - "grad_norm": 4.1780548095703125, - "learning_rate": 3.775136154168412e-05, - "loss": 6.2635, - "step": 58500 + "epoch": 14.914118139924591, + "grad_norm": 3.6655123233795166, + "learning_rate": 2.5153609831029185e-05, + "loss": 6.3791, + "step": 35600 }, { - "epoch": 12.35860913280268, - "grad_norm": 3.2049672603607178, - "learning_rate": 3.764662756598241e-05, - "loss": 6.2858, - "step": 59000 + "epoch": 14.95601173020528, + "grad_norm": 3.2252790927886963, + "learning_rate": 2.5083787180561376e-05, + "loss": 6.3865, + "step": 35700 }, { - "epoch": 12.463343108504398, - "grad_norm": 3.863649606704712, - "learning_rate": 3.7541893590280694e-05, - "loss": 6.2609, - "step": 59500 + "epoch": 14.997905320485966, + "grad_norm": 2.8366169929504395, + "learning_rate": 2.5013964530093564e-05, + "loss": 6.3897, + "step": 35800 }, { - "epoch": 12.568077084206116, - "grad_norm": 3.881343364715576, - "learning_rate": 3.743715961457897e-05, - "loss": 6.2695, - "step": 60000 + "epoch": 15.039798910766653, + "grad_norm": 2.757725715637207, + "learning_rate": 2.494414187962575e-05, + "loss": 6.376, + "step": 35900 }, { - "epoch": 12.672811059907835, - "grad_norm": 3.522132635116577, - "learning_rate": 3.7332635106828654e-05, - "loss": 6.2523, - "step": 60500 + "epoch": 15.081692501047339, + "grad_norm": 3.1640422344207764, + "learning_rate": 2.4874319229157942e-05, + "loss": 6.3796, + "step": 36000 }, { - "epoch": 12.777545035609553, - "grad_norm": 4.043595790863037, - "learning_rate": 3.722790113112694e-05, - "loss": 6.2546, - "step": 61000 + "epoch": 15.123586091328027, + 
"grad_norm": 2.849719285964966, + "learning_rate": 2.480449657869013e-05, + "loss": 6.3765, + "step": 36100 }, { - "epoch": 12.882279011311269, - "grad_norm": 3.4860141277313232, - "learning_rate": 3.712316715542522e-05, - "loss": 6.2468, - "step": 61500 + "epoch": 15.165479681608714, + "grad_norm": 2.7223923206329346, + "learning_rate": 2.4734673928222314e-05, + "loss": 6.3953, + "step": 36200 }, { - "epoch": 12.987012987012987, - "grad_norm": 3.9201574325561523, - "learning_rate": 3.7018433179723506e-05, - "loss": 6.2615, - "step": 62000 + "epoch": 15.2073732718894, + "grad_norm": 3.173750162124634, + "learning_rate": 2.4664851277754505e-05, + "loss": 6.3724, + "step": 36300 }, { - "epoch": 13.091746962714705, - "grad_norm": 3.5582118034362793, - "learning_rate": 3.6913699204021786e-05, - "loss": 6.2529, - "step": 62500 + "epoch": 15.249266862170089, + "grad_norm": 3.054779529571533, + "learning_rate": 2.4595028627286692e-05, + "loss": 6.3764, + "step": 36400 }, { - "epoch": 13.196480938416423, - "grad_norm": 3.1254093647003174, - "learning_rate": 3.680917469627147e-05, - "loss": 6.2418, - "step": 63000 + "epoch": 15.291160452450775, + "grad_norm": 3.277862071990967, + "learning_rate": 2.4525205976818883e-05, + "loss": 6.3583, + "step": 36500 }, { - "epoch": 13.30121491411814, - "grad_norm": 4.058616638183594, - "learning_rate": 3.670444072056975e-05, - "loss": 6.243, - "step": 63500 + "epoch": 15.333054042731462, + "grad_norm": 2.9208297729492188, + "learning_rate": 2.4456081552855748e-05, + "loss": 6.3878, + "step": 36600 }, { - "epoch": 13.405948889819857, - "grad_norm": 3.5146963596343994, - "learning_rate": 3.659970674486803e-05, - "loss": 6.2595, - "step": 64000 + "epoch": 15.37494763301215, + "grad_norm": 2.5356411933898926, + "learning_rate": 2.4386258902387935e-05, + "loss": 6.3705, + "step": 36700 }, { - "epoch": 13.510682865521575, - "grad_norm": 3.804818630218506, - "learning_rate": 3.649497276916632e-05, - "loss": 6.2438, - "step": 64500 + "epoch": 15.416841223292836, + "grad_norm": 2.8953468799591064, + "learning_rate": 2.4316436251920126e-05, + "loss": 6.3947, + "step": 36800 }, { - "epoch": 13.615416841223293, - "grad_norm": 3.591214179992676, - "learning_rate": 3.6390238793464605e-05, - "loss": 6.2266, - "step": 65000 + "epoch": 15.458734813573523, + "grad_norm": 2.9166266918182373, + "learning_rate": 2.424661360145231e-05, + "loss": 6.3809, + "step": 36900 }, { - "epoch": 13.72015081692501, - "grad_norm": 4.973635196685791, - "learning_rate": 3.6285504817762885e-05, - "loss": 6.2501, - "step": 65500 + "epoch": 15.50062840385421, + "grad_norm": 3.4554710388183594, + "learning_rate": 2.41767909509845e-05, + "loss": 6.3746, + "step": 37000 }, { - "epoch": 13.824884792626728, - "grad_norm": 4.189575672149658, - "learning_rate": 3.618098031001257e-05, - "loss": 6.2341, - "step": 66000 + "epoch": 15.542521994134898, + "grad_norm": 3.7208077907562256, + "learning_rate": 2.410696830051669e-05, + "loss": 6.3758, + "step": 37100 }, { - "epoch": 13.929618768328446, - "grad_norm": 4.408186912536621, - "learning_rate": 3.607624633431085e-05, - "loss": 6.227, - "step": 66500 + "epoch": 15.584415584415584, + "grad_norm": 3.3161842823028564, + "learning_rate": 2.4037145650048876e-05, + "loss": 6.3744, + "step": 37200 }, { - "epoch": 14.034352744030164, - "grad_norm": 4.066199779510498, - "learning_rate": 3.597151235860913e-05, - "loss": 6.2316, - "step": 67000 + "epoch": 15.62630917469627, + "grad_norm": 2.4062047004699707, + "learning_rate": 2.3967322999581064e-05, + "loss": 6.381, + 
"step": 37300 }, { - "epoch": 14.139086719731882, - "grad_norm": 3.8263683319091797, - "learning_rate": 3.586677838290742e-05, - "loss": 6.2322, - "step": 67500 + "epoch": 15.668202764976959, + "grad_norm": 3.1894476413726807, + "learning_rate": 2.389750034911325e-05, + "loss": 6.3895, + "step": 37400 }, { - "epoch": 14.243820695433598, - "grad_norm": 4.787020206451416, - "learning_rate": 3.57620444072057e-05, - "loss": 6.2099, - "step": 68000 + "epoch": 15.710096355257646, + "grad_norm": 2.9203104972839355, + "learning_rate": 2.3827677698645442e-05, + "loss": 6.363, + "step": 37500 }, { - "epoch": 14.348554671135316, - "grad_norm": 3.5196545124053955, - "learning_rate": 3.565751989945538e-05, - "loss": 6.2425, - "step": 68500 + "epoch": 15.751989945538332, + "grad_norm": 3.000694513320923, + "learning_rate": 2.375785504817763e-05, + "loss": 6.3837, + "step": 37600 }, { - "epoch": 14.453288646837034, - "grad_norm": 4.1746439933776855, - "learning_rate": 3.555278592375367e-05, - "loss": 6.2373, - "step": 69000 + "epoch": 15.79388353581902, + "grad_norm": 2.838684558868408, + "learning_rate": 2.368803239770982e-05, + "loss": 6.3859, + "step": 37700 }, { - "epoch": 14.558022622538752, - "grad_norm": 4.07820463180542, - "learning_rate": 3.544805194805195e-05, - "loss": 6.2099, - "step": 69500 + "epoch": 15.835777126099707, + "grad_norm": 2.648862361907959, + "learning_rate": 2.3618209747242005e-05, + "loss": 6.3411, + "step": 37800 }, { - "epoch": 14.662756598240469, - "grad_norm": 3.400038242340088, - "learning_rate": 3.534331797235023e-05, - "loss": 6.216, - "step": 70000 + "epoch": 15.877670716380393, + "grad_norm": 3.5438232421875, + "learning_rate": 2.3548387096774193e-05, + "loss": 6.3627, + "step": 37900 }, { - "epoch": 14.767490573942187, - "grad_norm": 4.578042030334473, - "learning_rate": 3.5238793464599916e-05, - "loss": 6.2091, - "step": 70500 + "epoch": 15.91956430666108, + "grad_norm": 2.8182501792907715, + "learning_rate": 2.3478564446306384e-05, + "loss": 6.3731, + "step": 38000 }, { - "epoch": 14.872224549643905, - "grad_norm": 3.6254208087921143, - "learning_rate": 3.51340594888982e-05, - "loss": 6.2258, - "step": 71000 + "epoch": 15.961457896941768, + "grad_norm": 3.3253772258758545, + "learning_rate": 2.340874179583857e-05, + "loss": 6.396, + "step": 38100 }, { - "epoch": 14.976958525345623, - "grad_norm": 3.496166467666626, - "learning_rate": 3.502932551319648e-05, - "loss": 6.2138, - "step": 71500 + "epoch": 16.003351487222456, + "grad_norm": 3.668926954269409, + "learning_rate": 2.3338919145370762e-05, + "loss": 6.3437, + "step": 38200 }, { - "epoch": 15.081692501047339, - "grad_norm": 3.5367865562438965, - "learning_rate": 3.492459153749477e-05, - "loss": 6.213, - "step": 72000 + "epoch": 16.045245077503143, + "grad_norm": 3.028989315032959, + "learning_rate": 2.3269096494902946e-05, + "loss": 6.3837, + "step": 38300 }, { - "epoch": 15.186426476749057, - "grad_norm": 3.4754440784454346, - "learning_rate": 3.481985756179305e-05, - "loss": 6.2153, - "step": 72500 + "epoch": 16.08713866778383, + "grad_norm": 3.220702648162842, + "learning_rate": 2.3199273844435134e-05, + "loss": 6.3609, + "step": 38400 }, { - "epoch": 15.291160452450775, - "grad_norm": 4.432271957397461, - "learning_rate": 3.4715333054042735e-05, - "loss": 6.2007, - "step": 73000 + "epoch": 16.129032258064516, + "grad_norm": 3.1788036823272705, + "learning_rate": 2.3129451193967325e-05, + "loss": 6.3723, + "step": 38500 }, { - "epoch": 15.395894428152493, - "grad_norm": 3.8427770137786865, - 
"learning_rate": 3.4610599078341014e-05, - "loss": 6.2071, - "step": 73500 + "epoch": 16.170925848345203, + "grad_norm": 3.351151466369629, + "learning_rate": 2.306032677000419e-05, + "loss": 6.3731, + "step": 38600 }, { - "epoch": 15.50062840385421, - "grad_norm": 3.9617857933044434, - "learning_rate": 3.4505865102639294e-05, - "loss": 6.2142, - "step": 74000 + "epoch": 16.21281943862589, + "grad_norm": 2.933992862701416, + "learning_rate": 2.299050411953638e-05, + "loss": 6.3654, + "step": 38700 }, { - "epoch": 15.605362379555928, - "grad_norm": 3.769693613052368, - "learning_rate": 3.440113112693758e-05, - "loss": 6.2065, - "step": 74500 + "epoch": 16.25471302890658, + "grad_norm": 4.2123589515686035, + "learning_rate": 2.2920681469068568e-05, + "loss": 6.3364, + "step": 38800 }, { - "epoch": 15.710096355257646, - "grad_norm": 3.825507402420044, - "learning_rate": 3.429639715123587e-05, - "loss": 6.2072, - "step": 75000 + "epoch": 16.296606619187266, + "grad_norm": 2.9287397861480713, + "learning_rate": 2.2850858818600755e-05, + "loss": 6.3643, + "step": 38900 }, { - "epoch": 15.814830330959364, - "grad_norm": 3.982872724533081, - "learning_rate": 3.4191872643485554e-05, - "loss": 6.2003, - "step": 75500 + "epoch": 16.338500209467952, + "grad_norm": 2.6518173217773438, + "learning_rate": 2.2781036168132943e-05, + "loss": 6.3538, + "step": 39000 }, { - "epoch": 15.91956430666108, - "grad_norm": 3.9958648681640625, - "learning_rate": 3.408713866778383e-05, - "loss": 6.1913, - "step": 76000 + "epoch": 16.38039379974864, + "grad_norm": 3.490497589111328, + "learning_rate": 2.271121351766513e-05, + "loss": 6.365, + "step": 39100 }, { - "epoch": 16.0242982823628, - "grad_norm": 3.947957754135132, - "learning_rate": 3.398240469208211e-05, - "loss": 6.2019, - "step": 76500 + "epoch": 16.422287390029325, + "grad_norm": 3.090874195098877, + "learning_rate": 2.264139086719732e-05, + "loss": 6.3513, + "step": 39200 }, { - "epoch": 16.129032258064516, - "grad_norm": 3.8135411739349365, - "learning_rate": 3.387767071638039e-05, - "loss": 6.1944, - "step": 77000 + "epoch": 16.46418098031001, + "grad_norm": 2.793083429336548, + "learning_rate": 2.257156821672951e-05, + "loss": 6.3815, + "step": 39300 }, { - "epoch": 16.233766233766232, - "grad_norm": 3.940861701965332, - "learning_rate": 3.377293674067868e-05, - "loss": 6.1893, - "step": 77500 + "epoch": 16.506074570590698, + "grad_norm": 2.656334638595581, + "learning_rate": 2.2501745566261696e-05, + "loss": 6.3677, + "step": 39400 }, { - "epoch": 16.338500209467952, - "grad_norm": 5.24894905090332, - "learning_rate": 3.3668412232928366e-05, - "loss": 6.1984, - "step": 78000 + "epoch": 16.547968160871388, + "grad_norm": 2.950857162475586, + "learning_rate": 2.2431922915793884e-05, + "loss": 6.3601, + "step": 39500 }, { - "epoch": 16.44323418516967, - "grad_norm": 4.470870494842529, - "learning_rate": 3.3563678257226645e-05, - "loss": 6.1958, - "step": 78500 + "epoch": 16.589861751152075, + "grad_norm": 2.948397636413574, + "learning_rate": 2.236210026532607e-05, + "loss": 6.3633, + "step": 39600 }, { - "epoch": 16.547968160871388, - "grad_norm": 3.699892282485962, - "learning_rate": 3.345894428152493e-05, - "loss": 6.1952, - "step": 79000 + "epoch": 16.63175534143276, + "grad_norm": 3.759934902191162, + "learning_rate": 2.2292277614858262e-05, + "loss": 6.3664, + "step": 39700 }, { - "epoch": 16.652702136573104, - "grad_norm": 4.136711120605469, - "learning_rate": 3.335421030582321e-05, - "loss": 6.1896, - "step": 79500 + "epoch": 16.673648931713448, 
+ "grad_norm": 2.6607794761657715, + "learning_rate": 2.222245496439045e-05, + "loss": 6.3659, + "step": 39800 }, { - "epoch": 16.75743611227482, - "grad_norm": 4.904257297515869, - "learning_rate": 3.324968579807289e-05, - "loss": 6.1715, - "step": 80000 + "epoch": 16.715542521994134, + "grad_norm": 3.2569267749786377, + "learning_rate": 2.2152632313922638e-05, + "loss": 6.3477, + "step": 39900 }, { - "epoch": 16.86217008797654, - "grad_norm": 4.219280242919922, - "learning_rate": 3.314495182237118e-05, - "loss": 6.1829, - "step": 80500 + "epoch": 16.75743611227482, + "grad_norm": 3.1701977252960205, + "learning_rate": 2.2082809663454825e-05, + "loss": 6.3466, + "step": 40000 }, { - "epoch": 16.966904063678257, - "grad_norm": 4.426414489746094, - "learning_rate": 3.3040217846669464e-05, - "loss": 6.1782, - "step": 81000 + "epoch": 16.79932970255551, + "grad_norm": 2.8855369091033936, + "learning_rate": 2.2012987012987013e-05, + "loss": 6.3774, + "step": 40100 }, { - "epoch": 17.071638039379973, - "grad_norm": 4.792020797729492, - "learning_rate": 3.2935483870967744e-05, - "loss": 6.1675, - "step": 81500 + "epoch": 16.841223292836197, + "grad_norm": 2.8468215465545654, + "learning_rate": 2.1943164362519204e-05, + "loss": 6.3388, + "step": 40200 }, { - "epoch": 17.176372015081693, - "grad_norm": 3.9796903133392334, - "learning_rate": 3.283095936321743e-05, - "loss": 6.179, - "step": 82000 + "epoch": 16.883116883116884, + "grad_norm": 3.3314404487609863, + "learning_rate": 2.187334171205139e-05, + "loss": 6.3658, + "step": 40300 }, { - "epoch": 17.28110599078341, - "grad_norm": 4.554388046264648, - "learning_rate": 3.272622538751571e-05, - "loss": 6.1756, - "step": 82500 + "epoch": 16.92501047339757, + "grad_norm": 3.023106336593628, + "learning_rate": 2.180351906158358e-05, + "loss": 6.3443, + "step": 40400 }, { - "epoch": 17.38583996648513, - "grad_norm": 4.024316787719727, - "learning_rate": 3.262149141181399e-05, - "loss": 6.177, - "step": 83000 + "epoch": 16.966904063678257, + "grad_norm": 3.2845230102539062, + "learning_rate": 2.1733696411115766e-05, + "loss": 6.3785, + "step": 40500 }, { - "epoch": 17.490573942186845, - "grad_norm": 4.059772968292236, - "learning_rate": 3.2516757436112276e-05, - "loss": 6.1818, - "step": 83500 + "epoch": 17.008797653958943, + "grad_norm": 2.805790424346924, + "learning_rate": 2.166457198715263e-05, + "loss": 6.3792, + "step": 40600 }, { - "epoch": 17.59530791788856, - "grad_norm": 4.296391487121582, - "learning_rate": 3.241223292836196e-05, - "loss": 6.1866, - "step": 84000 + "epoch": 17.05069124423963, + "grad_norm": 2.893737554550171, + "learning_rate": 2.159474933668482e-05, + "loss": 6.3138, + "step": 40700 }, { - "epoch": 17.70004189359028, - "grad_norm": 4.008220672607422, - "learning_rate": 3.230749895266024e-05, - "loss": 6.172, - "step": 84500 + "epoch": 17.09258483452032, + "grad_norm": 3.238863945007324, + "learning_rate": 2.1525624912721686e-05, + "loss": 6.3686, + "step": 40800 }, { - "epoch": 17.804775869291998, - "grad_norm": 4.639082908630371, - "learning_rate": 3.220276497695853e-05, - "loss": 6.169, - "step": 85000 + "epoch": 17.134478424801006, + "grad_norm": 3.403582811355591, + "learning_rate": 2.1455802262253877e-05, + "loss": 6.3312, + "step": 40900 }, { - "epoch": 17.909509844993718, - "grad_norm": 4.635848522186279, - "learning_rate": 3.209803100125681e-05, - "loss": 6.1721, - "step": 85500 + "epoch": 17.176372015081693, + "grad_norm": 2.963287353515625, + "learning_rate": 2.1385979611786064e-05, + "loss": 6.3515, + "step": 
41000 }, { - "epoch": 18.014243820695434, - "grad_norm": 4.662270545959473, - "learning_rate": 3.199329702555509e-05, - "loss": 6.1575, - "step": 86000 + "epoch": 17.21826560536238, + "grad_norm": 3.867340087890625, + "learning_rate": 2.1316156961318255e-05, + "loss": 6.3566, + "step": 41100 }, { - "epoch": 18.11897779639715, - "grad_norm": 4.280701637268066, - "learning_rate": 3.1888772517804775e-05, - "loss": 6.1482, - "step": 86500 + "epoch": 17.260159195643066, + "grad_norm": 2.841190814971924, + "learning_rate": 2.124633431085044e-05, + "loss": 6.3308, + "step": 41200 }, { - "epoch": 18.22371177209887, - "grad_norm": 3.8602380752563477, - "learning_rate": 3.178403854210306e-05, - "loss": 6.1565, - "step": 87000 + "epoch": 17.302052785923753, + "grad_norm": 2.872523307800293, + "learning_rate": 2.1176511660382627e-05, + "loss": 6.3433, + "step": 41300 }, { - "epoch": 18.328445747800586, - "grad_norm": 4.634263515472412, - "learning_rate": 3.167930456640134e-05, - "loss": 6.1587, - "step": 87500 + "epoch": 17.34394637620444, + "grad_norm": 3.156465530395508, + "learning_rate": 2.1106689009914818e-05, + "loss": 6.3779, + "step": 41400 }, { - "epoch": 18.433179723502302, - "grad_norm": 4.115392208099365, - "learning_rate": 3.157457059069963e-05, - "loss": 6.1436, - "step": 88000 + "epoch": 17.38583996648513, + "grad_norm": 3.5904667377471924, + "learning_rate": 2.1036866359447005e-05, + "loss": 6.3402, + "step": 41500 }, { - "epoch": 18.537913699204022, - "grad_norm": 3.6665916442871094, - "learning_rate": 3.146983661499791e-05, - "loss": 6.1442, - "step": 88500 + "epoch": 17.427733556765816, + "grad_norm": 3.5753939151763916, + "learning_rate": 2.0967043708979196e-05, + "loss": 6.3572, + "step": 41600 }, { - "epoch": 18.64264767490574, - "grad_norm": 4.444345951080322, - "learning_rate": 3.1365312107247594e-05, - "loss": 6.1571, - "step": 89000 + "epoch": 17.469627147046502, + "grad_norm": 3.129514217376709, + "learning_rate": 2.089722105851138e-05, + "loss": 6.3302, + "step": 41700 }, { - "epoch": 18.74738165060746, - "grad_norm": 3.792792558670044, - "learning_rate": 3.1260578131545873e-05, - "loss": 6.1535, - "step": 89500 + "epoch": 17.51152073732719, + "grad_norm": 2.988732099533081, + "learning_rate": 2.0827398408043568e-05, + "loss": 6.3807, + "step": 41800 }, { - "epoch": 18.852115626309175, - "grad_norm": 3.904019832611084, - "learning_rate": 3.115584415584415e-05, - "loss": 6.1501, - "step": 90000 + "epoch": 17.553414327607875, + "grad_norm": 2.857875108718872, + "learning_rate": 2.075757575757576e-05, + "loss": 6.3519, + "step": 41900 }, { - "epoch": 18.95684960201089, - "grad_norm": 4.531284332275391, - "learning_rate": 3.105111018014244e-05, - "loss": 6.1592, - "step": 90500 + "epoch": 17.59530791788856, + "grad_norm": 4.023842811584473, + "learning_rate": 2.0687753107107947e-05, + "loss": 6.3467, + "step": 42000 }, { - "epoch": 19.06158357771261, - "grad_norm": 3.7976317405700684, - "learning_rate": 3.0946376204440726e-05, - "loss": 6.1474, - "step": 91000 + "epoch": 17.63720150816925, + "grad_norm": 3.049686908721924, + "learning_rate": 2.0617930456640137e-05, + "loss": 6.3306, + "step": 42100 }, { - "epoch": 19.166317553414327, - "grad_norm": 3.8021469116210938, - "learning_rate": 3.084185169669041e-05, - "loss": 6.1408, - "step": 91500 + "epoch": 17.679095098449938, + "grad_norm": 3.3211073875427246, + "learning_rate": 2.054810780617232e-05, + "loss": 6.3611, + "step": 42200 }, { - "epoch": 19.271051529116047, - "grad_norm": 4.194758892059326, - "learning_rate": 
3.073711772098869e-05, - "loss": 6.1476, - "step": 92000 + "epoch": 17.720988688730625, + "grad_norm": 3.064138174057007, + "learning_rate": 2.047828515570451e-05, + "loss": 6.3217, + "step": 42300 }, { - "epoch": 19.375785504817763, - "grad_norm": 4.084668159484863, - "learning_rate": 3.063238374528697e-05, - "loss": 6.1443, - "step": 92500 + "epoch": 17.76288227901131, + "grad_norm": 2.7812724113464355, + "learning_rate": 2.04084625052367e-05, + "loss": 6.3131, + "step": 42400 }, { - "epoch": 19.48051948051948, - "grad_norm": 4.383222579956055, - "learning_rate": 3.052764976958525e-05, - "loss": 6.1422, - "step": 93000 + "epoch": 17.804775869291998, + "grad_norm": 2.5516164302825928, + "learning_rate": 2.0338639854768888e-05, + "loss": 6.3428, + "step": 42500 }, { - "epoch": 19.5852534562212, - "grad_norm": 4.250995635986328, - "learning_rate": 3.042312526183494e-05, - "loss": 6.1375, - "step": 93500 + "epoch": 17.846669459572684, + "grad_norm": 2.9599711894989014, + "learning_rate": 2.026881720430108e-05, + "loss": 6.3545, + "step": 42600 }, { - "epoch": 19.689987431922916, - "grad_norm": 4.78529691696167, - "learning_rate": 3.0318391286133225e-05, - "loss": 6.1368, - "step": 94000 + "epoch": 17.88856304985337, + "grad_norm": 2.8674137592315674, + "learning_rate": 2.0198994553833263e-05, + "loss": 6.3302, + "step": 42700 }, { - "epoch": 19.794721407624632, - "grad_norm": 3.4997754096984863, - "learning_rate": 3.0213657310431504e-05, - "loss": 6.1432, - "step": 94500 + "epoch": 17.93045664013406, + "grad_norm": 3.3227078914642334, + "learning_rate": 2.012917190336545e-05, + "loss": 6.3278, + "step": 42800 }, { - "epoch": 19.89945538332635, - "grad_norm": 4.723648548126221, - "learning_rate": 3.0108923334729787e-05, - "loss": 6.1263, - "step": 95000 + "epoch": 17.972350230414747, + "grad_norm": 3.080399751663208, + "learning_rate": 2.005934925289764e-05, + "loss": 6.3206, + "step": 42900 }, { - "epoch": 20.004189359028068, - "grad_norm": 3.930859088897705, - "learning_rate": 3.0004398826979474e-05, - "loss": 6.1387, - "step": 95500 + "epoch": 18.014243820695434, + "grad_norm": 4.004719257354736, + "learning_rate": 1.998952660242983e-05, + "loss": 6.3407, + "step": 43000 }, { - "epoch": 20.108923334729788, - "grad_norm": 4.286599159240723, - "learning_rate": 2.9899664851277754e-05, - "loss": 6.1187, - "step": 96000 + "epoch": 18.05613741097612, + "grad_norm": 2.8186423778533936, + "learning_rate": 1.991970395196202e-05, + "loss": 6.3136, + "step": 43100 }, { - "epoch": 20.213657310431504, - "grad_norm": 3.8475680351257324, - "learning_rate": 2.9794930875576037e-05, - "loss": 6.1312, - "step": 96500 + "epoch": 18.098031001256807, + "grad_norm": 2.81748104095459, + "learning_rate": 1.9849881301494204e-05, + "loss": 6.3353, + "step": 43200 }, { - "epoch": 20.31839128613322, - "grad_norm": 4.844906806945801, - "learning_rate": 2.9690196899874323e-05, - "loss": 6.1457, - "step": 97000 + "epoch": 18.139924591537493, + "grad_norm": 2.9991416931152344, + "learning_rate": 1.9780058651026395e-05, + "loss": 6.3194, + "step": 43300 }, { - "epoch": 20.42312526183494, - "grad_norm": 4.691315174102783, - "learning_rate": 2.958567239212401e-05, - "loss": 6.1351, - "step": 97500 + "epoch": 18.181818181818183, + "grad_norm": 3.4876794815063477, + "learning_rate": 1.9710236000558583e-05, + "loss": 6.3293, + "step": 43400 }, { - "epoch": 20.527859237536656, - "grad_norm": 6.15250825881958, - "learning_rate": 2.9480938416422286e-05, - "loss": 6.1275, - "step": 98000 + "epoch": 18.22371177209887, + "grad_norm": 
3.0756711959838867, + "learning_rate": 1.964041335009077e-05, + "loss": 6.341, + "step": 43500 }, { - "epoch": 20.632593213238373, - "grad_norm": 3.8872599601745605, - "learning_rate": 2.9376204440720573e-05, - "loss": 6.1275, - "step": 98500 + "epoch": 18.265605362379556, + "grad_norm": 3.171670436859131, + "learning_rate": 1.9570590699622958e-05, + "loss": 6.3075, + "step": 43600 }, { - "epoch": 20.737327188940093, - "grad_norm": 4.541051864624023, - "learning_rate": 2.9271470465018852e-05, - "loss": 6.1472, - "step": 99000 + "epoch": 18.307498952660243, + "grad_norm": 3.3317439556121826, + "learning_rate": 1.9500768049155145e-05, + "loss": 6.3436, + "step": 43700 }, { - "epoch": 20.84206116464181, - "grad_norm": 4.556408405303955, - "learning_rate": 2.9166736489317135e-05, - "loss": 6.1369, - "step": 99500 + "epoch": 18.34939254294093, + "grad_norm": 2.924349308013916, + "learning_rate": 1.9430945398687336e-05, + "loss": 6.3217, + "step": 43800 }, { - "epoch": 20.94679514034353, - "grad_norm": 4.5567498207092285, - "learning_rate": 2.9062211981566822e-05, - "loss": 6.1148, - "step": 100000 + "epoch": 18.391286133221616, + "grad_norm": 3.247955560684204, + "learning_rate": 1.9361122748219524e-05, + "loss": 6.3324, + "step": 43900 }, { - "epoch": 21.051529116045245, - "grad_norm": 4.647518634796143, - "learning_rate": 2.8957478005865102e-05, - "loss": 6.1281, - "step": 100500 + "epoch": 18.433179723502302, + "grad_norm": 3.340263843536377, + "learning_rate": 1.929130009775171e-05, + "loss": 6.2993, + "step": 44000 }, { - "epoch": 21.15626309174696, - "grad_norm": 4.372421741485596, - "learning_rate": 2.8852744030163388e-05, - "loss": 6.1226, - "step": 101000 + "epoch": 18.475073313782993, + "grad_norm": 2.973019599914551, + "learning_rate": 1.92214774472839e-05, + "loss": 6.3292, + "step": 44100 }, { - "epoch": 21.26099706744868, - "grad_norm": 4.270533084869385, - "learning_rate": 2.874801005446167e-05, - "loss": 6.123, - "step": 101500 + "epoch": 18.51696690406368, + "grad_norm": 3.5055582523345947, + "learning_rate": 1.9151654796816086e-05, + "loss": 6.3175, + "step": 44200 }, { - "epoch": 21.365731043150397, - "grad_norm": 3.5596370697021484, - "learning_rate": 2.8643485546711358e-05, - "loss": 6.1402, - "step": 102000 + "epoch": 18.558860494344366, + "grad_norm": 2.9543776512145996, + "learning_rate": 1.9081832146348277e-05, + "loss": 6.3206, + "step": 44300 }, { - "epoch": 21.470465018852117, - "grad_norm": 5.230384826660156, - "learning_rate": 2.8538751571009638e-05, - "loss": 6.1199, - "step": 102500 + "epoch": 18.600754084625052, + "grad_norm": 2.790940284729004, + "learning_rate": 1.9012009495880465e-05, + "loss": 6.3383, + "step": 44400 }, { - "epoch": 21.575198994553833, - "grad_norm": 3.9881417751312256, - "learning_rate": 2.843401759530792e-05, - "loss": 6.1244, - "step": 103000 + "epoch": 18.64264767490574, + "grad_norm": 3.419908285140991, + "learning_rate": 1.8942186845412653e-05, + "loss": 6.3329, + "step": 44500 }, { - "epoch": 21.67993297025555, - "grad_norm": 4.617568016052246, - "learning_rate": 2.83292836196062e-05, - "loss": 6.1154, - "step": 103500 + "epoch": 18.684541265186425, + "grad_norm": 3.3396215438842773, + "learning_rate": 1.887236419494484e-05, + "loss": 6.312, + "step": 44600 }, { - "epoch": 21.78466694595727, - "grad_norm": 4.641009330749512, - "learning_rate": 2.8224759111855887e-05, - "loss": 6.113, - "step": 104000 + "epoch": 18.726434855467115, + "grad_norm": 2.6713643074035645, + "learning_rate": 1.8802541544477028e-05, + "loss": 6.315, + "step": 
44700 }, { - "epoch": 21.889400921658986, - "grad_norm": 4.005772113800049, - "learning_rate": 2.812002513615417e-05, - "loss": 6.1163, - "step": 104500 + "epoch": 18.7683284457478, + "grad_norm": 3.2764880657196045, + "learning_rate": 1.8733417120513895e-05, + "loss": 6.3311, + "step": 44800 }, { - "epoch": 21.994134897360702, - "grad_norm": 4.2611799240112305, - "learning_rate": 2.801529116045245e-05, - "loss": 6.1023, - "step": 105000 + "epoch": 18.810222036028488, + "grad_norm": 3.602581739425659, + "learning_rate": 1.8663594470046083e-05, + "loss": 6.327, + "step": 44900 }, { - "epoch": 22.098868873062422, - "grad_norm": 4.568357467651367, - "learning_rate": 2.7910557184750736e-05, - "loss": 6.1169, - "step": 105500 + "epoch": 18.852115626309175, + "grad_norm": 3.052971124649048, + "learning_rate": 1.8593771819578274e-05, + "loss": 6.2911, + "step": 45000 }, { - "epoch": 22.203602848764138, - "grad_norm": 4.323103427886963, - "learning_rate": 2.780603267700042e-05, - "loss": 6.1226, - "step": 106000 + "epoch": 18.89400921658986, + "grad_norm": 3.0912699699401855, + "learning_rate": 1.852394916911046e-05, + "loss": 6.3057, + "step": 45100 }, { - "epoch": 22.308336824465858, - "grad_norm": 4.507444381713867, - "learning_rate": 2.77012987012987e-05, - "loss": 6.1052, - "step": 106500 + "epoch": 18.935902806870548, + "grad_norm": 2.631545305252075, + "learning_rate": 1.845412651864265e-05, + "loss": 6.3381, + "step": 45200 }, { - "epoch": 22.413070800167574, - "grad_norm": 4.301244735717773, - "learning_rate": 2.7596564725596985e-05, - "loss": 6.0944, - "step": 107000 + "epoch": 18.977796397151234, + "grad_norm": 3.8213324546813965, + "learning_rate": 1.8384303868174836e-05, + "loss": 6.3123, + "step": 45300 }, { - "epoch": 22.51780477586929, - "grad_norm": 4.984853267669678, - "learning_rate": 2.749183074989527e-05, - "loss": 6.1125, - "step": 107500 + "epoch": 19.019689987431924, + "grad_norm": 3.3717353343963623, + "learning_rate": 1.8314481217707024e-05, + "loss": 6.3194, + "step": 45400 }, { - "epoch": 22.62253875157101, - "grad_norm": 4.682931423187256, - "learning_rate": 2.7387096774193548e-05, - "loss": 6.1158, - "step": 108000 + "epoch": 19.06158357771261, + "grad_norm": 2.831409215927124, + "learning_rate": 1.8244658567239215e-05, + "loss": 6.3383, + "step": 45500 }, { - "epoch": 22.727272727272727, - "grad_norm": 4.494015693664551, - "learning_rate": 2.7282572266443235e-05, - "loss": 6.1035, - "step": 108500 + "epoch": 19.103477167993297, + "grad_norm": 2.915093183517456, + "learning_rate": 1.8174835916771403e-05, + "loss": 6.3208, + "step": 45600 }, { - "epoch": 22.832006702974446, - "grad_norm": 3.880779981613159, - "learning_rate": 2.717783829074152e-05, - "loss": 6.1084, - "step": 109000 + "epoch": 19.145370758273984, + "grad_norm": 3.1236917972564697, + "learning_rate": 1.810501326630359e-05, + "loss": 6.3089, + "step": 45700 }, { - "epoch": 22.936740678676163, - "grad_norm": 4.154653072357178, - "learning_rate": 2.7073104315039798e-05, - "loss": 6.0945, - "step": 109500 + "epoch": 19.18726434855467, + "grad_norm": 3.2876298427581787, + "learning_rate": 1.8035190615835778e-05, + "loss": 6.2975, + "step": 45800 }, { - "epoch": 23.04147465437788, - "grad_norm": 5.443271160125732, - "learning_rate": 2.6968370339338084e-05, - "loss": 6.1016, - "step": 110000 + "epoch": 19.229157938835357, + "grad_norm": 2.6437103748321533, + "learning_rate": 1.7965367965367965e-05, + "loss": 6.3341, + "step": 45900 }, { - "epoch": 23.1462086300796, - "grad_norm": 4.298133373260498, - 
"learning_rate": 2.686384583158777e-05, - "loss": 6.103, - "step": 110500 + "epoch": 19.271051529116047, + "grad_norm": 2.9252028465270996, + "learning_rate": 1.7895545314900156e-05, + "loss": 6.3404, + "step": 46000 }, { - "epoch": 23.250942605781315, - "grad_norm": 4.379884243011475, - "learning_rate": 2.675911185588605e-05, - "loss": 6.0814, - "step": 111000 + "epoch": 19.312945119396733, + "grad_norm": 3.4250340461730957, + "learning_rate": 1.7825722664432344e-05, + "loss": 6.3072, + "step": 46100 }, { - "epoch": 23.35567658148303, - "grad_norm": 6.175398349761963, - "learning_rate": 2.6654377880184333e-05, - "loss": 6.1088, - "step": 111500 + "epoch": 19.35483870967742, + "grad_norm": 3.1287946701049805, + "learning_rate": 1.775590001396453e-05, + "loss": 6.3022, + "step": 46200 }, { - "epoch": 23.46041055718475, - "grad_norm": 4.121715068817139, - "learning_rate": 2.6549643904482613e-05, - "loss": 6.0966, - "step": 112000 + "epoch": 19.396732299958106, + "grad_norm": 3.4577419757843018, + "learning_rate": 1.76867755900014e-05, + "loss": 6.2938, + "step": 46300 }, { - "epoch": 23.565144532886467, - "grad_norm": 5.040287494659424, - "learning_rate": 2.6444909928780896e-05, - "loss": 6.098, - "step": 112500 + "epoch": 19.438625890238793, + "grad_norm": 3.7131240367889404, + "learning_rate": 1.7616952939533586e-05, + "loss": 6.3088, + "step": 46400 }, { - "epoch": 23.669878508588187, - "grad_norm": 4.766879081726074, - "learning_rate": 2.6340175953079182e-05, - "loss": 6.0957, - "step": 113000 + "epoch": 19.48051948051948, + "grad_norm": 3.6799802780151367, + "learning_rate": 1.7547130289065774e-05, + "loss": 6.3326, + "step": 46500 }, { - "epoch": 23.774612484289904, - "grad_norm": 5.87930965423584, - "learning_rate": 2.623565144532887e-05, - "loss": 6.1089, - "step": 113500 + "epoch": 19.522413070800166, + "grad_norm": 2.834351062774658, + "learning_rate": 1.747730763859796e-05, + "loss": 6.2952, + "step": 46600 }, { - "epoch": 23.87934645999162, - "grad_norm": 5.318653583526611, - "learning_rate": 2.613091746962715e-05, - "loss": 6.0761, - "step": 114000 + "epoch": 19.564306661080856, + "grad_norm": 3.0629451274871826, + "learning_rate": 1.7407484988130152e-05, + "loss": 6.3185, + "step": 46700 }, { - "epoch": 23.98408043569334, - "grad_norm": 4.465319633483887, - "learning_rate": 2.6026183493925432e-05, - "loss": 6.0826, - "step": 114500 + "epoch": 19.606200251361543, + "grad_norm": 3.4801712036132812, + "learning_rate": 1.733766233766234e-05, + "loss": 6.3003, + "step": 46800 }, { - "epoch": 24.088814411395056, - "grad_norm": 4.640571594238281, - "learning_rate": 2.592144951822371e-05, - "loss": 6.0805, - "step": 115000 + "epoch": 19.64809384164223, + "grad_norm": 2.8250389099121094, + "learning_rate": 1.7267839687194524e-05, + "loss": 6.3033, + "step": 46900 }, { - "epoch": 24.193548387096776, - "grad_norm": 4.252554416656494, - "learning_rate": 2.5816925010473398e-05, - "loss": 6.0701, - "step": 115500 + "epoch": 19.689987431922916, + "grad_norm": 3.5964672565460205, + "learning_rate": 1.7198017036726715e-05, + "loss": 6.293, + "step": 47000 }, { - "epoch": 24.298282362798492, - "grad_norm": 4.704644203186035, - "learning_rate": 2.571219103477168e-05, - "loss": 6.0936, - "step": 116000 + "epoch": 19.731881022203602, + "grad_norm": 2.7947146892547607, + "learning_rate": 1.7128194386258903e-05, + "loss": 6.2884, + "step": 47100 }, { - "epoch": 24.40301633850021, - "grad_norm": 4.601324558258057, - "learning_rate": 2.560745705906996e-05, - "loss": 6.0758, - "step": 116500 + "epoch": 
19.77377461248429, + "grad_norm": 3.0473551750183105, + "learning_rate": 1.7058371735791094e-05, + "loss": 6.312, + "step": 47200 }, { - "epoch": 24.507750314201928, - "grad_norm": 4.380444526672363, - "learning_rate": 2.5502723083368247e-05, - "loss": 6.0871, - "step": 117000 + "epoch": 19.81566820276498, + "grad_norm": 3.1810736656188965, + "learning_rate": 1.698854908532328e-05, + "loss": 6.3102, + "step": 47300 }, { - "epoch": 24.612484289903644, - "grad_norm": 4.119806289672852, - "learning_rate": 2.5397989107666527e-05, - "loss": 6.102, - "step": 117500 + "epoch": 19.857561793045665, + "grad_norm": 3.0046746730804443, + "learning_rate": 1.6918726434855465e-05, + "loss": 6.3115, + "step": 47400 }, { - "epoch": 24.71721826560536, - "grad_norm": 3.9712698459625244, - "learning_rate": 2.5293464599916217e-05, - "loss": 6.0999, - "step": 118000 + "epoch": 19.89945538332635, + "grad_norm": 2.6985220909118652, + "learning_rate": 1.6848903784387656e-05, + "loss": 6.3132, + "step": 47500 }, { - "epoch": 24.82195224130708, - "grad_norm": 5.146612167358398, - "learning_rate": 2.5188730624214497e-05, - "loss": 6.0947, - "step": 118500 + "epoch": 19.941348973607038, + "grad_norm": 2.958906650543213, + "learning_rate": 1.6779081133919844e-05, + "loss": 6.3024, + "step": 47600 }, { - "epoch": 24.926686217008797, - "grad_norm": 4.406741142272949, - "learning_rate": 2.508399664851278e-05, - "loss": 6.0829, - "step": 119000 + "epoch": 19.983242563887725, + "grad_norm": 3.5484089851379395, + "learning_rate": 1.6709258483452035e-05, + "loss": 6.2989, + "step": 47700 }, { - "epoch": 25.031420192710517, - "grad_norm": 5.4739766120910645, - "learning_rate": 2.497926267281106e-05, - "loss": 6.0598, - "step": 119500 + "epoch": 20.02513615416841, + "grad_norm": 4.328272342681885, + "learning_rate": 1.6639435832984222e-05, + "loss": 6.3162, + "step": 47800 }, { - "epoch": 25.136154168412233, - "grad_norm": 4.6231794357299805, - "learning_rate": 2.4874738165060746e-05, - "loss": 6.072, - "step": 120000 + "epoch": 20.067029744449098, + "grad_norm": 3.0396926403045654, + "learning_rate": 1.6569613182516407e-05, + "loss": 6.3004, + "step": 47900 }, { - "epoch": 25.24088814411395, - "grad_norm": 4.47750186920166, - "learning_rate": 2.477000418935903e-05, - "loss": 6.0664, - "step": 120500 + "epoch": 20.108923334729788, + "grad_norm": 3.328972339630127, + "learning_rate": 1.6499790532048598e-05, + "loss": 6.2855, + "step": 48000 }, { - "epoch": 25.34562211981567, - "grad_norm": 5.023014068603516, - "learning_rate": 2.4665270213657312e-05, - "loss": 6.0894, - "step": 121000 + "epoch": 20.150816925010474, + "grad_norm": 3.301114320755005, + "learning_rate": 1.6429967881580785e-05, + "loss": 6.2874, + "step": 48100 }, { - "epoch": 25.450356095517385, - "grad_norm": 5.687644004821777, - "learning_rate": 2.4560536237955595e-05, - "loss": 6.0936, - "step": 121500 + "epoch": 20.19271051529116, + "grad_norm": 3.297041177749634, + "learning_rate": 1.6360145231112976e-05, + "loss": 6.3089, + "step": 48200 }, { - "epoch": 25.555090071219105, - "grad_norm": 4.534958362579346, - "learning_rate": 2.4455802262253878e-05, - "loss": 6.0794, - "step": 122000 + "epoch": 20.234604105571847, + "grad_norm": 2.9122605323791504, + "learning_rate": 1.6290322580645164e-05, + "loss": 6.3157, + "step": 48300 }, { - "epoch": 25.65982404692082, - "grad_norm": 5.563751697540283, - "learning_rate": 2.435127775450356e-05, - "loss": 6.081, - "step": 122500 + "epoch": 20.276497695852534, + "grad_norm": 2.8182084560394287, + "learning_rate": 
1.6220499930177348e-05, + "loss": 6.3118, + "step": 48400 }, { - "epoch": 25.764558022622538, - "grad_norm": 4.613626956939697, - "learning_rate": 2.4246543778801845e-05, - "loss": 6.0982, - "step": 123000 + "epoch": 20.31839128613322, + "grad_norm": 3.8560192584991455, + "learning_rate": 1.615067727970954e-05, + "loss": 6.2858, + "step": 48500 }, { - "epoch": 25.869291998324258, - "grad_norm": 4.645818710327148, - "learning_rate": 2.4141809803100128e-05, - "loss": 6.0665, - "step": 123500 + "epoch": 20.36028487641391, + "grad_norm": 2.457240581512451, + "learning_rate": 1.6080854629241726e-05, + "loss": 6.3077, + "step": 48600 }, { - "epoch": 25.974025974025974, - "grad_norm": 4.9156928062438965, - "learning_rate": 2.4037075827398407e-05, - "loss": 6.0674, - "step": 124000 + "epoch": 20.402178466694597, + "grad_norm": 3.5376362800598145, + "learning_rate": 1.6011031978773917e-05, + "loss": 6.2892, + "step": 48700 }, { - "epoch": 26.07875994972769, - "grad_norm": 4.7342305183410645, - "learning_rate": 2.3932551319648094e-05, - "loss": 6.0741, - "step": 124500 + "epoch": 20.444072056975283, + "grad_norm": 3.3489222526550293, + "learning_rate": 1.59412093283061e-05, + "loss": 6.2973, + "step": 48800 }, { - "epoch": 26.18349392542941, - "grad_norm": 4.607081413269043, - "learning_rate": 2.3827817343946377e-05, - "loss": 6.0626, - "step": 125000 + "epoch": 20.48596564725597, + "grad_norm": 3.600166082382202, + "learning_rate": 1.587138667783829e-05, + "loss": 6.31, + "step": 48900 }, { - "epoch": 26.288227901131126, - "grad_norm": 4.820442199707031, - "learning_rate": 2.372308336824466e-05, - "loss": 6.0936, - "step": 125500 + "epoch": 20.527859237536656, + "grad_norm": 3.255598783493042, + "learning_rate": 1.580156402737048e-05, + "loss": 6.2389, + "step": 49000 }, { - "epoch": 26.392961876832846, - "grad_norm": 4.549975395202637, - "learning_rate": 2.3618349392542943e-05, - "loss": 6.0804, - "step": 126000 + "epoch": 20.569752827817343, + "grad_norm": 3.166994094848633, + "learning_rate": 1.5731741376902668e-05, + "loss": 6.303, + "step": 49100 }, { - "epoch": 26.497695852534562, - "grad_norm": 4.722150802612305, - "learning_rate": 2.351382488479263e-05, - "loss": 6.0555, - "step": 126500 + "epoch": 20.61164641809803, + "grad_norm": 3.615269184112549, + "learning_rate": 1.566191872643486e-05, + "loss": 6.281, + "step": 49200 }, { - "epoch": 26.60242982823628, - "grad_norm": 4.2948408126831055, - "learning_rate": 2.340909090909091e-05, - "loss": 6.0626, - "step": 127000 + "epoch": 20.65354000837872, + "grad_norm": 3.1495063304901123, + "learning_rate": 1.5592096075967043e-05, + "loss": 6.2666, + "step": 49300 }, { - "epoch": 26.707163803938, - "grad_norm": 4.246878623962402, - "learning_rate": 2.3304356933389193e-05, - "loss": 6.0577, - "step": 127500 + "epoch": 20.695433598659406, + "grad_norm": 2.9170730113983154, + "learning_rate": 1.552227342549923e-05, + "loss": 6.2738, + "step": 49400 }, { - "epoch": 26.811897779639715, - "grad_norm": 4.165809154510498, - "learning_rate": 2.3199622957687476e-05, - "loss": 6.0608, - "step": 128000 + "epoch": 20.737327188940093, + "grad_norm": 3.0922224521636963, + "learning_rate": 1.545245077503142e-05, + "loss": 6.2805, + "step": 49500 }, { - "epoch": 26.916631755341434, - "grad_norm": 4.3159894943237305, - "learning_rate": 2.309488898198576e-05, - "loss": 6.0806, - "step": 128500 + "epoch": 20.77922077922078, + "grad_norm": 3.088012933731079, + "learning_rate": 1.538262812456361e-05, + "loss": 6.2906, + "step": 49600 }, { - "epoch": 27.02136573104315, 
- "grad_norm": 4.15300989151001, - "learning_rate": 2.2990364474235442e-05, - "loss": 6.0654, - "step": 129000 + "epoch": 20.821114369501466, + "grad_norm": 2.939486503601074, + "learning_rate": 1.53128054740958e-05, + "loss": 6.2636, + "step": 49700 }, { - "epoch": 27.126099706744867, - "grad_norm": 4.730154991149902, - "learning_rate": 2.2885630498533725e-05, - "loss": 6.0567, - "step": 129500 + "epoch": 20.863007959782152, + "grad_norm": 3.597949743270874, + "learning_rate": 1.5242982823627986e-05, + "loss": 6.2745, + "step": 49800 }, { - "epoch": 27.230833682446587, - "grad_norm": 4.300974369049072, - "learning_rate": 2.2780896522832008e-05, - "loss": 6.0449, - "step": 130000 + "epoch": 20.90490155006284, + "grad_norm": 3.4760777950286865, + "learning_rate": 1.5173160173160175e-05, + "loss": 6.2702, + "step": 49900 }, { - "epoch": 27.335567658148303, - "grad_norm": 4.148283958435059, - "learning_rate": 2.2676162547130288e-05, - "loss": 6.0491, - "step": 130500 + "epoch": 20.94679514034353, + "grad_norm": 3.04856014251709, + "learning_rate": 1.5103337522692362e-05, + "loss": 6.2841, + "step": 50000 }, { - "epoch": 27.44030163385002, - "grad_norm": 4.9924421310424805, - "learning_rate": 2.2571638039379974e-05, - "loss": 6.0491, - "step": 131000 + "epoch": 20.988688730624215, + "grad_norm": 2.849895477294922, + "learning_rate": 1.503351487222455e-05, + "loss": 6.2814, + "step": 50100 }, { - "epoch": 27.54503560955174, - "grad_norm": 5.713706016540527, - "learning_rate": 2.246690406367826e-05, - "loss": 6.0396, - "step": 131500 + "epoch": 21.0305823209049, + "grad_norm": 3.1246280670166016, + "learning_rate": 1.496369222175674e-05, + "loss": 6.2754, + "step": 50200 }, { - "epoch": 27.649769585253456, - "grad_norm": 5.007369518280029, - "learning_rate": 2.236217008797654e-05, - "loss": 6.0378, - "step": 132000 + "epoch": 21.072475911185588, + "grad_norm": 3.303846836090088, + "learning_rate": 1.4894567797793605e-05, + "loss": 6.2661, + "step": 50300 }, { - "epoch": 27.754503560955175, - "grad_norm": 4.500640392303467, - "learning_rate": 2.2257436112274823e-05, - "loss": 6.0313, - "step": 132500 + "epoch": 21.114369501466275, + "grad_norm": 3.5818755626678467, + "learning_rate": 1.4824745147325794e-05, + "loss": 6.2804, + "step": 50400 }, { - "epoch": 27.85923753665689, - "grad_norm": 4.709275722503662, - "learning_rate": 2.2152702136573107e-05, - "loss": 6.0278, - "step": 133000 + "epoch": 21.15626309174696, + "grad_norm": 3.0695786476135254, + "learning_rate": 1.4754922496857982e-05, + "loss": 6.284, + "step": 50500 }, { - "epoch": 27.963971512358608, - "grad_norm": 4.891386032104492, - "learning_rate": 2.204817762882279e-05, - "loss": 6.0267, - "step": 133500 + "epoch": 21.19815668202765, + "grad_norm": 3.6067614555358887, + "learning_rate": 1.4685099846390168e-05, + "loss": 6.2863, + "step": 50600 }, { - "epoch": 28.068705488060328, - "grad_norm": 4.82666540145874, - "learning_rate": 2.1943443653121073e-05, - "loss": 5.986, - "step": 134000 + "epoch": 21.240050272308338, + "grad_norm": 3.2230417728424072, + "learning_rate": 1.4615277195922359e-05, + "loss": 6.287, + "step": 50700 }, { - "epoch": 28.173439463762044, - "grad_norm": 4.489607810974121, - "learning_rate": 2.1838709677419356e-05, - "loss": 6.0209, - "step": 134500 + "epoch": 21.281943862589024, + "grad_norm": 3.059466600418091, + "learning_rate": 1.4545454545454545e-05, + "loss": 6.2442, + "step": 50800 }, { - "epoch": 28.278173439463764, - "grad_norm": 4.719301700592041, - "learning_rate": 2.173397570171764e-05, - "loss": 
5.9998, - "step": 135000 + "epoch": 21.32383745286971, + "grad_norm": 3.7770040035247803, + "learning_rate": 1.4475631894986736e-05, + "loss": 6.2612, + "step": 50900 }, { - "epoch": 28.38290741516548, - "grad_norm": 5.639565467834473, - "learning_rate": 2.1629451193967322e-05, - "loss": 5.9881, - "step": 135500 + "epoch": 21.365731043150397, + "grad_norm": 3.3269879817962646, + "learning_rate": 1.4405809244518923e-05, + "loss": 6.2985, + "step": 51000 }, { - "epoch": 28.487641390867196, - "grad_norm": 4.745512008666992, - "learning_rate": 2.1524717218265605e-05, - "loss": 6.0002, - "step": 136000 + "epoch": 21.407624633431084, + "grad_norm": 2.649940252304077, + "learning_rate": 1.4335986594051109e-05, + "loss": 6.2343, + "step": 51100 }, { - "epoch": 28.592375366568916, - "grad_norm": 5.661725997924805, - "learning_rate": 2.141998324256389e-05, - "loss": 5.9967, - "step": 136500 + "epoch": 21.44951822371177, + "grad_norm": 3.4042983055114746, + "learning_rate": 1.42661639435833e-05, + "loss": 6.2701, + "step": 51200 }, { - "epoch": 28.697109342270632, - "grad_norm": 5.3391194343566895, - "learning_rate": 2.131524926686217e-05, - "loss": 6.0024, - "step": 137000 + "epoch": 21.49141181399246, + "grad_norm": 3.1958000659942627, + "learning_rate": 1.4196341293115486e-05, + "loss": 6.2866, + "step": 51300 }, { - "epoch": 28.80184331797235, - "grad_norm": 5.1614089012146, - "learning_rate": 2.1210724759111858e-05, - "loss": 5.9992, - "step": 137500 + "epoch": 21.533305404273147, + "grad_norm": 3.6010313034057617, + "learning_rate": 1.4126518642647677e-05, + "loss": 6.2683, + "step": 51400 }, { - "epoch": 28.90657729367407, - "grad_norm": 5.429248332977295, - "learning_rate": 2.110599078341014e-05, - "loss": 5.9929, - "step": 138000 + "epoch": 21.575198994553833, + "grad_norm": 3.429414749145508, + "learning_rate": 1.4056695992179864e-05, + "loss": 6.2408, + "step": 51500 }, { - "epoch": 29.011311269375785, - "grad_norm": 5.1270012855529785, - "learning_rate": 2.100125680770842e-05, - "loss": 5.9883, - "step": 138500 + "epoch": 21.61709258483452, + "grad_norm": 3.069561004638672, + "learning_rate": 1.3986873341712054e-05, + "loss": 6.2641, + "step": 51600 }, { - "epoch": 29.116045245077505, - "grad_norm": 5.027891159057617, - "learning_rate": 2.0896522832006704e-05, - "loss": 5.963, - "step": 139000 + "epoch": 21.658986175115206, + "grad_norm": 3.575247287750244, + "learning_rate": 1.3917050691244241e-05, + "loss": 6.2722, + "step": 51700 }, { - "epoch": 29.22077922077922, - "grad_norm": 5.712099552154541, - "learning_rate": 2.079199832425639e-05, - "loss": 5.9811, - "step": 139500 + "epoch": 21.700879765395893, + "grad_norm": 3.033505439758301, + "learning_rate": 1.3847228040776427e-05, + "loss": 6.2424, + "step": 51800 }, { - "epoch": 29.325513196480937, - "grad_norm": 4.954220294952393, - "learning_rate": 2.0687264348554674e-05, - "loss": 5.9808, - "step": 140000 + "epoch": 21.742773355676583, + "grad_norm": 3.287740707397461, + "learning_rate": 1.3777405390308618e-05, + "loss": 6.2516, + "step": 51900 }, { - "epoch": 29.430247172182657, - "grad_norm": 5.713419437408447, - "learning_rate": 2.0582530372852953e-05, - "loss": 5.9887, - "step": 140500 + "epoch": 21.78466694595727, + "grad_norm": 3.0363028049468994, + "learning_rate": 1.3707582739840804e-05, + "loss": 6.2641, + "step": 52000 }, { - "epoch": 29.534981147884373, - "grad_norm": 4.683711528778076, - "learning_rate": 2.0477796397151236e-05, - "loss": 5.9612, - "step": 141000 + "epoch": 21.826560536237956, + "grad_norm": 
3.1549689769744873, + "learning_rate": 1.3637760089372995e-05, + "loss": 6.2335, + "step": 52100 }, { - "epoch": 29.63971512358609, - "grad_norm": 5.164538383483887, - "learning_rate": 2.0373271889400923e-05, - "loss": 5.993, - "step": 141500 + "epoch": 21.868454126518643, + "grad_norm": 3.8512282371520996, + "learning_rate": 1.3567937438905182e-05, + "loss": 6.2729, + "step": 52200 }, { - "epoch": 29.74444909928781, - "grad_norm": 5.386078357696533, - "learning_rate": 2.0268537913699203e-05, - "loss": 5.9735, - "step": 142000 + "epoch": 21.91034771679933, + "grad_norm": 4.0751824378967285, + "learning_rate": 1.3498813014942047e-05, + "loss": 6.2397, + "step": 52300 }, { - "epoch": 29.849183074989526, - "grad_norm": 4.4406418800354, - "learning_rate": 2.016380393799749e-05, - "loss": 5.9672, - "step": 142500 + "epoch": 21.952241307080016, + "grad_norm": 3.375235080718994, + "learning_rate": 1.3428990364474236e-05, + "loss": 6.2316, + "step": 52400 }, { - "epoch": 29.953917050691246, - "grad_norm": 5.029815673828125, - "learning_rate": 2.0059069962295772e-05, - "loss": 5.961, - "step": 143000 + "epoch": 21.994134897360702, + "grad_norm": 3.093156337738037, + "learning_rate": 1.3359167714006423e-05, + "loss": 6.2468, + "step": 52500 }, { - "epoch": 30.058651026392962, - "grad_norm": 4.666591167449951, - "learning_rate": 1.9954335986594052e-05, - "loss": 5.9505, - "step": 143500 + "epoch": 22.036028487641392, + "grad_norm": 3.729182243347168, + "learning_rate": 1.3289345063538614e-05, + "loss": 6.2366, + "step": 52600 }, { - "epoch": 30.163385002094678, - "grad_norm": 6.975547790527344, - "learning_rate": 1.9849602010892335e-05, - "loss": 5.956, - "step": 144000 + "epoch": 22.07792207792208, + "grad_norm": 3.4075732231140137, + "learning_rate": 1.32195224130708e-05, + "loss": 6.2693, + "step": 52700 }, { - "epoch": 30.268118977796398, - "grad_norm": 4.687684535980225, - "learning_rate": 1.974507750314202e-05, - "loss": 5.9475, - "step": 144500 + "epoch": 22.119815668202765, + "grad_norm": 2.9553005695343018, + "learning_rate": 1.3149699762602988e-05, + "loss": 6.2592, + "step": 52800 }, { - "epoch": 30.372852953498114, - "grad_norm": 5.594231605529785, - "learning_rate": 1.96403435274403e-05, - "loss": 5.9496, - "step": 145000 + "epoch": 22.16170925848345, + "grad_norm": 3.094538688659668, + "learning_rate": 1.3079877112135177e-05, + "loss": 6.26, + "step": 52900 }, { - "epoch": 30.477586929199834, - "grad_norm": 4.879722595214844, - "learning_rate": 1.9535609551738584e-05, - "loss": 5.9577, - "step": 145500 + "epoch": 22.203602848764138, + "grad_norm": 3.907914161682129, + "learning_rate": 1.3010054461667365e-05, + "loss": 6.2711, + "step": 53000 }, { - "epoch": 30.58232090490155, - "grad_norm": 5.470447540283203, - "learning_rate": 1.9430875576036867e-05, - "loss": 5.9672, - "step": 146000 + "epoch": 22.245496439044825, + "grad_norm": 3.7182159423828125, + "learning_rate": 1.2940231811199554e-05, + "loss": 6.2713, + "step": 53100 }, { - "epoch": 30.687054880603267, - "grad_norm": 5.818385124206543, - "learning_rate": 1.932614160033515e-05, - "loss": 5.9501, - "step": 146500 + "epoch": 22.287390029325515, + "grad_norm": 2.8652303218841553, + "learning_rate": 1.2870409160731741e-05, + "loss": 6.2325, + "step": 53200 }, { - "epoch": 30.791788856304986, - "grad_norm": 5.907487392425537, - "learning_rate": 1.9221617092584834e-05, - "loss": 5.9458, - "step": 147000 + "epoch": 22.3292836196062, + "grad_norm": 3.190359592437744, + "learning_rate": 1.2800586510263929e-05, + "loss": 6.2563, + "step": 
53300 }, { - "epoch": 30.896522832006703, - "grad_norm": 4.739224433898926, - "learning_rate": 1.911688311688312e-05, - "loss": 5.935, - "step": 147500 + "epoch": 22.371177209886888, + "grad_norm": 3.372394561767578, + "learning_rate": 1.2730763859796118e-05, + "loss": 6.2489, + "step": 53400 }, { - "epoch": 31.00125680770842, - "grad_norm": 4.57131814956665, - "learning_rate": 1.90121491411814e-05, - "loss": 5.945, - "step": 148000 + "epoch": 22.413070800167574, + "grad_norm": 3.340397596359253, + "learning_rate": 1.2660941209328306e-05, + "loss": 6.2147, + "step": 53500 }, { - "epoch": 31.10599078341014, - "grad_norm": 5.128586769104004, - "learning_rate": 1.8907415165479683e-05, - "loss": 5.9494, - "step": 148500 + "epoch": 22.45496439044826, + "grad_norm": 3.1127400398254395, + "learning_rate": 1.2591118558860495e-05, + "loss": 6.2588, + "step": 53600 }, { - "epoch": 31.210724759111855, - "grad_norm": 4.871676921844482, - "learning_rate": 1.880289065772937e-05, - "loss": 5.9415, - "step": 149000 + "epoch": 22.496857980728947, + "grad_norm": 4.315746307373047, + "learning_rate": 1.2521295908392683e-05, + "loss": 6.2641, + "step": 53700 }, { - "epoch": 31.315458734813575, - "grad_norm": 5.380068778991699, - "learning_rate": 1.8698156682027652e-05, - "loss": 5.939, - "step": 149500 + "epoch": 22.538751571009634, + "grad_norm": 3.204827070236206, + "learning_rate": 1.2451473257924872e-05, + "loss": 6.2506, + "step": 53800 }, { - "epoch": 31.42019271051529, - "grad_norm": 5.430812835693359, - "learning_rate": 1.8593422706325932e-05, - "loss": 5.9276, - "step": 150000 + "epoch": 22.580645161290324, + "grad_norm": 3.653074026107788, + "learning_rate": 1.238165060745706e-05, + "loss": 6.2512, + "step": 53900 }, { - "epoch": 31.524926686217007, - "grad_norm": 4.7710442543029785, - "learning_rate": 1.8488688730624215e-05, - "loss": 5.9413, - "step": 150500 + "epoch": 22.62253875157101, + "grad_norm": 3.8693697452545166, + "learning_rate": 1.2311827956989249e-05, + "loss": 6.2515, + "step": 54000 }, { - "epoch": 31.629660661918727, - "grad_norm": 5.183919906616211, - "learning_rate": 1.8383954754922498e-05, - "loss": 5.9257, - "step": 151000 + "epoch": 22.664432341851697, + "grad_norm": 3.9418985843658447, + "learning_rate": 1.2242005306521436e-05, + "loss": 6.2522, + "step": 54100 }, { - "epoch": 31.734394637620444, - "grad_norm": 4.851598739624023, - "learning_rate": 1.827943024717218e-05, - "loss": 5.9251, - "step": 151500 + "epoch": 22.706325932132383, + "grad_norm": 3.328951358795166, + "learning_rate": 1.2172182656053624e-05, + "loss": 6.2244, + "step": 54200 }, { - "epoch": 31.839128613322163, - "grad_norm": 4.835882663726807, - "learning_rate": 1.8174696271470464e-05, - "loss": 5.92, - "step": 152000 + "epoch": 22.74821952241307, + "grad_norm": 3.251552104949951, + "learning_rate": 1.210305823209049e-05, + "loss": 6.2413, + "step": 54300 }, { - "epoch": 31.94386258902388, - "grad_norm": 5.428823947906494, - "learning_rate": 1.8069962295768748e-05, - "loss": 5.9146, - "step": 152500 + "epoch": 22.790113112693756, + "grad_norm": 3.0756313800811768, + "learning_rate": 1.2033235581622679e-05, + "loss": 6.2343, + "step": 54400 }, { - "epoch": 32.0485965647256, - "grad_norm": 6.11329984664917, - "learning_rate": 1.796522832006703e-05, - "loss": 5.9179, - "step": 153000 + "epoch": 22.832006702974446, + "grad_norm": 3.174830913543701, + "learning_rate": 1.1963412931154867e-05, + "loss": 6.2445, + "step": 54500 }, { - "epoch": 32.15333054042731, - "grad_norm": 4.836859226226807, - "learning_rate": 
1.7860703812316717e-05, - "loss": 5.9189, - "step": 153500 + "epoch": 22.873900293255133, + "grad_norm": 2.831454038619995, + "learning_rate": 1.1893590280687056e-05, + "loss": 6.2457, + "step": 54600 }, { - "epoch": 32.25806451612903, - "grad_norm": 4.598475456237793, - "learning_rate": 1.7755969836615e-05, - "loss": 5.9207, - "step": 154000 + "epoch": 22.91579388353582, + "grad_norm": 3.3783247470855713, + "learning_rate": 1.1823767630219245e-05, + "loss": 6.2202, + "step": 54700 }, { - "epoch": 32.36279849183075, - "grad_norm": 4.638394832611084, - "learning_rate": 1.765123586091328e-05, - "loss": 5.915, - "step": 154500 + "epoch": 22.957687473816506, + "grad_norm": 3.4505226612091064, + "learning_rate": 1.1753944979751431e-05, + "loss": 6.2329, + "step": 54800 }, { - "epoch": 32.467532467532465, - "grad_norm": 5.637279987335205, - "learning_rate": 1.7546501885211563e-05, - "loss": 5.9228, - "step": 155000 + "epoch": 22.999581064097192, + "grad_norm": 4.203530311584473, + "learning_rate": 1.168412232928362e-05, + "loss": 6.2464, + "step": 54900 }, { - "epoch": 32.572266443234184, - "grad_norm": 4.516068458557129, - "learning_rate": 1.7441767909509846e-05, - "loss": 5.9322, - "step": 155500 + "epoch": 23.04147465437788, + "grad_norm": 3.295198678970337, + "learning_rate": 1.1614299678815808e-05, + "loss": 6.2163, + "step": 55000 }, { - "epoch": 32.677000418935904, - "grad_norm": 4.652084827423096, - "learning_rate": 1.7337243401759533e-05, - "loss": 5.9395, - "step": 156000 + "epoch": 23.083368244658566, + "grad_norm": 3.6795082092285156, + "learning_rate": 1.1544477028347997e-05, + "loss": 6.2108, + "step": 55100 }, { - "epoch": 32.78173439463762, - "grad_norm": 5.667607307434082, - "learning_rate": 1.7232509426057812e-05, - "loss": 5.9172, - "step": 156500 + "epoch": 23.125261834939256, + "grad_norm": 3.7577404975891113, + "learning_rate": 1.1474654377880186e-05, + "loss": 6.2406, + "step": 55200 }, { - "epoch": 32.88646837033934, - "grad_norm": 4.980391025543213, - "learning_rate": 1.7127775450356095e-05, - "loss": 5.934, - "step": 157000 + "epoch": 23.167155425219942, + "grad_norm": 4.524641036987305, + "learning_rate": 1.1404831727412372e-05, + "loss": 6.2449, + "step": 55300 }, { - "epoch": 32.99120234604106, - "grad_norm": 4.39646053314209, - "learning_rate": 1.702304147465438e-05, - "loss": 5.914, - "step": 157500 + "epoch": 23.20904901550063, + "grad_norm": 3.3049490451812744, + "learning_rate": 1.1335009076944561e-05, + "loss": 6.202, + "step": 55400 }, { - "epoch": 33.095936321742776, - "grad_norm": 5.263533115386963, - "learning_rate": 1.6918516966904062e-05, - "loss": 5.9094, - "step": 158000 + "epoch": 23.250942605781315, + "grad_norm": 3.6244115829467773, + "learning_rate": 1.1265186426476749e-05, + "loss": 6.2214, + "step": 55500 }, { - "epoch": 33.20067029744449, - "grad_norm": 4.661126136779785, - "learning_rate": 1.6813782991202348e-05, - "loss": 5.9189, - "step": 158500 + "epoch": 23.292836196062, + "grad_norm": 3.1158556938171387, + "learning_rate": 1.1195363776008938e-05, + "loss": 6.2247, + "step": 55600 }, { - "epoch": 33.30540427314621, - "grad_norm": 4.935306549072266, - "learning_rate": 1.670904901550063e-05, - "loss": 5.8944, - "step": 159000 + "epoch": 23.334729786342688, + "grad_norm": 3.208771228790283, + "learning_rate": 1.1125541125541126e-05, + "loss": 6.2416, + "step": 55700 }, { - "epoch": 33.41013824884793, - "grad_norm": 5.82065486907959, - "learning_rate": 1.660431503979891e-05, - "loss": 5.892, - "step": 159500 + "epoch": 23.376623376623378, + 
"grad_norm": 4.181106090545654, + "learning_rate": 1.1055718475073313e-05, + "loss": 6.2343, + "step": 55800 }, { - "epoch": 33.51487222454964, - "grad_norm": 4.6220927238464355, - "learning_rate": 1.6499581064097194e-05, - "loss": 5.9022, - "step": 160000 + "epoch": 23.418516966904065, + "grad_norm": 2.8972866535186768, + "learning_rate": 1.0985895824605503e-05, + "loss": 6.2186, + "step": 55900 }, { - "epoch": 33.61960620025136, - "grad_norm": 5.109046936035156, - "learning_rate": 1.639505655634688e-05, - "loss": 5.8933, - "step": 160500 + "epoch": 23.46041055718475, + "grad_norm": 3.1691384315490723, + "learning_rate": 1.091607317413769e-05, + "loss": 6.2328, + "step": 56000 }, { - "epoch": 33.72434017595308, - "grad_norm": 5.230437278747559, - "learning_rate": 1.6290322580645164e-05, - "loss": 5.9107, - "step": 161000 + "epoch": 23.502304147465438, + "grad_norm": 3.214346408843994, + "learning_rate": 1.084625052366988e-05, + "loss": 6.2356, + "step": 56100 }, { - "epoch": 33.829074151654794, - "grad_norm": 6.466080188751221, - "learning_rate": 1.6185588604943443e-05, - "loss": 5.9077, - "step": 161500 + "epoch": 23.544197737746124, + "grad_norm": 3.0547690391540527, + "learning_rate": 1.0776427873202067e-05, + "loss": 6.2245, + "step": 56200 }, { - "epoch": 33.933808127356514, - "grad_norm": 4.655428409576416, - "learning_rate": 1.6080854629241726e-05, - "loss": 5.9186, - "step": 162000 + "epoch": 23.58609132802681, + "grad_norm": 3.6090760231018066, + "learning_rate": 1.0707303449238935e-05, + "loss": 6.2634, + "step": 56300 }, { - "epoch": 34.038542103058234, - "grad_norm": 6.136354923248291, - "learning_rate": 1.597612065354001e-05, - "loss": 5.8815, - "step": 162500 + "epoch": 23.627984918307497, + "grad_norm": 3.210068702697754, + "learning_rate": 1.0637480798771122e-05, + "loss": 6.2126, + "step": 56400 }, { - "epoch": 34.143276078759946, - "grad_norm": 5.668376445770264, - "learning_rate": 1.587138667783829e-05, - "loss": 5.8872, - "step": 163000 + "epoch": 23.669878508588187, + "grad_norm": 3.872507095336914, + "learning_rate": 1.056765814830331e-05, + "loss": 6.2286, + "step": 56500 }, { - "epoch": 34.248010054461666, - "grad_norm": 5.579314708709717, - "learning_rate": 1.5766862170087976e-05, - "loss": 5.8966, - "step": 163500 + "epoch": 23.711772098868874, + "grad_norm": 4.503695011138916, + "learning_rate": 1.0497835497835499e-05, + "loss": 6.2156, + "step": 56600 }, { - "epoch": 34.352744030163386, - "grad_norm": 5.474893569946289, - "learning_rate": 1.5662128194386262e-05, - "loss": 5.8865, - "step": 164000 + "epoch": 23.75366568914956, + "grad_norm": 3.963315486907959, + "learning_rate": 1.0428012847367686e-05, + "loss": 6.2247, + "step": 56700 }, { - "epoch": 34.457478005865106, - "grad_norm": 4.853377342224121, - "learning_rate": 1.5557394218684542e-05, - "loss": 5.8718, - "step": 164500 + "epoch": 23.795559279430247, + "grad_norm": 3.4394917488098145, + "learning_rate": 1.0358190196899876e-05, + "loss": 6.234, + "step": 56800 }, { - "epoch": 34.56221198156682, - "grad_norm": 4.8684563636779785, - "learning_rate": 1.5452660242982825e-05, - "loss": 5.8896, - "step": 165000 + "epoch": 23.837452869710933, + "grad_norm": 3.403167724609375, + "learning_rate": 1.0288367546432063e-05, + "loss": 6.2045, + "step": 56900 }, { - "epoch": 34.66694595726854, - "grad_norm": 4.851233959197998, - "learning_rate": 1.5347926267281108e-05, - "loss": 5.8908, - "step": 165500 + "epoch": 23.87934645999162, + "grad_norm": 2.8274378776550293, + "learning_rate": 1.0218544895964251e-05, + 
"loss": 6.2121, + "step": 57000 }, { - "epoch": 34.77167993297026, - "grad_norm": 6.647628307342529, - "learning_rate": 1.524319229157939e-05, - "loss": 5.8972, - "step": 166000 + "epoch": 23.92124005027231, + "grad_norm": 3.277188301086426, + "learning_rate": 1.0148722245496438e-05, + "loss": 6.222, + "step": 57100 }, { - "epoch": 34.87641390867197, - "grad_norm": 5.826745986938477, - "learning_rate": 1.5138667783829074e-05, - "loss": 5.8852, - "step": 166500 + "epoch": 23.963133640552996, + "grad_norm": 3.0735063552856445, + "learning_rate": 1.0078899595028628e-05, + "loss": 6.2257, + "step": 57200 }, { - "epoch": 34.98114788437369, - "grad_norm": 5.109675407409668, - "learning_rate": 1.5033933808127357e-05, - "loss": 5.8971, - "step": 167000 + "epoch": 24.005027230833683, + "grad_norm": 3.6680026054382324, + "learning_rate": 1.0009076944560817e-05, + "loss": 6.2131, + "step": 57300 }, { - "epoch": 35.08588186007541, - "grad_norm": 5.743017196655273, - "learning_rate": 1.4929199832425639e-05, - "loss": 5.8892, - "step": 167500 + "epoch": 24.04692082111437, + "grad_norm": 3.134713888168335, + "learning_rate": 9.939254294093005e-06, + "loss": 6.2241, + "step": 57400 }, { - "epoch": 35.19061583577712, - "grad_norm": 4.862270355224609, - "learning_rate": 1.482446585672392e-05, - "loss": 5.8976, - "step": 168000 + "epoch": 24.088814411395056, + "grad_norm": 2.9466712474823, + "learning_rate": 9.869431643625192e-06, + "loss": 6.2158, + "step": 57500 }, { - "epoch": 35.29534981147884, - "grad_norm": 5.532686233520508, - "learning_rate": 1.4719941348973607e-05, - "loss": 5.8716, - "step": 168500 + "epoch": 24.130708001675742, + "grad_norm": 3.468949794769287, + "learning_rate": 9.79960899315738e-06, + "loss": 6.1793, + "step": 57600 }, { - "epoch": 35.40008378718056, - "grad_norm": 5.498019695281982, - "learning_rate": 1.4615207373271891e-05, - "loss": 5.8768, - "step": 169000 + "epoch": 24.17260159195643, + "grad_norm": 3.5487060546875, + "learning_rate": 9.729786342689569e-06, + "loss": 6.2218, + "step": 57700 }, { - "epoch": 35.504817762882276, - "grad_norm": 4.7324042320251465, - "learning_rate": 1.4510473397570173e-05, - "loss": 5.8739, - "step": 169500 + "epoch": 24.21449518223712, + "grad_norm": 4.345893383026123, + "learning_rate": 9.659963692221758e-06, + "loss": 6.2023, + "step": 57800 }, { - "epoch": 35.609551738583995, - "grad_norm": 4.973413944244385, - "learning_rate": 1.4405739421868456e-05, - "loss": 5.8801, - "step": 170000 + "epoch": 24.256388772517806, + "grad_norm": 2.9016401767730713, + "learning_rate": 9.590141041753946e-06, + "loss": 6.23, + "step": 57900 }, { - "epoch": 35.714285714285715, - "grad_norm": 4.977658271789551, - "learning_rate": 1.430121491411814e-05, - "loss": 5.8644, - "step": 170500 + "epoch": 24.298282362798492, + "grad_norm": 4.17023229598999, + "learning_rate": 9.520318391286135e-06, + "loss": 6.2114, + "step": 58000 }, { - "epoch": 35.819019689987435, - "grad_norm": 5.551715850830078, - "learning_rate": 1.4196480938416424e-05, - "loss": 5.8789, - "step": 171000 + "epoch": 24.34017595307918, + "grad_norm": 3.322115421295166, + "learning_rate": 9.45049574081832e-06, + "loss": 6.204, + "step": 58100 }, { - "epoch": 35.92375366568915, - "grad_norm": 5.135740756988525, - "learning_rate": 1.4091746962714705e-05, - "loss": 5.8862, - "step": 171500 + "epoch": 24.382069543359865, + "grad_norm": 3.709805488586426, + "learning_rate": 9.38067309035051e-06, + "loss": 6.2087, + "step": 58200 }, { - "epoch": 36.02848764139087, - "grad_norm": 5.068655967712402, - 
"learning_rate": 1.3987012987012987e-05, - "loss": 5.879, - "step": 172000 + "epoch": 24.42396313364055, + "grad_norm": 3.225588798522949, + "learning_rate": 9.311548666387376e-06, + "loss": 6.2436, + "step": 58300 }, { - "epoch": 36.13322161709259, - "grad_norm": 5.393857479095459, - "learning_rate": 1.388227901131127e-05, - "loss": 5.8544, - "step": 172500 + "epoch": 24.46585672392124, + "grad_norm": 3.1229472160339355, + "learning_rate": 9.241726015919565e-06, + "loss": 6.2253, + "step": 58400 }, { - "epoch": 36.2379555927943, - "grad_norm": 5.854538440704346, - "learning_rate": 1.3777545035609551e-05, - "loss": 5.8824, - "step": 173000 + "epoch": 24.507750314201928, + "grad_norm": 3.4445230960845947, + "learning_rate": 9.171903365451753e-06, + "loss": 6.2254, + "step": 58500 }, { - "epoch": 36.34268956849602, - "grad_norm": 5.566401481628418, - "learning_rate": 1.3673020527859238e-05, - "loss": 5.8587, - "step": 173500 + "epoch": 24.549643904482615, + "grad_norm": 4.2796807289123535, + "learning_rate": 9.102080714983942e-06, + "loss": 6.221, + "step": 58600 }, { - "epoch": 36.44742354419774, - "grad_norm": 6.091250896453857, - "learning_rate": 1.3568286552157519e-05, - "loss": 5.8624, - "step": 174000 + "epoch": 24.5915374947633, + "grad_norm": 3.2323966026306152, + "learning_rate": 9.03225806451613e-06, + "loss": 6.228, + "step": 58700 }, { - "epoch": 36.55215751989945, - "grad_norm": 4.826417922973633, - "learning_rate": 1.3463552576455804e-05, - "loss": 5.879, - "step": 174500 + "epoch": 24.633431085043988, + "grad_norm": 4.064596652984619, + "learning_rate": 8.962435414048317e-06, + "loss": 6.2363, + "step": 58800 }, { - "epoch": 36.65689149560117, - "grad_norm": 5.28770637512207, - "learning_rate": 1.3358818600754087e-05, - "loss": 5.8632, - "step": 175000 + "epoch": 24.675324675324674, + "grad_norm": 3.068544864654541, + "learning_rate": 8.893310990085183e-06, + "loss": 6.2508, + "step": 58900 }, { - "epoch": 36.76162547130289, - "grad_norm": 5.072086811065674, - "learning_rate": 1.3254084625052368e-05, - "loss": 5.8698, - "step": 175500 + "epoch": 24.71721826560536, + "grad_norm": 2.6201155185699463, + "learning_rate": 8.823488339617372e-06, + "loss": 6.2193, + "step": 59000 }, { - "epoch": 36.866359447004605, - "grad_norm": 6.194067001342773, - "learning_rate": 1.3149560117302053e-05, - "loss": 5.8701, - "step": 176000 + "epoch": 24.75911185588605, + "grad_norm": 4.960629463195801, + "learning_rate": 8.753665689149562e-06, + "loss": 6.1999, + "step": 59100 }, { - "epoch": 36.971093422706325, - "grad_norm": 5.250491142272949, - "learning_rate": 1.3044826141600336e-05, - "loss": 5.855, - "step": 176500 + "epoch": 24.801005446166737, + "grad_norm": 3.191586971282959, + "learning_rate": 8.683843038681749e-06, + "loss": 6.2203, + "step": 59200 }, { - "epoch": 37.075827398408045, - "grad_norm": 4.9726080894470215, - "learning_rate": 1.2940092165898618e-05, - "loss": 5.8613, - "step": 177000 + "epoch": 24.842899036447424, + "grad_norm": 3.224745512008667, + "learning_rate": 8.614020388213937e-06, + "loss": 6.212, + "step": 59300 }, { - "epoch": 37.180561374109764, - "grad_norm": 5.526548385620117, - "learning_rate": 1.28353581901969e-05, - "loss": 5.8519, - "step": 177500 + "epoch": 24.88479262672811, + "grad_norm": 3.450741767883301, + "learning_rate": 8.544197737746124e-06, + "loss": 6.2386, + "step": 59400 }, { - "epoch": 37.28529534981148, - "grad_norm": 5.5989861488342285, - "learning_rate": 1.2730624214495182e-05, - "loss": 5.8642, - "step": 178000 + "epoch": 24.926686217008797, 
+ "grad_norm": 4.297729969024658, + "learning_rate": 8.474375087278313e-06, + "loss": 6.2088, + "step": 59500 }, { - "epoch": 37.3900293255132, - "grad_norm": 5.138686180114746, - "learning_rate": 1.2625890238793465e-05, - "loss": 5.852, - "step": 178500 + "epoch": 24.968579807289483, + "grad_norm": 3.376110553741455, + "learning_rate": 8.404552436810503e-06, + "loss": 6.2176, + "step": 59600 }, { - "epoch": 37.49476330121492, - "grad_norm": 5.0514326095581055, - "learning_rate": 1.252115626309175e-05, - "loss": 5.8484, - "step": 179000 + "epoch": 25.010473397570173, + "grad_norm": 3.0211358070373535, + "learning_rate": 8.33472978634269e-06, + "loss": 6.1906, + "step": 59700 }, { - "epoch": 37.59949727691663, - "grad_norm": 4.9300360679626465, - "learning_rate": 1.241642228739003e-05, - "loss": 5.849, - "step": 179500 + "epoch": 25.05236698785086, + "grad_norm": 2.8490803241729736, + "learning_rate": 8.264907135874878e-06, + "loss": 6.2, + "step": 59800 }, { - "epoch": 37.70423125261835, - "grad_norm": 5.487224102020264, - "learning_rate": 1.2311897779639716e-05, - "loss": 5.8562, - "step": 180000 + "epoch": 25.094260578131546, + "grad_norm": 3.0233705043792725, + "learning_rate": 8.195084485407065e-06, + "loss": 6.1886, + "step": 59900 }, { - "epoch": 37.80896522832007, - "grad_norm": 5.826539516448975, - "learning_rate": 1.2207163803937999e-05, - "loss": 5.8665, - "step": 180500 + "epoch": 25.136154168412233, + "grad_norm": 3.7582995891571045, + "learning_rate": 8.125261834939255e-06, + "loss": 6.2064, + "step": 60000 }, { - "epoch": 37.91369920402178, - "grad_norm": 5.733819961547852, - "learning_rate": 1.2102639296187684e-05, - "loss": 5.8569, - "step": 181000 + "epoch": 25.17804775869292, + "grad_norm": 3.128079891204834, + "learning_rate": 8.055439184471442e-06, + "loss": 6.2264, + "step": 60100 }, { - "epoch": 38.0184331797235, - "grad_norm": 4.917960166931152, - "learning_rate": 1.1997905320485967e-05, - "loss": 5.8395, - "step": 181500 + "epoch": 25.219941348973606, + "grad_norm": 3.1808972358703613, + "learning_rate": 7.985616534003632e-06, + "loss": 6.2149, + "step": 60200 }, { - "epoch": 38.12316715542522, - "grad_norm": 5.337119102478027, - "learning_rate": 1.1893171344784248e-05, - "loss": 5.854, - "step": 182000 + "epoch": 25.261834939254292, + "grad_norm": 3.2326996326446533, + "learning_rate": 7.91579388353582e-06, + "loss": 6.2142, + "step": 60300 }, { - "epoch": 38.227901131126934, - "grad_norm": 5.299139022827148, - "learning_rate": 1.178843736908253e-05, - "loss": 5.8433, - "step": 182500 + "epoch": 25.303728529534983, + "grad_norm": 3.267465114593506, + "learning_rate": 7.845971233068007e-06, + "loss": 6.2439, + "step": 60400 }, { - "epoch": 38.332635106828654, - "grad_norm": 5.900153160095215, - "learning_rate": 1.1683703393380813e-05, - "loss": 5.8535, - "step": 183000 + "epoch": 25.34562211981567, + "grad_norm": 3.691075563430786, + "learning_rate": 7.776148582600196e-06, + "loss": 6.2178, + "step": 60500 }, { - "epoch": 38.437369082530374, - "grad_norm": 6.776584625244141, - "learning_rate": 1.15791788856305e-05, - "loss": 5.8454, - "step": 183500 + "epoch": 25.387515710096356, + "grad_norm": 3.290562152862549, + "learning_rate": 7.706325932132383e-06, + "loss": 6.2165, + "step": 60600 }, { - "epoch": 38.542103058232094, - "grad_norm": 6.258368015289307, - "learning_rate": 1.1474444909928783e-05, - "loss": 5.8354, - "step": 184000 + "epoch": 25.429409300377042, + "grad_norm": 4.553886413574219, + "learning_rate": 7.636503281664573e-06, + "loss": 6.2165, + "step": 
60700 }, { - "epoch": 38.64683703393381, - "grad_norm": 5.288670539855957, - "learning_rate": 1.1369710934227064e-05, - "loss": 5.8458, - "step": 184500 + "epoch": 25.47130289065773, + "grad_norm": 4.013444423675537, + "learning_rate": 7.566680631196761e-06, + "loss": 6.2122, + "step": 60800 }, { - "epoch": 38.751571009635526, - "grad_norm": 5.596650123596191, - "learning_rate": 1.1264976958525345e-05, - "loss": 5.8387, - "step": 185000 + "epoch": 25.513196480938415, + "grad_norm": 4.044810771942139, + "learning_rate": 7.496857980728948e-06, + "loss": 6.2533, + "step": 60900 }, { - "epoch": 38.856304985337246, - "grad_norm": 5.121638774871826, - "learning_rate": 1.1160452450775032e-05, - "loss": 5.8268, - "step": 185500 + "epoch": 25.555090071219105, + "grad_norm": 3.788613796234131, + "learning_rate": 7.427035330261137e-06, + "loss": 6.2039, + "step": 61000 }, { - "epoch": 38.96103896103896, - "grad_norm": 4.5758442878723145, - "learning_rate": 1.1055718475073313e-05, - "loss": 5.83, - "step": 186000 + "epoch": 25.59698366149979, + "grad_norm": 3.317281484603882, + "learning_rate": 7.3572126797933255e-06, + "loss": 6.2228, + "step": 61100 }, { - "epoch": 39.06577293674068, - "grad_norm": 5.161282539367676, - "learning_rate": 1.0950984499371596e-05, - "loss": 5.8544, - "step": 186500 + "epoch": 25.638877251780478, + "grad_norm": 3.4238085746765137, + "learning_rate": 7.287390029325514e-06, + "loss": 6.1979, + "step": 61200 }, { - "epoch": 39.1705069124424, - "grad_norm": 4.628884315490723, - "learning_rate": 1.084625052366988e-05, - "loss": 5.8474, - "step": 187000 + "epoch": 25.680770842061165, + "grad_norm": 3.1558725833892822, + "learning_rate": 7.217567378857702e-06, + "loss": 6.2044, + "step": 61300 }, { - "epoch": 39.27524088814411, - "grad_norm": 5.854598045349121, - "learning_rate": 1.074151654796816e-05, - "loss": 5.8501, - "step": 187500 + "epoch": 25.72266443234185, + "grad_norm": 2.939328670501709, + "learning_rate": 7.147744728389889e-06, + "loss": 6.2312, + "step": 61400 }, { - "epoch": 39.37997486384583, - "grad_norm": 5.315525054931641, - "learning_rate": 1.0636782572266444e-05, - "loss": 5.8265, - "step": 188000 + "epoch": 25.764558022622538, + "grad_norm": 4.0037455558776855, + "learning_rate": 7.0779220779220775e-06, + "loss": 6.228, + "step": 61500 }, { - "epoch": 39.48470883954755, - "grad_norm": 6.078185081481934, - "learning_rate": 1.0532048596564727e-05, - "loss": 5.8419, - "step": 188500 + "epoch": 25.806451612903224, + "grad_norm": 4.4582343101501465, + "learning_rate": 7.008099427454267e-06, + "loss": 6.2132, + "step": 61600 }, { - "epoch": 39.589442815249264, - "grad_norm": 5.223086357116699, - "learning_rate": 1.0427524088814412e-05, - "loss": 5.8197, - "step": 189000 + "epoch": 25.848345203183914, + "grad_norm": 3.006201982498169, + "learning_rate": 6.938276776986455e-06, + "loss": 6.2242, + "step": 61700 }, { - "epoch": 39.69417679095098, - "grad_norm": 5.235757827758789, - "learning_rate": 1.0322790113112695e-05, - "loss": 5.8245, - "step": 189500 + "epoch": 25.8902387934646, + "grad_norm": 3.6898059844970703, + "learning_rate": 6.8684541265186436e-06, + "loss": 6.2134, + "step": 61800 }, { - "epoch": 39.7989107666527, - "grad_norm": 5.124643325805664, - "learning_rate": 1.0218056137410976e-05, - "loss": 5.839, - "step": 190000 + "epoch": 25.932132383745287, + "grad_norm": 3.3489785194396973, + "learning_rate": 6.798631476050832e-06, + "loss": 6.2042, + "step": 61900 }, { - "epoch": 39.90364474235442, - "grad_norm": 5.613321304321289, - "learning_rate": 
1.011332216170926e-05, - "loss": 5.844, - "step": 190500 + "epoch": 25.974025974025974, + "grad_norm": 3.2489922046661377, + "learning_rate": 6.729507052087698e-06, + "loss": 6.2212, + "step": 62000 }, { - "epoch": 40.008378718056136, - "grad_norm": 5.873430252075195, - "learning_rate": 1.0008588186007542e-05, - "loss": 5.837, - "step": 191000 + "epoch": 26.01591956430666, + "grad_norm": 4.022356033325195, + "learning_rate": 6.659684401619885e-06, + "loss": 6.2423, + "step": 62100 }, { - "epoch": 40.113112693757856, - "grad_norm": 5.089309215545654, - "learning_rate": 9.903854210305824e-06, - "loss": 5.8384, - "step": 191500 + "epoch": 26.057813154587347, + "grad_norm": 4.803937911987305, + "learning_rate": 6.589861751152074e-06, + "loss": 6.2319, + "step": 62200 }, { - "epoch": 40.217846669459576, - "grad_norm": 7.3569817543029785, - "learning_rate": 9.79932970255551e-06, - "loss": 5.8232, - "step": 192000 + "epoch": 26.099706744868037, + "grad_norm": 3.7283337116241455, + "learning_rate": 6.520039100684262e-06, + "loss": 6.1924, + "step": 62300 }, { - "epoch": 40.32258064516129, - "grad_norm": 6.024489402770996, - "learning_rate": 9.694595726853792e-06, - "loss": 5.8292, - "step": 192500 + "epoch": 26.141600335148723, + "grad_norm": 3.817946672439575, + "learning_rate": 6.450216450216451e-06, + "loss": 6.2039, + "step": 62400 }, { - "epoch": 40.42731462086301, - "grad_norm": 5.7150983810424805, - "learning_rate": 9.589861751152073e-06, - "loss": 5.8614, - "step": 193000 + "epoch": 26.18349392542941, + "grad_norm": 3.4621963500976562, + "learning_rate": 6.380393799748639e-06, + "loss": 6.214, + "step": 62500 }, { - "epoch": 40.53204859656473, - "grad_norm": 4.717107772827148, - "learning_rate": 9.485127775450356e-06, - "loss": 5.8092, - "step": 193500 + "epoch": 26.225387515710096, + "grad_norm": 4.458475112915039, + "learning_rate": 6.310571149280827e-06, + "loss": 6.2327, + "step": 62600 }, { - "epoch": 40.63678257226644, - "grad_norm": 4.9722490310668945, - "learning_rate": 9.380603267700043e-06, - "loss": 5.8231, - "step": 194000 + "epoch": 26.267281105990783, + "grad_norm": 3.1324493885040283, + "learning_rate": 6.240748498813015e-06, + "loss": 6.2518, + "step": 62700 }, { - "epoch": 40.74151654796816, - "grad_norm": 5.593094825744629, - "learning_rate": 9.275869291998326e-06, - "loss": 5.8339, - "step": 194500 + "epoch": 26.30917469627147, + "grad_norm": 3.410626173019409, + "learning_rate": 6.1709258483452034e-06, + "loss": 6.2054, + "step": 62800 }, { - "epoch": 40.84625052366988, - "grad_norm": 5.731310844421387, - "learning_rate": 9.171135316296607e-06, - "loss": 5.8381, - "step": 195000 + "epoch": 26.351068286552156, + "grad_norm": 3.221602201461792, + "learning_rate": 6.101103197877392e-06, + "loss": 6.2297, + "step": 62900 }, { - "epoch": 40.95098449937159, - "grad_norm": 5.072065353393555, - "learning_rate": 9.066401340594889e-06, - "loss": 5.8367, - "step": 195500 + "epoch": 26.392961876832846, + "grad_norm": 3.1413893699645996, + "learning_rate": 6.031280547409579e-06, + "loss": 6.2134, + "step": 63000 }, { - "epoch": 41.05571847507331, - "grad_norm": 5.219040870666504, - "learning_rate": 8.961667364893172e-06, - "loss": 5.8234, - "step": 196000 + "epoch": 26.434855467113533, + "grad_norm": 3.3834433555603027, + "learning_rate": 5.961457896941768e-06, + "loss": 6.167, + "step": 63100 }, { - "epoch": 41.16045245077503, - "grad_norm": 5.844238758087158, - "learning_rate": 8.856933389191455e-06, - "loss": 5.8347, - "step": 196500 + "epoch": 26.47674905739422, + "grad_norm": 
3.016921281814575, + "learning_rate": 5.891635246473957e-06, + "loss": 6.2181, + "step": 63200 }, { - "epoch": 41.26518642647675, - "grad_norm": 6.088447093963623, - "learning_rate": 8.752199413489736e-06, - "loss": 5.8178, - "step": 197000 + "epoch": 26.518642647674906, + "grad_norm": 3.4190244674682617, + "learning_rate": 5.821812596006145e-06, + "loss": 6.172, + "step": 63300 }, { - "epoch": 41.369920402178465, - "grad_norm": 5.14108943939209, - "learning_rate": 8.647465437788019e-06, - "loss": 5.8248, - "step": 197500 + "epoch": 26.560536237955592, + "grad_norm": 3.519742488861084, + "learning_rate": 5.751989945538333e-06, + "loss": 6.2144, + "step": 63400 }, { - "epoch": 41.474654377880185, - "grad_norm": 5.424249172210693, - "learning_rate": 8.542940930037704e-06, - "loss": 5.8113, - "step": 198000 + "epoch": 26.60242982823628, + "grad_norm": 3.083923101425171, + "learning_rate": 5.682167295070521e-06, + "loss": 6.204, + "step": 63500 }, { - "epoch": 41.579388353581905, - "grad_norm": 4.888121604919434, - "learning_rate": 8.43841642228739e-06, - "loss": 5.8111, - "step": 198500 + "epoch": 26.64432341851697, + "grad_norm": 3.8977878093719482, + "learning_rate": 5.612344644602709e-06, + "loss": 6.1759, + "step": 63600 }, { - "epoch": 41.68412232928362, - "grad_norm": 4.9909515380859375, - "learning_rate": 8.333682446585672e-06, - "loss": 5.8276, - "step": 199000 + "epoch": 26.686217008797655, + "grad_norm": 3.5598249435424805, + "learning_rate": 5.5425219941348974e-06, + "loss": 6.2233, + "step": 63700 }, { - "epoch": 41.78885630498534, - "grad_norm": 5.032175540924072, - "learning_rate": 8.228948470883955e-06, - "loss": 5.8332, - "step": 199500 + "epoch": 26.72811059907834, + "grad_norm": 3.6333513259887695, + "learning_rate": 5.472699343667086e-06, + "loss": 6.2133, + "step": 63800 }, { - "epoch": 41.89359028068706, - "grad_norm": 5.116880416870117, - "learning_rate": 8.124214495182238e-06, - "loss": 5.8233, - "step": 200000 + "epoch": 26.770004189359028, + "grad_norm": 3.2468085289001465, + "learning_rate": 5.402876693199274e-06, + "loss": 6.2081, + "step": 63900 }, { - "epoch": 41.99832425638877, - "grad_norm": 5.235647678375244, - "learning_rate": 8.019689987431923e-06, - "loss": 5.8297, - "step": 200500 + "epoch": 26.811897779639715, + "grad_norm": 3.6896772384643555, + "learning_rate": 5.333054042731463e-06, + "loss": 6.1935, + "step": 64000 }, { - "epoch": 42.10305823209049, - "grad_norm": 5.445380210876465, - "learning_rate": 7.914956011730206e-06, - "loss": 5.8159, - "step": 201000 + "epoch": 26.8537913699204, + "grad_norm": 3.263144016265869, + "learning_rate": 5.26323139226365e-06, + "loss": 6.2127, + "step": 64100 }, { - "epoch": 42.20779220779221, - "grad_norm": 4.979036331176758, - "learning_rate": 7.810222036028488e-06, - "loss": 5.809, - "step": 201500 + "epoch": 26.895684960201088, + "grad_norm": 3.2848362922668457, + "learning_rate": 5.193408741795839e-06, + "loss": 6.2074, + "step": 64200 }, { - "epoch": 42.31252618349392, - "grad_norm": 5.359362602233887, - "learning_rate": 7.70548806032677e-06, - "loss": 5.8346, - "step": 202000 + "epoch": 26.937578550481778, + "grad_norm": 3.675541639328003, + "learning_rate": 5.123586091328027e-06, + "loss": 6.2015, + "step": 64300 }, { - "epoch": 42.41726015919564, - "grad_norm": 5.264519214630127, - "learning_rate": 7.600754084625053e-06, - "loss": 5.8089, - "step": 202500 + "epoch": 26.979472140762464, + "grad_norm": 3.413780689239502, + "learning_rate": 5.0537634408602155e-06, + "loss": 6.2218, + "step": 64400 }, { - 
"epoch": 42.52199413489736, - "grad_norm": 5.985982894897461, - "learning_rate": 7.496020108923335e-06, - "loss": 5.8192, - "step": 203000 + "epoch": 27.02136573104315, + "grad_norm": 4.108157634735107, + "learning_rate": 4.983940790392404e-06, + "loss": 6.212, + "step": 64500 }, { - "epoch": 42.626728110599075, - "grad_norm": 5.505626201629639, - "learning_rate": 7.391286133221617e-06, - "loss": 5.8095, - "step": 203500 + "epoch": 27.063259321323837, + "grad_norm": 3.7690155506134033, + "learning_rate": 4.9141181399245915e-06, + "loss": 6.22, + "step": 64600 }, { - "epoch": 42.731462086300795, - "grad_norm": 5.069738388061523, - "learning_rate": 7.286552157519899e-06, - "loss": 5.8186, - "step": 204000 + "epoch": 27.105152911604524, + "grad_norm": 3.379786491394043, + "learning_rate": 4.84429548945678e-06, + "loss": 6.2334, + "step": 64700 }, { - "epoch": 42.836196062002514, - "grad_norm": 6.004745960235596, - "learning_rate": 7.182027649769586e-06, - "loss": 5.8136, - "step": 204500 + "epoch": 27.14704650188521, + "grad_norm": 3.5175390243530273, + "learning_rate": 4.774472838988968e-06, + "loss": 6.1932, + "step": 64800 }, { - "epoch": 42.940930037704234, - "grad_norm": 6.299502372741699, - "learning_rate": 7.077293674067868e-06, - "loss": 5.8213, - "step": 205000 + "epoch": 27.1889400921659, + "grad_norm": 2.8454129695892334, + "learning_rate": 4.704650188521157e-06, + "loss": 6.2087, + "step": 64900 }, { - "epoch": 43.04566401340595, - "grad_norm": 6.302718162536621, - "learning_rate": 6.97255969836615e-06, - "loss": 5.8075, - "step": 205500 + "epoch": 27.230833682446587, + "grad_norm": 3.4630961418151855, + "learning_rate": 4.634827538053345e-06, + "loss": 6.2142, + "step": 65000 }, { - "epoch": 43.15039798910767, - "grad_norm": 5.921250343322754, - "learning_rate": 6.867825722664433e-06, - "loss": 5.8276, - "step": 206000 + "epoch": 27.272727272727273, + "grad_norm": 3.339860677719116, + "learning_rate": 4.565004887585533e-06, + "loss": 6.1772, + "step": 65100 }, { - "epoch": 43.25513196480939, - "grad_norm": 5.123110771179199, - "learning_rate": 6.763091746962715e-06, - "loss": 5.7965, - "step": 206500 + "epoch": 27.31462086300796, + "grad_norm": 3.0743260383605957, + "learning_rate": 4.495182237117721e-06, + "loss": 6.2044, + "step": 65200 }, { - "epoch": 43.3598659405111, - "grad_norm": 5.187294006347656, - "learning_rate": 6.658357771260998e-06, - "loss": 5.8137, - "step": 207000 + "epoch": 27.356514453288646, + "grad_norm": 3.2576496601104736, + "learning_rate": 4.4253595866499095e-06, + "loss": 6.2191, + "step": 65300 }, { - "epoch": 43.46459991621282, - "grad_norm": 5.407510757446289, - "learning_rate": 6.55362379555928e-06, - "loss": 5.8305, - "step": 207500 + "epoch": 27.398408043569333, + "grad_norm": 3.326819896697998, + "learning_rate": 4.355536936182097e-06, + "loss": 6.1762, + "step": 65400 }, { - "epoch": 43.56933389191454, - "grad_norm": 5.892600059509277, - "learning_rate": 6.449099287808966e-06, - "loss": 5.8167, - "step": 208000 + "epoch": 27.44030163385002, + "grad_norm": 3.4447667598724365, + "learning_rate": 4.285714285714286e-06, + "loss": 6.1823, + "step": 65500 }, { - "epoch": 43.67406786761625, - "grad_norm": 5.39382266998291, - "learning_rate": 6.344365312107248e-06, - "loss": 5.8185, - "step": 208500 + "epoch": 27.48219522413071, + "grad_norm": 3.4771687984466553, + "learning_rate": 4.215891635246475e-06, + "loss": 6.228, + "step": 65600 }, { - "epoch": 43.77880184331797, - "grad_norm": 5.608034133911133, - "learning_rate": 6.23963133640553e-06, - "loss": 
5.8051, - "step": 209000 + "epoch": 27.524088814411396, + "grad_norm": 3.3457424640655518, + "learning_rate": 4.146068984778662e-06, + "loss": 6.1651, + "step": 65700 }, { - "epoch": 43.88353581901969, - "grad_norm": 6.069722652435303, - "learning_rate": 6.1348973607038125e-06, - "loss": 5.8101, - "step": 209500 + "epoch": 27.565982404692082, + "grad_norm": 3.006155490875244, + "learning_rate": 4.076246334310851e-06, + "loss": 6.2026, + "step": 65800 }, { - "epoch": 43.988269794721404, - "grad_norm": 5.938599109649658, - "learning_rate": 6.0301633850020955e-06, - "loss": 5.807, - "step": 210000 + "epoch": 27.60787599497277, + "grad_norm": 4.228708744049072, + "learning_rate": 4.006423683843038e-06, + "loss": 6.1923, + "step": 65900 }, { - "epoch": 44.093003770423124, - "grad_norm": 5.808456897735596, - "learning_rate": 5.925429409300378e-06, - "loss": 5.8246, - "step": 210500 + "epoch": 27.649769585253456, + "grad_norm": 3.4744226932525635, + "learning_rate": 3.937299259879905e-06, + "loss": 6.1891, + "step": 66000 }, { - "epoch": 44.197737746124844, - "grad_norm": 5.229996681213379, - "learning_rate": 5.820904901550063e-06, - "loss": 5.8029, - "step": 211000 + "epoch": 27.691663175534142, + "grad_norm": 3.8300633430480957, + "learning_rate": 3.867476609412093e-06, + "loss": 6.2237, + "step": 66100 }, { - "epoch": 44.302471721826564, - "grad_norm": 5.295706748962402, - "learning_rate": 5.716170925848346e-06, - "loss": 5.8094, - "step": 211500 + "epoch": 27.733556765814832, + "grad_norm": 2.9689528942108154, + "learning_rate": 3.7976539589442818e-06, + "loss": 6.217, + "step": 66200 }, { - "epoch": 44.407205697528276, - "grad_norm": 5.649194240570068, - "learning_rate": 5.611436950146628e-06, - "loss": 5.811, - "step": 212000 + "epoch": 27.77545035609552, + "grad_norm": 3.1309947967529297, + "learning_rate": 3.7278313084764698e-06, + "loss": 6.2061, + "step": 66300 }, { - "epoch": 44.511939673229996, - "grad_norm": 6.5928521156311035, - "learning_rate": 5.50670297444491e-06, - "loss": 5.7974, - "step": 212500 + "epoch": 27.817343946376205, + "grad_norm": 3.4571166038513184, + "learning_rate": 3.658008658008658e-06, + "loss": 6.1863, + "step": 66400 }, { - "epoch": 44.616673648931716, - "grad_norm": 6.246605396270752, - "learning_rate": 5.401968998743192e-06, - "loss": 5.8011, - "step": 213000 + "epoch": 27.85923753665689, + "grad_norm": 3.354229211807251, + "learning_rate": 3.5881860075408466e-06, + "loss": 6.1996, + "step": 66500 }, { - "epoch": 44.72140762463343, - "grad_norm": 5.312093734741211, - "learning_rate": 5.2972350230414745e-06, - "loss": 5.7819, - "step": 213500 + "epoch": 27.901131126937578, + "grad_norm": 3.745568037033081, + "learning_rate": 3.5183633570730346e-06, + "loss": 6.1839, + "step": 66600 }, { - "epoch": 44.82614160033515, - "grad_norm": 5.348554611206055, - "learning_rate": 5.19271051529116e-06, - "loss": 5.8027, - "step": 214000 + "epoch": 27.943024717218265, + "grad_norm": 3.356715440750122, + "learning_rate": 3.448540706605223e-06, + "loss": 6.2048, + "step": 66700 }, { - "epoch": 44.93087557603687, - "grad_norm": 5.95352029800415, - "learning_rate": 5.087976539589443e-06, - "loss": 5.8046, - "step": 214500 + "epoch": 27.98491830749895, + "grad_norm": 2.964492082595825, + "learning_rate": 3.378718056137411e-06, + "loss": 6.228, + "step": 66800 }, { - "epoch": 45.03560955173858, - "grad_norm": 5.978014945983887, - "learning_rate": 4.983242563887726e-06, - "loss": 5.8021, - "step": 215000 + "epoch": 28.02681189777964, + "grad_norm": 3.336606502532959, + 
"learning_rate": 3.3088954056695994e-06, + "loss": 6.1953, + "step": 66900 }, { - "epoch": 45.1403435274403, - "grad_norm": 5.595849990844727, - "learning_rate": 4.878508588186008e-06, - "loss": 5.7996, - "step": 215500 + "epoch": 28.068705488060328, + "grad_norm": 3.264971971511841, + "learning_rate": 3.239072755201788e-06, + "loss": 6.1783, + "step": 67000 }, { - "epoch": 45.24507750314202, - "grad_norm": 5.570345401763916, - "learning_rate": 4.77377461248429e-06, - "loss": 5.7973, - "step": 216000 + "epoch": 28.110599078341014, + "grad_norm": 3.4968082904815674, + "learning_rate": 3.169250104733976e-06, + "loss": 6.2117, + "step": 67100 }, { - "epoch": 45.34981147884373, - "grad_norm": 5.320748805999756, - "learning_rate": 4.669040636782573e-06, - "loss": 5.7886, - "step": 216500 + "epoch": 28.1524926686217, + "grad_norm": 3.4082252979278564, + "learning_rate": 3.099427454266164e-06, + "loss": 6.2278, + "step": 67200 }, { - "epoch": 45.45454545454545, - "grad_norm": 4.676185607910156, - "learning_rate": 4.564516129032258e-06, - "loss": 5.7874, - "step": 217000 + "epoch": 28.194386258902387, + "grad_norm": 3.52056884765625, + "learning_rate": 3.029604803798352e-06, + "loss": 6.2037, + "step": 67300 }, { - "epoch": 45.55927943024717, - "grad_norm": 5.7768473625183105, - "learning_rate": 4.45978215333054e-06, - "loss": 5.7875, - "step": 217500 + "epoch": 28.236279849183074, + "grad_norm": 3.6062779426574707, + "learning_rate": 2.9597821533305406e-06, + "loss": 6.1952, + "step": 67400 }, { - "epoch": 45.66401340594889, - "grad_norm": 5.668895244598389, - "learning_rate": 4.355048177628823e-06, - "loss": 5.8121, - "step": 218000 + "epoch": 28.278173439463764, + "grad_norm": 3.158705472946167, + "learning_rate": 2.889959502862729e-06, + "loss": 6.2067, + "step": 67500 }, { - "epoch": 45.768747381650606, - "grad_norm": 5.033557891845703, - "learning_rate": 4.2503142019271055e-06, - "loss": 5.8137, - "step": 218500 + "epoch": 28.32006702974445, + "grad_norm": 3.6732075214385986, + "learning_rate": 2.820136852394917e-06, + "loss": 6.1752, + "step": 67600 }, { - "epoch": 45.873481357352325, - "grad_norm": 6.15772819519043, - "learning_rate": 4.145580226225388e-06, - "loss": 5.8137, - "step": 219000 + "epoch": 28.361960620025137, + "grad_norm": 2.842560291290283, + "learning_rate": 2.7503142019271054e-06, + "loss": 6.1823, + "step": 67700 }, { - "epoch": 45.978215333054045, - "grad_norm": 6.617910861968994, - "learning_rate": 4.04084625052367e-06, - "loss": 5.8079, - "step": 219500 + "epoch": 28.403854210305823, + "grad_norm": 3.412233591079712, + "learning_rate": 2.6804915514592934e-06, + "loss": 6.1997, + "step": 67800 }, { - "epoch": 46.08294930875576, - "grad_norm": 5.210205554962158, - "learning_rate": 3.936112274821952e-06, - "loss": 5.8189, - "step": 220000 + "epoch": 28.44574780058651, + "grad_norm": 2.8313143253326416, + "learning_rate": 2.610668900991482e-06, + "loss": 6.187, + "step": 67900 }, { - "epoch": 46.18768328445748, - "grad_norm": 5.630945205688477, - "learning_rate": 3.831587767071639e-06, - "loss": 5.795, - "step": 220500 + "epoch": 28.487641390867196, + "grad_norm": 3.122307300567627, + "learning_rate": 2.541544477028348e-06, + "loss": 6.1977, + "step": 68000 }, { - "epoch": 46.2924172601592, - "grad_norm": 5.8690032958984375, - "learning_rate": 3.7268537913699205e-06, - "loss": 5.8019, - "step": 221000 + "epoch": 28.529534981147883, + "grad_norm": 3.4732697010040283, + "learning_rate": 2.4717218265605365e-06, + "loss": 6.1913, + "step": 68100 }, { - "epoch": 
46.39715123586091, - "grad_norm": 5.787112712860107, - "learning_rate": 3.6221198156682027e-06, - "loss": 5.7901, - "step": 221500 + "epoch": 28.571428571428573, + "grad_norm": 3.3936917781829834, + "learning_rate": 2.4018991760927245e-06, + "loss": 6.2216, + "step": 68200 }, { - "epoch": 46.50188521156263, - "grad_norm": 5.568469524383545, - "learning_rate": 3.5173858399664853e-06, - "loss": 5.803, - "step": 222000 + "epoch": 28.61332216170926, + "grad_norm": 3.2980170249938965, + "learning_rate": 2.332076525624913e-06, + "loss": 6.1989, + "step": 68300 }, { - "epoch": 46.60661918726435, - "grad_norm": 5.671326637268066, - "learning_rate": 3.4126518642647675e-06, - "loss": 5.7931, - "step": 222500 + "epoch": 28.655215751989946, + "grad_norm": 4.099823951721191, + "learning_rate": 2.2622538751571013e-06, + "loss": 6.1755, + "step": 68400 }, { - "epoch": 46.71135316296606, - "grad_norm": 5.4085307121276855, - "learning_rate": 3.30791788856305e-06, - "loss": 5.7903, - "step": 223000 + "epoch": 28.697109342270632, + "grad_norm": 3.7930960655212402, + "learning_rate": 2.1924312246892893e-06, + "loss": 6.2155, + "step": 68500 }, { - "epoch": 46.81608713866778, - "grad_norm": 5.7440571784973145, - "learning_rate": 3.203393380812736e-06, - "loss": 5.7897, - "step": 223500 + "epoch": 28.73900293255132, + "grad_norm": 3.620065212249756, + "learning_rate": 2.1226085742214777e-06, + "loss": 6.2041, + "step": 68600 }, { - "epoch": 46.9208211143695, - "grad_norm": 5.144542217254639, - "learning_rate": 3.098659405111018e-06, - "loss": 5.8145, - "step": 224000 + "epoch": 28.780896522832006, + "grad_norm": 3.3451735973358154, + "learning_rate": 2.0527859237536657e-06, + "loss": 6.2095, + "step": 68700 }, { - "epoch": 47.02555509007122, - "grad_norm": 5.842213153839111, - "learning_rate": 2.9939254294093008e-06, - "loss": 5.8046, - "step": 224500 + "epoch": 28.822790113112696, + "grad_norm": 4.007857799530029, + "learning_rate": 1.982963273285854e-06, + "loss": 6.2283, + "step": 68800 }, { - "epoch": 47.130289065772935, - "grad_norm": 6.161410331726074, - "learning_rate": 2.8891914537075826e-06, - "loss": 5.7965, - "step": 225000 + "epoch": 28.864683703393382, + "grad_norm": 4.236888885498047, + "learning_rate": 1.9131406228180425e-06, + "loss": 6.1799, + "step": 68900 }, { - "epoch": 47.235023041474655, - "grad_norm": 6.173724174499512, - "learning_rate": 2.784457478005865e-06, - "loss": 5.7854, - "step": 225500 + "epoch": 28.90657729367407, + "grad_norm": 3.222273111343384, + "learning_rate": 1.8433179723502305e-06, + "loss": 6.2008, + "step": 69000 }, { - "epoch": 47.339757017176375, - "grad_norm": 5.132796287536621, - "learning_rate": 2.679932970255551e-06, - "loss": 5.791, - "step": 226000 + "epoch": 28.948470883954755, + "grad_norm": 3.8649580478668213, + "learning_rate": 1.7734953218824187e-06, + "loss": 6.194, + "step": 69100 }, { - "epoch": 47.44449099287809, - "grad_norm": 6.053417205810547, - "learning_rate": 2.5751989945538332e-06, - "loss": 5.7935, - "step": 226500 + "epoch": 28.99036447423544, + "grad_norm": 3.9630191326141357, + "learning_rate": 1.7036726714146071e-06, + "loss": 6.2083, + "step": 69200 }, { - "epoch": 47.54922496857981, - "grad_norm": 5.08466911315918, - "learning_rate": 2.470465018852116e-06, - "loss": 5.8006, - "step": 227000 + "epoch": 29.032258064516128, + "grad_norm": 3.9617035388946533, + "learning_rate": 1.6338500209467953e-06, + "loss": 6.2197, + "step": 69300 }, { - "epoch": 47.65395894428153, - "grad_norm": 6.060305595397949, - "learning_rate": 
2.365731043150398e-06, - "loss": 5.7839, - "step": 227500 + "epoch": 29.074151654796815, + "grad_norm": 3.4647514820098877, + "learning_rate": 1.5640273704789835e-06, + "loss": 6.1919, + "step": 69400 }, { - "epoch": 47.75869291998324, - "grad_norm": 5.808520317077637, - "learning_rate": 2.2609970674486806e-06, - "loss": 5.7933, - "step": 228000 + "epoch": 29.116045245077505, + "grad_norm": 3.7548468112945557, + "learning_rate": 1.4942047200111717e-06, + "loss": 6.171, + "step": 69500 }, { - "epoch": 47.86342689568496, - "grad_norm": 5.413971424102783, - "learning_rate": 2.156472559698366e-06, - "loss": 5.7985, - "step": 228500 + "epoch": 29.15793883535819, + "grad_norm": 3.4267735481262207, + "learning_rate": 1.42438206954336e-06, + "loss": 6.2031, + "step": 69600 }, { - "epoch": 47.96816087138668, - "grad_norm": 6.4786529541015625, - "learning_rate": 2.0517385839966487e-06, - "loss": 5.8059, - "step": 229000 + "epoch": 29.199832425638878, + "grad_norm": 3.166888952255249, + "learning_rate": 1.3545594190755483e-06, + "loss": 6.1937, + "step": 69700 }, { - "epoch": 48.07289484708839, - "grad_norm": 5.606147289276123, - "learning_rate": 1.947004608294931e-06, - "loss": 5.7934, - "step": 229500 + "epoch": 29.241726015919564, + "grad_norm": 2.9794344902038574, + "learning_rate": 1.2847367686077363e-06, + "loss": 6.2068, + "step": 69800 }, { - "epoch": 48.17762882279011, - "grad_norm": 5.827240467071533, - "learning_rate": 1.8422706325932133e-06, - "loss": 5.8007, - "step": 230000 + "epoch": 29.28361960620025, + "grad_norm": 3.056293249130249, + "learning_rate": 1.2149141181399247e-06, + "loss": 6.1652, + "step": 69900 }, { - "epoch": 48.28236279849183, - "grad_norm": 5.678854465484619, - "learning_rate": 1.7375366568914957e-06, - "loss": 5.8038, - "step": 230500 + "epoch": 29.325513196480937, + "grad_norm": 3.851149320602417, + "learning_rate": 1.145789694176791e-06, + "loss": 6.2129, + "step": 70000 }, { - "epoch": 48.38709677419355, - "grad_norm": 5.42138671875, - "learning_rate": 1.632802681189778e-06, - "loss": 5.7976, - "step": 231000 + "epoch": 29.367406786761624, + "grad_norm": 3.670929193496704, + "learning_rate": 1.0759670437089792e-06, + "loss": 6.2068, + "step": 70100 }, { - "epoch": 48.491830749895264, - "grad_norm": 6.03090238571167, - "learning_rate": 1.5280687054880603e-06, - "loss": 5.7874, - "step": 231500 + "epoch": 29.409300377042314, + "grad_norm": 3.3581252098083496, + "learning_rate": 1.0061443932411674e-06, + "loss": 6.2208, + "step": 70200 }, { - "epoch": 48.596564725596984, - "grad_norm": 4.660221099853516, - "learning_rate": 1.4233347297863427e-06, - "loss": 5.7742, - "step": 232000 + "epoch": 29.451193967323, + "grad_norm": 3.7551257610321045, + "learning_rate": 9.363217427733557e-07, + "loss": 6.179, + "step": 70300 }, { - "epoch": 48.701298701298704, - "grad_norm": 6.032063961029053, - "learning_rate": 1.3188102220360285e-06, - "loss": 5.7866, - "step": 232500 + "epoch": 29.493087557603687, + "grad_norm": 2.9767682552337646, + "learning_rate": 8.664990923055439e-07, + "loss": 6.2157, + "step": 70400 }, { - "epoch": 48.80603267700042, - "grad_norm": 6.296925067901611, - "learning_rate": 1.2142857142857144e-06, - "loss": 5.7922, - "step": 233000 + "epoch": 29.534981147884373, + "grad_norm": 3.3218774795532227, + "learning_rate": 7.966764418377322e-07, + "loss": 6.1773, + "step": 70500 }, { - "epoch": 48.91076665270214, - "grad_norm": 5.2991437911987305, - "learning_rate": 1.1095517385839968e-06, - "loss": 5.7873, - "step": 233500 + "epoch": 29.57687473816506, + 
"grad_norm": 4.360437870025635, + "learning_rate": 7.268537913699204e-07, + "loss": 6.1985, + "step": 70600 }, { - "epoch": 49.015500628403856, - "grad_norm": 6.130777835845947, - "learning_rate": 1.0048177628822792e-06, - "loss": 5.8074, - "step": 234000 + "epoch": 29.618768328445746, + "grad_norm": 3.544264078140259, + "learning_rate": 6.570311409021087e-07, + "loss": 6.1973, + "step": 70700 }, { - "epoch": 49.12023460410557, - "grad_norm": 6.305094242095947, - "learning_rate": 9.000837871805614e-07, - "loss": 5.7847, - "step": 234500 + "epoch": 29.660661918726436, + "grad_norm": 3.7416069507598877, + "learning_rate": 5.872084904342969e-07, + "loss": 6.2031, + "step": 70800 }, { - "epoch": 49.22496857980729, - "grad_norm": 6.156949996948242, - "learning_rate": 7.953498114788438e-07, - "loss": 5.7741, - "step": 235000 + "epoch": 29.702555509007123, + "grad_norm": 3.0346035957336426, + "learning_rate": 5.173858399664851e-07, + "loss": 6.2123, + "step": 70900 }, { - "epoch": 49.32970255550901, - "grad_norm": 6.475966930389404, - "learning_rate": 6.908253037285296e-07, - "loss": 5.8083, - "step": 235500 + "epoch": 29.74444909928781, + "grad_norm": 3.2308425903320312, + "learning_rate": 4.4756318949867344e-07, + "loss": 6.2106, + "step": 71000 }, { - "epoch": 49.43443653121072, - "grad_norm": 5.687895774841309, - "learning_rate": 5.86091328026812e-07, - "loss": 5.7976, - "step": 236000 + "epoch": 29.786342689568496, + "grad_norm": 3.0109570026397705, + "learning_rate": 3.7774053903086163e-07, + "loss": 6.213, + "step": 71100 }, { - "epoch": 49.53917050691244, - "grad_norm": 5.615401744842529, - "learning_rate": 4.813573523250943e-07, - "loss": 5.7907, - "step": 236500 + "epoch": 29.828236279849182, + "grad_norm": 3.733609199523926, + "learning_rate": 3.079178885630499e-07, + "loss": 6.1984, + "step": 71200 }, { - "epoch": 49.64390448261416, - "grad_norm": 6.177160263061523, - "learning_rate": 3.7662337662337666e-07, - "loss": 5.792, - "step": 237000 + "epoch": 29.87012987012987, + "grad_norm": 3.5430541038513184, + "learning_rate": 2.3809523809523814e-07, + "loss": 6.183, + "step": 71300 }, { - "epoch": 49.74863845831588, - "grad_norm": 5.604287624359131, - "learning_rate": 2.71889400921659e-07, - "loss": 5.7837, - "step": 237500 + "epoch": 29.912023460410555, + "grad_norm": 3.1964950561523438, + "learning_rate": 1.6827258762742634e-07, + "loss": 6.1817, + "step": 71400 }, { - "epoch": 49.853372434017594, - "grad_norm": 4.826539039611816, - "learning_rate": 1.673648931713448e-07, - "loss": 5.7808, - "step": 238000 + "epoch": 29.953917050691246, + "grad_norm": 3.6197755336761475, + "learning_rate": 9.844993715961458e-08, + "loss": 6.1907, + "step": 71500 }, { - "epoch": 49.95810640971931, - "grad_norm": 5.6649250984191895, - "learning_rate": 6.263091746962715e-08, - "loss": 5.7888, - "step": 238500 + "epoch": 29.995810640971932, + "grad_norm": 3.035473346710205, + "learning_rate": 2.862728669180282e-08, + "loss": 6.1677, + "step": 71600 }, { - "epoch": 50.0, - "step": 238700, - "total_flos": 5161725447936000.0, - "train_loss": 6.140905278960581, - "train_runtime": 7883.0646, - "train_samples_per_second": 484.444, - "train_steps_per_second": 30.28 + "epoch": 30.0, + "step": 71610, + "total_flos": 781486986700800.0, + "train_loss": 6.528281962779993, + "train_runtime": 3948.1737, + "train_samples_per_second": 580.354, + "train_steps_per_second": 18.137 } ], - "logging_steps": 500, - "max_steps": 238700, + "logging_steps": 100, + "max_steps": 71610, "num_input_tokens_seen": 0, - 
"num_train_epochs": 50, + "num_train_epochs": 30, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { @@ -3374,8 +5047,8 @@ "attributes": {} } }, - "total_flos": 5161725447936000.0, - "train_batch_size": 16, + "total_flos": 781486986700800.0, + "train_batch_size": 32, "trial_name": null, "trial_params": null }
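For anyone who wants to sanity-check the two runs recorded in this diff, the following is a minimal sketch, assuming the updated trainer_state.json sits next to the checkpoint (the path is illustrative, not taken from this repo). It loads the file with the standard library and summarizes the logged loss curve; the filtering relies on the fact that only per-step entries in log_history carry a "loss" key, while the final summary entry carries "train_loss" instead.

    import json

    # Illustrative path; point this at the directory that holds trainer_state.json.
    STATE_PATH = "trainer_state.json"

    with open(STATE_PATH) as f:
        state = json.load(f)

    # Keep only the per-step logging entries; the closing summary entry has no "loss" key.
    history = [entry for entry in state["log_history"] if "loss" in entry]

    print(f"epochs trained: {state['epoch']}, global steps: {state['global_step']}")
    print(f"first logged loss: {history[0]['loss']} at step {history[0]['step']}")
    print(f"last logged loss:  {history[-1]['loss']} at step {history[-1]['step']}")

Run against the new state in this diff, this would report 30.0 epochs, 71610 global steps, and a final logged loss of about 6.17 at step 71600.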