|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 13480, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000741839762611276, |
|
"grad_norm": 18.7586669921875, |
|
"learning_rate": 2.9999959263751826e-06, |
|
"loss": 2.1609, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.001483679525222552, |
|
"grad_norm": 10.413708686828613, |
|
"learning_rate": 2.999983705522856e-06, |
|
"loss": 1.7246, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002225519287833828, |
|
"grad_norm": 11.689846992492676, |
|
"learning_rate": 2.9999633375093975e-06, |
|
"loss": 1.703, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.002967359050445104, |
|
"grad_norm": 10.514595985412598, |
|
"learning_rate": 2.9999348224454366e-06, |
|
"loss": 1.5608, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00370919881305638, |
|
"grad_norm": 9.605154037475586, |
|
"learning_rate": 2.9998981604858526e-06, |
|
"loss": 1.5354, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004451038575667656, |
|
"grad_norm": 25.274913787841797, |
|
"learning_rate": 2.999853351829775e-06, |
|
"loss": 1.5925, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0051928783382789315, |
|
"grad_norm": 11.746683120727539, |
|
"learning_rate": 2.9998003967205817e-06, |
|
"loss": 1.4979, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.005934718100890208, |
|
"grad_norm": 15.60824203491211, |
|
"learning_rate": 2.9997392954458987e-06, |
|
"loss": 1.4213, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0066765578635014835, |
|
"grad_norm": 12.809992790222168, |
|
"learning_rate": 2.9996700483375973e-06, |
|
"loss": 1.604, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.00741839762611276, |
|
"grad_norm": 13.352578163146973, |
|
"learning_rate": 2.9995926557717933e-06, |
|
"loss": 1.5562, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.008160237388724036, |
|
"grad_norm": 11.533346176147461, |
|
"learning_rate": 2.9995071181688438e-06, |
|
"loss": 1.4658, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.008902077151335312, |
|
"grad_norm": 11.692753791809082, |
|
"learning_rate": 2.9994134359933475e-06, |
|
"loss": 1.4382, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.009643916913946587, |
|
"grad_norm": 15.99777889251709, |
|
"learning_rate": 2.9993116097541383e-06, |
|
"loss": 1.5165, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.010385756676557863, |
|
"grad_norm": 34.999664306640625, |
|
"learning_rate": 2.999201640004285e-06, |
|
"loss": 1.6118, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.01112759643916914, |
|
"grad_norm": 10.084653854370117, |
|
"learning_rate": 2.99908352734109e-06, |
|
"loss": 1.4602, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.011869436201780416, |
|
"grad_norm": 96.4203872680664, |
|
"learning_rate": 2.99895727240608e-06, |
|
"loss": 1.6046, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.012611275964391691, |
|
"grad_norm": 14.107004165649414, |
|
"learning_rate": 2.9988228758850097e-06, |
|
"loss": 1.6841, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.013353115727002967, |
|
"grad_norm": 11.504150390625, |
|
"learning_rate": 2.9986803385078545e-06, |
|
"loss": 1.4916, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.014094955489614243, |
|
"grad_norm": 8.963112831115723, |
|
"learning_rate": 2.998529661048805e-06, |
|
"loss": 1.4495, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01483679525222552, |
|
"grad_norm": 11.037364959716797, |
|
"learning_rate": 2.9983708443262657e-06, |
|
"loss": 1.5727, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.015578635014836795, |
|
"grad_norm": 10.980456352233887, |
|
"learning_rate": 2.99820388920285e-06, |
|
"loss": 1.6488, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.016320474777448073, |
|
"grad_norm": 12.589357376098633, |
|
"learning_rate": 2.9980287965853754e-06, |
|
"loss": 1.4721, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.017062314540059347, |
|
"grad_norm": 12.314191818237305, |
|
"learning_rate": 2.9978455674248558e-06, |
|
"loss": 1.6205, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.017804154302670624, |
|
"grad_norm": 9.882691383361816, |
|
"learning_rate": 2.9976542027165016e-06, |
|
"loss": 1.5918, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.018545994065281898, |
|
"grad_norm": 11.461004257202148, |
|
"learning_rate": 2.99745470349971e-06, |
|
"loss": 1.639, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.019287833827893175, |
|
"grad_norm": 9.780576705932617, |
|
"learning_rate": 2.99724707085806e-06, |
|
"loss": 1.3886, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.020029673590504452, |
|
"grad_norm": 9.100162506103516, |
|
"learning_rate": 2.9970313059193096e-06, |
|
"loss": 1.2965, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.020771513353115726, |
|
"grad_norm": 10.991832733154297, |
|
"learning_rate": 2.996807409855385e-06, |
|
"loss": 1.556, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.021513353115727003, |
|
"grad_norm": 10.909322738647461, |
|
"learning_rate": 2.9965753838823784e-06, |
|
"loss": 1.4454, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.02225519287833828, |
|
"grad_norm": 9.893937110900879, |
|
"learning_rate": 2.996335229260538e-06, |
|
"loss": 1.5107, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.022997032640949554, |
|
"grad_norm": 11.460049629211426, |
|
"learning_rate": 2.996086947294264e-06, |
|
"loss": 1.5962, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.02373887240356083, |
|
"grad_norm": 11.341355323791504, |
|
"learning_rate": 2.9958305393321e-06, |
|
"loss": 1.4185, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.024480712166172106, |
|
"grad_norm": 8.831833839416504, |
|
"learning_rate": 2.9955660067667256e-06, |
|
"loss": 1.426, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.025222551928783383, |
|
"grad_norm": 46.39206314086914, |
|
"learning_rate": 2.995293351034949e-06, |
|
"loss": 1.6725, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.02596439169139466, |
|
"grad_norm": 9.161226272583008, |
|
"learning_rate": 2.9950125736177004e-06, |
|
"loss": 1.4317, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.026706231454005934, |
|
"grad_norm": 9.408476829528809, |
|
"learning_rate": 2.9947236760400217e-06, |
|
"loss": 1.6589, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.02744807121661721, |
|
"grad_norm": 10.739395141601562, |
|
"learning_rate": 2.9944266598710606e-06, |
|
"loss": 1.4851, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.028189910979228485, |
|
"grad_norm": 9.48901653289795, |
|
"learning_rate": 2.99412152672406e-06, |
|
"loss": 1.4584, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.028931750741839762, |
|
"grad_norm": 21.37883758544922, |
|
"learning_rate": 2.9938082782563505e-06, |
|
"loss": 1.438, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.02967359050445104, |
|
"grad_norm": 9.90542984008789, |
|
"learning_rate": 2.993486916169341e-06, |
|
"loss": 1.4416, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.030415430267062313, |
|
"grad_norm": 10.844427108764648, |
|
"learning_rate": 2.99315744220851e-06, |
|
"loss": 1.5955, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.03115727002967359, |
|
"grad_norm": 9.396254539489746, |
|
"learning_rate": 2.9928198581633946e-06, |
|
"loss": 1.3916, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.031899109792284865, |
|
"grad_norm": 9.14573860168457, |
|
"learning_rate": 2.9924741658675827e-06, |
|
"loss": 1.4467, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.032640949554896145, |
|
"grad_norm": 8.896514892578125, |
|
"learning_rate": 2.9921203671987025e-06, |
|
"loss": 1.4743, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.03338278931750742, |
|
"grad_norm": 10.342260360717773, |
|
"learning_rate": 2.9917584640784107e-06, |
|
"loss": 1.541, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.03412462908011869, |
|
"grad_norm": 9.386099815368652, |
|
"learning_rate": 2.991388458472385e-06, |
|
"loss": 1.27, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.034866468842729974, |
|
"grad_norm": 10.977550506591797, |
|
"learning_rate": 2.9910103523903087e-06, |
|
"loss": 1.4037, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.03560830860534125, |
|
"grad_norm": 9.735797882080078, |
|
"learning_rate": 2.9906241478858667e-06, |
|
"loss": 1.5082, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.03635014836795252, |
|
"grad_norm": 9.580273628234863, |
|
"learning_rate": 2.9902298470567285e-06, |
|
"loss": 1.4226, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.037091988130563795, |
|
"grad_norm": 12.713663101196289, |
|
"learning_rate": 2.989827452044538e-06, |
|
"loss": 1.5578, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.037091988130563795, |
|
"eval_loss": 1.465081810951233, |
|
"eval_runtime": 23.4325, |
|
"eval_samples_per_second": 19.033, |
|
"eval_steps_per_second": 9.517, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.037833827893175076, |
|
"grad_norm": 9.113161087036133, |
|
"learning_rate": 2.9894169650349047e-06, |
|
"loss": 1.4684, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.03857566765578635, |
|
"grad_norm": 9.525900840759277, |
|
"learning_rate": 2.988998388257388e-06, |
|
"loss": 1.3998, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.039317507418397624, |
|
"grad_norm": 10.796713829040527, |
|
"learning_rate": 2.988571723985488e-06, |
|
"loss": 1.556, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.040059347181008904, |
|
"grad_norm": 10.529806137084961, |
|
"learning_rate": 2.9881369745366313e-06, |
|
"loss": 1.3819, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.04080118694362018, |
|
"grad_norm": 9.02527141571045, |
|
"learning_rate": 2.9876941422721592e-06, |
|
"loss": 1.4893, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.04154302670623145, |
|
"grad_norm": 8.577601432800293, |
|
"learning_rate": 2.987243229597316e-06, |
|
"loss": 1.5552, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.04228486646884273, |
|
"grad_norm": 10.954402923583984, |
|
"learning_rate": 2.9867842389612326e-06, |
|
"loss": 1.3512, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.04302670623145401, |
|
"grad_norm": 9.236324310302734, |
|
"learning_rate": 2.9863171728569175e-06, |
|
"loss": 1.5264, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.04376854599406528, |
|
"grad_norm": 9.721325874328613, |
|
"learning_rate": 2.9858420338212393e-06, |
|
"loss": 1.5841, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.04451038575667656, |
|
"grad_norm": 10.43162727355957, |
|
"learning_rate": 2.985358824434916e-06, |
|
"loss": 1.6017, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.045252225519287835, |
|
"grad_norm": 9.003376960754395, |
|
"learning_rate": 2.984867547322499e-06, |
|
"loss": 1.4716, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.04599406528189911, |
|
"grad_norm": 9.628597259521484, |
|
"learning_rate": 2.9843682051523604e-06, |
|
"loss": 1.5641, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.04673590504451038, |
|
"grad_norm": 14.442529678344727, |
|
"learning_rate": 2.9838608006366766e-06, |
|
"loss": 1.5637, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.04747774480712166, |
|
"grad_norm": 10.035704612731934, |
|
"learning_rate": 2.983345336531415e-06, |
|
"loss": 1.5762, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.04821958456973294, |
|
"grad_norm": 8.628552436828613, |
|
"learning_rate": 2.9828218156363188e-06, |
|
"loss": 1.4425, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.04896142433234421, |
|
"grad_norm": 10.285855293273926, |
|
"learning_rate": 2.982290240794892e-06, |
|
"loss": 1.4368, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.04970326409495549, |
|
"grad_norm": 9.917787551879883, |
|
"learning_rate": 2.981750614894383e-06, |
|
"loss": 1.547, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.050445103857566766, |
|
"grad_norm": 10.46651554107666, |
|
"learning_rate": 2.9812029408657698e-06, |
|
"loss": 1.4292, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.05118694362017804, |
|
"grad_norm": 8.533087730407715, |
|
"learning_rate": 2.9806472216837436e-06, |
|
"loss": 1.4962, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.05192878338278932, |
|
"grad_norm": 9.585909843444824, |
|
"learning_rate": 2.9800834603666935e-06, |
|
"loss": 1.471, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.052670623145400594, |
|
"grad_norm": 9.136356353759766, |
|
"learning_rate": 2.9795116599766883e-06, |
|
"loss": 1.31, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.05341246290801187, |
|
"grad_norm": 9.795812606811523, |
|
"learning_rate": 2.9789318236194618e-06, |
|
"loss": 1.5102, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.05415430267062315, |
|
"grad_norm": 9.728421211242676, |
|
"learning_rate": 2.9783439544443953e-06, |
|
"loss": 1.4569, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.05489614243323442, |
|
"grad_norm": 8.628436088562012, |
|
"learning_rate": 2.9777480556444996e-06, |
|
"loss": 1.5004, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.055637982195845696, |
|
"grad_norm": 9.77978229522705, |
|
"learning_rate": 2.9771441304563996e-06, |
|
"loss": 1.4255, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.05637982195845697, |
|
"grad_norm": 9.335463523864746, |
|
"learning_rate": 2.9765321821603144e-06, |
|
"loss": 1.5658, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.05712166172106825, |
|
"grad_norm": 12.877664566040039, |
|
"learning_rate": 2.9759122140800406e-06, |
|
"loss": 1.672, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.057863501483679525, |
|
"grad_norm": 15.497161865234375, |
|
"learning_rate": 2.9752842295829357e-06, |
|
"loss": 1.4453, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.0586053412462908, |
|
"grad_norm": 9.707072257995605, |
|
"learning_rate": 2.9746482320798967e-06, |
|
"loss": 1.4298, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.05934718100890208, |
|
"grad_norm": 9.096467018127441, |
|
"learning_rate": 2.9740042250253443e-06, |
|
"loss": 1.6281, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.06008902077151335, |
|
"grad_norm": 10.356392860412598, |
|
"learning_rate": 2.973352211917202e-06, |
|
"loss": 1.4703, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.06083086053412463, |
|
"grad_norm": 10.25114917755127, |
|
"learning_rate": 2.972692196296879e-06, |
|
"loss": 1.4442, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.06157270029673591, |
|
"grad_norm": 8.946527481079102, |
|
"learning_rate": 2.9720241817492502e-06, |
|
"loss": 1.3684, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.06231454005934718, |
|
"grad_norm": 10.437005043029785, |
|
"learning_rate": 2.9713481719026366e-06, |
|
"loss": 1.548, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.06305637982195846, |
|
"grad_norm": 9.255142211914062, |
|
"learning_rate": 2.9706641704287855e-06, |
|
"loss": 1.4895, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.06379821958456973, |
|
"grad_norm": 9.349931716918945, |
|
"learning_rate": 2.9699721810428503e-06, |
|
"loss": 1.4152, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.064540059347181, |
|
"grad_norm": 8.700305938720703, |
|
"learning_rate": 2.9692722075033715e-06, |
|
"loss": 1.4541, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.06528189910979229, |
|
"grad_norm": 10.963595390319824, |
|
"learning_rate": 2.9685642536122545e-06, |
|
"loss": 1.3894, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.06602373887240356, |
|
"grad_norm": 25.613452911376953, |
|
"learning_rate": 2.967848323214752e-06, |
|
"loss": 1.6023, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.06676557863501484, |
|
"grad_norm": 9.307974815368652, |
|
"learning_rate": 2.967124420199439e-06, |
|
"loss": 1.5659, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.06750741839762611, |
|
"grad_norm": 9.049477577209473, |
|
"learning_rate": 2.966392548498195e-06, |
|
"loss": 1.5969, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.06824925816023739, |
|
"grad_norm": 8.523443222045898, |
|
"learning_rate": 2.9656527120861803e-06, |
|
"loss": 1.491, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.06899109792284866, |
|
"grad_norm": 8.638110160827637, |
|
"learning_rate": 2.9649049149818167e-06, |
|
"loss": 1.4304, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.06973293768545995, |
|
"grad_norm": 10.084444999694824, |
|
"learning_rate": 2.9641491612467636e-06, |
|
"loss": 1.4847, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.07047477744807122, |
|
"grad_norm": 7.784031391143799, |
|
"learning_rate": 2.9633854549858975e-06, |
|
"loss": 1.3943, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0712166172106825, |
|
"grad_norm": 8.431685447692871, |
|
"learning_rate": 2.9626138003472885e-06, |
|
"loss": 1.4669, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.07195845697329377, |
|
"grad_norm": 9.953826904296875, |
|
"learning_rate": 2.9618342015221793e-06, |
|
"loss": 1.3398, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.07270029673590504, |
|
"grad_norm": 8.906854629516602, |
|
"learning_rate": 2.9610466627449597e-06, |
|
"loss": 1.5057, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.07344213649851632, |
|
"grad_norm": 9.184341430664062, |
|
"learning_rate": 2.9602511882931473e-06, |
|
"loss": 1.476, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.07418397626112759, |
|
"grad_norm": 9.252667427062988, |
|
"learning_rate": 2.959447782487361e-06, |
|
"loss": 1.4645, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07418397626112759, |
|
"eval_loss": 1.4362387657165527, |
|
"eval_runtime": 23.4866, |
|
"eval_samples_per_second": 18.99, |
|
"eval_steps_per_second": 9.495, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07492581602373888, |
|
"grad_norm": 9.09242057800293, |
|
"learning_rate": 2.958636449691299e-06, |
|
"loss": 1.561, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.07566765578635015, |
|
"grad_norm": 8.406475067138672, |
|
"learning_rate": 2.957817194311716e-06, |
|
"loss": 1.4029, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.07640949554896143, |
|
"grad_norm": 9.518254280090332, |
|
"learning_rate": 2.956990020798396e-06, |
|
"loss": 1.5051, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.0771513353115727, |
|
"grad_norm": 9.93432903289795, |
|
"learning_rate": 2.956154933644133e-06, |
|
"loss": 1.2554, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.07789317507418397, |
|
"grad_norm": 7.695739269256592, |
|
"learning_rate": 2.955311937384702e-06, |
|
"loss": 1.4648, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.07863501483679525, |
|
"grad_norm": 9.189163208007812, |
|
"learning_rate": 2.9544610365988374e-06, |
|
"loss": 1.5584, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.07937685459940653, |
|
"grad_norm": 8.053617477416992, |
|
"learning_rate": 2.9536022359082062e-06, |
|
"loss": 1.3786, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.08011869436201781, |
|
"grad_norm": 9.746628761291504, |
|
"learning_rate": 2.9527355399773845e-06, |
|
"loss": 1.3726, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.08086053412462908, |
|
"grad_norm": 8.845373153686523, |
|
"learning_rate": 2.951860953513831e-06, |
|
"loss": 1.3768, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.08160237388724036, |
|
"grad_norm": 8.069707870483398, |
|
"learning_rate": 2.950978481267862e-06, |
|
"loss": 1.3207, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.08234421364985163, |
|
"grad_norm": 9.178265571594238, |
|
"learning_rate": 2.9500881280326244e-06, |
|
"loss": 1.5972, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.0830860534124629, |
|
"grad_norm": 8.713502883911133, |
|
"learning_rate": 2.9491898986440725e-06, |
|
"loss": 1.4182, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.08382789317507418, |
|
"grad_norm": 9.482294082641602, |
|
"learning_rate": 2.948283797980939e-06, |
|
"loss": 1.5129, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.08456973293768547, |
|
"grad_norm": 11.337164878845215, |
|
"learning_rate": 2.947369830964709e-06, |
|
"loss": 1.426, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.08531157270029674, |
|
"grad_norm": 9.933257102966309, |
|
"learning_rate": 2.9464480025595937e-06, |
|
"loss": 1.4275, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.08605341246290801, |
|
"grad_norm": 8.315671920776367, |
|
"learning_rate": 2.9455183177725058e-06, |
|
"loss": 1.4933, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.08679525222551929, |
|
"grad_norm": 8.2044677734375, |
|
"learning_rate": 2.9445807816530258e-06, |
|
"loss": 1.4755, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.08753709198813056, |
|
"grad_norm": 7.752995014190674, |
|
"learning_rate": 2.9436353992933816e-06, |
|
"loss": 1.207, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.08827893175074183, |
|
"grad_norm": 8.823128700256348, |
|
"learning_rate": 2.9426821758284173e-06, |
|
"loss": 1.4338, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.08902077151335312, |
|
"grad_norm": 7.024681091308594, |
|
"learning_rate": 2.9417211164355664e-06, |
|
"loss": 1.4365, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.0897626112759644, |
|
"grad_norm": 8.412097930908203, |
|
"learning_rate": 2.940752226334822e-06, |
|
"loss": 1.1898, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.09050445103857567, |
|
"grad_norm": 8.81240463256836, |
|
"learning_rate": 2.9397755107887114e-06, |
|
"loss": 1.5879, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.09124629080118694, |
|
"grad_norm": 12.02270793914795, |
|
"learning_rate": 2.938790975102264e-06, |
|
"loss": 1.3401, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.09198813056379822, |
|
"grad_norm": 9.22630500793457, |
|
"learning_rate": 2.9377986246229853e-06, |
|
"loss": 1.3431, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.09272997032640949, |
|
"grad_norm": 8.395411491394043, |
|
"learning_rate": 2.9367984647408272e-06, |
|
"loss": 1.3423, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.09347181008902077, |
|
"grad_norm": 9.383752822875977, |
|
"learning_rate": 2.9357905008881574e-06, |
|
"loss": 1.5453, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.09421364985163205, |
|
"grad_norm": 13.686159133911133, |
|
"learning_rate": 2.934774738539731e-06, |
|
"loss": 1.5254, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.09495548961424333, |
|
"grad_norm": 9.126317977905273, |
|
"learning_rate": 2.9337511832126614e-06, |
|
"loss": 1.3578, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.0956973293768546, |
|
"grad_norm": 9.802062034606934, |
|
"learning_rate": 2.9327198404663893e-06, |
|
"loss": 1.3732, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.09643916913946587, |
|
"grad_norm": 8.623979568481445, |
|
"learning_rate": 2.931680715902652e-06, |
|
"loss": 1.4103, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.09718100890207715, |
|
"grad_norm": 9.61336612701416, |
|
"learning_rate": 2.9306338151654547e-06, |
|
"loss": 1.4382, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.09792284866468842, |
|
"grad_norm": 8.745745658874512, |
|
"learning_rate": 2.9295791439410385e-06, |
|
"loss": 1.2856, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.09866468842729971, |
|
"grad_norm": 8.679821968078613, |
|
"learning_rate": 2.9285167079578504e-06, |
|
"loss": 1.257, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.09940652818991098, |
|
"grad_norm": 11.308154106140137, |
|
"learning_rate": 2.92744651298651e-06, |
|
"loss": 1.4787, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.10014836795252226, |
|
"grad_norm": 8.959935188293457, |
|
"learning_rate": 2.926368564839782e-06, |
|
"loss": 1.2769, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.10089020771513353, |
|
"grad_norm": 6.9831438064575195, |
|
"learning_rate": 2.9252828693725405e-06, |
|
"loss": 1.4526, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.1016320474777448, |
|
"grad_norm": 8.822589874267578, |
|
"learning_rate": 2.924189432481741e-06, |
|
"loss": 1.3483, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.10237388724035608, |
|
"grad_norm": 8.989341735839844, |
|
"learning_rate": 2.923088260106386e-06, |
|
"loss": 1.4483, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.10311572700296735, |
|
"grad_norm": 9.763890266418457, |
|
"learning_rate": 2.921979358227492e-06, |
|
"loss": 1.3835, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.10385756676557864, |
|
"grad_norm": 8.562960624694824, |
|
"learning_rate": 2.92086273286806e-06, |
|
"loss": 1.4348, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.10459940652818991, |
|
"grad_norm": 10.014548301696777, |
|
"learning_rate": 2.91973839009304e-06, |
|
"loss": 1.2826, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.10534124629080119, |
|
"grad_norm": 11.542120933532715, |
|
"learning_rate": 2.9186063360093e-06, |
|
"loss": 1.3613, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.10608308605341246, |
|
"grad_norm": 8.246392250061035, |
|
"learning_rate": 2.917466576765591e-06, |
|
"loss": 1.4738, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.10682492581602374, |
|
"grad_norm": 9.511324882507324, |
|
"learning_rate": 2.916319118552515e-06, |
|
"loss": 1.4706, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.10756676557863501, |
|
"grad_norm": 8.671672821044922, |
|
"learning_rate": 2.915163967602492e-06, |
|
"loss": 1.3392, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.1083086053412463, |
|
"grad_norm": 9.805370330810547, |
|
"learning_rate": 2.914001130189722e-06, |
|
"loss": 1.5192, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.10905044510385757, |
|
"grad_norm": 8.378101348876953, |
|
"learning_rate": 2.912830612630158e-06, |
|
"loss": 1.3507, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.10979228486646884, |
|
"grad_norm": 8.799610137939453, |
|
"learning_rate": 2.9116524212814653e-06, |
|
"loss": 1.4003, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.11053412462908012, |
|
"grad_norm": 8.829014778137207, |
|
"learning_rate": 2.91046656254299e-06, |
|
"loss": 1.5949, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.11127596439169139, |
|
"grad_norm": 8.634420394897461, |
|
"learning_rate": 2.9092730428557236e-06, |
|
"loss": 1.4198, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.11127596439169139, |
|
"eval_loss": 1.4195871353149414, |
|
"eval_runtime": 23.4693, |
|
"eval_samples_per_second": 19.004, |
|
"eval_steps_per_second": 9.502, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.11201780415430267, |
|
"grad_norm": 7.636455059051514, |
|
"learning_rate": 2.9080718687022676e-06, |
|
"loss": 1.4234, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.11275964391691394, |
|
"grad_norm": 8.863425254821777, |
|
"learning_rate": 2.9068630466067996e-06, |
|
"loss": 1.5965, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.11350148367952523, |
|
"grad_norm": 8.970385551452637, |
|
"learning_rate": 2.905646583135036e-06, |
|
"loss": 1.4643, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.1142433234421365, |
|
"grad_norm": 10.134622573852539, |
|
"learning_rate": 2.904422484894198e-06, |
|
"loss": 1.4593, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.11498516320474778, |
|
"grad_norm": 8.219001770019531, |
|
"learning_rate": 2.9031907585329753e-06, |
|
"loss": 1.4802, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.11572700296735905, |
|
"grad_norm": 9.880292892456055, |
|
"learning_rate": 2.901951410741489e-06, |
|
"loss": 1.4993, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.11646884272997032, |
|
"grad_norm": 8.31434154510498, |
|
"learning_rate": 2.9007044482512563e-06, |
|
"loss": 1.5126, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.1172106824925816, |
|
"grad_norm": 8.074999809265137, |
|
"learning_rate": 2.899449877835154e-06, |
|
"loss": 1.1785, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.11795252225519288, |
|
"grad_norm": 9.222709655761719, |
|
"learning_rate": 2.8981877063073808e-06, |
|
"loss": 1.3661, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.11869436201780416, |
|
"grad_norm": 9.243541717529297, |
|
"learning_rate": 2.8969179405234202e-06, |
|
"loss": 1.3965, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.11943620178041543, |
|
"grad_norm": 8.484634399414062, |
|
"learning_rate": 2.8956405873800063e-06, |
|
"loss": 1.4526, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.1201780415430267, |
|
"grad_norm": 7.875013828277588, |
|
"learning_rate": 2.8943556538150813e-06, |
|
"loss": 1.439, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.12091988130563798, |
|
"grad_norm": 8.981459617614746, |
|
"learning_rate": 2.893063146807762e-06, |
|
"loss": 1.5325, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.12166172106824925, |
|
"grad_norm": 8.40335464477539, |
|
"learning_rate": 2.8917630733783004e-06, |
|
"loss": 1.4615, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.12240356083086053, |
|
"grad_norm": 8.828475952148438, |
|
"learning_rate": 2.890455440588043e-06, |
|
"loss": 1.5635, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.12314540059347182, |
|
"grad_norm": 10.00554084777832, |
|
"learning_rate": 2.8891402555393995e-06, |
|
"loss": 1.4823, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.12388724035608309, |
|
"grad_norm": 9.928216934204102, |
|
"learning_rate": 2.8878175253757955e-06, |
|
"loss": 1.3582, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.12462908011869436, |
|
"grad_norm": 11.623834609985352, |
|
"learning_rate": 2.8864872572816406e-06, |
|
"loss": 1.4406, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.12537091988130564, |
|
"grad_norm": 12.635778427124023, |
|
"learning_rate": 2.885149458482285e-06, |
|
"loss": 1.3821, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.1261127596439169, |
|
"grad_norm": 10.610758781433105, |
|
"learning_rate": 2.8838041362439823e-06, |
|
"loss": 1.5266, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.12685459940652818, |
|
"grad_norm": 8.499368667602539, |
|
"learning_rate": 2.8824512978738506e-06, |
|
"loss": 1.3015, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.12759643916913946, |
|
"grad_norm": 7.7737507820129395, |
|
"learning_rate": 2.881090950719831e-06, |
|
"loss": 1.3831, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.12833827893175073, |
|
"grad_norm": 9.742268562316895, |
|
"learning_rate": 2.8797231021706486e-06, |
|
"loss": 1.5125, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.129080118694362, |
|
"grad_norm": 9.315298080444336, |
|
"learning_rate": 2.8783477596557722e-06, |
|
"loss": 1.5418, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.1298219584569733, |
|
"grad_norm": 9.360373497009277, |
|
"learning_rate": 2.8769649306453745e-06, |
|
"loss": 1.4129, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.13056379821958458, |
|
"grad_norm": 10.6887845993042, |
|
"learning_rate": 2.8755746226502914e-06, |
|
"loss": 1.3005, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.13130563798219586, |
|
"grad_norm": 8.747626304626465, |
|
"learning_rate": 2.87417684322198e-06, |
|
"loss": 1.3693, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.13204747774480713, |
|
"grad_norm": 10.2086820602417, |
|
"learning_rate": 2.872771599952479e-06, |
|
"loss": 1.3155, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.1327893175074184, |
|
"grad_norm": 8.937162399291992, |
|
"learning_rate": 2.871358900474367e-06, |
|
"loss": 1.5346, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.13353115727002968, |
|
"grad_norm": 8.907169342041016, |
|
"learning_rate": 2.8699387524607205e-06, |
|
"loss": 1.4442, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.13427299703264095, |
|
"grad_norm": 8.316621780395508, |
|
"learning_rate": 2.8685111636250736e-06, |
|
"loss": 1.3703, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.13501483679525222, |
|
"grad_norm": 8.593326568603516, |
|
"learning_rate": 2.867076141721374e-06, |
|
"loss": 1.2765, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.1357566765578635, |
|
"grad_norm": 9.69709300994873, |
|
"learning_rate": 2.865633694543944e-06, |
|
"loss": 1.5247, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.13649851632047477, |
|
"grad_norm": 8.481054306030273, |
|
"learning_rate": 2.864183829927434e-06, |
|
"loss": 1.437, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.13724035608308605, |
|
"grad_norm": 7.5963335037231445, |
|
"learning_rate": 2.8627265557467836e-06, |
|
"loss": 1.3608, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.13798219584569732, |
|
"grad_norm": 9.460357666015625, |
|
"learning_rate": 2.861261879917177e-06, |
|
"loss": 1.4096, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.1387240356083086, |
|
"grad_norm": 8.779165267944336, |
|
"learning_rate": 2.8597898103940014e-06, |
|
"loss": 1.3327, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.1394658753709199, |
|
"grad_norm": 8.048774719238281, |
|
"learning_rate": 2.858310355172801e-06, |
|
"loss": 1.3372, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.14020771513353117, |
|
"grad_norm": 8.53365421295166, |
|
"learning_rate": 2.8568235222892375e-06, |
|
"loss": 1.4482, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.14094955489614244, |
|
"grad_norm": 9.450532913208008, |
|
"learning_rate": 2.8553293198190425e-06, |
|
"loss": 1.3362, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.14169139465875372, |
|
"grad_norm": 7.9473958015441895, |
|
"learning_rate": 2.853827755877977e-06, |
|
"loss": 1.3946, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.142433234421365, |
|
"grad_norm": 10.09933090209961, |
|
"learning_rate": 2.852318838621784e-06, |
|
"loss": 1.5963, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.14317507418397626, |
|
"grad_norm": 8.691498756408691, |
|
"learning_rate": 2.850802576246149e-06, |
|
"loss": 1.3957, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.14391691394658754, |
|
"grad_norm": 9.597620010375977, |
|
"learning_rate": 2.8492789769866493e-06, |
|
"loss": 1.4577, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.1446587537091988, |
|
"grad_norm": 9.706177711486816, |
|
"learning_rate": 2.8477480491187146e-06, |
|
"loss": 1.4256, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.14540059347181009, |
|
"grad_norm": 9.215739250183105, |
|
"learning_rate": 2.846209800957579e-06, |
|
"loss": 1.4918, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.14614243323442136, |
|
"grad_norm": 8.966597557067871, |
|
"learning_rate": 2.8446642408582374e-06, |
|
"loss": 1.435, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.14688427299703263, |
|
"grad_norm": 8.87956428527832, |
|
"learning_rate": 2.8431113772153984e-06, |
|
"loss": 1.4318, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.1476261127596439, |
|
"grad_norm": 9.43526840209961, |
|
"learning_rate": 2.8415512184634413e-06, |
|
"loss": 1.4226, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.14836795252225518, |
|
"grad_norm": 7.335799694061279, |
|
"learning_rate": 2.839983773076367e-06, |
|
"loss": 1.3469, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.14836795252225518, |
|
"eval_loss": 1.405104160308838, |
|
"eval_runtime": 23.4479, |
|
"eval_samples_per_second": 19.021, |
|
"eval_steps_per_second": 9.51, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.14910979228486648, |
|
"grad_norm": 8.135221481323242, |
|
"learning_rate": 2.8384090495677555e-06, |
|
"loss": 1.3779, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.14985163204747776, |
|
"grad_norm": 8.584566116333008, |
|
"learning_rate": 2.8368270564907167e-06, |
|
"loss": 1.4178, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.15059347181008903, |
|
"grad_norm": 9.192804336547852, |
|
"learning_rate": 2.8352378024378462e-06, |
|
"loss": 1.4223, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.1513353115727003, |
|
"grad_norm": 10.986886024475098, |
|
"learning_rate": 2.8336412960411765e-06, |
|
"loss": 1.5351, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.15207715133531158, |
|
"grad_norm": 8.154606819152832, |
|
"learning_rate": 2.832037545972132e-06, |
|
"loss": 1.3744, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.15281899109792285, |
|
"grad_norm": 8.556278228759766, |
|
"learning_rate": 2.8304265609414803e-06, |
|
"loss": 1.3267, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.15356083086053413, |
|
"grad_norm": 9.713357925415039, |
|
"learning_rate": 2.8288083496992867e-06, |
|
"loss": 1.3808, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.1543026706231454, |
|
"grad_norm": 8.706491470336914, |
|
"learning_rate": 2.8271829210348656e-06, |
|
"loss": 1.297, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.15504451038575667, |
|
"grad_norm": 8.89303970336914, |
|
"learning_rate": 2.825550283776731e-06, |
|
"loss": 1.2562, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.15578635014836795, |
|
"grad_norm": 8.402449607849121, |
|
"learning_rate": 2.8239104467925532e-06, |
|
"loss": 1.4105, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.15652818991097922, |
|
"grad_norm": 7.475712776184082, |
|
"learning_rate": 2.8222634189891055e-06, |
|
"loss": 1.3397, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.1572700296735905, |
|
"grad_norm": 8.340933799743652, |
|
"learning_rate": 2.8206092093122193e-06, |
|
"loss": 1.2691, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.15801186943620177, |
|
"grad_norm": 7.353670597076416, |
|
"learning_rate": 2.8189478267467344e-06, |
|
"loss": 1.408, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.15875370919881307, |
|
"grad_norm": 8.455607414245605, |
|
"learning_rate": 2.817279280316449e-06, |
|
"loss": 1.5435, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.15949554896142434, |
|
"grad_norm": 9.295350074768066, |
|
"learning_rate": 2.8156035790840733e-06, |
|
"loss": 1.5229, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.16023738872403562, |
|
"grad_norm": 9.709535598754883, |
|
"learning_rate": 2.8139207321511777e-06, |
|
"loss": 1.5848, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.1609792284866469, |
|
"grad_norm": 10.39367389678955, |
|
"learning_rate": 2.8122307486581455e-06, |
|
"loss": 1.4792, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.16172106824925817, |
|
"grad_norm": 8.161094665527344, |
|
"learning_rate": 2.8105336377841212e-06, |
|
"loss": 1.4138, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.16246290801186944, |
|
"grad_norm": 9.033132553100586, |
|
"learning_rate": 2.808829408746962e-06, |
|
"loss": 1.5123, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.1632047477744807, |
|
"grad_norm": 8.76311206817627, |
|
"learning_rate": 2.8071180708031874e-06, |
|
"loss": 1.4737, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.163946587537092, |
|
"grad_norm": 9.680130004882812, |
|
"learning_rate": 2.8053996332479296e-06, |
|
"loss": 1.3447, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.16468842729970326, |
|
"grad_norm": 9.140039443969727, |
|
"learning_rate": 2.8036741054148817e-06, |
|
"loss": 1.479, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.16543026706231453, |
|
"grad_norm": 7.610710144042969, |
|
"learning_rate": 2.801941496676247e-06, |
|
"loss": 1.3595, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.1661721068249258, |
|
"grad_norm": 11.338227272033691, |
|
"learning_rate": 2.8002018164426896e-06, |
|
"loss": 1.4566, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.16691394658753708, |
|
"grad_norm": 8.052413940429688, |
|
"learning_rate": 2.7984550741632837e-06, |
|
"loss": 1.3201, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.16765578635014836, |
|
"grad_norm": 8.803062438964844, |
|
"learning_rate": 2.7967012793254575e-06, |
|
"loss": 1.3299, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.16839762611275966, |
|
"grad_norm": 8.115534782409668, |
|
"learning_rate": 2.7949404414549484e-06, |
|
"loss": 1.4376, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.16913946587537093, |
|
"grad_norm": 9.156294822692871, |
|
"learning_rate": 2.7931725701157462e-06, |
|
"loss": 1.4132, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.1698813056379822, |
|
"grad_norm": 8.102431297302246, |
|
"learning_rate": 2.7913976749100445e-06, |
|
"loss": 1.4156, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.17062314540059348, |
|
"grad_norm": 8.303695678710938, |
|
"learning_rate": 2.789615765478186e-06, |
|
"loss": 1.2913, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.17136498516320475, |
|
"grad_norm": 7.867891311645508, |
|
"learning_rate": 2.787826851498611e-06, |
|
"loss": 1.2225, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.17210682492581603, |
|
"grad_norm": 8.89625072479248, |
|
"learning_rate": 2.786030942687805e-06, |
|
"loss": 1.5093, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.1728486646884273, |
|
"grad_norm": 8.792491912841797, |
|
"learning_rate": 2.784228048800247e-06, |
|
"loss": 1.3146, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.17359050445103857, |
|
"grad_norm": 9.683384895324707, |
|
"learning_rate": 2.7824181796283543e-06, |
|
"loss": 1.4008, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.17433234421364985, |
|
"grad_norm": 9.359085083007812, |
|
"learning_rate": 2.780601345002431e-06, |
|
"loss": 1.2744, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.17507418397626112, |
|
"grad_norm": 7.971740245819092, |
|
"learning_rate": 2.7787775547906143e-06, |
|
"loss": 1.3748, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.1758160237388724, |
|
"grad_norm": 9.259309768676758, |
|
"learning_rate": 2.77694681889882e-06, |
|
"loss": 1.3978, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.17655786350148367, |
|
"grad_norm": 8.904669761657715, |
|
"learning_rate": 2.7751091472706886e-06, |
|
"loss": 1.3772, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.17729970326409494, |
|
"grad_norm": 7.627325057983398, |
|
"learning_rate": 2.773264549887535e-06, |
|
"loss": 1.3509, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.17804154302670624, |
|
"grad_norm": 9.28232479095459, |
|
"learning_rate": 2.771413036768288e-06, |
|
"loss": 1.4038, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.17878338278931752, |
|
"grad_norm": 11.565908432006836, |
|
"learning_rate": 2.7695546179694412e-06, |
|
"loss": 1.4158, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.1795252225519288, |
|
"grad_norm": 8.238388061523438, |
|
"learning_rate": 2.767689303584996e-06, |
|
"loss": 1.4911, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.18026706231454007, |
|
"grad_norm": 8.432221412658691, |
|
"learning_rate": 2.765817103746407e-06, |
|
"loss": 1.5864, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.18100890207715134, |
|
"grad_norm": 8.204069137573242, |
|
"learning_rate": 2.7639380286225262e-06, |
|
"loss": 1.3994, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.18175074183976261, |
|
"grad_norm": 8.444053649902344, |
|
"learning_rate": 2.762052088419551e-06, |
|
"loss": 1.576, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.1824925816023739, |
|
"grad_norm": 8.946913719177246, |
|
"learning_rate": 2.760159293380965e-06, |
|
"loss": 1.1678, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.18323442136498516, |
|
"grad_norm": 8.895451545715332, |
|
"learning_rate": 2.758259653787483e-06, |
|
"loss": 1.3972, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.18397626112759644, |
|
"grad_norm": 9.011785507202148, |
|
"learning_rate": 2.7563531799569982e-06, |
|
"loss": 1.2209, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.1847181008902077, |
|
"grad_norm": 8.469378471374512, |
|
"learning_rate": 2.754439882244522e-06, |
|
"loss": 1.4777, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.18545994065281898, |
|
"grad_norm": 8.05780029296875, |
|
"learning_rate": 2.7525197710421303e-06, |
|
"loss": 1.3816, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.18545994065281898, |
|
"eval_loss": 1.391993522644043, |
|
"eval_runtime": 23.4505, |
|
"eval_samples_per_second": 19.019, |
|
"eval_steps_per_second": 9.509, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.18620178041543026, |
|
"grad_norm": 7.978991508483887, |
|
"learning_rate": 2.7505928567789073e-06, |
|
"loss": 1.4641, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.18694362017804153, |
|
"grad_norm": 8.432256698608398, |
|
"learning_rate": 2.7486591499208866e-06, |
|
"loss": 1.4184, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.18768545994065283, |
|
"grad_norm": 9.253658294677734, |
|
"learning_rate": 2.7467186609709973e-06, |
|
"loss": 1.4106, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.1884272997032641, |
|
"grad_norm": 16.86107635498047, |
|
"learning_rate": 2.7447714004690042e-06, |
|
"loss": 1.4225, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.18916913946587538, |
|
"grad_norm": 9.117183685302734, |
|
"learning_rate": 2.7428173789914524e-06, |
|
"loss": 1.3031, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.18991097922848665, |
|
"grad_norm": 11.524558067321777, |
|
"learning_rate": 2.740856607151609e-06, |
|
"loss": 1.3394, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.19065281899109793, |
|
"grad_norm": 9.210947036743164, |
|
"learning_rate": 2.7388890955994055e-06, |
|
"loss": 1.5357, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.1913946587537092, |
|
"grad_norm": 10.00994873046875, |
|
"learning_rate": 2.7369148550213806e-06, |
|
"loss": 1.3765, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.19213649851632048, |
|
"grad_norm": 7.468533992767334, |
|
"learning_rate": 2.7349338961406223e-06, |
|
"loss": 1.4192, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.19287833827893175, |
|
"grad_norm": 8.357904434204102, |
|
"learning_rate": 2.7329462297167068e-06, |
|
"loss": 1.3348, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.19362017804154302, |
|
"grad_norm": 9.04192066192627, |
|
"learning_rate": 2.7309518665456454e-06, |
|
"loss": 1.3598, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.1943620178041543, |
|
"grad_norm": 9.699695587158203, |
|
"learning_rate": 2.72895081745982e-06, |
|
"loss": 1.5076, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.19510385756676557, |
|
"grad_norm": 8.667801856994629, |
|
"learning_rate": 2.7269430933279284e-06, |
|
"loss": 1.2957, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.19584569732937684, |
|
"grad_norm": 8.39424991607666, |
|
"learning_rate": 2.724928705054924e-06, |
|
"loss": 1.3713, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.19658753709198812, |
|
"grad_norm": 8.892675399780273, |
|
"learning_rate": 2.7229076635819563e-06, |
|
"loss": 1.4559, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.19732937685459942, |
|
"grad_norm": 10.235827445983887, |
|
"learning_rate": 2.720879979886311e-06, |
|
"loss": 1.3907, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.1980712166172107, |
|
"grad_norm": 9.297379493713379, |
|
"learning_rate": 2.7188456649813526e-06, |
|
"loss": 1.4805, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.19881305637982197, |
|
"grad_norm": 10.14811897277832, |
|
"learning_rate": 2.7168047299164614e-06, |
|
"loss": 1.4573, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.19955489614243324, |
|
"grad_norm": 8.918148040771484, |
|
"learning_rate": 2.7147571857769755e-06, |
|
"loss": 1.3873, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.20029673590504452, |
|
"grad_norm": 8.084507942199707, |
|
"learning_rate": 2.7127030436841307e-06, |
|
"loss": 1.2873, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.2010385756676558, |
|
"grad_norm": 8.225303649902344, |
|
"learning_rate": 2.710642314794999e-06, |
|
"loss": 1.4675, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.20178041543026706, |
|
"grad_norm": 8.811010360717773, |
|
"learning_rate": 2.7085750103024297e-06, |
|
"loss": 1.4683, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.20252225519287834, |
|
"grad_norm": 8.835148811340332, |
|
"learning_rate": 2.7065011414349858e-06, |
|
"loss": 1.4257, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.2032640949554896, |
|
"grad_norm": 15.418182373046875, |
|
"learning_rate": 2.704420719456885e-06, |
|
"loss": 1.4806, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.20400593471810088, |
|
"grad_norm": 9.259235382080078, |
|
"learning_rate": 2.7023337556679402e-06, |
|
"loss": 1.6237, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.20474777448071216, |
|
"grad_norm": 11.389565467834473, |
|
"learning_rate": 2.7002402614034937e-06, |
|
"loss": 1.3695, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.20548961424332343, |
|
"grad_norm": 7.731765270233154, |
|
"learning_rate": 2.69814024803436e-06, |
|
"loss": 1.4801, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.2062314540059347, |
|
"grad_norm": 8.97433853149414, |
|
"learning_rate": 2.6960337269667605e-06, |
|
"loss": 1.4708, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.206973293768546, |
|
"grad_norm": 9.035865783691406, |
|
"learning_rate": 2.6939207096422634e-06, |
|
"loss": 1.4399, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.20771513353115728, |
|
"grad_norm": 9.75682258605957, |
|
"learning_rate": 2.6918012075377224e-06, |
|
"loss": 1.3488, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.20845697329376855, |
|
"grad_norm": 9.119101524353027, |
|
"learning_rate": 2.689675232165213e-06, |
|
"loss": 1.3, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.20919881305637983, |
|
"grad_norm": 8.837667465209961, |
|
"learning_rate": 2.68754279507197e-06, |
|
"loss": 1.3659, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.2099406528189911, |
|
"grad_norm": 8.174179077148438, |
|
"learning_rate": 2.685403907840324e-06, |
|
"loss": 1.3446, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.21068249258160238, |
|
"grad_norm": 9.282876968383789, |
|
"learning_rate": 2.6832585820876413e-06, |
|
"loss": 1.4882, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.21142433234421365, |
|
"grad_norm": 7.6600213050842285, |
|
"learning_rate": 2.681106829466258e-06, |
|
"loss": 1.1834, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.21216617210682492, |
|
"grad_norm": 9.84327220916748, |
|
"learning_rate": 2.678948661663417e-06, |
|
"loss": 1.4927, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.2129080118694362, |
|
"grad_norm": 9.372842788696289, |
|
"learning_rate": 2.6767840904012078e-06, |
|
"loss": 1.4625, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.21364985163204747, |
|
"grad_norm": 7.723082542419434, |
|
"learning_rate": 2.6746131274364977e-06, |
|
"loss": 1.3829, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.21439169139465875, |
|
"grad_norm": 8.692205429077148, |
|
"learning_rate": 2.6724357845608716e-06, |
|
"loss": 1.46, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.21513353115727002, |
|
"grad_norm": 9.735092163085938, |
|
"learning_rate": 2.6702520736005673e-06, |
|
"loss": 1.3574, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.2158753709198813, |
|
"grad_norm": 8.781496047973633, |
|
"learning_rate": 2.6680620064164094e-06, |
|
"loss": 1.421, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.2166172106824926, |
|
"grad_norm": 8.708477020263672, |
|
"learning_rate": 2.6658655949037482e-06, |
|
"loss": 1.3353, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.21735905044510387, |
|
"grad_norm": 9.43267822265625, |
|
"learning_rate": 2.6636628509923924e-06, |
|
"loss": 1.2779, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.21810089020771514, |
|
"grad_norm": 9.485703468322754, |
|
"learning_rate": 2.661453786646544e-06, |
|
"loss": 1.4917, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.21884272997032642, |
|
"grad_norm": 9.164180755615234, |
|
"learning_rate": 2.659238413864736e-06, |
|
"loss": 1.2931, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.2195845697329377, |
|
"grad_norm": 8.09424114227295, |
|
"learning_rate": 2.6570167446797654e-06, |
|
"loss": 1.4717, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.22032640949554896, |
|
"grad_norm": 8.689072608947754, |
|
"learning_rate": 2.6547887911586278e-06, |
|
"loss": 1.3389, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.22106824925816024, |
|
"grad_norm": 7.4104838371276855, |
|
"learning_rate": 2.6525545654024517e-06, |
|
"loss": 1.2771, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.2218100890207715, |
|
"grad_norm": 8.580281257629395, |
|
"learning_rate": 2.650314079546434e-06, |
|
"loss": 1.3574, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.22255192878338279, |
|
"grad_norm": 6.826554298400879, |
|
"learning_rate": 2.648067345759774e-06, |
|
"loss": 1.3653, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22255192878338279, |
|
"eval_loss": 1.380942463874817, |
|
"eval_runtime": 23.446, |
|
"eval_samples_per_second": 19.022, |
|
"eval_steps_per_second": 9.511, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22329376854599406, |
|
"grad_norm": 8.048758506774902, |
|
"learning_rate": 2.6458143762456038e-06, |
|
"loss": 1.4932, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.22403560830860533, |
|
"grad_norm": 8.818073272705078, |
|
"learning_rate": 2.643555183240928e-06, |
|
"loss": 1.3055, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.2247774480712166, |
|
"grad_norm": 7.931951999664307, |
|
"learning_rate": 2.6412897790165526e-06, |
|
"loss": 1.4524, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.22551928783382788, |
|
"grad_norm": 7.983026504516602, |
|
"learning_rate": 2.6390181758770205e-06, |
|
"loss": 1.3969, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.22626112759643918, |
|
"grad_norm": 9.100227355957031, |
|
"learning_rate": 2.636740386160543e-06, |
|
"loss": 1.3396, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.22700296735905046, |
|
"grad_norm": 8.59542179107666, |
|
"learning_rate": 2.6344564222389353e-06, |
|
"loss": 1.3731, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.22774480712166173, |
|
"grad_norm": 7.7173752784729, |
|
"learning_rate": 2.6321662965175457e-06, |
|
"loss": 1.2887, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.228486646884273, |
|
"grad_norm": 9.884195327758789, |
|
"learning_rate": 2.6298700214351924e-06, |
|
"loss": 1.2001, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.22922848664688428, |
|
"grad_norm": 8.387007713317871, |
|
"learning_rate": 2.627567609464092e-06, |
|
"loss": 1.4851, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.22997032640949555, |
|
"grad_norm": 8.314335823059082, |
|
"learning_rate": 2.6252590731097956e-06, |
|
"loss": 1.3391, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.23071216617210683, |
|
"grad_norm": 8.861979484558105, |
|
"learning_rate": 2.6229444249111175e-06, |
|
"loss": 1.3721, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.2314540059347181, |
|
"grad_norm": 11.68078899383545, |
|
"learning_rate": 2.6206236774400685e-06, |
|
"loss": 1.5759, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.23219584569732937, |
|
"grad_norm": 8.4688081741333, |
|
"learning_rate": 2.618296843301788e-06, |
|
"loss": 1.3431, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.23293768545994065, |
|
"grad_norm": 8.566194534301758, |
|
"learning_rate": 2.6159639351344755e-06, |
|
"loss": 1.373, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.23367952522255192, |
|
"grad_norm": 6.903346538543701, |
|
"learning_rate": 2.6136249656093204e-06, |
|
"loss": 1.2995, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.2344213649851632, |
|
"grad_norm": 8.093761444091797, |
|
"learning_rate": 2.611279947430436e-06, |
|
"loss": 1.4552, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.23516320474777447, |
|
"grad_norm": 9.532185554504395, |
|
"learning_rate": 2.608928893334788e-06, |
|
"loss": 1.359, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.23590504451038577, |
|
"grad_norm": 10.045039176940918, |
|
"learning_rate": 2.6065718160921246e-06, |
|
"loss": 1.5474, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.23664688427299704, |
|
"grad_norm": 9.059492111206055, |
|
"learning_rate": 2.604208728504912e-06, |
|
"loss": 1.2215, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.23738872403560832, |
|
"grad_norm": 10.714762687683105, |
|
"learning_rate": 2.601839643408259e-06, |
|
"loss": 1.3327, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2381305637982196, |
|
"grad_norm": 8.981411933898926, |
|
"learning_rate": 2.599464573669851e-06, |
|
"loss": 1.3985, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.23887240356083086, |
|
"grad_norm": 8.016975402832031, |
|
"learning_rate": 2.597083532189879e-06, |
|
"loss": 1.2672, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.23961424332344214, |
|
"grad_norm": 9.3323335647583, |
|
"learning_rate": 2.594696531900968e-06, |
|
"loss": 1.2048, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.2403560830860534, |
|
"grad_norm": 7.841317653656006, |
|
"learning_rate": 2.592303585768111e-06, |
|
"loss": 1.3764, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.2410979228486647, |
|
"grad_norm": 9.452821731567383, |
|
"learning_rate": 2.5899047067885935e-06, |
|
"loss": 1.3729, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.24183976261127596, |
|
"grad_norm": 11.088187217712402, |
|
"learning_rate": 2.5874999079919264e-06, |
|
"loss": 1.3502, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.24258160237388723, |
|
"grad_norm": 9.076626777648926, |
|
"learning_rate": 2.5850892024397736e-06, |
|
"loss": 1.3962, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.2433234421364985, |
|
"grad_norm": 9.371712684631348, |
|
"learning_rate": 2.5826726032258818e-06, |
|
"loss": 1.5036, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.24406528189910978, |
|
"grad_norm": 8.981965065002441, |
|
"learning_rate": 2.580250123476009e-06, |
|
"loss": 1.3917, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.24480712166172106, |
|
"grad_norm": 7.41351842880249, |
|
"learning_rate": 2.577821776347853e-06, |
|
"loss": 1.2765, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.24554896142433236, |
|
"grad_norm": 7.898843765258789, |
|
"learning_rate": 2.5753875750309814e-06, |
|
"loss": 1.4827, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.24629080118694363, |
|
"grad_norm": 8.024171829223633, |
|
"learning_rate": 2.572947532746758e-06, |
|
"loss": 1.4173, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.2470326409495549, |
|
"grad_norm": 7.735332489013672, |
|
"learning_rate": 2.570501662748271e-06, |
|
"loss": 1.3901, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.24777448071216618, |
|
"grad_norm": 8.987187385559082, |
|
"learning_rate": 2.568049978320263e-06, |
|
"loss": 1.4371, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.24851632047477745, |
|
"grad_norm": 9.167318344116211, |
|
"learning_rate": 2.5655924927790585e-06, |
|
"loss": 1.3519, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.24925816023738873, |
|
"grad_norm": 7.899603366851807, |
|
"learning_rate": 2.5631292194724884e-06, |
|
"loss": 1.31, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 8.992423057556152, |
|
"learning_rate": 2.5606601717798212e-06, |
|
"loss": 1.3822, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.2507418397626113, |
|
"grad_norm": 9.284130096435547, |
|
"learning_rate": 2.558185363111689e-06, |
|
"loss": 1.4068, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.25148367952522255, |
|
"grad_norm": 9.180769920349121, |
|
"learning_rate": 2.555704806910015e-06, |
|
"loss": 1.377, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.2522255192878338, |
|
"grad_norm": 9.335295677185059, |
|
"learning_rate": 2.553218516647939e-06, |
|
"loss": 1.3997, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.2529673590504451, |
|
"grad_norm": 10.324609756469727, |
|
"learning_rate": 2.550726505829746e-06, |
|
"loss": 1.502, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.25370919881305637, |
|
"grad_norm": 8.74648380279541, |
|
"learning_rate": 2.5482287879907926e-06, |
|
"loss": 1.3515, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.25445103857566764, |
|
"grad_norm": 9.311241149902344, |
|
"learning_rate": 2.5457253766974314e-06, |
|
"loss": 1.3607, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.2551928783382789, |
|
"grad_norm": 9.811213493347168, |
|
"learning_rate": 2.543216285546942e-06, |
|
"loss": 1.436, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.2559347181008902, |
|
"grad_norm": 8.822476387023926, |
|
"learning_rate": 2.5407015281674513e-06, |
|
"loss": 1.582, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.25667655786350146, |
|
"grad_norm": 7.025854110717773, |
|
"learning_rate": 2.5381811182178632e-06, |
|
"loss": 1.3498, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.25741839762611274, |
|
"grad_norm": 8.49760627746582, |
|
"learning_rate": 2.5356550693877845e-06, |
|
"loss": 1.4426, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.258160237388724, |
|
"grad_norm": 9.154727935791016, |
|
"learning_rate": 2.5331233953974484e-06, |
|
"loss": 1.2733, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.2589020771513353, |
|
"grad_norm": 7.772784233093262, |
|
"learning_rate": 2.5305861099976416e-06, |
|
"loss": 1.2198, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.2596439169139466, |
|
"grad_norm": 7.934385776519775, |
|
"learning_rate": 2.5280432269696283e-06, |
|
"loss": 1.4087, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2596439169139466, |
|
"eval_loss": 1.3714910745620728, |
|
"eval_runtime": 23.4503, |
|
"eval_samples_per_second": 19.019, |
|
"eval_steps_per_second": 9.509, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2603857566765579, |
|
"grad_norm": 7.804587364196777, |
|
"learning_rate": 2.5254947601250787e-06, |
|
"loss": 1.2602, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.26112759643916916, |
|
"grad_norm": 10.741705894470215, |
|
"learning_rate": 2.5229407233059886e-06, |
|
"loss": 1.5066, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.26186943620178044, |
|
"grad_norm": 7.940061092376709, |
|
"learning_rate": 2.5203811303846093e-06, |
|
"loss": 1.3713, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.2626112759643917, |
|
"grad_norm": 8.638043403625488, |
|
"learning_rate": 2.5178159952633683e-06, |
|
"loss": 1.4127, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.263353115727003, |
|
"grad_norm": 7.808784008026123, |
|
"learning_rate": 2.515245331874797e-06, |
|
"loss": 1.3337, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.26409495548961426, |
|
"grad_norm": 7.855457782745361, |
|
"learning_rate": 2.5126691541814516e-06, |
|
"loss": 1.4842, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.26483679525222553, |
|
"grad_norm": 7.667708873748779, |
|
"learning_rate": 2.5100874761758426e-06, |
|
"loss": 1.2371, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.2655786350148368, |
|
"grad_norm": 8.755106925964355, |
|
"learning_rate": 2.5075003118803524e-06, |
|
"loss": 1.4708, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.2663204747774481, |
|
"grad_norm": 8.294569969177246, |
|
"learning_rate": 2.504907675347163e-06, |
|
"loss": 1.4162, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.26706231454005935, |
|
"grad_norm": 8.485974311828613, |
|
"learning_rate": 2.50230958065818e-06, |
|
"loss": 1.4551, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.2678041543026706, |
|
"grad_norm": 12.968074798583984, |
|
"learning_rate": 2.4997060419249534e-06, |
|
"loss": 1.4756, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.2685459940652819, |
|
"grad_norm": 7.765286922454834, |
|
"learning_rate": 2.4970970732886032e-06, |
|
"loss": 1.2534, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.2692878338278932, |
|
"grad_norm": 8.599440574645996, |
|
"learning_rate": 2.494482688919742e-06, |
|
"loss": 1.3371, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.27002967359050445, |
|
"grad_norm": 8.294087409973145, |
|
"learning_rate": 2.491862903018398e-06, |
|
"loss": 1.4185, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.2707715133531157, |
|
"grad_norm": 8.291155815124512, |
|
"learning_rate": 2.489237729813938e-06, |
|
"loss": 1.3793, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.271513353115727, |
|
"grad_norm": 7.898152828216553, |
|
"learning_rate": 2.4866071835649887e-06, |
|
"loss": 1.3714, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.27225519287833827, |
|
"grad_norm": 8.396595001220703, |
|
"learning_rate": 2.483971278559362e-06, |
|
"loss": 1.4737, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.27299703264094954, |
|
"grad_norm": 7.634808540344238, |
|
"learning_rate": 2.4813300291139753e-06, |
|
"loss": 1.3822, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.2737388724035608, |
|
"grad_norm": 8.787116050720215, |
|
"learning_rate": 2.4786834495747738e-06, |
|
"loss": 1.2784, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.2744807121661721, |
|
"grad_norm": 10.124987602233887, |
|
"learning_rate": 2.476031554316655e-06, |
|
"loss": 1.4317, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.27522255192878337, |
|
"grad_norm": 8.735859870910645, |
|
"learning_rate": 2.4733743577433857e-06, |
|
"loss": 1.2954, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.27596439169139464, |
|
"grad_norm": 9.41859245300293, |
|
"learning_rate": 2.470711874287529e-06, |
|
"loss": 1.4109, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.2767062314540059, |
|
"grad_norm": 7.346931457519531, |
|
"learning_rate": 2.4680441184103642e-06, |
|
"loss": 1.3118, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.2774480712166172, |
|
"grad_norm": 8.223915100097656, |
|
"learning_rate": 2.465371104601805e-06, |
|
"loss": 1.451, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.27818991097922846, |
|
"grad_norm": 8.05762004852295, |
|
"learning_rate": 2.4626928473803264e-06, |
|
"loss": 1.4075, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.2789317507418398, |
|
"grad_norm": 10.53507137298584, |
|
"learning_rate": 2.4600093612928813e-06, |
|
"loss": 1.4301, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.27967359050445106, |
|
"grad_norm": 7.951254367828369, |
|
"learning_rate": 2.457320660914824e-06, |
|
"loss": 1.4816, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.28041543026706234, |
|
"grad_norm": 11.656047821044922, |
|
"learning_rate": 2.45462676084983e-06, |
|
"loss": 1.2551, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.2811572700296736, |
|
"grad_norm": 9.22987174987793, |
|
"learning_rate": 2.451927675729816e-06, |
|
"loss": 1.4458, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.2818991097922849, |
|
"grad_norm": 9.910201072692871, |
|
"learning_rate": 2.4492234202148643e-06, |
|
"loss": 1.428, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.28264094955489616, |
|
"grad_norm": 8.999225616455078, |
|
"learning_rate": 2.4465140089931357e-06, |
|
"loss": 1.275, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.28338278931750743, |
|
"grad_norm": 7.863303184509277, |
|
"learning_rate": 2.443799456780798e-06, |
|
"loss": 1.3344, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.2841246290801187, |
|
"grad_norm": 8.949956893920898, |
|
"learning_rate": 2.44107977832194e-06, |
|
"loss": 1.3681, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.28486646884273, |
|
"grad_norm": 10.083333015441895, |
|
"learning_rate": 2.438354988388495e-06, |
|
"loss": 1.2786, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.28560830860534125, |
|
"grad_norm": 8.96097183227539, |
|
"learning_rate": 2.4356251017801596e-06, |
|
"loss": 1.3194, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.28635014836795253, |
|
"grad_norm": 9.839349746704102, |
|
"learning_rate": 2.432890133324311e-06, |
|
"loss": 1.3521, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.2870919881305638, |
|
"grad_norm": 7.604780197143555, |
|
"learning_rate": 2.43015009787593e-06, |
|
"loss": 1.3759, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.2878338278931751, |
|
"grad_norm": 7.909048080444336, |
|
"learning_rate": 2.427405010317519e-06, |
|
"loss": 1.3872, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.28857566765578635, |
|
"grad_norm": 8.023886680603027, |
|
"learning_rate": 2.4246548855590206e-06, |
|
"loss": 1.4451, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.2893175074183976, |
|
"grad_norm": 8.603988647460938, |
|
"learning_rate": 2.4218997385377356e-06, |
|
"loss": 1.3554, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.2900593471810089, |
|
"grad_norm": 8.416375160217285, |
|
"learning_rate": 2.4191395842182455e-06, |
|
"loss": 1.4591, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.29080118694362017, |
|
"grad_norm": 8.673905372619629, |
|
"learning_rate": 2.416374437592327e-06, |
|
"loss": 1.3327, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.29154302670623145, |
|
"grad_norm": 8.481094360351562, |
|
"learning_rate": 2.413604313678874e-06, |
|
"loss": 1.3097, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.2922848664688427, |
|
"grad_norm": 8.51818561553955, |
|
"learning_rate": 2.4108292275238133e-06, |
|
"loss": 1.2288, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.293026706231454, |
|
"grad_norm": 9.287731170654297, |
|
"learning_rate": 2.4080491942000247e-06, |
|
"loss": 1.3104, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.29376854599406527, |
|
"grad_norm": 9.262923240661621, |
|
"learning_rate": 2.4052642288072596e-06, |
|
"loss": 1.5436, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.29451038575667654, |
|
"grad_norm": 9.646564483642578, |
|
"learning_rate": 2.4024743464720555e-06, |
|
"loss": 1.3926, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.2952522255192878, |
|
"grad_norm": 8.739798545837402, |
|
"learning_rate": 2.3996795623476577e-06, |
|
"loss": 1.4747, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.2959940652818991, |
|
"grad_norm": 8.455376625061035, |
|
"learning_rate": 2.396879891613936e-06, |
|
"loss": 1.371, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.29673590504451036, |
|
"grad_norm": 8.93728256225586, |
|
"learning_rate": 2.394075349477302e-06, |
|
"loss": 1.2973, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.29673590504451036, |
|
"eval_loss": 1.36147141456604, |
|
"eval_runtime": 23.4427, |
|
"eval_samples_per_second": 19.025, |
|
"eval_steps_per_second": 9.513, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.29747774480712164, |
|
"grad_norm": 8.445281982421875, |
|
"learning_rate": 2.3912659511706243e-06, |
|
"loss": 1.4152, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.29821958456973297, |
|
"grad_norm": 9.02658748626709, |
|
"learning_rate": 2.3884517119531496e-06, |
|
"loss": 1.4489, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.29896142433234424, |
|
"grad_norm": 8.706474304199219, |
|
"learning_rate": 2.385632647110418e-06, |
|
"loss": 1.401, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.2997032640949555, |
|
"grad_norm": 7.351003170013428, |
|
"learning_rate": 2.382808771954179e-06, |
|
"loss": 1.4131, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.3004451038575668, |
|
"grad_norm": 8.288825988769531, |
|
"learning_rate": 2.3799801018223095e-06, |
|
"loss": 1.2643, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.30118694362017806, |
|
"grad_norm": 8.027029991149902, |
|
"learning_rate": 2.3771466520787316e-06, |
|
"loss": 1.3642, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.30192878338278933, |
|
"grad_norm": 9.516772270202637, |
|
"learning_rate": 2.3743084381133264e-06, |
|
"loss": 1.2057, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.3026706231454006, |
|
"grad_norm": 8.332013130187988, |
|
"learning_rate": 2.371465475341852e-06, |
|
"loss": 1.347, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.3034124629080119, |
|
"grad_norm": 7.586446762084961, |
|
"learning_rate": 2.3686177792058606e-06, |
|
"loss": 1.4661, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.30415430267062316, |
|
"grad_norm": 9.531535148620605, |
|
"learning_rate": 2.3657653651726125e-06, |
|
"loss": 1.242, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.30489614243323443, |
|
"grad_norm": 7.554753303527832, |
|
"learning_rate": 2.362908248734994e-06, |
|
"loss": 1.3381, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.3056379821958457, |
|
"grad_norm": 9.01855754852295, |
|
"learning_rate": 2.360046445411433e-06, |
|
"loss": 1.5718, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.306379821958457, |
|
"grad_norm": 8.020215034484863, |
|
"learning_rate": 2.3571799707458125e-06, |
|
"loss": 1.2917, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.30712166172106825, |
|
"grad_norm": 8.08421802520752, |
|
"learning_rate": 2.35430884030739e-06, |
|
"loss": 1.4316, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.3078635014836795, |
|
"grad_norm": 8.234532356262207, |
|
"learning_rate": 2.351433069690709e-06, |
|
"loss": 1.2778, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3086053412462908, |
|
"grad_norm": 7.486210823059082, |
|
"learning_rate": 2.348552674515517e-06, |
|
"loss": 1.3158, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.3093471810089021, |
|
"grad_norm": 11.375346183776855, |
|
"learning_rate": 2.34566767042668e-06, |
|
"loss": 1.4065, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.31008902077151335, |
|
"grad_norm": 8.795413970947266, |
|
"learning_rate": 2.3427780730940967e-06, |
|
"loss": 1.3817, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.3108308605341246, |
|
"grad_norm": 8.96834659576416, |
|
"learning_rate": 2.3398838982126147e-06, |
|
"loss": 1.4102, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.3115727002967359, |
|
"grad_norm": 6.874296188354492, |
|
"learning_rate": 2.3369851615019433e-06, |
|
"loss": 1.3764, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.31231454005934717, |
|
"grad_norm": 7.878982067108154, |
|
"learning_rate": 2.3340818787065715e-06, |
|
"loss": 1.313, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.31305637982195844, |
|
"grad_norm": 8.147690773010254, |
|
"learning_rate": 2.3311740655956785e-06, |
|
"loss": 1.4591, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.3137982195845697, |
|
"grad_norm": 8.309657096862793, |
|
"learning_rate": 2.32826173796305e-06, |
|
"loss": 1.367, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.314540059347181, |
|
"grad_norm": 9.30339241027832, |
|
"learning_rate": 2.3253449116269937e-06, |
|
"loss": 1.2814, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.31528189910979226, |
|
"grad_norm": 9.000772476196289, |
|
"learning_rate": 2.3224236024302502e-06, |
|
"loss": 1.2713, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.31602373887240354, |
|
"grad_norm": 8.01784610748291, |
|
"learning_rate": 2.319497826239911e-06, |
|
"loss": 1.3312, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.3167655786350148, |
|
"grad_norm": 8.405533790588379, |
|
"learning_rate": 2.316567598947327e-06, |
|
"loss": 1.3651, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.31750741839762614, |
|
"grad_norm": 8.148391723632812, |
|
"learning_rate": 2.3136329364680287e-06, |
|
"loss": 1.4414, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.3182492581602374, |
|
"grad_norm": 36.44773864746094, |
|
"learning_rate": 2.3106938547416338e-06, |
|
"loss": 1.3181, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.3189910979228487, |
|
"grad_norm": 7.259230613708496, |
|
"learning_rate": 2.307750369731764e-06, |
|
"loss": 1.3493, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.31973293768545996, |
|
"grad_norm": 8.317214012145996, |
|
"learning_rate": 2.304802497425958e-06, |
|
"loss": 1.4059, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.32047477744807124, |
|
"grad_norm": 8.004743576049805, |
|
"learning_rate": 2.3018502538355825e-06, |
|
"loss": 1.4011, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.3212166172106825, |
|
"grad_norm": 9.351004600524902, |
|
"learning_rate": 2.298893654995749e-06, |
|
"loss": 1.5036, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.3219584569732938, |
|
"grad_norm": 8.475602149963379, |
|
"learning_rate": 2.295932716965222e-06, |
|
"loss": 1.2183, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.32270029673590506, |
|
"grad_norm": 7.471583366394043, |
|
"learning_rate": 2.292967455826337e-06, |
|
"loss": 1.3892, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.32344213649851633, |
|
"grad_norm": 9.214890480041504, |
|
"learning_rate": 2.2899978876849085e-06, |
|
"loss": 1.472, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.3241839762611276, |
|
"grad_norm": 8.986857414245605, |
|
"learning_rate": 2.287024028670145e-06, |
|
"loss": 1.2721, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.3249258160237389, |
|
"grad_norm": 8.836446762084961, |
|
"learning_rate": 2.284045894934562e-06, |
|
"loss": 1.2329, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.32566765578635015, |
|
"grad_norm": 8.13981819152832, |
|
"learning_rate": 2.281063502653891e-06, |
|
"loss": 1.2512, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.3264094955489614, |
|
"grad_norm": 8.709846496582031, |
|
"learning_rate": 2.278076868026995e-06, |
|
"loss": 1.3859, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3271513353115727, |
|
"grad_norm": 9.3983154296875, |
|
"learning_rate": 2.27508600727578e-06, |
|
"loss": 1.4237, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.327893175074184, |
|
"grad_norm": 8.226868629455566, |
|
"learning_rate": 2.272090936645105e-06, |
|
"loss": 1.3894, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.32863501483679525, |
|
"grad_norm": 9.627702713012695, |
|
"learning_rate": 2.2690916724026954e-06, |
|
"loss": 1.3225, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.3293768545994065, |
|
"grad_norm": 11.345617294311523, |
|
"learning_rate": 2.266088230839055e-06, |
|
"loss": 1.3649, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.3301186943620178, |
|
"grad_norm": 7.237599849700928, |
|
"learning_rate": 2.2630806282673744e-06, |
|
"loss": 1.5589, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.33086053412462907, |
|
"grad_norm": 8.742907524108887, |
|
"learning_rate": 2.2600688810234474e-06, |
|
"loss": 1.4584, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.33160237388724034, |
|
"grad_norm": 9.190670013427734, |
|
"learning_rate": 2.257053005465578e-06, |
|
"loss": 1.4466, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.3323442136498516, |
|
"grad_norm": 8.909046173095703, |
|
"learning_rate": 2.2540330179744934e-06, |
|
"loss": 1.3321, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.3330860534124629, |
|
"grad_norm": 8.911348342895508, |
|
"learning_rate": 2.2510089349532553e-06, |
|
"loss": 1.4146, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.33382789317507416, |
|
"grad_norm": 8.258678436279297, |
|
"learning_rate": 2.2479807728271696e-06, |
|
"loss": 1.348, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.33382789317507416, |
|
"eval_loss": 1.3544670343399048, |
|
"eval_runtime": 23.4388, |
|
"eval_samples_per_second": 19.028, |
|
"eval_steps_per_second": 9.514, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.33456973293768544, |
|
"grad_norm": 8.755362510681152, |
|
"learning_rate": 2.2449485480436982e-06, |
|
"loss": 1.3788, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.3353115727002967, |
|
"grad_norm": 8.534749031066895, |
|
"learning_rate": 2.24191227707237e-06, |
|
"loss": 1.2039, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.336053412462908, |
|
"grad_norm": 7.606124401092529, |
|
"learning_rate": 2.238871976404689e-06, |
|
"loss": 1.4215, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.3367952522255193, |
|
"grad_norm": 8.163749694824219, |
|
"learning_rate": 2.235827662554048e-06, |
|
"loss": 1.3814, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.3375370919881306, |
|
"grad_norm": 7.764957427978516, |
|
"learning_rate": 2.232779352055637e-06, |
|
"loss": 1.2437, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.33827893175074186, |
|
"grad_norm": 10.332768440246582, |
|
"learning_rate": 2.2297270614663533e-06, |
|
"loss": 1.4328, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.33902077151335314, |
|
"grad_norm": 8.382997512817383, |
|
"learning_rate": 2.2266708073647128e-06, |
|
"loss": 1.4947, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.3397626112759644, |
|
"grad_norm": 8.392914772033691, |
|
"learning_rate": 2.2236106063507592e-06, |
|
"loss": 1.3206, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.3405044510385757, |
|
"grad_norm": 8.482207298278809, |
|
"learning_rate": 2.220546475045973e-06, |
|
"loss": 1.473, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.34124629080118696, |
|
"grad_norm": 9.380014419555664, |
|
"learning_rate": 2.2174784300931828e-06, |
|
"loss": 1.5559, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.34198813056379823, |
|
"grad_norm": 8.139824867248535, |
|
"learning_rate": 2.2144064881564747e-06, |
|
"loss": 1.5721, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.3427299703264095, |
|
"grad_norm": 9.55907917022705, |
|
"learning_rate": 2.2113306659210997e-06, |
|
"loss": 1.3778, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.3434718100890208, |
|
"grad_norm": 10.155835151672363, |
|
"learning_rate": 2.208250980093386e-06, |
|
"loss": 1.2517, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.34421364985163205, |
|
"grad_norm": 8.608782768249512, |
|
"learning_rate": 2.205167447400646e-06, |
|
"loss": 1.3875, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.3449554896142433, |
|
"grad_norm": 9.097238540649414, |
|
"learning_rate": 2.202080084591087e-06, |
|
"loss": 1.389, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.3456973293768546, |
|
"grad_norm": 8.809340476989746, |
|
"learning_rate": 2.1989889084337194e-06, |
|
"loss": 1.2246, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.3464391691394659, |
|
"grad_norm": 9.638260841369629, |
|
"learning_rate": 2.195893935718266e-06, |
|
"loss": 1.4718, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.34718100890207715, |
|
"grad_norm": 7.2880730628967285, |
|
"learning_rate": 2.19279518325507e-06, |
|
"loss": 1.1473, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.3479228486646884, |
|
"grad_norm": 9.370959281921387, |
|
"learning_rate": 2.1896926678750043e-06, |
|
"loss": 1.3126, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.3486646884272997, |
|
"grad_norm": 7.85057258605957, |
|
"learning_rate": 2.1865864064293813e-06, |
|
"loss": 1.3338, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.34940652818991097, |
|
"grad_norm": 8.449581146240234, |
|
"learning_rate": 2.1834764157898587e-06, |
|
"loss": 1.3948, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.35014836795252224, |
|
"grad_norm": 10.200738906860352, |
|
"learning_rate": 2.18036271284835e-06, |
|
"loss": 1.4157, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.3508902077151335, |
|
"grad_norm": 9.506202697753906, |
|
"learning_rate": 2.177245314516932e-06, |
|
"loss": 1.4382, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.3516320474777448, |
|
"grad_norm": 9.932241439819336, |
|
"learning_rate": 2.174124237727753e-06, |
|
"loss": 1.408, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.35237388724035607, |
|
"grad_norm": 10.123774528503418, |
|
"learning_rate": 2.1709994994329406e-06, |
|
"loss": 1.1708, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.35311572700296734, |
|
"grad_norm": 7.982966899871826, |
|
"learning_rate": 2.1678711166045108e-06, |
|
"loss": 1.2625, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.3538575667655786, |
|
"grad_norm": 9.418827056884766, |
|
"learning_rate": 2.164739106234273e-06, |
|
"loss": 1.3367, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.3545994065281899, |
|
"grad_norm": 9.385802268981934, |
|
"learning_rate": 2.161603485333742e-06, |
|
"loss": 1.5404, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.35534124629080116, |
|
"grad_norm": 8.353150367736816, |
|
"learning_rate": 2.1584642709340414e-06, |
|
"loss": 1.5455, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.3560830860534125, |
|
"grad_norm": 7.22542667388916, |
|
"learning_rate": 2.155321480085813e-06, |
|
"loss": 1.4264, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.35682492581602376, |
|
"grad_norm": 7.641038417816162, |
|
"learning_rate": 2.152175129859125e-06, |
|
"loss": 1.3006, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.35756676557863504, |
|
"grad_norm": 7.675732135772705, |
|
"learning_rate": 2.1490252373433783e-06, |
|
"loss": 1.3992, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.3583086053412463, |
|
"grad_norm": 7.769400119781494, |
|
"learning_rate": 2.1458718196472124e-06, |
|
"loss": 1.2344, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.3590504451038576, |
|
"grad_norm": 8.751335144042969, |
|
"learning_rate": 2.1427148938984156e-06, |
|
"loss": 1.4056, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.35979228486646886, |
|
"grad_norm": 10.821932792663574, |
|
"learning_rate": 2.1395544772438288e-06, |
|
"loss": 1.362, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.36053412462908013, |
|
"grad_norm": 7.864255905151367, |
|
"learning_rate": 2.136390586849255e-06, |
|
"loss": 1.4346, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.3612759643916914, |
|
"grad_norm": 10.004661560058594, |
|
"learning_rate": 2.1332232398993634e-06, |
|
"loss": 1.4811, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.3620178041543027, |
|
"grad_norm": 8.67725944519043, |
|
"learning_rate": 2.130052453597598e-06, |
|
"loss": 1.3436, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.36275964391691395, |
|
"grad_norm": 8.538166999816895, |
|
"learning_rate": 2.126878245166084e-06, |
|
"loss": 1.286, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.36350148367952523, |
|
"grad_norm": 8.13525676727295, |
|
"learning_rate": 2.1237006318455345e-06, |
|
"loss": 1.3891, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.3642433234421365, |
|
"grad_norm": 7.657358646392822, |
|
"learning_rate": 2.1205196308951547e-06, |
|
"loss": 1.4672, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.3649851632047478, |
|
"grad_norm": 9.132546424865723, |
|
"learning_rate": 2.1173352595925505e-06, |
|
"loss": 1.2085, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.36572700296735905, |
|
"grad_norm": 8.413400650024414, |
|
"learning_rate": 2.1141475352336345e-06, |
|
"loss": 1.2139, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.3664688427299703, |
|
"grad_norm": 8.649598121643066, |
|
"learning_rate": 2.1109564751325297e-06, |
|
"loss": 1.5049, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.3672106824925816, |
|
"grad_norm": 10.267006874084473, |
|
"learning_rate": 2.107762096621479e-06, |
|
"loss": 1.4108, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.36795252225519287, |
|
"grad_norm": 8.94491195678711, |
|
"learning_rate": 2.104564417050749e-06, |
|
"loss": 1.3822, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.36869436201780414, |
|
"grad_norm": 7.626391887664795, |
|
"learning_rate": 2.101363453788534e-06, |
|
"loss": 1.4081, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.3694362017804154, |
|
"grad_norm": 9.207382202148438, |
|
"learning_rate": 2.0981592242208664e-06, |
|
"loss": 1.3541, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.3701780415430267, |
|
"grad_norm": 7.966575622558594, |
|
"learning_rate": 2.094951745751518e-06, |
|
"loss": 1.5405, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.37091988130563797, |
|
"grad_norm": 8.80086612701416, |
|
"learning_rate": 2.0917410358019074e-06, |
|
"loss": 1.4639, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.37091988130563797, |
|
"eval_loss": 1.3480095863342285, |
|
"eval_runtime": 23.4543, |
|
"eval_samples_per_second": 19.016, |
|
"eval_steps_per_second": 9.508, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.37166172106824924, |
|
"grad_norm": 8.063216209411621, |
|
"learning_rate": 2.0885271118110046e-06, |
|
"loss": 1.3554, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.3724035608308605, |
|
"grad_norm": 8.728006362915039, |
|
"learning_rate": 2.0853099912352377e-06, |
|
"loss": 1.2087, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.3731454005934718, |
|
"grad_norm": 9.18012523651123, |
|
"learning_rate": 2.0820896915483957e-06, |
|
"loss": 1.3693, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.37388724035608306, |
|
"grad_norm": 7.697686672210693, |
|
"learning_rate": 2.0788662302415355e-06, |
|
"loss": 1.3692, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.37462908011869434, |
|
"grad_norm": 7.777410984039307, |
|
"learning_rate": 2.075639624822886e-06, |
|
"loss": 1.4546, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.37537091988130566, |
|
"grad_norm": 8.502872467041016, |
|
"learning_rate": 2.072409892817755e-06, |
|
"loss": 1.3695, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.37611275964391694, |
|
"grad_norm": 8.375325202941895, |
|
"learning_rate": 2.0691770517684303e-06, |
|
"loss": 1.3583, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.3768545994065282, |
|
"grad_norm": 10.402475357055664, |
|
"learning_rate": 2.0659411192340875e-06, |
|
"loss": 1.4421, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.3775964391691395, |
|
"grad_norm": 8.315070152282715, |
|
"learning_rate": 2.0627021127906936e-06, |
|
"loss": 1.3451, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.37833827893175076, |
|
"grad_norm": 8.026792526245117, |
|
"learning_rate": 2.05946005003091e-06, |
|
"loss": 1.2854, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.37908011869436203, |
|
"grad_norm": 8.60229778289795, |
|
"learning_rate": 2.056214948564002e-06, |
|
"loss": 1.3984, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.3798219584569733, |
|
"grad_norm": 8.691934585571289, |
|
"learning_rate": 2.0529668260157356e-06, |
|
"loss": 1.4777, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.3805637982195846, |
|
"grad_norm": 8.551725387573242, |
|
"learning_rate": 2.049715700028288e-06, |
|
"loss": 1.2376, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.38130563798219586, |
|
"grad_norm": 7.708804130554199, |
|
"learning_rate": 2.04646158826015e-06, |
|
"loss": 1.253, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.38204747774480713, |
|
"grad_norm": 9.0563325881958, |
|
"learning_rate": 2.043204508386028e-06, |
|
"loss": 1.3143, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.3827893175074184, |
|
"grad_norm": 9.717677116394043, |
|
"learning_rate": 2.0399444780967514e-06, |
|
"loss": 1.389, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.3835311572700297, |
|
"grad_norm": 10.435174942016602, |
|
"learning_rate": 2.036681515099173e-06, |
|
"loss": 1.3088, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.38427299703264095, |
|
"grad_norm": 8.454843521118164, |
|
"learning_rate": 2.0334156371160754e-06, |
|
"loss": 1.3449, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.3850148367952522, |
|
"grad_norm": 8.752850532531738, |
|
"learning_rate": 2.030146861886075e-06, |
|
"loss": 1.3281, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.3857566765578635, |
|
"grad_norm": 7.73056173324585, |
|
"learning_rate": 2.0268752071635235e-06, |
|
"loss": 1.4503, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.38649851632047477, |
|
"grad_norm": 8.349225044250488, |
|
"learning_rate": 2.0236006907184124e-06, |
|
"loss": 1.3468, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.38724035608308605, |
|
"grad_norm": 9.541553497314453, |
|
"learning_rate": 2.0203233303362773e-06, |
|
"loss": 1.4216, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.3879821958456973, |
|
"grad_norm": 7.54893159866333, |
|
"learning_rate": 2.0170431438181e-06, |
|
"loss": 1.4398, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.3887240356083086, |
|
"grad_norm": 8.763372421264648, |
|
"learning_rate": 2.0137601489802127e-06, |
|
"loss": 1.5001, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.38946587537091987, |
|
"grad_norm": 6.774653434753418, |
|
"learning_rate": 2.010474363654201e-06, |
|
"loss": 1.2526, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.39020771513353114, |
|
"grad_norm": 7.963438510894775, |
|
"learning_rate": 2.0071858056868074e-06, |
|
"loss": 1.2569, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.3909495548961424, |
|
"grad_norm": 10.730804443359375, |
|
"learning_rate": 2.003894492939834e-06, |
|
"loss": 1.3766, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.3916913946587537, |
|
"grad_norm": 8.266863822937012, |
|
"learning_rate": 2.0006004432900444e-06, |
|
"loss": 1.4004, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.39243323442136496, |
|
"grad_norm": 8.219123840332031, |
|
"learning_rate": 1.997303674629069e-06, |
|
"loss": 1.3371, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.39317507418397624, |
|
"grad_norm": 7.95269250869751, |
|
"learning_rate": 1.9940042048633056e-06, |
|
"loss": 1.4416, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.3939169139465875, |
|
"grad_norm": 7.302926063537598, |
|
"learning_rate": 1.9907020519138247e-06, |
|
"loss": 1.3352, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.39465875370919884, |
|
"grad_norm": 8.411139488220215, |
|
"learning_rate": 1.987397233716267e-06, |
|
"loss": 1.29, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.3954005934718101, |
|
"grad_norm": 7.670512676239014, |
|
"learning_rate": 1.9840897682207537e-06, |
|
"loss": 1.3194, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.3961424332344214, |
|
"grad_norm": 11.99163818359375, |
|
"learning_rate": 1.9807796733917815e-06, |
|
"loss": 1.4642, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.39688427299703266, |
|
"grad_norm": 8.448274612426758, |
|
"learning_rate": 1.9774669672081307e-06, |
|
"loss": 1.277, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.39762611275964393, |
|
"grad_norm": 8.752152442932129, |
|
"learning_rate": 1.9741516676627632e-06, |
|
"loss": 1.3266, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.3983679525222552, |
|
"grad_norm": 8.631105422973633, |
|
"learning_rate": 1.970833792762729e-06, |
|
"loss": 1.4025, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.3991097922848665, |
|
"grad_norm": 8.437644004821777, |
|
"learning_rate": 1.967513360529063e-06, |
|
"loss": 1.4304, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.39985163204747776, |
|
"grad_norm": 8.341066360473633, |
|
"learning_rate": 1.964190388996694e-06, |
|
"loss": 1.3816, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.40059347181008903, |
|
"grad_norm": 7.804527282714844, |
|
"learning_rate": 1.9608648962143394e-06, |
|
"loss": 1.4099, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.4013353115727003, |
|
"grad_norm": 8.778786659240723, |
|
"learning_rate": 1.957536900244414e-06, |
|
"loss": 1.2651, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.4020771513353116, |
|
"grad_norm": 8.054415702819824, |
|
"learning_rate": 1.954206419162925e-06, |
|
"loss": 1.4155, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.40281899109792285, |
|
"grad_norm": 7.543354511260986, |
|
"learning_rate": 1.950873471059382e-06, |
|
"loss": 1.412, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.4035608308605341, |
|
"grad_norm": 9.169261932373047, |
|
"learning_rate": 1.9475380740366903e-06, |
|
"loss": 1.4265, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.4043026706231454, |
|
"grad_norm": 8.047539710998535, |
|
"learning_rate": 1.944200246211058e-06, |
|
"loss": 1.4605, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.4050445103857567, |
|
"grad_norm": 9.375300407409668, |
|
"learning_rate": 1.940860005711897e-06, |
|
"loss": 1.4745, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.40578635014836795, |
|
"grad_norm": 8.199248313903809, |
|
"learning_rate": 1.9375173706817215e-06, |
|
"loss": 1.3614, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.4065281899109792, |
|
"grad_norm": 9.075878143310547, |
|
"learning_rate": 1.9341723592760542e-06, |
|
"loss": 1.4263, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.4072700296735905, |
|
"grad_norm": 7.4491472244262695, |
|
"learning_rate": 1.930824989663323e-06, |
|
"loss": 1.251, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.40801186943620177, |
|
"grad_norm": 8.764143943786621, |
|
"learning_rate": 1.9274752800247654e-06, |
|
"loss": 1.4405, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.40801186943620177, |
|
"eval_loss": 1.340783953666687, |
|
"eval_runtime": 23.4462, |
|
"eval_samples_per_second": 19.022, |
|
"eval_steps_per_second": 9.511, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.40875370919881304, |
|
"grad_norm": 8.902606964111328, |
|
"learning_rate": 1.9241232485543284e-06, |
|
"loss": 1.3789, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.4094955489614243, |
|
"grad_norm": 7.769072532653809, |
|
"learning_rate": 1.9207689134585698e-06, |
|
"loss": 1.5089, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.4102373887240356, |
|
"grad_norm": 9.30247974395752, |
|
"learning_rate": 1.91741229295656e-06, |
|
"loss": 1.2942, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.41097922848664686, |
|
"grad_norm": 9.735326766967773, |
|
"learning_rate": 1.914053405279783e-06, |
|
"loss": 1.1792, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.41172106824925814, |
|
"grad_norm": 8.925307273864746, |
|
"learning_rate": 1.9106922686720356e-06, |
|
"loss": 1.4032, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.4124629080118694, |
|
"grad_norm": 8.152726173400879, |
|
"learning_rate": 1.9073289013893313e-06, |
|
"loss": 1.3349, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.4132047477744807, |
|
"grad_norm": 8.074481964111328, |
|
"learning_rate": 1.9039633216997978e-06, |
|
"loss": 1.2687, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.413946587537092, |
|
"grad_norm": 7.500307559967041, |
|
"learning_rate": 1.900595547883581e-06, |
|
"loss": 1.3318, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.4146884272997033, |
|
"grad_norm": 8.518424987792969, |
|
"learning_rate": 1.8972255982327432e-06, |
|
"loss": 1.4255, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.41543026706231456, |
|
"grad_norm": 9.059218406677246, |
|
"learning_rate": 1.8938534910511652e-06, |
|
"loss": 1.3451, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.41617210682492584, |
|
"grad_norm": 8.822978973388672, |
|
"learning_rate": 1.8904792446544467e-06, |
|
"loss": 1.623, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.4169139465875371, |
|
"grad_norm": 8.972715377807617, |
|
"learning_rate": 1.8871028773698058e-06, |
|
"loss": 1.447, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.4176557863501484, |
|
"grad_norm": 7.2900519371032715, |
|
"learning_rate": 1.8837244075359804e-06, |
|
"loss": 1.3426, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.41839762611275966, |
|
"grad_norm": 8.24610710144043, |
|
"learning_rate": 1.880343853503129e-06, |
|
"loss": 1.3507, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.41913946587537093, |
|
"grad_norm": 10.137441635131836, |
|
"learning_rate": 1.8769612336327294e-06, |
|
"loss": 1.4335, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.4198813056379822, |
|
"grad_norm": 10.343937873840332, |
|
"learning_rate": 1.8735765662974818e-06, |
|
"loss": 1.3133, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.4206231454005935, |
|
"grad_norm": 8.10049057006836, |
|
"learning_rate": 1.8701898698812047e-06, |
|
"loss": 1.31, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.42136498516320475, |
|
"grad_norm": 8.974928855895996, |
|
"learning_rate": 1.86680116277874e-06, |
|
"loss": 1.3522, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.422106824925816, |
|
"grad_norm": 7.443127632141113, |
|
"learning_rate": 1.8634104633958483e-06, |
|
"loss": 1.2373, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.4228486646884273, |
|
"grad_norm": 8.140283584594727, |
|
"learning_rate": 1.8600177901491135e-06, |
|
"loss": 1.2969, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.4235905044510386, |
|
"grad_norm": 8.618755340576172, |
|
"learning_rate": 1.8566231614658389e-06, |
|
"loss": 1.185, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.42433234421364985, |
|
"grad_norm": 8.221843719482422, |
|
"learning_rate": 1.8532265957839497e-06, |
|
"loss": 1.3558, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.4250741839762611, |
|
"grad_norm": 12.334073066711426, |
|
"learning_rate": 1.8498281115518912e-06, |
|
"loss": 1.3281, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.4258160237388724, |
|
"grad_norm": 7.851191997528076, |
|
"learning_rate": 1.8464277272285305e-06, |
|
"loss": 1.2885, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.42655786350148367, |
|
"grad_norm": 8.391671180725098, |
|
"learning_rate": 1.843025461283053e-06, |
|
"loss": 1.3001, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.42729970326409494, |
|
"grad_norm": 9.907540321350098, |
|
"learning_rate": 1.839621332194866e-06, |
|
"loss": 1.4639, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.4280415430267062, |
|
"grad_norm": 8.890905380249023, |
|
"learning_rate": 1.8362153584534963e-06, |
|
"loss": 1.3371, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.4287833827893175, |
|
"grad_norm": 8.191327095031738, |
|
"learning_rate": 1.8328075585584888e-06, |
|
"loss": 1.4174, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.42952522255192876, |
|
"grad_norm": 7.765829563140869, |
|
"learning_rate": 1.829397951019308e-06, |
|
"loss": 1.3488, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.43026706231454004, |
|
"grad_norm": 8.07245922088623, |
|
"learning_rate": 1.8259865543552362e-06, |
|
"loss": 1.1749, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.4310089020771513, |
|
"grad_norm": 7.672754287719727, |
|
"learning_rate": 1.8225733870952739e-06, |
|
"loss": 1.3164, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.4317507418397626, |
|
"grad_norm": 8.181532859802246, |
|
"learning_rate": 1.819158467778038e-06, |
|
"loss": 1.387, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.43249258160237386, |
|
"grad_norm": 8.17938232421875, |
|
"learning_rate": 1.8157418149516617e-06, |
|
"loss": 1.2231, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.4332344213649852, |
|
"grad_norm": 7.951348304748535, |
|
"learning_rate": 1.8123234471736945e-06, |
|
"loss": 1.4411, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.43397626112759646, |
|
"grad_norm": 7.451209545135498, |
|
"learning_rate": 1.8089033830110003e-06, |
|
"loss": 1.3168, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.43471810089020774, |
|
"grad_norm": 8.86732292175293, |
|
"learning_rate": 1.805481641039656e-06, |
|
"loss": 1.4272, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.435459940652819, |
|
"grad_norm": 8.028582572937012, |
|
"learning_rate": 1.8020582398448532e-06, |
|
"loss": 1.2012, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.4362017804154303, |
|
"grad_norm": 7.9948506355285645, |
|
"learning_rate": 1.7986331980207942e-06, |
|
"loss": 1.377, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.43694362017804156, |
|
"grad_norm": 8.945382118225098, |
|
"learning_rate": 1.7952065341705928e-06, |
|
"loss": 1.285, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.43768545994065283, |
|
"grad_norm": 8.703865051269531, |
|
"learning_rate": 1.7917782669061727e-06, |
|
"loss": 1.4814, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.4384272997032641, |
|
"grad_norm": 8.220625877380371, |
|
"learning_rate": 1.7883484148481669e-06, |
|
"loss": 1.3047, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.4391691394658754, |
|
"grad_norm": 8.814275741577148, |
|
"learning_rate": 1.7849169966258158e-06, |
|
"loss": 1.2686, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.43991097922848665, |
|
"grad_norm": 8.656988143920898, |
|
"learning_rate": 1.7814840308768672e-06, |
|
"loss": 1.3689, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.4406528189910979, |
|
"grad_norm": 7.942451000213623, |
|
"learning_rate": 1.778049536247473e-06, |
|
"loss": 1.4089, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.4413946587537092, |
|
"grad_norm": 8.073698997497559, |
|
"learning_rate": 1.7746135313920907e-06, |
|
"loss": 1.3592, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.4421364985163205, |
|
"grad_norm": 9.229683876037598, |
|
"learning_rate": 1.7711760349733793e-06, |
|
"loss": 1.2828, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.44287833827893175, |
|
"grad_norm": 9.150603294372559, |
|
"learning_rate": 1.7677370656620997e-06, |
|
"loss": 1.2879, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.443620178041543, |
|
"grad_norm": 8.25768756866455, |
|
"learning_rate": 1.7642966421370136e-06, |
|
"loss": 1.4304, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.4443620178041543, |
|
"grad_norm": 9.358892440795898, |
|
"learning_rate": 1.7608547830847795e-06, |
|
"loss": 1.4317, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.44510385756676557, |
|
"grad_norm": 8.074627876281738, |
|
"learning_rate": 1.757411507199855e-06, |
|
"loss": 1.2926, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.44510385756676557, |
|
"eval_loss": 1.3348528146743774, |
|
"eval_runtime": 23.4773, |
|
"eval_samples_per_second": 18.997, |
|
"eval_steps_per_second": 9.499, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.44584569732937684, |
|
"grad_norm": 7.637718677520752, |
|
"learning_rate": 1.7539668331843914e-06, |
|
"loss": 1.3149, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.4465875370919881, |
|
"grad_norm": 10.58519458770752, |
|
"learning_rate": 1.7505207797481356e-06, |
|
"loss": 1.3607, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.4473293768545994, |
|
"grad_norm": 7.9096174240112305, |
|
"learning_rate": 1.7470733656083253e-06, |
|
"loss": 1.2627, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.44807121661721067, |
|
"grad_norm": 7.344761848449707, |
|
"learning_rate": 1.7436246094895896e-06, |
|
"loss": 1.4465, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.44881305637982194, |
|
"grad_norm": 8.851872444152832, |
|
"learning_rate": 1.740174530123847e-06, |
|
"loss": 1.3832, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.4495548961424332, |
|
"grad_norm": 8.735071182250977, |
|
"learning_rate": 1.7367231462502024e-06, |
|
"loss": 1.4773, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.4502967359050445, |
|
"grad_norm": 8.918268203735352, |
|
"learning_rate": 1.7332704766148466e-06, |
|
"loss": 1.3897, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.45103857566765576, |
|
"grad_norm": 8.48647689819336, |
|
"learning_rate": 1.729816539970954e-06, |
|
"loss": 1.2423, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.45178041543026703, |
|
"grad_norm": 6.995253562927246, |
|
"learning_rate": 1.72636135507858e-06, |
|
"loss": 1.2992, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.45252225519287836, |
|
"grad_norm": 7.405545234680176, |
|
"learning_rate": 1.7229049407045613e-06, |
|
"loss": 1.3971, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.45326409495548964, |
|
"grad_norm": 8.452637672424316, |
|
"learning_rate": 1.7194473156224113e-06, |
|
"loss": 1.3156, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.4540059347181009, |
|
"grad_norm": 7.952899932861328, |
|
"learning_rate": 1.7159884986122197e-06, |
|
"loss": 1.3817, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.4547477744807122, |
|
"grad_norm": 8.648924827575684, |
|
"learning_rate": 1.7125285084605509e-06, |
|
"loss": 1.3269, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.45548961424332346, |
|
"grad_norm": 7.878424167633057, |
|
"learning_rate": 1.7090673639603399e-06, |
|
"loss": 1.3511, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.45623145400593473, |
|
"grad_norm": 10.038208961486816, |
|
"learning_rate": 1.7056050839107924e-06, |
|
"loss": 1.3547, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.456973293768546, |
|
"grad_norm": 11.209604263305664, |
|
"learning_rate": 1.7021416871172816e-06, |
|
"loss": 1.38, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.4577151335311573, |
|
"grad_norm": 8.880349159240723, |
|
"learning_rate": 1.6986771923912466e-06, |
|
"loss": 1.3767, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.45845697329376855, |
|
"grad_norm": 7.9594221115112305, |
|
"learning_rate": 1.6952116185500891e-06, |
|
"loss": 1.3401, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.45919881305637983, |
|
"grad_norm": 9.231648445129395, |
|
"learning_rate": 1.6917449844170733e-06, |
|
"loss": 1.3873, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.4599406528189911, |
|
"grad_norm": 8.900077819824219, |
|
"learning_rate": 1.6882773088212214e-06, |
|
"loss": 1.4, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.4606824925816024, |
|
"grad_norm": 9.752120018005371, |
|
"learning_rate": 1.6848086105972123e-06, |
|
"loss": 1.3674, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.46142433234421365, |
|
"grad_norm": 9.113099098205566, |
|
"learning_rate": 1.6813389085852794e-06, |
|
"loss": 1.454, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.4621661721068249, |
|
"grad_norm": 8.19013500213623, |
|
"learning_rate": 1.677868221631109e-06, |
|
"loss": 1.3381, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.4629080118694362, |
|
"grad_norm": 7.306256294250488, |
|
"learning_rate": 1.674396568585736e-06, |
|
"loss": 1.3912, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.46364985163204747, |
|
"grad_norm": 8.432893753051758, |
|
"learning_rate": 1.6709239683054433e-06, |
|
"loss": 1.2639, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.46439169139465875, |
|
"grad_norm": 9.081368446350098, |
|
"learning_rate": 1.6674504396516583e-06, |
|
"loss": 1.3728, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.46513353115727, |
|
"grad_norm": 8.188736915588379, |
|
"learning_rate": 1.663976001490851e-06, |
|
"loss": 1.3573, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.4658753709198813, |
|
"grad_norm": 8.223960876464844, |
|
"learning_rate": 1.6605006726944314e-06, |
|
"loss": 1.3602, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.46661721068249257, |
|
"grad_norm": 7.188130855560303, |
|
"learning_rate": 1.6570244721386472e-06, |
|
"loss": 1.3091, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.46735905044510384, |
|
"grad_norm": 8.153417587280273, |
|
"learning_rate": 1.6535474187044809e-06, |
|
"loss": 1.3743, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.4681008902077151, |
|
"grad_norm": 7.9417290687561035, |
|
"learning_rate": 1.650069531277547e-06, |
|
"loss": 1.2242, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.4688427299703264, |
|
"grad_norm": 10.858664512634277, |
|
"learning_rate": 1.6465908287479907e-06, |
|
"loss": 1.329, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.46958456973293766, |
|
"grad_norm": 11.415666580200195, |
|
"learning_rate": 1.6431113300103836e-06, |
|
"loss": 1.3142, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.47032640949554894, |
|
"grad_norm": 9.50818920135498, |
|
"learning_rate": 1.6396310539636222e-06, |
|
"loss": 1.335, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.4710682492581602, |
|
"grad_norm": 8.820195198059082, |
|
"learning_rate": 1.6361500195108256e-06, |
|
"loss": 1.3818, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.47181008902077154, |
|
"grad_norm": 8.231925964355469, |
|
"learning_rate": 1.6326682455592306e-06, |
|
"loss": 1.5702, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.4725519287833828, |
|
"grad_norm": 8.553587913513184, |
|
"learning_rate": 1.6291857510200926e-06, |
|
"loss": 1.3378, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.4732937685459941, |
|
"grad_norm": 8.568156242370605, |
|
"learning_rate": 1.6257025548085788e-06, |
|
"loss": 1.3023, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.47403560830860536, |
|
"grad_norm": 8.378904342651367, |
|
"learning_rate": 1.6222186758436698e-06, |
|
"loss": 1.4306, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.47477744807121663, |
|
"grad_norm": 8.451229095458984, |
|
"learning_rate": 1.6187341330480523e-06, |
|
"loss": 1.166, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.4755192878338279, |
|
"grad_norm": 8.599996566772461, |
|
"learning_rate": 1.6152489453480202e-06, |
|
"loss": 1.365, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.4762611275964392, |
|
"grad_norm": 8.459872245788574, |
|
"learning_rate": 1.6117631316733698e-06, |
|
"loss": 1.278, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.47700296735905046, |
|
"grad_norm": 9.12617301940918, |
|
"learning_rate": 1.6082767109572964e-06, |
|
"loss": 1.2172, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.47774480712166173, |
|
"grad_norm": 7.814152717590332, |
|
"learning_rate": 1.6047897021362942e-06, |
|
"loss": 1.2797, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.478486646884273, |
|
"grad_norm": 9.098596572875977, |
|
"learning_rate": 1.60130212415005e-06, |
|
"loss": 1.3154, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.4792284866468843, |
|
"grad_norm": 9.89655876159668, |
|
"learning_rate": 1.597813995941343e-06, |
|
"loss": 1.4306, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.47997032640949555, |
|
"grad_norm": 8.791460037231445, |
|
"learning_rate": 1.5943253364559412e-06, |
|
"loss": 1.2269, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.4807121661721068, |
|
"grad_norm": 8.997727394104004, |
|
"learning_rate": 1.5908361646424973e-06, |
|
"loss": 1.4215, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.4814540059347181, |
|
"grad_norm": 7.6386284828186035, |
|
"learning_rate": 1.5873464994524473e-06, |
|
"loss": 1.2984, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.4821958456973294, |
|
"grad_norm": 9.248114585876465, |
|
"learning_rate": 1.5838563598399068e-06, |
|
"loss": 1.3452, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.4821958456973294, |
|
"eval_loss": 1.326774001121521, |
|
"eval_runtime": 23.5945, |
|
"eval_samples_per_second": 18.903, |
|
"eval_steps_per_second": 9.451, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.48293768545994065, |
|
"grad_norm": 7.455526828765869, |
|
"learning_rate": 1.580365764761568e-06, |
|
"loss": 1.2932, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.4836795252225519, |
|
"grad_norm": 9.042367935180664, |
|
"learning_rate": 1.5768747331765977e-06, |
|
"loss": 1.358, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.4844213649851632, |
|
"grad_norm": 7.080817222595215, |
|
"learning_rate": 1.5733832840465328e-06, |
|
"loss": 1.2915, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.48516320474777447, |
|
"grad_norm": 11.231888771057129, |
|
"learning_rate": 1.5698914363351784e-06, |
|
"loss": 1.3181, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.48590504451038574, |
|
"grad_norm": 7.5210347175598145, |
|
"learning_rate": 1.5663992090085044e-06, |
|
"loss": 1.3802, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.486646884272997, |
|
"grad_norm": 8.118837356567383, |
|
"learning_rate": 1.5629066210345432e-06, |
|
"loss": 1.4856, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.4873887240356083, |
|
"grad_norm": 7.752665996551514, |
|
"learning_rate": 1.559413691383285e-06, |
|
"loss": 1.3588, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.48813056379821956, |
|
"grad_norm": 8.421116828918457, |
|
"learning_rate": 1.5559204390265764e-06, |
|
"loss": 1.4454, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.48887240356083084, |
|
"grad_norm": 8.583824157714844, |
|
"learning_rate": 1.5524268829380168e-06, |
|
"loss": 1.4392, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.4896142433234421, |
|
"grad_norm": 8.850062370300293, |
|
"learning_rate": 1.5489330420928555e-06, |
|
"loss": 1.3796, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.4903560830860534, |
|
"grad_norm": 7.187986850738525, |
|
"learning_rate": 1.5454389354678882e-06, |
|
"loss": 1.1743, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.4910979228486647, |
|
"grad_norm": 17.749059677124023, |
|
"learning_rate": 1.541944582041353e-06, |
|
"loss": 1.3122, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.491839762611276, |
|
"grad_norm": 9.939379692077637, |
|
"learning_rate": 1.5384500007928312e-06, |
|
"loss": 1.1216, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.49258160237388726, |
|
"grad_norm": 9.638907432556152, |
|
"learning_rate": 1.53495521070314e-06, |
|
"loss": 1.2621, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.49332344213649854, |
|
"grad_norm": 9.247072219848633, |
|
"learning_rate": 1.5314602307542297e-06, |
|
"loss": 1.255, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.4940652818991098, |
|
"grad_norm": 10.329320907592773, |
|
"learning_rate": 1.5279650799290838e-06, |
|
"loss": 1.3395, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.4948071216617211, |
|
"grad_norm": 8.686713218688965, |
|
"learning_rate": 1.5244697772116131e-06, |
|
"loss": 1.1988, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.49554896142433236, |
|
"grad_norm": 9.043136596679688, |
|
"learning_rate": 1.5209743415865535e-06, |
|
"loss": 1.3861, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.49629080118694363, |
|
"grad_norm": 9.186018943786621, |
|
"learning_rate": 1.5174787920393627e-06, |
|
"loss": 1.2588, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.4970326409495549, |
|
"grad_norm": 9.252155303955078, |
|
"learning_rate": 1.5139831475561171e-06, |
|
"loss": 1.554, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.4977744807121662, |
|
"grad_norm": 9.65112018585205, |
|
"learning_rate": 1.510487427123409e-06, |
|
"loss": 1.3435, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.49851632047477745, |
|
"grad_norm": 22.156383514404297, |
|
"learning_rate": 1.5069916497282432e-06, |
|
"loss": 1.178, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.4992581602373887, |
|
"grad_norm": 8.21938419342041, |
|
"learning_rate": 1.5034958343579333e-06, |
|
"loss": 1.3944, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 7.787656307220459, |
|
"learning_rate": 1.5e-06, |
|
"loss": 1.4009, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.5007418397626113, |
|
"grad_norm": 8.978195190429688, |
|
"learning_rate": 1.4965041656420666e-06, |
|
"loss": 1.241, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.5014836795252225, |
|
"grad_norm": 9.333284378051758, |
|
"learning_rate": 1.4930083502717571e-06, |
|
"loss": 1.5115, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.5022255192878339, |
|
"grad_norm": 9.057726860046387, |
|
"learning_rate": 1.489512572876591e-06, |
|
"loss": 1.2611, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.5029673590504451, |
|
"grad_norm": 9.008346557617188, |
|
"learning_rate": 1.4860168524438831e-06, |
|
"loss": 1.2435, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.5037091988130564, |
|
"grad_norm": 7.9738640785217285, |
|
"learning_rate": 1.4825212079606374e-06, |
|
"loss": 1.2969, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.5044510385756676, |
|
"grad_norm": 13.787586212158203, |
|
"learning_rate": 1.4790256584134468e-06, |
|
"loss": 1.4168, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.505192878338279, |
|
"grad_norm": 8.508440971374512, |
|
"learning_rate": 1.4755302227883868e-06, |
|
"loss": 1.2758, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.5059347181008902, |
|
"grad_norm": 9.42790699005127, |
|
"learning_rate": 1.4720349200709164e-06, |
|
"loss": 1.2748, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.5066765578635015, |
|
"grad_norm": 9.03829288482666, |
|
"learning_rate": 1.4685397692457704e-06, |
|
"loss": 1.3407, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.5074183976261127, |
|
"grad_norm": 9.19029712677002, |
|
"learning_rate": 1.4650447892968606e-06, |
|
"loss": 1.52, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.5081602373887241, |
|
"grad_norm": 8.805255889892578, |
|
"learning_rate": 1.4615499992071685e-06, |
|
"loss": 1.3314, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.5089020771513353, |
|
"grad_norm": 8.248116493225098, |
|
"learning_rate": 1.4580554179586471e-06, |
|
"loss": 1.4094, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.5096439169139466, |
|
"grad_norm": 8.346354484558105, |
|
"learning_rate": 1.4545610645321123e-06, |
|
"loss": 1.4973, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.5103857566765578, |
|
"grad_norm": 8.899476051330566, |
|
"learning_rate": 1.451066957907145e-06, |
|
"loss": 1.3733, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.5111275964391692, |
|
"grad_norm": 7.146321773529053, |
|
"learning_rate": 1.4475731170619835e-06, |
|
"loss": 1.3282, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.5118694362017804, |
|
"grad_norm": 9.217137336730957, |
|
"learning_rate": 1.444079560973424e-06, |
|
"loss": 1.5009, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.5126112759643917, |
|
"grad_norm": 8.994102478027344, |
|
"learning_rate": 1.4405863086167155e-06, |
|
"loss": 1.3771, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.5133531157270029, |
|
"grad_norm": 7.989219665527344, |
|
"learning_rate": 1.4370933789654571e-06, |
|
"loss": 1.385, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.5140949554896143, |
|
"grad_norm": 8.614723205566406, |
|
"learning_rate": 1.4336007909914957e-06, |
|
"loss": 1.2987, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.5148367952522255, |
|
"grad_norm": 7.992114543914795, |
|
"learning_rate": 1.430108563664822e-06, |
|
"loss": 1.1859, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.5155786350148368, |
|
"grad_norm": 8.345887184143066, |
|
"learning_rate": 1.4266167159534675e-06, |
|
"loss": 1.4507, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.516320474777448, |
|
"grad_norm": 8.506096839904785, |
|
"learning_rate": 1.4231252668234026e-06, |
|
"loss": 1.2592, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.5170623145400594, |
|
"grad_norm": 7.255486011505127, |
|
"learning_rate": 1.4196342352384323e-06, |
|
"loss": 1.2013, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.5178041543026706, |
|
"grad_norm": 7.925352573394775, |
|
"learning_rate": 1.4161436401600939e-06, |
|
"loss": 1.3405, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.5185459940652819, |
|
"grad_norm": 7.987504482269287, |
|
"learning_rate": 1.412653500547553e-06, |
|
"loss": 1.3114, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.5192878338278932, |
|
"grad_norm": 9.995888710021973, |
|
"learning_rate": 1.4091638353575025e-06, |
|
"loss": 1.3076, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5192878338278932, |
|
"eval_loss": 1.3201655149459839, |
|
"eval_runtime": 23.6126, |
|
"eval_samples_per_second": 18.888, |
|
"eval_steps_per_second": 9.444, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5200296735905044, |
|
"grad_norm": 6.1546831130981445, |
|
"learning_rate": 1.405674663544059e-06, |
|
"loss": 1.4727, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.5207715133531158, |
|
"grad_norm": 8.864068984985352, |
|
"learning_rate": 1.4021860040586568e-06, |
|
"loss": 1.2877, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.521513353115727, |
|
"grad_norm": 9.57347297668457, |
|
"learning_rate": 1.3986978758499504e-06, |
|
"loss": 1.2283, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.5222551928783383, |
|
"grad_norm": 8.824577331542969, |
|
"learning_rate": 1.395210297863706e-06, |
|
"loss": 1.4945, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.5229970326409495, |
|
"grad_norm": 10.610620498657227, |
|
"learning_rate": 1.3917232890427038e-06, |
|
"loss": 1.4092, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.5237388724035609, |
|
"grad_norm": 7.3669514656066895, |
|
"learning_rate": 1.3882368683266303e-06, |
|
"loss": 1.1762, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.5244807121661721, |
|
"grad_norm": 8.22118091583252, |
|
"learning_rate": 1.38475105465198e-06, |
|
"loss": 1.3694, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.5252225519287834, |
|
"grad_norm": 8.95012378692627, |
|
"learning_rate": 1.3812658669519474e-06, |
|
"loss": 1.3601, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.5259643916913946, |
|
"grad_norm": 8.938467979431152, |
|
"learning_rate": 1.3777813241563305e-06, |
|
"loss": 1.4346, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.526706231454006, |
|
"grad_norm": 8.244651794433594, |
|
"learning_rate": 1.3742974451914208e-06, |
|
"loss": 1.3497, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.5274480712166172, |
|
"grad_norm": 9.305986404418945, |
|
"learning_rate": 1.370814248979908e-06, |
|
"loss": 1.5719, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.5281899109792285, |
|
"grad_norm": 7.710730075836182, |
|
"learning_rate": 1.3673317544407693e-06, |
|
"loss": 1.191, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.5289317507418397, |
|
"grad_norm": 9.58619499206543, |
|
"learning_rate": 1.363849980489175e-06, |
|
"loss": 1.3419, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.5296735905044511, |
|
"grad_norm": 8.806848526000977, |
|
"learning_rate": 1.3603689460363779e-06, |
|
"loss": 1.3253, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.5304154302670623, |
|
"grad_norm": 8.474712371826172, |
|
"learning_rate": 1.3568886699896171e-06, |
|
"loss": 1.2181, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.5311572700296736, |
|
"grad_norm": 8.78541374206543, |
|
"learning_rate": 1.3534091712520096e-06, |
|
"loss": 1.3726, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.5318991097922848, |
|
"grad_norm": 11.253677368164062, |
|
"learning_rate": 1.3499304687224536e-06, |
|
"loss": 1.2884, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.5326409495548962, |
|
"grad_norm": 8.340043067932129, |
|
"learning_rate": 1.3464525812955194e-06, |
|
"loss": 1.3605, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.5333827893175074, |
|
"grad_norm": 8.733418464660645, |
|
"learning_rate": 1.3429755278613535e-06, |
|
"loss": 1.2541, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.5341246290801187, |
|
"grad_norm": 9.979363441467285, |
|
"learning_rate": 1.3394993273055689e-06, |
|
"loss": 1.3203, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.5348664688427299, |
|
"grad_norm": 8.473489761352539, |
|
"learning_rate": 1.3360239985091496e-06, |
|
"loss": 1.3836, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.5356083086053413, |
|
"grad_norm": 8.469969749450684, |
|
"learning_rate": 1.3325495603483418e-06, |
|
"loss": 1.3789, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.5363501483679525, |
|
"grad_norm": 7.77994966506958, |
|
"learning_rate": 1.3290760316945572e-06, |
|
"loss": 1.2116, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.5370919881305638, |
|
"grad_norm": 9.14150619506836, |
|
"learning_rate": 1.325603431414264e-06, |
|
"loss": 1.2778, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.537833827893175, |
|
"grad_norm": 8.883842468261719, |
|
"learning_rate": 1.3221317783688914e-06, |
|
"loss": 1.2829, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.5385756676557863, |
|
"grad_norm": 6.918141841888428, |
|
"learning_rate": 1.3186610914147208e-06, |
|
"loss": 1.2587, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.5393175074183977, |
|
"grad_norm": 8.339578628540039, |
|
"learning_rate": 1.3151913894027878e-06, |
|
"loss": 1.3557, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.5400593471810089, |
|
"grad_norm": 8.50107192993164, |
|
"learning_rate": 1.3117226911787791e-06, |
|
"loss": 1.2453, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.5408011869436202, |
|
"grad_norm": 9.355497360229492, |
|
"learning_rate": 1.3082550155829264e-06, |
|
"loss": 1.4713, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.5415430267062314, |
|
"grad_norm": 8.334994316101074, |
|
"learning_rate": 1.304788381449911e-06, |
|
"loss": 1.2284, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.5422848664688428, |
|
"grad_norm": 9.552740097045898, |
|
"learning_rate": 1.3013228076087534e-06, |
|
"loss": 1.3224, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.543026706231454, |
|
"grad_norm": 9.53915786743164, |
|
"learning_rate": 1.2978583128827187e-06, |
|
"loss": 1.3691, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.5437685459940653, |
|
"grad_norm": 9.11638355255127, |
|
"learning_rate": 1.2943949160892076e-06, |
|
"loss": 1.3347, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.5445103857566765, |
|
"grad_norm": 9.868489265441895, |
|
"learning_rate": 1.2909326360396604e-06, |
|
"loss": 1.5654, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.5452522255192879, |
|
"grad_norm": 8.725732803344727, |
|
"learning_rate": 1.287471491539449e-06, |
|
"loss": 1.3145, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.5459940652818991, |
|
"grad_norm": 8.547471046447754, |
|
"learning_rate": 1.2840115013877804e-06, |
|
"loss": 1.2752, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.5467359050445104, |
|
"grad_norm": 11.938176155090332, |
|
"learning_rate": 1.2805526843775888e-06, |
|
"loss": 1.3646, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.5474777448071216, |
|
"grad_norm": 11.184774398803711, |
|
"learning_rate": 1.2770950592954392e-06, |
|
"loss": 1.4144, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.548219584569733, |
|
"grad_norm": 8.136163711547852, |
|
"learning_rate": 1.27363864492142e-06, |
|
"loss": 1.2555, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.5489614243323442, |
|
"grad_norm": 8.048996925354004, |
|
"learning_rate": 1.2701834600290465e-06, |
|
"loss": 1.3139, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.5497032640949555, |
|
"grad_norm": 8.8002347946167, |
|
"learning_rate": 1.2667295233851534e-06, |
|
"loss": 1.3354, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.5504451038575667, |
|
"grad_norm": 8.829628944396973, |
|
"learning_rate": 1.263276853749798e-06, |
|
"loss": 1.519, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.5511869436201781, |
|
"grad_norm": 8.89567756652832, |
|
"learning_rate": 1.259825469876153e-06, |
|
"loss": 1.4514, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.5519287833827893, |
|
"grad_norm": 8.236814498901367, |
|
"learning_rate": 1.2563753905104107e-06, |
|
"loss": 1.2586, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.5526706231454006, |
|
"grad_norm": 9.010204315185547, |
|
"learning_rate": 1.252926634391675e-06, |
|
"loss": 1.1963, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.5534124629080118, |
|
"grad_norm": 8.456092834472656, |
|
"learning_rate": 1.2494792202518651e-06, |
|
"loss": 1.4698, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.5541543026706232, |
|
"grad_norm": 7.783117294311523, |
|
"learning_rate": 1.2460331668156087e-06, |
|
"loss": 1.2172, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.5548961424332344, |
|
"grad_norm": 8.84600830078125, |
|
"learning_rate": 1.2425884928001456e-06, |
|
"loss": 1.3524, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.5556379821958457, |
|
"grad_norm": 13.498913764953613, |
|
"learning_rate": 1.2391452169152206e-06, |
|
"loss": 1.4842, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.5563798219584569, |
|
"grad_norm": 7.4838433265686035, |
|
"learning_rate": 1.2357033578629871e-06, |
|
"loss": 1.2696, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.5563798219584569, |
|
"eval_loss": 1.31540048122406, |
|
"eval_runtime": 23.6078, |
|
"eval_samples_per_second": 18.892, |
|
"eval_steps_per_second": 9.446, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.5571216617210683, |
|
"grad_norm": 8.919118881225586, |
|
"learning_rate": 1.2322629343379003e-06, |
|
"loss": 1.205, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.5578635014836796, |
|
"grad_norm": 9.142733573913574, |
|
"learning_rate": 1.2288239650266212e-06, |
|
"loss": 1.1951, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.5586053412462908, |
|
"grad_norm": 8.228799819946289, |
|
"learning_rate": 1.2253864686079096e-06, |
|
"loss": 1.2712, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.5593471810089021, |
|
"grad_norm": 9.651594161987305, |
|
"learning_rate": 1.2219504637525272e-06, |
|
"loss": 1.3421, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.5600890207715133, |
|
"grad_norm": 8.157588005065918, |
|
"learning_rate": 1.2185159691231333e-06, |
|
"loss": 1.3639, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.5608308605341247, |
|
"grad_norm": 8.895820617675781, |
|
"learning_rate": 1.2150830033741845e-06, |
|
"loss": 1.4126, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.5615727002967359, |
|
"grad_norm": 8.002445220947266, |
|
"learning_rate": 1.2116515851518336e-06, |
|
"loss": 1.5194, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.5623145400593472, |
|
"grad_norm": 8.261061668395996, |
|
"learning_rate": 1.2082217330938278e-06, |
|
"loss": 1.341, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.5630563798219584, |
|
"grad_norm": 8.519086837768555, |
|
"learning_rate": 1.2047934658294077e-06, |
|
"loss": 1.317, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.5637982195845698, |
|
"grad_norm": 9.409530639648438, |
|
"learning_rate": 1.2013668019792059e-06, |
|
"loss": 1.3258, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.564540059347181, |
|
"grad_norm": 8.627307891845703, |
|
"learning_rate": 1.197941760155147e-06, |
|
"loss": 1.3783, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.5652818991097923, |
|
"grad_norm": 8.954816818237305, |
|
"learning_rate": 1.1945183589603436e-06, |
|
"loss": 1.3198, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.5660237388724035, |
|
"grad_norm": 9.60593318939209, |
|
"learning_rate": 1.191096616989e-06, |
|
"loss": 1.4557, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.5667655786350149, |
|
"grad_norm": 10.09070110321045, |
|
"learning_rate": 1.1876765528263054e-06, |
|
"loss": 1.2944, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.5675074183976261, |
|
"grad_norm": 9.579095840454102, |
|
"learning_rate": 1.1842581850483386e-06, |
|
"loss": 1.1665, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.5682492581602374, |
|
"grad_norm": 8.07282829284668, |
|
"learning_rate": 1.1808415322219623e-06, |
|
"loss": 1.2873, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.5689910979228486, |
|
"grad_norm": 8.33482837677002, |
|
"learning_rate": 1.1774266129047268e-06, |
|
"loss": 1.3965, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.56973293768546, |
|
"grad_norm": 7.368827819824219, |
|
"learning_rate": 1.1740134456447643e-06, |
|
"loss": 1.361, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.5704747774480712, |
|
"grad_norm": 7.657955169677734, |
|
"learning_rate": 1.1706020489806927e-06, |
|
"loss": 1.2028, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.5712166172106825, |
|
"grad_norm": 10.629265785217285, |
|
"learning_rate": 1.1671924414415115e-06, |
|
"loss": 1.4689, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.5719584569732937, |
|
"grad_norm": 8.179962158203125, |
|
"learning_rate": 1.1637846415465042e-06, |
|
"loss": 1.2847, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.5727002967359051, |
|
"grad_norm": 9.228793144226074, |
|
"learning_rate": 1.160378667805134e-06, |
|
"loss": 1.2259, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.5734421364985163, |
|
"grad_norm": 9.42039966583252, |
|
"learning_rate": 1.1569745387169476e-06, |
|
"loss": 1.3845, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.5741839762611276, |
|
"grad_norm": 7.016010284423828, |
|
"learning_rate": 1.15357227277147e-06, |
|
"loss": 1.2342, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.5749258160237388, |
|
"grad_norm": 8.45107650756836, |
|
"learning_rate": 1.1501718884481093e-06, |
|
"loss": 1.2879, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.5756676557863502, |
|
"grad_norm": 8.31155776977539, |
|
"learning_rate": 1.1467734042160506e-06, |
|
"loss": 1.1682, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.5764094955489614, |
|
"grad_norm": 6.724592208862305, |
|
"learning_rate": 1.1433768385341618e-06, |
|
"loss": 1.279, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.5771513353115727, |
|
"grad_norm": 7.946195602416992, |
|
"learning_rate": 1.1399822098508868e-06, |
|
"loss": 1.2484, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.577893175074184, |
|
"grad_norm": 8.1701078414917, |
|
"learning_rate": 1.1365895366041515e-06, |
|
"loss": 1.4018, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.5786350148367952, |
|
"grad_norm": 7.849374771118164, |
|
"learning_rate": 1.1331988372212606e-06, |
|
"loss": 1.3815, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.5793768545994066, |
|
"grad_norm": 7.588284015655518, |
|
"learning_rate": 1.129810130118795e-06, |
|
"loss": 1.2523, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.5801186943620178, |
|
"grad_norm": 7.635886192321777, |
|
"learning_rate": 1.1264234337025184e-06, |
|
"loss": 1.3134, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.5808605341246291, |
|
"grad_norm": 8.270919799804688, |
|
"learning_rate": 1.1230387663672702e-06, |
|
"loss": 1.2948, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.5816023738872403, |
|
"grad_norm": 8.233508110046387, |
|
"learning_rate": 1.1196561464968714e-06, |
|
"loss": 1.4182, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.5823442136498517, |
|
"grad_norm": 7.905995845794678, |
|
"learning_rate": 1.1162755924640197e-06, |
|
"loss": 1.2159, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.5830860534124629, |
|
"grad_norm": 8.946208953857422, |
|
"learning_rate": 1.1128971226301945e-06, |
|
"loss": 1.3037, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.5838278931750742, |
|
"grad_norm": 7.928177833557129, |
|
"learning_rate": 1.1095207553455534e-06, |
|
"loss": 1.2651, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.5845697329376854, |
|
"grad_norm": 8.850945472717285, |
|
"learning_rate": 1.106146508948835e-06, |
|
"loss": 1.2274, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.5853115727002968, |
|
"grad_norm": 8.4835844039917, |
|
"learning_rate": 1.1027744017672569e-06, |
|
"loss": 1.3851, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.586053412462908, |
|
"grad_norm": 9.85268783569336, |
|
"learning_rate": 1.0994044521164195e-06, |
|
"loss": 1.2782, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.5867952522255193, |
|
"grad_norm": 9.465106964111328, |
|
"learning_rate": 1.0960366783002025e-06, |
|
"loss": 1.3173, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.5875370919881305, |
|
"grad_norm": 8.64224624633789, |
|
"learning_rate": 1.0926710986106692e-06, |
|
"loss": 1.2422, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.5882789317507419, |
|
"grad_norm": 7.39610481262207, |
|
"learning_rate": 1.0893077313279645e-06, |
|
"loss": 1.3971, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.5890207715133531, |
|
"grad_norm": 10.103199005126953, |
|
"learning_rate": 1.0859465947202174e-06, |
|
"loss": 1.2907, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.5897626112759644, |
|
"grad_norm": 8.195404052734375, |
|
"learning_rate": 1.08258770704344e-06, |
|
"loss": 1.3106, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.5905044510385756, |
|
"grad_norm": 10.311722755432129, |
|
"learning_rate": 1.0792310865414305e-06, |
|
"loss": 1.482, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.591246290801187, |
|
"grad_norm": 9.214127540588379, |
|
"learning_rate": 1.075876751445672e-06, |
|
"loss": 1.366, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.5919881305637982, |
|
"grad_norm": 8.142541885375977, |
|
"learning_rate": 1.0725247199752353e-06, |
|
"loss": 1.2611, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.5927299703264095, |
|
"grad_norm": 8.227744102478027, |
|
"learning_rate": 1.0691750103366772e-06, |
|
"loss": 1.3838, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.5934718100890207, |
|
"grad_norm": 7.327287673950195, |
|
"learning_rate": 1.0658276407239463e-06, |
|
"loss": 1.3833, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.5934718100890207, |
|
"eval_loss": 1.310362458229065, |
|
"eval_runtime": 23.6142, |
|
"eval_samples_per_second": 18.887, |
|
"eval_steps_per_second": 9.443, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.594213649851632, |
|
"grad_norm": 7.846217155456543, |
|
"learning_rate": 1.0624826293182785e-06, |
|
"loss": 1.3256, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.5949554896142433, |
|
"grad_norm": 10.000598907470703, |
|
"learning_rate": 1.0591399942881038e-06, |
|
"loss": 1.2878, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.5956973293768546, |
|
"grad_norm": 12.841207504272461, |
|
"learning_rate": 1.0557997537889423e-06, |
|
"loss": 1.506, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.5964391691394659, |
|
"grad_norm": 12.134847640991211, |
|
"learning_rate": 1.05246192596331e-06, |
|
"loss": 1.2424, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.5971810089020771, |
|
"grad_norm": 8.403639793395996, |
|
"learning_rate": 1.0491265289406184e-06, |
|
"loss": 1.2328, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.5979228486646885, |
|
"grad_norm": 8.479205131530762, |
|
"learning_rate": 1.0457935808370746e-06, |
|
"loss": 1.3008, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.5986646884272997, |
|
"grad_norm": 8.48107624053955, |
|
"learning_rate": 1.0424630997555867e-06, |
|
"loss": 1.3708, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.599406528189911, |
|
"grad_norm": 7.639008045196533, |
|
"learning_rate": 1.0391351037856604e-06, |
|
"loss": 1.3698, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.6001483679525222, |
|
"grad_norm": 8.282002449035645, |
|
"learning_rate": 1.0358096110033063e-06, |
|
"loss": 1.3946, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.6008902077151336, |
|
"grad_norm": 7.605436325073242, |
|
"learning_rate": 1.0324866394709365e-06, |
|
"loss": 1.3852, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.6016320474777448, |
|
"grad_norm": 7.768093585968018, |
|
"learning_rate": 1.0291662072372715e-06, |
|
"loss": 1.389, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.6023738872403561, |
|
"grad_norm": 8.550724029541016, |
|
"learning_rate": 1.0258483323372364e-06, |
|
"loss": 1.3193, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.6031157270029673, |
|
"grad_norm": 7.690985202789307, |
|
"learning_rate": 1.0225330327918696e-06, |
|
"loss": 1.2423, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.6038575667655787, |
|
"grad_norm": 8.371797561645508, |
|
"learning_rate": 1.0192203266082185e-06, |
|
"loss": 1.4319, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.6045994065281899, |
|
"grad_norm": 7.518775463104248, |
|
"learning_rate": 1.0159102317792468e-06, |
|
"loss": 1.3037, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.6053412462908012, |
|
"grad_norm": 12.325798034667969, |
|
"learning_rate": 1.012602766283733e-06, |
|
"loss": 1.4, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.6060830860534124, |
|
"grad_norm": 8.312942504882812, |
|
"learning_rate": 1.0092979480861763e-06, |
|
"loss": 1.4318, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.6068249258160238, |
|
"grad_norm": 8.462662696838379, |
|
"learning_rate": 1.0059957951366943e-06, |
|
"loss": 1.2307, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.607566765578635, |
|
"grad_norm": 8.316648483276367, |
|
"learning_rate": 1.0026963253709315e-06, |
|
"loss": 1.2333, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.6083086053412463, |
|
"grad_norm": 10.025683403015137, |
|
"learning_rate": 9.993995567099557e-07, |
|
"loss": 1.3134, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.6090504451038575, |
|
"grad_norm": 10.292147636413574, |
|
"learning_rate": 9.961055070601667e-07, |
|
"loss": 1.1875, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.6097922848664689, |
|
"grad_norm": 7.682520389556885, |
|
"learning_rate": 9.928141943131926e-07, |
|
"loss": 1.2678, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.6105341246290801, |
|
"grad_norm": 9.251666069030762, |
|
"learning_rate": 9.895256363457996e-07, |
|
"loss": 1.4774, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.6112759643916914, |
|
"grad_norm": 9.813620567321777, |
|
"learning_rate": 9.862398510197875e-07, |
|
"loss": 1.4223, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.6120178041543026, |
|
"grad_norm": 8.737075805664062, |
|
"learning_rate": 9.829568561819005e-07, |
|
"loss": 1.2286, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.612759643916914, |
|
"grad_norm": 8.058504104614258, |
|
"learning_rate": 9.796766696637232e-07, |
|
"loss": 1.3313, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.6135014836795252, |
|
"grad_norm": 8.038324356079102, |
|
"learning_rate": 9.763993092815876e-07, |
|
"loss": 1.3329, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.6142433234421365, |
|
"grad_norm": 8.749686241149902, |
|
"learning_rate": 9.731247928364766e-07, |
|
"loss": 1.29, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.6149851632047477, |
|
"grad_norm": 9.395393371582031, |
|
"learning_rate": 9.69853138113925e-07, |
|
"loss": 1.3178, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.615727002967359, |
|
"grad_norm": 7.980838775634766, |
|
"learning_rate": 9.665843628839246e-07, |
|
"loss": 1.2876, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.6164688427299704, |
|
"grad_norm": 7.525308609008789, |
|
"learning_rate": 9.633184849008272e-07, |
|
"loss": 1.4126, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.6172106824925816, |
|
"grad_norm": 7.652345657348633, |
|
"learning_rate": 9.600555219032493e-07, |
|
"loss": 1.3087, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.6179525222551929, |
|
"grad_norm": 8.231952667236328, |
|
"learning_rate": 9.567954916139718e-07, |
|
"loss": 1.3444, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.6186943620178041, |
|
"grad_norm": 8.18375301361084, |
|
"learning_rate": 9.535384117398501e-07, |
|
"loss": 1.274, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.6194362017804155, |
|
"grad_norm": 6.912817478179932, |
|
"learning_rate": 9.502842999717117e-07, |
|
"loss": 1.3022, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.6201780415430267, |
|
"grad_norm": 8.46898078918457, |
|
"learning_rate": 9.470331739842646e-07, |
|
"loss": 1.4138, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.620919881305638, |
|
"grad_norm": 8.513370513916016, |
|
"learning_rate": 9.43785051435998e-07, |
|
"loss": 1.1663, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.6216617210682492, |
|
"grad_norm": 7.527527332305908, |
|
"learning_rate": 9.405399499690899e-07, |
|
"loss": 1.2236, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.6224035608308606, |
|
"grad_norm": 8.23088264465332, |
|
"learning_rate": 9.372978872093067e-07, |
|
"loss": 1.3095, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.6231454005934718, |
|
"grad_norm": 8.179079055786133, |
|
"learning_rate": 9.340588807659127e-07, |
|
"loss": 1.3372, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.6238872403560831, |
|
"grad_norm": 8.004188537597656, |
|
"learning_rate": 9.308229482315696e-07, |
|
"loss": 1.401, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.6246290801186943, |
|
"grad_norm": 8.311544418334961, |
|
"learning_rate": 9.275901071822453e-07, |
|
"loss": 1.2743, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.6253709198813057, |
|
"grad_norm": 9.387375831604004, |
|
"learning_rate": 9.243603751771139e-07, |
|
"loss": 1.3015, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.6261127596439169, |
|
"grad_norm": 7.231583118438721, |
|
"learning_rate": 9.211337697584654e-07, |
|
"loss": 1.3418, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.6268545994065282, |
|
"grad_norm": 7.891911029815674, |
|
"learning_rate": 9.179103084516049e-07, |
|
"loss": 1.2991, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.6275964391691394, |
|
"grad_norm": 8.7365083694458, |
|
"learning_rate": 9.14690008764763e-07, |
|
"loss": 1.4723, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.6283382789317508, |
|
"grad_norm": 8.894607543945312, |
|
"learning_rate": 9.114728881889955e-07, |
|
"loss": 1.4044, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.629080118694362, |
|
"grad_norm": 9.663780212402344, |
|
"learning_rate": 9.082589641980931e-07, |
|
"loss": 1.3265, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.6298219584569733, |
|
"grad_norm": 9.393049240112305, |
|
"learning_rate": 9.050482542484822e-07, |
|
"loss": 1.3115, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.6305637982195845, |
|
"grad_norm": 8.080445289611816, |
|
"learning_rate": 9.018407757791341e-07, |
|
"loss": 1.3217, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.6305637982195845, |
|
"eval_loss": 1.3059756755828857, |
|
"eval_runtime": 23.6134, |
|
"eval_samples_per_second": 18.888, |
|
"eval_steps_per_second": 9.444, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.6313056379821959, |
|
"grad_norm": 7.894256114959717, |
|
"learning_rate": 8.986365462114664e-07, |
|
"loss": 1.2935, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.6320474777448071, |
|
"grad_norm": 9.298333168029785, |
|
"learning_rate": 8.954355829492521e-07, |
|
"loss": 1.4362, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.6327893175074184, |
|
"grad_norm": 7.979432106018066, |
|
"learning_rate": 8.922379033785212e-07, |
|
"loss": 1.5357, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.6335311572700296, |
|
"grad_norm": 8.477805137634277, |
|
"learning_rate": 8.890435248674709e-07, |
|
"loss": 1.2728, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.634272997032641, |
|
"grad_norm": 9.298026084899902, |
|
"learning_rate": 8.858524647663661e-07, |
|
"loss": 1.4405, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.6350148367952523, |
|
"grad_norm": 8.434690475463867, |
|
"learning_rate": 8.826647404074497e-07, |
|
"loss": 1.2176, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.6357566765578635, |
|
"grad_norm": 8.349011421203613, |
|
"learning_rate": 8.794803691048457e-07, |
|
"loss": 1.3891, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.6364985163204748, |
|
"grad_norm": 7.770542144775391, |
|
"learning_rate": 8.762993681544657e-07, |
|
"loss": 1.2877, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.637240356083086, |
|
"grad_norm": 9.452780723571777, |
|
"learning_rate": 8.731217548339163e-07, |
|
"loss": 1.4215, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.6379821958456974, |
|
"grad_norm": 7.791207790374756, |
|
"learning_rate": 8.699475464024022e-07, |
|
"loss": 1.2664, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.6387240356083086, |
|
"grad_norm": 8.289834976196289, |
|
"learning_rate": 8.667767601006372e-07, |
|
"loss": 1.2292, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.6394658753709199, |
|
"grad_norm": 9.18075180053711, |
|
"learning_rate": 8.63609413150745e-07, |
|
"loss": 1.2276, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.6402077151335311, |
|
"grad_norm": 7.8221635818481445, |
|
"learning_rate": 8.604455227561712e-07, |
|
"loss": 1.1693, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.6409495548961425, |
|
"grad_norm": 8.45543384552002, |
|
"learning_rate": 8.572851061015842e-07, |
|
"loss": 1.3574, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.6416913946587537, |
|
"grad_norm": 8.002989768981934, |
|
"learning_rate": 8.541281803527875e-07, |
|
"loss": 1.1484, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.642433234421365, |
|
"grad_norm": 7.84604549407959, |
|
"learning_rate": 8.509747626566218e-07, |
|
"loss": 1.2894, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.6431750741839762, |
|
"grad_norm": 8.202202796936035, |
|
"learning_rate": 8.478248701408751e-07, |
|
"loss": 1.2653, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.6439169139465876, |
|
"grad_norm": 9.587785720825195, |
|
"learning_rate": 8.44678519914187e-07, |
|
"loss": 1.2584, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.6446587537091988, |
|
"grad_norm": 7.761561870574951, |
|
"learning_rate": 8.415357290659591e-07, |
|
"loss": 1.2958, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.6454005934718101, |
|
"grad_norm": 8.499533653259277, |
|
"learning_rate": 8.383965146662582e-07, |
|
"loss": 1.2073, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.6461424332344213, |
|
"grad_norm": 9.094478607177734, |
|
"learning_rate": 8.352608937657273e-07, |
|
"loss": 1.4064, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.6468842729970327, |
|
"grad_norm": 9.180924415588379, |
|
"learning_rate": 8.321288833954896e-07, |
|
"loss": 1.324, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.6476261127596439, |
|
"grad_norm": 8.090041160583496, |
|
"learning_rate": 8.290005005670598e-07, |
|
"loss": 1.2272, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.6483679525222552, |
|
"grad_norm": 8.494268417358398, |
|
"learning_rate": 8.258757622722475e-07, |
|
"loss": 1.298, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.6491097922848664, |
|
"grad_norm": 8.722259521484375, |
|
"learning_rate": 8.227546854830687e-07, |
|
"loss": 1.2791, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.6498516320474778, |
|
"grad_norm": 9.675090789794922, |
|
"learning_rate": 8.196372871516503e-07, |
|
"loss": 1.4562, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.650593471810089, |
|
"grad_norm": 8.035630226135254, |
|
"learning_rate": 8.165235842101421e-07, |
|
"loss": 1.424, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.6513353115727003, |
|
"grad_norm": 7.216797351837158, |
|
"learning_rate": 8.134135935706192e-07, |
|
"loss": 1.3999, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.6520771513353115, |
|
"grad_norm": 9.409671783447266, |
|
"learning_rate": 8.103073321249961e-07, |
|
"loss": 1.213, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.6528189910979229, |
|
"grad_norm": 8.56403923034668, |
|
"learning_rate": 8.072048167449306e-07, |
|
"loss": 1.2852, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.6535608308605341, |
|
"grad_norm": 8.66519546508789, |
|
"learning_rate": 8.041060642817348e-07, |
|
"loss": 1.395, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.6543026706231454, |
|
"grad_norm": 8.845466613769531, |
|
"learning_rate": 8.010110915662808e-07, |
|
"loss": 1.2783, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.6550445103857567, |
|
"grad_norm": 7.585766792297363, |
|
"learning_rate": 7.97919915408913e-07, |
|
"loss": 1.2284, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.655786350148368, |
|
"grad_norm": 9.476142883300781, |
|
"learning_rate": 7.948325525993545e-07, |
|
"loss": 1.3386, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.6565281899109793, |
|
"grad_norm": 14.948787689208984, |
|
"learning_rate": 7.917490199066141e-07, |
|
"loss": 1.2518, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.6572700296735905, |
|
"grad_norm": 8.437254905700684, |
|
"learning_rate": 7.886693340789006e-07, |
|
"loss": 1.342, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.6580118694362018, |
|
"grad_norm": 7.908801555633545, |
|
"learning_rate": 7.855935118435254e-07, |
|
"loss": 1.4527, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.658753709198813, |
|
"grad_norm": 8.332324981689453, |
|
"learning_rate": 7.825215699068171e-07, |
|
"loss": 1.5184, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.6594955489614244, |
|
"grad_norm": 8.469289779663086, |
|
"learning_rate": 7.794535249540267e-07, |
|
"loss": 1.3789, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.6602373887240356, |
|
"grad_norm": 6.876429557800293, |
|
"learning_rate": 7.763893936492411e-07, |
|
"loss": 1.3259, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.6609792284866469, |
|
"grad_norm": 8.458147048950195, |
|
"learning_rate": 7.733291926352871e-07, |
|
"loss": 1.2604, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.6617210682492581, |
|
"grad_norm": 9.535111427307129, |
|
"learning_rate": 7.70272938533647e-07, |
|
"loss": 1.2525, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.6624629080118695, |
|
"grad_norm": 7.999839782714844, |
|
"learning_rate": 7.67220647944363e-07, |
|
"loss": 1.2877, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 0.6632047477744807, |
|
"grad_norm": 8.702188491821289, |
|
"learning_rate": 7.641723374459524e-07, |
|
"loss": 1.2842, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.663946587537092, |
|
"grad_norm": 9.051286697387695, |
|
"learning_rate": 7.61128023595311e-07, |
|
"loss": 1.4605, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.6646884272997032, |
|
"grad_norm": 8.036483764648438, |
|
"learning_rate": 7.580877229276303e-07, |
|
"loss": 1.247, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.6654302670623146, |
|
"grad_norm": 8.07690143585205, |
|
"learning_rate": 7.550514519563013e-07, |
|
"loss": 1.4113, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.6661721068249258, |
|
"grad_norm": 8.693827629089355, |
|
"learning_rate": 7.520192271728303e-07, |
|
"loss": 1.2892, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.6669139465875371, |
|
"grad_norm": 7.980526924133301, |
|
"learning_rate": 7.489910650467445e-07, |
|
"loss": 1.2029, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 0.6676557863501483, |
|
"grad_norm": 8.663146018981934, |
|
"learning_rate": 7.459669820255068e-07, |
|
"loss": 1.2351, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.6676557863501483, |
|
"eval_loss": 1.3026496171951294, |
|
"eval_runtime": 23.6065, |
|
"eval_samples_per_second": 18.893, |
|
"eval_steps_per_second": 9.447, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.6683976261127597, |
|
"grad_norm": 8.916913986206055, |
|
"learning_rate": 7.42946994534422e-07, |
|
"loss": 1.2413, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 0.6691394658753709, |
|
"grad_norm": 8.399556159973145, |
|
"learning_rate": 7.399311189765529e-07, |
|
"loss": 1.1094, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.6698813056379822, |
|
"grad_norm": 9.757269859313965, |
|
"learning_rate": 7.369193717326254e-07, |
|
"loss": 1.3129, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 0.6706231454005934, |
|
"grad_norm": 8.055035591125488, |
|
"learning_rate": 7.339117691609455e-07, |
|
"loss": 1.2858, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.6713649851632048, |
|
"grad_norm": 7.464066028594971, |
|
"learning_rate": 7.309083275973042e-07, |
|
"loss": 1.1974, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.672106824925816, |
|
"grad_norm": 7.977897644042969, |
|
"learning_rate": 7.27909063354895e-07, |
|
"loss": 1.3727, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.6728486646884273, |
|
"grad_norm": 8.40994930267334, |
|
"learning_rate": 7.249139927242198e-07, |
|
"loss": 1.3799, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 0.6735905044510386, |
|
"grad_norm": 7.28301477432251, |
|
"learning_rate": 7.21923131973005e-07, |
|
"loss": 1.2326, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.6743323442136498, |
|
"grad_norm": 8.845423698425293, |
|
"learning_rate": 7.189364973461092e-07, |
|
"loss": 1.349, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 0.6750741839762612, |
|
"grad_norm": 8.522547721862793, |
|
"learning_rate": 7.159541050654386e-07, |
|
"loss": 1.3534, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.6758160237388724, |
|
"grad_norm": 7.7692790031433105, |
|
"learning_rate": 7.129759713298553e-07, |
|
"loss": 1.2062, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 0.6765578635014837, |
|
"grad_norm": 8.87850570678711, |
|
"learning_rate": 7.100021123150917e-07, |
|
"loss": 1.2687, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.6772997032640949, |
|
"grad_norm": 11.794063568115234, |
|
"learning_rate": 7.070325441736635e-07, |
|
"loss": 1.3114, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 0.6780415430267063, |
|
"grad_norm": 9.376462936401367, |
|
"learning_rate": 7.040672830347781e-07, |
|
"loss": 1.3112, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.6787833827893175, |
|
"grad_norm": 9.135132789611816, |
|
"learning_rate": 7.011063450042518e-07, |
|
"loss": 1.3361, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.6795252225519288, |
|
"grad_norm": 8.93464183807373, |
|
"learning_rate": 6.981497461644176e-07, |
|
"loss": 1.3685, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.68026706231454, |
|
"grad_norm": 8.19428539276123, |
|
"learning_rate": 6.951975025740427e-07, |
|
"loss": 1.3093, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 0.6810089020771514, |
|
"grad_norm": 8.19371509552002, |
|
"learning_rate": 6.92249630268236e-07, |
|
"loss": 1.426, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.6817507418397626, |
|
"grad_norm": 7.4333014488220215, |
|
"learning_rate": 6.893061452583667e-07, |
|
"loss": 1.3935, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 0.6824925816023739, |
|
"grad_norm": 9.02108383178711, |
|
"learning_rate": 6.863670635319714e-07, |
|
"loss": 1.3407, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.6832344213649851, |
|
"grad_norm": 10.15578842163086, |
|
"learning_rate": 6.834324010526733e-07, |
|
"loss": 1.3954, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 0.6839762611275965, |
|
"grad_norm": 8.848624229431152, |
|
"learning_rate": 6.805021737600896e-07, |
|
"loss": 1.2578, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.6847181008902077, |
|
"grad_norm": 7.951557636260986, |
|
"learning_rate": 6.775763975697501e-07, |
|
"loss": 1.3615, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 0.685459940652819, |
|
"grad_norm": 8.311724662780762, |
|
"learning_rate": 6.746550883730067e-07, |
|
"loss": 1.1818, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.6862017804154302, |
|
"grad_norm": 7.773900508880615, |
|
"learning_rate": 6.717382620369506e-07, |
|
"loss": 1.3195, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.6869436201780416, |
|
"grad_norm": 9.448432922363281, |
|
"learning_rate": 6.688259344043221e-07, |
|
"loss": 1.1781, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.6876854599406528, |
|
"grad_norm": 7.867501258850098, |
|
"learning_rate": 6.659181212934291e-07, |
|
"loss": 1.2175, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 0.6884272997032641, |
|
"grad_norm": 8.866848945617676, |
|
"learning_rate": 6.630148384980567e-07, |
|
"loss": 1.3159, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.6891691394658753, |
|
"grad_norm": 6.734555244445801, |
|
"learning_rate": 6.601161017873861e-07, |
|
"loss": 1.291, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 0.6899109792284867, |
|
"grad_norm": 7.454867362976074, |
|
"learning_rate": 6.572219269059037e-07, |
|
"loss": 1.2432, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.6906528189910979, |
|
"grad_norm": 8.541337966918945, |
|
"learning_rate": 6.543323295733207e-07, |
|
"loss": 1.3534, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 0.6913946587537092, |
|
"grad_norm": 9.236302375793457, |
|
"learning_rate": 6.514473254844833e-07, |
|
"loss": 1.188, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.6921364985163204, |
|
"grad_norm": 8.086477279663086, |
|
"learning_rate": 6.485669303092917e-07, |
|
"loss": 1.3077, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 0.6928783382789317, |
|
"grad_norm": 9.85937786102295, |
|
"learning_rate": 6.456911596926104e-07, |
|
"loss": 1.3409, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.6936201780415431, |
|
"grad_norm": 8.914828300476074, |
|
"learning_rate": 6.428200292541874e-07, |
|
"loss": 1.4067, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.6943620178041543, |
|
"grad_norm": 9.060097694396973, |
|
"learning_rate": 6.399535545885673e-07, |
|
"loss": 1.4621, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.6951038575667656, |
|
"grad_norm": 8.829442977905273, |
|
"learning_rate": 6.370917512650057e-07, |
|
"loss": 1.0863, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 0.6958456973293768, |
|
"grad_norm": 11.040599822998047, |
|
"learning_rate": 6.342346348273879e-07, |
|
"loss": 1.3622, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.6965875370919882, |
|
"grad_norm": 8.519377708435059, |
|
"learning_rate": 6.313822207941395e-07, |
|
"loss": 1.374, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 0.6973293768545994, |
|
"grad_norm": 7.66409969329834, |
|
"learning_rate": 6.285345246581483e-07, |
|
"loss": 1.2223, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.6980712166172107, |
|
"grad_norm": 7.880566596984863, |
|
"learning_rate": 6.256915618866739e-07, |
|
"loss": 1.2694, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 0.6988130563798219, |
|
"grad_norm": 8.899270057678223, |
|
"learning_rate": 6.228533479212686e-07, |
|
"loss": 1.4051, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.6995548961424333, |
|
"grad_norm": 7.940200328826904, |
|
"learning_rate": 6.200198981776902e-07, |
|
"loss": 1.4107, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 0.7002967359050445, |
|
"grad_norm": 8.319690704345703, |
|
"learning_rate": 6.171912280458215e-07, |
|
"loss": 1.246, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.7010385756676558, |
|
"grad_norm": 9.996255874633789, |
|
"learning_rate": 6.143673528895821e-07, |
|
"loss": 1.2741, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.701780415430267, |
|
"grad_norm": 7.97064733505249, |
|
"learning_rate": 6.115482880468506e-07, |
|
"loss": 1.2776, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.7025222551928784, |
|
"grad_norm": 8.822321891784668, |
|
"learning_rate": 6.087340488293757e-07, |
|
"loss": 1.4845, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 0.7032640949554896, |
|
"grad_norm": 7.667973041534424, |
|
"learning_rate": 6.059246505226985e-07, |
|
"loss": 1.3351, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.7040059347181009, |
|
"grad_norm": 9.208893775939941, |
|
"learning_rate": 6.031201083860636e-07, |
|
"loss": 1.3834, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 0.7047477744807121, |
|
"grad_norm": 11.381084442138672, |
|
"learning_rate": 6.003204376523425e-07, |
|
"loss": 1.5295, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7047477744807121, |
|
"eval_loss": 1.298993468284607, |
|
"eval_runtime": 23.6145, |
|
"eval_samples_per_second": 18.887, |
|
"eval_steps_per_second": 9.443, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7054896142433235, |
|
"grad_norm": 8.421923637390137, |
|
"learning_rate": 5.975256535279449e-07, |
|
"loss": 1.3051, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 0.7062314540059347, |
|
"grad_norm": 8.727239608764648, |
|
"learning_rate": 5.94735771192741e-07, |
|
"loss": 1.2386, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.706973293768546, |
|
"grad_norm": 8.890826225280762, |
|
"learning_rate": 5.919508057999751e-07, |
|
"loss": 1.4653, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 0.7077151335311572, |
|
"grad_norm": 8.684818267822266, |
|
"learning_rate": 5.891707724761871e-07, |
|
"loss": 1.3042, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.7084569732937686, |
|
"grad_norm": 8.57326889038086, |
|
"learning_rate": 5.863956863211263e-07, |
|
"loss": 1.3526, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.7091988130563798, |
|
"grad_norm": 9.280582427978516, |
|
"learning_rate": 5.836255624076732e-07, |
|
"loss": 1.3168, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.7099406528189911, |
|
"grad_norm": 8.064082145690918, |
|
"learning_rate": 5.808604157817548e-07, |
|
"loss": 1.3998, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 0.7106824925816023, |
|
"grad_norm": 8.513121604919434, |
|
"learning_rate": 5.781002614622646e-07, |
|
"loss": 1.1547, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.7114243323442137, |
|
"grad_norm": 9.24774169921875, |
|
"learning_rate": 5.753451144409796e-07, |
|
"loss": 1.2401, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 0.712166172106825, |
|
"grad_norm": 8.027381896972656, |
|
"learning_rate": 5.725949896824806e-07, |
|
"loss": 1.3028, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.7129080118694362, |
|
"grad_norm": 7.955070495605469, |
|
"learning_rate": 5.698499021240699e-07, |
|
"loss": 1.2351, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 0.7136498516320475, |
|
"grad_norm": 7.460748672485352, |
|
"learning_rate": 5.671098666756888e-07, |
|
"loss": 1.289, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.7143916913946587, |
|
"grad_norm": 7.7787885665893555, |
|
"learning_rate": 5.643748982198407e-07, |
|
"loss": 1.3013, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 0.7151335311572701, |
|
"grad_norm": 7.563982009887695, |
|
"learning_rate": 5.616450116115045e-07, |
|
"loss": 1.3116, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.7158753709198813, |
|
"grad_norm": 10.66048812866211, |
|
"learning_rate": 5.5892022167806e-07, |
|
"loss": 1.4897, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.7166172106824926, |
|
"grad_norm": 7.235440731048584, |
|
"learning_rate": 5.56200543219202e-07, |
|
"loss": 1.2029, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.7173590504451038, |
|
"grad_norm": 9.794096946716309, |
|
"learning_rate": 5.534859910068643e-07, |
|
"loss": 1.1368, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 0.7181008902077152, |
|
"grad_norm": 12.36744499206543, |
|
"learning_rate": 5.507765797851356e-07, |
|
"loss": 1.2889, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.7188427299703264, |
|
"grad_norm": 8.576911926269531, |
|
"learning_rate": 5.480723242701836e-07, |
|
"loss": 1.2541, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 0.7195845697329377, |
|
"grad_norm": 7.381803512573242, |
|
"learning_rate": 5.4537323915017e-07, |
|
"loss": 1.3102, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.7203264094955489, |
|
"grad_norm": 8.61953353881836, |
|
"learning_rate": 5.426793390851761e-07, |
|
"loss": 1.2292, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 0.7210682492581603, |
|
"grad_norm": 9.234679222106934, |
|
"learning_rate": 5.399906387071186e-07, |
|
"loss": 1.4074, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.7218100890207715, |
|
"grad_norm": 7.804644584655762, |
|
"learning_rate": 5.373071526196739e-07, |
|
"loss": 1.1214, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 0.7225519287833828, |
|
"grad_norm": 9.674349784851074, |
|
"learning_rate": 5.346288953981949e-07, |
|
"loss": 1.2788, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.723293768545994, |
|
"grad_norm": 8.411057472229004, |
|
"learning_rate": 5.319558815896363e-07, |
|
"loss": 1.227, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.7240356083086054, |
|
"grad_norm": 8.84724235534668, |
|
"learning_rate": 5.29288125712471e-07, |
|
"loss": 1.2271, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.7247774480712166, |
|
"grad_norm": 8.127745628356934, |
|
"learning_rate": 5.266256422566145e-07, |
|
"loss": 1.2995, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 0.7255192878338279, |
|
"grad_norm": 7.898895740509033, |
|
"learning_rate": 5.239684456833457e-07, |
|
"loss": 1.1288, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.7262611275964391, |
|
"grad_norm": 8.459515571594238, |
|
"learning_rate": 5.213165504252262e-07, |
|
"loss": 1.373, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 0.7270029673590505, |
|
"grad_norm": 9.448688507080078, |
|
"learning_rate": 5.186699708860253e-07, |
|
"loss": 1.2424, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.7277448071216617, |
|
"grad_norm": 8.228900909423828, |
|
"learning_rate": 5.160287214406383e-07, |
|
"loss": 1.2119, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 0.728486646884273, |
|
"grad_norm": 7.960751533508301, |
|
"learning_rate": 5.133928164350119e-07, |
|
"loss": 1.3451, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.7292284866468842, |
|
"grad_norm": 7.8820414543151855, |
|
"learning_rate": 5.107622701860624e-07, |
|
"loss": 1.2296, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 0.7299703264094956, |
|
"grad_norm": 8.707436561584473, |
|
"learning_rate": 5.081370969816023e-07, |
|
"loss": 1.2629, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.7307121661721068, |
|
"grad_norm": 9.171490669250488, |
|
"learning_rate": 5.055173110802586e-07, |
|
"loss": 1.3124, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.7314540059347181, |
|
"grad_norm": 7.622151851654053, |
|
"learning_rate": 5.029029267113971e-07, |
|
"loss": 1.2931, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.7321958456973294, |
|
"grad_norm": 7.796103000640869, |
|
"learning_rate": 5.002939580750467e-07, |
|
"loss": 1.3467, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 0.7329376854599406, |
|
"grad_norm": 8.309154510498047, |
|
"learning_rate": 4.976904193418203e-07, |
|
"loss": 1.3801, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.733679525222552, |
|
"grad_norm": 8.498586654663086, |
|
"learning_rate": 4.950923246528368e-07, |
|
"loss": 1.2142, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 0.7344213649851632, |
|
"grad_norm": 8.15847396850586, |
|
"learning_rate": 4.92499688119648e-07, |
|
"loss": 1.2417, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.7351632047477745, |
|
"grad_norm": 8.350110054016113, |
|
"learning_rate": 4.899125238241574e-07, |
|
"loss": 1.3085, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 0.7359050445103857, |
|
"grad_norm": 8.587996482849121, |
|
"learning_rate": 4.873308458185486e-07, |
|
"loss": 1.1625, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.7366468842729971, |
|
"grad_norm": 6.703005313873291, |
|
"learning_rate": 4.847546681252034e-07, |
|
"loss": 1.2597, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 0.7373887240356083, |
|
"grad_norm": 8.741930961608887, |
|
"learning_rate": 4.821840047366322e-07, |
|
"loss": 1.3137, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.7381305637982196, |
|
"grad_norm": 9.368997573852539, |
|
"learning_rate": 4.796188696153909e-07, |
|
"loss": 1.4068, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.7388724035608308, |
|
"grad_norm": 9.121284484863281, |
|
"learning_rate": 4.770592766940116e-07, |
|
"loss": 1.284, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.7396142433234422, |
|
"grad_norm": 8.773377418518066, |
|
"learning_rate": 4.745052398749213e-07, |
|
"loss": 1.3025, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 0.7403560830860534, |
|
"grad_norm": 8.985709190368652, |
|
"learning_rate": 4.719567730303719e-07, |
|
"loss": 1.276, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.7410979228486647, |
|
"grad_norm": 7.726775169372559, |
|
"learning_rate": 4.6941389000235893e-07, |
|
"loss": 1.2906, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 0.7418397626112759, |
|
"grad_norm": 8.630135536193848, |
|
"learning_rate": 4.668766046025522e-07, |
|
"loss": 1.293, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7418397626112759, |
|
"eval_loss": 1.2966691255569458, |
|
"eval_runtime": 23.6049, |
|
"eval_samples_per_second": 18.894, |
|
"eval_steps_per_second": 9.447, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7425816023738873, |
|
"grad_norm": 9.198748588562012, |
|
"learning_rate": 4.643449306122158e-07, |
|
"loss": 1.2206, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 0.7433234421364985, |
|
"grad_norm": 8.540892601013184, |
|
"learning_rate": 4.618188817821371e-07, |
|
"loss": 1.4011, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 0.7440652818991098, |
|
"grad_norm": 8.046586990356445, |
|
"learning_rate": 4.5929847183254916e-07, |
|
"loss": 1.284, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 0.744807121661721, |
|
"grad_norm": 8.560956954956055, |
|
"learning_rate": 4.567837144530585e-07, |
|
"loss": 1.2844, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 0.7455489614243324, |
|
"grad_norm": 9.451622009277344, |
|
"learning_rate": 4.542746233025685e-07, |
|
"loss": 1.37, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.7462908011869436, |
|
"grad_norm": 10.818734169006348, |
|
"learning_rate": 4.51771212009208e-07, |
|
"loss": 1.3427, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 0.7470326409495549, |
|
"grad_norm": 8.870102882385254, |
|
"learning_rate": 4.492734941702541e-07, |
|
"loss": 1.3504, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 0.7477744807121661, |
|
"grad_norm": 10.100753784179688, |
|
"learning_rate": 4.467814833520613e-07, |
|
"loss": 1.1713, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.7485163204747775, |
|
"grad_norm": 8.58507251739502, |
|
"learning_rate": 4.4429519308998503e-07, |
|
"loss": 1.1272, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 0.7492581602373887, |
|
"grad_norm": 7.8265299797058105, |
|
"learning_rate": 4.41814636888311e-07, |
|
"loss": 1.2065, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 8.231193542480469, |
|
"learning_rate": 4.3933982822017883e-07, |
|
"loss": 1.2077, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 0.7507418397626113, |
|
"grad_norm": 7.965888500213623, |
|
"learning_rate": 4.368707805275116e-07, |
|
"loss": 1.4395, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 0.7514836795252225, |
|
"grad_norm": 9.374115943908691, |
|
"learning_rate": 4.344075072209417e-07, |
|
"loss": 1.2853, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 0.7522255192878339, |
|
"grad_norm": 7.917102813720703, |
|
"learning_rate": 4.3195002167973655e-07, |
|
"loss": 1.3366, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 0.7529673590504451, |
|
"grad_norm": 9.077959060668945, |
|
"learning_rate": 4.294983372517293e-07, |
|
"loss": 1.4383, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.7537091988130564, |
|
"grad_norm": 9.32331657409668, |
|
"learning_rate": 4.2705246725324216e-07, |
|
"loss": 1.2742, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 0.7544510385756676, |
|
"grad_norm": 8.539690971374512, |
|
"learning_rate": 4.246124249690187e-07, |
|
"loss": 1.2168, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 0.755192878338279, |
|
"grad_norm": 8.285751342773438, |
|
"learning_rate": 4.2217822365214686e-07, |
|
"loss": 1.376, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 0.7559347181008902, |
|
"grad_norm": 8.879798889160156, |
|
"learning_rate": 4.197498765239913e-07, |
|
"loss": 1.3534, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 0.7566765578635015, |
|
"grad_norm": 8.319602012634277, |
|
"learning_rate": 4.1732739677411836e-07, |
|
"loss": 1.2968, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.7574183976261127, |
|
"grad_norm": 7.641089916229248, |
|
"learning_rate": 4.149107975602267e-07, |
|
"loss": 1.2378, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 0.7581602373887241, |
|
"grad_norm": 9.449283599853516, |
|
"learning_rate": 4.1250009200807353e-07, |
|
"loss": 1.0789, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 0.7589020771513353, |
|
"grad_norm": 9.45445442199707, |
|
"learning_rate": 4.100952932114066e-07, |
|
"loss": 1.2849, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 0.7596439169139466, |
|
"grad_norm": 7.804203987121582, |
|
"learning_rate": 4.07696414231889e-07, |
|
"loss": 1.2507, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 0.7603857566765578, |
|
"grad_norm": 8.116350173950195, |
|
"learning_rate": 4.0530346809903196e-07, |
|
"loss": 1.2658, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.7611275964391692, |
|
"grad_norm": 9.725852012634277, |
|
"learning_rate": 4.029164678101213e-07, |
|
"loss": 1.462, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 0.7618694362017804, |
|
"grad_norm": 8.416056632995605, |
|
"learning_rate": 4.0053542633014913e-07, |
|
"loss": 1.3301, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 0.7626112759643917, |
|
"grad_norm": 6.388516426086426, |
|
"learning_rate": 3.98160356591741e-07, |
|
"loss": 1.2121, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 0.7633531157270029, |
|
"grad_norm": 7.303947925567627, |
|
"learning_rate": 3.957912714950882e-07, |
|
"loss": 1.2568, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 0.7640949554896143, |
|
"grad_norm": 8.52409553527832, |
|
"learning_rate": 3.9342818390787535e-07, |
|
"loss": 1.435, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.7648367952522255, |
|
"grad_norm": 9.281074523925781, |
|
"learning_rate": 3.910711066652127e-07, |
|
"loss": 1.3805, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 0.7655786350148368, |
|
"grad_norm": 7.558801651000977, |
|
"learning_rate": 3.8872005256956383e-07, |
|
"loss": 1.2831, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 0.766320474777448, |
|
"grad_norm": 9.506136894226074, |
|
"learning_rate": 3.863750343906796e-07, |
|
"loss": 1.396, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 0.7670623145400594, |
|
"grad_norm": 9.334778785705566, |
|
"learning_rate": 3.840360648655247e-07, |
|
"loss": 1.374, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 0.7678041543026706, |
|
"grad_norm": 8.17182445526123, |
|
"learning_rate": 3.8170315669821227e-07, |
|
"loss": 1.3962, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.7685459940652819, |
|
"grad_norm": 8.254951477050781, |
|
"learning_rate": 3.7937632255993176e-07, |
|
"loss": 1.4787, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 0.7692878338278932, |
|
"grad_norm": 7.743471622467041, |
|
"learning_rate": 3.770555750888825e-07, |
|
"loss": 1.3078, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 0.7700296735905044, |
|
"grad_norm": 8.887690544128418, |
|
"learning_rate": 3.747409268902046e-07, |
|
"loss": 1.3241, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 0.7707715133531158, |
|
"grad_norm": 9.078700065612793, |
|
"learning_rate": 3.724323905359082e-07, |
|
"loss": 1.4121, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 0.771513353115727, |
|
"grad_norm": 8.609134674072266, |
|
"learning_rate": 3.7012997856480794e-07, |
|
"loss": 1.2956, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.7722551928783383, |
|
"grad_norm": 8.539812088012695, |
|
"learning_rate": 3.678337034824545e-07, |
|
"loss": 1.2113, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 0.7729970326409495, |
|
"grad_norm": 8.455937385559082, |
|
"learning_rate": 3.655435777610649e-07, |
|
"loss": 1.3018, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 0.7737388724035609, |
|
"grad_norm": 8.811159133911133, |
|
"learning_rate": 3.63259613839457e-07, |
|
"loss": 1.3779, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 0.7744807121661721, |
|
"grad_norm": 8.420944213867188, |
|
"learning_rate": 3.6098182412297944e-07, |
|
"loss": 1.2882, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 0.7752225519287834, |
|
"grad_norm": 8.92984390258789, |
|
"learning_rate": 3.587102209834474e-07, |
|
"loss": 1.227, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.7759643916913946, |
|
"grad_norm": 7.994571208953857, |
|
"learning_rate": 3.564448167590721e-07, |
|
"loss": 1.3261, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 0.776706231454006, |
|
"grad_norm": 7.83929443359375, |
|
"learning_rate": 3.541856237543967e-07, |
|
"loss": 1.5789, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 0.7774480712166172, |
|
"grad_norm": 8.824812889099121, |
|
"learning_rate": 3.51932654240227e-07, |
|
"loss": 1.2063, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 0.7781899109792285, |
|
"grad_norm": 9.47778606414795, |
|
"learning_rate": 3.4968592045356605e-07, |
|
"loss": 1.2887, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 0.7789317507418397, |
|
"grad_norm": 7.607693195343018, |
|
"learning_rate": 3.474454345975488e-07, |
|
"loss": 1.2231, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.7789317507418397, |
|
"eval_loss": 1.294171690940857, |
|
"eval_runtime": 23.6195, |
|
"eval_samples_per_second": 18.883, |
|
"eval_steps_per_second": 9.441, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.7796735905044511, |
|
"grad_norm": 8.560502052307129, |
|
"learning_rate": 3.4521120884137254e-07, |
|
"loss": 1.2739, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 0.7804154302670623, |
|
"grad_norm": 9.082324028015137, |
|
"learning_rate": 3.4298325532023496e-07, |
|
"loss": 1.2654, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 0.7811572700296736, |
|
"grad_norm": 9.28708267211914, |
|
"learning_rate": 3.40761586135264e-07, |
|
"loss": 1.2823, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 0.7818991097922848, |
|
"grad_norm": 8.582283973693848, |
|
"learning_rate": 3.385462133534565e-07, |
|
"loss": 1.2891, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 0.7826409495548962, |
|
"grad_norm": 8.136933326721191, |
|
"learning_rate": 3.3633714900760804e-07, |
|
"loss": 1.2946, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.7833827893175074, |
|
"grad_norm": 9.77947998046875, |
|
"learning_rate": 3.34134405096252e-07, |
|
"loss": 1.1834, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 0.7841246290801187, |
|
"grad_norm": 8.430279731750488, |
|
"learning_rate": 3.319379935835907e-07, |
|
"loss": 1.3921, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 0.7848664688427299, |
|
"grad_norm": 9.608940124511719, |
|
"learning_rate": 3.297479263994334e-07, |
|
"loss": 1.1624, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 0.7856083086053413, |
|
"grad_norm": 8.380851745605469, |
|
"learning_rate": 3.2756421543912855e-07, |
|
"loss": 1.4092, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 0.7863501483679525, |
|
"grad_norm": 8.177634239196777, |
|
"learning_rate": 3.25386872563503e-07, |
|
"loss": 1.1724, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.7870919881305638, |
|
"grad_norm": 9.429834365844727, |
|
"learning_rate": 3.232159095987926e-07, |
|
"loss": 1.2285, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 0.787833827893175, |
|
"grad_norm": 9.116068840026855, |
|
"learning_rate": 3.2105133833658333e-07, |
|
"loss": 1.27, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 0.7885756676557863, |
|
"grad_norm": 7.366293430328369, |
|
"learning_rate": 3.1889317053374265e-07, |
|
"loss": 1.2879, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 0.7893175074183977, |
|
"grad_norm": 8.340385437011719, |
|
"learning_rate": 3.167414179123589e-07, |
|
"loss": 1.4321, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 0.7900593471810089, |
|
"grad_norm": 8.602953910827637, |
|
"learning_rate": 3.145960921596762e-07, |
|
"loss": 1.3273, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.7908011869436202, |
|
"grad_norm": 8.914224624633789, |
|
"learning_rate": 3.124572049280301e-07, |
|
"loss": 1.2689, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 0.7915430267062314, |
|
"grad_norm": 8.413691520690918, |
|
"learning_rate": 3.1032476783478694e-07, |
|
"loss": 1.226, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 0.7922848664688428, |
|
"grad_norm": 8.446022033691406, |
|
"learning_rate": 3.0819879246227737e-07, |
|
"loss": 1.3581, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 0.793026706231454, |
|
"grad_norm": 9.09033203125, |
|
"learning_rate": 3.0607929035773686e-07, |
|
"loss": 1.3828, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 0.7937685459940653, |
|
"grad_norm": 7.936834812164307, |
|
"learning_rate": 3.039662730332399e-07, |
|
"loss": 1.3225, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.7945103857566765, |
|
"grad_norm": 8.607504844665527, |
|
"learning_rate": 3.018597519656404e-07, |
|
"loss": 1.3454, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 0.7952522255192879, |
|
"grad_norm": 9.906630516052246, |
|
"learning_rate": 2.997597385965062e-07, |
|
"loss": 1.3378, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 0.7959940652818991, |
|
"grad_norm": 8.220865249633789, |
|
"learning_rate": 2.9766624433206e-07, |
|
"loss": 1.39, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 0.7967359050445104, |
|
"grad_norm": 9.154244422912598, |
|
"learning_rate": 2.955792805431149e-07, |
|
"loss": 1.5, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 0.7974777448071216, |
|
"grad_norm": 8.345115661621094, |
|
"learning_rate": 2.93498858565015e-07, |
|
"loss": 1.1509, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.798219584569733, |
|
"grad_norm": 8.3400239944458, |
|
"learning_rate": 2.914249896975705e-07, |
|
"loss": 1.2945, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 0.7989614243323442, |
|
"grad_norm": 8.932135581970215, |
|
"learning_rate": 2.89357685205001e-07, |
|
"loss": 1.3422, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 0.7997032640949555, |
|
"grad_norm": 9.743860244750977, |
|
"learning_rate": 2.872969563158693e-07, |
|
"loss": 1.321, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 0.8004451038575667, |
|
"grad_norm": 8.630380630493164, |
|
"learning_rate": 2.852428142230246e-07, |
|
"loss": 1.2738, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 0.8011869436201781, |
|
"grad_norm": 10.236412048339844, |
|
"learning_rate": 2.831952700835386e-07, |
|
"loss": 1.3867, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.8019287833827893, |
|
"grad_norm": 9.148995399475098, |
|
"learning_rate": 2.811543350186474e-07, |
|
"loss": 1.4451, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 0.8026706231454006, |
|
"grad_norm": 8.069551467895508, |
|
"learning_rate": 2.791200201136886e-07, |
|
"loss": 1.2106, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 0.8034124629080118, |
|
"grad_norm": 8.256972312927246, |
|
"learning_rate": 2.7709233641804396e-07, |
|
"loss": 1.279, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 0.8041543026706232, |
|
"grad_norm": 9.485831260681152, |
|
"learning_rate": 2.75071294945076e-07, |
|
"loss": 1.299, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 0.8048961424332344, |
|
"grad_norm": 8.086010932922363, |
|
"learning_rate": 2.730569066720718e-07, |
|
"loss": 1.3489, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.8056379821958457, |
|
"grad_norm": 9.07319450378418, |
|
"learning_rate": 2.710491825401803e-07, |
|
"loss": 1.4439, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 0.8063798219584569, |
|
"grad_norm": 8.647632598876953, |
|
"learning_rate": 2.69048133454355e-07, |
|
"loss": 1.3518, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 0.8071216617210683, |
|
"grad_norm": 8.795075416564941, |
|
"learning_rate": 2.6705377028329315e-07, |
|
"loss": 1.3317, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 0.8078635014836796, |
|
"grad_norm": 9.038534164428711, |
|
"learning_rate": 2.650661038593778e-07, |
|
"loss": 1.2905, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 0.8086053412462908, |
|
"grad_norm": 8.689789772033691, |
|
"learning_rate": 2.630851449786193e-07, |
|
"loss": 1.1946, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.8093471810089021, |
|
"grad_norm": 7.252419471740723, |
|
"learning_rate": 2.6111090440059453e-07, |
|
"loss": 1.3711, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 0.8100890207715133, |
|
"grad_norm": 12.24842357635498, |
|
"learning_rate": 2.5914339284839143e-07, |
|
"loss": 1.1649, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 0.8108308605341247, |
|
"grad_norm": 7.127127170562744, |
|
"learning_rate": 2.5718262100854774e-07, |
|
"loss": 1.1895, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 0.8115727002967359, |
|
"grad_norm": 8.82343578338623, |
|
"learning_rate": 2.55228599530996e-07, |
|
"loss": 1.3349, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 0.8123145400593472, |
|
"grad_norm": 8.194673538208008, |
|
"learning_rate": 2.532813390290026e-07, |
|
"loss": 1.1975, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.8130563798219584, |
|
"grad_norm": 9.27203369140625, |
|
"learning_rate": 2.513408500791135e-07, |
|
"loss": 1.2848, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 0.8137982195845698, |
|
"grad_norm": 8.359867095947266, |
|
"learning_rate": 2.494071432210928e-07, |
|
"loss": 1.295, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 0.814540059347181, |
|
"grad_norm": 8.281832695007324, |
|
"learning_rate": 2.4748022895786977e-07, |
|
"loss": 1.3752, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 0.8152818991097923, |
|
"grad_norm": 9.253495216369629, |
|
"learning_rate": 2.4556011775547804e-07, |
|
"loss": 1.3411, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 0.8160237388724035, |
|
"grad_norm": 7.842648029327393, |
|
"learning_rate": 2.4364682004300195e-07, |
|
"loss": 1.2721, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8160237388724035, |
|
"eval_loss": 1.2925976514816284, |
|
"eval_runtime": 23.6283, |
|
"eval_samples_per_second": 18.876, |
|
"eval_steps_per_second": 9.438, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8167655786350149, |
|
"grad_norm": 7.728657245635986, |
|
"learning_rate": 2.417403462125166e-07, |
|
"loss": 1.3394, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 0.8175074183976261, |
|
"grad_norm": 12.809070587158203, |
|
"learning_rate": 2.398407066190351e-07, |
|
"loss": 1.4096, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 0.8182492581602374, |
|
"grad_norm": 8.430037498474121, |
|
"learning_rate": 2.3794791158044865e-07, |
|
"loss": 1.1904, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 0.8189910979228486, |
|
"grad_norm": 7.398181915283203, |
|
"learning_rate": 2.3606197137747366e-07, |
|
"loss": 1.1156, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 0.81973293768546, |
|
"grad_norm": 7.863873481750488, |
|
"learning_rate": 2.341828962535932e-07, |
|
"loss": 1.3122, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.8204747774480712, |
|
"grad_norm": 8.728108406066895, |
|
"learning_rate": 2.3231069641500414e-07, |
|
"loss": 1.2932, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 0.8212166172106825, |
|
"grad_norm": 8.522738456726074, |
|
"learning_rate": 2.3044538203055876e-07, |
|
"loss": 1.3695, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 0.8219584569732937, |
|
"grad_norm": 8.17654800415039, |
|
"learning_rate": 2.2858696323171225e-07, |
|
"loss": 1.2997, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 0.8227002967359051, |
|
"grad_norm": 9.464920997619629, |
|
"learning_rate": 2.267354501124652e-07, |
|
"loss": 1.1666, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 0.8234421364985163, |
|
"grad_norm": 8.210715293884277, |
|
"learning_rate": 2.2489085272931132e-07, |
|
"loss": 1.3267, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.8241839762611276, |
|
"grad_norm": 8.052397727966309, |
|
"learning_rate": 2.230531811011804e-07, |
|
"loss": 1.1532, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 0.8249258160237388, |
|
"grad_norm": 8.493069648742676, |
|
"learning_rate": 2.212224452093859e-07, |
|
"loss": 1.3931, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 0.8256676557863502, |
|
"grad_norm": 9.07796573638916, |
|
"learning_rate": 2.1939865499756905e-07, |
|
"loss": 1.2122, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 0.8264094955489614, |
|
"grad_norm": 8.513764381408691, |
|
"learning_rate": 2.1758182037164564e-07, |
|
"loss": 1.3071, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 0.8271513353115727, |
|
"grad_norm": 8.735884666442871, |
|
"learning_rate": 2.1577195119975328e-07, |
|
"loss": 1.3478, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.827893175074184, |
|
"grad_norm": 8.680800437927246, |
|
"learning_rate": 2.1396905731219506e-07, |
|
"loss": 1.3153, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 0.8286350148367952, |
|
"grad_norm": 9.367341041564941, |
|
"learning_rate": 2.1217314850138952e-07, |
|
"loss": 1.4447, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 0.8293768545994066, |
|
"grad_norm": 7.34644079208374, |
|
"learning_rate": 2.103842345218142e-07, |
|
"loss": 1.1235, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 0.8301186943620178, |
|
"grad_norm": 8.427669525146484, |
|
"learning_rate": 2.0860232508995558e-07, |
|
"loss": 1.5228, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 0.8308605341246291, |
|
"grad_norm": 8.698617935180664, |
|
"learning_rate": 2.068274298842537e-07, |
|
"loss": 1.3354, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.8316023738872403, |
|
"grad_norm": 9.806768417358398, |
|
"learning_rate": 2.050595585450522e-07, |
|
"loss": 1.4197, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 0.8323442136498517, |
|
"grad_norm": 8.465378761291504, |
|
"learning_rate": 2.0329872067454286e-07, |
|
"loss": 1.3557, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 0.8330860534124629, |
|
"grad_norm": 9.589797973632812, |
|
"learning_rate": 2.0154492583671708e-07, |
|
"loss": 1.2585, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 0.8338278931750742, |
|
"grad_norm": 7.806549072265625, |
|
"learning_rate": 1.9979818355731023e-07, |
|
"loss": 1.2215, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 0.8345697329376854, |
|
"grad_norm": 9.690045356750488, |
|
"learning_rate": 1.9805850332375347e-07, |
|
"loss": 1.347, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.8353115727002968, |
|
"grad_norm": 8.724777221679688, |
|
"learning_rate": 1.9632589458511884e-07, |
|
"loss": 1.236, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 0.836053412462908, |
|
"grad_norm": 9.154813766479492, |
|
"learning_rate": 1.9460036675207077e-07, |
|
"loss": 1.2911, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 0.8367952522255193, |
|
"grad_norm": 7.401409149169922, |
|
"learning_rate": 1.9288192919681274e-07, |
|
"loss": 1.2317, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 0.8375370919881305, |
|
"grad_norm": 8.244491577148438, |
|
"learning_rate": 1.9117059125303858e-07, |
|
"loss": 1.3247, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 0.8382789317507419, |
|
"grad_norm": 8.045402526855469, |
|
"learning_rate": 1.8946636221587916e-07, |
|
"loss": 1.1623, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.8390207715133531, |
|
"grad_norm": 8.480494499206543, |
|
"learning_rate": 1.8776925134185496e-07, |
|
"loss": 1.0614, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 0.8397626112759644, |
|
"grad_norm": 7.870119571685791, |
|
"learning_rate": 1.8607926784882235e-07, |
|
"loss": 1.2517, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 0.8405044510385756, |
|
"grad_norm": 8.129040718078613, |
|
"learning_rate": 1.8439642091592705e-07, |
|
"loss": 1.2463, |
|
"step": 11330 |
|
}, |
|
{ |
|
"epoch": 0.841246290801187, |
|
"grad_norm": 9.212223052978516, |
|
"learning_rate": 1.8272071968355125e-07, |
|
"loss": 1.2069, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 0.8419881305637982, |
|
"grad_norm": 8.223834991455078, |
|
"learning_rate": 1.8105217325326607e-07, |
|
"loss": 1.2398, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.8427299703264095, |
|
"grad_norm": 8.517450332641602, |
|
"learning_rate": 1.7939079068778075e-07, |
|
"loss": 1.3121, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 0.8434718100890207, |
|
"grad_norm": 9.853727340698242, |
|
"learning_rate": 1.7773658101089484e-07, |
|
"loss": 1.3609, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 0.844213649851632, |
|
"grad_norm": 7.983924388885498, |
|
"learning_rate": 1.7608955320744708e-07, |
|
"loss": 1.1758, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 0.8449554896142433, |
|
"grad_norm": 10.495153427124023, |
|
"learning_rate": 1.7444971622326916e-07, |
|
"loss": 1.4902, |
|
"step": 11390 |
|
}, |
|
{ |
|
"epoch": 0.8456973293768546, |
|
"grad_norm": 7.264878273010254, |
|
"learning_rate": 1.7281707896513477e-07, |
|
"loss": 1.3286, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.8464391691394659, |
|
"grad_norm": 8.638553619384766, |
|
"learning_rate": 1.71191650300713e-07, |
|
"loss": 1.3583, |
|
"step": 11410 |
|
}, |
|
{ |
|
"epoch": 0.8471810089020771, |
|
"grad_norm": 8.893415451049805, |
|
"learning_rate": 1.6957343905851974e-07, |
|
"loss": 1.1841, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 0.8479228486646885, |
|
"grad_norm": 8.31513500213623, |
|
"learning_rate": 1.6796245402786814e-07, |
|
"loss": 1.3391, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 0.8486646884272997, |
|
"grad_norm": 8.640569686889648, |
|
"learning_rate": 1.663587039588237e-07, |
|
"loss": 1.3447, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 0.849406528189911, |
|
"grad_norm": 8.092960357666016, |
|
"learning_rate": 1.6476219756215383e-07, |
|
"loss": 1.3939, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.8501483679525222, |
|
"grad_norm": 8.167853355407715, |
|
"learning_rate": 1.631729435092833e-07, |
|
"loss": 1.3579, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 0.8508902077151336, |
|
"grad_norm": 8.560916900634766, |
|
"learning_rate": 1.6159095043224452e-07, |
|
"loss": 1.3254, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 0.8516320474777448, |
|
"grad_norm": 9.065086364746094, |
|
"learning_rate": 1.6001622692363315e-07, |
|
"loss": 1.4158, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 0.8523738872403561, |
|
"grad_norm": 8.567241668701172, |
|
"learning_rate": 1.584487815365589e-07, |
|
"loss": 1.3156, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 0.8531157270029673, |
|
"grad_norm": 8.35258960723877, |
|
"learning_rate": 1.568886227846016e-07, |
|
"loss": 1.3877, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.8531157270029673, |
|
"eval_loss": 1.2913334369659424, |
|
"eval_runtime": 23.6172, |
|
"eval_samples_per_second": 18.885, |
|
"eval_steps_per_second": 9.442, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.8538575667655787, |
|
"grad_norm": 8.62309455871582, |
|
"learning_rate": 1.5533575914176257e-07, |
|
"loss": 1.4222, |
|
"step": 11510 |
|
}, |
|
{ |
|
"epoch": 0.8545994065281899, |
|
"grad_norm": 8.811359405517578, |
|
"learning_rate": 1.5379019904242088e-07, |
|
"loss": 1.2911, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 0.8553412462908012, |
|
"grad_norm": 8.009239196777344, |
|
"learning_rate": 1.5225195088128525e-07, |
|
"loss": 1.2665, |
|
"step": 11530 |
|
}, |
|
{ |
|
"epoch": 0.8560830860534124, |
|
"grad_norm": 7.761435031890869, |
|
"learning_rate": 1.5072102301335056e-07, |
|
"loss": 1.1277, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 0.8568249258160238, |
|
"grad_norm": 10.778253555297852, |
|
"learning_rate": 1.49197423753851e-07, |
|
"loss": 1.2821, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.857566765578635, |
|
"grad_norm": 8.768142700195312, |
|
"learning_rate": 1.4768116137821587e-07, |
|
"loss": 1.4462, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 0.8583086053412463, |
|
"grad_norm": 8.696138381958008, |
|
"learning_rate": 1.461722441220234e-07, |
|
"loss": 1.3719, |
|
"step": 11570 |
|
}, |
|
{ |
|
"epoch": 0.8590504451038575, |
|
"grad_norm": 9.442025184631348, |
|
"learning_rate": 1.4467068018095775e-07, |
|
"loss": 1.1023, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 0.8597922848664689, |
|
"grad_norm": 8.020044326782227, |
|
"learning_rate": 1.4317647771076265e-07, |
|
"loss": 1.5093, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 0.8605341246290801, |
|
"grad_norm": 7.807736396789551, |
|
"learning_rate": 1.4168964482719914e-07, |
|
"loss": 1.2425, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.8612759643916914, |
|
"grad_norm": 8.01576042175293, |
|
"learning_rate": 1.4021018960599885e-07, |
|
"loss": 1.1915, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 0.8620178041543026, |
|
"grad_norm": 8.013411521911621, |
|
"learning_rate": 1.3873812008282306e-07, |
|
"loss": 1.4305, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 0.862759643916914, |
|
"grad_norm": 9.141283988952637, |
|
"learning_rate": 1.3727344425321665e-07, |
|
"loss": 1.4392, |
|
"step": 11630 |
|
}, |
|
{ |
|
"epoch": 0.8635014836795252, |
|
"grad_norm": 8.028132438659668, |
|
"learning_rate": 1.3581617007256646e-07, |
|
"loss": 1.2475, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 0.8642433234421365, |
|
"grad_norm": 7.848435401916504, |
|
"learning_rate": 1.3436630545605622e-07, |
|
"loss": 1.3094, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 0.8649851632047477, |
|
"grad_norm": 11.25391674041748, |
|
"learning_rate": 1.3292385827862608e-07, |
|
"loss": 1.2995, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 0.865727002967359, |
|
"grad_norm": 9.802054405212402, |
|
"learning_rate": 1.3148883637492665e-07, |
|
"loss": 1.2417, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 0.8664688427299704, |
|
"grad_norm": 8.343031883239746, |
|
"learning_rate": 1.3006124753927945e-07, |
|
"loss": 1.2874, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 0.8672106824925816, |
|
"grad_norm": 8.474637985229492, |
|
"learning_rate": 1.2864109952563313e-07, |
|
"loss": 1.2236, |
|
"step": 11690 |
|
}, |
|
{ |
|
"epoch": 0.8679525222551929, |
|
"grad_norm": 8.79692268371582, |
|
"learning_rate": 1.2722840004752085e-07, |
|
"loss": 1.3287, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.8686943620178041, |
|
"grad_norm": 8.213624954223633, |
|
"learning_rate": 1.2582315677802008e-07, |
|
"loss": 1.3982, |
|
"step": 11710 |
|
}, |
|
{ |
|
"epoch": 0.8694362017804155, |
|
"grad_norm": 7.986428260803223, |
|
"learning_rate": 1.2442537734970843e-07, |
|
"loss": 1.3435, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 0.8701780415430267, |
|
"grad_norm": 8.277667045593262, |
|
"learning_rate": 1.2303506935462538e-07, |
|
"loss": 1.1284, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 0.870919881305638, |
|
"grad_norm": 11.558180809020996, |
|
"learning_rate": 1.2165224034422774e-07, |
|
"loss": 1.3261, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 0.8716617210682492, |
|
"grad_norm": 10.026036262512207, |
|
"learning_rate": 1.202768978293516e-07, |
|
"loss": 1.1863, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.8724035608308606, |
|
"grad_norm": 7.991722106933594, |
|
"learning_rate": 1.1890904928016927e-07, |
|
"loss": 1.3425, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 0.8731454005934718, |
|
"grad_norm": 9.529873847961426, |
|
"learning_rate": 1.1754870212614933e-07, |
|
"loss": 1.2864, |
|
"step": 11770 |
|
}, |
|
{ |
|
"epoch": 0.8738872403560831, |
|
"grad_norm": 7.1360883712768555, |
|
"learning_rate": 1.161958637560177e-07, |
|
"loss": 1.2753, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 0.8746290801186943, |
|
"grad_norm": 7.879760265350342, |
|
"learning_rate": 1.1485054151771518e-07, |
|
"loss": 1.3217, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 0.8753709198813057, |
|
"grad_norm": 9.130861282348633, |
|
"learning_rate": 1.1351274271835948e-07, |
|
"loss": 1.2331, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.8761127596439169, |
|
"grad_norm": 8.248236656188965, |
|
"learning_rate": 1.1218247462420422e-07, |
|
"loss": 1.3204, |
|
"step": 11810 |
|
}, |
|
{ |
|
"epoch": 0.8768545994065282, |
|
"grad_norm": 6.826605796813965, |
|
"learning_rate": 1.1085974446060054e-07, |
|
"loss": 1.4165, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 0.8775964391691394, |
|
"grad_norm": 9.05876350402832, |
|
"learning_rate": 1.0954455941195668e-07, |
|
"loss": 1.3007, |
|
"step": 11830 |
|
}, |
|
{ |
|
"epoch": 0.8783382789317508, |
|
"grad_norm": 8.325678825378418, |
|
"learning_rate": 1.0823692662170015e-07, |
|
"loss": 1.5846, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 0.879080118694362, |
|
"grad_norm": 9.26690673828125, |
|
"learning_rate": 1.0693685319223812e-07, |
|
"loss": 1.3343, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.8798219584569733, |
|
"grad_norm": 7.220630645751953, |
|
"learning_rate": 1.0564434618491875e-07, |
|
"loss": 1.3249, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 0.8805637982195845, |
|
"grad_norm": 8.11390495300293, |
|
"learning_rate": 1.0435941261999393e-07, |
|
"loss": 1.1482, |
|
"step": 11870 |
|
}, |
|
{ |
|
"epoch": 0.8813056379821959, |
|
"grad_norm": 7.764613151550293, |
|
"learning_rate": 1.0308205947657978e-07, |
|
"loss": 1.3138, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 0.8820474777448071, |
|
"grad_norm": 8.297335624694824, |
|
"learning_rate": 1.0181229369261985e-07, |
|
"loss": 1.1945, |
|
"step": 11890 |
|
}, |
|
{ |
|
"epoch": 0.8827893175074184, |
|
"grad_norm": 8.837085723876953, |
|
"learning_rate": 1.0055012216484633e-07, |
|
"loss": 1.2443, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.8835311572700296, |
|
"grad_norm": 8.462185859680176, |
|
"learning_rate": 9.929555174874388e-08, |
|
"loss": 1.2518, |
|
"step": 11910 |
|
}, |
|
{ |
|
"epoch": 0.884272997032641, |
|
"grad_norm": 8.001595497131348, |
|
"learning_rate": 9.804858925851124e-08, |
|
"loss": 1.2265, |
|
"step": 11920 |
|
}, |
|
{ |
|
"epoch": 0.8850148367952523, |
|
"grad_norm": 8.231101989746094, |
|
"learning_rate": 9.68092414670248e-08, |
|
"loss": 1.3531, |
|
"step": 11930 |
|
}, |
|
{ |
|
"epoch": 0.8857566765578635, |
|
"grad_norm": 9.410528182983398, |
|
"learning_rate": 9.557751510580209e-08, |
|
"loss": 1.2193, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 0.8864985163204748, |
|
"grad_norm": 7.80114221572876, |
|
"learning_rate": 9.435341686496408e-08, |
|
"loss": 1.3282, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 0.887240356083086, |
|
"grad_norm": 7.797093868255615, |
|
"learning_rate": 9.313695339320066e-08, |
|
"loss": 1.2345, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 0.8879821958456974, |
|
"grad_norm": 8.155489921569824, |
|
"learning_rate": 9.192813129773248e-08, |
|
"loss": 1.1327, |
|
"step": 11970 |
|
}, |
|
{ |
|
"epoch": 0.8887240356083086, |
|
"grad_norm": 7.540963649749756, |
|
"learning_rate": 9.072695714427665e-08, |
|
"loss": 1.2119, |
|
"step": 11980 |
|
}, |
|
{ |
|
"epoch": 0.8894658753709199, |
|
"grad_norm": 8.781906127929688, |
|
"learning_rate": 8.953343745700987e-08, |
|
"loss": 1.2566, |
|
"step": 11990 |
|
}, |
|
{ |
|
"epoch": 0.8902077151335311, |
|
"grad_norm": 8.42147445678711, |
|
"learning_rate": 8.83475787185346e-08, |
|
"loss": 1.2929, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.8902077151335311, |
|
"eval_loss": 1.2903343439102173, |
|
"eval_runtime": 23.6255, |
|
"eval_samples_per_second": 18.878, |
|
"eval_steps_per_second": 9.439, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.8909495548961425, |
|
"grad_norm": 7.471776485443115, |
|
"learning_rate": 8.716938736984192e-08, |
|
"loss": 1.2483, |
|
"step": 12010 |
|
}, |
|
{ |
|
"epoch": 0.8916913946587537, |
|
"grad_norm": 8.134690284729004, |
|
"learning_rate": 8.599886981027805e-08, |
|
"loss": 1.2836, |
|
"step": 12020 |
|
}, |
|
{ |
|
"epoch": 0.892433234421365, |
|
"grad_norm": 7.840508460998535, |
|
"learning_rate": 8.48360323975087e-08, |
|
"loss": 1.2289, |
|
"step": 12030 |
|
}, |
|
{ |
|
"epoch": 0.8931750741839762, |
|
"grad_norm": 7.892512798309326, |
|
"learning_rate": 8.368088144748515e-08, |
|
"loss": 1.267, |
|
"step": 12040 |
|
}, |
|
{ |
|
"epoch": 0.8939169139465876, |
|
"grad_norm": 8.905203819274902, |
|
"learning_rate": 8.253342323440921e-08, |
|
"loss": 1.4043, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 0.8946587537091988, |
|
"grad_norm": 7.420648574829102, |
|
"learning_rate": 8.139366399070014e-08, |
|
"loss": 1.3941, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 0.8954005934718101, |
|
"grad_norm": 8.54706859588623, |
|
"learning_rate": 8.026160990695996e-08, |
|
"loss": 1.3438, |
|
"step": 12070 |
|
}, |
|
{ |
|
"epoch": 0.8961424332344213, |
|
"grad_norm": 7.239863872528076, |
|
"learning_rate": 7.91372671319402e-08, |
|
"loss": 1.3068, |
|
"step": 12080 |
|
}, |
|
{ |
|
"epoch": 0.8968842729970327, |
|
"grad_norm": 8.183691024780273, |
|
"learning_rate": 7.8020641772508e-08, |
|
"loss": 1.4976, |
|
"step": 12090 |
|
}, |
|
{ |
|
"epoch": 0.8976261127596439, |
|
"grad_norm": 7.157724380493164, |
|
"learning_rate": 7.691173989361428e-08, |
|
"loss": 1.4513, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.8983679525222552, |
|
"grad_norm": 8.098734855651855, |
|
"learning_rate": 7.581056751825893e-08, |
|
"loss": 1.3248, |
|
"step": 12110 |
|
}, |
|
{ |
|
"epoch": 0.8991097922848664, |
|
"grad_norm": 7.755335807800293, |
|
"learning_rate": 7.471713062745967e-08, |
|
"loss": 1.3429, |
|
"step": 12120 |
|
}, |
|
{ |
|
"epoch": 0.8998516320474778, |
|
"grad_norm": 6.79167366027832, |
|
"learning_rate": 7.363143516021858e-08, |
|
"loss": 1.2159, |
|
"step": 12130 |
|
}, |
|
{ |
|
"epoch": 0.900593471810089, |
|
"grad_norm": 8.930359840393066, |
|
"learning_rate": 7.255348701349029e-08, |
|
"loss": 1.297, |
|
"step": 12140 |
|
}, |
|
{ |
|
"epoch": 0.9013353115727003, |
|
"grad_norm": 9.669726371765137, |
|
"learning_rate": 7.148329204214987e-08, |
|
"loss": 1.2854, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.9020771513353115, |
|
"grad_norm": 8.40202522277832, |
|
"learning_rate": 7.042085605896142e-08, |
|
"loss": 1.0888, |
|
"step": 12160 |
|
}, |
|
{ |
|
"epoch": 0.9028189910979229, |
|
"grad_norm": 7.3866729736328125, |
|
"learning_rate": 6.936618483454527e-08, |
|
"loss": 1.4845, |
|
"step": 12170 |
|
}, |
|
{ |
|
"epoch": 0.9035608308605341, |
|
"grad_norm": 8.179498672485352, |
|
"learning_rate": 6.831928409734811e-08, |
|
"loss": 1.2014, |
|
"step": 12180 |
|
}, |
|
{ |
|
"epoch": 0.9043026706231454, |
|
"grad_norm": 8.435233116149902, |
|
"learning_rate": 6.728015953361094e-08, |
|
"loss": 1.248, |
|
"step": 12190 |
|
}, |
|
{ |
|
"epoch": 0.9050445103857567, |
|
"grad_norm": 7.416328430175781, |
|
"learning_rate": 6.624881678733852e-08, |
|
"loss": 1.2651, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.905786350148368, |
|
"grad_norm": 8.049245834350586, |
|
"learning_rate": 6.522526146026924e-08, |
|
"loss": 1.1607, |
|
"step": 12210 |
|
}, |
|
{ |
|
"epoch": 0.9065281899109793, |
|
"grad_norm": 7.982175827026367, |
|
"learning_rate": 6.420949911184288e-08, |
|
"loss": 1.2755, |
|
"step": 12220 |
|
}, |
|
{ |
|
"epoch": 0.9072700296735905, |
|
"grad_norm": 9.001856803894043, |
|
"learning_rate": 6.320153525917299e-08, |
|
"loss": 1.1793, |
|
"step": 12230 |
|
}, |
|
{ |
|
"epoch": 0.9080118694362018, |
|
"grad_norm": 8.896450996398926, |
|
"learning_rate": 6.220137537701459e-08, |
|
"loss": 1.4263, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 0.908753709198813, |
|
"grad_norm": 9.380216598510742, |
|
"learning_rate": 6.120902489773606e-08, |
|
"loss": 1.4032, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.9094955489614244, |
|
"grad_norm": 7.810571193695068, |
|
"learning_rate": 6.022448921128854e-08, |
|
"loss": 1.3057, |
|
"step": 12260 |
|
}, |
|
{ |
|
"epoch": 0.9102373887240356, |
|
"grad_norm": 7.799693584442139, |
|
"learning_rate": 5.9247773665177805e-08, |
|
"loss": 1.3243, |
|
"step": 12270 |
|
}, |
|
{ |
|
"epoch": 0.9109792284866469, |
|
"grad_norm": 8.116616249084473, |
|
"learning_rate": 5.8278883564433614e-08, |
|
"loss": 1.4306, |
|
"step": 12280 |
|
}, |
|
{ |
|
"epoch": 0.9117210682492581, |
|
"grad_norm": 8.54800033569336, |
|
"learning_rate": 5.731782417158271e-08, |
|
"loss": 1.3961, |
|
"step": 12290 |
|
}, |
|
{ |
|
"epoch": 0.9124629080118695, |
|
"grad_norm": 8.728897094726562, |
|
"learning_rate": 5.636460070661853e-08, |
|
"loss": 1.3383, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.9132047477744807, |
|
"grad_norm": 8.328527450561523, |
|
"learning_rate": 5.5419218346974723e-08, |
|
"loss": 1.2801, |
|
"step": 12310 |
|
}, |
|
{ |
|
"epoch": 0.913946587537092, |
|
"grad_norm": 7.380051612854004, |
|
"learning_rate": 5.448168222749467e-08, |
|
"loss": 1.3151, |
|
"step": 12320 |
|
}, |
|
{ |
|
"epoch": 0.9146884272997032, |
|
"grad_norm": 9.386639595031738, |
|
"learning_rate": 5.355199744040601e-08, |
|
"loss": 1.2813, |
|
"step": 12330 |
|
}, |
|
{ |
|
"epoch": 0.9154302670623146, |
|
"grad_norm": 8.962152481079102, |
|
"learning_rate": 5.2630169035291164e-08, |
|
"loss": 1.3694, |
|
"step": 12340 |
|
}, |
|
{ |
|
"epoch": 0.9161721068249258, |
|
"grad_norm": 8.48715877532959, |
|
"learning_rate": 5.171620201906119e-08, |
|
"loss": 1.2331, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 0.9169139465875371, |
|
"grad_norm": 8.15807819366455, |
|
"learning_rate": 5.081010135592745e-08, |
|
"loss": 1.2515, |
|
"step": 12360 |
|
}, |
|
{ |
|
"epoch": 0.9176557863501483, |
|
"grad_norm": 7.585864543914795, |
|
"learning_rate": 4.9911871967375675e-08, |
|
"loss": 1.2352, |
|
"step": 12370 |
|
}, |
|
{ |
|
"epoch": 0.9183976261127597, |
|
"grad_norm": 7.90684700012207, |
|
"learning_rate": 4.902151873213828e-08, |
|
"loss": 1.2776, |
|
"step": 12380 |
|
}, |
|
{ |
|
"epoch": 0.9191394658753709, |
|
"grad_norm": 10.257676124572754, |
|
"learning_rate": 4.813904648616907e-08, |
|
"loss": 1.3307, |
|
"step": 12390 |
|
}, |
|
{ |
|
"epoch": 0.9198813056379822, |
|
"grad_norm": 8.50632095336914, |
|
"learning_rate": 4.7264460022615416e-08, |
|
"loss": 1.2977, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.9206231454005934, |
|
"grad_norm": 7.718177318572998, |
|
"learning_rate": 4.63977640917938e-08, |
|
"loss": 1.1651, |
|
"step": 12410 |
|
}, |
|
{ |
|
"epoch": 0.9213649851632048, |
|
"grad_norm": 8.514959335327148, |
|
"learning_rate": 4.5538963401162645e-08, |
|
"loss": 1.197, |
|
"step": 12420 |
|
}, |
|
{ |
|
"epoch": 0.922106824925816, |
|
"grad_norm": 7.185023784637451, |
|
"learning_rate": 4.468806261529801e-08, |
|
"loss": 1.2111, |
|
"step": 12430 |
|
}, |
|
{ |
|
"epoch": 0.9228486646884273, |
|
"grad_norm": 10.854412078857422, |
|
"learning_rate": 4.38450663558671e-08, |
|
"loss": 1.2498, |
|
"step": 12440 |
|
}, |
|
{ |
|
"epoch": 0.9235905044510386, |
|
"grad_norm": 8.594488143920898, |
|
"learning_rate": 4.3009979201604154e-08, |
|
"loss": 1.2785, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.9243323442136498, |
|
"grad_norm": 8.167387008666992, |
|
"learning_rate": 4.218280568828442e-08, |
|
"loss": 1.4823, |
|
"step": 12460 |
|
}, |
|
{ |
|
"epoch": 0.9250741839762612, |
|
"grad_norm": 9.161100387573242, |
|
"learning_rate": 4.136355030870104e-08, |
|
"loss": 1.2747, |
|
"step": 12470 |
|
}, |
|
{ |
|
"epoch": 0.9258160237388724, |
|
"grad_norm": 8.26723575592041, |
|
"learning_rate": 4.0552217512639213e-08, |
|
"loss": 1.3196, |
|
"step": 12480 |
|
}, |
|
{ |
|
"epoch": 0.9265578635014837, |
|
"grad_norm": 8.994638442993164, |
|
"learning_rate": 3.974881170685274e-08, |
|
"loss": 1.127, |
|
"step": 12490 |
|
}, |
|
{ |
|
"epoch": 0.9272997032640949, |
|
"grad_norm": 9.040610313415527, |
|
"learning_rate": 3.895333725504035e-08, |
|
"loss": 1.4017, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.9272997032640949, |
|
"eval_loss": 1.289976954460144, |
|
"eval_runtime": 23.649, |
|
"eval_samples_per_second": 18.859, |
|
"eval_steps_per_second": 9.43, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.9280415430267063, |
|
"grad_norm": 7.480683326721191, |
|
"learning_rate": 3.816579847782092e-08, |
|
"loss": 1.3201, |
|
"step": 12510 |
|
}, |
|
{ |
|
"epoch": 0.9287833827893175, |
|
"grad_norm": 8.271261215209961, |
|
"learning_rate": 3.738619965271145e-08, |
|
"loss": 1.2206, |
|
"step": 12520 |
|
}, |
|
{ |
|
"epoch": 0.9295252225519288, |
|
"grad_norm": 8.740528106689453, |
|
"learning_rate": 3.661454501410277e-08, |
|
"loss": 1.3493, |
|
"step": 12530 |
|
}, |
|
{ |
|
"epoch": 0.93026706231454, |
|
"grad_norm": 11.37153434753418, |
|
"learning_rate": 3.585083875323675e-08, |
|
"loss": 1.2472, |
|
"step": 12540 |
|
}, |
|
{ |
|
"epoch": 0.9310089020771514, |
|
"grad_norm": 9.2501802444458, |
|
"learning_rate": 3.5095085018183595e-08, |
|
"loss": 1.3783, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 0.9317507418397626, |
|
"grad_norm": 7.809544086456299, |
|
"learning_rate": 3.434728791381991e-08, |
|
"loss": 1.1981, |
|
"step": 12560 |
|
}, |
|
{ |
|
"epoch": 0.9324925816023739, |
|
"grad_norm": 9.464616775512695, |
|
"learning_rate": 3.360745150180522e-08, |
|
"loss": 1.4154, |
|
"step": 12570 |
|
}, |
|
{ |
|
"epoch": 0.9332344213649851, |
|
"grad_norm": 6.653102874755859, |
|
"learning_rate": 3.2875579800561104e-08, |
|
"loss": 1.1891, |
|
"step": 12580 |
|
}, |
|
{ |
|
"epoch": 0.9339762611275965, |
|
"grad_norm": 9.972185134887695, |
|
"learning_rate": 3.215167678524794e-08, |
|
"loss": 1.3693, |
|
"step": 12590 |
|
}, |
|
{ |
|
"epoch": 0.9347181008902077, |
|
"grad_norm": 7.9361419677734375, |
|
"learning_rate": 3.143574638774555e-08, |
|
"loss": 1.274, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.935459940652819, |
|
"grad_norm": 10.938789367675781, |
|
"learning_rate": 3.072779249662905e-08, |
|
"loss": 1.5216, |
|
"step": 12610 |
|
}, |
|
{ |
|
"epoch": 0.9362017804154302, |
|
"grad_norm": 8.189596176147461, |
|
"learning_rate": 3.002781895715023e-08, |
|
"loss": 1.21, |
|
"step": 12620 |
|
}, |
|
{ |
|
"epoch": 0.9369436201780416, |
|
"grad_norm": 7.8240790367126465, |
|
"learning_rate": 2.933582957121489e-08, |
|
"loss": 1.3034, |
|
"step": 12630 |
|
}, |
|
{ |
|
"epoch": 0.9376854599406528, |
|
"grad_norm": 10.08263874053955, |
|
"learning_rate": 2.8651828097363663e-08, |
|
"loss": 1.3179, |
|
"step": 12640 |
|
}, |
|
{ |
|
"epoch": 0.9384272997032641, |
|
"grad_norm": 7.9933366775512695, |
|
"learning_rate": 2.7975818250749906e-08, |
|
"loss": 1.2449, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 0.9391691394658753, |
|
"grad_norm": 10.46999740600586, |
|
"learning_rate": 2.730780370312119e-08, |
|
"loss": 1.2007, |
|
"step": 12660 |
|
}, |
|
{ |
|
"epoch": 0.9399109792284867, |
|
"grad_norm": 8.219291687011719, |
|
"learning_rate": 2.664778808279833e-08, |
|
"loss": 1.1914, |
|
"step": 12670 |
|
}, |
|
{ |
|
"epoch": 0.9406528189910979, |
|
"grad_norm": 7.833841800689697, |
|
"learning_rate": 2.599577497465605e-08, |
|
"loss": 1.257, |
|
"step": 12680 |
|
}, |
|
{ |
|
"epoch": 0.9413946587537092, |
|
"grad_norm": 8.004801750183105, |
|
"learning_rate": 2.5351767920103187e-08, |
|
"loss": 1.227, |
|
"step": 12690 |
|
}, |
|
{ |
|
"epoch": 0.9421364985163204, |
|
"grad_norm": 9.302260398864746, |
|
"learning_rate": 2.4715770417064187e-08, |
|
"loss": 1.5238, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.9428783382789317, |
|
"grad_norm": 9.043020248413086, |
|
"learning_rate": 2.4087785919959137e-08, |
|
"loss": 1.3732, |
|
"step": 12710 |
|
}, |
|
{ |
|
"epoch": 0.9436201780415431, |
|
"grad_norm": 8.14455795288086, |
|
"learning_rate": 2.3467817839685767e-08, |
|
"loss": 1.3711, |
|
"step": 12720 |
|
}, |
|
{ |
|
"epoch": 0.9443620178041543, |
|
"grad_norm": 8.116730690002441, |
|
"learning_rate": 2.285586954360047e-08, |
|
"loss": 1.3635, |
|
"step": 12730 |
|
}, |
|
{ |
|
"epoch": 0.9451038575667656, |
|
"grad_norm": 8.274658203125, |
|
"learning_rate": 2.225194435550032e-08, |
|
"loss": 1.2573, |
|
"step": 12740 |
|
}, |
|
{ |
|
"epoch": 0.9458456973293768, |
|
"grad_norm": 9.78200912475586, |
|
"learning_rate": 2.1656045555605074e-08, |
|
"loss": 1.4526, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.9465875370919882, |
|
"grad_norm": 9.067741394042969, |
|
"learning_rate": 2.1068176380538373e-08, |
|
"loss": 1.3403, |
|
"step": 12760 |
|
}, |
|
{ |
|
"epoch": 0.9473293768545994, |
|
"grad_norm": 13.13876724243164, |
|
"learning_rate": 2.0488340023312068e-08, |
|
"loss": 1.3285, |
|
"step": 12770 |
|
}, |
|
{ |
|
"epoch": 0.9480712166172107, |
|
"grad_norm": 9.662564277648926, |
|
"learning_rate": 1.9916539633306753e-08, |
|
"loss": 1.1916, |
|
"step": 12780 |
|
}, |
|
{ |
|
"epoch": 0.9488130563798219, |
|
"grad_norm": 8.476212501525879, |
|
"learning_rate": 1.9352778316256258e-08, |
|
"loss": 1.2045, |
|
"step": 12790 |
|
}, |
|
{ |
|
"epoch": 0.9495548961424333, |
|
"grad_norm": 8.053838729858398, |
|
"learning_rate": 1.8797059134230186e-08, |
|
"loss": 1.2306, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.9502967359050445, |
|
"grad_norm": 9.246392250061035, |
|
"learning_rate": 1.8249385105616913e-08, |
|
"loss": 1.3062, |
|
"step": 12810 |
|
}, |
|
{ |
|
"epoch": 0.9510385756676558, |
|
"grad_norm": 8.371253967285156, |
|
"learning_rate": 1.7709759205107923e-08, |
|
"loss": 1.3793, |
|
"step": 12820 |
|
}, |
|
{ |
|
"epoch": 0.951780415430267, |
|
"grad_norm": 7.30432653427124, |
|
"learning_rate": 1.7178184363681182e-08, |
|
"loss": 1.2617, |
|
"step": 12830 |
|
}, |
|
{ |
|
"epoch": 0.9525222551928784, |
|
"grad_norm": 8.539069175720215, |
|
"learning_rate": 1.6654663468585295e-08, |
|
"loss": 1.4355, |
|
"step": 12840 |
|
}, |
|
{ |
|
"epoch": 0.9532640949554896, |
|
"grad_norm": 10.273870468139648, |
|
"learning_rate": 1.6139199363323864e-08, |
|
"loss": 1.4731, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 0.9540059347181009, |
|
"grad_norm": 8.182214736938477, |
|
"learning_rate": 1.5631794847639824e-08, |
|
"loss": 1.3659, |
|
"step": 12860 |
|
}, |
|
{ |
|
"epoch": 0.9547477744807121, |
|
"grad_norm": 11.630437850952148, |
|
"learning_rate": 1.513245267750113e-08, |
|
"loss": 1.2748, |
|
"step": 12870 |
|
}, |
|
{ |
|
"epoch": 0.9554896142433235, |
|
"grad_norm": 10.194863319396973, |
|
"learning_rate": 1.4641175565084265e-08, |
|
"loss": 1.3644, |
|
"step": 12880 |
|
}, |
|
{ |
|
"epoch": 0.9562314540059347, |
|
"grad_norm": 7.40037727355957, |
|
"learning_rate": 1.4157966178761083e-08, |
|
"loss": 1.2678, |
|
"step": 12890 |
|
}, |
|
{ |
|
"epoch": 0.956973293768546, |
|
"grad_norm": 9.2279052734375, |
|
"learning_rate": 1.3682827143082832e-08, |
|
"loss": 1.4593, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.9577151335311572, |
|
"grad_norm": 8.294316291809082, |
|
"learning_rate": 1.3215761038767483e-08, |
|
"loss": 1.2412, |
|
"step": 12910 |
|
}, |
|
{ |
|
"epoch": 0.9584569732937686, |
|
"grad_norm": 7.223811626434326, |
|
"learning_rate": 1.2756770402684081e-08, |
|
"loss": 1.2322, |
|
"step": 12920 |
|
}, |
|
{ |
|
"epoch": 0.9591988130563798, |
|
"grad_norm": 7.492358207702637, |
|
"learning_rate": 1.2305857727840597e-08, |
|
"loss": 1.3891, |
|
"step": 12930 |
|
}, |
|
{ |
|
"epoch": 0.9599406528189911, |
|
"grad_norm": 8.983826637268066, |
|
"learning_rate": 1.186302546336876e-08, |
|
"loss": 1.4126, |
|
"step": 12940 |
|
}, |
|
{ |
|
"epoch": 0.9606824925816023, |
|
"grad_norm": 8.535353660583496, |
|
"learning_rate": 1.1428276014512073e-08, |
|
"loss": 1.1881, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 0.9614243323442137, |
|
"grad_norm": 7.106237888336182, |
|
"learning_rate": 1.1001611742611827e-08, |
|
"loss": 1.292, |
|
"step": 12960 |
|
}, |
|
{ |
|
"epoch": 0.962166172106825, |
|
"grad_norm": 8.555818557739258, |
|
"learning_rate": 1.0583034965095274e-08, |
|
"loss": 1.4155, |
|
"step": 12970 |
|
}, |
|
{ |
|
"epoch": 0.9629080118694362, |
|
"grad_norm": 12.074318885803223, |
|
"learning_rate": 1.0172547955461798e-08, |
|
"loss": 1.4455, |
|
"step": 12980 |
|
}, |
|
{ |
|
"epoch": 0.9636498516320475, |
|
"grad_norm": 8.598979949951172, |
|
"learning_rate": 9.770152943271604e-09, |
|
"loss": 1.3468, |
|
"step": 12990 |
|
}, |
|
{ |
|
"epoch": 0.9643916913946587, |
|
"grad_norm": 9.474443435668945, |
|
"learning_rate": 9.375852114133221e-09, |
|
"loss": 1.2126, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.9643916913946587, |
|
"eval_loss": 1.2897428274154663, |
|
"eval_runtime": 23.7046, |
|
"eval_samples_per_second": 18.815, |
|
"eval_steps_per_second": 9.407, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.9651335311572701, |
|
"grad_norm": 7.944087982177734, |
|
"learning_rate": 8.989647609691342e-09, |
|
"loss": 1.2518, |
|
"step": 13010 |
|
}, |
|
{ |
|
"epoch": 0.9658753709198813, |
|
"grad_norm": 8.350529670715332, |
|
"learning_rate": 8.611541527615508e-09, |
|
"loss": 1.1986, |
|
"step": 13020 |
|
}, |
|
{ |
|
"epoch": 0.9666172106824926, |
|
"grad_norm": 8.835983276367188, |
|
"learning_rate": 8.241535921589106e-09, |
|
"loss": 1.4297, |
|
"step": 13030 |
|
}, |
|
{ |
|
"epoch": 0.9673590504451038, |
|
"grad_norm": 9.169357299804688, |
|
"learning_rate": 7.879632801297387e-09, |
|
"loss": 1.2199, |
|
"step": 13040 |
|
}, |
|
{ |
|
"epoch": 0.9681008902077152, |
|
"grad_norm": 11.830096244812012, |
|
"learning_rate": 7.525834132416976e-09, |
|
"loss": 1.3442, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 0.9688427299703264, |
|
"grad_norm": 8.521605491638184, |
|
"learning_rate": 7.180141836605536e-09, |
|
"loss": 1.3721, |
|
"step": 13060 |
|
}, |
|
{ |
|
"epoch": 0.9695845697329377, |
|
"grad_norm": 8.605573654174805, |
|
"learning_rate": 6.842557791490122e-09, |
|
"loss": 1.3636, |
|
"step": 13070 |
|
}, |
|
{ |
|
"epoch": 0.9703264094955489, |
|
"grad_norm": 7.742245197296143, |
|
"learning_rate": 6.513083830659017e-09, |
|
"loss": 1.2094, |
|
"step": 13080 |
|
}, |
|
{ |
|
"epoch": 0.9710682492581603, |
|
"grad_norm": 9.237808227539062, |
|
"learning_rate": 6.19172174364957e-09, |
|
"loss": 1.2527, |
|
"step": 13090 |
|
}, |
|
{ |
|
"epoch": 0.9718100890207715, |
|
"grad_norm": 8.128382682800293, |
|
"learning_rate": 5.878473275940044e-09, |
|
"loss": 1.1039, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.9725519287833828, |
|
"grad_norm": 9.049505233764648, |
|
"learning_rate": 5.573340128939286e-09, |
|
"loss": 1.3061, |
|
"step": 13110 |
|
}, |
|
{ |
|
"epoch": 0.973293768545994, |
|
"grad_norm": 8.908202171325684, |
|
"learning_rate": 5.276323959978235e-09, |
|
"loss": 1.3528, |
|
"step": 13120 |
|
}, |
|
{ |
|
"epoch": 0.9740356083086054, |
|
"grad_norm": 10.514373779296875, |
|
"learning_rate": 4.987426382299598e-09, |
|
"loss": 1.2388, |
|
"step": 13130 |
|
}, |
|
{ |
|
"epoch": 0.9747774480712166, |
|
"grad_norm": 8.527087211608887, |
|
"learning_rate": 4.706648965051019e-09, |
|
"loss": 1.249, |
|
"step": 13140 |
|
}, |
|
{ |
|
"epoch": 0.9755192878338279, |
|
"grad_norm": 8.145854949951172, |
|
"learning_rate": 4.433993233274591e-09, |
|
"loss": 1.1806, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 0.9762611275964391, |
|
"grad_norm": 7.99056339263916, |
|
"learning_rate": 4.169460667900027e-09, |
|
"loss": 1.2609, |
|
"step": 13160 |
|
}, |
|
{ |
|
"epoch": 0.9770029673590505, |
|
"grad_norm": 8.545060157775879, |
|
"learning_rate": 3.913052705735997e-09, |
|
"loss": 1.3828, |
|
"step": 13170 |
|
}, |
|
{ |
|
"epoch": 0.9777448071216617, |
|
"grad_norm": 8.997559547424316, |
|
"learning_rate": 3.6647707394619756e-09, |
|
"loss": 1.2281, |
|
"step": 13180 |
|
}, |
|
{ |
|
"epoch": 0.978486646884273, |
|
"grad_norm": 8.990921974182129, |
|
"learning_rate": 3.4246161176217372e-09, |
|
"loss": 1.3476, |
|
"step": 13190 |
|
}, |
|
{ |
|
"epoch": 0.9792284866468842, |
|
"grad_norm": 7.651655673980713, |
|
"learning_rate": 3.1925901446148707e-09, |
|
"loss": 1.2981, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.9799703264094956, |
|
"grad_norm": 8.907322883605957, |
|
"learning_rate": 2.9686940806904485e-09, |
|
"loss": 1.517, |
|
"step": 13210 |
|
}, |
|
{ |
|
"epoch": 0.9807121661721068, |
|
"grad_norm": 9.220152854919434, |
|
"learning_rate": 2.752929141939864e-09, |
|
"loss": 1.2845, |
|
"step": 13220 |
|
}, |
|
{ |
|
"epoch": 0.9814540059347181, |
|
"grad_norm": 7.591921329498291, |
|
"learning_rate": 2.5452965002903396e-09, |
|
"loss": 1.2068, |
|
"step": 13230 |
|
}, |
|
{ |
|
"epoch": 0.9821958456973294, |
|
"grad_norm": 8.76726245880127, |
|
"learning_rate": 2.34579728349843e-09, |
|
"loss": 1.2462, |
|
"step": 13240 |
|
}, |
|
{ |
|
"epoch": 0.9829376854599406, |
|
"grad_norm": 8.317231178283691, |
|
"learning_rate": 2.154432575144194e-09, |
|
"loss": 1.3612, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 0.983679525222552, |
|
"grad_norm": 9.405437469482422, |
|
"learning_rate": 1.9712034146250336e-09, |
|
"loss": 1.2786, |
|
"step": 13260 |
|
}, |
|
{ |
|
"epoch": 0.9844213649851632, |
|
"grad_norm": 7.619749069213867, |
|
"learning_rate": 1.7961107971498635e-09, |
|
"loss": 1.2626, |
|
"step": 13270 |
|
}, |
|
{ |
|
"epoch": 0.9851632047477745, |
|
"grad_norm": 13.960756301879883, |
|
"learning_rate": 1.6291556737344503e-09, |
|
"loss": 1.432, |
|
"step": 13280 |
|
}, |
|
{ |
|
"epoch": 0.9859050445103857, |
|
"grad_norm": 9.093308448791504, |
|
"learning_rate": 1.4703389511955822e-09, |
|
"loss": 1.3687, |
|
"step": 13290 |
|
}, |
|
{ |
|
"epoch": 0.9866468842729971, |
|
"grad_norm": 7.0503458976745605, |
|
"learning_rate": 1.319661492145907e-09, |
|
"loss": 1.3628, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.9873887240356083, |
|
"grad_norm": 9.696161270141602, |
|
"learning_rate": 1.1771241149901024e-09, |
|
"loss": 1.3019, |
|
"step": 13310 |
|
}, |
|
{ |
|
"epoch": 0.9881305637982196, |
|
"grad_norm": 9.714421272277832, |
|
"learning_rate": 1.0427275939200453e-09, |
|
"loss": 1.3525, |
|
"step": 13320 |
|
}, |
|
{ |
|
"epoch": 0.9888724035608308, |
|
"grad_norm": 9.047686576843262, |
|
"learning_rate": 9.164726589103167e-10, |
|
"loss": 1.208, |
|
"step": 13330 |
|
}, |
|
{ |
|
"epoch": 0.9896142433234422, |
|
"grad_norm": 10.157636642456055, |
|
"learning_rate": 7.983599957147036e-10, |
|
"loss": 1.393, |
|
"step": 13340 |
|
}, |
|
{ |
|
"epoch": 0.9903560830860534, |
|
"grad_norm": 8.367962837219238, |
|
"learning_rate": 6.883902458618696e-10, |
|
"loss": 1.2299, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 0.9910979228486647, |
|
"grad_norm": 8.495455741882324, |
|
"learning_rate": 5.865640066525235e-10, |
|
"loss": 1.3572, |
|
"step": 13360 |
|
}, |
|
{ |
|
"epoch": 0.9918397626112759, |
|
"grad_norm": 7.387685298919678, |
|
"learning_rate": 4.92881831156089e-10, |
|
"loss": 1.2902, |
|
"step": 13370 |
|
}, |
|
{ |
|
"epoch": 0.9925816023738873, |
|
"grad_norm": 8.251172065734863, |
|
"learning_rate": 4.073442282070405e-10, |
|
"loss": 1.2775, |
|
"step": 13380 |
|
}, |
|
{ |
|
"epoch": 0.9933234421364985, |
|
"grad_norm": 7.994820594787598, |
|
"learning_rate": 3.2995166240290533e-10, |
|
"loss": 1.1886, |
|
"step": 13390 |
|
}, |
|
{ |
|
"epoch": 0.9940652818991098, |
|
"grad_norm": 9.602749824523926, |
|
"learning_rate": 2.6070455410159843e-10, |
|
"loss": 1.2908, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.994807121661721, |
|
"grad_norm": 8.252080917358398, |
|
"learning_rate": 1.996032794184255e-10, |
|
"loss": 1.3749, |
|
"step": 13410 |
|
}, |
|
{ |
|
"epoch": 0.9955489614243324, |
|
"grad_norm": 8.841800689697266, |
|
"learning_rate": 1.4664817022508326e-10, |
|
"loss": 1.3027, |
|
"step": 13420 |
|
}, |
|
{ |
|
"epoch": 0.9962908011869436, |
|
"grad_norm": 7.5887370109558105, |
|
"learning_rate": 1.0183951414732828e-10, |
|
"loss": 1.3064, |
|
"step": 13430 |
|
}, |
|
{ |
|
"epoch": 0.9970326409495549, |
|
"grad_norm": 8.982939720153809, |
|
"learning_rate": 6.517755456331153e-11, |
|
"loss": 1.3691, |
|
"step": 13440 |
|
}, |
|
{ |
|
"epoch": 0.9977744807121661, |
|
"grad_norm": 8.62787914276123, |
|
"learning_rate": 3.666249060241267e-11, |
|
"loss": 1.2796, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 0.9985163204747775, |
|
"grad_norm": 8.360151290893555, |
|
"learning_rate": 1.6294477144074282e-11, |
|
"loss": 1.2975, |
|
"step": 13460 |
|
}, |
|
{ |
|
"epoch": 0.9992581602373887, |
|
"grad_norm": 8.118022918701172, |
|
"learning_rate": 4.073624817468868e-12, |
|
"loss": 1.2711, |
|
"step": 13470 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 7.892242908477783, |
|
"learning_rate": 0.0, |
|
"loss": 1.2698, |
|
"step": 13480 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 13480, |
|
"total_flos": 3.3534568071535e+17, |
|
"train_loss": 1.3545556901116753, |
|
"train_runtime": 10924.2621, |
|
"train_samples_per_second": 2.468, |
|
"train_steps_per_second": 1.234 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 13480, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3534568071535e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|