|
{ |
|
"best_metric": 0.31903526186943054, |
|
"best_model_checkpoint": "./convnext-base-15ep/checkpoint-15386", |
|
"epoch": 15.0, |
|
"eval_steps": 500, |
|
"global_step": 16485, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 12.268571853637695, |
|
"learning_rate": 9.999092077649917e-05, |
|
"loss": 1.8816, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 16.745285034179688, |
|
"learning_rate": 9.996368640328861e-05, |
|
"loss": 0.9784, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 15.278342247009277, |
|
"learning_rate": 9.991830677104683e-05, |
|
"loss": 0.8074, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 12.999685287475586, |
|
"learning_rate": 9.985479836024671e-05, |
|
"loss": 0.6924, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 15.637765884399414, |
|
"learning_rate": 9.977318423517052e-05, |
|
"loss": 0.6877, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 8.964929580688477, |
|
"learning_rate": 9.967349403553353e-05, |
|
"loss": 0.6085, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 20.98185157775879, |
|
"learning_rate": 9.95557639657199e-05, |
|
"loss": 0.6267, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 17.35112953186035, |
|
"learning_rate": 9.942003678163429e-05, |
|
"loss": 0.6133, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.6055192947387695, |
|
"learning_rate": 9.926636177517427e-05, |
|
"loss": 0.5813, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 10.87624454498291, |
|
"learning_rate": 9.909479475632904e-05, |
|
"loss": 0.5717, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.8572564612326043, |
|
"eval_loss": 0.4616268575191498, |
|
"eval_runtime": 111.9566, |
|
"eval_samples_per_second": 22.464, |
|
"eval_steps_per_second": 1.411, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 12.744146347045898, |
|
"learning_rate": 9.8905398032911e-05, |
|
"loss": 0.5917, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 7.046997547149658, |
|
"learning_rate": 9.869824038792741e-05, |
|
"loss": 0.4628, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 14.269908905029297, |
|
"learning_rate": 9.847339705460064e-05, |
|
"loss": 0.4921, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 13.306988716125488, |
|
"learning_rate": 9.823094968904572e-05, |
|
"loss": 0.474, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 11.327611923217773, |
|
"learning_rate": 9.797098634061542e-05, |
|
"loss": 0.4723, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 10.74616813659668, |
|
"learning_rate": 9.769360141992343e-05, |
|
"loss": 0.4207, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 12.389139175415039, |
|
"learning_rate": 9.739889566455738e-05, |
|
"loss": 0.492, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 13.068366050720215, |
|
"learning_rate": 9.708697610249406e-05, |
|
"loss": 0.4583, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 6.97837495803833, |
|
"learning_rate": 9.675795601323023e-05, |
|
"loss": 0.4619, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 11.903929710388184, |
|
"learning_rate": 9.641195488664292e-05, |
|
"loss": 0.4033, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 8.415858268737793, |
|
"learning_rate": 9.604909837959455e-05, |
|
"loss": 0.4653, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8970178926441352, |
|
"eval_loss": 0.36072757840156555, |
|
"eval_runtime": 111.9718, |
|
"eval_samples_per_second": 22.461, |
|
"eval_steps_per_second": 1.411, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 19.435712814331055, |
|
"learning_rate": 9.566951827029816e-05, |
|
"loss": 0.442, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 6.7451982498168945, |
|
"learning_rate": 9.52733524104597e-05, |
|
"loss": 0.3708, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 15.736494064331055, |
|
"learning_rate": 9.486074467521456e-05, |
|
"loss": 0.3583, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 10.238139152526855, |
|
"learning_rate": 9.44318449108766e-05, |
|
"loss": 0.3513, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 5.870657444000244, |
|
"learning_rate": 9.398680888051863e-05, |
|
"loss": 0.4243, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 10.592901229858398, |
|
"learning_rate": 9.352579820740405e-05, |
|
"loss": 0.3681, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.5265157222747803, |
|
"learning_rate": 9.304898031629036e-05, |
|
"loss": 0.3971, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 20.8736572265625, |
|
"learning_rate": 9.25565283726257e-05, |
|
"loss": 0.4244, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.61696457862854, |
|
"learning_rate": 9.204862121966044e-05, |
|
"loss": 0.3762, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 11.449930191040039, |
|
"learning_rate": 9.152544331349694e-05, |
|
"loss": 0.3584, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 9.69878101348877, |
|
"learning_rate": 9.098718465610088e-05, |
|
"loss": 0.3449, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8950298210735587, |
|
"eval_loss": 0.41042593121528625, |
|
"eval_runtime": 111.0632, |
|
"eval_samples_per_second": 22.645, |
|
"eval_steps_per_second": 1.423, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.2799901962280273, |
|
"learning_rate": 9.043404072629829e-05, |
|
"loss": 0.394, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 11.792191505432129, |
|
"learning_rate": 8.986621240878385e-05, |
|
"loss": 0.2869, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 1.5572706460952759, |
|
"learning_rate": 8.928390592116575e-05, |
|
"loss": 0.2879, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 7.704630374908447, |
|
"learning_rate": 8.86873327390739e-05, |
|
"loss": 0.3059, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 10.151177406311035, |
|
"learning_rate": 8.807670951935846e-05, |
|
"loss": 0.3329, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 9.523168563842773, |
|
"learning_rate": 8.745225802140691e-05, |
|
"loss": 0.287, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 9.30478572845459, |
|
"learning_rate": 8.681420502660786e-05, |
|
"loss": 0.2992, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 7.661346912384033, |
|
"learning_rate": 8.616278225599111e-05, |
|
"loss": 0.3259, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 5.623369216918945, |
|
"learning_rate": 8.54982262860738e-05, |
|
"loss": 0.334, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.08087614178657532, |
|
"learning_rate": 8.482077846294308e-05, |
|
"loss": 0.3385, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 12.912845611572266, |
|
"learning_rate": 8.413068481460686e-05, |
|
"loss": 0.3522, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9025844930417495, |
|
"eval_loss": 0.37546443939208984, |
|
"eval_runtime": 111.8065, |
|
"eval_samples_per_second": 22.494, |
|
"eval_steps_per_second": 1.413, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.0130882263183594, |
|
"learning_rate": 8.342819596164387e-05, |
|
"loss": 0.2963, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 9.422698020935059, |
|
"learning_rate": 8.271356702618626e-05, |
|
"loss": 0.263, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 4.466526985168457, |
|
"learning_rate": 8.198705753926704e-05, |
|
"loss": 0.2767, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 8.486282348632812, |
|
"learning_rate": 8.12489313465665e-05, |
|
"loss": 0.2691, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 1.178945541381836, |
|
"learning_rate": 8.049945651259163e-05, |
|
"loss": 0.2454, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 8.335172653198242, |
|
"learning_rate": 7.973890522332348e-05, |
|
"loss": 0.2906, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 3.436420440673828, |
|
"learning_rate": 7.89675536873676e-05, |
|
"loss": 0.2316, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 7.467193603515625, |
|
"learning_rate": 7.818568203564374e-05, |
|
"loss": 0.2738, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 10.7122163772583, |
|
"learning_rate": 7.739357421965086e-05, |
|
"loss": 0.281, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 14.049674034118652, |
|
"learning_rate": 7.65915179083449e-05, |
|
"loss": 0.2782, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 16.04802703857422, |
|
"learning_rate": 7.577980438366628e-05, |
|
"loss": 0.28, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9065606361829026, |
|
"eval_loss": 0.3756468892097473, |
|
"eval_runtime": 111.8227, |
|
"eval_samples_per_second": 22.491, |
|
"eval_steps_per_second": 1.413, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 2.5667672157287598, |
|
"learning_rate": 7.495872843475536e-05, |
|
"loss": 0.2711, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 5.182483673095703, |
|
"learning_rate": 7.412858825089422e-05, |
|
"loss": 0.1939, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 13.568757057189941, |
|
"learning_rate": 7.32896853132135e-05, |
|
"loss": 0.2178, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 10.223276138305664, |
|
"learning_rate": 7.244232428520383e-05, |
|
"loss": 0.2185, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 0.07271099835634232, |
|
"learning_rate": 7.158681290207163e-05, |
|
"loss": 0.2218, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 2.859640598297119, |
|
"learning_rate": 7.07234618589791e-05, |
|
"loss": 0.1974, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 4.893543243408203, |
|
"learning_rate": 6.985258469820939e-05, |
|
"loss": 0.2201, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 9.013633728027344, |
|
"learning_rate": 6.897449769529792e-05, |
|
"loss": 0.2031, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 8.197441101074219, |
|
"learning_rate": 6.808951974417078e-05, |
|
"loss": 0.2532, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 0.8769248127937317, |
|
"learning_rate": 6.719797224133242e-05, |
|
"loss": 0.2399, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 9.442893981933594, |
|
"learning_rate": 6.630017896914446e-05, |
|
"loss": 0.2456, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9172962226640159, |
|
"eval_loss": 0.349565327167511, |
|
"eval_runtime": 110.6537, |
|
"eval_samples_per_second": 22.729, |
|
"eval_steps_per_second": 1.428, |
|
"step": 6594 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 9.589459419250488, |
|
"learning_rate": 6.539646597823791e-05, |
|
"loss": 0.2373, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 11.073722839355469, |
|
"learning_rate": 6.44871614691018e-05, |
|
"loss": 0.1756, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 15.033493995666504, |
|
"learning_rate": 6.357259567289082e-05, |
|
"loss": 0.1883, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 2.1004364490509033, |
|
"learning_rate": 6.265310073149584e-05, |
|
"loss": 0.1599, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 2.42246675491333, |
|
"learning_rate": 6.172901057692007e-05, |
|
"loss": 0.2066, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 5.9083147048950195, |
|
"learning_rate": 6.0800660810005416e-05, |
|
"loss": 0.1966, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 2.3564085960388184, |
|
"learning_rate": 5.9868388578552734e-05, |
|
"loss": 0.1641, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 0.16256462037563324, |
|
"learning_rate": 5.893253245488015e-05, |
|
"loss": 0.1643, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 6.537099838256836, |
|
"learning_rate": 5.79934323128641e-05, |
|
"loss": 0.1932, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 9.104645729064941, |
|
"learning_rate": 5.705142920450777e-05, |
|
"loss": 0.1993, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 13.837006568908691, |
|
"learning_rate": 5.610686523608151e-05, |
|
"loss": 0.2141, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.920079522862823, |
|
"eval_loss": 0.3611968457698822, |
|
"eval_runtime": 111.0708, |
|
"eval_samples_per_second": 22.643, |
|
"eval_steps_per_second": 1.423, |
|
"step": 7693 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 13.015904426574707, |
|
"learning_rate": 5.516008344388053e-05, |
|
"loss": 0.1581, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 5.245505332946777, |
|
"learning_rate": 5.421142766964474e-05, |
|
"loss": 0.1366, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 1.4385249614715576, |
|
"learning_rate": 5.326124243568617e-05, |
|
"loss": 0.1867, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 1.0775200128555298, |
|
"learning_rate": 5.230987281976901e-05, |
|
"loss": 0.1415, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 1.4409250020980835, |
|
"learning_rate": 5.135766432978829e-05, |
|
"loss": 0.2004, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 0.05134790390729904, |
|
"learning_rate": 5.0404962778292e-05, |
|
"loss": 0.159, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 7.336070537567139, |
|
"learning_rate": 4.945211415689278e-05, |
|
"loss": 0.1578, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.08769059181213379, |
|
"learning_rate": 4.849946451061443e-05, |
|
"loss": 0.1883, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 12.924542427062988, |
|
"learning_rate": 4.754735981221927e-05, |
|
"loss": 0.1717, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 6.11922025680542, |
|
"learning_rate": 4.659614583656138e-05, |
|
"loss": 0.1557, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 20.91149139404297, |
|
"learning_rate": 4.564616803501205e-05, |
|
"loss": 0.1458, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9304174950298211, |
|
"eval_loss": 0.33909356594085693, |
|
"eval_runtime": 111.9164, |
|
"eval_samples_per_second": 22.472, |
|
"eval_steps_per_second": 1.412, |
|
"step": 8792 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 2.7522988319396973, |
|
"learning_rate": 4.469777141000255e-05, |
|
"loss": 0.1309, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 8.19546890258789, |
|
"learning_rate": 4.375130038972988e-05, |
|
"loss": 0.1502, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 3.5378799438476562, |
|
"learning_rate": 4.2807098703071255e-05, |
|
"loss": 0.1528, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 8.663042068481445, |
|
"learning_rate": 4.18655092547524e-05, |
|
"loss": 0.0897, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 0.019041290506720543, |
|
"learning_rate": 4.092687400081522e-05, |
|
"loss": 0.16, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 0.053332068026065826, |
|
"learning_rate": 3.999153382442995e-05, |
|
"loss": 0.1378, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 5.864047050476074, |
|
"learning_rate": 3.9059828412097024e-05, |
|
"loss": 0.1176, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 11.58228588104248, |
|
"learning_rate": 3.8132096130283455e-05, |
|
"loss": 0.1336, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 0.3950587511062622, |
|
"learning_rate": 3.7208673902538706e-05, |
|
"loss": 0.1525, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 9.62333869934082, |
|
"learning_rate": 3.628989708713436e-05, |
|
"loss": 0.1565, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 12.212615966796875, |
|
"learning_rate": 3.537609935527264e-05, |
|
"loss": 0.1842, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.932803180914513, |
|
"eval_loss": 0.33526965975761414, |
|
"eval_runtime": 111.0268, |
|
"eval_samples_per_second": 22.652, |
|
"eval_steps_per_second": 1.423, |
|
"step": 9891 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 2.527280807495117, |
|
"learning_rate": 3.446761256990723e-05, |
|
"loss": 0.1074, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 0.02450253628194332, |
|
"learning_rate": 3.356476666522099e-05, |
|
"loss": 0.1001, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 0.32657700777053833, |
|
"learning_rate": 3.266788952680414e-05, |
|
"loss": 0.1094, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 2.115306854248047, |
|
"learning_rate": 3.177730687257639e-05, |
|
"loss": 0.1069, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 0.252822607755661, |
|
"learning_rate": 3.0893342134496295e-05, |
|
"loss": 0.1074, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 10.717082023620605, |
|
"learning_rate": 3.0016316341100808e-05, |
|
"loss": 0.1075, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 0.030081748962402344, |
|
"learning_rate": 2.914654800091768e-05, |
|
"loss": 0.1344, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 13.859551429748535, |
|
"learning_rate": 2.8284352986793094e-05, |
|
"loss": 0.1205, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 1.8139934539794922, |
|
"learning_rate": 2.7430044421176447e-05, |
|
"loss": 0.1253, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 1.271942377090454, |
|
"learning_rate": 2.6583932562403957e-05, |
|
"loss": 0.1226, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 0.060803137719631195, |
|
"learning_rate": 2.5746324692022527e-05, |
|
"loss": 0.1037, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9355864811133201, |
|
"eval_loss": 0.33833950757980347, |
|
"eval_runtime": 111.2929, |
|
"eval_samples_per_second": 22.598, |
|
"eval_steps_per_second": 1.42, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 10.01, |
|
"grad_norm": 0.0546095035970211, |
|
"learning_rate": 2.4917525003194624e-05, |
|
"loss": 0.1162, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 10.1, |
|
"grad_norm": 6.095231533050537, |
|
"learning_rate": 2.409783449022475e-05, |
|
"loss": 0.0934, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 10.19, |
|
"grad_norm": 0.6238592267036438, |
|
"learning_rate": 2.3287550839247624e-05, |
|
"loss": 0.087, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 10.28, |
|
"grad_norm": 24.5279598236084, |
|
"learning_rate": 2.2486968320117907e-05, |
|
"loss": 0.0873, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 10.37, |
|
"grad_norm": 2.309201240539551, |
|
"learning_rate": 2.169637767954048e-05, |
|
"loss": 0.0853, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 10.46, |
|
"grad_norm": 0.10698918998241425, |
|
"learning_rate": 2.091606603548029e-05, |
|
"loss": 0.0994, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 10.56, |
|
"grad_norm": 0.09092956781387329, |
|
"learning_rate": 2.0146316772889983e-05, |
|
"loss": 0.0704, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"grad_norm": 10.446697235107422, |
|
"learning_rate": 1.9387409440793386e-05, |
|
"loss": 0.0918, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 10.74, |
|
"grad_norm": 8.398408889770508, |
|
"learning_rate": 1.863961965076186e-05, |
|
"loss": 0.096, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 10.83, |
|
"grad_norm": 0.9171711802482605, |
|
"learning_rate": 1.790321897682083e-05, |
|
"loss": 0.1025, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"grad_norm": 0.2961582839488983, |
|
"learning_rate": 1.7178474856822456e-05, |
|
"loss": 0.0747, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.936779324055666, |
|
"eval_loss": 0.33453264832496643, |
|
"eval_runtime": 109.702, |
|
"eval_samples_per_second": 22.926, |
|
"eval_steps_per_second": 1.44, |
|
"step": 12089 |
|
}, |
|
{ |
|
"epoch": 11.01, |
|
"grad_norm": 0.05641782283782959, |
|
"learning_rate": 1.646565049532063e-05, |
|
"loss": 0.0575, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 11.1, |
|
"grad_norm": 0.008614973165094852, |
|
"learning_rate": 1.576500476798311e-05, |
|
"loss": 0.0942, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 11.19, |
|
"grad_norm": 0.0614984966814518, |
|
"learning_rate": 1.5076792127576073e-05, |
|
"loss": 0.0737, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 11.28, |
|
"grad_norm": 0.6600818037986755, |
|
"learning_rate": 1.4401262511554642e-05, |
|
"loss": 0.0601, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 11.37, |
|
"grad_norm": 7.176374435424805, |
|
"learning_rate": 1.3738661251293423e-05, |
|
"loss": 0.0739, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 11.46, |
|
"grad_norm": 0.5290641784667969, |
|
"learning_rate": 1.308922898298977e-05, |
|
"loss": 0.069, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 0.010565654374659061, |
|
"learning_rate": 1.2453201560272204e-05, |
|
"loss": 0.0818, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 11.65, |
|
"grad_norm": 0.48881271481513977, |
|
"learning_rate": 1.183080996854562e-05, |
|
"loss": 0.0751, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 11.74, |
|
"grad_norm": 0.5463552474975586, |
|
"learning_rate": 1.1222280241104716e-05, |
|
"loss": 0.0684, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 5.191521167755127, |
|
"learning_rate": 1.062783337704557e-05, |
|
"loss": 0.1047, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 11.92, |
|
"grad_norm": 0.0762176662683487, |
|
"learning_rate": 1.0047685261005707e-05, |
|
"loss": 0.0912, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.9391650099403579, |
|
"eval_loss": 0.3244304656982422, |
|
"eval_runtime": 112.1314, |
|
"eval_samples_per_second": 22.429, |
|
"eval_steps_per_second": 1.409, |
|
"step": 13188 |
|
}, |
|
{ |
|
"epoch": 12.01, |
|
"grad_norm": 0.017082059755921364, |
|
"learning_rate": 9.482046584761495e-06, |
|
"loss": 0.0909, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 16.901845932006836, |
|
"learning_rate": 8.931122770711425e-06, |
|
"loss": 0.0827, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 12.19, |
|
"grad_norm": 0.1826079934835434, |
|
"learning_rate": 8.395113897273105e-06, |
|
"loss": 0.086, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 12.28, |
|
"grad_norm": 0.14454180002212524, |
|
"learning_rate": 7.874214626220899e-06, |
|
"loss": 0.0624, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"grad_norm": 3.5157103538513184, |
|
"learning_rate": 7.368614131990986e-06, |
|
"loss": 0.1045, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 12.47, |
|
"grad_norm": 0.0011518648825585842, |
|
"learning_rate": 6.8784960329789264e-06, |
|
"loss": 0.0802, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 12.56, |
|
"grad_norm": 1.2086189985275269, |
|
"learning_rate": 6.404038324855222e-06, |
|
"loss": 0.0515, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 12.65, |
|
"grad_norm": 5.540249824523926, |
|
"learning_rate": 5.945413315922826e-06, |
|
"loss": 0.0687, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 12.74, |
|
"grad_norm": 9.239886283874512, |
|
"learning_rate": 5.5027875645401015e-06, |
|
"loss": 0.0625, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.83, |
|
"grad_norm": 0.017092958092689514, |
|
"learning_rate": 5.076321818632018e-06, |
|
"loss": 0.1087, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 12.92, |
|
"grad_norm": 9.08121109008789, |
|
"learning_rate": 4.666170957311472e-06, |
|
"loss": 0.0733, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.9407554671968191, |
|
"eval_loss": 0.32192325592041016, |
|
"eval_runtime": 110.2447, |
|
"eval_samples_per_second": 22.813, |
|
"eval_steps_per_second": 1.433, |
|
"step": 14287 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"grad_norm": 0.021881932392716408, |
|
"learning_rate": 4.272483934632021e-06, |
|
"loss": 0.0548, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 13.1, |
|
"grad_norm": 0.6576229929924011, |
|
"learning_rate": 3.895403725492402e-06, |
|
"loss": 0.059, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 13.19, |
|
"grad_norm": 5.590795516967773, |
|
"learning_rate": 3.5350672737124725e-06, |
|
"loss": 0.092, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 0.2879085838794708, |
|
"learning_rate": 3.1916054422994834e-06, |
|
"loss": 0.0642, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 13.38, |
|
"grad_norm": 0.1755078285932541, |
|
"learning_rate": 2.86514296592269e-06, |
|
"loss": 0.069, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 13.47, |
|
"grad_norm": 11.491580963134766, |
|
"learning_rate": 2.5557984056135964e-06, |
|
"loss": 0.0692, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 13.56, |
|
"grad_norm": 0.22245600819587708, |
|
"learning_rate": 2.263684105708275e-06, |
|
"loss": 0.0843, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 13.65, |
|
"grad_norm": 19.67642593383789, |
|
"learning_rate": 1.9889061530473986e-06, |
|
"loss": 0.058, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 13.74, |
|
"grad_norm": 6.375917434692383, |
|
"learning_rate": 1.7315643384487713e-06, |
|
"loss": 0.0828, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 13.83, |
|
"grad_norm": 0.5529243350028992, |
|
"learning_rate": 1.4917521204664331e-06, |
|
"loss": 0.0743, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 13.92, |
|
"grad_norm": 6.179434299468994, |
|
"learning_rate": 1.269556591449389e-06, |
|
"loss": 0.0667, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.9435387673956263, |
|
"eval_loss": 0.31903526186943054, |
|
"eval_runtime": 110.0813, |
|
"eval_samples_per_second": 22.847, |
|
"eval_steps_per_second": 1.435, |
|
"step": 15386 |
|
}, |
|
{ |
|
"epoch": 14.01, |
|
"grad_norm": 2.8753578662872314, |
|
"learning_rate": 1.065058445912398e-06, |
|
"loss": 0.0588, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 14.1, |
|
"grad_norm": 0.04811515659093857, |
|
"learning_rate": 8.783319512302102e-07, |
|
"loss": 0.0608, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 14.19, |
|
"grad_norm": 7.1183762550354, |
|
"learning_rate": 7.094449206659748e-07, |
|
"loss": 0.0768, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 14.29, |
|
"grad_norm": 0.26764577627182007, |
|
"learning_rate": 5.584586887435739e-07, |
|
"loss": 0.0713, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 14.38, |
|
"grad_norm": 5.16556978225708, |
|
"learning_rate": 4.254280889728068e-07, |
|
"loss": 0.0687, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"grad_norm": 0.2050618976354599, |
|
"learning_rate": 3.104014339355921e-07, |
|
"loss": 0.0591, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 14.56, |
|
"grad_norm": 5.02229642868042, |
|
"learning_rate": 2.1342049774030758e-07, |
|
"loss": 0.0687, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 14.65, |
|
"grad_norm": 0.027897778898477554, |
|
"learning_rate": 1.3452050085075442e-07, |
|
"loss": 0.0534, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 14.74, |
|
"grad_norm": 14.302966117858887, |
|
"learning_rate": 7.37300972951771e-08, |
|
"loss": 0.0811, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 14.83, |
|
"grad_norm": 0.017526021227240562, |
|
"learning_rate": 3.107136425999912e-08, |
|
"loss": 0.0723, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 14.92, |
|
"grad_norm": 4.576681137084961, |
|
"learning_rate": 6.559794072080738e-09, |
|
"loss": 0.0694, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.9431411530815109, |
|
"eval_loss": 0.31916290521621704, |
|
"eval_runtime": 106.1086, |
|
"eval_samples_per_second": 23.702, |
|
"eval_steps_per_second": 1.489, |
|
"step": 16485 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"step": 16485, |
|
"total_flos": 6.140249030814106e+19, |
|
"train_loss": 0.2272892904238228, |
|
"train_runtime": 26538.9058, |
|
"train_samples_per_second": 9.937, |
|
"train_steps_per_second": 0.621 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 16485, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 500, |
|
"total_flos": 6.140249030814106e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|