|
{ |
|
"best_metric": 0.9820952380952381, |
|
"best_model_checkpoint": "dinov2-large-cluster-finetune-linear-probe-trueface/checkpoint-2952", |
|
"epoch": 1.9989842559674962, |
|
"eval_steps": 500, |
|
"global_step": 2952, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006771626883358727, |
|
"grad_norm": 5.265126705169678, |
|
"learning_rate": 1.6891891891891894e-06, |
|
"loss": 0.8799, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.013543253766717453, |
|
"grad_norm": 6.357407093048096, |
|
"learning_rate": 3.3783783783783788e-06, |
|
"loss": 0.8361, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02031488065007618, |
|
"grad_norm": 4.737895965576172, |
|
"learning_rate": 5.067567567567568e-06, |
|
"loss": 0.8673, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.027086507533434907, |
|
"grad_norm": 5.784124374389648, |
|
"learning_rate": 6.7567567567567575e-06, |
|
"loss": 0.87, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03385813441679363, |
|
"grad_norm": 6.456064701080322, |
|
"learning_rate": 8.445945945945946e-06, |
|
"loss": 0.8188, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04062976130015236, |
|
"grad_norm": 6.111750602722168, |
|
"learning_rate": 1.0135135135135136e-05, |
|
"loss": 0.8241, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04740138818351109, |
|
"grad_norm": 4.581135272979736, |
|
"learning_rate": 1.1824324324324325e-05, |
|
"loss": 0.8163, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05417301506686981, |
|
"grad_norm": 4.400207996368408, |
|
"learning_rate": 1.3513513513513515e-05, |
|
"loss": 0.8066, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06094464195022854, |
|
"grad_norm": 4.919775009155273, |
|
"learning_rate": 1.5202702702702704e-05, |
|
"loss": 0.7875, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06771626883358726, |
|
"grad_norm": 4.747180461883545, |
|
"learning_rate": 1.6891891891891892e-05, |
|
"loss": 0.7627, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.074487895716946, |
|
"grad_norm": 4.32789421081543, |
|
"learning_rate": 1.8581081081081082e-05, |
|
"loss": 0.7603, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08125952260030472, |
|
"grad_norm": 4.564222812652588, |
|
"learning_rate": 2.0270270270270273e-05, |
|
"loss": 0.7323, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.08803114948366345, |
|
"grad_norm": 4.640108108520508, |
|
"learning_rate": 2.195945945945946e-05, |
|
"loss": 0.7282, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09480277636702218, |
|
"grad_norm": 4.069850921630859, |
|
"learning_rate": 2.364864864864865e-05, |
|
"loss": 0.6938, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1015744032503809, |
|
"grad_norm": 4.110805988311768, |
|
"learning_rate": 2.533783783783784e-05, |
|
"loss": 0.6607, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.10834603013373963, |
|
"grad_norm": 4.257925510406494, |
|
"learning_rate": 2.702702702702703e-05, |
|
"loss": 0.6171, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11511765701709836, |
|
"grad_norm": 3.9571423530578613, |
|
"learning_rate": 2.8716216216216217e-05, |
|
"loss": 0.6084, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12188928390045708, |
|
"grad_norm": 3.1258833408355713, |
|
"learning_rate": 3.0405405405405407e-05, |
|
"loss": 0.5765, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1286609107838158, |
|
"grad_norm": 3.235241651535034, |
|
"learning_rate": 3.20945945945946e-05, |
|
"loss": 0.5534, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.13543253766717453, |
|
"grad_norm": 4.166315078735352, |
|
"learning_rate": 3.3783783783783784e-05, |
|
"loss": 0.5382, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14220416455053325, |
|
"grad_norm": 3.878629446029663, |
|
"learning_rate": 3.547297297297297e-05, |
|
"loss": 0.5036, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.148975791433892, |
|
"grad_norm": 2.8281824588775635, |
|
"learning_rate": 3.7162162162162165e-05, |
|
"loss": 0.4804, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.15574741831725072, |
|
"grad_norm": 2.894228219985962, |
|
"learning_rate": 3.885135135135135e-05, |
|
"loss": 0.4522, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.16251904520060945, |
|
"grad_norm": 2.7571442127227783, |
|
"learning_rate": 4.0540540540540545e-05, |
|
"loss": 0.4496, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.16929067208396817, |
|
"grad_norm": 2.831394672393799, |
|
"learning_rate": 4.222972972972973e-05, |
|
"loss": 0.412, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1760622989673269, |
|
"grad_norm": 2.329063892364502, |
|
"learning_rate": 4.391891891891892e-05, |
|
"loss": 0.386, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1828339258506856, |
|
"grad_norm": 2.811448812484741, |
|
"learning_rate": 4.560810810810811e-05, |
|
"loss": 0.3755, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.18960555273404436, |
|
"grad_norm": 2.9765515327453613, |
|
"learning_rate": 4.72972972972973e-05, |
|
"loss": 0.3544, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.19637717961740309, |
|
"grad_norm": 2.6609086990356445, |
|
"learning_rate": 4.8986486486486486e-05, |
|
"loss": 0.3317, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2031488065007618, |
|
"grad_norm": 4.433963298797607, |
|
"learning_rate": 4.992469879518072e-05, |
|
"loss": 0.3321, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.20992043338412053, |
|
"grad_norm": 1.8786640167236328, |
|
"learning_rate": 4.9736445783132535e-05, |
|
"loss": 0.3046, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.21669206026747925, |
|
"grad_norm": 2.7514021396636963, |
|
"learning_rate": 4.954819277108434e-05, |
|
"loss": 0.2974, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.22346368715083798, |
|
"grad_norm": 3.1674134731292725, |
|
"learning_rate": 4.9359939759036146e-05, |
|
"loss": 0.2762, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.23023531403419673, |
|
"grad_norm": 1.7846009731292725, |
|
"learning_rate": 4.917168674698795e-05, |
|
"loss": 0.2604, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.23700694091755545, |
|
"grad_norm": 1.7402037382125854, |
|
"learning_rate": 4.898343373493976e-05, |
|
"loss": 0.263, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.24377856780091417, |
|
"grad_norm": 1.480891466140747, |
|
"learning_rate": 4.879518072289157e-05, |
|
"loss": 0.25, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2505501946842729, |
|
"grad_norm": 2.047008514404297, |
|
"learning_rate": 4.8606927710843376e-05, |
|
"loss": 0.2281, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2573218215676316, |
|
"grad_norm": 1.3639116287231445, |
|
"learning_rate": 4.841867469879519e-05, |
|
"loss": 0.2264, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.26409344845099036, |
|
"grad_norm": 1.8563684225082397, |
|
"learning_rate": 4.823042168674699e-05, |
|
"loss": 0.2245, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.27086507533434906, |
|
"grad_norm": 1.3617446422576904, |
|
"learning_rate": 4.804216867469879e-05, |
|
"loss": 0.2177, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2776367022177078, |
|
"grad_norm": 1.7790151834487915, |
|
"learning_rate": 4.7853915662650606e-05, |
|
"loss": 0.2191, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2844083291010665, |
|
"grad_norm": 1.2283886671066284, |
|
"learning_rate": 4.766566265060241e-05, |
|
"loss": 0.208, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.29117995598442525, |
|
"grad_norm": 1.2149569988250732, |
|
"learning_rate": 4.7477409638554224e-05, |
|
"loss": 0.1974, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.297951582867784, |
|
"grad_norm": 1.4879294633865356, |
|
"learning_rate": 4.728915662650602e-05, |
|
"loss": 0.1905, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3047232097511427, |
|
"grad_norm": 1.4393724203109741, |
|
"learning_rate": 4.710090361445783e-05, |
|
"loss": 0.194, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.31149483663450145, |
|
"grad_norm": 1.5028496980667114, |
|
"learning_rate": 4.691265060240964e-05, |
|
"loss": 0.1831, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.31826646351786014, |
|
"grad_norm": 1.403731346130371, |
|
"learning_rate": 4.672439759036145e-05, |
|
"loss": 0.193, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3250380904012189, |
|
"grad_norm": 1.6740349531173706, |
|
"learning_rate": 4.653614457831326e-05, |
|
"loss": 0.1724, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.33180971728457764, |
|
"grad_norm": 0.9081584215164185, |
|
"learning_rate": 4.634789156626506e-05, |
|
"loss": 0.1593, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.33858134416793634, |
|
"grad_norm": 1.0645991563796997, |
|
"learning_rate": 4.615963855421687e-05, |
|
"loss": 0.1611, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3453529710512951, |
|
"grad_norm": 2.582812786102295, |
|
"learning_rate": 4.597138554216868e-05, |
|
"loss": 0.1558, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3521245979346538, |
|
"grad_norm": 1.2780678272247314, |
|
"learning_rate": 4.578313253012048e-05, |
|
"loss": 0.1426, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.35889622481801253, |
|
"grad_norm": 2.387918710708618, |
|
"learning_rate": 4.5594879518072295e-05, |
|
"loss": 0.1573, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3656678517013712, |
|
"grad_norm": 1.4807089567184448, |
|
"learning_rate": 4.5406626506024094e-05, |
|
"loss": 0.1618, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.37243947858473, |
|
"grad_norm": 1.0865875482559204, |
|
"learning_rate": 4.5218373493975907e-05, |
|
"loss": 0.1439, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3792111054680887, |
|
"grad_norm": 0.9812231063842773, |
|
"learning_rate": 4.503012048192771e-05, |
|
"loss": 0.1422, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3859827323514474, |
|
"grad_norm": 0.9900213479995728, |
|
"learning_rate": 4.4841867469879525e-05, |
|
"loss": 0.1308, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.39275435923480617, |
|
"grad_norm": 1.0598454475402832, |
|
"learning_rate": 4.465361445783133e-05, |
|
"loss": 0.1565, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.39952598611816487, |
|
"grad_norm": 1.130131483078003, |
|
"learning_rate": 4.446536144578313e-05, |
|
"loss": 0.1432, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4062976130015236, |
|
"grad_norm": 1.4709850549697876, |
|
"learning_rate": 4.427710843373494e-05, |
|
"loss": 0.1427, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.41306923988488237, |
|
"grad_norm": 1.2057602405548096, |
|
"learning_rate": 4.408885542168675e-05, |
|
"loss": 0.1377, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.41984086676824106, |
|
"grad_norm": 1.0241261720657349, |
|
"learning_rate": 4.390060240963856e-05, |
|
"loss": 0.1303, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.4266124936515998, |
|
"grad_norm": 0.70640629529953, |
|
"learning_rate": 4.3712349397590366e-05, |
|
"loss": 0.1284, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.4333841205349585, |
|
"grad_norm": 1.1402077674865723, |
|
"learning_rate": 4.352409638554217e-05, |
|
"loss": 0.1153, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.44015574741831726, |
|
"grad_norm": 2.3130173683166504, |
|
"learning_rate": 4.333584337349398e-05, |
|
"loss": 0.1353, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.44692737430167595, |
|
"grad_norm": 1.018059253692627, |
|
"learning_rate": 4.3147590361445783e-05, |
|
"loss": 0.1198, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4536990011850347, |
|
"grad_norm": 1.1822158098220825, |
|
"learning_rate": 4.2959337349397596e-05, |
|
"loss": 0.1224, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.46047062806839345, |
|
"grad_norm": 0.8007648587226868, |
|
"learning_rate": 4.27710843373494e-05, |
|
"loss": 0.1156, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.46724225495175215, |
|
"grad_norm": 2.0608577728271484, |
|
"learning_rate": 4.258283132530121e-05, |
|
"loss": 0.1355, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4740138818351109, |
|
"grad_norm": 0.8175686597824097, |
|
"learning_rate": 4.239457831325301e-05, |
|
"loss": 0.117, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4807855087184696, |
|
"grad_norm": 0.9864722490310669, |
|
"learning_rate": 4.220632530120482e-05, |
|
"loss": 0.1143, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.48755713560182834, |
|
"grad_norm": 1.1170806884765625, |
|
"learning_rate": 4.201807228915663e-05, |
|
"loss": 0.1128, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.4943287624851871, |
|
"grad_norm": 0.8426876664161682, |
|
"learning_rate": 4.182981927710844e-05, |
|
"loss": 0.1039, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5011003893685458, |
|
"grad_norm": 1.0189573764801025, |
|
"learning_rate": 4.164156626506024e-05, |
|
"loss": 0.1056, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5078720162519045, |
|
"grad_norm": 1.0524259805679321, |
|
"learning_rate": 4.145331325301205e-05, |
|
"loss": 0.106, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5146436431352632, |
|
"grad_norm": 0.8824974298477173, |
|
"learning_rate": 4.126506024096386e-05, |
|
"loss": 0.1077, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.521415270018622, |
|
"grad_norm": 1.034146785736084, |
|
"learning_rate": 4.107680722891567e-05, |
|
"loss": 0.1147, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5281868969019807, |
|
"grad_norm": 1.1661494970321655, |
|
"learning_rate": 4.088855421686747e-05, |
|
"loss": 0.1159, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5349585237853395, |
|
"grad_norm": 1.9664863348007202, |
|
"learning_rate": 4.070030120481928e-05, |
|
"loss": 0.1155, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5417301506686981, |
|
"grad_norm": 0.9557604193687439, |
|
"learning_rate": 4.0512048192771084e-05, |
|
"loss": 0.1117, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5485017775520569, |
|
"grad_norm": 1.1883480548858643, |
|
"learning_rate": 4.03237951807229e-05, |
|
"loss": 0.1001, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.5552734044354156, |
|
"grad_norm": 1.3493871688842773, |
|
"learning_rate": 4.01355421686747e-05, |
|
"loss": 0.1022, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5620450313187744, |
|
"grad_norm": 0.8323326110839844, |
|
"learning_rate": 3.994728915662651e-05, |
|
"loss": 0.1084, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.568816658202133, |
|
"grad_norm": 1.3079185485839844, |
|
"learning_rate": 3.9759036144578314e-05, |
|
"loss": 0.1054, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5755882850854918, |
|
"grad_norm": 0.961869478225708, |
|
"learning_rate": 3.957078313253012e-05, |
|
"loss": 0.1027, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5823599119688505, |
|
"grad_norm": 0.7965933680534363, |
|
"learning_rate": 3.938253012048193e-05, |
|
"loss": 0.101, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5891315388522093, |
|
"grad_norm": 0.5749910473823547, |
|
"learning_rate": 3.919427710843374e-05, |
|
"loss": 0.0942, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.595903165735568, |
|
"grad_norm": 1.314102292060852, |
|
"learning_rate": 3.9006024096385544e-05, |
|
"loss": 0.1062, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6026747926189266, |
|
"grad_norm": 0.9225144982337952, |
|
"learning_rate": 3.881777108433735e-05, |
|
"loss": 0.0895, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6094464195022854, |
|
"grad_norm": 1.0227304697036743, |
|
"learning_rate": 3.8629518072289155e-05, |
|
"loss": 0.1057, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6162180463856441, |
|
"grad_norm": 0.9698050618171692, |
|
"learning_rate": 3.844126506024097e-05, |
|
"loss": 0.0996, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6229896732690029, |
|
"grad_norm": 0.7296442985534668, |
|
"learning_rate": 3.8253012048192774e-05, |
|
"loss": 0.1035, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6297613001523616, |
|
"grad_norm": 0.943133533000946, |
|
"learning_rate": 3.806475903614458e-05, |
|
"loss": 0.0892, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6365329270357203, |
|
"grad_norm": 1.618379831314087, |
|
"learning_rate": 3.7876506024096385e-05, |
|
"loss": 0.0828, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.643304553919079, |
|
"grad_norm": 0.8589970469474792, |
|
"learning_rate": 3.76882530120482e-05, |
|
"loss": 0.1033, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6500761808024378, |
|
"grad_norm": 0.7783709764480591, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.0943, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.6568478076857965, |
|
"grad_norm": 1.0342212915420532, |
|
"learning_rate": 3.731174698795181e-05, |
|
"loss": 0.0979, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.6636194345691553, |
|
"grad_norm": 0.815461277961731, |
|
"learning_rate": 3.7123493975903615e-05, |
|
"loss": 0.092, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.6703910614525139, |
|
"grad_norm": 0.7246832251548767, |
|
"learning_rate": 3.693524096385542e-05, |
|
"loss": 0.0843, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.6771626883358727, |
|
"grad_norm": 0.9605166912078857, |
|
"learning_rate": 3.674698795180723e-05, |
|
"loss": 0.0835, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6839343152192314, |
|
"grad_norm": 1.1440260410308838, |
|
"learning_rate": 3.655873493975904e-05, |
|
"loss": 0.0886, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.6907059421025902, |
|
"grad_norm": 1.0243327617645264, |
|
"learning_rate": 3.6370481927710845e-05, |
|
"loss": 0.089, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.6974775689859489, |
|
"grad_norm": 0.8508220314979553, |
|
"learning_rate": 3.618222891566265e-05, |
|
"loss": 0.0795, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7042491958693076, |
|
"grad_norm": 0.852389931678772, |
|
"learning_rate": 3.5993975903614456e-05, |
|
"loss": 0.091, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7110208227526663, |
|
"grad_norm": 0.9550991058349609, |
|
"learning_rate": 3.580572289156627e-05, |
|
"loss": 0.0856, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7177924496360251, |
|
"grad_norm": 1.009476661682129, |
|
"learning_rate": 3.5617469879518075e-05, |
|
"loss": 0.084, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7245640765193838, |
|
"grad_norm": 0.8816896080970764, |
|
"learning_rate": 3.542921686746988e-05, |
|
"loss": 0.0873, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.7313357034027425, |
|
"grad_norm": 0.8172402381896973, |
|
"learning_rate": 3.5240963855421686e-05, |
|
"loss": 0.0785, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.7381073302861012, |
|
"grad_norm": 1.3699508905410767, |
|
"learning_rate": 3.505271084337349e-05, |
|
"loss": 0.0785, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.74487895716946, |
|
"grad_norm": 0.6807621121406555, |
|
"learning_rate": 3.4864457831325304e-05, |
|
"loss": 0.0797, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7516505840528187, |
|
"grad_norm": 0.8369671106338501, |
|
"learning_rate": 3.467620481927711e-05, |
|
"loss": 0.0845, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.7584222109361775, |
|
"grad_norm": 0.7025638818740845, |
|
"learning_rate": 3.4487951807228916e-05, |
|
"loss": 0.0699, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.7651938378195361, |
|
"grad_norm": 0.6446508765220642, |
|
"learning_rate": 3.429969879518072e-05, |
|
"loss": 0.0879, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.7719654647028948, |
|
"grad_norm": 1.6889147758483887, |
|
"learning_rate": 3.4111445783132534e-05, |
|
"loss": 0.0966, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.7787370915862536, |
|
"grad_norm": 0.7355449199676514, |
|
"learning_rate": 3.392319277108434e-05, |
|
"loss": 0.0824, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.7855087184696123, |
|
"grad_norm": 0.5761105418205261, |
|
"learning_rate": 3.3734939759036146e-05, |
|
"loss": 0.0759, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.7922803453529711, |
|
"grad_norm": 0.7694604396820068, |
|
"learning_rate": 3.354668674698795e-05, |
|
"loss": 0.0839, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.7990519722363297, |
|
"grad_norm": 0.8113671541213989, |
|
"learning_rate": 3.335843373493976e-05, |
|
"loss": 0.0799, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.8058235991196885, |
|
"grad_norm": 0.6517748832702637, |
|
"learning_rate": 3.317018072289157e-05, |
|
"loss": 0.078, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8125952260030472, |
|
"grad_norm": 0.8274058103561401, |
|
"learning_rate": 3.2981927710843376e-05, |
|
"loss": 0.0789, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.819366852886406, |
|
"grad_norm": 0.7922660112380981, |
|
"learning_rate": 3.279367469879519e-05, |
|
"loss": 0.0764, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.8261384797697647, |
|
"grad_norm": 1.2404780387878418, |
|
"learning_rate": 3.260542168674699e-05, |
|
"loss": 0.0731, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.8329101066531234, |
|
"grad_norm": 0.6905914545059204, |
|
"learning_rate": 3.241716867469879e-05, |
|
"loss": 0.0704, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.8396817335364821, |
|
"grad_norm": 0.9580501914024353, |
|
"learning_rate": 3.2228915662650605e-05, |
|
"loss": 0.0806, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.8464533604198409, |
|
"grad_norm": 0.4647589325904846, |
|
"learning_rate": 3.204066265060241e-05, |
|
"loss": 0.0749, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.8532249873031996, |
|
"grad_norm": 0.7317982316017151, |
|
"learning_rate": 3.1852409638554224e-05, |
|
"loss": 0.0739, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.8599966141865584, |
|
"grad_norm": 0.7131925821304321, |
|
"learning_rate": 3.166415662650602e-05, |
|
"loss": 0.0742, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.866768241069917, |
|
"grad_norm": 0.42962631583213806, |
|
"learning_rate": 3.147590361445783e-05, |
|
"loss": 0.0595, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.8735398679532758, |
|
"grad_norm": 1.0177807807922363, |
|
"learning_rate": 3.128765060240964e-05, |
|
"loss": 0.0815, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.8803114948366345, |
|
"grad_norm": 0.7908213138580322, |
|
"learning_rate": 3.1099397590361447e-05, |
|
"loss": 0.0758, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8870831217199933, |
|
"grad_norm": 0.7449843883514404, |
|
"learning_rate": 3.091114457831326e-05, |
|
"loss": 0.0789, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.8938547486033519, |
|
"grad_norm": 0.8616693615913391, |
|
"learning_rate": 3.072289156626506e-05, |
|
"loss": 0.0653, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.9006263754867107, |
|
"grad_norm": 1.0630964040756226, |
|
"learning_rate": 3.053463855421687e-05, |
|
"loss": 0.081, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.9073980023700694, |
|
"grad_norm": 0.7529353499412537, |
|
"learning_rate": 3.0346385542168676e-05, |
|
"loss": 0.0753, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9141696292534282, |
|
"grad_norm": 0.7995216846466064, |
|
"learning_rate": 3.0158132530120482e-05, |
|
"loss": 0.066, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9209412561367869, |
|
"grad_norm": 0.3852786719799042, |
|
"learning_rate": 2.996987951807229e-05, |
|
"loss": 0.07, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.9277128830201455, |
|
"grad_norm": 0.6470819711685181, |
|
"learning_rate": 2.9781626506024097e-05, |
|
"loss": 0.0815, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.9344845099035043, |
|
"grad_norm": 0.9942120313644409, |
|
"learning_rate": 2.9593373493975906e-05, |
|
"loss": 0.0749, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.941256136786863, |
|
"grad_norm": 1.0045307874679565, |
|
"learning_rate": 2.9405120481927712e-05, |
|
"loss": 0.0797, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.9480277636702218, |
|
"grad_norm": 0.5135801434516907, |
|
"learning_rate": 2.921686746987952e-05, |
|
"loss": 0.0647, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9547993905535805, |
|
"grad_norm": 1.0146623849868774, |
|
"learning_rate": 2.9028614457831327e-05, |
|
"loss": 0.078, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.9615710174369392, |
|
"grad_norm": 0.730907142162323, |
|
"learning_rate": 2.8840361445783133e-05, |
|
"loss": 0.0671, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.9683426443202979, |
|
"grad_norm": 1.0035794973373413, |
|
"learning_rate": 2.8652108433734942e-05, |
|
"loss": 0.0804, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.9751142712036567, |
|
"grad_norm": 0.6765666604042053, |
|
"learning_rate": 2.8463855421686748e-05, |
|
"loss": 0.0691, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.9818858980870154, |
|
"grad_norm": 0.8572028279304504, |
|
"learning_rate": 2.8275602409638557e-05, |
|
"loss": 0.0716, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.9886575249703742, |
|
"grad_norm": 0.4426514208316803, |
|
"learning_rate": 2.8087349397590362e-05, |
|
"loss": 0.0709, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.9954291518537328, |
|
"grad_norm": 0.5503798723220825, |
|
"learning_rate": 2.789909638554217e-05, |
|
"loss": 0.0709, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.9994921279837481, |
|
"eval_accuracy": 0.9789523809523809, |
|
"eval_loss": 0.07530121505260468, |
|
"eval_runtime": 1028.8877, |
|
"eval_samples_per_second": 20.41, |
|
"eval_steps_per_second": 0.639, |
|
"step": 1476 |
|
}, |
|
{ |
|
"epoch": 1.0022007787370917, |
|
"grad_norm": 0.808159589767456, |
|
"learning_rate": 2.7710843373493977e-05, |
|
"loss": 0.0653, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.0089724056204503, |
|
"grad_norm": 1.711690902709961, |
|
"learning_rate": 2.7522590361445783e-05, |
|
"loss": 0.0712, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.015744032503809, |
|
"grad_norm": 0.584488570690155, |
|
"learning_rate": 2.7334337349397592e-05, |
|
"loss": 0.0748, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0225156593871678, |
|
"grad_norm": 0.6118115186691284, |
|
"learning_rate": 2.7146084337349398e-05, |
|
"loss": 0.0663, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.0292872862705265, |
|
"grad_norm": 0.7841724753379822, |
|
"learning_rate": 2.6957831325301207e-05, |
|
"loss": 0.0664, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.0360589131538853, |
|
"grad_norm": 0.6936156153678894, |
|
"learning_rate": 2.6769578313253013e-05, |
|
"loss": 0.0642, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.042830540037244, |
|
"grad_norm": 0.7428813576698303, |
|
"learning_rate": 2.658132530120482e-05, |
|
"loss": 0.0694, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.0496021669206026, |
|
"grad_norm": 1.0296515226364136, |
|
"learning_rate": 2.6393072289156628e-05, |
|
"loss": 0.0754, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.0563737938039615, |
|
"grad_norm": 0.6250594258308411, |
|
"learning_rate": 2.6204819277108434e-05, |
|
"loss": 0.0646, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.06314542068732, |
|
"grad_norm": 1.5104742050170898, |
|
"learning_rate": 2.6016566265060243e-05, |
|
"loss": 0.073, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.069917047570679, |
|
"grad_norm": 0.8500051498413086, |
|
"learning_rate": 2.582831325301205e-05, |
|
"loss": 0.0677, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.0766886744540376, |
|
"grad_norm": 1.0728169679641724, |
|
"learning_rate": 2.5640060240963858e-05, |
|
"loss": 0.0692, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.0834603013373962, |
|
"grad_norm": 0.7029580473899841, |
|
"learning_rate": 2.5451807228915663e-05, |
|
"loss": 0.0681, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.090231928220755, |
|
"grad_norm": 0.7941862344741821, |
|
"learning_rate": 2.526355421686747e-05, |
|
"loss": 0.0727, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.0970035551041137, |
|
"grad_norm": 0.8318243026733398, |
|
"learning_rate": 2.5075301204819278e-05, |
|
"loss": 0.0764, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.1037751819874724, |
|
"grad_norm": 0.9039504528045654, |
|
"learning_rate": 2.4887048192771087e-05, |
|
"loss": 0.0817, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.1105468088708312, |
|
"grad_norm": 0.8218241930007935, |
|
"learning_rate": 2.4698795180722893e-05, |
|
"loss": 0.0705, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.1173184357541899, |
|
"grad_norm": 0.5053806900978088, |
|
"learning_rate": 2.45105421686747e-05, |
|
"loss": 0.0738, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.1240900626375487, |
|
"grad_norm": 0.5837478637695312, |
|
"learning_rate": 2.4322289156626508e-05, |
|
"loss": 0.0623, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.1308616895209074, |
|
"grad_norm": 0.6927766799926758, |
|
"learning_rate": 2.4134036144578314e-05, |
|
"loss": 0.0692, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.137633316404266, |
|
"grad_norm": 0.8067636489868164, |
|
"learning_rate": 2.3945783132530123e-05, |
|
"loss": 0.0716, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.1444049432876249, |
|
"grad_norm": 0.8656367063522339, |
|
"learning_rate": 2.375753012048193e-05, |
|
"loss": 0.0648, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.1511765701709835, |
|
"grad_norm": 0.5790455341339111, |
|
"learning_rate": 2.3569277108433734e-05, |
|
"loss": 0.0682, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.1579481970543424, |
|
"grad_norm": 0.9087624549865723, |
|
"learning_rate": 2.3381024096385544e-05, |
|
"loss": 0.0606, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.164719823937701, |
|
"grad_norm": 0.6014874577522278, |
|
"learning_rate": 2.319277108433735e-05, |
|
"loss": 0.078, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.1714914508210597, |
|
"grad_norm": 0.6091852784156799, |
|
"learning_rate": 2.300451807228916e-05, |
|
"loss": 0.0647, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.1782630777044185, |
|
"grad_norm": 0.581322431564331, |
|
"learning_rate": 2.2816265060240964e-05, |
|
"loss": 0.0712, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.1850347045877772, |
|
"grad_norm": 0.4964611530303955, |
|
"learning_rate": 2.2628012048192773e-05, |
|
"loss": 0.0524, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.191806331471136, |
|
"grad_norm": 0.8920990228652954, |
|
"learning_rate": 2.243975903614458e-05, |
|
"loss": 0.0719, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.1985779583544947, |
|
"grad_norm": 0.6709417104721069, |
|
"learning_rate": 2.2251506024096385e-05, |
|
"loss": 0.0695, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.2053495852378533, |
|
"grad_norm": 0.7828611731529236, |
|
"learning_rate": 2.2063253012048194e-05, |
|
"loss": 0.0742, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 1.2962734699249268, |
|
"learning_rate": 2.1875e-05, |
|
"loss": 0.0743, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.2188928390045708, |
|
"grad_norm": 0.5535785555839539, |
|
"learning_rate": 2.168674698795181e-05, |
|
"loss": 0.064, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.2256644658879297, |
|
"grad_norm": 1.0534149408340454, |
|
"learning_rate": 2.1498493975903615e-05, |
|
"loss": 0.0594, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.2324360927712883, |
|
"grad_norm": 0.8949538469314575, |
|
"learning_rate": 2.1310240963855424e-05, |
|
"loss": 0.0607, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.239207719654647, |
|
"grad_norm": 0.7287763357162476, |
|
"learning_rate": 2.112198795180723e-05, |
|
"loss": 0.076, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.2459793465380058, |
|
"grad_norm": 1.2560794353485107, |
|
"learning_rate": 2.0933734939759035e-05, |
|
"loss": 0.0662, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.2527509734213644, |
|
"grad_norm": 0.4771580398082733, |
|
"learning_rate": 2.0745481927710844e-05, |
|
"loss": 0.0541, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.2595226003047233, |
|
"grad_norm": 0.9189515113830566, |
|
"learning_rate": 2.055722891566265e-05, |
|
"loss": 0.058, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.266294227188082, |
|
"grad_norm": 1.0164196491241455, |
|
"learning_rate": 2.036897590361446e-05, |
|
"loss": 0.0604, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.2730658540714406, |
|
"grad_norm": 0.8721578121185303, |
|
"learning_rate": 2.018072289156627e-05, |
|
"loss": 0.0731, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.2798374809547994, |
|
"grad_norm": 0.8412027955055237, |
|
"learning_rate": 1.999246987951807e-05, |
|
"loss": 0.0636, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.286609107838158, |
|
"grad_norm": 0.784599244594574, |
|
"learning_rate": 1.980421686746988e-05, |
|
"loss": 0.0636, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.293380734721517, |
|
"grad_norm": 0.5514250993728638, |
|
"learning_rate": 1.9615963855421686e-05, |
|
"loss": 0.0677, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.3001523616048756, |
|
"grad_norm": 0.6008131504058838, |
|
"learning_rate": 1.9427710843373495e-05, |
|
"loss": 0.0637, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.3069239884882342, |
|
"grad_norm": 0.9827722311019897, |
|
"learning_rate": 1.9239457831325304e-05, |
|
"loss": 0.0581, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.313695615371593, |
|
"grad_norm": 0.699483335018158, |
|
"learning_rate": 1.905120481927711e-05, |
|
"loss": 0.0651, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.3204672422549517, |
|
"grad_norm": 0.7085596919059753, |
|
"learning_rate": 1.8862951807228916e-05, |
|
"loss": 0.0589, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.3272388691383106, |
|
"grad_norm": 1.0991511344909668, |
|
"learning_rate": 1.867469879518072e-05, |
|
"loss": 0.0699, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.3340104960216692, |
|
"grad_norm": 0.8005927801132202, |
|
"learning_rate": 1.848644578313253e-05, |
|
"loss": 0.0673, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.3407821229050279, |
|
"grad_norm": 0.7139153480529785, |
|
"learning_rate": 1.829819277108434e-05, |
|
"loss": 0.0571, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.3475537497883867, |
|
"grad_norm": 0.4111141562461853, |
|
"learning_rate": 1.8109939759036145e-05, |
|
"loss": 0.0587, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.3543253766717454, |
|
"grad_norm": 0.5993856191635132, |
|
"learning_rate": 1.7921686746987955e-05, |
|
"loss": 0.0649, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.3610970035551042, |
|
"grad_norm": 0.9376055598258972, |
|
"learning_rate": 1.773343373493976e-05, |
|
"loss": 0.0672, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.3678686304384629, |
|
"grad_norm": 0.4629497230052948, |
|
"learning_rate": 1.7545180722891566e-05, |
|
"loss": 0.0517, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.3746402573218215, |
|
"grad_norm": 0.4839510917663574, |
|
"learning_rate": 1.7356927710843375e-05, |
|
"loss": 0.0524, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.3814118842051804, |
|
"grad_norm": 0.5999952554702759, |
|
"learning_rate": 1.716867469879518e-05, |
|
"loss": 0.056, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.388183511088539, |
|
"grad_norm": 0.5760485529899597, |
|
"learning_rate": 1.698042168674699e-05, |
|
"loss": 0.0663, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.3949551379718979, |
|
"grad_norm": 0.7293563485145569, |
|
"learning_rate": 1.6792168674698796e-05, |
|
"loss": 0.0649, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.4017267648552565, |
|
"grad_norm": 0.6858052611351013, |
|
"learning_rate": 1.6603915662650605e-05, |
|
"loss": 0.0545, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.4084983917386151, |
|
"grad_norm": 0.9105307459831238, |
|
"learning_rate": 1.641566265060241e-05, |
|
"loss": 0.0616, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.415270018621974, |
|
"grad_norm": 0.4213721752166748, |
|
"learning_rate": 1.6227409638554216e-05, |
|
"loss": 0.0647, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.4220416455053326, |
|
"grad_norm": 0.5692495703697205, |
|
"learning_rate": 1.6039156626506026e-05, |
|
"loss": 0.0602, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.4288132723886915, |
|
"grad_norm": 0.70749431848526, |
|
"learning_rate": 1.585090361445783e-05, |
|
"loss": 0.0638, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.4355848992720501, |
|
"grad_norm": 0.7523058652877808, |
|
"learning_rate": 1.566265060240964e-05, |
|
"loss": 0.0565, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.4423565261554088, |
|
"grad_norm": 0.5956985354423523, |
|
"learning_rate": 1.5474397590361446e-05, |
|
"loss": 0.0584, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.4491281530387676, |
|
"grad_norm": 0.7276691794395447, |
|
"learning_rate": 1.5286144578313255e-05, |
|
"loss": 0.0589, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.4558997799221263, |
|
"grad_norm": 0.9040044546127319, |
|
"learning_rate": 1.509789156626506e-05, |
|
"loss": 0.0635, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.4626714068054851, |
|
"grad_norm": 0.7457456588745117, |
|
"learning_rate": 1.4909638554216867e-05, |
|
"loss": 0.0654, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.4694430336888438, |
|
"grad_norm": 0.7019338011741638, |
|
"learning_rate": 1.4721385542168676e-05, |
|
"loss": 0.0641, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.4762146605722024, |
|
"grad_norm": 0.8061505556106567, |
|
"learning_rate": 1.4533132530120484e-05, |
|
"loss": 0.0573, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.4829862874555613, |
|
"grad_norm": 0.6644711494445801, |
|
"learning_rate": 1.4344879518072291e-05, |
|
"loss": 0.0654, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.48975791433892, |
|
"grad_norm": 0.8262456059455872, |
|
"learning_rate": 1.4156626506024098e-05, |
|
"loss": 0.0642, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.4965295412222788, |
|
"grad_norm": 0.9707741737365723, |
|
"learning_rate": 1.3968373493975902e-05, |
|
"loss": 0.0629, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.5033011681056374, |
|
"grad_norm": 1.0274876356124878, |
|
"learning_rate": 1.378012048192771e-05, |
|
"loss": 0.0584, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.510072794988996, |
|
"grad_norm": 0.5561116933822632, |
|
"learning_rate": 1.3591867469879519e-05, |
|
"loss": 0.0703, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.5168444218723547, |
|
"grad_norm": 0.7350441217422485, |
|
"learning_rate": 1.3403614457831327e-05, |
|
"loss": 0.0679, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.5236160487557135, |
|
"grad_norm": 1.2632744312286377, |
|
"learning_rate": 1.3215361445783134e-05, |
|
"loss": 0.0572, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.5303876756390724, |
|
"grad_norm": 0.970457911491394, |
|
"learning_rate": 1.3027108433734941e-05, |
|
"loss": 0.057, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.537159302522431, |
|
"grad_norm": 0.8355304598808289, |
|
"learning_rate": 1.2838855421686745e-05, |
|
"loss": 0.0543, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.5439309294057897, |
|
"grad_norm": 0.61915522813797, |
|
"learning_rate": 1.2650602409638555e-05, |
|
"loss": 0.0623, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.5507025562891483, |
|
"grad_norm": 0.5436218976974487, |
|
"learning_rate": 1.2462349397590362e-05, |
|
"loss": 0.0621, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.5574741831725072, |
|
"grad_norm": 0.8874384164810181, |
|
"learning_rate": 1.227409638554217e-05, |
|
"loss": 0.0598, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.564245810055866, |
|
"grad_norm": 0.6226460933685303, |
|
"learning_rate": 1.2085843373493977e-05, |
|
"loss": 0.0549, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.5710174369392247, |
|
"grad_norm": 0.648560643196106, |
|
"learning_rate": 1.1897590361445783e-05, |
|
"loss": 0.0581, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.5777890638225833, |
|
"grad_norm": 0.6028856039047241, |
|
"learning_rate": 1.170933734939759e-05, |
|
"loss": 0.0628, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.584560690705942, |
|
"grad_norm": 0.5027485489845276, |
|
"learning_rate": 1.1521084337349398e-05, |
|
"loss": 0.0655, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.5913323175893008, |
|
"grad_norm": 1.5245540142059326, |
|
"learning_rate": 1.1332831325301205e-05, |
|
"loss": 0.0637, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.5981039444726597, |
|
"grad_norm": 0.7036380767822266, |
|
"learning_rate": 1.1144578313253013e-05, |
|
"loss": 0.0582, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.6048755713560183, |
|
"grad_norm": 0.7167279124259949, |
|
"learning_rate": 1.095632530120482e-05, |
|
"loss": 0.0542, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.611647198239377, |
|
"grad_norm": 0.9925076365470886, |
|
"learning_rate": 1.0768072289156627e-05, |
|
"loss": 0.0521, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.6184188251227356, |
|
"grad_norm": 0.8511892557144165, |
|
"learning_rate": 1.0579819277108433e-05, |
|
"loss": 0.0603, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.6251904520060945, |
|
"grad_norm": 1.3457633256912231, |
|
"learning_rate": 1.0391566265060242e-05, |
|
"loss": 0.0654, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.6319620788894533, |
|
"grad_norm": 0.8031491041183472, |
|
"learning_rate": 1.020331325301205e-05, |
|
"loss": 0.0526, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.638733705772812, |
|
"grad_norm": 0.6882662773132324, |
|
"learning_rate": 1.0015060240963856e-05, |
|
"loss": 0.0585, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.6455053326561706, |
|
"grad_norm": 0.5608332753181458, |
|
"learning_rate": 9.826807228915663e-06, |
|
"loss": 0.0613, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.6522769595395292, |
|
"grad_norm": 0.512611985206604, |
|
"learning_rate": 9.63855421686747e-06, |
|
"loss": 0.0535, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.659048586422888, |
|
"grad_norm": 0.48724594712257385, |
|
"learning_rate": 9.450301204819278e-06, |
|
"loss": 0.0554, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.665820213306247, |
|
"grad_norm": 0.7341836094856262, |
|
"learning_rate": 9.262048192771085e-06, |
|
"loss": 0.0598, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.6725918401896056, |
|
"grad_norm": 0.5964226722717285, |
|
"learning_rate": 9.073795180722893e-06, |
|
"loss": 0.0742, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.6793634670729642, |
|
"grad_norm": 1.4178098440170288, |
|
"learning_rate": 8.885542168674699e-06, |
|
"loss": 0.0716, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.6861350939563229, |
|
"grad_norm": 0.7655882239341736, |
|
"learning_rate": 8.697289156626506e-06, |
|
"loss": 0.0509, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.6929067208396817, |
|
"grad_norm": 0.7825894355773926, |
|
"learning_rate": 8.509036144578313e-06, |
|
"loss": 0.056, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.6996783477230406, |
|
"grad_norm": 0.7740017771720886, |
|
"learning_rate": 8.320783132530121e-06, |
|
"loss": 0.0669, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.7064499746063992, |
|
"grad_norm": 0.4478660523891449, |
|
"learning_rate": 8.132530120481928e-06, |
|
"loss": 0.0711, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.7132216014897579, |
|
"grad_norm": 0.9198179244995117, |
|
"learning_rate": 7.944277108433736e-06, |
|
"loss": 0.0534, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.7199932283731165, |
|
"grad_norm": 0.5813512802124023, |
|
"learning_rate": 7.756024096385543e-06, |
|
"loss": 0.0612, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.7267648552564754, |
|
"grad_norm": 0.6739583611488342, |
|
"learning_rate": 7.56777108433735e-06, |
|
"loss": 0.0617, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.7335364821398342, |
|
"grad_norm": 0.6534674167633057, |
|
"learning_rate": 7.379518072289157e-06, |
|
"loss": 0.0639, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.7403081090231929, |
|
"grad_norm": 0.6965113878250122, |
|
"learning_rate": 7.191265060240965e-06, |
|
"loss": 0.0543, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.7470797359065515, |
|
"grad_norm": 0.7045726776123047, |
|
"learning_rate": 7.003012048192771e-06, |
|
"loss": 0.0557, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.7538513627899102, |
|
"grad_norm": 0.7541308403015137, |
|
"learning_rate": 6.814759036144579e-06, |
|
"loss": 0.0609, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.760622989673269, |
|
"grad_norm": 1.2629750967025757, |
|
"learning_rate": 6.626506024096386e-06, |
|
"loss": 0.055, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.7673946165566279, |
|
"grad_norm": 0.8643785119056702, |
|
"learning_rate": 6.438253012048193e-06, |
|
"loss": 0.0675, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.7741662434399865, |
|
"grad_norm": 0.7467519640922546, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.0641, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.7809378703233452, |
|
"grad_norm": 0.4777075946331024, |
|
"learning_rate": 6.061746987951807e-06, |
|
"loss": 0.0605, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.7877094972067038, |
|
"grad_norm": 0.6084995865821838, |
|
"learning_rate": 5.873493975903615e-06, |
|
"loss": 0.0545, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.7944811240900627, |
|
"grad_norm": 0.9602014422416687, |
|
"learning_rate": 5.685240963855422e-06, |
|
"loss": 0.0484, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.8012527509734215, |
|
"grad_norm": 0.7709717154502869, |
|
"learning_rate": 5.496987951807229e-06, |
|
"loss": 0.0637, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.8080243778567802, |
|
"grad_norm": 0.422376424074173, |
|
"learning_rate": 5.308734939759037e-06, |
|
"loss": 0.0659, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.8147960047401388, |
|
"grad_norm": 0.4001966118812561, |
|
"learning_rate": 5.120481927710843e-06, |
|
"loss": 0.0609, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.8215676316234974, |
|
"grad_norm": 0.4686823785305023, |
|
"learning_rate": 4.932228915662651e-06, |
|
"loss": 0.0588, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.8283392585068563, |
|
"grad_norm": 0.4594326615333557, |
|
"learning_rate": 4.743975903614458e-06, |
|
"loss": 0.0558, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.8351108853902152, |
|
"grad_norm": 0.7824903130531311, |
|
"learning_rate": 4.555722891566265e-06, |
|
"loss": 0.0536, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.8418825122735738, |
|
"grad_norm": 0.6944392323493958, |
|
"learning_rate": 4.367469879518073e-06, |
|
"loss": 0.0653, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.8486541391569324, |
|
"grad_norm": 0.665273129940033, |
|
"learning_rate": 4.17921686746988e-06, |
|
"loss": 0.0544, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.855425766040291, |
|
"grad_norm": 0.6628099083900452, |
|
"learning_rate": 3.990963855421686e-06, |
|
"loss": 0.056, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.86219739292365, |
|
"grad_norm": 0.79542475938797, |
|
"learning_rate": 3.802710843373494e-06, |
|
"loss": 0.0549, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.8689690198070086, |
|
"grad_norm": 0.7736166715621948, |
|
"learning_rate": 3.614457831325301e-06, |
|
"loss": 0.0606, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.8757406466903674, |
|
"grad_norm": 1.0499205589294434, |
|
"learning_rate": 3.426204819277109e-06, |
|
"loss": 0.0656, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.882512273573726, |
|
"grad_norm": 0.831328809261322, |
|
"learning_rate": 3.2379518072289157e-06, |
|
"loss": 0.0575, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.8892839004570847, |
|
"grad_norm": 0.7741392254829407, |
|
"learning_rate": 3.049698795180723e-06, |
|
"loss": 0.0493, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.8960555273404436, |
|
"grad_norm": 0.5156965851783752, |
|
"learning_rate": 2.86144578313253e-06, |
|
"loss": 0.0574, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.9028271542238022, |
|
"grad_norm": 0.7923727035522461, |
|
"learning_rate": 2.6731927710843376e-06, |
|
"loss": 0.0496, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.909598781107161, |
|
"grad_norm": 0.6281595230102539, |
|
"learning_rate": 2.4849397590361446e-06, |
|
"loss": 0.0564, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.9163704079905197, |
|
"grad_norm": 0.475424200296402, |
|
"learning_rate": 2.296686746987952e-06, |
|
"loss": 0.0518, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.9231420348738784, |
|
"grad_norm": 0.808529794216156, |
|
"learning_rate": 2.108433734939759e-06, |
|
"loss": 0.0519, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.9299136617572372, |
|
"grad_norm": 0.9661712050437927, |
|
"learning_rate": 1.920180722891566e-06, |
|
"loss": 0.057, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.9366852886405959, |
|
"grad_norm": 0.7859961986541748, |
|
"learning_rate": 1.7319277108433736e-06, |
|
"loss": 0.0597, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.9434569155239547, |
|
"grad_norm": 0.8376522660255432, |
|
"learning_rate": 1.5436746987951808e-06, |
|
"loss": 0.0562, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.9502285424073134, |
|
"grad_norm": 1.249200463294983, |
|
"learning_rate": 1.355421686746988e-06, |
|
"loss": 0.0582, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.957000169290672, |
|
"grad_norm": 0.6086472272872925, |
|
"learning_rate": 1.167168674698795e-06, |
|
"loss": 0.0575, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.9637717961740309, |
|
"grad_norm": 0.5852237939834595, |
|
"learning_rate": 9.789156626506025e-07, |
|
"loss": 0.0523, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.9705434230573895, |
|
"grad_norm": 0.5044909119606018, |
|
"learning_rate": 7.906626506024097e-07, |
|
"loss": 0.0599, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.9773150499407484, |
|
"grad_norm": 0.5792267918586731, |
|
"learning_rate": 6.024096385542169e-07, |
|
"loss": 0.0514, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.984086676824107, |
|
"grad_norm": 0.6914946436882019, |
|
"learning_rate": 4.1415662650602414e-07, |
|
"loss": 0.0582, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.9908583037074656, |
|
"grad_norm": 1.0799570083618164, |
|
"learning_rate": 2.2590361445783133e-07, |
|
"loss": 0.0467, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.9976299305908245, |
|
"grad_norm": 0.7630107998847961, |
|
"learning_rate": 3.7650602409638556e-08, |
|
"loss": 0.0634, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.9989842559674962, |
|
"eval_accuracy": 0.9820952380952381, |
|
"eval_loss": 0.062325455248355865, |
|
"eval_runtime": 751.2071, |
|
"eval_samples_per_second": 27.955, |
|
"eval_steps_per_second": 0.875, |
|
"step": 2952 |
|
}, |
|
{ |
|
"epoch": 1.9989842559674962, |
|
"step": 2952, |
|
"total_flos": 1.3566159569165903e+20, |
|
"train_loss": 0.14182275843612224, |
|
"train_runtime": 17938.7006, |
|
"train_samples_per_second": 21.072, |
|
"train_steps_per_second": 0.165 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2952, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3566159569165903e+20, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|