{
"best_metric": 0.9820952380952381,
"best_model_checkpoint": "dinov2-large-cluster-finetune-linear-probe-trueface/checkpoint-2952",
"epoch": 1.9989842559674962,
"eval_steps": 500,
"global_step": 2952,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006771626883358727,
"grad_norm": 5.265126705169678,
"learning_rate": 1.6891891891891894e-06,
"loss": 0.8799,
"step": 10
},
{
"epoch": 0.013543253766717453,
"grad_norm": 6.357407093048096,
"learning_rate": 3.3783783783783788e-06,
"loss": 0.8361,
"step": 20
},
{
"epoch": 0.02031488065007618,
"grad_norm": 4.737895965576172,
"learning_rate": 5.067567567567568e-06,
"loss": 0.8673,
"step": 30
},
{
"epoch": 0.027086507533434907,
"grad_norm": 5.784124374389648,
"learning_rate": 6.7567567567567575e-06,
"loss": 0.87,
"step": 40
},
{
"epoch": 0.03385813441679363,
"grad_norm": 6.456064701080322,
"learning_rate": 8.445945945945946e-06,
"loss": 0.8188,
"step": 50
},
{
"epoch": 0.04062976130015236,
"grad_norm": 6.111750602722168,
"learning_rate": 1.0135135135135136e-05,
"loss": 0.8241,
"step": 60
},
{
"epoch": 0.04740138818351109,
"grad_norm": 4.581135272979736,
"learning_rate": 1.1824324324324325e-05,
"loss": 0.8163,
"step": 70
},
{
"epoch": 0.05417301506686981,
"grad_norm": 4.400207996368408,
"learning_rate": 1.3513513513513515e-05,
"loss": 0.8066,
"step": 80
},
{
"epoch": 0.06094464195022854,
"grad_norm": 4.919775009155273,
"learning_rate": 1.5202702702702704e-05,
"loss": 0.7875,
"step": 90
},
{
"epoch": 0.06771626883358726,
"grad_norm": 4.747180461883545,
"learning_rate": 1.6891891891891892e-05,
"loss": 0.7627,
"step": 100
},
{
"epoch": 0.074487895716946,
"grad_norm": 4.32789421081543,
"learning_rate": 1.8581081081081082e-05,
"loss": 0.7603,
"step": 110
},
{
"epoch": 0.08125952260030472,
"grad_norm": 4.564222812652588,
"learning_rate": 2.0270270270270273e-05,
"loss": 0.7323,
"step": 120
},
{
"epoch": 0.08803114948366345,
"grad_norm": 4.640108108520508,
"learning_rate": 2.195945945945946e-05,
"loss": 0.7282,
"step": 130
},
{
"epoch": 0.09480277636702218,
"grad_norm": 4.069850921630859,
"learning_rate": 2.364864864864865e-05,
"loss": 0.6938,
"step": 140
},
{
"epoch": 0.1015744032503809,
"grad_norm": 4.110805988311768,
"learning_rate": 2.533783783783784e-05,
"loss": 0.6607,
"step": 150
},
{
"epoch": 0.10834603013373963,
"grad_norm": 4.257925510406494,
"learning_rate": 2.702702702702703e-05,
"loss": 0.6171,
"step": 160
},
{
"epoch": 0.11511765701709836,
"grad_norm": 3.9571423530578613,
"learning_rate": 2.8716216216216217e-05,
"loss": 0.6084,
"step": 170
},
{
"epoch": 0.12188928390045708,
"grad_norm": 3.1258833408355713,
"learning_rate": 3.0405405405405407e-05,
"loss": 0.5765,
"step": 180
},
{
"epoch": 0.1286609107838158,
"grad_norm": 3.235241651535034,
"learning_rate": 3.20945945945946e-05,
"loss": 0.5534,
"step": 190
},
{
"epoch": 0.13543253766717453,
"grad_norm": 4.166315078735352,
"learning_rate": 3.3783783783783784e-05,
"loss": 0.5382,
"step": 200
},
{
"epoch": 0.14220416455053325,
"grad_norm": 3.878629446029663,
"learning_rate": 3.547297297297297e-05,
"loss": 0.5036,
"step": 210
},
{
"epoch": 0.148975791433892,
"grad_norm": 2.8281824588775635,
"learning_rate": 3.7162162162162165e-05,
"loss": 0.4804,
"step": 220
},
{
"epoch": 0.15574741831725072,
"grad_norm": 2.894228219985962,
"learning_rate": 3.885135135135135e-05,
"loss": 0.4522,
"step": 230
},
{
"epoch": 0.16251904520060945,
"grad_norm": 2.7571442127227783,
"learning_rate": 4.0540540540540545e-05,
"loss": 0.4496,
"step": 240
},
{
"epoch": 0.16929067208396817,
"grad_norm": 2.831394672393799,
"learning_rate": 4.222972972972973e-05,
"loss": 0.412,
"step": 250
},
{
"epoch": 0.1760622989673269,
"grad_norm": 2.329063892364502,
"learning_rate": 4.391891891891892e-05,
"loss": 0.386,
"step": 260
},
{
"epoch": 0.1828339258506856,
"grad_norm": 2.811448812484741,
"learning_rate": 4.560810810810811e-05,
"loss": 0.3755,
"step": 270
},
{
"epoch": 0.18960555273404436,
"grad_norm": 2.9765515327453613,
"learning_rate": 4.72972972972973e-05,
"loss": 0.3544,
"step": 280
},
{
"epoch": 0.19637717961740309,
"grad_norm": 2.6609086990356445,
"learning_rate": 4.8986486486486486e-05,
"loss": 0.3317,
"step": 290
},
{
"epoch": 0.2031488065007618,
"grad_norm": 4.433963298797607,
"learning_rate": 4.992469879518072e-05,
"loss": 0.3321,
"step": 300
},
{
"epoch": 0.20992043338412053,
"grad_norm": 1.8786640167236328,
"learning_rate": 4.9736445783132535e-05,
"loss": 0.3046,
"step": 310
},
{
"epoch": 0.21669206026747925,
"grad_norm": 2.7514021396636963,
"learning_rate": 4.954819277108434e-05,
"loss": 0.2974,
"step": 320
},
{
"epoch": 0.22346368715083798,
"grad_norm": 3.1674134731292725,
"learning_rate": 4.9359939759036146e-05,
"loss": 0.2762,
"step": 330
},
{
"epoch": 0.23023531403419673,
"grad_norm": 1.7846009731292725,
"learning_rate": 4.917168674698795e-05,
"loss": 0.2604,
"step": 340
},
{
"epoch": 0.23700694091755545,
"grad_norm": 1.7402037382125854,
"learning_rate": 4.898343373493976e-05,
"loss": 0.263,
"step": 350
},
{
"epoch": 0.24377856780091417,
"grad_norm": 1.480891466140747,
"learning_rate": 4.879518072289157e-05,
"loss": 0.25,
"step": 360
},
{
"epoch": 0.2505501946842729,
"grad_norm": 2.047008514404297,
"learning_rate": 4.8606927710843376e-05,
"loss": 0.2281,
"step": 370
},
{
"epoch": 0.2573218215676316,
"grad_norm": 1.3639116287231445,
"learning_rate": 4.841867469879519e-05,
"loss": 0.2264,
"step": 380
},
{
"epoch": 0.26409344845099036,
"grad_norm": 1.8563684225082397,
"learning_rate": 4.823042168674699e-05,
"loss": 0.2245,
"step": 390
},
{
"epoch": 0.27086507533434906,
"grad_norm": 1.3617446422576904,
"learning_rate": 4.804216867469879e-05,
"loss": 0.2177,
"step": 400
},
{
"epoch": 0.2776367022177078,
"grad_norm": 1.7790151834487915,
"learning_rate": 4.7853915662650606e-05,
"loss": 0.2191,
"step": 410
},
{
"epoch": 0.2844083291010665,
"grad_norm": 1.2283886671066284,
"learning_rate": 4.766566265060241e-05,
"loss": 0.208,
"step": 420
},
{
"epoch": 0.29117995598442525,
"grad_norm": 1.2149569988250732,
"learning_rate": 4.7477409638554224e-05,
"loss": 0.1974,
"step": 430
},
{
"epoch": 0.297951582867784,
"grad_norm": 1.4879294633865356,
"learning_rate": 4.728915662650602e-05,
"loss": 0.1905,
"step": 440
},
{
"epoch": 0.3047232097511427,
"grad_norm": 1.4393724203109741,
"learning_rate": 4.710090361445783e-05,
"loss": 0.194,
"step": 450
},
{
"epoch": 0.31149483663450145,
"grad_norm": 1.5028496980667114,
"learning_rate": 4.691265060240964e-05,
"loss": 0.1831,
"step": 460
},
{
"epoch": 0.31826646351786014,
"grad_norm": 1.403731346130371,
"learning_rate": 4.672439759036145e-05,
"loss": 0.193,
"step": 470
},
{
"epoch": 0.3250380904012189,
"grad_norm": 1.6740349531173706,
"learning_rate": 4.653614457831326e-05,
"loss": 0.1724,
"step": 480
},
{
"epoch": 0.33180971728457764,
"grad_norm": 0.9081584215164185,
"learning_rate": 4.634789156626506e-05,
"loss": 0.1593,
"step": 490
},
{
"epoch": 0.33858134416793634,
"grad_norm": 1.0645991563796997,
"learning_rate": 4.615963855421687e-05,
"loss": 0.1611,
"step": 500
},
{
"epoch": 0.3453529710512951,
"grad_norm": 2.582812786102295,
"learning_rate": 4.597138554216868e-05,
"loss": 0.1558,
"step": 510
},
{
"epoch": 0.3521245979346538,
"grad_norm": 1.2780678272247314,
"learning_rate": 4.578313253012048e-05,
"loss": 0.1426,
"step": 520
},
{
"epoch": 0.35889622481801253,
"grad_norm": 2.387918710708618,
"learning_rate": 4.5594879518072295e-05,
"loss": 0.1573,
"step": 530
},
{
"epoch": 0.3656678517013712,
"grad_norm": 1.4807089567184448,
"learning_rate": 4.5406626506024094e-05,
"loss": 0.1618,
"step": 540
},
{
"epoch": 0.37243947858473,
"grad_norm": 1.0865875482559204,
"learning_rate": 4.5218373493975907e-05,
"loss": 0.1439,
"step": 550
},
{
"epoch": 0.3792111054680887,
"grad_norm": 0.9812231063842773,
"learning_rate": 4.503012048192771e-05,
"loss": 0.1422,
"step": 560
},
{
"epoch": 0.3859827323514474,
"grad_norm": 0.9900213479995728,
"learning_rate": 4.4841867469879525e-05,
"loss": 0.1308,
"step": 570
},
{
"epoch": 0.39275435923480617,
"grad_norm": 1.0598454475402832,
"learning_rate": 4.465361445783133e-05,
"loss": 0.1565,
"step": 580
},
{
"epoch": 0.39952598611816487,
"grad_norm": 1.130131483078003,
"learning_rate": 4.446536144578313e-05,
"loss": 0.1432,
"step": 590
},
{
"epoch": 0.4062976130015236,
"grad_norm": 1.4709850549697876,
"learning_rate": 4.427710843373494e-05,
"loss": 0.1427,
"step": 600
},
{
"epoch": 0.41306923988488237,
"grad_norm": 1.2057602405548096,
"learning_rate": 4.408885542168675e-05,
"loss": 0.1377,
"step": 610
},
{
"epoch": 0.41984086676824106,
"grad_norm": 1.0241261720657349,
"learning_rate": 4.390060240963856e-05,
"loss": 0.1303,
"step": 620
},
{
"epoch": 0.4266124936515998,
"grad_norm": 0.70640629529953,
"learning_rate": 4.3712349397590366e-05,
"loss": 0.1284,
"step": 630
},
{
"epoch": 0.4333841205349585,
"grad_norm": 1.1402077674865723,
"learning_rate": 4.352409638554217e-05,
"loss": 0.1153,
"step": 640
},
{
"epoch": 0.44015574741831726,
"grad_norm": 2.3130173683166504,
"learning_rate": 4.333584337349398e-05,
"loss": 0.1353,
"step": 650
},
{
"epoch": 0.44692737430167595,
"grad_norm": 1.018059253692627,
"learning_rate": 4.3147590361445783e-05,
"loss": 0.1198,
"step": 660
},
{
"epoch": 0.4536990011850347,
"grad_norm": 1.1822158098220825,
"learning_rate": 4.2959337349397596e-05,
"loss": 0.1224,
"step": 670
},
{
"epoch": 0.46047062806839345,
"grad_norm": 0.8007648587226868,
"learning_rate": 4.27710843373494e-05,
"loss": 0.1156,
"step": 680
},
{
"epoch": 0.46724225495175215,
"grad_norm": 2.0608577728271484,
"learning_rate": 4.258283132530121e-05,
"loss": 0.1355,
"step": 690
},
{
"epoch": 0.4740138818351109,
"grad_norm": 0.8175686597824097,
"learning_rate": 4.239457831325301e-05,
"loss": 0.117,
"step": 700
},
{
"epoch": 0.4807855087184696,
"grad_norm": 0.9864722490310669,
"learning_rate": 4.220632530120482e-05,
"loss": 0.1143,
"step": 710
},
{
"epoch": 0.48755713560182834,
"grad_norm": 1.1170806884765625,
"learning_rate": 4.201807228915663e-05,
"loss": 0.1128,
"step": 720
},
{
"epoch": 0.4943287624851871,
"grad_norm": 0.8426876664161682,
"learning_rate": 4.182981927710844e-05,
"loss": 0.1039,
"step": 730
},
{
"epoch": 0.5011003893685458,
"grad_norm": 1.0189573764801025,
"learning_rate": 4.164156626506024e-05,
"loss": 0.1056,
"step": 740
},
{
"epoch": 0.5078720162519045,
"grad_norm": 1.0524259805679321,
"learning_rate": 4.145331325301205e-05,
"loss": 0.106,
"step": 750
},
{
"epoch": 0.5146436431352632,
"grad_norm": 0.8824974298477173,
"learning_rate": 4.126506024096386e-05,
"loss": 0.1077,
"step": 760
},
{
"epoch": 0.521415270018622,
"grad_norm": 1.034146785736084,
"learning_rate": 4.107680722891567e-05,
"loss": 0.1147,
"step": 770
},
{
"epoch": 0.5281868969019807,
"grad_norm": 1.1661494970321655,
"learning_rate": 4.088855421686747e-05,
"loss": 0.1159,
"step": 780
},
{
"epoch": 0.5349585237853395,
"grad_norm": 1.9664863348007202,
"learning_rate": 4.070030120481928e-05,
"loss": 0.1155,
"step": 790
},
{
"epoch": 0.5417301506686981,
"grad_norm": 0.9557604193687439,
"learning_rate": 4.0512048192771084e-05,
"loss": 0.1117,
"step": 800
},
{
"epoch": 0.5485017775520569,
"grad_norm": 1.1883480548858643,
"learning_rate": 4.03237951807229e-05,
"loss": 0.1001,
"step": 810
},
{
"epoch": 0.5552734044354156,
"grad_norm": 1.3493871688842773,
"learning_rate": 4.01355421686747e-05,
"loss": 0.1022,
"step": 820
},
{
"epoch": 0.5620450313187744,
"grad_norm": 0.8323326110839844,
"learning_rate": 3.994728915662651e-05,
"loss": 0.1084,
"step": 830
},
{
"epoch": 0.568816658202133,
"grad_norm": 1.3079185485839844,
"learning_rate": 3.9759036144578314e-05,
"loss": 0.1054,
"step": 840
},
{
"epoch": 0.5755882850854918,
"grad_norm": 0.961869478225708,
"learning_rate": 3.957078313253012e-05,
"loss": 0.1027,
"step": 850
},
{
"epoch": 0.5823599119688505,
"grad_norm": 0.7965933680534363,
"learning_rate": 3.938253012048193e-05,
"loss": 0.101,
"step": 860
},
{
"epoch": 0.5891315388522093,
"grad_norm": 0.5749910473823547,
"learning_rate": 3.919427710843374e-05,
"loss": 0.0942,
"step": 870
},
{
"epoch": 0.595903165735568,
"grad_norm": 1.314102292060852,
"learning_rate": 3.9006024096385544e-05,
"loss": 0.1062,
"step": 880
},
{
"epoch": 0.6026747926189266,
"grad_norm": 0.9225144982337952,
"learning_rate": 3.881777108433735e-05,
"loss": 0.0895,
"step": 890
},
{
"epoch": 0.6094464195022854,
"grad_norm": 1.0227304697036743,
"learning_rate": 3.8629518072289155e-05,
"loss": 0.1057,
"step": 900
},
{
"epoch": 0.6162180463856441,
"grad_norm": 0.9698050618171692,
"learning_rate": 3.844126506024097e-05,
"loss": 0.0996,
"step": 910
},
{
"epoch": 0.6229896732690029,
"grad_norm": 0.7296442985534668,
"learning_rate": 3.8253012048192774e-05,
"loss": 0.1035,
"step": 920
},
{
"epoch": 0.6297613001523616,
"grad_norm": 0.943133533000946,
"learning_rate": 3.806475903614458e-05,
"loss": 0.0892,
"step": 930
},
{
"epoch": 0.6365329270357203,
"grad_norm": 1.618379831314087,
"learning_rate": 3.7876506024096385e-05,
"loss": 0.0828,
"step": 940
},
{
"epoch": 0.643304553919079,
"grad_norm": 0.8589970469474792,
"learning_rate": 3.76882530120482e-05,
"loss": 0.1033,
"step": 950
},
{
"epoch": 0.6500761808024378,
"grad_norm": 0.7783709764480591,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0943,
"step": 960
},
{
"epoch": 0.6568478076857965,
"grad_norm": 1.0342212915420532,
"learning_rate": 3.731174698795181e-05,
"loss": 0.0979,
"step": 970
},
{
"epoch": 0.6636194345691553,
"grad_norm": 0.815461277961731,
"learning_rate": 3.7123493975903615e-05,
"loss": 0.092,
"step": 980
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.7246832251548767,
"learning_rate": 3.693524096385542e-05,
"loss": 0.0843,
"step": 990
},
{
"epoch": 0.6771626883358727,
"grad_norm": 0.9605166912078857,
"learning_rate": 3.674698795180723e-05,
"loss": 0.0835,
"step": 1000
},
{
"epoch": 0.6839343152192314,
"grad_norm": 1.1440260410308838,
"learning_rate": 3.655873493975904e-05,
"loss": 0.0886,
"step": 1010
},
{
"epoch": 0.6907059421025902,
"grad_norm": 1.0243327617645264,
"learning_rate": 3.6370481927710845e-05,
"loss": 0.089,
"step": 1020
},
{
"epoch": 0.6974775689859489,
"grad_norm": 0.8508220314979553,
"learning_rate": 3.618222891566265e-05,
"loss": 0.0795,
"step": 1030
},
{
"epoch": 0.7042491958693076,
"grad_norm": 0.852389931678772,
"learning_rate": 3.5993975903614456e-05,
"loss": 0.091,
"step": 1040
},
{
"epoch": 0.7110208227526663,
"grad_norm": 0.9550991058349609,
"learning_rate": 3.580572289156627e-05,
"loss": 0.0856,
"step": 1050
},
{
"epoch": 0.7177924496360251,
"grad_norm": 1.009476661682129,
"learning_rate": 3.5617469879518075e-05,
"loss": 0.084,
"step": 1060
},
{
"epoch": 0.7245640765193838,
"grad_norm": 0.8816896080970764,
"learning_rate": 3.542921686746988e-05,
"loss": 0.0873,
"step": 1070
},
{
"epoch": 0.7313357034027425,
"grad_norm": 0.8172402381896973,
"learning_rate": 3.5240963855421686e-05,
"loss": 0.0785,
"step": 1080
},
{
"epoch": 0.7381073302861012,
"grad_norm": 1.3699508905410767,
"learning_rate": 3.505271084337349e-05,
"loss": 0.0785,
"step": 1090
},
{
"epoch": 0.74487895716946,
"grad_norm": 0.6807621121406555,
"learning_rate": 3.4864457831325304e-05,
"loss": 0.0797,
"step": 1100
},
{
"epoch": 0.7516505840528187,
"grad_norm": 0.8369671106338501,
"learning_rate": 3.467620481927711e-05,
"loss": 0.0845,
"step": 1110
},
{
"epoch": 0.7584222109361775,
"grad_norm": 0.7025638818740845,
"learning_rate": 3.4487951807228916e-05,
"loss": 0.0699,
"step": 1120
},
{
"epoch": 0.7651938378195361,
"grad_norm": 0.6446508765220642,
"learning_rate": 3.429969879518072e-05,
"loss": 0.0879,
"step": 1130
},
{
"epoch": 0.7719654647028948,
"grad_norm": 1.6889147758483887,
"learning_rate": 3.4111445783132534e-05,
"loss": 0.0966,
"step": 1140
},
{
"epoch": 0.7787370915862536,
"grad_norm": 0.7355449199676514,
"learning_rate": 3.392319277108434e-05,
"loss": 0.0824,
"step": 1150
},
{
"epoch": 0.7855087184696123,
"grad_norm": 0.5761105418205261,
"learning_rate": 3.3734939759036146e-05,
"loss": 0.0759,
"step": 1160
},
{
"epoch": 0.7922803453529711,
"grad_norm": 0.7694604396820068,
"learning_rate": 3.354668674698795e-05,
"loss": 0.0839,
"step": 1170
},
{
"epoch": 0.7990519722363297,
"grad_norm": 0.8113671541213989,
"learning_rate": 3.335843373493976e-05,
"loss": 0.0799,
"step": 1180
},
{
"epoch": 0.8058235991196885,
"grad_norm": 0.6517748832702637,
"learning_rate": 3.317018072289157e-05,
"loss": 0.078,
"step": 1190
},
{
"epoch": 0.8125952260030472,
"grad_norm": 0.8274058103561401,
"learning_rate": 3.2981927710843376e-05,
"loss": 0.0789,
"step": 1200
},
{
"epoch": 0.819366852886406,
"grad_norm": 0.7922660112380981,
"learning_rate": 3.279367469879519e-05,
"loss": 0.0764,
"step": 1210
},
{
"epoch": 0.8261384797697647,
"grad_norm": 1.2404780387878418,
"learning_rate": 3.260542168674699e-05,
"loss": 0.0731,
"step": 1220
},
{
"epoch": 0.8329101066531234,
"grad_norm": 0.6905914545059204,
"learning_rate": 3.241716867469879e-05,
"loss": 0.0704,
"step": 1230
},
{
"epoch": 0.8396817335364821,
"grad_norm": 0.9580501914024353,
"learning_rate": 3.2228915662650605e-05,
"loss": 0.0806,
"step": 1240
},
{
"epoch": 0.8464533604198409,
"grad_norm": 0.4647589325904846,
"learning_rate": 3.204066265060241e-05,
"loss": 0.0749,
"step": 1250
},
{
"epoch": 0.8532249873031996,
"grad_norm": 0.7317982316017151,
"learning_rate": 3.1852409638554224e-05,
"loss": 0.0739,
"step": 1260
},
{
"epoch": 0.8599966141865584,
"grad_norm": 0.7131925821304321,
"learning_rate": 3.166415662650602e-05,
"loss": 0.0742,
"step": 1270
},
{
"epoch": 0.866768241069917,
"grad_norm": 0.42962631583213806,
"learning_rate": 3.147590361445783e-05,
"loss": 0.0595,
"step": 1280
},
{
"epoch": 0.8735398679532758,
"grad_norm": 1.0177807807922363,
"learning_rate": 3.128765060240964e-05,
"loss": 0.0815,
"step": 1290
},
{
"epoch": 0.8803114948366345,
"grad_norm": 0.7908213138580322,
"learning_rate": 3.1099397590361447e-05,
"loss": 0.0758,
"step": 1300
},
{
"epoch": 0.8870831217199933,
"grad_norm": 0.7449843883514404,
"learning_rate": 3.091114457831326e-05,
"loss": 0.0789,
"step": 1310
},
{
"epoch": 0.8938547486033519,
"grad_norm": 0.8616693615913391,
"learning_rate": 3.072289156626506e-05,
"loss": 0.0653,
"step": 1320
},
{
"epoch": 0.9006263754867107,
"grad_norm": 1.0630964040756226,
"learning_rate": 3.053463855421687e-05,
"loss": 0.081,
"step": 1330
},
{
"epoch": 0.9073980023700694,
"grad_norm": 0.7529353499412537,
"learning_rate": 3.0346385542168676e-05,
"loss": 0.0753,
"step": 1340
},
{
"epoch": 0.9141696292534282,
"grad_norm": 0.7995216846466064,
"learning_rate": 3.0158132530120482e-05,
"loss": 0.066,
"step": 1350
},
{
"epoch": 0.9209412561367869,
"grad_norm": 0.3852786719799042,
"learning_rate": 2.996987951807229e-05,
"loss": 0.07,
"step": 1360
},
{
"epoch": 0.9277128830201455,
"grad_norm": 0.6470819711685181,
"learning_rate": 2.9781626506024097e-05,
"loss": 0.0815,
"step": 1370
},
{
"epoch": 0.9344845099035043,
"grad_norm": 0.9942120313644409,
"learning_rate": 2.9593373493975906e-05,
"loss": 0.0749,
"step": 1380
},
{
"epoch": 0.941256136786863,
"grad_norm": 1.0045307874679565,
"learning_rate": 2.9405120481927712e-05,
"loss": 0.0797,
"step": 1390
},
{
"epoch": 0.9480277636702218,
"grad_norm": 0.5135801434516907,
"learning_rate": 2.921686746987952e-05,
"loss": 0.0647,
"step": 1400
},
{
"epoch": 0.9547993905535805,
"grad_norm": 1.0146623849868774,
"learning_rate": 2.9028614457831327e-05,
"loss": 0.078,
"step": 1410
},
{
"epoch": 0.9615710174369392,
"grad_norm": 0.730907142162323,
"learning_rate": 2.8840361445783133e-05,
"loss": 0.0671,
"step": 1420
},
{
"epoch": 0.9683426443202979,
"grad_norm": 1.0035794973373413,
"learning_rate": 2.8652108433734942e-05,
"loss": 0.0804,
"step": 1430
},
{
"epoch": 0.9751142712036567,
"grad_norm": 0.6765666604042053,
"learning_rate": 2.8463855421686748e-05,
"loss": 0.0691,
"step": 1440
},
{
"epoch": 0.9818858980870154,
"grad_norm": 0.8572028279304504,
"learning_rate": 2.8275602409638557e-05,
"loss": 0.0716,
"step": 1450
},
{
"epoch": 0.9886575249703742,
"grad_norm": 0.4426514208316803,
"learning_rate": 2.8087349397590362e-05,
"loss": 0.0709,
"step": 1460
},
{
"epoch": 0.9954291518537328,
"grad_norm": 0.5503798723220825,
"learning_rate": 2.789909638554217e-05,
"loss": 0.0709,
"step": 1470
},
{
"epoch": 0.9994921279837481,
"eval_accuracy": 0.9789523809523809,
"eval_loss": 0.07530121505260468,
"eval_runtime": 1028.8877,
"eval_samples_per_second": 20.41,
"eval_steps_per_second": 0.639,
"step": 1476
},
{
"epoch": 1.0022007787370917,
"grad_norm": 0.808159589767456,
"learning_rate": 2.7710843373493977e-05,
"loss": 0.0653,
"step": 1480
},
{
"epoch": 1.0089724056204503,
"grad_norm": 1.711690902709961,
"learning_rate": 2.7522590361445783e-05,
"loss": 0.0712,
"step": 1490
},
{
"epoch": 1.015744032503809,
"grad_norm": 0.584488570690155,
"learning_rate": 2.7334337349397592e-05,
"loss": 0.0748,
"step": 1500
},
{
"epoch": 1.0225156593871678,
"grad_norm": 0.6118115186691284,
"learning_rate": 2.7146084337349398e-05,
"loss": 0.0663,
"step": 1510
},
{
"epoch": 1.0292872862705265,
"grad_norm": 0.7841724753379822,
"learning_rate": 2.6957831325301207e-05,
"loss": 0.0664,
"step": 1520
},
{
"epoch": 1.0360589131538853,
"grad_norm": 0.6936156153678894,
"learning_rate": 2.6769578313253013e-05,
"loss": 0.0642,
"step": 1530
},
{
"epoch": 1.042830540037244,
"grad_norm": 0.7428813576698303,
"learning_rate": 2.658132530120482e-05,
"loss": 0.0694,
"step": 1540
},
{
"epoch": 1.0496021669206026,
"grad_norm": 1.0296515226364136,
"learning_rate": 2.6393072289156628e-05,
"loss": 0.0754,
"step": 1550
},
{
"epoch": 1.0563737938039615,
"grad_norm": 0.6250594258308411,
"learning_rate": 2.6204819277108434e-05,
"loss": 0.0646,
"step": 1560
},
{
"epoch": 1.06314542068732,
"grad_norm": 1.5104742050170898,
"learning_rate": 2.6016566265060243e-05,
"loss": 0.073,
"step": 1570
},
{
"epoch": 1.069917047570679,
"grad_norm": 0.8500051498413086,
"learning_rate": 2.582831325301205e-05,
"loss": 0.0677,
"step": 1580
},
{
"epoch": 1.0766886744540376,
"grad_norm": 1.0728169679641724,
"learning_rate": 2.5640060240963858e-05,
"loss": 0.0692,
"step": 1590
},
{
"epoch": 1.0834603013373962,
"grad_norm": 0.7029580473899841,
"learning_rate": 2.5451807228915663e-05,
"loss": 0.0681,
"step": 1600
},
{
"epoch": 1.090231928220755,
"grad_norm": 0.7941862344741821,
"learning_rate": 2.526355421686747e-05,
"loss": 0.0727,
"step": 1610
},
{
"epoch": 1.0970035551041137,
"grad_norm": 0.8318243026733398,
"learning_rate": 2.5075301204819278e-05,
"loss": 0.0764,
"step": 1620
},
{
"epoch": 1.1037751819874724,
"grad_norm": 0.9039504528045654,
"learning_rate": 2.4887048192771087e-05,
"loss": 0.0817,
"step": 1630
},
{
"epoch": 1.1105468088708312,
"grad_norm": 0.8218241930007935,
"learning_rate": 2.4698795180722893e-05,
"loss": 0.0705,
"step": 1640
},
{
"epoch": 1.1173184357541899,
"grad_norm": 0.5053806900978088,
"learning_rate": 2.45105421686747e-05,
"loss": 0.0738,
"step": 1650
},
{
"epoch": 1.1240900626375487,
"grad_norm": 0.5837478637695312,
"learning_rate": 2.4322289156626508e-05,
"loss": 0.0623,
"step": 1660
},
{
"epoch": 1.1308616895209074,
"grad_norm": 0.6927766799926758,
"learning_rate": 2.4134036144578314e-05,
"loss": 0.0692,
"step": 1670
},
{
"epoch": 1.137633316404266,
"grad_norm": 0.8067636489868164,
"learning_rate": 2.3945783132530123e-05,
"loss": 0.0716,
"step": 1680
},
{
"epoch": 1.1444049432876249,
"grad_norm": 0.8656367063522339,
"learning_rate": 2.375753012048193e-05,
"loss": 0.0648,
"step": 1690
},
{
"epoch": 1.1511765701709835,
"grad_norm": 0.5790455341339111,
"learning_rate": 2.3569277108433734e-05,
"loss": 0.0682,
"step": 1700
},
{
"epoch": 1.1579481970543424,
"grad_norm": 0.9087624549865723,
"learning_rate": 2.3381024096385544e-05,
"loss": 0.0606,
"step": 1710
},
{
"epoch": 1.164719823937701,
"grad_norm": 0.6014874577522278,
"learning_rate": 2.319277108433735e-05,
"loss": 0.078,
"step": 1720
},
{
"epoch": 1.1714914508210597,
"grad_norm": 0.6091852784156799,
"learning_rate": 2.300451807228916e-05,
"loss": 0.0647,
"step": 1730
},
{
"epoch": 1.1782630777044185,
"grad_norm": 0.581322431564331,
"learning_rate": 2.2816265060240964e-05,
"loss": 0.0712,
"step": 1740
},
{
"epoch": 1.1850347045877772,
"grad_norm": 0.4964611530303955,
"learning_rate": 2.2628012048192773e-05,
"loss": 0.0524,
"step": 1750
},
{
"epoch": 1.191806331471136,
"grad_norm": 0.8920990228652954,
"learning_rate": 2.243975903614458e-05,
"loss": 0.0719,
"step": 1760
},
{
"epoch": 1.1985779583544947,
"grad_norm": 0.6709417104721069,
"learning_rate": 2.2251506024096385e-05,
"loss": 0.0695,
"step": 1770
},
{
"epoch": 1.2053495852378533,
"grad_norm": 0.7828611731529236,
"learning_rate": 2.2063253012048194e-05,
"loss": 0.0742,
"step": 1780
},
{
"epoch": 1.2121212121212122,
"grad_norm": 1.2962734699249268,
"learning_rate": 2.1875e-05,
"loss": 0.0743,
"step": 1790
},
{
"epoch": 1.2188928390045708,
"grad_norm": 0.5535785555839539,
"learning_rate": 2.168674698795181e-05,
"loss": 0.064,
"step": 1800
},
{
"epoch": 1.2256644658879297,
"grad_norm": 1.0534149408340454,
"learning_rate": 2.1498493975903615e-05,
"loss": 0.0594,
"step": 1810
},
{
"epoch": 1.2324360927712883,
"grad_norm": 0.8949538469314575,
"learning_rate": 2.1310240963855424e-05,
"loss": 0.0607,
"step": 1820
},
{
"epoch": 1.239207719654647,
"grad_norm": 0.7287763357162476,
"learning_rate": 2.112198795180723e-05,
"loss": 0.076,
"step": 1830
},
{
"epoch": 1.2459793465380058,
"grad_norm": 1.2560794353485107,
"learning_rate": 2.0933734939759035e-05,
"loss": 0.0662,
"step": 1840
},
{
"epoch": 1.2527509734213644,
"grad_norm": 0.4771580398082733,
"learning_rate": 2.0745481927710844e-05,
"loss": 0.0541,
"step": 1850
},
{
"epoch": 1.2595226003047233,
"grad_norm": 0.9189515113830566,
"learning_rate": 2.055722891566265e-05,
"loss": 0.058,
"step": 1860
},
{
"epoch": 1.266294227188082,
"grad_norm": 1.0164196491241455,
"learning_rate": 2.036897590361446e-05,
"loss": 0.0604,
"step": 1870
},
{
"epoch": 1.2730658540714406,
"grad_norm": 0.8721578121185303,
"learning_rate": 2.018072289156627e-05,
"loss": 0.0731,
"step": 1880
},
{
"epoch": 1.2798374809547994,
"grad_norm": 0.8412027955055237,
"learning_rate": 1.999246987951807e-05,
"loss": 0.0636,
"step": 1890
},
{
"epoch": 1.286609107838158,
"grad_norm": 0.784599244594574,
"learning_rate": 1.980421686746988e-05,
"loss": 0.0636,
"step": 1900
},
{
"epoch": 1.293380734721517,
"grad_norm": 0.5514250993728638,
"learning_rate": 1.9615963855421686e-05,
"loss": 0.0677,
"step": 1910
},
{
"epoch": 1.3001523616048756,
"grad_norm": 0.6008131504058838,
"learning_rate": 1.9427710843373495e-05,
"loss": 0.0637,
"step": 1920
},
{
"epoch": 1.3069239884882342,
"grad_norm": 0.9827722311019897,
"learning_rate": 1.9239457831325304e-05,
"loss": 0.0581,
"step": 1930
},
{
"epoch": 1.313695615371593,
"grad_norm": 0.699483335018158,
"learning_rate": 1.905120481927711e-05,
"loss": 0.0651,
"step": 1940
},
{
"epoch": 1.3204672422549517,
"grad_norm": 0.7085596919059753,
"learning_rate": 1.8862951807228916e-05,
"loss": 0.0589,
"step": 1950
},
{
"epoch": 1.3272388691383106,
"grad_norm": 1.0991511344909668,
"learning_rate": 1.867469879518072e-05,
"loss": 0.0699,
"step": 1960
},
{
"epoch": 1.3340104960216692,
"grad_norm": 0.8005927801132202,
"learning_rate": 1.848644578313253e-05,
"loss": 0.0673,
"step": 1970
},
{
"epoch": 1.3407821229050279,
"grad_norm": 0.7139153480529785,
"learning_rate": 1.829819277108434e-05,
"loss": 0.0571,
"step": 1980
},
{
"epoch": 1.3475537497883867,
"grad_norm": 0.4111141562461853,
"learning_rate": 1.8109939759036145e-05,
"loss": 0.0587,
"step": 1990
},
{
"epoch": 1.3543253766717454,
"grad_norm": 0.5993856191635132,
"learning_rate": 1.7921686746987955e-05,
"loss": 0.0649,
"step": 2000
},
{
"epoch": 1.3610970035551042,
"grad_norm": 0.9376055598258972,
"learning_rate": 1.773343373493976e-05,
"loss": 0.0672,
"step": 2010
},
{
"epoch": 1.3678686304384629,
"grad_norm": 0.4629497230052948,
"learning_rate": 1.7545180722891566e-05,
"loss": 0.0517,
"step": 2020
},
{
"epoch": 1.3746402573218215,
"grad_norm": 0.4839510917663574,
"learning_rate": 1.7356927710843375e-05,
"loss": 0.0524,
"step": 2030
},
{
"epoch": 1.3814118842051804,
"grad_norm": 0.5999952554702759,
"learning_rate": 1.716867469879518e-05,
"loss": 0.056,
"step": 2040
},
{
"epoch": 1.388183511088539,
"grad_norm": 0.5760485529899597,
"learning_rate": 1.698042168674699e-05,
"loss": 0.0663,
"step": 2050
},
{
"epoch": 1.3949551379718979,
"grad_norm": 0.7293563485145569,
"learning_rate": 1.6792168674698796e-05,
"loss": 0.0649,
"step": 2060
},
{
"epoch": 1.4017267648552565,
"grad_norm": 0.6858052611351013,
"learning_rate": 1.6603915662650605e-05,
"loss": 0.0545,
"step": 2070
},
{
"epoch": 1.4084983917386151,
"grad_norm": 0.9105307459831238,
"learning_rate": 1.641566265060241e-05,
"loss": 0.0616,
"step": 2080
},
{
"epoch": 1.415270018621974,
"grad_norm": 0.4213721752166748,
"learning_rate": 1.6227409638554216e-05,
"loss": 0.0647,
"step": 2090
},
{
"epoch": 1.4220416455053326,
"grad_norm": 0.5692495703697205,
"learning_rate": 1.6039156626506026e-05,
"loss": 0.0602,
"step": 2100
},
{
"epoch": 1.4288132723886915,
"grad_norm": 0.70749431848526,
"learning_rate": 1.585090361445783e-05,
"loss": 0.0638,
"step": 2110
},
{
"epoch": 1.4355848992720501,
"grad_norm": 0.7523058652877808,
"learning_rate": 1.566265060240964e-05,
"loss": 0.0565,
"step": 2120
},
{
"epoch": 1.4423565261554088,
"grad_norm": 0.5956985354423523,
"learning_rate": 1.5474397590361446e-05,
"loss": 0.0584,
"step": 2130
},
{
"epoch": 1.4491281530387676,
"grad_norm": 0.7276691794395447,
"learning_rate": 1.5286144578313255e-05,
"loss": 0.0589,
"step": 2140
},
{
"epoch": 1.4558997799221263,
"grad_norm": 0.9040044546127319,
"learning_rate": 1.509789156626506e-05,
"loss": 0.0635,
"step": 2150
},
{
"epoch": 1.4626714068054851,
"grad_norm": 0.7457456588745117,
"learning_rate": 1.4909638554216867e-05,
"loss": 0.0654,
"step": 2160
},
{
"epoch": 1.4694430336888438,
"grad_norm": 0.7019338011741638,
"learning_rate": 1.4721385542168676e-05,
"loss": 0.0641,
"step": 2170
},
{
"epoch": 1.4762146605722024,
"grad_norm": 0.8061505556106567,
"learning_rate": 1.4533132530120484e-05,
"loss": 0.0573,
"step": 2180
},
{
"epoch": 1.4829862874555613,
"grad_norm": 0.6644711494445801,
"learning_rate": 1.4344879518072291e-05,
"loss": 0.0654,
"step": 2190
},
{
"epoch": 1.48975791433892,
"grad_norm": 0.8262456059455872,
"learning_rate": 1.4156626506024098e-05,
"loss": 0.0642,
"step": 2200
},
{
"epoch": 1.4965295412222788,
"grad_norm": 0.9707741737365723,
"learning_rate": 1.3968373493975902e-05,
"loss": 0.0629,
"step": 2210
},
{
"epoch": 1.5033011681056374,
"grad_norm": 1.0274876356124878,
"learning_rate": 1.378012048192771e-05,
"loss": 0.0584,
"step": 2220
},
{
"epoch": 1.510072794988996,
"grad_norm": 0.5561116933822632,
"learning_rate": 1.3591867469879519e-05,
"loss": 0.0703,
"step": 2230
},
{
"epoch": 1.5168444218723547,
"grad_norm": 0.7350441217422485,
"learning_rate": 1.3403614457831327e-05,
"loss": 0.0679,
"step": 2240
},
{
"epoch": 1.5236160487557135,
"grad_norm": 1.2632744312286377,
"learning_rate": 1.3215361445783134e-05,
"loss": 0.0572,
"step": 2250
},
{
"epoch": 1.5303876756390724,
"grad_norm": 0.970457911491394,
"learning_rate": 1.3027108433734941e-05,
"loss": 0.057,
"step": 2260
},
{
"epoch": 1.537159302522431,
"grad_norm": 0.8355304598808289,
"learning_rate": 1.2838855421686745e-05,
"loss": 0.0543,
"step": 2270
},
{
"epoch": 1.5439309294057897,
"grad_norm": 0.61915522813797,
"learning_rate": 1.2650602409638555e-05,
"loss": 0.0623,
"step": 2280
},
{
"epoch": 1.5507025562891483,
"grad_norm": 0.5436218976974487,
"learning_rate": 1.2462349397590362e-05,
"loss": 0.0621,
"step": 2290
},
{
"epoch": 1.5574741831725072,
"grad_norm": 0.8874384164810181,
"learning_rate": 1.227409638554217e-05,
"loss": 0.0598,
"step": 2300
},
{
"epoch": 1.564245810055866,
"grad_norm": 0.6226460933685303,
"learning_rate": 1.2085843373493977e-05,
"loss": 0.0549,
"step": 2310
},
{
"epoch": 1.5710174369392247,
"grad_norm": 0.648560643196106,
"learning_rate": 1.1897590361445783e-05,
"loss": 0.0581,
"step": 2320
},
{
"epoch": 1.5777890638225833,
"grad_norm": 0.6028856039047241,
"learning_rate": 1.170933734939759e-05,
"loss": 0.0628,
"step": 2330
},
{
"epoch": 1.584560690705942,
"grad_norm": 0.5027485489845276,
"learning_rate": 1.1521084337349398e-05,
"loss": 0.0655,
"step": 2340
},
{
"epoch": 1.5913323175893008,
"grad_norm": 1.5245540142059326,
"learning_rate": 1.1332831325301205e-05,
"loss": 0.0637,
"step": 2350
},
{
"epoch": 1.5981039444726597,
"grad_norm": 0.7036380767822266,
"learning_rate": 1.1144578313253013e-05,
"loss": 0.0582,
"step": 2360
},
{
"epoch": 1.6048755713560183,
"grad_norm": 0.7167279124259949,
"learning_rate": 1.095632530120482e-05,
"loss": 0.0542,
"step": 2370
},
{
"epoch": 1.611647198239377,
"grad_norm": 0.9925076365470886,
"learning_rate": 1.0768072289156627e-05,
"loss": 0.0521,
"step": 2380
},
{
"epoch": 1.6184188251227356,
"grad_norm": 0.8511892557144165,
"learning_rate": 1.0579819277108433e-05,
"loss": 0.0603,
"step": 2390
},
{
"epoch": 1.6251904520060945,
"grad_norm": 1.3457633256912231,
"learning_rate": 1.0391566265060242e-05,
"loss": 0.0654,
"step": 2400
},
{
"epoch": 1.6319620788894533,
"grad_norm": 0.8031491041183472,
"learning_rate": 1.020331325301205e-05,
"loss": 0.0526,
"step": 2410
},
{
"epoch": 1.638733705772812,
"grad_norm": 0.6882662773132324,
"learning_rate": 1.0015060240963856e-05,
"loss": 0.0585,
"step": 2420
},
{
"epoch": 1.6455053326561706,
"grad_norm": 0.5608332753181458,
"learning_rate": 9.826807228915663e-06,
"loss": 0.0613,
"step": 2430
},
{
"epoch": 1.6522769595395292,
"grad_norm": 0.512611985206604,
"learning_rate": 9.63855421686747e-06,
"loss": 0.0535,
"step": 2440
},
{
"epoch": 1.659048586422888,
"grad_norm": 0.48724594712257385,
"learning_rate": 9.450301204819278e-06,
"loss": 0.0554,
"step": 2450
},
{
"epoch": 1.665820213306247,
"grad_norm": 0.7341836094856262,
"learning_rate": 9.262048192771085e-06,
"loss": 0.0598,
"step": 2460
},
{
"epoch": 1.6725918401896056,
"grad_norm": 0.5964226722717285,
"learning_rate": 9.073795180722893e-06,
"loss": 0.0742,
"step": 2470
},
{
"epoch": 1.6793634670729642,
"grad_norm": 1.4178098440170288,
"learning_rate": 8.885542168674699e-06,
"loss": 0.0716,
"step": 2480
},
{
"epoch": 1.6861350939563229,
"grad_norm": 0.7655882239341736,
"learning_rate": 8.697289156626506e-06,
"loss": 0.0509,
"step": 2490
},
{
"epoch": 1.6929067208396817,
"grad_norm": 0.7825894355773926,
"learning_rate": 8.509036144578313e-06,
"loss": 0.056,
"step": 2500
},
{
"epoch": 1.6996783477230406,
"grad_norm": 0.7740017771720886,
"learning_rate": 8.320783132530121e-06,
"loss": 0.0669,
"step": 2510
},
{
"epoch": 1.7064499746063992,
"grad_norm": 0.4478660523891449,
"learning_rate": 8.132530120481928e-06,
"loss": 0.0711,
"step": 2520
},
{
"epoch": 1.7132216014897579,
"grad_norm": 0.9198179244995117,
"learning_rate": 7.944277108433736e-06,
"loss": 0.0534,
"step": 2530
},
{
"epoch": 1.7199932283731165,
"grad_norm": 0.5813512802124023,
"learning_rate": 7.756024096385543e-06,
"loss": 0.0612,
"step": 2540
},
{
"epoch": 1.7267648552564754,
"grad_norm": 0.6739583611488342,
"learning_rate": 7.56777108433735e-06,
"loss": 0.0617,
"step": 2550
},
{
"epoch": 1.7335364821398342,
"grad_norm": 0.6534674167633057,
"learning_rate": 7.379518072289157e-06,
"loss": 0.0639,
"step": 2560
},
{
"epoch": 1.7403081090231929,
"grad_norm": 0.6965113878250122,
"learning_rate": 7.191265060240965e-06,
"loss": 0.0543,
"step": 2570
},
{
"epoch": 1.7470797359065515,
"grad_norm": 0.7045726776123047,
"learning_rate": 7.003012048192771e-06,
"loss": 0.0557,
"step": 2580
},
{
"epoch": 1.7538513627899102,
"grad_norm": 0.7541308403015137,
"learning_rate": 6.814759036144579e-06,
"loss": 0.0609,
"step": 2590
},
{
"epoch": 1.760622989673269,
"grad_norm": 1.2629750967025757,
"learning_rate": 6.626506024096386e-06,
"loss": 0.055,
"step": 2600
},
{
"epoch": 1.7673946165566279,
"grad_norm": 0.8643785119056702,
"learning_rate": 6.438253012048193e-06,
"loss": 0.0675,
"step": 2610
},
{
"epoch": 1.7741662434399865,
"grad_norm": 0.7467519640922546,
"learning_rate": 6.25e-06,
"loss": 0.0641,
"step": 2620
},
{
"epoch": 1.7809378703233452,
"grad_norm": 0.4777075946331024,
"learning_rate": 6.061746987951807e-06,
"loss": 0.0605,
"step": 2630
},
{
"epoch": 1.7877094972067038,
"grad_norm": 0.6084995865821838,
"learning_rate": 5.873493975903615e-06,
"loss": 0.0545,
"step": 2640
},
{
"epoch": 1.7944811240900627,
"grad_norm": 0.9602014422416687,
"learning_rate": 5.685240963855422e-06,
"loss": 0.0484,
"step": 2650
},
{
"epoch": 1.8012527509734215,
"grad_norm": 0.7709717154502869,
"learning_rate": 5.496987951807229e-06,
"loss": 0.0637,
"step": 2660
},
{
"epoch": 1.8080243778567802,
"grad_norm": 0.422376424074173,
"learning_rate": 5.308734939759037e-06,
"loss": 0.0659,
"step": 2670
},
{
"epoch": 1.8147960047401388,
"grad_norm": 0.4001966118812561,
"learning_rate": 5.120481927710843e-06,
"loss": 0.0609,
"step": 2680
},
{
"epoch": 1.8215676316234974,
"grad_norm": 0.4686823785305023,
"learning_rate": 4.932228915662651e-06,
"loss": 0.0588,
"step": 2690
},
{
"epoch": 1.8283392585068563,
"grad_norm": 0.4594326615333557,
"learning_rate": 4.743975903614458e-06,
"loss": 0.0558,
"step": 2700
},
{
"epoch": 1.8351108853902152,
"grad_norm": 0.7824903130531311,
"learning_rate": 4.555722891566265e-06,
"loss": 0.0536,
"step": 2710
},
{
"epoch": 1.8418825122735738,
"grad_norm": 0.6944392323493958,
"learning_rate": 4.367469879518073e-06,
"loss": 0.0653,
"step": 2720
},
{
"epoch": 1.8486541391569324,
"grad_norm": 0.665273129940033,
"learning_rate": 4.17921686746988e-06,
"loss": 0.0544,
"step": 2730
},
{
"epoch": 1.855425766040291,
"grad_norm": 0.6628099083900452,
"learning_rate": 3.990963855421686e-06,
"loss": 0.056,
"step": 2740
},
{
"epoch": 1.86219739292365,
"grad_norm": 0.79542475938797,
"learning_rate": 3.802710843373494e-06,
"loss": 0.0549,
"step": 2750
},
{
"epoch": 1.8689690198070086,
"grad_norm": 0.7736166715621948,
"learning_rate": 3.614457831325301e-06,
"loss": 0.0606,
"step": 2760
},
{
"epoch": 1.8757406466903674,
"grad_norm": 1.0499205589294434,
"learning_rate": 3.426204819277109e-06,
"loss": 0.0656,
"step": 2770
},
{
"epoch": 1.882512273573726,
"grad_norm": 0.831328809261322,
"learning_rate": 3.2379518072289157e-06,
"loss": 0.0575,
"step": 2780
},
{
"epoch": 1.8892839004570847,
"grad_norm": 0.7741392254829407,
"learning_rate": 3.049698795180723e-06,
"loss": 0.0493,
"step": 2790
},
{
"epoch": 1.8960555273404436,
"grad_norm": 0.5156965851783752,
"learning_rate": 2.86144578313253e-06,
"loss": 0.0574,
"step": 2800
},
{
"epoch": 1.9028271542238022,
"grad_norm": 0.7923727035522461,
"learning_rate": 2.6731927710843376e-06,
"loss": 0.0496,
"step": 2810
},
{
"epoch": 1.909598781107161,
"grad_norm": 0.6281595230102539,
"learning_rate": 2.4849397590361446e-06,
"loss": 0.0564,
"step": 2820
},
{
"epoch": 1.9163704079905197,
"grad_norm": 0.475424200296402,
"learning_rate": 2.296686746987952e-06,
"loss": 0.0518,
"step": 2830
},
{
"epoch": 1.9231420348738784,
"grad_norm": 0.808529794216156,
"learning_rate": 2.108433734939759e-06,
"loss": 0.0519,
"step": 2840
},
{
"epoch": 1.9299136617572372,
"grad_norm": 0.9661712050437927,
"learning_rate": 1.920180722891566e-06,
"loss": 0.057,
"step": 2850
},
{
"epoch": 1.9366852886405959,
"grad_norm": 0.7859961986541748,
"learning_rate": 1.7319277108433736e-06,
"loss": 0.0597,
"step": 2860
},
{
"epoch": 1.9434569155239547,
"grad_norm": 0.8376522660255432,
"learning_rate": 1.5436746987951808e-06,
"loss": 0.0562,
"step": 2870
},
{
"epoch": 1.9502285424073134,
"grad_norm": 1.249200463294983,
"learning_rate": 1.355421686746988e-06,
"loss": 0.0582,
"step": 2880
},
{
"epoch": 1.957000169290672,
"grad_norm": 0.6086472272872925,
"learning_rate": 1.167168674698795e-06,
"loss": 0.0575,
"step": 2890
},
{
"epoch": 1.9637717961740309,
"grad_norm": 0.5852237939834595,
"learning_rate": 9.789156626506025e-07,
"loss": 0.0523,
"step": 2900
},
{
"epoch": 1.9705434230573895,
"grad_norm": 0.5044909119606018,
"learning_rate": 7.906626506024097e-07,
"loss": 0.0599,
"step": 2910
},
{
"epoch": 1.9773150499407484,
"grad_norm": 0.5792267918586731,
"learning_rate": 6.024096385542169e-07,
"loss": 0.0514,
"step": 2920
},
{
"epoch": 1.984086676824107,
"grad_norm": 0.6914946436882019,
"learning_rate": 4.1415662650602414e-07,
"loss": 0.0582,
"step": 2930
},
{
"epoch": 1.9908583037074656,
"grad_norm": 1.0799570083618164,
"learning_rate": 2.2590361445783133e-07,
"loss": 0.0467,
"step": 2940
},
{
"epoch": 1.9976299305908245,
"grad_norm": 0.7630107998847961,
"learning_rate": 3.7650602409638556e-08,
"loss": 0.0634,
"step": 2950
},
{
"epoch": 1.9989842559674962,
"eval_accuracy": 0.9820952380952381,
"eval_loss": 0.062325455248355865,
"eval_runtime": 751.2071,
"eval_samples_per_second": 27.955,
"eval_steps_per_second": 0.875,
"step": 2952
},
{
"epoch": 1.9989842559674962,
"step": 2952,
"total_flos": 1.3566159569165903e+20,
"train_loss": 0.14182275843612224,
"train_runtime": 17938.7006,
"train_samples_per_second": 21.072,
"train_steps_per_second": 0.165
}
],
"logging_steps": 10,
"max_steps": 2952,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3566159569165903e+20,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}