Glot500-m-iuseg / trainer_state.json
matsten's picture
Upload 12 files
5c666cf verified
{
"best_metric": 0.9824038982133189,
"best_model_checkpoint": "/p/project/trustllm-eu/stenlund1/LLMSegm_iu/out/glot500-iu-morph-unamb-sup-6/checkpoint-1500",
"epoch": 4.132231404958677,
"eval_steps": 50,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06887052341597796,
"grad_norm": 0.25278759002685547,
"learning_rate": 1.9944289693593316e-05,
"loss": 0.6929,
"step": 25
},
{
"epoch": 0.13774104683195593,
"grad_norm": 3.6986641883850098,
"learning_rate": 1.966573816155989e-05,
"loss": 0.6275,
"step": 50
},
{
"epoch": 0.13774104683195593,
"eval_accuracy": 0.7771941343838914,
"eval_f1": 0.7581564776686728,
"eval_loss": 0.49594685435295105,
"eval_precision": 0.6763687742847051,
"eval_recall": 0.8624448247905594,
"eval_runtime": 4.0112,
"eval_samples_per_second": 6834.373,
"eval_steps_per_second": 6.731,
"step": 50
},
{
"epoch": 0.2066115702479339,
"grad_norm": 3.397982120513916,
"learning_rate": 1.9387186629526465e-05,
"loss": 0.4712,
"step": 75
},
{
"epoch": 0.27548209366391185,
"grad_norm": 3.2428324222564697,
"learning_rate": 1.910863509749304e-05,
"loss": 0.3489,
"step": 100
},
{
"epoch": 0.27548209366391185,
"eval_accuracy": 0.9177062814620267,
"eval_f1": 0.898533777098138,
"eval_loss": 0.24391423165798187,
"eval_precision": 0.8972424324081559,
"eval_recall": 0.8998288442482659,
"eval_runtime": 3.5257,
"eval_samples_per_second": 7775.49,
"eval_steps_per_second": 7.658,
"step": 100
},
{
"epoch": 0.3443526170798898,
"grad_norm": 5.30737829208374,
"learning_rate": 1.883008356545961e-05,
"loss": 0.2683,
"step": 125
},
{
"epoch": 0.4132231404958678,
"grad_norm": 3.5402088165283203,
"learning_rate": 1.8551532033426184e-05,
"loss": 0.2206,
"step": 150
},
{
"epoch": 0.4132231404958678,
"eval_accuracy": 0.9466695848836361,
"eval_f1": 0.9355492858402398,
"eval_loss": 0.15823155641555786,
"eval_precision": 0.916083916083916,
"eval_recall": 0.9558598324475273,
"eval_runtime": 3.8577,
"eval_samples_per_second": 7106.225,
"eval_steps_per_second": 6.999,
"step": 150
},
{
"epoch": 0.4820936639118457,
"grad_norm": 3.316082239151001,
"learning_rate": 1.827298050139276e-05,
"loss": 0.1807,
"step": 175
},
{
"epoch": 0.5509641873278237,
"grad_norm": 2.592745065689087,
"learning_rate": 1.7994428969359333e-05,
"loss": 0.1541,
"step": 200
},
{
"epoch": 0.5509641873278237,
"eval_accuracy": 0.9585977967461881,
"eval_f1": 0.9495398568443516,
"eval_loss": 0.12743310630321503,
"eval_precision": 0.9374122191011236,
"eval_recall": 0.9619854067201153,
"eval_runtime": 3.4893,
"eval_samples_per_second": 7856.68,
"eval_steps_per_second": 7.738,
"step": 200
},
{
"epoch": 0.6198347107438017,
"grad_norm": 2.1011128425598145,
"learning_rate": 1.7715877437325907e-05,
"loss": 0.1369,
"step": 225
},
{
"epoch": 0.6887052341597796,
"grad_norm": 1.8149715662002563,
"learning_rate": 1.743732590529248e-05,
"loss": 0.1258,
"step": 250
},
{
"epoch": 0.6887052341597796,
"eval_accuracy": 0.9647990078062304,
"eval_f1": 0.9563289134271621,
"eval_loss": 0.10787822306156158,
"eval_precision": 0.9608948708621317,
"eval_recall": 0.9518061435906675,
"eval_runtime": 3.4856,
"eval_samples_per_second": 7864.85,
"eval_steps_per_second": 7.746,
"step": 250
},
{
"epoch": 0.7575757575757576,
"grad_norm": 2.8485476970672607,
"learning_rate": 1.7158774373259056e-05,
"loss": 0.1107,
"step": 275
},
{
"epoch": 0.8264462809917356,
"grad_norm": 1.723015546798706,
"learning_rate": 1.688022284122563e-05,
"loss": 0.1085,
"step": 300
},
{
"epoch": 0.8264462809917356,
"eval_accuracy": 0.9711826074268621,
"eval_f1": 0.9645421903052065,
"eval_loss": 0.09259311854839325,
"eval_precision": 0.9611772072636193,
"eval_recall": 0.9679308170435096,
"eval_runtime": 3.4737,
"eval_samples_per_second": 7891.819,
"eval_steps_per_second": 7.773,
"step": 300
},
{
"epoch": 0.8953168044077136,
"grad_norm": 1.7039345502853394,
"learning_rate": 1.66016713091922e-05,
"loss": 0.1007,
"step": 325
},
{
"epoch": 0.9641873278236914,
"grad_norm": 2.643916368484497,
"learning_rate": 1.6323119777158775e-05,
"loss": 0.1001,
"step": 350
},
{
"epoch": 0.9641873278236914,
"eval_accuracy": 0.9737725249872328,
"eval_f1": 0.9676897496966701,
"eval_loss": 0.08496326208114624,
"eval_precision": 0.9654770444763271,
"eval_recall": 0.969912620484641,
"eval_runtime": 3.877,
"eval_samples_per_second": 7071.012,
"eval_steps_per_second": 6.964,
"step": 350
},
{
"epoch": 1.0330578512396693,
"grad_norm": 2.7803874015808105,
"learning_rate": 1.604456824512535e-05,
"loss": 0.0872,
"step": 375
},
{
"epoch": 1.1019283746556474,
"grad_norm": 1.6734216213226318,
"learning_rate": 1.5766016713091924e-05,
"loss": 0.0803,
"step": 400
},
{
"epoch": 1.1019283746556474,
"eval_accuracy": 0.9746479900780624,
"eval_f1": 0.968868980963046,
"eval_loss": 0.07618524879217148,
"eval_precision": 0.9635602280826799,
"eval_recall": 0.9742365552652914,
"eval_runtime": 3.4865,
"eval_samples_per_second": 7862.946,
"eval_steps_per_second": 7.744,
"step": 400
},
{
"epoch": 1.1707988980716253,
"grad_norm": 1.4769816398620605,
"learning_rate": 1.5487465181058498e-05,
"loss": 0.0794,
"step": 425
},
{
"epoch": 1.2396694214876034,
"grad_norm": 1.9178351163864136,
"learning_rate": 1.520891364902507e-05,
"loss": 0.0746,
"step": 450
},
{
"epoch": 1.2396694214876034,
"eval_accuracy": 0.976143576274896,
"eval_f1": 0.9705113175218685,
"eval_loss": 0.07501044124364853,
"eval_precision": 0.9715626974812674,
"eval_recall": 0.9694622106116566,
"eval_runtime": 3.483,
"eval_samples_per_second": 7870.76,
"eval_steps_per_second": 7.752,
"step": 450
},
{
"epoch": 1.3085399449035813,
"grad_norm": 1.5861995220184326,
"learning_rate": 1.4930362116991646e-05,
"loss": 0.0712,
"step": 475
},
{
"epoch": 1.3774104683195592,
"grad_norm": 1.4486507177352905,
"learning_rate": 1.4651810584958219e-05,
"loss": 0.0628,
"step": 500
},
{
"epoch": 1.3774104683195592,
"eval_accuracy": 0.9772379076384329,
"eval_f1": 0.9720254640007173,
"eval_loss": 0.07069610804319382,
"eval_precision": 0.9675145024542615,
"eval_recall": 0.9765786866048104,
"eval_runtime": 3.478,
"eval_samples_per_second": 7882.089,
"eval_steps_per_second": 7.763,
"step": 500
},
{
"epoch": 1.4462809917355373,
"grad_norm": 1.6411774158477783,
"learning_rate": 1.4373259052924793e-05,
"loss": 0.0687,
"step": 525
},
{
"epoch": 1.5151515151515151,
"grad_norm": 1.1685402393341064,
"learning_rate": 1.4094707520891366e-05,
"loss": 0.0649,
"step": 550
},
{
"epoch": 1.5151515151515151,
"eval_accuracy": 0.9787334938352666,
"eval_f1": 0.9736473353523483,
"eval_loss": 0.06786245107650757,
"eval_precision": 0.9771366358192706,
"eval_recall": 0.9701828664084317,
"eval_runtime": 3.8335,
"eval_samples_per_second": 7151.144,
"eval_steps_per_second": 7.043,
"step": 550
},
{
"epoch": 1.5840220385674932,
"grad_norm": 1.5705974102020264,
"learning_rate": 1.381615598885794e-05,
"loss": 0.0602,
"step": 575
},
{
"epoch": 1.6528925619834711,
"grad_norm": 1.430296778678894,
"learning_rate": 1.3537604456824513e-05,
"loss": 0.0598,
"step": 600
},
{
"epoch": 1.6528925619834711,
"eval_accuracy": 0.9796089589260961,
"eval_f1": 0.9748390871854886,
"eval_loss": 0.06365057826042175,
"eval_precision": 0.9741813602015114,
"eval_recall": 0.9754977029096478,
"eval_runtime": 3.4734,
"eval_samples_per_second": 7892.464,
"eval_steps_per_second": 7.773,
"step": 600
},
{
"epoch": 1.721763085399449,
"grad_norm": 1.6765629053115845,
"learning_rate": 1.3259052924791087e-05,
"loss": 0.0588,
"step": 625
},
{
"epoch": 1.790633608815427,
"grad_norm": 2.7560369968414307,
"learning_rate": 1.2980501392757661e-05,
"loss": 0.0588,
"step": 650
},
{
"epoch": 1.790633608815427,
"eval_accuracy": 0.9799737360472751,
"eval_f1": 0.9752423900789177,
"eval_loss": 0.06402380764484406,
"eval_precision": 0.9764312804767925,
"eval_recall": 0.9740563913160977,
"eval_runtime": 3.4664,
"eval_samples_per_second": 7908.573,
"eval_steps_per_second": 7.789,
"step": 650
},
{
"epoch": 1.859504132231405,
"grad_norm": 1.4049510955810547,
"learning_rate": 1.2701949860724234e-05,
"loss": 0.0587,
"step": 675
},
{
"epoch": 1.9283746556473829,
"grad_norm": 1.4854605197906494,
"learning_rate": 1.2423398328690808e-05,
"loss": 0.0549,
"step": 700
},
{
"epoch": 1.9283746556473829,
"eval_accuracy": 0.9801926023199825,
"eval_f1": 0.975607564799425,
"eval_loss": 0.06324595212936401,
"eval_precision": 0.9730286738351255,
"eval_recall": 0.9782001621475542,
"eval_runtime": 3.831,
"eval_samples_per_second": 7155.784,
"eval_steps_per_second": 7.048,
"step": 700
},
{
"epoch": 1.997245179063361,
"grad_norm": 1.7832911014556885,
"learning_rate": 1.2144846796657384e-05,
"loss": 0.0598,
"step": 725
},
{
"epoch": 2.0661157024793386,
"grad_norm": 1.3163201808929443,
"learning_rate": 1.1866295264623957e-05,
"loss": 0.049,
"step": 750
},
{
"epoch": 2.0661157024793386,
"eval_accuracy": 0.9817611439410521,
"eval_f1": 0.9773939777556742,
"eval_loss": 0.06054531782865524,
"eval_precision": 0.9811200871380593,
"eval_recall": 0.9736960634177101,
"eval_runtime": 3.4659,
"eval_samples_per_second": 7909.629,
"eval_steps_per_second": 7.79,
"step": 750
},
{
"epoch": 2.1349862258953167,
"grad_norm": 2.2273190021514893,
"learning_rate": 1.1587743732590531e-05,
"loss": 0.0452,
"step": 775
},
{
"epoch": 2.203856749311295,
"grad_norm": 0.6999920010566711,
"learning_rate": 1.1309192200557103e-05,
"loss": 0.0443,
"step": 800
},
{
"epoch": 2.203856749311295,
"eval_accuracy": 0.982818997592471,
"eval_f1": 0.9788533201634265,
"eval_loss": 0.05351677164435387,
"eval_precision": 0.9757429287504475,
"eval_recall": 0.9819836050806233,
"eval_runtime": 3.4789,
"eval_samples_per_second": 7880.134,
"eval_steps_per_second": 7.761,
"step": 800
},
{
"epoch": 2.2727272727272725,
"grad_norm": 2.3686063289642334,
"learning_rate": 1.1030640668523678e-05,
"loss": 0.0414,
"step": 825
},
{
"epoch": 2.3415977961432506,
"grad_norm": 4.006628036499023,
"learning_rate": 1.0752089136490252e-05,
"loss": 0.0456,
"step": 850
},
{
"epoch": 2.3415977961432506,
"eval_accuracy": 0.9815057999562268,
"eval_f1": 0.9769765224104264,
"eval_loss": 0.0683741644024849,
"eval_precision": 0.9850732600732601,
"eval_recall": 0.9690118007386722,
"eval_runtime": 3.4666,
"eval_samples_per_second": 7908.086,
"eval_steps_per_second": 7.789,
"step": 850
},
{
"epoch": 2.4104683195592287,
"grad_norm": 1.687317967414856,
"learning_rate": 1.0473537604456825e-05,
"loss": 0.0459,
"step": 875
},
{
"epoch": 2.479338842975207,
"grad_norm": 1.9865649938583374,
"learning_rate": 1.0194986072423399e-05,
"loss": 0.0437,
"step": 900
},
{
"epoch": 2.479338842975207,
"eval_accuracy": 0.9822718319107026,
"eval_f1": 0.9782121402313279,
"eval_loss": 0.055423617362976074,
"eval_precision": 0.9736724676483712,
"eval_recall": 0.9827943428519953,
"eval_runtime": 3.8118,
"eval_samples_per_second": 7191.836,
"eval_steps_per_second": 7.083,
"step": 900
},
{
"epoch": 2.5482093663911844,
"grad_norm": 1.7087410688400269,
"learning_rate": 9.916434540389973e-06,
"loss": 0.0434,
"step": 925
},
{
"epoch": 2.6170798898071626,
"grad_norm": 1.4715299606323242,
"learning_rate": 9.637883008356547e-06,
"loss": 0.041,
"step": 950
},
{
"epoch": 2.6170798898071626,
"eval_accuracy": 0.9830378638651783,
"eval_f1": 0.9790983053894907,
"eval_loss": 0.05513562262058258,
"eval_precision": 0.9771218374304683,
"eval_recall": 0.9810827853346545,
"eval_runtime": 3.4833,
"eval_samples_per_second": 7870.073,
"eval_steps_per_second": 7.751,
"step": 950
},
{
"epoch": 2.6859504132231407,
"grad_norm": 1.1084128618240356,
"learning_rate": 9.35933147632312e-06,
"loss": 0.0421,
"step": 975
},
{
"epoch": 2.7548209366391183,
"grad_norm": 1.778534173965454,
"learning_rate": 9.080779944289694e-06,
"loss": 0.0403,
"step": 1000
},
{
"epoch": 2.7548209366391183,
"eval_accuracy": 0.9833296855621215,
"eval_f1": 0.9794283142021157,
"eval_loss": 0.05323425307869911,
"eval_precision": 0.9788554975706316,
"eval_recall": 0.9800018016394919,
"eval_runtime": 3.4664,
"eval_samples_per_second": 7908.452,
"eval_steps_per_second": 7.789,
"step": 1000
},
{
"epoch": 2.8236914600550964,
"grad_norm": 1.2113419771194458,
"learning_rate": 8.802228412256268e-06,
"loss": 0.0373,
"step": 1025
},
{
"epoch": 2.8925619834710745,
"grad_norm": 1.100354552268982,
"learning_rate": 8.523676880222843e-06,
"loss": 0.0408,
"step": 1050
},
{
"epoch": 2.8925619834710745,
"eval_accuracy": 0.9839133289560079,
"eval_f1": 0.980182447310475,
"eval_loss": 0.050565555691719055,
"eval_precision": 0.9779411764705882,
"eval_recall": 0.9824340149536078,
"eval_runtime": 3.4729,
"eval_samples_per_second": 7893.777,
"eval_steps_per_second": 7.775,
"step": 1050
},
{
"epoch": 2.9614325068870526,
"grad_norm": 1.6811611652374268,
"learning_rate": 8.245125348189415e-06,
"loss": 0.0404,
"step": 1075
},
{
"epoch": 3.0303030303030303,
"grad_norm": 1.8884915113449097,
"learning_rate": 7.96657381615599e-06,
"loss": 0.0322,
"step": 1100
},
{
"epoch": 3.0303030303030303,
"eval_accuracy": 0.9840592398044795,
"eval_f1": 0.9802949001217478,
"eval_loss": 0.05778279900550842,
"eval_precision": 0.9814012278801011,
"eval_recall": 0.9791910638681199,
"eval_runtime": 3.8894,
"eval_samples_per_second": 7048.308,
"eval_steps_per_second": 6.942,
"step": 1100
},
{
"epoch": 3.0991735537190084,
"grad_norm": 1.4037514925003052,
"learning_rate": 7.688022284122564e-06,
"loss": 0.0367,
"step": 1125
},
{
"epoch": 3.168044077134986,
"grad_norm": 1.3870735168457031,
"learning_rate": 7.409470752089137e-06,
"loss": 0.036,
"step": 1150
},
{
"epoch": 3.168044077134986,
"eval_accuracy": 0.98387685124389,
"eval_f1": 0.9801115910727142,
"eval_loss": 0.05209185555577278,
"eval_precision": 0.9791423177200396,
"eval_recall": 0.9810827853346545,
"eval_runtime": 3.4792,
"eval_samples_per_second": 7879.356,
"eval_steps_per_second": 7.76,
"step": 1150
},
{
"epoch": 3.236914600550964,
"grad_norm": 1.1460018157958984,
"learning_rate": 7.130919220055711e-06,
"loss": 0.0329,
"step": 1175
},
{
"epoch": 3.3057851239669422,
"grad_norm": 1.3611100912094116,
"learning_rate": 6.852367688022284e-06,
"loss": 0.033,
"step": 1200
},
{
"epoch": 3.3057851239669422,
"eval_accuracy": 0.9842781060771868,
"eval_f1": 0.980535609447681,
"eval_loss": 0.05169374495744705,
"eval_precision": 0.9831552255026264,
"eval_recall": 0.9779299162237636,
"eval_runtime": 3.4677,
"eval_samples_per_second": 7905.479,
"eval_steps_per_second": 7.786,
"step": 1200
},
{
"epoch": 3.3746556473829203,
"grad_norm": 1.9900141954421997,
"learning_rate": 6.573816155988858e-06,
"loss": 0.0346,
"step": 1225
},
{
"epoch": 3.443526170798898,
"grad_norm": 1.3785734176635742,
"learning_rate": 6.295264623955433e-06,
"loss": 0.0308,
"step": 1250
},
{
"epoch": 3.443526170798898,
"eval_accuracy": 0.9836944626833005,
"eval_f1": 0.9797508493771234,
"eval_loss": 0.0601566806435585,
"eval_precision": 0.985420083834518,
"eval_recall": 0.9741464732906945,
"eval_runtime": 3.4659,
"eval_samples_per_second": 7909.55,
"eval_steps_per_second": 7.79,
"step": 1250
},
{
"epoch": 3.512396694214876,
"grad_norm": 1.4920400381088257,
"learning_rate": 6.016713091922006e-06,
"loss": 0.0284,
"step": 1275
},
{
"epoch": 3.581267217630854,
"grad_norm": 2.0963149070739746,
"learning_rate": 5.7381615598885795e-06,
"loss": 0.0324,
"step": 1300
},
{
"epoch": 3.581267217630854,
"eval_accuracy": 0.9845334500620121,
"eval_f1": 0.9808957375867352,
"eval_loss": 0.05010749772191048,
"eval_precision": 0.981249436581628,
"eval_recall": 0.9805422934870732,
"eval_runtime": 4.0132,
"eval_samples_per_second": 6831.004,
"eval_steps_per_second": 6.728,
"step": 1300
},
{
"epoch": 3.650137741046832,
"grad_norm": 1.2143880128860474,
"learning_rate": 5.459610027855154e-06,
"loss": 0.0295,
"step": 1325
},
{
"epoch": 3.71900826446281,
"grad_norm": 1.3561748266220093,
"learning_rate": 5.181058495821727e-06,
"loss": 0.0348,
"step": 1350
},
{
"epoch": 3.71900826446281,
"eval_accuracy": 0.9849711826074269,
"eval_f1": 0.9814062641032584,
"eval_loss": 0.05094814673066139,
"eval_precision": 0.983358958126074,
"eval_recall": 0.9794613097919106,
"eval_runtime": 3.469,
"eval_samples_per_second": 7902.599,
"eval_steps_per_second": 7.783,
"step": 1350
},
{
"epoch": 3.787878787878788,
"grad_norm": 1.4375017881393433,
"learning_rate": 4.902506963788301e-06,
"loss": 0.0296,
"step": 1375
},
{
"epoch": 3.8567493112947657,
"grad_norm": 1.0931159257888794,
"learning_rate": 4.623955431754875e-06,
"loss": 0.0338,
"step": 1400
},
{
"epoch": 3.8567493112947657,
"eval_accuracy": 0.9852994820164879,
"eval_f1": 0.9818329351305053,
"eval_loss": 0.047653596848249435,
"eval_precision": 0.9826746074715755,
"eval_recall": 0.9809927033600576,
"eval_runtime": 3.4683,
"eval_samples_per_second": 7904.087,
"eval_steps_per_second": 7.785,
"step": 1400
},
{
"epoch": 3.925619834710744,
"grad_norm": 1.2919989824295044,
"learning_rate": 4.345403899721449e-06,
"loss": 0.0316,
"step": 1425
},
{
"epoch": 3.994490358126722,
"grad_norm": 1.893475890159607,
"learning_rate": 4.0668523676880225e-06,
"loss": 0.0285,
"step": 1450
},
{
"epoch": 3.994490358126722,
"eval_accuracy": 0.9851900488801343,
"eval_f1": 0.9816439099376074,
"eval_loss": 0.05093059316277504,
"eval_precision": 0.9853862212943633,
"eval_recall": 0.9779299162237636,
"eval_runtime": 3.8831,
"eval_samples_per_second": 7059.86,
"eval_steps_per_second": 6.953,
"step": 1450
},
{
"epoch": 4.0633608815427,
"grad_norm": 1.606433629989624,
"learning_rate": 3.7883008356545963e-06,
"loss": 0.0398,
"step": 1475
},
{
"epoch": 4.132231404958677,
"grad_norm": 1.2981059551239014,
"learning_rate": 3.5097493036211698e-06,
"loss": 0.026,
"step": 1500
},
{
"epoch": 4.132231404958677,
"eval_accuracy": 0.9857736922740206,
"eval_f1": 0.9824038982133189,
"eval_loss": 0.05328037962317467,
"eval_precision": 0.9840911145258971,
"eval_recall": 0.980722457436267,
"eval_runtime": 3.4891,
"eval_samples_per_second": 7856.935,
"eval_steps_per_second": 7.738,
"step": 1500
}
],
"logging_steps": 25,
"max_steps": 1815,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.140915438023948e+16,
"train_batch_size": 1024,
"trial_name": null,
"trial_params": null
}