{ "best_metric": 0.9824038982133189, "best_model_checkpoint": "/p/project/trustllm-eu/stenlund1/LLMSegm_iu/out/glot500-iu-morph-unamb-sup-6/checkpoint-1500", "epoch": 4.132231404958677, "eval_steps": 50, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06887052341597796, "grad_norm": 0.25278759002685547, "learning_rate": 1.9944289693593316e-05, "loss": 0.6929, "step": 25 }, { "epoch": 0.13774104683195593, "grad_norm": 3.6986641883850098, "learning_rate": 1.966573816155989e-05, "loss": 0.6275, "step": 50 }, { "epoch": 0.13774104683195593, "eval_accuracy": 0.7771941343838914, "eval_f1": 0.7581564776686728, "eval_loss": 0.49594685435295105, "eval_precision": 0.6763687742847051, "eval_recall": 0.8624448247905594, "eval_runtime": 4.0112, "eval_samples_per_second": 6834.373, "eval_steps_per_second": 6.731, "step": 50 }, { "epoch": 0.2066115702479339, "grad_norm": 3.397982120513916, "learning_rate": 1.9387186629526465e-05, "loss": 0.4712, "step": 75 }, { "epoch": 0.27548209366391185, "grad_norm": 3.2428324222564697, "learning_rate": 1.910863509749304e-05, "loss": 0.3489, "step": 100 }, { "epoch": 0.27548209366391185, "eval_accuracy": 0.9177062814620267, "eval_f1": 0.898533777098138, "eval_loss": 0.24391423165798187, "eval_precision": 0.8972424324081559, "eval_recall": 0.8998288442482659, "eval_runtime": 3.5257, "eval_samples_per_second": 7775.49, "eval_steps_per_second": 7.658, "step": 100 }, { "epoch": 0.3443526170798898, "grad_norm": 5.30737829208374, "learning_rate": 1.883008356545961e-05, "loss": 0.2683, "step": 125 }, { "epoch": 0.4132231404958678, "grad_norm": 3.5402088165283203, "learning_rate": 1.8551532033426184e-05, "loss": 0.2206, "step": 150 }, { "epoch": 0.4132231404958678, "eval_accuracy": 0.9466695848836361, "eval_f1": 0.9355492858402398, "eval_loss": 0.15823155641555786, "eval_precision": 0.916083916083916, "eval_recall": 0.9558598324475273, "eval_runtime": 3.8577, "eval_samples_per_second": 7106.225, "eval_steps_per_second": 6.999, "step": 150 }, { "epoch": 0.4820936639118457, "grad_norm": 3.316082239151001, "learning_rate": 1.827298050139276e-05, "loss": 0.1807, "step": 175 }, { "epoch": 0.5509641873278237, "grad_norm": 2.592745065689087, "learning_rate": 1.7994428969359333e-05, "loss": 0.1541, "step": 200 }, { "epoch": 0.5509641873278237, "eval_accuracy": 0.9585977967461881, "eval_f1": 0.9495398568443516, "eval_loss": 0.12743310630321503, "eval_precision": 0.9374122191011236, "eval_recall": 0.9619854067201153, "eval_runtime": 3.4893, "eval_samples_per_second": 7856.68, "eval_steps_per_second": 7.738, "step": 200 }, { "epoch": 0.6198347107438017, "grad_norm": 2.1011128425598145, "learning_rate": 1.7715877437325907e-05, "loss": 0.1369, "step": 225 }, { "epoch": 0.6887052341597796, "grad_norm": 1.8149715662002563, "learning_rate": 1.743732590529248e-05, "loss": 0.1258, "step": 250 }, { "epoch": 0.6887052341597796, "eval_accuracy": 0.9647990078062304, "eval_f1": 0.9563289134271621, "eval_loss": 0.10787822306156158, "eval_precision": 0.9608948708621317, "eval_recall": 0.9518061435906675, "eval_runtime": 3.4856, "eval_samples_per_second": 7864.85, "eval_steps_per_second": 7.746, "step": 250 }, { "epoch": 0.7575757575757576, "grad_norm": 2.8485476970672607, "learning_rate": 1.7158774373259056e-05, "loss": 0.1107, "step": 275 }, { "epoch": 0.8264462809917356, "grad_norm": 1.723015546798706, "learning_rate": 1.688022284122563e-05, "loss": 0.1085, "step": 300 }, { "epoch": 0.8264462809917356, "eval_accuracy": 0.9711826074268621, "eval_f1": 0.9645421903052065, "eval_loss": 0.09259311854839325, "eval_precision": 0.9611772072636193, "eval_recall": 0.9679308170435096, "eval_runtime": 3.4737, "eval_samples_per_second": 7891.819, "eval_steps_per_second": 7.773, "step": 300 }, { "epoch": 0.8953168044077136, "grad_norm": 1.7039345502853394, "learning_rate": 1.66016713091922e-05, "loss": 0.1007, "step": 325 }, { "epoch": 0.9641873278236914, "grad_norm": 2.643916368484497, "learning_rate": 1.6323119777158775e-05, "loss": 0.1001, "step": 350 }, { "epoch": 0.9641873278236914, "eval_accuracy": 0.9737725249872328, "eval_f1": 0.9676897496966701, "eval_loss": 0.08496326208114624, "eval_precision": 0.9654770444763271, "eval_recall": 0.969912620484641, "eval_runtime": 3.877, "eval_samples_per_second": 7071.012, "eval_steps_per_second": 6.964, "step": 350 }, { "epoch": 1.0330578512396693, "grad_norm": 2.7803874015808105, "learning_rate": 1.604456824512535e-05, "loss": 0.0872, "step": 375 }, { "epoch": 1.1019283746556474, "grad_norm": 1.6734216213226318, "learning_rate": 1.5766016713091924e-05, "loss": 0.0803, "step": 400 }, { "epoch": 1.1019283746556474, "eval_accuracy": 0.9746479900780624, "eval_f1": 0.968868980963046, "eval_loss": 0.07618524879217148, "eval_precision": 0.9635602280826799, "eval_recall": 0.9742365552652914, "eval_runtime": 3.4865, "eval_samples_per_second": 7862.946, "eval_steps_per_second": 7.744, "step": 400 }, { "epoch": 1.1707988980716253, "grad_norm": 1.4769816398620605, "learning_rate": 1.5487465181058498e-05, "loss": 0.0794, "step": 425 }, { "epoch": 1.2396694214876034, "grad_norm": 1.9178351163864136, "learning_rate": 1.520891364902507e-05, "loss": 0.0746, "step": 450 }, { "epoch": 1.2396694214876034, "eval_accuracy": 0.976143576274896, "eval_f1": 0.9705113175218685, "eval_loss": 0.07501044124364853, "eval_precision": 0.9715626974812674, "eval_recall": 0.9694622106116566, "eval_runtime": 3.483, "eval_samples_per_second": 7870.76, "eval_steps_per_second": 7.752, "step": 450 }, { "epoch": 1.3085399449035813, "grad_norm": 1.5861995220184326, "learning_rate": 1.4930362116991646e-05, "loss": 0.0712, "step": 475 }, { "epoch": 1.3774104683195592, "grad_norm": 1.4486507177352905, "learning_rate": 1.4651810584958219e-05, "loss": 0.0628, "step": 500 }, { "epoch": 1.3774104683195592, "eval_accuracy": 0.9772379076384329, "eval_f1": 0.9720254640007173, "eval_loss": 0.07069610804319382, "eval_precision": 0.9675145024542615, "eval_recall": 0.9765786866048104, "eval_runtime": 3.478, "eval_samples_per_second": 7882.089, "eval_steps_per_second": 7.763, "step": 500 }, { "epoch": 1.4462809917355373, "grad_norm": 1.6411774158477783, "learning_rate": 1.4373259052924793e-05, "loss": 0.0687, "step": 525 }, { "epoch": 1.5151515151515151, "grad_norm": 1.1685402393341064, "learning_rate": 1.4094707520891366e-05, "loss": 0.0649, "step": 550 }, { "epoch": 1.5151515151515151, "eval_accuracy": 0.9787334938352666, "eval_f1": 0.9736473353523483, "eval_loss": 0.06786245107650757, "eval_precision": 0.9771366358192706, "eval_recall": 0.9701828664084317, "eval_runtime": 3.8335, "eval_samples_per_second": 7151.144, "eval_steps_per_second": 7.043, "step": 550 }, { "epoch": 1.5840220385674932, "grad_norm": 1.5705974102020264, "learning_rate": 1.381615598885794e-05, "loss": 0.0602, "step": 575 }, { "epoch": 1.6528925619834711, "grad_norm": 1.430296778678894, "learning_rate": 1.3537604456824513e-05, "loss": 0.0598, "step": 600 }, { "epoch": 1.6528925619834711, "eval_accuracy": 0.9796089589260961, "eval_f1": 0.9748390871854886, "eval_loss": 0.06365057826042175, "eval_precision": 0.9741813602015114, "eval_recall": 0.9754977029096478, "eval_runtime": 3.4734, "eval_samples_per_second": 7892.464, "eval_steps_per_second": 7.773, "step": 600 }, { "epoch": 1.721763085399449, "grad_norm": 1.6765629053115845, "learning_rate": 1.3259052924791087e-05, "loss": 0.0588, "step": 625 }, { "epoch": 1.790633608815427, "grad_norm": 2.7560369968414307, "learning_rate": 1.2980501392757661e-05, "loss": 0.0588, "step": 650 }, { "epoch": 1.790633608815427, "eval_accuracy": 0.9799737360472751, "eval_f1": 0.9752423900789177, "eval_loss": 0.06402380764484406, "eval_precision": 0.9764312804767925, "eval_recall": 0.9740563913160977, "eval_runtime": 3.4664, "eval_samples_per_second": 7908.573, "eval_steps_per_second": 7.789, "step": 650 }, { "epoch": 1.859504132231405, "grad_norm": 1.4049510955810547, "learning_rate": 1.2701949860724234e-05, "loss": 0.0587, "step": 675 }, { "epoch": 1.9283746556473829, "grad_norm": 1.4854605197906494, "learning_rate": 1.2423398328690808e-05, "loss": 0.0549, "step": 700 }, { "epoch": 1.9283746556473829, "eval_accuracy": 0.9801926023199825, "eval_f1": 0.975607564799425, "eval_loss": 0.06324595212936401, "eval_precision": 0.9730286738351255, "eval_recall": 0.9782001621475542, "eval_runtime": 3.831, "eval_samples_per_second": 7155.784, "eval_steps_per_second": 7.048, "step": 700 }, { "epoch": 1.997245179063361, "grad_norm": 1.7832911014556885, "learning_rate": 1.2144846796657384e-05, "loss": 0.0598, "step": 725 }, { "epoch": 2.0661157024793386, "grad_norm": 1.3163201808929443, "learning_rate": 1.1866295264623957e-05, "loss": 0.049, "step": 750 }, { "epoch": 2.0661157024793386, "eval_accuracy": 0.9817611439410521, "eval_f1": 0.9773939777556742, "eval_loss": 0.06054531782865524, "eval_precision": 0.9811200871380593, "eval_recall": 0.9736960634177101, "eval_runtime": 3.4659, "eval_samples_per_second": 7909.629, "eval_steps_per_second": 7.79, "step": 750 }, { "epoch": 2.1349862258953167, "grad_norm": 2.2273190021514893, "learning_rate": 1.1587743732590531e-05, "loss": 0.0452, "step": 775 }, { "epoch": 2.203856749311295, "grad_norm": 0.6999920010566711, "learning_rate": 1.1309192200557103e-05, "loss": 0.0443, "step": 800 }, { "epoch": 2.203856749311295, "eval_accuracy": 0.982818997592471, "eval_f1": 0.9788533201634265, "eval_loss": 0.05351677164435387, "eval_precision": 0.9757429287504475, "eval_recall": 0.9819836050806233, "eval_runtime": 3.4789, "eval_samples_per_second": 7880.134, "eval_steps_per_second": 7.761, "step": 800 }, { "epoch": 2.2727272727272725, "grad_norm": 2.3686063289642334, "learning_rate": 1.1030640668523678e-05, "loss": 0.0414, "step": 825 }, { "epoch": 2.3415977961432506, "grad_norm": 4.006628036499023, "learning_rate": 1.0752089136490252e-05, "loss": 0.0456, "step": 850 }, { "epoch": 2.3415977961432506, "eval_accuracy": 0.9815057999562268, "eval_f1": 0.9769765224104264, "eval_loss": 0.0683741644024849, "eval_precision": 0.9850732600732601, "eval_recall": 0.9690118007386722, "eval_runtime": 3.4666, "eval_samples_per_second": 7908.086, "eval_steps_per_second": 7.789, "step": 850 }, { "epoch": 2.4104683195592287, "grad_norm": 1.687317967414856, "learning_rate": 1.0473537604456825e-05, "loss": 0.0459, "step": 875 }, { "epoch": 2.479338842975207, "grad_norm": 1.9865649938583374, "learning_rate": 1.0194986072423399e-05, "loss": 0.0437, "step": 900 }, { "epoch": 2.479338842975207, "eval_accuracy": 0.9822718319107026, "eval_f1": 0.9782121402313279, "eval_loss": 0.055423617362976074, "eval_precision": 0.9736724676483712, "eval_recall": 0.9827943428519953, "eval_runtime": 3.8118, "eval_samples_per_second": 7191.836, "eval_steps_per_second": 7.083, "step": 900 }, { "epoch": 2.5482093663911844, "grad_norm": 1.7087410688400269, "learning_rate": 9.916434540389973e-06, "loss": 0.0434, "step": 925 }, { "epoch": 2.6170798898071626, "grad_norm": 1.4715299606323242, "learning_rate": 9.637883008356547e-06, "loss": 0.041, "step": 950 }, { "epoch": 2.6170798898071626, "eval_accuracy": 0.9830378638651783, "eval_f1": 0.9790983053894907, "eval_loss": 0.05513562262058258, "eval_precision": 0.9771218374304683, "eval_recall": 0.9810827853346545, "eval_runtime": 3.4833, "eval_samples_per_second": 7870.073, "eval_steps_per_second": 7.751, "step": 950 }, { "epoch": 2.6859504132231407, "grad_norm": 1.1084128618240356, "learning_rate": 9.35933147632312e-06, "loss": 0.0421, "step": 975 }, { "epoch": 2.7548209366391183, "grad_norm": 1.778534173965454, "learning_rate": 9.080779944289694e-06, "loss": 0.0403, "step": 1000 }, { "epoch": 2.7548209366391183, "eval_accuracy": 0.9833296855621215, "eval_f1": 0.9794283142021157, "eval_loss": 0.05323425307869911, "eval_precision": 0.9788554975706316, "eval_recall": 0.9800018016394919, "eval_runtime": 3.4664, "eval_samples_per_second": 7908.452, "eval_steps_per_second": 7.789, "step": 1000 }, { "epoch": 2.8236914600550964, "grad_norm": 1.2113419771194458, "learning_rate": 8.802228412256268e-06, "loss": 0.0373, "step": 1025 }, { "epoch": 2.8925619834710745, "grad_norm": 1.100354552268982, "learning_rate": 8.523676880222843e-06, "loss": 0.0408, "step": 1050 }, { "epoch": 2.8925619834710745, "eval_accuracy": 0.9839133289560079, "eval_f1": 0.980182447310475, "eval_loss": 0.050565555691719055, "eval_precision": 0.9779411764705882, "eval_recall": 0.9824340149536078, "eval_runtime": 3.4729, "eval_samples_per_second": 7893.777, "eval_steps_per_second": 7.775, "step": 1050 }, { "epoch": 2.9614325068870526, "grad_norm": 1.6811611652374268, "learning_rate": 8.245125348189415e-06, "loss": 0.0404, "step": 1075 }, { "epoch": 3.0303030303030303, "grad_norm": 1.8884915113449097, "learning_rate": 7.96657381615599e-06, "loss": 0.0322, "step": 1100 }, { "epoch": 3.0303030303030303, "eval_accuracy": 0.9840592398044795, "eval_f1": 0.9802949001217478, "eval_loss": 0.05778279900550842, "eval_precision": 0.9814012278801011, "eval_recall": 0.9791910638681199, "eval_runtime": 3.8894, "eval_samples_per_second": 7048.308, "eval_steps_per_second": 6.942, "step": 1100 }, { "epoch": 3.0991735537190084, "grad_norm": 1.4037514925003052, "learning_rate": 7.688022284122564e-06, "loss": 0.0367, "step": 1125 }, { "epoch": 3.168044077134986, "grad_norm": 1.3870735168457031, "learning_rate": 7.409470752089137e-06, "loss": 0.036, "step": 1150 }, { "epoch": 3.168044077134986, "eval_accuracy": 0.98387685124389, "eval_f1": 0.9801115910727142, "eval_loss": 0.05209185555577278, "eval_precision": 0.9791423177200396, "eval_recall": 0.9810827853346545, "eval_runtime": 3.4792, "eval_samples_per_second": 7879.356, "eval_steps_per_second": 7.76, "step": 1150 }, { "epoch": 3.236914600550964, "grad_norm": 1.1460018157958984, "learning_rate": 7.130919220055711e-06, "loss": 0.0329, "step": 1175 }, { "epoch": 3.3057851239669422, "grad_norm": 1.3611100912094116, "learning_rate": 6.852367688022284e-06, "loss": 0.033, "step": 1200 }, { "epoch": 3.3057851239669422, "eval_accuracy": 0.9842781060771868, "eval_f1": 0.980535609447681, "eval_loss": 0.05169374495744705, "eval_precision": 0.9831552255026264, "eval_recall": 0.9779299162237636, "eval_runtime": 3.4677, "eval_samples_per_second": 7905.479, "eval_steps_per_second": 7.786, "step": 1200 }, { "epoch": 3.3746556473829203, "grad_norm": 1.9900141954421997, "learning_rate": 6.573816155988858e-06, "loss": 0.0346, "step": 1225 }, { "epoch": 3.443526170798898, "grad_norm": 1.3785734176635742, "learning_rate": 6.295264623955433e-06, "loss": 0.0308, "step": 1250 }, { "epoch": 3.443526170798898, "eval_accuracy": 0.9836944626833005, "eval_f1": 0.9797508493771234, "eval_loss": 0.0601566806435585, "eval_precision": 0.985420083834518, "eval_recall": 0.9741464732906945, "eval_runtime": 3.4659, "eval_samples_per_second": 7909.55, "eval_steps_per_second": 7.79, "step": 1250 }, { "epoch": 3.512396694214876, "grad_norm": 1.4920400381088257, "learning_rate": 6.016713091922006e-06, "loss": 0.0284, "step": 1275 }, { "epoch": 3.581267217630854, "grad_norm": 2.0963149070739746, "learning_rate": 5.7381615598885795e-06, "loss": 0.0324, "step": 1300 }, { "epoch": 3.581267217630854, "eval_accuracy": 0.9845334500620121, "eval_f1": 0.9808957375867352, "eval_loss": 0.05010749772191048, "eval_precision": 0.981249436581628, "eval_recall": 0.9805422934870732, "eval_runtime": 4.0132, "eval_samples_per_second": 6831.004, "eval_steps_per_second": 6.728, "step": 1300 }, { "epoch": 3.650137741046832, "grad_norm": 1.2143880128860474, "learning_rate": 5.459610027855154e-06, "loss": 0.0295, "step": 1325 }, { "epoch": 3.71900826446281, "grad_norm": 1.3561748266220093, "learning_rate": 5.181058495821727e-06, "loss": 0.0348, "step": 1350 }, { "epoch": 3.71900826446281, "eval_accuracy": 0.9849711826074269, "eval_f1": 0.9814062641032584, "eval_loss": 0.05094814673066139, "eval_precision": 0.983358958126074, "eval_recall": 0.9794613097919106, "eval_runtime": 3.469, "eval_samples_per_second": 7902.599, "eval_steps_per_second": 7.783, "step": 1350 }, { "epoch": 3.787878787878788, "grad_norm": 1.4375017881393433, "learning_rate": 4.902506963788301e-06, "loss": 0.0296, "step": 1375 }, { "epoch": 3.8567493112947657, "grad_norm": 1.0931159257888794, "learning_rate": 4.623955431754875e-06, "loss": 0.0338, "step": 1400 }, { "epoch": 3.8567493112947657, "eval_accuracy": 0.9852994820164879, "eval_f1": 0.9818329351305053, "eval_loss": 0.047653596848249435, "eval_precision": 0.9826746074715755, "eval_recall": 0.9809927033600576, "eval_runtime": 3.4683, "eval_samples_per_second": 7904.087, "eval_steps_per_second": 7.785, "step": 1400 }, { "epoch": 3.925619834710744, "grad_norm": 1.2919989824295044, "learning_rate": 4.345403899721449e-06, "loss": 0.0316, "step": 1425 }, { "epoch": 3.994490358126722, "grad_norm": 1.893475890159607, "learning_rate": 4.0668523676880225e-06, "loss": 0.0285, "step": 1450 }, { "epoch": 3.994490358126722, "eval_accuracy": 0.9851900488801343, "eval_f1": 0.9816439099376074, "eval_loss": 0.05093059316277504, "eval_precision": 0.9853862212943633, "eval_recall": 0.9779299162237636, "eval_runtime": 3.8831, "eval_samples_per_second": 7059.86, "eval_steps_per_second": 6.953, "step": 1450 }, { "epoch": 4.0633608815427, "grad_norm": 1.606433629989624, "learning_rate": 3.7883008356545963e-06, "loss": 0.0398, "step": 1475 }, { "epoch": 4.132231404958677, "grad_norm": 1.2981059551239014, "learning_rate": 3.5097493036211698e-06, "loss": 0.026, "step": 1500 }, { "epoch": 4.132231404958677, "eval_accuracy": 0.9857736922740206, "eval_f1": 0.9824038982133189, "eval_loss": 0.05328037962317467, "eval_precision": 0.9840911145258971, "eval_recall": 0.980722457436267, "eval_runtime": 3.4891, "eval_samples_per_second": 7856.935, "eval_steps_per_second": 7.738, "step": 1500 } ], "logging_steps": 25, "max_steps": 1815, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.140915438023948e+16, "train_batch_size": 1024, "trial_name": null, "trial_params": null }