{ "best_metric": 0.0490318201482296, "best_model_checkpoint": "/home1/datahome/villien/project_hub/DinoVdeau/models/Kamoulox-large-2024_10_31-batch-size64_freeze_monolabel/checkpoint-387504", "epoch": 150.0, "eval_steps": 500, "global_step": 403650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18580453363062058, "grad_norm": 2.81394624710083, "learning_rate": 0.001, "loss": 1.1787, "step": 500 }, { "epoch": 0.37160906726124115, "grad_norm": 2.4735236167907715, "learning_rate": 0.001, "loss": 0.98, "step": 1000 }, { "epoch": 0.5574136008918618, "grad_norm": 1.8667224645614624, "learning_rate": 0.001, "loss": 0.9449, "step": 1500 }, { "epoch": 0.7432181345224823, "grad_norm": 1.7729698419570923, "learning_rate": 0.001, "loss": 0.9107, "step": 2000 }, { "epoch": 0.929022668153103, "grad_norm": 1.785703182220459, "learning_rate": 0.001, "loss": 0.9079, "step": 2500 }, { "epoch": 1.0, "eval_accuracy": 0.7326527412414418, "eval_f1_macro": 0.2723318326456157, "eval_f1_micro": 0.7326527412414418, "eval_loss": 0.8028618097305298, "eval_runtime": 507.3076, "eval_samples_per_second": 113.148, "eval_steps_per_second": 1.768, "learning_rate": 0.001, "step": 2691 }, { "epoch": 1.1148272017837235, "grad_norm": 1.4361521005630493, "learning_rate": 0.001, "loss": 0.9035, "step": 3000 }, { "epoch": 1.3006317354143442, "grad_norm": 1.2895761728286743, "learning_rate": 0.001, "loss": 0.8945, "step": 3500 }, { "epoch": 1.4864362690449646, "grad_norm": 1.5815945863723755, "learning_rate": 0.001, "loss": 0.8879, "step": 4000 }, { "epoch": 1.6722408026755853, "grad_norm": 1.3825953006744385, "learning_rate": 0.001, "loss": 0.8901, "step": 4500 }, { "epoch": 1.8580453363062057, "grad_norm": 1.0456748008728027, "learning_rate": 0.001, "loss": 0.8824, "step": 5000 }, { "epoch": 2.0, "eval_accuracy": 0.7288723192975732, "eval_f1_macro": 0.290650504066453, "eval_f1_micro": 0.7288723192975732, "eval_loss": 0.8038854002952576, "eval_runtime": 507.5267, "eval_samples_per_second": 113.099, "eval_steps_per_second": 1.767, "learning_rate": 0.001, "step": 5382 }, { "epoch": 2.0438498699368264, "grad_norm": 1.1838769912719727, "learning_rate": 0.001, "loss": 0.8883, "step": 5500 }, { "epoch": 2.229654403567447, "grad_norm": 1.2504063844680786, "learning_rate": 0.001, "loss": 0.8713, "step": 6000 }, { "epoch": 2.4154589371980677, "grad_norm": 1.2367465496063232, "learning_rate": 0.001, "loss": 0.8738, "step": 6500 }, { "epoch": 2.6012634708286884, "grad_norm": 1.2117840051651, "learning_rate": 0.001, "loss": 0.8664, "step": 7000 }, { "epoch": 2.787068004459309, "grad_norm": 0.9993311762809753, "learning_rate": 0.001, "loss": 0.8667, "step": 7500 }, { "epoch": 2.9728725380899292, "grad_norm": 1.0402483940124512, "learning_rate": 0.001, "loss": 0.8655, "step": 8000 }, { "epoch": 3.0, "eval_accuracy": 0.7408581732025574, "eval_f1_macro": 0.326985383349658, "eval_f1_micro": 0.7408581732025574, "eval_loss": 0.7705450654029846, "eval_runtime": 518.2697, "eval_samples_per_second": 110.755, "eval_steps_per_second": 1.731, "learning_rate": 0.001, "step": 8073 }, { "epoch": 3.15867707172055, "grad_norm": 1.2037022113800049, "learning_rate": 0.001, "loss": 0.8485, "step": 8500 }, { "epoch": 3.3444816053511706, "grad_norm": 1.2167097330093384, "learning_rate": 0.001, "loss": 0.8707, "step": 9000 }, { "epoch": 3.530286138981791, "grad_norm": 1.150800108909607, "learning_rate": 0.001, "loss": 0.8543, "step": 9500 }, { "epoch": 3.716090672612412, "grad_norm": 0.8481634855270386, "learning_rate": 0.001, "loss": 0.848, "step": 10000 }, { "epoch": 3.901895206243032, "grad_norm": 0.9482496380805969, "learning_rate": 0.001, "loss": 0.8514, "step": 10500 }, { "epoch": 4.0, "eval_accuracy": 0.7417118168673019, "eval_f1_macro": 0.310238611675961, "eval_f1_micro": 0.7417118168673019, "eval_loss": 0.7623223066329956, "eval_runtime": 526.8057, "eval_samples_per_second": 108.96, "eval_steps_per_second": 1.703, "learning_rate": 0.001, "step": 10764 }, { "epoch": 4.087699739873653, "grad_norm": 0.9820572137832642, "learning_rate": 0.001, "loss": 0.8542, "step": 11000 }, { "epoch": 4.273504273504273, "grad_norm": 1.2227846384048462, "learning_rate": 0.001, "loss": 0.8536, "step": 11500 }, { "epoch": 4.459308807134894, "grad_norm": 0.9903749823570251, "learning_rate": 0.001, "loss": 0.8473, "step": 12000 }, { "epoch": 4.645113340765515, "grad_norm": 0.802663266658783, "learning_rate": 0.001, "loss": 0.8475, "step": 12500 }, { "epoch": 4.830917874396135, "grad_norm": 0.8765626549720764, "learning_rate": 0.001, "loss": 0.844, "step": 13000 }, { "epoch": 5.0, "eval_accuracy": 0.7383495061061653, "eval_f1_macro": 0.3107947240093857, "eval_f1_micro": 0.7383495061061653, "eval_loss": 0.7626621127128601, "eval_runtime": 531.6556, "eval_samples_per_second": 107.967, "eval_steps_per_second": 1.687, "learning_rate": 0.001, "step": 13455 }, { "epoch": 5.016722408026756, "grad_norm": 0.846638560295105, "learning_rate": 0.001, "loss": 0.8519, "step": 13500 }, { "epoch": 5.202526941657377, "grad_norm": 0.9798027276992798, "learning_rate": 0.001, "loss": 0.8477, "step": 14000 }, { "epoch": 5.388331475287997, "grad_norm": 1.171106219291687, "learning_rate": 0.001, "loss": 0.8377, "step": 14500 }, { "epoch": 5.574136008918618, "grad_norm": 5.736990451812744, "learning_rate": 0.001, "loss": 0.8358, "step": 15000 }, { "epoch": 5.759940542549238, "grad_norm": 0.9148433208465576, "learning_rate": 0.001, "loss": 0.8361, "step": 15500 }, { "epoch": 5.9457450761798585, "grad_norm": 0.9974657297134399, "learning_rate": 0.001, "loss": 0.8437, "step": 16000 }, { "epoch": 6.0, "eval_accuracy": 0.7447257016428285, "eval_f1_macro": 0.34331624846537584, "eval_f1_micro": 0.7447257016428285, "eval_loss": 0.7451828122138977, "eval_runtime": 532.2923, "eval_samples_per_second": 107.837, "eval_steps_per_second": 1.685, "learning_rate": 0.001, "step": 16146 }, { "epoch": 6.131549609810479, "grad_norm": 0.7431060671806335, "learning_rate": 0.001, "loss": 0.8485, "step": 16500 }, { "epoch": 6.3173541434411, "grad_norm": 0.899385392665863, "learning_rate": 0.001, "loss": 0.8323, "step": 17000 }, { "epoch": 6.5031586770717205, "grad_norm": 0.8070719838142395, "learning_rate": 0.001, "loss": 0.8388, "step": 17500 }, { "epoch": 6.688963210702341, "grad_norm": 0.7958774566650391, "learning_rate": 0.001, "loss": 0.8428, "step": 18000 }, { "epoch": 6.874767744332962, "grad_norm": 0.9422007203102112, "learning_rate": 0.001, "loss": 0.8289, "step": 18500 }, { "epoch": 7.0, "eval_accuracy": 0.7467291510600861, "eval_f1_macro": 0.3283260141196405, "eval_f1_micro": 0.7467291510600861, "eval_loss": 0.7458378672599792, "eval_runtime": 529.9283, "eval_samples_per_second": 108.318, "eval_steps_per_second": 1.693, "learning_rate": 0.001, "step": 18837 }, { "epoch": 7.060572277963582, "grad_norm": 0.8445965051651001, "learning_rate": 0.001, "loss": 0.8383, "step": 19000 }, { "epoch": 7.246376811594203, "grad_norm": 0.9642378687858582, "learning_rate": 0.001, "loss": 0.8319, "step": 19500 }, { "epoch": 7.432181345224824, "grad_norm": 0.9426402449607849, "learning_rate": 0.001, "loss": 0.8365, "step": 20000 }, { "epoch": 7.617985878855444, "grad_norm": 1.0661524534225464, "learning_rate": 0.001, "loss": 0.8348, "step": 20500 }, { "epoch": 7.803790412486064, "grad_norm": 0.7548210620880127, "learning_rate": 0.001, "loss": 0.8436, "step": 21000 }, { "epoch": 7.989594946116685, "grad_norm": 1.0589923858642578, "learning_rate": 0.001, "loss": 0.8402, "step": 21500 }, { "epoch": 8.0, "eval_accuracy": 0.7458929286946221, "eval_f1_macro": 0.3352756381207012, "eval_f1_micro": 0.7458929286946221, "eval_loss": 0.7398682832717896, "eval_runtime": 542.5535, "eval_samples_per_second": 105.798, "eval_steps_per_second": 1.653, "learning_rate": 0.001, "step": 21528 }, { "epoch": 8.175399479747306, "grad_norm": 0.9055599570274353, "learning_rate": 0.001, "loss": 0.8301, "step": 22000 }, { "epoch": 8.361204013377927, "grad_norm": 0.6724838614463806, "learning_rate": 0.001, "loss": 0.8362, "step": 22500 }, { "epoch": 8.547008547008547, "grad_norm": 0.9552667140960693, "learning_rate": 0.001, "loss": 0.837, "step": 23000 }, { "epoch": 8.732813080639168, "grad_norm": 0.7529712319374084, "learning_rate": 0.001, "loss": 0.8335, "step": 23500 }, { "epoch": 8.918617614269788, "grad_norm": 0.9149639010429382, "learning_rate": 0.001, "loss": 0.8274, "step": 24000 }, { "epoch": 9.0, "eval_accuracy": 0.7455270814097316, "eval_f1_macro": 0.329402464136349, "eval_f1_micro": 0.7455270814097316, "eval_loss": 0.7424591779708862, "eval_runtime": 539.3684, "eval_samples_per_second": 106.423, "eval_steps_per_second": 1.663, "learning_rate": 0.001, "step": 24219 }, { "epoch": 9.104422147900408, "grad_norm": 0.9954981207847595, "learning_rate": 0.001, "loss": 0.8204, "step": 24500 }, { "epoch": 9.29022668153103, "grad_norm": 0.8292973041534424, "learning_rate": 0.001, "loss": 0.8373, "step": 25000 }, { "epoch": 9.47603121516165, "grad_norm": 0.6125388741493225, "learning_rate": 0.001, "loss": 0.834, "step": 25500 }, { "epoch": 9.66183574879227, "grad_norm": 0.7945879697799683, "learning_rate": 0.001, "loss": 0.8523, "step": 26000 }, { "epoch": 9.84764028242289, "grad_norm": 0.8813033699989319, "learning_rate": 0.001, "loss": 0.8289, "step": 26500 }, { "epoch": 10.0, "eval_accuracy": 0.7474782669291475, "eval_f1_macro": 0.31573051576045374, "eval_f1_micro": 0.7474782669291475, "eval_loss": 0.7364382147789001, "eval_runtime": 565.5402, "eval_samples_per_second": 101.498, "eval_steps_per_second": 1.586, "learning_rate": 0.001, "step": 26910 }, { "epoch": 10.033444816053512, "grad_norm": 0.7872824668884277, "learning_rate": 0.001, "loss": 0.8307, "step": 27000 }, { "epoch": 10.219249349684132, "grad_norm": 0.9812045097351074, "learning_rate": 0.001, "loss": 0.8275, "step": 27500 }, { "epoch": 10.405053883314753, "grad_norm": 0.9366200566291809, "learning_rate": 0.001, "loss": 0.8271, "step": 28000 }, { "epoch": 10.590858416945373, "grad_norm": 0.8672580122947693, "learning_rate": 0.001, "loss": 0.8314, "step": 28500 }, { "epoch": 10.776662950575995, "grad_norm": 0.8797865509986877, "learning_rate": 0.001, "loss": 0.845, "step": 29000 }, { "epoch": 10.962467484206615, "grad_norm": 0.708544135093689, "learning_rate": 0.001, "loss": 0.8368, "step": 29500 }, { "epoch": 11.0, "eval_accuracy": 0.7465375167680005, "eval_f1_macro": 0.34419509138343996, "eval_f1_micro": 0.7465375167680005, "eval_loss": 0.7368418574333191, "eval_runtime": 540.8249, "eval_samples_per_second": 106.136, "eval_steps_per_second": 1.659, "learning_rate": 0.001, "step": 29601 }, { "epoch": 11.148272017837236, "grad_norm": 0.7683461904525757, "learning_rate": 0.001, "loss": 0.8328, "step": 30000 }, { "epoch": 11.334076551467856, "grad_norm": 0.8067987561225891, "learning_rate": 0.001, "loss": 0.8367, "step": 30500 }, { "epoch": 11.519881085098476, "grad_norm": 0.8982560038566589, "learning_rate": 0.001, "loss": 0.8333, "step": 31000 }, { "epoch": 11.705685618729097, "grad_norm": 0.8005286455154419, "learning_rate": 0.001, "loss": 0.8264, "step": 31500 }, { "epoch": 11.891490152359717, "grad_norm": 0.848416805267334, "learning_rate": 0.001, "loss": 0.8329, "step": 32000 }, { "epoch": 12.0, "eval_accuracy": 0.7428093587219735, "eval_f1_macro": 0.3321199292959056, "eval_f1_micro": 0.7428093587219735, "eval_loss": 0.7442134022712708, "eval_runtime": 602.0094, "eval_samples_per_second": 95.349, "eval_steps_per_second": 1.49, "learning_rate": 0.001, "step": 32292 }, { "epoch": 12.077294685990339, "grad_norm": 0.9192249178886414, "learning_rate": 0.001, "loss": 0.8242, "step": 32500 }, { "epoch": 12.263099219620958, "grad_norm": 0.9414801597595215, "learning_rate": 0.001, "loss": 0.8211, "step": 33000 }, { "epoch": 12.44890375325158, "grad_norm": 0.8428712487220764, "learning_rate": 0.001, "loss": 0.8252, "step": 33500 }, { "epoch": 12.6347082868822, "grad_norm": 0.8568095564842224, "learning_rate": 0.001, "loss": 0.834, "step": 34000 }, { "epoch": 12.820512820512821, "grad_norm": 0.552550196647644, "learning_rate": 0.001, "loss": 0.8359, "step": 34500 }, { "epoch": 13.0, "eval_accuracy": 0.7479312207104406, "eval_f1_macro": 0.35283143267744377, "eval_f1_micro": 0.7479312207104406, "eval_loss": 0.7384127378463745, "eval_runtime": 536.6777, "eval_samples_per_second": 106.956, "eval_steps_per_second": 1.671, "learning_rate": 0.001, "step": 34983 }, { "epoch": 13.006317354143441, "grad_norm": 0.790766179561615, "learning_rate": 0.001, "loss": 0.8317, "step": 35000 }, { "epoch": 13.192121887774062, "grad_norm": 0.6483836770057678, "learning_rate": 0.001, "loss": 0.8142, "step": 35500 }, { "epoch": 13.377926421404682, "grad_norm": 0.7646375298500061, "learning_rate": 0.001, "loss": 0.8336, "step": 36000 }, { "epoch": 13.563730955035304, "grad_norm": 0.8401319980621338, "learning_rate": 0.001, "loss": 0.8318, "step": 36500 }, { "epoch": 13.749535488665924, "grad_norm": 0.8368563055992126, "learning_rate": 0.001, "loss": 0.8305, "step": 37000 }, { "epoch": 13.935340022296543, "grad_norm": 1.0513092279434204, "learning_rate": 0.001, "loss": 0.8388, "step": 37500 }, { "epoch": 14.0, "eval_accuracy": 0.7463633037751956, "eval_f1_macro": 0.33455884660313173, "eval_f1_micro": 0.7463633037751956, "eval_loss": 0.7464041113853455, "eval_runtime": 519.5938, "eval_samples_per_second": 110.473, "eval_steps_per_second": 1.726, "learning_rate": 0.001, "step": 37674 }, { "epoch": 14.121144555927165, "grad_norm": 0.9093230962753296, "learning_rate": 0.001, "loss": 0.8277, "step": 38000 }, { "epoch": 14.306949089557785, "grad_norm": 0.7018683552742004, "learning_rate": 0.001, "loss": 0.8154, "step": 38500 }, { "epoch": 14.492753623188406, "grad_norm": 1.060300588607788, "learning_rate": 0.001, "loss": 0.8342, "step": 39000 }, { "epoch": 14.678558156819026, "grad_norm": 0.9249637126922607, "learning_rate": 0.001, "loss": 0.8382, "step": 39500 }, { "epoch": 14.864362690449648, "grad_norm": 0.7552688717842102, "learning_rate": 0.001, "loss": 0.8306, "step": 40000 }, { "epoch": 15.0, "eval_accuracy": 0.7446734377449871, "eval_f1_macro": 0.34277495445998946, "eval_f1_micro": 0.7446734377449871, "eval_loss": 0.7394037842750549, "eval_runtime": 526.6425, "eval_samples_per_second": 108.994, "eval_steps_per_second": 1.703, "learning_rate": 0.001, "step": 40365 }, { "epoch": 15.050167224080267, "grad_norm": 0.7855104207992554, "learning_rate": 0.001, "loss": 0.8399, "step": 40500 }, { "epoch": 15.235971757710889, "grad_norm": 0.709564745426178, "learning_rate": 0.001, "loss": 0.8384, "step": 41000 }, { "epoch": 15.421776291341509, "grad_norm": 0.7003925442695618, "learning_rate": 0.001, "loss": 0.8236, "step": 41500 }, { "epoch": 15.607580824972128, "grad_norm": 0.7792437672615051, "learning_rate": 0.001, "loss": 0.8296, "step": 42000 }, { "epoch": 15.79338535860275, "grad_norm": 0.8424106240272522, "learning_rate": 0.001, "loss": 0.8329, "step": 42500 }, { "epoch": 15.97918989223337, "grad_norm": 0.8874714374542236, "learning_rate": 0.001, "loss": 0.8304, "step": 43000 }, { "epoch": 16.0, "eval_accuracy": 0.7478789568125991, "eval_f1_macro": 0.3506456767847629, "eval_f1_micro": 0.7478789568125991, "eval_loss": 0.7397111058235168, "eval_runtime": 533.4385, "eval_samples_per_second": 107.606, "eval_steps_per_second": 1.682, "learning_rate": 0.001, "step": 43056 }, { "epoch": 16.16499442586399, "grad_norm": 0.7367419600486755, "learning_rate": 0.0001, "loss": 0.8145, "step": 43500 }, { "epoch": 16.35079895949461, "grad_norm": 0.9213405251502991, "learning_rate": 0.0001, "loss": 0.7917, "step": 44000 }, { "epoch": 16.53660349312523, "grad_norm": 0.7902110815048218, "learning_rate": 0.0001, "loss": 0.7909, "step": 44500 }, { "epoch": 16.722408026755854, "grad_norm": 0.7533911466598511, "learning_rate": 0.0001, "loss": 0.8054, "step": 45000 }, { "epoch": 16.908212560386474, "grad_norm": 0.6558223962783813, "learning_rate": 0.0001, "loss": 0.7886, "step": 45500 }, { "epoch": 17.0, "eval_accuracy": 0.7554398007003362, "eval_f1_macro": 0.3747287292827094, "eval_f1_micro": 0.7554398007003362, "eval_loss": 0.7110718488693237, "eval_runtime": 519.8124, "eval_samples_per_second": 110.426, "eval_steps_per_second": 1.726, "learning_rate": 0.0001, "step": 45747 }, { "epoch": 17.094017094017094, "grad_norm": 0.8171585202217102, "learning_rate": 0.0001, "loss": 0.7962, "step": 46000 }, { "epoch": 17.279821627647713, "grad_norm": 0.9112296104431152, "learning_rate": 0.0001, "loss": 0.7774, "step": 46500 }, { "epoch": 17.465626161278337, "grad_norm": 0.7937678694725037, "learning_rate": 0.0001, "loss": 0.7802, "step": 47000 }, { "epoch": 17.651430694908957, "grad_norm": 0.8527361154556274, "learning_rate": 0.0001, "loss": 0.7867, "step": 47500 }, { "epoch": 17.837235228539576, "grad_norm": 0.8812717199325562, "learning_rate": 0.0001, "loss": 0.7815, "step": 48000 }, { "epoch": 18.0, "eval_accuracy": 0.7567463981463738, "eval_f1_macro": 0.3792648315920964, "eval_f1_micro": 0.7567463981463738, "eval_loss": 0.7041681408882141, "eval_runtime": 506.1969, "eval_samples_per_second": 113.397, "eval_steps_per_second": 1.772, "learning_rate": 0.0001, "step": 48438 }, { "epoch": 18.023039762170196, "grad_norm": 0.9349254965782166, "learning_rate": 0.0001, "loss": 0.7883, "step": 48500 }, { "epoch": 18.208844295800816, "grad_norm": 0.8031138777732849, "learning_rate": 0.0001, "loss": 0.7735, "step": 49000 }, { "epoch": 18.39464882943144, "grad_norm": 0.8354169130325317, "learning_rate": 0.0001, "loss": 0.7861, "step": 49500 }, { "epoch": 18.58045336306206, "grad_norm": 0.9157727956771851, "learning_rate": 0.0001, "loss": 0.7743, "step": 50000 }, { "epoch": 18.76625789669268, "grad_norm": 0.8417840600013733, "learning_rate": 0.0001, "loss": 0.7785, "step": 50500 }, { "epoch": 18.9520624303233, "grad_norm": 0.8945490121841431, "learning_rate": 0.0001, "loss": 0.7682, "step": 51000 }, { "epoch": 19.0, "eval_accuracy": 0.7582446298844968, "eval_f1_macro": 0.38576725361345504, "eval_f1_micro": 0.7582446298844968, "eval_loss": 0.7004917860031128, "eval_runtime": 517.5109, "eval_samples_per_second": 110.917, "eval_steps_per_second": 1.733, "learning_rate": 0.0001, "step": 51129 }, { "epoch": 19.137866963953922, "grad_norm": 1.05229914188385, "learning_rate": 0.0001, "loss": 0.7689, "step": 51500 }, { "epoch": 19.32367149758454, "grad_norm": 0.6953094601631165, "learning_rate": 0.0001, "loss": 0.7721, "step": 52000 }, { "epoch": 19.50947603121516, "grad_norm": 0.7234140634536743, "learning_rate": 0.0001, "loss": 0.7757, "step": 52500 }, { "epoch": 19.69528056484578, "grad_norm": 0.8757944703102112, "learning_rate": 0.0001, "loss": 0.7688, "step": 53000 }, { "epoch": 19.881085098476404, "grad_norm": 1.1684017181396484, "learning_rate": 0.0001, "loss": 0.7788, "step": 53500 }, { "epoch": 20.0, "eval_accuracy": 0.7602829219003153, "eval_f1_macro": 0.39339544323315423, "eval_f1_micro": 0.7602829219003153, "eval_loss": 0.6942671537399292, "eval_runtime": 534.9756, "eval_samples_per_second": 107.296, "eval_steps_per_second": 1.677, "learning_rate": 0.0001, "step": 53820 }, { "epoch": 20.066889632107024, "grad_norm": 1.1924428939819336, "learning_rate": 0.0001, "loss": 0.7598, "step": 54000 }, { "epoch": 20.252694165737644, "grad_norm": 0.6945735812187195, "learning_rate": 0.0001, "loss": 0.7695, "step": 54500 }, { "epoch": 20.438498699368264, "grad_norm": 0.8915848135948181, "learning_rate": 0.0001, "loss": 0.7645, "step": 55000 }, { "epoch": 20.624303232998884, "grad_norm": 0.9735463857650757, "learning_rate": 0.0001, "loss": 0.7747, "step": 55500 }, { "epoch": 20.810107766629507, "grad_norm": 1.0891400575637817, "learning_rate": 0.0001, "loss": 0.774, "step": 56000 }, { "epoch": 20.995912300260127, "grad_norm": 0.9328783750534058, "learning_rate": 0.0001, "loss": 0.7735, "step": 56500 }, { "epoch": 21.0, "eval_accuracy": 0.7592899078413268, "eval_f1_macro": 0.3942710537489151, "eval_f1_micro": 0.7592899078413268, "eval_loss": 0.6919424533843994, "eval_runtime": 511.1285, "eval_samples_per_second": 112.302, "eval_steps_per_second": 1.755, "learning_rate": 0.0001, "step": 56511 }, { "epoch": 21.181716833890746, "grad_norm": 0.7840440273284912, "learning_rate": 0.0001, "loss": 0.7777, "step": 57000 }, { "epoch": 21.367521367521366, "grad_norm": 0.9738557934761047, "learning_rate": 0.0001, "loss": 0.7598, "step": 57500 }, { "epoch": 21.55332590115199, "grad_norm": 0.8154065608978271, "learning_rate": 0.0001, "loss": 0.7661, "step": 58000 }, { "epoch": 21.73913043478261, "grad_norm": 1.0773000717163086, "learning_rate": 0.0001, "loss": 0.7691, "step": 58500 }, { "epoch": 21.92493496841323, "grad_norm": 1.0894912481307983, "learning_rate": 0.0001, "loss": 0.7602, "step": 59000 }, { "epoch": 22.0, "eval_accuracy": 0.7606313478859253, "eval_f1_macro": 0.3925331634038099, "eval_f1_micro": 0.7606313478859253, "eval_loss": 0.6903713941574097, "eval_runtime": 507.2562, "eval_samples_per_second": 113.16, "eval_steps_per_second": 1.768, "learning_rate": 0.0001, "step": 59202 }, { "epoch": 22.11073950204385, "grad_norm": 1.1766314506530762, "learning_rate": 0.0001, "loss": 0.7665, "step": 59500 }, { "epoch": 22.296544035674472, "grad_norm": 1.028473973274231, "learning_rate": 0.0001, "loss": 0.7638, "step": 60000 }, { "epoch": 22.482348569305092, "grad_norm": 0.900393545627594, "learning_rate": 0.0001, "loss": 0.7633, "step": 60500 }, { "epoch": 22.66815310293571, "grad_norm": 0.9336509704589844, "learning_rate": 0.0001, "loss": 0.7636, "step": 61000 }, { "epoch": 22.85395763656633, "grad_norm": 1.0151203870773315, "learning_rate": 0.0001, "loss": 0.7572, "step": 61500 }, { "epoch": 23.0, "eval_accuracy": 0.7607010330830474, "eval_f1_macro": 0.39534246826429575, "eval_f1_micro": 0.7607010330830474, "eval_loss": 0.6874070167541504, "eval_runtime": 507.2598, "eval_samples_per_second": 113.159, "eval_steps_per_second": 1.768, "learning_rate": 0.0001, "step": 61893 }, { "epoch": 23.03976217019695, "grad_norm": 1.0469074249267578, "learning_rate": 0.0001, "loss": 0.7692, "step": 62000 }, { "epoch": 23.225566703827575, "grad_norm": 0.9608765840530396, "learning_rate": 0.0001, "loss": 0.7497, "step": 62500 }, { "epoch": 23.411371237458194, "grad_norm": 1.2574784755706787, "learning_rate": 0.0001, "loss": 0.7566, "step": 63000 }, { "epoch": 23.597175771088814, "grad_norm": 1.0878256559371948, "learning_rate": 0.0001, "loss": 0.7658, "step": 63500 }, { "epoch": 23.782980304719434, "grad_norm": 1.1496306657791138, "learning_rate": 0.0001, "loss": 0.7642, "step": 64000 }, { "epoch": 23.968784838350057, "grad_norm": 1.0485353469848633, "learning_rate": 0.0001, "loss": 0.7593, "step": 64500 }, { "epoch": 24.0, "eval_accuracy": 0.7612236720614624, "eval_f1_macro": 0.3933203620961568, "eval_f1_micro": 0.7612236720614624, "eval_loss": 0.6864963173866272, "eval_runtime": 500.8584, "eval_samples_per_second": 114.605, "eval_steps_per_second": 1.791, "learning_rate": 0.0001, "step": 64584 }, { "epoch": 24.154589371980677, "grad_norm": 1.0143301486968994, "learning_rate": 0.0001, "loss": 0.7559, "step": 65000 }, { "epoch": 24.340393905611297, "grad_norm": 0.8291743993759155, "learning_rate": 0.0001, "loss": 0.7643, "step": 65500 }, { "epoch": 24.526198439241917, "grad_norm": 1.1790424585342407, "learning_rate": 0.0001, "loss": 0.7602, "step": 66000 }, { "epoch": 24.71200297287254, "grad_norm": 1.1937212944030762, "learning_rate": 0.0001, "loss": 0.7636, "step": 66500 }, { "epoch": 24.89780750650316, "grad_norm": 1.230569839477539, "learning_rate": 0.0001, "loss": 0.7548, "step": 67000 }, { "epoch": 25.0, "eval_accuracy": 0.7614153063535478, "eval_f1_macro": 0.402343965759826, "eval_f1_micro": 0.7614153063535478, "eval_loss": 0.684335470199585, "eval_runtime": 514.3947, "eval_samples_per_second": 111.589, "eval_steps_per_second": 1.744, "learning_rate": 0.0001, "step": 67275 }, { "epoch": 25.08361204013378, "grad_norm": 1.0271650552749634, "learning_rate": 0.0001, "loss": 0.7568, "step": 67500 }, { "epoch": 25.2694165737644, "grad_norm": 1.042904019355774, "learning_rate": 0.0001, "loss": 0.7524, "step": 68000 }, { "epoch": 25.45522110739502, "grad_norm": 1.03508460521698, "learning_rate": 0.0001, "loss": 0.7555, "step": 68500 }, { "epoch": 25.641025641025642, "grad_norm": 0.8760964870452881, "learning_rate": 0.0001, "loss": 0.7605, "step": 69000 }, { "epoch": 25.826830174656262, "grad_norm": 1.2160849571228027, "learning_rate": 0.0001, "loss": 0.7557, "step": 69500 }, { "epoch": 26.0, "eval_accuracy": 0.7629309593909513, "eval_f1_macro": 0.4055175650763901, "eval_f1_micro": 0.7629309593909513, "eval_loss": 0.6830293536186218, "eval_runtime": 568.9631, "eval_samples_per_second": 100.887, "eval_steps_per_second": 1.577, "learning_rate": 0.0001, "step": 69966 }, { "epoch": 26.012634708286882, "grad_norm": 0.9436312317848206, "learning_rate": 0.0001, "loss": 0.7523, "step": 70000 }, { "epoch": 26.1984392419175, "grad_norm": 1.0304313898086548, "learning_rate": 0.0001, "loss": 0.7623, "step": 70500 }, { "epoch": 26.384243775548125, "grad_norm": 1.1352418661117554, "learning_rate": 0.0001, "loss": 0.7476, "step": 71000 }, { "epoch": 26.570048309178745, "grad_norm": 0.825731098651886, "learning_rate": 0.0001, "loss": 0.752, "step": 71500 }, { "epoch": 26.755852842809364, "grad_norm": 0.9680258631706238, "learning_rate": 0.0001, "loss": 0.7572, "step": 72000 }, { "epoch": 26.941657376439984, "grad_norm": 0.8693468570709229, "learning_rate": 0.0001, "loss": 0.7534, "step": 72500 }, { "epoch": 27.0, "eval_accuracy": 0.7630703297851954, "eval_f1_macro": 0.40742696523740624, "eval_f1_micro": 0.7630703297851954, "eval_loss": 0.6827249526977539, "eval_runtime": 550.472, "eval_samples_per_second": 104.276, "eval_steps_per_second": 1.63, "learning_rate": 0.0001, "step": 72657 }, { "epoch": 27.127461910070604, "grad_norm": 1.1599422693252563, "learning_rate": 0.0001, "loss": 0.7587, "step": 73000 }, { "epoch": 27.313266443701227, "grad_norm": 1.182005763053894, "learning_rate": 0.0001, "loss": 0.7495, "step": 73500 }, { "epoch": 27.499070977331847, "grad_norm": 0.935971736907959, "learning_rate": 0.0001, "loss": 0.7522, "step": 74000 }, { "epoch": 27.684875510962467, "grad_norm": 1.0990965366363525, "learning_rate": 0.0001, "loss": 0.7513, "step": 74500 }, { "epoch": 27.870680044593087, "grad_norm": 0.834299623966217, "learning_rate": 0.0001, "loss": 0.7609, "step": 75000 }, { "epoch": 28.0, "eval_accuracy": 0.7629309593909513, "eval_f1_macro": 0.413578373717749, "eval_f1_micro": 0.7629309593909513, "eval_loss": 0.6805527210235596, "eval_runtime": 553.9776, "eval_samples_per_second": 103.616, "eval_steps_per_second": 1.619, "learning_rate": 0.0001, "step": 75348 }, { "epoch": 28.05648457822371, "grad_norm": 1.1609550714492798, "learning_rate": 0.0001, "loss": 0.7471, "step": 75500 }, { "epoch": 28.24228911185433, "grad_norm": 0.9144974946975708, "learning_rate": 0.0001, "loss": 0.7524, "step": 76000 }, { "epoch": 28.42809364548495, "grad_norm": 1.08271324634552, "learning_rate": 0.0001, "loss": 0.7461, "step": 76500 }, { "epoch": 28.61389817911557, "grad_norm": 1.0623596906661987, "learning_rate": 0.0001, "loss": 0.7497, "step": 77000 }, { "epoch": 28.799702712746193, "grad_norm": 1.168769359588623, "learning_rate": 0.0001, "loss": 0.7433, "step": 77500 }, { "epoch": 28.985507246376812, "grad_norm": 0.9733015298843384, "learning_rate": 0.0001, "loss": 0.7537, "step": 78000 }, { "epoch": 29.0, "eval_accuracy": 0.7625825334053413, "eval_f1_macro": 0.4138447052049864, "eval_f1_micro": 0.7625825334053413, "eval_loss": 0.6796479225158691, "eval_runtime": 535.8607, "eval_samples_per_second": 107.119, "eval_steps_per_second": 1.674, "learning_rate": 0.0001, "step": 78039 }, { "epoch": 29.171311780007432, "grad_norm": 1.3112975358963013, "learning_rate": 0.0001, "loss": 0.7435, "step": 78500 }, { "epoch": 29.357116313638052, "grad_norm": 1.1170014142990112, "learning_rate": 0.0001, "loss": 0.7524, "step": 79000 }, { "epoch": 29.54292084726867, "grad_norm": 0.9579864144325256, "learning_rate": 0.0001, "loss": 0.7544, "step": 79500 }, { "epoch": 29.728725380899295, "grad_norm": 0.9765433073043823, "learning_rate": 0.0001, "loss": 0.7498, "step": 80000 }, { "epoch": 29.914529914529915, "grad_norm": 1.1268893480300903, "learning_rate": 0.0001, "loss": 0.7533, "step": 80500 }, { "epoch": 30.0, "eval_accuracy": 0.7635929687636104, "eval_f1_macro": 0.41283784117218203, "eval_f1_micro": 0.7635929687636104, "eval_loss": 0.6774595379829407, "eval_runtime": 535.3939, "eval_samples_per_second": 107.213, "eval_steps_per_second": 1.675, "learning_rate": 0.0001, "step": 80730 }, { "epoch": 30.100334448160535, "grad_norm": 1.268797755241394, "learning_rate": 0.0001, "loss": 0.7368, "step": 81000 }, { "epoch": 30.286138981791154, "grad_norm": 1.029023289680481, "learning_rate": 0.0001, "loss": 0.7478, "step": 81500 }, { "epoch": 30.471943515421778, "grad_norm": 1.3363114595413208, "learning_rate": 0.0001, "loss": 0.7487, "step": 82000 }, { "epoch": 30.657748049052397, "grad_norm": 1.4572941064834595, "learning_rate": 0.0001, "loss": 0.7467, "step": 82500 }, { "epoch": 30.843552582683017, "grad_norm": 1.1603232622146606, "learning_rate": 0.0001, "loss": 0.7481, "step": 83000 }, { "epoch": 31.0, "eval_accuracy": 0.7643769272312328, "eval_f1_macro": 0.4100283180344899, "eval_f1_micro": 0.7643769272312328, "eval_loss": 0.677918553352356, "eval_runtime": 535.5808, "eval_samples_per_second": 107.175, "eval_steps_per_second": 1.675, "learning_rate": 0.0001, "step": 83421 }, { "epoch": 31.029357116313637, "grad_norm": 1.3755314350128174, "learning_rate": 0.0001, "loss": 0.7545, "step": 83500 }, { "epoch": 31.21516164994426, "grad_norm": 1.2730791568756104, "learning_rate": 0.0001, "loss": 0.7523, "step": 84000 }, { "epoch": 31.40096618357488, "grad_norm": 1.3255988359451294, "learning_rate": 0.0001, "loss": 0.7459, "step": 84500 }, { "epoch": 31.5867707172055, "grad_norm": 1.246517300605774, "learning_rate": 0.0001, "loss": 0.7411, "step": 85000 }, { "epoch": 31.77257525083612, "grad_norm": 1.3307832479476929, "learning_rate": 0.0001, "loss": 0.7313, "step": 85500 }, { "epoch": 31.95837978446674, "grad_norm": 1.4022419452667236, "learning_rate": 0.0001, "loss": 0.7523, "step": 86000 }, { "epoch": 32.0, "eval_accuracy": 0.7641504503405864, "eval_f1_macro": 0.41093585032897456, "eval_f1_micro": 0.7641504503405864, "eval_loss": 0.6754601001739502, "eval_runtime": 518.6265, "eval_samples_per_second": 110.679, "eval_steps_per_second": 1.73, "learning_rate": 0.0001, "step": 86112 }, { "epoch": 32.14418431809736, "grad_norm": 1.3993639945983887, "learning_rate": 0.0001, "loss": 0.7392, "step": 86500 }, { "epoch": 32.32998885172798, "grad_norm": 1.2422406673431396, "learning_rate": 0.0001, "loss": 0.7434, "step": 87000 }, { "epoch": 32.515793385358606, "grad_norm": 1.2804062366485596, "learning_rate": 0.0001, "loss": 0.7366, "step": 87500 }, { "epoch": 32.70159791898922, "grad_norm": 1.6868325471878052, "learning_rate": 0.0001, "loss": 0.7515, "step": 88000 }, { "epoch": 32.887402452619845, "grad_norm": 1.067484974861145, "learning_rate": 0.0001, "loss": 0.7459, "step": 88500 }, { "epoch": 33.0, "eval_accuracy": 0.7645162976254769, "eval_f1_macro": 0.4185852176848548, "eval_f1_micro": 0.7645162976254769, "eval_loss": 0.6749601364135742, "eval_runtime": 515.3075, "eval_samples_per_second": 111.392, "eval_steps_per_second": 1.741, "learning_rate": 0.0001, "step": 88803 }, { "epoch": 33.07320698625046, "grad_norm": 1.3152741193771362, "learning_rate": 0.0001, "loss": 0.7448, "step": 89000 }, { "epoch": 33.259011519881085, "grad_norm": 1.0844374895095825, "learning_rate": 0.0001, "loss": 0.7353, "step": 89500 }, { "epoch": 33.44481605351171, "grad_norm": 1.1964083909988403, "learning_rate": 0.0001, "loss": 0.7439, "step": 90000 }, { "epoch": 33.630620587142324, "grad_norm": 1.1734319925308228, "learning_rate": 0.0001, "loss": 0.7434, "step": 90500 }, { "epoch": 33.81642512077295, "grad_norm": 1.2583281993865967, "learning_rate": 0.0001, "loss": 0.7454, "step": 91000 }, { "epoch": 34.0, "eval_accuracy": 0.7650040940053309, "eval_f1_macro": 0.41458520697205553, "eval_f1_micro": 0.7650040940053309, "eval_loss": 0.6746455430984497, "eval_runtime": 513.1282, "eval_samples_per_second": 111.865, "eval_steps_per_second": 1.748, "learning_rate": 0.0001, "step": 91494 }, { "epoch": 34.002229654403564, "grad_norm": 1.368170142173767, "learning_rate": 0.0001, "loss": 0.7475, "step": 91500 }, { "epoch": 34.18803418803419, "grad_norm": 1.1858837604522705, "learning_rate": 0.0001, "loss": 0.7363, "step": 92000 }, { "epoch": 34.37383872166481, "grad_norm": 1.5839638710021973, "learning_rate": 0.0001, "loss": 0.7352, "step": 92500 }, { "epoch": 34.55964325529543, "grad_norm": 1.560922622680664, "learning_rate": 0.0001, "loss": 0.7371, "step": 93000 }, { "epoch": 34.74544778892605, "grad_norm": 1.3220325708389282, "learning_rate": 0.0001, "loss": 0.7442, "step": 93500 }, { "epoch": 34.931252322556674, "grad_norm": 1.275272250175476, "learning_rate": 0.0001, "loss": 0.7426, "step": 94000 }, { "epoch": 35.0, "eval_accuracy": 0.7641852929391474, "eval_f1_macro": 0.42549626405712787, "eval_f1_micro": 0.7641852929391474, "eval_loss": 0.6740487813949585, "eval_runtime": 514.2418, "eval_samples_per_second": 111.623, "eval_steps_per_second": 1.744, "learning_rate": 0.0001, "step": 94185 }, { "epoch": 35.11705685618729, "grad_norm": 1.3994961977005005, "learning_rate": 0.0001, "loss": 0.7432, "step": 94500 }, { "epoch": 35.30286138981791, "grad_norm": 1.2828725576400757, "learning_rate": 0.0001, "loss": 0.7443, "step": 95000 }, { "epoch": 35.48866592344853, "grad_norm": 1.3517532348632812, "learning_rate": 0.0001, "loss": 0.7426, "step": 95500 }, { "epoch": 35.67447045707915, "grad_norm": 1.2311525344848633, "learning_rate": 0.0001, "loss": 0.7274, "step": 96000 }, { "epoch": 35.860274990709776, "grad_norm": 1.209617257118225, "learning_rate": 0.0001, "loss": 0.7446, "step": 96500 }, { "epoch": 36.0, "eval_accuracy": 0.7646905106182819, "eval_f1_macro": 0.42042185334833837, "eval_f1_micro": 0.7646905106182819, "eval_loss": 0.6740365624427795, "eval_runtime": 526.8929, "eval_samples_per_second": 108.942, "eval_steps_per_second": 1.702, "learning_rate": 0.0001, "step": 96876 }, { "epoch": 36.04607952434039, "grad_norm": 1.2096632719039917, "learning_rate": 0.0001, "loss": 0.738, "step": 97000 }, { "epoch": 36.231884057971016, "grad_norm": 1.258972406387329, "learning_rate": 0.0001, "loss": 0.7421, "step": 97500 }, { "epoch": 36.41768859160163, "grad_norm": 1.5985496044158936, "learning_rate": 0.0001, "loss": 0.7426, "step": 98000 }, { "epoch": 36.603493125232255, "grad_norm": 1.261830449104309, "learning_rate": 0.0001, "loss": 0.7384, "step": 98500 }, { "epoch": 36.78929765886288, "grad_norm": 1.5109082460403442, "learning_rate": 0.0001, "loss": 0.7383, "step": 99000 }, { "epoch": 36.975102192493495, "grad_norm": 1.1752451658248901, "learning_rate": 0.0001, "loss": 0.7431, "step": 99500 }, { "epoch": 37.0, "eval_accuracy": 0.7641678716398669, "eval_f1_macro": 0.4194338951085209, "eval_f1_micro": 0.7641678716398669, "eval_loss": 0.6731483936309814, "eval_runtime": 503.5131, "eval_samples_per_second": 114.001, "eval_steps_per_second": 1.781, "learning_rate": 0.0001, "step": 99567 }, { "epoch": 37.16090672612412, "grad_norm": 1.4535642862319946, "learning_rate": 0.0001, "loss": 0.7282, "step": 100000 }, { "epoch": 37.34671125975474, "grad_norm": 1.5286682844161987, "learning_rate": 0.0001, "loss": 0.7332, "step": 100500 }, { "epoch": 37.53251579338536, "grad_norm": 1.6406091451644897, "learning_rate": 0.0001, "loss": 0.7387, "step": 101000 }, { "epoch": 37.71832032701598, "grad_norm": 1.0090203285217285, "learning_rate": 0.0001, "loss": 0.7364, "step": 101500 }, { "epoch": 37.9041248606466, "grad_norm": 1.3233065605163574, "learning_rate": 0.0001, "loss": 0.7468, "step": 102000 }, { "epoch": 38.0, "eval_accuracy": 0.7648821449103674, "eval_f1_macro": 0.42433192329853336, "eval_f1_micro": 0.7648821449103674, "eval_loss": 0.671998918056488, "eval_runtime": 536.8447, "eval_samples_per_second": 106.923, "eval_steps_per_second": 1.671, "learning_rate": 0.0001, "step": 102258 }, { "epoch": 38.08992939427722, "grad_norm": 1.2303552627563477, "learning_rate": 0.0001, "loss": 0.7463, "step": 102500 }, { "epoch": 38.275733927907844, "grad_norm": 1.200378179550171, "learning_rate": 0.0001, "loss": 0.7296, "step": 103000 }, { "epoch": 38.46153846153846, "grad_norm": 1.3014321327209473, "learning_rate": 0.0001, "loss": 0.7437, "step": 103500 }, { "epoch": 38.64734299516908, "grad_norm": 1.429656982421875, "learning_rate": 0.0001, "loss": 0.7444, "step": 104000 }, { "epoch": 38.8331475287997, "grad_norm": 1.3417949676513672, "learning_rate": 0.0001, "loss": 0.7307, "step": 104500 }, { "epoch": 39.0, "eval_accuracy": 0.7659274228671974, "eval_f1_macro": 0.42040362773805245, "eval_f1_micro": 0.7659274228671974, "eval_loss": 0.6694707870483398, "eval_runtime": 527.8534, "eval_samples_per_second": 108.744, "eval_steps_per_second": 1.699, "learning_rate": 0.0001, "step": 104949 }, { "epoch": 39.01895206243032, "grad_norm": 1.656445860862732, "learning_rate": 0.0001, "loss": 0.7423, "step": 105000 }, { "epoch": 39.204756596060946, "grad_norm": 1.33378267288208, "learning_rate": 0.0001, "loss": 0.7407, "step": 105500 }, { "epoch": 39.39056112969156, "grad_norm": 1.814835548400879, "learning_rate": 0.0001, "loss": 0.7395, "step": 106000 }, { "epoch": 39.576365663322186, "grad_norm": 1.5420753955841064, "learning_rate": 0.0001, "loss": 0.7321, "step": 106500 }, { "epoch": 39.76217019695281, "grad_norm": 1.2062960863113403, "learning_rate": 0.0001, "loss": 0.7302, "step": 107000 }, { "epoch": 39.947974730583425, "grad_norm": 1.4515705108642578, "learning_rate": 0.0001, "loss": 0.7404, "step": 107500 }, { "epoch": 40.0, "eval_accuracy": 0.7665023257434539, "eval_f1_macro": 0.42052839725843943, "eval_f1_micro": 0.7665023257434539, "eval_loss": 0.6693674325942993, "eval_runtime": 517.9605, "eval_samples_per_second": 110.821, "eval_steps_per_second": 1.732, "learning_rate": 0.0001, "step": 107640 }, { "epoch": 40.13377926421405, "grad_norm": 1.211496114730835, "learning_rate": 0.0001, "loss": 0.728, "step": 108000 }, { "epoch": 40.319583797844665, "grad_norm": 1.0356364250183105, "learning_rate": 0.0001, "loss": 0.7307, "step": 108500 }, { "epoch": 40.50538833147529, "grad_norm": 1.508729338645935, "learning_rate": 0.0001, "loss": 0.732, "step": 109000 }, { "epoch": 40.69119286510591, "grad_norm": 1.315461277961731, "learning_rate": 0.0001, "loss": 0.7471, "step": 109500 }, { "epoch": 40.87699739873653, "grad_norm": 1.2628021240234375, "learning_rate": 0.0001, "loss": 0.7355, "step": 110000 }, { "epoch": 41.0, "eval_accuracy": 0.7658228950715145, "eval_f1_macro": 0.4176095837463332, "eval_f1_micro": 0.7658228950715145, "eval_loss": 0.6682748198509216, "eval_runtime": 491.8206, "eval_samples_per_second": 116.711, "eval_steps_per_second": 1.824, "learning_rate": 0.0001, "step": 110331 }, { "epoch": 41.06280193236715, "grad_norm": 1.4534225463867188, "learning_rate": 0.0001, "loss": 0.7362, "step": 110500 }, { "epoch": 41.24860646599777, "grad_norm": 1.305442214012146, "learning_rate": 0.0001, "loss": 0.7292, "step": 111000 }, { "epoch": 41.43441099962839, "grad_norm": 1.2671395540237427, "learning_rate": 0.0001, "loss": 0.7293, "step": 111500 }, { "epoch": 41.620215533259014, "grad_norm": 1.7886228561401367, "learning_rate": 0.0001, "loss": 0.7379, "step": 112000 }, { "epoch": 41.80602006688963, "grad_norm": 1.565897822380066, "learning_rate": 0.0001, "loss": 0.7388, "step": 112500 }, { "epoch": 41.99182460052025, "grad_norm": 1.4743560552597046, "learning_rate": 0.0001, "loss": 0.7508, "step": 113000 }, { "epoch": 42.0, "eval_accuracy": 0.7665371683420149, "eval_f1_macro": 0.4307432846853069, "eval_f1_micro": 0.7665371683420149, "eval_loss": 0.6682831645011902, "eval_runtime": 484.4423, "eval_samples_per_second": 118.489, "eval_steps_per_second": 1.852, "learning_rate": 0.0001, "step": 113022 }, { "epoch": 42.17762913415088, "grad_norm": 1.530715823173523, "learning_rate": 0.0001, "loss": 0.7309, "step": 113500 }, { "epoch": 42.36343366778149, "grad_norm": 1.4992977380752563, "learning_rate": 0.0001, "loss": 0.7282, "step": 114000 }, { "epoch": 42.549238201412116, "grad_norm": 1.5467846393585205, "learning_rate": 0.0001, "loss": 0.7377, "step": 114500 }, { "epoch": 42.73504273504273, "grad_norm": 1.5137412548065186, "learning_rate": 0.0001, "loss": 0.733, "step": 115000 }, { "epoch": 42.920847268673356, "grad_norm": 1.6055561304092407, "learning_rate": 0.0001, "loss": 0.7368, "step": 115500 }, { "epoch": 43.0, "eval_accuracy": 0.7664152192470515, "eval_f1_macro": 0.42637174689527435, "eval_f1_micro": 0.7664152192470515, "eval_loss": 0.6695142984390259, "eval_runtime": 488.5528, "eval_samples_per_second": 117.492, "eval_steps_per_second": 1.836, "learning_rate": 0.0001, "step": 115713 }, { "epoch": 43.10665180230398, "grad_norm": 1.460276484489441, "learning_rate": 0.0001, "loss": 0.7371, "step": 116000 }, { "epoch": 43.292456335934595, "grad_norm": 1.4303547143936157, "learning_rate": 0.0001, "loss": 0.7276, "step": 116500 }, { "epoch": 43.47826086956522, "grad_norm": 1.5059736967086792, "learning_rate": 0.0001, "loss": 0.7282, "step": 117000 }, { "epoch": 43.664065403195835, "grad_norm": 1.4985263347625732, "learning_rate": 0.0001, "loss": 0.7347, "step": 117500 }, { "epoch": 43.84986993682646, "grad_norm": 1.1422086954116821, "learning_rate": 0.0001, "loss": 0.7362, "step": 118000 }, { "epoch": 44.0, "eval_accuracy": 0.765840316370795, "eval_f1_macro": 0.42609728385101026, "eval_f1_micro": 0.765840316370795, "eval_loss": 0.669391393661499, "eval_runtime": 508.1911, "eval_samples_per_second": 112.952, "eval_steps_per_second": 1.765, "learning_rate": 0.0001, "step": 118404 }, { "epoch": 44.03567447045708, "grad_norm": 1.8496322631835938, "learning_rate": 0.0001, "loss": 0.7336, "step": 118500 }, { "epoch": 44.2214790040877, "grad_norm": 1.4898383617401123, "learning_rate": 0.0001, "loss": 0.7361, "step": 119000 }, { "epoch": 44.40728353771832, "grad_norm": 1.341386079788208, "learning_rate": 0.0001, "loss": 0.733, "step": 119500 }, { "epoch": 44.593088071348944, "grad_norm": 1.747814416885376, "learning_rate": 0.0001, "loss": 0.7256, "step": 120000 }, { "epoch": 44.77889260497956, "grad_norm": 1.5329481363296509, "learning_rate": 0.0001, "loss": 0.7203, "step": 120500 }, { "epoch": 44.964697138610184, "grad_norm": 1.4953057765960693, "learning_rate": 0.0001, "loss": 0.7287, "step": 121000 }, { "epoch": 45.0, "eval_accuracy": 0.7649518301074895, "eval_f1_macro": 0.4307801257532618, "eval_f1_micro": 0.7649518301074895, "eval_loss": 0.6695447564125061, "eval_runtime": 486.6366, "eval_samples_per_second": 117.955, "eval_steps_per_second": 1.843, "learning_rate": 0.0001, "step": 121095 }, { "epoch": 45.1505016722408, "grad_norm": 1.5713474750518799, "learning_rate": 0.0001, "loss": 0.7282, "step": 121500 }, { "epoch": 45.33630620587142, "grad_norm": 1.133755087852478, "learning_rate": 0.0001, "loss": 0.7333, "step": 122000 }, { "epoch": 45.52211073950205, "grad_norm": 1.4431047439575195, "learning_rate": 0.0001, "loss": 0.7271, "step": 122500 }, { "epoch": 45.70791527313266, "grad_norm": 1.942878007888794, "learning_rate": 0.0001, "loss": 0.7298, "step": 123000 }, { "epoch": 45.893719806763286, "grad_norm": 1.6559330224990845, "learning_rate": 0.0001, "loss": 0.7374, "step": 123500 }, { "epoch": 46.0, "eval_accuracy": 0.7673908120067595, "eval_f1_macro": 0.4362286100546047, "eval_f1_micro": 0.7673908120067595, "eval_loss": 0.6653340458869934, "eval_runtime": 483.687, "eval_samples_per_second": 118.674, "eval_steps_per_second": 1.855, "learning_rate": 0.0001, "step": 123786 }, { "epoch": 46.0795243403939, "grad_norm": 1.346835732460022, "learning_rate": 0.0001, "loss": 0.7369, "step": 124000 }, { "epoch": 46.265328874024526, "grad_norm": 1.3699783086776733, "learning_rate": 0.0001, "loss": 0.7344, "step": 124500 }, { "epoch": 46.45113340765515, "grad_norm": 1.4523323774337769, "learning_rate": 0.0001, "loss": 0.7322, "step": 125000 }, { "epoch": 46.636937941285765, "grad_norm": 1.5713220834732056, "learning_rate": 0.0001, "loss": 0.7276, "step": 125500 }, { "epoch": 46.82274247491639, "grad_norm": 1.4702354669570923, "learning_rate": 0.0001, "loss": 0.7321, "step": 126000 }, { "epoch": 47.0, "eval_accuracy": 0.7674430759046009, "eval_f1_macro": 0.43279477858934173, "eval_f1_micro": 0.7674430759046009, "eval_loss": 0.6659862995147705, "eval_runtime": 487.7507, "eval_samples_per_second": 117.685, "eval_steps_per_second": 1.839, "learning_rate": 0.0001, "step": 126477 }, { "epoch": 47.00854700854701, "grad_norm": 1.5675249099731445, "learning_rate": 0.0001, "loss": 0.7255, "step": 126500 }, { "epoch": 47.19435154217763, "grad_norm": 1.4069844484329224, "learning_rate": 0.0001, "loss": 0.7272, "step": 127000 }, { "epoch": 47.38015607580825, "grad_norm": 1.2489932775497437, "learning_rate": 0.0001, "loss": 0.7273, "step": 127500 }, { "epoch": 47.56596060943887, "grad_norm": 1.8463833332061768, "learning_rate": 0.0001, "loss": 0.74, "step": 128000 }, { "epoch": 47.75176514306949, "grad_norm": 1.3903892040252686, "learning_rate": 0.0001, "loss": 0.7391, "step": 128500 }, { "epoch": 47.937569676700114, "grad_norm": 1.2857866287231445, "learning_rate": 0.0001, "loss": 0.7352, "step": 129000 }, { "epoch": 48.0, "eval_accuracy": 0.7668855943276249, "eval_f1_macro": 0.43077383038275147, "eval_f1_micro": 0.7668855943276249, "eval_loss": 0.665557861328125, "eval_runtime": 486.8422, "eval_samples_per_second": 117.905, "eval_steps_per_second": 1.842, "learning_rate": 0.0001, "step": 129168 }, { "epoch": 48.12337421033073, "grad_norm": 1.7764297723770142, "learning_rate": 0.0001, "loss": 0.7203, "step": 129500 }, { "epoch": 48.309178743961354, "grad_norm": 1.469336748123169, "learning_rate": 0.0001, "loss": 0.7292, "step": 130000 }, { "epoch": 48.49498327759197, "grad_norm": 1.4002306461334229, "learning_rate": 0.0001, "loss": 0.7279, "step": 130500 }, { "epoch": 48.680787811222594, "grad_norm": 1.6152621507644653, "learning_rate": 0.0001, "loss": 0.7386, "step": 131000 }, { "epoch": 48.86659234485322, "grad_norm": 1.5064826011657715, "learning_rate": 0.0001, "loss": 0.7373, "step": 131500 }, { "epoch": 49.0, "eval_accuracy": 0.7666242748384174, "eval_f1_macro": 0.4241681801855841, "eval_f1_micro": 0.7666242748384174, "eval_loss": 0.6672787666320801, "eval_runtime": 484.1155, "eval_samples_per_second": 118.569, "eval_steps_per_second": 1.853, "learning_rate": 0.0001, "step": 131859 }, { "epoch": 49.05239687848383, "grad_norm": 1.329869031906128, "learning_rate": 0.0001, "loss": 0.7295, "step": 132000 }, { "epoch": 49.238201412114456, "grad_norm": 1.749746322631836, "learning_rate": 0.0001, "loss": 0.7301, "step": 132500 }, { "epoch": 49.42400594574507, "grad_norm": 2.059324026107788, "learning_rate": 0.0001, "loss": 0.7214, "step": 133000 }, { "epoch": 49.609810479375696, "grad_norm": 1.6400606632232666, "learning_rate": 0.0001, "loss": 0.7285, "step": 133500 }, { "epoch": 49.79561501300632, "grad_norm": 1.6473073959350586, "learning_rate": 0.0001, "loss": 0.7283, "step": 134000 }, { "epoch": 49.981419546636936, "grad_norm": 1.7654767036437988, "learning_rate": 0.0001, "loss": 0.7307, "step": 134500 }, { "epoch": 50.0, "eval_accuracy": 0.7662584275535269, "eval_f1_macro": 0.43151276516162357, "eval_f1_micro": 0.7662584275535269, "eval_loss": 0.6661437749862671, "eval_runtime": 486.5547, "eval_samples_per_second": 117.974, "eval_steps_per_second": 1.844, "learning_rate": 0.0001, "step": 134550 }, { "epoch": 50.16722408026756, "grad_norm": 1.484832525253296, "learning_rate": 0.0001, "loss": 0.731, "step": 135000 }, { "epoch": 50.35302861389818, "grad_norm": 1.4548848867416382, "learning_rate": 0.0001, "loss": 0.7262, "step": 135500 }, { "epoch": 50.5388331475288, "grad_norm": 1.7120981216430664, "learning_rate": 0.0001, "loss": 0.7345, "step": 136000 }, { "epoch": 50.72463768115942, "grad_norm": 1.789556860923767, "learning_rate": 0.0001, "loss": 0.7253, "step": 136500 }, { "epoch": 50.91044221479004, "grad_norm": 2.5206713676452637, "learning_rate": 0.0001, "loss": 0.7235, "step": 137000 }, { "epoch": 51.0, "eval_accuracy": 0.7667288026341005, "eval_f1_macro": 0.4307690989303114, "eval_f1_micro": 0.7667288026341005, "eval_loss": 0.6638755798339844, "eval_runtime": 505.3338, "eval_samples_per_second": 113.59, "eval_steps_per_second": 1.775, "learning_rate": 0.0001, "step": 137241 }, { "epoch": 51.09624674842066, "grad_norm": 1.9092504978179932, "learning_rate": 0.0001, "loss": 0.7266, "step": 137500 }, { "epoch": 51.282051282051285, "grad_norm": 1.6534066200256348, "learning_rate": 0.0001, "loss": 0.7266, "step": 138000 }, { "epoch": 51.4678558156819, "grad_norm": 1.7287977933883667, "learning_rate": 0.0001, "loss": 0.7304, "step": 138500 }, { "epoch": 51.653660349312524, "grad_norm": 2.113309860229492, "learning_rate": 0.0001, "loss": 0.7356, "step": 139000 }, { "epoch": 51.83946488294314, "grad_norm": 1.4403417110443115, "learning_rate": 0.0001, "loss": 0.7295, "step": 139500 }, { "epoch": 52.0, "eval_accuracy": 0.7679134509851745, "eval_f1_macro": 0.4427799679621408, "eval_f1_micro": 0.7679134509851745, "eval_loss": 0.6654694676399231, "eval_runtime": 502.5773, "eval_samples_per_second": 114.213, "eval_steps_per_second": 1.785, "learning_rate": 0.0001, "step": 139932 }, { "epoch": 52.025269416573764, "grad_norm": 1.7017192840576172, "learning_rate": 0.0001, "loss": 0.712, "step": 140000 }, { "epoch": 52.21107395020439, "grad_norm": 1.3907127380371094, "learning_rate": 0.0001, "loss": 0.7329, "step": 140500 }, { "epoch": 52.396878483835, "grad_norm": 1.791027307510376, "learning_rate": 0.0001, "loss": 0.7276, "step": 141000 }, { "epoch": 52.58268301746563, "grad_norm": 1.384203553199768, "learning_rate": 0.0001, "loss": 0.7126, "step": 141500 }, { "epoch": 52.76848755109625, "grad_norm": 2.309049367904663, "learning_rate": 0.0001, "loss": 0.7284, "step": 142000 }, { "epoch": 52.954292084726866, "grad_norm": 1.4892642498016357, "learning_rate": 0.0001, "loss": 0.7267, "step": 142500 }, { "epoch": 53.0, "eval_accuracy": 0.767234020313235, "eval_f1_macro": 0.4341825403324277, "eval_f1_micro": 0.767234020313235, "eval_loss": 0.6643231511116028, "eval_runtime": 484.553, "eval_samples_per_second": 118.462, "eval_steps_per_second": 1.851, "learning_rate": 0.0001, "step": 142623 }, { "epoch": 53.14009661835749, "grad_norm": 1.9168522357940674, "learning_rate": 0.0001, "loss": 0.7301, "step": 143000 }, { "epoch": 53.325901151988106, "grad_norm": 2.0074169635772705, "learning_rate": 0.0001, "loss": 0.7194, "step": 143500 }, { "epoch": 53.51170568561873, "grad_norm": 1.7252275943756104, "learning_rate": 0.0001, "loss": 0.7248, "step": 144000 }, { "epoch": 53.69751021924935, "grad_norm": 1.5249263048171997, "learning_rate": 0.0001, "loss": 0.7318, "step": 144500 }, { "epoch": 53.88331475287997, "grad_norm": 1.5287233591079712, "learning_rate": 0.0001, "loss": 0.724, "step": 145000 }, { "epoch": 54.0, "eval_accuracy": 0.7663455340499294, "eval_f1_macro": 0.4459616186399035, "eval_f1_micro": 0.7663455340499294, "eval_loss": 0.667382001876831, "eval_runtime": 485.0681, "eval_samples_per_second": 118.336, "eval_steps_per_second": 1.849, "learning_rate": 0.0001, "step": 145314 }, { "epoch": 54.06911928651059, "grad_norm": 2.074103593826294, "learning_rate": 0.0001, "loss": 0.7333, "step": 145500 }, { "epoch": 54.25492382014121, "grad_norm": 1.8293678760528564, "learning_rate": 0.0001, "loss": 0.7265, "step": 146000 }, { "epoch": 54.44072835377183, "grad_norm": 1.985289454460144, "learning_rate": 0.0001, "loss": 0.7241, "step": 146500 }, { "epoch": 54.626532887402455, "grad_norm": 1.5796940326690674, "learning_rate": 0.0001, "loss": 0.7188, "step": 147000 }, { "epoch": 54.81233742103307, "grad_norm": 2.1631064414978027, "learning_rate": 0.0001, "loss": 0.7321, "step": 147500 }, { "epoch": 54.998141954663694, "grad_norm": 1.892568588256836, "learning_rate": 0.0001, "loss": 0.734, "step": 148000 }, { "epoch": 55.0, "eval_accuracy": 0.7684709325621505, "eval_f1_macro": 0.4389385481332223, "eval_f1_micro": 0.7684709325621505, "eval_loss": 0.6627440452575684, "eval_runtime": 477.8566, "eval_samples_per_second": 120.122, "eval_steps_per_second": 1.877, "learning_rate": 0.0001, "step": 148005 }, { "epoch": 55.18394648829432, "grad_norm": 1.820019006729126, "learning_rate": 0.0001, "loss": 0.7167, "step": 148500 }, { "epoch": 55.369751021924934, "grad_norm": 1.9633277654647827, "learning_rate": 0.0001, "loss": 0.7347, "step": 149000 }, { "epoch": 55.55555555555556, "grad_norm": 1.589463710784912, "learning_rate": 0.0001, "loss": 0.7285, "step": 149500 }, { "epoch": 55.74136008918617, "grad_norm": 1.771761417388916, "learning_rate": 0.0001, "loss": 0.73, "step": 150000 }, { "epoch": 55.9271646228168, "grad_norm": 1.93351411819458, "learning_rate": 0.0001, "loss": 0.7285, "step": 150500 }, { "epoch": 56.0, "eval_accuracy": 0.767094649918991, "eval_f1_macro": 0.43857797557707, "eval_f1_micro": 0.767094649918991, "eval_loss": 0.6627209186553955, "eval_runtime": 479.9805, "eval_samples_per_second": 119.59, "eval_steps_per_second": 1.869, "learning_rate": 0.0001, "step": 150696 }, { "epoch": 56.11296915644742, "grad_norm": 1.5868228673934937, "learning_rate": 0.0001, "loss": 0.724, "step": 151000 }, { "epoch": 56.298773690078036, "grad_norm": 1.6344318389892578, "learning_rate": 0.0001, "loss": 0.7218, "step": 151500 }, { "epoch": 56.48457822370866, "grad_norm": 1.6172913312911987, "learning_rate": 0.0001, "loss": 0.7248, "step": 152000 }, { "epoch": 56.670382757339276, "grad_norm": 1.5940032005310059, "learning_rate": 0.0001, "loss": 0.7336, "step": 152500 }, { "epoch": 56.8561872909699, "grad_norm": 1.6969693899154663, "learning_rate": 0.0001, "loss": 0.729, "step": 153000 }, { "epoch": 57.0, "eval_accuracy": 0.7669204369261859, "eval_f1_macro": 0.43847119749006624, "eval_f1_micro": 0.7669204369261859, "eval_loss": 0.6640397310256958, "eval_runtime": 484.1334, "eval_samples_per_second": 118.564, "eval_steps_per_second": 1.853, "learning_rate": 0.0001, "step": 153387 }, { "epoch": 57.04199182460052, "grad_norm": 1.940199851989746, "learning_rate": 0.0001, "loss": 0.7136, "step": 153500 }, { "epoch": 57.22779635823114, "grad_norm": 2.240196466445923, "learning_rate": 0.0001, "loss": 0.7264, "step": 154000 }, { "epoch": 57.41360089186176, "grad_norm": 1.6355632543563843, "learning_rate": 0.0001, "loss": 0.7226, "step": 154500 }, { "epoch": 57.599405425492385, "grad_norm": 1.930558681488037, "learning_rate": 0.0001, "loss": 0.7322, "step": 155000 }, { "epoch": 57.785209959123, "grad_norm": 2.192221164703369, "learning_rate": 0.0001, "loss": 0.7322, "step": 155500 }, { "epoch": 57.971014492753625, "grad_norm": 1.9269661903381348, "learning_rate": 0.0001, "loss": 0.7179, "step": 156000 }, { "epoch": 58.0, "eval_accuracy": 0.7672862842110765, "eval_f1_macro": 0.43760916892251167, "eval_f1_micro": 0.7672862842110765, "eval_loss": 0.6627684235572815, "eval_runtime": 479.7239, "eval_samples_per_second": 119.654, "eval_steps_per_second": 1.87, "learning_rate": 0.0001, "step": 156078 }, { "epoch": 58.15681902638424, "grad_norm": 2.0105526447296143, "learning_rate": 0.0001, "loss": 0.7321, "step": 156500 }, { "epoch": 58.342623560014864, "grad_norm": 1.7992244958877563, "learning_rate": 0.0001, "loss": 0.7319, "step": 157000 }, { "epoch": 58.52842809364549, "grad_norm": 2.2000529766082764, "learning_rate": 0.0001, "loss": 0.7215, "step": 157500 }, { "epoch": 58.714232627276104, "grad_norm": 2.374115228652954, "learning_rate": 0.0001, "loss": 0.7189, "step": 158000 }, { "epoch": 58.90003716090673, "grad_norm": 2.0600476264953613, "learning_rate": 0.0001, "loss": 0.7257, "step": 158500 }, { "epoch": 59.0, "eval_accuracy": 0.7679134509851745, "eval_f1_macro": 0.439932052501894, "eval_f1_micro": 0.7679134509851745, "eval_loss": 0.6614954471588135, "eval_runtime": 476.1801, "eval_samples_per_second": 120.545, "eval_steps_per_second": 1.884, "learning_rate": 0.0001, "step": 158769 }, { "epoch": 59.08584169453734, "grad_norm": 1.518589735031128, "learning_rate": 0.0001, "loss": 0.7106, "step": 159000 }, { "epoch": 59.27164622816797, "grad_norm": 1.8264387845993042, "learning_rate": 0.0001, "loss": 0.7262, "step": 159500 }, { "epoch": 59.45745076179859, "grad_norm": 2.5099358558654785, "learning_rate": 0.0001, "loss": 0.7217, "step": 160000 }, { "epoch": 59.643255295429206, "grad_norm": 2.0228655338287354, "learning_rate": 0.0001, "loss": 0.7234, "step": 160500 }, { "epoch": 59.82905982905983, "grad_norm": 1.8197671175003052, "learning_rate": 0.0001, "loss": 0.7297, "step": 161000 }, { "epoch": 60.0, "eval_accuracy": 0.766990122123308, "eval_f1_macro": 0.44188579219640917, "eval_f1_micro": 0.766990122123308, "eval_loss": 0.6633245944976807, "eval_runtime": 478.7046, "eval_samples_per_second": 119.909, "eval_steps_per_second": 1.874, "learning_rate": 0.0001, "step": 161460 }, { "epoch": 60.01486436269045, "grad_norm": 1.6832709312438965, "learning_rate": 0.0001, "loss": 0.7319, "step": 161500 }, { "epoch": 60.20066889632107, "grad_norm": 2.1717429161071777, "learning_rate": 0.0001, "loss": 0.7281, "step": 162000 }, { "epoch": 60.38647342995169, "grad_norm": 2.8050575256347656, "learning_rate": 0.0001, "loss": 0.7216, "step": 162500 }, { "epoch": 60.57227796358231, "grad_norm": 2.1643614768981934, "learning_rate": 0.0001, "loss": 0.7272, "step": 163000 }, { "epoch": 60.75808249721293, "grad_norm": 2.2237448692321777, "learning_rate": 0.0001, "loss": 0.7238, "step": 163500 }, { "epoch": 60.943887030843555, "grad_norm": 2.3279480934143066, "learning_rate": 0.0001, "loss": 0.7297, "step": 164000 }, { "epoch": 61.0, "eval_accuracy": 0.7685754603578335, "eval_f1_macro": 0.4370683373238057, "eval_f1_micro": 0.7685754603578335, "eval_loss": 0.6611309051513672, "eval_runtime": 490.7606, "eval_samples_per_second": 116.963, "eval_steps_per_second": 1.828, "learning_rate": 0.0001, "step": 164151 }, { "epoch": 61.12969156447417, "grad_norm": 1.1877753734588623, "learning_rate": 0.0001, "loss": 0.721, "step": 164500 }, { "epoch": 61.315496098104795, "grad_norm": 1.6631951332092285, "learning_rate": 0.0001, "loss": 0.7222, "step": 165000 }, { "epoch": 61.50130063173541, "grad_norm": 1.9209131002426147, "learning_rate": 0.0001, "loss": 0.7283, "step": 165500 }, { "epoch": 61.687105165366034, "grad_norm": 2.088667392730713, "learning_rate": 0.0001, "loss": 0.7217, "step": 166000 }, { "epoch": 61.87290969899666, "grad_norm": 2.503835916519165, "learning_rate": 0.0001, "loss": 0.7262, "step": 166500 }, { "epoch": 62.0, "eval_accuracy": 0.7684360899635895, "eval_f1_macro": 0.45352172583851785, "eval_f1_micro": 0.7684360899635895, "eval_loss": 0.660831093788147, "eval_runtime": 497.2138, "eval_samples_per_second": 115.445, "eval_steps_per_second": 1.804, "learning_rate": 0.0001, "step": 166842 }, { "epoch": 62.058714232627274, "grad_norm": 1.5878970623016357, "learning_rate": 0.0001, "loss": 0.7166, "step": 167000 }, { "epoch": 62.2445187662579, "grad_norm": 1.8134915828704834, "learning_rate": 0.0001, "loss": 0.7234, "step": 167500 }, { "epoch": 62.43032329988852, "grad_norm": 1.9176697731018066, "learning_rate": 0.0001, "loss": 0.7246, "step": 168000 }, { "epoch": 62.61612783351914, "grad_norm": 1.6024200916290283, "learning_rate": 0.0001, "loss": 0.7322, "step": 168500 }, { "epoch": 62.80193236714976, "grad_norm": 1.7893162965774536, "learning_rate": 0.0001, "loss": 0.7227, "step": 169000 }, { "epoch": 62.98773690078038, "grad_norm": 1.9036942720413208, "learning_rate": 0.0001, "loss": 0.7204, "step": 169500 }, { "epoch": 63.0, "eval_accuracy": 0.767791501890211, "eval_f1_macro": 0.44611945476580944, "eval_f1_micro": 0.767791501890211, "eval_loss": 0.6621896028518677, "eval_runtime": 505.1543, "eval_samples_per_second": 113.631, "eval_steps_per_second": 1.776, "learning_rate": 0.0001, "step": 169533 }, { "epoch": 63.173541434411, "grad_norm": 2.1570420265197754, "learning_rate": 0.0001, "loss": 0.7229, "step": 170000 }, { "epoch": 63.35934596804162, "grad_norm": 1.816713571548462, "learning_rate": 0.0001, "loss": 0.7209, "step": 170500 }, { "epoch": 63.54515050167224, "grad_norm": 1.9060814380645752, "learning_rate": 0.0001, "loss": 0.7279, "step": 171000 }, { "epoch": 63.73095503530286, "grad_norm": 1.8209328651428223, "learning_rate": 0.0001, "loss": 0.726, "step": 171500 }, { "epoch": 63.91675956893348, "grad_norm": 2.5898454189300537, "learning_rate": 0.0001, "loss": 0.7296, "step": 172000 }, { "epoch": 64.0, "eval_accuracy": 0.767547603700284, "eval_f1_macro": 0.4439334258360575, "eval_f1_micro": 0.767547603700284, "eval_loss": 0.6610415577888489, "eval_runtime": 518.2702, "eval_samples_per_second": 110.755, "eval_steps_per_second": 1.731, "learning_rate": 0.0001, "step": 172224 }, { "epoch": 64.1025641025641, "grad_norm": 2.03393816947937, "learning_rate": 0.0001, "loss": 0.7116, "step": 172500 }, { "epoch": 64.28836863619472, "grad_norm": 2.2850654125213623, "learning_rate": 0.0001, "loss": 0.7204, "step": 173000 }, { "epoch": 64.47417316982535, "grad_norm": 2.1166341304779053, "learning_rate": 0.0001, "loss": 0.7139, "step": 173500 }, { "epoch": 64.65997770345597, "grad_norm": 2.2916886806488037, "learning_rate": 0.0001, "loss": 0.7235, "step": 174000 }, { "epoch": 64.84578223708658, "grad_norm": 2.0275001525878906, "learning_rate": 0.0001, "loss": 0.7253, "step": 174500 }, { "epoch": 65.0, "eval_accuracy": 0.7680354000801379, "eval_f1_macro": 0.434573899819693, "eval_f1_micro": 0.7680354000801379, "eval_loss": 0.6589834690093994, "eval_runtime": 514.9526, "eval_samples_per_second": 111.469, "eval_steps_per_second": 1.742, "learning_rate": 0.0001, "step": 174915 }, { "epoch": 65.03158677071721, "grad_norm": 1.5037002563476562, "learning_rate": 0.0001, "loss": 0.7336, "step": 175000 }, { "epoch": 65.21739130434783, "grad_norm": 1.769515037536621, "learning_rate": 0.0001, "loss": 0.7181, "step": 175500 }, { "epoch": 65.40319583797844, "grad_norm": 2.0847535133361816, "learning_rate": 0.0001, "loss": 0.7236, "step": 176000 }, { "epoch": 65.58900037160906, "grad_norm": 2.0402798652648926, "learning_rate": 0.0001, "loss": 0.7196, "step": 176500 }, { "epoch": 65.77480490523969, "grad_norm": 2.2478229999542236, "learning_rate": 0.0001, "loss": 0.7277, "step": 177000 }, { "epoch": 65.9606094388703, "grad_norm": 2.4031426906585693, "learning_rate": 0.0001, "loss": 0.723, "step": 177500 }, { "epoch": 66.0, "eval_accuracy": 0.76845351126287, "eval_f1_macro": 0.4397010901020239, "eval_f1_micro": 0.76845351126287, "eval_loss": 0.6599727272987366, "eval_runtime": 512.5743, "eval_samples_per_second": 111.986, "eval_steps_per_second": 1.75, "learning_rate": 0.0001, "step": 177606 }, { "epoch": 66.14641397250092, "grad_norm": 1.965077519416809, "learning_rate": 0.0001, "loss": 0.7167, "step": 178000 }, { "epoch": 66.33221850613155, "grad_norm": 2.626265525817871, "learning_rate": 0.0001, "loss": 0.7315, "step": 178500 }, { "epoch": 66.51802303976217, "grad_norm": 2.360399007797241, "learning_rate": 0.0001, "loss": 0.7101, "step": 179000 }, { "epoch": 66.70382757339279, "grad_norm": 2.442690849304199, "learning_rate": 0.0001, "loss": 0.7336, "step": 179500 }, { "epoch": 66.88963210702342, "grad_norm": 1.652727723121643, "learning_rate": 0.0001, "loss": 0.7257, "step": 180000 }, { "epoch": 67.0, "eval_accuracy": 0.769045835438407, "eval_f1_macro": 0.44838573955584105, "eval_f1_micro": 0.769045835438407, "eval_loss": 0.6572328209877014, "eval_runtime": 509.8483, "eval_samples_per_second": 112.584, "eval_steps_per_second": 1.759, "learning_rate": 0.0001, "step": 180297 }, { "epoch": 67.07543664065403, "grad_norm": 2.3557684421539307, "learning_rate": 0.0001, "loss": 0.7238, "step": 180500 }, { "epoch": 67.26124117428465, "grad_norm": 2.1195363998413086, "learning_rate": 0.0001, "loss": 0.718, "step": 181000 }, { "epoch": 67.44704570791528, "grad_norm": 2.189373731613159, "learning_rate": 0.0001, "loss": 0.7148, "step": 181500 }, { "epoch": 67.6328502415459, "grad_norm": 2.0466713905334473, "learning_rate": 0.0001, "loss": 0.7295, "step": 182000 }, { "epoch": 67.81865477517651, "grad_norm": 2.2755849361419678, "learning_rate": 0.0001, "loss": 0.7257, "step": 182500 }, { "epoch": 68.0, "eval_accuracy": 0.7686799881535165, "eval_f1_macro": 0.4442341389313245, "eval_f1_micro": 0.7686799881535165, "eval_loss": 0.658860445022583, "eval_runtime": 508.6729, "eval_samples_per_second": 112.845, "eval_steps_per_second": 1.763, "learning_rate": 0.0001, "step": 182988 }, { "epoch": 68.00445930880713, "grad_norm": 2.214465618133545, "learning_rate": 0.0001, "loss": 0.7245, "step": 183000 }, { "epoch": 68.19026384243776, "grad_norm": 2.169665813446045, "learning_rate": 0.0001, "loss": 0.7131, "step": 183500 }, { "epoch": 68.37606837606837, "grad_norm": 2.195838451385498, "learning_rate": 0.0001, "loss": 0.7217, "step": 184000 }, { "epoch": 68.56187290969899, "grad_norm": 2.631565809249878, "learning_rate": 0.0001, "loss": 0.714, "step": 184500 }, { "epoch": 68.74767744332962, "grad_norm": 2.0778391361236572, "learning_rate": 0.0001, "loss": 0.7235, "step": 185000 }, { "epoch": 68.93348197696024, "grad_norm": 2.478853464126587, "learning_rate": 0.0001, "loss": 0.7299, "step": 185500 }, { "epoch": 69.0, "eval_accuracy": 0.7688542011463215, "eval_f1_macro": 0.43926108990057783, "eval_f1_micro": 0.7688542011463215, "eval_loss": 0.659292995929718, "eval_runtime": 515.951, "eval_samples_per_second": 111.253, "eval_steps_per_second": 1.739, "learning_rate": 0.0001, "step": 185679 }, { "epoch": 69.11928651059085, "grad_norm": 2.1239874362945557, "learning_rate": 0.0001, "loss": 0.7327, "step": 186000 }, { "epoch": 69.30509104422148, "grad_norm": 2.156639814376831, "learning_rate": 0.0001, "loss": 0.7178, "step": 186500 }, { "epoch": 69.4908955778521, "grad_norm": 2.094212532043457, "learning_rate": 0.0001, "loss": 0.7205, "step": 187000 }, { "epoch": 69.67670011148272, "grad_norm": 2.293548822402954, "learning_rate": 0.0001, "loss": 0.7275, "step": 187500 }, { "epoch": 69.86250464511335, "grad_norm": 2.42189359664917, "learning_rate": 0.0001, "loss": 0.7289, "step": 188000 }, { "epoch": 70.0, "eval_accuracy": 0.7679134509851745, "eval_f1_macro": 0.4357439201261406, "eval_f1_micro": 0.7679134509851745, "eval_loss": 0.658970832824707, "eval_runtime": 507.1382, "eval_samples_per_second": 113.186, "eval_steps_per_second": 1.769, "learning_rate": 0.0001, "step": 188370 }, { "epoch": 70.04830917874396, "grad_norm": 2.229328155517578, "learning_rate": 0.0001, "loss": 0.7243, "step": 188500 }, { "epoch": 70.23411371237458, "grad_norm": 2.238511323928833, "learning_rate": 0.0001, "loss": 0.7194, "step": 189000 }, { "epoch": 70.4199182460052, "grad_norm": 2.1684393882751465, "learning_rate": 0.0001, "loss": 0.7217, "step": 189500 }, { "epoch": 70.60572277963583, "grad_norm": 2.2486910820007324, "learning_rate": 0.0001, "loss": 0.7249, "step": 190000 }, { "epoch": 70.79152731326644, "grad_norm": 2.3368637561798096, "learning_rate": 0.0001, "loss": 0.7169, "step": 190500 }, { "epoch": 70.97733184689706, "grad_norm": 2.2436201572418213, "learning_rate": 0.0001, "loss": 0.7179, "step": 191000 }, { "epoch": 71.0, "eval_accuracy": 0.768244455671504, "eval_f1_macro": 0.4432082950234043, "eval_f1_micro": 0.768244455671504, "eval_loss": 0.6567061543464661, "eval_runtime": 510.955, "eval_samples_per_second": 112.341, "eval_steps_per_second": 1.756, "learning_rate": 0.0001, "step": 191061 }, { "epoch": 71.16313638052769, "grad_norm": 2.196202039718628, "learning_rate": 0.0001, "loss": 0.7218, "step": 191500 }, { "epoch": 71.3489409141583, "grad_norm": 1.8094837665557861, "learning_rate": 0.0001, "loss": 0.7271, "step": 192000 }, { "epoch": 71.53474544778892, "grad_norm": 2.3732166290283203, "learning_rate": 0.0001, "loss": 0.7135, "step": 192500 }, { "epoch": 71.72054998141955, "grad_norm": 2.5137126445770264, "learning_rate": 0.0001, "loss": 0.7195, "step": 193000 }, { "epoch": 71.90635451505017, "grad_norm": 1.83713960647583, "learning_rate": 0.0001, "loss": 0.7292, "step": 193500 }, { "epoch": 72.0, "eval_accuracy": 0.7681225065765405, "eval_f1_macro": 0.4369170898714745, "eval_f1_micro": 0.7681225065765405, "eval_loss": 0.65887850522995, "eval_runtime": 522.7505, "eval_samples_per_second": 109.806, "eval_steps_per_second": 1.716, "learning_rate": 0.0001, "step": 193752 }, { "epoch": 72.09215904868078, "grad_norm": 2.539991855621338, "learning_rate": 0.0001, "loss": 0.7165, "step": 194000 }, { "epoch": 72.27796358231141, "grad_norm": 2.3056955337524414, "learning_rate": 0.0001, "loss": 0.7267, "step": 194500 }, { "epoch": 72.46376811594203, "grad_norm": 2.263315200805664, "learning_rate": 0.0001, "loss": 0.7303, "step": 195000 }, { "epoch": 72.64957264957265, "grad_norm": 2.132422685623169, "learning_rate": 0.0001, "loss": 0.7237, "step": 195500 }, { "epoch": 72.83537718320326, "grad_norm": 2.034406900405884, "learning_rate": 0.0001, "loss": 0.7139, "step": 196000 }, { "epoch": 73.0, "eval_accuracy": 0.7675998675981255, "eval_f1_macro": 0.4436833314710758, "eval_f1_micro": 0.7675998675981255, "eval_loss": 0.6611541509628296, "eval_runtime": 506.2664, "eval_samples_per_second": 113.381, "eval_steps_per_second": 1.772, "learning_rate": 0.0001, "step": 196443 }, { "epoch": 73.0211817168339, "grad_norm": 1.860060691833496, "learning_rate": 0.0001, "loss": 0.7214, "step": 196500 }, { "epoch": 73.20698625046451, "grad_norm": 2.3336594104766846, "learning_rate": 0.0001, "loss": 0.7223, "step": 197000 }, { "epoch": 73.39279078409513, "grad_norm": 2.637392282485962, "learning_rate": 0.0001, "loss": 0.7284, "step": 197500 }, { "epoch": 73.57859531772576, "grad_norm": 1.9666019678115845, "learning_rate": 0.0001, "loss": 0.7141, "step": 198000 }, { "epoch": 73.76439985135637, "grad_norm": 2.4116933345794678, "learning_rate": 0.0001, "loss": 0.7158, "step": 198500 }, { "epoch": 73.95020438498699, "grad_norm": 2.568808078765869, "learning_rate": 0.0001, "loss": 0.7307, "step": 199000 }, { "epoch": 74.0, "eval_accuracy": 0.768488353861431, "eval_f1_macro": 0.44906341810873124, "eval_f1_micro": 0.768488353861431, "eval_loss": 0.6570971012115479, "eval_runtime": 519.8036, "eval_samples_per_second": 110.428, "eval_steps_per_second": 1.726, "learning_rate": 0.0001, "step": 199134 }, { "epoch": 74.13600891861762, "grad_norm": 2.4878387451171875, "learning_rate": 0.0001, "loss": 0.7172, "step": 199500 }, { "epoch": 74.32181345224824, "grad_norm": 2.029940605163574, "learning_rate": 0.0001, "loss": 0.713, "step": 200000 }, { "epoch": 74.50761798587885, "grad_norm": 1.8782607316970825, "learning_rate": 0.0001, "loss": 0.7148, "step": 200500 }, { "epoch": 74.69342251950948, "grad_norm": 2.0745551586151123, "learning_rate": 0.0001, "loss": 0.7249, "step": 201000 }, { "epoch": 74.8792270531401, "grad_norm": 2.2313883304595947, "learning_rate": 0.0001, "loss": 0.7238, "step": 201500 }, { "epoch": 75.0, "eval_accuracy": 0.768174770474382, "eval_f1_macro": 0.44439364748025234, "eval_f1_micro": 0.768174770474382, "eval_loss": 0.6557245254516602, "eval_runtime": 535.3584, "eval_samples_per_second": 107.22, "eval_steps_per_second": 1.676, "learning_rate": 0.0001, "step": 201825 }, { "epoch": 75.06503158677071, "grad_norm": 2.656343936920166, "learning_rate": 0.0001, "loss": 0.7334, "step": 202000 }, { "epoch": 75.25083612040133, "grad_norm": 2.0540146827697754, "learning_rate": 0.0001, "loss": 0.7186, "step": 202500 }, { "epoch": 75.43664065403196, "grad_norm": 1.8651381731033325, "learning_rate": 0.0001, "loss": 0.723, "step": 203000 }, { "epoch": 75.62244518766258, "grad_norm": 1.9370218515396118, "learning_rate": 0.0001, "loss": 0.7211, "step": 203500 }, { "epoch": 75.8082497212932, "grad_norm": 1.9121958017349243, "learning_rate": 0.0001, "loss": 0.7205, "step": 204000 }, { "epoch": 75.99405425492382, "grad_norm": 2.61309552192688, "learning_rate": 0.0001, "loss": 0.7257, "step": 204500 }, { "epoch": 76.0, "eval_accuracy": 0.7683489834671869, "eval_f1_macro": 0.4478607285233603, "eval_f1_micro": 0.7683489834671869, "eval_loss": 0.658838152885437, "eval_runtime": 499.8427, "eval_samples_per_second": 114.838, "eval_steps_per_second": 1.795, "learning_rate": 0.0001, "step": 204516 }, { "epoch": 76.17985878855444, "grad_norm": 2.235535144805908, "learning_rate": 0.0001, "loss": 0.7205, "step": 205000 }, { "epoch": 76.36566332218506, "grad_norm": 1.883389949798584, "learning_rate": 0.0001, "loss": 0.7299, "step": 205500 }, { "epoch": 76.55146785581569, "grad_norm": 2.6287364959716797, "learning_rate": 0.0001, "loss": 0.7118, "step": 206000 }, { "epoch": 76.7372723894463, "grad_norm": 2.0496041774749756, "learning_rate": 0.0001, "loss": 0.72, "step": 206500 }, { "epoch": 76.92307692307692, "grad_norm": 2.1624155044555664, "learning_rate": 0.0001, "loss": 0.7252, "step": 207000 }, { "epoch": 77.0, "eval_accuracy": 0.7686277242556749, "eval_f1_macro": 0.44887316801028765, "eval_f1_micro": 0.7686277242556749, "eval_loss": 0.6572225093841553, "eval_runtime": 492.6745, "eval_samples_per_second": 116.509, "eval_steps_per_second": 1.821, "learning_rate": 0.0001, "step": 207207 }, { "epoch": 77.10888145670755, "grad_norm": 2.232707977294922, "learning_rate": 0.0001, "loss": 0.7144, "step": 207500 }, { "epoch": 77.29468599033817, "grad_norm": 2.15569806098938, "learning_rate": 0.0001, "loss": 0.7182, "step": 208000 }, { "epoch": 77.48049052396878, "grad_norm": 2.2068002223968506, "learning_rate": 0.0001, "loss": 0.7128, "step": 208500 }, { "epoch": 77.6662950575994, "grad_norm": 2.9470760822296143, "learning_rate": 0.0001, "loss": 0.7247, "step": 209000 }, { "epoch": 77.85209959123003, "grad_norm": 2.5690219402313232, "learning_rate": 0.0001, "loss": 0.7231, "step": 209500 }, { "epoch": 78.0, "eval_accuracy": 0.768767094649919, "eval_f1_macro": 0.4439839848601264, "eval_f1_micro": 0.768767094649919, "eval_loss": 0.6562930941581726, "eval_runtime": 495.5045, "eval_samples_per_second": 115.844, "eval_steps_per_second": 1.81, "learning_rate": 0.0001, "step": 209898 }, { "epoch": 78.03790412486065, "grad_norm": 1.879889726638794, "learning_rate": 0.0001, "loss": 0.7258, "step": 210000 }, { "epoch": 78.22370865849126, "grad_norm": 1.8929120302200317, "learning_rate": 0.0001, "loss": 0.7188, "step": 210500 }, { "epoch": 78.40951319212189, "grad_norm": 2.2578604221343994, "learning_rate": 0.0001, "loss": 0.7239, "step": 211000 }, { "epoch": 78.59531772575251, "grad_norm": 2.1204187870025635, "learning_rate": 0.0001, "loss": 0.7255, "step": 211500 }, { "epoch": 78.78112225938312, "grad_norm": 1.968096375465393, "learning_rate": 0.0001, "loss": 0.7206, "step": 212000 }, { "epoch": 78.96692679301376, "grad_norm": 2.7695274353027344, "learning_rate": 0.0001, "loss": 0.7207, "step": 212500 }, { "epoch": 79.0, "eval_accuracy": 0.76810508527726, "eval_f1_macro": 0.4379298766193662, "eval_f1_micro": 0.76810508527726, "eval_loss": 0.6564787030220032, "eval_runtime": 498.2393, "eval_samples_per_second": 115.208, "eval_steps_per_second": 1.8, "learning_rate": 0.0001, "step": 212589 }, { "epoch": 79.15273132664437, "grad_norm": 2.0273005962371826, "learning_rate": 0.0001, "loss": 0.7177, "step": 213000 }, { "epoch": 79.33853586027499, "grad_norm": 2.40535306930542, "learning_rate": 0.0001, "loss": 0.7288, "step": 213500 }, { "epoch": 79.52434039390562, "grad_norm": 2.329434871673584, "learning_rate": 0.0001, "loss": 0.7139, "step": 214000 }, { "epoch": 79.71014492753623, "grad_norm": 2.750331401824951, "learning_rate": 0.0001, "loss": 0.7244, "step": 214500 }, { "epoch": 79.89594946116685, "grad_norm": 2.482513427734375, "learning_rate": 0.0001, "loss": 0.7179, "step": 215000 }, { "epoch": 80.0, "eval_accuracy": 0.768383826065748, "eval_f1_macro": 0.4460529321244195, "eval_f1_micro": 0.768383826065748, "eval_loss": 0.661143958568573, "eval_runtime": 505.2043, "eval_samples_per_second": 113.619, "eval_steps_per_second": 1.776, "learning_rate": 0.0001, "step": 215280 }, { "epoch": 80.08175399479747, "grad_norm": 2.6754283905029297, "learning_rate": 0.0001, "loss": 0.7247, "step": 215500 }, { "epoch": 80.2675585284281, "grad_norm": 3.016185760498047, "learning_rate": 0.0001, "loss": 0.7194, "step": 216000 }, { "epoch": 80.45336306205871, "grad_norm": 2.425431489944458, "learning_rate": 0.0001, "loss": 0.7197, "step": 216500 }, { "epoch": 80.63916759568933, "grad_norm": 2.2331862449645996, "learning_rate": 0.0001, "loss": 0.7217, "step": 217000 }, { "epoch": 80.82497212931996, "grad_norm": 2.3233511447906494, "learning_rate": 0.0001, "loss": 0.7275, "step": 217500 }, { "epoch": 81.0, "eval_accuracy": 0.768941307642724, "eval_f1_macro": 0.44750591322776384, "eval_f1_micro": 0.768941307642724, "eval_loss": 0.660437285900116, "eval_runtime": 497.6996, "eval_samples_per_second": 115.333, "eval_steps_per_second": 1.802, "learning_rate": 0.0001, "step": 217971 }, { "epoch": 81.01077666295058, "grad_norm": 2.5221219062805176, "learning_rate": 1e-05, "loss": 0.7201, "step": 218000 }, { "epoch": 81.19658119658119, "grad_norm": 2.2925214767456055, "learning_rate": 1e-05, "loss": 0.7095, "step": 218500 }, { "epoch": 81.38238573021182, "grad_norm": 2.733447313308716, "learning_rate": 1e-05, "loss": 0.7125, "step": 219000 }, { "epoch": 81.56819026384244, "grad_norm": 2.3250811100006104, "learning_rate": 1e-05, "loss": 0.7193, "step": 219500 }, { "epoch": 81.75399479747306, "grad_norm": 2.4225757122039795, "learning_rate": 1e-05, "loss": 0.7081, "step": 220000 }, { "epoch": 81.93979933110369, "grad_norm": 2.1880016326904297, "learning_rate": 1e-05, "loss": 0.7101, "step": 220500 }, { "epoch": 82.0, "eval_accuracy": 0.7705614884758105, "eval_f1_macro": 0.4526720456188751, "eval_f1_micro": 0.7705614884758105, "eval_loss": 0.6531779766082764, "eval_runtime": 493.1117, "eval_samples_per_second": 116.406, "eval_steps_per_second": 1.819, "learning_rate": 1e-05, "step": 220662 }, { "epoch": 82.1256038647343, "grad_norm": 2.038079261779785, "learning_rate": 1e-05, "loss": 0.7052, "step": 221000 }, { "epoch": 82.31140839836492, "grad_norm": 2.8099365234375, "learning_rate": 1e-05, "loss": 0.7054, "step": 221500 }, { "epoch": 82.49721293199553, "grad_norm": 2.683091878890991, "learning_rate": 1e-05, "loss": 0.7013, "step": 222000 }, { "epoch": 82.68301746562616, "grad_norm": 2.0887763500213623, "learning_rate": 1e-05, "loss": 0.7024, "step": 222500 }, { "epoch": 82.86882199925678, "grad_norm": 2.346670627593994, "learning_rate": 1e-05, "loss": 0.7063, "step": 223000 }, { "epoch": 83.0, "eval_accuracy": 0.77019564119092, "eval_f1_macro": 0.4489367718812771, "eval_f1_micro": 0.77019564119092, "eval_loss": 0.6532895565032959, "eval_runtime": 492.0028, "eval_samples_per_second": 116.668, "eval_steps_per_second": 1.823, "learning_rate": 1e-05, "step": 223353 }, { "epoch": 83.0546265328874, "grad_norm": 2.1046745777130127, "learning_rate": 1e-05, "loss": 0.7073, "step": 223500 }, { "epoch": 83.24043106651803, "grad_norm": 1.7634518146514893, "learning_rate": 1e-05, "loss": 0.7081, "step": 224000 }, { "epoch": 83.42623560014864, "grad_norm": 2.451301097869873, "learning_rate": 1e-05, "loss": 0.7151, "step": 224500 }, { "epoch": 83.61204013377926, "grad_norm": 2.347801923751831, "learning_rate": 1e-05, "loss": 0.6973, "step": 225000 }, { "epoch": 83.79784466740989, "grad_norm": 2.34899640083313, "learning_rate": 1e-05, "loss": 0.7013, "step": 225500 }, { "epoch": 83.9836492010405, "grad_norm": 2.269644260406494, "learning_rate": 1e-05, "loss": 0.7067, "step": 226000 }, { "epoch": 84.0, "eval_accuracy": 0.7705266458772495, "eval_f1_macro": 0.45139096558153424, "eval_f1_micro": 0.7705266458772495, "eval_loss": 0.6505005359649658, "eval_runtime": 486.6669, "eval_samples_per_second": 117.947, "eval_steps_per_second": 1.843, "learning_rate": 1e-05, "step": 226044 }, { "epoch": 84.16945373467112, "grad_norm": 2.6176042556762695, "learning_rate": 1e-05, "loss": 0.6912, "step": 226500 }, { "epoch": 84.35525826830175, "grad_norm": 2.330493211746216, "learning_rate": 1e-05, "loss": 0.7045, "step": 227000 }, { "epoch": 84.54106280193237, "grad_norm": 2.351470470428467, "learning_rate": 1e-05, "loss": 0.7049, "step": 227500 }, { "epoch": 84.72686733556299, "grad_norm": 2.444443702697754, "learning_rate": 1e-05, "loss": 0.7122, "step": 228000 }, { "epoch": 84.9126718691936, "grad_norm": 2.2119200229644775, "learning_rate": 1e-05, "loss": 0.707, "step": 228500 }, { "epoch": 85.0, "eval_accuracy": 0.7708053866657375, "eval_f1_macro": 0.4565625671001629, "eval_f1_micro": 0.7708053866657375, "eval_loss": 0.6501905918121338, "eval_runtime": 485.2188, "eval_samples_per_second": 118.299, "eval_steps_per_second": 1.849, "learning_rate": 1e-05, "step": 228735 }, { "epoch": 85.09847640282423, "grad_norm": 2.385796546936035, "learning_rate": 1e-05, "loss": 0.6966, "step": 229000 }, { "epoch": 85.28428093645485, "grad_norm": 2.420588254928589, "learning_rate": 1e-05, "loss": 0.705, "step": 229500 }, { "epoch": 85.47008547008546, "grad_norm": 2.1321732997894287, "learning_rate": 1e-05, "loss": 0.6976, "step": 230000 }, { "epoch": 85.6558900037161, "grad_norm": 2.215148448944092, "learning_rate": 1e-05, "loss": 0.6998, "step": 230500 }, { "epoch": 85.84169453734671, "grad_norm": 2.643437385559082, "learning_rate": 1e-05, "loss": 0.6944, "step": 231000 }, { "epoch": 86.0, "eval_accuracy": 0.7707182801693351, "eval_f1_macro": 0.4559124472677571, "eval_f1_micro": 0.7707182801693351, "eval_loss": 0.6507149338722229, "eval_runtime": 493.7179, "eval_samples_per_second": 116.263, "eval_steps_per_second": 1.817, "learning_rate": 1e-05, "step": 231426 }, { "epoch": 86.02749907097733, "grad_norm": 2.8356385231018066, "learning_rate": 1e-05, "loss": 0.6966, "step": 231500 }, { "epoch": 86.21330360460796, "grad_norm": 2.272819757461548, "learning_rate": 1e-05, "loss": 0.7011, "step": 232000 }, { "epoch": 86.39910813823857, "grad_norm": 2.5435869693756104, "learning_rate": 1e-05, "loss": 0.7019, "step": 232500 }, { "epoch": 86.58491267186919, "grad_norm": 2.345691204071045, "learning_rate": 1e-05, "loss": 0.6964, "step": 233000 }, { "epoch": 86.77071720549982, "grad_norm": 1.9666670560836792, "learning_rate": 1e-05, "loss": 0.6989, "step": 233500 }, { "epoch": 86.95652173913044, "grad_norm": 2.101868152618408, "learning_rate": 1e-05, "loss": 0.6958, "step": 234000 }, { "epoch": 87.0, "eval_accuracy": 0.770927335760701, "eval_f1_macro": 0.45838476700319497, "eval_f1_micro": 0.770927335760701, "eval_loss": 0.6484472751617432, "eval_runtime": 497.0614, "eval_samples_per_second": 115.481, "eval_steps_per_second": 1.805, "learning_rate": 1e-05, "step": 234117 }, { "epoch": 87.14232627276105, "grad_norm": 3.2775235176086426, "learning_rate": 1e-05, "loss": 0.6957, "step": 234500 }, { "epoch": 87.32813080639167, "grad_norm": 2.4724538326263428, "learning_rate": 1e-05, "loss": 0.6963, "step": 235000 }, { "epoch": 87.5139353400223, "grad_norm": 2.9358458518981934, "learning_rate": 1e-05, "loss": 0.6987, "step": 235500 }, { "epoch": 87.69973987365292, "grad_norm": 2.5557122230529785, "learning_rate": 1e-05, "loss": 0.708, "step": 236000 }, { "epoch": 87.88554440728353, "grad_norm": 2.3544118404388428, "learning_rate": 1e-05, "loss": 0.6967, "step": 236500 }, { "epoch": 88.0, "eval_accuracy": 0.77054406717653, "eval_f1_macro": 0.4569340367642959, "eval_f1_micro": 0.77054406717653, "eval_loss": 0.6496042013168335, "eval_runtime": 501.5965, "eval_samples_per_second": 114.437, "eval_steps_per_second": 1.788, "learning_rate": 1e-05, "step": 236808 }, { "epoch": 88.07134894091416, "grad_norm": 2.06174898147583, "learning_rate": 1e-05, "loss": 0.6987, "step": 237000 }, { "epoch": 88.25715347454478, "grad_norm": 1.922559380531311, "learning_rate": 1e-05, "loss": 0.7057, "step": 237500 }, { "epoch": 88.4429580081754, "grad_norm": 2.603978157043457, "learning_rate": 1e-05, "loss": 0.7019, "step": 238000 }, { "epoch": 88.62876254180603, "grad_norm": 1.859614372253418, "learning_rate": 1e-05, "loss": 0.6921, "step": 238500 }, { "epoch": 88.81456707543664, "grad_norm": 2.208031177520752, "learning_rate": 1e-05, "loss": 0.698, "step": 239000 }, { "epoch": 89.0, "eval_accuracy": 0.7713977108412745, "eval_f1_macro": 0.45601319436503734, "eval_f1_micro": 0.7713977108412745, "eval_loss": 0.6486304402351379, "eval_runtime": 503.0751, "eval_samples_per_second": 114.1, "eval_steps_per_second": 1.783, "learning_rate": 1e-05, "step": 239499 }, { "epoch": 89.00037160906726, "grad_norm": 2.6880383491516113, "learning_rate": 1e-05, "loss": 0.6938, "step": 239500 }, { "epoch": 89.18617614269789, "grad_norm": 2.5231385231018066, "learning_rate": 1e-05, "loss": 0.7063, "step": 240000 }, { "epoch": 89.3719806763285, "grad_norm": 2.500443935394287, "learning_rate": 1e-05, "loss": 0.6931, "step": 240500 }, { "epoch": 89.55778520995912, "grad_norm": 2.0993025302886963, "learning_rate": 1e-05, "loss": 0.7048, "step": 241000 }, { "epoch": 89.74358974358974, "grad_norm": 2.4644758701324463, "learning_rate": 1e-05, "loss": 0.7037, "step": 241500 }, { "epoch": 89.92939427722037, "grad_norm": 2.391343116760254, "learning_rate": 1e-05, "loss": 0.6966, "step": 242000 }, { "epoch": 90.0, "eval_accuracy": 0.7711886552499085, "eval_f1_macro": 0.45742417795749246, "eval_f1_micro": 0.7711886552499085, "eval_loss": 0.6490767598152161, "eval_runtime": 499.9247, "eval_samples_per_second": 114.819, "eval_steps_per_second": 1.794, "learning_rate": 1e-05, "step": 242190 }, { "epoch": 90.11519881085098, "grad_norm": 2.728985071182251, "learning_rate": 1e-05, "loss": 0.6981, "step": 242500 }, { "epoch": 90.3010033444816, "grad_norm": 2.9933202266693115, "learning_rate": 1e-05, "loss": 0.6972, "step": 243000 }, { "epoch": 90.48680787811223, "grad_norm": 2.6327061653137207, "learning_rate": 1e-05, "loss": 0.6901, "step": 243500 }, { "epoch": 90.67261241174285, "grad_norm": 2.532104969024658, "learning_rate": 1e-05, "loss": 0.6879, "step": 244000 }, { "epoch": 90.85841694537346, "grad_norm": 2.649080753326416, "learning_rate": 1e-05, "loss": 0.7017, "step": 244500 }, { "epoch": 91.0, "eval_accuracy": 0.7704221180815666, "eval_f1_macro": 0.45182909085509243, "eval_f1_micro": 0.7704221180815666, "eval_loss": 0.6481940746307373, "eval_runtime": 494.0364, "eval_samples_per_second": 116.188, "eval_steps_per_second": 1.816, "learning_rate": 1e-05, "step": 244881 }, { "epoch": 91.0442214790041, "grad_norm": 2.7153379917144775, "learning_rate": 1e-05, "loss": 0.6977, "step": 245000 }, { "epoch": 91.23002601263471, "grad_norm": 2.7882251739501953, "learning_rate": 1e-05, "loss": 0.6979, "step": 245500 }, { "epoch": 91.41583054626533, "grad_norm": 2.632366418838501, "learning_rate": 1e-05, "loss": 0.6984, "step": 246000 }, { "epoch": 91.60163507989596, "grad_norm": 2.7723135948181152, "learning_rate": 1e-05, "loss": 0.6972, "step": 246500 }, { "epoch": 91.78743961352657, "grad_norm": 2.384683132171631, "learning_rate": 1e-05, "loss": 0.6989, "step": 247000 }, { "epoch": 91.97324414715719, "grad_norm": 2.210071086883545, "learning_rate": 1e-05, "loss": 0.7, "step": 247500 }, { "epoch": 92.0, "eval_accuracy": 0.7715545025347991, "eval_f1_macro": 0.45503456574154166, "eval_f1_micro": 0.7715545025347991, "eval_loss": 0.6477252244949341, "eval_runtime": 495.0222, "eval_samples_per_second": 115.956, "eval_steps_per_second": 1.812, "learning_rate": 1e-05, "step": 247572 }, { "epoch": 92.1590486807878, "grad_norm": 2.723433494567871, "learning_rate": 1e-05, "loss": 0.6915, "step": 248000 }, { "epoch": 92.34485321441844, "grad_norm": 2.203484535217285, "learning_rate": 1e-05, "loss": 0.6938, "step": 248500 }, { "epoch": 92.53065774804905, "grad_norm": 2.4244284629821777, "learning_rate": 1e-05, "loss": 0.6994, "step": 249000 }, { "epoch": 92.71646228167967, "grad_norm": 2.495445728302002, "learning_rate": 1e-05, "loss": 0.7034, "step": 249500 }, { "epoch": 92.9022668153103, "grad_norm": 2.1715991497039795, "learning_rate": 1e-05, "loss": 0.7, "step": 250000 }, { "epoch": 93.0, "eval_accuracy": 0.771206076549189, "eval_f1_macro": 0.4518083431267937, "eval_f1_micro": 0.771206076549189, "eval_loss": 0.6489835381507874, "eval_runtime": 487.2388, "eval_samples_per_second": 117.809, "eval_steps_per_second": 1.841, "learning_rate": 1e-05, "step": 250263 }, { "epoch": 93.08807134894091, "grad_norm": 2.252865791320801, "learning_rate": 1e-05, "loss": 0.6916, "step": 250500 }, { "epoch": 93.27387588257153, "grad_norm": 2.484537363052368, "learning_rate": 1e-05, "loss": 0.6953, "step": 251000 }, { "epoch": 93.45968041620216, "grad_norm": 1.7372355461120605, "learning_rate": 1e-05, "loss": 0.6918, "step": 251500 }, { "epoch": 93.64548494983278, "grad_norm": 2.283757448196411, "learning_rate": 1e-05, "loss": 0.7009, "step": 252000 }, { "epoch": 93.8312894834634, "grad_norm": 2.009822130203247, "learning_rate": 1e-05, "loss": 0.7049, "step": 252500 }, { "epoch": 94.0, "eval_accuracy": 0.770753122767896, "eval_f1_macro": 0.45105746851856937, "eval_f1_micro": 0.770753122767896, "eval_loss": 0.6485304832458496, "eval_runtime": 500.9843, "eval_samples_per_second": 114.576, "eval_steps_per_second": 1.79, "learning_rate": 1e-05, "step": 252954 }, { "epoch": 94.01709401709402, "grad_norm": 2.0452888011932373, "learning_rate": 1e-05, "loss": 0.6917, "step": 253000 }, { "epoch": 94.20289855072464, "grad_norm": 2.5866355895996094, "learning_rate": 1e-05, "loss": 0.7064, "step": 253500 }, { "epoch": 94.38870308435526, "grad_norm": 2.3897805213928223, "learning_rate": 1e-05, "loss": 0.6944, "step": 254000 }, { "epoch": 94.57450761798587, "grad_norm": 2.8837811946868896, "learning_rate": 1e-05, "loss": 0.6925, "step": 254500 }, { "epoch": 94.7603121516165, "grad_norm": 2.44002628326416, "learning_rate": 1e-05, "loss": 0.6989, "step": 255000 }, { "epoch": 94.94611668524712, "grad_norm": 3.1637446880340576, "learning_rate": 1e-05, "loss": 0.6949, "step": 255500 }, { "epoch": 95.0, "eval_accuracy": 0.771659030330482, "eval_f1_macro": 0.45671492126995916, "eval_f1_micro": 0.771659030330482, "eval_loss": 0.647895336151123, "eval_runtime": 490.9837, "eval_samples_per_second": 116.91, "eval_steps_per_second": 1.827, "learning_rate": 1e-05, "step": 255645 }, { "epoch": 95.13192121887774, "grad_norm": 2.9006567001342773, "learning_rate": 1e-05, "loss": 0.6916, "step": 256000 }, { "epoch": 95.31772575250837, "grad_norm": 2.7343852519989014, "learning_rate": 1e-05, "loss": 0.6968, "step": 256500 }, { "epoch": 95.50353028613898, "grad_norm": 2.1861000061035156, "learning_rate": 1e-05, "loss": 0.6921, "step": 257000 }, { "epoch": 95.6893348197696, "grad_norm": 2.3142426013946533, "learning_rate": 1e-05, "loss": 0.7016, "step": 257500 }, { "epoch": 95.87513935340023, "grad_norm": 3.253568649291992, "learning_rate": 1e-05, "loss": 0.6998, "step": 258000 }, { "epoch": 96.0, "eval_accuracy": 0.7715022386369575, "eval_f1_macro": 0.4596959338122155, "eval_f1_micro": 0.7715022386369575, "eval_loss": 0.6472702622413635, "eval_runtime": 484.103, "eval_samples_per_second": 118.572, "eval_steps_per_second": 1.853, "learning_rate": 1e-05, "step": 258336 }, { "epoch": 96.06094388703085, "grad_norm": 2.4987857341766357, "learning_rate": 1e-05, "loss": 0.6887, "step": 258500 }, { "epoch": 96.24674842066146, "grad_norm": 3.320939064025879, "learning_rate": 1e-05, "loss": 0.6984, "step": 259000 }, { "epoch": 96.43255295429208, "grad_norm": 2.3504180908203125, "learning_rate": 1e-05, "loss": 0.6984, "step": 259500 }, { "epoch": 96.61835748792271, "grad_norm": 2.383805513381958, "learning_rate": 1e-05, "loss": 0.6963, "step": 260000 }, { "epoch": 96.80416202155332, "grad_norm": 2.4699482917785645, "learning_rate": 1e-05, "loss": 0.6847, "step": 260500 }, { "epoch": 96.98996655518394, "grad_norm": 2.491687297821045, "learning_rate": 1e-05, "loss": 0.6968, "step": 261000 }, { "epoch": 97.0, "eval_accuracy": 0.7713977108412745, "eval_f1_macro": 0.4625401473178366, "eval_f1_micro": 0.7713977108412745, "eval_loss": 0.6460831165313721, "eval_runtime": 491.1356, "eval_samples_per_second": 116.874, "eval_steps_per_second": 1.826, "learning_rate": 1e-05, "step": 261027 }, { "epoch": 97.17577108881457, "grad_norm": 1.9844508171081543, "learning_rate": 1e-05, "loss": 0.7004, "step": 261500 }, { "epoch": 97.36157562244519, "grad_norm": 2.3233275413513184, "learning_rate": 1e-05, "loss": 0.6867, "step": 262000 }, { "epoch": 97.5473801560758, "grad_norm": 2.318446397781372, "learning_rate": 1e-05, "loss": 0.6992, "step": 262500 }, { "epoch": 97.73318468970643, "grad_norm": 2.4279417991638184, "learning_rate": 1e-05, "loss": 0.6933, "step": 263000 }, { "epoch": 97.91898922333705, "grad_norm": 2.6880953311920166, "learning_rate": 1e-05, "loss": 0.7055, "step": 263500 }, { "epoch": 98.0, "eval_accuracy": 0.7722165119074581, "eval_f1_macro": 0.45892155771298887, "eval_f1_micro": 0.7722165119074581, "eval_loss": 0.6463102698326111, "eval_runtime": 504.464, "eval_samples_per_second": 113.786, "eval_steps_per_second": 1.778, "learning_rate": 1e-05, "step": 263718 }, { "epoch": 98.10479375696767, "grad_norm": 2.778778553009033, "learning_rate": 1e-05, "loss": 0.6994, "step": 264000 }, { "epoch": 98.2905982905983, "grad_norm": 2.2124552726745605, "learning_rate": 1e-05, "loss": 0.6842, "step": 264500 }, { "epoch": 98.47640282422891, "grad_norm": 3.063041925430298, "learning_rate": 1e-05, "loss": 0.696, "step": 265000 }, { "epoch": 98.66220735785953, "grad_norm": 2.1842856407165527, "learning_rate": 1e-05, "loss": 0.6972, "step": 265500 }, { "epoch": 98.84801189149015, "grad_norm": 3.16538143157959, "learning_rate": 1e-05, "loss": 0.6931, "step": 266000 }, { "epoch": 99.0, "eval_accuracy": 0.77089249316214, "eval_f1_macro": 0.4549241891767946, "eval_f1_micro": 0.77089249316214, "eval_loss": 0.646852433681488, "eval_runtime": 498.8103, "eval_samples_per_second": 115.076, "eval_steps_per_second": 1.798, "learning_rate": 1e-05, "step": 266409 }, { "epoch": 99.03381642512078, "grad_norm": 2.8139493465423584, "learning_rate": 1e-05, "loss": 0.6938, "step": 266500 }, { "epoch": 99.21962095875139, "grad_norm": 2.5012643337249756, "learning_rate": 1e-05, "loss": 0.6978, "step": 267000 }, { "epoch": 99.40542549238201, "grad_norm": 2.0635502338409424, "learning_rate": 1e-05, "loss": 0.702, "step": 267500 }, { "epoch": 99.59123002601264, "grad_norm": 1.9686059951782227, "learning_rate": 1e-05, "loss": 0.6867, "step": 268000 }, { "epoch": 99.77703455964325, "grad_norm": 2.32633900642395, "learning_rate": 1e-05, "loss": 0.6916, "step": 268500 }, { "epoch": 99.96283909327387, "grad_norm": 2.2013280391693115, "learning_rate": 1e-05, "loss": 0.6872, "step": 269000 }, { "epoch": 100.0, "eval_accuracy": 0.7723384610024215, "eval_f1_macro": 0.45970146016594643, "eval_f1_micro": 0.7723384610024215, "eval_loss": 0.6456441879272461, "eval_runtime": 523.4938, "eval_samples_per_second": 109.65, "eval_steps_per_second": 1.713, "learning_rate": 1e-05, "step": 269100 }, { "epoch": 100.1486436269045, "grad_norm": 2.9276328086853027, "learning_rate": 1e-05, "loss": 0.6936, "step": 269500 }, { "epoch": 100.33444816053512, "grad_norm": 2.9882853031158447, "learning_rate": 1e-05, "loss": 0.6916, "step": 270000 }, { "epoch": 100.52025269416573, "grad_norm": 2.50858998298645, "learning_rate": 1e-05, "loss": 0.7004, "step": 270500 }, { "epoch": 100.70605722779636, "grad_norm": 2.6219215393066406, "learning_rate": 1e-05, "loss": 0.7033, "step": 271000 }, { "epoch": 100.89186176142698, "grad_norm": 2.8792617321014404, "learning_rate": 1e-05, "loss": 0.6822, "step": 271500 }, { "epoch": 101.0, "eval_accuracy": 0.7717461368268845, "eval_f1_macro": 0.4573593202819609, "eval_f1_micro": 0.7717461368268845, "eval_loss": 0.6469387412071228, "eval_runtime": 517.8794, "eval_samples_per_second": 110.839, "eval_steps_per_second": 1.732, "learning_rate": 1e-05, "step": 271791 }, { "epoch": 101.0776662950576, "grad_norm": 2.3658642768859863, "learning_rate": 1e-05, "loss": 0.707, "step": 272000 }, { "epoch": 101.26347082868821, "grad_norm": 3.0172219276428223, "learning_rate": 1e-05, "loss": 0.6982, "step": 272500 }, { "epoch": 101.44927536231884, "grad_norm": 3.0194907188415527, "learning_rate": 1e-05, "loss": 0.6899, "step": 273000 }, { "epoch": 101.63507989594946, "grad_norm": 2.587480306625366, "learning_rate": 1e-05, "loss": 0.6917, "step": 273500 }, { "epoch": 101.82088442958008, "grad_norm": 2.457369089126587, "learning_rate": 1e-05, "loss": 0.6875, "step": 274000 }, { "epoch": 102.0, "eval_accuracy": 0.7717809794254455, "eval_f1_macro": 0.4593480600391769, "eval_f1_micro": 0.7717809794254455, "eval_loss": 0.646738588809967, "eval_runtime": 515.0733, "eval_samples_per_second": 111.442, "eval_steps_per_second": 1.741, "learning_rate": 1e-05, "step": 274482 }, { "epoch": 102.0066889632107, "grad_norm": 3.1219208240509033, "learning_rate": 1e-05, "loss": 0.6994, "step": 274500 }, { "epoch": 102.19249349684132, "grad_norm": 2.158254623413086, "learning_rate": 1e-05, "loss": 0.6944, "step": 275000 }, { "epoch": 102.37829803047194, "grad_norm": 2.474193811416626, "learning_rate": 1e-05, "loss": 0.6939, "step": 275500 }, { "epoch": 102.56410256410257, "grad_norm": 2.223621129989624, "learning_rate": 1e-05, "loss": 0.7028, "step": 276000 }, { "epoch": 102.74990709773319, "grad_norm": 2.8731536865234375, "learning_rate": 1e-05, "loss": 0.6972, "step": 276500 }, { "epoch": 102.9357116313638, "grad_norm": 1.9848276376724243, "learning_rate": 1e-05, "loss": 0.6983, "step": 277000 }, { "epoch": 103.0, "eval_accuracy": 0.7723036184038605, "eval_f1_macro": 0.4576617106244536, "eval_f1_micro": 0.7723036184038605, "eval_loss": 0.6467755436897278, "eval_runtime": 512.6943, "eval_samples_per_second": 111.959, "eval_steps_per_second": 1.75, "learning_rate": 1e-05, "step": 277173 }, { "epoch": 103.12151616499443, "grad_norm": 2.9539742469787598, "learning_rate": 1e-05, "loss": 0.6854, "step": 277500 }, { "epoch": 103.30732069862505, "grad_norm": 3.1808767318725586, "learning_rate": 1e-05, "loss": 0.6903, "step": 278000 }, { "epoch": 103.49312523225566, "grad_norm": 2.223482131958008, "learning_rate": 1e-05, "loss": 0.6962, "step": 278500 }, { "epoch": 103.67892976588628, "grad_norm": 2.4267632961273193, "learning_rate": 1e-05, "loss": 0.6889, "step": 279000 }, { "epoch": 103.86473429951691, "grad_norm": 2.554532527923584, "learning_rate": 1e-05, "loss": 0.6902, "step": 279500 }, { "epoch": 104.0, "eval_accuracy": 0.7725475165937876, "eval_f1_macro": 0.4578893476938316, "eval_f1_micro": 0.7725475165937876, "eval_loss": 0.6456966400146484, "eval_runtime": 510.0416, "eval_samples_per_second": 112.542, "eval_steps_per_second": 1.759, "learning_rate": 1e-05, "step": 279864 }, { "epoch": 104.05053883314753, "grad_norm": 2.5456807613372803, "learning_rate": 1e-05, "loss": 0.6956, "step": 280000 }, { "epoch": 104.23634336677814, "grad_norm": 2.9953627586364746, "learning_rate": 1e-05, "loss": 0.7025, "step": 280500 }, { "epoch": 104.42214790040877, "grad_norm": 2.574535369873047, "learning_rate": 1e-05, "loss": 0.6825, "step": 281000 }, { "epoch": 104.60795243403939, "grad_norm": 3.287419557571411, "learning_rate": 1e-05, "loss": 0.6953, "step": 281500 }, { "epoch": 104.79375696767, "grad_norm": 2.3188724517822266, "learning_rate": 1e-05, "loss": 0.6979, "step": 282000 }, { "epoch": 104.97956150130064, "grad_norm": 2.4571897983551025, "learning_rate": 1e-05, "loss": 0.6876, "step": 282500 }, { "epoch": 105.0, "eval_accuracy": 0.7719377711189701, "eval_f1_macro": 0.45556962818046953, "eval_f1_micro": 0.7719377711189701, "eval_loss": 0.6455578804016113, "eval_runtime": 507.9867, "eval_samples_per_second": 112.997, "eval_steps_per_second": 1.766, "learning_rate": 1e-05, "step": 282555 }, { "epoch": 105.16536603493125, "grad_norm": 2.7945613861083984, "learning_rate": 1e-05, "loss": 0.6903, "step": 283000 }, { "epoch": 105.35117056856187, "grad_norm": 2.2913684844970703, "learning_rate": 1e-05, "loss": 0.7094, "step": 283500 }, { "epoch": 105.5369751021925, "grad_norm": 2.745809555053711, "learning_rate": 1e-05, "loss": 0.685, "step": 284000 }, { "epoch": 105.72277963582312, "grad_norm": 2.8689208030700684, "learning_rate": 1e-05, "loss": 0.6873, "step": 284500 }, { "epoch": 105.90858416945373, "grad_norm": 2.0228753089904785, "learning_rate": 1e-05, "loss": 0.6849, "step": 285000 }, { "epoch": 106.0, "eval_accuracy": 0.7723384610024215, "eval_f1_macro": 0.4644160307431161, "eval_f1_micro": 0.7723384610024215, "eval_loss": 0.6444206237792969, "eval_runtime": 515.3095, "eval_samples_per_second": 111.391, "eval_steps_per_second": 1.741, "learning_rate": 1e-05, "step": 285246 }, { "epoch": 106.09438870308435, "grad_norm": 0.45027607679367065, "learning_rate": 1e-05, "loss": 0.6656, "step": 285500 }, { "epoch": 106.28019323671498, "grad_norm": 0.3893894553184509, "learning_rate": 1e-05, "loss": 0.5644, "step": 286000 }, { "epoch": 106.4659977703456, "grad_norm": 0.3509347438812256, "learning_rate": 1e-05, "loss": 0.4686, "step": 286500 }, { "epoch": 106.65180230397621, "grad_norm": 0.32040271162986755, "learning_rate": 1e-05, "loss": 0.3969, "step": 287000 }, { "epoch": 106.83760683760684, "grad_norm": 0.4328668415546417, "learning_rate": 1e-05, "loss": 0.3411, "step": 287500 }, { "epoch": 107.0, "eval_accuracy": 0.04797825821849794, "eval_f1_macro": 0.360721302464235, "eval_f1_micro": 0.4910938804941607, "eval_loss": 0.26552170515060425, "eval_runtime": 519.2635, "eval_samples_per_second": 110.543, "eval_steps_per_second": 1.727, "learning_rate": 1e-05, "step": 287937 }, { "epoch": 107.02341137123746, "grad_norm": 0.26703259348869324, "learning_rate": 1e-05, "loss": 0.2913, "step": 288000 }, { "epoch": 107.20921590486807, "grad_norm": 0.2284982055425644, "learning_rate": 1e-05, "loss": 0.2501, "step": 288500 }, { "epoch": 107.3950204384987, "grad_norm": 1.4908051490783691, "learning_rate": 1e-05, "loss": 0.2169, "step": 289000 }, { "epoch": 107.58082497212932, "grad_norm": 0.18897071480751038, "learning_rate": 1e-05, "loss": 0.1873, "step": 289500 }, { "epoch": 107.76662950575994, "grad_norm": 0.17562171816825867, "learning_rate": 1e-05, "loss": 0.1619, "step": 290000 }, { "epoch": 107.95243403939057, "grad_norm": 0.19028235971927643, "learning_rate": 1e-05, "loss": 0.141, "step": 290500 }, { "epoch": 108.0, "eval_accuracy": 0.44983536872179924, "eval_f1_macro": 0.22462065038139475, "eval_f1_micro": 0.6693680656054029, "eval_loss": 0.1419014185667038, "eval_runtime": 516.6186, "eval_samples_per_second": 111.109, "eval_steps_per_second": 1.736, "learning_rate": 1e-05, "step": 290628 }, { "epoch": 108.13823857302118, "grad_norm": 0.1382223218679428, "learning_rate": 1e-05, "loss": 0.1234, "step": 291000 }, { "epoch": 108.3240431066518, "grad_norm": 0.2637607753276825, "learning_rate": 1e-05, "loss": 0.109, "step": 291500 }, { "epoch": 108.50984764028242, "grad_norm": 0.16210788488388062, "learning_rate": 1e-05, "loss": 0.0977, "step": 292000 }, { "epoch": 108.69565217391305, "grad_norm": 0.1515241414308548, "learning_rate": 1e-05, "loss": 0.088, "step": 292500 }, { "epoch": 108.88145670754366, "grad_norm": 0.15725761651992798, "learning_rate": 1e-05, "loss": 0.0809, "step": 293000 }, { "epoch": 109.0, "eval_accuracy": 0.6714691381683245, "eval_f1_macro": 0.2136927959803109, "eval_f1_micro": 0.7449736568518617, "eval_loss": 0.07755623757839203, "eval_runtime": 503.5448, "eval_samples_per_second": 113.994, "eval_steps_per_second": 1.781, "learning_rate": 1e-05, "step": 293319 }, { "epoch": 109.06726124117428, "grad_norm": 0.13999715447425842, "learning_rate": 1e-05, "loss": 0.076, "step": 293500 }, { "epoch": 109.25306577480491, "grad_norm": 0.12521325051784515, "learning_rate": 1e-05, "loss": 0.0714, "step": 294000 }, { "epoch": 109.43887030843553, "grad_norm": 0.16435325145721436, "learning_rate": 1e-05, "loss": 0.0688, "step": 294500 }, { "epoch": 109.62467484206614, "grad_norm": 0.1280326545238495, "learning_rate": 1e-05, "loss": 0.066, "step": 295000 }, { "epoch": 109.81047937569677, "grad_norm": 0.1201464980840683, "learning_rate": 1e-05, "loss": 0.0643, "step": 295500 }, { "epoch": 109.99628390932739, "grad_norm": 0.09229467064142227, "learning_rate": 1e-05, "loss": 0.0621, "step": 296000 }, { "epoch": 110.0, "eval_accuracy": 0.6837163115625163, "eval_f1_macro": 0.26414762709864326, "eval_f1_micro": 0.7489470111853911, "eval_loss": 0.05802077427506447, "eval_runtime": 515.3576, "eval_samples_per_second": 111.381, "eval_steps_per_second": 1.741, "learning_rate": 1e-05, "step": 296010 }, { "epoch": 110.182088442958, "grad_norm": 0.12133761495351791, "learning_rate": 1e-05, "loss": 0.0613, "step": 296500 }, { "epoch": 110.36789297658864, "grad_norm": 0.11678178608417511, "learning_rate": 1e-05, "loss": 0.0601, "step": 297000 }, { "epoch": 110.55369751021925, "grad_norm": 0.11925112456083298, "learning_rate": 1e-05, "loss": 0.0596, "step": 297500 }, { "epoch": 110.73950204384987, "grad_norm": 0.16815921664237976, "learning_rate": 1e-05, "loss": 0.0589, "step": 298000 }, { "epoch": 110.92530657748048, "grad_norm": 0.11897558718919754, "learning_rate": 1e-05, "loss": 0.0582, "step": 298500 }, { "epoch": 111.0, "eval_accuracy": 0.6935245030574381, "eval_f1_macro": 0.331603552491686, "eval_f1_micro": 0.7547299175391458, "eval_loss": 0.053473543375730515, "eval_runtime": 517.7576, "eval_samples_per_second": 110.865, "eval_steps_per_second": 1.732, "learning_rate": 1e-05, "step": 298701 }, { "epoch": 111.11111111111111, "grad_norm": 0.20368880033493042, "learning_rate": 1e-05, "loss": 0.0588, "step": 299000 }, { "epoch": 111.29691564474173, "grad_norm": 0.1355835497379303, "learning_rate": 1e-05, "loss": 0.0577, "step": 299500 }, { "epoch": 111.48272017837235, "grad_norm": 0.15652874112129211, "learning_rate": 1e-05, "loss": 0.0569, "step": 300000 }, { "epoch": 111.66852471200298, "grad_norm": 0.1435033082962036, "learning_rate": 1e-05, "loss": 0.0575, "step": 300500 }, { "epoch": 111.8543292456336, "grad_norm": 0.13060764968395233, "learning_rate": 1e-05, "loss": 0.0568, "step": 301000 }, { "epoch": 112.0, "eval_accuracy": 0.6998135920976987, "eval_f1_macro": 0.3483537603081725, "eval_f1_micro": 0.7585280588776449, "eval_loss": 0.05167479068040848, "eval_runtime": 513.0638, "eval_samples_per_second": 111.879, "eval_steps_per_second": 1.748, "learning_rate": 1e-05, "step": 301392 }, { "epoch": 112.04013377926421, "grad_norm": 0.12529785931110382, "learning_rate": 1e-05, "loss": 0.0571, "step": 301500 }, { "epoch": 112.22593831289484, "grad_norm": 0.16798335313796997, "learning_rate": 1e-05, "loss": 0.0568, "step": 302000 }, { "epoch": 112.41174284652546, "grad_norm": 0.1425975114107132, "learning_rate": 1e-05, "loss": 0.0557, "step": 302500 }, { "epoch": 112.59754738015607, "grad_norm": 0.14742553234100342, "learning_rate": 1e-05, "loss": 0.0553, "step": 303000 }, { "epoch": 112.7833519137867, "grad_norm": 0.1510126292705536, "learning_rate": 1e-05, "loss": 0.056, "step": 303500 }, { "epoch": 112.96915644741732, "grad_norm": 0.14474214613437653, "learning_rate": 1e-05, "loss": 0.0557, "step": 304000 }, { "epoch": 113.0, "eval_accuracy": 0.7042734447135067, "eval_f1_macro": 0.3378893620669972, "eval_f1_micro": 0.7611001027447527, "eval_loss": 0.05106380954384804, "eval_runtime": 521.661, "eval_samples_per_second": 110.035, "eval_steps_per_second": 1.72, "learning_rate": 1e-05, "step": 304083 }, { "epoch": 113.15496098104794, "grad_norm": 0.12822121381759644, "learning_rate": 1e-05, "loss": 0.056, "step": 304500 }, { "epoch": 113.34076551467855, "grad_norm": 0.14637312293052673, "learning_rate": 1e-05, "loss": 0.0556, "step": 305000 }, { "epoch": 113.52657004830918, "grad_norm": 0.1259402185678482, "learning_rate": 1e-05, "loss": 0.0554, "step": 305500 }, { "epoch": 113.7123745819398, "grad_norm": 0.13158832490444183, "learning_rate": 1e-05, "loss": 0.0556, "step": 306000 }, { "epoch": 113.89817911557041, "grad_norm": 0.12858903408050537, "learning_rate": 1e-05, "loss": 0.0552, "step": 306500 }, { "epoch": 114.0, "eval_accuracy": 0.7053535652688978, "eval_f1_macro": 0.35703913789456687, "eval_f1_micro": 0.7622047244094489, "eval_loss": 0.05065497010946274, "eval_runtime": 534.7718, "eval_samples_per_second": 107.337, "eval_steps_per_second": 1.677, "learning_rate": 1e-05, "step": 306774 }, { "epoch": 114.08398364920104, "grad_norm": 0.16015706956386566, "learning_rate": 1e-05, "loss": 0.0548, "step": 307000 }, { "epoch": 114.26978818283166, "grad_norm": 0.13509486615657806, "learning_rate": 1e-05, "loss": 0.0551, "step": 307500 }, { "epoch": 114.45559271646228, "grad_norm": 0.14582909643650055, "learning_rate": 1e-05, "loss": 0.0547, "step": 308000 }, { "epoch": 114.64139725009291, "grad_norm": 0.14752507209777832, "learning_rate": 1e-05, "loss": 0.0552, "step": 308500 }, { "epoch": 114.82720178372352, "grad_norm": 0.1321249157190323, "learning_rate": 1e-05, "loss": 0.0555, "step": 309000 }, { "epoch": 115.0, "eval_accuracy": 0.7117820247034023, "eval_f1_macro": 0.36428903036337407, "eval_f1_micro": 0.7647240545893983, "eval_loss": 0.05039990693330765, "eval_runtime": 609.7033, "eval_samples_per_second": 94.146, "eval_steps_per_second": 1.471, "learning_rate": 1e-05, "step": 309465 }, { "epoch": 115.01300631735414, "grad_norm": 0.1596326380968094, "learning_rate": 1e-05, "loss": 0.0546, "step": 309500 }, { "epoch": 115.19881085098477, "grad_norm": 0.19358591735363007, "learning_rate": 1e-05, "loss": 0.0547, "step": 310000 }, { "epoch": 115.38461538461539, "grad_norm": 0.1181555911898613, "learning_rate": 1e-05, "loss": 0.0551, "step": 310500 }, { "epoch": 115.570419918246, "grad_norm": 0.16747964918613434, "learning_rate": 1e-05, "loss": 0.0551, "step": 311000 }, { "epoch": 115.75622445187662, "grad_norm": 0.1441546380519867, "learning_rate": 1e-05, "loss": 0.0544, "step": 311500 }, { "epoch": 115.94202898550725, "grad_norm": 0.1473866105079651, "learning_rate": 1e-05, "loss": 0.0546, "step": 312000 }, { "epoch": 116.0, "eval_accuracy": 0.7101966864688769, "eval_f1_macro": 0.3622993891059384, "eval_f1_micro": 0.7647289615591668, "eval_loss": 0.05015714839100838, "eval_runtime": 523.6323, "eval_samples_per_second": 109.621, "eval_steps_per_second": 1.713, "learning_rate": 1e-05, "step": 312156 }, { "epoch": 116.12783351913787, "grad_norm": 0.18043090403079987, "learning_rate": 1e-05, "loss": 0.0548, "step": 312500 }, { "epoch": 116.31363805276848, "grad_norm": 0.1623302400112152, "learning_rate": 1e-05, "loss": 0.0542, "step": 313000 }, { "epoch": 116.49944258639911, "grad_norm": 0.1569686233997345, "learning_rate": 1e-05, "loss": 0.055, "step": 313500 }, { "epoch": 116.68524712002973, "grad_norm": 0.17043337225914001, "learning_rate": 1e-05, "loss": 0.0541, "step": 314000 }, { "epoch": 116.87105165366034, "grad_norm": 0.15649768710136414, "learning_rate": 1e-05, "loss": 0.0545, "step": 314500 }, { "epoch": 117.0, "eval_accuracy": 0.712914409156635, "eval_f1_macro": 0.36544151863175506, "eval_f1_micro": 0.7657223847509677, "eval_loss": 0.050175271928310394, "eval_runtime": 598.1573, "eval_samples_per_second": 95.963, "eval_steps_per_second": 1.5, "learning_rate": 1e-05, "step": 314847 }, { "epoch": 117.05685618729098, "grad_norm": 0.1493714600801468, "learning_rate": 1e-05, "loss": 0.0549, "step": 315000 }, { "epoch": 117.24266072092159, "grad_norm": 0.1313825249671936, "learning_rate": 1e-05, "loss": 0.0547, "step": 315500 }, { "epoch": 117.42846525455221, "grad_norm": 0.13993392884731293, "learning_rate": 1e-05, "loss": 0.0541, "step": 316000 }, { "epoch": 117.61426978818284, "grad_norm": 0.13758248090744019, "learning_rate": 1e-05, "loss": 0.0546, "step": 316500 }, { "epoch": 117.80007432181345, "grad_norm": 0.16916200518608093, "learning_rate": 1e-05, "loss": 0.0545, "step": 317000 }, { "epoch": 117.98587885544407, "grad_norm": 0.1452600359916687, "learning_rate": 1e-05, "loss": 0.0535, "step": 317500 }, { "epoch": 118.0, "eval_accuracy": 0.7141687427048309, "eval_f1_macro": 0.3524192831401073, "eval_f1_micro": 0.7654782537680462, "eval_loss": 0.050468478351831436, "eval_runtime": 549.4659, "eval_samples_per_second": 104.467, "eval_steps_per_second": 1.632, "learning_rate": 1e-05, "step": 317538 }, { "epoch": 118.17168338907469, "grad_norm": 0.12589971721172333, "learning_rate": 1e-05, "loss": 0.054, "step": 318000 }, { "epoch": 118.35748792270532, "grad_norm": 0.17545440793037415, "learning_rate": 1e-05, "loss": 0.0539, "step": 318500 }, { "epoch": 118.54329245633593, "grad_norm": 0.15301626920700073, "learning_rate": 1e-05, "loss": 0.0541, "step": 319000 }, { "epoch": 118.72909698996655, "grad_norm": 0.1384187638759613, "learning_rate": 1e-05, "loss": 0.0545, "step": 319500 }, { "epoch": 118.91490152359718, "grad_norm": 0.16759108006954193, "learning_rate": 1e-05, "loss": 0.0539, "step": 320000 }, { "epoch": 119.0, "eval_accuracy": 0.7127053535652689, "eval_f1_macro": 0.34416444697858145, "eval_f1_micro": 0.7658673932788375, "eval_loss": 0.049900032579898834, "eval_runtime": 491.987, "eval_samples_per_second": 116.672, "eval_steps_per_second": 1.823, "learning_rate": 1e-05, "step": 320229 }, { "epoch": 119.1007060572278, "grad_norm": 0.15850204229354858, "learning_rate": 1e-05, "loss": 0.0538, "step": 320500 }, { "epoch": 119.28651059085841, "grad_norm": 0.18434040248394012, "learning_rate": 1e-05, "loss": 0.0538, "step": 321000 }, { "epoch": 119.47231512448904, "grad_norm": 0.15472249686717987, "learning_rate": 1e-05, "loss": 0.054, "step": 321500 }, { "epoch": 119.65811965811966, "grad_norm": 0.1760583370923996, "learning_rate": 1e-05, "loss": 0.0544, "step": 322000 }, { "epoch": 119.84392419175028, "grad_norm": 0.1788097620010376, "learning_rate": 1e-05, "loss": 0.0541, "step": 322500 }, { "epoch": 120.0, "eval_accuracy": 0.7130886221494399, "eval_f1_macro": 0.35077501544817247, "eval_f1_micro": 0.7657258505633957, "eval_loss": 0.049903545528650284, "eval_runtime": 513.1716, "eval_samples_per_second": 111.855, "eval_steps_per_second": 1.748, "learning_rate": 1e-05, "step": 322920 }, { "epoch": 120.0297287253809, "grad_norm": 0.1507652848958969, "learning_rate": 1e-05, "loss": 0.0538, "step": 323000 }, { "epoch": 120.21553325901152, "grad_norm": 0.1735050082206726, "learning_rate": 1e-05, "loss": 0.0539, "step": 323500 }, { "epoch": 120.40133779264214, "grad_norm": 0.14820708334445953, "learning_rate": 1e-05, "loss": 0.0543, "step": 324000 }, { "epoch": 120.58714232627275, "grad_norm": 0.14484427869319916, "learning_rate": 1e-05, "loss": 0.0537, "step": 324500 }, { "epoch": 120.77294685990339, "grad_norm": 0.15814656019210815, "learning_rate": 1e-05, "loss": 0.0542, "step": 325000 }, { "epoch": 120.958751393534, "grad_norm": 0.19824689626693726, "learning_rate": 1e-05, "loss": 0.0539, "step": 325500 }, { "epoch": 121.0, "eval_accuracy": 0.7140816362084285, "eval_f1_macro": 0.3627670615797559, "eval_f1_micro": 0.7665756914119359, "eval_loss": 0.04957958310842514, "eval_runtime": 509.187, "eval_samples_per_second": 112.731, "eval_steps_per_second": 1.762, "learning_rate": 1e-05, "step": 325611 }, { "epoch": 121.14455592716462, "grad_norm": 0.16547606885433197, "learning_rate": 1e-05, "loss": 0.054, "step": 326000 }, { "epoch": 121.33036046079525, "grad_norm": 0.2055787444114685, "learning_rate": 1e-05, "loss": 0.0538, "step": 326500 }, { "epoch": 121.51616499442586, "grad_norm": 0.2001011222600937, "learning_rate": 1e-05, "loss": 0.0539, "step": 327000 }, { "epoch": 121.70196952805648, "grad_norm": 0.17856118083000183, "learning_rate": 1e-05, "loss": 0.0535, "step": 327500 }, { "epoch": 121.88777406168711, "grad_norm": 0.14508357644081116, "learning_rate": 1e-05, "loss": 0.0542, "step": 328000 }, { "epoch": 122.0, "eval_accuracy": 0.7163812477134545, "eval_f1_macro": 0.35293584502475456, "eval_f1_micro": 0.7672263726699065, "eval_loss": 0.04973344877362251, "eval_runtime": 495.422, "eval_samples_per_second": 115.863, "eval_steps_per_second": 1.811, "learning_rate": 1e-05, "step": 328302 }, { "epoch": 122.07357859531773, "grad_norm": 0.1540980488061905, "learning_rate": 1e-05, "loss": 0.0543, "step": 328500 }, { "epoch": 122.25938312894834, "grad_norm": 0.18639309704303741, "learning_rate": 1e-05, "loss": 0.0542, "step": 329000 }, { "epoch": 122.44518766257897, "grad_norm": 0.15830326080322266, "learning_rate": 1e-05, "loss": 0.0532, "step": 329500 }, { "epoch": 122.63099219620959, "grad_norm": 0.17811599373817444, "learning_rate": 1e-05, "loss": 0.0537, "step": 330000 }, { "epoch": 122.8167967298402, "grad_norm": 0.1285717487335205, "learning_rate": 1e-05, "loss": 0.0536, "step": 330500 }, { "epoch": 123.0, "eval_accuracy": 0.7153533910559049, "eval_f1_macro": 0.36741171960145996, "eval_f1_micro": 0.7661930650098223, "eval_loss": 0.04949206858873367, "eval_runtime": 496.6111, "eval_samples_per_second": 115.585, "eval_steps_per_second": 1.806, "learning_rate": 1e-05, "step": 330993 }, { "epoch": 123.00260126347082, "grad_norm": 0.17617733776569366, "learning_rate": 1e-05, "loss": 0.0537, "step": 331000 }, { "epoch": 123.18840579710145, "grad_norm": 0.20897239446640015, "learning_rate": 1e-05, "loss": 0.0539, "step": 331500 }, { "epoch": 123.37421033073207, "grad_norm": 0.1522960364818573, "learning_rate": 1e-05, "loss": 0.0536, "step": 332000 }, { "epoch": 123.56001486436269, "grad_norm": 0.18495243787765503, "learning_rate": 1e-05, "loss": 0.0542, "step": 332500 }, { "epoch": 123.74581939799332, "grad_norm": 0.15397348999977112, "learning_rate": 1e-05, "loss": 0.0541, "step": 333000 }, { "epoch": 123.93162393162393, "grad_norm": 0.16448819637298584, "learning_rate": 1e-05, "loss": 0.0539, "step": 333500 }, { "epoch": 124.0, "eval_accuracy": 0.7160676643264055, "eval_f1_macro": 0.36413807211107735, "eval_f1_micro": 0.7673595994775795, "eval_loss": 0.049613192677497864, "eval_runtime": 491.9647, "eval_samples_per_second": 116.677, "eval_steps_per_second": 1.823, "learning_rate": 1e-05, "step": 333684 }, { "epoch": 124.11742846525455, "grad_norm": 0.14923396706581116, "learning_rate": 1e-05, "loss": 0.0532, "step": 334000 }, { "epoch": 124.30323299888518, "grad_norm": 0.1714375615119934, "learning_rate": 1e-05, "loss": 0.0534, "step": 334500 }, { "epoch": 124.4890375325158, "grad_norm": 0.17104235291481018, "learning_rate": 1e-05, "loss": 0.0542, "step": 335000 }, { "epoch": 124.67484206614641, "grad_norm": 0.16875581443309784, "learning_rate": 1e-05, "loss": 0.0542, "step": 335500 }, { "epoch": 124.86064659977704, "grad_norm": 0.14356301724910736, "learning_rate": 1e-05, "loss": 0.0535, "step": 336000 }, { "epoch": 125.0, "eval_accuracy": 0.7124440340760614, "eval_f1_macro": 0.3509397162428964, "eval_f1_micro": 0.7658070643240676, "eval_loss": 0.04959910735487938, "eval_runtime": 487.4427, "eval_samples_per_second": 117.759, "eval_steps_per_second": 1.84, "learning_rate": 1e-05, "step": 336375 }, { "epoch": 125.04645113340766, "grad_norm": 0.15746818482875824, "learning_rate": 1e-05, "loss": 0.053, "step": 336500 }, { "epoch": 125.23225566703827, "grad_norm": 0.1541147232055664, "learning_rate": 1e-05, "loss": 0.0534, "step": 337000 }, { "epoch": 125.41806020066889, "grad_norm": 0.15555234253406525, "learning_rate": 1e-05, "loss": 0.0534, "step": 337500 }, { "epoch": 125.60386473429952, "grad_norm": 0.17335093021392822, "learning_rate": 1e-05, "loss": 0.054, "step": 338000 }, { "epoch": 125.78966926793014, "grad_norm": 0.18515528738498688, "learning_rate": 1e-05, "loss": 0.0538, "step": 338500 }, { "epoch": 125.97547380156075, "grad_norm": 0.17530472576618195, "learning_rate": 1e-05, "loss": 0.0536, "step": 339000 }, { "epoch": 126.0, "eval_accuracy": 0.7152662845595025, "eval_f1_macro": 0.37424111866414195, "eval_f1_micro": 0.7660751240774316, "eval_loss": 0.0494619682431221, "eval_runtime": 502.6945, "eval_samples_per_second": 114.187, "eval_steps_per_second": 1.784, "learning_rate": 1e-05, "step": 339066 }, { "epoch": 126.16127833519138, "grad_norm": 0.1517336517572403, "learning_rate": 1e-05, "loss": 0.0534, "step": 339500 }, { "epoch": 126.347082868822, "grad_norm": 0.1815425604581833, "learning_rate": 1e-05, "loss": 0.0542, "step": 340000 }, { "epoch": 126.53288740245262, "grad_norm": 0.17021115124225616, "learning_rate": 1e-05, "loss": 0.0538, "step": 340500 }, { "epoch": 126.71869193608325, "grad_norm": 0.19511722028255463, "learning_rate": 1e-05, "loss": 0.0534, "step": 341000 }, { "epoch": 126.90449646971386, "grad_norm": 0.22142985463142395, "learning_rate": 1e-05, "loss": 0.054, "step": 341500 }, { "epoch": 127.0, "eval_accuracy": 0.7149178585738925, "eval_f1_macro": 0.35735030768195364, "eval_f1_micro": 0.766314294299216, "eval_loss": 0.049399666488170624, "eval_runtime": 486.665, "eval_samples_per_second": 117.948, "eval_steps_per_second": 1.843, "learning_rate": 1e-05, "step": 341757 }, { "epoch": 127.09030100334448, "grad_norm": 0.1855282485485077, "learning_rate": 1e-05, "loss": 0.0527, "step": 342000 }, { "epoch": 127.27610553697511, "grad_norm": 0.14693668484687805, "learning_rate": 1e-05, "loss": 0.0531, "step": 342500 }, { "epoch": 127.46191007060573, "grad_norm": 0.19144034385681152, "learning_rate": 1e-05, "loss": 0.0537, "step": 343000 }, { "epoch": 127.64771460423634, "grad_norm": 0.16127558052539825, "learning_rate": 1e-05, "loss": 0.0536, "step": 343500 }, { "epoch": 127.83351913786696, "grad_norm": 0.20746751129627228, "learning_rate": 1e-05, "loss": 0.054, "step": 344000 }, { "epoch": 128.0, "eval_accuracy": 0.714412640894758, "eval_f1_macro": 0.36010176970077795, "eval_f1_micro": 0.7664776721721585, "eval_loss": 0.04938925430178642, "eval_runtime": 486.3797, "eval_samples_per_second": 118.017, "eval_steps_per_second": 1.844, "learning_rate": 1e-05, "step": 344448 }, { "epoch": 128.01932367149757, "grad_norm": 0.13175125420093536, "learning_rate": 1e-05, "loss": 0.0534, "step": 344500 }, { "epoch": 128.2051282051282, "grad_norm": 0.1861082762479782, "learning_rate": 1e-05, "loss": 0.0534, "step": 345000 }, { "epoch": 128.39093273875883, "grad_norm": 0.16761593520641327, "learning_rate": 1e-05, "loss": 0.0533, "step": 345500 }, { "epoch": 128.57673727238944, "grad_norm": 0.18102224171161652, "learning_rate": 1e-05, "loss": 0.0536, "step": 346000 }, { "epoch": 128.76254180602007, "grad_norm": 0.166373148560524, "learning_rate": 1e-05, "loss": 0.0535, "step": 346500 }, { "epoch": 128.9483463396507, "grad_norm": 0.2147304117679596, "learning_rate": 1e-05, "loss": 0.0538, "step": 347000 }, { "epoch": 129.0, "eval_accuracy": 0.717931743349419, "eval_f1_macro": 0.3641550142658243, "eval_f1_micro": 0.7674323253122921, "eval_loss": 0.049368634819984436, "eval_runtime": 497.4863, "eval_samples_per_second": 115.382, "eval_steps_per_second": 1.803, "learning_rate": 1e-05, "step": 347139 }, { "epoch": 129.1341508732813, "grad_norm": 0.15825386345386505, "learning_rate": 1e-05, "loss": 0.0536, "step": 347500 }, { "epoch": 129.31995540691193, "grad_norm": 0.17780201137065887, "learning_rate": 1e-05, "loss": 0.0532, "step": 348000 }, { "epoch": 129.50575994054256, "grad_norm": 0.1677912026643753, "learning_rate": 1e-05, "loss": 0.0535, "step": 348500 }, { "epoch": 129.69156447417316, "grad_norm": 0.18808604776859283, "learning_rate": 1e-05, "loss": 0.0534, "step": 349000 }, { "epoch": 129.8773690078038, "grad_norm": 0.1602342426776886, "learning_rate": 1e-05, "loss": 0.0538, "step": 349500 }, { "epoch": 130.0, "eval_accuracy": 0.717687845159492, "eval_f1_macro": 0.3602711408206009, "eval_f1_micro": 0.7667705923765463, "eval_loss": 0.049409620463848114, "eval_runtime": 497.5454, "eval_samples_per_second": 115.368, "eval_steps_per_second": 1.803, "learning_rate": 1e-05, "step": 349830 }, { "epoch": 130.06317354143442, "grad_norm": 0.23750899732112885, "learning_rate": 1e-05, "loss": 0.0532, "step": 350000 }, { "epoch": 130.24897807506503, "grad_norm": 0.16858229041099548, "learning_rate": 1e-05, "loss": 0.0529, "step": 350500 }, { "epoch": 130.43478260869566, "grad_norm": 0.19223752617835999, "learning_rate": 1e-05, "loss": 0.0535, "step": 351000 }, { "epoch": 130.6205871423263, "grad_norm": 0.1930113583803177, "learning_rate": 1e-05, "loss": 0.0536, "step": 351500 }, { "epoch": 130.8063916759569, "grad_norm": 0.20461726188659668, "learning_rate": 1e-05, "loss": 0.054, "step": 352000 }, { "epoch": 130.99219620958752, "grad_norm": 0.20236244797706604, "learning_rate": 1e-05, "loss": 0.0531, "step": 352500 }, { "epoch": 131.0, "eval_accuracy": 0.718210484137907, "eval_f1_macro": 0.3664294602272974, "eval_f1_micro": 0.7665082507046622, "eval_loss": 0.04939533770084381, "eval_runtime": 493.8302, "eval_samples_per_second": 116.236, "eval_steps_per_second": 1.816, "learning_rate": 1e-05, "step": 352521 }, { "epoch": 131.17800074321812, "grad_norm": 0.16268064081668854, "learning_rate": 1e-05, "loss": 0.0536, "step": 353000 }, { "epoch": 131.36380527684875, "grad_norm": 0.22975020110607147, "learning_rate": 1e-05, "loss": 0.0534, "step": 353500 }, { "epoch": 131.54960981047938, "grad_norm": 0.16796068847179413, "learning_rate": 1e-05, "loss": 0.0533, "step": 354000 }, { "epoch": 131.73541434410998, "grad_norm": 0.2104586660861969, "learning_rate": 1e-05, "loss": 0.0533, "step": 354500 }, { "epoch": 131.9212188777406, "grad_norm": 0.17215226590633392, "learning_rate": 1e-05, "loss": 0.0528, "step": 355000 }, { "epoch": 132.0, "eval_accuracy": 0.717583317363809, "eval_f1_macro": 0.3651176446191739, "eval_f1_micro": 0.7664564319910658, "eval_loss": 0.04943186417222023, "eval_runtime": 487.4839, "eval_samples_per_second": 117.75, "eval_steps_per_second": 1.84, "learning_rate": 1e-05, "step": 355212 }, { "epoch": 132.10702341137124, "grad_norm": 0.1763976663351059, "learning_rate": 1e-05, "loss": 0.0534, "step": 355500 }, { "epoch": 132.29282794500185, "grad_norm": 0.19165627658367157, "learning_rate": 1e-05, "loss": 0.0534, "step": 356000 }, { "epoch": 132.47863247863248, "grad_norm": 0.18794529139995575, "learning_rate": 1e-05, "loss": 0.0535, "step": 356500 }, { "epoch": 132.6644370122631, "grad_norm": 0.19783887267112732, "learning_rate": 1e-05, "loss": 0.0535, "step": 357000 }, { "epoch": 132.8502415458937, "grad_norm": 0.14160636067390442, "learning_rate": 1e-05, "loss": 0.053, "step": 357500 }, { "epoch": 133.0, "eval_accuracy": 0.714621696486124, "eval_f1_macro": 0.36115748131858555, "eval_f1_micro": 0.7658437005098911, "eval_loss": 0.049288176000118256, "eval_runtime": 479.1036, "eval_samples_per_second": 119.809, "eval_steps_per_second": 1.872, "learning_rate": 1e-05, "step": 357903 }, { "epoch": 133.03604607952434, "grad_norm": 0.18090824782848358, "learning_rate": 1e-05, "loss": 0.0533, "step": 358000 }, { "epoch": 133.22185061315497, "grad_norm": 0.19353412091732025, "learning_rate": 1e-05, "loss": 0.0536, "step": 358500 }, { "epoch": 133.40765514678557, "grad_norm": 0.22670786082744598, "learning_rate": 1e-05, "loss": 0.0535, "step": 359000 }, { "epoch": 133.5934596804162, "grad_norm": 0.18295426666736603, "learning_rate": 1e-05, "loss": 0.0533, "step": 359500 }, { "epoch": 133.77926421404683, "grad_norm": 0.20159971714019775, "learning_rate": 1e-05, "loss": 0.0532, "step": 360000 }, { "epoch": 133.96506874767744, "grad_norm": 0.21708709001541138, "learning_rate": 1e-05, "loss": 0.0534, "step": 360500 }, { "epoch": 134.0, "eval_accuracy": 0.7154753401508684, "eval_f1_macro": 0.3677607284274943, "eval_f1_micro": 0.7659699195779215, "eval_loss": 0.04927274212241173, "eval_runtime": 492.3636, "eval_samples_per_second": 116.583, "eval_steps_per_second": 1.822, "learning_rate": 1e-05, "step": 360594 }, { "epoch": 134.15087328130807, "grad_norm": 0.1716473251581192, "learning_rate": 1e-05, "loss": 0.053, "step": 361000 }, { "epoch": 134.3366778149387, "grad_norm": 0.1569896936416626, "learning_rate": 1e-05, "loss": 0.0534, "step": 361500 }, { "epoch": 134.5224823485693, "grad_norm": 0.18859770894050598, "learning_rate": 1e-05, "loss": 0.0534, "step": 362000 }, { "epoch": 134.70828688219993, "grad_norm": 0.18753333389759064, "learning_rate": 1e-05, "loss": 0.053, "step": 362500 }, { "epoch": 134.89409141583056, "grad_norm": 0.19746656715869904, "learning_rate": 1e-05, "loss": 0.0528, "step": 363000 }, { "epoch": 135.0, "eval_accuracy": 0.7189944426055295, "eval_f1_macro": 0.37226237632641596, "eval_f1_micro": 0.7674080308866179, "eval_loss": 0.04929700121283531, "eval_runtime": 476.5646, "eval_samples_per_second": 120.447, "eval_steps_per_second": 1.882, "learning_rate": 1e-05, "step": 363285 }, { "epoch": 135.07989594946116, "grad_norm": 0.1843235194683075, "learning_rate": 1e-05, "loss": 0.053, "step": 363500 }, { "epoch": 135.2657004830918, "grad_norm": 0.17511311173439026, "learning_rate": 1e-05, "loss": 0.0532, "step": 364000 }, { "epoch": 135.45150501672242, "grad_norm": 0.20711584389209747, "learning_rate": 1e-05, "loss": 0.053, "step": 364500 }, { "epoch": 135.63730955035302, "grad_norm": 0.19971944391727448, "learning_rate": 1e-05, "loss": 0.0529, "step": 365000 }, { "epoch": 135.82311408398365, "grad_norm": 0.2045559585094452, "learning_rate": 1e-05, "loss": 0.0533, "step": 365500 }, { "epoch": 136.0, "eval_accuracy": 0.7151094928659779, "eval_f1_macro": 0.35969130867283144, "eval_f1_micro": 0.766711291239524, "eval_loss": 0.049187980592250824, "eval_runtime": 502.7189, "eval_samples_per_second": 114.181, "eval_steps_per_second": 1.784, "learning_rate": 1e-05, "step": 365976 }, { "epoch": 136.00891861761426, "grad_norm": 0.17318807542324066, "learning_rate": 1e-05, "loss": 0.0529, "step": 366000 }, { "epoch": 136.1947231512449, "grad_norm": 0.20163071155548096, "learning_rate": 1e-05, "loss": 0.0528, "step": 366500 }, { "epoch": 136.38052768487552, "grad_norm": 0.17554792761802673, "learning_rate": 1e-05, "loss": 0.0534, "step": 367000 }, { "epoch": 136.56633221850612, "grad_norm": 0.20222993195056915, "learning_rate": 1e-05, "loss": 0.0531, "step": 367500 }, { "epoch": 136.75213675213675, "grad_norm": 0.2069316953420639, "learning_rate": 1e-05, "loss": 0.0531, "step": 368000 }, { "epoch": 136.93794128576738, "grad_norm": 0.17365820705890656, "learning_rate": 1e-05, "loss": 0.0534, "step": 368500 }, { "epoch": 137.0, "eval_accuracy": 0.7157889235379175, "eval_f1_macro": 0.3631043131303743, "eval_f1_micro": 0.7664713487937058, "eval_loss": 0.0491538941860199, "eval_runtime": 518.3587, "eval_samples_per_second": 110.736, "eval_steps_per_second": 1.73, "learning_rate": 1e-05, "step": 368667 }, { "epoch": 137.12374581939798, "grad_norm": 0.19572311639785767, "learning_rate": 1e-05, "loss": 0.0533, "step": 369000 }, { "epoch": 137.3095503530286, "grad_norm": 0.1570323258638382, "learning_rate": 1e-05, "loss": 0.0528, "step": 369500 }, { "epoch": 137.49535488665924, "grad_norm": 0.16947729885578156, "learning_rate": 1e-05, "loss": 0.0533, "step": 370000 }, { "epoch": 137.68115942028984, "grad_norm": 0.16606110334396362, "learning_rate": 1e-05, "loss": 0.053, "step": 370500 }, { "epoch": 137.86696395392048, "grad_norm": 0.2260875552892685, "learning_rate": 1e-05, "loss": 0.0534, "step": 371000 }, { "epoch": 138.0, "eval_accuracy": 0.7178446368530165, "eval_f1_macro": 0.3687935119842292, "eval_f1_micro": 0.7665450277813434, "eval_loss": 0.04930136725306511, "eval_runtime": 510.8639, "eval_samples_per_second": 112.361, "eval_steps_per_second": 1.756, "learning_rate": 1e-05, "step": 371358 }, { "epoch": 138.0527684875511, "grad_norm": 0.163988396525383, "learning_rate": 1e-05, "loss": 0.0532, "step": 371500 }, { "epoch": 138.2385730211817, "grad_norm": 0.20549984276294708, "learning_rate": 1e-05, "loss": 0.053, "step": 372000 }, { "epoch": 138.42437755481234, "grad_norm": 0.18827009201049805, "learning_rate": 1e-05, "loss": 0.0534, "step": 372500 }, { "epoch": 138.61018208844297, "grad_norm": 0.2782110571861267, "learning_rate": 1e-05, "loss": 0.053, "step": 373000 }, { "epoch": 138.79598662207357, "grad_norm": 0.1959678679704666, "learning_rate": 1e-05, "loss": 0.0538, "step": 373500 }, { "epoch": 138.9817911557042, "grad_norm": 0.18641294538974762, "learning_rate": 1e-05, "loss": 0.053, "step": 374000 }, { "epoch": 139.0, "eval_accuracy": 0.7182279054371875, "eval_f1_macro": 0.35626916649899915, "eval_f1_micro": 0.766155421092079, "eval_loss": 0.04927237331867218, "eval_runtime": 508.9115, "eval_samples_per_second": 112.792, "eval_steps_per_second": 1.763, "learning_rate": 1e-05, "step": 374049 }, { "epoch": 139.16759568933483, "grad_norm": 0.16375960409641266, "learning_rate": 1e-05, "loss": 0.0531, "step": 374500 }, { "epoch": 139.35340022296543, "grad_norm": 0.19818973541259766, "learning_rate": 1e-05, "loss": 0.053, "step": 375000 }, { "epoch": 139.53920475659606, "grad_norm": 0.19776101410388947, "learning_rate": 1e-05, "loss": 0.0536, "step": 375500 }, { "epoch": 139.7250092902267, "grad_norm": 0.24181506037712097, "learning_rate": 1e-05, "loss": 0.0532, "step": 376000 }, { "epoch": 139.9108138238573, "grad_norm": 0.21174757182598114, "learning_rate": 1e-05, "loss": 0.0528, "step": 376500 }, { "epoch": 140.0, "eval_accuracy": 0.7197958223724326, "eval_f1_macro": 0.3699733355385332, "eval_f1_micro": 0.767376184687937, "eval_loss": 0.04918988421559334, "eval_runtime": 505.3732, "eval_samples_per_second": 113.581, "eval_steps_per_second": 1.775, "learning_rate": 1e-05, "step": 376740 }, { "epoch": 140.09661835748793, "grad_norm": 0.18043100833892822, "learning_rate": 1e-05, "loss": 0.0531, "step": 377000 }, { "epoch": 140.28242289111856, "grad_norm": 0.18150250613689423, "learning_rate": 1e-05, "loss": 0.0531, "step": 377500 }, { "epoch": 140.46822742474916, "grad_norm": 0.17685070633888245, "learning_rate": 1e-05, "loss": 0.0523, "step": 378000 }, { "epoch": 140.6540319583798, "grad_norm": 0.2478715479373932, "learning_rate": 1e-05, "loss": 0.0532, "step": 378500 }, { "epoch": 140.8398364920104, "grad_norm": 0.20271550118923187, "learning_rate": 1e-05, "loss": 0.0529, "step": 379000 }, { "epoch": 141.0, "eval_accuracy": 0.716468354209857, "eval_f1_macro": 0.35072596287625124, "eval_f1_micro": 0.7666041104041745, "eval_loss": 0.04920462518930435, "eval_runtime": 510.1227, "eval_samples_per_second": 112.524, "eval_steps_per_second": 1.758, "learning_rate": 1e-05, "step": 379431 }, { "epoch": 141.02564102564102, "grad_norm": 0.25310391187667847, "learning_rate": 1e-05, "loss": 0.0539, "step": 379500 }, { "epoch": 141.21144555927165, "grad_norm": 0.2286742925643921, "learning_rate": 1e-05, "loss": 0.0532, "step": 380000 }, { "epoch": 141.39725009290225, "grad_norm": 0.2335425764322281, "learning_rate": 1e-05, "loss": 0.0536, "step": 380500 }, { "epoch": 141.58305462653288, "grad_norm": 0.21884822845458984, "learning_rate": 1e-05, "loss": 0.0531, "step": 381000 }, { "epoch": 141.76885916016352, "grad_norm": 0.23641781508922577, "learning_rate": 1e-05, "loss": 0.053, "step": 381500 }, { "epoch": 141.95466369379412, "grad_norm": 0.19402460753917694, "learning_rate": 1e-05, "loss": 0.0529, "step": 382000 }, { "epoch": 142.0, "eval_accuracy": 0.7194473963868225, "eval_f1_macro": 0.36600085102264546, "eval_f1_micro": 0.7669340748803981, "eval_loss": 0.04919710010290146, "eval_runtime": 506.4483, "eval_samples_per_second": 113.34, "eval_steps_per_second": 1.771, "learning_rate": 1e-05, "step": 382122 }, { "epoch": 142.14046822742475, "grad_norm": 0.21041558682918549, "learning_rate": 1e-05, "loss": 0.0529, "step": 382500 }, { "epoch": 142.32627276105538, "grad_norm": 0.21750488877296448, "learning_rate": 1e-05, "loss": 0.0533, "step": 383000 }, { "epoch": 142.51207729468598, "grad_norm": 0.2097017467021942, "learning_rate": 1e-05, "loss": 0.053, "step": 383500 }, { "epoch": 142.6978818283166, "grad_norm": 0.2606968879699707, "learning_rate": 1e-05, "loss": 0.0535, "step": 384000 }, { "epoch": 142.88368636194724, "grad_norm": 0.18851463496685028, "learning_rate": 1e-05, "loss": 0.0534, "step": 384500 }, { "epoch": 143.0, "eval_accuracy": 0.7168342014947475, "eval_f1_macro": 0.3673237139632794, "eval_f1_micro": 0.765517685242224, "eval_loss": 0.04930509999394417, "eval_runtime": 513.7066, "eval_samples_per_second": 111.739, "eval_steps_per_second": 1.746, "learning_rate": 1e-05, "step": 384813 }, { "epoch": 143.06949089557784, "grad_norm": 0.243364155292511, "learning_rate": 1.0000000000000002e-06, "loss": 0.0527, "step": 385000 }, { "epoch": 143.25529542920847, "grad_norm": 0.2838144600391388, "learning_rate": 1.0000000000000002e-06, "loss": 0.053, "step": 385500 }, { "epoch": 143.4410999628391, "grad_norm": 0.24525409936904907, "learning_rate": 1.0000000000000002e-06, "loss": 0.0528, "step": 386000 }, { "epoch": 143.6269044964697, "grad_norm": 0.2145887315273285, "learning_rate": 1.0000000000000002e-06, "loss": 0.0527, "step": 386500 }, { "epoch": 143.81270903010034, "grad_norm": 0.16669905185699463, "learning_rate": 1.0000000000000002e-06, "loss": 0.0531, "step": 387000 }, { "epoch": 143.99851356373097, "grad_norm": 0.23091119527816772, "learning_rate": 1.0000000000000002e-06, "loss": 0.0528, "step": 387500 }, { "epoch": 144.0, "eval_accuracy": 0.7171477848817965, "eval_f1_macro": 0.3554021435508309, "eval_f1_micro": 0.7667940015206897, "eval_loss": 0.0490318201482296, "eval_runtime": 525.9642, "eval_samples_per_second": 109.135, "eval_steps_per_second": 1.705, "learning_rate": 1.0000000000000002e-06, "step": 387504 }, { "epoch": 144.18431809736157, "grad_norm": 0.2074396163225174, "learning_rate": 1.0000000000000002e-06, "loss": 0.0531, "step": 388000 }, { "epoch": 144.3701226309922, "grad_norm": 0.2579312026500702, "learning_rate": 1.0000000000000002e-06, "loss": 0.0529, "step": 388500 }, { "epoch": 144.55592716462283, "grad_norm": 0.1861879676580429, "learning_rate": 1.0000000000000002e-06, "loss": 0.0528, "step": 389000 }, { "epoch": 144.74173169825343, "grad_norm": 0.21441423892974854, "learning_rate": 1.0000000000000002e-06, "loss": 0.0526, "step": 389500 }, { "epoch": 144.92753623188406, "grad_norm": 0.23621511459350586, "learning_rate": 1.0000000000000002e-06, "loss": 0.0534, "step": 390000 }, { "epoch": 145.0, "eval_accuracy": 0.7201616696573231, "eval_f1_macro": 0.3711029550898432, "eval_f1_micro": 0.7677822164123848, "eval_loss": 0.04918621480464935, "eval_runtime": 507.392, "eval_samples_per_second": 113.129, "eval_steps_per_second": 1.768, "learning_rate": 1.0000000000000002e-06, "step": 390195 }, { "epoch": 145.1133407655147, "grad_norm": 0.2494996339082718, "learning_rate": 1.0000000000000002e-06, "loss": 0.0533, "step": 390500 }, { "epoch": 145.2991452991453, "grad_norm": 0.19221335649490356, "learning_rate": 1.0000000000000002e-06, "loss": 0.0526, "step": 391000 }, { "epoch": 145.48494983277592, "grad_norm": 0.19597986340522766, "learning_rate": 1.0000000000000002e-06, "loss": 0.0533, "step": 391500 }, { "epoch": 145.67075436640653, "grad_norm": 0.19304433465003967, "learning_rate": 1.0000000000000002e-06, "loss": 0.0526, "step": 392000 }, { "epoch": 145.85655890003716, "grad_norm": 0.21061711013317108, "learning_rate": 1.0000000000000002e-06, "loss": 0.0529, "step": 392500 }, { "epoch": 146.0, "eval_accuracy": 0.717130363582516, "eval_f1_macro": 0.368326447075977, "eval_f1_micro": 0.7665065530257804, "eval_loss": 0.04903709515929222, "eval_runtime": 590.7348, "eval_samples_per_second": 97.169, "eval_steps_per_second": 1.518, "learning_rate": 1.0000000000000002e-06, "step": 392886 }, { "epoch": 146.0423634336678, "grad_norm": 0.21325454115867615, "learning_rate": 1.0000000000000002e-06, "loss": 0.0529, "step": 393000 }, { "epoch": 146.2281679672984, "grad_norm": 0.15669451653957367, "learning_rate": 1.0000000000000002e-06, "loss": 0.0525, "step": 393500 }, { "epoch": 146.41397250092902, "grad_norm": 0.22324424982070923, "learning_rate": 1.0000000000000002e-06, "loss": 0.053, "step": 394000 }, { "epoch": 146.59977703455965, "grad_norm": 0.192140132188797, "learning_rate": 1.0000000000000002e-06, "loss": 0.0531, "step": 394500 }, { "epoch": 146.78558156819025, "grad_norm": 0.20880526304244995, "learning_rate": 1.0000000000000002e-06, "loss": 0.0532, "step": 395000 }, { "epoch": 146.97138610182088, "grad_norm": 0.21843858063220978, "learning_rate": 1.0000000000000002e-06, "loss": 0.0532, "step": 395500 }, { "epoch": 147.0, "eval_accuracy": 0.720823679029982, "eval_f1_macro": 0.37476196073915324, "eval_f1_micro": 0.768544776459646, "eval_loss": 0.049094948917627335, "eval_runtime": 525.4912, "eval_samples_per_second": 109.233, "eval_steps_per_second": 1.707, "learning_rate": 1.0000000000000002e-06, "step": 395577 }, { "epoch": 147.1571906354515, "grad_norm": 0.17876408994197845, "learning_rate": 1.0000000000000002e-06, "loss": 0.0521, "step": 396000 }, { "epoch": 147.34299516908212, "grad_norm": 0.25762057304382324, "learning_rate": 1.0000000000000002e-06, "loss": 0.0528, "step": 396500 }, { "epoch": 147.52879970271275, "grad_norm": 0.20070230960845947, "learning_rate": 1.0000000000000002e-06, "loss": 0.053, "step": 397000 }, { "epoch": 147.71460423634338, "grad_norm": 0.20209959149360657, "learning_rate": 1.0000000000000002e-06, "loss": 0.0536, "step": 397500 }, { "epoch": 147.90040876997398, "grad_norm": 0.22516289353370667, "learning_rate": 1.0000000000000002e-06, "loss": 0.053, "step": 398000 }, { "epoch": 148.0, "eval_accuracy": 0.7167296736990645, "eval_f1_macro": 0.3649988602534311, "eval_f1_micro": 0.7667018106807243, "eval_loss": 0.04907181113958359, "eval_runtime": 525.537, "eval_samples_per_second": 109.224, "eval_steps_per_second": 1.707, "learning_rate": 1.0000000000000002e-06, "step": 398268 }, { "epoch": 148.0862133036046, "grad_norm": 0.2316817343235016, "learning_rate": 1.0000000000000002e-06, "loss": 0.0532, "step": 398500 }, { "epoch": 148.27201783723524, "grad_norm": 0.2039523720741272, "learning_rate": 1.0000000000000002e-06, "loss": 0.0525, "step": 399000 }, { "epoch": 148.45782237086584, "grad_norm": 0.22887806594371796, "learning_rate": 1.0000000000000002e-06, "loss": 0.0533, "step": 399500 }, { "epoch": 148.64362690449647, "grad_norm": 0.20459462702274323, "learning_rate": 1.0000000000000002e-06, "loss": 0.0527, "step": 400000 }, { "epoch": 148.8294314381271, "grad_norm": 0.20509076118469238, "learning_rate": 1.0000000000000002e-06, "loss": 0.0526, "step": 400500 }, { "epoch": 149.0, "eval_accuracy": 0.718210484137907, "eval_f1_macro": 0.3787860055208151, "eval_f1_micro": 0.7671139893046166, "eval_loss": 0.04904184117913246, "eval_runtime": 553.2009, "eval_samples_per_second": 103.762, "eval_steps_per_second": 1.621, "learning_rate": 1.0000000000000002e-06, "step": 400959 }, { "epoch": 149.0152359717577, "grad_norm": 0.2034793645143509, "learning_rate": 1.0000000000000002e-06, "loss": 0.0536, "step": 401000 }, { "epoch": 149.20104050538833, "grad_norm": 0.21667377650737762, "learning_rate": 1.0000000000000002e-06, "loss": 0.0531, "step": 401500 }, { "epoch": 149.38684503901897, "grad_norm": 0.23273786902427673, "learning_rate": 1.0000000000000002e-06, "loss": 0.0526, "step": 402000 }, { "epoch": 149.57264957264957, "grad_norm": 0.2024523764848709, "learning_rate": 1.0000000000000002e-06, "loss": 0.0528, "step": 402500 }, { "epoch": 149.7584541062802, "grad_norm": 0.19701418280601501, "learning_rate": 1.0000000000000002e-06, "loss": 0.0519, "step": 403000 }, { "epoch": 149.94425863991083, "grad_norm": 0.20358909666538239, "learning_rate": 1.0000000000000002e-06, "loss": 0.0535, "step": 403500 }, { "epoch": 150.0, "eval_accuracy": 0.7154404975523074, "eval_f1_macro": 0.3726446424747262, "eval_f1_micro": 0.7667920374277589, "eval_loss": 0.04912904277443886, "eval_runtime": 540.2131, "eval_samples_per_second": 106.256, "eval_steps_per_second": 1.66, "learning_rate": 1.0000000000000002e-06, "step": 403650 }, { "epoch": 150.0, "learning_rate": 1.0000000000000002e-07, "step": 403650, "total_flos": 2.3283011301956885e+21, "train_loss": 0.019644906779846472, "train_runtime": 108552.2715, "train_samples_per_second": 237.919, "train_steps_per_second": 3.718 } ], "logging_steps": 500, "max_steps": 403650, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3283011301956885e+21, "train_batch_size": 64, "trial_name": null, "trial_params": null }