{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 10000, "global_step": 126850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.9999842333464724e-05, "loss": 0.1488, "step": 1 }, { "epoch": 0.39, "learning_rate": 1.9842333464722114e-05, "loss": 0.1737, "step": 1000 }, { "epoch": 0.79, "learning_rate": 1.968498226251478e-05, "loss": 0.1713, "step": 2000 }, { "epoch": 1.18, "learning_rate": 1.9527315727236894e-05, "loss": 0.1741, "step": 3000 }, { "epoch": 1.58, "learning_rate": 1.9369806858494287e-05, "loss": 0.1357, "step": 4000 }, { "epoch": 1.97, "learning_rate": 1.92121403232164e-05, "loss": 0.0985, "step": 5000 }, { "epoch": 2.36, "learning_rate": 1.9054473787938513e-05, "loss": 0.0746, "step": 6000 }, { "epoch": 2.76, "learning_rate": 1.8897437918801735e-05, "loss": 0.077, "step": 7000 }, { "epoch": 3.15, "learning_rate": 1.8739771383523847e-05, "loss": 0.0747, "step": 8000 }, { "epoch": 3.55, "learning_rate": 1.858210484824596e-05, "loss": 0.0607, "step": 9000 }, { "epoch": 3.94, "learning_rate": 1.8424438312968072e-05, "loss": 0.0648, "step": 10000 }, { "epoch": 3.94, "eval_accuracy": 0.971966859607309, "eval_f1": 0.8975528826213189, "eval_loss": 0.047781217843294144, "eval_precision": 0.9169491525423729, "eval_recall": 0.8789601949634444, "eval_runtime": 30.9282, "eval_samples_per_second": 284.886, "eval_steps_per_second": 35.631, "step": 10000 }, { "epoch": 4.34, "learning_rate": 1.8266771777690185e-05, "loss": 0.0604, "step": 11000 }, { "epoch": 4.73, "learning_rate": 1.8109105242412298e-05, "loss": 0.0587, "step": 12000 }, { "epoch": 5.12, "learning_rate": 1.795159637366969e-05, "loss": 0.0635, "step": 13000 }, { "epoch": 5.52, "learning_rate": 1.779408750492708e-05, "loss": 0.0558, "step": 14000 }, { "epoch": 5.91, "learning_rate": 1.7636420969649194e-05, "loss": 0.0525, "step": 15000 }, { "epoch": 6.31, "learning_rate": 1.7478754434371306e-05, "loss": 0.05, "step": 16000 }, { "epoch": 6.7, "learning_rate": 1.7321245565628696e-05, "loss": 0.0513, "step": 17000 }, { "epoch": 7.09, "learning_rate": 1.716357903035081e-05, "loss": 0.0423, "step": 18000 }, { "epoch": 7.49, "learning_rate": 1.70060701616082e-05, "loss": 0.0417, "step": 19000 }, { "epoch": 7.88, "learning_rate": 1.684840362633031e-05, "loss": 0.0432, "step": 20000 }, { "epoch": 7.88, "eval_accuracy": 0.9800249687890137, "eval_f1": 0.9293172690763052, "eval_loss": 0.03126443922519684, "eval_precision": 0.9189833200953137, "eval_recall": 0.9398862713241267, "eval_runtime": 30.5459, "eval_samples_per_second": 288.451, "eval_steps_per_second": 36.077, "step": 20000 }, { "epoch": 8.28, "learning_rate": 1.6690737091052424e-05, "loss": 0.0378, "step": 21000 }, { "epoch": 8.67, "learning_rate": 1.6533070555774536e-05, "loss": 0.041, "step": 22000 }, { "epoch": 9.07, "learning_rate": 1.637540402049665e-05, "loss": 0.0403, "step": 23000 }, { "epoch": 9.46, "learning_rate": 1.621805281828932e-05, "loss": 0.0338, "step": 24000 }, { "epoch": 9.85, "learning_rate": 1.6060386283011432e-05, "loss": 0.0352, "step": 25000 }, { "epoch": 10.25, "learning_rate": 1.5902877414268822e-05, "loss": 0.0356, "step": 26000 }, { "epoch": 10.64, "learning_rate": 1.5745210878990935e-05, "loss": 0.0343, "step": 27000 }, { "epoch": 11.04, "learning_rate": 1.5587702010248328e-05, "loss": 0.039, "step": 28000 }, { "epoch": 11.43, "learning_rate": 1.543003547497044e-05, "loss": 0.0336, "step": 29000 }, { "epoch": 11.82, "learning_rate": 1.5272368939692553e-05, "loss": 0.0294, "step": 30000 }, { "epoch": 11.82, "eval_accuracy": 0.9878560889796845, "eval_f1": 0.9563799429270281, "eval_loss": 0.023840567097067833, "eval_precision": 0.9599018003273322, "eval_recall": 0.9528838342810723, "eval_runtime": 29.3298, "eval_samples_per_second": 300.411, "eval_steps_per_second": 37.573, "step": 30000 }, { "epoch": 12.22, "learning_rate": 1.5114702404414664e-05, "loss": 0.0362, "step": 31000 }, { "epoch": 12.61, "learning_rate": 1.4957035869136779e-05, "loss": 0.0352, "step": 32000 }, { "epoch": 13.01, "learning_rate": 1.4799369333858891e-05, "loss": 0.0297, "step": 33000 }, { "epoch": 13.4, "learning_rate": 1.464186046511628e-05, "loss": 0.0316, "step": 34000 }, { "epoch": 13.8, "learning_rate": 1.4484193929838392e-05, "loss": 0.0312, "step": 35000 }, { "epoch": 14.19, "learning_rate": 1.4326527394560506e-05, "loss": 0.0315, "step": 36000 }, { "epoch": 14.58, "learning_rate": 1.4168860859282619e-05, "loss": 0.0275, "step": 37000 }, { "epoch": 14.98, "learning_rate": 1.401135199054001e-05, "loss": 0.0431, "step": 38000 }, { "epoch": 15.37, "learning_rate": 1.3853843121797399e-05, "loss": 0.034, "step": 39000 }, { "epoch": 15.77, "learning_rate": 1.3696176586519511e-05, "loss": 0.0275, "step": 40000 }, { "epoch": 15.77, "eval_accuracy": 0.9868346385200317, "eval_f1": 0.9522240527182867, "eval_loss": 0.022756921127438545, "eval_precision": 0.9657477025898078, "eval_recall": 0.9390739236393176, "eval_runtime": 29.4347, "eval_samples_per_second": 299.34, "eval_steps_per_second": 37.439, "step": 40000 }, { "epoch": 16.16, "learning_rate": 1.3538510051241624e-05, "loss": 0.0285, "step": 41000 }, { "epoch": 16.55, "learning_rate": 1.3381001182499016e-05, "loss": 0.0286, "step": 42000 }, { "epoch": 16.95, "learning_rate": 1.3223334647221128e-05, "loss": 0.0279, "step": 43000 }, { "epoch": 17.34, "learning_rate": 1.306566811194324e-05, "loss": 0.0254, "step": 44000 }, { "epoch": 17.74, "learning_rate": 1.2908001576665355e-05, "loss": 0.0239, "step": 45000 }, { "epoch": 18.13, "learning_rate": 1.2750335041387468e-05, "loss": 0.0269, "step": 46000 }, { "epoch": 18.53, "learning_rate": 1.259266850610958e-05, "loss": 0.0268, "step": 47000 }, { "epoch": 18.92, "learning_rate": 1.2435001970831693e-05, "loss": 0.0242, "step": 48000 }, { "epoch": 19.31, "learning_rate": 1.2277493102089081e-05, "loss": 0.0268, "step": 49000 }, { "epoch": 19.71, "learning_rate": 1.2119984233346473e-05, "loss": 0.0268, "step": 50000 }, { "epoch": 19.71, "eval_accuracy": 0.9872886165020996, "eval_f1": 0.9541734860883797, "eval_loss": 0.017343735322356224, "eval_precision": 0.9612530915086562, "eval_recall": 0.9471974004874086, "eval_runtime": 29.9772, "eval_samples_per_second": 293.923, "eval_steps_per_second": 36.761, "step": 50000 }, { "epoch": 20.1, "learning_rate": 1.1962317698068587e-05, "loss": 0.0238, "step": 51000 }, { "epoch": 20.5, "learning_rate": 1.1804808829325975e-05, "loss": 0.0279, "step": 52000 }, { "epoch": 20.89, "learning_rate": 1.1647142294048088e-05, "loss": 0.0251, "step": 53000 }, { "epoch": 21.28, "learning_rate": 1.148963342530548e-05, "loss": 0.0258, "step": 54000 }, { "epoch": 21.68, "learning_rate": 1.1331966890027592e-05, "loss": 0.0233, "step": 55000 }, { "epoch": 22.07, "learning_rate": 1.1174458021284984e-05, "loss": 0.0281, "step": 56000 }, { "epoch": 22.47, "learning_rate": 1.1016949152542374e-05, "loss": 0.0255, "step": 57000 }, { "epoch": 22.86, "learning_rate": 1.0859282617264486e-05, "loss": 0.0251, "step": 58000 }, { "epoch": 23.26, "learning_rate": 1.0701616081986599e-05, "loss": 0.0252, "step": 59000 }, { "epoch": 23.65, "learning_rate": 1.0543949546708712e-05, "loss": 0.0192, "step": 60000 }, { "epoch": 23.65, "eval_accuracy": 0.9854727045738282, "eval_f1": 0.9498825371965545, "eval_loss": 0.01524350605905056, "eval_precision": 0.9168556311413454, "eval_recall": 0.9853777416734363, "eval_runtime": 29.585, "eval_samples_per_second": 297.82, "eval_steps_per_second": 37.249, "step": 60000 }, { "epoch": 24.04, "learning_rate": 1.0386440677966103e-05, "loss": 0.0223, "step": 61000 }, { "epoch": 24.44, "learning_rate": 1.0228774142688216e-05, "loss": 0.0221, "step": 62000 }, { "epoch": 24.83, "learning_rate": 1.0071107607410328e-05, "loss": 0.0232, "step": 63000 }, { "epoch": 25.23, "learning_rate": 9.913441072132441e-06, "loss": 0.0215, "step": 64000 }, { "epoch": 25.62, "learning_rate": 9.755774536854554e-06, "loss": 0.0221, "step": 65000 }, { "epoch": 26.01, "learning_rate": 9.598265668111943e-06, "loss": 0.0216, "step": 66000 }, { "epoch": 26.41, "learning_rate": 9.440599132834056e-06, "loss": 0.0215, "step": 67000 }, { "epoch": 26.8, "learning_rate": 9.282932597556169e-06, "loss": 0.02, "step": 68000 }, { "epoch": 27.2, "learning_rate": 9.12542372881356e-06, "loss": 0.0227, "step": 69000 }, { "epoch": 27.59, "learning_rate": 8.967757193535673e-06, "loss": 0.018, "step": 70000 }, { "epoch": 27.59, "eval_accuracy": 0.9919418908182953, "eval_f1": 0.9716113554578168, "eval_loss": 0.011301785707473755, "eval_precision": 0.9566929133858267, "eval_recall": 0.9870024370430545, "eval_runtime": 29.5941, "eval_samples_per_second": 297.728, "eval_steps_per_second": 37.237, "step": 70000 }, { "epoch": 27.99, "learning_rate": 8.810248324793063e-06, "loss": 0.0211, "step": 71000 }, { "epoch": 28.38, "learning_rate": 8.652581789515177e-06, "loss": 0.0199, "step": 72000 }, { "epoch": 28.77, "learning_rate": 8.49491525423729e-06, "loss": 0.0197, "step": 73000 }, { "epoch": 29.17, "learning_rate": 8.337248718959402e-06, "loss": 0.0216, "step": 74000 }, { "epoch": 29.56, "learning_rate": 8.17989751675207e-06, "loss": 0.0177, "step": 75000 }, { "epoch": 29.96, "learning_rate": 8.022230981474182e-06, "loss": 0.0193, "step": 76000 }, { "epoch": 30.35, "learning_rate": 7.864564446196295e-06, "loss": 0.0167, "step": 77000 }, { "epoch": 30.74, "learning_rate": 7.706897910918409e-06, "loss": 0.0181, "step": 78000 }, { "epoch": 31.14, "learning_rate": 7.549231375640521e-06, "loss": 0.0194, "step": 79000 }, { "epoch": 31.53, "learning_rate": 7.3915648403626335e-06, "loss": 0.0155, "step": 80000 }, { "epoch": 31.53, "eval_accuracy": 0.9923958688003632, "eval_f1": 0.9729947601773479, "eval_loss": 0.012764379382133484, "eval_precision": 0.9656, "eval_recall": 0.9805036555645816, "eval_runtime": 29.6436, "eval_samples_per_second": 297.231, "eval_steps_per_second": 37.175, "step": 80000 }, { "epoch": 31.93, "learning_rate": 7.233898305084747e-06, "loss": 0.018, "step": 81000 }, { "epoch": 32.32, "learning_rate": 7.076389436342137e-06, "loss": 0.0184, "step": 82000 }, { "epoch": 32.72, "learning_rate": 6.9187229010642495e-06, "loss": 0.0169, "step": 83000 }, { "epoch": 33.11, "learning_rate": 6.76121403232164e-06, "loss": 0.0172, "step": 84000 }, { "epoch": 33.5, "learning_rate": 6.603547497043753e-06, "loss": 0.0191, "step": 85000 }, { "epoch": 33.9, "learning_rate": 6.446038628301144e-06, "loss": 0.0136, "step": 86000 }, { "epoch": 34.29, "learning_rate": 6.288372093023256e-06, "loss": 0.0173, "step": 87000 }, { "epoch": 34.69, "learning_rate": 6.130705557745369e-06, "loss": 0.0161, "step": 88000 }, { "epoch": 35.08, "learning_rate": 5.97319668900276e-06, "loss": 0.0143, "step": 89000 }, { "epoch": 35.47, "learning_rate": 5.815530153724872e-06, "loss": 0.0142, "step": 90000 }, { "epoch": 35.47, "eval_accuracy": 0.9943252752241516, "eval_f1": 0.9798549556809025, "eval_loss": 0.00998597126454115, "eval_precision": 0.9720223820943246, "eval_recall": 0.9878147847278635, "eval_runtime": 29.6518, "eval_samples_per_second": 297.149, "eval_steps_per_second": 37.165, "step": 90000 }, { "epoch": 35.87, "learning_rate": 5.657863618446985e-06, "loss": 0.0157, "step": 91000 }, { "epoch": 36.26, "learning_rate": 5.500354749704376e-06, "loss": 0.0164, "step": 92000 }, { "epoch": 36.66, "learning_rate": 5.342688214426488e-06, "loss": 0.0159, "step": 93000 }, { "epoch": 37.05, "learning_rate": 5.18517934568388e-06, "loss": 0.0134, "step": 94000 }, { "epoch": 37.45, "learning_rate": 5.0275128104059924e-06, "loss": 0.0148, "step": 95000 }, { "epoch": 37.84, "learning_rate": 4.870003941663382e-06, "loss": 0.0119, "step": 96000 }, { "epoch": 38.23, "learning_rate": 4.712337406385495e-06, "loss": 0.0116, "step": 97000 }, { "epoch": 38.63, "learning_rate": 4.554670871107608e-06, "loss": 0.0128, "step": 98000 }, { "epoch": 39.02, "learning_rate": 4.397162002364998e-06, "loss": 0.016, "step": 99000 }, { "epoch": 39.42, "learning_rate": 4.239495467087111e-06, "loss": 0.016, "step": 100000 }, { "epoch": 39.42, "eval_accuracy": 0.9950062421972534, "eval_f1": 0.9822437449556094, "eval_loss": 0.009744029492139816, "eval_precision": 0.9759422614274258, "eval_recall": 0.9886271324126726, "eval_runtime": 29.5778, "eval_samples_per_second": 297.892, "eval_steps_per_second": 37.258, "step": 100000 }, { "epoch": 39.81, "learning_rate": 4.081986598344502e-06, "loss": 0.0126, "step": 101000 }, { "epoch": 40.2, "learning_rate": 3.924320063066614e-06, "loss": 0.015, "step": 102000 }, { "epoch": 40.6, "learning_rate": 3.7666535277887274e-06, "loss": 0.0116, "step": 103000 }, { "epoch": 40.99, "learning_rate": 3.60898699251084e-06, "loss": 0.013, "step": 104000 }, { "epoch": 41.39, "learning_rate": 3.4514781237682304e-06, "loss": 0.0119, "step": 105000 }, { "epoch": 41.78, "learning_rate": 3.2938115884903434e-06, "loss": 0.0117, "step": 106000 }, { "epoch": 42.18, "learning_rate": 3.136145053212456e-06, "loss": 0.0107, "step": 107000 }, { "epoch": 42.57, "learning_rate": 2.9786361844698463e-06, "loss": 0.0103, "step": 108000 }, { "epoch": 42.96, "learning_rate": 2.8209696491919594e-06, "loss": 0.0137, "step": 109000 }, { "epoch": 43.36, "learning_rate": 2.663303113914072e-06, "loss": 0.0122, "step": 110000 }, { "epoch": 43.36, "eval_accuracy": 0.9962546816479401, "eval_f1": 0.9865689865689866, "eval_loss": 0.010858706198632717, "eval_precision": 0.9885807504078303, "eval_recall": 0.9845653939886271, "eval_runtime": 29.5587, "eval_samples_per_second": 298.085, "eval_steps_per_second": 37.282, "step": 110000 }, { "epoch": 43.75, "learning_rate": 2.5056365786361845e-06, "loss": 0.0104, "step": 111000 }, { "epoch": 44.15, "learning_rate": 2.3481277098935753e-06, "loss": 0.013, "step": 112000 }, { "epoch": 44.54, "learning_rate": 2.190461174615688e-06, "loss": 0.0095, "step": 113000 }, { "epoch": 44.93, "learning_rate": 2.0329523058730783e-06, "loss": 0.0114, "step": 114000 }, { "epoch": 45.33, "learning_rate": 1.8752857705951913e-06, "loss": 0.0116, "step": 115000 }, { "epoch": 45.72, "learning_rate": 1.7176192353173041e-06, "loss": 0.0084, "step": 116000 }, { "epoch": 46.12, "learning_rate": 1.559952700039417e-06, "loss": 0.0134, "step": 117000 }, { "epoch": 46.51, "learning_rate": 1.4024438312968075e-06, "loss": 0.0116, "step": 118000 }, { "epoch": 46.91, "learning_rate": 1.2449349625541981e-06, "loss": 0.0116, "step": 119000 }, { "epoch": 47.3, "learning_rate": 1.0872684272763107e-06, "loss": 0.0087, "step": 120000 }, { "epoch": 47.3, "eval_accuracy": 0.9962546816479401, "eval_f1": 0.9865799105327369, "eval_loss": 0.010331220924854279, "eval_precision": 0.987785016286645, "eval_recall": 0.9853777416734363, "eval_runtime": 29.6133, "eval_samples_per_second": 297.536, "eval_steps_per_second": 37.213, "step": 120000 }, { "epoch": 47.69, "learning_rate": 9.296018919984234e-07, "loss": 0.0127, "step": 121000 }, { "epoch": 48.09, "learning_rate": 7.719353567205361e-07, "loss": 0.0113, "step": 122000 }, { "epoch": 48.48, "learning_rate": 6.144264879779267e-07, "loss": 0.0103, "step": 123000 }, { "epoch": 48.88, "learning_rate": 4.5675995270003944e-07, "loss": 0.0078, "step": 124000 }, { "epoch": 49.27, "learning_rate": 2.992510839574301e-07, "loss": 0.0095, "step": 125000 }, { "epoch": 49.66, "learning_rate": 1.4158454867954278e-07, "loss": 0.0097, "step": 126000 } ], "logging_steps": 1000, "max_steps": 126850, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }