{ "best_metric": 1.6244161128997803, "best_model_checkpoint": "./output/checkpoints/2024-06-11_10-58-33/checkpoint-30", "epoch": 1.0, "eval_steps": 1, "global_step": 37, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02702702702702703, "grad_norm": 3.2274646759033203, "learning_rate": 0.0001, "loss": 5.7374, "step": 1 }, { "epoch": 0.02702702702702703, "eval_loss": 5.6431050300598145, "eval_runtime": 10.9746, "eval_samples_per_second": 11.299, "eval_steps_per_second": 0.729, "step": 1 }, { "epoch": 0.05405405405405406, "grad_norm": 3.1417839527130127, "learning_rate": 0.0002, "loss": 5.6423, "step": 2 }, { "epoch": 0.05405405405405406, "eval_loss": 5.1099138259887695, "eval_runtime": 11.0219, "eval_samples_per_second": 11.25, "eval_steps_per_second": 0.726, "step": 2 }, { "epoch": 0.08108108108108109, "grad_norm": 3.1990153789520264, "learning_rate": 0.00030000000000000003, "loss": 5.0948, "step": 3 }, { "epoch": 0.08108108108108109, "eval_loss": 3.559605836868286, "eval_runtime": 11.0991, "eval_samples_per_second": 11.172, "eval_steps_per_second": 0.721, "step": 3 }, { "epoch": 0.10810810810810811, "grad_norm": 3.2903366088867188, "learning_rate": 0.0004, "loss": 3.4375, "step": 4 }, { "epoch": 0.10810810810810811, "eval_loss": 2.3610196113586426, "eval_runtime": 11.0573, "eval_samples_per_second": 11.214, "eval_steps_per_second": 0.724, "step": 4 }, { "epoch": 0.13513513513513514, "grad_norm": 1.878879189491272, "learning_rate": 0.0003878787878787879, "loss": 2.2693, "step": 5 }, { "epoch": 0.13513513513513514, "eval_loss": 1.8543975353240967, "eval_runtime": 11.1541, "eval_samples_per_second": 11.117, "eval_steps_per_second": 0.717, "step": 5 }, { "epoch": 0.16216216216216217, "grad_norm": 1.2040495872497559, "learning_rate": 0.0003757575757575758, "loss": 1.7546, "step": 6 }, { "epoch": 0.16216216216216217, "eval_loss": 1.7222554683685303, "eval_runtime": 11.0742, "eval_samples_per_second": 11.197, "eval_steps_per_second": 0.722, "step": 6 }, { "epoch": 0.1891891891891892, "grad_norm": 1.080614447593689, "learning_rate": 0.00036363636363636367, "loss": 1.6633, "step": 7 }, { "epoch": 0.1891891891891892, "eval_loss": 1.610931158065796, "eval_runtime": 11.0872, "eval_samples_per_second": 11.184, "eval_steps_per_second": 0.722, "step": 7 }, { "epoch": 0.21621621621621623, "grad_norm": 0.28874385356903076, "learning_rate": 0.00035151515151515155, "loss": 1.5122, "step": 8 }, { "epoch": 0.21621621621621623, "eval_loss": 1.5804481506347656, "eval_runtime": 11.053, "eval_samples_per_second": 11.219, "eval_steps_per_second": 0.724, "step": 8 }, { "epoch": 0.24324324324324326, "grad_norm": 0.32991790771484375, "learning_rate": 0.00033939393939393943, "loss": 1.4316, "step": 9 }, { "epoch": 0.24324324324324326, "eval_loss": 1.5746935606002808, "eval_runtime": 11.152, "eval_samples_per_second": 11.119, "eval_steps_per_second": 0.717, "step": 9 }, { "epoch": 0.2702702702702703, "grad_norm": 0.5137693285942078, "learning_rate": 0.0003272727272727273, "loss": 1.3161, "step": 10 }, { "epoch": 0.2702702702702703, "eval_loss": 1.651396632194519, "eval_runtime": 11.1562, "eval_samples_per_second": 11.115, "eval_steps_per_second": 0.717, "step": 10 }, { "epoch": 0.2972972972972973, "grad_norm": 0.25246673822402954, "learning_rate": 0.00031515151515151515, "loss": 1.207, "step": 11 }, { "epoch": 0.2972972972972973, "eval_loss": 1.7246230840682983, "eval_runtime": 11.1298, "eval_samples_per_second": 11.141, "eval_steps_per_second": 0.719, "step": 11 }, { "epoch": 0.32432432432432434, "grad_norm": 0.2032381296157837, "learning_rate": 0.00030303030303030303, "loss": 1.158, "step": 12 }, { "epoch": 0.32432432432432434, "eval_loss": 1.7255425453186035, "eval_runtime": 11.0733, "eval_samples_per_second": 11.198, "eval_steps_per_second": 0.722, "step": 12 }, { "epoch": 0.35135135135135137, "grad_norm": 0.2133413404226303, "learning_rate": 0.0002909090909090909, "loss": 1.1137, "step": 13 }, { "epoch": 0.35135135135135137, "eval_loss": 1.6880252361297607, "eval_runtime": 11.2007, "eval_samples_per_second": 11.071, "eval_steps_per_second": 0.714, "step": 13 }, { "epoch": 0.3783783783783784, "grad_norm": 0.20175401866436005, "learning_rate": 0.0002787878787878788, "loss": 1.1059, "step": 14 }, { "epoch": 0.3783783783783784, "eval_loss": 1.6500831842422485, "eval_runtime": 11.1367, "eval_samples_per_second": 11.134, "eval_steps_per_second": 0.718, "step": 14 }, { "epoch": 0.40540540540540543, "grad_norm": 0.22595511376857758, "learning_rate": 0.0002666666666666667, "loss": 1.0483, "step": 15 }, { "epoch": 0.40540540540540543, "eval_loss": 1.6288588047027588, "eval_runtime": 11.1914, "eval_samples_per_second": 11.08, "eval_steps_per_second": 0.715, "step": 15 }, { "epoch": 0.43243243243243246, "grad_norm": 0.17468485236167908, "learning_rate": 0.00025454545454545456, "loss": 1.0584, "step": 16 }, { "epoch": 0.43243243243243246, "eval_loss": 1.6247642040252686, "eval_runtime": 11.1035, "eval_samples_per_second": 11.168, "eval_steps_per_second": 0.72, "step": 16 }, { "epoch": 0.4594594594594595, "grad_norm": 0.1654416024684906, "learning_rate": 0.00024242424242424245, "loss": 1.0402, "step": 17 }, { "epoch": 0.4594594594594595, "eval_loss": 1.6316722631454468, "eval_runtime": 11.2065, "eval_samples_per_second": 11.065, "eval_steps_per_second": 0.714, "step": 17 }, { "epoch": 0.4864864864864865, "grad_norm": 0.10361829400062561, "learning_rate": 0.00023030303030303033, "loss": 1.0301, "step": 18 }, { "epoch": 0.4864864864864865, "eval_loss": 1.6415338516235352, "eval_runtime": 11.18, "eval_samples_per_second": 11.091, "eval_steps_per_second": 0.716, "step": 18 }, { "epoch": 0.5135135135135135, "grad_norm": 0.09156349301338196, "learning_rate": 0.00021818181818181818, "loss": 1.0183, "step": 19 }, { "epoch": 0.5135135135135135, "eval_loss": 1.6544169187545776, "eval_runtime": 11.1626, "eval_samples_per_second": 11.109, "eval_steps_per_second": 0.717, "step": 19 }, { "epoch": 0.5405405405405406, "grad_norm": 0.087005615234375, "learning_rate": 0.00020606060606060607, "loss": 1.028, "step": 20 }, { "epoch": 0.5405405405405406, "eval_loss": 1.6620415449142456, "eval_runtime": 11.2393, "eval_samples_per_second": 11.033, "eval_steps_per_second": 0.712, "step": 20 }, { "epoch": 0.5675675675675675, "grad_norm": 0.09235216677188873, "learning_rate": 0.00019393939393939395, "loss": 0.9825, "step": 21 }, { "epoch": 0.5675675675675675, "eval_loss": 1.6642476320266724, "eval_runtime": 11.2278, "eval_samples_per_second": 11.044, "eval_steps_per_second": 0.713, "step": 21 }, { "epoch": 0.5945945945945946, "grad_norm": 0.0915454775094986, "learning_rate": 0.00018181818181818183, "loss": 0.9991, "step": 22 }, { "epoch": 0.5945945945945946, "eval_loss": 1.6625572443008423, "eval_runtime": 11.1424, "eval_samples_per_second": 11.129, "eval_steps_per_second": 0.718, "step": 22 }, { "epoch": 0.6216216216216216, "grad_norm": 0.09213992953300476, "learning_rate": 0.00016969696969696972, "loss": 1.0211, "step": 23 }, { "epoch": 0.6216216216216216, "eval_loss": 1.6593235731124878, "eval_runtime": 11.1978, "eval_samples_per_second": 11.074, "eval_steps_per_second": 0.714, "step": 23 }, { "epoch": 0.6486486486486487, "grad_norm": 0.0854020044207573, "learning_rate": 0.00015757575757575757, "loss": 1.0291, "step": 24 }, { "epoch": 0.6486486486486487, "eval_loss": 1.6526458263397217, "eval_runtime": 11.2323, "eval_samples_per_second": 11.04, "eval_steps_per_second": 0.712, "step": 24 }, { "epoch": 0.6756756756756757, "grad_norm": 0.08045388758182526, "learning_rate": 0.00014545454545454546, "loss": 0.9887, "step": 25 }, { "epoch": 0.6756756756756757, "eval_loss": 1.6451815366744995, "eval_runtime": 11.1905, "eval_samples_per_second": 11.081, "eval_steps_per_second": 0.715, "step": 25 }, { "epoch": 0.7027027027027027, "grad_norm": 0.07576093822717667, "learning_rate": 0.00013333333333333334, "loss": 1.0044, "step": 26 }, { "epoch": 0.7027027027027027, "eval_loss": 1.6377238035202026, "eval_runtime": 11.1714, "eval_samples_per_second": 11.1, "eval_steps_per_second": 0.716, "step": 26 }, { "epoch": 0.7297297297297297, "grad_norm": 0.07311829924583435, "learning_rate": 0.00012121212121212122, "loss": 0.9772, "step": 27 }, { "epoch": 0.7297297297297297, "eval_loss": 1.6314424276351929, "eval_runtime": 11.1489, "eval_samples_per_second": 11.122, "eval_steps_per_second": 0.718, "step": 27 }, { "epoch": 0.7567567567567568, "grad_norm": 0.07776332646608353, "learning_rate": 0.00010909090909090909, "loss": 0.9902, "step": 28 }, { "epoch": 0.7567567567567568, "eval_loss": 1.625641942024231, "eval_runtime": 11.1261, "eval_samples_per_second": 11.145, "eval_steps_per_second": 0.719, "step": 28 }, { "epoch": 0.7837837837837838, "grad_norm": 0.07536856085062027, "learning_rate": 9.696969696969698e-05, "loss": 0.9902, "step": 29 }, { "epoch": 0.7837837837837838, "eval_loss": 1.6233930587768555, "eval_runtime": 11.1754, "eval_samples_per_second": 11.096, "eval_steps_per_second": 0.716, "step": 29 }, { "epoch": 0.8108108108108109, "grad_norm": 0.07941398024559021, "learning_rate": 8.484848484848486e-05, "loss": 0.9784, "step": 30 }, { "epoch": 0.8108108108108109, "eval_loss": 1.6244161128997803, "eval_runtime": 11.2511, "eval_samples_per_second": 11.021, "eval_steps_per_second": 0.711, "step": 30 }, { "epoch": 0.8378378378378378, "grad_norm": 0.07617861032485962, "learning_rate": 7.272727272727273e-05, "loss": 1.0064, "step": 31 }, { "epoch": 0.8378378378378378, "eval_loss": 1.62636399269104, "eval_runtime": 11.1098, "eval_samples_per_second": 11.161, "eval_steps_per_second": 0.72, "step": 31 }, { "epoch": 0.8648648648648649, "grad_norm": 0.06959453225135803, "learning_rate": 6.060606060606061e-05, "loss": 0.9764, "step": 32 }, { "epoch": 0.8648648648648649, "eval_loss": 1.6286530494689941, "eval_runtime": 11.2497, "eval_samples_per_second": 11.023, "eval_steps_per_second": 0.711, "step": 32 }, { "epoch": 0.8918918918918919, "grad_norm": 0.07171300053596497, "learning_rate": 4.848484848484849e-05, "loss": 0.9921, "step": 33 }, { "epoch": 0.8918918918918919, "eval_loss": 1.630918264389038, "eval_runtime": 11.1794, "eval_samples_per_second": 11.092, "eval_steps_per_second": 0.716, "step": 33 }, { "epoch": 0.918918918918919, "grad_norm": 0.07644116133451462, "learning_rate": 3.6363636363636364e-05, "loss": 0.9716, "step": 34 }, { "epoch": 0.918918918918919, "eval_loss": 1.63330078125, "eval_runtime": 11.1352, "eval_samples_per_second": 11.136, "eval_steps_per_second": 0.718, "step": 34 }, { "epoch": 0.9459459459459459, "grad_norm": 0.07242273539304733, "learning_rate": 2.4242424242424244e-05, "loss": 0.9781, "step": 35 }, { "epoch": 0.9459459459459459, "eval_loss": 1.634429931640625, "eval_runtime": 11.203, "eval_samples_per_second": 11.069, "eval_steps_per_second": 0.714, "step": 35 }, { "epoch": 0.972972972972973, "grad_norm": 0.069486603140831, "learning_rate": 1.2121212121212122e-05, "loss": 0.9592, "step": 36 }, { "epoch": 0.972972972972973, "eval_loss": 1.6349563598632812, "eval_runtime": 11.1077, "eval_samples_per_second": 11.163, "eval_steps_per_second": 0.72, "step": 36 }, { "epoch": 1.0, "grad_norm": 0.07558272778987885, "learning_rate": 0.0, "loss": 0.9368, "step": 37 }, { "epoch": 1.0, "eval_loss": 1.6352812051773071, "eval_runtime": 11.1164, "eval_samples_per_second": 11.155, "eval_steps_per_second": 0.72, "step": 37 }, { "epoch": 1.0, "step": 37, "total_flos": 1.3641878835560448e+16, "train_loss": 1.5526673584371, "train_runtime": 758.4004, "train_samples_per_second": 1.552, "train_steps_per_second": 0.049 } ], "logging_steps": 1, "max_steps": 37, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3641878835560448e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }