{ "best_metric": 1.6187845468521118, "best_model_checkpoint": "./Sustainability_model/checkpoint-2000", "epoch": 1.220703125, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01220703125, "grad_norm": 3.0088555812835693, "learning_rate": 2e-05, "loss": 2.1582, "step": 25 }, { "epoch": 0.0244140625, "grad_norm": 5.197660446166992, "learning_rate": 2e-05, "loss": 2.0856, "step": 50 }, { "epoch": 0.03662109375, "grad_norm": 3.234564781188965, "learning_rate": 2e-05, "loss": 1.9269, "step": 75 }, { "epoch": 0.048828125, "grad_norm": 7.08390474319458, "learning_rate": 2e-05, "loss": 1.888, "step": 100 }, { "epoch": 0.048828125, "eval_loss": 1.8261231184005737, "eval_runtime": 590.9102, "eval_samples_per_second": 3.468, "eval_steps_per_second": 0.435, "step": 100 }, { "epoch": 0.06103515625, "grad_norm": 3.1646361351013184, "learning_rate": 2e-05, "loss": 1.8649, "step": 125 }, { "epoch": 0.0732421875, "grad_norm": 6.104555130004883, "learning_rate": 2e-05, "loss": 1.742, "step": 150 }, { "epoch": 0.08544921875, "grad_norm": 2.9724113941192627, "learning_rate": 2e-05, "loss": 1.7567, "step": 175 }, { "epoch": 0.09765625, "grad_norm": 6.2468791007995605, "learning_rate": 2e-05, "loss": 1.7452, "step": 200 }, { "epoch": 0.09765625, "eval_loss": 1.7315690517425537, "eval_runtime": 590.974, "eval_samples_per_second": 3.467, "eval_steps_per_second": 0.435, "step": 200 }, { "epoch": 0.10986328125, "grad_norm": 2.97963285446167, "learning_rate": 2e-05, "loss": 1.6694, "step": 225 }, { "epoch": 0.1220703125, "grad_norm": 4.771264553070068, "learning_rate": 2e-05, "loss": 1.6833, "step": 250 }, { "epoch": 0.13427734375, "grad_norm": 2.825491428375244, "learning_rate": 2e-05, "loss": 1.6958, "step": 275 }, { "epoch": 0.146484375, "grad_norm": 4.647068977355957, "learning_rate": 2e-05, "loss": 1.7428, "step": 300 }, { "epoch": 0.146484375, "eval_loss": 1.6999598741531372, "eval_runtime": 590.2857, "eval_samples_per_second": 3.471, "eval_steps_per_second": 0.435, "step": 300 }, { "epoch": 0.15869140625, "grad_norm": 3.1953535079956055, "learning_rate": 2e-05, "loss": 1.7458, "step": 325 }, { "epoch": 0.1708984375, "grad_norm": 5.5873799324035645, "learning_rate": 2e-05, "loss": 1.6244, "step": 350 }, { "epoch": 0.18310546875, "grad_norm": 2.5425360202789307, "learning_rate": 2e-05, "loss": 1.6862, "step": 375 }, { "epoch": 0.1953125, "grad_norm": 4.082971572875977, "learning_rate": 2e-05, "loss": 1.6836, "step": 400 }, { "epoch": 0.1953125, "eval_loss": 1.6864606142044067, "eval_runtime": 589.1989, "eval_samples_per_second": 3.478, "eval_steps_per_second": 0.436, "step": 400 }, { "epoch": 0.20751953125, "grad_norm": 2.6709253787994385, "learning_rate": 2e-05, "loss": 1.6939, "step": 425 }, { "epoch": 0.2197265625, "grad_norm": 5.410455703735352, "learning_rate": 2e-05, "loss": 1.5974, "step": 450 }, { "epoch": 0.23193359375, "grad_norm": 2.8631389141082764, "learning_rate": 2e-05, "loss": 1.6609, "step": 475 }, { "epoch": 0.244140625, "grad_norm": 3.2581229209899902, "learning_rate": 2e-05, "loss": 1.6251, "step": 500 }, { "epoch": 0.244140625, "eval_loss": 1.67488431930542, "eval_runtime": 589.2638, "eval_samples_per_second": 3.477, "eval_steps_per_second": 0.436, "step": 500 }, { "epoch": 0.25634765625, "grad_norm": 2.8811697959899902, "learning_rate": 2e-05, "loss": 1.7135, "step": 525 }, { "epoch": 0.2685546875, "grad_norm": 5.96162748336792, "learning_rate": 2e-05, "loss": 1.6709, "step": 550 }, { "epoch": 0.28076171875, "grad_norm": 2.4651806354522705, "learning_rate": 2e-05, "loss": 1.6504, "step": 575 }, { "epoch": 0.29296875, "grad_norm": 4.032615661621094, "learning_rate": 2e-05, "loss": 1.7128, "step": 600 }, { "epoch": 0.29296875, "eval_loss": 1.668798565864563, "eval_runtime": 589.1105, "eval_samples_per_second": 3.478, "eval_steps_per_second": 0.436, "step": 600 }, { "epoch": 0.30517578125, "grad_norm": 2.694554328918457, "learning_rate": 2e-05, "loss": 1.7093, "step": 625 }, { "epoch": 0.3173828125, "grad_norm": 4.213258743286133, "learning_rate": 2e-05, "loss": 1.6899, "step": 650 }, { "epoch": 0.32958984375, "grad_norm": 2.69679594039917, "learning_rate": 2e-05, "loss": 1.6451, "step": 675 }, { "epoch": 0.341796875, "grad_norm": 3.6988604068756104, "learning_rate": 2e-05, "loss": 1.631, "step": 700 }, { "epoch": 0.341796875, "eval_loss": 1.662984013557434, "eval_runtime": 588.5535, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 700 }, { "epoch": 0.35400390625, "grad_norm": 2.6815237998962402, "learning_rate": 2e-05, "loss": 1.688, "step": 725 }, { "epoch": 0.3662109375, "grad_norm": 5.819088459014893, "learning_rate": 2e-05, "loss": 1.6649, "step": 750 }, { "epoch": 0.37841796875, "grad_norm": 2.524092674255371, "learning_rate": 2e-05, "loss": 1.6305, "step": 775 }, { "epoch": 0.390625, "grad_norm": 4.0569963455200195, "learning_rate": 2e-05, "loss": 1.6493, "step": 800 }, { "epoch": 0.390625, "eval_loss": 1.6568603515625, "eval_runtime": 588.2081, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 800 }, { "epoch": 0.40283203125, "grad_norm": 2.565763473510742, "learning_rate": 2e-05, "loss": 1.6983, "step": 825 }, { "epoch": 0.4150390625, "grad_norm": 6.5800676345825195, "learning_rate": 2e-05, "loss": 1.6565, "step": 850 }, { "epoch": 0.42724609375, "grad_norm": 2.1741669178009033, "learning_rate": 2e-05, "loss": 1.7585, "step": 875 }, { "epoch": 0.439453125, "grad_norm": 3.838252305984497, "learning_rate": 2e-05, "loss": 1.6141, "step": 900 }, { "epoch": 0.439453125, "eval_loss": 1.6529587507247925, "eval_runtime": 588.0827, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.437, "step": 900 }, { "epoch": 0.45166015625, "grad_norm": 4.486364841461182, "learning_rate": 2e-05, "loss": 1.6489, "step": 925 }, { "epoch": 0.4638671875, "grad_norm": 3.693453311920166, "learning_rate": 2e-05, "loss": 1.6026, "step": 950 }, { "epoch": 0.47607421875, "grad_norm": 2.4286513328552246, "learning_rate": 2e-05, "loss": 1.5639, "step": 975 }, { "epoch": 0.48828125, "grad_norm": 3.9820656776428223, "learning_rate": 2e-05, "loss": 1.6621, "step": 1000 }, { "epoch": 0.48828125, "eval_loss": 1.6506658792495728, "eval_runtime": 588.1468, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.437, "step": 1000 }, { "epoch": 0.50048828125, "grad_norm": 2.915191411972046, "learning_rate": 2e-05, "loss": 1.6281, "step": 1025 }, { "epoch": 0.5126953125, "grad_norm": 4.406491756439209, "learning_rate": 2e-05, "loss": 1.7108, "step": 1050 }, { "epoch": 0.52490234375, "grad_norm": 2.6505398750305176, "learning_rate": 2e-05, "loss": 1.7151, "step": 1075 }, { "epoch": 0.537109375, "grad_norm": 3.872833728790283, "learning_rate": 2e-05, "loss": 1.5925, "step": 1100 }, { "epoch": 0.537109375, "eval_loss": 1.6442919969558716, "eval_runtime": 588.2624, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 1100 }, { "epoch": 0.54931640625, "grad_norm": 2.210282802581787, "learning_rate": 2e-05, "loss": 1.5845, "step": 1125 }, { "epoch": 0.5615234375, "grad_norm": 3.7344298362731934, "learning_rate": 2e-05, "loss": 1.5994, "step": 1150 }, { "epoch": 0.57373046875, "grad_norm": 2.3247945308685303, "learning_rate": 2e-05, "loss": 1.622, "step": 1175 }, { "epoch": 0.5859375, "grad_norm": 4.974765300750732, "learning_rate": 2e-05, "loss": 1.6571, "step": 1200 }, { "epoch": 0.5859375, "eval_loss": 1.6453276872634888, "eval_runtime": 588.5916, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 1200 }, { "epoch": 0.59814453125, "grad_norm": 2.6029038429260254, "learning_rate": 2e-05, "loss": 1.6854, "step": 1225 }, { "epoch": 0.6103515625, "grad_norm": 3.8252599239349365, "learning_rate": 2e-05, "loss": 1.6875, "step": 1250 }, { "epoch": 0.62255859375, "grad_norm": 2.5335938930511475, "learning_rate": 2e-05, "loss": 1.5917, "step": 1275 }, { "epoch": 0.634765625, "grad_norm": 3.6627395153045654, "learning_rate": 2e-05, "loss": 1.6078, "step": 1300 }, { "epoch": 0.634765625, "eval_loss": 1.638580322265625, "eval_runtime": 588.7972, "eval_samples_per_second": 3.48, "eval_steps_per_second": 0.436, "step": 1300 }, { "epoch": 0.64697265625, "grad_norm": 2.5015482902526855, "learning_rate": 2e-05, "loss": 1.6793, "step": 1325 }, { "epoch": 0.6591796875, "grad_norm": 3.70072340965271, "learning_rate": 2e-05, "loss": 1.661, "step": 1350 }, { "epoch": 0.67138671875, "grad_norm": 2.6039609909057617, "learning_rate": 2e-05, "loss": 1.6349, "step": 1375 }, { "epoch": 0.68359375, "grad_norm": 3.3291618824005127, "learning_rate": 2e-05, "loss": 1.616, "step": 1400 }, { "epoch": 0.68359375, "eval_loss": 1.6347644329071045, "eval_runtime": 588.5837, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 1400 }, { "epoch": 0.69580078125, "grad_norm": 2.6853315830230713, "learning_rate": 2e-05, "loss": 1.7087, "step": 1425 }, { "epoch": 0.7080078125, "grad_norm": 3.296851396560669, "learning_rate": 2e-05, "loss": 1.6676, "step": 1450 }, { "epoch": 0.72021484375, "grad_norm": 2.3841185569763184, "learning_rate": 2e-05, "loss": 1.6212, "step": 1475 }, { "epoch": 0.732421875, "grad_norm": 3.612088441848755, "learning_rate": 2e-05, "loss": 1.6473, "step": 1500 }, { "epoch": 0.732421875, "eval_loss": 1.6339186429977417, "eval_runtime": 588.3073, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 1500 }, { "epoch": 0.74462890625, "grad_norm": 2.6555330753326416, "learning_rate": 2e-05, "loss": 1.6643, "step": 1525 }, { "epoch": 0.7568359375, "grad_norm": 4.533504486083984, "learning_rate": 2e-05, "loss": 1.6236, "step": 1550 }, { "epoch": 0.76904296875, "grad_norm": 2.2276220321655273, "learning_rate": 2e-05, "loss": 1.6783, "step": 1575 }, { "epoch": 0.78125, "grad_norm": 3.533113956451416, "learning_rate": 2e-05, "loss": 1.6123, "step": 1600 }, { "epoch": 0.78125, "eval_loss": 1.628023386001587, "eval_runtime": 588.6386, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 1600 }, { "epoch": 0.79345703125, "grad_norm": 2.2332117557525635, "learning_rate": 2e-05, "loss": 1.6795, "step": 1625 }, { "epoch": 0.8056640625, "grad_norm": 4.059207916259766, "learning_rate": 2e-05, "loss": 1.5915, "step": 1650 }, { "epoch": 0.81787109375, "grad_norm": 2.46692156791687, "learning_rate": 2e-05, "loss": 1.6456, "step": 1675 }, { "epoch": 0.830078125, "grad_norm": 3.602611780166626, "learning_rate": 2e-05, "loss": 1.564, "step": 1700 }, { "epoch": 0.830078125, "eval_loss": 1.6274890899658203, "eval_runtime": 588.2617, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 1700 }, { "epoch": 0.84228515625, "grad_norm": 2.20896315574646, "learning_rate": 2e-05, "loss": 1.6469, "step": 1725 }, { "epoch": 0.8544921875, "grad_norm": 4.329638481140137, "learning_rate": 2e-05, "loss": 1.5571, "step": 1750 }, { "epoch": 0.86669921875, "grad_norm": 1.9945570230484009, "learning_rate": 2e-05, "loss": 1.6461, "step": 1775 }, { "epoch": 0.87890625, "grad_norm": 3.428687334060669, "learning_rate": 2e-05, "loss": 1.6564, "step": 1800 }, { "epoch": 0.87890625, "eval_loss": 1.6232744455337524, "eval_runtime": 588.0784, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.437, "step": 1800 }, { "epoch": 0.89111328125, "grad_norm": 2.5266592502593994, "learning_rate": 2e-05, "loss": 1.5607, "step": 1825 }, { "epoch": 0.9033203125, "grad_norm": 3.4067883491516113, "learning_rate": 2e-05, "loss": 1.6394, "step": 1850 }, { "epoch": 0.91552734375, "grad_norm": 2.0028152465820312, "learning_rate": 2e-05, "loss": 1.6908, "step": 1875 }, { "epoch": 0.927734375, "grad_norm": 2.8983733654022217, "learning_rate": 2e-05, "loss": 1.5646, "step": 1900 }, { "epoch": 0.927734375, "eval_loss": 1.6202832460403442, "eval_runtime": 587.8115, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.437, "step": 1900 }, { "epoch": 0.93994140625, "grad_norm": 2.6408419609069824, "learning_rate": 2e-05, "loss": 1.5905, "step": 1925 }, { "epoch": 0.9521484375, "grad_norm": 3.899275302886963, "learning_rate": 2e-05, "loss": 1.6138, "step": 1950 }, { "epoch": 0.96435546875, "grad_norm": 2.338137149810791, "learning_rate": 2e-05, "loss": 1.6963, "step": 1975 }, { "epoch": 0.9765625, "grad_norm": 3.6352951526641846, "learning_rate": 2e-05, "loss": 1.5849, "step": 2000 }, { "epoch": 0.9765625, "eval_loss": 1.6187845468521118, "eval_runtime": 587.8791, "eval_samples_per_second": 3.485, "eval_steps_per_second": 0.437, "step": 2000 }, { "epoch": 0.98876953125, "grad_norm": 2.4254846572875977, "learning_rate": 2e-05, "loss": 1.6391, "step": 2025 }, { "epoch": 1.0009765625, "grad_norm": 2.079317569732666, "learning_rate": 2e-05, "loss": 1.6238, "step": 2050 }, { "epoch": 1.01318359375, "grad_norm": 2.1677002906799316, "learning_rate": 2e-05, "loss": 1.5543, "step": 2075 }, { "epoch": 1.025390625, "grad_norm": 2.4266505241394043, "learning_rate": 2e-05, "loss": 1.4812, "step": 2100 }, { "epoch": 1.025390625, "eval_loss": 1.6256210803985596, "eval_runtime": 585.954, "eval_samples_per_second": 3.497, "eval_steps_per_second": 0.439, "step": 2100 }, { "epoch": 1.03759765625, "grad_norm": 2.4697976112365723, "learning_rate": 2e-05, "loss": 1.5147, "step": 2125 }, { "epoch": 1.0498046875, "grad_norm": 2.3185527324676514, "learning_rate": 2e-05, "loss": 1.5198, "step": 2150 }, { "epoch": 1.06201171875, "grad_norm": 2.7304463386535645, "learning_rate": 2e-05, "loss": 1.5237, "step": 2175 }, { "epoch": 1.07421875, "grad_norm": 2.616072177886963, "learning_rate": 2e-05, "loss": 1.5598, "step": 2200 }, { "epoch": 1.07421875, "eval_loss": 1.623382568359375, "eval_runtime": 586.1381, "eval_samples_per_second": 3.496, "eval_steps_per_second": 0.438, "step": 2200 }, { "epoch": 1.08642578125, "grad_norm": 2.7308809757232666, "learning_rate": 2e-05, "loss": 1.5691, "step": 2225 }, { "epoch": 1.0986328125, "grad_norm": 2.6916451454162598, "learning_rate": 2e-05, "loss": 1.5102, "step": 2250 }, { "epoch": 1.11083984375, "grad_norm": 2.960580348968506, "learning_rate": 2e-05, "loss": 1.539, "step": 2275 }, { "epoch": 1.123046875, "grad_norm": 2.5936009883880615, "learning_rate": 2e-05, "loss": 1.5657, "step": 2300 }, { "epoch": 1.123046875, "eval_loss": 1.6226788759231567, "eval_runtime": 586.4284, "eval_samples_per_second": 3.494, "eval_steps_per_second": 0.438, "step": 2300 }, { "epoch": 1.13525390625, "grad_norm": 2.8930952548980713, "learning_rate": 2e-05, "loss": 1.4579, "step": 2325 }, { "epoch": 1.1474609375, "grad_norm": 2.8736538887023926, "learning_rate": 2e-05, "loss": 1.5127, "step": 2350 }, { "epoch": 1.15966796875, "grad_norm": 4.384296894073486, "learning_rate": 2e-05, "loss": 1.5988, "step": 2375 }, { "epoch": 1.171875, "grad_norm": 2.728992223739624, "learning_rate": 2e-05, "loss": 1.51, "step": 2400 }, { "epoch": 1.171875, "eval_loss": 1.6226541996002197, "eval_runtime": 586.345, "eval_samples_per_second": 3.495, "eval_steps_per_second": 0.438, "step": 2400 }, { "epoch": 1.18408203125, "grad_norm": 2.651820421218872, "learning_rate": 2e-05, "loss": 1.5226, "step": 2425 }, { "epoch": 1.1962890625, "grad_norm": 2.717193126678467, "learning_rate": 2e-05, "loss": 1.4966, "step": 2450 }, { "epoch": 1.20849609375, "grad_norm": 2.9759628772735596, "learning_rate": 2e-05, "loss": 1.526, "step": 2475 }, { "epoch": 1.220703125, "grad_norm": 2.8832080364227295, "learning_rate": 2e-05, "loss": 1.5452, "step": 2500 }, { "epoch": 1.220703125, "eval_loss": 1.6226392984390259, "eval_runtime": 586.3744, "eval_samples_per_second": 3.494, "eval_steps_per_second": 0.438, "step": 2500 } ], "logging_steps": 25, "max_steps": 4096, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 6, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.924062136972083e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }