{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.984974958263773, "eval_steps": 10, "global_step": 447, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "learning_rate": 3e-05, "loss": 1.5293, "step": 10 }, { "epoch": 0.07, "eval_accuracy": 0.3519736842105263, "eval_loss": 0.6503910422325134, "eval_runtime": 16.3619, "eval_samples_per_second": 18.58, "eval_steps_per_second": 4.645, "step": 10 }, { "epoch": 0.13, "learning_rate": 3e-05, "loss": 0.6652, "step": 20 }, { "epoch": 0.13, "eval_accuracy": 0.3782894736842105, "eval_loss": 0.6469289660453796, "eval_runtime": 16.3596, "eval_samples_per_second": 18.582, "eval_steps_per_second": 4.646, "step": 20 }, { "epoch": 0.2, "learning_rate": 3e-05, "loss": 0.6523, "step": 30 }, { "epoch": 0.2, "eval_accuracy": 0.3651315789473684, "eval_loss": 0.6429744362831116, "eval_runtime": 16.3534, "eval_samples_per_second": 18.589, "eval_steps_per_second": 4.647, "step": 30 }, { "epoch": 0.27, "learning_rate": 3e-05, "loss": 0.613, "step": 40 }, { "epoch": 0.27, "eval_accuracy": 0.40789473684210525, "eval_loss": 0.6341487765312195, "eval_runtime": 16.4446, "eval_samples_per_second": 18.486, "eval_steps_per_second": 4.622, "step": 40 }, { "epoch": 0.33, "learning_rate": 3e-05, "loss": 0.6586, "step": 50 }, { "epoch": 0.33, "eval_accuracy": 0.3881578947368421, "eval_loss": 0.6206462979316711, "eval_runtime": 16.366, "eval_samples_per_second": 18.575, "eval_steps_per_second": 4.644, "step": 50 }, { "epoch": 0.4, "learning_rate": 3e-05, "loss": 0.586, "step": 60 }, { "epoch": 0.4, "eval_accuracy": 0.41776315789473684, "eval_loss": 0.6268599033355713, "eval_runtime": 16.3556, "eval_samples_per_second": 18.587, "eval_steps_per_second": 4.647, "step": 60 }, { "epoch": 0.47, "learning_rate": 3e-05, "loss": 0.594, "step": 70 }, { "epoch": 0.47, "eval_accuracy": 0.4276315789473684, "eval_loss": 0.6045505404472351, "eval_runtime": 16.3658, "eval_samples_per_second": 18.575, "eval_steps_per_second": 4.644, "step": 70 }, { "epoch": 0.53, "learning_rate": 3e-05, "loss": 0.6063, "step": 80 }, { "epoch": 0.53, "eval_accuracy": 0.41776315789473684, "eval_loss": 0.6135305762290955, "eval_runtime": 16.3648, "eval_samples_per_second": 18.576, "eval_steps_per_second": 4.644, "step": 80 }, { "epoch": 0.6, "learning_rate": 3e-05, "loss": 0.5988, "step": 90 }, { "epoch": 0.6, "eval_accuracy": 0.4276315789473684, "eval_loss": 0.6097424626350403, "eval_runtime": 16.3755, "eval_samples_per_second": 18.564, "eval_steps_per_second": 4.641, "step": 90 }, { "epoch": 0.67, "learning_rate": 3e-05, "loss": 0.6217, "step": 100 }, { "epoch": 0.67, "eval_accuracy": 0.45394736842105265, "eval_loss": 0.6098220348358154, "eval_runtime": 16.3674, "eval_samples_per_second": 18.574, "eval_steps_per_second": 4.643, "step": 100 }, { "epoch": 0.73, "learning_rate": 3e-05, "loss": 0.5817, "step": 110 }, { "epoch": 0.73, "eval_accuracy": 0.45394736842105265, "eval_loss": 0.6021943092346191, "eval_runtime": 16.3678, "eval_samples_per_second": 18.573, "eval_steps_per_second": 4.643, "step": 110 }, { "epoch": 0.8, "learning_rate": 3e-05, "loss": 0.6219, "step": 120 }, { "epoch": 0.8, "eval_accuracy": 0.45723684210526316, "eval_loss": 0.5926041007041931, "eval_runtime": 16.3627, "eval_samples_per_second": 18.579, "eval_steps_per_second": 4.645, "step": 120 }, { "epoch": 0.87, "learning_rate": 3e-05, "loss": 0.559, "step": 130 }, { "epoch": 0.87, "eval_accuracy": 0.4605263157894737, "eval_loss": 0.5816267728805542, "eval_runtime": 16.3673, "eval_samples_per_second": 18.574, "eval_steps_per_second": 4.643, "step": 130 }, { "epoch": 0.93, "learning_rate": 3e-05, "loss": 0.5514, "step": 140 }, { "epoch": 0.93, "eval_accuracy": 0.47368421052631576, "eval_loss": 0.5783373713493347, "eval_runtime": 16.368, "eval_samples_per_second": 18.573, "eval_steps_per_second": 4.643, "step": 140 }, { "epoch": 1.0, "learning_rate": 3e-05, "loss": 0.59, "step": 150 }, { "epoch": 1.0, "eval_accuracy": 0.4868421052631579, "eval_loss": 0.5621668100357056, "eval_runtime": 16.3648, "eval_samples_per_second": 18.576, "eval_steps_per_second": 4.644, "step": 150 }, { "epoch": 1.07, "learning_rate": 3e-05, "loss": 0.46, "step": 160 }, { "epoch": 1.07, "eval_accuracy": 0.48026315789473684, "eval_loss": 0.5867581367492676, "eval_runtime": 16.3579, "eval_samples_per_second": 18.584, "eval_steps_per_second": 4.646, "step": 160 }, { "epoch": 1.14, "learning_rate": 3e-05, "loss": 0.4484, "step": 170 }, { "epoch": 1.14, "eval_accuracy": 0.4868421052631579, "eval_loss": 0.5666611194610596, "eval_runtime": 16.3613, "eval_samples_per_second": 18.58, "eval_steps_per_second": 4.645, "step": 170 }, { "epoch": 1.2, "learning_rate": 3e-05, "loss": 0.4162, "step": 180 }, { "epoch": 1.2, "eval_accuracy": 0.48026315789473684, "eval_loss": 0.5819750428199768, "eval_runtime": 16.3725, "eval_samples_per_second": 18.568, "eval_steps_per_second": 4.642, "step": 180 }, { "epoch": 1.27, "learning_rate": 3e-05, "loss": 0.4716, "step": 190 }, { "epoch": 1.27, "eval_accuracy": 0.46381578947368424, "eval_loss": 0.590432345867157, "eval_runtime": 16.361, "eval_samples_per_second": 18.581, "eval_steps_per_second": 4.645, "step": 190 }, { "epoch": 1.34, "learning_rate": 3e-05, "loss": 0.4486, "step": 200 }, { "epoch": 1.34, "eval_accuracy": 0.5098684210526315, "eval_loss": 0.5777420997619629, "eval_runtime": 16.354, "eval_samples_per_second": 18.589, "eval_steps_per_second": 4.647, "step": 200 }, { "epoch": 1.4, "learning_rate": 3e-05, "loss": 0.4264, "step": 210 }, { "epoch": 1.4, "eval_accuracy": 0.4967105263157895, "eval_loss": 0.6482496857643127, "eval_runtime": 16.3696, "eval_samples_per_second": 18.571, "eval_steps_per_second": 4.643, "step": 210 }, { "epoch": 1.47, "learning_rate": 3e-05, "loss": 0.4236, "step": 220 }, { "epoch": 1.47, "eval_accuracy": 0.5032894736842105, "eval_loss": 0.5741015076637268, "eval_runtime": 16.3648, "eval_samples_per_second": 18.576, "eval_steps_per_second": 4.644, "step": 220 }, { "epoch": 1.54, "learning_rate": 3e-05, "loss": 0.4141, "step": 230 }, { "epoch": 1.54, "eval_accuracy": 0.5164473684210527, "eval_loss": 0.5607666373252869, "eval_runtime": 16.3604, "eval_samples_per_second": 18.581, "eval_steps_per_second": 4.645, "step": 230 }, { "epoch": 1.6, "learning_rate": 3e-05, "loss": 0.4308, "step": 240 }, { "epoch": 1.6, "eval_accuracy": 0.5098684210526315, "eval_loss": 0.5539161562919617, "eval_runtime": 16.3597, "eval_samples_per_second": 18.582, "eval_steps_per_second": 4.646, "step": 240 }, { "epoch": 1.67, "learning_rate": 3e-05, "loss": 0.4505, "step": 250 }, { "epoch": 1.67, "eval_accuracy": 0.5032894736842105, "eval_loss": 0.5494562387466431, "eval_runtime": 16.3671, "eval_samples_per_second": 18.574, "eval_steps_per_second": 4.643, "step": 250 }, { "epoch": 1.74, "learning_rate": 3e-05, "loss": 0.3958, "step": 260 }, { "epoch": 1.74, "eval_accuracy": 0.5098684210526315, "eval_loss": 0.5593812465667725, "eval_runtime": 16.3598, "eval_samples_per_second": 18.582, "eval_steps_per_second": 4.646, "step": 260 }, { "epoch": 1.8, "learning_rate": 3e-05, "loss": 0.4432, "step": 270 }, { "epoch": 1.8, "eval_accuracy": 0.5164473684210527, "eval_loss": 0.5492013096809387, "eval_runtime": 16.3635, "eval_samples_per_second": 18.578, "eval_steps_per_second": 4.644, "step": 270 }, { "epoch": 1.87, "learning_rate": 3e-05, "loss": 0.4067, "step": 280 }, { "epoch": 1.87, "eval_accuracy": 0.506578947368421, "eval_loss": 0.6023809313774109, "eval_runtime": 16.3658, "eval_samples_per_second": 18.575, "eval_steps_per_second": 4.644, "step": 280 }, { "epoch": 1.94, "learning_rate": 3e-05, "loss": 0.3988, "step": 290 }, { "epoch": 1.94, "eval_accuracy": 0.5098684210526315, "eval_loss": 0.5606762766838074, "eval_runtime": 16.3642, "eval_samples_per_second": 18.577, "eval_steps_per_second": 4.644, "step": 290 }, { "epoch": 2.0, "learning_rate": 3e-05, "loss": 0.3992, "step": 300 }, { "epoch": 2.0, "eval_accuracy": 0.5164473684210527, "eval_loss": 0.5669550895690918, "eval_runtime": 16.3688, "eval_samples_per_second": 18.572, "eval_steps_per_second": 4.643, "step": 300 }, { "epoch": 2.07, "learning_rate": 3e-05, "loss": 0.2304, "step": 310 }, { "epoch": 2.07, "eval_accuracy": 0.5361842105263158, "eval_loss": 0.8199814558029175, "eval_runtime": 16.3644, "eval_samples_per_second": 18.577, "eval_steps_per_second": 4.644, "step": 310 }, { "epoch": 2.14, "learning_rate": 3e-05, "loss": 0.1696, "step": 320 }, { "epoch": 2.14, "eval_accuracy": 0.5296052631578947, "eval_loss": 0.9087279438972473, "eval_runtime": 16.362, "eval_samples_per_second": 18.58, "eval_steps_per_second": 4.645, "step": 320 }, { "epoch": 2.2, "learning_rate": 3e-05, "loss": 0.2255, "step": 330 }, { "epoch": 2.2, "eval_accuracy": 0.5361842105263158, "eval_loss": 0.7565640211105347, "eval_runtime": 16.3681, "eval_samples_per_second": 18.573, "eval_steps_per_second": 4.643, "step": 330 }, { "epoch": 2.27, "learning_rate": 3e-05, "loss": 0.1923, "step": 340 }, { "epoch": 2.27, "eval_accuracy": 0.5197368421052632, "eval_loss": 0.701989471912384, "eval_runtime": 16.3692, "eval_samples_per_second": 18.571, "eval_steps_per_second": 4.643, "step": 340 }, { "epoch": 2.34, "learning_rate": 3e-05, "loss": 0.281, "step": 350 }, { "epoch": 2.34, "eval_accuracy": 0.5032894736842105, "eval_loss": 0.6653422117233276, "eval_runtime": 16.3703, "eval_samples_per_second": 18.57, "eval_steps_per_second": 4.643, "step": 350 }, { "epoch": 2.4, "learning_rate": 3e-05, "loss": 0.2311, "step": 360 }, { "epoch": 2.4, "eval_accuracy": 0.5131578947368421, "eval_loss": 0.6411683559417725, "eval_runtime": 16.3702, "eval_samples_per_second": 18.57, "eval_steps_per_second": 4.643, "step": 360 }, { "epoch": 2.47, "learning_rate": 3e-05, "loss": 0.1523, "step": 370 }, { "epoch": 2.47, "eval_accuracy": 0.5230263157894737, "eval_loss": 0.8846220970153809, "eval_runtime": 16.3728, "eval_samples_per_second": 18.567, "eval_steps_per_second": 4.642, "step": 370 }, { "epoch": 2.54, "learning_rate": 3e-05, "loss": 0.2451, "step": 380 }, { "epoch": 2.54, "eval_accuracy": 0.5164473684210527, "eval_loss": 0.9251819252967834, "eval_runtime": 16.379, "eval_samples_per_second": 18.56, "eval_steps_per_second": 4.64, "step": 380 }, { "epoch": 2.6, "learning_rate": 3e-05, "loss": 0.2022, "step": 390 }, { "epoch": 2.6, "eval_accuracy": 0.5197368421052632, "eval_loss": 0.7422206401824951, "eval_runtime": 16.3727, "eval_samples_per_second": 18.567, "eval_steps_per_second": 4.642, "step": 390 }, { "epoch": 2.67, "learning_rate": 3e-05, "loss": 0.217, "step": 400 }, { "epoch": 2.67, "eval_accuracy": 0.5328947368421053, "eval_loss": 0.7557851076126099, "eval_runtime": 16.3638, "eval_samples_per_second": 18.578, "eval_steps_per_second": 4.644, "step": 400 }, { "epoch": 2.74, "learning_rate": 3e-05, "loss": 0.165, "step": 410 }, { "epoch": 2.74, "eval_accuracy": 0.5427631578947368, "eval_loss": 0.7846018075942993, "eval_runtime": 16.3674, "eval_samples_per_second": 18.574, "eval_steps_per_second": 4.643, "step": 410 }, { "epoch": 2.8, "learning_rate": 3e-05, "loss": 0.2025, "step": 420 }, { "epoch": 2.8, "eval_accuracy": 0.5230263157894737, "eval_loss": 0.725389301776886, "eval_runtime": 16.3775, "eval_samples_per_second": 18.562, "eval_steps_per_second": 4.641, "step": 420 }, { "epoch": 2.87, "learning_rate": 3e-05, "loss": 0.2201, "step": 430 }, { "epoch": 2.87, "eval_accuracy": 0.5296052631578947, "eval_loss": 0.6530900001525879, "eval_runtime": 16.3694, "eval_samples_per_second": 18.571, "eval_steps_per_second": 4.643, "step": 430 }, { "epoch": 2.94, "learning_rate": 3e-05, "loss": 0.2037, "step": 440 }, { "epoch": 2.94, "eval_accuracy": 0.5493421052631579, "eval_loss": 0.7827179431915283, "eval_runtime": 16.3682, "eval_samples_per_second": 18.573, "eval_steps_per_second": 4.643, "step": 440 }, { "epoch": 2.98, "step": 447, "total_flos": 1.09848374069035e+17, "train_loss": 0.43748926956381573, "train_runtime": 1886.9121, "train_samples_per_second": 3.809, "train_steps_per_second": 0.237 } ], "logging_steps": 10, "max_steps": 447, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "total_flos": 1.09848374069035e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }