|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.984974958263773, |
|
"eval_steps": 10, |
|
"global_step": 447, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5293, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_accuracy": 0.3519736842105263, |
|
"eval_loss": 0.6503910422325134, |
|
"eval_runtime": 16.3619, |
|
"eval_samples_per_second": 18.58, |
|
"eval_steps_per_second": 4.645, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6652, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_accuracy": 0.3782894736842105, |
|
"eval_loss": 0.6469289660453796, |
|
"eval_runtime": 16.3596, |
|
"eval_samples_per_second": 18.582, |
|
"eval_steps_per_second": 4.646, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6523, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_accuracy": 0.3651315789473684, |
|
"eval_loss": 0.6429744362831116, |
|
"eval_runtime": 16.3534, |
|
"eval_samples_per_second": 18.589, |
|
"eval_steps_per_second": 4.647, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 3e-05, |
|
"loss": 0.613, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_accuracy": 0.40789473684210525, |
|
"eval_loss": 0.6341487765312195, |
|
"eval_runtime": 16.4446, |
|
"eval_samples_per_second": 18.486, |
|
"eval_steps_per_second": 4.622, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6586, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_accuracy": 0.3881578947368421, |
|
"eval_loss": 0.6206462979316711, |
|
"eval_runtime": 16.366, |
|
"eval_samples_per_second": 18.575, |
|
"eval_steps_per_second": 4.644, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.586, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_accuracy": 0.41776315789473684, |
|
"eval_loss": 0.6268599033355713, |
|
"eval_runtime": 16.3556, |
|
"eval_samples_per_second": 18.587, |
|
"eval_steps_per_second": 4.647, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 3e-05, |
|
"loss": 0.594, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_accuracy": 0.4276315789473684, |
|
"eval_loss": 0.6045505404472351, |
|
"eval_runtime": 16.3658, |
|
"eval_samples_per_second": 18.575, |
|
"eval_steps_per_second": 4.644, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6063, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.41776315789473684, |
|
"eval_loss": 0.6135305762290955, |
|
"eval_runtime": 16.3648, |
|
"eval_samples_per_second": 18.576, |
|
"eval_steps_per_second": 4.644, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5988, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_accuracy": 0.4276315789473684, |
|
"eval_loss": 0.6097424626350403, |
|
"eval_runtime": 16.3755, |
|
"eval_samples_per_second": 18.564, |
|
"eval_steps_per_second": 4.641, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6217, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_accuracy": 0.45394736842105265, |
|
"eval_loss": 0.6098220348358154, |
|
"eval_runtime": 16.3674, |
|
"eval_samples_per_second": 18.574, |
|
"eval_steps_per_second": 4.643, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5817, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_accuracy": 0.45394736842105265, |
|
"eval_loss": 0.6021943092346191, |
|
"eval_runtime": 16.3678, |
|
"eval_samples_per_second": 18.573, |
|
"eval_steps_per_second": 4.643, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6219, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_accuracy": 0.45723684210526316, |
|
"eval_loss": 0.5926041007041931, |
|
"eval_runtime": 16.3627, |
|
"eval_samples_per_second": 18.579, |
|
"eval_steps_per_second": 4.645, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 3e-05, |
|
"loss": 0.559, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_accuracy": 0.4605263157894737, |
|
"eval_loss": 0.5816267728805542, |
|
"eval_runtime": 16.3673, |
|
"eval_samples_per_second": 18.574, |
|
"eval_steps_per_second": 4.643, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5514, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_accuracy": 0.47368421052631576, |
|
"eval_loss": 0.5783373713493347, |
|
"eval_runtime": 16.368, |
|
"eval_samples_per_second": 18.573, |
|
"eval_steps_per_second": 4.643, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 3e-05, |
|
"loss": 0.59, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.4868421052631579, |
|
"eval_loss": 0.5621668100357056, |
|
"eval_runtime": 16.3648, |
|
"eval_samples_per_second": 18.576, |
|
"eval_steps_per_second": 4.644, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"learning_rate": 3e-05, |
|
"loss": 0.46, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_accuracy": 0.48026315789473684, |
|
"eval_loss": 0.5867581367492676, |
|
"eval_runtime": 16.3579, |
|
"eval_samples_per_second": 18.584, |
|
"eval_steps_per_second": 4.646, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4484, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"eval_accuracy": 0.4868421052631579, |
|
"eval_loss": 0.5666611194610596, |
|
"eval_runtime": 16.3613, |
|
"eval_samples_per_second": 18.58, |
|
"eval_steps_per_second": 4.645, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4162, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_accuracy": 0.48026315789473684, |
|
"eval_loss": 0.5819750428199768, |
|
"eval_runtime": 16.3725, |
|
"eval_samples_per_second": 18.568, |
|
"eval_steps_per_second": 4.642, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4716, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"eval_accuracy": 0.46381578947368424, |
|
"eval_loss": 0.590432345867157, |
|
"eval_runtime": 16.361, |
|
"eval_samples_per_second": 18.581, |
|
"eval_steps_per_second": 4.645, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4486, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"eval_accuracy": 0.5098684210526315, |
|
"eval_loss": 0.5777420997619629, |
|
"eval_runtime": 16.354, |
|
"eval_samples_per_second": 18.589, |
|
"eval_steps_per_second": 4.647, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4264, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_accuracy": 0.4967105263157895, |
|
"eval_loss": 0.6482496857643127, |
|
"eval_runtime": 16.3696, |
|
"eval_samples_per_second": 18.571, |
|
"eval_steps_per_second": 4.643, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4236, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"eval_accuracy": 0.5032894736842105, |
|
"eval_loss": 0.5741015076637268, |
|
"eval_runtime": 16.3648, |
|
"eval_samples_per_second": 18.576, |
|
"eval_steps_per_second": 4.644, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4141, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"eval_accuracy": 0.5164473684210527, |
|
"eval_loss": 0.5607666373252869, |
|
"eval_runtime": 16.3604, |
|
"eval_samples_per_second": 18.581, |
|
"eval_steps_per_second": 4.645, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4308, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_accuracy": 0.5098684210526315, |
|
"eval_loss": 0.5539161562919617, |
|
"eval_runtime": 16.3597, |
|
"eval_samples_per_second": 18.582, |
|
"eval_steps_per_second": 4.646, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4505, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"eval_accuracy": 0.5032894736842105, |
|
"eval_loss": 0.5494562387466431, |
|
"eval_runtime": 16.3671, |
|
"eval_samples_per_second": 18.574, |
|
"eval_steps_per_second": 4.643, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"learning_rate": 3e-05, |
|
"loss": 0.3958, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"eval_accuracy": 0.5098684210526315, |
|
"eval_loss": 0.5593812465667725, |
|
"eval_runtime": 16.3598, |
|
"eval_samples_per_second": 18.582, |
|
"eval_steps_per_second": 4.646, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4432, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_accuracy": 0.5164473684210527, |
|
"eval_loss": 0.5492013096809387, |
|
"eval_runtime": 16.3635, |
|
"eval_samples_per_second": 18.578, |
|
"eval_steps_per_second": 4.644, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4067, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"eval_accuracy": 0.506578947368421, |
|
"eval_loss": 0.6023809313774109, |
|
"eval_runtime": 16.3658, |
|
"eval_samples_per_second": 18.575, |
|
"eval_steps_per_second": 4.644, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 3e-05, |
|
"loss": 0.3988, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_accuracy": 0.5098684210526315, |
|
"eval_loss": 0.5606762766838074, |
|
"eval_runtime": 16.3642, |
|
"eval_samples_per_second": 18.577, |
|
"eval_steps_per_second": 4.644, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 3e-05, |
|
"loss": 0.3992, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5164473684210527, |
|
"eval_loss": 0.5669550895690918, |
|
"eval_runtime": 16.3688, |
|
"eval_samples_per_second": 18.572, |
|
"eval_steps_per_second": 4.643, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2304, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"eval_accuracy": 0.5361842105263158, |
|
"eval_loss": 0.8199814558029175, |
|
"eval_runtime": 16.3644, |
|
"eval_samples_per_second": 18.577, |
|
"eval_steps_per_second": 4.644, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1696, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"eval_accuracy": 0.5296052631578947, |
|
"eval_loss": 0.9087279438972473, |
|
"eval_runtime": 16.362, |
|
"eval_samples_per_second": 18.58, |
|
"eval_steps_per_second": 4.645, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2255, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"eval_accuracy": 0.5361842105263158, |
|
"eval_loss": 0.7565640211105347, |
|
"eval_runtime": 16.3681, |
|
"eval_samples_per_second": 18.573, |
|
"eval_steps_per_second": 4.643, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1923, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"eval_accuracy": 0.5197368421052632, |
|
"eval_loss": 0.701989471912384, |
|
"eval_runtime": 16.3692, |
|
"eval_samples_per_second": 18.571, |
|
"eval_steps_per_second": 4.643, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"learning_rate": 3e-05, |
|
"loss": 0.281, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"eval_accuracy": 0.5032894736842105, |
|
"eval_loss": 0.6653422117233276, |
|
"eval_runtime": 16.3703, |
|
"eval_samples_per_second": 18.57, |
|
"eval_steps_per_second": 4.643, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2311, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_accuracy": 0.5131578947368421, |
|
"eval_loss": 0.6411683559417725, |
|
"eval_runtime": 16.3702, |
|
"eval_samples_per_second": 18.57, |
|
"eval_steps_per_second": 4.643, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1523, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"eval_accuracy": 0.5230263157894737, |
|
"eval_loss": 0.8846220970153809, |
|
"eval_runtime": 16.3728, |
|
"eval_samples_per_second": 18.567, |
|
"eval_steps_per_second": 4.642, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2451, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"eval_accuracy": 0.5164473684210527, |
|
"eval_loss": 0.9251819252967834, |
|
"eval_runtime": 16.379, |
|
"eval_samples_per_second": 18.56, |
|
"eval_steps_per_second": 4.64, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2022, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_accuracy": 0.5197368421052632, |
|
"eval_loss": 0.7422206401824951, |
|
"eval_runtime": 16.3727, |
|
"eval_samples_per_second": 18.567, |
|
"eval_steps_per_second": 4.642, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"learning_rate": 3e-05, |
|
"loss": 0.217, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"eval_accuracy": 0.5328947368421053, |
|
"eval_loss": 0.7557851076126099, |
|
"eval_runtime": 16.3638, |
|
"eval_samples_per_second": 18.578, |
|
"eval_steps_per_second": 4.644, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 3e-05, |
|
"loss": 0.165, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"eval_accuracy": 0.5427631578947368, |
|
"eval_loss": 0.7846018075942993, |
|
"eval_runtime": 16.3674, |
|
"eval_samples_per_second": 18.574, |
|
"eval_steps_per_second": 4.643, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2025, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_accuracy": 0.5230263157894737, |
|
"eval_loss": 0.725389301776886, |
|
"eval_runtime": 16.3775, |
|
"eval_samples_per_second": 18.562, |
|
"eval_steps_per_second": 4.641, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2201, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"eval_accuracy": 0.5296052631578947, |
|
"eval_loss": 0.6530900001525879, |
|
"eval_runtime": 16.3694, |
|
"eval_samples_per_second": 18.571, |
|
"eval_steps_per_second": 4.643, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2037, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"eval_accuracy": 0.5493421052631579, |
|
"eval_loss": 0.7827179431915283, |
|
"eval_runtime": 16.3682, |
|
"eval_samples_per_second": 18.573, |
|
"eval_steps_per_second": 4.643, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"step": 447, |
|
"total_flos": 1.09848374069035e+17, |
|
"train_loss": 0.43748926956381573, |
|
"train_runtime": 1886.9121, |
|
"train_samples_per_second": 3.809, |
|
"train_steps_per_second": 0.237 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 447, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 250, |
|
"total_flos": 1.09848374069035e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|