{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.003338898163606, "eval_steps": 10, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "learning_rate": 3e-05, "loss": 1.5245, "step": 10 }, { "epoch": 0.07, "eval_accuracy": 0.3355263157894737, "eval_loss": 0.6506821513175964, "eval_runtime": 16.9243, "eval_samples_per_second": 17.962, "eval_steps_per_second": 4.491, "step": 10 }, { "epoch": 0.13, "learning_rate": 3e-05, "loss": 0.6666, "step": 20 }, { "epoch": 0.13, "eval_accuracy": 0.3815789473684211, "eval_loss": 0.6464425325393677, "eval_runtime": 16.9195, "eval_samples_per_second": 17.967, "eval_steps_per_second": 4.492, "step": 20 }, { "epoch": 0.2, "learning_rate": 3e-05, "loss": 0.6527, "step": 30 }, { "epoch": 0.2, "eval_accuracy": 0.3684210526315789, "eval_loss": 0.6426967978477478, "eval_runtime": 16.9282, "eval_samples_per_second": 17.958, "eval_steps_per_second": 4.49, "step": 30 }, { "epoch": 0.27, "learning_rate": 3e-05, "loss": 0.6168, "step": 40 }, { "epoch": 0.27, "eval_accuracy": 0.3980263157894737, "eval_loss": 0.6321499943733215, "eval_runtime": 17.0092, "eval_samples_per_second": 17.873, "eval_steps_per_second": 4.468, "step": 40 }, { "epoch": 0.33, "learning_rate": 3e-05, "loss": 0.6584, "step": 50 }, { "epoch": 0.33, "eval_accuracy": 0.39144736842105265, "eval_loss": 0.6181844472885132, "eval_runtime": 16.9419, "eval_samples_per_second": 17.944, "eval_steps_per_second": 4.486, "step": 50 }, { "epoch": 0.4, "learning_rate": 3e-05, "loss": 0.586, "step": 60 }, { "epoch": 0.4, "eval_accuracy": 0.4144736842105263, "eval_loss": 0.6244160532951355, "eval_runtime": 16.9269, "eval_samples_per_second": 17.96, "eval_steps_per_second": 4.49, "step": 60 }, { "epoch": 0.47, "learning_rate": 3e-05, "loss": 0.5924, "step": 70 }, { "epoch": 0.47, "eval_accuracy": 0.4342105263157895, "eval_loss": 0.6033625602722168, "eval_runtime": 16.9236, "eval_samples_per_second": 17.963, "eval_steps_per_second": 4.491, "step": 70 }, { "epoch": 0.53, "learning_rate": 3e-05, "loss": 0.6069, "step": 80 }, { "epoch": 0.53, "eval_accuracy": 0.4375, "eval_loss": 0.6096391677856445, "eval_runtime": 16.9238, "eval_samples_per_second": 17.963, "eval_steps_per_second": 4.491, "step": 80 }, { "epoch": 0.6, "learning_rate": 3e-05, "loss": 0.5999, "step": 90 }, { "epoch": 0.6, "eval_accuracy": 0.4407894736842105, "eval_loss": 0.6095999479293823, "eval_runtime": 16.9342, "eval_samples_per_second": 17.952, "eval_steps_per_second": 4.488, "step": 90 }, { "epoch": 0.67, "learning_rate": 3e-05, "loss": 0.6206, "step": 100 }, { "epoch": 0.67, "eval_accuracy": 0.45723684210526316, "eval_loss": 0.607021152973175, "eval_runtime": 16.9304, "eval_samples_per_second": 17.956, "eval_steps_per_second": 4.489, "step": 100 }, { "epoch": 0.73, "learning_rate": 3e-05, "loss": 0.5793, "step": 110 }, { "epoch": 0.73, "eval_accuracy": 0.45723684210526316, "eval_loss": 0.601601243019104, "eval_runtime": 16.9375, "eval_samples_per_second": 17.948, "eval_steps_per_second": 4.487, "step": 110 }, { "epoch": 0.8, "learning_rate": 3e-05, "loss": 0.6208, "step": 120 }, { "epoch": 0.8, "eval_accuracy": 0.4605263157894737, "eval_loss": 0.5902404189109802, "eval_runtime": 16.924, "eval_samples_per_second": 17.963, "eval_steps_per_second": 4.491, "step": 120 }, { "epoch": 0.87, "learning_rate": 3e-05, "loss": 0.5622, "step": 130 }, { "epoch": 0.87, "eval_accuracy": 0.4769736842105263, "eval_loss": 0.5775408744812012, "eval_runtime": 16.9329, "eval_samples_per_second": 17.953, "eval_steps_per_second": 4.488, "step": 130 }, { "epoch": 0.93, "learning_rate": 3e-05, "loss": 0.5502, "step": 140 }, { "epoch": 0.93, "eval_accuracy": 0.46710526315789475, "eval_loss": 0.57607102394104, "eval_runtime": 16.9226, "eval_samples_per_second": 17.964, "eval_steps_per_second": 4.491, "step": 140 }, { "epoch": 1.0, "learning_rate": 3e-05, "loss": 0.5958, "step": 150 }, { "epoch": 1.0, "eval_accuracy": 0.4901315789473684, "eval_loss": 0.5606401562690735, "eval_runtime": 16.929, "eval_samples_per_second": 17.957, "eval_steps_per_second": 4.489, "step": 150 }, { "epoch": 1.07, "learning_rate": 3e-05, "loss": 0.4558, "step": 160 }, { "epoch": 1.07, "eval_accuracy": 0.47368421052631576, "eval_loss": 0.5839833617210388, "eval_runtime": 16.9304, "eval_samples_per_second": 17.956, "eval_steps_per_second": 4.489, "step": 160 }, { "epoch": 1.14, "learning_rate": 3e-05, "loss": 0.4411, "step": 170 }, { "epoch": 1.14, "eval_accuracy": 0.4901315789473684, "eval_loss": 0.5631235837936401, "eval_runtime": 16.9238, "eval_samples_per_second": 17.963, "eval_steps_per_second": 4.491, "step": 170 }, { "epoch": 1.2, "learning_rate": 3e-05, "loss": 0.4144, "step": 180 }, { "epoch": 1.2, "eval_accuracy": 0.5, "eval_loss": 0.5744868516921997, "eval_runtime": 16.9382, "eval_samples_per_second": 17.948, "eval_steps_per_second": 4.487, "step": 180 }, { "epoch": 1.27, "learning_rate": 3e-05, "loss": 0.4647, "step": 190 }, { "epoch": 1.27, "eval_accuracy": 0.4605263157894737, "eval_loss": 0.593177080154419, "eval_runtime": 16.932, "eval_samples_per_second": 17.954, "eval_steps_per_second": 4.489, "step": 190 }, { "epoch": 1.34, "learning_rate": 3e-05, "loss": 0.4504, "step": 200 }, { "epoch": 1.34, "eval_accuracy": 0.5098684210526315, "eval_loss": 0.5798581838607788, "eval_runtime": 16.9337, "eval_samples_per_second": 17.952, "eval_steps_per_second": 4.488, "step": 200 }, { "epoch": 1.4, "learning_rate": 3e-05, "loss": 0.4299, "step": 210 }, { "epoch": 1.4, "eval_accuracy": 0.4934210526315789, "eval_loss": 0.64882493019104, "eval_runtime": 16.9391, "eval_samples_per_second": 17.947, "eval_steps_per_second": 4.487, "step": 210 }, { "epoch": 1.47, "learning_rate": 3e-05, "loss": 0.425, "step": 220 }, { "epoch": 1.47, "eval_accuracy": 0.5131578947368421, "eval_loss": 0.5704348683357239, "eval_runtime": 16.9325, "eval_samples_per_second": 17.954, "eval_steps_per_second": 4.488, "step": 220 }, { "epoch": 1.54, "learning_rate": 3e-05, "loss": 0.4152, "step": 230 }, { "epoch": 1.54, "eval_accuracy": 0.506578947368421, "eval_loss": 0.5582014322280884, "eval_runtime": 16.9258, "eval_samples_per_second": 17.961, "eval_steps_per_second": 4.49, "step": 230 }, { "epoch": 1.6, "learning_rate": 3e-05, "loss": 0.425, "step": 240 }, { "epoch": 1.6, "eval_accuracy": 0.5328947368421053, "eval_loss": 0.5488855838775635, "eval_runtime": 16.9288, "eval_samples_per_second": 17.958, "eval_steps_per_second": 4.489, "step": 240 }, { "epoch": 1.67, "learning_rate": 3e-05, "loss": 0.446, "step": 250 }, { "epoch": 1.67, "eval_accuracy": 0.5197368421052632, "eval_loss": 0.5479023456573486, "eval_runtime": 16.9319, "eval_samples_per_second": 17.954, "eval_steps_per_second": 4.489, "step": 250 }, { "epoch": 1.74, "learning_rate": 3e-05, "loss": 0.3908, "step": 260 }, { "epoch": 1.74, "eval_accuracy": 0.5164473684210527, "eval_loss": 0.5564107894897461, "eval_runtime": 16.9414, "eval_samples_per_second": 17.944, "eval_steps_per_second": 4.486, "step": 260 }, { "epoch": 1.8, "learning_rate": 3e-05, "loss": 0.443, "step": 270 }, { "epoch": 1.8, "eval_accuracy": 0.5032894736842105, "eval_loss": 0.5418796539306641, "eval_runtime": 16.9208, "eval_samples_per_second": 17.966, "eval_steps_per_second": 4.492, "step": 270 }, { "epoch": 1.87, "learning_rate": 3e-05, "loss": 0.4081, "step": 280 }, { "epoch": 1.87, "eval_accuracy": 0.506578947368421, "eval_loss": 0.5948407053947449, "eval_runtime": 16.9289, "eval_samples_per_second": 17.957, "eval_steps_per_second": 4.489, "step": 280 }, { "epoch": 1.94, "learning_rate": 3e-05, "loss": 0.3944, "step": 290 }, { "epoch": 1.94, "eval_accuracy": 0.5394736842105263, "eval_loss": 0.554680347442627, "eval_runtime": 16.9311, "eval_samples_per_second": 17.955, "eval_steps_per_second": 4.489, "step": 290 }, { "epoch": 2.0, "learning_rate": 3e-05, "loss": 0.4005, "step": 300 }, { "epoch": 2.0, "eval_accuracy": 0.5361842105263158, "eval_loss": 0.5615983009338379, "eval_runtime": 16.9277, "eval_samples_per_second": 17.959, "eval_steps_per_second": 4.49, "step": 300 }, { "epoch": 2.0, "step": 300, "total_flos": 7.380159778455552e+16, "train_loss": 0.5479170862833659, "train_runtime": 1462.1781, "train_samples_per_second": 3.283, "train_steps_per_second": 0.205 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 150, "total_flos": 7.380159778455552e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }