{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6149993409779886, "eval_steps": 100, "global_step": 3100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 6.90625, "learning_rate": 3e-06, "loss": 3.4225, "step": 100 }, { "epoch": 0.08, "eval_loss": 3.0902585983276367, "eval_runtime": 125.4074, "eval_samples_per_second": 67.221, "eval_steps_per_second": 33.61, "step": 100 }, { "epoch": 0.17, "grad_norm": 3.3125, "learning_rate": 6e-06, "loss": 2.6313, "step": 200 }, { "epoch": 0.17, "eval_loss": 1.4370466470718384, "eval_runtime": 124.5179, "eval_samples_per_second": 67.701, "eval_steps_per_second": 33.851, "step": 200 }, { "epoch": 0.25, "grad_norm": 1.4453125, "learning_rate": 9e-06, "loss": 1.2926, "step": 300 }, { "epoch": 0.25, "eval_loss": 1.0395174026489258, "eval_runtime": 125.1005, "eval_samples_per_second": 67.386, "eval_steps_per_second": 33.693, "step": 300 }, { "epoch": 0.34, "grad_norm": 1.6015625, "learning_rate": 1.2e-05, "loss": 1.1492, "step": 400 }, { "epoch": 0.34, "eval_loss": 1.0025979280471802, "eval_runtime": 124.2401, "eval_samples_per_second": 67.852, "eval_steps_per_second": 33.926, "step": 400 }, { "epoch": 0.42, "grad_norm": 1.2734375, "learning_rate": 1.5e-05, "loss": 1.1062, "step": 500 }, { "epoch": 0.42, "eval_loss": 0.9820207357406616, "eval_runtime": 124.4083, "eval_samples_per_second": 67.761, "eval_steps_per_second": 33.88, "step": 500 }, { "epoch": 0.51, "grad_norm": 1.3984375, "learning_rate": 1.4960378963658215e-05, "loss": 1.0761, "step": 600 }, { "epoch": 0.51, "eval_loss": 0.9677473902702332, "eval_runtime": 124.3968, "eval_samples_per_second": 67.767, "eval_steps_per_second": 33.884, "step": 600 }, { "epoch": 0.59, "grad_norm": 1.328125, "learning_rate": 1.484193447503841e-05, "loss": 1.056, "step": 700 }, { "epoch": 0.59, "eval_loss": 0.9569339156150818, "eval_runtime": 124.228, "eval_samples_per_second": 67.859, "eval_steps_per_second": 33.93, "step": 700 }, { "epoch": 0.67, "grad_norm": 1.359375, "learning_rate": 1.4645917972377404e-05, "loss": 1.0446, "step": 800 }, { "epoch": 0.67, "eval_loss": 0.9495565891265869, "eval_runtime": 124.2985, "eval_samples_per_second": 67.821, "eval_steps_per_second": 33.91, "step": 800 }, { "epoch": 0.76, "grad_norm": 1.3125, "learning_rate": 1.4374400489535342e-05, "loss": 1.0399, "step": 900 }, { "epoch": 0.76, "eval_loss": 0.9445509910583496, "eval_runtime": 124.152, "eval_samples_per_second": 67.901, "eval_steps_per_second": 33.95, "step": 900 }, { "epoch": 0.84, "grad_norm": 1.2578125, "learning_rate": 1.403025077426025e-05, "loss": 1.0252, "step": 1000 }, { "epoch": 0.84, "eval_loss": 0.9403882026672363, "eval_runtime": 123.8904, "eval_samples_per_second": 68.044, "eval_steps_per_second": 34.022, "step": 1000 }, { "epoch": 0.93, "grad_norm": 1.296875, "learning_rate": 1.3617104978119044e-05, "loss": 1.0284, "step": 1100 }, { "epoch": 0.93, "eval_loss": 0.9368470907211304, "eval_runtime": 123.9167, "eval_samples_per_second": 68.03, "eval_steps_per_second": 34.015, "step": 1100 }, { "epoch": 1.01, "grad_norm": 1.375, "learning_rate": 1.3139328238339287e-05, "loss": 1.0171, "step": 1200 }, { "epoch": 1.01, "eval_loss": 0.9342640042304993, "eval_runtime": 124.6542, "eval_samples_per_second": 67.627, "eval_steps_per_second": 33.814, "step": 1200 }, { "epoch": 1.1, "grad_norm": 1.21875, "learning_rate": 1.2601968557473e-05, "loss": 1.0086, "step": 1300 }, { "epoch": 1.1, "eval_loss": 0.9315484166145325, "eval_runtime": 124.4874, "eval_samples_per_second": 67.718, "eval_steps_per_second": 33.859, "step": 1300 }, { "epoch": 1.18, "grad_norm": 1.2265625, "learning_rate": 1.2010703468171973e-05, "loss": 1.0056, "step": 1400 }, { "epoch": 1.18, "eval_loss": 0.9296947717666626, "eval_runtime": 124.175, "eval_samples_per_second": 67.888, "eval_steps_per_second": 33.944, "step": 1400 }, { "epoch": 1.27, "grad_norm": 1.25, "learning_rate": 1.1371780046593758e-05, "loss": 1.0083, "step": 1500 }, { "epoch": 1.27, "eval_loss": 0.9280151128768921, "eval_runtime": 123.9014, "eval_samples_per_second": 68.038, "eval_steps_per_second": 34.019, "step": 1500 }, { "epoch": 1.35, "grad_norm": 1.2890625, "learning_rate": 1.069194890823328e-05, "loss": 1.0037, "step": 1600 }, { "epoch": 1.35, "eval_loss": 0.926633358001709, "eval_runtime": 123.7914, "eval_samples_per_second": 68.098, "eval_steps_per_second": 34.049, "step": 1600 }, { "epoch": 1.43, "grad_norm": 1.1796875, "learning_rate": 9.978392883554342e-06, "loss": 0.9951, "step": 1700 }, { "epoch": 1.43, "eval_loss": 0.9259628653526306, "eval_runtime": 124.5775, "eval_samples_per_second": 67.669, "eval_steps_per_second": 33.834, "step": 1700 }, { "epoch": 1.52, "grad_norm": 1.234375, "learning_rate": 9.238651127006462e-06, "loss": 0.9962, "step": 1800 }, { "epoch": 1.52, "eval_loss": 0.9251705408096313, "eval_runtime": 124.4301, "eval_samples_per_second": 67.749, "eval_steps_per_second": 33.874, "step": 1800 }, { "epoch": 1.6, "grad_norm": 1.2109375, "learning_rate": 8.48053946126157e-06, "loss": 0.9907, "step": 1900 }, { "epoch": 1.6, "eval_loss": 0.924351155757904, "eval_runtime": 124.2821, "eval_samples_per_second": 67.83, "eval_steps_per_second": 33.915, "step": 1900 }, { "epoch": 1.69, "grad_norm": 1.3046875, "learning_rate": 7.712067798282222e-06, "loss": 1.0003, "step": 2000 }, { "epoch": 1.69, "eval_loss": 0.9239010810852051, "eval_runtime": 123.3108, "eval_samples_per_second": 68.364, "eval_steps_per_second": 34.182, "step": 2000 }, { "epoch": 1.77, "grad_norm": 1.3046875, "learning_rate": 6.941355509718164e-06, "loss": 0.9976, "step": 2100 }, { "epoch": 1.77, "eval_loss": 0.9232047200202942, "eval_runtime": 124.3131, "eval_samples_per_second": 67.813, "eval_steps_per_second": 33.906, "step": 2100 }, { "epoch": 1.86, "grad_norm": 1.21875, "learning_rate": 6.176545640794535e-06, "loss": 0.9896, "step": 2200 }, { "epoch": 1.86, "eval_loss": 0.923151969909668, "eval_runtime": 123.8739, "eval_samples_per_second": 68.053, "eval_steps_per_second": 34.027, "step": 2200 }, { "epoch": 1.94, "grad_norm": 1.203125, "learning_rate": 5.4257188740743086e-06, "loss": 0.9954, "step": 2300 }, { "epoch": 1.94, "eval_loss": 0.9228904247283936, "eval_runtime": 124.4202, "eval_samples_per_second": 67.754, "eval_steps_per_second": 33.877, "step": 2300 }, { "epoch": 2.02, "grad_norm": 1.2421875, "learning_rate": 4.696808152120318e-06, "loss": 0.9982, "step": 2400 }, { "epoch": 2.02, "eval_loss": 0.9227039813995361, "eval_runtime": 124.3531, "eval_samples_per_second": 67.791, "eval_steps_per_second": 33.895, "step": 2400 }, { "epoch": 2.11, "grad_norm": 1.2421875, "learning_rate": 3.997514861120414e-06, "loss": 0.9957, "step": 2500 }, { "epoch": 2.11, "eval_loss": 0.9224779605865479, "eval_runtime": 123.9488, "eval_samples_per_second": 68.012, "eval_steps_per_second": 34.006, "step": 2500 }, { "epoch": 2.19, "grad_norm": 1.1875, "learning_rate": 3.335227461046941e-06, "loss": 0.9883, "step": 2600 }, { "epoch": 2.19, "eval_loss": 0.922317624092102, "eval_runtime": 124.5434, "eval_samples_per_second": 67.687, "eval_steps_per_second": 33.844, "step": 2600 }, { "epoch": 2.28, "grad_norm": 1.2890625, "learning_rate": 2.7169434220724335e-06, "loss": 0.9849, "step": 2700 }, { "epoch": 2.28, "eval_loss": 0.9223732352256775, "eval_runtime": 124.8267, "eval_samples_per_second": 67.534, "eval_steps_per_second": 33.767, "step": 2700 }, { "epoch": 2.36, "grad_norm": 1.2109375, "learning_rate": 2.14919529203096e-06, "loss": 0.9974, "step": 2800 }, { "epoch": 2.36, "eval_loss": 0.9222919940948486, "eval_runtime": 124.9102, "eval_samples_per_second": 67.489, "eval_steps_per_second": 33.744, "step": 2800 }, { "epoch": 2.45, "grad_norm": 1.28125, "learning_rate": 1.6379816760674141e-06, "loss": 0.9854, "step": 2900 }, { "epoch": 2.45, "eval_loss": 0.9222748279571533, "eval_runtime": 124.029, "eval_samples_per_second": 67.968, "eval_steps_per_second": 33.984, "step": 2900 }, { "epoch": 2.53, "grad_norm": 1.234375, "learning_rate": 1.1887038577168646e-06, "loss": 0.9831, "step": 3000 }, { "epoch": 2.53, "eval_loss": 0.922382652759552, "eval_runtime": 123.9646, "eval_samples_per_second": 68.003, "eval_steps_per_second": 34.002, "step": 3000 }, { "epoch": 2.61, "grad_norm": 1.2265625, "learning_rate": 8.061087310508917e-07, "loss": 0.9958, "step": 3100 }, { "epoch": 2.61, "eval_loss": 0.9223530292510986, "eval_runtime": 124.2043, "eval_samples_per_second": 67.872, "eval_steps_per_second": 33.936, "step": 3100 } ], "logging_steps": 100, "max_steps": 3555, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 2.106483852705915e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }