{ "best_metric": 3.4919426441192627, "best_model_checkpoint": "/Users/frapadovani/Desktop/babyLM_controlled/models_trained/convergence_french/random_sentence_french/checkpoint-28000", "epoch": 0.8290646374322683, "eval_steps": 2000, "global_step": 28000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.059218902673733455, "grad_norm": 1.0812722444534302, "learning_rate": 0.0001, "loss": 4.7523, "step": 2000 }, { "epoch": 0.059218902673733455, "eval_loss": 4.0236735343933105, "eval_runtime": 5.173, "eval_samples_per_second": 425.477, "eval_steps_per_second": 26.677, "step": 2000 }, { "epoch": 0.11843780534746691, "grad_norm": 1.2557106018066406, "learning_rate": 0.0001, "loss": 3.8958, "step": 4000 }, { "epoch": 0.11843780534746691, "eval_loss": 3.8087611198425293, "eval_runtime": 5.2666, "eval_samples_per_second": 417.913, "eval_steps_per_second": 26.203, "step": 4000 }, { "epoch": 0.17765670802120037, "grad_norm": 1.3326870203018188, "learning_rate": 0.0001, "loss": 3.7344, "step": 6000 }, { "epoch": 0.17765670802120037, "eval_loss": 3.704986333847046, "eval_runtime": 5.2277, "eval_samples_per_second": 421.023, "eval_steps_per_second": 26.398, "step": 6000 }, { "epoch": 0.23687561069493382, "grad_norm": 1.38993239402771, "learning_rate": 0.0001, "loss": 3.6379, "step": 8000 }, { "epoch": 0.23687561069493382, "eval_loss": 3.6455252170562744, "eval_runtime": 5.1485, "eval_samples_per_second": 427.5, "eval_steps_per_second": 26.804, "step": 8000 }, { "epoch": 0.29609451336866727, "grad_norm": 1.3884963989257812, "learning_rate": 0.0001, "loss": 3.5687, "step": 10000 }, { "epoch": 0.29609451336866727, "eval_loss": 3.6062421798706055, "eval_runtime": 5.1466, "eval_samples_per_second": 427.661, "eval_steps_per_second": 26.814, "step": 10000 }, { "epoch": 0.35531341604240074, "grad_norm": 1.435062289237976, "learning_rate": 0.0001, "loss": 3.5165, "step": 12000 }, { "epoch": 0.35531341604240074, "eval_loss": 3.5764389038085938, "eval_runtime": 5.132, "eval_samples_per_second": 428.881, "eval_steps_per_second": 26.89, "step": 12000 }, { "epoch": 0.41453231871613416, "grad_norm": 1.3539327383041382, "learning_rate": 0.0001, "loss": 3.4738, "step": 14000 }, { "epoch": 0.41453231871613416, "eval_loss": 3.555619239807129, "eval_runtime": 5.1586, "eval_samples_per_second": 426.667, "eval_steps_per_second": 26.752, "step": 14000 }, { "epoch": 0.47375122138986764, "grad_norm": 1.3827875852584839, "learning_rate": 0.0001, "loss": 3.4389, "step": 16000 }, { "epoch": 0.47375122138986764, "eval_loss": 3.5388290882110596, "eval_runtime": 5.8029, "eval_samples_per_second": 379.292, "eval_steps_per_second": 23.781, "step": 16000 }, { "epoch": 0.5329701240636011, "grad_norm": 1.316589593887329, "learning_rate": 0.0001, "loss": 3.4067, "step": 18000 }, { "epoch": 0.5329701240636011, "eval_loss": 3.5234413146972656, "eval_runtime": 5.1383, "eval_samples_per_second": 428.353, "eval_steps_per_second": 26.857, "step": 18000 }, { "epoch": 0.5921890267373345, "grad_norm": 1.3910475969314575, "learning_rate": 0.0001, "loss": 3.3778, "step": 20000 }, { "epoch": 0.5921890267373345, "eval_loss": 3.5121970176696777, "eval_runtime": 7.6572, "eval_samples_per_second": 287.442, "eval_steps_per_second": 18.022, "step": 20000 }, { "epoch": 0.6514079294110681, "grad_norm": 1.510622262954712, "learning_rate": 0.0001, "loss": 3.3511, "step": 22000 }, { "epoch": 0.6514079294110681, "eval_loss": 3.5116806030273438, "eval_runtime": 5.6879, "eval_samples_per_second": 386.964, "eval_steps_per_second": 24.262, "step": 22000 }, { "epoch": 0.7106268320848015, "grad_norm": 1.4063044786453247, "learning_rate": 0.0001, "loss": 3.3309, "step": 24000 }, { "epoch": 0.7106268320848015, "eval_loss": 3.5099916458129883, "eval_runtime": 5.2486, "eval_samples_per_second": 419.351, "eval_steps_per_second": 26.293, "step": 24000 }, { "epoch": 0.7698457347585349, "grad_norm": 1.459807276725769, "learning_rate": 0.0001, "loss": 3.309, "step": 26000 }, { "epoch": 0.7698457347585349, "eval_loss": 3.498032808303833, "eval_runtime": 5.3512, "eval_samples_per_second": 411.307, "eval_steps_per_second": 25.788, "step": 26000 }, { "epoch": 0.8290646374322683, "grad_norm": 1.4196616411209106, "learning_rate": 0.0001, "loss": 3.2905, "step": 28000 }, { "epoch": 0.8290646374322683, "eval_loss": 3.4919426441192627, "eval_runtime": 5.2649, "eval_samples_per_second": 418.049, "eval_steps_per_second": 26.211, "step": 28000 } ], "logging_steps": 2000, "max_steps": 33773, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 545834139648000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }