{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.955033824114604, "eval_steps": 200, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03979307600477517, "grad_norm": 19.560626983642578, "learning_rate": 4.000000000000001e-06, "loss": 0.9524, "step": 100 }, { "epoch": 0.07958615200955034, "grad_norm": 10.810285568237305, "learning_rate": 8.000000000000001e-06, "loss": 0.7019, "step": 200 }, { "epoch": 0.07958615200955034, "eval_accuracy": 0.6237137511693172, "eval_loss": 0.6542279124259949, "eval_runtime": 24.2722, "eval_samples_per_second": 352.337, "eval_steps_per_second": 5.521, "step": 200 }, { "epoch": 0.1193792280143255, "grad_norm": 6.921683311462402, "learning_rate": 1.2e-05, "loss": 0.6443, "step": 300 }, { "epoch": 0.15917230401910068, "grad_norm": 5.143254280090332, "learning_rate": 1.6000000000000003e-05, "loss": 0.6156, "step": 400 }, { "epoch": 0.15917230401910068, "eval_accuracy": 0.6449953227315248, "eval_loss": 0.6248486638069153, "eval_runtime": 24.3077, "eval_samples_per_second": 351.823, "eval_steps_per_second": 5.513, "step": 400 }, { "epoch": 0.19896538002387584, "grad_norm": 4.483047962188721, "learning_rate": 2e-05, "loss": 0.6307, "step": 500 }, { "epoch": 0.238758456028651, "grad_norm": 3.8908305168151855, "learning_rate": 1.999004191733529e-05, "loss": 0.6263, "step": 600 }, { "epoch": 0.238758456028651, "eval_accuracy": 0.641955098222638, "eval_loss": 0.6327589750289917, "eval_runtime": 24.3113, "eval_samples_per_second": 351.771, "eval_steps_per_second": 5.512, "step": 600 }, { "epoch": 0.2785515320334262, "grad_norm": 2.846820831298828, "learning_rate": 1.9960187502023228e-05, "loss": 0.6237, "step": 700 }, { "epoch": 0.31834460803820136, "grad_norm": 2.4224681854248047, "learning_rate": 1.991049621261093e-05, "loss": 0.6237, "step": 800 }, { "epoch": 0.31834460803820136, "eval_accuracy": 0.6501403180542563, "eval_loss": 0.6154850721359253, "eval_runtime": 24.2938, "eval_samples_per_second": 352.024, "eval_steps_per_second": 5.516, "step": 800 }, { "epoch": 0.3581376840429765, "grad_norm": 3.673722267150879, "learning_rate": 1.9841067015091934e-05, "loss": 0.61, "step": 900 }, { "epoch": 0.3979307600477517, "grad_norm": 4.1186299324035645, "learning_rate": 1.975203818580389e-05, "loss": 0.6091, "step": 1000 }, { "epoch": 0.3979307600477517, "eval_accuracy": 0.6554022450888681, "eval_loss": 0.6121359467506409, "eval_runtime": 24.2673, "eval_samples_per_second": 352.408, "eval_steps_per_second": 5.522, "step": 1000 }, { "epoch": 0.43772383605252685, "grad_norm": 3.4261631965637207, "learning_rate": 1.964358703603511e-05, "loss": 0.6112, "step": 1100 }, { "epoch": 0.477516912057302, "grad_norm": 2.712700128555298, "learning_rate": 1.9515929558888497e-05, "loss": 0.6121, "step": 1200 }, { "epoch": 0.477516912057302, "eval_accuracy": 0.659377923292797, "eval_loss": 0.606299638748169, "eval_runtime": 24.2915, "eval_samples_per_second": 352.058, "eval_steps_per_second": 5.516, "step": 1200 }, { "epoch": 0.5173099880620772, "grad_norm": 4.298662185668945, "learning_rate": 1.936931999910609e-05, "loss": 0.6026, "step": 1300 }, { "epoch": 0.5571030640668524, "grad_norm": 2.0829362869262695, "learning_rate": 1.9204050346711034e-05, "loss": 0.6022, "step": 1400 }, { "epoch": 0.5571030640668524, "eval_accuracy": 0.653999064546305, "eval_loss": 0.6064969301223755, "eval_runtime": 24.3164, "eval_samples_per_second": 351.697, "eval_steps_per_second": 5.511, "step": 1400 }, { "epoch": 0.5968961400716275, "grad_norm": 3.5935146808624268, "learning_rate": 1.9020449755475434e-05, "loss": 0.6059, "step": 1500 }, { "epoch": 0.6366892160764027, "grad_norm": 2.9716155529022217, "learning_rate": 1.881888388737226e-05, "loss": 0.5957, "step": 1600 }, { "epoch": 0.6366892160764027, "eval_accuracy": 0.6696679139382601, "eval_loss": 0.5993675589561462, "eval_runtime": 24.2743, "eval_samples_per_second": 352.307, "eval_steps_per_second": 5.52, "step": 1600 }, { "epoch": 0.6764822920811778, "grad_norm": 2.7621397972106934, "learning_rate": 1.859975418431689e-05, "loss": 0.5926, "step": 1700 }, { "epoch": 0.716275368085953, "grad_norm": 2.882694721221924, "learning_rate": 1.8363497068648795e-05, "loss": 0.6017, "step": 1800 }, { "epoch": 0.716275368085953, "eval_accuracy": 0.6667446211412535, "eval_loss": 0.6018164157867432, "eval_runtime": 24.2955, "eval_samples_per_second": 352.0, "eval_steps_per_second": 5.515, "step": 1800 }, { "epoch": 0.7560684440907283, "grad_norm": 3.4406116008758545, "learning_rate": 1.8110583073945566e-05, "loss": 0.5994, "step": 1900 }, { "epoch": 0.7958615200955034, "grad_norm": 3.2141339778900146, "learning_rate": 1.7841515907900467e-05, "loss": 0.5902, "step": 2000 }, { "epoch": 0.7958615200955034, "eval_accuracy": 0.6710710944808232, "eval_loss": 0.6057897210121155, "eval_runtime": 24.3126, "eval_samples_per_second": 351.752, "eval_steps_per_second": 5.512, "step": 2000 }, { "epoch": 0.8356545961002786, "grad_norm": 2.75370192527771, "learning_rate": 1.755683144912986e-05, "loss": 0.6024, "step": 2100 }, { "epoch": 0.8754476721050537, "grad_norm": 4.044663429260254, "learning_rate": 1.725709667990851e-05, "loss": 0.59, "step": 2200 }, { "epoch": 0.8754476721050537, "eval_accuracy": 0.6708372310570627, "eval_loss": 0.594099760055542, "eval_runtime": 24.3163, "eval_samples_per_second": 351.699, "eval_steps_per_second": 5.511, "step": 2200 }, { "epoch": 0.9152407481098289, "grad_norm": 3.4913995265960693, "learning_rate": 1.6942908556958297e-05, "loss": 0.5919, "step": 2300 }, { "epoch": 0.955033824114604, "grad_norm": 3.277291774749756, "learning_rate": 1.6614892822539333e-05, "loss": 0.5963, "step": 2400 }, { "epoch": 0.955033824114604, "eval_accuracy": 0.6722404115996258, "eval_loss": 0.5924503207206726, "eval_runtime": 24.2851, "eval_samples_per_second": 352.151, "eval_steps_per_second": 5.518, "step": 2400 } ], "logging_steps": 100, "max_steps": 7539, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }