{ "best_metric": 0.11999432742595673, "best_model_checkpoint": "./fine-tuned/checkpoint-1500", "epoch": 0.26345832967418986, "eval_steps": 100, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008781944322472996, "grad_norm": 212427.96875, "learning_rate": 2.9934129632882487e-05, "loss": 0.5421, "step": 50 }, { "epoch": 0.017563888644945992, "grad_norm": 15316.291015625, "learning_rate": 2.9868259265764974e-05, "loss": 0.1903, "step": 100 }, { "epoch": 0.017563888644945992, "eval_loss": 0.16095133125782013, "eval_runtime": 175.5949, "eval_samples_per_second": 25.399, "eval_steps_per_second": 3.178, "step": 100 }, { "epoch": 0.026345832967418988, "grad_norm": 21344.13671875, "learning_rate": 2.980238889864746e-05, "loss": 0.1742, "step": 150 }, { "epoch": 0.035127777289891984, "grad_norm": 26603.357421875, "learning_rate": 2.973651853152995e-05, "loss": 0.164, "step": 200 }, { "epoch": 0.035127777289891984, "eval_loss": 0.14671418070793152, "eval_runtime": 175.3478, "eval_samples_per_second": 25.435, "eval_steps_per_second": 3.182, "step": 200 }, { "epoch": 0.04390972161236498, "grad_norm": 18468.01953125, "learning_rate": 2.9670648164412437e-05, "loss": 0.1697, "step": 250 }, { "epoch": 0.052691665934837977, "grad_norm": 15799.6875, "learning_rate": 2.9604777797294924e-05, "loss": 0.161, "step": 300 }, { "epoch": 0.052691665934837977, "eval_loss": 0.14008501172065735, "eval_runtime": 175.2345, "eval_samples_per_second": 25.452, "eval_steps_per_second": 3.184, "step": 300 }, { "epoch": 0.06147361025731097, "grad_norm": 17163.763671875, "learning_rate": 2.953890743017741e-05, "loss": 0.1634, "step": 350 }, { "epoch": 0.07025555457978397, "grad_norm": 17603.025390625, "learning_rate": 2.94730370630599e-05, "loss": 0.1543, "step": 400 }, { "epoch": 0.07025555457978397, "eval_loss": 0.13591521978378296, "eval_runtime": 175.0506, "eval_samples_per_second": 25.478, "eval_steps_per_second": 3.188, "step": 400 }, { "epoch": 0.07903749890225696, "grad_norm": 12623.9189453125, "learning_rate": 2.9407166695942387e-05, "loss": 0.1417, "step": 450 }, { "epoch": 0.08781944322472995, "grad_norm": 14828.5, "learning_rate": 2.9341296328824874e-05, "loss": 0.1403, "step": 500 }, { "epoch": 0.08781944322472995, "eval_loss": 0.13329531252384186, "eval_runtime": 175.1721, "eval_samples_per_second": 25.461, "eval_steps_per_second": 3.185, "step": 500 }, { "epoch": 0.09660138754720295, "grad_norm": 16192.8515625, "learning_rate": 2.927542596170736e-05, "loss": 0.1444, "step": 550 }, { "epoch": 0.10538333186967595, "grad_norm": 20510.47265625, "learning_rate": 2.9209555594589847e-05, "loss": 0.1466, "step": 600 }, { "epoch": 0.10538333186967595, "eval_loss": 0.1307835429906845, "eval_runtime": 175.06, "eval_samples_per_second": 25.477, "eval_steps_per_second": 3.187, "step": 600 }, { "epoch": 0.11416527619214895, "grad_norm": 10555.8408203125, "learning_rate": 2.9143685227472337e-05, "loss": 0.1472, "step": 650 }, { "epoch": 0.12294722051462194, "grad_norm": 12451.990234375, "learning_rate": 2.907781486035482e-05, "loss": 0.1415, "step": 700 }, { "epoch": 0.12294722051462194, "eval_loss": 0.1288571059703827, "eval_runtime": 175.1799, "eval_samples_per_second": 25.46, "eval_steps_per_second": 3.185, "step": 700 }, { "epoch": 0.13172916483709493, "grad_norm": 11173.96875, "learning_rate": 2.901194449323731e-05, "loss": 0.1368, "step": 750 }, { "epoch": 0.14051110915956794, "grad_norm": 47561.75, "learning_rate": 2.8946074126119797e-05, "loss": 0.1399, "step": 800 }, { "epoch": 0.14051110915956794, "eval_loss": 0.12726937234401703, "eval_runtime": 175.2229, "eval_samples_per_second": 25.453, "eval_steps_per_second": 3.185, "step": 800 }, { "epoch": 0.14929305348204092, "grad_norm": 11766.6767578125, "learning_rate": 2.8880203759002283e-05, "loss": 0.1433, "step": 850 }, { "epoch": 0.15807499780451392, "grad_norm": 14977.416015625, "learning_rate": 2.881433339188477e-05, "loss": 0.1371, "step": 900 }, { "epoch": 0.15807499780451392, "eval_loss": 0.12529444694519043, "eval_runtime": 174.8253, "eval_samples_per_second": 25.511, "eval_steps_per_second": 3.192, "step": 900 }, { "epoch": 0.1668569421269869, "grad_norm": 11109.173828125, "learning_rate": 2.874846302476726e-05, "loss": 0.1292, "step": 950 }, { "epoch": 0.1756388864494599, "grad_norm": 9897.7958984375, "learning_rate": 2.8682592657649747e-05, "loss": 0.1351, "step": 1000 }, { "epoch": 0.1756388864494599, "eval_loss": 0.12485189735889435, "eval_runtime": 174.8115, "eval_samples_per_second": 25.513, "eval_steps_per_second": 3.192, "step": 1000 }, { "epoch": 0.18442083077193291, "grad_norm": 20060.55859375, "learning_rate": 2.8616722290532233e-05, "loss": 0.1303, "step": 1050 }, { "epoch": 0.1932027750944059, "grad_norm": 10244.4052734375, "learning_rate": 2.855085192341472e-05, "loss": 0.1413, "step": 1100 }, { "epoch": 0.1932027750944059, "eval_loss": 0.12359971553087234, "eval_runtime": 175.122, "eval_samples_per_second": 25.468, "eval_steps_per_second": 3.186, "step": 1100 }, { "epoch": 0.2019847194168789, "grad_norm": 36993.25, "learning_rate": 2.848498155629721e-05, "loss": 0.1275, "step": 1150 }, { "epoch": 0.2107666637393519, "grad_norm": 11102.2646484375, "learning_rate": 2.8419111189179697e-05, "loss": 0.1377, "step": 1200 }, { "epoch": 0.2107666637393519, "eval_loss": 0.12276890873908997, "eval_runtime": 175.1309, "eval_samples_per_second": 25.467, "eval_steps_per_second": 3.186, "step": 1200 }, { "epoch": 0.21954860806182488, "grad_norm": 10398.369140625, "learning_rate": 2.835324082206218e-05, "loss": 0.1356, "step": 1250 }, { "epoch": 0.2283305523842979, "grad_norm": 14664.177734375, "learning_rate": 2.828737045494467e-05, "loss": 0.1309, "step": 1300 }, { "epoch": 0.2283305523842979, "eval_loss": 0.1219501867890358, "eval_runtime": 174.8703, "eval_samples_per_second": 25.505, "eval_steps_per_second": 3.191, "step": 1300 }, { "epoch": 0.23711249670677087, "grad_norm": 9694.1875, "learning_rate": 2.8221500087827156e-05, "loss": 0.1271, "step": 1350 }, { "epoch": 0.24589444102924388, "grad_norm": 17376.810546875, "learning_rate": 2.8155629720709643e-05, "loss": 0.1434, "step": 1400 }, { "epoch": 0.24589444102924388, "eval_loss": 0.12065327912569046, "eval_runtime": 174.9734, "eval_samples_per_second": 25.49, "eval_steps_per_second": 3.189, "step": 1400 }, { "epoch": 0.2546763853517169, "grad_norm": 13443.2255859375, "learning_rate": 2.808975935359213e-05, "loss": 0.1383, "step": 1450 }, { "epoch": 0.26345832967418986, "grad_norm": 10927.8994140625, "learning_rate": 2.802388898647462e-05, "loss": 0.125, "step": 1500 }, { "epoch": 0.26345832967418986, "eval_loss": 0.11999432742595673, "eval_runtime": 174.9084, "eval_samples_per_second": 25.499, "eval_steps_per_second": 3.19, "step": 1500 } ], "logging_steps": 50, "max_steps": 22772, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7307494686720000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }