{ "best_metric": 0.613650998068255, "best_model_checkpoint": "/root/crosslingual-mining-for-domain-nli/output/pretraining/vihealthbert-w_dual-ViNLI/lr3e-5_wr0.1_wd0.0/checkpoint-8000", "epoch": 171.875, "eval_steps": 1000, "global_step": 11000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015625, "grad_norm": 34.54078674316406, "learning_rate": 1e-08, "loss": 10.2871, "step": 1 }, { "epoch": 15.625, "grad_norm": 5.461294174194336, "learning_rate": 9.999999999999999e-06, "loss": 5.8126, "step": 1000 }, { "epoch": 15.625, "eval_accuracy": 0.44503311258278144, "eval_loss": 3.546058177947998, "eval_runtime": 0.7421, "eval_samples_per_second": 312.643, "eval_steps_per_second": 20.214, "step": 1000 }, { "epoch": 31.25, "grad_norm": 4.604820728302002, "learning_rate": 1.9999999999999998e-05, "loss": 2.605, "step": 2000 }, { "epoch": 31.25, "eval_accuracy": 0.540436456996149, "eval_loss": 2.7789385318756104, "eval_runtime": 0.799, "eval_samples_per_second": 290.377, "eval_steps_per_second": 18.774, "step": 2000 }, { "epoch": 46.875, "grad_norm": 4.253050327301025, "learning_rate": 3e-05, "loss": 1.5924, "step": 3000 }, { "epoch": 46.875, "eval_accuracy": 0.5809284818067754, "eval_loss": 2.5431649684906006, "eval_runtime": 0.7348, "eval_samples_per_second": 315.742, "eval_steps_per_second": 20.414, "step": 3000 }, { "epoch": 62.5, "grad_norm": 4.554752349853516, "learning_rate": 2.8888888888888888e-05, "loss": 1.2233, "step": 4000 }, { "epoch": 62.5, "eval_accuracy": 0.5567077122488658, "eval_loss": 2.6662397384643555, "eval_runtime": 0.7259, "eval_samples_per_second": 319.609, "eval_steps_per_second": 20.664, "step": 4000 }, { "epoch": 78.125, "grad_norm": 3.5844781398773193, "learning_rate": 2.777777777777778e-05, "loss": 0.9236, "step": 5000 }, { "epoch": 78.125, "eval_accuracy": 0.5927342256214149, "eval_loss": 2.4690873622894287, "eval_runtime": 0.7491, "eval_samples_per_second": 309.699, "eval_steps_per_second": 20.024, "step": 5000 }, { "epoch": 93.75, "grad_norm": 5.544617176055908, "learning_rate": 2.6666666666666667e-05, "loss": 0.7193, "step": 6000 }, { "epoch": 93.75, "eval_accuracy": 0.6027219701879456, "eval_loss": 2.405272960662842, "eval_runtime": 0.6998, "eval_samples_per_second": 331.502, "eval_steps_per_second": 21.433, "step": 6000 }, { "epoch": 109.375, "grad_norm": 4.963156700134277, "learning_rate": 2.5555555555555557e-05, "loss": 0.6259, "step": 7000 }, { "epoch": 109.375, "eval_accuracy": 0.5781748564135291, "eval_loss": 2.5938053131103516, "eval_runtime": 0.6966, "eval_samples_per_second": 333.065, "eval_steps_per_second": 21.534, "step": 7000 }, { "epoch": 125.0, "grad_norm": 4.307164192199707, "learning_rate": 2.4444444444444445e-05, "loss": 0.5082, "step": 8000 }, { "epoch": 125.0, "eval_accuracy": 0.613650998068255, "eval_loss": 2.480868101119995, "eval_runtime": 0.7149, "eval_samples_per_second": 324.519, "eval_steps_per_second": 20.982, "step": 8000 }, { "epoch": 140.625, "grad_norm": 3.2876017093658447, "learning_rate": 2.3333333333333336e-05, "loss": 0.4438, "step": 9000 }, { "epoch": 140.625, "eval_accuracy": 0.5819174757281553, "eval_loss": 2.705582618713379, "eval_runtime": 0.845, "eval_samples_per_second": 274.54, "eval_steps_per_second": 17.75, "step": 9000 }, { "epoch": 156.25, "grad_norm": 14.243521690368652, "learning_rate": 2.222222222222222e-05, "loss": 0.4075, "step": 10000 }, { "epoch": 156.25, "eval_accuracy": 0.5945945945945946, "eval_loss": 2.650148630142212, "eval_runtime": 0.8487, "eval_samples_per_second": 273.36, "eval_steps_per_second": 17.674, "step": 10000 }, { "epoch": 171.875, "grad_norm": 12.720209121704102, "learning_rate": 2.111111111111111e-05, "loss": 0.3571, "step": 11000 }, { "epoch": 171.875, "eval_accuracy": 0.6082406801831263, "eval_loss": 2.53369140625, "eval_runtime": 0.7729, "eval_samples_per_second": 300.18, "eval_steps_per_second": 19.408, "step": 11000 } ], "logging_steps": 1000, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 469, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }