{ "best_metric": 0.1978042721748352, "best_model_checkpoint": "/p/scratch/ccstdl/kalyan1/finetuned-cosine-loss/checkpoint-430", "epoch": 0.043, "eval_steps": 10, "global_step": 430, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 1.646112322807312, "learning_rate": 5.000000000000001e-07, "loss": 0.6464, "step": 10 }, { "epoch": 0.001, "eval_cos_sim": 0.30497944355010986, "eval_loss": 0.6448461413383484, "eval_runtime": 205.7229, "eval_samples_per_second": 4.861, "eval_steps_per_second": 0.156, "step": 10 }, { "epoch": 0.002, "grad_norm": 1.769713282585144, "learning_rate": 1.0000000000000002e-06, "loss": 0.6442, "step": 20 }, { "epoch": 0.002, "eval_cos_sim": 0.30667245388031006, "eval_loss": 0.6431530714035034, "eval_runtime": 205.6457, "eval_samples_per_second": 4.863, "eval_steps_per_second": 0.156, "step": 20 }, { "epoch": 0.003, "grad_norm": 1.794607400894165, "learning_rate": 1.5e-06, "loss": 0.6231, "step": 30 }, { "epoch": 0.003, "eval_cos_sim": 0.30951327085494995, "eval_loss": 0.640312135219574, "eval_runtime": 204.7069, "eval_samples_per_second": 4.885, "eval_steps_per_second": 0.156, "step": 30 }, { "epoch": 0.004, "grad_norm": 1.7727755308151245, "learning_rate": 2.0000000000000003e-06, "loss": 0.6291, "step": 40 }, { "epoch": 0.004, "eval_cos_sim": 0.3135722279548645, "eval_loss": 0.6362530589103699, "eval_runtime": 204.4705, "eval_samples_per_second": 4.891, "eval_steps_per_second": 0.157, "step": 40 }, { "epoch": 0.005, "grad_norm": 1.492561936378479, "learning_rate": 2.5e-06, "loss": 0.619, "step": 50 }, { "epoch": 0.005, "eval_cos_sim": 0.31877169013023376, "eval_loss": 0.6310535669326782, "eval_runtime": 205.9799, "eval_samples_per_second": 4.855, "eval_steps_per_second": 0.155, "step": 50 }, { "epoch": 0.006, "grad_norm": 1.761650800704956, "learning_rate": 3e-06, "loss": 0.619, "step": 60 }, { "epoch": 0.006, "eval_cos_sim": 0.32503262162208557, "eval_loss": 0.624792218208313, "eval_runtime": 206.3179, "eval_samples_per_second": 4.847, "eval_steps_per_second": 0.155, "step": 60 }, { "epoch": 0.007, "grad_norm": 1.7647501230239868, "learning_rate": 3.5000000000000004e-06, "loss": 0.6151, "step": 70 }, { "epoch": 0.007, "eval_cos_sim": 0.33216750621795654, "eval_loss": 0.6176563501358032, "eval_runtime": 208.1576, "eval_samples_per_second": 4.804, "eval_steps_per_second": 0.154, "step": 70 }, { "epoch": 0.008, "grad_norm": 1.762241005897522, "learning_rate": 4.000000000000001e-06, "loss": 0.6257, "step": 80 }, { "epoch": 0.008, "eval_cos_sim": 0.3403419852256775, "eval_loss": 0.6094820499420166, "eval_runtime": 205.6449, "eval_samples_per_second": 4.863, "eval_steps_per_second": 0.156, "step": 80 }, { "epoch": 0.009, "grad_norm": 1.4978479146957397, "learning_rate": 4.5e-06, "loss": 0.5978, "step": 90 }, { "epoch": 0.009, "eval_cos_sim": 0.34952041506767273, "eval_loss": 0.6003041863441467, "eval_runtime": 205.9245, "eval_samples_per_second": 4.856, "eval_steps_per_second": 0.155, "step": 90 }, { "epoch": 0.01, "grad_norm": 1.4880496263504028, "learning_rate": 5e-06, "loss": 0.5816, "step": 100 }, { "epoch": 0.01, "eval_cos_sim": 0.35966718196868896, "eval_loss": 0.590160071849823, "eval_runtime": 206.7863, "eval_samples_per_second": 4.836, "eval_steps_per_second": 0.155, "step": 100 }, { "epoch": 0.011, "grad_norm": 1.4825042486190796, "learning_rate": 5.500000000000001e-06, "loss": 0.5922, "step": 110 }, { "epoch": 0.011, "eval_cos_sim": 0.37048399448394775, "eval_loss": 0.5793455243110657, "eval_runtime": 206.0982, "eval_samples_per_second": 4.852, "eval_steps_per_second": 0.155, "step": 110 }, { "epoch": 0.012, "grad_norm": 1.761700987815857, "learning_rate": 6e-06, "loss": 0.5767, "step": 120 }, { "epoch": 0.012, "eval_cos_sim": 0.38159415125846863, "eval_loss": 0.5682348608970642, "eval_runtime": 204.1513, "eval_samples_per_second": 4.898, "eval_steps_per_second": 0.157, "step": 120 }, { "epoch": 0.013, "grad_norm": 1.7859400510787964, "learning_rate": 6.5000000000000004e-06, "loss": 0.5625, "step": 130 }, { "epoch": 0.013, "eval_cos_sim": 0.39334964752197266, "eval_loss": 0.5564795732498169, "eval_runtime": 205.8548, "eval_samples_per_second": 4.858, "eval_steps_per_second": 0.155, "step": 130 }, { "epoch": 0.014, "grad_norm": 1.4824175834655762, "learning_rate": 7.000000000000001e-06, "loss": 0.5348, "step": 140 }, { "epoch": 0.014, "eval_cos_sim": 0.4060880243778229, "eval_loss": 0.5437397956848145, "eval_runtime": 205.6798, "eval_samples_per_second": 4.862, "eval_steps_per_second": 0.156, "step": 140 }, { "epoch": 0.015, "grad_norm": 1.453766107559204, "learning_rate": 7.5e-06, "loss": 0.5359, "step": 150 }, { "epoch": 0.015, "eval_cos_sim": 0.4195743799209595, "eval_loss": 0.5302525758743286, "eval_runtime": 206.2792, "eval_samples_per_second": 4.848, "eval_steps_per_second": 0.155, "step": 150 }, { "epoch": 0.016, "grad_norm": 1.7521625757217407, "learning_rate": 8.000000000000001e-06, "loss": 0.5202, "step": 160 }, { "epoch": 0.016, "eval_cos_sim": 0.43342718482017517, "eval_loss": 0.516399621963501, "eval_runtime": 206.1506, "eval_samples_per_second": 4.851, "eval_steps_per_second": 0.155, "step": 160 }, { "epoch": 0.017, "grad_norm": 1.4833202362060547, "learning_rate": 8.500000000000002e-06, "loss": 0.5133, "step": 170 }, { "epoch": 0.017, "eval_cos_sim": 0.44808587431907654, "eval_loss": 0.5017415285110474, "eval_runtime": 205.6637, "eval_samples_per_second": 4.862, "eval_steps_per_second": 0.156, "step": 170 }, { "epoch": 0.018, "grad_norm": 1.7655549049377441, "learning_rate": 9e-06, "loss": 0.5066, "step": 180 }, { "epoch": 0.018, "eval_cos_sim": 0.46300601959228516, "eval_loss": 0.4868185222148895, "eval_runtime": 206.8425, "eval_samples_per_second": 4.835, "eval_steps_per_second": 0.155, "step": 180 }, { "epoch": 0.019, "grad_norm": 1.6160492897033691, "learning_rate": 9.5e-06, "loss": 0.4772, "step": 190 }, { "epoch": 0.019, "eval_cos_sim": 0.4783403277397156, "eval_loss": 0.4714851975440979, "eval_runtime": 205.1447, "eval_samples_per_second": 4.875, "eval_steps_per_second": 0.156, "step": 190 }, { "epoch": 0.02, "grad_norm": 1.7988218069076538, "learning_rate": 1e-05, "loss": 0.4706, "step": 200 }, { "epoch": 0.02, "eval_cos_sim": 0.49314171075820923, "eval_loss": 0.45668038725852966, "eval_runtime": 206.3434, "eval_samples_per_second": 4.846, "eval_steps_per_second": 0.155, "step": 200 }, { "epoch": 0.021, "grad_norm": 1.7824742794036865, "learning_rate": 1.05e-05, "loss": 0.4531, "step": 210 }, { "epoch": 0.021, "eval_cos_sim": 0.5080329179763794, "eval_loss": 0.44177642464637756, "eval_runtime": 207.391, "eval_samples_per_second": 4.822, "eval_steps_per_second": 0.154, "step": 210 }, { "epoch": 0.022, "grad_norm": 1.800609827041626, "learning_rate": 1.1000000000000001e-05, "loss": 0.4324, "step": 220 }, { "epoch": 0.022, "eval_cos_sim": 0.5240386724472046, "eval_loss": 0.42576006054878235, "eval_runtime": 207.013, "eval_samples_per_second": 4.831, "eval_steps_per_second": 0.155, "step": 220 }, { "epoch": 0.023, "grad_norm": 1.4796422719955444, "learning_rate": 1.1500000000000002e-05, "loss": 0.4139, "step": 230 }, { "epoch": 0.023, "eval_cos_sim": 0.5389044880867004, "eval_loss": 0.410887211561203, "eval_runtime": 205.4474, "eval_samples_per_second": 4.867, "eval_steps_per_second": 0.156, "step": 230 }, { "epoch": 0.024, "grad_norm": 1.513137936592102, "learning_rate": 1.2e-05, "loss": 0.3916, "step": 240 }, { "epoch": 0.024, "eval_cos_sim": 0.5540903806686401, "eval_loss": 0.39568817615509033, "eval_runtime": 206.197, "eval_samples_per_second": 4.85, "eval_steps_per_second": 0.155, "step": 240 }, { "epoch": 0.025, "grad_norm": 1.4907095432281494, "learning_rate": 1.25e-05, "loss": 0.3867, "step": 250 }, { "epoch": 0.025, "eval_cos_sim": 0.5701205730438232, "eval_loss": 0.37963634729385376, "eval_runtime": 206.8619, "eval_samples_per_second": 4.834, "eval_steps_per_second": 0.155, "step": 250 }, { "epoch": 0.026, "grad_norm": 1.5080854892730713, "learning_rate": 1.3000000000000001e-05, "loss": 0.3893, "step": 260 }, { "epoch": 0.026, "eval_cos_sim": 0.5862915515899658, "eval_loss": 0.36344093084335327, "eval_runtime": 206.4905, "eval_samples_per_second": 4.843, "eval_steps_per_second": 0.155, "step": 260 }, { "epoch": 0.027, "grad_norm": 1.738050103187561, "learning_rate": 1.3500000000000001e-05, "loss": 0.3718, "step": 270 }, { "epoch": 0.027, "eval_cos_sim": 0.5994535088539124, "eval_loss": 0.3502576947212219, "eval_runtime": 205.2709, "eval_samples_per_second": 4.872, "eval_steps_per_second": 0.156, "step": 270 }, { "epoch": 0.028, "grad_norm": 1.7529724836349487, "learning_rate": 1.4000000000000001e-05, "loss": 0.3364, "step": 280 }, { "epoch": 0.028, "eval_cos_sim": 0.613076388835907, "eval_loss": 0.33661308884620667, "eval_runtime": 205.299, "eval_samples_per_second": 4.871, "eval_steps_per_second": 0.156, "step": 280 }, { "epoch": 0.029, "grad_norm": 1.753875732421875, "learning_rate": 1.45e-05, "loss": 0.3149, "step": 290 }, { "epoch": 0.029, "eval_cos_sim": 0.6267997622489929, "eval_loss": 0.32285839319229126, "eval_runtime": 205.7429, "eval_samples_per_second": 4.86, "eval_steps_per_second": 0.156, "step": 290 }, { "epoch": 0.03, "grad_norm": 1.4430509805679321, "learning_rate": 1.5e-05, "loss": 0.3346, "step": 300 }, { "epoch": 0.03, "eval_cos_sim": 0.6370284557342529, "eval_loss": 0.31259986758232117, "eval_runtime": 205.8712, "eval_samples_per_second": 4.857, "eval_steps_per_second": 0.155, "step": 300 }, { "epoch": 0.031, "grad_norm": 1.6388583183288574, "learning_rate": 1.55e-05, "loss": 0.2906, "step": 310 }, { "epoch": 0.031, "eval_cos_sim": 0.6471716165542603, "eval_loss": 0.3024389445781708, "eval_runtime": 205.8913, "eval_samples_per_second": 4.857, "eval_steps_per_second": 0.155, "step": 310 }, { "epoch": 0.032, "grad_norm": 1.6151902675628662, "learning_rate": 1.6000000000000003e-05, "loss": 0.2823, "step": 320 }, { "epoch": 0.032, "eval_cos_sim": 0.658047616481781, "eval_loss": 0.29153889417648315, "eval_runtime": 205.5626, "eval_samples_per_second": 4.865, "eval_steps_per_second": 0.156, "step": 320 }, { "epoch": 0.033, "grad_norm": 1.3532074689865112, "learning_rate": 1.65e-05, "loss": 0.2806, "step": 330 }, { "epoch": 0.033, "eval_cos_sim": 0.6677139401435852, "eval_loss": 0.28184273838996887, "eval_runtime": 205.9081, "eval_samples_per_second": 4.857, "eval_steps_per_second": 0.155, "step": 330 }, { "epoch": 0.034, "grad_norm": 1.5439000129699707, "learning_rate": 1.7000000000000003e-05, "loss": 0.2604, "step": 340 }, { "epoch": 0.034, "eval_cos_sim": 0.6758972406387329, "eval_loss": 0.2736455500125885, "eval_runtime": 206.6691, "eval_samples_per_second": 4.839, "eval_steps_per_second": 0.155, "step": 340 }, { "epoch": 0.035, "grad_norm": 1.4628266096115112, "learning_rate": 1.75e-05, "loss": 0.2488, "step": 350 }, { "epoch": 0.035, "eval_cos_sim": 0.6875510215759277, "eval_loss": 0.2619880139827728, "eval_runtime": 205.4389, "eval_samples_per_second": 4.868, "eval_steps_per_second": 0.156, "step": 350 }, { "epoch": 0.036, "grad_norm": 1.2534619569778442, "learning_rate": 1.8e-05, "loss": 0.2421, "step": 360 }, { "epoch": 0.036, "eval_cos_sim": 0.6983117461204529, "eval_loss": 0.2512022852897644, "eval_runtime": 205.4173, "eval_samples_per_second": 4.868, "eval_steps_per_second": 0.156, "step": 360 }, { "epoch": 0.037, "grad_norm": 1.3977046012878418, "learning_rate": 1.85e-05, "loss": 0.2185, "step": 370 }, { "epoch": 0.037, "eval_cos_sim": 0.7063655257225037, "eval_loss": 0.2431405633687973, "eval_runtime": 206.1942, "eval_samples_per_second": 4.85, "eval_steps_per_second": 0.155, "step": 370 }, { "epoch": 0.038, "grad_norm": 1.193185567855835, "learning_rate": 1.9e-05, "loss": 0.237, "step": 380 }, { "epoch": 0.038, "eval_cos_sim": 0.7140612602233887, "eval_loss": 0.23545415699481964, "eval_runtime": 204.7566, "eval_samples_per_second": 4.884, "eval_steps_per_second": 0.156, "step": 380 }, { "epoch": 0.039, "grad_norm": 1.2560912370681763, "learning_rate": 1.9500000000000003e-05, "loss": 0.244, "step": 390 }, { "epoch": 0.039, "eval_cos_sim": 0.7227933406829834, "eval_loss": 0.22671909630298615, "eval_runtime": 205.2004, "eval_samples_per_second": 4.873, "eval_steps_per_second": 0.156, "step": 390 }, { "epoch": 0.04, "grad_norm": 1.1640348434448242, "learning_rate": 2e-05, "loss": 0.2261, "step": 400 }, { "epoch": 0.04, "eval_cos_sim": 0.7307066917419434, "eval_loss": 0.21881377696990967, "eval_runtime": 206.284, "eval_samples_per_second": 4.848, "eval_steps_per_second": 0.155, "step": 400 }, { "epoch": 0.041, "grad_norm": 1.0957697629928589, "learning_rate": 2.05e-05, "loss": 0.204, "step": 410 }, { "epoch": 0.041, "eval_cos_sim": 0.7365487813949585, "eval_loss": 0.2129872441291809, "eval_runtime": 205.7321, "eval_samples_per_second": 4.861, "eval_steps_per_second": 0.156, "step": 410 }, { "epoch": 0.042, "grad_norm": 1.025030493736267, "learning_rate": 2.1e-05, "loss": 0.2039, "step": 420 }, { "epoch": 0.042, "eval_cos_sim": 0.7446093559265137, "eval_loss": 0.20492298901081085, "eval_runtime": 206.2353, "eval_samples_per_second": 4.849, "eval_steps_per_second": 0.155, "step": 420 }, { "epoch": 0.043, "grad_norm": 0.9591805934906006, "learning_rate": 2.15e-05, "loss": 0.2197, "step": 430 }, { "epoch": 0.043, "eval_cos_sim": 0.751737117767334, "eval_loss": 0.1978042721748352, "eval_runtime": 206.0346, "eval_samples_per_second": 4.854, "eval_steps_per_second": 0.155, "step": 430 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 10, "total_flos": 0.0, "train_batch_size": 140, "trial_name": null, "trial_params": null }