{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 678, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.9411764705882355e-06, "loss": 1.0978, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.4705882352941177e-05, "loss": 1.1379, "step": 5 }, { "epoch": 0.04, "learning_rate": 2.9411764705882354e-05, "loss": 0.9998, "step": 10 }, { "epoch": 0.07, "learning_rate": 4.411764705882353e-05, "loss": 0.8656, "step": 15 }, { "epoch": 0.09, "learning_rate": 5.882352941176471e-05, "loss": 0.7844, "step": 20 }, { "epoch": 0.11, "learning_rate": 7.352941176470589e-05, "loss": 0.7223, "step": 25 }, { "epoch": 0.13, "learning_rate": 8.823529411764706e-05, "loss": 0.672, "step": 30 }, { "epoch": 0.15, "learning_rate": 0.00010294117647058823, "loss": 0.6365, "step": 35 }, { "epoch": 0.18, "learning_rate": 0.00011764705882352942, "loss": 0.6207, "step": 40 }, { "epoch": 0.2, "learning_rate": 0.0001323529411764706, "loss": 0.6073, "step": 45 }, { "epoch": 0.22, "learning_rate": 0.00014705882352941178, "loss": 0.6086, "step": 50 }, { "epoch": 0.24, "learning_rate": 0.00016176470588235295, "loss": 0.5898, "step": 55 }, { "epoch": 0.27, "learning_rate": 0.00017647058823529413, "loss": 0.5777, "step": 60 }, { "epoch": 0.29, "learning_rate": 0.0001911764705882353, "loss": 0.5774, "step": 65 }, { "epoch": 0.31, "learning_rate": 0.00019999469523400122, "loss": 0.5809, "step": 70 }, { "epoch": 0.33, "learning_rate": 0.00019993502307985724, "loss": 0.5749, "step": 75 }, { "epoch": 0.35, "learning_rate": 0.00019980908751249555, "loss": 0.5782, "step": 80 }, { "epoch": 0.38, "learning_rate": 0.00019961697203537952, "loss": 0.5809, "step": 85 }, { "epoch": 0.4, "learning_rate": 0.00019935880403355253, "loss": 0.5669, "step": 90 }, { "epoch": 0.42, "learning_rate": 0.0001990347546891733, "loss": 0.573, "step": 95 }, { "epoch": 0.44, "learning_rate": 0.00019864503886801106, "loss": 0.5774, "step": 100 }, { "epoch": 0.46, "learning_rate": 0.00019818991497697565, "loss": 0.58, "step": 105 }, { "epoch": 0.49, "learning_rate": 0.00019766968479277683, "loss": 0.577, "step": 110 }, { "epoch": 0.51, "learning_rate": 0.00019708469326182678, "loss": 0.5631, "step": 115 }, { "epoch": 0.53, "learning_rate": 0.0001964353282715183, "loss": 0.5676, "step": 120 }, { "epoch": 0.55, "learning_rate": 0.00019572202039303, "loss": 0.5722, "step": 125 }, { "epoch": 0.58, "learning_rate": 0.00019494524259582992, "loss": 0.5665, "step": 130 }, { "epoch": 0.6, "learning_rate": 0.00019410550993406603, "loss": 0.5767, "step": 135 }, { "epoch": 0.62, "learning_rate": 0.00019320337920505153, "loss": 0.5553, "step": 140 }, { "epoch": 0.64, "learning_rate": 0.00019223944858007253, "loss": 0.5646, "step": 145 }, { "epoch": 0.66, "learning_rate": 0.00019121435720776122, "loss": 0.5625, "step": 150 }, { "epoch": 0.69, "learning_rate": 0.00019012878479029906, "loss": 0.5611, "step": 155 }, { "epoch": 0.71, "learning_rate": 0.00018898345113272998, "loss": 0.5602, "step": 160 }, { "epoch": 0.73, "learning_rate": 0.00018777911566568282, "loss": 0.5731, "step": 165 }, { "epoch": 0.75, "learning_rate": 0.0001865165769418196, "loss": 0.5587, "step": 170 }, { "epoch": 0.77, "learning_rate": 0.0001851966721063431, "loss": 0.5729, "step": 175 }, { "epoch": 0.8, "learning_rate": 0.00018382027634191524, "loss": 0.5558, "step": 180 }, { "epoch": 0.82, "learning_rate": 0.00018238830228835417, "loss": 0.55, "step": 185 }, { "epoch": 0.84, "learning_rate": 0.00018090169943749476, "loss": 0.5561, "step": 190 }, { "epoch": 0.86, "learning_rate": 0.0001793614535036137, "loss": 0.552, "step": 195 }, { "epoch": 0.88, "learning_rate": 0.00017776858576983712, "loss": 0.5615, "step": 200 }, { "epoch": 0.91, "learning_rate": 0.00017612415241096327, "loss": 0.5577, "step": 205 }, { "epoch": 0.93, "learning_rate": 0.0001744292437931502, "loss": 0.559, "step": 210 }, { "epoch": 0.95, "learning_rate": 0.00017268498375093202, "loss": 0.5467, "step": 215 }, { "epoch": 0.97, "learning_rate": 0.00017089252884204377, "loss": 0.564, "step": 220 }, { "epoch": 1.0, "learning_rate": 0.0001690530675805482, "loss": 0.5498, "step": 225 }, { "epoch": 1.0, "eval_loss": 0.5578014254570007, "eval_runtime": 40.6018, "eval_samples_per_second": 1.33, "eval_steps_per_second": 0.172, "step": 226 }, { "epoch": 1.02, "learning_rate": 0.0001671678196487741, "loss": 0.5024, "step": 230 }, { "epoch": 1.04, "learning_rate": 0.00016523803508858732, "loss": 0.4999, "step": 235 }, { "epoch": 1.06, "learning_rate": 0.00016326499347253207, "loss": 0.4973, "step": 240 }, { "epoch": 1.08, "learning_rate": 0.00016125000305539094, "loss": 0.4988, "step": 245 }, { "epoch": 1.11, "learning_rate": 0.0001591943999067273, "loss": 0.5004, "step": 250 }, { "epoch": 1.13, "learning_rate": 0.00015709954702498397, "loss": 0.4931, "step": 255 }, { "epoch": 1.15, "learning_rate": 0.0001549668334337271, "loss": 0.4909, "step": 260 }, { "epoch": 1.17, "learning_rate": 0.00015279767326063296, "loss": 0.4924, "step": 265 }, { "epoch": 1.19, "learning_rate": 0.00015059350479982965, "loss": 0.4958, "step": 270 }, { "epoch": 1.22, "learning_rate": 0.00014835578955821457, "loss": 0.4941, "step": 275 }, { "epoch": 1.24, "learning_rate": 0.00014608601128638027, "loss": 0.4854, "step": 280 }, { "epoch": 1.26, "learning_rate": 0.00014378567499479168, "loss": 0.4858, "step": 285 }, { "epoch": 1.28, "learning_rate": 0.00014145630595586607, "loss": 0.4914, "step": 290 }, { "epoch": 1.31, "learning_rate": 0.00013909944869261855, "loss": 0.4841, "step": 295 }, { "epoch": 1.33, "learning_rate": 0.00013671666595454295, "loss": 0.4931, "step": 300 }, { "epoch": 1.35, "learning_rate": 0.00013430953768140723, "loss": 0.5001, "step": 305 }, { "epoch": 1.37, "learning_rate": 0.00013187965995565098, "loss": 0.5034, "step": 310 }, { "epoch": 1.39, "learning_rate": 0.00012942864394407879, "loss": 0.4865, "step": 315 }, { "epoch": 1.42, "learning_rate": 0.00012695811482955227, "loss": 0.4919, "step": 320 }, { "epoch": 1.44, "learning_rate": 0.000124469710733388, "loss": 0.4806, "step": 325 }, { "epoch": 1.46, "learning_rate": 0.00012196508162917677, "loss": 0.4887, "step": 330 }, { "epoch": 1.48, "learning_rate": 0.00011944588824874388, "loss": 0.4826, "step": 335 }, { "epoch": 1.5, "learning_rate": 0.00011691380098097597, "loss": 0.4882, "step": 340 }, { "epoch": 1.53, "learning_rate": 0.0001143704987642448, "loss": 0.4881, "step": 345 }, { "epoch": 1.55, "learning_rate": 0.0001118176679731619, "loss": 0.4896, "step": 350 }, { "epoch": 1.57, "learning_rate": 0.00010925700130040266, "loss": 0.4799, "step": 355 }, { "epoch": 1.59, "learning_rate": 0.00010669019663434117, "loss": 0.4853, "step": 360 }, { "epoch": 1.62, "learning_rate": 0.00010411895593323981, "loss": 0.4885, "step": 365 }, { "epoch": 1.64, "learning_rate": 0.00010154498409674051, "loss": 0.4864, "step": 370 }, { "epoch": 1.66, "learning_rate": 9.896998783540536e-05, "loss": 0.4894, "step": 375 }, { "epoch": 1.68, "learning_rate": 9.639567453905661e-05, "loss": 0.4846, "step": 380 }, { "epoch": 1.7, "learning_rate": 9.382375114466644e-05, "loss": 0.481, "step": 385 }, { "epoch": 1.73, "learning_rate": 9.125592300454676e-05, "loss": 0.4834, "step": 390 }, { "epoch": 1.75, "learning_rate": 8.869389275558972e-05, "loss": 0.4916, "step": 395 }, { "epoch": 1.77, "learning_rate": 8.613935919030907e-05, "loss": 0.4831, "step": 400 }, { "epoch": 1.79, "learning_rate": 8.359401613043012e-05, "loss": 0.4769, "step": 405 }, { "epoch": 1.81, "learning_rate": 8.1059551303776e-05, "loss": 0.4839, "step": 410 }, { "epoch": 1.84, "learning_rate": 7.853764522519445e-05, "loss": 0.4901, "step": 415 }, { "epoch": 1.86, "learning_rate": 7.602997008226726e-05, "loss": 0.4893, "step": 420 }, { "epoch": 1.88, "learning_rate": 7.353818862654129e-05, "loss": 0.4855, "step": 425 }, { "epoch": 1.9, "learning_rate": 7.106395307101621e-05, "loss": 0.476, "step": 430 }, { "epoch": 1.92, "learning_rate": 6.860890399461974e-05, "loss": 0.481, "step": 435 }, { "epoch": 1.95, "learning_rate": 6.617466925439746e-05, "loss": 0.482, "step": 440 }, { "epoch": 1.97, "learning_rate": 6.376286290613776e-05, "loss": 0.494, "step": 445 }, { "epoch": 1.99, "learning_rate": 6.137508413414784e-05, "loss": 0.4788, "step": 450 }, { "epoch": 2.0, "eval_loss": 0.557712197303772, "eval_runtime": 40.1816, "eval_samples_per_second": 1.344, "eval_steps_per_second": 0.174, "step": 452 }, { "epoch": 2.01, "learning_rate": 5.901291619089081e-05, "loss": 0.4471, "step": 455 }, { "epoch": 2.04, "learning_rate": 5.667792534718639e-05, "loss": 0.4053, "step": 460 }, { "epoch": 2.06, "learning_rate": 5.437165985367145e-05, "loss": 0.4078, "step": 465 }, { "epoch": 2.08, "learning_rate": 5.2095648914209525e-05, "loss": 0.4129, "step": 470 }, { "epoch": 2.1, "learning_rate": 4.9851401671928934e-05, "loss": 0.41, "step": 475 }, { "epoch": 2.12, "learning_rate": 4.7640406208563224e-05, "loss": 0.3972, "step": 480 }, { "epoch": 2.15, "learning_rate": 4.546412855775595e-05, "loss": 0.3995, "step": 485 }, { "epoch": 2.17, "learning_rate": 4.3324011732985433e-05, "loss": 0.3996, "step": 490 }, { "epoch": 2.19, "learning_rate": 4.12214747707527e-05, "loss": 0.4089, "step": 495 }, { "epoch": 2.21, "learning_rate": 3.9157911789668525e-05, "loss": 0.4055, "step": 500 }, { "epoch": 2.23, "learning_rate": 3.7134691066061825e-05, "loss": 0.401, "step": 505 }, { "epoch": 2.26, "learning_rate": 3.515315412672384e-05, "loss": 0.3981, "step": 510 }, { "epoch": 2.28, "learning_rate": 3.321461485938847e-05, "loss": 0.4027, "step": 515 }, { "epoch": 2.3, "learning_rate": 3.132035864153958e-05, "loss": 0.3921, "step": 520 }, { "epoch": 2.32, "learning_rate": 2.947164148812198e-05, "loss": 0.4034, "step": 525 }, { "epoch": 2.35, "learning_rate": 2.766968921872213e-05, "loss": 0.4035, "step": 530 }, { "epoch": 2.37, "learning_rate": 2.5915696644769915e-05, "loss": 0.3863, "step": 535 }, { "epoch": 2.39, "learning_rate": 2.4210826777301153e-05, "loss": 0.3936, "step": 540 }, { "epoch": 2.41, "learning_rate": 2.2556210055805793e-05, "loss": 0.4058, "step": 545 }, { "epoch": 2.43, "learning_rate": 2.0952943598672846e-05, "loss": 0.4054, "step": 550 }, { "epoch": 2.46, "learning_rate": 1.9402090475729795e-05, "loss": 0.3956, "step": 555 }, { "epoch": 2.48, "learning_rate": 1.7904679003358283e-05, "loss": 0.4115, "step": 560 }, { "epoch": 2.5, "learning_rate": 1.6461702062653307e-05, "loss": 0.3995, "step": 565 }, { "epoch": 2.52, "learning_rate": 1.507411644107879e-05, "loss": 0.389, "step": 570 }, { "epoch": 2.54, "learning_rate": 1.3742842198055173e-05, "loss": 0.394, "step": 575 }, { "epoch": 2.57, "learning_rate": 1.2468762054900262e-05, "loss": 0.4082, "step": 580 }, { "epoch": 2.59, "learning_rate": 1.1252720809527751e-05, "loss": 0.4037, "step": 585 }, { "epoch": 2.61, "learning_rate": 1.0095524776291165e-05, "loss": 0.4072, "step": 590 }, { "epoch": 2.63, "learning_rate": 8.997941251345177e-06, "loss": 0.4015, "step": 595 }, { "epoch": 2.65, "learning_rate": 7.960698003878309e-06, "loss": 0.3952, "step": 600 }, { "epoch": 2.68, "learning_rate": 6.984482793554781e-06, "loss": 0.3996, "step": 605 }, { "epoch": 2.7, "learning_rate": 6.06994291448505e-06, "loss": 0.3877, "step": 610 }, { "epoch": 2.72, "learning_rate": 5.217684766027886e-06, "loss": 0.4102, "step": 615 }, { "epoch": 2.74, "learning_rate": 4.428273450708065e-06, "loss": 0.4056, "step": 620 }, { "epoch": 2.77, "learning_rate": 3.702232399516803e-06, "loss": 0.4014, "step": 625 }, { "epoch": 2.79, "learning_rate": 3.040043024842898e-06, "loss": 0.409, "step": 630 }, { "epoch": 2.81, "learning_rate": 2.4421444012651564e-06, "loss": 0.4061, "step": 635 }, { "epoch": 2.83, "learning_rate": 1.908932974417266e-06, "loss": 0.4087, "step": 640 }, { "epoch": 2.85, "learning_rate": 1.4407622981187253e-06, "loss": 0.3909, "step": 645 }, { "epoch": 2.88, "learning_rate": 1.0379427999456015e-06, "loss": 0.4103, "step": 650 }, { "epoch": 2.9, "learning_rate": 7.007415753969348e-07, "loss": 0.4012, "step": 655 }, { "epoch": 2.92, "learning_rate": 4.2938221079300566e-07, "loss": 0.4029, "step": 660 }, { "epoch": 2.94, "learning_rate": 2.2404463502315732e-07, "loss": 0.4052, "step": 665 }, { "epoch": 2.96, "learning_rate": 8.486500024118548e-08, "loss": 0.3996, "step": 670 }, { "epoch": 2.99, "learning_rate": 1.1935591587641792e-08, "loss": 0.3871, "step": 675 }, { "epoch": 3.0, "eval_loss": 0.5821943879127502, "eval_runtime": 40.1449, "eval_samples_per_second": 1.345, "eval_steps_per_second": 0.174, "step": 678 }, { "epoch": 3.0, "step": 678, "total_flos": 4.7593079718700646e+17, "train_loss": 0.5003105528586733, "train_runtime": 13978.5586, "train_samples_per_second": 0.387, "train_steps_per_second": 0.049 } ], "logging_steps": 5, "max_steps": 678, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 4.7593079718700646e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }