{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995189995189995, "eval_steps": 500, "global_step": 1039, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 1.9230769230769234e-06, "loss": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 9.615384615384616e-06, "loss": 0.0, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 1.923076923076923e-05, "loss": 0.0, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 2.8846153846153845e-05, "loss": 0.0, "step": 15 }, { "epoch": 0.02, "grad_norm": 0.0, "learning_rate": 3.846153846153846e-05, "loss": 0.0, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.0, "learning_rate": 4.8076923076923084e-05, "loss": 0.0, "step": 25 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 5.769230769230769e-05, "loss": 0.0, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 6.730769230769232e-05, "loss": 0.0, "step": 35 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 7.692307692307693e-05, "loss": 0.0, "step": 40 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 8.653846153846155e-05, "loss": 0.0, "step": 45 }, { "epoch": 0.05, "grad_norm": 0.0, "learning_rate": 9.615384615384617e-05, "loss": 0.0, "step": 50 }, { "epoch": 0.05, "grad_norm": 0.0, "learning_rate": 0.00010576923076923077, "loss": 0.0, "step": 55 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.00011538461538461538, "loss": 0.0, "step": 60 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.000125, "loss": 0.0, "step": 65 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00013461538461538464, "loss": 0.0, "step": 70 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00014423076923076924, "loss": 0.0, "step": 75 }, { "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 0.00015384615384615385, "loss": 0.0, "step": 80 }, { "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 0.00016346153846153846, "loss": 0.0, "step": 85 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 0.0001730769230769231, "loss": 0.0, "step": 90 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 0.0001826923076923077, "loss": 0.0, "step": 95 }, { "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 0.00019230769230769233, "loss": 0.0, "step": 100 }, { "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 0.00019999943552317104, "loss": 0.0, "step": 105 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.00019997967950328128, "loss": 0.0, "step": 110 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.0001999317060143023, "loss": 0.0, "step": 115 }, { "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 0.0001998555285958899, "loss": 0.0, "step": 120 }, { "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 0.00019975116874775242, "loss": 0.0, "step": 125 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.00019961865592358288, "loss": 0.0, "step": 130 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.0001994580275227462, "loss": 0.0, "step": 135 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.00019926932887972393, "loss": 0.0, "step": 140 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.0001990526132513194, "loss": 0.0, "step": 145 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.00019880794180162693, "loss": 0.0, "step": 150 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.00019853538358476932, "loss": 0.0, "step": 155 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.00019823501552540865, "loss": 0.0, "step": 160 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 0.00019790692239703557, "loss": 0.0, "step": 165 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 0.00019755119679804367, "loss": 0.0, "step": 170 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 0.00019716793912559507, "loss": 0.0, "step": 175 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 0.00019675725754728527, "loss": 0.0, "step": 180 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.00019631926797061456, "loss": 0.0, "step": 185 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.00019585409401027556, "loss": 0.0, "step": 190 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.00019536186695326486, "loss": 0.0, "step": 195 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.00019484272572182986, "loss": 0.0, "step": 200 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.00019429681683426022, "loss": 0.0, "step": 205 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.00019372429436353606, "loss": 0.0, "step": 210 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.0001931253198938432, "loss": 0.0, "step": 215 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.00019250006247496928, "loss": 0.0, "step": 220 }, { "epoch": 0.22, "grad_norm": 0.0, "learning_rate": 0.00019184869857459232, "loss": 0.0, "step": 225 }, { "epoch": 0.22, "grad_norm": 0.0, "learning_rate": 0.00019117141202847586, "loss": 0.0, "step": 230 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 0.00019046839398858474, "loss": 0.0, "step": 235 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 0.00018973984286913584, "loss": 0.0, "step": 240 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.0001889859642905992, "loss": 0.0, "step": 245 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.00018820697102166526, "loss": 0.0, "step": 250 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 0.00018740308291919497, "loss": 0.0, "step": 255 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 0.0001865745268661689, "loss": 0.0, "step": 260 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 0.00018572153670765365, "loss": 0.0, "step": 265 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.00018484435318480332, "loss": 0.0, "step": 270 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.0001839432238669147, "loss": 0.0, "step": 275 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.00018301840308155507, "loss": 0.0, "step": 280 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.00018207015184278305, "loss": 0.0, "step": 285 }, { "epoch": 0.28, "grad_norm": 0.0, "learning_rate": 0.000181098737777482, "loss": 0.0, "step": 290 }, { "epoch": 0.28, "grad_norm": 0.0, "learning_rate": 0.00018010443504982694, "loss": 0.0, "step": 295 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.000179087524283907, "loss": 0.0, "step": 300 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.00017804829248452395, "loss": 0.0, "step": 305 }, { "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 0.00017698703295619052, "loss": 0.0, "step": 310 }, { "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 0.00017590404522035028, "loss": 0.0, "step": 315 }, { "epoch": 0.31, "grad_norm": 0.0, "learning_rate": 0.00017479963493084329, "loss": 0.0, "step": 320 }, { "epoch": 0.31, "grad_norm": 0.0, "learning_rate": 0.0001736741137876405, "loss": 0.0, "step": 325 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.00017252779944887235, "loss": 0.0, "step": 330 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.00017136101544117525, "loss": 0.0, "step": 335 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.00017017409106838207, "loss": 0.0, "step": 340 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.00016896736131858208, "loss": 0.0, "step": 345 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.0001677411667695765, "loss": 0.0, "step": 350 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.00016649585349275662, "loss": 0.0, "step": 355 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.0001652317729554313, "loss": 0.0, "step": 360 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.0001639492819216316, "loss": 0.0, "step": 365 }, { "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 0.0001626487423514207, "loss": 0.0, "step": 370 }, { "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 0.00016133052129873693, "loss": 0.0, "step": 375 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.0001599949908077996, "loss": 0.0, "step": 380 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.00015864252780810616, "loss": 0.0, "step": 385 }, { "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 0.00015727351400805052, "loss": 0.0, "step": 390 }, { "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 0.0001558883357871928, "loss": 0.0, "step": 395 }, { "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 0.00015448738408721052, "loss": 0.0, "step": 400 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.00015307105430156255, "loss": 0.0, "step": 405 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.0001516397461638962, "loss": 0.0, "step": 410 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.0001501938636352297, "loss": 0.0, "step": 415 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.00014873381478994134, "loss": 0.0, "step": 420 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.00014726001170059792, "loss": 0.0, "step": 425 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.00014577287032165468, "loss": 0.0, "step": 430 }, { "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 0.00014427281037205945, "loss": 0.0, "step": 435 }, { "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 0.00014276025521679471, "loss": 0.0, "step": 440 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.00014123563174739037, "loss": 0.0, "step": 445 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.00013969937026144118, "loss": 0.0, "step": 450 }, { "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 0.00013815190434116317, "loss": 0.0, "step": 455 }, { "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 0.00013659367073102268, "loss": 0.0, "step": 460 }, { "epoch": 0.45, "grad_norm": 0.0, "learning_rate": 0.00013502510921447323, "loss": 0.0, "step": 465 }, { "epoch": 0.45, "grad_norm": 0.0, "learning_rate": 0.00013344666248983432, "loss": 0.0, "step": 470 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 0.000131858776045348, "loss": 0.0, "step": 475 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 0.00013026189803344774, "loss": 0.0, "step": 480 }, { "epoch": 0.47, "grad_norm": 0.0, "learning_rate": 0.00012865647914427544, "loss": 0.0, "step": 485 }, { "epoch": 0.47, "grad_norm": 0.0, "learning_rate": 0.00012704297247848216, "loss": 0.0, "step": 490 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.00012542183341934872, "loss": 0.0, "step": 495 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.00012379351950426187, "loss": 0.0, "step": 500 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.0001221584902955827, "loss": 0.0, "step": 505 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.00012051720725094324, "loss": 0.0, "step": 510 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.00011887013359300837, "loss": 0.0, "step": 515 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.00011721773417873965, "loss": 0.0, "step": 520 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.00011556047536819777, "loss": 0.0, "step": 525 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.00011389882489292061, "loss": 0.0, "step": 530 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.0001122332517239147, "loss": 0.0, "step": 535 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 0.00011056422593929635, "loss": 0.0, "step": 540 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 0.00010889221859162062, "loss": 0.0, "step": 545 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.00010721770157493527, "loss": 0.0, "step": 550 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.000105541147491597, "loss": 0.0, "step": 555 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.00010386302951888804, "loss": 0.0, "step": 560 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.00010218382127547022, "loss": 0.0, "step": 565 }, { "epoch": 0.55, "grad_norm": 0.0, "learning_rate": 0.00010050399668771479, "loss": 0.0, "step": 570 }, { "epoch": 0.55, "grad_norm": 0.0, "learning_rate": 9.882402985594515e-05, "loss": 0.0, "step": 575 }, { "epoch": 0.56, "grad_norm": 0.0, "learning_rate": 9.71443949206304e-05, "loss": 0.0, "step": 580 }, { "epoch": 0.56, "grad_norm": 0.0, "learning_rate": 9.546556592856789e-05, "loss": 0.0, "step": 585 }, { "epoch": 0.57, "grad_norm": 0.0, "learning_rate": 9.378801669909197e-05, "loss": 0.0, "step": 590 }, { "epoch": 0.57, "grad_norm": 0.0, "learning_rate": 9.211222069034695e-05, "loss": 0.0, "step": 595 }, { "epoch": 0.58, "grad_norm": 0.0, "learning_rate": 9.043865086566214e-05, "loss": 0.0, "step": 600 }, { "epoch": 0.58, "grad_norm": 0.0, "learning_rate": 8.87677795600663e-05, "loss": 0.0, "step": 605 }, { "epoch": 0.59, "grad_norm": 0.0, "learning_rate": 8.710007834697969e-05, "loss": 0.0, "step": 610 }, { "epoch": 0.59, "grad_norm": 0.0, "learning_rate": 8.543601790512083e-05, "loss": 0.0, "step": 615 }, { "epoch": 0.6, "grad_norm": 0.0, "learning_rate": 8.377606788566597e-05, "loss": 0.0, "step": 620 }, { "epoch": 0.6, "grad_norm": 0.0, "learning_rate": 8.212069677969851e-05, "loss": 0.0, "step": 625 }, { "epoch": 0.61, "grad_norm": 0.0, "learning_rate": 8.047037178598567e-05, "loss": 0.0, "step": 630 }, { "epoch": 0.61, "grad_norm": 0.0, "learning_rate": 7.882555867912017e-05, "loss": 0.0, "step": 635 }, { "epoch": 0.62, "grad_norm": 0.0, "learning_rate": 7.718672167806354e-05, "loss": 0.0, "step": 640 }, { "epoch": 0.62, "grad_norm": 0.0, "learning_rate": 7.55543233151289e-05, "loss": 0.0, "step": 645 }, { "epoch": 0.63, "grad_norm": 0.0, "learning_rate": 7.392882430543928e-05, "loss": 0.0, "step": 650 }, { "epoch": 0.63, "grad_norm": 0.0, "learning_rate": 7.231068341689923e-05, "loss": 0.0, "step": 655 }, { "epoch": 0.63, "grad_norm": 0.0, "learning_rate": 7.070035734071574e-05, "loss": 0.0, "step": 660 }, { "epoch": 0.64, "grad_norm": 0.0, "learning_rate": 6.909830056250527e-05, "loss": 0.0, "step": 665 }, { "epoch": 0.64, "grad_norm": 0.0, "learning_rate": 6.750496523402352e-05, "loss": 0.0, "step": 670 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 6.592080104555357e-05, "loss": 0.0, "step": 675 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 6.434625509898897e-05, "loss": 0.0, "step": 680 }, { "epoch": 0.66, "grad_norm": 0.0, "learning_rate": 6.278177178164721e-05, "loss": 0.0, "step": 685 }, { "epoch": 0.66, "grad_norm": 0.0, "learning_rate": 6.122779264084932e-05, "loss": 0.0, "step": 690 }, { "epoch": 0.67, "grad_norm": 0.0, "learning_rate": 5.968475625930124e-05, "loss": 0.0, "step": 695 }, { "epoch": 0.67, "grad_norm": 0.0, "learning_rate": 5.815309813131153e-05, "loss": 0.0, "step": 700 }, { "epoch": 0.68, "grad_norm": 0.0, "learning_rate": 5.663325053988112e-05, "loss": 0.0, "step": 705 }, { "epoch": 0.68, "grad_norm": 0.0, "learning_rate": 5.5125642434699044e-05, "loss": 0.0, "step": 710 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 5.363069931107902e-05, "loss": 0.0, "step": 715 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 5.214884308987136e-05, "loss": 0.0, "step": 720 }, { "epoch": 0.7, "grad_norm": 0.0, "learning_rate": 5.068049199838307e-05, "loss": 0.0, "step": 725 }, { "epoch": 0.7, "grad_norm": 0.0, "learning_rate": 4.9226060452340825e-05, "loss": 0.0, "step": 730 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 4.7785958938929644e-05, "loss": 0.0, "step": 735 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 4.6360593900940074e-05, "loss": 0.0, "step": 740 }, { "epoch": 0.72, "grad_norm": 0.0, "learning_rate": 4.4950367622057173e-05, "loss": 0.0, "step": 745 }, { "epoch": 0.72, "grad_norm": 0.0, "learning_rate": 4.355567811332311e-05, "loss": 0.0, "step": 750 }, { "epoch": 0.73, "grad_norm": 0.0, "learning_rate": 4.21769190008056e-05, "loss": 0.0, "step": 755 }, { "epoch": 0.73, "grad_norm": 0.0, "learning_rate": 4.081447941450428e-05, "loss": 0.0, "step": 760 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 3.946874387852545e-05, "loss": 0.0, "step": 765 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 3.8140092202557185e-05, "loss": 0.0, "step": 770 }, { "epoch": 0.75, "grad_norm": 0.0, "learning_rate": 3.682889937467493e-05, "loss": 0.0, "step": 775 }, { "epoch": 0.75, "grad_norm": 0.0, "learning_rate": 3.553553545550768e-05, "loss": 0.0, "step": 780 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 3.426036547379528e-05, "loss": 0.0, "step": 785 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 3.300374932336533e-05, "loss": 0.0, "step": 790 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 3.176604166155976e-05, "loss": 0.0, "step": 795 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 3.054759180913921e-05, "loss": 0.0, "step": 800 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 2.9348743651693357e-05, "loss": 0.0, "step": 805 }, { "epoch": 0.78, "grad_norm": 0.0, "learning_rate": 2.8169835542585587e-05, "loss": 0.0, "step": 810 }, { "epoch": 0.78, "grad_norm": 0.0, "learning_rate": 2.7011200207458677e-05, "loss": 0.0, "step": 815 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 2.5873164650328996e-05, "loss": 0.0, "step": 820 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 2.4756050061295534e-05, "loss": 0.0, "step": 825 }, { "epoch": 0.8, "grad_norm": 0.0, "learning_rate": 2.36601717258897e-05, "loss": 0.0, "step": 830 }, { "epoch": 0.8, "grad_norm": 0.0, "learning_rate": 2.2585838936091754e-05, "loss": 0.0, "step": 835 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 2.153335490303856e-05, "loss": 0.0, "step": 840 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 2.0503016671447785e-05, "loss": 0.0, "step": 845 }, { "epoch": 0.82, "grad_norm": 0.0, "learning_rate": 1.9495115035782307e-05, "loss": 0.0, "step": 850 }, { "epoch": 0.82, "grad_norm": 0.0, "learning_rate": 1.8509934458178712e-05, "loss": 0.0, "step": 855 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 1.754775298816307e-05, "loss": 0.0, "step": 860 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 1.6608842184176243e-05, "loss": 0.0, "step": 865 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 1.5693467036931576e-05, "loss": 0.0, "step": 870 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 1.48018858946259e-05, "loss": 0.0, "step": 875 }, { "epoch": 0.85, "grad_norm": 0.0, "learning_rate": 1.3934350390025463e-05, "loss": 0.0, "step": 880 }, { "epoch": 0.85, "grad_norm": 0.0, "learning_rate": 1.3091105369447165e-05, "loss": 0.0, "step": 885 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 1.22723888236549e-05, "loss": 0.0, "step": 890 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 1.1478431820691083e-05, "loss": 0.0, "step": 895 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 1.0709458440661801e-05, "loss": 0.0, "step": 900 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 9.965685712494199e-06, "loss": 0.0, "step": 905 }, { "epoch": 0.88, "grad_norm": 0.0, "learning_rate": 9.247323552684051e-06, "loss": 0.0, "step": 910 }, { "epoch": 0.88, "grad_norm": 0.0, "learning_rate": 8.554574706050488e-06, "loss": 0.0, "step": 915 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 7.887634688515e-06, "loss": 0.0, "step": 920 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 7.246691731920485e-06, "loss": 0.0, "step": 925 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 6.631926730906324e-06, "loss": 0.0, "step": 930 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 6.043513191853978e-06, "loss": 0.0, "step": 935 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 5.481617183918053e-06, "loss": 0.0, "step": 940 }, { "epoch": 0.91, "grad_norm": 0.0, "learning_rate": 4.946397292156158e-06, "loss": 0.0, "step": 945 }, { "epoch": 0.91, "grad_norm": 0.0, "learning_rate": 4.438004572771182e-06, "loss": 0.0, "step": 950 }, { "epoch": 0.92, "grad_norm": 0.0, "learning_rate": 3.9565825104783685e-06, "loss": 0.0, "step": 955 }, { "epoch": 0.92, "grad_norm": 0.0, "learning_rate": 3.5022669780093497e-06, "loss": 0.0, "step": 960 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 3.0751861977645125e-06, "loss": 0.0, "step": 965 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 2.6754607056244883e-06, "loss": 0.0, "step": 970 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 2.303203316931102e-06, "loss": 0.0, "step": 975 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 1.9585190946472488e-06, "loss": 0.0, "step": 980 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 1.6415053197047725e-06, "loss": 0.0, "step": 985 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 1.3522514635486816e-06, "loss": 0.0, "step": 990 }, { "epoch": 0.96, "grad_norm": 0.0, "learning_rate": 1.0908391628854041e-06, "loss": 0.0, "step": 995 }, { "epoch": 0.96, "grad_norm": 0.0, "learning_rate": 8.57342196642319e-07, "loss": 0.0, "step": 1000 }, { "epoch": 0.97, "grad_norm": 0.0, "learning_rate": 6.518264651449779e-07, "loss": 0.0, "step": 1005 }, { "epoch": 0.97, "grad_norm": 0.0, "learning_rate": 4.743499715179067e-07, "loss": 0.0, "step": 1010 }, { "epoch": 0.98, "grad_norm": 0.0, "learning_rate": 3.249628053142884e-07, "loss": 0.0, "step": 1015 }, { "epoch": 0.98, "grad_norm": 0.0, "learning_rate": 2.0370712837906037e-07, "loss": 0.0, "step": 1020 }, { "epoch": 0.99, "grad_norm": 0.0, "learning_rate": 1.1061716294951118e-07, "loss": 0.0, "step": 1025 }, { "epoch": 0.99, "grad_norm": 0.0, "learning_rate": 4.5719181996650705e-08, "loss": 0.0, "step": 1030 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 9.031501810174981e-09, "loss": 0.0, "step": 1035 }, { "epoch": 1.0, "eval_loss": NaN, "eval_runtime": 181.8174, "eval_samples_per_second": 12.705, "eval_steps_per_second": 0.798, "step": 1039 }, { "epoch": 1.0, "step": 1039, "total_flos": 5523816681832448.0, "train_loss": 0.0, "train_runtime": 10243.5321, "train_samples_per_second": 3.247, "train_steps_per_second": 0.101 } ], "logging_steps": 5, "max_steps": 1039, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 5523816681832448.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }