{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 1.4814814814814817e-06, "loss": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 7.4074074074074075e-06, "loss": 0.0, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 1.4814814814814815e-05, "loss": 0.0, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 2.2222222222222223e-05, "loss": 0.0, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 2.962962962962963e-05, "loss": 0.0, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.0, "learning_rate": 3.7037037037037037e-05, "loss": 0.0, "step": 25 }, { "epoch": 0.02, "grad_norm": 0.0, "learning_rate": 4.4444444444444447e-05, "loss": 0.0, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 5.185185185185185e-05, "loss": 0.0, "step": 35 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 5.925925925925926e-05, "loss": 0.0, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 6.666666666666667e-05, "loss": 0.0, "step": 45 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 7.407407407407407e-05, "loss": 0.0, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 8.148148148148148e-05, "loss": 0.0, "step": 55 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 8.888888888888889e-05, "loss": 0.0, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.0, "learning_rate": 9.62962962962963e-05, "loss": 0.0, "step": 65 }, { "epoch": 0.05, "grad_norm": 0.0, "learning_rate": 0.0001037037037037037, "loss": 0.0, "step": 70 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.00011111111111111112, "loss": 0.0, "step": 75 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.00011851851851851852, "loss": 0.0, "step": 80 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.00012592592592592592, "loss": 0.0, "step": 85 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00013333333333333334, "loss": 0.0, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00014074074074074076, "loss": 0.0, "step": 95 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00014814814814814815, "loss": 0.0, "step": 100 }, { "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 0.00015555555555555556, "loss": 0.0, "step": 105 }, { "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 0.00016296296296296295, "loss": 0.0, "step": 110 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 0.00017037037037037037, "loss": 0.0, "step": 115 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 0.00017777777777777779, "loss": 0.0, "step": 120 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 0.0001851851851851852, "loss": 0.0, "step": 125 }, { "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 0.0001925925925925926, "loss": 0.0, "step": 130 }, { "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 0.0002, "loss": 0.0, "step": 135 }, { "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 0.00019999164298554375, "loss": 0.0, "step": 140 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.00019996657333896877, "loss": 0.0, "step": 145 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.00019992479525042303, "loss": 0.0, "step": 150 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.00019986631570270832, "loss": 0.0, "step": 155 }, { "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 0.00019979114447011323, "loss": 0.0, "step": 160 }, { "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 0.0001996992941167792, "loss": 0.0, "step": 165 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.00019959077999460095, "loss": 0.0, "step": 170 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.00019946562024066014, "loss": 0.0, "step": 175 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.00019932383577419432, "loss": 0.0, "step": 180 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.00019916545029310012, "loss": 0.0, "step": 185 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.00019899049026997272, "loss": 0.0, "step": 190 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.00019879898494768093, "loss": 0.0, "step": 195 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.00019859096633447965, "loss": 0.0, "step": 200 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.0001983664691986601, "loss": 0.0, "step": 205 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 0.00019812553106273847, "loss": 0.0, "step": 210 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 0.00019786819219718443, "loss": 0.0, "step": 215 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 0.00019759449561369038, "loss": 0.0, "step": 220 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 0.00019730448705798239, "loss": 0.0, "step": 225 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 0.00019699821500217434, "loss": 0.0, "step": 230 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 0.0001966757306366662, "loss": 0.0, "step": 235 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.00019633708786158806, "loss": 0.0, "step": 240 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.00019598234327779118, "loss": 0.0, "step": 245 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.00019561155617738797, "loss": 0.0, "step": 250 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.00019522478853384155, "loss": 0.0, "step": 255 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.00019482210499160765, "loss": 0.0, "step": 260 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.00019440357285533, "loss": 0.0, "step": 265 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.00019396926207859084, "loss": 0.0, "step": 270 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.000193519245252219, "loss": 0.0, "step": 275 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.00019305359759215685, "loss": 0.0, "step": 280 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.00019257239692688907, "loss": 0.0, "step": 285 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.00019207572368443385, "loss": 0.0, "step": 290 }, { "epoch": 0.22, "grad_norm": 0.0, "learning_rate": 0.0001915636608789006, "loss": 0.0, "step": 295 }, { "epoch": 0.22, "grad_norm": 0.0, "learning_rate": 0.0001910362940966147, "loss": 0.0, "step": 300 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 0.00019049371148181253, "loss": 0.0, "step": 305 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 0.00018993600372190932, "loss": 0.0, "step": 310 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 0.00018936326403234125, "loss": 0.0, "step": 315 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.00018877558814098561, "loss": 0.0, "step": 320 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.0001881730742721608, "loss": 0.0, "step": 325 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.0001875558231302091, "loss": 0.0, "step": 330 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 0.00018692393788266479, "loss": 0.0, "step": 335 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 0.00018627752414301086, "loss": 0.0, "step": 340 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.00018561668995302667, "loss": 0.0, "step": 345 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.00018494154576472976, "loss": 0.0, "step": 350 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.00018425220442191495, "loss": 0.0, "step": 355 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.00018354878114129367, "loss": 0.0, "step": 360 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.00018283139349323634, "loss": 0.0, "step": 365 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.00018210016138212187, "loss": 0.0, "step": 370 }, { "epoch": 0.28, "grad_norm": 0.0, "learning_rate": 0.00018135520702629675, "loss": 0.0, "step": 375 }, { "epoch": 0.28, "grad_norm": 0.0, "learning_rate": 0.00018059665493764743, "loss": 0.0, "step": 380 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.0001798246319007893, "loss": 0.0, "step": 385 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.00017903926695187595, "loss": 0.0, "step": 390 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.00017824069135703198, "loss": 0.0, "step": 395 }, { "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 0.00017742903859041325, "loss": 0.0, "step": 400 }, { "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 0.0001766044443118978, "loss": 0.0, "step": 405 }, { "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 0.0001757670463444118, "loss": 0.0, "step": 410 }, { "epoch": 0.31, "grad_norm": 0.0, "learning_rate": 0.00017491698465089362, "loss": 0.0, "step": 415 }, { "epoch": 0.31, "grad_norm": 0.0, "learning_rate": 0.00017405440131090048, "loss": 0.0, "step": 420 }, { "epoch": 0.31, "grad_norm": 0.0, "learning_rate": 0.00017317944049686124, "loss": 0.0, "step": 425 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.00017229224844997928, "loss": 0.0, "step": 430 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.00017139297345578994, "loss": 0.0, "step": 435 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.00017048176581937563, "loss": 0.0, "step": 440 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.0001695587778402442, "loss": 0.0, "step": 445 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.0001686241637868734, "loss": 0.0, "step": 450 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.00016767807987092621, "loss": 0.0, "step": 455 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.00016672068422114196, "loss": 0.0, "step": 460 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.0001657521368569064, "loss": 0.0, "step": 465 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.00016477259966150588, "loss": 0.0, "step": 470 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.0001637822363550706, "loss": 0.0, "step": 475 }, { "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 0.00016278121246720987, "loss": 0.0, "step": 480 }, { "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 0.00016176969530934572, "loss": 0.0, "step": 485 }, { "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 0.00016074785394674837, "loss": 0.0, "step": 490 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.00015971585917027862, "loss": 0.0, "step": 495 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.0001586738834678418, "loss": 0.0, "step": 500 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.00015762210099555803, "loss": 0.0, "step": 505 }, { "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 0.00015656068754865387, "loss": 0.0, "step": 510 }, { "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 0.0001554898205320797, "loss": 0.0, "step": 515 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.00015440967893085828, "loss": 0.0, "step": 520 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.00015332044328016914, "loss": 0.0, "step": 525 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.00015222229563517385, "loss": 0.0, "step": 530 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.00015111541954058734, "loss": 0.0, "step": 535 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.00015000000000000001, "loss": 0.0, "step": 540 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.00014887622344495643, "loss": 0.0, "step": 545 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.0001477442777037949, "loss": 0.0, "step": 550 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.0001466043519702539, "loss": 0.0, "step": 555 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.00014545663677185006, "loss": 0.0, "step": 560 }, { "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 0.00014430132393803352, "loss": 0.0, "step": 565 }, { "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 0.00014313860656812536, "loss": 0.0, "step": 570 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.0001419686789990429, "loss": 0.0, "step": 575 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.00014079173677281837, "loss": 0.0, "step": 580 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.0001396079766039157, "loss": 0.0, "step": 585 }, { "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 0.00013841759634635178, "loss": 0.0, "step": 590 }, { "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 0.00013722079496062702, "loss": 0.0, "step": 595 }, { "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 0.00013601777248047105, "loss": 0.0, "step": 600 }, { "epoch": 0.45, "grad_norm": 0.0, "learning_rate": 0.00013480872997940905, "loss": 0.0, "step": 605 }, { "epoch": 0.45, "grad_norm": 0.0, "learning_rate": 0.00013359386953715421, "loss": 0.0, "step": 610 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 0.00013237339420583212, "loss": 0.0, "step": 615 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 0.00013114750797604247, "loss": 0.0, "step": 620 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 0.00012991641574276418, "loss": 0.0, "step": 625 }, { "epoch": 0.47, "grad_norm": 0.0, "learning_rate": 0.00012868032327110904, "loss": 0.0, "step": 630 }, { "epoch": 0.47, "grad_norm": 0.0, "learning_rate": 0.00012743943716193016, "loss": 0.0, "step": 635 }, { "epoch": 0.47, "grad_norm": 0.0, "learning_rate": 0.0001261939648172906, "loss": 0.0, "step": 640 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.00012494411440579814, "loss": 0.0, "step": 645 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.00012369009482781192, "loss": 0.0, "step": 650 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.00012243211568052677, "loss": 0.0, "step": 655 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.0001211703872229411, "loss": 0.0, "step": 660 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.00011990512034071406, "loss": 0.0, "step": 665 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.00011863652651091823, "loss": 0.0, "step": 670 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.00011736481776669306, "loss": 0.0, "step": 675 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.00011609020666180575, "loss": 0.0, "step": 680 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.0001148129062351249, "loss": 0.0, "step": 685 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.00011353312997501313, "loss": 0.0, "step": 690 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.00011225109178364455, "loss": 0.0, "step": 695 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 0.00011096700594125318, "loss": 0.0, "step": 700 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 0.00010968108707031792, "loss": 0.0, "step": 705 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.00010839355009969068, "loss": 0.0, "step": 710 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.00010710461022867302, "loss": 0.0, "step": 715 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.00010581448289104758, "loss": 0.0, "step": 720 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.00010452338371907064, "loss": 0.0, "step": 725 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.00010323152850743107, "loss": 0.0, "step": 730 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.00010193913317718244, "loss": 0.0, "step": 735 }, { "epoch": 0.55, "grad_norm": 0.0, "learning_rate": 0.00010064641373965393, "loss": 0.0, "step": 740 }, { "epoch": 0.55, "grad_norm": 0.0, "learning_rate": 9.935358626034606e-05, "loss": 0.0, "step": 745 }, { "epoch": 0.56, "grad_norm": 0.0, "learning_rate": 9.806086682281758e-05, "loss": 0.0, "step": 750 }, { "epoch": 0.56, "grad_norm": 0.0, "learning_rate": 9.676847149256895e-05, "loss": 0.0, "step": 755 }, { "epoch": 0.56, "grad_norm": 0.0, "learning_rate": 9.547661628092937e-05, "loss": 0.0, "step": 760 }, { "epoch": 0.57, "grad_norm": 0.0, "learning_rate": 9.418551710895243e-05, "loss": 0.0, "step": 765 }, { "epoch": 0.57, "grad_norm": 0.0, "learning_rate": 9.289538977132703e-05, "loss": 0.0, "step": 770 }, { "epoch": 0.57, "grad_norm": 0.0, "learning_rate": 9.160644990030931e-05, "loss": 0.0, "step": 775 }, { "epoch": 0.58, "grad_norm": 0.0, "learning_rate": 9.03189129296821e-05, "loss": 0.0, "step": 780 }, { "epoch": 0.58, "grad_norm": 0.0, "learning_rate": 8.903299405874684e-05, "loss": 0.0, "step": 785 }, { "epoch": 0.59, "grad_norm": 0.0, "learning_rate": 8.774890821635548e-05, "loss": 0.0, "step": 790 }, { "epoch": 0.59, "grad_norm": 0.0, "learning_rate": 8.646687002498692e-05, "loss": 0.0, "step": 795 }, { "epoch": 0.59, "grad_norm": 0.0, "learning_rate": 8.518709376487515e-05, "loss": 0.0, "step": 800 }, { "epoch": 0.6, "grad_norm": 0.0, "learning_rate": 8.390979333819426e-05, "loss": 0.0, "step": 805 }, { "epoch": 0.6, "grad_norm": 0.0, "learning_rate": 8.263518223330697e-05, "loss": 0.0, "step": 810 }, { "epoch": 0.6, "grad_norm": 0.0, "learning_rate": 8.13634734890818e-05, "loss": 0.0, "step": 815 }, { "epoch": 0.61, "grad_norm": 0.0, "learning_rate": 8.009487965928596e-05, "loss": 0.0, "step": 820 }, { "epoch": 0.61, "grad_norm": 0.0, "learning_rate": 7.882961277705895e-05, "loss": 0.0, "step": 825 }, { "epoch": 0.61, "grad_norm": 0.0, "learning_rate": 7.756788431947326e-05, "loss": 0.0, "step": 830 }, { "epoch": 0.62, "grad_norm": 0.0, "learning_rate": 7.630990517218808e-05, "loss": 0.0, "step": 835 }, { "epoch": 0.62, "grad_norm": 0.0, "learning_rate": 7.505588559420189e-05, "loss": 0.0, "step": 840 }, { "epoch": 0.63, "grad_norm": 0.0, "learning_rate": 7.380603518270941e-05, "loss": 0.0, "step": 845 }, { "epoch": 0.63, "grad_norm": 0.0, "learning_rate": 7.256056283806986e-05, "loss": 0.0, "step": 850 }, { "epoch": 0.63, "grad_norm": 0.0, "learning_rate": 7.131967672889101e-05, "loss": 0.0, "step": 855 }, { "epoch": 0.64, "grad_norm": 0.0, "learning_rate": 7.008358425723585e-05, "loss": 0.0, "step": 860 }, { "epoch": 0.64, "grad_norm": 0.0, "learning_rate": 6.885249202395754e-05, "loss": 0.0, "step": 865 }, { "epoch": 0.64, "grad_norm": 0.0, "learning_rate": 6.762660579416791e-05, "loss": 0.0, "step": 870 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 6.640613046284581e-05, "loss": 0.0, "step": 875 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 6.519127002059095e-05, "loss": 0.0, "step": 880 }, { "epoch": 0.66, "grad_norm": 0.0, "learning_rate": 6.398222751952899e-05, "loss": 0.0, "step": 885 }, { "epoch": 0.66, "grad_norm": 0.0, "learning_rate": 6.277920503937303e-05, "loss": 0.0, "step": 890 }, { "epoch": 0.66, "grad_norm": 0.0, "learning_rate": 6.158240365364823e-05, "loss": 0.0, "step": 895 }, { "epoch": 0.67, "grad_norm": 0.0, "learning_rate": 6.039202339608432e-05, "loss": 0.0, "step": 900 }, { "epoch": 0.67, "grad_norm": 0.0, "learning_rate": 5.920826322718165e-05, "loss": 0.0, "step": 905 }, { "epoch": 0.67, "grad_norm": 0.0, "learning_rate": 5.80313210009571e-05, "loss": 0.0, "step": 910 }, { "epoch": 0.68, "grad_norm": 0.0, "learning_rate": 5.6861393431874675e-05, "loss": 0.0, "step": 915 }, { "epoch": 0.68, "grad_norm": 0.0, "learning_rate": 5.5698676061966515e-05, "loss": 0.0, "step": 920 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 5.4543363228149946e-05, "loss": 0.0, "step": 925 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 5.339564802974615e-05, "loss": 0.0, "step": 930 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 5.22557222962051e-05, "loss": 0.0, "step": 935 }, { "epoch": 0.7, "grad_norm": 0.0, "learning_rate": 5.112377655504359e-05, "loss": 0.0, "step": 940 }, { "epoch": 0.7, "grad_norm": 0.0, "learning_rate": 5.000000000000002e-05, "loss": 0.0, "step": 945 }, { "epoch": 0.7, "grad_norm": 0.0, "learning_rate": 4.888458045941269e-05, "loss": 0.0, "step": 950 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 4.777770436482617e-05, "loss": 0.0, "step": 955 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 4.66795567198309e-05, "loss": 0.0, "step": 960 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 4.559032106914173e-05, "loss": 0.0, "step": 965 }, { "epoch": 0.72, "grad_norm": 0.0, "learning_rate": 4.451017946792032e-05, "loss": 0.0, "step": 970 }, { "epoch": 0.72, "grad_norm": 0.0, "learning_rate": 4.343931245134616e-05, "loss": 0.0, "step": 975 }, { "epoch": 0.73, "grad_norm": 0.0, "learning_rate": 4.2377899004441966e-05, "loss": 0.0, "step": 980 }, { "epoch": 0.73, "grad_norm": 0.0, "learning_rate": 4.132611653215822e-05, "loss": 0.0, "step": 985 }, { "epoch": 0.73, "grad_norm": 0.0, "learning_rate": 4.028414082972141e-05, "loss": 0.0, "step": 990 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 3.9252146053251636e-05, "loss": 0.0, "step": 995 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 3.8230304690654304e-05, "loss": 0.0, "step": 1000 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 3.721878753279017e-05, "loss": 0.0, "step": 1005 }, { "epoch": 0.75, "grad_norm": 0.0, "learning_rate": 3.621776364492939e-05, "loss": 0.0, "step": 1010 }, { "epoch": 0.75, "grad_norm": 0.0, "learning_rate": 3.522740033849411e-05, "loss": 0.0, "step": 1015 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 3.424786314309365e-05, "loss": 0.0, "step": 1020 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 3.3279315778858036e-05, "loss": 0.0, "step": 1025 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 3.2321920129073816e-05, "loss": 0.0, "step": 1030 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 3.137583621312665e-05, "loss": 0.0, "step": 1035 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 3.04412221597558e-05, "loss": 0.0, "step": 1040 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 2.9518234180624393e-05, "loss": 0.0, "step": 1045 }, { "epoch": 0.78, "grad_norm": 0.0, "learning_rate": 2.8607026544210114e-05, "loss": 0.0, "step": 1050 }, { "epoch": 0.78, "grad_norm": 0.0, "learning_rate": 2.770775155002071e-05, "loss": 0.0, "step": 1055 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 2.6820559503138797e-05, "loss": 0.0, "step": 1060 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 2.594559868909956e-05, "loss": 0.0, "step": 1065 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 2.50830153491064e-05, "loss": 0.0, "step": 1070 }, { "epoch": 0.8, "grad_norm": 0.0, "learning_rate": 2.423295365558821e-05, "loss": 0.0, "step": 1075 }, { "epoch": 0.8, "grad_norm": 0.0, "learning_rate": 2.339555568810221e-05, "loss": 0.0, "step": 1080 }, { "epoch": 0.8, "grad_norm": 0.0, "learning_rate": 2.2570961409586754e-05, "loss": 0.0, "step": 1085 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 2.1759308642968025e-05, "loss": 0.0, "step": 1090 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 2.0960733048124083e-05, "loss": 0.0, "step": 1095 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 2.01753680992107e-05, "loss": 0.0, "step": 1100 }, { "epoch": 0.82, "grad_norm": 0.0, "learning_rate": 1.9403345062352573e-05, "loss": 0.0, "step": 1105 }, { "epoch": 0.82, "grad_norm": 0.0, "learning_rate": 1.864479297370325e-05, "loss": 0.0, "step": 1110 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 1.7899838617878163e-05, "loss": 0.0, "step": 1115 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 1.7168606506763695e-05, "loss": 0.0, "step": 1120 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 1.6451218858706374e-05, "loss": 0.0, "step": 1125 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 1.5747795578085046e-05, "loss": 0.0, "step": 1130 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 1.505845423527027e-05, "loss": 0.0, "step": 1135 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 1.4383310046973365e-05, "loss": 0.0, "step": 1140 }, { "epoch": 0.85, "grad_norm": 0.0, "learning_rate": 1.3722475856989158e-05, "loss": 0.0, "step": 1145 }, { "epoch": 0.85, "grad_norm": 0.0, "learning_rate": 1.307606211733522e-05, "loss": 0.0, "step": 1150 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 1.2444176869790925e-05, "loss": 0.0, "step": 1155 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 1.18269257278392e-05, "loss": 0.0, "step": 1160 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 1.1224411859014417e-05, "loss": 0.0, "step": 1165 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 1.0636735967658784e-05, "loss": 0.0, "step": 1170 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 1.0063996278090704e-05, "loss": 0.0, "step": 1175 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 9.506288518187467e-06, "loss": 0.0, "step": 1180 }, { "epoch": 0.88, "grad_norm": 0.0, "learning_rate": 8.963705903385345e-06, "loss": 0.0, "step": 1185 }, { "epoch": 0.88, "grad_norm": 0.0, "learning_rate": 8.436339121099412e-06, "loss": 0.0, "step": 1190 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 7.92427631556617e-06, "loss": 0.0, "step": 1195 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 7.427603073110967e-06, "loss": 0.0, "step": 1200 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 6.946402407843155e-06, "loss": 0.0, "step": 1205 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 6.480754747781037e-06, "loss": 0.0, "step": 1210 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 6.030737921409169e-06, "loss": 0.0, "step": 1215 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 5.596427144670002e-06, "loss": 0.0, "step": 1220 }, { "epoch": 0.91, "grad_norm": 0.0, "learning_rate": 5.177895008392353e-06, "loss": 0.0, "step": 1225 }, { "epoch": 0.91, "grad_norm": 0.0, "learning_rate": 4.775211466158469e-06, "loss": 0.0, "step": 1230 }, { "epoch": 0.91, "grad_norm": 0.0, "learning_rate": 4.3884438226120424e-06, "loss": 0.0, "step": 1235 }, { "epoch": 0.92, "grad_norm": 0.0, "learning_rate": 4.017656722208807e-06, "loss": 0.0, "step": 1240 }, { "epoch": 0.92, "grad_norm": 0.0, "learning_rate": 3.662912138411967e-06, "loss": 0.0, "step": 1245 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 3.3242693633337983e-06, "loss": 0.0, "step": 1250 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 3.0017849978256516e-06, "loss": 0.0, "step": 1255 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 2.6955129420176196e-06, "loss": 0.0, "step": 1260 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 2.4055043863096428e-06, "loss": 0.0, "step": 1265 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 2.1318078028155888e-06, "loss": 0.0, "step": 1270 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 1.874468937261531e-06, "loss": 0.0, "step": 1275 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 1.6335308013398886e-06, "loss": 0.0, "step": 1280 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 1.409033665520354e-06, "loss": 0.0, "step": 1285 }, { "epoch": 0.96, "grad_norm": 0.0, "learning_rate": 1.201015052319099e-06, "loss": 0.0, "step": 1290 }, { "epoch": 0.96, "grad_norm": 0.0, "learning_rate": 1.0095097300273026e-06, "loss": 0.0, "step": 1295 }, { "epoch": 0.96, "grad_norm": 0.0, "learning_rate": 8.345497068998897e-07, "loss": 0.0, "step": 1300 }, { "epoch": 0.97, "grad_norm": 0.0, "learning_rate": 6.761642258056978e-07, "loss": 0.0, "step": 1305 }, { "epoch": 0.97, "grad_norm": 0.0, "learning_rate": 5.343797593398536e-07, "loss": 0.0, "step": 1310 }, { "epoch": 0.97, "grad_norm": 0.0, "learning_rate": 4.092200053990691e-07, "loss": 0.0, "step": 1315 }, { "epoch": 0.98, "grad_norm": 0.0, "learning_rate": 3.007058832207976e-07, "loss": 0.0, "step": 1320 }, { "epoch": 0.98, "grad_norm": 0.0, "learning_rate": 2.088555298867978e-07, "loss": 0.0, "step": 1325 }, { "epoch": 0.99, "grad_norm": 0.0, "learning_rate": 1.3368429729168076e-07, "loss": 0.0, "step": 1330 }, { "epoch": 0.99, "grad_norm": 0.0, "learning_rate": 7.520474957699586e-08, "loss": 0.0, "step": 1335 }, { "epoch": 0.99, "grad_norm": 0.0, "learning_rate": 3.3426661031255026e-08, "loss": 0.0, "step": 1340 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 8.357014456272794e-09, "loss": 0.0, "step": 1345 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.0, "step": 1350 }, { "epoch": 1.0, "eval_loss": NaN, "eval_runtime": 865.6078, "eval_samples_per_second": 2.669, "eval_steps_per_second": 0.334, "step": 1350 }, { "epoch": 1.0, "step": 1350, "total_flos": 2763707490304000.0, "train_loss": 0.0, "train_runtime": 23143.2556, "train_samples_per_second": 0.933, "train_steps_per_second": 0.058 } ], "logging_steps": 5, "max_steps": 1350, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 2763707490304000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }