{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1824990145841545, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.999997870262096e-05, "loss": 2.714, "step": 5 }, { "epoch": 0.0, "learning_rate": 1.999991481057455e-05, "loss": 2.7679, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.9999808324132915e-05, "loss": 2.6887, "step": 15 }, { "epoch": 0.01, "learning_rate": 1.999965924374964e-05, "loss": 2.6986, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.999946757005972e-05, "loss": 2.7536, "step": 25 }, { "epoch": 0.01, "learning_rate": 1.9999233303879592e-05, "loss": 2.7516, "step": 30 }, { "epoch": 0.01, "learning_rate": 1.99989564462071e-05, "loss": 2.6283, "step": 35 }, { "epoch": 0.02, "learning_rate": 1.999863699822152e-05, "loss": 2.708, "step": 40 }, { "epoch": 0.02, "learning_rate": 1.9998274961283523e-05, "loss": 2.6932, "step": 45 }, { "epoch": 0.02, "learning_rate": 1.9997870336935207e-05, "loss": 2.6321, "step": 50 }, { "epoch": 0.02, "learning_rate": 1.9997423126900056e-05, "loss": 2.7129, "step": 55 }, { "epoch": 0.02, "learning_rate": 1.9996933333082945e-05, "loss": 2.6179, "step": 60 }, { "epoch": 0.03, "learning_rate": 1.9996400957570148e-05, "loss": 2.6052, "step": 65 }, { "epoch": 0.03, "learning_rate": 1.99958260026293e-05, "loss": 2.7054, "step": 70 }, { "epoch": 0.03, "learning_rate": 1.9995208470709405e-05, "loss": 2.6568, "step": 75 }, { "epoch": 0.03, "learning_rate": 1.9994548364440836e-05, "loss": 2.6436, "step": 80 }, { "epoch": 0.03, "learning_rate": 1.999384568663529e-05, "loss": 2.7091, "step": 85 }, { "epoch": 0.04, "learning_rate": 1.9993100440285805e-05, "loss": 2.5557, "step": 90 }, { "epoch": 0.04, "learning_rate": 1.999231262856675e-05, "loss": 2.5945, "step": 95 }, { "epoch": 0.04, "learning_rate": 1.999148225483378e-05, "loss": 2.4729, "step": 100 }, { "epoch": 0.04, "learning_rate": 1.9990609322623854e-05, "loss": 2.6724, "step": 105 }, { "epoch": 0.04, "learning_rate": 1.9989693835655205e-05, "loss": 2.5272, "step": 110 }, { "epoch": 0.05, "learning_rate": 1.9988735797827336e-05, "loss": 2.6145, "step": 115 }, { "epoch": 0.05, "learning_rate": 1.9987735213220975e-05, "loss": 2.6772, "step": 120 }, { "epoch": 0.05, "learning_rate": 1.9986692086098095e-05, "loss": 2.6388, "step": 125 }, { "epoch": 0.05, "learning_rate": 1.998560642090187e-05, "loss": 2.4973, "step": 130 }, { "epoch": 0.05, "learning_rate": 1.998447822225666e-05, "loss": 2.5338, "step": 135 }, { "epoch": 0.06, "learning_rate": 1.9983307494968e-05, "loss": 2.6469, "step": 140 }, { "epoch": 0.06, "learning_rate": 1.9982094244022582e-05, "loss": 2.6008, "step": 145 }, { "epoch": 0.06, "learning_rate": 1.9980838474588214e-05, "loss": 2.5936, "step": 150 }, { "epoch": 0.06, "learning_rate": 1.9979540192013814e-05, "loss": 2.5862, "step": 155 }, { "epoch": 0.06, "learning_rate": 1.997819940182939e-05, "loss": 2.568, "step": 160 }, { "epoch": 0.07, "learning_rate": 1.9976816109746e-05, "loss": 2.5343, "step": 165 }, { "epoch": 0.07, "learning_rate": 1.9975390321655745e-05, "loss": 2.5575, "step": 170 }, { "epoch": 0.07, "learning_rate": 1.9973922043631737e-05, "loss": 2.5399, "step": 175 }, { "epoch": 0.07, "learning_rate": 1.9972411281928068e-05, "loss": 2.6394, "step": 180 }, { "epoch": 0.07, "learning_rate": 1.9970858042979794e-05, "loss": 2.5378, "step": 185 }, { "epoch": 0.07, "learning_rate": 1.9969262333402893e-05, "loss": 2.5373, "step": 190 }, { "epoch": 0.08, "learning_rate": 1.9967624159994262e-05, "loss": 2.5414, "step": 195 }, { "epoch": 0.08, "learning_rate": 1.9965943529731646e-05, "loss": 2.5606, "step": 200 }, { "epoch": 0.08, "learning_rate": 1.9964220449773664e-05, "loss": 2.5154, "step": 205 }, { "epoch": 0.08, "learning_rate": 1.9962454927459723e-05, "loss": 2.5468, "step": 210 }, { "epoch": 0.08, "learning_rate": 1.9960646970310027e-05, "loss": 2.5137, "step": 215 }, { "epoch": 0.09, "learning_rate": 1.9958796586025527e-05, "loss": 2.5453, "step": 220 }, { "epoch": 0.09, "learning_rate": 1.9956903782487885e-05, "loss": 2.5325, "step": 225 }, { "epoch": 0.09, "learning_rate": 1.9954968567759456e-05, "loss": 2.5054, "step": 230 }, { "epoch": 0.09, "learning_rate": 1.9952990950083236e-05, "loss": 2.5139, "step": 235 }, { "epoch": 0.09, "learning_rate": 1.995097093788285e-05, "loss": 2.5097, "step": 240 }, { "epoch": 0.1, "learning_rate": 1.994890853976248e-05, "loss": 2.5546, "step": 245 }, { "epoch": 0.1, "learning_rate": 1.994680376450686e-05, "loss": 2.4609, "step": 250 }, { "epoch": 0.1, "learning_rate": 1.994465662108124e-05, "loss": 2.5063, "step": 255 }, { "epoch": 0.1, "learning_rate": 1.9942467118631322e-05, "loss": 2.5162, "step": 260 }, { "epoch": 0.1, "learning_rate": 1.994023526648323e-05, "loss": 2.5317, "step": 265 }, { "epoch": 0.11, "learning_rate": 1.9937961074143492e-05, "loss": 2.4621, "step": 270 }, { "epoch": 0.11, "learning_rate": 1.9935644551298976e-05, "loss": 2.5879, "step": 275 }, { "epoch": 0.11, "learning_rate": 1.993328570781685e-05, "loss": 2.4742, "step": 280 }, { "epoch": 0.11, "learning_rate": 1.993088455374456e-05, "loss": 2.4661, "step": 285 }, { "epoch": 0.11, "learning_rate": 1.992844109930975e-05, "loss": 2.4943, "step": 290 }, { "epoch": 0.12, "learning_rate": 1.9925955354920265e-05, "loss": 2.4322, "step": 295 }, { "epoch": 0.12, "learning_rate": 1.9923427331164072e-05, "loss": 2.484, "step": 300 }, { "epoch": 0.12, "learning_rate": 1.9920857038809223e-05, "loss": 2.5334, "step": 305 }, { "epoch": 0.12, "learning_rate": 1.991824448880382e-05, "loss": 2.6554, "step": 310 }, { "epoch": 0.12, "learning_rate": 1.9915589692275955e-05, "loss": 2.4749, "step": 315 }, { "epoch": 0.13, "learning_rate": 1.991289266053367e-05, "loss": 2.5076, "step": 320 }, { "epoch": 0.13, "learning_rate": 1.9910153405064904e-05, "loss": 2.4635, "step": 325 }, { "epoch": 0.13, "learning_rate": 1.990737193753745e-05, "loss": 2.5443, "step": 330 }, { "epoch": 0.13, "learning_rate": 1.9904548269798906e-05, "loss": 2.6012, "step": 335 }, { "epoch": 0.13, "learning_rate": 1.990168241387662e-05, "loss": 2.4818, "step": 340 }, { "epoch": 0.14, "learning_rate": 1.9898774381977618e-05, "loss": 2.4717, "step": 345 }, { "epoch": 0.14, "learning_rate": 1.989582418648861e-05, "loss": 2.4756, "step": 350 }, { "epoch": 0.14, "learning_rate": 1.9892831839975874e-05, "loss": 2.4868, "step": 355 }, { "epoch": 0.14, "learning_rate": 1.9889797355185237e-05, "loss": 2.5652, "step": 360 }, { "epoch": 0.14, "learning_rate": 1.9886720745042017e-05, "loss": 2.3907, "step": 365 }, { "epoch": 0.15, "learning_rate": 1.988360202265096e-05, "loss": 2.4984, "step": 370 }, { "epoch": 0.15, "learning_rate": 1.9880441201296186e-05, "loss": 2.5654, "step": 375 }, { "epoch": 0.15, "learning_rate": 1.987723829444114e-05, "loss": 2.581, "step": 380 }, { "epoch": 0.15, "learning_rate": 1.9873993315728523e-05, "loss": 2.5189, "step": 385 }, { "epoch": 0.15, "learning_rate": 1.987070627898025e-05, "loss": 2.491, "step": 390 }, { "epoch": 0.16, "learning_rate": 1.9867377198197367e-05, "loss": 2.4875, "step": 395 }, { "epoch": 0.16, "learning_rate": 1.9864006087560016e-05, "loss": 2.4828, "step": 400 }, { "epoch": 0.16, "learning_rate": 1.9860592961427358e-05, "loss": 2.5298, "step": 405 }, { "epoch": 0.16, "learning_rate": 1.9857137834337527e-05, "loss": 2.4873, "step": 410 }, { "epoch": 0.16, "learning_rate": 1.985364072100755e-05, "loss": 2.5371, "step": 415 }, { "epoch": 0.17, "learning_rate": 1.98501016363333e-05, "loss": 2.4167, "step": 420 }, { "epoch": 0.17, "learning_rate": 1.9846520595389415e-05, "loss": 2.4312, "step": 425 }, { "epoch": 0.17, "learning_rate": 1.984289761342926e-05, "loss": 2.4471, "step": 430 }, { "epoch": 0.17, "learning_rate": 1.9839232705884836e-05, "loss": 2.4971, "step": 435 }, { "epoch": 0.17, "learning_rate": 1.9835525888366727e-05, "loss": 2.5206, "step": 440 }, { "epoch": 0.18, "learning_rate": 1.9831777176664035e-05, "loss": 2.5067, "step": 445 }, { "epoch": 0.18, "learning_rate": 1.9827986586744302e-05, "loss": 2.4903, "step": 450 }, { "epoch": 0.18, "learning_rate": 1.982415413475346e-05, "loss": 2.4391, "step": 455 }, { "epoch": 0.18, "learning_rate": 1.9820279837015742e-05, "loss": 2.4358, "step": 460 }, { "epoch": 0.18, "learning_rate": 1.981636371003363e-05, "loss": 2.4809, "step": 465 }, { "epoch": 0.19, "learning_rate": 1.9812405770487763e-05, "loss": 2.516, "step": 470 }, { "epoch": 0.19, "learning_rate": 1.9808406035236897e-05, "loss": 2.4091, "step": 475 }, { "epoch": 0.19, "learning_rate": 1.9804364521317806e-05, "loss": 2.5363, "step": 480 }, { "epoch": 0.19, "learning_rate": 1.9800281245945217e-05, "loss": 2.51, "step": 485 }, { "epoch": 0.19, "learning_rate": 1.9796156226511747e-05, "loss": 2.5049, "step": 490 }, { "epoch": 0.2, "learning_rate": 1.9791989480587815e-05, "loss": 2.4722, "step": 495 }, { "epoch": 0.2, "learning_rate": 1.978778102592157e-05, "loss": 2.4111, "step": 500 }, { "epoch": 0.2, "learning_rate": 1.9783530880438832e-05, "loss": 2.4078, "step": 505 }, { "epoch": 0.2, "learning_rate": 1.9779239062242988e-05, "loss": 2.3981, "step": 510 }, { "epoch": 0.2, "learning_rate": 1.9774905589614935e-05, "loss": 2.5459, "step": 515 }, { "epoch": 0.2, "learning_rate": 1.977053048101299e-05, "loss": 2.4432, "step": 520 }, { "epoch": 0.21, "learning_rate": 1.976611375507283e-05, "loss": 2.5329, "step": 525 }, { "epoch": 0.21, "learning_rate": 1.9761655430607384e-05, "loss": 2.5107, "step": 530 }, { "epoch": 0.21, "learning_rate": 1.975715552660678e-05, "loss": 2.4389, "step": 535 }, { "epoch": 0.21, "learning_rate": 1.9752614062238256e-05, "loss": 2.4939, "step": 540 }, { "epoch": 0.21, "learning_rate": 1.974803105684606e-05, "loss": 2.5349, "step": 545 }, { "epoch": 0.22, "learning_rate": 1.9743406529951403e-05, "loss": 2.5268, "step": 550 }, { "epoch": 0.22, "learning_rate": 1.9738740501252337e-05, "loss": 2.4477, "step": 555 }, { "epoch": 0.22, "learning_rate": 1.9734032990623702e-05, "loss": 2.4597, "step": 560 }, { "epoch": 0.22, "learning_rate": 1.972928401811703e-05, "loss": 2.4938, "step": 565 }, { "epoch": 0.22, "learning_rate": 1.9724493603960443e-05, "loss": 2.4756, "step": 570 }, { "epoch": 0.23, "learning_rate": 1.9719661768558604e-05, "loss": 2.3664, "step": 575 }, { "epoch": 0.23, "learning_rate": 1.9714788532492595e-05, "loss": 2.5166, "step": 580 }, { "epoch": 0.23, "learning_rate": 1.9709873916519853e-05, "loss": 2.4143, "step": 585 }, { "epoch": 0.23, "learning_rate": 1.9704917941574053e-05, "loss": 2.4666, "step": 590 }, { "epoch": 0.23, "learning_rate": 1.9699920628765065e-05, "loss": 2.4617, "step": 595 }, { "epoch": 0.24, "learning_rate": 1.969488199937881e-05, "loss": 2.4912, "step": 600 }, { "epoch": 0.24, "learning_rate": 1.9689802074877216e-05, "loss": 2.4386, "step": 605 }, { "epoch": 0.24, "learning_rate": 1.9684680876898096e-05, "loss": 2.5002, "step": 610 }, { "epoch": 0.24, "learning_rate": 1.967951842725507e-05, "loss": 2.5668, "step": 615 }, { "epoch": 0.24, "learning_rate": 1.9674314747937462e-05, "loss": 2.4955, "step": 620 }, { "epoch": 0.25, "learning_rate": 1.9669069861110225e-05, "loss": 2.5317, "step": 625 }, { "epoch": 0.25, "learning_rate": 1.9663783789113827e-05, "loss": 2.4709, "step": 630 }, { "epoch": 0.25, "learning_rate": 1.9658456554464157e-05, "loss": 2.4066, "step": 635 }, { "epoch": 0.25, "learning_rate": 1.9653088179852448e-05, "loss": 2.4462, "step": 640 }, { "epoch": 0.25, "learning_rate": 1.9647678688145163e-05, "loss": 2.5093, "step": 645 }, { "epoch": 0.26, "learning_rate": 1.9642228102383894e-05, "loss": 2.4634, "step": 650 }, { "epoch": 0.26, "learning_rate": 1.9636736445785288e-05, "loss": 2.424, "step": 655 }, { "epoch": 0.26, "learning_rate": 1.963120374174092e-05, "loss": 2.4968, "step": 660 }, { "epoch": 0.26, "learning_rate": 1.9625630013817204e-05, "loss": 2.3947, "step": 665 }, { "epoch": 0.26, "learning_rate": 1.9620015285755306e-05, "loss": 2.3954, "step": 670 }, { "epoch": 0.27, "learning_rate": 1.961435958147102e-05, "loss": 2.3764, "step": 675 }, { "epoch": 0.27, "learning_rate": 1.9608662925054684e-05, "loss": 2.386, "step": 680 }, { "epoch": 0.27, "learning_rate": 1.960292534077107e-05, "loss": 2.4218, "step": 685 }, { "epoch": 0.27, "learning_rate": 1.9597146853059273e-05, "loss": 2.4479, "step": 690 }, { "epoch": 0.27, "learning_rate": 1.959132748653263e-05, "loss": 2.4301, "step": 695 }, { "epoch": 0.28, "learning_rate": 1.9585467265978585e-05, "loss": 2.5069, "step": 700 }, { "epoch": 0.28, "learning_rate": 1.957956621635861e-05, "loss": 2.4144, "step": 705 }, { "epoch": 0.28, "learning_rate": 1.9573624362808078e-05, "loss": 2.4491, "step": 710 }, { "epoch": 0.28, "learning_rate": 1.9567641730636174e-05, "loss": 2.4723, "step": 715 }, { "epoch": 0.28, "learning_rate": 1.956161834532578e-05, "loss": 2.4839, "step": 720 }, { "epoch": 0.29, "learning_rate": 1.955555423253335e-05, "loss": 2.4881, "step": 725 }, { "epoch": 0.29, "learning_rate": 1.9549449418088832e-05, "loss": 2.5076, "step": 730 }, { "epoch": 0.29, "learning_rate": 1.9543303927995536e-05, "loss": 2.4704, "step": 735 }, { "epoch": 0.29, "learning_rate": 1.9537117788430024e-05, "loss": 2.3958, "step": 740 }, { "epoch": 0.29, "learning_rate": 1.953089102574201e-05, "loss": 2.5256, "step": 745 }, { "epoch": 0.3, "learning_rate": 1.9524623666454243e-05, "loss": 2.4658, "step": 750 }, { "epoch": 0.3, "learning_rate": 1.951831573726238e-05, "loss": 2.4636, "step": 755 }, { "epoch": 0.3, "learning_rate": 1.9511967265034904e-05, "loss": 2.4249, "step": 760 }, { "epoch": 0.3, "learning_rate": 1.9505578276812964e-05, "loss": 2.5412, "step": 765 }, { "epoch": 0.3, "learning_rate": 1.9499148799810314e-05, "loss": 2.4355, "step": 770 }, { "epoch": 0.31, "learning_rate": 1.949267886141315e-05, "loss": 2.5814, "step": 775 }, { "epoch": 0.31, "learning_rate": 1.948616848918002e-05, "loss": 2.5195, "step": 780 }, { "epoch": 0.31, "learning_rate": 1.9479617710841693e-05, "loss": 2.5177, "step": 785 }, { "epoch": 0.31, "learning_rate": 1.9473026554301057e-05, "loss": 2.4418, "step": 790 }, { "epoch": 0.31, "learning_rate": 1.946639504763298e-05, "loss": 2.5187, "step": 795 }, { "epoch": 0.32, "learning_rate": 1.94597232190842e-05, "loss": 2.479, "step": 800 }, { "epoch": 0.32, "learning_rate": 1.9453011097073217e-05, "loss": 2.4451, "step": 805 }, { "epoch": 0.32, "learning_rate": 1.9446258710190152e-05, "loss": 2.5426, "step": 810 }, { "epoch": 0.32, "learning_rate": 1.9439466087196627e-05, "loss": 2.4299, "step": 815 }, { "epoch": 0.32, "learning_rate": 1.943263325702566e-05, "loss": 2.5317, "step": 820 }, { "epoch": 0.33, "learning_rate": 1.9425760248781525e-05, "loss": 2.4384, "step": 825 }, { "epoch": 0.33, "learning_rate": 1.9418847091739638e-05, "loss": 2.4445, "step": 830 }, { "epoch": 0.33, "learning_rate": 1.9411893815346418e-05, "loss": 2.4872, "step": 835 }, { "epoch": 0.33, "learning_rate": 1.9404900449219178e-05, "loss": 2.5123, "step": 840 }, { "epoch": 0.33, "learning_rate": 1.9397867023146e-05, "loss": 2.3569, "step": 845 }, { "epoch": 0.34, "learning_rate": 1.9390793567085585e-05, "loss": 2.467, "step": 850 }, { "epoch": 0.34, "learning_rate": 1.9383680111167146e-05, "loss": 2.4097, "step": 855 }, { "epoch": 0.34, "learning_rate": 1.937652668569028e-05, "loss": 2.4653, "step": 860 }, { "epoch": 0.34, "learning_rate": 1.9369333321124832e-05, "loss": 2.4164, "step": 865 }, { "epoch": 0.34, "learning_rate": 1.936210004811076e-05, "loss": 2.3906, "step": 870 }, { "epoch": 0.34, "learning_rate": 1.9354826897458016e-05, "loss": 2.4433, "step": 875 }, { "epoch": 0.35, "learning_rate": 1.9347513900146412e-05, "loss": 2.42, "step": 880 }, { "epoch": 0.35, "learning_rate": 1.9340161087325483e-05, "loss": 2.4606, "step": 885 }, { "epoch": 0.35, "learning_rate": 1.9332768490314354e-05, "loss": 2.4242, "step": 890 }, { "epoch": 0.35, "learning_rate": 1.9325336140601612e-05, "loss": 2.461, "step": 895 }, { "epoch": 0.35, "learning_rate": 1.931786406984518e-05, "loss": 2.3949, "step": 900 }, { "epoch": 0.36, "learning_rate": 1.9310352309872153e-05, "loss": 2.4449, "step": 905 }, { "epoch": 0.36, "learning_rate": 1.9302800892678693e-05, "loss": 2.4864, "step": 910 }, { "epoch": 0.36, "learning_rate": 1.9295209850429884e-05, "loss": 2.5432, "step": 915 }, { "epoch": 0.36, "learning_rate": 1.9287579215459585e-05, "loss": 2.4357, "step": 920 }, { "epoch": 0.36, "learning_rate": 1.9279909020270294e-05, "loss": 2.4625, "step": 925 }, { "epoch": 0.37, "learning_rate": 1.9272199297533027e-05, "loss": 2.3937, "step": 930 }, { "epoch": 0.37, "learning_rate": 1.9264450080087163e-05, "loss": 2.5051, "step": 935 }, { "epoch": 0.37, "learning_rate": 1.9256661400940303e-05, "loss": 2.4894, "step": 940 }, { "epoch": 0.37, "learning_rate": 1.9248833293268144e-05, "loss": 2.4117, "step": 945 }, { "epoch": 0.37, "learning_rate": 1.9240965790414312e-05, "loss": 2.4892, "step": 950 }, { "epoch": 0.38, "learning_rate": 1.923305892589025e-05, "loss": 2.4059, "step": 955 }, { "epoch": 0.38, "learning_rate": 1.9225112733375057e-05, "loss": 2.3975, "step": 960 }, { "epoch": 0.38, "learning_rate": 1.9217127246715344e-05, "loss": 2.4867, "step": 965 }, { "epoch": 0.38, "learning_rate": 1.92091024999251e-05, "loss": 2.4757, "step": 970 }, { "epoch": 0.38, "learning_rate": 1.9201038527185546e-05, "loss": 2.4164, "step": 975 }, { "epoch": 0.39, "learning_rate": 1.919293536284497e-05, "loss": 2.4024, "step": 980 }, { "epoch": 0.39, "learning_rate": 1.9184793041418607e-05, "loss": 2.4706, "step": 985 }, { "epoch": 0.39, "learning_rate": 1.917661159758848e-05, "loss": 2.465, "step": 990 }, { "epoch": 0.39, "learning_rate": 1.9168391066203248e-05, "loss": 2.555, "step": 995 }, { "epoch": 0.39, "learning_rate": 1.9160131482278068e-05, "loss": 2.4465, "step": 1000 }, { "epoch": 0.4, "learning_rate": 1.9151832880994438e-05, "loss": 2.4014, "step": 1005 }, { "epoch": 0.4, "learning_rate": 1.914349529770005e-05, "loss": 2.4358, "step": 1010 }, { "epoch": 0.4, "learning_rate": 1.9135118767908637e-05, "loss": 2.402, "step": 1015 }, { "epoch": 0.4, "learning_rate": 1.9126703327299822e-05, "loss": 2.3952, "step": 1020 }, { "epoch": 0.4, "learning_rate": 1.9118249011718975e-05, "loss": 2.4051, "step": 1025 }, { "epoch": 0.41, "learning_rate": 1.9109755857177053e-05, "loss": 2.4198, "step": 1030 }, { "epoch": 0.41, "learning_rate": 1.910122389985043e-05, "loss": 2.4414, "step": 1035 }, { "epoch": 0.41, "learning_rate": 1.909265317608078e-05, "loss": 2.4688, "step": 1040 }, { "epoch": 0.41, "learning_rate": 1.9084043722374895e-05, "loss": 2.3766, "step": 1045 }, { "epoch": 0.41, "learning_rate": 1.907539557540453e-05, "loss": 2.3966, "step": 1050 }, { "epoch": 0.42, "learning_rate": 1.9066708772006262e-05, "loss": 2.5027, "step": 1055 }, { "epoch": 0.42, "learning_rate": 1.9057983349181316e-05, "loss": 2.4904, "step": 1060 }, { "epoch": 0.42, "learning_rate": 1.904921934409542e-05, "loss": 2.4296, "step": 1065 }, { "epoch": 0.42, "learning_rate": 1.9040416794078648e-05, "loss": 2.4294, "step": 1070 }, { "epoch": 0.42, "learning_rate": 1.903157573662524e-05, "loss": 2.4442, "step": 1075 }, { "epoch": 0.43, "learning_rate": 1.902269620939347e-05, "loss": 2.382, "step": 1080 }, { "epoch": 0.43, "learning_rate": 1.901377825020547e-05, "loss": 2.5004, "step": 1085 }, { "epoch": 0.43, "learning_rate": 1.9004821897047067e-05, "loss": 2.3837, "step": 1090 }, { "epoch": 0.43, "learning_rate": 1.899582718806764e-05, "loss": 2.3654, "step": 1095 }, { "epoch": 0.43, "learning_rate": 1.8986794161579927e-05, "loss": 2.4624, "step": 1100 }, { "epoch": 0.44, "learning_rate": 1.8977722856059886e-05, "loss": 2.4875, "step": 1105 }, { "epoch": 0.44, "learning_rate": 1.8968613310146527e-05, "loss": 2.4575, "step": 1110 }, { "epoch": 0.44, "learning_rate": 1.8959465562641738e-05, "loss": 2.337, "step": 1115 }, { "epoch": 0.44, "learning_rate": 1.895027965251013e-05, "loss": 2.4485, "step": 1120 }, { "epoch": 0.44, "learning_rate": 1.8941055618878864e-05, "loss": 2.5231, "step": 1125 }, { "epoch": 0.45, "learning_rate": 1.8931793501037483e-05, "loss": 2.3996, "step": 1130 }, { "epoch": 0.45, "learning_rate": 1.8922493338437765e-05, "loss": 2.4486, "step": 1135 }, { "epoch": 0.45, "learning_rate": 1.8913155170693514e-05, "loss": 2.5302, "step": 1140 }, { "epoch": 0.45, "learning_rate": 1.8903779037580442e-05, "loss": 2.4209, "step": 1145 }, { "epoch": 0.45, "learning_rate": 1.8894364979035956e-05, "loss": 2.3894, "step": 1150 }, { "epoch": 0.46, "learning_rate": 1.8884913035159008e-05, "loss": 2.3971, "step": 1155 }, { "epoch": 0.46, "learning_rate": 1.8875423246209928e-05, "loss": 2.3819, "step": 1160 }, { "epoch": 0.46, "learning_rate": 1.8865895652610244e-05, "loss": 2.4543, "step": 1165 }, { "epoch": 0.46, "learning_rate": 1.8856330294942506e-05, "loss": 2.4273, "step": 1170 }, { "epoch": 0.46, "learning_rate": 1.8846727213950125e-05, "loss": 2.4266, "step": 1175 }, { "epoch": 0.47, "learning_rate": 1.8837086450537195e-05, "loss": 2.4436, "step": 1180 }, { "epoch": 0.47, "learning_rate": 1.8827408045768308e-05, "loss": 2.5767, "step": 1185 }, { "epoch": 0.47, "learning_rate": 1.8817692040868404e-05, "loss": 2.3555, "step": 1190 }, { "epoch": 0.47, "learning_rate": 1.880793847722256e-05, "loss": 2.4157, "step": 1195 }, { "epoch": 0.47, "learning_rate": 1.8798147396375855e-05, "loss": 2.4256, "step": 1200 }, { "epoch": 0.47, "learning_rate": 1.8788318840033155e-05, "loss": 2.529, "step": 1205 }, { "epoch": 0.48, "learning_rate": 1.8778452850058957e-05, "loss": 2.4668, "step": 1210 }, { "epoch": 0.48, "learning_rate": 1.8768549468477212e-05, "loss": 2.4191, "step": 1215 }, { "epoch": 0.48, "learning_rate": 1.875860873747113e-05, "loss": 2.3992, "step": 1220 }, { "epoch": 0.48, "learning_rate": 1.8748630699383016e-05, "loss": 2.4485, "step": 1225 }, { "epoch": 0.48, "learning_rate": 1.8738615396714083e-05, "loss": 2.3803, "step": 1230 }, { "epoch": 0.49, "learning_rate": 1.8728562872124264e-05, "loss": 2.428, "step": 1235 }, { "epoch": 0.49, "learning_rate": 1.8718473168432054e-05, "loss": 2.4404, "step": 1240 }, { "epoch": 0.49, "learning_rate": 1.8708346328614297e-05, "loss": 2.5451, "step": 1245 }, { "epoch": 0.49, "learning_rate": 1.869818239580602e-05, "loss": 2.4906, "step": 1250 }, { "epoch": 0.49, "learning_rate": 1.8687981413300246e-05, "loss": 2.4486, "step": 1255 }, { "epoch": 0.5, "learning_rate": 1.8677743424547824e-05, "loss": 2.4106, "step": 1260 }, { "epoch": 0.5, "learning_rate": 1.8667468473157212e-05, "loss": 2.3316, "step": 1265 }, { "epoch": 0.5, "learning_rate": 1.865715660289432e-05, "loss": 2.4518, "step": 1270 }, { "epoch": 0.5, "learning_rate": 1.8646807857682308e-05, "loss": 2.4549, "step": 1275 }, { "epoch": 0.5, "learning_rate": 1.8636422281601406e-05, "loss": 2.4411, "step": 1280 }, { "epoch": 0.51, "learning_rate": 1.8625999918888726e-05, "loss": 2.509, "step": 1285 }, { "epoch": 0.51, "learning_rate": 1.8615540813938063e-05, "loss": 2.4455, "step": 1290 }, { "epoch": 0.51, "learning_rate": 1.860504501129973e-05, "loss": 2.5486, "step": 1295 }, { "epoch": 0.51, "learning_rate": 1.8594512555680338e-05, "loss": 2.4134, "step": 1300 }, { "epoch": 0.51, "learning_rate": 1.8583943491942635e-05, "loss": 2.483, "step": 1305 }, { "epoch": 0.52, "learning_rate": 1.8573337865105285e-05, "loss": 2.4329, "step": 1310 }, { "epoch": 0.52, "learning_rate": 1.8562695720342704e-05, "loss": 2.3614, "step": 1315 }, { "epoch": 0.52, "learning_rate": 1.8552017102984842e-05, "loss": 2.4119, "step": 1320 }, { "epoch": 0.52, "learning_rate": 1.854130205851702e-05, "loss": 2.4654, "step": 1325 }, { "epoch": 0.52, "learning_rate": 1.853055063257971e-05, "loss": 2.3784, "step": 1330 }, { "epoch": 0.53, "learning_rate": 1.8519762870968344e-05, "loss": 2.4059, "step": 1335 }, { "epoch": 0.53, "learning_rate": 1.8508938819633138e-05, "loss": 2.3768, "step": 1340 }, { "epoch": 0.53, "learning_rate": 1.8498078524678874e-05, "loss": 2.3345, "step": 1345 }, { "epoch": 0.53, "learning_rate": 1.8487182032364714e-05, "loss": 2.4273, "step": 1350 }, { "epoch": 0.53, "learning_rate": 1.8476249389104007e-05, "loss": 2.3947, "step": 1355 }, { "epoch": 0.54, "learning_rate": 1.8465280641464085e-05, "loss": 2.4527, "step": 1360 }, { "epoch": 0.54, "learning_rate": 1.8454275836166052e-05, "loss": 2.3342, "step": 1365 }, { "epoch": 0.54, "learning_rate": 1.8443235020084624e-05, "loss": 2.4361, "step": 1370 }, { "epoch": 0.54, "learning_rate": 1.843215824024788e-05, "loss": 2.502, "step": 1375 }, { "epoch": 0.54, "learning_rate": 1.84210455438371e-05, "loss": 2.4718, "step": 1380 }, { "epoch": 0.55, "learning_rate": 1.8409896978186547e-05, "loss": 2.428, "step": 1385 }, { "epoch": 0.55, "learning_rate": 1.8398712590783258e-05, "loss": 2.4213, "step": 1390 }, { "epoch": 0.55, "learning_rate": 1.838749242926687e-05, "loss": 2.3812, "step": 1395 }, { "epoch": 0.55, "learning_rate": 1.8376236541429386e-05, "loss": 2.4458, "step": 1400 }, { "epoch": 0.55, "learning_rate": 1.836494497521499e-05, "loss": 2.4359, "step": 1405 }, { "epoch": 0.56, "learning_rate": 1.835361777871983e-05, "loss": 2.3377, "step": 1410 }, { "epoch": 0.56, "learning_rate": 1.8342255000191832e-05, "loss": 2.414, "step": 1415 }, { "epoch": 0.56, "learning_rate": 1.8330856688030474e-05, "loss": 2.4517, "step": 1420 }, { "epoch": 0.56, "learning_rate": 1.8319422890786586e-05, "loss": 2.375, "step": 1425 }, { "epoch": 0.56, "learning_rate": 1.830795365716216e-05, "loss": 2.3515, "step": 1430 }, { "epoch": 0.57, "learning_rate": 1.829644903601011e-05, "loss": 2.4428, "step": 1435 }, { "epoch": 0.57, "learning_rate": 1.8284909076334094e-05, "loss": 2.4186, "step": 1440 }, { "epoch": 0.57, "learning_rate": 1.8273333827288294e-05, "loss": 2.4604, "step": 1445 }, { "epoch": 0.57, "learning_rate": 1.8261723338177204e-05, "loss": 2.4712, "step": 1450 }, { "epoch": 0.57, "learning_rate": 1.825007765845542e-05, "loss": 2.4355, "step": 1455 }, { "epoch": 0.58, "learning_rate": 1.823839683772743e-05, "loss": 2.4399, "step": 1460 }, { "epoch": 0.58, "learning_rate": 1.822668092574741e-05, "loss": 2.4747, "step": 1465 }, { "epoch": 0.58, "learning_rate": 1.8214929972419004e-05, "loss": 2.3503, "step": 1470 }, { "epoch": 0.58, "learning_rate": 1.820314402779511e-05, "loss": 2.42, "step": 1475 }, { "epoch": 0.58, "learning_rate": 1.819132314207768e-05, "loss": 2.3705, "step": 1480 }, { "epoch": 0.59, "learning_rate": 1.8179467365617486e-05, "loss": 2.3828, "step": 1485 }, { "epoch": 0.59, "learning_rate": 1.816757674891392e-05, "loss": 2.4663, "step": 1490 }, { "epoch": 0.59, "learning_rate": 1.8155651342614784e-05, "loss": 2.4991, "step": 1495 }, { "epoch": 0.59, "learning_rate": 1.8143691197516048e-05, "loss": 2.3769, "step": 1500 }, { "epoch": 0.59, "learning_rate": 1.813169636456167e-05, "loss": 2.4001, "step": 1505 }, { "epoch": 0.6, "learning_rate": 1.811966689484334e-05, "loss": 2.4054, "step": 1510 }, { "epoch": 0.6, "learning_rate": 1.8107602839600306e-05, "loss": 2.4524, "step": 1515 }, { "epoch": 0.6, "learning_rate": 1.8095504250219103e-05, "loss": 2.423, "step": 1520 }, { "epoch": 0.6, "learning_rate": 1.80833711782334e-05, "loss": 2.5138, "step": 1525 }, { "epoch": 0.6, "learning_rate": 1.8071203675323708e-05, "loss": 2.3669, "step": 1530 }, { "epoch": 0.61, "learning_rate": 1.8059001793317215e-05, "loss": 2.3542, "step": 1535 }, { "epoch": 0.61, "learning_rate": 1.8046765584187544e-05, "loss": 2.3358, "step": 1540 }, { "epoch": 0.61, "learning_rate": 1.803449510005453e-05, "loss": 2.3739, "step": 1545 }, { "epoch": 0.61, "learning_rate": 1.8022190393184008e-05, "loss": 2.4784, "step": 1550 }, { "epoch": 0.61, "learning_rate": 1.8009851515987573e-05, "loss": 2.3919, "step": 1555 }, { "epoch": 0.61, "learning_rate": 1.7997478521022378e-05, "loss": 2.4998, "step": 1560 }, { "epoch": 0.62, "learning_rate": 1.7985071460990894e-05, "loss": 2.4079, "step": 1565 }, { "epoch": 0.62, "learning_rate": 1.7972630388740696e-05, "loss": 2.3714, "step": 1570 }, { "epoch": 0.62, "learning_rate": 1.7960155357264224e-05, "loss": 2.4316, "step": 1575 }, { "epoch": 0.62, "learning_rate": 1.7947646419698578e-05, "loss": 2.3991, "step": 1580 }, { "epoch": 0.62, "learning_rate": 1.793510362932527e-05, "loss": 2.4027, "step": 1585 }, { "epoch": 0.63, "learning_rate": 1.7922527039570022e-05, "loss": 2.443, "step": 1590 }, { "epoch": 0.63, "learning_rate": 1.7909916704002506e-05, "loss": 2.3988, "step": 1595 }, { "epoch": 0.63, "learning_rate": 1.7897272676336143e-05, "loss": 2.5286, "step": 1600 }, { "epoch": 0.63, "learning_rate": 1.788459501042786e-05, "loss": 2.4104, "step": 1605 }, { "epoch": 0.63, "learning_rate": 1.7871883760277872e-05, "loss": 2.3934, "step": 1610 }, { "epoch": 0.64, "learning_rate": 1.785913898002944e-05, "loss": 2.4028, "step": 1615 }, { "epoch": 0.64, "learning_rate": 1.784636072396865e-05, "loss": 2.3815, "step": 1620 }, { "epoch": 0.64, "learning_rate": 1.783354904652417e-05, "loss": 2.3652, "step": 1625 }, { "epoch": 0.64, "learning_rate": 1.7820704002267034e-05, "loss": 2.4487, "step": 1630 }, { "epoch": 0.64, "learning_rate": 1.7807825645910396e-05, "loss": 2.5422, "step": 1635 }, { "epoch": 0.65, "learning_rate": 1.77949140323093e-05, "loss": 2.3982, "step": 1640 }, { "epoch": 0.65, "learning_rate": 1.7781969216460458e-05, "loss": 2.3603, "step": 1645 }, { "epoch": 0.65, "learning_rate": 1.7768991253501993e-05, "loss": 2.3725, "step": 1650 }, { "epoch": 0.65, "learning_rate": 1.775598019871323e-05, "loss": 2.4896, "step": 1655 }, { "epoch": 0.65, "learning_rate": 1.7742936107514442e-05, "loss": 2.4789, "step": 1660 }, { "epoch": 0.66, "learning_rate": 1.7729859035466617e-05, "loss": 2.4811, "step": 1665 }, { "epoch": 0.66, "learning_rate": 1.7716749038271225e-05, "loss": 2.4251, "step": 1670 }, { "epoch": 0.66, "learning_rate": 1.770360617176999e-05, "loss": 2.4285, "step": 1675 }, { "epoch": 0.66, "learning_rate": 1.7690430491944625e-05, "loss": 2.4722, "step": 1680 }, { "epoch": 0.66, "learning_rate": 1.7677222054916627e-05, "loss": 2.384, "step": 1685 }, { "epoch": 0.67, "learning_rate": 1.7663980916947007e-05, "loss": 2.3686, "step": 1690 }, { "epoch": 0.67, "learning_rate": 1.7650707134436075e-05, "loss": 2.4773, "step": 1695 }, { "epoch": 0.67, "learning_rate": 1.7637400763923187e-05, "loss": 2.3632, "step": 1700 }, { "epoch": 0.67, "learning_rate": 1.7624061862086508e-05, "loss": 2.3542, "step": 1705 }, { "epoch": 0.67, "learning_rate": 1.7610690485742763e-05, "loss": 2.3659, "step": 1710 }, { "epoch": 0.68, "learning_rate": 1.759728669184701e-05, "loss": 2.5581, "step": 1715 }, { "epoch": 0.68, "learning_rate": 1.7583850537492386e-05, "loss": 2.3999, "step": 1720 }, { "epoch": 0.68, "learning_rate": 1.757038207990986e-05, "loss": 2.3789, "step": 1725 }, { "epoch": 0.68, "learning_rate": 1.7556881376468004e-05, "loss": 2.4012, "step": 1730 }, { "epoch": 0.68, "learning_rate": 1.754334848467274e-05, "loss": 2.3385, "step": 1735 }, { "epoch": 0.69, "learning_rate": 1.7529783462167088e-05, "loss": 2.4043, "step": 1740 }, { "epoch": 0.69, "learning_rate": 1.751618636673094e-05, "loss": 2.3214, "step": 1745 }, { "epoch": 0.69, "learning_rate": 1.7502557256280792e-05, "loss": 2.3812, "step": 1750 }, { "epoch": 0.69, "learning_rate": 1.7488896188869503e-05, "loss": 2.3469, "step": 1755 }, { "epoch": 0.69, "learning_rate": 1.7475203222686073e-05, "loss": 2.3821, "step": 1760 }, { "epoch": 0.7, "learning_rate": 1.7461478416055352e-05, "loss": 2.4155, "step": 1765 }, { "epoch": 0.7, "learning_rate": 1.744772182743782e-05, "loss": 2.3077, "step": 1770 }, { "epoch": 0.7, "learning_rate": 1.7433933515429336e-05, "loss": 2.4099, "step": 1775 }, { "epoch": 0.7, "learning_rate": 1.7420113538760885e-05, "loss": 2.4284, "step": 1780 }, { "epoch": 0.7, "learning_rate": 1.7406261956298317e-05, "loss": 2.383, "step": 1785 }, { "epoch": 0.71, "learning_rate": 1.739237882704212e-05, "loss": 2.4171, "step": 1790 }, { "epoch": 0.71, "learning_rate": 1.737846421012714e-05, "loss": 2.5326, "step": 1795 }, { "epoch": 0.71, "learning_rate": 1.736451816482235e-05, "loss": 2.2905, "step": 1800 }, { "epoch": 0.71, "learning_rate": 1.73505407505306e-05, "loss": 2.3553, "step": 1805 }, { "epoch": 0.71, "learning_rate": 1.7336532026788345e-05, "loss": 2.5168, "step": 1810 }, { "epoch": 0.72, "learning_rate": 1.7322492053265403e-05, "loss": 2.4175, "step": 1815 }, { "epoch": 0.72, "learning_rate": 1.7308420889764702e-05, "loss": 2.5014, "step": 1820 }, { "epoch": 0.72, "learning_rate": 1.7294318596222023e-05, "loss": 2.3504, "step": 1825 }, { "epoch": 0.72, "learning_rate": 1.7280185232705748e-05, "loss": 2.4076, "step": 1830 }, { "epoch": 0.72, "learning_rate": 1.726602085941659e-05, "loss": 2.3685, "step": 1835 }, { "epoch": 0.73, "learning_rate": 1.725182553668736e-05, "loss": 2.4052, "step": 1840 }, { "epoch": 0.73, "learning_rate": 1.7237599324982692e-05, "loss": 2.4028, "step": 1845 }, { "epoch": 0.73, "learning_rate": 1.7223342284898786e-05, "loss": 2.4059, "step": 1850 }, { "epoch": 0.73, "learning_rate": 1.720905447716316e-05, "loss": 2.4449, "step": 1855 }, { "epoch": 0.73, "learning_rate": 1.719473596263439e-05, "loss": 2.439, "step": 1860 }, { "epoch": 0.74, "learning_rate": 1.7180386802301836e-05, "loss": 2.4218, "step": 1865 }, { "epoch": 0.74, "learning_rate": 1.7166007057285405e-05, "loss": 2.4144, "step": 1870 }, { "epoch": 0.74, "learning_rate": 1.715159678883527e-05, "loss": 2.3402, "step": 1875 }, { "epoch": 0.74, "learning_rate": 1.713715605833162e-05, "loss": 2.4232, "step": 1880 }, { "epoch": 0.74, "learning_rate": 1.7122684927284404e-05, "loss": 2.3479, "step": 1885 }, { "epoch": 0.74, "learning_rate": 1.7108183457333044e-05, "loss": 2.3906, "step": 1890 }, { "epoch": 0.75, "learning_rate": 1.7093651710246208e-05, "loss": 2.4045, "step": 1895 }, { "epoch": 0.75, "learning_rate": 1.7079089747921517e-05, "loss": 2.319, "step": 1900 }, { "epoch": 0.75, "learning_rate": 1.70644976323853e-05, "loss": 2.4375, "step": 1905 }, { "epoch": 0.75, "learning_rate": 1.7049875425792318e-05, "loss": 2.4046, "step": 1910 }, { "epoch": 0.75, "learning_rate": 1.703522319042551e-05, "loss": 2.3748, "step": 1915 }, { "epoch": 0.76, "learning_rate": 1.7020540988695715e-05, "loss": 2.4615, "step": 1920 }, { "epoch": 0.76, "learning_rate": 1.700582888314142e-05, "loss": 2.4903, "step": 1925 }, { "epoch": 0.76, "learning_rate": 1.699108693642847e-05, "loss": 2.4123, "step": 1930 }, { "epoch": 0.76, "learning_rate": 1.6976315211349848e-05, "loss": 2.4224, "step": 1935 }, { "epoch": 0.76, "learning_rate": 1.696151377082535e-05, "loss": 2.4851, "step": 1940 }, { "epoch": 0.77, "learning_rate": 1.6946682677901354e-05, "loss": 2.3529, "step": 1945 }, { "epoch": 0.77, "learning_rate": 1.6931821995750544e-05, "loss": 2.4379, "step": 1950 }, { "epoch": 0.77, "learning_rate": 1.6916931787671634e-05, "loss": 2.321, "step": 1955 }, { "epoch": 0.77, "learning_rate": 1.6902012117089106e-05, "loss": 2.3816, "step": 1960 }, { "epoch": 0.77, "learning_rate": 1.688706304755294e-05, "loss": 2.3744, "step": 1965 }, { "epoch": 0.78, "learning_rate": 1.6872084642738325e-05, "loss": 2.4826, "step": 1970 }, { "epoch": 0.78, "learning_rate": 1.6857076966445425e-05, "loss": 2.4553, "step": 1975 }, { "epoch": 0.78, "learning_rate": 1.684204008259907e-05, "loss": 2.4753, "step": 1980 }, { "epoch": 0.78, "learning_rate": 1.6826974055248502e-05, "loss": 2.3913, "step": 1985 }, { "epoch": 0.78, "learning_rate": 1.68118789485671e-05, "loss": 2.448, "step": 1990 }, { "epoch": 0.79, "learning_rate": 1.679675482685211e-05, "loss": 2.3791, "step": 1995 }, { "epoch": 0.79, "learning_rate": 1.6781601754524354e-05, "loss": 2.4049, "step": 2000 }, { "epoch": 0.79, "learning_rate": 1.676641979612799e-05, "loss": 2.4594, "step": 2005 }, { "epoch": 0.79, "learning_rate": 1.6751209016330186e-05, "loss": 2.404, "step": 2010 }, { "epoch": 0.79, "learning_rate": 1.6735969479920905e-05, "loss": 2.4933, "step": 2015 }, { "epoch": 0.8, "learning_rate": 1.672070125181258e-05, "loss": 2.34, "step": 2020 }, { "epoch": 0.8, "learning_rate": 1.6705404397039852e-05, "loss": 2.3853, "step": 2025 }, { "epoch": 0.8, "learning_rate": 1.6690078980759312e-05, "loss": 2.4498, "step": 2030 }, { "epoch": 0.8, "learning_rate": 1.66747250682492e-05, "loss": 2.4049, "step": 2035 }, { "epoch": 0.8, "learning_rate": 1.6659342724909126e-05, "loss": 2.3463, "step": 2040 }, { "epoch": 0.81, "learning_rate": 1.664393201625982e-05, "loss": 2.398, "step": 2045 }, { "epoch": 0.81, "learning_rate": 1.6628493007942817e-05, "loss": 2.496, "step": 2050 }, { "epoch": 0.81, "learning_rate": 1.66130257657202e-05, "loss": 2.3937, "step": 2055 }, { "epoch": 0.81, "learning_rate": 1.659753035547431e-05, "loss": 2.3504, "step": 2060 }, { "epoch": 0.81, "learning_rate": 1.658200684320748e-05, "loss": 2.4795, "step": 2065 }, { "epoch": 0.82, "learning_rate": 1.6566455295041733e-05, "loss": 2.4123, "step": 2070 }, { "epoch": 0.82, "learning_rate": 1.6550875777218506e-05, "loss": 2.3896, "step": 2075 }, { "epoch": 0.82, "learning_rate": 1.653526835609838e-05, "loss": 2.4726, "step": 2080 }, { "epoch": 0.82, "learning_rate": 1.6519633098160795e-05, "loss": 2.4098, "step": 2085 }, { "epoch": 0.82, "learning_rate": 1.6503970070003744e-05, "loss": 2.3638, "step": 2090 }, { "epoch": 0.83, "learning_rate": 1.6488279338343525e-05, "loss": 2.4039, "step": 2095 }, { "epoch": 0.83, "learning_rate": 1.6472560970014423e-05, "loss": 2.3837, "step": 2100 }, { "epoch": 0.83, "learning_rate": 1.6456815031968452e-05, "loss": 2.3698, "step": 2105 }, { "epoch": 0.83, "learning_rate": 1.6441041591275055e-05, "loss": 2.4735, "step": 2110 }, { "epoch": 0.83, "learning_rate": 1.6425240715120816e-05, "loss": 2.4111, "step": 2115 }, { "epoch": 0.84, "learning_rate": 1.6409412470809188e-05, "loss": 2.4355, "step": 2120 }, { "epoch": 0.84, "learning_rate": 1.63935569257602e-05, "loss": 2.3808, "step": 2125 }, { "epoch": 0.84, "learning_rate": 1.637767414751015e-05, "loss": 2.5006, "step": 2130 }, { "epoch": 0.84, "learning_rate": 1.6361764203711357e-05, "loss": 2.4593, "step": 2135 }, { "epoch": 0.84, "learning_rate": 1.634582716213184e-05, "loss": 2.4232, "step": 2140 }, { "epoch": 0.85, "learning_rate": 1.6329863090655043e-05, "loss": 2.4027, "step": 2145 }, { "epoch": 0.85, "learning_rate": 1.6313872057279536e-05, "loss": 2.4994, "step": 2150 }, { "epoch": 0.85, "learning_rate": 1.6297854130118748e-05, "loss": 2.4925, "step": 2155 }, { "epoch": 0.85, "learning_rate": 1.6281809377400648e-05, "loss": 2.5332, "step": 2160 }, { "epoch": 0.85, "learning_rate": 1.6265737867467472e-05, "loss": 2.4433, "step": 2165 }, { "epoch": 0.86, "learning_rate": 1.6249639668775424e-05, "loss": 2.3635, "step": 2170 }, { "epoch": 0.86, "learning_rate": 1.62335148498944e-05, "loss": 2.3916, "step": 2175 }, { "epoch": 0.86, "learning_rate": 1.621736347950767e-05, "loss": 2.325, "step": 2180 }, { "epoch": 0.86, "learning_rate": 1.6201185626411607e-05, "loss": 2.4767, "step": 2185 }, { "epoch": 0.86, "learning_rate": 1.6184981359515382e-05, "loss": 2.3271, "step": 2190 }, { "epoch": 0.87, "learning_rate": 1.6168750747840683e-05, "loss": 2.4032, "step": 2195 }, { "epoch": 0.87, "learning_rate": 1.6152493860521404e-05, "loss": 2.4398, "step": 2200 }, { "epoch": 0.87, "learning_rate": 1.613621076680337e-05, "loss": 2.3501, "step": 2205 }, { "epoch": 0.87, "learning_rate": 1.6119901536044015e-05, "loss": 2.3147, "step": 2210 }, { "epoch": 0.87, "learning_rate": 1.610356623771212e-05, "loss": 2.4519, "step": 2215 }, { "epoch": 0.88, "learning_rate": 1.6087204941387486e-05, "loss": 2.4859, "step": 2220 }, { "epoch": 0.88, "learning_rate": 1.6070817716760668e-05, "loss": 2.3667, "step": 2225 }, { "epoch": 0.88, "learning_rate": 1.6054404633632647e-05, "loss": 2.3945, "step": 2230 }, { "epoch": 0.88, "learning_rate": 1.603796576191455e-05, "loss": 2.3519, "step": 2235 }, { "epoch": 0.88, "learning_rate": 1.6021501171627366e-05, "loss": 2.4191, "step": 2240 }, { "epoch": 0.88, "learning_rate": 1.6005010932901604e-05, "loss": 2.408, "step": 2245 }, { "epoch": 0.89, "learning_rate": 1.5988495115977046e-05, "loss": 2.4436, "step": 2250 }, { "epoch": 0.89, "learning_rate": 1.5971953791202417e-05, "loss": 2.4142, "step": 2255 }, { "epoch": 0.89, "learning_rate": 1.5955387029035084e-05, "loss": 2.4192, "step": 2260 }, { "epoch": 0.89, "learning_rate": 1.5938794900040768e-05, "loss": 2.3712, "step": 2265 }, { "epoch": 0.89, "learning_rate": 1.5922177474893245e-05, "loss": 2.4247, "step": 2270 }, { "epoch": 0.9, "learning_rate": 1.5905534824374035e-05, "loss": 2.3674, "step": 2275 }, { "epoch": 0.9, "learning_rate": 1.5888867019372107e-05, "loss": 2.5493, "step": 2280 }, { "epoch": 0.9, "learning_rate": 1.5872174130883567e-05, "loss": 2.3714, "step": 2285 }, { "epoch": 0.9, "learning_rate": 1.585545623001138e-05, "loss": 2.4268, "step": 2290 }, { "epoch": 0.9, "learning_rate": 1.5838713387965027e-05, "loss": 2.4177, "step": 2295 }, { "epoch": 0.91, "learning_rate": 1.5821945676060253e-05, "loss": 2.3839, "step": 2300 }, { "epoch": 0.91, "learning_rate": 1.5805153165718707e-05, "loss": 2.37, "step": 2305 }, { "epoch": 0.91, "learning_rate": 1.5788335928467693e-05, "loss": 2.4504, "step": 2310 }, { "epoch": 0.91, "learning_rate": 1.5771494035939818e-05, "loss": 2.4738, "step": 2315 }, { "epoch": 0.91, "learning_rate": 1.575462755987272e-05, "loss": 2.4448, "step": 2320 }, { "epoch": 0.92, "learning_rate": 1.573773657210874e-05, "loss": 2.4254, "step": 2325 }, { "epoch": 0.92, "learning_rate": 1.572082114459464e-05, "loss": 2.4822, "step": 2330 }, { "epoch": 0.92, "learning_rate": 1.570388134938127e-05, "loss": 2.3381, "step": 2335 }, { "epoch": 0.92, "learning_rate": 1.568691725862328e-05, "loss": 2.3961, "step": 2340 }, { "epoch": 0.92, "learning_rate": 1.5669928944578797e-05, "loss": 2.4217, "step": 2345 }, { "epoch": 0.93, "learning_rate": 1.5652916479609144e-05, "loss": 2.4484, "step": 2350 }, { "epoch": 0.93, "learning_rate": 1.56358799361785e-05, "loss": 2.4127, "step": 2355 }, { "epoch": 0.93, "learning_rate": 1.5618819386853607e-05, "loss": 2.4082, "step": 2360 }, { "epoch": 0.93, "learning_rate": 1.560173490430346e-05, "loss": 2.4137, "step": 2365 }, { "epoch": 0.93, "learning_rate": 1.5584626561299008e-05, "loss": 2.3963, "step": 2370 }, { "epoch": 0.94, "learning_rate": 1.5567494430712818e-05, "loss": 2.398, "step": 2375 }, { "epoch": 0.94, "learning_rate": 1.555033858551879e-05, "loss": 2.3476, "step": 2380 }, { "epoch": 0.94, "learning_rate": 1.553315909879182e-05, "loss": 2.4435, "step": 2385 }, { "epoch": 0.94, "learning_rate": 1.5515956043707535e-05, "loss": 2.4849, "step": 2390 }, { "epoch": 0.94, "learning_rate": 1.5498729493541914e-05, "loss": 2.2806, "step": 2395 }, { "epoch": 0.95, "learning_rate": 1.548147952167104e-05, "loss": 2.3392, "step": 2400 }, { "epoch": 0.95, "learning_rate": 1.546420620157075e-05, "loss": 2.4154, "step": 2405 }, { "epoch": 0.95, "learning_rate": 1.5446909606816332e-05, "loss": 2.4219, "step": 2410 }, { "epoch": 0.95, "learning_rate": 1.5429589811082215e-05, "loss": 2.3888, "step": 2415 }, { "epoch": 0.95, "learning_rate": 1.5412246888141645e-05, "loss": 2.4238, "step": 2420 }, { "epoch": 0.96, "learning_rate": 1.5394880911866386e-05, "loss": 2.3031, "step": 2425 }, { "epoch": 0.96, "learning_rate": 1.53774919562264e-05, "loss": 2.2881, "step": 2430 }, { "epoch": 0.96, "learning_rate": 1.536008009528951e-05, "loss": 2.4005, "step": 2435 }, { "epoch": 0.96, "learning_rate": 1.5342645403221125e-05, "loss": 2.4371, "step": 2440 }, { "epoch": 0.96, "learning_rate": 1.5325187954283888e-05, "loss": 2.4448, "step": 2445 }, { "epoch": 0.97, "learning_rate": 1.530770782283739e-05, "loss": 2.3219, "step": 2450 }, { "epoch": 0.97, "learning_rate": 1.529020508333782e-05, "loss": 2.4135, "step": 2455 }, { "epoch": 0.97, "learning_rate": 1.5272679810337677e-05, "loss": 2.365, "step": 2460 }, { "epoch": 0.97, "learning_rate": 1.5255132078485436e-05, "loss": 2.4404, "step": 2465 }, { "epoch": 0.97, "learning_rate": 1.5237561962525239e-05, "loss": 2.3785, "step": 2470 }, { "epoch": 0.98, "learning_rate": 1.5219969537296563e-05, "loss": 2.3826, "step": 2475 }, { "epoch": 0.98, "learning_rate": 1.5202354877733925e-05, "loss": 2.305, "step": 2480 }, { "epoch": 0.98, "learning_rate": 1.5184718058866538e-05, "loss": 2.4282, "step": 2485 }, { "epoch": 0.98, "learning_rate": 1.5167059155818006e-05, "loss": 2.4481, "step": 2490 }, { "epoch": 0.98, "learning_rate": 1.5149378243805999e-05, "loss": 2.4888, "step": 2495 }, { "epoch": 0.99, "learning_rate": 1.5131675398141936e-05, "loss": 2.4635, "step": 2500 }, { "epoch": 0.99, "learning_rate": 1.5113950694230654e-05, "loss": 2.3716, "step": 2505 }, { "epoch": 0.99, "learning_rate": 1.5096204207570112e-05, "loss": 2.4266, "step": 2510 }, { "epoch": 0.99, "learning_rate": 1.5078436013751026e-05, "loss": 2.3681, "step": 2515 }, { "epoch": 0.99, "learning_rate": 1.5060646188456598e-05, "loss": 2.4378, "step": 2520 }, { "epoch": 1.0, "learning_rate": 1.5042834807462154e-05, "loss": 2.3843, "step": 2525 }, { "epoch": 1.0, "learning_rate": 1.502500194663484e-05, "loss": 2.4125, "step": 2530 }, { "epoch": 1.0, "learning_rate": 1.5007147681933298e-05, "loss": 2.4875, "step": 2535 }, { "epoch": 1.0, "learning_rate": 1.498927208940734e-05, "loss": 2.4527, "step": 2540 }, { "epoch": 1.0, "learning_rate": 1.4971375245197612e-05, "loss": 2.3885, "step": 2545 }, { "epoch": 1.01, "learning_rate": 1.4953457225535295e-05, "loss": 2.3448, "step": 2550 }, { "epoch": 1.01, "learning_rate": 1.4935518106741756e-05, "loss": 2.3505, "step": 2555 }, { "epoch": 1.01, "learning_rate": 1.491755796522824e-05, "loss": 2.4446, "step": 2560 }, { "epoch": 1.01, "learning_rate": 1.4899576877495533e-05, "loss": 2.4048, "step": 2565 }, { "epoch": 1.01, "learning_rate": 1.4881574920133646e-05, "loss": 2.4144, "step": 2570 }, { "epoch": 1.01, "learning_rate": 1.4863552169821478e-05, "loss": 2.3966, "step": 2575 }, { "epoch": 1.02, "learning_rate": 1.4845508703326504e-05, "loss": 2.507, "step": 2580 }, { "epoch": 1.02, "learning_rate": 1.4827444597504425e-05, "loss": 2.4524, "step": 2585 }, { "epoch": 1.02, "learning_rate": 1.480935992929887e-05, "loss": 2.3931, "step": 2590 }, { "epoch": 1.02, "learning_rate": 1.4791254775741038e-05, "loss": 2.402, "step": 2595 }, { "epoch": 1.02, "learning_rate": 1.47731292139494e-05, "loss": 2.361, "step": 2600 }, { "epoch": 1.03, "learning_rate": 1.4754983321129344e-05, "loss": 2.4062, "step": 2605 }, { "epoch": 1.03, "learning_rate": 1.4736817174572861e-05, "loss": 2.4315, "step": 2610 }, { "epoch": 1.03, "learning_rate": 1.4718630851658213e-05, "loss": 2.3975, "step": 2615 }, { "epoch": 1.03, "learning_rate": 1.4700424429849607e-05, "loss": 2.2759, "step": 2620 }, { "epoch": 1.03, "learning_rate": 1.468219798669685e-05, "loss": 2.3927, "step": 2625 }, { "epoch": 1.04, "learning_rate": 1.4663951599835043e-05, "loss": 2.4473, "step": 2630 }, { "epoch": 1.04, "learning_rate": 1.464568534698422e-05, "loss": 2.3687, "step": 2635 }, { "epoch": 1.04, "learning_rate": 1.4627399305949053e-05, "loss": 2.3609, "step": 2640 }, { "epoch": 1.04, "learning_rate": 1.4609093554618481e-05, "loss": 2.3298, "step": 2645 }, { "epoch": 1.04, "learning_rate": 1.459076817096542e-05, "loss": 2.3708, "step": 2650 }, { "epoch": 1.05, "learning_rate": 1.4572423233046386e-05, "loss": 2.3378, "step": 2655 }, { "epoch": 1.05, "learning_rate": 1.4554058819001209e-05, "loss": 2.3351, "step": 2660 }, { "epoch": 1.05, "learning_rate": 1.4535675007052661e-05, "loss": 2.347, "step": 2665 }, { "epoch": 1.05, "learning_rate": 1.451727187550615e-05, "loss": 2.3634, "step": 2670 }, { "epoch": 1.05, "learning_rate": 1.4498849502749358e-05, "loss": 2.3631, "step": 2675 }, { "epoch": 1.06, "learning_rate": 1.4480407967251951e-05, "loss": 2.4034, "step": 2680 }, { "epoch": 1.06, "learning_rate": 1.446194734756519e-05, "loss": 2.4199, "step": 2685 }, { "epoch": 1.06, "learning_rate": 1.4443467722321647e-05, "loss": 2.3987, "step": 2690 }, { "epoch": 1.06, "learning_rate": 1.4424969170234832e-05, "loss": 2.3888, "step": 2695 }, { "epoch": 1.06, "learning_rate": 1.4406451770098885e-05, "loss": 2.4313, "step": 2700 }, { "epoch": 1.07, "learning_rate": 1.4387915600788222e-05, "loss": 2.4165, "step": 2705 }, { "epoch": 1.07, "learning_rate": 1.4369360741257212e-05, "loss": 2.4478, "step": 2710 }, { "epoch": 1.07, "learning_rate": 1.4350787270539824e-05, "loss": 2.3312, "step": 2715 }, { "epoch": 1.07, "learning_rate": 1.433219526774931e-05, "loss": 2.3582, "step": 2720 }, { "epoch": 1.07, "learning_rate": 1.4313584812077852e-05, "loss": 2.4806, "step": 2725 }, { "epoch": 1.08, "learning_rate": 1.4294955982796243e-05, "loss": 2.3729, "step": 2730 }, { "epoch": 1.08, "learning_rate": 1.4276308859253526e-05, "loss": 2.3763, "step": 2735 }, { "epoch": 1.08, "learning_rate": 1.4257643520876677e-05, "loss": 2.28, "step": 2740 }, { "epoch": 1.08, "learning_rate": 1.4238960047170244e-05, "loss": 2.3267, "step": 2745 }, { "epoch": 1.08, "learning_rate": 1.4220258517716041e-05, "loss": 2.2964, "step": 2750 }, { "epoch": 1.09, "learning_rate": 1.4201539012172776e-05, "loss": 2.4307, "step": 2755 }, { "epoch": 1.09, "learning_rate": 1.4182801610275731e-05, "loss": 2.4229, "step": 2760 }, { "epoch": 1.09, "learning_rate": 1.4164046391836414e-05, "loss": 2.396, "step": 2765 }, { "epoch": 1.09, "learning_rate": 1.4145273436742228e-05, "loss": 2.3456, "step": 2770 }, { "epoch": 1.09, "learning_rate": 1.4126482824956118e-05, "loss": 2.3657, "step": 2775 }, { "epoch": 1.1, "learning_rate": 1.4107674636516242e-05, "loss": 2.4299, "step": 2780 }, { "epoch": 1.1, "learning_rate": 1.4088848951535623e-05, "loss": 2.3755, "step": 2785 }, { "epoch": 1.1, "learning_rate": 1.4070005850201809e-05, "loss": 2.2979, "step": 2790 }, { "epoch": 1.1, "learning_rate": 1.4051145412776536e-05, "loss": 2.4243, "step": 2795 }, { "epoch": 1.1, "learning_rate": 1.4032267719595383e-05, "loss": 2.4392, "step": 2800 }, { "epoch": 1.11, "learning_rate": 1.4013372851067423e-05, "loss": 2.401, "step": 2805 }, { "epoch": 1.11, "learning_rate": 1.3994460887674896e-05, "loss": 2.3311, "step": 2810 }, { "epoch": 1.11, "learning_rate": 1.397553190997285e-05, "loss": 2.349, "step": 2815 }, { "epoch": 1.11, "learning_rate": 1.3956585998588807e-05, "loss": 2.4336, "step": 2820 }, { "epoch": 1.11, "learning_rate": 1.393762323422242e-05, "loss": 2.439, "step": 2825 }, { "epoch": 1.12, "learning_rate": 1.3918643697645124e-05, "loss": 2.4678, "step": 2830 }, { "epoch": 1.12, "learning_rate": 1.3899647469699795e-05, "loss": 2.4207, "step": 2835 }, { "epoch": 1.12, "learning_rate": 1.388063463130041e-05, "loss": 2.43, "step": 2840 }, { "epoch": 1.12, "learning_rate": 1.3861605263431688e-05, "loss": 2.4045, "step": 2845 }, { "epoch": 1.12, "learning_rate": 1.3842559447148764e-05, "loss": 2.4006, "step": 2850 }, { "epoch": 1.13, "learning_rate": 1.3823497263576837e-05, "loss": 2.2872, "step": 2855 }, { "epoch": 1.13, "learning_rate": 1.3804418793910812e-05, "loss": 2.4296, "step": 2860 }, { "epoch": 1.13, "learning_rate": 1.3785324119414968e-05, "loss": 2.4368, "step": 2865 }, { "epoch": 1.13, "learning_rate": 1.3766213321422611e-05, "loss": 2.3958, "step": 2870 }, { "epoch": 1.13, "learning_rate": 1.3747086481335721e-05, "loss": 2.3776, "step": 2875 }, { "epoch": 1.14, "learning_rate": 1.3727943680624611e-05, "loss": 2.3652, "step": 2880 }, { "epoch": 1.14, "learning_rate": 1.3708785000827577e-05, "loss": 2.4509, "step": 2885 }, { "epoch": 1.14, "learning_rate": 1.3689610523550556e-05, "loss": 2.3153, "step": 2890 }, { "epoch": 1.14, "learning_rate": 1.3670420330466769e-05, "loss": 2.4312, "step": 2895 }, { "epoch": 1.14, "learning_rate": 1.3651214503316377e-05, "loss": 2.4979, "step": 2900 }, { "epoch": 1.15, "learning_rate": 1.3631993123906136e-05, "loss": 2.2979, "step": 2905 }, { "epoch": 1.15, "learning_rate": 1.3612756274109046e-05, "loss": 2.4211, "step": 2910 }, { "epoch": 1.15, "learning_rate": 1.3593504035864007e-05, "loss": 2.3053, "step": 2915 }, { "epoch": 1.15, "learning_rate": 1.357423649117546e-05, "loss": 2.4048, "step": 2920 }, { "epoch": 1.15, "learning_rate": 1.3554953722113043e-05, "loss": 2.415, "step": 2925 }, { "epoch": 1.15, "learning_rate": 1.3535655810811245e-05, "loss": 2.2979, "step": 2930 }, { "epoch": 1.16, "learning_rate": 1.351634283946906e-05, "loss": 2.3771, "step": 2935 }, { "epoch": 1.16, "learning_rate": 1.349701489034961e-05, "loss": 2.3933, "step": 2940 }, { "epoch": 1.16, "learning_rate": 1.3477672045779839e-05, "loss": 2.4166, "step": 2945 }, { "epoch": 1.16, "learning_rate": 1.3458314388150115e-05, "loss": 2.418, "step": 2950 }, { "epoch": 1.16, "learning_rate": 1.3438941999913919e-05, "loss": 2.3956, "step": 2955 }, { "epoch": 1.17, "learning_rate": 1.3419554963587466e-05, "loss": 2.3494, "step": 2960 }, { "epoch": 1.17, "learning_rate": 1.3400153361749373e-05, "loss": 2.359, "step": 2965 }, { "epoch": 1.17, "learning_rate": 1.3380737277040288e-05, "loss": 2.4069, "step": 2970 }, { "epoch": 1.17, "learning_rate": 1.3361306792162558e-05, "loss": 2.4351, "step": 2975 }, { "epoch": 1.17, "learning_rate": 1.334186198987986e-05, "loss": 2.3192, "step": 2980 }, { "epoch": 1.18, "learning_rate": 1.3322402953016864e-05, "loss": 2.4305, "step": 2985 }, { "epoch": 1.18, "learning_rate": 1.3302929764458863e-05, "loss": 2.4582, "step": 2990 }, { "epoch": 1.18, "learning_rate": 1.3283442507151433e-05, "loss": 2.316, "step": 2995 }, { "epoch": 1.18, "learning_rate": 1.3263941264100076e-05, "loss": 2.4488, "step": 3000 } ], "logging_steps": 5, "max_steps": 7611, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 4.0468985905152e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }