{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9964104098115465, "eval_steps": 500, "global_step": 3340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 1.1976047904191617e-05, "loss": 1.194, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.3952095808383234e-05, "loss": 1.0484, "step": 40 }, { "epoch": 0.07, "learning_rate": 3.592814371257485e-05, "loss": 0.8359, "step": 60 }, { "epoch": 0.1, "learning_rate": 4.790419161676647e-05, "loss": 0.6286, "step": 80 }, { "epoch": 0.12, "learning_rate": 5.988023952095808e-05, "loss": 0.4686, "step": 100 }, { "epoch": 0.14, "learning_rate": 7.18562874251497e-05, "loss": 0.3526, "step": 120 }, { "epoch": 0.17, "learning_rate": 8.383233532934131e-05, "loss": 0.2483, "step": 140 }, { "epoch": 0.19, "learning_rate": 9.580838323353294e-05, "loss": 0.1862, "step": 160 }, { "epoch": 0.22, "learning_rate": 0.00010778443113772456, "loss": 0.1524, "step": 180 }, { "epoch": 0.24, "learning_rate": 0.00011976047904191617, "loss": 0.1389, "step": 200 }, { "epoch": 0.26, "learning_rate": 0.0001317365269461078, "loss": 0.1361, "step": 220 }, { "epoch": 0.29, "learning_rate": 0.0001437125748502994, "loss": 0.1286, "step": 240 }, { "epoch": 0.31, "learning_rate": 0.00015568862275449103, "loss": 0.1239, "step": 260 }, { "epoch": 0.34, "learning_rate": 0.00016766467065868263, "loss": 0.1205, "step": 280 }, { "epoch": 0.36, "learning_rate": 0.00017964071856287425, "loss": 0.1206, "step": 300 }, { "epoch": 0.38, "learning_rate": 0.00019161676646706587, "loss": 0.1166, "step": 320 }, { "epoch": 0.41, "learning_rate": 0.0001996007984031936, "loss": 0.1174, "step": 340 }, { "epoch": 0.43, "learning_rate": 0.000198270126413839, "loss": 0.115, "step": 360 }, { "epoch": 0.45, "learning_rate": 0.00019693945442448438, "loss": 0.1158, "step": 380 }, { "epoch": 0.48, "learning_rate": 0.00019560878243512974, "loss": 0.1157, "step": 400 }, { "epoch": 0.5, "learning_rate": 0.00019427811044577512, "loss": 0.114, "step": 420 }, { "epoch": 0.53, "learning_rate": 0.00019294743845642048, "loss": 0.1115, "step": 440 }, { "epoch": 0.55, "learning_rate": 0.00019161676646706587, "loss": 0.1152, "step": 460 }, { "epoch": 0.57, "learning_rate": 0.00019028609447771126, "loss": 0.1107, "step": 480 }, { "epoch": 0.6, "learning_rate": 0.00018895542248835662, "loss": 0.1117, "step": 500 }, { "epoch": 0.62, "learning_rate": 0.000187624750499002, "loss": 0.1096, "step": 520 }, { "epoch": 0.65, "learning_rate": 0.00018629407850964737, "loss": 0.1108, "step": 540 }, { "epoch": 0.67, "learning_rate": 0.00018496340652029275, "loss": 0.1103, "step": 560 }, { "epoch": 0.69, "learning_rate": 0.00018363273453093811, "loss": 0.106, "step": 580 }, { "epoch": 0.72, "learning_rate": 0.0001823020625415835, "loss": 0.1066, "step": 600 }, { "epoch": 0.74, "learning_rate": 0.0001809713905522289, "loss": 0.1078, "step": 620 }, { "epoch": 0.77, "learning_rate": 0.00017964071856287425, "loss": 0.1074, "step": 640 }, { "epoch": 0.79, "learning_rate": 0.00017831004657351964, "loss": 0.1047, "step": 660 }, { "epoch": 0.81, "learning_rate": 0.000176979374584165, "loss": 0.1051, "step": 680 }, { "epoch": 0.84, "learning_rate": 0.00017564870259481038, "loss": 0.1052, "step": 700 }, { "epoch": 0.86, "learning_rate": 0.00017431803060545577, "loss": 0.1068, "step": 720 }, { "epoch": 0.89, "learning_rate": 0.00017298735861610113, "loss": 0.1081, "step": 740 }, { "epoch": 0.91, "learning_rate": 0.00017165668662674652, "loss": 0.1069, "step": 760 }, { "epoch": 0.93, "learning_rate": 0.00017032601463739188, "loss": 0.1072, "step": 780 }, { "epoch": 0.96, "learning_rate": 0.00016899534264803727, "loss": 0.1057, "step": 800 }, { "epoch": 0.98, "learning_rate": 0.00016766467065868263, "loss": 0.1057, "step": 820 }, { "epoch": 1.01, "learning_rate": 0.00016633399866932801, "loss": 0.1072, "step": 840 }, { "epoch": 1.03, "learning_rate": 0.0001650033266799734, "loss": 0.1054, "step": 860 }, { "epoch": 1.05, "learning_rate": 0.00016367265469061876, "loss": 0.1064, "step": 880 }, { "epoch": 1.08, "learning_rate": 0.00016234198270126415, "loss": 0.1069, "step": 900 }, { "epoch": 1.1, "learning_rate": 0.0001610113107119095, "loss": 0.1062, "step": 920 }, { "epoch": 1.12, "learning_rate": 0.0001596806387225549, "loss": 0.1033, "step": 940 }, { "epoch": 1.15, "learning_rate": 0.00015834996673320028, "loss": 0.1022, "step": 960 }, { "epoch": 1.17, "learning_rate": 0.00015701929474384565, "loss": 0.1023, "step": 980 }, { "epoch": 1.2, "learning_rate": 0.00015568862275449103, "loss": 0.1024, "step": 1000 }, { "epoch": 1.22, "learning_rate": 0.0001543579507651364, "loss": 0.1008, "step": 1020 }, { "epoch": 1.24, "learning_rate": 0.00015302727877578178, "loss": 0.1011, "step": 1040 }, { "epoch": 1.27, "learning_rate": 0.00015169660678642714, "loss": 0.1035, "step": 1060 }, { "epoch": 1.29, "learning_rate": 0.00015036593479707253, "loss": 0.1012, "step": 1080 }, { "epoch": 1.32, "learning_rate": 0.00014903526280771792, "loss": 0.1004, "step": 1100 }, { "epoch": 1.34, "learning_rate": 0.00014770459081836328, "loss": 0.0996, "step": 1120 }, { "epoch": 1.36, "learning_rate": 0.00014637391882900866, "loss": 0.102, "step": 1140 }, { "epoch": 1.39, "learning_rate": 0.00014504324683965402, "loss": 0.0976, "step": 1160 }, { "epoch": 1.41, "learning_rate": 0.0001437125748502994, "loss": 0.1024, "step": 1180 }, { "epoch": 1.44, "learning_rate": 0.0001423819028609448, "loss": 0.1006, "step": 1200 }, { "epoch": 1.46, "learning_rate": 0.00014105123087159016, "loss": 0.101, "step": 1220 }, { "epoch": 1.48, "learning_rate": 0.00013972055888223555, "loss": 0.102, "step": 1240 }, { "epoch": 1.51, "learning_rate": 0.0001383898868928809, "loss": 0.1018, "step": 1260 }, { "epoch": 1.53, "learning_rate": 0.0001370592149035263, "loss": 0.0999, "step": 1280 }, { "epoch": 1.56, "learning_rate": 0.00013572854291417165, "loss": 0.1021, "step": 1300 }, { "epoch": 1.58, "learning_rate": 0.00013439787092481704, "loss": 0.0996, "step": 1320 }, { "epoch": 1.6, "learning_rate": 0.00013306719893546243, "loss": 0.1005, "step": 1340 }, { "epoch": 1.63, "learning_rate": 0.0001317365269461078, "loss": 0.0993, "step": 1360 }, { "epoch": 1.65, "learning_rate": 0.00013040585495675318, "loss": 0.0999, "step": 1380 }, { "epoch": 1.68, "learning_rate": 0.00012907518296739854, "loss": 0.1011, "step": 1400 }, { "epoch": 1.7, "learning_rate": 0.00012774451097804392, "loss": 0.0968, "step": 1420 }, { "epoch": 1.72, "learning_rate": 0.0001264138389886893, "loss": 0.0983, "step": 1440 }, { "epoch": 1.75, "learning_rate": 0.00012508316699933467, "loss": 0.0987, "step": 1460 }, { "epoch": 1.77, "learning_rate": 0.00012375249500998006, "loss": 0.0984, "step": 1480 }, { "epoch": 1.79, "learning_rate": 0.00012242182302062542, "loss": 0.0956, "step": 1500 }, { "epoch": 1.82, "learning_rate": 0.0001210911510312708, "loss": 0.0971, "step": 1520 }, { "epoch": 1.84, "learning_rate": 0.00011976047904191617, "loss": 0.0986, "step": 1540 }, { "epoch": 1.87, "learning_rate": 0.00011842980705256155, "loss": 0.0995, "step": 1560 }, { "epoch": 1.89, "learning_rate": 0.00011709913506320693, "loss": 0.0989, "step": 1580 }, { "epoch": 1.91, "learning_rate": 0.0001157684630738523, "loss": 0.1006, "step": 1600 }, { "epoch": 1.94, "learning_rate": 0.00011443779108449768, "loss": 0.0982, "step": 1620 }, { "epoch": 1.96, "learning_rate": 0.00011310711909514305, "loss": 0.0982, "step": 1640 }, { "epoch": 1.99, "learning_rate": 0.00011177644710578842, "loss": 0.0998, "step": 1660 }, { "epoch": 2.01, "learning_rate": 0.00011044577511643381, "loss": 0.0986, "step": 1680 }, { "epoch": 2.03, "learning_rate": 0.00010911510312707917, "loss": 0.0996, "step": 1700 }, { "epoch": 2.06, "learning_rate": 0.00010778443113772456, "loss": 0.0994, "step": 1720 }, { "epoch": 2.08, "learning_rate": 0.00010645375914836992, "loss": 0.0999, "step": 1740 }, { "epoch": 2.11, "learning_rate": 0.0001051230871590153, "loss": 0.1006, "step": 1760 }, { "epoch": 2.13, "learning_rate": 0.00010379241516966068, "loss": 0.0954, "step": 1780 }, { "epoch": 2.15, "learning_rate": 0.00010246174318030605, "loss": 0.0958, "step": 1800 }, { "epoch": 2.18, "learning_rate": 0.00010113107119095144, "loss": 0.0969, "step": 1820 }, { "epoch": 2.2, "learning_rate": 9.98003992015968e-05, "loss": 0.0949, "step": 1840 }, { "epoch": 2.23, "learning_rate": 9.846972721224219e-05, "loss": 0.0957, "step": 1860 }, { "epoch": 2.25, "learning_rate": 9.713905522288756e-05, "loss": 0.0951, "step": 1880 }, { "epoch": 2.27, "learning_rate": 9.580838323353294e-05, "loss": 0.0966, "step": 1900 }, { "epoch": 2.3, "learning_rate": 9.447771124417831e-05, "loss": 0.0946, "step": 1920 }, { "epoch": 2.32, "learning_rate": 9.314703925482368e-05, "loss": 0.0947, "step": 1940 }, { "epoch": 2.35, "learning_rate": 9.181636726546906e-05, "loss": 0.0947, "step": 1960 }, { "epoch": 2.37, "learning_rate": 9.048569527611444e-05, "loss": 0.0961, "step": 1980 }, { "epoch": 2.39, "learning_rate": 8.915502328675982e-05, "loss": 0.0926, "step": 2000 }, { "epoch": 2.42, "learning_rate": 8.782435129740519e-05, "loss": 0.0972, "step": 2020 }, { "epoch": 2.44, "learning_rate": 8.649367930805057e-05, "loss": 0.095, "step": 2040 }, { "epoch": 2.46, "learning_rate": 8.516300731869594e-05, "loss": 0.0969, "step": 2060 }, { "epoch": 2.49, "learning_rate": 8.383233532934131e-05, "loss": 0.0964, "step": 2080 }, { "epoch": 2.51, "learning_rate": 8.25016633399867e-05, "loss": 0.0944, "step": 2100 }, { "epoch": 2.54, "learning_rate": 8.117099135063207e-05, "loss": 0.097, "step": 2120 }, { "epoch": 2.56, "learning_rate": 7.984031936127745e-05, "loss": 0.0955, "step": 2140 }, { "epoch": 2.58, "learning_rate": 7.850964737192282e-05, "loss": 0.0957, "step": 2160 }, { "epoch": 2.61, "learning_rate": 7.71789753825682e-05, "loss": 0.095, "step": 2180 }, { "epoch": 2.63, "learning_rate": 7.584830339321357e-05, "loss": 0.0941, "step": 2200 }, { "epoch": 2.66, "learning_rate": 7.451763140385896e-05, "loss": 0.096, "step": 2220 }, { "epoch": 2.68, "learning_rate": 7.318695941450433e-05, "loss": 0.0942, "step": 2240 }, { "epoch": 2.7, "learning_rate": 7.18562874251497e-05, "loss": 0.0936, "step": 2260 }, { "epoch": 2.73, "learning_rate": 7.052561543579508e-05, "loss": 0.0928, "step": 2280 }, { "epoch": 2.75, "learning_rate": 6.919494344644045e-05, "loss": 0.0948, "step": 2300 }, { "epoch": 2.78, "learning_rate": 6.786427145708583e-05, "loss": 0.0937, "step": 2320 }, { "epoch": 2.8, "learning_rate": 6.653359946773121e-05, "loss": 0.0902, "step": 2340 }, { "epoch": 2.82, "learning_rate": 6.520292747837659e-05, "loss": 0.093, "step": 2360 }, { "epoch": 2.85, "learning_rate": 6.387225548902196e-05, "loss": 0.0936, "step": 2380 }, { "epoch": 2.87, "learning_rate": 6.254158349966734e-05, "loss": 0.0958, "step": 2400 }, { "epoch": 2.9, "learning_rate": 6.121091151031271e-05, "loss": 0.0948, "step": 2420 }, { "epoch": 2.92, "learning_rate": 5.988023952095808e-05, "loss": 0.0965, "step": 2440 }, { "epoch": 2.94, "learning_rate": 5.8549567531603464e-05, "loss": 0.0932, "step": 2460 }, { "epoch": 2.97, "learning_rate": 5.721889554224884e-05, "loss": 0.0941, "step": 2480 }, { "epoch": 2.99, "learning_rate": 5.588822355289421e-05, "loss": 0.0957, "step": 2500 }, { "epoch": 3.02, "learning_rate": 5.4557551563539585e-05, "loss": 0.0943, "step": 2520 }, { "epoch": 3.04, "learning_rate": 5.322687957418496e-05, "loss": 0.0954, "step": 2540 }, { "epoch": 3.06, "learning_rate": 5.189620758483034e-05, "loss": 0.095, "step": 2560 }, { "epoch": 3.09, "learning_rate": 5.056553559547572e-05, "loss": 0.096, "step": 2580 }, { "epoch": 3.11, "learning_rate": 4.9234863606121094e-05, "loss": 0.0954, "step": 2600 }, { "epoch": 3.13, "learning_rate": 4.790419161676647e-05, "loss": 0.0913, "step": 2620 }, { "epoch": 3.16, "learning_rate": 4.657351962741184e-05, "loss": 0.092, "step": 2640 }, { "epoch": 3.18, "learning_rate": 4.524284763805722e-05, "loss": 0.0933, "step": 2660 }, { "epoch": 3.21, "learning_rate": 4.3912175648702596e-05, "loss": 0.0915, "step": 2680 }, { "epoch": 3.23, "learning_rate": 4.258150365934797e-05, "loss": 0.0903, "step": 2700 }, { "epoch": 3.25, "learning_rate": 4.125083166999335e-05, "loss": 0.0924, "step": 2720 }, { "epoch": 3.28, "learning_rate": 3.9920159680638724e-05, "loss": 0.0926, "step": 2740 }, { "epoch": 3.3, "learning_rate": 3.85894876912841e-05, "loss": 0.0923, "step": 2760 }, { "epoch": 3.33, "learning_rate": 3.725881570192948e-05, "loss": 0.0899, "step": 2780 }, { "epoch": 3.35, "learning_rate": 3.592814371257485e-05, "loss": 0.0922, "step": 2800 }, { "epoch": 3.37, "learning_rate": 3.4597471723220226e-05, "loss": 0.0905, "step": 2820 }, { "epoch": 3.4, "learning_rate": 3.326679973386561e-05, "loss": 0.0897, "step": 2840 }, { "epoch": 3.42, "learning_rate": 3.193612774451098e-05, "loss": 0.0923, "step": 2860 }, { "epoch": 3.45, "learning_rate": 3.0605455755156355e-05, "loss": 0.0922, "step": 2880 }, { "epoch": 3.47, "learning_rate": 2.9274783765801732e-05, "loss": 0.0939, "step": 2900 }, { "epoch": 3.49, "learning_rate": 2.7944111776447106e-05, "loss": 0.0931, "step": 2920 }, { "epoch": 3.52, "learning_rate": 2.661343978709248e-05, "loss": 0.0898, "step": 2940 }, { "epoch": 3.54, "learning_rate": 2.528276779773786e-05, "loss": 0.0943, "step": 2960 }, { "epoch": 3.57, "learning_rate": 2.3952095808383234e-05, "loss": 0.0932, "step": 2980 }, { "epoch": 3.59, "learning_rate": 2.2687957418496342e-05, "loss": 0.0909, "step": 3000 }, { "epoch": 3.61, "learning_rate": 2.135728542914172e-05, "loss": 0.0926, "step": 3020 }, { "epoch": 3.64, "learning_rate": 2.0026613439787093e-05, "loss": 0.0902, "step": 3040 }, { "epoch": 3.66, "learning_rate": 1.869594145043247e-05, "loss": 0.0931, "step": 3060 }, { "epoch": 3.69, "learning_rate": 1.7365269461077845e-05, "loss": 0.0892, "step": 3080 }, { "epoch": 3.71, "learning_rate": 1.603459747172322e-05, "loss": 0.0912, "step": 3100 }, { "epoch": 3.73, "learning_rate": 1.4703925482368597e-05, "loss": 0.0897, "step": 3120 }, { "epoch": 3.76, "learning_rate": 1.3373253493013973e-05, "loss": 0.0912, "step": 3140 }, { "epoch": 3.78, "learning_rate": 1.2042581503659348e-05, "loss": 0.0918, "step": 3160 }, { "epoch": 3.8, "learning_rate": 1.0711909514304724e-05, "loss": 0.0861, "step": 3180 }, { "epoch": 3.83, "learning_rate": 9.3812375249501e-06, "loss": 0.091, "step": 3200 }, { "epoch": 3.85, "learning_rate": 8.050565535595477e-06, "loss": 0.0904, "step": 3220 }, { "epoch": 3.88, "learning_rate": 6.719893546240852e-06, "loss": 0.093, "step": 3240 }, { "epoch": 3.9, "learning_rate": 5.3892215568862275e-06, "loss": 0.0928, "step": 3260 }, { "epoch": 3.92, "learning_rate": 4.058549567531603e-06, "loss": 0.092, "step": 3280 }, { "epoch": 3.95, "learning_rate": 2.7278775781769794e-06, "loss": 0.0905, "step": 3300 }, { "epoch": 3.97, "learning_rate": 1.3972055888223554e-06, "loss": 0.0913, "step": 3320 }, { "epoch": 4.0, "learning_rate": 6.65335994677312e-08, "loss": 0.0927, "step": 3340 } ], "logging_steps": 20, "max_steps": 3340, "num_train_epochs": 4, "save_steps": 500, "total_flos": 1.6896316557662093e+19, "trial_name": null, "trial_params": null }