{ "best_metric": 6.209214687347412, "best_model_checkpoint": "./results/models/checkpoint-104305", "epoch": 24.0, "eval_steps": 500, "global_step": 108840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11, "learning_rate": 0.0019955898566703417, "loss": 6.7688, "step": 500 }, { "epoch": 0.22, "learning_rate": 0.001991179713340684, "loss": 6.6804, "step": 1000 }, { "epoch": 0.33, "learning_rate": 0.001986769570011025, "loss": 6.6645, "step": 1500 }, { "epoch": 0.44, "learning_rate": 0.0019823594266813673, "loss": 6.6495, "step": 2000 }, { "epoch": 0.55, "learning_rate": 0.001977949283351709, "loss": 6.6246, "step": 2500 }, { "epoch": 0.66, "learning_rate": 0.0019735391400220507, "loss": 6.5823, "step": 3000 }, { "epoch": 0.77, "learning_rate": 0.001969128996692393, "loss": 6.5645, "step": 3500 }, { "epoch": 0.88, "learning_rate": 0.001964718853362734, "loss": 6.5409, "step": 4000 }, { "epoch": 0.99, "learning_rate": 0.001960308710033076, "loss": 6.5185, "step": 4500 }, { "epoch": 1.0, "eval_loss": 6.5362162590026855, "eval_runtime": 3.2086, "eval_samples_per_second": 90.695, "eval_steps_per_second": 1.558, "step": 4535 }, { "epoch": 1.1, "learning_rate": 0.001955898566703418, "loss": 6.501, "step": 5000 }, { "epoch": 1.21, "learning_rate": 0.0019514884233737598, "loss": 6.5008, "step": 5500 }, { "epoch": 1.32, "learning_rate": 0.0019470782800441013, "loss": 6.5114, "step": 6000 }, { "epoch": 1.43, "learning_rate": 0.0019426681367144432, "loss": 6.4978, "step": 6500 }, { "epoch": 1.54, "learning_rate": 0.0019382579933847851, "loss": 6.5002, "step": 7000 }, { "epoch": 1.65, "learning_rate": 0.0019338478500551268, "loss": 6.4791, "step": 7500 }, { "epoch": 1.76, "learning_rate": 0.0019294377067254687, "loss": 6.4779, "step": 8000 }, { "epoch": 1.87, "learning_rate": 0.0019250275633958104, "loss": 6.4694, "step": 8500 }, { "epoch": 1.98, "learning_rate": 0.0019206174200661521, "loss": 6.466, "step": 9000 }, { "epoch": 2.0, "eval_loss": 6.482348918914795, "eval_runtime": 3.1798, "eval_samples_per_second": 91.514, "eval_steps_per_second": 1.572, "step": 9070 }, { "epoch": 2.09, "learning_rate": 0.001916207276736494, "loss": 6.4515, "step": 9500 }, { "epoch": 2.21, "learning_rate": 0.0019117971334068357, "loss": 6.4551, "step": 10000 }, { "epoch": 2.32, "learning_rate": 0.0019073869900771776, "loss": 6.444, "step": 10500 }, { "epoch": 2.43, "learning_rate": 0.0019029768467475193, "loss": 6.4236, "step": 11000 }, { "epoch": 2.54, "learning_rate": 0.001898566703417861, "loss": 6.4277, "step": 11500 }, { "epoch": 2.65, "learning_rate": 0.001894156560088203, "loss": 6.4185, "step": 12000 }, { "epoch": 2.76, "learning_rate": 0.0018897464167585446, "loss": 6.4164, "step": 12500 }, { "epoch": 2.87, "learning_rate": 0.0018853362734288866, "loss": 6.4177, "step": 13000 }, { "epoch": 2.98, "learning_rate": 0.0018809261300992283, "loss": 6.4135, "step": 13500 }, { "epoch": 3.0, "eval_loss": 6.432834625244141, "eval_runtime": 3.1903, "eval_samples_per_second": 91.214, "eval_steps_per_second": 1.567, "step": 13605 }, { "epoch": 3.09, "learning_rate": 0.00187651598676957, "loss": 6.4072, "step": 14000 }, { "epoch": 3.2, "learning_rate": 0.0018721058434399119, "loss": 6.3986, "step": 14500 }, { "epoch": 3.31, "learning_rate": 0.0018676957001102538, "loss": 6.3877, "step": 15000 }, { "epoch": 3.42, "learning_rate": 0.0018632855567805953, "loss": 6.3881, "step": 15500 }, { "epoch": 3.53, "learning_rate": 0.0018588754134509372, "loss": 6.3853, "step": 16000 }, { "epoch": 3.64, "learning_rate": 0.0018544652701212789, "loss": 6.3969, "step": 16500 }, { "epoch": 3.75, "learning_rate": 0.0018500551267916208, "loss": 6.3868, "step": 17000 }, { "epoch": 3.86, "learning_rate": 0.0018456449834619627, "loss": 6.3914, "step": 17500 }, { "epoch": 3.97, "learning_rate": 0.0018412348401323042, "loss": 6.3761, "step": 18000 }, { "epoch": 4.0, "eval_loss": 6.406026840209961, "eval_runtime": 3.1898, "eval_samples_per_second": 91.228, "eval_steps_per_second": 1.567, "step": 18140 }, { "epoch": 4.08, "learning_rate": 0.001836824696802646, "loss": 6.3767, "step": 18500 }, { "epoch": 4.19, "learning_rate": 0.001832414553472988, "loss": 6.3771, "step": 19000 }, { "epoch": 4.3, "learning_rate": 0.0018280044101433297, "loss": 6.3629, "step": 19500 }, { "epoch": 4.41, "learning_rate": 0.0018235942668136716, "loss": 6.3658, "step": 20000 }, { "epoch": 4.52, "learning_rate": 0.0018191841234840131, "loss": 6.3683, "step": 20500 }, { "epoch": 4.63, "learning_rate": 0.001814773980154355, "loss": 6.3679, "step": 21000 }, { "epoch": 4.74, "learning_rate": 0.001810363836824697, "loss": 6.3651, "step": 21500 }, { "epoch": 4.85, "learning_rate": 0.0018059536934950386, "loss": 6.3492, "step": 22000 }, { "epoch": 4.96, "learning_rate": 0.0018015435501653806, "loss": 6.3456, "step": 22500 }, { "epoch": 5.0, "eval_loss": 6.364647388458252, "eval_runtime": 3.181, "eval_samples_per_second": 91.481, "eval_steps_per_second": 1.572, "step": 22675 }, { "epoch": 5.07, "learning_rate": 0.001797133406835722, "loss": 6.3347, "step": 23000 }, { "epoch": 5.18, "learning_rate": 0.001792723263506064, "loss": 6.3421, "step": 23500 }, { "epoch": 5.29, "learning_rate": 0.0017883131201764059, "loss": 6.3458, "step": 24000 }, { "epoch": 5.4, "learning_rate": 0.0017839029768467476, "loss": 6.34, "step": 24500 }, { "epoch": 5.51, "learning_rate": 0.0017794928335170893, "loss": 6.3494, "step": 25000 }, { "epoch": 5.62, "learning_rate": 0.0017750826901874312, "loss": 6.3306, "step": 25500 }, { "epoch": 5.73, "learning_rate": 0.0017706725468577729, "loss": 6.3336, "step": 26000 }, { "epoch": 5.84, "learning_rate": 0.0017662624035281148, "loss": 6.3347, "step": 26500 }, { "epoch": 5.95, "learning_rate": 0.0017618522601984565, "loss": 6.3298, "step": 27000 }, { "epoch": 6.0, "eval_loss": 6.354611396789551, "eval_runtime": 3.181, "eval_samples_per_second": 91.481, "eval_steps_per_second": 1.572, "step": 27210 }, { "epoch": 6.06, "learning_rate": 0.0017574421168687982, "loss": 6.3297, "step": 27500 }, { "epoch": 6.17, "learning_rate": 0.00175303197353914, "loss": 6.3288, "step": 28000 }, { "epoch": 6.28, "learning_rate": 0.0017486218302094818, "loss": 6.3304, "step": 28500 }, { "epoch": 6.39, "learning_rate": 0.0017442116868798237, "loss": 6.3151, "step": 29000 }, { "epoch": 6.5, "learning_rate": 0.0017398015435501656, "loss": 6.3186, "step": 29500 }, { "epoch": 6.62, "learning_rate": 0.001735391400220507, "loss": 6.3194, "step": 30000 }, { "epoch": 6.73, "learning_rate": 0.001730981256890849, "loss": 6.3115, "step": 30500 }, { "epoch": 6.84, "learning_rate": 0.0017265711135611907, "loss": 6.3191, "step": 31000 }, { "epoch": 6.95, "learning_rate": 0.0017221609702315326, "loss": 6.3108, "step": 31500 }, { "epoch": 7.0, "eval_loss": 6.336333274841309, "eval_runtime": 3.1987, "eval_samples_per_second": 90.975, "eval_steps_per_second": 1.563, "step": 31745 }, { "epoch": 7.06, "learning_rate": 0.0017177508269018745, "loss": 6.3078, "step": 32000 }, { "epoch": 7.17, "learning_rate": 0.001713340683572216, "loss": 6.3046, "step": 32500 }, { "epoch": 7.28, "learning_rate": 0.001708930540242558, "loss": 6.3013, "step": 33000 }, { "epoch": 7.39, "learning_rate": 0.0017045203969128996, "loss": 6.3082, "step": 33500 }, { "epoch": 7.5, "learning_rate": 0.0017001102535832415, "loss": 6.3015, "step": 34000 }, { "epoch": 7.61, "learning_rate": 0.0016957001102535832, "loss": 6.2922, "step": 34500 }, { "epoch": 7.72, "learning_rate": 0.001691289966923925, "loss": 6.2995, "step": 35000 }, { "epoch": 7.83, "learning_rate": 0.0016868798235942669, "loss": 6.302, "step": 35500 }, { "epoch": 7.94, "learning_rate": 0.0016824696802646088, "loss": 6.3015, "step": 36000 }, { "epoch": 8.0, "eval_loss": 6.320508003234863, "eval_runtime": 3.1701, "eval_samples_per_second": 91.795, "eval_steps_per_second": 1.577, "step": 36280 }, { "epoch": 8.05, "learning_rate": 0.0016780595369349505, "loss": 6.2848, "step": 36500 }, { "epoch": 8.16, "learning_rate": 0.0016736493936052922, "loss": 6.297, "step": 37000 }, { "epoch": 8.27, "learning_rate": 0.0016692392502756339, "loss": 6.2874, "step": 37500 }, { "epoch": 8.38, "learning_rate": 0.0016648291069459758, "loss": 6.2883, "step": 38000 }, { "epoch": 8.49, "learning_rate": 0.0016604189636163177, "loss": 6.2775, "step": 38500 }, { "epoch": 8.6, "learning_rate": 0.0016560088202866594, "loss": 6.2791, "step": 39000 }, { "epoch": 8.71, "learning_rate": 0.001651598676957001, "loss": 6.2816, "step": 39500 }, { "epoch": 8.82, "learning_rate": 0.0016471885336273428, "loss": 6.2744, "step": 40000 }, { "epoch": 8.93, "learning_rate": 0.0016427783902976847, "loss": 6.2721, "step": 40500 }, { "epoch": 9.0, "eval_loss": 6.303074836730957, "eval_runtime": 3.1711, "eval_samples_per_second": 91.767, "eval_steps_per_second": 1.577, "step": 40815 }, { "epoch": 9.04, "learning_rate": 0.0016383682469680266, "loss": 6.276, "step": 41000 }, { "epoch": 9.15, "learning_rate": 0.0016339581036383683, "loss": 6.2713, "step": 41500 }, { "epoch": 9.26, "learning_rate": 0.00162954796030871, "loss": 6.2678, "step": 42000 }, { "epoch": 9.37, "learning_rate": 0.001625137816979052, "loss": 6.2672, "step": 42500 }, { "epoch": 9.48, "learning_rate": 0.0016207276736493936, "loss": 6.2596, "step": 43000 }, { "epoch": 9.59, "learning_rate": 0.0016163175303197355, "loss": 6.2705, "step": 43500 }, { "epoch": 9.7, "learning_rate": 0.001611907386990077, "loss": 6.2771, "step": 44000 }, { "epoch": 9.81, "learning_rate": 0.001607497243660419, "loss": 6.2601, "step": 44500 }, { "epoch": 9.92, "learning_rate": 0.0016030871003307608, "loss": 6.2657, "step": 45000 }, { "epoch": 10.0, "eval_loss": 6.291815280914307, "eval_runtime": 3.1834, "eval_samples_per_second": 91.412, "eval_steps_per_second": 1.571, "step": 45350 }, { "epoch": 10.03, "learning_rate": 0.0015986769570011025, "loss": 6.264, "step": 45500 }, { "epoch": 10.14, "learning_rate": 0.0015942668136714445, "loss": 6.2611, "step": 46000 }, { "epoch": 10.25, "learning_rate": 0.0015898566703417862, "loss": 6.2654, "step": 46500 }, { "epoch": 10.36, "learning_rate": 0.0015854465270121279, "loss": 6.258, "step": 47000 }, { "epoch": 10.47, "learning_rate": 0.0015810363836824698, "loss": 6.2543, "step": 47500 }, { "epoch": 10.58, "learning_rate": 0.0015766262403528115, "loss": 6.2513, "step": 48000 }, { "epoch": 10.69, "learning_rate": 0.0015722160970231534, "loss": 6.2556, "step": 48500 }, { "epoch": 10.8, "learning_rate": 0.001567805953693495, "loss": 6.2581, "step": 49000 }, { "epoch": 10.92, "learning_rate": 0.0015633958103638368, "loss": 6.2431, "step": 49500 }, { "epoch": 11.0, "eval_loss": 6.28529691696167, "eval_runtime": 3.1715, "eval_samples_per_second": 91.755, "eval_steps_per_second": 1.577, "step": 49885 }, { "epoch": 11.03, "learning_rate": 0.0015589856670341787, "loss": 6.2599, "step": 50000 }, { "epoch": 11.14, "learning_rate": 0.0015545755237045204, "loss": 6.2521, "step": 50500 }, { "epoch": 11.25, "learning_rate": 0.0015501653803748623, "loss": 6.2548, "step": 51000 }, { "epoch": 11.36, "learning_rate": 0.001545755237045204, "loss": 6.2448, "step": 51500 }, { "epoch": 11.47, "learning_rate": 0.0015413450937155457, "loss": 6.2482, "step": 52000 }, { "epoch": 11.58, "learning_rate": 0.0015369349503858876, "loss": 6.2521, "step": 52500 }, { "epoch": 11.69, "learning_rate": 0.0015325248070562295, "loss": 6.2511, "step": 53000 }, { "epoch": 11.8, "learning_rate": 0.001528114663726571, "loss": 6.2509, "step": 53500 }, { "epoch": 11.91, "learning_rate": 0.001523704520396913, "loss": 6.2467, "step": 54000 }, { "epoch": 12.0, "eval_loss": 6.272589206695557, "eval_runtime": 3.2247, "eval_samples_per_second": 90.241, "eval_steps_per_second": 1.551, "step": 54420 }, { "epoch": 12.02, "learning_rate": 0.0015192943770672546, "loss": 6.2525, "step": 54500 }, { "epoch": 12.13, "learning_rate": 0.0015148842337375965, "loss": 6.236, "step": 55000 }, { "epoch": 12.24, "learning_rate": 0.0015104740904079384, "loss": 6.2487, "step": 55500 }, { "epoch": 12.35, "learning_rate": 0.00150606394707828, "loss": 6.2375, "step": 56000 }, { "epoch": 12.46, "learning_rate": 0.0015016538037486218, "loss": 6.2375, "step": 56500 }, { "epoch": 12.57, "learning_rate": 0.0014972436604189638, "loss": 6.2414, "step": 57000 }, { "epoch": 12.68, "learning_rate": 0.0014928335170893055, "loss": 6.2343, "step": 57500 }, { "epoch": 12.79, "learning_rate": 0.0014884233737596474, "loss": 6.225, "step": 58000 }, { "epoch": 12.9, "learning_rate": 0.0014840132304299888, "loss": 6.2336, "step": 58500 }, { "epoch": 13.0, "eval_loss": 6.260488033294678, "eval_runtime": 3.1929, "eval_samples_per_second": 91.14, "eval_steps_per_second": 1.566, "step": 58955 }, { "epoch": 13.01, "learning_rate": 0.0014796030871003308, "loss": 6.2281, "step": 59000 }, { "epoch": 13.12, "learning_rate": 0.0014751929437706727, "loss": 6.2288, "step": 59500 }, { "epoch": 13.23, "learning_rate": 0.0014707828004410144, "loss": 6.2271, "step": 60000 }, { "epoch": 13.34, "learning_rate": 0.0014663726571113563, "loss": 6.232, "step": 60500 }, { "epoch": 13.45, "learning_rate": 0.0014619625137816978, "loss": 6.2307, "step": 61000 }, { "epoch": 13.56, "learning_rate": 0.0014575523704520397, "loss": 6.2294, "step": 61500 }, { "epoch": 13.67, "learning_rate": 0.0014531422271223816, "loss": 6.2215, "step": 62000 }, { "epoch": 13.78, "learning_rate": 0.0014487320837927233, "loss": 6.2218, "step": 62500 }, { "epoch": 13.89, "learning_rate": 0.001444321940463065, "loss": 6.2279, "step": 63000 }, { "epoch": 14.0, "eval_loss": 6.25548791885376, "eval_runtime": 3.2087, "eval_samples_per_second": 90.691, "eval_steps_per_second": 1.558, "step": 63490 }, { "epoch": 14.0, "learning_rate": 0.001439911797133407, "loss": 6.2282, "step": 63500 }, { "epoch": 14.11, "learning_rate": 0.0014355016538037486, "loss": 6.2224, "step": 64000 }, { "epoch": 14.22, "learning_rate": 0.0014310915104740905, "loss": 6.2242, "step": 64500 }, { "epoch": 14.33, "learning_rate": 0.0014266813671444322, "loss": 6.2338, "step": 65000 }, { "epoch": 14.44, "learning_rate": 0.001422271223814774, "loss": 6.2136, "step": 65500 }, { "epoch": 14.55, "learning_rate": 0.0014178610804851158, "loss": 6.2176, "step": 66000 }, { "epoch": 14.66, "learning_rate": 0.0014134509371554575, "loss": 6.2128, "step": 66500 }, { "epoch": 14.77, "learning_rate": 0.0014090407938257994, "loss": 6.2227, "step": 67000 }, { "epoch": 14.88, "learning_rate": 0.0014046306504961414, "loss": 6.2145, "step": 67500 }, { "epoch": 14.99, "learning_rate": 0.0014002205071664828, "loss": 6.2222, "step": 68000 }, { "epoch": 15.0, "eval_loss": 6.248010158538818, "eval_runtime": 3.1977, "eval_samples_per_second": 91.003, "eval_steps_per_second": 1.564, "step": 68025 }, { "epoch": 15.1, "learning_rate": 0.0013958103638368248, "loss": 6.2192, "step": 68500 }, { "epoch": 15.21, "learning_rate": 0.0013914002205071664, "loss": 6.2112, "step": 69000 }, { "epoch": 15.33, "learning_rate": 0.0013869900771775084, "loss": 6.2227, "step": 69500 }, { "epoch": 15.44, "learning_rate": 0.0013825799338478503, "loss": 6.2186, "step": 70000 }, { "epoch": 15.55, "learning_rate": 0.0013781697905181918, "loss": 6.2133, "step": 70500 }, { "epoch": 15.66, "learning_rate": 0.0013737596471885337, "loss": 6.2115, "step": 71000 }, { "epoch": 15.77, "learning_rate": 0.0013693495038588754, "loss": 6.2185, "step": 71500 }, { "epoch": 15.88, "learning_rate": 0.0013649393605292173, "loss": 6.212, "step": 72000 }, { "epoch": 15.99, "learning_rate": 0.001360529217199559, "loss": 6.2089, "step": 72500 }, { "epoch": 16.0, "eval_loss": 6.242463111877441, "eval_runtime": 3.2691, "eval_samples_per_second": 89.016, "eval_steps_per_second": 1.529, "step": 72560 }, { "epoch": 16.1, "learning_rate": 0.0013561190738699007, "loss": 6.2139, "step": 73000 }, { "epoch": 16.21, "learning_rate": 0.0013517089305402426, "loss": 6.2089, "step": 73500 }, { "epoch": 16.32, "learning_rate": 0.0013472987872105845, "loss": 6.2019, "step": 74000 }, { "epoch": 16.43, "learning_rate": 0.0013428886438809262, "loss": 6.1961, "step": 74500 }, { "epoch": 16.54, "learning_rate": 0.001338478500551268, "loss": 6.2156, "step": 75000 }, { "epoch": 16.65, "learning_rate": 0.0013340683572216096, "loss": 6.2056, "step": 75500 }, { "epoch": 16.76, "learning_rate": 0.0013296582138919515, "loss": 6.2078, "step": 76000 }, { "epoch": 16.87, "learning_rate": 0.0013252480705622934, "loss": 6.2113, "step": 76500 }, { "epoch": 16.98, "learning_rate": 0.0013208379272326351, "loss": 6.2133, "step": 77000 }, { "epoch": 17.0, "eval_loss": 6.236937046051025, "eval_runtime": 3.2084, "eval_samples_per_second": 90.7, "eval_steps_per_second": 1.558, "step": 77095 }, { "epoch": 17.09, "learning_rate": 0.0013164277839029768, "loss": 6.2033, "step": 77500 }, { "epoch": 17.2, "learning_rate": 0.0013120176405733185, "loss": 6.2032, "step": 78000 }, { "epoch": 17.31, "learning_rate": 0.0013076074972436604, "loss": 6.2016, "step": 78500 }, { "epoch": 17.42, "learning_rate": 0.0013031973539140024, "loss": 6.2063, "step": 79000 }, { "epoch": 17.53, "learning_rate": 0.001298787210584344, "loss": 6.2016, "step": 79500 }, { "epoch": 17.64, "learning_rate": 0.0012943770672546857, "loss": 6.1975, "step": 80000 }, { "epoch": 17.75, "learning_rate": 0.0012899669239250277, "loss": 6.1994, "step": 80500 }, { "epoch": 17.86, "learning_rate": 0.0012855567805953694, "loss": 6.1992, "step": 81000 }, { "epoch": 17.97, "learning_rate": 0.0012811466372657113, "loss": 6.1978, "step": 81500 }, { "epoch": 18.0, "eval_loss": 6.230894088745117, "eval_runtime": 3.1986, "eval_samples_per_second": 90.977, "eval_steps_per_second": 1.563, "step": 81630 }, { "epoch": 18.08, "learning_rate": 0.0012767364939360528, "loss": 6.1995, "step": 82000 }, { "epoch": 18.19, "learning_rate": 0.0012723263506063947, "loss": 6.2002, "step": 82500 }, { "epoch": 18.3, "learning_rate": 0.0012679162072767366, "loss": 6.1985, "step": 83000 }, { "epoch": 18.41, "learning_rate": 0.0012635060639470783, "loss": 6.1986, "step": 83500 }, { "epoch": 18.52, "learning_rate": 0.0012590959206174202, "loss": 6.1846, "step": 84000 }, { "epoch": 18.63, "learning_rate": 0.0012546857772877619, "loss": 6.1968, "step": 84500 }, { "epoch": 18.74, "learning_rate": 0.0012502756339581036, "loss": 6.1935, "step": 85000 }, { "epoch": 18.85, "learning_rate": 0.0012458654906284455, "loss": 6.1969, "step": 85500 }, { "epoch": 18.96, "learning_rate": 0.0012414553472987872, "loss": 6.1936, "step": 86000 }, { "epoch": 19.0, "eval_loss": 6.223233699798584, "eval_runtime": 3.2991, "eval_samples_per_second": 88.205, "eval_steps_per_second": 1.516, "step": 86165 }, { "epoch": 19.07, "learning_rate": 0.0012370452039691291, "loss": 6.1966, "step": 86500 }, { "epoch": 19.18, "learning_rate": 0.0012326350606394708, "loss": 6.1873, "step": 87000 }, { "epoch": 19.29, "learning_rate": 0.0012282249173098125, "loss": 6.1899, "step": 87500 }, { "epoch": 19.4, "learning_rate": 0.0012238147739801544, "loss": 6.1917, "step": 88000 }, { "epoch": 19.51, "learning_rate": 0.0012194046306504961, "loss": 6.1894, "step": 88500 }, { "epoch": 19.63, "learning_rate": 0.001214994487320838, "loss": 6.196, "step": 89000 }, { "epoch": 19.74, "learning_rate": 0.0012105843439911797, "loss": 6.186, "step": 89500 }, { "epoch": 19.85, "learning_rate": 0.0012061742006615214, "loss": 6.1871, "step": 90000 }, { "epoch": 19.96, "learning_rate": 0.0012017640573318633, "loss": 6.1913, "step": 90500 }, { "epoch": 20.0, "eval_loss": 6.2211713790893555, "eval_runtime": 3.2092, "eval_samples_per_second": 90.677, "eval_steps_per_second": 1.558, "step": 90700 }, { "epoch": 20.07, "learning_rate": 0.0011973539140022053, "loss": 6.1844, "step": 91000 }, { "epoch": 20.18, "learning_rate": 0.0011929437706725467, "loss": 6.1876, "step": 91500 }, { "epoch": 20.29, "learning_rate": 0.0011885336273428887, "loss": 6.1959, "step": 92000 }, { "epoch": 20.4, "learning_rate": 0.0011841234840132304, "loss": 6.191, "step": 92500 }, { "epoch": 20.51, "learning_rate": 0.0011797133406835723, "loss": 6.1884, "step": 93000 }, { "epoch": 20.62, "learning_rate": 0.0011753031973539142, "loss": 6.1791, "step": 93500 }, { "epoch": 20.73, "learning_rate": 0.0011708930540242557, "loss": 6.188, "step": 94000 }, { "epoch": 20.84, "learning_rate": 0.0011664829106945976, "loss": 6.1839, "step": 94500 }, { "epoch": 20.95, "learning_rate": 0.0011620727673649395, "loss": 6.18, "step": 95000 }, { "epoch": 21.0, "eval_loss": 6.215102195739746, "eval_runtime": 3.1993, "eval_samples_per_second": 90.959, "eval_steps_per_second": 1.563, "step": 95235 }, { "epoch": 21.06, "learning_rate": 0.0011576626240352812, "loss": 6.1761, "step": 95500 }, { "epoch": 21.17, "learning_rate": 0.001153252480705623, "loss": 6.1808, "step": 96000 }, { "epoch": 21.28, "learning_rate": 0.0011488423373759646, "loss": 6.1794, "step": 96500 }, { "epoch": 21.39, "learning_rate": 0.0011444321940463065, "loss": 6.1841, "step": 97000 }, { "epoch": 21.5, "learning_rate": 0.0011400220507166484, "loss": 6.183, "step": 97500 }, { "epoch": 21.61, "learning_rate": 0.0011356119073869901, "loss": 6.1844, "step": 98000 }, { "epoch": 21.72, "learning_rate": 0.001131201764057332, "loss": 6.1853, "step": 98500 }, { "epoch": 21.83, "learning_rate": 0.0011267916207276735, "loss": 6.1764, "step": 99000 }, { "epoch": 21.94, "learning_rate": 0.0011223814773980154, "loss": 6.1855, "step": 99500 }, { "epoch": 22.0, "eval_loss": 6.212313175201416, "eval_runtime": 3.1975, "eval_samples_per_second": 91.009, "eval_steps_per_second": 1.564, "step": 99770 }, { "epoch": 22.05, "learning_rate": 0.0011179713340683573, "loss": 6.1747, "step": 100000 }, { "epoch": 22.16, "learning_rate": 0.001113561190738699, "loss": 6.1786, "step": 100500 }, { "epoch": 22.27, "learning_rate": 0.0011091510474090407, "loss": 6.1784, "step": 101000 }, { "epoch": 22.38, "learning_rate": 0.0011047409040793826, "loss": 6.1798, "step": 101500 }, { "epoch": 22.49, "learning_rate": 0.0011003307607497243, "loss": 6.1793, "step": 102000 }, { "epoch": 22.6, "learning_rate": 0.0010959206174200663, "loss": 6.1775, "step": 102500 }, { "epoch": 22.71, "learning_rate": 0.001091510474090408, "loss": 6.1778, "step": 103000 }, { "epoch": 22.82, "learning_rate": 0.0010871003307607497, "loss": 6.1798, "step": 103500 }, { "epoch": 22.93, "learning_rate": 0.0010826901874310916, "loss": 6.1758, "step": 104000 }, { "epoch": 23.0, "eval_loss": 6.209214687347412, "eval_runtime": 3.2078, "eval_samples_per_second": 90.718, "eval_steps_per_second": 1.559, "step": 104305 }, { "epoch": 23.04, "learning_rate": 0.0010782800441014333, "loss": 6.1856, "step": 104500 }, { "epoch": 23.15, "learning_rate": 0.0010738699007717752, "loss": 6.1849, "step": 105000 }, { "epoch": 23.26, "learning_rate": 0.001069459757442117, "loss": 6.179, "step": 105500 }, { "epoch": 23.37, "learning_rate": 0.0010650496141124586, "loss": 6.1804, "step": 106000 }, { "epoch": 23.48, "learning_rate": 0.0010606394707828005, "loss": 6.1782, "step": 106500 }, { "epoch": 23.59, "learning_rate": 0.0010562293274531422, "loss": 6.1745, "step": 107000 }, { "epoch": 23.7, "learning_rate": 0.001051819184123484, "loss": 6.183, "step": 107500 }, { "epoch": 23.81, "learning_rate": 0.001047409040793826, "loss": 6.1828, "step": 108000 }, { "epoch": 23.93, "learning_rate": 0.0010429988974641675, "loss": 6.18, "step": 108500 }, { "epoch": 24.0, "eval_loss": 6.213593006134033, "eval_runtime": 3.2056, "eval_samples_per_second": 90.78, "eval_steps_per_second": 1.56, "step": 108840 } ], "logging_steps": 500, "max_steps": 226750, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 1.23831193938535e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }