|
{ |
|
"best_metric": 6.209214687347412, |
|
"best_model_checkpoint": "./results/models/checkpoint-104305", |
|
"epoch": 24.0, |
|
"eval_steps": 500, |
|
"global_step": 108840, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0019955898566703417, |
|
"loss": 6.7688, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.001991179713340684, |
|
"loss": 6.6804, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.001986769570011025, |
|
"loss": 6.6645, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.0019823594266813673, |
|
"loss": 6.6495, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.001977949283351709, |
|
"loss": 6.6246, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 0.0019735391400220507, |
|
"loss": 6.5823, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 0.001969128996692393, |
|
"loss": 6.5645, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 0.001964718853362734, |
|
"loss": 6.5409, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 0.001960308710033076, |
|
"loss": 6.5185, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 6.5362162590026855, |
|
"eval_runtime": 3.2086, |
|
"eval_samples_per_second": 90.695, |
|
"eval_steps_per_second": 1.558, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"learning_rate": 0.001955898566703418, |
|
"loss": 6.501, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"learning_rate": 0.0019514884233737598, |
|
"loss": 6.5008, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"learning_rate": 0.0019470782800441013, |
|
"loss": 6.5114, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 0.0019426681367144432, |
|
"loss": 6.4978, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 0.0019382579933847851, |
|
"loss": 6.5002, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"learning_rate": 0.0019338478500551268, |
|
"loss": 6.4791, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"learning_rate": 0.0019294377067254687, |
|
"loss": 6.4779, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 0.0019250275633958104, |
|
"loss": 6.4694, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"learning_rate": 0.0019206174200661521, |
|
"loss": 6.466, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 6.482348918914795, |
|
"eval_runtime": 3.1798, |
|
"eval_samples_per_second": 91.514, |
|
"eval_steps_per_second": 1.572, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 0.001916207276736494, |
|
"loss": 6.4515, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"learning_rate": 0.0019117971334068357, |
|
"loss": 6.4551, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"learning_rate": 0.0019073869900771776, |
|
"loss": 6.444, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"learning_rate": 0.0019029768467475193, |
|
"loss": 6.4236, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"learning_rate": 0.001898566703417861, |
|
"loss": 6.4277, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"learning_rate": 0.001894156560088203, |
|
"loss": 6.4185, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"learning_rate": 0.0018897464167585446, |
|
"loss": 6.4164, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"learning_rate": 0.0018853362734288866, |
|
"loss": 6.4177, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"learning_rate": 0.0018809261300992283, |
|
"loss": 6.4135, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 6.432834625244141, |
|
"eval_runtime": 3.1903, |
|
"eval_samples_per_second": 91.214, |
|
"eval_steps_per_second": 1.567, |
|
"step": 13605 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"learning_rate": 0.00187651598676957, |
|
"loss": 6.4072, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.0018721058434399119, |
|
"loss": 6.3986, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"learning_rate": 0.0018676957001102538, |
|
"loss": 6.3877, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"learning_rate": 0.0018632855567805953, |
|
"loss": 6.3881, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"learning_rate": 0.0018588754134509372, |
|
"loss": 6.3853, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 0.0018544652701212789, |
|
"loss": 6.3969, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"learning_rate": 0.0018500551267916208, |
|
"loss": 6.3868, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"learning_rate": 0.0018456449834619627, |
|
"loss": 6.3914, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"learning_rate": 0.0018412348401323042, |
|
"loss": 6.3761, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 6.406026840209961, |
|
"eval_runtime": 3.1898, |
|
"eval_samples_per_second": 91.228, |
|
"eval_steps_per_second": 1.567, |
|
"step": 18140 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"learning_rate": 0.001836824696802646, |
|
"loss": 6.3767, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"learning_rate": 0.001832414553472988, |
|
"loss": 6.3771, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 0.0018280044101433297, |
|
"loss": 6.3629, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 0.0018235942668136716, |
|
"loss": 6.3658, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"learning_rate": 0.0018191841234840131, |
|
"loss": 6.3683, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"learning_rate": 0.001814773980154355, |
|
"loss": 6.3679, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"learning_rate": 0.001810363836824697, |
|
"loss": 6.3651, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"learning_rate": 0.0018059536934950386, |
|
"loss": 6.3492, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"learning_rate": 0.0018015435501653806, |
|
"loss": 6.3456, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 6.364647388458252, |
|
"eval_runtime": 3.181, |
|
"eval_samples_per_second": 91.481, |
|
"eval_steps_per_second": 1.572, |
|
"step": 22675 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"learning_rate": 0.001797133406835722, |
|
"loss": 6.3347, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"learning_rate": 0.001792723263506064, |
|
"loss": 6.3421, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"learning_rate": 0.0017883131201764059, |
|
"loss": 6.3458, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"learning_rate": 0.0017839029768467476, |
|
"loss": 6.34, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"learning_rate": 0.0017794928335170893, |
|
"loss": 6.3494, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"learning_rate": 0.0017750826901874312, |
|
"loss": 6.3306, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"learning_rate": 0.0017706725468577729, |
|
"loss": 6.3336, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"learning_rate": 0.0017662624035281148, |
|
"loss": 6.3347, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 5.95, |
|
"learning_rate": 0.0017618522601984565, |
|
"loss": 6.3298, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 6.354611396789551, |
|
"eval_runtime": 3.181, |
|
"eval_samples_per_second": 91.481, |
|
"eval_steps_per_second": 1.572, |
|
"step": 27210 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"learning_rate": 0.0017574421168687982, |
|
"loss": 6.3297, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"learning_rate": 0.00175303197353914, |
|
"loss": 6.3288, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"learning_rate": 0.0017486218302094818, |
|
"loss": 6.3304, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 6.39, |
|
"learning_rate": 0.0017442116868798237, |
|
"loss": 6.3151, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"learning_rate": 0.0017398015435501656, |
|
"loss": 6.3186, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"learning_rate": 0.001735391400220507, |
|
"loss": 6.3194, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"learning_rate": 0.001730981256890849, |
|
"loss": 6.3115, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 6.84, |
|
"learning_rate": 0.0017265711135611907, |
|
"loss": 6.3191, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 6.95, |
|
"learning_rate": 0.0017221609702315326, |
|
"loss": 6.3108, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 6.336333274841309, |
|
"eval_runtime": 3.1987, |
|
"eval_samples_per_second": 90.975, |
|
"eval_steps_per_second": 1.563, |
|
"step": 31745 |
|
}, |
|
{ |
|
"epoch": 7.06, |
|
"learning_rate": 0.0017177508269018745, |
|
"loss": 6.3078, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"learning_rate": 0.001713340683572216, |
|
"loss": 6.3046, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"learning_rate": 0.001708930540242558, |
|
"loss": 6.3013, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 7.39, |
|
"learning_rate": 0.0017045203969128996, |
|
"loss": 6.3082, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"learning_rate": 0.0017001102535832415, |
|
"loss": 6.3015, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"learning_rate": 0.0016957001102535832, |
|
"loss": 6.2922, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"learning_rate": 0.001691289966923925, |
|
"loss": 6.2995, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"learning_rate": 0.0016868798235942669, |
|
"loss": 6.302, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"learning_rate": 0.0016824696802646088, |
|
"loss": 6.3015, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 6.320508003234863, |
|
"eval_runtime": 3.1701, |
|
"eval_samples_per_second": 91.795, |
|
"eval_steps_per_second": 1.577, |
|
"step": 36280 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"learning_rate": 0.0016780595369349505, |
|
"loss": 6.2848, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"learning_rate": 0.0016736493936052922, |
|
"loss": 6.297, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 8.27, |
|
"learning_rate": 0.0016692392502756339, |
|
"loss": 6.2874, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 8.38, |
|
"learning_rate": 0.0016648291069459758, |
|
"loss": 6.2883, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"learning_rate": 0.0016604189636163177, |
|
"loss": 6.2775, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"learning_rate": 0.0016560088202866594, |
|
"loss": 6.2791, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"learning_rate": 0.001651598676957001, |
|
"loss": 6.2816, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"learning_rate": 0.0016471885336273428, |
|
"loss": 6.2744, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"learning_rate": 0.0016427783902976847, |
|
"loss": 6.2721, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 6.303074836730957, |
|
"eval_runtime": 3.1711, |
|
"eval_samples_per_second": 91.767, |
|
"eval_steps_per_second": 1.577, |
|
"step": 40815 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"learning_rate": 0.0016383682469680266, |
|
"loss": 6.276, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 9.15, |
|
"learning_rate": 0.0016339581036383683, |
|
"loss": 6.2713, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 9.26, |
|
"learning_rate": 0.00162954796030871, |
|
"loss": 6.2678, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"learning_rate": 0.001625137816979052, |
|
"loss": 6.2672, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"learning_rate": 0.0016207276736493936, |
|
"loss": 6.2596, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 9.59, |
|
"learning_rate": 0.0016163175303197355, |
|
"loss": 6.2705, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"learning_rate": 0.001611907386990077, |
|
"loss": 6.2771, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 9.81, |
|
"learning_rate": 0.001607497243660419, |
|
"loss": 6.2601, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"learning_rate": 0.0016030871003307608, |
|
"loss": 6.2657, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 6.291815280914307, |
|
"eval_runtime": 3.1834, |
|
"eval_samples_per_second": 91.412, |
|
"eval_steps_per_second": 1.571, |
|
"step": 45350 |
|
}, |
|
{ |
|
"epoch": 10.03, |
|
"learning_rate": 0.0015986769570011025, |
|
"loss": 6.264, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 10.14, |
|
"learning_rate": 0.0015942668136714445, |
|
"loss": 6.2611, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"learning_rate": 0.0015898566703417862, |
|
"loss": 6.2654, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 10.36, |
|
"learning_rate": 0.0015854465270121279, |
|
"loss": 6.258, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 10.47, |
|
"learning_rate": 0.0015810363836824698, |
|
"loss": 6.2543, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 10.58, |
|
"learning_rate": 0.0015766262403528115, |
|
"loss": 6.2513, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 10.69, |
|
"learning_rate": 0.0015722160970231534, |
|
"loss": 6.2556, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"learning_rate": 0.001567805953693495, |
|
"loss": 6.2581, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"learning_rate": 0.0015633958103638368, |
|
"loss": 6.2431, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 6.28529691696167, |
|
"eval_runtime": 3.1715, |
|
"eval_samples_per_second": 91.755, |
|
"eval_steps_per_second": 1.577, |
|
"step": 49885 |
|
}, |
|
{ |
|
"epoch": 11.03, |
|
"learning_rate": 0.0015589856670341787, |
|
"loss": 6.2599, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 11.14, |
|
"learning_rate": 0.0015545755237045204, |
|
"loss": 6.2521, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 11.25, |
|
"learning_rate": 0.0015501653803748623, |
|
"loss": 6.2548, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 11.36, |
|
"learning_rate": 0.001545755237045204, |
|
"loss": 6.2448, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 11.47, |
|
"learning_rate": 0.0015413450937155457, |
|
"loss": 6.2482, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 11.58, |
|
"learning_rate": 0.0015369349503858876, |
|
"loss": 6.2521, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 11.69, |
|
"learning_rate": 0.0015325248070562295, |
|
"loss": 6.2511, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 11.8, |
|
"learning_rate": 0.001528114663726571, |
|
"loss": 6.2509, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 11.91, |
|
"learning_rate": 0.001523704520396913, |
|
"loss": 6.2467, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 6.272589206695557, |
|
"eval_runtime": 3.2247, |
|
"eval_samples_per_second": 90.241, |
|
"eval_steps_per_second": 1.551, |
|
"step": 54420 |
|
}, |
|
{ |
|
"epoch": 12.02, |
|
"learning_rate": 0.0015192943770672546, |
|
"loss": 6.2525, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 12.13, |
|
"learning_rate": 0.0015148842337375965, |
|
"loss": 6.236, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 12.24, |
|
"learning_rate": 0.0015104740904079384, |
|
"loss": 6.2487, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 12.35, |
|
"learning_rate": 0.00150606394707828, |
|
"loss": 6.2375, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 12.46, |
|
"learning_rate": 0.0015016538037486218, |
|
"loss": 6.2375, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 12.57, |
|
"learning_rate": 0.0014972436604189638, |
|
"loss": 6.2414, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 12.68, |
|
"learning_rate": 0.0014928335170893055, |
|
"loss": 6.2343, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 12.79, |
|
"learning_rate": 0.0014884233737596474, |
|
"loss": 6.225, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 12.9, |
|
"learning_rate": 0.0014840132304299888, |
|
"loss": 6.2336, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 6.260488033294678, |
|
"eval_runtime": 3.1929, |
|
"eval_samples_per_second": 91.14, |
|
"eval_steps_per_second": 1.566, |
|
"step": 58955 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"learning_rate": 0.0014796030871003308, |
|
"loss": 6.2281, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"learning_rate": 0.0014751929437706727, |
|
"loss": 6.2288, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"learning_rate": 0.0014707828004410144, |
|
"loss": 6.2271, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 13.34, |
|
"learning_rate": 0.0014663726571113563, |
|
"loss": 6.232, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 13.45, |
|
"learning_rate": 0.0014619625137816978, |
|
"loss": 6.2307, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 13.56, |
|
"learning_rate": 0.0014575523704520397, |
|
"loss": 6.2294, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 13.67, |
|
"learning_rate": 0.0014531422271223816, |
|
"loss": 6.2215, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 13.78, |
|
"learning_rate": 0.0014487320837927233, |
|
"loss": 6.2218, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 13.89, |
|
"learning_rate": 0.001444321940463065, |
|
"loss": 6.2279, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 6.25548791885376, |
|
"eval_runtime": 3.2087, |
|
"eval_samples_per_second": 90.691, |
|
"eval_steps_per_second": 1.558, |
|
"step": 63490 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"learning_rate": 0.001439911797133407, |
|
"loss": 6.2282, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 14.11, |
|
"learning_rate": 0.0014355016538037486, |
|
"loss": 6.2224, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 14.22, |
|
"learning_rate": 0.0014310915104740905, |
|
"loss": 6.2242, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 14.33, |
|
"learning_rate": 0.0014266813671444322, |
|
"loss": 6.2338, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 14.44, |
|
"learning_rate": 0.001422271223814774, |
|
"loss": 6.2136, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 14.55, |
|
"learning_rate": 0.0014178610804851158, |
|
"loss": 6.2176, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 14.66, |
|
"learning_rate": 0.0014134509371554575, |
|
"loss": 6.2128, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 14.77, |
|
"learning_rate": 0.0014090407938257994, |
|
"loss": 6.2227, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 14.88, |
|
"learning_rate": 0.0014046306504961414, |
|
"loss": 6.2145, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 14.99, |
|
"learning_rate": 0.0014002205071664828, |
|
"loss": 6.2222, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 6.248010158538818, |
|
"eval_runtime": 3.1977, |
|
"eval_samples_per_second": 91.003, |
|
"eval_steps_per_second": 1.564, |
|
"step": 68025 |
|
}, |
|
{ |
|
"epoch": 15.1, |
|
"learning_rate": 0.0013958103638368248, |
|
"loss": 6.2192, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 15.21, |
|
"learning_rate": 0.0013914002205071664, |
|
"loss": 6.2112, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 15.33, |
|
"learning_rate": 0.0013869900771775084, |
|
"loss": 6.2227, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 15.44, |
|
"learning_rate": 0.0013825799338478503, |
|
"loss": 6.2186, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 15.55, |
|
"learning_rate": 0.0013781697905181918, |
|
"loss": 6.2133, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 15.66, |
|
"learning_rate": 0.0013737596471885337, |
|
"loss": 6.2115, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 15.77, |
|
"learning_rate": 0.0013693495038588754, |
|
"loss": 6.2185, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 15.88, |
|
"learning_rate": 0.0013649393605292173, |
|
"loss": 6.212, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 15.99, |
|
"learning_rate": 0.001360529217199559, |
|
"loss": 6.2089, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 6.242463111877441, |
|
"eval_runtime": 3.2691, |
|
"eval_samples_per_second": 89.016, |
|
"eval_steps_per_second": 1.529, |
|
"step": 72560 |
|
}, |
|
{ |
|
"epoch": 16.1, |
|
"learning_rate": 0.0013561190738699007, |
|
"loss": 6.2139, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 16.21, |
|
"learning_rate": 0.0013517089305402426, |
|
"loss": 6.2089, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 16.32, |
|
"learning_rate": 0.0013472987872105845, |
|
"loss": 6.2019, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 16.43, |
|
"learning_rate": 0.0013428886438809262, |
|
"loss": 6.1961, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 16.54, |
|
"learning_rate": 0.001338478500551268, |
|
"loss": 6.2156, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 16.65, |
|
"learning_rate": 0.0013340683572216096, |
|
"loss": 6.2056, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 16.76, |
|
"learning_rate": 0.0013296582138919515, |
|
"loss": 6.2078, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 16.87, |
|
"learning_rate": 0.0013252480705622934, |
|
"loss": 6.2113, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 16.98, |
|
"learning_rate": 0.0013208379272326351, |
|
"loss": 6.2133, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 6.236937046051025, |
|
"eval_runtime": 3.2084, |
|
"eval_samples_per_second": 90.7, |
|
"eval_steps_per_second": 1.558, |
|
"step": 77095 |
|
}, |
|
{ |
|
"epoch": 17.09, |
|
"learning_rate": 0.0013164277839029768, |
|
"loss": 6.2033, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"learning_rate": 0.0013120176405733185, |
|
"loss": 6.2032, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 17.31, |
|
"learning_rate": 0.0013076074972436604, |
|
"loss": 6.2016, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 17.42, |
|
"learning_rate": 0.0013031973539140024, |
|
"loss": 6.2063, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"learning_rate": 0.001298787210584344, |
|
"loss": 6.2016, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 17.64, |
|
"learning_rate": 0.0012943770672546857, |
|
"loss": 6.1975, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"learning_rate": 0.0012899669239250277, |
|
"loss": 6.1994, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 17.86, |
|
"learning_rate": 0.0012855567805953694, |
|
"loss": 6.1992, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 17.97, |
|
"learning_rate": 0.0012811466372657113, |
|
"loss": 6.1978, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 6.230894088745117, |
|
"eval_runtime": 3.1986, |
|
"eval_samples_per_second": 90.977, |
|
"eval_steps_per_second": 1.563, |
|
"step": 81630 |
|
}, |
|
{ |
|
"epoch": 18.08, |
|
"learning_rate": 0.0012767364939360528, |
|
"loss": 6.1995, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 18.19, |
|
"learning_rate": 0.0012723263506063947, |
|
"loss": 6.2002, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 18.3, |
|
"learning_rate": 0.0012679162072767366, |
|
"loss": 6.1985, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 18.41, |
|
"learning_rate": 0.0012635060639470783, |
|
"loss": 6.1986, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 18.52, |
|
"learning_rate": 0.0012590959206174202, |
|
"loss": 6.1846, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 18.63, |
|
"learning_rate": 0.0012546857772877619, |
|
"loss": 6.1968, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 18.74, |
|
"learning_rate": 0.0012502756339581036, |
|
"loss": 6.1935, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 18.85, |
|
"learning_rate": 0.0012458654906284455, |
|
"loss": 6.1969, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 18.96, |
|
"learning_rate": 0.0012414553472987872, |
|
"loss": 6.1936, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 6.223233699798584, |
|
"eval_runtime": 3.2991, |
|
"eval_samples_per_second": 88.205, |
|
"eval_steps_per_second": 1.516, |
|
"step": 86165 |
|
}, |
|
{ |
|
"epoch": 19.07, |
|
"learning_rate": 0.0012370452039691291, |
|
"loss": 6.1966, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 19.18, |
|
"learning_rate": 0.0012326350606394708, |
|
"loss": 6.1873, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 19.29, |
|
"learning_rate": 0.0012282249173098125, |
|
"loss": 6.1899, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 19.4, |
|
"learning_rate": 0.0012238147739801544, |
|
"loss": 6.1917, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 19.51, |
|
"learning_rate": 0.0012194046306504961, |
|
"loss": 6.1894, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"learning_rate": 0.001214994487320838, |
|
"loss": 6.196, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 19.74, |
|
"learning_rate": 0.0012105843439911797, |
|
"loss": 6.186, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 19.85, |
|
"learning_rate": 0.0012061742006615214, |
|
"loss": 6.1871, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 19.96, |
|
"learning_rate": 0.0012017640573318633, |
|
"loss": 6.1913, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 6.2211713790893555, |
|
"eval_runtime": 3.2092, |
|
"eval_samples_per_second": 90.677, |
|
"eval_steps_per_second": 1.558, |
|
"step": 90700 |
|
}, |
|
{ |
|
"epoch": 20.07, |
|
"learning_rate": 0.0011973539140022053, |
|
"loss": 6.1844, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 20.18, |
|
"learning_rate": 0.0011929437706725467, |
|
"loss": 6.1876, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 20.29, |
|
"learning_rate": 0.0011885336273428887, |
|
"loss": 6.1959, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 20.4, |
|
"learning_rate": 0.0011841234840132304, |
|
"loss": 6.191, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 20.51, |
|
"learning_rate": 0.0011797133406835723, |
|
"loss": 6.1884, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 20.62, |
|
"learning_rate": 0.0011753031973539142, |
|
"loss": 6.1791, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 20.73, |
|
"learning_rate": 0.0011708930540242557, |
|
"loss": 6.188, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 20.84, |
|
"learning_rate": 0.0011664829106945976, |
|
"loss": 6.1839, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 20.95, |
|
"learning_rate": 0.0011620727673649395, |
|
"loss": 6.18, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 6.215102195739746, |
|
"eval_runtime": 3.1993, |
|
"eval_samples_per_second": 90.959, |
|
"eval_steps_per_second": 1.563, |
|
"step": 95235 |
|
}, |
|
{ |
|
"epoch": 21.06, |
|
"learning_rate": 0.0011576626240352812, |
|
"loss": 6.1761, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 21.17, |
|
"learning_rate": 0.001153252480705623, |
|
"loss": 6.1808, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 21.28, |
|
"learning_rate": 0.0011488423373759646, |
|
"loss": 6.1794, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 21.39, |
|
"learning_rate": 0.0011444321940463065, |
|
"loss": 6.1841, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 21.5, |
|
"learning_rate": 0.0011400220507166484, |
|
"loss": 6.183, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 21.61, |
|
"learning_rate": 0.0011356119073869901, |
|
"loss": 6.1844, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 21.72, |
|
"learning_rate": 0.001131201764057332, |
|
"loss": 6.1853, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 21.83, |
|
"learning_rate": 0.0011267916207276735, |
|
"loss": 6.1764, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 21.94, |
|
"learning_rate": 0.0011223814773980154, |
|
"loss": 6.1855, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 6.212313175201416, |
|
"eval_runtime": 3.1975, |
|
"eval_samples_per_second": 91.009, |
|
"eval_steps_per_second": 1.564, |
|
"step": 99770 |
|
}, |
|
{ |
|
"epoch": 22.05, |
|
"learning_rate": 0.0011179713340683573, |
|
"loss": 6.1747, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 22.16, |
|
"learning_rate": 0.001113561190738699, |
|
"loss": 6.1786, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 22.27, |
|
"learning_rate": 0.0011091510474090407, |
|
"loss": 6.1784, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 22.38, |
|
"learning_rate": 0.0011047409040793826, |
|
"loss": 6.1798, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 22.49, |
|
"learning_rate": 0.0011003307607497243, |
|
"loss": 6.1793, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 22.6, |
|
"learning_rate": 0.0010959206174200663, |
|
"loss": 6.1775, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 22.71, |
|
"learning_rate": 0.001091510474090408, |
|
"loss": 6.1778, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 22.82, |
|
"learning_rate": 0.0010871003307607497, |
|
"loss": 6.1798, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 22.93, |
|
"learning_rate": 0.0010826901874310916, |
|
"loss": 6.1758, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 6.209214687347412, |
|
"eval_runtime": 3.2078, |
|
"eval_samples_per_second": 90.718, |
|
"eval_steps_per_second": 1.559, |
|
"step": 104305 |
|
}, |
|
{ |
|
"epoch": 23.04, |
|
"learning_rate": 0.0010782800441014333, |
|
"loss": 6.1856, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 23.15, |
|
"learning_rate": 0.0010738699007717752, |
|
"loss": 6.1849, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 23.26, |
|
"learning_rate": 0.001069459757442117, |
|
"loss": 6.179, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 23.37, |
|
"learning_rate": 0.0010650496141124586, |
|
"loss": 6.1804, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 23.48, |
|
"learning_rate": 0.0010606394707828005, |
|
"loss": 6.1782, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 23.59, |
|
"learning_rate": 0.0010562293274531422, |
|
"loss": 6.1745, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 23.7, |
|
"learning_rate": 0.001051819184123484, |
|
"loss": 6.183, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 23.81, |
|
"learning_rate": 0.001047409040793826, |
|
"loss": 6.1828, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 23.93, |
|
"learning_rate": 0.0010429988974641675, |
|
"loss": 6.18, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 6.213593006134033, |
|
"eval_runtime": 3.2056, |
|
"eval_samples_per_second": 90.78, |
|
"eval_steps_per_second": 1.56, |
|
"step": 108840 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 226750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 1.23831193938535e+19, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|