{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.572701541042038,
"eval_steps": 100,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"grad_norm": 0.49666959047317505,
"learning_rate": 0.0001,
"loss": 3.1694,
"step": 100
},
{
"epoch": 0.03,
"eval_loss": 2.1809041500091553,
"eval_runtime": 688.5742,
"eval_samples_per_second": 49.254,
"eval_steps_per_second": 1.539,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 0.5750323534011841,
"learning_rate": 9.655172413793105e-05,
"loss": 2.0372,
"step": 200
},
{
"epoch": 0.06,
"eval_loss": 1.9693516492843628,
"eval_runtime": 688.4711,
"eval_samples_per_second": 49.261,
"eval_steps_per_second": 1.54,
"step": 200
},
{
"epoch": 0.09,
"grad_norm": 0.6069626808166504,
"learning_rate": 9.310344827586207e-05,
"loss": 1.9397,
"step": 300
},
{
"epoch": 0.09,
"eval_loss": 1.914998173713684,
"eval_runtime": 688.3078,
"eval_samples_per_second": 49.273,
"eval_steps_per_second": 1.54,
"step": 300
},
{
"epoch": 0.13,
"grad_norm": 0.6587632894515991,
"learning_rate": 8.96551724137931e-05,
"loss": 1.9029,
"step": 400
},
{
"epoch": 0.13,
"eval_loss": 1.887677550315857,
"eval_runtime": 688.3053,
"eval_samples_per_second": 49.273,
"eval_steps_per_second": 1.54,
"step": 400
},
{
"epoch": 0.16,
"grad_norm": 0.6251624226570129,
"learning_rate": 8.620689655172413e-05,
"loss": 1.8749,
"step": 500
},
{
"epoch": 0.16,
"eval_loss": 1.8642584085464478,
"eval_runtime": 688.2945,
"eval_samples_per_second": 49.274,
"eval_steps_per_second": 1.54,
"step": 500
},
{
"epoch": 0.19,
"grad_norm": 0.6488652229309082,
"learning_rate": 8.275862068965517e-05,
"loss": 1.8444,
"step": 600
},
{
"epoch": 0.19,
"eval_loss": 1.8471214771270752,
"eval_runtime": 688.3193,
"eval_samples_per_second": 49.272,
"eval_steps_per_second": 1.54,
"step": 600
},
{
"epoch": 0.22,
"grad_norm": 0.5931334495544434,
"learning_rate": 7.931034482758621e-05,
"loss": 1.8316,
"step": 700
},
{
"epoch": 0.22,
"eval_loss": 1.8358324766159058,
"eval_runtime": 688.2994,
"eval_samples_per_second": 49.274,
"eval_steps_per_second": 1.54,
"step": 700
},
{
"epoch": 0.25,
"grad_norm": 0.5620170831680298,
"learning_rate": 7.586206896551724e-05,
"loss": 1.8312,
"step": 800
},
{
"epoch": 0.25,
"eval_loss": 1.824582815170288,
"eval_runtime": 688.296,
"eval_samples_per_second": 49.274,
"eval_steps_per_second": 1.54,
"step": 800
},
{
"epoch": 0.28,
"grad_norm": 0.5843664407730103,
"learning_rate": 7.241379310344828e-05,
"loss": 1.8203,
"step": 900
},
{
"epoch": 0.28,
"eval_loss": 1.8139162063598633,
"eval_runtime": 688.4311,
"eval_samples_per_second": 49.264,
"eval_steps_per_second": 1.54,
"step": 900
},
{
"epoch": 0.31,
"grad_norm": 0.6241691708564758,
"learning_rate": 6.896551724137931e-05,
"loss": 1.8057,
"step": 1000
},
{
"epoch": 0.31,
"eval_loss": 1.8063077926635742,
"eval_runtime": 688.4241,
"eval_samples_per_second": 49.265,
"eval_steps_per_second": 1.54,
"step": 1000
},
{
"epoch": 0.35,
"grad_norm": 0.639207661151886,
"learning_rate": 6.551724137931034e-05,
"loss": 1.7986,
"step": 1100
},
{
"epoch": 0.35,
"eval_loss": 1.7982680797576904,
"eval_runtime": 688.2656,
"eval_samples_per_second": 49.276,
"eval_steps_per_second": 1.54,
"step": 1100
},
{
"epoch": 0.38,
"grad_norm": 0.6417416930198669,
"learning_rate": 6.206896551724138e-05,
"loss": 1.7969,
"step": 1200
},
{
"epoch": 0.38,
"eval_loss": 1.7924365997314453,
"eval_runtime": 688.2797,
"eval_samples_per_second": 49.275,
"eval_steps_per_second": 1.54,
"step": 1200
},
{
"epoch": 0.41,
"grad_norm": 0.6295751333236694,
"learning_rate": 5.862068965517241e-05,
"loss": 1.779,
"step": 1300
},
{
"epoch": 0.41,
"eval_loss": 1.785871148109436,
"eval_runtime": 688.4975,
"eval_samples_per_second": 49.259,
"eval_steps_per_second": 1.54,
"step": 1300
},
{
"epoch": 0.44,
"grad_norm": 0.6242513656616211,
"learning_rate": 5.517241379310345e-05,
"loss": 1.7743,
"step": 1400
},
{
"epoch": 0.44,
"eval_loss": 1.780082106590271,
"eval_runtime": 688.2673,
"eval_samples_per_second": 49.276,
"eval_steps_per_second": 1.54,
"step": 1400
},
{
"epoch": 0.47,
"grad_norm": 0.6389493942260742,
"learning_rate": 5.172413793103449e-05,
"loss": 1.769,
"step": 1500
},
{
"epoch": 0.47,
"eval_loss": 1.7744983434677124,
"eval_runtime": 687.8653,
"eval_samples_per_second": 49.305,
"eval_steps_per_second": 1.541,
"step": 1500
},
{
"epoch": 0.5,
"grad_norm": 0.669941246509552,
"learning_rate": 4.827586206896552e-05,
"loss": 1.7734,
"step": 1600
},
{
"epoch": 0.5,
"eval_loss": 1.7702207565307617,
"eval_runtime": 688.2155,
"eval_samples_per_second": 49.28,
"eval_steps_per_second": 1.54,
"step": 1600
},
{
"epoch": 0.53,
"grad_norm": 0.6194856762886047,
"learning_rate": 4.482758620689655e-05,
"loss": 1.7666,
"step": 1700
},
{
"epoch": 0.53,
"eval_loss": 1.7659046649932861,
"eval_runtime": 688.2457,
"eval_samples_per_second": 49.277,
"eval_steps_per_second": 1.54,
"step": 1700
},
{
"epoch": 0.57,
"grad_norm": 0.6265833377838135,
"learning_rate": 4.1379310344827587e-05,
"loss": 1.7578,
"step": 1800
},
{
"epoch": 0.57,
"eval_loss": 1.7620124816894531,
"eval_runtime": 688.4272,
"eval_samples_per_second": 49.264,
"eval_steps_per_second": 1.54,
"step": 1800
},
{
"epoch": 0.6,
"grad_norm": 0.6443737745285034,
"learning_rate": 3.793103448275862e-05,
"loss": 1.7321,
"step": 1900
},
{
"epoch": 0.6,
"eval_loss": 1.7327184677124023,
"eval_runtime": 679.9956,
"eval_samples_per_second": 49.875,
"eval_steps_per_second": 1.559,
"step": 1900
},
{
"epoch": 0.63,
"grad_norm": 0.6571601033210754,
"learning_rate": 3.4482758620689657e-05,
"loss": 1.7336,
"step": 2000
},
{
"epoch": 0.63,
"eval_loss": 1.730404019355774,
"eval_runtime": 679.76,
"eval_samples_per_second": 49.893,
"eval_steps_per_second": 1.559,
"step": 2000
},
{
"epoch": 0.66,
"grad_norm": 0.651569128036499,
"learning_rate": 3.103448275862069e-05,
"loss": 1.7216,
"step": 2100
},
{
"epoch": 0.66,
"eval_loss": 1.7281814813613892,
"eval_runtime": 679.7835,
"eval_samples_per_second": 49.891,
"eval_steps_per_second": 1.559,
"step": 2100
},
{
"epoch": 0.69,
"grad_norm": 0.6475515365600586,
"learning_rate": 2.7586206896551727e-05,
"loss": 1.7263,
"step": 2200
},
{
"epoch": 0.69,
"eval_loss": 1.726309061050415,
"eval_runtime": 679.7496,
"eval_samples_per_second": 49.893,
"eval_steps_per_second": 1.559,
"step": 2200
},
{
"epoch": 0.72,
"grad_norm": 0.6775307059288025,
"learning_rate": 2.413793103448276e-05,
"loss": 1.728,
"step": 2300
},
{
"epoch": 0.72,
"eval_loss": 1.7238056659698486,
"eval_runtime": 679.9872,
"eval_samples_per_second": 49.876,
"eval_steps_per_second": 1.559,
"step": 2300
},
{
"epoch": 0.75,
"grad_norm": 0.7065313458442688,
"learning_rate": 2.0689655172413793e-05,
"loss": 1.7222,
"step": 2400
},
{
"epoch": 0.75,
"eval_loss": 1.7217180728912354,
"eval_runtime": 679.9742,
"eval_samples_per_second": 49.877,
"eval_steps_per_second": 1.559,
"step": 2400
},
{
"epoch": 0.79,
"grad_norm": 0.6938297748565674,
"learning_rate": 1.7241379310344828e-05,
"loss": 1.7249,
"step": 2500
},
{
"epoch": 0.79,
"eval_loss": 1.719938039779663,
"eval_runtime": 679.7345,
"eval_samples_per_second": 49.894,
"eval_steps_per_second": 1.559,
"step": 2500
},
{
"epoch": 0.82,
"grad_norm": 0.6739135980606079,
"learning_rate": 1.3793103448275863e-05,
"loss": 1.7116,
"step": 2600
},
{
"epoch": 0.82,
"eval_loss": 1.7181174755096436,
"eval_runtime": 679.9623,
"eval_samples_per_second": 49.878,
"eval_steps_per_second": 1.559,
"step": 2600
},
{
"epoch": 0.85,
"grad_norm": 0.6775588989257812,
"learning_rate": 1.0344827586206897e-05,
"loss": 1.7146,
"step": 2700
},
{
"epoch": 0.85,
"eval_loss": 1.7168805599212646,
"eval_runtime": 679.7273,
"eval_samples_per_second": 49.895,
"eval_steps_per_second": 1.559,
"step": 2700
},
{
"epoch": 0.88,
"grad_norm": 0.6750255823135376,
"learning_rate": 6.896551724137932e-06,
"loss": 1.712,
"step": 2800
},
{
"epoch": 0.88,
"eval_loss": 1.715585708618164,
"eval_runtime": 679.7333,
"eval_samples_per_second": 49.895,
"eval_steps_per_second": 1.559,
"step": 2800
},
{
"epoch": 0.91,
"grad_norm": 0.68208247423172,
"learning_rate": 3.448275862068966e-06,
"loss": 1.7184,
"step": 2900
},
{
"epoch": 0.91,
"eval_loss": 1.7148162126541138,
"eval_runtime": 679.6399,
"eval_samples_per_second": 49.901,
"eval_steps_per_second": 1.56,
"step": 2900
},
{
"epoch": 0.94,
"grad_norm": 0.6982720494270325,
"learning_rate": 0.0,
"loss": 1.7137,
"step": 3000
},
{
"epoch": 0.94,
"eval_loss": 1.7144192457199097,
"eval_runtime": 679.8535,
"eval_samples_per_second": 49.886,
"eval_steps_per_second": 1.559,
"step": 3000
},
{
"epoch": 0.97,
"grad_norm": 0.7738541960716248,
"learning_rate": 3.8775510204081634e-05,
"loss": 1.7029,
"step": 3100
},
{
"epoch": 0.97,
"eval_loss": 1.7050038576126099,
"eval_runtime": 687.1983,
"eval_samples_per_second": 49.353,
"eval_steps_per_second": 1.542,
"step": 3100
},
{
"epoch": 1.01,
"grad_norm": 0.6895627379417419,
"learning_rate": 3.673469387755102e-05,
"loss": 1.7094,
"step": 3200
},
{
"epoch": 1.01,
"eval_loss": 1.7042526006698608,
"eval_runtime": 687.1845,
"eval_samples_per_second": 49.354,
"eval_steps_per_second": 1.543,
"step": 3200
},
{
"epoch": 1.04,
"grad_norm": 0.8041057586669922,
"learning_rate": 3.469387755102041e-05,
"loss": 1.7049,
"step": 3300
},
{
"epoch": 1.04,
"eval_loss": 1.7035516500473022,
"eval_runtime": 687.4035,
"eval_samples_per_second": 49.338,
"eval_steps_per_second": 1.542,
"step": 3300
},
{
"epoch": 1.07,
"grad_norm": 0.7259939908981323,
"learning_rate": 3.265306122448979e-05,
"loss": 1.7098,
"step": 3400
},
{
"epoch": 1.07,
"eval_loss": 1.7024834156036377,
"eval_runtime": 687.1647,
"eval_samples_per_second": 49.355,
"eval_steps_per_second": 1.543,
"step": 3400
},
{
"epoch": 1.1,
"grad_norm": 0.7912746667861938,
"learning_rate": 3.061224489795919e-05,
"loss": 1.7015,
"step": 3500
},
{
"epoch": 1.1,
"eval_loss": 1.7005605697631836,
"eval_runtime": 687.1366,
"eval_samples_per_second": 49.357,
"eval_steps_per_second": 1.543,
"step": 3500
},
{
"epoch": 1.13,
"grad_norm": 0.8287527561187744,
"learning_rate": 2.857142857142857e-05,
"loss": 1.6876,
"step": 3600
},
{
"epoch": 1.13,
"eval_loss": 1.6933950185775757,
"eval_runtime": 683.5096,
"eval_samples_per_second": 49.619,
"eval_steps_per_second": 1.551,
"step": 3600
},
{
"epoch": 1.16,
"grad_norm": 0.736217737197876,
"learning_rate": 2.6530612244897963e-05,
"loss": 1.6958,
"step": 3700
},
{
"epoch": 1.16,
"eval_loss": 1.692893624305725,
"eval_runtime": 683.433,
"eval_samples_per_second": 49.624,
"eval_steps_per_second": 1.551,
"step": 3700
},
{
"epoch": 1.2,
"grad_norm": 0.7109358906745911,
"learning_rate": 2.448979591836735e-05,
"loss": 1.6885,
"step": 3800
},
{
"epoch": 1.2,
"eval_loss": 1.6916097402572632,
"eval_runtime": 683.3969,
"eval_samples_per_second": 49.627,
"eval_steps_per_second": 1.551,
"step": 3800
},
{
"epoch": 1.23,
"grad_norm": 0.7234348654747009,
"learning_rate": 2.2448979591836737e-05,
"loss": 1.6934,
"step": 3900
},
{
"epoch": 1.23,
"eval_loss": 1.6902754306793213,
"eval_runtime": 683.1628,
"eval_samples_per_second": 49.644,
"eval_steps_per_second": 1.552,
"step": 3900
},
{
"epoch": 1.26,
"grad_norm": 0.7684239149093628,
"learning_rate": 2.0408163265306123e-05,
"loss": 1.6909,
"step": 4000
},
{
"epoch": 1.26,
"eval_loss": 1.689305067062378,
"eval_runtime": 683.1661,
"eval_samples_per_second": 49.644,
"eval_steps_per_second": 1.552,
"step": 4000
},
{
"epoch": 1.29,
"grad_norm": 0.7669008374214172,
"learning_rate": 1.836734693877551e-05,
"loss": 1.6907,
"step": 4100
},
{
"epoch": 1.29,
"eval_loss": 1.688330888748169,
"eval_runtime": 683.1804,
"eval_samples_per_second": 49.643,
"eval_steps_per_second": 1.552,
"step": 4100
},
{
"epoch": 1.32,
"grad_norm": 0.7422395348548889,
"learning_rate": 1.6326530612244897e-05,
"loss": 1.6912,
"step": 4200
},
{
"epoch": 1.32,
"eval_loss": 1.687252163887024,
"eval_runtime": 683.2129,
"eval_samples_per_second": 49.64,
"eval_steps_per_second": 1.551,
"step": 4200
},
{
"epoch": 1.35,
"grad_norm": 0.7352548837661743,
"learning_rate": 1.4285714285714285e-05,
"loss": 1.6873,
"step": 4300
},
{
"epoch": 1.35,
"eval_loss": 1.6862083673477173,
"eval_runtime": 683.1788,
"eval_samples_per_second": 49.643,
"eval_steps_per_second": 1.552,
"step": 4300
},
{
"epoch": 1.38,
"grad_norm": 0.7130007147789001,
"learning_rate": 1.2244897959183674e-05,
"loss": 1.6858,
"step": 4400
},
{
"epoch": 1.38,
"eval_loss": 1.6853961944580078,
"eval_runtime": 683.1786,
"eval_samples_per_second": 49.643,
"eval_steps_per_second": 1.552,
"step": 4400
},
{
"epoch": 1.42,
"grad_norm": 0.7947734594345093,
"learning_rate": 1.0204081632653061e-05,
"loss": 1.6813,
"step": 4500
},
{
"epoch": 1.42,
"eval_loss": 1.6845451593399048,
"eval_runtime": 683.4171,
"eval_samples_per_second": 49.626,
"eval_steps_per_second": 1.551,
"step": 4500
},
{
"epoch": 1.45,
"grad_norm": 0.7227717041969299,
"learning_rate": 8.163265306122448e-06,
"loss": 1.6867,
"step": 4600
},
{
"epoch": 1.45,
"eval_loss": 1.6836014986038208,
"eval_runtime": 683.3718,
"eval_samples_per_second": 49.629,
"eval_steps_per_second": 1.551,
"step": 4600
},
{
"epoch": 1.48,
"grad_norm": 0.746582567691803,
"learning_rate": 6.122448979591837e-06,
"loss": 1.6882,
"step": 4700
},
{
"epoch": 1.48,
"eval_loss": 1.682924509048462,
"eval_runtime": 683.3662,
"eval_samples_per_second": 49.629,
"eval_steps_per_second": 1.551,
"step": 4700
},
{
"epoch": 1.51,
"grad_norm": 0.7279271483421326,
"learning_rate": 4.081632653061224e-06,
"loss": 1.6872,
"step": 4800
},
{
"epoch": 1.51,
"eval_loss": 1.682388186454773,
"eval_runtime": 683.1514,
"eval_samples_per_second": 49.645,
"eval_steps_per_second": 1.552,
"step": 4800
},
{
"epoch": 1.54,
"grad_norm": 0.7303986549377441,
"learning_rate": 2.040816326530612e-06,
"loss": 1.6898,
"step": 4900
},
{
"epoch": 1.54,
"eval_loss": 1.682073950767517,
"eval_runtime": 683.3608,
"eval_samples_per_second": 49.63,
"eval_steps_per_second": 1.551,
"step": 4900
},
{
"epoch": 1.57,
"grad_norm": 0.763130784034729,
"learning_rate": 0.0,
"loss": 1.6845,
"step": 5000
},
{
"epoch": 1.57,
"eval_loss": 1.6819010972976685,
"eval_runtime": 683.1778,
"eval_samples_per_second": 49.643,
"eval_steps_per_second": 1.552,
"step": 5000
}
],
"logging_steps": 100,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"total_flos": 2.03873794720034e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}