output/trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03076923076923077,
"grad_norm": 1.9069570302963257,
"learning_rate": 1.998832226832327e-05,
"loss": 4.801,
"step": 2
},
{
"epoch": 0.06153846153846154,
"grad_norm": 2.2477622032165527,
"learning_rate": 1.995331634717649e-05,
"loss": 4.3802,
"step": 4
},
{
"epoch": 0.09230769230769231,
"grad_norm": 1.8538563251495361,
"learning_rate": 1.9895063994510512e-05,
"loss": 4.2892,
"step": 6
},
{
"epoch": 0.12307692307692308,
"grad_norm": 2.284632444381714,
"learning_rate": 1.9813701261394136e-05,
"loss": 4.2137,
"step": 8
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.513190746307373,
"learning_rate": 1.9709418174260523e-05,
"loss": 3.3077,
"step": 10
},
{
"epoch": 0.18461538461538463,
"grad_norm": 1.1242982149124146,
"learning_rate": 1.9582458291091664e-05,
"loss": 4.0115,
"step": 12
},
{
"epoch": 0.2153846153846154,
"grad_norm": 1.4252392053604126,
"learning_rate": 1.9433118132577432e-05,
"loss": 3.8625,
"step": 14
},
{
"epoch": 0.24615384615384617,
"grad_norm": 2.058634042739868,
"learning_rate": 1.9261746489577767e-05,
"loss": 3.7415,
"step": 16
},
{
"epoch": 0.27692307692307694,
"grad_norm": 1.3263213634490967,
"learning_rate": 1.9068743608505454e-05,
"loss": 3.5245,
"step": 18
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.1401660442352295,
"learning_rate": 1.8854560256532098e-05,
"loss": 3.5519,
"step": 20
},
{
"epoch": 0.3384615384615385,
"grad_norm": 4.305651664733887,
"learning_rate": 1.8619696668800494e-05,
"loss": 3.3539,
"step": 22
},
{
"epoch": 0.36923076923076925,
"grad_norm": 1.7320072650909424,
"learning_rate": 1.8364701380102267e-05,
"loss": 3.2424,
"step": 24
},
{
"epoch": 0.4,
"grad_norm": 0.6560227274894714,
"learning_rate": 1.8090169943749477e-05,
"loss": 3.0355,
"step": 26
},
{
"epoch": 0.4307692307692308,
"grad_norm": 1.277785301208496,
"learning_rate": 1.7796743540632226e-05,
"loss": 3.2686,
"step": 28
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.8572943210601807,
"learning_rate": 1.7485107481711014e-05,
"loss": 3.3207,
"step": 30
},
{
"epoch": 0.49230769230769234,
"grad_norm": 1.0174134969711304,
"learning_rate": 1.715598960744121e-05,
"loss": 2.5371,
"step": 32
},
{
"epoch": 0.5230769230769231,
"grad_norm": 0.8519155979156494,
"learning_rate": 1.6810158587867973e-05,
"loss": 2.7066,
"step": 34
},
{
"epoch": 0.5538461538461539,
"grad_norm": 0.9035856127738953,
"learning_rate": 1.6448422127361707e-05,
"loss": 2.7816,
"step": 36
},
{
"epoch": 0.5846153846153846,
"grad_norm": 0.8610332608222961,
"learning_rate": 1.6071625078187113e-05,
"loss": 2.77,
"step": 38
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.9529484510421753,
"learning_rate": 1.568064746731156e-05,
"loss": 2.816,
"step": 40
},
{
"epoch": 0.6461538461538462,
"grad_norm": 0.8644328117370605,
"learning_rate": 1.527640244106133e-05,
"loss": 2.8391,
"step": 42
},
{
"epoch": 0.676923076923077,
"grad_norm": 0.8374277353286743,
"learning_rate": 1.485983413242606e-05,
"loss": 2.7795,
"step": 44
},
{
"epoch": 0.7076923076923077,
"grad_norm": 1.0576844215393066,
"learning_rate": 1.4431915455992416e-05,
"loss": 2.6977,
"step": 46
},
{
"epoch": 0.7384615384615385,
"grad_norm": 0.47086605429649353,
"learning_rate": 1.3993645835656955e-05,
"loss": 2.9187,
"step": 48
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.599407970905304,
"learning_rate": 1.3546048870425356e-05,
"loss": 2.744,
"step": 50
},
{
"epoch": 0.8,
"grad_norm": 0.5703226923942566,
"learning_rate": 1.3090169943749475e-05,
"loss": 2.8795,
"step": 52
},
{
"epoch": 0.8307692307692308,
"grad_norm": 1.1178271770477295,
"learning_rate": 1.262707378198587e-05,
"loss": 2.8823,
"step": 54
},
{
"epoch": 0.8615384615384616,
"grad_norm": 0.5153272747993469,
"learning_rate": 1.2157841967678064e-05,
"loss": 2.7723,
"step": 56
},
{
"epoch": 0.8923076923076924,
"grad_norm": 0.5014742016792297,
"learning_rate": 1.1683570413470384e-05,
"loss": 2.4378,
"step": 58
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.41535821557044983,
"learning_rate": 1.1205366802553231e-05,
"loss": 2.6488,
"step": 60
},
{
"epoch": 0.9538461538461539,
"grad_norm": 0.339928537607193,
"learning_rate": 1.0724348001617626e-05,
"loss": 2.5376,
"step": 62
},
{
"epoch": 0.9846153846153847,
"grad_norm": 0.34572508931159973,
"learning_rate": 1.0241637452361323e-05,
"loss": 2.8047,
"step": 64
},
{
"epoch": 1.0,
"eval_loss": 2.666728973388672,
"eval_runtime": 59.8407,
"eval_samples_per_second": 0.267,
"eval_steps_per_second": 0.267,
"step": 65
},
{
"epoch": 1.0153846153846153,
"grad_norm": 0.3885950446128845,
"learning_rate": 9.75836254763868e-06,
"loss": 2.7142,
"step": 66
},
{
"epoch": 1.0461538461538462,
"grad_norm": 0.7309185266494751,
"learning_rate": 9.275651998382377e-06,
"loss": 2.0571,
"step": 68
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.5309126973152161,
"learning_rate": 8.79463319744677e-06,
"loss": 2.0639,
"step": 70
},
{
"epoch": 1.1076923076923078,
"grad_norm": 0.5351013541221619,
"learning_rate": 8.316429586529616e-06,
"loss": 2.7287,
"step": 72
},
{
"epoch": 1.1384615384615384,
"grad_norm": 0.348666787147522,
"learning_rate": 7.84215803232194e-06,
"loss": 2.6614,
"step": 74
},
{
"epoch": 1.1692307692307693,
"grad_norm": 0.3705930709838867,
"learning_rate": 7.372926218014131e-06,
"loss": 2.6831,
"step": 76
},
{
"epoch": 1.2,
"grad_norm": 0.2703797221183777,
"learning_rate": 6.909830056250527e-06,
"loss": 2.6375,
"step": 78
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.35063040256500244,
"learning_rate": 6.453951129574644e-06,
"loss": 2.472,
"step": 80
},
{
"epoch": 1.2615384615384615,
"grad_norm": 1.5272430181503296,
"learning_rate": 6.006354164343047e-06,
"loss": 2.5923,
"step": 82
},
{
"epoch": 1.2923076923076924,
"grad_norm": 0.27407440543174744,
"learning_rate": 5.5680845440075885e-06,
"loss": 2.3395,
"step": 84
},
{
"epoch": 1.323076923076923,
"grad_norm": 0.29066547751426697,
"learning_rate": 5.14016586757394e-06,
"loss": 2.4793,
"step": 86
},
{
"epoch": 1.353846153846154,
"grad_norm": 0.27350541949272156,
"learning_rate": 4.7235975589386715e-06,
"loss": 2.5946,
"step": 88
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.30824142694473267,
"learning_rate": 4.319352532688444e-06,
"loss": 2.5754,
"step": 90
},
{
"epoch": 1.4153846153846155,
"grad_norm": 0.35746267437934875,
"learning_rate": 3.9283749218128885e-06,
"loss": 2.5823,
"step": 92
},
{
"epoch": 1.4461538461538461,
"grad_norm": 0.351965993642807,
"learning_rate": 3.5515778726382967e-06,
"loss": 2.5187,
"step": 94
},
{
"epoch": 1.476923076923077,
"grad_norm": 0.3084265887737274,
"learning_rate": 3.1898414121320277e-06,
"loss": 2.4335,
"step": 96
},
{
"epoch": 1.5076923076923077,
"grad_norm": 0.31473106145858765,
"learning_rate": 2.8440103925587904e-06,
"loss": 2.5966,
"step": 98
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.2767007648944855,
"learning_rate": 2.514892518288988e-06,
"loss": 2.4298,
"step": 100
},
{
"epoch": 1.5692307692307692,
"grad_norm": 0.2785211205482483,
"learning_rate": 2.2032564593677773e-06,
"loss": 2.5009,
"step": 102
},
{
"epoch": 1.6,
"grad_norm": 0.3996152877807617,
"learning_rate": 1.9098300562505266e-06,
"loss": 2.6447,
"step": 104
},
{
"epoch": 1.6307692307692307,
"grad_norm": 0.2684395909309387,
"learning_rate": 1.6352986198977327e-06,
"loss": 2.75,
"step": 106
},
{
"epoch": 1.6615384615384614,
"grad_norm": 0.29984214901924133,
"learning_rate": 1.3803033311995072e-06,
"loss": 2.5326,
"step": 108
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.30965209007263184,
"learning_rate": 1.1454397434679022e-06,
"loss": 2.6393,
"step": 110
},
{
"epoch": 1.7230769230769232,
"grad_norm": 0.23374854028224945,
"learning_rate": 9.312563914945461e-07,
"loss": 2.7773,
"step": 112
},
{
"epoch": 1.7538461538461538,
"grad_norm": 0.36213433742523193,
"learning_rate": 7.382535104222366e-07,
"loss": 2.4598,
"step": 114
},
{
"epoch": 1.7846153846153845,
"grad_norm": 0.2818911373615265,
"learning_rate": 5.668818674225684e-07,
"loss": 2.7295,
"step": 116
},
{
"epoch": 1.8153846153846154,
"grad_norm": 0.27589595317840576,
"learning_rate": 4.1754170890833777e-07,
"loss": 2.4782,
"step": 118
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.5232753753662109,
"learning_rate": 2.905818257394799e-07,
"loss": 2.4467,
"step": 120
},
{
"epoch": 1.876923076923077,
"grad_norm": 0.2822287678718567,
"learning_rate": 1.8629873860586567e-07,
"loss": 2.6134,
"step": 122
},
{
"epoch": 1.9076923076923076,
"grad_norm": 0.3719305396080017,
"learning_rate": 1.0493600548948879e-07,
"loss": 2.5161,
"step": 124
},
{
"epoch": 1.9384615384615385,
"grad_norm": 0.2847210466861725,
"learning_rate": 4.6683652823513725e-08,
"loss": 2.6886,
"step": 126
},
{
"epoch": 1.9692307692307693,
"grad_norm": 0.3320281207561493,
"learning_rate": 1.1677731676733584e-08,
"loss": 2.5856,
"step": 128
},
{
"epoch": 2.0,
"grad_norm": 0.3256252408027649,
"learning_rate": 0.0,
"loss": 2.5324,
"step": 130
},
{
"epoch": 2.0,
"eval_loss": 2.6215591430664062,
"eval_runtime": 59.7931,
"eval_samples_per_second": 0.268,
"eval_steps_per_second": 0.268,
"step": 130
},
{
"epoch": 2.0,
"step": 130,
"total_flos": 4.830307521921024e+16,
"train_loss": 2.869425494854267,
"train_runtime": 1444.917,
"train_samples_per_second": 0.09,
"train_steps_per_second": 0.09
}
],
"logging_steps": 2,
"max_steps": 130,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 4.830307521921024e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
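
For reference, this file follows the standard Hugging Face Trainer state layout: "log_history" holds one entry per logging step (keys "loss", "learning_rate", "grad_norm", "step") plus the end-of-epoch evaluation records (key "eval_loss") and a final training summary. Below is a minimal sketch, not part of the original file, for reading the loss curve back out of it; it assumes the JSON is available on disk at the relative path shown above and uses only the standard library.

import json

# Load the trainer state written by transformers.Trainer at the end of training.
with open("output/trainer_state.json") as f:
    state = json.load(f)

# Training-step logs carry a "loss" key; per-epoch evaluation logs carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

for entry in train_logs:
    print(f'step {entry["step"]:>3}  lr {entry["learning_rate"]:.2e}  loss {entry["loss"]:.4f}')

for entry in eval_logs:
    print(f'epoch {entry["epoch"]:.1f}  eval_loss {entry["eval_loss"]:.4f}')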