{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 500,
"global_step": 146,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0547945205479452,
"grad_norm": 1.6586511135101318,
"learning_rate": 1e-06,
"loss": 2.2379,
"step": 1
},
{
"epoch": 0.1095890410958904,
"grad_norm": 1.6110830307006836,
"learning_rate": 1e-06,
"loss": 2.2933,
"step": 2
},
{
"epoch": 0.1643835616438356,
"grad_norm": 1.5261093378067017,
"learning_rate": 1e-06,
"loss": 2.2564,
"step": 3
},
{
"epoch": 0.2191780821917808,
"grad_norm": 1.6366506814956665,
"learning_rate": 1e-06,
"loss": 2.2794,
"step": 4
},
{
"epoch": 0.273972602739726,
"grad_norm": 1.5530800819396973,
"learning_rate": 1e-06,
"loss": 2.2344,
"step": 5
},
{
"epoch": 0.3287671232876712,
"grad_norm": 1.5802958011627197,
"learning_rate": 1e-06,
"loss": 2.2363,
"step": 6
},
{
"epoch": 0.3835616438356164,
"grad_norm": 1.5483659505844116,
"learning_rate": 1e-06,
"loss": 2.256,
"step": 7
},
{
"epoch": 0.4383561643835616,
"grad_norm": 1.5273737907409668,
"learning_rate": 1e-06,
"loss": 2.2733,
"step": 8
},
{
"epoch": 0.4931506849315068,
"grad_norm": 1.534605860710144,
"learning_rate": 1e-06,
"loss": 2.223,
"step": 9
},
{
"epoch": 0.547945205479452,
"grad_norm": 1.5523834228515625,
"learning_rate": 1e-06,
"loss": 2.2755,
"step": 10
},
{
"epoch": 0.6027397260273972,
"grad_norm": 1.5962920188903809,
"learning_rate": 1e-06,
"loss": 2.2875,
"step": 11
},
{
"epoch": 0.6575342465753424,
"grad_norm": 1.5564601421356201,
"learning_rate": 1e-06,
"loss": 2.2716,
"step": 12
},
{
"epoch": 0.7123287671232876,
"grad_norm": 1.5305095911026,
"learning_rate": 1e-06,
"loss": 2.2485,
"step": 13
},
{
"epoch": 0.7671232876712328,
"grad_norm": 1.4675662517547607,
"learning_rate": 1e-06,
"loss": 2.2574,
"step": 14
},
{
"epoch": 0.821917808219178,
"grad_norm": 1.4668537378311157,
"learning_rate": 1e-06,
"loss": 2.2226,
"step": 15
},
{
"epoch": 0.8767123287671232,
"grad_norm": 1.5306854248046875,
"learning_rate": 1e-06,
"loss": 2.2798,
"step": 16
},
{
"epoch": 0.9315068493150684,
"grad_norm": 1.5047531127929688,
"learning_rate": 1e-06,
"loss": 2.2486,
"step": 17
},
{
"epoch": 0.9863013698630136,
"grad_norm": 1.4622173309326172,
"learning_rate": 1e-06,
"loss": 2.217,
"step": 18
},
{
"epoch": 1.0410958904109588,
"grad_norm": 1.5452288389205933,
"learning_rate": 1e-06,
"loss": 2.2271,
"step": 19
},
{
"epoch": 1.095890410958904,
"grad_norm": 1.4995627403259277,
"learning_rate": 1e-06,
"loss": 2.2222,
"step": 20
},
{
"epoch": 1.1506849315068493,
"grad_norm": 1.4030557870864868,
"learning_rate": 1e-06,
"loss": 2.2547,
"step": 21
},
{
"epoch": 1.2054794520547945,
"grad_norm": 1.4066240787506104,
"learning_rate": 1e-06,
"loss": 2.2279,
"step": 22
},
{
"epoch": 1.2602739726027397,
"grad_norm": 1.4491875171661377,
"learning_rate": 1e-06,
"loss": 2.2497,
"step": 23
},
{
"epoch": 1.3150684931506849,
"grad_norm": 1.3880819082260132,
"learning_rate": 1e-06,
"loss": 2.2593,
"step": 24
},
{
"epoch": 1.36986301369863,
"grad_norm": 1.471488118171692,
"learning_rate": 1e-06,
"loss": 2.2496,
"step": 25
},
{
"epoch": 1.4246575342465753,
"grad_norm": 1.388680338859558,
"learning_rate": 1e-06,
"loss": 2.2262,
"step": 26
},
{
"epoch": 1.4794520547945205,
"grad_norm": 1.4523004293441772,
"learning_rate": 1e-06,
"loss": 2.2535,
"step": 27
},
{
"epoch": 1.5342465753424657,
"grad_norm": 1.4338841438293457,
"learning_rate": 1e-06,
"loss": 2.2315,
"step": 28
},
{
"epoch": 1.589041095890411,
"grad_norm": 1.3985637426376343,
"learning_rate": 1e-06,
"loss": 2.262,
"step": 29
},
{
"epoch": 1.643835616438356,
"grad_norm": 1.3776822090148926,
"learning_rate": 1e-06,
"loss": 2.224,
"step": 30
},
{
"epoch": 1.6986301369863015,
"grad_norm": 1.3197417259216309,
"learning_rate": 1e-06,
"loss": 2.2009,
"step": 31
},
{
"epoch": 1.7534246575342465,
"grad_norm": 1.4159483909606934,
"learning_rate": 1e-06,
"loss": 2.2131,
"step": 32
},
{
"epoch": 1.808219178082192,
"grad_norm": 1.3864014148712158,
"learning_rate": 1e-06,
"loss": 2.2498,
"step": 33
},
{
"epoch": 1.8630136986301369,
"grad_norm": 1.3488203287124634,
"learning_rate": 1e-06,
"loss": 2.2147,
"step": 34
},
{
"epoch": 1.9178082191780823,
"grad_norm": 1.345689296722412,
"learning_rate": 1e-06,
"loss": 2.2383,
"step": 35
},
{
"epoch": 1.9726027397260273,
"grad_norm": 1.344303011894226,
"learning_rate": 1e-06,
"loss": 2.2159,
"step": 36
},
{
"epoch": 2.0273972602739727,
"grad_norm": 1.3895442485809326,
"learning_rate": 1e-06,
"loss": 2.2265,
"step": 37
},
{
"epoch": 2.0821917808219177,
"grad_norm": 1.3593428134918213,
"learning_rate": 1e-06,
"loss": 2.2063,
"step": 38
},
{
"epoch": 2.136986301369863,
"grad_norm": 1.3060978651046753,
"learning_rate": 1e-06,
"loss": 2.2572,
"step": 39
},
{
"epoch": 2.191780821917808,
"grad_norm": 1.3199517726898193,
"learning_rate": 1e-06,
"loss": 2.2099,
"step": 40
},
{
"epoch": 2.2465753424657535,
"grad_norm": 1.3381460905075073,
"learning_rate": 1e-06,
"loss": 2.2693,
"step": 41
},
{
"epoch": 2.3013698630136985,
"grad_norm": 1.334553599357605,
"learning_rate": 1e-06,
"loss": 2.2206,
"step": 42
},
{
"epoch": 2.356164383561644,
"grad_norm": 1.3222883939743042,
"learning_rate": 1e-06,
"loss": 2.1851,
"step": 43
},
{
"epoch": 2.410958904109589,
"grad_norm": 1.3213746547698975,
"learning_rate": 1e-06,
"loss": 2.2542,
"step": 44
},
{
"epoch": 2.4657534246575343,
"grad_norm": 1.3214170932769775,
"learning_rate": 1e-06,
"loss": 2.2319,
"step": 45
},
{
"epoch": 2.5205479452054793,
"grad_norm": 1.345453143119812,
"learning_rate": 1e-06,
"loss": 2.222,
"step": 46
},
{
"epoch": 2.5753424657534247,
"grad_norm": 1.2182488441467285,
"learning_rate": 1e-06,
"loss": 2.2069,
"step": 47
},
{
"epoch": 2.6301369863013697,
"grad_norm": 1.2841640710830688,
"learning_rate": 1e-06,
"loss": 2.2181,
"step": 48
},
{
"epoch": 2.684931506849315,
"grad_norm": 1.270230770111084,
"learning_rate": 1e-06,
"loss": 2.2097,
"step": 49
},
{
"epoch": 2.73972602739726,
"grad_norm": 1.213972806930542,
"learning_rate": 1e-06,
"loss": 2.218,
"step": 50
},
{
"epoch": 2.7945205479452055,
"grad_norm": 1.2877941131591797,
"learning_rate": 1e-06,
"loss": 2.2055,
"step": 51
},
{
"epoch": 2.8493150684931505,
"grad_norm": 1.273301601409912,
"learning_rate": 1e-06,
"loss": 2.1895,
"step": 52
},
{
"epoch": 2.904109589041096,
"grad_norm": 1.2318782806396484,
"learning_rate": 1e-06,
"loss": 2.2255,
"step": 53
},
{
"epoch": 2.958904109589041,
"grad_norm": 1.1937693357467651,
"learning_rate": 1e-06,
"loss": 2.1865,
"step": 54
},
{
"epoch": 3.0136986301369864,
"grad_norm": 1.1707606315612793,
"learning_rate": 1e-06,
"loss": 2.2179,
"step": 55
},
{
"epoch": 3.0684931506849313,
"grad_norm": 1.2074235677719116,
"learning_rate": 1e-06,
"loss": 2.155,
"step": 56
},
{
"epoch": 3.1232876712328768,
"grad_norm": 1.1725316047668457,
"learning_rate": 1e-06,
"loss": 2.2011,
"step": 57
},
{
"epoch": 3.1780821917808217,
"grad_norm": 1.1967130899429321,
"learning_rate": 1e-06,
"loss": 2.2155,
"step": 58
},
{
"epoch": 3.232876712328767,
"grad_norm": 1.1932190656661987,
"learning_rate": 1e-06,
"loss": 2.1858,
"step": 59
},
{
"epoch": 3.287671232876712,
"grad_norm": 1.19328773021698,
"learning_rate": 1e-06,
"loss": 2.2351,
"step": 60
},
{
"epoch": 3.3424657534246576,
"grad_norm": 1.1168928146362305,
"learning_rate": 1e-06,
"loss": 2.2022,
"step": 61
},
{
"epoch": 3.3972602739726026,
"grad_norm": 1.2043449878692627,
"learning_rate": 1e-06,
"loss": 2.1964,
"step": 62
},
{
"epoch": 3.452054794520548,
"grad_norm": 1.2224105596542358,
"learning_rate": 1e-06,
"loss": 2.1919,
"step": 63
},
{
"epoch": 3.506849315068493,
"grad_norm": 1.2362271547317505,
"learning_rate": 1e-06,
"loss": 2.199,
"step": 64
},
{
"epoch": 3.5616438356164384,
"grad_norm": 1.2123560905456543,
"learning_rate": 1e-06,
"loss": 2.2357,
"step": 65
},
{
"epoch": 3.616438356164384,
"grad_norm": 1.1854863166809082,
"learning_rate": 1e-06,
"loss": 2.1878,
"step": 66
},
{
"epoch": 3.671232876712329,
"grad_norm": 1.1320362091064453,
"learning_rate": 1e-06,
"loss": 2.1872,
"step": 67
},
{
"epoch": 3.7260273972602738,
"grad_norm": 1.1633937358856201,
"learning_rate": 1e-06,
"loss": 2.205,
"step": 68
},
{
"epoch": 3.780821917808219,
"grad_norm": 1.1435497999191284,
"learning_rate": 1e-06,
"loss": 2.1972,
"step": 69
},
{
"epoch": 3.8356164383561646,
"grad_norm": 1.1820743083953857,
"learning_rate": 1e-06,
"loss": 2.1961,
"step": 70
},
{
"epoch": 3.8904109589041096,
"grad_norm": 1.203647255897522,
"learning_rate": 1e-06,
"loss": 2.2149,
"step": 71
},
{
"epoch": 3.9452054794520546,
"grad_norm": 1.1167892217636108,
"learning_rate": 1e-06,
"loss": 2.197,
"step": 72
},
{
"epoch": 4.0,
"grad_norm": 1.0951488018035889,
"learning_rate": 1e-06,
"loss": 2.1898,
"step": 73
},
{
"epoch": 4.054794520547945,
"grad_norm": 1.1908702850341797,
"learning_rate": 1e-06,
"loss": 2.1973,
"step": 74
},
{
"epoch": 4.109589041095891,
"grad_norm": 1.0710009336471558,
"learning_rate": 1e-06,
"loss": 2.2014,
"step": 75
},
{
"epoch": 4.164383561643835,
"grad_norm": 1.1268314123153687,
"learning_rate": 1e-06,
"loss": 2.2125,
"step": 76
},
{
"epoch": 4.219178082191781,
"grad_norm": 1.0808967351913452,
"learning_rate": 1e-06,
"loss": 2.2184,
"step": 77
},
{
"epoch": 4.273972602739726,
"grad_norm": 1.0744292736053467,
"learning_rate": 1e-06,
"loss": 2.162,
"step": 78
},
{
"epoch": 4.328767123287671,
"grad_norm": 1.0902713537216187,
"learning_rate": 1e-06,
"loss": 2.2045,
"step": 79
},
{
"epoch": 4.383561643835616,
"grad_norm": 1.1404340267181396,
"learning_rate": 1e-06,
"loss": 2.1919,
"step": 80
},
{
"epoch": 4.438356164383562,
"grad_norm": 1.0819721221923828,
"learning_rate": 1e-06,
"loss": 2.1848,
"step": 81
},
{
"epoch": 4.493150684931507,
"grad_norm": 1.0939464569091797,
"learning_rate": 1e-06,
"loss": 2.197,
"step": 82
},
{
"epoch": 4.5479452054794525,
"grad_norm": 1.1371257305145264,
"learning_rate": 1e-06,
"loss": 2.1802,
"step": 83
},
{
"epoch": 4.602739726027397,
"grad_norm": 1.0913671255111694,
"learning_rate": 1e-06,
"loss": 2.182,
"step": 84
},
{
"epoch": 4.657534246575342,
"grad_norm": 1.0597493648529053,
"learning_rate": 1e-06,
"loss": 2.1663,
"step": 85
},
{
"epoch": 4.712328767123288,
"grad_norm": 1.040493130683899,
"learning_rate": 1e-06,
"loss": 2.1774,
"step": 86
},
{
"epoch": 4.767123287671232,
"grad_norm": 1.0556532144546509,
"learning_rate": 1e-06,
"loss": 2.2029,
"step": 87
},
{
"epoch": 4.821917808219178,
"grad_norm": 1.0801831483840942,
"learning_rate": 1e-06,
"loss": 2.1648,
"step": 88
},
{
"epoch": 4.876712328767123,
"grad_norm": 1.073749303817749,
"learning_rate": 1e-06,
"loss": 2.174,
"step": 89
},
{
"epoch": 4.931506849315069,
"grad_norm": 1.0210574865341187,
"learning_rate": 1e-06,
"loss": 2.1474,
"step": 90
},
{
"epoch": 4.986301369863014,
"grad_norm": 1.0152342319488525,
"learning_rate": 1e-06,
"loss": 2.1629,
"step": 91
},
{
"epoch": 5.041095890410959,
"grad_norm": 1.0388507843017578,
"learning_rate": 1e-06,
"loss": 2.1931,
"step": 92
},
{
"epoch": 5.095890410958904,
"grad_norm": 1.011426329612732,
"learning_rate": 1e-06,
"loss": 2.204,
"step": 93
},
{
"epoch": 5.1506849315068495,
"grad_norm": 1.0486528873443604,
"learning_rate": 1e-06,
"loss": 2.1908,
"step": 94
},
{
"epoch": 5.205479452054795,
"grad_norm": 0.9501799941062927,
"learning_rate": 1e-06,
"loss": 2.1823,
"step": 95
},
{
"epoch": 5.260273972602739,
"grad_norm": 1.0336531400680542,
"learning_rate": 1e-06,
"loss": 2.1965,
"step": 96
},
{
"epoch": 5.315068493150685,
"grad_norm": 1.0227267742156982,
"learning_rate": 1e-06,
"loss": 2.1896,
"step": 97
},
{
"epoch": 5.36986301369863,
"grad_norm": 1.0686023235321045,
"learning_rate": 1e-06,
"loss": 2.1496,
"step": 98
},
{
"epoch": 5.424657534246576,
"grad_norm": 0.9931809902191162,
"learning_rate": 1e-06,
"loss": 2.1474,
"step": 99
},
{
"epoch": 5.47945205479452,
"grad_norm": 0.9578049778938293,
"learning_rate": 1e-06,
"loss": 2.1488,
"step": 100
},
{
"epoch": 5.534246575342466,
"grad_norm": 0.9815987944602966,
"learning_rate": 1e-06,
"loss": 2.1755,
"step": 101
},
{
"epoch": 5.589041095890411,
"grad_norm": 0.9837309718132019,
"learning_rate": 1e-06,
"loss": 2.1559,
"step": 102
},
{
"epoch": 5.6438356164383565,
"grad_norm": 0.9334861040115356,
"learning_rate": 1e-06,
"loss": 2.1773,
"step": 103
},
{
"epoch": 5.698630136986301,
"grad_norm": 1.0627118349075317,
"learning_rate": 1e-06,
"loss": 2.2116,
"step": 104
},
{
"epoch": 5.7534246575342465,
"grad_norm": 0.9978325963020325,
"learning_rate": 1e-06,
"loss": 2.1413,
"step": 105
},
{
"epoch": 5.808219178082192,
"grad_norm": 0.9550198912620544,
"learning_rate": 1e-06,
"loss": 2.1535,
"step": 106
},
{
"epoch": 5.863013698630137,
"grad_norm": 0.9339421987533569,
"learning_rate": 1e-06,
"loss": 2.1504,
"step": 107
},
{
"epoch": 5.917808219178082,
"grad_norm": 0.9043423533439636,
"learning_rate": 1e-06,
"loss": 2.1469,
"step": 108
},
{
"epoch": 5.972602739726027,
"grad_norm": 0.921292781829834,
"learning_rate": 1e-06,
"loss": 2.1337,
"step": 109
},
{
"epoch": 6.027397260273973,
"grad_norm": 0.9245712757110596,
"learning_rate": 1e-06,
"loss": 2.1762,
"step": 110
},
{
"epoch": 6.082191780821918,
"grad_norm": 0.9610967636108398,
"learning_rate": 1e-06,
"loss": 2.1618,
"step": 111
},
{
"epoch": 6.136986301369863,
"grad_norm": 0.9136860370635986,
"learning_rate": 1e-06,
"loss": 2.1505,
"step": 112
},
{
"epoch": 6.191780821917808,
"grad_norm": 0.9340102672576904,
"learning_rate": 1e-06,
"loss": 2.1692,
"step": 113
},
{
"epoch": 6.2465753424657535,
"grad_norm": 0.8885300159454346,
"learning_rate": 1e-06,
"loss": 2.1494,
"step": 114
},
{
"epoch": 6.301369863013699,
"grad_norm": 0.917847216129303,
"learning_rate": 1e-06,
"loss": 2.1503,
"step": 115
},
{
"epoch": 6.3561643835616435,
"grad_norm": 0.9519619345664978,
"learning_rate": 1e-06,
"loss": 2.1766,
"step": 116
},
{
"epoch": 6.410958904109589,
"grad_norm": 0.8926482200622559,
"learning_rate": 1e-06,
"loss": 2.1493,
"step": 117
},
{
"epoch": 6.465753424657534,
"grad_norm": 0.817862868309021,
"learning_rate": 1e-06,
"loss": 2.166,
"step": 118
},
{
"epoch": 6.52054794520548,
"grad_norm": 0.8948012590408325,
"learning_rate": 1e-06,
"loss": 2.1346,
"step": 119
},
{
"epoch": 6.575342465753424,
"grad_norm": 0.9632709622383118,
"learning_rate": 1e-06,
"loss": 2.1427,
"step": 120
},
{
"epoch": 6.63013698630137,
"grad_norm": 0.9267117381095886,
"learning_rate": 1e-06,
"loss": 2.1581,
"step": 121
},
{
"epoch": 6.684931506849315,
"grad_norm": 0.9063679575920105,
"learning_rate": 1e-06,
"loss": 2.1453,
"step": 122
},
{
"epoch": 6.739726027397261,
"grad_norm": 0.9395270347595215,
"learning_rate": 1e-06,
"loss": 2.1515,
"step": 123
},
{
"epoch": 6.794520547945205,
"grad_norm": 0.9410396218299866,
"learning_rate": 1e-06,
"loss": 2.1518,
"step": 124
},
{
"epoch": 6.8493150684931505,
"grad_norm": 0.9229517579078674,
"learning_rate": 1e-06,
"loss": 2.1703,
"step": 125
},
{
"epoch": 6.904109589041096,
"grad_norm": 0.8469845652580261,
"learning_rate": 1e-06,
"loss": 2.1491,
"step": 126
},
{
"epoch": 6.958904109589041,
"grad_norm": 0.9080257415771484,
"learning_rate": 1e-06,
"loss": 2.1472,
"step": 127
},
{
"epoch": 7.013698630136986,
"grad_norm": 0.9071102142333984,
"learning_rate": 1e-06,
"loss": 2.1685,
"step": 128
},
{
"epoch": 7.068493150684931,
"grad_norm": 0.8933852910995483,
"learning_rate": 1e-06,
"loss": 2.1617,
"step": 129
},
{
"epoch": 7.123287671232877,
"grad_norm": 0.9227753281593323,
"learning_rate": 1e-06,
"loss": 2.1617,
"step": 130
},
{
"epoch": 7.178082191780822,
"grad_norm": 0.8686262965202332,
"learning_rate": 1e-06,
"loss": 2.1546,
"step": 131
},
{
"epoch": 7.232876712328767,
"grad_norm": 0.8385916948318481,
"learning_rate": 1e-06,
"loss": 2.1442,
"step": 132
},
{
"epoch": 7.287671232876712,
"grad_norm": 0.8217021822929382,
"learning_rate": 1e-06,
"loss": 2.1606,
"step": 133
},
{
"epoch": 7.342465753424658,
"grad_norm": 0.862777590751648,
"learning_rate": 1e-06,
"loss": 2.153,
"step": 134
},
{
"epoch": 7.397260273972603,
"grad_norm": 0.8956757187843323,
"learning_rate": 1e-06,
"loss": 2.1807,
"step": 135
},
{
"epoch": 7.4520547945205475,
"grad_norm": 0.781984806060791,
"learning_rate": 1e-06,
"loss": 2.1469,
"step": 136
},
{
"epoch": 7.506849315068493,
"grad_norm": 0.8100602030754089,
"learning_rate": 1e-06,
"loss": 2.107,
"step": 137
},
{
"epoch": 7.561643835616438,
"grad_norm": 0.8204404711723328,
"learning_rate": 1e-06,
"loss": 2.1477,
"step": 138
},
{
"epoch": 7.616438356164384,
"grad_norm": 0.8198928236961365,
"learning_rate": 1e-06,
"loss": 2.1514,
"step": 139
},
{
"epoch": 7.671232876712329,
"grad_norm": 0.8388807773590088,
"learning_rate": 1e-06,
"loss": 2.1265,
"step": 140
},
{
"epoch": 7.726027397260274,
"grad_norm": 0.8662092089653015,
"learning_rate": 1e-06,
"loss": 2.1316,
"step": 141
},
{
"epoch": 7.780821917808219,
"grad_norm": 0.7682031393051147,
"learning_rate": 1e-06,
"loss": 2.1164,
"step": 142
},
{
"epoch": 7.835616438356165,
"grad_norm": 0.796292781829834,
"learning_rate": 1e-06,
"loss": 2.1342,
"step": 143
},
{
"epoch": 7.890410958904109,
"grad_norm": 0.8075994253158569,
"learning_rate": 1e-06,
"loss": 2.1221,
"step": 144
},
{
"epoch": 7.945205479452055,
"grad_norm": 0.8507598638534546,
"learning_rate": 1e-06,
"loss": 2.1513,
"step": 145
},
{
"epoch": 8.0,
"grad_norm": 0.768495500087738,
"learning_rate": 1e-06,
"loss": 2.1369,
"step": 146
}
],
"logging_steps": 1,
"max_steps": 540,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"total_flos": 1.0225056854153626e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}