gpt2-medium-chat / trainer_state.json
rwl4's picture
Initial commit.
db830ab
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9844639286263652,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 0.0,
"loss": 76.185,
"step": 1
},
{
"epoch": 0.0,
"learning_rate": 0.0,
"loss": 75.9406,
"step": 2
},
{
"epoch": 0.01,
"learning_rate": 0.0,
"loss": 74.2765,
"step": 3
},
{
"epoch": 0.01,
"learning_rate": 0.0,
"loss": 75.5684,
"step": 4
},
{
"epoch": 0.01,
"learning_rate": 0.0,
"loss": 74.7604,
"step": 5
},
{
"epoch": 0.01,
"learning_rate": 2.9999999999999997e-06,
"loss": 75.6519,
"step": 6
},
{
"epoch": 0.02,
"learning_rate": 5.999999999999999e-06,
"loss": 75.5014,
"step": 7
},
{
"epoch": 0.02,
"learning_rate": 8.999999999999999e-06,
"loss": 72.6792,
"step": 8
},
{
"epoch": 0.02,
"learning_rate": 1.1999999999999999e-05,
"loss": 67.9613,
"step": 9
},
{
"epoch": 0.02,
"learning_rate": 1.4999999999999999e-05,
"loss": 60.5481,
"step": 10
},
{
"epoch": 0.03,
"learning_rate": 1.7999999999999997e-05,
"loss": 51.063,
"step": 11
},
{
"epoch": 0.03,
"learning_rate": 1.7999999999999997e-05,
"loss": 40.1325,
"step": 12
},
{
"epoch": 0.03,
"learning_rate": 2.1e-05,
"loss": 41.0159,
"step": 13
},
{
"epoch": 0.03,
"learning_rate": 2.3999999999999997e-05,
"loss": 27.2134,
"step": 14
},
{
"epoch": 0.04,
"learning_rate": 2.6999999999999996e-05,
"loss": 17.269,
"step": 15
},
{
"epoch": 0.04,
"learning_rate": 2.9999999999999997e-05,
"loss": 12.7363,
"step": 16
},
{
"epoch": 0.04,
"learning_rate": 3.2999999999999996e-05,
"loss": 11.2271,
"step": 17
},
{
"epoch": 0.04,
"learning_rate": 3.5999999999999994e-05,
"loss": 10.9063,
"step": 18
},
{
"epoch": 0.05,
"learning_rate": 3.9e-05,
"loss": 8.7719,
"step": 19
},
{
"epoch": 0.05,
"learning_rate": 4.2e-05,
"loss": 8.6839,
"step": 20
},
{
"epoch": 0.05,
"learning_rate": 4.4999999999999996e-05,
"loss": 7.9548,
"step": 21
},
{
"epoch": 0.05,
"learning_rate": 4.7999999999999994e-05,
"loss": 7.007,
"step": 22
},
{
"epoch": 0.06,
"learning_rate": 5.1e-05,
"loss": 6.3333,
"step": 23
},
{
"epoch": 0.06,
"learning_rate": 5.399999999999999e-05,
"loss": 5.5819,
"step": 24
},
{
"epoch": 0.06,
"learning_rate": 5.6999999999999996e-05,
"loss": 5.024,
"step": 25
},
{
"epoch": 0.06,
"learning_rate": 5.9999999999999995e-05,
"loss": 4.6655,
"step": 26
},
{
"epoch": 0.07,
"learning_rate": 6.299999999999999e-05,
"loss": 4.2421,
"step": 27
},
{
"epoch": 0.07,
"learning_rate": 6.599999999999999e-05,
"loss": 3.8165,
"step": 28
},
{
"epoch": 0.07,
"learning_rate": 6.9e-05,
"loss": 3.5531,
"step": 29
},
{
"epoch": 0.07,
"learning_rate": 7.199999999999999e-05,
"loss": 3.3202,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 7.5e-05,
"loss": 2.967,
"step": 31
},
{
"epoch": 0.08,
"learning_rate": 7.8e-05,
"loss": 2.6995,
"step": 32
},
{
"epoch": 0.08,
"learning_rate": 8.1e-05,
"loss": 2.5744,
"step": 33
},
{
"epoch": 0.08,
"learning_rate": 8.4e-05,
"loss": 2.4186,
"step": 34
},
{
"epoch": 0.09,
"learning_rate": 8.699999999999999e-05,
"loss": 2.1822,
"step": 35
},
{
"epoch": 0.09,
"learning_rate": 8.999999999999999e-05,
"loss": 2.0079,
"step": 36
},
{
"epoch": 0.09,
"learning_rate": 9.3e-05,
"loss": 1.9346,
"step": 37
},
{
"epoch": 0.09,
"learning_rate": 9.599999999999999e-05,
"loss": 1.8907,
"step": 38
},
{
"epoch": 0.1,
"learning_rate": 9.9e-05,
"loss": 1.6931,
"step": 39
},
{
"epoch": 0.1,
"learning_rate": 0.000102,
"loss": 1.6808,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 0.00010499999999999999,
"loss": 1.5507,
"step": 41
},
{
"epoch": 0.1,
"learning_rate": 0.00010799999999999998,
"loss": 1.5886,
"step": 42
},
{
"epoch": 0.11,
"learning_rate": 0.00011099999999999999,
"loss": 1.5385,
"step": 43
},
{
"epoch": 0.11,
"learning_rate": 0.00011399999999999999,
"loss": 1.5667,
"step": 44
},
{
"epoch": 0.11,
"learning_rate": 0.000117,
"loss": 1.5299,
"step": 45
},
{
"epoch": 0.11,
"learning_rate": 0.00011999999999999999,
"loss": 1.4758,
"step": 46
},
{
"epoch": 0.12,
"learning_rate": 0.00012299999999999998,
"loss": 1.4841,
"step": 47
},
{
"epoch": 0.12,
"learning_rate": 0.00012599999999999997,
"loss": 1.4327,
"step": 48
},
{
"epoch": 0.12,
"learning_rate": 0.000129,
"loss": 1.3712,
"step": 49
},
{
"epoch": 0.12,
"learning_rate": 0.00013199999999999998,
"loss": 1.3586,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 0.000135,
"loss": 1.3877,
"step": 51
},
{
"epoch": 0.13,
"learning_rate": 0.000138,
"loss": 1.3471,
"step": 52
},
{
"epoch": 0.13,
"learning_rate": 0.00014099999999999998,
"loss": 1.3334,
"step": 53
},
{
"epoch": 0.13,
"learning_rate": 0.00014399999999999998,
"loss": 1.3129,
"step": 54
},
{
"epoch": 0.14,
"learning_rate": 0.000147,
"loss": 1.3245,
"step": 55
},
{
"epoch": 0.14,
"learning_rate": 0.00015,
"loss": 1.3082,
"step": 56
},
{
"epoch": 0.14,
"learning_rate": 0.00015299999999999998,
"loss": 1.3382,
"step": 57
},
{
"epoch": 0.14,
"learning_rate": 0.000156,
"loss": 1.2748,
"step": 58
},
{
"epoch": 0.15,
"learning_rate": 0.000159,
"loss": 1.3189,
"step": 59
},
{
"epoch": 0.15,
"learning_rate": 0.000162,
"loss": 1.2445,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 0.000165,
"loss": 1.2634,
"step": 61
},
{
"epoch": 0.15,
"learning_rate": 0.000168,
"loss": 1.2505,
"step": 62
},
{
"epoch": 0.16,
"learning_rate": 0.00017099999999999998,
"loss": 1.282,
"step": 63
},
{
"epoch": 0.16,
"learning_rate": 0.00017399999999999997,
"loss": 1.2642,
"step": 64
},
{
"epoch": 0.16,
"learning_rate": 0.00017699999999999997,
"loss": 1.2287,
"step": 65
},
{
"epoch": 0.16,
"learning_rate": 0.00017999999999999998,
"loss": 1.2677,
"step": 66
},
{
"epoch": 0.16,
"learning_rate": 0.00018299999999999998,
"loss": 1.3291,
"step": 67
},
{
"epoch": 0.17,
"learning_rate": 0.000186,
"loss": 1.2742,
"step": 68
},
{
"epoch": 0.17,
"learning_rate": 0.00018899999999999999,
"loss": 1.3171,
"step": 69
},
{
"epoch": 0.17,
"learning_rate": 0.00019199999999999998,
"loss": 1.3109,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 0.000195,
"loss": 1.265,
"step": 71
},
{
"epoch": 0.18,
"learning_rate": 0.000198,
"loss": 1.267,
"step": 72
},
{
"epoch": 0.18,
"learning_rate": 0.000201,
"loss": 1.2664,
"step": 73
},
{
"epoch": 0.18,
"learning_rate": 0.000204,
"loss": 1.2781,
"step": 74
},
{
"epoch": 0.18,
"learning_rate": 0.00020699999999999996,
"loss": 1.1558,
"step": 75
},
{
"epoch": 0.19,
"learning_rate": 0.00020999999999999998,
"loss": 1.2308,
"step": 76
},
{
"epoch": 0.19,
"learning_rate": 0.00021299999999999997,
"loss": 1.3119,
"step": 77
},
{
"epoch": 0.19,
"learning_rate": 0.00021599999999999996,
"loss": 1.303,
"step": 78
},
{
"epoch": 0.19,
"learning_rate": 0.00021899999999999998,
"loss": 1.1757,
"step": 79
},
{
"epoch": 0.2,
"learning_rate": 0.00022199999999999998,
"loss": 1.2544,
"step": 80
},
{
"epoch": 0.2,
"learning_rate": 0.000225,
"loss": 1.1772,
"step": 81
},
{
"epoch": 0.2,
"learning_rate": 0.00022799999999999999,
"loss": 1.189,
"step": 82
},
{
"epoch": 0.2,
"learning_rate": 0.00023099999999999998,
"loss": 1.2382,
"step": 83
},
{
"epoch": 0.21,
"learning_rate": 0.000234,
"loss": 1.2537,
"step": 84
},
{
"epoch": 0.21,
"learning_rate": 0.000237,
"loss": 1.2509,
"step": 85
},
{
"epoch": 0.21,
"learning_rate": 0.00023999999999999998,
"loss": 1.2463,
"step": 86
},
{
"epoch": 0.21,
"learning_rate": 0.000243,
"loss": 1.1712,
"step": 87
},
{
"epoch": 0.22,
"learning_rate": 0.00024599999999999996,
"loss": 1.1861,
"step": 88
},
{
"epoch": 0.22,
"learning_rate": 0.000249,
"loss": 1.2451,
"step": 89
},
{
"epoch": 0.22,
"learning_rate": 0.00025199999999999995,
"loss": 1.1949,
"step": 90
},
{
"epoch": 0.22,
"learning_rate": 0.00025499999999999996,
"loss": 1.2673,
"step": 91
},
{
"epoch": 0.23,
"learning_rate": 0.000258,
"loss": 1.2087,
"step": 92
},
{
"epoch": 0.23,
"learning_rate": 0.000261,
"loss": 1.1405,
"step": 93
},
{
"epoch": 0.23,
"learning_rate": 0.00026399999999999997,
"loss": 1.1733,
"step": 94
},
{
"epoch": 0.23,
"learning_rate": 0.000267,
"loss": 1.1969,
"step": 95
},
{
"epoch": 0.24,
"learning_rate": 0.00027,
"loss": 1.1963,
"step": 96
},
{
"epoch": 0.24,
"learning_rate": 0.00027299999999999997,
"loss": 1.1919,
"step": 97
},
{
"epoch": 0.24,
"learning_rate": 0.000276,
"loss": 1.2095,
"step": 98
},
{
"epoch": 0.24,
"learning_rate": 0.000279,
"loss": 1.2056,
"step": 99
},
{
"epoch": 0.25,
"learning_rate": 0.00028199999999999997,
"loss": 1.1847,
"step": 100
},
{
"epoch": 0.25,
"learning_rate": 0.000285,
"loss": 1.1293,
"step": 101
},
{
"epoch": 0.25,
"learning_rate": 0.00028799999999999995,
"loss": 1.2037,
"step": 102
},
{
"epoch": 0.25,
"learning_rate": 0.00029099999999999997,
"loss": 1.2516,
"step": 103
},
{
"epoch": 0.26,
"learning_rate": 0.000294,
"loss": 1.1555,
"step": 104
},
{
"epoch": 0.26,
"learning_rate": 0.00029699999999999996,
"loss": 1.1996,
"step": 105
},
{
"epoch": 0.26,
"learning_rate": 0.0003,
"loss": 1.19,
"step": 106
},
{
"epoch": 0.26,
"learning_rate": 0.00029973166368515205,
"loss": 1.1211,
"step": 107
},
{
"epoch": 0.27,
"learning_rate": 0.00029946332737030407,
"loss": 1.1689,
"step": 108
},
{
"epoch": 0.27,
"learning_rate": 0.00029919499105545615,
"loss": 1.2607,
"step": 109
},
{
"epoch": 0.27,
"learning_rate": 0.00029892665474060817,
"loss": 1.1527,
"step": 110
},
{
"epoch": 0.27,
"learning_rate": 0.00029865831842576025,
"loss": 1.1717,
"step": 111
},
{
"epoch": 0.28,
"learning_rate": 0.0002983899821109123,
"loss": 1.1793,
"step": 112
},
{
"epoch": 0.28,
"learning_rate": 0.0002981216457960644,
"loss": 1.1861,
"step": 113
},
{
"epoch": 0.28,
"learning_rate": 0.0002978533094812164,
"loss": 1.1954,
"step": 114
},
{
"epoch": 0.28,
"learning_rate": 0.0002975849731663685,
"loss": 1.2359,
"step": 115
},
{
"epoch": 0.29,
"learning_rate": 0.00029731663685152057,
"loss": 1.1814,
"step": 116
},
{
"epoch": 0.29,
"learning_rate": 0.0002970483005366726,
"loss": 1.204,
"step": 117
},
{
"epoch": 0.29,
"learning_rate": 0.00029677996422182467,
"loss": 1.1333,
"step": 118
},
{
"epoch": 0.29,
"learning_rate": 0.0002965116279069767,
"loss": 1.1817,
"step": 119
},
{
"epoch": 0.3,
"learning_rate": 0.00029624329159212877,
"loss": 1.1993,
"step": 120
},
{
"epoch": 0.3,
"learning_rate": 0.00029597495527728084,
"loss": 1.2073,
"step": 121
},
{
"epoch": 0.3,
"learning_rate": 0.0002957066189624329,
"loss": 1.2209,
"step": 122
},
{
"epoch": 0.3,
"learning_rate": 0.00029543828264758494,
"loss": 1.2042,
"step": 123
},
{
"epoch": 0.31,
"learning_rate": 0.000295169946332737,
"loss": 1.118,
"step": 124
},
{
"epoch": 0.31,
"learning_rate": 0.0002949016100178891,
"loss": 1.1397,
"step": 125
},
{
"epoch": 0.31,
"learning_rate": 0.0002946332737030411,
"loss": 1.2251,
"step": 126
},
{
"epoch": 0.31,
"learning_rate": 0.0002943649373881932,
"loss": 1.0826,
"step": 127
},
{
"epoch": 0.32,
"learning_rate": 0.0002940966010733452,
"loss": 1.2536,
"step": 128
},
{
"epoch": 0.32,
"learning_rate": 0.0002938282647584973,
"loss": 1.0998,
"step": 129
},
{
"epoch": 0.32,
"learning_rate": 0.0002935599284436493,
"loss": 1.1557,
"step": 130
},
{
"epoch": 0.32,
"learning_rate": 0.0002932915921288014,
"loss": 1.1682,
"step": 131
},
{
"epoch": 0.32,
"learning_rate": 0.00029302325581395347,
"loss": 1.1785,
"step": 132
},
{
"epoch": 0.33,
"learning_rate": 0.00029275491949910554,
"loss": 1.1704,
"step": 133
},
{
"epoch": 0.33,
"learning_rate": 0.00029248658318425756,
"loss": 1.2328,
"step": 134
},
{
"epoch": 0.33,
"learning_rate": 0.00029221824686940964,
"loss": 1.1834,
"step": 135
},
{
"epoch": 0.33,
"learning_rate": 0.0002919499105545617,
"loss": 1.1196,
"step": 136
},
{
"epoch": 0.34,
"learning_rate": 0.00029168157423971374,
"loss": 1.1125,
"step": 137
},
{
"epoch": 0.34,
"learning_rate": 0.0002914132379248658,
"loss": 1.2163,
"step": 138
},
{
"epoch": 0.34,
"learning_rate": 0.00029114490161001784,
"loss": 1.1696,
"step": 139
},
{
"epoch": 0.34,
"learning_rate": 0.0002908765652951699,
"loss": 1.1833,
"step": 140
},
{
"epoch": 0.35,
"learning_rate": 0.000290608228980322,
"loss": 1.1865,
"step": 141
},
{
"epoch": 0.35,
"learning_rate": 0.00029033989266547406,
"loss": 1.2112,
"step": 142
},
{
"epoch": 0.35,
"learning_rate": 0.0002900715563506261,
"loss": 1.1015,
"step": 143
},
{
"epoch": 0.35,
"learning_rate": 0.00028980322003577816,
"loss": 1.1675,
"step": 144
},
{
"epoch": 0.36,
"learning_rate": 0.00028953488372093024,
"loss": 1.1234,
"step": 145
},
{
"epoch": 0.36,
"learning_rate": 0.00028926654740608226,
"loss": 1.1741,
"step": 146
},
{
"epoch": 0.36,
"learning_rate": 0.00028899821109123434,
"loss": 1.1877,
"step": 147
},
{
"epoch": 0.36,
"learning_rate": 0.00028872987477638636,
"loss": 1.1374,
"step": 148
},
{
"epoch": 0.37,
"learning_rate": 0.00028846153846153843,
"loss": 1.1488,
"step": 149
},
{
"epoch": 0.37,
"learning_rate": 0.00028819320214669046,
"loss": 1.1256,
"step": 150
},
{
"epoch": 0.37,
"learning_rate": 0.00028792486583184253,
"loss": 1.1783,
"step": 151
},
{
"epoch": 0.37,
"learning_rate": 0.0002876565295169946,
"loss": 1.1299,
"step": 152
},
{
"epoch": 0.38,
"learning_rate": 0.0002873881932021467,
"loss": 1.1604,
"step": 153
},
{
"epoch": 0.38,
"learning_rate": 0.00028711985688729876,
"loss": 1.1568,
"step": 154
},
{
"epoch": 0.38,
"learning_rate": 0.0002868515205724508,
"loss": 1.1771,
"step": 155
},
{
"epoch": 0.38,
"learning_rate": 0.00028658318425760286,
"loss": 1.1661,
"step": 156
},
{
"epoch": 0.39,
"learning_rate": 0.0002863148479427549,
"loss": 1.1863,
"step": 157
},
{
"epoch": 0.39,
"learning_rate": 0.00028604651162790696,
"loss": 1.1564,
"step": 158
},
{
"epoch": 0.39,
"learning_rate": 0.000285778175313059,
"loss": 1.1202,
"step": 159
},
{
"epoch": 0.39,
"learning_rate": 0.00028550983899821105,
"loss": 1.0977,
"step": 160
},
{
"epoch": 0.4,
"learning_rate": 0.00028524150268336313,
"loss": 1.1039,
"step": 161
},
{
"epoch": 0.4,
"learning_rate": 0.0002849731663685152,
"loss": 1.1355,
"step": 162
},
{
"epoch": 0.4,
"learning_rate": 0.00028470483005366723,
"loss": 1.1598,
"step": 163
},
{
"epoch": 0.4,
"learning_rate": 0.0002844364937388193,
"loss": 1.2163,
"step": 164
},
{
"epoch": 0.41,
"learning_rate": 0.0002841681574239714,
"loss": 1.1515,
"step": 165
},
{
"epoch": 0.41,
"learning_rate": 0.0002838998211091234,
"loss": 1.1433,
"step": 166
},
{
"epoch": 0.41,
"learning_rate": 0.0002836314847942755,
"loss": 1.1179,
"step": 167
},
{
"epoch": 0.41,
"learning_rate": 0.0002833631484794275,
"loss": 1.1703,
"step": 168
},
{
"epoch": 0.42,
"learning_rate": 0.0002830948121645796,
"loss": 1.1532,
"step": 169
},
{
"epoch": 0.42,
"learning_rate": 0.0002828264758497316,
"loss": 1.1333,
"step": 170
},
{
"epoch": 0.42,
"learning_rate": 0.0002825581395348837,
"loss": 1.1743,
"step": 171
},
{
"epoch": 0.42,
"learning_rate": 0.00028228980322003575,
"loss": 1.1067,
"step": 172
},
{
"epoch": 0.43,
"learning_rate": 0.00028202146690518783,
"loss": 1.1853,
"step": 173
},
{
"epoch": 0.43,
"learning_rate": 0.0002817531305903399,
"loss": 1.1502,
"step": 174
},
{
"epoch": 0.43,
"learning_rate": 0.0002814847942754919,
"loss": 1.1678,
"step": 175
},
{
"epoch": 0.43,
"learning_rate": 0.000281216457960644,
"loss": 1.1618,
"step": 176
},
{
"epoch": 0.44,
"learning_rate": 0.000280948121645796,
"loss": 1.1018,
"step": 177
},
{
"epoch": 0.44,
"learning_rate": 0.0002806797853309481,
"loss": 1.1307,
"step": 178
},
{
"epoch": 0.44,
"learning_rate": 0.0002804114490161001,
"loss": 1.1702,
"step": 179
},
{
"epoch": 0.44,
"learning_rate": 0.0002801431127012522,
"loss": 1.1261,
"step": 180
},
{
"epoch": 0.45,
"learning_rate": 0.0002798747763864043,
"loss": 1.1724,
"step": 181
},
{
"epoch": 0.45,
"learning_rate": 0.00027960644007155635,
"loss": 1.105,
"step": 182
},
{
"epoch": 0.45,
"learning_rate": 0.00027933810375670837,
"loss": 1.132,
"step": 183
},
{
"epoch": 0.45,
"learning_rate": 0.00027906976744186045,
"loss": 1.1482,
"step": 184
},
{
"epoch": 0.46,
"learning_rate": 0.0002788014311270125,
"loss": 1.174,
"step": 185
},
{
"epoch": 0.46,
"learning_rate": 0.00027853309481216455,
"loss": 1.0358,
"step": 186
},
{
"epoch": 0.46,
"learning_rate": 0.0002782647584973166,
"loss": 1.1621,
"step": 187
},
{
"epoch": 0.46,
"learning_rate": 0.00027799642218246864,
"loss": 1.1396,
"step": 188
},
{
"epoch": 0.47,
"learning_rate": 0.0002777280858676207,
"loss": 1.2077,
"step": 189
},
{
"epoch": 0.47,
"learning_rate": 0.0002774597495527728,
"loss": 1.1479,
"step": 190
},
{
"epoch": 0.47,
"learning_rate": 0.0002771914132379248,
"loss": 1.0947,
"step": 191
},
{
"epoch": 0.47,
"learning_rate": 0.0002769230769230769,
"loss": 1.087,
"step": 192
},
{
"epoch": 0.48,
"learning_rate": 0.00027665474060822897,
"loss": 1.0965,
"step": 193
},
{
"epoch": 0.48,
"learning_rate": 0.00027638640429338105,
"loss": 1.0773,
"step": 194
},
{
"epoch": 0.48,
"learning_rate": 0.00027611806797853307,
"loss": 1.1498,
"step": 195
},
{
"epoch": 0.48,
"learning_rate": 0.00027584973166368514,
"loss": 1.1767,
"step": 196
},
{
"epoch": 0.48,
"learning_rate": 0.00027558139534883717,
"loss": 1.1041,
"step": 197
},
{
"epoch": 0.49,
"learning_rate": 0.00027531305903398924,
"loss": 1.1357,
"step": 198
},
{
"epoch": 0.49,
"learning_rate": 0.00027504472271914126,
"loss": 1.1808,
"step": 199
},
{
"epoch": 0.49,
"learning_rate": 0.00027477638640429334,
"loss": 1.0874,
"step": 200
},
{
"epoch": 0.49,
"learning_rate": 0.0002745080500894454,
"loss": 1.1758,
"step": 201
},
{
"epoch": 0.5,
"learning_rate": 0.0002742397137745975,
"loss": 1.1293,
"step": 202
},
{
"epoch": 0.5,
"learning_rate": 0.0002739713774597495,
"loss": 1.1178,
"step": 203
},
{
"epoch": 0.5,
"learning_rate": 0.0002737030411449016,
"loss": 1.1319,
"step": 204
},
{
"epoch": 0.5,
"learning_rate": 0.00027343470483005367,
"loss": 1.0862,
"step": 205
},
{
"epoch": 0.51,
"learning_rate": 0.0002731663685152057,
"loss": 1.1207,
"step": 206
},
{
"epoch": 0.51,
"learning_rate": 0.00027289803220035777,
"loss": 1.1536,
"step": 207
},
{
"epoch": 0.51,
"learning_rate": 0.0002726296958855098,
"loss": 1.1135,
"step": 208
},
{
"epoch": 0.51,
"learning_rate": 0.00027236135957066186,
"loss": 1.1426,
"step": 209
},
{
"epoch": 0.52,
"learning_rate": 0.00027209302325581394,
"loss": 1.104,
"step": 210
},
{
"epoch": 0.52,
"learning_rate": 0.00027182468694096596,
"loss": 1.1456,
"step": 211
},
{
"epoch": 0.52,
"learning_rate": 0.00027155635062611804,
"loss": 1.1502,
"step": 212
},
{
"epoch": 0.52,
"learning_rate": 0.0002712880143112701,
"loss": 1.1319,
"step": 213
},
{
"epoch": 0.53,
"learning_rate": 0.0002710196779964222,
"loss": 1.0572,
"step": 214
},
{
"epoch": 0.53,
"learning_rate": 0.0002707513416815742,
"loss": 1.1197,
"step": 215
},
{
"epoch": 0.53,
"learning_rate": 0.0002704830053667263,
"loss": 1.0701,
"step": 216
},
{
"epoch": 0.53,
"learning_rate": 0.0002702146690518783,
"loss": 1.08,
"step": 217
},
{
"epoch": 0.54,
"learning_rate": 0.0002699463327370304,
"loss": 1.1022,
"step": 218
},
{
"epoch": 0.54,
"learning_rate": 0.0002696779964221824,
"loss": 1.1585,
"step": 219
},
{
"epoch": 0.54,
"learning_rate": 0.0002694096601073345,
"loss": 1.1125,
"step": 220
},
{
"epoch": 0.54,
"learning_rate": 0.00026914132379248656,
"loss": 1.1683,
"step": 221
},
{
"epoch": 0.55,
"learning_rate": 0.00026887298747763864,
"loss": 1.0717,
"step": 222
},
{
"epoch": 0.55,
"learning_rate": 0.00026860465116279066,
"loss": 1.0566,
"step": 223
},
{
"epoch": 0.55,
"learning_rate": 0.00026833631484794273,
"loss": 1.1283,
"step": 224
},
{
"epoch": 0.55,
"learning_rate": 0.0002680679785330948,
"loss": 1.0599,
"step": 225
},
{
"epoch": 0.56,
"learning_rate": 0.00026779964221824683,
"loss": 1.1555,
"step": 226
},
{
"epoch": 0.56,
"learning_rate": 0.0002675313059033989,
"loss": 1.1151,
"step": 227
},
{
"epoch": 0.56,
"learning_rate": 0.00026726296958855093,
"loss": 1.1031,
"step": 228
},
{
"epoch": 0.56,
"learning_rate": 0.000266994633273703,
"loss": 1.1357,
"step": 229
},
{
"epoch": 0.57,
"learning_rate": 0.0002667262969588551,
"loss": 1.0255,
"step": 230
},
{
"epoch": 0.57,
"learning_rate": 0.0002664579606440071,
"loss": 1.082,
"step": 231
},
{
"epoch": 0.57,
"learning_rate": 0.0002661896243291592,
"loss": 1.1354,
"step": 232
},
{
"epoch": 0.57,
"learning_rate": 0.00026592128801431126,
"loss": 1.1089,
"step": 233
},
{
"epoch": 0.58,
"learning_rate": 0.00026565295169946333,
"loss": 1.025,
"step": 234
},
{
"epoch": 0.58,
"learning_rate": 0.00026538461538461536,
"loss": 1.155,
"step": 235
},
{
"epoch": 0.58,
"learning_rate": 0.00026511627906976743,
"loss": 1.0183,
"step": 236
},
{
"epoch": 0.58,
"learning_rate": 0.00026484794275491945,
"loss": 1.0879,
"step": 237
},
{
"epoch": 0.59,
"learning_rate": 0.00026457960644007153,
"loss": 1.1757,
"step": 238
},
{
"epoch": 0.59,
"learning_rate": 0.0002643112701252236,
"loss": 1.113,
"step": 239
},
{
"epoch": 0.59,
"learning_rate": 0.00026404293381037563,
"loss": 1.2179,
"step": 240
},
{
"epoch": 0.59,
"learning_rate": 0.0002637745974955277,
"loss": 1.1009,
"step": 241
},
{
"epoch": 0.6,
"learning_rate": 0.0002635062611806798,
"loss": 1.1551,
"step": 242
},
{
"epoch": 0.6,
"learning_rate": 0.0002632379248658318,
"loss": 1.134,
"step": 243
},
{
"epoch": 0.6,
"learning_rate": 0.0002629695885509839,
"loss": 1.1067,
"step": 244
},
{
"epoch": 0.6,
"learning_rate": 0.00026270125223613595,
"loss": 1.1171,
"step": 245
},
{
"epoch": 0.61,
"learning_rate": 0.000262432915921288,
"loss": 1.1095,
"step": 246
},
{
"epoch": 0.61,
"learning_rate": 0.00026216457960644005,
"loss": 1.1101,
"step": 247
},
{
"epoch": 0.61,
"learning_rate": 0.0002618962432915921,
"loss": 1.131,
"step": 248
},
{
"epoch": 0.61,
"learning_rate": 0.00026162790697674415,
"loss": 1.1268,
"step": 249
},
{
"epoch": 0.62,
"learning_rate": 0.0002613595706618962,
"loss": 1.109,
"step": 250
},
{
"epoch": 0.62,
"learning_rate": 0.00026109123434704825,
"loss": 1.1276,
"step": 251
},
{
"epoch": 0.62,
"learning_rate": 0.0002608228980322003,
"loss": 1.1385,
"step": 252
},
{
"epoch": 0.62,
"learning_rate": 0.0002605545617173524,
"loss": 1.0746,
"step": 253
},
{
"epoch": 0.63,
"learning_rate": 0.0002602862254025045,
"loss": 1.0876,
"step": 254
},
{
"epoch": 0.63,
"learning_rate": 0.0002600178890876565,
"loss": 1.0698,
"step": 255
},
{
"epoch": 0.63,
"learning_rate": 0.0002597495527728086,
"loss": 1.1184,
"step": 256
},
{
"epoch": 0.63,
"learning_rate": 0.0002594812164579606,
"loss": 1.1368,
"step": 257
},
{
"epoch": 0.63,
"learning_rate": 0.00025921288014311267,
"loss": 1.1468,
"step": 258
},
{
"epoch": 0.64,
"learning_rate": 0.00025894454382826475,
"loss": 1.1032,
"step": 259
},
{
"epoch": 0.64,
"learning_rate": 0.00025867620751341677,
"loss": 1.1172,
"step": 260
},
{
"epoch": 0.64,
"learning_rate": 0.00025840787119856885,
"loss": 1.1366,
"step": 261
},
{
"epoch": 0.64,
"learning_rate": 0.0002581395348837209,
"loss": 1.1089,
"step": 262
},
{
"epoch": 0.65,
"learning_rate": 0.000257871198568873,
"loss": 1.1322,
"step": 263
},
{
"epoch": 0.65,
"learning_rate": 0.000257602862254025,
"loss": 1.1345,
"step": 264
},
{
"epoch": 0.65,
"learning_rate": 0.0002573345259391771,
"loss": 1.1979,
"step": 265
},
{
"epoch": 0.65,
"learning_rate": 0.0002570661896243291,
"loss": 1.1247,
"step": 266
},
{
"epoch": 0.66,
"learning_rate": 0.0002567978533094812,
"loss": 1.1441,
"step": 267
},
{
"epoch": 0.66,
"learning_rate": 0.00025652951699463327,
"loss": 1.1256,
"step": 268
},
{
"epoch": 0.66,
"learning_rate": 0.0002562611806797853,
"loss": 1.0387,
"step": 269
},
{
"epoch": 0.66,
"learning_rate": 0.00025599284436493737,
"loss": 1.1434,
"step": 270
},
{
"epoch": 0.67,
"learning_rate": 0.0002557245080500894,
"loss": 1.1023,
"step": 271
},
{
"epoch": 0.67,
"learning_rate": 0.00025545617173524147,
"loss": 1.1393,
"step": 272
},
{
"epoch": 0.67,
"learning_rate": 0.00025518783542039354,
"loss": 1.195,
"step": 273
},
{
"epoch": 0.67,
"learning_rate": 0.0002549194991055456,
"loss": 1.1038,
"step": 274
},
{
"epoch": 0.68,
"learning_rate": 0.00025465116279069764,
"loss": 1.1721,
"step": 275
},
{
"epoch": 0.68,
"learning_rate": 0.0002543828264758497,
"loss": 1.1387,
"step": 276
},
{
"epoch": 0.68,
"learning_rate": 0.0002541144901610018,
"loss": 1.1454,
"step": 277
},
{
"epoch": 0.68,
"learning_rate": 0.0002538461538461538,
"loss": 1.0933,
"step": 278
},
{
"epoch": 0.69,
"learning_rate": 0.0002535778175313059,
"loss": 1.1365,
"step": 279
},
{
"epoch": 0.69,
"learning_rate": 0.0002533094812164579,
"loss": 1.082,
"step": 280
},
{
"epoch": 0.69,
"learning_rate": 0.00025304114490161,
"loss": 1.1628,
"step": 281
},
{
"epoch": 0.69,
"learning_rate": 0.00025277280858676207,
"loss": 1.0917,
"step": 282
},
{
"epoch": 0.7,
"learning_rate": 0.00025250447227191414,
"loss": 1.0876,
"step": 283
},
{
"epoch": 0.7,
"learning_rate": 0.00025223613595706616,
"loss": 1.0861,
"step": 284
},
{
"epoch": 0.7,
"learning_rate": 0.00025196779964221824,
"loss": 1.0868,
"step": 285
},
{
"epoch": 0.7,
"learning_rate": 0.00025169946332737026,
"loss": 1.0905,
"step": 286
},
{
"epoch": 0.71,
"learning_rate": 0.00025143112701252234,
"loss": 1.1386,
"step": 287
},
{
"epoch": 0.71,
"learning_rate": 0.0002511627906976744,
"loss": 1.0859,
"step": 288
},
{
"epoch": 0.71,
"learning_rate": 0.00025089445438282644,
"loss": 1.1498,
"step": 289
},
{
"epoch": 0.71,
"learning_rate": 0.0002506261180679785,
"loss": 1.1066,
"step": 290
},
{
"epoch": 0.72,
"learning_rate": 0.00025035778175313053,
"loss": 1.0394,
"step": 291
},
{
"epoch": 0.72,
"learning_rate": 0.0002500894454382826,
"loss": 1.0937,
"step": 292
},
{
"epoch": 0.72,
"learning_rate": 0.0002498211091234347,
"loss": 1.0588,
"step": 293
},
{
"epoch": 0.72,
"learning_rate": 0.00024955277280858676,
"loss": 1.1741,
"step": 294
},
{
"epoch": 0.73,
"learning_rate": 0.0002492844364937388,
"loss": 1.1132,
"step": 295
},
{
"epoch": 0.73,
"learning_rate": 0.00024901610017889086,
"loss": 1.1105,
"step": 296
},
{
"epoch": 0.73,
"learning_rate": 0.00024874776386404294,
"loss": 1.0776,
"step": 297
},
{
"epoch": 0.73,
"learning_rate": 0.00024847942754919496,
"loss": 1.0707,
"step": 298
},
{
"epoch": 0.74,
"learning_rate": 0.00024821109123434703,
"loss": 1.0268,
"step": 299
},
{
"epoch": 0.74,
"learning_rate": 0.00024794275491949906,
"loss": 1.1174,
"step": 300
},
{
"epoch": 0.74,
"learning_rate": 0.00024767441860465113,
"loss": 1.0998,
"step": 301
},
{
"epoch": 0.74,
"learning_rate": 0.0002474060822898032,
"loss": 1.0585,
"step": 302
},
{
"epoch": 0.75,
"learning_rate": 0.0002471377459749553,
"loss": 1.0848,
"step": 303
},
{
"epoch": 0.75,
"learning_rate": 0.0002468694096601073,
"loss": 1.0092,
"step": 304
},
{
"epoch": 0.75,
"learning_rate": 0.0002466010733452594,
"loss": 1.1124,
"step": 305
},
{
"epoch": 0.75,
"learning_rate": 0.00024633273703041146,
"loss": 1.1583,
"step": 306
},
{
"epoch": 0.76,
"learning_rate": 0.0002460644007155635,
"loss": 1.0845,
"step": 307
},
{
"epoch": 0.76,
"learning_rate": 0.00024579606440071556,
"loss": 1.0651,
"step": 308
},
{
"epoch": 0.76,
"learning_rate": 0.0002455277280858676,
"loss": 1.1906,
"step": 309
},
{
"epoch": 0.76,
"learning_rate": 0.00024525939177101966,
"loss": 1.0914,
"step": 310
},
{
"epoch": 0.77,
"learning_rate": 0.0002449910554561717,
"loss": 1.1395,
"step": 311
},
{
"epoch": 0.77,
"learning_rate": 0.00024472271914132375,
"loss": 1.0841,
"step": 312
},
{
"epoch": 0.77,
"learning_rate": 0.00024445438282647583,
"loss": 1.0969,
"step": 313
},
{
"epoch": 0.77,
"learning_rate": 0.0002441860465116279,
"loss": 1.0996,
"step": 314
},
{
"epoch": 0.78,
"learning_rate": 0.00024391771019677995,
"loss": 1.0965,
"step": 315
},
{
"epoch": 0.78,
"learning_rate": 0.000243649373881932,
"loss": 1.1313,
"step": 316
},
{
"epoch": 0.78,
"learning_rate": 0.00024338103756708408,
"loss": 1.0488,
"step": 317
},
{
"epoch": 0.78,
"learning_rate": 0.0002431127012522361,
"loss": 1.0903,
"step": 318
},
{
"epoch": 0.79,
"learning_rate": 0.00024284436493738818,
"loss": 1.1083,
"step": 319
},
{
"epoch": 0.79,
"learning_rate": 0.00024257602862254023,
"loss": 1.0908,
"step": 320
},
{
"epoch": 0.79,
"learning_rate": 0.0002423076923076923,
"loss": 1.1991,
"step": 321
},
{
"epoch": 0.79,
"learning_rate": 0.00024203935599284433,
"loss": 1.1049,
"step": 322
},
{
"epoch": 0.79,
"learning_rate": 0.0002417710196779964,
"loss": 1.0764,
"step": 323
},
{
"epoch": 0.8,
"learning_rate": 0.00024150268336314848,
"loss": 1.1085,
"step": 324
},
{
"epoch": 0.8,
"learning_rate": 0.00024123434704830053,
"loss": 1.0947,
"step": 325
},
{
"epoch": 0.8,
"learning_rate": 0.00024096601073345258,
"loss": 1.0968,
"step": 326
},
{
"epoch": 0.8,
"learning_rate": 0.00024069767441860462,
"loss": 1.1083,
"step": 327
},
{
"epoch": 0.81,
"learning_rate": 0.0002404293381037567,
"loss": 1.0891,
"step": 328
},
{
"epoch": 0.81,
"learning_rate": 0.00024016100178890872,
"loss": 1.0765,
"step": 329
},
{
"epoch": 0.81,
"learning_rate": 0.0002398926654740608,
"loss": 1.0876,
"step": 330
},
{
"epoch": 0.81,
"learning_rate": 0.00023962432915921285,
"loss": 1.0825,
"step": 331
},
{
"epoch": 0.82,
"learning_rate": 0.00023935599284436492,
"loss": 1.0511,
"step": 332
},
{
"epoch": 0.82,
"learning_rate": 0.00023908765652951695,
"loss": 1.1108,
"step": 333
},
{
"epoch": 0.82,
"learning_rate": 0.00023881932021466902,
"loss": 1.1657,
"step": 334
},
{
"epoch": 0.82,
"learning_rate": 0.0002385509838998211,
"loss": 1.0432,
"step": 335
},
{
"epoch": 0.83,
"learning_rate": 0.00023828264758497315,
"loss": 1.0239,
"step": 336
},
{
"epoch": 0.83,
"learning_rate": 0.00023801431127012522,
"loss": 1.0903,
"step": 337
},
{
"epoch": 0.83,
"learning_rate": 0.00023774597495527724,
"loss": 1.1875,
"step": 338
},
{
"epoch": 0.83,
"learning_rate": 0.00023747763864042932,
"loss": 1.105,
"step": 339
},
{
"epoch": 0.84,
"learning_rate": 0.00023720930232558137,
"loss": 1.1065,
"step": 340
},
{
"epoch": 0.84,
"learning_rate": 0.00023694096601073345,
"loss": 1.078,
"step": 341
},
{
"epoch": 0.84,
"learning_rate": 0.00023667262969588547,
"loss": 1.0853,
"step": 342
},
{
"epoch": 0.84,
"learning_rate": 0.00023640429338103754,
"loss": 1.1088,
"step": 343
},
{
"epoch": 0.85,
"learning_rate": 0.00023613595706618962,
"loss": 1.1454,
"step": 344
},
{
"epoch": 0.85,
"learning_rate": 0.00023586762075134167,
"loss": 1.0618,
"step": 345
},
{
"epoch": 0.85,
"learning_rate": 0.00023559928443649372,
"loss": 1.0485,
"step": 346
},
{
"epoch": 0.85,
"learning_rate": 0.00023533094812164577,
"loss": 1.1067,
"step": 347
},
{
"epoch": 0.86,
"learning_rate": 0.00023506261180679784,
"loss": 1.1629,
"step": 348
},
{
"epoch": 0.86,
"learning_rate": 0.0002347942754919499,
"loss": 1.1023,
"step": 349
},
{
"epoch": 0.86,
"learning_rate": 0.00023452593917710194,
"loss": 1.0401,
"step": 350
},
{
"epoch": 0.86,
"learning_rate": 0.000234257602862254,
"loss": 1.1236,
"step": 351
},
{
"epoch": 0.87,
"learning_rate": 0.00023398926654740607,
"loss": 1.051,
"step": 352
},
{
"epoch": 0.87,
"learning_rate": 0.00023372093023255814,
"loss": 1.1537,
"step": 353
},
{
"epoch": 0.87,
"learning_rate": 0.00023345259391771016,
"loss": 1.0741,
"step": 354
},
{
"epoch": 0.87,
"learning_rate": 0.00023318425760286224,
"loss": 1.1004,
"step": 355
},
{
"epoch": 0.88,
"learning_rate": 0.0002329159212880143,
"loss": 1.1571,
"step": 356
},
{
"epoch": 0.88,
"learning_rate": 0.00023264758497316637,
"loss": 1.1061,
"step": 357
},
{
"epoch": 0.88,
"learning_rate": 0.0002323792486583184,
"loss": 1.1042,
"step": 358
},
{
"epoch": 0.88,
"learning_rate": 0.00023211091234347046,
"loss": 1.1012,
"step": 359
},
{
"epoch": 0.89,
"learning_rate": 0.0002318425760286225,
"loss": 1.1164,
"step": 360
},
{
"epoch": 0.89,
"learning_rate": 0.0002315742397137746,
"loss": 1.016,
"step": 361
},
{
"epoch": 0.89,
"learning_rate": 0.00023130590339892664,
"loss": 1.1053,
"step": 362
},
{
"epoch": 0.89,
"learning_rate": 0.0002310375670840787,
"loss": 1.106,
"step": 363
},
{
"epoch": 0.9,
"learning_rate": 0.00023076923076923076,
"loss": 1.0676,
"step": 364
},
{
"epoch": 0.9,
"learning_rate": 0.0002305008944543828,
"loss": 1.0848,
"step": 365
},
{
"epoch": 0.9,
"learning_rate": 0.00023023255813953486,
"loss": 1.0142,
"step": 366
},
{
"epoch": 0.9,
"learning_rate": 0.0002299642218246869,
"loss": 1.0715,
"step": 367
},
{
"epoch": 0.91,
"learning_rate": 0.000229695885509839,
"loss": 1.1395,
"step": 368
},
{
"epoch": 0.91,
"learning_rate": 0.00022942754919499104,
"loss": 1.1241,
"step": 369
},
{
"epoch": 0.91,
"learning_rate": 0.00022915921288014308,
"loss": 1.1105,
"step": 370
},
{
"epoch": 0.91,
"learning_rate": 0.00022889087656529513,
"loss": 1.0588,
"step": 371
},
{
"epoch": 0.92,
"learning_rate": 0.0002286225402504472,
"loss": 1.071,
"step": 372
},
{
"epoch": 0.92,
"learning_rate": 0.00022835420393559929,
"loss": 1.1485,
"step": 373
},
{
"epoch": 0.92,
"learning_rate": 0.0002280858676207513,
"loss": 1.1344,
"step": 374
},
{
"epoch": 0.92,
"learning_rate": 0.00022781753130590338,
"loss": 1.2254,
"step": 375
},
{
"epoch": 0.93,
"learning_rate": 0.00022754919499105543,
"loss": 1.1192,
"step": 376
},
{
"epoch": 0.93,
"learning_rate": 0.0002272808586762075,
"loss": 1.0923,
"step": 377
},
{
"epoch": 0.93,
"learning_rate": 0.00022701252236135953,
"loss": 1.0844,
"step": 378
},
{
"epoch": 0.93,
"learning_rate": 0.0002267441860465116,
"loss": 1.1138,
"step": 379
},
{
"epoch": 0.94,
"learning_rate": 0.00022647584973166366,
"loss": 1.1149,
"step": 380
},
{
"epoch": 0.94,
"learning_rate": 0.00022620751341681573,
"loss": 1.1026,
"step": 381
},
{
"epoch": 0.94,
"learning_rate": 0.00022593917710196778,
"loss": 1.123,
"step": 382
},
{
"epoch": 0.94,
"learning_rate": 0.00022567084078711983,
"loss": 1.0526,
"step": 383
},
{
"epoch": 0.95,
"learning_rate": 0.0002254025044722719,
"loss": 1.1391,
"step": 384
},
{
"epoch": 0.95,
"learning_rate": 0.00022513416815742396,
"loss": 1.0788,
"step": 385
},
{
"epoch": 0.95,
"learning_rate": 0.000224865831842576,
"loss": 1.0645,
"step": 386
},
{
"epoch": 0.95,
"learning_rate": 0.00022459749552772805,
"loss": 1.0788,
"step": 387
},
{
"epoch": 0.95,
"learning_rate": 0.00022432915921288013,
"loss": 1.1153,
"step": 388
},
{
"epoch": 0.96,
"learning_rate": 0.00022406082289803218,
"loss": 1.1073,
"step": 389
},
{
"epoch": 0.96,
"learning_rate": 0.00022379248658318423,
"loss": 1.0972,
"step": 390
},
{
"epoch": 0.96,
"learning_rate": 0.0002235241502683363,
"loss": 1.0866,
"step": 391
},
{
"epoch": 0.96,
"learning_rate": 0.00022325581395348835,
"loss": 1.1452,
"step": 392
},
{
"epoch": 0.97,
"learning_rate": 0.00022298747763864043,
"loss": 1.0327,
"step": 393
},
{
"epoch": 0.97,
"learning_rate": 0.00022271914132379245,
"loss": 1.0988,
"step": 394
},
{
"epoch": 0.97,
"learning_rate": 0.00022245080500894453,
"loss": 1.0475,
"step": 395
},
{
"epoch": 0.97,
"learning_rate": 0.00022218246869409658,
"loss": 1.1704,
"step": 396
},
{
"epoch": 0.98,
"learning_rate": 0.00022191413237924865,
"loss": 1.0807,
"step": 397
},
{
"epoch": 0.98,
"learning_rate": 0.00022164579606440067,
"loss": 1.1002,
"step": 398
},
{
"epoch": 0.98,
"learning_rate": 0.00022137745974955275,
"loss": 1.047,
"step": 399
},
{
"epoch": 0.98,
"learning_rate": 0.00022110912343470483,
"loss": 1.0945,
"step": 400
}
],
"max_steps": 1218,
"num_train_epochs": 3,
"total_flos": 4.75494755598336e+16,
"trial_name": null,
"trial_params": null
}