huranokuma committed on
Commit c9df23c
1 Parent(s): 09fd33b

End of training

Files changed (3):
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +1245 -33
all_results.json CHANGED
@@ -1,5 +1,5 @@
  {
- "epoch": 5.0,
+ "epoch": 2.0,
  "eval_accuracy": 0.5895981374153325,
  "eval_loss": 1.842421054840088,
  "eval_runtime": 6167.0931,
@@ -7,9 +7,9 @@
  "eval_samples_per_second": 21.881,
  "eval_steps_per_second": 10.94,
  "perplexity": 6.311800993960819,
- "train_loss": 1.883479554409491,
- "train_runtime": 65223.4395,
+ "train_loss": 0.612412237187797,
+ "train_runtime": 12901.2681,
  "train_samples": 134942,
- "train_samples_per_second": 10.345,
- "train_steps_per_second": 2.586
+ "train_samples_per_second": 20.919,
+ "train_steps_per_second": 20.919
  }
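
For reference, the reported perplexity matches exp(eval_loss), the usual definition in the Hugging Face language-modeling example scripts; a quick illustrative check in Python:

    import math

    eval_loss = 1.842421054840088     # from all_results.json
    perplexity = math.exp(eval_loss)  # perplexity = exp(cross-entropy eval loss)
    print(perplexity)                 # ~6.311800993960819, as reported above
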
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 5.0,
- "train_loss": 1.883479554409491,
- "train_runtime": 65223.4395,
+ "epoch": 2.0,
+ "train_loss": 0.612412237187797,
+ "train_runtime": 12901.2681,
  "train_samples": 134942,
- "train_samples_per_second": 10.345,
- "train_steps_per_second": 2.586
+ "train_samples_per_second": 20.919,
+ "train_steps_per_second": 20.919
  }
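
The updated throughput figures are mutually consistent: 2 epochs over 134,942 samples in roughly 12,901 seconds works out to about 20.9 samples per second, and since the run ends at global_step 269,884 (2 x 134,942), the steps-per-second value is the same. A small sanity-check sketch:

    # Sanity check of the throughput reported in train_results.json.
    train_samples = 134942
    num_epochs = 2.0
    train_runtime = 12901.2681  # seconds

    samples_per_second = num_epochs * train_samples / train_runtime
    print(round(samples_per_second, 3))  # ~20.919, matching the reported value
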
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 5.0,
- "global_step": 168680,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1988,60 +1988,1272 @@
  "step": 165000
  },
  {
- "epoch": 4.91,
- "learning_rate": 9.426132321555609e-07,
- "loss": 1.7517,
  "step": 165500
  },
  {
- "epoch": 4.92,
- "learning_rate": 7.944036044581456e-07,
- "loss": 1.7506,
  "step": 166000
  },
  {
- "epoch": 4.94,
- "learning_rate": 6.461939767607305e-07,
- "loss": 1.7592,
  "step": 166500
  },
  {
- "epoch": 4.95,
- "learning_rate": 4.979843490633152e-07,
- "loss": 1.7486,
  "step": 167000
  },
  {
- "epoch": 4.97,
- "learning_rate": 3.4977472136589993e-07,
- "loss": 1.7369,
  "step": 167500
  },
  {
- "epoch": 4.98,
- "learning_rate": 2.0156509366848474e-07,
- "loss": 1.7659,
  "step": 168000
  },
  {
- "epoch": 4.99,
- "learning_rate": 5.3355465971069484e-08,
- "loss": 1.747,
  "step": 168500
  },
  {
- "epoch": 5.0,
- "step": 168680,
- "total_flos": 6.266036456049869e+17,
- "train_loss": 1.883479554409491,
- "train_runtime": 65223.4395,
- "train_samples_per_second": 10.345,
- "train_steps_per_second": 2.586
  }
  ],
- "max_steps": 168680,
- "num_train_epochs": 5,
- "total_flos": 6.266036456049869e+17,
  "trial_name": null,
  "trial_params": null
  }
 
  {
  "best_metric": null,
  "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "global_step": 269884,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
 
1988
  "step": 165000
1989
  },
1990
  {
1991
+ "epoch": 1.23,
1992
+ "learning_rate": 3.7735471535919134e-05,
1993
+ "loss": 1.7432,
1994
  "step": 165500
1995
  },
1996
  {
1997
+ "epoch": 1.23,
1998
+ "learning_rate": 3.76984185798343e-05,
1999
+ "loss": 1.7559,
2000
  "step": 166000
2001
  },
2002
  {
2003
+ "epoch": 1.23,
2004
+ "learning_rate": 3.766136562374947e-05,
2005
+ "loss": 1.7691,
2006
  "step": 166500
2007
  },
2008
  {
2009
+ "epoch": 1.24,
2010
+ "learning_rate": 3.762431266766463e-05,
2011
+ "loss": 1.7535,
2012
  "step": 167000
2013
  },
2014
  {
2015
+ "epoch": 1.24,
2016
+ "learning_rate": 3.7587259711579795e-05,
2017
+ "loss": 1.7727,
2018
  "step": 167500
2019
  },
2020
  {
2021
+ "epoch": 1.24,
2022
+ "learning_rate": 3.755020675549495e-05,
2023
+ "loss": 1.7932,
2024
  "step": 168000
2025
  },
2026
  {
2027
+ "epoch": 1.25,
2028
+ "learning_rate": 3.7513153799410116e-05,
2029
+ "loss": 1.7775,
2030
  "step": 168500
2031
  },
2032
  {
2033
+ "epoch": 1.25,
2034
+ "learning_rate": 3.7476100843325286e-05,
2035
+ "loss": 1.7774,
2036
+ "step": 169000
2037
+ },
2038
+ {
2039
+ "epoch": 1.26,
2040
+ "learning_rate": 3.743904788724045e-05,
2041
+ "loss": 1.7898,
2042
+ "step": 169500
2043
+ },
2044
+ {
2045
+ "epoch": 1.26,
2046
+ "learning_rate": 3.7401994931155607e-05,
2047
+ "loss": 1.7823,
2048
+ "step": 170000
2049
+ },
2050
+ {
2051
+ "epoch": 1.26,
2052
+ "learning_rate": 3.736494197507077e-05,
2053
+ "loss": 1.8264,
2054
+ "step": 170500
2055
+ },
2056
+ {
2057
+ "epoch": 1.27,
2058
+ "learning_rate": 3.7327889018985934e-05,
2059
+ "loss": 1.792,
2060
+ "step": 171000
2061
+ },
2062
+ {
2063
+ "epoch": 1.27,
2064
+ "learning_rate": 3.7290836062901104e-05,
2065
+ "loss": 1.8131,
2066
+ "step": 171500
2067
+ },
2068
+ {
2069
+ "epoch": 1.27,
2070
+ "learning_rate": 3.725378310681626e-05,
2071
+ "loss": 1.8023,
2072
+ "step": 172000
2073
+ },
2074
+ {
2075
+ "epoch": 1.28,
2076
+ "learning_rate": 3.7216730150731425e-05,
2077
+ "loss": 1.8306,
2078
+ "step": 172500
2079
+ },
2080
+ {
2081
+ "epoch": 1.28,
2082
+ "learning_rate": 3.717967719464659e-05,
2083
+ "loss": 1.7956,
2084
+ "step": 173000
2085
+ },
2086
+ {
2087
+ "epoch": 1.29,
2088
+ "learning_rate": 3.714262423856175e-05,
2089
+ "loss": 1.8114,
2090
+ "step": 173500
2091
+ },
2092
+ {
2093
+ "epoch": 1.29,
2094
+ "learning_rate": 3.710557128247692e-05,
2095
+ "loss": 1.8154,
2096
+ "step": 174000
2097
+ },
2098
+ {
2099
+ "epoch": 1.29,
2100
+ "learning_rate": 3.706851832639208e-05,
2101
+ "loss": 1.8034,
2102
+ "step": 174500
2103
+ },
2104
+ {
2105
+ "epoch": 1.3,
2106
+ "learning_rate": 3.703146537030724e-05,
2107
+ "loss": 1.8171,
2108
+ "step": 175000
2109
+ },
2110
+ {
2111
+ "epoch": 1.3,
2112
+ "learning_rate": 3.699441241422241e-05,
2113
+ "loss": 1.8111,
2114
+ "step": 175500
2115
+ },
2116
+ {
2117
+ "epoch": 1.3,
2118
+ "learning_rate": 3.695735945813757e-05,
2119
+ "loss": 1.8063,
2120
+ "step": 176000
2121
+ },
2122
+ {
2123
+ "epoch": 1.31,
2124
+ "learning_rate": 3.6920306502052734e-05,
2125
+ "loss": 1.828,
2126
+ "step": 176500
2127
+ },
2128
+ {
2129
+ "epoch": 1.31,
2130
+ "learning_rate": 3.68832535459679e-05,
2131
+ "loss": 1.8398,
2132
+ "step": 177000
2133
+ },
2134
+ {
2135
+ "epoch": 1.32,
2136
+ "learning_rate": 3.684620058988306e-05,
2137
+ "loss": 1.8362,
2138
+ "step": 177500
2139
+ },
2140
+ {
2141
+ "epoch": 1.32,
2142
+ "learning_rate": 3.6809147633798225e-05,
2143
+ "loss": 1.8502,
2144
+ "step": 178000
2145
+ },
2146
+ {
2147
+ "epoch": 1.32,
2148
+ "learning_rate": 3.6772094677713396e-05,
2149
+ "loss": 1.8581,
2150
+ "step": 178500
2151
+ },
2152
+ {
2153
+ "epoch": 1.33,
2154
+ "learning_rate": 3.673504172162855e-05,
2155
+ "loss": 1.8401,
2156
+ "step": 179000
2157
+ },
2158
+ {
2159
+ "epoch": 1.33,
2160
+ "learning_rate": 3.6697988765543716e-05,
2161
+ "loss": 1.8324,
2162
+ "step": 179500
2163
+ },
2164
+ {
2165
+ "epoch": 1.33,
2166
+ "learning_rate": 3.666093580945888e-05,
2167
+ "loss": 1.8523,
2168
+ "step": 180000
2169
+ },
2170
+ {
2171
+ "epoch": 1.34,
2172
+ "learning_rate": 1.6559707133435105e-05,
2173
+ "loss": 1.817,
2174
+ "step": 180500
2175
+ },
2176
+ {
2177
+ "epoch": 1.34,
2178
+ "learning_rate": 1.6467074743223014e-05,
2179
+ "loss": 1.8734,
2180
+ "step": 181000
2181
+ },
2182
+ {
2183
+ "epoch": 1.35,
2184
+ "learning_rate": 1.6374442353010923e-05,
2185
+ "loss": 1.812,
2186
+ "step": 181500
2187
+ },
2188
+ {
2189
+ "epoch": 1.35,
2190
+ "learning_rate": 1.6281809962798833e-05,
2191
+ "loss": 1.8239,
2192
+ "step": 182000
2193
+ },
2194
+ {
2195
+ "epoch": 1.35,
2196
+ "learning_rate": 1.6189177572586742e-05,
2197
+ "loss": 1.8156,
2198
+ "step": 182500
2199
+ },
2200
+ {
2201
+ "epoch": 1.36,
2202
+ "learning_rate": 1.609654518237465e-05,
2203
+ "loss": 1.8019,
2204
+ "step": 183000
2205
+ },
2206
+ {
2207
+ "epoch": 1.36,
2208
+ "learning_rate": 1.600391279216256e-05,
2209
+ "loss": 1.8153,
2210
+ "step": 183500
2211
+ },
2212
+ {
2213
+ "epoch": 1.36,
2214
+ "learning_rate": 1.591128040195047e-05,
2215
+ "loss": 1.8063,
2216
+ "step": 184000
2217
+ },
2218
+ {
2219
+ "epoch": 1.37,
2220
+ "learning_rate": 1.581864801173838e-05,
2221
+ "loss": 1.8251,
2222
+ "step": 184500
2223
+ },
2224
+ {
2225
+ "epoch": 1.37,
2226
+ "learning_rate": 1.5726015621526284e-05,
2227
+ "loss": 1.8205,
2228
+ "step": 185000
2229
+ },
2230
+ {
2231
+ "epoch": 1.37,
2232
+ "learning_rate": 1.5633383231314193e-05,
2233
+ "loss": 1.805,
2234
+ "step": 185500
2235
+ },
2236
+ {
2237
+ "epoch": 1.38,
2238
+ "learning_rate": 1.5540750841102102e-05,
2239
+ "loss": 1.839,
2240
+ "step": 186000
2241
+ },
2242
+ {
2243
+ "epoch": 1.38,
2244
+ "learning_rate": 1.544811845089001e-05,
2245
+ "loss": 1.8183,
2246
+ "step": 186500
2247
+ },
2248
+ {
2249
+ "epoch": 1.39,
2250
+ "learning_rate": 1.535548606067792e-05,
2251
+ "loss": 1.8262,
2252
+ "step": 187000
2253
+ },
2254
+ {
2255
+ "epoch": 1.39,
2256
+ "learning_rate": 1.526285367046583e-05,
2257
+ "loss": 1.7968,
2258
+ "step": 187500
2259
+ },
2260
+ {
2261
+ "epoch": 1.39,
2262
+ "learning_rate": 1.5170221280253737e-05,
2263
+ "loss": 1.822,
2264
+ "step": 188000
2265
+ },
2266
+ {
2267
+ "epoch": 1.4,
2268
+ "learning_rate": 1.507758889004165e-05,
2269
+ "loss": 1.8239,
2270
+ "step": 188500
2271
+ },
2272
+ {
2273
+ "epoch": 1.4,
2274
+ "learning_rate": 1.4984956499829559e-05,
2275
+ "loss": 1.8336,
2276
+ "step": 189000
2277
+ },
2278
+ {
2279
+ "epoch": 1.4,
2280
+ "learning_rate": 1.4892324109617466e-05,
2281
+ "loss": 1.8248,
2282
+ "step": 189500
2283
+ },
2284
+ {
2285
+ "epoch": 1.41,
2286
+ "learning_rate": 1.4799691719405376e-05,
2287
+ "loss": 1.8142,
2288
+ "step": 190000
2289
+ },
2290
+ {
2291
+ "epoch": 1.41,
2292
+ "learning_rate": 1.4707059329193285e-05,
2293
+ "loss": 1.8187,
2294
+ "step": 190500
2295
+ },
2296
+ {
2297
+ "epoch": 1.42,
2298
+ "learning_rate": 1.4614426938981194e-05,
2299
+ "loss": 1.8219,
2300
+ "step": 191000
2301
+ },
2302
+ {
2303
+ "epoch": 1.42,
2304
+ "learning_rate": 1.4521794548769101e-05,
2305
+ "loss": 1.8274,
2306
+ "step": 191500
2307
+ },
2308
+ {
2309
+ "epoch": 1.42,
2310
+ "learning_rate": 1.442916215855701e-05,
2311
+ "loss": 1.7992,
2312
+ "step": 192000
2313
+ },
2314
+ {
2315
+ "epoch": 1.43,
2316
+ "learning_rate": 1.433652976834492e-05,
2317
+ "loss": 1.8345,
2318
+ "step": 192500
2319
+ },
2320
+ {
2321
+ "epoch": 1.43,
2322
+ "learning_rate": 1.4243897378132829e-05,
2323
+ "loss": 1.8225,
2324
+ "step": 193000
2325
+ },
2326
+ {
2327
+ "epoch": 1.43,
2328
+ "learning_rate": 1.4151264987920736e-05,
2329
+ "loss": 1.8417,
2330
+ "step": 193500
2331
+ },
2332
+ {
2333
+ "epoch": 1.44,
2334
+ "learning_rate": 1.4058632597708645e-05,
2335
+ "loss": 1.8025,
2336
+ "step": 194000
2337
+ },
2338
+ {
2339
+ "epoch": 1.44,
2340
+ "learning_rate": 1.3966000207496555e-05,
2341
+ "loss": 1.818,
2342
+ "step": 194500
2343
+ },
2344
+ {
2345
+ "epoch": 1.45,
2346
+ "learning_rate": 1.3873367817284464e-05,
2347
+ "loss": 1.803,
2348
+ "step": 195000
2349
+ },
2350
+ {
2351
+ "epoch": 1.45,
2352
+ "learning_rate": 1.3780735427072371e-05,
2353
+ "loss": 1.8012,
2354
+ "step": 195500
2355
+ },
2356
+ {
2357
+ "epoch": 1.45,
2358
+ "learning_rate": 1.368810303686028e-05,
2359
+ "loss": 1.8183,
2360
+ "step": 196000
2361
+ },
2362
+ {
2363
+ "epoch": 1.46,
2364
+ "learning_rate": 1.359547064664819e-05,
2365
+ "loss": 1.822,
2366
+ "step": 196500
2367
+ },
2368
+ {
2369
+ "epoch": 1.46,
2370
+ "learning_rate": 1.3502838256436099e-05,
2371
+ "loss": 1.8482,
2372
+ "step": 197000
2373
+ },
2374
+ {
2375
+ "epoch": 1.46,
2376
+ "learning_rate": 1.3410205866224008e-05,
2377
+ "loss": 1.8322,
2378
+ "step": 197500
2379
+ },
2380
+ {
2381
+ "epoch": 1.47,
2382
+ "learning_rate": 1.3317573476011915e-05,
2383
+ "loss": 1.8187,
2384
+ "step": 198000
2385
+ },
2386
+ {
2387
+ "epoch": 1.47,
2388
+ "learning_rate": 1.3224941085799824e-05,
2389
+ "loss": 1.8166,
2390
+ "step": 198500
2391
+ },
2392
+ {
2393
+ "epoch": 1.47,
2394
+ "learning_rate": 1.3132308695587734e-05,
2395
+ "loss": 1.8481,
2396
+ "step": 199000
2397
+ },
2398
+ {
2399
+ "epoch": 1.48,
2400
+ "learning_rate": 1.3039676305375643e-05,
2401
+ "loss": 1.8319,
2402
+ "step": 199500
2403
+ },
2404
+ {
2405
+ "epoch": 1.48,
2406
+ "learning_rate": 1.2947043915163554e-05,
2407
+ "loss": 1.8159,
2408
+ "step": 200000
2409
+ },
2410
+ {
2411
+ "epoch": 1.49,
2412
+ "learning_rate": 1.2854411524951463e-05,
2413
+ "loss": 1.8474,
2414
+ "step": 200500
2415
+ },
2416
+ {
2417
+ "epoch": 1.49,
2418
+ "learning_rate": 1.2761779134739372e-05,
2419
+ "loss": 1.8305,
2420
+ "step": 201000
2421
+ },
2422
+ {
2423
+ "epoch": 1.49,
2424
+ "learning_rate": 1.266914674452728e-05,
2425
+ "loss": 1.827,
2426
+ "step": 201500
2427
+ },
2428
+ {
2429
+ "epoch": 1.5,
2430
+ "learning_rate": 1.2576514354315189e-05,
2431
+ "loss": 1.8219,
2432
+ "step": 202000
2433
+ },
2434
+ {
2435
+ "epoch": 1.5,
2436
+ "learning_rate": 1.2483881964103098e-05,
2437
+ "loss": 1.8142,
2438
+ "step": 202500
2439
+ },
2440
+ {
2441
+ "epoch": 1.5,
2442
+ "learning_rate": 1.2391249573891007e-05,
2443
+ "loss": 1.8161,
2444
+ "step": 203000
2445
+ },
2446
+ {
2447
+ "epoch": 1.51,
2448
+ "learning_rate": 1.2298617183678914e-05,
2449
+ "loss": 1.8336,
2450
+ "step": 203500
2451
+ },
2452
+ {
2453
+ "epoch": 1.51,
2454
+ "learning_rate": 1.2205984793466823e-05,
2455
+ "loss": 1.816,
2456
+ "step": 204000
2457
+ },
2458
+ {
2459
+ "epoch": 1.52,
2460
+ "learning_rate": 1.2113352403254733e-05,
2461
+ "loss": 1.8304,
2462
+ "step": 204500
2463
+ },
2464
+ {
2465
+ "epoch": 1.52,
2466
+ "learning_rate": 1.2020720013042642e-05,
2467
+ "loss": 1.8101,
2468
+ "step": 205000
2469
+ },
2470
+ {
2471
+ "epoch": 1.52,
2472
+ "learning_rate": 1.192808762283055e-05,
2473
+ "loss": 1.8219,
2474
+ "step": 205500
2475
+ },
2476
+ {
2477
+ "epoch": 1.53,
2478
+ "learning_rate": 1.1835455232618458e-05,
2479
+ "loss": 1.8271,
2480
+ "step": 206000
2481
+ },
2482
+ {
2483
+ "epoch": 1.53,
2484
+ "learning_rate": 1.1742822842406368e-05,
2485
+ "loss": 1.8542,
2486
+ "step": 206500
2487
+ },
2488
+ {
2489
+ "epoch": 1.53,
2490
+ "learning_rate": 1.1650190452194277e-05,
2491
+ "loss": 1.8504,
2492
+ "step": 207000
2493
+ },
2494
+ {
2495
+ "epoch": 1.54,
2496
+ "learning_rate": 1.1557558061982184e-05,
2497
+ "loss": 1.8177,
2498
+ "step": 207500
2499
+ },
2500
+ {
2501
+ "epoch": 1.54,
2502
+ "learning_rate": 1.1464925671770093e-05,
2503
+ "loss": 1.8297,
2504
+ "step": 208000
2505
+ },
2506
+ {
2507
+ "epoch": 1.55,
2508
+ "learning_rate": 1.1372293281558004e-05,
2509
+ "loss": 1.8154,
2510
+ "step": 208500
2511
+ },
2512
+ {
2513
+ "epoch": 1.55,
2514
+ "learning_rate": 1.1279660891345913e-05,
2515
+ "loss": 1.8371,
2516
+ "step": 209000
2517
+ },
2518
+ {
2519
+ "epoch": 1.55,
2520
+ "learning_rate": 1.118702850113382e-05,
2521
+ "loss": 1.8386,
2522
+ "step": 209500
2523
+ },
2524
+ {
2525
+ "epoch": 1.56,
2526
+ "learning_rate": 1.109439611092173e-05,
2527
+ "loss": 1.8414,
2528
+ "step": 210000
2529
+ },
2530
+ {
2531
+ "epoch": 1.56,
2532
+ "learning_rate": 1.1001763720709639e-05,
2533
+ "loss": 1.8708,
2534
+ "step": 210500
2535
+ },
2536
+ {
2537
+ "epoch": 1.56,
2538
+ "learning_rate": 1.0909131330497548e-05,
2539
+ "loss": 1.8323,
2540
+ "step": 211000
2541
+ },
2542
+ {
2543
+ "epoch": 1.57,
2544
+ "learning_rate": 1.0816498940285456e-05,
2545
+ "loss": 1.8587,
2546
+ "step": 211500
2547
+ },
2548
+ {
2549
+ "epoch": 1.57,
2550
+ "learning_rate": 1.0723866550073365e-05,
2551
+ "loss": 1.8364,
2552
+ "step": 212000
2553
+ },
2554
+ {
2555
+ "epoch": 1.57,
2556
+ "learning_rate": 1.0631234159861274e-05,
2557
+ "loss": 1.8504,
2558
+ "step": 212500
2559
+ },
2560
+ {
2561
+ "epoch": 1.58,
2562
+ "learning_rate": 1.0538601769649183e-05,
2563
+ "loss": 1.8394,
2564
+ "step": 213000
2565
+ },
2566
+ {
2567
+ "epoch": 1.58,
2568
+ "learning_rate": 1.044596937943709e-05,
2569
+ "loss": 1.811,
2570
+ "step": 213500
2571
+ },
2572
+ {
2573
+ "epoch": 1.59,
2574
+ "learning_rate": 1.0353336989225001e-05,
2575
+ "loss": 1.8517,
2576
+ "step": 214000
2577
+ },
2578
+ {
2579
+ "epoch": 1.59,
2580
+ "learning_rate": 1.026070459901291e-05,
2581
+ "loss": 1.8225,
2582
+ "step": 214500
2583
+ },
2584
+ {
2585
+ "epoch": 1.59,
2586
+ "learning_rate": 1.016807220880082e-05,
2587
+ "loss": 1.8488,
2588
+ "step": 215000
2589
+ },
2590
+ {
2591
+ "epoch": 1.6,
2592
+ "learning_rate": 1.0075439818588727e-05,
2593
+ "loss": 1.8289,
2594
+ "step": 215500
2595
+ },
2596
+ {
2597
+ "epoch": 1.6,
2598
+ "learning_rate": 9.982807428376636e-06,
2599
+ "loss": 1.8847,
2600
+ "step": 216000
2601
+ },
2602
+ {
2603
+ "epoch": 1.6,
2604
+ "learning_rate": 9.890175038164546e-06,
2605
+ "loss": 1.8436,
2606
+ "step": 216500
2607
+ },
2608
+ {
2609
+ "epoch": 1.61,
2610
+ "learning_rate": 9.797542647952455e-06,
2611
+ "loss": 1.8496,
2612
+ "step": 217000
2613
+ },
2614
+ {
2615
+ "epoch": 1.61,
2616
+ "learning_rate": 9.704910257740362e-06,
2617
+ "loss": 1.8299,
2618
+ "step": 217500
2619
+ },
2620
+ {
2621
+ "epoch": 1.62,
2622
+ "learning_rate": 9.612277867528271e-06,
2623
+ "loss": 1.8302,
2624
+ "step": 218000
2625
+ },
2626
+ {
2627
+ "epoch": 1.62,
2628
+ "learning_rate": 9.51964547731618e-06,
2629
+ "loss": 1.8361,
2630
+ "step": 218500
2631
+ },
2632
+ {
2633
+ "epoch": 1.62,
2634
+ "learning_rate": 9.42701308710409e-06,
2635
+ "loss": 1.8672,
2636
+ "step": 219000
2637
+ },
2638
+ {
2639
+ "epoch": 1.63,
2640
+ "learning_rate": 9.334380696891999e-06,
2641
+ "loss": 1.8126,
2642
+ "step": 219500
2643
+ },
2644
+ {
2645
+ "epoch": 1.63,
2646
+ "learning_rate": 9.241748306679908e-06,
2647
+ "loss": 1.8328,
2648
+ "step": 220000
2649
+ },
2650
+ {
2651
+ "epoch": 1.63,
2652
+ "learning_rate": 9.149115916467817e-06,
2653
+ "loss": 1.8235,
2654
+ "step": 220500
2655
+ },
2656
+ {
2657
+ "epoch": 1.64,
2658
+ "learning_rate": 9.056483526255726e-06,
2659
+ "loss": 1.834,
2660
+ "step": 221000
2661
+ },
2662
+ {
2663
+ "epoch": 1.64,
2664
+ "learning_rate": 8.963851136043634e-06,
2665
+ "loss": 1.8367,
2666
+ "step": 221500
2667
+ },
2668
+ {
2669
+ "epoch": 1.65,
2670
+ "learning_rate": 8.871218745831543e-06,
2671
+ "loss": 1.8178,
2672
+ "step": 222000
2673
+ },
2674
+ {
2675
+ "epoch": 1.65,
2676
+ "learning_rate": 8.778586355619452e-06,
2677
+ "loss": 1.827,
2678
+ "step": 222500
2679
+ },
2680
+ {
2681
+ "epoch": 1.65,
2682
+ "learning_rate": 8.685953965407361e-06,
2683
+ "loss": 1.8176,
2684
+ "step": 223000
2685
+ },
2686
+ {
2687
+ "epoch": 1.66,
2688
+ "learning_rate": 8.593321575195269e-06,
2689
+ "loss": 1.8043,
2690
+ "step": 223500
2691
+ },
2692
+ {
2693
+ "epoch": 1.66,
2694
+ "learning_rate": 8.500689184983178e-06,
2695
+ "loss": 1.824,
2696
+ "step": 224000
2697
+ },
2698
+ {
2699
+ "epoch": 1.66,
2700
+ "learning_rate": 8.408056794771087e-06,
2701
+ "loss": 1.8549,
2702
+ "step": 224500
2703
+ },
2704
+ {
2705
+ "epoch": 1.67,
2706
+ "learning_rate": 8.315424404558996e-06,
2707
+ "loss": 1.8264,
2708
+ "step": 225000
2709
+ },
2710
+ {
2711
+ "epoch": 1.67,
2712
+ "learning_rate": 8.222792014346905e-06,
2713
+ "loss": 1.8622,
2714
+ "step": 225500
2715
+ },
2716
+ {
2717
+ "epoch": 1.67,
2718
+ "learning_rate": 8.130159624134814e-06,
2719
+ "loss": 1.8566,
2720
+ "step": 226000
2721
+ },
2722
+ {
2723
+ "epoch": 1.68,
2724
+ "learning_rate": 8.037527233922724e-06,
2725
+ "loss": 1.8803,
2726
+ "step": 226500
2727
+ },
2728
+ {
2729
+ "epoch": 1.68,
2730
+ "learning_rate": 7.944894843710633e-06,
2731
+ "loss": 1.8482,
2732
+ "step": 227000
2733
+ },
2734
+ {
2735
+ "epoch": 1.69,
2736
+ "learning_rate": 7.85226245349854e-06,
2737
+ "loss": 1.8063,
2738
+ "step": 227500
2739
+ },
2740
+ {
2741
+ "epoch": 1.69,
2742
+ "learning_rate": 7.75963006328645e-06,
2743
+ "loss": 1.8298,
2744
+ "step": 228000
2745
+ },
2746
+ {
2747
+ "epoch": 1.69,
2748
+ "learning_rate": 7.666997673074358e-06,
2749
+ "loss": 1.8222,
2750
+ "step": 228500
2751
+ },
2752
+ {
2753
+ "epoch": 1.7,
2754
+ "learning_rate": 7.574365282862267e-06,
2755
+ "loss": 1.8158,
2756
+ "step": 229000
2757
+ },
2758
+ {
2759
+ "epoch": 1.7,
2760
+ "learning_rate": 7.481732892650176e-06,
2761
+ "loss": 1.8452,
2762
+ "step": 229500
2763
+ },
2764
+ {
2765
+ "epoch": 1.7,
2766
+ "learning_rate": 7.389100502438084e-06,
2767
+ "loss": 1.8256,
2768
+ "step": 230000
2769
+ },
2770
+ {
2771
+ "epoch": 1.71,
2772
+ "learning_rate": 7.296468112225993e-06,
2773
+ "loss": 1.7952,
2774
+ "step": 230500
2775
+ },
2776
+ {
2777
+ "epoch": 1.71,
2778
+ "learning_rate": 7.203835722013903e-06,
2779
+ "loss": 1.8425,
2780
+ "step": 231000
2781
+ },
2782
+ {
2783
+ "epoch": 1.72,
2784
+ "learning_rate": 7.111203331801812e-06,
2785
+ "loss": 1.838,
2786
+ "step": 231500
2787
+ },
2788
+ {
2789
+ "epoch": 1.72,
2790
+ "learning_rate": 7.018570941589721e-06,
2791
+ "loss": 1.8279,
2792
+ "step": 232000
2793
+ },
2794
+ {
2795
+ "epoch": 1.72,
2796
+ "learning_rate": 6.925938551377629e-06,
2797
+ "loss": 1.8274,
2798
+ "step": 232500
2799
+ },
2800
+ {
2801
+ "epoch": 1.73,
2802
+ "learning_rate": 6.833306161165538e-06,
2803
+ "loss": 1.8322,
2804
+ "step": 233000
2805
+ },
2806
+ {
2807
+ "epoch": 1.73,
2808
+ "learning_rate": 6.7406737709534474e-06,
2809
+ "loss": 1.8451,
2810
+ "step": 233500
2811
+ },
2812
+ {
2813
+ "epoch": 1.73,
2814
+ "learning_rate": 6.648041380741356e-06,
2815
+ "loss": 1.8469,
2816
+ "step": 234000
2817
+ },
2818
+ {
2819
+ "epoch": 1.74,
2820
+ "learning_rate": 6.555408990529265e-06,
2821
+ "loss": 1.8336,
2822
+ "step": 234500
2823
+ },
2824
+ {
2825
+ "epoch": 1.74,
2826
+ "learning_rate": 6.462776600317173e-06,
2827
+ "loss": 1.8538,
2828
+ "step": 235000
2829
+ },
2830
+ {
2831
+ "epoch": 1.75,
2832
+ "learning_rate": 6.370144210105082e-06,
2833
+ "loss": 1.8638,
2834
+ "step": 235500
2835
+ },
2836
+ {
2837
+ "epoch": 1.75,
2838
+ "learning_rate": 6.277511819892991e-06,
2839
+ "loss": 1.8497,
2840
+ "step": 236000
2841
+ },
2842
+ {
2843
+ "epoch": 1.75,
2844
+ "learning_rate": 6.184879429680901e-06,
2845
+ "loss": 1.8547,
2846
+ "step": 236500
2847
+ },
2848
+ {
2849
+ "epoch": 1.76,
2850
+ "learning_rate": 6.092247039468809e-06,
2851
+ "loss": 1.8248,
2852
+ "step": 237000
2853
+ },
2854
+ {
2855
+ "epoch": 1.76,
2856
+ "learning_rate": 5.999614649256718e-06,
2857
+ "loss": 1.8354,
2858
+ "step": 237500
2859
+ },
2860
+ {
2861
+ "epoch": 1.76,
2862
+ "learning_rate": 5.906982259044627e-06,
2863
+ "loss": 1.8425,
2864
+ "step": 238000
2865
+ },
2866
+ {
2867
+ "epoch": 1.77,
2868
+ "learning_rate": 5.8143498688325356e-06,
2869
+ "loss": 1.8571,
2870
+ "step": 238500
2871
+ },
2872
+ {
2873
+ "epoch": 1.77,
2874
+ "learning_rate": 5.721717478620445e-06,
2875
+ "loss": 1.8436,
2876
+ "step": 239000
2877
+ },
2878
+ {
2879
+ "epoch": 1.77,
2880
+ "learning_rate": 5.629085088408354e-06,
2881
+ "loss": 1.863,
2882
+ "step": 239500
2883
+ },
2884
+ {
2885
+ "epoch": 1.78,
2886
+ "learning_rate": 5.536452698196262e-06,
2887
+ "loss": 1.8557,
2888
+ "step": 240000
2889
+ },
2890
+ {
2891
+ "epoch": 1.78,
2892
+ "learning_rate": 5.443820307984171e-06,
2893
+ "loss": 1.8647,
2894
+ "step": 240500
2895
+ },
2896
+ {
2897
+ "epoch": 1.79,
2898
+ "learning_rate": 5.3511879177720805e-06,
2899
+ "loss": 1.8469,
2900
+ "step": 241000
2901
+ },
2902
+ {
2903
+ "epoch": 1.79,
2904
+ "learning_rate": 5.258555527559989e-06,
2905
+ "loss": 1.8427,
2906
+ "step": 241500
2907
+ },
2908
+ {
2909
+ "epoch": 1.79,
2910
+ "learning_rate": 5.165923137347898e-06,
2911
+ "loss": 1.8385,
2912
+ "step": 242000
2913
+ },
2914
+ {
2915
+ "epoch": 1.8,
2916
+ "learning_rate": 5.073290747135806e-06,
2917
+ "loss": 1.8224,
2918
+ "step": 242500
2919
+ },
2920
+ {
2921
+ "epoch": 1.8,
2922
+ "learning_rate": 4.980658356923715e-06,
2923
+ "loss": 1.8266,
2924
+ "step": 243000
2925
+ },
2926
+ {
2927
+ "epoch": 1.8,
2928
+ "learning_rate": 4.8880259667116246e-06,
2929
+ "loss": 1.8374,
2930
+ "step": 243500
2931
+ },
2932
+ {
2933
+ "epoch": 1.81,
2934
+ "learning_rate": 4.795393576499534e-06,
2935
+ "loss": 1.8227,
2936
+ "step": 244000
2937
+ },
2938
+ {
2939
+ "epoch": 1.81,
2940
+ "learning_rate": 4.702761186287442e-06,
2941
+ "loss": 1.9014,
2942
+ "step": 244500
2943
+ },
2944
+ {
2945
+ "epoch": 1.82,
2946
+ "learning_rate": 4.610128796075351e-06,
2947
+ "loss": 1.8057,
2948
+ "step": 245000
2949
+ },
2950
+ {
2951
+ "epoch": 1.82,
2952
+ "learning_rate": 4.5174964058632595e-06,
2953
+ "loss": 1.8626,
2954
+ "step": 245500
2955
+ },
2956
+ {
2957
+ "epoch": 1.82,
2958
+ "learning_rate": 4.424864015651169e-06,
2959
+ "loss": 1.8353,
2960
+ "step": 246000
2961
+ },
2962
+ {
2963
+ "epoch": 1.83,
2964
+ "learning_rate": 4.332231625439078e-06,
2965
+ "loss": 1.8497,
2966
+ "step": 246500
2967
+ },
2968
+ {
2969
+ "epoch": 1.83,
2970
+ "learning_rate": 4.239599235226987e-06,
2971
+ "loss": 1.8413,
2972
+ "step": 247000
2973
+ },
2974
+ {
2975
+ "epoch": 1.83,
2976
+ "learning_rate": 4.146966845014895e-06,
2977
+ "loss": 1.8455,
2978
+ "step": 247500
2979
+ },
2980
+ {
2981
+ "epoch": 1.84,
2982
+ "learning_rate": 4.054334454802804e-06,
2983
+ "loss": 1.8563,
2984
+ "step": 248000
2985
+ },
2986
+ {
2987
+ "epoch": 1.84,
2988
+ "learning_rate": 3.961702064590713e-06,
2989
+ "loss": 1.8326,
2990
+ "step": 248500
2991
+ },
2992
+ {
2993
+ "epoch": 1.85,
2994
+ "learning_rate": 3.869069674378622e-06,
2995
+ "loss": 1.862,
2996
+ "step": 249000
2997
+ },
2998
+ {
2999
+ "epoch": 1.85,
3000
+ "learning_rate": 3.7764372841665314e-06,
3001
+ "loss": 1.8478,
3002
+ "step": 249500
3003
+ },
3004
+ {
3005
+ "epoch": 1.85,
3006
+ "learning_rate": 3.68380489395444e-06,
3007
+ "loss": 1.8536,
3008
+ "step": 250000
3009
+ },
3010
+ {
3011
+ "epoch": 1.86,
3012
+ "learning_rate": 3.591172503742349e-06,
3013
+ "loss": 1.8461,
3014
+ "step": 250500
3015
+ },
3016
+ {
3017
+ "epoch": 1.86,
3018
+ "learning_rate": 3.4985401135302576e-06,
3019
+ "loss": 1.8516,
3020
+ "step": 251000
3021
+ },
3022
+ {
3023
+ "epoch": 1.86,
3024
+ "learning_rate": 3.4059077233181664e-06,
3025
+ "loss": 1.8275,
3026
+ "step": 251500
3027
+ },
3028
+ {
3029
+ "epoch": 1.87,
3030
+ "learning_rate": 3.313275333106075e-06,
3031
+ "loss": 1.842,
3032
+ "step": 252000
3033
+ },
3034
+ {
3035
+ "epoch": 1.87,
3036
+ "learning_rate": 3.2206429428939847e-06,
3037
+ "loss": 1.8754,
3038
+ "step": 252500
3039
+ },
3040
+ {
3041
+ "epoch": 1.87,
3042
+ "learning_rate": 3.1280105526818934e-06,
3043
+ "loss": 1.8505,
3044
+ "step": 253000
3045
+ },
3046
+ {
3047
+ "epoch": 1.88,
3048
+ "learning_rate": 3.035378162469802e-06,
3049
+ "loss": 1.8354,
3050
+ "step": 253500
3051
+ },
3052
+ {
3053
+ "epoch": 1.88,
3054
+ "learning_rate": 2.942745772257711e-06,
3055
+ "loss": 1.871,
3056
+ "step": 254000
3057
+ },
3058
+ {
3059
+ "epoch": 1.89,
3060
+ "learning_rate": 2.85011338204562e-06,
3061
+ "loss": 1.8274,
3062
+ "step": 254500
3063
+ },
3064
+ {
3065
+ "epoch": 1.89,
3066
+ "learning_rate": 2.7574809918335287e-06,
3067
+ "loss": 1.8193,
3068
+ "step": 255000
3069
+ },
3070
+ {
3071
+ "epoch": 1.89,
3072
+ "learning_rate": 2.6648486016214375e-06,
3073
+ "loss": 1.8336,
3074
+ "step": 255500
3075
+ },
3076
+ {
3077
+ "epoch": 1.9,
3078
+ "learning_rate": 2.5722162114093466e-06,
3079
+ "loss": 1.8599,
3080
+ "step": 256000
3081
+ },
3082
+ {
3083
+ "epoch": 1.9,
3084
+ "learning_rate": 2.4795838211972554e-06,
3085
+ "loss": 1.8584,
3086
+ "step": 256500
3087
+ },
3088
+ {
3089
+ "epoch": 1.9,
3090
+ "learning_rate": 2.386951430985164e-06,
3091
+ "loss": 1.8074,
3092
+ "step": 257000
3093
+ },
3094
+ {
3095
+ "epoch": 1.91,
3096
+ "learning_rate": 2.2943190407730732e-06,
3097
+ "loss": 1.8509,
3098
+ "step": 257500
3099
+ },
3100
+ {
3101
+ "epoch": 1.91,
3102
+ "learning_rate": 2.201686650560982e-06,
3103
+ "loss": 1.8579,
3104
+ "step": 258000
3105
+ },
3106
+ {
3107
+ "epoch": 1.92,
3108
+ "learning_rate": 2.1090542603488907e-06,
3109
+ "loss": 1.8529,
3110
+ "step": 258500
3111
+ },
3112
+ {
3113
+ "epoch": 1.92,
3114
+ "learning_rate": 2.0164218701368e-06,
3115
+ "loss": 1.8535,
3116
+ "step": 259000
3117
+ },
3118
+ {
3119
+ "epoch": 1.92,
3120
+ "learning_rate": 1.9237894799247086e-06,
3121
+ "loss": 1.899,
3122
+ "step": 259500
3123
+ },
3124
+ {
3125
+ "epoch": 1.93,
3126
+ "learning_rate": 1.8311570897126173e-06,
3127
+ "loss": 1.8707,
3128
+ "step": 260000
3129
+ },
3130
+ {
3131
+ "epoch": 1.93,
3132
+ "learning_rate": 1.7385246995005263e-06,
3133
+ "loss": 1.8482,
3134
+ "step": 260500
3135
+ },
3136
+ {
3137
+ "epoch": 1.93,
3138
+ "learning_rate": 1.645892309288435e-06,
3139
+ "loss": 1.8445,
3140
+ "step": 261000
3141
+ },
3142
+ {
3143
+ "epoch": 1.94,
3144
+ "learning_rate": 1.553259919076344e-06,
3145
+ "loss": 1.8472,
3146
+ "step": 261500
3147
+ },
3148
+ {
3149
+ "epoch": 1.94,
3150
+ "learning_rate": 1.4606275288642529e-06,
3151
+ "loss": 1.838,
3152
+ "step": 262000
3153
+ },
3154
+ {
3155
+ "epoch": 1.95,
3156
+ "learning_rate": 1.3679951386521616e-06,
3157
+ "loss": 1.8691,
3158
+ "step": 262500
3159
+ },
3160
+ {
3161
+ "epoch": 1.95,
3162
+ "learning_rate": 1.2753627484400705e-06,
3163
+ "loss": 1.8888,
3164
+ "step": 263000
3165
+ },
3166
+ {
3167
+ "epoch": 1.95,
3168
+ "learning_rate": 1.1827303582279795e-06,
3169
+ "loss": 1.8937,
3170
+ "step": 263500
3171
+ },
3172
+ {
3173
+ "epoch": 1.96,
3174
+ "learning_rate": 1.0900979680158882e-06,
3175
+ "loss": 1.8475,
3176
+ "step": 264000
3177
+ },
3178
+ {
3179
+ "epoch": 1.96,
3180
+ "learning_rate": 9.974655778037972e-07,
3181
+ "loss": 1.8725,
3182
+ "step": 264500
3183
+ },
3184
+ {
3185
+ "epoch": 1.96,
3186
+ "learning_rate": 9.048331875917061e-07,
3187
+ "loss": 1.8697,
3188
+ "step": 265000
3189
+ },
3190
+ {
3191
+ "epoch": 1.97,
3192
+ "learning_rate": 8.12200797379615e-07,
3193
+ "loss": 1.8696,
3194
+ "step": 265500
3195
+ },
3196
+ {
3197
+ "epoch": 1.97,
3198
+ "learning_rate": 7.195684071675239e-07,
3199
+ "loss": 1.852,
3200
+ "step": 266000
3201
+ },
3202
+ {
3203
+ "epoch": 1.97,
3204
+ "learning_rate": 6.269360169554327e-07,
3205
+ "loss": 1.8908,
3206
+ "step": 266500
3207
+ },
3208
+ {
3209
+ "epoch": 1.98,
3210
+ "learning_rate": 5.343036267433415e-07,
3211
+ "loss": 1.8715,
3212
+ "step": 267000
3213
+ },
3214
+ {
3215
+ "epoch": 1.98,
3216
+ "learning_rate": 4.416712365312505e-07,
3217
+ "loss": 1.8736,
3218
+ "step": 267500
3219
+ },
3220
+ {
3221
+ "epoch": 1.99,
3222
+ "learning_rate": 3.490388463191593e-07,
3223
+ "loss": 1.8706,
3224
+ "step": 268000
3225
+ },
3226
+ {
3227
+ "epoch": 1.99,
3228
+ "learning_rate": 2.564064561070682e-07,
3229
+ "loss": 1.8902,
3230
+ "step": 268500
3231
+ },
3232
+ {
3233
+ "epoch": 1.99,
3234
+ "learning_rate": 1.637740658949771e-07,
3235
+ "loss": 1.8956,
3236
+ "step": 269000
3237
+ },
3238
+ {
3239
+ "epoch": 2.0,
3240
+ "learning_rate": 7.114167568288598e-08,
3241
+ "loss": 1.8512,
3242
+ "step": 269500
3243
+ },
3244
+ {
3245
+ "epoch": 2.0,
3246
+ "step": 269884,
3247
+ "total_flos": 7.103408724277985e+17,
3248
+ "train_loss": 0.612412237187797,
3249
+ "train_runtime": 12901.2681,
3250
+ "train_samples_per_second": 20.919,
3251
+ "train_steps_per_second": 20.919
3252
  }
3253
  ],
3254
+ "max_steps": 269884,
3255
+ "num_train_epochs": 2,
3256
+ "total_flos": 7.103408724277985e+17,
3257
  "trial_name": null,
3258
  "trial_params": null
3259
  }
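
The new trainer_state.json logs the loss and learning rate roughly every 500 steps up to the final step 269,884. A minimal sketch for pulling the training curve out of the file, assuming the entries shown above live under the usual "log_history" key that transformers.Trainer writes:

    import json

    # Load the trainer state and collect (step, loss) pairs from the logged entries.
    with open("trainer_state.json") as f:
        state = json.load(f)

    curve = [(entry["step"], entry["loss"])
             for entry in state.get("log_history", [])
             if "loss" in entry]
    print(curve[-3:])  # last few logged training losses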