enoriega committed on
Commit
ea7b8f9
1 Parent(s): 082a1e2

End of training

Browse files
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.86,
3
+ "eval_accuracy": 0.9343159108876246,
4
+ "eval_loss": 0.07712016254663467,
5
+ "eval_runtime": 14.8238,
6
+ "eval_samples": 604,
7
+ "eval_samples_per_second": 40.745,
8
+ "eval_steps_per_second": 13.627,
9
+ "perplexity": 1.0801718647966687,
10
+ "train_loss": 0.09398916166987312,
11
+ "train_runtime": 60723.6526,
12
+ "train_samples": 53774,
13
+ "train_samples_per_second": 17.711,
14
+ "train_steps_per_second": 0.029
15
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.86,
3
+ "eval_accuracy": 0.9343159108876246,
4
+ "eval_loss": 0.07712016254663467,
5
+ "eval_runtime": 14.8238,
6
+ "eval_samples": 604,
7
+ "eval_samples_per_second": 40.745,
8
+ "eval_steps_per_second": 13.627,
9
+ "perplexity": 1.0801718647966687
10
+ }
runs/Apr13_18-23-53_rogue/events.out.tfevents.1681496338.rogue.3067847.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:627e56976d997d3dd2e15723563ffbd11c1836f94706ad3a90a9c23692e1d772
3
+ size 363
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.86,
3
+ "train_loss": 0.09398916166987312,
4
+ "train_runtime": 60723.6526,
5
+ "train_samples": 53774,
6
+ "train_samples_per_second": 17.711,
7
+ "train_steps_per_second": 0.029
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.860529986053,
5
+ "global_step": 1780,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.11,
12
+ "learning_rate": 4.971910112359551e-05,
13
+ "loss": 6.9134,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "learning_rate": 4.943820224719101e-05,
19
+ "loss": 2.1399,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.33,
24
+ "learning_rate": 4.915730337078652e-05,
25
+ "loss": 1.0034,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.45,
30
+ "learning_rate": 4.8876404494382024e-05,
31
+ "loss": 0.4659,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.56,
36
+ "learning_rate": 4.859550561797753e-05,
37
+ "loss": 0.2436,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.67,
42
+ "learning_rate": 4.831460674157304e-05,
43
+ "loss": 0.1612,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.67,
48
+ "eval_accuracy": 0.9375623695908555,
49
+ "eval_loss": 0.11446994543075562,
50
+ "eval_runtime": 8.4911,
51
+ "eval_samples_per_second": 71.133,
52
+ "eval_steps_per_second": 23.79,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 0.78,
57
+ "learning_rate": 4.803370786516854e-05,
58
+ "loss": 0.1068,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.89,
63
+ "learning_rate": 4.7752808988764046e-05,
64
+ "loss": 0.0831,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 1.0,
69
+ "learning_rate": 4.747191011235955e-05,
70
+ "loss": 0.0751,
71
+ "step": 90
72
+ },
73
+ {
74
+ "epoch": 1.12,
75
+ "learning_rate": 4.719101123595506e-05,
76
+ "loss": 0.0706,
77
+ "step": 100
78
+ },
79
+ {
80
+ "epoch": 1.23,
81
+ "learning_rate": 4.691011235955056e-05,
82
+ "loss": 0.0676,
83
+ "step": 110
84
+ },
85
+ {
86
+ "epoch": 1.34,
87
+ "learning_rate": 4.662921348314607e-05,
88
+ "loss": 0.0666,
89
+ "step": 120
90
+ },
91
+ {
92
+ "epoch": 1.34,
93
+ "eval_accuracy": 0.9356021824496832,
94
+ "eval_loss": 0.06282978504896164,
95
+ "eval_runtime": 8.387,
96
+ "eval_samples_per_second": 72.016,
97
+ "eval_steps_per_second": 24.085,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 1.45,
102
+ "learning_rate": 4.6348314606741575e-05,
103
+ "loss": 0.0648,
104
+ "step": 130
105
+ },
106
+ {
107
+ "epoch": 1.56,
108
+ "learning_rate": 4.606741573033708e-05,
109
+ "loss": 0.0632,
110
+ "step": 140
111
+ },
112
+ {
113
+ "epoch": 1.67,
114
+ "learning_rate": 4.578651685393259e-05,
115
+ "loss": 0.0616,
116
+ "step": 150
117
+ },
118
+ {
119
+ "epoch": 1.79,
120
+ "learning_rate": 4.550561797752809e-05,
121
+ "loss": 0.0609,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 1.9,
126
+ "learning_rate": 4.52247191011236e-05,
127
+ "loss": 0.0608,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 2.01,
132
+ "learning_rate": 4.4943820224719104e-05,
133
+ "loss": 0.0599,
134
+ "step": 180
135
+ },
136
+ {
137
+ "epoch": 2.01,
138
+ "eval_accuracy": 0.9355179430022939,
139
+ "eval_loss": 0.061063703149557114,
140
+ "eval_runtime": 8.4253,
141
+ "eval_samples_per_second": 71.689,
142
+ "eval_steps_per_second": 23.975,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 2.12,
147
+ "learning_rate": 4.4662921348314605e-05,
148
+ "loss": 0.0595,
149
+ "step": 190
150
+ },
151
+ {
152
+ "epoch": 2.23,
153
+ "learning_rate": 4.438202247191011e-05,
154
+ "loss": 0.0586,
155
+ "step": 200
156
+ },
157
+ {
158
+ "epoch": 2.34,
159
+ "learning_rate": 4.410112359550562e-05,
160
+ "loss": 0.058,
161
+ "step": 210
162
+ },
163
+ {
164
+ "epoch": 2.45,
165
+ "learning_rate": 4.3820224719101126e-05,
166
+ "loss": 0.0573,
167
+ "step": 220
168
+ },
169
+ {
170
+ "epoch": 2.57,
171
+ "learning_rate": 4.353932584269663e-05,
172
+ "loss": 0.0567,
173
+ "step": 230
174
+ },
175
+ {
176
+ "epoch": 2.68,
177
+ "learning_rate": 4.3258426966292134e-05,
178
+ "loss": 0.0563,
179
+ "step": 240
180
+ },
181
+ {
182
+ "epoch": 2.68,
183
+ "eval_accuracy": 0.9352101450214486,
184
+ "eval_loss": 0.06314567476511002,
185
+ "eval_runtime": 8.3902,
186
+ "eval_samples_per_second": 71.989,
187
+ "eval_steps_per_second": 24.076,
188
+ "step": 240
189
+ },
190
+ {
191
+ "epoch": 2.79,
192
+ "learning_rate": 4.297752808988764e-05,
193
+ "loss": 0.0555,
194
+ "step": 250
195
+ },
196
+ {
197
+ "epoch": 2.9,
198
+ "learning_rate": 4.269662921348315e-05,
199
+ "loss": 0.0543,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 3.01,
204
+ "learning_rate": 4.2415730337078655e-05,
205
+ "loss": 0.0544,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 3.12,
210
+ "learning_rate": 4.2134831460674156e-05,
211
+ "loss": 0.0528,
212
+ "step": 280
213
+ },
214
+ {
215
+ "epoch": 3.24,
216
+ "learning_rate": 4.185393258426967e-05,
217
+ "loss": 0.052,
218
+ "step": 290
219
+ },
220
+ {
221
+ "epoch": 3.35,
222
+ "learning_rate": 4.157303370786517e-05,
223
+ "loss": 0.0512,
224
+ "step": 300
225
+ },
226
+ {
227
+ "epoch": 3.35,
228
+ "eval_accuracy": 0.9347176682520962,
229
+ "eval_loss": 0.06297876685857773,
230
+ "eval_runtime": 8.3784,
231
+ "eval_samples_per_second": 72.09,
232
+ "eval_steps_per_second": 24.11,
233
+ "step": 300
234
+ },
235
+ {
236
+ "epoch": 3.46,
237
+ "learning_rate": 4.129213483146068e-05,
238
+ "loss": 0.05,
239
+ "step": 310
240
+ },
241
+ {
242
+ "epoch": 3.57,
243
+ "learning_rate": 4.1011235955056184e-05,
244
+ "loss": 0.0498,
245
+ "step": 320
246
+ },
247
+ {
248
+ "epoch": 3.68,
249
+ "learning_rate": 4.0730337078651685e-05,
250
+ "loss": 0.0493,
251
+ "step": 330
252
+ },
253
+ {
254
+ "epoch": 3.79,
255
+ "learning_rate": 4.044943820224719e-05,
256
+ "loss": 0.0484,
257
+ "step": 340
258
+ },
259
+ {
260
+ "epoch": 3.91,
261
+ "learning_rate": 4.01685393258427e-05,
262
+ "loss": 0.048,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 4.02,
267
+ "learning_rate": 3.98876404494382e-05,
268
+ "loss": 0.0472,
269
+ "step": 360
270
+ },
271
+ {
272
+ "epoch": 4.02,
273
+ "eval_accuracy": 0.9338493539482381,
274
+ "eval_loss": 0.06378066539764404,
275
+ "eval_runtime": 8.4193,
276
+ "eval_samples_per_second": 71.74,
277
+ "eval_steps_per_second": 23.993,
278
+ "step": 360
279
+ },
280
+ {
281
+ "epoch": 4.13,
282
+ "learning_rate": 3.960674157303371e-05,
283
+ "loss": 0.046,
284
+ "step": 370
285
+ },
286
+ {
287
+ "epoch": 4.24,
288
+ "learning_rate": 3.9325842696629214e-05,
289
+ "loss": 0.0456,
290
+ "step": 380
291
+ },
292
+ {
293
+ "epoch": 4.35,
294
+ "learning_rate": 3.904494382022472e-05,
295
+ "loss": 0.0448,
296
+ "step": 390
297
+ },
298
+ {
299
+ "epoch": 4.46,
300
+ "learning_rate": 3.876404494382023e-05,
301
+ "loss": 0.0447,
302
+ "step": 400
303
+ },
304
+ {
305
+ "epoch": 4.57,
306
+ "learning_rate": 3.8483146067415735e-05,
307
+ "loss": 0.0441,
308
+ "step": 410
309
+ },
310
+ {
311
+ "epoch": 4.69,
312
+ "learning_rate": 3.8202247191011236e-05,
313
+ "loss": 0.0438,
314
+ "step": 420
315
+ },
316
+ {
317
+ "epoch": 4.69,
318
+ "eval_accuracy": 0.933936833374373,
319
+ "eval_loss": 0.06545107811689377,
320
+ "eval_runtime": 8.4022,
321
+ "eval_samples_per_second": 71.886,
322
+ "eval_steps_per_second": 24.041,
323
+ "step": 420
324
+ },
325
+ {
326
+ "epoch": 4.8,
327
+ "learning_rate": 3.792134831460674e-05,
328
+ "loss": 0.0431,
329
+ "step": 430
330
+ },
331
+ {
332
+ "epoch": 4.91,
333
+ "learning_rate": 3.764044943820225e-05,
334
+ "loss": 0.0422,
335
+ "step": 440
336
+ },
337
+ {
338
+ "epoch": 5.02,
339
+ "learning_rate": 3.735955056179776e-05,
340
+ "loss": 0.0422,
341
+ "step": 450
342
+ },
343
+ {
344
+ "epoch": 5.13,
345
+ "learning_rate": 3.7078651685393264e-05,
346
+ "loss": 0.0408,
347
+ "step": 460
348
+ },
349
+ {
350
+ "epoch": 5.24,
351
+ "learning_rate": 3.6797752808988765e-05,
352
+ "loss": 0.0402,
353
+ "step": 470
354
+ },
355
+ {
356
+ "epoch": 5.36,
357
+ "learning_rate": 3.651685393258427e-05,
358
+ "loss": 0.0405,
359
+ "step": 480
360
+ },
361
+ {
362
+ "epoch": 5.36,
363
+ "eval_accuracy": 0.9344649499099286,
364
+ "eval_loss": 0.06603477150201797,
365
+ "eval_runtime": 8.3885,
366
+ "eval_samples_per_second": 72.003,
367
+ "eval_steps_per_second": 24.081,
368
+ "step": 480
369
+ },
370
+ {
371
+ "epoch": 5.47,
372
+ "learning_rate": 3.623595505617978e-05,
373
+ "loss": 0.039,
374
+ "step": 490
375
+ },
376
+ {
377
+ "epoch": 5.58,
378
+ "learning_rate": 3.595505617977528e-05,
379
+ "loss": 0.0387,
380
+ "step": 500
381
+ },
382
+ {
383
+ "epoch": 5.69,
384
+ "learning_rate": 3.5674157303370787e-05,
385
+ "loss": 0.0386,
386
+ "step": 510
387
+ },
388
+ {
389
+ "epoch": 5.8,
390
+ "learning_rate": 3.5393258426966294e-05,
391
+ "loss": 0.0385,
392
+ "step": 520
393
+ },
394
+ {
395
+ "epoch": 5.91,
396
+ "learning_rate": 3.51123595505618e-05,
397
+ "loss": 0.0376,
398
+ "step": 530
399
+ },
400
+ {
401
+ "epoch": 6.03,
402
+ "learning_rate": 3.483146067415731e-05,
403
+ "loss": 0.0378,
404
+ "step": 540
405
+ },
406
+ {
407
+ "epoch": 6.03,
408
+ "eval_accuracy": 0.9341765918015578,
409
+ "eval_loss": 0.06659498065710068,
410
+ "eval_runtime": 8.3709,
411
+ "eval_samples_per_second": 72.154,
412
+ "eval_steps_per_second": 24.131,
413
+ "step": 540
414
+ },
415
+ {
416
+ "epoch": 6.14,
417
+ "learning_rate": 3.455056179775281e-05,
418
+ "loss": 0.0366,
419
+ "step": 550
420
+ },
421
+ {
422
+ "epoch": 6.25,
423
+ "learning_rate": 3.4269662921348316e-05,
424
+ "loss": 0.0356,
425
+ "step": 560
426
+ },
427
+ {
428
+ "epoch": 6.36,
429
+ "learning_rate": 3.398876404494382e-05,
430
+ "loss": 0.0351,
431
+ "step": 570
432
+ },
433
+ {
434
+ "epoch": 6.47,
435
+ "learning_rate": 3.370786516853933e-05,
436
+ "loss": 0.0353,
437
+ "step": 580
438
+ },
439
+ {
440
+ "epoch": 6.58,
441
+ "learning_rate": 3.342696629213483e-05,
442
+ "loss": 0.0348,
443
+ "step": 590
444
+ },
445
+ {
446
+ "epoch": 6.69,
447
+ "learning_rate": 3.314606741573034e-05,
448
+ "loss": 0.0344,
449
+ "step": 600
450
+ },
451
+ {
452
+ "epoch": 6.69,
453
+ "eval_accuracy": 0.93427379116393,
454
+ "eval_loss": 0.06691180914640427,
455
+ "eval_runtime": 8.391,
456
+ "eval_samples_per_second": 71.982,
457
+ "eval_steps_per_second": 24.073,
458
+ "step": 600
459
+ },
460
+ {
461
+ "epoch": 6.81,
462
+ "learning_rate": 3.2865168539325845e-05,
463
+ "loss": 0.0334,
464
+ "step": 610
465
+ },
466
+ {
467
+ "epoch": 6.92,
468
+ "learning_rate": 3.258426966292135e-05,
469
+ "loss": 0.0342,
470
+ "step": 620
471
+ },
472
+ {
473
+ "epoch": 7.03,
474
+ "learning_rate": 3.230337078651686e-05,
475
+ "loss": 0.0331,
476
+ "step": 630
477
+ },
478
+ {
479
+ "epoch": 7.14,
480
+ "learning_rate": 3.202247191011236e-05,
481
+ "loss": 0.0324,
482
+ "step": 640
483
+ },
484
+ {
485
+ "epoch": 7.25,
486
+ "learning_rate": 3.1741573033707866e-05,
487
+ "loss": 0.0314,
488
+ "step": 650
489
+ },
490
+ {
491
+ "epoch": 7.36,
492
+ "learning_rate": 3.1460674157303374e-05,
493
+ "loss": 0.0323,
494
+ "step": 660
495
+ },
496
+ {
497
+ "epoch": 7.36,
498
+ "eval_accuracy": 0.9343839504412851,
499
+ "eval_loss": 0.06778556853532791,
500
+ "eval_runtime": 8.8059,
501
+ "eval_samples_per_second": 68.591,
502
+ "eval_steps_per_second": 22.939,
503
+ "step": 660
504
+ },
505
+ {
506
+ "epoch": 7.48,
507
+ "learning_rate": 3.1179775280898874e-05,
508
+ "loss": 0.0317,
509
+ "step": 670
510
+ },
511
+ {
512
+ "epoch": 7.59,
513
+ "learning_rate": 3.089887640449438e-05,
514
+ "loss": 0.0324,
515
+ "step": 680
516
+ },
517
+ {
518
+ "epoch": 7.7,
519
+ "learning_rate": 3.061797752808989e-05,
520
+ "loss": 0.0318,
521
+ "step": 690
522
+ },
523
+ {
524
+ "epoch": 7.81,
525
+ "learning_rate": 3.0337078651685396e-05,
526
+ "loss": 0.0313,
527
+ "step": 700
528
+ },
529
+ {
530
+ "epoch": 7.92,
531
+ "learning_rate": 3.0056179775280903e-05,
532
+ "loss": 0.031,
533
+ "step": 710
534
+ },
535
+ {
536
+ "epoch": 8.03,
537
+ "learning_rate": 2.9775280898876406e-05,
538
+ "loss": 0.0307,
539
+ "step": 720
540
+ },
541
+ {
542
+ "epoch": 8.03,
543
+ "eval_accuracy": 0.9343191508663703,
544
+ "eval_loss": 0.06937924772500992,
545
+ "eval_runtime": 8.3943,
546
+ "eval_samples_per_second": 71.954,
547
+ "eval_steps_per_second": 24.064,
548
+ "step": 720
549
+ },
550
+ {
551
+ "epoch": 8.15,
552
+ "learning_rate": 2.949438202247191e-05,
553
+ "loss": 0.0304,
554
+ "step": 730
555
+ },
556
+ {
557
+ "epoch": 8.26,
558
+ "learning_rate": 2.9213483146067417e-05,
559
+ "loss": 0.0304,
560
+ "step": 740
561
+ },
562
+ {
563
+ "epoch": 8.37,
564
+ "learning_rate": 2.893258426966292e-05,
565
+ "loss": 0.0296,
566
+ "step": 750
567
+ },
568
+ {
569
+ "epoch": 8.48,
570
+ "learning_rate": 2.8651685393258425e-05,
571
+ "loss": 0.0295,
572
+ "step": 760
573
+ },
574
+ {
575
+ "epoch": 8.59,
576
+ "learning_rate": 2.8370786516853936e-05,
577
+ "loss": 0.0299,
578
+ "step": 770
579
+ },
580
+ {
581
+ "epoch": 8.7,
582
+ "learning_rate": 2.8089887640449443e-05,
583
+ "loss": 0.0294,
584
+ "step": 780
585
+ },
586
+ {
587
+ "epoch": 8.7,
588
+ "eval_accuracy": 0.9344681898886743,
589
+ "eval_loss": 0.07063037902116776,
590
+ "eval_runtime": 16.4408,
591
+ "eval_samples_per_second": 36.738,
592
+ "eval_steps_per_second": 12.286,
593
+ "step": 780
594
+ },
595
+ {
596
+ "epoch": 8.81,
597
+ "learning_rate": 2.7808988764044946e-05,
598
+ "loss": 0.03,
599
+ "step": 790
600
+ },
601
+ {
602
+ "epoch": 8.93,
603
+ "learning_rate": 2.752808988764045e-05,
604
+ "loss": 0.0296,
605
+ "step": 800
606
+ },
607
+ {
608
+ "epoch": 9.04,
609
+ "learning_rate": 2.7247191011235957e-05,
610
+ "loss": 0.0288,
611
+ "step": 810
612
+ },
613
+ {
614
+ "epoch": 9.15,
615
+ "learning_rate": 2.696629213483146e-05,
616
+ "loss": 0.0281,
617
+ "step": 820
618
+ },
619
+ {
620
+ "epoch": 9.26,
621
+ "learning_rate": 2.6685393258426965e-05,
622
+ "loss": 0.0286,
623
+ "step": 830
624
+ },
625
+ {
626
+ "epoch": 9.37,
627
+ "learning_rate": 2.6404494382022472e-05,
628
+ "loss": 0.0286,
629
+ "step": 840
630
+ },
631
+ {
632
+ "epoch": 9.37,
633
+ "eval_accuracy": 0.9342381513977268,
634
+ "eval_loss": 0.0725179985165596,
635
+ "eval_runtime": 16.4083,
636
+ "eval_samples_per_second": 36.811,
637
+ "eval_steps_per_second": 12.311,
638
+ "step": 840
639
+ },
640
+ {
641
+ "epoch": 9.48,
642
+ "learning_rate": 2.6123595505617983e-05,
643
+ "loss": 0.0288,
644
+ "step": 850
645
+ },
646
+ {
647
+ "epoch": 9.6,
648
+ "learning_rate": 2.5842696629213486e-05,
649
+ "loss": 0.0284,
650
+ "step": 860
651
+ },
652
+ {
653
+ "epoch": 9.71,
654
+ "learning_rate": 2.556179775280899e-05,
655
+ "loss": 0.0283,
656
+ "step": 870
657
+ },
658
+ {
659
+ "epoch": 9.82,
660
+ "learning_rate": 2.5280898876404497e-05,
661
+ "loss": 0.0282,
662
+ "step": 880
663
+ },
664
+ {
665
+ "epoch": 9.93,
666
+ "learning_rate": 2.5e-05,
667
+ "loss": 0.0278,
668
+ "step": 890
669
+ },
670
+ {
671
+ "epoch": 10.04,
672
+ "learning_rate": 2.4719101123595505e-05,
673
+ "loss": 0.0275,
674
+ "step": 900
675
+ },
676
+ {
677
+ "epoch": 10.04,
678
+ "eval_accuracy": 0.9343159108876246,
679
+ "eval_loss": 0.0727309137582779,
680
+ "eval_runtime": 16.3442,
681
+ "eval_samples_per_second": 36.955,
682
+ "eval_steps_per_second": 12.359,
683
+ "step": 900
684
+ },
685
+ {
686
+ "epoch": 10.15,
687
+ "learning_rate": 2.4438202247191012e-05,
688
+ "loss": 0.0272,
689
+ "step": 910
690
+ },
691
+ {
692
+ "epoch": 10.26,
693
+ "learning_rate": 2.415730337078652e-05,
694
+ "loss": 0.0271,
695
+ "step": 920
696
+ },
697
+ {
698
+ "epoch": 10.38,
699
+ "learning_rate": 2.3876404494382023e-05,
700
+ "loss": 0.027,
701
+ "step": 930
702
+ },
703
+ {
704
+ "epoch": 10.49,
705
+ "learning_rate": 2.359550561797753e-05,
706
+ "loss": 0.0275,
707
+ "step": 940
708
+ },
709
+ {
710
+ "epoch": 10.6,
711
+ "learning_rate": 2.3314606741573034e-05,
712
+ "loss": 0.0271,
713
+ "step": 950
714
+ },
715
+ {
716
+ "epoch": 10.71,
717
+ "learning_rate": 2.303370786516854e-05,
718
+ "loss": 0.0282,
719
+ "step": 960
720
+ },
721
+ {
722
+ "epoch": 10.71,
723
+ "eval_accuracy": 0.934247871333964,
724
+ "eval_loss": 0.07324212789535522,
725
+ "eval_runtime": 16.4154,
726
+ "eval_samples_per_second": 36.795,
727
+ "eval_steps_per_second": 12.306,
728
+ "step": 960
729
+ },
730
+ {
731
+ "epoch": 10.82,
732
+ "learning_rate": 2.2752808988764045e-05,
733
+ "loss": 0.0263,
734
+ "step": 970
735
+ },
736
+ {
737
+ "epoch": 10.93,
738
+ "learning_rate": 2.2471910112359552e-05,
739
+ "loss": 0.0266,
740
+ "step": 980
741
+ },
742
+ {
743
+ "epoch": 11.05,
744
+ "learning_rate": 2.2191011235955056e-05,
745
+ "loss": 0.0269,
746
+ "step": 990
747
+ },
748
+ {
749
+ "epoch": 11.16,
750
+ "learning_rate": 2.1910112359550563e-05,
751
+ "loss": 0.0266,
752
+ "step": 1000
753
+ },
754
+ {
755
+ "epoch": 11.27,
756
+ "learning_rate": 2.1629213483146067e-05,
757
+ "loss": 0.0259,
758
+ "step": 1010
759
+ },
760
+ {
761
+ "epoch": 11.38,
762
+ "learning_rate": 2.1348314606741574e-05,
763
+ "loss": 0.0264,
764
+ "step": 1020
765
+ },
766
+ {
767
+ "epoch": 11.38,
768
+ "eval_accuracy": 0.9342802711214214,
769
+ "eval_loss": 0.07354074716567993,
770
+ "eval_runtime": 16.5992,
771
+ "eval_samples_per_second": 36.387,
772
+ "eval_steps_per_second": 12.169,
773
+ "step": 1020
774
+ },
775
+ {
776
+ "epoch": 11.49,
777
+ "learning_rate": 2.1067415730337078e-05,
778
+ "loss": 0.0265,
779
+ "step": 1030
780
+ },
781
+ {
782
+ "epoch": 11.6,
783
+ "learning_rate": 2.0786516853932585e-05,
784
+ "loss": 0.0262,
785
+ "step": 1040
786
+ },
787
+ {
788
+ "epoch": 11.72,
789
+ "learning_rate": 2.0505617977528092e-05,
790
+ "loss": 0.0259,
791
+ "step": 1050
792
+ },
793
+ {
794
+ "epoch": 11.83,
795
+ "learning_rate": 2.0224719101123596e-05,
796
+ "loss": 0.026,
797
+ "step": 1060
798
+ },
799
+ {
800
+ "epoch": 11.94,
801
+ "learning_rate": 1.99438202247191e-05,
802
+ "loss": 0.026,
803
+ "step": 1070
804
+ },
805
+ {
806
+ "epoch": 12.05,
807
+ "learning_rate": 1.9662921348314607e-05,
808
+ "loss": 0.026,
809
+ "step": 1080
810
+ },
811
+ {
812
+ "epoch": 12.05,
813
+ "eval_accuracy": 0.9342413913764726,
814
+ "eval_loss": 0.07500974833965302,
815
+ "eval_runtime": 16.431,
816
+ "eval_samples_per_second": 36.76,
817
+ "eval_steps_per_second": 12.294,
818
+ "step": 1080
819
+ },
820
+ {
821
+ "epoch": 12.16,
822
+ "learning_rate": 1.9382022471910114e-05,
823
+ "loss": 0.0254,
824
+ "step": 1090
825
+ },
826
+ {
827
+ "epoch": 12.27,
828
+ "learning_rate": 1.9101123595505618e-05,
829
+ "loss": 0.0253,
830
+ "step": 1100
831
+ },
832
+ {
833
+ "epoch": 12.38,
834
+ "learning_rate": 1.8820224719101125e-05,
835
+ "loss": 0.0259,
836
+ "step": 1110
837
+ },
838
+ {
839
+ "epoch": 12.5,
840
+ "learning_rate": 1.8539325842696632e-05,
841
+ "loss": 0.0253,
842
+ "step": 1120
843
+ },
844
+ {
845
+ "epoch": 12.61,
846
+ "learning_rate": 1.8258426966292136e-05,
847
+ "loss": 0.0253,
848
+ "step": 1130
849
+ },
850
+ {
851
+ "epoch": 12.72,
852
+ "learning_rate": 1.797752808988764e-05,
853
+ "loss": 0.0254,
854
+ "step": 1140
855
+ },
856
+ {
857
+ "epoch": 12.72,
858
+ "eval_accuracy": 0.9343094309301331,
859
+ "eval_loss": 0.07532921433448792,
860
+ "eval_runtime": 16.5723,
861
+ "eval_samples_per_second": 36.446,
862
+ "eval_steps_per_second": 12.189,
863
+ "step": 1140
864
+ },
865
+ {
866
+ "epoch": 12.83,
867
+ "learning_rate": 1.7696629213483147e-05,
868
+ "loss": 0.0254,
869
+ "step": 1150
870
+ },
871
+ {
872
+ "epoch": 12.94,
873
+ "learning_rate": 1.7415730337078654e-05,
874
+ "loss": 0.0255,
875
+ "step": 1160
876
+ },
877
+ {
878
+ "epoch": 13.05,
879
+ "learning_rate": 1.7134831460674158e-05,
880
+ "loss": 0.0249,
881
+ "step": 1170
882
+ },
883
+ {
884
+ "epoch": 13.17,
885
+ "learning_rate": 1.6853932584269665e-05,
886
+ "loss": 0.0246,
887
+ "step": 1180
888
+ },
889
+ {
890
+ "epoch": 13.28,
891
+ "learning_rate": 1.657303370786517e-05,
892
+ "loss": 0.0245,
893
+ "step": 1190
894
+ },
895
+ {
896
+ "epoch": 13.39,
897
+ "learning_rate": 1.6292134831460676e-05,
898
+ "loss": 0.0244,
899
+ "step": 1200
900
+ },
901
+ {
902
+ "epoch": 13.39,
903
+ "eval_accuracy": 0.9343515506538277,
904
+ "eval_loss": 0.07460575550794601,
905
+ "eval_runtime": 16.4167,
906
+ "eval_samples_per_second": 36.792,
907
+ "eval_steps_per_second": 12.305,
908
+ "step": 1200
909
+ },
910
+ {
911
+ "epoch": 13.5,
912
+ "learning_rate": 1.601123595505618e-05,
913
+ "loss": 0.025,
914
+ "step": 1210
915
+ },
916
+ {
917
+ "epoch": 13.61,
918
+ "learning_rate": 1.5730337078651687e-05,
919
+ "loss": 0.025,
920
+ "step": 1220
921
+ },
922
+ {
923
+ "epoch": 13.72,
924
+ "learning_rate": 1.544943820224719e-05,
925
+ "loss": 0.0244,
926
+ "step": 1230
927
+ },
928
+ {
929
+ "epoch": 13.84,
930
+ "learning_rate": 1.5168539325842698e-05,
931
+ "loss": 0.0247,
932
+ "step": 1240
933
+ },
934
+ {
935
+ "epoch": 13.95,
936
+ "learning_rate": 1.4887640449438203e-05,
937
+ "loss": 0.0248,
938
+ "step": 1250
939
+ },
940
+ {
941
+ "epoch": 14.06,
942
+ "learning_rate": 1.4606741573033709e-05,
943
+ "loss": 0.0242,
944
+ "step": 1260
945
+ },
946
+ {
947
+ "epoch": 14.06,
948
+ "eval_accuracy": 0.9342964710151501,
949
+ "eval_loss": 0.0752115547657013,
950
+ "eval_runtime": 16.4268,
951
+ "eval_samples_per_second": 36.769,
952
+ "eval_steps_per_second": 12.297,
953
+ "step": 1260
954
+ },
955
+ {
956
+ "epoch": 14.17,
957
+ "learning_rate": 1.4325842696629212e-05,
958
+ "loss": 0.0243,
959
+ "step": 1270
960
+ },
961
+ {
962
+ "epoch": 14.28,
963
+ "learning_rate": 1.4044943820224721e-05,
964
+ "loss": 0.0243,
965
+ "step": 1280
966
+ },
967
+ {
968
+ "epoch": 14.39,
969
+ "learning_rate": 1.3764044943820225e-05,
970
+ "loss": 0.0245,
971
+ "step": 1290
972
+ },
973
+ {
974
+ "epoch": 14.5,
975
+ "learning_rate": 1.348314606741573e-05,
976
+ "loss": 0.0241,
977
+ "step": 1300
978
+ },
979
+ {
980
+ "epoch": 14.62,
981
+ "learning_rate": 1.3202247191011236e-05,
982
+ "loss": 0.0245,
983
+ "step": 1310
984
+ },
985
+ {
986
+ "epoch": 14.73,
987
+ "learning_rate": 1.2921348314606743e-05,
988
+ "loss": 0.024,
989
+ "step": 1320
990
+ },
991
+ {
992
+ "epoch": 14.73,
993
+ "eval_accuracy": 0.9341701118440663,
994
+ "eval_loss": 0.07579129934310913,
995
+ "eval_runtime": 16.4351,
996
+ "eval_samples_per_second": 36.751,
997
+ "eval_steps_per_second": 12.291,
998
+ "step": 1320
999
+ },
1000
+ {
1001
+ "epoch": 14.84,
1002
+ "learning_rate": 1.2640449438202249e-05,
1003
+ "loss": 0.0238,
1004
+ "step": 1330
1005
+ },
1006
+ {
1007
+ "epoch": 14.95,
1008
+ "learning_rate": 1.2359550561797752e-05,
1009
+ "loss": 0.0239,
1010
+ "step": 1340
1011
+ },
1012
+ {
1013
+ "epoch": 15.06,
1014
+ "learning_rate": 1.207865168539326e-05,
1015
+ "loss": 0.0233,
1016
+ "step": 1350
1017
+ },
1018
+ {
1019
+ "epoch": 15.17,
1020
+ "learning_rate": 1.1797752808988765e-05,
1021
+ "loss": 0.0237,
1022
+ "step": 1360
1023
+ },
1024
+ {
1025
+ "epoch": 15.29,
1026
+ "learning_rate": 1.151685393258427e-05,
1027
+ "loss": 0.0238,
1028
+ "step": 1370
1029
+ },
1030
+ {
1031
+ "epoch": 15.4,
1032
+ "learning_rate": 1.1235955056179776e-05,
1033
+ "loss": 0.0239,
1034
+ "step": 1380
1035
+ },
1036
+ {
1037
+ "epoch": 15.4,
1038
+ "eval_accuracy": 0.934322390845116,
1039
+ "eval_loss": 0.07643470913171768,
1040
+ "eval_runtime": 16.4296,
1041
+ "eval_samples_per_second": 36.763,
1042
+ "eval_steps_per_second": 12.295,
1043
+ "step": 1380
1044
+ },
1045
+ {
1046
+ "epoch": 15.51,
1047
+ "learning_rate": 1.0955056179775282e-05,
1048
+ "loss": 0.0234,
1049
+ "step": 1390
1050
+ },
1051
+ {
1052
+ "epoch": 15.62,
1053
+ "learning_rate": 1.0674157303370787e-05,
1054
+ "loss": 0.0235,
1055
+ "step": 1400
1056
+ },
1057
+ {
1058
+ "epoch": 15.73,
1059
+ "learning_rate": 1.0393258426966292e-05,
1060
+ "loss": 0.0233,
1061
+ "step": 1410
1062
+ },
1063
+ {
1064
+ "epoch": 15.84,
1065
+ "learning_rate": 1.0112359550561798e-05,
1066
+ "loss": 0.0236,
1067
+ "step": 1420
1068
+ },
1069
+ {
1070
+ "epoch": 15.96,
1071
+ "learning_rate": 9.831460674157303e-06,
1072
+ "loss": 0.0236,
1073
+ "step": 1430
1074
+ },
1075
+ {
1076
+ "epoch": 16.07,
1077
+ "learning_rate": 9.550561797752809e-06,
1078
+ "loss": 0.0234,
1079
+ "step": 1440
1080
+ },
1081
+ {
1082
+ "epoch": 16.07,
1083
+ "eval_accuracy": 0.9342835111001672,
1084
+ "eval_loss": 0.07627058029174805,
1085
+ "eval_runtime": 16.539,
1086
+ "eval_samples_per_second": 36.52,
1087
+ "eval_steps_per_second": 12.214,
1088
+ "step": 1440
1089
+ },
1090
+ {
1091
+ "epoch": 16.18,
1092
+ "learning_rate": 9.269662921348316e-06,
1093
+ "loss": 0.0235,
1094
+ "step": 1450
1095
+ },
1096
+ {
1097
+ "epoch": 16.29,
1098
+ "learning_rate": 8.98876404494382e-06,
1099
+ "loss": 0.023,
1100
+ "step": 1460
1101
+ },
1102
+ {
1103
+ "epoch": 16.4,
1104
+ "learning_rate": 8.707865168539327e-06,
1105
+ "loss": 0.0236,
1106
+ "step": 1470
1107
+ },
1108
+ {
1109
+ "epoch": 16.51,
1110
+ "learning_rate": 8.426966292134832e-06,
1111
+ "loss": 0.0231,
1112
+ "step": 1480
1113
+ },
1114
+ {
1115
+ "epoch": 16.62,
1116
+ "learning_rate": 8.146067415730338e-06,
1117
+ "loss": 0.0234,
1118
+ "step": 1490
1119
+ },
1120
+ {
1121
+ "epoch": 16.74,
1122
+ "learning_rate": 7.865168539325843e-06,
1123
+ "loss": 0.0231,
1124
+ "step": 1500
1125
+ },
1126
+ {
1127
+ "epoch": 16.74,
1128
+ "eval_accuracy": 0.9343126709088788,
1129
+ "eval_loss": 0.07644984126091003,
1130
+ "eval_runtime": 16.3314,
1131
+ "eval_samples_per_second": 36.984,
1132
+ "eval_steps_per_second": 12.369,
1133
+ "step": 1500
1134
+ },
1135
+ {
1136
+ "epoch": 16.85,
1137
+ "learning_rate": 7.584269662921349e-06,
1138
+ "loss": 0.0229,
1139
+ "step": 1510
1140
+ },
1141
+ {
1142
+ "epoch": 16.96,
1143
+ "learning_rate": 7.303370786516854e-06,
1144
+ "loss": 0.0233,
1145
+ "step": 1520
1146
+ },
1147
+ {
1148
+ "epoch": 17.07,
1149
+ "learning_rate": 7.022471910112361e-06,
1150
+ "loss": 0.0231,
1151
+ "step": 1530
1152
+ },
1153
+ {
1154
+ "epoch": 17.18,
1155
+ "learning_rate": 6.741573033707865e-06,
1156
+ "loss": 0.023,
1157
+ "step": 1540
1158
+ },
1159
+ {
1160
+ "epoch": 17.29,
1161
+ "learning_rate": 6.460674157303372e-06,
1162
+ "loss": 0.0226,
1163
+ "step": 1550
1164
+ },
1165
+ {
1166
+ "epoch": 17.41,
1167
+ "learning_rate": 6.179775280898876e-06,
1168
+ "loss": 0.0226,
1169
+ "step": 1560
1170
+ },
1171
+ {
1172
+ "epoch": 17.41,
1173
+ "eval_accuracy": 0.9342932310364044,
1174
+ "eval_loss": 0.07702977955341339,
1175
+ "eval_runtime": 16.4049,
1176
+ "eval_samples_per_second": 36.818,
1177
+ "eval_steps_per_second": 12.313,
1178
+ "step": 1560
1179
+ },
1180
+ {
1181
+ "epoch": 17.52,
1182
+ "learning_rate": 5.8988764044943826e-06,
1183
+ "loss": 0.023,
1184
+ "step": 1570
1185
+ },
1186
+ {
1187
+ "epoch": 17.63,
1188
+ "learning_rate": 5.617977528089888e-06,
1189
+ "loss": 0.0227,
1190
+ "step": 1580
1191
+ },
1192
+ {
1193
+ "epoch": 17.74,
1194
+ "learning_rate": 5.3370786516853935e-06,
1195
+ "loss": 0.0233,
1196
+ "step": 1590
1197
+ },
1198
+ {
1199
+ "epoch": 17.85,
1200
+ "learning_rate": 5.056179775280899e-06,
1201
+ "loss": 0.0229,
1202
+ "step": 1600
1203
+ },
1204
+ {
1205
+ "epoch": 17.96,
1206
+ "learning_rate": 4.7752808988764044e-06,
1207
+ "loss": 0.0228,
1208
+ "step": 1610
1209
+ },
1210
+ {
1211
+ "epoch": 18.08,
1212
+ "learning_rate": 4.49438202247191e-06,
1213
+ "loss": 0.023,
1214
+ "step": 1620
1215
+ },
1216
+ {
1217
+ "epoch": 18.08,
1218
+ "eval_accuracy": 0.9343126709088788,
1219
+ "eval_loss": 0.07704292237758636,
1220
+ "eval_runtime": 16.4328,
1221
+ "eval_samples_per_second": 36.756,
1222
+ "eval_steps_per_second": 12.292,
1223
+ "step": 1620
1224
+ },
1225
+ {
1226
+ "epoch": 18.19,
1227
+ "learning_rate": 4.213483146067416e-06,
1228
+ "loss": 0.0223,
1229
+ "step": 1630
1230
+ },
1231
+ {
1232
+ "epoch": 18.3,
1233
+ "learning_rate": 3.932584269662922e-06,
1234
+ "loss": 0.0223,
1235
+ "step": 1640
1236
+ },
1237
+ {
1238
+ "epoch": 18.41,
1239
+ "learning_rate": 3.651685393258427e-06,
1240
+ "loss": 0.0229,
1241
+ "step": 1650
1242
+ },
1243
+ {
1244
+ "epoch": 18.52,
1245
+ "learning_rate": 3.3707865168539327e-06,
1246
+ "loss": 0.0225,
1247
+ "step": 1660
1248
+ },
1249
+ {
1250
+ "epoch": 18.63,
1251
+ "learning_rate": 3.089887640449438e-06,
1252
+ "loss": 0.0229,
1253
+ "step": 1670
1254
+ },
1255
+ {
1256
+ "epoch": 18.74,
1257
+ "learning_rate": 2.808988764044944e-06,
1258
+ "loss": 0.0227,
1259
+ "step": 1680
1260
+ },
1261
+ {
1262
+ "epoch": 18.74,
1263
+ "eval_accuracy": 0.9343094309301331,
1264
+ "eval_loss": 0.07714974880218506,
1265
+ "eval_runtime": 16.563,
1266
+ "eval_samples_per_second": 36.467,
1267
+ "eval_steps_per_second": 12.196,
1268
+ "step": 1680
1269
+ },
1270
+ {
1271
+ "epoch": 18.86,
1272
+ "learning_rate": 2.5280898876404495e-06,
1273
+ "loss": 0.0232,
1274
+ "step": 1690
1275
+ },
1276
+ {
1277
+ "epoch": 18.97,
1278
+ "learning_rate": 2.247191011235955e-06,
1279
+ "loss": 0.0224,
1280
+ "step": 1700
1281
+ },
1282
+ {
1283
+ "epoch": 19.08,
1284
+ "learning_rate": 1.966292134831461e-06,
1285
+ "loss": 0.0222,
1286
+ "step": 1710
1287
+ },
1288
+ {
1289
+ "epoch": 19.19,
1290
+ "learning_rate": 1.6853932584269663e-06,
1291
+ "loss": 0.0225,
1292
+ "step": 1720
1293
+ },
1294
+ {
1295
+ "epoch": 19.3,
1296
+ "learning_rate": 1.404494382022472e-06,
1297
+ "loss": 0.0223,
1298
+ "step": 1730
1299
+ },
1300
+ {
1301
+ "epoch": 19.41,
1302
+ "learning_rate": 1.1235955056179775e-06,
1303
+ "loss": 0.0221,
1304
+ "step": 1740
1305
+ },
1306
+ {
1307
+ "epoch": 19.41,
1308
+ "eval_accuracy": 0.934322390845116,
1309
+ "eval_loss": 0.07709047198295593,
1310
+ "eval_runtime": 16.3614,
1311
+ "eval_samples_per_second": 36.916,
1312
+ "eval_steps_per_second": 12.346,
1313
+ "step": 1740
1314
+ },
1315
+ {
1316
+ "epoch": 19.53,
1317
+ "learning_rate": 8.426966292134832e-07,
1318
+ "loss": 0.0231,
1319
+ "step": 1750
1320
+ },
1321
+ {
1322
+ "epoch": 19.64,
1323
+ "learning_rate": 5.617977528089887e-07,
1324
+ "loss": 0.0227,
1325
+ "step": 1760
1326
+ },
1327
+ {
1328
+ "epoch": 19.75,
1329
+ "learning_rate": 2.8089887640449437e-07,
1330
+ "loss": 0.0226,
1331
+ "step": 1770
1332
+ },
1333
+ {
1334
+ "epoch": 19.86,
1335
+ "learning_rate": 0.0,
1336
+ "loss": 0.0227,
1337
+ "step": 1780
1338
+ },
1339
+ {
1340
+ "epoch": 19.86,
1341
+ "step": 1780,
1342
+ "total_flos": 6.551573269655685e+17,
1343
+ "train_loss": 0.09398916166987312,
1344
+ "train_runtime": 60723.6526,
1345
+ "train_samples_per_second": 17.711,
1346
+ "train_steps_per_second": 0.029
1347
+ }
1348
+ ],
1349
+ "max_steps": 1780,
1350
+ "num_train_epochs": 20,
1351
+ "total_flos": 6.551573269655685e+17,
1352
+ "trial_name": null,
1353
+ "trial_params": null
1354
+ }