itsLeen commited on
Commit
2b879a2
·
verified ·
1 Parent(s): d46744b

Model save

Browse files
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.3400
22
- - Accuracy: 0.8425
23
 
24
  ## Model description
25
 
@@ -51,8 +51,8 @@ The following hyperparameters were used during training:
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:------:|:----:|:---------------:|:--------:|
54
- | 0.3732 | 1.9231 | 100 | 0.4088 | 0.7945 |
55
- | 0.1451 | 3.8462 | 200 | 0.3400 | 0.8425 |
56
 
57
 
58
  ### Framework versions
 
18
 
19
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.4633
22
+ - Accuracy: 0.8836
23
 
24
  ## Model description
25
 
 
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:------:|:----:|:---------------:|:--------:|
54
+ | 0.1137 | 1.9231 | 100 | 0.4869 | 0.8288 |
55
+ | 0.1002 | 3.8462 | 200 | 0.4633 | 0.8836 |
56
 
57
 
58
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.9785138764547896,
4
- "eval_loss": 0.08193562924861908,
5
- "eval_runtime": 73.3486,
6
- "eval_samples_per_second": 15.229,
7
- "eval_steps_per_second": 1.909,
8
- "total_flos": 4.904158054749069e+18,
9
- "train_loss": 0.06213315485569771,
10
- "train_runtime": 7084.9204,
11
- "train_samples_per_second": 8.93,
12
- "train_steps_per_second": 0.559
13
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.8424657534246576,
4
+ "eval_loss": 0.339973121881485,
5
+ "eval_runtime": 4.934,
6
+ "eval_samples_per_second": 29.591,
7
+ "eval_steps_per_second": 3.851,
8
+ "total_flos": 2.5832369176982323e+17,
9
+ "train_loss": 0.38543382860147035,
10
+ "train_runtime": 140.7513,
11
+ "train_samples_per_second": 23.474,
12
+ "train_steps_per_second": 1.478
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.9785138764547896,
4
- "eval_loss": 0.08193562924861908,
5
- "eval_runtime": 73.3486,
6
- "eval_samples_per_second": 15.229,
7
- "eval_steps_per_second": 1.909
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.8424657534246576,
4
+ "eval_loss": 0.339973121881485,
5
+ "eval_runtime": 4.934,
6
+ "eval_samples_per_second": 29.591,
7
+ "eval_steps_per_second": 3.851
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33da5abc04f7b115e193f7059f53c2aad4581aae49d924d31d3f6858cab55f07
3
  size 346293856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:512d40137dc0c3f1c101013dfbd20fdfd2937beec13685e09bf63194b77590e1
3
  size 346293856
runs/Sep24_13-50-15_37647b9c0688/events.out.tfevents.1727186045.37647b9c0688.3797.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc7909bb50be85700b9e8f4d0e19ac74cd8a9f5f7fd272c82c03b66db39cbf91
3
+ size 411
runs/Sep24_13-50-15_37647b9c0688/events.out.tfevents.1727186162.37647b9c0688.3797.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53815df8386eafe1b5653a10861136aca181009ff49855d756efd06fb14417bd
3
+ size 4955
runs/Sep24_13-50-15_37647b9c0688/events.out.tfevents.1727186206.37647b9c0688.3797.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42dcb323426977b7e3f0468cace45dbc312852a7a44babfb5959a178a0117d74
3
+ size 10121
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 4.904158054749069e+18,
4
- "train_loss": 0.06213315485569771,
5
- "train_runtime": 7084.9204,
6
- "train_samples_per_second": 8.93,
7
- "train_steps_per_second": 0.559
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "total_flos": 2.5832369176982323e+17,
4
+ "train_loss": 0.38543382860147035,
5
+ "train_runtime": 140.7513,
6
+ "train_samples_per_second": 23.474,
7
+ "train_steps_per_second": 1.478
8
  }
trainer_state.json CHANGED
@@ -1,3150 +1,185 @@
1
  {
2
- "best_metric": 0.08193562924861908,
3
- "best_model_checkpoint": "realFake-img/checkpoint-2500",
4
- "epoch": 10.0,
5
  "eval_steps": 100,
6
- "global_step": 3960,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.025252525252525252,
13
- "grad_norm": 8.32949447631836,
14
- "learning_rate": 0.0001994949494949495,
15
- "loss": 0.1124,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.050505050505050504,
20
- "grad_norm": 4.660865306854248,
21
- "learning_rate": 0.000198989898989899,
22
- "loss": 0.2631,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.07575757575757576,
27
- "grad_norm": 4.1171956062316895,
28
- "learning_rate": 0.0001984848484848485,
29
- "loss": 0.1366,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.10101010101010101,
34
- "grad_norm": 4.586099147796631,
35
- "learning_rate": 0.000197979797979798,
36
- "loss": 0.1395,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.12626262626262627,
41
- "grad_norm": 3.6707675457000732,
42
- "learning_rate": 0.0001974747474747475,
43
- "loss": 0.178,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.15151515151515152,
48
- "grad_norm": 0.39073047041893005,
49
- "learning_rate": 0.00019696969696969698,
50
- "loss": 0.2038,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.17676767676767677,
55
- "grad_norm": 3.4298012256622314,
56
- "learning_rate": 0.0001964646464646465,
57
- "loss": 0.0964,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.20202020202020202,
62
- "grad_norm": 4.532003402709961,
63
- "learning_rate": 0.00019595959595959596,
64
- "loss": 0.171,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.22727272727272727,
69
- "grad_norm": 2.3665497303009033,
70
- "learning_rate": 0.00019545454545454548,
71
- "loss": 0.1166,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.25252525252525254,
76
- "grad_norm": 1.0514458417892456,
77
- "learning_rate": 0.00019494949494949494,
78
- "loss": 0.2578,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.25252525252525254,
83
- "eval_accuracy": 0.9418084153983886,
84
- "eval_loss": 0.1593756079673767,
85
- "eval_runtime": 72.9833,
86
- "eval_samples_per_second": 15.305,
87
- "eval_steps_per_second": 1.918,
88
  "step": 100
89
  },
90
  {
91
- "epoch": 0.2777777777777778,
92
- "grad_norm": 2.9928767681121826,
93
- "learning_rate": 0.00019444444444444446,
94
- "loss": 0.1794,
95
  "step": 110
96
  },
97
  {
98
- "epoch": 0.30303030303030304,
99
- "grad_norm": 0.6943581104278564,
100
- "learning_rate": 0.00019393939393939395,
101
- "loss": 0.1713,
102
  "step": 120
103
  },
104
  {
105
- "epoch": 0.3282828282828283,
106
- "grad_norm": 5.296023845672607,
107
- "learning_rate": 0.00019343434343434344,
108
- "loss": 0.1822,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 0.35353535353535354,
113
- "grad_norm": 4.849494934082031,
114
- "learning_rate": 0.00019292929292929293,
115
- "loss": 0.1667,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 0.3787878787878788,
120
- "grad_norm": 2.1953601837158203,
121
- "learning_rate": 0.00019242424242424245,
122
- "loss": 0.1353,
123
  "step": 150
124
  },
125
  {
126
- "epoch": 0.40404040404040403,
127
- "grad_norm": 3.5325512886047363,
128
- "learning_rate": 0.00019191919191919191,
129
- "loss": 0.2191,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 0.4292929292929293,
134
- "grad_norm": 1.513462781906128,
135
- "learning_rate": 0.00019141414141414143,
136
- "loss": 0.0864,
137
  "step": 170
138
  },
139
  {
140
- "epoch": 0.45454545454545453,
141
- "grad_norm": 1.1227214336395264,
142
- "learning_rate": 0.00019090909090909092,
143
- "loss": 0.0972,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 0.4797979797979798,
148
- "grad_norm": 4.3201212882995605,
149
- "learning_rate": 0.0001904040404040404,
150
- "loss": 0.1356,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 0.5050505050505051,
155
- "grad_norm": 0.13399846851825714,
156
- "learning_rate": 0.0001898989898989899,
157
- "loss": 0.0944,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 0.5050505050505051,
162
- "eval_accuracy": 0.937332139659803,
163
- "eval_loss": 0.22425174713134766,
164
- "eval_runtime": 72.9458,
165
- "eval_samples_per_second": 15.313,
166
- "eval_steps_per_second": 1.919,
167
  "step": 200
168
  },
169
  {
170
- "epoch": 0.5303030303030303,
171
- "grad_norm": 0.07937999069690704,
172
- "learning_rate": 0.00018939393939393942,
173
- "loss": 0.0798,
174
- "step": 210
175
- },
176
- {
177
- "epoch": 0.5555555555555556,
178
- "grad_norm": 6.126536846160889,
179
- "learning_rate": 0.00018888888888888888,
180
- "loss": 0.2437,
181
- "step": 220
182
- },
183
- {
184
- "epoch": 0.5808080808080808,
185
- "grad_norm": 8.01685619354248,
186
- "learning_rate": 0.0001883838383838384,
187
- "loss": 0.2746,
188
- "step": 230
189
- },
190
- {
191
- "epoch": 0.6060606060606061,
192
- "grad_norm": 3.1425938606262207,
193
- "learning_rate": 0.0001878787878787879,
194
- "loss": 0.1937,
195
- "step": 240
196
- },
197
- {
198
- "epoch": 0.6313131313131313,
199
- "grad_norm": 1.1262303590774536,
200
- "learning_rate": 0.00018737373737373738,
201
- "loss": 0.2495,
202
- "step": 250
203
- },
204
- {
205
- "epoch": 0.6565656565656566,
206
- "grad_norm": 3.994985342025757,
207
- "learning_rate": 0.00018686868686868687,
208
- "loss": 0.0914,
209
- "step": 260
210
- },
211
- {
212
- "epoch": 0.6818181818181818,
213
- "grad_norm": 3.6686558723449707,
214
- "learning_rate": 0.00018636363636363636,
215
- "loss": 0.1241,
216
- "step": 270
217
- },
218
- {
219
- "epoch": 0.7070707070707071,
220
- "grad_norm": 2.8421552181243896,
221
- "learning_rate": 0.00018585858585858586,
222
- "loss": 0.162,
223
- "step": 280
224
- },
225
- {
226
- "epoch": 0.7323232323232324,
227
- "grad_norm": 0.06576777994632721,
228
- "learning_rate": 0.00018535353535353537,
229
- "loss": 0.0863,
230
- "step": 290
231
- },
232
- {
233
- "epoch": 0.7575757575757576,
234
- "grad_norm": 3.127112865447998,
235
- "learning_rate": 0.00018484848484848484,
236
- "loss": 0.1747,
237
- "step": 300
238
- },
239
- {
240
- "epoch": 0.7575757575757576,
241
- "eval_accuracy": 0.9292748433303492,
242
- "eval_loss": 0.24716989696025848,
243
- "eval_runtime": 73.2274,
244
- "eval_samples_per_second": 15.254,
245
- "eval_steps_per_second": 1.912,
246
- "step": 300
247
- },
248
- {
249
- "epoch": 0.7828282828282829,
250
- "grad_norm": 1.235567569732666,
251
- "learning_rate": 0.00018434343434343435,
252
- "loss": 0.0742,
253
- "step": 310
254
- },
255
- {
256
- "epoch": 0.8080808080808081,
257
- "grad_norm": 5.305884838104248,
258
- "learning_rate": 0.00018383838383838384,
259
- "loss": 0.1013,
260
- "step": 320
261
- },
262
- {
263
- "epoch": 0.8333333333333334,
264
- "grad_norm": 3.124811887741089,
265
- "learning_rate": 0.00018333333333333334,
266
- "loss": 0.2439,
267
- "step": 330
268
- },
269
- {
270
- "epoch": 0.8585858585858586,
271
- "grad_norm": 5.361472129821777,
272
- "learning_rate": 0.00018282828282828283,
273
- "loss": 0.0468,
274
- "step": 340
275
- },
276
- {
277
- "epoch": 0.8838383838383839,
278
- "grad_norm": 3.3062198162078857,
279
- "learning_rate": 0.00018232323232323234,
280
- "loss": 0.0855,
281
- "step": 350
282
- },
283
- {
284
- "epoch": 0.9090909090909091,
285
- "grad_norm": 1.9714092016220093,
286
- "learning_rate": 0.00018181818181818183,
287
- "loss": 0.1645,
288
- "step": 360
289
- },
290
- {
291
- "epoch": 0.9343434343434344,
292
- "grad_norm": 1.7579039335250854,
293
- "learning_rate": 0.00018131313131313132,
294
- "loss": 0.193,
295
- "step": 370
296
- },
297
- {
298
- "epoch": 0.9595959595959596,
299
- "grad_norm": 3.588534355163574,
300
- "learning_rate": 0.00018080808080808082,
301
- "loss": 0.1305,
302
- "step": 380
303
- },
304
- {
305
- "epoch": 0.9848484848484849,
306
- "grad_norm": 6.151834487915039,
307
- "learning_rate": 0.0001803030303030303,
308
- "loss": 0.1004,
309
- "step": 390
310
- },
311
- {
312
- "epoch": 1.0101010101010102,
313
- "grad_norm": 3.521318197250366,
314
- "learning_rate": 0.0001797979797979798,
315
- "loss": 0.1328,
316
- "step": 400
317
- },
318
- {
319
- "epoch": 1.0101010101010102,
320
- "eval_accuracy": 0.9337511190689346,
321
- "eval_loss": 0.17739379405975342,
322
- "eval_runtime": 72.9497,
323
- "eval_samples_per_second": 15.312,
324
- "eval_steps_per_second": 1.919,
325
- "step": 400
326
- },
327
- {
328
- "epoch": 1.0353535353535352,
329
- "grad_norm": 0.5116239786148071,
330
- "learning_rate": 0.00017929292929292931,
331
- "loss": 0.0932,
332
- "step": 410
333
- },
334
- {
335
- "epoch": 1.0606060606060606,
336
- "grad_norm": 0.37958571314811707,
337
- "learning_rate": 0.0001787878787878788,
338
- "loss": 0.0538,
339
- "step": 420
340
- },
341
- {
342
- "epoch": 1.0858585858585859,
343
- "grad_norm": 3.976700782775879,
344
- "learning_rate": 0.0001782828282828283,
345
- "loss": 0.2245,
346
- "step": 430
347
- },
348
- {
349
- "epoch": 1.1111111111111112,
350
- "grad_norm": 2.8285045623779297,
351
- "learning_rate": 0.00017777777777777779,
352
- "loss": 0.1332,
353
- "step": 440
354
- },
355
- {
356
- "epoch": 1.1363636363636362,
357
- "grad_norm": 3.683419704437256,
358
- "learning_rate": 0.00017727272727272728,
359
- "loss": 0.1162,
360
- "step": 450
361
- },
362
- {
363
- "epoch": 1.1616161616161615,
364
- "grad_norm": 4.30293607711792,
365
- "learning_rate": 0.0001767676767676768,
366
- "loss": 0.0678,
367
- "step": 460
368
- },
369
- {
370
- "epoch": 1.1868686868686869,
371
- "grad_norm": 0.15934455394744873,
372
- "learning_rate": 0.00017626262626262626,
373
- "loss": 0.1587,
374
- "step": 470
375
- },
376
- {
377
- "epoch": 1.2121212121212122,
378
- "grad_norm": 1.5525578260421753,
379
- "learning_rate": 0.00017575757575757578,
380
- "loss": 0.0637,
381
- "step": 480
382
- },
383
- {
384
- "epoch": 1.2373737373737375,
385
- "grad_norm": 1.534348964691162,
386
- "learning_rate": 0.00017525252525252527,
387
- "loss": 0.1103,
388
- "step": 490
389
- },
390
- {
391
- "epoch": 1.2626262626262625,
392
- "grad_norm": 1.6843178272247314,
393
- "learning_rate": 0.00017474747474747476,
394
- "loss": 0.1918,
395
- "step": 500
396
- },
397
- {
398
- "epoch": 1.2626262626262625,
399
- "eval_accuracy": 0.9570277529095792,
400
- "eval_loss": 0.12820282578468323,
401
- "eval_runtime": 73.1443,
402
- "eval_samples_per_second": 15.271,
403
- "eval_steps_per_second": 1.914,
404
- "step": 500
405
- },
406
- {
407
- "epoch": 1.2878787878787878,
408
- "grad_norm": 0.6296999454498291,
409
- "learning_rate": 0.00017424242424242425,
410
- "loss": 0.0461,
411
- "step": 510
412
- },
413
- {
414
- "epoch": 1.3131313131313131,
415
- "grad_norm": 4.980341911315918,
416
- "learning_rate": 0.00017373737373737377,
417
- "loss": 0.1479,
418
- "step": 520
419
- },
420
- {
421
- "epoch": 1.3383838383838385,
422
- "grad_norm": 0.36140933632850647,
423
- "learning_rate": 0.00017323232323232323,
424
- "loss": 0.0726,
425
- "step": 530
426
- },
427
- {
428
- "epoch": 1.3636363636363638,
429
- "grad_norm": 0.2907123267650604,
430
- "learning_rate": 0.00017272727272727275,
431
- "loss": 0.1109,
432
- "step": 540
433
- },
434
- {
435
- "epoch": 1.3888888888888888,
436
- "grad_norm": 1.1450049877166748,
437
- "learning_rate": 0.00017222222222222224,
438
- "loss": 0.0888,
439
- "step": 550
440
- },
441
- {
442
- "epoch": 1.4141414141414141,
443
- "grad_norm": 3.324134588241577,
444
- "learning_rate": 0.00017171717171717173,
445
- "loss": 0.1074,
446
- "step": 560
447
- },
448
- {
449
- "epoch": 1.4393939393939394,
450
- "grad_norm": 0.9428613185882568,
451
- "learning_rate": 0.00017121212121212122,
452
- "loss": 0.0856,
453
- "step": 570
454
- },
455
- {
456
- "epoch": 1.4646464646464645,
457
- "grad_norm": 0.1330060064792633,
458
- "learning_rate": 0.0001707070707070707,
459
- "loss": 0.061,
460
- "step": 580
461
- },
462
- {
463
- "epoch": 1.4898989898989898,
464
- "grad_norm": 4.435102939605713,
465
- "learning_rate": 0.0001702020202020202,
466
- "loss": 0.1137,
467
- "step": 590
468
- },
469
- {
470
- "epoch": 1.5151515151515151,
471
- "grad_norm": 2.5744283199310303,
472
- "learning_rate": 0.00016969696969696972,
473
- "loss": 0.169,
474
- "step": 600
475
- },
476
- {
477
- "epoch": 1.5151515151515151,
478
- "eval_accuracy": 0.9346463742166518,
479
- "eval_loss": 0.2247086614370346,
480
- "eval_runtime": 73.2754,
481
- "eval_samples_per_second": 15.244,
482
- "eval_steps_per_second": 1.911,
483
- "step": 600
484
- },
485
- {
486
- "epoch": 1.5404040404040404,
487
- "grad_norm": 3.7209930419921875,
488
- "learning_rate": 0.00016919191919191918,
489
- "loss": 0.1929,
490
- "step": 610
491
- },
492
- {
493
- "epoch": 1.5656565656565657,
494
- "grad_norm": 4.9047322273254395,
495
- "learning_rate": 0.0001686868686868687,
496
- "loss": 0.144,
497
- "step": 620
498
- },
499
- {
500
- "epoch": 1.5909090909090908,
501
- "grad_norm": 8.181381225585938,
502
- "learning_rate": 0.0001681818181818182,
503
- "loss": 0.1008,
504
- "step": 630
505
- },
506
- {
507
- "epoch": 1.6161616161616161,
508
- "grad_norm": 0.5650784969329834,
509
- "learning_rate": 0.00016767676767676768,
510
- "loss": 0.1385,
511
- "step": 640
512
- },
513
- {
514
- "epoch": 1.6414141414141414,
515
- "grad_norm": 0.4483976364135742,
516
- "learning_rate": 0.00016717171717171717,
517
- "loss": 0.1112,
518
- "step": 650
519
- },
520
- {
521
- "epoch": 1.6666666666666665,
522
- "grad_norm": 2.8870067596435547,
523
- "learning_rate": 0.0001666666666666667,
524
- "loss": 0.0868,
525
- "step": 660
526
- },
527
- {
528
- "epoch": 1.691919191919192,
529
- "grad_norm": 5.016068458557129,
530
- "learning_rate": 0.00016616161616161615,
531
- "loss": 0.0948,
532
- "step": 670
533
- },
534
- {
535
- "epoch": 1.7171717171717171,
536
- "grad_norm": 4.62065315246582,
537
- "learning_rate": 0.00016565656565656567,
538
- "loss": 0.2336,
539
- "step": 680
540
- },
541
- {
542
- "epoch": 1.7424242424242424,
543
- "grad_norm": 0.04882610961794853,
544
- "learning_rate": 0.00016515151515151516,
545
- "loss": 0.1006,
546
- "step": 690
547
- },
548
- {
549
- "epoch": 1.7676767676767677,
550
- "grad_norm": 1.2523910999298096,
551
- "learning_rate": 0.00016464646464646465,
552
- "loss": 0.2595,
553
- "step": 700
554
- },
555
- {
556
- "epoch": 1.7676767676767677,
557
- "eval_accuracy": 0.9444941808415398,
558
- "eval_loss": 0.1785079687833786,
559
- "eval_runtime": 73.2828,
560
- "eval_samples_per_second": 15.242,
561
- "eval_steps_per_second": 1.91,
562
- "step": 700
563
- },
564
- {
565
- "epoch": 1.7929292929292928,
566
- "grad_norm": 0.28372153639793396,
567
- "learning_rate": 0.00016414141414141414,
568
- "loss": 0.0657,
569
- "step": 710
570
- },
571
- {
572
- "epoch": 1.8181818181818183,
573
- "grad_norm": 0.061366915702819824,
574
- "learning_rate": 0.00016363636363636366,
575
- "loss": 0.2048,
576
- "step": 720
577
- },
578
- {
579
- "epoch": 1.8434343434343434,
580
- "grad_norm": 2.9858274459838867,
581
- "learning_rate": 0.00016313131313131312,
582
- "loss": 0.0489,
583
- "step": 730
584
- },
585
- {
586
- "epoch": 1.8686868686868687,
587
- "grad_norm": 4.050809383392334,
588
- "learning_rate": 0.00016262626262626264,
589
- "loss": 0.1095,
590
- "step": 740
591
- },
592
- {
593
- "epoch": 1.893939393939394,
594
- "grad_norm": 3.725325584411621,
595
- "learning_rate": 0.00016212121212121213,
596
- "loss": 0.2613,
597
- "step": 750
598
- },
599
- {
600
- "epoch": 1.9191919191919191,
601
- "grad_norm": 2.09786319732666,
602
- "learning_rate": 0.00016161616161616162,
603
- "loss": 0.0492,
604
- "step": 760
605
- },
606
- {
607
- "epoch": 1.9444444444444444,
608
- "grad_norm": 1.9398726224899292,
609
- "learning_rate": 0.0001611111111111111,
610
- "loss": 0.0831,
611
- "step": 770
612
- },
613
- {
614
- "epoch": 1.9696969696969697,
615
- "grad_norm": 0.6055514812469482,
616
- "learning_rate": 0.0001606060606060606,
617
- "loss": 0.1733,
618
- "step": 780
619
- },
620
- {
621
- "epoch": 1.9949494949494948,
622
- "grad_norm": 0.22102850675582886,
623
- "learning_rate": 0.00016010101010101012,
624
- "loss": 0.1106,
625
- "step": 790
626
- },
627
- {
628
- "epoch": 2.0202020202020203,
629
- "grad_norm": 3.681710720062256,
630
- "learning_rate": 0.0001595959595959596,
631
- "loss": 0.0911,
632
- "step": 800
633
- },
634
- {
635
- "epoch": 2.0202020202020203,
636
- "eval_accuracy": 0.9534467323187108,
637
- "eval_loss": 0.1352938562631607,
638
- "eval_runtime": 73.2218,
639
- "eval_samples_per_second": 15.255,
640
- "eval_steps_per_second": 1.912,
641
- "step": 800
642
- },
643
- {
644
- "epoch": 2.0454545454545454,
645
- "grad_norm": 0.574734091758728,
646
- "learning_rate": 0.0001590909090909091,
647
- "loss": 0.044,
648
- "step": 810
649
- },
650
- {
651
- "epoch": 2.0707070707070705,
652
- "grad_norm": 0.253918319940567,
653
- "learning_rate": 0.0001585858585858586,
654
- "loss": 0.0476,
655
- "step": 820
656
- },
657
- {
658
- "epoch": 2.095959595959596,
659
- "grad_norm": 0.1252337247133255,
660
- "learning_rate": 0.00015808080808080808,
661
- "loss": 0.1279,
662
- "step": 830
663
- },
664
- {
665
- "epoch": 2.121212121212121,
666
- "grad_norm": 0.26320022344589233,
667
- "learning_rate": 0.00015757575757575757,
668
- "loss": 0.2042,
669
- "step": 840
670
- },
671
- {
672
- "epoch": 2.1464646464646466,
673
- "grad_norm": 0.7983365058898926,
674
- "learning_rate": 0.0001570707070707071,
675
- "loss": 0.1208,
676
- "step": 850
677
- },
678
- {
679
- "epoch": 2.1717171717171717,
680
- "grad_norm": 0.36479347944259644,
681
- "learning_rate": 0.00015656565656565658,
682
- "loss": 0.0881,
683
- "step": 860
684
- },
685
- {
686
- "epoch": 2.196969696969697,
687
- "grad_norm": 0.11645219475030899,
688
- "learning_rate": 0.00015606060606060607,
689
- "loss": 0.0955,
690
- "step": 870
691
- },
692
- {
693
- "epoch": 2.2222222222222223,
694
- "grad_norm": 1.1980379819869995,
695
- "learning_rate": 0.00015555555555555556,
696
- "loss": 0.077,
697
- "step": 880
698
- },
699
- {
700
- "epoch": 2.2474747474747474,
701
- "grad_norm": 0.06797017902135849,
702
- "learning_rate": 0.00015505050505050508,
703
- "loss": 0.0377,
704
- "step": 890
705
- },
706
- {
707
- "epoch": 2.2727272727272725,
708
- "grad_norm": 0.48521897196769714,
709
- "learning_rate": 0.00015454545454545454,
710
- "loss": 0.0548,
711
- "step": 900
712
- },
713
- {
714
- "epoch": 2.2727272727272725,
715
- "eval_accuracy": 0.9471799462846912,
716
- "eval_loss": 0.19982792437076569,
717
- "eval_runtime": 72.9425,
718
- "eval_samples_per_second": 15.313,
719
- "eval_steps_per_second": 1.919,
720
- "step": 900
721
- },
722
- {
723
- "epoch": 2.297979797979798,
724
- "grad_norm": 0.017012102529406548,
725
- "learning_rate": 0.00015404040404040406,
726
- "loss": 0.1089,
727
- "step": 910
728
- },
729
- {
730
- "epoch": 2.323232323232323,
731
- "grad_norm": 0.2808210849761963,
732
- "learning_rate": 0.00015353535353535353,
733
- "loss": 0.0789,
734
- "step": 920
735
- },
736
- {
737
- "epoch": 2.3484848484848486,
738
- "grad_norm": 4.9768781661987305,
739
- "learning_rate": 0.00015303030303030304,
740
- "loss": 0.1004,
741
- "step": 930
742
- },
743
- {
744
- "epoch": 2.3737373737373737,
745
- "grad_norm": 1.5323927402496338,
746
- "learning_rate": 0.00015252525252525253,
747
- "loss": 0.0357,
748
- "step": 940
749
- },
750
- {
751
- "epoch": 2.398989898989899,
752
- "grad_norm": 4.321779251098633,
753
- "learning_rate": 0.00015202020202020202,
754
- "loss": 0.0348,
755
- "step": 950
756
- },
757
- {
758
- "epoch": 2.4242424242424243,
759
- "grad_norm": 6.227025032043457,
760
- "learning_rate": 0.00015151515151515152,
761
- "loss": 0.1679,
762
- "step": 960
763
- },
764
- {
765
- "epoch": 2.4494949494949494,
766
- "grad_norm": 1.045432209968567,
767
- "learning_rate": 0.00015101010101010103,
768
- "loss": 0.1222,
769
- "step": 970
770
- },
771
- {
772
- "epoch": 2.474747474747475,
773
- "grad_norm": 3.0685787200927734,
774
- "learning_rate": 0.0001505050505050505,
775
- "loss": 0.1434,
776
- "step": 980
777
- },
778
- {
779
- "epoch": 2.5,
780
- "grad_norm": 0.04191284626722336,
781
- "learning_rate": 0.00015000000000000001,
782
- "loss": 0.086,
783
- "step": 990
784
- },
785
- {
786
- "epoch": 2.525252525252525,
787
- "grad_norm": 3.1016695499420166,
788
- "learning_rate": 0.0001494949494949495,
789
- "loss": 0.1399,
790
- "step": 1000
791
- },
792
- {
793
- "epoch": 2.525252525252525,
794
- "eval_accuracy": 0.9444941808415398,
795
- "eval_loss": 0.19705650210380554,
796
- "eval_runtime": 73.3829,
797
- "eval_samples_per_second": 15.222,
798
- "eval_steps_per_second": 1.908,
799
- "step": 1000
800
- },
801
- {
802
- "epoch": 2.5505050505050506,
803
- "grad_norm": 4.877354145050049,
804
- "learning_rate": 0.000148989898989899,
805
- "loss": 0.1418,
806
- "step": 1010
807
- },
808
- {
809
- "epoch": 2.5757575757575757,
810
- "grad_norm": 4.7359700202941895,
811
- "learning_rate": 0.00014848484848484849,
812
- "loss": 0.1084,
813
- "step": 1020
814
- },
815
- {
816
- "epoch": 2.601010101010101,
817
- "grad_norm": 0.7143091559410095,
818
- "learning_rate": 0.000147979797979798,
819
- "loss": 0.1074,
820
- "step": 1030
821
- },
822
- {
823
- "epoch": 2.6262626262626263,
824
- "grad_norm": 0.4162321388721466,
825
- "learning_rate": 0.00014747474747474747,
826
- "loss": 0.1317,
827
- "step": 1040
828
- },
829
- {
830
- "epoch": 2.6515151515151514,
831
- "grad_norm": 5.558507442474365,
832
- "learning_rate": 0.00014696969696969698,
833
- "loss": 0.0829,
834
- "step": 1050
835
- },
836
- {
837
- "epoch": 2.676767676767677,
838
- "grad_norm": 0.08041220903396606,
839
- "learning_rate": 0.00014646464646464648,
840
- "loss": 0.0905,
841
- "step": 1060
842
- },
843
- {
844
- "epoch": 2.702020202020202,
845
- "grad_norm": 3.554946184158325,
846
- "learning_rate": 0.00014595959595959597,
847
- "loss": 0.14,
848
- "step": 1070
849
- },
850
- {
851
- "epoch": 2.7272727272727275,
852
- "grad_norm": 0.9108226895332336,
853
- "learning_rate": 0.00014545454545454546,
854
- "loss": 0.0355,
855
- "step": 1080
856
- },
857
- {
858
- "epoch": 2.7525252525252526,
859
- "grad_norm": 1.091728925704956,
860
- "learning_rate": 0.00014494949494949495,
861
- "loss": 0.059,
862
- "step": 1090
863
- },
864
- {
865
- "epoch": 2.7777777777777777,
866
- "grad_norm": 0.07620527595281601,
867
- "learning_rate": 0.00014444444444444444,
868
- "loss": 0.2001,
869
- "step": 1100
870
- },
871
- {
872
- "epoch": 2.7777777777777777,
873
- "eval_accuracy": 0.937332139659803,
874
- "eval_loss": 0.24790146946907043,
875
- "eval_runtime": 73.1059,
876
- "eval_samples_per_second": 15.279,
877
- "eval_steps_per_second": 1.915,
878
- "step": 1100
879
- },
880
- {
881
- "epoch": 2.8030303030303028,
882
- "grad_norm": 0.10709954053163528,
883
- "learning_rate": 0.00014393939393939396,
884
- "loss": 0.0487,
885
- "step": 1110
886
- },
887
- {
888
- "epoch": 2.8282828282828283,
889
- "grad_norm": 4.047976493835449,
890
- "learning_rate": 0.00014343434343434342,
891
- "loss": 0.0774,
892
- "step": 1120
893
- },
894
- {
895
- "epoch": 2.8535353535353534,
896
- "grad_norm": 2.409966468811035,
897
- "learning_rate": 0.00014292929292929294,
898
- "loss": 0.0744,
899
- "step": 1130
900
- },
901
- {
902
- "epoch": 2.878787878787879,
903
- "grad_norm": 0.3456668257713318,
904
- "learning_rate": 0.00014242424242424243,
905
- "loss": 0.0125,
906
- "step": 1140
907
- },
908
- {
909
- "epoch": 2.904040404040404,
910
- "grad_norm": 0.046853143721818924,
911
- "learning_rate": 0.00014191919191919192,
912
- "loss": 0.0756,
913
- "step": 1150
914
- },
915
- {
916
- "epoch": 2.929292929292929,
917
- "grad_norm": 3.4357807636260986,
918
- "learning_rate": 0.0001414141414141414,
919
- "loss": 0.1375,
920
- "step": 1160
921
- },
922
- {
923
- "epoch": 2.9545454545454546,
924
- "grad_norm": 1.010414719581604,
925
- "learning_rate": 0.00014090909090909093,
926
- "loss": 0.0704,
927
- "step": 1170
928
- },
929
- {
930
- "epoch": 2.9797979797979797,
931
- "grad_norm": 0.008091296069324017,
932
- "learning_rate": 0.00014040404040404042,
933
- "loss": 0.0791,
934
- "step": 1180
935
- },
936
- {
937
- "epoch": 3.005050505050505,
938
- "grad_norm": 1.9511629343032837,
939
- "learning_rate": 0.0001398989898989899,
940
- "loss": 0.0754,
941
- "step": 1190
942
- },
943
- {
944
- "epoch": 3.0303030303030303,
945
- "grad_norm": 10.075323104858398,
946
- "learning_rate": 0.0001393939393939394,
947
- "loss": 0.0976,
948
- "step": 1200
949
- },
950
- {
951
- "epoch": 3.0303030303030303,
952
- "eval_accuracy": 0.9498657117278424,
953
- "eval_loss": 0.16011768579483032,
954
- "eval_runtime": 73.2182,
955
- "eval_samples_per_second": 15.256,
956
- "eval_steps_per_second": 1.912,
957
- "step": 1200
958
- },
959
- {
960
- "epoch": 3.0555555555555554,
961
- "grad_norm": 0.027206294238567352,
962
- "learning_rate": 0.0001388888888888889,
963
- "loss": 0.0906,
964
- "step": 1210
965
- },
966
- {
967
- "epoch": 3.080808080808081,
968
- "grad_norm": 1.425262689590454,
969
- "learning_rate": 0.0001383838383838384,
970
- "loss": 0.0349,
971
- "step": 1220
972
- },
973
- {
974
- "epoch": 3.106060606060606,
975
- "grad_norm": 7.3463616371154785,
976
- "learning_rate": 0.0001378787878787879,
977
- "loss": 0.0804,
978
- "step": 1230
979
- },
980
- {
981
- "epoch": 3.1313131313131315,
982
- "grad_norm": 1.0737591981887817,
983
- "learning_rate": 0.0001373737373737374,
984
- "loss": 0.068,
985
- "step": 1240
986
- },
987
- {
988
- "epoch": 3.1565656565656566,
989
- "grad_norm": 7.525305271148682,
990
- "learning_rate": 0.00013686868686868688,
991
- "loss": 0.1145,
992
- "step": 1250
993
- },
994
- {
995
- "epoch": 3.1818181818181817,
996
- "grad_norm": 0.4561030864715576,
997
- "learning_rate": 0.00013636363636363637,
998
- "loss": 0.0977,
999
- "step": 1260
1000
- },
1001
- {
1002
- "epoch": 3.207070707070707,
1003
- "grad_norm": 0.11276185512542725,
1004
- "learning_rate": 0.00013585858585858586,
1005
- "loss": 0.0743,
1006
- "step": 1270
1007
- },
1008
- {
1009
- "epoch": 3.2323232323232323,
1010
- "grad_norm": 1.0171997547149658,
1011
- "learning_rate": 0.00013535353535353538,
1012
- "loss": 0.0775,
1013
- "step": 1280
1014
- },
1015
- {
1016
- "epoch": 3.257575757575758,
1017
- "grad_norm": 3.1414084434509277,
1018
- "learning_rate": 0.00013484848484848484,
1019
- "loss": 0.0309,
1020
- "step": 1290
1021
- },
1022
- {
1023
- "epoch": 3.282828282828283,
1024
- "grad_norm": 0.037932224571704865,
1025
- "learning_rate": 0.00013434343434343436,
1026
- "loss": 0.1291,
1027
- "step": 1300
1028
- },
1029
- {
1030
- "epoch": 3.282828282828283,
1031
- "eval_accuracy": 0.9588182632050134,
1032
- "eval_loss": 0.160703644156456,
1033
- "eval_runtime": 73.0017,
1034
- "eval_samples_per_second": 15.301,
1035
- "eval_steps_per_second": 1.918,
1036
- "step": 1300
1037
- },
1038
- {
1039
- "epoch": 3.308080808080808,
1040
- "grad_norm": 2.9155356884002686,
1041
- "learning_rate": 0.00013383838383838385,
1042
- "loss": 0.0215,
1043
- "step": 1310
1044
- },
1045
- {
1046
- "epoch": 3.3333333333333335,
1047
- "grad_norm": 5.102810382843018,
1048
- "learning_rate": 0.00013333333333333334,
1049
- "loss": 0.0716,
1050
- "step": 1320
1051
- },
1052
- {
1053
- "epoch": 3.3585858585858586,
1054
- "grad_norm": 0.020925424993038177,
1055
- "learning_rate": 0.00013282828282828283,
1056
- "loss": 0.0372,
1057
- "step": 1330
1058
- },
1059
- {
1060
- "epoch": 3.3838383838383836,
1061
- "grad_norm": 0.10292687267065048,
1062
- "learning_rate": 0.00013232323232323235,
1063
- "loss": 0.0211,
1064
- "step": 1340
1065
- },
1066
- {
1067
- "epoch": 3.409090909090909,
1068
- "grad_norm": 2.7968993186950684,
1069
- "learning_rate": 0.0001318181818181818,
1070
- "loss": 0.0708,
1071
- "step": 1350
1072
- },
1073
- {
1074
- "epoch": 3.4343434343434343,
1075
- "grad_norm": 3.1068055629730225,
1076
- "learning_rate": 0.00013131313131313133,
1077
- "loss": 0.1007,
1078
- "step": 1360
1079
- },
1080
- {
1081
- "epoch": 3.45959595959596,
1082
- "grad_norm": 0.032499730587005615,
1083
- "learning_rate": 0.00013080808080808082,
1084
- "loss": 0.0713,
1085
- "step": 1370
1086
- },
1087
- {
1088
- "epoch": 3.484848484848485,
1089
- "grad_norm": 0.20779326558113098,
1090
- "learning_rate": 0.0001303030303030303,
1091
- "loss": 0.048,
1092
- "step": 1380
1093
- },
1094
- {
1095
- "epoch": 3.51010101010101,
1096
- "grad_norm": 5.266826152801514,
1097
- "learning_rate": 0.0001297979797979798,
1098
- "loss": 0.193,
1099
- "step": 1390
1100
- },
1101
- {
1102
- "epoch": 3.5353535353535355,
1103
- "grad_norm": 0.42106470465660095,
1104
- "learning_rate": 0.00012929292929292932,
1105
- "loss": 0.0721,
1106
- "step": 1400
1107
- },
1108
- {
1109
- "epoch": 3.5353535353535355,
1110
- "eval_accuracy": 0.9588182632050134,
1111
- "eval_loss": 0.18219807744026184,
1112
- "eval_runtime": 73.033,
1113
- "eval_samples_per_second": 15.294,
1114
- "eval_steps_per_second": 1.917,
1115
- "step": 1400
1116
- },
1117
- {
1118
- "epoch": 3.5606060606060606,
1119
- "grad_norm": 1.7371455430984497,
1120
- "learning_rate": 0.00012878787878787878,
1121
- "loss": 0.0927,
1122
- "step": 1410
1123
- },
1124
- {
1125
- "epoch": 3.5858585858585856,
1126
- "grad_norm": 0.636141836643219,
1127
- "learning_rate": 0.0001282828282828283,
1128
- "loss": 0.0295,
1129
- "step": 1420
1130
- },
1131
- {
1132
- "epoch": 3.611111111111111,
1133
- "grad_norm": 0.10211779177188873,
1134
- "learning_rate": 0.00012777777777777776,
1135
- "loss": 0.0287,
1136
- "step": 1430
1137
- },
1138
- {
1139
- "epoch": 3.6363636363636362,
1140
- "grad_norm": 0.803653359413147,
1141
- "learning_rate": 0.00012727272727272728,
1142
- "loss": 0.0621,
1143
- "step": 1440
1144
- },
1145
- {
1146
- "epoch": 3.6616161616161618,
1147
- "grad_norm": 0.11753907799720764,
1148
- "learning_rate": 0.00012676767676767677,
1149
- "loss": 0.0465,
1150
- "step": 1450
1151
- },
1152
- {
1153
- "epoch": 3.686868686868687,
1154
- "grad_norm": 0.05394851416349411,
1155
- "learning_rate": 0.00012626262626262626,
1156
- "loss": 0.0474,
1157
- "step": 1460
1158
- },
1159
- {
1160
- "epoch": 3.712121212121212,
1161
- "grad_norm": 3.631462574005127,
1162
- "learning_rate": 0.00012575757575757575,
1163
- "loss": 0.093,
1164
- "step": 1470
1165
- },
1166
- {
1167
- "epoch": 3.7373737373737375,
1168
- "grad_norm": 0.1336178481578827,
1169
- "learning_rate": 0.00012525252525252527,
1170
- "loss": 0.0736,
1171
- "step": 1480
1172
- },
1173
- {
1174
- "epoch": 3.7626262626262625,
1175
- "grad_norm": 0.0858420580625534,
1176
- "learning_rate": 0.00012474747474747473,
1177
- "loss": 0.1211,
1178
- "step": 1490
1179
- },
1180
- {
1181
- "epoch": 3.787878787878788,
1182
- "grad_norm": 1.1731150150299072,
1183
- "learning_rate": 0.00012424242424242425,
1184
- "loss": 0.0592,
1185
- "step": 1500
1186
- },
1187
- {
1188
- "epoch": 3.787878787878788,
1189
- "eval_accuracy": 0.9623992837958818,
1190
- "eval_loss": 0.12546713650226593,
1191
- "eval_runtime": 73.0966,
1192
- "eval_samples_per_second": 15.281,
1193
- "eval_steps_per_second": 1.915,
1194
- "step": 1500
1195
- },
1196
- {
1197
- "epoch": 3.813131313131313,
1198
- "grad_norm": 1.533412218093872,
1199
- "learning_rate": 0.00012373737373737374,
1200
- "loss": 0.0663,
1201
- "step": 1510
1202
- },
1203
- {
1204
- "epoch": 3.8383838383838382,
1205
- "grad_norm": 7.734765529632568,
1206
- "learning_rate": 0.00012323232323232323,
1207
- "loss": 0.075,
1208
- "step": 1520
1209
- },
1210
- {
1211
- "epoch": 3.8636363636363638,
1212
- "grad_norm": 0.4143606126308441,
1213
- "learning_rate": 0.00012272727272727272,
1214
- "loss": 0.0158,
1215
- "step": 1530
1216
- },
1217
- {
1218
- "epoch": 3.888888888888889,
1219
- "grad_norm": 4.032654762268066,
1220
- "learning_rate": 0.00012222222222222224,
1221
- "loss": 0.0898,
1222
- "step": 1540
1223
- },
1224
- {
1225
- "epoch": 3.9141414141414144,
1226
- "grad_norm": 0.2919144928455353,
1227
- "learning_rate": 0.00012171717171717172,
1228
- "loss": 0.0904,
1229
- "step": 1550
1230
- },
1231
- {
1232
- "epoch": 3.9393939393939394,
1233
- "grad_norm": 6.036355018615723,
1234
- "learning_rate": 0.00012121212121212122,
1235
- "loss": 0.0725,
1236
- "step": 1560
1237
- },
1238
- {
1239
- "epoch": 3.9646464646464645,
1240
- "grad_norm": 0.34402996301651,
1241
- "learning_rate": 0.0001207070707070707,
1242
- "loss": 0.0643,
1243
- "step": 1570
1244
- },
1245
- {
1246
- "epoch": 3.98989898989899,
1247
- "grad_norm": 0.307706356048584,
1248
- "learning_rate": 0.0001202020202020202,
1249
- "loss": 0.1061,
1250
- "step": 1580
1251
- },
1252
- {
1253
- "epoch": 4.015151515151516,
1254
- "grad_norm": 0.04210241511464119,
1255
- "learning_rate": 0.00011969696969696971,
1256
- "loss": 0.1015,
1257
- "step": 1590
1258
- },
1259
- {
1260
- "epoch": 4.040404040404041,
1261
- "grad_norm": 4.686149597167969,
1262
- "learning_rate": 0.00011919191919191919,
1263
- "loss": 0.0964,
1264
- "step": 1600
1265
- },
1266
- {
1267
- "epoch": 4.040404040404041,
1268
- "eval_accuracy": 0.954341987466428,
1269
- "eval_loss": 0.16204935312271118,
1270
- "eval_runtime": 72.8935,
1271
- "eval_samples_per_second": 15.324,
1272
- "eval_steps_per_second": 1.921,
1273
- "step": 1600
1274
- },
1275
- {
1276
- "epoch": 4.065656565656566,
1277
- "grad_norm": 0.9774217009544373,
1278
- "learning_rate": 0.00011868686868686869,
1279
- "loss": 0.0342,
1280
- "step": 1610
1281
- },
1282
- {
1283
- "epoch": 4.090909090909091,
1284
- "grad_norm": 2.1450870037078857,
1285
- "learning_rate": 0.0001181818181818182,
1286
- "loss": 0.0852,
1287
- "step": 1620
1288
- },
1289
- {
1290
- "epoch": 4.116161616161616,
1291
- "grad_norm": 4.826761722564697,
1292
- "learning_rate": 0.00011767676767676767,
1293
- "loss": 0.0612,
1294
- "step": 1630
1295
- },
1296
- {
1297
- "epoch": 4.141414141414141,
1298
- "grad_norm": 0.7088700532913208,
1299
- "learning_rate": 0.00011717171717171717,
1300
- "loss": 0.0369,
1301
- "step": 1640
1302
- },
1303
- {
1304
- "epoch": 4.166666666666667,
1305
- "grad_norm": 0.07485224306583405,
1306
- "learning_rate": 0.00011666666666666668,
1307
- "loss": 0.0075,
1308
- "step": 1650
1309
- },
1310
- {
1311
- "epoch": 4.191919191919192,
1312
- "grad_norm": 7.588441371917725,
1313
- "learning_rate": 0.00011616161616161616,
1314
- "loss": 0.0492,
1315
- "step": 1660
1316
- },
1317
- {
1318
- "epoch": 4.217171717171717,
1319
- "grad_norm": 0.06588041037321091,
1320
- "learning_rate": 0.00011565656565656566,
1321
- "loss": 0.0619,
1322
- "step": 1670
1323
- },
1324
- {
1325
- "epoch": 4.242424242424242,
1326
- "grad_norm": 0.3317614495754242,
1327
- "learning_rate": 0.00011515151515151516,
1328
- "loss": 0.0504,
1329
- "step": 1680
1330
- },
1331
- {
1332
- "epoch": 4.267676767676767,
1333
- "grad_norm": 4.261381149291992,
1334
- "learning_rate": 0.00011464646464646464,
1335
- "loss": 0.0534,
1336
- "step": 1690
1337
- },
1338
- {
1339
- "epoch": 4.292929292929293,
1340
- "grad_norm": 1.7030925750732422,
1341
- "learning_rate": 0.00011414141414141415,
1342
- "loss": 0.0738,
1343
- "step": 1700
1344
- },
1345
- {
1346
- "epoch": 4.292929292929293,
1347
- "eval_accuracy": 0.9650850492390332,
1348
- "eval_loss": 0.12794509530067444,
1349
- "eval_runtime": 73.4006,
1350
- "eval_samples_per_second": 15.218,
1351
- "eval_steps_per_second": 1.907,
1352
- "step": 1700
1353
- },
1354
- {
1355
- "epoch": 4.318181818181818,
1356
- "grad_norm": 3.9137349128723145,
1357
- "learning_rate": 0.00011363636363636365,
1358
- "loss": 0.0269,
1359
- "step": 1710
1360
- },
1361
- {
1362
- "epoch": 4.343434343434343,
1363
- "grad_norm": 0.012919370085000992,
1364
- "learning_rate": 0.00011313131313131313,
1365
- "loss": 0.0314,
1366
- "step": 1720
1367
- },
1368
- {
1369
- "epoch": 4.3686868686868685,
1370
- "grad_norm": 0.07363598793745041,
1371
- "learning_rate": 0.00011262626262626263,
1372
- "loss": 0.0233,
1373
- "step": 1730
1374
- },
1375
- {
1376
- "epoch": 4.393939393939394,
1377
- "grad_norm": 0.137301966547966,
1378
- "learning_rate": 0.00011212121212121212,
1379
- "loss": 0.0863,
1380
- "step": 1740
1381
- },
1382
- {
1383
- "epoch": 4.41919191919192,
1384
- "grad_norm": 6.548308849334717,
1385
- "learning_rate": 0.00011161616161616161,
1386
- "loss": 0.0463,
1387
- "step": 1750
1388
- },
1389
- {
1390
- "epoch": 4.444444444444445,
1391
- "grad_norm": 2.40230655670166,
1392
- "learning_rate": 0.00011111111111111112,
1393
- "loss": 0.0668,
1394
- "step": 1760
1395
- },
1396
- {
1397
- "epoch": 4.46969696969697,
1398
- "grad_norm": 0.018276751041412354,
1399
- "learning_rate": 0.00011060606060606061,
1400
- "loss": 0.0193,
1401
- "step": 1770
1402
- },
1403
- {
1404
- "epoch": 4.494949494949495,
1405
- "grad_norm": 4.558255195617676,
1406
- "learning_rate": 0.00011010101010101011,
1407
- "loss": 0.1149,
1408
- "step": 1780
1409
- },
1410
- {
1411
- "epoch": 4.52020202020202,
1412
- "grad_norm": 0.04581284150481224,
1413
- "learning_rate": 0.0001095959595959596,
1414
- "loss": 0.0227,
1415
- "step": 1790
1416
- },
1417
- {
1418
- "epoch": 4.545454545454545,
1419
- "grad_norm": 1.2669509649276733,
1420
- "learning_rate": 0.00010909090909090909,
1421
- "loss": 0.0504,
1422
- "step": 1800
1423
- },
1424
- {
1425
- "epoch": 4.545454545454545,
1426
- "eval_accuracy": 0.9588182632050134,
1427
- "eval_loss": 0.16235476732254028,
1428
- "eval_runtime": 73.0538,
1429
- "eval_samples_per_second": 15.29,
1430
- "eval_steps_per_second": 1.916,
1431
- "step": 1800
1432
- },
1433
- {
1434
- "epoch": 4.570707070707071,
1435
- "grad_norm": 0.07127434760332108,
1436
- "learning_rate": 0.0001085858585858586,
1437
- "loss": 0.0492,
1438
- "step": 1810
1439
- },
1440
- {
1441
- "epoch": 4.595959595959596,
1442
- "grad_norm": 1.7907336950302124,
1443
- "learning_rate": 0.00010808080808080809,
1444
- "loss": 0.0358,
1445
- "step": 1820
1446
- },
1447
- {
1448
- "epoch": 4.621212121212121,
1449
- "grad_norm": 4.024843692779541,
1450
- "learning_rate": 0.00010757575757575758,
1451
- "loss": 0.0856,
1452
- "step": 1830
1453
- },
1454
- {
1455
- "epoch": 4.646464646464646,
1456
- "grad_norm": 0.020713260397315025,
1457
- "learning_rate": 0.00010707070707070708,
1458
- "loss": 0.0101,
1459
- "step": 1840
1460
- },
1461
- {
1462
- "epoch": 4.671717171717171,
1463
- "grad_norm": 0.06845160573720932,
1464
- "learning_rate": 0.00010656565656565659,
1465
- "loss": 0.0153,
1466
- "step": 1850
1467
- },
1468
- {
1469
- "epoch": 4.696969696969697,
1470
- "grad_norm": 1.0333762168884277,
1471
- "learning_rate": 0.00010606060606060606,
1472
- "loss": 0.1535,
1473
- "step": 1860
1474
- },
1475
- {
1476
- "epoch": 4.722222222222222,
1477
- "grad_norm": 0.019528638571500778,
1478
- "learning_rate": 0.00010555555555555557,
1479
- "loss": 0.089,
1480
- "step": 1870
1481
- },
1482
- {
1483
- "epoch": 4.747474747474747,
1484
- "grad_norm": 0.12054427713155746,
1485
- "learning_rate": 0.00010505050505050507,
1486
- "loss": 0.0154,
1487
- "step": 1880
1488
- },
1489
- {
1490
- "epoch": 4.7727272727272725,
1491
- "grad_norm": 0.053187351673841476,
1492
- "learning_rate": 0.00010454545454545455,
1493
- "loss": 0.1073,
1494
- "step": 1890
1495
- },
1496
- {
1497
- "epoch": 4.797979797979798,
1498
- "grad_norm": 0.03637217357754707,
1499
- "learning_rate": 0.00010404040404040405,
1500
- "loss": 0.0972,
1501
- "step": 1900
1502
- },
1503
- {
1504
- "epoch": 4.797979797979798,
1505
- "eval_accuracy": 0.9623992837958818,
1506
- "eval_loss": 0.15791860222816467,
1507
- "eval_runtime": 73.2114,
1508
- "eval_samples_per_second": 15.257,
1509
- "eval_steps_per_second": 1.912,
1510
- "step": 1900
1511
- },
1512
- {
1513
- "epoch": 4.8232323232323235,
1514
- "grad_norm": 6.812131404876709,
1515
- "learning_rate": 0.00010353535353535353,
1516
- "loss": 0.1274,
1517
- "step": 1910
1518
- },
1519
- {
1520
- "epoch": 4.848484848484849,
1521
- "grad_norm": 2.3793511390686035,
1522
- "learning_rate": 0.00010303030303030303,
1523
- "loss": 0.1051,
1524
- "step": 1920
1525
- },
1526
- {
1527
- "epoch": 4.873737373737374,
1528
- "grad_norm": 1.2393810749053955,
1529
- "learning_rate": 0.00010252525252525254,
1530
- "loss": 0.0167,
1531
- "step": 1930
1532
- },
1533
- {
1534
- "epoch": 4.898989898989899,
1535
- "grad_norm": 1.5232930183410645,
1536
- "learning_rate": 0.00010202020202020202,
1537
- "loss": 0.0065,
1538
- "step": 1940
1539
- },
1540
- {
1541
- "epoch": 4.924242424242424,
1542
- "grad_norm": 0.00905653741210699,
1543
- "learning_rate": 0.00010151515151515152,
1544
- "loss": 0.0419,
1545
- "step": 1950
1546
- },
1547
- {
1548
- "epoch": 4.94949494949495,
1549
- "grad_norm": 0.8604415655136108,
1550
- "learning_rate": 0.00010101010101010102,
1551
- "loss": 0.0769,
1552
- "step": 1960
1553
- },
1554
- {
1555
- "epoch": 4.974747474747475,
1556
- "grad_norm": 4.089222431182861,
1557
- "learning_rate": 0.0001005050505050505,
1558
- "loss": 0.0366,
1559
- "step": 1970
1560
- },
1561
- {
1562
- "epoch": 5.0,
1563
- "grad_norm": 2.2072501182556152,
1564
- "learning_rate": 0.0001,
1565
- "loss": 0.0746,
1566
- "step": 1980
1567
- },
1568
- {
1569
- "epoch": 5.025252525252525,
1570
- "grad_norm": 0.010899940505623817,
1571
- "learning_rate": 9.94949494949495e-05,
1572
- "loss": 0.0597,
1573
- "step": 1990
1574
- },
1575
- {
1576
- "epoch": 5.05050505050505,
1577
- "grad_norm": 1.6260383129119873,
1578
- "learning_rate": 9.8989898989899e-05,
1579
- "loss": 0.0456,
1580
- "step": 2000
1581
- },
1582
- {
1583
- "epoch": 5.05050505050505,
1584
- "eval_accuracy": 0.9489704565801254,
1585
- "eval_loss": 0.19649948179721832,
1586
- "eval_runtime": 73.1131,
1587
- "eval_samples_per_second": 15.278,
1588
- "eval_steps_per_second": 1.915,
1589
- "step": 2000
1590
- },
1591
- {
1592
- "epoch": 5.075757575757576,
1593
- "grad_norm": 0.009620290249586105,
1594
- "learning_rate": 9.848484848484849e-05,
1595
- "loss": 0.018,
1596
- "step": 2010
1597
- },
1598
- {
1599
- "epoch": 5.101010101010101,
1600
- "grad_norm": 4.627386093139648,
1601
- "learning_rate": 9.797979797979798e-05,
1602
- "loss": 0.0906,
1603
- "step": 2020
1604
- },
1605
- {
1606
- "epoch": 5.126262626262626,
1607
- "grad_norm": 0.5775233507156372,
1608
- "learning_rate": 9.747474747474747e-05,
1609
- "loss": 0.0179,
1610
- "step": 2030
1611
- },
1612
- {
1613
- "epoch": 5.151515151515151,
1614
- "grad_norm": 0.3100966513156891,
1615
- "learning_rate": 9.696969696969698e-05,
1616
- "loss": 0.0225,
1617
- "step": 2040
1618
- },
1619
- {
1620
- "epoch": 5.1767676767676765,
1621
- "grad_norm": 0.012251541949808598,
1622
- "learning_rate": 9.646464646464647e-05,
1623
- "loss": 0.0062,
1624
- "step": 2050
1625
- },
1626
- {
1627
- "epoch": 5.202020202020202,
1628
- "grad_norm": 3.9397971630096436,
1629
- "learning_rate": 9.595959595959596e-05,
1630
- "loss": 0.0497,
1631
- "step": 2060
1632
- },
1633
- {
1634
- "epoch": 5.2272727272727275,
1635
- "grad_norm": 0.002988005056977272,
1636
- "learning_rate": 9.545454545454546e-05,
1637
- "loss": 0.0242,
1638
- "step": 2070
1639
- },
1640
- {
1641
- "epoch": 5.252525252525253,
1642
- "grad_norm": 0.15744374692440033,
1643
- "learning_rate": 9.494949494949495e-05,
1644
- "loss": 0.0165,
1645
- "step": 2080
1646
- },
1647
- {
1648
- "epoch": 5.277777777777778,
1649
- "grad_norm": 2.624490976333618,
1650
- "learning_rate": 9.444444444444444e-05,
1651
- "loss": 0.0595,
1652
- "step": 2090
1653
- },
1654
- {
1655
- "epoch": 5.303030303030303,
1656
- "grad_norm": 1.7126376628875732,
1657
- "learning_rate": 9.393939393939395e-05,
1658
- "loss": 0.0334,
1659
- "step": 2100
1660
- },
1661
- {
1662
- "epoch": 5.303030303030303,
1663
- "eval_accuracy": 0.9570277529095792,
1664
- "eval_loss": 0.165226012468338,
1665
- "eval_runtime": 73.2601,
1666
- "eval_samples_per_second": 15.247,
1667
- "eval_steps_per_second": 1.911,
1668
- "step": 2100
1669
- },
1670
- {
1671
- "epoch": 5.328282828282829,
1672
- "grad_norm": 0.003406533505767584,
1673
- "learning_rate": 9.343434343434344e-05,
1674
- "loss": 0.0201,
1675
- "step": 2110
1676
- },
1677
- {
1678
- "epoch": 5.353535353535354,
1679
- "grad_norm": 0.18647323548793793,
1680
- "learning_rate": 9.292929292929293e-05,
1681
- "loss": 0.0471,
1682
- "step": 2120
1683
- },
1684
- {
1685
- "epoch": 5.378787878787879,
1686
- "grad_norm": 4.275173664093018,
1687
- "learning_rate": 9.242424242424242e-05,
1688
- "loss": 0.0565,
1689
- "step": 2130
1690
- },
1691
- {
1692
- "epoch": 5.404040404040404,
1693
- "grad_norm": 3.319251537322998,
1694
- "learning_rate": 9.191919191919192e-05,
1695
- "loss": 0.0687,
1696
- "step": 2140
1697
- },
1698
- {
1699
- "epoch": 5.429292929292929,
1700
- "grad_norm": 0.067157082259655,
1701
- "learning_rate": 9.141414141414141e-05,
1702
- "loss": 0.0507,
1703
- "step": 2150
1704
- },
1705
- {
1706
- "epoch": 5.454545454545454,
1707
- "grad_norm": 0.18047641217708588,
1708
- "learning_rate": 9.090909090909092e-05,
1709
- "loss": 0.0555,
1710
- "step": 2160
1711
- },
1712
- {
1713
- "epoch": 5.47979797979798,
1714
- "grad_norm": 0.0075127603486180305,
1715
- "learning_rate": 9.040404040404041e-05,
1716
- "loss": 0.0488,
1717
- "step": 2170
1718
- },
1719
- {
1720
- "epoch": 5.505050505050505,
1721
- "grad_norm": 0.01690557599067688,
1722
- "learning_rate": 8.98989898989899e-05,
1723
- "loss": 0.0626,
1724
- "step": 2180
1725
- },
1726
- {
1727
- "epoch": 5.53030303030303,
1728
- "grad_norm": 0.005741783883422613,
1729
- "learning_rate": 8.93939393939394e-05,
1730
- "loss": 0.0014,
1731
- "step": 2190
1732
- },
1733
- {
1734
- "epoch": 5.555555555555555,
1735
- "grad_norm": 0.05627870187163353,
1736
- "learning_rate": 8.888888888888889e-05,
1737
- "loss": 0.0242,
1738
- "step": 2200
1739
- },
1740
- {
1741
- "epoch": 5.555555555555555,
1742
- "eval_accuracy": 0.9749328558639212,
1743
- "eval_loss": 0.11822798103094101,
1744
- "eval_runtime": 73.1232,
1745
- "eval_samples_per_second": 15.276,
1746
- "eval_steps_per_second": 1.915,
1747
- "step": 2200
1748
- },
1749
- {
1750
- "epoch": 5.58080808080808,
1751
- "grad_norm": 0.012817220762372017,
1752
- "learning_rate": 8.83838383838384e-05,
1753
- "loss": 0.0277,
1754
- "step": 2210
1755
- },
1756
- {
1757
- "epoch": 5.606060606060606,
1758
- "grad_norm": 0.00884329341351986,
1759
- "learning_rate": 8.787878787878789e-05,
1760
- "loss": 0.0067,
1761
- "step": 2220
1762
- },
1763
- {
1764
- "epoch": 5.6313131313131315,
1765
- "grad_norm": 0.034603264182806015,
1766
- "learning_rate": 8.737373737373738e-05,
1767
- "loss": 0.0702,
1768
- "step": 2230
1769
- },
1770
- {
1771
- "epoch": 5.656565656565657,
1772
- "grad_norm": 0.0622437559068203,
1773
- "learning_rate": 8.686868686868688e-05,
1774
- "loss": 0.0171,
1775
- "step": 2240
1776
- },
1777
- {
1778
- "epoch": 5.681818181818182,
1779
- "grad_norm": 0.04042644053697586,
1780
- "learning_rate": 8.636363636363637e-05,
1781
- "loss": 0.0592,
1782
- "step": 2250
1783
- },
1784
- {
1785
- "epoch": 5.707070707070707,
1786
- "grad_norm": 0.04215148836374283,
1787
- "learning_rate": 8.585858585858586e-05,
1788
- "loss": 0.0761,
1789
- "step": 2260
1790
- },
1791
- {
1792
- "epoch": 5.732323232323233,
1793
- "grad_norm": 0.22815492749214172,
1794
- "learning_rate": 8.535353535353535e-05,
1795
- "loss": 0.0133,
1796
- "step": 2270
1797
- },
1798
- {
1799
- "epoch": 5.757575757575758,
1800
- "grad_norm": 0.3139846622943878,
1801
- "learning_rate": 8.484848484848486e-05,
1802
- "loss": 0.0013,
1803
- "step": 2280
1804
- },
1805
- {
1806
- "epoch": 5.782828282828283,
1807
- "grad_norm": 0.008748591877520084,
1808
- "learning_rate": 8.434343434343435e-05,
1809
- "loss": 0.036,
1810
- "step": 2290
1811
- },
1812
- {
1813
- "epoch": 5.808080808080808,
1814
- "grad_norm": 0.10703355818986893,
1815
- "learning_rate": 8.383838383838384e-05,
1816
- "loss": 0.0715,
1817
- "step": 2300
1818
- },
1819
- {
1820
- "epoch": 5.808080808080808,
1821
- "eval_accuracy": 0.9650850492390332,
1822
- "eval_loss": 0.12497912347316742,
1823
- "eval_runtime": 72.9451,
1824
- "eval_samples_per_second": 15.313,
1825
- "eval_steps_per_second": 1.919,
1826
- "step": 2300
1827
- },
1828
- {
1829
- "epoch": 5.833333333333333,
1830
- "grad_norm": 0.02993335947394371,
1831
- "learning_rate": 8.333333333333334e-05,
1832
- "loss": 0.017,
1833
- "step": 2310
1834
- },
1835
- {
1836
- "epoch": 5.858585858585858,
1837
- "grad_norm": 0.004180525429546833,
1838
- "learning_rate": 8.282828282828283e-05,
1839
- "loss": 0.0388,
1840
- "step": 2320
1841
- },
1842
- {
1843
- "epoch": 5.883838383838384,
1844
- "grad_norm": 0.0341310054063797,
1845
- "learning_rate": 8.232323232323233e-05,
1846
- "loss": 0.0193,
1847
- "step": 2330
1848
- },
1849
- {
1850
- "epoch": 5.909090909090909,
1851
- "grad_norm": 0.02368093468248844,
1852
- "learning_rate": 8.181818181818183e-05,
1853
- "loss": 0.0314,
1854
- "step": 2340
1855
- },
1856
- {
1857
- "epoch": 5.934343434343434,
1858
- "grad_norm": 0.01623358018696308,
1859
- "learning_rate": 8.131313131313132e-05,
1860
- "loss": 0.0578,
1861
- "step": 2350
1862
- },
1863
- {
1864
- "epoch": 5.959595959595959,
1865
- "grad_norm": 0.006059895269572735,
1866
- "learning_rate": 8.080808080808081e-05,
1867
- "loss": 0.0066,
1868
- "step": 2360
1869
- },
1870
- {
1871
- "epoch": 5.984848484848484,
1872
- "grad_norm": 0.024945911020040512,
1873
- "learning_rate": 8.03030303030303e-05,
1874
- "loss": 0.0032,
1875
- "step": 2370
1876
- },
1877
- {
1878
- "epoch": 6.01010101010101,
1879
- "grad_norm": 0.010317071340978146,
1880
- "learning_rate": 7.97979797979798e-05,
1881
- "loss": 0.0047,
1882
- "step": 2380
1883
- },
1884
- {
1885
- "epoch": 6.0353535353535355,
1886
- "grad_norm": 0.4775066673755646,
1887
- "learning_rate": 7.92929292929293e-05,
1888
- "loss": 0.0193,
1889
- "step": 2390
1890
- },
1891
- {
1892
- "epoch": 6.0606060606060606,
1893
- "grad_norm": 6.233785629272461,
1894
- "learning_rate": 7.878787878787879e-05,
1895
- "loss": 0.0407,
1896
- "step": 2400
1897
- },
1898
- {
1899
- "epoch": 6.0606060606060606,
1900
- "eval_accuracy": 0.9695613249776186,
1901
- "eval_loss": 0.11715386807918549,
1902
- "eval_runtime": 73.3488,
1903
- "eval_samples_per_second": 15.229,
1904
- "eval_steps_per_second": 1.909,
1905
- "step": 2400
1906
- },
1907
- {
1908
- "epoch": 6.085858585858586,
1909
- "grad_norm": 0.04230092465877533,
1910
- "learning_rate": 7.828282828282829e-05,
1911
- "loss": 0.0028,
1912
- "step": 2410
1913
- },
1914
- {
1915
- "epoch": 6.111111111111111,
1916
- "grad_norm": 0.0015748771838843822,
1917
- "learning_rate": 7.777777777777778e-05,
1918
- "loss": 0.0421,
1919
- "step": 2420
1920
- },
1921
- {
1922
- "epoch": 6.136363636363637,
1923
- "grad_norm": 0.00564368162304163,
1924
- "learning_rate": 7.727272727272727e-05,
1925
- "loss": 0.0631,
1926
- "step": 2430
1927
- },
1928
- {
1929
- "epoch": 6.161616161616162,
1930
- "grad_norm": 0.4366774559020996,
1931
- "learning_rate": 7.676767676767676e-05,
1932
- "loss": 0.0429,
1933
- "step": 2440
1934
- },
1935
- {
1936
- "epoch": 6.186868686868687,
1937
- "grad_norm": 0.6611001491546631,
1938
- "learning_rate": 7.626262626262627e-05,
1939
- "loss": 0.0901,
1940
- "step": 2450
1941
- },
1942
- {
1943
- "epoch": 6.212121212121212,
1944
- "grad_norm": 5.706575870513916,
1945
- "learning_rate": 7.575757575757576e-05,
1946
- "loss": 0.0857,
1947
- "step": 2460
1948
- },
1949
- {
1950
- "epoch": 6.237373737373737,
1951
- "grad_norm": 0.007969530299305916,
1952
- "learning_rate": 7.525252525252525e-05,
1953
- "loss": 0.0227,
1954
- "step": 2470
1955
- },
1956
- {
1957
- "epoch": 6.262626262626263,
1958
- "grad_norm": 0.28915736079216003,
1959
- "learning_rate": 7.474747474747475e-05,
1960
- "loss": 0.0113,
1961
- "step": 2480
1962
- },
1963
- {
1964
- "epoch": 6.287878787878788,
1965
- "grad_norm": 0.2088274508714676,
1966
- "learning_rate": 7.424242424242424e-05,
1967
- "loss": 0.0026,
1968
- "step": 2490
1969
- },
1970
- {
1971
- "epoch": 6.313131313131313,
1972
- "grad_norm": 0.004980772268027067,
1973
- "learning_rate": 7.373737373737373e-05,
1974
- "loss": 0.0003,
1975
- "step": 2500
1976
- },
1977
- {
1978
- "epoch": 6.313131313131313,
1979
- "eval_accuracy": 0.9785138764547896,
1980
- "eval_loss": 0.08193562924861908,
1981
- "eval_runtime": 73.1145,
1982
- "eval_samples_per_second": 15.277,
1983
- "eval_steps_per_second": 1.915,
1984
- "step": 2500
1985
- },
1986
- {
1987
- "epoch": 6.338383838383838,
1988
- "grad_norm": 0.001987410243600607,
1989
- "learning_rate": 7.323232323232324e-05,
1990
- "loss": 0.0383,
1991
- "step": 2510
1992
- },
1993
- {
1994
- "epoch": 6.363636363636363,
1995
- "grad_norm": 1.1499226093292236,
1996
- "learning_rate": 7.272727272727273e-05,
1997
- "loss": 0.0171,
1998
- "step": 2520
1999
- },
2000
- {
2001
- "epoch": 6.388888888888889,
2002
- "grad_norm": 0.03895330801606178,
2003
- "learning_rate": 7.222222222222222e-05,
2004
- "loss": 0.0127,
2005
- "step": 2530
2006
- },
2007
- {
2008
- "epoch": 6.414141414141414,
2009
- "grad_norm": 0.3166453540325165,
2010
- "learning_rate": 7.171717171717171e-05,
2011
- "loss": 0.0278,
2012
- "step": 2540
2013
- },
2014
- {
2015
- "epoch": 6.4393939393939394,
2016
- "grad_norm": 0.005140668712556362,
2017
- "learning_rate": 7.121212121212121e-05,
2018
- "loss": 0.0795,
2019
- "step": 2550
2020
- },
2021
- {
2022
- "epoch": 6.4646464646464645,
2023
- "grad_norm": 14.462100982666016,
2024
- "learning_rate": 7.07070707070707e-05,
2025
- "loss": 0.085,
2026
- "step": 2560
2027
- },
2028
- {
2029
- "epoch": 6.48989898989899,
2030
- "grad_norm": 0.24089215695858002,
2031
- "learning_rate": 7.020202020202021e-05,
2032
- "loss": 0.0026,
2033
- "step": 2570
2034
- },
2035
- {
2036
- "epoch": 6.515151515151516,
2037
- "grad_norm": 0.22834239900112152,
2038
- "learning_rate": 6.96969696969697e-05,
2039
- "loss": 0.005,
2040
- "step": 2580
2041
- },
2042
- {
2043
- "epoch": 6.540404040404041,
2044
- "grad_norm": 8.35010814666748,
2045
- "learning_rate": 6.91919191919192e-05,
2046
- "loss": 0.0728,
2047
- "step": 2590
2048
- },
2049
- {
2050
- "epoch": 6.565656565656566,
2051
- "grad_norm": 4.920100212097168,
2052
- "learning_rate": 6.86868686868687e-05,
2053
- "loss": 0.0072,
2054
- "step": 2600
2055
- },
2056
- {
2057
- "epoch": 6.565656565656566,
2058
- "eval_accuracy": 0.9713518352730528,
2059
- "eval_loss": 0.14060670137405396,
2060
- "eval_runtime": 73.0266,
2061
- "eval_samples_per_second": 15.296,
2062
- "eval_steps_per_second": 1.917,
2063
- "step": 2600
2064
- },
2065
- {
2066
- "epoch": 6.590909090909091,
2067
- "grad_norm": 0.23918700218200684,
2068
- "learning_rate": 6.818181818181818e-05,
2069
- "loss": 0.0821,
2070
- "step": 2610
2071
- },
2072
- {
2073
- "epoch": 6.616161616161616,
2074
- "grad_norm": 0.06384919583797455,
2075
- "learning_rate": 6.767676767676769e-05,
2076
- "loss": 0.0761,
2077
- "step": 2620
2078
- },
2079
- {
2080
- "epoch": 6.641414141414142,
2081
- "grad_norm": 0.4447100758552551,
2082
- "learning_rate": 6.717171717171718e-05,
2083
- "loss": 0.0139,
2084
- "step": 2630
2085
- },
2086
- {
2087
- "epoch": 6.666666666666667,
2088
- "grad_norm": 0.0030958615243434906,
2089
- "learning_rate": 6.666666666666667e-05,
2090
- "loss": 0.0341,
2091
- "step": 2640
2092
- },
2093
- {
2094
- "epoch": 6.691919191919192,
2095
- "grad_norm": 0.05117692053318024,
2096
- "learning_rate": 6.616161616161617e-05,
2097
- "loss": 0.0152,
2098
- "step": 2650
2099
- },
2100
- {
2101
- "epoch": 6.717171717171717,
2102
- "grad_norm": 0.003273693146184087,
2103
- "learning_rate": 6.565656565656566e-05,
2104
- "loss": 0.0314,
2105
- "step": 2660
2106
- },
2107
- {
2108
- "epoch": 6.742424242424242,
2109
- "grad_norm": 0.005075991619378328,
2110
- "learning_rate": 6.515151515151516e-05,
2111
- "loss": 0.0164,
2112
- "step": 2670
2113
- },
2114
- {
2115
- "epoch": 6.767676767676767,
2116
- "grad_norm": 0.23585616052150726,
2117
- "learning_rate": 6.464646464646466e-05,
2118
- "loss": 0.0139,
2119
- "step": 2680
2120
- },
2121
- {
2122
- "epoch": 6.792929292929293,
2123
- "grad_norm": 6.123977184295654,
2124
- "learning_rate": 6.414141414141415e-05,
2125
- "loss": 0.0113,
2126
- "step": 2690
2127
- },
2128
- {
2129
- "epoch": 6.818181818181818,
2130
- "grad_norm": 2.395871162414551,
2131
- "learning_rate": 6.363636363636364e-05,
2132
- "loss": 0.0183,
2133
- "step": 2700
2134
- },
2135
- {
2136
- "epoch": 6.818181818181818,
2137
- "eval_accuracy": 0.9749328558639212,
2138
- "eval_loss": 0.11515188962221146,
2139
- "eval_runtime": 73.0277,
2140
- "eval_samples_per_second": 15.296,
2141
- "eval_steps_per_second": 1.917,
2142
- "step": 2700
2143
- },
2144
- {
2145
- "epoch": 6.843434343434343,
2146
- "grad_norm": 0.005218807607889175,
2147
- "learning_rate": 6.313131313131313e-05,
2148
- "loss": 0.003,
2149
- "step": 2710
2150
- },
2151
- {
2152
- "epoch": 6.8686868686868685,
2153
- "grad_norm": 0.0012497535208240151,
2154
- "learning_rate": 6.262626262626264e-05,
2155
- "loss": 0.0116,
2156
- "step": 2720
2157
- },
2158
- {
2159
- "epoch": 6.893939393939394,
2160
- "grad_norm": 0.0025018516462296247,
2161
- "learning_rate": 6.212121212121213e-05,
2162
- "loss": 0.005,
2163
- "step": 2730
2164
- },
2165
- {
2166
- "epoch": 6.91919191919192,
2167
- "grad_norm": 0.005596707109361887,
2168
- "learning_rate": 6.161616161616162e-05,
2169
- "loss": 0.037,
2170
- "step": 2740
2171
- },
2172
- {
2173
- "epoch": 6.944444444444445,
2174
- "grad_norm": 0.0010910239070653915,
2175
- "learning_rate": 6.111111111111112e-05,
2176
- "loss": 0.0338,
2177
- "step": 2750
2178
- },
2179
- {
2180
- "epoch": 6.96969696969697,
2181
- "grad_norm": 0.6075408458709717,
2182
- "learning_rate": 6.060606060606061e-05,
2183
- "loss": 0.0268,
2184
- "step": 2760
2185
- },
2186
- {
2187
- "epoch": 6.994949494949495,
2188
- "grad_norm": 0.25022584199905396,
2189
- "learning_rate": 6.01010101010101e-05,
2190
- "loss": 0.0125,
2191
- "step": 2770
2192
- },
2193
- {
2194
- "epoch": 7.02020202020202,
2195
- "grad_norm": 0.12169167399406433,
2196
- "learning_rate": 5.959595959595959e-05,
2197
- "loss": 0.0082,
2198
- "step": 2780
2199
- },
2200
- {
2201
- "epoch": 7.045454545454546,
2202
- "grad_norm": 3.5715599060058594,
2203
- "learning_rate": 5.90909090909091e-05,
2204
- "loss": 0.0144,
2205
- "step": 2790
2206
- },
2207
- {
2208
- "epoch": 7.070707070707071,
2209
- "grad_norm": 0.09293267875909805,
2210
- "learning_rate": 5.858585858585859e-05,
2211
- "loss": 0.0021,
2212
- "step": 2800
2213
- },
2214
- {
2215
- "epoch": 7.070707070707071,
2216
- "eval_accuracy": 0.973142345568487,
2217
- "eval_loss": 0.13676650822162628,
2218
- "eval_runtime": 72.9405,
2219
- "eval_samples_per_second": 15.314,
2220
- "eval_steps_per_second": 1.919,
2221
- "step": 2800
2222
- },
2223
- {
2224
- "epoch": 7.095959595959596,
2225
- "grad_norm": 0.009541651234030724,
2226
- "learning_rate": 5.808080808080808e-05,
2227
- "loss": 0.0058,
2228
- "step": 2810
2229
- },
2230
- {
2231
- "epoch": 7.121212121212121,
2232
- "grad_norm": 0.0016315419925376773,
2233
- "learning_rate": 5.757575757575758e-05,
2234
- "loss": 0.0064,
2235
- "step": 2820
2236
- },
2237
- {
2238
- "epoch": 7.146464646464646,
2239
- "grad_norm": 10.356843948364258,
2240
- "learning_rate": 5.707070707070707e-05,
2241
- "loss": 0.0595,
2242
- "step": 2830
2243
- },
2244
- {
2245
- "epoch": 7.171717171717171,
2246
- "grad_norm": 0.0018419253174215555,
2247
- "learning_rate": 5.6565656565656563e-05,
2248
- "loss": 0.016,
2249
- "step": 2840
2250
- },
2251
- {
2252
- "epoch": 7.196969696969697,
2253
- "grad_norm": 0.010135513730347157,
2254
- "learning_rate": 5.606060606060606e-05,
2255
- "loss": 0.052,
2256
- "step": 2850
2257
- },
2258
- {
2259
- "epoch": 7.222222222222222,
2260
- "grad_norm": 6.740849494934082,
2261
- "learning_rate": 5.555555555555556e-05,
2262
- "loss": 0.0374,
2263
- "step": 2860
2264
- },
2265
- {
2266
- "epoch": 7.247474747474747,
2267
- "grad_norm": 0.4412079155445099,
2268
- "learning_rate": 5.5050505050505056e-05,
2269
- "loss": 0.0117,
2270
- "step": 2870
2271
- },
2272
- {
2273
- "epoch": 7.2727272727272725,
2274
- "grad_norm": 0.001609967672266066,
2275
- "learning_rate": 5.4545454545454546e-05,
2276
- "loss": 0.0824,
2277
- "step": 2880
2278
- },
2279
- {
2280
- "epoch": 7.297979797979798,
2281
- "grad_norm": 0.005415134131908417,
2282
- "learning_rate": 5.4040404040404044e-05,
2283
- "loss": 0.0177,
2284
- "step": 2890
2285
- },
2286
- {
2287
- "epoch": 7.3232323232323235,
2288
- "grad_norm": 0.02915014885365963,
2289
- "learning_rate": 5.353535353535354e-05,
2290
- "loss": 0.046,
2291
- "step": 2900
2292
- },
2293
- {
2294
- "epoch": 7.3232323232323235,
2295
- "eval_accuracy": 0.9794091316025068,
2296
- "eval_loss": 0.09002197533845901,
2297
- "eval_runtime": 73.1136,
2298
- "eval_samples_per_second": 15.278,
2299
- "eval_steps_per_second": 1.915,
2300
- "step": 2900
2301
- },
2302
- {
2303
- "epoch": 7.348484848484849,
2304
- "grad_norm": 0.020192056894302368,
2305
- "learning_rate": 5.303030303030303e-05,
2306
- "loss": 0.0004,
2307
- "step": 2910
2308
- },
2309
- {
2310
- "epoch": 7.373737373737374,
2311
- "grad_norm": 0.7057023644447327,
2312
- "learning_rate": 5.2525252525252536e-05,
2313
- "loss": 0.0699,
2314
- "step": 2920
2315
- },
2316
- {
2317
- "epoch": 7.398989898989899,
2318
- "grad_norm": 0.0018105951603502035,
2319
- "learning_rate": 5.2020202020202026e-05,
2320
- "loss": 0.0379,
2321
- "step": 2930
2322
- },
2323
- {
2324
- "epoch": 7.424242424242424,
2325
- "grad_norm": 0.002236352302134037,
2326
- "learning_rate": 5.151515151515152e-05,
2327
- "loss": 0.0576,
2328
- "step": 2940
2329
- },
2330
- {
2331
- "epoch": 7.44949494949495,
2332
- "grad_norm": 0.46005484461784363,
2333
- "learning_rate": 5.101010101010101e-05,
2334
- "loss": 0.0007,
2335
- "step": 2950
2336
- },
2337
- {
2338
- "epoch": 7.474747474747475,
2339
- "grad_norm": 0.17090271413326263,
2340
- "learning_rate": 5.050505050505051e-05,
2341
- "loss": 0.0066,
2342
- "step": 2960
2343
- },
2344
- {
2345
- "epoch": 7.5,
2346
- "grad_norm": 0.002259742235764861,
2347
- "learning_rate": 5e-05,
2348
- "loss": 0.0043,
2349
- "step": 2970
2350
- },
2351
- {
2352
- "epoch": 7.525252525252525,
2353
- "grad_norm": 0.0029255333356559277,
2354
- "learning_rate": 4.94949494949495e-05,
2355
- "loss": 0.0239,
2356
- "step": 2980
2357
- },
2358
- {
2359
- "epoch": 7.55050505050505,
2360
- "grad_norm": 2.9925894737243652,
2361
- "learning_rate": 4.898989898989899e-05,
2362
- "loss": 0.0063,
2363
- "step": 2990
2364
- },
2365
- {
2366
- "epoch": 7.575757575757576,
2367
- "grad_norm": 0.052914004772901535,
2368
- "learning_rate": 4.848484848484849e-05,
2369
- "loss": 0.033,
2370
- "step": 3000
2371
- },
2372
- {
2373
- "epoch": 7.575757575757576,
2374
- "eval_accuracy": 0.9785138764547896,
2375
- "eval_loss": 0.10143210738897324,
2376
- "eval_runtime": 73.4907,
2377
- "eval_samples_per_second": 15.199,
2378
- "eval_steps_per_second": 1.905,
2379
- "step": 3000
2380
- },
2381
- {
2382
- "epoch": 7.601010101010101,
2383
- "grad_norm": 0.04058058559894562,
2384
- "learning_rate": 4.797979797979798e-05,
2385
- "loss": 0.0245,
2386
- "step": 3010
2387
- },
2388
- {
2389
- "epoch": 7.626262626262626,
2390
- "grad_norm": 0.03967829421162605,
2391
- "learning_rate": 4.7474747474747476e-05,
2392
- "loss": 0.0006,
2393
- "step": 3020
2394
- },
2395
- {
2396
- "epoch": 7.651515151515151,
2397
- "grad_norm": 0.621035635471344,
2398
- "learning_rate": 4.696969696969697e-05,
2399
- "loss": 0.0175,
2400
- "step": 3030
2401
- },
2402
- {
2403
- "epoch": 7.6767676767676765,
2404
- "grad_norm": 0.36977216601371765,
2405
- "learning_rate": 4.6464646464646464e-05,
2406
- "loss": 0.0388,
2407
- "step": 3040
2408
- },
2409
- {
2410
- "epoch": 7.702020202020202,
2411
- "grad_norm": 3.2532241344451904,
2412
- "learning_rate": 4.595959595959596e-05,
2413
- "loss": 0.0905,
2414
- "step": 3050
2415
- },
2416
- {
2417
- "epoch": 7.7272727272727275,
2418
- "grad_norm": 0.004156060051172972,
2419
- "learning_rate": 4.545454545454546e-05,
2420
- "loss": 0.0002,
2421
- "step": 3060
2422
- },
2423
- {
2424
- "epoch": 7.752525252525253,
2425
- "grad_norm": 0.6550003290176392,
2426
- "learning_rate": 4.494949494949495e-05,
2427
- "loss": 0.0066,
2428
- "step": 3070
2429
- },
2430
- {
2431
- "epoch": 7.777777777777778,
2432
- "grad_norm": 0.0028251020703464746,
2433
- "learning_rate": 4.4444444444444447e-05,
2434
- "loss": 0.0083,
2435
- "step": 3080
2436
- },
2437
- {
2438
- "epoch": 7.803030303030303,
2439
- "grad_norm": 0.008767428807914257,
2440
- "learning_rate": 4.3939393939393944e-05,
2441
- "loss": 0.0006,
2442
- "step": 3090
2443
- },
2444
- {
2445
- "epoch": 7.828282828282829,
2446
- "grad_norm": 0.04811250418424606,
2447
- "learning_rate": 4.343434343434344e-05,
2448
- "loss": 0.0354,
2449
- "step": 3100
2450
- },
2451
- {
2452
- "epoch": 7.828282828282829,
2453
- "eval_accuracy": 0.9767233661593554,
2454
- "eval_loss": 0.09683331102132797,
2455
- "eval_runtime": 73.2348,
2456
- "eval_samples_per_second": 15.252,
2457
- "eval_steps_per_second": 1.912,
2458
- "step": 3100
2459
- },
2460
- {
2461
- "epoch": 7.853535353535354,
2462
- "grad_norm": 0.00525275431573391,
2463
- "learning_rate": 4.292929292929293e-05,
2464
- "loss": 0.0088,
2465
- "step": 3110
2466
- },
2467
- {
2468
- "epoch": 7.878787878787879,
2469
- "grad_norm": 0.015972474589943886,
2470
- "learning_rate": 4.242424242424243e-05,
2471
- "loss": 0.0011,
2472
- "step": 3120
2473
- },
2474
- {
2475
- "epoch": 7.904040404040404,
2476
- "grad_norm": 0.006997071672230959,
2477
- "learning_rate": 4.191919191919192e-05,
2478
- "loss": 0.0017,
2479
- "step": 3130
2480
- },
2481
- {
2482
- "epoch": 7.929292929292929,
2483
- "grad_norm": 0.023101719096302986,
2484
- "learning_rate": 4.141414141414142e-05,
2485
- "loss": 0.0567,
2486
- "step": 3140
2487
- },
2488
- {
2489
- "epoch": 7.954545454545455,
2490
- "grad_norm": 0.003169642062857747,
2491
- "learning_rate": 4.0909090909090915e-05,
2492
- "loss": 0.1026,
2493
- "step": 3150
2494
- },
2495
- {
2496
- "epoch": 7.97979797979798,
2497
- "grad_norm": 0.003613903187215328,
2498
- "learning_rate": 4.0404040404040405e-05,
2499
- "loss": 0.005,
2500
- "step": 3160
2501
- },
2502
- {
2503
- "epoch": 8.005050505050505,
2504
- "grad_norm": 1.0490131378173828,
2505
- "learning_rate": 3.98989898989899e-05,
2506
- "loss": 0.0023,
2507
- "step": 3170
2508
- },
2509
- {
2510
- "epoch": 8.030303030303031,
2511
- "grad_norm": 0.003916851244866848,
2512
- "learning_rate": 3.939393939393939e-05,
2513
- "loss": 0.0023,
2514
- "step": 3180
2515
- },
2516
- {
2517
- "epoch": 8.055555555555555,
2518
- "grad_norm": 0.016336582601070404,
2519
- "learning_rate": 3.888888888888889e-05,
2520
- "loss": 0.0079,
2521
- "step": 3190
2522
- },
2523
- {
2524
- "epoch": 8.080808080808081,
2525
- "grad_norm": 0.8970369696617126,
2526
- "learning_rate": 3.838383838383838e-05,
2527
- "loss": 0.0026,
2528
- "step": 3200
2529
- },
2530
- {
2531
- "epoch": 8.080808080808081,
2532
- "eval_accuracy": 0.973142345568487,
2533
- "eval_loss": 0.1217464730143547,
2534
- "eval_runtime": 73.5035,
2535
- "eval_samples_per_second": 15.197,
2536
- "eval_steps_per_second": 1.905,
2537
- "step": 3200
2538
- },
2539
- {
2540
- "epoch": 8.106060606060606,
2541
- "grad_norm": 0.03298179805278778,
2542
- "learning_rate": 3.787878787878788e-05,
2543
- "loss": 0.0051,
2544
- "step": 3210
2545
- },
2546
- {
2547
- "epoch": 8.131313131313131,
2548
- "grad_norm": 0.5918856263160706,
2549
- "learning_rate": 3.7373737373737376e-05,
2550
- "loss": 0.032,
2551
- "step": 3220
2552
- },
2553
- {
2554
- "epoch": 8.156565656565657,
2555
- "grad_norm": 0.0031904878560453653,
2556
- "learning_rate": 3.686868686868687e-05,
2557
- "loss": 0.029,
2558
- "step": 3230
2559
- },
2560
- {
2561
- "epoch": 8.181818181818182,
2562
- "grad_norm": 0.043024152517318726,
2563
- "learning_rate": 3.6363636363636364e-05,
2564
- "loss": 0.0003,
2565
- "step": 3240
2566
- },
2567
- {
2568
- "epoch": 8.207070707070708,
2569
- "grad_norm": 0.011919928714632988,
2570
- "learning_rate": 3.5858585858585855e-05,
2571
- "loss": 0.0028,
2572
- "step": 3250
2573
- },
2574
- {
2575
- "epoch": 8.232323232323232,
2576
- "grad_norm": 0.007164669223129749,
2577
- "learning_rate": 3.535353535353535e-05,
2578
- "loss": 0.0146,
2579
- "step": 3260
2580
- },
2581
- {
2582
- "epoch": 8.257575757575758,
2583
- "grad_norm": 0.03415270894765854,
2584
- "learning_rate": 3.484848484848485e-05,
2585
- "loss": 0.0041,
2586
- "step": 3270
2587
- },
2588
- {
2589
- "epoch": 8.282828282828282,
2590
- "grad_norm": 0.03534342721104622,
2591
- "learning_rate": 3.434343434343435e-05,
2592
- "loss": 0.0035,
2593
- "step": 3280
2594
- },
2595
- {
2596
- "epoch": 8.308080808080808,
2597
- "grad_norm": 0.3735661804676056,
2598
- "learning_rate": 3.3838383838383844e-05,
2599
- "loss": 0.0745,
2600
- "step": 3290
2601
- },
2602
- {
2603
- "epoch": 8.333333333333334,
2604
- "grad_norm": 0.0013512909645214677,
2605
- "learning_rate": 3.3333333333333335e-05,
2606
- "loss": 0.0002,
2607
- "step": 3300
2608
- },
2609
- {
2610
- "epoch": 8.333333333333334,
2611
- "eval_accuracy": 0.9794091316025068,
2612
- "eval_loss": 0.08283615112304688,
2613
- "eval_runtime": 73.1651,
2614
- "eval_samples_per_second": 15.267,
2615
- "eval_steps_per_second": 1.913,
2616
- "step": 3300
2617
- },
2618
- {
2619
- "epoch": 8.358585858585858,
2620
- "grad_norm": 0.023621654137969017,
2621
- "learning_rate": 3.282828282828283e-05,
2622
- "loss": 0.0174,
2623
- "step": 3310
2624
- },
2625
- {
2626
- "epoch": 8.383838383838384,
2627
- "grad_norm": 0.006960035767406225,
2628
- "learning_rate": 3.232323232323233e-05,
2629
- "loss": 0.0004,
2630
- "step": 3320
2631
- },
2632
- {
2633
- "epoch": 8.409090909090908,
2634
- "grad_norm": 0.0008190835942514241,
2635
- "learning_rate": 3.181818181818182e-05,
2636
- "loss": 0.0374,
2637
- "step": 3330
2638
- },
2639
- {
2640
- "epoch": 8.434343434343434,
2641
- "grad_norm": 0.016193361952900887,
2642
- "learning_rate": 3.131313131313132e-05,
2643
- "loss": 0.0007,
2644
- "step": 3340
2645
- },
2646
- {
2647
- "epoch": 8.45959595959596,
2648
- "grad_norm": 0.2075665146112442,
2649
- "learning_rate": 3.080808080808081e-05,
2650
- "loss": 0.0422,
2651
- "step": 3350
2652
- },
2653
- {
2654
- "epoch": 8.484848484848484,
2655
- "grad_norm": 0.009178784675896168,
2656
- "learning_rate": 3.0303030303030306e-05,
2657
- "loss": 0.0332,
2658
- "step": 3360
2659
- },
2660
- {
2661
- "epoch": 8.51010101010101,
2662
- "grad_norm": 8.036938667297363,
2663
- "learning_rate": 2.9797979797979796e-05,
2664
- "loss": 0.0436,
2665
- "step": 3370
2666
- },
2667
- {
2668
- "epoch": 8.535353535353535,
2669
- "grad_norm": 0.0013093262678012252,
2670
- "learning_rate": 2.9292929292929294e-05,
2671
- "loss": 0.0109,
2672
- "step": 3380
2673
- },
2674
- {
2675
- "epoch": 8.56060606060606,
2676
- "grad_norm": 0.0033100605942308903,
2677
- "learning_rate": 2.878787878787879e-05,
2678
- "loss": 0.0011,
2679
- "step": 3390
2680
- },
2681
- {
2682
- "epoch": 8.585858585858587,
2683
- "grad_norm": 0.0015343882841989398,
2684
- "learning_rate": 2.8282828282828282e-05,
2685
- "loss": 0.0006,
2686
- "step": 3400
2687
- },
2688
- {
2689
- "epoch": 8.585858585858587,
2690
- "eval_accuracy": 0.9794091316025068,
2691
- "eval_loss": 0.09259337186813354,
2692
- "eval_runtime": 72.8639,
2693
- "eval_samples_per_second": 15.33,
2694
- "eval_steps_per_second": 1.921,
2695
- "step": 3400
2696
- },
2697
- {
2698
- "epoch": 8.61111111111111,
2699
- "grad_norm": 0.030406756326556206,
2700
- "learning_rate": 2.777777777777778e-05,
2701
- "loss": 0.0026,
2702
- "step": 3410
2703
- },
2704
- {
2705
- "epoch": 8.636363636363637,
2706
- "grad_norm": 0.0022419544402509928,
2707
- "learning_rate": 2.7272727272727273e-05,
2708
- "loss": 0.0007,
2709
- "step": 3420
2710
- },
2711
- {
2712
- "epoch": 8.66161616161616,
2713
- "grad_norm": 0.0011131414212286472,
2714
- "learning_rate": 2.676767676767677e-05,
2715
- "loss": 0.0006,
2716
- "step": 3430
2717
- },
2718
- {
2719
- "epoch": 8.686868686868687,
2720
- "grad_norm": 0.005616435315459967,
2721
- "learning_rate": 2.6262626262626268e-05,
2722
- "loss": 0.0003,
2723
- "step": 3440
2724
- },
2725
- {
2726
- "epoch": 8.712121212121213,
2727
- "grad_norm": 0.1008942499756813,
2728
- "learning_rate": 2.575757575757576e-05,
2729
- "loss": 0.0097,
2730
- "step": 3450
2731
- },
2732
- {
2733
- "epoch": 8.737373737373737,
2734
- "grad_norm": 0.002821123693138361,
2735
- "learning_rate": 2.5252525252525256e-05,
2736
- "loss": 0.0669,
2737
- "step": 3460
2738
- },
2739
- {
2740
- "epoch": 8.762626262626263,
2741
- "grad_norm": 0.013286658562719822,
2742
- "learning_rate": 2.474747474747475e-05,
2743
- "loss": 0.0265,
2744
- "step": 3470
2745
- },
2746
- {
2747
- "epoch": 8.787878787878787,
2748
- "grad_norm": 0.003963208291679621,
2749
- "learning_rate": 2.4242424242424244e-05,
2750
- "loss": 0.0178,
2751
- "step": 3480
2752
- },
2753
- {
2754
- "epoch": 8.813131313131313,
2755
- "grad_norm": 0.002018690574914217,
2756
- "learning_rate": 2.3737373737373738e-05,
2757
- "loss": 0.0082,
2758
- "step": 3490
2759
- },
2760
- {
2761
- "epoch": 8.83838383838384,
2762
- "grad_norm": 0.1014542207121849,
2763
- "learning_rate": 2.3232323232323232e-05,
2764
- "loss": 0.0006,
2765
- "step": 3500
2766
- },
2767
- {
2768
- "epoch": 8.83838383838384,
2769
- "eval_accuracy": 0.9794091316025068,
2770
- "eval_loss": 0.10012003779411316,
2771
- "eval_runtime": 73.1859,
2772
- "eval_samples_per_second": 15.263,
2773
- "eval_steps_per_second": 1.913,
2774
- "step": 3500
2775
- },
2776
- {
2777
- "epoch": 8.863636363636363,
2778
- "grad_norm": 0.002746024401858449,
2779
- "learning_rate": 2.272727272727273e-05,
2780
- "loss": 0.0063,
2781
- "step": 3510
2782
- },
2783
- {
2784
- "epoch": 8.88888888888889,
2785
- "grad_norm": 0.0018340348033234477,
2786
- "learning_rate": 2.2222222222222223e-05,
2787
- "loss": 0.0024,
2788
- "step": 3520
2789
- },
2790
- {
2791
- "epoch": 8.914141414141413,
2792
- "grad_norm": 0.004108617547899485,
2793
- "learning_rate": 2.171717171717172e-05,
2794
- "loss": 0.0083,
2795
- "step": 3530
2796
- },
2797
- {
2798
- "epoch": 8.93939393939394,
2799
- "grad_norm": 0.00315410690382123,
2800
- "learning_rate": 2.1212121212121215e-05,
2801
- "loss": 0.0462,
2802
- "step": 3540
2803
- },
2804
- {
2805
- "epoch": 8.964646464646465,
2806
- "grad_norm": 0.024781817570328712,
2807
- "learning_rate": 2.070707070707071e-05,
2808
- "loss": 0.0029,
2809
- "step": 3550
2810
- },
2811
- {
2812
- "epoch": 8.98989898989899,
2813
- "grad_norm": 0.005382045172154903,
2814
- "learning_rate": 2.0202020202020203e-05,
2815
- "loss": 0.0047,
2816
- "step": 3560
2817
- },
2818
- {
2819
- "epoch": 9.015151515151516,
2820
- "grad_norm": 1.6344341039657593,
2821
- "learning_rate": 1.9696969696969697e-05,
2822
- "loss": 0.0038,
2823
- "step": 3570
2824
- },
2825
- {
2826
- "epoch": 9.04040404040404,
2827
- "grad_norm": 0.010318132117390633,
2828
- "learning_rate": 1.919191919191919e-05,
2829
- "loss": 0.0096,
2830
- "step": 3580
2831
- },
2832
- {
2833
- "epoch": 9.065656565656566,
2834
- "grad_norm": 0.0016402292530983686,
2835
- "learning_rate": 1.8686868686868688e-05,
2836
- "loss": 0.0321,
2837
- "step": 3590
2838
- },
2839
- {
2840
- "epoch": 9.090909090909092,
2841
- "grad_norm": 0.004027374088764191,
2842
- "learning_rate": 1.8181818181818182e-05,
2843
- "loss": 0.0006,
2844
- "step": 3600
2845
- },
2846
- {
2847
- "epoch": 9.090909090909092,
2848
- "eval_accuracy": 0.9847806624888094,
2849
- "eval_loss": 0.08629997074604034,
2850
- "eval_runtime": 73.127,
2851
- "eval_samples_per_second": 15.275,
2852
- "eval_steps_per_second": 1.914,
2853
- "step": 3600
2854
- },
2855
- {
2856
- "epoch": 9.116161616161616,
2857
- "grad_norm": 0.0007902685320004821,
2858
- "learning_rate": 1.7676767676767676e-05,
2859
- "loss": 0.0059,
2860
- "step": 3610
2861
- },
2862
- {
2863
- "epoch": 9.141414141414142,
2864
- "grad_norm": 0.0024135063868016005,
2865
- "learning_rate": 1.7171717171717173e-05,
2866
- "loss": 0.0269,
2867
- "step": 3620
2868
- },
2869
- {
2870
- "epoch": 9.166666666666666,
2871
- "grad_norm": 0.026507705450057983,
2872
- "learning_rate": 1.6666666666666667e-05,
2873
- "loss": 0.0003,
2874
- "step": 3630
2875
- },
2876
- {
2877
- "epoch": 9.191919191919192,
2878
- "grad_norm": 0.10678762197494507,
2879
- "learning_rate": 1.6161616161616165e-05,
2880
- "loss": 0.0059,
2881
- "step": 3640
2882
- },
2883
- {
2884
- "epoch": 9.217171717171718,
2885
- "grad_norm": 0.08362487703561783,
2886
- "learning_rate": 1.565656565656566e-05,
2887
- "loss": 0.0545,
2888
- "step": 3650
2889
- },
2890
- {
2891
- "epoch": 9.242424242424242,
2892
- "grad_norm": 0.002414940157905221,
2893
- "learning_rate": 1.5151515151515153e-05,
2894
- "loss": 0.0221,
2895
- "step": 3660
2896
- },
2897
- {
2898
- "epoch": 9.267676767676768,
2899
- "grad_norm": 0.0013868235982954502,
2900
- "learning_rate": 1.4646464646464647e-05,
2901
- "loss": 0.0005,
2902
- "step": 3670
2903
- },
2904
- {
2905
- "epoch": 9.292929292929292,
2906
- "grad_norm": 0.0013921884819865227,
2907
- "learning_rate": 1.4141414141414141e-05,
2908
- "loss": 0.041,
2909
- "step": 3680
2910
- },
2911
- {
2912
- "epoch": 9.318181818181818,
2913
- "grad_norm": 0.08867702633142471,
2914
- "learning_rate": 1.3636363636363637e-05,
2915
- "loss": 0.026,
2916
- "step": 3690
2917
- },
2918
- {
2919
- "epoch": 9.343434343434343,
2920
- "grad_norm": 0.0012104762718081474,
2921
- "learning_rate": 1.3131313131313134e-05,
2922
- "loss": 0.0633,
2923
- "step": 3700
2924
- },
2925
- {
2926
- "epoch": 9.343434343434343,
2927
- "eval_accuracy": 0.9803043867502238,
2928
- "eval_loss": 0.09109070897102356,
2929
- "eval_runtime": 71.4974,
2930
- "eval_samples_per_second": 15.623,
2931
- "eval_steps_per_second": 1.958,
2932
- "step": 3700
2933
- },
2934
- {
2935
- "epoch": 9.368686868686869,
2936
- "grad_norm": 0.007544935215264559,
2937
- "learning_rate": 1.2626262626262628e-05,
2938
- "loss": 0.002,
2939
- "step": 3710
2940
- },
2941
- {
2942
- "epoch": 9.393939393939394,
2943
- "grad_norm": 0.01898648589849472,
2944
- "learning_rate": 1.2121212121212122e-05,
2945
- "loss": 0.0005,
2946
- "step": 3720
2947
- },
2948
- {
2949
- "epoch": 9.419191919191919,
2950
- "grad_norm": 0.00644712382927537,
2951
- "learning_rate": 1.1616161616161616e-05,
2952
- "loss": 0.0059,
2953
- "step": 3730
2954
- },
2955
- {
2956
- "epoch": 9.444444444444445,
2957
- "grad_norm": 0.00872492603957653,
2958
- "learning_rate": 1.1111111111111112e-05,
2959
- "loss": 0.0011,
2960
- "step": 3740
2961
- },
2962
- {
2963
- "epoch": 9.469696969696969,
2964
- "grad_norm": 1.6075825691223145,
2965
- "learning_rate": 1.0606060606060607e-05,
2966
- "loss": 0.0099,
2967
- "step": 3750
2968
- },
2969
- {
2970
- "epoch": 9.494949494949495,
2971
- "grad_norm": 6.320465087890625,
2972
- "learning_rate": 1.0101010101010101e-05,
2973
- "loss": 0.0163,
2974
- "step": 3760
2975
- },
2976
- {
2977
- "epoch": 9.52020202020202,
2978
- "grad_norm": 0.0037208800204098225,
2979
- "learning_rate": 9.595959595959595e-06,
2980
- "loss": 0.0002,
2981
- "step": 3770
2982
- },
2983
- {
2984
- "epoch": 9.545454545454545,
2985
- "grad_norm": 3.3599369525909424,
2986
- "learning_rate": 9.090909090909091e-06,
2987
- "loss": 0.0053,
2988
- "step": 3780
2989
- },
2990
- {
2991
- "epoch": 9.570707070707071,
2992
- "grad_norm": 0.5879691243171692,
2993
- "learning_rate": 8.585858585858587e-06,
2994
- "loss": 0.0019,
2995
- "step": 3790
2996
- },
2997
- {
2998
- "epoch": 9.595959595959595,
2999
- "grad_norm": 0.26342862844467163,
3000
- "learning_rate": 8.080808080808082e-06,
3001
- "loss": 0.0009,
3002
- "step": 3800
3003
- },
3004
- {
3005
- "epoch": 9.595959595959595,
3006
- "eval_accuracy": 0.982094897045658,
3007
- "eval_loss": 0.09413682669401169,
3008
- "eval_runtime": 73.1451,
3009
- "eval_samples_per_second": 15.271,
3010
- "eval_steps_per_second": 1.914,
3011
- "step": 3800
3012
- },
3013
- {
3014
- "epoch": 9.621212121212121,
3015
- "grad_norm": 0.042649831622838974,
3016
- "learning_rate": 7.5757575757575764e-06,
3017
- "loss": 0.0226,
3018
- "step": 3810
3019
- },
3020
- {
3021
- "epoch": 9.646464646464647,
3022
- "grad_norm": 0.0022528120316565037,
3023
- "learning_rate": 7.0707070707070704e-06,
3024
- "loss": 0.0136,
3025
- "step": 3820
3026
- },
3027
- {
3028
- "epoch": 9.671717171717171,
3029
- "grad_norm": 0.12108311802148819,
3030
- "learning_rate": 6.565656565656567e-06,
3031
- "loss": 0.0408,
3032
- "step": 3830
3033
- },
3034
- {
3035
- "epoch": 9.696969696969697,
3036
- "grad_norm": 0.7086867690086365,
3037
- "learning_rate": 6.060606060606061e-06,
3038
- "loss": 0.0035,
3039
- "step": 3840
3040
- },
3041
- {
3042
- "epoch": 9.722222222222221,
3043
- "grad_norm": 0.049748744815588,
3044
- "learning_rate": 5.555555555555556e-06,
3045
- "loss": 0.0012,
3046
- "step": 3850
3047
- },
3048
- {
3049
- "epoch": 9.747474747474747,
3050
- "grad_norm": 0.004345474299043417,
3051
- "learning_rate": 5.050505050505051e-06,
3052
- "loss": 0.0002,
3053
- "step": 3860
3054
- },
3055
- {
3056
- "epoch": 9.772727272727273,
3057
- "grad_norm": 0.005164165981113911,
3058
- "learning_rate": 4.5454545454545455e-06,
3059
- "loss": 0.0049,
3060
- "step": 3870
3061
- },
3062
- {
3063
- "epoch": 9.797979797979798,
3064
- "grad_norm": 0.003518365090712905,
3065
- "learning_rate": 4.040404040404041e-06,
3066
- "loss": 0.002,
3067
- "step": 3880
3068
- },
3069
- {
3070
- "epoch": 9.823232323232324,
3071
- "grad_norm": 0.0017797194886952639,
3072
- "learning_rate": 3.5353535353535352e-06,
3073
- "loss": 0.0005,
3074
- "step": 3890
3075
- },
3076
- {
3077
- "epoch": 9.848484848484848,
3078
- "grad_norm": 4.788568496704102,
3079
- "learning_rate": 3.0303030303030305e-06,
3080
- "loss": 0.0247,
3081
- "step": 3900
3082
- },
3083
- {
3084
- "epoch": 9.848484848484848,
3085
- "eval_accuracy": 0.9785138764547896,
3086
- "eval_loss": 0.09876807779073715,
3087
- "eval_runtime": 73.1729,
3088
- "eval_samples_per_second": 15.265,
3089
- "eval_steps_per_second": 1.913,
3090
- "step": 3900
3091
- },
3092
- {
3093
- "epoch": 9.873737373737374,
3094
- "grad_norm": 0.0013341947924345732,
3095
- "learning_rate": 2.5252525252525253e-06,
3096
- "loss": 0.0082,
3097
- "step": 3910
3098
- },
3099
- {
3100
- "epoch": 9.8989898989899,
3101
- "grad_norm": 0.004278136417269707,
3102
- "learning_rate": 2.0202020202020206e-06,
3103
- "loss": 0.0019,
3104
- "step": 3920
3105
- },
3106
- {
3107
- "epoch": 9.924242424242424,
3108
- "grad_norm": 0.002301498083397746,
3109
- "learning_rate": 1.5151515151515152e-06,
3110
- "loss": 0.0245,
3111
- "step": 3930
3112
- },
3113
- {
3114
- "epoch": 9.94949494949495,
3115
- "grad_norm": 0.000858976156450808,
3116
- "learning_rate": 1.0101010101010103e-06,
3117
- "loss": 0.0013,
3118
- "step": 3940
3119
- },
3120
- {
3121
- "epoch": 9.974747474747474,
3122
- "grad_norm": 0.007369679398834705,
3123
- "learning_rate": 5.050505050505052e-07,
3124
- "loss": 0.0774,
3125
- "step": 3950
3126
- },
3127
- {
3128
- "epoch": 10.0,
3129
- "grad_norm": 0.008844327181577682,
3130
- "learning_rate": 0.0,
3131
- "loss": 0.0004,
3132
- "step": 3960
3133
- },
3134
- {
3135
- "epoch": 10.0,
3136
- "step": 3960,
3137
- "total_flos": 4.904158054749069e+18,
3138
- "train_loss": 0.06213315485569771,
3139
- "train_runtime": 7084.9204,
3140
- "train_samples_per_second": 8.93,
3141
- "train_steps_per_second": 0.559
3142
  }
3143
  ],
3144
  "logging_steps": 10,
3145
- "max_steps": 3960,
3146
  "num_input_tokens_seen": 0,
3147
- "num_train_epochs": 10,
3148
  "save_steps": 100,
3149
  "stateful_callbacks": {
3150
  "TrainerControl": {
@@ -3158,7 +193,7 @@
3158
  "attributes": {}
3159
  }
3160
  },
3161
- "total_flos": 4.904158054749069e+18,
3162
  "train_batch_size": 16,
3163
  "trial_name": null,
3164
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.339973121881485,
3
+ "best_model_checkpoint": "realFake-img/checkpoint-200",
4
+ "epoch": 4.0,
5
  "eval_steps": 100,
6
+ "global_step": 208,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.19230769230769232,
13
+ "grad_norm": 3.016617774963379,
14
+ "learning_rate": 0.00019134615384615387,
15
+ "loss": 0.7043,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.38461538461538464,
20
+ "grad_norm": 8.659211158752441,
21
+ "learning_rate": 0.00018173076923076923,
22
+ "loss": 0.6331,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.5769230769230769,
27
+ "grad_norm": 3.7159981727600098,
28
+ "learning_rate": 0.00017211538461538463,
29
+ "loss": 0.6697,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.7692307692307693,
34
+ "grad_norm": 5.222133636474609,
35
+ "learning_rate": 0.00016250000000000002,
36
+ "loss": 0.5777,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.9615384615384616,
41
+ "grad_norm": 6.754155158996582,
42
+ "learning_rate": 0.00015288461538461539,
43
+ "loss": 0.5347,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 1.1538461538461537,
48
+ "grad_norm": 1.658116102218628,
49
+ "learning_rate": 0.00014326923076923078,
50
+ "loss": 0.4377,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 1.3461538461538463,
55
+ "grad_norm": 5.012947082519531,
56
+ "learning_rate": 0.00013365384615384614,
57
+ "loss": 0.4354,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 1.5384615384615383,
62
+ "grad_norm": 4.971016883850098,
63
+ "learning_rate": 0.00012403846153846154,
64
+ "loss": 0.4928,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 1.7307692307692308,
69
+ "grad_norm": 4.061134338378906,
70
+ "learning_rate": 0.00011442307692307692,
71
+ "loss": 0.3803,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 1.9230769230769231,
76
+ "grad_norm": 4.557286739349365,
77
+ "learning_rate": 0.00010480769230769232,
78
+ "loss": 0.3732,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 1.9230769230769231,
83
+ "eval_accuracy": 0.7945205479452054,
84
+ "eval_loss": 0.4088120758533478,
85
+ "eval_runtime": 5.0993,
86
+ "eval_samples_per_second": 28.631,
87
+ "eval_steps_per_second": 3.726,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 2.1153846153846154,
92
+ "grad_norm": 2.305349826812744,
93
+ "learning_rate": 9.519230769230769e-05,
94
+ "loss": 0.3172,
95
  "step": 110
96
  },
97
  {
98
+ "epoch": 2.3076923076923075,
99
+ "grad_norm": 1.443526029586792,
100
+ "learning_rate": 8.557692307692308e-05,
101
+ "loss": 0.3155,
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 2.5,
106
+ "grad_norm": 1.3094813823699951,
107
+ "learning_rate": 7.596153846153846e-05,
108
+ "loss": 0.2671,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 2.6923076923076925,
113
+ "grad_norm": 5.578798770904541,
114
+ "learning_rate": 6.634615384615385e-05,
115
+ "loss": 0.2603,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 2.8846153846153846,
120
+ "grad_norm": 1.8392003774642944,
121
+ "learning_rate": 5.673076923076923e-05,
122
+ "loss": 0.2804,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 3.076923076923077,
127
+ "grad_norm": 0.6986391544342041,
128
+ "learning_rate": 4.711538461538462e-05,
129
+ "loss": 0.3171,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 3.269230769230769,
134
+ "grad_norm": 4.677719593048096,
135
+ "learning_rate": 3.7500000000000003e-05,
136
+ "loss": 0.1962,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 3.4615384615384617,
141
+ "grad_norm": 1.8367177248001099,
142
+ "learning_rate": 2.7884615384615386e-05,
143
+ "loss": 0.3216,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 3.6538461538461537,
148
+ "grad_norm": 5.29152774810791,
149
+ "learning_rate": 1.826923076923077e-05,
150
+ "loss": 0.1938,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 3.8461538461538463,
155
+ "grad_norm": 1.2080388069152832,
156
+ "learning_rate": 8.653846153846155e-06,
157
+ "loss": 0.1451,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 3.8461538461538463,
162
+ "eval_accuracy": 0.8424657534246576,
163
+ "eval_loss": 0.339973121881485,
164
+ "eval_runtime": 4.8463,
165
+ "eval_samples_per_second": 30.126,
166
+ "eval_steps_per_second": 3.921,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 4.0,
171
+ "step": 208,
172
+ "total_flos": 2.5832369176982323e+17,
173
+ "train_loss": 0.38543382860147035,
174
+ "train_runtime": 140.7513,
175
+ "train_samples_per_second": 23.474,
176
+ "train_steps_per_second": 1.478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  }
178
  ],
179
  "logging_steps": 10,
180
+ "max_steps": 208,
181
  "num_input_tokens_seen": 0,
182
+ "num_train_epochs": 4,
183
  "save_steps": 100,
184
  "stateful_callbacks": {
185
  "TrainerControl": {
 
193
  "attributes": {}
194
  }
195
  },
196
+ "total_flos": 2.5832369176982323e+17,
197
  "train_batch_size": 16,
198
  "trial_name": null,
199
  "trial_params": null