laszlokiss27 committed on
Commit
96d0021
1 Parent(s): 2722c1e

doodle-dash

Browse files
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [apple/mobilevit-small](https://huggingface.co/apple/mobilevit-small) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 1.3757
21
- - Accuracy: 0.6597
22
 
23
  ## Model description
24
 
@@ -42,12 +42,31 @@ The following hyperparameters were used during training:
42
  - eval_batch_size: 256
43
  - seed: 42
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
- - lr_scheduler_type: linear
46
  - num_epochs: 5
47
  - mixed_precision_training: Native AMP
48
 
49
  ### Training results
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
 
53
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [apple/mobilevit-small](https://huggingface.co/apple/mobilevit-small) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.9890
21
+ - Accuracy: 0.7494
22
 
23
  ## Model description
24
 
 
42
  - eval_batch_size: 256
43
  - seed: 42
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: cosine
46
  - num_epochs: 5
47
  - mixed_precision_training: Native AMP
48
 
49
  ### Training results
50
 
51
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|
53
+ | 1.4668 | 0.28 | 5000 | 1.4146 | 0.6488 |
54
+ | 1.3282 | 0.57 | 10000 | 1.2907 | 0.6766 |
55
+ | 1.2617 | 0.85 | 15000 | 1.2270 | 0.6905 |
56
+ | 1.196 | 1.14 | 20000 | 1.1758 | 0.7035 |
57
+ | 1.1664 | 1.42 | 25000 | 1.1527 | 0.7093 |
58
+ | 1.1504 | 1.71 | 30000 | 1.1152 | 0.7170 |
59
+ | 1.1234 | 1.99 | 35000 | 1.0903 | 0.7241 |
60
+ | 1.0819 | 2.28 | 40000 | 1.0728 | 0.7283 |
61
+ | 1.0707 | 2.56 | 45000 | 1.0533 | 0.7334 |
62
+ | 1.049 | 2.84 | 50000 | 1.0399 | 0.7369 |
63
+ | 1.0017 | 3.13 | 55000 | 1.0253 | 0.7405 |
64
+ | 0.995 | 3.41 | 60000 | 1.0120 | 0.7438 |
65
+ | 0.9829 | 3.7 | 65000 | 0.9977 | 0.7468 |
66
+ | 0.9807 | 3.98 | 70000 | 0.9908 | 0.7487 |
67
+ | 0.9452 | 4.27 | 75000 | 0.9875 | 0.7498 |
68
+ | 0.949 | 4.55 | 80000 | 0.9845 | 0.7507 |
69
+ | 0.9509 | 4.84 | 85000 | 0.9841 | 0.7509 |
70
 
71
 
72
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.6597333333333333,
4
- "eval_loss": 1.3756999969482422,
5
- "eval_runtime": 16.2859,
6
- "eval_samples_per_second": 460.521,
7
- "eval_steps_per_second": 1.842,
8
- "total_flos": 8.55731423232e+16,
9
- "train_loss": 1.5635583357377485,
10
- "train_runtime": 1391.129,
11
- "train_samples_per_second": 485.217,
12
- "train_steps_per_second": 1.898
13
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.749444,
4
+ "eval_loss": 0.9889541864395142,
5
+ "eval_runtime": 480.919,
6
+ "eval_samples_per_second": 519.838,
7
+ "eval_steps_per_second": 2.032,
8
+ "total_flos": 5.4597447576e+17,
9
+ "train_loss": 1.1248133655402486,
10
+ "train_runtime": 51144.106,
11
+ "train_samples_per_second": 439.933,
12
+ "train_steps_per_second": 1.719
13
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4594fe1cdfb2d8f9baf4a3c32c01066461f5b9e25044f0208a0a12061d6d2c49
3
  size 20730036
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cdbd0fe9d0acb014744296fad548e9c5803a702dc65c640d5a141db652944dd
3
  size 20730036
preprocessor_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "crop_size": {
3
- "height": 64,
4
- "width": 64
5
  },
6
  "do_center_crop": true,
7
  "do_convert_rgb": false,
@@ -12,6 +12,6 @@
12
  "resample": 2,
13
  "rescale_factor": 0.00392156862745098,
14
  "size": {
15
- "shortest_edge": 64
16
  }
17
  }
 
1
  {
2
  "crop_size": {
3
+ "height": 28,
4
+ "width": 28
5
  },
6
  "do_center_crop": true,
7
  "do_convert_rgb": false,
 
12
  "resample": 2,
13
  "rescale_factor": 0.00392156862745098,
14
  "size": {
15
+ "shortest_edge": 28
16
  }
17
  }
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.6597333333333333,
4
- "eval_loss": 1.3756999969482422,
5
- "eval_runtime": 16.2859,
6
- "eval_samples_per_second": 460.521,
7
- "eval_steps_per_second": 1.842
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.749444,
4
+ "eval_loss": 0.9889541864395142,
5
+ "eval_runtime": 480.919,
6
+ "eval_samples_per_second": 519.838,
7
+ "eval_steps_per_second": 2.032
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 8.55731423232e+16,
4
- "train_loss": 1.5635583357377485,
5
- "train_runtime": 1391.129,
6
- "train_samples_per_second": 485.217,
7
- "train_steps_per_second": 1.898
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 5.4597447576e+17,
4
+ "train_loss": 1.1248133655402486,
5
+ "train_runtime": 51144.106,
6
+ "train_samples_per_second": 439.933,
7
+ "train_steps_per_second": 1.719
8
  }
trainer_state.json CHANGED
@@ -3,41 +3,789 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 5000,
6
- "global_step": 2640,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.89,
13
- "grad_norm": 1.7177796363830566,
14
- "learning_rate": 0.0004978787878787879,
15
- "loss": 2.2138,
16
  "step": 1000
17
  },
18
  {
19
- "epoch": 3.79,
20
- "grad_norm": 2.4473989009857178,
21
- "learning_rate": 0.00019484848484848486,
22
- "loss": 1.2577,
23
  "step": 2000
24
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  {
26
  "epoch": 5.0,
27
- "step": 2640,
28
- "total_flos": 8.55731423232e+16,
29
- "train_loss": 1.5635583357377485,
30
- "train_runtime": 1391.129,
31
- "train_samples_per_second": 485.217,
32
- "train_steps_per_second": 1.898
33
  }
34
  ],
35
  "logging_steps": 1000,
36
- "max_steps": 2640,
37
  "num_input_tokens_seen": 0,
38
  "num_train_epochs": 5,
39
  "save_steps": 5000,
40
- "total_flos": 8.55731423232e+16,
41
  "train_batch_size": 256,
42
  "trial_name": null,
43
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 5000,
6
+ "global_step": 87895,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.06,
13
+ "grad_norm": 2.239067792892456,
14
+ "learning_rate": 0.0007997460514380593,
15
+ "loss": 2.6519,
16
  "step": 1000
17
  },
18
  {
19
+ "epoch": 0.11,
20
+ "grad_norm": 1.9583632946014404,
21
+ "learning_rate": 0.0007989814716192582,
22
+ "loss": 1.7524,
23
  "step": 2000
24
  },
25
+ {
26
+ "epoch": 0.17,
27
+ "grad_norm": 1.9535893201828003,
28
+ "learning_rate": 0.0007977072349143816,
29
+ "loss": 1.6001,
30
+ "step": 3000
31
+ },
32
+ {
33
+ "epoch": 0.23,
34
+ "grad_norm": 2.063387632369995,
35
+ "learning_rate": 0.0007959249690268624,
36
+ "loss": 1.5217,
37
+ "step": 4000
38
+ },
39
+ {
40
+ "epoch": 0.28,
41
+ "grad_norm": 1.9860559701919556,
42
+ "learning_rate": 0.0007936369506140068,
43
+ "loss": 1.4668,
44
+ "step": 5000
45
+ },
46
+ {
47
+ "epoch": 0.28,
48
+ "eval_accuracy": 0.648804,
49
+ "eval_loss": 1.4146429300308228,
50
+ "eval_runtime": 478.1117,
51
+ "eval_samples_per_second": 522.89,
52
+ "eval_steps_per_second": 2.043,
53
+ "step": 5000
54
+ },
55
+ {
56
+ "epoch": 0.34,
57
+ "grad_norm": 2.0301945209503174,
58
+ "learning_rate": 0.0007908461023788039,
59
+ "loss": 1.4298,
60
+ "step": 6000
61
+ },
62
+ {
63
+ "epoch": 0.4,
64
+ "grad_norm": 1.5042238235473633,
65
+ "learning_rate": 0.0007875595274863339,
66
+ "loss": 1.3942,
67
+ "step": 7000
68
+ },
69
+ {
70
+ "epoch": 0.46,
71
+ "grad_norm": 1.6014552116394043,
72
+ "learning_rate": 0.0007837748451621974,
73
+ "loss": 1.3678,
74
+ "step": 8000
75
+ },
76
+ {
77
+ "epoch": 0.51,
78
+ "grad_norm": 1.509202241897583,
79
+ "learning_rate": 0.0007795044488381889,
80
+ "loss": 1.3487,
81
+ "step": 9000
82
+ },
83
+ {
84
+ "epoch": 0.57,
85
+ "grad_norm": 1.4688481092453003,
86
+ "learning_rate": 0.0007747452450657864,
87
+ "loss": 1.3282,
88
+ "step": 10000
89
+ },
90
+ {
91
+ "epoch": 0.57,
92
+ "eval_accuracy": 0.676572,
93
+ "eval_loss": 1.2906984090805054,
94
+ "eval_runtime": 477.0964,
95
+ "eval_samples_per_second": 524.003,
96
+ "eval_steps_per_second": 2.048,
97
+ "step": 10000
98
+ },
99
+ {
100
+ "epoch": 0.63,
101
+ "grad_norm": 1.4521937370300293,
102
+ "learning_rate": 0.0007695128184733147,
103
+ "loss": 1.3244,
104
+ "step": 11000
105
+ },
106
+ {
107
+ "epoch": 0.68,
108
+ "grad_norm": 1.4638675451278687,
109
+ "learning_rate": 0.000763803378723512,
110
+ "loss": 1.2934,
111
+ "step": 12000
112
+ },
113
+ {
114
+ "epoch": 0.74,
115
+ "grad_norm": 1.614662528038025,
116
+ "learning_rate": 0.0007576292184031235,
117
+ "loss": 1.2847,
118
+ "step": 13000
119
+ },
120
+ {
121
+ "epoch": 0.8,
122
+ "grad_norm": 1.3933509588241577,
123
+ "learning_rate": 0.0007510050807642281,
124
+ "loss": 1.2779,
125
+ "step": 14000
126
+ },
127
+ {
128
+ "epoch": 0.85,
129
+ "grad_norm": 1.424271583557129,
130
+ "learning_rate": 0.0007439261673227427,
131
+ "loss": 1.2617,
132
+ "step": 15000
133
+ },
134
+ {
135
+ "epoch": 0.85,
136
+ "eval_accuracy": 0.690468,
137
+ "eval_loss": 1.227000117301941,
138
+ "eval_runtime": 474.9613,
139
+ "eval_samples_per_second": 526.359,
140
+ "eval_steps_per_second": 2.057,
141
+ "step": 15000
142
+ },
143
+ {
144
+ "epoch": 0.91,
145
+ "grad_norm": 1.5942885875701904,
146
+ "learning_rate": 0.0007364156588963428,
147
+ "loss": 1.2525,
148
+ "step": 16000
149
+ },
150
+ {
151
+ "epoch": 0.97,
152
+ "grad_norm": 1.342366099357605,
153
+ "learning_rate": 0.0007284681149194631,
154
+ "loss": 1.2437,
155
+ "step": 17000
156
+ },
157
+ {
158
+ "epoch": 1.02,
159
+ "grad_norm": 1.3556225299835205,
160
+ "learning_rate": 0.0007201095606528187,
161
+ "loss": 1.2252,
162
+ "step": 18000
163
+ },
164
+ {
165
+ "epoch": 1.08,
166
+ "grad_norm": 1.18772554397583,
167
+ "learning_rate": 0.0007113339412117152,
168
+ "loss": 1.1982,
169
+ "step": 19000
170
+ },
171
+ {
172
+ "epoch": 1.14,
173
+ "grad_norm": 1.2163102626800537,
174
+ "learning_rate": 0.0007021606253904673,
175
+ "loss": 1.196,
176
+ "step": 20000
177
+ },
178
+ {
179
+ "epoch": 1.14,
180
+ "eval_accuracy": 0.703472,
181
+ "eval_loss": 1.1758021116256714,
182
+ "eval_runtime": 477.2497,
183
+ "eval_samples_per_second": 523.835,
184
+ "eval_steps_per_second": 2.047,
185
+ "step": 20000
186
+ },
187
+ {
188
+ "epoch": 1.19,
189
+ "grad_norm": 1.4264676570892334,
190
+ "learning_rate": 0.0006926013311356693,
191
+ "loss": 1.1909,
192
+ "step": 21000
193
+ },
194
+ {
195
+ "epoch": 1.25,
196
+ "grad_norm": 1.4394527673721313,
197
+ "learning_rate": 0.0006826783850151473,
198
+ "loss": 1.1906,
199
+ "step": 22000
200
+ },
201
+ {
202
+ "epoch": 1.31,
203
+ "grad_norm": 1.6702680587768555,
204
+ "learning_rate": 0.0006723845989084832,
205
+ "loss": 1.1836,
206
+ "step": 23000
207
+ },
208
+ {
209
+ "epoch": 1.37,
210
+ "grad_norm": 1.4739456176757812,
211
+ "learning_rate": 0.0006617536811497173,
212
+ "loss": 1.1826,
213
+ "step": 24000
214
+ },
215
+ {
216
+ "epoch": 1.42,
217
+ "grad_norm": 1.8557323217391968,
218
+ "learning_rate": 0.0006507779307681826,
219
+ "loss": 1.1664,
220
+ "step": 25000
221
+ },
222
+ {
223
+ "epoch": 1.42,
224
+ "eval_accuracy": 0.709336,
225
+ "eval_loss": 1.1526598930358887,
226
+ "eval_runtime": 478.812,
227
+ "eval_samples_per_second": 522.126,
228
+ "eval_steps_per_second": 2.04,
229
+ "step": 25000
230
+ },
231
+ {
232
+ "epoch": 1.48,
233
+ "grad_norm": 1.0533519983291626,
234
+ "learning_rate": 0.0006394818379437445,
235
+ "loss": 1.1641,
236
+ "step": 26000
237
+ },
238
+ {
239
+ "epoch": 1.54,
240
+ "grad_norm": 1.3917081356048584,
241
+ "learning_rate": 0.0006278798322474558,
242
+ "loss": 1.1606,
243
+ "step": 27000
244
+ },
245
+ {
246
+ "epoch": 1.59,
247
+ "grad_norm": 1.281437873840332,
248
+ "learning_rate": 0.000616010800690931,
249
+ "loss": 1.1553,
250
+ "step": 28000
251
+ },
252
+ {
253
+ "epoch": 1.65,
254
+ "grad_norm": 1.2246198654174805,
255
+ "learning_rate": 0.0006038423385201748,
256
+ "loss": 1.149,
257
+ "step": 29000
258
+ },
259
+ {
260
+ "epoch": 1.71,
261
+ "grad_norm": 1.2770333290100098,
262
+ "learning_rate": 0.0005914134891913911,
263
+ "loss": 1.1504,
264
+ "step": 30000
265
+ },
266
+ {
267
+ "epoch": 1.71,
268
+ "eval_accuracy": 0.717008,
269
+ "eval_loss": 1.115194320678711,
270
+ "eval_runtime": 479.3599,
271
+ "eval_samples_per_second": 521.529,
272
+ "eval_steps_per_second": 2.038,
273
+ "step": 30000
274
+ },
275
+ {
276
+ "epoch": 1.76,
277
+ "grad_norm": 1.062719702720642,
278
+ "learning_rate": 0.0005787401292529838,
279
+ "loss": 1.1433,
280
+ "step": 31000
281
+ },
282
+ {
283
+ "epoch": 1.82,
284
+ "grad_norm": 1.4529552459716797,
285
+ "learning_rate": 0.0005658514578562903,
286
+ "loss": 1.1445,
287
+ "step": 32000
288
+ },
289
+ {
290
+ "epoch": 1.88,
291
+ "grad_norm": 1.107438564300537,
292
+ "learning_rate": 0.000552738138527959,
293
+ "loss": 1.1345,
294
+ "step": 33000
295
+ },
296
+ {
297
+ "epoch": 1.93,
298
+ "grad_norm": 1.309801697731018,
299
+ "learning_rate": 0.0005394431125282525,
300
+ "loss": 1.1255,
301
+ "step": 34000
302
+ },
303
+ {
304
+ "epoch": 1.99,
305
+ "grad_norm": 1.2301312685012817,
306
+ "learning_rate": 0.0005259567490230731,
307
+ "loss": 1.1234,
308
+ "step": 35000
309
+ },
310
+ {
311
+ "epoch": 1.99,
312
+ "eval_accuracy": 0.724084,
313
+ "eval_loss": 1.0902520418167114,
314
+ "eval_runtime": 480.0595,
315
+ "eval_samples_per_second": 520.769,
316
+ "eval_steps_per_second": 2.035,
317
+ "step": 35000
318
+ },
319
+ {
320
+ "epoch": 2.05,
321
+ "grad_norm": 1.4310553073883057,
322
+ "learning_rate": 0.0005123232108540917,
323
+ "loss": 1.0933,
324
+ "step": 36000
325
+ },
326
+ {
327
+ "epoch": 2.1,
328
+ "grad_norm": 1.1885666847229004,
329
+ "learning_rate": 0.0004985326219711018,
330
+ "loss": 1.0855,
331
+ "step": 37000
332
+ },
333
+ {
334
+ "epoch": 2.16,
335
+ "grad_norm": 1.2444497346878052,
336
+ "learning_rate": 0.0004846161680220303,
337
+ "loss": 1.0785,
338
+ "step": 38000
339
+ },
340
+ {
341
+ "epoch": 2.22,
342
+ "grad_norm": 1.5714720487594604,
343
+ "learning_rate": 0.0004705916258137954,
344
+ "loss": 1.0748,
345
+ "step": 39000
346
+ },
347
+ {
348
+ "epoch": 2.28,
349
+ "grad_norm": 1.5065577030181885,
350
+ "learning_rate": 0.0004564910639893322,
351
+ "loss": 1.0819,
352
+ "step": 40000
353
+ },
354
+ {
355
+ "epoch": 2.28,
356
+ "eval_accuracy": 0.728284,
357
+ "eval_loss": 1.072808027267456,
358
+ "eval_runtime": 478.1415,
359
+ "eval_samples_per_second": 522.858,
360
+ "eval_steps_per_second": 2.043,
361
+ "step": 40000
362
+ },
363
+ {
364
+ "epoch": 2.33,
365
+ "grad_norm": 1.459184169769287,
366
+ "learning_rate": 0.00044230426818940436,
367
+ "loss": 1.077,
368
+ "step": 41000
369
+ },
370
+ {
371
+ "epoch": 2.39,
372
+ "grad_norm": 1.0881271362304688,
373
+ "learning_rate": 0.0004280776949118281,
374
+ "loss": 1.0763,
375
+ "step": 42000
376
+ },
377
+ {
378
+ "epoch": 2.45,
379
+ "grad_norm": 1.257857322692871,
380
+ "learning_rate": 0.000413801038515455,
381
+ "loss": 1.0654,
382
+ "step": 43000
383
+ },
384
+ {
385
+ "epoch": 2.5,
386
+ "grad_norm": 1.3307342529296875,
387
+ "learning_rate": 0.000399521049758247,
388
+ "loss": 1.0652,
389
+ "step": 44000
390
+ },
391
+ {
392
+ "epoch": 2.56,
393
+ "grad_norm": 1.1706323623657227,
394
+ "learning_rate": 0.00038522738430521474,
395
+ "loss": 1.0707,
396
+ "step": 45000
397
+ },
398
+ {
399
+ "epoch": 2.56,
400
+ "eval_accuracy": 0.733432,
401
+ "eval_loss": 1.0532697439193726,
402
+ "eval_runtime": 479.4432,
403
+ "eval_samples_per_second": 521.438,
404
+ "eval_steps_per_second": 2.038,
405
+ "step": 45000
406
+ },
407
+ {
408
+ "epoch": 2.62,
409
+ "grad_norm": 1.3150368928909302,
410
+ "learning_rate": 0.00037096684861267625,
411
+ "loss": 1.0657,
412
+ "step": 46000
413
+ },
414
+ {
415
+ "epoch": 2.67,
416
+ "grad_norm": 1.3128031492233276,
417
+ "learning_rate": 0.00035672911243732087,
418
+ "loss": 1.0601,
419
+ "step": 47000
420
+ },
421
+ {
422
+ "epoch": 2.73,
423
+ "grad_norm": 1.3551691770553589,
424
+ "learning_rate": 0.0003425607990878131,
425
+ "loss": 1.0575,
426
+ "step": 48000
427
+ },
428
+ {
429
+ "epoch": 2.79,
430
+ "grad_norm": 1.2833973169326782,
431
+ "learning_rate": 0.0003284516451261337,
432
+ "loss": 1.059,
433
+ "step": 49000
434
+ },
435
+ {
436
+ "epoch": 2.84,
437
+ "grad_norm": 1.4435667991638184,
438
+ "learning_rate": 0.000314447852801857,
439
+ "loss": 1.049,
440
+ "step": 50000
441
+ },
442
+ {
443
+ "epoch": 2.84,
444
+ "eval_accuracy": 0.736904,
445
+ "eval_loss": 1.039907693862915,
446
+ "eval_runtime": 482.5464,
447
+ "eval_samples_per_second": 518.085,
448
+ "eval_steps_per_second": 2.025,
449
+ "step": 50000
450
+ },
451
+ {
452
+ "epoch": 2.9,
453
+ "grad_norm": 1.2984023094177246,
454
+ "learning_rate": 0.0003005392778549707,
455
+ "loss": 1.0418,
456
+ "step": 51000
457
+ },
458
+ {
459
+ "epoch": 2.96,
460
+ "grad_norm": 1.109937310218811,
461
+ "learning_rate": 0.0002867714657131215,
462
+ "loss": 1.048,
463
+ "step": 52000
464
+ },
465
+ {
466
+ "epoch": 3.01,
467
+ "grad_norm": 1.1215381622314453,
468
+ "learning_rate": 0.00027313444305887276,
469
+ "loss": 1.0303,
470
+ "step": 53000
471
+ },
472
+ {
473
+ "epoch": 3.07,
474
+ "grad_norm": 1.3672548532485962,
475
+ "learning_rate": 0.00025967286608676553,
476
+ "loss": 1.0047,
477
+ "step": 54000
478
+ },
479
+ {
480
+ "epoch": 3.13,
481
+ "grad_norm": 1.4645944833755493,
482
+ "learning_rate": 0.0002463769833142144,
483
+ "loss": 1.0017,
484
+ "step": 55000
485
+ },
486
+ {
487
+ "epoch": 3.13,
488
+ "eval_accuracy": 0.740544,
489
+ "eval_loss": 1.0252685546875,
490
+ "eval_runtime": 480.6997,
491
+ "eval_samples_per_second": 520.075,
492
+ "eval_steps_per_second": 2.032,
493
+ "step": 55000
494
+ },
495
+ {
496
+ "epoch": 3.19,
497
+ "grad_norm": 1.3108264207839966,
498
+ "learning_rate": 0.00023329033382909358,
499
+ "loss": 1.0012,
500
+ "step": 56000
501
+ },
502
+ {
503
+ "epoch": 3.24,
504
+ "grad_norm": 1.4289699792861938,
505
+ "learning_rate": 0.00022040343774395584,
506
+ "loss": 0.9987,
507
+ "step": 57000
508
+ },
509
+ {
510
+ "epoch": 3.3,
511
+ "grad_norm": 1.2855439186096191,
512
+ "learning_rate": 0.00020775849486686778,
513
+ "loss": 0.9988,
514
+ "step": 58000
515
+ },
516
+ {
517
+ "epoch": 3.36,
518
+ "grad_norm": 1.162169098854065,
519
+ "learning_rate": 0.00019534634527990013,
520
+ "loss": 1.0006,
521
+ "step": 59000
522
+ },
523
+ {
524
+ "epoch": 3.41,
525
+ "grad_norm": 1.2256358861923218,
526
+ "learning_rate": 0.00018320763417230612,
527
+ "loss": 0.995,
528
+ "step": 60000
529
+ },
530
+ {
531
+ "epoch": 3.41,
532
+ "eval_accuracy": 0.743764,
533
+ "eval_loss": 1.0119822025299072,
534
+ "eval_runtime": 480.4029,
535
+ "eval_samples_per_second": 520.396,
536
+ "eval_steps_per_second": 2.034,
537
+ "step": 60000
538
+ },
539
+ {
540
+ "epoch": 3.47,
541
+ "grad_norm": 1.2014998197555542,
542
+ "learning_rate": 0.00017133356833730004,
543
+ "loss": 0.9976,
544
+ "step": 61000
545
+ },
546
+ {
547
+ "epoch": 3.53,
548
+ "grad_norm": 1.1588609218597412,
549
+ "learning_rate": 0.00015975159982961664,
550
+ "loss": 1.001,
551
+ "step": 62000
552
+ },
553
+ {
554
+ "epoch": 3.58,
555
+ "grad_norm": 1.2534708976745605,
556
+ "learning_rate": 0.00014847652339644127,
557
+ "loss": 0.9967,
558
+ "step": 63000
559
+ },
560
+ {
561
+ "epoch": 3.64,
562
+ "grad_norm": 1.2062395811080933,
563
+ "learning_rate": 0.00013753353036528426,
564
+ "loss": 0.9972,
565
+ "step": 64000
566
+ },
567
+ {
568
+ "epoch": 3.7,
569
+ "grad_norm": 1.280290126800537,
570
+ "learning_rate": 0.00012691469370009338,
571
+ "loss": 0.9829,
572
+ "step": 65000
573
+ },
574
+ {
575
+ "epoch": 3.7,
576
+ "eval_accuracy": 0.746796,
577
+ "eval_loss": 0.9977088570594788,
578
+ "eval_runtime": 476.9935,
579
+ "eval_samples_per_second": 524.116,
580
+ "eval_steps_per_second": 2.048,
581
+ "step": 65000
582
+ },
583
+ {
584
+ "epoch": 3.75,
585
+ "grad_norm": 1.2085012197494507,
586
+ "learning_rate": 0.00011665478615500634,
587
+ "loss": 0.9863,
588
+ "step": 66000
589
+ },
590
+ {
591
+ "epoch": 3.81,
592
+ "grad_norm": 1.7737034559249878,
593
+ "learning_rate": 0.00010674637551684559,
594
+ "loss": 0.9777,
595
+ "step": 67000
596
+ },
597
+ {
598
+ "epoch": 3.87,
599
+ "grad_norm": 1.1058114767074585,
600
+ "learning_rate": 9.722190815783432e-05,
601
+ "loss": 0.9903,
602
+ "step": 68000
603
+ },
604
+ {
605
+ "epoch": 3.93,
606
+ "grad_norm": 1.285079836845398,
607
+ "learning_rate": 8.807448461308951e-05,
608
+ "loss": 0.9814,
609
+ "step": 69000
610
+ },
611
+ {
612
+ "epoch": 3.98,
613
+ "grad_norm": 1.4150310754776,
614
+ "learning_rate": 7.933405930416787e-05,
615
+ "loss": 0.9807,
616
+ "step": 70000
617
+ },
618
+ {
619
+ "epoch": 3.98,
620
+ "eval_accuracy": 0.7487,
621
+ "eval_loss": 0.9908215999603271,
622
+ "eval_runtime": 478.9031,
623
+ "eval_samples_per_second": 522.026,
624
+ "eval_steps_per_second": 2.04,
625
+ "step": 70000
626
+ },
627
+ {
628
+ "epoch": 4.04,
629
+ "grad_norm": 1.0839868783950806,
630
+ "learning_rate": 7.100243228624242e-05,
631
+ "loss": 0.963,
632
+ "step": 71000
633
+ },
634
+ {
635
+ "epoch": 4.1,
636
+ "grad_norm": 1.2299129962921143,
637
+ "learning_rate": 6.30825184828638e-05,
638
+ "loss": 0.955,
639
+ "step": 72000
640
+ },
641
+ {
642
+ "epoch": 4.15,
643
+ "grad_norm": 1.3780415058135986,
644
+ "learning_rate": 5.5592981342836236e-05,
645
+ "loss": 0.9508,
646
+ "step": 73000
647
+ },
648
+ {
649
+ "epoch": 4.21,
650
+ "grad_norm": 1.1320440769195557,
651
+ "learning_rate": 4.8543387962500266e-05,
652
+ "loss": 0.9493,
653
+ "step": 74000
654
+ },
655
+ {
656
+ "epoch": 4.27,
657
+ "grad_norm": 1.120684027671814,
658
+ "learning_rate": 4.1949116996191016e-05,
659
+ "loss": 0.9452,
660
+ "step": 75000
661
+ },
662
+ {
663
+ "epoch": 4.27,
664
+ "eval_accuracy": 0.749816,
665
+ "eval_loss": 0.9875096082687378,
666
+ "eval_runtime": 477.8512,
667
+ "eval_samples_per_second": 523.175,
668
+ "eval_steps_per_second": 2.045,
669
+ "step": 75000
670
+ },
671
+ {
672
+ "epoch": 4.32,
673
+ "grad_norm": 1.4629569053649902,
674
+ "learning_rate": 3.5805391595057494e-05,
675
+ "loss": 0.9484,
676
+ "step": 76000
677
+ },
678
+ {
679
+ "epoch": 4.38,
680
+ "grad_norm": 1.2522917985916138,
681
+ "learning_rate": 3.013233018257653e-05,
682
+ "loss": 0.9567,
683
+ "step": 77000
684
+ },
685
+ {
686
+ "epoch": 4.44,
687
+ "grad_norm": 1.331236720085144,
688
+ "learning_rate": 2.492582322836503e-05,
689
+ "loss": 0.9494,
690
+ "step": 78000
691
+ },
692
+ {
693
+ "epoch": 4.49,
694
+ "grad_norm": 1.312769889831543,
695
+ "learning_rate": 2.0202920112872213e-05,
696
+ "loss": 0.9494,
697
+ "step": 79000
698
+ },
699
+ {
700
+ "epoch": 4.55,
701
+ "grad_norm": 1.0436272621154785,
702
+ "learning_rate": 1.5960199594472392e-05,
703
+ "loss": 0.949,
704
+ "step": 80000
705
+ },
706
+ {
707
+ "epoch": 4.55,
708
+ "eval_accuracy": 0.750728,
709
+ "eval_loss": 0.9844790697097778,
710
+ "eval_runtime": 480.1647,
711
+ "eval_samples_per_second": 520.655,
712
+ "eval_steps_per_second": 2.035,
713
+ "step": 80000
714
+ },
715
+ {
716
+ "epoch": 4.61,
717
+ "grad_norm": 1.7521427869796753,
718
+ "learning_rate": 1.221155501027127e-05,
719
+ "loss": 0.9459,
720
+ "step": 81000
721
+ },
722
+ {
723
+ "epoch": 4.66,
724
+ "grad_norm": 1.4406906366348267,
725
+ "learning_rate": 8.954270865460369e-06,
726
+ "loss": 0.9507,
727
+ "step": 82000
728
+ },
729
+ {
730
+ "epoch": 4.72,
731
+ "grad_norm": 1.2117027044296265,
732
+ "learning_rate": 6.196506529965529e-06,
733
+ "loss": 0.9471,
734
+ "step": 83000
735
+ },
736
+ {
737
+ "epoch": 4.78,
738
+ "grad_norm": 1.1171038150787354,
739
+ "learning_rate": 3.943787197483806e-06,
740
+ "loss": 0.951,
741
+ "step": 84000
742
+ },
743
+ {
744
+ "epoch": 4.84,
745
+ "grad_norm": 1.388543963432312,
746
+ "learning_rate": 2.194481012149785e-06,
747
+ "loss": 0.9509,
748
+ "step": 85000
749
+ },
750
+ {
751
+ "epoch": 4.84,
752
+ "eval_accuracy": 0.750944,
753
+ "eval_loss": 0.984071671962738,
754
+ "eval_runtime": 479.0227,
755
+ "eval_samples_per_second": 521.896,
756
+ "eval_steps_per_second": 2.04,
757
+ "step": 85000
758
+ },
759
+ {
760
+ "epoch": 4.89,
761
+ "grad_norm": 1.2518051862716675,
762
+ "learning_rate": 9.543163033286728e-07,
763
+ "loss": 0.955,
764
+ "step": 86000
765
+ },
766
+ {
767
+ "epoch": 4.95,
768
+ "grad_norm": 1.1738619804382324,
769
+ "learning_rate": 2.2239470342309e-07,
770
+ "loss": 0.9482,
771
+ "step": 87000
772
+ },
773
  {
774
  "epoch": 5.0,
775
+ "step": 87895,
776
+ "total_flos": 5.4597447576e+17,
777
+ "train_loss": 1.1248133655402486,
778
+ "train_runtime": 51144.106,
779
+ "train_samples_per_second": 439.933,
780
+ "train_steps_per_second": 1.719
781
  }
782
  ],
783
  "logging_steps": 1000,
784
+ "max_steps": 87895,
785
  "num_input_tokens_seen": 0,
786
  "num_train_epochs": 5,
787
  "save_steps": 5000,
788
+ "total_flos": 5.4597447576e+17,
789
  "train_batch_size": 256,
790
  "trial_name": null,
791
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57eb28f43f42cafdd51441529e7f5eee73bdb0c9dd9f65304199c2aea12458a4
3
  size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d9a6f1a8233545c2d422a032a79a3101f63ca1427a7c955c87c0125bf8df20e
3
  size 4856