laszlokiss27 committed on
Commit
affe533
1 Parent(s): 96d0021

doodle-dash8

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  license: other
3
- base_model: apple/mobilevit-small
4
  tags:
5
  - generated_from_trainer
6
  metrics:
@@ -15,10 +15,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # results
17
 
18
- This model is a fine-tuned version of [apple/mobilevit-small](https://huggingface.co/apple/mobilevit-small) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.9890
21
- - Accuracy: 0.7494
22
 
23
  ## Model description
24
 
@@ -41,37 +41,19 @@ The following hyperparameters were used during training:
41
  - train_batch_size: 256
42
  - eval_batch_size: 256
43
  - seed: 42
 
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
- - lr_scheduler_type: cosine
46
  - num_epochs: 5
47
  - mixed_precision_training: Native AMP
48
 
49
  ### Training results
50
 
51
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
- |:-------------:|:-----:|:-----:|:---------------:|:--------:|
53
- | 1.4668 | 0.28 | 5000 | 1.4146 | 0.6488 |
54
- | 1.3282 | 0.57 | 10000 | 1.2907 | 0.6766 |
55
- | 1.2617 | 0.85 | 15000 | 1.2270 | 0.6905 |
56
- | 1.196 | 1.14 | 20000 | 1.1758 | 0.7035 |
57
- | 1.1664 | 1.42 | 25000 | 1.1527 | 0.7093 |
58
- | 1.1504 | 1.71 | 30000 | 1.1152 | 0.7170 |
59
- | 1.1234 | 1.99 | 35000 | 1.0903 | 0.7241 |
60
- | 1.0819 | 2.28 | 40000 | 1.0728 | 0.7283 |
61
- | 1.0707 | 2.56 | 45000 | 1.0533 | 0.7334 |
62
- | 1.049 | 2.84 | 50000 | 1.0399 | 0.7369 |
63
- | 1.0017 | 3.13 | 55000 | 1.0253 | 0.7405 |
64
- | 0.995 | 3.41 | 60000 | 1.0120 | 0.7438 |
65
- | 0.9829 | 3.7 | 65000 | 0.9977 | 0.7468 |
66
- | 0.9807 | 3.98 | 70000 | 0.9908 | 0.7487 |
67
- | 0.9452 | 4.27 | 75000 | 0.9875 | 0.7498 |
68
- | 0.949 | 4.55 | 80000 | 0.9845 | 0.7507 |
69
- | 0.9509 | 4.84 | 85000 | 0.9841 | 0.7509 |
70
 
71
 
72
  ### Framework versions
73
 
74
- - Transformers 4.38.2
75
- - Pytorch 2.2.1+cu121
76
- - Datasets 2.18.0
77
- - Tokenizers 0.15.2
 
1
  ---
2
  license: other
3
+ base_model: apple/mobilevitv2-1.0-imagenet1k-256
4
  tags:
5
  - generated_from_trainer
6
  metrics:
 
15
 
16
  # results
17
 
18
+ This model is a fine-tuned version of [apple/mobilevitv2-1.0-imagenet1k-256](https://huggingface.co/apple/mobilevitv2-1.0-imagenet1k-256) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 5.6378
21
+ - Accuracy: 0.84
22
 
23
  ## Model description
24
 
 
41
  - train_batch_size: 256
42
  - eval_batch_size: 256
43
  - seed: 42
44
+ - distributed_type: multi-GPU
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
+ - lr_scheduler_type: linear
47
  - num_epochs: 5
48
  - mixed_precision_training: Native AMP
49
 
50
  ### Training results
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
  ### Framework versions
55
 
56
+ - Transformers 4.40.0
57
+ - Pytorch 2.2.2+cu121
58
+ - Datasets 2.19.0
59
+ - Tokenizers 0.19.1
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.749444,
4
- "eval_loss": 0.9889541864395142,
5
- "eval_runtime": 480.919,
6
- "eval_samples_per_second": 519.838,
7
- "eval_steps_per_second": 2.032,
8
- "total_flos": 5.4597447576e+17,
9
- "train_loss": 1.1248133655402486,
10
- "train_runtime": 51144.106,
11
- "train_samples_per_second": 439.933,
12
- "train_steps_per_second": 1.719
13
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.84,
4
+ "eval_loss": 5.637753486633301,
5
+ "eval_runtime": 0.5983,
6
+ "eval_samples_per_second": 334.259,
7
+ "eval_steps_per_second": 1.671,
8
+ "total_flos": 429498709311488.0,
9
+ "train_loss": 5.7403564453125,
10
+ "train_runtime": 6.8068,
11
+ "train_samples_per_second": 734.556,
12
+ "train_steps_per_second": 2.938
13
  }
config.json CHANGED
@@ -1,26 +1,27 @@
1
  {
2
- "_name_or_path": "apple/mobilevit-small",
3
  "architectures": [
4
- "MobileViTForImageClassification"
5
  ],
6
  "aspp_dropout_prob": 0.1,
7
- "aspp_out_channels": 256,
8
  "atrous_rates": [
9
  6,
10
  12,
11
  18
12
  ],
13
- "attention_probs_dropout_prob": 0.0,
14
- "classifier_dropout_prob": 0.1,
15
- "conv_kernel_size": 3,
16
- "expand_ratio": 4.0,
17
- "hidden_act": "silu",
18
- "hidden_dropout_prob": 0.1,
19
- "hidden_sizes": [
20
- 144,
21
  192,
22
- 240
23
  ],
 
 
 
 
 
 
24
  "id2label": {
25
  "0": "aircraft carrier",
26
  "1": "airplane",
@@ -719,23 +720,18 @@
719
  },
720
  "layer_norm_eps": 1e-05,
721
  "mlp_ratio": 2.0,
722
- "model_type": "mobilevit",
723
- "neck_hidden_sizes": [
724
- 16,
725
- 32,
726
- 64,
727
- 96,
728
- 128,
729
- 160,
730
- 640
731
  ],
732
- "num_attention_heads": 4,
733
  "num_channels": 1,
734
  "output_stride": 32,
735
  "patch_size": 2,
736
  "problem_type": "single_label_classification",
737
- "qkv_bias": true,
738
  "semantic_loss_ignore_index": 255,
739
  "torch_dtype": "float32",
740
- "transformers_version": "4.38.2"
 
741
  }
 
1
  {
2
+ "_name_or_path": "apple/mobilevitv2-1.0-imagenet1k-256",
3
  "architectures": [
4
+ "MobileViTV2ForImageClassification"
5
  ],
6
  "aspp_dropout_prob": 0.1,
7
+ "aspp_out_channels": 512,
8
  "atrous_rates": [
9
  6,
10
  12,
11
  18
12
  ],
13
+ "attn_dropout": 0.0,
14
+ "base_attn_unit_dims": [
15
+ 128,
 
 
 
 
 
16
  192,
17
+ 256
18
  ],
19
+ "classifier_dropout_prob": 0.1,
20
+ "conv_kernel_size": 3,
21
+ "expand_ratio": 2.0,
22
+ "ffn_dropout": 0.0,
23
+ "ffn_multiplier": 2,
24
+ "hidden_act": "swish",
25
  "id2label": {
26
  "0": "aircraft carrier",
27
  "1": "airplane",
 
720
  },
721
  "layer_norm_eps": 1e-05,
722
  "mlp_ratio": 2.0,
723
+ "model_type": "mobilevitv2",
724
+ "n_attn_blocks": [
725
+ 2,
726
+ 4,
727
+ 3
 
 
 
 
728
  ],
 
729
  "num_channels": 1,
730
  "output_stride": 32,
731
  "patch_size": 2,
732
  "problem_type": "single_label_classification",
 
733
  "semantic_loss_ignore_index": 255,
734
  "torch_dtype": "float32",
735
+ "transformers_version": "4.40.0",
736
+ "width_multiplier": 1.0
737
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cdbd0fe9d0acb014744296fad548e9c5803a702dc65c640d5a141db652944dd
3
- size 20730036
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64c40e94a7f11eb9e5f594ab9f6825b2f681c1da4fbb2aeffaaebabf0456aae3
3
+ size 18360744
preprocessor_config.json CHANGED
@@ -1,7 +1,22 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "crop_size": {
3
- "height": 28,
4
- "width": 28
5
  },
6
  "do_center_crop": true,
7
  "do_convert_rgb": false,
@@ -12,6 +27,6 @@
12
  "resample": 2,
13
  "rescale_factor": 0.00392156862745098,
14
  "size": {
15
- "shortest_edge": 28
16
  }
17
  }
 
1
  {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "segmentation_maps",
5
+ "do_resize",
6
+ "size",
7
+ "resample",
8
+ "do_rescale",
9
+ "rescale_factor",
10
+ "do_center_crop",
11
+ "crop_size",
12
+ "do_flip_channel_order",
13
+ "return_tensors",
14
+ "data_format",
15
+ "input_data_format"
16
+ ],
17
  "crop_size": {
18
+ "height": 56,
19
+ "width": 56
20
  },
21
  "do_center_crop": true,
22
  "do_convert_rgb": false,
 
27
  "resample": 2,
28
  "rescale_factor": 0.00392156862745098,
29
  "size": {
30
+ "shortest_edge": 56
31
  }
32
  }
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.749444,
4
- "eval_loss": 0.9889541864395142,
5
- "eval_runtime": 480.919,
6
- "eval_samples_per_second": 519.838,
7
- "eval_steps_per_second": 2.032
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.84,
4
+ "eval_loss": 5.637753486633301,
5
+ "eval_runtime": 0.5983,
6
+ "eval_samples_per_second": 334.259,
7
+ "eval_steps_per_second": 1.671
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 5.4597447576e+17,
4
- "train_loss": 1.1248133655402486,
5
- "train_runtime": 51144.106,
6
- "train_samples_per_second": 439.933,
7
- "train_steps_per_second": 1.719
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 429498709311488.0,
4
+ "train_loss": 5.7403564453125,
5
+ "train_runtime": 6.8068,
6
+ "train_samples_per_second": 734.556,
7
+ "train_steps_per_second": 2.938
8
  }
trainer_state.json CHANGED
@@ -3,789 +3,27 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 5000,
6
- "global_step": 87895,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
- {
12
- "epoch": 0.06,
13
- "grad_norm": 2.239067792892456,
14
- "learning_rate": 0.0007997460514380593,
15
- "loss": 2.6519,
16
- "step": 1000
17
- },
18
- {
19
- "epoch": 0.11,
20
- "grad_norm": 1.9583632946014404,
21
- "learning_rate": 0.0007989814716192582,
22
- "loss": 1.7524,
23
- "step": 2000
24
- },
25
- {
26
- "epoch": 0.17,
27
- "grad_norm": 1.9535893201828003,
28
- "learning_rate": 0.0007977072349143816,
29
- "loss": 1.6001,
30
- "step": 3000
31
- },
32
- {
33
- "epoch": 0.23,
34
- "grad_norm": 2.063387632369995,
35
- "learning_rate": 0.0007959249690268624,
36
- "loss": 1.5217,
37
- "step": 4000
38
- },
39
- {
40
- "epoch": 0.28,
41
- "grad_norm": 1.9860559701919556,
42
- "learning_rate": 0.0007936369506140068,
43
- "loss": 1.4668,
44
- "step": 5000
45
- },
46
- {
47
- "epoch": 0.28,
48
- "eval_accuracy": 0.648804,
49
- "eval_loss": 1.4146429300308228,
50
- "eval_runtime": 478.1117,
51
- "eval_samples_per_second": 522.89,
52
- "eval_steps_per_second": 2.043,
53
- "step": 5000
54
- },
55
- {
56
- "epoch": 0.34,
57
- "grad_norm": 2.0301945209503174,
58
- "learning_rate": 0.0007908461023788039,
59
- "loss": 1.4298,
60
- "step": 6000
61
- },
62
- {
63
- "epoch": 0.4,
64
- "grad_norm": 1.5042238235473633,
65
- "learning_rate": 0.0007875595274863339,
66
- "loss": 1.3942,
67
- "step": 7000
68
- },
69
- {
70
- "epoch": 0.46,
71
- "grad_norm": 1.6014552116394043,
72
- "learning_rate": 0.0007837748451621974,
73
- "loss": 1.3678,
74
- "step": 8000
75
- },
76
- {
77
- "epoch": 0.51,
78
- "grad_norm": 1.509202241897583,
79
- "learning_rate": 0.0007795044488381889,
80
- "loss": 1.3487,
81
- "step": 9000
82
- },
83
- {
84
- "epoch": 0.57,
85
- "grad_norm": 1.4688481092453003,
86
- "learning_rate": 0.0007747452450657864,
87
- "loss": 1.3282,
88
- "step": 10000
89
- },
90
- {
91
- "epoch": 0.57,
92
- "eval_accuracy": 0.676572,
93
- "eval_loss": 1.2906984090805054,
94
- "eval_runtime": 477.0964,
95
- "eval_samples_per_second": 524.003,
96
- "eval_steps_per_second": 2.048,
97
- "step": 10000
98
- },
99
- {
100
- "epoch": 0.63,
101
- "grad_norm": 1.4521937370300293,
102
- "learning_rate": 0.0007695128184733147,
103
- "loss": 1.3244,
104
- "step": 11000
105
- },
106
- {
107
- "epoch": 0.68,
108
- "grad_norm": 1.4638675451278687,
109
- "learning_rate": 0.000763803378723512,
110
- "loss": 1.2934,
111
- "step": 12000
112
- },
113
- {
114
- "epoch": 0.74,
115
- "grad_norm": 1.614662528038025,
116
- "learning_rate": 0.0007576292184031235,
117
- "loss": 1.2847,
118
- "step": 13000
119
- },
120
- {
121
- "epoch": 0.8,
122
- "grad_norm": 1.3933509588241577,
123
- "learning_rate": 0.0007510050807642281,
124
- "loss": 1.2779,
125
- "step": 14000
126
- },
127
- {
128
- "epoch": 0.85,
129
- "grad_norm": 1.424271583557129,
130
- "learning_rate": 0.0007439261673227427,
131
- "loss": 1.2617,
132
- "step": 15000
133
- },
134
- {
135
- "epoch": 0.85,
136
- "eval_accuracy": 0.690468,
137
- "eval_loss": 1.227000117301941,
138
- "eval_runtime": 474.9613,
139
- "eval_samples_per_second": 526.359,
140
- "eval_steps_per_second": 2.057,
141
- "step": 15000
142
- },
143
- {
144
- "epoch": 0.91,
145
- "grad_norm": 1.5942885875701904,
146
- "learning_rate": 0.0007364156588963428,
147
- "loss": 1.2525,
148
- "step": 16000
149
- },
150
- {
151
- "epoch": 0.97,
152
- "grad_norm": 1.342366099357605,
153
- "learning_rate": 0.0007284681149194631,
154
- "loss": 1.2437,
155
- "step": 17000
156
- },
157
- {
158
- "epoch": 1.02,
159
- "grad_norm": 1.3556225299835205,
160
- "learning_rate": 0.0007201095606528187,
161
- "loss": 1.2252,
162
- "step": 18000
163
- },
164
- {
165
- "epoch": 1.08,
166
- "grad_norm": 1.18772554397583,
167
- "learning_rate": 0.0007113339412117152,
168
- "loss": 1.1982,
169
- "step": 19000
170
- },
171
- {
172
- "epoch": 1.14,
173
- "grad_norm": 1.2163102626800537,
174
- "learning_rate": 0.0007021606253904673,
175
- "loss": 1.196,
176
- "step": 20000
177
- },
178
- {
179
- "epoch": 1.14,
180
- "eval_accuracy": 0.703472,
181
- "eval_loss": 1.1758021116256714,
182
- "eval_runtime": 477.2497,
183
- "eval_samples_per_second": 523.835,
184
- "eval_steps_per_second": 2.047,
185
- "step": 20000
186
- },
187
- {
188
- "epoch": 1.19,
189
- "grad_norm": 1.4264676570892334,
190
- "learning_rate": 0.0006926013311356693,
191
- "loss": 1.1909,
192
- "step": 21000
193
- },
194
- {
195
- "epoch": 1.25,
196
- "grad_norm": 1.4394527673721313,
197
- "learning_rate": 0.0006826783850151473,
198
- "loss": 1.1906,
199
- "step": 22000
200
- },
201
- {
202
- "epoch": 1.31,
203
- "grad_norm": 1.6702680587768555,
204
- "learning_rate": 0.0006723845989084832,
205
- "loss": 1.1836,
206
- "step": 23000
207
- },
208
- {
209
- "epoch": 1.37,
210
- "grad_norm": 1.4739456176757812,
211
- "learning_rate": 0.0006617536811497173,
212
- "loss": 1.1826,
213
- "step": 24000
214
- },
215
- {
216
- "epoch": 1.42,
217
- "grad_norm": 1.8557323217391968,
218
- "learning_rate": 0.0006507779307681826,
219
- "loss": 1.1664,
220
- "step": 25000
221
- },
222
- {
223
- "epoch": 1.42,
224
- "eval_accuracy": 0.709336,
225
- "eval_loss": 1.1526598930358887,
226
- "eval_runtime": 478.812,
227
- "eval_samples_per_second": 522.126,
228
- "eval_steps_per_second": 2.04,
229
- "step": 25000
230
- },
231
- {
232
- "epoch": 1.48,
233
- "grad_norm": 1.0533519983291626,
234
- "learning_rate": 0.0006394818379437445,
235
- "loss": 1.1641,
236
- "step": 26000
237
- },
238
- {
239
- "epoch": 1.54,
240
- "grad_norm": 1.3917081356048584,
241
- "learning_rate": 0.0006278798322474558,
242
- "loss": 1.1606,
243
- "step": 27000
244
- },
245
- {
246
- "epoch": 1.59,
247
- "grad_norm": 1.281437873840332,
248
- "learning_rate": 0.000616010800690931,
249
- "loss": 1.1553,
250
- "step": 28000
251
- },
252
- {
253
- "epoch": 1.65,
254
- "grad_norm": 1.2246198654174805,
255
- "learning_rate": 0.0006038423385201748,
256
- "loss": 1.149,
257
- "step": 29000
258
- },
259
- {
260
- "epoch": 1.71,
261
- "grad_norm": 1.2770333290100098,
262
- "learning_rate": 0.0005914134891913911,
263
- "loss": 1.1504,
264
- "step": 30000
265
- },
266
- {
267
- "epoch": 1.71,
268
- "eval_accuracy": 0.717008,
269
- "eval_loss": 1.115194320678711,
270
- "eval_runtime": 479.3599,
271
- "eval_samples_per_second": 521.529,
272
- "eval_steps_per_second": 2.038,
273
- "step": 30000
274
- },
275
- {
276
- "epoch": 1.76,
277
- "grad_norm": 1.062719702720642,
278
- "learning_rate": 0.0005787401292529838,
279
- "loss": 1.1433,
280
- "step": 31000
281
- },
282
- {
283
- "epoch": 1.82,
284
- "grad_norm": 1.4529552459716797,
285
- "learning_rate": 0.0005658514578562903,
286
- "loss": 1.1445,
287
- "step": 32000
288
- },
289
- {
290
- "epoch": 1.88,
291
- "grad_norm": 1.107438564300537,
292
- "learning_rate": 0.000552738138527959,
293
- "loss": 1.1345,
294
- "step": 33000
295
- },
296
- {
297
- "epoch": 1.93,
298
- "grad_norm": 1.309801697731018,
299
- "learning_rate": 0.0005394431125282525,
300
- "loss": 1.1255,
301
- "step": 34000
302
- },
303
- {
304
- "epoch": 1.99,
305
- "grad_norm": 1.2301312685012817,
306
- "learning_rate": 0.0005259567490230731,
307
- "loss": 1.1234,
308
- "step": 35000
309
- },
310
- {
311
- "epoch": 1.99,
312
- "eval_accuracy": 0.724084,
313
- "eval_loss": 1.0902520418167114,
314
- "eval_runtime": 480.0595,
315
- "eval_samples_per_second": 520.769,
316
- "eval_steps_per_second": 2.035,
317
- "step": 35000
318
- },
319
- {
320
- "epoch": 2.05,
321
- "grad_norm": 1.4310553073883057,
322
- "learning_rate": 0.0005123232108540917,
323
- "loss": 1.0933,
324
- "step": 36000
325
- },
326
- {
327
- "epoch": 2.1,
328
- "grad_norm": 1.1885666847229004,
329
- "learning_rate": 0.0004985326219711018,
330
- "loss": 1.0855,
331
- "step": 37000
332
- },
333
- {
334
- "epoch": 2.16,
335
- "grad_norm": 1.2444497346878052,
336
- "learning_rate": 0.0004846161680220303,
337
- "loss": 1.0785,
338
- "step": 38000
339
- },
340
- {
341
- "epoch": 2.22,
342
- "grad_norm": 1.5714720487594604,
343
- "learning_rate": 0.0004705916258137954,
344
- "loss": 1.0748,
345
- "step": 39000
346
- },
347
- {
348
- "epoch": 2.28,
349
- "grad_norm": 1.5065577030181885,
350
- "learning_rate": 0.0004564910639893322,
351
- "loss": 1.0819,
352
- "step": 40000
353
- },
354
- {
355
- "epoch": 2.28,
356
- "eval_accuracy": 0.728284,
357
- "eval_loss": 1.072808027267456,
358
- "eval_runtime": 478.1415,
359
- "eval_samples_per_second": 522.858,
360
- "eval_steps_per_second": 2.043,
361
- "step": 40000
362
- },
363
- {
364
- "epoch": 2.33,
365
- "grad_norm": 1.459184169769287,
366
- "learning_rate": 0.00044230426818940436,
367
- "loss": 1.077,
368
- "step": 41000
369
- },
370
- {
371
- "epoch": 2.39,
372
- "grad_norm": 1.0881271362304688,
373
- "learning_rate": 0.0004280776949118281,
374
- "loss": 1.0763,
375
- "step": 42000
376
- },
377
- {
378
- "epoch": 2.45,
379
- "grad_norm": 1.257857322692871,
380
- "learning_rate": 0.000413801038515455,
381
- "loss": 1.0654,
382
- "step": 43000
383
- },
384
- {
385
- "epoch": 2.5,
386
- "grad_norm": 1.3307342529296875,
387
- "learning_rate": 0.000399521049758247,
388
- "loss": 1.0652,
389
- "step": 44000
390
- },
391
- {
392
- "epoch": 2.56,
393
- "grad_norm": 1.1706323623657227,
394
- "learning_rate": 0.00038522738430521474,
395
- "loss": 1.0707,
396
- "step": 45000
397
- },
398
- {
399
- "epoch": 2.56,
400
- "eval_accuracy": 0.733432,
401
- "eval_loss": 1.0532697439193726,
402
- "eval_runtime": 479.4432,
403
- "eval_samples_per_second": 521.438,
404
- "eval_steps_per_second": 2.038,
405
- "step": 45000
406
- },
407
- {
408
- "epoch": 2.62,
409
- "grad_norm": 1.3150368928909302,
410
- "learning_rate": 0.00037096684861267625,
411
- "loss": 1.0657,
412
- "step": 46000
413
- },
414
- {
415
- "epoch": 2.67,
416
- "grad_norm": 1.3128031492233276,
417
- "learning_rate": 0.00035672911243732087,
418
- "loss": 1.0601,
419
- "step": 47000
420
- },
421
- {
422
- "epoch": 2.73,
423
- "grad_norm": 1.3551691770553589,
424
- "learning_rate": 0.0003425607990878131,
425
- "loss": 1.0575,
426
- "step": 48000
427
- },
428
- {
429
- "epoch": 2.79,
430
- "grad_norm": 1.2833973169326782,
431
- "learning_rate": 0.0003284516451261337,
432
- "loss": 1.059,
433
- "step": 49000
434
- },
435
- {
436
- "epoch": 2.84,
437
- "grad_norm": 1.4435667991638184,
438
- "learning_rate": 0.000314447852801857,
439
- "loss": 1.049,
440
- "step": 50000
441
- },
442
- {
443
- "epoch": 2.84,
444
- "eval_accuracy": 0.736904,
445
- "eval_loss": 1.039907693862915,
446
- "eval_runtime": 482.5464,
447
- "eval_samples_per_second": 518.085,
448
- "eval_steps_per_second": 2.025,
449
- "step": 50000
450
- },
451
- {
452
- "epoch": 2.9,
453
- "grad_norm": 1.2984023094177246,
454
- "learning_rate": 0.0003005392778549707,
455
- "loss": 1.0418,
456
- "step": 51000
457
- },
458
- {
459
- "epoch": 2.96,
460
- "grad_norm": 1.109937310218811,
461
- "learning_rate": 0.0002867714657131215,
462
- "loss": 1.048,
463
- "step": 52000
464
- },
465
- {
466
- "epoch": 3.01,
467
- "grad_norm": 1.1215381622314453,
468
- "learning_rate": 0.00027313444305887276,
469
- "loss": 1.0303,
470
- "step": 53000
471
- },
472
- {
473
- "epoch": 3.07,
474
- "grad_norm": 1.3672548532485962,
475
- "learning_rate": 0.00025967286608676553,
476
- "loss": 1.0047,
477
- "step": 54000
478
- },
479
- {
480
- "epoch": 3.13,
481
- "grad_norm": 1.4645944833755493,
482
- "learning_rate": 0.0002463769833142144,
483
- "loss": 1.0017,
484
- "step": 55000
485
- },
486
- {
487
- "epoch": 3.13,
488
- "eval_accuracy": 0.740544,
489
- "eval_loss": 1.0252685546875,
490
- "eval_runtime": 480.6997,
491
- "eval_samples_per_second": 520.075,
492
- "eval_steps_per_second": 2.032,
493
- "step": 55000
494
- },
495
- {
496
- "epoch": 3.19,
497
- "grad_norm": 1.3108264207839966,
498
- "learning_rate": 0.00023329033382909358,
499
- "loss": 1.0012,
500
- "step": 56000
501
- },
502
- {
503
- "epoch": 3.24,
504
- "grad_norm": 1.4289699792861938,
505
- "learning_rate": 0.00022040343774395584,
506
- "loss": 0.9987,
507
- "step": 57000
508
- },
509
- {
510
- "epoch": 3.3,
511
- "grad_norm": 1.2855439186096191,
512
- "learning_rate": 0.00020775849486686778,
513
- "loss": 0.9988,
514
- "step": 58000
515
- },
516
- {
517
- "epoch": 3.36,
518
- "grad_norm": 1.162169098854065,
519
- "learning_rate": 0.00019534634527990013,
520
- "loss": 1.0006,
521
- "step": 59000
522
- },
523
- {
524
- "epoch": 3.41,
525
- "grad_norm": 1.2256358861923218,
526
- "learning_rate": 0.00018320763417230612,
527
- "loss": 0.995,
528
- "step": 60000
529
- },
530
- {
531
- "epoch": 3.41,
532
- "eval_accuracy": 0.743764,
533
- "eval_loss": 1.0119822025299072,
534
- "eval_runtime": 480.4029,
535
- "eval_samples_per_second": 520.396,
536
- "eval_steps_per_second": 2.034,
537
- "step": 60000
538
- },
539
- {
540
- "epoch": 3.47,
541
- "grad_norm": 1.2014998197555542,
542
- "learning_rate": 0.00017133356833730004,
543
- "loss": 0.9976,
544
- "step": 61000
545
- },
546
- {
547
- "epoch": 3.53,
548
- "grad_norm": 1.1588609218597412,
549
- "learning_rate": 0.00015975159982961664,
550
- "loss": 1.001,
551
- "step": 62000
552
- },
553
- {
554
- "epoch": 3.58,
555
- "grad_norm": 1.2534708976745605,
556
- "learning_rate": 0.00014847652339644127,
557
- "loss": 0.9967,
558
- "step": 63000
559
- },
560
- {
561
- "epoch": 3.64,
562
- "grad_norm": 1.2062395811080933,
563
- "learning_rate": 0.00013753353036528426,
564
- "loss": 0.9972,
565
- "step": 64000
566
- },
567
- {
568
- "epoch": 3.7,
569
- "grad_norm": 1.280290126800537,
570
- "learning_rate": 0.00012691469370009338,
571
- "loss": 0.9829,
572
- "step": 65000
573
- },
574
- {
575
- "epoch": 3.7,
576
- "eval_accuracy": 0.746796,
577
- "eval_loss": 0.9977088570594788,
578
- "eval_runtime": 476.9935,
579
- "eval_samples_per_second": 524.116,
580
- "eval_steps_per_second": 2.048,
581
- "step": 65000
582
- },
583
- {
584
- "epoch": 3.75,
585
- "grad_norm": 1.2085012197494507,
586
- "learning_rate": 0.00011665478615500634,
587
- "loss": 0.9863,
588
- "step": 66000
589
- },
590
- {
591
- "epoch": 3.81,
592
- "grad_norm": 1.7737034559249878,
593
- "learning_rate": 0.00010674637551684559,
594
- "loss": 0.9777,
595
- "step": 67000
596
- },
597
- {
598
- "epoch": 3.87,
599
- "grad_norm": 1.1058114767074585,
600
- "learning_rate": 9.722190815783432e-05,
601
- "loss": 0.9903,
602
- "step": 68000
603
- },
604
- {
605
- "epoch": 3.93,
606
- "grad_norm": 1.285079836845398,
607
- "learning_rate": 8.807448461308951e-05,
608
- "loss": 0.9814,
609
- "step": 69000
610
- },
611
- {
612
- "epoch": 3.98,
613
- "grad_norm": 1.4150310754776,
614
- "learning_rate": 7.933405930416787e-05,
615
- "loss": 0.9807,
616
- "step": 70000
617
- },
618
- {
619
- "epoch": 3.98,
620
- "eval_accuracy": 0.7487,
621
- "eval_loss": 0.9908215999603271,
622
- "eval_runtime": 478.9031,
623
- "eval_samples_per_second": 522.026,
624
- "eval_steps_per_second": 2.04,
625
- "step": 70000
626
- },
627
- {
628
- "epoch": 4.04,
629
- "grad_norm": 1.0839868783950806,
630
- "learning_rate": 7.100243228624242e-05,
631
- "loss": 0.963,
632
- "step": 71000
633
- },
634
- {
635
- "epoch": 4.1,
636
- "grad_norm": 1.2299129962921143,
637
- "learning_rate": 6.30825184828638e-05,
638
- "loss": 0.955,
639
- "step": 72000
640
- },
641
- {
642
- "epoch": 4.15,
643
- "grad_norm": 1.3780415058135986,
644
- "learning_rate": 5.5592981342836236e-05,
645
- "loss": 0.9508,
646
- "step": 73000
647
- },
648
- {
649
- "epoch": 4.21,
650
- "grad_norm": 1.1320440769195557,
651
- "learning_rate": 4.8543387962500266e-05,
652
- "loss": 0.9493,
653
- "step": 74000
654
- },
655
- {
656
- "epoch": 4.27,
657
- "grad_norm": 1.120684027671814,
658
- "learning_rate": 4.1949116996191016e-05,
659
- "loss": 0.9452,
660
- "step": 75000
661
- },
662
- {
663
- "epoch": 4.27,
664
- "eval_accuracy": 0.749816,
665
- "eval_loss": 0.9875096082687378,
666
- "eval_runtime": 477.8512,
667
- "eval_samples_per_second": 523.175,
668
- "eval_steps_per_second": 2.045,
669
- "step": 75000
670
- },
671
- {
672
- "epoch": 4.32,
673
- "grad_norm": 1.4629569053649902,
674
- "learning_rate": 3.5805391595057494e-05,
675
- "loss": 0.9484,
676
- "step": 76000
677
- },
678
- {
679
- "epoch": 4.38,
680
- "grad_norm": 1.2522917985916138,
681
- "learning_rate": 3.013233018257653e-05,
682
- "loss": 0.9567,
683
- "step": 77000
684
- },
685
- {
686
- "epoch": 4.44,
687
- "grad_norm": 1.331236720085144,
688
- "learning_rate": 2.492582322836503e-05,
689
- "loss": 0.9494,
690
- "step": 78000
691
- },
692
- {
693
- "epoch": 4.49,
694
- "grad_norm": 1.312769889831543,
695
- "learning_rate": 2.0202920112872213e-05,
696
- "loss": 0.9494,
697
- "step": 79000
698
- },
699
- {
700
- "epoch": 4.55,
701
- "grad_norm": 1.0436272621154785,
702
- "learning_rate": 1.5960199594472392e-05,
703
- "loss": 0.949,
704
- "step": 80000
705
- },
706
- {
707
- "epoch": 4.55,
708
- "eval_accuracy": 0.750728,
709
- "eval_loss": 0.9844790697097778,
710
- "eval_runtime": 480.1647,
711
- "eval_samples_per_second": 520.655,
712
- "eval_steps_per_second": 2.035,
713
- "step": 80000
714
- },
715
- {
716
- "epoch": 4.61,
717
- "grad_norm": 1.7521427869796753,
718
- "learning_rate": 1.221155501027127e-05,
719
- "loss": 0.9459,
720
- "step": 81000
721
- },
722
- {
723
- "epoch": 4.66,
724
- "grad_norm": 1.4406906366348267,
725
- "learning_rate": 8.954270865460369e-06,
726
- "loss": 0.9507,
727
- "step": 82000
728
- },
729
- {
730
- "epoch": 4.72,
731
- "grad_norm": 1.2117027044296265,
732
- "learning_rate": 6.196506529965529e-06,
733
- "loss": 0.9471,
734
- "step": 83000
735
- },
736
- {
737
- "epoch": 4.78,
738
- "grad_norm": 1.1171038150787354,
739
- "learning_rate": 3.943787197483806e-06,
740
- "loss": 0.951,
741
- "step": 84000
742
- },
743
- {
744
- "epoch": 4.84,
745
- "grad_norm": 1.388543963432312,
746
- "learning_rate": 2.194481012149785e-06,
747
- "loss": 0.9509,
748
- "step": 85000
749
- },
750
- {
751
- "epoch": 4.84,
752
- "eval_accuracy": 0.750944,
753
- "eval_loss": 0.984071671962738,
754
- "eval_runtime": 479.0227,
755
- "eval_samples_per_second": 521.896,
756
- "eval_steps_per_second": 2.04,
757
- "step": 85000
758
- },
759
- {
760
- "epoch": 4.89,
761
- "grad_norm": 1.2518051862716675,
762
- "learning_rate": 9.543163033286728e-07,
763
- "loss": 0.955,
764
- "step": 86000
765
- },
766
- {
767
- "epoch": 4.95,
768
- "grad_norm": 1.1738619804382324,
769
- "learning_rate": 2.2239470342309e-07,
770
- "loss": 0.9482,
771
- "step": 87000
772
- },
773
  {
774
  "epoch": 5.0,
775
- "step": 87895,
776
- "total_flos": 5.4597447576e+17,
777
- "train_loss": 1.1248133655402486,
778
- "train_runtime": 51144.106,
779
- "train_samples_per_second": 439.933,
780
- "train_steps_per_second": 1.719
781
  }
782
  ],
783
  "logging_steps": 1000,
784
- "max_steps": 87895,
785
  "num_input_tokens_seen": 0,
786
  "num_train_epochs": 5,
787
  "save_steps": 5000,
788
- "total_flos": 5.4597447576e+17,
789
  "train_batch_size": 256,
790
  "trial_name": null,
791
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 5000,
6
+ "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 5.0,
13
+ "step": 20,
14
+ "total_flos": 429498709311488.0,
15
+ "train_loss": 5.7403564453125,
16
+ "train_runtime": 6.8068,
17
+ "train_samples_per_second": 734.556,
18
+ "train_steps_per_second": 2.938
19
  }
20
  ],
21
  "logging_steps": 1000,
22
+ "max_steps": 20,
23
  "num_input_tokens_seen": 0,
24
  "num_train_epochs": 5,
25
  "save_steps": 5000,
26
+ "total_flos": 429498709311488.0,
27
  "train_batch_size": 256,
28
  "trial_name": null,
29
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d9a6f1a8233545c2d422a032a79a3101f63ca1427a7c955c87c0125bf8df20e
3
- size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94028c98d450c3cf82efda505f3a584702831ed0c33a23dd3ade11963108d031
3
+ size 4984