tejaskamtam committed on
Commit
08afa7f
1 Parent(s): 3d99889

End of training

Browse files
README.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: google/electra-base-generator
4
+ tags:
5
+ - generated_from_trainer
6
+ datasets:
7
+ - datasets/all_binary_and_xe_ey_fae_counterfactual
8
+ metrics:
9
+ - accuracy
10
+ model-index:
11
+ - name: electra-adapter-finetuned-xe_ey_fae
12
+ results:
13
+ - task:
14
+ name: Masked Language Modeling
15
+ type: fill-mask
16
+ dataset:
17
+ name: datasets/all_binary_and_xe_ey_fae_counterfactual
18
+ type: datasets/all_binary_and_xe_ey_fae_counterfactual
19
+ metrics:
20
+ - name: Accuracy
21
+ type: accuracy
22
+ value: 0.6258363412553052
23
+ ---
24
+
25
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
26
+ should probably proofread and complete it, then remove this comment. -->
27
+
28
+ # electra-adapter-finetuned-xe_ey_fae
29
+
30
+ This model is an adapter fine-tuned on top of [google/electra-base-generator](https://huggingface.co/google/electra-base-generator) for masked language modeling, using the `datasets/all_binary_and_xe_ey_fae_counterfactual` dataset.
31
+ It achieves the following results on the evaluation set:
32
+ - Loss: 2.0392
33
+ - Accuracy: 0.6258
34
+
35
+ ## Model description
36
+
37
+ More information needed
38
+
39
+ ## Intended uses & limitations
40
+
41
+ More information needed
42
+
43
+ ## Training and evaluation data
44
+
45
+ More information needed
46
+
47
+ ## Training procedure
48
+
49
+ ### Training hyperparameters
50
+
51
+ The following hyperparameters were used during training:
52
+ - learning_rate: 1e-05
53
+ - train_batch_size: 8
54
+ - eval_batch_size: 8
55
+ - seed: 100
56
+ - gradient_accumulation_steps: 2
57
+ - total_train_batch_size: 16
58
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
59
+ - lr_scheduler_type: linear
60
+ - num_epochs: 3.0 (configured; the logged run ends at epoch 2.64, step 20500 of 23274)
61
+ - mixed_precision_training: Native AMP
62
+
63
+ ### Training results
64
+
65
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
66
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|
67
+ | 3.9488 | 0.06 | 500 | 3.1500 | 0.5509 |
68
+ | 2.942 | 0.13 | 1000 | 2.5844 | 0.5680 |
69
+ | 2.6751 | 0.19 | 1500 | 2.4443 | 0.5790 |
70
+ | 2.582 | 0.26 | 2000 | 2.3701 | 0.5869 |
71
+ | 2.5267 | 0.32 | 2500 | 2.3097 | 0.5937 |
72
+ | 2.4722 | 0.39 | 3000 | 2.2695 | 0.5986 |
73
+ | 2.4289 | 0.45 | 3500 | 2.2329 | 0.6024 |
74
+ | 2.404 | 0.52 | 4000 | 2.2063 | 0.6055 |
75
+ | 2.3826 | 0.58 | 4500 | 2.1840 | 0.6087 |
76
+ | 2.3633 | 0.64 | 5000 | 2.1646 | 0.6109 |
77
+ | 2.3425 | 0.71 | 5500 | 2.1557 | 0.6121 |
78
+ | 2.333 | 0.77 | 6000 | 2.1350 | 0.6141 |
79
+ | 2.311 | 0.84 | 6500 | 2.1292 | 0.6152 |
80
+ | 2.3014 | 0.9 | 7000 | 2.1182 | 0.6166 |
81
+ | 2.2974 | 0.97 | 7500 | 2.1121 | 0.6170 |
82
+ | 2.2866 | 1.03 | 8000 | 2.1079 | 0.6173 |
83
+ | 2.2675 | 1.1 | 8500 | 2.0940 | 0.6192 |
84
+ | 2.2789 | 1.16 | 9000 | 2.0882 | 0.6201 |
85
+ | 2.2684 | 1.22 | 9500 | 2.0873 | 0.6200 |
86
+ | 2.2608 | 1.29 | 10000 | 2.0796 | 0.6209 |
87
+ | 2.2478 | 1.35 | 10500 | 2.0827 | 0.6204 |
88
+ | 2.2524 | 1.42 | 11000 | 2.0741 | 0.6215 |
89
+ | 2.2502 | 1.48 | 11500 | 2.0685 | 0.6220 |
90
+ | 2.243 | 1.55 | 12000 | 2.0665 | 0.6228 |
91
+ | 2.2417 | 1.61 | 12500 | 2.0632 | 0.6229 |
92
+ | 2.2398 | 1.68 | 13000 | 2.0593 | 0.6232 |
93
+ | 2.2233 | 1.74 | 13500 | 2.0600 | 0.6232 |
94
+ | 2.2277 | 1.8 | 14000 | 2.0535 | 0.6236 |
95
+ | 2.2344 | 1.87 | 14500 | 2.0485 | 0.6248 |
96
+ | 2.2274 | 1.93 | 15000 | 2.0507 | 0.6245 |
97
+ | 2.2212 | 2.0 | 15500 | 2.0428 | 0.6256 |
98
+ | 2.214 | 2.06 | 16000 | 2.0464 | 0.6244 |
99
+ | 2.2104 | 2.13 | 16500 | 2.0477 | 0.6250 |
100
+ | 2.2185 | 2.19 | 17000 | 2.0397 | 0.6257 |
101
+ | 2.2157 | 2.26 | 17500 | 2.0419 | 0.6257 |
102
+ | 2.2128 | 2.32 | 18000 | 2.0439 | 0.6255 |
103
+ | 2.2154 | 2.38 | 18500 | 2.0372 | 0.6259 |
104
+ | 2.2099 | 2.45 | 19000 | 2.0337 | 0.6263 |
105
+ | 2.2045 | 2.51 | 19500 | 2.0396 | 0.6259 |
106
+ | 2.2138 | 2.58 | 20000 | 2.0390 | 0.6262 |
107
+ | 2.2103 | 2.64 | 20500 | 2.0339 | 0.6263 |
108
+
109
+
110
+ ### Framework versions
111
+
112
+ - Transformers 4.36.2
113
+ - Pytorch 2.2.0+cu121
114
+ - Datasets 2.17.0
115
+ - Tokenizers 0.15.2
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.64,
3
+ "eval_accuracy": 0.6258363412553052,
4
+ "eval_loss": 2.0392136573791504,
5
+ "eval_runtime": 87.4708,
6
+ "eval_samples": 15525,
7
+ "eval_samples_per_second": 177.488,
8
+ "eval_steps_per_second": 22.19,
9
+ "perplexity": 7.684564122147852,
10
+ "train_loss": 2.351401915015244,
11
+ "train_runtime": 7059.8447,
12
+ "train_samples": 124124,
13
+ "train_samples_per_second": 52.745,
14
+ "train_steps_per_second": 3.297
15
+ }
datasets/all_binary_and_xe_ey_fae_counterfactual/adapter_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "adapter_residual_before_ln": false,
4
+ "cross_adapter": false,
5
+ "factorized_phm_W": true,
6
+ "factorized_phm_rule": false,
7
+ "hypercomplex_nonlinearity": "glorot-uniform",
8
+ "init_weights": "bert",
9
+ "inv_adapter": null,
10
+ "inv_adapter_reduction_factor": null,
11
+ "is_parallel": false,
12
+ "learn_phm": true,
13
+ "leave_out": [],
14
+ "ln_after": false,
15
+ "ln_before": false,
16
+ "mh_adapter": false,
17
+ "non_linearity": "relu",
18
+ "original_ln_after": true,
19
+ "original_ln_before": true,
20
+ "output_adapter": true,
21
+ "phm_bias": true,
22
+ "phm_c_init": "normal",
23
+ "phm_dim": 4,
24
+ "phm_init_range": 0.0001,
25
+ "phm_layer": false,
26
+ "phm_rank": 1,
27
+ "reduction_factor": 16,
28
+ "residual_before_ln": true,
29
+ "scaling": 1.0,
30
+ "shared_W_phm": false,
31
+ "shared_phm_rule": true,
32
+ "use_gating": false
33
+ },
34
+ "config_id": "9076f36a74755ac4",
35
+ "hidden_size": 256,
36
+ "model_class": "ElectraForMaskedLM",
37
+ "model_name": "google/electra-base-generator",
38
+ "model_type": "electra",
39
+ "name": "datasets/all_binary_and_xe_ey_fae_counterfactual",
40
+ "version": "0.1.2"
41
+ }
datasets/all_binary_and_xe_ey_fae_counterfactual/head_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": null,
3
+ "hidden_size": 256,
4
+ "label2id": {
5
+ "LABEL_0": 0,
6
+ "LABEL_1": 1
7
+ },
8
+ "model_class": "ElectraForMaskedLM",
9
+ "model_name": "google/electra-base-generator",
10
+ "model_type": "electra",
11
+ "name": null,
12
+ "num_labels": 2,
13
+ "version": "0.1.2"
14
+ }
datasets/all_binary_and_xe_ey_fae_counterfactual/pytorch_adapter.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48344746d047a38e3f7159048f13f2271dd6ce2f8ee3e79f55ed1043d9b44f21
3
+ size 425830
datasets/all_binary_and_xe_ey_fae_counterfactual/pytorch_model_head.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c0648fbb6aa6e8b4fd9494a9da8a1ff89ca22bc8ff2ead6f16d707f696a993c
3
+ size 94684086
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.64,
3
+ "eval_accuracy": 0.6258363412553052,
4
+ "eval_loss": 2.0392136573791504,
5
+ "eval_runtime": 87.4708,
6
+ "eval_samples": 15525,
7
+ "eval_samples_per_second": 177.488,
8
+ "eval_steps_per_second": 22.19,
9
+ "perplexity": 7.684564122147852
10
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.64,
3
+ "train_loss": 2.351401915015244,
4
+ "train_runtime": 7059.8447,
5
+ "train_samples": 124124,
6
+ "train_samples_per_second": 52.745,
7
+ "train_steps_per_second": 3.297
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,645 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.0336806774139404,
3
+ "best_model_checkpoint": "finetuning/output/electra-adapter-finetuned_xe_ey_fae/checkpoint-19000",
4
+ "epoch": 2.642433616911575,
5
+ "eval_steps": 500,
6
+ "global_step": 20500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "learning_rate": 9.785167998625076e-06,
14
+ "loss": 3.9488,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.06,
19
+ "eval_accuracy": 0.5508950432882589,
20
+ "eval_loss": 3.1499977111816406,
21
+ "eval_runtime": 85.1217,
22
+ "eval_samples_per_second": 182.386,
23
+ "eval_steps_per_second": 22.803,
24
+ "step": 500
25
+ },
26
+ {
27
+ "epoch": 0.13,
28
+ "learning_rate": 9.57033599725015e-06,
29
+ "loss": 2.942,
30
+ "step": 1000
31
+ },
32
+ {
33
+ "epoch": 0.13,
34
+ "eval_accuracy": 0.5680209177510359,
35
+ "eval_loss": 2.584392547607422,
36
+ "eval_runtime": 79.4716,
37
+ "eval_samples_per_second": 195.353,
38
+ "eval_steps_per_second": 24.424,
39
+ "step": 1000
40
+ },
41
+ {
42
+ "epoch": 0.19,
43
+ "learning_rate": 9.355503995875225e-06,
44
+ "loss": 2.6751,
45
+ "step": 1500
46
+ },
47
+ {
48
+ "epoch": 0.19,
49
+ "eval_accuracy": 0.578970986434046,
50
+ "eval_loss": 2.444335699081421,
51
+ "eval_runtime": 87.5675,
52
+ "eval_samples_per_second": 177.292,
53
+ "eval_steps_per_second": 22.166,
54
+ "step": 1500
55
+ },
56
+ {
57
+ "epoch": 0.26,
58
+ "learning_rate": 9.140671994500302e-06,
59
+ "loss": 2.582,
60
+ "step": 2000
61
+ },
62
+ {
63
+ "epoch": 0.26,
64
+ "eval_accuracy": 0.5868782143731802,
65
+ "eval_loss": 2.3700673580169678,
66
+ "eval_runtime": 83.7436,
67
+ "eval_samples_per_second": 185.387,
68
+ "eval_steps_per_second": 23.178,
69
+ "step": 2000
70
+ },
71
+ {
72
+ "epoch": 0.32,
73
+ "learning_rate": 8.926269657128126e-06,
74
+ "loss": 2.5267,
75
+ "step": 2500
76
+ },
77
+ {
78
+ "epoch": 0.32,
79
+ "eval_accuracy": 0.5937291646823507,
80
+ "eval_loss": 2.309689998626709,
81
+ "eval_runtime": 81.3517,
82
+ "eval_samples_per_second": 190.838,
83
+ "eval_steps_per_second": 23.859,
84
+ "step": 2500
85
+ },
86
+ {
87
+ "epoch": 0.39,
88
+ "learning_rate": 8.711437655753203e-06,
89
+ "loss": 2.4722,
90
+ "step": 3000
91
+ },
92
+ {
93
+ "epoch": 0.39,
94
+ "eval_accuracy": 0.5985969269659629,
95
+ "eval_loss": 2.2695114612579346,
96
+ "eval_runtime": 87.4381,
97
+ "eval_samples_per_second": 177.554,
98
+ "eval_steps_per_second": 22.199,
99
+ "step": 3000
100
+ },
101
+ {
102
+ "epoch": 0.45,
103
+ "learning_rate": 8.497035318381027e-06,
104
+ "loss": 2.4289,
105
+ "step": 3500
106
+ },
107
+ {
108
+ "epoch": 0.45,
109
+ "eval_accuracy": 0.602404170197503,
110
+ "eval_loss": 2.2328779697418213,
111
+ "eval_runtime": 83.7759,
112
+ "eval_samples_per_second": 185.316,
113
+ "eval_steps_per_second": 23.169,
114
+ "step": 3500
115
+ },
116
+ {
117
+ "epoch": 0.52,
118
+ "learning_rate": 8.282203317006102e-06,
119
+ "loss": 2.404,
120
+ "step": 4000
121
+ },
122
+ {
123
+ "epoch": 0.52,
124
+ "eval_accuracy": 0.6055254061674608,
125
+ "eval_loss": 2.206317901611328,
126
+ "eval_runtime": 87.3965,
127
+ "eval_samples_per_second": 177.639,
128
+ "eval_steps_per_second": 22.209,
129
+ "step": 4000
130
+ },
131
+ {
132
+ "epoch": 0.58,
133
+ "learning_rate": 8.067371315631177e-06,
134
+ "loss": 2.3826,
135
+ "step": 4500
136
+ },
137
+ {
138
+ "epoch": 0.58,
139
+ "eval_accuracy": 0.6086694296803393,
140
+ "eval_loss": 2.183983087539673,
141
+ "eval_runtime": 87.0314,
142
+ "eval_samples_per_second": 178.384,
143
+ "eval_steps_per_second": 22.302,
144
+ "step": 4500
145
+ },
146
+ {
147
+ "epoch": 0.64,
148
+ "learning_rate": 7.852539314256252e-06,
149
+ "loss": 2.3633,
150
+ "step": 5000
151
+ },
152
+ {
153
+ "epoch": 0.64,
154
+ "eval_accuracy": 0.6108753723178051,
155
+ "eval_loss": 2.1645586490631104,
156
+ "eval_runtime": 83.9383,
157
+ "eval_samples_per_second": 184.957,
158
+ "eval_steps_per_second": 23.124,
159
+ "step": 5000
160
+ },
161
+ {
162
+ "epoch": 0.71,
163
+ "learning_rate": 7.637707312881327e-06,
164
+ "loss": 2.3425,
165
+ "step": 5500
166
+ },
167
+ {
168
+ "epoch": 0.71,
169
+ "eval_accuracy": 0.6121162378522405,
170
+ "eval_loss": 2.155695676803589,
171
+ "eval_runtime": 87.4417,
172
+ "eval_samples_per_second": 177.547,
173
+ "eval_steps_per_second": 22.198,
174
+ "step": 5500
175
+ },
176
+ {
177
+ "epoch": 0.77,
178
+ "learning_rate": 7.4228753115064025e-06,
179
+ "loss": 2.333,
180
+ "step": 6000
181
+ },
182
+ {
183
+ "epoch": 0.77,
184
+ "eval_accuracy": 0.6140775893820937,
185
+ "eval_loss": 2.1349785327911377,
186
+ "eval_runtime": 85.1022,
187
+ "eval_samples_per_second": 182.428,
188
+ "eval_steps_per_second": 22.808,
189
+ "step": 6000
190
+ },
191
+ {
192
+ "epoch": 0.84,
193
+ "learning_rate": 7.208472974134228e-06,
194
+ "loss": 2.311,
195
+ "step": 6500
196
+ },
197
+ {
198
+ "epoch": 0.84,
199
+ "eval_accuracy": 0.6151508455851109,
200
+ "eval_loss": 2.1292011737823486,
201
+ "eval_runtime": 79.4597,
202
+ "eval_samples_per_second": 195.382,
203
+ "eval_steps_per_second": 24.427,
204
+ "step": 6500
205
+ },
206
+ {
207
+ "epoch": 0.9,
208
+ "learning_rate": 6.993640972759303e-06,
209
+ "loss": 2.3014,
210
+ "step": 7000
211
+ },
212
+ {
213
+ "epoch": 0.9,
214
+ "eval_accuracy": 0.6166432908599604,
215
+ "eval_loss": 2.1181797981262207,
216
+ "eval_runtime": 87.6275,
217
+ "eval_samples_per_second": 177.17,
218
+ "eval_steps_per_second": 22.151,
219
+ "step": 7000
220
+ },
221
+ {
222
+ "epoch": 0.97,
223
+ "learning_rate": 6.7788089713843775e-06,
224
+ "loss": 2.2974,
225
+ "step": 7500
226
+ },
227
+ {
228
+ "epoch": 0.97,
229
+ "eval_accuracy": 0.6169897785349233,
230
+ "eval_loss": 2.112070083618164,
231
+ "eval_runtime": 83.9336,
232
+ "eval_samples_per_second": 184.968,
233
+ "eval_steps_per_second": 23.125,
234
+ "step": 7500
235
+ },
236
+ {
237
+ "epoch": 1.03,
238
+ "learning_rate": 6.563976970009453e-06,
239
+ "loss": 2.2866,
240
+ "step": 8000
241
+ },
242
+ {
243
+ "epoch": 1.03,
244
+ "eval_accuracy": 0.6173022781800038,
245
+ "eval_loss": 2.107919454574585,
246
+ "eval_runtime": 82.2636,
247
+ "eval_samples_per_second": 188.723,
248
+ "eval_steps_per_second": 23.595,
249
+ "step": 8000
250
+ },
251
+ {
252
+ "epoch": 1.1,
253
+ "learning_rate": 6.349574632637278e-06,
254
+ "loss": 2.2675,
255
+ "step": 8500
256
+ },
257
+ {
258
+ "epoch": 1.1,
259
+ "eval_accuracy": 0.6191927234863566,
260
+ "eval_loss": 2.0939817428588867,
261
+ "eval_runtime": 87.5998,
262
+ "eval_samples_per_second": 177.226,
263
+ "eval_steps_per_second": 22.158,
264
+ "step": 8500
265
+ },
266
+ {
267
+ "epoch": 1.16,
268
+ "learning_rate": 6.134742631262354e-06,
269
+ "loss": 2.2789,
270
+ "step": 9000
271
+ },
272
+ {
273
+ "epoch": 1.16,
274
+ "eval_accuracy": 0.6201220093575694,
275
+ "eval_loss": 2.088168144226074,
276
+ "eval_runtime": 83.772,
277
+ "eval_samples_per_second": 185.324,
278
+ "eval_steps_per_second": 23.17,
279
+ "step": 9000
280
+ },
281
+ {
282
+ "epoch": 1.22,
283
+ "learning_rate": 5.919910629887429e-06,
284
+ "loss": 2.2684,
285
+ "step": 9500
286
+ },
287
+ {
288
+ "epoch": 1.22,
289
+ "eval_accuracy": 0.6199849943877651,
290
+ "eval_loss": 2.0872652530670166,
291
+ "eval_runtime": 87.4418,
292
+ "eval_samples_per_second": 177.547,
293
+ "eval_steps_per_second": 22.198,
294
+ "step": 9500
295
+ },
296
+ {
297
+ "epoch": 1.29,
298
+ "learning_rate": 5.705078628512504e-06,
299
+ "loss": 2.2608,
300
+ "step": 10000
301
+ },
302
+ {
303
+ "epoch": 1.29,
304
+ "eval_accuracy": 0.6208952330586832,
305
+ "eval_loss": 2.0795998573303223,
306
+ "eval_runtime": 86.9343,
307
+ "eval_samples_per_second": 178.583,
308
+ "eval_steps_per_second": 22.327,
309
+ "step": 10000
310
+ },
311
+ {
312
+ "epoch": 1.35,
313
+ "learning_rate": 5.490246627137579e-06,
314
+ "loss": 2.2478,
315
+ "step": 10500
316
+ },
317
+ {
318
+ "epoch": 1.35,
319
+ "eval_accuracy": 0.620409766315376,
320
+ "eval_loss": 2.082674503326416,
321
+ "eval_runtime": 84.0547,
322
+ "eval_samples_per_second": 184.701,
323
+ "eval_steps_per_second": 23.092,
324
+ "step": 10500
325
+ },
326
+ {
327
+ "epoch": 1.42,
328
+ "learning_rate": 5.275844289765404e-06,
329
+ "loss": 2.2524,
330
+ "step": 11000
331
+ },
332
+ {
333
+ "epoch": 1.42,
334
+ "eval_accuracy": 0.6214935816878795,
335
+ "eval_loss": 2.074056386947632,
336
+ "eval_runtime": 87.5237,
337
+ "eval_samples_per_second": 177.381,
338
+ "eval_steps_per_second": 22.177,
339
+ "step": 11000
340
+ },
341
+ {
342
+ "epoch": 1.48,
343
+ "learning_rate": 5.061012288390479e-06,
344
+ "loss": 2.2502,
345
+ "step": 11500
346
+ },
347
+ {
348
+ "epoch": 1.48,
349
+ "eval_accuracy": 0.6220323169678965,
350
+ "eval_loss": 2.068490505218506,
351
+ "eval_runtime": 84.958,
352
+ "eval_samples_per_second": 182.737,
353
+ "eval_steps_per_second": 22.847,
354
+ "step": 11500
355
+ },
356
+ {
357
+ "epoch": 1.55,
358
+ "learning_rate": 4.8461802870155545e-06,
359
+ "loss": 2.243,
360
+ "step": 12000
361
+ },
362
+ {
363
+ "epoch": 1.55,
364
+ "eval_accuracy": 0.622761702720804,
365
+ "eval_loss": 2.0664761066436768,
366
+ "eval_runtime": 79.0021,
367
+ "eval_samples_per_second": 196.514,
368
+ "eval_steps_per_second": 24.569,
369
+ "step": 12000
370
+ },
371
+ {
372
+ "epoch": 1.61,
373
+ "learning_rate": 4.631348285640629e-06,
374
+ "loss": 2.2417,
375
+ "step": 12500
376
+ },
377
+ {
378
+ "epoch": 1.61,
379
+ "eval_accuracy": 0.6228723852166125,
380
+ "eval_loss": 2.0631983280181885,
381
+ "eval_runtime": 87.1566,
382
+ "eval_samples_per_second": 178.128,
383
+ "eval_steps_per_second": 22.27,
384
+ "step": 12500
385
+ },
386
+ {
387
+ "epoch": 1.68,
388
+ "learning_rate": 4.416516284265704e-06,
389
+ "loss": 2.2398,
390
+ "step": 13000
391
+ },
392
+ {
393
+ "epoch": 1.68,
394
+ "eval_accuracy": 0.6232123058100858,
395
+ "eval_loss": 2.0592522621154785,
396
+ "eval_runtime": 83.668,
397
+ "eval_samples_per_second": 185.555,
398
+ "eval_steps_per_second": 23.199,
399
+ "step": 13000
400
+ },
401
+ {
402
+ "epoch": 1.74,
403
+ "learning_rate": 4.20168428289078e-06,
404
+ "loss": 2.2233,
405
+ "step": 13500
406
+ },
407
+ {
408
+ "epoch": 1.74,
409
+ "eval_accuracy": 0.6232258668129607,
410
+ "eval_loss": 2.060002326965332,
411
+ "eval_runtime": 80.0466,
412
+ "eval_samples_per_second": 193.95,
413
+ "eval_steps_per_second": 24.248,
414
+ "step": 13500
415
+ },
416
+ {
417
+ "epoch": 1.8,
418
+ "learning_rate": 3.987281945518604e-06,
419
+ "loss": 2.2277,
420
+ "step": 14000
421
+ },
422
+ {
423
+ "epoch": 1.8,
424
+ "eval_accuracy": 0.623606800420627,
425
+ "eval_loss": 2.0534963607788086,
426
+ "eval_runtime": 87.4565,
427
+ "eval_samples_per_second": 177.517,
428
+ "eval_steps_per_second": 22.194,
429
+ "step": 14000
430
+ },
431
+ {
432
+ "epoch": 1.87,
433
+ "learning_rate": 3.77244994414368e-06,
434
+ "loss": 2.2344,
435
+ "step": 14500
436
+ },
437
+ {
438
+ "epoch": 1.87,
439
+ "eval_accuracy": 0.6247527084114421,
440
+ "eval_loss": 2.0484962463378906,
441
+ "eval_runtime": 83.8183,
442
+ "eval_samples_per_second": 185.222,
443
+ "eval_steps_per_second": 23.157,
444
+ "step": 14500
445
+ },
446
+ {
447
+ "epoch": 1.93,
448
+ "learning_rate": 3.5576179427687554e-06,
449
+ "loss": 2.2274,
450
+ "step": 15000
451
+ },
452
+ {
453
+ "epoch": 1.93,
454
+ "eval_accuracy": 0.6244717527399175,
455
+ "eval_loss": 2.050738573074341,
456
+ "eval_runtime": 87.5865,
457
+ "eval_samples_per_second": 177.253,
458
+ "eval_steps_per_second": 22.161,
459
+ "step": 15000
460
+ },
461
+ {
462
+ "epoch": 2.0,
463
+ "learning_rate": 3.34321560539658e-06,
464
+ "loss": 2.2212,
465
+ "step": 15500
466
+ },
467
+ {
468
+ "epoch": 2.0,
469
+ "eval_accuracy": 0.6256074101917349,
470
+ "eval_loss": 2.0428130626678467,
471
+ "eval_runtime": 86.8032,
472
+ "eval_samples_per_second": 178.853,
473
+ "eval_steps_per_second": 22.361,
474
+ "step": 15500
475
+ },
476
+ {
477
+ "epoch": 2.06,
478
+ "learning_rate": 3.1283836040216555e-06,
479
+ "loss": 2.214,
480
+ "step": 16000
481
+ },
482
+ {
483
+ "epoch": 2.06,
484
+ "eval_accuracy": 0.6244417876710062,
485
+ "eval_loss": 2.0463979244232178,
486
+ "eval_runtime": 84.1399,
487
+ "eval_samples_per_second": 184.514,
488
+ "eval_steps_per_second": 23.069,
489
+ "step": 16000
490
+ },
491
+ {
492
+ "epoch": 2.13,
493
+ "learning_rate": 2.9135516026467303e-06,
494
+ "loss": 2.2104,
495
+ "step": 16500
496
+ },
497
+ {
498
+ "epoch": 2.13,
499
+ "eval_accuracy": 0.6249873550076295,
500
+ "eval_loss": 2.0476861000061035,
501
+ "eval_runtime": 87.5417,
502
+ "eval_samples_per_second": 177.344,
503
+ "eval_steps_per_second": 22.172,
504
+ "step": 16500
505
+ },
506
+ {
507
+ "epoch": 2.19,
508
+ "learning_rate": 2.698719601271806e-06,
509
+ "loss": 2.2185,
510
+ "step": 17000
511
+ },
512
+ {
513
+ "epoch": 2.19,
514
+ "eval_accuracy": 0.6257313721221357,
515
+ "eval_loss": 2.039674758911133,
516
+ "eval_runtime": 84.986,
517
+ "eval_samples_per_second": 182.677,
518
+ "eval_steps_per_second": 22.839,
519
+ "step": 17000
520
+ },
521
+ {
522
+ "epoch": 2.26,
523
+ "learning_rate": 2.483887599896881e-06,
524
+ "loss": 2.2157,
525
+ "step": 17500
526
+ },
527
+ {
528
+ "epoch": 2.26,
529
+ "eval_accuracy": 0.6257406865679764,
530
+ "eval_loss": 2.041879177093506,
531
+ "eval_runtime": 79.7413,
532
+ "eval_samples_per_second": 194.692,
533
+ "eval_steps_per_second": 24.341,
534
+ "step": 17500
535
+ },
536
+ {
537
+ "epoch": 2.32,
538
+ "learning_rate": 2.2690555985219558e-06,
539
+ "loss": 2.2128,
540
+ "step": 18000
541
+ },
542
+ {
543
+ "epoch": 2.32,
544
+ "eval_accuracy": 0.6254893845927666,
545
+ "eval_loss": 2.043928623199463,
546
+ "eval_runtime": 87.45,
547
+ "eval_samples_per_second": 177.53,
548
+ "eval_steps_per_second": 22.196,
549
+ "step": 18000
550
+ },
551
+ {
552
+ "epoch": 2.38,
553
+ "learning_rate": 2.054223597147031e-06,
554
+ "loss": 2.2154,
555
+ "step": 18500
556
+ },
557
+ {
558
+ "epoch": 2.38,
559
+ "eval_accuracy": 0.6259225237275015,
560
+ "eval_loss": 2.037231683731079,
561
+ "eval_runtime": 83.6819,
562
+ "eval_samples_per_second": 185.524,
563
+ "eval_steps_per_second": 23.195,
564
+ "step": 18500
565
+ },
566
+ {
567
+ "epoch": 2.45,
568
+ "learning_rate": 1.8393915957721066e-06,
569
+ "loss": 2.2099,
570
+ "step": 19000
571
+ },
572
+ {
573
+ "epoch": 2.45,
574
+ "eval_accuracy": 0.62631184758297,
575
+ "eval_loss": 2.0336806774139404,
576
+ "eval_runtime": 81.3506,
577
+ "eval_samples_per_second": 190.841,
578
+ "eval_steps_per_second": 23.86,
579
+ "step": 19000
580
+ },
581
+ {
582
+ "epoch": 2.51,
583
+ "learning_rate": 1.6245595943971814e-06,
584
+ "loss": 2.2045,
585
+ "step": 19500
586
+ },
587
+ {
588
+ "epoch": 2.51,
589
+ "eval_accuracy": 0.6258799592390727,
590
+ "eval_loss": 2.039562225341797,
591
+ "eval_runtime": 87.4501,
592
+ "eval_samples_per_second": 177.53,
593
+ "eval_steps_per_second": 22.196,
594
+ "step": 19500
595
+ },
596
+ {
597
+ "epoch": 2.58,
598
+ "learning_rate": 1.4097275930222567e-06,
599
+ "loss": 2.2138,
600
+ "step": 20000
601
+ },
602
+ {
603
+ "epoch": 2.58,
604
+ "eval_accuracy": 0.6261649440028011,
605
+ "eval_loss": 2.0390186309814453,
606
+ "eval_runtime": 83.8434,
607
+ "eval_samples_per_second": 185.167,
608
+ "eval_steps_per_second": 23.15,
609
+ "step": 20000
610
+ },
611
+ {
612
+ "epoch": 2.64,
613
+ "learning_rate": 1.194895591647332e-06,
614
+ "loss": 2.2103,
615
+ "step": 20500
616
+ },
617
+ {
618
+ "epoch": 2.64,
619
+ "eval_accuracy": 0.6262993215315168,
620
+ "eval_loss": 2.03385329246521,
621
+ "eval_runtime": 87.3376,
622
+ "eval_samples_per_second": 177.759,
623
+ "eval_steps_per_second": 22.224,
624
+ "step": 20500
625
+ },
626
+ {
627
+ "epoch": 2.64,
628
+ "step": 20500,
629
+ "total_flos": 1.0082485751267328e+16,
630
+ "train_loss": 2.351401915015244,
631
+ "train_runtime": 7059.8447,
632
+ "train_samples_per_second": 52.745,
633
+ "train_steps_per_second": 3.297
634
+ }
635
+ ],
636
+ "logging_steps": 500,
637
+ "max_steps": 23274,
638
+ "num_input_tokens_seen": 0,
639
+ "num_train_epochs": 3,
640
+ "save_steps": 500,
641
+ "total_flos": 1.0082485751267328e+16,
642
+ "train_batch_size": 8,
643
+ "trial_name": null,
644
+ "trial_params": null
645
+ }