tejaskamtam committed on
Commit
52d9354
1 Parent(s): 59081f4

End of training

Browse files
Files changed (5) hide show
  1. README.md +16 -5
  2. all_results.json +15 -0
  3. eval_results.json +10 -0
  4. train_results.json +8 -0
  5. trainer_state.json +780 -0
README.md CHANGED
@@ -1,13 +1,24 @@
1
  ---
2
- license: apache-2.0
3
  base_model: facebook/bart-base
4
  tags:
5
  - generated_from_trainer
 
 
6
  metrics:
7
  - accuracy
8
  model-index:
9
  - name: bart-base-finetuned-xe_ey_fae
10
- results: []
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -15,10 +26,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # bart-base-finetuned-xe_ey_fae
17
 
18
- This model is a fine-tuned version of [facebook/bart-base](https://huggingface.co/facebook/bart-base) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 1.3910
21
- - Accuracy: 0.7182
22
 
23
  ## Model description
24
 
 
1
  ---
 
2
  base_model: facebook/bart-base
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - datasets/all_binary_and_xe_ey_fae_counterfactual
7
  metrics:
8
  - accuracy
9
  model-index:
10
  - name: bart-base-finetuned-xe_ey_fae
11
+ results:
12
+ - task:
13
+ name: Masked Language Modeling
14
+ type: fill-mask
15
+ dataset:
16
+ name: datasets/all_binary_and_xe_ey_fae_counterfactual
17
+ type: datasets/all_binary_and_xe_ey_fae_counterfactual
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.7180178883360112
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # bart-base-finetuned-xe_ey_fae
28
 
29
+ This model is a fine-tuned version of [facebook/bart-base](https://huggingface.co/facebook/bart-base) on the datasets/all_binary_and_xe_ey_fae_counterfactual dataset.
30
  It achieves the following results on the evaluation set:
31
+ - Loss: 1.3945
32
+ - Accuracy: 0.7180
33
 
34
  ## Model description
35
 
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.7180178883360112,
4
+ "eval_loss": 1.3944889307022095,
5
+ "eval_runtime": 95.1252,
6
+ "eval_samples": 16928,
7
+ "eval_samples_per_second": 177.955,
8
+ "eval_steps_per_second": 22.244,
9
+ "perplexity": 4.032912947872197,
10
+ "train_loss": 2.057705193860182,
11
+ "train_runtime": 15314.6638,
12
+ "train_samples": 135339,
13
+ "train_samples_per_second": 26.512,
14
+ "train_steps_per_second": 1.657
15
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.7180178883360112,
4
+ "eval_loss": 1.3944889307022095,
5
+ "eval_runtime": 95.1252,
6
+ "eval_samples": 16928,
7
+ "eval_samples_per_second": 177.955,
8
+ "eval_steps_per_second": 22.244,
9
+ "perplexity": 4.032912947872197
10
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 2.057705193860182,
4
+ "train_runtime": 15314.6638,
5
+ "train_samples": 135339,
6
+ "train_samples_per_second": 26.512,
7
+ "train_steps_per_second": 1.657
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.3909834623336792,
3
+ "best_model_checkpoint": "finetuning/output/bart-base-finetuned_xe_ey_fae/checkpoint-25000",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 25377,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "learning_rate": 9.804547424833511e-06,
14
+ "loss": 5.4226,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.06,
19
+ "eval_accuracy": 0.3627901941481408,
20
+ "eval_loss": 3.8137550354003906,
21
+ "eval_runtime": 98.6024,
22
+ "eval_samples_per_second": 171.679,
23
+ "eval_steps_per_second": 21.46,
24
+ "step": 500
25
+ },
26
+ {
27
+ "epoch": 0.12,
28
+ "learning_rate": 9.607518619222132e-06,
29
+ "loss": 4.0408,
30
+ "step": 1000
31
+ },
32
+ {
33
+ "epoch": 0.12,
34
+ "eval_accuracy": 0.46300121473546585,
35
+ "eval_loss": 3.057621717453003,
36
+ "eval_runtime": 99.414,
37
+ "eval_samples_per_second": 170.278,
38
+ "eval_steps_per_second": 21.285,
39
+ "step": 1000
40
+ },
41
+ {
42
+ "epoch": 0.18,
43
+ "learning_rate": 9.41048981361075e-06,
44
+ "loss": 3.4979,
45
+ "step": 1500
46
+ },
47
+ {
48
+ "epoch": 0.18,
49
+ "eval_accuracy": 0.5132904448434071,
50
+ "eval_loss": 2.70158314704895,
51
+ "eval_runtime": 99.9098,
52
+ "eval_samples_per_second": 169.433,
53
+ "eval_steps_per_second": 21.179,
54
+ "step": 1500
55
+ },
56
+ {
57
+ "epoch": 0.24,
58
+ "learning_rate": 9.21346100799937e-06,
59
+ "loss": 3.1691,
60
+ "step": 2000
61
+ },
62
+ {
63
+ "epoch": 0.24,
64
+ "eval_accuracy": 0.5430825323065444,
65
+ "eval_loss": 2.4879872798919678,
66
+ "eval_runtime": 99.733,
67
+ "eval_samples_per_second": 169.733,
68
+ "eval_steps_per_second": 21.217,
69
+ "step": 2000
70
+ },
71
+ {
72
+ "epoch": 0.3,
73
+ "learning_rate": 9.01643220238799e-06,
74
+ "loss": 2.9564,
75
+ "step": 2500
76
+ },
77
+ {
78
+ "epoch": 0.3,
79
+ "eval_accuracy": 0.5644360825553116,
80
+ "eval_loss": 2.330946445465088,
81
+ "eval_runtime": 100.0932,
82
+ "eval_samples_per_second": 169.122,
83
+ "eval_steps_per_second": 21.14,
84
+ "step": 2500
85
+ },
86
+ {
87
+ "epoch": 0.35,
88
+ "learning_rate": 8.819797454387831e-06,
89
+ "loss": 2.8078,
90
+ "step": 3000
91
+ },
92
+ {
93
+ "epoch": 0.35,
94
+ "eval_accuracy": 0.5792018144043999,
95
+ "eval_loss": 2.232025384902954,
96
+ "eval_runtime": 100.0923,
97
+ "eval_samples_per_second": 169.124,
98
+ "eval_steps_per_second": 21.14,
99
+ "step": 3000
100
+ },
101
+ {
102
+ "epoch": 0.41,
103
+ "learning_rate": 8.622768648776452e-06,
104
+ "loss": 2.6741,
105
+ "step": 3500
106
+ },
107
+ {
108
+ "epoch": 0.41,
109
+ "eval_accuracy": 0.592379386392151,
110
+ "eval_loss": 2.1506171226501465,
111
+ "eval_runtime": 99.9507,
112
+ "eval_samples_per_second": 169.364,
113
+ "eval_steps_per_second": 21.17,
114
+ "step": 3500
115
+ },
116
+ {
117
+ "epoch": 0.47,
118
+ "learning_rate": 8.425739843165071e-06,
119
+ "loss": 2.5323,
120
+ "step": 4000
121
+ },
122
+ {
123
+ "epoch": 0.47,
124
+ "eval_accuracy": 0.617633758132823,
125
+ "eval_loss": 1.9845681190490723,
126
+ "eval_runtime": 100.0279,
127
+ "eval_samples_per_second": 169.233,
128
+ "eval_steps_per_second": 21.154,
129
+ "step": 4000
130
+ },
131
+ {
132
+ "epoch": 0.53,
133
+ "learning_rate": 8.22871103755369e-06,
134
+ "loss": 2.3678,
135
+ "step": 4500
136
+ },
137
+ {
138
+ "epoch": 0.53,
139
+ "eval_accuracy": 0.6374534268418744,
140
+ "eval_loss": 1.8812607526779175,
141
+ "eval_runtime": 100.101,
142
+ "eval_samples_per_second": 169.109,
143
+ "eval_steps_per_second": 21.139,
144
+ "step": 4500
145
+ },
146
+ {
147
+ "epoch": 0.59,
148
+ "learning_rate": 8.03168223194231e-06,
149
+ "loss": 2.25,
150
+ "step": 5000
151
+ },
152
+ {
153
+ "epoch": 0.59,
154
+ "eval_accuracy": 0.6496838449438552,
155
+ "eval_loss": 1.809983253479004,
156
+ "eval_runtime": 100.2479,
157
+ "eval_samples_per_second": 168.861,
158
+ "eval_steps_per_second": 21.108,
159
+ "step": 5000
160
+ },
161
+ {
162
+ "epoch": 0.65,
163
+ "learning_rate": 7.83465342633093e-06,
164
+ "loss": 2.1795,
165
+ "step": 5500
166
+ },
167
+ {
168
+ "epoch": 0.65,
169
+ "eval_accuracy": 0.6579494225370981,
170
+ "eval_loss": 1.7632389068603516,
171
+ "eval_runtime": 100.0951,
172
+ "eval_samples_per_second": 169.119,
173
+ "eval_steps_per_second": 21.14,
174
+ "step": 5500
175
+ },
176
+ {
177
+ "epoch": 0.71,
178
+ "learning_rate": 7.63762462071955e-06,
179
+ "loss": 2.1203,
180
+ "step": 6000
181
+ },
182
+ {
183
+ "epoch": 0.71,
184
+ "eval_accuracy": 0.664559097259069,
185
+ "eval_loss": 1.7238309383392334,
186
+ "eval_runtime": 99.9087,
187
+ "eval_samples_per_second": 169.435,
188
+ "eval_steps_per_second": 21.179,
189
+ "step": 6000
190
+ },
191
+ {
192
+ "epoch": 0.77,
193
+ "learning_rate": 7.440595815108169e-06,
194
+ "loss": 2.0764,
195
+ "step": 6500
196
+ },
197
+ {
198
+ "epoch": 0.77,
199
+ "eval_accuracy": 0.6713205569113848,
200
+ "eval_loss": 1.6855953931808472,
201
+ "eval_runtime": 100.047,
202
+ "eval_samples_per_second": 169.201,
203
+ "eval_steps_per_second": 21.15,
204
+ "step": 6500
205
+ },
206
+ {
207
+ "epoch": 0.83,
208
+ "learning_rate": 7.2435670094967895e-06,
209
+ "loss": 2.026,
210
+ "step": 7000
211
+ },
212
+ {
213
+ "epoch": 0.83,
214
+ "eval_accuracy": 0.6759595736369565,
215
+ "eval_loss": 1.6568557024002075,
216
+ "eval_runtime": 99.903,
217
+ "eval_samples_per_second": 169.444,
218
+ "eval_steps_per_second": 21.181,
219
+ "step": 7000
220
+ },
221
+ {
222
+ "epoch": 0.89,
223
+ "learning_rate": 7.046932261496632e-06,
224
+ "loss": 1.9942,
225
+ "step": 7500
226
+ },
227
+ {
228
+ "epoch": 0.89,
229
+ "eval_accuracy": 0.6803347736385223,
230
+ "eval_loss": 1.6309233903884888,
231
+ "eval_runtime": 100.1047,
232
+ "eval_samples_per_second": 169.103,
233
+ "eval_steps_per_second": 21.138,
234
+ "step": 7500
235
+ },
236
+ {
237
+ "epoch": 0.95,
238
+ "learning_rate": 6.849903455885251e-06,
239
+ "loss": 1.9665,
240
+ "step": 8000
241
+ },
242
+ {
243
+ "epoch": 0.95,
244
+ "eval_accuracy": 0.6836478246699454,
245
+ "eval_loss": 1.612231731414795,
246
+ "eval_runtime": 206.2817,
247
+ "eval_samples_per_second": 82.063,
248
+ "eval_steps_per_second": 10.258,
249
+ "step": 8000
250
+ },
251
+ {
252
+ "epoch": 1.0,
253
+ "learning_rate": 6.652874650273871e-06,
254
+ "loss": 1.9395,
255
+ "step": 8500
256
+ },
257
+ {
258
+ "epoch": 1.0,
259
+ "eval_accuracy": 0.6866433413548132,
260
+ "eval_loss": 1.5912940502166748,
261
+ "eval_runtime": 206.5703,
262
+ "eval_samples_per_second": 81.948,
263
+ "eval_steps_per_second": 10.243,
264
+ "step": 8500
265
+ },
266
+ {
267
+ "epoch": 1.06,
268
+ "learning_rate": 6.455845844662491e-06,
269
+ "loss": 1.9155,
270
+ "step": 9000
271
+ },
272
+ {
273
+ "epoch": 1.06,
274
+ "eval_accuracy": 0.6894629039599454,
275
+ "eval_loss": 1.5758066177368164,
276
+ "eval_runtime": 206.7537,
277
+ "eval_samples_per_second": 81.875,
278
+ "eval_steps_per_second": 10.234,
279
+ "step": 9000
280
+ },
281
+ {
282
+ "epoch": 1.12,
283
+ "learning_rate": 6.25881703905111e-06,
284
+ "loss": 1.8828,
285
+ "step": 9500
286
+ },
287
+ {
288
+ "epoch": 1.12,
289
+ "eval_accuracy": 0.6918324332777558,
290
+ "eval_loss": 1.5607072114944458,
291
+ "eval_runtime": 203.7553,
292
+ "eval_samples_per_second": 83.08,
293
+ "eval_steps_per_second": 10.385,
294
+ "step": 9500
295
+ },
296
+ {
297
+ "epoch": 1.18,
298
+ "learning_rate": 6.06178823343973e-06,
299
+ "loss": 1.8721,
300
+ "step": 10000
301
+ },
302
+ {
303
+ "epoch": 1.18,
304
+ "eval_accuracy": 0.6948063170580184,
305
+ "eval_loss": 1.5421587228775024,
306
+ "eval_runtime": 205.9617,
307
+ "eval_samples_per_second": 82.19,
308
+ "eval_steps_per_second": 10.274,
309
+ "step": 10000
310
+ },
311
+ {
312
+ "epoch": 1.24,
313
+ "learning_rate": 5.8647594278283496e-06,
314
+ "loss": 1.8474,
315
+ "step": 10500
316
+ },
317
+ {
318
+ "epoch": 1.24,
319
+ "eval_accuracy": 0.6963892745418871,
320
+ "eval_loss": 1.5320152044296265,
321
+ "eval_runtime": 206.4027,
322
+ "eval_samples_per_second": 82.014,
323
+ "eval_steps_per_second": 10.252,
324
+ "step": 10500
325
+ },
326
+ {
327
+ "epoch": 1.3,
328
+ "learning_rate": 5.667730622216968e-06,
329
+ "loss": 1.8293,
330
+ "step": 11000
331
+ },
332
+ {
333
+ "epoch": 1.3,
334
+ "eval_accuracy": 0.6978303363523796,
335
+ "eval_loss": 1.5213782787322998,
336
+ "eval_runtime": 206.4515,
337
+ "eval_samples_per_second": 81.995,
338
+ "eval_steps_per_second": 10.249,
339
+ "step": 11000
340
+ },
341
+ {
342
+ "epoch": 1.36,
343
+ "learning_rate": 5.471095874216811e-06,
344
+ "loss": 1.8129,
345
+ "step": 11500
346
+ },
347
+ {
348
+ "epoch": 1.36,
349
+ "eval_accuracy": 0.6997515674908317,
350
+ "eval_loss": 1.5102019309997559,
351
+ "eval_runtime": 203.4625,
352
+ "eval_samples_per_second": 83.2,
353
+ "eval_steps_per_second": 10.4,
354
+ "step": 11500
355
+ },
356
+ {
357
+ "epoch": 1.42,
358
+ "learning_rate": 5.274067068605431e-06,
359
+ "loss": 1.8148,
360
+ "step": 12000
361
+ },
362
+ {
363
+ "epoch": 1.42,
364
+ "eval_accuracy": 0.7013130680794967,
365
+ "eval_loss": 1.5009928941726685,
366
+ "eval_runtime": 206.7456,
367
+ "eval_samples_per_second": 81.878,
368
+ "eval_steps_per_second": 10.235,
369
+ "step": 12000
370
+ },
371
+ {
372
+ "epoch": 1.48,
373
+ "learning_rate": 5.077038262994051e-06,
374
+ "loss": 1.7903,
375
+ "step": 12500
376
+ },
377
+ {
378
+ "epoch": 1.48,
379
+ "eval_accuracy": 0.7037519606361885,
380
+ "eval_loss": 1.484366774559021,
381
+ "eval_runtime": 207.0125,
382
+ "eval_samples_per_second": 81.773,
383
+ "eval_steps_per_second": 10.222,
384
+ "step": 12500
385
+ },
386
+ {
387
+ "epoch": 1.54,
388
+ "learning_rate": 4.88000945738267e-06,
389
+ "loss": 1.7815,
390
+ "step": 13000
391
+ },
392
+ {
393
+ "epoch": 1.54,
394
+ "eval_accuracy": 0.7039102273054718,
395
+ "eval_loss": 1.4823458194732666,
396
+ "eval_runtime": 206.0669,
397
+ "eval_samples_per_second": 82.148,
398
+ "eval_steps_per_second": 10.269,
399
+ "step": 13000
400
+ },
401
+ {
402
+ "epoch": 1.6,
403
+ "learning_rate": 4.68298065177129e-06,
404
+ "loss": 1.7637,
405
+ "step": 13500
406
+ },
407
+ {
408
+ "epoch": 1.6,
409
+ "eval_accuracy": 0.705173223800616,
410
+ "eval_loss": 1.4746402502059937,
411
+ "eval_runtime": 202.4173,
412
+ "eval_samples_per_second": 83.629,
413
+ "eval_steps_per_second": 10.454,
414
+ "step": 13500
415
+ },
416
+ {
417
+ "epoch": 1.66,
418
+ "learning_rate": 4.485951846159909e-06,
419
+ "loss": 1.7623,
420
+ "step": 14000
421
+ },
422
+ {
423
+ "epoch": 1.66,
424
+ "eval_accuracy": 0.706123367116372,
425
+ "eval_loss": 1.470130205154419,
426
+ "eval_runtime": 205.8377,
427
+ "eval_samples_per_second": 82.24,
428
+ "eval_steps_per_second": 10.28,
429
+ "step": 14000
430
+ },
431
+ {
432
+ "epoch": 1.71,
433
+ "learning_rate": 4.289317098159752e-06,
434
+ "loss": 1.7402,
435
+ "step": 14500
436
+ },
437
+ {
438
+ "epoch": 1.71,
439
+ "eval_accuracy": 0.7075649407306767,
440
+ "eval_loss": 1.4597938060760498,
441
+ "eval_runtime": 206.177,
442
+ "eval_samples_per_second": 82.104,
443
+ "eval_steps_per_second": 10.263,
444
+ "step": 14500
445
+ },
446
+ {
447
+ "epoch": 1.77,
448
+ "learning_rate": 4.092288292548371e-06,
449
+ "loss": 1.7376,
450
+ "step": 15000
451
+ },
452
+ {
453
+ "epoch": 1.77,
454
+ "eval_accuracy": 0.7089666967285505,
455
+ "eval_loss": 1.451911449432373,
456
+ "eval_runtime": 206.3085,
457
+ "eval_samples_per_second": 82.052,
458
+ "eval_steps_per_second": 10.256,
459
+ "step": 15000
460
+ },
461
+ {
462
+ "epoch": 1.83,
463
+ "learning_rate": 3.89525948693699e-06,
464
+ "loss": 1.7287,
465
+ "step": 15500
466
+ },
467
+ {
468
+ "epoch": 1.83,
469
+ "eval_accuracy": 0.7101150715078346,
470
+ "eval_loss": 1.4501255750656128,
471
+ "eval_runtime": 100.0594,
472
+ "eval_samples_per_second": 169.18,
473
+ "eval_steps_per_second": 21.147,
474
+ "step": 15500
475
+ },
476
+ {
477
+ "epoch": 1.89,
478
+ "learning_rate": 3.6982306813256103e-06,
479
+ "loss": 1.7273,
480
+ "step": 16000
481
+ },
482
+ {
483
+ "epoch": 1.89,
484
+ "eval_accuracy": 0.7106747872019036,
485
+ "eval_loss": 1.4408985376358032,
486
+ "eval_runtime": 100.2351,
487
+ "eval_samples_per_second": 168.883,
488
+ "eval_steps_per_second": 21.11,
489
+ "step": 16000
490
+ },
491
+ {
492
+ "epoch": 1.95,
493
+ "learning_rate": 3.5012018757142298e-06,
494
+ "loss": 1.7119,
495
+ "step": 16500
496
+ },
497
+ {
498
+ "epoch": 1.95,
499
+ "eval_accuracy": 0.7125312598082394,
500
+ "eval_loss": 1.431384563446045,
501
+ "eval_runtime": 100.206,
502
+ "eval_samples_per_second": 168.932,
503
+ "eval_steps_per_second": 21.117,
504
+ "step": 16500
505
+ },
506
+ {
507
+ "epoch": 2.01,
508
+ "learning_rate": 3.3045671277140724e-06,
509
+ "loss": 1.7098,
510
+ "step": 17000
511
+ },
512
+ {
513
+ "epoch": 2.01,
514
+ "eval_accuracy": 0.712873669928985,
515
+ "eval_loss": 1.4268542528152466,
516
+ "eval_runtime": 99.9713,
517
+ "eval_samples_per_second": 169.329,
518
+ "eval_steps_per_second": 21.166,
519
+ "step": 17000
520
+ },
521
+ {
522
+ "epoch": 2.07,
523
+ "learning_rate": 3.1075383221026915e-06,
524
+ "loss": 1.6978,
525
+ "step": 17500
526
+ },
527
+ {
528
+ "epoch": 2.07,
529
+ "eval_accuracy": 0.7132452679915875,
530
+ "eval_loss": 1.4275221824645996,
531
+ "eval_runtime": 100.0415,
532
+ "eval_samples_per_second": 169.21,
533
+ "eval_steps_per_second": 21.151,
534
+ "step": 17500
535
+ },
536
+ {
537
+ "epoch": 2.13,
538
+ "learning_rate": 2.910509516491311e-06,
539
+ "loss": 1.698,
540
+ "step": 18000
541
+ },
542
+ {
543
+ "epoch": 2.13,
544
+ "eval_accuracy": 0.7139832935058783,
545
+ "eval_loss": 1.421799898147583,
546
+ "eval_runtime": 100.2878,
547
+ "eval_samples_per_second": 168.794,
548
+ "eval_steps_per_second": 21.099,
549
+ "step": 18000
550
+ },
551
+ {
552
+ "epoch": 2.19,
553
+ "learning_rate": 2.713480710879931e-06,
554
+ "loss": 1.6837,
555
+ "step": 18500
556
+ },
557
+ {
558
+ "epoch": 2.19,
559
+ "eval_accuracy": 0.7146896815582429,
560
+ "eval_loss": 1.4150662422180176,
561
+ "eval_runtime": 100.1729,
562
+ "eval_samples_per_second": 168.988,
563
+ "eval_steps_per_second": 21.123,
564
+ "step": 18500
565
+ },
566
+ {
567
+ "epoch": 2.25,
568
+ "learning_rate": 2.5164519052685504e-06,
569
+ "loss": 1.6908,
570
+ "step": 19000
571
+ },
572
+ {
573
+ "epoch": 2.25,
574
+ "eval_accuracy": 0.7148777636104067,
575
+ "eval_loss": 1.413697361946106,
576
+ "eval_runtime": 100.0403,
577
+ "eval_samples_per_second": 169.212,
578
+ "eval_steps_per_second": 21.151,
579
+ "step": 19000
580
+ },
581
+ {
582
+ "epoch": 2.31,
583
+ "learning_rate": 2.3194230996571703e-06,
584
+ "loss": 1.6902,
585
+ "step": 19500
586
+ },
587
+ {
588
+ "epoch": 2.31,
589
+ "eval_accuracy": 0.7161167332062431,
590
+ "eval_loss": 1.4084678888320923,
591
+ "eval_runtime": 99.9514,
592
+ "eval_samples_per_second": 169.362,
593
+ "eval_steps_per_second": 21.17,
594
+ "step": 19500
595
+ },
596
+ {
597
+ "epoch": 2.36,
598
+ "learning_rate": 2.12239429404579e-06,
599
+ "loss": 1.6741,
600
+ "step": 20000
601
+ },
602
+ {
603
+ "epoch": 2.36,
604
+ "eval_accuracy": 0.7153571848548731,
605
+ "eval_loss": 1.4121222496032715,
606
+ "eval_runtime": 99.7721,
607
+ "eval_samples_per_second": 169.667,
608
+ "eval_steps_per_second": 21.208,
609
+ "step": 20000
610
+ },
611
+ {
612
+ "epoch": 2.42,
613
+ "learning_rate": 1.925759546045632e-06,
614
+ "loss": 1.6823,
615
+ "step": 20500
616
+ },
617
+ {
618
+ "epoch": 2.42,
619
+ "eval_accuracy": 0.7164751883355099,
620
+ "eval_loss": 1.4036943912506104,
621
+ "eval_runtime": 96.9786,
622
+ "eval_samples_per_second": 174.554,
623
+ "eval_steps_per_second": 21.819,
624
+ "step": 20500
625
+ },
626
+ {
627
+ "epoch": 2.48,
628
+ "learning_rate": 1.7287307404342515e-06,
629
+ "loss": 1.6692,
630
+ "step": 21000
631
+ },
632
+ {
633
+ "epoch": 2.48,
634
+ "eval_accuracy": 0.7164227335870778,
635
+ "eval_loss": 1.4038887023925781,
636
+ "eval_runtime": 96.6299,
637
+ "eval_samples_per_second": 175.184,
638
+ "eval_steps_per_second": 21.898,
639
+ "step": 21000
640
+ },
641
+ {
642
+ "epoch": 2.54,
643
+ "learning_rate": 1.5317019348228712e-06,
644
+ "loss": 1.6669,
645
+ "step": 21500
646
+ },
647
+ {
648
+ "epoch": 2.54,
649
+ "eval_accuracy": 0.7171810007042829,
650
+ "eval_loss": 1.4014757871627808,
651
+ "eval_runtime": 96.5289,
652
+ "eval_samples_per_second": 175.367,
653
+ "eval_steps_per_second": 21.921,
654
+ "step": 21500
655
+ },
656
+ {
657
+ "epoch": 2.6,
658
+ "learning_rate": 1.334673129211491e-06,
659
+ "loss": 1.6613,
660
+ "step": 22000
661
+ },
662
+ {
663
+ "epoch": 2.6,
664
+ "eval_accuracy": 0.7179443895145537,
665
+ "eval_loss": 1.3979177474975586,
666
+ "eval_runtime": 96.4739,
667
+ "eval_samples_per_second": 175.467,
668
+ "eval_steps_per_second": 21.933,
669
+ "step": 22000
670
+ },
671
+ {
672
+ "epoch": 2.66,
673
+ "learning_rate": 1.1376443236001104e-06,
674
+ "loss": 1.664,
675
+ "step": 22500
676
+ },
677
+ {
678
+ "epoch": 2.66,
679
+ "eval_accuracy": 0.7180417425737022,
680
+ "eval_loss": 1.3960251808166504,
681
+ "eval_runtime": 96.2769,
682
+ "eval_samples_per_second": 175.826,
683
+ "eval_steps_per_second": 21.978,
684
+ "step": 22500
685
+ },
686
+ {
687
+ "epoch": 2.72,
688
+ "learning_rate": 9.406155179887299e-07,
689
+ "loss": 1.6615,
690
+ "step": 23000
691
+ },
692
+ {
693
+ "epoch": 2.72,
694
+ "eval_accuracy": 0.71719773048631,
695
+ "eval_loss": 1.4012339115142822,
696
+ "eval_runtime": 96.2915,
697
+ "eval_samples_per_second": 175.8,
698
+ "eval_steps_per_second": 21.975,
699
+ "step": 23000
700
+ },
701
+ {
702
+ "epoch": 2.78,
703
+ "learning_rate": 7.435867123773496e-07,
704
+ "loss": 1.6627,
705
+ "step": 23500
706
+ },
707
+ {
708
+ "epoch": 2.78,
709
+ "eval_accuracy": 0.7177754487686726,
710
+ "eval_loss": 1.3974287509918213,
711
+ "eval_runtime": 96.2242,
712
+ "eval_samples_per_second": 175.922,
713
+ "eval_steps_per_second": 21.99,
714
+ "step": 23500
715
+ },
716
+ {
717
+ "epoch": 2.84,
718
+ "learning_rate": 5.465579067659692e-07,
719
+ "loss": 1.6489,
720
+ "step": 24000
721
+ },
722
+ {
723
+ "epoch": 2.84,
724
+ "eval_accuracy": 0.7182007239397646,
725
+ "eval_loss": 1.3947515487670898,
726
+ "eval_runtime": 96.06,
727
+ "eval_samples_per_second": 176.223,
728
+ "eval_steps_per_second": 22.028,
729
+ "step": 24000
730
+ },
731
+ {
732
+ "epoch": 2.9,
733
+ "learning_rate": 3.499231587658116e-07,
734
+ "loss": 1.6429,
735
+ "step": 24500
736
+ },
737
+ {
738
+ "epoch": 2.9,
739
+ "eval_accuracy": 0.7183795073646381,
740
+ "eval_loss": 1.3920938968658447,
741
+ "eval_runtime": 96.1507,
742
+ "eval_samples_per_second": 176.057,
743
+ "eval_steps_per_second": 22.007,
744
+ "step": 24500
745
+ },
746
+ {
747
+ "epoch": 2.96,
748
+ "learning_rate": 1.528943531544312e-07,
749
+ "loss": 1.6477,
750
+ "step": 25000
751
+ },
752
+ {
753
+ "epoch": 2.96,
754
+ "eval_accuracy": 0.7182484820177487,
755
+ "eval_loss": 1.3909834623336792,
756
+ "eval_runtime": 96.1141,
757
+ "eval_samples_per_second": 176.124,
758
+ "eval_steps_per_second": 22.016,
759
+ "step": 25000
760
+ },
761
+ {
762
+ "epoch": 3.0,
763
+ "step": 25377,
764
+ "total_flos": 1.2378168378261504e+17,
765
+ "train_loss": 2.057705193860182,
766
+ "train_runtime": 15314.6638,
767
+ "train_samples_per_second": 26.512,
768
+ "train_steps_per_second": 1.657
769
+ }
770
+ ],
771
+ "logging_steps": 500,
772
+ "max_steps": 25377,
773
+ "num_input_tokens_seen": 0,
774
+ "num_train_epochs": 3,
775
+ "save_steps": 500,
776
+ "total_flos": 1.2378168378261504e+17,
777
+ "train_batch_size": 8,
778
+ "trial_name": null,
779
+ "trial_params": null
780
+ }