edanigoben commited on
Commit
40a3b7b
·
1 Parent(s): 937d27f

(11) class crawler 65e

Browse files
Files changed (7) hide show
  1. optimizer.pt +3 -0
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +3 -0
  4. scaler.pt +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +991 -0
  7. training_args.bin +2 -2
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6fa49861293b096f4ec4a7287f7759ac99a2062f0f275e63f7280fc15e0f9f6
3
+ size 535750213
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41975f2be25efe13ece3d822a1ccbaeff3f8da514555385595b946d19a28e048
3
  size 267880109
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:491feb260883d9c03042e7f92de7b291e3680759cf403faee560c936244e4b35
3
  size 267880109
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3899805085e9d2ac6cbf41d1e231c8019961099f94de919796ff76ec1a2a6d27
3
+ size 14575
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f164eca888cfa14782f70fc9a282dd4e6c1593faf867ae17dfa46d264aa2942
3
+ size 557
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:772dd81534f16ccad499eda8bef2dc60508234c8daa4ecae9a5aa2c25f5ac971
3
+ size 627
trainer_state.json ADDED
@@ -0,0 +1,991 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 65.0,
5
+ "global_step": 6760,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.99,
12
+ "learning_rate": 4.9238165680473374e-05,
13
+ "loss": 1.4565,
14
+ "step": 103
15
+ },
16
+ {
17
+ "epoch": 1.0,
18
+ "eval_f1": 0.5061315988298739,
19
+ "eval_loss": 1.226438045501709,
20
+ "eval_runtime": 2.9031,
21
+ "eval_samples_per_second": 190.487,
22
+ "eval_steps_per_second": 12.056,
23
+ "step": 104
24
+ },
25
+ {
26
+ "epoch": 1.98,
27
+ "learning_rate": 4.8483727810650895e-05,
28
+ "loss": 1.08,
29
+ "step": 206
30
+ },
31
+ {
32
+ "epoch": 2.0,
33
+ "eval_f1": 0.5585235698125446,
34
+ "eval_loss": 1.1974396705627441,
35
+ "eval_runtime": 2.9416,
36
+ "eval_samples_per_second": 187.991,
37
+ "eval_steps_per_second": 11.898,
38
+ "step": 208
39
+ },
40
+ {
41
+ "epoch": 2.97,
42
+ "learning_rate": 4.772189349112427e-05,
43
+ "loss": 0.8073,
44
+ "step": 309
45
+ },
46
+ {
47
+ "epoch": 3.0,
48
+ "eval_f1": 0.5539323613883379,
49
+ "eval_loss": 1.276489496231079,
50
+ "eval_runtime": 2.969,
51
+ "eval_samples_per_second": 186.259,
52
+ "eval_steps_per_second": 11.789,
53
+ "step": 312
54
+ },
55
+ {
56
+ "epoch": 3.96,
57
+ "learning_rate": 4.696005917159764e-05,
58
+ "loss": 0.5577,
59
+ "step": 412
60
+ },
61
+ {
62
+ "epoch": 4.0,
63
+ "eval_f1": 0.5553847052045314,
64
+ "eval_loss": 1.427822232246399,
65
+ "eval_runtime": 2.9361,
66
+ "eval_samples_per_second": 188.346,
67
+ "eval_steps_per_second": 11.921,
68
+ "step": 416
69
+ },
70
+ {
71
+ "epoch": 4.95,
72
+ "learning_rate": 4.619822485207101e-05,
73
+ "loss": 0.3941,
74
+ "step": 515
75
+ },
76
+ {
77
+ "epoch": 5.0,
78
+ "eval_f1": 0.5570342860609194,
79
+ "eval_loss": 1.6517128944396973,
80
+ "eval_runtime": 2.9022,
81
+ "eval_samples_per_second": 190.542,
82
+ "eval_steps_per_second": 12.06,
83
+ "step": 520
84
+ },
85
+ {
86
+ "epoch": 5.94,
87
+ "learning_rate": 4.543639053254438e-05,
88
+ "loss": 0.2878,
89
+ "step": 618
90
+ },
91
+ {
92
+ "epoch": 6.0,
93
+ "eval_f1": 0.5619826716090497,
94
+ "eval_loss": 1.8180437088012695,
95
+ "eval_runtime": 2.855,
96
+ "eval_samples_per_second": 193.697,
97
+ "eval_steps_per_second": 12.259,
98
+ "step": 624
99
+ },
100
+ {
101
+ "epoch": 6.93,
102
+ "learning_rate": 4.468195266272189e-05,
103
+ "loss": 0.2337,
104
+ "step": 721
105
+ },
106
+ {
107
+ "epoch": 7.0,
108
+ "eval_f1": 0.5674708526030706,
109
+ "eval_loss": 1.9061989784240723,
110
+ "eval_runtime": 2.8641,
111
+ "eval_samples_per_second": 193.077,
112
+ "eval_steps_per_second": 12.22,
113
+ "step": 728
114
+ },
115
+ {
116
+ "epoch": 7.92,
117
+ "learning_rate": 4.392011834319526e-05,
118
+ "loss": 0.1743,
119
+ "step": 824
120
+ },
121
+ {
122
+ "epoch": 8.0,
123
+ "eval_f1": 0.5571774381839604,
124
+ "eval_loss": 2.166078805923462,
125
+ "eval_runtime": 2.8885,
126
+ "eval_samples_per_second": 191.449,
127
+ "eval_steps_per_second": 12.117,
128
+ "step": 832
129
+ },
130
+ {
131
+ "epoch": 8.91,
132
+ "learning_rate": 4.315828402366864e-05,
133
+ "loss": 0.1324,
134
+ "step": 927
135
+ },
136
+ {
137
+ "epoch": 9.0,
138
+ "eval_f1": 0.5586999359656486,
139
+ "eval_loss": 2.1434192657470703,
140
+ "eval_runtime": 2.8793,
141
+ "eval_samples_per_second": 192.062,
142
+ "eval_steps_per_second": 12.156,
143
+ "step": 936
144
+ },
145
+ {
146
+ "epoch": 9.9,
147
+ "learning_rate": 4.239644970414201e-05,
148
+ "loss": 0.1051,
149
+ "step": 1030
150
+ },
151
+ {
152
+ "epoch": 10.0,
153
+ "eval_f1": 0.571861247626083,
154
+ "eval_loss": 2.2514231204986572,
155
+ "eval_runtime": 2.8876,
156
+ "eval_samples_per_second": 191.511,
157
+ "eval_steps_per_second": 12.121,
158
+ "step": 1040
159
+ },
160
+ {
161
+ "epoch": 10.89,
162
+ "learning_rate": 4.163461538461539e-05,
163
+ "loss": 0.1016,
164
+ "step": 1133
165
+ },
166
+ {
167
+ "epoch": 11.0,
168
+ "eval_f1": 0.5608736700927537,
169
+ "eval_loss": 2.452277898788452,
170
+ "eval_runtime": 2.9662,
171
+ "eval_samples_per_second": 186.434,
172
+ "eval_steps_per_second": 11.8,
173
+ "step": 1144
174
+ },
175
+ {
176
+ "epoch": 11.88,
177
+ "learning_rate": 4.0872781065088764e-05,
178
+ "loss": 0.0814,
179
+ "step": 1236
180
+ },
181
+ {
182
+ "epoch": 12.0,
183
+ "eval_f1": 0.5643677851728315,
184
+ "eval_loss": 2.5340888500213623,
185
+ "eval_runtime": 2.8457,
186
+ "eval_samples_per_second": 194.329,
187
+ "eval_steps_per_second": 12.299,
188
+ "step": 1248
189
+ },
190
+ {
191
+ "epoch": 12.88,
192
+ "learning_rate": 4.0110946745562136e-05,
193
+ "loss": 0.0673,
194
+ "step": 1339
195
+ },
196
+ {
197
+ "epoch": 13.0,
198
+ "eval_f1": 0.5738915229311208,
199
+ "eval_loss": 2.6217703819274902,
200
+ "eval_runtime": 2.9035,
201
+ "eval_samples_per_second": 190.46,
202
+ "eval_steps_per_second": 12.054,
203
+ "step": 1352
204
+ },
205
+ {
206
+ "epoch": 13.87,
207
+ "learning_rate": 3.934911242603551e-05,
208
+ "loss": 0.0684,
209
+ "step": 1442
210
+ },
211
+ {
212
+ "epoch": 14.0,
213
+ "eval_f1": 0.5366433281464598,
214
+ "eval_loss": 2.9552414417266846,
215
+ "eval_runtime": 2.9063,
216
+ "eval_samples_per_second": 190.277,
217
+ "eval_steps_per_second": 12.043,
218
+ "step": 1456
219
+ },
220
+ {
221
+ "epoch": 14.86,
222
+ "learning_rate": 3.858727810650888e-05,
223
+ "loss": 0.0466,
224
+ "step": 1545
225
+ },
226
+ {
227
+ "epoch": 15.0,
228
+ "eval_f1": 0.5787084254032917,
229
+ "eval_loss": 2.7240512371063232,
230
+ "eval_runtime": 2.8887,
231
+ "eval_samples_per_second": 191.438,
232
+ "eval_steps_per_second": 12.116,
233
+ "step": 1560
234
+ },
235
+ {
236
+ "epoch": 15.85,
237
+ "learning_rate": 3.782544378698225e-05,
238
+ "loss": 0.0577,
239
+ "step": 1648
240
+ },
241
+ {
242
+ "epoch": 16.0,
243
+ "eval_f1": 0.5666557248979172,
244
+ "eval_loss": 2.821897506713867,
245
+ "eval_runtime": 2.8994,
246
+ "eval_samples_per_second": 190.727,
247
+ "eval_steps_per_second": 12.071,
248
+ "step": 1664
249
+ },
250
+ {
251
+ "epoch": 16.84,
252
+ "learning_rate": 3.706360946745562e-05,
253
+ "loss": 0.042,
254
+ "step": 1751
255
+ },
256
+ {
257
+ "epoch": 17.0,
258
+ "eval_f1": 0.56033452806457,
259
+ "eval_loss": 2.9155900478363037,
260
+ "eval_runtime": 2.9258,
261
+ "eval_samples_per_second": 189.01,
262
+ "eval_steps_per_second": 11.963,
263
+ "step": 1768
264
+ },
265
+ {
266
+ "epoch": 17.83,
267
+ "learning_rate": 3.6301775147928995e-05,
268
+ "loss": 0.0404,
269
+ "step": 1854
270
+ },
271
+ {
272
+ "epoch": 18.0,
273
+ "eval_f1": 0.5621979513908701,
274
+ "eval_loss": 2.893630266189575,
275
+ "eval_runtime": 2.9319,
276
+ "eval_samples_per_second": 188.613,
277
+ "eval_steps_per_second": 11.938,
278
+ "step": 1872
279
+ },
280
+ {
281
+ "epoch": 18.82,
282
+ "learning_rate": 3.553994082840237e-05,
283
+ "loss": 0.0426,
284
+ "step": 1957
285
+ },
286
+ {
287
+ "epoch": 19.0,
288
+ "eval_f1": 0.5766503161850353,
289
+ "eval_loss": 3.00762939453125,
290
+ "eval_runtime": 2.9064,
291
+ "eval_samples_per_second": 190.271,
292
+ "eval_steps_per_second": 12.042,
293
+ "step": 1976
294
+ },
295
+ {
296
+ "epoch": 19.81,
297
+ "learning_rate": 3.477810650887574e-05,
298
+ "loss": 0.0361,
299
+ "step": 2060
300
+ },
301
+ {
302
+ "epoch": 20.0,
303
+ "eval_f1": 0.5565194377868121,
304
+ "eval_loss": 3.043562173843384,
305
+ "eval_runtime": 2.9099,
306
+ "eval_samples_per_second": 190.043,
307
+ "eval_steps_per_second": 12.028,
308
+ "step": 2080
309
+ },
310
+ {
311
+ "epoch": 20.8,
312
+ "learning_rate": 3.401627218934911e-05,
313
+ "loss": 0.039,
314
+ "step": 2163
315
+ },
316
+ {
317
+ "epoch": 21.0,
318
+ "eval_f1": 0.5661283975776907,
319
+ "eval_loss": 3.034050226211548,
320
+ "eval_runtime": 2.8472,
321
+ "eval_samples_per_second": 194.228,
322
+ "eval_steps_per_second": 12.293,
323
+ "step": 2184
324
+ },
325
+ {
326
+ "epoch": 21.79,
327
+ "learning_rate": 3.325443786982248e-05,
328
+ "loss": 0.0311,
329
+ "step": 2266
330
+ },
331
+ {
332
+ "epoch": 22.0,
333
+ "eval_f1": 0.5698598461896062,
334
+ "eval_loss": 3.1546428203582764,
335
+ "eval_runtime": 2.8965,
336
+ "eval_samples_per_second": 190.921,
337
+ "eval_steps_per_second": 12.084,
338
+ "step": 2288
339
+ },
340
+ {
341
+ "epoch": 22.78,
342
+ "learning_rate": 3.2492603550295855e-05,
343
+ "loss": 0.0296,
344
+ "step": 2369
345
+ },
346
+ {
347
+ "epoch": 23.0,
348
+ "eval_f1": 0.5584145320343268,
349
+ "eval_loss": 3.3160221576690674,
350
+ "eval_runtime": 2.9004,
351
+ "eval_samples_per_second": 190.664,
352
+ "eval_steps_per_second": 12.067,
353
+ "step": 2392
354
+ },
355
+ {
356
+ "epoch": 23.77,
357
+ "learning_rate": 3.1730769230769234e-05,
358
+ "loss": 0.03,
359
+ "step": 2472
360
+ },
361
+ {
362
+ "epoch": 24.0,
363
+ "eval_f1": 0.5765799312977243,
364
+ "eval_loss": 3.2025678157806396,
365
+ "eval_runtime": 2.837,
366
+ "eval_samples_per_second": 194.921,
367
+ "eval_steps_per_second": 12.337,
368
+ "step": 2496
369
+ },
370
+ {
371
+ "epoch": 24.76,
372
+ "learning_rate": 3.0968934911242606e-05,
373
+ "loss": 0.0333,
374
+ "step": 2575
375
+ },
376
+ {
377
+ "epoch": 25.0,
378
+ "eval_f1": 0.5689553713820321,
379
+ "eval_loss": 3.211634397506714,
380
+ "eval_runtime": 2.9044,
381
+ "eval_samples_per_second": 190.402,
382
+ "eval_steps_per_second": 12.051,
383
+ "step": 2600
384
+ },
385
+ {
386
+ "epoch": 25.75,
387
+ "learning_rate": 3.0207100591715974e-05,
388
+ "loss": 0.0321,
389
+ "step": 2678
390
+ },
391
+ {
392
+ "epoch": 26.0,
393
+ "eval_f1": 0.5756108062994573,
394
+ "eval_loss": 3.2678425312042236,
395
+ "eval_runtime": 2.8888,
396
+ "eval_samples_per_second": 191.428,
397
+ "eval_steps_per_second": 12.116,
398
+ "step": 2704
399
+ },
400
+ {
401
+ "epoch": 26.74,
402
+ "learning_rate": 2.944526627218935e-05,
403
+ "loss": 0.0263,
404
+ "step": 2781
405
+ },
406
+ {
407
+ "epoch": 27.0,
408
+ "eval_f1": 0.5758065273285641,
409
+ "eval_loss": 3.2969822883605957,
410
+ "eval_runtime": 2.9527,
411
+ "eval_samples_per_second": 187.286,
412
+ "eval_steps_per_second": 11.854,
413
+ "step": 2808
414
+ },
415
+ {
416
+ "epoch": 27.73,
417
+ "learning_rate": 2.8683431952662725e-05,
418
+ "loss": 0.0281,
419
+ "step": 2884
420
+ },
421
+ {
422
+ "epoch": 28.0,
423
+ "eval_f1": 0.5781354966097151,
424
+ "eval_loss": 3.3730037212371826,
425
+ "eval_runtime": 2.8614,
426
+ "eval_samples_per_second": 193.264,
427
+ "eval_steps_per_second": 12.232,
428
+ "step": 2912
429
+ },
430
+ {
431
+ "epoch": 28.72,
432
+ "learning_rate": 2.7921597633136097e-05,
433
+ "loss": 0.0282,
434
+ "step": 2987
435
+ },
436
+ {
437
+ "epoch": 29.0,
438
+ "eval_f1": 0.5741866124789994,
439
+ "eval_loss": 3.364117383956909,
440
+ "eval_runtime": 2.8696,
441
+ "eval_samples_per_second": 192.707,
442
+ "eval_steps_per_second": 12.197,
443
+ "step": 3016
444
+ },
445
+ {
446
+ "epoch": 29.71,
447
+ "learning_rate": 2.7159763313609472e-05,
448
+ "loss": 0.0296,
449
+ "step": 3090
450
+ },
451
+ {
452
+ "epoch": 30.0,
453
+ "eval_f1": 0.5771762774162508,
454
+ "eval_loss": 3.3623032569885254,
455
+ "eval_runtime": 2.9567,
456
+ "eval_samples_per_second": 187.031,
457
+ "eval_steps_per_second": 11.837,
458
+ "step": 3120
459
+ },
460
+ {
461
+ "epoch": 30.7,
462
+ "learning_rate": 2.6397928994082844e-05,
463
+ "loss": 0.0308,
464
+ "step": 3193
465
+ },
466
+ {
467
+ "epoch": 31.0,
468
+ "eval_f1": 0.578537002980747,
469
+ "eval_loss": 3.4039528369903564,
470
+ "eval_runtime": 2.8263,
471
+ "eval_samples_per_second": 195.66,
472
+ "eval_steps_per_second": 12.384,
473
+ "step": 3224
474
+ },
475
+ {
476
+ "epoch": 31.69,
477
+ "learning_rate": 2.5636094674556216e-05,
478
+ "loss": 0.0308,
479
+ "step": 3296
480
+ },
481
+ {
482
+ "epoch": 32.0,
483
+ "eval_f1": 0.575919412837488,
484
+ "eval_loss": 3.392319679260254,
485
+ "eval_runtime": 2.9375,
486
+ "eval_samples_per_second": 188.254,
487
+ "eval_steps_per_second": 11.915,
488
+ "step": 3328
489
+ },
490
+ {
491
+ "epoch": 32.68,
492
+ "learning_rate": 2.4874260355029588e-05,
493
+ "loss": 0.0262,
494
+ "step": 3399
495
+ },
496
+ {
497
+ "epoch": 33.0,
498
+ "eval_f1": 0.5563772891428104,
499
+ "eval_loss": 3.4757542610168457,
500
+ "eval_runtime": 2.865,
501
+ "eval_samples_per_second": 193.019,
502
+ "eval_steps_per_second": 12.216,
503
+ "step": 3432
504
+ },
505
+ {
506
+ "epoch": 33.67,
507
+ "learning_rate": 2.411242603550296e-05,
508
+ "loss": 0.0319,
509
+ "step": 3502
510
+ },
511
+ {
512
+ "epoch": 34.0,
513
+ "eval_f1": 0.5738865992034025,
514
+ "eval_loss": 3.425334930419922,
515
+ "eval_runtime": 2.9109,
516
+ "eval_samples_per_second": 189.974,
517
+ "eval_steps_per_second": 12.024,
518
+ "step": 3536
519
+ },
520
+ {
521
+ "epoch": 34.66,
522
+ "learning_rate": 2.3350591715976332e-05,
523
+ "loss": 0.0277,
524
+ "step": 3605
525
+ },
526
+ {
527
+ "epoch": 35.0,
528
+ "eval_f1": 0.5785980513801816,
529
+ "eval_loss": 3.4686436653137207,
530
+ "eval_runtime": 2.93,
531
+ "eval_samples_per_second": 188.738,
532
+ "eval_steps_per_second": 11.945,
533
+ "step": 3640
534
+ },
535
+ {
536
+ "epoch": 35.65,
537
+ "learning_rate": 2.2588757396449707e-05,
538
+ "loss": 0.0289,
539
+ "step": 3708
540
+ },
541
+ {
542
+ "epoch": 36.0,
543
+ "eval_f1": 0.5836924697871717,
544
+ "eval_loss": 3.462078094482422,
545
+ "eval_runtime": 2.8428,
546
+ "eval_samples_per_second": 194.527,
547
+ "eval_steps_per_second": 12.312,
548
+ "step": 3744
549
+ },
550
+ {
551
+ "epoch": 36.64,
552
+ "learning_rate": 2.182692307692308e-05,
553
+ "loss": 0.0247,
554
+ "step": 3811
555
+ },
556
+ {
557
+ "epoch": 37.0,
558
+ "eval_f1": 0.5734707197245945,
559
+ "eval_loss": 3.481998920440674,
560
+ "eval_runtime": 3.0017,
561
+ "eval_samples_per_second": 184.228,
562
+ "eval_steps_per_second": 11.66,
563
+ "step": 3848
564
+ },
565
+ {
566
+ "epoch": 37.63,
567
+ "learning_rate": 2.106508875739645e-05,
568
+ "loss": 0.0303,
569
+ "step": 3914
570
+ },
571
+ {
572
+ "epoch": 38.0,
573
+ "eval_f1": 0.5770262969511715,
574
+ "eval_loss": 3.466510772705078,
575
+ "eval_runtime": 2.8587,
576
+ "eval_samples_per_second": 193.442,
577
+ "eval_steps_per_second": 12.243,
578
+ "step": 3952
579
+ },
580
+ {
581
+ "epoch": 38.62,
582
+ "learning_rate": 2.0303254437869823e-05,
583
+ "loss": 0.0239,
584
+ "step": 4017
585
+ },
586
+ {
587
+ "epoch": 39.0,
588
+ "eval_f1": 0.5666519467364683,
589
+ "eval_loss": 3.5593807697296143,
590
+ "eval_runtime": 2.8222,
591
+ "eval_samples_per_second": 195.946,
592
+ "eval_steps_per_second": 12.402,
593
+ "step": 4056
594
+ },
595
+ {
596
+ "epoch": 39.62,
597
+ "learning_rate": 1.9541420118343195e-05,
598
+ "loss": 0.0262,
599
+ "step": 4120
600
+ },
601
+ {
602
+ "epoch": 40.0,
603
+ "eval_f1": 0.5808476343157906,
604
+ "eval_loss": 3.5302422046661377,
605
+ "eval_runtime": 2.8598,
606
+ "eval_samples_per_second": 193.368,
607
+ "eval_steps_per_second": 12.238,
608
+ "step": 4160
609
+ },
610
+ {
611
+ "epoch": 40.61,
612
+ "learning_rate": 1.8779585798816567e-05,
613
+ "loss": 0.0282,
614
+ "step": 4223
615
+ },
616
+ {
617
+ "epoch": 41.0,
618
+ "eval_f1": 0.5835890408164021,
619
+ "eval_loss": 3.4572339057922363,
620
+ "eval_runtime": 2.8566,
621
+ "eval_samples_per_second": 193.584,
622
+ "eval_steps_per_second": 12.252,
623
+ "step": 4264
624
+ },
625
+ {
626
+ "epoch": 41.6,
627
+ "learning_rate": 1.8025147928994084e-05,
628
+ "loss": 0.0469,
629
+ "step": 4326
630
+ },
631
+ {
632
+ "epoch": 42.0,
633
+ "eval_f1": 0.5685331156394952,
634
+ "eval_loss": 3.609334707260132,
635
+ "eval_runtime": 2.8251,
636
+ "eval_samples_per_second": 195.747,
637
+ "eval_steps_per_second": 12.389,
638
+ "step": 4368
639
+ },
640
+ {
641
+ "epoch": 42.59,
642
+ "learning_rate": 1.7263313609467456e-05,
643
+ "loss": 0.0302,
644
+ "step": 4429
645
+ },
646
+ {
647
+ "epoch": 43.0,
648
+ "eval_f1": 0.5684067370608473,
649
+ "eval_loss": 3.6115400791168213,
650
+ "eval_runtime": 2.9194,
651
+ "eval_samples_per_second": 189.42,
652
+ "eval_steps_per_second": 11.989,
653
+ "step": 4472
654
+ },
655
+ {
656
+ "epoch": 43.58,
657
+ "learning_rate": 1.650147928994083e-05,
658
+ "loss": 0.0289,
659
+ "step": 4532
660
+ },
661
+ {
662
+ "epoch": 44.0,
663
+ "eval_f1": 0.5757900647671246,
664
+ "eval_loss": 3.629568099975586,
665
+ "eval_runtime": 2.9036,
666
+ "eval_samples_per_second": 190.453,
667
+ "eval_steps_per_second": 12.054,
668
+ "step": 4576
669
+ },
670
+ {
671
+ "epoch": 44.57,
672
+ "learning_rate": 1.5739644970414204e-05,
673
+ "loss": 0.0254,
674
+ "step": 4635
675
+ },
676
+ {
677
+ "epoch": 45.0,
678
+ "eval_f1": 0.5689505752768721,
679
+ "eval_loss": 3.7250843048095703,
680
+ "eval_runtime": 2.9726,
681
+ "eval_samples_per_second": 186.035,
682
+ "eval_steps_per_second": 11.774,
683
+ "step": 4680
684
+ },
685
+ {
686
+ "epoch": 45.56,
687
+ "learning_rate": 1.4977810650887576e-05,
688
+ "loss": 0.0283,
689
+ "step": 4738
690
+ },
691
+ {
692
+ "epoch": 46.0,
693
+ "eval_f1": 0.5592198654774546,
694
+ "eval_loss": 3.726353645324707,
695
+ "eval_runtime": 2.9328,
696
+ "eval_samples_per_second": 188.559,
697
+ "eval_steps_per_second": 11.934,
698
+ "step": 4784
699
+ },
700
+ {
701
+ "epoch": 46.55,
702
+ "learning_rate": 1.4215976331360948e-05,
703
+ "loss": 0.0246,
704
+ "step": 4841
705
+ },
706
+ {
707
+ "epoch": 47.0,
708
+ "eval_f1": 0.5650157110711802,
709
+ "eval_loss": 3.7832093238830566,
710
+ "eval_runtime": 2.9067,
711
+ "eval_samples_per_second": 190.249,
712
+ "eval_steps_per_second": 12.041,
713
+ "step": 4888
714
+ },
715
+ {
716
+ "epoch": 47.54,
717
+ "learning_rate": 1.345414201183432e-05,
718
+ "loss": 0.0311,
719
+ "step": 4944
720
+ },
721
+ {
722
+ "epoch": 48.0,
723
+ "eval_f1": 0.5681512072556809,
724
+ "eval_loss": 3.6964025497436523,
725
+ "eval_runtime": 2.9008,
726
+ "eval_samples_per_second": 190.634,
727
+ "eval_steps_per_second": 12.065,
728
+ "step": 4992
729
+ },
730
+ {
731
+ "epoch": 48.53,
732
+ "learning_rate": 1.2692307692307691e-05,
733
+ "loss": 0.0268,
734
+ "step": 5047
735
+ },
736
+ {
737
+ "epoch": 49.0,
738
+ "eval_f1": 0.5674808111122996,
739
+ "eval_loss": 3.7195167541503906,
740
+ "eval_runtime": 2.8604,
741
+ "eval_samples_per_second": 193.33,
742
+ "eval_steps_per_second": 12.236,
743
+ "step": 5096
744
+ },
745
+ {
746
+ "epoch": 49.52,
747
+ "learning_rate": 1.1930473372781067e-05,
748
+ "loss": 0.0293,
749
+ "step": 5150
750
+ },
751
+ {
752
+ "epoch": 50.0,
753
+ "eval_f1": 0.5614419693521525,
754
+ "eval_loss": 3.752530097961426,
755
+ "eval_runtime": 2.8761,
756
+ "eval_samples_per_second": 192.275,
757
+ "eval_steps_per_second": 12.169,
758
+ "step": 5200
759
+ },
760
+ {
761
+ "epoch": 50.51,
762
+ "learning_rate": 1.1168639053254439e-05,
763
+ "loss": 0.0282,
764
+ "step": 5253
765
+ },
766
+ {
767
+ "epoch": 51.0,
768
+ "eval_f1": 0.5655838635083059,
769
+ "eval_loss": 3.7514984607696533,
770
+ "eval_runtime": 2.8609,
771
+ "eval_samples_per_second": 193.296,
772
+ "eval_steps_per_second": 12.234,
773
+ "step": 5304
774
+ },
775
+ {
776
+ "epoch": 51.5,
777
+ "learning_rate": 1.040680473372781e-05,
778
+ "loss": 0.0248,
779
+ "step": 5356
780
+ },
781
+ {
782
+ "epoch": 52.0,
783
+ "eval_f1": 0.5590951084274065,
784
+ "eval_loss": 3.7639315128326416,
785
+ "eval_runtime": 2.8211,
786
+ "eval_samples_per_second": 196.025,
787
+ "eval_steps_per_second": 12.407,
788
+ "step": 5408
789
+ },
790
+ {
791
+ "epoch": 52.49,
792
+ "learning_rate": 9.644970414201183e-06,
793
+ "loss": 0.0257,
794
+ "step": 5459
795
+ },
796
+ {
797
+ "epoch": 53.0,
798
+ "eval_f1": 0.5480134247467852,
799
+ "eval_loss": 3.824922800064087,
800
+ "eval_runtime": 2.8475,
801
+ "eval_samples_per_second": 194.205,
802
+ "eval_steps_per_second": 12.291,
803
+ "step": 5512
804
+ },
805
+ {
806
+ "epoch": 53.48,
807
+ "learning_rate": 8.883136094674558e-06,
808
+ "loss": 0.0235,
809
+ "step": 5562
810
+ },
811
+ {
812
+ "epoch": 54.0,
813
+ "eval_f1": 0.5565796472147394,
814
+ "eval_loss": 3.7871253490448,
815
+ "eval_runtime": 2.9817,
816
+ "eval_samples_per_second": 185.462,
817
+ "eval_steps_per_second": 11.738,
818
+ "step": 5616
819
+ },
820
+ {
821
+ "epoch": 54.47,
822
+ "learning_rate": 8.12130177514793e-06,
823
+ "loss": 0.0299,
824
+ "step": 5665
825
+ },
826
+ {
827
+ "epoch": 55.0,
828
+ "eval_f1": 0.5574154263000176,
829
+ "eval_loss": 3.788760185241699,
830
+ "eval_runtime": 2.8852,
831
+ "eval_samples_per_second": 191.665,
832
+ "eval_steps_per_second": 12.131,
833
+ "step": 5720
834
+ },
835
+ {
836
+ "epoch": 55.46,
837
+ "learning_rate": 7.359467455621302e-06,
838
+ "loss": 0.0277,
839
+ "step": 5768
840
+ },
841
+ {
842
+ "epoch": 56.0,
843
+ "eval_f1": 0.563024311843682,
844
+ "eval_loss": 3.7907044887542725,
845
+ "eval_runtime": 2.8658,
846
+ "eval_samples_per_second": 192.962,
847
+ "eval_steps_per_second": 12.213,
848
+ "step": 5824
849
+ },
850
+ {
851
+ "epoch": 56.45,
852
+ "learning_rate": 6.597633136094675e-06,
853
+ "loss": 0.0256,
854
+ "step": 5871
855
+ },
856
+ {
857
+ "epoch": 57.0,
858
+ "eval_f1": 0.56153234588093,
859
+ "eval_loss": 3.799422264099121,
860
+ "eval_runtime": 2.8666,
861
+ "eval_samples_per_second": 192.912,
862
+ "eval_steps_per_second": 12.21,
863
+ "step": 5928
864
+ },
865
+ {
866
+ "epoch": 57.44,
867
+ "learning_rate": 5.8357988165680474e-06,
868
+ "loss": 0.0226,
869
+ "step": 5974
870
+ },
871
+ {
872
+ "epoch": 58.0,
873
+ "eval_f1": 0.5555061070073688,
874
+ "eval_loss": 3.811858892440796,
875
+ "eval_runtime": 2.8683,
876
+ "eval_samples_per_second": 192.797,
877
+ "eval_steps_per_second": 12.202,
878
+ "step": 6032
879
+ },
880
+ {
881
+ "epoch": 58.43,
882
+ "learning_rate": 5.07396449704142e-06,
883
+ "loss": 0.0284,
884
+ "step": 6077
885
+ },
886
+ {
887
+ "epoch": 59.0,
888
+ "eval_f1": 0.5597671150511061,
889
+ "eval_loss": 3.8192451000213623,
890
+ "eval_runtime": 2.8512,
891
+ "eval_samples_per_second": 193.951,
892
+ "eval_steps_per_second": 12.275,
893
+ "step": 6136
894
+ },
895
+ {
896
+ "epoch": 59.42,
897
+ "learning_rate": 4.312130177514793e-06,
898
+ "loss": 0.0233,
899
+ "step": 6180
900
+ },
901
+ {
902
+ "epoch": 60.0,
903
+ "eval_f1": 0.5584681716027172,
904
+ "eval_loss": 3.823091983795166,
905
+ "eval_runtime": 2.9385,
906
+ "eval_samples_per_second": 188.191,
907
+ "eval_steps_per_second": 11.911,
908
+ "step": 6240
909
+ },
910
+ {
911
+ "epoch": 60.41,
912
+ "learning_rate": 3.550295857988166e-06,
913
+ "loss": 0.0266,
914
+ "step": 6283
915
+ },
916
+ {
917
+ "epoch": 61.0,
918
+ "eval_f1": 0.5625000576804086,
919
+ "eval_loss": 3.8085415363311768,
920
+ "eval_runtime": 2.9015,
921
+ "eval_samples_per_second": 190.588,
922
+ "eval_steps_per_second": 12.063,
923
+ "step": 6344
924
+ },
925
+ {
926
+ "epoch": 61.4,
927
+ "learning_rate": 2.7958579881656803e-06,
928
+ "loss": 0.0267,
929
+ "step": 6386
930
+ },
931
+ {
932
+ "epoch": 62.0,
933
+ "eval_f1": 0.5622167257088028,
934
+ "eval_loss": 3.80642032623291,
935
+ "eval_runtime": 2.8514,
936
+ "eval_samples_per_second": 193.94,
937
+ "eval_steps_per_second": 12.275,
938
+ "step": 6448
939
+ },
940
+ {
941
+ "epoch": 62.39,
942
+ "learning_rate": 2.034023668639053e-06,
943
+ "loss": 0.0281,
944
+ "step": 6489
945
+ },
946
+ {
947
+ "epoch": 63.0,
948
+ "eval_f1": 0.564106811375439,
949
+ "eval_loss": 3.8057875633239746,
950
+ "eval_runtime": 2.8945,
951
+ "eval_samples_per_second": 191.055,
952
+ "eval_steps_per_second": 12.092,
953
+ "step": 6552
954
+ },
955
+ {
956
+ "epoch": 63.38,
957
+ "learning_rate": 1.2721893491124261e-06,
958
+ "loss": 0.025,
959
+ "step": 6592
960
+ },
961
+ {
962
+ "epoch": 64.0,
963
+ "eval_f1": 0.5644375312998279,
964
+ "eval_loss": 3.807055950164795,
965
+ "eval_runtime": 2.8941,
966
+ "eval_samples_per_second": 191.08,
967
+ "eval_steps_per_second": 12.094,
968
+ "step": 6656
969
+ },
970
+ {
971
+ "epoch": 64.38,
972
+ "learning_rate": 5.103550295857988e-07,
973
+ "loss": 0.0226,
974
+ "step": 6695
975
+ },
976
+ {
977
+ "epoch": 65.0,
978
+ "eval_f1": 0.5644375312998279,
979
+ "eval_loss": 3.807528018951416,
980
+ "eval_runtime": 2.8626,
981
+ "eval_samples_per_second": 193.181,
982
+ "eval_steps_per_second": 12.227,
983
+ "step": 6760
984
+ }
985
+ ],
986
+ "max_steps": 6760,
987
+ "num_train_epochs": 65,
988
+ "total_flos": 1.4286659901696e+16,
989
+ "trial_name": null,
990
+ "trial_params": null
991
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:302bac30143d46b59f4bc9c24632e9b0d21519a131eaf8effb35474b56fd7548
3
- size 3643
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69dba15562adc963c0e958faa40949482ed3f4a4e7db086eedc1130e4eecc7b6
3
+ size 3707