root committed on
Commit
aa4fb01
1 Parent(s): ce3976c

model trained

Browse files
Files changed (6) hide show
  1. optimizer.pt +1 -1
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +426 -168
  6. training_args.bin +2 -2
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9419b0eb7839d5e42ca271954b41f67991b101009766c0844982d592f8182675
3
  size 1575738181
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20fdb5c2c4323d207318908a0c1e872739153bd4b1b3f357b7fd7f3270225435
3
  size 1575738181
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:472de990d8149c4c98f9c4d9fc04749f29d8f591735c97266ff7ccfb00af577d
3
  size 787895775
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1ae5619b0c0979a19f537880b5641b0f5fcdc9c9ff5dfa462f2489c466ddfeb
3
  size 787895775
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fedf0b601049d3e138f125d5ae7fc032c75040b14857ef8bae20f0d3d37de50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad5e0a56dbbaa43f9d8745ae549f2f2bd42a7d1d9f6dc982adb3cb0b96d71acb
3
  size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1183dc560918171dd71579eda592900e97fdd64a88c22c27997a4148326cda9f
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4694a64b29aceba9bc01f87f0f202f1ee1c1bba0130df96038ab6f4b43c4c590
3
  size 623
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.319148936170213,
5
- "global_step": 500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -10,347 +10,605 @@
10
  {
11
  "epoch": 0.11,
12
  "learning_rate": 3e-06,
13
- "loss": 3.4862,
14
  "step": 10
15
  },
16
  {
17
- "epoch": 0.21,
18
  "learning_rate": 6e-06,
19
- "loss": 3.4466,
20
  "step": 20
21
  },
22
  {
23
- "epoch": 0.32,
24
  "learning_rate": 9e-06,
25
- "loss": 3.3785,
26
  "step": 30
27
  },
28
  {
29
- "epoch": 0.43,
30
  "learning_rate": 1.2e-05,
31
- "loss": 3.1635,
32
  "step": 40
33
  },
34
  {
35
- "epoch": 0.53,
36
  "learning_rate": 1.5e-05,
37
- "loss": 2.7939,
38
  "step": 50
39
  },
40
  {
41
- "epoch": 0.64,
42
  "learning_rate": 1.8e-05,
43
- "loss": 2.6209,
44
  "step": 60
45
  },
46
  {
47
- "epoch": 0.74,
48
  "learning_rate": 2.1e-05,
49
- "loss": 2.546,
50
  "step": 70
51
  },
52
  {
53
- "epoch": 0.85,
54
  "learning_rate": 2.4e-05,
55
- "loss": 2.4935,
56
  "step": 80
57
  },
58
  {
59
- "epoch": 0.96,
60
  "learning_rate": 2.7000000000000002e-05,
61
- "loss": 2.5316,
62
  "step": 90
63
  },
64
  {
65
- "epoch": 1.06,
66
  "learning_rate": 3e-05,
67
- "loss": 2.2506,
68
  "step": 100
69
  },
70
  {
71
- "epoch": 1.06,
72
- "eval_loss": 2.3243110179901123,
73
- "eval_runtime": 30.5113,
74
- "eval_samples_per_second": 5.178,
75
- "eval_steps_per_second": 0.164,
76
  "step": 100
77
  },
78
  {
79
- "epoch": 1.17,
80
- "learning_rate": 2.9642857142857144e-05,
81
- "loss": 2.4025,
82
  "step": 110
83
  },
84
  {
85
- "epoch": 1.28,
86
- "learning_rate": 2.9285714285714284e-05,
87
- "loss": 2.1887,
88
  "step": 120
89
  },
90
  {
91
- "epoch": 1.38,
92
- "learning_rate": 2.892857142857143e-05,
93
- "loss": 2.1512,
94
  "step": 130
95
  },
96
  {
97
- "epoch": 1.49,
98
- "learning_rate": 2.857142857142857e-05,
99
- "loss": 1.9806,
100
  "step": 140
101
  },
102
  {
103
- "epoch": 1.6,
104
- "learning_rate": 2.8214285714285714e-05,
105
- "loss": 1.9524,
106
  "step": 150
107
  },
108
  {
109
- "epoch": 1.7,
110
- "learning_rate": 2.7857142857142858e-05,
111
- "loss": 1.9475,
112
  "step": 160
113
  },
114
  {
115
- "epoch": 1.81,
116
- "learning_rate": 2.75e-05,
117
- "loss": 1.927,
118
  "step": 170
119
  },
120
  {
121
- "epoch": 1.91,
122
- "learning_rate": 2.7142857142857144e-05,
123
- "loss": 1.9126,
124
  "step": 180
125
  },
126
  {
127
- "epoch": 2.02,
128
- "learning_rate": 2.6785714285714288e-05,
129
- "loss": 1.8077,
130
  "step": 190
131
  },
132
  {
133
- "epoch": 2.13,
134
- "learning_rate": 2.6428571428571428e-05,
135
- "loss": 1.4381,
136
  "step": 200
137
  },
138
  {
139
- "epoch": 2.13,
140
- "eval_loss": 1.754191279411316,
141
- "eval_runtime": 31.7299,
142
- "eval_samples_per_second": 4.98,
143
- "eval_steps_per_second": 0.158,
144
  "step": 200
145
  },
146
  {
147
- "epoch": 2.23,
148
- "learning_rate": 2.607142857142857e-05,
149
- "loss": 1.5071,
150
  "step": 210
151
  },
152
  {
153
- "epoch": 2.34,
154
- "learning_rate": 2.5714285714285714e-05,
155
- "loss": 1.426,
156
  "step": 220
157
  },
158
  {
159
- "epoch": 2.45,
160
- "learning_rate": 2.5357142857142858e-05,
161
- "loss": 1.4455,
162
  "step": 230
163
  },
164
  {
165
- "epoch": 2.55,
166
- "learning_rate": 2.5e-05,
167
- "loss": 1.4927,
168
  "step": 240
169
  },
170
  {
171
- "epoch": 2.66,
172
- "learning_rate": 2.464285714285714e-05,
173
- "loss": 1.4041,
174
  "step": 250
175
  },
176
  {
177
- "epoch": 2.77,
178
- "learning_rate": 2.4285714285714288e-05,
179
- "loss": 1.4084,
180
  "step": 260
181
  },
182
  {
183
- "epoch": 2.87,
184
- "learning_rate": 2.392857142857143e-05,
185
- "loss": 1.414,
186
  "step": 270
187
  },
188
  {
189
- "epoch": 2.98,
190
- "learning_rate": 2.357142857142857e-05,
191
- "loss": 1.385,
192
  "step": 280
193
  },
194
  {
195
- "epoch": 3.09,
196
- "learning_rate": 2.3214285714285715e-05,
197
- "loss": 1.0958,
198
  "step": 290
199
  },
200
  {
201
- "epoch": 3.19,
202
- "learning_rate": 2.2857142857142858e-05,
203
- "loss": 1.0876,
204
  "step": 300
205
  },
206
  {
207
- "epoch": 3.19,
208
- "eval_loss": 1.5441479682922363,
209
- "eval_runtime": 42.3423,
210
- "eval_samples_per_second": 3.731,
211
- "eval_steps_per_second": 0.118,
212
  "step": 300
213
  },
214
  {
215
- "epoch": 3.3,
216
- "learning_rate": 2.25e-05,
217
- "loss": 1.0589,
218
  "step": 310
219
  },
220
  {
221
- "epoch": 3.4,
222
- "learning_rate": 2.2142857142857145e-05,
223
- "loss": 1.137,
224
  "step": 320
225
  },
226
  {
227
- "epoch": 3.51,
228
- "learning_rate": 2.1785714285714285e-05,
229
- "loss": 1.0838,
230
  "step": 330
231
  },
232
  {
233
- "epoch": 3.62,
234
- "learning_rate": 2.1428571428571428e-05,
235
- "loss": 1.1054,
236
  "step": 340
237
  },
238
  {
239
- "epoch": 3.72,
240
- "learning_rate": 2.107142857142857e-05,
241
- "loss": 1.0415,
242
  "step": 350
243
  },
244
  {
245
- "epoch": 3.83,
246
- "learning_rate": 2.0714285714285715e-05,
247
- "loss": 1.0444,
248
  "step": 360
249
  },
250
  {
251
- "epoch": 3.94,
252
- "learning_rate": 2.0357142857142858e-05,
253
- "loss": 0.9463,
254
  "step": 370
255
  },
256
  {
257
- "epoch": 4.04,
258
- "learning_rate": 1.9999999999999998e-05,
259
- "loss": 1.0593,
260
  "step": 380
261
  },
262
  {
263
- "epoch": 4.15,
264
- "learning_rate": 1.9642857142857145e-05,
265
- "loss": 0.8724,
266
  "step": 390
267
  },
268
  {
269
- "epoch": 4.26,
270
- "learning_rate": 1.928571428571429e-05,
271
- "loss": 0.8654,
272
  "step": 400
273
  },
274
  {
275
- "epoch": 4.26,
276
- "eval_loss": 1.2680882215499878,
277
- "eval_runtime": 30.013,
278
- "eval_samples_per_second": 5.264,
279
- "eval_steps_per_second": 0.167,
280
  "step": 400
281
  },
282
  {
283
- "epoch": 4.36,
284
- "learning_rate": 1.892857142857143e-05,
285
- "loss": 0.8456,
286
  "step": 410
287
  },
288
  {
289
- "epoch": 4.47,
290
- "learning_rate": 1.8571428571428572e-05,
291
- "loss": 0.8181,
292
  "step": 420
293
  },
294
  {
295
- "epoch": 4.57,
296
- "learning_rate": 1.8214285714285712e-05,
297
- "loss": 0.8859,
298
  "step": 430
299
  },
300
  {
301
- "epoch": 4.68,
302
- "learning_rate": 1.785714285714286e-05,
303
- "loss": 0.7514,
304
  "step": 440
305
  },
306
  {
307
- "epoch": 4.79,
308
- "learning_rate": 1.7500000000000002e-05,
309
- "loss": 0.8128,
310
  "step": 450
311
  },
312
  {
313
- "epoch": 4.89,
314
- "learning_rate": 1.7142857142857142e-05,
315
- "loss": 0.8051,
316
  "step": 460
317
  },
318
  {
319
- "epoch": 5.0,
320
- "learning_rate": 1.6785714285714285e-05,
321
- "loss": 0.7211,
322
  "step": 470
323
  },
324
  {
325
- "epoch": 5.11,
326
- "learning_rate": 1.6428571428571432e-05,
327
- "loss": 0.6213,
328
  "step": 480
329
  },
330
  {
331
- "epoch": 5.21,
332
- "learning_rate": 1.6071428571428572e-05,
333
- "loss": 0.6465,
334
  "step": 490
335
  },
336
  {
337
- "epoch": 5.32,
338
- "learning_rate": 1.5714285714285715e-05,
339
- "loss": 0.6749,
340
  "step": 500
341
  },
342
  {
343
- "epoch": 5.32,
344
- "eval_loss": 1.1592129468917847,
345
- "eval_runtime": 26.8811,
346
- "eval_samples_per_second": 5.878,
347
- "eval_steps_per_second": 0.186,
348
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  }
350
  ],
351
- "max_steps": 940,
352
  "num_train_epochs": 10,
353
- "total_flos": 1325810169734400.0,
354
  "trial_name": null,
355
  "trial_params": null
356
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.846153846153847,
5
+ "global_step": 896,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
10
  {
11
  "epoch": 0.11,
12
  "learning_rate": 3e-06,
13
+ "loss": 4.0528,
14
  "step": 10
15
  },
16
  {
17
+ "epoch": 0.22,
18
  "learning_rate": 6e-06,
19
+ "loss": 4.0182,
20
  "step": 20
21
  },
22
  {
23
+ "epoch": 0.33,
24
  "learning_rate": 9e-06,
25
+ "loss": 3.9239,
26
  "step": 30
27
  },
28
  {
29
+ "epoch": 0.44,
30
  "learning_rate": 1.2e-05,
31
+ "loss": 3.5011,
32
  "step": 40
33
  },
34
  {
35
+ "epoch": 0.55,
36
  "learning_rate": 1.5e-05,
37
+ "loss": 3.2438,
38
  "step": 50
39
  },
40
  {
41
+ "epoch": 0.66,
42
  "learning_rate": 1.8e-05,
43
+ "loss": 3.1774,
44
  "step": 60
45
  },
46
  {
47
+ "epoch": 0.77,
48
  "learning_rate": 2.1e-05,
49
+ "loss": 3.1056,
50
  "step": 70
51
  },
52
  {
53
+ "epoch": 0.88,
54
  "learning_rate": 2.4e-05,
55
+ "loss": 3.0254,
56
  "step": 80
57
  },
58
  {
59
+ "epoch": 0.99,
60
  "learning_rate": 2.7000000000000002e-05,
61
+ "loss": 2.941,
62
  "step": 90
63
  },
64
  {
65
+ "epoch": 1.1,
66
  "learning_rate": 3e-05,
67
+ "loss": 2.8396,
68
  "step": 100
69
  },
70
  {
71
+ "epoch": 1.1,
72
+ "eval_loss": 2.859715223312378,
73
+ "eval_runtime": 56.6611,
74
+ "eval_samples_per_second": 4.73,
75
+ "eval_steps_per_second": 0.088,
76
  "step": 100
77
  },
78
  {
79
+ "epoch": 1.21,
80
+ "learning_rate": 2.962962962962963e-05,
81
+ "loss": 2.7745,
82
  "step": 110
83
  },
84
  {
85
+ "epoch": 1.32,
86
+ "learning_rate": 2.925925925925926e-05,
87
+ "loss": 2.6745,
88
  "step": 120
89
  },
90
  {
91
+ "epoch": 1.43,
92
+ "learning_rate": 2.8888888888888888e-05,
93
+ "loss": 2.6359,
94
  "step": 130
95
  },
96
  {
97
+ "epoch": 1.54,
98
+ "learning_rate": 2.851851851851852e-05,
99
+ "loss": 2.5991,
100
  "step": 140
101
  },
102
  {
103
+ "epoch": 1.65,
104
+ "learning_rate": 2.8148148148148147e-05,
105
+ "loss": 2.6046,
106
  "step": 150
107
  },
108
  {
109
+ "epoch": 1.76,
110
+ "learning_rate": 2.777777777777778e-05,
111
+ "loss": 2.4419,
112
  "step": 160
113
  },
114
  {
115
+ "epoch": 1.87,
116
+ "learning_rate": 2.7407407407407408e-05,
117
+ "loss": 2.4514,
118
  "step": 170
119
  },
120
  {
121
+ "epoch": 1.98,
122
+ "learning_rate": 2.7037037037037037e-05,
123
+ "loss": 2.4557,
124
  "step": 180
125
  },
126
  {
127
+ "epoch": 2.09,
128
+ "learning_rate": 2.6666666666666667e-05,
129
+ "loss": 2.1795,
130
  "step": 190
131
  },
132
  {
133
+ "epoch": 2.2,
134
+ "learning_rate": 2.6296296296296296e-05,
135
+ "loss": 2.1014,
136
  "step": 200
137
  },
138
  {
139
+ "epoch": 2.2,
140
+ "eval_loss": 2.2379915714263916,
141
+ "eval_runtime": 62.1156,
142
+ "eval_samples_per_second": 4.315,
143
+ "eval_steps_per_second": 0.08,
144
  "step": 200
145
  },
146
  {
147
+ "epoch": 2.31,
148
+ "learning_rate": 2.5925925925925925e-05,
149
+ "loss": 2.0129,
150
  "step": 210
151
  },
152
  {
153
+ "epoch": 2.42,
154
+ "learning_rate": 2.5555555555555557e-05,
155
+ "loss": 2.0986,
156
  "step": 220
157
  },
158
  {
159
+ "epoch": 2.53,
160
+ "learning_rate": 2.5185185185185183e-05,
161
+ "loss": 2.0814,
162
  "step": 230
163
  },
164
  {
165
+ "epoch": 2.64,
166
+ "learning_rate": 2.4814814814814816e-05,
167
+ "loss": 1.9973,
168
  "step": 240
169
  },
170
  {
171
+ "epoch": 2.75,
172
+ "learning_rate": 2.4444444444444445e-05,
173
+ "loss": 1.959,
174
  "step": 250
175
  },
176
  {
177
+ "epoch": 2.86,
178
+ "learning_rate": 2.4074074074074074e-05,
179
+ "loss": 1.9774,
180
  "step": 260
181
  },
182
  {
183
+ "epoch": 2.97,
184
+ "learning_rate": 2.3703703703703703e-05,
185
+ "loss": 1.9267,
186
  "step": 270
187
  },
188
  {
189
+ "epoch": 3.08,
190
+ "learning_rate": 2.3333333333333336e-05,
191
+ "loss": 1.8192,
192
  "step": 280
193
  },
194
  {
195
+ "epoch": 3.19,
196
+ "learning_rate": 2.296296296296296e-05,
197
+ "loss": 1.6512,
198
  "step": 290
199
  },
200
  {
201
+ "epoch": 3.3,
202
+ "learning_rate": 2.2592592592592594e-05,
203
+ "loss": 1.5577,
204
  "step": 300
205
  },
206
  {
207
+ "epoch": 3.3,
208
+ "eval_loss": 2.048072576522827,
209
+ "eval_runtime": 55.3343,
210
+ "eval_samples_per_second": 4.843,
211
+ "eval_steps_per_second": 0.09,
212
  "step": 300
213
  },
214
  {
215
+ "epoch": 3.41,
216
+ "learning_rate": 2.222222222222222e-05,
217
+ "loss": 1.5442,
218
  "step": 310
219
  },
220
  {
221
+ "epoch": 3.52,
222
+ "learning_rate": 2.1851851851851852e-05,
223
+ "loss": 1.5807,
224
  "step": 320
225
  },
226
  {
227
+ "epoch": 3.63,
228
+ "learning_rate": 2.148148148148148e-05,
229
+ "loss": 1.6009,
230
  "step": 330
231
  },
232
  {
233
+ "epoch": 3.74,
234
+ "learning_rate": 2.111111111111111e-05,
235
+ "loss": 1.5485,
236
  "step": 340
237
  },
238
  {
239
+ "epoch": 3.85,
240
+ "learning_rate": 2.074074074074074e-05,
241
+ "loss": 1.6185,
242
  "step": 350
243
  },
244
  {
245
+ "epoch": 3.96,
246
+ "learning_rate": 2.0370370370370372e-05,
247
+ "loss": 1.5032,
248
  "step": 360
249
  },
250
  {
251
+ "epoch": 4.07,
252
+ "learning_rate": 1.9999999999999998e-05,
253
+ "loss": 1.2841,
254
  "step": 370
255
  },
256
  {
257
+ "epoch": 4.18,
258
+ "learning_rate": 1.962962962962963e-05,
259
+ "loss": 1.2237,
260
  "step": 380
261
  },
262
  {
263
+ "epoch": 4.29,
264
+ "learning_rate": 1.925925925925926e-05,
265
+ "loss": 1.2177,
266
  "step": 390
267
  },
268
  {
269
+ "epoch": 4.4,
270
+ "learning_rate": 1.888888888888889e-05,
271
+ "loss": 1.2009,
272
  "step": 400
273
  },
274
  {
275
+ "epoch": 4.4,
276
+ "eval_loss": 1.8517041206359863,
277
+ "eval_runtime": 56.0224,
278
+ "eval_samples_per_second": 4.784,
279
+ "eval_steps_per_second": 0.089,
280
  "step": 400
281
  },
282
  {
283
+ "epoch": 4.51,
284
+ "learning_rate": 1.8518518518518518e-05,
285
+ "loss": 1.1844,
286
  "step": 410
287
  },
288
  {
289
+ "epoch": 4.62,
290
+ "learning_rate": 1.814814814814815e-05,
291
+ "loss": 1.2252,
292
  "step": 420
293
  },
294
  {
295
+ "epoch": 4.73,
296
+ "learning_rate": 1.7777777777777777e-05,
297
+ "loss": 1.1829,
298
  "step": 430
299
  },
300
  {
301
+ "epoch": 4.84,
302
+ "learning_rate": 1.740740740740741e-05,
303
+ "loss": 1.177,
304
  "step": 440
305
  },
306
  {
307
+ "epoch": 4.95,
308
+ "learning_rate": 1.7037037037037035e-05,
309
+ "loss": 1.194,
310
  "step": 450
311
  },
312
  {
313
+ "epoch": 5.05,
314
+ "learning_rate": 1.6666666666666667e-05,
315
+ "loss": 1.0947,
316
  "step": 460
317
  },
318
  {
319
+ "epoch": 5.16,
320
+ "learning_rate": 1.6296296296296297e-05,
321
+ "loss": 1.0177,
322
  "step": 470
323
  },
324
  {
325
+ "epoch": 5.27,
326
+ "learning_rate": 1.5925925925925926e-05,
327
+ "loss": 0.9692,
328
  "step": 480
329
  },
330
  {
331
+ "epoch": 5.38,
332
+ "learning_rate": 1.5555555555555555e-05,
333
+ "loss": 0.9933,
334
  "step": 490
335
  },
336
  {
337
+ "epoch": 5.49,
338
+ "learning_rate": 1.5185185185185186e-05,
339
+ "loss": 0.9451,
340
  "step": 500
341
  },
342
  {
343
+ "epoch": 5.49,
344
+ "eval_loss": 1.683161973953247,
345
+ "eval_runtime": 51.3302,
346
+ "eval_samples_per_second": 5.221,
347
+ "eval_steps_per_second": 0.097,
348
  "step": 500
349
+ },
350
+ {
351
+ "epoch": 5.6,
352
+ "learning_rate": 1.4814814814814815e-05,
353
+ "loss": 0.9674,
354
+ "step": 510
355
+ },
356
+ {
357
+ "epoch": 5.71,
358
+ "learning_rate": 1.4444444444444444e-05,
359
+ "loss": 0.9519,
360
+ "step": 520
361
+ },
362
+ {
363
+ "epoch": 5.82,
364
+ "learning_rate": 1.4074074074074073e-05,
365
+ "loss": 0.9108,
366
+ "step": 530
367
+ },
368
+ {
369
+ "epoch": 5.93,
370
+ "learning_rate": 1.3703703703703704e-05,
371
+ "loss": 0.8963,
372
+ "step": 540
373
+ },
374
+ {
375
+ "epoch": 6.04,
376
+ "learning_rate": 1.3333333333333333e-05,
377
+ "loss": 0.8255,
378
+ "step": 550
379
+ },
380
+ {
381
+ "epoch": 6.15,
382
+ "learning_rate": 1.2962962962962962e-05,
383
+ "loss": 0.773,
384
+ "step": 560
385
+ },
386
+ {
387
+ "epoch": 6.26,
388
+ "learning_rate": 1.2592592592592592e-05,
389
+ "loss": 0.7945,
390
+ "step": 570
391
+ },
392
+ {
393
+ "epoch": 6.37,
394
+ "learning_rate": 1.2222222222222222e-05,
395
+ "loss": 0.7398,
396
+ "step": 580
397
+ },
398
+ {
399
+ "epoch": 6.48,
400
+ "learning_rate": 1.1851851851851852e-05,
401
+ "loss": 0.7495,
402
+ "step": 590
403
+ },
404
+ {
405
+ "epoch": 6.59,
406
+ "learning_rate": 1.148148148148148e-05,
407
+ "loss": 0.7491,
408
+ "step": 600
409
+ },
410
+ {
411
+ "epoch": 6.59,
412
+ "eval_loss": 1.6291619539260864,
413
+ "eval_runtime": 56.8018,
414
+ "eval_samples_per_second": 4.718,
415
+ "eval_steps_per_second": 0.088,
416
+ "step": 600
417
+ },
418
+ {
419
+ "epoch": 6.7,
420
+ "learning_rate": 1.111111111111111e-05,
421
+ "loss": 0.7644,
422
+ "step": 610
423
+ },
424
+ {
425
+ "epoch": 6.81,
426
+ "learning_rate": 1.074074074074074e-05,
427
+ "loss": 0.7328,
428
+ "step": 620
429
+ },
430
+ {
431
+ "epoch": 6.92,
432
+ "learning_rate": 1.037037037037037e-05,
433
+ "loss": 0.7294,
434
+ "step": 630
435
+ },
436
+ {
437
+ "epoch": 7.03,
438
+ "learning_rate": 9.999999999999999e-06,
439
+ "loss": 0.6665,
440
+ "step": 640
441
+ },
442
+ {
443
+ "epoch": 7.14,
444
+ "learning_rate": 9.62962962962963e-06,
445
+ "loss": 0.6109,
446
+ "step": 650
447
+ },
448
+ {
449
+ "epoch": 7.25,
450
+ "learning_rate": 9.259259259259259e-06,
451
+ "loss": 0.6219,
452
+ "step": 660
453
+ },
454
+ {
455
+ "epoch": 7.36,
456
+ "learning_rate": 8.888888888888888e-06,
457
+ "loss": 0.592,
458
+ "step": 670
459
+ },
460
+ {
461
+ "epoch": 7.47,
462
+ "learning_rate": 8.518518518518517e-06,
463
+ "loss": 0.5722,
464
+ "step": 680
465
+ },
466
+ {
467
+ "epoch": 7.58,
468
+ "learning_rate": 8.148148148148148e-06,
469
+ "loss": 0.5538,
470
+ "step": 690
471
+ },
472
+ {
473
+ "epoch": 7.69,
474
+ "learning_rate": 7.777777777777777e-06,
475
+ "loss": 0.5546,
476
+ "step": 700
477
+ },
478
+ {
479
+ "epoch": 7.69,
480
+ "eval_loss": 1.5901862382888794,
481
+ "eval_runtime": 52.5596,
482
+ "eval_samples_per_second": 5.099,
483
+ "eval_steps_per_second": 0.095,
484
+ "step": 700
485
+ },
486
+ {
487
+ "epoch": 7.8,
488
+ "learning_rate": 7.4074074074074075e-06,
489
+ "loss": 0.546,
490
+ "step": 710
491
+ },
492
+ {
493
+ "epoch": 7.91,
494
+ "learning_rate": 7.037037037037037e-06,
495
+ "loss": 0.5322,
496
+ "step": 720
497
+ },
498
+ {
499
+ "epoch": 8.02,
500
+ "learning_rate": 6.666666666666667e-06,
501
+ "loss": 0.588,
502
+ "step": 730
503
+ },
504
+ {
505
+ "epoch": 8.13,
506
+ "learning_rate": 6.296296296296296e-06,
507
+ "loss": 0.4936,
508
+ "step": 740
509
+ },
510
+ {
511
+ "epoch": 8.24,
512
+ "learning_rate": 5.925925925925926e-06,
513
+ "loss": 0.4625,
514
+ "step": 750
515
+ },
516
+ {
517
+ "epoch": 8.35,
518
+ "learning_rate": 5.555555555555555e-06,
519
+ "loss": 0.4577,
520
+ "step": 760
521
+ },
522
+ {
523
+ "epoch": 8.46,
524
+ "learning_rate": 5.185185185185185e-06,
525
+ "loss": 0.4619,
526
+ "step": 770
527
+ },
528
+ {
529
+ "epoch": 8.57,
530
+ "learning_rate": 4.814814814814815e-06,
531
+ "loss": 0.5216,
532
+ "step": 780
533
+ },
534
+ {
535
+ "epoch": 8.68,
536
+ "learning_rate": 4.444444444444444e-06,
537
+ "loss": 0.4771,
538
+ "step": 790
539
+ },
540
+ {
541
+ "epoch": 8.79,
542
+ "learning_rate": 4.074074074074074e-06,
543
+ "loss": 0.452,
544
+ "step": 800
545
+ },
546
+ {
547
+ "epoch": 8.79,
548
+ "eval_loss": 1.4902839660644531,
549
+ "eval_runtime": 54.4822,
550
+ "eval_samples_per_second": 4.919,
551
+ "eval_steps_per_second": 0.092,
552
+ "step": 800
553
+ },
554
+ {
555
+ "epoch": 8.9,
556
+ "learning_rate": 3.7037037037037037e-06,
557
+ "loss": 0.4645,
558
+ "step": 810
559
+ },
560
+ {
561
+ "epoch": 9.01,
562
+ "learning_rate": 3.3333333333333333e-06,
563
+ "loss": 0.4459,
564
+ "step": 820
565
+ },
566
+ {
567
+ "epoch": 9.12,
568
+ "learning_rate": 2.962962962962963e-06,
569
+ "loss": 0.4157,
570
+ "step": 830
571
+ },
572
+ {
573
+ "epoch": 9.23,
574
+ "learning_rate": 2.5925925925925925e-06,
575
+ "loss": 0.3948,
576
+ "step": 840
577
+ },
578
+ {
579
+ "epoch": 9.34,
580
+ "learning_rate": 2.222222222222222e-06,
581
+ "loss": 0.4186,
582
+ "step": 850
583
+ },
584
+ {
585
+ "epoch": 9.45,
586
+ "learning_rate": 1.8518518518518519e-06,
587
+ "loss": 0.3841,
588
+ "step": 860
589
+ },
590
+ {
591
+ "epoch": 9.56,
592
+ "learning_rate": 1.4814814814814815e-06,
593
+ "loss": 0.4086,
594
+ "step": 870
595
+ },
596
+ {
597
+ "epoch": 9.67,
598
+ "learning_rate": 1.111111111111111e-06,
599
+ "loss": 0.4018,
600
+ "step": 880
601
+ },
602
+ {
603
+ "epoch": 9.78,
604
+ "learning_rate": 7.407407407407407e-07,
605
+ "loss": 0.405,
606
+ "step": 890
607
  }
608
  ],
609
+ "max_steps": 910,
610
  "num_train_epochs": 10,
611
+ "total_flos": 4163675271142080.0,
612
  "trial_name": null,
613
  "trial_params": null
614
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b898b254fd52b5f76b47c7563c394df3170917e3ca8dbd76b57a13829029972d
3
- size 3247
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f5cae42c017fd963e96f9cfd9dfe587553a09e551e3e4633e4e56493842b807
3
+ size 3311