juyongjiang committed on
Commit
6ba91f9
1 Parent(s): 591bc39

update model checkpoint

Browse files
README.md CHANGED
@@ -1,14 +1,11 @@
1
  ---
2
- license: gemma
3
  library_name: peft
4
  tags:
5
  - alignment-handbook
6
- - trl
7
- - sft
8
  - generated_from_trainer
9
- base_model: google/gemma-7b
10
  datasets:
11
  - llama-duo/synth_summarize_dataset_dedup
 
12
  model-index:
13
  - name: gemma7b-summarize-gpt4o-4k
14
  results: []
@@ -21,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 4.0658
25
 
26
  ## Model description
27
 
@@ -42,13 +39,13 @@ More information needed
42
  The following hyperparameters were used during training:
43
  - learning_rate: 0.0002
44
  - train_batch_size: 4
45
- - eval_batch_size: 4
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
- - num_devices: 2
49
  - gradient_accumulation_steps: 2
50
- - total_train_batch_size: 16
51
- - total_eval_batch_size: 8
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
@@ -56,24 +53,24 @@ The following hyperparameters were used during training:
56
 
57
  ### Training results
58
 
59
- | Training Loss | Epoch | Step | Validation Loss |
60
- |:-------------:|:------:|:----:|:---------------:|
61
- | 16.725 | 0.9804 | 25 | 6.5520 |
62
- | 1.5122 | 2.0 | 51 | 2.6626 |
63
- | 1.1154 | 2.9804 | 76 | 2.5917 |
64
- | 0.9204 | 4.0 | 102 | 2.6570 |
65
- | 0.779 | 4.9804 | 127 | 2.7498 |
66
- | 0.6207 | 6.0 | 153 | 2.9976 |
67
- | 0.4762 | 6.9804 | 178 | 3.4668 |
68
- | 0.3908 | 8.0 | 204 | 3.8246 |
69
- | 0.3418 | 8.9804 | 229 | 4.0561 |
70
- | 0.3252 | 9.8039 | 250 | 4.0658 |
71
 
72
 
73
  ### Framework versions
74
 
75
  - PEFT 0.10.0
76
  - Transformers 4.40.0
77
- - Pytorch 2.2.1+cu121
78
  - Datasets 2.18.0
79
  - Tokenizers 0.19.1
 
1
  ---
 
2
  library_name: peft
3
  tags:
4
  - alignment-handbook
 
 
5
  - generated_from_trainer
 
6
  datasets:
7
  - llama-duo/synth_summarize_dataset_dedup
8
+ base_model: google/gemma-7b
9
  model-index:
10
  - name: gemma7b-summarize-gpt4o-4k
11
  results: []
 
18
 
19
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 6.0322
22
 
23
  ## Model description
24
 
 
39
  The following hyperparameters were used during training:
40
  - learning_rate: 0.0002
41
  - train_batch_size: 4
42
+ - eval_batch_size: 2
43
  - seed: 42
44
  - distributed_type: multi-GPU
45
+ - num_devices: 8
46
  - gradient_accumulation_steps: 2
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 16
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - lr_scheduler_warmup_ratio: 0.1
 
53
 
54
  ### Training results
55
 
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:-----:|:----:|:---------------:|
58
+ | 44.3098 | 1.0 | 7 | 13.7204 |
59
+ | 25.7366 | 2.0 | 14 | 8.6916 |
60
+ | 19.0375 | 3.0 | 21 | 7.6308 |
61
+ | 18.2973 | 4.0 | 28 | 7.1198 |
62
+ | 14.8387 | 5.0 | 35 | 6.8470 |
63
+ | 12.5684 | 6.0 | 42 | 6.8495 |
64
+ | 9.6308 | 7.0 | 49 | 6.6799 |
65
+ | 5.7187 | 8.0 | 56 | 6.0818 |
66
+ | 4.8487 | 9.0 | 63 | 6.0318 |
67
+ | 4.4303 | 10.0 | 70 | 6.0322 |
68
 
69
 
70
  ### Framework versions
71
 
72
  - PEFT 0.10.0
73
  - Transformers 4.40.0
74
+ - Pytorch 2.1.2+cu121
75
  - Datasets 2.18.0
76
  - Tokenizers 0.19.1
adapter_config.json CHANGED
@@ -21,12 +21,7 @@
21
  "revision": null,
22
  "target_modules": [
23
  "v_proj",
24
- "up_proj",
25
- "down_proj",
26
- "q_proj",
27
- "k_proj",
28
- "gate_proj",
29
- "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
21
  "revision": null,
22
  "target_modules": [
23
  "v_proj",
24
+ "q_proj"
 
 
 
 
 
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d0324a90521c27231d849c8c00f3dca466edadcf64bb738c9a3abae4b6ef709
3
- size 50056096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8bd0611a140bb49692da9a4e4e6c94e205e6baa337c23ddd71a8b04006987ac
3
+ size 6437384
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 9.803921568627452,
3
- "eval_loss": 4.065803050994873,
4
- "eval_runtime": 2.0617,
5
  "eval_samples": 25,
6
- "eval_samples_per_second": 4.85,
7
- "eval_steps_per_second": 0.97,
8
- "total_flos": 1.9110914639160934e+17,
9
- "train_loss": 3.4213723726272582,
10
- "train_runtime": 1646.3335,
11
- "train_samples": 3749,
12
- "train_samples_per_second": 2.448,
13
- "train_steps_per_second": 0.152
14
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_loss": 6.032225608825684,
4
+ "eval_runtime": 0.2331,
5
  "eval_samples": 25,
6
+ "eval_samples_per_second": 42.9,
7
+ "eval_steps_per_second": 4.29,
8
+ "total_flos": 2.1344245100262195e+17,
9
+ "train_loss": 14.937398610796247,
10
+ "train_runtime": 171.2768,
11
+ "train_samples": 4038,
12
+ "train_samples_per_second": 25.573,
13
+ "train_steps_per_second": 0.409
14
  }
config.json CHANGED
@@ -23,9 +23,9 @@
23
  "_load_in_4bit": true,
24
  "_load_in_8bit": false,
25
  "bnb_4bit_compute_dtype": "bfloat16",
26
- "bnb_4bit_quant_storage": "bfloat16",
27
  "bnb_4bit_quant_type": "nf4",
28
- "bnb_4bit_use_double_quant": true,
29
  "llm_int8_enable_fp32_cpu_offload": false,
30
  "llm_int8_has_fp16_weight": false,
31
  "llm_int8_skip_modules": null,
 
23
  "_load_in_4bit": true,
24
  "_load_in_8bit": false,
25
  "bnb_4bit_compute_dtype": "bfloat16",
26
+ "bnb_4bit_quant_storage": "uint8",
27
  "bnb_4bit_quant_type": "nf4",
28
+ "bnb_4bit_use_double_quant": false,
29
  "llm_int8_enable_fp32_cpu_offload": false,
30
  "llm_int8_has_fp16_weight": false,
31
  "llm_int8_skip_modules": null,
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.803921568627452,
3
- "eval_loss": 4.065803050994873,
4
- "eval_runtime": 2.0617,
5
  "eval_samples": 25,
6
- "eval_samples_per_second": 4.85,
7
- "eval_steps_per_second": 0.97
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_loss": 6.032225608825684,
4
+ "eval_runtime": 0.2331,
5
  "eval_samples": 25,
6
+ "eval_samples_per_second": 42.9,
7
+ "eval_steps_per_second": 4.29
8
  }
runs/Jun13_05-47-39_gpu1-2/events.out.tfevents.1718228907.gpu1-2.1119368.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9206e38c31c7f482395e1debcf45d6543f300b9fb75dc5ea12a9754fd215b1
3
+ size 11560
runs/Jun13_05-47-39_gpu1-2/events.out.tfevents.1718229079.gpu1-2.1119368.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83e53ba28fdee41c2c8aca1ef0d2294fdcb16f2132303d6d9ea828e75435eca8
3
+ size 354
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 9.803921568627452,
3
- "total_flos": 1.9110914639160934e+17,
4
- "train_loss": 3.4213723726272582,
5
- "train_runtime": 1646.3335,
6
- "train_samples": 3749,
7
- "train_samples_per_second": 2.448,
8
- "train_steps_per_second": 0.152
9
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 2.1344245100262195e+17,
4
+ "train_loss": 14.937398610796247,
5
+ "train_runtime": 171.2768,
6
+ "train_samples": 4038,
7
+ "train_samples_per_second": 25.573,
8
+ "train_steps_per_second": 0.409
9
  }
trainer_state.json CHANGED
@@ -1,466 +1,214 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.803921568627452,
5
  "eval_steps": 500,
6
- "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0392156862745098,
13
- "grad_norm": 536.0,
14
- "learning_rate": 8.000000000000001e-06,
15
- "loss": 51.7984,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.19607843137254902,
20
- "grad_norm": 314.0,
21
- "learning_rate": 4e-05,
22
- "loss": 43.035,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.39215686274509803,
27
- "grad_norm": 45.25,
28
- "learning_rate": 8e-05,
29
- "loss": 28.3835,
30
- "step": 10
31
- },
32
- {
33
- "epoch": 0.5882352941176471,
34
- "grad_norm": 11.5,
35
- "learning_rate": 0.00012,
36
- "loss": 21.6194,
37
- "step": 15
38
- },
39
- {
40
- "epoch": 0.7843137254901961,
41
- "grad_norm": 10.0625,
42
- "learning_rate": 0.00016,
43
- "loss": 19.3838,
44
- "step": 20
45
- },
46
- {
47
- "epoch": 0.9803921568627451,
48
- "grad_norm": 37.75,
49
- "learning_rate": 0.0002,
50
- "loss": 16.725,
51
- "step": 25
52
- },
53
- {
54
- "epoch": 0.9803921568627451,
55
- "eval_loss": 6.552042484283447,
56
- "eval_runtime": 2.0523,
57
- "eval_samples_per_second": 4.873,
58
- "eval_steps_per_second": 0.975,
59
- "step": 25
60
  },
61
  {
62
- "epoch": 1.1764705882352942,
63
- "grad_norm": 10.875,
64
- "learning_rate": 0.00019975640502598244,
65
- "loss": 7.143,
66
- "step": 30
67
- },
68
- {
69
- "epoch": 1.3725490196078431,
70
- "grad_norm": 7.75,
71
- "learning_rate": 0.00019902680687415705,
72
- "loss": 2.2222,
73
- "step": 35
74
- },
75
- {
76
- "epoch": 1.5686274509803921,
77
- "grad_norm": 2.453125,
78
- "learning_rate": 0.00019781476007338058,
79
- "loss": 1.7799,
80
- "step": 40
81
- },
82
- {
83
- "epoch": 1.7647058823529411,
84
- "grad_norm": 4.34375,
85
- "learning_rate": 0.0001961261695938319,
86
- "loss": 1.5965,
87
- "step": 45
88
- },
89
- {
90
- "epoch": 1.9607843137254903,
91
- "grad_norm": 2.4375,
92
- "learning_rate": 0.00019396926207859084,
93
- "loss": 1.5122,
94
- "step": 50
95
  },
96
  {
97
  "epoch": 2.0,
98
- "eval_loss": 2.6625783443450928,
99
- "eval_runtime": 2.0517,
100
- "eval_samples_per_second": 4.874,
101
- "eval_steps_per_second": 0.975,
102
- "step": 51
103
- },
104
- {
105
- "epoch": 2.156862745098039,
106
- "grad_norm": 1.7421875,
107
- "learning_rate": 0.0001913545457642601,
108
- "loss": 1.3478,
109
- "step": 55
110
- },
111
- {
112
- "epoch": 2.3529411764705883,
113
- "grad_norm": 2.046875,
114
- "learning_rate": 0.00018829475928589271,
115
- "loss": 1.26,
116
- "step": 60
117
  },
118
  {
119
- "epoch": 2.549019607843137,
120
- "grad_norm": 3.8125,
121
- "learning_rate": 0.0001848048096156426,
122
- "loss": 1.2315,
123
- "step": 65
124
  },
125
  {
126
- "epoch": 2.7450980392156863,
127
  "grad_norm": 4.46875,
128
- "learning_rate": 0.00018090169943749476,
129
- "loss": 1.1548,
130
- "step": 70
131
- },
132
- {
133
- "epoch": 2.9411764705882355,
134
- "grad_norm": 1.5,
135
- "learning_rate": 0.0001766044443118978,
136
- "loss": 1.1154,
137
- "step": 75
138
- },
139
- {
140
- "epoch": 2.980392156862745,
141
- "eval_loss": 2.591742753982544,
142
- "eval_runtime": 2.0491,
143
- "eval_samples_per_second": 4.88,
144
- "eval_steps_per_second": 0.976,
145
- "step": 76
146
- },
147
- {
148
- "epoch": 3.1372549019607843,
149
- "grad_norm": 1.9375,
150
- "learning_rate": 0.0001719339800338651,
151
- "loss": 1.0096,
152
- "step": 80
153
- },
154
- {
155
- "epoch": 3.3333333333333335,
156
- "grad_norm": 3.40625,
157
- "learning_rate": 0.00016691306063588583,
158
- "loss": 0.9379,
159
- "step": 85
160
- },
161
- {
162
- "epoch": 3.5294117647058822,
163
- "grad_norm": 1.65625,
164
- "learning_rate": 0.0001615661475325658,
165
- "loss": 0.9333,
166
- "step": 90
167
  },
168
  {
169
- "epoch": 3.7254901960784315,
170
- "grad_norm": 3.109375,
171
- "learning_rate": 0.0001559192903470747,
172
- "loss": 0.9114,
173
- "step": 95
 
174
  },
175
  {
176
- "epoch": 3.9215686274509802,
177
- "grad_norm": 11.0,
178
- "learning_rate": 0.00015000000000000001,
179
- "loss": 0.9204,
180
- "step": 100
181
  },
182
  {
183
  "epoch": 4.0,
184
- "eval_loss": 2.6570026874542236,
185
- "eval_runtime": 2.0401,
186
- "eval_samples_per_second": 4.902,
187
- "eval_steps_per_second": 0.98,
188
- "step": 102
189
  },
190
  {
191
- "epoch": 4.117647058823529,
192
- "grad_norm": 1.4296875,
193
- "learning_rate": 0.00014383711467890774,
194
- "loss": 0.8556,
195
- "step": 105
196
- },
197
- {
198
- "epoch": 4.313725490196078,
199
- "grad_norm": 1.3984375,
200
- "learning_rate": 0.00013746065934159123,
201
- "loss": 0.786,
202
- "step": 110
203
- },
204
- {
205
- "epoch": 4.509803921568627,
206
- "grad_norm": 1.0703125,
207
- "learning_rate": 0.00013090169943749476,
208
- "loss": 0.797,
209
- "step": 115
210
- },
211
- {
212
- "epoch": 4.705882352941177,
213
- "grad_norm": 1.9609375,
214
- "learning_rate": 0.00012419218955996676,
215
- "loss": 0.7689,
216
- "step": 120
217
  },
218
  {
219
- "epoch": 4.901960784313726,
220
- "grad_norm": 1.4296875,
221
  "learning_rate": 0.00011736481776669306,
222
- "loss": 0.779,
223
- "step": 125
224
- },
225
- {
226
- "epoch": 4.980392156862745,
227
- "eval_loss": 2.749800205230713,
228
- "eval_runtime": 2.0472,
229
- "eval_samples_per_second": 4.885,
230
- "eval_steps_per_second": 0.977,
231
- "step": 127
232
- },
233
- {
234
- "epoch": 5.098039215686274,
235
- "grad_norm": 15.25,
236
- "learning_rate": 0.00011045284632676536,
237
- "loss": 0.6889,
238
- "step": 130
239
- },
240
- {
241
- "epoch": 5.294117647058823,
242
- "grad_norm": 1.28125,
243
- "learning_rate": 0.00010348994967025012,
244
- "loss": 0.6232,
245
- "step": 135
246
- },
247
- {
248
- "epoch": 5.490196078431373,
249
- "grad_norm": 1.359375,
250
- "learning_rate": 9.651005032974994e-05,
251
- "loss": 0.6134,
252
- "step": 140
253
  },
254
  {
255
- "epoch": 5.686274509803922,
256
- "grad_norm": 1.25,
257
- "learning_rate": 8.954715367323468e-05,
258
- "loss": 0.612,
259
- "step": 145
 
260
  },
261
  {
262
- "epoch": 5.882352941176471,
263
- "grad_norm": 1.0390625,
264
- "learning_rate": 8.263518223330697e-05,
265
- "loss": 0.6207,
266
- "step": 150
267
  },
268
  {
269
  "epoch": 6.0,
270
- "eval_loss": 2.9976024627685547,
271
- "eval_runtime": 2.0445,
272
- "eval_samples_per_second": 4.891,
273
- "eval_steps_per_second": 0.978,
274
- "step": 153
275
- },
276
- {
277
- "epoch": 6.078431372549019,
278
- "grad_norm": 1.1328125,
279
- "learning_rate": 7.580781044003324e-05,
280
- "loss": 0.5836,
281
- "step": 155
282
- },
283
- {
284
- "epoch": 6.2745098039215685,
285
- "grad_norm": 1.203125,
286
- "learning_rate": 6.909830056250527e-05,
287
- "loss": 0.5047,
288
- "step": 160
289
- },
290
- {
291
- "epoch": 6.470588235294118,
292
- "grad_norm": 1.4765625,
293
- "learning_rate": 6.25393406584088e-05,
294
- "loss": 0.4862,
295
- "step": 165
296
- },
297
- {
298
- "epoch": 6.666666666666667,
299
- "grad_norm": 1.21875,
300
- "learning_rate": 5.616288532109225e-05,
301
- "loss": 0.4953,
302
- "step": 170
303
- },
304
- {
305
- "epoch": 6.862745098039216,
306
- "grad_norm": 1.1484375,
307
- "learning_rate": 5.000000000000002e-05,
308
- "loss": 0.4762,
309
- "step": 175
310
  },
311
  {
312
- "epoch": 6.980392156862745,
313
- "eval_loss": 3.466813325881958,
314
- "eval_runtime": 2.0565,
315
- "eval_samples_per_second": 4.863,
316
- "eval_steps_per_second": 0.973,
317
- "step": 178
318
- },
319
- {
320
- "epoch": 7.0588235294117645,
321
- "grad_norm": 1.203125,
322
- "learning_rate": 4.4080709652925336e-05,
323
- "loss": 0.4669,
324
- "step": 180
325
- },
326
- {
327
- "epoch": 7.254901960784314,
328
- "grad_norm": 1.4140625,
329
- "learning_rate": 3.843385246743417e-05,
330
- "loss": 0.3974,
331
- "step": 185
332
  },
333
  {
334
- "epoch": 7.450980392156863,
335
- "grad_norm": 1.0625,
336
- "learning_rate": 3.308693936411421e-05,
337
- "loss": 0.3956,
338
- "step": 190
 
339
  },
340
  {
341
- "epoch": 7.647058823529412,
342
- "grad_norm": 1.0703125,
343
- "learning_rate": 2.8066019966134904e-05,
344
- "loss": 0.386,
345
- "step": 195
346
  },
347
  {
348
- "epoch": 7.8431372549019605,
349
- "grad_norm": 1.1328125,
350
- "learning_rate": 2.339555568810221e-05,
351
- "loss": 0.3908,
352
- "step": 200
353
  },
354
  {
355
  "epoch": 8.0,
356
- "eval_loss": 3.824570417404175,
357
- "eval_runtime": 2.0426,
358
- "eval_samples_per_second": 4.896,
359
- "eval_steps_per_second": 0.979,
360
- "step": 204
361
- },
362
- {
363
- "epoch": 8.03921568627451,
364
- "grad_norm": 0.984375,
365
- "learning_rate": 1.9098300562505266e-05,
366
- "loss": 0.3721,
367
- "step": 205
368
- },
369
- {
370
- "epoch": 8.235294117647058,
371
- "grad_norm": 0.96875,
372
- "learning_rate": 1.5195190384357404e-05,
373
- "loss": 0.3342,
374
- "step": 210
375
- },
376
- {
377
- "epoch": 8.431372549019608,
378
- "grad_norm": 1.046875,
379
- "learning_rate": 1.1705240714107302e-05,
380
- "loss": 0.346,
381
- "step": 215
382
  },
383
  {
384
- "epoch": 8.627450980392156,
385
- "grad_norm": 1.046875,
386
- "learning_rate": 8.645454235739903e-06,
387
- "loss": 0.3381,
388
- "step": 220
389
- },
390
- {
391
- "epoch": 8.823529411764707,
392
- "grad_norm": 1.046875,
393
- "learning_rate": 6.030737921409169e-06,
394
- "loss": 0.3418,
395
- "step": 225
396
- },
397
- {
398
- "epoch": 8.980392156862745,
399
- "eval_loss": 4.056135177612305,
400
- "eval_runtime": 2.0561,
401
- "eval_samples_per_second": 4.864,
402
- "eval_steps_per_second": 0.973,
403
- "step": 229
404
- },
405
- {
406
- "epoch": 9.019607843137255,
407
- "grad_norm": 0.88671875,
408
- "learning_rate": 3.873830406168111e-06,
409
- "loss": 0.3294,
410
- "step": 230
411
- },
412
- {
413
- "epoch": 9.215686274509803,
414
- "grad_norm": 0.89453125,
415
- "learning_rate": 2.1852399266194314e-06,
416
- "loss": 0.3265,
417
- "step": 235
418
  },
419
  {
420
- "epoch": 9.411764705882353,
421
- "grad_norm": 0.9296875,
422
- "learning_rate": 9.731931258429638e-07,
423
- "loss": 0.3268,
424
- "step": 240
 
425
  },
426
  {
427
- "epoch": 9.607843137254902,
428
- "grad_norm": 0.91796875,
429
- "learning_rate": 2.4359497401758024e-07,
430
- "loss": 0.3257,
431
- "step": 245
432
  },
433
  {
434
- "epoch": 9.803921568627452,
435
- "grad_norm": 0.9140625,
436
  "learning_rate": 0.0,
437
- "loss": 0.3252,
438
- "step": 250
439
  },
440
  {
441
- "epoch": 9.803921568627452,
442
- "eval_loss": 4.065803050994873,
443
- "eval_runtime": 2.0429,
444
- "eval_samples_per_second": 4.895,
445
- "eval_steps_per_second": 0.979,
446
- "step": 250
447
  },
448
  {
449
- "epoch": 9.803921568627452,
450
- "step": 250,
451
- "total_flos": 1.9110914639160934e+17,
452
- "train_loss": 3.4213723726272582,
453
- "train_runtime": 1646.3335,
454
- "train_samples_per_second": 2.448,
455
- "train_steps_per_second": 0.152
456
  }
457
  ],
458
  "logging_steps": 5,
459
- "max_steps": 250,
460
  "num_input_tokens_seen": 0,
461
  "num_train_epochs": 10,
462
  "save_steps": 100,
463
- "total_flos": 1.9110914639160934e+17,
464
  "train_batch_size": 4,
465
  "trial_name": null,
466
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 70,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.14285714285714285,
13
+ "grad_norm": 183.0,
14
+ "learning_rate": 2.857142857142857e-05,
15
+ "loss": 47.2142,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.7142857142857143,
20
+ "grad_norm": 97.5,
21
+ "learning_rate": 0.00014285714285714287,
22
+ "loss": 44.3098,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 1.0,
27
+ "eval_loss": 13.720385551452637,
28
+ "eval_runtime": 0.2481,
29
+ "eval_samples_per_second": 40.305,
30
+ "eval_steps_per_second": 4.031,
31
+ "step": 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  },
33
  {
34
+ "epoch": 1.4285714285714286,
35
+ "grad_norm": 18.375,
36
+ "learning_rate": 0.00019888308262251285,
37
+ "loss": 25.7366,
38
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  },
40
  {
41
  "epoch": 2.0,
42
+ "eval_loss": 8.691579818725586,
43
+ "eval_runtime": 0.2437,
44
+ "eval_samples_per_second": 41.04,
45
+ "eval_steps_per_second": 4.104,
46
+ "step": 14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  },
48
  {
49
+ "epoch": 2.142857142857143,
50
+ "grad_norm": 4.53125,
51
+ "learning_rate": 0.00019214762118704076,
52
+ "loss": 20.6774,
53
+ "step": 15
54
  },
55
  {
56
+ "epoch": 2.857142857142857,
57
  "grad_norm": 4.46875,
58
+ "learning_rate": 0.00017971325072229226,
59
+ "loss": 19.0375,
60
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
  {
63
+ "epoch": 3.0,
64
+ "eval_loss": 7.630770683288574,
65
+ "eval_runtime": 0.2338,
66
+ "eval_samples_per_second": 42.776,
67
+ "eval_steps_per_second": 4.278,
68
+ "step": 21
69
  },
70
  {
71
+ "epoch": 3.571428571428571,
72
+ "grad_norm": 8.375,
73
+ "learning_rate": 0.00016234898018587337,
74
+ "loss": 18.2973,
75
+ "step": 25
76
  },
77
  {
78
  "epoch": 4.0,
79
+ "eval_loss": 7.119815826416016,
80
+ "eval_runtime": 0.2413,
81
+ "eval_samples_per_second": 41.435,
82
+ "eval_steps_per_second": 4.143,
83
+ "step": 28
84
  },
85
  {
86
+ "epoch": 4.285714285714286,
87
+ "grad_norm": 12.3125,
88
+ "learning_rate": 0.00014112871031306119,
89
+ "loss": 16.6166,
90
+ "step": 30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  },
92
  {
93
+ "epoch": 5.0,
94
+ "grad_norm": 16.625,
95
  "learning_rate": 0.00011736481776669306,
96
+ "loss": 14.8387,
97
+ "step": 35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  },
99
  {
100
+ "epoch": 5.0,
101
+ "eval_loss": 6.846951484680176,
102
+ "eval_runtime": 0.2339,
103
+ "eval_samples_per_second": 42.753,
104
+ "eval_steps_per_second": 4.275,
105
+ "step": 35
106
  },
107
  {
108
+ "epoch": 5.714285714285714,
109
+ "grad_norm": 23.5,
110
+ "learning_rate": 9.252699064135758e-05,
111
+ "loss": 12.5684,
112
+ "step": 40
113
  },
114
  {
115
  "epoch": 6.0,
116
+ "eval_loss": 6.8495354652404785,
117
+ "eval_runtime": 0.2342,
118
+ "eval_samples_per_second": 42.698,
119
+ "eval_steps_per_second": 4.27,
120
+ "step": 42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  },
122
  {
123
+ "epoch": 6.428571428571429,
124
+ "grad_norm": 26.0,
125
+ "learning_rate": 6.815133497483157e-05,
126
+ "loss": 9.6308,
127
+ "step": 45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  },
129
  {
130
+ "epoch": 7.0,
131
+ "eval_loss": 6.679884910583496,
132
+ "eval_runtime": 0.2349,
133
+ "eval_samples_per_second": 42.574,
134
+ "eval_steps_per_second": 4.257,
135
+ "step": 49
136
  },
137
  {
138
+ "epoch": 7.142857142857143,
139
+ "grad_norm": 26.25,
140
+ "learning_rate": 4.574537361342407e-05,
141
+ "loss": 7.3589,
142
+ "step": 50
143
  },
144
  {
145
+ "epoch": 7.857142857142857,
146
+ "grad_norm": 23.125,
147
+ "learning_rate": 2.669481281701739e-05,
148
+ "loss": 5.7187,
149
+ "step": 55
150
  },
151
  {
152
  "epoch": 8.0,
153
+ "eval_loss": 6.081761360168457,
154
+ "eval_runtime": 0.2356,
155
+ "eval_samples_per_second": 42.45,
156
+ "eval_steps_per_second": 4.245,
157
+ "step": 56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  },
159
  {
160
+ "epoch": 8.571428571428571,
161
+ "grad_norm": 22.625,
162
+ "learning_rate": 1.2177842662977135e-05,
163
+ "loss": 4.8487,
164
+ "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  },
166
  {
167
+ "epoch": 9.0,
168
+ "eval_loss": 6.031832695007324,
169
+ "eval_runtime": 0.2458,
170
+ "eval_samples_per_second": 40.691,
171
+ "eval_steps_per_second": 4.069,
172
+ "step": 63
173
  },
174
  {
175
+ "epoch": 9.285714285714286,
176
+ "grad_norm": 22.25,
177
+ "learning_rate": 3.092271377092215e-06,
178
+ "loss": 4.4729,
179
+ "step": 65
180
  },
181
  {
182
+ "epoch": 10.0,
183
+ "grad_norm": 21.75,
184
  "learning_rate": 0.0,
185
+ "loss": 4.4303,
186
+ "step": 70
187
  },
188
  {
189
+ "epoch": 10.0,
190
+ "eval_loss": 6.032225608825684,
191
+ "eval_runtime": 0.2341,
192
+ "eval_samples_per_second": 42.713,
193
+ "eval_steps_per_second": 4.271,
194
+ "step": 70
195
  },
196
  {
197
+ "epoch": 10.0,
198
+ "step": 70,
199
+ "total_flos": 2.1344245100262195e+17,
200
+ "train_loss": 14.937398610796247,
201
+ "train_runtime": 171.2768,
202
+ "train_samples_per_second": 25.573,
203
+ "train_steps_per_second": 0.409
204
  }
205
  ],
206
  "logging_steps": 5,
207
+ "max_steps": 70,
208
  "num_input_tokens_seen": 0,
209
  "num_train_epochs": 10,
210
  "save_steps": 100,
211
+ "total_flos": 2.1344245100262195e+17,
212
  "train_batch_size": 4,
213
  "trial_name": null,
214
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfce4a04a478c718b10def89fee944166ea94cfde8205dce51c434187249fb6b
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:419732def1570977ca1dc458a816ea21cf64063785f04b3d10fe09e6b0bc2b83
3
  size 5176