chansung committed on
Commit
0e91216
1 Parent(s): 5138b4d

Model save

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: gemma
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ base_model: google/gemma-7b
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: gemma7b-summarize-gpt4o-4k
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # gemma7b-summarize-gpt4o-4k
20
+
21
+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
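+ It achieves the following results on the evaluation set at the end of training (step 250); note that the lowest validation loss in the training results table, 2.5917, was reached earlier, at step 76: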
23
+ - Loss: 4.0658
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 4
44
+ - eval_batch_size: 4
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 2
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 16
50
+ - total_eval_batch_size: 8
51
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:------:|:----:|:---------------:|
60
+ | 16.725 | 0.9804 | 25 | 6.5520 |
61
+ | 1.5122 | 2.0 | 51 | 2.6626 |
62
+ | 1.1154 | 2.9804 | 76 | 2.5917 |
63
+ | 0.9204 | 4.0 | 102 | 2.6570 |
64
+ | 0.779 | 4.9804 | 127 | 2.7498 |
65
+ | 0.6207 | 6.0 | 153 | 2.9976 |
66
+ | 0.4762 | 6.9804 | 178 | 3.4668 |
67
+ | 0.3908 | 8.0 | 204 | 3.8246 |
68
+ | 0.3418 | 8.9804 | 229 | 4.0561 |
69
+ | 0.3252 | 9.8039 | 250 | 4.0658 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.10.0
75
+ - Transformers 4.40.0
76
+ - Pytorch 2.2.1+cu121
77
+ - Datasets 2.18.0
78
+ - Tokenizers 0.19.1
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:319516c7ca137896c6dbfc37cb949462ee6b86790e17cc16617d54c05d859fa5
3
  size 50056096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d0324a90521c27231d849c8c00f3dca466edadcf64bb738c9a3abae4b6ef709
3
  size 50056096
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.803921568627452,
3
+ "total_flos": 1.9110914639160934e+17,
4
+ "train_loss": 3.4213723726272582,
5
+ "train_runtime": 1646.3335,
6
+ "train_samples": 3749,
7
+ "train_samples_per_second": 2.448,
8
+ "train_steps_per_second": 0.152
9
+ }
runs/Jun08_13-44-34_user-WS-C621E-SAGE-Series/events.out.tfevents.1717821906.user-WS-C621E-SAGE-Series.10929.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fb1fe411be3cfcf3320e2769a3e424c4628b2a3bc082a626ac840a87ff2646d
3
- size 15847
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53bf5031ac84a1ae4123cb069b020b28f0a899c55c97fd511e7cfd287cd9a90
3
+ size 19124
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.803921568627452,
3
+ "total_flos": 1.9110914639160934e+17,
4
+ "train_loss": 3.4213723726272582,
5
+ "train_runtime": 1646.3335,
6
+ "train_samples": 3749,
7
+ "train_samples_per_second": 2.448,
8
+ "train_steps_per_second": 0.152
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.803921568627452,
5
+ "eval_steps": 500,
6
+ "global_step": 250,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0392156862745098,
13
+ "grad_norm": 536.0,
14
+ "learning_rate": 8.000000000000001e-06,
15
+ "loss": 51.7984,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.19607843137254902,
20
+ "grad_norm": 314.0,
21
+ "learning_rate": 4e-05,
22
+ "loss": 43.035,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.39215686274509803,
27
+ "grad_norm": 45.25,
28
+ "learning_rate": 8e-05,
29
+ "loss": 28.3835,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.5882352941176471,
34
+ "grad_norm": 11.5,
35
+ "learning_rate": 0.00012,
36
+ "loss": 21.6194,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.7843137254901961,
41
+ "grad_norm": 10.0625,
42
+ "learning_rate": 0.00016,
43
+ "loss": 19.3838,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.9803921568627451,
48
+ "grad_norm": 37.75,
49
+ "learning_rate": 0.0002,
50
+ "loss": 16.725,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.9803921568627451,
55
+ "eval_loss": 6.552042484283447,
56
+ "eval_runtime": 2.0523,
57
+ "eval_samples_per_second": 4.873,
58
+ "eval_steps_per_second": 0.975,
59
+ "step": 25
60
+ },
61
+ {
62
+ "epoch": 1.1764705882352942,
63
+ "grad_norm": 10.875,
64
+ "learning_rate": 0.00019975640502598244,
65
+ "loss": 7.143,
66
+ "step": 30
67
+ },
68
+ {
69
+ "epoch": 1.3725490196078431,
70
+ "grad_norm": 7.75,
71
+ "learning_rate": 0.00019902680687415705,
72
+ "loss": 2.2222,
73
+ "step": 35
74
+ },
75
+ {
76
+ "epoch": 1.5686274509803921,
77
+ "grad_norm": 2.453125,
78
+ "learning_rate": 0.00019781476007338058,
79
+ "loss": 1.7799,
80
+ "step": 40
81
+ },
82
+ {
83
+ "epoch": 1.7647058823529411,
84
+ "grad_norm": 4.34375,
85
+ "learning_rate": 0.0001961261695938319,
86
+ "loss": 1.5965,
87
+ "step": 45
88
+ },
89
+ {
90
+ "epoch": 1.9607843137254903,
91
+ "grad_norm": 2.4375,
92
+ "learning_rate": 0.00019396926207859084,
93
+ "loss": 1.5122,
94
+ "step": 50
95
+ },
96
+ {
97
+ "epoch": 2.0,
98
+ "eval_loss": 2.6625783443450928,
99
+ "eval_runtime": 2.0517,
100
+ "eval_samples_per_second": 4.874,
101
+ "eval_steps_per_second": 0.975,
102
+ "step": 51
103
+ },
104
+ {
105
+ "epoch": 2.156862745098039,
106
+ "grad_norm": 1.7421875,
107
+ "learning_rate": 0.0001913545457642601,
108
+ "loss": 1.3478,
109
+ "step": 55
110
+ },
111
+ {
112
+ "epoch": 2.3529411764705883,
113
+ "grad_norm": 2.046875,
114
+ "learning_rate": 0.00018829475928589271,
115
+ "loss": 1.26,
116
+ "step": 60
117
+ },
118
+ {
119
+ "epoch": 2.549019607843137,
120
+ "grad_norm": 3.8125,
121
+ "learning_rate": 0.0001848048096156426,
122
+ "loss": 1.2315,
123
+ "step": 65
124
+ },
125
+ {
126
+ "epoch": 2.7450980392156863,
127
+ "grad_norm": 4.46875,
128
+ "learning_rate": 0.00018090169943749476,
129
+ "loss": 1.1548,
130
+ "step": 70
131
+ },
132
+ {
133
+ "epoch": 2.9411764705882355,
134
+ "grad_norm": 1.5,
135
+ "learning_rate": 0.0001766044443118978,
136
+ "loss": 1.1154,
137
+ "step": 75
138
+ },
139
+ {
140
+ "epoch": 2.980392156862745,
141
+ "eval_loss": 2.591742753982544,
142
+ "eval_runtime": 2.0491,
143
+ "eval_samples_per_second": 4.88,
144
+ "eval_steps_per_second": 0.976,
145
+ "step": 76
146
+ },
147
+ {
148
+ "epoch": 3.1372549019607843,
149
+ "grad_norm": 1.9375,
150
+ "learning_rate": 0.0001719339800338651,
151
+ "loss": 1.0096,
152
+ "step": 80
153
+ },
154
+ {
155
+ "epoch": 3.3333333333333335,
156
+ "grad_norm": 3.40625,
157
+ "learning_rate": 0.00016691306063588583,
158
+ "loss": 0.9379,
159
+ "step": 85
160
+ },
161
+ {
162
+ "epoch": 3.5294117647058822,
163
+ "grad_norm": 1.65625,
164
+ "learning_rate": 0.0001615661475325658,
165
+ "loss": 0.9333,
166
+ "step": 90
167
+ },
168
+ {
169
+ "epoch": 3.7254901960784315,
170
+ "grad_norm": 3.109375,
171
+ "learning_rate": 0.0001559192903470747,
172
+ "loss": 0.9114,
173
+ "step": 95
174
+ },
175
+ {
176
+ "epoch": 3.9215686274509802,
177
+ "grad_norm": 11.0,
178
+ "learning_rate": 0.00015000000000000001,
179
+ "loss": 0.9204,
180
+ "step": 100
181
+ },
182
+ {
183
+ "epoch": 4.0,
184
+ "eval_loss": 2.6570026874542236,
185
+ "eval_runtime": 2.0401,
186
+ "eval_samples_per_second": 4.902,
187
+ "eval_steps_per_second": 0.98,
188
+ "step": 102
189
+ },
190
+ {
191
+ "epoch": 4.117647058823529,
192
+ "grad_norm": 1.4296875,
193
+ "learning_rate": 0.00014383711467890774,
194
+ "loss": 0.8556,
195
+ "step": 105
196
+ },
197
+ {
198
+ "epoch": 4.313725490196078,
199
+ "grad_norm": 1.3984375,
200
+ "learning_rate": 0.00013746065934159123,
201
+ "loss": 0.786,
202
+ "step": 110
203
+ },
204
+ {
205
+ "epoch": 4.509803921568627,
206
+ "grad_norm": 1.0703125,
207
+ "learning_rate": 0.00013090169943749476,
208
+ "loss": 0.797,
209
+ "step": 115
210
+ },
211
+ {
212
+ "epoch": 4.705882352941177,
213
+ "grad_norm": 1.9609375,
214
+ "learning_rate": 0.00012419218955996676,
215
+ "loss": 0.7689,
216
+ "step": 120
217
+ },
218
+ {
219
+ "epoch": 4.901960784313726,
220
+ "grad_norm": 1.4296875,
221
+ "learning_rate": 0.00011736481776669306,
222
+ "loss": 0.779,
223
+ "step": 125
224
+ },
225
+ {
226
+ "epoch": 4.980392156862745,
227
+ "eval_loss": 2.749800205230713,
228
+ "eval_runtime": 2.0472,
229
+ "eval_samples_per_second": 4.885,
230
+ "eval_steps_per_second": 0.977,
231
+ "step": 127
232
+ },
233
+ {
234
+ "epoch": 5.098039215686274,
235
+ "grad_norm": 15.25,
236
+ "learning_rate": 0.00011045284632676536,
237
+ "loss": 0.6889,
238
+ "step": 130
239
+ },
240
+ {
241
+ "epoch": 5.294117647058823,
242
+ "grad_norm": 1.28125,
243
+ "learning_rate": 0.00010348994967025012,
244
+ "loss": 0.6232,
245
+ "step": 135
246
+ },
247
+ {
248
+ "epoch": 5.490196078431373,
249
+ "grad_norm": 1.359375,
250
+ "learning_rate": 9.651005032974994e-05,
251
+ "loss": 0.6134,
252
+ "step": 140
253
+ },
254
+ {
255
+ "epoch": 5.686274509803922,
256
+ "grad_norm": 1.25,
257
+ "learning_rate": 8.954715367323468e-05,
258
+ "loss": 0.612,
259
+ "step": 145
260
+ },
261
+ {
262
+ "epoch": 5.882352941176471,
263
+ "grad_norm": 1.0390625,
264
+ "learning_rate": 8.263518223330697e-05,
265
+ "loss": 0.6207,
266
+ "step": 150
267
+ },
268
+ {
269
+ "epoch": 6.0,
270
+ "eval_loss": 2.9976024627685547,
271
+ "eval_runtime": 2.0445,
272
+ "eval_samples_per_second": 4.891,
273
+ "eval_steps_per_second": 0.978,
274
+ "step": 153
275
+ },
276
+ {
277
+ "epoch": 6.078431372549019,
278
+ "grad_norm": 1.1328125,
279
+ "learning_rate": 7.580781044003324e-05,
280
+ "loss": 0.5836,
281
+ "step": 155
282
+ },
283
+ {
284
+ "epoch": 6.2745098039215685,
285
+ "grad_norm": 1.203125,
286
+ "learning_rate": 6.909830056250527e-05,
287
+ "loss": 0.5047,
288
+ "step": 160
289
+ },
290
+ {
291
+ "epoch": 6.470588235294118,
292
+ "grad_norm": 1.4765625,
293
+ "learning_rate": 6.25393406584088e-05,
294
+ "loss": 0.4862,
295
+ "step": 165
296
+ },
297
+ {
298
+ "epoch": 6.666666666666667,
299
+ "grad_norm": 1.21875,
300
+ "learning_rate": 5.616288532109225e-05,
301
+ "loss": 0.4953,
302
+ "step": 170
303
+ },
304
+ {
305
+ "epoch": 6.862745098039216,
306
+ "grad_norm": 1.1484375,
307
+ "learning_rate": 5.000000000000002e-05,
308
+ "loss": 0.4762,
309
+ "step": 175
310
+ },
311
+ {
312
+ "epoch": 6.980392156862745,
313
+ "eval_loss": 3.466813325881958,
314
+ "eval_runtime": 2.0565,
315
+ "eval_samples_per_second": 4.863,
316
+ "eval_steps_per_second": 0.973,
317
+ "step": 178
318
+ },
319
+ {
320
+ "epoch": 7.0588235294117645,
321
+ "grad_norm": 1.203125,
322
+ "learning_rate": 4.4080709652925336e-05,
323
+ "loss": 0.4669,
324
+ "step": 180
325
+ },
326
+ {
327
+ "epoch": 7.254901960784314,
328
+ "grad_norm": 1.4140625,
329
+ "learning_rate": 3.843385246743417e-05,
330
+ "loss": 0.3974,
331
+ "step": 185
332
+ },
333
+ {
334
+ "epoch": 7.450980392156863,
335
+ "grad_norm": 1.0625,
336
+ "learning_rate": 3.308693936411421e-05,
337
+ "loss": 0.3956,
338
+ "step": 190
339
+ },
340
+ {
341
+ "epoch": 7.647058823529412,
342
+ "grad_norm": 1.0703125,
343
+ "learning_rate": 2.8066019966134904e-05,
344
+ "loss": 0.386,
345
+ "step": 195
346
+ },
347
+ {
348
+ "epoch": 7.8431372549019605,
349
+ "grad_norm": 1.1328125,
350
+ "learning_rate": 2.339555568810221e-05,
351
+ "loss": 0.3908,
352
+ "step": 200
353
+ },
354
+ {
355
+ "epoch": 8.0,
356
+ "eval_loss": 3.824570417404175,
357
+ "eval_runtime": 2.0426,
358
+ "eval_samples_per_second": 4.896,
359
+ "eval_steps_per_second": 0.979,
360
+ "step": 204
361
+ },
362
+ {
363
+ "epoch": 8.03921568627451,
364
+ "grad_norm": 0.984375,
365
+ "learning_rate": 1.9098300562505266e-05,
366
+ "loss": 0.3721,
367
+ "step": 205
368
+ },
369
+ {
370
+ "epoch": 8.235294117647058,
371
+ "grad_norm": 0.96875,
372
+ "learning_rate": 1.5195190384357404e-05,
373
+ "loss": 0.3342,
374
+ "step": 210
375
+ },
376
+ {
377
+ "epoch": 8.431372549019608,
378
+ "grad_norm": 1.046875,
379
+ "learning_rate": 1.1705240714107302e-05,
380
+ "loss": 0.346,
381
+ "step": 215
382
+ },
383
+ {
384
+ "epoch": 8.627450980392156,
385
+ "grad_norm": 1.046875,
386
+ "learning_rate": 8.645454235739903e-06,
387
+ "loss": 0.3381,
388
+ "step": 220
389
+ },
390
+ {
391
+ "epoch": 8.823529411764707,
392
+ "grad_norm": 1.046875,
393
+ "learning_rate": 6.030737921409169e-06,
394
+ "loss": 0.3418,
395
+ "step": 225
396
+ },
397
+ {
398
+ "epoch": 8.980392156862745,
399
+ "eval_loss": 4.056135177612305,
400
+ "eval_runtime": 2.0561,
401
+ "eval_samples_per_second": 4.864,
402
+ "eval_steps_per_second": 0.973,
403
+ "step": 229
404
+ },
405
+ {
406
+ "epoch": 9.019607843137255,
407
+ "grad_norm": 0.88671875,
408
+ "learning_rate": 3.873830406168111e-06,
409
+ "loss": 0.3294,
410
+ "step": 230
411
+ },
412
+ {
413
+ "epoch": 9.215686274509803,
414
+ "grad_norm": 0.89453125,
415
+ "learning_rate": 2.1852399266194314e-06,
416
+ "loss": 0.3265,
417
+ "step": 235
418
+ },
419
+ {
420
+ "epoch": 9.411764705882353,
421
+ "grad_norm": 0.9296875,
422
+ "learning_rate": 9.731931258429638e-07,
423
+ "loss": 0.3268,
424
+ "step": 240
425
+ },
426
+ {
427
+ "epoch": 9.607843137254902,
428
+ "grad_norm": 0.91796875,
429
+ "learning_rate": 2.4359497401758024e-07,
430
+ "loss": 0.3257,
431
+ "step": 245
432
+ },
433
+ {
434
+ "epoch": 9.803921568627452,
435
+ "grad_norm": 0.9140625,
436
+ "learning_rate": 0.0,
437
+ "loss": 0.3252,
438
+ "step": 250
439
+ },
440
+ {
441
+ "epoch": 9.803921568627452,
442
+ "eval_loss": 4.065803050994873,
443
+ "eval_runtime": 2.0429,
444
+ "eval_samples_per_second": 4.895,
445
+ "eval_steps_per_second": 0.979,
446
+ "step": 250
447
+ },
448
+ {
449
+ "epoch": 9.803921568627452,
450
+ "step": 250,
451
+ "total_flos": 1.9110914639160934e+17,
452
+ "train_loss": 3.4213723726272582,
453
+ "train_runtime": 1646.3335,
454
+ "train_samples_per_second": 2.448,
455
+ "train_steps_per_second": 0.152
456
+ }
457
+ ],
458
+ "logging_steps": 5,
459
+ "max_steps": 250,
460
+ "num_input_tokens_seen": 0,
461
+ "num_train_epochs": 10,
462
+ "save_steps": 100,
463
+ "total_flos": 1.9110914639160934e+17,
464
+ "train_batch_size": 4,
465
+ "trial_name": null,
466
+ "trial_params": null
467
+ }