Qin Liu committed on
Commit
5fe4b0e
1 Parent(s): 093e233

Model save

README.md CHANGED
@@ -1,13 +1,11 @@
 ---
 base_model: meta-llama/Meta-Llama-3-8B
-datasets:
-- HuggingFaceH4/ultrachat_200k
 library_name: peft
 license: llama3
 tags:
-- alignment-handbook
 - trl
 - sft
+- alignment-handbook
 - generated_from_trainer
 model-index:
 - name: llama3-sudo
@@ -19,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # llama3-sudo
 
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the HuggingFaceH4/ultrachat_200k dataset.
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.0709
+- Loss: 1.0100
 
 ## Model description
 
@@ -41,12 +39,12 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
-- train_batch_size: 8
+- train_batch_size: 16
 - eval_batch_size: 4
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 4
-- gradient_accumulation_steps: 8
+- gradient_accumulation_steps: 4
 - total_train_batch_size: 256
 - total_eval_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
@@ -60,6 +58,8 @@ The following hyperparameters were used during training:
 |:-------------:|:------:|:----:|:---------------:|
 | 1.3252        | 0.9697 | 24   | 1.1693          |
 | 1.1352        | 1.9798 | 49   | 1.0709          |
+| 1.1265        | 1.9899 | 98   | 1.0308          |
+| 1.1113        | 2.9798 | 147  | 1.0100          |
 
 
 ### Framework versions
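Note that the batch-size change in this diff is shape-preserving: the per-device batch size doubles while gradient accumulation halves, so the effective batch stays at 256 on 4 GPUs. A minimal sketch of that arithmetic, assuming the usual effective-batch formula used by the HF Trainer (the helper function is illustrative, not part of the repo):

```python
# Effective batch size = per-device batch * gradient accumulation steps * device count.
# Values are taken from the README diff above.
def effective_batch_size(per_device: int, grad_accum: int, num_devices: int) -> int:
    return per_device * grad_accum * num_devices

old = effective_batch_size(per_device=8, grad_accum=8, num_devices=4)   # 256
new = effective_batch_size(per_device=16, grad_accum=4, num_devices=4)  # 256
assert old == new == 256  # matches total_train_batch_size in both README versions
```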
all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 2.909090909090909,
-    "total_flos": 644347544469504.0,
-    "train_loss": 0.0,
-    "train_runtime": 0.0111,
+    "epoch": 2.9797979797979797,
+    "total_flos": 981865656745984.0,
+    "train_loss": 0.5657350935903537,
+    "train_runtime": 914.4964,
     "train_samples": 6321,
-    "train_samples_per_second": 1702737.829,
-    "train_steps_per_second": 6465.07
+    "train_samples_per_second": 20.736,
+    "train_steps_per_second": 0.161
 }
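The pre-commit numbers (runtime 0.0111 s, loss 0.0, ~1.7M samples/s) look like leftovers from an aborted run; the replacement values are internally consistent. A quick sanity check, assuming throughput is computed over all three epochs:

```python
# Throughput should be roughly train_samples * num_train_epochs / train_runtime,
# using the updated values from the diff above.
train_samples, num_epochs, train_runtime = 6321, 3, 914.4964

samples_per_second = train_samples * num_epochs / train_runtime
print(round(samples_per_second, 3))  # ~20.736, matching train_samples_per_second
```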
runs/Aug13_19-05-34_ip-172-31-10-237/events.out.tfevents.1723575945.ip-172-31-10-237.662759.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2a9fbe250686fd5af471baa634830930f33838aea00a8eb3dfcb0d7751deb67
-size 8938
+oid sha256:8c20b797bd8224221ce2960fdc3e46fdeae386ed3cafad64f825e32718371658
+size 9563
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 2.909090909090909,
-    "total_flos": 644347544469504.0,
-    "train_loss": 0.0,
-    "train_runtime": 0.0111,
+    "epoch": 2.9797979797979797,
+    "total_flos": 981865656745984.0,
+    "train_loss": 0.5657350935903537,
+    "train_runtime": 914.4964,
     "train_samples": 6321,
-    "train_samples_per_second": 1702737.829,
-    "train_steps_per_second": 6465.07
+    "train_samples_per_second": 20.736,
+    "train_steps_per_second": 0.161
 }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.909090909090909,
+  "epoch": 2.9797979797979797,
   "eval_steps": 500,
-  "global_step": 72,
+  "global_step": 147,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -130,17 +130,138 @@
       "step": 70
     },
     {
-      "epoch": 2.909090909090909,
-      "step": 72,
-      "total_flos": 644347544469504.0,
-      "train_loss": 0.0,
-      "train_runtime": 0.0111,
-      "train_samples_per_second": 1702737.829,
-      "train_steps_per_second": 6465.07
+      "epoch": 1.5252525252525253,
+      "grad_norm": 1.5131939349650319,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.4021,
+      "step": 75
+    },
+    {
+      "epoch": 1.6262626262626263,
+      "grad_norm": 0.4578537596053918,
+      "learning_rate": 0.00010237976975461075,
+      "loss": 1.2517,
+      "step": 80
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 0.33225716851066756,
+      "learning_rate": 9.049439566958175e-05,
+      "loss": 1.139,
+      "step": 85
+    },
+    {
+      "epoch": 1.8282828282828283,
+      "grad_norm": 0.27743266981716935,
+      "learning_rate": 7.874347104470234e-05,
+      "loss": 1.0842,
+      "step": 90
+    },
+    {
+      "epoch": 1.9292929292929293,
+      "grad_norm": 0.3335911253392335,
+      "learning_rate": 6.729320366825784e-05,
+      "loss": 1.1265,
+      "step": 95
+    },
+    {
+      "epoch": 1.98989898989899,
+      "eval_loss": 1.0307520627975464,
+      "eval_runtime": 165.0307,
+      "eval_samples_per_second": 38.302,
+      "eval_steps_per_second": 2.4,
+      "step": 98
+    },
+    {
+      "epoch": 2.0303030303030303,
+      "grad_norm": 0.2767977674400862,
+      "learning_rate": 5.630554876306407e-05,
+      "loss": 1.0492,
+      "step": 100
+    },
+    {
+      "epoch": 2.1313131313131315,
+      "grad_norm": 0.2747976722762822,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 1.0602,
+      "step": 105
+    },
+    {
+      "epoch": 2.2323232323232323,
+      "grad_norm": 0.2987036396560577,
+      "learning_rate": 3.6330982588091186e-05,
+      "loss": 1.0785,
+      "step": 110
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": 0.27812388202445604,
+      "learning_rate": 2.7626596189492983e-05,
+      "loss": 1.0563,
+      "step": 115
+    },
+    {
+      "epoch": 2.4343434343434343,
+      "grad_norm": 0.24657888130613284,
+      "learning_rate": 1.994587590756397e-05,
+      "loss": 1.0835,
+      "step": 120
+    },
+    {
+      "epoch": 2.5353535353535355,
+      "grad_norm": 0.24337497990605553,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 1.071,
+      "step": 125
+    },
+    {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 0.2979117602370218,
+      "learning_rate": 8.073969641833445e-06,
+      "loss": 1.036,
+      "step": 130
+    },
+    {
+      "epoch": 2.7373737373737375,
+      "grad_norm": 0.25304418891814984,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 1.1085,
+      "step": 135
+    },
+    {
+      "epoch": 2.8383838383838382,
+      "grad_norm": 0.26313547369333573,
+      "learning_rate": 1.3845646281813507e-06,
+      "loss": 1.0692,
+      "step": 140
+    },
+    {
+      "epoch": 2.9393939393939394,
+      "grad_norm": 0.2581657699360705,
+      "learning_rate": 1.1326608169920372e-07,
+      "loss": 1.1113,
+      "step": 145
+    },
+    {
+      "epoch": 2.9797979797979797,
+      "eval_loss": 1.0100449323654175,
+      "eval_runtime": 162.6209,
+      "eval_samples_per_second": 38.87,
+      "eval_steps_per_second": 2.435,
+      "step": 147
+    },
+    {
+      "epoch": 2.9797979797979797,
+      "step": 147,
+      "total_flos": 981865656745984.0,
+      "train_loss": 0.5657350935903537,
+      "train_runtime": 914.4964,
+      "train_samples_per_second": 20.736,
+      "train_steps_per_second": 0.161
     }
   ],
   "logging_steps": 5,
-  "max_steps": 72,
+  "max_steps": 147,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 25,
@@ -156,7 +277,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 644347544469504.0,
+  "total_flos": 981865656745984.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null