chansung committed on
Commit
de1b350
·
verified ·
1 Parent(s): 54cd5b3

Model save

Browse files
Files changed (4) hide show
  1. README.md +7 -7
  2. all_results.json +7 -12
  3. train_results.json +7 -7
  4. trainer_state.json +170 -72
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.8595
24
 
25
  ## Model description
26
 
@@ -48,22 +48,22 @@ The following hyperparameters were used during training:
48
  - gradient_accumulation_steps: 2
49
  - total_train_batch_size: 256
50
  - total_eval_batch_size: 128
51
- - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
  - num_epochs: 1
55
 
56
  ### Training results
57
 
58
- | Training Loss | Epoch | Step | Validation Loss |
59
- |:-------------:|:------:|:----:|:---------------:|
60
- | 1.9457 | 0.9927 | 68 | 1.8595 |
61
 
62
 
63
  ### Framework versions
64
 
65
  - PEFT 0.13.1.dev0
66
- - Transformers 4.46.2
67
- - Pytorch 2.5.1+cu124
68
  - Datasets 3.1.0
69
  - Tokenizers 0.20.3
 
20
 
21
  This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.7616
24
 
25
  ## Model description
26
 
 
48
  - gradient_accumulation_steps: 2
49
  - total_train_batch_size: 256
50
  - total_eval_batch_size: 128
51
+ - optimizer: adamw_torch (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
  - num_epochs: 1
55
 
56
  ### Training results
57
 
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 1.3703 | 1.0 | 137 | 1.7616 |
61
 
62
 
63
  ### Framework versions
64
 
65
  - PEFT 0.13.1.dev0
66
+ - Transformers 4.46.3
67
+ - Pytorch 2.3.1+cu121
68
  - Datasets 3.1.0
69
  - Tokenizers 0.20.3
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.9927007299270073,
3
- "eval_loss": 1.859512209892273,
4
- "eval_runtime": 1.6224,
5
- "eval_samples": 518,
6
- "eval_samples_per_second": 107.251,
7
- "eval_steps_per_second": 1.233,
8
- "total_flos": 4.015265797185208e+17,
9
- "train_loss": 2.0918032141292797,
10
- "train_runtime": 338.6733,
11
- "train_samples": 51241,
12
- "train_samples_per_second": 51.761,
13
- "train_steps_per_second": 0.201
14
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 8.089579620799611e+17,
4
+ "train_loss": 1.4743173070197557,
5
+ "train_runtime": 700.8271,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 49.884,
8
+ "train_steps_per_second": 0.195
 
 
 
 
 
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9927007299270073,
3
- "total_flos": 4.015265797185208e+17,
4
- "train_loss": 2.0918032141292797,
5
- "train_runtime": 338.6733,
6
- "train_samples": 51241,
7
- "train_samples_per_second": 51.761,
8
- "train_steps_per_second": 0.201
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 8.089579620799611e+17,
4
+ "train_loss": 1.4743173070197557,
5
+ "train_runtime": 700.8271,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 49.884,
8
+ "train_steps_per_second": 0.195
9
  }
trainer_state.json CHANGED
@@ -1,131 +1,229 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9927007299270073,
5
  "eval_steps": 500,
6
- "global_step": 68,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.014598540145985401,
13
- "grad_norm": 2.3731939792633057,
14
- "learning_rate": 2.857142857142857e-05,
15
- "loss": 2.5137,
16
  "step": 1
17
  },
 
 
 
 
 
 
 
18
  {
19
  "epoch": 0.072992700729927,
20
- "grad_norm": 2.052441358566284,
21
  "learning_rate": 0.00014285714285714287,
22
- "loss": 2.5009,
23
- "step": 5
 
 
 
 
 
 
 
24
  },
25
  {
26
  "epoch": 0.145985401459854,
27
- "grad_norm": 0.4984005093574524,
28
- "learning_rate": 0.00019880878960910772,
29
- "loss": 2.3982,
30
- "step": 10
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.21897810218978103,
34
- "grad_norm": 0.5986955165863037,
35
- "learning_rate": 0.0001916316904487005,
36
- "loss": 2.2688,
37
- "step": 15
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.291970802919708,
41
- "grad_norm": 0.4892929494380951,
42
- "learning_rate": 0.00017841198065767107,
43
- "loss": 2.1764,
44
- "step": 20
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 0.36496350364963503,
48
- "grad_norm": 0.46845513582229614,
49
- "learning_rate": 0.00016002142805483685,
50
- "loss": 2.0874,
51
- "step": 25
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.43795620437956206,
55
- "grad_norm": 0.34990543127059937,
56
- "learning_rate": 0.00013767278936351854,
57
- "loss": 2.0344,
58
- "step": 30
 
 
 
 
 
 
 
59
  },
60
  {
61
  "epoch": 0.5109489051094891,
62
- "grad_norm": 0.26925715804100037,
63
- "learning_rate": 0.00011283983551465511,
64
- "loss": 1.9991,
65
- "step": 35
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 0.583941605839416,
69
- "grad_norm": 0.24737241864204407,
70
- "learning_rate": 8.71601644853449e-05,
71
- "loss": 1.9808,
72
- "step": 40
 
 
 
 
 
 
 
73
  },
74
  {
75
  "epoch": 0.656934306569343,
76
- "grad_norm": 0.24646887183189392,
77
- "learning_rate": 6.232721063648148e-05,
78
- "loss": 1.9615,
79
- "step": 45
 
 
 
 
 
 
 
80
  },
81
  {
82
  "epoch": 0.7299270072992701,
83
- "grad_norm": 0.2257496565580368,
84
- "learning_rate": 3.997857194516319e-05,
85
- "loss": 1.9739,
86
- "step": 50
 
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 0.8029197080291971,
90
- "grad_norm": 0.20391573011875153,
91
- "learning_rate": 2.1588019342328968e-05,
92
- "loss": 1.9468,
93
- "step": 55
 
 
 
 
 
 
 
94
  },
95
  {
96
  "epoch": 0.8759124087591241,
97
- "grad_norm": 0.19893679022789001,
98
- "learning_rate": 8.368309551299536e-06,
99
- "loss": 1.9727,
100
- "step": 60
 
 
 
 
 
 
 
101
  },
102
  {
103
  "epoch": 0.948905109489051,
104
- "grad_norm": 0.19601836800575256,
105
- "learning_rate": 1.1912103908922945e-06,
106
- "loss": 1.9457,
107
- "step": 65
 
 
 
 
 
 
 
108
  },
109
  {
110
- "epoch": 0.9927007299270073,
111
- "eval_loss": 1.859512209892273,
112
- "eval_runtime": 1.6081,
113
- "eval_samples_per_second": 108.202,
114
- "eval_steps_per_second": 1.244,
115
- "step": 68
116
  },
117
  {
118
- "epoch": 0.9927007299270073,
119
- "step": 68,
120
- "total_flos": 4.015265797185208e+17,
121
- "train_loss": 2.0918032141292797,
122
- "train_runtime": 338.6733,
123
- "train_samples_per_second": 51.761,
124
- "train_steps_per_second": 0.201
125
  }
126
  ],
127
  "logging_steps": 5,
128
- "max_steps": 68,
129
  "num_input_tokens_seen": 0,
130
  "num_train_epochs": 1,
131
  "save_steps": 100,
@@ -141,7 +239,7 @@
141
  "attributes": {}
142
  }
143
  },
144
- "total_flos": 4.015265797185208e+17,
145
  "train_batch_size": 16,
146
  "trial_name": null,
147
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 137,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0072992700729927005,
13
+ "grad_norm": 0.6737354397773743,
14
+ "learning_rate": 1.4285714285714285e-05,
15
+ "loss": 1.9885,
16
  "step": 1
17
  },
18
+ {
19
+ "epoch": 0.0364963503649635,
20
+ "grad_norm": 0.5888584852218628,
21
+ "learning_rate": 7.142857142857143e-05,
22
+ "loss": 1.96,
23
+ "step": 5
24
+ },
25
  {
26
  "epoch": 0.072992700729927,
27
+ "grad_norm": 0.44593068957328796,
28
  "learning_rate": 0.00014285714285714287,
29
+ "loss": 1.927,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.10948905109489052,
34
+ "grad_norm": 0.5411117076873779,
35
+ "learning_rate": 0.00019996738360808565,
36
+ "loss": 1.8403,
37
+ "step": 15
38
  },
39
  {
40
  "epoch": 0.145985401459854,
41
+ "grad_norm": 0.4955069422721863,
42
+ "learning_rate": 0.00019882804237803488,
43
+ "loss": 1.6916,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.18248175182481752,
48
+ "grad_norm": 0.51289963722229,
49
+ "learning_rate": 0.00019607909582962477,
50
+ "loss": 1.5926,
51
+ "step": 25
52
  },
53
  {
54
  "epoch": 0.21897810218978103,
55
+ "grad_norm": 0.2931584417819977,
56
+ "learning_rate": 0.0001917653158603628,
57
+ "loss": 1.5275,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.25547445255474455,
62
+ "grad_norm": 0.20454026758670807,
63
+ "learning_rate": 0.00018595696069872013,
64
+ "loss": 1.4805,
65
+ "step": 35
66
  },
67
  {
68
  "epoch": 0.291970802919708,
69
+ "grad_norm": 0.15512123703956604,
70
+ "learning_rate": 0.00017874863061334657,
71
+ "loss": 1.4545,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.3284671532846715,
76
+ "grad_norm": 0.14049112796783447,
77
+ "learning_rate": 0.00017025772716520323,
78
+ "loss": 1.4489,
79
+ "step": 45
80
  },
81
  {
82
  "epoch": 0.36496350364963503,
83
+ "grad_norm": 0.12365967035293579,
84
+ "learning_rate": 0.0001606225410966638,
85
+ "loss": 1.4337,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.40145985401459855,
90
+ "grad_norm": 0.12197204679250717,
91
+ "learning_rate": 0.00015000000000000001,
92
+ "loss": 1.4202,
93
+ "step": 55
94
  },
95
  {
96
  "epoch": 0.43795620437956206,
97
+ "grad_norm": 0.1118067055940628,
98
+ "learning_rate": 0.0001385631124488136,
99
+ "loss": 1.4099,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.4744525547445255,
104
+ "grad_norm": 0.10998713970184326,
105
+ "learning_rate": 0.0001264981502196662,
106
+ "loss": 1.3993,
107
+ "step": 65
108
  },
109
  {
110
  "epoch": 0.5109489051094891,
111
+ "grad_norm": 0.11097019165754318,
112
+ "learning_rate": 0.00011400161449686293,
113
+ "loss": 1.3987,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.5474452554744526,
118
+ "grad_norm": 0.1111425831913948,
119
+ "learning_rate": 0.00010127703547159739,
120
+ "loss": 1.3845,
121
+ "step": 75
122
  },
123
  {
124
  "epoch": 0.583941605839416,
125
+ "grad_norm": 0.11766236275434494,
126
+ "learning_rate": 8.853165746015997e-05,
127
+ "loss": 1.381,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.6204379562043796,
132
+ "grad_norm": 0.11702164262533188,
133
+ "learning_rate": 7.597306353045393e-05,
134
+ "loss": 1.3715,
135
+ "step": 85
136
  },
137
  {
138
  "epoch": 0.656934306569343,
139
+ "grad_norm": 0.1169838011264801,
140
+ "learning_rate": 6.380579461128819e-05,
141
+ "loss": 1.3859,
142
+ "step": 90
143
+ },
144
+ {
145
+ "epoch": 0.6934306569343066,
146
+ "grad_norm": 0.10516153275966644,
147
+ "learning_rate": 5.222801814877369e-05,
148
+ "loss": 1.368,
149
+ "step": 95
150
  },
151
  {
152
  "epoch": 0.7299270072992701,
153
+ "grad_norm": 0.11746218055486679,
154
+ "learning_rate": 4.142830056718052e-05,
155
+ "loss": 1.3735,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.7664233576642335,
160
+ "grad_norm": 0.11317908763885498,
161
+ "learning_rate": 3.158253610095697e-05,
162
+ "loss": 1.3675,
163
+ "step": 105
164
  },
165
  {
166
  "epoch": 0.8029197080291971,
167
+ "grad_norm": 0.1135973334312439,
168
+ "learning_rate": 2.2851082017805703e-05,
169
+ "loss": 1.3633,
170
+ "step": 110
171
+ },
172
+ {
173
+ "epoch": 0.8394160583941606,
174
+ "grad_norm": 0.11384975910186768,
175
+ "learning_rate": 1.5376146891235598e-05,
176
+ "loss": 1.3684,
177
+ "step": 115
178
  },
179
  {
180
  "epoch": 0.8759124087591241,
181
+ "grad_norm": 0.10755354166030884,
182
+ "learning_rate": 9.279474459608805e-06,
183
+ "loss": 1.3735,
184
+ "step": 120
185
+ },
186
+ {
187
+ "epoch": 0.9124087591240876,
188
+ "grad_norm": 0.11003435403108597,
189
+ "learning_rate": 4.660360794506946e-06,
190
+ "loss": 1.3648,
191
+ "step": 125
192
  },
193
  {
194
  "epoch": 0.948905109489051,
195
+ "grad_norm": 0.10320563614368439,
196
+ "learning_rate": 1.5940370726542863e-06,
197
+ "loss": 1.3714,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.9854014598540146,
202
+ "grad_norm": 0.10423589497804642,
203
+ "learning_rate": 1.3044429107700318e-07,
204
+ "loss": 1.3703,
205
+ "step": 135
206
  },
207
  {
208
+ "epoch": 1.0,
209
+ "eval_loss": 1.7615731954574585,
210
+ "eval_runtime": 0.8323,
211
+ "eval_samples_per_second": 10.813,
212
+ "eval_steps_per_second": 1.201,
213
+ "step": 137
214
  },
215
  {
216
+ "epoch": 1.0,
217
+ "step": 137,
218
+ "total_flos": 8.089579620799611e+17,
219
+ "train_loss": 1.4743173070197557,
220
+ "train_runtime": 700.8271,
221
+ "train_samples_per_second": 49.884,
222
+ "train_steps_per_second": 0.195
223
  }
224
  ],
225
  "logging_steps": 5,
226
+ "max_steps": 137,
227
  "num_input_tokens_seen": 0,
228
  "num_train_epochs": 1,
229
  "save_steps": 100,
 
239
  "attributes": {}
240
  }
241
  },
242
+ "total_flos": 8.089579620799611e+17,
243
  "train_batch_size": 16,
244
  "trial_name": null,
245
  "trial_params": null