pepoo20 committed
Commit 6f51c81
Parent: d6e480b

End of training

README.md CHANGED
@@ -3,6 +3,7 @@ license: other
 base_model: MathSymbol/BasicSFT_1.8_Pretrain_Lightning
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: WordProblem
@@ -14,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # WordProblem
 
-This model is a fine-tuned version of [MathSymbol/BasicSFT_1.8_Pretrain_Lightning](https://huggingface.co/MathSymbol/BasicSFT_1.8_Pretrain_Lightning) on an unknown dataset.
+This model is a fine-tuned version of [MathSymbol/BasicSFT_1.8_Pretrain_Lightning](https://huggingface.co/MathSymbol/BasicSFT_1.8_Pretrain_Lightning) on the WordProblems_SFT dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.1677
 
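The model card above describes a full-parameter SFT run on a Qwen1.5-1.8B-based checkpoint (the base model named in the card; the Qwen1.5-1.8B family is inferred from the checkpoint path in trainer_state.json below). A minimal usage sketch, assuming the exported weights follow the standard `transformers` layout; the repo id `pepoo20/WordProblem` and the example prompt are assumptions, not part of this commit:

```python
# Minimal sketch (not part of the commit): load the fine-tuned model and ask a word problem.
# "pepoo20/WordProblem" is an assumed repo id; substitute the actual model path.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "pepoo20/WordProblem"  # hypothetical
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto")

prompt = "A train travels 60 km in 45 minutes. At the same speed, how far does it travel in 2 hours?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```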
all_results.json ADDED
@@ -0,0 +1,12 @@
+{
+    "epoch": 0.9999725884707108,
+    "eval_loss": 0.1676628440618515,
+    "eval_runtime": 79.4786,
+    "eval_samples_per_second": 37.482,
+    "eval_steps_per_second": 9.374,
+    "total_flos": 3.6976201313039155e+17,
+    "train_loss": 0.19690959160788019,
+    "train_runtime": 20589.0374,
+    "train_samples_per_second": 7.087,
+    "train_steps_per_second": 0.443
+}
eval_results.json ADDED
@@ -0,0 +1,7 @@
+{
+    "epoch": 0.9999725884707108,
+    "eval_loss": 0.1676628440618515,
+    "eval_runtime": 79.4786,
+    "eval_samples_per_second": 37.482,
+    "eval_steps_per_second": 9.374
+}
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.9999725884707108,
+    "total_flos": 3.6976201313039155e+17,
+    "train_loss": 0.19690959160788019,
+    "train_runtime": 20589.0374,
+    "train_samples_per_second": 7.087,
+    "train_steps_per_second": 0.443
+}
trainer_log.jsonl CHANGED
@@ -35,3 +35,4 @@
 {"current_steps": 9000, "total_steps": 9120, "loss": 0.168, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.3904986054812396e-08, "epoch": 0.9868150544118857, "percentage": 98.68, "elapsed_time": "5:36:30", "remaining_time": "0:04:29"}
 {"current_steps": 9000, "total_steps": 9120, "loss": null, "eval_loss": 0.1676628440618515, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 0.9868150544118857, "percentage": 98.68, "elapsed_time": "5:36:30", "remaining_time": "0:04:29"}
 {"current_steps": 9120, "total_steps": 9120, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 0.9999725884707108, "percentage": 100.0, "elapsed_time": "5:43:04", "remaining_time": "0:00:00"}
+{"current_steps": 745, "total_steps": 745, "loss": null, "eval_loss": 0.1676628440618515, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 0.9999725884707108, "percentage": 100.0, "elapsed_time": "5:44:46", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,288 @@
+{
+  "best_metric": 0.1676628440618515,
+  "best_model_checkpoint": "saves/Qwen1.5-1.8B/WordProblem/checkpoint-9000",
+  "epoch": 0.9999725884707108,
+  "eval_steps": 1500,
+  "global_step": 9120,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.03289383514706285,
+      "grad_norm": 4.8125,
+      "learning_rate": 3e-05,
+      "loss": 0.7143,
+      "step": 300
+    },
+    {
+      "epoch": 0.0657876702941257,
+      "grad_norm": 3.5,
+      "learning_rate": 4.998339850669331e-05,
+      "loss": 0.2219,
+      "step": 600
+    },
+    {
+      "epoch": 0.09868150544118856,
+      "grad_norm": 4.40625,
+      "learning_rate": 4.9734816848192624e-05,
+      "loss": 0.2074,
+      "step": 900
+    },
+    {
+      "epoch": 0.1315753405882514,
+      "grad_norm": 4.03125,
+      "learning_rate": 4.9190839785031474e-05,
+      "loss": 0.1906,
+      "step": 1200
+    },
+    {
+      "epoch": 0.16446917573531428,
+      "grad_norm": 3.78125,
+      "learning_rate": 4.835796376008569e-05,
+      "loss": 0.1923,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16446917573531428,
+      "eval_loss": 0.1849033087491989,
+      "eval_runtime": 79.6037,
+      "eval_samples_per_second": 37.423,
+      "eval_steps_per_second": 9.359,
+      "step": 1500
+    },
+    {
+      "epoch": 0.19736301088237712,
+      "grad_norm": 4.15625,
+      "learning_rate": 4.7246135390382216e-05,
+      "loss": 0.1839,
+      "step": 1800
+    },
+    {
+      "epoch": 0.23025684602944,
+      "grad_norm": 4.5,
+      "learning_rate": 4.586863267968384e-05,
+      "loss": 0.1938,
+      "step": 2100
+    },
+    {
+      "epoch": 0.2631506811765028,
+      "grad_norm": 3.1875,
+      "learning_rate": 4.4241906446007296e-05,
+      "loss": 0.1863,
+      "step": 2400
+    },
+    {
+      "epoch": 0.2960445163235657,
+      "grad_norm": 2.71875,
+      "learning_rate": 4.238538385782601e-05,
+      "loss": 0.1797,
+      "step": 2700
+    },
+    {
+      "epoch": 0.32893835147062855,
+      "grad_norm": 3.65625,
+      "learning_rate": 4.032123642522486e-05,
+      "loss": 0.176,
+      "step": 3000
+    },
+    {
+      "epoch": 0.32893835147062855,
+      "eval_loss": 0.1760552078485489,
+      "eval_runtime": 79.6333,
+      "eval_samples_per_second": 37.409,
+      "eval_steps_per_second": 9.355,
+      "step": 3000
+    },
+    {
+      "epoch": 0.3618321866176914,
+      "grad_norm": 4.0,
+      "learning_rate": 3.8074115216771435e-05,
+      "loss": 0.1791,
+      "step": 3300
+    },
+    {
+      "epoch": 0.39472602176475424,
+      "grad_norm": 3.90625,
+      "learning_rate": 3.567085646427478e-05,
+      "loss": 0.1808,
+      "step": 3600
+    },
+    {
+      "epoch": 0.4276198569118171,
+      "grad_norm": 3.421875,
+      "learning_rate": 3.3140161071244915e-05,
+      "loss": 0.1805,
+      "step": 3900
+    },
+    {
+      "epoch": 0.46051369205888,
+      "grad_norm": 2.640625,
+      "learning_rate": 3.05122518525215e-05,
+      "loss": 0.1738,
+      "step": 4200
+    },
+    {
+      "epoch": 0.49340752720594283,
+      "grad_norm": 4.5,
+      "learning_rate": 2.781851259848554e-05,
+      "loss": 0.1736,
+      "step": 4500
+    },
+    {
+      "epoch": 0.49340752720594283,
+      "eval_loss": 0.17090687155723572,
+      "eval_runtime": 79.6329,
+      "eval_samples_per_second": 37.409,
+      "eval_steps_per_second": 9.355,
+      "step": 4500
+    },
+    {
+      "epoch": 0.5263013623530056,
+      "grad_norm": 3.578125,
+      "learning_rate": 2.509111327432736e-05,
+      "loss": 0.1709,
+      "step": 4800
+    },
+    {
+      "epoch": 0.5591951975000685,
+      "grad_norm": 3.515625,
+      "learning_rate": 2.236262583042668e-05,
+      "loss": 0.1775,
+      "step": 5100
+    },
+    {
+      "epoch": 0.5920890326471314,
+      "grad_norm": 4.9375,
+      "learning_rate": 1.966563521202681e-05,
+      "loss": 0.1759,
+      "step": 5400
+    },
+    {
+      "epoch": 0.6249828677941942,
+      "grad_norm": 3.640625,
+      "learning_rate": 1.7032350213717874e-05,
+      "loss": 0.1754,
+      "step": 5700
+    },
+    {
+      "epoch": 0.6578767029412571,
+      "grad_norm": 3.578125,
+      "learning_rate": 1.4494218826096939e-05,
+      "loss": 0.1688,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6578767029412571,
+      "eval_loss": 0.16823573410511017,
+      "eval_runtime": 79.6163,
+      "eval_samples_per_second": 37.417,
+      "eval_steps_per_second": 9.357,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6907705380883199,
+      "grad_norm": 3.6875,
+      "learning_rate": 1.2081552668325321e-05,
+      "loss": 0.1707,
+      "step": 6300
+    },
+    {
+      "epoch": 0.7236643732353828,
+      "grad_norm": 4.0625,
+      "learning_rate": 9.82316499179518e-06,
+      "loss": 0.171,
+      "step": 6600
+    },
+    {
+      "epoch": 0.7565582083824457,
+      "grad_norm": 3.984375,
+      "learning_rate": 7.74602657804425e-06,
+      "loss": 0.1702,
+      "step": 6900
+    },
+    {
+      "epoch": 0.7894520435295085,
+      "grad_norm": 3.421875,
+      "learning_rate": 5.874943640356082e-06,
+      "loss": 0.1718,
+      "step": 7200
+    },
+    {
+      "epoch": 0.8223458786765714,
+      "grad_norm": 4.1875,
+      "learning_rate": 4.232261575703861e-06,
+      "loss": 0.1689,
+      "step": 7500
+    },
+    {
+      "epoch": 0.8223458786765714,
+      "eval_loss": 0.16773280501365662,
+      "eval_runtime": 79.6198,
+      "eval_samples_per_second": 37.415,
+      "eval_steps_per_second": 9.357,
+      "step": 7500
+    },
+    {
+      "epoch": 0.8552397138236342,
+      "grad_norm": 2.703125,
+      "learning_rate": 2.83759810497852e-06,
+      "loss": 0.1692,
+      "step": 7800
+    },
+    {
+      "epoch": 0.8881335489706971,
+      "grad_norm": 4.0625,
+      "learning_rate": 1.70760898847247e-06,
+      "loss": 0.1787,
+      "step": 8100
+    },
+    {
+      "epoch": 0.92102738411776,
+      "grad_norm": 4.71875,
+      "learning_rate": 8.557891145603042e-07,
+      "loss": 0.1733,
+      "step": 8400
+    },
+    {
+      "epoch": 0.9539212192648228,
+      "grad_norm": 3.6875,
+      "learning_rate": 2.923113370737779e-07,
+      "loss": 0.1741,
+      "step": 8700
+    },
+    {
+      "epoch": 0.9868150544118857,
+      "grad_norm": 4.3125,
+      "learning_rate": 2.3904986054812396e-08,
+      "loss": 0.168,
+      "step": 9000
+    },
+    {
+      "epoch": 0.9868150544118857,
+      "eval_loss": 0.1676628440618515,
+      "eval_runtime": 79.7085,
+      "eval_samples_per_second": 37.374,
+      "eval_steps_per_second": 9.347,
+      "step": 9000
+    },
+    {
+      "epoch": 0.9999725884707108,
+      "step": 9120,
+      "total_flos": 3.6976201313039155e+17,
+      "train_loss": 0.19690959160788019,
+      "train_runtime": 20589.0374,
+      "train_samples_per_second": 7.087,
+      "train_steps_per_second": 0.443
+    }
+  ],
+  "logging_steps": 300,
+  "max_steps": 9120,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 3000,
+  "total_flos": 3.6976201313039155e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
training_eval_loss.png ADDED
training_loss.png ADDED
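The two loss plots are committed as static images; equivalent curves can be re-derived from the `log_history` array in `trainer_state.json` above. A small sketch (matplotlib usage is illustrative and not part of this commit; the key names match the JSON shown in the diff):

```python
# Sketch: rebuild train/eval loss curves from trainer_state.json's log_history.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training logs carry a "loss" key; evaluation logs carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="training loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("training_loss_rebuilt.png")
```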