sumuks committed
Commit cf5aa95 · verified · 1 Parent(s): d551b2a

End of training
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: Qwen/Qwen2.5-32B
 tags:
 - llama-factory
+- lora
 - generated_from_trainer
 model-index:
 - name: pretrain
@@ -15,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # pretrain
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on an unknown dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on the openreview dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.1075
+- Loss: 1.1076
 
 ## Model description
 
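The card now tags this run as a LoRA adapter (trained with llama-factory) on top of Qwen/Qwen2.5-32B. A minimal loading sketch, assuming the adapter weights in this repo follow the standard PEFT layout; the `adapter_id` below is a placeholder, not a confirmed repo path:

```python
# Sketch: load the base model and apply the LoRA adapter from this repo.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "Qwen/Qwen2.5-32B"
adapter_id = "sumuks/pretrain"  # placeholder: substitute this repository's actual id

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype="auto", device_map="auto")
model = PeftModel.from_pretrained(base, adapter_id)

inputs = tokenizer("The reviewers argue that", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```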
 
all_results.json ADDED
@@ -0,0 +1,13 @@
+{
+    "epoch": 0.9995988768551946,
+    "eval_loss": 1.1075533628463745,
+    "eval_runtime": 37.4732,
+    "eval_samples_per_second": 5.391,
+    "eval_steps_per_second": 1.361,
+    "perplexity": 3.026943494933559,
+    "total_flos": 6308174864842752.0,
+    "train_loss": 1.1638640125146074,
+    "train_runtime": 6246.6188,
+    "train_samples_per_second": 3.192,
+    "train_steps_per_second": 0.1
+}
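The `perplexity` field above is simply the exponential of `eval_loss`; a one-line check (illustrative only, not part of the training code):

```python
import math

eval_loss = 1.1075533628463745
print(math.exp(eval_loss))  # ~3.0269434949..., matching the reported perplexity
```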
eval_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.9995988768551946,
+    "eval_loss": 1.1075533628463745,
+    "eval_runtime": 37.4732,
+    "eval_samples_per_second": 5.391,
+    "eval_steps_per_second": 1.361,
+    "perplexity": 3.026943494933559
+}
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.9995988768551946,
+    "total_flos": 6308174864842752.0,
+    "train_loss": 1.1638640125146074,
+    "train_runtime": 6246.6188,
+    "train_samples_per_second": 3.192,
+    "train_steps_per_second": 0.1
+}
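The throughput figures line up with the 623 optimizer steps recorded in trainer_state.json below, assuming `train_steps_per_second` is computed as steps divided by wall-clock runtime (as in the Trainer's speed metrics); a quick sanity check:

```python
train_runtime = 6246.6188  # seconds, from train_results.json
global_step = 623          # from trainer_state.json
print(round(global_step / train_runtime, 3))  # 0.1 -> matches train_steps_per_second
```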
trainer_state.json ADDED
@@ -0,0 +1,524 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9995988768551946,
+  "eval_steps": 100,
+  "global_step": 623,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016044925792218213,
+      "grad_norm": 0.07964161388964953,
+      "learning_rate": 1.5873015873015873e-06,
+      "loss": 1.478,
+      "step": 10
+    },
+    {
+      "epoch": 0.032089851584436425,
+      "grad_norm": 0.08446800204864474,
+      "learning_rate": 3.1746031746031746e-06,
+      "loss": 1.4734,
+      "step": 20
+    },
+    {
+      "epoch": 0.048134777376654635,
+      "grad_norm": 0.0880665470615365,
+      "learning_rate": 4.761904761904762e-06,
+      "loss": 1.4686,
+      "step": 30
+    },
+    {
+      "epoch": 0.06417970316887285,
+      "grad_norm": 0.11036773606126081,
+      "learning_rate": 6.349206349206349e-06,
+      "loss": 1.4799,
+      "step": 40
+    },
+    {
+      "epoch": 0.08022462896109106,
+      "grad_norm": 0.1984820440725813,
+      "learning_rate": 7.936507936507936e-06,
+      "loss": 1.4712,
+      "step": 50
+    },
+    {
+      "epoch": 0.09626955475330927,
+      "grad_norm": 0.3636876671405763,
+      "learning_rate": 9.523809523809525e-06,
+      "loss": 1.4479,
+      "step": 60
+    },
+    {
+      "epoch": 0.11231448054552748,
+      "grad_norm": 0.33961243491902277,
+      "learning_rate": 9.996145181203616e-06,
+      "loss": 1.3449,
+      "step": 70
+    },
+    {
+      "epoch": 0.1283594063377457,
+      "grad_norm": 0.11623936014320749,
+      "learning_rate": 9.977278743495434e-06,
+      "loss": 1.3048,
+      "step": 80
+    },
+    {
+      "epoch": 0.1444043321299639,
+      "grad_norm": 0.09934569363354419,
+      "learning_rate": 9.942751945444437e-06,
+      "loss": 1.2392,
+      "step": 90
+    },
+    {
+      "epoch": 0.16044925792218212,
+      "grad_norm": 0.0749779795039777,
+      "learning_rate": 9.892673421130979e-06,
+      "loss": 1.2155,
+      "step": 100
+    },
+    {
+      "epoch": 0.16044925792218212,
+      "eval_loss": 1.2045714855194092,
+      "eval_runtime": 36.9278,
+      "eval_samples_per_second": 5.47,
+      "eval_steps_per_second": 1.381,
+      "step": 100
+    },
+    {
+      "epoch": 0.17649418371440032,
+      "grad_norm": 0.06497134561472019,
+      "learning_rate": 9.827200736119815e-06,
+      "loss": 1.2129,
+      "step": 110
+    },
+    {
+      "epoch": 0.19253910950661854,
+      "grad_norm": 0.06149214025937759,
+      "learning_rate": 9.746539891700558e-06,
+      "loss": 1.1695,
+      "step": 120
+    },
+    {
+      "epoch": 0.20858403529883673,
+      "grad_norm": 0.05334876254417194,
+      "learning_rate": 9.650944676731383e-06,
+      "loss": 1.162,
+      "step": 130
+    },
+    {
+      "epoch": 0.22462896109105496,
+      "grad_norm": 0.04556668435804944,
+      "learning_rate": 9.540715869125407e-06,
+      "loss": 1.1491,
+      "step": 140
+    },
+    {
+      "epoch": 0.24067388688327315,
+      "grad_norm": 0.04356126220951498,
+      "learning_rate": 9.416200289492092e-06,
+      "loss": 1.1358,
+      "step": 150
+    },
+    {
+      "epoch": 0.2567188126754914,
+      "grad_norm": 0.03723127058817997,
+      "learning_rate": 9.27778970991129e-06,
+      "loss": 1.1338,
+      "step": 160
+    },
+    {
+      "epoch": 0.27276373846770957,
+      "grad_norm": 0.03598344746987169,
+      "learning_rate": 9.125919621273348e-06,
+      "loss": 1.1333,
+      "step": 170
+    },
+    {
+      "epoch": 0.2888086642599278,
+      "grad_norm": 0.03958997125425865,
+      "learning_rate": 8.961067863063638e-06,
+      "loss": 1.1382,
+      "step": 180
+    },
+    {
+      "epoch": 0.304853590052146,
+      "grad_norm": 0.033584445768925664,
+      "learning_rate": 8.783753119902766e-06,
+      "loss": 1.1329,
+      "step": 190
+    },
+    {
+      "epoch": 0.32089851584436424,
+      "grad_norm": 0.0370694089463755,
+      "learning_rate": 8.594533289572852e-06,
+      "loss": 1.1392,
+      "step": 200
+    },
+    {
+      "epoch": 0.32089851584436424,
+      "eval_loss": 1.1237837076187134,
+      "eval_runtime": 36.9468,
+      "eval_samples_per_second": 5.467,
+      "eval_steps_per_second": 1.38,
+      "step": 200
+    },
+    {
+      "epoch": 0.3369434416365824,
+      "grad_norm": 0.031404112904854176,
+      "learning_rate": 8.39400372766471e-06,
+      "loss": 1.1338,
+      "step": 210
+    },
+    {
+      "epoch": 0.35298836742880063,
+      "grad_norm": 0.03068751780314738,
+      "learning_rate": 8.182795374368893e-06,
+      "loss": 1.1313,
+      "step": 220
+    },
+    {
+      "epoch": 0.36903329322101885,
+      "grad_norm": 0.030467037926615172,
+      "learning_rate": 7.961572769304437e-06,
+      "loss": 1.1247,
+      "step": 230
+    },
+    {
+      "epoch": 0.3850782190132371,
+      "grad_norm": 0.030773507709064975,
+      "learning_rate": 7.731031960631354e-06,
+      "loss": 1.1318,
+      "step": 240
+    },
+    {
+      "epoch": 0.4011231448054553,
+      "grad_norm": 0.033328095202181884,
+      "learning_rate": 7.491898315025615e-06,
+      "loss": 1.1249,
+      "step": 250
+    },
+    {
+      "epoch": 0.41716807059767347,
+      "grad_norm": 0.03128616232047187,
+      "learning_rate": 7.244924235407224e-06,
+      "loss": 1.1162,
+      "step": 260
+    },
+    {
+      "epoch": 0.4332129963898917,
+      "grad_norm": 0.031188168038119562,
+      "learning_rate": 6.990886793602268e-06,
+      "loss": 1.1133,
+      "step": 270
+    },
+    {
+      "epoch": 0.4492579221821099,
+      "grad_norm": 0.03333159876932075,
+      "learning_rate": 6.730585285387465e-06,
+      "loss": 1.1056,
+      "step": 280
+    },
+    {
+      "epoch": 0.46530284797432814,
+      "grad_norm": 0.0316593444984735,
+      "learning_rate": 6.464838715609945e-06,
+      "loss": 1.1266,
+      "step": 290
+    },
+    {
+      "epoch": 0.4813477737665463,
+      "grad_norm": 0.03167501381379994,
+      "learning_rate": 6.194483221294989e-06,
+      "loss": 1.1181,
+      "step": 300
+    },
+    {
+      "epoch": 0.4813477737665463,
+      "eval_loss": 1.1140353679656982,
+      "eval_runtime": 36.7217,
+      "eval_samples_per_second": 5.501,
+      "eval_steps_per_second": 1.389,
+      "step": 300
+    },
+    {
+      "epoch": 0.49739269955876453,
+      "grad_norm": 0.03328370476401871,
+      "learning_rate": 5.920369440849609e-06,
+      "loss": 1.0962,
+      "step": 310
+    },
+    {
+      "epoch": 0.5134376253509828,
+      "grad_norm": 0.03447585267125308,
+      "learning_rate": 5.643359837639419e-06,
+      "loss": 1.1171,
+      "step": 320
+    },
+    {
+      "epoch": 0.529482551143201,
+      "grad_norm": 0.03781496580664963,
+      "learning_rate": 5.3643259863598015e-06,
+      "loss": 1.1091,
+      "step": 330
+    },
+    {
+      "epoch": 0.5455274769354191,
+      "grad_norm": 0.03172320948571133,
+      "learning_rate": 5.084145830739462e-06,
+      "loss": 1.1017,
+      "step": 340
+    },
+    {
+      "epoch": 0.5615724027276374,
+      "grad_norm": 0.03412876166847143,
+      "learning_rate": 4.803700921204659e-06,
+      "loss": 1.1169,
+      "step": 350
+    },
+    {
+      "epoch": 0.5776173285198556,
+      "grad_norm": 0.03375281781263732,
+      "learning_rate": 4.5238736411954075e-06,
+      "loss": 1.0998,
+      "step": 360
+    },
+    {
+      "epoch": 0.5936622543120738,
+      "grad_norm": 0.03717435838572482,
+      "learning_rate": 4.245544430860743e-06,
+      "loss": 1.1117,
+      "step": 370
+    },
+    {
+      "epoch": 0.609707180104292,
+      "grad_norm": 0.029559551388140908,
+      "learning_rate": 3.969589016868269e-06,
+      "loss": 1.1283,
+      "step": 380
+    },
+    {
+      "epoch": 0.6257521058965102,
+      "grad_norm": 0.04200189123455742,
+      "learning_rate": 3.6968756570440735e-06,
+      "loss": 1.1228,
+      "step": 390
+    },
+    {
+      "epoch": 0.6417970316887285,
+      "grad_norm": 0.03378722314455019,
+      "learning_rate": 3.42826240851239e-06,
+      "loss": 1.1252,
+      "step": 400
+    },
+    {
+      "epoch": 0.6417970316887285,
+      "eval_loss": 1.1096988916397095,
+      "eval_runtime": 36.792,
+      "eval_samples_per_second": 5.49,
+      "eval_steps_per_second": 1.386,
+      "step": 400
+    },
+    {
+      "epoch": 0.6578419574809466,
+      "grad_norm": 0.03734682816128997,
+      "learning_rate": 3.1645944279304296e-06,
+      "loss": 1.1085,
+      "step": 410
+    },
+    {
+      "epoch": 0.6738868832731648,
+      "grad_norm": 0.06074049582614517,
+      "learning_rate": 2.906701312312861e-06,
+      "loss": 1.0973,
+      "step": 420
+    },
+    {
+      "epoch": 0.6899318090653831,
+      "grad_norm": 0.03310219959096242,
+      "learning_rate": 2.6553944888126772e-06,
+      "loss": 1.0965,
+      "step": 430
+    },
+    {
+      "epoch": 0.7059767348576013,
+      "grad_norm": 0.036021492275449166,
+      "learning_rate": 2.4114646616711844e-06,
+      "loss": 1.1028,
+      "step": 440
+    },
+    {
+      "epoch": 0.7220216606498195,
+      "grad_norm": 0.03370764681001696,
+      "learning_rate": 2.175679324369913e-06,
+      "loss": 1.1094,
+      "step": 450
+    },
+    {
+      "epoch": 0.7380665864420377,
+      "grad_norm": 0.032620569117429374,
+      "learning_rate": 1.948780344812181e-06,
+      "loss": 1.103,
+      "step": 460
+    },
+    {
+      "epoch": 0.7541115122342559,
+      "grad_norm": 0.03518623684780172,
+      "learning_rate": 1.7314816311322219e-06,
+      "loss": 1.1019,
+      "step": 470
+    },
+    {
+      "epoch": 0.7701564380264742,
+      "grad_norm": 0.033154472751256124,
+      "learning_rate": 1.5244668854760459e-06,
+      "loss": 1.1025,
+      "step": 480
+    },
+    {
+      "epoch": 0.7862013638186923,
+      "grad_norm": 0.03282196969531501,
+      "learning_rate": 1.3283874528215735e-06,
+      "loss": 1.0912,
+      "step": 490
+    },
+    {
+      "epoch": 0.8022462896109106,
+      "grad_norm": 0.03261545464797978,
+      "learning_rate": 1.143860271606333e-06,
+      "loss": 1.1199,
+      "step": 500
+    },
+    {
+      "epoch": 0.8022462896109106,
+      "eval_loss": 1.1079450845718384,
+      "eval_runtime": 37.0687,
+      "eval_samples_per_second": 5.449,
+      "eval_steps_per_second": 1.376,
+      "step": 500
+    },
+    {
+      "epoch": 0.8182912154031288,
+      "grad_norm": 0.03521866325178647,
+      "learning_rate": 9.714659326109138e-07,
+      "loss": 1.1117,
+      "step": 510
+    },
+    {
+      "epoch": 0.8343361411953469,
+      "grad_norm": 0.03534230366225303,
+      "learning_rate": 8.117468522055578e-07,
+      "loss": 1.1197,
+      "step": 520
+    },
+    {
+      "epoch": 0.8503810669875652,
+      "grad_norm": 0.03192663388371955,
+      "learning_rate": 6.652055657075845e-07,
+      "loss": 1.1212,
+      "step": 530
+    },
+    {
+      "epoch": 0.8664259927797834,
+      "grad_norm": 0.03381094832008424,
+      "learning_rate": 5.323031462193757e-07,
+      "loss": 1.1147,
+      "step": 540
+    },
+    {
+      "epoch": 0.8824709185720016,
+      "grad_norm": 0.0312520726762653,
+      "learning_rate": 4.134577539217965e-07,
+      "loss": 1.1094,
+      "step": 550
+    },
+    {
+      "epoch": 0.8985158443642198,
+      "grad_norm": 0.031196657419516906,
+      "learning_rate": 3.0904332038757977e-07,
+      "loss": 1.1008,
+      "step": 560
+    },
+    {
+      "epoch": 0.914560770156438,
+      "grad_norm": 0.03159621090668423,
+      "learning_rate": 2.1938837205424002e-07,
+      "loss": 1.1146,
+      "step": 570
+    },
+    {
+      "epoch": 0.9306056959486563,
+      "grad_norm": 0.03478277084648844,
+      "learning_rate": 1.4477499655837278e-07,
+      "loss": 1.1122,
+      "step": 580
+    },
+    {
+      "epoch": 0.9466506217408744,
+      "grad_norm": 0.03684852374687432,
+      "learning_rate": 8.543795518357767e-08,
+      "loss": 1.1185,
+      "step": 590
+    },
+    {
+      "epoch": 0.9626955475330926,
+      "grad_norm": 0.03649848908454171,
+      "learning_rate": 4.15639442146093e-08,
+      "loss": 1.1104,
+      "step": 600
+    },
+    {
+      "epoch": 0.9626955475330926,
+      "eval_loss": 1.107527494430542,
+      "eval_runtime": 36.8485,
+      "eval_samples_per_second": 5.482,
+      "eval_steps_per_second": 1.384,
+      "step": 600
+    },
+    {
+      "epoch": 0.9787404733253109,
+      "grad_norm": 0.032198270473962474,
+      "learning_rate": 1.3291007521799015e-08,
+      "loss": 1.1028,
+      "step": 610
+    },
+    {
+      "epoch": 0.9947853991175291,
+      "grad_norm": 0.0365869343105355,
+      "learning_rate": 7.081022239591173e-10,
+      "loss": 1.1179,
+      "step": 620
+    },
+    {
+      "epoch": 0.9995988768551946,
+      "step": 623,
+      "total_flos": 6308174864842752.0,
+      "train_loss": 1.1638640125146074,
+      "train_runtime": 6246.6188,
+      "train_samples_per_second": 3.192,
+      "train_steps_per_second": 0.1
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 623,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6308174864842752.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}
training_eval_loss.png ADDED
training_loss.png ADDED
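The two added PNGs are the loss curves for this run; equivalent plots can be reproduced from `trainer_state.json` with something like the sketch below (matplotlib assumed available; the committed images may use different styling):

```python
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training loss is logged every 10 steps; eval loss every 100 steps.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("training_loss_reconstructed.png")
```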