youndukn committed on
Commit
e5ba52e
1 Parent(s): cf2844d

Model save

README.md CHANGED
@@ -2,13 +2,11 @@
  license: other
  library_name: peft
  tags:
- - alignment-handbook
- - generated_from_trainer
  - trl
  - sft
  - generated_from_trainer
  datasets:
- - youndukn/ROLE_PLAY_INSTRUCT
+ - generator
  base_model: Gryphe/MythoMax-L2-13b
  model-index:
  - name: mythomax-13b-sft-lora
@@ -20,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
 
  # mythomax-13b-sft-lora
 
- This model is a fine-tuned version of [Gryphe/MythoMax-L2-13b](https://huggingface.co/Gryphe/MythoMax-L2-13b) on the youndukn/ROLE_PLAY_INSTRUCT dataset.
+ This model is a fine-tuned version of [Gryphe/MythoMax-L2-13b](https://huggingface.co/Gryphe/MythoMax-L2-13b) on the generator dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.8712
+ - Loss: 1.5062
 
  ## Model description
 
@@ -46,8 +44,8 @@ The following hyperparameters were used during training:
  - eval_batch_size: 8
  - seed: 42
  - distributed_type: multi-GPU
- - gradient_accumulation_steps: 2
- - total_train_batch_size: 8
+ - gradient_accumulation_steps: 128
+ - total_train_batch_size: 512
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_ratio: 0.1
@@ -57,7 +55,7 @@ The following hyperparameters were used during training:
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 0.8799 | 1.0 | 711 | 0.8712 |
+ | 1.5495 | 0.99 | 11 | 1.5062 |
 
 
  ### Framework versions
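For context, the updated hyperparameters in this README diff can be written out as a `transformers.TrainingArguments` sketch. This is a minimal illustration only, assuming the run used TRL-style SFT with a PEFT adapter (suggested by the `trl`, `sft`, and `peft` tags); values not visible in this diff (learning rate, precision flags, the actual `output_dir`) are omitted or hypothetical.

```python
from transformers import TrainingArguments

# Minimal sketch of the updated configuration shown in the README diff.
# Only values visible in this commit are filled in; everything else is
# left at its default or omitted (learning rate, mixed precision, ...).
training_args = TrainingArguments(
    output_dir="mythomax-13b-sft-lora",  # hypothetical; taken from the model name
    per_device_train_batch_size=4,       # "train_batch_size": 4 in trainer_state.json
    per_device_eval_batch_size=8,        # eval_batch_size: 8
    gradient_accumulation_steps=128,     # raised from 2 in this commit
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=1,
    seed=42,
    logging_steps=5,
    save_steps=100,
)
```

With a per-device batch size of 4 and 128 accumulation steps, the effective batch size of 512 reported in the card follows directly (4 × 128), which is why the run now finishes in 11 optimizer steps instead of 711.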
adapter_config.json CHANGED
@@ -15,17 +15,17 @@
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
- "r": 16,
+ "r": 64,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
+ "q_proj",
  "v_proj",
+ "gate_proj",
  "down_proj",
  "k_proj",
- "o_proj",
  "up_proj",
- "gate_proj",
- "q_proj"
+ "o_proj"
  ],
  "task_type": "CAUSAL_LM"
  }
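The adapter_config.json change raises the LoRA rank from 16 to 64; the set of target modules is unchanged, only reordered. A rough PEFT equivalent of the new configuration, as a sketch (fields outside this hunk, such as `lora_alpha` and `lora_dropout`, are deliberately not shown):

```python
from peft import LoraConfig

# Sketch of the adapter configuration after this commit.
# Fields not visible in the diff (lora_alpha, lora_dropout, bias, ...)
# are left at their library defaults here.
lora_config = LoraConfig(
    r=64,  # raised from 16 in this commit
    target_modules=[
        "q_proj", "v_proj", "gate_proj", "down_proj",
        "k_proj", "up_proj", "o_proj",
    ],
    task_type="CAUSAL_LM",
)
```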
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:55f703e544e39aa021e4f0bcd5a356fe1010a8795fdceaa684eca0e4df86c45e
- size 125249184
+ oid sha256:0cb365c99f8cf3346fdbb14cd2122fe2728458bacd0b2b5bfb40312743dd1e18
+ size 500771216
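The adapter checkpoint grows from roughly 125 MB to roughly 500 MB, about 4×, which is consistent with the LoRA rank going from 16 to 64 (LoRA parameter count scales linearly with r). If you want to check what the new file contains, a small inspection sketch, assuming a local copy of `adapter_model.safetensors`:

```python
from safetensors import safe_open

# Count the LoRA parameters stored in the adapter checkpoint
# (assumes the file has been downloaded locally).
total_params = 0
with safe_open("adapter_model.safetensors", framework="pt", device="cpu") as f:
    for name in f.keys():
        total_params += f.get_tensor(name).numel()

print(f"{total_params:,} adapter parameters")
```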
all_results.json CHANGED
@@ -1,13 +1,13 @@
  {
- "epoch": 1.0,
- "eval_loss": 0.8712352514266968,
- "eval_runtime": 636.6369,
+ "epoch": 0.99,
+ "eval_loss": 1.506221055984497,
+ "eval_runtime": 639.7589,
  "eval_samples": 1769,
- "eval_samples_per_second": 0.996,
- "eval_steps_per_second": 0.126,
- "train_loss": 0.9507392455421587,
- "train_runtime": 19724.4028,
+ "eval_samples_per_second": 0.991,
+ "eval_steps_per_second": 0.125,
+ "train_loss": 1.6525719924406572,
+ "train_runtime": 19611.737,
  "train_samples": 15899,
- "train_samples_per_second": 0.288,
- "train_steps_per_second": 0.036
+ "train_samples_per_second": 0.29,
+ "train_steps_per_second": 0.001
  }
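Since the reported losses are the usual causal-LM cross-entropy, they convert directly to perplexity: exp(1.506) ≈ 4.5 on the evaluation set after this commit, versus exp(0.871) ≈ 2.4 before it. A minimal sketch reading the updated metrics file:

```python
import json
import math

# Convert the logged losses into perplexities (perplexity = exp(cross-entropy)).
with open("all_results.json") as f:
    results = json.load(f)

print("eval perplexity :", math.exp(results["eval_loss"]))
print("train perplexity:", math.exp(results["train_loss"]))
```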
eval_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "eval_loss": 0.8712352514266968,
- "eval_runtime": 636.6369,
+ "epoch": 0.99,
+ "eval_loss": 1.506221055984497,
+ "eval_runtime": 639.7589,
  "eval_samples": 1769,
- "eval_samples_per_second": 0.996,
- "eval_steps_per_second": 0.126
+ "eval_samples_per_second": 0.991,
+ "eval_steps_per_second": 0.125
  }
runs/Jan17_14-36-44_77716d823da8/events.out.tfevents.1705502500.77716d823da8.825.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d5fe914be043838003e537aaf1af37ccd3d5e94f3cf6fac99f71afb19338b3b
+ size 4504
runs/Jan17_14-56-12_77716d823da8/events.out.tfevents.1705503412.77716d823da8.1051.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cfd73f9dfb29b624f646973ab3a09dfac36b70b6653a6ccbfc63ce5885e59ce
+ size 5580
runs/Jan17_14-56-12_77716d823da8/events.out.tfevents.1705523663.77716d823da8.1051.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12cdaff80c2a7994e318ab692337a7bb10f5ba709bdea165889ee2efacea60b9
+ size 354
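The three new files under `runs/` are TensorBoard event logs stored via Git LFS. If you want to read them without the TensorBoard UI, something like the following works, assuming the `tensorboard` package is installed; the scalar tag name used below is an assumption and may differ in the actual logs.

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Load one of the run directories added in this commit and list the logged scalars.
acc = EventAccumulator("runs/Jan17_14-56-12_77716d823da8")
acc.Reload()

print(acc.Tags()["scalars"])
for event in acc.Scalars("train/loss"):  # tag name is an assumption
    print(event.step, event.value)
```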
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "train_loss": 0.9507392455421587,
- "train_runtime": 19724.4028,
+ "epoch": 0.99,
+ "train_loss": 1.6525719924406572,
+ "train_runtime": 19611.737,
  "train_samples": 15899,
- "train_samples_per_second": 0.288,
- "train_steps_per_second": 0.036
+ "train_samples_per_second": 0.29,
+ "train_steps_per_second": 0.001
  }
trainer_state.json CHANGED
@@ -1,895 +1,55 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9992972593113141,
5
  "eval_steps": 500,
6
- "global_step": 711,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
- {
12
- "epoch": 0.0,
13
- "learning_rate": 2.777777777777778e-06,
14
- "loss": 1.8297,
15
- "step": 1
16
- },
17
- {
18
- "epoch": 0.01,
19
- "learning_rate": 1.388888888888889e-05,
20
- "loss": 1.8176,
21
- "step": 5
22
- },
23
- {
24
- "epoch": 0.01,
25
- "learning_rate": 2.777777777777778e-05,
26
- "loss": 1.8447,
27
- "step": 10
28
- },
29
- {
30
- "epoch": 0.02,
31
- "learning_rate": 4.166666666666667e-05,
32
- "loss": 1.8067,
33
- "step": 15
34
- },
35
- {
36
- "epoch": 0.03,
37
- "learning_rate": 5.555555555555556e-05,
38
- "loss": 1.7216,
39
- "step": 20
40
- },
41
- {
42
- "epoch": 0.04,
43
- "learning_rate": 6.944444444444444e-05,
44
- "loss": 1.5794,
45
- "step": 25
46
- },
47
- {
48
- "epoch": 0.04,
49
- "learning_rate": 8.333333333333334e-05,
50
- "loss": 1.5115,
51
- "step": 30
52
- },
53
- {
54
- "epoch": 0.05,
55
- "learning_rate": 9.722222222222223e-05,
56
- "loss": 1.4649,
57
- "step": 35
58
- },
59
- {
60
- "epoch": 0.06,
61
- "learning_rate": 0.00011111111111111112,
62
- "loss": 1.4083,
63
- "step": 40
64
- },
65
- {
66
- "epoch": 0.06,
67
- "learning_rate": 0.000125,
68
- "loss": 1.3655,
69
- "step": 45
70
- },
71
- {
72
- "epoch": 0.07,
73
- "learning_rate": 0.0001388888888888889,
74
- "loss": 1.3277,
75
- "step": 50
76
- },
77
- {
78
- "epoch": 0.08,
79
- "learning_rate": 0.00015277777777777777,
80
- "loss": 1.3157,
81
- "step": 55
82
- },
83
- {
84
- "epoch": 0.08,
85
- "learning_rate": 0.0001666666666666667,
86
- "loss": 1.2691,
87
- "step": 60
88
- },
89
  {
90
  "epoch": 0.09,
91
- "learning_rate": 0.00018055555555555557,
92
- "loss": 1.25,
93
- "step": 65
94
- },
95
- {
96
- "epoch": 0.1,
97
- "learning_rate": 0.00019444444444444446,
98
- "loss": 1.2222,
99
- "step": 70
100
- },
101
- {
102
- "epoch": 0.11,
103
- "learning_rate": 0.0001999891231617599,
104
- "loss": 1.0998,
105
- "step": 75
106
- },
107
- {
108
- "epoch": 0.11,
109
- "learning_rate": 0.00019992266216318035,
110
- "loss": 0.9863,
111
- "step": 80
112
- },
113
- {
114
- "epoch": 0.12,
115
- "learning_rate": 0.0001997958229642588,
116
- "loss": 1.0027,
117
- "step": 85
118
- },
119
- {
120
- "epoch": 0.13,
121
- "learning_rate": 0.00019960868220749448,
122
- "loss": 0.9667,
123
- "step": 90
124
- },
125
- {
126
- "epoch": 0.13,
127
- "learning_rate": 0.00019936135297256185,
128
- "loss": 0.9511,
129
- "step": 95
130
- },
131
- {
132
- "epoch": 0.14,
133
- "learning_rate": 0.00019905398470798207,
134
- "loss": 0.9538,
135
- "step": 100
136
- },
137
- {
138
- "epoch": 0.15,
139
- "learning_rate": 0.00019868676314081904,
140
- "loss": 0.9471,
141
- "step": 105
142
- },
143
- {
144
- "epoch": 0.15,
145
- "learning_rate": 0.00019825991016445387,
146
- "loss": 0.9514,
147
- "step": 110
148
- },
149
- {
150
- "epoch": 0.16,
151
- "learning_rate": 0.0001977736837045058,
152
- "loss": 0.936,
153
- "step": 115
154
- },
155
- {
156
- "epoch": 0.17,
157
- "learning_rate": 0.00019722837756298113,
158
- "loss": 0.9268,
159
- "step": 120
160
- },
161
- {
162
- "epoch": 0.18,
163
- "learning_rate": 0.00019662432124074327,
164
- "loss": 0.93,
165
- "step": 125
166
- },
167
- {
168
- "epoch": 0.18,
169
- "learning_rate": 0.00019596187973841217,
170
- "loss": 0.9328,
171
- "step": 130
172
- },
173
- {
174
- "epoch": 0.19,
175
- "learning_rate": 0.00019524145333581317,
176
- "loss": 0.9228,
177
- "step": 135
178
- },
179
- {
180
- "epoch": 0.2,
181
- "learning_rate": 0.00019446347735010763,
182
- "loss": 0.9346,
183
- "step": 140
184
- },
185
- {
186
- "epoch": 0.2,
187
- "learning_rate": 0.00019362842187275355,
188
- "loss": 0.9255,
189
- "step": 145
190
- },
191
- {
192
- "epoch": 0.21,
193
- "learning_rate": 0.00019273679148545245,
194
- "loss": 0.9112,
195
- "step": 150
196
- },
197
- {
198
- "epoch": 0.22,
199
- "learning_rate": 0.00019178912495525677,
200
- "loss": 0.9124,
201
- "step": 155
202
- },
203
- {
204
- "epoch": 0.22,
205
- "learning_rate": 0.00019078599490901984,
206
- "loss": 0.9172,
207
- "step": 160
208
- },
209
- {
210
- "epoch": 0.23,
211
- "learning_rate": 0.0001897280074873868,
212
- "loss": 0.9099,
213
- "step": 165
214
- },
215
- {
216
- "epoch": 0.24,
217
- "learning_rate": 0.00018861580197853422,
218
- "loss": 0.9153,
219
- "step": 170
220
- },
221
- {
222
- "epoch": 0.25,
223
- "learning_rate": 0.00018745005043188103,
224
- "loss": 0.9142,
225
- "step": 175
226
- },
227
- {
228
- "epoch": 0.25,
229
- "learning_rate": 0.00018623145725200278,
230
- "loss": 0.9315,
231
- "step": 180
232
- },
233
- {
234
- "epoch": 0.26,
235
- "learning_rate": 0.00018496075877299584,
236
- "loss": 0.9145,
237
- "step": 185
238
- },
239
- {
240
- "epoch": 0.27,
241
- "learning_rate": 0.00018363872281354797,
242
- "loss": 0.8927,
243
- "step": 190
244
- },
245
- {
246
- "epoch": 0.27,
247
- "learning_rate": 0.0001822661482129844,
248
- "loss": 0.9047,
249
- "step": 195
250
- },
251
- {
252
- "epoch": 0.28,
253
- "learning_rate": 0.0001808438643485698,
254
- "loss": 0.8972,
255
- "step": 200
256
- },
257
- {
258
- "epoch": 0.29,
259
- "learning_rate": 0.00017937273063435737,
260
- "loss": 0.9032,
261
- "step": 205
262
- },
263
- {
264
- "epoch": 0.3,
265
- "learning_rate": 0.00017785363600188894,
266
- "loss": 0.9133,
267
- "step": 210
268
- },
269
- {
270
- "epoch": 0.3,
271
- "learning_rate": 0.0001762874983630582,
272
- "loss": 0.9044,
273
- "step": 215
274
- },
275
- {
276
- "epoch": 0.31,
277
- "learning_rate": 0.00017467526405546343,
278
- "loss": 0.8999,
279
- "step": 220
280
- },
281
- {
282
- "epoch": 0.32,
283
- "learning_rate": 0.00017301790727058345,
284
- "loss": 0.9027,
285
- "step": 225
286
- },
287
- {
288
- "epoch": 0.32,
289
- "learning_rate": 0.00017131642946512313,
290
- "loss": 0.8985,
291
- "step": 230
292
- },
293
- {
294
- "epoch": 0.33,
295
- "learning_rate": 0.000169571858755884,
296
- "loss": 0.909,
297
- "step": 235
298
- },
299
- {
300
- "epoch": 0.34,
301
- "learning_rate": 0.00016778524929852512,
302
- "loss": 0.8904,
303
- "step": 240
304
- },
305
- {
306
- "epoch": 0.34,
307
- "learning_rate": 0.00016595768065059047,
308
- "loss": 0.8983,
309
- "step": 245
310
- },
311
- {
312
- "epoch": 0.35,
313
- "learning_rate": 0.0001640902571191869,
314
- "loss": 0.8829,
315
- "step": 250
316
- },
317
- {
318
- "epoch": 0.36,
319
- "learning_rate": 0.00016218410709370736,
320
- "loss": 0.9014,
321
- "step": 255
322
- },
323
- {
324
- "epoch": 0.37,
325
- "learning_rate": 0.00016024038236400246,
326
- "loss": 0.8854,
327
- "step": 260
328
- },
329
- {
330
- "epoch": 0.37,
331
- "learning_rate": 0.00015826025742441207,
332
- "loss": 0.8947,
333
- "step": 265
334
- },
335
- {
336
- "epoch": 0.38,
337
- "learning_rate": 0.0001562449287640781,
338
- "loss": 0.894,
339
- "step": 270
340
- },
341
- {
342
- "epoch": 0.39,
343
- "learning_rate": 0.00015419561414396657,
344
- "loss": 0.898,
345
- "step": 275
346
- },
347
- {
348
- "epoch": 0.39,
349
- "learning_rate": 0.00015211355186103655,
350
- "loss": 0.9012,
351
- "step": 280
352
- },
353
- {
354
- "epoch": 0.4,
355
- "learning_rate": 0.00015000000000000001,
356
- "loss": 0.8836,
357
- "step": 285
358
- },
359
- {
360
- "epoch": 0.41,
361
- "learning_rate": 0.00014785623567312492,
362
- "loss": 0.8784,
363
- "step": 290
364
- },
365
- {
366
- "epoch": 0.41,
367
- "learning_rate": 0.00014568355424854113,
368
- "loss": 0.8905,
369
- "step": 295
370
- },
371
- {
372
- "epoch": 0.42,
373
- "learning_rate": 0.00014348326856751496,
374
- "loss": 0.8873,
375
- "step": 300
376
- },
377
- {
378
- "epoch": 0.43,
379
- "learning_rate": 0.00014125670815116588,
380
- "loss": 0.8773,
381
- "step": 305
382
- },
383
- {
384
- "epoch": 0.44,
385
- "learning_rate": 0.00013900521839710426,
386
- "loss": 0.9042,
387
- "step": 310
388
- },
389
- {
390
- "epoch": 0.44,
391
- "learning_rate": 0.00013673015976647568,
392
- "loss": 0.9007,
393
- "step": 315
394
  },
395
  {
396
  "epoch": 0.45,
397
- "learning_rate": 0.00013443290696190334,
398
- "loss": 0.8909,
399
- "step": 320
400
- },
401
- {
402
- "epoch": 0.46,
403
- "learning_rate": 0.00013211484809682483,
404
- "loss": 0.887,
405
- "step": 325
406
- },
407
- {
408
- "epoch": 0.46,
409
- "learning_rate": 0.00012977738385672557,
410
- "loss": 0.8729,
411
- "step": 330
412
- },
413
- {
414
- "epoch": 0.47,
415
- "learning_rate": 0.00012742192665277568,
416
- "loss": 0.8833,
417
- "step": 335
418
- },
419
- {
420
- "epoch": 0.48,
421
- "learning_rate": 0.00012504989976838132,
422
- "loss": 0.8891,
423
- "step": 340
424
- },
425
- {
426
- "epoch": 0.48,
427
- "learning_rate": 0.0001226627364991667,
428
- "loss": 0.884,
429
- "step": 345
430
- },
431
- {
432
- "epoch": 0.49,
433
- "learning_rate": 0.00012026187928690629,
434
- "loss": 0.886,
435
- "step": 350
436
- },
437
- {
438
- "epoch": 0.5,
439
- "learning_rate": 0.00011784877884793031,
440
- "loss": 0.8876,
441
- "step": 355
442
- },
443
- {
444
- "epoch": 0.51,
445
- "learning_rate": 0.00011542489329653024,
446
- "loss": 0.8865,
447
- "step": 360
448
- },
449
- {
450
- "epoch": 0.51,
451
- "learning_rate": 0.00011299168726389448,
452
- "loss": 0.8861,
453
- "step": 365
454
- },
455
- {
456
- "epoch": 0.52,
457
- "learning_rate": 0.00011055063101310581,
458
- "loss": 0.874,
459
- "step": 370
460
- },
461
- {
462
- "epoch": 0.53,
463
- "learning_rate": 0.00010810319955073601,
464
- "loss": 0.8807,
465
- "step": 375
466
- },
467
- {
468
- "epoch": 0.53,
469
- "learning_rate": 0.00010565087173557395,
470
- "loss": 0.8913,
471
- "step": 380
472
- },
473
- {
474
- "epoch": 0.54,
475
- "learning_rate": 0.00010319512938502654,
476
- "loss": 0.8937,
477
- "step": 385
478
- },
479
- {
480
- "epoch": 0.55,
481
- "learning_rate": 0.00010073745637973124,
482
- "loss": 0.8738,
483
- "step": 390
484
- },
485
- {
486
- "epoch": 0.56,
487
- "learning_rate": 9.827933776692235e-05,
488
- "loss": 0.8868,
489
- "step": 395
490
- },
491
- {
492
- "epoch": 0.56,
493
- "learning_rate": 9.582225886309217e-05,
494
- "loss": 0.8901,
495
- "step": 400
496
- },
497
- {
498
- "epoch": 0.57,
499
- "learning_rate": 9.336770435648964e-05,
500
- "loss": 0.881,
501
- "step": 405
502
- },
503
- {
504
- "epoch": 0.58,
505
- "learning_rate": 9.091715740999828e-05,
506
- "loss": 0.8922,
507
- "step": 410
508
- },
509
- {
510
- "epoch": 0.58,
511
- "learning_rate": 8.84720987649363e-05,
512
- "loss": 0.8695,
513
- "step": 415
514
- },
515
- {
516
- "epoch": 0.59,
517
- "learning_rate": 8.60340058463194e-05,
518
- "loss": 0.8959,
519
- "step": 420
520
- },
521
- {
522
- "epoch": 0.6,
523
- "learning_rate": 8.360435187012788e-05,
524
- "loss": 0.8757,
525
- "step": 425
526
- },
527
- {
528
- "epoch": 0.6,
529
- "learning_rate": 8.118460495311686e-05,
530
- "loss": 0.8832,
531
- "step": 430
532
- },
533
- {
534
- "epoch": 0.61,
535
- "learning_rate": 7.877622722570771e-05,
536
- "loss": 0.8807,
537
- "step": 435
538
- },
539
- {
540
- "epoch": 0.62,
541
- "learning_rate": 7.638067394849671e-05,
542
- "loss": 0.8664,
543
- "step": 440
544
- },
545
- {
546
- "epoch": 0.63,
547
- "learning_rate": 7.399939263291493e-05,
548
- "loss": 0.8539,
549
- "step": 445
550
- },
551
- {
552
- "epoch": 0.63,
553
- "learning_rate": 7.163382216657034e-05,
554
- "loss": 0.8616,
555
- "step": 450
556
- },
557
- {
558
- "epoch": 0.64,
559
- "learning_rate": 6.928539194380102e-05,
560
- "loss": 0.8619,
561
- "step": 455
562
- },
563
- {
564
- "epoch": 0.65,
565
- "learning_rate": 6.695552100196452e-05,
566
- "loss": 0.8686,
567
- "step": 460
568
- },
569
- {
570
- "epoch": 0.65,
571
- "learning_rate": 6.464561716398565e-05,
572
- "loss": 0.8732,
573
- "step": 465
574
- },
575
- {
576
- "epoch": 0.66,
577
- "learning_rate": 6.235707618768032e-05,
578
- "loss": 0.8701,
579
- "step": 470
580
- },
581
- {
582
- "epoch": 0.67,
583
- "learning_rate": 6.009128092236983e-05,
584
- "loss": 0.8812,
585
- "step": 475
586
- },
587
- {
588
- "epoch": 0.67,
589
- "learning_rate": 5.784960047329519e-05,
590
- "loss": 0.8758,
591
- "step": 480
592
- },
593
- {
594
- "epoch": 0.68,
595
- "learning_rate": 5.563338937433622e-05,
596
- "loss": 0.877,
597
- "step": 485
598
- },
599
- {
600
- "epoch": 0.69,
601
- "learning_rate": 5.344398676953526e-05,
602
- "loss": 0.8754,
603
- "step": 490
604
- },
605
- {
606
- "epoch": 0.7,
607
- "learning_rate": 5.1282715603920374e-05,
608
- "loss": 0.8702,
609
- "step": 495
610
- },
611
- {
612
- "epoch": 0.7,
613
- "learning_rate": 4.915088182411675e-05,
614
- "loss": 0.8798,
615
- "step": 500
616
- },
617
- {
618
- "epoch": 0.71,
619
- "learning_rate": 4.7049773589229306e-05,
620
- "loss": 0.8813,
621
- "step": 505
622
- },
623
- {
624
- "epoch": 0.72,
625
- "learning_rate": 4.498066049247344e-05,
626
- "loss": 0.8857,
627
- "step": 510
628
- },
629
- {
630
- "epoch": 0.72,
631
- "learning_rate": 4.29447927940242e-05,
632
- "loss": 0.8769,
633
- "step": 515
634
- },
635
- {
636
- "epoch": 0.73,
637
- "learning_rate": 4.094340066554743e-05,
638
- "loss": 0.8794,
639
- "step": 520
640
- },
641
- {
642
- "epoch": 0.74,
643
- "learning_rate": 3.897769344686929e-05,
644
- "loss": 0.8766,
645
- "step": 525
646
- },
647
- {
648
- "epoch": 0.74,
649
- "learning_rate": 3.7048858915233664e-05,
650
- "loss": 0.8678,
651
- "step": 530
652
- },
653
- {
654
- "epoch": 0.75,
655
- "learning_rate": 3.515806256758847e-05,
656
- "loss": 0.8538,
657
- "step": 535
658
- },
659
- {
660
- "epoch": 0.76,
661
- "learning_rate": 3.330644691633492e-05,
662
- "loss": 0.8692,
663
- "step": 540
664
- },
665
- {
666
- "epoch": 0.77,
667
- "learning_rate": 3.149513079896521e-05,
668
- "loss": 0.8638,
669
- "step": 545
670
- },
671
- {
672
- "epoch": 0.77,
673
- "learning_rate": 2.9725208702005734e-05,
674
- "loss": 0.8612,
675
- "step": 550
676
- },
677
- {
678
- "epoch": 0.78,
679
- "learning_rate": 2.799775009967428e-05,
680
- "loss": 0.872,
681
- "step": 555
682
- },
683
- {
684
- "epoch": 0.79,
685
- "learning_rate": 2.631379880765107e-05,
686
- "loss": 0.8836,
687
- "step": 560
688
- },
689
- {
690
- "epoch": 0.79,
691
- "learning_rate": 2.4674372352353782e-05,
692
- "loss": 0.8773,
693
- "step": 565
694
- },
695
- {
696
- "epoch": 0.8,
697
- "learning_rate": 2.3080461356097937e-05,
698
- "loss": 0.8609,
699
- "step": 570
700
- },
701
- {
702
- "epoch": 0.81,
703
- "learning_rate": 2.1533028938514012e-05,
704
- "loss": 0.8571,
705
- "step": 575
706
- },
707
- {
708
- "epoch": 0.82,
709
- "learning_rate": 2.0033010134583086e-05,
710
- "loss": 0.8782,
711
- "step": 580
712
- },
713
- {
714
- "epoch": 0.82,
715
- "learning_rate": 1.858131132964259e-05,
716
- "loss": 0.878,
717
- "step": 585
718
- },
719
- {
720
- "epoch": 0.83,
721
- "learning_rate": 1.7178809711703523e-05,
722
- "loss": 0.8755,
723
- "step": 590
724
- },
725
- {
726
- "epoch": 0.84,
727
- "learning_rate": 1.5826352741410334e-05,
728
- "loss": 0.8753,
729
- "step": 595
730
- },
731
- {
732
- "epoch": 0.84,
733
- "learning_rate": 1.452475763996326e-05,
734
- "loss": 0.8657,
735
- "step": 600
736
- },
737
- {
738
- "epoch": 0.85,
739
- "learning_rate": 1.3274810895313083e-05,
740
- "loss": 0.8544,
741
- "step": 605
742
- },
743
- {
744
- "epoch": 0.86,
745
- "learning_rate": 1.207726778692625e-05,
746
- "loss": 0.8521,
747
- "step": 610
748
- },
749
- {
750
- "epoch": 0.86,
751
- "learning_rate": 1.0932851929407827e-05,
752
- "loss": 0.8825,
753
- "step": 615
754
- },
755
- {
756
- "epoch": 0.87,
757
- "learning_rate": 9.842254835257791e-06,
758
- "loss": 0.8695,
759
- "step": 620
760
- },
761
- {
762
- "epoch": 0.88,
763
- "learning_rate": 8.80613549702518e-06,
764
- "loss": 0.8845,
765
- "step": 625
766
- },
767
- {
768
- "epoch": 0.89,
769
- "learning_rate": 7.825119989112173e-06,
770
- "loss": 0.8821,
771
- "step": 630
772
- },
773
- {
774
- "epoch": 0.89,
775
- "learning_rate": 6.899801089469204e-06,
776
- "loss": 0.8722,
777
- "step": 635
778
  },
779
  {
780
  "epoch": 0.9,
781
  "learning_rate": 6.030737921409169e-06,
782
- "loss": 0.8653,
783
- "step": 640
784
- },
785
- {
786
- "epoch": 0.91,
787
- "learning_rate": 5.2184556157576e-06,
788
- "loss": 0.8717,
789
- "step": 645
790
- },
791
- {
792
- "epoch": 0.91,
793
- "learning_rate": 4.463444993542721e-06,
794
- "loss": 0.8811,
795
- "step": 650
796
- },
797
- {
798
- "epoch": 0.92,
799
- "learning_rate": 3.7661622694171394e-06,
800
- "loss": 0.8849,
801
- "step": 655
802
- },
803
- {
804
- "epoch": 0.93,
805
- "learning_rate": 3.127028775990515e-06,
806
- "loss": 0.8598,
807
- "step": 660
808
- },
809
- {
810
- "epoch": 0.93,
811
- "learning_rate": 2.546430709239578e-06,
812
- "loss": 0.8475,
813
- "step": 665
814
- },
815
- {
816
- "epoch": 0.94,
817
- "learning_rate": 2.02471889514948e-06,
818
- "loss": 0.8658,
819
- "step": 670
820
- },
821
- {
822
- "epoch": 0.95,
823
- "learning_rate": 1.562208577727442e-06,
824
- "loss": 0.8674,
825
- "step": 675
826
- },
827
- {
828
- "epoch": 0.96,
829
- "learning_rate": 1.1591792285167603e-06,
830
- "loss": 0.8774,
831
- "step": 680
832
- },
833
- {
834
- "epoch": 0.96,
835
- "learning_rate": 8.158743777263333e-07,
836
- "loss": 0.8786,
837
- "step": 685
838
- },
839
- {
840
- "epoch": 0.97,
841
- "learning_rate": 5.325014670776951e-07,
842
- "loss": 0.8766,
843
- "step": 690
844
- },
845
- {
846
- "epoch": 0.98,
847
- "learning_rate": 3.092317244584919e-07,
848
- "loss": 0.874,
849
- "step": 695
850
- },
851
- {
852
- "epoch": 0.98,
853
- "learning_rate": 1.4620006045816815e-07,
854
- "loss": 0.8683,
855
- "step": 700
856
  },
857
  {
858
  "epoch": 0.99,
859
- "learning_rate": 4.350498684829729e-08,
860
- "loss": 0.8548,
861
- "step": 705
 
 
862
  },
863
  {
864
- "epoch": 1.0,
865
- "learning_rate": 1.20855705696421e-09,
866
- "loss": 0.8799,
867
- "step": 710
868
- },
869
- {
870
- "epoch": 1.0,
871
- "eval_loss": 0.8712352514266968,
872
- "eval_runtime": 637.1016,
873
- "eval_samples_per_second": 0.995,
874
- "eval_steps_per_second": 0.126,
875
- "step": 711
876
- },
877
- {
878
- "epoch": 1.0,
879
- "step": 711,
880
- "total_flos": 9.02655718774014e+17,
881
- "train_loss": 0.9507392455421587,
882
- "train_runtime": 19724.4028,
883
- "train_samples_per_second": 0.288,
884
- "train_steps_per_second": 0.036
885
  }
886
  ],
887
  "logging_steps": 5,
888
- "max_steps": 711,
889
  "num_input_tokens_seen": 0,
890
  "num_train_epochs": 1,
891
  "save_steps": 100,
892
- "total_flos": 9.02655718774014e+17,
893
  "train_batch_size": 4,
894
  "trial_name": null,
895
  "trial_params": null
 
  {
  "best_metric": null,
  "best_model_checkpoint": null,
+ "epoch": 0.9894588896697118,
  "eval_steps": 500,
+ "global_step": 11,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
  "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.8236,
+ "step": 1
  },
  {
  "epoch": 0.45,
+ "learning_rate": 0.00015000000000000001,
+ "loss": 1.775,
+ "step": 5
  },
  {
  "epoch": 0.9,
  "learning_rate": 6.030737921409169e-06,
+ "loss": 1.5495,
+ "step": 10
  },
  {
  "epoch": 0.99,
+ "eval_loss": 1.506221055984497,
+ "eval_runtime": 640.0172,
+ "eval_samples_per_second": 0.991,
+ "eval_steps_per_second": 0.125,
+ "step": 11
  },
  {
+ "epoch": 0.99,
+ "step": 11,
+ "total_flos": 9.067629788193096e+17,
+ "train_loss": 1.6525719924406572,
+ "train_runtime": 19611.737,
+ "train_samples_per_second": 0.29,
+ "train_steps_per_second": 0.001
  }
  ],
  "logging_steps": 5,
+ "max_steps": 11,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
+ "total_flos": 9.067629788193096e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e73049ec1c34175942c3be0595b94d5c7201a0e1f8ef51232c21bea6cb57bea8
+ oid sha256:9d2eb55aad41f2b966da5a317095cc531f303dfdfe517a3735c14a12b6a6b483
  size 4792
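training_args.bin is the pickled `TrainingArguments` object rather than plain text, which is why only its LFS hash changes in this commit. To compare the old and new arguments you can load each revision with torch; this requires a compatible `transformers` install, since unpickling reconstructs the original class.

```python
import torch

# Load the pickled TrainingArguments saved alongside the checkpoint.
# weights_only=False is needed on recent PyTorch versions because this is an
# arbitrary pickled object, not a tensor-only checkpoint.
args = torch.load("training_args.bin", weights_only=False)
print(args.gradient_accumulation_steps, args.learning_rate, args.lr_scheduler_type)
```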