sedrickkeh commited on
Commit
6dfb5b1
·
verified ·
1 Parent(s): 8af96d8

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_gpteacher
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_gpteacher
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6407
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_gpteacher
 
16
 
17
  # OH_DCFT_V3_wo_gpteacher
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_gpteacher dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6407
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.6440668702125549,
4
- "eval_runtime": 227.293,
5
- "eval_samples_per_second": 49.786,
6
- "eval_steps_per_second": 0.392,
7
  "total_flos": 2110128169943040.0,
8
- "train_loss": 0.6162170792382861,
9
- "train_runtime": 37920.9719,
10
- "train_samples_per_second": 17.008,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.64065021276474,
4
+ "eval_runtime": 226.5797,
5
+ "eval_samples_per_second": 49.943,
6
+ "eval_steps_per_second": 0.393,
7
  "total_flos": 2110128169943040.0,
8
+ "train_loss": 0.6183440295476762,
9
+ "train_runtime": 37778.6065,
10
+ "train_samples_per_second": 17.072,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.6440668702125549,
4
- "eval_runtime": 227.293,
5
- "eval_samples_per_second": 49.786,
6
- "eval_steps_per_second": 0.392
7
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.64065021276474,
4
+ "eval_runtime": 226.5797,
5
+ "eval_samples_per_second": 49.943,
6
+ "eval_steps_per_second": 0.393
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 2110128169943040.0,
4
- "train_loss": 0.6162170792382861,
5
- "train_runtime": 37920.9719,
6
- "train_samples_per_second": 17.008,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 2110128169943040.0,
4
+ "train_loss": 0.6183440295476762,
5
+ "train_runtime": 37778.6065,
6
+ "train_samples_per_second": 17.072,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,907 +10,907 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.023809523809523808,
13
- "grad_norm": 2.979096394688749,
14
  "learning_rate": 5e-06,
15
- "loss": 0.9025,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.047619047619047616,
20
- "grad_norm": 2.223142048644808,
21
  "learning_rate": 5e-06,
22
- "loss": 0.7906,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07142857142857142,
27
- "grad_norm": 0.9659685115817319,
28
  "learning_rate": 5e-06,
29
- "loss": 0.7618,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.09523809523809523,
34
- "grad_norm": 2.5886379347886272,
35
  "learning_rate": 5e-06,
36
- "loss": 0.7377,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.11904761904761904,
41
- "grad_norm": 1.0245503548131887,
42
  "learning_rate": 5e-06,
43
- "loss": 0.7259,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.14285714285714285,
48
- "grad_norm": 1.738748891091501,
49
  "learning_rate": 5e-06,
50
- "loss": 0.7176,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16666666666666666,
55
- "grad_norm": 1.2057121051713384,
56
  "learning_rate": 5e-06,
57
- "loss": 0.7096,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.19047619047619047,
62
- "grad_norm": 0.9909295460877846,
63
  "learning_rate": 5e-06,
64
- "loss": 0.6993,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21428571428571427,
69
- "grad_norm": 1.0715651790004184,
70
  "learning_rate": 5e-06,
71
- "loss": 0.6967,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.23809523809523808,
76
- "grad_norm": 0.6693921792141905,
77
  "learning_rate": 5e-06,
78
- "loss": 0.6894,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2619047619047619,
83
- "grad_norm": 0.7540545910263109,
84
  "learning_rate": 5e-06,
85
- "loss": 0.6897,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.2857142857142857,
90
- "grad_norm": 0.9619591233907567,
91
  "learning_rate": 5e-06,
92
- "loss": 0.6791,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.30952380952380953,
97
- "grad_norm": 0.5780917568032095,
98
  "learning_rate": 5e-06,
99
- "loss": 0.6797,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.3333333333333333,
104
- "grad_norm": 0.9635552433253597,
105
  "learning_rate": 5e-06,
106
- "loss": 0.6841,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.35714285714285715,
111
- "grad_norm": 0.8584456045570616,
112
  "learning_rate": 5e-06,
113
- "loss": 0.6867,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.38095238095238093,
118
- "grad_norm": 0.7712335735700714,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6742,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.40476190476190477,
125
- "grad_norm": 0.5607748596375883,
126
  "learning_rate": 5e-06,
127
- "loss": 0.6645,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.42857142857142855,
132
- "grad_norm": 0.6860408185076606,
133
  "learning_rate": 5e-06,
134
- "loss": 0.67,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.4523809523809524,
139
- "grad_norm": 0.6423601136831741,
140
  "learning_rate": 5e-06,
141
- "loss": 0.6617,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.47619047619047616,
146
- "grad_norm": 0.6913927833272892,
147
  "learning_rate": 5e-06,
148
- "loss": 0.6693,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5,
153
- "grad_norm": 0.731217906174741,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6613,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5238095238095238,
160
- "grad_norm": 0.7416700934245687,
161
  "learning_rate": 5e-06,
162
- "loss": 0.665,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5476190476190477,
167
- "grad_norm": 0.693116297233903,
168
  "learning_rate": 5e-06,
169
- "loss": 0.6664,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5714285714285714,
174
- "grad_norm": 0.6060455581468875,
175
  "learning_rate": 5e-06,
176
- "loss": 0.6685,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.5952380952380952,
181
- "grad_norm": 0.5949942186099269,
182
  "learning_rate": 5e-06,
183
- "loss": 0.6648,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6190476190476191,
188
- "grad_norm": 0.6650051022426187,
189
  "learning_rate": 5e-06,
190
- "loss": 0.6698,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.6428571428571429,
195
- "grad_norm": 0.4881193268448229,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6579,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6666666666666666,
202
- "grad_norm": 0.6835244906664516,
203
  "learning_rate": 5e-06,
204
- "loss": 0.663,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6904761904761905,
209
- "grad_norm": 0.549317421807424,
210
  "learning_rate": 5e-06,
211
- "loss": 0.6693,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7142857142857143,
216
- "grad_norm": 0.7488182034561941,
217
  "learning_rate": 5e-06,
218
- "loss": 0.654,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7380952380952381,
223
- "grad_norm": 0.5270894181702861,
224
  "learning_rate": 5e-06,
225
- "loss": 0.6587,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7619047619047619,
230
- "grad_norm": 0.5020088435096579,
231
  "learning_rate": 5e-06,
232
- "loss": 0.6555,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7857142857142857,
237
- "grad_norm": 0.6580789108763763,
238
  "learning_rate": 5e-06,
239
- "loss": 0.6594,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8095238095238095,
244
- "grad_norm": 0.5327515064364189,
245
  "learning_rate": 5e-06,
246
- "loss": 0.6562,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8333333333333334,
251
- "grad_norm": 0.6766119147927919,
252
  "learning_rate": 5e-06,
253
- "loss": 0.6622,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8571428571428571,
258
- "grad_norm": 0.48312467069161585,
259
  "learning_rate": 5e-06,
260
- "loss": 0.6499,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8809523809523809,
265
- "grad_norm": 0.5489308235550809,
266
  "learning_rate": 5e-06,
267
- "loss": 0.6511,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9047619047619048,
272
- "grad_norm": 0.531393638730665,
273
  "learning_rate": 5e-06,
274
- "loss": 0.6474,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9285714285714286,
279
- "grad_norm": 0.5442860805200723,
280
  "learning_rate": 5e-06,
281
- "loss": 0.647,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9523809523809523,
286
- "grad_norm": 0.5341470533579747,
287
  "learning_rate": 5e-06,
288
- "loss": 0.6524,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9761904761904762,
293
- "grad_norm": 0.5888795467394295,
294
  "learning_rate": 5e-06,
295
- "loss": 0.6542,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 1.0,
300
- "grad_norm": 0.49834935458120216,
301
  "learning_rate": 5e-06,
302
- "loss": 0.656,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 1.0,
307
- "eval_loss": 0.6510941386222839,
308
- "eval_runtime": 227.0311,
309
- "eval_samples_per_second": 49.843,
310
- "eval_steps_per_second": 0.392,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.0238095238095237,
315
- "grad_norm": 0.6687540925652379,
316
  "learning_rate": 5e-06,
317
- "loss": 0.6017,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0476190476190477,
322
- "grad_norm": 0.6881882382037877,
323
  "learning_rate": 5e-06,
324
- "loss": 0.605,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.0714285714285714,
329
- "grad_norm": 0.6838107122997646,
330
  "learning_rate": 5e-06,
331
- "loss": 0.6057,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.0952380952380953,
336
- "grad_norm": 0.6206546806913035,
337
  "learning_rate": 5e-06,
338
- "loss": 0.6025,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.119047619047619,
343
- "grad_norm": 0.5234446949681405,
344
  "learning_rate": 5e-06,
345
- "loss": 0.6026,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.1428571428571428,
350
- "grad_norm": 0.5688053143712357,
351
  "learning_rate": 5e-06,
352
- "loss": 0.6084,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1666666666666667,
357
- "grad_norm": 0.5144927383094415,
358
  "learning_rate": 5e-06,
359
- "loss": 0.6056,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.1904761904761905,
364
- "grad_norm": 0.5798202925506201,
365
  "learning_rate": 5e-06,
366
- "loss": 0.6051,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2142857142857142,
371
- "grad_norm": 0.5763013849243751,
372
  "learning_rate": 5e-06,
373
- "loss": 0.6072,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.2380952380952381,
378
- "grad_norm": 0.516136225290882,
379
  "learning_rate": 5e-06,
380
- "loss": 0.6039,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2619047619047619,
385
- "grad_norm": 0.5434868232176754,
386
  "learning_rate": 5e-06,
387
- "loss": 0.6049,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.2857142857142856,
392
- "grad_norm": 0.5781098423365609,
393
  "learning_rate": 5e-06,
394
- "loss": 0.6106,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3095238095238095,
399
- "grad_norm": 0.6160851889527316,
400
  "learning_rate": 5e-06,
401
- "loss": 0.6131,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.3333333333333333,
406
- "grad_norm": 0.6505657162711183,
407
  "learning_rate": 5e-06,
408
- "loss": 0.6044,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.3571428571428572,
413
- "grad_norm": 0.7226294537660097,
414
  "learning_rate": 5e-06,
415
- "loss": 0.5995,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.380952380952381,
420
- "grad_norm": 0.7401627819549035,
421
  "learning_rate": 5e-06,
422
- "loss": 0.6076,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.4047619047619047,
427
- "grad_norm": 0.5976345814355858,
428
  "learning_rate": 5e-06,
429
- "loss": 0.6059,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4285714285714286,
434
- "grad_norm": 0.5726009859635873,
435
  "learning_rate": 5e-06,
436
- "loss": 0.6096,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4523809523809523,
441
- "grad_norm": 0.6957346206924405,
442
  "learning_rate": 5e-06,
443
- "loss": 0.6068,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4761904761904763,
448
- "grad_norm": 0.6013418142360826,
449
  "learning_rate": 5e-06,
450
- "loss": 0.6054,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.5,
455
- "grad_norm": 0.529882487661824,
456
  "learning_rate": 5e-06,
457
- "loss": 0.6094,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5238095238095237,
462
- "grad_norm": 0.5964013041735027,
463
  "learning_rate": 5e-06,
464
- "loss": 0.6087,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.5476190476190477,
469
- "grad_norm": 0.6720997706264525,
470
  "learning_rate": 5e-06,
471
- "loss": 0.6089,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5714285714285714,
476
- "grad_norm": 0.5540017744559399,
477
  "learning_rate": 5e-06,
478
- "loss": 0.6159,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.5952380952380953,
483
- "grad_norm": 0.634842502281549,
484
  "learning_rate": 5e-06,
485
- "loss": 0.6028,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.619047619047619,
490
- "grad_norm": 0.5437572103572672,
491
  "learning_rate": 5e-06,
492
- "loss": 0.6114,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.6428571428571428,
497
- "grad_norm": 0.6906266274795664,
498
  "learning_rate": 5e-06,
499
- "loss": 0.6044,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.6666666666666665,
504
- "grad_norm": 0.5334026010038674,
505
  "learning_rate": 5e-06,
506
- "loss": 0.601,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.6904761904761905,
511
- "grad_norm": 0.5451974027222483,
512
  "learning_rate": 5e-06,
513
- "loss": 0.6032,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7142857142857144,
518
- "grad_norm": 0.6762373268566487,
519
  "learning_rate": 5e-06,
520
- "loss": 0.5986,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.7380952380952381,
525
- "grad_norm": 0.5412074184482999,
526
  "learning_rate": 5e-06,
527
- "loss": 0.6054,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.7619047619047619,
532
- "grad_norm": 0.771274774231781,
533
  "learning_rate": 5e-06,
534
- "loss": 0.6092,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.7857142857142856,
539
- "grad_norm": 0.48730103990677553,
540
  "learning_rate": 5e-06,
541
- "loss": 0.6035,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.8095238095238095,
546
- "grad_norm": 0.6623660039317416,
547
  "learning_rate": 5e-06,
548
- "loss": 0.5956,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8333333333333335,
553
- "grad_norm": 0.47412102834711495,
554
  "learning_rate": 5e-06,
555
- "loss": 0.6046,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.8571428571428572,
560
- "grad_norm": 0.5242018166622929,
561
  "learning_rate": 5e-06,
562
- "loss": 0.6023,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.880952380952381,
567
- "grad_norm": 0.5058468113558267,
568
  "learning_rate": 5e-06,
569
- "loss": 0.6032,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.9047619047619047,
574
- "grad_norm": 0.5259508772857945,
575
  "learning_rate": 5e-06,
576
- "loss": 0.6039,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.9285714285714286,
581
- "grad_norm": 0.5204834424492372,
582
  "learning_rate": 5e-06,
583
- "loss": 0.5964,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.9523809523809523,
588
- "grad_norm": 0.6677307987141299,
589
  "learning_rate": 5e-06,
590
- "loss": 0.6039,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.9761904761904763,
595
- "grad_norm": 0.6038451203219031,
596
  "learning_rate": 5e-06,
597
- "loss": 0.6051,
598
  "step": 830
599
  },
600
  {
601
  "epoch": 2.0,
602
- "grad_norm": 0.5104923296850143,
603
  "learning_rate": 5e-06,
604
- "loss": 0.6054,
605
  "step": 840
606
  },
607
  {
608
  "epoch": 2.0,
609
- "eval_loss": 0.6414868831634521,
610
- "eval_runtime": 226.8523,
611
- "eval_samples_per_second": 49.883,
612
  "eval_steps_per_second": 0.392,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.0238095238095237,
617
- "grad_norm": 0.6730357183628786,
618
  "learning_rate": 5e-06,
619
- "loss": 0.5583,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.0476190476190474,
624
- "grad_norm": 0.6443977651353381,
625
  "learning_rate": 5e-06,
626
- "loss": 0.5484,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.0714285714285716,
631
- "grad_norm": 0.6621011415103402,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5541,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.0952380952380953,
638
- "grad_norm": 0.5941142046912498,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5557,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.119047619047619,
645
- "grad_norm": 0.6880423100090476,
646
  "learning_rate": 5e-06,
647
- "loss": 0.5522,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.142857142857143,
652
- "grad_norm": 0.5826827164455727,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5575,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.1666666666666665,
659
- "grad_norm": 0.546984665323485,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5573,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.1904761904761907,
666
- "grad_norm": 0.6578594473450741,
667
  "learning_rate": 5e-06,
668
- "loss": 0.5572,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.2142857142857144,
673
- "grad_norm": 0.5174066412507444,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5558,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.238095238095238,
680
- "grad_norm": 0.5665396877262667,
681
  "learning_rate": 5e-06,
682
- "loss": 0.5591,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.261904761904762,
687
- "grad_norm": 0.5517767824029327,
688
  "learning_rate": 5e-06,
689
- "loss": 0.5686,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.2857142857142856,
694
- "grad_norm": 0.7067335515343864,
695
  "learning_rate": 5e-06,
696
- "loss": 0.5571,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.3095238095238093,
701
- "grad_norm": 0.5430060019384252,
702
  "learning_rate": 5e-06,
703
- "loss": 0.5482,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3333333333333335,
708
- "grad_norm": 0.6706596793287323,
709
  "learning_rate": 5e-06,
710
- "loss": 0.5556,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.357142857142857,
715
- "grad_norm": 0.566293047488571,
716
  "learning_rate": 5e-06,
717
- "loss": 0.5641,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.380952380952381,
722
- "grad_norm": 0.6116527741940925,
723
  "learning_rate": 5e-06,
724
- "loss": 0.5622,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.4047619047619047,
729
- "grad_norm": 0.6143562660668103,
730
  "learning_rate": 5e-06,
731
- "loss": 0.568,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4285714285714284,
736
- "grad_norm": 0.6090596093797429,
737
  "learning_rate": 5e-06,
738
- "loss": 0.5636,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.4523809523809526,
743
- "grad_norm": 0.5243339859967657,
744
  "learning_rate": 5e-06,
745
- "loss": 0.5611,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.4761904761904763,
750
- "grad_norm": 0.5859833474274038,
751
  "learning_rate": 5e-06,
752
- "loss": 0.5646,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.5,
757
- "grad_norm": 0.6091596308635463,
758
  "learning_rate": 5e-06,
759
- "loss": 0.5616,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.5238095238095237,
764
- "grad_norm": 0.5720777402597989,
765
  "learning_rate": 5e-06,
766
- "loss": 0.5625,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5476190476190474,
771
- "grad_norm": 0.6196708033793199,
772
  "learning_rate": 5e-06,
773
- "loss": 0.5631,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.571428571428571,
778
- "grad_norm": 0.6052531031436665,
779
  "learning_rate": 5e-06,
780
- "loss": 0.5566,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.5952380952380953,
785
- "grad_norm": 0.554207795958823,
786
  "learning_rate": 5e-06,
787
- "loss": 0.5659,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.619047619047619,
792
- "grad_norm": 0.564240933144199,
793
  "learning_rate": 5e-06,
794
- "loss": 0.5594,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.642857142857143,
799
- "grad_norm": 0.6888429556851143,
800
  "learning_rate": 5e-06,
801
- "loss": 0.5661,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.6666666666666665,
806
- "grad_norm": 0.506844447200667,
807
  "learning_rate": 5e-06,
808
- "loss": 0.5625,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.6904761904761907,
813
- "grad_norm": 0.5934885397813097,
814
  "learning_rate": 5e-06,
815
- "loss": 0.5603,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.7142857142857144,
820
- "grad_norm": 0.5569090391621104,
821
  "learning_rate": 5e-06,
822
- "loss": 0.5626,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.738095238095238,
827
- "grad_norm": 0.523043491964592,
828
  "learning_rate": 5e-06,
829
- "loss": 0.5645,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.761904761904762,
834
- "grad_norm": 0.6108836571145032,
835
  "learning_rate": 5e-06,
836
- "loss": 0.5648,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.7857142857142856,
841
- "grad_norm": 0.5434168095034596,
842
  "learning_rate": 5e-06,
843
- "loss": 0.5592,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.8095238095238093,
848
- "grad_norm": 0.5074570216881327,
849
  "learning_rate": 5e-06,
850
- "loss": 0.5612,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.8333333333333335,
855
- "grad_norm": 0.8613619535923634,
856
  "learning_rate": 5e-06,
857
- "loss": 0.5673,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.857142857142857,
862
- "grad_norm": 0.5411037216568878,
863
  "learning_rate": 5e-06,
864
- "loss": 0.568,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.880952380952381,
869
- "grad_norm": 0.5977798849441338,
870
  "learning_rate": 5e-06,
871
- "loss": 0.5652,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.9047619047619047,
876
- "grad_norm": 0.5535606431439672,
877
  "learning_rate": 5e-06,
878
- "loss": 0.5515,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.928571428571429,
883
- "grad_norm": 0.541578167814688,
884
  "learning_rate": 5e-06,
885
- "loss": 0.5605,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.9523809523809526,
890
- "grad_norm": 0.6599900818842629,
891
  "learning_rate": 5e-06,
892
- "loss": 0.5592,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.9761904761904763,
897
- "grad_norm": 0.5712008591735968,
898
  "learning_rate": 5e-06,
899
- "loss": 0.567,
900
  "step": 1250
901
  },
902
  {
903
  "epoch": 3.0,
904
- "grad_norm": 0.5872625935196057,
905
  "learning_rate": 5e-06,
906
- "loss": 0.5671,
907
  "step": 1260
908
  },
909
  {
910
  "epoch": 3.0,
911
- "eval_loss": 0.6440668702125549,
912
- "eval_runtime": 227.2775,
913
- "eval_samples_per_second": 49.789,
914
  "eval_steps_per_second": 0.392,
915
  "step": 1260
916
  },
@@ -918,9 +918,9 @@
918
  "epoch": 3.0,
919
  "step": 1260,
920
  "total_flos": 2110128169943040.0,
921
- "train_loss": 0.6162170792382861,
922
- "train_runtime": 37920.9719,
923
- "train_samples_per_second": 17.008,
924
  "train_steps_per_second": 0.033
925
  }
926
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.023809523809523808,
13
+ "grad_norm": 8.445232891321785,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.8762,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.047619047619047616,
20
+ "grad_norm": 0.904843982790976,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.7786,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07142857142857142,
27
+ "grad_norm": 0.8545978145052984,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.7527,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.09523809523809523,
34
+ "grad_norm": 1.6189319907091486,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7327,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.11904761904761904,
41
+ "grad_norm": 3.2111995463545036,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.721,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.14285714285714285,
48
+ "grad_norm": 1.5622579588100545,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.7155,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16666666666666666,
55
+ "grad_norm": 0.9194869158617756,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.7073,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.19047619047619047,
62
+ "grad_norm": 1.1360615566124799,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.6982,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21428571428571427,
69
+ "grad_norm": 0.6337591570267407,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.6948,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.23809523809523808,
76
+ "grad_norm": 0.6294601373379741,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.6868,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2619047619047619,
83
+ "grad_norm": 0.7227604147889286,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.6873,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.2857142857142857,
90
+ "grad_norm": 0.6329554150338043,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.6767,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.30952380952380953,
97
+ "grad_norm": 0.7399122002499049,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.6773,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.3333333333333333,
104
+ "grad_norm": 0.9154334676485532,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.6816,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.35714285714285715,
111
+ "grad_norm": 0.7517066478000081,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.6844,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.38095238095238093,
118
+ "grad_norm": 0.5944319769110347,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6718,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.40476190476190477,
125
+ "grad_norm": 0.5554236598726099,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6625,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.42857142857142855,
132
+ "grad_norm": 0.5527412054134143,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.6679,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.4523809523809524,
139
+ "grad_norm": 0.6675222560631745,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.6599,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.47619047619047616,
146
+ "grad_norm": 0.5135528660090127,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.6673,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5,
153
+ "grad_norm": 0.5424652267211545,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.6595,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5238095238095238,
160
+ "grad_norm": 0.6772483317418574,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.6632,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5476190476190477,
167
+ "grad_norm": 0.6246331080259045,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6648,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5714285714285714,
174
+ "grad_norm": 0.8107636093046625,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.6669,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.5952380952380952,
181
+ "grad_norm": 0.6057267628051226,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.6632,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6190476190476191,
188
+ "grad_norm": 0.4697523845613325,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.6682,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.6428571428571429,
195
+ "grad_norm": 0.7106341402212613,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6565,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6666666666666666,
202
+ "grad_norm": 0.6640978568662332,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.6614,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6904761904761905,
209
+ "grad_norm": 0.47105154073005556,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6678,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7142857142857143,
216
+ "grad_norm": 0.5076235307186338,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.6526,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7380952380952381,
223
+ "grad_norm": 0.5774485428478285,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6572,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7619047619047619,
230
+ "grad_norm": 0.44356473492316634,
231
  "learning_rate": 5e-06,
232
+ "loss": 0.6541,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7857142857142857,
237
+ "grad_norm": 0.5583060645539694,
238
  "learning_rate": 5e-06,
239
+ "loss": 0.6579,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8095238095238095,
244
+ "grad_norm": 0.5093809388308376,
245
  "learning_rate": 5e-06,
246
+ "loss": 0.6549,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8333333333333334,
251
+ "grad_norm": 0.6286453858043566,
252
  "learning_rate": 5e-06,
253
+ "loss": 0.6609,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8571428571428571,
258
+ "grad_norm": 0.4665382898949909,
259
  "learning_rate": 5e-06,
260
+ "loss": 0.6486,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8809523809523809,
265
+ "grad_norm": 0.48421942235168053,
266
  "learning_rate": 5e-06,
267
+ "loss": 0.6498,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9047619047619048,
272
+ "grad_norm": 0.47968168006990375,
273
  "learning_rate": 5e-06,
274
+ "loss": 0.6463,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9285714285714286,
279
+ "grad_norm": 0.6498313381053872,
280
  "learning_rate": 5e-06,
281
+ "loss": 0.646,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9523809523809523,
286
+ "grad_norm": 0.501652313286086,
287
  "learning_rate": 5e-06,
288
+ "loss": 0.6513,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9761904761904762,
293
+ "grad_norm": 0.6946015511436041,
294
  "learning_rate": 5e-06,
295
+ "loss": 0.6531,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 1.0,
300
+ "grad_norm": 0.4229645503919674,
301
  "learning_rate": 5e-06,
302
+ "loss": 0.655,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 1.0,
307
+ "eval_loss": 0.6498768329620361,
308
+ "eval_runtime": 225.8577,
309
+ "eval_samples_per_second": 50.102,
310
+ "eval_steps_per_second": 0.394,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.0238095238095237,
315
+ "grad_norm": 0.5848273077976011,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.6049,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0476190476190477,
322
+ "grad_norm": 0.6081820995830461,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.6086,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.0714285714285714,
329
+ "grad_norm": 0.7117254288676989,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.6092,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.0952380952380953,
336
+ "grad_norm": 0.5386642213868081,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.6057,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.119047619047619,
343
+ "grad_norm": 0.4838860637353068,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.6063,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.1428571428571428,
350
+ "grad_norm": 0.6899201478964959,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.6117,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1666666666666667,
357
+ "grad_norm": 0.4519793880714107,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.6088,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.1904761904761905,
364
+ "grad_norm": 0.5061125852880405,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.6083,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2142857142857142,
371
+ "grad_norm": 0.5248863647668095,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.6104,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.2380952380952381,
378
+ "grad_norm": 0.5968882106247873,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.6069,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2619047619047619,
385
+ "grad_norm": 0.5501143820464358,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.608,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.2857142857142856,
392
+ "grad_norm": 0.4956499652626632,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.6137,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3095238095238095,
399
+ "grad_norm": 0.5885785221479344,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.616,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.3333333333333333,
406
+ "grad_norm": 0.6508207591047148,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.6076,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.3571428571428572,
413
+ "grad_norm": 0.6567861781663986,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.6024,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.380952380952381,
420
+ "grad_norm": 0.6368222901082733,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.6105,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.4047619047619047,
427
+ "grad_norm": 0.4765033439165101,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.6085,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4285714285714286,
434
+ "grad_norm": 0.5412789060995679,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.6125,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4523809523809523,
441
+ "grad_norm": 0.6550891112594979,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.6098,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4761904761904763,
448
+ "grad_norm": 0.4941780495813866,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.6079,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.5,
455
+ "grad_norm": 0.4616359398639311,
456
  "learning_rate": 5e-06,
457
+ "loss": 0.6124,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5238095238095237,
462
+ "grad_norm": 0.5672461066448958,
463
  "learning_rate": 5e-06,
464
+ "loss": 0.6116,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.5476190476190477,
469
+ "grad_norm": 0.5489798189032514,
470
  "learning_rate": 5e-06,
471
+ "loss": 0.6117,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5714285714285714,
476
+ "grad_norm": 0.48057024394509507,
477
  "learning_rate": 5e-06,
478
+ "loss": 0.6187,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.5952380952380953,
483
+ "grad_norm": 0.5181771441671735,
484
  "learning_rate": 5e-06,
485
+ "loss": 0.6054,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.619047619047619,
490
+ "grad_norm": 0.4437262379380175,
491
  "learning_rate": 5e-06,
492
+ "loss": 0.6137,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.6428571428571428,
497
+ "grad_norm": 0.7378881109321974,
498
  "learning_rate": 5e-06,
499
+ "loss": 0.6068,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.6666666666666665,
504
+ "grad_norm": 0.499037021825346,
505
  "learning_rate": 5e-06,
506
+ "loss": 0.6034,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.6904761904761905,
511
+ "grad_norm": 0.4703517200789926,
512
  "learning_rate": 5e-06,
513
+ "loss": 0.606,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7142857142857144,
518
+ "grad_norm": 0.6587144015590305,
519
  "learning_rate": 5e-06,
520
+ "loss": 0.6012,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.7380952380952381,
525
+ "grad_norm": 0.48833715383705506,
526
  "learning_rate": 5e-06,
527
+ "loss": 0.6079,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.7619047619047619,
532
+ "grad_norm": 0.6685361241012429,
533
  "learning_rate": 5e-06,
534
+ "loss": 0.6116,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.7857142857142856,
539
+ "grad_norm": 0.6149706599367468,
540
  "learning_rate": 5e-06,
541
+ "loss": 0.6062,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.8095238095238095,
546
+ "grad_norm": 0.5091886649503007,
547
  "learning_rate": 5e-06,
548
+ "loss": 0.5981,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8333333333333335,
553
+ "grad_norm": 0.5048302650982107,
554
  "learning_rate": 5e-06,
555
+ "loss": 0.6071,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.8571428571428572,
560
+ "grad_norm": 0.43120999313392744,
561
  "learning_rate": 5e-06,
562
+ "loss": 0.6047,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.880952380952381,
567
+ "grad_norm": 0.43029897993140503,
568
  "learning_rate": 5e-06,
569
+ "loss": 0.6056,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.9047619047619047,
574
+ "grad_norm": 0.496655514939085,
575
  "learning_rate": 5e-06,
576
+ "loss": 0.6063,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.9285714285714286,
581
+ "grad_norm": 0.6058031312406722,
582
  "learning_rate": 5e-06,
583
+ "loss": 0.5987,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.9523809523809523,
588
+ "grad_norm": 0.5801525856212552,
589
  "learning_rate": 5e-06,
590
+ "loss": 0.6062,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.9761904761904763,
595
+ "grad_norm": 0.5287188007584436,
596
  "learning_rate": 5e-06,
597
+ "loss": 0.6075,
598
  "step": 830
599
  },
600
  {
601
  "epoch": 2.0,
602
+ "grad_norm": 0.4693366296588104,
603
  "learning_rate": 5e-06,
604
+ "loss": 0.6078,
605
  "step": 840
606
  },
607
  {
608
  "epoch": 2.0,
609
+ "eval_loss": 0.6398360133171082,
610
+ "eval_runtime": 226.8062,
611
+ "eval_samples_per_second": 49.893,
612
  "eval_steps_per_second": 0.392,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.0238095238095237,
617
+ "grad_norm": 0.6317055308986993,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5656,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.0476190476190474,
624
+ "grad_norm": 0.5508213482577374,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.5557,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.0714285714285716,
631
+ "grad_norm": 0.5665189426466252,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5617,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.0952380952380953,
638
+ "grad_norm": 0.5708719211761654,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5629,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.119047619047619,
645
+ "grad_norm": 0.585919720488859,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5592,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.142857142857143,
652
+ "grad_norm": 0.512641311308892,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.5645,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.1666666666666665,
659
+ "grad_norm": 0.5448411844682196,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.5644,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.1904761904761907,
666
+ "grad_norm": 0.6240052214528613,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.5643,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.2142857142857144,
673
+ "grad_norm": 0.5089889720545453,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.5628,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.238095238095238,
680
+ "grad_norm": 0.4392590521750202,
681
  "learning_rate": 5e-06,
682
+ "loss": 0.5659,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.261904761904762,
687
+ "grad_norm": 0.4848503543872302,
688
  "learning_rate": 5e-06,
689
+ "loss": 0.5755,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.2857142857142856,
694
+ "grad_norm": 0.6304825214497957,
695
  "learning_rate": 5e-06,
696
+ "loss": 0.564,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.3095238095238093,
701
+ "grad_norm": 0.5022963193078647,
702
  "learning_rate": 5e-06,
703
+ "loss": 0.5548,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3333333333333335,
708
+ "grad_norm": 0.5369612472077095,
709
  "learning_rate": 5e-06,
710
+ "loss": 0.5625,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.357142857142857,
715
+ "grad_norm": 0.4759917866765363,
716
  "learning_rate": 5e-06,
717
+ "loss": 0.5708,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.380952380952381,
722
+ "grad_norm": 0.6016766885366513,
723
  "learning_rate": 5e-06,
724
+ "loss": 0.5689,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.4047619047619047,
729
+ "grad_norm": 0.5307133776111298,
730
  "learning_rate": 5e-06,
731
+ "loss": 0.5744,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4285714285714284,
736
+ "grad_norm": 0.5106189519040727,
737
  "learning_rate": 5e-06,
738
+ "loss": 0.5701,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.4523809523809526,
743
+ "grad_norm": 0.5120395251477051,
744
  "learning_rate": 5e-06,
745
+ "loss": 0.5674,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.4761904761904763,
750
+ "grad_norm": 0.5360273318914704,
751
  "learning_rate": 5e-06,
752
+ "loss": 0.5712,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.5,
757
+ "grad_norm": 0.567087734254625,
758
  "learning_rate": 5e-06,
759
+ "loss": 0.5681,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.5238095238095237,
764
+ "grad_norm": 0.4786686531657179,
765
  "learning_rate": 5e-06,
766
+ "loss": 0.5689,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5476190476190474,
771
+ "grad_norm": 0.6357185519881481,
772
  "learning_rate": 5e-06,
773
+ "loss": 0.5695,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.571428571428571,
778
+ "grad_norm": 0.5407397510648377,
779
  "learning_rate": 5e-06,
780
+ "loss": 0.5629,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.5952380952380953,
785
+ "grad_norm": 0.6222781368696665,
786
  "learning_rate": 5e-06,
787
+ "loss": 0.5724,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.619047619047619,
792
+ "grad_norm": 0.5165820385889532,
793
  "learning_rate": 5e-06,
794
+ "loss": 0.5655,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.642857142857143,
799
+ "grad_norm": 0.608766287830255,
800
  "learning_rate": 5e-06,
801
+ "loss": 0.5725,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.6666666666666665,
806
+ "grad_norm": 0.4594249383292841,
807
  "learning_rate": 5e-06,
808
+ "loss": 0.5684,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.6904761904761907,
813
+ "grad_norm": 0.4874560641150811,
814
  "learning_rate": 5e-06,
815
+ "loss": 0.5662,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.7142857142857144,
820
+ "grad_norm": 0.4712693165037158,
821
  "learning_rate": 5e-06,
822
+ "loss": 0.5687,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.738095238095238,
827
+ "grad_norm": 0.45012903888687444,
828
  "learning_rate": 5e-06,
829
+ "loss": 0.5706,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.761904761904762,
834
+ "grad_norm": 0.5496212836763024,
835
  "learning_rate": 5e-06,
836
+ "loss": 0.5708,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.7857142857142856,
841
+ "grad_norm": 0.46476871854434104,
842
  "learning_rate": 5e-06,
843
+ "loss": 0.5649,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.8095238095238093,
848
+ "grad_norm": 0.49154119108233535,
849
  "learning_rate": 5e-06,
850
+ "loss": 0.5673,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.8333333333333335,
855
+ "grad_norm": 0.658009656544454,
856
  "learning_rate": 5e-06,
857
+ "loss": 0.5731,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.857142857142857,
862
+ "grad_norm": 0.587948539981229,
863
  "learning_rate": 5e-06,
864
+ "loss": 0.5739,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.880952380952381,
869
+ "grad_norm": 0.4564005311443965,
870
  "learning_rate": 5e-06,
871
+ "loss": 0.5709,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.9047619047619047,
876
+ "grad_norm": 0.4778927219605842,
877
  "learning_rate": 5e-06,
878
+ "loss": 0.5571,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.928571428571429,
883
+ "grad_norm": 0.45838251308426436,
884
  "learning_rate": 5e-06,
885
+ "loss": 0.5665,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.9523809523809526,
890
+ "grad_norm": 0.5496247257686689,
891
  "learning_rate": 5e-06,
892
+ "loss": 0.5649,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.9761904761904763,
897
+ "grad_norm": 0.4480068307641731,
898
  "learning_rate": 5e-06,
899
+ "loss": 0.5728,
900
  "step": 1250
901
  },
902
  {
903
  "epoch": 3.0,
904
+ "grad_norm": 0.5038304772459032,
905
  "learning_rate": 5e-06,
906
+ "loss": 0.5729,
907
  "step": 1260
908
  },
909
  {
910
  "epoch": 3.0,
911
+ "eval_loss": 0.64065021276474,
912
+ "eval_runtime": 227.2124,
913
+ "eval_samples_per_second": 49.804,
914
  "eval_steps_per_second": 0.392,
915
  "step": 1260
916
  },
 
918
  "epoch": 3.0,
919
  "step": 1260,
920
  "total_flos": 2110128169943040.0,
921
+ "train_loss": 0.6183440295476762,
922
+ "train_runtime": 37778.6065,
923
+ "train_samples_per_second": 17.072,
924
  "train_steps_per_second": 0.033
925
  }
926
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED