sedrickkeh commited on
Commit
da5d2cc
1 Parent(s): f0fcab0

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_unnatural_instructions
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_unnatural_instructions
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6454
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_unnatural_instructions
 
16
 
17
  # OH_DCFT_V3_wo_unnatural_instructions
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_unnatural_instructions dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6454
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 2.9946777054997042,
3
- "eval_loss": 0.648719847202301,
4
- "eval_runtime": 224.9555,
5
- "eval_samples_per_second": 50.619,
6
- "eval_steps_per_second": 0.396,
7
  "total_flos": 2120178393415680.0,
8
- "train_loss": 0.6180764295478568,
9
- "train_runtime": 37891.5513,
10
- "train_samples_per_second": 17.129,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 2.9946777054997042,
3
+ "eval_loss": 0.6453979015350342,
4
+ "eval_runtime": 225.6683,
5
+ "eval_samples_per_second": 50.459,
6
+ "eval_steps_per_second": 0.394,
7
  "total_flos": 2120178393415680.0,
8
+ "train_loss": 0.6197805719164687,
9
+ "train_runtime": 38167.6556,
10
+ "train_samples_per_second": 17.005,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.9946777054997042,
3
- "eval_loss": 0.648719847202301,
4
- "eval_runtime": 224.9555,
5
- "eval_samples_per_second": 50.619,
6
- "eval_steps_per_second": 0.396
7
  }
 
1
  {
2
  "epoch": 2.9946777054997042,
3
+ "eval_loss": 0.6453979015350342,
4
+ "eval_runtime": 225.6683,
5
+ "eval_samples_per_second": 50.459,
6
+ "eval_steps_per_second": 0.394
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9946777054997042,
3
  "total_flos": 2120178393415680.0,
4
- "train_loss": 0.6180764295478568,
5
- "train_runtime": 37891.5513,
6
- "train_samples_per_second": 17.129,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 2.9946777054997042,
3
  "total_flos": 2120178393415680.0,
4
+ "train_loss": 0.6197805719164687,
5
+ "train_runtime": 38167.6556,
6
+ "train_samples_per_second": 17.005,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,917 +10,917 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.02365464222353637,
13
- "grad_norm": 39.04922181795541,
14
  "learning_rate": 5e-06,
15
- "loss": 0.9233,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.04730928444707274,
20
- "grad_norm": 8.275301054500245,
21
  "learning_rate": 5e-06,
22
- "loss": 0.8198,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0709639266706091,
27
- "grad_norm": 16.91349063149131,
28
  "learning_rate": 5e-06,
29
- "loss": 0.8094,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.09461856889414548,
34
- "grad_norm": 14.148988171179118,
35
  "learning_rate": 5e-06,
36
- "loss": 0.7679,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.11827321111768184,
41
- "grad_norm": 1.026404466612184,
42
  "learning_rate": 5e-06,
43
- "loss": 0.7523,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.1419278533412182,
48
- "grad_norm": 0.8542225183892468,
49
  "learning_rate": 5e-06,
50
- "loss": 0.725,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16558249556475457,
55
- "grad_norm": 0.7426518873227826,
56
  "learning_rate": 5e-06,
57
- "loss": 0.7185,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.18923713778829096,
62
- "grad_norm": 0.6764382829611749,
63
  "learning_rate": 5e-06,
64
- "loss": 0.6999,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21289178001182732,
69
- "grad_norm": 0.5663920137594394,
70
  "learning_rate": 5e-06,
71
- "loss": 0.7,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.23654642223536368,
76
- "grad_norm": 0.6218835377066562,
77
  "learning_rate": 5e-06,
78
- "loss": 0.6968,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.26020106445890007,
83
- "grad_norm": 0.601497886261039,
84
  "learning_rate": 5e-06,
85
- "loss": 0.6885,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.2838557066824364,
90
- "grad_norm": 0.8786039473525534,
91
  "learning_rate": 5e-06,
92
- "loss": 0.6863,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3075103489059728,
97
- "grad_norm": 0.5373447312315734,
98
  "learning_rate": 5e-06,
99
- "loss": 0.6792,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.33116499112950915,
104
- "grad_norm": 0.5195473153997355,
105
  "learning_rate": 5e-06,
106
- "loss": 0.6746,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.35481963335304556,
111
- "grad_norm": 0.5999120946052041,
112
  "learning_rate": 5e-06,
113
- "loss": 0.68,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.3784742755765819,
118
- "grad_norm": 0.5351205582865509,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6766,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4021289178001183,
125
- "grad_norm": 0.5425010920017291,
126
  "learning_rate": 5e-06,
127
- "loss": 0.6839,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.42578356002365464,
132
- "grad_norm": 0.702582958286065,
133
  "learning_rate": 5e-06,
134
- "loss": 0.6711,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.449438202247191,
139
- "grad_norm": 1.2564450479675764,
140
  "learning_rate": 5e-06,
141
- "loss": 0.6776,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.47309284447072736,
146
- "grad_norm": 0.6226292820244915,
147
  "learning_rate": 5e-06,
148
- "loss": 0.6695,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.4967474866942638,
153
- "grad_norm": 0.6729451633589562,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6656,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5204021289178001,
160
- "grad_norm": 0.72360498041814,
161
  "learning_rate": 5e-06,
162
- "loss": 0.6699,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5440567711413364,
167
- "grad_norm": 0.5730443009789895,
168
  "learning_rate": 5e-06,
169
- "loss": 0.6693,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5677114133648729,
174
- "grad_norm": 0.6304204873609028,
175
  "learning_rate": 5e-06,
176
- "loss": 0.6564,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.5913660555884093,
181
- "grad_norm": 0.6327686473365169,
182
  "learning_rate": 5e-06,
183
- "loss": 0.6707,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6150206978119456,
188
- "grad_norm": 0.5640335014419563,
189
  "learning_rate": 5e-06,
190
- "loss": 0.6617,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.638675340035482,
195
- "grad_norm": 0.6026872513783592,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6711,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6623299822590183,
202
- "grad_norm": 0.5194797310260643,
203
  "learning_rate": 5e-06,
204
- "loss": 0.6561,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6859846244825547,
209
- "grad_norm": 0.6658270095766984,
210
  "learning_rate": 5e-06,
211
- "loss": 0.663,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7096392667060911,
216
- "grad_norm": 0.5259962549449988,
217
  "learning_rate": 5e-06,
218
- "loss": 0.6511,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7332939089296274,
223
- "grad_norm": 0.5776532705758929,
224
  "learning_rate": 5e-06,
225
- "loss": 0.6649,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7569485511531638,
230
- "grad_norm": 0.5249892835904177,
231
  "learning_rate": 5e-06,
232
- "loss": 0.6648,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7806031933767001,
237
- "grad_norm": 0.5092145613062358,
238
  "learning_rate": 5e-06,
239
- "loss": 0.6614,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8042578356002366,
244
- "grad_norm": 0.5273167065626364,
245
  "learning_rate": 5e-06,
246
- "loss": 0.6468,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8279124778237729,
251
- "grad_norm": 0.5666036582386984,
252
  "learning_rate": 5e-06,
253
- "loss": 0.6562,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8515671200473093,
258
- "grad_norm": 0.6164474600239763,
259
  "learning_rate": 5e-06,
260
- "loss": 0.6544,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8752217622708457,
265
- "grad_norm": 0.5854672267431167,
266
  "learning_rate": 5e-06,
267
- "loss": 0.6547,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.898876404494382,
272
- "grad_norm": 0.5313196039449892,
273
  "learning_rate": 5e-06,
274
- "loss": 0.6584,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9225310467179184,
279
- "grad_norm": 0.5598019500152581,
280
  "learning_rate": 5e-06,
281
- "loss": 0.6533,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9461856889414547,
286
- "grad_norm": 0.6003027055491813,
287
  "learning_rate": 5e-06,
288
- "loss": 0.6458,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9698403311649911,
293
- "grad_norm": 0.6341763962447327,
294
  "learning_rate": 5e-06,
295
- "loss": 0.656,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.9934949733885275,
300
- "grad_norm": 0.6083507877729101,
301
  "learning_rate": 5e-06,
302
- "loss": 0.6538,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.9982259018332348,
307
- "eval_loss": 0.6558582186698914,
308
- "eval_runtime": 224.885,
309
- "eval_samples_per_second": 50.635,
310
- "eval_steps_per_second": 0.396,
311
  "step": 422
312
  },
313
  {
314
  "epoch": 1.0171496156120639,
315
- "grad_norm": 0.9361782463296332,
316
  "learning_rate": 5e-06,
317
- "loss": 0.6157,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0408042578356003,
322
- "grad_norm": 0.6204113010596938,
323
  "learning_rate": 5e-06,
324
- "loss": 0.6065,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.0644589000591367,
329
- "grad_norm": 0.6564336264095381,
330
  "learning_rate": 5e-06,
331
- "loss": 0.6048,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.0881135422826729,
336
- "grad_norm": 0.5555833545533679,
337
  "learning_rate": 5e-06,
338
- "loss": 0.6072,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.1117681845062093,
343
- "grad_norm": 0.568736434370096,
344
  "learning_rate": 5e-06,
345
- "loss": 0.6026,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.1354228267297457,
350
- "grad_norm": 0.5963174527245159,
351
  "learning_rate": 5e-06,
352
- "loss": 0.6041,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1590774689532821,
357
- "grad_norm": 0.6296624775692966,
358
  "learning_rate": 5e-06,
359
- "loss": 0.6036,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.1827321111768185,
364
- "grad_norm": 0.7667349314546962,
365
  "learning_rate": 5e-06,
366
- "loss": 0.6068,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2063867534003547,
371
- "grad_norm": 0.6034621980970892,
372
  "learning_rate": 5e-06,
373
- "loss": 0.6104,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.2300413956238911,
378
- "grad_norm": 0.5825117431703367,
379
  "learning_rate": 5e-06,
380
- "loss": 0.6026,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2536960378474276,
385
- "grad_norm": 0.5671081402783421,
386
  "learning_rate": 5e-06,
387
- "loss": 0.604,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.277350680070964,
392
- "grad_norm": 0.5309591912112671,
393
  "learning_rate": 5e-06,
394
- "loss": 0.6132,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3010053222945004,
399
- "grad_norm": 0.5636046858771947,
400
  "learning_rate": 5e-06,
401
- "loss": 0.605,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.3246599645180366,
406
- "grad_norm": 0.6623955102141809,
407
  "learning_rate": 5e-06,
408
- "loss": 0.6082,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.348314606741573,
413
- "grad_norm": 0.5742305096790601,
414
  "learning_rate": 5e-06,
415
- "loss": 0.605,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.3719692489651094,
420
- "grad_norm": 0.5167065988140831,
421
  "learning_rate": 5e-06,
422
- "loss": 0.6091,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.3956238911886458,
427
- "grad_norm": 0.5112713876137833,
428
  "learning_rate": 5e-06,
429
- "loss": 0.6049,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4192785334121822,
434
- "grad_norm": 0.515536375353522,
435
  "learning_rate": 5e-06,
436
- "loss": 0.6079,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4429331756357184,
441
- "grad_norm": 0.5943800369847494,
442
  "learning_rate": 5e-06,
443
- "loss": 0.602,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4665878178592548,
448
- "grad_norm": 0.5570413849081146,
449
  "learning_rate": 5e-06,
450
- "loss": 0.6074,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.4902424600827913,
455
- "grad_norm": 0.5383074416990815,
456
  "learning_rate": 5e-06,
457
- "loss": 0.6055,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5138971023063275,
462
- "grad_norm": 0.6221748842819845,
463
  "learning_rate": 5e-06,
464
- "loss": 0.6048,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.537551744529864,
469
- "grad_norm": 0.623130737085543,
470
  "learning_rate": 5e-06,
471
- "loss": 0.6124,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5612063867534003,
476
- "grad_norm": 0.7728758992657894,
477
  "learning_rate": 5e-06,
478
- "loss": 0.6073,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.5848610289769367,
483
- "grad_norm": 0.5531126954202661,
484
  "learning_rate": 5e-06,
485
- "loss": 0.6023,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.6085156712004731,
490
- "grad_norm": 0.8207249388527519,
491
  "learning_rate": 5e-06,
492
- "loss": 0.6041,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.6321703134240093,
497
- "grad_norm": 0.7382668830128054,
498
  "learning_rate": 5e-06,
499
- "loss": 0.6027,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.655824955647546,
504
- "grad_norm": 0.8181634349883082,
505
  "learning_rate": 5e-06,
506
- "loss": 0.6077,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.6794795978710821,
511
- "grad_norm": 0.5715750112816181,
512
  "learning_rate": 5e-06,
513
- "loss": 0.603,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7031342400946186,
518
- "grad_norm": 0.564060422032355,
519
  "learning_rate": 5e-06,
520
- "loss": 0.6137,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.726788882318155,
525
- "grad_norm": 0.5112934215435977,
526
  "learning_rate": 5e-06,
527
- "loss": 0.6041,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.7504435245416912,
532
- "grad_norm": 0.6498040890743698,
533
  "learning_rate": 5e-06,
534
- "loss": 0.6038,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.7740981667652278,
539
- "grad_norm": 0.6625174306160165,
540
  "learning_rate": 5e-06,
541
- "loss": 0.604,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.797752808988764,
546
- "grad_norm": 0.5200239654238437,
547
  "learning_rate": 5e-06,
548
- "loss": 0.604,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8214074512123004,
553
- "grad_norm": 0.5056250667365105,
554
  "learning_rate": 5e-06,
555
- "loss": 0.601,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.8450620934358368,
560
- "grad_norm": 0.5465224841554837,
561
  "learning_rate": 5e-06,
562
- "loss": 0.6043,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.868716735659373,
567
- "grad_norm": 0.5173445168820222,
568
  "learning_rate": 5e-06,
569
- "loss": 0.6051,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.8923713778829097,
574
- "grad_norm": 0.5037163086029489,
575
  "learning_rate": 5e-06,
576
- "loss": 0.6071,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.9160260201064458,
581
- "grad_norm": 0.5032092904194995,
582
  "learning_rate": 5e-06,
583
- "loss": 0.6097,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.9396806623299823,
588
- "grad_norm": 0.5153373413177225,
589
  "learning_rate": 5e-06,
590
- "loss": 0.6017,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.9633353045535187,
595
- "grad_norm": 0.48445434626456924,
596
  "learning_rate": 5e-06,
597
- "loss": 0.6097,
598
  "step": 830
599
  },
600
  {
601
  "epoch": 1.9869899467770549,
602
- "grad_norm": 0.5493690840998718,
603
  "learning_rate": 5e-06,
604
- "loss": 0.5996,
605
  "step": 840
606
  },
607
  {
608
  "epoch": 1.9988172678888232,
609
- "eval_loss": 0.6458428502082825,
610
- "eval_runtime": 226.2611,
611
- "eval_samples_per_second": 50.327,
612
- "eval_steps_per_second": 0.393,
613
  "step": 845
614
  },
615
  {
616
  "epoch": 2.0106445890005915,
617
- "grad_norm": 0.6225484532666892,
618
  "learning_rate": 5e-06,
619
- "loss": 0.5795,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.0342992312241277,
624
- "grad_norm": 0.5819857678964343,
625
  "learning_rate": 5e-06,
626
- "loss": 0.5553,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.057953873447664,
631
- "grad_norm": 0.7185360534865078,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5506,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.0816085156712005,
638
- "grad_norm": 0.5134284842767335,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5539,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.1052631578947367,
645
- "grad_norm": 0.5986326239884353,
646
  "learning_rate": 5e-06,
647
- "loss": 0.5609,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.1289178001182734,
652
- "grad_norm": 0.579714763513885,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5546,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.1525724423418096,
659
- "grad_norm": 0.570292196409214,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5586,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.1762270845653457,
666
- "grad_norm": 0.5811117001743673,
667
  "learning_rate": 5e-06,
668
- "loss": 0.5585,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.1998817267888824,
673
- "grad_norm": 0.554144816987719,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5562,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.2235363690124186,
680
- "grad_norm": 0.6493826388527278,
681
  "learning_rate": 5e-06,
682
- "loss": 0.5565,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.247191011235955,
687
- "grad_norm": 0.5220557856218626,
688
  "learning_rate": 5e-06,
689
- "loss": 0.5694,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.2708456534594914,
694
- "grad_norm": 0.6378102281048501,
695
  "learning_rate": 5e-06,
696
- "loss": 0.5602,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.2945002956830276,
701
- "grad_norm": 0.5497371876386185,
702
  "learning_rate": 5e-06,
703
- "loss": 0.5628,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3181549379065642,
708
- "grad_norm": 0.6521682175920844,
709
  "learning_rate": 5e-06,
710
- "loss": 0.5565,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.3418095801301004,
715
- "grad_norm": 0.5734936169662879,
716
  "learning_rate": 5e-06,
717
- "loss": 0.5674,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.365464222353637,
722
- "grad_norm": 0.5394258314592499,
723
  "learning_rate": 5e-06,
724
- "loss": 0.5569,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.3891188645771733,
729
- "grad_norm": 0.5306593171364488,
730
  "learning_rate": 5e-06,
731
- "loss": 0.5502,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4127735068007095,
736
- "grad_norm": 0.5344199954837688,
737
  "learning_rate": 5e-06,
738
- "loss": 0.5549,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.436428149024246,
743
- "grad_norm": 0.5892751227456119,
744
  "learning_rate": 5e-06,
745
- "loss": 0.5533,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.4600827912477823,
750
- "grad_norm": 0.6529042003930223,
751
  "learning_rate": 5e-06,
752
- "loss": 0.5613,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.483737433471319,
757
- "grad_norm": 0.5765438425321338,
758
  "learning_rate": 5e-06,
759
- "loss": 0.5646,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.507392075694855,
764
- "grad_norm": 0.6764490044193554,
765
  "learning_rate": 5e-06,
766
- "loss": 0.567,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5310467179183913,
771
- "grad_norm": 0.5365218264481744,
772
  "learning_rate": 5e-06,
773
- "loss": 0.5532,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.554701360141928,
778
- "grad_norm": 0.6033785399498255,
779
  "learning_rate": 5e-06,
780
- "loss": 0.5622,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.578356002365464,
785
- "grad_norm": 0.8004909937255467,
786
  "learning_rate": 5e-06,
787
- "loss": 0.5661,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.6020106445890008,
792
- "grad_norm": 0.5819582134735406,
793
  "learning_rate": 5e-06,
794
- "loss": 0.5616,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.625665286812537,
799
- "grad_norm": 0.5537773395049099,
800
  "learning_rate": 5e-06,
801
- "loss": 0.5628,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.649319929036073,
806
- "grad_norm": 0.5539615560141525,
807
  "learning_rate": 5e-06,
808
- "loss": 0.5648,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.67297457125961,
813
- "grad_norm": 0.6206027218523953,
814
  "learning_rate": 5e-06,
815
- "loss": 0.5643,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.696629213483146,
820
- "grad_norm": 0.5108322877934205,
821
  "learning_rate": 5e-06,
822
- "loss": 0.5586,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.7202838557066826,
827
- "grad_norm": 0.48797735494965916,
828
  "learning_rate": 5e-06,
829
- "loss": 0.5563,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.743938497930219,
834
- "grad_norm": 0.5823974142352172,
835
  "learning_rate": 5e-06,
836
- "loss": 0.5671,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.767593140153755,
841
- "grad_norm": 0.8599218035136146,
842
  "learning_rate": 5e-06,
843
- "loss": 0.5723,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.7912477823772917,
848
- "grad_norm": 0.6555716714163583,
849
  "learning_rate": 5e-06,
850
- "loss": 0.5633,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.814902424600828,
855
- "grad_norm": 0.49879910164951613,
856
  "learning_rate": 5e-06,
857
- "loss": 0.5581,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.8385570668243645,
862
- "grad_norm": 0.5241725506783274,
863
  "learning_rate": 5e-06,
864
- "loss": 0.5623,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.8622117090479007,
869
- "grad_norm": 0.6173811070502804,
870
  "learning_rate": 5e-06,
871
- "loss": 0.569,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.885866351271437,
876
- "grad_norm": 0.5397292738316359,
877
  "learning_rate": 5e-06,
878
- "loss": 0.5642,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.9095209934949735,
883
- "grad_norm": 0.7053290870019903,
884
  "learning_rate": 5e-06,
885
- "loss": 0.5593,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.9331756357185097,
890
- "grad_norm": 0.5500348460578961,
891
  "learning_rate": 5e-06,
892
- "loss": 0.5591,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.9568302779420463,
897
- "grad_norm": 0.5833114667049699,
898
  "learning_rate": 5e-06,
899
- "loss": 0.5649,
900
  "step": 1250
901
  },
902
  {
903
  "epoch": 2.9804849201655825,
904
- "grad_norm": 0.569413301750619,
905
  "learning_rate": 5e-06,
906
- "loss": 0.5577,
907
  "step": 1260
908
  },
909
  {
910
  "epoch": 2.9946777054997042,
911
- "eval_loss": 0.648719847202301,
912
- "eval_runtime": 225.8886,
913
- "eval_samples_per_second": 50.41,
914
- "eval_steps_per_second": 0.394,
915
  "step": 1266
916
  },
917
  {
918
  "epoch": 2.9946777054997042,
919
  "step": 1266,
920
  "total_flos": 2120178393415680.0,
921
- "train_loss": 0.6180764295478568,
922
- "train_runtime": 37891.5513,
923
- "train_samples_per_second": 17.129,
924
  "train_steps_per_second": 0.033
925
  }
926
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.02365464222353637,
13
+ "grad_norm": 4.438574633904732,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.8864,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.04730928444707274,
20
+ "grad_norm": 3.743436107422786,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.7865,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0709639266706091,
27
+ "grad_norm": 1.3773266130731683,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.7718,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.09461856889414548,
34
+ "grad_norm": 1.0328014979269187,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7321,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.11827321111768184,
41
+ "grad_norm": 0.9419322611015702,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.7263,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.1419278533412182,
48
+ "grad_norm": 1.0108429467163185,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.7095,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16558249556475457,
55
+ "grad_norm": 0.7116397296212089,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.7076,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.18923713778829096,
62
+ "grad_norm": 0.6226004481870809,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.6923,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21289178001182732,
69
+ "grad_norm": 0.7788334547916798,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.6939,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.23654642223536368,
76
+ "grad_norm": 0.5579006889125326,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.6916,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.26020106445890007,
83
+ "grad_norm": 0.8348224630921526,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.6841,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.2838557066824364,
90
+ "grad_norm": 1.0981466464621934,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.6822,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3075103489059728,
97
+ "grad_norm": 0.8917486118802607,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.6761,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.33116499112950915,
104
+ "grad_norm": 0.6543556998725211,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.6715,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.35481963335304556,
111
+ "grad_norm": 0.8125188565799322,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.6773,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.3784742755765819,
118
+ "grad_norm": 0.5688602721338011,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.674,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4021289178001183,
125
+ "grad_norm": 0.491756225050921,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6815,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.42578356002365464,
132
+ "grad_norm": 0.6509377411475789,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.669,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.449438202247191,
139
+ "grad_norm": 0.6207735885705108,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.6755,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.47309284447072736,
146
+ "grad_norm": 0.528808960645126,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.6674,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.4967474866942638,
153
+ "grad_norm": 0.8472585224068009,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.6637,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5204021289178001,
160
+ "grad_norm": 0.5848774410670773,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.6681,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5440567711413364,
167
+ "grad_norm": 0.6344280914148243,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6675,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5677114133648729,
174
+ "grad_norm": 0.7485250474806475,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.6549,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.5913660555884093,
181
+ "grad_norm": 0.7332796111883003,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.6693,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6150206978119456,
188
+ "grad_norm": 0.5097881381268425,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.6603,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.638675340035482,
195
+ "grad_norm": 0.49176638410796597,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6697,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6623299822590183,
202
+ "grad_norm": 0.4404068786810332,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.6547,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6859846244825547,
209
+ "grad_norm": 0.45956579197536424,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6617,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7096392667060911,
216
+ "grad_norm": 0.5489471352518822,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.65,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7332939089296274,
223
+ "grad_norm": 0.5071925000559494,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6639,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7569485511531638,
230
+ "grad_norm": 0.4479532221316009,
231
  "learning_rate": 5e-06,
232
+ "loss": 0.6638,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7806031933767001,
237
+ "grad_norm": 0.5768991016278898,
238
  "learning_rate": 5e-06,
239
+ "loss": 0.6601,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8042578356002366,
244
+ "grad_norm": 0.47440990509293773,
245
  "learning_rate": 5e-06,
246
+ "loss": 0.6457,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8279124778237729,
251
+ "grad_norm": 0.5535872360742707,
252
  "learning_rate": 5e-06,
253
+ "loss": 0.6552,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8515671200473093,
258
+ "grad_norm": 0.5692309710225549,
259
  "learning_rate": 5e-06,
260
+ "loss": 0.6533,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8752217622708457,
265
+ "grad_norm": 0.477161966378483,
266
  "learning_rate": 5e-06,
267
+ "loss": 0.6538,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.898876404494382,
272
+ "grad_norm": 0.5742339288892304,
273
  "learning_rate": 5e-06,
274
+ "loss": 0.6576,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9225310467179184,
279
+ "grad_norm": 0.6276883665595651,
280
  "learning_rate": 5e-06,
281
+ "loss": 0.6524,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9461856889414547,
286
+ "grad_norm": 0.5739867998426434,
287
  "learning_rate": 5e-06,
288
+ "loss": 0.645,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9698403311649911,
293
+ "grad_norm": 0.6714991896688636,
294
  "learning_rate": 5e-06,
295
+ "loss": 0.6552,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.9934949733885275,
300
+ "grad_norm": 0.5532850958647405,
301
  "learning_rate": 5e-06,
302
+ "loss": 0.6528,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.9982259018332348,
307
+ "eval_loss": 0.6549943685531616,
308
+ "eval_runtime": 226.5831,
309
+ "eval_samples_per_second": 50.255,
310
+ "eval_steps_per_second": 0.393,
311
  "step": 422
312
  },
313
  {
314
  "epoch": 1.0171496156120639,
315
+ "grad_norm": 0.6577034498689445,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.6186,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0408042578356003,
322
+ "grad_norm": 0.6919211184212087,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.611,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.0644589000591367,
329
+ "grad_norm": 0.49818432774881954,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.6094,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.0881135422826729,
336
+ "grad_norm": 0.5304613381757841,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.6118,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.1117681845062093,
343
+ "grad_norm": 0.5661240677080396,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.6069,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.1354228267297457,
350
+ "grad_norm": 0.4725657759678031,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.6081,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1590774689532821,
357
+ "grad_norm": 0.5348982555181953,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.608,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.1827321111768185,
364
+ "grad_norm": 0.7009257467225577,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.6107,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2063867534003547,
371
+ "grad_norm": 0.516755234113577,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.6145,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.2300413956238911,
378
+ "grad_norm": 0.5565870238553596,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.6064,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2536960378474276,
385
+ "grad_norm": 0.5176934237005286,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.6078,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.277350680070964,
392
+ "grad_norm": 0.5399011019791115,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.6173,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3010053222945004,
399
+ "grad_norm": 0.48804065232921706,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.6089,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.3246599645180366,
406
+ "grad_norm": 0.6247022748083035,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.612,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.348314606741573,
413
+ "grad_norm": 0.5205181494692162,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.6087,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.3719692489651094,
420
+ "grad_norm": 0.4444906716754459,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.6129,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.3956238911886458,
427
+ "grad_norm": 0.4699507974891951,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.6084,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4192785334121822,
434
+ "grad_norm": 0.438759746705871,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.6118,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4429331756357184,
441
+ "grad_norm": 0.492167276336904,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.6057,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4665878178592548,
448
+ "grad_norm": 0.508896134049524,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.6111,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.4902424600827913,
455
+ "grad_norm": 0.5044935497801236,
456
  "learning_rate": 5e-06,
457
+ "loss": 0.6091,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5138971023063275,
462
+ "grad_norm": 0.611513828523435,
463
  "learning_rate": 5e-06,
464
+ "loss": 0.6084,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.537551744529864,
469
+ "grad_norm": 0.4284886945389684,
470
  "learning_rate": 5e-06,
471
+ "loss": 0.6161,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5612063867534003,
476
+ "grad_norm": 0.7397737868115762,
477
  "learning_rate": 5e-06,
478
+ "loss": 0.611,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.5848610289769367,
483
+ "grad_norm": 0.45834776518516607,
484
  "learning_rate": 5e-06,
485
+ "loss": 0.6054,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.6085156712004731,
490
+ "grad_norm": 0.9830107568320281,
491
  "learning_rate": 5e-06,
492
+ "loss": 0.6077,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.6321703134240093,
497
+ "grad_norm": 0.7316301006660809,
498
  "learning_rate": 5e-06,
499
+ "loss": 0.6066,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.655824955647546,
504
+ "grad_norm": 0.8713979852654485,
505
  "learning_rate": 5e-06,
506
+ "loss": 0.6112,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.6794795978710821,
511
+ "grad_norm": 0.6609904307136948,
512
  "learning_rate": 5e-06,
513
+ "loss": 0.6063,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7031342400946186,
518
+ "grad_norm": 0.5730385784454821,
519
  "learning_rate": 5e-06,
520
+ "loss": 0.6174,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.726788882318155,
525
+ "grad_norm": 0.428675321624077,
526
  "learning_rate": 5e-06,
527
+ "loss": 0.6075,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.7504435245416912,
532
+ "grad_norm": 0.5747845078803645,
533
  "learning_rate": 5e-06,
534
+ "loss": 0.6073,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.7740981667652278,
539
+ "grad_norm": 0.6019599666582006,
540
  "learning_rate": 5e-06,
541
+ "loss": 0.6074,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.797752808988764,
546
+ "grad_norm": 0.484871680178572,
547
  "learning_rate": 5e-06,
548
+ "loss": 0.6076,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8214074512123004,
553
+ "grad_norm": 0.4801879662753807,
554
  "learning_rate": 5e-06,
555
+ "loss": 0.6046,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.8450620934358368,
560
+ "grad_norm": 0.44588625446373603,
561
  "learning_rate": 5e-06,
562
+ "loss": 0.6078,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.868716735659373,
567
+ "grad_norm": 0.43921853360548113,
568
  "learning_rate": 5e-06,
569
+ "loss": 0.6086,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.8923713778829097,
574
+ "grad_norm": 0.4461189307976923,
575
  "learning_rate": 5e-06,
576
+ "loss": 0.6105,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.9160260201064458,
581
+ "grad_norm": 0.4949761836327779,
582
  "learning_rate": 5e-06,
583
+ "loss": 0.613,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.9396806623299823,
588
+ "grad_norm": 0.44062812260467765,
589
  "learning_rate": 5e-06,
590
+ "loss": 0.6049,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.9633353045535187,
595
+ "grad_norm": 0.4460160929815086,
596
  "learning_rate": 5e-06,
597
+ "loss": 0.613,
598
  "step": 830
599
  },
600
  {
601
  "epoch": 1.9869899467770549,
602
+ "grad_norm": 0.5175110899521405,
603
  "learning_rate": 5e-06,
604
+ "loss": 0.6028,
605
  "step": 840
606
  },
607
  {
608
  "epoch": 1.9988172678888232,
609
+ "eval_loss": 0.6445377469062805,
610
+ "eval_runtime": 227.0244,
611
+ "eval_samples_per_second": 50.158,
612
+ "eval_steps_per_second": 0.392,
613
  "step": 845
614
  },
615
  {
616
  "epoch": 2.0106445890005915,
617
+ "grad_norm": 0.5892375412389526,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5851,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.0342992312241277,
624
+ "grad_norm": 0.5097516172118646,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.5643,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.057953873447664,
631
+ "grad_norm": 0.6536746176311915,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5592,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.0816085156712005,
638
+ "grad_norm": 0.47983268810356666,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5626,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.1052631578947367,
645
+ "grad_norm": 0.6017282349204336,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5697,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.1289178001182734,
652
+ "grad_norm": 0.5728407157654074,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.5632,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.1525724423418096,
659
+ "grad_norm": 0.5680779384221303,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.5672,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.1762270845653457,
666
+ "grad_norm": 0.48858908601906337,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.567,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.1998817267888824,
673
+ "grad_norm": 0.5005707887249943,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.5646,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.2235363690124186,
680
+ "grad_norm": 0.5829558904651037,
681
  "learning_rate": 5e-06,
682
+ "loss": 0.5648,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.247191011235955,
687
+ "grad_norm": 0.48798199303667406,
688
  "learning_rate": 5e-06,
689
+ "loss": 0.578,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.2708456534594914,
694
+ "grad_norm": 0.582446153234459,
695
  "learning_rate": 5e-06,
696
+ "loss": 0.5682,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.2945002956830276,
701
+ "grad_norm": 0.46970294592756995,
702
  "learning_rate": 5e-06,
703
+ "loss": 0.571,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3181549379065642,
708
+ "grad_norm": 0.5759020549520256,
709
  "learning_rate": 5e-06,
710
+ "loss": 0.5647,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.3418095801301004,
715
+ "grad_norm": 0.549340588982862,
716
  "learning_rate": 5e-06,
717
+ "loss": 0.5755,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.365464222353637,
722
+ "grad_norm": 0.46429208051701265,
723
  "learning_rate": 5e-06,
724
+ "loss": 0.5648,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.3891188645771733,
729
+ "grad_norm": 0.5160254392452897,
730
  "learning_rate": 5e-06,
731
+ "loss": 0.558,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4127735068007095,
736
+ "grad_norm": 0.4799281597192369,
737
  "learning_rate": 5e-06,
738
+ "loss": 0.5627,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.436428149024246,
743
+ "grad_norm": 0.5121330286207769,
744
  "learning_rate": 5e-06,
745
+ "loss": 0.5608,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.4600827912477823,
750
+ "grad_norm": 0.5841580086447481,
751
  "learning_rate": 5e-06,
752
+ "loss": 0.5693,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.483737433471319,
757
+ "grad_norm": 0.557020183414569,
758
  "learning_rate": 5e-06,
759
+ "loss": 0.5726,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.507392075694855,
764
+ "grad_norm": 0.6374112998842234,
765
  "learning_rate": 5e-06,
766
+ "loss": 0.5747,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5310467179183913,
771
+ "grad_norm": 0.5343754869995426,
772
  "learning_rate": 5e-06,
773
+ "loss": 0.5607,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.554701360141928,
778
+ "grad_norm": 0.5496554790900547,
779
  "learning_rate": 5e-06,
780
+ "loss": 0.5698,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.578356002365464,
785
+ "grad_norm": 0.6822773581077988,
786
  "learning_rate": 5e-06,
787
+ "loss": 0.5738,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.6020106445890008,
792
+ "grad_norm": 0.49632724788385346,
793
  "learning_rate": 5e-06,
794
+ "loss": 0.5692,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.625665286812537,
799
+ "grad_norm": 0.4859614320386073,
800
  "learning_rate": 5e-06,
801
+ "loss": 0.5704,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.649319929036073,
806
+ "grad_norm": 0.5005521245693028,
807
  "learning_rate": 5e-06,
808
+ "loss": 0.5721,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.67297457125961,
813
+ "grad_norm": 0.5418331476470847,
814
  "learning_rate": 5e-06,
815
+ "loss": 0.5719,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.696629213483146,
820
+ "grad_norm": 0.4518235189693759,
821
  "learning_rate": 5e-06,
822
+ "loss": 0.5658,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.7202838557066826,
827
+ "grad_norm": 0.45229828628235735,
828
  "learning_rate": 5e-06,
829
+ "loss": 0.5638,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.743938497930219,
834
+ "grad_norm": 0.5051148207876189,
835
  "learning_rate": 5e-06,
836
+ "loss": 0.5748,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.767593140153755,
841
+ "grad_norm": 0.7455413514573421,
842
  "learning_rate": 5e-06,
843
+ "loss": 0.5797,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.7912477823772917,
848
+ "grad_norm": 0.5362145068936747,
849
  "learning_rate": 5e-06,
850
+ "loss": 0.5705,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.814902424600828,
855
+ "grad_norm": 0.46118669511344673,
856
  "learning_rate": 5e-06,
857
+ "loss": 0.5653,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.8385570668243645,
862
+ "grad_norm": 0.5498761802579338,
863
  "learning_rate": 5e-06,
864
+ "loss": 0.5694,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.8622117090479007,
869
+ "grad_norm": 0.5720658060375756,
870
  "learning_rate": 5e-06,
871
+ "loss": 0.5761,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.885866351271437,
876
+ "grad_norm": 0.4735883791639776,
877
  "learning_rate": 5e-06,
878
+ "loss": 0.5714,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.9095209934949735,
883
+ "grad_norm": 0.6126626053091963,
884
  "learning_rate": 5e-06,
885
+ "loss": 0.5665,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.9331756357185097,
890
+ "grad_norm": 0.5724885076669786,
891
  "learning_rate": 5e-06,
892
+ "loss": 0.5666,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.9568302779420463,
897
+ "grad_norm": 0.5144727847784881,
898
  "learning_rate": 5e-06,
899
+ "loss": 0.5721,
900
  "step": 1250
901
  },
902
  {
903
  "epoch": 2.9804849201655825,
904
+ "grad_norm": 0.4637250585550989,
905
  "learning_rate": 5e-06,
906
+ "loss": 0.5645,
907
  "step": 1260
908
  },
909
  {
910
  "epoch": 2.9946777054997042,
911
+ "eval_loss": 0.6453979015350342,
912
+ "eval_runtime": 226.895,
913
+ "eval_samples_per_second": 50.186,
914
+ "eval_steps_per_second": 0.392,
915
  "step": 1266
916
  },
917
  {
918
  "epoch": 2.9946777054997042,
919
  "step": 1266,
920
  "total_flos": 2120178393415680.0,
921
+ "train_loss": 0.6197805719164687,
922
+ "train_runtime": 38167.6556,
923
+ "train_samples_per_second": 17.005,
924
  "train_steps_per_second": 0.033
925
  }
926
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED