sedrickkeh commited on
Commit
1df7997
·
verified ·
1 Parent(s): a7f2055

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_gpt4_llm
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_gpt4_llm
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6373
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_gpt4_llm
 
16
 
17
  # OH_DCFT_V3_wo_gpt4_llm
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_gpt4_llm dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6373
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 2.996415770609319,
3
- "eval_loss": 0.6404265761375427,
4
- "eval_runtime": 225.386,
5
- "eval_samples_per_second": 50.016,
6
- "eval_steps_per_second": 0.395,
7
  "total_flos": 2100077946470400.0,
8
- "train_loss": 0.6158634758832162,
9
- "train_runtime": 37532.7652,
10
- "train_samples_per_second": 17.12,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 2.996415770609319,
3
+ "eval_loss": 0.6373269557952881,
4
+ "eval_runtime": 226.1722,
5
+ "eval_samples_per_second": 49.843,
6
+ "eval_steps_per_second": 0.394,
7
  "total_flos": 2100077946470400.0,
8
+ "train_loss": 0.6173035094612523,
9
+ "train_runtime": 37751.1113,
10
+ "train_samples_per_second": 17.021,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.996415770609319,
3
- "eval_loss": 0.6404265761375427,
4
- "eval_runtime": 225.386,
5
- "eval_samples_per_second": 50.016,
6
- "eval_steps_per_second": 0.395
7
  }
 
1
  {
2
  "epoch": 2.996415770609319,
3
+ "eval_loss": 0.6373269557952881,
4
+ "eval_runtime": 226.1722,
5
+ "eval_samples_per_second": 49.843,
6
+ "eval_steps_per_second": 0.394
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.996415770609319,
3
  "total_flos": 2100077946470400.0,
4
- "train_loss": 0.6158634758832162,
5
- "train_runtime": 37532.7652,
6
- "train_samples_per_second": 17.12,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 2.996415770609319,
3
  "total_flos": 2100077946470400.0,
4
+ "train_loss": 0.6173035094612523,
5
+ "train_runtime": 37751.1113,
6
+ "train_samples_per_second": 17.021,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,910 +10,910 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.023894862604540025,
13
- "grad_norm": 6.081180104447192,
14
  "learning_rate": 5e-06,
15
- "loss": 0.9213,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.04778972520908005,
20
- "grad_norm": 2.5675420707703682,
21
  "learning_rate": 5e-06,
22
- "loss": 0.8064,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07168458781362007,
27
- "grad_norm": 1.116884956350582,
28
  "learning_rate": 5e-06,
29
- "loss": 0.764,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.0955794504181601,
34
- "grad_norm": 0.9827896973879405,
35
  "learning_rate": 5e-06,
36
- "loss": 0.7456,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.11947431302270012,
41
- "grad_norm": 1.1716177346699552,
42
  "learning_rate": 5e-06,
43
- "loss": 0.7252,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.14336917562724014,
48
- "grad_norm": 1.054189475268262,
49
  "learning_rate": 5e-06,
50
- "loss": 0.7144,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16726403823178015,
55
- "grad_norm": 0.7494472231038515,
56
  "learning_rate": 5e-06,
57
- "loss": 0.7153,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.1911589008363202,
62
- "grad_norm": 0.5903583553666529,
63
  "learning_rate": 5e-06,
64
- "loss": 0.6941,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21505376344086022,
69
- "grad_norm": 0.7850320924725688,
70
  "learning_rate": 5e-06,
71
- "loss": 0.6917,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.23894862604540024,
76
- "grad_norm": 0.8082497951696401,
77
  "learning_rate": 5e-06,
78
- "loss": 0.6931,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2628434886499403,
83
- "grad_norm": 0.5831514385960807,
84
  "learning_rate": 5e-06,
85
- "loss": 0.6808,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.2867383512544803,
90
- "grad_norm": 0.7344040626713287,
91
  "learning_rate": 5e-06,
92
- "loss": 0.689,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3106332138590203,
97
- "grad_norm": 0.8291631762782786,
98
  "learning_rate": 5e-06,
99
- "loss": 0.6852,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.3345280764635603,
104
- "grad_norm": 0.555446198309624,
105
  "learning_rate": 5e-06,
106
- "loss": 0.6746,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.35842293906810035,
111
- "grad_norm": 0.8214482724693175,
112
  "learning_rate": 5e-06,
113
- "loss": 0.6789,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.3823178016726404,
118
- "grad_norm": 0.5332479678739207,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6676,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4062126642771804,
125
- "grad_norm": 1.0446840993388027,
126
  "learning_rate": 5e-06,
127
- "loss": 0.6669,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.43010752688172044,
132
- "grad_norm": 0.5354819297917649,
133
  "learning_rate": 5e-06,
134
- "loss": 0.6671,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.4540023894862604,
139
- "grad_norm": 0.5790753975231967,
140
  "learning_rate": 5e-06,
141
- "loss": 0.6637,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.4778972520908005,
146
- "grad_norm": 0.5611754139446838,
147
  "learning_rate": 5e-06,
148
- "loss": 0.6706,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5017921146953405,
153
- "grad_norm": 0.541229197735182,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6678,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5256869772998806,
160
- "grad_norm": 0.6290230152316767,
161
  "learning_rate": 5e-06,
162
- "loss": 0.6615,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5495818399044206,
167
- "grad_norm": 0.5103134958958712,
168
  "learning_rate": 5e-06,
169
- "loss": 0.6621,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5734767025089605,
174
- "grad_norm": 0.4963914880777678,
175
  "learning_rate": 5e-06,
176
- "loss": 0.6587,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.5973715651135006,
181
- "grad_norm": 0.7047770736230026,
182
  "learning_rate": 5e-06,
183
- "loss": 0.6633,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6212664277180406,
188
- "grad_norm": 0.6875405023947134,
189
  "learning_rate": 5e-06,
190
- "loss": 0.6602,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.6451612903225806,
195
- "grad_norm": 0.5469403807072362,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6649,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6690561529271206,
202
- "grad_norm": 0.6301316104025243,
203
  "learning_rate": 5e-06,
204
- "loss": 0.6605,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6929510155316607,
209
- "grad_norm": 0.7436109186331767,
210
  "learning_rate": 5e-06,
211
- "loss": 0.657,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7168458781362007,
216
- "grad_norm": 0.6316182942840975,
217
  "learning_rate": 5e-06,
218
- "loss": 0.659,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7407407407407407,
223
- "grad_norm": 0.5862276185836299,
224
  "learning_rate": 5e-06,
225
- "loss": 0.6543,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7646356033452808,
230
- "grad_norm": 0.5727274506679324,
231
  "learning_rate": 5e-06,
232
- "loss": 0.6578,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7885304659498208,
237
- "grad_norm": 0.8637237307062305,
238
  "learning_rate": 5e-06,
239
- "loss": 0.6572,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8124253285543608,
244
- "grad_norm": 0.5716637099962937,
245
  "learning_rate": 5e-06,
246
- "loss": 0.6544,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8363201911589009,
251
- "grad_norm": 0.5267829079741267,
252
  "learning_rate": 5e-06,
253
- "loss": 0.6542,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8602150537634409,
258
- "grad_norm": 0.5819870889688735,
259
  "learning_rate": 5e-06,
260
- "loss": 0.6524,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8841099163679809,
265
- "grad_norm": 0.5230379233220511,
266
  "learning_rate": 5e-06,
267
- "loss": 0.6561,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9080047789725209,
272
- "grad_norm": 0.5405733925967506,
273
  "learning_rate": 5e-06,
274
- "loss": 0.6576,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.931899641577061,
279
- "grad_norm": 0.6252081499402665,
280
  "learning_rate": 5e-06,
281
- "loss": 0.6431,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.955794504181601,
286
- "grad_norm": 0.5241989931128205,
287
  "learning_rate": 5e-06,
288
- "loss": 0.645,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9796893667861409,
293
- "grad_norm": 0.5917706193349264,
294
  "learning_rate": 5e-06,
295
- "loss": 0.6548,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.998805256869773,
300
- "eval_loss": 0.6481666564941406,
301
- "eval_runtime": 225.2584,
302
- "eval_samples_per_second": 50.045,
303
- "eval_steps_per_second": 0.395,
304
  "step": 418
305
  },
306
  {
307
  "epoch": 1.003584229390681,
308
- "grad_norm": 0.8892508812440383,
309
  "learning_rate": 5e-06,
310
- "loss": 0.645,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.027479091995221,
315
- "grad_norm": 0.6856782905740205,
316
  "learning_rate": 5e-06,
317
- "loss": 0.6088,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0513739545997611,
322
- "grad_norm": 0.5419930096362386,
323
  "learning_rate": 5e-06,
324
- "loss": 0.6083,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.075268817204301,
329
- "grad_norm": 0.6254120282769089,
330
  "learning_rate": 5e-06,
331
- "loss": 0.6102,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.099163679808841,
336
- "grad_norm": 0.7886162221301777,
337
  "learning_rate": 5e-06,
338
- "loss": 0.6022,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.1230585424133812,
343
- "grad_norm": 0.5575253634257799,
344
  "learning_rate": 5e-06,
345
- "loss": 0.6008,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.146953405017921,
350
- "grad_norm": 0.5457321558023005,
351
  "learning_rate": 5e-06,
352
- "loss": 0.6076,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1708482676224612,
357
- "grad_norm": 0.4910125465988992,
358
  "learning_rate": 5e-06,
359
- "loss": 0.5998,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.194743130227001,
364
- "grad_norm": 0.5851438416753514,
365
  "learning_rate": 5e-06,
366
- "loss": 0.6095,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2186379928315412,
371
- "grad_norm": 0.5544196352892787,
372
  "learning_rate": 5e-06,
373
- "loss": 0.608,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.2425328554360813,
378
- "grad_norm": 0.5835218258508235,
379
  "learning_rate": 5e-06,
380
- "loss": 0.6053,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2664277180406214,
385
- "grad_norm": 0.6402155111012049,
386
  "learning_rate": 5e-06,
387
- "loss": 0.6086,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.2903225806451613,
392
- "grad_norm": 0.5274545363597922,
393
  "learning_rate": 5e-06,
394
- "loss": 0.6082,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3142174432497014,
399
- "grad_norm": 0.5023370306863523,
400
  "learning_rate": 5e-06,
401
- "loss": 0.6097,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.3381123058542412,
406
- "grad_norm": 0.5336199850801069,
407
  "learning_rate": 5e-06,
408
- "loss": 0.6037,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.3620071684587813,
413
- "grad_norm": 0.7734469958692578,
414
  "learning_rate": 5e-06,
415
- "loss": 0.6066,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.3859020310633214,
420
- "grad_norm": 0.8452375931165987,
421
  "learning_rate": 5e-06,
422
- "loss": 0.604,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.4097968936678615,
427
- "grad_norm": 0.5907657942794627,
428
  "learning_rate": 5e-06,
429
- "loss": 0.6042,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4336917562724014,
434
- "grad_norm": 0.6105214261701881,
435
  "learning_rate": 5e-06,
436
- "loss": 0.6077,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4575866188769415,
441
- "grad_norm": 0.6803070474017702,
442
  "learning_rate": 5e-06,
443
- "loss": 0.6145,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4814814814814814,
448
- "grad_norm": 0.5137151572440128,
449
  "learning_rate": 5e-06,
450
- "loss": 0.6068,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.5053763440860215,
455
- "grad_norm": 0.5526611853120886,
456
  "learning_rate": 5e-06,
457
- "loss": 0.6023,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5292712066905616,
462
- "grad_norm": 0.5276740317417068,
463
  "learning_rate": 5e-06,
464
- "loss": 0.6066,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.5531660692951017,
469
- "grad_norm": 0.5266173782221237,
470
  "learning_rate": 5e-06,
471
- "loss": 0.604,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5770609318996416,
476
- "grad_norm": 0.5879899658739348,
477
  "learning_rate": 5e-06,
478
- "loss": 0.6062,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.6009557945041815,
483
- "grad_norm": 0.6426153237314072,
484
  "learning_rate": 5e-06,
485
- "loss": 0.6044,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.6248506571087216,
490
- "grad_norm": 0.5633170243940351,
491
  "learning_rate": 5e-06,
492
- "loss": 0.5975,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.6487455197132617,
497
- "grad_norm": 0.5448798915341956,
498
  "learning_rate": 5e-06,
499
- "loss": 0.5969,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.6726403823178018,
504
- "grad_norm": 0.484774901310647,
505
  "learning_rate": 5e-06,
506
- "loss": 0.6069,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.6965352449223416,
511
- "grad_norm": 0.5257867856081395,
512
  "learning_rate": 5e-06,
513
- "loss": 0.5985,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7204301075268817,
518
- "grad_norm": 0.5602525718442715,
519
  "learning_rate": 5e-06,
520
- "loss": 0.612,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.7443249701314216,
525
- "grad_norm": 0.5040592279383703,
526
  "learning_rate": 5e-06,
527
- "loss": 0.6021,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.7682198327359617,
532
- "grad_norm": 0.6140508371910811,
533
  "learning_rate": 5e-06,
534
- "loss": 0.6029,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.7921146953405018,
539
- "grad_norm": 0.526723228546995,
540
  "learning_rate": 5e-06,
541
- "loss": 0.6051,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.816009557945042,
546
- "grad_norm": 0.5485611376004595,
547
  "learning_rate": 5e-06,
548
- "loss": 0.602,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8399044205495818,
553
- "grad_norm": 0.4925549773889819,
554
  "learning_rate": 5e-06,
555
- "loss": 0.5976,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.863799283154122,
560
- "grad_norm": 0.5127568984869073,
561
  "learning_rate": 5e-06,
562
- "loss": 0.6015,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.8876941457586618,
567
- "grad_norm": 0.7953660178238419,
568
  "learning_rate": 5e-06,
569
- "loss": 0.5983,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.911589008363202,
574
- "grad_norm": 0.4644171655680959,
575
  "learning_rate": 5e-06,
576
- "loss": 0.6043,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.935483870967742,
581
- "grad_norm": 0.5504656432596945,
582
  "learning_rate": 5e-06,
583
- "loss": 0.5991,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.959378733572282,
588
- "grad_norm": 0.5014583862445193,
589
  "learning_rate": 5e-06,
590
- "loss": 0.6027,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.983273596176822,
595
- "grad_norm": 0.5169701196971361,
596
  "learning_rate": 5e-06,
597
- "loss": 0.6065,
598
  "step": 830
599
  },
600
  {
601
  "epoch": 2.0,
602
- "eval_loss": 0.6383097767829895,
603
- "eval_runtime": 225.1334,
604
- "eval_samples_per_second": 50.073,
605
- "eval_steps_per_second": 0.395,
606
  "step": 837
607
  },
608
  {
609
  "epoch": 2.007168458781362,
610
- "grad_norm": 0.9522384301889181,
611
  "learning_rate": 5e-06,
612
- "loss": 0.5892,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.031063321385902,
617
- "grad_norm": 0.5604355482368693,
618
  "learning_rate": 5e-06,
619
- "loss": 0.555,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.054958183990442,
624
- "grad_norm": 0.8454377137422782,
625
  "learning_rate": 5e-06,
626
- "loss": 0.559,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.078853046594982,
631
- "grad_norm": 0.6125609351007736,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5588,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.1027479091995223,
638
- "grad_norm": 0.550684661961655,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5566,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.126642771804062,
645
- "grad_norm": 0.5268020228547002,
646
  "learning_rate": 5e-06,
647
- "loss": 0.5509,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.150537634408602,
652
- "grad_norm": 0.5374686724727457,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5558,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.174432497013142,
659
- "grad_norm": 0.6591728468868026,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5523,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.198327359617682,
666
- "grad_norm": 0.5758411262414109,
667
  "learning_rate": 5e-06,
668
- "loss": 0.5626,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.2222222222222223,
673
- "grad_norm": 0.5974631029863997,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5633,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.2461170848267624,
680
- "grad_norm": 0.6693430289411901,
681
  "learning_rate": 5e-06,
682
- "loss": 0.5569,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.270011947431302,
687
- "grad_norm": 0.5235798291837075,
688
  "learning_rate": 5e-06,
689
- "loss": 0.5592,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.293906810035842,
694
- "grad_norm": 0.5951063252000355,
695
  "learning_rate": 5e-06,
696
- "loss": 0.558,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.3178016726403823,
701
- "grad_norm": 0.7007904171286725,
702
  "learning_rate": 5e-06,
703
- "loss": 0.5585,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3416965352449224,
708
- "grad_norm": 0.6157005538099132,
709
  "learning_rate": 5e-06,
710
- "loss": 0.5584,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.3655913978494625,
715
- "grad_norm": 0.5560793638442904,
716
  "learning_rate": 5e-06,
717
- "loss": 0.5536,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.389486260454002,
722
- "grad_norm": 0.5994284875647052,
723
  "learning_rate": 5e-06,
724
- "loss": 0.559,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.4133811230585422,
729
- "grad_norm": 0.6178447526276027,
730
  "learning_rate": 5e-06,
731
- "loss": 0.5622,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4372759856630823,
736
- "grad_norm": 0.598268249438012,
737
  "learning_rate": 5e-06,
738
- "loss": 0.5617,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.4611708482676224,
743
- "grad_norm": 0.6456807780700198,
744
  "learning_rate": 5e-06,
745
- "loss": 0.5598,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.4850657108721625,
750
- "grad_norm": 0.5838521623874042,
751
  "learning_rate": 5e-06,
752
- "loss": 0.5565,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.5089605734767026,
757
- "grad_norm": 0.6160425605583054,
758
  "learning_rate": 5e-06,
759
- "loss": 0.561,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.5328554360812428,
764
- "grad_norm": 0.6471012332418825,
765
  "learning_rate": 5e-06,
766
- "loss": 0.5609,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5567502986857824,
771
- "grad_norm": 0.7438495174826055,
772
  "learning_rate": 5e-06,
773
- "loss": 0.559,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.5806451612903225,
778
- "grad_norm": 0.5965448247540126,
779
  "learning_rate": 5e-06,
780
- "loss": 0.57,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.6045400238948626,
785
- "grad_norm": 0.6030850533182174,
786
  "learning_rate": 5e-06,
787
- "loss": 0.5619,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.6284348864994027,
792
- "grad_norm": 0.5627845167167422,
793
  "learning_rate": 5e-06,
794
- "loss": 0.5615,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.652329749103943,
799
- "grad_norm": 0.5586383353478368,
800
  "learning_rate": 5e-06,
801
- "loss": 0.5567,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.6762246117084825,
806
- "grad_norm": 0.6169226364241693,
807
  "learning_rate": 5e-06,
808
- "loss": 0.5614,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.7001194743130226,
813
- "grad_norm": 0.5655962840268283,
814
  "learning_rate": 5e-06,
815
- "loss": 0.566,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.7240143369175627,
820
- "grad_norm": 0.5379947967296507,
821
  "learning_rate": 5e-06,
822
- "loss": 0.5585,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.7479091995221028,
827
- "grad_norm": 0.6022462071356245,
828
  "learning_rate": 5e-06,
829
- "loss": 0.5647,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.771804062126643,
834
- "grad_norm": 0.5630793813942363,
835
  "learning_rate": 5e-06,
836
- "loss": 0.5701,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.795698924731183,
841
- "grad_norm": 0.7669976902930744,
842
  "learning_rate": 5e-06,
843
- "loss": 0.5584,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.819593787335723,
848
- "grad_norm": 0.6081830710715922,
849
  "learning_rate": 5e-06,
850
- "loss": 0.5619,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.8434886499402627,
855
- "grad_norm": 0.5471146843592755,
856
  "learning_rate": 5e-06,
857
- "loss": 0.5607,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.867383512544803,
862
- "grad_norm": 0.500432815577127,
863
  "learning_rate": 5e-06,
864
- "loss": 0.5605,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.891278375149343,
869
- "grad_norm": 0.5513789797790463,
870
  "learning_rate": 5e-06,
871
- "loss": 0.558,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.915173237753883,
876
- "grad_norm": 0.5699534515350526,
877
  "learning_rate": 5e-06,
878
- "loss": 0.5626,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.9390681003584227,
883
- "grad_norm": 0.6140827402540434,
884
  "learning_rate": 5e-06,
885
- "loss": 0.5648,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.962962962962963,
890
- "grad_norm": 0.6486122276400309,
891
  "learning_rate": 5e-06,
892
- "loss": 0.5627,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.986857825567503,
897
- "grad_norm": 0.5448840278284515,
898
  "learning_rate": 5e-06,
899
- "loss": 0.5672,
900
  "step": 1250
901
  },
902
  {
903
  "epoch": 2.996415770609319,
904
- "eval_loss": 0.6404265761375427,
905
- "eval_runtime": 226.6238,
906
- "eval_samples_per_second": 49.743,
907
- "eval_steps_per_second": 0.393,
908
  "step": 1254
909
  },
910
  {
911
  "epoch": 2.996415770609319,
912
  "step": 1254,
913
  "total_flos": 2100077946470400.0,
914
- "train_loss": 0.6158634758832162,
915
- "train_runtime": 37532.7652,
916
- "train_samples_per_second": 17.12,
917
  "train_steps_per_second": 0.033
918
  }
919
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.023894862604540025,
13
+ "grad_norm": 4.236361518666415,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.881,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.04778972520908005,
20
+ "grad_norm": 5.741199257309677,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.7803,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07168458781362007,
27
+ "grad_norm": 19.255957544263854,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.7636,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.0955794504181601,
34
+ "grad_norm": 1.4389740219584857,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7601,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.11947431302270012,
41
+ "grad_norm": 2.052033228969374,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.7307,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.14336917562724014,
48
+ "grad_norm": 1.4883942502737415,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.7194,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16726403823178015,
55
+ "grad_norm": 0.7721229562083435,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.7186,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.1911589008363202,
62
+ "grad_norm": 0.5683129469939435,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.6965,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21505376344086022,
69
+ "grad_norm": 0.527733611127623,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.6922,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.23894862604540024,
76
+ "grad_norm": 0.5540046789983225,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.693,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2628434886499403,
83
+ "grad_norm": 0.5451390307514128,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.6802,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.2867383512544803,
90
+ "grad_norm": 0.5143838898116624,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.688,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3106332138590203,
97
+ "grad_norm": 0.6502984472755421,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.6838,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.3345280764635603,
104
+ "grad_norm": 0.5635569077666838,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.6733,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.35842293906810035,
111
+ "grad_norm": 0.6029469287016763,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.6776,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.3823178016726404,
118
+ "grad_norm": 0.486292600711864,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6661,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4062126642771804,
125
+ "grad_norm": 0.6615883711779132,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6652,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.43010752688172044,
132
+ "grad_norm": 0.4717863479299739,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.6655,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.4540023894862604,
139
+ "grad_norm": 0.4888275284899482,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.662,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.4778972520908005,
146
+ "grad_norm": 0.5394213188181476,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.6687,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5017921146953405,
153
+ "grad_norm": 0.45576158948311507,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.6661,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5256869772998806,
160
+ "grad_norm": 0.45151984287636476,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.6597,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5495818399044206,
167
+ "grad_norm": 0.4799647706900106,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6602,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5734767025089605,
174
+ "grad_norm": 0.47076133511342133,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.6571,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.5973715651135006,
181
+ "grad_norm": 0.5919116297131423,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.6615,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6212664277180406,
188
+ "grad_norm": 0.5500231129527917,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.6585,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.6451612903225806,
195
+ "grad_norm": 0.5242998976170237,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6633,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6690561529271206,
202
+ "grad_norm": 0.44132900428051,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.6588,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6929510155316607,
209
+ "grad_norm": 0.6925054556015406,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6553,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7168458781362007,
216
+ "grad_norm": 0.4625241785333385,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.6574,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7407407407407407,
223
+ "grad_norm": 0.4229402308269957,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6527,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7646356033452808,
230
+ "grad_norm": 0.5130609463277542,
231
  "learning_rate": 5e-06,
232
+ "loss": 0.6561,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7885304659498208,
237
+ "grad_norm": 0.6838274381409521,
238
  "learning_rate": 5e-06,
239
+ "loss": 0.6555,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8124253285543608,
244
+ "grad_norm": 0.4426103821343896,
245
  "learning_rate": 5e-06,
246
+ "loss": 0.6528,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8363201911589009,
251
+ "grad_norm": 0.4768048776745041,
252
  "learning_rate": 5e-06,
253
+ "loss": 0.6526,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8602150537634409,
258
+ "grad_norm": 0.47979657505843953,
259
  "learning_rate": 5e-06,
260
+ "loss": 0.6507,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8841099163679809,
265
+ "grad_norm": 0.43210991398577236,
266
  "learning_rate": 5e-06,
267
+ "loss": 0.6545,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9080047789725209,
272
+ "grad_norm": 0.4219482631866451,
273
  "learning_rate": 5e-06,
274
+ "loss": 0.6561,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.931899641577061,
279
+ "grad_norm": 0.4889263682317913,
280
  "learning_rate": 5e-06,
281
+ "loss": 0.6415,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.955794504181601,
286
+ "grad_norm": 0.4994356501839893,
287
  "learning_rate": 5e-06,
288
+ "loss": 0.6434,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9796893667861409,
293
+ "grad_norm": 0.5756138907013993,
294
  "learning_rate": 5e-06,
295
+ "loss": 0.6535,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.998805256869773,
300
+ "eval_loss": 0.6466529965400696,
301
+ "eval_runtime": 225.9354,
302
+ "eval_samples_per_second": 49.895,
303
+ "eval_steps_per_second": 0.394,
304
  "step": 418
305
  },
306
  {
307
  "epoch": 1.003584229390681,
308
+ "grad_norm": 0.7256416169536216,
309
  "learning_rate": 5e-06,
310
+ "loss": 0.6438,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.027479091995221,
315
+ "grad_norm": 0.6564158902335233,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.6108,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0513739545997611,
322
+ "grad_norm": 0.4999679801637927,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.6106,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.075268817204301,
329
+ "grad_norm": 0.5241048691611577,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.6124,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.099163679808841,
336
+ "grad_norm": 0.5456228664692746,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.6042,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.1230585424133812,
343
+ "grad_norm": 0.5456744152195628,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.6028,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.146953405017921,
350
+ "grad_norm": 0.4664933079979728,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.6095,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1708482676224612,
357
+ "grad_norm": 0.4894583019401931,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.6019,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.194743130227001,
364
+ "grad_norm": 0.4942642519947347,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.6114,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2186379928315412,
371
+ "grad_norm": 0.46554339302452813,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.6099,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.2425328554360813,
378
+ "grad_norm": 0.5215764597896382,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.6073,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2664277180406214,
385
+ "grad_norm": 0.5142341654295087,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.6105,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.2903225806451613,
392
+ "grad_norm": 0.4429903840954624,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.61,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3142174432497014,
399
+ "grad_norm": 0.4244756990330428,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.6113,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.3381123058542412,
406
+ "grad_norm": 0.4664930270424248,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.6057,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.3620071684587813,
413
+ "grad_norm": 0.6747787167132405,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.6084,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.3859020310633214,
420
+ "grad_norm": 0.8515989236641928,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.6058,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.4097968936678615,
427
+ "grad_norm": 0.634857639704424,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.6058,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4336917562724014,
434
+ "grad_norm": 0.5282115500074044,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.6096,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4575866188769415,
441
+ "grad_norm": 0.5576953727126037,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.616,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4814814814814814,
448
+ "grad_norm": 0.45965397939992636,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.6082,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.5053763440860215,
455
+ "grad_norm": 0.5729607655893968,
456
  "learning_rate": 5e-06,
457
+ "loss": 0.6037,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5292712066905616,
462
+ "grad_norm": 0.4420855639504453,
463
  "learning_rate": 5e-06,
464
+ "loss": 0.608,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.5531660692951017,
469
+ "grad_norm": 0.4815965030552482,
470
  "learning_rate": 5e-06,
471
+ "loss": 0.6053,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5770609318996416,
476
+ "grad_norm": 0.5446732967871324,
477
  "learning_rate": 5e-06,
478
+ "loss": 0.6076,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.6009557945041815,
483
+ "grad_norm": 0.5773921107864519,
484
  "learning_rate": 5e-06,
485
+ "loss": 0.6058,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.6248506571087216,
490
+ "grad_norm": 0.44904612161350127,
491
  "learning_rate": 5e-06,
492
+ "loss": 0.5988,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.6487455197132617,
497
+ "grad_norm": 0.4659803956684399,
498
  "learning_rate": 5e-06,
499
+ "loss": 0.598,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.6726403823178018,
504
+ "grad_norm": 0.4361474003132107,
505
  "learning_rate": 5e-06,
506
+ "loss": 0.6081,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.6965352449223416,
511
+ "grad_norm": 0.4702827100539838,
512
  "learning_rate": 5e-06,
513
+ "loss": 0.5997,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7204301075268817,
518
+ "grad_norm": 0.46962735672309736,
519
  "learning_rate": 5e-06,
520
+ "loss": 0.6135,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.7443249701314216,
525
+ "grad_norm": 0.5064462322593579,
526
  "learning_rate": 5e-06,
527
+ "loss": 0.6034,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.7682198327359617,
532
+ "grad_norm": 0.6442892941899157,
533
  "learning_rate": 5e-06,
534
+ "loss": 0.6044,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.7921146953405018,
539
+ "grad_norm": 0.449859458258856,
540
  "learning_rate": 5e-06,
541
+ "loss": 0.6062,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.816009557945042,
546
+ "grad_norm": 0.47467567108778363,
547
  "learning_rate": 5e-06,
548
+ "loss": 0.6035,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8399044205495818,
553
+ "grad_norm": 0.43550415026449085,
554
  "learning_rate": 5e-06,
555
+ "loss": 0.5987,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.863799283154122,
560
+ "grad_norm": 0.48913780227876247,
561
  "learning_rate": 5e-06,
562
+ "loss": 0.6031,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.8876941457586618,
567
+ "grad_norm": 0.5594004132295759,
568
  "learning_rate": 5e-06,
569
+ "loss": 0.5995,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.911589008363202,
574
+ "grad_norm": 0.4971730954697683,
575
  "learning_rate": 5e-06,
576
+ "loss": 0.6056,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.935483870967742,
581
+ "grad_norm": 0.46186692571258725,
582
  "learning_rate": 5e-06,
583
+ "loss": 0.6004,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.959378733572282,
588
+ "grad_norm": 0.4508830943663248,
589
  "learning_rate": 5e-06,
590
+ "loss": 0.604,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.983273596176822,
595
+ "grad_norm": 0.501207898912081,
596
  "learning_rate": 5e-06,
597
+ "loss": 0.608,
598
  "step": 830
599
  },
600
  {
601
  "epoch": 2.0,
602
+ "eval_loss": 0.6365451812744141,
603
+ "eval_runtime": 226.5918,
604
+ "eval_samples_per_second": 49.75,
605
+ "eval_steps_per_second": 0.393,
606
  "step": 837
607
  },
608
  {
609
  "epoch": 2.007168458781362,
610
+ "grad_norm": 0.85054207213812,
611
  "learning_rate": 5e-06,
612
+ "loss": 0.592,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.031063321385902,
617
+ "grad_norm": 0.5201507086108782,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5608,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.054958183990442,
624
+ "grad_norm": 0.7188535226812537,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.5647,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.078853046594982,
631
+ "grad_norm": 0.4763195641282365,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5644,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.1027479091995223,
638
+ "grad_norm": 0.5081415859208832,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5622,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.126642771804062,
645
+ "grad_norm": 0.5931792000293172,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5563,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.150537634408602,
652
+ "grad_norm": 0.49851033855755,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.5611,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.174432497013142,
659
+ "grad_norm": 0.5379278365329638,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.5573,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.198327359617682,
666
+ "grad_norm": 0.5350268044233742,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.5678,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.2222222222222223,
673
+ "grad_norm": 0.5689123372030673,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.5686,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.2461170848267624,
680
+ "grad_norm": 0.6671996296787344,
681
  "learning_rate": 5e-06,
682
+ "loss": 0.5622,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.270011947431302,
687
+ "grad_norm": 0.4506810352733908,
688
  "learning_rate": 5e-06,
689
+ "loss": 0.5642,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.293906810035842,
694
+ "grad_norm": 0.5358151280205125,
695
  "learning_rate": 5e-06,
696
+ "loss": 0.5628,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.3178016726403823,
701
+ "grad_norm": 0.5566771627404731,
702
  "learning_rate": 5e-06,
703
+ "loss": 0.5634,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3416965352449224,
708
+ "grad_norm": 0.49963936030628325,
709
  "learning_rate": 5e-06,
710
+ "loss": 0.5632,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.3655913978494625,
715
+ "grad_norm": 0.48679480824629434,
716
  "learning_rate": 5e-06,
717
+ "loss": 0.5583,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.389486260454002,
722
+ "grad_norm": 0.5074816823498985,
723
  "learning_rate": 5e-06,
724
+ "loss": 0.5636,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.4133811230585422,
729
+ "grad_norm": 0.5739148795335686,
730
  "learning_rate": 5e-06,
731
+ "loss": 0.5671,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4372759856630823,
736
+ "grad_norm": 0.6501742104516552,
737
  "learning_rate": 5e-06,
738
+ "loss": 0.5666,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.4611708482676224,
743
+ "grad_norm": 0.43406800220014613,
744
  "learning_rate": 5e-06,
745
+ "loss": 0.5645,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.4850657108721625,
750
+ "grad_norm": 0.47946981158627366,
751
  "learning_rate": 5e-06,
752
+ "loss": 0.5612,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.5089605734767026,
757
+ "grad_norm": 0.5508677225984592,
758
  "learning_rate": 5e-06,
759
+ "loss": 0.5658,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.5328554360812428,
764
+ "grad_norm": 0.6172108213167418,
765
  "learning_rate": 5e-06,
766
+ "loss": 0.5656,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5567502986857824,
771
+ "grad_norm": 0.6149816712572169,
772
  "learning_rate": 5e-06,
773
+ "loss": 0.5637,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.5806451612903225,
778
+ "grad_norm": 0.5494076230620691,
779
  "learning_rate": 5e-06,
780
+ "loss": 0.5748,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.6045400238948626,
785
+ "grad_norm": 0.5098015036653776,
786
  "learning_rate": 5e-06,
787
+ "loss": 0.5665,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.6284348864994027,
792
+ "grad_norm": 0.4763003977246298,
793
  "learning_rate": 5e-06,
794
+ "loss": 0.5659,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.652329749103943,
799
+ "grad_norm": 0.45015059391064355,
800
  "learning_rate": 5e-06,
801
+ "loss": 0.5613,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.6762246117084825,
806
+ "grad_norm": 0.5195016081388676,
807
  "learning_rate": 5e-06,
808
+ "loss": 0.5661,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.7001194743130226,
813
+ "grad_norm": 0.461979850463992,
814
  "learning_rate": 5e-06,
815
+ "loss": 0.5703,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.7240143369175627,
820
+ "grad_norm": 0.4611698536891998,
821
  "learning_rate": 5e-06,
822
+ "loss": 0.5628,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.7479091995221028,
827
+ "grad_norm": 0.5474996121575114,
828
  "learning_rate": 5e-06,
829
+ "loss": 0.5692,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.771804062126643,
834
+ "grad_norm": 0.477411852958178,
835
  "learning_rate": 5e-06,
836
+ "loss": 0.5745,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.795698924731183,
841
+ "grad_norm": 0.48004817339516165,
842
  "learning_rate": 5e-06,
843
+ "loss": 0.5625,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.819593787335723,
848
+ "grad_norm": 0.5043226922994581,
849
  "learning_rate": 5e-06,
850
+ "loss": 0.5664,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.8434886499402627,
855
+ "grad_norm": 0.4988305698181874,
856
  "learning_rate": 5e-06,
857
+ "loss": 0.5649,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.867383512544803,
862
+ "grad_norm": 0.4569103859069353,
863
  "learning_rate": 5e-06,
864
+ "loss": 0.5647,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.891278375149343,
869
+ "grad_norm": 0.46286445346886024,
870
  "learning_rate": 5e-06,
871
+ "loss": 0.5621,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.915173237753883,
876
+ "grad_norm": 0.5296890930558641,
877
  "learning_rate": 5e-06,
878
+ "loss": 0.5668,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.9390681003584227,
883
+ "grad_norm": 0.5546209266748766,
884
  "learning_rate": 5e-06,
885
+ "loss": 0.5688,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.962962962962963,
890
+ "grad_norm": 0.5910470543653078,
891
  "learning_rate": 5e-06,
892
+ "loss": 0.5668,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.986857825567503,
897
+ "grad_norm": 0.5524701632367459,
898
  "learning_rate": 5e-06,
899
+ "loss": 0.5714,
900
  "step": 1250
901
  },
902
  {
903
  "epoch": 2.996415770609319,
904
+ "eval_loss": 0.6373269557952881,
905
+ "eval_runtime": 227.0283,
906
+ "eval_samples_per_second": 49.655,
907
+ "eval_steps_per_second": 0.392,
908
  "step": 1254
909
  },
910
  {
911
  "epoch": 2.996415770609319,
912
  "step": 1254,
913
  "total_flos": 2100077946470400.0,
914
+ "train_loss": 0.6173035094612523,
915
+ "train_runtime": 37751.1113,
916
+ "train_samples_per_second": 17.021,
917
  "train_steps_per_second": 0.033
918
  }
919
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED