jdannem6 commited on
Commit
f388348
·
verified ·
1 Parent(s): 6e9c399

Uploaded checkpoint-22500

Browse files
adapter_config.json CHANGED
@@ -20,11 +20,11 @@
20
  "revision": null,
21
  "target_modules": [
22
  "down_proj",
23
- "up_proj",
24
  "gate_proj",
 
25
  "v_proj",
26
- "k_proj",
27
  "o_proj",
 
28
  "q_proj"
29
  ],
30
  "task_type": "CAUSAL_LM",
 
20
  "revision": null,
21
  "target_modules": [
22
  "down_proj",
 
23
  "gate_proj",
24
+ "up_proj",
25
  "v_proj",
 
26
  "o_proj",
27
+ "k_proj",
28
  "q_proj"
29
  ],
30
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1094cefddb8a4c25c681c6cde66e2e7b24fd394103df2badf5c69d6900ada43b
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:242bd1c8837db6c74ea117245bfc1b46592098a098bf72cb4a75b6fa1c50ea96
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:645d5b94ee5359b4733aca4181803ae6254706a9713eb85a854d8057e3a67182
3
- size 60477396
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39f5976b7e007e60478770ca750c77a010d3dbba2afa30dc3c72a0856e2cd01d
3
+ size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e33dda9942df9cbad9cd46793f638f52f82780e545c7592c3d1cbe682087eb0
3
- size 14180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:763d4f397fbd8e2128612f32f5c273b211bf68ec4372f02c7c91ca944e405a2f
3
+ size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29c7a79b53a589de48d3b7a21df9c0d024be4dea79f68869f72fdc01ae3b212a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d71c00e9bfbac2252002b6eca4a38910300bb6c14e6c56273842dfbc024260d9
3
  size 1064
trainer_state.json CHANGED
@@ -1,1484 +1,1810 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 0.5,
5
- "eval_steps": 2500,
6
- "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 4.094185829162598,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  "learning_rate": 4.000000000000001e-06,
15
- "loss": 1.8542,
16
  "step": 100
17
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  {
19
  "epoch": 0.01,
20
- "grad_norm": 8.345755577087402,
21
  "learning_rate": 8.000000000000001e-06,
22
- "loss": 1.4774,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.01,
27
- "grad_norm": 3.6847422122955322,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "learning_rate": 1.2e-05,
29
- "loss": 1.3027,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.01,
34
- "grad_norm": 15.149823188781738,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  "learning_rate": 1.6000000000000003e-05,
36
- "loss": 1.2168,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.01,
41
- "grad_norm": 9.95534896850586,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  "learning_rate": 2e-05,
43
- "loss": 1.1544,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.01,
48
- "grad_norm": 3.96409273147583,
49
- "learning_rate": 1.98974358974359e-05,
50
- "loss": 1.134,
51
- "step": 600
 
52
  },
53
  {
54
- "epoch": 0.02,
55
- "grad_norm": 2.587010383605957,
56
- "learning_rate": 1.9794871794871798e-05,
57
- "loss": 1.1294,
58
- "step": 700
59
  },
60
  {
61
- "epoch": 0.02,
62
- "grad_norm": 5.926353454589844,
63
- "learning_rate": 1.9692307692307696e-05,
64
- "loss": 1.0886,
65
- "step": 800
66
  },
67
  {
68
- "epoch": 0.02,
69
- "grad_norm": 4.175276756286621,
70
- "learning_rate": 1.958974358974359e-05,
71
- "loss": 1.1227,
72
- "step": 900
73
  },
74
  {
75
- "epoch": 0.03,
76
- "grad_norm": 2.2265052795410156,
77
- "learning_rate": 1.9487179487179488e-05,
78
- "loss": 1.0694,
79
- "step": 1000
80
  },
81
  {
82
- "epoch": 0.03,
83
- "grad_norm": 6.808347702026367,
84
- "learning_rate": 1.9384615384615386e-05,
85
- "loss": 1.1084,
86
- "step": 1100
87
  },
88
  {
89
- "epoch": 0.03,
90
- "grad_norm": 2.2117719650268555,
91
- "learning_rate": 1.9282051282051284e-05,
92
- "loss": 1.0758,
93
- "step": 1200
94
  },
95
  {
96
- "epoch": 0.03,
97
- "grad_norm": 2.893665075302124,
98
- "learning_rate": 1.9179487179487182e-05,
99
- "loss": 1.0732,
100
- "step": 1300
101
  },
102
  {
103
- "epoch": 0.04,
104
- "grad_norm": 4.583731174468994,
105
- "learning_rate": 1.907692307692308e-05,
106
- "loss": 1.0345,
107
- "step": 1400
108
  },
109
  {
110
- "epoch": 0.04,
111
- "grad_norm": 2.2239737510681152,
112
- "learning_rate": 1.8974358974358975e-05,
113
- "loss": 1.0151,
114
- "step": 1500
115
  },
116
  {
117
- "epoch": 0.04,
118
- "grad_norm": 6.440332412719727,
119
- "learning_rate": 1.8871794871794873e-05,
120
- "loss": 1.0249,
121
- "step": 1600
122
  },
123
  {
124
- "epoch": 0.04,
125
- "grad_norm": 3.9038124084472656,
126
- "learning_rate": 1.876923076923077e-05,
127
- "loss": 1.0481,
128
- "step": 1700
129
  },
130
  {
131
- "epoch": 0.04,
132
- "grad_norm": 4.901433944702148,
133
- "learning_rate": 1.866666666666667e-05,
134
- "loss": 1.0383,
135
- "step": 1800
136
  },
137
  {
138
- "epoch": 0.05,
139
- "grad_norm": 2.6100122928619385,
140
- "learning_rate": 1.8564102564102567e-05,
141
- "loss": 0.9715,
142
- "step": 1900
143
  },
144
  {
145
- "epoch": 0.05,
146
- "grad_norm": 4.283998012542725,
147
- "learning_rate": 1.8461538461538465e-05,
148
- "loss": 0.9946,
149
- "step": 2000
150
  },
151
  {
152
- "epoch": 0.05,
153
- "grad_norm": 5.045602798461914,
154
- "learning_rate": 1.835897435897436e-05,
155
- "loss": 1.0233,
156
- "step": 2100
157
  },
158
  {
159
- "epoch": 0.06,
160
- "grad_norm": 3.054832935333252,
161
- "learning_rate": 1.8256410256410257e-05,
162
- "loss": 1.0177,
163
- "step": 2200
164
  },
165
  {
166
- "epoch": 0.06,
167
- "grad_norm": 4.251312732696533,
168
- "learning_rate": 1.8153846153846155e-05,
169
- "loss": 0.9562,
170
- "step": 2300
171
  },
172
  {
173
- "epoch": 0.06,
174
- "grad_norm": 2.6943576335906982,
175
- "learning_rate": 1.8051282051282053e-05,
176
- "loss": 1.0076,
177
- "step": 2400
178
  },
179
  {
180
- "epoch": 0.06,
181
- "grad_norm": 3.307131290435791,
182
- "learning_rate": 1.794871794871795e-05,
183
- "loss": 0.9687,
184
- "step": 2500
185
  },
186
  {
187
- "epoch": 0.06,
188
- "eval_loss": 0.9706119894981384,
189
- "eval_runtime": 104.0832,
190
- "eval_samples_per_second": 9.608,
191
- "eval_steps_per_second": 9.608,
192
- "step": 2500
193
  },
194
  {
195
- "epoch": 0.07,
196
- "grad_norm": 1.8508224487304688,
197
- "learning_rate": 1.784615384615385e-05,
198
- "loss": 0.9693,
199
- "step": 2600
200
  },
201
  {
202
- "epoch": 0.07,
203
- "grad_norm": 2.838670253753662,
204
- "learning_rate": 1.7743589743589744e-05,
205
- "loss": 0.955,
206
- "step": 2700
207
  },
208
  {
209
- "epoch": 0.07,
210
- "grad_norm": 2.9186294078826904,
211
- "learning_rate": 1.7641025641025642e-05,
212
- "loss": 0.9504,
213
- "step": 2800
214
  },
215
  {
216
- "epoch": 0.07,
217
- "grad_norm": 4.183789253234863,
218
- "learning_rate": 1.753846153846154e-05,
219
- "loss": 0.9579,
220
- "step": 2900
221
  },
222
  {
223
- "epoch": 0.07,
224
- "grad_norm": 8.950007438659668,
225
- "learning_rate": 1.7435897435897438e-05,
226
- "loss": 0.9493,
227
- "step": 3000
228
  },
229
  {
230
- "epoch": 0.08,
231
- "grad_norm": 2.4844536781311035,
232
- "learning_rate": 1.7333333333333336e-05,
233
- "loss": 0.9271,
234
- "step": 3100
235
  },
236
  {
237
- "epoch": 0.08,
238
- "grad_norm": 2.786226272583008,
239
- "learning_rate": 1.7230769230769234e-05,
240
- "loss": 0.955,
241
- "step": 3200
242
  },
243
  {
244
- "epoch": 0.08,
245
- "grad_norm": 3.8355979919433594,
246
- "learning_rate": 1.7128205128205128e-05,
247
- "loss": 0.9351,
248
- "step": 3300
249
  },
250
  {
251
- "epoch": 0.09,
252
- "grad_norm": 11.382020950317383,
253
- "learning_rate": 1.7025641025641026e-05,
254
- "loss": 0.9472,
255
- "step": 3400
256
  },
257
  {
258
- "epoch": 0.09,
259
- "grad_norm": 5.132159233093262,
260
- "learning_rate": 1.6923076923076924e-05,
261
- "loss": 0.9172,
262
- "step": 3500
263
  },
264
  {
265
- "epoch": 0.09,
266
- "grad_norm": 7.389036178588867,
267
- "learning_rate": 1.6820512820512822e-05,
268
- "loss": 0.9273,
269
- "step": 3600
270
  },
271
  {
272
- "epoch": 0.09,
273
- "grad_norm": 3.8746137619018555,
274
- "learning_rate": 1.671794871794872e-05,
275
- "loss": 0.9497,
276
- "step": 3700
277
  },
278
  {
279
- "epoch": 0.1,
280
- "grad_norm": 2.999476194381714,
281
- "learning_rate": 1.6615384615384618e-05,
282
- "loss": 0.9321,
283
- "step": 3800
284
  },
285
  {
286
- "epoch": 0.1,
287
- "grad_norm": 5.325080394744873,
288
- "learning_rate": 1.6512820512820513e-05,
289
- "loss": 0.9375,
290
- "step": 3900
291
  },
292
  {
293
- "epoch": 0.1,
294
- "grad_norm": 2.8472201824188232,
295
- "learning_rate": 1.641025641025641e-05,
296
- "loss": 0.9285,
297
- "step": 4000
298
  },
299
  {
300
- "epoch": 0.1,
301
- "grad_norm": 4.391159534454346,
302
- "learning_rate": 1.630769230769231e-05,
303
- "loss": 0.9285,
304
- "step": 4100
305
  },
306
  {
307
- "epoch": 0.1,
308
- "grad_norm": 2.1041903495788574,
309
- "learning_rate": 1.6205128205128207e-05,
310
- "loss": 0.9262,
311
- "step": 4200
312
  },
313
  {
314
- "epoch": 0.11,
315
- "grad_norm": 2.626622200012207,
316
- "learning_rate": 1.6102564102564105e-05,
317
- "loss": 0.9557,
318
- "step": 4300
319
  },
320
  {
321
- "epoch": 0.11,
322
- "grad_norm": 3.374565362930298,
323
- "learning_rate": 1.6000000000000003e-05,
324
- "loss": 0.9518,
325
- "step": 4400
326
  },
327
  {
328
- "epoch": 0.11,
329
- "grad_norm": 3.344024658203125,
330
- "learning_rate": 1.5897435897435897e-05,
331
- "loss": 0.9141,
332
- "step": 4500
333
  },
334
  {
335
- "epoch": 0.12,
336
- "grad_norm": 4.982439994812012,
337
- "learning_rate": 1.5794871794871795e-05,
338
- "loss": 0.9209,
339
- "step": 4600
340
  },
341
  {
342
- "epoch": 0.12,
343
- "grad_norm": 3.430849075317383,
344
- "learning_rate": 1.5692307692307693e-05,
345
- "loss": 0.8916,
346
- "step": 4700
347
  },
348
  {
349
- "epoch": 0.12,
350
- "grad_norm": 3.118523597717285,
351
- "learning_rate": 1.558974358974359e-05,
352
- "loss": 0.9536,
353
- "step": 4800
354
  },
355
  {
356
- "epoch": 0.12,
357
- "grad_norm": 1.9410160779953003,
358
- "learning_rate": 1.548717948717949e-05,
359
- "loss": 0.9154,
360
- "step": 4900
361
  },
362
  {
363
- "epoch": 0.12,
364
- "grad_norm": 3.892230749130249,
365
- "learning_rate": 1.5384615384615387e-05,
366
- "loss": 0.9289,
367
- "step": 5000
368
  },
369
  {
370
- "epoch": 0.12,
371
- "eval_loss": 0.9065942168235779,
372
- "eval_runtime": 104.6944,
373
- "eval_samples_per_second": 9.552,
374
- "eval_steps_per_second": 9.552,
375
- "step": 5000
376
  },
377
  {
378
- "epoch": 0.13,
379
- "grad_norm": 3.4030818939208984,
380
- "learning_rate": 1.5282051282051282e-05,
381
- "loss": 0.8939,
382
- "step": 5100
383
  },
384
  {
385
- "epoch": 0.13,
386
- "grad_norm": 5.378746509552002,
387
- "learning_rate": 1.517948717948718e-05,
388
- "loss": 0.9134,
389
- "step": 5200
390
  },
391
  {
392
- "epoch": 0.13,
393
- "grad_norm": 6.0667009353637695,
394
- "learning_rate": 1.5076923076923078e-05,
395
- "loss": 0.884,
396
- "step": 5300
397
  },
398
  {
399
- "epoch": 0.14,
400
- "grad_norm": 10.8038969039917,
401
- "learning_rate": 1.4974358974358976e-05,
402
- "loss": 0.9061,
403
- "step": 5400
404
  },
405
  {
406
- "epoch": 0.14,
407
- "grad_norm": 4.017248630523682,
408
- "learning_rate": 1.4871794871794874e-05,
409
- "loss": 0.8847,
410
- "step": 5500
 
411
  },
412
  {
413
- "epoch": 0.14,
414
- "grad_norm": 3.3564505577087402,
415
- "learning_rate": 1.4769230769230772e-05,
416
- "loss": 0.8768,
417
- "step": 5600
418
  },
419
  {
420
- "epoch": 0.14,
421
- "grad_norm": 9.728605270385742,
422
- "learning_rate": 1.4666666666666666e-05,
423
- "loss": 0.8867,
424
- "step": 5700
425
  },
426
  {
427
- "epoch": 0.14,
428
- "grad_norm": 2.472195863723755,
429
- "learning_rate": 1.4564102564102564e-05,
430
- "loss": 0.898,
431
- "step": 5800
432
  },
433
  {
434
- "epoch": 0.15,
435
- "grad_norm": 6.605821132659912,
436
- "learning_rate": 1.4461538461538462e-05,
437
- "loss": 0.9115,
438
- "step": 5900
439
  },
440
  {
441
- "epoch": 0.15,
442
- "grad_norm": 4.562441825866699,
443
- "learning_rate": 1.435897435897436e-05,
444
- "loss": 0.8726,
445
- "step": 6000
446
  },
447
  {
448
- "epoch": 0.15,
449
- "grad_norm": 2.6972761154174805,
450
- "learning_rate": 1.4256410256410258e-05,
451
- "loss": 0.8702,
452
- "step": 6100
453
  },
454
  {
455
- "epoch": 0.15,
456
- "grad_norm": 4.478190898895264,
457
- "learning_rate": 1.4153846153846156e-05,
458
- "loss": 0.8905,
459
- "step": 6200
460
  },
461
  {
462
- "epoch": 0.16,
463
- "grad_norm": 4.112303733825684,
464
- "learning_rate": 1.405128205128205e-05,
465
- "loss": 0.9162,
466
- "step": 6300
467
  },
468
  {
469
- "epoch": 0.16,
470
- "grad_norm": 4.536581993103027,
471
- "learning_rate": 1.3948717948717949e-05,
472
- "loss": 0.8941,
473
- "step": 6400
474
  },
475
  {
476
- "epoch": 0.16,
477
- "grad_norm": 4.727623462677002,
478
- "learning_rate": 1.3846153846153847e-05,
479
- "loss": 0.8582,
480
- "step": 6500
481
  },
482
  {
483
- "epoch": 0.17,
484
- "grad_norm": 3.3152599334716797,
485
- "learning_rate": 1.3743589743589745e-05,
486
- "loss": 0.903,
487
- "step": 6600
488
  },
489
  {
490
- "epoch": 0.17,
491
- "grad_norm": 2.5421881675720215,
492
- "learning_rate": 1.3641025641025643e-05,
493
- "loss": 0.8967,
494
- "step": 6700
495
  },
496
  {
497
- "epoch": 0.17,
498
- "grad_norm": 2.4081993103027344,
499
- "learning_rate": 1.353846153846154e-05,
500
- "loss": 0.8801,
501
- "step": 6800
502
  },
503
  {
504
- "epoch": 0.17,
505
- "grad_norm": 12.206317901611328,
506
- "learning_rate": 1.3435897435897435e-05,
507
- "loss": 0.8594,
508
- "step": 6900
509
  },
510
  {
511
- "epoch": 0.17,
512
- "grad_norm": 4.930089473724365,
513
- "learning_rate": 1.3333333333333333e-05,
514
- "loss": 0.853,
515
- "step": 7000
516
  },
517
  {
518
- "epoch": 0.18,
519
- "grad_norm": 3.7278289794921875,
520
- "learning_rate": 1.3230769230769231e-05,
521
- "loss": 0.8462,
522
- "step": 7100
523
  },
524
  {
525
- "epoch": 0.18,
526
- "grad_norm": 2.5738131999969482,
527
- "learning_rate": 1.312820512820513e-05,
528
- "loss": 0.8466,
529
- "step": 7200
530
  },
531
  {
532
- "epoch": 0.18,
533
- "grad_norm": 3.267303466796875,
534
- "learning_rate": 1.3025641025641027e-05,
535
- "loss": 0.8616,
536
- "step": 7300
537
  },
538
  {
539
- "epoch": 0.18,
540
- "grad_norm": 2.1865787506103516,
541
- "learning_rate": 1.2923076923076925e-05,
542
- "loss": 0.8802,
543
- "step": 7400
544
  },
545
  {
546
- "epoch": 0.19,
547
- "grad_norm": 2.0264055728912354,
548
- "learning_rate": 1.2820512820512823e-05,
549
- "loss": 0.8841,
550
- "step": 7500
551
  },
552
  {
553
- "epoch": 0.19,
554
- "eval_loss": 0.8533282279968262,
555
- "eval_runtime": 106.0657,
556
- "eval_samples_per_second": 9.428,
557
- "eval_steps_per_second": 9.428,
558
- "step": 7500
559
  },
560
  {
561
- "epoch": 0.19,
562
- "grad_norm": 3.4310085773468018,
563
- "learning_rate": 1.2717948717948718e-05,
564
- "loss": 0.8767,
565
- "step": 7600
566
  },
567
  {
568
- "epoch": 0.19,
569
- "grad_norm": 2.533520460128784,
570
- "learning_rate": 1.2615384615384616e-05,
571
- "loss": 0.8784,
572
- "step": 7700
573
  },
574
  {
575
- "epoch": 0.2,
576
- "grad_norm": 4.673364639282227,
577
- "learning_rate": 1.2512820512820514e-05,
578
- "loss": 0.8504,
579
- "step": 7800
580
  },
581
  {
582
- "epoch": 0.2,
583
- "grad_norm": 2.4026598930358887,
584
- "learning_rate": 1.2410256410256412e-05,
585
- "loss": 0.8647,
586
- "step": 7900
587
  },
588
  {
589
- "epoch": 0.2,
590
- "grad_norm": 6.66796875,
591
- "learning_rate": 1.230769230769231e-05,
592
- "loss": 0.8634,
593
- "step": 8000
594
  },
595
  {
596
- "epoch": 0.2,
597
- "grad_norm": 1.8087568283081055,
598
- "learning_rate": 1.2205128205128208e-05,
599
- "loss": 0.8277,
600
- "step": 8100
601
  },
602
  {
603
- "epoch": 0.2,
604
- "grad_norm": 3.196040630340576,
605
- "learning_rate": 1.2102564102564102e-05,
606
- "loss": 0.8739,
607
- "step": 8200
608
  },
609
  {
610
- "epoch": 0.21,
611
- "grad_norm": 1.6817710399627686,
612
- "learning_rate": 1.2e-05,
613
- "loss": 0.8367,
614
- "step": 8300
615
  },
616
  {
617
- "epoch": 0.21,
618
- "grad_norm": 5.548306941986084,
619
- "learning_rate": 1.1897435897435898e-05,
620
- "loss": 0.8247,
621
- "step": 8400
622
  },
623
  {
624
- "epoch": 0.21,
625
- "grad_norm": 6.069587707519531,
626
- "learning_rate": 1.1794871794871796e-05,
627
- "loss": 0.8248,
628
- "step": 8500
629
  },
630
  {
631
- "epoch": 0.21,
632
- "grad_norm": 3.085785150527954,
633
- "learning_rate": 1.1692307692307694e-05,
634
- "loss": 0.8618,
635
- "step": 8600
636
  },
637
  {
638
- "epoch": 0.22,
639
- "grad_norm": 1.7855651378631592,
640
- "learning_rate": 1.1589743589743592e-05,
641
- "loss": 0.8601,
642
- "step": 8700
643
  },
644
  {
645
- "epoch": 0.22,
646
- "grad_norm": 3.378775119781494,
647
- "learning_rate": 1.1487179487179487e-05,
648
- "loss": 0.8712,
649
- "step": 8800
650
  },
651
  {
652
- "epoch": 0.22,
653
- "grad_norm": 2.7686617374420166,
654
- "learning_rate": 1.1384615384615385e-05,
655
- "loss": 0.852,
656
- "step": 8900
657
  },
658
  {
659
- "epoch": 0.23,
660
- "grad_norm": 5.424912452697754,
661
- "learning_rate": 1.1282051282051283e-05,
662
- "loss": 0.8796,
663
- "step": 9000
664
  },
665
  {
666
- "epoch": 0.23,
667
- "grad_norm": 6.806646347045898,
668
- "learning_rate": 1.117948717948718e-05,
669
- "loss": 0.8457,
670
- "step": 9100
671
  },
672
  {
673
- "epoch": 0.23,
674
- "grad_norm": 4.3275837898254395,
675
- "learning_rate": 1.1076923076923079e-05,
676
- "loss": 0.8346,
677
- "step": 9200
678
  },
679
  {
680
- "epoch": 0.23,
681
- "grad_norm": 5.901556015014648,
682
- "learning_rate": 1.0974358974358977e-05,
683
- "loss": 0.8489,
684
- "step": 9300
685
  },
686
  {
687
- "epoch": 0.23,
688
- "grad_norm": 6.287178993225098,
689
- "learning_rate": 1.0871794871794871e-05,
690
- "loss": 0.8463,
691
- "step": 9400
692
  },
693
  {
694
- "epoch": 0.24,
695
- "grad_norm": 2.2666897773742676,
696
- "learning_rate": 1.076923076923077e-05,
697
- "loss": 0.8399,
698
- "step": 9500
699
  },
700
  {
701
- "epoch": 0.24,
702
- "grad_norm": 2.2565557956695557,
703
- "learning_rate": 1.0666666666666667e-05,
704
- "loss": 0.8452,
705
- "step": 9600
706
  },
707
  {
708
- "epoch": 0.24,
709
- "grad_norm": 3.512251615524292,
710
- "learning_rate": 1.0564102564102565e-05,
711
- "loss": 0.8665,
712
- "step": 9700
713
  },
714
  {
715
- "epoch": 0.24,
716
- "grad_norm": 5.637045860290527,
717
- "learning_rate": 1.0461538461538463e-05,
718
- "loss": 0.829,
719
- "step": 9800
720
  },
721
  {
722
- "epoch": 0.25,
723
- "grad_norm": 2.9041316509246826,
724
- "learning_rate": 1.0358974358974361e-05,
725
- "loss": 0.8273,
726
- "step": 9900
727
  },
728
  {
729
- "epoch": 0.25,
730
- "grad_norm": 2.120234727859497,
731
- "learning_rate": 1.0256410256410256e-05,
732
- "loss": 0.7933,
733
- "step": 10000
734
  },
735
  {
736
- "epoch": 0.25,
737
- "eval_loss": 0.8178455829620361,
738
- "eval_runtime": 103.8104,
739
- "eval_samples_per_second": 9.633,
740
- "eval_steps_per_second": 9.633,
741
- "step": 10000
742
  },
743
  {
744
- "epoch": 0.25,
745
- "grad_norm": 5.0967559814453125,
746
- "learning_rate": 1.0153846153846154e-05,
747
- "loss": 0.833,
748
- "step": 10100
749
  },
750
  {
751
- "epoch": 0.26,
752
- "grad_norm": 9.097169876098633,
753
- "learning_rate": 1.0051282051282052e-05,
754
- "loss": 0.8363,
755
- "step": 10200
756
  },
757
  {
758
- "epoch": 0.26,
759
- "grad_norm": 3.159578561782837,
760
- "learning_rate": 9.94871794871795e-06,
761
- "loss": 0.8154,
762
- "step": 10300
763
  },
764
  {
765
- "epoch": 0.26,
766
- "grad_norm": 5.177265644073486,
767
- "learning_rate": 9.846153846153848e-06,
768
- "loss": 0.8391,
769
- "step": 10400
 
770
  },
771
  {
772
- "epoch": 0.26,
773
- "grad_norm": 4.336682319641113,
774
- "learning_rate": 9.743589743589744e-06,
775
- "loss": 0.8393,
776
- "step": 10500
777
  },
778
  {
779
- "epoch": 0.27,
780
- "grad_norm": 4.355902194976807,
781
- "learning_rate": 9.641025641025642e-06,
782
- "loss": 0.8491,
783
- "step": 10600
784
  },
785
  {
786
- "epoch": 0.27,
787
- "grad_norm": 6.8868279457092285,
788
- "learning_rate": 9.53846153846154e-06,
789
- "loss": 0.8026,
790
- "step": 10700
791
  },
792
  {
793
- "epoch": 0.27,
794
- "grad_norm": 2.8234918117523193,
795
- "learning_rate": 9.435897435897436e-06,
796
- "loss": 0.8412,
797
- "step": 10800
798
  },
799
  {
800
- "epoch": 0.27,
801
- "grad_norm": 4.60006046295166,
802
- "learning_rate": 9.333333333333334e-06,
803
- "loss": 0.8022,
804
- "step": 10900
805
  },
806
  {
807
- "epoch": 0.28,
808
- "grad_norm": 4.048822402954102,
809
- "learning_rate": 9.230769230769232e-06,
810
- "loss": 0.8117,
811
- "step": 11000
812
  },
813
  {
814
- "epoch": 0.28,
815
- "grad_norm": 3.5352272987365723,
816
- "learning_rate": 9.128205128205129e-06,
817
- "loss": 0.8621,
818
- "step": 11100
819
  },
820
  {
821
- "epoch": 0.28,
822
- "grad_norm": 3.284557819366455,
823
- "learning_rate": 9.025641025641027e-06,
824
- "loss": 0.7945,
825
- "step": 11200
826
  },
827
  {
828
- "epoch": 0.28,
829
- "grad_norm": 6.281557559967041,
830
- "learning_rate": 8.923076923076925e-06,
831
- "loss": 0.8398,
832
- "step": 11300
833
  },
834
  {
835
- "epoch": 0.28,
836
- "grad_norm": 4.4348297119140625,
837
- "learning_rate": 8.820512820512821e-06,
838
- "loss": 0.8151,
839
- "step": 11400
840
  },
841
  {
842
- "epoch": 0.29,
843
- "grad_norm": 4.739795684814453,
844
- "learning_rate": 8.717948717948719e-06,
845
- "loss": 0.8223,
846
- "step": 11500
847
  },
848
  {
849
- "epoch": 0.29,
850
- "grad_norm": 5.187675476074219,
851
- "learning_rate": 8.615384615384617e-06,
852
- "loss": 0.7946,
853
- "step": 11600
854
  },
855
  {
856
- "epoch": 0.29,
857
- "grad_norm": 6.4138360023498535,
858
- "learning_rate": 8.512820512820513e-06,
859
- "loss": 0.8118,
860
- "step": 11700
861
  },
862
  {
863
- "epoch": 0.29,
864
- "grad_norm": 3.3624444007873535,
865
- "learning_rate": 8.410256410256411e-06,
866
- "loss": 0.8234,
867
- "step": 11800
868
  },
869
  {
870
- "epoch": 0.3,
871
- "grad_norm": 1.7718826532363892,
872
- "learning_rate": 8.307692307692309e-06,
873
- "loss": 0.816,
874
- "step": 11900
875
  },
876
  {
877
- "epoch": 0.3,
878
- "grad_norm": 5.3870158195495605,
879
- "learning_rate": 8.205128205128205e-06,
880
- "loss": 0.83,
881
- "step": 12000
882
  },
883
  {
884
- "epoch": 0.3,
885
- "grad_norm": 7.233886241912842,
886
- "learning_rate": 8.102564102564103e-06,
887
- "loss": 0.7626,
888
- "step": 12100
889
  },
890
  {
891
- "epoch": 0.3,
892
- "grad_norm": 1.8522437810897827,
893
- "learning_rate": 8.000000000000001e-06,
894
- "loss": 0.7786,
895
- "step": 12200
 
 
 
 
 
 
 
896
  },
897
  {
898
- "epoch": 0.31,
899
- "grad_norm": 3.0882771015167236,
900
- "learning_rate": 7.897435897435898e-06,
901
- "loss": 0.8297,
902
- "step": 12300
903
  },
904
  {
905
- "epoch": 0.31,
906
- "grad_norm": 5.807680606842041,
907
- "learning_rate": 7.794871794871796e-06,
908
- "loss": 0.8571,
909
- "step": 12400
910
  },
911
  {
912
- "epoch": 0.31,
913
- "grad_norm": 3.875642776489258,
914
- "learning_rate": 7.692307692307694e-06,
915
- "loss": 0.8101,
916
- "step": 12500
917
  },
918
  {
919
- "epoch": 0.31,
920
- "eval_loss": 0.8394359946250916,
921
- "eval_runtime": 104.1474,
922
- "eval_samples_per_second": 9.602,
923
- "eval_steps_per_second": 9.602,
924
- "step": 12500
925
  },
926
  {
927
- "epoch": 0.32,
928
- "grad_norm": 10.450545310974121,
929
- "learning_rate": 7.58974358974359e-06,
930
- "loss": 0.7913,
931
- "step": 12600
932
  },
933
  {
934
- "epoch": 0.32,
935
- "grad_norm": 4.064128398895264,
936
- "learning_rate": 7.487179487179488e-06,
937
- "loss": 0.8523,
938
- "step": 12700
939
  },
940
  {
941
- "epoch": 0.32,
942
- "grad_norm": 2.707719564437866,
943
- "learning_rate": 7.384615384615386e-06,
944
- "loss": 0.8403,
945
- "step": 12800
946
  },
947
  {
948
- "epoch": 0.32,
949
- "grad_norm": 4.44093132019043,
950
- "learning_rate": 7.282051282051282e-06,
951
- "loss": 0.8243,
952
- "step": 12900
953
  },
954
  {
955
- "epoch": 0.33,
956
- "grad_norm": 4.285432815551758,
957
- "learning_rate": 7.17948717948718e-06,
958
- "loss": 0.8011,
959
- "step": 13000
960
  },
961
  {
962
- "epoch": 0.33,
963
- "grad_norm": 3.158308744430542,
964
- "learning_rate": 7.076923076923078e-06,
965
- "loss": 0.8062,
966
- "step": 13100
967
  },
968
  {
969
- "epoch": 0.33,
970
- "grad_norm": 5.444665431976318,
971
- "learning_rate": 6.974358974358974e-06,
972
- "loss": 0.8229,
973
- "step": 13200
974
  },
975
  {
976
- "epoch": 0.33,
977
- "grad_norm": 11.230988502502441,
978
- "learning_rate": 6.871794871794872e-06,
979
- "loss": 0.8169,
980
- "step": 13300
981
  },
982
  {
983
- "epoch": 0.34,
984
- "grad_norm": 16.40984344482422,
985
- "learning_rate": 6.76923076923077e-06,
986
- "loss": 0.8354,
987
- "step": 13400
988
  },
989
  {
990
- "epoch": 0.34,
991
- "grad_norm": 5.534363269805908,
992
- "learning_rate": 6.666666666666667e-06,
993
- "loss": 0.7963,
994
- "step": 13500
995
  },
996
  {
997
- "epoch": 0.34,
998
- "grad_norm": 5.745026588439941,
999
- "learning_rate": 6.564102564102565e-06,
1000
- "loss": 0.7899,
1001
- "step": 13600
1002
  },
1003
  {
1004
- "epoch": 0.34,
1005
- "grad_norm": 3.449707508087158,
1006
- "learning_rate": 6.461538461538463e-06,
1007
- "loss": 0.7783,
1008
- "step": 13700
1009
  },
1010
  {
1011
- "epoch": 0.34,
1012
- "grad_norm": 2.5562901496887207,
1013
- "learning_rate": 6.358974358974359e-06,
1014
- "loss": 0.8222,
1015
- "step": 13800
1016
  },
1017
  {
1018
- "epoch": 0.35,
1019
- "grad_norm": 4.387004375457764,
1020
- "learning_rate": 6.256410256410257e-06,
1021
- "loss": 0.8033,
1022
- "step": 13900
1023
  },
1024
  {
1025
- "epoch": 0.35,
1026
- "grad_norm": 3.628570318222046,
1027
- "learning_rate": 6.153846153846155e-06,
1028
- "loss": 0.7791,
1029
- "step": 14000
1030
  },
1031
  {
1032
- "epoch": 0.35,
1033
- "grad_norm": 4.86137580871582,
1034
- "learning_rate": 6.051282051282051e-06,
1035
- "loss": 0.7974,
1036
- "step": 14100
1037
  },
1038
  {
1039
- "epoch": 0.35,
1040
- "grad_norm": 3.2952165603637695,
1041
- "learning_rate": 5.948717948717949e-06,
1042
- "loss": 0.7768,
1043
- "step": 14200
1044
  },
1045
  {
1046
- "epoch": 0.36,
1047
- "grad_norm": 3.655470848083496,
1048
- "learning_rate": 5.846153846153847e-06,
1049
- "loss": 0.7969,
1050
- "step": 14300
1051
  },
1052
  {
1053
- "epoch": 0.36,
1054
- "grad_norm": 5.8347086906433105,
1055
- "learning_rate": 5.743589743589743e-06,
1056
- "loss": 0.801,
1057
- "step": 14400
1058
  },
1059
  {
1060
- "epoch": 0.36,
1061
- "grad_norm": 4.130991458892822,
1062
- "learning_rate": 5.641025641025641e-06,
1063
- "loss": 0.7876,
1064
- "step": 14500
1065
  },
1066
  {
1067
- "epoch": 0.36,
1068
- "grad_norm": 6.501937389373779,
1069
- "learning_rate": 5.538461538461539e-06,
1070
- "loss": 0.8172,
1071
- "step": 14600
1072
  },
1073
  {
1074
- "epoch": 0.37,
1075
- "grad_norm": 5.493655204772949,
1076
- "learning_rate": 5.435897435897436e-06,
1077
- "loss": 0.8158,
1078
- "step": 14700
1079
  },
1080
  {
1081
- "epoch": 0.37,
1082
- "grad_norm": 5.281980037689209,
1083
- "learning_rate": 5.333333333333334e-06,
1084
- "loss": 0.7831,
1085
- "step": 14800
1086
  },
1087
  {
1088
- "epoch": 0.37,
1089
- "grad_norm": 4.665294647216797,
1090
- "learning_rate": 5.230769230769232e-06,
1091
- "loss": 0.7772,
1092
- "step": 14900
1093
  },
1094
  {
1095
- "epoch": 0.38,
1096
- "grad_norm": 3.9457015991210938,
1097
- "learning_rate": 5.128205128205128e-06,
1098
- "loss": 0.777,
1099
- "step": 15000
1100
  },
1101
  {
1102
- "epoch": 0.38,
1103
- "eval_loss": 0.8082045316696167,
1104
- "eval_runtime": 104.2332,
1105
- "eval_samples_per_second": 9.594,
1106
- "eval_steps_per_second": 9.594,
1107
- "step": 15000
1108
  },
1109
  {
1110
- "epoch": 0.38,
1111
- "grad_norm": 9.883415222167969,
1112
- "learning_rate": 5.025641025641026e-06,
1113
- "loss": 0.793,
1114
- "step": 15100
1115
  },
1116
  {
1117
- "epoch": 0.38,
1118
- "grad_norm": 8.593897819519043,
1119
- "learning_rate": 4.923076923076924e-06,
1120
- "loss": 0.7879,
1121
- "step": 15200
 
1122
  },
1123
  {
1124
- "epoch": 0.38,
1125
- "grad_norm": 19.679561614990234,
1126
- "learning_rate": 4.820512820512821e-06,
1127
- "loss": 0.836,
1128
- "step": 15300
1129
  },
1130
  {
1131
- "epoch": 0.39,
1132
- "grad_norm": 4.36007833480835,
1133
- "learning_rate": 4.717948717948718e-06,
1134
- "loss": 0.81,
1135
- "step": 15400
1136
  },
1137
  {
1138
- "epoch": 0.39,
1139
- "grad_norm": 4.863149166107178,
1140
- "learning_rate": 4.615384615384616e-06,
1141
- "loss": 0.8149,
1142
- "step": 15500
1143
  },
1144
  {
1145
- "epoch": 0.39,
1146
- "grad_norm": 9.058311462402344,
1147
- "learning_rate": 4.512820512820513e-06,
1148
- "loss": 0.7914,
1149
- "step": 15600
1150
  },
1151
  {
1152
- "epoch": 0.39,
1153
- "grad_norm": 13.729168891906738,
1154
- "learning_rate": 4.4102564102564104e-06,
1155
- "loss": 0.7978,
1156
- "step": 15700
1157
  },
1158
  {
1159
- "epoch": 0.4,
1160
- "grad_norm": 2.711949586868286,
1161
- "learning_rate": 4.307692307692308e-06,
1162
- "loss": 0.7575,
1163
- "step": 15800
1164
  },
1165
  {
1166
- "epoch": 0.4,
1167
- "grad_norm": 5.580270767211914,
1168
- "learning_rate": 4.2051282051282055e-06,
1169
- "loss": 0.7934,
1170
- "step": 15900
1171
  },
1172
  {
1173
- "epoch": 0.4,
1174
- "grad_norm": 21.650022506713867,
1175
- "learning_rate": 4.102564102564103e-06,
1176
- "loss": 0.8141,
1177
- "step": 16000
1178
  },
1179
  {
1180
- "epoch": 0.4,
1181
- "grad_norm": 7.138460636138916,
1182
- "learning_rate": 4.000000000000001e-06,
1183
- "loss": 0.7433,
1184
- "step": 16100
1185
  },
1186
  {
1187
- "epoch": 0.41,
1188
- "grad_norm": 3.7532575130462646,
1189
- "learning_rate": 3.897435897435898e-06,
1190
- "loss": 0.7704,
1191
- "step": 16200
1192
  },
1193
  {
1194
- "epoch": 0.41,
1195
- "grad_norm": 2.153252601623535,
1196
- "learning_rate": 3.794871794871795e-06,
1197
- "loss": 0.7647,
1198
- "step": 16300
1199
  },
1200
  {
1201
- "epoch": 0.41,
1202
- "grad_norm": 4.485107898712158,
1203
- "learning_rate": 3.692307692307693e-06,
1204
- "loss": 0.8182,
1205
- "step": 16400
1206
  },
1207
  {
1208
- "epoch": 0.41,
1209
- "grad_norm": 5.237086772918701,
1210
- "learning_rate": 3.58974358974359e-06,
1211
- "loss": 0.7965,
1212
- "step": 16500
1213
  },
1214
  {
1215
- "epoch": 0.41,
1216
- "grad_norm": 2.242441177368164,
1217
- "learning_rate": 3.487179487179487e-06,
1218
- "loss": 0.8619,
1219
- "step": 16600
1220
  },
1221
  {
1222
- "epoch": 0.42,
1223
- "grad_norm": 3.2443642616271973,
1224
- "learning_rate": 3.384615384615385e-06,
1225
- "loss": 0.7702,
1226
- "step": 16700
1227
  },
1228
  {
1229
- "epoch": 0.42,
1230
- "grad_norm": 6.27290678024292,
1231
- "learning_rate": 3.2820512820512823e-06,
1232
- "loss": 0.7802,
1233
- "step": 16800
1234
  },
1235
  {
1236
- "epoch": 0.42,
1237
- "grad_norm": 5.323145866394043,
1238
- "learning_rate": 3.1794871794871795e-06,
1239
- "loss": 0.8103,
1240
- "step": 16900
1241
  },
1242
  {
1243
- "epoch": 0.42,
1244
- "grad_norm": 11.099617004394531,
1245
- "learning_rate": 3.0769230769230774e-06,
1246
- "loss": 0.7787,
1247
- "step": 17000
1248
  },
1249
  {
1250
- "epoch": 0.43,
1251
- "grad_norm": 3.3490378856658936,
1252
- "learning_rate": 2.9743589743589746e-06,
1253
- "loss": 0.7739,
1254
- "step": 17100
1255
  },
1256
  {
1257
- "epoch": 0.43,
1258
- "grad_norm": 5.076713562011719,
1259
- "learning_rate": 2.8717948717948717e-06,
1260
- "loss": 0.7401,
1261
- "step": 17200
1262
  },
1263
  {
1264
- "epoch": 0.43,
1265
- "grad_norm": 4.410634517669678,
1266
- "learning_rate": 2.7692307692307697e-06,
1267
- "loss": 0.7738,
1268
- "step": 17300
1269
  },
1270
  {
1271
- "epoch": 0.43,
1272
- "grad_norm": 3.249955177307129,
1273
- "learning_rate": 2.666666666666667e-06,
1274
- "loss": 0.7749,
1275
- "step": 17400
1276
  },
1277
  {
1278
- "epoch": 0.44,
1279
- "grad_norm": 4.0387349128723145,
1280
- "learning_rate": 2.564102564102564e-06,
1281
- "loss": 0.7704,
1282
- "step": 17500
1283
  },
1284
  {
1285
- "epoch": 0.44,
1286
- "eval_loss": 0.7883001565933228,
1287
- "eval_runtime": 104.2311,
1288
- "eval_samples_per_second": 9.594,
1289
- "eval_steps_per_second": 9.594,
1290
- "step": 17500
1291
  },
1292
  {
1293
- "epoch": 0.44,
1294
- "grad_norm": 7.914300918579102,
1295
- "learning_rate": 2.461538461538462e-06,
1296
- "loss": 0.7651,
1297
- "step": 17600
1298
  },
1299
  {
1300
- "epoch": 0.44,
1301
- "grad_norm": 4.809656620025635,
1302
- "learning_rate": 2.358974358974359e-06,
1303
- "loss": 0.7631,
1304
- "step": 17700
1305
  },
1306
  {
1307
- "epoch": 0.45,
1308
- "grad_norm": 6.220585823059082,
1309
- "learning_rate": 2.2564102564102566e-06,
1310
- "loss": 0.7925,
1311
- "step": 17800
1312
  },
1313
  {
1314
- "epoch": 0.45,
1315
- "grad_norm": 3.666391611099243,
1316
- "learning_rate": 2.153846153846154e-06,
1317
- "loss": 0.7857,
1318
- "step": 17900
1319
  },
1320
  {
1321
- "epoch": 0.45,
1322
- "grad_norm": 5.744978427886963,
1323
- "learning_rate": 2.0512820512820513e-06,
1324
- "loss": 0.8025,
1325
- "step": 18000
1326
  },
1327
  {
1328
- "epoch": 0.45,
1329
- "grad_norm": 5.490359783172607,
1330
- "learning_rate": 1.948717948717949e-06,
1331
- "loss": 0.8005,
1332
- "step": 18100
1333
  },
1334
  {
1335
- "epoch": 0.46,
1336
- "grad_norm": 3.3625869750976562,
1337
- "learning_rate": 1.8461538461538465e-06,
1338
- "loss": 0.7753,
1339
- "step": 18200
1340
  },
1341
  {
1342
- "epoch": 0.46,
1343
- "grad_norm": 13.186784744262695,
1344
- "learning_rate": 1.7435897435897436e-06,
1345
- "loss": 0.7705,
1346
- "step": 18300
1347
  },
1348
  {
1349
- "epoch": 0.46,
1350
- "grad_norm": 2.9938299655914307,
1351
- "learning_rate": 1.6410256410256412e-06,
1352
- "loss": 0.7838,
1353
- "step": 18400
1354
  },
1355
  {
1356
- "epoch": 0.46,
1357
- "grad_norm": 3.876194477081299,
1358
- "learning_rate": 1.5384615384615387e-06,
1359
- "loss": 0.7963,
1360
- "step": 18500
1361
  },
1362
  {
1363
- "epoch": 0.47,
1364
- "grad_norm": 8.027066230773926,
1365
- "learning_rate": 1.4358974358974359e-06,
1366
- "loss": 0.7841,
1367
- "step": 18600
1368
  },
1369
  {
1370
- "epoch": 0.47,
1371
- "grad_norm": 6.673095226287842,
1372
- "learning_rate": 1.3333333333333334e-06,
1373
- "loss": 0.7676,
1374
- "step": 18700
1375
  },
1376
  {
1377
- "epoch": 0.47,
1378
- "grad_norm": 6.047390460968018,
1379
- "learning_rate": 1.230769230769231e-06,
1380
- "loss": 0.7792,
1381
- "step": 18800
1382
  },
1383
  {
1384
- "epoch": 0.47,
1385
- "grad_norm": 3.341261625289917,
1386
- "learning_rate": 1.1282051282051283e-06,
1387
- "loss": 0.7712,
1388
- "step": 18900
1389
  },
1390
  {
1391
- "epoch": 0.47,
1392
- "grad_norm": 9.690947532653809,
1393
- "learning_rate": 1.0256410256410257e-06,
1394
- "loss": 0.768,
1395
- "step": 19000
1396
  },
1397
  {
1398
- "epoch": 0.48,
1399
- "grad_norm": 2.3877036571502686,
1400
- "learning_rate": 9.230769230769232e-07,
1401
- "loss": 0.786,
1402
- "step": 19100
1403
  },
1404
  {
1405
- "epoch": 0.48,
1406
- "grad_norm": 5.060111045837402,
1407
- "learning_rate": 8.205128205128206e-07,
1408
- "loss": 0.7492,
1409
- "step": 19200
1410
  },
1411
  {
1412
- "epoch": 0.48,
1413
- "grad_norm": 4.0241570472717285,
1414
- "learning_rate": 7.179487179487179e-07,
1415
- "loss": 0.7638,
1416
- "step": 19300
1417
  },
1418
  {
1419
- "epoch": 0.48,
1420
- "grad_norm": 6.047507286071777,
1421
- "learning_rate": 6.153846153846155e-07,
1422
- "loss": 0.7702,
1423
- "step": 19400
1424
  },
1425
  {
1426
- "epoch": 0.49,
1427
- "grad_norm": 4.642309665679932,
1428
- "learning_rate": 5.128205128205128e-07,
1429
- "loss": 0.7541,
1430
- "step": 19500
1431
  },
1432
  {
1433
- "epoch": 0.49,
1434
- "grad_norm": 10.096720695495605,
1435
- "learning_rate": 4.102564102564103e-07,
1436
- "loss": 0.7686,
1437
- "step": 19600
1438
  },
1439
  {
1440
- "epoch": 0.49,
1441
- "grad_norm": 11.970602035522461,
1442
- "learning_rate": 3.0769230769230774e-07,
1443
- "loss": 0.7619,
1444
- "step": 19700
1445
  },
1446
  {
1447
- "epoch": 0.49,
1448
- "grad_norm": 6.973097801208496,
1449
- "learning_rate": 2.0512820512820514e-07,
1450
- "loss": 0.7798,
1451
- "step": 19800
1452
  },
1453
  {
1454
- "epoch": 0.5,
1455
- "grad_norm": 4.512222766876221,
1456
- "learning_rate": 1.0256410256410257e-07,
1457
- "loss": 0.7444,
1458
- "step": 19900
1459
  },
1460
  {
1461
- "epoch": 0.5,
1462
- "grad_norm": 3.21940541267395,
1463
- "learning_rate": 0.0,
1464
- "loss": 0.7902,
1465
- "step": 20000
1466
  },
1467
  {
1468
- "epoch": 0.5,
1469
- "eval_loss": 0.7663924694061279,
1470
- "eval_runtime": 104.2963,
1471
- "eval_samples_per_second": 9.588,
1472
- "eval_steps_per_second": 9.588,
1473
- "step": 20000
 
 
 
 
 
 
 
1474
  }
1475
  ],
1476
- "logging_steps": 100,
1477
- "max_steps": 20000,
1478
  "num_input_tokens_seen": 0,
1479
  "num_train_epochs": 1,
1480
  "save_steps": 2500,
1481
- "total_flos": 3.2204251987968e+17,
1482
  "train_batch_size": 1,
1483
  "trial_name": null,
1484
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8163847923278809,
3
+ "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-2500",
4
+ "epoch": 0.0625,
5
+ "eval_steps": 500,
6
+ "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 3.086414337158203,
14
+ "learning_rate": 4.0000000000000003e-07,
15
+ "loss": 0.7892,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.0,
20
+ "grad_norm": 8.478134155273438,
21
+ "learning_rate": 8.000000000000001e-07,
22
+ "loss": 0.7746,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.0,
27
+ "grad_norm": 5.574502468109131,
28
+ "learning_rate": 1.2000000000000002e-06,
29
+ "loss": 0.8222,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.0,
34
+ "grad_norm": 2.6497371196746826,
35
+ "learning_rate": 1.6000000000000001e-06,
36
+ "loss": 0.7423,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.0,
41
+ "grad_norm": 3.116753339767456,
42
+ "learning_rate": 2.0000000000000003e-06,
43
+ "loss": 0.7622,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.0,
48
+ "grad_norm": 3.179832696914673,
49
+ "learning_rate": 2.4000000000000003e-06,
50
+ "loss": 0.8183,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.0,
55
+ "grad_norm": 3.9869463443756104,
56
+ "learning_rate": 2.8000000000000003e-06,
57
+ "loss": 0.822,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.0,
62
+ "grad_norm": 5.093494415283203,
63
+ "learning_rate": 3.2000000000000003e-06,
64
+ "loss": 0.7966,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.0,
69
+ "grad_norm": 5.230633735656738,
70
+ "learning_rate": 3.6000000000000003e-06,
71
+ "loss": 0.8113,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.0,
76
+ "grad_norm": 9.374403953552246,
77
  "learning_rate": 4.000000000000001e-06,
78
+ "loss": 0.7582,
79
  "step": 100
80
  },
81
+ {
82
+ "epoch": 0.0,
83
+ "grad_norm": 6.465492248535156,
84
+ "learning_rate": 4.4e-06,
85
+ "loss": 0.7662,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.0,
90
+ "grad_norm": 6.279934883117676,
91
+ "learning_rate": 4.800000000000001e-06,
92
+ "loss": 0.8376,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.0,
97
+ "grad_norm": 5.799221992492676,
98
+ "learning_rate": 5.2e-06,
99
+ "loss": 0.7965,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.0,
104
+ "grad_norm": 3.222240686416626,
105
+ "learning_rate": 5.600000000000001e-06,
106
+ "loss": 0.8855,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.0,
111
+ "grad_norm": 9.009174346923828,
112
+ "learning_rate": 6e-06,
113
+ "loss": 0.8394,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.0,
118
+ "grad_norm": 8.040350914001465,
119
+ "learning_rate": 6.4000000000000006e-06,
120
+ "loss": 0.8426,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.0,
125
+ "grad_norm": 4.131030559539795,
126
+ "learning_rate": 6.800000000000001e-06,
127
+ "loss": 0.7747,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.0,
132
+ "grad_norm": 3.31986927986145,
133
+ "learning_rate": 7.2000000000000005e-06,
134
+ "loss": 0.7125,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.0,
139
+ "grad_norm": 5.7623395919799805,
140
+ "learning_rate": 7.600000000000001e-06,
141
+ "loss": 0.7854,
142
+ "step": 190
143
+ },
144
  {
145
  "epoch": 0.01,
146
+ "grad_norm": 10.848206520080566,
147
  "learning_rate": 8.000000000000001e-06,
148
+ "loss": 0.7756,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.01,
153
+ "grad_norm": 13.455166816711426,
154
+ "learning_rate": 8.400000000000001e-06,
155
+ "loss": 0.7894,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.01,
160
+ "grad_norm": 12.759767532348633,
161
+ "learning_rate": 8.8e-06,
162
+ "loss": 0.7454,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.01,
167
+ "grad_norm": 4.262899875640869,
168
+ "learning_rate": 9.200000000000002e-06,
169
+ "loss": 0.8555,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.01,
174
+ "grad_norm": 4.28985071182251,
175
+ "learning_rate": 9.600000000000001e-06,
176
+ "loss": 0.6845,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.01,
181
+ "grad_norm": 4.174241542816162,
182
+ "learning_rate": 1e-05,
183
+ "loss": 0.7983,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.01,
188
+ "grad_norm": 12.931599617004395,
189
+ "learning_rate": 1.04e-05,
190
+ "loss": 0.9041,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.01,
195
+ "grad_norm": 7.004627227783203,
196
+ "learning_rate": 1.0800000000000002e-05,
197
+ "loss": 0.817,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.01,
202
+ "grad_norm": 3.6102757453918457,
203
+ "learning_rate": 1.1200000000000001e-05,
204
+ "loss": 0.7292,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.01,
209
+ "grad_norm": 2.764902353286743,
210
+ "learning_rate": 1.16e-05,
211
+ "loss": 0.9042,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.01,
216
+ "grad_norm": 3.958317995071411,
217
  "learning_rate": 1.2e-05,
218
+ "loss": 0.7539,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.01,
223
+ "grad_norm": 7.098923683166504,
224
+ "learning_rate": 1.2400000000000002e-05,
225
+ "loss": 0.7955,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.01,
230
+ "grad_norm": 12.129098892211914,
231
+ "learning_rate": 1.2800000000000001e-05,
232
+ "loss": 0.849,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.01,
237
+ "grad_norm": 2.054119825363159,
238
+ "learning_rate": 1.3200000000000002e-05,
239
+ "loss": 0.8645,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.01,
244
+ "grad_norm": 5.205028057098389,
245
+ "learning_rate": 1.3600000000000002e-05,
246
+ "loss": 0.8175,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.01,
251
+ "grad_norm": 2.614790439605713,
252
+ "learning_rate": 1.4e-05,
253
+ "loss": 0.8998,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.01,
258
+ "grad_norm": 2.9891204833984375,
259
+ "learning_rate": 1.4400000000000001e-05,
260
+ "loss": 0.8108,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.01,
265
+ "grad_norm": 4.152099609375,
266
+ "learning_rate": 1.48e-05,
267
+ "loss": 0.7855,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.01,
272
+ "grad_norm": 9.833850860595703,
273
+ "learning_rate": 1.5200000000000002e-05,
274
+ "loss": 0.7736,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.01,
279
+ "grad_norm": 3.849621295928955,
280
+ "learning_rate": 1.5600000000000003e-05,
281
+ "loss": 0.7668,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.01,
286
+ "grad_norm": 5.4542975425720215,
287
  "learning_rate": 1.6000000000000003e-05,
288
+ "loss": 0.7781,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.01,
293
+ "grad_norm": 6.197661876678467,
294
+ "learning_rate": 1.64e-05,
295
+ "loss": 0.8654,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.01,
300
+ "grad_norm": 3.2606770992279053,
301
+ "learning_rate": 1.6800000000000002e-05,
302
+ "loss": 0.7565,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.01,
307
+ "grad_norm": 3.9680209159851074,
308
+ "learning_rate": 1.72e-05,
309
+ "loss": 0.7886,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.01,
314
+ "grad_norm": 18.749984741210938,
315
+ "learning_rate": 1.76e-05,
316
+ "loss": 0.7305,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.01,
321
+ "grad_norm": 5.822000503540039,
322
+ "learning_rate": 1.8e-05,
323
+ "loss": 0.7833,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.01,
328
+ "grad_norm": 12.999715805053711,
329
+ "learning_rate": 1.8400000000000003e-05,
330
+ "loss": 0.8483,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.01,
335
+ "grad_norm": 7.193736553192139,
336
+ "learning_rate": 1.88e-05,
337
+ "loss": 0.84,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.01,
342
+ "grad_norm": 12.573124885559082,
343
+ "learning_rate": 1.9200000000000003e-05,
344
+ "loss": 0.8437,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.01,
349
+ "grad_norm": 4.4221601486206055,
350
+ "learning_rate": 1.9600000000000002e-05,
351
+ "loss": 0.6836,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.01,
356
+ "grad_norm": 3.0399410724639893,
357
  "learning_rate": 2e-05,
358
+ "loss": 0.8264,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.01,
363
+ "eval_loss": 0.8175864219665527,
364
+ "eval_runtime": 67.7802,
365
+ "eval_samples_per_second": 14.754,
366
+ "eval_steps_per_second": 14.754,
367
+ "step": 500
368
  },
369
  {
370
+ "epoch": 0.01,
371
+ "grad_norm": 3.971303701400757,
372
+ "learning_rate": 1.9978947368421054e-05,
373
+ "loss": 0.7385,
374
+ "step": 510
375
  },
376
  {
377
+ "epoch": 0.01,
378
+ "grad_norm": 3.8043839931488037,
379
+ "learning_rate": 1.9957894736842107e-05,
380
+ "loss": 0.7826,
381
+ "step": 520
382
  },
383
  {
384
+ "epoch": 0.01,
385
+ "grad_norm": 11.702253341674805,
386
+ "learning_rate": 1.993684210526316e-05,
387
+ "loss": 0.7971,
388
+ "step": 530
389
  },
390
  {
391
+ "epoch": 0.01,
392
+ "grad_norm": 5.176826000213623,
393
+ "learning_rate": 1.9915789473684212e-05,
394
+ "loss": 0.748,
395
+ "step": 540
396
  },
397
  {
398
+ "epoch": 0.01,
399
+ "grad_norm": 7.120133876800537,
400
+ "learning_rate": 1.9894736842105265e-05,
401
+ "loss": 0.8461,
402
+ "step": 550
403
  },
404
  {
405
+ "epoch": 0.01,
406
+ "grad_norm": 12.286151885986328,
407
+ "learning_rate": 1.9873684210526318e-05,
408
+ "loss": 0.8335,
409
+ "step": 560
410
  },
411
  {
412
+ "epoch": 0.01,
413
+ "grad_norm": 7.857172966003418,
414
+ "learning_rate": 1.985263157894737e-05,
415
+ "loss": 0.7231,
416
+ "step": 570
417
  },
418
  {
419
+ "epoch": 0.01,
420
+ "grad_norm": 5.327859401702881,
421
+ "learning_rate": 1.9831578947368423e-05,
422
+ "loss": 0.877,
423
+ "step": 580
424
  },
425
  {
426
+ "epoch": 0.01,
427
+ "grad_norm": 6.9340362548828125,
428
+ "learning_rate": 1.9810526315789476e-05,
429
+ "loss": 0.8984,
430
+ "step": 590
431
  },
432
  {
433
+ "epoch": 0.01,
434
+ "grad_norm": 2.1034326553344727,
435
+ "learning_rate": 1.9789473684210528e-05,
436
+ "loss": 0.7045,
437
+ "step": 600
438
  },
439
  {
440
+ "epoch": 0.02,
441
+ "grad_norm": 3.853721857070923,
442
+ "learning_rate": 1.976842105263158e-05,
443
+ "loss": 0.761,
444
+ "step": 610
445
  },
446
  {
447
+ "epoch": 0.02,
448
+ "grad_norm": 7.6926398277282715,
449
+ "learning_rate": 1.9747368421052633e-05,
450
+ "loss": 0.9493,
451
+ "step": 620
452
  },
453
  {
454
+ "epoch": 0.02,
455
+ "grad_norm": 6.261799335479736,
456
+ "learning_rate": 1.9726315789473686e-05,
457
+ "loss": 0.7719,
458
+ "step": 630
459
  },
460
  {
461
+ "epoch": 0.02,
462
+ "grad_norm": 3.864114284515381,
463
+ "learning_rate": 1.970526315789474e-05,
464
+ "loss": 0.9406,
465
+ "step": 640
466
  },
467
  {
468
+ "epoch": 0.02,
469
+ "grad_norm": 7.093533515930176,
470
+ "learning_rate": 1.968421052631579e-05,
471
+ "loss": 0.7951,
472
+ "step": 650
473
  },
474
  {
475
+ "epoch": 0.02,
476
+ "grad_norm": 2.3724496364593506,
477
+ "learning_rate": 1.9663157894736844e-05,
478
+ "loss": 0.8648,
479
+ "step": 660
480
  },
481
  {
482
+ "epoch": 0.02,
483
+ "grad_norm": 10.12341022491455,
484
+ "learning_rate": 1.9642105263157897e-05,
485
+ "loss": 0.7823,
486
+ "step": 670
487
  },
488
  {
489
+ "epoch": 0.02,
490
+ "grad_norm": 2.80940842628479,
491
+ "learning_rate": 1.962105263157895e-05,
492
+ "loss": 0.706,
493
+ "step": 680
494
  },
495
  {
496
+ "epoch": 0.02,
497
+ "grad_norm": 8.243487358093262,
498
+ "learning_rate": 1.9600000000000002e-05,
499
+ "loss": 0.8244,
500
+ "step": 690
501
  },
502
  {
503
+ "epoch": 0.02,
504
+ "grad_norm": 11.420123100280762,
505
+ "learning_rate": 1.9578947368421055e-05,
506
+ "loss": 0.6753,
507
+ "step": 700
 
508
  },
509
  {
510
+ "epoch": 0.02,
511
+ "grad_norm": 63.8618278503418,
512
+ "learning_rate": 1.9557894736842107e-05,
513
+ "loss": 0.8309,
514
+ "step": 710
515
  },
516
  {
517
+ "epoch": 0.02,
518
+ "grad_norm": 4.521258354187012,
519
+ "learning_rate": 1.953684210526316e-05,
520
+ "loss": 0.8101,
521
+ "step": 720
522
  },
523
  {
524
+ "epoch": 0.02,
525
+ "grad_norm": 2.9532318115234375,
526
+ "learning_rate": 1.9515789473684213e-05,
527
+ "loss": 0.8533,
528
+ "step": 730
529
  },
530
  {
531
+ "epoch": 0.02,
532
+ "grad_norm": 3.792180061340332,
533
+ "learning_rate": 1.9494736842105265e-05,
534
+ "loss": 0.7573,
535
+ "step": 740
536
  },
537
  {
538
+ "epoch": 0.02,
539
+ "grad_norm": 5.155513286590576,
540
+ "learning_rate": 1.9473684210526318e-05,
541
+ "loss": 0.8961,
542
+ "step": 750
543
  },
544
  {
545
+ "epoch": 0.02,
546
+ "grad_norm": 9.195950508117676,
547
+ "learning_rate": 1.945263157894737e-05,
548
+ "loss": 0.8398,
549
+ "step": 760
550
  },
551
  {
552
+ "epoch": 0.02,
553
+ "grad_norm": 6.699478626251221,
554
+ "learning_rate": 1.9431578947368423e-05,
555
+ "loss": 0.8018,
556
+ "step": 770
557
  },
558
  {
559
+ "epoch": 0.02,
560
+ "grad_norm": 5.254507541656494,
561
+ "learning_rate": 1.9410526315789476e-05,
562
+ "loss": 0.8408,
563
+ "step": 780
564
  },
565
  {
566
+ "epoch": 0.02,
567
+ "grad_norm": 4.351966857910156,
568
+ "learning_rate": 1.9389473684210525e-05,
569
+ "loss": 0.7323,
570
+ "step": 790
571
  },
572
  {
573
+ "epoch": 0.02,
574
+ "grad_norm": 2.361276626586914,
575
+ "learning_rate": 1.936842105263158e-05,
576
+ "loss": 0.8401,
577
+ "step": 800
578
  },
579
  {
580
+ "epoch": 0.02,
581
+ "grad_norm": 5.449990272521973,
582
+ "learning_rate": 1.9347368421052634e-05,
583
+ "loss": 0.726,
584
+ "step": 810
585
  },
586
  {
587
+ "epoch": 0.02,
588
+ "grad_norm": 5.375738143920898,
589
+ "learning_rate": 1.9326315789473687e-05,
590
+ "loss": 0.8305,
591
+ "step": 820
592
  },
593
  {
594
+ "epoch": 0.02,
595
+ "grad_norm": 2.601025342941284,
596
+ "learning_rate": 1.930526315789474e-05,
597
+ "loss": 0.9152,
598
+ "step": 830
599
  },
600
  {
601
+ "epoch": 0.02,
602
+ "grad_norm": 12.153268814086914,
603
+ "learning_rate": 1.9284210526315792e-05,
604
+ "loss": 0.8423,
605
+ "step": 840
606
  },
607
  {
608
+ "epoch": 0.02,
609
+ "grad_norm": 3.785663604736328,
610
+ "learning_rate": 1.9263157894736845e-05,
611
+ "loss": 0.7733,
612
+ "step": 850
613
  },
614
  {
615
+ "epoch": 0.02,
616
+ "grad_norm": 10.162787437438965,
617
+ "learning_rate": 1.9242105263157894e-05,
618
+ "loss": 0.893,
619
+ "step": 860
620
  },
621
  {
622
+ "epoch": 0.02,
623
+ "grad_norm": 3.871621608734131,
624
+ "learning_rate": 1.922105263157895e-05,
625
+ "loss": 0.798,
626
+ "step": 870
627
  },
628
  {
629
+ "epoch": 0.02,
630
+ "grad_norm": 2.9919800758361816,
631
+ "learning_rate": 1.9200000000000003e-05,
632
+ "loss": 0.8484,
633
+ "step": 880
634
  },
635
  {
636
+ "epoch": 0.02,
637
+ "grad_norm": 5.40109920501709,
638
+ "learning_rate": 1.9178947368421055e-05,
639
+ "loss": 0.9129,
640
+ "step": 890
641
  },
642
  {
643
+ "epoch": 0.02,
644
+ "grad_norm": 6.794926643371582,
645
+ "learning_rate": 1.9157894736842108e-05,
646
+ "loss": 0.8687,
647
+ "step": 900
648
  },
649
  {
650
+ "epoch": 0.02,
651
+ "grad_norm": 5.942440986633301,
652
+ "learning_rate": 1.913684210526316e-05,
653
+ "loss": 0.8564,
654
+ "step": 910
655
  },
656
  {
657
+ "epoch": 0.02,
658
+ "grad_norm": 5.968307018280029,
659
+ "learning_rate": 1.9115789473684213e-05,
660
+ "loss": 0.8495,
661
+ "step": 920
662
  },
663
  {
664
+ "epoch": 0.02,
665
+ "grad_norm": 8.425616264343262,
666
+ "learning_rate": 1.9094736842105262e-05,
667
+ "loss": 0.7242,
668
+ "step": 930
669
  },
670
  {
671
+ "epoch": 0.02,
672
+ "grad_norm": 2.819301128387451,
673
+ "learning_rate": 1.907368421052632e-05,
674
+ "loss": 0.8381,
675
+ "step": 940
676
  },
677
  {
678
+ "epoch": 0.02,
679
+ "grad_norm": 6.81688117980957,
680
+ "learning_rate": 1.9052631578947368e-05,
681
+ "loss": 0.8817,
682
+ "step": 950
683
  },
684
  {
685
+ "epoch": 0.02,
686
+ "grad_norm": 5.102423191070557,
687
+ "learning_rate": 1.9031578947368424e-05,
688
+ "loss": 0.8274,
689
+ "step": 960
 
690
  },
691
  {
692
+ "epoch": 0.02,
693
+ "grad_norm": 4.12994909286499,
694
+ "learning_rate": 1.9010526315789476e-05,
695
+ "loss": 0.7052,
696
+ "step": 970
697
  },
698
  {
699
+ "epoch": 0.02,
700
+ "grad_norm": 5.15468692779541,
701
+ "learning_rate": 1.898947368421053e-05,
702
+ "loss": 0.772,
703
+ "step": 980
704
  },
705
  {
706
+ "epoch": 0.02,
707
+ "grad_norm": 1.62323796749115,
708
+ "learning_rate": 1.8968421052631582e-05,
709
+ "loss": 0.7764,
710
+ "step": 990
711
  },
712
  {
713
+ "epoch": 0.03,
714
+ "grad_norm": 2.546677589416504,
715
+ "learning_rate": 1.894736842105263e-05,
716
+ "loss": 0.8365,
717
+ "step": 1000
718
  },
719
  {
720
+ "epoch": 0.03,
721
+ "eval_loss": 0.7952949404716492,
722
+ "eval_runtime": 67.7544,
723
+ "eval_samples_per_second": 14.759,
724
+ "eval_steps_per_second": 14.759,
725
+ "step": 1000
726
  },
727
  {
728
+ "epoch": 0.03,
729
+ "grad_norm": 9.28386402130127,
730
+ "learning_rate": 1.8926315789473687e-05,
731
+ "loss": 0.8765,
732
+ "step": 1010
733
  },
734
  {
735
+ "epoch": 0.03,
736
+ "grad_norm": 7.3430304527282715,
737
+ "learning_rate": 1.8905263157894736e-05,
738
+ "loss": 0.8763,
739
+ "step": 1020
740
  },
741
  {
742
+ "epoch": 0.03,
743
+ "grad_norm": 4.0531206130981445,
744
+ "learning_rate": 1.8884210526315792e-05,
745
+ "loss": 0.7943,
746
+ "step": 1030
747
  },
748
  {
749
+ "epoch": 0.03,
750
+ "grad_norm": 3.028320074081421,
751
+ "learning_rate": 1.886315789473684e-05,
752
+ "loss": 0.836,
753
+ "step": 1040
754
  },
755
  {
756
+ "epoch": 0.03,
757
+ "grad_norm": 3.3861188888549805,
758
+ "learning_rate": 1.8842105263157898e-05,
759
+ "loss": 0.7336,
760
+ "step": 1050
761
  },
762
  {
763
+ "epoch": 0.03,
764
+ "grad_norm": 3.7832908630371094,
765
+ "learning_rate": 1.882105263157895e-05,
766
+ "loss": 0.9283,
767
+ "step": 1060
768
  },
769
  {
770
+ "epoch": 0.03,
771
+ "grad_norm": 3.8170342445373535,
772
+ "learning_rate": 1.88e-05,
773
+ "loss": 0.7655,
774
+ "step": 1070
775
  },
776
  {
777
+ "epoch": 0.03,
778
+ "grad_norm": 6.15322732925415,
779
+ "learning_rate": 1.8778947368421056e-05,
780
+ "loss": 0.9341,
781
+ "step": 1080
782
  },
783
  {
784
+ "epoch": 0.03,
785
+ "grad_norm": 7.066686153411865,
786
+ "learning_rate": 1.8757894736842105e-05,
787
+ "loss": 0.85,
788
+ "step": 1090
789
  },
790
  {
791
+ "epoch": 0.03,
792
+ "grad_norm": 2.986961603164673,
793
+ "learning_rate": 1.873684210526316e-05,
794
+ "loss": 0.8943,
795
+ "step": 1100
796
  },
797
  {
798
+ "epoch": 0.03,
799
+ "grad_norm": 2.8456902503967285,
800
+ "learning_rate": 1.871578947368421e-05,
801
+ "loss": 0.8279,
802
+ "step": 1110
803
  },
804
  {
805
+ "epoch": 0.03,
806
+ "grad_norm": 3.6177377700805664,
807
+ "learning_rate": 1.8694736842105266e-05,
808
+ "loss": 0.8192,
809
+ "step": 1120
810
  },
811
  {
812
+ "epoch": 0.03,
813
+ "grad_norm": 14.768010139465332,
814
+ "learning_rate": 1.8673684210526316e-05,
815
+ "loss": 0.8005,
816
+ "step": 1130
817
  },
818
  {
819
+ "epoch": 0.03,
820
+ "grad_norm": 11.347342491149902,
821
+ "learning_rate": 1.8652631578947368e-05,
822
+ "loss": 0.8081,
823
+ "step": 1140
824
  },
825
  {
826
+ "epoch": 0.03,
827
+ "grad_norm": 4.0560150146484375,
828
+ "learning_rate": 1.8631578947368424e-05,
829
+ "loss": 0.9389,
830
+ "step": 1150
831
  },
832
  {
833
+ "epoch": 0.03,
834
+ "grad_norm": 3.3164710998535156,
835
+ "learning_rate": 1.8610526315789473e-05,
836
+ "loss": 0.8501,
837
+ "step": 1160
838
  },
839
  {
840
+ "epoch": 0.03,
841
+ "grad_norm": 11.112225532531738,
842
+ "learning_rate": 1.858947368421053e-05,
843
+ "loss": 0.7162,
844
+ "step": 1170
845
  },
846
  {
847
+ "epoch": 0.03,
848
+ "grad_norm": 6.200588703155518,
849
+ "learning_rate": 1.856842105263158e-05,
850
+ "loss": 0.7448,
851
+ "step": 1180
852
  },
853
  {
854
+ "epoch": 0.03,
855
+ "grad_norm": 6.573482513427734,
856
+ "learning_rate": 1.8547368421052635e-05,
857
+ "loss": 0.8071,
858
+ "step": 1190
859
  },
860
  {
861
+ "epoch": 0.03,
862
+ "grad_norm": 5.153548717498779,
863
+ "learning_rate": 1.8526315789473684e-05,
864
+ "loss": 0.7957,
865
+ "step": 1200
866
  },
867
  {
868
+ "epoch": 0.03,
869
+ "grad_norm": 5.3308305740356445,
870
+ "learning_rate": 1.8505263157894737e-05,
871
+ "loss": 0.7301,
872
+ "step": 1210
 
873
  },
874
  {
875
+ "epoch": 0.03,
876
+ "grad_norm": 5.269808769226074,
877
+ "learning_rate": 1.8484210526315793e-05,
878
+ "loss": 0.8072,
879
+ "step": 1220
880
  },
881
  {
882
+ "epoch": 0.03,
883
+ "grad_norm": 5.588324546813965,
884
+ "learning_rate": 1.8463157894736842e-05,
885
+ "loss": 0.8587,
886
+ "step": 1230
887
  },
888
  {
889
+ "epoch": 0.03,
890
+ "grad_norm": 4.593557357788086,
891
+ "learning_rate": 1.8442105263157898e-05,
892
+ "loss": 0.856,
893
+ "step": 1240
894
  },
895
  {
896
+ "epoch": 0.03,
897
+ "grad_norm": 5.2591094970703125,
898
+ "learning_rate": 1.8421052631578947e-05,
899
+ "loss": 0.7717,
900
+ "step": 1250
901
  },
902
  {
903
+ "epoch": 0.03,
904
+ "grad_norm": 4.052567958831787,
905
+ "learning_rate": 1.8400000000000003e-05,
906
+ "loss": 0.7823,
907
+ "step": 1260
908
  },
909
  {
910
+ "epoch": 0.03,
911
+ "grad_norm": 4.447838306427002,
912
+ "learning_rate": 1.8378947368421053e-05,
913
+ "loss": 0.83,
914
+ "step": 1270
915
  },
916
  {
917
+ "epoch": 0.03,
918
+ "grad_norm": 4.029257774353027,
919
+ "learning_rate": 1.8357894736842105e-05,
920
+ "loss": 0.7504,
921
+ "step": 1280
922
  },
923
  {
924
+ "epoch": 0.03,
925
+ "grad_norm": 9.053960800170898,
926
+ "learning_rate": 1.8336842105263158e-05,
927
+ "loss": 0.9074,
928
+ "step": 1290
929
  },
930
  {
931
+ "epoch": 0.03,
932
+ "grad_norm": 2.2877705097198486,
933
+ "learning_rate": 1.831578947368421e-05,
934
+ "loss": 0.772,
935
+ "step": 1300
936
  },
937
  {
938
+ "epoch": 0.03,
939
+ "grad_norm": 3.4482290744781494,
940
+ "learning_rate": 1.8294736842105267e-05,
941
+ "loss": 0.8658,
942
+ "step": 1310
943
  },
944
  {
945
+ "epoch": 0.03,
946
+ "grad_norm": 6.684794902801514,
947
+ "learning_rate": 1.8273684210526316e-05,
948
+ "loss": 0.7848,
949
+ "step": 1320
950
  },
951
  {
952
+ "epoch": 0.03,
953
+ "grad_norm": 3.553828716278076,
954
+ "learning_rate": 1.8252631578947372e-05,
955
+ "loss": 0.8219,
956
+ "step": 1330
957
  },
958
  {
959
+ "epoch": 0.03,
960
+ "grad_norm": 2.5203397274017334,
961
+ "learning_rate": 1.823157894736842e-05,
962
+ "loss": 0.9071,
963
+ "step": 1340
964
  },
965
  {
966
+ "epoch": 0.03,
967
+ "grad_norm": 4.961795806884766,
968
+ "learning_rate": 1.8210526315789477e-05,
969
+ "loss": 0.6542,
970
+ "step": 1350
971
  },
972
  {
973
+ "epoch": 0.03,
974
+ "grad_norm": 3.663081645965576,
975
+ "learning_rate": 1.8189473684210527e-05,
976
+ "loss": 0.7402,
977
+ "step": 1360
978
  },
979
  {
980
+ "epoch": 0.03,
981
+ "grad_norm": 8.785040855407715,
982
+ "learning_rate": 1.816842105263158e-05,
983
+ "loss": 0.7462,
984
+ "step": 1370
985
  },
986
  {
987
+ "epoch": 0.03,
988
+ "grad_norm": 4.659074783325195,
989
+ "learning_rate": 1.8147368421052632e-05,
990
+ "loss": 0.6951,
991
+ "step": 1380
992
  },
993
  {
994
+ "epoch": 0.03,
995
+ "grad_norm": 3.5885703563690186,
996
+ "learning_rate": 1.8126315789473685e-05,
997
+ "loss": 0.7008,
998
+ "step": 1390
999
  },
1000
  {
1001
+ "epoch": 0.04,
1002
+ "grad_norm": 3.1295347213745117,
1003
+ "learning_rate": 1.810526315789474e-05,
1004
+ "loss": 0.9103,
1005
+ "step": 1400
1006
  },
1007
  {
1008
+ "epoch": 0.04,
1009
+ "grad_norm": 2.4699888229370117,
1010
+ "learning_rate": 1.808421052631579e-05,
1011
+ "loss": 0.841,
1012
+ "step": 1410
1013
  },
1014
  {
1015
+ "epoch": 0.04,
1016
+ "grad_norm": 5.3273444175720215,
1017
+ "learning_rate": 1.8063157894736846e-05,
1018
+ "loss": 0.9041,
1019
+ "step": 1420
1020
  },
1021
  {
1022
+ "epoch": 0.04,
1023
+ "grad_norm": 5.149638652801514,
1024
+ "learning_rate": 1.8042105263157895e-05,
1025
+ "loss": 0.7784,
1026
+ "step": 1430
1027
  },
1028
  {
1029
+ "epoch": 0.04,
1030
+ "grad_norm": 3.4124910831451416,
1031
+ "learning_rate": 1.8021052631578948e-05,
1032
+ "loss": 0.8208,
1033
+ "step": 1440
1034
  },
1035
  {
1036
+ "epoch": 0.04,
1037
+ "grad_norm": 2.9231085777282715,
1038
+ "learning_rate": 1.8e-05,
1039
+ "loss": 0.7173,
1040
+ "step": 1450
1041
  },
1042
  {
1043
+ "epoch": 0.04,
1044
+ "grad_norm": 4.008113384246826,
1045
+ "learning_rate": 1.7978947368421053e-05,
1046
+ "loss": 0.7383,
1047
+ "step": 1460
1048
  },
1049
  {
1050
+ "epoch": 0.04,
1051
+ "grad_norm": 5.1748046875,
1052
+ "learning_rate": 1.795789473684211e-05,
1053
+ "loss": 0.8399,
1054
+ "step": 1470
 
1055
  },
1056
  {
1057
+ "epoch": 0.04,
1058
+ "grad_norm": 3.4990293979644775,
1059
+ "learning_rate": 1.793684210526316e-05,
1060
+ "loss": 0.6721,
1061
+ "step": 1480
1062
  },
1063
  {
1064
+ "epoch": 0.04,
1065
+ "grad_norm": 3.1186299324035645,
1066
+ "learning_rate": 1.7915789473684214e-05,
1067
+ "loss": 0.782,
1068
+ "step": 1490
1069
  },
1070
  {
1071
+ "epoch": 0.04,
1072
+ "grad_norm": 5.12732458114624,
1073
+ "learning_rate": 1.7894736842105264e-05,
1074
+ "loss": 0.7211,
1075
+ "step": 1500
1076
  },
1077
  {
1078
+ "epoch": 0.04,
1079
+ "eval_loss": 0.811568021774292,
1080
+ "eval_runtime": 67.7961,
1081
+ "eval_samples_per_second": 14.75,
1082
+ "eval_steps_per_second": 14.75,
1083
+ "step": 1500
1084
  },
1085
  {
1086
+ "epoch": 0.04,
1087
+ "grad_norm": 3.631096124649048,
1088
+ "learning_rate": 1.7873684210526316e-05,
1089
+ "loss": 0.7557,
1090
+ "step": 1510
1091
  },
1092
  {
1093
+ "epoch": 0.04,
1094
+ "grad_norm": 8.850045204162598,
1095
+ "learning_rate": 1.785263157894737e-05,
1096
+ "loss": 0.8757,
1097
+ "step": 1520
1098
  },
1099
  {
1100
+ "epoch": 0.04,
1101
+ "grad_norm": 3.1114978790283203,
1102
+ "learning_rate": 1.7831578947368422e-05,
1103
+ "loss": 0.7613,
1104
+ "step": 1530
1105
  },
1106
  {
1107
+ "epoch": 0.04,
1108
+ "grad_norm": 4.5038743019104,
1109
+ "learning_rate": 1.7810526315789474e-05,
1110
+ "loss": 0.8049,
1111
+ "step": 1540
1112
  },
1113
  {
1114
+ "epoch": 0.04,
1115
+ "grad_norm": 4.2331156730651855,
1116
+ "learning_rate": 1.7789473684210527e-05,
1117
+ "loss": 0.8277,
1118
+ "step": 1550
1119
  },
1120
  {
1121
+ "epoch": 0.04,
1122
+ "grad_norm": 5.05696964263916,
1123
+ "learning_rate": 1.7768421052631583e-05,
1124
+ "loss": 0.7973,
1125
+ "step": 1560
1126
  },
1127
  {
1128
+ "epoch": 0.04,
1129
+ "grad_norm": 2.1331920623779297,
1130
+ "learning_rate": 1.7747368421052632e-05,
1131
+ "loss": 0.7688,
1132
+ "step": 1570
1133
  },
1134
  {
1135
+ "epoch": 0.04,
1136
+ "grad_norm": 4.984541416168213,
1137
+ "learning_rate": 1.7726315789473685e-05,
1138
+ "loss": 0.7865,
1139
+ "step": 1580
1140
  },
1141
  {
1142
+ "epoch": 0.04,
1143
+ "grad_norm": 7.149406433105469,
1144
+ "learning_rate": 1.7705263157894738e-05,
1145
+ "loss": 0.7728,
1146
+ "step": 1590
1147
  },
1148
  {
1149
+ "epoch": 0.04,
1150
+ "grad_norm": 8.092243194580078,
1151
+ "learning_rate": 1.768421052631579e-05,
1152
+ "loss": 0.935,
1153
+ "step": 1600
1154
  },
1155
  {
1156
+ "epoch": 0.04,
1157
+ "grad_norm": 13.16551399230957,
1158
+ "learning_rate": 1.7663157894736843e-05,
1159
+ "loss": 0.8286,
1160
+ "step": 1610
1161
  },
1162
  {
1163
+ "epoch": 0.04,
1164
+ "grad_norm": 2.131350517272949,
1165
+ "learning_rate": 1.7642105263157896e-05,
1166
+ "loss": 0.7864,
1167
+ "step": 1620
1168
  },
1169
  {
1170
+ "epoch": 0.04,
1171
+ "grad_norm": 7.870023727416992,
1172
+ "learning_rate": 1.7621052631578948e-05,
1173
+ "loss": 0.8645,
1174
+ "step": 1630
1175
  },
1176
  {
1177
+ "epoch": 0.04,
1178
+ "grad_norm": 10.631692886352539,
1179
+ "learning_rate": 1.76e-05,
1180
+ "loss": 0.8473,
1181
+ "step": 1640
1182
  },
1183
  {
1184
+ "epoch": 0.04,
1185
+ "grad_norm": 6.421032905578613,
1186
+ "learning_rate": 1.7578947368421054e-05,
1187
+ "loss": 0.7868,
1188
+ "step": 1650
1189
  },
1190
  {
1191
+ "epoch": 0.04,
1192
+ "grad_norm": 4.57529878616333,
1193
+ "learning_rate": 1.7557894736842106e-05,
1194
+ "loss": 0.7882,
1195
+ "step": 1660
1196
  },
1197
  {
1198
+ "epoch": 0.04,
1199
+ "grad_norm": 3.8785624504089355,
1200
+ "learning_rate": 1.753684210526316e-05,
1201
+ "loss": 0.7543,
1202
+ "step": 1670
1203
  },
1204
  {
1205
+ "epoch": 0.04,
1206
+ "grad_norm": 5.722006320953369,
1207
+ "learning_rate": 1.751578947368421e-05,
1208
+ "loss": 0.9626,
1209
+ "step": 1680
1210
+ },
1211
+ {
1212
+ "epoch": 0.04,
1213
+ "grad_norm": 2.466771364212036,
1214
+ "learning_rate": 1.7494736842105264e-05,
1215
+ "loss": 0.783,
1216
+ "step": 1690
1217
  },
1218
  {
1219
+ "epoch": 0.04,
1220
+ "grad_norm": 3.072049856185913,
1221
+ "learning_rate": 1.7473684210526317e-05,
1222
+ "loss": 0.7503,
1223
+ "step": 1700
1224
  },
1225
  {
1226
+ "epoch": 0.04,
1227
+ "grad_norm": 5.768575668334961,
1228
+ "learning_rate": 1.745263157894737e-05,
1229
+ "loss": 0.8193,
1230
+ "step": 1710
1231
  },
1232
  {
1233
+ "epoch": 0.04,
1234
+ "grad_norm": 2.585022211074829,
1235
+ "learning_rate": 1.7431578947368422e-05,
1236
+ "loss": 0.8808,
1237
+ "step": 1720
1238
  },
1239
  {
1240
+ "epoch": 0.04,
1241
+ "grad_norm": 3.0711567401885986,
1242
+ "learning_rate": 1.7410526315789475e-05,
1243
+ "loss": 0.8098,
1244
+ "step": 1730
 
1245
  },
1246
  {
1247
+ "epoch": 0.04,
1248
+ "grad_norm": 3.3020272254943848,
1249
+ "learning_rate": 1.7389473684210527e-05,
1250
+ "loss": 0.7196,
1251
+ "step": 1740
1252
  },
1253
  {
1254
+ "epoch": 0.04,
1255
+ "grad_norm": 3.645238161087036,
1256
+ "learning_rate": 1.736842105263158e-05,
1257
+ "loss": 0.8904,
1258
+ "step": 1750
1259
  },
1260
  {
1261
+ "epoch": 0.04,
1262
+ "grad_norm": 6.018638610839844,
1263
+ "learning_rate": 1.7347368421052633e-05,
1264
+ "loss": 0.7937,
1265
+ "step": 1760
1266
  },
1267
  {
1268
+ "epoch": 0.04,
1269
+ "grad_norm": 3.629096746444702,
1270
+ "learning_rate": 1.7326315789473685e-05,
1271
+ "loss": 0.9171,
1272
+ "step": 1770
1273
  },
1274
  {
1275
+ "epoch": 0.04,
1276
+ "grad_norm": 2.5619189739227295,
1277
+ "learning_rate": 1.7305263157894738e-05,
1278
+ "loss": 0.9488,
1279
+ "step": 1780
1280
  },
1281
  {
1282
+ "epoch": 0.04,
1283
+ "grad_norm": 9.464752197265625,
1284
+ "learning_rate": 1.728421052631579e-05,
1285
+ "loss": 0.8459,
1286
+ "step": 1790
1287
  },
1288
  {
1289
+ "epoch": 0.04,
1290
+ "grad_norm": 3.9856364727020264,
1291
+ "learning_rate": 1.7263157894736843e-05,
1292
+ "loss": 0.8378,
1293
+ "step": 1800
1294
  },
1295
  {
1296
+ "epoch": 0.05,
1297
+ "grad_norm": 3.753553867340088,
1298
+ "learning_rate": 1.7242105263157896e-05,
1299
+ "loss": 0.8093,
1300
+ "step": 1810
1301
  },
1302
  {
1303
+ "epoch": 0.05,
1304
+ "grad_norm": 3.4593358039855957,
1305
+ "learning_rate": 1.722105263157895e-05,
1306
+ "loss": 0.7896,
1307
+ "step": 1820
1308
  },
1309
  {
1310
+ "epoch": 0.05,
1311
+ "grad_norm": 2.7163546085357666,
1312
+ "learning_rate": 1.72e-05,
1313
+ "loss": 0.7188,
1314
+ "step": 1830
1315
  },
1316
  {
1317
+ "epoch": 0.05,
1318
+ "grad_norm": 3.105628728866577,
1319
+ "learning_rate": 1.7178947368421054e-05,
1320
+ "loss": 0.7643,
1321
+ "step": 1840
1322
  },
1323
  {
1324
+ "epoch": 0.05,
1325
+ "grad_norm": 2.387368679046631,
1326
+ "learning_rate": 1.7157894736842107e-05,
1327
+ "loss": 0.8465,
1328
+ "step": 1850
1329
  },
1330
  {
1331
+ "epoch": 0.05,
1332
+ "grad_norm": 6.020385265350342,
1333
+ "learning_rate": 1.713684210526316e-05,
1334
+ "loss": 0.7798,
1335
+ "step": 1860
1336
  },
1337
  {
1338
+ "epoch": 0.05,
1339
+ "grad_norm": 4.560520172119141,
1340
+ "learning_rate": 1.7115789473684212e-05,
1341
+ "loss": 0.7704,
1342
+ "step": 1870
1343
  },
1344
  {
1345
+ "epoch": 0.05,
1346
+ "grad_norm": 15.739727973937988,
1347
+ "learning_rate": 1.7094736842105265e-05,
1348
+ "loss": 0.7148,
1349
+ "step": 1880
1350
  },
1351
  {
1352
+ "epoch": 0.05,
1353
+ "grad_norm": 5.79690408706665,
1354
+ "learning_rate": 1.7073684210526317e-05,
1355
+ "loss": 0.798,
1356
+ "step": 1890
1357
  },
1358
  {
1359
+ "epoch": 0.05,
1360
+ "grad_norm": 2.6939146518707275,
1361
+ "learning_rate": 1.705263157894737e-05,
1362
+ "loss": 0.7641,
1363
+ "step": 1900
1364
  },
1365
  {
1366
+ "epoch": 0.05,
1367
+ "grad_norm": 5.193384170532227,
1368
+ "learning_rate": 1.7031578947368423e-05,
1369
+ "loss": 0.7866,
1370
+ "step": 1910
1371
  },
1372
  {
1373
+ "epoch": 0.05,
1374
+ "grad_norm": 4.940731525421143,
1375
+ "learning_rate": 1.7010526315789475e-05,
1376
+ "loss": 0.8261,
1377
+ "step": 1920
1378
  },
1379
  {
1380
+ "epoch": 0.05,
1381
+ "grad_norm": 2.1812446117401123,
1382
+ "learning_rate": 1.6989473684210528e-05,
1383
+ "loss": 0.7973,
1384
+ "step": 1930
1385
  },
1386
  {
1387
+ "epoch": 0.05,
1388
+ "grad_norm": 3.7413289546966553,
1389
+ "learning_rate": 1.696842105263158e-05,
1390
+ "loss": 0.7818,
1391
+ "step": 1940
1392
  },
1393
  {
1394
+ "epoch": 0.05,
1395
+ "grad_norm": 4.024014472961426,
1396
+ "learning_rate": 1.6947368421052633e-05,
1397
+ "loss": 0.7237,
1398
+ "step": 1950
1399
  },
1400
  {
1401
+ "epoch": 0.05,
1402
+ "grad_norm": 3.0871291160583496,
1403
+ "learning_rate": 1.6926315789473686e-05,
1404
+ "loss": 0.772,
1405
+ "step": 1960
1406
  },
1407
  {
1408
+ "epoch": 0.05,
1409
+ "grad_norm": 3.28814435005188,
1410
+ "learning_rate": 1.690526315789474e-05,
1411
+ "loss": 0.7067,
1412
+ "step": 1970
1413
  },
1414
  {
1415
+ "epoch": 0.05,
1416
+ "grad_norm": 2.8241286277770996,
1417
+ "learning_rate": 1.688421052631579e-05,
1418
+ "loss": 0.8175,
1419
+ "step": 1980
1420
  },
1421
  {
1422
+ "epoch": 0.05,
1423
+ "grad_norm": 2.5942068099975586,
1424
+ "learning_rate": 1.6863157894736844e-05,
1425
+ "loss": 0.9265,
1426
+ "step": 1990
 
1427
  },
1428
  {
1429
+ "epoch": 0.05,
1430
+ "grad_norm": 6.6822662353515625,
1431
+ "learning_rate": 1.6842105263157896e-05,
1432
+ "loss": 0.8593,
1433
+ "step": 2000
1434
  },
1435
  {
1436
+ "epoch": 0.05,
1437
+ "eval_loss": 0.8064771890640259,
1438
+ "eval_runtime": 67.7887,
1439
+ "eval_samples_per_second": 14.752,
1440
+ "eval_steps_per_second": 14.752,
1441
+ "step": 2000
1442
  },
1443
  {
1444
+ "epoch": 0.05,
1445
+ "grad_norm": 7.032164573669434,
1446
+ "learning_rate": 1.682105263157895e-05,
1447
+ "loss": 0.8819,
1448
+ "step": 2010
1449
  },
1450
  {
1451
+ "epoch": 0.05,
1452
+ "grad_norm": 4.874982833862305,
1453
+ "learning_rate": 1.6800000000000002e-05,
1454
+ "loss": 0.8021,
1455
+ "step": 2020
1456
  },
1457
  {
1458
+ "epoch": 0.05,
1459
+ "grad_norm": 2.6172547340393066,
1460
+ "learning_rate": 1.6778947368421054e-05,
1461
+ "loss": 0.8017,
1462
+ "step": 2030
1463
  },
1464
  {
1465
+ "epoch": 0.05,
1466
+ "grad_norm": 10.659741401672363,
1467
+ "learning_rate": 1.6757894736842107e-05,
1468
+ "loss": 0.8896,
1469
+ "step": 2040
1470
  },
1471
  {
1472
+ "epoch": 0.05,
1473
+ "grad_norm": 6.189141750335693,
1474
+ "learning_rate": 1.673684210526316e-05,
1475
+ "loss": 0.7997,
1476
+ "step": 2050
1477
  },
1478
  {
1479
+ "epoch": 0.05,
1480
+ "grad_norm": 4.523468971252441,
1481
+ "learning_rate": 1.6715789473684212e-05,
1482
+ "loss": 0.8498,
1483
+ "step": 2060
1484
  },
1485
  {
1486
+ "epoch": 0.05,
1487
+ "grad_norm": 8.533658981323242,
1488
+ "learning_rate": 1.6694736842105265e-05,
1489
+ "loss": 0.8857,
1490
+ "step": 2070
1491
  },
1492
  {
1493
+ "epoch": 0.05,
1494
+ "grad_norm": 3.0041606426239014,
1495
+ "learning_rate": 1.6673684210526318e-05,
1496
+ "loss": 0.8112,
1497
+ "step": 2080
1498
  },
1499
  {
1500
+ "epoch": 0.05,
1501
+ "grad_norm": 5.055651664733887,
1502
+ "learning_rate": 1.665263157894737e-05,
1503
+ "loss": 0.7872,
1504
+ "step": 2090
1505
  },
1506
  {
1507
+ "epoch": 0.05,
1508
+ "grad_norm": 5.761922836303711,
1509
+ "learning_rate": 1.6631578947368423e-05,
1510
+ "loss": 0.7727,
1511
+ "step": 2100
1512
  },
1513
  {
1514
+ "epoch": 0.05,
1515
+ "grad_norm": 2.518223524093628,
1516
+ "learning_rate": 1.6610526315789476e-05,
1517
+ "loss": 0.7997,
1518
+ "step": 2110
1519
  },
1520
  {
1521
+ "epoch": 0.05,
1522
+ "grad_norm": 4.975761890411377,
1523
+ "learning_rate": 1.658947368421053e-05,
1524
+ "loss": 0.7457,
1525
+ "step": 2120
1526
  },
1527
  {
1528
+ "epoch": 0.05,
1529
+ "grad_norm": 3.2227561473846436,
1530
+ "learning_rate": 1.656842105263158e-05,
1531
+ "loss": 0.816,
1532
+ "step": 2130
1533
  },
1534
  {
1535
+ "epoch": 0.05,
1536
+ "grad_norm": 4.705923080444336,
1537
+ "learning_rate": 1.6547368421052634e-05,
1538
+ "loss": 0.8113,
1539
+ "step": 2140
1540
  },
1541
  {
1542
+ "epoch": 0.05,
1543
+ "grad_norm": 2.655057430267334,
1544
+ "learning_rate": 1.6526315789473686e-05,
1545
+ "loss": 0.7912,
1546
+ "step": 2150
1547
  },
1548
  {
1549
+ "epoch": 0.05,
1550
+ "grad_norm": 3.0186755657196045,
1551
+ "learning_rate": 1.650526315789474e-05,
1552
+ "loss": 0.8608,
1553
+ "step": 2160
1554
  },
1555
  {
1556
+ "epoch": 0.05,
1557
+ "grad_norm": 1.232386827468872,
1558
+ "learning_rate": 1.648421052631579e-05,
1559
+ "loss": 0.8549,
1560
+ "step": 2170
1561
  },
1562
  {
1563
+ "epoch": 0.05,
1564
+ "grad_norm": 11.968620300292969,
1565
+ "learning_rate": 1.6463157894736844e-05,
1566
+ "loss": 0.868,
1567
+ "step": 2180
1568
  },
1569
  {
1570
+ "epoch": 0.05,
1571
+ "grad_norm": 3.5853216648101807,
1572
+ "learning_rate": 1.6442105263157897e-05,
1573
+ "loss": 0.8388,
1574
+ "step": 2190
1575
  },
1576
  {
1577
+ "epoch": 0.06,
1578
+ "grad_norm": 2.375610589981079,
1579
+ "learning_rate": 1.642105263157895e-05,
1580
+ "loss": 0.9111,
1581
+ "step": 2200
1582
  },
1583
  {
1584
+ "epoch": 0.06,
1585
+ "grad_norm": 1.9734487533569336,
1586
+ "learning_rate": 1.64e-05,
1587
+ "loss": 0.7288,
1588
+ "step": 2210
1589
  },
1590
  {
1591
+ "epoch": 0.06,
1592
+ "grad_norm": 10.517192840576172,
1593
+ "learning_rate": 1.6378947368421055e-05,
1594
+ "loss": 0.698,
1595
+ "step": 2220
1596
  },
1597
  {
1598
+ "epoch": 0.06,
1599
+ "grad_norm": 4.183718204498291,
1600
+ "learning_rate": 1.6357894736842108e-05,
1601
+ "loss": 0.7759,
1602
+ "step": 2230
1603
  },
1604
  {
1605
+ "epoch": 0.06,
1606
+ "grad_norm": 3.9075675010681152,
1607
+ "learning_rate": 1.633684210526316e-05,
1608
+ "loss": 0.7829,
1609
+ "step": 2240
 
1610
  },
1611
  {
1612
+ "epoch": 0.06,
1613
+ "grad_norm": 5.287744998931885,
1614
+ "learning_rate": 1.6315789473684213e-05,
1615
+ "loss": 0.7057,
1616
+ "step": 2250
1617
  },
1618
  {
1619
+ "epoch": 0.06,
1620
+ "grad_norm": 4.977657318115234,
1621
+ "learning_rate": 1.6294736842105265e-05,
1622
+ "loss": 0.8346,
1623
+ "step": 2260
1624
  },
1625
  {
1626
+ "epoch": 0.06,
1627
+ "grad_norm": 7.196689128875732,
1628
+ "learning_rate": 1.6273684210526318e-05,
1629
+ "loss": 0.8508,
1630
+ "step": 2270
1631
  },
1632
  {
1633
+ "epoch": 0.06,
1634
+ "grad_norm": 2.467477798461914,
1635
+ "learning_rate": 1.6252631578947367e-05,
1636
+ "loss": 0.7179,
1637
+ "step": 2280
1638
  },
1639
  {
1640
+ "epoch": 0.06,
1641
+ "grad_norm": 7.059762954711914,
1642
+ "learning_rate": 1.6231578947368423e-05,
1643
+ "loss": 0.7549,
1644
+ "step": 2290
1645
  },
1646
  {
1647
+ "epoch": 0.06,
1648
+ "grad_norm": 3.980865955352783,
1649
+ "learning_rate": 1.6210526315789473e-05,
1650
+ "loss": 0.814,
1651
+ "step": 2300
1652
  },
1653
  {
1654
+ "epoch": 0.06,
1655
+ "grad_norm": 7.675939559936523,
1656
+ "learning_rate": 1.618947368421053e-05,
1657
+ "loss": 0.8227,
1658
+ "step": 2310
1659
  },
1660
  {
1661
+ "epoch": 0.06,
1662
+ "grad_norm": 3.530073642730713,
1663
+ "learning_rate": 1.616842105263158e-05,
1664
+ "loss": 0.8517,
1665
+ "step": 2320
1666
  },
1667
  {
1668
+ "epoch": 0.06,
1669
+ "grad_norm": 3.6851344108581543,
1670
+ "learning_rate": 1.6147368421052634e-05,
1671
+ "loss": 0.7684,
1672
+ "step": 2330
1673
  },
1674
  {
1675
+ "epoch": 0.06,
1676
+ "grad_norm": 5.206923961639404,
1677
+ "learning_rate": 1.6126315789473687e-05,
1678
+ "loss": 0.8199,
1679
+ "step": 2340
1680
  },
1681
  {
1682
+ "epoch": 0.06,
1683
+ "grad_norm": 5.220828056335449,
1684
+ "learning_rate": 1.6105263157894736e-05,
1685
+ "loss": 0.8871,
1686
+ "step": 2350
1687
  },
1688
  {
1689
+ "epoch": 0.06,
1690
+ "grad_norm": 3.5062482357025146,
1691
+ "learning_rate": 1.6084210526315792e-05,
1692
+ "loss": 0.8281,
1693
+ "step": 2360
1694
  },
1695
  {
1696
+ "epoch": 0.06,
1697
+ "grad_norm": 1.9830796718597412,
1698
+ "learning_rate": 1.606315789473684e-05,
1699
+ "loss": 0.8678,
1700
+ "step": 2370
1701
  },
1702
  {
1703
+ "epoch": 0.06,
1704
+ "grad_norm": 3.3255491256713867,
1705
+ "learning_rate": 1.6042105263157897e-05,
1706
+ "loss": 0.8337,
1707
+ "step": 2380
1708
  },
1709
  {
1710
+ "epoch": 0.06,
1711
+ "grad_norm": 5.259572505950928,
1712
+ "learning_rate": 1.6021052631578947e-05,
1713
+ "loss": 0.7954,
1714
+ "step": 2390
1715
  },
1716
  {
1717
+ "epoch": 0.06,
1718
+ "grad_norm": 3.6201376914978027,
1719
+ "learning_rate": 1.6000000000000003e-05,
1720
+ "loss": 0.818,
1721
+ "step": 2400
1722
  },
1723
  {
1724
+ "epoch": 0.06,
1725
+ "grad_norm": 3.3598544597625732,
1726
+ "learning_rate": 1.5978947368421055e-05,
1727
+ "loss": 0.7697,
1728
+ "step": 2410
1729
  },
1730
  {
1731
+ "epoch": 0.06,
1732
+ "grad_norm": 6.34808349609375,
1733
+ "learning_rate": 1.5957894736842105e-05,
1734
+ "loss": 0.6347,
1735
+ "step": 2420
1736
  },
1737
  {
1738
+ "epoch": 0.06,
1739
+ "grad_norm": 3.967682361602783,
1740
+ "learning_rate": 1.593684210526316e-05,
1741
+ "loss": 0.7178,
1742
+ "step": 2430
1743
  },
1744
  {
1745
+ "epoch": 0.06,
1746
+ "grad_norm": 10.222978591918945,
1747
+ "learning_rate": 1.591578947368421e-05,
1748
+ "loss": 0.7642,
1749
+ "step": 2440
1750
  },
1751
  {
1752
+ "epoch": 0.06,
1753
+ "grad_norm": 3.9339826107025146,
1754
+ "learning_rate": 1.5894736842105266e-05,
1755
+ "loss": 0.8197,
1756
+ "step": 2450
1757
  },
1758
  {
1759
+ "epoch": 0.06,
1760
+ "grad_norm": 2.3337771892547607,
1761
+ "learning_rate": 1.5873684210526315e-05,
1762
+ "loss": 0.9375,
1763
+ "step": 2460
1764
  },
1765
  {
1766
+ "epoch": 0.06,
1767
+ "grad_norm": 2.8479838371276855,
1768
+ "learning_rate": 1.585263157894737e-05,
1769
+ "loss": 0.9196,
1770
+ "step": 2470
1771
  },
1772
  {
1773
+ "epoch": 0.06,
1774
+ "grad_norm": 9.294541358947754,
1775
+ "learning_rate": 1.5831578947368424e-05,
1776
+ "loss": 0.7144,
1777
+ "step": 2480
1778
  },
1779
  {
1780
+ "epoch": 0.06,
1781
+ "grad_norm": 5.325323104858398,
1782
+ "learning_rate": 1.5810526315789473e-05,
1783
+ "loss": 0.7897,
1784
+ "step": 2490
1785
  },
1786
  {
1787
+ "epoch": 0.06,
1788
+ "grad_norm": 4.377369403839111,
1789
+ "learning_rate": 1.578947368421053e-05,
1790
+ "loss": 0.9008,
1791
+ "step": 2500
1792
+ },
1793
+ {
1794
+ "epoch": 0.06,
1795
+ "eval_loss": 0.8163847923278809,
1796
+ "eval_runtime": 67.7994,
1797
+ "eval_samples_per_second": 14.749,
1798
+ "eval_steps_per_second": 14.749,
1799
+ "step": 2500
1800
  }
1801
  ],
1802
+ "logging_steps": 10,
1803
+ "max_steps": 10000,
1804
  "num_input_tokens_seen": 0,
1805
  "num_train_epochs": 1,
1806
  "save_steps": 2500,
1807
+ "total_flos": 4.025531498496e+16,
1808
  "train_batch_size": 1,
1809
  "trial_name": null,
1810
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae8864807404348e5714abdca7ecd3f7b499a2f8b4bff1a613654ec5edf69101
3
- size 4984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51ac7424107c168679594d767b2ffefa42eac9e349caa7916abcb7990d9f453e
3
+ size 4920