pkarypis commited on
Commit
5728785
1 Parent(s): 8bc5e6b

Model save

Browse files
README.md CHANGED
@@ -2,16 +2,11 @@
2
  license: apache-2.0
3
  base_model: JackFram/llama-68m
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
- - trl
10
- - sft
11
- - alignment-handbook
12
- - generated_from_trainer
13
  datasets:
14
- - HuggingFaceH4/ultrachat_200k
15
  model-index:
16
  - name: gpt2-sft-port
17
  results: []
@@ -22,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # gpt2-sft-port
24
 
25
- This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on the HuggingFaceH4/ultrachat_200k dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 2.1067
28
 
29
  ## Model description
30
 
@@ -60,8 +55,8 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss |
62
  |:-------------:|:-----:|:----:|:---------------:|
63
- | 2.1213 | 1.0 | 1129 | 2.1273 |
64
- | 2.0929 | 2.0 | 2258 | 2.1067 |
65
 
66
 
67
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: JackFram/llama-68m
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
 
 
8
  datasets:
9
+ - generator
10
  model-index:
11
  - name: gpt2-sft-port
12
  results: []
 
17
 
18
  # gpt2-sft-port
19
 
20
+ This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on the generator dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 2.1059
23
 
24
  ## Model description
25
 
 
55
 
56
  | Training Loss | Epoch | Step | Validation Loss |
57
  |:-------------:|:-----:|:----:|:---------------:|
58
+ | 2.1213 | 1.0 | 1129 | 2.1265 |
59
+ | 2.0928 | 2.0 | 2258 | 2.1059 |
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_loss": 2.106666326522827,
4
- "eval_runtime": 22.5513,
5
- "eval_samples": 23110,
6
- "eval_samples_per_second": 1418.056,
7
- "eval_steps_per_second": 22.172,
8
- "train_loss": 2.1773925889794863,
9
- "train_runtime": 649.8909,
10
  "train_samples": 207865,
11
- "train_samples_per_second": 889.158,
12
- "train_steps_per_second": 3.474
13
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 2.1773926187554116,
4
+ "train_runtime": 648.0178,
 
 
 
 
 
5
  "train_samples": 207865,
6
+ "train_samples_per_second": 891.729,
7
+ "train_steps_per_second": 3.484
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f535408f7c9de8f0a08d48c11b21bd78b4fe7a02bcb033ae0dddbe4412df957
3
  size 136062744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eb940098e0c986e2dc0472474629609901a79e74afca9822d643312a7527757
3
  size 136062744
runs/Apr24_23-11-31_aga39/events.out.tfevents.1714018496.aga39.699328.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ee831862a8536d4a958097f05ad656d16a1efa6f308bf7be11125488ab61218
3
+ size 100878
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 2.1773925889794863,
4
- "train_runtime": 649.8909,
5
  "train_samples": 207865,
6
- "train_samples_per_second": 889.158,
7
- "train_steps_per_second": 3.474
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 2.1773926187554116,
4
+ "train_runtime": 648.0178,
5
  "train_samples": 207865,
6
+ "train_samples_per_second": 891.729,
7
+ "train_steps_per_second": 3.484
8
  }
trainer_state.json CHANGED
@@ -10,3192 +10,3192 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 6.166676968836395,
14
  "learning_rate": 8.849557522123894e-08,
15
  "loss": 3.1048,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0,
20
- "grad_norm": 6.547092311647801,
21
  "learning_rate": 4.4247787610619474e-07,
22
  "loss": 3.1401,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.01,
27
- "grad_norm": 6.696142762723074,
28
  "learning_rate": 8.849557522123895e-07,
29
  "loss": 3.1347,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01,
34
- "grad_norm": 6.564424462036202,
35
  "learning_rate": 1.3274336283185843e-06,
36
  "loss": 3.1075,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.02,
41
- "grad_norm": 6.56340215907664,
42
  "learning_rate": 1.769911504424779e-06,
43
  "loss": 3.1301,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.02,
48
- "grad_norm": 5.858177007035812,
49
  "learning_rate": 2.212389380530974e-06,
50
  "loss": 3.0924,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.03,
55
- "grad_norm": 5.070355541120401,
56
  "learning_rate": 2.6548672566371687e-06,
57
  "loss": 3.0522,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.03,
62
- "grad_norm": 4.8552301265241065,
63
  "learning_rate": 3.097345132743363e-06,
64
  "loss": 2.9879,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.04,
69
- "grad_norm": 3.704784290191808,
70
  "learning_rate": 3.539823008849558e-06,
71
  "loss": 2.9571,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04,
76
- "grad_norm": 3.384795613726869,
77
  "learning_rate": 3.982300884955752e-06,
78
  "loss": 2.8407,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.04,
83
- "grad_norm": 2.7582544847296315,
84
  "learning_rate": 4.424778761061948e-06,
85
  "loss": 2.8395,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.05,
90
- "grad_norm": 2.417714409849194,
91
  "learning_rate": 4.867256637168142e-06,
92
  "loss": 2.7639,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.05,
97
- "grad_norm": 1.9028772367498012,
98
  "learning_rate": 5.309734513274337e-06,
99
- "loss": 2.724,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.06,
104
- "grad_norm": 1.7118899554117333,
105
  "learning_rate": 5.752212389380532e-06,
106
  "loss": 2.6849,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06,
111
- "grad_norm": 1.5287880760894983,
112
  "learning_rate": 6.194690265486726e-06,
113
  "loss": 2.6676,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.07,
118
- "grad_norm": 1.3317826031028697,
119
  "learning_rate": 6.6371681415929215e-06,
120
  "loss": 2.6535,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07,
125
- "grad_norm": 1.2363556687855088,
126
  "learning_rate": 7.079646017699116e-06,
127
  "loss": 2.6052,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.08,
132
- "grad_norm": 1.172363689278875,
133
  "learning_rate": 7.5221238938053095e-06,
134
  "loss": 2.5976,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08,
139
- "grad_norm": 1.0436173018457393,
140
  "learning_rate": 7.964601769911505e-06,
141
  "loss": 2.5714,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08,
146
- "grad_norm": 1.0463179684092647,
147
  "learning_rate": 8.4070796460177e-06,
148
  "loss": 2.5294,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09,
153
- "grad_norm": 1.0027666321118367,
154
  "learning_rate": 8.849557522123895e-06,
155
  "loss": 2.5195,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09,
160
- "grad_norm": 0.9052842619737691,
161
  "learning_rate": 9.29203539823009e-06,
162
  "loss": 2.4895,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.1,
167
- "grad_norm": 0.834205826232326,
168
  "learning_rate": 9.734513274336284e-06,
169
  "loss": 2.4613,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1,
174
- "grad_norm": 0.8277997415459304,
175
  "learning_rate": 1.0176991150442479e-05,
176
  "loss": 2.4672,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.11,
181
- "grad_norm": 0.7759754798637605,
182
  "learning_rate": 1.0619469026548675e-05,
183
  "loss": 2.4404,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.11,
188
- "grad_norm": 0.7125985572317155,
189
  "learning_rate": 1.1061946902654867e-05,
190
  "loss": 2.4359,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.12,
195
- "grad_norm": 0.6316938002742314,
196
  "learning_rate": 1.1504424778761064e-05,
197
  "loss": 2.4162,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.12,
202
- "grad_norm": 0.6326333481727333,
203
  "learning_rate": 1.1946902654867258e-05,
204
  "loss": 2.4092,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.12,
209
- "grad_norm": 0.6153942588692012,
210
  "learning_rate": 1.2389380530973452e-05,
211
  "loss": 2.3838,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.13,
216
- "grad_norm": 0.6211022417404793,
217
  "learning_rate": 1.2831858407079647e-05,
218
  "loss": 2.3835,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.13,
223
- "grad_norm": 0.5291168929482543,
224
  "learning_rate": 1.3274336283185843e-05,
225
  "loss": 2.3808,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.14,
230
- "grad_norm": 0.546151747873949,
231
  "learning_rate": 1.3716814159292036e-05,
232
  "loss": 2.3642,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.14,
237
- "grad_norm": 0.5139700009146303,
238
  "learning_rate": 1.4159292035398232e-05,
239
- "loss": 2.3453,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.15,
244
- "grad_norm": 0.5379650107333647,
245
  "learning_rate": 1.4601769911504426e-05,
246
  "loss": 2.3306,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.15,
251
- "grad_norm": 0.474674376228336,
252
  "learning_rate": 1.5044247787610619e-05,
253
  "loss": 2.3065,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.16,
258
- "grad_norm": 0.5521680095926059,
259
  "learning_rate": 1.5486725663716813e-05,
260
  "loss": 2.3428,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.16,
265
- "grad_norm": 0.5087274289016205,
266
  "learning_rate": 1.592920353982301e-05,
267
  "loss": 2.3521,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.16,
272
- "grad_norm": 0.4613323165494926,
273
  "learning_rate": 1.6371681415929206e-05,
274
  "loss": 2.3497,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.17,
279
- "grad_norm": 0.5212979633254762,
280
  "learning_rate": 1.68141592920354e-05,
281
- "loss": 2.3224,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.17,
286
- "grad_norm": 0.4905873485826262,
287
  "learning_rate": 1.7256637168141594e-05,
288
  "loss": 2.32,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.18,
293
- "grad_norm": 0.458442915659364,
294
  "learning_rate": 1.769911504424779e-05,
295
  "loss": 2.3187,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.18,
300
- "grad_norm": 0.5323359325757816,
301
  "learning_rate": 1.8141592920353983e-05,
302
  "loss": 2.318,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.19,
307
- "grad_norm": 0.44625920204060826,
308
  "learning_rate": 1.858407079646018e-05,
309
  "loss": 2.3148,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.19,
314
- "grad_norm": 0.47295003997186863,
315
  "learning_rate": 1.9026548672566376e-05,
316
  "loss": 2.2872,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.19,
321
- "grad_norm": 0.4492671106545463,
322
  "learning_rate": 1.946902654867257e-05,
323
- "loss": 2.2928,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.2,
328
- "grad_norm": 0.4568010624833796,
329
  "learning_rate": 1.991150442477876e-05,
330
  "loss": 2.2822,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.2,
335
- "grad_norm": 0.4766732778999827,
336
  "learning_rate": 1.9999808776641724e-05,
337
  "loss": 2.2906,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.21,
342
- "grad_norm": 0.4569147400105531,
343
  "learning_rate": 1.999903194428269e-05,
344
  "loss": 2.2843,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.21,
349
- "grad_norm": 0.4592675300479139,
350
  "learning_rate": 1.999765759784862e-05,
351
  "loss": 2.2903,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.22,
356
- "grad_norm": 0.4462533142569423,
357
  "learning_rate": 1.9995685819466593e-05,
358
- "loss": 2.2752,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.22,
363
- "grad_norm": 0.48978869664298863,
364
  "learning_rate": 1.9993116726964554e-05,
365
  "loss": 2.2752,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.23,
370
- "grad_norm": 0.4466960733195815,
371
  "learning_rate": 1.9989950473864254e-05,
372
- "loss": 2.2765,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.23,
377
- "grad_norm": 0.45341714551603735,
378
  "learning_rate": 1.998618724937209e-05,
379
  "loss": 2.2672,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.23,
384
- "grad_norm": 0.4330038108668258,
385
  "learning_rate": 1.9981827278367796e-05,
386
- "loss": 2.2747,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.24,
391
- "grad_norm": 0.446782199165444,
392
  "learning_rate": 1.997687082139099e-05,
393
  "loss": 2.2488,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.24,
398
- "grad_norm": 0.4743235199223858,
399
  "learning_rate": 1.9971318174625633e-05,
400
  "loss": 2.2535,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.25,
405
- "grad_norm": 0.4537297621087457,
406
  "learning_rate": 1.9965169669882293e-05,
407
- "loss": 2.2387,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.25,
412
- "grad_norm": 0.4787071457902011,
413
  "learning_rate": 1.9958425674578364e-05,
414
- "loss": 2.2449,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.26,
419
- "grad_norm": 0.45545831621478816,
420
  "learning_rate": 1.995108659171607e-05,
421
  "loss": 2.2723,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.26,
426
- "grad_norm": 0.42823438881971565,
427
  "learning_rate": 1.9943152859858386e-05,
428
  "loss": 2.2374,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.27,
433
- "grad_norm": 0.4579693482199022,
434
  "learning_rate": 1.9934624953102858e-05,
435
- "loss": 2.2525,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.27,
440
- "grad_norm": 0.49024930805480815,
441
  "learning_rate": 1.9925503381053258e-05,
442
- "loss": 2.2439,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.27,
447
- "grad_norm": 0.45751286328133933,
448
  "learning_rate": 1.9915788688789107e-05,
449
  "loss": 2.2538,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.28,
454
- "grad_norm": 0.5005039680420447,
455
  "learning_rate": 1.990548145683315e-05,
456
  "loss": 2.2404,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.28,
461
- "grad_norm": 0.45150676792989397,
462
  "learning_rate": 1.9894582301116633e-05,
463
  "loss": 2.2373,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.29,
468
- "grad_norm": 0.4486548021238474,
469
  "learning_rate": 1.9883091872942484e-05,
470
- "loss": 2.2357,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.29,
475
- "grad_norm": 0.4747142459548906,
476
  "learning_rate": 1.9871010858946443e-05,
477
- "loss": 2.2351,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.3,
482
- "grad_norm": 0.41924076010195294,
483
  "learning_rate": 1.985833998105598e-05,
484
  "loss": 2.2175,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.3,
489
- "grad_norm": 0.458116233847905,
490
  "learning_rate": 1.984507999644719e-05,
491
- "loss": 2.2245,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.31,
496
- "grad_norm": 0.4602293418537829,
497
  "learning_rate": 1.9831231697499515e-05,
498
  "loss": 2.2288,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.31,
503
- "grad_norm": 0.4540108944188571,
504
  "learning_rate": 1.9816795911748422e-05,
505
- "loss": 2.2503,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.31,
510
- "grad_norm": 0.42466916253285114,
511
  "learning_rate": 1.980177350183594e-05,
512
- "loss": 2.2326,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.32,
517
- "grad_norm": 0.4287613744262243,
518
  "learning_rate": 1.9786165365459102e-05,
519
  "loss": 2.2368,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.32,
524
- "grad_norm": 0.4285356683451498,
525
  "learning_rate": 1.976997243531632e-05,
526
  "loss": 2.2194,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.33,
531
- "grad_norm": 0.442631881258217,
532
  "learning_rate": 1.975319567905163e-05,
533
  "loss": 2.2236,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.33,
538
- "grad_norm": 0.438683408848578,
539
  "learning_rate": 1.9735836099196882e-05,
540
  "loss": 2.2242,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.34,
545
- "grad_norm": 0.4391072060565573,
546
  "learning_rate": 1.971789473311184e-05,
547
  "loss": 2.2149,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.34,
552
- "grad_norm": 0.4470779082761735,
553
  "learning_rate": 1.9699372652922154e-05,
554
- "loss": 2.2342,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.35,
559
- "grad_norm": 0.4335384322435459,
560
  "learning_rate": 1.9680270965455343e-05,
561
  "loss": 2.2334,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.35,
566
- "grad_norm": 0.4536377530873907,
567
  "learning_rate": 1.966059081217461e-05,
568
  "loss": 2.2198,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.35,
573
- "grad_norm": 0.45063797587497534,
574
  "learning_rate": 1.9640333369110662e-05,
575
  "loss": 2.2177,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.36,
580
- "grad_norm": 0.4584379354380471,
581
  "learning_rate": 1.9619499846791426e-05,
582
  "loss": 2.2251,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.36,
587
- "grad_norm": 0.4679749982241818,
588
  "learning_rate": 1.9598091490169696e-05,
589
- "loss": 2.2034,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.37,
594
- "grad_norm": 0.4205822694237699,
595
  "learning_rate": 1.9576109578548757e-05,
596
  "loss": 2.2156,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.37,
601
- "grad_norm": 0.4756983603009965,
602
  "learning_rate": 1.9553555425505933e-05,
603
  "loss": 2.187,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.38,
608
- "grad_norm": 0.4471308636117222,
609
  "learning_rate": 1.953043037881408e-05,
610
  "loss": 2.212,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.38,
615
- "grad_norm": 0.4635628440386862,
616
  "learning_rate": 1.9506735820361065e-05,
617
  "loss": 2.2216,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.39,
622
- "grad_norm": 0.41554796293016955,
623
  "learning_rate": 1.9482473166067177e-05,
624
  "loss": 2.2124,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.39,
629
- "grad_norm": 0.42373518194153276,
630
  "learning_rate": 1.945764386580051e-05,
631
  "loss": 2.2188,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.39,
636
- "grad_norm": 0.4265418403653279,
637
  "learning_rate": 1.9432249403290337e-05,
638
  "loss": 2.2159,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.4,
643
- "grad_norm": 0.42957740678058587,
644
  "learning_rate": 1.940629129603844e-05,
645
  "loss": 2.202,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.4,
650
- "grad_norm": 0.46195902587162346,
651
  "learning_rate": 1.9379771095228426e-05,
652
  "loss": 2.1968,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.41,
657
- "grad_norm": 0.4442009598682672,
658
  "learning_rate": 1.935269038563303e-05,
659
  "loss": 2.1957,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.41,
664
- "grad_norm": 0.4425447401279064,
665
  "learning_rate": 1.9325050785519438e-05,
666
  "loss": 2.2145,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.42,
671
- "grad_norm": 0.49386487755532904,
672
  "learning_rate": 1.9296853946552532e-05,
673
  "loss": 2.1951,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.42,
678
- "grad_norm": 0.45076829333164975,
679
  "learning_rate": 1.9268101553696255e-05,
680
  "loss": 2.2028,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.43,
685
- "grad_norm": 0.4371766420184323,
686
  "learning_rate": 1.9238795325112867e-05,
687
  "loss": 2.1919,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.43,
692
- "grad_norm": 0.4630443680779592,
693
  "learning_rate": 1.9208937012060316e-05,
694
  "loss": 2.1974,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.43,
699
- "grad_norm": 0.43953369063880826,
700
  "learning_rate": 1.9178528398787553e-05,
701
  "loss": 2.2052,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.44,
706
- "grad_norm": 0.43565666642275414,
707
  "learning_rate": 1.9147571302427927e-05,
708
  "loss": 2.1925,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.44,
713
- "grad_norm": 0.43331908792654805,
714
  "learning_rate": 1.9116067572890603e-05,
715
  "loss": 2.1824,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.45,
720
- "grad_norm": 0.45451370123371676,
721
  "learning_rate": 1.9084019092750007e-05,
722
- "loss": 2.1731,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.45,
727
- "grad_norm": 0.4218630388874148,
728
  "learning_rate": 1.9051427777133328e-05,
729
  "loss": 2.1803,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.46,
734
- "grad_norm": 0.4377968795381709,
735
  "learning_rate": 1.901829557360608e-05,
736
  "loss": 2.1803,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.46,
741
- "grad_norm": 0.43436421911900935,
742
  "learning_rate": 1.8984624462055724e-05,
743
  "loss": 2.1779,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.47,
748
- "grad_norm": 0.44522218893836113,
749
  "learning_rate": 1.895041645457335e-05,
750
  "loss": 2.1787,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.47,
755
- "grad_norm": 0.48010307420852916,
756
  "learning_rate": 1.8915673595333443e-05,
757
  "loss": 2.1894,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.47,
762
- "grad_norm": 0.4277317034286137,
763
  "learning_rate": 1.8880397960471724e-05,
764
  "loss": 2.1769,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.48,
769
- "grad_norm": 0.4223729610270656,
770
  "learning_rate": 1.8844591657961083e-05,
771
  "loss": 2.1678,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.48,
776
- "grad_norm": 0.41873513382242905,
777
  "learning_rate": 1.880825682748563e-05,
778
- "loss": 2.1944,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.49,
783
- "grad_norm": 0.4841142269572139,
784
  "learning_rate": 1.877139564031282e-05,
785
  "loss": 2.1905,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.49,
790
- "grad_norm": 0.45042257178916767,
791
  "learning_rate": 1.87340102991637e-05,
792
- "loss": 2.1567,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.5,
797
- "grad_norm": 0.45235847746614205,
798
  "learning_rate": 1.8696103038081297e-05,
799
  "loss": 2.1736,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.5,
804
- "grad_norm": 0.45785393303896005,
805
  "learning_rate": 1.86576761222971e-05,
806
  "loss": 2.1615,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.5,
811
- "grad_norm": 0.45592839482297626,
812
  "learning_rate": 1.8618731848095706e-05,
813
  "loss": 2.1693,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.51,
818
- "grad_norm": 0.4178746899545828,
819
  "learning_rate": 1.8579272542677597e-05,
820
  "loss": 2.1782,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.51,
825
- "grad_norm": 0.43258494539611597,
826
  "learning_rate": 1.853930056402008e-05,
827
- "loss": 2.18,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.52,
832
- "grad_norm": 0.47577360389490636,
833
  "learning_rate": 1.849881830073637e-05,
834
  "loss": 2.1631,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.52,
839
- "grad_norm": 0.4215100329653353,
840
  "learning_rate": 1.845782817193286e-05,
841
  "loss": 2.1567,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.53,
846
- "grad_norm": 0.41666810187919245,
847
  "learning_rate": 1.841633262706456e-05,
848
  "loss": 2.1756,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.53,
853
- "grad_norm": 0.4389131441725863,
854
  "learning_rate": 1.8374334145788723e-05,
855
  "loss": 2.1694,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.54,
860
- "grad_norm": 0.4512797901325279,
861
  "learning_rate": 1.833183523781668e-05,
862
- "loss": 2.188,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.54,
867
- "grad_norm": 0.42481197443972873,
868
  "learning_rate": 1.8288838442763838e-05,
869
  "loss": 2.1538,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.54,
874
- "grad_norm": 0.515190864112531,
875
  "learning_rate": 1.824534632999796e-05,
876
  "loss": 2.1667,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.55,
881
- "grad_norm": 0.4519190062356542,
882
  "learning_rate": 1.820136149848559e-05,
883
  "loss": 2.161,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.55,
888
- "grad_norm": 0.4678758078868846,
889
  "learning_rate": 1.8156886576636758e-05,
890
  "loss": 2.1816,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.56,
895
- "grad_norm": 0.45278881209343136,
896
  "learning_rate": 1.8111924222147927e-05,
897
  "loss": 2.1684,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.56,
902
- "grad_norm": 0.41288167838075435,
903
  "learning_rate": 1.8066477121843163e-05,
904
- "loss": 2.1557,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.57,
909
- "grad_norm": 0.4722866419052222,
910
  "learning_rate": 1.8020547991513583e-05,
911
- "loss": 2.1671,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.57,
916
- "grad_norm": 0.46965706971767573,
917
  "learning_rate": 1.7974139575755055e-05,
918
  "loss": 2.1623,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.58,
923
- "grad_norm": 0.4018088717091893,
924
  "learning_rate": 1.792725464780421e-05,
925
  "loss": 2.1715,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.58,
930
- "grad_norm": 0.42883660357995995,
931
  "learning_rate": 1.7879896009372698e-05,
932
  "loss": 2.1668,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.58,
937
- "grad_norm": 0.4461661785561243,
938
  "learning_rate": 1.7832066490479797e-05,
939
  "loss": 2.1821,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.59,
944
- "grad_norm": 0.4333935279355887,
945
  "learning_rate": 1.7783768949283258e-05,
946
  "loss": 2.1658,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.59,
951
- "grad_norm": 0.4472872976863638,
952
  "learning_rate": 1.773500627190854e-05,
953
  "loss": 2.172,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.6,
958
- "grad_norm": 0.43628627480641025,
959
  "learning_rate": 1.7685781372276338e-05,
960
  "loss": 2.1711,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.6,
965
- "grad_norm": 0.4238861668856612,
966
  "learning_rate": 1.7636097191928437e-05,
967
  "loss": 2.161,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.61,
972
- "grad_norm": 0.4451195799551666,
973
  "learning_rate": 1.758595669985197e-05,
974
  "loss": 2.1715,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.61,
979
- "grad_norm": 0.43764181843895616,
980
  "learning_rate": 1.7535362892301953e-05,
981
  "loss": 2.173,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.62,
986
- "grad_norm": 0.41307028870838386,
987
  "learning_rate": 1.748431879262229e-05,
988
- "loss": 2.167,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.62,
993
- "grad_norm": 0.41449033777088845,
994
  "learning_rate": 1.7432827451065052e-05,
995
  "loss": 2.174,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.62,
1000
- "grad_norm": 0.4623365746349171,
1001
  "learning_rate": 1.7380891944608243e-05,
1002
  "loss": 2.1566,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.63,
1007
- "grad_norm": 0.41161734051829374,
1008
  "learning_rate": 1.732851537677191e-05,
1009
  "loss": 2.1415,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.63,
1014
- "grad_norm": 0.43317843491347463,
1015
  "learning_rate": 1.7275700877432693e-05,
1016
  "loss": 2.1382,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.64,
1021
- "grad_norm": 0.4140928909417553,
1022
  "learning_rate": 1.7222451602636785e-05,
1023
  "loss": 2.1524,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.64,
1028
- "grad_norm": 0.4447518172252142,
1029
  "learning_rate": 1.7168770734411344e-05,
1030
  "loss": 2.1498,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.65,
1035
- "grad_norm": 0.4301039298734055,
1036
  "learning_rate": 1.711466148057433e-05,
1037
  "loss": 2.1498,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.65,
1042
- "grad_norm": 0.4100594877248418,
1043
  "learning_rate": 1.7060127074542847e-05,
1044
  "loss": 2.1407,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.66,
1049
- "grad_norm": 0.44101143872847415,
1050
  "learning_rate": 1.700517077513987e-05,
1051
  "loss": 2.1445,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.66,
1056
- "grad_norm": 0.4554503663906989,
1057
  "learning_rate": 1.6949795866399554e-05,
1058
  "loss": 2.1559,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.66,
1063
- "grad_norm": 0.400815267904289,
1064
  "learning_rate": 1.689400565737098e-05,
1065
  "loss": 2.17,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.67,
1070
- "grad_norm": 0.4518621921666788,
1071
  "learning_rate": 1.6837803481920393e-05,
1072
  "loss": 2.1655,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.67,
1077
- "grad_norm": 0.43391502038484003,
1078
  "learning_rate": 1.6781192698532e-05,
1079
  "loss": 2.1475,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.68,
1084
- "grad_norm": 0.45206448473580985,
1085
  "learning_rate": 1.6724176690107272e-05,
1086
  "loss": 2.1689,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.68,
1091
- "grad_norm": 0.4347796166704268,
1092
  "learning_rate": 1.6666758863762796e-05,
1093
  "loss": 2.1565,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.69,
1098
- "grad_norm": 0.4652753268343201,
1099
  "learning_rate": 1.6608942650626655e-05,
1100
  "loss": 2.1498,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.69,
1105
- "grad_norm": 0.45290611146554866,
1106
  "learning_rate": 1.655073150563343e-05,
1107
  "loss": 2.1371,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.7,
1112
- "grad_norm": 0.4142163428491422,
1113
  "learning_rate": 1.6492128907317696e-05,
1114
- "loss": 2.1449,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.7,
1119
- "grad_norm": 0.43176216957612273,
1120
  "learning_rate": 1.6433138357606198e-05,
1121
  "loss": 2.1386,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.7,
1126
- "grad_norm": 0.46705041752490284,
1127
  "learning_rate": 1.637376338160856e-05,
1128
  "loss": 2.15,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.71,
1133
- "grad_norm": 0.4154132593768116,
1134
  "learning_rate": 1.6314007527406643e-05,
1135
  "loss": 2.1479,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.71,
1140
- "grad_norm": 0.502093736514611,
1141
  "learning_rate": 1.6253874365842518e-05,
1142
  "loss": 2.1617,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.72,
1147
- "grad_norm": 0.399856629462898,
1148
  "learning_rate": 1.619336749030509e-05,
1149
  "loss": 2.1512,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.72,
1154
- "grad_norm": 0.42096181913149455,
1155
  "learning_rate": 1.613249051651535e-05,
1156
  "loss": 2.1392,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.73,
1161
- "grad_norm": 0.4227263518476382,
1162
  "learning_rate": 1.6071247082310337e-05,
1163
  "loss": 2.1453,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.73,
1168
- "grad_norm": 0.4155818226998206,
1169
  "learning_rate": 1.6009640847425726e-05,
1170
  "loss": 2.1523,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.74,
1175
- "grad_norm": 0.41381056713927417,
1176
  "learning_rate": 1.594767549327714e-05,
1177
- "loss": 2.1365,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.74,
1182
- "grad_norm": 0.4122127862061903,
1183
  "learning_rate": 1.588535472274017e-05,
1184
- "loss": 2.1413,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.74,
1189
- "grad_norm": 0.4354517648924038,
1190
  "learning_rate": 1.5822682259929086e-05,
1191
- "loss": 2.1527,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.75,
1196
- "grad_norm": 0.4201697844062031,
1197
  "learning_rate": 1.57596618499743e-05,
1198
- "loss": 2.1338,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.75,
1203
- "grad_norm": 0.4067857342158813,
1204
  "learning_rate": 1.5696297258798573e-05,
1205
- "loss": 2.133,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.76,
1210
- "grad_norm": 0.40089743671647377,
1211
  "learning_rate": 1.5632592272891964e-05,
1212
  "loss": 2.1276,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.76,
1217
- "grad_norm": 0.44038257751723847,
1218
  "learning_rate": 1.5568550699085574e-05,
1219
  "loss": 2.1346,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.77,
1224
- "grad_norm": 0.4151864092587149,
1225
  "learning_rate": 1.550417636432404e-05,
1226
  "loss": 2.1318,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.77,
1231
- "grad_norm": 0.4340239337775399,
1232
  "learning_rate": 1.5439473115436872e-05,
1233
- "loss": 2.1352,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.78,
1238
- "grad_norm": 0.4110957124891134,
1239
  "learning_rate": 1.5374444818908553e-05,
1240
- "loss": 2.1155,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.78,
1245
- "grad_norm": 0.4281757953377905,
1246
  "learning_rate": 1.5309095360647505e-05,
1247
- "loss": 2.1413,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.78,
1252
- "grad_norm": 0.39691328863809533,
1253
  "learning_rate": 1.5243428645753877e-05,
1254
- "loss": 2.138,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.79,
1259
- "grad_norm": 0.4405888273042577,
1260
  "learning_rate": 1.5177448598286182e-05,
1261
  "loss": 2.152,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.79,
1266
- "grad_norm": 0.4368019496984126,
1267
  "learning_rate": 1.5111159161026802e-05,
1268
  "loss": 2.126,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.8,
1273
- "grad_norm": 0.41043912954495615,
1274
  "learning_rate": 1.5044564295246395e-05,
1275
  "loss": 2.1289,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.8,
1280
- "grad_norm": 0.4099907936063609,
1281
  "learning_rate": 1.4977667980467162e-05,
1282
  "loss": 2.1446,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.81,
1287
- "grad_norm": 0.40782209831122196,
1288
  "learning_rate": 1.491047421422505e-05,
1289
  "loss": 2.1455,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.81,
1294
- "grad_norm": 0.42851229092043813,
1295
  "learning_rate": 1.4842987011830871e-05,
1296
  "loss": 2.146,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.81,
1301
- "grad_norm": 0.44536832258247705,
1302
  "learning_rate": 1.4775210406130358e-05,
1303
  "loss": 2.1246,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.82,
1308
- "grad_norm": 0.4038309295898599,
1309
  "learning_rate": 1.4707148447263178e-05,
1310
  "loss": 2.1182,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.82,
1315
- "grad_norm": 0.44833983372752934,
1316
  "learning_rate": 1.4638805202420896e-05,
1317
  "loss": 2.1547,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.83,
1322
- "grad_norm": 0.43884811866616985,
1323
  "learning_rate": 1.4570184755603936e-05,
1324
  "loss": 2.1369,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.83,
1329
- "grad_norm": 0.4349134291579271,
1330
  "learning_rate": 1.4501291207377537e-05,
1331
  "loss": 2.1361,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.84,
1336
- "grad_norm": 0.4558190205274744,
1337
  "learning_rate": 1.4432128674626713e-05,
1338
  "loss": 2.1414,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.84,
1343
- "grad_norm": 0.4303111633287122,
1344
  "learning_rate": 1.4362701290310234e-05,
1345
  "loss": 2.1348,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.85,
1350
- "grad_norm": 0.4273754253029059,
1351
  "learning_rate": 1.4293013203213662e-05,
1352
  "loss": 2.1229,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.85,
1357
- "grad_norm": 0.42314456373695775,
1358
  "learning_rate": 1.422306857770141e-05,
1359
- "loss": 2.1287,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.85,
1364
- "grad_norm": 0.3998554573407628,
1365
  "learning_rate": 1.415287159346793e-05,
1366
  "loss": 2.1325,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.86,
1371
- "grad_norm": 0.4050830474983297,
1372
  "learning_rate": 1.4082426445287904e-05,
1373
  "loss": 2.1254,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.86,
1378
- "grad_norm": 0.41849098746749486,
1379
  "learning_rate": 1.4011737342765604e-05,
1380
- "loss": 2.1436,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.87,
1385
- "grad_norm": 0.4282294364908284,
1386
  "learning_rate": 1.3940808510083321e-05,
1387
  "loss": 2.152,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.87,
1392
- "grad_norm": 0.4478990101450553,
1393
  "learning_rate": 1.3869644185748954e-05,
1394
  "loss": 2.1294,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.88,
1399
- "grad_norm": 0.40046099543325664,
1400
  "learning_rate": 1.3798248622342719e-05,
1401
  "loss": 2.1373,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.88,
1406
- "grad_norm": 0.4212863510735111,
1407
  "learning_rate": 1.3726626086263029e-05,
1408
  "loss": 2.127,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.89,
1413
- "grad_norm": 0.42718872028352106,
1414
  "learning_rate": 1.3654780857471548e-05,
1415
  "loss": 2.1204,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.89,
1420
- "grad_norm": 0.4428343715874287,
1421
  "learning_rate": 1.3582717229237434e-05,
1422
- "loss": 2.1386,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.89,
1427
- "grad_norm": 0.4351947611027273,
1428
  "learning_rate": 1.3510439507880778e-05,
1429
- "loss": 2.1493,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.9,
1434
- "grad_norm": 0.40432078075529426,
1435
  "learning_rate": 1.3437952012515275e-05,
1436
  "loss": 2.1321,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.9,
1441
- "grad_norm": 0.4534127198524785,
1442
  "learning_rate": 1.336525907479013e-05,
1443
- "loss": 2.1362,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.91,
1448
- "grad_norm": 0.4113722084901958,
1449
  "learning_rate": 1.32923650386312e-05,
1450
  "loss": 2.1278,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.91,
1455
- "grad_norm": 0.4161160256044331,
1456
  "learning_rate": 1.321927425998143e-05,
1457
  "loss": 2.1422,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.92,
1462
- "grad_norm": 0.42966295064418314,
1463
  "learning_rate": 1.314599110654053e-05,
1464
- "loss": 2.1298,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.92,
1469
- "grad_norm": 0.4345705926429682,
1470
  "learning_rate": 1.3072519957504e-05,
1471
  "loss": 2.1327,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.93,
1476
- "grad_norm": 0.39710008326638535,
1477
  "learning_rate": 1.2998865203301424e-05,
1478
  "loss": 2.1233,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.93,
1483
- "grad_norm": 0.41618825461238634,
1484
  "learning_rate": 1.2925031245334112e-05,
1485
  "loss": 2.132,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.93,
1490
- "grad_norm": 0.4378715632069394,
1491
  "learning_rate": 1.2851022495712092e-05,
1492
  "loss": 2.1316,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.94,
1497
- "grad_norm": 0.42877369379016117,
1498
  "learning_rate": 1.2776843376990448e-05,
1499
  "loss": 2.1199,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.94,
1504
- "grad_norm": 0.434209770548659,
1505
  "learning_rate": 1.270249832190505e-05,
1506
  "loss": 2.1191,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.95,
1511
- "grad_norm": 0.43198002994305346,
1512
  "learning_rate": 1.2627991773107651e-05,
1513
- "loss": 2.1168,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.95,
1518
- "grad_norm": 0.44034530302111496,
1519
  "learning_rate": 1.2553328182900414e-05,
1520
  "loss": 2.1438,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.96,
1525
- "grad_norm": 0.4483007774275172,
1526
  "learning_rate": 1.2478512012969864e-05,
1527
  "loss": 2.1134,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 0.96,
1532
- "grad_norm": 0.4304461205730933,
1533
  "learning_rate": 1.2403547734120253e-05,
1534
- "loss": 2.1255,
1535
  "step": 1085
1536
  },
1537
  {
1538
  "epoch": 0.97,
1539
- "grad_norm": 0.409879288942517,
1540
  "learning_rate": 1.2328439826006415e-05,
1541
  "loss": 2.1301,
1542
  "step": 1090
1543
  },
1544
  {
1545
  "epoch": 0.97,
1546
- "grad_norm": 0.4042150892470327,
1547
  "learning_rate": 1.2253192776866059e-05,
1548
  "loss": 2.1218,
1549
  "step": 1095
1550
  },
1551
  {
1552
  "epoch": 0.97,
1553
- "grad_norm": 0.3961855111118892,
1554
  "learning_rate": 1.2177811083251572e-05,
1555
- "loss": 2.097,
1556
  "step": 1100
1557
  },
1558
  {
1559
  "epoch": 0.98,
1560
- "grad_norm": 0.42340663380133325,
1561
  "learning_rate": 1.2102299249761315e-05,
1562
  "loss": 2.1216,
1563
  "step": 1105
1564
  },
1565
  {
1566
  "epoch": 0.98,
1567
- "grad_norm": 0.4282749142801059,
1568
  "learning_rate": 1.2026661788770453e-05,
1569
  "loss": 2.1199,
1570
  "step": 1110
1571
  },
1572
  {
1573
  "epoch": 0.99,
1574
- "grad_norm": 0.43581529664733076,
1575
  "learning_rate": 1.1950903220161286e-05,
1576
  "loss": 2.1192,
1577
  "step": 1115
1578
  },
1579
  {
1580
  "epoch": 0.99,
1581
- "grad_norm": 0.4580086749293766,
1582
  "learning_rate": 1.1875028071053165e-05,
1583
  "loss": 2.1384,
1584
  "step": 1120
1585
  },
1586
  {
1587
  "epoch": 1.0,
1588
- "grad_norm": 0.428214022963639,
1589
  "learning_rate": 1.1799040875531975e-05,
1590
  "loss": 2.1213,
1591
  "step": 1125
1592
  },
1593
  {
1594
  "epoch": 1.0,
1595
- "eval_loss": 2.1272776126861572,
1596
- "eval_runtime": 22.1271,
1597
- "eval_samples_per_second": 1445.238,
1598
- "eval_steps_per_second": 22.597,
1599
  "step": 1129
1600
  },
1601
  {
1602
  "epoch": 1.0,
1603
- "grad_norm": 0.4085688194013796,
1604
  "learning_rate": 1.1722946174379168e-05,
1605
  "loss": 2.1371,
1606
  "step": 1130
1607
  },
1608
  {
1609
  "epoch": 1.01,
1610
- "grad_norm": 0.41531851586267077,
1611
  "learning_rate": 1.1646748514800441e-05,
1612
- "loss": 2.1133,
1613
  "step": 1135
1614
  },
1615
  {
1616
  "epoch": 1.01,
1617
- "grad_norm": 0.4293935785008533,
1618
  "learning_rate": 1.1570452450153992e-05,
1619
  "loss": 2.1102,
1620
  "step": 1140
1621
  },
1622
  {
1623
  "epoch": 1.01,
1624
- "grad_norm": 0.41937604617290536,
1625
  "learning_rate": 1.149406253967843e-05,
1626
  "loss": 2.1215,
1627
  "step": 1145
1628
  },
1629
  {
1630
  "epoch": 1.02,
1631
- "grad_norm": 0.41749108177206645,
1632
  "learning_rate": 1.1417583348220322e-05,
1633
  "loss": 2.1132,
1634
  "step": 1150
1635
  },
1636
  {
1637
  "epoch": 1.02,
1638
- "grad_norm": 0.39072764669937704,
1639
  "learning_rate": 1.134101944596143e-05,
1640
  "loss": 2.1223,
1641
  "step": 1155
1642
  },
1643
  {
1644
  "epoch": 1.03,
1645
- "grad_norm": 0.39077965857073454,
1646
  "learning_rate": 1.1264375408145582e-05,
1647
- "loss": 2.1277,
1648
  "step": 1160
1649
  },
1650
  {
1651
  "epoch": 1.03,
1652
- "grad_norm": 0.3953349755989083,
1653
  "learning_rate": 1.118765581480529e-05,
1654
  "loss": 2.1014,
1655
  "step": 1165
1656
  },
1657
  {
1658
  "epoch": 1.04,
1659
- "grad_norm": 0.41490100884622594,
1660
  "learning_rate": 1.1110865250488047e-05,
1661
  "loss": 2.1277,
1662
  "step": 1170
1663
  },
1664
  {
1665
  "epoch": 1.04,
1666
- "grad_norm": 0.419096760493875,
1667
  "learning_rate": 1.1034008303982373e-05,
1668
  "loss": 2.124,
1669
  "step": 1175
1670
  },
1671
  {
1672
  "epoch": 1.05,
1673
- "grad_norm": 0.41311331133015966,
1674
  "learning_rate": 1.0957089568043607e-05,
1675
  "loss": 2.1107,
1676
  "step": 1180
1677
  },
1678
  {
1679
  "epoch": 1.05,
1680
- "grad_norm": 0.41483369522970326,
1681
  "learning_rate": 1.088011363911944e-05,
1682
  "loss": 2.1053,
1683
  "step": 1185
1684
  },
1685
  {
1686
  "epoch": 1.05,
1687
- "grad_norm": 0.42625693021658856,
1688
  "learning_rate": 1.080308511707527e-05,
1689
- "loss": 2.1179,
1690
  "step": 1190
1691
  },
1692
  {
1693
  "epoch": 1.06,
1694
- "grad_norm": 0.4161060548680695,
1695
  "learning_rate": 1.0726008604919296e-05,
1696
  "loss": 2.1071,
1697
  "step": 1195
1698
  },
1699
  {
1700
  "epoch": 1.06,
1701
- "grad_norm": 0.41461042839461326,
1702
  "learning_rate": 1.0648888708527481e-05,
1703
  "loss": 2.1217,
1704
  "step": 1200
1705
  },
1706
  {
1707
  "epoch": 1.07,
1708
- "grad_norm": 0.4078868808684218,
1709
  "learning_rate": 1.0571730036368308e-05,
1710
- "loss": 2.1057,
1711
  "step": 1205
1712
  },
1713
  {
1714
  "epoch": 1.07,
1715
- "grad_norm": 0.41839762680310305,
1716
  "learning_rate": 1.0494537199227393e-05,
1717
  "loss": 2.1029,
1718
  "step": 1210
1719
  },
1720
  {
1721
  "epoch": 1.08,
1722
- "grad_norm": 0.39748160366223706,
1723
  "learning_rate": 1.0417314809931945e-05,
1724
  "loss": 2.1044,
1725
  "step": 1215
1726
  },
1727
  {
1728
  "epoch": 1.08,
1729
- "grad_norm": 0.43680744318110537,
1730
  "learning_rate": 1.0340067483075135e-05,
1731
  "loss": 2.0963,
1732
  "step": 1220
1733
  },
1734
  {
1735
  "epoch": 1.09,
1736
- "grad_norm": 0.4135383344906544,
1737
  "learning_rate": 1.0262799834740334e-05,
1738
- "loss": 2.1291,
1739
  "step": 1225
1740
  },
1741
  {
1742
  "epoch": 1.09,
1743
- "grad_norm": 0.40832519211963375,
1744
  "learning_rate": 1.0185516482225264e-05,
1745
  "loss": 2.1263,
1746
  "step": 1230
1747
  },
1748
  {
1749
  "epoch": 1.09,
1750
- "grad_norm": 0.41205256370674664,
1751
  "learning_rate": 1.0108222043766087e-05,
1752
  "loss": 2.12,
1753
  "step": 1235
1754
  },
1755
  {
1756
  "epoch": 1.1,
1757
- "grad_norm": 0.40194430216956495,
1758
  "learning_rate": 1.0030921138261422e-05,
1759
  "loss": 2.0842,
1760
  "step": 1240
1761
  },
1762
  {
1763
  "epoch": 1.1,
1764
- "grad_norm": 0.4037430427573228,
1765
  "learning_rate": 9.953618384996353e-06,
1766
- "loss": 2.1133,
1767
  "step": 1245
1768
  },
1769
  {
1770
  "epoch": 1.11,
1771
- "grad_norm": 0.40682567785758733,
1772
  "learning_rate": 9.876318403366371e-06,
1773
  "loss": 2.0966,
1774
  "step": 1250
1775
  },
1776
  {
1777
  "epoch": 1.11,
1778
- "grad_norm": 0.40152408183966254,
1779
  "learning_rate": 9.79902581260135e-06,
1780
  "loss": 2.1092,
1781
  "step": 1255
1782
  },
1783
  {
1784
  "epoch": 1.12,
1785
- "grad_norm": 0.4064003924795871,
1786
  "learning_rate": 9.721745231489499e-06,
1787
  "loss": 2.1149,
1788
  "step": 1260
1789
  },
1790
  {
1791
  "epoch": 1.12,
1792
- "grad_norm": 0.40271410084160175,
1793
  "learning_rate": 9.644481278101366e-06,
1794
- "loss": 2.1309,
1795
  "step": 1265
1796
  },
1797
  {
1798
  "epoch": 1.12,
1799
- "grad_norm": 0.41422324108295044,
1800
  "learning_rate": 9.567238569513872e-06,
1801
  "loss": 2.1018,
1802
  "step": 1270
1803
  },
1804
  {
1805
  "epoch": 1.13,
1806
- "grad_norm": 0.4222329200835926,
1807
  "learning_rate": 9.49002172153442e-06,
1808
  "loss": 2.1129,
1809
  "step": 1275
1810
  },
1811
  {
1812
  "epoch": 1.13,
1813
- "grad_norm": 0.4423380953300777,
1814
  "learning_rate": 9.412835348425038e-06,
1815
  "loss": 2.1136,
1816
  "step": 1280
1817
  },
1818
  {
1819
  "epoch": 1.14,
1820
- "grad_norm": 0.4365705241590506,
1821
  "learning_rate": 9.335684062626669e-06,
1822
  "loss": 2.112,
1823
  "step": 1285
1824
  },
1825
  {
1826
  "epoch": 1.14,
1827
- "grad_norm": 0.41635126619610124,
1828
  "learning_rate": 9.25857247448354e-06,
1829
  "loss": 2.1002,
1830
  "step": 1290
1831
  },
1832
  {
1833
  "epoch": 1.15,
1834
- "grad_norm": 0.42176680465683547,
1835
  "learning_rate": 9.181505191967656e-06,
1836
  "loss": 2.1032,
1837
  "step": 1295
1838
  },
1839
  {
1840
  "epoch": 1.15,
1841
- "grad_norm": 0.40598859135963455,
1842
  "learning_rate": 9.104486820403438e-06,
1843
- "loss": 2.1136,
1844
  "step": 1300
1845
  },
1846
  {
1847
  "epoch": 1.16,
1848
- "grad_norm": 0.39125975910719296,
1849
  "learning_rate": 9.027521962192532e-06,
1850
  "loss": 2.1151,
1851
  "step": 1305
1852
  },
1853
  {
1854
  "epoch": 1.16,
1855
- "grad_norm": 0.4377962326285569,
1856
  "learning_rate": 8.950615216538765e-06,
1857
  "loss": 2.1281,
1858
  "step": 1310
1859
  },
1860
  {
1861
  "epoch": 1.16,
1862
- "grad_norm": 0.43492048606926564,
1863
  "learning_rate": 8.873771179173339e-06,
1864
  "loss": 2.1031,
1865
  "step": 1315
1866
  },
1867
  {
1868
  "epoch": 1.17,
1869
- "grad_norm": 0.4201455471732833,
1870
  "learning_rate": 8.796994442080167e-06,
1871
  "loss": 2.1161,
1872
  "step": 1320
1873
  },
1874
  {
1875
  "epoch": 1.17,
1876
- "grad_norm": 0.4172474909693453,
1877
  "learning_rate": 8.720289593221502e-06,
1878
  "loss": 2.1147,
1879
  "step": 1325
1880
  },
1881
  {
1882
  "epoch": 1.18,
1883
- "grad_norm": 0.4407123382776677,
1884
  "learning_rate": 8.643661216263744e-06,
1885
  "loss": 2.0824,
1886
  "step": 1330
1887
  },
1888
  {
1889
  "epoch": 1.18,
1890
- "grad_norm": 0.3947779944095814,
1891
  "learning_rate": 8.567113890303554e-06,
1892
  "loss": 2.1216,
1893
  "step": 1335
1894
  },
1895
  {
1896
  "epoch": 1.19,
1897
- "grad_norm": 0.4421135799294026,
1898
  "learning_rate": 8.490652189594212e-06,
1899
  "loss": 2.1071,
1900
  "step": 1340
1901
  },
1902
  {
1903
  "epoch": 1.19,
1904
- "grad_norm": 0.40075314834170295,
1905
  "learning_rate": 8.414280683272273e-06,
1906
  "loss": 2.1239,
1907
  "step": 1345
1908
  },
1909
  {
1910
  "epoch": 1.2,
1911
- "grad_norm": 0.3949908313192302,
1912
  "learning_rate": 8.338003935084531e-06,
1913
  "loss": 2.1129,
1914
  "step": 1350
1915
  },
1916
  {
1917
  "epoch": 1.2,
1918
- "grad_norm": 0.42804723778290726,
1919
  "learning_rate": 8.2618265031153e-06,
1920
- "loss": 2.1125,
1921
  "step": 1355
1922
  },
1923
  {
1924
  "epoch": 1.2,
1925
- "grad_norm": 0.395763409721863,
1926
  "learning_rate": 8.185752939514026e-06,
1927
  "loss": 2.0961,
1928
  "step": 1360
1929
  },
1930
  {
1931
  "epoch": 1.21,
1932
- "grad_norm": 0.4060318677339845,
1933
  "learning_rate": 8.109787790223285e-06,
1934
  "loss": 2.0981,
1935
  "step": 1365
1936
  },
1937
  {
1938
  "epoch": 1.21,
1939
- "grad_norm": 0.4559920210627191,
1940
  "learning_rate": 8.033935594707116e-06,
1941
- "loss": 2.1023,
1942
  "step": 1370
1943
  },
1944
  {
1945
  "epoch": 1.22,
1946
- "grad_norm": 0.42047036555597217,
1947
  "learning_rate": 7.958200885679752e-06,
1948
  "loss": 2.1216,
1949
  "step": 1375
1950
  },
1951
  {
1952
  "epoch": 1.22,
1953
- "grad_norm": 0.4047916642286104,
1954
  "learning_rate": 7.88258818883477e-06,
1955
  "loss": 2.1316,
1956
  "step": 1380
1957
  },
1958
  {
1959
  "epoch": 1.23,
1960
- "grad_norm": 0.40584607674840606,
1961
  "learning_rate": 7.807102022574631e-06,
1962
  "loss": 2.0998,
1963
  "step": 1385
1964
  },
1965
  {
1966
  "epoch": 1.23,
1967
- "grad_norm": 0.4087195630497667,
1968
  "learning_rate": 7.7317468977407e-06,
1969
- "loss": 2.1044,
1970
  "step": 1390
1971
  },
1972
  {
1973
  "epoch": 1.24,
1974
- "grad_norm": 0.41488817076545803,
1975
  "learning_rate": 7.65652731734366e-06,
1976
  "loss": 2.0905,
1977
  "step": 1395
1978
  },
1979
  {
1980
  "epoch": 1.24,
1981
- "grad_norm": 0.4159089183998814,
1982
  "learning_rate": 7.5814477762944435e-06,
1983
  "loss": 2.1261,
1984
  "step": 1400
1985
  },
1986
  {
1987
  "epoch": 1.24,
1988
- "grad_norm": 0.41601071843232423,
1989
  "learning_rate": 7.506512761135627e-06,
1990
  "loss": 2.104,
1991
  "step": 1405
1992
  },
1993
  {
1994
  "epoch": 1.25,
1995
- "grad_norm": 0.40388849499531787,
1996
  "learning_rate": 7.431726749773322e-06,
1997
  "loss": 2.0818,
1998
  "step": 1410
1999
  },
2000
  {
2001
  "epoch": 1.25,
2002
- "grad_norm": 0.39440012759705656,
2003
  "learning_rate": 7.3570942112095955e-06,
2004
  "loss": 2.1178,
2005
  "step": 1415
2006
  },
2007
  {
2008
  "epoch": 1.26,
2009
- "grad_norm": 0.41960376561614865,
2010
  "learning_rate": 7.282619605275409e-06,
2011
  "loss": 2.1073,
2012
  "step": 1420
2013
  },
2014
  {
2015
  "epoch": 1.26,
2016
- "grad_norm": 0.4238900620477477,
2017
  "learning_rate": 7.208307382364111e-06,
2018
  "loss": 2.0965,
2019
  "step": 1425
2020
  },
2021
  {
2022
  "epoch": 1.27,
2023
- "grad_norm": 0.41891497784045856,
2024
  "learning_rate": 7.134161983165498e-06,
2025
- "loss": 2.1126,
2026
  "step": 1430
2027
  },
2028
  {
2029
  "epoch": 1.27,
2030
- "grad_norm": 0.3998004661957372,
2031
  "learning_rate": 7.060187838400451e-06,
2032
  "loss": 2.1136,
2033
  "step": 1435
2034
  },
2035
  {
2036
  "epoch": 1.28,
2037
- "grad_norm": 0.42065942628940173,
2038
  "learning_rate": 6.986389368556168e-06,
2039
- "loss": 2.1076,
2040
  "step": 1440
2041
  },
2042
  {
2043
  "epoch": 1.28,
2044
- "grad_norm": 0.4038101105905165,
2045
  "learning_rate": 6.912770983622008e-06,
2046
  "loss": 2.1171,
2047
  "step": 1445
2048
  },
2049
  {
2050
  "epoch": 1.28,
2051
- "grad_norm": 0.40623073885338706,
2052
  "learning_rate": 6.839337082825954e-06,
2053
  "loss": 2.1166,
2054
  "step": 1450
2055
  },
2056
  {
2057
  "epoch": 1.29,
2058
- "grad_norm": 0.40889404318282746,
2059
  "learning_rate": 6.766092054371744e-06,
2060
  "loss": 2.1066,
2061
  "step": 1455
2062
  },
2063
  {
2064
  "epoch": 1.29,
2065
- "grad_norm": 0.42640688726553155,
2066
  "learning_rate": 6.693040275176623e-06,
2067
  "loss": 2.1284,
2068
  "step": 1460
2069
  },
2070
  {
2071
  "epoch": 1.3,
2072
- "grad_norm": 0.41083600428874123,
2073
  "learning_rate": 6.62018611060982e-06,
2074
- "loss": 2.1022,
2075
  "step": 1465
2076
  },
2077
  {
2078
  "epoch": 1.3,
2079
- "grad_norm": 0.4021136568811981,
2080
  "learning_rate": 6.547533914231654e-06,
2081
  "loss": 2.1095,
2082
  "step": 1470
2083
  },
2084
  {
2085
  "epoch": 1.31,
2086
- "grad_norm": 0.42074206929266916,
2087
  "learning_rate": 6.475088027533399e-06,
2088
  "loss": 2.1251,
2089
  "step": 1475
2090
  },
2091
  {
2092
  "epoch": 1.31,
2093
- "grad_norm": 0.4154754488835367,
2094
  "learning_rate": 6.40285277967784e-06,
2095
  "loss": 2.1165,
2096
  "step": 1480
2097
  },
2098
  {
2099
  "epoch": 1.32,
2100
- "grad_norm": 0.42426069316839393,
2101
  "learning_rate": 6.330832487240573e-06,
2102
  "loss": 2.0901,
2103
  "step": 1485
2104
  },
2105
  {
2106
  "epoch": 1.32,
2107
- "grad_norm": 0.4084328363742136,
2108
  "learning_rate": 6.2590314539520695e-06,
2109
  "loss": 2.1171,
2110
  "step": 1490
2111
  },
2112
  {
2113
  "epoch": 1.32,
2114
- "grad_norm": 0.41238954670377825,
2115
  "learning_rate": 6.187453970440484e-06,
2116
- "loss": 2.0994,
2117
  "step": 1495
2118
  },
2119
  {
2120
  "epoch": 1.33,
2121
- "grad_norm": 0.40817907104649587,
2122
  "learning_rate": 6.116104313975267e-06,
2123
  "loss": 2.1215,
2124
  "step": 1500
2125
  },
2126
  {
2127
  "epoch": 1.33,
2128
- "grad_norm": 0.40313155498679465,
2129
  "learning_rate": 6.044986748211556e-06,
2130
  "loss": 2.0906,
2131
  "step": 1505
2132
  },
2133
  {
2134
  "epoch": 1.34,
2135
- "grad_norm": 0.3909990052489633,
2136
  "learning_rate": 5.974105522935416e-06,
2137
  "loss": 2.1004,
2138
  "step": 1510
2139
  },
2140
  {
2141
  "epoch": 1.34,
2142
- "grad_norm": 0.39362860587726756,
2143
  "learning_rate": 5.903464873809854e-06,
2144
  "loss": 2.0894,
2145
  "step": 1515
2146
  },
2147
  {
2148
  "epoch": 1.35,
2149
- "grad_norm": 0.4615926116745974,
2150
  "learning_rate": 5.833069022121727e-06,
2151
  "loss": 2.1079,
2152
  "step": 1520
2153
  },
2154
  {
2155
  "epoch": 1.35,
2156
- "grad_norm": 0.4010761147634409,
2157
  "learning_rate": 5.762922174529482e-06,
2158
  "loss": 2.0802,
2159
  "step": 1525
2160
  },
2161
  {
2162
  "epoch": 1.36,
2163
- "grad_norm": 0.4264470129906522,
2164
  "learning_rate": 5.693028522811783e-06,
2165
  "loss": 2.0999,
2166
  "step": 1530
2167
  },
2168
  {
2169
  "epoch": 1.36,
2170
- "grad_norm": 0.4133985640334462,
2171
  "learning_rate": 5.6233922436170205e-06,
2172
  "loss": 2.0991,
2173
  "step": 1535
2174
  },
2175
  {
2176
  "epoch": 1.36,
2177
- "grad_norm": 0.420524380989124,
2178
  "learning_rate": 5.5540174982137185e-06,
2179
  "loss": 2.1105,
2180
  "step": 1540
2181
  },
2182
  {
2183
  "epoch": 1.37,
2184
- "grad_norm": 0.4100354369474598,
2185
  "learning_rate": 5.484908432241889e-06,
2186
- "loss": 2.1046,
2187
  "step": 1545
2188
  },
2189
  {
2190
  "epoch": 1.37,
2191
- "grad_norm": 0.39927144918184515,
2192
  "learning_rate": 5.416069175465274e-06,
2193
- "loss": 2.1033,
2194
  "step": 1550
2195
  },
2196
  {
2197
  "epoch": 1.38,
2198
- "grad_norm": 0.40701326109433633,
2199
  "learning_rate": 5.347503841524582e-06,
2200
  "loss": 2.1235,
2201
  "step": 1555
2202
  },
2203
  {
2204
  "epoch": 1.38,
2205
- "grad_norm": 0.41789807073609253,
2206
  "learning_rate": 5.279216527691657e-06,
2207
  "loss": 2.1116,
2208
  "step": 1560
2209
  },
2210
  {
2211
  "epoch": 1.39,
2212
- "grad_norm": 0.41741283298931336,
2213
  "learning_rate": 5.211211314624653e-06,
2214
  "loss": 2.1007,
2215
  "step": 1565
2216
  },
2217
  {
2218
  "epoch": 1.39,
2219
- "grad_norm": 0.41997249537382586,
2220
  "learning_rate": 5.143492266124164e-06,
2221
- "loss": 2.1085,
2222
  "step": 1570
2223
  },
2224
  {
2225
  "epoch": 1.4,
2226
- "grad_norm": 0.3942044655278752,
2227
  "learning_rate": 5.076063428890393e-06,
2228
  "loss": 2.1039,
2229
  "step": 1575
2230
  },
2231
  {
2232
  "epoch": 1.4,
2233
- "grad_norm": 0.4224932468107878,
2234
  "learning_rate": 5.008928832281339e-06,
2235
  "loss": 2.095,
2236
  "step": 1580
2237
  },
2238
  {
2239
  "epoch": 1.4,
2240
- "grad_norm": 0.4203107068713221,
2241
  "learning_rate": 4.942092488072e-06,
2242
- "loss": 2.1066,
2243
  "step": 1585
2244
  },
2245
  {
2246
  "epoch": 1.41,
2247
- "grad_norm": 0.411259991404189,
2248
  "learning_rate": 4.875558390214652e-06,
2249
  "loss": 2.0944,
2250
  "step": 1590
2251
  },
2252
  {
2253
  "epoch": 1.41,
2254
- "grad_norm": 0.4184136992586782,
2255
  "learning_rate": 4.8093305146001815e-06,
2256
  "loss": 2.0941,
2257
  "step": 1595
2258
  },
2259
  {
2260
  "epoch": 1.42,
2261
- "grad_norm": 0.4173687843747853,
2262
  "learning_rate": 4.743412818820488e-06,
2263
  "loss": 2.1052,
2264
  "step": 1600
2265
  },
2266
  {
2267
  "epoch": 1.42,
2268
- "grad_norm": 0.398835059698419,
2269
  "learning_rate": 4.677809241931994e-06,
2270
  "loss": 2.1039,
2271
  "step": 1605
2272
  },
2273
  {
2274
  "epoch": 1.43,
2275
- "grad_norm": 0.4047780904385868,
2276
  "learning_rate": 4.612523704220264e-06,
2277
  "loss": 2.1022,
2278
  "step": 1610
2279
  },
2280
  {
2281
  "epoch": 1.43,
2282
- "grad_norm": 0.4404843855988767,
2283
  "learning_rate": 4.5475601069657304e-06,
2284
  "loss": 2.1029,
2285
  "step": 1615
2286
  },
2287
  {
2288
  "epoch": 1.43,
2289
- "grad_norm": 0.460917367376776,
2290
  "learning_rate": 4.482922332210569e-06,
2291
  "loss": 2.0943,
2292
  "step": 1620
2293
  },
2294
  {
2295
  "epoch": 1.44,
2296
- "grad_norm": 0.45465122948799785,
2297
  "learning_rate": 4.418614242526717e-06,
2298
  "loss": 2.0889,
2299
  "step": 1625
2300
  },
2301
  {
2302
  "epoch": 1.44,
2303
- "grad_norm": 0.3929544786357408,
2304
  "learning_rate": 4.354639680785059e-06,
2305
  "loss": 2.1044,
2306
  "step": 1630
2307
  },
2308
  {
2309
  "epoch": 1.45,
2310
- "grad_norm": 0.3981800262798204,
2311
  "learning_rate": 4.291002469925782e-06,
2312
  "loss": 2.1184,
2313
  "step": 1635
2314
  },
2315
  {
2316
  "epoch": 1.45,
2317
- "grad_norm": 0.422621851812633,
2318
  "learning_rate": 4.227706412729943e-06,
2319
- "loss": 2.1014,
2320
  "step": 1640
2321
  },
2322
  {
2323
  "epoch": 1.46,
2324
- "grad_norm": 0.3852282063127696,
2325
  "learning_rate": 4.1647552915922e-06,
2326
  "loss": 2.0999,
2327
  "step": 1645
2328
  },
2329
  {
2330
  "epoch": 1.46,
2331
- "grad_norm": 0.4025385240560515,
2332
  "learning_rate": 4.1021528682948064e-06,
2333
  "loss": 2.0864,
2334
  "step": 1650
2335
  },
2336
  {
2337
  "epoch": 1.47,
2338
- "grad_norm": 0.3897195958926085,
2339
  "learning_rate": 4.039902883782814e-06,
2340
  "loss": 2.1092,
2341
  "step": 1655
2342
  },
2343
  {
2344
  "epoch": 1.47,
2345
- "grad_norm": 0.40170774992535624,
2346
  "learning_rate": 3.978009057940518e-06,
2347
  "loss": 2.1063,
2348
  "step": 1660
2349
  },
2350
  {
2351
  "epoch": 1.47,
2352
- "grad_norm": 0.39270529953678246,
2353
  "learning_rate": 3.916475089369175e-06,
2354
- "loss": 2.0991,
2355
  "step": 1665
2356
  },
2357
  {
2358
  "epoch": 1.48,
2359
- "grad_norm": 0.40584630349461415,
2360
  "learning_rate": 3.855304655165978e-06,
2361
  "loss": 2.111,
2362
  "step": 1670
2363
  },
2364
  {
2365
  "epoch": 1.48,
2366
- "grad_norm": 0.4152239829996968,
2367
  "learning_rate": 3.794501410704331e-06,
2368
- "loss": 2.1021,
2369
  "step": 1675
2370
  },
2371
  {
2372
  "epoch": 1.49,
2373
- "grad_norm": 0.3983781052549036,
2374
  "learning_rate": 3.7340689894154023e-06,
2375
  "loss": 2.0816,
2376
  "step": 1680
2377
  },
2378
  {
2379
  "epoch": 1.49,
2380
- "grad_norm": 0.394718473346905,
2381
  "learning_rate": 3.674011002571022e-06,
2382
  "loss": 2.0942,
2383
  "step": 1685
2384
  },
2385
  {
2386
  "epoch": 1.5,
2387
- "grad_norm": 0.39161083448396583,
2388
  "learning_rate": 3.6143310390678544e-06,
2389
- "loss": 2.0922,
2390
  "step": 1690
2391
  },
2392
  {
2393
  "epoch": 1.5,
2394
- "grad_norm": 0.44043118102175927,
2395
  "learning_rate": 3.555032665212964e-06,
2396
  "loss": 2.1227,
2397
  "step": 1695
2398
  },
2399
  {
2400
  "epoch": 1.51,
2401
- "grad_norm": 0.4181858401645642,
2402
  "learning_rate": 3.496119424510678e-06,
2403
  "loss": 2.0929,
2404
  "step": 1700
2405
  },
2406
  {
2407
  "epoch": 1.51,
2408
- "grad_norm": 0.4189402248321076,
2409
  "learning_rate": 3.4375948374508516e-06,
2410
  "loss": 2.0741,
2411
  "step": 1705
2412
  },
2413
  {
2414
  "epoch": 1.51,
2415
- "grad_norm": 0.41159481219274896,
2416
  "learning_rate": 3.3794624012984913e-06,
2417
  "loss": 2.0993,
2418
  "step": 1710
2419
  },
2420
  {
2421
  "epoch": 1.52,
2422
- "grad_norm": 0.43632853472389527,
2423
  "learning_rate": 3.3217255898847635e-06,
2424
  "loss": 2.0996,
2425
  "step": 1715
2426
  },
2427
  {
2428
  "epoch": 1.52,
2429
- "grad_norm": 0.40523538878872667,
2430
  "learning_rate": 3.2643878533994145e-06,
2431
  "loss": 2.1051,
2432
  "step": 1720
2433
  },
2434
  {
2435
  "epoch": 1.53,
2436
- "grad_norm": 0.39715432170566595,
2437
  "learning_rate": 3.20745261818459e-06,
2438
- "loss": 2.0882,
2439
  "step": 1725
2440
  },
2441
  {
2442
  "epoch": 1.53,
2443
- "grad_norm": 0.392910321529135,
2444
  "learning_rate": 3.1509232865300886e-06,
2445
- "loss": 2.0966,
2446
  "step": 1730
2447
  },
2448
  {
2449
  "epoch": 1.54,
2450
- "grad_norm": 0.3878024238298016,
2451
  "learning_rate": 3.09480323647006e-06,
2452
  "loss": 2.0907,
2453
  "step": 1735
2454
  },
2455
  {
2456
  "epoch": 1.54,
2457
- "grad_norm": 0.3988069401578506,
2458
  "learning_rate": 3.039095821581127e-06,
2459
- "loss": 2.103,
2460
  "step": 1740
2461
  },
2462
  {
2463
  "epoch": 1.55,
2464
- "grad_norm": 0.39345046737475675,
2465
  "learning_rate": 2.983804370781996e-06,
2466
  "loss": 2.1006,
2467
  "step": 1745
2468
  },
2469
  {
2470
  "epoch": 1.55,
2471
- "grad_norm": 0.39314147338729954,
2472
  "learning_rate": 2.9289321881345257e-06,
2473
- "loss": 2.1112,
2474
  "step": 1750
2475
  },
2476
  {
2477
  "epoch": 1.55,
2478
- "grad_norm": 0.4185416508087659,
2479
  "learning_rate": 2.8744825526462882e-06,
2480
  "loss": 2.1027,
2481
  "step": 1755
2482
  },
2483
  {
2484
  "epoch": 1.56,
2485
- "grad_norm": 0.3862438722622432,
2486
  "learning_rate": 2.8204587180746256e-06,
2487
- "loss": 2.1025,
2488
  "step": 1760
2489
  },
2490
  {
2491
  "epoch": 1.56,
2492
- "grad_norm": 0.4029231651195777,
2493
  "learning_rate": 2.7668639127322084e-06,
2494
  "loss": 2.1158,
2495
  "step": 1765
2496
  },
2497
  {
2498
  "epoch": 1.57,
2499
- "grad_norm": 0.3841430640303821,
2500
  "learning_rate": 2.713701339294129e-06,
2501
  "loss": 2.0938,
2502
  "step": 1770
2503
  },
2504
  {
2505
  "epoch": 1.57,
2506
- "grad_norm": 0.41087055242630033,
2507
  "learning_rate": 2.66097417460651e-06,
2508
  "loss": 2.0973,
2509
  "step": 1775
2510
  },
2511
  {
2512
  "epoch": 1.58,
2513
- "grad_norm": 0.3825660035874534,
2514
  "learning_rate": 2.6086855694966795e-06,
2515
- "loss": 2.0823,
2516
  "step": 1780
2517
  },
2518
  {
2519
  "epoch": 1.58,
2520
- "grad_norm": 0.40421192136713935,
2521
  "learning_rate": 2.5568386485848663e-06,
2522
  "loss": 2.1023,
2523
  "step": 1785
2524
  },
2525
  {
2526
  "epoch": 1.59,
2527
- "grad_norm": 0.3900700470696687,
2528
  "learning_rate": 2.505436510097494e-06,
2529
  "loss": 2.1128,
2530
  "step": 1790
2531
  },
2532
  {
2533
  "epoch": 1.59,
2534
- "grad_norm": 0.41978213613176885,
2535
  "learning_rate": 2.45448222568204e-06,
2536
  "loss": 2.108,
2537
  "step": 1795
2538
  },
2539
  {
2540
  "epoch": 1.59,
2541
- "grad_norm": 0.39653265205370863,
2542
  "learning_rate": 2.4039788402234787e-06,
2543
- "loss": 2.0815,
2544
  "step": 1800
2545
  },
2546
  {
2547
  "epoch": 1.6,
2548
- "grad_norm": 0.3904988304393235,
2549
  "learning_rate": 2.3539293716623268e-06,
2550
  "loss": 2.1154,
2551
  "step": 1805
2552
  },
2553
  {
2554
  "epoch": 1.6,
2555
- "grad_norm": 0.41088869514529786,
2556
  "learning_rate": 2.304336810814305e-06,
2557
- "loss": 2.0988,
2558
  "step": 1810
2559
  },
2560
  {
2561
  "epoch": 1.61,
2562
- "grad_norm": 0.3944752622451121,
2563
  "learning_rate": 2.2552041211916052e-06,
2564
  "loss": 2.0972,
2565
  "step": 1815
2566
  },
2567
  {
2568
  "epoch": 1.61,
2569
- "grad_norm": 0.4199794267969718,
2570
  "learning_rate": 2.2065342388258193e-06,
2571
  "loss": 2.0875,
2572
  "step": 1820
2573
  },
2574
  {
2575
  "epoch": 1.62,
2576
- "grad_norm": 0.4018868915941079,
2577
  "learning_rate": 2.1583300720924604e-06,
2578
  "loss": 2.0744,
2579
  "step": 1825
2580
  },
2581
  {
2582
  "epoch": 1.62,
2583
- "grad_norm": 0.41301804029081357,
2584
  "learning_rate": 2.1105945015371985e-06,
2585
  "loss": 2.092,
2586
  "step": 1830
2587
  },
2588
  {
2589
  "epoch": 1.63,
2590
- "grad_norm": 0.4081654998221138,
2591
  "learning_rate": 2.063330379703702e-06,
2592
  "loss": 2.1053,
2593
  "step": 1835
2594
  },
2595
  {
2596
  "epoch": 1.63,
2597
- "grad_norm": 0.39416563602352717,
2598
  "learning_rate": 2.016540530963188e-06,
2599
- "loss": 2.0908,
2600
  "step": 1840
2601
  },
2602
  {
2603
  "epoch": 1.63,
2604
- "grad_norm": 0.3945726154982212,
2605
  "learning_rate": 1.9702277513456493e-06,
2606
  "loss": 2.1015,
2607
  "step": 1845
2608
  },
2609
  {
2610
  "epoch": 1.64,
2611
- "grad_norm": 0.39279928238606776,
2612
  "learning_rate": 1.9243948083727626e-06,
2613
  "loss": 2.1065,
2614
  "step": 1850
2615
  },
2616
  {
2617
  "epoch": 1.64,
2618
- "grad_norm": 0.37787428944134677,
2619
  "learning_rate": 1.879044440892517e-06,
2620
- "loss": 2.1049,
2621
  "step": 1855
2622
  },
2623
  {
2624
  "epoch": 1.65,
2625
- "grad_norm": 0.40537368502843923,
2626
  "learning_rate": 1.8341793589155444e-06,
2627
  "loss": 2.0941,
2628
  "step": 1860
2629
  },
2630
  {
2631
  "epoch": 1.65,
2632
- "grad_norm": 0.40078605326949507,
2633
  "learning_rate": 1.789802243453178e-06,
2634
  "loss": 2.0958,
2635
  "step": 1865
2636
  },
2637
  {
2638
  "epoch": 1.66,
2639
- "grad_norm": 0.44481298266147135,
2640
  "learning_rate": 1.7459157463572396e-06,
2641
  "loss": 2.1184,
2642
  "step": 1870
2643
  },
2644
  {
2645
  "epoch": 1.66,
2646
- "grad_norm": 0.40272869649156834,
2647
  "learning_rate": 1.7025224901615811e-06,
2648
  "loss": 2.087,
2649
  "step": 1875
2650
  },
2651
  {
2652
  "epoch": 1.67,
2653
- "grad_norm": 0.40256734727861593,
2654
  "learning_rate": 1.6596250679253568e-06,
2655
  "loss": 2.1043,
2656
  "step": 1880
2657
  },
2658
  {
2659
  "epoch": 1.67,
2660
- "grad_norm": 0.40369235793920866,
2661
  "learning_rate": 1.6172260430780772e-06,
2662
  "loss": 2.0896,
2663
  "step": 1885
2664
  },
2665
  {
2666
  "epoch": 1.67,
2667
- "grad_norm": 0.40338141640376785,
2668
  "learning_rate": 1.5753279492664264e-06,
2669
- "loss": 2.0809,
2670
  "step": 1890
2671
  },
2672
  {
2673
  "epoch": 1.68,
2674
- "grad_norm": 0.40404764905997415,
2675
  "learning_rate": 1.5339332902028537e-06,
2676
  "loss": 2.1114,
2677
  "step": 1895
2678
  },
2679
  {
2680
  "epoch": 1.68,
2681
- "grad_norm": 0.40310967377683793,
2682
  "learning_rate": 1.493044539515961e-06,
2683
  "loss": 2.1031,
2684
  "step": 1900
2685
  },
2686
  {
2687
  "epoch": 1.69,
2688
- "grad_norm": 0.39868877773871236,
2689
  "learning_rate": 1.4526641406026898e-06,
2690
  "loss": 2.1044,
2691
  "step": 1905
2692
  },
2693
  {
2694
  "epoch": 1.69,
2695
- "grad_norm": 0.3929122965795558,
2696
  "learning_rate": 1.4127945064823023e-06,
2697
  "loss": 2.0905,
2698
  "step": 1910
2699
  },
2700
  {
2701
  "epoch": 1.7,
2702
- "grad_norm": 0.38647811469854176,
2703
  "learning_rate": 1.3734380196521923e-06,
2704
  "loss": 2.0806,
2705
  "step": 1915
2706
  },
2707
  {
2708
  "epoch": 1.7,
2709
- "grad_norm": 0.38559139379344426,
2710
  "learning_rate": 1.334597031945517e-06,
2711
  "loss": 2.0962,
2712
  "step": 1920
2713
  },
2714
  {
2715
  "epoch": 1.71,
2716
- "grad_norm": 0.3978680847732746,
2717
  "learning_rate": 1.296273864390646e-06,
2718
  "loss": 2.0921,
2719
  "step": 1925
2720
  },
2721
  {
2722
  "epoch": 1.71,
2723
- "grad_norm": 0.4021005918682771,
2724
  "learning_rate": 1.2584708070724738e-06,
2725
  "loss": 2.0863,
2726
  "step": 1930
2727
  },
2728
  {
2729
  "epoch": 1.71,
2730
- "grad_norm": 0.4100118268308445,
2731
  "learning_rate": 1.2211901189955689e-06,
2732
  "loss": 2.0845,
2733
  "step": 1935
2734
  },
2735
  {
2736
  "epoch": 1.72,
2737
- "grad_norm": 0.4153149630591261,
2738
  "learning_rate": 1.1844340279491772e-06,
2739
  "loss": 2.091,
2740
  "step": 1940
2741
  },
2742
  {
2743
  "epoch": 1.72,
2744
- "grad_norm": 0.3766336576726111,
2745
  "learning_rate": 1.1482047303740996e-06,
2746
  "loss": 2.1058,
2747
  "step": 1945
2748
  },
2749
  {
2750
  "epoch": 1.73,
2751
- "grad_norm": 0.37979716883373044,
2752
  "learning_rate": 1.1125043912314438e-06,
2753
  "loss": 2.0792,
2754
  "step": 1950
2755
  },
2756
  {
2757
  "epoch": 1.73,
2758
- "grad_norm": 0.39410441439894295,
2759
  "learning_rate": 1.0773351438732392e-06,
2760
  "loss": 2.0941,
2761
  "step": 1955
2762
  },
2763
  {
2764
  "epoch": 1.74,
2765
- "grad_norm": 0.39632265790424465,
2766
  "learning_rate": 1.0426990899149658e-06,
2767
  "loss": 2.1108,
2768
  "step": 1960
2769
  },
2770
  {
2771
  "epoch": 1.74,
2772
- "grad_norm": 0.41425174918384033,
2773
  "learning_rate": 1.0085982991099585e-06,
2774
  "loss": 2.0842,
2775
  "step": 1965
2776
  },
2777
  {
2778
  "epoch": 1.74,
2779
- "grad_norm": 0.40245376356667706,
2780
  "learning_rate": 9.750348092257368e-07,
2781
  "loss": 2.1133,
2782
  "step": 1970
2783
  },
2784
  {
2785
  "epoch": 1.75,
2786
- "grad_norm": 0.40108425655069946,
2787
  "learning_rate": 9.420106259222184e-07,
2788
- "loss": 2.0977,
2789
  "step": 1975
2790
  },
2791
  {
2792
  "epoch": 1.75,
2793
- "grad_norm": 0.38658469741160056,
2794
  "learning_rate": 9.095277226318766e-07,
2795
  "loss": 2.106,
2796
  "step": 1980
2797
  },
2798
  {
2799
  "epoch": 1.76,
2800
- "grad_norm": 0.38971694065470736,
2801
  "learning_rate": 8.775880404418113e-07,
2802
  "loss": 2.1073,
2803
  "step": 1985
2804
  },
2805
  {
2806
  "epoch": 1.76,
2807
- "grad_norm": 0.39236639824692643,
2808
  "learning_rate": 8.461934879777545e-07,
2809
  "loss": 2.107,
2810
  "step": 1990
2811
  },
2812
  {
2813
  "epoch": 1.77,
2814
- "grad_norm": 0.39264689558068677,
2815
  "learning_rate": 8.153459412900156e-07,
2816
  "loss": 2.1047,
2817
  "step": 1995
2818
  },
2819
  {
2820
  "epoch": 1.77,
2821
- "grad_norm": 0.3881182512544317,
2822
  "learning_rate": 7.850472437413748e-07,
2823
  "loss": 2.0887,
2824
  "step": 2000
2825
  },
2826
  {
2827
  "epoch": 1.78,
2828
- "grad_norm": 0.4099221827496233,
2829
  "learning_rate": 7.552992058969299e-07,
2830
  "loss": 2.0946,
2831
  "step": 2005
2832
  },
2833
  {
2834
  "epoch": 1.78,
2835
- "grad_norm": 0.4008848120727289,
2836
  "learning_rate": 7.261036054158965e-07,
2837
  "loss": 2.0936,
2838
  "step": 2010
2839
  },
2840
  {
2841
  "epoch": 1.78,
2842
- "grad_norm": 0.3901817272531918,
2843
  "learning_rate": 6.974621869453924e-07,
2844
  "loss": 2.1059,
2845
  "step": 2015
2846
  },
2847
  {
2848
  "epoch": 1.79,
2849
- "grad_norm": 0.4036104225742069,
2850
  "learning_rate": 6.693766620161691e-07,
2851
  "loss": 2.0739,
2852
  "step": 2020
2853
  },
2854
  {
2855
  "epoch": 1.79,
2856
- "grad_norm": 0.40124538026020445,
2857
  "learning_rate": 6.418487089403392e-07,
2858
  "loss": 2.0968,
2859
  "step": 2025
2860
  },
2861
  {
2862
  "epoch": 1.8,
2863
- "grad_norm": 0.4128906676750628,
2864
  "learning_rate": 6.148799727110911e-07,
2865
  "loss": 2.113,
2866
  "step": 2030
2867
  },
2868
  {
2869
  "epoch": 1.8,
2870
- "grad_norm": 0.40326437614909544,
2871
  "learning_rate": 5.884720649043807e-07,
2872
- "loss": 2.1008,
2873
  "step": 2035
2874
  },
2875
  {
2876
  "epoch": 1.81,
2877
- "grad_norm": 0.39815093894666054,
2878
  "learning_rate": 5.626265635826367e-07,
2879
  "loss": 2.0865,
2880
  "step": 2040
2881
  },
2882
  {
2883
  "epoch": 1.81,
2884
- "grad_norm": 0.41328457201535923,
2885
  "learning_rate": 5.373450132004499e-07,
2886
  "loss": 2.0913,
2887
  "step": 2045
2888
  },
2889
  {
2890
  "epoch": 1.82,
2891
- "grad_norm": 0.39991862490609165,
2892
  "learning_rate": 5.126289245122906e-07,
2893
  "loss": 2.0932,
2894
  "step": 2050
2895
  },
2896
  {
2897
  "epoch": 1.82,
2898
- "grad_norm": 0.38850870390010905,
2899
  "learning_rate": 4.884797744822212e-07,
2900
  "loss": 2.0978,
2901
  "step": 2055
2902
  },
2903
  {
2904
  "epoch": 1.82,
2905
- "grad_norm": 0.3825674690477348,
2906
  "learning_rate": 4.648990061956493e-07,
2907
  "loss": 2.07,
2908
  "step": 2060
2909
  },
2910
  {
2911
  "epoch": 1.83,
2912
- "grad_norm": 0.4080938117585051,
2913
  "learning_rate": 4.418880287730798e-07,
2914
  "loss": 2.1086,
2915
  "step": 2065
2916
  },
2917
  {
2918
  "epoch": 1.83,
2919
- "grad_norm": 0.39154913896551635,
2920
  "learning_rate": 4.194482172859127e-07,
2921
  "loss": 2.1096,
2922
  "step": 2070
2923
  },
2924
  {
2925
  "epoch": 1.84,
2926
- "grad_norm": 0.39276140367617096,
2927
  "learning_rate": 3.9758091267428245e-07,
2928
  "loss": 2.1058,
2929
  "step": 2075
2930
  },
2931
  {
2932
  "epoch": 1.84,
2933
- "grad_norm": 0.40266297058608747,
2934
  "learning_rate": 3.762874216669166e-07,
2935
  "loss": 2.0968,
2936
  "step": 2080
2937
  },
2938
  {
2939
  "epoch": 1.85,
2940
- "grad_norm": 0.38416418274470837,
2941
  "learning_rate": 3.555690167030512e-07,
2942
- "loss": 2.0943,
2943
  "step": 2085
2944
  },
2945
  {
2946
  "epoch": 1.85,
2947
- "grad_norm": 0.38627102267650687,
2948
  "learning_rate": 3.354269358563966e-07,
2949
  "loss": 2.0752,
2950
  "step": 2090
2951
  },
2952
  {
2953
  "epoch": 1.86,
2954
- "grad_norm": 0.39948485243169796,
2955
  "learning_rate": 3.158623827611529e-07,
2956
  "loss": 2.082,
2957
  "step": 2095
2958
  },
2959
  {
2960
  "epoch": 1.86,
2961
- "grad_norm": 0.39155833643949695,
2962
  "learning_rate": 2.968765265400808e-07,
2963
- "loss": 2.1093,
2964
  "step": 2100
2965
  },
2966
  {
2967
  "epoch": 1.86,
2968
- "grad_norm": 0.3898094314171678,
2969
  "learning_rate": 2.784705017346423e-07,
2970
  "loss": 2.1011,
2971
  "step": 2105
2972
  },
2973
  {
2974
  "epoch": 1.87,
2975
- "grad_norm": 0.4161268651639999,
2976
  "learning_rate": 2.606454082372045e-07,
2977
  "loss": 2.118,
2978
  "step": 2110
2979
  },
2980
  {
2981
  "epoch": 1.87,
2982
- "grad_norm": 0.39270199978496556,
2983
  "learning_rate": 2.4340231122530477e-07,
2984
  "loss": 2.091,
2985
  "step": 2115
2986
  },
2987
  {
2988
  "epoch": 1.88,
2989
- "grad_norm": 0.3965377293981766,
2990
  "learning_rate": 2.2674224109800913e-07,
2991
  "loss": 2.1105,
2992
  "step": 2120
2993
  },
2994
  {
2995
  "epoch": 1.88,
2996
- "grad_norm": 0.4070531814341945,
2997
  "learning_rate": 2.106661934143317e-07,
2998
  "loss": 2.0776,
2999
  "step": 2125
3000
  },
3001
  {
3002
  "epoch": 1.89,
3003
- "grad_norm": 0.40126305868439827,
3004
  "learning_rate": 1.9517512883374667e-07,
3005
- "loss": 2.0765,
3006
  "step": 2130
3007
  },
3008
  {
3009
  "epoch": 1.89,
3010
- "grad_norm": 0.393427448726336,
3011
  "learning_rate": 1.802699730587798e-07,
3012
  "loss": 2.0959,
3013
  "step": 2135
3014
  },
3015
  {
3016
  "epoch": 1.9,
3017
- "grad_norm": 0.38224864123958974,
3018
  "learning_rate": 1.659516167796904e-07,
3019
  "loss": 2.0748,
3020
  "step": 2140
3021
  },
3022
  {
3023
  "epoch": 1.9,
3024
- "grad_norm": 0.39001902571032665,
3025
  "learning_rate": 1.522209156212484e-07,
3026
  "loss": 2.095,
3027
  "step": 2145
3028
  },
3029
  {
3030
  "epoch": 1.9,
3031
- "grad_norm": 0.3886098993911617,
3032
  "learning_rate": 1.3907869009160525e-07,
3033
  "loss": 2.0935,
3034
  "step": 2150
3035
  },
3036
  {
3037
  "epoch": 1.91,
3038
- "grad_norm": 0.40509023917038356,
3039
  "learning_rate": 1.265257255332586e-07,
3040
- "loss": 2.0797,
3041
  "step": 2155
3042
  },
3043
  {
3044
  "epoch": 1.91,
3045
- "grad_norm": 0.4006132339342348,
3046
  "learning_rate": 1.1456277207612554e-07,
3047
  "loss": 2.1011,
3048
  "step": 2160
3049
  },
3050
  {
3051
  "epoch": 1.92,
3052
- "grad_norm": 0.39903804360873457,
3053
  "learning_rate": 1.0319054459271837e-07,
3054
- "loss": 2.1072,
3055
  "step": 2165
3056
  },
3057
  {
3058
  "epoch": 1.92,
3059
- "grad_norm": 0.3941511383568346,
3060
  "learning_rate": 9.240972265541992e-08,
3061
  "loss": 2.1013,
3062
  "step": 2170
3063
  },
3064
  {
3065
  "epoch": 1.93,
3066
- "grad_norm": 0.39465869086240923,
3067
  "learning_rate": 8.222095049588264e-08,
3068
  "loss": 2.0907,
3069
  "step": 2175
3070
  },
3071
  {
3072
  "epoch": 1.93,
3073
- "grad_norm": 0.3886795229575515,
3074
  "learning_rate": 7.262483696652167e-08,
3075
  "loss": 2.1052,
3076
  "step": 2180
3077
  },
3078
  {
3079
  "epoch": 1.94,
3080
- "grad_norm": 0.3896724676511193,
3081
  "learning_rate": 6.362195550413953e-08,
3082
  "loss": 2.0855,
3083
  "step": 2185
3084
  },
3085
  {
3086
  "epoch": 1.94,
3087
- "grad_norm": 0.38537356847165705,
3088
  "learning_rate": 5.521284409565675e-08,
3089
  "loss": 2.0939,
3090
  "step": 2190
3091
  },
3092
  {
3093
  "epoch": 1.94,
3094
- "grad_norm": 0.4110911825589815,
3095
  "learning_rate": 4.739800524595884e-08,
3096
  "loss": 2.1071,
3097
  "step": 2195
3098
  },
3099
  {
3100
  "epoch": 1.95,
3101
- "grad_norm": 0.39664920267890863,
3102
  "learning_rate": 4.017790594787574e-08,
3103
- "loss": 2.0964,
3104
  "step": 2200
3105
  },
3106
  {
3107
  "epoch": 1.95,
3108
- "grad_norm": 0.3997936501506563,
3109
  "learning_rate": 3.355297765426868e-08,
3110
  "loss": 2.0959,
3111
  "step": 2205
3112
  },
3113
  {
3114
  "epoch": 1.96,
3115
- "grad_norm": 0.38711089783301433,
3116
  "learning_rate": 2.7523616252252972e-08,
3117
- "loss": 2.0801,
3118
  "step": 2210
3119
  },
3120
  {
3121
  "epoch": 1.96,
3122
- "grad_norm": 0.3882020891197347,
3123
  "learning_rate": 2.2090182039538055e-08,
3124
  "loss": 2.0808,
3125
  "step": 2215
3126
  },
3127
  {
3128
  "epoch": 1.97,
3129
- "grad_norm": 0.38163332350201257,
3130
  "learning_rate": 1.7252999702894736e-08,
3131
  "loss": 2.089,
3132
  "step": 2220
3133
  },
3134
  {
3135
  "epoch": 1.97,
3136
- "grad_norm": 0.3823615274538012,
3137
  "learning_rate": 1.3012358298760686e-08,
3138
  "loss": 2.0984,
3139
  "step": 2225
3140
  },
3141
  {
3142
  "epoch": 1.98,
3143
- "grad_norm": 0.41756473271306493,
3144
  "learning_rate": 9.368511235958722e-09,
3145
  "loss": 2.1074,
3146
  "step": 2230
3147
  },
3148
  {
3149
  "epoch": 1.98,
3150
- "grad_norm": 0.38841621078812283,
3151
  "learning_rate": 6.3216762605589064e-09,
3152
  "loss": 2.0877,
3153
  "step": 2235
3154
  },
3155
  {
3156
  "epoch": 1.98,
3157
- "grad_norm": 0.3797191083714544,
3158
  "learning_rate": 3.87203544286563e-09,
3159
  "loss": 2.1056,
3160
  "step": 2240
3161
  },
3162
  {
3163
  "epoch": 1.99,
3164
- "grad_norm": 0.3944732061433803,
3165
  "learning_rate": 2.019735166534087e-09,
3166
  "loss": 2.0838,
3167
  "step": 2245
3168
  },
3169
  {
3170
  "epoch": 1.99,
3171
- "grad_norm": 0.4217956121494208,
3172
  "learning_rate": 7.648861198306101e-10,
3173
  "loss": 2.1125,
3174
  "step": 2250
3175
  },
3176
  {
3177
  "epoch": 2.0,
3178
- "grad_norm": 0.4044063079872186,
3179
  "learning_rate": 1.0756328901018188e-10,
3180
- "loss": 2.0929,
3181
  "step": 2255
3182
  },
3183
  {
3184
  "epoch": 2.0,
3185
- "eval_loss": 2.106666326522827,
3186
- "eval_runtime": 22.306,
3187
- "eval_samples_per_second": 1433.647,
3188
- "eval_steps_per_second": 22.415,
3189
  "step": 2258
3190
  },
3191
  {
3192
  "epoch": 2.0,
3193
  "step": 2258,
3194
  "total_flos": 13637863342080.0,
3195
- "train_loss": 2.1773925889794863,
3196
- "train_runtime": 649.8909,
3197
- "train_samples_per_second": 889.158,
3198
- "train_steps_per_second": 3.474
3199
  }
3200
  ],
3201
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 6.166644832357895,
14
  "learning_rate": 8.849557522123894e-08,
15
  "loss": 3.1048,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0,
20
+ "grad_norm": 6.547137390213372,
21
  "learning_rate": 4.4247787610619474e-07,
22
  "loss": 3.1401,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.01,
27
+ "grad_norm": 6.696783975456761,
28
  "learning_rate": 8.849557522123895e-07,
29
  "loss": 3.1347,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01,
34
+ "grad_norm": 6.565174886284236,
35
  "learning_rate": 1.3274336283185843e-06,
36
  "loss": 3.1075,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.02,
41
+ "grad_norm": 6.564097662799965,
42
  "learning_rate": 1.769911504424779e-06,
43
  "loss": 3.1301,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.02,
48
+ "grad_norm": 5.858614718276462,
49
  "learning_rate": 2.212389380530974e-06,
50
  "loss": 3.0924,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.03,
55
+ "grad_norm": 5.070422553033002,
56
  "learning_rate": 2.6548672566371687e-06,
57
  "loss": 3.0522,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.03,
62
+ "grad_norm": 4.85611313965228,
63
  "learning_rate": 3.097345132743363e-06,
64
  "loss": 2.9879,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.04,
69
+ "grad_norm": 3.705112773069458,
70
  "learning_rate": 3.539823008849558e-06,
71
  "loss": 2.9571,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04,
76
+ "grad_norm": 3.3846074522276495,
77
  "learning_rate": 3.982300884955752e-06,
78
  "loss": 2.8407,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.04,
83
+ "grad_norm": 2.7596431694637418,
84
  "learning_rate": 4.424778761061948e-06,
85
  "loss": 2.8395,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.05,
90
+ "grad_norm": 2.4166924070830507,
91
  "learning_rate": 4.867256637168142e-06,
92
  "loss": 2.7639,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.05,
97
+ "grad_norm": 1.9023199017991137,
98
  "learning_rate": 5.309734513274337e-06,
99
+ "loss": 2.7241,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.06,
104
+ "grad_norm": 1.7127226131554742,
105
  "learning_rate": 5.752212389380532e-06,
106
  "loss": 2.6849,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06,
111
+ "grad_norm": 1.5288819308151376,
112
  "learning_rate": 6.194690265486726e-06,
113
  "loss": 2.6676,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.07,
118
+ "grad_norm": 1.3304603350726332,
119
  "learning_rate": 6.6371681415929215e-06,
120
  "loss": 2.6535,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07,
125
+ "grad_norm": 1.2361791442984238,
126
  "learning_rate": 7.079646017699116e-06,
127
  "loss": 2.6052,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.08,
132
+ "grad_norm": 1.1715130887245255,
133
  "learning_rate": 7.5221238938053095e-06,
134
  "loss": 2.5976,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08,
139
+ "grad_norm": 1.0438274967714645,
140
  "learning_rate": 7.964601769911505e-06,
141
  "loss": 2.5714,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08,
146
+ "grad_norm": 1.0463621526060618,
147
  "learning_rate": 8.4070796460177e-06,
148
  "loss": 2.5294,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09,
153
+ "grad_norm": 1.0020484167730224,
154
  "learning_rate": 8.849557522123895e-06,
155
  "loss": 2.5195,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09,
160
+ "grad_norm": 0.9056288322795102,
161
  "learning_rate": 9.29203539823009e-06,
162
  "loss": 2.4895,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.1,
167
+ "grad_norm": 0.833576084032464,
168
  "learning_rate": 9.734513274336284e-06,
169
  "loss": 2.4613,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1,
174
+ "grad_norm": 0.8280411391519304,
175
  "learning_rate": 1.0176991150442479e-05,
176
  "loss": 2.4672,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.11,
181
+ "grad_norm": 0.7773488088473773,
182
  "learning_rate": 1.0619469026548675e-05,
183
  "loss": 2.4404,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.11,
188
+ "grad_norm": 0.7126701603391907,
189
  "learning_rate": 1.1061946902654867e-05,
190
  "loss": 2.4359,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.12,
195
+ "grad_norm": 0.6317818410183665,
196
  "learning_rate": 1.1504424778761064e-05,
197
  "loss": 2.4162,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.12,
202
+ "grad_norm": 0.6331415128839278,
203
  "learning_rate": 1.1946902654867258e-05,
204
  "loss": 2.4092,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.12,
209
+ "grad_norm": 0.6159804066612772,
210
  "learning_rate": 1.2389380530973452e-05,
211
  "loss": 2.3838,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.13,
216
+ "grad_norm": 0.6215640814359188,
217
  "learning_rate": 1.2831858407079647e-05,
218
  "loss": 2.3835,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.13,
223
+ "grad_norm": 0.5294354141640165,
224
  "learning_rate": 1.3274336283185843e-05,
225
  "loss": 2.3808,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.14,
230
+ "grad_norm": 0.545389364003436,
231
  "learning_rate": 1.3716814159292036e-05,
232
  "loss": 2.3642,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.14,
237
+ "grad_norm": 0.5171175171539601,
238
  "learning_rate": 1.4159292035398232e-05,
239
+ "loss": 2.3454,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.15,
244
+ "grad_norm": 0.5367136014344058,
245
  "learning_rate": 1.4601769911504426e-05,
246
  "loss": 2.3306,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.15,
251
+ "grad_norm": 0.4742492654414562,
252
  "learning_rate": 1.5044247787610619e-05,
253
  "loss": 2.3065,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.16,
258
+ "grad_norm": 0.548194578492432,
259
  "learning_rate": 1.5486725663716813e-05,
260
  "loss": 2.3428,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.16,
265
+ "grad_norm": 0.5077658281995345,
266
  "learning_rate": 1.592920353982301e-05,
267
  "loss": 2.3521,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.16,
272
+ "grad_norm": 0.46274842578404124,
273
  "learning_rate": 1.6371681415929206e-05,
274
  "loss": 2.3497,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.17,
279
+ "grad_norm": 0.5215055422633831,
280
  "learning_rate": 1.68141592920354e-05,
281
+ "loss": 2.3223,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.17,
286
+ "grad_norm": 0.4911463491627382,
287
  "learning_rate": 1.7256637168141594e-05,
288
  "loss": 2.32,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.18,
293
+ "grad_norm": 0.45833419456324376,
294
  "learning_rate": 1.769911504424779e-05,
295
  "loss": 2.3187,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.18,
300
+ "grad_norm": 0.5331840173984403,
301
  "learning_rate": 1.8141592920353983e-05,
302
  "loss": 2.318,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.19,
307
+ "grad_norm": 0.44845883947380627,
308
  "learning_rate": 1.858407079646018e-05,
309
  "loss": 2.3148,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.19,
314
+ "grad_norm": 0.47309229724596086,
315
  "learning_rate": 1.9026548672566376e-05,
316
  "loss": 2.2872,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.19,
321
+ "grad_norm": 0.4498719193921967,
322
  "learning_rate": 1.946902654867257e-05,
323
+ "loss": 2.2929,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.2,
328
+ "grad_norm": 0.4576469978258525,
329
  "learning_rate": 1.991150442477876e-05,
330
  "loss": 2.2822,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.2,
335
+ "grad_norm": 0.4746021797726598,
336
  "learning_rate": 1.9999808776641724e-05,
337
  "loss": 2.2906,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.21,
342
+ "grad_norm": 0.45862070548887435,
343
  "learning_rate": 1.999903194428269e-05,
344
  "loss": 2.2843,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.21,
349
+ "grad_norm": 0.4593641696408351,
350
  "learning_rate": 1.999765759784862e-05,
351
  "loss": 2.2903,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.22,
356
+ "grad_norm": 0.44728442843797506,
357
  "learning_rate": 1.9995685819466593e-05,
358
+ "loss": 2.2751,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.22,
363
+ "grad_norm": 0.4905181074653029,
364
  "learning_rate": 1.9993116726964554e-05,
365
  "loss": 2.2752,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.23,
370
+ "grad_norm": 0.445910469529929,
371
  "learning_rate": 1.9989950473864254e-05,
372
+ "loss": 2.2766,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.23,
377
+ "grad_norm": 0.44961271558777066,
378
  "learning_rate": 1.998618724937209e-05,
379
  "loss": 2.2672,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.23,
384
+ "grad_norm": 0.4327966352470938,
385
  "learning_rate": 1.9981827278367796e-05,
386
+ "loss": 2.2748,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.24,
391
+ "grad_norm": 0.4455548944584037,
392
  "learning_rate": 1.997687082139099e-05,
393
  "loss": 2.2488,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.24,
398
+ "grad_norm": 0.47402423105947356,
399
  "learning_rate": 1.9971318174625633e-05,
400
  "loss": 2.2535,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.25,
405
+ "grad_norm": 0.4545737206225055,
406
  "learning_rate": 1.9965169669882293e-05,
407
+ "loss": 2.2388,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.25,
412
+ "grad_norm": 0.4820678878118451,
413
  "learning_rate": 1.9958425674578364e-05,
414
+ "loss": 2.245,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.26,
419
+ "grad_norm": 0.454117663270049,
420
  "learning_rate": 1.995108659171607e-05,
421
  "loss": 2.2723,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.26,
426
+ "grad_norm": 0.4308481926341958,
427
  "learning_rate": 1.9943152859858386e-05,
428
  "loss": 2.2374,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.27,
433
+ "grad_norm": 0.457943720428271,
434
  "learning_rate": 1.9934624953102858e-05,
435
+ "loss": 2.2524,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.27,
440
+ "grad_norm": 0.4906285667223247,
441
  "learning_rate": 1.9925503381053258e-05,
442
+ "loss": 2.2438,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.27,
447
+ "grad_norm": 0.4593610178083461,
448
  "learning_rate": 1.9915788688789107e-05,
449
  "loss": 2.2538,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.28,
454
+ "grad_norm": 0.4995396687971251,
455
  "learning_rate": 1.990548145683315e-05,
456
  "loss": 2.2404,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.28,
461
+ "grad_norm": 0.4528076525137905,
462
  "learning_rate": 1.9894582301116633e-05,
463
  "loss": 2.2373,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.29,
468
+ "grad_norm": 0.4481518195673047,
469
  "learning_rate": 1.9883091872942484e-05,
470
+ "loss": 2.2358,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.29,
475
+ "grad_norm": 0.47849857697478076,
476
  "learning_rate": 1.9871010858946443e-05,
477
+ "loss": 2.2352,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.3,
482
+ "grad_norm": 0.4204182029306216,
483
  "learning_rate": 1.985833998105598e-05,
484
  "loss": 2.2175,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.3,
489
+ "grad_norm": 0.458038912321597,
490
  "learning_rate": 1.984507999644719e-05,
491
+ "loss": 2.2244,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.31,
496
+ "grad_norm": 0.4601423924745291,
497
  "learning_rate": 1.9831231697499515e-05,
498
  "loss": 2.2288,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.31,
503
+ "grad_norm": 0.45435633931580244,
504
  "learning_rate": 1.9816795911748422e-05,
505
+ "loss": 2.2502,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.31,
510
+ "grad_norm": 0.42520336741983306,
511
  "learning_rate": 1.980177350183594e-05,
512
+ "loss": 2.2327,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.32,
517
+ "grad_norm": 0.42964313838533047,
518
  "learning_rate": 1.9786165365459102e-05,
519
  "loss": 2.2368,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.32,
524
+ "grad_norm": 0.42668300300777884,
525
  "learning_rate": 1.976997243531632e-05,
526
  "loss": 2.2194,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.33,
531
+ "grad_norm": 0.4424756748882247,
532
  "learning_rate": 1.975319567905163e-05,
533
  "loss": 2.2236,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.33,
538
+ "grad_norm": 0.4380736771663855,
539
  "learning_rate": 1.9735836099196882e-05,
540
  "loss": 2.2242,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.34,
545
+ "grad_norm": 0.4365125689503889,
546
  "learning_rate": 1.971789473311184e-05,
547
  "loss": 2.2149,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.34,
552
+ "grad_norm": 0.448131392880714,
553
  "learning_rate": 1.9699372652922154e-05,
554
+ "loss": 2.2341,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.35,
559
+ "grad_norm": 0.43379842503532956,
560
  "learning_rate": 1.9680270965455343e-05,
561
  "loss": 2.2334,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.35,
566
+ "grad_norm": 0.4526418213836391,
567
  "learning_rate": 1.966059081217461e-05,
568
  "loss": 2.2198,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.35,
573
+ "grad_norm": 0.45143976872092273,
574
  "learning_rate": 1.9640333369110662e-05,
575
  "loss": 2.2177,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.36,
580
+ "grad_norm": 0.4598962725387317,
581
  "learning_rate": 1.9619499846791426e-05,
582
  "loss": 2.2251,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.36,
587
+ "grad_norm": 0.4662451413096498,
588
  "learning_rate": 1.9598091490169696e-05,
589
+ "loss": 2.2033,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.37,
594
+ "grad_norm": 0.4203729538995507,
595
  "learning_rate": 1.9576109578548757e-05,
596
  "loss": 2.2156,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.37,
601
+ "grad_norm": 0.4791967373627921,
602
  "learning_rate": 1.9553555425505933e-05,
603
  "loss": 2.187,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.38,
608
+ "grad_norm": 0.448274957521754,
609
  "learning_rate": 1.953043037881408e-05,
610
  "loss": 2.212,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.38,
615
+ "grad_norm": 0.4650450102318859,
616
  "learning_rate": 1.9506735820361065e-05,
617
  "loss": 2.2216,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.39,
622
+ "grad_norm": 0.4166836881106268,
623
  "learning_rate": 1.9482473166067177e-05,
624
  "loss": 2.2124,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.39,
629
+ "grad_norm": 0.4244327137348908,
630
  "learning_rate": 1.945764386580051e-05,
631
  "loss": 2.2188,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.39,
636
+ "grad_norm": 0.4271192503834766,
637
  "learning_rate": 1.9432249403290337e-05,
638
  "loss": 2.2159,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.4,
643
+ "grad_norm": 0.4256331246211807,
644
  "learning_rate": 1.940629129603844e-05,
645
  "loss": 2.202,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.4,
650
+ "grad_norm": 0.464978637587654,
651
  "learning_rate": 1.9379771095228426e-05,
652
  "loss": 2.1968,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.41,
657
+ "grad_norm": 0.44651256811388607,
658
  "learning_rate": 1.935269038563303e-05,
659
  "loss": 2.1957,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.41,
664
+ "grad_norm": 0.44743821721822236,
665
  "learning_rate": 1.9325050785519438e-05,
666
  "loss": 2.2145,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.42,
671
+ "grad_norm": 0.4951764755480859,
672
  "learning_rate": 1.9296853946552532e-05,
673
  "loss": 2.1951,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.42,
678
+ "grad_norm": 0.4479555817427096,
679
  "learning_rate": 1.9268101553696255e-05,
680
  "loss": 2.2028,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.43,
685
+ "grad_norm": 0.43525920739151547,
686
  "learning_rate": 1.9238795325112867e-05,
687
  "loss": 2.1919,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.43,
692
+ "grad_norm": 0.4608113989736665,
693
  "learning_rate": 1.9208937012060316e-05,
694
  "loss": 2.1974,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.43,
699
+ "grad_norm": 0.4406444489063602,
700
  "learning_rate": 1.9178528398787553e-05,
701
  "loss": 2.2052,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.44,
706
+ "grad_norm": 0.43574994152555524,
707
  "learning_rate": 1.9147571302427927e-05,
708
  "loss": 2.1925,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.44,
713
+ "grad_norm": 0.42949144077248924,
714
  "learning_rate": 1.9116067572890603e-05,
715
  "loss": 2.1824,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.45,
720
+ "grad_norm": 0.4603679893190777,
721
  "learning_rate": 1.9084019092750007e-05,
722
+ "loss": 2.1732,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.45,
727
+ "grad_norm": 0.4209594032010091,
728
  "learning_rate": 1.9051427777133328e-05,
729
  "loss": 2.1803,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.46,
734
+ "grad_norm": 0.4403010953163875,
735
  "learning_rate": 1.901829557360608e-05,
736
  "loss": 2.1803,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.46,
741
+ "grad_norm": 0.43475758156931377,
742
  "learning_rate": 1.8984624462055724e-05,
743
  "loss": 2.1779,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.47,
748
+ "grad_norm": 0.44415202937171544,
749
  "learning_rate": 1.895041645457335e-05,
750
  "loss": 2.1787,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.47,
755
+ "grad_norm": 0.47765712755847844,
756
  "learning_rate": 1.8915673595333443e-05,
757
  "loss": 2.1894,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.47,
762
+ "grad_norm": 0.4300832072827714,
763
  "learning_rate": 1.8880397960471724e-05,
764
  "loss": 2.1769,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.48,
769
+ "grad_norm": 0.41943334089156564,
770
  "learning_rate": 1.8844591657961083e-05,
771
  "loss": 2.1678,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.48,
776
+ "grad_norm": 0.4209449326502485,
777
  "learning_rate": 1.880825682748563e-05,
778
+ "loss": 2.1943,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.49,
783
+ "grad_norm": 0.4809108947390715,
784
  "learning_rate": 1.877139564031282e-05,
785
  "loss": 2.1905,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.49,
790
+ "grad_norm": 0.45104745368521976,
791
  "learning_rate": 1.87340102991637e-05,
792
+ "loss": 2.1568,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.5,
797
+ "grad_norm": 0.4525167073378338,
798
  "learning_rate": 1.8696103038081297e-05,
799
  "loss": 2.1736,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.5,
804
+ "grad_norm": 0.45653670332049995,
805
  "learning_rate": 1.86576761222971e-05,
806
  "loss": 2.1615,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.5,
811
+ "grad_norm": 0.45654955813685105,
812
  "learning_rate": 1.8618731848095706e-05,
813
  "loss": 2.1693,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.51,
818
+ "grad_norm": 0.41771940774213,
819
  "learning_rate": 1.8579272542677597e-05,
820
  "loss": 2.1782,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.51,
825
+ "grad_norm": 0.4321344155514586,
826
  "learning_rate": 1.853930056402008e-05,
827
+ "loss": 2.1799,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.52,
832
+ "grad_norm": 0.47763410916552507,
833
  "learning_rate": 1.849881830073637e-05,
834
  "loss": 2.1631,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.52,
839
+ "grad_norm": 0.42170540546143326,
840
  "learning_rate": 1.845782817193286e-05,
841
  "loss": 2.1567,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.53,
846
+ "grad_norm": 0.41444685845592893,
847
  "learning_rate": 1.841633262706456e-05,
848
  "loss": 2.1756,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.53,
853
+ "grad_norm": 0.43808624300007026,
854
  "learning_rate": 1.8374334145788723e-05,
855
  "loss": 2.1694,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.54,
860
+ "grad_norm": 0.44922932936357124,
861
  "learning_rate": 1.833183523781668e-05,
862
+ "loss": 2.1879,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.54,
867
+ "grad_norm": 0.42474450421173265,
868
  "learning_rate": 1.8288838442763838e-05,
869
  "loss": 2.1538,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.54,
874
+ "grad_norm": 0.5120377008361465,
875
  "learning_rate": 1.824534632999796e-05,
876
  "loss": 2.1667,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.55,
881
+ "grad_norm": 0.4468560336467633,
882
  "learning_rate": 1.820136149848559e-05,
883
  "loss": 2.161,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.55,
888
+ "grad_norm": 0.4651190048983478,
889
  "learning_rate": 1.8156886576636758e-05,
890
  "loss": 2.1816,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.56,
895
+ "grad_norm": 0.4521866301989512,
896
  "learning_rate": 1.8111924222147927e-05,
897
  "loss": 2.1684,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.56,
902
+ "grad_norm": 0.41111198915767083,
903
  "learning_rate": 1.8066477121843163e-05,
904
+ "loss": 2.1558,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.57,
909
+ "grad_norm": 0.47583719934058827,
910
  "learning_rate": 1.8020547991513583e-05,
911
+ "loss": 2.167,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.57,
916
+ "grad_norm": 0.47155059029791335,
917
  "learning_rate": 1.7974139575755055e-05,
918
  "loss": 2.1623,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.58,
923
+ "grad_norm": 0.40080295270482796,
924
  "learning_rate": 1.792725464780421e-05,
925
  "loss": 2.1715,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.58,
930
+ "grad_norm": 0.4328921669907557,
931
  "learning_rate": 1.7879896009372698e-05,
932
  "loss": 2.1668,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.58,
937
+ "grad_norm": 0.4454535033104222,
938
  "learning_rate": 1.7832066490479797e-05,
939
  "loss": 2.1821,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.59,
944
+ "grad_norm": 0.4350656772715166,
945
  "learning_rate": 1.7783768949283258e-05,
946
  "loss": 2.1658,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.59,
951
+ "grad_norm": 0.4468843087382892,
952
  "learning_rate": 1.773500627190854e-05,
953
  "loss": 2.172,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.6,
958
+ "grad_norm": 0.4368818462301088,
959
  "learning_rate": 1.7685781372276338e-05,
960
  "loss": 2.1711,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.6,
965
+ "grad_norm": 0.42440860729329116,
966
  "learning_rate": 1.7636097191928437e-05,
967
  "loss": 2.161,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.61,
972
+ "grad_norm": 0.44675450168849135,
973
  "learning_rate": 1.758595669985197e-05,
974
  "loss": 2.1715,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.61,
979
+ "grad_norm": 0.43829279542516164,
980
  "learning_rate": 1.7535362892301953e-05,
981
  "loss": 2.173,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.62,
986
+ "grad_norm": 0.4131531710729138,
987
  "learning_rate": 1.748431879262229e-05,
988
+ "loss": 2.1669,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.62,
993
+ "grad_norm": 0.4143392540313518,
994
  "learning_rate": 1.7432827451065052e-05,
995
  "loss": 2.174,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.62,
1000
+ "grad_norm": 0.4655990641094679,
1001
  "learning_rate": 1.7380891944608243e-05,
1002
  "loss": 2.1566,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.63,
1007
+ "grad_norm": 0.4133019233382481,
1008
  "learning_rate": 1.732851537677191e-05,
1009
  "loss": 2.1415,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.63,
1014
+ "grad_norm": 0.4343478806770909,
1015
  "learning_rate": 1.7275700877432693e-05,
1016
  "loss": 2.1382,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.64,
1021
+ "grad_norm": 0.4148108227181473,
1022
  "learning_rate": 1.7222451602636785e-05,
1023
  "loss": 2.1524,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.64,
1028
+ "grad_norm": 0.44585563716026727,
1029
  "learning_rate": 1.7168770734411344e-05,
1030
  "loss": 2.1498,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.65,
1035
+ "grad_norm": 0.42868628699604844,
1036
  "learning_rate": 1.711466148057433e-05,
1037
  "loss": 2.1498,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.65,
1042
+ "grad_norm": 0.40858180310847936,
1043
  "learning_rate": 1.7060127074542847e-05,
1044
  "loss": 2.1407,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.66,
1049
+ "grad_norm": 0.44134300003234284,
1050
  "learning_rate": 1.700517077513987e-05,
1051
  "loss": 2.1445,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.66,
1056
+ "grad_norm": 0.45761970000954805,
1057
  "learning_rate": 1.6949795866399554e-05,
1058
  "loss": 2.1559,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.66,
1063
+ "grad_norm": 0.40368739077739324,
1064
  "learning_rate": 1.689400565737098e-05,
1065
  "loss": 2.17,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.67,
1070
+ "grad_norm": 0.4522860522707782,
1071
  "learning_rate": 1.6837803481920393e-05,
1072
  "loss": 2.1655,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.67,
1077
+ "grad_norm": 0.43489563682227156,
1078
  "learning_rate": 1.6781192698532e-05,
1079
  "loss": 2.1475,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.68,
1084
+ "grad_norm": 0.45103686349499067,
1085
  "learning_rate": 1.6724176690107272e-05,
1086
  "loss": 2.1689,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.68,
1091
+ "grad_norm": 0.43066938562904067,
1092
  "learning_rate": 1.6666758863762796e-05,
1093
  "loss": 2.1565,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.69,
1098
+ "grad_norm": 0.4668016417225619,
1099
  "learning_rate": 1.6608942650626655e-05,
1100
  "loss": 2.1498,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.69,
1105
+ "grad_norm": 0.45398011122993853,
1106
  "learning_rate": 1.655073150563343e-05,
1107
  "loss": 2.1371,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.7,
1112
+ "grad_norm": 0.4137045611399235,
1113
  "learning_rate": 1.6492128907317696e-05,
1114
+ "loss": 2.1448,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.7,
1119
+ "grad_norm": 0.42766725815738255,
1120
  "learning_rate": 1.6433138357606198e-05,
1121
  "loss": 2.1386,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.7,
1126
+ "grad_norm": 0.4859220503111576,
1127
  "learning_rate": 1.637376338160856e-05,
1128
  "loss": 2.15,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.71,
1133
+ "grad_norm": 0.41871421755269245,
1134
  "learning_rate": 1.6314007527406643e-05,
1135
  "loss": 2.1479,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.71,
1140
+ "grad_norm": 0.5058912515812595,
1141
  "learning_rate": 1.6253874365842518e-05,
1142
  "loss": 2.1617,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.72,
1147
+ "grad_norm": 0.40359399707231297,
1148
  "learning_rate": 1.619336749030509e-05,
1149
  "loss": 2.1512,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.72,
1154
+ "grad_norm": 0.4286098723233825,
1155
  "learning_rate": 1.613249051651535e-05,
1156
  "loss": 2.1392,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.73,
1161
+ "grad_norm": 0.4271511167666986,
1162
  "learning_rate": 1.6071247082310337e-05,
1163
  "loss": 2.1453,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.73,
1168
+ "grad_norm": 0.4191827290206147,
1169
  "learning_rate": 1.6009640847425726e-05,
1170
  "loss": 2.1523,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.74,
1175
+ "grad_norm": 0.42090263234491304,
1176
  "learning_rate": 1.594767549327714e-05,
1177
+ "loss": 2.1364,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.74,
1182
+ "grad_norm": 0.4150471777441493,
1183
  "learning_rate": 1.588535472274017e-05,
1184
+ "loss": 2.1414,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.74,
1189
+ "grad_norm": 0.4374142549111757,
1190
  "learning_rate": 1.5822682259929086e-05,
1191
+ "loss": 2.1528,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.75,
1196
+ "grad_norm": 0.42231610168434414,
1197
  "learning_rate": 1.57596618499743e-05,
1198
+ "loss": 2.1337,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.75,
1203
+ "grad_norm": 0.4064219239907817,
1204
  "learning_rate": 1.5696297258798573e-05,
1205
+ "loss": 2.1329,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.76,
1210
+ "grad_norm": 0.403733820859978,
1211
  "learning_rate": 1.5632592272891964e-05,
1212
  "loss": 2.1276,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.76,
1217
+ "grad_norm": 0.4376353978066983,
1218
  "learning_rate": 1.5568550699085574e-05,
1219
  "loss": 2.1346,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.77,
1224
+ "grad_norm": 0.4155182941205036,
1225
  "learning_rate": 1.550417636432404e-05,
1226
  "loss": 2.1318,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.77,
1231
+ "grad_norm": 0.43512014827105794,
1232
  "learning_rate": 1.5439473115436872e-05,
1233
+ "loss": 2.1353,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.78,
1238
+ "grad_norm": 0.40975739052285454,
1239
  "learning_rate": 1.5374444818908553e-05,
1240
+ "loss": 2.1156,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.78,
1245
+ "grad_norm": 0.42514295153487675,
1246
  "learning_rate": 1.5309095360647505e-05,
1247
+ "loss": 2.1414,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.78,
1252
+ "grad_norm": 0.3993292587155139,
1253
  "learning_rate": 1.5243428645753877e-05,
1254
+ "loss": 2.1379,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.79,
1259
+ "grad_norm": 0.44134886145633384,
1260
  "learning_rate": 1.5177448598286182e-05,
1261
  "loss": 2.152,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.79,
1266
+ "grad_norm": 0.4385531024930876,
1267
  "learning_rate": 1.5111159161026802e-05,
1268
  "loss": 2.126,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.8,
1273
+ "grad_norm": 0.4108453318091144,
1274
  "learning_rate": 1.5044564295246395e-05,
1275
  "loss": 2.1289,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.8,
1280
+ "grad_norm": 0.4116649925258244,
1281
  "learning_rate": 1.4977667980467162e-05,
1282
  "loss": 2.1446,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.81,
1287
+ "grad_norm": 0.40974726712864873,
1288
  "learning_rate": 1.491047421422505e-05,
1289
  "loss": 2.1455,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.81,
1294
+ "grad_norm": 0.43137528729137836,
1295
  "learning_rate": 1.4842987011830871e-05,
1296
  "loss": 2.146,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.81,
1301
+ "grad_norm": 0.4438853705060339,
1302
  "learning_rate": 1.4775210406130358e-05,
1303
  "loss": 2.1246,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.82,
1308
+ "grad_norm": 0.4027263645222192,
1309
  "learning_rate": 1.4707148447263178e-05,
1310
  "loss": 2.1182,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.82,
1315
+ "grad_norm": 0.4410545591857016,
1316
  "learning_rate": 1.4638805202420896e-05,
1317
  "loss": 2.1547,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.83,
1322
+ "grad_norm": 0.43985167955927634,
1323
  "learning_rate": 1.4570184755603936e-05,
1324
  "loss": 2.1369,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.83,
1329
+ "grad_norm": 0.4355491918272838,
1330
  "learning_rate": 1.4501291207377537e-05,
1331
  "loss": 2.1361,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.84,
1336
+ "grad_norm": 0.454779025988056,
1337
  "learning_rate": 1.4432128674626713e-05,
1338
  "loss": 2.1414,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.84,
1343
+ "grad_norm": 0.4307815003812548,
1344
  "learning_rate": 1.4362701290310234e-05,
1345
  "loss": 2.1348,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.85,
1350
+ "grad_norm": 0.426571598680512,
1351
  "learning_rate": 1.4293013203213662e-05,
1352
  "loss": 2.1229,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.85,
1357
+ "grad_norm": 0.4246473743751572,
1358
  "learning_rate": 1.422306857770141e-05,
1359
+ "loss": 2.1288,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.85,
1364
+ "grad_norm": 0.3999523059473982,
1365
  "learning_rate": 1.415287159346793e-05,
1366
  "loss": 2.1325,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.86,
1371
+ "grad_norm": 0.4067806403847973,
1372
  "learning_rate": 1.4082426445287904e-05,
1373
  "loss": 2.1254,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.86,
1378
+ "grad_norm": 0.4195583726491516,
1379
  "learning_rate": 1.4011737342765604e-05,
1380
+ "loss": 2.1435,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.87,
1385
+ "grad_norm": 0.4296009008111393,
1386
  "learning_rate": 1.3940808510083321e-05,
1387
  "loss": 2.152,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.87,
1392
+ "grad_norm": 0.44603114772371527,
1393
  "learning_rate": 1.3869644185748954e-05,
1394
  "loss": 2.1294,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.88,
1399
+ "grad_norm": 0.40138829464705644,
1400
  "learning_rate": 1.3798248622342719e-05,
1401
  "loss": 2.1373,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.88,
1406
+ "grad_norm": 0.4211199997995903,
1407
  "learning_rate": 1.3726626086263029e-05,
1408
  "loss": 2.127,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.89,
1413
+ "grad_norm": 0.4258462470968173,
1414
  "learning_rate": 1.3654780857471548e-05,
1415
  "loss": 2.1204,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.89,
1420
+ "grad_norm": 0.44460567505016096,
1421
  "learning_rate": 1.3582717229237434e-05,
1422
+ "loss": 2.1385,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.89,
1427
+ "grad_norm": 0.43396247394626575,
1428
  "learning_rate": 1.3510439507880778e-05,
1429
+ "loss": 2.1494,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.9,
1434
+ "grad_norm": 0.4032140738720204,
1435
  "learning_rate": 1.3437952012515275e-05,
1436
  "loss": 2.1321,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.9,
1441
+ "grad_norm": 0.4511691309939334,
1442
  "learning_rate": 1.336525907479013e-05,
1443
+ "loss": 2.1361,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.91,
1448
+ "grad_norm": 0.41068509369036366,
1449
  "learning_rate": 1.32923650386312e-05,
1450
  "loss": 2.1278,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.91,
1455
+ "grad_norm": 0.41629721555088506,
1456
  "learning_rate": 1.321927425998143e-05,
1457
  "loss": 2.1422,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.92,
1462
+ "grad_norm": 0.4291095541986292,
1463
  "learning_rate": 1.314599110654053e-05,
1464
+ "loss": 2.1299,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.92,
1469
+ "grad_norm": 0.4393758682295335,
1470
  "learning_rate": 1.3072519957504e-05,
1471
  "loss": 2.1327,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.93,
1476
+ "grad_norm": 0.3959617472852389,
1477
  "learning_rate": 1.2998865203301424e-05,
1478
  "loss": 2.1233,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.93,
1483
+ "grad_norm": 0.42149029056968895,
1484
  "learning_rate": 1.2925031245334112e-05,
1485
  "loss": 2.132,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.93,
1490
+ "grad_norm": 0.437723393378722,
1491
  "learning_rate": 1.2851022495712092e-05,
1492
  "loss": 2.1316,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.94,
1497
+ "grad_norm": 0.42773127538515715,
1498
  "learning_rate": 1.2776843376990448e-05,
1499
  "loss": 2.1199,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.94,
1504
+ "grad_norm": 0.43476650039814163,
1505
  "learning_rate": 1.270249832190505e-05,
1506
  "loss": 2.1191,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.95,
1511
+ "grad_norm": 0.43286071322418734,
1512
  "learning_rate": 1.2627991773107651e-05,
1513
+ "loss": 2.1169,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.95,
1518
+ "grad_norm": 0.4412845796053013,
1519
  "learning_rate": 1.2553328182900414e-05,
1520
  "loss": 2.1438,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.96,
1525
+ "grad_norm": 0.4471162386801732,
1526
  "learning_rate": 1.2478512012969864e-05,
1527
  "loss": 2.1134,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 0.96,
1532
+ "grad_norm": 0.4294332123394731,
1533
  "learning_rate": 1.2403547734120253e-05,
1534
+ "loss": 2.1256,
1535
  "step": 1085
1536
  },
1537
  {
1538
  "epoch": 0.97,
1539
+ "grad_norm": 0.410006105020709,
1540
  "learning_rate": 1.2328439826006415e-05,
1541
  "loss": 2.1301,
1542
  "step": 1090
1543
  },
1544
  {
1545
  "epoch": 0.97,
1546
+ "grad_norm": 0.4061707061885615,
1547
  "learning_rate": 1.2253192776866059e-05,
1548
  "loss": 2.1218,
1549
  "step": 1095
1550
  },
1551
  {
1552
  "epoch": 0.97,
1553
+ "grad_norm": 0.39702909548389576,
1554
  "learning_rate": 1.2177811083251572e-05,
1555
+ "loss": 2.0971,
1556
  "step": 1100
1557
  },
1558
  {
1559
  "epoch": 0.98,
1560
+ "grad_norm": 0.42322184280411995,
1561
  "learning_rate": 1.2102299249761315e-05,
1562
  "loss": 2.1216,
1563
  "step": 1105
1564
  },
1565
  {
1566
  "epoch": 0.98,
1567
+ "grad_norm": 0.42833967065330203,
1568
  "learning_rate": 1.2026661788770453e-05,
1569
  "loss": 2.1199,
1570
  "step": 1110
1571
  },
1572
  {
1573
  "epoch": 0.99,
1574
+ "grad_norm": 0.43638375935123347,
1575
  "learning_rate": 1.1950903220161286e-05,
1576
  "loss": 2.1192,
1577
  "step": 1115
1578
  },
1579
  {
1580
  "epoch": 0.99,
1581
+ "grad_norm": 0.46170416867922665,
1582
  "learning_rate": 1.1875028071053165e-05,
1583
  "loss": 2.1384,
1584
  "step": 1120
1585
  },
1586
  {
1587
  "epoch": 1.0,
1588
+ "grad_norm": 0.430510324577715,
1589
  "learning_rate": 1.1799040875531975e-05,
1590
  "loss": 2.1213,
1591
  "step": 1125
1592
  },
1593
  {
1594
  "epoch": 1.0,
1595
+ "eval_loss": 2.12646484375,
1596
+ "eval_runtime": 22.1427,
1597
+ "eval_samples_per_second": 1444.223,
1598
+ "eval_steps_per_second": 22.581,
1599
  "step": 1129
1600
  },
1601
  {
1602
  "epoch": 1.0,
1603
+ "grad_norm": 0.40787800687014425,
1604
  "learning_rate": 1.1722946174379168e-05,
1605
  "loss": 2.1371,
1606
  "step": 1130
1607
  },
1608
  {
1609
  "epoch": 1.01,
1610
+ "grad_norm": 0.4165260116303724,
1611
  "learning_rate": 1.1646748514800441e-05,
1612
+ "loss": 2.1134,
1613
  "step": 1135
1614
  },
1615
  {
1616
  "epoch": 1.01,
1617
+ "grad_norm": 0.42929598573052125,
1618
  "learning_rate": 1.1570452450153992e-05,
1619
  "loss": 2.1102,
1620
  "step": 1140
1621
  },
1622
  {
1623
  "epoch": 1.01,
1624
+ "grad_norm": 0.41876342866360117,
1625
  "learning_rate": 1.149406253967843e-05,
1626
  "loss": 2.1215,
1627
  "step": 1145
1628
  },
1629
  {
1630
  "epoch": 1.02,
1631
+ "grad_norm": 0.42019385875713755,
1632
  "learning_rate": 1.1417583348220322e-05,
1633
  "loss": 2.1132,
1634
  "step": 1150
1635
  },
1636
  {
1637
  "epoch": 1.02,
1638
+ "grad_norm": 0.3915643276668065,
1639
  "learning_rate": 1.134101944596143e-05,
1640
  "loss": 2.1223,
1641
  "step": 1155
1642
  },
1643
  {
1644
  "epoch": 1.03,
1645
+ "grad_norm": 0.39231290437055993,
1646
  "learning_rate": 1.1264375408145582e-05,
1647
+ "loss": 2.1278,
1648
  "step": 1160
1649
  },
1650
  {
1651
  "epoch": 1.03,
1652
+ "grad_norm": 0.39599183284076206,
1653
  "learning_rate": 1.118765581480529e-05,
1654
  "loss": 2.1014,
1655
  "step": 1165
1656
  },
1657
  {
1658
  "epoch": 1.04,
1659
+ "grad_norm": 0.4117960974101987,
1660
  "learning_rate": 1.1110865250488047e-05,
1661
  "loss": 2.1277,
1662
  "step": 1170
1663
  },
1664
  {
1665
  "epoch": 1.04,
1666
+ "grad_norm": 0.4216703078232253,
1667
  "learning_rate": 1.1034008303982373e-05,
1668
  "loss": 2.124,
1669
  "step": 1175
1670
  },
1671
  {
1672
  "epoch": 1.05,
1673
+ "grad_norm": 0.41487469113530573,
1674
  "learning_rate": 1.0957089568043607e-05,
1675
  "loss": 2.1107,
1676
  "step": 1180
1677
  },
1678
  {
1679
  "epoch": 1.05,
1680
+ "grad_norm": 0.4139823788692836,
1681
  "learning_rate": 1.088011363911944e-05,
1682
  "loss": 2.1053,
1683
  "step": 1185
1684
  },
1685
  {
1686
  "epoch": 1.05,
1687
+ "grad_norm": 0.42425049272736226,
1688
  "learning_rate": 1.080308511707527e-05,
1689
+ "loss": 2.118,
1690
  "step": 1190
1691
  },
1692
  {
1693
  "epoch": 1.06,
1694
+ "grad_norm": 0.4162658153411123,
1695
  "learning_rate": 1.0726008604919296e-05,
1696
  "loss": 2.1071,
1697
  "step": 1195
1698
  },
1699
  {
1700
  "epoch": 1.06,
1701
+ "grad_norm": 0.4140960985951221,
1702
  "learning_rate": 1.0648888708527481e-05,
1703
  "loss": 2.1217,
1704
  "step": 1200
1705
  },
1706
  {
1707
  "epoch": 1.07,
1708
+ "grad_norm": 0.40871622926812157,
1709
  "learning_rate": 1.0571730036368308e-05,
1710
+ "loss": 2.1056,
1711
  "step": 1205
1712
  },
1713
  {
1714
  "epoch": 1.07,
1715
+ "grad_norm": 0.4191797458484589,
1716
  "learning_rate": 1.0494537199227393e-05,
1717
  "loss": 2.1029,
1718
  "step": 1210
1719
  },
1720
  {
1721
  "epoch": 1.08,
1722
+ "grad_norm": 0.39652273174754216,
1723
  "learning_rate": 1.0417314809931945e-05,
1724
  "loss": 2.1044,
1725
  "step": 1215
1726
  },
1727
  {
1728
  "epoch": 1.08,
1729
+ "grad_norm": 0.4350415156817985,
1730
  "learning_rate": 1.0340067483075135e-05,
1731
  "loss": 2.0963,
1732
  "step": 1220
1733
  },
1734
  {
1735
  "epoch": 1.09,
1736
+ "grad_norm": 0.41314099698230733,
1737
  "learning_rate": 1.0262799834740334e-05,
1738
+ "loss": 2.129,
1739
  "step": 1225
1740
  },
1741
  {
1742
  "epoch": 1.09,
1743
+ "grad_norm": 0.40628715336164606,
1744
  "learning_rate": 1.0185516482225264e-05,
1745
  "loss": 2.1263,
1746
  "step": 1230
1747
  },
1748
  {
1749
  "epoch": 1.09,
1750
+ "grad_norm": 0.41082200401156493,
1751
  "learning_rate": 1.0108222043766087e-05,
1752
  "loss": 2.12,
1753
  "step": 1235
1754
  },
1755
  {
1756
  "epoch": 1.1,
1757
+ "grad_norm": 0.40358092811130275,
1758
  "learning_rate": 1.0030921138261422e-05,
1759
  "loss": 2.0842,
1760
  "step": 1240
1761
  },
1762
  {
1763
  "epoch": 1.1,
1764
+ "grad_norm": 0.40154966225872907,
1765
  "learning_rate": 9.953618384996353e-06,
1766
+ "loss": 2.1134,
1767
  "step": 1245
1768
  },
1769
  {
1770
  "epoch": 1.11,
1771
+ "grad_norm": 0.4061806463348494,
1772
  "learning_rate": 9.876318403366371e-06,
1773
  "loss": 2.0966,
1774
  "step": 1250
1775
  },
1776
  {
1777
  "epoch": 1.11,
1778
+ "grad_norm": 0.4033065891608578,
1779
  "learning_rate": 9.79902581260135e-06,
1780
  "loss": 2.1092,
1781
  "step": 1255
1782
  },
1783
  {
1784
  "epoch": 1.12,
1785
+ "grad_norm": 0.4075543686866664,
1786
  "learning_rate": 9.721745231489499e-06,
1787
  "loss": 2.1149,
1788
  "step": 1260
1789
  },
1790
  {
1791
  "epoch": 1.12,
1792
+ "grad_norm": 0.40259604700214474,
1793
  "learning_rate": 9.644481278101366e-06,
1794
+ "loss": 2.1308,
1795
  "step": 1265
1796
  },
1797
  {
1798
  "epoch": 1.12,
1799
+ "grad_norm": 0.4118825922238076,
1800
  "learning_rate": 9.567238569513872e-06,
1801
  "loss": 2.1018,
1802
  "step": 1270
1803
  },
1804
  {
1805
  "epoch": 1.13,
1806
+ "grad_norm": 0.4206984650260599,
1807
  "learning_rate": 9.49002172153442e-06,
1808
  "loss": 2.1129,
1809
  "step": 1275
1810
  },
1811
  {
1812
  "epoch": 1.13,
1813
+ "grad_norm": 0.4419153569108646,
1814
  "learning_rate": 9.412835348425038e-06,
1815
  "loss": 2.1136,
1816
  "step": 1280
1817
  },
1818
  {
1819
  "epoch": 1.14,
1820
+ "grad_norm": 0.435736091946508,
1821
  "learning_rate": 9.335684062626669e-06,
1822
  "loss": 2.112,
1823
  "step": 1285
1824
  },
1825
  {
1826
  "epoch": 1.14,
1827
+ "grad_norm": 0.4180743992684701,
1828
  "learning_rate": 9.25857247448354e-06,
1829
  "loss": 2.1002,
1830
  "step": 1290
1831
  },
1832
  {
1833
  "epoch": 1.15,
1834
+ "grad_norm": 0.4217230981943121,
1835
  "learning_rate": 9.181505191967656e-06,
1836
  "loss": 2.1032,
1837
  "step": 1295
1838
  },
1839
  {
1840
  "epoch": 1.15,
1841
+ "grad_norm": 0.40752463084234536,
1842
  "learning_rate": 9.104486820403438e-06,
1843
+ "loss": 2.1137,
1844
  "step": 1300
1845
  },
1846
  {
1847
  "epoch": 1.16,
1848
+ "grad_norm": 0.3923238583618452,
1849
  "learning_rate": 9.027521962192532e-06,
1850
  "loss": 2.1151,
1851
  "step": 1305
1852
  },
1853
  {
1854
  "epoch": 1.16,
1855
+ "grad_norm": 0.4373302648314911,
1856
  "learning_rate": 8.950615216538765e-06,
1857
  "loss": 2.1281,
1858
  "step": 1310
1859
  },
1860
  {
1861
  "epoch": 1.16,
1862
+ "grad_norm": 0.43300868025613454,
1863
  "learning_rate": 8.873771179173339e-06,
1864
  "loss": 2.1031,
1865
  "step": 1315
1866
  },
1867
  {
1868
  "epoch": 1.17,
1869
+ "grad_norm": 0.4205719678143442,
1870
  "learning_rate": 8.796994442080167e-06,
1871
  "loss": 2.1161,
1872
  "step": 1320
1873
  },
1874
  {
1875
  "epoch": 1.17,
1876
+ "grad_norm": 0.41894348844643914,
1877
  "learning_rate": 8.720289593221502e-06,
1878
  "loss": 2.1147,
1879
  "step": 1325
1880
  },
1881
  {
1882
  "epoch": 1.18,
1883
+ "grad_norm": 0.44069670556967233,
1884
  "learning_rate": 8.643661216263744e-06,
1885
  "loss": 2.0824,
1886
  "step": 1330
1887
  },
1888
  {
1889
  "epoch": 1.18,
1890
+ "grad_norm": 0.3944374256948791,
1891
  "learning_rate": 8.567113890303554e-06,
1892
  "loss": 2.1216,
1893
  "step": 1335
1894
  },
1895
  {
1896
  "epoch": 1.19,
1897
+ "grad_norm": 0.4413321404072914,
1898
  "learning_rate": 8.490652189594212e-06,
1899
  "loss": 2.1071,
1900
  "step": 1340
1901
  },
1902
  {
1903
  "epoch": 1.19,
1904
+ "grad_norm": 0.4015802057316869,
1905
  "learning_rate": 8.414280683272273e-06,
1906
  "loss": 2.1239,
1907
  "step": 1345
1908
  },
1909
  {
1910
  "epoch": 1.2,
1911
+ "grad_norm": 0.3950742041142527,
1912
  "learning_rate": 8.338003935084531e-06,
1913
  "loss": 2.1129,
1914
  "step": 1350
1915
  },
1916
  {
1917
  "epoch": 1.2,
1918
+ "grad_norm": 0.4292903646961872,
1919
  "learning_rate": 8.2618265031153e-06,
1920
+ "loss": 2.1124,
1921
  "step": 1355
1922
  },
1923
  {
1924
  "epoch": 1.2,
1925
+ "grad_norm": 0.3981531880228784,
1926
  "learning_rate": 8.185752939514026e-06,
1927
  "loss": 2.0961,
1928
  "step": 1360
1929
  },
1930
  {
1931
  "epoch": 1.21,
1932
+ "grad_norm": 0.40787482798566205,
1933
  "learning_rate": 8.109787790223285e-06,
1934
  "loss": 2.0981,
1935
  "step": 1365
1936
  },
1937
  {
1938
  "epoch": 1.21,
1939
+ "grad_norm": 0.4565206646631857,
1940
  "learning_rate": 8.033935594707116e-06,
1941
+ "loss": 2.1022,
1942
  "step": 1370
1943
  },
1944
  {
1945
  "epoch": 1.22,
1946
+ "grad_norm": 0.41953912968223384,
1947
  "learning_rate": 7.958200885679752e-06,
1948
  "loss": 2.1216,
1949
  "step": 1375
1950
  },
1951
  {
1952
  "epoch": 1.22,
1953
+ "grad_norm": 0.4056920155350916,
1954
  "learning_rate": 7.88258818883477e-06,
1955
  "loss": 2.1316,
1956
  "step": 1380
1957
  },
1958
  {
1959
  "epoch": 1.23,
1960
+ "grad_norm": 0.40732433266741186,
1961
  "learning_rate": 7.807102022574631e-06,
1962
  "loss": 2.0998,
1963
  "step": 1385
1964
  },
1965
  {
1966
  "epoch": 1.23,
1967
+ "grad_norm": 0.4094150520135308,
1968
  "learning_rate": 7.7317468977407e-06,
1969
+ "loss": 2.1043,
1970
  "step": 1390
1971
  },
1972
  {
1973
  "epoch": 1.24,
1974
+ "grad_norm": 0.4143968764627874,
1975
  "learning_rate": 7.65652731734366e-06,
1976
  "loss": 2.0905,
1977
  "step": 1395
1978
  },
1979
  {
1980
  "epoch": 1.24,
1981
+ "grad_norm": 0.41429132965318916,
1982
  "learning_rate": 7.5814477762944435e-06,
1983
  "loss": 2.1261,
1984
  "step": 1400
1985
  },
1986
  {
1987
  "epoch": 1.24,
1988
+ "grad_norm": 0.41651300684161424,
1989
  "learning_rate": 7.506512761135627e-06,
1990
  "loss": 2.104,
1991
  "step": 1405
1992
  },
1993
  {
1994
  "epoch": 1.25,
1995
+ "grad_norm": 0.40430702749471803,
1996
  "learning_rate": 7.431726749773322e-06,
1997
  "loss": 2.0818,
1998
  "step": 1410
1999
  },
2000
  {
2001
  "epoch": 1.25,
2002
+ "grad_norm": 0.39545065916224165,
2003
  "learning_rate": 7.3570942112095955e-06,
2004
  "loss": 2.1178,
2005
  "step": 1415
2006
  },
2007
  {
2008
  "epoch": 1.26,
2009
+ "grad_norm": 0.41845323074942986,
2010
  "learning_rate": 7.282619605275409e-06,
2011
  "loss": 2.1073,
2012
  "step": 1420
2013
  },
2014
  {
2015
  "epoch": 1.26,
2016
+ "grad_norm": 0.4291565825269165,
2017
  "learning_rate": 7.208307382364111e-06,
2018
  "loss": 2.0965,
2019
  "step": 1425
2020
  },
2021
  {
2022
  "epoch": 1.27,
2023
+ "grad_norm": 0.4180727624857724,
2024
  "learning_rate": 7.134161983165498e-06,
2025
+ "loss": 2.1125,
2026
  "step": 1430
2027
  },
2028
  {
2029
  "epoch": 1.27,
2030
+ "grad_norm": 0.4000511580619176,
2031
  "learning_rate": 7.060187838400451e-06,
2032
  "loss": 2.1136,
2033
  "step": 1435
2034
  },
2035
  {
2036
  "epoch": 1.28,
2037
+ "grad_norm": 0.4199755899525794,
2038
  "learning_rate": 6.986389368556168e-06,
2039
+ "loss": 2.1077,
2040
  "step": 1440
2041
  },
2042
  {
2043
  "epoch": 1.28,
2044
+ "grad_norm": 0.40083106141467084,
2045
  "learning_rate": 6.912770983622008e-06,
2046
  "loss": 2.1171,
2047
  "step": 1445
2048
  },
2049
  {
2050
  "epoch": 1.28,
2051
+ "grad_norm": 0.4051211620073931,
2052
  "learning_rate": 6.839337082825954e-06,
2053
  "loss": 2.1166,
2054
  "step": 1450
2055
  },
2056
  {
2057
  "epoch": 1.29,
2058
+ "grad_norm": 0.40880848512095863,
2059
  "learning_rate": 6.766092054371744e-06,
2060
  "loss": 2.1066,
2061
  "step": 1455
2062
  },
2063
  {
2064
  "epoch": 1.29,
2065
+ "grad_norm": 0.4270223799938442,
2066
  "learning_rate": 6.693040275176623e-06,
2067
  "loss": 2.1284,
2068
  "step": 1460
2069
  },
2070
  {
2071
  "epoch": 1.3,
2072
+ "grad_norm": 0.410943245645624,
2073
  "learning_rate": 6.62018611060982e-06,
2074
+ "loss": 2.1023,
2075
  "step": 1465
2076
  },
2077
  {
2078
  "epoch": 1.3,
2079
+ "grad_norm": 0.40110281421249894,
2080
  "learning_rate": 6.547533914231654e-06,
2081
  "loss": 2.1095,
2082
  "step": 1470
2083
  },
2084
  {
2085
  "epoch": 1.31,
2086
+ "grad_norm": 0.4222937432875456,
2087
  "learning_rate": 6.475088027533399e-06,
2088
  "loss": 2.1251,
2089
  "step": 1475
2090
  },
2091
  {
2092
  "epoch": 1.31,
2093
+ "grad_norm": 0.41734461880760143,
2094
  "learning_rate": 6.40285277967784e-06,
2095
  "loss": 2.1165,
2096
  "step": 1480
2097
  },
2098
  {
2099
  "epoch": 1.32,
2100
+ "grad_norm": 0.4219500307007948,
2101
  "learning_rate": 6.330832487240573e-06,
2102
  "loss": 2.0901,
2103
  "step": 1485
2104
  },
2105
  {
2106
  "epoch": 1.32,
2107
+ "grad_norm": 0.41050615869957313,
2108
  "learning_rate": 6.2590314539520695e-06,
2109
  "loss": 2.1171,
2110
  "step": 1490
2111
  },
2112
  {
2113
  "epoch": 1.32,
2114
+ "grad_norm": 0.41236984958005407,
2115
  "learning_rate": 6.187453970440484e-06,
2116
+ "loss": 2.0995,
2117
  "step": 1495
2118
  },
2119
  {
2120
  "epoch": 1.33,
2121
+ "grad_norm": 0.4100875999463757,
2122
  "learning_rate": 6.116104313975267e-06,
2123
  "loss": 2.1215,
2124
  "step": 1500
2125
  },
2126
  {
2127
  "epoch": 1.33,
2128
+ "grad_norm": 0.4035150398104019,
2129
  "learning_rate": 6.044986748211556e-06,
2130
  "loss": 2.0906,
2131
  "step": 1505
2132
  },
2133
  {
2134
  "epoch": 1.34,
2135
+ "grad_norm": 0.3889743143857003,
2136
  "learning_rate": 5.974105522935416e-06,
2137
  "loss": 2.1004,
2138
  "step": 1510
2139
  },
2140
  {
2141
  "epoch": 1.34,
2142
+ "grad_norm": 0.3965299458622196,
2143
  "learning_rate": 5.903464873809854e-06,
2144
  "loss": 2.0894,
2145
  "step": 1515
2146
  },
2147
  {
2148
  "epoch": 1.35,
2149
+ "grad_norm": 0.4648219163263186,
2150
  "learning_rate": 5.833069022121727e-06,
2151
  "loss": 2.1079,
2152
  "step": 1520
2153
  },
2154
  {
2155
  "epoch": 1.35,
2156
+ "grad_norm": 0.4020431492622865,
2157
  "learning_rate": 5.762922174529482e-06,
2158
  "loss": 2.0802,
2159
  "step": 1525
2160
  },
2161
  {
2162
  "epoch": 1.36,
2163
+ "grad_norm": 0.4251665370252372,
2164
  "learning_rate": 5.693028522811783e-06,
2165
  "loss": 2.0999,
2166
  "step": 1530
2167
  },
2168
  {
2169
  "epoch": 1.36,
2170
+ "grad_norm": 0.41522383258788625,
2171
  "learning_rate": 5.6233922436170205e-06,
2172
  "loss": 2.0991,
2173
  "step": 1535
2174
  },
2175
  {
2176
  "epoch": 1.36,
2177
+ "grad_norm": 0.4234451479989133,
2178
  "learning_rate": 5.5540174982137185e-06,
2179
  "loss": 2.1105,
2180
  "step": 1540
2181
  },
2182
  {
2183
  "epoch": 1.37,
2184
+ "grad_norm": 0.4096964557325324,
2185
  "learning_rate": 5.484908432241889e-06,
2186
+ "loss": 2.1047,
2187
  "step": 1545
2188
  },
2189
  {
2190
  "epoch": 1.37,
2191
+ "grad_norm": 0.3997042044620638,
2192
  "learning_rate": 5.416069175465274e-06,
2193
+ "loss": 2.1032,
2194
  "step": 1550
2195
  },
2196
  {
2197
  "epoch": 1.38,
2198
+ "grad_norm": 0.4082547088759691,
2199
  "learning_rate": 5.347503841524582e-06,
2200
  "loss": 2.1235,
2201
  "step": 1555
2202
  },
2203
  {
2204
  "epoch": 1.38,
2205
+ "grad_norm": 0.41818638001837766,
2206
  "learning_rate": 5.279216527691657e-06,
2207
  "loss": 2.1116,
2208
  "step": 1560
2209
  },
2210
  {
2211
  "epoch": 1.39,
2212
+ "grad_norm": 0.4188081490024075,
2213
  "learning_rate": 5.211211314624653e-06,
2214
  "loss": 2.1007,
2215
  "step": 1565
2216
  },
2217
  {
2218
  "epoch": 1.39,
2219
+ "grad_norm": 0.41766241522336955,
2220
  "learning_rate": 5.143492266124164e-06,
2221
+ "loss": 2.1084,
2222
  "step": 1570
2223
  },
2224
  {
2225
  "epoch": 1.4,
2226
+ "grad_norm": 0.3941481034674606,
2227
  "learning_rate": 5.076063428890393e-06,
2228
  "loss": 2.1039,
2229
  "step": 1575
2230
  },
2231
  {
2232
  "epoch": 1.4,
2233
+ "grad_norm": 0.4234050444874177,
2234
  "learning_rate": 5.008928832281339e-06,
2235
  "loss": 2.095,
2236
  "step": 1580
2237
  },
2238
  {
2239
  "epoch": 1.4,
2240
+ "grad_norm": 0.42252410020979136,
2241
  "learning_rate": 4.942092488072e-06,
2242
+ "loss": 2.1065,
2243
  "step": 1585
2244
  },
2245
  {
2246
  "epoch": 1.41,
2247
+ "grad_norm": 0.41261067702886783,
2248
  "learning_rate": 4.875558390214652e-06,
2249
  "loss": 2.0944,
2250
  "step": 1590
2251
  },
2252
  {
2253
  "epoch": 1.41,
2254
+ "grad_norm": 0.41835742656052355,
2255
  "learning_rate": 4.8093305146001815e-06,
2256
  "loss": 2.0941,
2257
  "step": 1595
2258
  },
2259
  {
2260
  "epoch": 1.42,
2261
+ "grad_norm": 0.413982332712424,
2262
  "learning_rate": 4.743412818820488e-06,
2263
  "loss": 2.1052,
2264
  "step": 1600
2265
  },
2266
  {
2267
  "epoch": 1.42,
2268
+ "grad_norm": 0.39906438790571663,
2269
  "learning_rate": 4.677809241931994e-06,
2270
  "loss": 2.1039,
2271
  "step": 1605
2272
  },
2273
  {
2274
  "epoch": 1.43,
2275
+ "grad_norm": 0.4062409617387541,
2276
  "learning_rate": 4.612523704220264e-06,
2277
  "loss": 2.1022,
2278
  "step": 1610
2279
  },
2280
  {
2281
  "epoch": 1.43,
2282
+ "grad_norm": 0.4380990308522443,
2283
  "learning_rate": 4.5475601069657304e-06,
2284
  "loss": 2.1029,
2285
  "step": 1615
2286
  },
2287
  {
2288
  "epoch": 1.43,
2289
+ "grad_norm": 0.4640210302953327,
2290
  "learning_rate": 4.482922332210569e-06,
2291
  "loss": 2.0943,
2292
  "step": 1620
2293
  },
2294
  {
2295
  "epoch": 1.44,
2296
+ "grad_norm": 0.4561287177059882,
2297
  "learning_rate": 4.418614242526717e-06,
2298
  "loss": 2.0889,
2299
  "step": 1625
2300
  },
2301
  {
2302
  "epoch": 1.44,
2303
+ "grad_norm": 0.39324073539195364,
2304
  "learning_rate": 4.354639680785059e-06,
2305
  "loss": 2.1044,
2306
  "step": 1630
2307
  },
2308
  {
2309
  "epoch": 1.45,
2310
+ "grad_norm": 0.3976959516919322,
2311
  "learning_rate": 4.291002469925782e-06,
2312
  "loss": 2.1184,
2313
  "step": 1635
2314
  },
2315
  {
2316
  "epoch": 1.45,
2317
+ "grad_norm": 0.4250848928093303,
2318
  "learning_rate": 4.227706412729943e-06,
2319
+ "loss": 2.1015,
2320
  "step": 1640
2321
  },
2322
  {
2323
  "epoch": 1.46,
2324
+ "grad_norm": 0.38268455321701433,
2325
  "learning_rate": 4.1647552915922e-06,
2326
  "loss": 2.0999,
2327
  "step": 1645
2328
  },
2329
  {
2330
  "epoch": 1.46,
2331
+ "grad_norm": 0.4043023039481698,
2332
  "learning_rate": 4.1021528682948064e-06,
2333
  "loss": 2.0864,
2334
  "step": 1650
2335
  },
2336
  {
2337
  "epoch": 1.47,
2338
+ "grad_norm": 0.38846037782467163,
2339
  "learning_rate": 4.039902883782814e-06,
2340
  "loss": 2.1092,
2341
  "step": 1655
2342
  },
2343
  {
2344
  "epoch": 1.47,
2345
+ "grad_norm": 0.40424586787374106,
2346
  "learning_rate": 3.978009057940518e-06,
2347
  "loss": 2.1063,
2348
  "step": 1660
2349
  },
2350
  {
2351
  "epoch": 1.47,
2352
+ "grad_norm": 0.3922816188111355,
2353
  "learning_rate": 3.916475089369175e-06,
2354
+ "loss": 2.099,
2355
  "step": 1665
2356
  },
2357
  {
2358
  "epoch": 1.48,
2359
+ "grad_norm": 0.4069372600164316,
2360
  "learning_rate": 3.855304655165978e-06,
2361
  "loss": 2.111,
2362
  "step": 1670
2363
  },
2364
  {
2365
  "epoch": 1.48,
2366
+ "grad_norm": 0.4149213347066681,
2367
  "learning_rate": 3.794501410704331e-06,
2368
+ "loss": 2.1022,
2369
  "step": 1675
2370
  },
2371
  {
2372
  "epoch": 1.49,
2373
+ "grad_norm": 0.39873703123061466,
2374
  "learning_rate": 3.7340689894154023e-06,
2375
  "loss": 2.0816,
2376
  "step": 1680
2377
  },
2378
  {
2379
  "epoch": 1.49,
2380
+ "grad_norm": 0.39381797320899137,
2381
  "learning_rate": 3.674011002571022e-06,
2382
  "loss": 2.0942,
2383
  "step": 1685
2384
  },
2385
  {
2386
  "epoch": 1.5,
2387
+ "grad_norm": 0.390940916824622,
2388
  "learning_rate": 3.6143310390678544e-06,
2389
+ "loss": 2.0923,
2390
  "step": 1690
2391
  },
2392
  {
2393
  "epoch": 1.5,
2394
+ "grad_norm": 0.4389483255039723,
2395
  "learning_rate": 3.555032665212964e-06,
2396
  "loss": 2.1227,
2397
  "step": 1695
2398
  },
2399
  {
2400
  "epoch": 1.51,
2401
+ "grad_norm": 0.4185120148672096,
2402
  "learning_rate": 3.496119424510678e-06,
2403
  "loss": 2.0929,
2404
  "step": 1700
2405
  },
2406
  {
2407
  "epoch": 1.51,
2408
+ "grad_norm": 0.41799065246733313,
2409
  "learning_rate": 3.4375948374508516e-06,
2410
  "loss": 2.0741,
2411
  "step": 1705
2412
  },
2413
  {
2414
  "epoch": 1.51,
2415
+ "grad_norm": 0.4103681102471153,
2416
  "learning_rate": 3.3794624012984913e-06,
2417
  "loss": 2.0993,
2418
  "step": 1710
2419
  },
2420
  {
2421
  "epoch": 1.52,
2422
+ "grad_norm": 0.4418789396816657,
2423
  "learning_rate": 3.3217255898847635e-06,
2424
  "loss": 2.0996,
2425
  "step": 1715
2426
  },
2427
  {
2428
  "epoch": 1.52,
2429
+ "grad_norm": 0.4048227744685057,
2430
  "learning_rate": 3.2643878533994145e-06,
2431
  "loss": 2.1051,
2432
  "step": 1720
2433
  },
2434
  {
2435
  "epoch": 1.53,
2436
+ "grad_norm": 0.3943223755161985,
2437
  "learning_rate": 3.20745261818459e-06,
2438
+ "loss": 2.0883,
2439
  "step": 1725
2440
  },
2441
  {
2442
  "epoch": 1.53,
2443
+ "grad_norm": 0.39426422818649126,
2444
  "learning_rate": 3.1509232865300886e-06,
2445
+ "loss": 2.0967,
2446
  "step": 1730
2447
  },
2448
  {
2449
  "epoch": 1.54,
2450
+ "grad_norm": 0.38700690171764346,
2451
  "learning_rate": 3.09480323647006e-06,
2452
  "loss": 2.0907,
2453
  "step": 1735
2454
  },
2455
  {
2456
  "epoch": 1.54,
2457
+ "grad_norm": 0.3966796530994539,
2458
  "learning_rate": 3.039095821581127e-06,
2459
+ "loss": 2.1031,
2460
  "step": 1740
2461
  },
2462
  {
2463
  "epoch": 1.55,
2464
+ "grad_norm": 0.39218836669208584,
2465
  "learning_rate": 2.983804370781996e-06,
2466
  "loss": 2.1006,
2467
  "step": 1745
2468
  },
2469
  {
2470
  "epoch": 1.55,
2471
+ "grad_norm": 0.3917800151933228,
2472
  "learning_rate": 2.9289321881345257e-06,
2473
+ "loss": 2.1113,
2474
  "step": 1750
2475
  },
2476
  {
2477
  "epoch": 1.55,
2478
+ "grad_norm": 0.4191497902264968,
2479
  "learning_rate": 2.8744825526462882e-06,
2480
  "loss": 2.1027,
2481
  "step": 1755
2482
  },
2483
  {
2484
  "epoch": 1.56,
2485
+ "grad_norm": 0.3855110275882673,
2486
  "learning_rate": 2.8204587180746256e-06,
2487
+ "loss": 2.1024,
2488
  "step": 1760
2489
  },
2490
  {
2491
  "epoch": 1.56,
2492
+ "grad_norm": 0.4039614288126536,
2493
  "learning_rate": 2.7668639127322084e-06,
2494
  "loss": 2.1158,
2495
  "step": 1765
2496
  },
2497
  {
2498
  "epoch": 1.57,
2499
+ "grad_norm": 0.3831972297482568,
2500
  "learning_rate": 2.713701339294129e-06,
2501
  "loss": 2.0938,
2502
  "step": 1770
2503
  },
2504
  {
2505
  "epoch": 1.57,
2506
+ "grad_norm": 0.41044962218424763,
2507
  "learning_rate": 2.66097417460651e-06,
2508
  "loss": 2.0973,
2509
  "step": 1775
2510
  },
2511
  {
2512
  "epoch": 1.58,
2513
+ "grad_norm": 0.3820988690333304,
2514
  "learning_rate": 2.6086855694966795e-06,
2515
+ "loss": 2.0822,
2516
  "step": 1780
2517
  },
2518
  {
2519
  "epoch": 1.58,
2520
+ "grad_norm": 0.4032872025327132,
2521
  "learning_rate": 2.5568386485848663e-06,
2522
  "loss": 2.1023,
2523
  "step": 1785
2524
  },
2525
  {
2526
  "epoch": 1.59,
2527
+ "grad_norm": 0.38963938048915986,
2528
  "learning_rate": 2.505436510097494e-06,
2529
  "loss": 2.1128,
2530
  "step": 1790
2531
  },
2532
  {
2533
  "epoch": 1.59,
2534
+ "grad_norm": 0.4156509435322595,
2535
  "learning_rate": 2.45448222568204e-06,
2536
  "loss": 2.108,
2537
  "step": 1795
2538
  },
2539
  {
2540
  "epoch": 1.59,
2541
+ "grad_norm": 0.39608532296147336,
2542
  "learning_rate": 2.4039788402234787e-06,
2543
+ "loss": 2.0814,
2544
  "step": 1800
2545
  },
2546
  {
2547
  "epoch": 1.6,
2548
+ "grad_norm": 0.39076693130582835,
2549
  "learning_rate": 2.3539293716623268e-06,
2550
  "loss": 2.1154,
2551
  "step": 1805
2552
  },
2553
  {
2554
  "epoch": 1.6,
2555
+ "grad_norm": 0.41114466213461554,
2556
  "learning_rate": 2.304336810814305e-06,
2557
+ "loss": 2.0987,
2558
  "step": 1810
2559
  },
2560
  {
2561
  "epoch": 1.61,
2562
+ "grad_norm": 0.393208663523278,
2563
  "learning_rate": 2.2552041211916052e-06,
2564
  "loss": 2.0972,
2565
  "step": 1815
2566
  },
2567
  {
2568
  "epoch": 1.61,
2569
+ "grad_norm": 0.4189608140637361,
2570
  "learning_rate": 2.2065342388258193e-06,
2571
  "loss": 2.0875,
2572
  "step": 1820
2573
  },
2574
  {
2575
  "epoch": 1.62,
2576
+ "grad_norm": 0.4055460138739848,
2577
  "learning_rate": 2.1583300720924604e-06,
2578
  "loss": 2.0744,
2579
  "step": 1825
2580
  },
2581
  {
2582
  "epoch": 1.62,
2583
+ "grad_norm": 0.40958882001013913,
2584
  "learning_rate": 2.1105945015371985e-06,
2585
  "loss": 2.092,
2586
  "step": 1830
2587
  },
2588
  {
2589
  "epoch": 1.63,
2590
+ "grad_norm": 0.40591763869925823,
2591
  "learning_rate": 2.063330379703702e-06,
2592
  "loss": 2.1053,
2593
  "step": 1835
2594
  },
2595
  {
2596
  "epoch": 1.63,
2597
+ "grad_norm": 0.39520037641985134,
2598
  "learning_rate": 2.016540530963188e-06,
2599
+ "loss": 2.0907,
2600
  "step": 1840
2601
  },
2602
  {
2603
  "epoch": 1.63,
2604
+ "grad_norm": 0.3966353129481412,
2605
  "learning_rate": 1.9702277513456493e-06,
2606
  "loss": 2.1015,
2607
  "step": 1845
2608
  },
2609
  {
2610
  "epoch": 1.64,
2611
+ "grad_norm": 0.39424791708720475,
2612
  "learning_rate": 1.9243948083727626e-06,
2613
  "loss": 2.1065,
2614
  "step": 1850
2615
  },
2616
  {
2617
  "epoch": 1.64,
2618
+ "grad_norm": 0.3780834002401699,
2619
  "learning_rate": 1.879044440892517e-06,
2620
+ "loss": 2.105,
2621
  "step": 1855
2622
  },
2623
  {
2624
  "epoch": 1.65,
2625
+ "grad_norm": 0.4052494322948412,
2626
  "learning_rate": 1.8341793589155444e-06,
2627
  "loss": 2.0941,
2628
  "step": 1860
2629
  },
2630
  {
2631
  "epoch": 1.65,
2632
+ "grad_norm": 0.398956260434951,
2633
  "learning_rate": 1.789802243453178e-06,
2634
  "loss": 2.0958,
2635
  "step": 1865
2636
  },
2637
  {
2638
  "epoch": 1.66,
2639
+ "grad_norm": 0.4421180394888075,
2640
  "learning_rate": 1.7459157463572396e-06,
2641
  "loss": 2.1184,
2642
  "step": 1870
2643
  },
2644
  {
2645
  "epoch": 1.66,
2646
+ "grad_norm": 0.4024654751913048,
2647
  "learning_rate": 1.7025224901615811e-06,
2648
  "loss": 2.087,
2649
  "step": 1875
2650
  },
2651
  {
2652
  "epoch": 1.67,
2653
+ "grad_norm": 0.4030389382667108,
2654
  "learning_rate": 1.6596250679253568e-06,
2655
  "loss": 2.1043,
2656
  "step": 1880
2657
  },
2658
  {
2659
  "epoch": 1.67,
2660
+ "grad_norm": 0.40272562891260966,
2661
  "learning_rate": 1.6172260430780772e-06,
2662
  "loss": 2.0896,
2663
  "step": 1885
2664
  },
2665
  {
2666
  "epoch": 1.67,
2667
+ "grad_norm": 0.4035871482994007,
2668
  "learning_rate": 1.5753279492664264e-06,
2669
+ "loss": 2.0808,
2670
  "step": 1890
2671
  },
2672
  {
2673
  "epoch": 1.68,
2674
+ "grad_norm": 0.4039426831776706,
2675
  "learning_rate": 1.5339332902028537e-06,
2676
  "loss": 2.1114,
2677
  "step": 1895
2678
  },
2679
  {
2680
  "epoch": 1.68,
2681
+ "grad_norm": 0.4002984146786066,
2682
  "learning_rate": 1.493044539515961e-06,
2683
  "loss": 2.1031,
2684
  "step": 1900
2685
  },
2686
  {
2687
  "epoch": 1.69,
2688
+ "grad_norm": 0.39867014379856125,
2689
  "learning_rate": 1.4526641406026898e-06,
2690
  "loss": 2.1044,
2691
  "step": 1905
2692
  },
2693
  {
2694
  "epoch": 1.69,
2695
+ "grad_norm": 0.39343752425975886,
2696
  "learning_rate": 1.4127945064823023e-06,
2697
  "loss": 2.0905,
2698
  "step": 1910
2699
  },
2700
  {
2701
  "epoch": 1.7,
2702
+ "grad_norm": 0.3862488530579884,
2703
  "learning_rate": 1.3734380196521923e-06,
2704
  "loss": 2.0806,
2705
  "step": 1915
2706
  },
2707
  {
2708
  "epoch": 1.7,
2709
+ "grad_norm": 0.3856981650198434,
2710
  "learning_rate": 1.334597031945517e-06,
2711
  "loss": 2.0962,
2712
  "step": 1920
2713
  },
2714
  {
2715
  "epoch": 1.71,
2716
+ "grad_norm": 0.39778936805885257,
2717
  "learning_rate": 1.296273864390646e-06,
2718
  "loss": 2.0921,
2719
  "step": 1925
2720
  },
2721
  {
2722
  "epoch": 1.71,
2723
+ "grad_norm": 0.4011081821698257,
2724
  "learning_rate": 1.2584708070724738e-06,
2725
  "loss": 2.0863,
2726
  "step": 1930
2727
  },
2728
  {
2729
  "epoch": 1.71,
2730
+ "grad_norm": 0.4111465470087662,
2731
  "learning_rate": 1.2211901189955689e-06,
2732
  "loss": 2.0845,
2733
  "step": 1935
2734
  },
2735
  {
2736
  "epoch": 1.72,
2737
+ "grad_norm": 0.41517182840169037,
2738
  "learning_rate": 1.1844340279491772e-06,
2739
  "loss": 2.091,
2740
  "step": 1940
2741
  },
2742
  {
2743
  "epoch": 1.72,
2744
+ "grad_norm": 0.37709500205555146,
2745
  "learning_rate": 1.1482047303740996e-06,
2746
  "loss": 2.1058,
2747
  "step": 1945
2748
  },
2749
  {
2750
  "epoch": 1.73,
2751
+ "grad_norm": 0.37953080829155994,
2752
  "learning_rate": 1.1125043912314438e-06,
2753
  "loss": 2.0792,
2754
  "step": 1950
2755
  },
2756
  {
2757
  "epoch": 1.73,
2758
+ "grad_norm": 0.3950572745024171,
2759
  "learning_rate": 1.0773351438732392e-06,
2760
  "loss": 2.0941,
2761
  "step": 1955
2762
  },
2763
  {
2764
  "epoch": 1.74,
2765
+ "grad_norm": 0.3963540647101632,
2766
  "learning_rate": 1.0426990899149658e-06,
2767
  "loss": 2.1108,
2768
  "step": 1960
2769
  },
2770
  {
2771
  "epoch": 1.74,
2772
+ "grad_norm": 0.41568275143765726,
2773
  "learning_rate": 1.0085982991099585e-06,
2774
  "loss": 2.0842,
2775
  "step": 1965
2776
  },
2777
  {
2778
  "epoch": 1.74,
2779
+ "grad_norm": 0.40339761288384357,
2780
  "learning_rate": 9.750348092257368e-07,
2781
  "loss": 2.1133,
2782
  "step": 1970
2783
  },
2784
  {
2785
  "epoch": 1.75,
2786
+ "grad_norm": 0.40150493005304355,
2787
  "learning_rate": 9.420106259222184e-07,
2788
+ "loss": 2.0978,
2789
  "step": 1975
2790
  },
2791
  {
2792
  "epoch": 1.75,
2793
+ "grad_norm": 0.3867737463604247,
2794
  "learning_rate": 9.095277226318766e-07,
2795
  "loss": 2.106,
2796
  "step": 1980
2797
  },
2798
  {
2799
  "epoch": 1.76,
2800
+ "grad_norm": 0.39268212242665723,
2801
  "learning_rate": 8.775880404418113e-07,
2802
  "loss": 2.1073,
2803
  "step": 1985
2804
  },
2805
  {
2806
  "epoch": 1.76,
2807
+ "grad_norm": 0.39360186170964856,
2808
  "learning_rate": 8.461934879777545e-07,
2809
  "loss": 2.107,
2810
  "step": 1990
2811
  },
2812
  {
2813
  "epoch": 1.77,
2814
+ "grad_norm": 0.3913381025647452,
2815
  "learning_rate": 8.153459412900156e-07,
2816
  "loss": 2.1047,
2817
  "step": 1995
2818
  },
2819
  {
2820
  "epoch": 1.77,
2821
+ "grad_norm": 0.38805441021421166,
2822
  "learning_rate": 7.850472437413748e-07,
2823
  "loss": 2.0887,
2824
  "step": 2000
2825
  },
2826
  {
2827
  "epoch": 1.78,
2828
+ "grad_norm": 0.4100490913779501,
2829
  "learning_rate": 7.552992058969299e-07,
2830
  "loss": 2.0946,
2831
  "step": 2005
2832
  },
2833
  {
2834
  "epoch": 1.78,
2835
+ "grad_norm": 0.40381558154437197,
2836
  "learning_rate": 7.261036054158965e-07,
2837
  "loss": 2.0936,
2838
  "step": 2010
2839
  },
2840
  {
2841
  "epoch": 1.78,
2842
+ "grad_norm": 0.38988389242579946,
2843
  "learning_rate": 6.974621869453924e-07,
2844
  "loss": 2.1059,
2845
  "step": 2015
2846
  },
2847
  {
2848
  "epoch": 1.79,
2849
+ "grad_norm": 0.40333371645506444,
2850
  "learning_rate": 6.693766620161691e-07,
2851
  "loss": 2.0739,
2852
  "step": 2020
2853
  },
2854
  {
2855
  "epoch": 1.79,
2856
+ "grad_norm": 0.3991452959215482,
2857
  "learning_rate": 6.418487089403392e-07,
2858
  "loss": 2.0968,
2859
  "step": 2025
2860
  },
2861
  {
2862
  "epoch": 1.8,
2863
+ "grad_norm": 0.4122588062715049,
2864
  "learning_rate": 6.148799727110911e-07,
2865
  "loss": 2.113,
2866
  "step": 2030
2867
  },
2868
  {
2869
  "epoch": 1.8,
2870
+ "grad_norm": 0.4031821625034364,
2871
  "learning_rate": 5.884720649043807e-07,
2872
+ "loss": 2.1007,
2873
  "step": 2035
2874
  },
2875
  {
2876
  "epoch": 1.81,
2877
+ "grad_norm": 0.3997787254136987,
2878
  "learning_rate": 5.626265635826367e-07,
2879
  "loss": 2.0865,
2880
  "step": 2040
2881
  },
2882
  {
2883
  "epoch": 1.81,
2884
+ "grad_norm": 0.41541986135925696,
2885
  "learning_rate": 5.373450132004499e-07,
2886
  "loss": 2.0913,
2887
  "step": 2045
2888
  },
2889
  {
2890
  "epoch": 1.82,
2891
+ "grad_norm": 0.39924447549454817,
2892
  "learning_rate": 5.126289245122906e-07,
2893
  "loss": 2.0932,
2894
  "step": 2050
2895
  },
2896
  {
2897
  "epoch": 1.82,
2898
+ "grad_norm": 0.3876113975609659,
2899
  "learning_rate": 4.884797744822212e-07,
2900
  "loss": 2.0978,
2901
  "step": 2055
2902
  },
2903
  {
2904
  "epoch": 1.82,
2905
+ "grad_norm": 0.38102064194275936,
2906
  "learning_rate": 4.648990061956493e-07,
2907
  "loss": 2.07,
2908
  "step": 2060
2909
  },
2910
  {
2911
  "epoch": 1.83,
2912
+ "grad_norm": 0.4078023295865823,
2913
  "learning_rate": 4.418880287730798e-07,
2914
  "loss": 2.1086,
2915
  "step": 2065
2916
  },
2917
  {
2918
  "epoch": 1.83,
2919
+ "grad_norm": 0.39162424207146235,
2920
  "learning_rate": 4.194482172859127e-07,
2921
  "loss": 2.1096,
2922
  "step": 2070
2923
  },
2924
  {
2925
  "epoch": 1.84,
2926
+ "grad_norm": 0.3909418495579691,
2927
  "learning_rate": 3.9758091267428245e-07,
2928
  "loss": 2.1058,
2929
  "step": 2075
2930
  },
2931
  {
2932
  "epoch": 1.84,
2933
+ "grad_norm": 0.40147038699020526,
2934
  "learning_rate": 3.762874216669166e-07,
2935
  "loss": 2.0968,
2936
  "step": 2080
2937
  },
2938
  {
2939
  "epoch": 1.85,
2940
+ "grad_norm": 0.3829646525328004,
2941
  "learning_rate": 3.555690167030512e-07,
2942
+ "loss": 2.0942,
2943
  "step": 2085
2944
  },
2945
  {
2946
  "epoch": 1.85,
2947
+ "grad_norm": 0.3862469704899032,
2948
  "learning_rate": 3.354269358563966e-07,
2949
  "loss": 2.0752,
2950
  "step": 2090
2951
  },
2952
  {
2953
  "epoch": 1.86,
2954
+ "grad_norm": 0.4002412540219113,
2955
  "learning_rate": 3.158623827611529e-07,
2956
  "loss": 2.082,
2957
  "step": 2095
2958
  },
2959
  {
2960
  "epoch": 1.86,
2961
+ "grad_norm": 0.39128463328310525,
2962
  "learning_rate": 2.968765265400808e-07,
2963
+ "loss": 2.1094,
2964
  "step": 2100
2965
  },
2966
  {
2967
  "epoch": 1.86,
2968
+ "grad_norm": 0.389066863845164,
2969
  "learning_rate": 2.784705017346423e-07,
2970
  "loss": 2.1011,
2971
  "step": 2105
2972
  },
2973
  {
2974
  "epoch": 1.87,
2975
+ "grad_norm": 0.41577012460693696,
2976
  "learning_rate": 2.606454082372045e-07,
2977
  "loss": 2.118,
2978
  "step": 2110
2979
  },
2980
  {
2981
  "epoch": 1.87,
2982
+ "grad_norm": 0.39352078147596564,
2983
  "learning_rate": 2.4340231122530477e-07,
2984
  "loss": 2.091,
2985
  "step": 2115
2986
  },
2987
  {
2988
  "epoch": 1.88,
2989
+ "grad_norm": 0.397169174292838,
2990
  "learning_rate": 2.2674224109800913e-07,
2991
  "loss": 2.1105,
2992
  "step": 2120
2993
  },
2994
  {
2995
  "epoch": 1.88,
2996
+ "grad_norm": 0.4067199205244666,
2997
  "learning_rate": 2.106661934143317e-07,
2998
  "loss": 2.0776,
2999
  "step": 2125
3000
  },
3001
  {
3002
  "epoch": 1.89,
3003
+ "grad_norm": 0.40084827816794355,
3004
  "learning_rate": 1.9517512883374667e-07,
3005
+ "loss": 2.0766,
3006
  "step": 2130
3007
  },
3008
  {
3009
  "epoch": 1.89,
3010
+ "grad_norm": 0.39440381008574477,
3011
  "learning_rate": 1.802699730587798e-07,
3012
  "loss": 2.0959,
3013
  "step": 2135
3014
  },
3015
  {
3016
  "epoch": 1.9,
3017
+ "grad_norm": 0.38231822526623593,
3018
  "learning_rate": 1.659516167796904e-07,
3019
  "loss": 2.0748,
3020
  "step": 2140
3021
  },
3022
  {
3023
  "epoch": 1.9,
3024
+ "grad_norm": 0.38955827108970986,
3025
  "learning_rate": 1.522209156212484e-07,
3026
  "loss": 2.095,
3027
  "step": 2145
3028
  },
3029
  {
3030
  "epoch": 1.9,
3031
+ "grad_norm": 0.3889290130329203,
3032
  "learning_rate": 1.3907869009160525e-07,
3033
  "loss": 2.0935,
3034
  "step": 2150
3035
  },
3036
  {
3037
  "epoch": 1.91,
3038
+ "grad_norm": 0.4057029376556727,
3039
  "learning_rate": 1.265257255332586e-07,
3040
+ "loss": 2.0796,
3041
  "step": 2155
3042
  },
3043
  {
3044
  "epoch": 1.91,
3045
+ "grad_norm": 0.4003272379090444,
3046
  "learning_rate": 1.1456277207612554e-07,
3047
  "loss": 2.1011,
3048
  "step": 2160
3049
  },
3050
  {
3051
  "epoch": 1.92,
3052
+ "grad_norm": 0.39936245493182826,
3053
  "learning_rate": 1.0319054459271837e-07,
3054
+ "loss": 2.1073,
3055
  "step": 2165
3056
  },
3057
  {
3058
  "epoch": 1.92,
3059
+ "grad_norm": 0.39419603933628955,
3060
  "learning_rate": 9.240972265541992e-08,
3061
  "loss": 2.1013,
3062
  "step": 2170
3063
  },
3064
  {
3065
  "epoch": 1.93,
3066
+ "grad_norm": 0.39469369641995194,
3067
  "learning_rate": 8.222095049588264e-08,
3068
  "loss": 2.0907,
3069
  "step": 2175
3070
  },
3071
  {
3072
  "epoch": 1.93,
3073
+ "grad_norm": 0.3888813295747844,
3074
  "learning_rate": 7.262483696652167e-08,
3075
  "loss": 2.1052,
3076
  "step": 2180
3077
  },
3078
  {
3079
  "epoch": 1.94,
3080
+ "grad_norm": 0.3898068967567598,
3081
  "learning_rate": 6.362195550413953e-08,
3082
  "loss": 2.0855,
3083
  "step": 2185
3084
  },
3085
  {
3086
  "epoch": 1.94,
3087
+ "grad_norm": 0.38592800803774174,
3088
  "learning_rate": 5.521284409565675e-08,
3089
  "loss": 2.0939,
3090
  "step": 2190
3091
  },
3092
  {
3093
  "epoch": 1.94,
3094
+ "grad_norm": 0.41041114740866647,
3095
  "learning_rate": 4.739800524595884e-08,
3096
  "loss": 2.1071,
3097
  "step": 2195
3098
  },
3099
  {
3100
  "epoch": 1.95,
3101
+ "grad_norm": 0.3962264267381791,
3102
  "learning_rate": 4.017790594787574e-08,
3103
+ "loss": 2.0963,
3104
  "step": 2200
3105
  },
3106
  {
3107
  "epoch": 1.95,
3108
+ "grad_norm": 0.3998693452725598,
3109
  "learning_rate": 3.355297765426868e-08,
3110
  "loss": 2.0959,
3111
  "step": 2205
3112
  },
3113
  {
3114
  "epoch": 1.96,
3115
+ "grad_norm": 0.38673248438287033,
3116
  "learning_rate": 2.7523616252252972e-08,
3117
+ "loss": 2.08,
3118
  "step": 2210
3119
  },
3120
  {
3121
  "epoch": 1.96,
3122
+ "grad_norm": 0.38881590946908995,
3123
  "learning_rate": 2.2090182039538055e-08,
3124
  "loss": 2.0808,
3125
  "step": 2215
3126
  },
3127
  {
3128
  "epoch": 1.97,
3129
+ "grad_norm": 0.38221362495592137,
3130
  "learning_rate": 1.7252999702894736e-08,
3131
  "loss": 2.089,
3132
  "step": 2220
3133
  },
3134
  {
3135
  "epoch": 1.97,
3136
+ "grad_norm": 0.3823748413744665,
3137
  "learning_rate": 1.3012358298760686e-08,
3138
  "loss": 2.0984,
3139
  "step": 2225
3140
  },
3141
  {
3142
  "epoch": 1.98,
3143
+ "grad_norm": 0.4178774637846201,
3144
  "learning_rate": 9.368511235958722e-09,
3145
  "loss": 2.1074,
3146
  "step": 2230
3147
  },
3148
  {
3149
  "epoch": 1.98,
3150
+ "grad_norm": 0.3886650486408838,
3151
  "learning_rate": 6.3216762605589064e-09,
3152
  "loss": 2.0877,
3153
  "step": 2235
3154
  },
3155
  {
3156
  "epoch": 1.98,
3157
+ "grad_norm": 0.38036565966227426,
3158
  "learning_rate": 3.87203544286563e-09,
3159
  "loss": 2.1056,
3160
  "step": 2240
3161
  },
3162
  {
3163
  "epoch": 1.99,
3164
+ "grad_norm": 0.3937092650401667,
3165
  "learning_rate": 2.019735166534087e-09,
3166
  "loss": 2.0838,
3167
  "step": 2245
3168
  },
3169
  {
3170
  "epoch": 1.99,
3171
+ "grad_norm": 0.4224908408185913,
3172
  "learning_rate": 7.648861198306101e-10,
3173
  "loss": 2.1125,
3174
  "step": 2250
3175
  },
3176
  {
3177
  "epoch": 2.0,
3178
+ "grad_norm": 0.40597066232796053,
3179
  "learning_rate": 1.0756328901018188e-10,
3180
+ "loss": 2.0928,
3181
  "step": 2255
3182
  },
3183
  {
3184
  "epoch": 2.0,
3185
+ "eval_loss": 2.105895519256592,
3186
+ "eval_runtime": 22.5515,
3187
+ "eval_samples_per_second": 1418.042,
3188
+ "eval_steps_per_second": 22.171,
3189
  "step": 2258
3190
  },
3191
  {
3192
  "epoch": 2.0,
3193
  "step": 2258,
3194
  "total_flos": 13637863342080.0,
3195
+ "train_loss": 2.1773926187554116,
3196
+ "train_runtime": 648.0178,
3197
+ "train_samples_per_second": 891.729,
3198
+ "train_steps_per_second": 3.484
3199
  }
3200
  ],
3201
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45f8b6cc78440714767f67b8543c1cc3b8156360ee0d45609137cfc23fa9e437
3
  size 6072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d203f29511be687a1a4537fb517d729383833cd200fcef8fd39067853f096759
3
  size 6072