hZzy commited on
Commit
051a63e
1 Parent(s): 6a164d2

Model save

Browse files
README.md CHANGED
@@ -2,15 +2,12 @@
2
  license: apache-2.0
3
  base_model: Qwen/Qwen2-0.5B
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
 
11
  - generated_from_trainer
12
  datasets:
13
- - HuggingFaceH4/ultrachat_200k
14
  model-index:
15
  - name: qwen2-0.5b-sft
16
  results: []
@@ -19,10 +16,10 @@ model-index:
19
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
20
  should probably proofread and complete it, then remove this comment. -->
21
 
22
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/zhiyuzha-university-of-florida/huggingface/runs/xiljxzlv)
23
  # qwen2-0.5b-sft
24
 
25
- This model is a fine-tuned version of [Qwen/Qwen2-0.5B](https://huggingface.co/Qwen/Qwen2-0.5B) on the HuggingFaceH4/ultrachat_200k dataset.
26
  It achieves the following results on the evaluation set:
27
  - Loss: 1.5327
28
 
 
2
  license: apache-2.0
3
  base_model: Qwen/Qwen2-0.5B
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
+ - alignment-handbook
8
  - generated_from_trainer
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: qwen2-0.5b-sft
13
  results: []
 
16
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
  should probably proofread and complete it, then remove this comment. -->
18
 
19
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/zhiyuzha-university-of-florida/huggingface/runs/i7u6rhg6)
20
  # qwen2-0.5b-sft
21
 
22
+ This model is a fine-tuned version of [Qwen/Qwen2-0.5B](https://huggingface.co/Qwen/Qwen2-0.5B) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
  - Loss: 1.5327
25
 
all_results.json CHANGED
@@ -5,10 +5,10 @@
5
  "eval_samples": 23109,
6
  "eval_samples_per_second": 117.789,
7
  "eval_steps_per_second": 4.911,
8
- "total_flos": 106151314194432.0,
9
- "train_loss": 1.5477958324414178,
10
- "train_runtime": 7077.9094,
11
  "train_samples": 207864,
12
- "train_samples_per_second": 34.147,
13
- "train_steps_per_second": 0.178
14
  }
 
5
  "eval_samples": 23109,
6
  "eval_samples_per_second": 117.789,
7
  "eval_steps_per_second": 4.911,
8
+ "total_flos": 106140763422720.0,
9
+ "train_loss": 1.5477893754295022,
10
+ "train_runtime": 7899.7649,
11
  "train_samples": 207864,
12
+ "train_samples_per_second": 30.594,
13
+ "train_steps_per_second": 0.159
14
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "float16",
25
  "transformers_version": "4.42.0",
26
- "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151646
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "float16",
25
  "transformers_version": "4.42.0",
26
+ "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151646
29
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b74d2269794e95922a0105c706be76c9cd775f9596e41957d1490b1726489791
3
  size 987577856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e87fce8b36d878cc402f4983e370644357d946da7766b027599a6cfdcf8f83b
3
  size 987577856
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9993049349617714,
3
- "total_flos": 106151314194432.0,
4
- "train_loss": 1.5477958324414178,
5
- "train_runtime": 7077.9094,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 34.147,
8
- "train_steps_per_second": 0.178
9
  }
 
1
  {
2
  "epoch": 0.9993049349617714,
3
+ "total_flos": 106140763422720.0,
4
+ "train_loss": 1.5477893754295022,
5
+ "train_runtime": 7899.7649,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 30.594,
8
+ "train_steps_per_second": 0.159
9
  }
trainer_state.json CHANGED
@@ -10,1784 +10,1784 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0007943600436898023,
13
- "grad_norm": 11.121467420168102,
14
  "learning_rate": 1.5873015873015874e-07,
15
  "loss": 1.8009,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.003971800218449012,
20
- "grad_norm": 10.105126147283379,
21
  "learning_rate": 7.936507936507937e-07,
22
  "loss": 1.7719,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.007943600436898023,
27
- "grad_norm": 2.716144406794696,
28
  "learning_rate": 1.5873015873015873e-06,
29
  "loss": 1.7086,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.011915400655347037,
34
- "grad_norm": 1.5688914094087087,
35
  "learning_rate": 2.380952380952381e-06,
36
  "loss": 1.6525,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.015887200873796047,
41
- "grad_norm": 1.1647233888301007,
42
  "learning_rate": 3.1746031746031746e-06,
43
  "loss": 1.6423,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.01985900109224506,
48
- "grad_norm": 1.0271897567673653,
49
  "learning_rate": 3.968253968253968e-06,
50
  "loss": 1.6245,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.023830801310694073,
55
- "grad_norm": 0.951882231987063,
56
  "learning_rate": 4.761904761904762e-06,
57
  "loss": 1.6242,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.027802601529143083,
62
- "grad_norm": 0.9090288489527357,
63
  "learning_rate": 5.555555555555557e-06,
64
  "loss": 1.6067,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03177440174759209,
69
- "grad_norm": 0.928699319472879,
70
  "learning_rate": 6.349206349206349e-06,
71
  "loss": 1.5848,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.035746201966041107,
76
- "grad_norm": 0.8919167830175897,
77
  "learning_rate": 7.1428571428571436e-06,
78
  "loss": 1.6,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.03971800218449012,
83
- "grad_norm": 0.9666222091181399,
84
  "learning_rate": 7.936507936507936e-06,
85
  "loss": 1.6023,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.04368980240293913,
90
- "grad_norm": 0.9486935987547116,
91
  "learning_rate": 8.730158730158731e-06,
92
- "loss": 1.5825,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.04766160262138815,
97
- "grad_norm": 0.9143653123318525,
98
  "learning_rate": 9.523809523809525e-06,
99
  "loss": 1.5978,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05163340283983715,
104
- "grad_norm": 0.9168257475922792,
105
  "learning_rate": 1.031746031746032e-05,
106
  "loss": 1.6003,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.055605203058286166,
111
- "grad_norm": 0.919045459151749,
112
  "learning_rate": 1.1111111111111113e-05,
113
  "loss": 1.5856,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.05957700327673518,
118
- "grad_norm": 0.8902024273489265,
119
  "learning_rate": 1.1904761904761905e-05,
120
  "loss": 1.5846,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.06354880349518419,
125
- "grad_norm": 0.9673452898082758,
126
  "learning_rate": 1.2698412698412699e-05,
127
  "loss": 1.6089,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.0675206037136332,
132
- "grad_norm": 0.9325220485287637,
133
  "learning_rate": 1.3492063492063494e-05,
134
  "loss": 1.5815,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.07149240393208221,
139
- "grad_norm": 0.9819780029738107,
140
  "learning_rate": 1.4285714285714287e-05,
141
  "loss": 1.5816,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.07546420415053123,
146
- "grad_norm": 0.9031316483601133,
147
  "learning_rate": 1.507936507936508e-05,
148
  "loss": 1.5958,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.07943600436898024,
153
- "grad_norm": 0.9802215884208159,
154
  "learning_rate": 1.5873015873015872e-05,
155
  "loss": 1.5911,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.08340780458742925,
160
- "grad_norm": 0.9517468404747221,
161
  "learning_rate": 1.6666666666666667e-05,
162
  "loss": 1.5865,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.08737960480587827,
167
- "grad_norm": 0.9264528912180998,
168
  "learning_rate": 1.7460317460317463e-05,
169
  "loss": 1.5793,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.09135140502432727,
174
- "grad_norm": 0.9237176030611468,
175
  "learning_rate": 1.8253968253968254e-05,
176
  "loss": 1.5822,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.0953232052427763,
181
- "grad_norm": 0.9568296021604813,
182
  "learning_rate": 1.904761904761905e-05,
183
  "loss": 1.5718,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.0992950054612253,
188
- "grad_norm": 1.0111943996294692,
189
  "learning_rate": 1.9841269841269845e-05,
190
  "loss": 1.5773,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.1032668056796743,
195
- "grad_norm": 1.0398553358796776,
196
  "learning_rate": 1.999938384153589e-05,
197
  "loss": 1.585,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.10723860589812333,
202
- "grad_norm": 0.9495791958934532,
203
  "learning_rate": 1.999688082790923e-05,
204
  "loss": 1.5868,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.11121040611657233,
209
- "grad_norm": 0.958633787129993,
210
  "learning_rate": 1.9992452930796544e-05,
211
  "loss": 1.5776,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.11518220633502135,
216
- "grad_norm": 0.933994419412705,
217
  "learning_rate": 1.9986101002782376e-05,
218
  "loss": 1.5789,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.11915400655347036,
223
- "grad_norm": 0.9531381305363663,
224
  "learning_rate": 1.997782626692034e-05,
225
  "loss": 1.5814,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.12312580677191937,
230
- "grad_norm": 0.9202778818496322,
231
  "learning_rate": 1.9967630316497663e-05,
232
  "loss": 1.5659,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.12709760699036837,
237
- "grad_norm": 0.9577280868902154,
238
  "learning_rate": 1.995551511472836e-05,
239
- "loss": 1.5843,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.1310694072088174,
244
- "grad_norm": 0.9496762483788141,
245
  "learning_rate": 1.994148299437524e-05,
246
  "loss": 1.559,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.1350412074272664,
251
- "grad_norm": 0.9772506574092267,
252
  "learning_rate": 1.9925536657300734e-05,
253
  "loss": 1.5783,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.13901300764571542,
258
- "grad_norm": 0.9077294916049774,
259
  "learning_rate": 1.990767917394666e-05,
260
  "loss": 1.5716,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.14298480786416443,
265
- "grad_norm": 0.9866617979378287,
266
  "learning_rate": 1.9887913982743e-05,
267
  "loss": 1.5705,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.14695660808261343,
272
- "grad_norm": 0.899114782791873,
273
  "learning_rate": 1.986624488944585e-05,
274
  "loss": 1.5738,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.15092840830106247,
279
- "grad_norm": 0.9204283940478478,
280
  "learning_rate": 1.984267606640462e-05,
281
  "loss": 1.5729,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.15490020851951147,
286
- "grad_norm": 0.9526503744776189,
287
  "learning_rate": 1.9817212051758667e-05,
288
  "loss": 1.5674,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.15887200873796048,
293
- "grad_norm": 0.9000235837089589,
294
  "learning_rate": 1.978985774856346e-05,
295
  "loss": 1.5683,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.16284380895640949,
300
- "grad_norm": 0.9726071022051445,
301
  "learning_rate": 1.9760618423846526e-05,
302
  "loss": 1.5738,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.1668156091748585,
307
- "grad_norm": 0.9823636998203206,
308
  "learning_rate": 1.9729499707593284e-05,
309
  "loss": 1.5826,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.17078740939330753,
314
- "grad_norm": 0.9302300170642017,
315
  "learning_rate": 1.9696507591663003e-05,
316
  "loss": 1.5565,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.17475920961175653,
321
- "grad_norm": 0.9087097776923584,
322
  "learning_rate": 1.9661648428635066e-05,
323
  "loss": 1.5621,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.17873100983020554,
328
- "grad_norm": 0.9285603183944053,
329
  "learning_rate": 1.962492893058582e-05,
330
  "loss": 1.5533,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.18270281004865455,
335
- "grad_norm": 0.9285965898864252,
336
  "learning_rate": 1.9586356167796145e-05,
337
  "loss": 1.5801,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.18667461026710355,
342
- "grad_norm": 0.9222463524528106,
343
  "learning_rate": 1.954593756739009e-05,
344
  "loss": 1.5802,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.1906464104855526,
349
- "grad_norm": 0.9918027121214139,
350
- "learning_rate": 1.9512278901942467e-05,
351
  "loss": 1.5817,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.1946182107040016,
356
- "grad_norm": 0.9353795309289739,
357
- "learning_rate": 1.9468557643703262e-05,
358
  "loss": 1.571,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.1985900109224506,
363
- "grad_norm": 0.9177319040019111,
364
- "learning_rate": 1.942301322976593e-05,
365
  "loss": 1.5694,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.2025618111408996,
370
- "grad_norm": 0.9154910309805477,
371
- "learning_rate": 1.9375654429634866e-05,
372
  "loss": 1.556,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.2065336113593486,
377
- "grad_norm": 0.9061242036153607,
378
- "learning_rate": 1.9326490362171625e-05,
379
- "loss": 1.5763,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.21050541157779765,
384
- "grad_norm": 0.9020925779619676,
385
- "learning_rate": 1.9275530493839118e-05,
386
- "loss": 1.5707,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.21447721179624665,
391
- "grad_norm": 0.9060526130936534,
392
- "learning_rate": 1.9222784636878853e-05,
393
- "loss": 1.5671,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.21844901201469566,
398
- "grad_norm": 0.9180977301557971,
399
- "learning_rate": 1.91682629474216e-05,
400
- "loss": 1.5743,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.22242081223314467,
405
- "grad_norm": 0.9541418358542395,
406
- "learning_rate": 1.9111975923531858e-05,
407
  "loss": 1.5653,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.22639261245159367,
412
- "grad_norm": 0.9432630338289705,
413
- "learning_rate": 1.905393440318645e-05,
414
- "loss": 1.5551,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.2303644126700427,
419
- "grad_norm": 0.9562298032141088,
420
- "learning_rate": 1.8994149562187702e-05,
421
- "loss": 1.5625,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.2343362128884917,
426
- "grad_norm": 0.9432931518842519,
427
- "learning_rate": 1.8932632912011565e-05,
428
- "loss": 1.5568,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.23830801310694072,
433
- "grad_norm": 0.974609418111715,
434
- "learning_rate": 1.886939629759107e-05,
435
- "loss": 1.5553,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.24227981332538973,
440
- "grad_norm": 0.9241516964595516,
441
- "learning_rate": 1.8804451895035645e-05,
442
- "loss": 1.5792,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.24625161354383873,
447
- "grad_norm": 0.960522948664812,
448
- "learning_rate": 1.873781220928659e-05,
449
- "loss": 1.5605,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.25022341376228774,
454
- "grad_norm": 0.898378217476936,
455
- "learning_rate": 1.866949007170929e-05,
456
- "loss": 1.565,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.25419521398073675,
461
- "grad_norm": 0.878099848163542,
462
- "learning_rate": 1.859949863762256e-05,
463
- "loss": 1.5751,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.2581670141991858,
468
- "grad_norm": 0.9531550617032691,
469
- "learning_rate": 1.852785138376558e-05,
470
- "loss": 1.5753,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.2621388144176348,
475
- "grad_norm": 0.8716992776076363,
476
- "learning_rate": 1.8454562105703e-05,
477
- "loss": 1.5577,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.2661106146360838,
482
- "grad_norm": 0.871675734840579,
483
- "learning_rate": 1.8379644915168623e-05,
484
  "loss": 1.55,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.2700824148545328,
489
- "grad_norm": 0.9064851932473592,
490
- "learning_rate": 1.83031142373482e-05,
491
- "loss": 1.5737,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.27405421507298183,
496
- "grad_norm": 0.8728863317820471,
497
- "learning_rate": 1.822498480810189e-05,
498
- "loss": 1.574,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.27802601529143084,
503
- "grad_norm": 0.9475535267391176,
504
- "learning_rate": 1.8145271671126892e-05,
505
- "loss": 1.5657,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.28199781550987985,
510
- "grad_norm": 0.9267506445082312,
511
- "learning_rate": 1.8063990175060807e-05,
512
  "loss": 1.5601,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.28596961572832885,
517
- "grad_norm": 0.8854031406900612,
518
- "learning_rate": 1.798115597052629e-05,
519
- "loss": 1.5596,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.28994141594677786,
524
- "grad_norm": 0.9231659613553668,
525
- "learning_rate": 1.7896785007117526e-05,
526
- "loss": 1.5682,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.29391321616522686,
531
- "grad_norm": 0.9188990992337205,
532
- "learning_rate": 1.781089353032918e-05,
533
- "loss": 1.5614,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.2978850163836759,
538
- "grad_norm": 0.9422567353428425,
539
- "learning_rate": 1.7723498078428355e-05,
540
  "loss": 1.5726,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.30185681660212493,
545
- "grad_norm": 0.8795755203467719,
546
- "learning_rate": 1.7634615479270157e-05,
547
- "loss": 1.542,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.30582861682057394,
552
- "grad_norm": 0.897277213053898,
553
- "learning_rate": 1.754426284705753e-05,
554
- "loss": 1.5517,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.30980041703902295,
559
- "grad_norm": 0.9125693278793655,
560
- "learning_rate": 1.7452457579045948e-05,
561
- "loss": 1.5566,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.31377221725747195,
566
- "grad_norm": 0.9300634130828153,
567
- "learning_rate": 1.7359217352193587e-05,
568
- "loss": 1.5733,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.31774401747592096,
573
- "grad_norm": 0.9074698285987416,
574
- "learning_rate": 1.726456011975767e-05,
575
- "loss": 1.5493,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.32171581769436997,
580
- "grad_norm": 0.9006504788659523,
581
- "learning_rate": 1.716850410783758e-05,
582
- "loss": 1.5696,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.32568761791281897,
587
- "grad_norm": 0.9137764692116158,
588
- "learning_rate": 1.7071067811865477e-05,
589
  "loss": 1.5507,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.329659418131268,
594
- "grad_norm": 0.8945253552547247,
595
- "learning_rate": 1.6972269993045004e-05,
596
- "loss": 1.5903,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.333631218349717,
601
- "grad_norm": 0.8843903321390951,
602
- "learning_rate": 1.6872129674738866e-05,
603
- "loss": 1.5593,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.33760301856816605,
608
- "grad_norm": 0.9037525517546994,
609
- "learning_rate": 1.6770666138805904e-05,
610
  "loss": 1.5829,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.34157481878661505,
615
- "grad_norm": 0.8587208142089539,
616
- "learning_rate": 1.666789892188841e-05,
617
- "loss": 1.5577,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.34554661900506406,
622
- "grad_norm": 0.9297268199181588,
623
  "learning_rate": 1.6563847811650376e-05,
624
- "loss": 1.5684,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.34951841922351307,
629
- "grad_norm": 0.8737934998928475,
630
  "learning_rate": 1.64585328429674e-05,
631
  "loss": 1.5448,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.3534902194419621,
636
- "grad_norm": 0.8747772650383047,
637
  "learning_rate": 1.635197429406901e-05,
638
  "loss": 1.5726,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.3574620196604111,
643
- "grad_norm": 0.9494066241816427,
644
  "learning_rate": 1.6244192682634143e-05,
645
- "loss": 1.5464,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.3614338198788601,
650
- "grad_norm": 0.9312998364165999,
651
  "learning_rate": 1.6135208761840457e-05,
652
- "loss": 1.559,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.3654056200973091,
657
- "grad_norm": 0.9440997013889066,
658
  "learning_rate": 1.602504351636838e-05,
659
  "loss": 1.5534,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.3693774203157581,
664
- "grad_norm": 0.9211086942466344,
665
  "learning_rate": 1.591371815836051e-05,
666
  "loss": 1.5543,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.3733492205342071,
671
- "grad_norm": 0.8873332047780428,
672
  "learning_rate": 1.580125412333728e-05,
673
  "loss": 1.5402,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.37732102075265617,
678
- "grad_norm": 0.8697606247963089,
679
  "learning_rate": 1.5687673066069568e-05,
680
  "loss": 1.552,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.3812928209711052,
685
- "grad_norm": 0.9009584572462931,
686
  "learning_rate": 1.5572996856409094e-05,
687
- "loss": 1.5639,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.3852646211895542,
692
- "grad_norm": 0.8955208053301982,
693
  "learning_rate": 1.5457247575077445e-05,
694
  "loss": 1.5406,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.3892364214080032,
699
- "grad_norm": 0.90920168608274,
700
  "learning_rate": 1.534044750941444e-05,
701
  "loss": 1.5472,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.3932082216264522,
706
- "grad_norm": 0.8801713016760236,
707
  "learning_rate": 1.5222619149086746e-05,
708
- "loss": 1.5413,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.3971800218449012,
713
- "grad_norm": 0.8776939253029902,
714
  "learning_rate": 1.5103785181757533e-05,
715
  "loss": 1.5396,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.4011518220633502,
720
- "grad_norm": 0.8660391458498254,
721
  "learning_rate": 1.4983968488718005e-05,
722
  "loss": 1.5426,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.4051236222817992,
727
- "grad_norm": 0.8925651828569715,
728
  "learning_rate": 1.4863192140481624e-05,
729
- "loss": 1.5538,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.4090954225002482,
734
- "grad_norm": 0.844029495816824,
735
  "learning_rate": 1.4741479392341941e-05,
736
  "loss": 1.5586,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.4130672227186972,
741
- "grad_norm": 0.9416264844686819,
742
  "learning_rate": 1.4618853679894813e-05,
743
  "loss": 1.5202,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.4170390229371463,
748
- "grad_norm": 0.8865786227355646,
749
  "learning_rate": 1.4495338614525927e-05,
750
- "loss": 1.5503,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.4210108231555953,
755
- "grad_norm": 0.964575419053074,
756
  "learning_rate": 1.437095797886445e-05,
757
  "loss": 1.5488,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.4249826233740443,
762
- "grad_norm": 0.9223622871019674,
763
  "learning_rate": 1.4245735722203736e-05,
764
- "loss": 1.54,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.4289544235924933,
769
- "grad_norm": 0.9529794800211927,
770
  "learning_rate": 1.4119695955889925e-05,
771
- "loss": 1.5493,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.4329262238109423,
776
- "grad_norm": 0.9338108378518882,
777
- "learning_rate": 1.4018291839323084e-05,
778
  "loss": 1.5491,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.4368980240293913,
783
- "grad_norm": 0.8747517102177081,
784
- "learning_rate": 1.389084181569683e-05,
785
- "loss": 1.5483,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.4408698242478403,
790
- "grad_norm": 0.8342834567313075,
791
- "learning_rate": 1.3762642616674098e-05,
792
- "loss": 1.5487,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.44484162446628933,
797
- "grad_norm": 0.8212301004118387,
798
- "learning_rate": 1.3633718926806541e-05,
799
- "loss": 1.5479,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.44881342468473834,
804
- "grad_norm": 0.8325603507497541,
805
- "learning_rate": 1.3504095570145364e-05,
806
  "loss": 1.5598,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.45278522490318734,
811
- "grad_norm": 0.8222680870437219,
812
- "learning_rate": 1.3373797505461501e-05,
813
  "loss": 1.5141,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.4567570251216364,
818
- "grad_norm": 0.8373171419431287,
819
- "learning_rate": 1.324284982143984e-05,
820
- "loss": 1.5339,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.4607288253400854,
825
- "grad_norm": 0.9465969511868977,
826
- "learning_rate": 1.3111277731848442e-05,
827
- "loss": 1.5634,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.4647006255585344,
832
- "grad_norm": 0.8753229605568046,
833
  "learning_rate": 1.2979106570683663e-05,
834
  "loss": 1.5624,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.4686724257769834,
839
- "grad_norm": 0.8298818762937755,
840
  "learning_rate": 1.2846361787292137e-05,
841
- "loss": 1.5513,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.47264422599543243,
846
- "grad_norm": 0.9059782434582991,
847
  "learning_rate": 1.2713068941470547e-05,
848
  "loss": 1.5609,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.47661602621388144,
853
- "grad_norm": 0.9678071693027257,
854
  "learning_rate": 1.2579253698544124e-05,
855
  "loss": 1.5421,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.48058782643233044,
860
- "grad_norm": 0.879491847618731,
861
  "learning_rate": 1.2444941824424825e-05,
862
- "loss": 1.5391,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.48455962665077945,
867
- "grad_norm": 0.9141636205552669,
868
  "learning_rate": 1.2310159180650158e-05,
869
- "loss": 1.5278,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.48853142686922846,
874
- "grad_norm": 0.8988946036669367,
875
  "learning_rate": 1.2174931719403568e-05,
876
- "loss": 1.5205,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.49250322708767746,
881
- "grad_norm": 0.8383311925133096,
882
  "learning_rate": 1.2039285478517417e-05,
883
  "loss": 1.5363,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.4964750273061265,
888
- "grad_norm": 0.8551398676198184,
889
  "learning_rate": 1.1903246576459398e-05,
890
  "loss": 1.5188,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.5004468275245755,
895
- "grad_norm": 0.8949920571148887,
896
  "learning_rate": 1.1766841207303498e-05,
897
- "loss": 1.5387,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.5044186277430245,
902
- "grad_norm": 0.9193358351653105,
903
  "learning_rate": 1.1630095635686359e-05,
904
  "loss": 1.5246,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.5083904279614735,
909
- "grad_norm": 0.8447830008497511,
910
  "learning_rate": 1.1493036191750067e-05,
911
  "loss": 1.5597,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.5123622281799225,
916
- "grad_norm": 0.8981899757973651,
917
  "learning_rate": 1.1355689266072314e-05,
918
  "loss": 1.5407,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.5163340283983716,
923
- "grad_norm": 0.8817935879440193,
924
  "learning_rate": 1.1218081304584959e-05,
925
- "loss": 1.5357,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.5203058286168206,
930
- "grad_norm": 0.8785553690035353,
931
  "learning_rate": 1.1080238803481878e-05,
932
  "loss": 1.5529,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.5242776288352696,
937
- "grad_norm": 0.8844025724443876,
938
  "learning_rate": 1.0942188304117184e-05,
939
- "loss": 1.5372,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.5282494290537186,
944
- "grad_norm": 0.8347398093234588,
945
  "learning_rate": 1.0803956387894715e-05,
946
  "loss": 1.5454,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.5322212292721676,
951
- "grad_norm": 0.8862140567723474,
952
  "learning_rate": 1.066556967114984e-05,
953
- "loss": 1.5284,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.5361930294906166,
958
- "grad_norm": 0.8553729652420549,
959
  "learning_rate": 1.0527054800024537e-05,
960
  "loss": 1.5434,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.5401648297090657,
965
- "grad_norm": 0.8475253277125595,
966
  "learning_rate": 1.0388438445336677e-05,
967
  "loss": 1.5134,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.5441366299275147,
972
- "grad_norm": 0.8842675764975674,
973
  "learning_rate": 1.0249747297444659e-05,
974
  "loss": 1.5412,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.5481084301459637,
979
- "grad_norm": 0.8161072684112473,
980
  "learning_rate": 1.0111008061108176e-05,
981
  "loss": 1.5327,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.5520802303644127,
986
- "grad_norm": 0.8239564377921251,
987
  "learning_rate": 9.972247450346272e-06,
988
  "loss": 1.5083,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.5560520305828617,
993
- "grad_norm": 0.8876139349108952,
994
  "learning_rate": 9.833492183293616e-06,
995
  "loss": 1.5481,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.5600238308013107,
1000
- "grad_norm": 0.8743716033382151,
1001
  "learning_rate": 9.69476897705595e-06,
1002
  "loss": 1.5224,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.5639956310197597,
1007
- "grad_norm": 0.864762033911022,
1008
  "learning_rate": 9.55610454256575e-06,
1009
- "loss": 1.5292,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.5679674312382087,
1014
- "grad_norm": 0.8822836523953117,
1015
  "learning_rate": 9.417525579439094e-06,
1016
- "loss": 1.5249,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.5719392314566577,
1021
- "grad_norm": 0.8455067733133467,
1022
  "learning_rate": 9.279058770834679e-06,
1023
  "loss": 1.5264,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.5759110316751067,
1028
- "grad_norm": 0.8556967593591447,
1029
  "learning_rate": 9.140730778316037e-06,
1030
- "loss": 1.5465,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.5798828318935557,
1035
- "grad_norm": 0.8847018650145665,
1036
  "learning_rate": 9.002568236717863e-06,
1037
  "loss": 1.5389,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.5838546321120047,
1042
- "grad_norm": 0.8606824176415128,
1043
  "learning_rate": 8.864597749017566e-06,
1044
  "loss": 1.5392,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.5878264323304537,
1049
- "grad_norm": 0.8183994433300168,
1050
  "learning_rate": 8.72684588121287e-06,
1051
- "loss": 1.5579,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.5917982325489027,
1056
- "grad_norm": 0.8486694055889199,
1057
  "learning_rate": 8.589339157206583e-06,
1058
  "loss": 1.5388,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.5957700327673519,
1063
- "grad_norm": 0.8083960590705446,
1064
  "learning_rate": 8.452104053699474e-06,
1065
  "loss": 1.5313,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.5997418329858009,
1070
- "grad_norm": 0.8453222791905652,
1071
  "learning_rate": 8.315166995092206e-06,
1072
  "loss": 1.5259,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.6037136332042499,
1077
- "grad_norm": 0.8489237281711289,
1078
  "learning_rate": 8.178554348397388e-06,
1079
- "loss": 1.5194,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.6076854334226989,
1084
- "grad_norm": 0.8152254669646656,
1085
  "learning_rate": 8.042292418162611e-06,
1086
  "loss": 1.5046,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.6116572336411479,
1091
- "grad_norm": 0.8327239180168435,
1092
  "learning_rate": 7.906407441405586e-06,
1093
  "loss": 1.5372,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.6156290338595969,
1098
- "grad_norm": 0.8331426734497274,
1099
  "learning_rate": 7.770925582562228e-06,
1100
  "loss": 1.5365,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.6196008340780459,
1105
- "grad_norm": 0.8330643533609132,
1106
  "learning_rate": 7.635872928448734e-06,
1107
  "loss": 1.5326,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.6235726342964949,
1112
- "grad_norm": 0.8446585726808572,
1113
  "learning_rate": 7.501275483238619e-06,
1114
  "loss": 1.543,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.6275444345149439,
1119
- "grad_norm": 0.8621583370984449,
1120
  "learning_rate": 7.367159163455648e-06,
1121
  "loss": 1.5259,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.6315162347333929,
1126
- "grad_norm": 0.8369419293176187,
1127
  "learning_rate": 7.2335497929836565e-06,
1128
- "loss": 1.5464,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.6354880349518419,
1133
- "grad_norm": 0.879967516287963,
1134
  "learning_rate": 7.10047309809418e-06,
1135
- "loss": 1.5411,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.6394598351702909,
1140
- "grad_norm": 0.8552476117110279,
1141
  "learning_rate": 6.967954702492939e-06,
1142
  "loss": 1.5207,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.6434316353887399,
1147
- "grad_norm": 0.8340974355120994,
1148
  "learning_rate": 6.8360201223860024e-06,
1149
  "loss": 1.5407,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.6474034356071889,
1154
- "grad_norm": 0.8456787458821341,
1155
  "learning_rate": 6.704694761566697e-06,
1156
  "loss": 1.5217,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.6513752358256379,
1161
- "grad_norm": 0.8280796416104202,
1162
  "learning_rate": 6.574003906524149e-06,
1163
  "loss": 1.5389,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.655347036044087,
1168
- "grad_norm": 0.8538847164706099,
1169
  "learning_rate": 6.443972721574409e-06,
1170
  "loss": 1.5046,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.659318836262536,
1175
- "grad_norm": 0.8248683267185835,
1176
  "learning_rate": 6.314626244015099e-06,
1177
- "loss": 1.5061,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.663290636480985,
1182
- "grad_norm": 0.8475577096730741,
1183
  "learning_rate": 6.18598937930452e-06,
1184
- "loss": 1.5202,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.667262436699434,
1189
- "grad_norm": 0.8409738056108454,
1190
  "learning_rate": 6.058086896266149e-06,
1191
- "loss": 1.5241,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.671234236917883,
1196
- "grad_norm": 0.8616872442801605,
1197
  "learning_rate": 5.930943422319453e-06,
1198
  "loss": 1.5055,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.6752060371363321,
1203
- "grad_norm": 0.8783917854734298,
1204
  "learning_rate": 5.80458343873789e-06,
1205
  "loss": 1.5257,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.6791778373547811,
1210
- "grad_norm": 0.8251109783631763,
1211
  "learning_rate": 5.679031275935104e-06,
1212
  "loss": 1.5312,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.6831496375732301,
1217
- "grad_norm": 0.889314258931523,
1218
  "learning_rate": 5.55431110878014e-06,
1219
  "loss": 1.5074,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.6871214377916791,
1224
- "grad_norm": 0.8631137147413197,
1225
  "learning_rate": 5.430446951942597e-06,
1226
  "loss": 1.538,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.6910932380101281,
1231
- "grad_norm": 0.8278253724721275,
1232
  "learning_rate": 5.307462655268651e-06,
1233
  "loss": 1.5146,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.6950650382285771,
1238
- "grad_norm": 0.8514984190554086,
1239
  "learning_rate": 5.185381899188811e-06,
1240
- "loss": 1.5275,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.6990368384470261,
1245
- "grad_norm": 0.8340543993526015,
1246
  "learning_rate": 5.064228190158274e-06,
1247
- "loss": 1.528,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.7030086386654751,
1252
- "grad_norm": 0.817234688490343,
1253
  "learning_rate": 4.944024856130813e-06,
1254
  "loss": 1.5093,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.7069804388839241,
1259
- "grad_norm": 0.8474305564065862,
1260
  "learning_rate": 4.824795042066997e-06,
1261
  "loss": 1.5455,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.7109522391023732,
1266
- "grad_norm": 0.8437363327628384,
1267
  "learning_rate": 4.706561705477687e-06,
1268
- "loss": 1.5225,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.7149240393208222,
1273
- "grad_norm": 0.8371928287718869,
1274
  "learning_rate": 4.5893476120035895e-06,
1275
- "loss": 1.5414,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.7188958395392712,
1280
- "grad_norm": 0.8431019148388188,
1281
  "learning_rate": 4.473175331031765e-06,
1282
- "loss": 1.5174,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.7228676397577202,
1287
- "grad_norm": 0.803923492547389,
1288
  "learning_rate": 4.358067231349942e-06,
1289
  "loss": 1.5276,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.7268394399761692,
1294
- "grad_norm": 0.796247804617719,
1295
  "learning_rate": 4.244045476839439e-06,
1296
- "loss": 1.5166,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.7308112401946182,
1301
- "grad_norm": 0.8376189987586626,
1302
  "learning_rate": 4.131132022207537e-06,
1303
  "loss": 1.5445,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.7347830404130672,
1308
- "grad_norm": 0.8304688166484404,
1309
  "learning_rate": 4.019348608760137e-06,
1310
  "loss": 1.5374,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.7387548406315162,
1315
- "grad_norm": 0.8203389349371003,
1316
  "learning_rate": 3.908716760215513e-06,
1317
  "loss": 1.5204,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.7427266408499652,
1322
- "grad_norm": 0.8197165574633112,
1323
  "learning_rate": 3.799257778559955e-06,
1324
- "loss": 1.5291,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.7466984410684142,
1329
- "grad_norm": 0.8251638651993347,
1330
  "learning_rate": 3.6909927399460942e-06,
1331
  "loss": 1.5336,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.7506702412868632,
1336
- "grad_norm": 0.814872680544521,
1337
  "learning_rate": 3.5839424906347274e-06,
1338
- "loss": 1.5091,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.7546420415053123,
1343
- "grad_norm": 0.8086901358971553,
1344
  "learning_rate": 3.4781276429809153e-06,
1345
  "loss": 1.5314,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.7586138417237613,
1350
- "grad_norm": 0.809463565917492,
1351
  "learning_rate": 3.3735685714650925e-06,
1352
  "loss": 1.5235,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.7625856419422103,
1357
- "grad_norm": 0.8387394901103278,
1358
  "learning_rate": 3.270285408769991e-06,
1359
  "loss": 1.5381,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.7665574421606594,
1364
- "grad_norm": 0.8393405952568794,
1365
  "learning_rate": 3.168298041904141e-06,
1366
  "loss": 1.5217,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.7705292423791084,
1371
- "grad_norm": 0.8510390941633654,
1372
  "learning_rate": 3.0676261083726466e-06,
1373
  "loss": 1.5293,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.7745010425975574,
1378
- "grad_norm": 0.8368762924881645,
1379
  "learning_rate": 2.968288992396009e-06,
1380
- "loss": 1.5131,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.7784728428160064,
1385
- "grad_norm": 0.8292515955856028,
1386
  "learning_rate": 2.870305821177747e-06,
1387
  "loss": 1.5268,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.7824446430344554,
1392
- "grad_norm": 0.839457283372409,
1393
  "learning_rate": 2.773695461221464e-06,
1394
  "loss": 1.5098,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.7864164432529044,
1399
- "grad_norm": 0.821061769872833,
1400
  "learning_rate": 2.678476514698146e-06,
1401
  "loss": 1.5431,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.7903882434713534,
1406
- "grad_norm": 0.8151987614948591,
1407
  "learning_rate": 2.584667315864334e-06,
1408
- "loss": 1.5523,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.7943600436898024,
1413
- "grad_norm": 0.8137261569330774,
1414
  "learning_rate": 2.492285927531893e-06,
1415
- "loss": 1.5245,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.7983318439082514,
1420
- "grad_norm": 0.8301409209530792,
1421
  "learning_rate": 2.4013501375900604e-06,
1422
  "loss": 1.5428,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.8023036441267004,
1427
- "grad_norm": 0.8344612075866578,
1428
  "learning_rate": 2.3118774555803915e-06,
1429
  "loss": 1.5073,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.8062754443451494,
1434
- "grad_norm": 0.8068948072381649,
1435
  "learning_rate": 2.2238851093253476e-06,
1436
  "loss": 1.518,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.8102472445635984,
1441
- "grad_norm": 0.8007339180594394,
1442
  "learning_rate": 2.1373900416110973e-06,
1443
  "loss": 1.5272,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.8142190447820474,
1448
- "grad_norm": 0.8237271122294172,
1449
  "learning_rate": 2.0524089069252106e-06,
1450
  "loss": 1.5028,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.8181908450004964,
1455
- "grad_norm": 0.8293389949109566,
1456
  "learning_rate": 1.9689580682498553e-06,
1457
- "loss": 1.5267,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.8221626452189454,
1462
- "grad_norm": 0.8080620921250098,
1463
  "learning_rate": 1.887053593911149e-06,
1464
  "loss": 1.5427,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.8261344454373944,
1469
- "grad_norm": 0.8234966302697843,
1470
  "learning_rate": 1.806711254485215e-06,
1471
- "loss": 1.5388,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.8301062456558436,
1476
- "grad_norm": 0.8369256481610651,
1477
  "learning_rate": 1.727946519761583e-06,
1478
- "loss": 1.5014,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.8340780458742926,
1483
- "grad_norm": 0.8368431055139667,
1484
  "learning_rate": 1.6507745557645127e-06,
1485
  "loss": 1.5009,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.8380498460927416,
1490
- "grad_norm": 0.8247366170148002,
1491
  "learning_rate": 1.575210221832799e-06,
1492
  "loss": 1.525,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.8420216463111906,
1497
- "grad_norm": 0.8041055000154714,
1498
  "learning_rate": 1.5012680677586222e-06,
1499
- "loss": 1.5135,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.8459934465296396,
1504
- "grad_norm": 0.7986074078853562,
1505
- "learning_rate": 1.4289623309860134e-06,
1506
  "loss": 1.5128,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.8499652467480886,
1511
- "grad_norm": 0.8426676577021704,
1512
- "learning_rate": 1.3583069338694622e-06,
1513
- "loss": 1.5145,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.8539370469665376,
1518
- "grad_norm": 0.8032858985594136,
1519
- "learning_rate": 1.2893154809931852e-06,
1520
- "loss": 1.5601,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.8579088471849866,
1525
- "grad_norm": 0.805594988380154,
1526
- "learning_rate": 1.2220012565515794e-06,
1527
  "loss": 1.5104,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 0.8618806474034356,
1532
- "grad_norm": 0.8099057800957227,
1533
- "learning_rate": 1.1563772217913838e-06,
1534
  "loss": 1.5272,
1535
  "step": 1085
1536
  },
1537
  {
1538
  "epoch": 0.8658524476218846,
1539
- "grad_norm": 0.8110951463376851,
1540
- "learning_rate": 1.0924560125160021e-06,
1541
  "loss": 1.5276,
1542
  "step": 1090
1543
  },
1544
  {
1545
  "epoch": 0.8698242478403336,
1546
- "grad_norm": 0.8246733517620527,
1547
- "learning_rate": 1.0302499366525031e-06,
1548
- "loss": 1.5435,
1549
  "step": 1095
1550
  },
1551
  {
1552
  "epoch": 0.8737960480587826,
1553
- "grad_norm": 0.8040100436072144,
1554
- "learning_rate": 9.697709718817461e-07,
1555
  "loss": 1.5007,
1556
  "step": 1100
1557
  },
1558
  {
1559
  "epoch": 0.8777678482772316,
1560
- "grad_norm": 0.8110583363269713,
1561
- "learning_rate": 9.110307633321014e-07,
1562
  "loss": 1.5176,
1563
  "step": 1105
1564
  },
1565
  {
1566
  "epoch": 0.8817396484956807,
1567
- "grad_norm": 0.7962644683677889,
1568
- "learning_rate": 8.540406213371999e-07,
1569
  "loss": 1.5135,
1570
  "step": 1110
1571
  },
1572
  {
1573
  "epoch": 0.8857114487141297,
1574
- "grad_norm": 0.819543995512171,
1575
- "learning_rate": 7.988115192581358e-07,
1576
- "loss": 1.531,
1577
  "step": 1115
1578
  },
1579
  {
1580
  "epoch": 0.8896832489325787,
1581
- "grad_norm": 0.7823027495146351,
1582
- "learning_rate": 7.453540913705803e-07,
1583
- "loss": 1.5286,
1584
  "step": 1120
1585
  },
1586
  {
1587
  "epoch": 0.8936550491510277,
1588
- "grad_norm": 0.8047503933691855,
1589
- "learning_rate": 6.93678630817155e-07,
1590
- "loss": 1.5189,
1591
  "step": 1125
1592
  },
1593
  {
1594
  "epoch": 0.8976268493694767,
1595
- "grad_norm": 0.82648053250675,
1596
- "learning_rate": 6.437950876255116e-07,
1597
  "loss": 1.5137,
1598
  "step": 1130
1599
  },
1600
  {
1601
  "epoch": 0.9015986495879257,
1602
- "grad_norm": 0.8095834297434426,
1603
- "learning_rate": 5.957130667924615e-07,
1604
  "loss": 1.5288,
1605
  "step": 1135
1606
  },
1607
  {
1608
  "epoch": 0.9055704498063747,
1609
- "grad_norm": 0.8094036888394253,
1610
- "learning_rate": 5.494418264345613e-07,
1611
  "loss": 1.5212,
1612
  "step": 1140
1613
  },
1614
  {
1615
  "epoch": 0.9095422500248238,
1616
- "grad_norm": 0.8091906974798446,
1617
- "learning_rate": 5.04990276005467e-07,
1618
  "loss": 1.4942,
1619
  "step": 1145
1620
  },
1621
  {
1622
  "epoch": 0.9135140502432728,
1623
- "grad_norm": 0.8116362011252354,
1624
- "learning_rate": 4.623669745804371e-07,
1625
  "loss": 1.5361,
1626
  "step": 1150
1627
  },
1628
  {
1629
  "epoch": 0.9174858504617218,
1630
- "grad_norm": 0.8138824552183045,
1631
- "learning_rate": 4.2158012920829216e-07,
1632
  "loss": 1.5306,
1633
  "step": 1155
1634
  },
1635
  {
1636
  "epoch": 0.9214576506801708,
1637
- "grad_norm": 0.8093811176797965,
1638
- "learning_rate": 3.826375933311677e-07,
1639
- "loss": 1.5136,
1640
  "step": 1160
1641
  },
1642
  {
1643
  "epoch": 0.9254294508986198,
1644
- "grad_norm": 0.7885511801172403,
1645
- "learning_rate": 3.455468652723426e-07,
1646
- "loss": 1.5192,
1647
  "step": 1165
1648
  },
1649
  {
1650
  "epoch": 0.9294012511170688,
1651
- "grad_norm": 0.8102489469594919,
1652
- "learning_rate": 3.1031508679244427e-07,
1653
- "loss": 1.5301,
1654
  "step": 1170
1655
  },
1656
  {
1657
  "epoch": 0.9333730513355178,
1658
- "grad_norm": 0.7847030652070358,
1659
- "learning_rate": 2.7694904171432503e-07,
1660
- "loss": 1.5406,
1661
  "step": 1175
1662
  },
1663
  {
1664
  "epoch": 0.9373448515539669,
1665
- "grad_norm": 0.80389618954725,
1666
- "learning_rate": 2.454551546168349e-07,
1667
  "loss": 1.5143,
1668
  "step": 1180
1669
  },
1670
  {
1671
  "epoch": 0.9413166517724159,
1672
- "grad_norm": 0.7981380918291116,
1673
- "learning_rate": 2.158394895977922e-07,
1674
- "loss": 1.5169,
1675
  "step": 1185
1676
  },
1677
  {
1678
  "epoch": 0.9452884519908649,
1679
- "grad_norm": 0.7989188232547147,
1680
- "learning_rate": 1.8810774910633768e-07,
1681
  "loss": 1.5233,
1682
  "step": 1190
1683
  },
1684
  {
1685
  "epoch": 0.9492602522093139,
1686
- "grad_norm": 0.8079846851149015,
1687
- "learning_rate": 1.6226527284495032e-07,
1688
  "loss": 1.5193,
1689
  "step": 1195
1690
  },
1691
  {
1692
  "epoch": 0.9532320524277629,
1693
- "grad_norm": 0.828242462038138,
1694
- "learning_rate": 1.383170367412856e-07,
1695
  "loss": 1.5306,
1696
  "step": 1200
1697
  },
1698
  {
1699
  "epoch": 0.9572038526462119,
1700
- "grad_norm": 0.8207916114036836,
1701
- "learning_rate": 1.1626765199007406e-07,
1702
- "loss": 1.5185,
1703
  "step": 1205
1704
  },
1705
  {
1706
  "epoch": 0.9611756528646609,
1707
- "grad_norm": 0.8219670054125537,
1708
- "learning_rate": 9.612136416524143e-08,
1709
  "loss": 1.5262,
1710
  "step": 1210
1711
  },
1712
  {
1713
  "epoch": 0.9651474530831099,
1714
- "grad_norm": 0.8116316500127695,
1715
- "learning_rate": 7.788205240243152e-08,
1716
  "loss": 1.5243,
1717
  "step": 1215
1718
  },
1719
  {
1720
  "epoch": 0.9691192533015589,
1721
- "grad_norm": 0.7863225239227837,
1722
- "learning_rate": 6.155322865208368e-08,
1723
  "loss": 1.5169,
1724
  "step": 1220
1725
  },
1726
  {
1727
  "epoch": 0.9730910535200079,
1728
- "grad_norm": 0.7964472945010072,
1729
- "learning_rate": 4.7138037003210404e-08,
1730
  "loss": 1.5238,
1731
  "step": 1225
1732
  },
1733
  {
1734
  "epoch": 0.9770628537384569,
1735
- "grad_norm": 0.7945636615801865,
1736
- "learning_rate": 3.4639253078011524e-08,
1737
- "loss": 1.5132,
1738
  "step": 1230
1739
  },
1740
  {
1741
  "epoch": 0.9810346539569059,
1742
- "grad_norm": 0.8176718968571114,
1743
- "learning_rate": 2.4059283497430698e-08,
1744
  "loss": 1.5142,
1745
  "step": 1235
1746
  },
1747
  {
1748
  "epoch": 0.9850064541753549,
1749
- "grad_norm": 0.79062459107796,
1750
- "learning_rate": 1.5400165417766012e-08,
1751
  "loss": 1.5156,
1752
  "step": 1240
1753
  },
1754
  {
1755
  "epoch": 0.988978254393804,
1756
- "grad_norm": 0.7931885757173004,
1757
- "learning_rate": 8.663566138414903e-09,
1758
  "loss": 1.5025,
1759
  "step": 1245
1760
  },
1761
  {
1762
  "epoch": 0.992950054612253,
1763
- "grad_norm": 0.7967442789767752,
1764
- "learning_rate": 3.850782780843166e-09,
1765
  "loss": 1.5062,
1766
  "step": 1250
1767
  },
1768
  {
1769
  "epoch": 0.9969218548307021,
1770
- "grad_norm": 0.7797882019691252,
1771
- "learning_rate": 9.627420388225172e-10,
1772
  "loss": 1.526,
1773
  "step": 1255
1774
  },
1775
  {
1776
  "epoch": 0.9993049349617714,
1777
- "eval_loss": 1.5326945781707764,
1778
- "eval_runtime": 225.7167,
1779
- "eval_samples_per_second": 118.485,
1780
- "eval_steps_per_second": 4.94,
1781
  "step": 1258
1782
  },
1783
  {
1784
  "epoch": 0.9993049349617714,
1785
  "step": 1258,
1786
- "total_flos": 106151314194432.0,
1787
- "train_loss": 1.5477958324414178,
1788
- "train_runtime": 7077.9094,
1789
- "train_samples_per_second": 34.147,
1790
- "train_steps_per_second": 0.178
1791
  }
1792
  ],
1793
  "logging_steps": 5,
@@ -1807,7 +1807,7 @@
1807
  "attributes": {}
1808
  }
1809
  },
1810
- "total_flos": 106151314194432.0,
1811
  "train_batch_size": 8,
1812
  "trial_name": null,
1813
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0007943600436898023,
13
+ "grad_norm": 11.12151660776111,
14
  "learning_rate": 1.5873015873015874e-07,
15
  "loss": 1.8009,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.003971800218449012,
20
+ "grad_norm": 10.10420314385476,
21
  "learning_rate": 7.936507936507937e-07,
22
  "loss": 1.7719,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.007943600436898023,
27
+ "grad_norm": 2.7149708608696828,
28
  "learning_rate": 1.5873015873015873e-06,
29
  "loss": 1.7086,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.011915400655347037,
34
+ "grad_norm": 1.570269172415534,
35
  "learning_rate": 2.380952380952381e-06,
36
  "loss": 1.6525,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.015887200873796047,
41
+ "grad_norm": 1.166276790017584,
42
  "learning_rate": 3.1746031746031746e-06,
43
  "loss": 1.6423,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.01985900109224506,
48
+ "grad_norm": 1.0260020187790146,
49
  "learning_rate": 3.968253968253968e-06,
50
  "loss": 1.6245,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.023830801310694073,
55
+ "grad_norm": 0.9521643373391829,
56
  "learning_rate": 4.761904761904762e-06,
57
  "loss": 1.6242,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.027802601529143083,
62
+ "grad_norm": 0.9087502803225287,
63
  "learning_rate": 5.555555555555557e-06,
64
  "loss": 1.6067,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03177440174759209,
69
+ "grad_norm": 0.9276432527674262,
70
  "learning_rate": 6.349206349206349e-06,
71
  "loss": 1.5848,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.035746201966041107,
76
+ "grad_norm": 0.8919073066787582,
77
  "learning_rate": 7.1428571428571436e-06,
78
  "loss": 1.6,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.03971800218449012,
83
+ "grad_norm": 0.965858397020833,
84
  "learning_rate": 7.936507936507936e-06,
85
  "loss": 1.6023,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.04368980240293913,
90
+ "grad_norm": 0.9487708676860586,
91
  "learning_rate": 8.730158730158731e-06,
92
+ "loss": 1.5826,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.04766160262138815,
97
+ "grad_norm": 0.9151332727520083,
98
  "learning_rate": 9.523809523809525e-06,
99
  "loss": 1.5978,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05163340283983715,
104
+ "grad_norm": 0.917185703733017,
105
  "learning_rate": 1.031746031746032e-05,
106
  "loss": 1.6003,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.055605203058286166,
111
+ "grad_norm": 0.9194628312444804,
112
  "learning_rate": 1.1111111111111113e-05,
113
  "loss": 1.5856,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.05957700327673518,
118
+ "grad_norm": 0.8901707069616845,
119
  "learning_rate": 1.1904761904761905e-05,
120
  "loss": 1.5846,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.06354880349518419,
125
+ "grad_norm": 0.966608805341671,
126
  "learning_rate": 1.2698412698412699e-05,
127
  "loss": 1.6089,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.0675206037136332,
132
+ "grad_norm": 0.9313133663663362,
133
  "learning_rate": 1.3492063492063494e-05,
134
  "loss": 1.5815,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.07149240393208221,
139
+ "grad_norm": 0.9808540327178217,
140
  "learning_rate": 1.4285714285714287e-05,
141
  "loss": 1.5816,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.07546420415053123,
146
+ "grad_norm": 0.9026305570096459,
147
  "learning_rate": 1.507936507936508e-05,
148
  "loss": 1.5958,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.07943600436898024,
153
+ "grad_norm": 0.9788483223436265,
154
  "learning_rate": 1.5873015873015872e-05,
155
  "loss": 1.5911,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.08340780458742925,
160
+ "grad_norm": 0.9515538442938523,
161
  "learning_rate": 1.6666666666666667e-05,
162
  "loss": 1.5865,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.08737960480587827,
167
+ "grad_norm": 0.926406626131289,
168
  "learning_rate": 1.7460317460317463e-05,
169
  "loss": 1.5793,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.09135140502432727,
174
+ "grad_norm": 0.922601693661366,
175
  "learning_rate": 1.8253968253968254e-05,
176
  "loss": 1.5822,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.0953232052427763,
181
+ "grad_norm": 0.9585781023399166,
182
  "learning_rate": 1.904761904761905e-05,
183
  "loss": 1.5718,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.0992950054612253,
188
+ "grad_norm": 1.0121070868569275,
189
  "learning_rate": 1.9841269841269845e-05,
190
  "loss": 1.5773,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.1032668056796743,
195
+ "grad_norm": 1.0425213086708742,
196
  "learning_rate": 1.999938384153589e-05,
197
  "loss": 1.585,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.10723860589812333,
202
+ "grad_norm": 0.9511949716486409,
203
  "learning_rate": 1.999688082790923e-05,
204
  "loss": 1.5868,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.11121040611657233,
209
+ "grad_norm": 0.9590697791262225,
210
  "learning_rate": 1.9992452930796544e-05,
211
  "loss": 1.5776,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.11518220633502135,
216
+ "grad_norm": 0.932339341553828,
217
  "learning_rate": 1.9986101002782376e-05,
218
  "loss": 1.5789,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.11915400655347036,
223
+ "grad_norm": 0.9529377325330747,
224
  "learning_rate": 1.997782626692034e-05,
225
  "loss": 1.5814,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.12312580677191937,
230
+ "grad_norm": 0.9257334084732001,
231
  "learning_rate": 1.9967630316497663e-05,
232
  "loss": 1.5659,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.12709760699036837,
237
+ "grad_norm": 0.9578201119584917,
238
  "learning_rate": 1.995551511472836e-05,
239
+ "loss": 1.5844,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.1310694072088174,
244
+ "grad_norm": 0.9483988649883341,
245
  "learning_rate": 1.994148299437524e-05,
246
  "loss": 1.559,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.1350412074272664,
251
+ "grad_norm": 0.9752973142389638,
252
  "learning_rate": 1.9925536657300734e-05,
253
  "loss": 1.5783,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.13901300764571542,
258
+ "grad_norm": 0.9082694825570907,
259
  "learning_rate": 1.990767917394666e-05,
260
  "loss": 1.5716,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.14298480786416443,
265
+ "grad_norm": 0.9870799951805275,
266
  "learning_rate": 1.9887913982743e-05,
267
  "loss": 1.5705,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.14695660808261343,
272
+ "grad_norm": 0.8978375791866258,
273
  "learning_rate": 1.986624488944585e-05,
274
  "loss": 1.5738,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.15092840830106247,
279
+ "grad_norm": 0.9206959902444366,
280
  "learning_rate": 1.984267606640462e-05,
281
  "loss": 1.5729,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.15490020851951147,
286
+ "grad_norm": 0.9532760851392515,
287
  "learning_rate": 1.9817212051758667e-05,
288
  "loss": 1.5674,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.15887200873796048,
293
+ "grad_norm": 0.8995123548574,
294
  "learning_rate": 1.978985774856346e-05,
295
  "loss": 1.5683,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.16284380895640949,
300
+ "grad_norm": 0.9731444458977212,
301
  "learning_rate": 1.9760618423846526e-05,
302
  "loss": 1.5738,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.1668156091748585,
307
+ "grad_norm": 0.9813203114593326,
308
  "learning_rate": 1.9729499707593284e-05,
309
  "loss": 1.5826,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.17078740939330753,
314
+ "grad_norm": 0.9301814711446849,
315
  "learning_rate": 1.9696507591663003e-05,
316
  "loss": 1.5565,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.17475920961175653,
321
+ "grad_norm": 0.9084286747837647,
322
  "learning_rate": 1.9661648428635066e-05,
323
  "loss": 1.5621,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.17873100983020554,
328
+ "grad_norm": 0.9292775329242645,
329
  "learning_rate": 1.962492893058582e-05,
330
  "loss": 1.5533,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.18270281004865455,
335
+ "grad_norm": 0.9282763244366917,
336
  "learning_rate": 1.9586356167796145e-05,
337
  "loss": 1.5801,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.18667461026710355,
342
+ "grad_norm": 0.9219988029696228,
343
  "learning_rate": 1.954593756739009e-05,
344
  "loss": 1.5802,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.1906464104855526,
349
+ "grad_norm": 0.9871605765917675,
350
+ "learning_rate": 1.9503680911904822e-05,
351
  "loss": 1.5817,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.1946182107040016,
356
+ "grad_norm": 0.9551024433790934,
357
+ "learning_rate": 1.9459594337792063e-05,
358
  "loss": 1.571,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.1985900109224506,
363
+ "grad_norm": 0.9155920092825396,
364
+ "learning_rate": 1.9413686333851465e-05,
365
  "loss": 1.5694,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.2025618111408996,
370
+ "grad_norm": 0.9097167589577474,
371
+ "learning_rate": 1.9365965739596086e-05,
372
  "loss": 1.556,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.2065336113593486,
377
+ "grad_norm": 0.9023225707856134,
378
+ "learning_rate": 1.9316441743550375e-05,
379
+ "loss": 1.5762,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.21050541157779765,
384
+ "grad_norm": 0.9077263206493167,
385
+ "learning_rate": 1.9265123881480912e-05,
386
+ "loss": 1.5706,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.21447721179624665,
391
+ "grad_norm": 0.8937248614200829,
392
+ "learning_rate": 1.9212022034560332e-05,
393
+ "loss": 1.567,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.21844901201469566,
398
+ "grad_norm": 0.9122061973911317,
399
+ "learning_rate": 1.91571464274647e-05,
400
+ "loss": 1.5742,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.22242081223314467,
405
+ "grad_norm": 0.9625286348265168,
406
+ "learning_rate": 1.91005076264048e-05,
407
  "loss": 1.5653,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.22639261245159367,
412
+ "grad_norm": 0.9485988527754959,
413
+ "learning_rate": 1.9042116537091583e-05,
414
+ "loss": 1.555,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.2303644126700427,
419
+ "grad_norm": 0.9576868955487222,
420
+ "learning_rate": 1.898198440263633e-05,
421
+ "loss": 1.5624,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.2343362128884917,
426
+ "grad_norm": 0.947081159201408,
427
+ "learning_rate": 1.8920122801385785e-05,
428
+ "loss": 1.5567,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.23830801310694072,
433
+ "grad_norm": 0.9810396328169484,
434
+ "learning_rate": 1.8856543644692767e-05,
435
+ "loss": 1.5552,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.24227981332538973,
440
+ "grad_norm": 0.9296206498184163,
441
+ "learning_rate": 1.8791259174622668e-05,
442
+ "loss": 1.5791,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.24625161354383873,
447
+ "grad_norm": 0.9647483866705726,
448
+ "learning_rate": 1.8724281961596255e-05,
449
+ "loss": 1.5604,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.25022341376228774,
454
+ "grad_norm": 0.8990721005609152,
455
+ "learning_rate": 1.865562490196924e-05,
456
+ "loss": 1.5648,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.25419521398073675,
461
+ "grad_norm": 0.8752521530386431,
462
+ "learning_rate": 1.8585301215549152e-05,
463
+ "loss": 1.575,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.2581670141991858,
468
+ "grad_norm": 0.9677847810573758,
469
+ "learning_rate": 1.8513324443049826e-05,
470
+ "loss": 1.5752,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.2621388144176348,
475
+ "grad_norm": 0.8650992533303118,
476
+ "learning_rate": 1.8439708443484212e-05,
477
+ "loss": 1.5576,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.2661106146360838,
482
+ "grad_norm": 0.8721763870519639,
483
+ "learning_rate": 1.836446739149581e-05,
484
  "loss": 1.55,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.2700824148545328,
489
+ "grad_norm": 0.9043150779726963,
490
+ "learning_rate": 1.8287615774629372e-05,
491
+ "loss": 1.5736,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.27405421507298183,
496
+ "grad_norm": 0.872060546407473,
497
+ "learning_rate": 1.820916839054137e-05,
498
+ "loss": 1.5739,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.27802601529143084,
503
+ "grad_norm": 0.9490597285856667,
504
+ "learning_rate": 1.8129140344150698e-05,
505
+ "loss": 1.5656,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.28199781550987985,
510
+ "grad_norm": 0.9293959378534794,
511
+ "learning_rate": 1.8047547044730266e-05,
512
  "loss": 1.5601,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.28596961572832885,
517
+ "grad_norm": 0.8875229973252522,
518
+ "learning_rate": 1.796440420293996e-05,
519
+ "loss": 1.5595,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.28994141594677786,
524
+ "grad_norm": 0.9241685744107659,
525
+ "learning_rate": 1.7879727827801587e-05,
526
+ "loss": 1.5681,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.29391321616522686,
531
+ "grad_norm": 0.9255387788030182,
532
+ "learning_rate": 1.7793534223616354e-05,
533
+ "loss": 1.5613,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.2978850163836759,
538
+ "grad_norm": 0.9447825641314226,
539
+ "learning_rate": 1.7705839986825502e-05,
540
  "loss": 1.5726,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.30185681660212493,
545
+ "grad_norm": 0.879507513143992,
546
+ "learning_rate": 1.7616662002814704e-05,
547
+ "loss": 1.5419,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.30582861682057394,
552
+ "grad_norm": 0.89862098882536,
553
+ "learning_rate": 1.752601744266278e-05,
554
+ "loss": 1.5516,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.30980041703902295,
559
+ "grad_norm": 0.9106190928925514,
560
+ "learning_rate": 1.7433923759835468e-05,
561
+ "loss": 1.5565,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.31377221725747195,
566
+ "grad_norm": 0.9288737547498717,
567
+ "learning_rate": 1.7340398686824755e-05,
568
+ "loss": 1.5732,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.31774401747592096,
573
+ "grad_norm": 0.9087033425211577,
574
+ "learning_rate": 1.7245460231734537e-05,
575
+ "loss": 1.5492,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.32171581769436997,
580
+ "grad_norm": 0.8999862928001036,
581
+ "learning_rate": 1.7149126674813174e-05,
582
+ "loss": 1.5695,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.32568761791281897,
587
+ "grad_norm": 0.9119036306365369,
588
+ "learning_rate": 1.7051416564933677e-05,
589
  "loss": 1.5507,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.329659418131268,
594
+ "grad_norm": 0.8975417094048892,
595
+ "learning_rate": 1.6952348716022112e-05,
596
+ "loss": 1.5902,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.333631218349717,
601
+ "grad_norm": 0.8840046538984045,
602
+ "learning_rate": 1.6851942203435056e-05,
603
+ "loss": 1.5592,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.33760301856816605,
608
+ "grad_norm": 0.9029659180561906,
609
+ "learning_rate": 1.6750216360286634e-05,
610
  "loss": 1.5829,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.34157481878661505,
615
+ "grad_norm": 0.8585662547884547,
616
+ "learning_rate": 1.664719077372597e-05,
617
+ "loss": 1.5576,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.34554661900506406,
622
+ "grad_norm": 0.8785123954008921,
623
  "learning_rate": 1.6563847811650376e-05,
624
+ "loss": 1.5683,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.34951841922351307,
629
+ "grad_norm": 0.8697699611475651,
630
  "learning_rate": 1.64585328429674e-05,
631
  "loss": 1.5448,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.3534902194419621,
636
+ "grad_norm": 0.8843659106432961,
637
  "learning_rate": 1.635197429406901e-05,
638
  "loss": 1.5726,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.3574620196604111,
643
+ "grad_norm": 0.9485513906574914,
644
  "learning_rate": 1.6244192682634143e-05,
645
+ "loss": 1.5465,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.3614338198788601,
650
+ "grad_norm": 0.9407868053462191,
651
  "learning_rate": 1.6135208761840457e-05,
652
+ "loss": 1.5591,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.3654056200973091,
657
+ "grad_norm": 0.9465851646246182,
658
  "learning_rate": 1.602504351636838e-05,
659
  "loss": 1.5534,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.3693774203157581,
664
+ "grad_norm": 0.9186695674403026,
665
  "learning_rate": 1.591371815836051e-05,
666
  "loss": 1.5543,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.3733492205342071,
671
+ "grad_norm": 0.8884315626992574,
672
  "learning_rate": 1.580125412333728e-05,
673
  "loss": 1.5402,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.37732102075265617,
678
+ "grad_norm": 0.8733398363412331,
679
  "learning_rate": 1.5687673066069568e-05,
680
  "loss": 1.552,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.3812928209711052,
685
+ "grad_norm": 0.8992808033952767,
686
  "learning_rate": 1.5572996856409094e-05,
687
+ "loss": 1.5638,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.3852646211895542,
692
+ "grad_norm": 0.8937067889490861,
693
  "learning_rate": 1.5457247575077445e-05,
694
  "loss": 1.5406,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.3892364214080032,
699
+ "grad_norm": 0.9108964394640126,
700
  "learning_rate": 1.534044750941444e-05,
701
  "loss": 1.5472,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.3932082216264522,
706
+ "grad_norm": 0.8793030627405377,
707
  "learning_rate": 1.5222619149086746e-05,
708
+ "loss": 1.5412,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.3971800218449012,
713
+ "grad_norm": 0.8782946871477869,
714
  "learning_rate": 1.5103785181757533e-05,
715
  "loss": 1.5396,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.4011518220633502,
720
+ "grad_norm": 0.8674461660159545,
721
  "learning_rate": 1.4983968488718005e-05,
722
  "loss": 1.5426,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.4051236222817992,
727
+ "grad_norm": 0.8976341288348811,
728
  "learning_rate": 1.4863192140481624e-05,
729
+ "loss": 1.5537,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.4090954225002482,
734
+ "grad_norm": 0.8431858627506479,
735
  "learning_rate": 1.4741479392341941e-05,
736
  "loss": 1.5586,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.4130672227186972,
741
+ "grad_norm": 0.9421670179657328,
742
  "learning_rate": 1.4618853679894813e-05,
743
  "loss": 1.5202,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.4170390229371463,
748
+ "grad_norm": 0.8878585099095585,
749
  "learning_rate": 1.4495338614525927e-05,
750
+ "loss": 1.5507,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.4210108231555953,
755
+ "grad_norm": 0.9642989519892418,
756
  "learning_rate": 1.437095797886445e-05,
757
  "loss": 1.5488,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.4249826233740443,
762
+ "grad_norm": 0.9246966546921916,
763
  "learning_rate": 1.4245735722203736e-05,
764
+ "loss": 1.5401,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.4289544235924933,
769
+ "grad_norm": 0.9529371824264209,
770
  "learning_rate": 1.4119695955889925e-05,
771
+ "loss": 1.5495,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.4329262238109423,
776
+ "grad_norm": 0.8825791672603804,
777
+ "learning_rate": 1.3992862948679332e-05,
778
  "loss": 1.5491,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.4368980240293913,
783
+ "grad_norm": 0.8881684368996328,
784
+ "learning_rate": 1.3865261122065551e-05,
785
+ "loss": 1.5482,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.4408698242478403,
790
+ "grad_norm": 0.8423283640939411,
791
+ "learning_rate": 1.3736915045577122e-05,
792
+ "loss": 1.5488,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.44484162446628933,
797
+ "grad_norm": 0.8255688998685623,
798
+ "learning_rate": 1.3607849432046717e-05,
799
+ "loss": 1.5478,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.44881342468473834,
804
+ "grad_norm": 0.8357255814716047,
805
+ "learning_rate": 1.3478089132852717e-05,
806
  "loss": 1.5598,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.45278522490318734,
811
+ "grad_norm": 0.8217394668509155,
812
+ "learning_rate": 1.3347659133134118e-05,
813
  "loss": 1.5141,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.4567570251216364,
818
+ "grad_norm": 0.8370380332545342,
819
+ "learning_rate": 1.3216584546979702e-05,
820
+ "loss": 1.5338,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.4607288253400854,
825
+ "grad_norm": 0.9499613626096974,
826
+ "learning_rate": 1.3084890612592325e-05,
827
+ "loss": 1.5633,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.4647006255585344,
832
+ "grad_norm": 0.9043571179729512,
833
  "learning_rate": 1.2979106570683663e-05,
834
  "loss": 1.5624,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.4686724257769834,
839
+ "grad_norm": 0.837192808810411,
840
  "learning_rate": 1.2846361787292137e-05,
841
+ "loss": 1.5514,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.47264422599543243,
846
+ "grad_norm": 0.893985182264359,
847
  "learning_rate": 1.2713068941470547e-05,
848
  "loss": 1.5609,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.47661602621388144,
853
+ "grad_norm": 0.9569825734990193,
854
  "learning_rate": 1.2579253698544124e-05,
855
  "loss": 1.5421,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.48058782643233044,
860
+ "grad_norm": 0.8770646136960255,
861
  "learning_rate": 1.2444941824424825e-05,
862
+ "loss": 1.5392,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.48455962665077945,
867
+ "grad_norm": 0.9065759679781771,
868
  "learning_rate": 1.2310159180650158e-05,
869
+ "loss": 1.5277,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.48853142686922846,
874
+ "grad_norm": 0.8727166144942986,
875
  "learning_rate": 1.2174931719403568e-05,
876
+ "loss": 1.5206,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.49250322708767746,
881
+ "grad_norm": 0.8374618600702926,
882
  "learning_rate": 1.2039285478517417e-05,
883
  "loss": 1.5363,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.4964750273061265,
888
+ "grad_norm": 0.8559233592367627,
889
  "learning_rate": 1.1903246576459398e-05,
890
  "loss": 1.5188,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.5004468275245755,
895
+ "grad_norm": 0.8928106567260038,
896
  "learning_rate": 1.1766841207303498e-05,
897
+ "loss": 1.5388,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.5044186277430245,
902
+ "grad_norm": 0.9205521273045262,
903
  "learning_rate": 1.1630095635686359e-05,
904
  "loss": 1.5246,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.5083904279614735,
909
+ "grad_norm": 0.8459927952590032,
910
  "learning_rate": 1.1493036191750067e-05,
911
  "loss": 1.5597,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.5123622281799225,
916
+ "grad_norm": 0.8954533380109134,
917
  "learning_rate": 1.1355689266072314e-05,
918
  "loss": 1.5407,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.5163340283983716,
923
+ "grad_norm": 0.8801173721660838,
924
  "learning_rate": 1.1218081304584959e-05,
925
+ "loss": 1.5358,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.5203058286168206,
930
+ "grad_norm": 0.8777403433212762,
931
  "learning_rate": 1.1080238803481878e-05,
932
  "loss": 1.5529,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.5242776288352696,
937
+ "grad_norm": 0.8804685666128383,
938
  "learning_rate": 1.0942188304117184e-05,
939
+ "loss": 1.5373,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.5282494290537186,
944
+ "grad_norm": 0.8340381127557642,
945
  "learning_rate": 1.0803956387894715e-05,
946
  "loss": 1.5454,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.5322212292721676,
951
+ "grad_norm": 0.8855144262951853,
952
  "learning_rate": 1.066556967114984e-05,
953
+ "loss": 1.5283,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.5361930294906166,
958
+ "grad_norm": 0.8545713944344788,
959
  "learning_rate": 1.0527054800024537e-05,
960
  "loss": 1.5434,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.5401648297090657,
965
+ "grad_norm": 0.8489068751890607,
966
  "learning_rate": 1.0388438445336677e-05,
967
  "loss": 1.5134,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.5441366299275147,
972
+ "grad_norm": 0.8830252810636811,
973
  "learning_rate": 1.0249747297444659e-05,
974
  "loss": 1.5412,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.5481084301459637,
979
+ "grad_norm": 0.815565252068504,
980
  "learning_rate": 1.0111008061108176e-05,
981
  "loss": 1.5327,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.5520802303644127,
986
+ "grad_norm": 0.8249011581838526,
987
  "learning_rate": 9.972247450346272e-06,
988
  "loss": 1.5083,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.5560520305828617,
993
+ "grad_norm": 0.8855145666062009,
994
  "learning_rate": 9.833492183293616e-06,
995
  "loss": 1.5481,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.5600238308013107,
1000
+ "grad_norm": 0.8735055764957946,
1001
  "learning_rate": 9.69476897705595e-06,
1002
  "loss": 1.5224,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.5639956310197597,
1007
+ "grad_norm": 0.8651835408338951,
1008
  "learning_rate": 9.55610454256575e-06,
1009
+ "loss": 1.5291,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.5679674312382087,
1014
+ "grad_norm": 0.8804958974715262,
1015
  "learning_rate": 9.417525579439094e-06,
1016
+ "loss": 1.5248,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.5719392314566577,
1021
+ "grad_norm": 0.8450121942474024,
1022
  "learning_rate": 9.279058770834679e-06,
1023
  "loss": 1.5264,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.5759110316751067,
1028
+ "grad_norm": 0.8556582918574475,
1029
  "learning_rate": 9.140730778316037e-06,
1030
+ "loss": 1.5464,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.5798828318935557,
1035
+ "grad_norm": 0.8851588599533484,
1036
  "learning_rate": 9.002568236717863e-06,
1037
  "loss": 1.5389,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.5838546321120047,
1042
+ "grad_norm": 0.8608369842804535,
1043
  "learning_rate": 8.864597749017566e-06,
1044
  "loss": 1.5392,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.5878264323304537,
1049
+ "grad_norm": 0.8184232636661748,
1050
  "learning_rate": 8.72684588121287e-06,
1051
+ "loss": 1.558,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.5917982325489027,
1056
+ "grad_norm": 0.84877451025036,
1057
  "learning_rate": 8.589339157206583e-06,
1058
  "loss": 1.5388,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.5957700327673519,
1063
+ "grad_norm": 0.8092058683281641,
1064
  "learning_rate": 8.452104053699474e-06,
1065
  "loss": 1.5313,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.5997418329858009,
1070
+ "grad_norm": 0.8452233779619619,
1071
  "learning_rate": 8.315166995092206e-06,
1072
  "loss": 1.5259,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.6037136332042499,
1077
+ "grad_norm": 0.8484399724532287,
1078
  "learning_rate": 8.178554348397388e-06,
1079
+ "loss": 1.5193,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.6076854334226989,
1084
+ "grad_norm": 0.8144552151209362,
1085
  "learning_rate": 8.042292418162611e-06,
1086
  "loss": 1.5046,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.6116572336411479,
1091
+ "grad_norm": 0.8339533354115302,
1092
  "learning_rate": 7.906407441405586e-06,
1093
  "loss": 1.5372,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.6156290338595969,
1098
+ "grad_norm": 0.8305252567757494,
1099
  "learning_rate": 7.770925582562228e-06,
1100
  "loss": 1.5365,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.6196008340780459,
1105
+ "grad_norm": 0.8327672905609981,
1106
  "learning_rate": 7.635872928448734e-06,
1107
  "loss": 1.5326,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.6235726342964949,
1112
+ "grad_norm": 0.844461561584183,
1113
  "learning_rate": 7.501275483238619e-06,
1114
  "loss": 1.543,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.6275444345149439,
1119
+ "grad_norm": 0.8619627758469628,
1120
  "learning_rate": 7.367159163455648e-06,
1121
  "loss": 1.5259,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.6315162347333929,
1126
+ "grad_norm": 0.8366660180458262,
1127
  "learning_rate": 7.2335497929836565e-06,
1128
+ "loss": 1.5465,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.6354880349518419,
1133
+ "grad_norm": 0.8803323174183961,
1134
  "learning_rate": 7.10047309809418e-06,
1135
+ "loss": 1.5412,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.6394598351702909,
1140
+ "grad_norm": 0.8544630082563632,
1141
  "learning_rate": 6.967954702492939e-06,
1142
  "loss": 1.5207,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.6434316353887399,
1147
+ "grad_norm": 0.8328235652984776,
1148
  "learning_rate": 6.8360201223860024e-06,
1149
  "loss": 1.5407,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.6474034356071889,
1154
+ "grad_norm": 0.8462106579634066,
1155
  "learning_rate": 6.704694761566697e-06,
1156
  "loss": 1.5217,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.6513752358256379,
1161
+ "grad_norm": 0.8277784955761321,
1162
  "learning_rate": 6.574003906524149e-06,
1163
  "loss": 1.5389,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.655347036044087,
1168
+ "grad_norm": 0.8532449534893878,
1169
  "learning_rate": 6.443972721574409e-06,
1170
  "loss": 1.5046,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.659318836262536,
1175
+ "grad_norm": 0.8230178573303815,
1176
  "learning_rate": 6.314626244015099e-06,
1177
+ "loss": 1.5062,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.663290636480985,
1182
+ "grad_norm": 0.84801479235042,
1183
  "learning_rate": 6.18598937930452e-06,
1184
+ "loss": 1.5203,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.667262436699434,
1189
+ "grad_norm": 0.8397075331547054,
1190
  "learning_rate": 6.058086896266149e-06,
1191
+ "loss": 1.5242,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.671234236917883,
1196
+ "grad_norm": 0.8597360676196729,
1197
  "learning_rate": 5.930943422319453e-06,
1198
  "loss": 1.5055,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.6752060371363321,
1203
+ "grad_norm": 0.8775604658344065,
1204
  "learning_rate": 5.80458343873789e-06,
1205
  "loss": 1.5257,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.6791778373547811,
1210
+ "grad_norm": 0.8235956970430471,
1211
  "learning_rate": 5.679031275935104e-06,
1212
  "loss": 1.5312,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.6831496375732301,
1217
+ "grad_norm": 0.8867364985685439,
1218
  "learning_rate": 5.55431110878014e-06,
1219
  "loss": 1.5074,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.6871214377916791,
1224
+ "grad_norm": 0.861522055731343,
1225
  "learning_rate": 5.430446951942597e-06,
1226
  "loss": 1.538,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.6910932380101281,
1231
+ "grad_norm": 0.8275026007168744,
1232
  "learning_rate": 5.307462655268651e-06,
1233
  "loss": 1.5146,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.6950650382285771,
1238
+ "grad_norm": 1.0158409774185204,
1239
  "learning_rate": 5.185381899188811e-06,
1240
+ "loss": 1.5276,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.6990368384470261,
1245
+ "grad_norm": 0.8341799420785135,
1246
  "learning_rate": 5.064228190158274e-06,
1247
+ "loss": 1.5281,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.7030086386654751,
1252
+ "grad_norm": 0.8162862561088432,
1253
  "learning_rate": 4.944024856130813e-06,
1254
  "loss": 1.5093,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.7069804388839241,
1259
+ "grad_norm": 0.8479134502646151,
1260
  "learning_rate": 4.824795042066997e-06,
1261
  "loss": 1.5455,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.7109522391023732,
1266
+ "grad_norm": 0.8433313440901115,
1267
  "learning_rate": 4.706561705477687e-06,
1268
+ "loss": 1.5226,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.7149240393208222,
1273
+ "grad_norm": 0.8370841428358435,
1274
  "learning_rate": 4.5893476120035895e-06,
1275
+ "loss": 1.5412,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.7188958395392712,
1280
+ "grad_norm": 0.8424960564588163,
1281
  "learning_rate": 4.473175331031765e-06,
1282
+ "loss": 1.5175,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.7228676397577202,
1287
+ "grad_norm": 0.8032981360581487,
1288
  "learning_rate": 4.358067231349942e-06,
1289
  "loss": 1.5276,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.7268394399761692,
1294
+ "grad_norm": 0.7960926551931078,
1295
  "learning_rate": 4.244045476839439e-06,
1296
+ "loss": 1.5167,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.7308112401946182,
1301
+ "grad_norm": 0.838320712946578,
1302
  "learning_rate": 4.131132022207537e-06,
1303
  "loss": 1.5445,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.7347830404130672,
1308
+ "grad_norm": 0.8306388787438427,
1309
  "learning_rate": 4.019348608760137e-06,
1310
  "loss": 1.5374,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.7387548406315162,
1315
+ "grad_norm": 0.8194701201913649,
1316
  "learning_rate": 3.908716760215513e-06,
1317
  "loss": 1.5204,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.7427266408499652,
1322
+ "grad_norm": 0.8194871368550529,
1323
  "learning_rate": 3.799257778559955e-06,
1324
+ "loss": 1.5292,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.7466984410684142,
1329
+ "grad_norm": 0.824217253107322,
1330
  "learning_rate": 3.6909927399460942e-06,
1331
  "loss": 1.5336,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.7506702412868632,
1336
+ "grad_norm": 0.8142575339219603,
1337
  "learning_rate": 3.5839424906347274e-06,
1338
+ "loss": 1.5092,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.7546420415053123,
1343
+ "grad_norm": 0.807677248718189,
1344
  "learning_rate": 3.4781276429809153e-06,
1345
  "loss": 1.5314,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.7586138417237613,
1350
+ "grad_norm": 0.8089021775387019,
1351
  "learning_rate": 3.3735685714650925e-06,
1352
  "loss": 1.5235,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.7625856419422103,
1357
+ "grad_norm": 0.8366198862903579,
1358
  "learning_rate": 3.270285408769991e-06,
1359
  "loss": 1.5381,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.7665574421606594,
1364
+ "grad_norm": 0.8405851483984087,
1365
  "learning_rate": 3.168298041904141e-06,
1366
  "loss": 1.5217,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.7705292423791084,
1371
+ "grad_norm": 0.8520055208027894,
1372
  "learning_rate": 3.0676261083726466e-06,
1373
  "loss": 1.5293,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.7745010425975574,
1378
+ "grad_norm": 0.8355631433834037,
1379
  "learning_rate": 2.968288992396009e-06,
1380
+ "loss": 1.5132,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.7784728428160064,
1385
+ "grad_norm": 0.8291513738741879,
1386
  "learning_rate": 2.870305821177747e-06,
1387
  "loss": 1.5268,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.7824446430344554,
1392
+ "grad_norm": 0.8378530571179096,
1393
  "learning_rate": 2.773695461221464e-06,
1394
  "loss": 1.5098,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.7864164432529044,
1399
+ "grad_norm": 0.8199379802303441,
1400
  "learning_rate": 2.678476514698146e-06,
1401
  "loss": 1.5431,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.7903882434713534,
1406
+ "grad_norm": 0.8149428301792468,
1407
  "learning_rate": 2.584667315864334e-06,
1408
+ "loss": 1.5524,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.7943600436898024,
1413
+ "grad_norm": 0.8127178564027231,
1414
  "learning_rate": 2.492285927531893e-06,
1415
+ "loss": 1.5246,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.7983318439082514,
1420
+ "grad_norm": 0.8296085136845848,
1421
  "learning_rate": 2.4013501375900604e-06,
1422
  "loss": 1.5428,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.8023036441267004,
1427
+ "grad_norm": 0.8331298630227156,
1428
  "learning_rate": 2.3118774555803915e-06,
1429
  "loss": 1.5073,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.8062754443451494,
1434
+ "grad_norm": 0.8084677789049869,
1435
  "learning_rate": 2.2238851093253476e-06,
1436
  "loss": 1.518,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.8102472445635984,
1441
+ "grad_norm": 0.7995439438308841,
1442
  "learning_rate": 2.1373900416110973e-06,
1443
  "loss": 1.5272,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.8142190447820474,
1448
+ "grad_norm": 0.8236450629839644,
1449
  "learning_rate": 2.0524089069252106e-06,
1450
  "loss": 1.5028,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.8181908450004964,
1455
+ "grad_norm": 0.8275359166427305,
1456
  "learning_rate": 1.9689580682498553e-06,
1457
+ "loss": 1.5268,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.8221626452189454,
1462
+ "grad_norm": 0.8081276364031936,
1463
  "learning_rate": 1.887053593911149e-06,
1464
  "loss": 1.5427,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.8261344454373944,
1469
+ "grad_norm": 0.8237710475989976,
1470
  "learning_rate": 1.806711254485215e-06,
1471
+ "loss": 1.5389,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.8301062456558436,
1476
+ "grad_norm": 0.8384266531879961,
1477
  "learning_rate": 1.727946519761583e-06,
1478
+ "loss": 1.5015,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.8340780458742926,
1483
+ "grad_norm": 0.8364585808120162,
1484
  "learning_rate": 1.6507745557645127e-06,
1485
  "loss": 1.5009,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.8380498460927416,
1490
+ "grad_norm": 0.8244749141974579,
1491
  "learning_rate": 1.575210221832799e-06,
1492
  "loss": 1.525,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.8420216463111906,
1497
+ "grad_norm": 0.8040790359125747,
1498
  "learning_rate": 1.5012680677586222e-06,
1499
+ "loss": 1.5134,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.8459934465296396,
1504
+ "grad_norm": 0.7975576330551882,
1505
+ "learning_rate": 1.4432918921243055e-06,
1506
  "loss": 1.5128,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.8499652467480886,
1511
+ "grad_norm": 0.7987138186718241,
1512
+ "learning_rate": 1.3723053285030463e-06,
1513
+ "loss": 1.5146,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.8539370469665376,
1518
+ "grad_norm": 0.8041473257969093,
1519
+ "learning_rate": 1.3029800137534632e-06,
1520
+ "loss": 1.56,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.8579088471849866,
1525
+ "grad_norm": 0.8017665713122328,
1526
+ "learning_rate": 1.235329296354526e-06,
1527
  "loss": 1.5104,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 0.8618806474034356,
1532
+ "grad_norm": 0.8106278967681714,
1533
+ "learning_rate": 1.1693662023441577e-06,
1534
  "loss": 1.5272,
1535
  "step": 1085
1536
  },
1537
  {
1538
  "epoch": 0.8658524476218846,
1539
+ "grad_norm": 0.8102683680589345,
1540
+ "learning_rate": 1.1051034328110776e-06,
1541
  "loss": 1.5276,
1542
  "step": 1090
1543
  },
1544
  {
1545
  "epoch": 0.8698242478403336,
1546
+ "grad_norm": 0.8245536236761711,
1547
+ "learning_rate": 1.0425533614492412e-06,
1548
+ "loss": 1.5436,
1549
  "step": 1095
1550
  },
1551
  {
1552
  "epoch": 0.8737960480587826,
1553
+ "grad_norm": 0.8040355908066084,
1554
+ "learning_rate": 9.817280321752898e-07,
1555
  "loss": 1.5007,
1556
  "step": 1100
1557
  },
1558
  {
1559
  "epoch": 0.8777678482772316,
1560
+ "grad_norm": 0.8123936825847906,
1561
+ "learning_rate": 9.226391568095306e-07,
1562
  "loss": 1.5176,
1563
  "step": 1105
1564
  },
1565
  {
1566
  "epoch": 0.8817396484956807,
1567
+ "grad_norm": 0.7956886909023395,
1568
+ "learning_rate": 8.652981128208315e-07,
1569
  "loss": 1.5135,
1570
  "step": 1110
1571
  },
1572
  {
1573
  "epoch": 0.8857114487141297,
1574
+ "grad_norm": 0.8198537899032718,
1575
+ "learning_rate": 8.097159411359135e-07,
1576
+ "loss": 1.5309,
1577
  "step": 1115
1578
  },
1579
  {
1580
  "epoch": 0.8896832489325787,
1581
+ "grad_norm": 0.782162948876607,
1582
+ "learning_rate": 7.559033440134311e-07,
1583
+ "loss": 1.5287,
1584
  "step": 1120
1585
  },
1586
  {
1587
  "epoch": 0.8936550491510277,
1588
+ "grad_norm": 0.8040565507060439,
1589
+ "learning_rate": 7.038706829832808e-07,
1590
+ "loss": 1.519,
1591
  "step": 1125
1592
  },
1593
  {
1594
  "epoch": 0.8976268493694767,
1595
+ "grad_norm": 0.8259422263740301,
1596
+ "learning_rate": 6.536279768514952e-07,
1597
  "loss": 1.5137,
1598
  "step": 1130
1599
  },
1600
  {
1601
  "epoch": 0.9015986495879257,
1602
+ "grad_norm": 0.8095546356558041,
1603
+ "learning_rate": 6.051848997711395e-07,
1604
  "loss": 1.5288,
1605
  "step": 1135
1606
  },
1607
  {
1608
  "epoch": 0.9055704498063747,
1609
+ "grad_norm": 0.8093720744796288,
1610
+ "learning_rate": 5.585507793795763e-07,
1611
  "loss": 1.5212,
1612
  "step": 1140
1613
  },
1614
  {
1615
  "epoch": 0.9095422500248238,
1616
+ "grad_norm": 0.8077656938263478,
1617
+ "learning_rate": 5.137345950024309e-07,
1618
  "loss": 1.4942,
1619
  "step": 1145
1620
  },
1621
  {
1622
  "epoch": 0.9135140502432728,
1623
+ "grad_norm": 0.8113816503121841,
1624
+ "learning_rate": 4.7074497592465074e-07,
1625
  "loss": 1.5361,
1626
  "step": 1150
1627
  },
1628
  {
1629
  "epoch": 0.9174858504617218,
1630
+ "grad_norm": 0.813231569130407,
1631
+ "learning_rate": 4.2959019972893644e-07,
1632
  "loss": 1.5306,
1633
  "step": 1155
1634
  },
1635
  {
1636
  "epoch": 0.9214576506801708,
1637
+ "grad_norm": 0.8099017485840051,
1638
+ "learning_rate": 3.9027819070191706e-07,
1639
+ "loss": 1.5137,
1640
  "step": 1160
1641
  },
1642
  {
1643
  "epoch": 0.9254294508986198,
1644
+ "grad_norm": 0.7881148123811212,
1645
+ "learning_rate": 3.5281651830833987e-07,
1646
+ "loss": 1.5193,
1647
  "step": 1165
1648
  },
1649
  {
1650
  "epoch": 0.9294012511170688,
1651
+ "grad_norm": 0.8097066087342274,
1652
+ "learning_rate": 3.1721239573357264e-07,
1653
+ "loss": 1.53,
1654
  "step": 1170
1655
  },
1656
  {
1657
  "epoch": 0.9333730513355178,
1658
+ "grad_norm": 0.7843276174810709,
1659
+ "learning_rate": 2.834726784947273e-07,
1660
+ "loss": 1.5407,
1661
  "step": 1175
1662
  },
1663
  {
1664
  "epoch": 0.9373448515539669,
1665
+ "grad_norm": 0.8036612590725093,
1666
+ "learning_rate": 2.5160386312063855e-07,
1667
  "loss": 1.5143,
1668
  "step": 1180
1669
  },
1670
  {
1671
  "epoch": 0.9413166517724159,
1672
+ "grad_norm": 0.7981854209700985,
1673
+ "learning_rate": 2.2161208590096407e-07,
1674
+ "loss": 1.517,
1675
  "step": 1185
1676
  },
1677
  {
1678
  "epoch": 0.9452884519908649,
1679
+ "grad_norm": 0.7979530312793264,
1680
+ "learning_rate": 1.9350312170465234e-07,
1681
  "loss": 1.5233,
1682
  "step": 1190
1683
  },
1684
  {
1685
  "epoch": 0.9492602522093139,
1686
+ "grad_norm": 0.8079703791008118,
1687
+ "learning_rate": 1.672823828680037e-07,
1688
  "loss": 1.5193,
1689
  "step": 1195
1690
  },
1691
  {
1692
  "epoch": 0.9532320524277629,
1693
+ "grad_norm": 0.8283052457474909,
1694
+ "learning_rate": 1.4295491815253138e-07,
1695
  "loss": 1.5306,
1696
  "step": 1200
1697
  },
1698
  {
1699
  "epoch": 0.9572038526462119,
1700
+ "grad_norm": 0.8204454900067728,
1701
+ "learning_rate": 1.205254117728316e-07,
1702
+ "loss": 1.5186,
1703
  "step": 1205
1704
  },
1705
  {
1706
  "epoch": 0.9611756528646609,
1707
+ "grad_norm": 0.8209623860337765,
1708
+ "learning_rate": 9.999818249464389e-08,
1709
  "loss": 1.5262,
1710
  "step": 1210
1711
  },
1712
  {
1713
  "epoch": 0.9651474530831099,
1714
+ "grad_norm": 0.8116412111952241,
1715
+ "learning_rate": 8.137718280328166e-08,
1716
  "loss": 1.5243,
1717
  "step": 1215
1718
  },
1719
  {
1720
  "epoch": 0.9691192533015589,
1721
+ "grad_norm": 0.7858348914688461,
1722
+ "learning_rate": 6.46659981425879e-08,
1723
  "loss": 1.5169,
1724
  "step": 1220
1725
  },
1726
  {
1727
  "epoch": 0.9730910535200079,
1728
+ "grad_norm": 0.7965100492008761,
1729
+ "learning_rate": 4.9867846224559423e-08,
1730
  "loss": 1.5238,
1731
  "step": 1225
1732
  },
1733
  {
1734
  "epoch": 0.9770628537384569,
1735
+ "grad_norm": 0.7942704227422176,
1736
+ "learning_rate": 3.6985576409787064e-08,
1737
+ "loss": 1.5131,
1738
  "step": 1230
1739
  },
1740
  {
1741
  "epoch": 0.9810346539569059,
1742
+ "grad_norm": 0.8182170297859925,
1743
+ "learning_rate": 2.6021669158811104e-08,
1744
  "loss": 1.5142,
1745
  "step": 1235
1746
  },
1747
  {
1748
  "epoch": 0.9850064541753549,
1749
+ "grad_norm": 0.7890859967844505,
1750
+ "learning_rate": 1.697823555451561e-08,
1751
  "loss": 1.5156,
1752
  "step": 1240
1753
  },
1754
  {
1755
  "epoch": 0.988978254393804,
1756
+ "grad_norm": 0.792384402297532,
1757
+ "learning_rate": 9.857016895642446e-09,
1758
  "loss": 1.5025,
1759
  "step": 1245
1760
  },
1761
  {
1762
  "epoch": 0.992950054612253,
1763
+ "grad_norm": 0.7972540802101755,
1764
+ "learning_rate": 4.6593843615050374e-09,
1765
  "loss": 1.5062,
1766
  "step": 1250
1767
  },
1768
  {
1769
  "epoch": 0.9969218548307021,
1770
+ "grad_norm": 0.7796868385026204,
1771
+ "learning_rate": 1.386338747972893e-09,
1772
  "loss": 1.526,
1773
  "step": 1255
1774
  },
1775
  {
1776
  "epoch": 0.9993049349617714,
1777
+ "eval_loss": 1.5326974391937256,
1778
+ "eval_runtime": 248.3752,
1779
+ "eval_samples_per_second": 107.676,
1780
+ "eval_steps_per_second": 4.489,
1781
  "step": 1258
1782
  },
1783
  {
1784
  "epoch": 0.9993049349617714,
1785
  "step": 1258,
1786
+ "total_flos": 106140763422720.0,
1787
+ "train_loss": 1.5477893754295022,
1788
+ "train_runtime": 7899.7649,
1789
+ "train_samples_per_second": 30.594,
1790
+ "train_steps_per_second": 0.159
1791
  }
1792
  ],
1793
  "logging_steps": 5,
 
1807
  "attributes": {}
1808
  }
1809
  },
1810
+ "total_flos": 106140763422720.0,
1811
  "train_batch_size": 8,
1812
  "trial_name": null,
1813
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d29a7994edea432db07e13baa6ac32c58725e9c7a051fcf1484c7961779a9062
3
  size 7224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a415c9358b725cf82ee07da89920514942286474d37e169ddf39186b08a3429d
3
  size 7224