alvarobartt HF staff commited on
Commit
452a9b9
1 Parent(s): 33d1954

Model save

Browse files
Files changed (4) hide show
  1. README.md +7 -7
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +1925 -707
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: google/gemma-2-9b-it
3
  datasets:
4
  - generator
5
  library_name: peft
@@ -9,16 +9,16 @@ tags:
9
  - sft
10
  - generated_from_trainer
11
  model-index:
12
- - name: gemma-2-9b-it-lora-magicoder
13
  results: []
14
  ---
15
 
16
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
  should probably proofread and complete it, then remove this comment. -->
18
 
19
- # gemma-2-9b-it-lora-magicoder
20
 
21
- This model is a fine-tuned version of [google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it) on the alvarobartt/Magicoder-OAI dataset.
22
 
23
  ## Model description
24
 
@@ -42,10 +42,10 @@ The following hyperparameters were used during training:
42
  - eval_batch_size: 8
43
  - seed: 42
44
  - distributed_type: multi-GPU
45
- - num_devices: 4
46
  - gradient_accumulation_steps: 2
47
- - total_train_batch_size: 32
48
- - total_eval_batch_size: 32
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - lr_scheduler_warmup_ratio: 0.1
 
1
  ---
2
+ base_model: google/gemma-2-2b-it
3
  datasets:
4
  - generator
5
  library_name: peft
 
9
  - sft
10
  - generated_from_trainer
11
  model-index:
12
+ - name: gemma-2-2b-it-lora-magicoder
13
  results: []
14
  ---
15
 
16
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
  should probably proofread and complete it, then remove this comment. -->
18
 
19
+ # gemma-2-2b-it-lora-magicoder
20
 
21
+ This model is a fine-tuned version of [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) on the generator dataset.
22
 
23
  ## Model description
24
 
 
42
  - eval_batch_size: 8
43
  - seed: 42
44
  - distributed_type: multi-GPU
45
+ - num_devices: 2
46
  - gradient_accumulation_steps: 2
47
+ - total_train_batch_size: 16
48
+ - total_eval_batch_size: 16
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - lr_scheduler_warmup_ratio: 0.1
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.997420464316423,
3
- "total_flos": 1.7624457120055296e+16,
4
- "train_loss": 0.4506890232783245,
5
- "train_runtime": 9589.9992,
6
  "train_samples": 67677,
7
- "train_samples_per_second": 5.818,
8
- "train_steps_per_second": 0.182
9
  }
 
1
  {
2
+ "epoch": 2.9987096774193547,
3
+ "total_flos": 6514990399881216.0,
4
+ "train_loss": 0.5279938033170147,
5
+ "train_runtime": 9440.7969,
6
  "train_samples": 67677,
7
+ "train_samples_per_second": 5.91,
8
+ "train_steps_per_second": 0.369
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.997420464316423,
3
- "total_flos": 1.7624457120055296e+16,
4
- "train_loss": 0.4506890232783245,
5
- "train_runtime": 9589.9992,
6
  "train_samples": 67677,
7
- "train_samples_per_second": 5.818,
8
- "train_steps_per_second": 0.182
9
  }
 
1
  {
2
+ "epoch": 2.9987096774193547,
3
+ "total_flos": 6514990399881216.0,
4
+ "train_loss": 0.5279938033170147,
5
+ "train_runtime": 9440.7969,
6
  "train_samples": 67677,
7
+ "train_samples_per_second": 5.91,
8
+ "train_steps_per_second": 0.369
9
  }
trainer_state.json CHANGED
@@ -1,1243 +1,2461 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.997420464316423,
5
  "eval_steps": 500,
6
- "global_step": 1743,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.017196904557179708,
13
- "grad_norm": 1.236184773002013,
14
- "learning_rate": 1.1428571428571429e-05,
15
- "loss": 1.185,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.034393809114359415,
20
- "grad_norm": 0.5456076333168367,
21
- "learning_rate": 2.2857142857142858e-05,
22
- "loss": 1.0131,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.051590713671539126,
27
- "grad_norm": 0.35549053304697176,
28
- "learning_rate": 3.428571428571429e-05,
29
- "loss": 0.8296,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.06878761822871883,
34
- "grad_norm": 0.254673976618386,
35
- "learning_rate": 4.5714285714285716e-05,
36
- "loss": 0.7454,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.08598452278589853,
41
- "grad_norm": 0.23070603652224944,
42
- "learning_rate": 5.714285714285714e-05,
43
- "loss": 0.6662,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.10318142734307825,
48
- "grad_norm": 0.17478929504818824,
49
- "learning_rate": 6.857142857142858e-05,
50
- "loss": 0.596,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.12037833190025796,
55
- "grad_norm": 0.1561819886139925,
56
- "learning_rate": 8e-05,
57
- "loss": 0.578,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.13757523645743766,
62
- "grad_norm": 0.20767032014128362,
63
- "learning_rate": 9.142857142857143e-05,
64
- "loss": 0.56,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.15477214101461736,
69
- "grad_norm": 0.1783753453274605,
70
- "learning_rate": 0.00010285714285714286,
71
- "loss": 0.5486,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.17196904557179707,
76
- "grad_norm": 0.21780447055549776,
77
- "learning_rate": 0.00011428571428571428,
78
- "loss": 0.5442,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.18916595012897677,
83
- "grad_norm": 0.17636411695201717,
84
- "learning_rate": 0.00012571428571428572,
85
- "loss": 0.5336,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.2063628546861565,
90
- "grad_norm": 0.17514613701190404,
91
- "learning_rate": 0.00013714285714285716,
92
- "loss": 0.5267,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.2235597592433362,
97
- "grad_norm": 0.19025963821244604,
98
- "learning_rate": 0.00014857142857142857,
99
- "loss": 0.5274,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.2407566638005159,
104
- "grad_norm": 0.18019488365092756,
105
- "learning_rate": 0.00016,
106
- "loss": 0.5166,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.2579535683576956,
111
- "grad_norm": 0.17880762343061918,
112
- "learning_rate": 0.00017142857142857143,
113
- "loss": 0.5203,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.2751504729148753,
118
- "grad_norm": 0.18388862726424096,
119
- "learning_rate": 0.00018285714285714286,
120
- "loss": 0.5224,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.292347377472055,
125
- "grad_norm": 0.16430698379785788,
126
- "learning_rate": 0.0001942857142857143,
127
- "loss": 0.5188,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.30954428202923473,
132
- "grad_norm": 0.15725107599349494,
133
- "learning_rate": 0.00019999498219234568,
134
- "loss": 0.516,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.32674118658641443,
139
- "grad_norm": 0.15173489549988936,
140
- "learning_rate": 0.0001999548427524678,
141
- "loss": 0.5058,
142
  "step": 190
143
  },
144
  {
145
- "epoch": 0.34393809114359414,
146
- "grad_norm": 0.16657669365074929,
147
- "learning_rate": 0.0001998745799852668,
148
- "loss": 0.5072,
149
  "step": 200
150
  },
151
  {
152
- "epoch": 0.36113499570077384,
153
- "grad_norm": 0.14245522126193838,
154
- "learning_rate": 0.00019975422610938462,
155
- "loss": 0.5092,
156
  "step": 210
157
  },
158
  {
159
- "epoch": 0.37833190025795355,
160
- "grad_norm": 0.14204763106782423,
161
- "learning_rate": 0.00019959382943661704,
162
- "loss": 0.5124,
163
  "step": 220
164
  },
165
  {
166
- "epoch": 0.39552880481513325,
167
- "grad_norm": 0.14674244927319882,
168
- "learning_rate": 0.00019939345435252088,
169
- "loss": 0.515,
170
  "step": 230
171
  },
172
  {
173
- "epoch": 0.412725709372313,
174
- "grad_norm": 0.13843089052767493,
175
- "learning_rate": 0.00019915318129056853,
176
- "loss": 0.4994,
177
  "step": 240
178
  },
179
  {
180
- "epoch": 0.4299226139294927,
181
- "grad_norm": 0.146900895726969,
182
- "learning_rate": 0.00019887310669986085,
183
- "loss": 0.5052,
184
  "step": 250
185
  },
186
  {
187
- "epoch": 0.4471195184866724,
188
- "grad_norm": 0.14008859218618483,
189
- "learning_rate": 0.00019855334300641114,
190
- "loss": 0.5041,
191
  "step": 260
192
  },
193
  {
194
- "epoch": 0.4643164230438521,
195
- "grad_norm": 0.18809570469767778,
196
- "learning_rate": 0.0001981940185680156,
197
- "loss": 0.4978,
198
  "step": 270
199
  },
200
  {
201
- "epoch": 0.4815133276010318,
202
- "grad_norm": 0.14104099043343712,
203
- "learning_rate": 0.00019779527762272877,
204
- "loss": 0.5001,
205
  "step": 280
206
  },
207
  {
208
- "epoch": 0.49871023215821153,
209
- "grad_norm": 0.13804026353342516,
210
- "learning_rate": 0.0001973572802309642,
211
- "loss": 0.4885,
212
  "step": 290
213
  },
214
  {
215
- "epoch": 0.5159071367153912,
216
- "grad_norm": 0.14300868500069924,
217
- "learning_rate": 0.00019688020221124376,
218
- "loss": 0.495,
219
  "step": 300
220
  },
221
  {
222
- "epoch": 0.5331040412725709,
223
- "grad_norm": 0.14885187886758067,
224
- "learning_rate": 0.00019636423506962181,
225
- "loss": 0.4963,
226
  "step": 310
227
  },
228
  {
229
- "epoch": 0.5503009458297506,
230
- "grad_norm": 0.13442343336925347,
231
- "learning_rate": 0.00019580958592281167,
232
- "loss": 0.4924,
233
  "step": 320
234
  },
235
  {
236
- "epoch": 0.5674978503869303,
237
- "grad_norm": 0.13673211813600614,
238
- "learning_rate": 0.00019521647741504604,
239
- "loss": 0.5004,
240
  "step": 330
241
  },
242
  {
243
- "epoch": 0.58469475494411,
244
- "grad_norm": 0.13956825916960136,
245
- "learning_rate": 0.00019458514762870426,
246
- "loss": 0.485,
247
  "step": 340
248
  },
249
  {
250
- "epoch": 0.6018916595012898,
251
- "grad_norm": 0.14073396199862337,
252
- "learning_rate": 0.0001939158499887428,
253
- "loss": 0.4933,
254
  "step": 350
255
  },
256
  {
257
- "epoch": 0.6190885640584695,
258
- "grad_norm": 0.13603301551856595,
259
- "learning_rate": 0.00019320885316096654,
260
- "loss": 0.4938,
261
  "step": 360
262
  },
263
  {
264
- "epoch": 0.6362854686156492,
265
- "grad_norm": 0.1420552122294903,
266
- "learning_rate": 0.00019246444094418255,
267
- "loss": 0.4786,
268
  "step": 370
269
  },
270
  {
271
- "epoch": 0.6534823731728289,
272
- "grad_norm": 0.1414519600117056,
273
- "learning_rate": 0.00019168291215627926,
274
- "loss": 0.4926,
275
  "step": 380
276
  },
277
  {
278
- "epoch": 0.6706792777300086,
279
- "grad_norm": 0.13984821923297133,
280
- "learning_rate": 0.00019086458051427622,
281
- "loss": 0.4911,
282
  "step": 390
283
  },
284
  {
285
- "epoch": 0.6878761822871883,
286
- "grad_norm": 0.13752756195301288,
287
- "learning_rate": 0.00019000977450839393,
288
- "loss": 0.4827,
289
  "step": 400
290
  },
291
  {
292
- "epoch": 0.705073086844368,
293
- "grad_norm": 0.13335244333926366,
294
- "learning_rate": 0.00018911883727019285,
295
- "loss": 0.4898,
296
  "step": 410
297
  },
298
  {
299
- "epoch": 0.7222699914015477,
300
- "grad_norm": 0.13230952803690713,
301
- "learning_rate": 0.0001881921264348355,
302
- "loss": 0.4854,
303
  "step": 420
304
  },
305
  {
306
- "epoch": 0.7394668959587274,
307
- "grad_norm": 0.13478694398925248,
308
- "learning_rate": 0.00018723001399752653,
309
- "loss": 0.4897,
310
  "step": 430
311
  },
312
  {
313
- "epoch": 0.7566638005159071,
314
- "grad_norm": 0.1442034436213339,
315
- "learning_rate": 0.0001862328861641883,
316
- "loss": 0.4822,
317
  "step": 440
318
  },
319
  {
320
- "epoch": 0.7738607050730868,
321
- "grad_norm": 0.13449084966209246,
322
- "learning_rate": 0.00018520114319643235,
323
- "loss": 0.4886,
324
  "step": 450
325
  },
326
  {
327
- "epoch": 0.7910576096302665,
328
- "grad_norm": 0.1312938599246684,
329
- "learning_rate": 0.0001841351992508885,
330
- "loss": 0.4786,
331
  "step": 460
332
  },
333
  {
334
- "epoch": 0.8082545141874462,
335
- "grad_norm": 0.12664552711595364,
336
- "learning_rate": 0.0001830354822129564,
337
- "loss": 0.4882,
338
  "step": 470
339
  },
340
  {
341
- "epoch": 0.825451418744626,
342
- "grad_norm": 0.134819817437524,
343
- "learning_rate": 0.00018190243352504597,
344
- "loss": 0.48,
345
  "step": 480
346
  },
347
  {
348
- "epoch": 0.8426483233018057,
349
- "grad_norm": 0.14466404063388616,
350
- "learning_rate": 0.00018073650800937624,
351
- "loss": 0.4809,
352
  "step": 490
353
  },
354
  {
355
- "epoch": 0.8598452278589854,
356
- "grad_norm": 0.13311350418133092,
357
- "learning_rate": 0.00017953817368540292,
358
- "loss": 0.4834,
359
  "step": 500
360
  },
361
  {
362
- "epoch": 0.8770421324161651,
363
- "grad_norm": 0.13856027344204028,
364
- "learning_rate": 0.0001783079115819486,
365
- "loss": 0.4913,
366
  "step": 510
367
  },
368
  {
369
- "epoch": 0.8942390369733448,
370
- "grad_norm": 0.12870004634007948,
371
- "learning_rate": 0.00017704621554411084,
372
- "loss": 0.4827,
373
  "step": 520
374
  },
375
  {
376
- "epoch": 0.9114359415305245,
377
- "grad_norm": 0.12894303718503525,
378
- "learning_rate": 0.0001757535920350255,
379
- "loss": 0.4768,
380
  "step": 530
381
  },
382
  {
383
- "epoch": 0.9286328460877042,
384
- "grad_norm": 0.13086927730335407,
385
- "learning_rate": 0.0001744305599325652,
386
- "loss": 0.4857,
387
  "step": 540
388
  },
389
  {
390
- "epoch": 0.945829750644884,
391
- "grad_norm": 0.12929634161460313,
392
- "learning_rate": 0.00017307765032105406,
393
- "loss": 0.4883,
394
  "step": 550
395
  },
396
  {
397
- "epoch": 0.9630266552020637,
398
- "grad_norm": 0.13721669025539573,
399
- "learning_rate": 0.00017169540627808274,
400
- "loss": 0.4875,
401
  "step": 560
402
  },
403
  {
404
- "epoch": 0.9802235597592434,
405
- "grad_norm": 0.1285154415557853,
406
- "learning_rate": 0.00017028438265650933,
407
- "loss": 0.4821,
408
  "step": 570
409
  },
410
  {
411
- "epoch": 0.9974204643164231,
412
- "grad_norm": 0.13060285124046392,
413
- "learning_rate": 0.0001688451458617332,
414
- "loss": 0.4796,
415
  "step": 580
416
  },
417
  {
418
- "epoch": 1.0146173688736027,
419
- "grad_norm": 0.136066972815243,
420
- "learning_rate": 0.00016737827362433164,
421
- "loss": 0.4527,
422
  "step": 590
423
  },
424
  {
425
- "epoch": 1.0318142734307825,
426
- "grad_norm": 0.1366031218446295,
427
- "learning_rate": 0.0001658843547681506,
428
- "loss": 0.4484,
429
  "step": 600
430
  },
431
  {
432
- "epoch": 1.049011177987962,
433
- "grad_norm": 0.1378157386033563,
434
- "learning_rate": 0.000164363988973942,
435
- "loss": 0.4465,
436
  "step": 610
437
  },
438
  {
439
- "epoch": 1.0662080825451419,
440
- "grad_norm": 0.1435554531245416,
441
- "learning_rate": 0.00016281778653864316,
442
- "loss": 0.4415,
443
  "step": 620
444
  },
445
  {
446
- "epoch": 1.0834049871023215,
447
- "grad_norm": 0.14661925606567783,
448
- "learning_rate": 0.00016124636813039502,
449
- "loss": 0.441,
450
  "step": 630
451
  },
452
  {
453
- "epoch": 1.1006018916595013,
454
- "grad_norm": 0.13987832152903712,
455
- "learning_rate": 0.0001596503645393966,
456
- "loss": 0.4359,
457
  "step": 640
458
  },
459
  {
460
- "epoch": 1.117798796216681,
461
- "grad_norm": 0.14735126394013195,
462
- "learning_rate": 0.0001580304164246968,
463
- "loss": 0.4473,
464
  "step": 650
465
  },
466
  {
467
- "epoch": 1.1349957007738607,
468
- "grad_norm": 0.1429953902523938,
469
- "learning_rate": 0.0001563871740570245,
470
- "loss": 0.4451,
471
  "step": 660
472
  },
473
  {
474
- "epoch": 1.1521926053310405,
475
- "grad_norm": 0.1459735378688384,
476
- "learning_rate": 0.00015472129705776047,
477
- "loss": 0.4422,
478
  "step": 670
479
  },
480
  {
481
- "epoch": 1.16938950988822,
482
- "grad_norm": 0.14561658254102702,
483
- "learning_rate": 0.00015303345413415564,
484
- "loss": 0.4409,
485
  "step": 680
486
  },
487
  {
488
- "epoch": 1.1865864144454,
489
- "grad_norm": 0.14737248104484174,
490
- "learning_rate": 0.00015132432281090256,
491
- "loss": 0.4419,
492
  "step": 690
493
  },
494
  {
495
- "epoch": 1.2037833190025795,
496
- "grad_norm": 0.15311335569967213,
497
- "learning_rate": 0.0001495945891581668,
498
- "loss": 0.4419,
499
  "step": 700
500
  },
501
  {
502
- "epoch": 1.2209802235597593,
503
- "grad_norm": 0.14524361407042563,
504
- "learning_rate": 0.00014784494751618853,
505
- "loss": 0.4411,
506
  "step": 710
507
  },
508
  {
509
- "epoch": 1.238177128116939,
510
- "grad_norm": 0.14910830182050996,
511
- "learning_rate": 0.0001460761002165645,
512
- "loss": 0.4471,
513
  "step": 720
514
  },
515
  {
516
- "epoch": 1.2553740326741187,
517
- "grad_norm": 0.15226336213789984,
518
- "learning_rate": 0.00014428875730032145,
519
- "loss": 0.4423,
520
  "step": 730
521
  },
522
  {
523
- "epoch": 1.2725709372312983,
524
- "grad_norm": 0.15781539164450759,
525
- "learning_rate": 0.00014248363623289574,
526
- "loss": 0.4359,
527
  "step": 740
528
  },
529
  {
530
- "epoch": 1.2897678417884781,
531
- "grad_norm": 0.14138385096885275,
532
- "learning_rate": 0.00014066146161613208,
533
- "loss": 0.4414,
534
  "step": 750
535
  },
536
  {
537
- "epoch": 1.3069647463456577,
538
- "grad_norm": 0.14745231010770607,
539
- "learning_rate": 0.00013882296489741783,
540
- "loss": 0.4457,
541
  "step": 760
542
  },
543
  {
544
- "epoch": 1.3241616509028376,
545
- "grad_norm": 0.155323506204182,
546
- "learning_rate": 0.00013696888407606952,
547
- "loss": 0.4468,
548
  "step": 770
549
  },
550
  {
551
- "epoch": 1.3413585554600171,
552
- "grad_norm": 0.14906188336013929,
553
- "learning_rate": 0.0001350999634070889,
554
- "loss": 0.4561,
555
  "step": 780
556
  },
557
  {
558
- "epoch": 1.358555460017197,
559
- "grad_norm": 0.1509482546916061,
560
- "learning_rate": 0.0001332169531024085,
561
- "loss": 0.4488,
562
  "step": 790
563
  },
564
  {
565
- "epoch": 1.3757523645743766,
566
- "grad_norm": 0.1577110542147225,
567
- "learning_rate": 0.00013132060902974554,
568
- "loss": 0.4481,
569
  "step": 800
570
  },
571
  {
572
- "epoch": 1.3929492691315564,
573
- "grad_norm": 0.1519302579308639,
574
- "learning_rate": 0.00012941169240918534,
575
- "loss": 0.4434,
576
  "step": 810
577
  },
578
  {
579
- "epoch": 1.410146173688736,
580
- "grad_norm": 0.16574786040919845,
581
- "learning_rate": 0.00012749096950761702,
582
- "loss": 0.4422,
583
  "step": 820
584
  },
585
  {
586
- "epoch": 1.4273430782459158,
587
- "grad_norm": 0.14939058136631353,
588
- "learning_rate": 0.00012555921133114247,
589
- "loss": 0.4439,
590
  "step": 830
591
  },
592
  {
593
- "epoch": 1.4445399828030954,
594
- "grad_norm": 0.14433259289852834,
595
- "learning_rate": 0.00012361719331558345,
596
- "loss": 0.4412,
597
  "step": 840
598
  },
599
  {
600
- "epoch": 1.4617368873602752,
601
- "grad_norm": 0.14779220063991433,
602
- "learning_rate": 0.00012166569501521017,
603
- "loss": 0.444,
604
  "step": 850
605
  },
606
  {
607
- "epoch": 1.4789337919174548,
608
- "grad_norm": 0.15109790098843823,
609
- "learning_rate": 0.00011970549978981715,
610
- "loss": 0.4489,
611
  "step": 860
612
  },
613
  {
614
- "epoch": 1.4961306964746346,
615
- "grad_norm": 0.14824981470720835,
616
- "learning_rate": 0.00011773739449027108,
617
- "loss": 0.4377,
618
  "step": 870
619
  },
620
  {
621
- "epoch": 1.5133276010318144,
622
- "grad_norm": 0.14490199589122096,
623
- "learning_rate": 0.00011576216914265734,
624
- "loss": 0.4454,
625
  "step": 880
626
  },
627
  {
628
- "epoch": 1.530524505588994,
629
- "grad_norm": 0.14668336107167357,
630
- "learning_rate": 0.00011378061663115222,
631
- "loss": 0.4374,
632
  "step": 890
633
  },
634
  {
635
- "epoch": 1.5477214101461736,
636
- "grad_norm": 0.15306161877683747,
637
- "learning_rate": 0.00011179353237974756,
638
- "loss": 0.441,
639
  "step": 900
640
  },
641
  {
642
- "epoch": 1.5649183147033534,
643
- "grad_norm": 0.14629253984953647,
644
- "learning_rate": 0.0001098017140329561,
645
- "loss": 0.4402,
646
  "step": 910
647
  },
648
  {
649
- "epoch": 1.5821152192605332,
650
- "grad_norm": 0.15181211231388803,
651
- "learning_rate": 0.00010780596113562514,
652
- "loss": 0.4441,
653
  "step": 920
654
  },
655
  {
656
- "epoch": 1.5993121238177128,
657
- "grad_norm": 0.14895949287614227,
658
- "learning_rate": 0.00010580707481198796,
659
- "loss": 0.4415,
660
  "step": 930
661
  },
662
  {
663
- "epoch": 1.6165090283748924,
664
- "grad_norm": 0.14919588155250404,
665
- "learning_rate": 0.00010380585744408065,
666
- "loss": 0.4428,
667
  "step": 940
668
  },
669
  {
670
- "epoch": 1.6337059329320722,
671
- "grad_norm": 0.15195396903926595,
672
- "learning_rate": 0.00010180311234965433,
673
- "loss": 0.4338,
674
  "step": 950
675
  },
676
  {
677
- "epoch": 1.650902837489252,
678
- "grad_norm": 0.15192117531451788,
679
- "learning_rate": 9.979964345971188e-05,
680
- "loss": 0.445,
681
  "step": 960
682
  },
683
  {
684
- "epoch": 1.6680997420464316,
685
- "grad_norm": 0.14674515514268918,
686
- "learning_rate": 9.779625499579805e-05,
687
- "loss": 0.4416,
688
  "step": 970
689
  },
690
  {
691
- "epoch": 1.6852966466036112,
692
- "grad_norm": 0.14766438549971136,
693
- "learning_rate": 9.579375114717351e-05,
694
- "loss": 0.434,
695
  "step": 980
696
  },
697
  {
698
- "epoch": 1.702493551160791,
699
- "grad_norm": 0.14681911182175214,
700
- "learning_rate": 9.379293574800154e-05,
701
- "loss": 0.4363,
702
  "step": 990
703
  },
704
  {
705
- "epoch": 1.7196904557179709,
706
- "grad_norm": 0.1464492709373849,
707
- "learning_rate": 9.179461195467714e-05,
708
- "loss": 0.4353,
709
  "step": 1000
710
  },
711
  {
712
- "epoch": 1.7368873602751504,
713
- "grad_norm": 0.15011221390319449,
714
- "learning_rate": 8.979958192342862e-05,
715
- "loss": 0.4445,
716
  "step": 1010
717
  },
718
  {
719
- "epoch": 1.75408426483233,
720
- "grad_norm": 0.152429302734013,
721
- "learning_rate": 8.780864648832022e-05,
722
- "loss": 0.4404,
723
  "step": 1020
724
  },
725
  {
726
- "epoch": 1.7712811693895099,
727
- "grad_norm": 0.14681226342261067,
728
- "learning_rate": 8.58226048397857e-05,
729
- "loss": 0.4435,
730
  "step": 1030
731
  },
732
  {
733
- "epoch": 1.7884780739466897,
734
- "grad_norm": 0.15354804740449426,
735
- "learning_rate": 8.384225420382185e-05,
736
- "loss": 0.4377,
737
  "step": 1040
738
  },
739
  {
740
- "epoch": 1.8056749785038693,
741
- "grad_norm": 0.15108728472484245,
742
- "learning_rate": 8.186838952197018e-05,
743
- "loss": 0.4344,
744
  "step": 1050
745
  },
746
  {
747
- "epoch": 1.8228718830610489,
748
- "grad_norm": 0.14903695640608858,
749
- "learning_rate": 7.990180313221596e-05,
750
- "loss": 0.4381,
751
  "step": 1060
752
  },
753
  {
754
- "epoch": 1.8400687876182287,
755
- "grad_norm": 0.14676064171835554,
756
- "learning_rate": 7.794328445093208e-05,
757
- "loss": 0.4379,
758
  "step": 1070
759
  },
760
  {
761
- "epoch": 1.8572656921754085,
762
- "grad_norm": 0.15258822928003124,
763
- "learning_rate": 7.599361965599606e-05,
764
- "loss": 0.4426,
765
  "step": 1080
766
  },
767
  {
768
- "epoch": 1.874462596732588,
769
- "grad_norm": 0.26094072520435535,
770
- "learning_rate": 7.405359137120662e-05,
771
- "loss": 0.4368,
772
  "step": 1090
773
  },
774
  {
775
- "epoch": 1.8916595012897677,
776
- "grad_norm": 0.15223526485978567,
777
- "learning_rate": 7.212397835212722e-05,
778
- "loss": 0.4426,
779
  "step": 1100
780
  },
781
  {
782
- "epoch": 1.9088564058469477,
783
- "grad_norm": 0.14158786202820334,
784
- "learning_rate": 7.02055551734822e-05,
785
- "loss": 0.4304,
786
  "step": 1110
787
  },
788
  {
789
- "epoch": 1.9260533104041273,
790
- "grad_norm": 0.15562496727433017,
791
- "learning_rate": 6.829909191823121e-05,
792
- "loss": 0.4426,
793
  "step": 1120
794
  },
795
  {
796
- "epoch": 1.943250214961307,
797
- "grad_norm": 0.14930981803960258,
798
- "learning_rate": 6.640535386844679e-05,
799
- "loss": 0.4418,
800
  "step": 1130
801
  },
802
  {
803
- "epoch": 1.9604471195184867,
804
- "grad_norm": 0.1552156563224316,
805
- "learning_rate": 6.452510119811895e-05,
806
- "loss": 0.4367,
807
  "step": 1140
808
  },
809
  {
810
- "epoch": 1.9776440240756665,
811
- "grad_norm": 0.19090518551388455,
812
- "learning_rate": 6.26590886680103e-05,
813
- "loss": 0.4303,
814
  "step": 1150
815
  },
816
  {
817
- "epoch": 1.9948409286328461,
818
- "grad_norm": 0.1496699651502689,
819
- "learning_rate": 6.0808065322683993e-05,
820
- "loss": 0.4307,
821
  "step": 1160
822
  },
823
  {
824
- "epoch": 2.0120378331900257,
825
- "grad_norm": 0.16615207318616432,
826
- "learning_rate": 5.897277418982672e-05,
827
- "loss": 0.3983,
828
  "step": 1170
829
  },
830
  {
831
- "epoch": 2.0292347377472053,
832
- "grad_norm": 0.1652827612761684,
833
- "learning_rate": 5.715395198198603e-05,
834
- "loss": 0.3812,
835
  "step": 1180
836
  },
837
  {
838
- "epoch": 2.0464316423043853,
839
- "grad_norm": 0.16911724923353538,
840
- "learning_rate": 5.5352328800843724e-05,
841
- "loss": 0.3766,
842
  "step": 1190
843
  },
844
  {
845
- "epoch": 2.063628546861565,
846
- "grad_norm": 0.16951539871441879,
847
- "learning_rate": 5.356862784414199e-05,
848
- "loss": 0.3789,
849
  "step": 1200
850
  },
851
  {
852
- "epoch": 2.0808254514187445,
853
- "grad_norm": 0.17010125437035936,
854
- "learning_rate": 5.1803565115381694e-05,
855
- "loss": 0.3783,
856
  "step": 1210
857
  },
858
  {
859
- "epoch": 2.098022355975924,
860
- "grad_norm": 0.16860757374113347,
861
- "learning_rate": 5.0057849136407874e-05,
862
- "loss": 0.3815,
863
  "step": 1220
864
  },
865
  {
866
- "epoch": 2.115219260533104,
867
- "grad_norm": 0.17279827710775011,
868
- "learning_rate": 4.833218066299896e-05,
869
- "loss": 0.3775,
870
  "step": 1230
871
  },
872
  {
873
- "epoch": 2.1324161650902838,
874
- "grad_norm": 0.16772946452531912,
875
- "learning_rate": 4.6627252403573085e-05,
876
- "loss": 0.3719,
877
  "step": 1240
878
  },
879
  {
880
- "epoch": 2.1496130696474633,
881
- "grad_norm": 0.17982428185526986,
882
- "learning_rate": 4.4943748741124934e-05,
883
- "loss": 0.3779,
884
  "step": 1250
885
  },
886
  {
887
- "epoch": 2.166809974204643,
888
- "grad_norm": 0.18313593415470272,
889
- "learning_rate": 4.328234545850442e-05,
890
- "loss": 0.3836,
891
  "step": 1260
892
  },
893
  {
894
- "epoch": 2.184006878761823,
895
- "grad_norm": 0.1725383341952266,
896
- "learning_rate": 4.1643709467147615e-05,
897
- "loss": 0.369,
898
  "step": 1270
899
  },
900
  {
901
- "epoch": 2.2012037833190026,
902
- "grad_norm": 0.1971081873293993,
903
- "learning_rate": 4.002849853936891e-05,
904
- "loss": 0.3728,
905
  "step": 1280
906
  },
907
  {
908
- "epoch": 2.218400687876182,
909
- "grad_norm": 0.17991148711708474,
910
- "learning_rate": 3.843736104432137e-05,
911
- "loss": 0.3804,
912
  "step": 1290
913
  },
914
  {
915
- "epoch": 2.235597592433362,
916
- "grad_norm": 0.17934271418386846,
917
- "learning_rate": 3.687093568773229e-05,
918
- "loss": 0.3844,
919
  "step": 1300
920
  },
921
  {
922
- "epoch": 2.252794496990542,
923
- "grad_norm": 0.17862554374418718,
924
- "learning_rate": 3.532985125551715e-05,
925
- "loss": 0.385,
926
  "step": 1310
927
  },
928
  {
929
- "epoch": 2.2699914015477214,
930
- "grad_norm": 0.17397364598815349,
931
- "learning_rate": 3.381472636137591e-05,
932
- "loss": 0.3755,
933
  "step": 1320
934
  },
935
  {
936
- "epoch": 2.287188306104901,
937
- "grad_norm": 0.18500269890886714,
938
- "learning_rate": 3.2326169198472556e-05,
939
- "loss": 0.384,
940
  "step": 1330
941
  },
942
  {
943
- "epoch": 2.304385210662081,
944
- "grad_norm": 0.18186600816942164,
945
- "learning_rate": 3.0864777295297376e-05,
946
- "loss": 0.3726,
947
  "step": 1340
948
  },
949
  {
950
- "epoch": 2.3215821152192606,
951
- "grad_norm": 0.17773737221105249,
952
- "learning_rate": 2.9431137275810317e-05,
953
- "loss": 0.3704,
954
  "step": 1350
955
  },
956
  {
957
- "epoch": 2.33877901977644,
958
- "grad_norm": 0.18256211613697612,
959
- "learning_rate": 2.8025824623961773e-05,
960
- "loss": 0.3769,
961
  "step": 1360
962
  },
963
  {
964
- "epoch": 2.35597592433362,
965
- "grad_norm": 0.18129287061779392,
966
- "learning_rate": 2.664940345268483e-05,
967
- "loss": 0.3766,
968
  "step": 1370
969
  },
970
  {
971
- "epoch": 2.3731728288908,
972
- "grad_norm": 0.1785686917578723,
973
- "learning_rate": 2.5302426277452172e-05,
974
- "loss": 0.3701,
975
  "step": 1380
976
  },
977
  {
978
- "epoch": 2.3903697334479794,
979
- "grad_norm": 0.19834019533954172,
980
- "learning_rate": 2.398543379448832e-05,
981
- "loss": 0.3768,
982
  "step": 1390
983
  },
984
  {
985
- "epoch": 2.407566638005159,
986
- "grad_norm": 0.18177776105752433,
987
- "learning_rate": 2.26989546637263e-05,
988
- "loss": 0.3755,
989
  "step": 1400
990
  },
991
  {
992
- "epoch": 2.4247635425623386,
993
- "grad_norm": 0.18362730161587285,
994
- "learning_rate": 2.144350529659589e-05,
995
- "loss": 0.368,
996
  "step": 1410
997
  },
998
  {
999
- "epoch": 2.4419604471195187,
1000
- "grad_norm": 0.18330397808911847,
1001
- "learning_rate": 2.021958964872851e-05,
1002
- "loss": 0.3794,
1003
  "step": 1420
1004
  },
1005
  {
1006
- "epoch": 2.4591573516766982,
1007
- "grad_norm": 0.19205219312460517,
1008
- "learning_rate": 1.9027699017662194e-05,
1009
- "loss": 0.3703,
1010
  "step": 1430
1011
  },
1012
  {
1013
- "epoch": 2.476354256233878,
1014
- "grad_norm": 0.18216660996651907,
1015
- "learning_rate": 1.7868311845627472e-05,
1016
- "loss": 0.3754,
1017
  "step": 1440
1018
  },
1019
  {
1020
- "epoch": 2.4935511607910574,
1021
- "grad_norm": 0.18809064927597854,
1022
- "learning_rate": 1.6741893527493858e-05,
1023
- "loss": 0.3707,
1024
  "step": 1450
1025
  },
1026
  {
1027
- "epoch": 2.5107480653482375,
1028
- "grad_norm": 0.18562863684234907,
1029
- "learning_rate": 1.564889622395349e-05,
1030
- "loss": 0.3698,
1031
  "step": 1460
1032
  },
1033
  {
1034
- "epoch": 2.527944969905417,
1035
- "grad_norm": 0.18350802969723742,
1036
- "learning_rate": 1.4589758680017263e-05,
1037
- "loss": 0.3733,
1038
  "step": 1470
1039
  },
1040
  {
1041
- "epoch": 2.5451418744625967,
1042
- "grad_norm": 0.18684066406776073,
1043
- "learning_rate": 1.356490604889622e-05,
1044
- "loss": 0.3787,
1045
  "step": 1480
1046
  },
1047
  {
1048
- "epoch": 2.5623387790197762,
1049
- "grad_norm": 0.1811515744478759,
1050
- "learning_rate": 1.2574749721338874e-05,
1051
- "loss": 0.3769,
1052
  "step": 1490
1053
  },
1054
  {
1055
- "epoch": 2.5795356835769563,
1056
- "grad_norm": 0.18069550077179752,
1057
- "learning_rate": 1.1619687160492953e-05,
1058
- "loss": 0.3701,
1059
  "step": 1500
1060
  },
1061
  {
1062
- "epoch": 2.596732588134136,
1063
- "grad_norm": 0.18462589607546182,
1064
- "learning_rate": 1.0700101742357926e-05,
1065
- "loss": 0.3744,
1066
  "step": 1510
1067
  },
1068
  {
1069
- "epoch": 2.6139294926913155,
1070
- "grad_norm": 0.18712041177077796,
1071
- "learning_rate": 9.816362601892326e-06,
1072
- "loss": 0.3677,
1073
  "step": 1520
1074
  },
1075
  {
1076
- "epoch": 2.6311263972484955,
1077
- "grad_norm": 0.18908434345209518,
1078
- "learning_rate": 8.968824484837578e-06,
1079
- "loss": 0.3685,
1080
  "step": 1530
1081
  },
1082
  {
1083
- "epoch": 2.648323301805675,
1084
- "grad_norm": 0.1847336144413599,
1085
- "learning_rate": 8.157827605317892e-06,
1086
- "loss": 0.3713,
1087
  "step": 1540
1088
  },
1089
  {
1090
- "epoch": 2.6655202063628547,
1091
- "grad_norm": 0.18269892602018764,
1092
- "learning_rate": 7.383697509273424e-06,
1093
- "loss": 0.367,
1094
  "step": 1550
1095
  },
1096
  {
1097
- "epoch": 2.6827171109200343,
1098
- "grad_norm": 0.18647288570365314,
1099
- "learning_rate": 6.646744943781325e-06,
1100
- "loss": 0.3724,
1101
  "step": 1560
1102
  },
1103
  {
1104
- "epoch": 2.699914015477214,
1105
- "grad_norm": 0.18698726008663819,
1106
- "learning_rate": 5.947265732317408e-06,
1107
- "loss": 0.3659,
1108
  "step": 1570
1109
  },
1110
  {
1111
- "epoch": 2.717110920034394,
1112
- "grad_norm": 0.18309326292481923,
1113
- "learning_rate": 5.285540656008303e-06,
1114
- "loss": 0.3757,
1115
  "step": 1580
1116
  },
1117
  {
1118
- "epoch": 2.7343078245915735,
1119
- "grad_norm": 0.18668212507421977,
1120
- "learning_rate": 4.6618353409217386e-06,
1121
- "loss": 0.3673,
1122
  "step": 1590
1123
  },
1124
  {
1125
- "epoch": 2.751504729148753,
1126
- "grad_norm": 0.1840554864418513,
1127
- "learning_rate": 4.076400151440485e-06,
1128
- "loss": 0.367,
1129
  "step": 1600
1130
  },
1131
  {
1132
- "epoch": 2.768701633705933,
1133
- "grad_norm": 0.1838274059601279,
1134
- "learning_rate": 3.529470089762421e-06,
1135
- "loss": 0.3676,
1136
  "step": 1610
1137
  },
1138
  {
1139
- "epoch": 2.7858985382631127,
1140
- "grad_norm": 0.1905340116630113,
1141
- "learning_rate": 3.021264701567206e-06,
1142
- "loss": 0.3745,
1143
  "step": 1620
1144
  },
1145
  {
1146
- "epoch": 2.8030954428202923,
1147
- "grad_norm": 0.1859751703399611,
1148
- "learning_rate": 2.551987987887461e-06,
1149
- "loss": 0.369,
1150
  "step": 1630
1151
  },
1152
  {
1153
- "epoch": 2.820292347377472,
1154
- "grad_norm": 0.18905809126144052,
1155
- "learning_rate": 2.1218283232198212e-06,
1156
- "loss": 0.3774,
1157
  "step": 1640
1158
  },
1159
  {
1160
- "epoch": 2.8374892519346515,
1161
- "grad_norm": 0.1853105742586414,
1162
- "learning_rate": 1.7309583799086094e-06,
1163
- "loss": 0.3726,
1164
  "step": 1650
1165
  },
1166
  {
1167
- "epoch": 2.8546861564918316,
1168
- "grad_norm": 0.1840942740034349,
1169
- "learning_rate": 1.3795350588327261e-06,
1170
- "loss": 0.3779,
1171
  "step": 1660
1172
  },
1173
  {
1174
- "epoch": 2.871883061049011,
1175
- "grad_norm": 0.18025938643579378,
1176
- "learning_rate": 1.0676994264232854e-06,
1177
- "loss": 0.3692,
1178
  "step": 1670
1179
  },
1180
  {
1181
- "epoch": 2.8890799656061907,
1182
- "grad_norm": 0.18685969171459274,
1183
- "learning_rate": 7.955766580375335e-07,
1184
- "loss": 0.3688,
1185
  "step": 1680
1186
  },
1187
  {
1188
- "epoch": 2.9062768701633708,
1189
- "grad_norm": 0.1830773378552538,
1190
- "learning_rate": 5.632759877116422e-07,
1191
- "loss": 0.3738,
1192
  "step": 1690
1193
  },
1194
  {
1195
- "epoch": 2.9234737747205504,
1196
- "grad_norm": 0.18687096055320696,
1197
- "learning_rate": 3.708906643125509e-07,
1198
- "loss": 0.3754,
1199
  "step": 1700
1200
  },
1201
  {
1202
- "epoch": 2.94067067927773,
1203
- "grad_norm": 0.18843515900821306,
1204
- "learning_rate": 2.184979141065413e-07,
1205
- "loss": 0.374,
1206
  "step": 1710
1207
  },
1208
  {
1209
- "epoch": 2.9578675838349096,
1210
- "grad_norm": 0.1907738099777219,
1211
- "learning_rate": 1.061589097595017e-07,
1212
- "loss": 0.3719,
1213
  "step": 1720
1214
  },
1215
  {
1216
- "epoch": 2.9750644883920896,
1217
- "grad_norm": 0.18629496289787387,
1218
- "learning_rate": 3.3918745781291725e-08,
1219
- "loss": 0.3734,
1220
  "step": 1730
1221
  },
1222
  {
1223
- "epoch": 2.992261392949269,
1224
- "grad_norm": 0.1807736649886976,
1225
- "learning_rate": 1.8064204241774462e-09,
1226
- "loss": 0.3669,
1227
  "step": 1740
1228
  },
1229
  {
1230
- "epoch": 2.997420464316423,
1231
- "step": 1743,
1232
- "total_flos": 1.7624457120055296e+16,
1233
- "train_loss": 0.4506890232783245,
1234
- "train_runtime": 9589.9992,
1235
- "train_samples_per_second": 5.818,
1236
- "train_steps_per_second": 0.182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1237
  }
1238
  ],
1239
  "logging_steps": 10,
1240
- "max_steps": 1743,
1241
  "num_input_tokens_seen": 0,
1242
  "num_train_epochs": 3,
1243
  "save_steps": 500,
@@ -1253,7 +2471,7 @@
1253
  "attributes": {}
1254
  }
1255
  },
1256
- "total_flos": 1.7624457120055296e+16,
1257
  "train_batch_size": 4,
1258
  "trial_name": null,
1259
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9987096774193547,
5
  "eval_steps": 500,
6
+ "global_step": 3486,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.008602150537634409,
13
+ "grad_norm": 1.435989601453093,
14
+ "learning_rate": 5.730659025787966e-06,
15
+ "loss": 1.2534,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.017204301075268817,
20
+ "grad_norm": 0.956311142058486,
21
+ "learning_rate": 1.1461318051575932e-05,
22
+ "loss": 1.1941,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.025806451612903226,
27
+ "grad_norm": 0.5955272986463238,
28
+ "learning_rate": 1.7191977077363898e-05,
29
+ "loss": 1.0416,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.034408602150537634,
34
+ "grad_norm": 0.5371220864077184,
35
+ "learning_rate": 2.2922636103151864e-05,
36
+ "loss": 0.9252,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.043010752688172046,
41
+ "grad_norm": 0.3298242735866968,
42
+ "learning_rate": 2.8653295128939826e-05,
43
+ "loss": 0.8582,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.05161290322580645,
48
+ "grad_norm": 0.3017426075394883,
49
+ "learning_rate": 3.4383954154727795e-05,
50
+ "loss": 0.7998,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.060215053763440864,
55
+ "grad_norm": 0.3375526814194791,
56
+ "learning_rate": 4.011461318051576e-05,
57
+ "loss": 0.7638,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.06881720430107527,
62
+ "grad_norm": 0.2705319023822148,
63
+ "learning_rate": 4.584527220630373e-05,
64
+ "loss": 0.731,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.07741935483870968,
69
+ "grad_norm": 0.2857694097360681,
70
+ "learning_rate": 5.157593123209169e-05,
71
+ "loss": 0.7083,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.08602150537634409,
76
+ "grad_norm": 0.3098635398347818,
77
+ "learning_rate": 5.730659025787965e-05,
78
+ "loss": 0.6841,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.09462365591397849,
83
+ "grad_norm": 0.3198686339975185,
84
+ "learning_rate": 6.303724928366761e-05,
85
+ "loss": 0.6606,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.1032258064516129,
90
+ "grad_norm": 0.3190855908911574,
91
+ "learning_rate": 6.876790830945559e-05,
92
+ "loss": 0.6557,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.11182795698924732,
97
+ "grad_norm": 0.3381628025718819,
98
+ "learning_rate": 7.449856733524355e-05,
99
+ "loss": 0.649,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.12043010752688173,
104
+ "grad_norm": 0.31598591910637613,
105
+ "learning_rate": 8.022922636103152e-05,
106
+ "loss": 0.6497,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.12903225806451613,
111
+ "grad_norm": 0.3128921107543011,
112
+ "learning_rate": 8.595988538681948e-05,
113
+ "loss": 0.6392,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.13763440860215054,
118
+ "grad_norm": 0.32601075629670023,
119
+ "learning_rate": 9.169054441260745e-05,
120
+ "loss": 0.6329,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.14623655913978495,
125
+ "grad_norm": 0.3810140632755748,
126
+ "learning_rate": 9.742120343839543e-05,
127
+ "loss": 0.6246,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.15483870967741936,
132
+ "grad_norm": 0.32108686980228696,
133
+ "learning_rate": 0.00010315186246418338,
134
+ "loss": 0.6341,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.16344086021505377,
139
+ "grad_norm": 0.35140482524130334,
140
+ "learning_rate": 0.00010888252148997136,
141
+ "loss": 0.6341,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.17204301075268819,
146
+ "grad_norm": 0.29928696988494263,
147
+ "learning_rate": 0.0001146131805157593,
148
+ "loss": 0.6256,
149
  "step": 200
150
  },
151
  {
152
+ "epoch": 0.18064516129032257,
153
+ "grad_norm": 0.29203351329015653,
154
+ "learning_rate": 0.0001203438395415473,
155
+ "loss": 0.6206,
156
  "step": 210
157
  },
158
  {
159
+ "epoch": 0.18924731182795698,
160
+ "grad_norm": 0.269908780529402,
161
+ "learning_rate": 0.00012607449856733523,
162
+ "loss": 0.6201,
163
  "step": 220
164
  },
165
  {
166
+ "epoch": 0.1978494623655914,
167
+ "grad_norm": 0.2918951896599236,
168
+ "learning_rate": 0.0001318051575931232,
169
+ "loss": 0.6181,
170
  "step": 230
171
  },
172
  {
173
+ "epoch": 0.2064516129032258,
174
+ "grad_norm": 0.2708453980927917,
175
+ "learning_rate": 0.00013753581661891118,
176
+ "loss": 0.6053,
177
  "step": 240
178
  },
179
  {
180
+ "epoch": 0.21505376344086022,
181
+ "grad_norm": 0.2662594416135698,
182
+ "learning_rate": 0.00014326647564469916,
183
+ "loss": 0.6033,
184
  "step": 250
185
  },
186
  {
187
+ "epoch": 0.22365591397849463,
188
+ "grad_norm": 0.2776129880103733,
189
+ "learning_rate": 0.0001489971346704871,
190
+ "loss": 0.618,
191
  "step": 260
192
  },
193
  {
194
+ "epoch": 0.23225806451612904,
195
+ "grad_norm": 0.279068656400945,
196
+ "learning_rate": 0.00015472779369627508,
197
+ "loss": 0.6111,
198
  "step": 270
199
  },
200
  {
201
+ "epoch": 0.24086021505376345,
202
+ "grad_norm": 0.2524990953581738,
203
+ "learning_rate": 0.00016045845272206303,
204
+ "loss": 0.5888,
205
  "step": 280
206
  },
207
  {
208
+ "epoch": 0.24946236559139784,
209
+ "grad_norm": 0.27628464942088166,
210
+ "learning_rate": 0.000166189111747851,
211
+ "loss": 0.6051,
212
  "step": 290
213
  },
214
  {
215
+ "epoch": 0.25806451612903225,
216
+ "grad_norm": 0.2738432329342113,
217
+ "learning_rate": 0.00017191977077363896,
218
+ "loss": 0.6025,
219
  "step": 300
220
  },
221
  {
222
+ "epoch": 0.26666666666666666,
223
+ "grad_norm": 0.25607005690614404,
224
+ "learning_rate": 0.00017765042979942693,
225
+ "loss": 0.6035,
226
  "step": 310
227
  },
228
  {
229
+ "epoch": 0.2752688172043011,
230
+ "grad_norm": 0.247801611216806,
231
+ "learning_rate": 0.0001833810888252149,
232
+ "loss": 0.6118,
233
  "step": 320
234
  },
235
  {
236
+ "epoch": 0.2838709677419355,
237
+ "grad_norm": 0.24971595808191485,
238
+ "learning_rate": 0.00018911174785100288,
239
+ "loss": 0.6003,
240
  "step": 330
241
  },
242
  {
243
+ "epoch": 0.2924731182795699,
244
+ "grad_norm": 0.2411785853482113,
245
+ "learning_rate": 0.00019484240687679086,
246
+ "loss": 0.6019,
247
  "step": 340
248
  },
249
  {
250
+ "epoch": 0.3010752688172043,
251
+ "grad_norm": 0.24693597842223552,
252
+ "learning_rate": 0.0001999999498534943,
253
+ "loss": 0.5912,
254
  "step": 350
255
  },
256
  {
257
+ "epoch": 0.3096774193548387,
258
+ "grad_norm": 0.23649676383007667,
259
+ "learning_rate": 0.0001999939323336644,
260
+ "loss": 0.6032,
261
  "step": 360
262
  },
263
  {
264
+ "epoch": 0.31827956989247314,
265
+ "grad_norm": 0.23340962816367694,
266
+ "learning_rate": 0.0001999778862042167,
267
+ "loss": 0.5867,
268
  "step": 370
269
  },
270
  {
271
+ "epoch": 0.32688172043010755,
272
+ "grad_norm": 0.228292747503904,
273
+ "learning_rate": 0.0001999518130744525,
274
+ "loss": 0.5869,
275
  "step": 380
276
  },
277
  {
278
+ "epoch": 0.33548387096774196,
279
+ "grad_norm": 0.23546242335669995,
280
+ "learning_rate": 0.0001999157155593029,
281
+ "loss": 0.585,
282
  "step": 390
283
  },
284
  {
285
+ "epoch": 0.34408602150537637,
286
+ "grad_norm": 0.23135215802858206,
287
+ "learning_rate": 0.0001998695972790664,
288
+ "loss": 0.5929,
289
  "step": 400
290
  },
291
  {
292
+ "epoch": 0.35268817204301073,
293
+ "grad_norm": 0.22524299635343834,
294
+ "learning_rate": 0.00019981346285904595,
295
+ "loss": 0.5976,
296
  "step": 410
297
  },
298
  {
299
+ "epoch": 0.36129032258064514,
300
+ "grad_norm": 0.21756192058074578,
301
+ "learning_rate": 0.00019974731792908506,
302
+ "loss": 0.587,
303
  "step": 420
304
  },
305
  {
306
+ "epoch": 0.36989247311827955,
307
+ "grad_norm": 0.23763473038875538,
308
+ "learning_rate": 0.000199671169123003,
309
+ "loss": 0.5934,
310
  "step": 430
311
  },
312
  {
313
+ "epoch": 0.37849462365591396,
314
+ "grad_norm": 0.23701431523985933,
315
+ "learning_rate": 0.00019958502407792963,
316
+ "loss": 0.6009,
317
  "step": 440
318
  },
319
  {
320
+ "epoch": 0.3870967741935484,
321
+ "grad_norm": 0.22134697304917678,
322
+ "learning_rate": 0.00019948889143353948,
323
+ "loss": 0.5994,
324
  "step": 450
325
  },
326
  {
327
+ "epoch": 0.3956989247311828,
328
+ "grad_norm": 0.2268467235014266,
329
+ "learning_rate": 0.00019938278083118517,
330
+ "loss": 0.5982,
331
  "step": 460
332
  },
333
  {
334
+ "epoch": 0.4043010752688172,
335
+ "grad_norm": 0.2247581060586749,
336
+ "learning_rate": 0.00019926670291293055,
337
+ "loss": 0.5782,
338
  "step": 470
339
  },
340
  {
341
+ "epoch": 0.4129032258064516,
342
+ "grad_norm": 0.21243655986167898,
343
+ "learning_rate": 0.00019914066932048317,
344
+ "loss": 0.5874,
345
  "step": 480
346
  },
347
  {
348
+ "epoch": 0.421505376344086,
349
+ "grad_norm": 0.21385338375703783,
350
+ "learning_rate": 0.00019900469269402708,
351
+ "loss": 0.5813,
352
  "step": 490
353
  },
354
  {
355
+ "epoch": 0.43010752688172044,
356
+ "grad_norm": 0.21415772228830113,
357
+ "learning_rate": 0.00019885878667095472,
358
+ "loss": 0.5934,
359
  "step": 500
360
  },
361
  {
362
+ "epoch": 0.43870967741935485,
363
+ "grad_norm": 0.23153816801674154,
364
+ "learning_rate": 0.00019870296588449948,
365
+ "loss": 0.5766,
366
  "step": 510
367
  },
368
  {
369
+ "epoch": 0.44731182795698926,
370
+ "grad_norm": 0.21768913976009108,
371
+ "learning_rate": 0.00019853724596226795,
372
+ "loss": 0.5964,
373
  "step": 520
374
  },
375
  {
376
+ "epoch": 0.4559139784946237,
377
+ "grad_norm": 0.21481929780656356,
378
+ "learning_rate": 0.00019836164352467263,
379
+ "loss": 0.5748,
380
  "step": 530
381
  },
382
  {
383
+ "epoch": 0.4645161290322581,
384
+ "grad_norm": 0.29328457757364856,
385
+ "learning_rate": 0.00019817617618326516,
386
+ "loss": 0.5863,
387
  "step": 540
388
  },
389
  {
390
+ "epoch": 0.4731182795698925,
391
+ "grad_norm": 0.2140715456768234,
392
+ "learning_rate": 0.00019798086253896974,
393
+ "loss": 0.5787,
394
  "step": 550
395
  },
396
  {
397
+ "epoch": 0.4817204301075269,
398
+ "grad_norm": 0.22386132897333755,
399
+ "learning_rate": 0.00019777572218021789,
400
+ "loss": 0.5863,
401
  "step": 560
402
  },
403
  {
404
+ "epoch": 0.49032258064516127,
405
+ "grad_norm": 0.23289220288123677,
406
+ "learning_rate": 0.00019756077568098372,
407
+ "loss": 0.5638,
408
  "step": 570
409
  },
410
  {
411
+ "epoch": 0.4989247311827957,
412
+ "grad_norm": 0.21115266625450407,
413
+ "learning_rate": 0.00019733604459872055,
414
+ "loss": 0.5744,
415
  "step": 580
416
  },
417
  {
418
+ "epoch": 0.5075268817204301,
419
+ "grad_norm": 0.2207246383657776,
420
+ "learning_rate": 0.00019710155147219886,
421
+ "loss": 0.5789,
422
  "step": 590
423
  },
424
  {
425
+ "epoch": 0.5161290322580645,
426
+ "grad_norm": 0.2127412426650313,
427
+ "learning_rate": 0.00019685731981924592,
428
+ "loss": 0.5764,
429
  "step": 600
430
  },
431
  {
432
+ "epoch": 0.524731182795699,
433
+ "grad_norm": 0.2256297779595538,
434
+ "learning_rate": 0.00019660337413438697,
435
+ "loss": 0.5798,
436
  "step": 610
437
  },
438
  {
439
+ "epoch": 0.5333333333333333,
440
+ "grad_norm": 0.22243638154383857,
441
+ "learning_rate": 0.00019633973988638877,
442
+ "loss": 0.5767,
443
  "step": 620
444
  },
445
  {
446
+ "epoch": 0.5419354838709678,
447
+ "grad_norm": 0.21425852582870433,
448
+ "learning_rate": 0.00019606644351570518,
449
+ "loss": 0.5736,
450
  "step": 630
451
  },
452
  {
453
+ "epoch": 0.5505376344086022,
454
+ "grad_norm": 0.20506928092936336,
455
+ "learning_rate": 0.00019578351243182545,
456
+ "loss": 0.5766,
457
  "step": 640
458
  },
459
  {
460
+ "epoch": 0.5591397849462365,
461
+ "grad_norm": 0.2175808639953357,
462
+ "learning_rate": 0.0001954909750105252,
463
+ "loss": 0.5883,
464
  "step": 650
465
  },
466
  {
467
+ "epoch": 0.567741935483871,
468
+ "grad_norm": 0.21972174494018643,
469
+ "learning_rate": 0.00019518886059102062,
470
+ "loss": 0.5774,
471
  "step": 660
472
  },
473
  {
474
+ "epoch": 0.5763440860215053,
475
+ "grad_norm": 0.2222552681428519,
476
+ "learning_rate": 0.0001948771994730259,
477
+ "loss": 0.5694,
478
  "step": 670
479
  },
480
  {
481
+ "epoch": 0.5849462365591398,
482
+ "grad_norm": 0.21552094411185982,
483
+ "learning_rate": 0.0001945560229137145,
484
+ "loss": 0.5657,
485
  "step": 680
486
  },
487
  {
488
+ "epoch": 0.5935483870967742,
489
+ "grad_norm": 0.21243053577901072,
490
+ "learning_rate": 0.0001942253631245842,
491
+ "loss": 0.5776,
492
  "step": 690
493
  },
494
  {
495
+ "epoch": 0.6021505376344086,
496
+ "grad_norm": 0.21257424919590467,
497
+ "learning_rate": 0.00019388525326822665,
498
+ "loss": 0.5764,
499
  "step": 700
500
  },
501
  {
502
+ "epoch": 0.610752688172043,
503
+ "grad_norm": 0.21485742957719725,
504
+ "learning_rate": 0.0001935357274550013,
505
+ "loss": 0.5821,
506
  "step": 710
507
  },
508
  {
509
+ "epoch": 0.6193548387096774,
510
+ "grad_norm": 0.2184569847068338,
511
+ "learning_rate": 0.00019317682073961444,
512
+ "loss": 0.5674,
513
  "step": 720
514
  },
515
  {
516
+ "epoch": 0.6279569892473118,
517
+ "grad_norm": 0.20912150886711225,
518
+ "learning_rate": 0.00019280856911760354,
519
+ "loss": 0.559,
520
  "step": 730
521
  },
522
  {
523
+ "epoch": 0.6365591397849463,
524
+ "grad_norm": 0.21433879312995557,
525
+ "learning_rate": 0.00019243100952172723,
526
+ "loss": 0.5615,
527
  "step": 740
528
  },
529
  {
530
+ "epoch": 0.6451612903225806,
531
+ "grad_norm": 0.22214855400737676,
532
+ "learning_rate": 0.00019204417981826091,
533
+ "loss": 0.5761,
534
  "step": 750
535
  },
536
  {
537
+ "epoch": 0.6537634408602151,
538
+ "grad_norm": 0.2196194509755129,
539
+ "learning_rate": 0.00019164811880319958,
540
+ "loss": 0.5718,
541
  "step": 760
542
  },
543
  {
544
+ "epoch": 0.6623655913978495,
545
+ "grad_norm": 0.2089872234950888,
546
+ "learning_rate": 0.00019124286619836637,
547
+ "loss": 0.5819,
548
  "step": 770
549
  },
550
  {
551
+ "epoch": 0.6709677419354839,
552
+ "grad_norm": 0.20958335214398927,
553
+ "learning_rate": 0.00019082846264742917,
554
+ "loss": 0.5663,
555
  "step": 780
556
  },
557
  {
558
+ "epoch": 0.6795698924731183,
559
+ "grad_norm": 0.22250020784958888,
560
+ "learning_rate": 0.00019040494971182413,
561
+ "loss": 0.572,
562
  "step": 790
563
  },
564
  {
565
+ "epoch": 0.6881720430107527,
566
+ "grad_norm": 0.2162793985278103,
567
+ "learning_rate": 0.00018997236986658753,
568
+ "loss": 0.5578,
569
  "step": 800
570
  },
571
  {
572
+ "epoch": 0.6967741935483871,
573
+ "grad_norm": 0.21650281403806723,
574
+ "learning_rate": 0.00018953076649609564,
575
+ "loss": 0.5698,
576
  "step": 810
577
  },
578
  {
579
+ "epoch": 0.7053763440860215,
580
+ "grad_norm": 0.21882852535131322,
581
+ "learning_rate": 0.000189080183889714,
582
+ "loss": 0.5712,
583
  "step": 820
584
  },
585
  {
586
+ "epoch": 0.7139784946236559,
587
+ "grad_norm": 0.21577728486568806,
588
+ "learning_rate": 0.00018862066723735512,
589
+ "loss": 0.5676,
590
  "step": 830
591
  },
592
  {
593
+ "epoch": 0.7225806451612903,
594
+ "grad_norm": 0.20731869739137426,
595
+ "learning_rate": 0.00018815226262494655,
596
+ "loss": 0.567,
597
  "step": 840
598
  },
599
  {
600
+ "epoch": 0.7311827956989247,
601
+ "grad_norm": 0.21966746890763253,
602
+ "learning_rate": 0.00018767501702980874,
603
+ "loss": 0.5738,
604
  "step": 850
605
  },
606
  {
607
+ "epoch": 0.7397849462365591,
608
+ "grad_norm": 0.2199440815107056,
609
+ "learning_rate": 0.00018718897831594355,
610
+ "loss": 0.5691,
611
  "step": 860
612
  },
613
  {
614
+ "epoch": 0.7483870967741936,
615
+ "grad_norm": 0.21722437739899045,
616
+ "learning_rate": 0.00018669419522923393,
617
+ "loss": 0.5753,
618
  "step": 870
619
  },
620
  {
621
+ "epoch": 0.7569892473118279,
622
+ "grad_norm": 0.2218916153172574,
623
+ "learning_rate": 0.00018619071739255506,
624
+ "loss": 0.5572,
625
  "step": 880
626
  },
627
  {
628
+ "epoch": 0.7655913978494624,
629
+ "grad_norm": 0.20612913226427296,
630
+ "learning_rate": 0.00018567859530079753,
631
+ "loss": 0.567,
632
  "step": 890
633
  },
634
  {
635
+ "epoch": 0.7741935483870968,
636
+ "grad_norm": 0.2296891108499081,
637
+ "learning_rate": 0.00018515788031580317,
638
+ "loss": 0.5735,
639
  "step": 900
640
  },
641
  {
642
+ "epoch": 0.7827956989247312,
643
+ "grad_norm": 0.20791733581224098,
644
+ "learning_rate": 0.0001846286246612138,
645
+ "loss": 0.56,
646
  "step": 910
647
  },
648
  {
649
+ "epoch": 0.7913978494623656,
650
+ "grad_norm": 0.2130914822501822,
651
+ "learning_rate": 0.00018409088141723364,
652
+ "loss": 0.5588,
653
  "step": 920
654
  },
655
  {
656
+ "epoch": 0.8,
657
+ "grad_norm": 0.21931892060132505,
658
+ "learning_rate": 0.00018354470451530574,
659
+ "loss": 0.5718,
660
  "step": 930
661
  },
662
  {
663
+ "epoch": 0.8086021505376344,
664
+ "grad_norm": 0.20510223609898656,
665
+ "learning_rate": 0.00018299014873270314,
666
+ "loss": 0.5693,
667
  "step": 940
668
  },
669
  {
670
+ "epoch": 0.8172043010752689,
671
+ "grad_norm": 0.20754250893604087,
672
+ "learning_rate": 0.00018242726968703505,
673
+ "loss": 0.5657,
674
  "step": 950
675
  },
676
  {
677
+ "epoch": 0.8258064516129032,
678
+ "grad_norm": 0.2119047697743998,
679
+ "learning_rate": 0.00018185612383066893,
680
+ "loss": 0.5523,
681
  "step": 960
682
  },
683
  {
684
+ "epoch": 0.8344086021505376,
685
+ "grad_norm": 0.21665086623022414,
686
+ "learning_rate": 0.00018127676844506874,
687
+ "loss": 0.5564,
688
  "step": 970
689
  },
690
  {
691
+ "epoch": 0.843010752688172,
692
+ "grad_norm": 0.22240006490076028,
693
+ "learning_rate": 0.00018068926163505,
694
+ "loss": 0.5661,
695
  "step": 980
696
  },
697
  {
698
+ "epoch": 0.8516129032258064,
699
+ "grad_norm": 0.20224509449303205,
700
+ "learning_rate": 0.00018009366232295235,
701
+ "loss": 0.5709,
702
  "step": 990
703
  },
704
  {
705
+ "epoch": 0.8602150537634409,
706
+ "grad_norm": 0.20943438346218987,
707
+ "learning_rate": 0.00017949003024273015,
708
+ "loss": 0.5567,
709
  "step": 1000
710
  },
711
  {
712
+ "epoch": 0.8688172043010752,
713
+ "grad_norm": 0.2119577392060311,
714
+ "learning_rate": 0.0001788784259339616,
715
+ "loss": 0.5726,
716
  "step": 1010
717
  },
718
  {
719
+ "epoch": 0.8774193548387097,
720
+ "grad_norm": 0.21671277914581005,
721
+ "learning_rate": 0.0001782589107357771,
722
+ "loss": 0.5721,
723
  "step": 1020
724
  },
725
  {
726
+ "epoch": 0.886021505376344,
727
+ "grad_norm": 0.21042958546072746,
728
+ "learning_rate": 0.00017763154678070733,
729
+ "loss": 0.5682,
730
  "step": 1030
731
  },
732
  {
733
+ "epoch": 0.8946236559139785,
734
+ "grad_norm": 0.21134103487495276,
735
+ "learning_rate": 0.0001769963969884521,
736
+ "loss": 0.5578,
737
  "step": 1040
738
  },
739
  {
740
+ "epoch": 0.9032258064516129,
741
+ "grad_norm": 0.2138330925142033,
742
+ "learning_rate": 0.0001763535250595696,
743
+ "loss": 0.5622,
744
  "step": 1050
745
  },
746
  {
747
+ "epoch": 0.9118279569892473,
748
+ "grad_norm": 0.20191022832768318,
749
+ "learning_rate": 0.00017570299546908812,
750
+ "loss": 0.5543,
751
  "step": 1060
752
  },
753
  {
754
+ "epoch": 0.9204301075268817,
755
+ "grad_norm": 0.20993539174391937,
756
+ "learning_rate": 0.0001750448734600394,
757
+ "loss": 0.5687,
758
  "step": 1070
759
  },
760
  {
761
+ "epoch": 0.9290322580645162,
762
+ "grad_norm": 0.21448241595075324,
763
+ "learning_rate": 0.0001743792250369155,
764
+ "loss": 0.5628,
765
  "step": 1080
766
  },
767
  {
768
+ "epoch": 0.9376344086021505,
769
+ "grad_norm": 0.21562115673441815,
770
+ "learning_rate": 0.00017370611695904895,
771
+ "loss": 0.5707,
772
  "step": 1090
773
  },
774
  {
775
+ "epoch": 0.946236559139785,
776
+ "grad_norm": 0.20800257947078069,
777
+ "learning_rate": 0.00017302561673391732,
778
+ "loss": 0.5682,
779
  "step": 1100
780
  },
781
  {
782
+ "epoch": 0.9548387096774194,
783
+ "grad_norm": 0.2142950321860787,
784
+ "learning_rate": 0.00017233779261037268,
785
+ "loss": 0.5644,
786
  "step": 1110
787
  },
788
  {
789
+ "epoch": 0.9634408602150538,
790
+ "grad_norm": 0.22988301024268806,
791
+ "learning_rate": 0.00017164271357179698,
792
+ "loss": 0.5747,
793
  "step": 1120
794
  },
795
  {
796
+ "epoch": 0.9720430107526882,
797
+ "grad_norm": 0.2044546345524181,
798
+ "learning_rate": 0.00017094044932918336,
799
+ "loss": 0.5627,
800
  "step": 1130
801
  },
802
  {
803
+ "epoch": 0.9806451612903225,
804
+ "grad_norm": 0.20801733651601362,
805
+ "learning_rate": 0.00017023107031414477,
806
+ "loss": 0.5655,
807
  "step": 1140
808
  },
809
  {
810
+ "epoch": 0.989247311827957,
811
+ "grad_norm": 0.21348629669321226,
812
+ "learning_rate": 0.00016951464767185013,
813
+ "loss": 0.5569,
814
  "step": 1150
815
  },
816
  {
817
+ "epoch": 0.9978494623655914,
818
+ "grad_norm": 0.21098078403488563,
819
+ "learning_rate": 0.0001687912532538892,
820
+ "loss": 0.5686,
821
  "step": 1160
822
  },
823
  {
824
+ "epoch": 1.0064516129032257,
825
+ "grad_norm": 0.23232328451409845,
826
+ "learning_rate": 0.00016806095961106632,
827
+ "loss": 0.5352,
828
  "step": 1170
829
  },
830
  {
831
+ "epoch": 1.0150537634408603,
832
+ "grad_norm": 0.21849412389861325,
833
+ "learning_rate": 0.00016732383998612407,
834
+ "loss": 0.5166,
835
  "step": 1180
836
  },
837
  {
838
+ "epoch": 1.0236559139784946,
839
+ "grad_norm": 0.22255257432573036,
840
+ "learning_rate": 0.00016657996830639774,
841
+ "loss": 0.5286,
842
  "step": 1190
843
  },
844
  {
845
+ "epoch": 1.032258064516129,
846
+ "grad_norm": 0.23162311614487038,
847
+ "learning_rate": 0.0001658294191764009,
848
+ "loss": 0.5209,
849
  "step": 1200
850
  },
851
  {
852
+ "epoch": 1.0408602150537634,
853
+ "grad_norm": 0.23679830183057837,
854
+ "learning_rate": 0.0001650722678703432,
855
+ "loss": 0.5155,
856
  "step": 1210
857
  },
858
  {
859
+ "epoch": 1.049462365591398,
860
+ "grad_norm": 0.24431579126710756,
861
+ "learning_rate": 0.00016430859032458086,
862
+ "loss": 0.5239,
863
  "step": 1220
864
  },
865
  {
866
+ "epoch": 1.0580645161290323,
867
+ "grad_norm": 0.21925666538005387,
868
+ "learning_rate": 0.00016353846313000098,
869
+ "loss": 0.5142,
870
  "step": 1230
871
  },
872
  {
873
+ "epoch": 1.0666666666666667,
874
+ "grad_norm": 0.23676074055821078,
875
+ "learning_rate": 0.00016276196352434,
876
+ "loss": 0.5267,
877
  "step": 1240
878
  },
879
  {
880
+ "epoch": 1.075268817204301,
881
+ "grad_norm": 0.23532815378000083,
882
+ "learning_rate": 0.00016197916938443733,
883
+ "loss": 0.5211,
884
  "step": 1250
885
  },
886
  {
887
+ "epoch": 1.0838709677419356,
888
+ "grad_norm": 0.23216395268938977,
889
+ "learning_rate": 0.00016119015921842503,
890
+ "loss": 0.5161,
891
  "step": 1260
892
  },
893
  {
894
+ "epoch": 1.09247311827957,
895
+ "grad_norm": 0.23822646088799643,
896
+ "learning_rate": 0.0001603950121578539,
897
+ "loss": 0.5077,
898
  "step": 1270
899
  },
900
  {
901
+ "epoch": 1.1010752688172043,
902
+ "grad_norm": 0.2290007557059369,
903
+ "learning_rate": 0.00015959380794975734,
904
+ "loss": 0.5156,
905
  "step": 1280
906
  },
907
  {
908
+ "epoch": 1.1096774193548387,
909
+ "grad_norm": 0.22484506558131512,
910
+ "learning_rate": 0.00015878662694865327,
911
+ "loss": 0.5205,
912
  "step": 1290
913
  },
914
  {
915
+ "epoch": 1.118279569892473,
916
+ "grad_norm": 0.22971689352325197,
917
+ "learning_rate": 0.00015797355010848519,
918
+ "loss": 0.5218,
919
  "step": 1300
920
  },
921
  {
922
+ "epoch": 1.1268817204301076,
923
+ "grad_norm": 0.23491219834804952,
924
+ "learning_rate": 0.00015715465897450317,
925
+ "loss": 0.5248,
926
  "step": 1310
927
  },
928
  {
929
+ "epoch": 1.135483870967742,
930
+ "grad_norm": 0.23555948179310204,
931
+ "learning_rate": 0.00015633003567508557,
932
+ "loss": 0.524,
933
  "step": 1320
934
  },
935
  {
936
+ "epoch": 1.1440860215053763,
937
+ "grad_norm": 0.2275140567256555,
938
+ "learning_rate": 0.00015549976291350204,
939
+ "loss": 0.517,
940
  "step": 1330
941
  },
942
  {
943
+ "epoch": 1.1526881720430107,
944
+ "grad_norm": 0.22712302146976637,
945
+ "learning_rate": 0.00015466392395961908,
946
+ "loss": 0.5243,
947
  "step": 1340
948
  },
949
  {
950
+ "epoch": 1.1612903225806452,
951
+ "grad_norm": 0.22097671231695554,
952
+ "learning_rate": 0.0001538226026415489,
953
+ "loss": 0.5118,
954
  "step": 1350
955
  },
956
  {
957
+ "epoch": 1.1698924731182796,
958
+ "grad_norm": 0.22706370204760404,
959
+ "learning_rate": 0.00015297588333724183,
960
+ "loss": 0.5235,
961
  "step": 1360
962
  },
963
  {
964
+ "epoch": 1.178494623655914,
965
+ "grad_norm": 0.2309770159886724,
966
+ "learning_rate": 0.00015212385096602415,
967
+ "loss": 0.5202,
968
  "step": 1370
969
  },
970
  {
971
+ "epoch": 1.1870967741935483,
972
+ "grad_norm": 0.23885726185135214,
973
+ "learning_rate": 0.0001512665909800811,
974
+ "loss": 0.5201,
975
  "step": 1380
976
  },
977
  {
978
+ "epoch": 1.1956989247311829,
979
+ "grad_norm": 0.23447118491147034,
980
+ "learning_rate": 0.00015040418935588682,
981
+ "loss": 0.5103,
982
  "step": 1390
983
  },
984
  {
985
+ "epoch": 1.2043010752688172,
986
+ "grad_norm": 0.24148993682260503,
987
+ "learning_rate": 0.0001495367325855816,
988
+ "loss": 0.5229,
989
  "step": 1400
990
  },
991
  {
992
+ "epoch": 1.2129032258064516,
993
+ "grad_norm": 0.22483070355440393,
994
+ "learning_rate": 0.00014866430766829743,
995
+ "loss": 0.5132,
996
  "step": 1410
997
  },
998
  {
999
+ "epoch": 1.221505376344086,
1000
+ "grad_norm": 0.2254845362019036,
1001
+ "learning_rate": 0.00014778700210143242,
1002
+ "loss": 0.5231,
1003
  "step": 1420
1004
  },
1005
  {
1006
+ "epoch": 1.2301075268817203,
1007
+ "grad_norm": 0.23952864821588327,
1008
+ "learning_rate": 0.00014690490387187584,
1009
+ "loss": 0.5157,
1010
  "step": 1430
1011
  },
1012
  {
1013
+ "epoch": 1.238709677419355,
1014
+ "grad_norm": 0.2257464446574845,
1015
+ "learning_rate": 0.00014601810144718345,
1016
+ "loss": 0.537,
1017
  "step": 1440
1018
  },
1019
  {
1020
+ "epoch": 1.2473118279569892,
1021
+ "grad_norm": 0.23617654028249638,
1022
+ "learning_rate": 0.0001451266837667051,
1023
+ "loss": 0.5187,
1024
  "step": 1450
1025
  },
1026
  {
1027
+ "epoch": 1.2559139784946236,
1028
+ "grad_norm": 0.23682708876074962,
1029
+ "learning_rate": 0.00014423074023266463,
1030
+ "loss": 0.52,
1031
  "step": 1460
1032
  },
1033
  {
1034
+ "epoch": 1.2645161290322582,
1035
+ "grad_norm": 0.22896552654447108,
1036
+ "learning_rate": 0.00014333036070119363,
1037
+ "loss": 0.5163,
1038
  "step": 1470
1039
  },
1040
  {
1041
+ "epoch": 1.2731182795698925,
1042
+ "grad_norm": 0.24027250250612717,
1043
+ "learning_rate": 0.00014242563547331956,
1044
+ "loss": 0.517,
1045
  "step": 1480
1046
  },
1047
  {
1048
+ "epoch": 1.281720430107527,
1049
+ "grad_norm": 0.23395780934976781,
1050
+ "learning_rate": 0.00014151665528590925,
1051
+ "loss": 0.5215,
1052
  "step": 1490
1053
  },
1054
  {
1055
+ "epoch": 1.2903225806451613,
1056
+ "grad_norm": 0.24728067879450083,
1057
+ "learning_rate": 0.0001406035113025687,
1058
+ "loss": 0.5208,
1059
  "step": 1500
1060
  },
1061
  {
1062
+ "epoch": 1.2989247311827956,
1063
+ "grad_norm": 0.22394688366493132,
1064
+ "learning_rate": 0.0001396862951045001,
1065
+ "loss": 0.5274,
1066
  "step": 1510
1067
  },
1068
  {
1069
+ "epoch": 1.3075268817204302,
1070
+ "grad_norm": 0.23212309124084482,
1071
+ "learning_rate": 0.00013876509868131692,
1072
+ "loss": 0.5172,
1073
  "step": 1520
1074
  },
1075
  {
1076
+ "epoch": 1.3161290322580645,
1077
+ "grad_norm": 0.23245525514987117,
1078
+ "learning_rate": 0.0001378400144218181,
1079
+ "loss": 0.521,
1080
  "step": 1530
1081
  },
1082
  {
1083
+ "epoch": 1.324731182795699,
1084
+ "grad_norm": 0.24340497246468923,
1085
+ "learning_rate": 0.00013691113510472212,
1086
+ "loss": 0.533,
1087
  "step": 1540
1088
  },
1089
  {
1090
+ "epoch": 1.3333333333333333,
1091
+ "grad_norm": 0.2321928544760521,
1092
+ "learning_rate": 0.00013597855388936217,
1093
+ "loss": 0.5283,
1094
  "step": 1550
1095
  },
1096
  {
1097
+ "epoch": 1.3419354838709676,
1098
+ "grad_norm": 0.23326523784187517,
1099
+ "learning_rate": 0.00013504236430634286,
1100
+ "loss": 0.5416,
1101
  "step": 1560
1102
  },
1103
  {
1104
+ "epoch": 1.3505376344086022,
1105
+ "grad_norm": 0.2262845978211386,
1106
+ "learning_rate": 0.0001341026602481597,
1107
+ "loss": 0.5231,
1108
  "step": 1570
1109
  },
1110
  {
1111
+ "epoch": 1.3591397849462366,
1112
+ "grad_norm": 0.2371091538942524,
1113
+ "learning_rate": 0.00013315953595978287,
1114
+ "loss": 0.5328,
1115
  "step": 1580
1116
  },
1117
  {
1118
+ "epoch": 1.367741935483871,
1119
+ "grad_norm": 0.2404491874213034,
1120
+ "learning_rate": 0.00013221308602920468,
1121
+ "loss": 0.5273,
1122
  "step": 1590
1123
  },
1124
  {
1125
+ "epoch": 1.3763440860215055,
1126
+ "grad_norm": 0.2382492165329928,
1127
+ "learning_rate": 0.00013126340537795343,
1128
+ "loss": 0.529,
1129
  "step": 1600
1130
  },
1131
  {
1132
+ "epoch": 1.3849462365591398,
1133
+ "grad_norm": 0.22921228109841923,
1134
+ "learning_rate": 0.0001303105892515734,
1135
+ "loss": 0.5198,
1136
  "step": 1610
1137
  },
1138
  {
1139
+ "epoch": 1.3935483870967742,
1140
+ "grad_norm": 0.22823020260902263,
1141
+ "learning_rate": 0.00012935473321007255,
1142
+ "loss": 0.5215,
1143
  "step": 1620
1144
  },
1145
  {
1146
+ "epoch": 1.4021505376344086,
1147
+ "grad_norm": 0.23341543795313835,
1148
+ "learning_rate": 0.0001283959331183386,
1149
+ "loss": 0.5179,
1150
  "step": 1630
1151
  },
1152
  {
1153
+ "epoch": 1.410752688172043,
1154
+ "grad_norm": 0.23483174318348113,
1155
+ "learning_rate": 0.00012743428513652442,
1156
+ "loss": 0.5242,
1157
  "step": 1640
1158
  },
1159
  {
1160
+ "epoch": 1.4193548387096775,
1161
+ "grad_norm": 0.23015296236307278,
1162
+ "learning_rate": 0.00012646988571040398,
1163
+ "loss": 0.522,
1164
  "step": 1650
1165
  },
1166
  {
1167
+ "epoch": 1.4279569892473118,
1168
+ "grad_norm": 0.24213814226406005,
1169
+ "learning_rate": 0.00012550283156169965,
1170
+ "loss": 0.5238,
1171
  "step": 1660
1172
  },
1173
  {
1174
+ "epoch": 1.4365591397849462,
1175
+ "grad_norm": 0.23239320809561437,
1176
+ "learning_rate": 0.0001245332196783817,
1177
+ "loss": 0.5156,
1178
  "step": 1670
1179
  },
1180
  {
1181
+ "epoch": 1.4451612903225808,
1182
+ "grad_norm": 0.23180542434985962,
1183
+ "learning_rate": 0.00012356114730494123,
1184
+ "loss": 0.5184,
1185
  "step": 1680
1186
  },
1187
  {
1188
+ "epoch": 1.453763440860215,
1189
+ "grad_norm": 0.23588148083202282,
1190
+ "learning_rate": 0.00012258671193263716,
1191
+ "loss": 0.5155,
1192
  "step": 1690
1193
  },
1194
  {
1195
+ "epoch": 1.4623655913978495,
1196
+ "grad_norm": 0.23435001337205247,
1197
+ "learning_rate": 0.00012161001128971879,
1198
+ "loss": 0.5293,
1199
  "step": 1700
1200
  },
1201
  {
1202
+ "epoch": 1.4709677419354839,
1203
+ "grad_norm": 0.22453229677185074,
1204
+ "learning_rate": 0.00012063114333162438,
1205
+ "loss": 0.5151,
1206
  "step": 1710
1207
  },
1208
  {
1209
+ "epoch": 1.4795698924731182,
1210
+ "grad_norm": 0.2513082837299489,
1211
+ "learning_rate": 0.00011965020623115688,
1212
+ "loss": 0.5416,
1213
  "step": 1720
1214
  },
1215
  {
1216
+ "epoch": 1.4881720430107528,
1217
+ "grad_norm": 0.23485488435380322,
1218
+ "learning_rate": 0.0001186672983686381,
1219
+ "loss": 0.5114,
1220
  "step": 1730
1221
  },
1222
  {
1223
+ "epoch": 1.4967741935483871,
1224
+ "grad_norm": 0.2339236059203922,
1225
+ "learning_rate": 0.00011768251832204187,
1226
+ "loss": 0.5189,
1227
  "step": 1740
1228
  },
1229
  {
1230
+ "epoch": 1.5053763440860215,
1231
+ "grad_norm": 0.2363738308894046,
1232
+ "learning_rate": 0.00011669596485710741,
1233
+ "loss": 0.5236,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 1.513978494623656,
1238
+ "grad_norm": 0.23458902400188686,
1239
+ "learning_rate": 0.00011570773691743397,
1240
+ "loss": 0.5284,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 1.5225806451612902,
1245
+ "grad_norm": 0.23836545061013703,
1246
+ "learning_rate": 0.0001147179336145575,
1247
+ "loss": 0.5121,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 1.5311827956989248,
1252
+ "grad_norm": 0.23790373487214236,
1253
+ "learning_rate": 0.00011372665421801056,
1254
+ "loss": 0.5165,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 1.5397849462365591,
1259
+ "grad_norm": 0.23929874324218836,
1260
+ "learning_rate": 0.0001127339981453664,
1261
+ "loss": 0.5257,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 1.5483870967741935,
1266
+ "grad_norm": 0.23994815774490452,
1267
+ "learning_rate": 0.00011174006495226812,
1268
+ "loss": 0.5125,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 1.556989247311828,
1273
+ "grad_norm": 0.22986043036890028,
1274
+ "learning_rate": 0.00011074495432244397,
1275
+ "loss": 0.5142,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 1.5655913978494622,
1280
+ "grad_norm": 0.2276160657387438,
1281
+ "learning_rate": 0.0001097487660577099,
1282
+ "loss": 0.5198,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 1.5741935483870968,
1287
+ "grad_norm": 0.23590750768735258,
1288
+ "learning_rate": 0.00010875160006796024,
1289
+ "loss": 0.5203,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 1.5827956989247312,
1294
+ "grad_norm": 0.24238494113010198,
1295
+ "learning_rate": 0.00010775355636114755,
1296
+ "loss": 0.5288,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 1.5913978494623655,
1301
+ "grad_norm": 0.23079601762605023,
1302
+ "learning_rate": 0.00010675473503325245,
1303
+ "loss": 0.516,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 1.6,
1308
+ "grad_norm": 0.23473655518093983,
1309
+ "learning_rate": 0.00010575523625824488,
1310
+ "loss": 0.5245,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 1.6086021505376344,
1315
+ "grad_norm": 0.2297499982621768,
1316
+ "learning_rate": 0.00010475516027803751,
1317
+ "loss": 0.5162,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 1.6172043010752688,
1322
+ "grad_norm": 0.23292165198339548,
1323
+ "learning_rate": 0.00010375460739243215,
1324
+ "loss": 0.5246,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 1.6258064516129034,
1329
+ "grad_norm": 0.24340787369255004,
1330
+ "learning_rate": 0.00010275367794906044,
1331
+ "loss": 0.5099,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 1.6344086021505375,
1336
+ "grad_norm": 0.24374482359627925,
1337
+ "learning_rate": 0.00010175247233331989,
1338
+ "loss": 0.5097,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 1.643010752688172,
1343
+ "grad_norm": 0.23616305839556445,
1344
+ "learning_rate": 0.00010075109095830584,
1345
+ "loss": 0.5231,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 1.6516129032258065,
1350
+ "grad_norm": 0.23231972508384582,
1351
+ "learning_rate": 9.974963425474106e-05,
1352
+ "loss": 0.5213,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 1.6602150537634408,
1357
+ "grad_norm": 0.24023523508198666,
1358
+ "learning_rate": 9.874820266090303e-05,
1359
+ "loss": 0.5145,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 1.6688172043010754,
1364
+ "grad_norm": 0.24186555744758984,
1365
+ "learning_rate": 9.774689661255106e-05,
1366
+ "loss": 0.5256,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 1.6774193548387095,
1371
+ "grad_norm": 0.2372332084270041,
1372
+ "learning_rate": 9.67458165328531e-05,
1373
+ "loss": 0.5094,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 1.686021505376344,
1378
+ "grad_norm": 0.2350639603238352,
1379
+ "learning_rate": 9.574506282231433e-05,
1380
+ "loss": 0.5119,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 1.6946236559139785,
1385
+ "grad_norm": 0.23559422057113213,
1386
+ "learning_rate": 9.474473584870757e-05,
1387
+ "loss": 0.5155,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 1.7032258064516128,
1392
+ "grad_norm": 0.2336022634685686,
1393
+ "learning_rate": 9.374493593700723e-05,
1394
+ "loss": 0.5113,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 1.7118279569892474,
1399
+ "grad_norm": 0.23539681381001779,
1400
+ "learning_rate": 9.274576335932767e-05,
1401
+ "loss": 0.5189,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 1.7204301075268817,
1406
+ "grad_norm": 0.23650024951926094,
1407
+ "learning_rate": 9.174731832486648e-05,
1408
+ "loss": 0.5082,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 1.729032258064516,
1413
+ "grad_norm": 0.23916533165546086,
1414
+ "learning_rate": 9.074970096985427e-05,
1415
+ "loss": 0.5284,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 1.7376344086021507,
1420
+ "grad_norm": 0.23608187656307292,
1421
+ "learning_rate": 8.975301134751202e-05,
1422
+ "loss": 0.5172,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 1.7462365591397848,
1427
+ "grad_norm": 0.23344322072994478,
1428
+ "learning_rate": 8.87573494180163e-05,
1429
+ "loss": 0.511,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 1.7548387096774194,
1434
+ "grad_norm": 0.24030220361622895,
1435
+ "learning_rate": 8.77628150384741e-05,
1436
+ "loss": 0.5254,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 1.7634408602150538,
1441
+ "grad_norm": 0.23404925065530963,
1442
+ "learning_rate": 8.676950795290802e-05,
1443
+ "loss": 0.5189,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 1.772043010752688,
1448
+ "grad_norm": 0.23053178874827976,
1449
+ "learning_rate": 8.57775277822526e-05,
1450
+ "loss": 0.5252,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 1.7806451612903227,
1455
+ "grad_norm": 0.23781516582822665,
1456
+ "learning_rate": 8.478697401436323e-05,
1457
+ "loss": 0.5146,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 1.789247311827957,
1462
+ "grad_norm": 0.2276955393985772,
1463
+ "learning_rate": 8.379794599403836e-05,
1464
+ "loss": 0.5169,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 1.7978494623655914,
1469
+ "grad_norm": 0.23946966468353473,
1470
+ "learning_rate": 8.281054291305566e-05,
1471
+ "loss": 0.5181,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 1.8064516129032258,
1476
+ "grad_norm": 0.22844184267941248,
1477
+ "learning_rate": 8.182486380022426e-05,
1478
+ "loss": 0.5009,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 1.8150537634408601,
1483
+ "grad_norm": 0.2386146449957716,
1484
+ "learning_rate": 8.084100751145277e-05,
1485
+ "loss": 0.5167,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 1.8236559139784947,
1490
+ "grad_norm": 0.22996861067382443,
1491
+ "learning_rate": 7.985907271983467e-05,
1492
+ "loss": 0.5172,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 1.832258064516129,
1497
+ "grad_norm": 0.23122069068423765,
1498
+ "learning_rate": 7.887915790575241e-05,
1499
+ "loss": 0.512,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 1.8408602150537634,
1504
+ "grad_norm": 0.226577342689351,
1505
+ "learning_rate": 7.790136134700042e-05,
1506
+ "loss": 0.5181,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 1.849462365591398,
1511
+ "grad_norm": 0.24205636496242264,
1512
+ "learning_rate": 7.692578110892876e-05,
1513
+ "loss": 0.5206,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 1.8580645161290321,
1518
+ "grad_norm": 0.24349873078348327,
1519
+ "learning_rate": 7.595251503460778e-05,
1520
+ "loss": 0.5226,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 1.8666666666666667,
1525
+ "grad_norm": 0.23758597762780134,
1526
+ "learning_rate": 7.498166073501529e-05,
1527
+ "loss": 0.5244,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 1.875268817204301,
1532
+ "grad_norm": 0.2284958254909026,
1533
+ "learning_rate": 7.401331557924707e-05,
1534
+ "loss": 0.505,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 1.8838709677419354,
1539
+ "grad_norm": 0.24598875919420438,
1540
+ "learning_rate": 7.304757668475122e-05,
1541
+ "loss": 0.5317,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 1.89247311827957,
1546
+ "grad_norm": 0.24113280579774263,
1547
+ "learning_rate": 7.208454090758832e-05,
1548
+ "loss": 0.5178,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 1.9010752688172043,
1553
+ "grad_norm": 0.2362286661628526,
1554
+ "learning_rate": 7.112430483271746e-05,
1555
+ "loss": 0.5202,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 1.9096774193548387,
1560
+ "grad_norm": 0.23413327678165968,
1561
+ "learning_rate": 7.016696476430931e-05,
1562
+ "loss": 0.4944,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 1.9182795698924733,
1567
+ "grad_norm": 0.23501629015109748,
1568
+ "learning_rate": 6.921261671608791e-05,
1569
+ "loss": 0.5153,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 1.9268817204301074,
1574
+ "grad_norm": 0.27066815871872996,
1575
+ "learning_rate": 6.826135640170101e-05,
1576
+ "loss": 0.5252,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 1.935483870967742,
1581
+ "grad_norm": 0.2439503871040344,
1582
+ "learning_rate": 6.731327922512074e-05,
1583
+ "loss": 0.5112,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 1.9440860215053763,
1588
+ "grad_norm": 0.23161029568643865,
1589
+ "learning_rate": 6.636848027107544e-05,
1590
+ "loss": 0.5307,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 1.9526881720430107,
1595
+ "grad_norm": 0.24210935717094484,
1596
+ "learning_rate": 6.54270542955134e-05,
1597
+ "loss": 0.5011,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 1.9612903225806453,
1602
+ "grad_norm": 0.24063211639329316,
1603
+ "learning_rate": 6.44890957160994e-05,
1604
+ "loss": 0.5234,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 1.9698924731182794,
1609
+ "grad_norm": 0.23615678793407338,
1610
+ "learning_rate": 6.355469860274574e-05,
1611
+ "loss": 0.5046,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 1.978494623655914,
1616
+ "grad_norm": 0.22842985809994715,
1617
+ "learning_rate": 6.262395666817724e-05,
1618
+ "loss": 0.5012,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 1.9870967741935484,
1623
+ "grad_norm": 0.23784853454359128,
1624
+ "learning_rate": 6.169696325853312e-05,
1625
+ "loss": 0.519,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 1.9956989247311827,
1630
+ "grad_norm": 0.2342469521451372,
1631
+ "learning_rate": 6.077381134400462e-05,
1632
+ "loss": 0.5002,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 2.0043010752688173,
1637
+ "grad_norm": 0.22833969780513486,
1638
+ "learning_rate": 5.985459350951121e-05,
1639
+ "loss": 0.4845,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 2.0129032258064514,
1644
+ "grad_norm": 0.2544744796985262,
1645
+ "learning_rate": 5.893940194541492e-05,
1646
+ "loss": 0.464,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 2.021505376344086,
1651
+ "grad_norm": 0.24382320744489763,
1652
+ "learning_rate": 5.802832843827419e-05,
1653
+ "loss": 0.455,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 2.0301075268817206,
1658
+ "grad_norm": 0.24697875997895144,
1659
+ "learning_rate": 5.712146436163863e-05,
1660
+ "loss": 0.4598,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 2.0387096774193547,
1665
+ "grad_norm": 0.2513303656347981,
1666
+ "learning_rate": 5.6218900666884975e-05,
1667
+ "loss": 0.4515,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 2.0473118279569893,
1672
+ "grad_norm": 0.26236895102262536,
1673
+ "learning_rate": 5.5320727874095014e-05,
1674
+ "loss": 0.4598,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 2.055913978494624,
1679
+ "grad_norm": 0.25978280232463796,
1680
+ "learning_rate": 5.4427036062977744e-05,
1681
+ "loss": 0.451,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 2.064516129032258,
1686
+ "grad_norm": 0.26162376545024385,
1687
+ "learning_rate": 5.3537914863834374e-05,
1688
+ "loss": 0.4563,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 2.0731182795698926,
1693
+ "grad_norm": 0.27200603174186766,
1694
+ "learning_rate": 5.265345344856979e-05,
1695
+ "loss": 0.4595,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 2.0817204301075267,
1700
+ "grad_norm": 0.2516133186216482,
1701
+ "learning_rate": 5.1773740521748793e-05,
1702
+ "loss": 0.4543,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 2.0903225806451613,
1707
+ "grad_norm": 0.2559266512535566,
1708
+ "learning_rate": 5.089886431169999e-05,
1709
+ "loss": 0.4576,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 2.098924731182796,
1714
+ "grad_norm": 0.26643890557668787,
1715
+ "learning_rate": 5.0028912561667104e-05,
1716
+ "loss": 0.4651,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 2.10752688172043,
1721
+ "grad_norm": 0.2594035563202495,
1722
+ "learning_rate": 4.916397252100892e-05,
1723
+ "loss": 0.4581,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 2.1161290322580646,
1728
+ "grad_norm": 0.2589087763537241,
1729
+ "learning_rate": 4.830413093644913e-05,
1730
+ "loss": 0.4508,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 2.1247311827956987,
1735
+ "grad_norm": 0.25564217019549307,
1736
+ "learning_rate": 4.744947404337605e-05,
1737
+ "loss": 0.4498,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 2.1333333333333333,
1742
+ "grad_norm": 0.25846858564022296,
1743
+ "learning_rate": 4.660008755719397e-05,
1744
+ "loss": 0.4439,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 2.141935483870968,
1749
+ "grad_norm": 0.2615309859057683,
1750
+ "learning_rate": 4.5756056664726554e-05,
1751
+ "loss": 0.4621,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 2.150537634408602,
1756
+ "grad_norm": 0.2751137583558904,
1757
+ "learning_rate": 4.491746601567343e-05,
1758
+ "loss": 0.4542,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 2.1591397849462366,
1763
+ "grad_norm": 0.2640093266145403,
1764
+ "learning_rate": 4.408439971412013e-05,
1765
+ "loss": 0.4709,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 2.167741935483871,
1770
+ "grad_norm": 0.26804533689921306,
1771
+ "learning_rate": 4.325694131010346e-05,
1772
+ "loss": 0.4545,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 2.1763440860215053,
1777
+ "grad_norm": 0.26953837584010415,
1778
+ "learning_rate": 4.243517379123193e-05,
1779
+ "loss": 0.447,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 2.18494623655914,
1784
+ "grad_norm": 0.26331494847521747,
1785
+ "learning_rate": 4.161917957436271e-05,
1786
+ "loss": 0.4454,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 2.193548387096774,
1791
+ "grad_norm": 0.26650178799391877,
1792
+ "learning_rate": 4.080904049733607e-05,
1793
+ "loss": 0.4452,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 2.2021505376344086,
1798
+ "grad_norm": 0.2596397199813554,
1799
+ "learning_rate": 4.0004837810767294e-05,
1800
+ "loss": 0.455,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 2.210752688172043,
1805
+ "grad_norm": 0.2855359315189352,
1806
+ "learning_rate": 3.9206652169898364e-05,
1807
+ "loss": 0.4619,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 2.2193548387096773,
1812
+ "grad_norm": 0.2715169650377671,
1813
+ "learning_rate": 3.841456362650837e-05,
1814
+ "loss": 0.4615,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 2.227956989247312,
1819
+ "grad_norm": 0.2628755048951637,
1820
+ "learning_rate": 3.7628651620885444e-05,
1821
+ "loss": 0.4645,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 2.236559139784946,
1826
+ "grad_norm": 0.28592361434576347,
1827
+ "learning_rate": 3.6848994973859105e-05,
1828
+ "loss": 0.4692,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 2.2451612903225806,
1833
+ "grad_norm": 0.26764492546621405,
1834
+ "learning_rate": 3.607567187889538e-05,
1835
+ "loss": 0.4563,
1836
+ "step": 2610
1837
+ },
1838
+ {
1839
+ "epoch": 2.253763440860215,
1840
+ "grad_norm": 0.271421407542708,
1841
+ "learning_rate": 3.5308759894254496e-05,
1842
+ "loss": 0.4671,
1843
+ "step": 2620
1844
+ },
1845
+ {
1846
+ "epoch": 2.2623655913978493,
1847
+ "grad_norm": 0.2653346076557134,
1848
+ "learning_rate": 3.45483359352125e-05,
1849
+ "loss": 0.4535,
1850
+ "step": 2630
1851
+ },
1852
+ {
1853
+ "epoch": 2.270967741935484,
1854
+ "grad_norm": 0.2672124362703016,
1855
+ "learning_rate": 3.379447626634712e-05,
1856
+ "loss": 0.4523,
1857
+ "step": 2640
1858
+ },
1859
+ {
1860
+ "epoch": 2.279569892473118,
1861
+ "grad_norm": 0.2815570906050588,
1862
+ "learning_rate": 3.304725649388919e-05,
1863
+ "loss": 0.468,
1864
+ "step": 2650
1865
+ },
1866
+ {
1867
+ "epoch": 2.2881720430107526,
1868
+ "grad_norm": 0.27533793807613155,
1869
+ "learning_rate": 3.230675155813979e-05,
1870
+ "loss": 0.4587,
1871
+ "step": 2660
1872
+ },
1873
+ {
1874
+ "epoch": 2.296774193548387,
1875
+ "grad_norm": 0.2718010567194757,
1876
+ "learning_rate": 3.1573035725954344e-05,
1877
+ "loss": 0.4518,
1878
+ "step": 2670
1879
+ },
1880
+ {
1881
+ "epoch": 2.3053763440860213,
1882
+ "grad_norm": 0.26881948505806064,
1883
+ "learning_rate": 3.084618258329443e-05,
1884
+ "loss": 0.4511,
1885
+ "step": 2680
1886
+ },
1887
+ {
1888
+ "epoch": 2.313978494623656,
1889
+ "grad_norm": 0.2735905144552066,
1890
+ "learning_rate": 3.012626502784729e-05,
1891
+ "loss": 0.4437,
1892
+ "step": 2690
1893
+ },
1894
+ {
1895
+ "epoch": 2.3225806451612905,
1896
+ "grad_norm": 0.2753569674239171,
1897
+ "learning_rate": 2.9413355261715192e-05,
1898
+ "loss": 0.457,
1899
+ "step": 2700
1900
+ },
1901
+ {
1902
+ "epoch": 2.3311827956989246,
1903
+ "grad_norm": 0.27264140236415646,
1904
+ "learning_rate": 2.87075247841738e-05,
1905
+ "loss": 0.4562,
1906
+ "step": 2710
1907
+ },
1908
+ {
1909
+ "epoch": 2.339784946236559,
1910
+ "grad_norm": 0.2750362048193341,
1911
+ "learning_rate": 2.8008844384501566e-05,
1912
+ "loss": 0.4578,
1913
+ "step": 2720
1914
+ },
1915
+ {
1916
+ "epoch": 2.3483870967741938,
1917
+ "grad_norm": 0.2743768587791329,
1918
+ "learning_rate": 2.7317384134879965e-05,
1919
+ "loss": 0.4569,
1920
+ "step": 2730
1921
+ },
1922
+ {
1923
+ "epoch": 2.356989247311828,
1924
+ "grad_norm": 0.27994254494622967,
1925
+ "learning_rate": 2.6633213383365906e-05,
1926
+ "loss": 0.4554,
1927
+ "step": 2740
1928
+ },
1929
+ {
1930
+ "epoch": 2.3655913978494625,
1931
+ "grad_norm": 0.273412382006339,
1932
+ "learning_rate": 2.595640074693664e-05,
1933
+ "loss": 0.4521,
1934
+ "step": 2750
1935
+ },
1936
+ {
1937
+ "epoch": 2.3741935483870966,
1938
+ "grad_norm": 0.26693176903025,
1939
+ "learning_rate": 2.5287014104607975e-05,
1940
+ "loss": 0.4472,
1941
+ "step": 2760
1942
+ },
1943
+ {
1944
+ "epoch": 2.382795698924731,
1945
+ "grad_norm": 0.26563176753886897,
1946
+ "learning_rate": 2.4625120590626595e-05,
1947
+ "loss": 0.4487,
1948
+ "step": 2770
1949
+ },
1950
+ {
1951
+ "epoch": 2.3913978494623658,
1952
+ "grad_norm": 0.27008804180981577,
1953
+ "learning_rate": 2.397078658773699e-05,
1954
+ "loss": 0.4611,
1955
+ "step": 2780
1956
+ },
1957
+ {
1958
+ "epoch": 2.4,
1959
+ "grad_norm": 0.2624172113992881,
1960
+ "learning_rate": 2.3324077720523785e-05,
1961
+ "loss": 0.4461,
1962
+ "step": 2790
1963
+ },
1964
+ {
1965
+ "epoch": 2.4086021505376345,
1966
+ "grad_norm": 0.27677007427727796,
1967
+ "learning_rate": 2.2685058848830076e-05,
1968
+ "loss": 0.4536,
1969
+ "step": 2800
1970
+ },
1971
+ {
1972
+ "epoch": 2.4172043010752686,
1973
+ "grad_norm": 0.27569949386731046,
1974
+ "learning_rate": 2.2053794061252675e-05,
1975
+ "loss": 0.4474,
1976
+ "step": 2810
1977
+ },
1978
+ {
1979
+ "epoch": 2.425806451612903,
1980
+ "grad_norm": 0.267987384900051,
1981
+ "learning_rate": 2.1430346668714175e-05,
1982
+ "loss": 0.4487,
1983
+ "step": 2820
1984
+ },
1985
+ {
1986
+ "epoch": 2.434408602150538,
1987
+ "grad_norm": 0.2669869480120869,
1988
+ "learning_rate": 2.0814779198113687e-05,
1989
+ "loss": 0.455,
1990
+ "step": 2830
1991
+ },
1992
+ {
1993
+ "epoch": 2.443010752688172,
1994
+ "grad_norm": 0.26354591329030985,
1995
+ "learning_rate": 2.020715338605581e-05,
1996
+ "loss": 0.4616,
1997
+ "step": 2840
1998
+ },
1999
+ {
2000
+ "epoch": 2.4516129032258065,
2001
+ "grad_norm": 0.2785720489510887,
2002
+ "learning_rate": 1.9607530172658715e-05,
2003
+ "loss": 0.4419,
2004
+ "step": 2850
2005
+ },
2006
+ {
2007
+ "epoch": 2.4602150537634406,
2008
+ "grad_norm": 0.27851326696395934,
2009
+ "learning_rate": 1.9015969695442704e-05,
2010
+ "loss": 0.4572,
2011
+ "step": 2860
2012
+ },
2013
+ {
2014
+ "epoch": 2.468817204301075,
2015
+ "grad_norm": 0.27236418612561275,
2016
+ "learning_rate": 1.8432531283298458e-05,
2017
+ "loss": 0.453,
2018
+ "step": 2870
2019
+ },
2020
+ {
2021
+ "epoch": 2.47741935483871,
2022
+ "grad_norm": 0.2816067593156165,
2023
+ "learning_rate": 1.7857273450537227e-05,
2024
+ "loss": 0.4607,
2025
+ "step": 2880
2026
+ },
2027
+ {
2028
+ "epoch": 2.486021505376344,
2029
+ "grad_norm": 0.27424768533392857,
2030
+ "learning_rate": 1.7290253891022e-05,
2031
+ "loss": 0.4385,
2032
+ "step": 2890
2033
+ },
2034
+ {
2035
+ "epoch": 2.4946236559139785,
2036
+ "grad_norm": 0.2766562755277204,
2037
+ "learning_rate": 1.673152947238139e-05,
2038
+ "loss": 0.4567,
2039
+ "step": 2900
2040
+ },
2041
+ {
2042
+ "epoch": 2.5032258064516126,
2043
+ "grad_norm": 0.2614813998070769,
2044
+ "learning_rate": 1.618115623030625e-05,
2045
+ "loss": 0.4428,
2046
+ "step": 2910
2047
+ },
2048
+ {
2049
+ "epoch": 2.511827956989247,
2050
+ "grad_norm": 0.272212168117098,
2051
+ "learning_rate": 1.5639189362929695e-05,
2052
+ "loss": 0.453,
2053
+ "step": 2920
2054
+ },
2055
+ {
2056
+ "epoch": 2.520430107526882,
2057
+ "grad_norm": 0.2747575768340182,
2058
+ "learning_rate": 1.5105683225291211e-05,
2059
+ "loss": 0.4586,
2060
+ "step": 2930
2061
+ },
2062
+ {
2063
+ "epoch": 2.5290322580645164,
2064
+ "grad_norm": 0.2711344401379175,
2065
+ "learning_rate": 1.4580691323885209e-05,
2066
+ "loss": 0.4506,
2067
+ "step": 2940
2068
+ },
2069
+ {
2070
+ "epoch": 2.5376344086021505,
2071
+ "grad_norm": 0.27821711929840565,
2072
+ "learning_rate": 1.4064266311294793e-05,
2073
+ "loss": 0.4669,
2074
+ "step": 2950
2075
+ },
2076
+ {
2077
+ "epoch": 2.546236559139785,
2078
+ "grad_norm": 0.2807569627250837,
2079
+ "learning_rate": 1.3556459980911085e-05,
2080
+ "loss": 0.4567,
2081
+ "step": 2960
2082
+ },
2083
+ {
2084
+ "epoch": 2.554838709677419,
2085
+ "grad_norm": 0.27478294823892596,
2086
+ "learning_rate": 1.305732326173882e-05,
2087
+ "loss": 0.4629,
2088
+ "step": 2970
2089
+ },
2090
+ {
2091
+ "epoch": 2.563440860215054,
2092
+ "grad_norm": 0.2694458161205595,
2093
+ "learning_rate": 1.2566906213288388e-05,
2094
+ "loss": 0.4518,
2095
+ "step": 2980
2096
+ },
2097
+ {
2098
+ "epoch": 2.5720430107526884,
2099
+ "grad_norm": 0.26379184313364523,
2100
+ "learning_rate": 1.2085258020555556e-05,
2101
+ "loss": 0.4489,
2102
+ "step": 2990
2103
+ },
2104
+ {
2105
+ "epoch": 2.5806451612903225,
2106
+ "grad_norm": 0.27978762666210216,
2107
+ "learning_rate": 1.1612426989088232e-05,
2108
+ "loss": 0.4493,
2109
+ "step": 3000
2110
+ },
2111
+ {
2112
+ "epoch": 2.589247311827957,
2113
+ "grad_norm": 0.28005079300575625,
2114
+ "learning_rate": 1.1148460540142125e-05,
2115
+ "loss": 0.4501,
2116
+ "step": 3010
2117
+ },
2118
+ {
2119
+ "epoch": 2.5978494623655912,
2120
+ "grad_norm": 0.2765800997942907,
2121
+ "learning_rate": 1.0693405205924579e-05,
2122
+ "loss": 0.4599,
2123
+ "step": 3020
2124
+ },
2125
+ {
2126
+ "epoch": 2.606451612903226,
2127
+ "grad_norm": 0.26856411845690764,
2128
+ "learning_rate": 1.0247306624927789e-05,
2129
+ "loss": 0.4489,
2130
+ "step": 3030
2131
+ },
2132
+ {
2133
+ "epoch": 2.6150537634408604,
2134
+ "grad_norm": 0.2758438510594412,
2135
+ "learning_rate": 9.810209537351645e-06,
2136
+ "loss": 0.4427,
2137
+ "step": 3040
2138
+ },
2139
+ {
2140
+ "epoch": 2.6236559139784945,
2141
+ "grad_norm": 0.2808673768423207,
2142
+ "learning_rate": 9.382157780616606e-06,
2143
+ "loss": 0.4369,
2144
+ "step": 3050
2145
+ },
2146
+ {
2147
+ "epoch": 2.632258064516129,
2148
+ "grad_norm": 0.26355013415952777,
2149
+ "learning_rate": 8.963194284967202e-06,
2150
+ "loss": 0.4543,
2151
+ "step": 3060
2152
+ },
2153
+ {
2154
+ "epoch": 2.6408602150537632,
2155
+ "grad_norm": 0.26682463219677827,
2156
+ "learning_rate": 8.553361069166388e-06,
2157
+ "loss": 0.4448,
2158
+ "step": 3070
2159
+ },
2160
+ {
2161
+ "epoch": 2.649462365591398,
2162
+ "grad_norm": 0.2824919005760602,
2163
+ "learning_rate": 8.15269923628147e-06,
2164
+ "loss": 0.463,
2165
+ "step": 3080
2166
+ },
2167
+ {
2168
+ "epoch": 2.6580645161290324,
2169
+ "grad_norm": 0.2606472077942361,
2170
+ "learning_rate": 7.761248969561729e-06,
2171
+ "loss": 0.4435,
2172
+ "step": 3090
2173
+ },
2174
+ {
2175
+ "epoch": 2.6666666666666665,
2176
+ "grad_norm": 0.2716672911173004,
2177
+ "learning_rate": 7.379049528408433e-06,
2178
+ "loss": 0.4436,
2179
+ "step": 3100
2180
+ },
2181
+ {
2182
+ "epoch": 2.675268817204301,
2183
+ "grad_norm": 0.2692189798747163,
2184
+ "learning_rate": 7.00613924443726e-06,
2185
+ "loss": 0.456,
2186
+ "step": 3110
2187
+ },
2188
+ {
2189
+ "epoch": 2.6838709677419352,
2190
+ "grad_norm": 0.2751219149255623,
2191
+ "learning_rate": 6.642555517634197e-06,
2192
+ "loss": 0.4486,
2193
+ "step": 3120
2194
+ },
2195
+ {
2196
+ "epoch": 2.69247311827957,
2197
+ "grad_norm": 0.26558006981794996,
2198
+ "learning_rate": 6.288334812604324e-06,
2199
+ "loss": 0.448,
2200
+ "step": 3130
2201
+ },
2202
+ {
2203
+ "epoch": 2.7010752688172044,
2204
+ "grad_norm": 0.2859552474881587,
2205
+ "learning_rate": 5.943512654914951e-06,
2206
+ "loss": 0.4357,
2207
+ "step": 3140
2208
+ },
2209
+ {
2210
+ "epoch": 2.709677419354839,
2211
+ "grad_norm": 0.27548323875619696,
2212
+ "learning_rate": 5.6081236275325355e-06,
2213
+ "loss": 0.4624,
2214
+ "step": 3150
2215
+ },
2216
+ {
2217
+ "epoch": 2.718279569892473,
2218
+ "grad_norm": 0.2806542534666262,
2219
+ "learning_rate": 5.282201367354245e-06,
2220
+ "loss": 0.4505,
2221
+ "step": 3160
2222
+ },
2223
+ {
2224
+ "epoch": 2.7268817204301077,
2225
+ "grad_norm": 0.26475293341781525,
2226
+ "learning_rate": 4.965778561834644e-06,
2227
+ "loss": 0.4386,
2228
+ "step": 3170
2229
+ },
2230
+ {
2231
+ "epoch": 2.735483870967742,
2232
+ "grad_norm": 0.28440947626379526,
2233
+ "learning_rate": 4.658886945707164e-06,
2234
+ "loss": 0.4542,
2235
+ "step": 3180
2236
+ },
2237
+ {
2238
+ "epoch": 2.7440860215053764,
2239
+ "grad_norm": 0.2696410246213196,
2240
+ "learning_rate": 4.361557297801499e-06,
2241
+ "loss": 0.4416,
2242
+ "step": 3190
2243
+ },
2244
+ {
2245
+ "epoch": 2.752688172043011,
2246
+ "grad_norm": 0.27708040093077324,
2247
+ "learning_rate": 4.073819437956694e-06,
2248
+ "loss": 0.4471,
2249
+ "step": 3200
2250
+ },
2251
+ {
2252
+ "epoch": 2.761290322580645,
2253
+ "grad_norm": 0.2615573424318419,
2254
+ "learning_rate": 3.7957022240304173e-06,
2255
+ "loss": 0.4505,
2256
+ "step": 3210
2257
+ },
2258
+ {
2259
+ "epoch": 2.7698924731182797,
2260
+ "grad_norm": 0.2776744589099636,
2261
+ "learning_rate": 3.5272335490047937e-06,
2262
+ "loss": 0.4416,
2263
+ "step": 3220
2264
+ },
2265
+ {
2266
+ "epoch": 2.778494623655914,
2267
+ "grad_norm": 0.28362883428775265,
2268
+ "learning_rate": 3.2684403381889272e-06,
2269
+ "loss": 0.4562,
2270
+ "step": 3230
2271
+ },
2272
+ {
2273
+ "epoch": 2.7870967741935484,
2274
+ "grad_norm": 0.2659089658104915,
2275
+ "learning_rate": 3.019348546518508e-06,
2276
+ "loss": 0.4542,
2277
+ "step": 3240
2278
+ },
2279
+ {
2280
+ "epoch": 2.795698924731183,
2281
+ "grad_norm": 0.27817617897785124,
2282
+ "learning_rate": 2.7799831559527258e-06,
2283
+ "loss": 0.4535,
2284
+ "step": 3250
2285
+ },
2286
+ {
2287
+ "epoch": 2.804301075268817,
2288
+ "grad_norm": 0.2767257855534767,
2289
+ "learning_rate": 2.550368172968809e-06,
2290
+ "loss": 0.4438,
2291
+ "step": 3260
2292
+ },
2293
+ {
2294
+ "epoch": 2.8129032258064517,
2295
+ "grad_norm": 0.2659520383981123,
2296
+ "learning_rate": 2.3305266261542945e-06,
2297
+ "loss": 0.4648,
2298
+ "step": 3270
2299
+ },
2300
+ {
2301
+ "epoch": 2.821505376344086,
2302
+ "grad_norm": 0.26860959961537617,
2303
+ "learning_rate": 2.1204805638975646e-06,
2304
+ "loss": 0.4478,
2305
+ "step": 3280
2306
+ },
2307
+ {
2308
+ "epoch": 2.8301075268817204,
2309
+ "grad_norm": 0.2772290842479072,
2310
+ "learning_rate": 1.9202510521763696e-06,
2311
+ "loss": 0.4556,
2312
+ "step": 3290
2313
+ },
2314
+ {
2315
+ "epoch": 2.838709677419355,
2316
+ "grad_norm": 0.2773729267266774,
2317
+ "learning_rate": 1.7298581724452978e-06,
2318
+ "loss": 0.4546,
2319
+ "step": 3300
2320
+ },
2321
+ {
2322
+ "epoch": 2.847311827956989,
2323
+ "grad_norm": 0.288243164539903,
2324
+ "learning_rate": 1.5493210196216079e-06,
2325
+ "loss": 0.457,
2326
+ "step": 3310
2327
+ },
2328
+ {
2329
+ "epoch": 2.8559139784946237,
2330
+ "grad_norm": 0.27792696429990915,
2331
+ "learning_rate": 1.378657700170205e-06,
2332
+ "loss": 0.4585,
2333
+ "step": 3320
2334
+ },
2335
+ {
2336
+ "epoch": 2.864516129032258,
2337
+ "grad_norm": 0.27882422779333554,
2338
+ "learning_rate": 1.2178853302877159e-06,
2339
+ "loss": 0.4486,
2340
+ "step": 3330
2341
+ },
2342
+ {
2343
+ "epoch": 2.8731182795698924,
2344
+ "grad_norm": 0.28133695329057534,
2345
+ "learning_rate": 1.0670200341858394e-06,
2346
+ "loss": 0.4502,
2347
+ "step": 3340
2348
+ },
2349
+ {
2350
+ "epoch": 2.881720430107527,
2351
+ "grad_norm": 0.2859131700306455,
2352
+ "learning_rate": 9.260769424742633e-07,
2353
+ "loss": 0.4452,
2354
+ "step": 3350
2355
+ },
2356
+ {
2357
+ "epoch": 2.8903225806451616,
2358
+ "grad_norm": 0.26707708683830167,
2359
+ "learning_rate": 7.950701906431324e-07,
2360
+ "loss": 0.4498,
2361
+ "step": 3360
2362
+ },
2363
+ {
2364
+ "epoch": 2.8989247311827957,
2365
+ "grad_norm": 0.2716338526967156,
2366
+ "learning_rate": 6.740129176453725e-07,
2367
+ "loss": 0.4568,
2368
+ "step": 3370
2369
+ },
2370
+ {
2371
+ "epoch": 2.90752688172043,
2372
+ "grad_norm": 0.29044410591630965,
2373
+ "learning_rate": 5.629172645789882e-07,
2374
+ "loss": 0.4517,
2375
+ "step": 3380
2376
+ },
2377
+ {
2378
+ "epoch": 2.9161290322580644,
2379
+ "grad_norm": 0.27054199049752875,
2380
+ "learning_rate": 4.617943734694152e-07,
2381
+ "loss": 0.4559,
2382
+ "step": 3390
2383
+ },
2384
+ {
2385
+ "epoch": 2.924731182795699,
2386
+ "grad_norm": 0.27515263341966606,
2387
+ "learning_rate": 3.7065438615198066e-07,
2388
+ "loss": 0.4576,
2389
+ "step": 3400
2390
+ },
2391
+ {
2392
+ "epoch": 2.9333333333333336,
2393
+ "grad_norm": 0.2653013119464242,
2394
+ "learning_rate": 2.8950644325485e-07,
2395
+ "loss": 0.449,
2396
+ "step": 3410
2397
+ },
2398
+ {
2399
+ "epoch": 2.9419354838709677,
2400
+ "grad_norm": 0.2791971697586357,
2401
+ "learning_rate": 2.183586832822493e-07,
2402
+ "loss": 0.4562,
2403
+ "step": 3420
2404
+ },
2405
+ {
2406
+ "epoch": 2.9505376344086023,
2407
+ "grad_norm": 0.2668878384341789,
2408
+ "learning_rate": 1.572182417982515e-07,
2409
+ "loss": 0.447,
2410
+ "step": 3430
2411
+ },
2412
+ {
2413
+ "epoch": 2.9591397849462364,
2414
+ "grad_norm": 0.2759015749441583,
2415
+ "learning_rate": 1.0609125071109338e-07,
2416
+ "loss": 0.4567,
2417
+ "step": 3440
2418
+ },
2419
+ {
2420
+ "epoch": 2.967741935483871,
2421
+ "grad_norm": 0.283562302360861,
2422
+ "learning_rate": 6.49828376582673e-08,
2423
+ "loss": 0.4503,
2424
+ "step": 3450
2425
+ },
2426
+ {
2427
+ "epoch": 2.9763440860215056,
2428
+ "grad_norm": 0.28059419780497596,
2429
+ "learning_rate": 3.3897125492188266e-08,
2430
+ "loss": 0.4525,
2431
+ "step": 3460
2432
+ },
2433
+ {
2434
+ "epoch": 2.9849462365591397,
2435
+ "grad_norm": 0.2757143204657877,
2436
+ "learning_rate": 1.2837231866746902e-08,
2437
+ "loss": 0.4455,
2438
+ "step": 3470
2439
+ },
2440
+ {
2441
+ "epoch": 2.9935483870967743,
2442
+ "grad_norm": 0.26729797929321086,
2443
+ "learning_rate": 1.8052689246150779e-09,
2444
+ "loss": 0.4443,
2445
+ "step": 3480
2446
+ },
2447
+ {
2448
+ "epoch": 2.9987096774193547,
2449
+ "step": 3486,
2450
+ "total_flos": 6514990399881216.0,
2451
+ "train_loss": 0.5279938033170147,
2452
+ "train_runtime": 9440.7969,
2453
+ "train_samples_per_second": 5.91,
2454
+ "train_steps_per_second": 0.369
2455
  }
2456
  ],
2457
  "logging_steps": 10,
2458
+ "max_steps": 3486,
2459
  "num_input_tokens_seen": 0,
2460
  "num_train_epochs": 3,
2461
  "save_steps": 500,
 
2471
  "attributes": {}
2472
  }
2473
  },
2474
+ "total_flos": 6514990399881216.0,
2475
  "train_batch_size": 4,
2476
  "trial_name": null,
2477
  "trial_params": null