chansung committed on
Commit d3c84c3
1 Parent(s): a08bf1f

Model save

README.md CHANGED
@@ -2,13 +2,12 @@
  license: gemma
  library_name: peft
  tags:
- - alignment-handbook
  - trl
  - sft
  - generated_from_trainer
  base_model: google/gemma-2b
  datasets:
- - llama-duo/synth_summarize_dataset_dedup
+ - generator
  model-index:
  - name: gemma2b-summarize-gemini1_5flash-64k
    results: []
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
 
  # gemma2b-summarize-gemini1_5flash-64k
 
- This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the llama-duo/synth_summarize_dataset_dedup dataset.
+ This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the generator dataset.
  It achieves the following results on the evaluation set:
- - Loss: 2.5425
+ - Loss: 2.7185
 
  ## Model description
 
@@ -41,14 +40,14 @@ More information needed
 
  The following hyperparameters were used during training:
  - learning_rate: 0.0002
- - train_batch_size: 8
- - eval_batch_size: 8
+ - train_batch_size: 16
+ - eval_batch_size: 16
  - seed: 42
  - distributed_type: multi-GPU
- - num_devices: 4
+ - num_devices: 8
  - gradient_accumulation_steps: 2
- - total_train_batch_size: 64
- - total_eval_batch_size: 32
+ - total_train_batch_size: 256
+ - total_eval_batch_size: 128
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_ratio: 0.1
@@ -58,27 +57,27 @@ The following hyperparameters were used during training:
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-------:|:----:|:---------------:|
- | 1.2724 | 0.9952 | 104 | 2.5531 |
- | 1.0792 | 2.0 | 209 | 2.4752 |
- | 1.0312 | 2.9952 | 313 | 2.4632 |
- | 0.9957 | 4.0 | 418 | 2.4705 |
- | 0.9775 | 4.9952 | 522 | 2.4764 |
- | 0.9584 | 6.0 | 627 | 2.4870 |
- | 0.9368 | 6.9952 | 731 | 2.4975 |
- | 0.93 | 8.0 | 836 | 2.5067 |
- | 0.9195 | 8.9952 | 940 | 2.5168 |
- | 0.912 | 10.0 | 1045 | 2.5271 |
- | 0.9003 | 10.9952 | 1149 | 2.5356 |
- | 0.9032 | 12.0 | 1254 | 2.5401 |
- | 0.9006 | 12.9952 | 1358 | 2.5426 |
- | 0.9007 | 14.0 | 1463 | 2.5433 |
- | 0.896 | 14.9282 | 1560 | 2.5425 |
+ | 1.518 | 0.9905 | 52 | 2.7709 |
+ | 1.1423 | 2.0 | 105 | 2.6595 |
+ | 1.0681 | 2.9905 | 157 | 2.6406 |
+ | 1.0335 | 4.0 | 210 | 2.6427 |
+ | 1.0079 | 4.9905 | 262 | 2.6459 |
+ | 0.9837 | 6.0 | 315 | 2.6574 |
+ | 0.966 | 6.9905 | 367 | 2.6700 |
+ | 0.9474 | 8.0 | 420 | 2.6799 |
+ | 0.9406 | 8.9905 | 472 | 2.6883 |
+ | 0.9245 | 10.0 | 525 | 2.6975 |
+ | 0.9208 | 10.9905 | 577 | 2.7079 |
+ | 0.9195 | 12.0 | 630 | 2.7148 |
+ | 0.9212 | 12.9905 | 682 | 2.7154 |
+ | 0.9136 | 14.0 | 735 | 2.7181 |
+ | 0.9103 | 14.8571 | 780 | 2.7185 |
 
 
  ### Framework versions
 
  - PEFT 0.11.1
- - Transformers 4.40.1
- - Pytorch 2.2.0+cu121
+ - Transformers 4.41.2
+ - Pytorch 2.3.1+cu121
  - Datasets 2.19.2
  - Tokenizers 0.19.1
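The card above describes a PEFT adapter for google/gemma-2b rather than a full model (the ~39 MB adapter_model.safetensors below), so a minimal usage sketch may help alongside the updated README. The snippet is not part of this commit: the adapter repo id is an assumption derived from the model-index name, and only standard transformers/peft loading calls are used.

```python
# Sketch: attach the PEFT adapter from this repo to the google/gemma-2b base model.
# The adapter repo id is assumed from the model-index name; adjust it to the actual repo.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "google/gemma-2b"
adapter_id = "llama-duo/gemma2b-summarize-gemini1_5flash-64k"  # hypothetical repo id

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base, adapter_id)  # loads adapter_model.safetensors

prompt = "Summarize the following text:\nThe quick brown fox jumps over the lazy dog."
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```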
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9c2747846863e52a0aaf9d156c39e150d13213de860c580c3e4dd71702df720c
+ oid sha256:2ae6086e7cdaa1c7742eb4042577d161b6afc3838df568f2b317918f4e82a95d
  size 39256960
all_results.json CHANGED
@@ -1,14 +1,9 @@
  {
- "epoch": 14.92822966507177,
- "eval_loss": 2.542541980743408,
- "eval_runtime": 0.5093,
- "eval_samples": 25,
- "eval_samples_per_second": 19.634,
- "eval_steps_per_second": 1.963,
- "total_flos": 1.2217365722824704e+18,
- "train_loss": 1.0227742999027938,
- "train_runtime": 5355.4835,
+ "epoch": 14.857142857142858,
+ "total_flos": 1.2277516310308454e+18,
+ "train_loss": 1.077159938445458,
+ "train_runtime": 4175.1629,
  "train_samples": 63353,
- "train_samples_per_second": 18.668,
- "train_steps_per_second": 0.291
+ "train_samples_per_second": 47.883,
+ "train_steps_per_second": 0.187
  }
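The summary metrics above can be cross-checked against the hyperparameters in the updated README. As a rough sanity check (a sketch using numbers copied from this diff; small rounding differences are expected), the samples-per-second figure should be close to the steps-per-second figure multiplied by the effective batch size of 256:

```python
# Rough consistency check of the new run's throughput numbers (copied from the diff above).
train_steps_per_second = 0.187
train_samples_per_second = 47.883
total_train_batch_size = 16 * 8 * 2  # per-device batch * num_devices * grad accumulation = 256 (from the README)

estimate = train_steps_per_second * total_train_batch_size
print(f"steps/s * effective batch ~= {estimate:.1f} samples/s")  # ~47.9, close to the logged 47.883
```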
runs/Jun10_02-23-10_48ddfe8e991f/events.out.tfevents.1717986227.48ddfe8e991f.24991.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cedcb2a2fda8cd94bb1b3b5813c49136881a917554c388d17c88d3c7d7dba042
- size 38765
+ oid sha256:5119fd0853b4e3e482596fc5841bf36b582f1f8f34c1c401585c1877c033ad9a
+ size 43037
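The updated TensorBoard event file can be inspected without launching the full UI. Below is a minimal sketch using the tensorboard Python API; the scalar tag names ("train/loss", "eval/loss") are assumptions based on the usual transformers Trainer logging layout, not something this commit confirms.

```python
# Sketch: read scalar curves from the updated event file with the tensorboard API.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

path = "runs/Jun10_02-23-10_48ddfe8e991f/events.out.tfevents.1717986227.48ddfe8e991f.24991.0"
acc = EventAccumulator(path)
acc.Reload()                                 # parse the event file
print(acc.Tags()["scalars"])                 # list the scalar tags actually present
for event in acc.Scalars("train/loss")[:5]:  # assumed tag name; pick one printed above
    print(event.step, event.value)
```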
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 14.92822966507177,
- "total_flos": 1.2217365722824704e+18,
- "train_loss": 1.0227742999027938,
- "train_runtime": 5355.4835,
+ "epoch": 14.857142857142858,
+ "total_flos": 1.2277516310308454e+18,
+ "train_loss": 1.077159938445458,
+ "train_runtime": 4175.1629,
  "train_samples": 63353,
- "train_samples_per_second": 18.668,
- "train_steps_per_second": 0.187
+ "train_samples_per_second": 47.883,
+ "train_steps_per_second": 0.187
  }
trainer_state.json CHANGED
@@ -1,2341 +1,1261 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 14.92822966507177,
5
  "eval_steps": 500,
6
- "global_step": 1560,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.009569377990430622,
13
- "grad_norm": 2.171875,
14
- "learning_rate": 1.282051282051282e-06,
15
- "loss": 3.0241,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.04784688995215311,
20
- "grad_norm": 2.40625,
21
- "learning_rate": 6.41025641025641e-06,
22
- "loss": 3.0093,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.09569377990430622,
27
- "grad_norm": 2.09375,
28
- "learning_rate": 1.282051282051282e-05,
29
- "loss": 3.0072,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.14354066985645933,
34
- "grad_norm": 1.984375,
35
- "learning_rate": 1.923076923076923e-05,
36
- "loss": 2.9982,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.19138755980861244,
41
- "grad_norm": 2.015625,
42
- "learning_rate": 2.564102564102564e-05,
43
- "loss": 2.9122,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.23923444976076555,
48
- "grad_norm": 2.53125,
49
- "learning_rate": 3.205128205128206e-05,
50
- "loss": 2.785,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.28708133971291866,
55
- "grad_norm": 1.875,
56
- "learning_rate": 3.846153846153846e-05,
57
- "loss": 2.6063,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.3349282296650718,
62
- "grad_norm": 2.65625,
63
- "learning_rate": 4.4871794871794874e-05,
64
- "loss": 2.4641,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.3827751196172249,
69
- "grad_norm": 1.890625,
70
- "learning_rate": 5.128205128205128e-05,
71
- "loss": 2.3442,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.430622009569378,
76
- "grad_norm": 73.5,
77
- "learning_rate": 5.769230769230769e-05,
78
- "loss": 2.2165,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.4784688995215311,
83
- "grad_norm": 1.1171875,
84
- "learning_rate": 6.410256410256412e-05,
85
- "loss": 2.0922,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.5263157894736842,
90
- "grad_norm": 0.87890625,
91
- "learning_rate": 7.051282051282052e-05,
92
- "loss": 1.9553,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.5741626794258373,
97
- "grad_norm": 0.89453125,
98
- "learning_rate": 7.692307692307693e-05,
99
- "loss": 1.8161,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.6220095693779905,
104
- "grad_norm": 0.65234375,
105
- "learning_rate": 8.333333333333334e-05,
106
- "loss": 1.6939,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.6698564593301436,
111
- "grad_norm": 0.52734375,
112
- "learning_rate": 8.974358974358975e-05,
113
- "loss": 1.6032,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.7177033492822966,
118
- "grad_norm": 0.546875,
119
- "learning_rate": 9.615384615384617e-05,
120
- "loss": 1.5079,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.7655502392344498,
125
- "grad_norm": 0.498046875,
126
- "learning_rate": 0.00010256410256410256,
127
- "loss": 1.4389,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.8133971291866029,
132
- "grad_norm": 0.67578125,
133
- "learning_rate": 0.00010897435897435896,
134
- "loss": 1.3952,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.861244019138756,
139
- "grad_norm": 0.302734375,
140
- "learning_rate": 0.00011538461538461538,
141
- "loss": 1.3286,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.9090909090909091,
146
- "grad_norm": 0.55859375,
147
- "learning_rate": 0.00012179487179487179,
148
- "loss": 1.3036,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.9569377990430622,
153
- "grad_norm": 0.515625,
154
- "learning_rate": 0.00012820512820512823,
155
- "loss": 1.2724,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.9952153110047847,
160
- "eval_loss": 2.5530645847320557,
161
- "eval_runtime": 0.6584,
162
- "eval_samples_per_second": 15.189,
163
- "eval_steps_per_second": 1.519,
164
- "step": 104
165
  },
166
  {
167
- "epoch": 1.0047846889952152,
168
- "grad_norm": 0.31640625,
169
- "learning_rate": 0.00013461538461538464,
170
- "loss": 1.2461,
 
171
  "step": 105
172
  },
173
  {
174
- "epoch": 1.0526315789473684,
175
- "grad_norm": 0.486328125,
176
- "learning_rate": 0.00014102564102564104,
177
- "loss": 1.2256,
178
  "step": 110
179
  },
180
  {
181
- "epoch": 1.1004784688995215,
182
- "grad_norm": 0.255859375,
183
- "learning_rate": 0.00014743589743589745,
184
- "loss": 1.2074,
185
  "step": 115
186
  },
187
  {
188
- "epoch": 1.1483253588516746,
189
- "grad_norm": 0.396484375,
190
- "learning_rate": 0.00015384615384615385,
191
- "loss": 1.1911,
192
  "step": 120
193
  },
194
  {
195
- "epoch": 1.1961722488038278,
196
- "grad_norm": 1.0390625,
197
- "learning_rate": 0.00016025641025641028,
198
- "loss": 1.1684,
199
  "step": 125
200
  },
201
  {
202
- "epoch": 1.244019138755981,
203
- "grad_norm": 0.453125,
204
- "learning_rate": 0.0001666666666666667,
205
- "loss": 1.1506,
206
  "step": 130
207
  },
208
  {
209
- "epoch": 1.291866028708134,
210
- "grad_norm": 0.56640625,
211
- "learning_rate": 0.0001730769230769231,
212
- "loss": 1.1572,
213
  "step": 135
214
  },
215
  {
216
- "epoch": 1.339712918660287,
217
- "grad_norm": 0.75,
218
- "learning_rate": 0.0001794871794871795,
219
- "loss": 1.136,
220
  "step": 140
221
  },
222
  {
223
- "epoch": 1.38755980861244,
224
- "grad_norm": 0.7890625,
225
- "learning_rate": 0.0001858974358974359,
226
- "loss": 1.138,
227
  "step": 145
228
  },
229
  {
230
- "epoch": 1.4354066985645932,
231
- "grad_norm": 0.2890625,
232
- "learning_rate": 0.00019230769230769233,
233
- "loss": 1.1339,
234
  "step": 150
235
  },
236
  {
237
- "epoch": 1.4832535885167464,
238
- "grad_norm": 0.71875,
239
- "learning_rate": 0.00019871794871794874,
240
- "loss": 1.1258,
241
  "step": 155
242
  },
243
  {
244
- "epoch": 1.5311004784688995,
245
- "grad_norm": 1.078125,
246
- "learning_rate": 0.00019999599453798524,
247
- "loss": 1.118,
248
  "step": 160
249
  },
250
  {
251
- "epoch": 1.5789473684210527,
252
- "grad_norm": 1.1015625,
253
- "learning_rate": 0.00019997972289848503,
254
- "loss": 1.1211,
255
  "step": 165
256
  },
257
  {
258
- "epoch": 1.6267942583732058,
259
- "grad_norm": 0.291015625,
260
- "learning_rate": 0.0001999509367752813,
261
- "loss": 1.109,
262
  "step": 170
263
  },
264
  {
265
- "epoch": 1.674641148325359,
266
- "grad_norm": 0.27734375,
267
- "learning_rate": 0.00019990963977153936,
268
- "loss": 1.1016,
269
  "step": 175
270
  },
271
  {
272
- "epoch": 1.722488038277512,
273
- "grad_norm": 0.37890625,
274
- "learning_rate": 0.00019985583705641418,
275
- "loss": 1.1047,
276
  "step": 180
277
  },
278
  {
279
- "epoch": 1.7703349282296652,
280
- "grad_norm": 0.310546875,
281
- "learning_rate": 0.00019978953536440336,
282
- "loss": 1.0877,
283
  "step": 185
284
  },
285
  {
286
- "epoch": 1.8181818181818183,
287
- "grad_norm": 0.267578125,
288
- "learning_rate": 0.0001997107429945041,
289
- "loss": 1.0849,
290
  "step": 190
291
  },
292
  {
293
- "epoch": 1.8660287081339713,
294
- "grad_norm": 0.306640625,
295
- "learning_rate": 0.00019961946980917456,
296
- "loss": 1.0945,
297
  "step": 195
298
  },
299
  {
300
- "epoch": 1.9138755980861244,
301
- "grad_norm": 0.376953125,
302
- "learning_rate": 0.0001995157272330992,
303
- "loss": 1.0796,
304
  "step": 200
305
  },
306
  {
307
- "epoch": 1.9617224880382775,
308
- "grad_norm": 0.412109375,
309
- "learning_rate": 0.00019939952825175888,
310
- "loss": 1.0792,
311
  "step": 205
312
  },
313
  {
314
- "epoch": 2.0,
315
- "eval_loss": 2.475158214569092,
316
- "eval_runtime": 0.5364,
317
- "eval_samples_per_second": 18.643,
318
- "eval_steps_per_second": 1.864,
319
- "step": 209
320
  },
321
  {
322
- "epoch": 2.0095693779904304,
323
- "grad_norm": 0.337890625,
324
- "learning_rate": 0.0001992708874098054,
325
- "loss": 1.0691,
 
326
  "step": 210
327
  },
328
  {
329
- "epoch": 2.0574162679425836,
330
- "grad_norm": 0.326171875,
331
- "learning_rate": 0.00019912982080924103,
332
- "loss": 1.0586,
333
  "step": 215
334
  },
335
  {
336
- "epoch": 2.1052631578947367,
337
- "grad_norm": 0.31640625,
338
- "learning_rate": 0.00019897634610740287,
339
- "loss": 1.0492,
340
  "step": 220
341
  },
342
  {
343
- "epoch": 2.15311004784689,
344
- "grad_norm": 0.33203125,
345
- "learning_rate": 0.0001988104825147528,
346
- "loss": 1.0467,
347
  "step": 225
348
  },
349
  {
350
- "epoch": 2.200956937799043,
351
- "grad_norm": 0.80078125,
352
- "learning_rate": 0.00019863225079247285,
353
- "loss": 1.0492,
354
  "step": 230
355
  },
356
  {
357
- "epoch": 2.248803827751196,
358
- "grad_norm": 0.37890625,
359
- "learning_rate": 0.00019844167324986657,
360
- "loss": 1.0444,
361
  "step": 235
362
  },
363
  {
364
- "epoch": 2.2966507177033493,
365
- "grad_norm": 0.396484375,
366
- "learning_rate": 0.00019823877374156647,
367
- "loss": 1.049,
368
  "step": 240
369
  },
370
  {
371
- "epoch": 2.3444976076555024,
372
- "grad_norm": 0.353515625,
373
- "learning_rate": 0.00019802357766454827,
374
- "loss": 1.047,
375
  "step": 245
376
  },
377
  {
378
- "epoch": 2.3923444976076556,
379
- "grad_norm": 0.57421875,
380
- "learning_rate": 0.00019779611195495177,
381
- "loss": 1.044,
382
  "step": 250
383
  },
384
  {
385
- "epoch": 2.4401913875598087,
386
- "grad_norm": 0.341796875,
387
- "learning_rate": 0.00019755640508470942,
388
- "loss": 1.0424,
389
  "step": 255
390
  },
391
  {
392
- "epoch": 2.488038277511962,
393
- "grad_norm": 0.3125,
394
- "learning_rate": 0.00019730448705798239,
395
- "loss": 1.0362,
396
  "step": 260
397
  },
398
  {
399
- "epoch": 2.535885167464115,
400
- "grad_norm": 0.423828125,
401
- "learning_rate": 0.00019704038940740505,
402
- "loss": 1.031,
403
  "step": 265
404
  },
405
  {
406
- "epoch": 2.583732057416268,
407
- "grad_norm": 0.57421875,
408
- "learning_rate": 0.00019676414519013781,
409
- "loss": 1.0416,
410
  "step": 270
411
  },
412
  {
413
- "epoch": 2.6315789473684212,
414
- "grad_norm": 1.0859375,
415
- "learning_rate": 0.0001964757889837296,
416
- "loss": 1.0326,
417
  "step": 275
418
  },
419
  {
420
- "epoch": 2.679425837320574,
421
- "grad_norm": 0.408203125,
422
- "learning_rate": 0.0001961753568817896,
423
- "loss": 1.0317,
424
  "step": 280
425
  },
426
  {
427
- "epoch": 2.7272727272727275,
428
- "grad_norm": 0.3828125,
429
- "learning_rate": 0.00019586288648946947,
430
- "loss": 1.0449,
431
  "step": 285
432
  },
433
  {
434
- "epoch": 2.77511961722488,
435
- "grad_norm": 0.396484375,
436
- "learning_rate": 0.0001955384169187563,
437
- "loss": 1.0297,
438
  "step": 290
439
  },
440
  {
441
- "epoch": 2.8229665071770333,
442
- "grad_norm": 0.55078125,
443
- "learning_rate": 0.00019520198878357703,
444
- "loss": 1.0319,
445
  "step": 295
446
  },
447
  {
448
- "epoch": 2.8708133971291865,
449
- "grad_norm": 0.5625,
450
- "learning_rate": 0.00019485364419471454,
451
- "loss": 1.031,
452
  "step": 300
453
  },
454
  {
455
- "epoch": 2.9186602870813396,
456
- "grad_norm": 0.3359375,
457
- "learning_rate": 0.00019449342675453707,
458
- "loss": 1.0256,
459
  "step": 305
460
  },
461
  {
462
- "epoch": 2.9665071770334928,
463
- "grad_norm": 0.3359375,
464
- "learning_rate": 0.00019412138155154002,
465
- "loss": 1.0312,
466
  "step": 310
467
  },
468
  {
469
- "epoch": 2.9952153110047846,
470
- "eval_loss": 2.463235855102539,
471
- "eval_runtime": 0.6563,
472
- "eval_samples_per_second": 15.237,
473
- "eval_steps_per_second": 1.524,
474
- "step": 313
475
  },
476
  {
477
- "epoch": 3.014354066985646,
478
- "grad_norm": 0.37890625,
479
- "learning_rate": 0.00019373755515470254,
480
- "loss": 1.0296,
 
481
  "step": 315
482
  },
483
  {
484
- "epoch": 3.062200956937799,
485
- "grad_norm": 0.330078125,
486
- "learning_rate": 0.0001933419956076584,
487
- "loss": 1.0058,
488
  "step": 320
489
  },
490
  {
491
- "epoch": 3.110047846889952,
492
- "grad_norm": 0.34375,
493
- "learning_rate": 0.00019293475242268223,
494
- "loss": 1.0065,
495
  "step": 325
496
  },
497
  {
498
- "epoch": 3.1578947368421053,
499
- "grad_norm": 0.40625,
500
- "learning_rate": 0.00019251587657449236,
501
- "loss": 1.0095,
502
  "step": 330
503
  },
504
  {
505
- "epoch": 3.2057416267942584,
506
- "grad_norm": 0.8203125,
507
- "learning_rate": 0.0001920854204938699,
508
- "loss": 1.0087,
509
  "step": 335
510
  },
511
  {
512
- "epoch": 3.2535885167464116,
513
- "grad_norm": 0.353515625,
514
- "learning_rate": 0.00019164343806109632,
515
- "loss": 1.0066,
516
  "step": 340
517
  },
518
  {
519
- "epoch": 3.3014354066985647,
520
- "grad_norm": 0.486328125,
521
- "learning_rate": 0.00019118998459920902,
522
- "loss": 1.002,
523
  "step": 345
524
  },
525
  {
526
- "epoch": 3.349282296650718,
527
- "grad_norm": 0.6484375,
528
- "learning_rate": 0.00019072511686707663,
529
- "loss": 1.0099,
530
  "step": 350
531
  },
532
  {
533
- "epoch": 3.397129186602871,
534
- "grad_norm": 0.3671875,
535
- "learning_rate": 0.00019024889305229456,
536
- "loss": 0.9971,
537
  "step": 355
538
  },
539
  {
540
- "epoch": 3.444976076555024,
541
- "grad_norm": 0.306640625,
542
- "learning_rate": 0.0001897613727639014,
543
- "loss": 0.9993,
544
  "step": 360
545
  },
546
  {
547
- "epoch": 3.492822966507177,
548
- "grad_norm": 0.318359375,
549
- "learning_rate": 0.00018926261702491797,
550
- "loss": 1.0025,
551
  "step": 365
552
  },
553
  {
554
- "epoch": 3.5406698564593304,
555
- "grad_norm": 0.46875,
556
- "learning_rate": 0.00018875268826470872,
557
- "loss": 0.9953,
558
  "step": 370
559
  },
560
  {
561
- "epoch": 3.588516746411483,
562
- "grad_norm": 0.470703125,
563
- "learning_rate": 0.0001882316503111678,
564
- "loss": 0.9988,
565
  "step": 375
566
  },
567
  {
568
- "epoch": 3.6363636363636362,
569
- "grad_norm": 0.71484375,
570
- "learning_rate": 0.00018769956838272936,
571
- "loss": 1.0103,
572
  "step": 380
573
  },
574
  {
575
- "epoch": 3.6842105263157894,
576
- "grad_norm": 0.6171875,
577
- "learning_rate": 0.00018715650908020427,
578
- "loss": 1.0031,
579
  "step": 385
580
  },
581
  {
582
- "epoch": 3.7320574162679425,
583
- "grad_norm": 0.330078125,
584
- "learning_rate": 0.00018660254037844388,
585
- "loss": 0.9989,
586
  "step": 390
587
  },
588
  {
589
- "epoch": 3.7799043062200957,
590
- "grad_norm": 0.396484375,
591
- "learning_rate": 0.00018603773161783124,
592
- "loss": 0.9975,
593
  "step": 395
594
  },
595
  {
596
- "epoch": 3.827751196172249,
597
- "grad_norm": 0.482421875,
598
- "learning_rate": 0.00018546215349560203,
599
- "loss": 0.9895,
600
  "step": 400
601
  },
602
  {
603
- "epoch": 3.875598086124402,
604
- "grad_norm": 0.56640625,
605
- "learning_rate": 0.00018487587805699526,
606
- "loss": 0.9941,
607
  "step": 405
608
  },
609
  {
610
- "epoch": 3.923444976076555,
611
- "grad_norm": 0.65625,
612
- "learning_rate": 0.00018427897868623534,
613
- "loss": 0.9996,
614
  "step": 410
615
  },
616
  {
617
- "epoch": 3.971291866028708,
618
- "grad_norm": 0.8046875,
619
- "learning_rate": 0.00018367153009734655,
620
- "loss": 0.9957,
621
  "step": 415
622
  },
623
  {
624
- "epoch": 4.0,
625
- "eval_loss": 2.470475912094116,
626
- "eval_runtime": 0.5362,
627
- "eval_samples_per_second": 18.649,
628
- "eval_steps_per_second": 1.865,
629
- "step": 418
630
  },
631
  {
632
- "epoch": 4.019138755980861,
633
- "grad_norm": 0.59375,
634
- "learning_rate": 0.00018305360832480117,
635
- "loss": 0.9875,
 
636
  "step": 420
637
  },
638
  {
639
- "epoch": 4.0669856459330145,
640
- "grad_norm": 0.70703125,
641
- "learning_rate": 0.00018242529071400214,
642
- "loss": 0.9719,
643
  "step": 425
644
  },
645
  {
646
- "epoch": 4.114832535885167,
647
- "grad_norm": 0.408203125,
648
- "learning_rate": 0.00018178665591160172,
649
- "loss": 0.9655,
650
  "step": 430
651
  },
652
  {
653
- "epoch": 4.162679425837321,
654
- "grad_norm": 0.427734375,
655
- "learning_rate": 0.00018113778385565733,
656
- "loss": 0.9659,
657
  "step": 435
658
  },
659
  {
660
- "epoch": 4.2105263157894735,
661
- "grad_norm": 0.625,
662
- "learning_rate": 0.00018047875576562557,
663
- "loss": 0.9758,
664
  "step": 440
665
  },
666
  {
667
- "epoch": 4.258373205741627,
668
- "grad_norm": 0.55859375,
669
- "learning_rate": 0.0001798096541321961,
670
- "loss": 0.983,
671
  "step": 445
672
  },
673
  {
674
- "epoch": 4.30622009569378,
675
- "grad_norm": 0.390625,
676
- "learning_rate": 0.0001791305627069662,
677
- "loss": 0.979,
678
  "step": 450
679
  },
680
  {
681
- "epoch": 4.354066985645933,
682
- "grad_norm": 0.35546875,
683
- "learning_rate": 0.00017844156649195759,
684
- "loss": 0.9764,
685
  "step": 455
686
  },
687
  {
688
- "epoch": 4.401913875598086,
689
- "grad_norm": 0.4453125,
690
- "learning_rate": 0.0001777427517289766,
691
- "loss": 0.9775,
692
  "step": 460
693
  },
694
  {
695
- "epoch": 4.44976076555024,
696
- "grad_norm": 0.80078125,
697
- "learning_rate": 0.00017703420588881946,
698
- "loss": 0.9746,
699
  "step": 465
700
  },
701
  {
702
- "epoch": 4.497607655502392,
703
- "grad_norm": 0.373046875,
704
- "learning_rate": 0.00017631601766032336,
705
- "loss": 0.972,
706
  "step": 470
707
  },
708
  {
709
- "epoch": 4.545454545454545,
710
- "grad_norm": 0.44140625,
711
- "learning_rate": 0.00017558827693926534,
712
- "loss": 0.9814,
713
  "step": 475
714
  },
715
  {
716
- "epoch": 4.5933014354066986,
717
- "grad_norm": 0.32421875,
718
- "learning_rate": 0.00017485107481711012,
719
- "loss": 0.9813,
720
  "step": 480
721
  },
722
  {
723
- "epoch": 4.641148325358852,
724
- "grad_norm": 0.408203125,
725
- "learning_rate": 0.00017410450356960795,
726
- "loss": 0.9811,
727
  "step": 485
728
  },
729
  {
730
- "epoch": 4.688995215311005,
731
- "grad_norm": 0.3515625,
732
- "learning_rate": 0.0001733486566452446,
733
- "loss": 0.9705,
734
- "step": 490
735
- },
736
- {
737
- "epoch": 4.7368421052631575,
738
- "grad_norm": 0.353515625,
739
- "learning_rate": 0.00017258362865354426,
740
- "loss": 0.985,
741
- "step": 495
742
- },
743
- {
744
- "epoch": 4.784688995215311,
745
- "grad_norm": 0.380859375,
746
- "learning_rate": 0.0001718095153532274,
747
- "loss": 0.9817,
748
- "step": 500
749
- },
750
- {
751
- "epoch": 4.832535885167464,
752
- "grad_norm": 0.474609375,
753
- "learning_rate": 0.00017102641364022457,
754
- "loss": 0.9686,
755
- "step": 505
756
- },
757
- {
758
- "epoch": 4.880382775119617,
759
- "grad_norm": 0.384765625,
760
- "learning_rate": 0.00017023442153554777,
761
- "loss": 0.9838,
762
- "step": 510
763
- },
764
- {
765
- "epoch": 4.92822966507177,
766
- "grad_norm": 0.53515625,
767
- "learning_rate": 0.00016943363817302135,
768
- "loss": 0.9774,
769
- "step": 515
770
- },
771
- {
772
- "epoch": 4.976076555023924,
773
- "grad_norm": 0.50390625,
774
- "learning_rate": 0.0001686241637868734,
775
- "loss": 0.9775,
776
- "step": 520
777
- },
778
- {
779
- "epoch": 4.9952153110047846,
780
- "eval_loss": 2.476422071456909,
781
- "eval_runtime": 0.6478,
782
- "eval_samples_per_second": 15.436,
783
- "eval_steps_per_second": 1.544,
784
- "step": 522
785
- },
786
- {
787
- "epoch": 5.023923444976076,
788
- "grad_norm": 0.625,
789
- "learning_rate": 0.0001678060996991891,
790
- "loss": 0.9684,
791
- "step": 525
792
- },
793
- {
794
- "epoch": 5.07177033492823,
795
- "grad_norm": 0.33203125,
796
- "learning_rate": 0.00016697954830722868,
797
- "loss": 0.9614,
798
- "step": 530
799
- },
800
- {
801
- "epoch": 5.119617224880383,
802
- "grad_norm": 0.376953125,
803
- "learning_rate": 0.00016614461307061,
804
- "loss": 0.963,
805
- "step": 535
806
- },
807
- {
808
- "epoch": 5.167464114832536,
809
- "grad_norm": 0.439453125,
810
- "learning_rate": 0.0001653013984983585,
811
- "loss": 0.9558,
812
- "step": 540
813
- },
814
- {
815
- "epoch": 5.215311004784689,
816
- "grad_norm": 0.375,
817
- "learning_rate": 0.00016445001013582608,
818
- "loss": 0.9533,
819
- "step": 545
820
- },
821
- {
822
- "epoch": 5.2631578947368425,
823
- "grad_norm": 0.40234375,
824
- "learning_rate": 0.0001635905545514795,
825
- "loss": 0.9664,
826
- "step": 550
827
- },
828
- {
829
- "epoch": 5.311004784688995,
830
- "grad_norm": 0.380859375,
831
- "learning_rate": 0.00016272313932356162,
832
- "loss": 0.9552,
833
- "step": 555
834
- },
835
- {
836
- "epoch": 5.358851674641148,
837
- "grad_norm": 0.416015625,
838
- "learning_rate": 0.0001618478730266255,
839
- "loss": 0.9505,
840
- "step": 560
841
- },
842
- {
843
- "epoch": 5.4066985645933014,
844
- "grad_norm": 0.39453125,
845
- "learning_rate": 0.00016096486521794434,
846
- "loss": 0.964,
847
- "step": 565
848
- },
849
- {
850
- "epoch": 5.454545454545454,
851
- "grad_norm": 0.416015625,
852
- "learning_rate": 0.0001600742264237979,
853
- "loss": 0.9558,
854
- "step": 570
855
- },
856
- {
857
- "epoch": 5.502392344497608,
858
- "grad_norm": 0.5625,
859
- "learning_rate": 0.0001591760681256382,
860
- "loss": 0.9573,
861
- "step": 575
862
- },
863
- {
864
- "epoch": 5.55023923444976,
865
- "grad_norm": 0.3515625,
866
- "learning_rate": 0.00015827050274613513,
867
- "loss": 0.9587,
868
- "step": 580
869
- },
870
- {
871
- "epoch": 5.598086124401914,
872
- "grad_norm": 0.421875,
873
- "learning_rate": 0.0001573576436351046,
874
- "loss": 0.9638,
875
- "step": 585
876
- },
877
- {
878
- "epoch": 5.645933014354067,
879
- "grad_norm": 0.4140625,
880
- "learning_rate": 0.0001564376050553205,
881
- "loss": 0.9513,
882
- "step": 590
883
- },
884
- {
885
- "epoch": 5.69377990430622,
886
- "grad_norm": 0.62109375,
887
- "learning_rate": 0.0001555105021682123,
888
- "loss": 0.9627,
889
- "step": 595
890
- },
891
- {
892
- "epoch": 5.741626794258373,
893
- "grad_norm": 0.62890625,
894
- "learning_rate": 0.00015457645101945046,
895
- "loss": 0.9546,
896
- "step": 600
897
- },
898
- {
899
- "epoch": 5.7894736842105265,
900
- "grad_norm": 0.361328125,
901
- "learning_rate": 0.00015363556852442085,
902
- "loss": 0.9481,
903
- "step": 605
904
- },
905
- {
906
- "epoch": 5.837320574162679,
907
- "grad_norm": 0.462890625,
908
- "learning_rate": 0.00015268797245359035,
909
- "loss": 0.9629,
910
- "step": 610
911
- },
912
- {
913
- "epoch": 5.885167464114833,
914
- "grad_norm": 0.33984375,
915
- "learning_rate": 0.00015173378141776568,
916
- "loss": 0.9519,
917
- "step": 615
918
- },
919
- {
920
- "epoch": 5.9330143540669855,
921
- "grad_norm": 0.423828125,
922
- "learning_rate": 0.0001507731148532468,
923
- "loss": 0.9584,
924
- "step": 620
925
- },
926
- {
927
- "epoch": 5.980861244019139,
928
- "grad_norm": 0.3828125,
929
- "learning_rate": 0.00014980609300687683,
930
- "loss": 0.9584,
931
- "step": 625
932
- },
933
- {
934
- "epoch": 6.0,
935
- "eval_loss": 2.487025737762451,
936
- "eval_runtime": 0.5347,
937
- "eval_samples_per_second": 18.7,
938
- "eval_steps_per_second": 1.87,
939
- "step": 627
940
- },
941
- {
942
- "epoch": 6.028708133971292,
943
- "grad_norm": 0.361328125,
944
- "learning_rate": 0.00014883283692099112,
945
- "loss": 0.9435,
946
- "step": 630
947
- },
948
- {
949
- "epoch": 6.076555023923445,
950
- "grad_norm": 0.388671875,
951
- "learning_rate": 0.000147853468418266,
952
- "loss": 0.9462,
953
- "step": 635
954
- },
955
- {
956
- "epoch": 6.124401913875598,
957
- "grad_norm": 0.443359375,
958
- "learning_rate": 0.00014686811008647038,
959
- "loss": 0.9372,
960
- "step": 640
961
- },
962
- {
963
- "epoch": 6.172248803827751,
964
- "grad_norm": 0.37109375,
965
- "learning_rate": 0.00014587688526312143,
966
- "loss": 0.9405,
967
- "step": 645
968
- },
969
- {
970
- "epoch": 6.220095693779904,
971
- "grad_norm": 0.474609375,
972
- "learning_rate": 0.00014487991802004623,
973
- "loss": 0.942,
974
- "step": 650
975
- },
976
- {
977
- "epoch": 6.267942583732057,
978
- "grad_norm": 0.3984375,
979
- "learning_rate": 0.00014387733314785193,
980
- "loss": 0.9495,
981
- "step": 655
982
- },
983
- {
984
- "epoch": 6.315789473684211,
985
- "grad_norm": 0.38671875,
986
- "learning_rate": 0.00014286925614030542,
987
- "loss": 0.9415,
988
- "step": 660
989
- },
990
- {
991
- "epoch": 6.363636363636363,
992
- "grad_norm": 0.390625,
993
- "learning_rate": 0.00014185581317862546,
994
- "loss": 0.94,
995
- "step": 665
996
- },
997
- {
998
- "epoch": 6.411483253588517,
999
- "grad_norm": 0.48046875,
1000
- "learning_rate": 0.00014083713111568842,
1001
- "loss": 0.9344,
1002
- "step": 670
1003
- },
1004
- {
1005
- "epoch": 6.45933014354067,
1006
- "grad_norm": 0.439453125,
1007
- "learning_rate": 0.0001398133374601501,
1008
- "loss": 0.9438,
1009
- "step": 675
1010
- },
1011
- {
1012
- "epoch": 6.507177033492823,
1013
- "grad_norm": 0.333984375,
1014
- "learning_rate": 0.0001387845603604855,
1015
- "loss": 0.941,
1016
- "step": 680
1017
- },
1018
- {
1019
- "epoch": 6.555023923444976,
1020
- "grad_norm": 0.5078125,
1021
- "learning_rate": 0.00013775092858894837,
1022
- "loss": 0.9433,
1023
- "step": 685
1024
- },
1025
- {
1026
- "epoch": 6.6028708133971294,
1027
- "grad_norm": 0.3515625,
1028
- "learning_rate": 0.00013671257152545277,
1029
- "loss": 0.9433,
1030
- "step": 690
1031
- },
1032
- {
1033
- "epoch": 6.650717703349282,
1034
- "grad_norm": 0.451171875,
1035
- "learning_rate": 0.00013566961914137867,
1036
- "loss": 0.9435,
1037
- "step": 695
1038
- },
1039
- {
1040
- "epoch": 6.698564593301436,
1041
- "grad_norm": 0.41015625,
1042
- "learning_rate": 0.00013462220198330328,
1043
- "loss": 0.9431,
1044
- "step": 700
1045
- },
1046
- {
1047
- "epoch": 6.746411483253588,
1048
- "grad_norm": 0.44140625,
1049
- "learning_rate": 0.0001335704511566605,
1050
- "loss": 0.9409,
1051
- "step": 705
1052
- },
1053
- {
1054
- "epoch": 6.794258373205742,
1055
- "grad_norm": 0.546875,
1056
- "learning_rate": 0.0001325144983093305,
1057
- "loss": 0.9437,
1058
- "step": 710
1059
- },
1060
- {
1061
- "epoch": 6.842105263157895,
1062
- "grad_norm": 0.412109375,
1063
- "learning_rate": 0.00013145447561516138,
1064
- "loss": 0.9491,
1065
- "step": 715
1066
- },
1067
- {
1068
- "epoch": 6.889952153110048,
1069
- "grad_norm": 0.38671875,
1070
- "learning_rate": 0.0001303905157574247,
1071
- "loss": 0.9445,
1072
- "step": 720
1073
- },
1074
- {
1075
- "epoch": 6.937799043062201,
1076
- "grad_norm": 0.380859375,
1077
- "learning_rate": 0.00012932275191220776,
1078
- "loss": 0.9315,
1079
- "step": 725
1080
- },
1081
- {
1082
- "epoch": 6.985645933014354,
1083
- "grad_norm": 0.36328125,
1084
- "learning_rate": 0.0001282513177317437,
1085
- "loss": 0.9368,
1086
- "step": 730
1087
- },
1088
- {
1089
- "epoch": 6.9952153110047846,
1090
- "eval_loss": 2.497544765472412,
1091
- "eval_runtime": 0.6485,
1092
- "eval_samples_per_second": 15.419,
1093
- "eval_steps_per_second": 1.542,
1094
- "step": 731
1095
- },
1096
- {
1097
- "epoch": 7.033492822966507,
1098
- "grad_norm": 0.69921875,
1099
- "learning_rate": 0.00012717634732768243,
1100
- "loss": 0.9238,
1101
- "step": 735
1102
- },
1103
- {
1104
- "epoch": 7.08133971291866,
1105
- "grad_norm": 0.39453125,
1106
- "learning_rate": 0.00012609797525430373,
1107
- "loss": 0.9235,
1108
- "step": 740
1109
- },
1110
- {
1111
- "epoch": 7.1291866028708135,
1112
- "grad_norm": 0.3984375,
1113
- "learning_rate": 0.00012501633649167495,
1114
- "loss": 0.9148,
1115
- "step": 745
1116
- },
1117
- {
1118
- "epoch": 7.177033492822966,
1119
- "grad_norm": 0.455078125,
1120
- "learning_rate": 0.0001239315664287558,
1121
- "loss": 0.9182,
1122
- "step": 750
1123
- },
1124
- {
1125
- "epoch": 7.22488038277512,
1126
- "grad_norm": 0.435546875,
1127
- "learning_rate": 0.00012284380084645139,
1128
- "loss": 0.9237,
1129
- "step": 755
1130
- },
1131
- {
1132
- "epoch": 7.2727272727272725,
1133
- "grad_norm": 0.443359375,
1134
- "learning_rate": 0.00012175317590061674,
1135
- "loss": 0.93,
1136
- "step": 760
1137
- },
1138
- {
1139
- "epoch": 7.320574162679426,
1140
- "grad_norm": 0.75,
1141
- "learning_rate": 0.00012065982810501404,
1142
- "loss": 0.9268,
1143
- "step": 765
1144
- },
1145
- {
1146
- "epoch": 7.368421052631579,
1147
- "grad_norm": 0.48828125,
1148
- "learning_rate": 0.00011956389431422507,
1149
- "loss": 0.9214,
1150
- "step": 770
1151
- },
1152
- {
1153
- "epoch": 7.416267942583732,
1154
- "grad_norm": 0.6015625,
1155
- "learning_rate": 0.00011846551170652127,
1156
- "loss": 0.931,
1157
- "step": 775
1158
- },
1159
- {
1160
- "epoch": 7.464114832535885,
1161
- "grad_norm": 0.453125,
1162
- "learning_rate": 0.00011736481776669306,
1163
- "loss": 0.9316,
1164
- "step": 780
1165
- },
1166
- {
1167
- "epoch": 7.511961722488039,
1168
- "grad_norm": 0.369140625,
1169
- "learning_rate": 0.0001162619502688407,
1170
- "loss": 0.9272,
1171
- "step": 785
1172
- },
1173
- {
1174
- "epoch": 7.559808612440191,
1175
- "grad_norm": 0.435546875,
1176
- "learning_rate": 0.00011515704725912926,
1177
- "loss": 0.9219,
1178
- "step": 790
1179
- },
1180
- {
1181
- "epoch": 7.607655502392344,
1182
- "grad_norm": 0.42578125,
1183
- "learning_rate": 0.00011405024703850929,
1184
- "loss": 0.9363,
1185
- "step": 795
1186
- },
1187
- {
1188
- "epoch": 7.655502392344498,
1189
- "grad_norm": 0.5390625,
1190
- "learning_rate": 0.00011294168814540553,
1191
- "loss": 0.9388,
1192
- "step": 800
1193
- },
1194
- {
1195
- "epoch": 7.703349282296651,
1196
- "grad_norm": 0.431640625,
1197
- "learning_rate": 0.00011183150933837632,
1198
- "loss": 0.9284,
1199
- "step": 805
1200
- },
1201
- {
1202
- "epoch": 7.751196172248804,
1203
- "grad_norm": 0.353515625,
1204
- "learning_rate": 0.00011071984957874479,
1205
- "loss": 0.9222,
1206
- "step": 810
1207
- },
1208
- {
1209
- "epoch": 7.7990430622009566,
1210
- "grad_norm": 0.42578125,
1211
- "learning_rate": 0.00010960684801320536,
1212
- "loss": 0.9335,
1213
- "step": 815
1214
- },
1215
- {
1216
- "epoch": 7.84688995215311,
1217
- "grad_norm": 0.6171875,
1218
- "learning_rate": 0.00010849264395640649,
1219
- "loss": 0.9382,
1220
- "step": 820
1221
- },
1222
- {
1223
- "epoch": 7.894736842105263,
1224
- "grad_norm": 0.66015625,
1225
- "learning_rate": 0.00010737737687351284,
1226
- "loss": 0.9414,
1227
- "step": 825
1228
- },
1229
- {
1230
- "epoch": 7.942583732057416,
1231
- "grad_norm": 0.5234375,
1232
- "learning_rate": 0.0001062611863627482,
1233
- "loss": 0.9374,
1234
- "step": 830
1235
- },
1236
- {
1237
- "epoch": 7.990430622009569,
1238
- "grad_norm": 0.33984375,
1239
- "learning_rate": 0.00010514421213792205,
1240
- "loss": 0.93,
1241
- "step": 835
1242
- },
1243
- {
1244
- "epoch": 8.0,
1245
- "eval_loss": 2.506725311279297,
1246
- "eval_runtime": 0.5363,
1247
- "eval_samples_per_second": 18.647,
1248
- "eval_steps_per_second": 1.865,
1249
- "step": 836
1250
- },
1251
- {
1252
- "epoch": 8.038277511961722,
1253
- "grad_norm": 0.390625,
1254
- "learning_rate": 0.00010402659401094152,
1255
- "loss": 0.9129,
1256
- "step": 840
1257
- },
1258
- {
1259
- "epoch": 8.086124401913876,
1260
- "grad_norm": 0.35546875,
1261
- "learning_rate": 0.00010290847187431113,
1262
- "loss": 0.9107,
1263
- "step": 845
1264
- },
1265
- {
1266
- "epoch": 8.133971291866029,
1267
- "grad_norm": 0.36328125,
1268
- "learning_rate": 0.00010178998568362243,
1269
- "loss": 0.9226,
1270
- "step": 850
1271
- },
1272
- {
1273
- "epoch": 8.181818181818182,
1274
- "grad_norm": 0.345703125,
1275
- "learning_rate": 0.00010067127544003563,
1276
- "loss": 0.9184,
1277
- "step": 855
1278
- },
1279
- {
1280
- "epoch": 8.229665071770334,
1281
- "grad_norm": 0.35546875,
1282
- "learning_rate": 9.955248117275566e-05,
1283
- "loss": 0.915,
1284
- "step": 860
1285
- },
1286
- {
1287
- "epoch": 8.277511961722489,
1288
- "grad_norm": 0.396484375,
1289
- "learning_rate": 9.843374292150488e-05,
1290
- "loss": 0.9234,
1291
- "step": 865
1292
- },
1293
- {
1294
- "epoch": 8.325358851674642,
1295
- "grad_norm": 0.3671875,
1296
- "learning_rate": 9.73152007189939e-05,
1297
- "loss": 0.9169,
1298
- "step": 870
1299
- },
1300
- {
1301
- "epoch": 8.373205741626794,
1302
- "grad_norm": 0.373046875,
1303
- "learning_rate": 9.619699457339405e-05,
1304
- "loss": 0.9131,
1305
- "step": 875
1306
- },
1307
- {
1308
- "epoch": 8.421052631578947,
1309
- "grad_norm": 0.3671875,
1310
- "learning_rate": 9.507926445081219e-05,
1311
- "loss": 0.9189,
1312
- "step": 880
1313
- },
1314
- {
1315
- "epoch": 8.4688995215311,
1316
- "grad_norm": 0.359375,
1317
- "learning_rate": 9.396215025777139e-05,
1318
- "loss": 0.9125,
1319
- "step": 885
1320
- },
1321
- {
1322
- "epoch": 8.516746411483254,
1323
- "grad_norm": 0.341796875,
1324
- "learning_rate": 9.284579182369867e-05,
1325
- "loss": 0.9089,
1326
- "step": 890
1327
- },
1328
- {
1329
- "epoch": 8.564593301435407,
1330
- "grad_norm": 0.4453125,
1331
- "learning_rate": 9.173032888342244e-05,
1332
- "loss": 0.9221,
1333
- "step": 895
1334
- },
1335
- {
1336
- "epoch": 8.61244019138756,
1337
- "grad_norm": 0.5546875,
1338
- "learning_rate": 9.061590105968208e-05,
1339
- "loss": 0.9155,
1340
- "step": 900
1341
- },
1342
- {
1343
- "epoch": 8.660287081339712,
1344
- "grad_norm": 0.55859375,
1345
- "learning_rate": 8.950264784565112e-05,
1346
- "loss": 0.9286,
1347
- "step": 905
1348
- },
1349
- {
1350
- "epoch": 8.708133971291867,
1351
- "grad_norm": 0.36328125,
1352
- "learning_rate": 8.839070858747697e-05,
1353
- "loss": 0.9199,
1354
- "step": 910
1355
- },
1356
- {
1357
- "epoch": 8.75598086124402,
1358
- "grad_norm": 0.365234375,
1359
- "learning_rate": 8.728022246683894e-05,
1360
- "loss": 0.908,
1361
- "step": 915
1362
- },
1363
- {
1364
- "epoch": 8.803827751196172,
1365
- "grad_norm": 0.345703125,
1366
- "learning_rate": 8.617132848352671e-05,
1367
- "loss": 0.9169,
1368
- "step": 920
1369
- },
1370
- {
1371
- "epoch": 8.851674641148325,
1372
- "grad_norm": 0.390625,
1373
- "learning_rate": 8.506416543804182e-05,
1374
- "loss": 0.9238,
1375
- "step": 925
1376
- },
1377
- {
1378
- "epoch": 8.89952153110048,
1379
- "grad_norm": 0.5703125,
1380
- "learning_rate": 8.395887191422397e-05,
1381
- "loss": 0.9209,
1382
- "step": 930
1383
- },
1384
- {
1385
- "epoch": 8.947368421052632,
1386
- "grad_norm": 0.46875,
1387
- "learning_rate": 8.285558626190447e-05,
1388
- "loss": 0.9189,
1389
- "step": 935
1390
- },
1391
- {
1392
- "epoch": 8.995215311004785,
1393
- "grad_norm": 0.337890625,
1394
- "learning_rate": 8.175444657958876e-05,
1395
- "loss": 0.9195,
1396
- "step": 940
1397
- },
1398
- {
1399
- "epoch": 8.995215311004785,
1400
- "eval_loss": 2.5168216228485107,
1401
- "eval_runtime": 0.6511,
1402
- "eval_samples_per_second": 15.36,
1403
- "eval_steps_per_second": 1.536,
1404
- "step": 940
1405
- },
1406
- {
1407
- "epoch": 9.043062200956937,
1408
- "grad_norm": 0.40625,
1409
- "learning_rate": 8.065559069717088e-05,
1410
- "loss": 0.9021,
1411
- "step": 945
1412
- },
1413
- {
1414
- "epoch": 9.090909090909092,
1415
- "grad_norm": 0.41015625,
1416
- "learning_rate": 7.955915615868111e-05,
1417
- "loss": 0.9103,
1418
- "step": 950
1419
- },
1420
- {
1421
- "epoch": 9.138755980861244,
1422
- "grad_norm": 0.392578125,
1423
- "learning_rate": 7.846528020506957e-05,
1424
- "loss": 0.8983,
1425
- "step": 955
1426
- },
1427
- {
1428
- "epoch": 9.186602870813397,
1429
- "grad_norm": 0.36328125,
1430
- "learning_rate": 7.73740997570278e-05,
1431
- "loss": 0.9037,
1432
- "step": 960
1433
- },
1434
- {
1435
- "epoch": 9.23444976076555,
1436
- "grad_norm": 0.353515625,
1437
- "learning_rate": 7.628575139785024e-05,
1438
- "loss": 0.9036,
1439
- "step": 965
1440
- },
1441
- {
1442
- "epoch": 9.282296650717702,
1443
- "grad_norm": 0.388671875,
1444
- "learning_rate": 7.520037135633816e-05,
1445
- "loss": 0.9051,
1446
- "step": 970
1447
- },
1448
- {
1449
- "epoch": 9.330143540669857,
1450
- "grad_norm": 0.439453125,
1451
- "learning_rate": 7.411809548974792e-05,
1452
- "loss": 0.9186,
1453
- "step": 975
1454
- },
1455
- {
1456
- "epoch": 9.37799043062201,
1457
- "grad_norm": 0.365234375,
1458
  "learning_rate": 7.303905926678564e-05,
1459
- "loss": 0.9095,
1460
- "step": 980
1461
- },
1462
- {
1463
- "epoch": 9.425837320574162,
1464
- "grad_norm": 0.375,
1465
- "learning_rate": 7.196339775065042e-05,
1466
- "loss": 0.9069,
1467
- "step": 985
1468
  },
1469
  {
1470
- "epoch": 9.473684210526315,
1471
- "grad_norm": 0.376953125,
1472
  "learning_rate": 7.089124558212871e-05,
1473
- "loss": 0.908,
1474
- "step": 990
1475
- },
1476
- {
1477
- "epoch": 9.52153110047847,
1478
- "grad_norm": 0.38671875,
1479
- "learning_rate": 6.982273696274106e-05,
1480
- "loss": 0.9121,
1481
- "step": 995
1482
  },
1483
  {
1484
- "epoch": 9.569377990430622,
1485
- "grad_norm": 0.41796875,
1486
  "learning_rate": 6.875800563794425e-05,
1487
- "loss": 0.9097,
1488
- "step": 1000
1489
- },
1490
- {
1491
- "epoch": 9.617224880382775,
1492
- "grad_norm": 0.361328125,
1493
- "learning_rate": 6.769718488039023e-05,
1494
- "loss": 0.9101,
1495
- "step": 1005
1496
  },
1497
  {
1498
- "epoch": 9.665071770334928,
1499
- "grad_norm": 0.3515625,
1500
  "learning_rate": 6.664040747324437e-05,
1501
- "loss": 0.9117,
1502
- "step": 1010
1503
- },
1504
- {
1505
- "epoch": 9.712918660287082,
1506
- "grad_norm": 0.375,
1507
- "learning_rate": 6.558780569356507e-05,
1508
- "loss": 0.9106,
1509
- "step": 1015
1510
  },
1511
  {
1512
- "epoch": 9.760765550239235,
1513
- "grad_norm": 0.376953125,
1514
  "learning_rate": 6.453951129574644e-05,
1515
- "loss": 0.9037,
1516
- "step": 1020
1517
- },
1518
- {
1519
- "epoch": 9.808612440191387,
1520
- "grad_norm": 0.40234375,
1521
- "learning_rate": 6.349565549502676e-05,
1522
- "loss": 0.9033,
1523
- "step": 1025
1524
  },
1525
  {
1526
- "epoch": 9.85645933014354,
1527
- "grad_norm": 0.3515625,
1528
  "learning_rate": 6.245636895106402e-05,
1529
- "loss": 0.9156,
1530
- "step": 1030
1531
- },
1532
- {
1533
- "epoch": 9.904306220095695,
1534
- "grad_norm": 0.357421875,
1535
- "learning_rate": 6.142178175158149e-05,
1536
- "loss": 0.9082,
1537
- "step": 1035
1538
  },
1539
  {
1540
- "epoch": 9.952153110047847,
1541
- "grad_norm": 0.451171875,
1542
  "learning_rate": 6.039202339608432e-05,
1543
- "loss": 0.9147,
1544
- "step": 1040
1545
- },
1546
- {
1547
- "epoch": 10.0,
1548
- "grad_norm": 0.50390625,
1549
- "learning_rate": 5.9367222779650334e-05,
1550
- "loss": 0.912,
1551
- "step": 1045
1552
  },
1553
  {
1554
  "epoch": 10.0,
1555
- "eval_loss": 2.5271153450012207,
1556
- "eval_runtime": 0.5317,
1557
- "eval_samples_per_second": 18.809,
1558
- "eval_steps_per_second": 1.881,
1559
- "step": 1045
1560
- },
1561
- {
1562
- "epoch": 10.047846889952153,
1563
- "grad_norm": 0.439453125,
1564
  "learning_rate": 5.834750817679606e-05,
1565
- "loss": 0.9084,
1566
- "step": 1050
1567
  },
1568
  {
1569
- "epoch": 10.095693779904305,
1570
- "grad_norm": 0.3515625,
1571
- "learning_rate": 5.733300722542045e-05,
1572
- "loss": 0.897,
1573
- "step": 1055
 
1574
  },
1575
  {
1576
- "epoch": 10.14354066985646,
1577
- "grad_norm": 0.376953125,
1578
  "learning_rate": 5.6323846910828735e-05,
1579
- "loss": 0.8998,
1580
- "step": 1060
1581
- },
1582
- {
1583
- "epoch": 10.191387559808613,
1584
- "grad_norm": 0.349609375,
1585
- "learning_rate": 5.5320153549837415e-05,
1586
- "loss": 0.9026,
1587
- "step": 1065
1588
  },
1589
  {
1590
- "epoch": 10.239234449760765,
1591
- "grad_norm": 0.3515625,
1592
  "learning_rate": 5.432205277496327e-05,
1593
- "loss": 0.8996,
1594
- "step": 1070
1595
- },
1596
- {
1597
- "epoch": 10.287081339712918,
1598
- "grad_norm": 0.359375,
1599
- "learning_rate": 5.33296695186977e-05,
1600
- "loss": 0.9,
1601
- "step": 1075
1602
  },
1603
  {
1604
- "epoch": 10.334928229665072,
1605
- "grad_norm": 0.41015625,
1606
  "learning_rate": 5.234312799786921e-05,
1607
- "loss": 0.8988,
1608
- "step": 1080
1609
- },
1610
- {
1611
- "epoch": 10.382775119617225,
1612
- "grad_norm": 0.41015625,
1613
- "learning_rate": 5.1362551698094964e-05,
1614
- "loss": 0.8969,
1615
- "step": 1085
1616
  },
1617
  {
1618
- "epoch": 10.430622009569378,
1619
- "grad_norm": 0.35546875,
1620
  "learning_rate": 5.0388063358324134e-05,
1621
- "loss": 0.8954,
1622
- "step": 1090
1623
- },
1624
- {
1625
- "epoch": 10.47846889952153,
1626
- "grad_norm": 0.3515625,
1627
- "learning_rate": 4.9419784955474524e-05,
1628
- "loss": 0.8991,
1629
- "step": 1095
1630
  },
1631
  {
1632
- "epoch": 10.526315789473685,
1633
- "grad_norm": 0.359375,
1634
  "learning_rate": 4.845783768916482e-05,
1635
- "loss": 0.9074,
1636
- "step": 1100
1637
- },
1638
- {
1639
- "epoch": 10.574162679425838,
1640
- "grad_norm": 0.353515625,
1641
- "learning_rate": 4.7502341966544e-05,
1642
- "loss": 0.9012,
1643
- "step": 1105
1644
  },
1645
  {
1646
- "epoch": 10.62200956937799,
1647
- "grad_norm": 0.34765625,
1648
  "learning_rate": 4.6553417387219886e-05,
1649
- "loss": 0.8993,
1650
- "step": 1110
1651
- },
1652
- {
1653
- "epoch": 10.669856459330143,
1654
- "grad_norm": 0.333984375,
1655
- "learning_rate": 4.5611182728288895e-05,
1656
- "loss": 0.9073,
1657
- "step": 1115
1658
  },
1659
  {
1660
- "epoch": 10.717703349282296,
1661
- "grad_norm": 0.341796875,
1662
  "learning_rate": 4.467575592946864e-05,
1663
- "loss": 0.9073,
1664
- "step": 1120
1665
- },
1666
- {
1667
- "epoch": 10.76555023923445,
1668
- "grad_norm": 0.349609375,
1669
- "learning_rate": 4.374725407833532e-05,
1670
- "loss": 0.902,
1671
- "step": 1125
1672
  },
1673
  {
1674
- "epoch": 10.813397129186603,
1675
- "grad_norm": 0.380859375,
1676
  "learning_rate": 4.282579339566802e-05,
1677
- "loss": 0.8993,
1678
- "step": 1130
1679
- },
1680
- {
1681
- "epoch": 10.861244019138756,
1682
- "grad_norm": 0.37109375,
1683
- "learning_rate": 4.1911489220901236e-05,
1684
- "loss": 0.9002,
1685
- "step": 1135
1686
  },
1687
  {
1688
- "epoch": 10.909090909090908,
1689
- "grad_norm": 0.3515625,
1690
  "learning_rate": 4.100445599768774e-05,
1691
- "loss": 0.9115,
1692
- "step": 1140
1693
- },
1694
- {
1695
- "epoch": 10.956937799043063,
1696
- "grad_norm": 0.34375,
1697
- "learning_rate": 4.0104807259573716e-05,
1698
- "loss": 0.9003,
1699
- "step": 1145
1700
- },
1701
- {
1702
- "epoch": 10.995215311004785,
1703
- "eval_loss": 2.535585403442383,
1704
- "eval_runtime": 0.6791,
1705
- "eval_samples_per_second": 14.726,
1706
- "eval_steps_per_second": 1.473,
1707
- "step": 1149
1708
  },
1709
  {
1710
- "epoch": 11.004784688995215,
1711
- "grad_norm": 0.345703125,
1712
  "learning_rate": 3.9212655615787804e-05,
1713
- "loss": 0.9032,
1714
- "step": 1150
1715
  },
1716
  {
1717
- "epoch": 11.052631578947368,
1718
- "grad_norm": 0.34765625,
1719
- "learning_rate": 3.832811273714569e-05,
1720
- "loss": 0.902,
1721
- "step": 1155
 
1722
  },
1723
  {
1724
- "epoch": 11.10047846889952,
1725
- "grad_norm": 0.36328125,
1726
  "learning_rate": 3.745128934207225e-05,
1727
- "loss": 0.8894,
1728
- "step": 1160
1729
- },
1730
- {
1731
- "epoch": 11.148325358851675,
1732
- "grad_norm": 0.388671875,
1733
- "learning_rate": 3.6582295182742964e-05,
1734
- "loss": 0.8935,
1735
- "step": 1165
1736
  },
1737
  {
1738
- "epoch": 11.196172248803828,
1739
- "grad_norm": 0.365234375,
1740
  "learning_rate": 3.5721239031346066e-05,
1741
- "loss": 0.8997,
1742
- "step": 1170
1743
- },
1744
- {
1745
- "epoch": 11.24401913875598,
1746
- "grad_norm": 0.3359375,
1747
- "learning_rate": 3.4868228666467704e-05,
1748
- "loss": 0.8972,
1749
- "step": 1175
1750
  },
1751
  {
1752
- "epoch": 11.291866028708133,
1753
- "grad_norm": 0.349609375,
1754
  "learning_rate": 3.402337085960119e-05,
1755
- "loss": 0.9028,
1756
- "step": 1180
1757
- },
1758
- {
1759
- "epoch": 11.339712918660288,
1760
- "grad_norm": 0.341796875,
1761
- "learning_rate": 3.318677136178228e-05,
1762
- "loss": 0.8943,
1763
- "step": 1185
1764
  },
1765
  {
1766
- "epoch": 11.38755980861244,
1767
- "grad_norm": 0.345703125,
1768
  "learning_rate": 3.235853489035241e-05,
1769
- "loss": 0.8851,
1770
- "step": 1190
1771
- },
1772
- {
1773
- "epoch": 11.435406698564593,
1774
- "grad_norm": 0.384765625,
1775
- "learning_rate": 3.153876511585122e-05,
1776
- "loss": 0.8917,
1777
- "step": 1195
1778
  },
1779
  {
1780
- "epoch": 11.483253588516746,
1781
- "grad_norm": 0.392578125,
1782
  "learning_rate": 3.072756464904006e-05,
1783
- "loss": 0.8974,
1784
- "step": 1200
1785
- },
1786
- {
1787
- "epoch": 11.5311004784689,
1788
- "grad_norm": 0.337890625,
1789
- "learning_rate": 2.9925035028058134e-05,
1790
- "loss": 0.8951,
1791
- "step": 1205
1792
  },
1793
  {
1794
- "epoch": 11.578947368421053,
1795
- "grad_norm": 0.330078125,
1796
  "learning_rate": 2.9131276705713006e-05,
1797
- "loss": 0.893,
1798
- "step": 1210
1799
- },
1800
- {
1801
- "epoch": 11.626794258373206,
1802
- "grad_norm": 0.345703125,
1803
- "learning_rate": 2.8346389036906828e-05,
1804
- "loss": 0.899,
1805
- "step": 1215
1806
  },
1807
  {
1808
- "epoch": 11.674641148325358,
1809
- "grad_norm": 0.337890625,
1810
  "learning_rate": 2.7570470266200176e-05,
1811
- "loss": 0.9037,
1812
- "step": 1220
1813
- },
1814
- {
1815
- "epoch": 11.722488038277511,
1816
- "grad_norm": 0.33984375,
1817
- "learning_rate": 2.68036175155147e-05,
1818
- "loss": 0.8917,
1819
- "step": 1225
1820
  },
1821
  {
1822
- "epoch": 11.770334928229666,
1823
- "grad_norm": 0.37890625,
1824
  "learning_rate": 2.6045926771976303e-05,
1825
- "loss": 0.9061,
1826
- "step": 1230
1827
- },
1828
- {
1829
- "epoch": 11.818181818181818,
1830
- "grad_norm": 0.3671875,
1831
- "learning_rate": 2.529749287590042e-05,
1832
- "loss": 0.9028,
1833
- "step": 1235
1834
  },
1835
  {
1836
- "epoch": 11.866028708133971,
1837
- "grad_norm": 0.337890625,
1838
  "learning_rate": 2.4558409508920986e-05,
1839
- "loss": 0.8946,
1840
- "step": 1240
1841
- },
1842
- {
1843
- "epoch": 11.913875598086124,
1844
- "grad_norm": 0.349609375,
1845
- "learning_rate": 2.382876918226409e-05,
1846
- "loss": 0.8938,
1847
- "step": 1245
1848
  },
1849
  {
1850
- "epoch": 11.961722488038278,
1851
- "grad_norm": 0.36328125,
1852
  "learning_rate": 2.3108663225168435e-05,
1853
- "loss": 0.9032,
1854
- "step": 1250
1855
  },
1856
  {
1857
  "epoch": 12.0,
1858
- "eval_loss": 2.5401320457458496,
1859
- "eval_runtime": 0.536,
1860
- "eval_samples_per_second": 18.658,
1861
- "eval_steps_per_second": 1.866,
1862
- "step": 1254
1863
- },
1864
- {
1865
- "epoch": 12.009569377990431,
1866
- "grad_norm": 0.33984375,
1867
- "learning_rate": 2.239818177345364e-05,
1868
- "loss": 0.9023,
1869
- "step": 1255
1870
- },
1871
- {
1872
- "epoch": 12.057416267942584,
1873
- "grad_norm": 0.337890625,
1874
  "learning_rate": 2.1697413758237784e-05,
1875
- "loss": 0.8902,
1876
- "step": 1260
1877
  },
1878
  {
1879
- "epoch": 12.105263157894736,
1880
- "grad_norm": 0.359375,
1881
- "learning_rate": 2.1006446894806065e-05,
1882
- "loss": 0.8958,
1883
- "step": 1265
 
1884
  },
1885
  {
1886
- "epoch": 12.15311004784689,
1887
- "grad_norm": 0.337890625,
1888
  "learning_rate": 2.032536767163141e-05,
1889
- "loss": 0.8949,
1890
- "step": 1270
1891
- },
1892
- {
1893
- "epoch": 12.200956937799043,
1894
- "grad_norm": 0.3671875,
1895
- "learning_rate": 1.965426133954854e-05,
1896
- "loss": 0.9023,
1897
- "step": 1275
1898
  },
1899
  {
1900
- "epoch": 12.248803827751196,
1901
- "grad_norm": 0.333984375,
1902
  "learning_rate": 1.8993211901083353e-05,
1903
- "loss": 0.895,
1904
- "step": 1280
1905
- },
1906
- {
1907
- "epoch": 12.296650717703349,
1908
- "grad_norm": 0.328125,
1909
- "learning_rate": 1.8342302099938057e-05,
1910
- "loss": 0.8925,
1911
- "step": 1285
1912
  },
1913
  {
1914
- "epoch": 12.344497607655502,
1915
- "grad_norm": 0.328125,
1916
  "learning_rate": 1.7701613410634365e-05,
1917
- "loss": 0.8986,
1918
- "step": 1290
1919
- },
1920
- {
1921
- "epoch": 12.392344497607656,
1922
- "grad_norm": 0.33203125,
1923
- "learning_rate": 1.7071226028315113e-05,
1924
- "loss": 0.8922,
1925
- "step": 1295
1926
  },
1927
  {
1928
- "epoch": 12.440191387559809,
1929
- "grad_norm": 0.35546875,
1930
  "learning_rate": 1.6451218858706374e-05,
1931
- "loss": 0.8878,
1932
- "step": 1300
1933
- },
1934
- {
1935
- "epoch": 12.488038277511961,
1936
- "grad_norm": 0.337890625,
1937
- "learning_rate": 1.584166950824061e-05,
1938
- "loss": 0.8992,
1939
- "step": 1305
1940
  },
1941
  {
1942
- "epoch": 12.535885167464114,
1943
- "grad_norm": 0.361328125,
1944
  "learning_rate": 1.5242654274342894e-05,
1945
- "loss": 0.8879,
1946
- "step": 1310
1947
- },
1948
- {
1949
- "epoch": 12.583732057416269,
1950
- "grad_norm": 0.353515625,
1951
- "learning_rate": 1.4654248135880621e-05,
1952
- "loss": 0.8942,
1953
- "step": 1315
1954
  },
1955
  {
1956
- "epoch": 12.631578947368421,
1957
- "grad_norm": 0.353515625,
1958
  "learning_rate": 1.4076524743778319e-05,
1959
- "loss": 0.8957,
1960
- "step": 1320
1961
- },
1962
- {
1963
- "epoch": 12.679425837320574,
1964
- "grad_norm": 0.33203125,
1965
- "learning_rate": 1.350955641179893e-05,
1966
- "loss": 0.8981,
1967
- "step": 1325
1968
  },
1969
  {
1970
- "epoch": 12.727272727272727,
1971
- "grad_norm": 0.353515625,
1972
  "learning_rate": 1.295341410749208e-05,
1973
- "loss": 0.8952,
1974
- "step": 1330
1975
- },
1976
- {
1977
- "epoch": 12.775119617224881,
1978
- "grad_norm": 0.34375,
1979
- "learning_rate": 1.2408167443311214e-05,
1980
- "loss": 0.8945,
1981
- "step": 1335
1982
  },
1983
  {
1984
- "epoch": 12.822966507177034,
1985
- "grad_norm": 0.359375,
1986
  "learning_rate": 1.1873884667900125e-05,
1987
- "loss": 0.8851,
1988
- "step": 1340
1989
- },
1990
- {
1991
- "epoch": 12.870813397129186,
1992
- "grad_norm": 0.34375,
1993
- "learning_rate": 1.1350632657550253e-05,
1994
- "loss": 0.8922,
1995
- "step": 1345
1996
  },
1997
  {
1998
- "epoch": 12.91866028708134,
1999
- "grad_norm": 0.3515625,
2000
  "learning_rate": 1.083847690782972e-05,
2001
- "loss": 0.8926,
2002
- "step": 1350
2003
- },
2004
- {
2005
- "epoch": 12.966507177033494,
2006
- "grad_norm": 0.349609375,
2007
- "learning_rate": 1.0337481525385362e-05,
2008
- "loss": 0.9006,
2009
- "step": 1355
2010
- },
2011
- {
2012
- "epoch": 12.995215311004785,
2013
- "eval_loss": 2.5425758361816406,
2014
- "eval_runtime": 0.6353,
2015
- "eval_samples_per_second": 15.741,
2016
- "eval_steps_per_second": 1.574,
2017
- "step": 1358
2018
  },
2019
  {
2020
- "epoch": 13.014354066985646,
2021
- "grad_norm": 0.345703125,
2022
  "learning_rate": 9.8477092199184e-06,
2023
- "loss": 0.9002,
2024
- "step": 1360
2025
  },
2026
  {
2027
- "epoch": 13.062200956937799,
2028
- "grad_norm": 0.34375,
2029
- "learning_rate": 9.369221296335006e-06,
2030
- "loss": 0.9007,
2031
- "step": 1365
 
2032
  },
2033
  {
2034
- "epoch": 13.110047846889952,
2035
- "grad_norm": 0.34765625,
2036
  "learning_rate": 8.902077647072881e-06,
2037
- "loss": 0.8882,
2038
- "step": 1370
2039
- },
2040
- {
2041
- "epoch": 13.157894736842104,
2042
- "grad_norm": 0.34375,
2043
- "learning_rate": 8.446336744604378e-06,
2044
- "loss": 0.888,
2045
- "step": 1375
2046
  },
2047
  {
2048
- "epoch": 13.205741626794259,
2049
- "grad_norm": 0.33203125,
2050
  "learning_rate": 8.002055634117578e-06,
2051
- "loss": 0.8869,
2052
- "step": 1380
2053
- },
2054
- {
2055
- "epoch": 13.253588516746412,
2056
- "grad_norm": 0.34375,
2057
- "learning_rate": 7.569289926375933e-06,
2058
- "loss": 0.8931,
2059
- "step": 1385
2060
  },
2061
  {
2062
- "epoch": 13.301435406698564,
2063
- "grad_norm": 0.326171875,
2064
  "learning_rate": 7.148093790757371e-06,
2065
- "loss": 0.8958,
2066
- "step": 1390
2067
- },
2068
- {
2069
- "epoch": 13.349282296650717,
2070
- "grad_norm": 0.330078125,
2071
- "learning_rate": 6.738519948473976e-06,
2072
- "loss": 0.8914,
2073
- "step": 1395
2074
  },
2075
  {
2076
- "epoch": 13.397129186602871,
2077
- "grad_norm": 0.349609375,
2078
  "learning_rate": 6.3406196659728465e-06,
2079
- "loss": 0.8975,
2080
- "step": 1400
2081
- },
2082
- {
2083
- "epoch": 13.444976076555024,
2084
- "grad_norm": 0.330078125,
2085
- "learning_rate": 5.954442748519073e-06,
2086
- "loss": 0.8908,
2087
- "step": 1405
2088
  },
2089
  {
2090
- "epoch": 13.492822966507177,
2091
- "grad_norm": 0.345703125,
2092
  "learning_rate": 5.580037533961546e-06,
2093
- "loss": 0.8938,
2094
- "step": 1410
2095
- },
2096
- {
2097
- "epoch": 13.54066985645933,
2098
- "grad_norm": 0.349609375,
2099
- "learning_rate": 5.217450886682584e-06,
2100
- "loss": 0.8958,
2101
- "step": 1415
2102
  },
2103
  {
2104
- "epoch": 13.588516746411484,
2105
- "grad_norm": 0.3359375,
2106
  "learning_rate": 4.866728191731829e-06,
2107
- "loss": 0.8905,
2108
- "step": 1420
2109
- },
2110
- {
2111
- "epoch": 13.636363636363637,
2112
- "grad_norm": 0.330078125,
2113
- "learning_rate": 4.527913349145441e-06,
2114
- "loss": 0.8995,
2115
- "step": 1425
2116
  },
2117
  {
2118
- "epoch": 13.68421052631579,
2119
- "grad_norm": 0.3671875,
2120
  "learning_rate": 4.20104876845111e-06,
2121
- "loss": 0.895,
2122
- "step": 1430
2123
- },
2124
- {
2125
- "epoch": 13.732057416267942,
2126
- "grad_norm": 0.33203125,
2127
- "learning_rate": 3.886175363359646e-06,
2128
- "loss": 0.8964,
2129
- "step": 1435
2130
  },
2131
  {
2132
- "epoch": 13.779904306220097,
2133
- "grad_norm": 0.34375,
2134
  "learning_rate": 3.5833325466437694e-06,
2135
- "loss": 0.891,
2136
- "step": 1440
2137
- },
2138
- {
2139
- "epoch": 13.82775119617225,
2140
- "grad_norm": 0.341796875,
2141
- "learning_rate": 3.2925582252048338e-06,
2142
- "loss": 0.8903,
2143
- "step": 1445
2144
  },
2145
  {
2146
- "epoch": 13.875598086124402,
2147
- "grad_norm": 0.3359375,
2148
  "learning_rate": 3.013888795328057e-06,
2149
- "loss": 0.8926,
2150
- "step": 1450
2151
- },
2152
- {
2153
- "epoch": 13.923444976076555,
2154
- "grad_norm": 0.330078125,
2155
- "learning_rate": 2.7473591381266708e-06,
2156
- "loss": 0.8921,
2157
- "step": 1455
2158
  },
2159
  {
2160
- "epoch": 13.971291866028707,
2161
- "grad_norm": 0.357421875,
2162
  "learning_rate": 2.4930026151759766e-06,
2163
- "loss": 0.9007,
2164
- "step": 1460
2165
  },
2166
  {
2167
  "epoch": 14.0,
2168
- "eval_loss": 2.5432519912719727,
2169
- "eval_runtime": 0.5348,
2170
- "eval_samples_per_second": 18.7,
2171
- "eval_steps_per_second": 1.87,
2172
- "step": 1463
2173
- },
2174
- {
2175
- "epoch": 14.019138755980862,
2176
- "grad_norm": 0.3359375,
2177
- "learning_rate": 2.250851064337367e-06,
2178
- "loss": 0.8857,
2179
- "step": 1465
2180
- },
2181
- {
2182
- "epoch": 14.066985645933014,
2183
- "grad_norm": 0.330078125,
2184
  "learning_rate": 2.0209347957732328e-06,
2185
- "loss": 0.897,
2186
- "step": 1470
2187
  },
2188
  {
2189
- "epoch": 14.114832535885167,
2190
- "grad_norm": 0.337890625,
2191
- "learning_rate": 1.8032825881530213e-06,
2192
- "loss": 0.8863,
2193
- "step": 1475
2194
  },
2195
  {
2196
- "epoch": 14.16267942583732,
2197
- "grad_norm": 0.3359375,
2198
  "learning_rate": 1.5979216850509848e-06,
2199
- "loss": 0.8968,
2200
- "step": 1480
2201
- },
2202
- {
2203
- "epoch": 14.210526315789474,
2204
- "grad_norm": 0.345703125,
2205
- "learning_rate": 1.404877791536141e-06,
2206
- "loss": 0.8934,
2207
- "step": 1485
2208
  },
2209
  {
2210
- "epoch": 14.258373205741627,
2211
- "grad_norm": 0.3671875,
2212
  "learning_rate": 1.2241750709546917e-06,
2213
- "loss": 0.8868,
2214
- "step": 1490
2215
- },
2216
- {
2217
- "epoch": 14.30622009569378,
2218
- "grad_norm": 0.341796875,
2219
- "learning_rate": 1.055836141905553e-06,
2220
- "loss": 0.8992,
2221
- "step": 1495
2222
  },
2223
  {
2224
- "epoch": 14.354066985645932,
2225
- "grad_norm": 0.337890625,
2226
  "learning_rate": 8.998820754091531e-07,
2227
- "loss": 0.8934,
2228
- "step": 1500
2229
- },
2230
- {
2231
- "epoch": 14.401913875598087,
2232
- "grad_norm": 0.3515625,
2233
- "learning_rate": 7.563323922699983e-07,
2234
- "loss": 0.8897,
2235
- "step": 1505
2236
  },
2237
  {
2238
- "epoch": 14.44976076555024,
2239
- "grad_norm": 0.328125,
2240
  "learning_rate": 6.25205060633205e-07,
2241
- "loss": 0.8929,
2242
- "step": 1510
2243
- },
2244
- {
2245
- "epoch": 14.497607655502392,
2246
- "grad_norm": 0.3359375,
2247
- "learning_rate": 5.065164937354428e-07,
2248
- "loss": 0.9045,
2249
- "step": 1515
2250
  },
2251
  {
2252
- "epoch": 14.545454545454545,
2253
- "grad_norm": 0.328125,
2254
  "learning_rate": 4.0028154785050063e-07,
2255
- "loss": 0.8877,
2256
- "step": 1520
2257
- },
2258
- {
2259
- "epoch": 14.593301435406698,
2260
- "grad_norm": 0.3515625,
2261
- "learning_rate": 3.065135204296965e-07,
2262
- "loss": 0.8995,
2263
- "step": 1525
2264
  },
2265
  {
2266
- "epoch": 14.641148325358852,
2267
- "grad_norm": 0.3359375,
2268
  "learning_rate": 2.2522414843748618e-07,
2269
- "loss": 0.8919,
2270
- "step": 1530
2271
- },
2272
- {
2273
- "epoch": 14.688995215311005,
2274
- "grad_norm": 0.353515625,
2275
- "learning_rate": 1.5642360688225e-07,
2276
- "loss": 0.8905,
2277
- "step": 1535
2278
  },
2279
  {
2280
- "epoch": 14.736842105263158,
2281
- "grad_norm": 0.33203125,
2282
  "learning_rate": 1.0012050754277802e-07,
2283
- "loss": 0.8943,
2284
- "step": 1540
2285
- },
2286
- {
2287
- "epoch": 14.78468899521531,
2288
- "grad_norm": 0.34375,
2289
- "learning_rate": 5.632189789027687e-08,
2290
- "loss": 0.8943,
2291
- "step": 1545
2292
  },
2293
  {
2294
- "epoch": 14.832535885167465,
2295
- "grad_norm": 0.330078125,
2296
  "learning_rate": 2.5033260206275277e-08,
2297
- "loss": 0.8969,
2298
- "step": 1550
2299
- },
2300
- {
2301
- "epoch": 14.880382775119617,
2302
- "grad_norm": 0.326171875,
2303
- "learning_rate": 6.25851089636198e-09,
2304
- "loss": 0.8944,
2305
- "step": 1555
2306
  },
2307
  {
2308
- "epoch": 14.92822966507177,
2309
- "grad_norm": 0.330078125,
2310
  "learning_rate": 0.0,
2311
- "loss": 0.896,
2312
- "step": 1560
2313
  },
2314
  {
2315
- "epoch": 14.92822966507177,
2316
- "eval_loss": 2.542541980743408,
2317
- "eval_runtime": 0.5324,
2318
- "eval_samples_per_second": 18.782,
2319
- "eval_steps_per_second": 1.878,
2320
- "step": 1560
2321
  },
2322
  {
2323
- "epoch": 14.92822966507177,
2324
- "step": 1560,
2325
- "total_flos": 1.2217365722824704e+18,
2326
- "train_loss": 1.0227742999027938,
2327
- "train_runtime": 5355.4835,
2328
- "train_samples_per_second": 18.668,
2329
- "train_steps_per_second": 0.291
2330
  }
2331
  ],
2332
  "logging_steps": 5,
2333
- "max_steps": 1560,
2334
  "num_input_tokens_seen": 0,
2335
  "num_train_epochs": 15,
2336
  "save_steps": 100,
2337
- "total_flos": 1.2217365722824704e+18,
2338
- "train_batch_size": 8,
2339
  "trial_name": null,
2340
  "trial_params": null
2341
  }
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 14.857142857142858,
5
  "eval_steps": 500,
6
+ "global_step": 780,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01904761904761905,
13
+ "grad_norm": 3.15625,
14
+ "learning_rate": 2.564102564102564e-06,
15
+ "loss": 3.3205,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.09523809523809523,
20
+ "grad_norm": 4.34375,
21
+ "learning_rate": 1.282051282051282e-05,
22
+ "loss": 3.3126,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.19047619047619047,
27
+ "grad_norm": 2.609375,
28
+ "learning_rate": 2.564102564102564e-05,
29
+ "loss": 3.2719,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.2857142857142857,
34
+ "grad_norm": 2.828125,
35
+ "learning_rate": 3.846153846153846e-05,
36
+ "loss": 3.1205,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.38095238095238093,
41
+ "grad_norm": 2.03125,
42
+ "learning_rate": 5.128205128205128e-05,
43
+ "loss": 2.8114,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.47619047619047616,
48
+ "grad_norm": 2.125,
49
+ "learning_rate": 6.410256410256412e-05,
50
+ "loss": 2.5384,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.5714285714285714,
55
+ "grad_norm": 6.03125,
56
+ "learning_rate": 7.692307692307693e-05,
57
+ "loss": 2.3201,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.6666666666666666,
62
+ "grad_norm": 2.0625,
63
+ "learning_rate": 8.974358974358975e-05,
64
+ "loss": 2.1053,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.7619047619047619,
69
+ "grad_norm": 1.359375,
70
+ "learning_rate": 0.00010256410256410256,
71
+ "loss": 1.8717,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.8571428571428571,
76
+ "grad_norm": 0.54296875,
77
+ "learning_rate": 0.00011538461538461538,
78
+ "loss": 1.6592,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.9523809523809523,
83
+ "grad_norm": 0.427734375,
84
+ "learning_rate": 0.00012820512820512823,
85
+ "loss": 1.518,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.9904761904761905,
90
+ "eval_loss": 2.7708818912506104,
91
+ "eval_runtime": 0.4987,
92
+ "eval_samples_per_second": 42.112,
93
+ "eval_steps_per_second": 2.005,
94
+ "step": 52
95
+ },
96
+ {
97
+ "epoch": 1.0476190476190477,
98
+ "grad_norm": 0.396484375,
99
+ "learning_rate": 0.00014102564102564104,
100
+ "loss": 1.426,
101
  "step": 55
102
  },
103
  {
104
+ "epoch": 1.1428571428571428,
105
+ "grad_norm": 0.51953125,
106
+ "learning_rate": 0.00015384615384615385,
107
+ "loss": 1.3513,
108
  "step": 60
109
  },
110
  {
111
+ "epoch": 1.2380952380952381,
112
+ "grad_norm": 0.75390625,
113
+ "learning_rate": 0.0001666666666666667,
114
+ "loss": 1.3036,
115
  "step": 65
116
  },
117
  {
118
+ "epoch": 1.3333333333333333,
119
+ "grad_norm": 0.703125,
120
+ "learning_rate": 0.0001794871794871795,
121
+ "loss": 1.2627,
122
  "step": 70
123
  },
124
  {
125
+ "epoch": 1.4285714285714286,
126
+ "grad_norm": 0.6015625,
127
+ "learning_rate": 0.00019230769230769233,
128
+ "loss": 1.2335,
129
  "step": 75
130
  },
131
  {
132
+ "epoch": 1.5238095238095237,
133
+ "grad_norm": 0.31640625,
134
+ "learning_rate": 0.00019999599453798524,
135
+ "loss": 1.2094,
136
  "step": 80
137
  },
138
  {
139
+ "epoch": 1.619047619047619,
140
+ "grad_norm": 0.359375,
141
+ "learning_rate": 0.0001999509367752813,
142
+ "loss": 1.1957,
143
  "step": 85
144
  },
145
  {
146
+ "epoch": 1.7142857142857144,
147
+ "grad_norm": 0.80859375,
148
+ "learning_rate": 0.00019985583705641418,
149
+ "loss": 1.1772,
150
  "step": 90
151
  },
152
  {
153
+ "epoch": 1.8095238095238095,
154
+ "grad_norm": 0.5,
155
+ "learning_rate": 0.0001997107429945041,
156
+ "loss": 1.1595,
157
  "step": 95
158
  },
159
  {
160
+ "epoch": 1.9047619047619047,
161
+ "grad_norm": 0.48046875,
162
+ "learning_rate": 0.0001995157272330992,
163
+ "loss": 1.1451,
164
  "step": 100
165
  },
166
  {
167
+ "epoch": 2.0,
168
+ "grad_norm": 0.408203125,
169
+ "learning_rate": 0.0001992708874098054,
170
+ "loss": 1.1423,
171
+ "step": 105
172
  },
173
  {
174
+ "epoch": 2.0,
175
+ "eval_loss": 2.659493923187256,
176
+ "eval_runtime": 0.4877,
177
+ "eval_samples_per_second": 43.055,
178
+ "eval_steps_per_second": 2.05,
179
  "step": 105
180
  },
181
  {
182
+ "epoch": 2.0952380952380953,
183
+ "grad_norm": 0.419921875,
184
+ "learning_rate": 0.00019897634610740287,
185
+ "loss": 1.1131,
186
  "step": 110
187
  },
188
  {
189
+ "epoch": 2.1904761904761907,
190
+ "grad_norm": 0.50390625,
191
+ "learning_rate": 0.00019863225079247285,
192
+ "loss": 1.1121,
193
  "step": 115
194
  },
195
  {
196
+ "epoch": 2.2857142857142856,
197
+ "grad_norm": 0.333984375,
198
+ "learning_rate": 0.00019823877374156647,
199
+ "loss": 1.103,
200
  "step": 120
201
  },
202
  {
203
+ "epoch": 2.380952380952381,
204
+ "grad_norm": 0.5859375,
205
+ "learning_rate": 0.00019779611195495177,
206
+ "loss": 1.1023,
207
  "step": 125
208
  },
209
  {
210
+ "epoch": 2.4761904761904763,
211
+ "grad_norm": 0.95703125,
212
+ "learning_rate": 0.00019730448705798239,
213
+ "loss": 1.086,
214
  "step": 130
215
  },
216
  {
217
+ "epoch": 2.571428571428571,
218
+ "grad_norm": 0.392578125,
219
+ "learning_rate": 0.00019676414519013781,
220
+ "loss": 1.0886,
221
  "step": 135
222
  },
223
  {
224
+ "epoch": 2.6666666666666665,
225
+ "grad_norm": 0.70703125,
226
+ "learning_rate": 0.0001961753568817896,
227
+ "loss": 1.0818,
228
  "step": 140
229
  },
230
  {
231
+ "epoch": 2.761904761904762,
232
+ "grad_norm": 0.49609375,
233
+ "learning_rate": 0.0001955384169187563,
234
+ "loss": 1.0799,
235
  "step": 145
236
  },
237
  {
238
+ "epoch": 2.857142857142857,
239
+ "grad_norm": 0.71484375,
240
+ "learning_rate": 0.00019485364419471454,
241
+ "loss": 1.0792,
242
  "step": 150
243
  },
244
  {
245
+ "epoch": 2.9523809523809526,
246
+ "grad_norm": 0.267578125,
247
+ "learning_rate": 0.00019412138155154002,
248
+ "loss": 1.0681,
249
  "step": 155
250
  },
251
  {
252
+ "epoch": 2.9904761904761905,
253
+ "eval_loss": 2.6406373977661133,
254
+ "eval_runtime": 0.4936,
255
+ "eval_samples_per_second": 42.54,
256
+ "eval_steps_per_second": 2.026,
257
+ "step": 157
258
+ },
259
+ {
260
+ "epoch": 3.0476190476190474,
261
+ "grad_norm": 0.291015625,
262
+ "learning_rate": 0.0001933419956076584,
263
+ "loss": 1.0607,
264
  "step": 160
265
  },
266
  {
267
+ "epoch": 3.142857142857143,
268
+ "grad_norm": 0.384765625,
269
+ "learning_rate": 0.00019251587657449236,
270
+ "loss": 1.0467,
271
  "step": 165
272
  },
273
  {
274
+ "epoch": 3.238095238095238,
275
+ "grad_norm": 0.2578125,
276
+ "learning_rate": 0.00019164343806109632,
277
+ "loss": 1.0429,
278
  "step": 170
279
  },
280
  {
281
+ "epoch": 3.3333333333333335,
282
+ "grad_norm": 0.28125,
283
+ "learning_rate": 0.00019072511686707663,
284
+ "loss": 1.0477,
285
  "step": 175
286
  },
287
  {
288
+ "epoch": 3.4285714285714284,
289
+ "grad_norm": 0.23046875,
290
+ "learning_rate": 0.0001897613727639014,
291
+ "loss": 1.0407,
292
  "step": 180
293
  },
294
  {
295
+ "epoch": 3.5238095238095237,
296
+ "grad_norm": 0.390625,
297
+ "learning_rate": 0.00018875268826470872,
298
+ "loss": 1.0408,
299
  "step": 185
300
  },
301
  {
302
+ "epoch": 3.619047619047619,
303
+ "grad_norm": 0.3046875,
304
+ "learning_rate": 0.00018769956838272936,
305
+ "loss": 1.0392,
306
  "step": 190
307
  },
308
  {
309
+ "epoch": 3.7142857142857144,
310
+ "grad_norm": 0.3203125,
311
+ "learning_rate": 0.00018660254037844388,
312
+ "loss": 1.0393,
313
  "step": 195
314
  },
315
  {
316
+ "epoch": 3.8095238095238093,
317
+ "grad_norm": 0.32421875,
318
+ "learning_rate": 0.00018546215349560203,
319
+ "loss": 1.0385,
320
  "step": 200
321
  },
322
  {
323
+ "epoch": 3.9047619047619047,
324
+ "grad_norm": 0.828125,
325
+ "learning_rate": 0.00018427897868623534,
326
+ "loss": 1.0383,
327
  "step": 205
328
  },
329
  {
330
+ "epoch": 4.0,
331
+ "grad_norm": 0.4375,
332
+ "learning_rate": 0.00018305360832480117,
333
+ "loss": 1.0335,
334
+ "step": 210
335
  },
336
  {
337
+ "epoch": 4.0,
338
+ "eval_loss": 2.642651319503784,
339
+ "eval_runtime": 0.4862,
340
+ "eval_samples_per_second": 43.193,
341
+ "eval_steps_per_second": 2.057,
342
  "step": 210
343
  },
344
  {
345
+ "epoch": 4.095238095238095,
346
+ "grad_norm": 0.291015625,
347
+ "learning_rate": 0.00018178665591160172,
348
+ "loss": 1.0188,
349
  "step": 215
350
  },
351
  {
352
+ "epoch": 4.190476190476191,
353
+ "grad_norm": 0.27734375,
354
+ "learning_rate": 0.00018047875576562557,
355
+ "loss": 1.0134,
356
  "step": 220
357
  },
358
  {
359
+ "epoch": 4.285714285714286,
360
+ "grad_norm": 0.255859375,
361
+ "learning_rate": 0.0001791305627069662,
362
+ "loss": 1.0109,
363
  "step": 225
364
  },
365
  {
366
+ "epoch": 4.380952380952381,
367
+ "grad_norm": 0.265625,
368
+ "learning_rate": 0.0001777427517289766,
369
+ "loss": 1.009,
370
  "step": 230
371
  },
372
  {
373
+ "epoch": 4.476190476190476,
374
+ "grad_norm": 0.2890625,
375
+ "learning_rate": 0.00017631601766032336,
376
+ "loss": 1.0116,
377
  "step": 235
378
  },
379
  {
380
+ "epoch": 4.571428571428571,
381
+ "grad_norm": 0.5,
382
+ "learning_rate": 0.00017485107481711012,
383
+ "loss": 1.0067,
384
  "step": 240
385
  },
386
  {
387
+ "epoch": 4.666666666666667,
388
+ "grad_norm": 0.58203125,
389
+ "learning_rate": 0.0001733486566452446,
390
+ "loss": 1.0112,
391
  "step": 245
392
  },
393
  {
394
+ "epoch": 4.761904761904762,
395
+ "grad_norm": 0.40234375,
396
+ "learning_rate": 0.0001718095153532274,
397
+ "loss": 1.0096,
398
  "step": 250
399
  },
400
  {
401
+ "epoch": 4.857142857142857,
402
+ "grad_norm": 0.67578125,
403
+ "learning_rate": 0.00017023442153554777,
404
+ "loss": 1.0046,
405
  "step": 255
406
  },
407
  {
408
+ "epoch": 4.9523809523809526,
409
+ "grad_norm": 0.76953125,
410
+ "learning_rate": 0.0001686241637868734,
411
+ "loss": 1.0079,
412
  "step": 260
413
  },
414
  {
415
+ "epoch": 4.9904761904761905,
416
+ "eval_loss": 2.645949363708496,
417
+ "eval_runtime": 0.4992,
418
+ "eval_samples_per_second": 42.069,
419
+ "eval_steps_per_second": 2.003,
420
+ "step": 262
421
+ },
422
+ {
423
+ "epoch": 5.0476190476190474,
424
+ "grad_norm": 0.310546875,
425
+ "learning_rate": 0.00016697954830722868,
426
+ "loss": 0.9948,
427
  "step": 265
428
  },
429
  {
430
+ "epoch": 5.142857142857143,
431
+ "grad_norm": 0.34375,
432
+ "learning_rate": 0.0001653013984983585,
433
+ "loss": 0.9868,
434
  "step": 270
435
  },
436
  {
437
+ "epoch": 5.238095238095238,
438
+ "grad_norm": 0.5625,
439
+ "learning_rate": 0.0001635905545514795,
440
+ "loss": 0.9845,
441
  "step": 275
442
  },
443
  {
444
+ "epoch": 5.333333333333333,
445
+ "grad_norm": 0.5859375,
446
+ "learning_rate": 0.0001618478730266255,
447
+ "loss": 0.9812,
448
  "step": 280
449
  },
450
  {
451
+ "epoch": 5.428571428571429,
452
+ "grad_norm": 0.65625,
453
+ "learning_rate": 0.0001600742264237979,
454
+ "loss": 0.9887,
455
  "step": 285
456
  },
457
  {
458
+ "epoch": 5.523809523809524,
459
+ "grad_norm": 0.357421875,
460
+ "learning_rate": 0.00015827050274613513,
461
+ "loss": 0.9863,
462
  "step": 290
463
  },
464
  {
465
+ "epoch": 5.619047619047619,
466
+ "grad_norm": 0.30078125,
467
+ "learning_rate": 0.0001564376050553205,
468
+ "loss": 0.9862,
469
  "step": 295
470
  },
471
  {
472
+ "epoch": 5.714285714285714,
473
+ "grad_norm": 0.275390625,
474
+ "learning_rate": 0.00015457645101945046,
475
+ "loss": 0.9865,
476
  "step": 300
477
  },
478
  {
479
+ "epoch": 5.809523809523809,
480
+ "grad_norm": 0.337890625,
481
+ "learning_rate": 0.00015268797245359035,
482
+ "loss": 0.9929,
483
  "step": 305
484
  },
485
  {
486
+ "epoch": 5.904761904761905,
487
+ "grad_norm": 0.294921875,
488
+ "learning_rate": 0.0001507731148532468,
489
+ "loss": 0.9868,
490
  "step": 310
491
  },
492
  {
493
+ "epoch": 6.0,
494
+ "grad_norm": 0.32421875,
495
+ "learning_rate": 0.00014883283692099112,
496
+ "loss": 0.9837,
497
+ "step": 315
498
  },
499
  {
500
+ "epoch": 6.0,
501
+ "eval_loss": 2.657383441925049,
502
+ "eval_runtime": 0.4893,
503
+ "eval_samples_per_second": 42.915,
504
+ "eval_steps_per_second": 2.044,
505
  "step": 315
506
  },
507
  {
508
+ "epoch": 6.095238095238095,
509
+ "grad_norm": 0.294921875,
510
+ "learning_rate": 0.00014686811008647038,
511
+ "loss": 0.9669,
512
  "step": 320
513
  },
514
  {
515
+ "epoch": 6.190476190476191,
516
+ "grad_norm": 0.287109375,
517
+ "learning_rate": 0.00014487991802004623,
518
+ "loss": 0.9674,
519
  "step": 325
520
  },
521
  {
522
+ "epoch": 6.285714285714286,
523
+ "grad_norm": 0.3046875,
524
+ "learning_rate": 0.00014286925614030542,
525
+ "loss": 0.9712,
526
  "step": 330
527
  },
528
  {
529
+ "epoch": 6.380952380952381,
530
+ "grad_norm": 0.28515625,
531
+ "learning_rate": 0.00014083713111568842,
532
+ "loss": 0.9663,
533
  "step": 335
534
  },
535
  {
536
+ "epoch": 6.476190476190476,
537
+ "grad_norm": 0.462890625,
538
+ "learning_rate": 0.0001387845603604855,
539
+ "loss": 0.9674,
540
  "step": 340
541
  },
542
  {
543
+ "epoch": 6.571428571428571,
544
+ "grad_norm": 0.54296875,
545
+ "learning_rate": 0.00013671257152545277,
546
+ "loss": 0.9732,
547
  "step": 345
548
  },
549
  {
550
+ "epoch": 6.666666666666667,
551
+ "grad_norm": 0.490234375,
552
+ "learning_rate": 0.00013462220198330328,
553
+ "loss": 0.9743,
554
  "step": 350
555
  },
556
  {
557
+ "epoch": 6.761904761904762,
558
+ "grad_norm": 0.296875,
559
+ "learning_rate": 0.0001325144983093305,
560
+ "loss": 0.9704,
561
  "step": 355
562
  },
563
  {
564
+ "epoch": 6.857142857142857,
565
+ "grad_norm": 0.326171875,
566
+ "learning_rate": 0.0001303905157574247,
567
+ "loss": 0.9617,
568
  "step": 360
569
  },
570
  {
571
+ "epoch": 6.9523809523809526,
572
+ "grad_norm": 0.3984375,
573
+ "learning_rate": 0.0001282513177317437,
574
+ "loss": 0.966,
575
  "step": 365
576
  },
577
  {
578
+ "epoch": 6.9904761904761905,
579
+ "eval_loss": 2.6700327396392822,
580
+ "eval_runtime": 0.5365,
581
+ "eval_samples_per_second": 39.146,
582
+ "eval_steps_per_second": 1.864,
583
+ "step": 367
584
+ },
585
+ {
586
+ "epoch": 7.0476190476190474,
587
+ "grad_norm": 0.294921875,
588
+ "learning_rate": 0.00012609797525430373,
589
+ "loss": 0.9592,
590
  "step": 370
591
  },
592
  {
593
+ "epoch": 7.142857142857143,
594
+ "grad_norm": 0.3203125,
595
+ "learning_rate": 0.0001239315664287558,
596
+ "loss": 0.9535,
597
  "step": 375
598
  },
599
  {
600
+ "epoch": 7.238095238095238,
601
+ "grad_norm": 0.26953125,
602
+ "learning_rate": 0.00012175317590061674,
603
+ "loss": 0.9528,
604
  "step": 380
605
  },
606
  {
607
+ "epoch": 7.333333333333333,
608
+ "grad_norm": 0.416015625,
609
+ "learning_rate": 0.00011956389431422507,
610
+ "loss": 0.9572,
611
  "step": 385
612
  },
613
  {
614
+ "epoch": 7.428571428571429,
615
+ "grad_norm": 0.341796875,
616
+ "learning_rate": 0.00011736481776669306,
617
+ "loss": 0.9461,
618
  "step": 390
619
  },
620
  {
621
+ "epoch": 7.523809523809524,
622
+ "grad_norm": 0.30078125,
623
+ "learning_rate": 0.00011515704725912926,
624
+ "loss": 0.9513,
625
  "step": 395
626
  },
627
  {
628
+ "epoch": 7.619047619047619,
629
+ "grad_norm": 0.3125,
630
+ "learning_rate": 0.00011294168814540553,
631
+ "loss": 0.9566,
632
  "step": 400
633
  },
634
  {
635
+ "epoch": 7.714285714285714,
636
+ "grad_norm": 0.609375,
637
+ "learning_rate": 0.00011071984957874479,
638
+ "loss": 0.9594,
639
  "step": 405
640
  },
641
  {
642
+ "epoch": 7.809523809523809,
643
+ "grad_norm": 0.296875,
644
+ "learning_rate": 0.00010849264395640649,
645
+ "loss": 0.9554,
646
  "step": 410
647
  },
648
  {
649
+ "epoch": 7.904761904761905,
650
+ "grad_norm": 0.33203125,
651
+ "learning_rate": 0.0001062611863627482,
652
+ "loss": 0.9479,
653
  "step": 415
654
  },
655
  {
656
+ "epoch": 8.0,
657
+ "grad_norm": 0.353515625,
658
+ "learning_rate": 0.00010402659401094152,
659
+ "loss": 0.9474,
660
+ "step": 420
661
  },
662
  {
663
+ "epoch": 8.0,
664
+ "eval_loss": 2.6798880100250244,
665
+ "eval_runtime": 0.4848,
666
+ "eval_samples_per_second": 43.314,
667
+ "eval_steps_per_second": 2.063,
668
  "step": 420
669
  },
670
  {
671
+ "epoch": 8.095238095238095,
672
+ "grad_norm": 0.404296875,
673
+ "learning_rate": 0.00010178998568362243,
674
+ "loss": 0.9432,
675
  "step": 425
676
  },
677
  {
678
+ "epoch": 8.19047619047619,
679
+ "grad_norm": 0.26953125,
680
+ "learning_rate": 9.955248117275566e-05,
681
+ "loss": 0.9398,
682
  "step": 430
683
  },
684
  {
685
+ "epoch": 8.285714285714286,
686
+ "grad_norm": 0.326171875,
687
+ "learning_rate": 9.73152007189939e-05,
688
+ "loss": 0.9348,
689
  "step": 435
690
  },
691
  {
692
+ "epoch": 8.380952380952381,
693
+ "grad_norm": 0.30078125,
694
+ "learning_rate": 9.507926445081219e-05,
695
+ "loss": 0.9416,
696
  "step": 440
697
  },
698
  {
699
+ "epoch": 8.476190476190476,
700
+ "grad_norm": 0.28515625,
701
+ "learning_rate": 9.284579182369867e-05,
702
+ "loss": 0.9352,
703
  "step": 445
704
  },
705
  {
706
+ "epoch": 8.571428571428571,
707
+ "grad_norm": 0.283203125,
708
+ "learning_rate": 9.061590105968208e-05,
709
+ "loss": 0.9427,
710
  "step": 450
711
  },
712
  {
713
+ "epoch": 8.666666666666666,
714
+ "grad_norm": 0.283203125,
715
+ "learning_rate": 8.839070858747697e-05,
716
+ "loss": 0.9413,
717
  "step": 455
718
  },
719
  {
720
+ "epoch": 8.761904761904763,
721
+ "grad_norm": 0.294921875,
722
+ "learning_rate": 8.617132848352671e-05,
723
+ "loss": 0.9409,
724
  "step": 460
725
  },
726
  {
727
+ "epoch": 8.857142857142858,
728
+ "grad_norm": 0.2890625,
729
+ "learning_rate": 8.395887191422397e-05,
730
+ "loss": 0.942,
731
  "step": 465
732
  },
733
  {
734
+ "epoch": 8.952380952380953,
735
+ "grad_norm": 0.298828125,
736
+ "learning_rate": 8.175444657958876e-05,
737
+ "loss": 0.9406,
738
  "step": 470
739
  },
740
  {
741
+ "epoch": 8.99047619047619,
742
+ "eval_loss": 2.688331365585327,
743
+ "eval_runtime": 0.5482,
744
+ "eval_samples_per_second": 38.309,
745
+ "eval_steps_per_second": 1.824,
746
+ "step": 472
747
+ },
748
+ {
749
+ "epoch": 9.047619047619047,
750
+ "grad_norm": 0.302734375,
751
+ "learning_rate": 7.955915615868111e-05,
752
+ "loss": 0.9375,
753
  "step": 475
754
  },
755
  {
756
+ "epoch": 9.142857142857142,
757
+ "grad_norm": 0.275390625,
758
+ "learning_rate": 7.73740997570278e-05,
759
+ "loss": 0.9322,
760
  "step": 480
761
  },
762
  {
763
+ "epoch": 9.238095238095237,
764
+ "grad_norm": 0.26171875,
765
+ "learning_rate": 7.520037135633816e-05,
766
+ "loss": 0.9292,
767
  "step": 485
768
  },
769
  {
770
+ "epoch": 9.333333333333334,
771
+ "grad_norm": 0.287109375,
772
  "learning_rate": 7.303905926678564e-05,
773
+ "loss": 0.9269,
774
+ "step": 490
775
  },
776
  {
777
+ "epoch": 9.428571428571429,
778
+ "grad_norm": 0.341796875,
779
  "learning_rate": 7.089124558212871e-05,
780
+ "loss": 0.9331,
781
+ "step": 495
782
  },
783
  {
784
+ "epoch": 9.523809523809524,
785
+ "grad_norm": 0.28125,
786
  "learning_rate": 6.875800563794425e-05,
787
+ "loss": 0.9344,
788
+ "step": 500
789
  },
790
  {
791
+ "epoch": 9.619047619047619,
792
+ "grad_norm": 0.28515625,
793
  "learning_rate": 6.664040747324437e-05,
794
+ "loss": 0.9294,
795
+ "step": 505
796
  },
797
  {
798
+ "epoch": 9.714285714285714,
799
+ "grad_norm": 0.267578125,
800
  "learning_rate": 6.453951129574644e-05,
801
+ "loss": 0.9347,
802
+ "step": 510
803
  },
804
  {
805
+ "epoch": 9.80952380952381,
806
+ "grad_norm": 0.30078125,
807
  "learning_rate": 6.245636895106402e-05,
808
+ "loss": 0.9282,
809
+ "step": 515
810
  },
811
  {
812
+ "epoch": 9.904761904761905,
813
+ "grad_norm": 0.353515625,
814
  "learning_rate": 6.039202339608432e-05,
815
+ "loss": 0.9299,
816
+ "step": 520
817
  },
818
  {
819
  "epoch": 10.0,
820
+ "grad_norm": 0.27734375,
821
  "learning_rate": 5.834750817679606e-05,
822
+ "loss": 0.9245,
823
+ "step": 525
824
  },
825
  {
826
+ "epoch": 10.0,
827
+ "eval_loss": 2.6975326538085938,
828
+ "eval_runtime": 0.485,
829
+ "eval_samples_per_second": 43.303,
830
+ "eval_steps_per_second": 2.062,
831
+ "step": 525
832
  },
833
  {
834
+ "epoch": 10.095238095238095,
835
+ "grad_norm": 0.30078125,
836
  "learning_rate": 5.6323846910828735e-05,
837
+ "loss": 0.9233,
838
+ "step": 530
839
  },
840
  {
841
+ "epoch": 10.19047619047619,
842
+ "grad_norm": 0.265625,
843
  "learning_rate": 5.432205277496327e-05,
844
+ "loss": 0.9235,
845
+ "step": 535
846
  },
847
  {
848
+ "epoch": 10.285714285714286,
849
+ "grad_norm": 0.2734375,
850
  "learning_rate": 5.234312799786921e-05,
851
+ "loss": 0.9194,
852
+ "step": 540
853
  },
854
  {
855
+ "epoch": 10.380952380952381,
856
+ "grad_norm": 0.29296875,
857
  "learning_rate": 5.0388063358324134e-05,
858
+ "loss": 0.9235,
859
+ "step": 545
860
  },
861
  {
862
+ "epoch": 10.476190476190476,
863
+ "grad_norm": 0.279296875,
864
  "learning_rate": 4.845783768916482e-05,
865
+ "loss": 0.9217,
866
+ "step": 550
867
  },
868
  {
869
+ "epoch": 10.571428571428571,
870
+ "grad_norm": 0.265625,
871
  "learning_rate": 4.6553417387219886e-05,
872
+ "loss": 0.9243,
873
+ "step": 555
874
  },
875
  {
876
+ "epoch": 10.666666666666666,
877
+ "grad_norm": 0.306640625,
878
  "learning_rate": 4.467575592946864e-05,
879
+ "loss": 0.9262,
880
+ "step": 560
881
  },
882
  {
883
+ "epoch": 10.761904761904763,
884
+ "grad_norm": 0.255859375,
885
  "learning_rate": 4.282579339566802e-05,
886
+ "loss": 0.9206,
887
+ "step": 565
888
  },
889
  {
890
+ "epoch": 10.857142857142858,
891
+ "grad_norm": 0.2578125,
892
  "learning_rate": 4.100445599768774e-05,
893
+ "loss": 0.9256,
894
+ "step": 570
895
  },
896
  {
897
+ "epoch": 10.952380952380953,
898
+ "grad_norm": 0.265625,
899
  "learning_rate": 3.9212655615787804e-05,
900
+ "loss": 0.9208,
901
+ "step": 575
902
  },
903
  {
904
+ "epoch": 10.99047619047619,
905
+ "eval_loss": 2.707897663116455,
906
+ "eval_runtime": 0.647,
907
+ "eval_samples_per_second": 32.456,
908
+ "eval_steps_per_second": 1.546,
909
+ "step": 577
910
  },
911
  {
912
+ "epoch": 11.047619047619047,
913
+ "grad_norm": 0.25,
914
  "learning_rate": 3.745128934207225e-05,
915
+ "loss": 0.9226,
916
+ "step": 580
917
  },
918
  {
919
+ "epoch": 11.142857142857142,
920
+ "grad_norm": 0.263671875,
921
  "learning_rate": 3.5721239031346066e-05,
922
+ "loss": 0.9121,
923
+ "step": 585
924
  },
925
  {
926
+ "epoch": 11.238095238095237,
927
+ "grad_norm": 0.259765625,
928
  "learning_rate": 3.402337085960119e-05,
929
+ "loss": 0.9155,
930
+ "step": 590
931
  },
932
  {
933
+ "epoch": 11.333333333333334,
934
+ "grad_norm": 0.2578125,
935
  "learning_rate": 3.235853489035241e-05,
936
+ "loss": 0.9194,
937
+ "step": 595
938
  },
939
  {
940
+ "epoch": 11.428571428571429,
941
+ "grad_norm": 0.26953125,
942
  "learning_rate": 3.072756464904006e-05,
943
+ "loss": 0.9202,
944
+ "step": 600
945
  },
946
  {
947
+ "epoch": 11.523809523809524,
948
+ "grad_norm": 0.27734375,
949
  "learning_rate": 2.9131276705713006e-05,
950
+ "loss": 0.9186,
951
+ "step": 605
952
  },
953
  {
954
+ "epoch": 11.619047619047619,
955
+ "grad_norm": 0.259765625,
956
  "learning_rate": 2.7570470266200176e-05,
957
+ "loss": 0.9195,
958
+ "step": 610
959
  },
960
  {
961
+ "epoch": 11.714285714285714,
962
+ "grad_norm": 0.2470703125,
963
  "learning_rate": 2.6045926771976303e-05,
964
+ "loss": 0.9223,
965
+ "step": 615
966
  },
967
  {
968
+ "epoch": 11.80952380952381,
969
+ "grad_norm": 0.25,
970
  "learning_rate": 2.4558409508920986e-05,
971
+ "loss": 0.9139,
972
+ "step": 620
973
  },
974
  {
975
+ "epoch": 11.904761904761905,
976
+ "grad_norm": 0.248046875,
977
  "learning_rate": 2.3108663225168435e-05,
978
+ "loss": 0.9117,
979
+ "step": 625
980
  },
981
  {
982
  "epoch": 12.0,
983
+ "grad_norm": 0.263671875,
984
  "learning_rate": 2.1697413758237784e-05,
985
+ "loss": 0.9195,
986
+ "step": 630
987
  },
988
  {
989
+ "epoch": 12.0,
990
+ "eval_loss": 2.714789867401123,
991
+ "eval_runtime": 0.4906,
992
+ "eval_samples_per_second": 42.808,
993
+ "eval_steps_per_second": 2.038,
994
+ "step": 630
995
  },
996
  {
997
+ "epoch": 12.095238095238095,
998
+ "grad_norm": 0.2392578125,
999
  "learning_rate": 2.032536767163141e-05,
1000
+ "loss": 0.9167,
1001
+ "step": 635
1002
  },
1003
  {
1004
+ "epoch": 12.19047619047619,
1005
+ "grad_norm": 0.25,
1006
  "learning_rate": 1.8993211901083353e-05,
1007
+ "loss": 0.9144,
1008
+ "step": 640
1009
  },
1010
  {
1011
+ "epoch": 12.285714285714286,
1012
+ "grad_norm": 0.2578125,
1013
  "learning_rate": 1.7701613410634365e-05,
1014
+ "loss": 0.915,
1015
+ "step": 645
1016
  },
1017
  {
1018
+ "epoch": 12.380952380952381,
1019
+ "grad_norm": 0.248046875,
1020
  "learning_rate": 1.6451218858706374e-05,
1021
+ "loss": 0.9153,
1022
+ "step": 650
1023
  },
1024
  {
1025
+ "epoch": 12.476190476190476,
1026
+ "grad_norm": 0.251953125,
1027
  "learning_rate": 1.5242654274342894e-05,
1028
+ "loss": 0.9133,
1029
+ "step": 655
1030
  },
1031
  {
1032
+ "epoch": 12.571428571428571,
1033
+ "grad_norm": 0.25,
1034
  "learning_rate": 1.4076524743778319e-05,
1035
+ "loss": 0.9134,
1036
+ "step": 660
1037
  },
1038
  {
1039
+ "epoch": 12.666666666666666,
1040
+ "grad_norm": 0.248046875,
1041
  "learning_rate": 1.295341410749208e-05,
1042
+ "loss": 0.9126,
1043
+ "step": 665
1044
  },
1045
  {
1046
+ "epoch": 12.761904761904763,
1047
+ "grad_norm": 0.25,
1048
  "learning_rate": 1.1873884667900125e-05,
1049
+ "loss": 0.9094,
1050
+ "step": 670
1051
  },
1052
  {
1053
+ "epoch": 12.857142857142858,
1054
+ "grad_norm": 0.236328125,
1055
  "learning_rate": 1.083847690782972e-05,
1056
+ "loss": 0.9183,
1057
+ "step": 675
1058
  },
1059
  {
1060
+ "epoch": 12.952380952380953,
1061
+ "grad_norm": 0.255859375,
1062
  "learning_rate": 9.8477092199184e-06,
1063
+ "loss": 0.9212,
1064
+ "step": 680
1065
  },
1066
  {
1067
+ "epoch": 12.99047619047619,
1068
+ "eval_loss": 2.7153878211975098,
1069
+ "eval_runtime": 0.5024,
1070
+ "eval_samples_per_second": 41.796,
1071
+ "eval_steps_per_second": 1.99,
1072
+ "step": 682
1073
  },
1074
  {
1075
+ "epoch": 13.047619047619047,
1076
+ "grad_norm": 0.2431640625,
1077
  "learning_rate": 8.902077647072881e-06,
1078
+ "loss": 0.9139,
1079
+ "step": 685
1080
  },
1081
  {
1082
+ "epoch": 13.142857142857142,
1083
+ "grad_norm": 0.2431640625,
1084
  "learning_rate": 8.002055634117578e-06,
1085
+ "loss": 0.9123,
1086
+ "step": 690
1087
  },
1088
  {
1089
+ "epoch": 13.238095238095237,
1090
+ "grad_norm": 0.2470703125,
1091
  "learning_rate": 7.148093790757371e-06,
1092
+ "loss": 0.9152,
1093
+ "step": 695
1094
  },
1095
  {
1096
+ "epoch": 13.333333333333334,
1097
+ "grad_norm": 0.2578125,
1098
  "learning_rate": 6.3406196659728465e-06,
1099
+ "loss": 0.9087,
1100
+ "step": 700
1101
  },
1102
  {
1103
+ "epoch": 13.428571428571429,
1104
+ "grad_norm": 0.23828125,
1105
  "learning_rate": 5.580037533961546e-06,
1106
+ "loss": 0.9212,
1107
+ "step": 705
1108
  },
1109
  {
1110
+ "epoch": 13.523809523809524,
1111
+ "grad_norm": 0.2373046875,
1112
  "learning_rate": 4.866728191731829e-06,
1113
+ "loss": 0.909,
1114
+ "step": 710
1115
  },
1116
  {
1117
+ "epoch": 13.619047619047619,
1118
+ "grad_norm": 0.2431640625,
1119
  "learning_rate": 4.20104876845111e-06,
1120
+ "loss": 0.9146,
1121
+ "step": 715
1122
  },
1123
  {
1124
+ "epoch": 13.714285714285714,
1125
+ "grad_norm": 0.2431640625,
1126
  "learning_rate": 3.5833325466437694e-06,
1127
+ "loss": 0.9107,
1128
+ "step": 720
1129
  },
1130
  {
1131
+ "epoch": 13.80952380952381,
1132
+ "grad_norm": 0.2451171875,
1133
  "learning_rate": 3.013888795328057e-06,
1134
+ "loss": 0.9136,
1135
+ "step": 725
1136
  },
1137
  {
1138
+ "epoch": 13.904761904761905,
1139
+ "grad_norm": 0.236328125,
1140
  "learning_rate": 2.4930026151759766e-06,
1141
+ "loss": 0.9147,
1142
+ "step": 730
1143
  },
1144
  {
1145
  "epoch": 14.0,
1146
+ "grad_norm": 0.2470703125,
1147
  "learning_rate": 2.0209347957732328e-06,
1148
+ "loss": 0.9136,
1149
+ "step": 735
1150
  },
1151
  {
1152
+ "epoch": 14.0,
1153
+ "eval_loss": 2.7180798053741455,
1154
+ "eval_runtime": 0.4838,
1155
+ "eval_samples_per_second": 43.41,
1156
+ "eval_steps_per_second": 2.067,
1157
+ "step": 735
1158
  },
1159
  {
1160
+ "epoch": 14.095238095238095,
1161
+ "grad_norm": 0.23828125,
1162
  "learning_rate": 1.5979216850509848e-06,
1163
+ "loss": 0.9092,
1164
+ "step": 740
1165
  },
1166
  {
1167
+ "epoch": 14.19047619047619,
1168
+ "grad_norm": 0.2578125,
1169
  "learning_rate": 1.2241750709546917e-06,
1170
+ "loss": 0.9159,
1171
+ "step": 745
1172
  },
1173
  {
1174
+ "epoch": 14.285714285714286,
1175
+ "grad_norm": 0.236328125,
1176
  "learning_rate": 8.998820754091531e-07,
1177
+ "loss": 0.9153,
1178
+ "step": 750
1179
  },
1180
  {
1181
+ "epoch": 14.380952380952381,
1182
+ "grad_norm": 0.244140625,
1183
  "learning_rate": 6.25205060633205e-07,
1184
+ "loss": 0.9114,
1185
+ "step": 755
1186
  },
1187
  {
1188
+ "epoch": 14.476190476190476,
1189
+ "grad_norm": 0.244140625,
1190
  "learning_rate": 4.0028154785050063e-07,
1191
+ "loss": 0.9163,
1192
+ "step": 760
1193
  },
1194
  {
1195
+ "epoch": 14.571428571428571,
1196
+ "grad_norm": 0.236328125,
1197
  "learning_rate": 2.2522414843748618e-07,
1198
+ "loss": 0.9153,
1199
+ "step": 765
1200
  },
1201
  {
1202
+ "epoch": 14.666666666666666,
1203
+ "grad_norm": 0.23828125,
1204
  "learning_rate": 1.0012050754277802e-07,
1205
+ "loss": 0.912,
1206
+ "step": 770
1207
  },
1208
  {
1209
+ "epoch": 14.761904761904763,
1210
+ "grad_norm": 0.25,
1211
  "learning_rate": 2.5033260206275277e-08,
1212
+ "loss": 0.9155,
1213
+ "step": 775
1214
  },
1215
  {
1216
+ "epoch": 14.857142857142858,
1217
+ "grad_norm": 0.240234375,
1218
  "learning_rate": 0.0,
1219
+ "loss": 0.9103,
1220
+ "step": 780
1221
  },
1222
  {
1223
+ "epoch": 14.857142857142858,
1224
+ "eval_loss": 2.71852707862854,
1225
+ "eval_runtime": 0.4854,
1226
+ "eval_samples_per_second": 43.26,
1227
+ "eval_steps_per_second": 2.06,
1228
+ "step": 780
1229
  },
1230
  {
1231
+ "epoch": 14.857142857142858,
1232
+ "step": 780,
1233
+ "total_flos": 1.2277516310308454e+18,
1234
+ "train_loss": 1.077159938445458,
1235
+ "train_runtime": 4175.1629,
1236
+ "train_samples_per_second": 47.883,
1237
+ "train_steps_per_second": 0.187
1238
  }
1239
  ],
1240
  "logging_steps": 5,
1241
+ "max_steps": 780,
1242
  "num_input_tokens_seen": 0,
1243
  "num_train_epochs": 15,
1244
  "save_steps": 100,
1245
+ "stateful_callbacks": {
1246
+ "TrainerControl": {
1247
+ "args": {
1248
+ "should_epoch_stop": false,
1249
+ "should_evaluate": false,
1250
+ "should_log": false,
1251
+ "should_save": true,
1252
+ "should_training_stop": false
1253
+ },
1254
+ "attributes": {}
1255
+ }
1256
+ },
1257
+ "total_flos": 1.2277516310308454e+18,
1258
+ "train_batch_size": 16,
1259
  "trial_name": null,
1260
  "trial_params": null
1261
  }