edlee123 commited on
Commit
c7c9383
1 Parent(s): db190e5

End of training

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ base_model: BridgeTower/bridgetower-large-itm-mlm-itc
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
- - newyorker_caption_contest
8
  model-index:
9
  - name: bridgetower
10
  results: []
@@ -15,7 +15,12 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # bridgetower
17
 
18
- This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the newyorker_caption_contest dataset.
 
 
 
 
 
19
 
20
  ## Model description
21
 
 
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
+ - jmhessel/newyorker_caption_contest
8
  model-index:
9
  - name: bridgetower
10
  results: []
 
15
 
16
  # bridgetower
17
 
18
+ This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the jmhessel/newyorker_caption_contest matching dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 0.1360
21
+ - Memory Allocated (gb): 51.27
22
+ - Max Memory Allocated (gb): 57.18
23
+ - Total Memory Available (gb): 94.62
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.12152472138404846,
4
- "eval_runtime": 3.7753,
5
- "eval_samples_per_second": 141.222,
6
- "eval_steps_per_second": 8.826,
7
- "max_memory_allocated (GB)": 60.52,
8
  "memory_allocated (GB)": 51.27,
9
  "total_flos": 3.0598946525952e+16,
10
  "total_memory_available (GB)": 94.62,
11
- "train_loss": 0.06085505417415074,
12
- "train_runtime": 1020.8061,
13
- "train_samples_per_second": 55.51,
14
- "train_steps_per_second": 1.389
15
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.13596950471401215,
4
+ "eval_runtime": 5.0456,
5
+ "eval_samples_per_second": 129.562,
6
+ "eval_steps_per_second": 8.098,
7
+ "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_flos": 3.0598946525952e+16,
10
  "total_memory_available (GB)": 94.62,
11
+ "train_loss": 0.06072675045655698,
12
+ "train_runtime": 1077.821,
13
+ "train_samples_per_second": 52.682,
14
+ "train_steps_per_second": 1.318
15
  }
runs/Oct15_14-09-47_workload-ai-workshop/events.out.tfevents.1729002622.workload-ai-workshop.5557.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ca34ee03ed6583b409728b4632ba8e40a869d7123617a5ba16c2ae2d6f39cf2
3
+ size 998
test_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.12152472138404846,
4
- "eval_runtime": 3.7753,
5
- "eval_samples_per_second": 141.222,
6
- "eval_steps_per_second": 8.826,
7
- "max_memory_allocated (GB)": 60.52,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.13596950471401215,
4
+ "eval_runtime": 5.0456,
5
+ "eval_samples_per_second": 129.562,
6
+ "eval_steps_per_second": 8.098,
7
+ "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }
tokenizer.json CHANGED
@@ -1,21 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 128,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": {
11
- "Fixed": 128
12
- },
13
- "direction": "Right",
14
- "pad_to_multiple_of": null,
15
- "pad_id": 1,
16
- "pad_type_id": 0,
17
- "pad_token": "<pad>"
18
- },
19
  "added_tokens": [
20
  {
21
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
train_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 5.0,
3
- "max_memory_allocated (GB)": 60.52,
4
  "memory_allocated (GB)": 50.57,
5
  "total_flos": 3.0598946525952e+16,
6
  "total_memory_available (GB)": 94.62,
7
- "train_loss": 0.06085505417415074,
8
- "train_runtime": 1020.8061,
9
- "train_samples_per_second": 55.51,
10
- "train_steps_per_second": 1.389
11
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "max_memory_allocated (GB)": 57.18,
4
  "memory_allocated (GB)": 50.57,
5
  "total_flos": 3.0598946525952e+16,
6
  "total_memory_available (GB)": 94.62,
7
+ "train_loss": 0.06072675045655698,
8
+ "train_runtime": 1077.821,
9
+ "train_samples_per_second": 52.682,
10
+ "train_steps_per_second": 1.318
11
  }
trainer_state.json CHANGED
@@ -10,9 +10,9 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.04081632653061224,
13
- "grad_norm": 9.589848518371582,
14
  "learning_rate": 9.918367346938776e-06,
15
- "loss": 0.2612,
16
  "max_memory_allocated (GB)": 57.18,
17
  "memory_allocated (GB)": 50.57,
18
  "step": 10,
@@ -20,9 +20,9 @@
20
  },
21
  {
22
  "epoch": 0.08163265306122448,
23
- "grad_norm": 6.701302528381348,
24
  "learning_rate": 9.836734693877552e-06,
25
- "loss": 0.154,
26
  "max_memory_allocated (GB)": 57.18,
27
  "memory_allocated (GB)": 50.57,
28
  "step": 20,
@@ -30,9 +30,9 @@
30
  },
31
  {
32
  "epoch": 0.12244897959183673,
33
- "grad_norm": 5.337311267852783,
34
  "learning_rate": 9.755102040816327e-06,
35
- "loss": 0.1235,
36
  "max_memory_allocated (GB)": 57.18,
37
  "memory_allocated (GB)": 50.57,
38
  "step": 30,
@@ -40,9 +40,9 @@
40
  },
41
  {
42
  "epoch": 0.16326530612244897,
43
- "grad_norm": 4.5042338371276855,
44
  "learning_rate": 9.673469387755103e-06,
45
- "loss": 0.1096,
46
  "max_memory_allocated (GB)": 57.18,
47
  "memory_allocated (GB)": 50.57,
48
  "step": 40,
@@ -50,9 +50,9 @@
50
  },
51
  {
52
  "epoch": 0.20408163265306123,
53
- "grad_norm": 4.461822032928467,
54
  "learning_rate": 9.591836734693878e-06,
55
- "loss": 0.1196,
56
  "max_memory_allocated (GB)": 57.18,
57
  "memory_allocated (GB)": 50.57,
58
  "step": 50,
@@ -60,9 +60,9 @@
60
  },
61
  {
62
  "epoch": 0.24489795918367346,
63
- "grad_norm": 2.2825701236724854,
64
  "learning_rate": 9.510204081632653e-06,
65
- "loss": 0.0805,
66
  "max_memory_allocated (GB)": 57.18,
67
  "memory_allocated (GB)": 50.57,
68
  "step": 60,
@@ -70,9 +70,9 @@
70
  },
71
  {
72
  "epoch": 0.2857142857142857,
73
- "grad_norm": 3.725268602371216,
74
  "learning_rate": 9.42857142857143e-06,
75
- "loss": 0.1026,
76
  "max_memory_allocated (GB)": 57.18,
77
  "memory_allocated (GB)": 50.57,
78
  "step": 70,
@@ -80,9 +80,9 @@
80
  },
81
  {
82
  "epoch": 0.32653061224489793,
83
- "grad_norm": 1.707739233970642,
84
  "learning_rate": 9.346938775510204e-06,
85
- "loss": 0.1111,
86
  "max_memory_allocated (GB)": 57.18,
87
  "memory_allocated (GB)": 50.57,
88
  "step": 80,
@@ -90,9 +90,9 @@
90
  },
91
  {
92
  "epoch": 0.3673469387755102,
93
- "grad_norm": 4.5863938331604,
94
  "learning_rate": 9.26530612244898e-06,
95
- "loss": 0.0856,
96
  "max_memory_allocated (GB)": 57.18,
97
  "memory_allocated (GB)": 50.57,
98
  "step": 90,
@@ -100,9 +100,9 @@
100
  },
101
  {
102
  "epoch": 0.40816326530612246,
103
- "grad_norm": 11.972647666931152,
104
  "learning_rate": 9.183673469387756e-06,
105
- "loss": 0.0759,
106
  "max_memory_allocated (GB)": 57.18,
107
  "memory_allocated (GB)": 50.57,
108
  "step": 100,
@@ -110,9 +110,9 @@
110
  },
111
  {
112
  "epoch": 0.4489795918367347,
113
- "grad_norm": 4.550654888153076,
114
  "learning_rate": 9.102040816326532e-06,
115
- "loss": 0.0717,
116
  "max_memory_allocated (GB)": 57.18,
117
  "memory_allocated (GB)": 50.57,
118
  "step": 110,
@@ -120,9 +120,9 @@
120
  },
121
  {
122
  "epoch": 0.4897959183673469,
123
- "grad_norm": 4.418276786804199,
124
  "learning_rate": 9.020408163265307e-06,
125
- "loss": 0.0717,
126
  "max_memory_allocated (GB)": 57.18,
127
  "memory_allocated (GB)": 50.57,
128
  "step": 120,
@@ -130,9 +130,9 @@
130
  },
131
  {
132
  "epoch": 0.5306122448979592,
133
- "grad_norm": 1.651443600654602,
134
  "learning_rate": 8.938775510204082e-06,
135
- "loss": 0.0581,
136
  "max_memory_allocated (GB)": 57.18,
137
  "memory_allocated (GB)": 50.57,
138
  "step": 130,
@@ -140,9 +140,9 @@
140
  },
141
  {
142
  "epoch": 0.5714285714285714,
143
- "grad_norm": 1.5251814126968384,
144
  "learning_rate": 8.857142857142858e-06,
145
- "loss": 0.0481,
146
  "max_memory_allocated (GB)": 57.18,
147
  "memory_allocated (GB)": 50.57,
148
  "step": 140,
@@ -150,9 +150,9 @@
150
  },
151
  {
152
  "epoch": 0.6122448979591837,
153
- "grad_norm": 1.7455183267593384,
154
  "learning_rate": 8.775510204081633e-06,
155
- "loss": 0.0625,
156
  "max_memory_allocated (GB)": 57.18,
157
  "memory_allocated (GB)": 50.57,
158
  "step": 150,
@@ -160,9 +160,9 @@
160
  },
161
  {
162
  "epoch": 0.6530612244897959,
163
- "grad_norm": 1.7588891983032227,
164
  "learning_rate": 8.69387755102041e-06,
165
- "loss": 0.0711,
166
  "max_memory_allocated (GB)": 57.18,
167
  "memory_allocated (GB)": 50.57,
168
  "step": 160,
@@ -170,9 +170,9 @@
170
  },
171
  {
172
  "epoch": 0.6938775510204082,
173
- "grad_norm": 2.7675328254699707,
174
  "learning_rate": 8.612244897959184e-06,
175
- "loss": 0.0747,
176
  "max_memory_allocated (GB)": 57.18,
177
  "memory_allocated (GB)": 50.57,
178
  "step": 170,
@@ -180,9 +180,9 @@
180
  },
181
  {
182
  "epoch": 0.7346938775510204,
183
- "grad_norm": 1.781469464302063,
184
  "learning_rate": 8.530612244897961e-06,
185
- "loss": 0.061,
186
  "max_memory_allocated (GB)": 57.18,
187
  "memory_allocated (GB)": 50.57,
188
  "step": 180,
@@ -190,9 +190,9 @@
190
  },
191
  {
192
  "epoch": 0.7755102040816326,
193
- "grad_norm": 2.3728435039520264,
194
  "learning_rate": 8.448979591836736e-06,
195
- "loss": 0.0588,
196
  "max_memory_allocated (GB)": 57.18,
197
  "memory_allocated (GB)": 50.57,
198
  "step": 190,
@@ -200,9 +200,9 @@
200
  },
201
  {
202
  "epoch": 0.8163265306122449,
203
- "grad_norm": 0.8711996674537659,
204
  "learning_rate": 8.36734693877551e-06,
205
- "loss": 0.062,
206
  "max_memory_allocated (GB)": 57.18,
207
  "memory_allocated (GB)": 50.57,
208
  "step": 200,
@@ -210,9 +210,9 @@
210
  },
211
  {
212
  "epoch": 0.8571428571428571,
213
- "grad_norm": 1.1986733675003052,
214
  "learning_rate": 8.285714285714287e-06,
215
- "loss": 0.0627,
216
  "max_memory_allocated (GB)": 57.18,
217
  "memory_allocated (GB)": 50.57,
218
  "step": 210,
@@ -220,9 +220,9 @@
220
  },
221
  {
222
  "epoch": 0.8979591836734694,
223
- "grad_norm": 2.8968520164489746,
224
  "learning_rate": 8.204081632653062e-06,
225
- "loss": 0.0604,
226
  "max_memory_allocated (GB)": 57.18,
227
  "memory_allocated (GB)": 50.57,
228
  "step": 220,
@@ -230,9 +230,9 @@
230
  },
231
  {
232
  "epoch": 0.9387755102040817,
233
- "grad_norm": 0.8414793610572815,
234
  "learning_rate": 8.122448979591837e-06,
235
- "loss": 0.0559,
236
  "max_memory_allocated (GB)": 57.18,
237
  "memory_allocated (GB)": 50.57,
238
  "step": 230,
@@ -240,9 +240,9 @@
240
  },
241
  {
242
  "epoch": 0.9795918367346939,
243
- "grad_norm": 0.7434167861938477,
244
  "learning_rate": 8.040816326530613e-06,
245
- "loss": 0.0498,
246
  "max_memory_allocated (GB)": 57.18,
247
  "memory_allocated (GB)": 50.57,
248
  "step": 240,
@@ -250,9 +250,9 @@
250
  },
251
  {
252
  "epoch": 1.0204081632653061,
253
- "grad_norm": 0.8703041076660156,
254
  "learning_rate": 7.959183673469388e-06,
255
- "loss": 0.0618,
256
  "max_memory_allocated (GB)": 57.18,
257
  "memory_allocated (GB)": 50.57,
258
  "step": 250,
@@ -260,9 +260,9 @@
260
  },
261
  {
262
  "epoch": 1.0612244897959184,
263
- "grad_norm": 1.0856379270553589,
264
  "learning_rate": 7.877551020408164e-06,
265
- "loss": 0.056,
266
  "max_memory_allocated (GB)": 57.18,
267
  "memory_allocated (GB)": 50.57,
268
  "step": 260,
@@ -270,9 +270,9 @@
270
  },
271
  {
272
  "epoch": 1.1020408163265305,
273
- "grad_norm": 0.8847401142120361,
274
  "learning_rate": 7.79591836734694e-06,
275
- "loss": 0.0625,
276
  "max_memory_allocated (GB)": 57.18,
277
  "memory_allocated (GB)": 50.57,
278
  "step": 270,
@@ -280,9 +280,9 @@
280
  },
281
  {
282
  "epoch": 1.1428571428571428,
283
- "grad_norm": 1.5929882526397705,
284
  "learning_rate": 7.714285714285716e-06,
285
- "loss": 0.0571,
286
  "max_memory_allocated (GB)": 57.18,
287
  "memory_allocated (GB)": 50.57,
288
  "step": 280,
@@ -290,9 +290,9 @@
290
  },
291
  {
292
  "epoch": 1.183673469387755,
293
- "grad_norm": 0.8007532954216003,
294
  "learning_rate": 7.63265306122449e-06,
295
- "loss": 0.0511,
296
  "max_memory_allocated (GB)": 57.18,
297
  "memory_allocated (GB)": 50.57,
298
  "step": 290,
@@ -300,9 +300,9 @@
300
  },
301
  {
302
  "epoch": 1.2244897959183674,
303
- "grad_norm": 1.2002859115600586,
304
  "learning_rate": 7.551020408163265e-06,
305
- "loss": 0.065,
306
  "max_memory_allocated (GB)": 57.18,
307
  "memory_allocated (GB)": 50.57,
308
  "step": 300,
@@ -310,7 +310,7 @@
310
  },
311
  {
312
  "epoch": 1.2653061224489797,
313
- "grad_norm": 12.871713638305664,
314
  "learning_rate": 7.469387755102041e-06,
315
  "loss": 0.0664,
316
  "max_memory_allocated (GB)": 57.18,
@@ -320,9 +320,9 @@
320
  },
321
  {
322
  "epoch": 1.306122448979592,
323
- "grad_norm": 2.46173357963562,
324
  "learning_rate": 7.387755102040817e-06,
325
- "loss": 0.0495,
326
  "max_memory_allocated (GB)": 57.18,
327
  "memory_allocated (GB)": 50.57,
328
  "step": 320,
@@ -330,9 +330,9 @@
330
  },
331
  {
332
  "epoch": 1.346938775510204,
333
- "grad_norm": 0.860598087310791,
334
  "learning_rate": 7.306122448979592e-06,
335
- "loss": 0.0603,
336
  "max_memory_allocated (GB)": 57.18,
337
  "memory_allocated (GB)": 50.57,
338
  "step": 330,
@@ -340,9 +340,9 @@
340
  },
341
  {
342
  "epoch": 1.3877551020408163,
343
- "grad_norm": 2.5583598613739014,
344
  "learning_rate": 7.224489795918368e-06,
345
- "loss": 0.0547,
346
  "max_memory_allocated (GB)": 57.18,
347
  "memory_allocated (GB)": 50.57,
348
  "step": 340,
@@ -350,9 +350,9 @@
350
  },
351
  {
352
  "epoch": 1.4285714285714286,
353
- "grad_norm": 0.37155964970588684,
354
  "learning_rate": 7.1428571428571436e-06,
355
- "loss": 0.048,
356
  "max_memory_allocated (GB)": 57.18,
357
  "memory_allocated (GB)": 50.57,
358
  "step": 350,
@@ -360,9 +360,9 @@
360
  },
361
  {
362
  "epoch": 1.469387755102041,
363
- "grad_norm": 1.808316707611084,
364
  "learning_rate": 7.061224489795919e-06,
365
- "loss": 0.0462,
366
  "max_memory_allocated (GB)": 57.18,
367
  "memory_allocated (GB)": 50.57,
368
  "step": 360,
@@ -370,9 +370,9 @@
370
  },
371
  {
372
  "epoch": 1.510204081632653,
373
- "grad_norm": 1.0183931589126587,
374
  "learning_rate": 6.979591836734695e-06,
375
- "loss": 0.0594,
376
  "max_memory_allocated (GB)": 57.18,
377
  "memory_allocated (GB)": 50.57,
378
  "step": 370,
@@ -380,9 +380,9 @@
380
  },
381
  {
382
  "epoch": 1.5510204081632653,
383
- "grad_norm": 0.5249583721160889,
384
  "learning_rate": 6.8979591836734705e-06,
385
- "loss": 0.0479,
386
  "max_memory_allocated (GB)": 57.18,
387
  "memory_allocated (GB)": 50.57,
388
  "step": 380,
@@ -390,9 +390,9 @@
390
  },
391
  {
392
  "epoch": 1.5918367346938775,
393
- "grad_norm": 1.1005572080612183,
394
  "learning_rate": 6.816326530612245e-06,
395
- "loss": 0.0649,
396
  "max_memory_allocated (GB)": 57.18,
397
  "memory_allocated (GB)": 50.57,
398
  "step": 390,
@@ -400,9 +400,9 @@
400
  },
401
  {
402
  "epoch": 1.6326530612244898,
403
- "grad_norm": 0.6047573089599609,
404
  "learning_rate": 6.734693877551021e-06,
405
- "loss": 0.0607,
406
  "max_memory_allocated (GB)": 57.18,
407
  "memory_allocated (GB)": 50.57,
408
  "step": 400,
@@ -410,9 +410,9 @@
410
  },
411
  {
412
  "epoch": 1.6734693877551021,
413
- "grad_norm": 0.7261654734611511,
414
  "learning_rate": 6.653061224489797e-06,
415
- "loss": 0.0606,
416
  "max_memory_allocated (GB)": 57.18,
417
  "memory_allocated (GB)": 50.57,
418
  "step": 410,
@@ -420,9 +420,9 @@
420
  },
421
  {
422
  "epoch": 1.7142857142857144,
423
- "grad_norm": 0.848527193069458,
424
  "learning_rate": 6.571428571428572e-06,
425
- "loss": 0.0532,
426
  "max_memory_allocated (GB)": 57.18,
427
  "memory_allocated (GB)": 50.57,
428
  "step": 420,
@@ -430,9 +430,9 @@
430
  },
431
  {
432
  "epoch": 1.7551020408163265,
433
- "grad_norm": 0.23483288288116455,
434
  "learning_rate": 6.489795918367348e-06,
435
- "loss": 0.068,
436
  "max_memory_allocated (GB)": 57.18,
437
  "memory_allocated (GB)": 50.57,
438
  "step": 430,
@@ -440,9 +440,9 @@
440
  },
441
  {
442
  "epoch": 1.7959183673469388,
443
- "grad_norm": 2.0767459869384766,
444
  "learning_rate": 6.408163265306124e-06,
445
- "loss": 0.0617,
446
  "max_memory_allocated (GB)": 57.18,
447
  "memory_allocated (GB)": 50.57,
448
  "step": 440,
@@ -450,9 +450,9 @@
450
  },
451
  {
452
  "epoch": 1.836734693877551,
453
- "grad_norm": 0.5654011368751526,
454
  "learning_rate": 6.326530612244899e-06,
455
- "loss": 0.044,
456
  "max_memory_allocated (GB)": 57.18,
457
  "memory_allocated (GB)": 50.57,
458
  "step": 450,
@@ -460,9 +460,9 @@
460
  },
461
  {
462
  "epoch": 1.8775510204081631,
463
- "grad_norm": 0.7382919788360596,
464
  "learning_rate": 6.244897959183675e-06,
465
- "loss": 0.0537,
466
  "max_memory_allocated (GB)": 57.18,
467
  "memory_allocated (GB)": 50.57,
468
  "step": 460,
@@ -470,9 +470,9 @@
470
  },
471
  {
472
  "epoch": 1.9183673469387754,
473
- "grad_norm": 1.3547204732894897,
474
  "learning_rate": 6.163265306122449e-06,
475
- "loss": 0.0432,
476
  "max_memory_allocated (GB)": 57.18,
477
  "memory_allocated (GB)": 50.57,
478
  "step": 470,
@@ -480,9 +480,9 @@
480
  },
481
  {
482
  "epoch": 1.9591836734693877,
483
- "grad_norm": 0.19681082665920258,
484
  "learning_rate": 6.0816326530612245e-06,
485
- "loss": 0.0498,
486
  "max_memory_allocated (GB)": 57.18,
487
  "memory_allocated (GB)": 50.57,
488
  "step": 480,
@@ -490,9 +490,9 @@
490
  },
491
  {
492
  "epoch": 2.0,
493
- "grad_norm": 1.109737515449524,
494
  "learning_rate": 6e-06,
495
- "loss": 0.0639,
496
  "max_memory_allocated (GB)": 57.18,
497
  "memory_allocated (GB)": 50.57,
498
  "step": 490,
@@ -500,9 +500,9 @@
500
  },
501
  {
502
  "epoch": 2.0408163265306123,
503
- "grad_norm": 0.5894625782966614,
504
  "learning_rate": 5.918367346938776e-06,
505
- "loss": 0.0593,
506
  "max_memory_allocated (GB)": 57.18,
507
  "memory_allocated (GB)": 50.57,
508
  "step": 500,
@@ -510,9 +510,9 @@
510
  },
511
  {
512
  "epoch": 2.0816326530612246,
513
- "grad_norm": 0.7122555375099182,
514
  "learning_rate": 5.8367346938775515e-06,
515
- "loss": 0.0498,
516
  "max_memory_allocated (GB)": 57.18,
517
  "memory_allocated (GB)": 50.57,
518
  "step": 510,
@@ -520,9 +520,9 @@
520
  },
521
  {
522
  "epoch": 2.122448979591837,
523
- "grad_norm": 0.8958902955055237,
524
  "learning_rate": 5.755102040816327e-06,
525
- "loss": 0.0457,
526
  "max_memory_allocated (GB)": 57.18,
527
  "memory_allocated (GB)": 50.57,
528
  "step": 520,
@@ -530,7 +530,7 @@
530
  },
531
  {
532
  "epoch": 2.163265306122449,
533
- "grad_norm": 11.620415687561035,
534
  "learning_rate": 5.673469387755103e-06,
535
  "loss": 0.0626,
536
  "max_memory_allocated (GB)": 57.18,
@@ -540,9 +540,9 @@
540
  },
541
  {
542
  "epoch": 2.204081632653061,
543
- "grad_norm": 0.3538230061531067,
544
  "learning_rate": 5.591836734693878e-06,
545
- "loss": 0.0584,
546
  "max_memory_allocated (GB)": 57.18,
547
  "memory_allocated (GB)": 50.57,
548
  "step": 540,
@@ -550,9 +550,9 @@
550
  },
551
  {
552
  "epoch": 2.2448979591836733,
553
- "grad_norm": 1.5313146114349365,
554
  "learning_rate": 5.510204081632653e-06,
555
- "loss": 0.0627,
556
  "max_memory_allocated (GB)": 57.18,
557
  "memory_allocated (GB)": 50.57,
558
  "step": 550,
@@ -560,9 +560,9 @@
560
  },
561
  {
562
  "epoch": 2.2857142857142856,
563
- "grad_norm": 1.3519809246063232,
564
  "learning_rate": 5.428571428571429e-06,
565
- "loss": 0.0572,
566
  "max_memory_allocated (GB)": 57.18,
567
  "memory_allocated (GB)": 50.57,
568
  "step": 560,
@@ -570,9 +570,9 @@
570
  },
571
  {
572
  "epoch": 2.326530612244898,
573
- "grad_norm": 1.0263270139694214,
574
  "learning_rate": 5.3469387755102045e-06,
575
- "loss": 0.0585,
576
  "max_memory_allocated (GB)": 57.18,
577
  "memory_allocated (GB)": 50.57,
578
  "step": 570,
@@ -580,9 +580,9 @@
580
  },
581
  {
582
  "epoch": 2.36734693877551,
583
- "grad_norm": 0.8926671147346497,
584
  "learning_rate": 5.26530612244898e-06,
585
- "loss": 0.0673,
586
  "max_memory_allocated (GB)": 57.18,
587
  "memory_allocated (GB)": 50.57,
588
  "step": 580,
@@ -590,9 +590,9 @@
590
  },
591
  {
592
  "epoch": 2.4081632653061225,
593
- "grad_norm": 0.3185974955558777,
594
  "learning_rate": 5.183673469387756e-06,
595
- "loss": 0.0537,
596
  "max_memory_allocated (GB)": 57.18,
597
  "memory_allocated (GB)": 50.57,
598
  "step": 590,
@@ -600,9 +600,9 @@
600
  },
601
  {
602
  "epoch": 2.4489795918367347,
603
- "grad_norm": 0.944624662399292,
604
  "learning_rate": 5.1020408163265315e-06,
605
- "loss": 0.0442,
606
  "max_memory_allocated (GB)": 57.18,
607
  "memory_allocated (GB)": 50.57,
608
  "step": 600,
@@ -610,9 +610,9 @@
610
  },
611
  {
612
  "epoch": 2.489795918367347,
613
- "grad_norm": 0.32796111702919006,
614
  "learning_rate": 5.020408163265307e-06,
615
- "loss": 0.0413,
616
  "max_memory_allocated (GB)": 57.18,
617
  "memory_allocated (GB)": 50.57,
618
  "step": 610,
@@ -620,9 +620,9 @@
620
  },
621
  {
622
  "epoch": 2.5306122448979593,
623
- "grad_norm": 0.7929801940917969,
624
  "learning_rate": 4.938775510204082e-06,
625
- "loss": 0.0428,
626
  "max_memory_allocated (GB)": 57.18,
627
  "memory_allocated (GB)": 50.57,
628
  "step": 620,
@@ -630,9 +630,9 @@
630
  },
631
  {
632
  "epoch": 2.571428571428571,
633
- "grad_norm": 0.910254955291748,
634
  "learning_rate": 4.857142857142858e-06,
635
- "loss": 0.0813,
636
  "max_memory_allocated (GB)": 57.18,
637
  "memory_allocated (GB)": 50.57,
638
  "step": 630,
@@ -640,9 +640,9 @@
640
  },
641
  {
642
  "epoch": 2.612244897959184,
643
- "grad_norm": 1.101942539215088,
644
  "learning_rate": 4.775510204081633e-06,
645
- "loss": 0.0495,
646
  "max_memory_allocated (GB)": 57.18,
647
  "memory_allocated (GB)": 50.57,
648
  "step": 640,
@@ -650,9 +650,9 @@
650
  },
651
  {
652
  "epoch": 2.6530612244897958,
653
- "grad_norm": 0.7182526588439941,
654
  "learning_rate": 4.693877551020409e-06,
655
- "loss": 0.0471,
656
  "max_memory_allocated (GB)": 57.18,
657
  "memory_allocated (GB)": 50.57,
658
  "step": 650,
@@ -660,9 +660,9 @@
660
  },
661
  {
662
  "epoch": 2.693877551020408,
663
- "grad_norm": 0.8068158626556396,
664
  "learning_rate": 4.612244897959184e-06,
665
- "loss": 0.0469,
666
  "max_memory_allocated (GB)": 57.18,
667
  "memory_allocated (GB)": 50.57,
668
  "step": 660,
@@ -670,9 +670,9 @@
670
  },
671
  {
672
  "epoch": 2.7346938775510203,
673
- "grad_norm": 1.2375913858413696,
674
  "learning_rate": 4.530612244897959e-06,
675
- "loss": 0.0857,
676
  "max_memory_allocated (GB)": 57.18,
677
  "memory_allocated (GB)": 50.57,
678
  "step": 670,
@@ -680,9 +680,9 @@
680
  },
681
  {
682
  "epoch": 2.7755102040816326,
683
- "grad_norm": 1.1524357795715332,
684
  "learning_rate": 4.448979591836735e-06,
685
- "loss": 0.0488,
686
  "max_memory_allocated (GB)": 57.18,
687
  "memory_allocated (GB)": 50.57,
688
  "step": 680,
@@ -690,9 +690,9 @@
690
  },
691
  {
692
  "epoch": 2.816326530612245,
693
- "grad_norm": 0.3913586437702179,
694
  "learning_rate": 4.367346938775511e-06,
695
- "loss": 0.0451,
696
  "max_memory_allocated (GB)": 57.18,
697
  "memory_allocated (GB)": 50.57,
698
  "step": 690,
@@ -700,9 +700,9 @@
700
  },
701
  {
702
  "epoch": 2.857142857142857,
703
- "grad_norm": 0.47935113310813904,
704
  "learning_rate": 4.2857142857142855e-06,
705
- "loss": 0.0433,
706
  "max_memory_allocated (GB)": 57.18,
707
  "memory_allocated (GB)": 50.57,
708
  "step": 700,
@@ -710,9 +710,9 @@
710
  },
711
  {
712
  "epoch": 2.8979591836734695,
713
- "grad_norm": 0.8084143996238708,
714
  "learning_rate": 4.204081632653061e-06,
715
- "loss": 0.0548,
716
  "max_memory_allocated (GB)": 57.18,
717
  "memory_allocated (GB)": 50.57,
718
  "step": 710,
@@ -720,9 +720,9 @@
720
  },
721
  {
722
  "epoch": 2.938775510204082,
723
- "grad_norm": 1.7315497398376465,
724
  "learning_rate": 4.122448979591837e-06,
725
- "loss": 0.0587,
726
  "max_memory_allocated (GB)": 57.18,
727
  "memory_allocated (GB)": 50.57,
728
  "step": 720,
@@ -730,9 +730,9 @@
730
  },
731
  {
732
  "epoch": 2.979591836734694,
733
- "grad_norm": 0.20743349194526672,
734
  "learning_rate": 4.040816326530612e-06,
735
- "loss": 0.0342,
736
  "max_memory_allocated (GB)": 57.18,
737
  "memory_allocated (GB)": 50.57,
738
  "step": 730,
@@ -740,7 +740,7 @@
740
  },
741
  {
742
  "epoch": 3.020408163265306,
743
- "grad_norm": 0.8024761080741882,
744
  "learning_rate": 3.959183673469388e-06,
745
  "loss": 0.053,
746
  "max_memory_allocated (GB)": 57.18,
@@ -750,9 +750,9 @@
750
  },
751
  {
752
  "epoch": 3.061224489795918,
753
- "grad_norm": 0.45326006412506104,
754
  "learning_rate": 3.877551020408164e-06,
755
- "loss": 0.0619,
756
  "max_memory_allocated (GB)": 57.18,
757
  "memory_allocated (GB)": 50.57,
758
  "step": 750,
@@ -760,7 +760,7 @@
760
  },
761
  {
762
  "epoch": 3.1020408163265305,
763
- "grad_norm": 0.6953087449073792,
764
  "learning_rate": 3.795918367346939e-06,
765
  "loss": 0.0527,
766
  "max_memory_allocated (GB)": 57.18,
@@ -770,9 +770,9 @@
770
  },
771
  {
772
  "epoch": 3.142857142857143,
773
- "grad_norm": 1.2290390729904175,
774
  "learning_rate": 3.7142857142857146e-06,
775
- "loss": 0.0689,
776
  "max_memory_allocated (GB)": 57.18,
777
  "memory_allocated (GB)": 50.57,
778
  "step": 770,
@@ -780,9 +780,9 @@
780
  },
781
  {
782
  "epoch": 3.183673469387755,
783
- "grad_norm": 0.6281890869140625,
784
  "learning_rate": 3.6326530612244903e-06,
785
- "loss": 0.0647,
786
  "max_memory_allocated (GB)": 57.18,
787
  "memory_allocated (GB)": 50.57,
788
  "step": 780,
@@ -790,9 +790,9 @@
790
  },
791
  {
792
  "epoch": 3.2244897959183674,
793
- "grad_norm": 0.3096281588077545,
794
  "learning_rate": 3.5510204081632655e-06,
795
- "loss": 0.0522,
796
  "max_memory_allocated (GB)": 57.18,
797
  "memory_allocated (GB)": 50.57,
798
  "step": 790,
@@ -800,9 +800,9 @@
800
  },
801
  {
802
  "epoch": 3.2653061224489797,
803
- "grad_norm": 0.9390127062797546,
804
  "learning_rate": 3.469387755102041e-06,
805
- "loss": 0.0432,
806
  "max_memory_allocated (GB)": 57.18,
807
  "memory_allocated (GB)": 50.57,
808
  "step": 800,
@@ -810,9 +810,9 @@
810
  },
811
  {
812
  "epoch": 3.306122448979592,
813
- "grad_norm": 0.87565016746521,
814
  "learning_rate": 3.3877551020408168e-06,
815
- "loss": 0.0555,
816
  "max_memory_allocated (GB)": 57.18,
817
  "memory_allocated (GB)": 50.57,
818
  "step": 810,
@@ -820,9 +820,9 @@
820
  },
821
  {
822
  "epoch": 3.3469387755102042,
823
- "grad_norm": 1.0797837972640991,
824
  "learning_rate": 3.3061224489795924e-06,
825
- "loss": 0.0455,
826
  "max_memory_allocated (GB)": 57.18,
827
  "memory_allocated (GB)": 50.57,
828
  "step": 820,
@@ -830,9 +830,9 @@
830
  },
831
  {
832
  "epoch": 3.387755102040816,
833
- "grad_norm": 0.3658354878425598,
834
  "learning_rate": 3.2244897959183672e-06,
835
- "loss": 0.0487,
836
  "max_memory_allocated (GB)": 57.18,
837
  "memory_allocated (GB)": 50.57,
838
  "step": 830,
@@ -840,9 +840,9 @@
840
  },
841
  {
842
  "epoch": 3.4285714285714284,
843
- "grad_norm": 0.4766336977481842,
844
  "learning_rate": 3.142857142857143e-06,
845
- "loss": 0.053,
846
  "max_memory_allocated (GB)": 57.18,
847
  "memory_allocated (GB)": 50.57,
848
  "step": 840,
@@ -850,9 +850,9 @@
850
  },
851
  {
852
  "epoch": 3.4693877551020407,
853
- "grad_norm": 0.49318933486938477,
854
  "learning_rate": 3.0612244897959185e-06,
855
- "loss": 0.0812,
856
  "max_memory_allocated (GB)": 57.18,
857
  "memory_allocated (GB)": 50.57,
858
  "step": 850,
@@ -860,7 +860,7 @@
860
  },
861
  {
862
  "epoch": 3.510204081632653,
863
- "grad_norm": 1.3475311994552612,
864
  "learning_rate": 2.979591836734694e-06,
865
  "loss": 0.0451,
866
  "max_memory_allocated (GB)": 57.18,
@@ -870,9 +870,9 @@
870
  },
871
  {
872
  "epoch": 3.5510204081632653,
873
- "grad_norm": 0.36763882637023926,
874
  "learning_rate": 2.8979591836734694e-06,
875
- "loss": 0.0646,
876
  "max_memory_allocated (GB)": 57.18,
877
  "memory_allocated (GB)": 50.57,
878
  "step": 870,
@@ -880,9 +880,9 @@
880
  },
881
  {
882
  "epoch": 3.5918367346938775,
883
- "grad_norm": 3.085198402404785,
884
  "learning_rate": 2.816326530612245e-06,
885
- "loss": 0.0439,
886
  "max_memory_allocated (GB)": 57.18,
887
  "memory_allocated (GB)": 50.57,
888
  "step": 880,
@@ -890,9 +890,9 @@
890
  },
891
  {
892
  "epoch": 3.63265306122449,
893
- "grad_norm": 0.17229312658309937,
894
  "learning_rate": 2.7346938775510207e-06,
895
- "loss": 0.0288,
896
  "max_memory_allocated (GB)": 57.18,
897
  "memory_allocated (GB)": 50.57,
898
  "step": 890,
@@ -900,9 +900,9 @@
900
  },
901
  {
902
  "epoch": 3.673469387755102,
903
- "grad_norm": 1.0760900974273682,
904
  "learning_rate": 2.6530612244897964e-06,
905
- "loss": 0.0514,
906
  "max_memory_allocated (GB)": 57.18,
907
  "memory_allocated (GB)": 50.57,
908
  "step": 900,
@@ -910,9 +910,9 @@
910
  },
911
  {
912
  "epoch": 3.7142857142857144,
913
- "grad_norm": 0.45855164527893066,
914
  "learning_rate": 2.571428571428571e-06,
915
- "loss": 0.0602,
916
  "max_memory_allocated (GB)": 57.18,
917
  "memory_allocated (GB)": 50.57,
918
  "step": 910,
@@ -920,9 +920,9 @@
920
  },
921
  {
922
  "epoch": 3.7551020408163263,
923
- "grad_norm": 0.15575875341892242,
924
  "learning_rate": 2.489795918367347e-06,
925
- "loss": 0.0543,
926
  "max_memory_allocated (GB)": 57.18,
927
  "memory_allocated (GB)": 50.57,
928
  "step": 920,
@@ -930,9 +930,9 @@
930
  },
931
  {
932
  "epoch": 3.795918367346939,
933
- "grad_norm": 0.779755175113678,
934
  "learning_rate": 2.4081632653061225e-06,
935
- "loss": 0.0497,
936
  "max_memory_allocated (GB)": 57.18,
937
  "memory_allocated (GB)": 50.57,
938
  "step": 930,
@@ -940,9 +940,9 @@
940
  },
941
  {
942
  "epoch": 3.836734693877551,
943
- "grad_norm": 0.7307060956954956,
944
  "learning_rate": 2.326530612244898e-06,
945
- "loss": 0.0486,
946
  "max_memory_allocated (GB)": 57.18,
947
  "memory_allocated (GB)": 50.57,
948
  "step": 940,
@@ -950,9 +950,9 @@
950
  },
951
  {
952
  "epoch": 3.877551020408163,
953
- "grad_norm": 1.062565803527832,
954
  "learning_rate": 2.244897959183674e-06,
955
- "loss": 0.0594,
956
  "max_memory_allocated (GB)": 57.18,
957
  "memory_allocated (GB)": 50.57,
958
  "step": 950,
@@ -960,9 +960,9 @@
960
  },
961
  {
962
  "epoch": 3.9183673469387754,
963
- "grad_norm": 0.3031039535999298,
964
  "learning_rate": 2.1632653061224495e-06,
965
- "loss": 0.0497,
966
  "max_memory_allocated (GB)": 57.18,
967
  "memory_allocated (GB)": 50.57,
968
  "step": 960,
@@ -970,9 +970,9 @@
970
  },
971
  {
972
  "epoch": 3.9591836734693877,
973
- "grad_norm": 2.310593843460083,
974
  "learning_rate": 2.0816326530612247e-06,
975
- "loss": 0.0746,
976
  "max_memory_allocated (GB)": 57.18,
977
  "memory_allocated (GB)": 50.57,
978
  "step": 970,
@@ -980,265 +980,265 @@
980
  },
981
  {
982
  "epoch": 4.0,
983
- "grad_norm": 0.6998704075813293,
984
  "learning_rate": 2.0000000000000003e-06,
985
- "loss": 0.0703,
986
- "max_memory_allocated (GB)": 60.52,
987
  "memory_allocated (GB)": 50.57,
988
  "step": 980,
989
  "total_memory_available (GB)": 94.62
990
  },
991
  {
992
  "epoch": 4.040816326530612,
993
- "grad_norm": 0.7492395639419556,
994
  "learning_rate": 1.9183673469387756e-06,
995
- "loss": 0.0486,
996
- "max_memory_allocated (GB)": 60.52,
997
  "memory_allocated (GB)": 50.57,
998
  "step": 990,
999
  "total_memory_available (GB)": 94.62
1000
  },
1001
  {
1002
  "epoch": 4.081632653061225,
1003
- "grad_norm": 0.7633445858955383,
1004
  "learning_rate": 1.8367346938775512e-06,
1005
- "loss": 0.0625,
1006
- "max_memory_allocated (GB)": 60.52,
1007
  "memory_allocated (GB)": 50.57,
1008
  "step": 1000,
1009
  "total_memory_available (GB)": 94.62
1010
  },
1011
  {
1012
  "epoch": 4.122448979591836,
1013
- "grad_norm": 0.6911561489105225,
1014
  "learning_rate": 1.7551020408163267e-06,
1015
- "loss": 0.0632,
1016
- "max_memory_allocated (GB)": 60.52,
1017
  "memory_allocated (GB)": 50.57,
1018
  "step": 1010,
1019
  "total_memory_available (GB)": 94.62
1020
  },
1021
  {
1022
  "epoch": 4.163265306122449,
1023
- "grad_norm": 0.33521902561187744,
1024
  "learning_rate": 1.6734693877551023e-06,
1025
- "loss": 0.0406,
1026
- "max_memory_allocated (GB)": 60.52,
1027
  "memory_allocated (GB)": 50.57,
1028
  "step": 1020,
1029
  "total_memory_available (GB)": 94.62
1030
  },
1031
  {
1032
  "epoch": 4.204081632653061,
1033
- "grad_norm": 0.7509037852287292,
1034
  "learning_rate": 1.5918367346938775e-06,
1035
- "loss": 0.0531,
1036
- "max_memory_allocated (GB)": 60.52,
1037
  "memory_allocated (GB)": 50.57,
1038
  "step": 1030,
1039
  "total_memory_available (GB)": 94.62
1040
  },
1041
  {
1042
  "epoch": 4.244897959183674,
1043
- "grad_norm": 0.5234070420265198,
1044
  "learning_rate": 1.5102040816326532e-06,
1045
- "loss": 0.0396,
1046
- "max_memory_allocated (GB)": 60.52,
1047
  "memory_allocated (GB)": 50.57,
1048
  "step": 1040,
1049
  "total_memory_available (GB)": 94.62
1050
  },
1051
  {
1052
  "epoch": 4.285714285714286,
1053
- "grad_norm": 0.7997304797172546,
1054
  "learning_rate": 1.4285714285714286e-06,
1055
- "loss": 0.05,
1056
- "max_memory_allocated (GB)": 60.52,
1057
  "memory_allocated (GB)": 50.57,
1058
  "step": 1050,
1059
  "total_memory_available (GB)": 94.62
1060
  },
1061
  {
1062
  "epoch": 4.326530612244898,
1063
- "grad_norm": 0.2255077213048935,
1064
  "learning_rate": 1.3469387755102043e-06,
1065
- "loss": 0.0457,
1066
- "max_memory_allocated (GB)": 60.52,
1067
  "memory_allocated (GB)": 50.57,
1068
  "step": 1060,
1069
  "total_memory_available (GB)": 94.62
1070
  },
1071
  {
1072
  "epoch": 4.36734693877551,
1073
- "grad_norm": 0.5182124376296997,
1074
  "learning_rate": 1.2653061224489795e-06,
1075
- "loss": 0.0485,
1076
- "max_memory_allocated (GB)": 60.52,
1077
  "memory_allocated (GB)": 50.57,
1078
  "step": 1070,
1079
  "total_memory_available (GB)": 94.62
1080
  },
1081
  {
1082
  "epoch": 4.408163265306122,
1083
- "grad_norm": 0.35046374797821045,
1084
  "learning_rate": 1.1836734693877552e-06,
1085
- "loss": 0.0519,
1086
- "max_memory_allocated (GB)": 60.52,
1087
  "memory_allocated (GB)": 50.57,
1088
  "step": 1080,
1089
  "total_memory_available (GB)": 94.62
1090
  },
1091
  {
1092
  "epoch": 4.448979591836735,
1093
- "grad_norm": 0.3923434615135193,
1094
  "learning_rate": 1.1020408163265308e-06,
1095
- "loss": 0.0507,
1096
- "max_memory_allocated (GB)": 60.52,
1097
  "memory_allocated (GB)": 50.57,
1098
  "step": 1090,
1099
  "total_memory_available (GB)": 94.62
1100
  },
1101
  {
1102
  "epoch": 4.489795918367347,
1103
- "grad_norm": 0.23866137862205505,
1104
  "learning_rate": 1.0204081632653063e-06,
1105
- "loss": 0.0362,
1106
- "max_memory_allocated (GB)": 60.52,
1107
  "memory_allocated (GB)": 50.57,
1108
  "step": 1100,
1109
  "total_memory_available (GB)": 94.62
1110
  },
1111
  {
1112
  "epoch": 4.530612244897959,
1113
- "grad_norm": 0.15117916464805603,
1114
  "learning_rate": 9.387755102040817e-07,
1115
- "loss": 0.0464,
1116
- "max_memory_allocated (GB)": 60.52,
1117
  "memory_allocated (GB)": 50.57,
1118
  "step": 1110,
1119
  "total_memory_available (GB)": 94.62
1120
  },
1121
  {
1122
  "epoch": 4.571428571428571,
1123
- "grad_norm": 0.5993088483810425,
1124
  "learning_rate": 8.571428571428572e-07,
1125
- "loss": 0.0404,
1126
- "max_memory_allocated (GB)": 60.52,
1127
  "memory_allocated (GB)": 50.57,
1128
  "step": 1120,
1129
  "total_memory_available (GB)": 94.62
1130
  },
1131
  {
1132
  "epoch": 4.612244897959184,
1133
- "grad_norm": 0.30265432596206665,
1134
  "learning_rate": 7.755102040816327e-07,
1135
- "loss": 0.0545,
1136
- "max_memory_allocated (GB)": 60.52,
1137
  "memory_allocated (GB)": 50.57,
1138
  "step": 1130,
1139
  "total_memory_available (GB)": 94.62
1140
  },
1141
  {
1142
  "epoch": 4.653061224489796,
1143
- "grad_norm": 0.6385183334350586,
1144
  "learning_rate": 6.938775510204082e-07,
1145
- "loss": 0.0731,
1146
- "max_memory_allocated (GB)": 60.52,
1147
  "memory_allocated (GB)": 50.57,
1148
  "step": 1140,
1149
  "total_memory_available (GB)": 94.62
1150
  },
1151
  {
1152
  "epoch": 4.6938775510204085,
1153
- "grad_norm": 1.128566026687622,
1154
  "learning_rate": 6.122448979591837e-07,
1155
- "loss": 0.0516,
1156
- "max_memory_allocated (GB)": 60.52,
1157
  "memory_allocated (GB)": 50.57,
1158
  "step": 1150,
1159
  "total_memory_available (GB)": 94.62
1160
  },
1161
  {
1162
  "epoch": 4.73469387755102,
1163
- "grad_norm": 1.1660116910934448,
1164
  "learning_rate": 5.306122448979592e-07,
1165
- "loss": 0.0611,
1166
- "max_memory_allocated (GB)": 60.52,
1167
  "memory_allocated (GB)": 50.57,
1168
  "step": 1160,
1169
  "total_memory_available (GB)": 94.62
1170
  },
1171
  {
1172
  "epoch": 4.775510204081632,
1173
- "grad_norm": 0.5327439904212952,
1174
  "learning_rate": 4.489795918367347e-07,
1175
- "loss": 0.0549,
1176
- "max_memory_allocated (GB)": 60.52,
1177
  "memory_allocated (GB)": 50.57,
1178
  "step": 1170,
1179
  "total_memory_available (GB)": 94.62
1180
  },
1181
  {
1182
  "epoch": 4.816326530612245,
1183
- "grad_norm": 0.8764423131942749,
1184
  "learning_rate": 3.6734693877551025e-07,
1185
- "loss": 0.0441,
1186
- "max_memory_allocated (GB)": 60.52,
1187
  "memory_allocated (GB)": 50.57,
1188
  "step": 1180,
1189
  "total_memory_available (GB)": 94.62
1190
  },
1191
  {
1192
  "epoch": 4.857142857142857,
1193
- "grad_norm": 0.47835007309913635,
1194
  "learning_rate": 2.8571428571428575e-07,
1195
- "loss": 0.0541,
1196
- "max_memory_allocated (GB)": 60.52,
1197
  "memory_allocated (GB)": 50.57,
1198
  "step": 1190,
1199
  "total_memory_available (GB)": 94.62
1200
  },
1201
  {
1202
  "epoch": 4.8979591836734695,
1203
- "grad_norm": 1.048047661781311,
1204
  "learning_rate": 2.0408163265306121e-07,
1205
- "loss": 0.0731,
1206
- "max_memory_allocated (GB)": 60.52,
1207
  "memory_allocated (GB)": 50.57,
1208
  "step": 1200,
1209
  "total_memory_available (GB)": 94.62
1210
  },
1211
  {
1212
  "epoch": 4.938775510204081,
1213
- "grad_norm": 0.3101171851158142,
1214
  "learning_rate": 1.2244897959183673e-07,
1215
- "loss": 0.0648,
1216
- "max_memory_allocated (GB)": 60.52,
1217
  "memory_allocated (GB)": 50.57,
1218
  "step": 1210,
1219
  "total_memory_available (GB)": 94.62
1220
  },
1221
  {
1222
  "epoch": 4.979591836734694,
1223
- "grad_norm": 0.76802659034729,
1224
  "learning_rate": 4.0816326530612253e-08,
1225
- "loss": 0.0418,
1226
- "max_memory_allocated (GB)": 60.52,
1227
  "memory_allocated (GB)": 50.57,
1228
  "step": 1220,
1229
  "total_memory_available (GB)": 94.62
1230
  },
1231
  {
1232
  "epoch": 5.0,
1233
- "max_memory_allocated (GB)": 60.52,
1234
  "memory_allocated (GB)": 50.57,
1235
  "step": 1225,
1236
  "total_flos": 3.0598946525952e+16,
1237
  "total_memory_available (GB)": 94.62,
1238
- "train_loss": 0.06085505417415074,
1239
- "train_runtime": 1020.8061,
1240
- "train_samples_per_second": 55.51,
1241
- "train_steps_per_second": 1.389
1242
  }
1243
  ],
1244
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.04081632653061224,
13
+ "grad_norm": 16.731555938720703,
14
  "learning_rate": 9.918367346938776e-06,
15
+ "loss": 0.2616,
16
  "max_memory_allocated (GB)": 57.18,
17
  "memory_allocated (GB)": 50.57,
18
  "step": 10,
 
20
  },
21
  {
22
  "epoch": 0.08163265306122448,
23
+ "grad_norm": 18.564252853393555,
24
  "learning_rate": 9.836734693877552e-06,
25
+ "loss": 0.1554,
26
  "max_memory_allocated (GB)": 57.18,
27
  "memory_allocated (GB)": 50.57,
28
  "step": 20,
 
30
  },
31
  {
32
  "epoch": 0.12244897959183673,
33
+ "grad_norm": 7.802953720092773,
34
  "learning_rate": 9.755102040816327e-06,
35
+ "loss": 0.126,
36
  "max_memory_allocated (GB)": 57.18,
37
  "memory_allocated (GB)": 50.57,
38
  "step": 30,
 
40
  },
41
  {
42
  "epoch": 0.16326530612244897,
43
+ "grad_norm": 4.599184036254883,
44
  "learning_rate": 9.673469387755103e-06,
45
+ "loss": 0.1103,
46
  "max_memory_allocated (GB)": 57.18,
47
  "memory_allocated (GB)": 50.57,
48
  "step": 40,
 
50
  },
51
  {
52
  "epoch": 0.20408163265306123,
53
+ "grad_norm": 6.734092712402344,
54
  "learning_rate": 9.591836734693878e-06,
55
+ "loss": 0.1217,
56
  "max_memory_allocated (GB)": 57.18,
57
  "memory_allocated (GB)": 50.57,
58
  "step": 50,
 
60
  },
61
  {
62
  "epoch": 0.24489795918367346,
63
+ "grad_norm": 1.7976917028427124,
64
  "learning_rate": 9.510204081632653e-06,
65
+ "loss": 0.08,
66
  "max_memory_allocated (GB)": 57.18,
67
  "memory_allocated (GB)": 50.57,
68
  "step": 60,
 
70
  },
71
  {
72
  "epoch": 0.2857142857142857,
73
+ "grad_norm": 4.0598835945129395,
74
  "learning_rate": 9.42857142857143e-06,
75
+ "loss": 0.1023,
76
  "max_memory_allocated (GB)": 57.18,
77
  "memory_allocated (GB)": 50.57,
78
  "step": 70,
 
80
  },
81
  {
82
  "epoch": 0.32653061224489793,
83
+ "grad_norm": 2.0350396633148193,
84
  "learning_rate": 9.346938775510204e-06,
85
+ "loss": 0.1128,
86
  "max_memory_allocated (GB)": 57.18,
87
  "memory_allocated (GB)": 50.57,
88
  "step": 80,
 
90
  },
91
  {
92
  "epoch": 0.3673469387755102,
93
+ "grad_norm": 3.785065174102783,
94
  "learning_rate": 9.26530612244898e-06,
95
+ "loss": 0.0842,
96
  "max_memory_allocated (GB)": 57.18,
97
  "memory_allocated (GB)": 50.57,
98
  "step": 90,
 
100
  },
101
  {
102
  "epoch": 0.40816326530612246,
103
+ "grad_norm": 1.9926950931549072,
104
  "learning_rate": 9.183673469387756e-06,
105
+ "loss": 0.0728,
106
  "max_memory_allocated (GB)": 57.18,
107
  "memory_allocated (GB)": 50.57,
108
  "step": 100,
 
110
  },
111
  {
112
  "epoch": 0.4489795918367347,
113
+ "grad_norm": 4.878537178039551,
114
  "learning_rate": 9.102040816326532e-06,
115
+ "loss": 0.0692,
116
  "max_memory_allocated (GB)": 57.18,
117
  "memory_allocated (GB)": 50.57,
118
  "step": 110,
 
120
  },
121
  {
122
  "epoch": 0.4897959183673469,
123
+ "grad_norm": 2.5495357513427734,
124
  "learning_rate": 9.020408163265307e-06,
125
+ "loss": 0.0691,
126
  "max_memory_allocated (GB)": 57.18,
127
  "memory_allocated (GB)": 50.57,
128
  "step": 120,
 
130
  },
131
  {
132
  "epoch": 0.5306122448979592,
133
+ "grad_norm": 3.0813372135162354,
134
  "learning_rate": 8.938775510204082e-06,
135
+ "loss": 0.0597,
136
  "max_memory_allocated (GB)": 57.18,
137
  "memory_allocated (GB)": 50.57,
138
  "step": 130,
 
140
  },
141
  {
142
  "epoch": 0.5714285714285714,
143
+ "grad_norm": 1.2760021686553955,
144
  "learning_rate": 8.857142857142858e-06,
145
+ "loss": 0.0507,
146
  "max_memory_allocated (GB)": 57.18,
147
  "memory_allocated (GB)": 50.57,
148
  "step": 140,
 
150
  },
151
  {
152
  "epoch": 0.6122448979591837,
153
+ "grad_norm": 2.0892932415008545,
154
  "learning_rate": 8.775510204081633e-06,
155
+ "loss": 0.0634,
156
  "max_memory_allocated (GB)": 57.18,
157
  "memory_allocated (GB)": 50.57,
158
  "step": 150,
 
160
  },
161
  {
162
  "epoch": 0.6530612244897959,
163
+ "grad_norm": 2.0191688537597656,
164
  "learning_rate": 8.69387755102041e-06,
165
+ "loss": 0.0718,
166
  "max_memory_allocated (GB)": 57.18,
167
  "memory_allocated (GB)": 50.57,
168
  "step": 160,
 
170
  },
171
  {
172
  "epoch": 0.6938775510204082,
173
+ "grad_norm": 2.4970450401306152,
174
  "learning_rate": 8.612244897959184e-06,
175
+ "loss": 0.0753,
176
  "max_memory_allocated (GB)": 57.18,
177
  "memory_allocated (GB)": 50.57,
178
  "step": 170,
 
180
  },
181
  {
182
  "epoch": 0.7346938775510204,
183
+ "grad_norm": 1.5428143739700317,
184
  "learning_rate": 8.530612244897961e-06,
185
+ "loss": 0.0612,
186
  "max_memory_allocated (GB)": 57.18,
187
  "memory_allocated (GB)": 50.57,
188
  "step": 180,
 
190
  },
191
  {
192
  "epoch": 0.7755102040816326,
193
+ "grad_norm": 3.6041452884674072,
194
  "learning_rate": 8.448979591836736e-06,
195
+ "loss": 0.0573,
196
  "max_memory_allocated (GB)": 57.18,
197
  "memory_allocated (GB)": 50.57,
198
  "step": 190,
 
200
  },
201
  {
202
  "epoch": 0.8163265306122449,
203
+ "grad_norm": 1.0115529298782349,
204
  "learning_rate": 8.36734693877551e-06,
205
+ "loss": 0.0631,
206
  "max_memory_allocated (GB)": 57.18,
207
  "memory_allocated (GB)": 50.57,
208
  "step": 200,
 
210
  },
211
  {
212
  "epoch": 0.8571428571428571,
213
+ "grad_norm": 0.8029147982597351,
214
  "learning_rate": 8.285714285714287e-06,
215
+ "loss": 0.0643,
216
  "max_memory_allocated (GB)": 57.18,
217
  "memory_allocated (GB)": 50.57,
218
  "step": 210,
 
220
  },
221
  {
222
  "epoch": 0.8979591836734694,
223
+ "grad_norm": 1.130996584892273,
224
  "learning_rate": 8.204081632653062e-06,
225
+ "loss": 0.0608,
226
  "max_memory_allocated (GB)": 57.18,
227
  "memory_allocated (GB)": 50.57,
228
  "step": 220,
 
230
  },
231
  {
232
  "epoch": 0.9387755102040817,
233
+ "grad_norm": 0.7962849140167236,
234
  "learning_rate": 8.122448979591837e-06,
235
+ "loss": 0.0535,
236
  "max_memory_allocated (GB)": 57.18,
237
  "memory_allocated (GB)": 50.57,
238
  "step": 230,
 
240
  },
241
  {
242
  "epoch": 0.9795918367346939,
243
+ "grad_norm": 2.3987386226654053,
244
  "learning_rate": 8.040816326530613e-06,
245
+ "loss": 0.0499,
246
  "max_memory_allocated (GB)": 57.18,
247
  "memory_allocated (GB)": 50.57,
248
  "step": 240,
 
250
  },
251
  {
252
  "epoch": 1.0204081632653061,
253
+ "grad_norm": 1.084067463874817,
254
  "learning_rate": 7.959183673469388e-06,
255
+ "loss": 0.0612,
256
  "max_memory_allocated (GB)": 57.18,
257
  "memory_allocated (GB)": 50.57,
258
  "step": 250,
 
260
  },
261
  {
262
  "epoch": 1.0612244897959184,
263
+ "grad_norm": 3.247530460357666,
264
  "learning_rate": 7.877551020408164e-06,
265
+ "loss": 0.055,
266
  "max_memory_allocated (GB)": 57.18,
267
  "memory_allocated (GB)": 50.57,
268
  "step": 260,
 
270
  },
271
  {
272
  "epoch": 1.1020408163265305,
273
+ "grad_norm": 1.5961194038391113,
274
  "learning_rate": 7.79591836734694e-06,
275
+ "loss": 0.0632,
276
  "max_memory_allocated (GB)": 57.18,
277
  "memory_allocated (GB)": 50.57,
278
  "step": 270,
 
280
  },
281
  {
282
  "epoch": 1.1428571428571428,
283
+ "grad_norm": 0.32916340231895447,
284
  "learning_rate": 7.714285714285716e-06,
285
+ "loss": 0.0565,
286
  "max_memory_allocated (GB)": 57.18,
287
  "memory_allocated (GB)": 50.57,
288
  "step": 280,
 
290
  },
291
  {
292
  "epoch": 1.183673469387755,
293
+ "grad_norm": 0.6009345054626465,
294
  "learning_rate": 7.63265306122449e-06,
295
+ "loss": 0.0503,
296
  "max_memory_allocated (GB)": 57.18,
297
  "memory_allocated (GB)": 50.57,
298
  "step": 290,
 
300
  },
301
  {
302
  "epoch": 1.2244897959183674,
303
+ "grad_norm": 9.806236267089844,
304
  "learning_rate": 7.551020408163265e-06,
305
+ "loss": 0.0635,
306
  "max_memory_allocated (GB)": 57.18,
307
  "memory_allocated (GB)": 50.57,
308
  "step": 300,
 
310
  },
311
  {
312
  "epoch": 1.2653061224489797,
313
+ "grad_norm": 1.231447696685791,
314
  "learning_rate": 7.469387755102041e-06,
315
  "loss": 0.0664,
316
  "max_memory_allocated (GB)": 57.18,
 
320
  },
321
  {
322
  "epoch": 1.306122448979592,
323
+ "grad_norm": 1.016727328300476,
324
  "learning_rate": 7.387755102040817e-06,
325
+ "loss": 0.0502,
326
  "max_memory_allocated (GB)": 57.18,
327
  "memory_allocated (GB)": 50.57,
328
  "step": 320,
 
330
  },
331
  {
332
  "epoch": 1.346938775510204,
333
+ "grad_norm": 0.7703081965446472,
334
  "learning_rate": 7.306122448979592e-06,
335
+ "loss": 0.0599,
336
  "max_memory_allocated (GB)": 57.18,
337
  "memory_allocated (GB)": 50.57,
338
  "step": 330,
 
340
  },
341
  {
342
  "epoch": 1.3877551020408163,
343
+ "grad_norm": 5.310096740722656,
344
  "learning_rate": 7.224489795918368e-06,
345
+ "loss": 0.0541,
346
  "max_memory_allocated (GB)": 57.18,
347
  "memory_allocated (GB)": 50.57,
348
  "step": 340,
 
350
  },
351
  {
352
  "epoch": 1.4285714285714286,
353
+ "grad_norm": 1.1444560289382935,
354
  "learning_rate": 7.1428571428571436e-06,
355
+ "loss": 0.0493,
356
  "max_memory_allocated (GB)": 57.18,
357
  "memory_allocated (GB)": 50.57,
358
  "step": 350,
 
360
  },
361
  {
362
  "epoch": 1.469387755102041,
363
+ "grad_norm": 3.1823084354400635,
364
  "learning_rate": 7.061224489795919e-06,
365
+ "loss": 0.0459,
366
  "max_memory_allocated (GB)": 57.18,
367
  "memory_allocated (GB)": 50.57,
368
  "step": 360,
 
370
  },
371
  {
372
  "epoch": 1.510204081632653,
373
+ "grad_norm": 1.1924108266830444,
374
  "learning_rate": 6.979591836734695e-06,
375
+ "loss": 0.058,
376
  "max_memory_allocated (GB)": 57.18,
377
  "memory_allocated (GB)": 50.57,
378
  "step": 370,
 
380
  },
381
  {
382
  "epoch": 1.5510204081632653,
383
+ "grad_norm": 3.677968740463257,
384
  "learning_rate": 6.8979591836734705e-06,
385
+ "loss": 0.0468,
386
  "max_memory_allocated (GB)": 57.18,
387
  "memory_allocated (GB)": 50.57,
388
  "step": 380,
 
390
  },
391
  {
392
  "epoch": 1.5918367346938775,
393
+ "grad_norm": 1.0082957744598389,
394
  "learning_rate": 6.816326530612245e-06,
395
+ "loss": 0.0652,
396
  "max_memory_allocated (GB)": 57.18,
397
  "memory_allocated (GB)": 50.57,
398
  "step": 390,
 
400
  },
401
  {
402
  "epoch": 1.6326530612244898,
403
+ "grad_norm": 0.6584922075271606,
404
  "learning_rate": 6.734693877551021e-06,
405
+ "loss": 0.0605,
406
  "max_memory_allocated (GB)": 57.18,
407
  "memory_allocated (GB)": 50.57,
408
  "step": 400,
 
410
  },
411
  {
412
  "epoch": 1.6734693877551021,
413
+ "grad_norm": 1.2264763116836548,
414
  "learning_rate": 6.653061224489797e-06,
415
+ "loss": 0.0599,
416
  "max_memory_allocated (GB)": 57.18,
417
  "memory_allocated (GB)": 50.57,
418
  "step": 410,
 
420
  },
421
  {
422
  "epoch": 1.7142857142857144,
423
+ "grad_norm": 1.824320912361145,
424
  "learning_rate": 6.571428571428572e-06,
425
+ "loss": 0.053,
426
  "max_memory_allocated (GB)": 57.18,
427
  "memory_allocated (GB)": 50.57,
428
  "step": 420,
 
430
  },
431
  {
432
  "epoch": 1.7551020408163265,
433
+ "grad_norm": 0.24405838549137115,
434
  "learning_rate": 6.489795918367348e-06,
435
+ "loss": 0.067,
436
  "max_memory_allocated (GB)": 57.18,
437
  "memory_allocated (GB)": 50.57,
438
  "step": 430,
 
440
  },
441
  {
442
  "epoch": 1.7959183673469388,
443
+ "grad_norm": 1.0410544872283936,
444
  "learning_rate": 6.408163265306124e-06,
445
+ "loss": 0.06,
446
  "max_memory_allocated (GB)": 57.18,
447
  "memory_allocated (GB)": 50.57,
448
  "step": 440,
 
450
  },
451
  {
452
  "epoch": 1.836734693877551,
453
+ "grad_norm": 0.39765527844429016,
454
  "learning_rate": 6.326530612244899e-06,
455
+ "loss": 0.043,
456
  "max_memory_allocated (GB)": 57.18,
457
  "memory_allocated (GB)": 50.57,
458
  "step": 450,
 
460
  },
461
  {
462
  "epoch": 1.8775510204081631,
463
+ "grad_norm": 0.29981690645217896,
464
  "learning_rate": 6.244897959183675e-06,
465
+ "loss": 0.0533,
466
  "max_memory_allocated (GB)": 57.18,
467
  "memory_allocated (GB)": 50.57,
468
  "step": 460,
 
470
  },
471
  {
472
  "epoch": 1.9183673469387754,
473
+ "grad_norm": 0.6634105443954468,
474
  "learning_rate": 6.163265306122449e-06,
475
+ "loss": 0.0428,
476
  "max_memory_allocated (GB)": 57.18,
477
  "memory_allocated (GB)": 50.57,
478
  "step": 470,
 
480
  },
481
  {
482
  "epoch": 1.9591836734693877,
483
+ "grad_norm": 0.2060549110174179,
484
  "learning_rate": 6.0816326530612245e-06,
485
+ "loss": 0.0505,
486
  "max_memory_allocated (GB)": 57.18,
487
  "memory_allocated (GB)": 50.57,
488
  "step": 480,
 
490
  },
491
  {
492
  "epoch": 2.0,
493
+ "grad_norm": 1.0800402164459229,
494
  "learning_rate": 6e-06,
495
+ "loss": 0.0647,
496
  "max_memory_allocated (GB)": 57.18,
497
  "memory_allocated (GB)": 50.57,
498
  "step": 490,
 
500
  },
501
  {
502
  "epoch": 2.0408163265306123,
503
+ "grad_norm": 0.6078592538833618,
504
  "learning_rate": 5.918367346938776e-06,
505
+ "loss": 0.0596,
506
  "max_memory_allocated (GB)": 57.18,
507
  "memory_allocated (GB)": 50.57,
508
  "step": 500,
 
510
  },
511
  {
512
  "epoch": 2.0816326530612246,
513
+ "grad_norm": 0.6262472867965698,
514
  "learning_rate": 5.8367346938775515e-06,
515
+ "loss": 0.0495,
516
  "max_memory_allocated (GB)": 57.18,
517
  "memory_allocated (GB)": 50.57,
518
  "step": 510,
 
520
  },
521
  {
522
  "epoch": 2.122448979591837,
523
+ "grad_norm": 1.4308090209960938,
524
  "learning_rate": 5.755102040816327e-06,
525
+ "loss": 0.0461,
526
  "max_memory_allocated (GB)": 57.18,
527
  "memory_allocated (GB)": 50.57,
528
  "step": 520,
 
530
  },
531
  {
532
  "epoch": 2.163265306122449,
533
+ "grad_norm": 0.8761769533157349,
534
  "learning_rate": 5.673469387755103e-06,
535
  "loss": 0.0626,
536
  "max_memory_allocated (GB)": 57.18,
 
540
  },
541
  {
542
  "epoch": 2.204081632653061,
543
+ "grad_norm": 0.6150110363960266,
544
  "learning_rate": 5.591836734693878e-06,
545
+ "loss": 0.0576,
546
  "max_memory_allocated (GB)": 57.18,
547
  "memory_allocated (GB)": 50.57,
548
  "step": 540,
 
550
  },
551
  {
552
  "epoch": 2.2448979591836733,
553
+ "grad_norm": 9.280069351196289,
554
  "learning_rate": 5.510204081632653e-06,
555
+ "loss": 0.0634,
556
  "max_memory_allocated (GB)": 57.18,
557
  "memory_allocated (GB)": 50.57,
558
  "step": 550,
 
560
  },
561
  {
562
  "epoch": 2.2857142857142856,
563
+ "grad_norm": 2.136209726333618,
564
  "learning_rate": 5.428571428571429e-06,
565
+ "loss": 0.0579,
566
  "max_memory_allocated (GB)": 57.18,
567
  "memory_allocated (GB)": 50.57,
568
  "step": 560,
 
570
  },
571
  {
572
  "epoch": 2.326530612244898,
573
+ "grad_norm": 0.9547491073608398,
574
  "learning_rate": 5.3469387755102045e-06,
575
+ "loss": 0.0589,
576
  "max_memory_allocated (GB)": 57.18,
577
  "memory_allocated (GB)": 50.57,
578
  "step": 570,
 
580
  },
581
  {
582
  "epoch": 2.36734693877551,
583
+ "grad_norm": 2.1748363971710205,
584
  "learning_rate": 5.26530612244898e-06,
585
+ "loss": 0.0672,
586
  "max_memory_allocated (GB)": 57.18,
587
  "memory_allocated (GB)": 50.57,
588
  "step": 580,
 
590
  },
591
  {
592
  "epoch": 2.4081632653061225,
593
+ "grad_norm": 0.9653811454772949,
594
  "learning_rate": 5.183673469387756e-06,
595
+ "loss": 0.0536,
596
  "max_memory_allocated (GB)": 57.18,
597
  "memory_allocated (GB)": 50.57,
598
  "step": 590,
 
600
  },
601
  {
602
  "epoch": 2.4489795918367347,
603
+ "grad_norm": 0.7852123379707336,
604
  "learning_rate": 5.1020408163265315e-06,
605
+ "loss": 0.0443,
606
  "max_memory_allocated (GB)": 57.18,
607
  "memory_allocated (GB)": 50.57,
608
  "step": 600,
 
610
  },
611
  {
612
  "epoch": 2.489795918367347,
613
+ "grad_norm": 0.7405697107315063,
614
  "learning_rate": 5.020408163265307e-06,
615
+ "loss": 0.041,
616
  "max_memory_allocated (GB)": 57.18,
617
  "memory_allocated (GB)": 50.57,
618
  "step": 610,
 
620
  },
621
  {
622
  "epoch": 2.5306122448979593,
623
+ "grad_norm": 0.7166327834129333,
624
  "learning_rate": 4.938775510204082e-06,
625
+ "loss": 0.0431,
626
  "max_memory_allocated (GB)": 57.18,
627
  "memory_allocated (GB)": 50.57,
628
  "step": 620,
 
630
  },
631
  {
632
  "epoch": 2.571428571428571,
633
+ "grad_norm": 0.5675875544548035,
634
  "learning_rate": 4.857142857142858e-06,
635
+ "loss": 0.0807,
636
  "max_memory_allocated (GB)": 57.18,
637
  "memory_allocated (GB)": 50.57,
638
  "step": 630,
 
640
  },
641
  {
642
  "epoch": 2.612244897959184,
643
+ "grad_norm": 0.954290509223938,
644
  "learning_rate": 4.775510204081633e-06,
645
+ "loss": 0.0492,
646
  "max_memory_allocated (GB)": 57.18,
647
  "memory_allocated (GB)": 50.57,
648
  "step": 640,
 
650
  },
651
  {
652
  "epoch": 2.6530612244897958,
653
+ "grad_norm": 0.9061315655708313,
654
  "learning_rate": 4.693877551020409e-06,
655
+ "loss": 0.0472,
656
  "max_memory_allocated (GB)": 57.18,
657
  "memory_allocated (GB)": 50.57,
658
  "step": 650,
 
660
  },
661
  {
662
  "epoch": 2.693877551020408,
663
+ "grad_norm": 0.7333698868751526,
664
  "learning_rate": 4.612244897959184e-06,
665
+ "loss": 0.0474,
666
  "max_memory_allocated (GB)": 57.18,
667
  "memory_allocated (GB)": 50.57,
668
  "step": 660,
 
670
  },
671
  {
672
  "epoch": 2.7346938775510203,
673
+ "grad_norm": 1.7820810079574585,
674
  "learning_rate": 4.530612244897959e-06,
675
+ "loss": 0.0869,
676
  "max_memory_allocated (GB)": 57.18,
677
  "memory_allocated (GB)": 50.57,
678
  "step": 670,
 
680
  },
681
  {
682
  "epoch": 2.7755102040816326,
683
+ "grad_norm": 1.3184058666229248,
684
  "learning_rate": 4.448979591836735e-06,
685
+ "loss": 0.0494,
686
  "max_memory_allocated (GB)": 57.18,
687
  "memory_allocated (GB)": 50.57,
688
  "step": 680,
 
690
  },
691
  {
692
  "epoch": 2.816326530612245,
693
+ "grad_norm": 0.4212433397769928,
694
  "learning_rate": 4.367346938775511e-06,
695
+ "loss": 0.0445,
696
  "max_memory_allocated (GB)": 57.18,
697
  "memory_allocated (GB)": 50.57,
698
  "step": 690,
 
700
  },
701
  {
702
  "epoch": 2.857142857142857,
703
+ "grad_norm": 0.39547938108444214,
704
  "learning_rate": 4.2857142857142855e-06,
705
+ "loss": 0.0429,
706
  "max_memory_allocated (GB)": 57.18,
707
  "memory_allocated (GB)": 50.57,
708
  "step": 700,
 
710
  },
711
  {
712
  "epoch": 2.8979591836734695,
713
+ "grad_norm": 1.0822113752365112,
714
  "learning_rate": 4.204081632653061e-06,
715
+ "loss": 0.0549,
716
  "max_memory_allocated (GB)": 57.18,
717
  "memory_allocated (GB)": 50.57,
718
  "step": 710,
 
720
  },
721
  {
722
  "epoch": 2.938775510204082,
723
+ "grad_norm": 1.0838605165481567,
724
  "learning_rate": 4.122448979591837e-06,
725
+ "loss": 0.0582,
726
  "max_memory_allocated (GB)": 57.18,
727
  "memory_allocated (GB)": 50.57,
728
  "step": 720,
 
730
  },
731
  {
732
  "epoch": 2.979591836734694,
733
+ "grad_norm": 0.2726249098777771,
734
  "learning_rate": 4.040816326530612e-06,
735
+ "loss": 0.0341,
736
  "max_memory_allocated (GB)": 57.18,
737
  "memory_allocated (GB)": 50.57,
738
  "step": 730,
 
740
  },
741
  {
742
  "epoch": 3.020408163265306,
743
+ "grad_norm": 0.836703360080719,
744
  "learning_rate": 3.959183673469388e-06,
745
  "loss": 0.053,
746
  "max_memory_allocated (GB)": 57.18,
 
750
  },
751
  {
752
  "epoch": 3.061224489795918,
753
+ "grad_norm": 0.6878814697265625,
754
  "learning_rate": 3.877551020408164e-06,
755
+ "loss": 0.0618,
756
  "max_memory_allocated (GB)": 57.18,
757
  "memory_allocated (GB)": 50.57,
758
  "step": 750,
 
760
  },
761
  {
762
  "epoch": 3.1020408163265305,
763
+ "grad_norm": 1.3712306022644043,
764
  "learning_rate": 3.795918367346939e-06,
765
  "loss": 0.0527,
766
  "max_memory_allocated (GB)": 57.18,
 
770
  },
771
  {
772
  "epoch": 3.142857142857143,
773
+ "grad_norm": 2.1615536212921143,
774
  "learning_rate": 3.7142857142857146e-06,
775
+ "loss": 0.069,
776
  "max_memory_allocated (GB)": 57.18,
777
  "memory_allocated (GB)": 50.57,
778
  "step": 770,
 
780
  },
781
  {
782
  "epoch": 3.183673469387755,
783
+ "grad_norm": 0.45463302731513977,
784
  "learning_rate": 3.6326530612244903e-06,
785
+ "loss": 0.064,
786
  "max_memory_allocated (GB)": 57.18,
787
  "memory_allocated (GB)": 50.57,
788
  "step": 780,
 
790
  },
791
  {
792
  "epoch": 3.2244897959183674,
793
+ "grad_norm": 0.40240758657455444,
794
  "learning_rate": 3.5510204081632655e-06,
795
+ "loss": 0.0528,
796
  "max_memory_allocated (GB)": 57.18,
797
  "memory_allocated (GB)": 50.57,
798
  "step": 790,
 
800
  },
801
  {
802
  "epoch": 3.2653061224489797,
803
+ "grad_norm": 0.5408643484115601,
804
  "learning_rate": 3.469387755102041e-06,
805
+ "loss": 0.0433,
806
  "max_memory_allocated (GB)": 57.18,
807
  "memory_allocated (GB)": 50.57,
808
  "step": 800,
 
810
  },
811
  {
812
  "epoch": 3.306122448979592,
813
+ "grad_norm": 0.6606118679046631,
814
  "learning_rate": 3.3877551020408168e-06,
815
+ "loss": 0.0548,
816
  "max_memory_allocated (GB)": 57.18,
817
  "memory_allocated (GB)": 50.57,
818
  "step": 810,
 
820
  },
821
  {
822
  "epoch": 3.3469387755102042,
823
+ "grad_norm": 0.89394211769104,
824
  "learning_rate": 3.3061224489795924e-06,
825
+ "loss": 0.0448,
826
  "max_memory_allocated (GB)": 57.18,
827
  "memory_allocated (GB)": 50.57,
828
  "step": 820,
 
830
  },
831
  {
832
  "epoch": 3.387755102040816,
833
+ "grad_norm": 0.3227229416370392,
834
  "learning_rate": 3.2244897959183672e-06,
835
+ "loss": 0.0491,
836
  "max_memory_allocated (GB)": 57.18,
837
  "memory_allocated (GB)": 50.57,
838
  "step": 830,
 
840
  },
841
  {
842
  "epoch": 3.4285714285714284,
843
+ "grad_norm": 0.928611159324646,
844
  "learning_rate": 3.142857142857143e-06,
845
+ "loss": 0.0523,
846
  "max_memory_allocated (GB)": 57.18,
847
  "memory_allocated (GB)": 50.57,
848
  "step": 840,
 
850
  },
851
  {
852
  "epoch": 3.4693877551020407,
853
+ "grad_norm": 0.6023752093315125,
854
  "learning_rate": 3.0612244897959185e-06,
855
+ "loss": 0.0813,
856
  "max_memory_allocated (GB)": 57.18,
857
  "memory_allocated (GB)": 50.57,
858
  "step": 850,
 
860
  },
861
  {
862
  "epoch": 3.510204081632653,
863
+ "grad_norm": 1.1742885112762451,
864
  "learning_rate": 2.979591836734694e-06,
865
  "loss": 0.0451,
866
  "max_memory_allocated (GB)": 57.18,
 
870
  },
871
  {
872
  "epoch": 3.5510204081632653,
873
+ "grad_norm": 0.21970601379871368,
874
  "learning_rate": 2.8979591836734694e-06,
875
+ "loss": 0.0644,
876
  "max_memory_allocated (GB)": 57.18,
877
  "memory_allocated (GB)": 50.57,
878
  "step": 870,
 
880
  },
881
  {
882
  "epoch": 3.5918367346938775,
883
+ "grad_norm": 1.2270339727401733,
884
  "learning_rate": 2.816326530612245e-06,
885
+ "loss": 0.044,
886
  "max_memory_allocated (GB)": 57.18,
887
  "memory_allocated (GB)": 50.57,
888
  "step": 880,
 
890
  },
891
  {
892
  "epoch": 3.63265306122449,
893
+ "grad_norm": 0.15171077847480774,
894
  "learning_rate": 2.7346938775510207e-06,
895
+ "loss": 0.0291,
896
  "max_memory_allocated (GB)": 57.18,
897
  "memory_allocated (GB)": 50.57,
898
  "step": 890,
 
900
  },
901
  {
902
  "epoch": 3.673469387755102,
903
+ "grad_norm": 2.9102306365966797,
904
  "learning_rate": 2.6530612244897964e-06,
905
+ "loss": 0.0516,
906
  "max_memory_allocated (GB)": 57.18,
907
  "memory_allocated (GB)": 50.57,
908
  "step": 900,
 
910
  },
911
  {
912
  "epoch": 3.7142857142857144,
913
+ "grad_norm": 0.6113564968109131,
914
  "learning_rate": 2.571428571428571e-06,
915
+ "loss": 0.061,
916
  "max_memory_allocated (GB)": 57.18,
917
  "memory_allocated (GB)": 50.57,
918
  "step": 910,
 
920
  },
921
  {
922
  "epoch": 3.7551020408163263,
923
+ "grad_norm": 0.19307516515254974,
924
  "learning_rate": 2.489795918367347e-06,
925
+ "loss": 0.0539,
926
  "max_memory_allocated (GB)": 57.18,
927
  "memory_allocated (GB)": 50.57,
928
  "step": 920,
 
930
  },
931
  {
932
  "epoch": 3.795918367346939,
933
+ "grad_norm": 0.38657116889953613,
934
  "learning_rate": 2.4081632653061225e-06,
935
+ "loss": 0.05,
936
  "max_memory_allocated (GB)": 57.18,
937
  "memory_allocated (GB)": 50.57,
938
  "step": 930,
 
940
  },
941
  {
942
  "epoch": 3.836734693877551,
943
+ "grad_norm": 0.9545940160751343,
944
  "learning_rate": 2.326530612244898e-06,
945
+ "loss": 0.0481,
946
  "max_memory_allocated (GB)": 57.18,
947
  "memory_allocated (GB)": 50.57,
948
  "step": 940,
 
950
  },
951
  {
952
  "epoch": 3.877551020408163,
953
+ "grad_norm": 1.3555389642715454,
954
  "learning_rate": 2.244897959183674e-06,
955
+ "loss": 0.0596,
956
  "max_memory_allocated (GB)": 57.18,
957
  "memory_allocated (GB)": 50.57,
958
  "step": 950,
 
960
  },
961
  {
962
  "epoch": 3.9183673469387754,
963
+ "grad_norm": 0.3931931257247925,
964
  "learning_rate": 2.1632653061224495e-06,
965
+ "loss": 0.0498,
966
  "max_memory_allocated (GB)": 57.18,
967
  "memory_allocated (GB)": 50.57,
968
  "step": 960,
 
970
  },
971
  {
972
  "epoch": 3.9591836734693877,
973
+ "grad_norm": 0.6945517063140869,
974
  "learning_rate": 2.0816326530612247e-06,
975
+ "loss": 0.0734,
976
  "max_memory_allocated (GB)": 57.18,
977
  "memory_allocated (GB)": 50.57,
978
  "step": 970,
 
980
  },
981
  {
982
  "epoch": 4.0,
983
+ "grad_norm": 1.0659183263778687,
984
  "learning_rate": 2.0000000000000003e-06,
985
+ "loss": 0.0707,
986
+ "max_memory_allocated (GB)": 57.18,
987
  "memory_allocated (GB)": 50.57,
988
  "step": 980,
989
  "total_memory_available (GB)": 94.62
990
  },
991
  {
992
  "epoch": 4.040816326530612,
993
+ "grad_norm": 0.8017714023590088,
994
  "learning_rate": 1.9183673469387756e-06,
995
+ "loss": 0.0476,
996
+ "max_memory_allocated (GB)": 57.18,
997
  "memory_allocated (GB)": 50.57,
998
  "step": 990,
999
  "total_memory_available (GB)": 94.62
1000
  },
1001
  {
1002
  "epoch": 4.081632653061225,
1003
+ "grad_norm": 1.1733373403549194,
1004
  "learning_rate": 1.8367346938775512e-06,
1005
+ "loss": 0.0617,
1006
+ "max_memory_allocated (GB)": 57.18,
1007
  "memory_allocated (GB)": 50.57,
1008
  "step": 1000,
1009
  "total_memory_available (GB)": 94.62
1010
  },
1011
  {
1012
  "epoch": 4.122448979591836,
1013
+ "grad_norm": 0.6048600077629089,
1014
  "learning_rate": 1.7551020408163267e-06,
1015
+ "loss": 0.0625,
1016
+ "max_memory_allocated (GB)": 57.18,
1017
  "memory_allocated (GB)": 50.57,
1018
  "step": 1010,
1019
  "total_memory_available (GB)": 94.62
1020
  },
1021
  {
1022
  "epoch": 4.163265306122449,
1023
+ "grad_norm": 1.096007227897644,
1024
  "learning_rate": 1.6734693877551023e-06,
1025
+ "loss": 0.0407,
1026
+ "max_memory_allocated (GB)": 57.18,
1027
  "memory_allocated (GB)": 50.57,
1028
  "step": 1020,
1029
  "total_memory_available (GB)": 94.62
1030
  },
1031
  {
1032
  "epoch": 4.204081632653061,
1033
+ "grad_norm": 4.377923965454102,
1034
  "learning_rate": 1.5918367346938775e-06,
1035
+ "loss": 0.0528,
1036
+ "max_memory_allocated (GB)": 57.18,
1037
  "memory_allocated (GB)": 50.57,
1038
  "step": 1030,
1039
  "total_memory_available (GB)": 94.62
1040
  },
1041
  {
1042
  "epoch": 4.244897959183674,
1043
+ "grad_norm": 0.7295175194740295,
1044
  "learning_rate": 1.5102040816326532e-06,
1045
+ "loss": 0.04,
1046
+ "max_memory_allocated (GB)": 57.18,
1047
  "memory_allocated (GB)": 50.57,
1048
  "step": 1040,
1049
  "total_memory_available (GB)": 94.62
1050
  },
1051
  {
1052
  "epoch": 4.285714285714286,
1053
+ "grad_norm": 0.5129045248031616,
1054
  "learning_rate": 1.4285714285714286e-06,
1055
+ "loss": 0.0493,
1056
+ "max_memory_allocated (GB)": 57.18,
1057
  "memory_allocated (GB)": 50.57,
1058
  "step": 1050,
1059
  "total_memory_available (GB)": 94.62
1060
  },
1061
  {
1062
  "epoch": 4.326530612244898,
1063
+ "grad_norm": 0.505799412727356,
1064
  "learning_rate": 1.3469387755102043e-06,
1065
+ "loss": 0.046,
1066
+ "max_memory_allocated (GB)": 57.18,
1067
  "memory_allocated (GB)": 50.57,
1068
  "step": 1060,
1069
  "total_memory_available (GB)": 94.62
1070
  },
1071
  {
1072
  "epoch": 4.36734693877551,
1073
+ "grad_norm": 0.8460046052932739,
1074
  "learning_rate": 1.2653061224489795e-06,
1075
+ "loss": 0.0483,
1076
+ "max_memory_allocated (GB)": 57.18,
1077
  "memory_allocated (GB)": 50.57,
1078
  "step": 1070,
1079
  "total_memory_available (GB)": 94.62
1080
  },
1081
  {
1082
  "epoch": 4.408163265306122,
1083
+ "grad_norm": 0.5636487603187561,
1084
  "learning_rate": 1.1836734693877552e-06,
1085
+ "loss": 0.0523,
1086
+ "max_memory_allocated (GB)": 57.18,
1087
  "memory_allocated (GB)": 50.57,
1088
  "step": 1080,
1089
  "total_memory_available (GB)": 94.62
1090
  },
1091
  {
1092
  "epoch": 4.448979591836735,
1093
+ "grad_norm": 0.6893309354782104,
1094
  "learning_rate": 1.1020408163265308e-06,
1095
+ "loss": 0.0505,
1096
+ "max_memory_allocated (GB)": 57.18,
1097
  "memory_allocated (GB)": 50.57,
1098
  "step": 1090,
1099
  "total_memory_available (GB)": 94.62
1100
  },
1101
  {
1102
  "epoch": 4.489795918367347,
1103
+ "grad_norm": 0.6645925045013428,
1104
  "learning_rate": 1.0204081632653063e-06,
1105
+ "loss": 0.0361,
1106
+ "max_memory_allocated (GB)": 57.18,
1107
  "memory_allocated (GB)": 50.57,
1108
  "step": 1100,
1109
  "total_memory_available (GB)": 94.62
1110
  },
1111
  {
1112
  "epoch": 4.530612244897959,
1113
+ "grad_norm": 0.14709816873073578,
1114
  "learning_rate": 9.387755102040817e-07,
1115
+ "loss": 0.0458,
1116
+ "max_memory_allocated (GB)": 57.18,
1117
  "memory_allocated (GB)": 50.57,
1118
  "step": 1110,
1119
  "total_memory_available (GB)": 94.62
1120
  },
1121
  {
1122
  "epoch": 4.571428571428571,
1123
+ "grad_norm": 0.8794461488723755,
1124
  "learning_rate": 8.571428571428572e-07,
1125
+ "loss": 0.0414,
1126
+ "max_memory_allocated (GB)": 57.18,
1127
  "memory_allocated (GB)": 50.57,
1128
  "step": 1120,
1129
  "total_memory_available (GB)": 94.62
1130
  },
1131
  {
1132
  "epoch": 4.612244897959184,
1133
+ "grad_norm": 1.345910906791687,
1134
  "learning_rate": 7.755102040816327e-07,
1135
+ "loss": 0.0526,
1136
+ "max_memory_allocated (GB)": 57.18,
1137
  "memory_allocated (GB)": 50.57,
1138
  "step": 1130,
1139
  "total_memory_available (GB)": 94.62
1140
  },
1141
  {
1142
  "epoch": 4.653061224489796,
1143
+ "grad_norm": 0.5657418370246887,
1144
  "learning_rate": 6.938775510204082e-07,
1145
+ "loss": 0.0721,
1146
+ "max_memory_allocated (GB)": 57.18,
1147
  "memory_allocated (GB)": 50.57,
1148
  "step": 1140,
1149
  "total_memory_available (GB)": 94.62
1150
  },
1151
  {
1152
  "epoch": 4.6938775510204085,
1153
+ "grad_norm": 2.0943238735198975,
1154
  "learning_rate": 6.122448979591837e-07,
1155
+ "loss": 0.0515,
1156
+ "max_memory_allocated (GB)": 57.18,
1157
  "memory_allocated (GB)": 50.57,
1158
  "step": 1150,
1159
  "total_memory_available (GB)": 94.62
1160
  },
1161
  {
1162
  "epoch": 4.73469387755102,
1163
+ "grad_norm": 3.1750009059906006,
1164
  "learning_rate": 5.306122448979592e-07,
1165
+ "loss": 0.0607,
1166
+ "max_memory_allocated (GB)": 57.18,
1167
  "memory_allocated (GB)": 50.57,
1168
  "step": 1160,
1169
  "total_memory_available (GB)": 94.62
1170
  },
1171
  {
1172
  "epoch": 4.775510204081632,
1173
+ "grad_norm": 0.8910292387008667,
1174
  "learning_rate": 4.489795918367347e-07,
1175
+ "loss": 0.054,
1176
+ "max_memory_allocated (GB)": 57.18,
1177
  "memory_allocated (GB)": 50.57,
1178
  "step": 1170,
1179
  "total_memory_available (GB)": 94.62
1180
  },
1181
  {
1182
  "epoch": 4.816326530612245,
1183
+ "grad_norm": 0.9796111583709717,
1184
  "learning_rate": 3.6734693877551025e-07,
1185
+ "loss": 0.0432,
1186
+ "max_memory_allocated (GB)": 57.18,
1187
  "memory_allocated (GB)": 50.57,
1188
  "step": 1180,
1189
  "total_memory_available (GB)": 94.62
1190
  },
1191
  {
1192
  "epoch": 4.857142857142857,
1193
+ "grad_norm": 1.7865172624588013,
1194
  "learning_rate": 2.8571428571428575e-07,
1195
+ "loss": 0.0549,
1196
+ "max_memory_allocated (GB)": 57.18,
1197
  "memory_allocated (GB)": 50.57,
1198
  "step": 1190,
1199
  "total_memory_available (GB)": 94.62
1200
  },
1201
  {
1202
  "epoch": 4.8979591836734695,
1203
+ "grad_norm": 0.7347144484519958,
1204
  "learning_rate": 2.0408163265306121e-07,
1205
+ "loss": 0.0711,
1206
+ "max_memory_allocated (GB)": 57.18,
1207
  "memory_allocated (GB)": 50.57,
1208
  "step": 1200,
1209
  "total_memory_available (GB)": 94.62
1210
  },
1211
  {
1212
  "epoch": 4.938775510204081,
1213
+ "grad_norm": 1.210486888885498,
1214
  "learning_rate": 1.2244897959183673e-07,
1215
+ "loss": 0.0652,
1216
+ "max_memory_allocated (GB)": 57.18,
1217
  "memory_allocated (GB)": 50.57,
1218
  "step": 1210,
1219
  "total_memory_available (GB)": 94.62
1220
  },
1221
  {
1222
  "epoch": 4.979591836734694,
1223
+ "grad_norm": 1.2078640460968018,
1224
  "learning_rate": 4.0816326530612253e-08,
1225
+ "loss": 0.0414,
1226
+ "max_memory_allocated (GB)": 57.18,
1227
  "memory_allocated (GB)": 50.57,
1228
  "step": 1220,
1229
  "total_memory_available (GB)": 94.62
1230
  },
1231
  {
1232
  "epoch": 5.0,
1233
+ "max_memory_allocated (GB)": 57.18,
1234
  "memory_allocated (GB)": 50.57,
1235
  "step": 1225,
1236
  "total_flos": 3.0598946525952e+16,
1237
  "total_memory_available (GB)": 94.62,
1238
+ "train_loss": 0.06072675045655698,
1239
+ "train_runtime": 1077.821,
1240
+ "train_samples_per_second": 52.682,
1241
+ "train_steps_per_second": 1.318
1242
  }
1243
  ],
1244
  "logging_steps": 10,
validation_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.1272137612104416,
4
- "eval_runtime": 30.5176,
5
- "eval_samples_per_second": 31.06,
6
- "eval_steps_per_second": 1.994,
7
- "max_memory_allocated (GB)": 60.52,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.12326910346746445,
4
+ "eval_runtime": 26.1444,
5
+ "eval_samples_per_second": 36.653,
6
+ "eval_steps_per_second": 2.352,
7
+ "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }