laszlokiss27 commited on
Commit
79f6bbc
1 Parent(s): 813a325

doodle-zero

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
README.md CHANGED
@@ -1,8 +1,7 @@
1
  ---
 
2
  tags:
3
  - generated_from_trainer
4
- metrics:
5
- - accuracy
6
  model-index:
7
  - name: results
8
  results: []
@@ -13,10 +12,14 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # results
15
 
16
- This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 1.1000
19
- - Accuracy: 0.7236
 
 
 
 
20
 
21
  ## Model description
22
 
@@ -39,38 +42,13 @@ The following hyperparameters were used during training:
39
  - train_batch_size: 256
40
  - eval_batch_size: 256
41
  - seed: 42
42
- - distributed_type: multi-GPU
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - num_epochs: 5
46
- - mixed_precision_training: Native AMP
47
-
48
- ### Training results
49
-
50
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
51
- |:-------------:|:------:|:-----:|:---------------:|:--------:|
52
- | 1.7698 | 0.2844 | 5000 | 1.7124 | 0.5802 |
53
- | 1.5445 | 0.5689 | 10000 | 1.5021 | 0.6270 |
54
- | 1.439 | 0.8533 | 15000 | 1.3989 | 0.6520 |
55
- | 1.3625 | 1.1377 | 20000 | 1.3447 | 0.6647 |
56
- | 1.3192 | 1.4222 | 25000 | 1.2965 | 0.6756 |
57
- | 1.3 | 1.7066 | 30000 | 1.2788 | 0.6795 |
58
- | 1.2695 | 1.9910 | 35000 | 1.2347 | 0.6900 |
59
- | 1.2297 | 2.2754 | 40000 | 1.2160 | 0.6955 |
60
- | 1.2144 | 2.5599 | 45000 | 1.1894 | 0.7021 |
61
- | 1.1945 | 2.8443 | 50000 | 1.1734 | 0.7058 |
62
- | 1.1551 | 3.1287 | 55000 | 1.1611 | 0.7084 |
63
- | 1.1471 | 3.4132 | 60000 | 1.1523 | 0.7104 |
64
- | 1.1301 | 3.6976 | 65000 | 1.1314 | 0.7156 |
65
- | 1.1286 | 3.9820 | 70000 | 1.1220 | 0.7186 |
66
- | 1.0898 | 4.2665 | 75000 | 1.1140 | 0.7203 |
67
- | 1.093 | 4.5509 | 80000 | 1.1040 | 0.7232 |
68
- | 1.0893 | 4.8353 | 85000 | 1.0986 | 0.7246 |
69
-
70
 
71
  ### Framework versions
72
 
73
- - Transformers 4.40.0
74
- - Pytorch 2.2.2+cu121
75
  - Datasets 2.19.0
76
- - Tokenizers 0.19.1
 
1
  ---
2
+ base_model: laszlokiss27/results
3
  tags:
4
  - generated_from_trainer
 
 
5
  model-index:
6
  - name: results
7
  results: []
 
12
 
13
  # results
14
 
15
+ This model is a fine-tuned version of [laszlokiss27/results](https://huggingface.co/laszlokiss27/results) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
+ - eval_loss: 1.1000
18
+ - eval_accuracy: 0.7236
19
+ - eval_runtime: 831.0467
20
+ - eval_samples_per_second: 300.825
21
+ - eval_steps_per_second: 1.176
22
+ - step: 0
23
 
24
  ## Model description
25
 
 
42
  - train_batch_size: 256
43
  - eval_batch_size: 256
44
  - seed: 42
 
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
47
  - num_epochs: 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  ### Framework versions
50
 
51
+ - Transformers 4.33.2
52
+ - Pytorch 2.2.2
53
  - Datasets 2.19.0
54
+ - Tokenizers 0.13.3
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 5.0,
3
  "eval_accuracy": 0.723616,
4
- "eval_loss": 1.100016713142395,
5
- "eval_runtime": 118.4292,
6
- "eval_samples_per_second": 2110.967,
7
- "eval_steps_per_second": 8.25,
8
- "total_flos": 5.4597445596112486e+17,
9
- "train_loss": 1.296092871571504,
10
- "train_runtime": 24664.1985,
11
- "train_samples_per_second": 912.253,
12
- "train_steps_per_second": 3.564
13
  }
 
1
  {
2
  "epoch": 5.0,
3
  "eval_accuracy": 0.723616,
4
+ "eval_loss": 1.100019931793213,
5
+ "eval_runtime": 831.0467,
6
+ "eval_samples_per_second": 300.825,
7
+ "eval_steps_per_second": 1.176,
8
+ "total_flos": 1.93274424e+18,
9
+ "train_loss": 0.9357909288237652,
10
+ "train_runtime": 45635.435,
11
+ "train_samples_per_second": 493.038,
12
+ "train_steps_per_second": 1.926
13
  }
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "architectures": [
3
  "MobileViTForImageClassification"
4
  ],
@@ -24,104 +25,7 @@
24
  "id2label": {
25
  "0": "aircraft carrier",
26
  "1": "airplane",
27
- "2": "alarm clock",
28
- "3": "ambulance",
29
- "4": "angel",
30
- "5": "animal migration",
31
- "6": "ant",
32
- "7": "anvil",
33
- "8": "apple",
34
- "9": "arm",
35
  "10": "asparagus",
36
- "11": "axe",
37
- "12": "backpack",
38
- "13": "banana",
39
- "14": "bandage",
40
- "15": "barn",
41
- "16": "baseball bat",
42
- "17": "baseball",
43
- "18": "basket",
44
- "19": "basketball",
45
- "20": "bat",
46
- "21": "bathtub",
47
- "22": "beach",
48
- "23": "bear",
49
- "24": "beard",
50
- "25": "bed",
51
- "26": "bee",
52
- "27": "belt",
53
- "28": "bench",
54
- "29": "bicycle",
55
- "30": "binoculars",
56
- "31": "bird",
57
- "32": "birthday cake",
58
- "33": "blackberry",
59
- "34": "blueberry",
60
- "35": "book",
61
- "36": "boomerang",
62
- "37": "bottlecap",
63
- "38": "bowtie",
64
- "39": "bracelet",
65
- "40": "brain",
66
- "41": "bread",
67
- "42": "bridge",
68
- "43": "broccoli",
69
- "44": "broom",
70
- "45": "bucket",
71
- "46": "bulldozer",
72
- "47": "bus",
73
- "48": "bush",
74
- "49": "butterfly",
75
- "50": "cactus",
76
- "51": "cake",
77
- "52": "calculator",
78
- "53": "calendar",
79
- "54": "camel",
80
- "55": "camera",
81
- "56": "camouflage",
82
- "57": "campfire",
83
- "58": "candle",
84
- "59": "cannon",
85
- "60": "canoe",
86
- "61": "car",
87
- "62": "carrot",
88
- "63": "castle",
89
- "64": "cat",
90
- "65": "ceiling fan",
91
- "66": "cell phone",
92
- "67": "cello",
93
- "68": "chair",
94
- "69": "chandelier",
95
- "70": "church",
96
- "71": "circle",
97
- "72": "clarinet",
98
- "73": "clock",
99
- "74": "cloud",
100
- "75": "coffee cup",
101
- "76": "compass",
102
- "77": "computer",
103
- "78": "cookie",
104
- "79": "cooler",
105
- "80": "couch",
106
- "81": "cow",
107
- "82": "crab",
108
- "83": "crayon",
109
- "84": "crocodile",
110
- "85": "crown",
111
- "86": "cruise ship",
112
- "87": "cup",
113
- "88": "diamond",
114
- "89": "dishwasher",
115
- "90": "diving board",
116
- "91": "dog",
117
- "92": "dolphin",
118
- "93": "donut",
119
- "94": "door",
120
- "95": "dragon",
121
- "96": "dresser",
122
- "97": "drill",
123
- "98": "drums",
124
- "99": "duck",
125
  "100": "dumbbell",
126
  "101": "ear",
127
  "102": "elbow",
@@ -132,6 +36,7 @@
132
  "107": "eyeglasses",
133
  "108": "face",
134
  "109": "fan",
 
135
  "110": "feather",
136
  "111": "fence",
137
  "112": "finger",
@@ -142,6 +47,7 @@
142
  "117": "flamingo",
143
  "118": "flashlight",
144
  "119": "flip flops",
 
145
  "120": "floor lamp",
146
  "121": "flower",
147
  "122": "flying saucer",
@@ -152,6 +58,7 @@
152
  "127": "garden hose",
153
  "128": "garden",
154
  "129": "giraffe",
 
155
  "130": "goatee",
156
  "131": "golf club",
157
  "132": "grapes",
@@ -162,6 +69,7 @@
162
  "137": "hand",
163
  "138": "harp",
164
  "139": "hat",
 
165
  "140": "headphones",
166
  "141": "hedgehog",
167
  "142": "helicopter",
@@ -172,6 +80,7 @@
172
  "147": "horse",
173
  "148": "hospital",
174
  "149": "hot air balloon",
 
175
  "150": "hot dog",
176
  "151": "hot tub",
177
  "152": "hourglass",
@@ -182,6 +91,7 @@
182
  "157": "jacket",
183
  "158": "jail",
184
  "159": "kangaroo",
 
185
  "160": "key",
186
  "161": "keyboard",
187
  "162": "knee",
@@ -192,6 +102,7 @@
192
  "167": "leaf",
193
  "168": "leg",
194
  "169": "light bulb",
 
195
  "170": "lighter",
196
  "171": "lighthouse",
197
  "172": "lightning",
@@ -202,6 +113,7 @@
202
  "177": "lollipop",
203
  "178": "mailbox",
204
  "179": "map",
 
205
  "180": "marker",
206
  "181": "matches",
207
  "182": "megaphone",
@@ -212,6 +124,7 @@
212
  "187": "moon",
213
  "188": "mosquito",
214
  "189": "motorbike",
 
215
  "190": "mountain",
216
  "191": "mouse",
217
  "192": "moustache",
@@ -222,6 +135,8 @@
222
  "197": "necklace",
223
  "198": "nose",
224
  "199": "ocean",
 
 
225
  "200": "octagon",
226
  "201": "octopus",
227
  "202": "onion",
@@ -232,6 +147,7 @@
232
  "207": "palm tree",
233
  "208": "panda",
234
  "209": "pants",
 
235
  "210": "paper clip",
236
  "211": "parachute",
237
  "212": "parrot",
@@ -242,6 +158,7 @@
242
  "217": "pencil",
243
  "218": "penguin",
244
  "219": "piano",
 
245
  "220": "pickup truck",
246
  "221": "picture frame",
247
  "222": "pig",
@@ -252,6 +169,7 @@
252
  "227": "police car",
253
  "228": "pond",
254
  "229": "pool",
 
255
  "230": "popsicle",
256
  "231": "postcard",
257
  "232": "potato",
@@ -262,6 +180,7 @@
262
  "237": "radio",
263
  "238": "rain",
264
  "239": "rainbow",
 
265
  "240": "rake",
266
  "241": "remote control",
267
  "242": "rhinoceros",
@@ -272,6 +191,7 @@
272
  "247": "sailboat",
273
  "248": "sandwich",
274
  "249": "saw",
 
275
  "250": "saxophone",
276
  "251": "school bus",
277
  "252": "scissors",
@@ -282,6 +202,7 @@
282
  "257": "shark",
283
  "258": "sheep",
284
  "259": "shoe",
 
285
  "260": "shorts",
286
  "261": "shovel",
287
  "262": "sink",
@@ -292,6 +213,7 @@
292
  "267": "smiley face",
293
  "268": "snail",
294
  "269": "snake",
 
295
  "270": "snorkel",
296
  "271": "snowflake",
297
  "272": "snowman",
@@ -302,6 +224,7 @@
302
  "277": "spoon",
303
  "278": "spreadsheet",
304
  "279": "square",
 
305
  "280": "squiggle",
306
  "281": "squirrel",
307
  "282": "stairs",
@@ -312,6 +235,7 @@
312
  "287": "stitches",
313
  "288": "stop sign",
314
  "289": "stove",
 
315
  "290": "strawberry",
316
  "291": "streetlight",
317
  "292": "string bean",
@@ -322,6 +246,8 @@
322
  "297": "sweater",
323
  "298": "swing set",
324
  "299": "sword",
 
 
325
  "300": "syringe",
326
  "301": "t-shirt",
327
  "302": "table",
@@ -332,6 +258,7 @@
332
  "307": "tennis racquet",
333
  "308": "tent",
334
  "309": "The Eiffel Tower",
 
335
  "310": "The Great Wall of China",
336
  "311": "The Mona Lisa",
337
  "312": "tiger",
@@ -342,6 +269,7 @@
342
  "317": "toothbrush",
343
  "318": "toothpaste",
344
  "319": "tornado",
 
345
  "320": "tractor",
346
  "321": "traffic light",
347
  "322": "train",
@@ -352,6 +280,7 @@
352
  "327": "trumpet",
353
  "328": "umbrella",
354
  "329": "underwear",
 
355
  "330": "van",
356
  "331": "vase",
357
  "332": "violin",
@@ -362,11 +291,83 @@
362
  "337": "wheel",
363
  "338": "windmill",
364
  "339": "wine bottle",
 
365
  "340": "wine glass",
366
  "341": "wristwatch",
367
  "342": "yoga",
368
  "343": "zebra",
369
- "344": "zigzag"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  },
371
  "ignore_mismatched_sizes": true,
372
  "image_size": 64,
@@ -740,5 +741,5 @@
740
  "qkv_bias": true,
741
  "semantic_loss_ignore_index": 255,
742
  "torch_dtype": "float32",
743
- "transformers_version": "4.40.0"
744
  }
 
1
  {
2
+ "_name_or_path": "laszlokiss27/results",
3
  "architectures": [
4
  "MobileViTForImageClassification"
5
  ],
 
25
  "id2label": {
26
  "0": "aircraft carrier",
27
  "1": "airplane",
 
 
 
 
 
 
 
 
28
  "10": "asparagus",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  "100": "dumbbell",
30
  "101": "ear",
31
  "102": "elbow",
 
36
  "107": "eyeglasses",
37
  "108": "face",
38
  "109": "fan",
39
+ "11": "axe",
40
  "110": "feather",
41
  "111": "fence",
42
  "112": "finger",
 
47
  "117": "flamingo",
48
  "118": "flashlight",
49
  "119": "flip flops",
50
+ "12": "backpack",
51
  "120": "floor lamp",
52
  "121": "flower",
53
  "122": "flying saucer",
 
58
  "127": "garden hose",
59
  "128": "garden",
60
  "129": "giraffe",
61
+ "13": "banana",
62
  "130": "goatee",
63
  "131": "golf club",
64
  "132": "grapes",
 
69
  "137": "hand",
70
  "138": "harp",
71
  "139": "hat",
72
+ "14": "bandage",
73
  "140": "headphones",
74
  "141": "hedgehog",
75
  "142": "helicopter",
 
80
  "147": "horse",
81
  "148": "hospital",
82
  "149": "hot air balloon",
83
+ "15": "barn",
84
  "150": "hot dog",
85
  "151": "hot tub",
86
  "152": "hourglass",
 
91
  "157": "jacket",
92
  "158": "jail",
93
  "159": "kangaroo",
94
+ "16": "baseball bat",
95
  "160": "key",
96
  "161": "keyboard",
97
  "162": "knee",
 
102
  "167": "leaf",
103
  "168": "leg",
104
  "169": "light bulb",
105
+ "17": "baseball",
106
  "170": "lighter",
107
  "171": "lighthouse",
108
  "172": "lightning",
 
113
  "177": "lollipop",
114
  "178": "mailbox",
115
  "179": "map",
116
+ "18": "basket",
117
  "180": "marker",
118
  "181": "matches",
119
  "182": "megaphone",
 
124
  "187": "moon",
125
  "188": "mosquito",
126
  "189": "motorbike",
127
+ "19": "basketball",
128
  "190": "mountain",
129
  "191": "mouse",
130
  "192": "moustache",
 
135
  "197": "necklace",
136
  "198": "nose",
137
  "199": "ocean",
138
+ "2": "alarm clock",
139
+ "20": "bat",
140
  "200": "octagon",
141
  "201": "octopus",
142
  "202": "onion",
 
147
  "207": "palm tree",
148
  "208": "panda",
149
  "209": "pants",
150
+ "21": "bathtub",
151
  "210": "paper clip",
152
  "211": "parachute",
153
  "212": "parrot",
 
158
  "217": "pencil",
159
  "218": "penguin",
160
  "219": "piano",
161
+ "22": "beach",
162
  "220": "pickup truck",
163
  "221": "picture frame",
164
  "222": "pig",
 
169
  "227": "police car",
170
  "228": "pond",
171
  "229": "pool",
172
+ "23": "bear",
173
  "230": "popsicle",
174
  "231": "postcard",
175
  "232": "potato",
 
180
  "237": "radio",
181
  "238": "rain",
182
  "239": "rainbow",
183
+ "24": "beard",
184
  "240": "rake",
185
  "241": "remote control",
186
  "242": "rhinoceros",
 
191
  "247": "sailboat",
192
  "248": "sandwich",
193
  "249": "saw",
194
+ "25": "bed",
195
  "250": "saxophone",
196
  "251": "school bus",
197
  "252": "scissors",
 
202
  "257": "shark",
203
  "258": "sheep",
204
  "259": "shoe",
205
+ "26": "bee",
206
  "260": "shorts",
207
  "261": "shovel",
208
  "262": "sink",
 
213
  "267": "smiley face",
214
  "268": "snail",
215
  "269": "snake",
216
+ "27": "belt",
217
  "270": "snorkel",
218
  "271": "snowflake",
219
  "272": "snowman",
 
224
  "277": "spoon",
225
  "278": "spreadsheet",
226
  "279": "square",
227
+ "28": "bench",
228
  "280": "squiggle",
229
  "281": "squirrel",
230
  "282": "stairs",
 
235
  "287": "stitches",
236
  "288": "stop sign",
237
  "289": "stove",
238
+ "29": "bicycle",
239
  "290": "strawberry",
240
  "291": "streetlight",
241
  "292": "string bean",
 
246
  "297": "sweater",
247
  "298": "swing set",
248
  "299": "sword",
249
+ "3": "ambulance",
250
+ "30": "binoculars",
251
  "300": "syringe",
252
  "301": "t-shirt",
253
  "302": "table",
 
258
  "307": "tennis racquet",
259
  "308": "tent",
260
  "309": "The Eiffel Tower",
261
+ "31": "bird",
262
  "310": "The Great Wall of China",
263
  "311": "The Mona Lisa",
264
  "312": "tiger",
 
269
  "317": "toothbrush",
270
  "318": "toothpaste",
271
  "319": "tornado",
272
+ "32": "birthday cake",
273
  "320": "tractor",
274
  "321": "traffic light",
275
  "322": "train",
 
280
  "327": "trumpet",
281
  "328": "umbrella",
282
  "329": "underwear",
283
+ "33": "blackberry",
284
  "330": "van",
285
  "331": "vase",
286
  "332": "violin",
 
291
  "337": "wheel",
292
  "338": "windmill",
293
  "339": "wine bottle",
294
+ "34": "blueberry",
295
  "340": "wine glass",
296
  "341": "wristwatch",
297
  "342": "yoga",
298
  "343": "zebra",
299
+ "344": "zigzag",
300
+ "35": "book",
301
+ "36": "boomerang",
302
+ "37": "bottlecap",
303
+ "38": "bowtie",
304
+ "39": "bracelet",
305
+ "4": "angel",
306
+ "40": "brain",
307
+ "41": "bread",
308
+ "42": "bridge",
309
+ "43": "broccoli",
310
+ "44": "broom",
311
+ "45": "bucket",
312
+ "46": "bulldozer",
313
+ "47": "bus",
314
+ "48": "bush",
315
+ "49": "butterfly",
316
+ "5": "animal migration",
317
+ "50": "cactus",
318
+ "51": "cake",
319
+ "52": "calculator",
320
+ "53": "calendar",
321
+ "54": "camel",
322
+ "55": "camera",
323
+ "56": "camouflage",
324
+ "57": "campfire",
325
+ "58": "candle",
326
+ "59": "cannon",
327
+ "6": "ant",
328
+ "60": "canoe",
329
+ "61": "car",
330
+ "62": "carrot",
331
+ "63": "castle",
332
+ "64": "cat",
333
+ "65": "ceiling fan",
334
+ "66": "cell phone",
335
+ "67": "cello",
336
+ "68": "chair",
337
+ "69": "chandelier",
338
+ "7": "anvil",
339
+ "70": "church",
340
+ "71": "circle",
341
+ "72": "clarinet",
342
+ "73": "clock",
343
+ "74": "cloud",
344
+ "75": "coffee cup",
345
+ "76": "compass",
346
+ "77": "computer",
347
+ "78": "cookie",
348
+ "79": "cooler",
349
+ "8": "apple",
350
+ "80": "couch",
351
+ "81": "cow",
352
+ "82": "crab",
353
+ "83": "crayon",
354
+ "84": "crocodile",
355
+ "85": "crown",
356
+ "86": "cruise ship",
357
+ "87": "cup",
358
+ "88": "diamond",
359
+ "89": "dishwasher",
360
+ "9": "arm",
361
+ "90": "diving board",
362
+ "91": "dog",
363
+ "92": "dolphin",
364
+ "93": "donut",
365
+ "94": "door",
366
+ "95": "dragon",
367
+ "96": "dresser",
368
+ "97": "drill",
369
+ "98": "drums",
370
+ "99": "duck"
371
  },
372
  "ignore_mismatched_sizes": true,
373
  "image_size": 64,
 
741
  "qkv_bias": true,
742
  "semantic_loss_ignore_index": 255,
743
  "torch_dtype": "float32",
744
+ "transformers_version": "4.33.2"
745
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d35f61619e3ffc371ead68851aa0232c0fe6cc18ab6a5362d0e589c58eb59a19
3
- size 20730036
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28184813e695da9074eb277c8000be311e12d53352ec1ed2b6b268532b81b323
3
+ size 18360744
preprocessor_config.json CHANGED
@@ -1,19 +1,4 @@
1
  {
2
- "_valid_processor_keys": [
3
- "images",
4
- "segmentation_maps",
5
- "do_resize",
6
- "size",
7
- "resample",
8
- "do_rescale",
9
- "rescale_factor",
10
- "do_center_crop",
11
- "crop_size",
12
- "do_flip_channel_order",
13
- "return_tensors",
14
- "data_format",
15
- "input_data_format"
16
- ],
17
  "crop_size": {
18
  "height": 28,
19
  "width": 28
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "crop_size": {
3
  "height": 28,
4
  "width": 28
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:579e9e6dde3a9f639be92e42b626a108dbd9c7b2f4ca7bee37623bca653d4abb
3
+ size 20803638
test_results.json CHANGED
@@ -1,8 +1,7 @@
1
  {
2
- "epoch": 5.0,
3
  "eval_accuracy": 0.723616,
4
- "eval_loss": 1.100016713142395,
5
- "eval_runtime": 118.4292,
6
- "eval_samples_per_second": 2110.967,
7
- "eval_steps_per_second": 8.25
8
  }
 
1
  {
 
2
  "eval_accuracy": 0.723616,
3
+ "eval_loss": 1.100019931793213,
4
+ "eval_runtime": 831.0467,
5
+ "eval_samples_per_second": 300.825,
6
+ "eval_steps_per_second": 1.176
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 5.4597445596112486e+17,
4
- "train_loss": 1.296092871571504,
5
- "train_runtime": 24664.1985,
6
- "train_samples_per_second": 912.253,
7
- "train_steps_per_second": 3.564
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 1.93274424e+18,
4
+ "train_loss": 0.9357909288237652,
5
+ "train_runtime": 45635.435,
6
+ "train_samples_per_second": 493.038,
7
+ "train_steps_per_second": 1.926
8
  }
trainer_state.json CHANGED
@@ -10,774 +10,774 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05688605722737357,
13
- "grad_norm": 4.992003440856934,
14
- "learning_rate": 0.0007909346379202458,
15
- "loss": 3.165,
16
  "step": 1000
17
  },
18
  {
19
  "epoch": 0.11377211445474714,
20
- "grad_norm": 3.379929542541504,
21
- "learning_rate": 0.0007818419705330224,
22
- "loss": 2.2365,
23
  "step": 2000
24
  },
25
  {
26
  "epoch": 0.17065817168212072,
27
- "grad_norm": 2.9609272480010986,
28
- "learning_rate": 0.0007727402013766426,
29
- "loss": 1.9869,
30
  "step": 3000
31
  },
32
  {
33
  "epoch": 0.22754422890949427,
34
- "grad_norm": 2.5321693420410156,
35
- "learning_rate": 0.0007636384322202628,
36
- "loss": 1.8608,
37
  "step": 4000
38
  },
39
  {
40
  "epoch": 0.2844302861368678,
41
- "grad_norm": 2.3447465896606445,
42
- "learning_rate": 0.0007545366630638831,
43
- "loss": 1.7698,
44
  "step": 5000
45
  },
46
  {
47
  "epoch": 0.2844302861368678,
48
- "eval_accuracy": 0.580204,
49
- "eval_loss": 1.7123581171035767,
50
- "eval_runtime": 119.0147,
51
- "eval_samples_per_second": 2100.581,
52
- "eval_steps_per_second": 8.209,
53
  "step": 5000
54
  },
55
  {
56
  "epoch": 0.34131634336424144,
57
- "grad_norm": 2.4563510417938232,
58
- "learning_rate": 0.0007454348939075033,
59
- "loss": 1.7017,
60
  "step": 6000
61
  },
62
  {
63
  "epoch": 0.398202400591615,
64
- "grad_norm": 2.0900540351867676,
65
- "learning_rate": 0.0007363331247511235,
66
- "loss": 1.6446,
67
  "step": 7000
68
  },
69
  {
70
  "epoch": 0.45508845781898855,
71
- "grad_norm": 2.0227878093719482,
72
- "learning_rate": 0.0007272404573639001,
73
- "loss": 1.6037,
74
  "step": 8000
75
  },
76
  {
77
  "epoch": 0.5119745150463622,
78
- "grad_norm": 1.9194427728652954,
79
- "learning_rate": 0.0007181477899766768,
80
- "loss": 1.5706,
81
  "step": 9000
82
  },
83
  {
84
  "epoch": 0.5688605722737357,
85
- "grad_norm": 2.123502254486084,
86
- "learning_rate": 0.0007090642243586097,
87
- "loss": 1.5445,
88
  "step": 10000
89
  },
90
  {
91
  "epoch": 0.5688605722737357,
92
- "eval_accuracy": 0.626964,
93
- "eval_loss": 1.5020641088485718,
94
- "eval_runtime": 118.3365,
95
- "eval_samples_per_second": 2112.62,
96
- "eval_steps_per_second": 8.256,
97
  "step": 10000
98
  },
99
  {
100
  "epoch": 0.6257466295011093,
101
- "grad_norm": 2.0052566528320312,
102
- "learning_rate": 0.00069996245520223,
103
- "loss": 1.525,
104
  "step": 11000
105
  },
106
  {
107
  "epoch": 0.6826326867284829,
108
- "grad_norm": 1.8293088674545288,
109
- "learning_rate": 0.0006908606860458502,
110
- "loss": 1.4918,
111
  "step": 12000
112
  },
113
  {
114
  "epoch": 0.7395187439558564,
115
- "grad_norm": 1.8381669521331787,
116
- "learning_rate": 0.0006817589168894705,
117
- "loss": 1.4769,
118
  "step": 13000
119
  },
120
  {
121
  "epoch": 0.79640480118323,
122
- "grad_norm": 1.7285594940185547,
123
- "learning_rate": 0.0006726571477330906,
124
- "loss": 1.4607,
125
  "step": 14000
126
  },
127
  {
128
  "epoch": 0.8532908584106036,
129
- "grad_norm": 1.7327940464019775,
130
- "learning_rate": 0.0006635553785767108,
131
- "loss": 1.439,
132
  "step": 15000
133
  },
134
  {
135
  "epoch": 0.8532908584106036,
136
- "eval_accuracy": 0.652024,
137
- "eval_loss": 1.3989214897155762,
138
- "eval_runtime": 118.43,
139
- "eval_samples_per_second": 2110.951,
140
- "eval_steps_per_second": 8.25,
141
  "step": 15000
142
  },
143
  {
144
  "epoch": 0.9101769156379771,
145
- "grad_norm": 1.7836127281188965,
146
- "learning_rate": 0.0006544536094203311,
147
- "loss": 1.429,
148
  "step": 16000
149
  },
150
  {
151
  "epoch": 0.9670629728653507,
152
- "grad_norm": 1.707483172416687,
153
- "learning_rate": 0.0006453609420331078,
154
- "loss": 1.4131,
155
  "step": 17000
156
  },
157
  {
158
  "epoch": 1.0239490300927243,
159
- "grad_norm": 1.5185352563858032,
160
- "learning_rate": 0.0006362682746458843,
161
- "loss": 1.3919,
162
  "step": 18000
163
  },
164
  {
165
  "epoch": 1.0808350873200978,
166
- "grad_norm": 1.5501619577407837,
167
- "learning_rate": 0.0006271665054895046,
168
- "loss": 1.3684,
169
  "step": 19000
170
  },
171
  {
172
  "epoch": 1.1377211445474713,
173
- "grad_norm": 1.6240971088409424,
174
- "learning_rate": 0.0006180738381022812,
175
- "loss": 1.3625,
176
  "step": 20000
177
  },
178
  {
179
  "epoch": 1.1377211445474713,
180
- "eval_accuracy": 0.664736,
181
- "eval_loss": 1.344669222831726,
182
- "eval_runtime": 118.5872,
183
- "eval_samples_per_second": 2108.154,
184
- "eval_steps_per_second": 8.239,
185
  "step": 20000
186
  },
187
  {
188
  "epoch": 1.194607201774845,
189
- "grad_norm": 1.4840081930160522,
190
- "learning_rate": 0.0006089811707150577,
191
- "loss": 1.3516,
192
  "step": 21000
193
  },
194
  {
195
  "epoch": 1.2514932590022185,
196
- "grad_norm": 1.844332218170166,
197
- "learning_rate": 0.000599879401558678,
198
- "loss": 1.3509,
199
  "step": 22000
200
  },
201
  {
202
  "epoch": 1.3083793162295922,
203
- "grad_norm": 1.6865944862365723,
204
- "learning_rate": 0.0005907776324022982,
205
- "loss": 1.3429,
206
  "step": 23000
207
  },
208
  {
209
  "epoch": 1.3652653734569657,
210
- "grad_norm": 1.7245532274246216,
211
- "learning_rate": 0.0005816758632459185,
212
- "loss": 1.3377,
213
  "step": 24000
214
  },
215
  {
216
  "epoch": 1.4221514306843392,
217
- "grad_norm": 1.4441121816635132,
218
- "learning_rate": 0.0005725740940895386,
219
- "loss": 1.3192,
220
  "step": 25000
221
  },
222
  {
223
  "epoch": 1.4221514306843392,
224
- "eval_accuracy": 0.675628,
225
- "eval_loss": 1.296504020690918,
226
- "eval_runtime": 118.6208,
227
- "eval_samples_per_second": 2107.555,
228
- "eval_steps_per_second": 8.236,
229
  "step": 25000
230
  },
231
  {
232
  "epoch": 1.4790374879117127,
233
- "grad_norm": 1.5026684999465942,
234
- "learning_rate": 0.0005634723249331589,
235
- "loss": 1.3156,
236
  "step": 26000
237
  },
238
  {
239
  "epoch": 1.5359235451390862,
240
- "grad_norm": 1.373693823814392,
241
- "learning_rate": 0.0005543705557767792,
242
- "loss": 1.3152,
243
  "step": 27000
244
  },
245
  {
246
  "epoch": 1.59280960236646,
247
- "grad_norm": 1.4744044542312622,
248
- "learning_rate": 0.0005452687866203994,
249
- "loss": 1.307,
250
  "step": 28000
251
  },
252
  {
253
  "epoch": 1.6496956595938337,
254
- "grad_norm": 1.45868718624115,
255
- "learning_rate": 0.0005361670174640196,
256
- "loss": 1.2985,
257
  "step": 29000
258
  },
259
  {
260
  "epoch": 1.7065817168212072,
261
- "grad_norm": 1.4872881174087524,
262
- "learning_rate": 0.0005270652483076398,
263
- "loss": 1.3,
264
  "step": 30000
265
  },
266
  {
267
  "epoch": 1.7065817168212072,
268
- "eval_accuracy": 0.679452,
269
- "eval_loss": 1.2787636518478394,
270
- "eval_runtime": 118.4552,
271
- "eval_samples_per_second": 2110.502,
272
- "eval_steps_per_second": 8.248,
273
  "step": 30000
274
  },
275
  {
276
  "epoch": 1.7634677740485807,
277
- "grad_norm": 1.4146182537078857,
278
- "learning_rate": 0.0005179725809204165,
279
- "loss": 1.2917,
280
  "step": 31000
281
  },
282
  {
283
  "epoch": 1.8203538312759542,
284
- "grad_norm": 1.511470913887024,
285
- "learning_rate": 0.000508879913533193,
286
- "loss": 1.2894,
287
  "step": 32000
288
  },
289
  {
290
  "epoch": 1.8772398885033277,
291
- "grad_norm": 1.3366495370864868,
292
- "learning_rate": 0.0004997781443768132,
293
- "loss": 1.2836,
294
  "step": 33000
295
  },
296
  {
297
  "epoch": 1.9341259457307014,
298
- "grad_norm": 1.3103934526443481,
299
- "learning_rate": 0.0004906763752204335,
300
- "loss": 1.2712,
301
  "step": 34000
302
  },
303
  {
304
  "epoch": 1.9910120029580751,
305
- "grad_norm": 1.6586687564849854,
306
- "learning_rate": 0.00048157460606405373,
307
- "loss": 1.2695,
308
  "step": 35000
309
  },
310
  {
311
  "epoch": 1.9910120029580751,
312
- "eval_accuracy": 0.69002,
313
- "eval_loss": 1.2347031831741333,
314
- "eval_runtime": 118.4567,
315
- "eval_samples_per_second": 2110.476,
316
- "eval_steps_per_second": 8.248,
317
  "step": 35000
318
  },
319
  {
320
  "epoch": 2.0478980601854486,
321
- "grad_norm": 1.3782742023468018,
322
- "learning_rate": 0.0004724819386768303,
323
- "loss": 1.242,
324
  "step": 36000
325
  },
326
  {
327
  "epoch": 2.104784117412822,
328
- "grad_norm": 1.4477494955062866,
329
- "learning_rate": 0.0004633983730587633,
330
- "loss": 1.2336,
331
  "step": 37000
332
  },
333
  {
334
  "epoch": 2.1616701746401956,
335
- "grad_norm": 1.2619287967681885,
336
- "learning_rate": 0.0004543057056715399,
337
- "loss": 1.2275,
338
  "step": 38000
339
  },
340
  {
341
  "epoch": 2.218556231867569,
342
- "grad_norm": 1.4561299085617065,
343
- "learning_rate": 0.00044520393651516015,
344
- "loss": 1.2187,
345
  "step": 39000
346
  },
347
  {
348
  "epoch": 2.2754422890949426,
349
- "grad_norm": 1.4557944536209106,
350
- "learning_rate": 0.0004361021673587804,
351
- "loss": 1.2297,
352
  "step": 40000
353
  },
354
  {
355
  "epoch": 2.2754422890949426,
356
- "eval_accuracy": 0.695532,
357
- "eval_loss": 1.215972661972046,
358
- "eval_runtime": 118.5665,
359
- "eval_samples_per_second": 2108.522,
360
- "eval_steps_per_second": 8.24,
361
  "step": 40000
362
  },
363
  {
364
  "epoch": 2.3323283463223166,
365
- "grad_norm": 1.6357080936431885,
366
- "learning_rate": 0.00042700039820240066,
367
- "loss": 1.2232,
368
  "step": 41000
369
  },
370
  {
371
  "epoch": 2.38921440354969,
372
- "grad_norm": 1.4996728897094727,
373
- "learning_rate": 0.00041789862904602083,
374
- "loss": 1.2221,
375
  "step": 42000
376
  },
377
  {
378
  "epoch": 2.4461004607770636,
379
- "grad_norm": 1.3550739288330078,
380
- "learning_rate": 0.0004088059616587974,
381
- "loss": 1.2131,
382
  "step": 43000
383
  },
384
  {
385
  "epoch": 2.502986518004437,
386
- "grad_norm": 1.3799809217453003,
387
- "learning_rate": 0.00039970419250241767,
388
- "loss": 1.2118,
389
  "step": 44000
390
  },
391
  {
392
  "epoch": 2.5598725752318106,
393
- "grad_norm": 1.3335360288619995,
394
- "learning_rate": 0.0003906024233460379,
395
- "loss": 1.2144,
396
  "step": 45000
397
  },
398
  {
399
  "epoch": 2.5598725752318106,
400
- "eval_accuracy": 0.702096,
401
- "eval_loss": 1.1893980503082275,
402
- "eval_runtime": 118.534,
403
- "eval_samples_per_second": 2109.1,
404
- "eval_steps_per_second": 8.242,
405
  "step": 45000
406
  },
407
  {
408
  "epoch": 2.6167586324591845,
409
- "grad_norm": 1.405167818069458,
410
- "learning_rate": 0.0003815097559588145,
411
- "loss": 1.2104,
412
  "step": 46000
413
  },
414
  {
415
  "epoch": 2.673644689686558,
416
- "grad_norm": 1.3935025930404663,
417
- "learning_rate": 0.00037240798680243474,
418
- "loss": 1.2059,
419
  "step": 47000
420
  },
421
  {
422
  "epoch": 2.7305307469139315,
423
- "grad_norm": 1.5090906620025635,
424
- "learning_rate": 0.00036330621764605497,
425
- "loss": 1.2026,
426
  "step": 48000
427
  },
428
  {
429
  "epoch": 2.787416804141305,
430
- "grad_norm": 1.4666266441345215,
431
- "learning_rate": 0.0003542044484896752,
432
- "loss": 1.2032,
433
  "step": 49000
434
  },
435
  {
436
  "epoch": 2.8443028613686785,
437
- "grad_norm": 1.4617928266525269,
438
- "learning_rate": 0.0003451026793332954,
439
- "loss": 1.1945,
440
  "step": 50000
441
  },
442
  {
443
  "epoch": 2.8443028613686785,
444
- "eval_accuracy": 0.705848,
445
- "eval_loss": 1.1734095811843872,
446
- "eval_runtime": 118.5433,
447
- "eval_samples_per_second": 2108.934,
448
- "eval_steps_per_second": 8.242,
449
  "step": 50000
450
  },
451
  {
452
  "epoch": 2.901188918596052,
453
- "grad_norm": 1.4179085493087769,
454
- "learning_rate": 0.00033601001194607204,
455
- "loss": 1.1888,
456
  "step": 51000
457
  },
458
  {
459
  "epoch": 2.9580749758234255,
460
- "grad_norm": 1.4129358530044556,
461
- "learning_rate": 0.00032691734455884866,
462
- "loss": 1.1952,
463
  "step": 52000
464
  },
465
  {
466
  "epoch": 3.0149610330507994,
467
- "grad_norm": 1.3465383052825928,
468
- "learning_rate": 0.0003178155754024689,
469
- "loss": 1.1782,
470
  "step": 53000
471
  },
472
  {
473
  "epoch": 3.071847090278173,
474
- "grad_norm": 1.4815254211425781,
475
- "learning_rate": 0.00030871380624608905,
476
- "loss": 1.1572,
477
  "step": 54000
478
  },
479
  {
480
  "epoch": 3.1287331475055464,
481
- "grad_norm": 1.472550868988037,
482
- "learning_rate": 0.00029961203708970933,
483
- "loss": 1.1551,
484
  "step": 55000
485
  },
486
  {
487
  "epoch": 3.1287331475055464,
488
- "eval_accuracy": 0.708444,
489
- "eval_loss": 1.1611371040344238,
490
- "eval_runtime": 118.5055,
491
- "eval_samples_per_second": 2109.606,
492
- "eval_steps_per_second": 8.244,
493
  "step": 55000
494
  },
495
  {
496
  "epoch": 3.18561920473292,
497
- "grad_norm": 1.6439310312271118,
498
- "learning_rate": 0.00029051026793332956,
499
- "loss": 1.1542,
500
  "step": 56000
501
  },
502
  {
503
  "epoch": 3.2425052619602934,
504
- "grad_norm": 1.4789113998413086,
505
- "learning_rate": 0.0002814176005461062,
506
- "loss": 1.1503,
507
  "step": 57000
508
  },
509
  {
510
  "epoch": 3.299391319187667,
511
- "grad_norm": 1.2807673215866089,
512
- "learning_rate": 0.00027232493315888274,
513
- "loss": 1.1468,
514
  "step": 58000
515
  },
516
  {
517
  "epoch": 3.356277376415041,
518
- "grad_norm": 1.3815586566925049,
519
- "learning_rate": 0.000263223164002503,
520
- "loss": 1.1523,
521
  "step": 59000
522
  },
523
  {
524
  "epoch": 3.4131634336424144,
525
- "grad_norm": 1.4986367225646973,
526
- "learning_rate": 0.00025412139484612325,
527
- "loss": 1.1471,
528
  "step": 60000
529
  },
530
  {
531
  "epoch": 3.4131634336424144,
532
- "eval_accuracy": 0.7104,
533
- "eval_loss": 1.152265191078186,
534
- "eval_runtime": 118.4796,
535
- "eval_samples_per_second": 2110.067,
536
- "eval_steps_per_second": 8.246,
537
  "step": 60000
538
  },
539
  {
540
  "epoch": 3.470049490869788,
541
- "grad_norm": 1.3522818088531494,
542
- "learning_rate": 0.00024501962568974347,
543
- "loss": 1.1493,
544
  "step": 61000
545
  },
546
  {
547
  "epoch": 3.5269355480971614,
548
- "grad_norm": 1.3799934387207031,
549
- "learning_rate": 0.0002359178565333637,
550
- "loss": 1.1534,
551
  "step": 62000
552
  },
553
  {
554
  "epoch": 3.583821605324535,
555
- "grad_norm": 1.4971522092819214,
556
- "learning_rate": 0.00022681608737698392,
557
- "loss": 1.1475,
558
  "step": 63000
559
  },
560
  {
561
  "epoch": 3.6407076625519084,
562
- "grad_norm": 1.4308714866638184,
563
- "learning_rate": 0.00021771431822060415,
564
- "loss": 1.1478,
565
  "step": 64000
566
  },
567
  {
568
  "epoch": 3.697593719779282,
569
- "grad_norm": 1.6222587823867798,
570
- "learning_rate": 0.00020861254906422437,
571
- "loss": 1.1301,
572
  "step": 65000
573
  },
574
  {
575
  "epoch": 3.697593719779282,
576
- "eval_accuracy": 0.715576,
577
- "eval_loss": 1.1314274072647095,
578
- "eval_runtime": 118.553,
579
- "eval_samples_per_second": 2108.761,
580
- "eval_steps_per_second": 8.241,
581
  "step": 65000
582
  },
583
  {
584
  "epoch": 3.754479777006656,
585
- "grad_norm": 1.3719106912612915,
586
- "learning_rate": 0.00019951988167700096,
587
- "loss": 1.1364,
588
  "step": 66000
589
  },
590
  {
591
  "epoch": 3.8113658342340293,
592
- "grad_norm": 1.5775474309921265,
593
- "learning_rate": 0.00019042721428977758,
594
- "loss": 1.1268,
595
  "step": 67000
596
  },
597
  {
598
  "epoch": 3.868251891461403,
599
- "grad_norm": 1.4434072971343994,
600
- "learning_rate": 0.0001813254451333978,
601
- "loss": 1.1395,
602
  "step": 68000
603
  },
604
  {
605
  "epoch": 3.9251379486887763,
606
- "grad_norm": 1.6004397869110107,
607
- "learning_rate": 0.00017222367597701806,
608
- "loss": 1.1324,
609
  "step": 69000
610
  },
611
  {
612
  "epoch": 3.98202400591615,
613
- "grad_norm": 1.4771836996078491,
614
- "learning_rate": 0.00016312190682063826,
615
- "loss": 1.1286,
616
  "step": 70000
617
  },
618
  {
619
  "epoch": 3.98202400591615,
620
- "eval_accuracy": 0.718576,
621
- "eval_loss": 1.1219959259033203,
622
- "eval_runtime": 118.4895,
623
- "eval_samples_per_second": 2109.892,
624
- "eval_steps_per_second": 8.245,
625
  "step": 70000
626
  },
627
  {
628
  "epoch": 4.038910063143524,
629
- "grad_norm": 1.2995303869247437,
630
- "learning_rate": 0.00015402013766425848,
631
- "loss": 1.1102,
632
  "step": 71000
633
  },
634
  {
635
  "epoch": 4.095796120370897,
636
- "grad_norm": 1.5995845794677734,
637
- "learning_rate": 0.0001449183685078787,
638
- "loss": 1.1039,
639
  "step": 72000
640
  },
641
  {
642
  "epoch": 4.152682177598271,
643
- "grad_norm": 1.4186768531799316,
644
- "learning_rate": 0.00013582570112065533,
645
- "loss": 1.0986,
646
  "step": 73000
647
  },
648
  {
649
  "epoch": 4.209568234825644,
650
- "grad_norm": 1.3645439147949219,
651
- "learning_rate": 0.00012672393196427555,
652
- "loss": 1.097,
653
  "step": 74000
654
  },
655
  {
656
  "epoch": 4.266454292053018,
657
- "grad_norm": 1.3423221111297607,
658
- "learning_rate": 0.00011762216280789579,
659
- "loss": 1.0898,
660
  "step": 75000
661
  },
662
  {
663
  "epoch": 4.266454292053018,
664
- "eval_accuracy": 0.720332,
665
- "eval_loss": 1.1140097379684448,
666
- "eval_runtime": 118.5031,
667
- "eval_samples_per_second": 2109.649,
668
- "eval_steps_per_second": 8.245,
669
  "step": 75000
670
  },
671
  {
672
  "epoch": 4.323340349280391,
673
- "grad_norm": 1.4441511631011963,
674
- "learning_rate": 0.00010852949542067239,
675
- "loss": 1.0967,
676
  "step": 76000
677
  },
678
  {
679
  "epoch": 4.380226406507765,
680
- "grad_norm": 1.5282950401306152,
681
- "learning_rate": 9.942772626429264e-05,
682
- "loss": 1.1011,
683
  "step": 77000
684
  },
685
  {
686
  "epoch": 4.437112463735138,
687
- "grad_norm": 1.3688595294952393,
688
- "learning_rate": 9.033505887706923e-05,
689
- "loss": 1.0954,
690
  "step": 78000
691
  },
692
  {
693
  "epoch": 4.493998520962512,
694
- "grad_norm": 1.5577939748764038,
695
- "learning_rate": 8.123328972068947e-05,
696
- "loss": 1.0949,
697
  "step": 79000
698
  },
699
  {
700
  "epoch": 4.550884578189885,
701
- "grad_norm": 1.6534169912338257,
702
- "learning_rate": 7.21315205643097e-05,
703
- "loss": 1.093,
704
  "step": 80000
705
  },
706
  {
707
  "epoch": 4.550884578189885,
708
- "eval_accuracy": 0.723164,
709
- "eval_loss": 1.1039903163909912,
710
- "eval_runtime": 118.5054,
711
- "eval_samples_per_second": 2109.609,
712
- "eval_steps_per_second": 8.244,
713
  "step": 80000
714
  },
715
  {
716
  "epoch": 4.607770635417259,
717
- "grad_norm": 1.5630171298980713,
718
- "learning_rate": 6.302975140792992e-05,
719
- "loss": 1.0889,
720
  "step": 81000
721
  },
722
  {
723
  "epoch": 4.664656692644633,
724
- "grad_norm": 1.511986494064331,
725
- "learning_rate": 5.393708402070653e-05,
726
- "loss": 1.0893,
727
  "step": 82000
728
  },
729
  {
730
  "epoch": 4.721542749872007,
731
- "grad_norm": 1.5626702308654785,
732
- "learning_rate": 4.483531486432676e-05,
733
- "loss": 1.0857,
734
  "step": 83000
735
  },
736
  {
737
  "epoch": 4.77842880709938,
738
- "grad_norm": 1.3917585611343384,
739
- "learning_rate": 3.574264747710336e-05,
740
- "loss": 1.0879,
741
  "step": 84000
742
  },
743
  {
744
  "epoch": 4.835314864326754,
745
- "grad_norm": 1.6141693592071533,
746
- "learning_rate": 2.6649980089879973e-05,
747
- "loss": 1.0893,
748
  "step": 85000
749
  },
750
  {
751
  "epoch": 4.835314864326754,
752
- "eval_accuracy": 0.724572,
753
- "eval_loss": 1.0985814332962036,
754
- "eval_runtime": 118.5475,
755
- "eval_samples_per_second": 2108.859,
756
- "eval_steps_per_second": 8.241,
757
  "step": 85000
758
  },
759
  {
760
  "epoch": 4.892200921554127,
761
- "grad_norm": 1.6234523057937622,
762
- "learning_rate": 1.75482109335002e-05,
763
- "loss": 1.0915,
764
  "step": 86000
765
  },
766
  {
767
  "epoch": 4.949086978781501,
768
- "grad_norm": 1.462381362915039,
769
- "learning_rate": 8.446441777120428e-06,
770
- "loss": 1.0843,
771
  "step": 87000
772
  },
773
  {
774
  "epoch": 5.0,
775
  "step": 87895,
776
- "total_flos": 5.4597445596112486e+17,
777
- "train_loss": 1.296092871571504,
778
- "train_runtime": 24664.1985,
779
- "train_samples_per_second": 912.253,
780
- "train_steps_per_second": 3.564
781
  }
782
  ],
783
  "logging_steps": 1000,
@@ -785,7 +785,7 @@
785
  "num_input_tokens_seen": 0,
786
  "num_train_epochs": 5,
787
  "save_steps": 5000,
788
- "total_flos": 5.4597445596112486e+17,
789
  "train_batch_size": 256,
790
  "trial_name": null,
791
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05688605722737357,
13
+ "grad_norm": 2.4062280654907227,
14
+ "learning_rate": 0.0007908982308436202,
15
+ "loss": 2.1918,
16
  "step": 1000
17
  },
18
  {
19
  "epoch": 0.11377211445474714,
20
+ "grad_norm": 1.431848406791687,
21
+ "learning_rate": 0.0007817964616872405,
22
+ "loss": 1.4818,
23
  "step": 2000
24
  },
25
  {
26
  "epoch": 0.17065817168212072,
27
+ "grad_norm": 1.5747077465057373,
28
+ "learning_rate": 0.0007726946925308607,
29
+ "loss": 1.3634,
30
  "step": 3000
31
  },
32
  {
33
  "epoch": 0.22754422890949427,
34
+ "grad_norm": 1.4864206314086914,
35
+ "learning_rate": 0.0007635929233744809,
36
+ "loss": 1.2967,
37
  "step": 4000
38
  },
39
  {
40
  "epoch": 0.2844302861368678,
41
+ "grad_norm": 1.2000905275344849,
42
+ "learning_rate": 0.0007544911542181011,
43
+ "loss": 1.2574,
44
  "step": 5000
45
  },
46
  {
47
  "epoch": 0.2844302861368678,
48
+ "eval_accuracy": 0.689128,
49
+ "eval_loss": 1.238457202911377,
50
+ "eval_runtime": 203.0197,
51
+ "eval_samples_per_second": 1231.407,
52
+ "eval_steps_per_second": 4.812,
53
  "step": 5000
54
  },
55
  {
56
  "epoch": 0.34131634336424144,
57
+ "grad_norm": 1.2910780906677246,
58
+ "learning_rate": 0.0007453893850617214,
59
+ "loss": 1.2181,
60
  "step": 6000
61
  },
62
  {
63
  "epoch": 0.398202400591615,
64
+ "grad_norm": 1.1383774280548096,
65
+ "learning_rate": 0.0007362876159053416,
66
+ "loss": 1.1863,
67
  "step": 7000
68
  },
69
  {
70
  "epoch": 0.45508845781898855,
71
+ "grad_norm": 1.135689616203308,
72
+ "learning_rate": 0.0007271858467489618,
73
+ "loss": 1.1653,
74
  "step": 8000
75
  },
76
  {
77
  "epoch": 0.5119745150463622,
78
+ "grad_norm": 1.1965036392211914,
79
+ "learning_rate": 0.0007180840775925821,
80
+ "loss": 1.147,
81
  "step": 9000
82
  },
83
  {
84
  "epoch": 0.5688605722737357,
85
+ "grad_norm": 1.0561026334762573,
86
+ "learning_rate": 0.0007089823084362024,
87
+ "loss": 1.1281,
88
  "step": 10000
89
  },
90
  {
91
  "epoch": 0.5688605722737357,
92
+ "eval_accuracy": 0.715764,
93
+ "eval_loss": 1.1192152500152588,
94
+ "eval_runtime": 128.8781,
95
+ "eval_samples_per_second": 1939.817,
96
+ "eval_steps_per_second": 7.581,
97
  "step": 10000
98
  },
99
  {
100
  "epoch": 0.6257466295011093,
101
+ "grad_norm": 0.9711835980415344,
102
+ "learning_rate": 0.0006998805392798226,
103
+ "loss": 1.1232,
104
  "step": 11000
105
  },
106
  {
107
  "epoch": 0.6826326867284829,
108
+ "grad_norm": 0.8913602828979492,
109
+ "learning_rate": 0.0006907787701234428,
110
+ "loss": 1.0988,
111
  "step": 12000
112
  },
113
  {
114
  "epoch": 0.7395187439558564,
115
+ "grad_norm": 1.092698097229004,
116
+ "learning_rate": 0.000681677000967063,
117
+ "loss": 1.0897,
118
  "step": 13000
119
  },
120
  {
121
  "epoch": 0.79640480118323,
122
+ "grad_norm": 0.9319038391113281,
123
+ "learning_rate": 0.0006725752318106833,
124
+ "loss": 1.0826,
125
  "step": 14000
126
  },
127
  {
128
  "epoch": 0.8532908584106036,
129
+ "grad_norm": 1.0223675966262817,
130
+ "learning_rate": 0.0006634734626543035,
131
+ "loss": 1.0698,
132
  "step": 15000
133
  },
134
  {
135
  "epoch": 0.8532908584106036,
136
+ "eval_accuracy": 0.728676,
137
+ "eval_loss": 1.0653605461120605,
138
+ "eval_runtime": 128.0826,
139
+ "eval_samples_per_second": 1951.866,
140
+ "eval_steps_per_second": 7.628,
141
  "step": 15000
142
  },
143
  {
144
  "epoch": 0.9101769156379771,
145
+ "grad_norm": 0.8995338678359985,
146
+ "learning_rate": 0.0006543716934979237,
147
+ "loss": 1.0624,
148
  "step": 16000
149
  },
150
  {
151
  "epoch": 0.9670629728653507,
152
+ "grad_norm": 0.8418471217155457,
153
+ "learning_rate": 0.0006452699243415439,
154
+ "loss": 1.0538,
155
  "step": 17000
156
  },
157
  {
158
  "epoch": 1.0239490300927243,
159
+ "grad_norm": 1.024624228477478,
160
+ "learning_rate": 0.0006361681551851641,
161
+ "loss": 1.0311,
162
  "step": 18000
163
  },
164
  {
165
  "epoch": 1.0808350873200978,
166
+ "grad_norm": 0.9130891561508179,
167
+ "learning_rate": 0.0006270663860287844,
168
+ "loss": 0.999,
169
  "step": 19000
170
  },
171
  {
172
  "epoch": 1.1377211445474713,
173
+ "grad_norm": 0.8896342515945435,
174
+ "learning_rate": 0.0006179646168724045,
175
+ "loss": 1.0,
176
  "step": 20000
177
  },
178
  {
179
  "epoch": 1.1377211445474713,
180
+ "eval_accuracy": 0.739712,
181
+ "eval_loss": 1.0235533714294434,
182
+ "eval_runtime": 127.2585,
183
+ "eval_samples_per_second": 1964.505,
184
+ "eval_steps_per_second": 7.677,
185
  "step": 20000
186
  },
187
  {
188
  "epoch": 1.194607201774845,
189
+ "grad_norm": 0.7940112948417664,
190
+ "learning_rate": 0.0006088628477160248,
191
+ "loss": 0.9957,
192
  "step": 21000
193
  },
194
  {
195
  "epoch": 1.2514932590022185,
196
+ "grad_norm": 0.9015308618545532,
197
+ "learning_rate": 0.000599761078559645,
198
+ "loss": 0.9967,
199
  "step": 22000
200
  },
201
  {
202
  "epoch": 1.3083793162295922,
203
+ "grad_norm": 0.9106078147888184,
204
+ "learning_rate": 0.0005906593094032653,
205
+ "loss": 0.9939,
206
  "step": 23000
207
  },
208
  {
209
  "epoch": 1.3652653734569657,
210
+ "grad_norm": 0.9563422203063965,
211
+ "learning_rate": 0.0005815575402468854,
212
+ "loss": 0.9931,
213
  "step": 24000
214
  },
215
  {
216
  "epoch": 1.4221514306843392,
217
+ "grad_norm": 0.7646272778511047,
218
+ "learning_rate": 0.0005724557710905057,
219
+ "loss": 0.9774,
220
  "step": 25000
221
  },
222
  {
223
  "epoch": 1.4221514306843392,
224
+ "eval_accuracy": 0.743348,
225
+ "eval_loss": 1.0054922103881836,
226
+ "eval_runtime": 127.7729,
227
+ "eval_samples_per_second": 1956.596,
228
+ "eval_steps_per_second": 7.646,
229
  "step": 25000
230
  },
231
  {
232
  "epoch": 1.4790374879117127,
233
+ "grad_norm": 0.7779045104980469,
234
+ "learning_rate": 0.000563354001934126,
235
+ "loss": 0.9792,
236
  "step": 26000
237
  },
238
  {
239
  "epoch": 1.5359235451390862,
240
+ "grad_norm": 0.8506484627723694,
241
+ "learning_rate": 0.0005542522327777463,
242
+ "loss": 0.9778,
243
  "step": 27000
244
  },
245
  {
246
  "epoch": 1.59280960236646,
247
+ "grad_norm": 0.8443676829338074,
248
+ "learning_rate": 0.0005451504636213664,
249
+ "loss": 0.9715,
250
  "step": 28000
251
  },
252
  {
253
  "epoch": 1.6496956595938337,
254
+ "grad_norm": 0.9333568215370178,
255
+ "learning_rate": 0.0005360486944649867,
256
+ "loss": 0.9679,
257
  "step": 29000
258
  },
259
  {
260
  "epoch": 1.7065817168212072,
261
+ "grad_norm": 0.9501623511314392,
262
+ "learning_rate": 0.0005269469253086069,
263
+ "loss": 0.9684,
264
  "step": 30000
265
  },
266
  {
267
  "epoch": 1.7065817168212072,
268
+ "eval_accuracy": 0.749276,
269
+ "eval_loss": 0.9812818765640259,
270
+ "eval_runtime": 128.5758,
271
+ "eval_samples_per_second": 1944.379,
272
+ "eval_steps_per_second": 7.599,
273
  "step": 30000
274
  },
275
  {
276
  "epoch": 1.7634677740485807,
277
+ "grad_norm": 0.7442188262939453,
278
+ "learning_rate": 0.0005178451561522272,
279
+ "loss": 0.9636,
280
  "step": 31000
281
  },
282
  {
283
  "epoch": 1.8203538312759542,
284
+ "grad_norm": 0.7510819435119629,
285
+ "learning_rate": 0.0005087433869958473,
286
+ "loss": 0.9647,
287
  "step": 32000
288
  },
289
  {
290
  "epoch": 1.8772398885033277,
291
+ "grad_norm": 0.7448764443397522,
292
+ "learning_rate": 0.0004996416178394676,
293
+ "loss": 0.9591,
294
  "step": 33000
295
  },
296
  {
297
  "epoch": 1.9341259457307014,
298
+ "grad_norm": 0.8019358515739441,
299
+ "learning_rate": 0.0004905398486830878,
300
+ "loss": 0.9513,
301
  "step": 34000
302
  },
303
  {
304
  "epoch": 1.9910120029580751,
305
+ "grad_norm": 0.9495121240615845,
306
+ "learning_rate": 0.00048143807952670797,
307
+ "loss": 0.9511,
308
  "step": 35000
309
  },
310
  {
311
  "epoch": 1.9910120029580751,
312
+ "eval_accuracy": 0.755448,
313
+ "eval_loss": 0.9558805227279663,
314
+ "eval_runtime": 127.8711,
315
+ "eval_samples_per_second": 1955.094,
316
+ "eval_steps_per_second": 7.641,
317
  "step": 35000
318
  },
319
  {
320
  "epoch": 2.0478980601854486,
321
+ "grad_norm": 0.8410281538963318,
322
+ "learning_rate": 0.00047233631037032825,
323
+ "loss": 0.9081,
324
  "step": 36000
325
  },
326
  {
327
  "epoch": 2.104784117412822,
328
+ "grad_norm": 0.8246123194694519,
329
+ "learning_rate": 0.00046323454121394847,
330
+ "loss": 0.8964,
331
  "step": 37000
332
  },
333
  {
334
  "epoch": 2.1616701746401956,
335
+ "grad_norm": 0.9567108154296875,
336
+ "learning_rate": 0.0004541327720575687,
337
+ "loss": 0.8952,
338
  "step": 38000
339
  },
340
  {
341
  "epoch": 2.218556231867569,
342
+ "grad_norm": 0.8104901313781738,
343
+ "learning_rate": 0.0004450310029011889,
344
+ "loss": 0.8925,
345
  "step": 39000
346
  },
347
  {
348
  "epoch": 2.2754422890949426,
349
+ "grad_norm": 0.9034276008605957,
350
+ "learning_rate": 0.0004359292337448092,
351
+ "loss": 0.8998,
352
  "step": 40000
353
  },
354
  {
355
  "epoch": 2.2754422890949426,
356
+ "eval_accuracy": 0.755948,
357
+ "eval_loss": 0.9492226839065552,
358
+ "eval_runtime": 127.8812,
359
+ "eval_samples_per_second": 1954.94,
360
+ "eval_steps_per_second": 7.64,
361
  "step": 40000
362
  },
363
  {
364
  "epoch": 2.3323283463223166,
365
+ "grad_norm": 1.3229442834854126,
366
+ "learning_rate": 0.00042682746458842937,
367
+ "loss": 0.8962,
368
  "step": 41000
369
  },
370
  {
371
  "epoch": 2.38921440354969,
372
+ "grad_norm": 0.8582925200462341,
373
+ "learning_rate": 0.00041772569543204965,
374
+ "loss": 0.8976,
375
  "step": 42000
376
  },
377
  {
378
  "epoch": 2.4461004607770636,
379
+ "grad_norm": 0.8881712555885315,
380
+ "learning_rate": 0.0004086239262756698,
381
+ "loss": 0.8898,
382
  "step": 43000
383
  },
384
  {
385
  "epoch": 2.502986518004437,
386
+ "grad_norm": 0.8713961839675903,
387
+ "learning_rate": 0.00039952215711929005,
388
+ "loss": 0.8927,
389
  "step": 44000
390
  },
391
  {
392
  "epoch": 2.5598725752318106,
393
+ "grad_norm": 0.7883007526397705,
394
+ "learning_rate": 0.00039042038796291027,
395
+ "loss": 0.8967,
396
  "step": 45000
397
  },
398
  {
399
  "epoch": 2.5598725752318106,
400
+ "eval_accuracy": 0.760028,
401
+ "eval_loss": 0.937300980091095,
402
+ "eval_runtime": 130.0782,
403
+ "eval_samples_per_second": 1921.921,
404
+ "eval_steps_per_second": 7.511,
405
  "step": 45000
406
  },
407
  {
408
  "epoch": 2.6167586324591845,
409
+ "grad_norm": 0.8600155711174011,
410
+ "learning_rate": 0.00038131861880653055,
411
+ "loss": 0.8927,
412
  "step": 46000
413
  },
414
  {
415
  "epoch": 2.673644689686558,
416
+ "grad_norm": 0.8501909971237183,
417
+ "learning_rate": 0.0003722168496501508,
418
+ "loss": 0.8913,
419
  "step": 47000
420
  },
421
  {
422
  "epoch": 2.7305307469139315,
423
+ "grad_norm": 0.8116582632064819,
424
+ "learning_rate": 0.000363115080493771,
425
+ "loss": 0.8889,
426
  "step": 48000
427
  },
428
  {
429
  "epoch": 2.787416804141305,
430
+ "grad_norm": 0.8065186738967896,
431
+ "learning_rate": 0.0003540133113373912,
432
+ "loss": 0.8896,
433
  "step": 49000
434
  },
435
  {
436
  "epoch": 2.8443028613686785,
437
+ "grad_norm": 0.9248031973838806,
438
+ "learning_rate": 0.00034491154218101145,
439
+ "loss": 0.8837,
440
  "step": 50000
441
  },
442
  {
443
  "epoch": 2.8443028613686785,
444
+ "eval_accuracy": 0.762176,
445
+ "eval_loss": 0.9251159429550171,
446
+ "eval_runtime": 128.4439,
447
+ "eval_samples_per_second": 1946.376,
448
+ "eval_steps_per_second": 7.606,
449
  "step": 50000
450
  },
451
  {
452
  "epoch": 2.901188918596052,
453
+ "grad_norm": 0.8191467523574829,
454
+ "learning_rate": 0.0003358097730246317,
455
+ "loss": 0.878,
456
  "step": 51000
457
  },
458
  {
459
  "epoch": 2.9580749758234255,
460
+ "grad_norm": 0.7620063424110413,
461
+ "learning_rate": 0.0003267080038682519,
462
+ "loss": 0.8832,
463
  "step": 52000
464
  },
465
  {
466
  "epoch": 3.0149610330507994,
467
+ "grad_norm": 0.8365482687950134,
468
+ "learning_rate": 0.0003176062347118721,
469
+ "loss": 0.8621,
470
  "step": 53000
471
  },
472
  {
473
  "epoch": 3.071847090278173,
474
+ "grad_norm": 0.9817807078361511,
475
+ "learning_rate": 0.00030850446555549235,
476
+ "loss": 0.8224,
477
  "step": 54000
478
  },
479
  {
480
  "epoch": 3.1287331475055464,
481
+ "grad_norm": 0.847806453704834,
482
+ "learning_rate": 0.00029940269639911263,
483
+ "loss": 0.8253,
484
  "step": 55000
485
  },
486
  {
487
  "epoch": 3.1287331475055464,
488
+ "eval_accuracy": 0.76438,
489
+ "eval_loss": 0.9235970973968506,
490
+ "eval_runtime": 126.2531,
491
+ "eval_samples_per_second": 1980.15,
492
+ "eval_steps_per_second": 7.738,
493
  "step": 55000
494
  },
495
  {
496
  "epoch": 3.18561920473292,
497
+ "grad_norm": 1.1729530096054077,
498
+ "learning_rate": 0.00029030092724273285,
499
+ "loss": 0.8225,
500
  "step": 56000
501
  },
502
  {
503
  "epoch": 3.2425052619602934,
504
+ "grad_norm": 1.0548408031463623,
505
+ "learning_rate": 0.0002811991580863531,
506
+ "loss": 0.821,
507
  "step": 57000
508
  },
509
  {
510
  "epoch": 3.299391319187667,
511
+ "grad_norm": 1.0199774503707886,
512
+ "learning_rate": 0.0002720973889299733,
513
+ "loss": 0.8213,
514
  "step": 58000
515
  },
516
  {
517
  "epoch": 3.356277376415041,
518
+ "grad_norm": 0.9180177450180054,
519
+ "learning_rate": 0.00026299561977359353,
520
+ "loss": 0.8274,
521
  "step": 59000
522
  },
523
  {
524
  "epoch": 3.4131634336424144,
525
+ "grad_norm": 0.9745663404464722,
526
+ "learning_rate": 0.0002538938506172137,
527
+ "loss": 0.8229,
528
  "step": 60000
529
  },
530
  {
531
  "epoch": 3.4131634336424144,
532
+ "eval_accuracy": 0.766832,
533
+ "eval_loss": 0.9138370156288147,
534
+ "eval_runtime": 129.2727,
535
+ "eval_samples_per_second": 1933.897,
536
+ "eval_steps_per_second": 7.558,
537
  "step": 60000
538
  },
539
  {
540
  "epoch": 3.470049490869788,
541
+ "grad_norm": 0.8708947896957397,
542
+ "learning_rate": 0.0002447920814608339,
543
+ "loss": 0.8256,
544
  "step": 61000
545
  },
546
  {
547
  "epoch": 3.5269355480971614,
548
+ "grad_norm": 0.9808185696601868,
549
+ "learning_rate": 0.00023569031230445418,
550
+ "loss": 0.8298,
551
  "step": 62000
552
  },
553
  {
554
  "epoch": 3.583821605324535,
555
+ "grad_norm": 0.8228833079338074,
556
+ "learning_rate": 0.0002265885431480744,
557
+ "loss": 0.827,
558
  "step": 63000
559
  },
560
  {
561
  "epoch": 3.6407076625519084,
562
+ "grad_norm": 0.9581019878387451,
563
+ "learning_rate": 0.00021748677399169463,
564
+ "loss": 0.8275,
565
  "step": 64000
566
  },
567
  {
568
  "epoch": 3.697593719779282,
569
+ "grad_norm": 0.8560314178466797,
570
+ "learning_rate": 0.00020838500483531488,
571
+ "loss": 0.8145,
572
  "step": 65000
573
  },
574
  {
575
  "epoch": 3.697593719779282,
576
+ "eval_accuracy": 0.769172,
577
+ "eval_loss": 0.9042648673057556,
578
+ "eval_runtime": 129.2138,
579
+ "eval_samples_per_second": 1934.778,
580
+ "eval_steps_per_second": 7.561,
581
  "step": 65000
582
  },
583
  {
584
  "epoch": 3.754479777006656,
585
+ "grad_norm": 0.8918451070785522,
586
+ "learning_rate": 0.0001992832356789351,
587
+ "loss": 0.819,
588
  "step": 66000
589
  },
590
  {
591
  "epoch": 3.8113658342340293,
592
+ "grad_norm": 1.0977294445037842,
593
+ "learning_rate": 0.00019018146652255533,
594
+ "loss": 0.8122,
595
  "step": 67000
596
  },
597
  {
598
  "epoch": 3.868251891461403,
599
+ "grad_norm": 0.7856444716453552,
600
+ "learning_rate": 0.00018107969736617555,
601
+ "loss": 0.8225,
602
  "step": 68000
603
  },
604
  {
605
  "epoch": 3.9251379486887763,
606
+ "grad_norm": 0.9270259141921997,
607
+ "learning_rate": 0.00017197792820979578,
608
+ "loss": 0.8158,
609
  "step": 69000
610
  },
611
  {
612
  "epoch": 3.98202400591615,
613
+ "grad_norm": 1.082774043083191,
614
+ "learning_rate": 0.00016287615905341603,
615
+ "loss": 0.8156,
616
  "step": 70000
617
  },
618
  {
619
  "epoch": 3.98202400591615,
620
+ "eval_accuracy": 0.770764,
621
+ "eval_loss": 0.8961142301559448,
622
+ "eval_runtime": 138.0555,
623
+ "eval_samples_per_second": 1810.866,
624
+ "eval_steps_per_second": 7.077,
625
  "step": 70000
626
  },
627
  {
628
  "epoch": 4.038910063143524,
629
+ "grad_norm": 0.909858226776123,
630
+ "learning_rate": 0.00015377438989703626,
631
+ "loss": 0.7785,
632
  "step": 71000
633
  },
634
  {
635
  "epoch": 4.095796120370897,
636
+ "grad_norm": 0.931280791759491,
637
+ "learning_rate": 0.00014467262074065645,
638
+ "loss": 0.7637,
639
  "step": 72000
640
  },
641
  {
642
  "epoch": 4.152682177598271,
643
+ "grad_norm": 0.94422847032547,
644
+ "learning_rate": 0.0001355708515842767,
645
+ "loss": 0.7612,
646
  "step": 73000
647
  },
648
  {
649
  "epoch": 4.209568234825644,
650
+ "grad_norm": 0.9250127077102661,
651
+ "learning_rate": 0.00012646908242789693,
652
+ "loss": 0.7616,
653
  "step": 74000
654
  },
655
  {
656
  "epoch": 4.266454292053018,
657
+ "grad_norm": 0.8467296957969666,
658
+ "learning_rate": 0.00011736731327151716,
659
+ "loss": 0.7557,
660
  "step": 75000
661
  },
662
  {
663
  "epoch": 4.266454292053018,
664
+ "eval_accuracy": 0.77204,
665
+ "eval_loss": 0.9022773504257202,
666
+ "eval_runtime": 144.5432,
667
+ "eval_samples_per_second": 1729.587,
668
+ "eval_steps_per_second": 6.759,
669
  "step": 75000
670
  },
671
  {
672
  "epoch": 4.323340349280391,
673
+ "grad_norm": 0.8985564708709717,
674
+ "learning_rate": 0.00010826554411513738,
675
+ "loss": 0.7604,
676
  "step": 76000
677
  },
678
  {
679
  "epoch": 4.380226406507765,
680
+ "grad_norm": 0.8618564605712891,
681
+ "learning_rate": 9.916377495875762e-05,
682
+ "loss": 0.7632,
683
  "step": 77000
684
  },
685
  {
686
  "epoch": 4.437112463735138,
687
+ "grad_norm": 0.9467126727104187,
688
+ "learning_rate": 9.006200580237784e-05,
689
+ "loss": 0.7614,
690
  "step": 78000
691
  },
692
  {
693
  "epoch": 4.493998520962512,
694
+ "grad_norm": 1.0163730382919312,
695
+ "learning_rate": 8.096023664599807e-05,
696
+ "loss": 0.7575,
697
  "step": 79000
698
  },
699
  {
700
  "epoch": 4.550884578189885,
701
+ "grad_norm": 1.1194038391113281,
702
+ "learning_rate": 7.18584674896183e-05,
703
+ "loss": 0.7595,
704
  "step": 80000
705
  },
706
  {
707
  "epoch": 4.550884578189885,
708
+ "eval_accuracy": 0.772256,
709
+ "eval_loss": 0.897346019744873,
710
+ "eval_runtime": 136.9434,
711
+ "eval_samples_per_second": 1825.571,
712
+ "eval_steps_per_second": 7.134,
713
  "step": 80000
714
  },
715
  {
716
  "epoch": 4.607770635417259,
717
+ "grad_norm": 1.0589629411697388,
718
+ "learning_rate": 6.275669833323853e-05,
719
+ "loss": 0.7548,
720
  "step": 81000
721
  },
722
  {
723
  "epoch": 4.664656692644633,
724
+ "grad_norm": 0.8540852665901184,
725
+ "learning_rate": 5.365492917685876e-05,
726
+ "loss": 0.7601,
727
  "step": 82000
728
  },
729
  {
730
  "epoch": 4.721542749872007,
731
+ "grad_norm": 1.127475380897522,
732
+ "learning_rate": 4.455316002047898e-05,
733
+ "loss": 0.7554,
734
  "step": 83000
735
  },
736
  {
737
  "epoch": 4.77842880709938,
738
+ "grad_norm": 0.9464063048362732,
739
+ "learning_rate": 3.545139086409921e-05,
740
+ "loss": 0.756,
741
  "step": 84000
742
  },
743
  {
744
  "epoch": 4.835314864326754,
745
+ "grad_norm": 0.9705914855003357,
746
+ "learning_rate": 2.634962170771944e-05,
747
+ "loss": 0.7581,
748
  "step": 85000
749
  },
750
  {
751
  "epoch": 4.835314864326754,
752
+ "eval_accuracy": 0.773724,
753
+ "eval_loss": 0.8925997018814087,
754
+ "eval_runtime": 138.7415,
755
+ "eval_samples_per_second": 1801.913,
756
+ "eval_steps_per_second": 7.042,
757
  "step": 85000
758
  },
759
  {
760
  "epoch": 4.892200921554127,
761
+ "grad_norm": 0.8879310488700867,
762
+ "learning_rate": 1.7247852551339668e-05,
763
+ "loss": 0.758,
764
  "step": 86000
765
  },
766
  {
767
  "epoch": 4.949086978781501,
768
+ "grad_norm": 1.2024400234222412,
769
+ "learning_rate": 8.146083394959896e-06,
770
+ "loss": 0.751,
771
  "step": 87000
772
  },
773
  {
774
  "epoch": 5.0,
775
  "step": 87895,
776
+ "total_flos": 1.93274424e+18,
777
+ "train_loss": 0.9357909288237652,
778
+ "train_runtime": 45635.435,
779
+ "train_samples_per_second": 493.038,
780
+ "train_steps_per_second": 1.926
781
  }
782
  ],
783
  "logging_steps": 1000,
 
785
  "num_input_tokens_seen": 0,
786
  "num_train_epochs": 5,
787
  "save_steps": 5000,
788
+ "total_flos": 1.93274424e+18,
789
  "train_batch_size": 256,
790
  "trial_name": null,
791
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb6b0f81e80c1f33b16aaf9c2fc69495be719e29fb2dfcec7d7a4debabe294f0
3
- size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf823d8cbbdbfa0a6f25c927a43b2dbea4fdd4c372f9f50bf4026741c3ca5e20
3
+ size 4472