doodle-zero

Browse files

Files changed (11) hide show

.DS_Store +0 -0
README.md +11 -33
all_results.json +9 -9
config.json +100 -99
model.safetensors +2 -2
preprocessor_config.json +0 -15
pytorch_model.bin +3 -0
test_results.json +4 -5
train_results.json +5 -5
trainer_state.json +352 -352
training_args.bin +2 -2

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

README.md CHANGED Viewed

@@ -1,8 +1,7 @@
 ---
 tags:
 - generated_from_trainer
-metrics:
-- accuracy
 model-index:
 - name: results
   results: []
@@ -13,10 +12,14 @@ should probably proofread and complete it, then remove this comment. -->
 # results
-This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.1000
-- Accuracy: 0.7236
 ## Model description
@@ -39,38 +42,13 @@ The following hyperparameters were used during training:
 - train_batch_size: 256
 - eval_batch_size: 256
 - seed: 42
-- distributed_type: multi-GPU
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
 - num_epochs: 5
-- mixed_precision_training: Native AMP
-### Training results
-| Training Loss | Epoch  | Step  | Validation Loss | Accuracy |
-|:-------------:|:------:|:-----:|:---------------:|:--------:|
-| 1.7698        | 0.2844 | 5000  | 1.7124          | 0.5802   |
-| 1.5445        | 0.5689 | 10000 | 1.5021          | 0.6270   |
-| 1.439         | 0.8533 | 15000 | 1.3989          | 0.6520   |
-| 1.3625        | 1.1377 | 20000 | 1.3447          | 0.6647   |
-| 1.3192        | 1.4222 | 25000 | 1.2965          | 0.6756   |
-| 1.3           | 1.7066 | 30000 | 1.2788          | 0.6795   |
-| 1.2695        | 1.9910 | 35000 | 1.2347          | 0.6900   |
-| 1.2297        | 2.2754 | 40000 | 1.2160          | 0.6955   |
-| 1.2144        | 2.5599 | 45000 | 1.1894          | 0.7021   |
-| 1.1945        | 2.8443 | 50000 | 1.1734          | 0.7058   |
-| 1.1551        | 3.1287 | 55000 | 1.1611          | 0.7084   |
-| 1.1471        | 3.4132 | 60000 | 1.1523          | 0.7104   |
-| 1.1301        | 3.6976 | 65000 | 1.1314          | 0.7156   |
-| 1.1286        | 3.9820 | 70000 | 1.1220          | 0.7186   |
-| 1.0898        | 4.2665 | 75000 | 1.1140          | 0.7203   |
-| 1.093         | 4.5509 | 80000 | 1.1040          | 0.7232   |
-| 1.0893        | 4.8353 | 85000 | 1.0986          | 0.7246   |
 ### Framework versions
-- Transformers 4.40.0
-- Pytorch 2.2.2+cu121
 - Datasets 2.19.0
-- Tokenizers 0.19.1

 ---
+base_model: laszlokiss27/results
 tags:
 - generated_from_trainer
 model-index:
 - name: results
   results: []
 # results
+This model is a fine-tuned version of [laszlokiss27/results](https://huggingface.co/laszlokiss27/results) on an unknown dataset.
 It achieves the following results on the evaluation set:
+- eval_loss: 1.1000
+- eval_accuracy: 0.7236
+- eval_runtime: 831.0467
+- eval_samples_per_second: 300.825
+- eval_steps_per_second: 1.176
+- step: 0
 ## Model description
 - train_batch_size: 256
 - eval_batch_size: 256
 - seed: 42
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
 - num_epochs: 5
 ### Framework versions
+- Transformers 4.33.2
+- Pytorch 2.2.2
 - Datasets 2.19.0
+- Tokenizers 0.13.3

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 5.0,
     "eval_accuracy": 0.723616,
-    "eval_loss": 1.100016713142395,
-    "eval_runtime": 118.4292,
-    "eval_samples_per_second": 2110.967,
-    "eval_steps_per_second": 8.25,
-    "total_flos": 5.4597445596112486e+17,
-    "train_loss": 1.296092871571504,
-    "train_runtime": 24664.1985,
-    "train_samples_per_second": 912.253,
-    "train_steps_per_second": 3.564
 }

 {
     "epoch": 5.0,
     "eval_accuracy": 0.723616,
+    "eval_loss": 1.100019931793213,
+    "eval_runtime": 831.0467,
+    "eval_samples_per_second": 300.825,
+    "eval_steps_per_second": 1.176,
+    "total_flos": 1.93274424e+18,
+    "train_loss": 0.9357909288237652,
+    "train_runtime": 45635.435,
+    "train_samples_per_second": 493.038,
+    "train_steps_per_second": 1.926
 }

config.json CHANGED Viewed

@@ -1,4 +1,5 @@
 {
   "architectures": [
     "MobileViTForImageClassification"
   ],
@@ -24,104 +25,7 @@
   "id2label": {
     "0": "aircraft carrier",
     "1": "airplane",
-    "2": "alarm clock",
-    "3": "ambulance",
-    "4": "angel",
-    "5": "animal migration",
-    "6": "ant",
-    "7": "anvil",
-    "8": "apple",
-    "9": "arm",
     "10": "asparagus",
-    "11": "axe",
-    "12": "backpack",
-    "13": "banana",
-    "14": "bandage",
-    "15": "barn",
-    "16": "baseball bat",
-    "17": "baseball",
-    "18": "basket",
-    "19": "basketball",
-    "20": "bat",
-    "21": "bathtub",
-    "22": "beach",
-    "23": "bear",
-    "24": "beard",
-    "25": "bed",
-    "26": "bee",
-    "27": "belt",
-    "28": "bench",
-    "29": "bicycle",
-    "30": "binoculars",
-    "31": "bird",
-    "32": "birthday cake",
-    "33": "blackberry",
-    "34": "blueberry",
-    "35": "book",
-    "36": "boomerang",
-    "37": "bottlecap",
-    "38": "bowtie",
-    "39": "bracelet",
-    "40": "brain",
-    "41": "bread",
-    "42": "bridge",
-    "43": "broccoli",
-    "44": "broom",
-    "45": "bucket",
-    "46": "bulldozer",
-    "47": "bus",
-    "48": "bush",
-    "49": "butterfly",
-    "50": "cactus",
-    "51": "cake",
-    "52": "calculator",
-    "53": "calendar",
-    "54": "camel",
-    "55": "camera",
-    "56": "camouflage",
-    "57": "campfire",
-    "58": "candle",
-    "59": "cannon",
-    "60": "canoe",
-    "61": "car",
-    "62": "carrot",
-    "63": "castle",
-    "64": "cat",
-    "65": "ceiling fan",
-    "66": "cell phone",
-    "67": "cello",
-    "68": "chair",
-    "69": "chandelier",
-    "70": "church",
-    "71": "circle",
-    "72": "clarinet",
-    "73": "clock",
-    "74": "cloud",
-    "75": "coffee cup",
-    "76": "compass",
-    "77": "computer",
-    "78": "cookie",
-    "79": "cooler",
-    "80": "couch",
-    "81": "cow",
-    "82": "crab",
-    "83": "crayon",
-    "84": "crocodile",
-    "85": "crown",
-    "86": "cruise ship",
-    "87": "cup",
-    "88": "diamond",
-    "89": "dishwasher",
-    "90": "diving board",
-    "91": "dog",
-    "92": "dolphin",
-    "93": "donut",
-    "94": "door",
-    "95": "dragon",
-    "96": "dresser",
-    "97": "drill",
-    "98": "drums",
-    "99": "duck",
     "100": "dumbbell",
     "101": "ear",
     "102": "elbow",
@@ -132,6 +36,7 @@
     "107": "eyeglasses",
     "108": "face",
     "109": "fan",
     "110": "feather",
     "111": "fence",
     "112": "finger",
@@ -142,6 +47,7 @@
     "117": "flamingo",
     "118": "flashlight",
     "119": "flip flops",
     "120": "floor lamp",
     "121": "flower",
     "122": "flying saucer",
@@ -152,6 +58,7 @@
     "127": "garden hose",
     "128": "garden",
     "129": "giraffe",
     "130": "goatee",
     "131": "golf club",
     "132": "grapes",
@@ -162,6 +69,7 @@
     "137": "hand",
     "138": "harp",
     "139": "hat",
     "140": "headphones",
     "141": "hedgehog",
     "142": "helicopter",
@@ -172,6 +80,7 @@
     "147": "horse",
     "148": "hospital",
     "149": "hot air balloon",
     "150": "hot dog",
     "151": "hot tub",
     "152": "hourglass",
@@ -182,6 +91,7 @@
     "157": "jacket",
     "158": "jail",
     "159": "kangaroo",
     "160": "key",
     "161": "keyboard",
     "162": "knee",
@@ -192,6 +102,7 @@
     "167": "leaf",
     "168": "leg",
     "169": "light bulb",
     "170": "lighter",
     "171": "lighthouse",
     "172": "lightning",
@@ -202,6 +113,7 @@
     "177": "lollipop",
     "178": "mailbox",
     "179": "map",
     "180": "marker",
     "181": "matches",
     "182": "megaphone",
@@ -212,6 +124,7 @@
     "187": "moon",
     "188": "mosquito",
     "189": "motorbike",
     "190": "mountain",
     "191": "mouse",
     "192": "moustache",
@@ -222,6 +135,8 @@
     "197": "necklace",
     "198": "nose",
     "199": "ocean",
     "200": "octagon",
     "201": "octopus",
     "202": "onion",
@@ -232,6 +147,7 @@
     "207": "palm tree",
     "208": "panda",
     "209": "pants",
     "210": "paper clip",
     "211": "parachute",
     "212": "parrot",
@@ -242,6 +158,7 @@
     "217": "pencil",
     "218": "penguin",
     "219": "piano",
     "220": "pickup truck",
     "221": "picture frame",
     "222": "pig",
@@ -252,6 +169,7 @@
     "227": "police car",
     "228": "pond",
     "229": "pool",
     "230": "popsicle",
     "231": "postcard",
     "232": "potato",
@@ -262,6 +180,7 @@
     "237": "radio",
     "238": "rain",
     "239": "rainbow",
     "240": "rake",
     "241": "remote control",
     "242": "rhinoceros",
@@ -272,6 +191,7 @@
     "247": "sailboat",
     "248": "sandwich",
     "249": "saw",
     "250": "saxophone",
     "251": "school bus",
     "252": "scissors",
@@ -282,6 +202,7 @@
     "257": "shark",
     "258": "sheep",
     "259": "shoe",
     "260": "shorts",
     "261": "shovel",
     "262": "sink",
@@ -292,6 +213,7 @@
     "267": "smiley face",
     "268": "snail",
     "269": "snake",
     "270": "snorkel",
     "271": "snowflake",
     "272": "snowman",
@@ -302,6 +224,7 @@
     "277": "spoon",
     "278": "spreadsheet",
     "279": "square",
     "280": "squiggle",
     "281": "squirrel",
     "282": "stairs",
@@ -312,6 +235,7 @@
     "287": "stitches",
     "288": "stop sign",
     "289": "stove",
     "290": "strawberry",
     "291": "streetlight",
     "292": "string bean",
@@ -322,6 +246,8 @@
     "297": "sweater",
     "298": "swing set",
     "299": "sword",
     "300": "syringe",
     "301": "t-shirt",
     "302": "table",
@@ -332,6 +258,7 @@
     "307": "tennis racquet",
     "308": "tent",
     "309": "The Eiffel Tower",
     "310": "The Great Wall of China",
     "311": "The Mona Lisa",
     "312": "tiger",
@@ -342,6 +269,7 @@
     "317": "toothbrush",
     "318": "toothpaste",
     "319": "tornado",
     "320": "tractor",
     "321": "traffic light",
     "322": "train",
@@ -352,6 +280,7 @@
     "327": "trumpet",
     "328": "umbrella",
     "329": "underwear",
     "330": "van",
     "331": "vase",
     "332": "violin",
@@ -362,11 +291,83 @@
     "337": "wheel",
     "338": "windmill",
     "339": "wine bottle",
     "340": "wine glass",
     "341": "wristwatch",
     "342": "yoga",
     "343": "zebra",
-    "344": "zigzag"
   },
   "ignore_mismatched_sizes": true,
   "image_size": 64,
@@ -740,5 +741,5 @@
   "qkv_bias": true,
   "semantic_loss_ignore_index": 255,
   "torch_dtype": "float32",
-  "transformers_version": "4.40.0"
 }

 {
+  "_name_or_path": "laszlokiss27/results",
   "architectures": [
     "MobileViTForImageClassification"
   ],
   "id2label": {
     "0": "aircraft carrier",
     "1": "airplane",
     "10": "asparagus",
     "100": "dumbbell",
     "101": "ear",
     "102": "elbow",
     "107": "eyeglasses",
     "108": "face",
     "109": "fan",
+    "11": "axe",
     "110": "feather",
     "111": "fence",
     "112": "finger",
     "117": "flamingo",
     "118": "flashlight",
     "119": "flip flops",
+    "12": "backpack",
     "120": "floor lamp",
     "121": "flower",
     "122": "flying saucer",
     "127": "garden hose",
     "128": "garden",
     "129": "giraffe",
+    "13": "banana",
     "130": "goatee",
     "131": "golf club",
     "132": "grapes",
     "137": "hand",
     "138": "harp",
     "139": "hat",
+    "14": "bandage",
     "140": "headphones",
     "141": "hedgehog",
     "142": "helicopter",
     "147": "horse",
     "148": "hospital",
     "149": "hot air balloon",
+    "15": "barn",
     "150": "hot dog",
     "151": "hot tub",
     "152": "hourglass",
     "157": "jacket",
     "158": "jail",
     "159": "kangaroo",
+    "16": "baseball bat",
     "160": "key",
     "161": "keyboard",
     "162": "knee",
     "167": "leaf",
     "168": "leg",
     "169": "light bulb",
+    "17": "baseball",
     "170": "lighter",
     "171": "lighthouse",
     "172": "lightning",
     "177": "lollipop",
     "178": "mailbox",
     "179": "map",
+    "18": "basket",
     "180": "marker",
     "181": "matches",
     "182": "megaphone",
     "187": "moon",
     "188": "mosquito",
     "189": "motorbike",
+    "19": "basketball",
     "190": "mountain",
     "191": "mouse",
     "192": "moustache",
     "197": "necklace",
     "198": "nose",
     "199": "ocean",
+    "2": "alarm clock",
+    "20": "bat",
     "200": "octagon",
     "201": "octopus",
     "202": "onion",
     "207": "palm tree",
     "208": "panda",
     "209": "pants",
+    "21": "bathtub",
     "210": "paper clip",
     "211": "parachute",
     "212": "parrot",
     "217": "pencil",
     "218": "penguin",
     "219": "piano",
+    "22": "beach",
     "220": "pickup truck",
     "221": "picture frame",
     "222": "pig",
     "227": "police car",
     "228": "pond",
     "229": "pool",
+    "23": "bear",
     "230": "popsicle",
     "231": "postcard",
     "232": "potato",
     "237": "radio",
     "238": "rain",
     "239": "rainbow",
+    "24": "beard",
     "240": "rake",
     "241": "remote control",
     "242": "rhinoceros",
     "247": "sailboat",
     "248": "sandwich",
     "249": "saw",
+    "25": "bed",
     "250": "saxophone",
     "251": "school bus",
     "252": "scissors",
     "257": "shark",
     "258": "sheep",
     "259": "shoe",
+    "26": "bee",
     "260": "shorts",
     "261": "shovel",
     "262": "sink",
     "267": "smiley face",
     "268": "snail",
     "269": "snake",
+    "27": "belt",
     "270": "snorkel",
     "271": "snowflake",
     "272": "snowman",
     "277": "spoon",
     "278": "spreadsheet",
     "279": "square",
+    "28": "bench",
     "280": "squiggle",
     "281": "squirrel",
     "282": "stairs",
     "287": "stitches",
     "288": "stop sign",
     "289": "stove",
+    "29": "bicycle",
     "290": "strawberry",
     "291": "streetlight",
     "292": "string bean",
     "297": "sweater",
     "298": "swing set",
     "299": "sword",
+    "3": "ambulance",
+    "30": "binoculars",
     "300": "syringe",
     "301": "t-shirt",
     "302": "table",
     "307": "tennis racquet",
     "308": "tent",
     "309": "The Eiffel Tower",
+    "31": "bird",
     "310": "The Great Wall of China",
     "311": "The Mona Lisa",
     "312": "tiger",
     "317": "toothbrush",
     "318": "toothpaste",
     "319": "tornado",
+    "32": "birthday cake",
     "320": "tractor",
     "321": "traffic light",
     "322": "train",
     "327": "trumpet",
     "328": "umbrella",
     "329": "underwear",
+    "33": "blackberry",
     "330": "van",
     "331": "vase",
     "332": "violin",
     "337": "wheel",
     "338": "windmill",
     "339": "wine bottle",
+    "34": "blueberry",
     "340": "wine glass",
     "341": "wristwatch",
     "342": "yoga",
     "343": "zebra",
+    "344": "zigzag",
+    "35": "book",
+    "36": "boomerang",
+    "37": "bottlecap",
+    "38": "bowtie",
+    "39": "bracelet",
+    "4": "angel",
+    "40": "brain",
+    "41": "bread",
+    "42": "bridge",
+    "43": "broccoli",
+    "44": "broom",
+    "45": "bucket",
+    "46": "bulldozer",
+    "47": "bus",
+    "48": "bush",
+    "49": "butterfly",
+    "5": "animal migration",
+    "50": "cactus",
+    "51": "cake",
+    "52": "calculator",
+    "53": "calendar",
+    "54": "camel",
+    "55": "camera",
+    "56": "camouflage",
+    "57": "campfire",
+    "58": "candle",
+    "59": "cannon",
+    "6": "ant",
+    "60": "canoe",
+    "61": "car",
+    "62": "carrot",
+    "63": "castle",
+    "64": "cat",
+    "65": "ceiling fan",
+    "66": "cell phone",
+    "67": "cello",
+    "68": "chair",
+    "69": "chandelier",
+    "7": "anvil",
+    "70": "church",
+    "71": "circle",
+    "72": "clarinet",
+    "73": "clock",
+    "74": "cloud",
+    "75": "coffee cup",
+    "76": "compass",
+    "77": "computer",
+    "78": "cookie",
+    "79": "cooler",
+    "8": "apple",
+    "80": "couch",
+    "81": "cow",
+    "82": "crab",
+    "83": "crayon",
+    "84": "crocodile",
+    "85": "crown",
+    "86": "cruise ship",
+    "87": "cup",
+    "88": "diamond",
+    "89": "dishwasher",
+    "9": "arm",
+    "90": "diving board",
+    "91": "dog",
+    "92": "dolphin",
+    "93": "donut",
+    "94": "door",
+    "95": "dragon",
+    "96": "dresser",
+    "97": "drill",
+    "98": "drums",
+    "99": "duck"
   },
   "ignore_mismatched_sizes": true,
   "image_size": 64,
   "qkv_bias": true,
   "semantic_loss_ignore_index": 255,
   "torch_dtype": "float32",
+  "transformers_version": "4.33.2"
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d35f61619e3ffc371ead68851aa0232c0fe6cc18ab6a5362d0e589c58eb59a19
-size 20730036

 version https://git-lfs.github.com/spec/v1
+oid sha256:28184813e695da9074eb277c8000be311e12d53352ec1ed2b6b268532b81b323
+size 18360744

preprocessor_config.json CHANGED Viewed

@@ -1,19 +1,4 @@
 {
-  "_valid_processor_keys": [
-    "images",
-    "segmentation_maps",
-    "do_resize",
-    "size",
-    "resample",
-    "do_rescale",
-    "rescale_factor",
-    "do_center_crop",
-    "crop_size",
-    "do_flip_channel_order",
-    "return_tensors",
-    "data_format",
-    "input_data_format"
-  ],
   "crop_size": {
     "height": 28,
     "width": 28

 {
   "crop_size": {
     "height": 28,
     "width": 28

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:579e9e6dde3a9f639be92e42b626a108dbd9c7b2f4ca7bee37623bca653d4abb
+size 20803638

test_results.json CHANGED Viewed

@@ -1,8 +1,7 @@
 {
-    "epoch": 5.0,
     "eval_accuracy": 0.723616,
-    "eval_loss": 1.100016713142395,
-    "eval_runtime": 118.4292,
-    "eval_samples_per_second": 2110.967,
-    "eval_steps_per_second": 8.25
 }

 {
     "eval_accuracy": 0.723616,
+    "eval_loss": 1.100019931793213,
+    "eval_runtime": 831.0467,
+    "eval_samples_per_second": 300.825,
+    "eval_steps_per_second": 1.176
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 5.0,
-    "total_flos": 5.4597445596112486e+17,
-    "train_loss": 1.296092871571504,
-    "train_runtime": 24664.1985,
-    "train_samples_per_second": 912.253,
-    "train_steps_per_second": 3.564
 }

 {
     "epoch": 5.0,
+    "total_flos": 1.93274424e+18,
+    "train_loss": 0.9357909288237652,
+    "train_runtime": 45635.435,
+    "train_samples_per_second": 493.038,
+    "train_steps_per_second": 1.926
 }

trainer_state.json CHANGED Viewed

@@ -10,774 +10,774 @@
   "log_history": [
     {
       "epoch": 0.05688605722737357,
-      "grad_norm": 4.992003440856934,
-      "learning_rate": 0.0007909346379202458,
-      "loss": 3.165,
       "step": 1000
     },
     {
       "epoch": 0.11377211445474714,
-      "grad_norm": 3.379929542541504,
-      "learning_rate": 0.0007818419705330224,
-      "loss": 2.2365,
       "step": 2000
     },
     {
       "epoch": 0.17065817168212072,
-      "grad_norm": 2.9609272480010986,
-      "learning_rate": 0.0007727402013766426,
-      "loss": 1.9869,
       "step": 3000
     },
     {
       "epoch": 0.22754422890949427,
-      "grad_norm": 2.5321693420410156,
-      "learning_rate": 0.0007636384322202628,
-      "loss": 1.8608,
       "step": 4000
     },
     {
       "epoch": 0.2844302861368678,
-      "grad_norm": 2.3447465896606445,
-      "learning_rate": 0.0007545366630638831,
-      "loss": 1.7698,
       "step": 5000
     },
     {
       "epoch": 0.2844302861368678,
-      "eval_accuracy": 0.580204,
-      "eval_loss": 1.7123581171035767,
-      "eval_runtime": 119.0147,
-      "eval_samples_per_second": 2100.581,
-      "eval_steps_per_second": 8.209,
       "step": 5000
     },
     {
       "epoch": 0.34131634336424144,
-      "grad_norm": 2.4563510417938232,
-      "learning_rate": 0.0007454348939075033,
-      "loss": 1.7017,
       "step": 6000
     },
     {
       "epoch": 0.398202400591615,
-      "grad_norm": 2.0900540351867676,
-      "learning_rate": 0.0007363331247511235,
-      "loss": 1.6446,
       "step": 7000
     },
     {
       "epoch": 0.45508845781898855,
-      "grad_norm": 2.0227878093719482,
-      "learning_rate": 0.0007272404573639001,
-      "loss": 1.6037,
       "step": 8000
     },
     {
       "epoch": 0.5119745150463622,
-      "grad_norm": 1.9194427728652954,
-      "learning_rate": 0.0007181477899766768,
-      "loss": 1.5706,
       "step": 9000
     },
     {
       "epoch": 0.5688605722737357,
-      "grad_norm": 2.123502254486084,
-      "learning_rate": 0.0007090642243586097,
-      "loss": 1.5445,
       "step": 10000
     },
     {
       "epoch": 0.5688605722737357,
-      "eval_accuracy": 0.626964,
-      "eval_loss": 1.5020641088485718,
-      "eval_runtime": 118.3365,
-      "eval_samples_per_second": 2112.62,
-      "eval_steps_per_second": 8.256,
       "step": 10000
     },
     {
       "epoch": 0.6257466295011093,
-      "grad_norm": 2.0052566528320312,
-      "learning_rate": 0.00069996245520223,
-      "loss": 1.525,
       "step": 11000
     },
     {
       "epoch": 0.6826326867284829,
-      "grad_norm": 1.8293088674545288,
-      "learning_rate": 0.0006908606860458502,
-      "loss": 1.4918,
       "step": 12000
     },
     {
       "epoch": 0.7395187439558564,
-      "grad_norm": 1.8381669521331787,
-      "learning_rate": 0.0006817589168894705,
-      "loss": 1.4769,
       "step": 13000
     },
     {
       "epoch": 0.79640480118323,
-      "grad_norm": 1.7285594940185547,
-      "learning_rate": 0.0006726571477330906,
-      "loss": 1.4607,
       "step": 14000
     },
     {
       "epoch": 0.8532908584106036,
-      "grad_norm": 1.7327940464019775,
-      "learning_rate": 0.0006635553785767108,
-      "loss": 1.439,
       "step": 15000
     },
     {
       "epoch": 0.8532908584106036,
-      "eval_accuracy": 0.652024,
-      "eval_loss": 1.3989214897155762,
-      "eval_runtime": 118.43,
-      "eval_samples_per_second": 2110.951,
-      "eval_steps_per_second": 8.25,
       "step": 15000
     },
     {
       "epoch": 0.9101769156379771,
-      "grad_norm": 1.7836127281188965,
-      "learning_rate": 0.0006544536094203311,
-      "loss": 1.429,
       "step": 16000
     },
     {
       "epoch": 0.9670629728653507,
-      "grad_norm": 1.707483172416687,
-      "learning_rate": 0.0006453609420331078,
-      "loss": 1.4131,
       "step": 17000
     },
     {
       "epoch": 1.0239490300927243,
-      "grad_norm": 1.5185352563858032,
-      "learning_rate": 0.0006362682746458843,
-      "loss": 1.3919,
       "step": 18000
     },
     {
       "epoch": 1.0808350873200978,
-      "grad_norm": 1.5501619577407837,
-      "learning_rate": 0.0006271665054895046,
-      "loss": 1.3684,
       "step": 19000
     },
     {
       "epoch": 1.1377211445474713,
-      "grad_norm": 1.6240971088409424,
-      "learning_rate": 0.0006180738381022812,
-      "loss": 1.3625,
       "step": 20000
     },
     {
       "epoch": 1.1377211445474713,
-      "eval_accuracy": 0.664736,
-      "eval_loss": 1.344669222831726,
-      "eval_runtime": 118.5872,
-      "eval_samples_per_second": 2108.154,
-      "eval_steps_per_second": 8.239,
       "step": 20000
     },
     {
       "epoch": 1.194607201774845,
-      "grad_norm": 1.4840081930160522,
-      "learning_rate": 0.0006089811707150577,
-      "loss": 1.3516,
       "step": 21000
     },
     {
       "epoch": 1.2514932590022185,
-      "grad_norm": 1.844332218170166,
-      "learning_rate": 0.000599879401558678,
-      "loss": 1.3509,
       "step": 22000
     },
     {
       "epoch": 1.3083793162295922,
-      "grad_norm": 1.6865944862365723,
-      "learning_rate": 0.0005907776324022982,
-      "loss": 1.3429,
       "step": 23000
     },
     {
       "epoch": 1.3652653734569657,
-      "grad_norm": 1.7245532274246216,
-      "learning_rate": 0.0005816758632459185,
-      "loss": 1.3377,
       "step": 24000
     },
     {
       "epoch": 1.4221514306843392,
-      "grad_norm": 1.4441121816635132,
-      "learning_rate": 0.0005725740940895386,
-      "loss": 1.3192,
       "step": 25000
     },
     {
       "epoch": 1.4221514306843392,
-      "eval_accuracy": 0.675628,
-      "eval_loss": 1.296504020690918,
-      "eval_runtime": 118.6208,
-      "eval_samples_per_second": 2107.555,
-      "eval_steps_per_second": 8.236,
       "step": 25000
     },
     {
       "epoch": 1.4790374879117127,
-      "grad_norm": 1.5026684999465942,
-      "learning_rate": 0.0005634723249331589,
-      "loss": 1.3156,
       "step": 26000
     },
     {
       "epoch": 1.5359235451390862,
-      "grad_norm": 1.373693823814392,
-      "learning_rate": 0.0005543705557767792,
-      "loss": 1.3152,
       "step": 27000
     },
     {
       "epoch": 1.59280960236646,
-      "grad_norm": 1.4744044542312622,
-      "learning_rate": 0.0005452687866203994,
-      "loss": 1.307,
       "step": 28000
     },
     {
       "epoch": 1.6496956595938337,
-      "grad_norm": 1.45868718624115,
-      "learning_rate": 0.0005361670174640196,
-      "loss": 1.2985,
       "step": 29000
     },
     {
       "epoch": 1.7065817168212072,
-      "grad_norm": 1.4872881174087524,
-      "learning_rate": 0.0005270652483076398,
-      "loss": 1.3,
       "step": 30000
     },
     {
       "epoch": 1.7065817168212072,
-      "eval_accuracy": 0.679452,
-      "eval_loss": 1.2787636518478394,
-      "eval_runtime": 118.4552,
-      "eval_samples_per_second": 2110.502,
-      "eval_steps_per_second": 8.248,
       "step": 30000
     },
     {
       "epoch": 1.7634677740485807,
-      "grad_norm": 1.4146182537078857,
-      "learning_rate": 0.0005179725809204165,
-      "loss": 1.2917,
       "step": 31000
     },
     {
       "epoch": 1.8203538312759542,
-      "grad_norm": 1.511470913887024,
-      "learning_rate": 0.000508879913533193,
-      "loss": 1.2894,
       "step": 32000
     },
     {
       "epoch": 1.8772398885033277,
-      "grad_norm": 1.3366495370864868,
-      "learning_rate": 0.0004997781443768132,
-      "loss": 1.2836,
       "step": 33000
     },
     {
       "epoch": 1.9341259457307014,
-      "grad_norm": 1.3103934526443481,
-      "learning_rate": 0.0004906763752204335,
-      "loss": 1.2712,
       "step": 34000
     },
     {
       "epoch": 1.9910120029580751,
-      "grad_norm": 1.6586687564849854,
-      "learning_rate": 0.00048157460606405373,
-      "loss": 1.2695,
       "step": 35000
     },
     {
       "epoch": 1.9910120029580751,
-      "eval_accuracy": 0.69002,
-      "eval_loss": 1.2347031831741333,
-      "eval_runtime": 118.4567,
-      "eval_samples_per_second": 2110.476,
-      "eval_steps_per_second": 8.248,
       "step": 35000
     },
     {
       "epoch": 2.0478980601854486,
-      "grad_norm": 1.3782742023468018,
-      "learning_rate": 0.0004724819386768303,
-      "loss": 1.242,
       "step": 36000
     },
     {
       "epoch": 2.104784117412822,
-      "grad_norm": 1.4477494955062866,
-      "learning_rate": 0.0004633983730587633,
-      "loss": 1.2336,
       "step": 37000
     },
     {
       "epoch": 2.1616701746401956,
-      "grad_norm": 1.2619287967681885,
-      "learning_rate": 0.0004543057056715399,
-      "loss": 1.2275,
       "step": 38000
     },
     {
       "epoch": 2.218556231867569,
-      "grad_norm": 1.4561299085617065,
-      "learning_rate": 0.00044520393651516015,
-      "loss": 1.2187,
       "step": 39000
     },
     {
       "epoch": 2.2754422890949426,
-      "grad_norm": 1.4557944536209106,
-      "learning_rate": 0.0004361021673587804,
-      "loss": 1.2297,
       "step": 40000
     },
     {
       "epoch": 2.2754422890949426,
-      "eval_accuracy": 0.695532,
-      "eval_loss": 1.215972661972046,
-      "eval_runtime": 118.5665,
-      "eval_samples_per_second": 2108.522,
-      "eval_steps_per_second": 8.24,
       "step": 40000
     },
     {
       "epoch": 2.3323283463223166,
-      "grad_norm": 1.6357080936431885,
-      "learning_rate": 0.00042700039820240066,
-      "loss": 1.2232,
       "step": 41000
     },
     {
       "epoch": 2.38921440354969,
-      "grad_norm": 1.4996728897094727,
-      "learning_rate": 0.00041789862904602083,
-      "loss": 1.2221,
       "step": 42000
     },
     {
       "epoch": 2.4461004607770636,
-      "grad_norm": 1.3550739288330078,
-      "learning_rate": 0.0004088059616587974,
-      "loss": 1.2131,
       "step": 43000
     },
     {
       "epoch": 2.502986518004437,
-      "grad_norm": 1.3799809217453003,
-      "learning_rate": 0.00039970419250241767,
-      "loss": 1.2118,
       "step": 44000
     },
     {
       "epoch": 2.5598725752318106,
-      "grad_norm": 1.3335360288619995,
-      "learning_rate": 0.0003906024233460379,
-      "loss": 1.2144,
       "step": 45000
     },
     {
       "epoch": 2.5598725752318106,
-      "eval_accuracy": 0.702096,
-      "eval_loss": 1.1893980503082275,
-      "eval_runtime": 118.534,
-      "eval_samples_per_second": 2109.1,
-      "eval_steps_per_second": 8.242,
       "step": 45000
     },
     {
       "epoch": 2.6167586324591845,
-      "grad_norm": 1.405167818069458,
-      "learning_rate": 0.0003815097559588145,
-      "loss": 1.2104,
       "step": 46000
     },
     {
       "epoch": 2.673644689686558,
-      "grad_norm": 1.3935025930404663,
-      "learning_rate": 0.00037240798680243474,
-      "loss": 1.2059,
       "step": 47000
     },
     {
       "epoch": 2.7305307469139315,
-      "grad_norm": 1.5090906620025635,
-      "learning_rate": 0.00036330621764605497,
-      "loss": 1.2026,
       "step": 48000
     },
     {
       "epoch": 2.787416804141305,
-      "grad_norm": 1.4666266441345215,
-      "learning_rate": 0.0003542044484896752,
-      "loss": 1.2032,
       "step": 49000
     },
     {
       "epoch": 2.8443028613686785,
-      "grad_norm": 1.4617928266525269,
-      "learning_rate": 0.0003451026793332954,
-      "loss": 1.1945,
       "step": 50000
     },
     {
       "epoch": 2.8443028613686785,
-      "eval_accuracy": 0.705848,
-      "eval_loss": 1.1734095811843872,
-      "eval_runtime": 118.5433,
-      "eval_samples_per_second": 2108.934,
-      "eval_steps_per_second": 8.242,
       "step": 50000
     },
     {
       "epoch": 2.901188918596052,
-      "grad_norm": 1.4179085493087769,
-      "learning_rate": 0.00033601001194607204,
-      "loss": 1.1888,
       "step": 51000
     },
     {
       "epoch": 2.9580749758234255,
-      "grad_norm": 1.4129358530044556,
-      "learning_rate": 0.00032691734455884866,
-      "loss": 1.1952,
       "step": 52000
     },
     {
       "epoch": 3.0149610330507994,
-      "grad_norm": 1.3465383052825928,
-      "learning_rate": 0.0003178155754024689,
-      "loss": 1.1782,
       "step": 53000
     },
     {
       "epoch": 3.071847090278173,
-      "grad_norm": 1.4815254211425781,
-      "learning_rate": 0.00030871380624608905,
-      "loss": 1.1572,
       "step": 54000
     },
     {
       "epoch": 3.1287331475055464,
-      "grad_norm": 1.472550868988037,
-      "learning_rate": 0.00029961203708970933,
-      "loss": 1.1551,
       "step": 55000
     },
     {
       "epoch": 3.1287331475055464,
-      "eval_accuracy": 0.708444,
-      "eval_loss": 1.1611371040344238,
-      "eval_runtime": 118.5055,
-      "eval_samples_per_second": 2109.606,
-      "eval_steps_per_second": 8.244,
       "step": 55000
     },
     {
       "epoch": 3.18561920473292,
-      "grad_norm": 1.6439310312271118,
-      "learning_rate": 0.00029051026793332956,
-      "loss": 1.1542,
       "step": 56000
     },
     {
       "epoch": 3.2425052619602934,
-      "grad_norm": 1.4789113998413086,
-      "learning_rate": 0.0002814176005461062,
-      "loss": 1.1503,
       "step": 57000
     },
     {
       "epoch": 3.299391319187667,
-      "grad_norm": 1.2807673215866089,
-      "learning_rate": 0.00027232493315888274,
-      "loss": 1.1468,
       "step": 58000
     },
     {
       "epoch": 3.356277376415041,
-      "grad_norm": 1.3815586566925049,
-      "learning_rate": 0.000263223164002503,
-      "loss": 1.1523,
       "step": 59000
     },
     {
       "epoch": 3.4131634336424144,
-      "grad_norm": 1.4986367225646973,
-      "learning_rate": 0.00025412139484612325,
-      "loss": 1.1471,
       "step": 60000
     },
     {
       "epoch": 3.4131634336424144,
-      "eval_accuracy": 0.7104,
-      "eval_loss": 1.152265191078186,
-      "eval_runtime": 118.4796,
-      "eval_samples_per_second": 2110.067,
-      "eval_steps_per_second": 8.246,
       "step": 60000
     },
     {
       "epoch": 3.470049490869788,
-      "grad_norm": 1.3522818088531494,
-      "learning_rate": 0.00024501962568974347,
-      "loss": 1.1493,
       "step": 61000
     },
     {
       "epoch": 3.5269355480971614,
-      "grad_norm": 1.3799934387207031,
-      "learning_rate": 0.0002359178565333637,
-      "loss": 1.1534,
       "step": 62000
     },
     {
       "epoch": 3.583821605324535,
-      "grad_norm": 1.4971522092819214,
-      "learning_rate": 0.00022681608737698392,
-      "loss": 1.1475,
       "step": 63000
     },
     {
       "epoch": 3.6407076625519084,
-      "grad_norm": 1.4308714866638184,
-      "learning_rate": 0.00021771431822060415,
-      "loss": 1.1478,
       "step": 64000
     },
     {
       "epoch": 3.697593719779282,
-      "grad_norm": 1.6222587823867798,
-      "learning_rate": 0.00020861254906422437,
-      "loss": 1.1301,
       "step": 65000
     },
     {
       "epoch": 3.697593719779282,
-      "eval_accuracy": 0.715576,
-      "eval_loss": 1.1314274072647095,
-      "eval_runtime": 118.553,
-      "eval_samples_per_second": 2108.761,
-      "eval_steps_per_second": 8.241,
       "step": 65000
     },
     {
       "epoch": 3.754479777006656,
-      "grad_norm": 1.3719106912612915,
-      "learning_rate": 0.00019951988167700096,
-      "loss": 1.1364,
       "step": 66000
     },
     {
       "epoch": 3.8113658342340293,
-      "grad_norm": 1.5775474309921265,
-      "learning_rate": 0.00019042721428977758,
-      "loss": 1.1268,
       "step": 67000
     },
     {
       "epoch": 3.868251891461403,
-      "grad_norm": 1.4434072971343994,
-      "learning_rate": 0.0001813254451333978,
-      "loss": 1.1395,
       "step": 68000
     },
     {
       "epoch": 3.9251379486887763,
-      "grad_norm": 1.6004397869110107,
-      "learning_rate": 0.00017222367597701806,
-      "loss": 1.1324,
       "step": 69000
     },
     {
       "epoch": 3.98202400591615,
-      "grad_norm": 1.4771836996078491,
-      "learning_rate": 0.00016312190682063826,
-      "loss": 1.1286,
       "step": 70000
     },
     {
       "epoch": 3.98202400591615,
-      "eval_accuracy": 0.718576,
-      "eval_loss": 1.1219959259033203,
-      "eval_runtime": 118.4895,
-      "eval_samples_per_second": 2109.892,
-      "eval_steps_per_second": 8.245,
       "step": 70000
     },
     {
       "epoch": 4.038910063143524,
-      "grad_norm": 1.2995303869247437,
-      "learning_rate": 0.00015402013766425848,
-      "loss": 1.1102,
       "step": 71000
     },
     {
       "epoch": 4.095796120370897,
-      "grad_norm": 1.5995845794677734,
-      "learning_rate": 0.0001449183685078787,
-      "loss": 1.1039,
       "step": 72000
     },
     {
       "epoch": 4.152682177598271,
-      "grad_norm": 1.4186768531799316,
-      "learning_rate": 0.00013582570112065533,
-      "loss": 1.0986,
       "step": 73000
     },
     {
       "epoch": 4.209568234825644,
-      "grad_norm": 1.3645439147949219,
-      "learning_rate": 0.00012672393196427555,
-      "loss": 1.097,
       "step": 74000
     },
     {
       "epoch": 4.266454292053018,
-      "grad_norm": 1.3423221111297607,
-      "learning_rate": 0.00011762216280789579,
-      "loss": 1.0898,
       "step": 75000
     },
     {
       "epoch": 4.266454292053018,
-      "eval_accuracy": 0.720332,
-      "eval_loss": 1.1140097379684448,
-      "eval_runtime": 118.5031,
-      "eval_samples_per_second": 2109.649,
-      "eval_steps_per_second": 8.245,
       "step": 75000
     },
     {
       "epoch": 4.323340349280391,
-      "grad_norm": 1.4441511631011963,
-      "learning_rate": 0.00010852949542067239,
-      "loss": 1.0967,
       "step": 76000
     },
     {
       "epoch": 4.380226406507765,
-      "grad_norm": 1.5282950401306152,
-      "learning_rate": 9.942772626429264e-05,
-      "loss": 1.1011,
       "step": 77000
     },
     {
       "epoch": 4.437112463735138,
-      "grad_norm": 1.3688595294952393,
-      "learning_rate": 9.033505887706923e-05,
-      "loss": 1.0954,
       "step": 78000
     },
     {
       "epoch": 4.493998520962512,
-      "grad_norm": 1.5577939748764038,
-      "learning_rate": 8.123328972068947e-05,
-      "loss": 1.0949,
       "step": 79000
     },
     {
       "epoch": 4.550884578189885,
-      "grad_norm": 1.6534169912338257,
-      "learning_rate": 7.21315205643097e-05,
-      "loss": 1.093,
       "step": 80000
     },
     {
       "epoch": 4.550884578189885,
-      "eval_accuracy": 0.723164,
-      "eval_loss": 1.1039903163909912,
-      "eval_runtime": 118.5054,
-      "eval_samples_per_second": 2109.609,
-      "eval_steps_per_second": 8.244,
       "step": 80000
     },
     {
       "epoch": 4.607770635417259,
-      "grad_norm": 1.5630171298980713,
-      "learning_rate": 6.302975140792992e-05,
-      "loss": 1.0889,
       "step": 81000
     },
     {
       "epoch": 4.664656692644633,
-      "grad_norm": 1.511986494064331,
-      "learning_rate": 5.393708402070653e-05,
-      "loss": 1.0893,
       "step": 82000
     },
     {
       "epoch": 4.721542749872007,
-      "grad_norm": 1.5626702308654785,
-      "learning_rate": 4.483531486432676e-05,
-      "loss": 1.0857,
       "step": 83000
     },
     {
       "epoch": 4.77842880709938,
-      "grad_norm": 1.3917585611343384,
-      "learning_rate": 3.574264747710336e-05,
-      "loss": 1.0879,
       "step": 84000
     },
     {
       "epoch": 4.835314864326754,
-      "grad_norm": 1.6141693592071533,
-      "learning_rate": 2.6649980089879973e-05,
-      "loss": 1.0893,
       "step": 85000
     },
     {
       "epoch": 4.835314864326754,
-      "eval_accuracy": 0.724572,
-      "eval_loss": 1.0985814332962036,
-      "eval_runtime": 118.5475,
-      "eval_samples_per_second": 2108.859,
-      "eval_steps_per_second": 8.241,
       "step": 85000
     },
     {
       "epoch": 4.892200921554127,
-      "grad_norm": 1.6234523057937622,
-      "learning_rate": 1.75482109335002e-05,
-      "loss": 1.0915,
       "step": 86000
     },
     {
       "epoch": 4.949086978781501,
-      "grad_norm": 1.462381362915039,
-      "learning_rate": 8.446441777120428e-06,
-      "loss": 1.0843,
       "step": 87000
     },
     {
       "epoch": 5.0,
       "step": 87895,
-      "total_flos": 5.4597445596112486e+17,
-      "train_loss": 1.296092871571504,
-      "train_runtime": 24664.1985,
-      "train_samples_per_second": 912.253,
-      "train_steps_per_second": 3.564
     }
   ],
   "logging_steps": 1000,
@@ -785,7 +785,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 5,
   "save_steps": 5000,
-  "total_flos": 5.4597445596112486e+17,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null

   "log_history": [
     {
       "epoch": 0.05688605722737357,
+      "grad_norm": 2.4062280654907227,
+      "learning_rate": 0.0007908982308436202,
+      "loss": 2.1918,
       "step": 1000
     },
     {
       "epoch": 0.11377211445474714,
+      "grad_norm": 1.431848406791687,
+      "learning_rate": 0.0007817964616872405,
+      "loss": 1.4818,
       "step": 2000
     },
     {
       "epoch": 0.17065817168212072,
+      "grad_norm": 1.5747077465057373,
+      "learning_rate": 0.0007726946925308607,
+      "loss": 1.3634,
       "step": 3000
     },
     {
       "epoch": 0.22754422890949427,
+      "grad_norm": 1.4864206314086914,
+      "learning_rate": 0.0007635929233744809,
+      "loss": 1.2967,
       "step": 4000
     },
     {
       "epoch": 0.2844302861368678,
+      "grad_norm": 1.2000905275344849,
+      "learning_rate": 0.0007544911542181011,
+      "loss": 1.2574,
       "step": 5000
     },
     {
       "epoch": 0.2844302861368678,
+      "eval_accuracy": 0.689128,
+      "eval_loss": 1.238457202911377,
+      "eval_runtime": 203.0197,
+      "eval_samples_per_second": 1231.407,
+      "eval_steps_per_second": 4.812,
       "step": 5000
     },
     {
       "epoch": 0.34131634336424144,
+      "grad_norm": 1.2910780906677246,
+      "learning_rate": 0.0007453893850617214,
+      "loss": 1.2181,
       "step": 6000
     },
     {
       "epoch": 0.398202400591615,
+      "grad_norm": 1.1383774280548096,
+      "learning_rate": 0.0007362876159053416,
+      "loss": 1.1863,
       "step": 7000
     },
     {
       "epoch": 0.45508845781898855,
+      "grad_norm": 1.135689616203308,
+      "learning_rate": 0.0007271858467489618,
+      "loss": 1.1653,
       "step": 8000
     },
     {
       "epoch": 0.5119745150463622,
+      "grad_norm": 1.1965036392211914,
+      "learning_rate": 0.0007180840775925821,
+      "loss": 1.147,
       "step": 9000
     },
     {
       "epoch": 0.5688605722737357,
+      "grad_norm": 1.0561026334762573,
+      "learning_rate": 0.0007089823084362024,
+      "loss": 1.1281,
       "step": 10000
     },
     {
       "epoch": 0.5688605722737357,
+      "eval_accuracy": 0.715764,
+      "eval_loss": 1.1192152500152588,
+      "eval_runtime": 128.8781,
+      "eval_samples_per_second": 1939.817,
+      "eval_steps_per_second": 7.581,
       "step": 10000
     },
     {
       "epoch": 0.6257466295011093,
+      "grad_norm": 0.9711835980415344,
+      "learning_rate": 0.0006998805392798226,
+      "loss": 1.1232,
       "step": 11000
     },
     {
       "epoch": 0.6826326867284829,
+      "grad_norm": 0.8913602828979492,
+      "learning_rate": 0.0006907787701234428,
+      "loss": 1.0988,
       "step": 12000
     },
     {
       "epoch": 0.7395187439558564,
+      "grad_norm": 1.092698097229004,
+      "learning_rate": 0.000681677000967063,
+      "loss": 1.0897,
       "step": 13000
     },
     {
       "epoch": 0.79640480118323,
+      "grad_norm": 0.9319038391113281,
+      "learning_rate": 0.0006725752318106833,
+      "loss": 1.0826,
       "step": 14000
     },
     {
       "epoch": 0.8532908584106036,
+      "grad_norm": 1.0223675966262817,
+      "learning_rate": 0.0006634734626543035,
+      "loss": 1.0698,
       "step": 15000
     },
     {
       "epoch": 0.8532908584106036,
+      "eval_accuracy": 0.728676,
+      "eval_loss": 1.0653605461120605,
+      "eval_runtime": 128.0826,
+      "eval_samples_per_second": 1951.866,
+      "eval_steps_per_second": 7.628,
       "step": 15000
     },
     {
       "epoch": 0.9101769156379771,
+      "grad_norm": 0.8995338678359985,
+      "learning_rate": 0.0006543716934979237,
+      "loss": 1.0624,
       "step": 16000
     },
     {
       "epoch": 0.9670629728653507,
+      "grad_norm": 0.8418471217155457,
+      "learning_rate": 0.0006452699243415439,
+      "loss": 1.0538,
       "step": 17000
     },
     {
       "epoch": 1.0239490300927243,
+      "grad_norm": 1.024624228477478,
+      "learning_rate": 0.0006361681551851641,
+      "loss": 1.0311,
       "step": 18000
     },
     {
       "epoch": 1.0808350873200978,
+      "grad_norm": 0.9130891561508179,
+      "learning_rate": 0.0006270663860287844,
+      "loss": 0.999,
       "step": 19000
     },
     {
       "epoch": 1.1377211445474713,
+      "grad_norm": 0.8896342515945435,
+      "learning_rate": 0.0006179646168724045,
+      "loss": 1.0,
       "step": 20000
     },
     {
       "epoch": 1.1377211445474713,
+      "eval_accuracy": 0.739712,
+      "eval_loss": 1.0235533714294434,
+      "eval_runtime": 127.2585,
+      "eval_samples_per_second": 1964.505,
+      "eval_steps_per_second": 7.677,
       "step": 20000
     },
     {
       "epoch": 1.194607201774845,
+      "grad_norm": 0.7940112948417664,
+      "learning_rate": 0.0006088628477160248,
+      "loss": 0.9957,
       "step": 21000
     },
     {
       "epoch": 1.2514932590022185,
+      "grad_norm": 0.9015308618545532,
+      "learning_rate": 0.000599761078559645,
+      "loss": 0.9967,
       "step": 22000
     },
     {
       "epoch": 1.3083793162295922,
+      "grad_norm": 0.9106078147888184,
+      "learning_rate": 0.0005906593094032653,
+      "loss": 0.9939,
       "step": 23000
     },
     {
       "epoch": 1.3652653734569657,
+      "grad_norm": 0.9563422203063965,
+      "learning_rate": 0.0005815575402468854,
+      "loss": 0.9931,
       "step": 24000
     },
     {
       "epoch": 1.4221514306843392,
+      "grad_norm": 0.7646272778511047,
+      "learning_rate": 0.0005724557710905057,
+      "loss": 0.9774,
       "step": 25000
     },
     {
       "epoch": 1.4221514306843392,
+      "eval_accuracy": 0.743348,
+      "eval_loss": 1.0054922103881836,
+      "eval_runtime": 127.7729,
+      "eval_samples_per_second": 1956.596,
+      "eval_steps_per_second": 7.646,
       "step": 25000
     },
     {
       "epoch": 1.4790374879117127,
+      "grad_norm": 0.7779045104980469,
+      "learning_rate": 0.000563354001934126,
+      "loss": 0.9792,
       "step": 26000
     },
     {
       "epoch": 1.5359235451390862,
+      "grad_norm": 0.8506484627723694,
+      "learning_rate": 0.0005542522327777463,
+      "loss": 0.9778,
       "step": 27000
     },
     {
       "epoch": 1.59280960236646,
+      "grad_norm": 0.8443676829338074,
+      "learning_rate": 0.0005451504636213664,
+      "loss": 0.9715,
       "step": 28000
     },
     {
       "epoch": 1.6496956595938337,
+      "grad_norm": 0.9333568215370178,
+      "learning_rate": 0.0005360486944649867,
+      "loss": 0.9679,
       "step": 29000
     },
     {
       "epoch": 1.7065817168212072,
+      "grad_norm": 0.9501623511314392,
+      "learning_rate": 0.0005269469253086069,
+      "loss": 0.9684,
       "step": 30000
     },
     {
       "epoch": 1.7065817168212072,
+      "eval_accuracy": 0.749276,
+      "eval_loss": 0.9812818765640259,
+      "eval_runtime": 128.5758,
+      "eval_samples_per_second": 1944.379,
+      "eval_steps_per_second": 7.599,
       "step": 30000
     },
     {
       "epoch": 1.7634677740485807,
+      "grad_norm": 0.7442188262939453,
+      "learning_rate": 0.0005178451561522272,
+      "loss": 0.9636,
       "step": 31000
     },
     {
       "epoch": 1.8203538312759542,
+      "grad_norm": 0.7510819435119629,
+      "learning_rate": 0.0005087433869958473,
+      "loss": 0.9647,
       "step": 32000
     },
     {
       "epoch": 1.8772398885033277,
+      "grad_norm": 0.7448764443397522,
+      "learning_rate": 0.0004996416178394676,
+      "loss": 0.9591,
       "step": 33000
     },
     {
       "epoch": 1.9341259457307014,
+      "grad_norm": 0.8019358515739441,
+      "learning_rate": 0.0004905398486830878,
+      "loss": 0.9513,
       "step": 34000
     },
     {
       "epoch": 1.9910120029580751,
+      "grad_norm": 0.9495121240615845,
+      "learning_rate": 0.00048143807952670797,
+      "loss": 0.9511,
       "step": 35000
     },
     {
       "epoch": 1.9910120029580751,
+      "eval_accuracy": 0.755448,
+      "eval_loss": 0.9558805227279663,
+      "eval_runtime": 127.8711,
+      "eval_samples_per_second": 1955.094,
+      "eval_steps_per_second": 7.641,
       "step": 35000
     },
     {
       "epoch": 2.0478980601854486,
+      "grad_norm": 0.8410281538963318,
+      "learning_rate": 0.00047233631037032825,
+      "loss": 0.9081,
       "step": 36000
     },
     {
       "epoch": 2.104784117412822,
+      "grad_norm": 0.8246123194694519,
+      "learning_rate": 0.00046323454121394847,
+      "loss": 0.8964,
       "step": 37000
     },
     {
       "epoch": 2.1616701746401956,
+      "grad_norm": 0.9567108154296875,
+      "learning_rate": 0.0004541327720575687,
+      "loss": 0.8952,
       "step": 38000
     },
     {
       "epoch": 2.218556231867569,
+      "grad_norm": 0.8104901313781738,
+      "learning_rate": 0.0004450310029011889,
+      "loss": 0.8925,
       "step": 39000
     },
     {
       "epoch": 2.2754422890949426,
+      "grad_norm": 0.9034276008605957,
+      "learning_rate": 0.0004359292337448092,
+      "loss": 0.8998,
       "step": 40000
     },
     {
       "epoch": 2.2754422890949426,
+      "eval_accuracy": 0.755948,
+      "eval_loss": 0.9492226839065552,
+      "eval_runtime": 127.8812,
+      "eval_samples_per_second": 1954.94,
+      "eval_steps_per_second": 7.64,
       "step": 40000
     },
     {
       "epoch": 2.3323283463223166,
+      "grad_norm": 1.3229442834854126,
+      "learning_rate": 0.00042682746458842937,
+      "loss": 0.8962,
       "step": 41000
     },
     {
       "epoch": 2.38921440354969,
+      "grad_norm": 0.8582925200462341,
+      "learning_rate": 0.00041772569543204965,
+      "loss": 0.8976,
       "step": 42000
     },
     {
       "epoch": 2.4461004607770636,
+      "grad_norm": 0.8881712555885315,
+      "learning_rate": 0.0004086239262756698,
+      "loss": 0.8898,
       "step": 43000
     },
     {
       "epoch": 2.502986518004437,
+      "grad_norm": 0.8713961839675903,
+      "learning_rate": 0.00039952215711929005,
+      "loss": 0.8927,
       "step": 44000
     },
     {
       "epoch": 2.5598725752318106,
+      "grad_norm": 0.7883007526397705,
+      "learning_rate": 0.00039042038796291027,
+      "loss": 0.8967,
       "step": 45000
     },
     {
       "epoch": 2.5598725752318106,
+      "eval_accuracy": 0.760028,
+      "eval_loss": 0.937300980091095,
+      "eval_runtime": 130.0782,
+      "eval_samples_per_second": 1921.921,
+      "eval_steps_per_second": 7.511,
       "step": 45000
     },
     {
       "epoch": 2.6167586324591845,
+      "grad_norm": 0.8600155711174011,
+      "learning_rate": 0.00038131861880653055,
+      "loss": 0.8927,
       "step": 46000
     },
     {
       "epoch": 2.673644689686558,
+      "grad_norm": 0.8501909971237183,
+      "learning_rate": 0.0003722168496501508,
+      "loss": 0.8913,
       "step": 47000
     },
     {
       "epoch": 2.7305307469139315,
+      "grad_norm": 0.8116582632064819,
+      "learning_rate": 0.000363115080493771,
+      "loss": 0.8889,
       "step": 48000
     },
     {
       "epoch": 2.787416804141305,
+      "grad_norm": 0.8065186738967896,
+      "learning_rate": 0.0003540133113373912,
+      "loss": 0.8896,
       "step": 49000
     },
     {
       "epoch": 2.8443028613686785,
+      "grad_norm": 0.9248031973838806,
+      "learning_rate": 0.00034491154218101145,
+      "loss": 0.8837,
       "step": 50000
     },
     {
       "epoch": 2.8443028613686785,
+      "eval_accuracy": 0.762176,
+      "eval_loss": 0.9251159429550171,
+      "eval_runtime": 128.4439,
+      "eval_samples_per_second": 1946.376,
+      "eval_steps_per_second": 7.606,
       "step": 50000
     },
     {
       "epoch": 2.901188918596052,
+      "grad_norm": 0.8191467523574829,
+      "learning_rate": 0.0003358097730246317,
+      "loss": 0.878,
       "step": 51000
     },
     {
       "epoch": 2.9580749758234255,
+      "grad_norm": 0.7620063424110413,
+      "learning_rate": 0.0003267080038682519,
+      "loss": 0.8832,
       "step": 52000
     },
     {
       "epoch": 3.0149610330507994,
+      "grad_norm": 0.8365482687950134,
+      "learning_rate": 0.0003176062347118721,
+      "loss": 0.8621,
       "step": 53000
     },
     {
       "epoch": 3.071847090278173,
+      "grad_norm": 0.9817807078361511,
+      "learning_rate": 0.00030850446555549235,
+      "loss": 0.8224,
       "step": 54000
     },
     {
       "epoch": 3.1287331475055464,
+      "grad_norm": 0.847806453704834,
+      "learning_rate": 0.00029940269639911263,
+      "loss": 0.8253,
       "step": 55000
     },
     {
       "epoch": 3.1287331475055464,
+      "eval_accuracy": 0.76438,
+      "eval_loss": 0.9235970973968506,
+      "eval_runtime": 126.2531,
+      "eval_samples_per_second": 1980.15,
+      "eval_steps_per_second": 7.738,
       "step": 55000
     },
     {
       "epoch": 3.18561920473292,
+      "grad_norm": 1.1729530096054077,
+      "learning_rate": 0.00029030092724273285,
+      "loss": 0.8225,
       "step": 56000
     },
     {
       "epoch": 3.2425052619602934,
+      "grad_norm": 1.0548408031463623,
+      "learning_rate": 0.0002811991580863531,
+      "loss": 0.821,
       "step": 57000
     },
     {
       "epoch": 3.299391319187667,
+      "grad_norm": 1.0199774503707886,
+      "learning_rate": 0.0002720973889299733,
+      "loss": 0.8213,
       "step": 58000
     },
     {
       "epoch": 3.356277376415041,
+      "grad_norm": 0.9180177450180054,
+      "learning_rate": 0.00026299561977359353,
+      "loss": 0.8274,
       "step": 59000
     },
     {
       "epoch": 3.4131634336424144,
+      "grad_norm": 0.9745663404464722,
+      "learning_rate": 0.0002538938506172137,
+      "loss": 0.8229,
       "step": 60000
     },
     {
       "epoch": 3.4131634336424144,
+      "eval_accuracy": 0.766832,
+      "eval_loss": 0.9138370156288147,
+      "eval_runtime": 129.2727,
+      "eval_samples_per_second": 1933.897,
+      "eval_steps_per_second": 7.558,
       "step": 60000
     },
     {
       "epoch": 3.470049490869788,
+      "grad_norm": 0.8708947896957397,
+      "learning_rate": 0.0002447920814608339,
+      "loss": 0.8256,
       "step": 61000
     },
     {
       "epoch": 3.5269355480971614,
+      "grad_norm": 0.9808185696601868,
+      "learning_rate": 0.00023569031230445418,
+      "loss": 0.8298,
       "step": 62000
     },
     {
       "epoch": 3.583821605324535,
+      "grad_norm": 0.8228833079338074,
+      "learning_rate": 0.0002265885431480744,
+      "loss": 0.827,
       "step": 63000
     },
     {
       "epoch": 3.6407076625519084,
+      "grad_norm": 0.9581019878387451,
+      "learning_rate": 0.00021748677399169463,
+      "loss": 0.8275,
       "step": 64000
     },
     {
       "epoch": 3.697593719779282,
+      "grad_norm": 0.8560314178466797,
+      "learning_rate": 0.00020838500483531488,
+      "loss": 0.8145,
       "step": 65000
     },
     {
       "epoch": 3.697593719779282,
+      "eval_accuracy": 0.769172,
+      "eval_loss": 0.9042648673057556,
+      "eval_runtime": 129.2138,
+      "eval_samples_per_second": 1934.778,
+      "eval_steps_per_second": 7.561,
       "step": 65000
     },
     {
       "epoch": 3.754479777006656,
+      "grad_norm": 0.8918451070785522,
+      "learning_rate": 0.0001992832356789351,
+      "loss": 0.819,
       "step": 66000
     },
     {
       "epoch": 3.8113658342340293,
+      "grad_norm": 1.0977294445037842,
+      "learning_rate": 0.00019018146652255533,
+      "loss": 0.8122,
       "step": 67000
     },
     {
       "epoch": 3.868251891461403,
+      "grad_norm": 0.7856444716453552,
+      "learning_rate": 0.00018107969736617555,
+      "loss": 0.8225,
       "step": 68000
     },
     {
       "epoch": 3.9251379486887763,
+      "grad_norm": 0.9270259141921997,
+      "learning_rate": 0.00017197792820979578,
+      "loss": 0.8158,
       "step": 69000
     },
     {
       "epoch": 3.98202400591615,
+      "grad_norm": 1.082774043083191,
+      "learning_rate": 0.00016287615905341603,
+      "loss": 0.8156,
       "step": 70000
     },
     {
       "epoch": 3.98202400591615,
+      "eval_accuracy": 0.770764,
+      "eval_loss": 0.8961142301559448,
+      "eval_runtime": 138.0555,
+      "eval_samples_per_second": 1810.866,
+      "eval_steps_per_second": 7.077,
       "step": 70000
     },
     {
       "epoch": 4.038910063143524,
+      "grad_norm": 0.909858226776123,
+      "learning_rate": 0.00015377438989703626,
+      "loss": 0.7785,
       "step": 71000
     },
     {
       "epoch": 4.095796120370897,
+      "grad_norm": 0.931280791759491,
+      "learning_rate": 0.00014467262074065645,
+      "loss": 0.7637,
       "step": 72000
     },
     {
       "epoch": 4.152682177598271,
+      "grad_norm": 0.94422847032547,
+      "learning_rate": 0.0001355708515842767,
+      "loss": 0.7612,
       "step": 73000
     },
     {
       "epoch": 4.209568234825644,
+      "grad_norm": 0.9250127077102661,
+      "learning_rate": 0.00012646908242789693,
+      "loss": 0.7616,
       "step": 74000
     },
     {
       "epoch": 4.266454292053018,
+      "grad_norm": 0.8467296957969666,
+      "learning_rate": 0.00011736731327151716,
+      "loss": 0.7557,
       "step": 75000
     },
     {
       "epoch": 4.266454292053018,
+      "eval_accuracy": 0.77204,
+      "eval_loss": 0.9022773504257202,
+      "eval_runtime": 144.5432,
+      "eval_samples_per_second": 1729.587,
+      "eval_steps_per_second": 6.759,
       "step": 75000
     },
     {
       "epoch": 4.323340349280391,
+      "grad_norm": 0.8985564708709717,
+      "learning_rate": 0.00010826554411513738,
+      "loss": 0.7604,
       "step": 76000
     },
     {
       "epoch": 4.380226406507765,
+      "grad_norm": 0.8618564605712891,
+      "learning_rate": 9.916377495875762e-05,
+      "loss": 0.7632,
       "step": 77000
     },
     {
       "epoch": 4.437112463735138,
+      "grad_norm": 0.9467126727104187,
+      "learning_rate": 9.006200580237784e-05,
+      "loss": 0.7614,
       "step": 78000
     },
     {
       "epoch": 4.493998520962512,
+      "grad_norm": 1.0163730382919312,
+      "learning_rate": 8.096023664599807e-05,
+      "loss": 0.7575,
       "step": 79000
     },
     {
       "epoch": 4.550884578189885,
+      "grad_norm": 1.1194038391113281,
+      "learning_rate": 7.18584674896183e-05,
+      "loss": 0.7595,
       "step": 80000
     },
     {
       "epoch": 4.550884578189885,
+      "eval_accuracy": 0.772256,
+      "eval_loss": 0.897346019744873,
+      "eval_runtime": 136.9434,
+      "eval_samples_per_second": 1825.571,
+      "eval_steps_per_second": 7.134,
       "step": 80000
     },
     {
       "epoch": 4.607770635417259,
+      "grad_norm": 1.0589629411697388,
+      "learning_rate": 6.275669833323853e-05,
+      "loss": 0.7548,
       "step": 81000
     },
     {
       "epoch": 4.664656692644633,
+      "grad_norm": 0.8540852665901184,
+      "learning_rate": 5.365492917685876e-05,
+      "loss": 0.7601,
       "step": 82000
     },
     {
       "epoch": 4.721542749872007,
+      "grad_norm": 1.127475380897522,
+      "learning_rate": 4.455316002047898e-05,
+      "loss": 0.7554,
       "step": 83000
     },
     {
       "epoch": 4.77842880709938,
+      "grad_norm": 0.9464063048362732,
+      "learning_rate": 3.545139086409921e-05,
+      "loss": 0.756,
       "step": 84000
     },
     {
       "epoch": 4.835314864326754,
+      "grad_norm": 0.9705914855003357,
+      "learning_rate": 2.634962170771944e-05,
+      "loss": 0.7581,
       "step": 85000
     },
     {
       "epoch": 4.835314864326754,
+      "eval_accuracy": 0.773724,
+      "eval_loss": 0.8925997018814087,
+      "eval_runtime": 138.7415,
+      "eval_samples_per_second": 1801.913,
+      "eval_steps_per_second": 7.042,
       "step": 85000
     },
     {
       "epoch": 4.892200921554127,
+      "grad_norm": 0.8879310488700867,
+      "learning_rate": 1.7247852551339668e-05,
+      "loss": 0.758,
       "step": 86000
     },
     {
       "epoch": 4.949086978781501,
+      "grad_norm": 1.2024400234222412,
+      "learning_rate": 8.146083394959896e-06,
+      "loss": 0.751,
       "step": 87000
     },
     {
       "epoch": 5.0,
       "step": 87895,
+      "total_flos": 1.93274424e+18,
+      "train_loss": 0.9357909288237652,
+      "train_runtime": 45635.435,
+      "train_samples_per_second": 493.038,
+      "train_steps_per_second": 1.926
     }
   ],
   "logging_steps": 1000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 5,
   "save_steps": 5000,
+  "total_flos": 1.93274424e+18,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb6b0f81e80c1f33b16aaf9c2fc69495be719e29fb2dfcec7d7a4debabe294f0
-size 4920

 version https://git-lfs.github.com/spec/v1
+oid sha256:bf823d8cbbdbfa0a6f25c927a43b2dbea4fdd4c372f9f50bf4026741c3ca5e20
+size 4472