upload model checkpoint

Browse files

Files changed (15) hide show

.gitattributes +1 -0
README.md +76 -3
adapter_config.json +29 -0
adapter_model.safetensors +3 -0
all_results.json +14 -0
config.json +44 -0
eval_results.json +8 -0
runs/Jun13_06-31-36_gpu1-1/events.out.tfevents.1718231645.gpu1-1.449944.0 +3 -0
runs/Jun13_06-31-36_gpu1-1/events.out.tfevents.1718235304.gpu1-1.449944.1 +3 -0
special_tokens_map.json +34 -0
tokenizer.json +3 -0
tokenizer_config.json +70 -0
train_results.json +9 -0
trainer_state.json +2217 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,76 @@
----
-license: gemma
----

+---
+library_name: peft
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- llama-duo/synth_summarize_dataset_dedup
+base_model: google/gemma-7b
+model-index:
+- name: gemma7b-summarize-claude3sonnet-64k
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gemma7b-summarize-claude3sonnet-64k
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
+It achieves the following results on the evaluation set:
+- Loss: 2.5547
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 2
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 10
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.1216        | 0.9967 | 150  | 2.5805          |
+| 0.9828        | 2.0    | 301  | 2.5169          |
+| 0.9157        | 2.9967 | 451  | 2.4836          |
+| 0.8753        | 4.0    | 602  | 2.5011          |
+| 0.8334        | 4.9967 | 752  | 2.4945          |
+| 0.796         | 6.0    | 903  | 2.5317          |
+| 0.7745        | 6.9967 | 1053 | 2.5436          |
+| 0.7582        | 8.0    | 1204 | 2.5522          |
+| 0.754         | 8.9967 | 1354 | 2.5504          |
+| 0.7572        | 9.9668 | 1500 | 2.5547          |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.40.0
+- Pytorch 2.1.2+cu121
+- Datasets 2.18.0
+- Tokenizers 0.19.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-7b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af543f051c5eebf1124e67f209d3c059c602369b5c1de960797b5b23c7c5a483
+size 25705248

all_results.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+    "epoch": 9.966777408637874,
+    "eval_loss": 2.5546562671661377,
+    "eval_runtime": 0.2362,
+    "eval_samples": 25,
+    "eval_samples_per_second": 42.34,
+    "eval_steps_per_second": 4.234,
+    "total_flos": 4.5794490708666614e+18,
+    "train_loss": 1.5882705609003702,
+    "train_runtime": 3659.0045,
+    "train_samples": 63494,
+    "train_samples_per_second": 26.291,
+    "train_steps_per_second": 0.41
+}

config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "_name_or_path": "google/gemma-7b",
+  "architectures": [
+    "GemmaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "head_dim": 256,
+  "hidden_act": "gelu",
+  "hidden_activation": null,
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 24576,
+  "max_position_embeddings": 8192,
+  "model_type": "gemma",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 16,
+  "pad_token_id": 0,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.0",
+  "use_cache": true,
+  "vocab_size": 256000
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 9.966777408637874,
+    "eval_loss": 2.5546562671661377,
+    "eval_runtime": 0.2362,
+    "eval_samples": 25,
+    "eval_samples_per_second": 42.34,
+    "eval_steps_per_second": 4.234
+}

runs/Jun13_06-31-36_gpu1-1/events.out.tfevents.1718231645.gpu1-1.449944.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23fd17c3b5113c660586eeb95bdc67c19ef297471656bfbdd9954c69e5f42366
+size 71954

runs/Jun13_06-31-36_gpu1-1/events.out.tfevents.1718235304.gpu1-1.449944.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f62d91cfb5510b2161ecc761700b913e816534575e08294745849bd38afa518
+size 359

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:322a5f52ab5cab196761ab397a022d6fa3a2e1418585e532bb6efb2fedd2ae94
+size 17477501

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<bos>",
+  "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "legacy": null,
+  "model_max_length": 2048,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 9.966777408637874,
+    "total_flos": 4.5794490708666614e+18,
+    "train_loss": 1.5882705609003702,
+    "train_runtime": 3659.0045,
+    "train_samples": 63494,
+    "train_samples_per_second": 26.291,
+    "train_steps_per_second": 0.41
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2217 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 9.966777408637874,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006644518272425249,
+      "grad_norm": 324.0,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 34.1539,
+      "step": 1
+    },
+    {
+      "epoch": 0.03322259136212625,
+      "grad_norm": 328.0,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 34.4732,
+      "step": 5
+    },
+    {
+      "epoch": 0.0664451827242525,
+      "grad_norm": 132.0,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 30.9731,
+      "step": 10
+    },
+    {
+      "epoch": 0.09966777408637874,
+      "grad_norm": 57.75,
+      "learning_rate": 2e-05,
+      "loss": 24.1357,
+      "step": 15
+    },
+    {
+      "epoch": 0.132890365448505,
+      "grad_norm": 19.125,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 19.6743,
+      "step": 20
+    },
+    {
+      "epoch": 0.16611295681063123,
+      "grad_norm": 14.1875,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 17.9465,
+      "step": 25
+    },
+    {
+      "epoch": 0.19933554817275748,
+      "grad_norm": 7.25,
+      "learning_rate": 4e-05,
+      "loss": 15.9561,
+      "step": 30
+    },
+    {
+      "epoch": 0.23255813953488372,
+      "grad_norm": 4.125,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 14.7788,
+      "step": 35
+    },
+    {
+      "epoch": 0.26578073089701,
+      "grad_norm": 3.484375,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 14.139,
+      "step": 40
+    },
+    {
+      "epoch": 0.29900332225913623,
+      "grad_norm": 4.75,
+      "learning_rate": 6e-05,
+      "loss": 13.5886,
+      "step": 45
+    },
+    {
+      "epoch": 0.33222591362126247,
+      "grad_norm": 6.125,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 13.0275,
+      "step": 50
+    },
+    {
+      "epoch": 0.3654485049833887,
+      "grad_norm": 11.5625,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 11.9071,
+      "step": 55
+    },
+    {
+      "epoch": 0.39867109634551495,
+      "grad_norm": 18.375,
+      "learning_rate": 8e-05,
+      "loss": 9.4575,
+      "step": 60
+    },
+    {
+      "epoch": 0.4318936877076412,
+      "grad_norm": 21.25,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 5.8479,
+      "step": 65
+    },
+    {
+      "epoch": 0.46511627906976744,
+      "grad_norm": 5.09375,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 2.6937,
+      "step": 70
+    },
+    {
+      "epoch": 0.4983388704318937,
+      "grad_norm": 6.46875,
+      "learning_rate": 0.0001,
+      "loss": 2.0051,
+      "step": 75
+    },
+    {
+      "epoch": 0.53156146179402,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.00010666666666666667,
+      "loss": 1.7309,
+      "step": 80
+    },
+    {
+      "epoch": 0.5647840531561462,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.00011333333333333334,
+      "loss": 1.5823,
+      "step": 85
+    },
+    {
+      "epoch": 0.5980066445182725,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00012,
+      "loss": 1.4702,
+      "step": 90
+    },
+    {
+      "epoch": 0.6312292358803987,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00012666666666666666,
+      "loss": 1.3996,
+      "step": 95
+    },
+    {
+      "epoch": 0.6644518272425249,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 1.3389,
+      "step": 100
+    },
+    {
+      "epoch": 0.6976744186046512,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00014,
+      "loss": 1.293,
+      "step": 105
+    },
+    {
+      "epoch": 0.7308970099667774,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 1.2656,
+      "step": 110
+    },
+    {
+      "epoch": 0.7641196013289037,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 1.2254,
+      "step": 115
+    },
+    {
+      "epoch": 0.7973421926910299,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00016,
+      "loss": 1.2072,
+      "step": 120
+    },
+    {
+      "epoch": 0.8305647840531561,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 1.1856,
+      "step": 125
+    },
+    {
+      "epoch": 0.8637873754152824,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 1.169,
+      "step": 130
+    },
+    {
+      "epoch": 0.8970099667774086,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00018,
+      "loss": 1.1497,
+      "step": 135
+    },
+    {
+      "epoch": 0.9302325581395349,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 1.131,
+      "step": 140
+    },
+    {
+      "epoch": 0.9634551495016611,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 1.1275,
+      "step": 145
+    },
+    {
+      "epoch": 0.9966777408637874,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.0002,
+      "loss": 1.1216,
+      "step": 150
+    },
+    {
+      "epoch": 0.9966777408637874,
+      "eval_loss": 2.580465793609619,
+      "eval_runtime": 0.2799,
+      "eval_samples_per_second": 35.728,
+      "eval_steps_per_second": 3.573,
+      "step": 150
+    },
+    {
+      "epoch": 1.0299003322259137,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00019999323080037624,
+      "loss": 1.1202,
+      "step": 155
+    },
+    {
+      "epoch": 1.06312292358804,
+      "grad_norm": 7.21875,
+      "learning_rate": 0.00019997292411794618,
+      "loss": 1.0982,
+      "step": 160
+    },
+    {
+      "epoch": 1.0963455149501662,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0001999390827019096,
+      "loss": 1.1059,
+      "step": 165
+    },
+    {
+      "epoch": 1.1295681063122924,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0001998917111338525,
+      "loss": 1.079,
+      "step": 170
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00019983081582712685,
+      "loss": 1.0626,
+      "step": 175
+    },
+    {
+      "epoch": 1.196013289036545,
+      "grad_norm": 3.75,
+      "learning_rate": 0.00019975640502598244,
+      "loss": 1.0644,
+      "step": 180
+    },
+    {
+      "epoch": 1.2292358803986712,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00019966848880445062,
+      "loss": 1.064,
+      "step": 185
+    },
+    {
+      "epoch": 1.2624584717607974,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00019956707906498044,
+      "loss": 1.0638,
+      "step": 190
+    },
+    {
+      "epoch": 1.2956810631229236,
+      "grad_norm": 1.75,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.0598,
+      "step": 195
+    },
+    {
+      "epoch": 1.3289036544850499,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00019932383577419432,
+      "loss": 1.0433,
+      "step": 200
+    },
+    {
+      "epoch": 1.3621262458471761,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00019918203515412617,
+      "loss": 1.0375,
+      "step": 205
+    },
+    {
+      "epoch": 1.3953488372093024,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00019902680687415705,
+      "loss": 1.0293,
+      "step": 210
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00019885817194971117,
+      "loss": 1.0196,
+      "step": 215
+    },
+    {
+      "epoch": 1.4617940199335548,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00019867615321125795,
+      "loss": 1.0227,
+      "step": 220
+    },
+    {
+      "epoch": 1.495016611295681,
+      "grad_norm": 2.703125,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 1.0192,
+      "step": 225
+    },
+    {
+      "epoch": 1.5282392026578073,
+      "grad_norm": 2.90625,
+      "learning_rate": 0.00019827206467064133,
+      "loss": 1.0254,
+      "step": 230
+    },
+    {
+      "epoch": 1.5614617940199336,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00019805004957559793,
+      "loss": 1.0076,
+      "step": 235
+    },
+    {
+      "epoch": 1.5946843853820598,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9979,
+      "step": 240
+    },
+    {
+      "epoch": 1.627906976744186,
+      "grad_norm": 4.1875,
+      "learning_rate": 0.00019756622801842143,
+      "loss": 0.9963,
+      "step": 245
+    },
+    {
+      "epoch": 1.6611295681063123,
+      "grad_norm": 2.625,
+      "learning_rate": 0.00019730448705798239,
+      "loss": 1.0017,
+      "step": 250
+    },
+    {
+      "epoch": 1.6943521594684385,
+      "grad_norm": 2.9375,
+      "learning_rate": 0.00019702957262759965,
+      "loss": 1.0055,
+      "step": 255
+    },
+    {
+      "epoch": 1.7275747508305648,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.00019674152194628638,
+      "loss": 0.993,
+      "step": 260
+    },
+    {
+      "epoch": 1.760797342192691,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0001964403740114939,
+      "loss": 0.9875,
+      "step": 265
+    },
+    {
+      "epoch": 1.7940199335548173,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0001961261695938319,
+      "loss": 1.0015,
+      "step": 270
+    },
+    {
+      "epoch": 1.8272425249169435,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001957989512315489,
+      "loss": 0.9879,
+      "step": 275
+    },
+    {
+      "epoch": 1.8604651162790697,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.0001954587632247732,
+      "loss": 0.9846,
+      "step": 280
+    },
+    {
+      "epoch": 1.893687707641196,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9816,
+      "step": 285
+    },
+    {
+      "epoch": 1.9269102990033222,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00019473966425143292,
+      "loss": 0.9832,
+      "step": 290
+    },
+    {
+      "epoch": 1.9601328903654485,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00019436085063935835,
+      "loss": 0.9838,
+      "step": 295
+    },
+    {
+      "epoch": 1.9933554817275747,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.9828,
+      "step": 300
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 2.516935110092163,
+      "eval_runtime": 0.2355,
+      "eval_samples_per_second": 42.456,
+      "eval_steps_per_second": 4.246,
+      "step": 301
+    },
+    {
+      "epoch": 2.026578073089701,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00019356495158395315,
+      "loss": 0.9602,
+      "step": 305
+    },
+    {
+      "epoch": 2.0598006644518274,
+      "grad_norm": 3.375,
+      "learning_rate": 0.00019314797389261424,
+      "loss": 0.9484,
+      "step": 310
+    },
+    {
+      "epoch": 2.0930232558139537,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.00019271838545667876,
+      "loss": 0.9496,
+      "step": 315
+    },
+    {
+      "epoch": 2.12624584717608,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00019227624443554425,
+      "loss": 0.9405,
+      "step": 320
+    },
+    {
+      "epoch": 2.159468438538206,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.00019182161068802741,
+      "loss": 0.9509,
+      "step": 325
+    },
+    {
+      "epoch": 2.1926910299003324,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.9532,
+      "step": 330
+    },
+    {
+      "epoch": 2.2259136212624586,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00019087511289735644,
+      "loss": 0.9421,
+      "step": 335
+    },
+    {
+      "epoch": 2.259136212624585,
+      "grad_norm": 3.453125,
+      "learning_rate": 0.00019038337699485208,
+      "loss": 0.9347,
+      "step": 340
+    },
+    {
+      "epoch": 2.292358803986711,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0001898794046299167,
+      "loss": 0.9451,
+      "step": 345
+    },
+    {
+      "epoch": 2.3255813953488373,
+      "grad_norm": 5.25,
+      "learning_rate": 0.00018936326403234125,
+      "loss": 0.9503,
+      "step": 350
+    },
+    {
+      "epoch": 2.3588039867109636,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00018883502507930042,
+      "loss": 0.9515,
+      "step": 355
+    },
+    {
+      "epoch": 2.39202657807309,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00018829475928589271,
+      "loss": 0.9382,
+      "step": 360
+    },
+    {
+      "epoch": 2.425249169435216,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0001877425397954582,
+      "loss": 0.9309,
+      "step": 365
+    },
+    {
+      "epoch": 2.4584717607973423,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.00018717844136967624,
+      "loss": 0.9487,
+      "step": 370
+    },
+    {
+      "epoch": 2.4916943521594686,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9414,
+      "step": 375
+    },
+    {
+      "epoch": 2.524916943521595,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00018601491478953657,
+      "loss": 0.9575,
+      "step": 380
+    },
+    {
+      "epoch": 2.558139534883721,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00018541564415805258,
+      "loss": 0.9469,
+      "step": 385
+    },
+    {
+      "epoch": 2.5913621262458473,
+      "grad_norm": 8.25,
+      "learning_rate": 0.0001848048096156426,
+      "loss": 0.9246,
+      "step": 390
+    },
+    {
+      "epoch": 2.6245847176079735,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00018418249385952575,
+      "loss": 0.9357,
+      "step": 395
+    },
+    {
+      "epoch": 2.6578073089700998,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00018354878114129367,
+      "loss": 0.9264,
+      "step": 400
+    },
+    {
+      "epoch": 2.691029900332226,
+      "grad_norm": 2.125,
+      "learning_rate": 0.00018290375725550417,
+      "loss": 0.934,
+      "step": 405
+    },
+    {
+      "epoch": 2.7242524916943522,
+      "grad_norm": 5.15625,
+      "learning_rate": 0.00018224750952806624,
+      "loss": 0.9378,
+      "step": 410
+    },
+    {
+      "epoch": 2.7574750830564785,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00018158012680441723,
+      "loss": 0.9325,
+      "step": 415
+    },
+    {
+      "epoch": 2.7906976744186047,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9343,
+      "step": 420
+    },
+    {
+      "epoch": 2.823920265780731,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0001802123192755044,
+      "loss": 0.9322,
+      "step": 425
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0001795120796494848,
+      "loss": 0.9203,
+      "step": 430
+    },
+    {
+      "epoch": 2.8903654485049834,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00017880107536067218,
+      "loss": 0.9181,
+      "step": 435
+    },
+    {
+      "epoch": 2.9235880398671097,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00017807940266766593,
+      "loss": 0.9152,
+      "step": 440
+    },
+    {
+      "epoch": 2.956810631229236,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0001773471592733964,
+      "loss": 0.9193,
+      "step": 445
+    },
+    {
+      "epoch": 2.990033222591362,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.9157,
+      "step": 450
+    },
+    {
+      "epoch": 2.9966777408637872,
+      "eval_loss": 2.4835643768310547,
+      "eval_runtime": 0.2608,
+      "eval_samples_per_second": 38.338,
+      "eval_steps_per_second": 3.834,
+      "step": 451
+    },
+    {
+      "epoch": 3.0232558139534884,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00017585135833488692,
+      "loss": 0.9023,
+      "step": 455
+    },
+    {
+      "epoch": 3.0564784053156147,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00017508800329814995,
+      "loss": 0.8957,
+      "step": 460
+    },
+    {
+      "epoch": 3.089700996677741,
+      "grad_norm": 1.75,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.8963,
+      "step": 465
+    },
+    {
+      "epoch": 3.122923588039867,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0001735309008059829,
+      "loss": 0.8938,
+      "step": 470
+    },
+    {
+      "epoch": 3.1561461794019934,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00017273736415730488,
+      "loss": 0.8832,
+      "step": 475
+    },
+    {
+      "epoch": 3.1893687707641196,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0001719339800338651,
+      "loss": 0.8824,
+      "step": 480
+    },
+    {
+      "epoch": 3.222591362126246,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00017112085720101373,
+      "loss": 0.8985,
+      "step": 485
+    },
+    {
+      "epoch": 3.255813953488372,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0001702981057425662,
+      "loss": 0.8915,
+      "step": 490
+    },
+    {
+      "epoch": 3.2890365448504983,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00016946583704589973,
+      "loss": 0.8959,
+      "step": 495
+    },
+    {
+      "epoch": 3.3222591362126246,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0001686241637868734,
+      "loss": 0.8932,
+      "step": 500
+    },
+    {
+      "epoch": 3.355481727574751,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00016777319991457325,
+      "loss": 0.9034,
+      "step": 505
+    },
+    {
+      "epoch": 3.388704318936877,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8914,
+      "step": 510
+    },
+    {
+      "epoch": 3.4219269102990033,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00016604386239990078,
+      "loss": 0.8968,
+      "step": 515
+    },
+    {
+      "epoch": 3.4551495016611296,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.00016516572288214552,
+      "loss": 0.8899,
+      "step": 520
+    },
+    {
+      "epoch": 3.488372093023256,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.888,
+      "step": 525
+    },
+    {
+      "epoch": 3.521594684385382,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00016338309673987101,
+      "loss": 0.8966,
+      "step": 530
+    },
+    {
+      "epoch": 3.5548172757475083,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.000162478851454396,
+      "loss": 0.8802,
+      "step": 535
+    },
+    {
+      "epoch": 3.5880398671096345,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0001615661475325658,
+      "loss": 0.8864,
+      "step": 540
+    },
+    {
+      "epoch": 3.6212624584717608,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00016064510853988138,
+      "loss": 0.8816,
+      "step": 545
+    },
+    {
+      "epoch": 3.654485049833887,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00015971585917027862,
+      "loss": 0.8906,
+      "step": 550
+    },
+    {
+      "epoch": 3.6877076411960132,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.8896,
+      "step": 555
+    },
+    {
+      "epoch": 3.7209302325581395,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.00015783323361679864,
+      "loss": 0.8806,
+      "step": 560
+    },
+    {
+      "epoch": 3.7541528239202657,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00015688011231028518,
+      "loss": 0.8758,
+      "step": 565
+    },
+    {
+      "epoch": 3.787375415282392,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0001559192903470747,
+      "loss": 0.871,
+      "step": 570
+    },
+    {
+      "epoch": 3.820598006644518,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0001549508978070806,
+      "loss": 0.8882,
+      "step": 575
+    },
+    {
+      "epoch": 3.8538205980066444,
+      "grad_norm": 0.66015625,
+      "learning_rate": 0.0001539750657951513,
+      "loss": 0.8719,
+      "step": 580
+    },
+    {
+      "epoch": 3.8870431893687707,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0001529919264233205,
+      "loss": 0.8794,
+      "step": 585
+    },
+    {
+      "epoch": 3.920265780730897,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.00015200161279292155,
+      "loss": 0.8787,
+      "step": 590
+    },
+    {
+      "epoch": 3.953488372093023,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00015100425897656753,
+      "loss": 0.873,
+      "step": 595
+    },
+    {
+      "epoch": 3.9867109634551494,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8753,
+      "step": 600
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.5010673999786377,
+      "eval_runtime": 0.239,
+      "eval_samples_per_second": 41.842,
+      "eval_steps_per_second": 4.184,
+      "step": 602
+    },
+    {
+      "epoch": 4.019933554817276,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0001489889718238087,
+      "loss": 0.8697,
+      "step": 605
+    },
+    {
+      "epoch": 4.053156146179402,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00014797131132502465,
+      "loss": 0.8496,
+      "step": 610
+    },
+    {
+      "epoch": 4.086378737541528,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00014694715627858908,
+      "loss": 0.8601,
+      "step": 615
+    },
+    {
+      "epoch": 4.119601328903655,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00014591664533870118,
+      "loss": 0.8647,
+      "step": 620
+    },
+    {
+      "epoch": 4.152823920265781,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00014487991802004623,
+      "loss": 0.8541,
+      "step": 625
+    },
+    {
+      "epoch": 4.186046511627907,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.00014383711467890774,
+      "loss": 0.8481,
+      "step": 630
+    },
+    {
+      "epoch": 4.219269102990033,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00014278837649416544,
+      "loss": 0.8514,
+      "step": 635
+    },
+    {
+      "epoch": 4.25249169435216,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0001417338454481818,
+      "loss": 0.8498,
+      "step": 640
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.8368,
+      "step": 645
+    },
+    {
+      "epoch": 4.318936877076412,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0001396079766039157,
+      "loss": 0.8439,
+      "step": 650
+    },
+    {
+      "epoch": 4.352159468438538,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00013853692661424484,
+      "loss": 0.8565,
+      "step": 655
+    },
+    {
+      "epoch": 4.385382059800665,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.00013746065934159123,
+      "loss": 0.8426,
+      "step": 660
+    },
+    {
+      "epoch": 4.4186046511627906,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.00013637932049531516,
+      "loss": 0.8471,
+      "step": 665
+    },
+    {
+      "epoch": 4.451827242524917,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00013529305647138687,
+      "loss": 0.8417,
+      "step": 670
+    },
+    {
+      "epoch": 4.485049833887043,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.8493,
+      "step": 675
+    },
+    {
+      "epoch": 4.51827242524917,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0001331063417884958,
+      "loss": 0.8506,
+      "step": 680
+    },
+    {
+      "epoch": 4.5514950166112955,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00013200618717569714,
+      "loss": 0.841,
+      "step": 685
+    },
+    {
+      "epoch": 4.584717607973422,
+      "grad_norm": 0.75,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8415,
+      "step": 690
+    },
+    {
+      "epoch": 4.617940199335548,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0001297930281038482,
+      "loss": 0.8506,
+      "step": 695
+    },
+    {
+      "epoch": 4.651162790697675,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00012868032327110904,
+      "loss": 0.8425,
+      "step": 700
+    },
+    {
+      "epoch": 4.6843853820598005,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0001275637355816999,
+      "loss": 0.8466,
+      "step": 705
+    },
+    {
+      "epoch": 4.717607973421927,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.00012644341620372023,
+      "loss": 0.841,
+      "step": 710
+    },
+    {
+      "epoch": 4.750830564784053,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0001253195168104802,
+      "loss": 0.8396,
+      "step": 715
+    },
+    {
+      "epoch": 4.78405315614618,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00012419218955996676,
+      "loss": 0.8423,
+      "step": 720
+    },
+    {
+      "epoch": 4.8172757475083055,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00012306158707424403,
+      "loss": 0.839,
+      "step": 725
+    },
+    {
+      "epoch": 4.850498338870432,
+      "grad_norm": 0.75,
+      "learning_rate": 0.00012192786241879033,
+      "loss": 0.8342,
+      "step": 730
+    },
+    {
+      "epoch": 4.883720930232558,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8358,
+      "step": 735
+    },
+    {
+      "epoch": 4.916943521594685,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011965166095328301,
+      "loss": 0.8432,
+      "step": 740
+    },
+    {
+      "epoch": 4.95016611295681,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.00011850949230447145,
+      "loss": 0.8368,
+      "step": 745
+    },
+    {
+      "epoch": 4.983388704318937,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.8334,
+      "step": 750
+    },
+    {
+      "epoch": 4.996677740863787,
+      "eval_loss": 2.4944658279418945,
+      "eval_runtime": 0.2592,
+      "eval_samples_per_second": 38.58,
+      "eval_steps_per_second": 3.858,
+      "step": 752
+    },
+    {
+      "epoch": 5.016611295681063,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.00011621779231055676,
+      "loss": 0.8264,
+      "step": 755
+    },
+    {
+      "epoch": 5.04983388704319,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00011506857122494831,
+      "loss": 0.8175,
+      "step": 760
+    },
+    {
+      "epoch": 5.083056478405315,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011391731009600654,
+      "loss": 0.8207,
+      "step": 765
+    },
+    {
+      "epoch": 5.116279069767442,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011276416478605949,
+      "loss": 0.8134,
+      "step": 770
+    },
+    {
+      "epoch": 5.149501661129568,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011160929141252303,
+      "loss": 0.8146,
+      "step": 775
+    },
+    {
+      "epoch": 5.1827242524916945,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.8118,
+      "step": 780
+    },
+    {
+      "epoch": 5.21594684385382,
+      "grad_norm": 3.4375,
+      "learning_rate": 0.00010929498609293924,
+      "loss": 0.8142,
+      "step": 785
+    },
+    {
+      "epoch": 5.249169435215947,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00010813586746678583,
+      "loss": 0.8156,
+      "step": 790
+    },
+    {
+      "epoch": 5.282392026578073,
+      "grad_norm": 2.828125,
+      "learning_rate": 0.00010697564737441252,
+      "loss": 0.8097,
+      "step": 795
+    },
+    {
+      "epoch": 5.3156146179401995,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00010581448289104758,
+      "loss": 0.8213,
+      "step": 800
+    },
+    {
+      "epoch": 5.348837209302325,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0001046525312197747,
+      "loss": 0.8087,
+      "step": 805
+    },
+    {
+      "epoch": 5.382059800664452,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00010348994967025012,
+      "loss": 0.8046,
+      "step": 810
+    },
+    {
+      "epoch": 5.415282392026578,
+      "grad_norm": 2.734375,
+      "learning_rate": 0.00010232689563740563,
+      "loss": 0.8086,
+      "step": 815
+    },
+    {
+      "epoch": 5.4485049833887045,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00010116352658013973,
+      "loss": 0.809,
+      "step": 820
+    },
+    {
+      "epoch": 5.48172757475083,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001,
+      "loss": 0.8155,
+      "step": 825
+    },
+    {
+      "epoch": 5.514950166112957,
+      "grad_norm": 0.73046875,
+      "learning_rate": 9.883647341986032e-05,
+      "loss": 0.8016,
+      "step": 830
+    },
+    {
+      "epoch": 5.548172757475083,
+      "grad_norm": 0.6796875,
+      "learning_rate": 9.767310436259438e-05,
+      "loss": 0.8013,
+      "step": 835
+    },
+    {
+      "epoch": 5.5813953488372094,
+      "grad_norm": 0.81640625,
+      "learning_rate": 9.651005032974994e-05,
+      "loss": 0.8123,
+      "step": 840
+    },
+    {
+      "epoch": 5.614617940199335,
+      "grad_norm": 2.1875,
+      "learning_rate": 9.534746878022534e-05,
+      "loss": 0.8163,
+      "step": 845
+    },
+    {
+      "epoch": 5.647840531561462,
+      "grad_norm": 0.72265625,
+      "learning_rate": 9.418551710895243e-05,
+      "loss": 0.8164,
+      "step": 850
+    },
+    {
+      "epoch": 5.681063122923588,
+      "grad_norm": 1.6953125,
+      "learning_rate": 9.302435262558747e-05,
+      "loss": 0.7974,
+      "step": 855
+    },
+    {
+      "epoch": 5.714285714285714,
+      "grad_norm": 0.76953125,
+      "learning_rate": 9.186413253321418e-05,
+      "loss": 0.8142,
+      "step": 860
+    },
+    {
+      "epoch": 5.74750830564784,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.070501390706079e-05,
+      "loss": 0.8026,
+      "step": 865
+    },
+    {
+      "epoch": 5.780730897009967,
+      "grad_norm": 0.640625,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.8005,
+      "step": 870
+    },
+    {
+      "epoch": 5.813953488372093,
+      "grad_norm": 0.85546875,
+      "learning_rate": 8.839070858747697e-05,
+      "loss": 0.8015,
+      "step": 875
+    },
+    {
+      "epoch": 5.847176079734219,
+      "grad_norm": 0.52734375,
+      "learning_rate": 8.723583521394054e-05,
+      "loss": 0.7924,
+      "step": 880
+    },
+    {
+      "epoch": 5.880398671096345,
+      "grad_norm": 0.59765625,
+      "learning_rate": 8.608268990399349e-05,
+      "loss": 0.812,
+      "step": 885
+    },
+    {
+      "epoch": 5.913621262458472,
+      "grad_norm": 0.70703125,
+      "learning_rate": 8.49314287750517e-05,
+      "loss": 0.7969,
+      "step": 890
+    },
+    {
+      "epoch": 5.946843853820598,
+      "grad_norm": 0.74609375,
+      "learning_rate": 8.378220768944327e-05,
+      "loss": 0.7965,
+      "step": 895
+    },
+    {
+      "epoch": 5.980066445182724,
+      "grad_norm": 2.015625,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.796,
+      "step": 900
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.531708240509033,
+      "eval_runtime": 0.239,
+      "eval_samples_per_second": 41.85,
+      "eval_steps_per_second": 4.185,
+      "step": 903
+    },
+    {
+      "epoch": 6.01328903654485,
+      "grad_norm": 0.482421875,
+      "learning_rate": 8.149050769552856e-05,
+      "loss": 0.7892,
+      "step": 905
+    },
+    {
+      "epoch": 6.046511627906977,
+      "grad_norm": 0.65234375,
+      "learning_rate": 8.034833904671698e-05,
+      "loss": 0.7792,
+      "step": 910
+    },
+    {
+      "epoch": 6.079734219269103,
+      "grad_norm": 0.7578125,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.7814,
+      "step": 915
+    },
+    {
+      "epoch": 6.112956810631229,
+      "grad_norm": 0.484375,
+      "learning_rate": 7.807213758120966e-05,
+      "loss": 0.7822,
+      "step": 920
+    },
+    {
+      "epoch": 6.146179401993355,
+      "grad_norm": 0.80859375,
+      "learning_rate": 7.693841292575598e-05,
+      "loss": 0.7749,
+      "step": 925
+    },
+    {
+      "epoch": 6.179401993355482,
+      "grad_norm": 0.81640625,
+      "learning_rate": 7.580781044003324e-05,
+      "loss": 0.7821,
+      "step": 930
+    },
+    {
+      "epoch": 6.212624584717608,
+      "grad_norm": 5.34375,
+      "learning_rate": 7.468048318951983e-05,
+      "loss": 0.7872,
+      "step": 935
+    },
+    {
+      "epoch": 6.245847176079734,
+      "grad_norm": 2.21875,
+      "learning_rate": 7.35565837962798e-05,
+      "loss": 0.7855,
+      "step": 940
+    },
+    {
+      "epoch": 6.27906976744186,
+      "grad_norm": 3.28125,
+      "learning_rate": 7.243626441830009e-05,
+      "loss": 0.7763,
+      "step": 945
+    },
+    {
+      "epoch": 6.312292358803987,
+      "grad_norm": 0.62890625,
+      "learning_rate": 7.131967672889101e-05,
+      "loss": 0.7901,
+      "step": 950
+    },
+    {
+      "epoch": 6.3455149501661126,
+      "grad_norm": 0.9765625,
+      "learning_rate": 7.02069718961518e-05,
+      "loss": 0.7814,
+      "step": 955
+    },
+    {
+      "epoch": 6.378737541528239,
+      "grad_norm": 0.8203125,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.7752,
+      "step": 960
+    },
+    {
+      "epoch": 6.411960132890365,
+      "grad_norm": 0.92578125,
+      "learning_rate": 6.799381282430284e-05,
+      "loss": 0.7782,
+      "step": 965
+    },
+    {
+      "epoch": 6.445182724252492,
+      "grad_norm": 0.91796875,
+      "learning_rate": 6.68936582115042e-05,
+      "loss": 0.7748,
+      "step": 970
+    },
+    {
+      "epoch": 6.4784053156146175,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.7815,
+      "step": 975
+    },
+    {
+      "epoch": 6.511627906976744,
+      "grad_norm": 3.734375,
+      "learning_rate": 6.470694352861312e-05,
+      "loss": 0.7747,
+      "step": 980
+    },
+    {
+      "epoch": 6.544850498338871,
+      "grad_norm": 0.6015625,
+      "learning_rate": 6.362067950468489e-05,
+      "loss": 0.785,
+      "step": 985
+    },
+    {
+      "epoch": 6.578073089700997,
+      "grad_norm": 0.73828125,
+      "learning_rate": 6.25393406584088e-05,
+      "loss": 0.7716,
+      "step": 990
+    },
+    {
+      "epoch": 6.6112956810631225,
+      "grad_norm": 0.79296875,
+      "learning_rate": 6.146307338575519e-05,
+      "loss": 0.7723,
+      "step": 995
+    },
+    {
+      "epoch": 6.644518272425249,
+      "grad_norm": 0.69921875,
+      "learning_rate": 6.039202339608432e-05,
+      "loss": 0.7745,
+      "step": 1000
+    },
+    {
+      "epoch": 6.677740863787376,
+      "grad_norm": 1.96875,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.7848,
+      "step": 1005
+    },
+    {
+      "epoch": 6.710963455149502,
+      "grad_norm": 0.734375,
+      "learning_rate": 5.8266154551818216e-05,
+      "loss": 0.7797,
+      "step": 1010
+    },
+    {
+      "epoch": 6.7441860465116275,
+      "grad_norm": 0.474609375,
+      "learning_rate": 5.72116235058346e-05,
+      "loss": 0.7714,
+      "step": 1015
+    },
+    {
+      "epoch": 6.777408637873754,
+      "grad_norm": 0.478515625,
+      "learning_rate": 5.616288532109225e-05,
+      "loss": 0.7716,
+      "step": 1020
+    },
+    {
+      "epoch": 6.810631229235881,
+      "grad_norm": 0.494140625,
+      "learning_rate": 5.5120081979953785e-05,
+      "loss": 0.7807,
+      "step": 1025
+    },
+    {
+      "epoch": 6.843853820598007,
+      "grad_norm": 0.65234375,
+      "learning_rate": 5.4083354661298814e-05,
+      "loss": 0.7647,
+      "step": 1030
+    },
+    {
+      "epoch": 6.877076411960132,
+      "grad_norm": 0.6328125,
+      "learning_rate": 5.305284372141095e-05,
+      "loss": 0.7755,
+      "step": 1035
+    },
+    {
+      "epoch": 6.910299003322259,
+      "grad_norm": 0.4765625,
+      "learning_rate": 5.2028688674975415e-05,
+      "loss": 0.7738,
+      "step": 1040
+    },
+    {
+      "epoch": 6.943521594684386,
+      "grad_norm": 0.5625,
+      "learning_rate": 5.101102817619131e-05,
+      "loss": 0.7765,
+      "step": 1045
+    },
+    {
+      "epoch": 6.976744186046512,
+      "grad_norm": 0.70703125,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7745,
+      "step": 1050
+    },
+    {
+      "epoch": 6.996677740863787,
+      "eval_loss": 2.5435612201690674,
+      "eval_runtime": 0.2585,
+      "eval_samples_per_second": 38.679,
+      "eval_steps_per_second": 3.868,
+      "step": 1053
+    },
+    {
+      "epoch": 7.009966777408638,
+      "grad_norm": 0.53125,
+      "learning_rate": 4.899574102343247e-05,
+      "loss": 0.771,
+      "step": 1055
+    },
+    {
+      "epoch": 7.043189368770764,
+      "grad_norm": 0.640625,
+      "learning_rate": 4.799838720707846e-05,
+      "loss": 0.7653,
+      "step": 1060
+    },
+    {
+      "epoch": 7.076411960132891,
+      "grad_norm": 0.52734375,
+      "learning_rate": 4.700807357667952e-05,
+      "loss": 0.7644,
+      "step": 1065
+    },
+    {
+      "epoch": 7.1096345514950166,
+      "grad_norm": 0.490234375,
+      "learning_rate": 4.6024934204848745e-05,
+      "loss": 0.7632,
+      "step": 1070
+    },
+    {
+      "epoch": 7.142857142857143,
+      "grad_norm": 0.55859375,
+      "learning_rate": 4.50491021929194e-05,
+      "loss": 0.7686,
+      "step": 1075
+    },
+    {
+      "epoch": 7.176079734219269,
+      "grad_norm": 0.46484375,
+      "learning_rate": 4.4080709652925336e-05,
+      "loss": 0.7549,
+      "step": 1080
+    },
+    {
+      "epoch": 7.209302325581396,
+      "grad_norm": 0.58203125,
+      "learning_rate": 4.3119887689714844e-05,
+      "loss": 0.7626,
+      "step": 1085
+    },
+    {
+      "epoch": 7.2425249169435215,
+      "grad_norm": 0.5546875,
+      "learning_rate": 4.216676638320135e-05,
+      "loss": 0.7588,
+      "step": 1090
+    },
+    {
+      "epoch": 7.275747508305648,
+      "grad_norm": 0.5,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.7583,
+      "step": 1095
+    },
+    {
+      "epoch": 7.308970099667774,
+      "grad_norm": 0.6015625,
+      "learning_rate": 4.028414082972141e-05,
+      "loss": 0.7529,
+      "step": 1100
+    },
+    {
+      "epoch": 7.342192691029901,
+      "grad_norm": 0.72265625,
+      "learning_rate": 3.935489146011869e-05,
+      "loss": 0.766,
+      "step": 1105
+    },
+    {
+      "epoch": 7.3754152823920265,
+      "grad_norm": 0.46875,
+      "learning_rate": 3.843385246743417e-05,
+      "loss": 0.7592,
+      "step": 1110
+    },
+    {
+      "epoch": 7.408637873754153,
+      "grad_norm": 0.431640625,
+      "learning_rate": 3.7521148545604e-05,
+      "loss": 0.7645,
+      "step": 1115
+    },
+    {
+      "epoch": 7.441860465116279,
+      "grad_norm": 0.455078125,
+      "learning_rate": 3.661690326012897e-05,
+      "loss": 0.7629,
+      "step": 1120
+    },
+    {
+      "epoch": 7.475083056478406,
+      "grad_norm": 0.4765625,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.7591,
+      "step": 1125
+    },
+    {
+      "epoch": 7.5083056478405314,
+      "grad_norm": 0.71484375,
+      "learning_rate": 3.483427711785449e-05,
+      "loss": 0.7558,
+      "step": 1130
+    },
+    {
+      "epoch": 7.541528239202658,
+      "grad_norm": 0.53515625,
+      "learning_rate": 3.395613760009925e-05,
+      "loss": 0.7611,
+      "step": 1135
+    },
+    {
+      "epoch": 7.574750830564784,
+      "grad_norm": 0.56640625,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.7619,
+      "step": 1140
+    },
+    {
+      "epoch": 7.607973421926911,
+      "grad_norm": 0.44921875,
+      "learning_rate": 3.222680008542678e-05,
+      "loss": 0.7585,
+      "step": 1145
+    },
+    {
+      "epoch": 7.641196013289036,
+      "grad_norm": 0.490234375,
+      "learning_rate": 3.137583621312665e-05,
+      "loss": 0.7551,
+      "step": 1150
+    },
+    {
+      "epoch": 7.674418604651163,
+      "grad_norm": 0.490234375,
+      "learning_rate": 3.053416295410026e-05,
+      "loss": 0.7626,
+      "step": 1155
+    },
+    {
+      "epoch": 7.707641196013289,
+      "grad_norm": 0.5,
+      "learning_rate": 2.9701894257433826e-05,
+      "loss": 0.764,
+      "step": 1160
+    },
+    {
+      "epoch": 7.740863787375416,
+      "grad_norm": 0.46875,
+      "learning_rate": 2.8879142798986292e-05,
+      "loss": 0.755,
+      "step": 1165
+    },
+    {
+      "epoch": 7.774086378737541,
+      "grad_norm": 0.46875,
+      "learning_rate": 2.8066019966134904e-05,
+      "loss": 0.7563,
+      "step": 1170
+    },
+    {
+      "epoch": 7.807308970099668,
+      "grad_norm": 0.451171875,
+      "learning_rate": 2.7262635842695127e-05,
+      "loss": 0.7688,
+      "step": 1175
+    },
+    {
+      "epoch": 7.840531561461794,
+      "grad_norm": 0.546875,
+      "learning_rate": 2.6469099194017143e-05,
+      "loss": 0.7665,
+      "step": 1180
+    },
+    {
+      "epoch": 7.8737541528239205,
+      "grad_norm": 0.4453125,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.7664,
+      "step": 1185
+    },
+    {
+      "epoch": 7.906976744186046,
+      "grad_norm": 0.443359375,
+      "learning_rate": 2.491199670185008e-05,
+      "loss": 0.753,
+      "step": 1190
+    },
+    {
+      "epoch": 7.940199335548173,
+      "grad_norm": 0.44921875,
+      "learning_rate": 2.4148641665113113e-05,
+      "loss": 0.7614,
+      "step": 1195
+    },
+    {
+      "epoch": 7.973421926910299,
+      "grad_norm": 0.484375,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.7582,
+      "step": 1200
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 2.5521774291992188,
+      "eval_runtime": 0.24,
+      "eval_samples_per_second": 41.669,
+      "eval_steps_per_second": 4.167,
+      "step": 1204
+    },
+    {
+      "epoch": 8.006644518272426,
+      "grad_norm": 0.423828125,
+      "learning_rate": 2.265284072660362e-05,
+      "loss": 0.7646,
+      "step": 1205
+    },
+    {
+      "epoch": 8.039867109634551,
+      "grad_norm": 0.44140625,
+      "learning_rate": 2.192059733233408e-05,
+      "loss": 0.758,
+      "step": 1210
+    },
+    {
+      "epoch": 8.073089700996677,
+      "grad_norm": 0.439453125,
+      "learning_rate": 2.119892463932781e-05,
+      "loss": 0.7566,
+      "step": 1215
+    },
+    {
+      "epoch": 8.106312292358805,
+      "grad_norm": 0.451171875,
+      "learning_rate": 2.0487920350515212e-05,
+      "loss": 0.7551,
+      "step": 1220
+    },
+    {
+      "epoch": 8.13953488372093,
+      "grad_norm": 0.4375,
+      "learning_rate": 1.9787680724495617e-05,
+      "loss": 0.7421,
+      "step": 1225
+    },
+    {
+      "epoch": 8.172757475083056,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.7513,
+      "step": 1230
+    },
+    {
+      "epoch": 8.205980066445182,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.8419873195582814e-05,
+      "loss": 0.7578,
+      "step": 1235
+    },
+    {
+      "epoch": 8.23920265780731,
+      "grad_norm": 0.421875,
+      "learning_rate": 1.775249047193377e-05,
+      "loss": 0.7518,
+      "step": 1240
+    },
+    {
+      "epoch": 8.272425249169435,
+      "grad_norm": 0.498046875,
+      "learning_rate": 1.7096242744495837e-05,
+      "loss": 0.7519,
+      "step": 1245
+    },
+    {
+      "epoch": 8.305647840531561,
+      "grad_norm": 0.5390625,
+      "learning_rate": 1.6451218858706374e-05,
+      "loss": 0.7514,
+      "step": 1250
+    },
+    {
+      "epoch": 8.338870431893687,
+      "grad_norm": 0.43359375,
+      "learning_rate": 1.5817506140474247e-05,
+      "loss": 0.7553,
+      "step": 1255
+    },
+    {
+      "epoch": 8.372093023255815,
+      "grad_norm": 0.466796875,
+      "learning_rate": 1.5195190384357404e-05,
+      "loss": 0.7487,
+      "step": 1260
+    },
+    {
+      "epoch": 8.40531561461794,
+      "grad_norm": 0.43359375,
+      "learning_rate": 1.458435584194745e-05,
+      "loss": 0.7518,
+      "step": 1265
+    },
+    {
+      "epoch": 8.438538205980066,
+      "grad_norm": 0.4296875,
+      "learning_rate": 1.3985085210463477e-05,
+      "loss": 0.7487,
+      "step": 1270
+    },
+    {
+      "epoch": 8.471760797342192,
+      "grad_norm": 0.423828125,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7467,
+      "step": 1275
+    },
+    {
+      "epoch": 8.50498338870432,
+      "grad_norm": 0.4296875,
+      "learning_rate": 1.2821558630323772e-05,
+      "loss": 0.7478,
+      "step": 1280
+    },
+    {
+      "epoch": 8.538205980066445,
+      "grad_norm": 0.46875,
+      "learning_rate": 1.2257460204541794e-05,
+      "loss": 0.7558,
+      "step": 1285
+    },
+    {
+      "epoch": 8.571428571428571,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.1705240714107302e-05,
+      "loss": 0.7426,
+      "step": 1290
+    },
+    {
+      "epoch": 8.604651162790697,
+      "grad_norm": 0.46875,
+      "learning_rate": 1.116497492069961e-05,
+      "loss": 0.7411,
+      "step": 1295
+    },
+    {
+      "epoch": 8.637873754152825,
+      "grad_norm": 0.44140625,
+      "learning_rate": 1.0636735967658784e-05,
+      "loss": 0.7524,
+      "step": 1300
+    },
+    {
+      "epoch": 8.67109634551495,
+      "grad_norm": 0.453125,
+      "learning_rate": 1.0120595370083318e-05,
+      "loss": 0.7499,
+      "step": 1305
+    },
+    {
+      "epoch": 8.704318936877076,
+      "grad_norm": 0.435546875,
+      "learning_rate": 9.616623005147951e-06,
+      "loss": 0.7603,
+      "step": 1310
+    },
+    {
+      "epoch": 8.737541528239202,
+      "grad_norm": 0.44140625,
+      "learning_rate": 9.124887102643575e-06,
+      "loss": 0.7563,
+      "step": 1315
+    },
+    {
+      "epoch": 8.77076411960133,
+      "grad_norm": 0.4296875,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.7594,
+      "step": 1320
+    },
+    {
+      "epoch": 8.803986710963455,
+      "grad_norm": 0.431640625,
+      "learning_rate": 8.178389311972612e-06,
+      "loss": 0.7648,
+      "step": 1325
+    },
+    {
+      "epoch": 8.837209302325581,
+      "grad_norm": 0.443359375,
+      "learning_rate": 7.72375556445577e-06,
+      "loss": 0.7555,
+      "step": 1330
+    },
+    {
+      "epoch": 8.870431893687707,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.281614543321269e-06,
+      "loss": 0.7461,
+      "step": 1335
+    },
+    {
+      "epoch": 8.903654485049834,
+      "grad_norm": 0.470703125,
+      "learning_rate": 6.852026107385756e-06,
+      "loss": 0.7606,
+      "step": 1340
+    },
+    {
+      "epoch": 8.93687707641196,
+      "grad_norm": 0.435546875,
+      "learning_rate": 6.435048416046863e-06,
+      "loss": 0.7598,
+      "step": 1345
+    },
+    {
+      "epoch": 8.970099667774086,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.754,
+      "step": 1350
+    },
+    {
+      "epoch": 8.996677740863788,
+      "eval_loss": 2.5503978729248047,
+      "eval_runtime": 0.2601,
+      "eval_samples_per_second": 38.445,
+      "eval_steps_per_second": 3.845,
+      "step": 1354
+    },
+    {
+      "epoch": 9.003322259136212,
+      "grad_norm": 0.4375,
+      "learning_rate": 5.639149360641649e-06,
+      "loss": 0.7546,
+      "step": 1355
+    },
+    {
+      "epoch": 9.03654485049834,
+      "grad_norm": 0.439453125,
+      "learning_rate": 5.26033574856708e-06,
+      "loss": 0.7562,
+      "step": 1360
+    },
+    {
+      "epoch": 9.069767441860465,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.7522,
+      "step": 1365
+    },
+    {
+      "epoch": 9.102990033222591,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.541236775226809e-06,
+      "loss": 0.7522,
+      "step": 1370
+    },
+    {
+      "epoch": 9.136212624584717,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.20104876845111e-06,
+      "loss": 0.7509,
+      "step": 1375
+    },
+    {
+      "epoch": 9.169435215946844,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.873830406168111e-06,
+      "loss": 0.7444,
+      "step": 1380
+    },
+    {
+      "epoch": 9.20265780730897,
+      "grad_norm": 0.453125,
+      "learning_rate": 3.5596259885061102e-06,
+      "loss": 0.7561,
+      "step": 1385
+    },
+    {
+      "epoch": 9.235880398671096,
+      "grad_norm": 0.443359375,
+      "learning_rate": 3.2584780537136207e-06,
+      "loss": 0.7502,
+      "step": 1390
+    },
+    {
+      "epoch": 9.269102990033222,
+      "grad_norm": 0.4921875,
+      "learning_rate": 2.970427372400353e-06,
+      "loss": 0.7546,
+      "step": 1395
+    },
+    {
+      "epoch": 9.30232558139535,
+      "grad_norm": 0.45703125,
+      "learning_rate": 2.6955129420176196e-06,
+      "loss": 0.7506,
+      "step": 1400
+    },
+    {
+      "epoch": 9.335548172757475,
+      "grad_norm": 0.4296875,
+      "learning_rate": 2.433771981578581e-06,
+      "loss": 0.7531,
+      "step": 1405
+    },
+    {
+      "epoch": 9.368770764119601,
+      "grad_norm": 0.427734375,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.75,
+      "step": 1410
+    },
+    {
+      "epoch": 9.401993355481727,
+      "grad_norm": 0.4765625,
+      "learning_rate": 1.9499504244020693e-06,
+      "loss": 0.7449,
+      "step": 1415
+    },
+    {
+      "epoch": 9.435215946843854,
+      "grad_norm": 0.416015625,
+      "learning_rate": 1.7279353293586765e-06,
+      "loss": 0.765,
+      "step": 1420
+    },
+    {
+      "epoch": 9.46843853820598,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.7472,
+      "step": 1425
+    },
+    {
+      "epoch": 9.501661129568106,
+      "grad_norm": 0.431640625,
+      "learning_rate": 1.323846788742078e-06,
+      "loss": 0.7461,
+      "step": 1430
+    },
+    {
+      "epoch": 9.534883720930232,
+      "grad_norm": 0.443359375,
+      "learning_rate": 1.14182805028884e-06,
+      "loss": 0.7501,
+      "step": 1435
+    },
+    {
+      "epoch": 9.56810631229236,
+      "grad_norm": 0.43359375,
+      "learning_rate": 9.731931258429638e-07,
+      "loss": 0.7501,
+      "step": 1440
+    },
+    {
+      "epoch": 9.601328903654485,
+      "grad_norm": 0.41796875,
+      "learning_rate": 8.17964845873831e-07,
+      "loss": 0.7511,
+      "step": 1445
+    },
+    {
+      "epoch": 9.634551495016611,
+      "grad_norm": 0.427734375,
+      "learning_rate": 6.761642258056978e-07,
+      "loss": 0.7556,
+      "step": 1450
+    },
+    {
+      "epoch": 9.667774086378738,
+      "grad_norm": 0.42578125,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.751,
+      "step": 1455
+    },
+    {
+      "epoch": 9.700996677740864,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.329209350195651e-07,
+      "loss": 0.7598,
+      "step": 1460
+    },
+    {
+      "epoch": 9.73421926910299,
+      "grad_norm": 0.4375,
+      "learning_rate": 3.315111955493944e-07,
+      "loss": 0.7572,
+      "step": 1465
+    },
+    {
+      "epoch": 9.767441860465116,
+      "grad_norm": 0.46484375,
+      "learning_rate": 2.4359497401758024e-07,
+      "loss": 0.7478,
+      "step": 1470
+    },
+    {
+      "epoch": 9.800664451827242,
+      "grad_norm": 0.419921875,
+      "learning_rate": 1.6918417287318245e-07,
+      "loss": 0.749,
+      "step": 1475
+    },
+    {
+      "epoch": 9.83388704318937,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.0828886614754341e-07,
+      "loss": 0.7488,
+      "step": 1480
+    },
+    {
+      "epoch": 9.867109634551495,
+      "grad_norm": 0.4609375,
+      "learning_rate": 6.09172980904238e-08,
+      "loss": 0.7407,
+      "step": 1485
+    },
+    {
+      "epoch": 9.90033222591362,
+      "grad_norm": 0.43359375,
+      "learning_rate": 2.7075882053828605e-08,
+      "loss": 0.7491,
+      "step": 1490
+    },
+    {
+      "epoch": 9.933554817275748,
+      "grad_norm": 0.447265625,
+      "learning_rate": 6.769199623779532e-09,
+      "loss": 0.7417,
+      "step": 1495
+    },
+    {
+      "epoch": 9.966777408637874,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0,
+      "loss": 0.7572,
+      "step": 1500
+    },
+    {
+      "epoch": 9.966777408637874,
+      "eval_loss": 2.5546562671661377,
+      "eval_runtime": 0.2333,
+      "eval_samples_per_second": 42.867,
+      "eval_steps_per_second": 4.287,
+      "step": 1500
+    },
+    {
+      "epoch": 9.966777408637874,
+      "step": 1500,
+      "total_flos": 4.5794490708666614e+18,
+      "train_loss": 1.5882705609003702,
+      "train_runtime": 3659.0045,
+      "train_samples_per_second": 26.291,
+      "train_steps_per_second": 0.41
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 100,
+  "total_flos": 4.5794490708666614e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:259949fe184f41eaa8cac44a9bfa362aab2bed7016354254f4c4ab87a6e99cd7
+size 5176